Author: Venkata Raghavendra Karthik Kanna
Last Modified: July 12, 2023

This notebook is an extension of the project conducted by Hicounselor. The project, titled "Work-Related Fatality Study: An In-depth Analysis using Python and MySQL", aims to examine and understand the factors contributing to work-related fatalities, identify trends and patterns, and provide insights to improve workplace safety measures.

Hicounselor provides virtual sandbox machines/kernels where the project can be implemented, and I completed the project there as well. This notebook serves as an extension for anyone who wants to work on their own machine locally, using both Python dataframes and SQL methods.

Software used:

  • Python (numpy, pandas, sqlalchemy, pymysql)
  • MySQL
In [1]:
## These packages may not come with the Anaconda installation by default, so make sure they are installed on your computer.
#!pip install pymysql
#!pip install sqlalchemy
## A MySQL installation and configuration is also needed (you need some way to establish a connection to a MySQL database).
In [2]:
import pandas as pd
import numpy as np

#Task 1: Read the given dataset
def read_data_from_csv():
    # Read the 'fatality.csv' file into a dataframe
    df = pd.read_csv('./fatality.csv')
    return df

#Task 2: Clean the given dataset
def data_cleaning():
    # do not edit the predefined function name
    df = read_data_from_csv()
    # Remove the 'Unnamed' column
    df.drop(columns=['Unnamed'], inplace=True)
    
    return df

#Task 3: Export the cleaned dataset to "fatalities_cleaned.csv"
def export_the_dataset():
    # do not edit the predefined function name
    df=data_cleaning()
    # Export the cleaned dataset with index=False and return it as 'df'
    df.to_csv('./fatalities_cleaned.csv', index=False)
    return df

#TASK 4: Load the cleaned dataset 'fatalities_cleaned.csv' into the database provided.
#Follow the instructions in the Task 4 description and complete the task accordingly.

#Check that the MySQL table was created from "fatalities_cleaned".
#Use this final dataset and upload it to the provided database for performing the analysis in MySQL.
#(On the Hicounselor platform, run this task by clicking on the terminal and then "run project".)
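
## Extension note: a minimal local sketch for Task 4 (an assumption for local use, not the platform's method).
## It reuses an SQLAlchemy engine like the one created later in this notebook; credentials are placeholders.
#   engine = create_engine("mysql+pymysql://user:password@localhost:3306/dbname")
#   export_the_dataset().to_sql('fatalities_cleaned', con=engine, if_exists='replace', index=False)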
In [3]:
## Create a cleaned dataset file as per above functions.
export_the_dataset()
Out[3]:
id incident_date day_of_week city state description plan citation
0 1 2009-06-23 tuesday tampa florida Victim was inside a building bundling material... unknown unknown
1 2 2009-06-30 tuesday conroe texas Victim was on a scaffold doing repairs on an a... unknown unknown
2 3 2009-07-06 monday miami florida Victim was conducting roofing work, and fell t... unknown unknown
3 4 2009-07-09 thursday north platte nebraska Victim was climbing a ladder while servicing a... unknown unknown
4 5 2009-07-11 saturday greensburg pennsylvania Employees were throwing away old files. They h... unknown unknown
... ... ... ... ... ... ... ... ...
14909 14910 2017-01-03 tuesday cut and shoot texas Worker died in fall from tree. federal no
14910 14911 2017-01-03 tuesday glendale kentucky Worker fatally stabbed by resident. state yes
14911 14912 2017-01-03 tuesday la verkin utah Worker died after being struck by excavator co... state no
14912 14913 2017-01-03 tuesday mount pleasant texas Worker died after being struck by tanker truck. federal yes
14913 14914 2017-01-02 monday shakopee minnesota Worker died in fall inside grain tank. state yes

14914 rows × 8 columns

In [4]:
df_cleaned = pd.read_csv('./fatalities_cleaned.csv')
In [5]:
df_cleaned.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14914 entries, 0 to 14913
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             14914 non-null  int64 
 1   incident_date  14914 non-null  object
 2   day_of_week    14914 non-null  object
 3   city           14914 non-null  object
 4   state          14914 non-null  object
 5   description    14914 non-null  object
 6   plan           14914 non-null  object
 7   citation       14914 non-null  object
dtypes: int64(1), object(7)
memory usage: 932.2+ KB
In [6]:
df_cleaned.isnull().sum()
Out[6]:
id               0
incident_date    0
day_of_week      0
city             0
state            0
description      0
plan             0
citation         0
dtype: int64
In [7]:
df_cleaned.head()
Out[7]:
id incident_date day_of_week city state description plan citation
0 1 2009-06-23 tuesday tampa florida Victim was inside a building bundling material... unknown unknown
1 2 2009-06-30 tuesday conroe texas Victim was on a scaffold doing repairs on an a... unknown unknown
2 3 2009-07-06 monday miami florida Victim was conducting roofing work, and fell t... unknown unknown
3 4 2009-07-09 thursday north platte nebraska Victim was climbing a ladder while servicing a... unknown unknown
4 5 2009-07-11 saturday greensburg pennsylvania Employees were throwing away old files. They h... unknown unknown

We now have a cleaned dataset in the file fatalities_cleaned.csv. As per the project description, we are required to perform our analysis using SQL after loading the cleaned dataset into a MySQL instance. Therefore, I have set up a MySQL instance on my local machine and created the necessary database, table, and user.
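
For reference, here is a minimal sketch of that one-time setup in the MySQL shell; the database, user, and password below are illustrative placeholders, not the actual values used:

CREATE DATABASE IF NOT EXISTS fatality_db;
CREATE USER 'fatality_user'@'localhost' IDENTIFIED BY 'strong_password';
GRANT ALL PRIVILEGES ON fatality_db.* TO 'fatality_user'@'localhost';
FLUSH PRIVILEGES;

The table itself does not have to be created by hand: loading the CSV (for example with pandas' to_sql, as sketched under Task 4) can create it automatically.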

In this notebook, we will perform the analysis using two methods:

  • pandas (method 1)
  • SQL (method 2)

First, we will load the dataset into a dataframe and establish a connection to the database we created.

In [8]:
# pandas dataframe
df_cleaned = pd.read_csv('./fatalities_cleaned.csv')
df_cleaned.head()
Out[8]:
id incident_date day_of_week city state description plan citation
0 1 2009-06-23 tuesday tampa florida Victim was inside a building bundling material... unknown unknown
1 2 2009-06-30 tuesday conroe texas Victim was on a scaffold doing repairs on an a... unknown unknown
2 3 2009-07-06 monday miami florida Victim was conducting roofing work, and fell t... unknown unknown
3 4 2009-07-09 thursday north platte nebraska Victim was climbing a ladder while servicing a... unknown unknown
4 5 2009-07-11 saturday greensburg pennsylvania Employees were throwing away old files. They h... unknown unknown

Connect to the MySQL database from the Jupyter notebook¶

In [9]:
from sqlalchemy import create_engine
import pymysql
In [10]:
## Define database details
user = 'b8e35269'
password = 'Cab#22se'
host = 'localhost'
port = 3306
database = 'b8e35269'
In [11]:
conn_string = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
conn_string
Out[11]:
'mysql+pymysql://b8e35269:Cab#22se@localhost:3306/b8e35269'
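
If the password contains characters that are special in URLs (such as '@' or '#'), it is safer to escape it first; a small sketch using the standard library:

from urllib.parse import quote_plus
conn_string = f"mysql+pymysql://{user}:{quote_plus(password)}@{host}:{port}/{database}"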
In [12]:
engine = create_engine(conn_string)
In [13]:
connection = engine.connect()

At this point we have both data sources in place, and we can query them to answer our analysis questions.

1. What is the number of reported incidents?¶

method 1 - using pandas dataframes¶

In [14]:
print("Number of reported incidents: {}".format(df_cleaned.shape[0]))
Number of reported incidents: 14914

method 2 - using SQL query¶

In [15]:
query = "select count(*) as n_fatalities from fatalities_cleaned;"
pd.read_sql(query, connection)
Out[15]:
n_fatalities
0 14914

2. What is the year to year change for the number of fatal incidents?¶

The task is to calculate the year-to-year percentage change (rounded to the nearest whole number) in the number of fatalities for each incident year, excluding the year 2022.
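
For example, fatalities rose from 515 in 2009 to 1110 in 2010, so the year-to-year change is round(100 * (1110 - 515) / 515) = round(115.5) = 116.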

In [16]:
# Convert the 'incident_date' column to a datetime object. This could also have been done at load time using the parse_dates option.

df_cleaned['incident_date'] = pd.to_datetime(df_cleaned.incident_date)
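# Equivalent at load time:
# df_cleaned = pd.read_csv('./fatalities_cleaned.csv', parse_dates=['incident_date'])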

method 1 - using pandas dataframes¶

In [17]:
# create a column to store the year information
df_cleaned['incident_year'] = df_cleaned.incident_date.dt.year
yearwise_incidents = df_cleaned.groupby('incident_year').agg('count')[['id']].reset_index()
yearwise_incidents.rename(columns={'id': 'n_fatalities'}, inplace=True)
yearwise_incidents['previous_year'] = yearwise_incidents.n_fatalities.shift(1)
yearwise_incidents['year_to_year'] = np.round(100 * (yearwise_incidents.n_fatalities - yearwise_incidents.previous_year)/yearwise_incidents.previous_year)
yearwise_incidents.iloc[:-1, :] # as we don't need the data for the year 2022 (the last row)
Out[17]:
incident_year n_fatalities previous_year year_to_year
0 2009 515 NaN NaN
1 2010 1110 515.0 116.0
2 2011 1185 1110.0 7.0
3 2012 997 1185.0 -16.0
4 2013 1189 997.0 19.0
5 2014 1345 1189.0 13.0
6 2015 1148 1345.0 -15.0
7 2016 1106 1148.0 -4.0
8 2017 1541 1106.0 39.0
9 2018 1260 1541.0 -18.0
10 2019 1376 1260.0 9.0
11 2020 1119 1376.0 -19.0
12 2021 950 1119.0 -15.0

method 2 - using SQL query¶

In [18]:
query = """With table_year_incidents as (
select year(incident_date) as incident_year, count(*) as n_fatalities
from fatalities_cleaned
group by year(incident_date)
order by 1
    )
select incident_year, n_fatalities,
lag(n_fatalities) over() as previous_year,
round(100 * (n_fatalities - lag(n_fatalities) over() )/
            (lag(n_fatalities) over())) as year_to_year
from table_year_incidents
where incident_year != 2022"""

pd.read_sql(query, connection)
Out[18]:
incident_year n_fatalities previous_year year_to_year
0 2009 515 NaN NaN
1 2010 1110 515.0 116.0
2 2011 1185 1110.0 7.0
3 2012 997 1185.0 -16.0
4 2013 1189 997.0 19.0
5 2014 1345 1189.0 13.0
6 2015 1148 1345.0 -15.0
7 2016 1106 1148.0 -4.0
8 2017 1541 1106.0 39.0
9 2018 1260 1541.0 -18.0
10 2019 1376 1260.0 9.0
11 2020 1119 1376.0 -19.0
12 2021 950 1119.0 -15.0

3. What is the number of fatalities that received a citation?¶

The task is to calculate the total number of fatalities that received a citation. Grouping by the citation column gives the full breakdown; the answer is the 'yes' count, 3345.

method 1 - using pandas dataframes¶

In [19]:
# print("Number of fatalities that received a citation: {}"
#       .format(len(df_cleaned[df_cleaned['citation'] != 'unknown'])))

citationwise_fatalities = df_cleaned.groupby('citation').agg('count')[['id']].reset_index()
citationwise_fatalities.rename(columns={'id':'n_fatalities'},inplace=True)
citationwise_fatalities.sort_values('n_fatalities', ascending=False)
Out[19]:
citation n_fatalities
1 unknown 8886
2 yes 3345
0 no 2683

method 2 - using SQL query¶

In [20]:
query = """select citation, count(*) as n_fatalities
from fatalities_cleaned 
group by citation
order by n_fatalities desc
"""
pd.read_sql(query, connection)
Out[20]:
citation n_fatalities
0 unknown 8886
1 yes 3345
2 no 2683

4. What day of the week has the most fatalities and what is the overall percentage?¶

The task is to find the day of the week with the most reported fatalities and its percentage of the overall total (rounded to two decimal places).
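
For example, Tuesday's share works out to round(100 * 2728 / 14914, 2) = 18.29 percent.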

method 1 - using pandas dataframes¶

In [21]:
day_of_week_fatalities = df_cleaned.groupby('day_of_week').agg(['count'])['id'].reset_index()
day_of_week_fatalities.rename(columns = {'count': 'n_fatalities'}, inplace=True)
day_of_week_fatalities['percentage'] = np.round(100*day_of_week_fatalities.n_fatalities / day_of_week_fatalities.n_fatalities.sum(),2)
day_of_week_fatalities.sort_values('percentage', ascending=False)
Out[21]:
day_of_week n_fatalities percentage
5 tuesday 2728 18.29
6 wednesday 2706 18.14
1 monday 2626 17.61
4 thursday 2612 17.51
0 friday 2335 15.66
2 saturday 1177 7.89
3 sunday 730 4.89

method 2 - using SQL query¶
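
In the query below, sum(count(*)) over () is a window function that returns the grand total across all groups, so each day's percentage can be computed in a single statement.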

In [22]:
query = """select day_of_week, count(*) as n_fatalities,
round(100 * count(*)/sum(count(*)) over () , 2) as percentage
from fatalities_cleaned
group by day_of_week
order by percentage desc
"""
pd.read_sql(query, connection)
Out[22]:
day_of_week n_fatalities percentage
0 tuesday 2728 18.29
1 wednesday 2706 18.14
2 monday 2626 17.61
3 thursday 2612 17.51
4 friday 2335 15.66
5 saturday 1177 7.89
6 sunday 730 4.89

5. What is the number of fatalities involving welding?¶

The task is to count the total number of fatalities that involved welding, by matching descriptions containing the substring 'weld' (which also catches 'welder', 'welding', and so on).

method 1 - using pandas dataframes¶

In [23]:
print("Number of fatalities involving welding: {}"
      .format(df_cleaned.description.str.lower().str.contains('weld').sum()))
Number of fatalities involving welding: 79

method 2 - using SQL query¶
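
Note that the SQL wildcard % is doubled as %% here: the query string passes through the driver's %-style parameter formatting, where a literal percent sign generally has to be escaped.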

In [24]:
query = """select count(*) as welding_fatalities
from fatalities_cleaned
where lower(description) like '%%weld%%'
"""
pd.read_sql(query, connection)
Out[24]:
welding_fatalities
0 79

6. Select the last 5 from the previous query¶

The task is to list the last 5 welding-related fatalities from the previous query.

method 1 - using pandas dataframes¶

In [25]:
weldings_filter = df_cleaned.description.str.lower().str.contains('weld')
print("Last 5 fatalities during welding:")
Last 5 fatalities during welding:
In [26]:
df_cleaned.loc[weldings_filter,:].sort_values(by='incident_date', ascending=False).head()
Out[26]:
id incident_date day_of_week city state description plan citation incident_year
9587 9588 2021-04-14 wednesday cleveland ohio Worker electrocuted by portable welding machine. federal yes 2021
9812 9813 2021-01-30 saturday mission texas Worker died in welding explosion. federal yes 2021
9999 10000 2020-12-10 thursday urbana ohio Worker fatally crushed by seam welder. federal yes 2020
10678 10679 2020-05-24 sunday dallas texas Worker electrocted while welding HVAC pipe. federal no 2020
11743 11744 2019-07-08 monday kingwood texas Worker electrocuted while welding air conditio... federal no 2019

method 2 - using SQL query¶

In [27]:
query = """select *
from fatalities_cleaned
where lower(description) like '%%weld%%'
order by incident_date desc
limit 5
"""
pd.read_sql(query, connection)
Out[27]:
id incident_date day_of_week city state description plan citation
0 9588 2021-04-14 wednesday cleveland ohio Worker electrocuted by portable welding machine. federal yes
1 9813 2021-01-30 saturday mission texas Worker died in welding explosion. federal yes
2 10000 2020-12-10 thursday urbana ohio Worker fatally crushed by seam welder. federal yes
3 10679 2020-05-24 sunday dallas texas Worker electrocted while welding HVAC pipe. federal no
4 11744 2019-07-08 monday kingwood texas Worker electrocuted while welding air conditio... federal no

7. Select the top 5 states with the most fatal incidents.¶

The task is to find the top 5 states with the most fatal incidents.

method 1 - using pandas dataframes¶

In [28]:
statewise_fatalities = df_cleaned.groupby('state').agg('count')[['id']].reset_index()
statewise_fatalities.rename(columns={'id':'n_fatalities'}, inplace=True)
print("Top 5 states which have most fatal incidents:")
statewise_fatalities.sort_values('n_fatalities', ascending=False).head()
Top 5 states which have most fatal incidents:
Out[28]:
state n_fatalities
47 texas 1758
5 california 1352
10 florida 1021
35 new york 726
15 illinois 635

method 2 - using SQL query¶

In [29]:
query = """select state, count(*) as n_fatalities
from fatalities_cleaned
group by state
order by n_fatalities desc
limit 5
"""
pd.read_sql(query, connection)
Out[29]:
state n_fatalities
0 texas 1758
1 california 1352
2 florida 1021
3 new york 726
4 illinois 635

8. What are the top 5 states that had the most workplace fatalities from stabbings?¶

The task is to find the top 5 states with the most workplace fatalities from stabbings, by matching descriptions containing the substring 'stabb'.

method 1 - using pandas dataframes¶

In [30]:
stabb_filter = df_cleaned.description.str.lower().str.contains('stabb')
stabbing_fatalities = df_cleaned[stabb_filter]
statewise_stabbing_fatalities = stabbing_fatalities.groupby('state').agg('count')['id'].reset_index()
statewise_stabbing_fatalities.rename(columns={'id':'stabbing_fatalities'}, inplace=True)
print("Top 5 states which have most fatal incidents:")
statewise_stabbing_fatalities.sort_values('stabbing_fatalities', ascending=False).head()
Top 5 states which have most fatal incidents:
Out[30]:
state stabbing_fatalities
8 new york 7
0 california 5
4 kentucky 5
2 illinois 3
1 connecticut 2

method 2 - using SQL query¶

In [31]:
query = """select state, count(*) as stabbing_fatalities
from fatalities_cleaned
where lower(description) like '%%stabb%%'
group by state
order by stabbing_fatalities desc
limit 5
"""
pd.read_sql(query, connection)
Out[31]:
state stabbing_fatalities
0 new york 7
1 california 5
2 kentucky 5
3 illinois 3
4 connecticut 2

9. What are the top 10 states that had the most workplace fatalities from shootings?¶

The task is to find the top 10 states with the most workplace fatalities from shootings, by matching descriptions containing the substring 'shot'.

method 1 - using pandas dataframes¶

In [32]:
shoot_filter = df_cleaned.description.str.lower().str.contains('shot')
shooting_fatalities = df_cleaned[shoot_filter]
statewise_shooting_fatalities = shooting_fatalities.groupby('state').agg('count')['id'].reset_index()
statewise_shooting_fatalities.rename(columns={'id':'n_fatalities'}, inplace=True)
print("Top 10 states which have most fatal incidents:")
statewise_shooting_fatalities.sort_values(['n_fatalities','state'], ascending=False).head(10)
Top 10 states which have most fatal incidents:
Out[32]:
state n_fatalities
13 indiana 28
4 california 23
40 texas 21
29 new york 20
8 florida 14
16 kentucky 13
34 oregon 9
25 nevada 9
12 illinois 9
44 washington 8

method 2 - using SQL query¶

In [33]:
query = """select state, count(*) as shooting_fatalities
from fatalities_cleaned
where lower(description) like '%%shot%%'
group by state
order by shooting_fatalities desc
limit 10
"""
pd.read_sql(query, connection)
Out[33]:
state shooting_fatalities
0 indiana 28
1 california 23
2 texas 21
3 new york 20
4 florida 14
5 kentucky 13
6 nevada 9
7 illinois 9
8 oregon 9
9 washington 8

10. What is the total number of shooting deaths per year?¶

The task is to count the total number of deaths caused by shootings each year, in decreasing order.

method 1 - using pandas dataframes¶

In [34]:
# Reuse the shooting_fatalities dataframe filtered in question 9
yearwise_shooting_fatalities = shooting_fatalities.groupby('incident_year').agg('count')[['id']].reset_index()
yearwise_shooting_fatalities.rename(columns={'id':'shooting_fatalities'}, inplace=True)
In [35]:
yearwise_shooting_fatalities.sort_values('shooting_fatalities', ascending=False)
Out[35]:
incident_year shooting_fatalities
12 2021 38
6 2015 28
7 2016 28
11 2020 27
10 2019 24
9 2018 21
4 2013 19
5 2014 18
1 2010 16
8 2017 14
2 2011 13
0 2009 10
3 2012 9
13 2022 2

method 2 - using SQL query¶

In [36]:
query = """select year(incident_date) as incident_year, count(*) as shooting_fatalities
from fatalities_cleaned
where lower(description) like '%%shot%%'
group by year(incident_date)
order by shooting_fatalities desc
"""
pd.read_sql(query, connection)
Out[36]:
incident_year shooting_fatalities
0 2021 38
1 2015 28
2 2016 28
3 2020 27
4 2019 24
5 2018 21
6 2013 19
7 2014 18
8 2010 16
9 2017 14
10 2011 13
11 2009 10
12 2012 9
13 2022 2

Once we are done with the analysis, we close the database connection.

In [37]:
connection.close()
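
Optionally, the engine's connection pool can be released as well:

engine.dispose()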

This marks the end of the notebook. If any modifications are made to this notebook, I will upload the revised version to the GitHub repo.

Thank you for reading. Have a great day!!