Author: Venkata Raghavendra Karthik Kanna
Last Modified: August 22, 2023

This notebook is an extension of the project conducted by HiCounselor. The project, titled "Time Series Analysis of Medical Appointments Using SQL", examines hospital appointment records to identify trends and patterns in appointment volumes, wait times, and patient no-shows, and to provide insights that could help improve appointment scheduling.

HiCounselor provides virtual sandbox machines/kernels where the project can be implemented, and I completed the project there as well. This notebook serves as an extension for anyone who wants to work on their own machine locally, using both Python DataFrames and SQL.

Software used:

  • Python (numpy, pandas, sqlalchemy, pymysql)
  • MySQL
In [1]:
import numpy as np
import pandas as pd
In [2]:
def read_csv():
    # Method to read the CSV file "Hospital_patients_datasets.csv" using pandas.
    # Returns: Pandas DataFrame containing the data from the CSV file.
    ds = pd.read_csv('./Hospital_patients_datasets.csv')
    
    return ds


def check_duplicates():
    ds = read_csv()
    # Method to check for duplicate rows in the DataFrame.
    # Returns: The number of duplicated rows found in the DataFrame.
    ds = ds.duplicated().sum()
    return ds


def check_null_values():
    ds = read_csv()
    # Method to check for null (missing) values in the DataFrame.
    # Returns: A pandas Series indicating the count of null values for each column in the DataFrame.
    ds = ds.isna().sum()
    return ds



def converting_dtype():
    ds = read_csv()
    # Method to convert 'ScheduledDay' and 'AppointmentDay' columns to datetime objects.
    # Returns: DataFrame with 'ScheduledDay' and 'AppointmentDay' columns converted to datetime objects.
    ds['ScheduledDay'] = pd.to_datetime(pd.to_datetime(ds.ScheduledDay).dt.date)
    ds['AppointmentDay'] = pd.to_datetime(pd.to_datetime(ds.AppointmentDay).dt.date)
    return ds


def rename_columns():
    ds = converting_dtype()
    # Method to rename some columns in the DataFrame.
    # Returns: DataFrame with certain column names changed to new names.
    ds.rename(columns={'Hipertension' : 'Hypertension', 
                   'Handcap' : 'Handicap',
                   'SMS_received' : 'SMSReceived', 
                   'No-show' : 'NoShow'}, inplace=True)
    
    return ds

The above code is stored in the module1.py file.

In [4]:
ds = rename_columns()
In [5]:
ds.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   PatientId       110527 non-null  float64       
 1   AppointmentID   110527 non-null  int64         
 2   Gender          110527 non-null  object        
 3   ScheduledDay    110527 non-null  datetime64[ns]
 4   AppointmentDay  110527 non-null  datetime64[ns]
 5   Age             110527 non-null  int64         
 6   Neighbourhood   110527 non-null  object        
 7   Scholarship     110527 non-null  int64         
 8   Hypertension    110527 non-null  int64         
 9   Diabetes        110527 non-null  int64         
 10  Alcoholism      110527 non-null  int64         
 11  Handicap        110527 non-null  int64         
 12  SMSReceived     110527 non-null  int64         
 13  NoShow          110527 non-null  object        
dtypes: datetime64[ns](2), float64(1), int64(8), object(3)
memory usage: 11.8+ MB
In [6]:
ds.describe()
Out[6]:
PatientId AppointmentID Age Scholarship Hypertension Diabetes Alcoholism Handicap SMSReceived
count 1.105270e+05 1.105270e+05 110527.000000 110527.000000 110527.000000 110527.000000 110527.000000 110527.000000 110527.000000
mean 1.474963e+14 5.675305e+06 37.088874 0.098266 0.197246 0.071865 0.030400 0.022248 0.321026
std 2.560949e+14 7.129575e+04 23.110205 0.297675 0.397921 0.258265 0.171686 0.161543 0.466873
min 3.920000e+04 5.030230e+06 -1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 4.172615e+12 5.640286e+06 18.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 3.173180e+13 5.680573e+06 37.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 9.439170e+13 5.725524e+06 55.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
max 9.999820e+14 5.790484e+06 115.000000 1.000000 1.000000 1.000000 1.000000 4.000000 1.000000
In [7]:
import module1 as m1
In [8]:
def drop_columns():
    ds = m1.rename_columns()
    # Function to drop unnecessary columns from the DataFrame.
    # Returns: DataFrame with specified columns dropped.
    ds.drop([ 'PatientId', 'AppointmentID', 'Neighbourhood'], axis='columns', inplace=True)
    return ds


def create_bin():
    ds = drop_columns()
    # First, drop rows with non-positive ages.
    # HiCounselor's reference result set keeps the row where Age == -1 (only Age == 0 is dropped there),
    # and that behaviour was followed on their platform to pass their checks.

    # For this analysis, however, only rows where Age > 0 are kept.
    ds.drop(ds[ds.Age <= 0].index, axis=0, inplace=True)
    # Equivalent: ds = ds[ds.Age > 0]
    # Generate labels for the age intervals (e.g., '1 - 21', '21 - 41', etc.)
    labels = ["{0} - {1}".format(i, i + 20) for i in range(1, 118, 20)]

    # Use pd.cut() to categorize ages into groups (bins=range(1, 130, 20), right=False, with the labels above)
    ds['Age_group'] = pd.cut(ds.Age, bins=range(1,130,20),right=False,labels=labels)
    # Returning the modified dataset with assigned age groups
    return ds


def drop():
    ds = create_bin()
    # Function to drop the original 'Age' column from the DataFrame.
    # Returns: DataFrame with the 'Age' column dropped.
    ds.drop('Age', axis='columns', inplace=True)
    return ds


def convert():
    ds = drop()
    # Function to convert 'NoShow' values into binary values (1 for 'Yes' and 0 for 'No').
    # Returns: DataFrame with 'NoShow' column values converted to 1s and 0s.
    ds.NoShow = ds.NoShow.map({'Yes':1,'No':0})
    
    return ds


def export_the_dataset():
    df = convert()
    # Export the cleaned dataset to 'patients.csv' without the index and return it as 'df'.
    df.to_csv('patients.csv', index=False)
    return df


# TASK 6: Load the cleaned dataset 'patients.csv' into the provided database as the "patients" table.
# On HiCounselor's platform this is done from the terminal by clicking "run project";
# locally, the table can be created once the MySQL connection is set up
# (see the to_sql sketch after the connection is established below).
# This "patients" table is what the MySQL analysis below queries.
In [9]:
export_the_dataset()
Out[9]:
Gender ScheduledDay AppointmentDay Scholarship Hypertension Diabetes Alcoholism Handicap SMSReceived NoShow Age_group
0 F 2016-04-29 2016-04-29 0 1 0 0 0 0 0 61 - 81
1 M 2016-04-29 2016-04-29 0 0 0 0 0 0 0 41 - 61
2 F 2016-04-29 2016-04-29 0 0 0 0 0 0 0 61 - 81
3 F 2016-04-29 2016-04-29 0 0 0 0 0 0 0 1 - 21
4 F 2016-04-29 2016-04-29 0 1 1 0 0 0 0 41 - 61
... ... ... ... ... ... ... ... ... ... ... ...
110522 F 2016-05-03 2016-06-07 0 0 0 0 0 1 0 41 - 61
110523 F 2016-05-03 2016-06-07 0 0 0 0 0 1 0 41 - 61
110524 F 2016-04-27 2016-06-07 0 0 0 0 0 1 0 21 - 41
110525 F 2016-04-27 2016-06-07 0 0 0 0 0 1 0 21 - 41
110526 F 2016-04-27 2016-06-07 0 0 0 0 0 1 0 41 - 61

106987 rows × 11 columns

In [10]:
ds = pd.read_csv('./patients.csv', parse_dates=['ScheduledDay', 'AppointmentDay'])
In [11]:
ds
Out[11]:
Gender ScheduledDay AppointmentDay Scholarship Hypertension Diabetes Alcoholism Handicap SMSReceived NoShow Age_group
0 F 2016-04-29 2016-04-29 0 1 0 0 0 0 0 61 - 81
1 M 2016-04-29 2016-04-29 0 0 0 0 0 0 0 41 - 61
2 F 2016-04-29 2016-04-29 0 0 0 0 0 0 0 61 - 81
3 F 2016-04-29 2016-04-29 0 0 0 0 0 0 0 1 - 21
4 F 2016-04-29 2016-04-29 0 1 1 0 0 0 0 41 - 61
... ... ... ... ... ... ... ... ... ... ... ...
106982 F 2016-05-03 2016-06-07 0 0 0 0 0 1 0 41 - 61
106983 F 2016-05-03 2016-06-07 0 0 0 0 0 1 0 41 - 61
106984 F 2016-04-27 2016-06-07 0 0 0 0 0 1 0 21 - 41
106985 F 2016-04-27 2016-06-07 0 0 0 0 0 1 0 21 - 41
106986 F 2016-04-27 2016-06-07 0 0 0 0 0 1 0 41 - 61

106987 rows × 11 columns

In [12]:
ds.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106987 entries, 0 to 106986
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   Gender          106987 non-null  object        
 1   ScheduledDay    106987 non-null  datetime64[ns]
 2   AppointmentDay  106987 non-null  datetime64[ns]
 3   Scholarship     106987 non-null  int64         
 4   Hypertension    106987 non-null  int64         
 5   Diabetes        106987 non-null  int64         
 6   Alcoholism      106987 non-null  int64         
 7   Handicap        106987 non-null  int64         
 8   SMSReceived     106987 non-null  int64         
 9   NoShow          106987 non-null  int64         
 10  Age_group       106987 non-null  object        
dtypes: datetime64[ns](2), int64(7), object(2)
memory usage: 9.0+ MB
In [13]:
## These packages may not come with the default Anaconda installation, so make sure they are installed on your machine.
#!pip install pymysql
#!pip install sqlalchemy
## A local MySQL installation and configuration is also needed so that a connection to the database can be established.

Connect to the MySQL database from the Jupyter notebook

In [68]:
from sqlalchemy import create_engine
import pymysql
In [69]:
## Define database details
user = 'b00d40e4'
password = 'Cab#22se'
host = 'localhost'
port = 3306
database = 'b00d40e4'
In [70]:
conn_string = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
conn_string
Out[70]:
'mysql+pymysql://b00d40e4:Cab#22se@localhost:3306/b00d40e4'
In [71]:
engine = create_engine(conn_string)
In [72]:
connection = engine.connect()
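
If the "patients" table does not already exist in the database, one way to create it locally is to push the cleaned DataFrame with pandas' to_sql. This is only a minimal sketch of that step (on HiCounselor's platform it is handled by running the project from the terminal), reusing the engine defined above and the patients.csv file exported earlier:

import pandas as pd

# Load the cleaned dataset produced by export_the_dataset()
df = pd.read_csv('./patients.csv', parse_dates=['ScheduledDay', 'AppointmentDay'])

# Write it to MySQL as the "patients" table through the SQLAlchemy engine created above.
# if_exists='replace' drops and recreates the table; use 'append' to keep an existing one.
df.to_sql('patients', con=engine, if_exists='replace', index=False, chunksize=1000)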

At this point we have our data sources from which we can query the information based on our analysis questions.

1. How many records are there in the given dataset?

In [73]:
query = """select count(*) as n_count from patients;
"""
pd.read_sql(query, connection)
Out[73]:
n_count
0 106987

2. Count the number of appointments for each day in the given dataset:

In [74]:
query = """SELECT AppointmentDay, count(*) as n_appointments
from patients
group by AppointmentDay
order by 1;
"""
pd.read_sql(query, connection)
Out[74]:
AppointmentDay n_appointments
0 2016-04-29 3104
1 2016-05-02 4214
2 2016-05-03 4129
3 2016-05-04 4048
4 2016-05-05 4113
5 2016-05-06 3791
6 2016-05-09 4352
7 2016-05-10 4177
8 2016-05-11 4347
9 2016-05-12 4233
10 2016-05-13 3885
11 2016-05-14 39
12 2016-05-16 4449
13 2016-05-17 4227
14 2016-05-18 4220
15 2016-05-19 4109
16 2016-05-20 3707
17 2016-05-24 3876
18 2016-05-25 3768
19 2016-05-30 4360
20 2016-05-31 4158
21 2016-06-01 4351
22 2016-06-02 4204
23 2016-06-03 3978
24 2016-06-06 4528
25 2016-06-07 4264
26 2016-06-08 4356
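
For reference, the same daily counts can be reproduced on the pandas side from the ds DataFrame read back above (a sketch, not part of the original SQL solution):

# Number of appointments per day
daily_counts = ds.groupby('AppointmentDay').size().rename('n_appointments')
print(daily_counts)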

3. Calculate the average number of appointments per day in the given dataset (rounded to the nearest whole number).

In [75]:
query = """with daily_appointments as (SELECT AppointmentDay, 
 count(*) as n_appointments
 from patients
 group by 1)
 
 select round(avg(n_appointments)) as average_appointments_per_day from daily_appointments;
"""
pd.read_sql(query, connection)
Out[75]:
average_appointments_per_day
0 3962.0
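
The same figure can be cross-checked in pandas (a sketch):

# Average number of appointments per day, rounded to the nearest whole number
avg_per_day = ds.groupby('AppointmentDay').size().mean().round()
print(avg_per_day)  # 3962.0, matching the SQL result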

4. Find the day with the highest number of appointments in the given dataset.

In [77]:
query = """with daily_appointments as (SELECT AppointmentDay, 
 count(*) as n_appointments
 from patients
 group by 1)
 
 select AppointmentDay,n_appointments
 from daily_appointments
 WHERE n_appointments = (select max(n_appointments) from daily_appointments);
"""
pd.read_sql(query, connection)
Out[77]:
AppointmentDay n_appointments
0 2016-06-06 4528
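
In pandas, the busiest day can be found with idxmax (a sketch):

# Day with the highest number of appointments
daily_counts = ds.groupby('AppointmentDay').size()
print(daily_counts.idxmax(), daily_counts.max())  # 2016-06-06 with 4528 appointments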

5. Calculate the monthly average number of appointments in the given dataset.

In [78]:
## %% is used so that the database driver treats it as a literal % rather than a parameter placeholder.
query = """SELECT date_format(AppointmentDay, '%%Y-%%m'), count(*) as n_appointments 
from patients group by 1
"""
pd.read_sql(query, connection)
Out[78]:
date_format(AppointmentDay, '%Y-%m') n_appointments
0 2016-04 3104
1 2016-05 78202
2 2016-06 25681
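
An equivalent monthly breakdown in pandas (a sketch):

# Number of appointments per month
monthly_counts = ds.groupby(ds['AppointmentDay'].dt.to_period('M')).size()
print(monthly_counts)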

6. Find the month with the highest number of appointments in the given dataset.

In [79]:
query = """WITH monthly_appointments as (
    SELECT date_format(AppointmentDay, '%%Y-%%m') as AppointmentMonth, 
    count(*) as n_appointments 
    from patients 
    group by 1)

select * from monthly_appointments
order by n_appointments desc
limit 1;
"""
pd.read_sql(query, connection)
Out[79]:
AppointmentMonth n_appointments
0 2016-05 78202

7. Calculate the weekly average number of appointments in the given dataset.

In [80]:
query = """select year(AppointmentDay) as year, week(AppointmentDay) as week, count(*) as n_appointments_week 
from patients
group by 1,2

"""
pd.read_sql(query, connection)
Out[80]:
year week n_appointments_week
0 2016 17 3104
1 2016 18 20295
2 2016 19 21033
3 2016 20 20712
4 2016 21 7644
5 2016 22 21051
6 2016 23 13148

8. Find the week with the highest number of appointments in the given dataset.

In [81]:
query = """select year(AppointmentDay) as year, week(AppointmentDay) as week, count(*) as n_appointments_week 
from patients
group by 1,2
order by n_appointments_week DESC
limit 1;
"""
pd.read_sql(query, connection)
Out[81]:
year week n_appointments_week
0 2016 22 21051

9. What is the distribution of appointments based on gender in the dataset?

In [82]:
query = """select gender, count(*) as n_appointments
from patients
group by 1;
"""
pd.read_sql(query, connection)
Out[82]:
gender n_appointments
0 F 70118
1 M 36869

10. Calculate the number of appointments per weekday in the given dataset.

In [83]:
query = """select date_format(AppointmentDay,'%%W') as weekday, count(*) as n_appointments
from patients
group by 1;
"""
pd.read_sql(query, connection)
Out[83]:
weekday n_appointments
0 Friday 18465
1 Tuesday 24831
2 Monday 21903
3 Wednesday 25090
4 Thursday 16659
5 Saturday 39
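
The weekday distribution can also be read straight off the datetime column in pandas (a sketch):

# Number of appointments per weekday
weekday_counts = ds['AppointmentDay'].dt.day_name().value_counts()
print(weekday_counts)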

11. Calculate the average time between scheduling and the appointment day in the given dataset.

In [84]:
query = """select round(avg(datediff(AppointmentDay,ScheduledDay))) as avg_wait_time
from patients
"""
pd.read_sql(query, connection)
Out[84]:
avg_wait_time
0 10.0
In [85]:
## There are a few rows where ScheduledDay > AppointmentDay, which should not normally happen.
## We can ignore these instances: there are only 5 of them out of 106987 rows, and the rounded
## average is not affected by them (as verified below).
In [86]:
query = """select round(datediff(AppointmentDay,ScheduledDay)) as wait_time
from patients
where round(datediff(AppointmentDay,ScheduledDay)) < 0
"""
pd.read_sql(query, connection)
Out[86]:
wait_time
0 -1
1 -1
2 -1
3 -6
4 -1
In [87]:
query = """select round(avg(datediff(AppointmentDay,ScheduledDay))) as avg_wait_time
from patients
where round(datediff(AppointmentDay,ScheduledDay)) >= 0

"""
pd.read_sql(query, connection)
Out[87]:
avg_wait_time
0 10.0
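
The same wait-time calculation on the pandas side, excluding the handful of negative values (a sketch):

# Days between scheduling and the appointment
wait_days = (ds['AppointmentDay'] - ds['ScheduledDay']).dt.days

# Average wait time, ignoring the few rows where the scheduled date is after the appointment day
avg_wait = round(wait_days[wait_days >= 0].mean())
print(avg_wait)  # about 10 days, matching the SQL result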
In [88]:
connection.close()

Once we are done with the analysis, we close the database connection.

This marks the end of the notebook. If any modifications are made to this notebook, I shall upload the revised version to the GitHub repo.

Thank you for reading. Have a great day!!