Author: Venkata Raghavendra Karthik Kanna
Last Modified: August 22, 2023

This notebook is an extension of the project conducted by HiCounselor. The project, titled "Time Series Analysis of Medical Appointments Using SQL", examines hospital appointment records to identify trends and patterns in appointment volumes, wait times, and patient no-shows, and to provide insights that could help improve appointment scheduling.

HiCounselor provides virtual sandbox machines/kernels where the project can be implemented, and I completed the project there as well. This notebook serves as an extension for anyone who wants to work on their own machine locally, using both Python DataFrames and SQL.

Software used:

  • Python (numpy, pandas, sqlalchemy, pymysql)
  • MySQL
In [1]:
import numpy as np
import pandas as pd
In [2]:
def read_csv():
    # Method to read the CSV file "Hospital_patients_datasets.csv" using pandas.
    # Returns: Pandas DataFrame containing the data from the CSV file.
    ds = pd.read_csv('./Hospital_patients_datasets.csv')
    
    return ds


def check_duplicates():
    ds = read_csv()
    # Method to check for duplicate rows in the DataFrame.
    # Returns: The number of duplicated rows found in the DataFrame.
    ds = ds.duplicated().sum()
    return ds


def check_null_values():
    ds = read_csv()
    # Method to check for null (missing) values in the DataFrame.
    # Returns: A pandas Series indicating the count of null values for each column in the DataFrame.
    ds = ds.isna().sum()
    return ds



def converting_dtype():
    ds = read_csv()
    # Method to convert 'ScheduledDay' and 'AppointmentDay' columns to datetime objects.
    # Returns: DataFrame with 'ScheduledDay' and 'AppointmentDay' columns converted to datetime objects.
    ds['ScheduledDay'] = pd.to_datetime(pd.to_datetime(ds.ScheduledDay).dt.date)
    ds['AppointmentDay'] = pd.to_datetime(pd.to_datetime(ds.AppointmentDay).dt.date)
    return ds


def rename_columns():
    ds = converting_dtype()
    # Method to rename some columns in the DataFrame.
    # Returns: DataFrame with certain column names changed to new names.
    ds.rename(columns={'Hipertension' : 'Hypertension', 
                   'Handcap' : 'Handicap',
                   'SMS_received' : 'SMSReceived', 
                   'No-show' : 'NoShow'}, inplace=True)
    
    return ds

The above code is stored in the module1.py file.

In [4]:
ds = rename_columns()
In [5]:
ds.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   PatientId       110527 non-null  float64       
 1   AppointmentID   110527 non-null  int64         
 2   Gender          110527 non-null  object        
 3   ScheduledDay    110527 non-null  datetime64[ns]
 4   AppointmentDay  110527 non-null  datetime64[ns]
 5   Age             110527 non-null  int64         
 6   Neighbourhood   110527 non-null  object        
 7   Scholarship     110527 non-null  int64         
 8   Hypertension    110527 non-null  int64         
 9   Diabetes        110527 non-null  int64         
 10  Alcoholism      110527 non-null  int64         
 11  Handicap        110527 non-null  int64         
 12  SMSReceived     110527 non-null  int64         
 13  NoShow          110527 non-null  object        
dtypes: datetime64[ns](2), float64(1), int64(8), object(3)
memory usage: 11.8+ MB
In [6]:
ds.describe()
Out[6]:
PatientId AppointmentID Age Scholarship Hypertension Diabetes Alcoholism Handicap SMSReceived
count 1.105270e+05 1.105270e+05 110527.000000 110527.000000 110527.000000 110527.000000 110527.000000 110527.000000 110527.000000
mean 1.474963e+14 5.675305e+06 37.088874 0.098266 0.197246 0.071865 0.030400 0.022248 0.321026
std 2.560949e+14 7.129575e+04 23.110205 0.297675 0.397921 0.258265 0.171686 0.161543 0.466873
min 3.920000e+04 5.030230e+06 -1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 4.172615e+12 5.640286e+06 18.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 3.173180e+13 5.680573e+06 37.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 9.439170e+13 5.725524e+06 55.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
max 9.999820e+14 5.790484e+06 115.000000 1.000000 1.000000 1.000000 1.000000 4.000000 1.000000
In [7]:
import module1 as m1
In [8]:
def drop_columns():
    ds = m1.rename_columns()
    # Function to drop unnecessary columns from the DataFrame.
    # Returns: DataFrame with specified columns dropped.
    ds.drop([ 'PatientId', 'AppointmentID', 'Neighbourhood'], axis='columns', inplace=True)
    return ds


def create_bin():
    ds = drop_columns()
    # First, drop rows with non-positive ages.
    # HiCounselor's reference result set keeps the row where Age == -1 (only Age == 0 is dropped there),
    # and that behaviour was followed on their platform to pass their checks.

    # For this analysis, however, only rows where Age > 0 are kept.
    ds.drop(ds[ds.Age <= 0].index, axis=0, inplace=True)
    # Equivalent: ds = ds[ds.Age > 0]
    # Generate labels for the age intervals (e.g., '1 - 21', '21 - 41', etc.)
    labels = ["{0} - {1}".format(i, i + 20) for i in range(1, 118, 20)]

    # Use pd.cut() to categorize ages into groups (bins=range(1, 130, 20), right=False, with the labels above)
    ds['Age_group'] = pd.cut(ds.Age, bins=range(1,130,20),right=False,labels=labels)
    # Returning the modified dataset with assigned age groups
    return ds


def drop():
    ds = create_bin()
    # Function to drop the original 'Age' column from the DataFrame.
    # Returns: DataFrame with the 'Age' column dropped.
    ds.drop('Age', axis='columns', inplace=True)
    return ds


def convert():
    ds = drop()
    # Function to convert 'NoShow' values into binary values (1 for 'Yes' and 0 for 'No').
    # Returns: DataFrame with 'NoShow' column values converted to 1s and 0s.
    ds.NoShow = ds.NoShow.map({'Yes':1,'No':0})
    
    return ds


def export_the_dataset():
    df = convert()
    # Export the cleaned dataset to 'patients.csv' without the index and return it as 'df'.
    df.to_csv('patients.csv', index=False)
    return df


# TASK 6: Load the cleaned dataset 'patients.csv' into the provided database as the "patients" table.
# On HiCounselor's platform this is done from the terminal by clicking "run project";
# locally, the table can be created once the MySQL connection is set up
# (see the to_sql sketch after the connection is established below).
# This "patients" table is what the MySQL analysis below queries.
In [9]:
export_the_dataset()
Out[9]:
Gender ScheduledDay AppointmentDay Scholarship Hypertension Diabetes Alcoholism Handicap SMSReceived NoShow Age_group
0 F 2016-04-29 2016-04-29 0 1 0 0 0 0 0 61 - 81
1 M 2016-04-29 2016-04-29 0 0 0 0 0 0 0 41 - 61
2 F 2016-04-29 2016-04-29 0 0 0 0 0 0 0 61 - 81
3 F 2016-04-29 2016-04-29 0 0 0 0 0 0 0 1 - 21
4 F 2016-04-29 2016-04-29 0 1 1 0 0 0 0 41 - 61
... ... ... ... ... ... ... ... ... ... ... ...
110522 F 2016-05-03 2016-06-07 0 0 0 0 0 1 0 41 - 61
110523 F 2016-05-03 2016-06-07 0 0 0 0 0 1 0 41 - 61
110524 F 2016-04-27 2016-06-07 0 0 0 0 0 1 0 21 - 41
110525 F 2016-04-27 2016-06-07 0 0 0 0 0 1 0 21 - 41
110526 F 2016-04-27 2016-06-07 0 0 0 0 0 1 0 41 - 61

106987 rows × 11 columns

In [10]:
ds = pd.read_csv('./patients.csv', parse_dates=['ScheduledDay', 'AppointmentDay'])
In [11]:
ds
Out[11]:
Gender ScheduledDay AppointmentDay Scholarship Hypertension Diabetes Alcoholism Handicap SMSReceived NoShow Age_group
0 F 2016-04-29 2016-04-29 0 1 0 0 0 0 0 61 - 81
1 M 2016-04-29 2016-04-29 0 0 0 0 0 0 0 41 - 61
2 F 2016-04-29 2016-04-29 0 0 0 0 0 0 0 61 - 81
3 F 2016-04-29 2016-04-29 0 0 0 0 0 0 0 1 - 21
4 F 2016-04-29 2016-04-29 0 1 1 0 0 0 0 41 - 61
... ... ... ... ... ... ... ... ... ... ... ...
106982 F 2016-05-03 2016-06-07 0 0 0 0 0 1 0 41 - 61
106983 F 2016-05-03 2016-06-07 0 0 0 0 0 1 0 41 - 61
106984 F 2016-04-27 2016-06-07 0 0 0 0 0 1 0 21 - 41
106985 F 2016-04-27 2016-06-07 0 0 0 0 0 1 0 21 - 41
106986 F 2016-04-27 2016-06-07 0 0 0 0 0 1 0 41 - 61

106987 rows × 11 columns

In [12]:
ds.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106987 entries, 0 to 106986
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   Gender          106987 non-null  object        
 1   ScheduledDay    106987 non-null  datetime64[ns]
 2   AppointmentDay  106987 non-null  datetime64[ns]
 3   Scholarship     106987 non-null  int64         
 4   Hypertension    106987 non-null  int64         
 5   Diabetes        106987 non-null  int64         
 6   Alcoholism      106987 non-null  int64         
 7   Handicap        106987 non-null  int64         
 8   SMSReceived     106987 non-null  int64         
 9   NoShow          106987 non-null  int64         
 10  Age_group       106987 non-null  object        
dtypes: datetime64[ns](2), int64(7), object(2)
memory usage: 9.0+ MB
In [13]:
## These packages may not come with the default Anaconda installation, so make sure they are installed on your machine.
#!pip install pymysql
#!pip install sqlalchemy
## A local MySQL installation and configuration is also needed so that a connection to the database can be established.

Connect to the MySQL database from the Jupyter notebook

In [68]:
from sqlalchemy import create_engine
import pymysql
In [69]:
## Define database details
user = 'b00d40e4'
password = 'Cab#22se'
host = 'localhost'
port = 3306
database = 'b00d40e4'
In [70]:
conn_string = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
conn_string
Out[70]:
'mysql+pymysql://b00d40e4:Cab#22se@localhost:3306/b00d40e4'
In [71]:
engine = create_engine(conn_string)
In [72]:
connection = engine.connect()
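
If the "patients" table does not already exist in the database, one way to create it locally is to push the cleaned DataFrame with pandas' to_sql. This is only a minimal sketch of that step (on HiCounselor's platform it is handled by running the project from the terminal), reusing the engine defined above and the patients.csv file exported earlier:

import pandas as pd

# Load the cleaned dataset produced by export_the_dataset()
df = pd.read_csv('./patients.csv', parse_dates=['ScheduledDay', 'AppointmentDay'])

# Write it to MySQL as the "patients" table through the SQLAlchemy engine created above.
# if_exists='replace' drops and recreates the table; use 'append' to keep an existing one.
df.to_sql('patients', con=engine, if_exists='replace', index=False, chunksize=1000)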

At this point we have our data sources from which we can query the information based on our analysis questions.

1. How many records are there in the given dataset?

In [73]:
query = """select count(*) as n_count from patients;
"""
pd.read_sql(query, connection)
Out[73]:
n_count
0 106987

2. Count the number of appointments for each day in the given dataset:

In [74]:
query = """SELECT AppointmentDay, count(*) as n_appointments
from patients
group by AppointmentDay
order by 1;
"""
pd.read_sql(query, connection)
Out[74]:
AppointmentDay n_appointments
0 2016-04-29 3104
1 2016-05-02 4214
2 2016-05-03 4129
3 2016-05-04 4048
4 2016-05-05 4113
5 2016-05-06 3791
6 2016-05-09 4352
7 2016-05-10 4177
8 2016-05-11 4347
9 2016-05-12 4233
10 2016-05-13 3885
11 2016-05-14 39
12 2016-05-16 4449
13 2016-05-17 4227
14 2016-05-18 4220
15 2016-05-19 4109
16 2016-05-20 3707
17 2016-05-24 3876
18 2016-05-25 3768
19 2016-05-30 4360
20 2016-05-31 4158
21 2016-06-01 4351
22 2016-06-02 4204
23 2016-06-03 3978
24 2016-06-06 4528
25 2016-06-07 4264
26 2016-06-08 4356
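
For reference, the same daily counts can be reproduced on the pandas side from the ds DataFrame read back above (a sketch, not part of the original SQL solution):

# Number of appointments per day
daily_counts = ds.groupby('AppointmentDay').size().rename('n_appointments')
print(daily_counts)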

3. Calculate the average number of appointments per day in the given dataset (rounded to the nearest whole number).

In [75]:
query = """with daily_appointments as (SELECT AppointmentDay, 
 count(*) as n_appointments
 from patients
 group by 1)
 
 select round(avg(n_appointments)) as average_appointments_per_day from daily_appointments;
"""
pd.read_sql(query, connection)
Out[75]:
average_appointments_per_day
0 3962.0
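
The same figure can be cross-checked in pandas (a sketch):

# Average number of appointments per day, rounded to the nearest whole number
avg_per_day = ds.groupby('AppointmentDay').size().mean().round()
print(avg_per_day)  # 3962.0, matching the SQL result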

4. Find the day with the highest number of appointments in the given dataset.

In [77]:
query = """with daily_appointments as (SELECT AppointmentDay, 
 count(*) as n_appointments
 from patients
 group by 1)
 
 select AppointmentDay,n_appointments
 from daily_appointments
 WHERE n_appointments = (select max(n_appointments) from daily_appointments);
"""
pd.read_sql(query, connection)
Out[77]:
AppointmentDay n_appointments
0 2016-06-06 4528
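
In pandas, the busiest day can be found with idxmax (a sketch):

# Day with the highest number of appointments
daily_counts = ds.groupby('AppointmentDay').size()
print(daily_counts.idxmax(), daily_counts.max())  # 2016-06-06 with 4528 appointments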

5. Calculate the monthly average number of appointments in the given dataset.

In [78]:
## %% is used so that the database driver treats it as a literal % rather than a parameter placeholder.
query = """SELECT date_format(AppointmentDay, '%%Y-%%m'), count(*) as n_appointments 
from patients group by 1
"""
pd.read_sql(query, connection)
Out[78]:
date_format(AppointmentDay, '%Y-%m') n_appointments
0 2016-04 3104
1 2016-05 78202
2 2016-06 25681
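
An equivalent monthly breakdown in pandas (a sketch):

# Number of appointments per month
monthly_counts = ds.groupby(ds['AppointmentDay'].dt.to_period('M')).size()
print(monthly_counts)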

6. Find the month with the highest number of appointments in the given dataset.

In [79]:
query = """WITH monthly_appointments as (
    SELECT date_format(AppointmentDay, '%%Y-%%m') as AppointmentMonth, 
    count(*) as n_appointments 
    from patients 
    group by 1)

select * from monthly_appointments
order by n_appointments desc
limit 1;
"""
pd.read_sql(query, connection)
Out[79]:
AppointmentMonth n_appointments
0 2016-05 78202

7. Calculate the weekly average number of appointments in the given dataset.

In [80]:
query = """select year(AppointmentDay) as year, week(AppointmentDay) as week, count(*) as n_appointments_week 
from patients
group by 1,2

"""
pd.read_sql(query, connection)
Out[80]:
year week n_appointments_week
0 2016 17 3104
1 2016 18 20295
2 2016 19 21033
3 2016 20 20712
4 2016 21 7644
5 2016 22 21051
6 2016 23 13148

8. Find the week with the highest number of appointments in the given dataset.

In [81]:
query = """select year(AppointmentDay) as year, week(AppointmentDay) as week, count(*) as n_appointments_week 
from patients
group by 1,2
order by n_appointments_week DESC
limit 1;
"""
pd.read_sql(query, connection)
Out[81]:
year week n_appointments_week
0 2016 22 21051

9. What is the distribution of appointments based on gender in the dataset?

In [82]:
query = """select gender, count(*) as n_appointments
from patients
group by 1;
"""
pd.read_sql(query, connection)
Out[82]:
gender n_appointments
0 F 70118
1 M 36869

10. Calculate the number of appointments per weekday in the given dataset.

In [83]:
query = """select date_format(AppointmentDay,'%%W') as weekday, count(*) as n_appointments
from patients
group by 1;
"""
pd.read_sql(query, connection)
Out[83]:
weekday n_appointments
0 Friday 18465
1 Tuesday 24831
2 Monday 21903
3 Wednesday 25090
4 Thursday 16659
5 Saturday 39
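
The weekday distribution can also be read straight off the datetime column in pandas (a sketch):

# Number of appointments per weekday
weekday_counts = ds['AppointmentDay'].dt.day_name().value_counts()
print(weekday_counts)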

11. Calculate the average time between scheduling and the appointment day in the given dataset.

In [84]:
query = """select round(avg(datediff(AppointmentDay,ScheduledDay))) as avg_wait_time
from patients
"""
pd.read_sql(query, connection)
Out[84]:
avg_wait_time
0 10.0
In [85]:
## There are a few rows where ScheduledDay > AppointmentDay, which should not normally happen.
## We can ignore these instances: there are only 5 of them out of 106987 rows, and the rounded
## average is not affected by them (as verified below).
In [86]:
query = """select round(datediff(AppointmentDay,ScheduledDay)) as wait_time
from patients
where round(datediff(AppointmentDay,ScheduledDay)) < 0
"""
pd.read_sql(query, connection)
Out[86]:
wait_time
0 -1
1 -1
2 -1
3 -6
4 -1
In [87]:
query = """select round(avg(datediff(AppointmentDay,ScheduledDay))) as avg_wait_time
from patients
where round(datediff(AppointmentDay,ScheduledDay)) >= 0

"""
pd.read_sql(query, connection)
Out[87]:
avg_wait_time
0 10.0
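
The same wait-time calculation on the pandas side, excluding the handful of negative values (a sketch):

# Days between scheduling and the appointment
wait_days = (ds['AppointmentDay'] - ds['ScheduledDay']).dt.days

# Average wait time, ignoring the few rows where the scheduled date is after the appointment day
avg_wait = round(wait_days[wait_days >= 0].mean())
print(avg_wait)  # about 10 days, matching the SQL result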
In [88]:
connection.close()

Once we are done with the analysis, we close the database connection.

This marks the end of the notebook. If any modifications are made to this notebook, I shall upload the revised version to the GitHub repo.

Thank you for reading. Have a great day!!