import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")


df_2006 = pd.read_csv(r"D:/et4_e/coursework/2021/2006.csv.bz2")
df_2007 = pd.read_csv(r"D:/et4_e/coursework/2021/2007.csv.bz2")
airport_df = pd.read_csv(r"D:/et4_e/coursework/2021/airports.csv")
carrier_df = pd.read_csv(r"D:/et4_e/coursework/2021/carriers.csv")
planes_df = pd.read_csv(r"D:/et4_e/coursework/2021/plane-data.csv")
merged_df = pd.concat([df_2007,df_2006],ignore_index=True)


# To view all columns
pd.set_option('display.max_columns', None)


merged_df.head()


# Find dimension of merged data
print(df_2006.shape,df_2007.shape)
merged_df.shape

(7141922, 29) (7453215, 29)

(14595137, 29)


merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14595137 entries, 0 to 14595136
Data columns (total 29 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Year               int64  
 1   Month              int64  
 2   DayofMonth         int64  
 3   DayOfWeek          int64  
 4   DepTime            float64
 5   CRSDepTime         int64  
 6   ArrTime            float64
 7   CRSArrTime         int64  
 8   UniqueCarrier      object 
 9   FlightNum          int64  
 10  TailNum            object 
 11  ActualElapsedTime  float64
 12  CRSElapsedTime     float64
 13  AirTime            float64
 14  ArrDelay           float64
 15  DepDelay           float64
 16  Origin             object 
 17  Dest               object 
 18  Distance           int64  
 19  TaxiIn             int64  
 20  TaxiOut            int64  
 21  Cancelled          int64  
 22  CancellationCode   object 
 23  Diverted           int64  
 24  CarrierDelay       int64  
 25  WeatherDelay       int64  
 26  NASDelay           int64  
 27  SecurityDelay      int64  
 28  LateAircraftDelay  int64  
dtypes: float64(7), int64(17), object(5)
memory usage: 3.2+ GB


merged_df.describe().T


# Correlation Matrix
corrmat = merged_df.corr()
sns.heatmap(corrmat, square=True)
plt.show()


# Checking the amount of cancelled flights
merged_df.Cancelled.sum()

282682


# Check for missing value
merged_df.isna().sum()

Year                        0
Month                       0
DayofMonth                  0
DayOfWeek                   0
DepTime                282682
CRSDepTime                  0
ArrTime                316047
CRSArrTime                  0
UniqueCarrier               0
FlightNum                   0
TailNum                    22
ActualElapsedTime      316047
CRSElapsedTime            998
AirTime                316047
ArrDelay               316047
DepDelay               282682
Origin                      0
Dest                        0
Distance                    0
TaxiIn                      0
TaxiOut                     0
Cancelled                   0
CancellationCode     14312454
Diverted                    0
CarrierDelay                0
WeatherDelay                0
NASDelay                    0
SecurityDelay               0
LateAircraftDelay           0
dtype: int64


# Check for duplicates
len(merged_df[merged_df.duplicated()])

34


# Drop duplicates
merged_df = merged_df.drop_duplicates()


# Create Date column
merged_df['Date'] = pd.to_datetime(dict(year=merged_df.Year, month=merged_df.Month, day=merged_df.DayofMonth))
merged_df.sample(3)[['Year','Month','DayofMonth','Date']]


# Create labelled column for easier visualisation
merged_df['Month_label'] = pd.to_datetime(merged_df['Date']).dt.month_name().str[:3]
merged_df['Day_label'] = pd.to_datetime(merged_df['Date']).dt.day_name().str[:3]


# Rename 'Cancelled' and 'Diverted' columns from numeric values to categorical values
merged_df['Cancelled'] = merged_df['Cancelled'].replace([1, 0], ["Cancelled", "Not Cancelled"])
merged_df['Diverted'] = merged_df['Diverted'].replace([1, 0], ["Diverted", "Not Diverted"])


# Rename CancellationCodes
CancellationCodes = {"A" : "Carrier", "B" : "Weather", "C" : "National Air System (NAS)", "D" : "Security"}
merged_df["CancellationCode"].replace(CancellationCodes, inplace=True)


# Change selected data types (numeric to categorical)
merged_df['Year'] = merged_df['Year'].astype(object)
merged_df['Month'] = merged_df['Month'].astype(object)
merged_df['DayofMonth'] = merged_df['DayofMonth'].astype(object)
merged_df['DayOfWeek'] = merged_df['DayOfWeek'].astype(object)
merged_df['FlightNum'] = merged_df['FlightNum'].astype(object)
merged_df['Cancelled'] = merged_df['Cancelled'].astype(object)
merged_df['Diverted'] = merged_df['Diverted'].astype(object)


# Imputate Null values with interpolation
merged_df['DepTime'] = merged_df['DepTime'].interpolate(limit_direction='both', axis=0) 
merged_df['ArrTime'] = merged_df['ArrTime'].interpolate(limit_direction='both', axis=0) 
merged_df['ActualElapsedTime'] = merged_df['ActualElapsedTime'].interpolate(limit_direction ='both', axis=0) 
merged_df['CRSElapsedTime'] = merged_df['CRSElapsedTime'].interpolate(limit_direction ='both', axis=0)
merged_df['AirTime'] = merged_df['AirTime'].interpolate(limit_direction ='both', axis=0) 
merged_df['ArrDelay'] = merged_df['ArrDelay'].interpolate(limit_direction ='both', axis=0) 
merged_df['DepDelay'] = merged_df['DepDelay'].interpolate(limit_direction ='both', axis=0)


# Remove the 22 missing TailNum data since it is insignificant compared to the large dataset
merged_df = merged_df[merged_df['TailNum'].notna()]


# Checking for missing values
merged_df.isnull().sum()

Year                        0
Month                       0
DayofMonth                  0
DayOfWeek                   0
DepTime                     0
CRSDepTime                  0
ArrTime                     0
CRSArrTime                  0
UniqueCarrier               0
FlightNum                   0
TailNum                     0
ActualElapsedTime           0
CRSElapsedTime              0
AirTime                     0
ArrDelay                    0
DepDelay                    0
Origin                      0
Dest                        0
Distance                    0
TaxiIn                      0
TaxiOut                     0
Cancelled                   0
CancellationCode     14312420
Diverted                    0
CarrierDelay                0
WeatherDelay                0
NASDelay                    0
SecurityDelay               0
LateAircraftDelay           0
Date                        0
Month_label                 0
Day_label                   0
dtype: int64


# Creating new column showing ArrDelay > 15mins
# 0 = No Delay, 1= Delay

merged_df['DelayStatus'] = np.where(merged_df['ArrDelay']>15, '1','0')
merged_df


merged_df.DelayStatus.value_counts()

0    11169298
1     3425783
Name: DelayStatus, dtype: int64


merged_df.DelayStatus.value_counts(normalize=True)

0    0.765278
1    0.234722
Name: DelayStatus, dtype: float64


# Delayed Flights in Year 2006
merged_df.loc[merged_df['Year'] == 2006].DelayStatus.value_counts(normalize=True)

0    0.775126
1    0.224874
Name: DelayStatus, dtype: float64


# Delayed Flights in Year 2007
merged_df.loc[merged_df['Year'] == 2007].DelayStatus.value_counts(normalize=True)

0    0.755842
1    0.244158
Name: DelayStatus, dtype: float64


merged_df_sample = merged_df.sample(frac=0.10, replace=False, random_state = 42)

flight_cancelled = merged_df_sample.query("Cancelled == 'Cancelled'")
flight_notcancelled = merged_df_sample.query("Cancelled == 'Not Cancelled'")


merged_df_sample.shape

(1459508, 33)


sns.boxplot(x=merged_df_sample["Month"], y=merged_df_sample["ArrDelay"] )
plt.title('Boxplot Distribution of ArrDelay (mins) by Month')

Text(0.5, 1.0, 'Boxplot Distribution of ArrDelay (mins) by Month')


plt.figure(figsize=(16,5))
plt.subplot(1,2,1)
sns.distplot(merged_df_sample['ArrDelay'])
plt.subplot(1,2,2)
sns.distplot(merged_df_sample['DepDelay'])

plt.show()


len(airport_df.iata.unique())

3376


print('There are ', len(flight_notcancelled.Origin.unique()), ' Unique Origin Airport Codes.')
print('There are ', len(flight_notcancelled.Dest.unique()), ' Unique Destination Airport Codes.')

There are  305  Unique Origin Airport Codes.
There are  306  Unique Destination Airport Codes.


df_origin = flight_notcancelled.Origin.unique()
df_dest = flight_notcancelled.Dest.unique()


difference_1 = [item for item in df_origin if item not in df_dest]
print(difference_1 , 'is in Origin but not in Dest.')

['PIR', 'GLH'] is in Origin but not in Dest.


difference_2 = [item for item in df_dest if item not in df_origin]
print(difference_2 , 'are in Dest but not in Origin.')

['PVU', 'PUB', 'OGD'] are in Dest but not in Origin.


# Finding total flights from above difference_2
flight_notcancelled.query('(Dest == "PVU") | (Dest == "PUB") | (Dest == "OGD")').Dest.count()

4


flight_notcancelled.query('(Origin == "GLH")|(Origin == "PIR")').Origin.count()

2


print("Percentage count of each month:")
print(merged_df.Month.value_counts(normalize=True))

print("Percentage count of each day:")
print(merged_df.DayOfWeek.value_counts(normalize=True))

Percentage count of each month:
8     0.087838
7     0.087001
3     0.085262
10    0.085077
5     0.084585
6     0.084110
12    0.083514
1     0.082414
4     0.082219
11    0.081626
9     0.081200
2     0.075152
Name: Month, dtype: float64
Percentage count of each day:
1    0.148030
5    0.147877
4    0.147357
3    0.146063
2    0.144493
7    0.140599
6    0.125581
Name: DayOfWeek, dtype: float64


# Full data
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(14, 6))

# bar plot for cancellation
ax1.set_title('Month (%)')
merged_df.Month_label.value_counts(normalize=True).plot.bar(color='cornflowerblue', width=0.9, ax=ax1)

# bar plot for diverted
ax2.set_title('DayOfWeek (%)')
p1= merged_df.Day_label.value_counts(normalize=True).plot.bar(color='cornflowerblue', width=0.9, ax=ax2)

plt.show()


# Sample
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(14, 6))

# bar plot for cancellation
ax1.set_title('Month (%)')
merged_df_sample.Month_label.value_counts(normalize=True).plot.bar(color = 'cornflowerblue', width=0.9, ax=ax1)

# bar plot for diverted
ax2.set_title('DayOfWeek (%)')
p1= merged_df_sample.Day_label.value_counts(normalize=True).plot.bar(color = 'cornflowerblue', width=0.9, ax=ax2)

plt.show()


plt.figure(figsize=(10, 4))
merged_df['UniqueCarrier'].value_counts().sort_values().plot(kind = 'bar',color='cornflowerblue')
plt.title('Total number of flights per Airline')
plt.xlabel('UniqueCarrier/Airline')
plt.ylabel('Number of flights (Sum)')

plt.show()


carrier_df.query("Code == 'WN' or Code == 'AA' or Code == 'OO'or Code == 'MQ'or Code == 'UA'")


plt.hist(x=flight_notcancelled['Distance'], bins='auto')
plt.xlabel('Distance (miles)')
plt.ylabel('Frequency')

meandist = flight_notcancelled.Distance.mean()
plt.axvline(x = meandist, color = 'red', linestyle = '--', label = 'Mean Distance')
print('The Mean Distance for Non-Cancelled flights is', meandist, 'miles.')

plt.xlim([0, 2500])
plt.ylim([0, 33000])
plt.xlabel('Distance (miles)')
plt.ylabel('Total Count of Flights')
plt.legend()
plt.title('Distribution of Distance for Non-Cancelled flights')

plt.show()

The Mean Distance for Non-Cancelled flights is 726.5590713246221 miles.


plt.hist(x=flight_cancelled['Distance'], bins='auto')
plt.xlabel('Distance (miles)')
plt.ylabel('Frequency')

meandist = flight_cancelled.Distance.mean()
plt.axvline(x = meandist, color = 'red', linestyle = '--', label = 'Mean Distance')
print('The Mean Distance for Cancelled flights is', meandist, 'miles.')

plt.xlim([0, 2000])
plt.ylim([0, 2300])
plt.xlabel('Distance (miles)')
plt.ylabel('Total Count of Flights')
plt.legend()
plt.title('Distribution of Distance for Cancelled flights')

plt.show()

The Mean Distance for Cancelled flights is 567.0817275275915 miles.


sns.countplot(data=merged_df, y = 'CancellationCode', color = 'cornflowerblue',
            order = merged_df['CancellationCode'].value_counts().index)
plt.ylabel('Cancellation Reason')

Text(0, 0.5, 'Cancellation Reason')


merged_df['CancellationCode'].value_counts(normalize=True)

Carrier                      0.433134
Weather                      0.353183
National Air System (NAS)    0.212990
Security                     0.000693
Name: CancellationCode, dtype: float64


plt.figure(figsize = (10,4))

order = flight_cancelled['CancellationCode'].value_counts().index
sns.countplot(data = flight_cancelled, x = 'CancellationCode', order = order , color = 'cornflowerblue')

# Include count and % 
for bar in range(flight_cancelled['CancellationCode'].value_counts().shape[0]):
    count = flight_cancelled['CancellationCode'].value_counts()[bar]
    pct = (round(flight_cancelled['CancellationCode'].value_counts(normalize=True), 5)*100).round(2)[bar]
    plt.text(x=bar, y=count, s=f"{count} ({pct}%)", va='top', ha='center')

plt.xlabel('Cancellation Reason')
plt.ylabel('Count')
plt.title('Cancellation Reasons Count')

plt.show()


# Cancelled Flights in Year 2006
merged_df.loc[merged_df['Year'] == 2006].Cancelled.value_counts(normalize=True)

Not Cancelled    0.982927
Cancelled        0.017073
Name: Cancelled, dtype: float64


# Cancelled Flights in Year 2007
merged_df.loc[merged_df['Year'] == 2007].Cancelled.value_counts(normalize=True)

Not Cancelled    0.978435
Cancelled        0.021565
Name: Cancelled, dtype: float64


plt.figure(figsize = (10,4))
order =  flight_cancelled.UniqueCarrier.value_counts().index
sns.countplot(data = flight_cancelled , x = 'UniqueCarrier', order = order , color = 'cornflowerblue')

plt.xlabel('UniqueCarrier')
plt.ylabel('Total Count of Flights')
plt.title('Number of Cancelled Flights by UniqueCarrier')

plt.show()


carrier_df.query("Code == 'MQ' or Code == 'AA' or Code == 'OO'")


plt.figure(figsize = (10,4))

order = flight_cancelled['Month_label'].value_counts().index
sns.countplot(data = flight_cancelled , x = 'Month_label', order = order , color = 'cornflowerblue')

plt.xlabel('Month')
plt.ylabel('Count')
plt.title('Cancellation Flights per month Count')

plt.show()


plt.figure(figsize = (10,4))

# Plot
order = flight_cancelled['Day_label'].value_counts().index
sns.countplot(data = flight_cancelled , x = 'Day_label', order = order , color = 'cornflowerblue')

plt.xlabel('Day of Week')
plt.ylabel('Count')
plt.title('Cancellation Flights per Day Count')

plt.show()


fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(10, 4))

# bar plot for cancellation
ax1.set_title('Cancelled Vs. Not Cancelled (%)')
merged_df_sample.Cancelled.value_counts(normalize=True).plot.bar(color = 'cornflowerblue', width=0.9, ax=ax1)

# bar plot for diverted
ax2.set_title('Diverted Vs. Not Diverted (%)')
p1= merged_df_sample.Diverted.value_counts(normalize=True).plot.bar(color = 'cornflowerblue', width=0.9, ax=ax2)

# bar plot for delayed
ax3.set_title('Delayed Vs. Not Delayed (%)')
p1= merged_df_sample.DelayStatus.value_counts(normalize=True).plot.bar(color = 'cornflowerblue', width=0.9, ax=ax3)

plt.show()


plt.figure(figsize=(10,4))

# plot
sns.scatterplot(x='DepDelay', y='ArrDelay', data=merged_df_sample, color = 'cornflowerblue')

# line to know max of all scores
line = max(max(merged_df['DepDelay']) , max(merged_df['ArrDelay']))
plt.plot([0,line], [0,line], color = 'cadetblue')

plt.xlabel('Departure Delay (min)')
plt.ylabel('Arrival Delay (min)')
plt.title('Relationship between DepDelay & ArrDelay')

plt.xlim([0, 2000])
plt.ylim([0, 2000])

plt.show()


plt.figure(figsize=(10,4))

# plot
sns.scatterplot(x='Distance', y='ArrDelay', data=merged_df_sample, color = 'cornflowerblue')

# Mean line
meandist = merged_df_sample.Distance.mean()
plt.axvline(x = meandist, color = 'red', linestyle = '--', label = 'Mean Distance')
print('The Mean Distance is', meandist, 'miles.')

plt.legend()
plt.xlabel('Distance (miles)')
plt.ylabel('Arrival Delay (min)')
plt.title('Relationship between Distance & ArrDelay')

plt.xlim([0, 4500])
plt.ylim([0, 2000])

plt.show()

The Mean Distance is 723.4800117573867 miles.


# Create new dataframe considering only delayed flights
delay15 = flight_notcancelled[flight_notcancelled.ArrDelay > 15]


diff_delay = delay15.filter(['Month','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay'], axis=1)
diff_delay = diff_delay.groupby('Month')['LateAircraftDelay','CarrierDelay','NASDelay','WeatherDelay','SecurityDelay'].mean().plot()

plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.xlabel('Month')
plt.ylabel('Average Delay (mins)')
plt.title('Average Individual Delay by Month (ArrDelay>15)')

plt.show()


merged_df[['UniqueCarrier','ArrDelay','DepDelay']].groupby(['UniqueCarrier']).sum().sort_values(by='DepDelay').plot(kind='bar')

plt.legend()
plt.ylabel('Total Delay (mins)')
plt.title("Total Delay of flights (non-cancelled)")

plt.show()


carrier_df.query("Code == 'WN' or Code == 'AA' or Code == 'MQ'or Code == 'UA'or Code == 'OO'")


# Create copy of dataframe
df_q1 = flight_notcancelled.copy()


# Categorising ArrTime and DepTime by 6 periods: Midnight, Early Morning, Late Morning, Afternoon, Evening, Night
def time_interval(cols):
    if cols >= 500 and cols < 900:
        return "Early Morning"
    elif cols >= 900 and cols < 1200:
        return "Late Morning"
    elif cols >= 1200 and cols < 1700:
        return "Afternoon"
    elif cols >= 1700 and cols < 2100:
        return "Evening"
    elif cols >= 2100 and cols < 2400:
        return "Night"
    else:
        return "Midnight"
    
# Getting Time Period for ArrTime and DepTime
df_q1['ArrPeriod'] = df_q1["ArrTime"].apply(time_interval)
df_q1['DepPeriod'] = df_q1["DepTime"].apply(time_interval)

# Setting order for the Period
df_q1['ArrPeriod'] = pd.Categorical(df_q1['ArrPeriod'],
                                   categories=['Midnight','Early Morning','Late Morning', 'Afternoon', 'Evening', 'Night'],
                                   ordered=True)
df_q1['DepPeriod'] = pd.Categorical(df_q1['DepPeriod'],
                                   categories=['Midnight','Early Morning','Late Morning', 'Afternoon', 'Evening', 'Night'],
                                   ordered=True)


# Comparing Delay Status Distribution
plt.figure(figsize=(7,4))

sns.countplot(x="ArrPeriod", hue="DelayStatus", data=df_q1, palette="Set2")

plt.ylabel('Total Delay (Mins)')
plt.title("Distribution of Delay Status across DayOfWeek")

plt.show()


plt.figure(figsize=(7,4))

# Plot for DepDelay
x= df_q1.groupby('DepPeriod').mean().index
y= df_q1.groupby('DepPeriod').DepDelay.mean().values
sns.lineplot(x, y, marker='.', markersize=15, color='cornflowerblue', label='DepDelay')

# Plot for ArrDelay
x1= df_q1.groupby('ArrPeriod').mean().index
y2= df_q1.groupby('ArrPeriod').ArrDelay.mean().values
sns.lineplot(x1, y2, marker='.', markersize=15, color='green', label='ArrDelay')

plt.legend()
plt.xlabel('Time Period')
plt.ylabel('Average Delay (mins)')
plt.title("Average Delay vs Time Period")

plt.show()


df_q1.groupby('ArrPeriod')['ArrDelay','DepDelay'].mean()


# Bar Chart
df_q1[['ArrPeriod','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay']].groupby(['ArrPeriod']).mean().sort_values(by='ArrPeriod').plot(kind='bar')

plt.legend()
plt.ylabel('Average Delay (mins)')
plt.title("Average Individual Delay of flights")

plt.show()


# Comparing Delay Status Distribution

sns.countplot(x="DayOfWeek", hue="DelayStatus", data=flight_notcancelled, palette="Set2")

plt.ylabel('Total Delay (Mins)')
plt.title("Distribution of Delay Status across DayOfWeek")

plt.show()


# Bar Chart
flight_notcancelled[['DayOfWeek','ArrDelay','DepDelay']].groupby(['DayOfWeek']).mean().sort_values(by='DayOfWeek').plot(kind='bar')

plt.legend()
plt.ylabel('Average Delay (mins)')
plt.title("Average Delay of flights")

plt.show()


# Line Plot
flight_notcancelled.groupby('DayOfWeek')['ArrDelay','DepDelay'].mean().plot()

plt.legend()
plt.xlabel('DayOfWeek')
plt.ylabel('Average Delay (mins)')
plt.title('Average Arr/Dep Delay by DayOfWeek')

plt.show()


flight_notcancelled.groupby('DayOfWeek')['ArrDelay','DepDelay'].mean()


# Bar Chart
flight_notcancelled[['DayOfWeek','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay']].groupby(['DayOfWeek']).mean().sort_values(by='DayOfWeek').plot(kind='bar')

plt.legend()
plt.ylabel('Average Delay (mins)')
plt.title("Average Individual Delay of flights")

plt.show()


# Comparing Delay Status Distribution
plt.figure(figsize=(8,4))

sns.countplot(x="Month", hue="DelayStatus", data=flight_notcancelled, palette="Set2")

plt.ylabel('Total Delay (Mins)')
plt.title("Distribution of Delay Status across Month")

plt.show()


# Bar Chart
plt.figure(figsize=(8,4))

flight_notcancelled[['Month','ArrDelay','DepDelay']].groupby(['Month']).mean().sort_values(by='Month').plot(kind='bar')

plt.legend()
plt.ylabel('Average Delay (mins)')
plt.title("Average Delay of flights")

plt.show()

<Figure size 576x288 with 0 Axes>


# Line Plot
flight_notcancelled.groupby('Month')['ArrDelay','DepDelay'].mean().plot()

plt.xlabel('Month')
plt.ylabel('Average Delay (mins)')
plt.title('Average Arr/Dep Delay by Month')

plt.show()


flight_notcancelled.groupby('Month')['ArrDelay','DepDelay'].mean()


# Bar Chart
flight_notcancelled[['Month','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay']].groupby(['Month']).mean().sort_values(by='Month').plot(kind='bar')

plt.legend()
plt.ylabel('Average Delay (mins)')
plt.title("Average Individual Delay of flights")

plt.show()


# Line Plot
flight_notcancelled.groupby('DayofMonth')['ArrDelay','DepDelay'].mean().plot()

plt.xlabel('DayofMonth')
plt.ylabel('Average Delay (mins)')
plt.title('Average Arr/Dep Delay within Month')

plt.show()


flight_notcancelled.groupby('DayofMonth')['ArrDelay','DepDelay'].mean()


# Create subset with selected columns
df_FD = flight_notcancelled[['UniqueCarrier', 'DelayStatus']]


# Converting to integer type
df_FD['DelayStatus'] = df_FD['DelayStatus'].astype(int)
df_FD.DelayStatus.value_counts()

0    1102017
1     329312
Name: DelayStatus, dtype: int64


# Group the Carriers, add DelayStatus values
try1 = df_FD.groupby(['UniqueCarrier']).sum().reset_index()

# Create dataframe to add all flights
try2 = flight_notcancelled.UniqueCarrier.value_counts().rename_axis('UniqueCarrier').reset_index(name='TotalFlights')

# Create new dataframe with merged data
d_tgt = pd.merge(try1, try2, on='UniqueCarrier')

d_tgt.head()


# Adding column to see the percentage of delayed flights
d_tgt['PercentageDelayed'] = round((d_tgt['DelayStatus']*100)/d_tgt['TotalFlights'], 2)
d_tgt.rename({'DelayStatus': 'DelayedFlights'}, axis=1, inplace=True)


# Set threshold of value of delayed flights as 23.47%
plt.figure(figsize=(10, 4))
d_tgt.groupby('UniqueCarrier').PercentageDelayed.sum().sort_values(ascending=False).plot.bar(color ='cornflowerblue')
plt.hlines(y=23.47, xmin=-2, xmax=21, colors='r', linestyles='--', label='Threshold (23.47%)')

plt.title('Delayed Flights (%) per Airline')
plt.xlabel('UniqueCarrier/Airline')
plt.ylabel('Delayed Flights (%)')
plt.legend()

plt.show()


# Create a new dataframe
df_q2 = flight_notcancelled.copy()


# Extract issue_date from plane_df
df_q2['issue_date'] = df_q2['TailNum'].map(planes_df.set_index('tailnum')['issue_date'])
df_q2


# Check for missing value
df_q2['issue_date'].isna().sum()

235185


# Extract non-missing issue_date dataset
df_q2 = df_q2[df_q2['issue_date'].notna()]
df_q2.drop(df_q2.index[df_q2['issue_date'] == 'None'], inplace = True)


# Convert to datetime format
df_q2['issue_date'] = pd.to_datetime(df_q2['issue_date'], format='%m/%d/%Y')


df_q2.groupby('issue_date')['ArrDelay','DepDelay'].sum().plot()

plt.xlabel('Issue Date')
plt.ylabel('Total Delay (mins)')
plt.title('Total Arr/Dep Delay with Issue Date')

plt.show()


df_q2['issue_date'].value_counts()

2004-05-13    9716
1986-12-22    9467
2000-10-20    6618
2002-05-30    6193
2005-09-27    5803
              ... 
1991-10-25       2
1997-02-26       1
2007-10-16       1
2007-09-13       1
1997-04-22       1
Name: issue_date, Length: 2367, dtype: int64


# Plot DepDelay and ArrDelay
sns.lineplot(data=df_q2, x= 'issue_date', y='DepDelay',color = 'red', label= "DepDelay")
sns.lineplot(data=df_q2, x= 'issue_date', y='ArrDelay',color = 'cornflowerblue', label= "ArrDelay")

plt.xlabel('Issue Date')
plt.ylabel('Average Delay (mins)')
plt.title('Average Arr/Dep Delay with Issue Date')
plt.legend()

plt.show()


# Extract year value from issue date
df_q2['issue_year'] = df_q2['issue_date'].dt.year


# Comparing Delay Status Distribution
plt.figure(figsize=(15,4))

sns.countplot(x="issue_year", hue="DelayStatus", data=df_q2, palette="Set2")

plt.ylabel('Total Delay (Mins)')
plt.title("Distribution of Delay Status across DayOfWeek")

plt.show()


# To find the plane age
df_q2 = df_q2[df_q2.issue_year != 2008]
df_q2['plane_age'] = max(df_q2['issue_year']) - df_q2['issue_year']


# Line Plot
df_q2.groupby('plane_age')['ArrDelay','DepDelay'].mean().plot()

plt.xlabel('Aircraft Age')
plt.ylabel('Average Delay (mins)')
plt.title('Average Arr/Dep Delay by Aircraft Age')

plt.show()


# Plot Delay Types according to its aircraft age
df_q2a = df_q2.filter(['plane_age','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay'], axis=1)
df_q2a.groupby('plane_age')['LateAircraftDelay','CarrierDelay','NASDelay','WeatherDelay','SecurityDelay'].mean().plot()

plt.legend()
plt.xlabel('Aircraft Age')
plt.ylabel('Avg Delay (mins)')
plt.title('Average Delay by Aircraft Age')

plt.show()


old_planes = df_q2.query("issue_date < '1993-01-01'")
normal_planes = df_q2.query("issue_date > '1992-12-31'")


# Create new dataframe considering only delayed flights
old_delay15 = old_planes[old_planes.ArrDelay > 15]
normal_delay15 = normal_planes[normal_planes.ArrDelay > 15]


# Plot Delay Types according to its issue date
diff_delay = df_q2.filter(['issue_date','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay'], axis=1)
diff_delay.groupby('issue_date')['LateAircraftDelay','CarrierDelay','NASDelay','WeatherDelay','SecurityDelay'].mean().plot()

plt.legend()
plt.xlabel('Issue Date')
plt.ylabel('Avg Delay (mins)')
plt.title('Average Delay by Issue Date (Full Data)')

plt.show()


# Plot Delay Types according to its issue date
diff_delay1 = old_delay15.filter(['issue_date','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay'], axis=1)
diff_delay1.groupby('issue_date')['LateAircraftDelay','CarrierDelay','NASDelay','WeatherDelay','SecurityDelay'].mean().plot()

plt.legend()
plt.xlabel('Issue Date')
plt.ylabel('Avg Delay (mins)')
plt.title('Average Delay by Issue Date (Old Planes)')

plt.show()


diff_delay2 = normal_delay15.filter(['issue_date','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay'], axis=1)
diff_delay2.groupby('issue_date')['LateAircraftDelay','CarrierDelay','NASDelay','WeatherDelay','SecurityDelay'].mean().plot()

plt.xlabel('Issue Date')
plt.ylabel('Average Delay (mins)')
plt.title('Average Delay by Issue Date (Normal Planes)')

plt.legend(loc='upper left')
plt.show()


# Line Plot
df_q2.groupby('plane_age')['Distance'].mean().plot()

plt.xlabel('Aircraft Age')
plt.ylabel('Average Distance (miles)')
plt.title('Average Distance per Aircraft Age')

plt.show()


# Plot Old and Normal Planes
sns.lineplot(data=old_planes, x= 'Distance', y='ArrDelay',color = 'red', label= "Old Planes (Before 1993)")
sns.lineplot(data=normal_planes, x= 'Distance', y='ArrDelay',color = 'cornflowerblue', label= "Normal Planes (1993 Onwards)")


plt.xlabel('Distance (miles)')
plt.ylabel('Average ArrDelay (mins)')
plt.title('Average ArrDelay by Distance')

plt.legend()
plt.show()


# Line Plot
df_q2.groupby('Distance')['CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay'].mean().plot()

plt.xlabel('Distance (miles)')
plt.ylabel('Average Delay (mins)')
plt.title('Average Individual Delay by Distance')

plt.show()


diff_delay3 = old_delay15.filter(['Distance','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay'], axis=1)
diff_delay3.groupby('Distance')['LateAircraftDelay','CarrierDelay','NASDelay','WeatherDelay','SecurityDelay'].mean().plot()

plt.xlabel('Distance (miles)')
plt.ylabel('Average Delay (mins)')
plt.title('Average Delay by Distance (Old Planes)')

plt.show()


diff_delay4 = normal_delay15.filter(['Distance','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay'], axis=1)
diff_delay4.groupby('Distance')['LateAircraftDelay','CarrierDelay','NASDelay','WeatherDelay','SecurityDelay'].mean().plot()

plt.xlabel('Distance (miles)')
plt.ylabel('Average Delay (mins)')
plt.title('Average Delay by Distance (Normal Planes)')

plt.show()


df_q2.corr()['plane_age']

DepTime             -0.000099
CRSDepTime          -0.001067
ArrTime              0.012662
CRSArrTime           0.012775
ActualElapsedTime    0.027313
CRSElapsedTime       0.024720
AirTime              0.025847
ArrDelay             0.002407
DepDelay            -0.002675
Distance             0.022989
TaxiIn              -0.003102
TaxiOut              0.010804
CarrierDelay        -0.007479
WeatherDelay        -0.007438
NASDelay             0.004987
SecurityDelay       -0.000538
LateAircraftDelay    0.004915
issue_year          -1.000000
plane_age            1.000000
Name: plane_age, dtype: float64


# Create subset of relevant variables then find correlation
subset = df_q2[['DepDelay','ArrDelay','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay']]
corrmat = subset.corr()
sns.heatmap(corrmat, square=True)
plt.show()


# Create new dataframe
df_q3 = flight_notcancelled.copy()


# Create new column as its flight route
df_q3["FlightRoute"] = df_q3["Origin"] + " to " + df_q3["Dest"]


# Most popular routes
df_q3["FlightRoute"].value_counts()

OGG to HNL    2975
HNL to OGG    2858
SAN to LAX    2755
LAX to LAS    2749
LAX to SAN    2731
              ... 
ORD to RFD       1
MHT to BOS       1
GSO to ROA       1
TYS to AUS       1
BDL to BOS       1
Name: FlightRoute, Length: 5196, dtype: int64


# Extract the top 5 flight routes
most_flights = df_q3.query("FlightRoute == 'HNL to OGG'|FlightRoute == 'OGG to HNL'|FlightRoute == 'LAX to LAS' |FlightRoute == 'SAN to LAX'|FlightRoute == 'LAX to SAN' ")
most_flights


plt.figure(figsize = (14,8))

sns.countplot(x="Month", data=most_flights, hue='FlightRoute', palette='Set2')

plt.xlabel('Month')
plt.ylabel('Count')
plt.title('Count of flights per Flight Route by Month')

plt.show()


df_q3['state'] = df_q3['Origin'].map(airport_df.set_index('iata')['state'])
df_q3


plt.figure(figsize = (14,8))

order = df_q3['state'].value_counts().index
sns.countplot(data = df_q3 , x = 'state', order=order, color = 'cornflowerblue')

plt.xlabel('state')
plt.ylabel('Count')
plt.title('Total Number of Flights by State')

plt.show()


pop_state = df_q3.query('(state == "CA") | (state == "TX") | (state == "IL")')
pop_state


plt.figure(figsize = (14,8))

sns.countplot(x="Year", data=pop_state, hue='state', palette = 'Set2')

plt.xlabel('Year')
plt.ylabel('Count')
plt.title('Flight Counts per state by Month')

plt.show()


plt.figure(figsize = (14,8))

sns.countplot(x="Month", data=pop_state, hue='state', palette = 'Set2')

plt.xlabel('Month')
plt.ylabel('Count')
plt.title('Flight Counts per state by Month')

plt.show()


merged_df['TailNum'].value_counts()

0         177944
000000     21644
N308SW      8569
N478HA      8195
N479HA      8079
           ...  
N194JB         1
N78009         1
N857NW         1
N576SK         1
N824\A         1
Name: TailNum, Length: 5817, dtype: int64


# First extraction of data with top most TailNum counts
df_q4_1 = merged_df.query("TailNum == 'N308SW' & LateAircraftDelay > 15 ")


df_q4_1['Date'].value_counts().head()

2006-01-02    9
2006-04-21    9
2007-01-31    9
2006-06-25    8
2006-05-21    8
Name: Date, dtype: int64


# Extracting dataset that matches our findings
df_q4_1 = merged_df.query("Date == '2006-01-02' & TailNum == 'N308SW'")
df_q4_1.sort_values(by=['DepTime'])


# Important Columns
df_q4_1.query("UniqueCarrier == 'WN'")[['FlightNum','DepTime', 'CRSDepTime','ArrTime','CRSArrTime','TailNum','ArrDelay','DepDelay','Origin','Dest','LateAircraftDelay','Date']].sort_values(by=['DepTime'])


# First extraction of data with second most TailNum counts
df_q4_2 = merged_df.query("TailNum == 'N478HA' & LateAircraftDelay > 15 ")


df_q4_2['Date'].value_counts().head()

2006-03-31    8
2007-09-08    7
2007-09-28    7
2006-04-28    6
2006-12-17    5
Name: Date, dtype: int64


# Extracting dataset that matches our findings
df_q4_2 = merged_df.query("Date == '2006-03-31' & TailNum == 'N478HA'")
df_q4_2.sort_values(by=['DepTime'])


df_q4_2[['FlightNum','DepTime', 'CRSDepTime','ArrTime','CRSArrTime','TailNum','ArrDelay','DepDelay','Origin','Dest','LateAircraftDelay','Date']].sort_values(by=['DepTime'])[-8:]


from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import mean_squared_error


# Create new dataframe
df_q5 = flight_notcancelled.copy()
df_q5.head(5)


# Create features (X) and labels (Y)
X = df_q5[['DepDelay','Distance','TaxiIn','TaxiOut','ActualElapsedTime','AirTime','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay']]
Y = df_q5['ArrDelay']


# Split into train-test data
X_test, X_train, Y_test, Y_train = train_test_split(X, Y, test_size=0.30, random_state=42)
print(X_train.shape, X_train.shape)
print(Y_test.shape, Y_test.shape)

(429399, 11) (429399, 11)
(1001930,) (1001930,)


# Load model
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X_train, Y_train)

LinearRegression()


# Predicting delays
mlr_pred  = lm.predict(X_test)
mlr_pred

array([ 4.92213735, 68.93785102, 29.24754481, ..., -3.67150069,
       16.04323536, -8.29463399])


# Evaluating Model & Accuracy Score
print('The R2 score is' , lm.score(X_test,Y_test))
print('The Mean Square Error (MSE) is',mean_squared_error(Y_test,mlr_pred))
print('The Root Mean Square Error (RMSE) is', np.sqrt(mean_squared_error(Y_test,mlr_pred)))

The R2 score is 0.9571000082975876
The Mean Square Error (MSE) is 62.13641092589014
The Root Mean Square Error (RMSE) is 7.882665217164188


# Use model to predict on feature X
mlr_wpred = lm.predict(X)
mlr_wpred

array([-2.61439299,  1.02934672, -3.90919782, ..., 38.45652781,
       -4.86880419, -6.55794758])


# Creating copy of data for regression
reg_df_q5 = df_q5


# Create new column of MLR prediction
reg_df_q5['MLR_DelayPrediction'] = mlr_wpred


# Getting output results
mlr_results = reg_df_q5.loc[(reg_df_q5['MLR_DelayPrediction'] == mlr_wpred)]
mlr_results[['UniqueCarrier','Origin','Dest','ArrDelay','MLR_DelayPrediction','DelayStatus']][0:5]


# Load model
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor(n_estimators=10, random_state=42)
rf_reg.fit(X_train,Y_train)

RandomForestRegressor(n_estimators=10, random_state=42)


# Predicting delays
rf_regpred = rf_reg.predict(X_test)
rf_regpred

array([ 0.3, 74.9, 30. , ..., -3.7, 15. , -8.9])


# Evaluating Model & Accuracy Score
print('The R2 score is' , rf_reg.score(X_test,Y_test))
print('The Mean Square Error (MSE) is',mean_squared_error(Y_test,rf_regpred))
print('The Root Mean Square Error (RMSE) is', np.sqrt(mean_squared_error(Y_test,rf_regpred)))

The R2 score is 0.9624242454978393
The Mean Square Error (MSE) is 54.42477795316974
The Root Mean Square Error (RMSE) is 7.377315091086305


# Use model to predict on whole dataset
rf_pred = rf_reg.predict(X)
rf_pred

array([ -0.3,   3.6, -11.8, ...,  43. ,  -5.9,  -5.8])


# Create new column of RF prediction
reg_df_q5['RF_DelayPrediction'] = rf_pred


rf_results = reg_df_q5.loc[(reg_df_q5['RF_DelayPrediction'] == rf_pred)]
rf_results[['UniqueCarrier','Origin','Dest','ArrDelay','RF_DelayPrediction','DelayStatus']][:5]


# Create features (X) and labels (Y)
X1 = df_q5[['DepDelay','Distance','TaxiIn','TaxiOut','ActualElapsedTime','AirTime','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay']]
Y2 = df_q5['DelayStatus']


x_train_d, x_test_d, y_train_d, y_test_d = train_test_split(X1, Y2, test_size = 0.3, random_state=42)


from sklearn.linear_model import LogisticRegression
LR_model = LogisticRegression()


LR_model.fit(x_train_d, y_train_d)
LR_regpred = LR_model.predict(x_test_d)


# Use model to predict on whole dataset
LR_pred = LR_model.predict(X1)
LR_pred

array(['0', '0', '0', ..., '1', '0', '0'], dtype=object)


# Create new column of LR prediction
reg_df_q5['LR_Prediction'] = LR_pred


# Selected dataframes and prediction output
LR_results = reg_df_q5.loc[(reg_df_q5['LR_Prediction'] == LR_pred)]
LR_results[['UniqueCarrier','Origin','Dest','LR_Prediction','DelayStatus']][0:5]


# Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test_d, LR_regpred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99    330709
           1       0.96      0.98      0.97     98690

    accuracy                           0.99    429399
   macro avg       0.98      0.98      0.98    429399
weighted avg       0.99      0.99      0.99    429399


# Confusion Matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test_d, LR_regpred))

[[327185   3524]
 [  2065  96625]]


# Evaluating Model & Accuracy Score
print('The R2 score is' , LR_model.score(x_test_d,y_test_d))
print('The Mean Square Error (MSE) is',mean_squared_error(y_test_d,LR_regpred))
print('The Root Mean Square Error (RMSE) is', np.sqrt(mean_squared_error(y_test_d,LR_regpred)))

The R2 score is 0.9869841336379451
The Mean Square Error (MSE) is 0.013015866362054871
The Root Mean Square Error (RMSE) is 0.11408709989326081

	Year	Month	DayofMonth	DayOfWeek	DepTime	CRSDepTime	ArrTime	CRSArrTime	UniqueCarrier	FlightNum	TailNum	ActualElapsedTime	CRSElapsedTime	AirTime	ArrDelay	DepDelay	Origin	Dest	Distance	TaxiIn	TaxiOut	CancellationCode	CarrierDelay	LateAircraftDelay
0	2007	1	1	1	1232.0	1225	1341.0	1340	WN	2891	N351	69.0	75.0	54.0	1.0	7.0	SMF	ONT	389	4	11	NaN	0	0
1	2007	1	1	1	1918.0	1905	2043.0	2035	WN	462	N370	85.0	90.0	74.0	8.0	13.0	SMF	PDX	479	5	6	NaN	0	0
2	2007	1	1	1	2206.0	2130	2334.0	2300	WN	1229	N685	88.0	90.0	73.0	34.0	36.0	SMF	PDX	479	6	9	NaN	3	31
3	2007	1	1	1	1230.0	1200	1356.0	1330	WN	1355	N364	86.0	90.0	75.0	26.0	30.0	SMF	PDX	479	3	8	NaN	23	3
4	2007	1	1	1	831.0	830	957.0	1000	WN	2278	N480	86.0	90.0	74.0	-3.0	1.0	SMF	PDX	479	3	9	NaN	0	0

	count	mean	std	min	25%	50%	75%	max
Year	14595137.0	2006.510664	0.499886	2006.0	2006.0	2007.0	2007.0	2007.0
Month	14595137.0	6.538310	3.425110	1.0	4.0	7.0	10.0	12.0
DayofMonth	14595137.0	15.728207	8.783863	1.0	8.0	16.0	23.0	31.0
DayOfWeek	14595137.0	3.941697	1.992110	1.0	2.0	4.0	6.0	7.0
DepTime	14312455.0	1339.795582	478.215967	1.0	930.0	1329.0	1732.0	2930.0
CRSDepTime	14595137.0	1331.404610	463.654205	0.0	930.0	1324.0	1720.0	2359.0
ArrTime	14279090.0	1484.467382	504.090036	1.0	1108.0	1515.0	1911.0	2955.0
CRSArrTime	14595137.0	1495.574683	480.164376	0.0	1115.0	1520.0	1906.0	2400.0
FlightNum	14595137.0	2187.446435	1980.504301	1.0	587.0	1501.0	3499.0	9619.0
ActualElapsedTime	14279090.0	126.193699	71.248664	5.0	75.0	108.0	156.0	1879.0
CRSElapsedTime	14594139.0	127.218035	70.336905	-1240.0	77.0	109.0	157.0	1430.0
AirTime	14279090.0	102.834872	72.382622	-1425.0	54.0	84.0	131.0	1958.0
ArrDelay	14279090.0	9.451859	38.000104	-592.0	-9.0	-1.0	13.0	2598.0
DepDelay	14312455.0	10.758819	34.877805	-1200.0	-4.0	0.0	10.0	2601.0
Distance	14595137.0	723.814232	568.335124	11.0	317.0	569.0	951.0	4962.0
TaxiIn	14595137.0	6.872862	22.081397	0.0	4.0	5.0	8.0	1501.0
TaxiOut	14595137.0	16.028638	11.565315	0.0	10.0	13.0	19.0	602.0
Cancelled	14595137.0	0.019368	0.137815	0.0	0.0	0.0	0.0	1.0
Diverted	14595137.0	0.002286	0.047758	0.0	0.0	0.0	0.0	1.0
CarrierDelay	14595137.0	3.635713	19.870826	0.0	0.0	0.0	0.0	2580.0
WeatherDelay	14595137.0	0.725823	9.085721	0.0	0.0	0.0	0.0	1429.0
NASDelay	14595137.0	3.686945	15.910850	0.0	0.0	0.0	0.0	1392.0
SecurityDelay	14595137.0	0.027288	1.183342	0.0	0.0	0.0	0.0	382.0
LateAircraftDelay	14595137.0	4.813299	20.598181	0.0	0.0	0.0	0.0	1366.0

	Year	Month	DayofMonth	DayOfWeek	DepTime	CRSDepTime	ArrTime	CRSArrTime	UniqueCarrier	FlightNum	TailNum	ActualElapsedTime	CRSElapsedTime	AirTime	ArrDelay	DepDelay	Origin	Dest	Distance	TaxiIn	TaxiOut	Cancelled	CancellationCode	Diverted	CarrierDelay	WeatherDelay	NASDelay	SecurityDelay	LateAircraftDelay	Date	Month_label	Day_label	DelayStatus
0	2007	1	1	1	1232.0	1225	1341.0	1340	WN	2891	N351	69.0	75.0	54.0	1.0	7.0	SMF	ONT	389	4	11	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2007-01-01	Jan	Mon	0
1	2007	1	1	1	1918.0	1905	2043.0	2035	WN	462	N370	85.0	90.0	74.0	8.0	13.0	SMF	PDX	479	5	6	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2007-01-01	Jan	Mon	0
2	2007	1	1	1	2206.0	2130	2334.0	2300	WN	1229	N685	88.0	90.0	73.0	34.0	36.0	SMF	PDX	479	6	9	Not Cancelled	NaN	Not Diverted	3	0	0	0	31	2007-01-01	Jan	Mon	1
3	2007	1	1	1	1230.0	1200	1356.0	1330	WN	1355	N364	86.0	90.0	75.0	26.0	30.0	SMF	PDX	479	3	8	Not Cancelled	NaN	Not Diverted	23	0	0	0	3	2007-01-01	Jan	Mon	1
4	2007	1	1	1	831.0	830	957.0	1000	WN	2278	N480	86.0	90.0	74.0	-3.0	1.0	SMF	PDX	479	3	9	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2007-01-01	Jan	Mon	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
14595132	2006	12	29	5	1246.0	1249	1452.0	1459	DL	1675	N905DE	126.0	130.0	108.0	-7.0	-3.0	ATL	EWR	745	6	12	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2006-12-29	Dec	Fri	0
14595133	2006	12	29	5	1225.0	1155	2033.0	1931	DL	1676	N651DL	308.0	276.0	261.0	62.0	30.0	SEA	ATL	2182	12	35	Not Cancelled	NaN	Not Diverted	30	0	32	0	0	2006-12-29	Dec	Fri	1
14595134	2006	12	29	5	2118.0	2115	2254.0	2241	DL	1676	N143DA	96.0	86.0	60.0	13.0	3.0	ATL	MCO	403	9	27	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2006-12-29	Dec	Fri	0
14595135	2006	12	29	5	2122.0	2127	2209.0	2223	DL	1677	N904DA	107.0	116.0	81.0	-14.0	-5.0	SLC	SJC	585	5	21	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2006-12-29	Dec	Fri	0
14595136	2006	12	29	5	2042.0	2045	2213.0	2235	DL	1678	N914DN	211.0	230.0	191.0	-22.0	-3.0	CVG	SLC	1449	5	15	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2006-12-29	Dec	Fri	0

	Code	Description
100	AA	American Airlines Inc.
848	MQ	American Eagle Airlines Inc.
949	OO	Skywest Airlines Inc.
1297	UA	United Air Lines Inc.
1388	WN	Southwest Airlines Co.

	Code	Description
100	AA	American Airlines Inc.
848	MQ	American Eagle Airlines Inc.
949	OO	Skywest Airlines Inc.

	Year	Month	DayofMonth	Date
9989370	2006	5	15	2006-05-15
10679957	2006	6	6	2006-06-06
13945296	2006	11	9	2006-11-09

	ArrDelay	DepDelay
ArrPeriod
Midnight	68.691210	67.668800
Early Morning	-2.629913	-0.329221
Late Morning	1.326621	2.817979
Afternoon	6.039461	7.684434
Evening	12.699964	13.518453
Night	22.100972	22.646892

	ArrDelay	DepDelay
DayOfWeek
1	9.706076	11.115190
2	7.210507	8.516248
3	8.980213	9.782257
4	12.216688	12.398298
5	12.944968	13.509117
6	5.583741	8.692829
7	9.244071	11.038142

	ArrDelay	DepDelay
Month
1	7.651337	9.398433
2	10.335359	11.424177
3	8.978117	10.694113
4	7.652325	9.271485
5	7.121950	8.552550
6	14.235764	14.722072
7	12.849599	13.828892
8	10.701020	11.742373
9	6.057809	7.386887
10	8.501795	9.173222
11	6.033901	8.181639
12	13.793222	14.782222

	ArrDelay	DepDelay
DayofMonth
1	9.247080	10.374597
2	10.355887	11.454968
3	8.818566	10.250027
4	6.851954	8.697963
5	8.745867	9.913006
6	6.803094	8.224274
7	7.349336	8.793673
8	6.158959	8.201658
9	6.664747	8.557264
10	8.725496	10.097192
11	9.283930	10.495677
12	9.310679	10.611710
13	9.124182	10.629409
14	9.881783	10.981216
15	11.931632	12.704664
16	11.661524	12.653342
17	10.389115	11.615786
18	10.069138	11.045238
19	11.940864	12.599455
20	10.307294	11.782669
21	10.730140	11.629272
22	12.712159	13.327803
23	9.910426	11.278571
24	8.688454	10.218898
25	9.585357	10.836983
26	11.787908	12.686850
27	10.766162	11.659527
28	10.213926	11.527774
29	8.975394	10.579391
30	8.671562	10.163374
31	8.205463	9.786340

	UniqueCarrier	DelayStatus	TotalFlights
0	9E	5047	25013
1	AA	31593	124663
2	AQ	540	7949
3	AS	7633	31332
4	B6	8956	34198

	Year	Month	DayofMonth	DayOfWeek	DepTime	CRSDepTime	ArrTime	CRSArrTime	UniqueCarrier	FlightNum	TailNum	ActualElapsedTime	CRSElapsedTime	AirTime	ArrDelay	DepDelay	Origin	Dest	Distance	TaxiIn	TaxiOut	Cancelled	CancellationCode	Diverted	CarrierDelay	WeatherDelay	NASDelay	SecurityDelay	LateAircraftDelay	Date	Month_label	Day_label	DelayStatus	issue_date
11427221	2006	7	13	4	2132.0	2130	18.0	24	AA	314	N072AA	166.0	174.0	134.0	-6.0	2.0	MIA	JFK	1090	11	21	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2006-07-13	Jul	Thu	0	NaN
14306699	2006	12	9	6	2117.0	2120	2226.0	2225	EV	4520	N758EV	129.0	125.0	97.0	1.0	-3.0	SLC	SFO	599	5	27	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2006-12-09	Dec	Sat	0	04/29/2005
13821464	2006	11	19	7	1247.0	1250	1415.0	1430	NW	1137	N309US	88.0	100.0	68.0	-15.0	-3.0	BWI	DTW	408	4	16	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2006-11-19	Nov	Sun	0	05/23/2006
1856981	2007	4	11	3	1720.0	1720	1813.0	1820	WN	39	N503SW	53.0	60.0	44.0	-7.0	0.0	HOU	HRL	276	2	7	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2007-04-11	Apr	Wed	0	03/30/2000
14014779	2006	12	11	1	1832.0	1825	1937.0	1935	WN	2310	N353	65.0	70.0	52.0	2.0	7.0	BWI	PVD	328	3	10	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2006-12-11	Dec	Mon	0	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
481913	2007	1	24	3	955.0	1000	1208.0	1150	9E	4759	88869E	133.0	110.0	101.0	18.0	-5.0	RDU	IND	489	9	23	Not Cancelled	NaN	Not Diverted	0	0	18	0	0	2007-01-24	Jan	Wed	1	NaN
1630112	2007	3	6	2	1523.0	1530	1741.0	1735	MQ	4847	N729AE	138.0	125.0	96.0	6.0	-7.0	LGA	DTW	501	13	29	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2007-03-06	Mar	Tue	0	11/17/2000
515861	2007	1	15	1	1548.0	1500	2037.0	1955	AA	1346	N3CGAA	169.0	175.0	136.0	42.0	48.0	SNA	DFW	1205	20	13	Not Cancelled	NaN	Not Diverted	0	0	4	0	38	2007-01-15	Jan	Mon	1	NaN
5117427	2007	9	12	3	1414.0	1415	1727.0	1727	XE	7810	N14173	133.0	132.0	109.0	0.0	-1.0	LAX	DEN	862	5	19	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2007-09-12	Sep	Wed	0	11/24/2004
9527990	2006	4	13	4	1059.0	1106	1149.0	1200	MQ	3312	N376AE	50.0	54.0	28.0	-11.0	-7.0	SPS	DFW	113	12	10	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2006-04-13	Apr	Thu	0	NaN

	Year	Month	DayofMonth	DayOfWeek	DepTime	CRSDepTime	ArrTime	CRSArrTime	UniqueCarrier	FlightNum	TailNum	ActualElapsedTime	CRSElapsedTime	AirTime	ArrDelay	DepDelay	Origin	Dest	Distance	TaxiIn	TaxiOut	Cancelled	CancellationCode	Diverted	CarrierDelay	WeatherDelay	NASDelay	SecurityDelay	LateAircraftDelay	Date	Month_label	Day_label	DelayStatus	FlightRoute
2363983	2007	4	7	6	1853.0	1905	1923.0	1939	AQ	67	N828AL	30.0	34.0	22.0	-16.0	-12.0	OGG	HNL	100	3	5	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2007-04-07	Apr	Sat	0	OGG to HNL
1115453	2007	2	24	6	900.0	900	932.0	934	AQ	63	N837AL	32.0	34.0	20.0	-2.0	0.0	OGG	HNL	100	7	5	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2007-02-24	Feb	Sat	0	OGG to HNL
12550714	2006	9	15	5	1841.0	1840	1917.0	1917	HA	196	N475HA	36.0	37.0	22.0	0.0	1.0	HNL	OGG	100	6	8	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2006-09-15	Sep	Fri	0	HNL to OGG
2363901	2007	4	15	7	1345.0	1340	1420.0	1414	AQ	71	N824AL	35.0	34.0	23.0	6.0	5.0	OGG	HNL	100	5	7	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2007-04-15	Apr	Sun	0	OGG to HNL
2636311	2007	5	11	5	748.0	745	835.0	832	OO	6160	N576SW	47.0	47.0	30.0	3.0	3.0	SAN	LAX	109	5	12	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2007-05-11	May	Fri	0	SAN to LAX
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
6762703	2007	11	11	7	1804.0	1805	1834.0	1839	AQ	275	N808AL	30.0	34.0	21.0	-5.0	-1.0	OGG	HNL	100	3	6	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2007-11-11	Nov	Sun	0	OGG to HNL
12552231	2006	9	30	6	1257.0	1300	1333.0	1334	HA	529	N477HA	36.0	34.0	21.0	-1.0	-3.0	OGG	HNL	100	7	8	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2006-09-30	Sep	Sat	0	OGG to HNL
10264891	2006	5	5	5	904.0	900	939.0	935	AQ	206	N808AL	35.0	35.0	24.0	4.0	4.0	HNL	OGG	100	3	8	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2006-05-05	May	Fri	0	HNL to OGG
6489595	2007	11	20	2	1037.0	1041	1146.0	1157	UA	1548	N449UA	69.0	76.0	45.0	-11.0	-4.0	LAX	LAS	236	7	17	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2007-11-20	Nov	Tue	0	LAX to LAS
14499943	2006	12	7	4	1256.0	1300	1334.0	1337	AQ	210	N824AL	38.0	37.0	23.0	-3.0	-4.0	HNL	OGG	100	4	11	Not Cancelled	NaN	Not Diverted	0	0	0	0	0	2006-12-07	Dec	Thu	0	HNL to OGG

	Year	Month	DayofMonth	DayOfWeek	DepTime	CRSDepTime	ArrTime	CRSArrTime	UniqueCarrier	FlightNum	TailNum	ActualElapsedTime	CRSElapsedTime	AirTime	ArrDelay	DepDelay	Origin	Dest	Distance	TaxiIn	TaxiOut	Cancelled	CancellationCode	Diverted	CarrierDelay	WeatherDelay	NASDelay	LateAircraftDelay	Date	Month_label	Day_label	DelayStatus
7486653	2006	1	2	1	630.0	630	743.0	730	WN	1783	N308SW	73.0	60.0	55.0	13.0	0.0	LAS	LAX	236	4	14	Not Cancelled	NaN	Not Diverted	0	0	0	0	2006-01-02	Jan	Mon	0
7486870	2006	1	2	1	809.0	800	929.0	915	WN	2662	N308SW	80.0	75.0	65.0	14.0	9.0	LAX	OAK	337	3	12	Not Cancelled	NaN	Not Diverted	0	0	0	0	2006-01-02	Jan	Mon	0
7611530	2006	1	2	1	955.0	947	1057.0	1052	OO	3717	N308SW	62.0	65.0	40.0	5.0	8.0	SLC	TWF	175	3	19	Not Cancelled	NaN	Not Diverted	0	0	0	0	2006-01-02	Jan	Mon	0
7487432	2006	1	2	1	1012.0	950	1139.0	1105	WN	1717	N308SW	87.0	75.0	76.0	34.0	22.0	OAK	ONT	361	3	8	Not Cancelled	NaN	Not Diverted	8	0	12	14	2006-01-02	Jan	Mon	1
7611531	2006	1	2	1	1147.0	1130	1242.0	1231	OO	3718	N308SW	55.0	61.0	38.0	11.0	17.0	TWF	SLC	175	10	7	Not Cancelled	NaN	Not Diverted	0	0	0	0	2006-01-02	Jan	Mon	0
7487573	2006	1	2	1	1203.0	1130	1321.0	1250	WN	1067	N308SW	78.0	80.0	63.0	31.0	33.0	ONT	SMF	389	5	10	Not Cancelled	NaN	Not Diverted	0	0	0	31	2006-01-02	Jan	Mon	1
7611487	2006	1	2	1	1302.0	1105	1408.0	1204	OO	3680	N308SW	66.0	59.0	42.0	124.0	117.0	SLC	PIH	150	5	19	Not Cancelled	NaN	Not Diverted	0	0	0	124	2006-01-02	Jan	Mon	1
7611488	2006	1	2	1	1418.0	1335	1509.0	1424	OO	3680	N308SW	51.0	49.0	37.0	45.0	43.0	PIH	SLC	150	6	8	Not Cancelled	NaN	Not Diverted	0	0	0	45	2006-01-02	Jan	Mon	1
7488368	2006	1	2	1	1420.0	1315	1535.0	1430	WN	1417	N308SW	75.0	75.0	65.0	65.0	65.0	SMF	ONT	389	5	5	Not Cancelled	NaN	Not Diverted	34	0	0	31	2006-01-02	Jan	Mon	1
7487566	2006	1	2	1	1610.0	1455	1719.0	1610	WN	936	N308SW	69.0	75.0	60.0	69.0	75.0	ONT	SJC	333	4	5	Not Cancelled	NaN	Not Diverted	9	0	0	60	2006-01-02	Jan	Mon	1
7611505	2006	1	2	1	1627.0	1540	1635.0	1541	OO	3699	N308SW	68.0	61.0	51.0	54.0	47.0	SLC	EKO	200	4	13	Not Cancelled	NaN	Not Diverted	0	54	0	0	2006-01-02	Jan	Mon	1
7611506	2006	1	2	1	1648.0	1600	1844.0	1757	OO	3699	N308SW	56.0	57.0	45.0	47.0	48.0	EKO	SLC	200	4	7	Not Cancelled	NaN	Not Diverted	0	47	0	0	2006-01-02	Jan	Mon	1
7488258	2006	1	2	1	1750.0	1635	1927.0	1820	WN	936	N308SW	97.0	105.0	83.0	67.0	75.0	SJC	PDX	569	5	9	Not Cancelled	NaN	Not Diverted	5	0	0	62	2006-01-02	Jan	Mon	1
7487633	2006	1	2	1	2000.0	1845	2135.0	2030	WN	1091	N308SW	95.0	105.0	87.0	65.0	75.0	PDX	SJC	569	3	5	Not Cancelled	NaN	Not Diverted	7	0	0	58	2006-01-02	Jan	Mon	1
7611544	2006	1	2	1	2110.0	2055	2233.0	2221	OO	3729	N308SW	83.0	86.0	57.0	12.0	15.0	SLC	COD	298	6	20	Not Cancelled	NaN	Not Diverted	0	0	0	0	2006-01-02	Jan	Mon	0
7488239	2006	1	2	1	2155.0	2050	2305.0	2200	WN	1091	N308SW	70.0	70.0	52.0	65.0	65.0	SJC	LAX	308	7	11	Not Cancelled	NaN	Not Diverted	0	0	0	65	2006-01-02	Jan	Mon	1
7486831	2006	1	2	1	2322.0	2225	20.0	2325	WN	1091	N308SW	58.0	60.0	48.0	55.0	57.0	LAX	LAS	236	4	6	Not Cancelled	NaN	Not Diverted	0	0	0	55	2006-01-02	Jan	Mon	1

	Year	Month	DayofMonth	DayOfWeek	DepTime	CRSDepTime	ArrTime	CRSArrTime	UniqueCarrier	FlightNum	TailNum	ActualElapsedTime	CRSElapsedTime	AirTime	ArrDelay	DepDelay	Origin	Dest	Distance	TaxiIn	TaxiOut	Cancelled	CancellationCode	Diverted	CarrierDelay	WeatherDelay	LateAircraftDelay	Date	Month_label	Day_label	DelayStatus
8922601	2006	3	31	5	518.0	520	559.0	557	HA	106	N478HA	41.0	37.0	24.0	2.0	-2.0	HNL	OGG	100	5	12	Not Cancelled	NaN	Not Diverted	0	0	0	2006-03-31	Mar	Fri	0
8922574	2006	3	31	5	623.0	625	701.0	659	HA	105	N478HA	38.0	34.0	24.0	2.0	-2.0	OGG	HNL	100	8	6	Not Cancelled	NaN	Not Diverted	0	0	0	2006-03-31	Mar	Fri	0
8922934	2006	3	31	5	728.0	735	819.0	818	HA	118	N478HA	51.0	43.0	30.0	1.0	-7.0	HNL	KOA	163	4	17	Not Cancelled	NaN	Not Diverted	0	0	0	2006-03-31	Mar	Fri	0
8922903	2006	3	31	5	844.0	848	936.0	928	HA	117	N478HA	52.0	40.0	41.0	8.0	-4.0	KOA	HNL	163	4	7	Not Cancelled	NaN	Not Diverted	0	0	0	2006-03-31	Mar	Fri	0
8924619	2006	3	31	5	1002.0	1005	1037.0	1042	HA	316	N478HA	35.0	37.0	22.0	-5.0	-3.0	HNL	OGG	100	5	8	Not Cancelled	NaN	Not Diverted	0	0	0	2006-03-31	Mar	Fri	0
8924588	2006	3	31	5	1130.0	1110	1222.0	1144	HA	315	N478HA	52.0	34.0	35.0	38.0	20.0	OGG	HNL	100	10	7	Not Cancelled	NaN	Not Diverted	0	20	18	2006-03-31	Mar	Fri	1
8923833	2006	3	31	5	1326.0	1215	1418.0	1258	HA	178	N478HA	52.0	43.0	31.0	80.0	71.0	HNL	KOA	163	7	14	Not Cancelled	NaN	Not Diverted	0	33	47	2006-03-31	Mar	Fri	1
8923864	2006	3	31	5	1450.0	1328	1524.0	1357	HA	179	N478HA	34.0	29.0	18.0	87.0	82.0	KOA	OGG	84	6	10	Not Cancelled	NaN	Not Diverted	2	0	85	2006-03-31	Mar	Fri	1
8923895	2006	3	31	5	1544.0	1425	1628.0	1459	HA	179	N478HA	44.0	34.0	27.0	89.0	79.0	OGG	HNL	100	8	9	Not Cancelled	NaN	Not Diverted	0	0	89	2006-03-31	Mar	Fri	1
8924412	2006	3	31	5	1702.0	1545	1800.0	1636	HA	262	N478HA	58.0	51.0	36.0	84.0	77.0	HNL	ITO	216	6	16	Not Cancelled	NaN	Not Diverted	0	0	84	2006-03-31	Mar	Fri	1
8924381	2006	3	31	5	1824.0	1705	1914.0	1752	HA	261	N478HA	50.0	47.0	37.0	82.0	79.0	ITO	HNL	216	6	7	Not Cancelled	NaN	Not Diverted	0	0	82	2006-03-31	Mar	Fri	1
8924557	2006	3	31	5	1944.0	1825	2026.0	1908	HA	308	N478HA	42.0	43.0	29.0	78.0	79.0	HNL	KOA	163	4	9	Not Cancelled	NaN	Not Diverted	0	0	78	2006-03-31	Mar	Fri	1
8924526	2006	3	31	5	2047.0	1938	2130.0	2018	HA	307	N478HA	43.0	40.0	30.0	72.0	69.0	KOA	HNL	163	6	7	Not Cancelled	NaN	Not Diverted	0	0	72	2006-03-31	Mar	Fri	1

	UniqueCarrier	Origin	Dest	ArrDelay	MLR_DelayPrediction
11427221	AA	MIA	JFK	-6.0	-2.614393
14306699	EV	SLC	SFO	1.0	1.029347
13821464	NW	BWI	DTW	-15.0	-3.909198
1856981	WN	HOU	HRL	-7.0	-7.567691
14014779	WN	BWI	PVD	2.0	-3.301670

	UniqueCarrier	Origin	Dest	ArrDelay	RF_DelayPrediction
11427221	AA	MIA	JFK	-6.0	-0.300
14306699	EV	SLC	SFO	1.0	3.600
13821464	NW	BWI	DTW	-15.0	-11.800
1856981	WN	HOU	HRL	-7.0	-5.975
14014779	WN	BWI	PVD	2.0	0.700

Flight Analysis in the U.S.¶

Table of Contents¶

Introduction¶

Import Data & Libraries¶

Understanding Data¶

Data Pre-Processing/Cleaning¶

Creating Delay Status¶

Creating sample of entire data¶

Checking for outliers¶

Checking for missing data between Origin & Dest¶

Exploratory Data Analysis (EDA)¶

Total Flight Distribution¶

Total Flight Distribution of Full Data by Month¶

Total Number of Flights per Airline¶

Distribution of Distance¶

Cancellation¶

Cancellation Reasons¶

Cancelled Flights Distribution¶

Month & Day with the most cancelled flights¶

Delayed Flights¶

Relationship between ArrDelay & Depdelay¶

Relationship of Type of Delay & Average ArrDelay¶

Total Sum of Delay (per Airline)¶

Q1. Best time, day and time of year to minimise flight delays?¶

Best Time of the Day¶

Distribution of Average Delay by Time Period¶

Best Day of the Week¶

Distribution of Day Delay¶

Best Month of the Year¶

Distribution of Monthly Delay¶

Best Day of the Month¶

Distribution of DayofMonth Delay¶

Delayed Flights (%) per Airline¶

Q2. Are older planes more prone to delays?¶

Distribution of Arr/Dep Delay by Issue Date¶

Total Sum of Delay by Issue Date¶

Average Delay with Issue Date¶

Distribution of Arr/Dep Delay by Issue Year¶

Comparison of Old and Normal Planes¶

Distribution of Individual Delays¶

Average Delay by Issue Date per Delay Type¶

Distance Distribution of Old & Normal Planes¶

Average Delay by Distance¶

Q3. Show how the no. of people flying between various locations change with time.¶

Distribution by Flight Routes¶

Distribution by State¶

Q4. Can cascading failures where an airport's delay causing delay in another be detected?¶

Methodology¶

Initial Test¶

Secondary Testing¶

Q5. Construct a model predicting delays.¶

Multiple Linear Regression¶

Random Forest¶

Logistic Regression¶