Load Relevant Libraries

library(DBI)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(dbplyr)
## 
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
## 
##     ident, sql
library(geosphere)

Establish database connection using RSQLite package

conn <- dbConnect(RSQLite::SQLite(), "flights_r.db")

Read data from CSV files into respective names

airports <- read.csv("airports.csv", header = TRUE)
carriers <- read.csv("carriers.csv", header = TRUE)
planes <- read.csv("plane-data.csv", header = TRUE)

Loop to create a table in the database

# Iterate over 10 years (1997 to 2006) to read and process CSV files in the format "year.csv.bz2"
# Data from these files are then written into "flights" table in SQLite database
for(i in c(1997:2006)) {
  filename <- paste0(i, ".csv.bz2")
  print(paste("Processing:", filename))
  flights <- read.csv(filename, header = TRUE)
  if(i == 1997) {
    dbWriteTable(conn, "flights", flights)
  } else {
    dbWriteTable(conn, "flights", flights, append = TRUE)
  }
}

List and check each table and fields in the connected SQLite database

dbListTables(conn)
## [1] "airports"       "airports.csv"   "carriers"       "carriers.csv"  
## [5] "flights"        "ontime"         "plane-data.csv" "planes"
dbListFields(conn, "flights")
##  [1] "Year"              "Month"             "DayofMonth"       
##  [4] "DayOfWeek"         "DepTime"           "CRSDepTime"       
##  [7] "ArrTime"           "CRSArrTime"        "UniqueCarrier"    
## [10] "FlightNum"         "TailNum"           "ActualElapsedTime"
## [13] "CRSElapsedTime"    "AirTime"           "ArrDelay"         
## [16] "DepDelay"          "Origin"            "Dest"             
## [19] "Distance"          "TaxiIn"            "TaxiOut"          
## [22] "Cancelled"         "CancellationCode"  "Diverted"         
## [25] "CarrierDelay"      "WeatherDelay"      "NASDelay"         
## [28] "SecurityDelay"     "LateAircraftDelay"
dbListFields(conn, "airports")
## [1] "iata"    "airport" "city"    "state"   "country" "lat"     "long"
dbListFields(conn, "carriers")
## [1] "Code"        "Description"
dbListFields(conn, "planes")
## [1] "tailnum"       "type"          "manufacturer"  "issue_date"   
## [5] "model"         "status"        "aircraft_type" "engine_type"  
## [9] "year"

Part a)

Query to calculate average delay for each time interval, day of week, and year

This SQL query calculates the average delay for flights within specific time intervals (0000-0359, 0400-0759, 0800-1159, 1200-1559, 1600-1959, 2000-2359) on each day of the week and for each year between 1997 and 2006. It groups the data by year, day of the week, and time interval, then calculates the average delay (using the ‘avg_delay’ variable) for each group.

q1 <- paste0("
  SELECT year, dayofweek, 
         CASE 
           WHEN deptime BETWEEN '0000' AND '0359' THEN '0000-0359'
           WHEN deptime BETWEEN '0400' AND '0759' THEN '0400-0759'
           WHEN deptime BETWEEN '0800' AND '1159' THEN '0800-1159'
           WHEN deptime BETWEEN '1200' AND '1559' THEN '1200-1559'
           WHEN deptime BETWEEN '1600' AND '1959' THEN '1600-1959'
           WHEN deptime BETWEEN '2000' AND '2359' THEN '2000-2359'
         END AS time_interval,
         AVG(depdelay) AS avg_delay
  FROM flights
  WHERE year BETWEEN 1997 AND 2006 AND deptime <= '2359'
  GROUP BY year, dayofweek, time_interval
")

Execute the query

q1 <- dbGetQuery(conn, q1)

View results of the query

View(q1)

Construct a boxplot

To visualise distribution of delay for each day of the week

ggplot(q1, aes(x = factor(DayOfWeek), y = avg_delay, fill = factor(DayOfWeek))) +
  geom_boxplot() +
  scale_fill_brewer(palette = "Set3") +
  labs(title = "Distribution of Delays by Day of the Week",
       x = "Day of the Week",
       y = "Average Delay",
       fill = "Day of the Week") +
  theme_bw()

Convert ‘time_interval’ to factor

This ensures correct ordering of data

q1$time_interval <- factor(q1$time_interval, levels = unique(q1$time_interval))

Construct a heat map for each year

Each tile represents the avg delay for the corresponding time interval, day of week and year

ggplot(q1, aes(x = time_interval, y = DayOfWeek, fill = avg_delay)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "blue") +
  facet_wrap(~Year, scales = "free_y") +
  labs(title = "Average Delay Heatmap by Time Interval, Day of the Week, and Year",
       x = "Time Interval",
       y = "Day of the Week",
       fill = "Average Delay") +
  theme_bw() +
  scale_x_discrete(labels = c('0000', '0400', '0800', '1200', '1600', '2000'))

Part b)

Query to calculate average delay according to each associated plane age, and removing anomalies

This SQL query calculates the average delay for flights based on the age of the associated plane. It computes the age of each plane by subtracting the year of the flight (f.year) from the year the plane was manufactured (p.year). It then calculates the average departure delay (avg_delay) for flights associated with planes of each age. The query filters out flights with null departure delays (f.depdelay IS NOT NULL) and planes with missing or invalid manufacturing years (p.year NOT IN (’‘, ’None’, ‘0000’) and LENGTH(p.year) = 4). Additionally, it ensures that the calculated plane age is non-negative ((f.year - p.year) >= 0) to avoid negative values. Finally, the results are grouped by plane age.

q2 <- dbGetQuery(conn, "
    SELECT 
        (f.year - p.year) AS plane_age,
        AVG(f.depdelay) AS avg_delay
    FROM 
        flights AS f
    INNER JOIN 
        planes AS p ON f.tailnum = p.tailnum
    WHERE 
        f.depdelay IS NOT NULL
        AND p.year IS NOT NULL
        AND p.year NOT IN ('', 'None', '0000')
        AND (f.year - p.year) >= 0  -- Ensure positive plane_age values
        AND LENGTH(p.year) = 4      -- Ensure p.year is a 4-digit number
    GROUP BY 
        plane_age
")

Calculate correlation coefficient

correlation <- cor(q2$plane_age, q2$avg_delay)

Create a scatter plot

This ggplot2 code creates a scatter plot (geom_point()) with a trend line (geom_smooth(method = “lm”, se = FALSE)) representing the relationship between the age of the plane (plane_age) and the average delay (avg_delay). It also adds a title (labs(title = …)) that includes the correlation coefficient (correlation) rounded to two decimal places, and labels for the x-axis (Plane Age) and y-axis (Average Delay).

ggplot(q2, aes(x = plane_age, y = avg_delay)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = paste("Average Delay vs. Plane Age (Correlation:", round(correlation, 2), ")"),
       x = "Plane Age",
       y = "Average Delay")
## `geom_smooth()` using formula = 'y ~ x'

Part c)

Use dbplyr to create a database table abstraction

flights_db <- tbl(conn, "flights")

Load data from the database tables

airports <- tbl(conn, "airports")
carriers <- tbl(conn, "carriers")
planes <- tbl(conn, "planes")

Merge tables to get all necessary information

flights <- flights_db %>%
  left_join(airports, by = c("Origin" = "iata")) %>%
  left_join(airports, by = c("Dest" = "iata"), suffix = c("_origin", "_dest")) %>%
  left_join(carriers, by = c("UniqueCarrier" = "Code")) %>%
  left_join(planes, by = c("TailNum" = "tailnum"))

Feature engineering of distance using provided airport coordinates

flights <- flights %>%
  mutate(Distance = sqrt((lat_dest - lat_origin) ^ 2 + (long_dest - long_origin) ^ 2))

Specify selected features for the model

features <- c('CRSDepTime', 'CRSArrTime', 'Distance')

Fill missing values

flights <- flights %>%
  mutate(Distance = if_else(is.na(Distance), 0, Distance))

Initialize lists to store coefficients and models

coefficients <- list()
models <- list()

Logistic Regression Model Fitting

This code iterates through the 10 years of data, processing each year to be fitted into the logistic regression model.

for (year in 1997:2006) {
  cat("Processing data for year:", year, "\n")
  
  # Calculate total number of rows for the year
  total_rows_query <- dbSendQuery(conn, paste0("SELECT COUNT(*) FROM flights WHERE Year = ", year))
  total_rows_result <- dbFetch(total_rows_query)
  total_rows <- ifelse(is.null(total_rows_result), 0, total_rows_result[[1]])
  dbClearResult(total_rows_query)
  
  # Skip to the next year if there are no rows for the current year
  if (total_rows == 0) {
    cat("No data for year", year, "\n")
    next
  }
  
  # Query database for the year
  query <- paste0("SELECT * FROM flights WHERE Year = ", year)
  subset_data <- dbGetQuery(conn, query)
  
  # Convert to data frame
  subset_data <- as.data.frame(subset_data)
  
  # Sample data for the year
  sample_size <- min(nrow(subset_data), 10000)
  subset_data <- subset_data %>%
    sample_n(size = sample_size, replace = FALSE)
  
  # Extract features and target variable
  X <- subset_data[, features]
  y <- subset_data$Diverted
  
  # Fit logistic regression model for the year
  model <- glm(y ~ ., data = X, family = binomial)
  
  # Store coefficients and fitted model
  coefficients[[as.character(year)]] <- coef(model)
  models[[as.character(year)]] <- list(model = model, coefficients = coef(model))
}
## Processing data for year: 1997 
## Processing data for year: 1998 
## Processing data for year: 1999 
## Processing data for year: 2000 
## Processing data for year: 2001 
## Processing data for year: 2002 
## Processing data for year: 2003 
## Processing data for year: 2004 
## Processing data for year: 2005 
## Processing data for year: 2006

Iterate through the models and print coefficients for each year

for (year in 1997:2006) {
  cat("Logistic Regression Model for Year", year, ":\n")
  model_data <- models[[as.character(year)]]
  model <- model_data$model
  coefficients <- model_data$coefficients
  cat("Intercept:", coef(model)[1], "\n")
  for (i in 2:length(coefficients)) {
    feature <- features[i - 1]
    coefficient <- coefficients[i]
    cat(feature, ":", coefficient, "\n")
  }
  cat("\n")
}
## Logistic Regression Model for Year 1997 :
## Intercept: -7.236703 
## CRSDepTime : 0.0003140569 
## CRSArrTime : 3.743028e-05 
## Distance : 0.0007034126 
## 
## Logistic Regression Model for Year 1998 :
## Intercept: -7.103178 
## CRSDepTime : 0.0002733503 
## CRSArrTime : -7.669215e-05 
## Distance : 0.0008609159 
## 
## Logistic Regression Model for Year 1999 :
## Intercept: -8.229476 
## CRSDepTime : 0.001089458 
## CRSArrTime : 8.724587e-05 
## Distance : 0.0003852868 
## 
## Logistic Regression Model for Year 2000 :
## Intercept: -5.399231 
## CRSDepTime : -0.001196651 
## CRSArrTime : 0.000163614 
## Distance : 0.0005313225 
## 
## Logistic Regression Model for Year 2001 :
## Intercept: -7.446507 
## CRSDepTime : -0.0007399036 
## CRSArrTime : 0.001185209 
## Distance : 0.0004984196 
## 
## Logistic Regression Model for Year 2002 :
## Intercept: -8.183335 
## CRSDepTime : 0.0009593334 
## CRSArrTime : 0.0001046534 
## Distance : 0.0004903434 
## 
## Logistic Regression Model for Year 2003 :
## Intercept: -6.59028 
## CRSDepTime : 0.0004509175 
## CRSArrTime : -0.0005422799 
## Distance : 0.0003683572 
## 
## Logistic Regression Model for Year 2004 :
## Intercept: -8.011486 
## CRSDepTime : -0.001404717 
## CRSArrTime : 0.002337025 
## Distance : 0.0002066785 
## 
## Logistic Regression Model for Year 2005 :
## Intercept: -7.390778 
## CRSDepTime : 0.0008176982 
## CRSArrTime : -0.0004227273 
## Distance : 0.000324556 
## 
## Logistic Regression Model for Year 2006 :
## Intercept: -8.019679 
## CRSDepTime : -0.0006937221 
## CRSArrTime : 0.001138285 
## Distance : 0.0008472753

Extract coefficients for each year

coefficients_df <- data.frame(
  Year = 1997:2006,
  CRSDepTime = sapply(models, function(x) coef(x$model)[2]),
  CRSArrTime = sapply(models, function(x) coef(x$model)[3]),
  Distance = sapply(models, function(x) coef(x$model)[4])
)

Convert coefficients to long format

coefficients_long <- tidyr::gather(coefficients_df, key = "Feature", value = "Coefficient", -Year)

Plot coefficients for visualization across years

ggplot(coefficients_long, aes(x = Year, y = Coefficient, color = Feature)) +
  geom_line() +
  geom_point() +
  labs(title = "Logistic Regression Coefficients Across Years",
       x = "Year",
       y = "Coefficient Value") +
  scale_color_manual(values = c( "CRSDepTime" = "blue", 
                                "CRSArrTime" = "orange", 
                                "Distance" = "green")) +
  theme_minimal() +
  theme(legend.position = "top")

Disconnect from connection

dbDisconnect(conn)