library(DBI)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(dbplyr)
##
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
##
## ident, sql
library(geosphere)
conn <- dbConnect(RSQLite::SQLite(), "flights_r.db")
airports <- read.csv("airports.csv", header = TRUE)
carriers <- read.csv("carriers.csv", header = TRUE)
planes <- read.csv("plane-data.csv", header = TRUE)
# Iterate over 10 years (1997 to 2006) to read and process CSV files in the format "year.csv.bz2"
# Data from these files are then written into "flights" table in SQLite database
for(i in c(1997:2006)) {
filename <- paste0(i, ".csv.bz2")
print(paste("Processing:", filename))
flights <- read.csv(filename, header = TRUE)
if(i == 1997) {
dbWriteTable(conn, "flights", flights)
} else {
dbWriteTable(conn, "flights", flights, append = TRUE)
}
}
dbListTables(conn)
## [1] "airports" "airports.csv" "carriers" "carriers.csv"
## [5] "flights" "ontime" "plane-data.csv" "planes"
dbListFields(conn, "flights")
## [1] "Year" "Month" "DayofMonth"
## [4] "DayOfWeek" "DepTime" "CRSDepTime"
## [7] "ArrTime" "CRSArrTime" "UniqueCarrier"
## [10] "FlightNum" "TailNum" "ActualElapsedTime"
## [13] "CRSElapsedTime" "AirTime" "ArrDelay"
## [16] "DepDelay" "Origin" "Dest"
## [19] "Distance" "TaxiIn" "TaxiOut"
## [22] "Cancelled" "CancellationCode" "Diverted"
## [25] "CarrierDelay" "WeatherDelay" "NASDelay"
## [28] "SecurityDelay" "LateAircraftDelay"
dbListFields(conn, "airports")
## [1] "iata" "airport" "city" "state" "country" "lat" "long"
dbListFields(conn, "carriers")
## [1] "Code" "Description"
dbListFields(conn, "planes")
## [1] "tailnum" "type" "manufacturer" "issue_date"
## [5] "model" "status" "aircraft_type" "engine_type"
## [9] "year"
This SQL query calculates the average delay for flights within specific time intervals (0000-0359, 0400-0759, 0800-1159, 1200-1559, 1600-1959, 2000-2359) on each day of the week and for each year between 1997 and 2006. It groups the data by year, day of the week, and time interval, then calculates the average delay (using the ‘avg_delay’ variable) for each group.
q1 <- paste0("
SELECT year, dayofweek,
CASE
WHEN deptime BETWEEN '0000' AND '0359' THEN '0000-0359'
WHEN deptime BETWEEN '0400' AND '0759' THEN '0400-0759'
WHEN deptime BETWEEN '0800' AND '1159' THEN '0800-1159'
WHEN deptime BETWEEN '1200' AND '1559' THEN '1200-1559'
WHEN deptime BETWEEN '1600' AND '1959' THEN '1600-1959'
WHEN deptime BETWEEN '2000' AND '2359' THEN '2000-2359'
END AS time_interval,
AVG(depdelay) AS avg_delay
FROM flights
WHERE year BETWEEN 1997 AND 2006 AND deptime <= '2359'
GROUP BY year, dayofweek, time_interval
")
q1 <- dbGetQuery(conn, q1)
View(q1)
To visualise distribution of delay for each day of the week
ggplot(q1, aes(x = factor(DayOfWeek), y = avg_delay, fill = factor(DayOfWeek))) +
geom_boxplot() +
scale_fill_brewer(palette = "Set3") +
labs(title = "Distribution of Delays by Day of the Week",
x = "Day of the Week",
y = "Average Delay",
fill = "Day of the Week") +
theme_bw()
This ensures correct ordering of data
q1$time_interval <- factor(q1$time_interval, levels = unique(q1$time_interval))
Each tile represents the avg delay for the corresponding time interval, day of week and year
ggplot(q1, aes(x = time_interval, y = DayOfWeek, fill = avg_delay)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "blue") +
facet_wrap(~Year, scales = "free_y") +
labs(title = "Average Delay Heatmap by Time Interval, Day of the Week, and Year",
x = "Time Interval",
y = "Day of the Week",
fill = "Average Delay") +
theme_bw() +
scale_x_discrete(labels = c('0000', '0400', '0800', '1200', '1600', '2000'))
This SQL query calculates the average delay for flights based on the age of the associated plane. It computes the age of each plane by subtracting the year of the flight (f.year) from the year the plane was manufactured (p.year). It then calculates the average departure delay (avg_delay) for flights associated with planes of each age. The query filters out flights with null departure delays (f.depdelay IS NOT NULL) and planes with missing or invalid manufacturing years (p.year NOT IN (’‘, ’None’, ‘0000’) and LENGTH(p.year) = 4). Additionally, it ensures that the calculated plane age is non-negative ((f.year - p.year) >= 0) to avoid negative values. Finally, the results are grouped by plane age.
q2 <- dbGetQuery(conn, "
SELECT
(f.year - p.year) AS plane_age,
AVG(f.depdelay) AS avg_delay
FROM
flights AS f
INNER JOIN
planes AS p ON f.tailnum = p.tailnum
WHERE
f.depdelay IS NOT NULL
AND p.year IS NOT NULL
AND p.year NOT IN ('', 'None', '0000')
AND (f.year - p.year) >= 0 -- Ensure positive plane_age values
AND LENGTH(p.year) = 4 -- Ensure p.year is a 4-digit number
GROUP BY
plane_age
")
correlation <- cor(q2$plane_age, q2$avg_delay)
This ggplot2 code creates a scatter plot (geom_point()) with a trend line (geom_smooth(method = “lm”, se = FALSE)) representing the relationship between the age of the plane (plane_age) and the average delay (avg_delay). It also adds a title (labs(title = …)) that includes the correlation coefficient (correlation) rounded to two decimal places, and labels for the x-axis (Plane Age) and y-axis (Average Delay).
ggplot(q2, aes(x = plane_age, y = avg_delay)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
labs(title = paste("Average Delay vs. Plane Age (Correlation:", round(correlation, 2), ")"),
x = "Plane Age",
y = "Average Delay")
## `geom_smooth()` using formula = 'y ~ x'
flights_db <- tbl(conn, "flights")
airports <- tbl(conn, "airports")
carriers <- tbl(conn, "carriers")
planes <- tbl(conn, "planes")
flights <- flights_db %>%
left_join(airports, by = c("Origin" = "iata")) %>%
left_join(airports, by = c("Dest" = "iata"), suffix = c("_origin", "_dest")) %>%
left_join(carriers, by = c("UniqueCarrier" = "Code")) %>%
left_join(planes, by = c("TailNum" = "tailnum"))
flights <- flights %>%
mutate(Distance = sqrt((lat_dest - lat_origin) ^ 2 + (long_dest - long_origin) ^ 2))
features <- c('CRSDepTime', 'CRSArrTime', 'Distance')
flights <- flights %>%
mutate(Distance = if_else(is.na(Distance), 0, Distance))
coefficients <- list()
models <- list()
This code iterates through the 10 years of data, processing each year to be fitted into the logistic regression model.
for (year in 1997:2006) {
cat("Processing data for year:", year, "\n")
# Calculate total number of rows for the year
total_rows_query <- dbSendQuery(conn, paste0("SELECT COUNT(*) FROM flights WHERE Year = ", year))
total_rows_result <- dbFetch(total_rows_query)
total_rows <- ifelse(is.null(total_rows_result), 0, total_rows_result[[1]])
dbClearResult(total_rows_query)
# Skip to the next year if there are no rows for the current year
if (total_rows == 0) {
cat("No data for year", year, "\n")
next
}
# Query database for the year
query <- paste0("SELECT * FROM flights WHERE Year = ", year)
subset_data <- dbGetQuery(conn, query)
# Convert to data frame
subset_data <- as.data.frame(subset_data)
# Sample data for the year
sample_size <- min(nrow(subset_data), 10000)
subset_data <- subset_data %>%
sample_n(size = sample_size, replace = FALSE)
# Extract features and target variable
X <- subset_data[, features]
y <- subset_data$Diverted
# Fit logistic regression model for the year
model <- glm(y ~ ., data = X, family = binomial)
# Store coefficients and fitted model
coefficients[[as.character(year)]] <- coef(model)
models[[as.character(year)]] <- list(model = model, coefficients = coef(model))
}
## Processing data for year: 1997
## Processing data for year: 1998
## Processing data for year: 1999
## Processing data for year: 2000
## Processing data for year: 2001
## Processing data for year: 2002
## Processing data for year: 2003
## Processing data for year: 2004
## Processing data for year: 2005
## Processing data for year: 2006
for (year in 1997:2006) {
cat("Logistic Regression Model for Year", year, ":\n")
model_data <- models[[as.character(year)]]
model <- model_data$model
coefficients <- model_data$coefficients
cat("Intercept:", coef(model)[1], "\n")
for (i in 2:length(coefficients)) {
feature <- features[i - 1]
coefficient <- coefficients[i]
cat(feature, ":", coefficient, "\n")
}
cat("\n")
}
## Logistic Regression Model for Year 1997 :
## Intercept: -7.236703
## CRSDepTime : 0.0003140569
## CRSArrTime : 3.743028e-05
## Distance : 0.0007034126
##
## Logistic Regression Model for Year 1998 :
## Intercept: -7.103178
## CRSDepTime : 0.0002733503
## CRSArrTime : -7.669215e-05
## Distance : 0.0008609159
##
## Logistic Regression Model for Year 1999 :
## Intercept: -8.229476
## CRSDepTime : 0.001089458
## CRSArrTime : 8.724587e-05
## Distance : 0.0003852868
##
## Logistic Regression Model for Year 2000 :
## Intercept: -5.399231
## CRSDepTime : -0.001196651
## CRSArrTime : 0.000163614
## Distance : 0.0005313225
##
## Logistic Regression Model for Year 2001 :
## Intercept: -7.446507
## CRSDepTime : -0.0007399036
## CRSArrTime : 0.001185209
## Distance : 0.0004984196
##
## Logistic Regression Model for Year 2002 :
## Intercept: -8.183335
## CRSDepTime : 0.0009593334
## CRSArrTime : 0.0001046534
## Distance : 0.0004903434
##
## Logistic Regression Model for Year 2003 :
## Intercept: -6.59028
## CRSDepTime : 0.0004509175
## CRSArrTime : -0.0005422799
## Distance : 0.0003683572
##
## Logistic Regression Model for Year 2004 :
## Intercept: -8.011486
## CRSDepTime : -0.001404717
## CRSArrTime : 0.002337025
## Distance : 0.0002066785
##
## Logistic Regression Model for Year 2005 :
## Intercept: -7.390778
## CRSDepTime : 0.0008176982
## CRSArrTime : -0.0004227273
## Distance : 0.000324556
##
## Logistic Regression Model for Year 2006 :
## Intercept: -8.019679
## CRSDepTime : -0.0006937221
## CRSArrTime : 0.001138285
## Distance : 0.0008472753
coefficients_df <- data.frame(
Year = 1997:2006,
CRSDepTime = sapply(models, function(x) coef(x$model)[2]),
CRSArrTime = sapply(models, function(x) coef(x$model)[3]),
Distance = sapply(models, function(x) coef(x$model)[4])
)
coefficients_long <- tidyr::gather(coefficients_df, key = "Feature", value = "Coefficient", -Year)
ggplot(coefficients_long, aes(x = Year, y = Coefficient, color = Feature)) +
geom_line() +
geom_point() +
labs(title = "Logistic Regression Coefficients Across Years",
x = "Year",
y = "Coefficient Value") +
scale_color_manual(values = c( "CRSDepTime" = "blue",
"CRSArrTime" = "orange",
"Distance" = "green")) +
theme_minimal() +
theme(legend.position = "top")
dbDisconnect(conn)