# --- All the R-libraries used for this analysis ---
# Ecosystem of modern R-packages
library(tidyverse)
# For parsing JSON
library(tidyjson)
# For reshaping data frames, e.g. `spread`
library(reshape2)
# For `rollmean`
library(zoo)
# For outputting dataframes as HTML tables
library(kableExtra)
# For use of `percent_format()`
library(scales)
# For custom color schemes in ggplot
library(RColorBrewer)
# Alternative JSON parsing library
library(jsonlite)
# For `lookup_by_column`
library(rqdatatable)
# For various datetime functionality
library(lubridate)
# For assignment pipe functionality
library(magrittr)
A master’s degree is a five-year-long endeavour that is often summarized in a measly, one-page grade sheet. That feels quite unsatisfying, as so many aspects of an education end up being entirely neglected. As a student nearing the end of my education in “Applied Physics and Mathematics” with a specialization in statistics at NTNU, I want to rectify this problem.
I have been tracking my time usage as a student since I started at the university in August 2014. Every time I sat down with study-related work, I started a Toggl timer. If I had to go to the bathroom, talk with another student, etc., I stopped the timer, even if only for two minutes. The data set is therefore intended to accurately reflect the number of effective hours I have spent studying over the last five years. I hope this may prove insightful for other students, for professors wondering what their students are doing, and (hopefully) for prospective employers.
I have written a small Python library for requesting/scraping data from various sources such as Toggl, Runkeeper, SleepCycle, Sleep as Android, and grades.no. It is this data we will sanitize, visualize, and analyze using R in this article. At any point, you can press the “Code” button on the right-hand side if you wonder how the data is visualized.
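If you would rather pull the Toggl data directly from R instead of going through my Python library, a request against Toggl's detailed report API might look roughly like the sketch below. This is only an illustration: the endpoint URL, the query parameters, and the `TOGGL_API_TOKEN`/`TOGGL_WORKSPACE_ID` environment variables are assumptions on my part, not part of the actual pipeline used here.
# Rough sketch (not the actual pipeline): fetching Toggl time entries from R.
# The endpoint and environment variables below are illustrative assumptions.
library(httr)
toggl_response <- GET(
  "https://api.track.toggl.com/reports/api/v2/details",
  authenticate(Sys.getenv("TOGGL_API_TOKEN"), "api_token"),
  query = list(
    workspace_id = Sys.getenv("TOGGL_WORKSPACE_ID"),
    since = "2014-08-01",
    until = "2019-06-30",
    user_agent = "time-tracking-analysis"
  )
)
# Parsed list of time entries, ready to be tidied like the JSON file below
toggl_raw <- content(toggl_response, as = "parsed")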
The Toggl time entries end up looking like this:
# Reading JSON generated by python library
toggl_json <- tidyjson::read_json(path = "../data/toggl/tidy_details.json")
# Parse the JSON structure into a tidy data frame
toggl <- toggl_json %>%
as.tbl_json() %>%
gather_array() %>%
spread_values(
description = jstring("description"),
date = jstring("start"),
start = jstring("start"),
end = jstring("end"),
study_session_duration = jnumber("dur"),
project = jstring("project")
) %>%
mutate(
date = date(date),
start = ymd_hms(start),
end = ymd_hms(end),
study_session_duration = study_session_duration / (1000 * 60 * 60)
) %>%
rename(course = "project") %>%
# Drop unrealistically long entries (> 12 hours), most likely forgotten timers
dplyr::filter(study_session_duration < 12) %>%
select(-document.id, -array.index)
# Sanitize the descriptions into 8 main categories
toggl$description %<>%
str_replace_all(c(
".*[Øø]v.*" = "Exercise",
".*[Ff]orelesning.*" = "Lecture",
".*[Tt]eori.*" = "Theory",
".*Eksamen.*" = "Exam",
".*Euler.*" = "Exercise",
".*Det tenkende.*" = "Theory",
".*Ethics.*" = "Theory",
".*Nedlasting.*" = "Organization",
".*Innlevering.*" = "Hand-in",
".*Lab.*" = "Hand-in",
".*Seminar.*" = "Lecture",
".*Pedag.*" = "Pedagogics",
".*Lære bort.*" = "Pedagogics",
".*Maple.*" = "Hand-in",
".*Theory.*" = "Theory",
"Mattelab" = "Hand-in",
".*[Ll]ese.*" = "Theory",
".*Wunderlist.*" = "Organization",
".*[Kk]apittel.*" = "Theory",
".*[Rr]epetisjon.*" = "Repetition",
"Inn" = "Hand-in",
"Intervju" = "Exercise",
"Google Calendar" = "Organization",
"Kok" = "Hand-in",
".*[Oo]rganiser.*" = "Organization",
"Anbefalte oppgaver" = "Exercise"
))
# Assign the remaining empty descriptions as "Hand-in"
toggl$description[toggl$description == ""] <- "Hand-in"
# And consider Pedagogics as Exercise, as I have done in recent times
toggl$description[toggl$description == "Pedagogics"] <- "Exercise"
# We calculate the percentage used in each work type.
# This is used to reorder the description factor variable accordingly.
percentage <- toggl %>%
group_by(description) %>%
summarize(hours = sum(study_session_duration)) %>%
mutate(percentage = 100 * hours / sum(hours)) %>%
select(-hours)
toggl %<>%
inner_join(percentage, by = "description") %>%
mutate(description = ordered(description)) %>%
mutate(description = fct_reorder(description, percentage)) %>%
select(-percentage)
# Show example data in report
toggls <- dim(toggl)[1]
toggl[c(1:3, (toggls-3):toggls), c("course", "description", "start", "end", "study_session_duration")]
Each entry contains the course I have been working on, the nature of the work, and start/end datetimes. The summary statistics are as follows:
toggl_summary <- list(
total_time_entries = toggls,
total_work_days = toggl$date %>% unique() %>% length(),
total_study_hours = round(toggl$study_session_duration %>% sum()),
start_date = toggl$date %>% min(),
end_date = toggl$date %>% max()
)
toggl_summary$percentage = sprintf(
"%.0f%%",
100 * (
toggl_summary$total_work_days /
as.numeric(toggl_summary$end_date - toggl_summary$start_date)
)
)
toggl_summary %>%
as_tibble() %>%
kable(
format = "pandoc",
align = "c",
col.names = c(
"Total entries",
"Total work days",
"Total study hours",
"First entry",
"Last entry",
"Days spent working"
)
)
Total entries | Total work days | Total study hours | First entry | Last entry | Share of days worked |
---|---|---|---|---|---|
4837 | 822 | 3906 | 2014-08-30 | 2019-06-05 | 47% |
What you quickly realize when tracking your own effective time use is how little of it you actually have during an average day. To illustrate this point, let’s plot the effective lengths of all my work days as a histogram grouped into half-hour intervals.
days <- toggl %>%
group_by(date) %>%
summarize(study_hours = sum(study_session_duration)) %>%
add_column(rolling_mean = rollmean(.$study_hours, 30, na.pad=TRUE))
# Set the theme for all future plots
theme_set(theme_minimal() + theme(legend.position = "bottom"))
colors <- list(
green = "#536D3D",
light_green = "#9AB73C",
orange = "#E5D017",
dark_orange = "#E59E15",
red = "#DB4801",
dark_red = "#FF0000"
)
# Helper function for x-axis representing hours
scale_hours <- function(..., y = FALSE) {
fun <- ifelse(y, scale_y_continuous, scale_x_continuous)
fun(
...,
labels = function(x) sprintf("%.0f h", x)
)
}
binwidth <- 0.5
days %>%
ggplot() +
aes(x = study_hours, y = binwidth * ..density..) +
geom_histogram(
aes(fill = ..density..),
binwidth = binwidth,
color = "white",
show.legend = FALSE
) +
geom_vline(
aes(xintercept = mean(study_hours)),
color = colors$red,
size = 3
) +
geom_text(
aes(x = mean(study_hours) + 0.36, y = 0.112),
label = "Mean",
color = colors$red
) +
scale_y_continuous(labels = percent_format()) +
scale_hours(breaks = 0:12) +
scale_color_discrete("") +
ylab("Relative frequency") +
xlab("Length of work day")
I consider myself an above-average student when it comes to the time I invest in my studies, but even so I only track 4.75 hours on average for the subset of days on which I study at all. The data also shows a large standard deviation of 2.18 hours.
This might be an example of the superiority bias, or it may be true that most people actually overestimate their effective work hours during an average day. Based on data gathered by other students at my university, I lean towards the latter.
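For reference, the two numbers above can be recomputed directly from the per-day `days` table built earlier; this is nothing more than a quick check.
# Recompute the mean and standard deviation of the effective work-day length
days %>%
  summarize(
    mean_hours = mean(study_hours),
    sd_hours = sd(study_hours)
  )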
The “nature of the work” is categorized into seven different types: Hand-in, Theory, Repetition, Exercise, Lecture, Exam, and Organization.
So how does the work distribute across these categories?
toggl %>%
group_by(description) %>%
summarize(study_hours = sum(study_session_duration)) %>%
ggplot() +
aes(x = reorder(description, study_hours), y = study_hours) +
geom_bar(stat = "identity", aes(fill = description)) +
coord_flip() +
scale_fill_brewer(palette = "Spectral", direction = -1) +
ylab("Total Study Hours") +
xlab("") +
scale_hours(
breaks = seq(0, 1500, by = 500),
y = TRUE
) +
theme(legend.position = "none")
The work is dominated by the `Hand-in` category. This comes as no surprise to me. Most of the semester is spent handing in one project after another, often with little time in between. I will come back to how this often adversely affects my final grade further down.
We can try to visualize what kind of work dominates the workdays of different length.
hours_per_day <- toggl %>%
group_by(date) %>%
summarize(study_hours = sum(study_session_duration))
hours_per_category_per_day <- toggl %>%
group_by(date, description) %>%
summarize(category_hours = sum(study_session_duration)) %>%
ungroup()
categories_per_day <- hours_per_day %>%
inner_join(hours_per_category_per_day) %>%
group_by(date) %>%
spread(description, category_hours, fill = 0)
binwidth <- 0.5
categories_per_day %>%
gather(category, hours, colnames(.)[-c(1:2)]) %>%
group_by(group = cut(study_hours, breaks = seq(0, 12, by=0.5))) %>%
ungroup() %>%
mutate(group = (as.numeric(group) - 1) / 2) %>%
mutate(category = ordered(category, levels = levels(toggl$description))) %>%
select(-date, -study_hours) %>%
group_by(group, category) %>%
summarize(hours = sum(hours)) %>%
ggplot() +
aes(x = group + 0.25) +
aes(y = hours) +
aes(fill = category) +
geom_col() +
scale_hours(breaks = seq(0, 12)) +
scale_hours(y = TRUE) +
scale_fill_brewer("", palette = "Spectral", direction = -1) +
xlab("Length of work day") +
ylab("Total hours spent") +
theme(legend.position = "right")
The conclusion that can be drawn from this plot is that the longer the day, the more probable it is that most of it has been spent on obligatory hand-ins. Most projects last for two to three weeks, and getting a good grade often requires full-time work on at least half of those days (from personal experience). Taking into account that I have to attend 3 other courses at the same time, some of them even with their own projects, this often leads to quite long work days.
Until now we have only looked at aggregate statistics, but has the amount of work changed over time?
rsi_start <- date("2016-01-15")
sick_leave <- tibble(
interval = c(date("2017-12-14"), date("2018-08-03"))
)
days %>%
ggplot() +
geom_ribbon(
data = sick_leave,
aes(
x = interval,
ymin = 0,
ymax = Inf,
y = 1
),
alpha = 0.5,
fill = colors$red
) +
annotate(
"text",
x = mean(sick_leave$interval),
y = 9,
label = "RSI\n Sick Leave",
color = colors$red
) +
geom_vline(
aes(xintercept = rsi_start),
color = colors$red
) +
annotate(
"text",
x = rsi_start + 10,
y = 9,
label = "First RSI\nSymptom",
color = colors$red,
hjust = 0
) +
aes(x = date, y = study_hours) +
geom_point(aes(color = study_hours), show.legend = FALSE) +
geom_smooth(linetype = "dashed") +
xlab("Day") +
ylab("Work hours") +
scale_y_continuous(expand = c(0, 0), limits = c(0, NA)) +
scale_fill_manual("", values = colors$red) +
scale_linetype_manual("", values = c(1)) +
theme(legend.position = "bottom")
There are 4 distinct phases here:
2014 \(\rightarrow\) 2016:
The start of my studies. The work was unfamiliar and required a lot of time investment. As I became more comfortable (and got a part time job) the work slowly decreased over time.
2016 \(\rightarrow\) 2018:
The period between my first RSI symptoms (January 2016) and my eventual sick leave. I kept studying while the RSI pains gradually became more frequent.
2018 \(\rightarrow\) mid-2018:
A doctor-recommended sick leave starting in December 2017. I had to drop out of my studies right before my final exams and take the entire next semester off, effectively setting me back one year.
mid-2018 \(\rightarrow\) present:
Continuing where I left the studies off. Since then I have been slowly increasing the work load, hopefully with more self-care.
My two cents regarding RSI issues: if you at any point still feel pain or tiredness in your arms when you wake up the day after a heavy work session, take an immediate break. Don’t start working again until you wake up and feel completely normal. If the frequency of your RSI pains increases over time, find a doctor who takes the issue seriously. My first doctor basically prescribed a healthy dose of “putting on a brave face” and a not-so-healthy dose of increasingly strong painkillers. I have been tracking my RSI pains on a crude 0-2 scale since the beginning, which we will come back to later.
What makes education especially interesting from a data analysis perspective is that your performance is quantitatively measured at the end of each semester in the form of a final grade. In my case these measurements look like this:
grades <- tibble(
course = toggl %>%
select(course) %>%
unique() %>%
arrange(course) %>%
unlist(),
course_code = c(
"EXPH0004", "Org", "TDT4102", "TDT4105", "TDT4120", "TDT4136", "TDT4145",
"TEP4105", "TFY4145", "TFY4155", "TFY4165", "TFY4215", "TIØ4146",
"TIØ4258", "TMA4100", "TMA4105", "TMA4115", "TMA4120", "TMA4145",
"TMA4150", "TMA4180", "TMA4195", "TMA4212", "TMA4215", "TMA4245",
"TMA4265", "TMA4267", "TMA4268", "TMA4295", "TMA4300", "TMA4315",
"TMA4320", "TMA4500", "TMA4850", "TMT4110"
),
course_name = c(
"Science, Philosophy & Ethics",
"Organization",
"Object Oriented Programming (C++)",
"Introduction to IT (Matlab)",
"Algorithms and Data Structures",
"Introduction to Artificial Intelligence",
"Data Modelling and Databases",
"Fluid Mechanics",
"Mechanical Physics",
"Electromagnetism",
"Thermodynamics",
"Introduction to Quantum Physics",
"Finance for Science Students",
"Technical Leadership",
"Univariate Calculus",
"Multivariate Calculus",
"Linear Algebra",
"Complex Calculus and Transformations",
"Linear Methods",
"Algebra",
"Optimization I",
"Mathematical Modelling",
"Numerical Solutions to Differential Equations",
"Numerical Mathematics",
"Introductory Statistics",
"Stochastic Processes",
"Linear Statistical models",
"Statistical Learning",
"Statistical Inference",
"Computer Intensive Statistical Methods",
"Generalized Linear Models",
"Introduction to Scientific Computation",
"Project Thesis",
"Experts in Teams",
"Chemistry"
),
grade = as.ordered(c(
"A", NA, "A", "A", "A", NA, "B", "B", "A", "A", "B", "B", "A", "B", "A", "A",
"A", "B", "C", NA, "C", "C", "C", "C", "A", "D", "B", "A", NA, "B", "A", "B",
NA, "A", "B"
)),
semester = c(
1, NA, 4, 1, 5, 7, 10, 3, 1, 2, 3, 10, 9, 3, 1, 2, 2,
3, 5, 6, 6, 9, 6, 9, 4, 5, 6, 10, 5, 10, 9, 4, 11, 10, 2
)
) %>%
arrange(semester) %>%
mutate(season = ifelse(semester %% 2, "H", "V")) %>%
mutate(year = 2014 + floor((semester) / 2)) %>%
mutate(semester_code = paste(season, year, sep = ""))
# --- Add grade statistics to data frame ---
fetch_grade_statistics <- function (grade) {
jsonlite::fromJSON(
sprintf(
"https://grades.no/api/courses/%s/grades/%s",
grade$course_code,
grade$semester_code
)
) %>%
as_tibble() %>%
mutate(course_code = grade$course_code) %>%
select(-semester_code)
}
# Rename in order to hit correct API-endpoint
grades[3,]$course_code <- "FY1001"
grades[5,]$course_code <- "FY1003"
# Add the grade statistics
grade_statistics <- grades %>%
filter(semester < 10) %>%
rowwise() %>%
do(row = fetch_grade_statistics(as.list(.))) %>%
use_series(row) %>%
bind_rows()
grades %<>% left_join(grade_statistics, by = "course_code")
# Rename back to original course codes
grades[3,]$course_code <- "TFY4145"
grades[5,]$course_code <- "TFY4155"
# Print the entire tibble except the statistics
grades %>% select(semester, course_code, course_name, grade)
A more aggregate summary can also be made for the impatient:
grades %>%
drop_na(grade) %>%
group_by(grade) %>%
summarize(total = n()) %>%
ggplot() +
aes(x = "", y = total, fill = grade) +
geom_col() +
scale_fill_manual(values = as.vector(unlist(colors))) +
geom_text(
aes(
y = 30 - 0.5 * total - cumsum(c(0, total[1:3])),
x = 1,
label = paste(total, " ", grade, "'s", sep = "")
),
color = "white",
size = 6
) +
theme(
axis.text = element_blank(),
panel.grid = element_blank(),
legend.position = "right"
) +
ylab("")
But what makes this data really interesting is when we combine it with the time tracking data…
We start by testing the first hypothesis that comes to mind, namely that hard work pays off. Many, including myself, think that time investment is the strongest predictor of the final grade in a course, but is that really the case here?
courses <- toggl %>%
group_by(course) %>%
summarize(study_hours = sum(study_session_duration)) %>%
inner_join(grades)
courses %>%
drop_na(grade) %>%
ggplot() +
aes(x = reorder(course_name, -study_hours), y = study_hours) +
geom_bar(stat = "identity", aes(fill = grade)) +
coord_flip() +
theme(legend.position = "bottom") +
scale_fill_manual(
values = as.vector(unlist(colors)),
na.value="gray"
) +
scale_hours(y = TRUE) +
ylab("Time Spent on Course") +
xlab("") +
theme(
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank()
)
If the hypothesis held perfectly, we would expect the colors to be arranged in order, going from orange at the top to dark green at the bottom. This is obviously not the case here, but there are some minor trends. All courses with more than 180 hours of time investment yielded a final grade of A, while the courses below this limit show a greater degree of variability.
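The 180-hour cut-off can be verified with a quick tally from the `courses` data frame defined above:
# Sanity check: grades for all courses with more than 180 tracked hours
courses %>%
  drop_na(grade) %>%
  filter(study_hours > 180) %>%
  count(grade)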
So we just concluded that time investment alone is not a perfect predictor of final grades. Perhaps some courses are more difficult than others? In order to account for this possibility, we can investigate my performance relative to the other students who took the same exam. As luck would have it, grade statistics for all courses at NTNU are available through a JSON API. We will now plot the grade percentiles for all my courses (matching the specific semesters) while indicating which percentile range I belong to.
# Calculate lower and upper percentile for my grade groups.
# Quite ugly, but it works.
grade_cummulatives <- grades[, letters[1:6]] %>%
apply(1, cumsum) %>%
t() %>%
as_tibble() %>%
mutate_all(~ . / f) %>%
bind_cols(grades %>% select(-one_of(letters[1:6])), .) %>%
mutate(lower_grade_chr = tolower(.$grade)) %>%
mutate(upper_grade_int = as.numeric(as.factor(lower_grade_chr)) - 1) %>%
mutate(upper_grade_int = ifelse(upper_grade_int == 0, 1, upper_grade_int)) %>%
mutate(upper_grade_chr = letters[upper_grade_int]) %>%
lookup_by_column("lower_grade_chr", "lower_grade_percentile") %>%
lookup_by_column("upper_grade_chr", "upper_grade_percentile") %>%
mutate(upper_grade_percentile = replace(upper_grade_percentile, grade == "A", 0)) %>%
mutate(lower_grade_percentile = 1 - lower_grade_percentile) %>%
mutate(upper_grade_percentile = 1 - upper_grade_percentile) %>%
filter(!is.na(grade)) %>%
mutate(row_number = row_number()) %>%
as_tibble() %>%
inner_join(courses %>% select(course_code, study_hours), by = "course_code")
# Calculate the grade intervals for each grade and each course.
# Joining with study_hours for sorting purposes.
grade_intervals <- tibble(
course_name = rep(grade_cummulatives$course_name, 6),
grade = sort(rep(ordered(toupper(letters[1:6])), dim(grade_cummulatives)[1])),
ymin = c(
1 - grade_cummulatives$a,
1 - grade_cummulatives$b,
1 - grade_cummulatives$c,
1 - grade_cummulatives$d,
1 - grade_cummulatives$e,
1 - grade_cummulatives$f
),
ymax = c(
rep(1, length(grade_cummulatives$a)),
1 - grade_cummulatives$a,
1 - grade_cummulatives$b,
1 - grade_cummulatives$c,
1 - grade_cummulatives$d,
1 - grade_cummulatives$e
)
) %>%
inner_join(
courses %>% select(course_name, study_hours),
by = "course_name"
)
# Remove the grade interval that my grades fall within
grade_intervals <- grade_cummulatives %>%
select(course_name, grade) %>%
anti_join(grade_intervals, .)
size <- 6
grade_cummulatives %>%
drop_na(lower_grade_percentile) %>%
ggplot() +
aes(x = reorder(course_name, -study_hours)) +
geom_linerange(
data = grade_intervals %>% drop_na(ymin),
aes(
ymin = ymin,
ymax = ymax,
color = grade
),
size = 0.2 * size,
alpha = 0.9
) +
geom_linerange(
aes(
ymin = lower_grade_percentile,
ymax = upper_grade_percentile,
color = grade
),
size = 0.85 * size
) +
geom_linerange(
aes(
ymin = 1.035,
ymax = 1.035 + study_hours / 700
),
size = 0.4 * size,
alpha = 0.7,
color = "gray"
) +
scale_y_continuous(breaks = seq(0, 1, 0.25), labels = percent_format()) +
scale_color_manual("", values = as.vector(unlist(colors))) +
scale_x_discrete(expand = c(0.02, 0, 0.08, 0)) +
annotate("text", y = 1.15, x = 26, label = "Time Investment", color = "gray") +
xlab("") +
ylab("") +
coord_flip() +
theme(panel.grid = element_blank()) +
guides(col = guide_legend(nrow = 1))
In a perfect world, the wider bars indicating my percentile range would slowly shift towards the right as more time is being spent on the courses. Unfortunately this is yet again not the case.
What this plot does tell us, rather, is that grades are entirely non-comparable across courses. In a perfect world, you would expect two truths to hold:
Each grade letter indicates a percentile range to which the students belong.
This is not the case, since my C in “Numerical Solutions to Differential Equations” places me in the bottom third of the student population, while my C in “Linear Methods” places me at roughly the 75th percentile. I know for a fact that these two courses had an almost perfect overlap in enrolled students, so a difference in the student sample is not enough to explain this.
Time investment is proportional to the percentile range of the final grade.
My 26 hours of time investment in “Finance for Science Students” yielded an A, while 125 hours were required for “Generalized Linear Models”, which also yielded an A during the same semester. That is almost a 5-to-1 difference!
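For completeness, the two time investments compared above can be pulled straight out of the `courses` data frame:
# The time investment behind the two A's compared above
courses %>%
  filter(course_name %in% c(
    "Finance for Science Students",
    "Generalized Linear Models"
  )) %>%
  select(course_name, grade, study_hours)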
What we have ignored so far is time. Has my grade-relative performance changed over time?
semester_position <- courses %>%
group_by(semester) %>%
summarize(count = n()) %>%
mutate(count = cumsum(count)) %>%
mutate(mid = rollmean(c(0, count), 2)) %>%
drop_na()
courses %>%
filter(!is.na(semester)) %>%
ggplot() +
aes(x = reorder(course_code, semester), y = study_hours) +
geom_bar(stat = "identity", aes(fill = grade)) +
geom_vline(
data = semester_position,
aes(xintercept = count + 0.5),
color = "gray"
) +
geom_label(
data = semester_position,
aes(
label = semester,
x = mid + 0.5,
y = 230
),
color = "gray",
label.r = unit(0.2, "lines")
) +
theme(legend.position = "bottom") +
scale_fill_manual(
values = as.vector(unlist(colors)),
na.value="gray"
) +
scale_hours(y = TRUE) +
ylim(0, 240) +
theme(
axis.text.x = element_text(angle = 90),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank()
) +
labs(
x = "",
y = "Time Spent on Course",
caption = "The gray lines delineate each semester."
)
There is no way around the fact that my grades have gotten worse over time. Getting a part-time job, my RSI issues, and generally less time investment over time may all be contributing factors to this, but…
I think the most important factor is the enrollment size of the courses. The early courses that were obligatory for all students at the university were much less competitive than my more recent, higher-level courses at the Mathematical Institute. There are, admittedly, proportionally more students smarter than me as time goes on, making it harder to stand out on the exam. To argue for this point, take a look at the following plot:
class_size <- toggl %>%
group_by(course) %>%
summarize(study_hours = sum(study_session_duration)) %>%
inner_join(grades, by = "course") %>%
group_by(course) %>%
# Class size: total number of graded students (sum of all grade counts)
mutate(class_size = a + b + c + d + e + f) %>%
select(-a, -b, -c, -d, -e, -f, -study_hours, -grade, -course_name, -semester) %>%
inner_join(grade_cummulatives, by = "course")
text_include <- c(
"Finance for Science Students",
"Mechanical Physics",
"Univariate Calculus",
"Introduction to IT (Matlab)"
)
class_size %>%
ggplot() +
aes(x = class_size, y = study_hours, color = grade) +
geom_point(size = 6) +
geom_text(
data = class_size %>% filter(course_name %in% text_include),
aes(label = course_name, y = study_hours + 7),
hjust = "inward"
) +
geom_text(
aes(label = semester),
color = "white",
size = 4
) +
scale_color_manual(values = as.vector(unlist(colors))) +
scale_hours(y = TRUE, limits = c(0, NA)) +
xlim(0, 1450) +
labs(
x = "Number of Students Enrolled",
y = "Total Study Hours",
caption = "The white numbers indicate the semester"
) +
theme(legend.position = "right")
This strengthens my belief that small, specialized courses are more difficult on average. The smaller the enrollment, the more work is required to achieve an A, and vice versa. Taking a look at the white number labels, which indicate which semester the course was taken, a causal time trend looks less likely.
PS: “Finance for Science Students” is a large outlier here. The professor has basically been giving the same exam for a couple of decades, and I took advantage of that fact.
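One quick, informal way to probe this claim is a correlation between class size and tracked hours among the A grades. This is a rough check rather than a formal test, and I exclude the outlier from the PS above.
# Among the A's (excluding the outlier), how does class size relate to time spent?
class_size %>%
  filter(grade == "A", course_name != "Finance for Science Students") %>%
  summarize(correlation = cor(class_size, study_hours))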
Since time investment alone does not entirely explain the variation observed in the final grades, other factors must be at play. Several predictors can be proposed, some of which I have data for. We will start with how the time is spent on each course, grouped by final grade.
not_normalized <- toggl %>%
group_by(course, description) %>%
summarize(time = sum(study_session_duration)) %>%
ungroup() %>%
right_join(grades) %>%
filter(!is.na(grade)) %>%
group_by(grade, description) %>%
summarize(time = sum(time))
not_normalized %>%
ggplot() +
aes(x = grade, y = time, fill = description) +
geom_bar(stat = "identity") +
scale_fill_brewer(palette = "Spectral", direction = -1) +
scale_hours("Hours Spent on Courses", y = TRUE) +
xlab("Final Grade") +
theme(legend.position = "right")
This plot is not suitable for what we are trying to explain; it mainly shows the total time invested in each grade letter. It is interesting to note, though, that half of my time has been spent on courses that resulted in a final grade of A, even though less than half of my grades are As.
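The time share per grade can be read off directly from the `not_normalized` aggregates:
# Proportion of (graded) study time that went to each final grade
not_normalized %>%
  group_by(grade) %>%
  summarize(hours = sum(time)) %>%
  mutate(share = hours / sum(hours))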
Let’s normalize the height of the bars and let the width represent the proportional time investment instead. That way we can focus on work type instead of work duration.
# Calculate time proportion used for each grade.
# This is used to scale the width of the grade columns accordingly.
grade_total <- not_normalized %>%
group_by(grade) %>%
summarize(grade_total = sum(time)) %>%
mutate(grade_total = grade_total / sum(grade_total))
# Calculate the midpoint for each bar such that they lay besides each other.
# Purely done for presentation purposes. This is basically the cumulative median.
position <- tibble(
grade = ordered(c("A", "B", "C", "D")),
x = c(0, cumsum(grade_total$grade_total)[1:3]) + grade_total$grade_total / 2
)
# Normalize time usage to 1 within each grade.
# The time aesthetic is now always 1, making comparisons easier.
normalized <- not_normalized %>%
group_by(grade) %>%
mutate(time = time / sum(time)) %>%
ungroup() %>%
inner_join(grade_total, by = "grade") %>%
inner_join(position, by = "grade")
normalized %>%
ggplot() +
aes(
x = x,
y = time,
fill = description,
order = description,
width = grade_total
) +
geom_col(color = "white") +
annotate(
geom = "text",
label = normalized$grade %>% unique,
y = 1.05,
x = normalized$x %>% unique(),
color = colors[1:4],
size = 6
) +
scale_fill_brewer(palette = "Spectral", direction = -1) +
scale_y_continuous("Time Spent by Category", labels = percent_format()) +
scale_x_continuous("Time Spent by Grade", labels = percent_format()) +
coord_fixed(ratio = 1) +
theme(
legend.position = "right",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
Notice that the `Hand-in` category is vastly under-represented in the A column. This confirms my long-held suspicion that courses requiring a lot of project work leave less time for exam preparation, which has negative consequences.
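The same under-representation can be read as numbers from the `normalized` data frame used for the plot:
# Share of time spent on hand-ins within each grade column
normalized %>%
  filter(description == "Hand-in") %>%
  select(grade, time) %>%
  arrange(grade)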
This leads into one of my biggest gripes with the current grading scheme. I (and many of my fellow students) spend a lot of time on project work as it is educationally rewarding, but the projects often only weigh in at ~20-30% of the final grade. Compare this to the actual proportional time investment across the board:
percentage %>%
arrange(-percentage) %>%
mutate(percentage = sprintf("%.2f%%", percentage)) %>%
select(description, percentage) %>%
kable(col.names = c("Work type", "Proportional time used"), format = "pandoc")
Work type | Proportional time used |
---|---|
Hand-in | 40.03% |
Theory | 18.55% |
Repetition | 14.49% |
Exercise | 12.78% |
Lecture | 10.48% |
Exam | 2.01% |
Organization | 1.66% |
While keeping in mind that I might be too biased towards my own work preferences, I still think that most courses should increase the project weighting to at least 50% in order to reflect this statistical fact.
I get the impression that many people have this romanticized view of college consisting of frantic all-nighters at the end of the semester, cramming for exams. For me the exact opposite has been true. I have here collected my sleep data from two different sources: SleepCycle and Sleep as Android. The former app is really cool since it allows you to register boolean data points at the end of each day before you go to bed. That way you can keep track of stuff like coffee consumption, RSI issues, and so on.
The data looks like this:
# For use of `str_extract`
library(stringr)
#--- Importing SleepCycle data ---#
sleep_json <- tidyjson::read_json(path = "../data/sleep_sessions.json")
sleep <- sleep_json %>%
as.tbl_json() %>%
enter_object("sleep_sessions") %>%
gather_keys() %>%
spread_values(
start_date = jstring("start_local"),
stop_date = jstring("stop_local"),
start = jstring("start_local"),
stop = jstring("stop_local"),
wakeup_mood = jnumber("rating"),
heartrate = jstring("heartrate"),
sleep_quality = jnumber("stats_sq"),
sleep_duration = jnumber("stats_duration"),
steps = jstring("steps"),
sleep_notes = jstring("sleep_notes")
) %>%
mutate(
heartrate = str_extract(heartrate, "\\d+") %>% as.numeric(),
steps = str_extract(steps, "\\d+") %>% as.numeric(),
start_date = date(start_date),
stop_date = date(stop_date),
start = ymd_hms(start),
stop = ymd_hms(stop),
sleep_duration = sleep_duration / (60 * 60)
) %>%
select(-document.id, -key) %>%
add_column(source = "iOS")
#--- Converting sleep notes ---#
sleep_notes <- sleep$sleep_notes %>%
str_split(pattern = ", ")
note_categories <- sleep_notes %>%
unlist() %>%
unique() %>%
extract(-1)
for (note_category in note_categories) {
if (note_category %in% colnames(sleep)) break
boolean_vector <- vector()
for (sleep_note in sleep_notes) {
has_note <- note_category %in% sleep_note
boolean_vector <- c(boolean_vector, has_note)
}
sleep[[note_category]] <- boolean_vector
}
#--- Importing Sleep as Android data ---#
android_json <- tidyjson::read_json(path = "../data/sleep/android.json")
android <- android_json %>%
as.tbl_json() %>%
gather_array() %>%
spread_values(
start = jstring("from"),
stop = jstring("to"),
sleep_duration = jnumber("hours")
) %>%
mutate(
start = dmy_hm(start),
stop = dmy_hm(stop),
start_date = date(start),
stop_date = date(stop)
) %>%
add_column(source = "Android") %>%
select(-document.id, -array.index)
if (!("Android" %in% sleep$source)) {
sleep %<>% bind_rows(android)
}
# Add the final data set to the day summary data
days %<>% full_join(sleep, by = c("date" = "stop_date"))
# Show part of data in report
sleeps <- dim(sleep)[1]
sleep[100:105, c("start", "stop", "Drank coffee", "RSI Pains")]
This kind of data lends itself very well to trend visualization. That is what we will do here, plotting the times I went to bed and woke up over time:
translation <- 18
time_of_day <- trans_new(
"time_of_day",
transform = function(x) {(x - translation) %% 24},
inverse = function(x) {(x + translation) %% 24},
breaks = function(x) {0:50}
)
sleep %>%
mutate(
in_bed = ((hour(start) + minute(start) / 60)),
out_of_bed = ((hour(stop) + minute(stop) / 60))
) %>%
ggplot() +
aes(x = stop_date) +
geom_segment(
aes(
x = stop_date,
xend = stop_date,
y = in_bed,
yend = out_of_bed,
color = sleep_duration
)
) +
geom_smooth(
aes(y = in_bed, linetype = "Going to bed"),
fill = "#E59E15",
color = "#77AB43",
alpha = 0.3
) +
geom_smooth(
aes(y = out_of_bed, linetype = "Out of bed"),
fill = "#E59E15",
color = "#77AB43",
alpha = 0.3
) +
scale_y_continuous(
name = "Time of day",
trans = time_of_day,
labels = function(x) sprintf("%.0f:00", x)
) +
scale_linetype_manual(
values = c("dotted", "dashed"),
name = "Sleep trend",
labels = c("Going to bed", "Waking up")
) +
xlab("Date") +
theme(legend.position = "none")
This data shows some clear month-over-month trends. Periods where I wake up early every day are often followed by a large correction in the other direction, more than what regression to the mean seems to account for.
While when I sleep shows some clear trends, what about how long I sleep? If Matthew Walker’s Why We Sleep has taught me anything, it is that I should sleep on average eight to nine hours a day. Let’s see:
sleep %>%
mutate(rolling_mean = rollmean(sleep_duration, 30, na.pad = TRUE)) %>%
filter(sleep_duration > 4) %>%
ggplot() +
aes(x = stop_date, y = sleep_duration) +
geom_point(aes(color = sleep_duration), alpha = 0.5) +
geom_smooth(size = 3) +
geom_line(aes(y = rolling_mean)) +
scale_y_continuous(breaks = 1:15) +
scale_hours(breaks = seq(4, 16, 2), y = TRUE) +
xlab("Date") +
ylab("Sleep Duration") +
theme(legend.position = "none")
On average I sleep 8.73 hours a night. This is most likely enough, although there are some outliers.
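As a quick check, the quoted average can be recomputed from the sleep data. Whether the original figure used the same > 4 hour filter as the plot above is my assumption here.
# Recompute the average sleep duration (same short-night filter as the plot)
sleep %>%
  filter(sleep_duration > 4) %>%
  summarize(mean_sleep_hours = mean(sleep_duration, na.rm = TRUE))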
Is there any correlation between how many hours I work and how many hours I slept the night before? Let’s apply a linear model:
sleep_days <- days %>%
select(date, study_hours) %>%
inner_join(sleep, by = c("date" = "stop_date"))
duration_model <- lm(study_hours ~ sleep_duration, data = sleep_days)
duration_r2 <- summary(duration_model)$r.squared
duration_coefficient <- coefficients(duration_model)[2]
model_theme <- list(
xaxis = scale_hours(breaks = 1:24),
yaxis = scale_hours(y = TRUE, breaks = 0:24),
theme = theme(panel.grid.minor = element_blank()),
ylab = ylab("Total Study Hours")
)
sleep_days %>%
select(sleep_duration, study_hours) %>%
filter(sleep_duration > 5) %>%
ggplot() +
aes(x = sleep_duration, y = study_hours) +
geom_point() +
stat_smooth(method = "lm") +
xlab("Sleep duration") +
model_theme
For every additional hour I sleep I am expected to work 0.36 hours less. The regression is significant, but explains very little of the variability in the data.
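The slope, its significance, and the explained variance quoted above can be printed directly from the fitted model:
# Coefficient table and R-squared for the sleep-duration model
summary(duration_model)$coefficients
duration_r2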
Perhaps when I wake up is a better predictor?
wakeups <- sleep_days %>%
filter(sleep_duration > 5) %>%
mutate(wakeup = hour(stop) + minute(stop) / 60)
wakeup_model <- lm(study_hours ~ wakeup, data = wakeups)
wakeup_r2 <- summary(wakeup_model)$r.squared
wakeup_coefficient <- coefficients(wakeup_model)[2]
wakeups %>%
ggplot() +
aes(x = wakeup, y = study_hours) +
geom_point() +
stat_smooth(method = "lm") +
xlab("Wake Up Time") +
model_theme
This is a marginally better model: for every hour I delay waking up, I can expect on average 0.4 hours less of effective work that day.
But I think what really matters here is when I start the work day. Let’s check this:
work_starts <- toggl %>%
group_by(date) %>%
summarize(
first_entry = min(start),
study_hours = sum(study_session_duration)
) %>%
mutate(
work_start = hour(first_entry) + minute(first_entry) / 60
) %>%
select(-first_entry)
work_start_model <- lm(study_hours ~ work_start, data = work_starts)
work_start_r2 <- summary(work_start_model)$r.squared
work_start_coefficient <- coefficients(work_start_model)[2]
work_starts %>%
ggplot() +
aes(x = work_start, y = study_hours) +
geom_point() +
stat_smooth(method = "lm") +
xlab("Start Of Workday") +
model_theme
Yes, this is the best model so far. It explains 19% of the variability in the data, compared to 3% and 6% for the previous two models. For every hour I delay starting the work day, I can expect 0.3 fewer effective hours of work that same day. It might be common sense, but there you have it!
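For an overview, the explained variance of the three models can be put side by side:
# R-squared of the three models: sleep duration, wake-up time, and work start
round(
  c(
    sleep_duration = duration_r2,
    wakeup_time = wakeup_r2,
    work_start = work_start_r2
  ),
  2
)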
sleep %>%
select(stop_date, sleep_duration) %>%
rename(date = stop_date) %>%
ggplot() +
aes(x = sleep_duration) +
geom_histogram(binwidth = 0.05)