Note that since I write these codes for one of my current projects, not all of them are crucial in creating the plots.
- First, we need to load the following packages and define a function to compute differences and logarithmic differences (this is to compute the growth rates): This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
library(reshape2); library(dplyr); library(readxl); library(readr); library (zoo); library(ggplot2); library(ggrepel); library(prismatic); library(ggsci); library(paletteer) #for plotting #functions to add growth rate (100*log change) #[Warnings: NA will be produced for the diff where previous value is zero] diff<- function(x){x-dplyr::lag(x)} #daily change pctdiff<-function(x){100*(log(x)-log(dplyr::lag(x)))} - Then, we extract the latest data from the Johns Hopkins University Center for Systems Science and Engineering (JHU CSSE)'s Github repository. There are three main variables of interest: The daily cumulative numbers of confirmed cases, recovered cases and death tolls: This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
confirm<-read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"))[,-c(3,4)] recover<-read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"))[,-c(3,4)] death<-read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"))[,-c(3,4)] colnames(confirm)[1:2]=colnames(recover)[1:2]=colnames(death)[1:2]=c("region", "country") - Next, we can do some reshaping the panel data from wide to long format: This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
confirm_world <- melt(confirm, id.vars = c("region", "country"), variable.name = 'date',value.name = 'confirm'); confirm_world$date<-as.Date(confirm_world$date,"%m/%d/%y"); confirm_world$country[confirm_world$country=='US']<-"United States" recover_world <- melt(recover, id.vars = c("region", "country"), variable.name = 'date',value.name = 'recover'); recover_world$date<-as.Date(recover_world$date,"%m/%d/%y"); recover_world$country[recover_world$country=='US']<-"United States" death_world <- melt(death, id.vars = c("region", "country"), variable.name = 'date',value.name = 'death'); death_world$date<-as.Date(death_world$date,"%m/%d/%y"); death_world$country[death_world$country=='US']<-"United States" - The next step is to compute the growth rates of these variables: This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
confirm_world <- confirm_world %>% group_by(country,date) %>% summarise_at(c('confirm'),sum,na.rm=TRUE) %>% group_by(country) %>% mutate(change_confirm=pctdiff(confirm)) %>% #log diff in confirmed cases mutate(new_confirm=diff(confirm)) %>% #new cases --> % change = this value divided by existing confirm cases mutate(change_new_confirm=pctdiff(new_confirm)); #log change in new cases confirm_world[sapply(confirm_world, is.na)]<-NA confirm_world[sapply(confirm_world, is.infinite)]<-NA # recover_world <- recover_world %>% group_by(country,date) %>% summarise_at(c('recover'),sum,na.rm=TRUE) %>% group_by(country) %>% mutate(change_recover=pctdiff(recover)) %>% #log diff in recovered cases mutate(new_recover=diff(recover)) %>% #new cases --> % change = this value divided by existing recover cases mutate(change_new_recover=pctdiff(new_recover)); #log change in new cases recover_world[sapply(recover_world, is.na)]<-NA recover_world[sapply(recover_world, is.infinite)]<-NA # death_world <- death_world %>% group_by(country,date) %>% summarise_at(c('death'),sum,na.rm=TRUE) %>% group_by(country) %>% mutate(change_death=pctdiff(death)) %>% #log diff in death cases mutate(new_death=diff(death)) %>% #new cases --> % change = this value divided by existing death cases mutate(change_new_death=pctdiff(new_death)); #log change in new cases death_world[sapply(death_world, is.na)]<-NA death_world[sapply(death_world, is.infinite)]<-NA - I then create a nice theme for our ggplot2 plots: This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
commonTheme = list(labs(color="",fill=""), theme_bw(), theme(axis.line = element_line(colour = "black"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.border = element_blank(), panel.background = element_blank()), theme(legend.position="none"), theme(plot.title = element_text(size = rel(2), face = "bold"), plot.margin = unit(c(1,1,1,1), "cm"), plot.subtitle = element_text(size = rel(1.5)), plot.caption = element_text(size = rel(1.2)), axis.text.y = element_text(size = rel(2)), axis.title.x = element_text(size = rel(1.5)), axis.title.y = element_text(size = rel(1.5)), axis.text.x = element_text(size = rel(2)), legend.text = element_text(size = rel(2)) )) - Now we can begin plotting. The first thing to do is to get the data for some representative countries. I include the big players that are featured heavily in the news and two developing ones (my home country and its neighbor, can you guess which one I am from?). You can modify this country list anyway you like: This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
focus_countries <- c('Australia','China','United Kingdom','United States', 'Iran', 'Japan','Korea, South','Italy','France','Spain','Vietnam','Cambodia'); focus_cn <- c("AUS","CHN", "GBR", "USA", "JPN",'IRN', "KOR", "ITA", "FRA", "ESP",'VNM','CAM') - Then plot the daily cumulative epidemic curves using the confirmed cases and deaths. This result in the following plots: This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
##Daily cumulative curves---- confirm_curve <- confirm_world %>% filter(country %in% focus_countries) %>% filter(confirm > 100) %>% #cumulative value since the 100th case mutate(days_elapsed = date - min(date), end_label = ifelse(date == max(date), country, NA)); # confirm_curve %>% ggplot(mapping = aes(x = as.numeric(days_elapsed), y = confirm, color = country, label = end_label, group = country)) + geom_line(size = 0.8) + geom_text_repel(size = 8, nudge_x = 1.1, nudge_y = 0.1, segment.color = NA) + guides(color = FALSE) + scale_color_manual(values = prismatic::clr_darken(paletteer_d("ggsci::category20_d3"), 0.2)) + scale_y_continuous(labels = scales::comma_format(accuracy = 1), breaks = 10^seq(2, round(log(max(confirm_curve$confirm))/log(10)-1,0)), #find the power scale of a base 2), trans = "log10") + labs(x = "Days since 100th confirmed case", y = "Cumulative number of cases (log scale)", title = "Daily Cumulative Confirmed Cases from COVID-19, Selected Countries", subtitle = paste("Data as of", format(max(confirm_curve$date), "%A, %B %e, %Y")), caption = "Long Vo: longvoqnu@gmail.com / Data: JHU") + commonTheme # death_curve<-death_world %>% filter(country %in% focus_countries) %>% filter(death > 30) %>% #cumulative value since the 100th case mutate(days_elapsed = date - min(date), end_label = ifelse(date == max(date), country, NA)) ; death_curve %>% ggplot(mapping = aes(x = as.numeric(days_elapsed), y = death, color = country, label = end_label, group = country)) + geom_line(size = 0.8) + geom_text_repel(size = 8, nudge_x = 1.1, nudge_y = 0.1, segment.color = NA) + guides(color = FALSE) + scale_color_manual(values = prismatic::clr_darken(paletteer_d("ggsci::category20_d3"), 0.2)) + scale_y_continuous(labels = scales::comma_format(accuracy = 1), breaks = 10^seq(2, round(log(max(death_curve$death))/log(10)-1,0)), #find the power scale of a base 2), trans = "log10") + labs(x = "Days since 30th death", y = "Cumulative number of cases (log scale)", title = "Daily Cumulative Death Tolls from COVID-19, Selected Countries", subtitle = paste("Data as of", format(max(death_curve$date), "%A, %B %e, %Y")), caption = "Long Vo: longvoqnu@gmail.com / Data: JHU") + commonTheme - And finally, we can plot different varieties of these curves using the changes in cases and in death tolls: This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
new_confirm_curve <- confirm_world %>% filter(country %in% focus_countries) %>% mutate(new_confirm_ma=rollapply(new_confirm,7,mean,align='right',fill=NA)) %>% #compute 1-week moving average filter(new_confirm_ma > 10) %>% #cumulative value since the 100th case (MA) mutate(days_elapsed = date - min(date), end_label = ifelse(date == max(date), country, NA)); # new_confirm_curve %>% ggplot(mapping = aes(x = as.numeric(days_elapsed), y = new_confirm_ma, color = country, label = end_label, group = country)) + geom_line(size = 0.8) + geom_text_repel(size = 8, nudge_x = 1.1, nudge_y = 0.1, segment.color = NA) + guides(color = FALSE) + scale_color_manual(values = prismatic::clr_darken(paletteer_d("ggsci::category20_d3"), 0.2)) + scale_y_continuous(labels = scales::comma_format(accuracy = 1), breaks = 2^seq(2, round(log(max(new_confirm_curve$new_confirm_ma))/log(2)-1,0)), #find the power scale of a base 2), trans = "log2") + labs(x = "Days since new cases passed 100", y = "Number of new cases (log scale)", title = "Daily New Cases from COVID-19 (7-day rolling average), Selected Countries", subtitle = paste("Data as of", format(max(confirm_curve$date), "%A, %B%e, %Y")), caption = "Long Vo: longvoqnu@gmail.com/ Data: JHU") + commonTheme # new_death_curve <- death_world %>% filter(country %in% focus_countries) %>% mutate(new_death_ma=rollapply(new_death,7,mean,align='right',fill=NA)) %>% #compute 1-week moving average filter(new_death_ma > 3) %>% #cumulative value since the 50th case (MA) mutate(days_elapsed = date - min(date), end_label = ifelse(date == max(date), country, NA)); # new_death_curve %>% ggplot(mapping = aes(x = as.numeric(days_elapsed), y = new_death_ma, color = country, label = end_label, group = country)) + geom_line(size = 0.8) + geom_text_repel(size = 8, nudge_x = 1.1, nudge_y = 0.1, segment.color = NA) + guides(color = FALSE) + scale_color_manual(values = prismatic::clr_darken(paletteer_d("ggsci::category20_d3"), 0.2)) + scale_y_continuous(labels = scales::comma_format(accuracy = 1), breaks = 2^seq(2, round(log(max(new_death_curve$new_death_ma))/log(2)-1,0)), #find the power scale of a base 2), trans = "log2") + labs(x = "Days since deaths passed 3", y = "Number of deaths (log scale)", title = "Daily Death Tolls from COVID-19 (7-day rolling average), Selected Countries", subtitle = paste("Data as of", format(max(death_curve$date), "%A, %B%e, %Y")), caption = "Long Vo: longvoqnu@gmail.com/ Data: JHU") + commonTheme