In this lab, I practiced data wrangling and visualization skills using real-time COVID-19 data maintained by the New York Times. If you scroll down this page, you will see different tables demonstrating COVID trends across the state of California.
#LOAD DATA from NY Times
covid_url=("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv")
covid= read_csv(covid_url) #covid= data from nytimes
#STEP2: subset Covid Data -> filters data to CA Counties
CA_covid= covid%>%
filter(state=="California")%>%
group_by(county) %>%
mutate(newCases = c(cases[1], diff(cases))) %>%
ungroup() %>%
filter(date >=max(date)-13)
#STEP 3
#Most cumulative cases
CA_cumulative= CA_covid %>%
filter(date==max(date)) %>%
slice_max(cases, n=5) %>%
select(county,cases)
#Most new cases
CA_new= CA_covid %>%
filter(date==max(date)) %>%
slice_max(newCases, n=5) %>%
select(county, newCases)
#make table using Knitr::Kable
knitr::kable(CA_cumulative,
caption = "Counties with Most Cumulative Cases",
col.names = c("County", "Total Cases"))
County | Total Cases |
---|---|
Los Angeles | 253985 |
Riverside | 55073 |
Orange | 52121 |
San Bernardino | 50699 |
San Diego | 42742 |
knitr::kable(CA_new,
caption = "Counties with Most New Cases",
col.names = c("County", "Total New Cases"))
County | Total New Cases |
---|---|
Los Angeles | 809 |
San Diego | 265 |
Orange | 185 |
Fresno | 159 |
San Bernardino | 156 |
#STEP4/5 : Load population data
library(readxl)
pop = read_excel('../data/PopulationEstimates.xls', skip = 2) %>%
select(pop2019 = POP_ESTIMATE_2019, fips = FIPStxt) #change column names
#STEP 6 #examine data
names(pop)
## [1] "pop2019" "fips"
dim(pop)
## [1] 3273 2
names(CA_covid)
## [1] "date" "county" "state" "fips" "cases" "deaths" "newCases"
dim(CA_covid)
## [1] 812 7
#STEP 7
#join COVID & Population data
all_data= left_join(covid, pop, by='fips')
These are the 13 counties deemed safe in terms of the number of covid cases among their residents. Over the most recent 14 days, if more than 100 people (per 100,000 residents) are infected with Covid, they are placed on the State Watch List.
last_14_days= all_data %>%
filter(state==c("California")) %>%
group_by(county, date) %>%
summarize(totalcases=sum(cases, na.rm = TRUE),
pop2019= sum(pop2019, na.rm = TRUE)) %>%
ungroup() %>%
filter(date>=max(date)-13)%>%
group_by(county,pop2019)%>%
mutate(newCases=totalcases-lag(totalcases))%>%
summarize(totalcases= sum(newCases,na.rm = TRUE)) %>%
mutate(per100= totalcases/ (pop2019/100000)) %>%
filter(per100 <=100)%>%
pull(county)
last_14_days
## [1] "Alpine" "Amador" "Del Norte" "El Dorado"
## [5] "Humboldt" "Inyo" "Lake" "Lassen"
## [9] "Mariposa" "Mendocino" "Mono" "Napa"
## [13] "Nevada" "Placer" "Plumas" "Riverside"
## [17] "San Francisco" "San Luis Obispo" "Shasta" "Sierra"
## [21] "Siskiyou" "Solano" "Tehama" "Trinity"
## [25] "Tuolumne"
knitr::kable(last_14_days, caption = paste('Safe counties across State'),
col.names = c('County'))
County |
---|
Alpine |
Amador |
Del Norte |
El Dorado |
Humboldt |
Inyo |
Lake |
Lassen |
Mariposa |
Mendocino |
Mono |
Napa |
Nevada |
Placer |
Plumas |
Riverside |
San Francisco |
San Luis Obispo |
Shasta |
Sierra |
Siskiyou |
Solano |
Tehama |
Trinity |
Tuolumne |
This graph shows the number of daily, new cases at the state level for NY, CA, LA, and FL.
#QUESTION 2
##In this question, we are going to look at the story of 4 states and the impact scale can have on data interprtation.
#The states include: New York, California, Louisiana, and Florida.
##Your task is to make a faceted bar plot showing the number of daily, new cases at the state level.
#Step 1: group data to state/county level, then filter it to the 4 states of interest
four_states=covid %>%
group_by(state, date)%>%
summarise(cases=sum(cases))%>%
ungroup()%>%
filter(state %in% c("New York", "California", "Louisiana", "Florida")) %>%
group_by(state) %>%
mutate(newCases=cases-lag(cases))%>%
mutate(roll7=zoo::rollmean(newCases,7,fill=NA, align='right'))
ggplot(data= four_states, aes(x=date))+
geom_col(aes(y=newCases), col='red')+
geom_line(aes(y=roll7), col='black')+
facet_wrap(~state)+
labs(title = "Daily New Cases in NY, CA, LA, FL",
x="Date",
y="New Cases",
caption = 'From NY Times COVID Data',
theme_bw())
*Code below did not run as expected
The following graphs demonstrate the cases per capita of each state. This data is a better representation of the COVID situation in each state, as the graph above is strictly raw data and can be misleading.
percap_state_data= four_states%>% left_join(pop, by= c(“state”=‘fips’)) %>% mutate(new_percap= newCases/pop2019, perCap7=rollmean(new_percap, 7, fill=NA, align=‘center’))
#Graph new cases per capita ggplot(data = percap_state_data, aes(x=date)) + geom_col(aes(y=new_percap), col=‘green’) + geom_line(aes(y=perCap7), col= “black”)+ facet_wrap(~state) + labs(title = “Daily New Cases Per Capita”, x=“Date”, y=“New Cases”, caption = ‘From NY Times COVID Data’, theme_bw())