Lab 02

In this lab, I practiced data wrangling and visualization skills using real-time COVID-19 data maintained by the New York Times. If you scroll down this page, you will see different tables demonstrating COVID trends across the state of California.

#LOAD DATA from NY Times
covid_url=("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv")
covid= read_csv(covid_url)  #covid= data from nytimes

#STEP2: subset Covid Data -> filters data to CA Counties
CA_covid= covid%>%
  filter(state=="California")%>%
  group_by(county) %>%
  mutate(newCases = c(cases[1], diff(cases))) %>%
  ungroup() %>%
  filter(date >=max(date)-13)

#STEP 3
#Most cumulative cases
CA_cumulative= CA_covid %>%
  filter(date==max(date)) %>%
  slice_max(cases, n=5) %>%
  select(county,cases)
#Most new cases
CA_new= CA_covid %>%
  filter(date==max(date)) %>%
  slice_max(newCases, n=5) %>%
  select(county, newCases)

#make table using Knitr::Kable
knitr::kable(CA_cumulative,
             caption = "Counties with Most Cumulative Cases",
             col.names = c("County", "Total Cases"))

Counties with Most Cumulative Cases
County	Total Cases
Los Angeles	253985
Riverside	55073
Orange	52121
San Bernardino	50699
San Diego	42742

knitr::kable(CA_new,
             caption = "Counties with Most New Cases",
             col.names = c("County", "Total New Cases"))

Counties with Most New Cases
County	Total New Cases
Los Angeles	809
San Diego	265
Orange	185
Fresno	159
San Bernardino	156

#STEP4/5 : Load population data

library(readxl)
pop = read_excel('../data/PopulationEstimates.xls', skip = 2) %>%
  select(pop2019 = POP_ESTIMATE_2019, fips = FIPStxt) #change column names

#STEP 6 #examine data
names(pop)

## [1] "pop2019" "fips"

dim(pop)

## [1] 3273    2

names(CA_covid)

## [1] "date"     "county"   "state"    "fips"     "cases"    "deaths"   "newCases"

dim(CA_covid)

## [1] 812   7

#STEP 7
#join COVID & Population data

all_data= left_join(covid, pop, by='fips')

These are the 13 counties deemed safe in terms of the number of covid cases among their residents. Over the most recent 14 days, if more than 100 people (per 100,000 residents) are infected with Covid, they are placed on the State Watch List.

last_14_days= all_data %>%
  filter(state==c("California")) %>%
  group_by(county, date) %>%
  summarize(totalcases=sum(cases, na.rm = TRUE),
            pop2019= sum(pop2019, na.rm = TRUE)) %>%
  ungroup() %>%
  filter(date>=max(date)-13)%>%
  group_by(county,pop2019)%>%
  mutate(newCases=totalcases-lag(totalcases))%>%
  summarize(totalcases= sum(newCases,na.rm = TRUE)) %>%
  mutate(per100= totalcases/ (pop2019/100000)) %>%
  filter(per100 <=100)%>%
  pull(county)
last_14_days

##  [1] "Alpine"          "Amador"          "Del Norte"       "El Dorado"      
##  [5] "Humboldt"        "Inyo"            "Lake"            "Lassen"         
##  [9] "Mariposa"        "Mendocino"       "Mono"            "Napa"           
## [13] "Nevada"          "Placer"          "Plumas"          "Riverside"      
## [17] "San Francisco"   "San Luis Obispo" "Shasta"          "Sierra"         
## [21] "Siskiyou"        "Solano"          "Tehama"          "Trinity"        
## [25] "Tuolumne"

knitr::kable(last_14_days, caption = paste('Safe counties across State'),
             col.names = c('County'))

Safe counties across State
County
Alpine
Amador
Del Norte
El Dorado
Humboldt
Inyo
Lake
Lassen
Mariposa
Mendocino
Mono
Napa
Nevada
Placer
Plumas
Riverside
San Francisco
San Luis Obispo
Shasta
Sierra
Siskiyou
Solano
Tehama
Trinity
Tuolumne

This graph shows the number of daily, new cases at the state level for NY, CA, LA, and FL.

#QUESTION 2
##In this question, we are going to look at the story of 4 states and the impact scale can have on data interprtation. 
#The states include: New York, California, Louisiana, and Florida.
##Your task is to make a faceted bar plot showing the number of daily, new cases at the state level.

#Step 1: group data to state/county level, then filter it to the 4 states of interest

four_states=covid %>%
  group_by(state, date)%>%
  summarise(cases=sum(cases))%>%
  ungroup()%>%
  filter(state %in% c("New York", "California", "Louisiana", "Florida")) %>%
  group_by(state) %>%
  mutate(newCases=cases-lag(cases))%>%
  mutate(roll7=zoo::rollmean(newCases,7,fill=NA, align='right'))
  
ggplot(data= four_states, aes(x=date))+
  geom_col(aes(y=newCases), col='red')+
  geom_line(aes(y=roll7), col='black')+
  facet_wrap(~state)+
  labs(title = "Daily New Cases in NY, CA, LA, FL",
       x="Date",
       y="New Cases",
       caption = 'From NY Times COVID Data',
       theme_bw())

*Code below did not run as expected

The following graphs demonstrate the cases per capita of each state. This data is a better representation of the COVID situation in each state, as the graph above is strictly raw data and can be misleading.

percap_state_data= four_states%>% left_join(pop, by= c(“state”=‘fips’)) %>% mutate(new_percap= newCases/pop2019, perCap7=rollmean(new_percap, 7, fill=NA, align=‘center’))

#Graph new cases per capita ggplot(data = percap_state_data, aes(x=date)) + geom_col(aes(y=new_percap), col=‘green’) + geom_line(aes(y=perCap7), col= “black”)+ facet_wrap(~state) + labs(title = “Daily New Cases Per Capita”, x=“Date”, y=“New Cases”, caption = ‘From NY Times COVID Data’, theme_bw())

Lab 02

COVID-19 Pandemic

Liam Finn

8/24/2020