Original post published to DataScience+
In this post I will show how to collect data from a webpage and to analyze or visualize in R. For this task I will use the rvest
package and will get the data from Wikipedia. I got the idea to write this post from Fisseha Berhane.
I will gain access to the prevalence of obesity in United States from Wikipedia page, then I will plot it in the map. Lets begin with loading the required packages.
## LOAD THE PACKAGES ####
library(rvest)
library(ggplot2)
library(dplyr)
library(scales)
## LOAD THE DATA ####
obesity = read_html("https://en.wikipedia.org/wiki/Obesity_in_the_United_States")
obesity = obesity %>%
html_nodes("table") %>%
.[[1]]%>%
html_table(fill=T)
head(obesity)
State and District of Columbia Obese adults Overweight (incl. obese) adults
1 Alabama 30.1% 65.4%
2 Alaska 27.3% 64.5%
3 Arizona 23.3% 59.5%
4 Arkansas 28.1% 64.7%
5 California 23.1% 59.4%
6 Colorado 21.0% 55.0%
Obese children and adolescents Obesity rank
1 16.7% 3
2 11.1% 14
3 12.2% 40
4 16.4% 9
5 13.2% 41
6 9.9% 51
## CLEAN THE DATA ####
str(obesity)
'data.frame': 51 obs. of 5 variables:
$ State and District of Columbia : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
$ Obese adults : chr "30.1%" "27.3%" "23.3%" "28.1%" ...
$ Overweight (incl. obese) adults: chr "65.4%" "64.5%" "59.5%" "64.7%" ...
$ Obese children and adolescents : chr "16.7%" "11.1%" "12.2%" "16.4%" ...
$ Obesity rank : int 3 14 40 9 41 51 49 43 22 39 ...
# remove the % and make the data numeric
for(i in 2:4){
obesity[,i] = gsub("%", "", obesity[,i])
obesity[,i] = as.numeric(obesity[,i])
}
# check data again
str(obesity)
'data.frame': 51 obs. of 5 variables:
$ State and District of Columbia : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
$ Obese adults : num 30.1 27.3 23.3 28.1 23.1 21 20.8 22.1 25.9 23.3 ...
$ Overweight (incl. obese) adults: num 65.4 64.5 59.5 64.7 59.4 55 58.7 55 63.9 60.8 ...
$ Obese children and adolescents : num 16.7 11.1 12.2 16.4 13.2 9.9 12.3 14.8 22.8 14.4 ...
$ Obesity rank : int 3 14 40 9 41 51 49 43 22 39 ...
names(obesity)
[1] "State and District of Columbia" "Obese adults"
[3] "Overweight (incl. obese) adults" "Obese children and adolescents"
[5] "Obesity rank"
names(obesity) = make.names(names(obesity))
names(obesity)
[1] "State.and.District.of.Columbia" "Obese.adults"
[3] "Overweight..incl..obese..adults" "Obese.children.and.adolescents"
[5] "Obesity.rank"
# load the map data
states = map_data("state")
str(states)
'data.frame': 15537 obs. of 6 variables:
$ long : num -87.5 -87.5 -87.5 -87.5 -87.6 ...
$ lat : num 30.4 30.4 30.4 30.3 30.3 ...
$ group : num 1 1 1 1 1 1 1 1 1 1 ...
$ order : int 1 2 3 4 5 6 7 8 9 10 ...
$ region : chr "alabama" "alabama" "alabama" "alabama" ...
$ subregion: chr NA NA NA NA ...
# create a new variable name for state
obesity$region = tolower(obesity$State.and.District.of.Columbia)
states = merge(states, obesity, by="region", all.x=T)
str(states)
'data.frame': 15537 obs. of 11 variables:
$ region : chr "alabama" "alabama" "alabama" "alabama" ...
$ long : num -87.5 -87.5 -87.5 -87.5 -87.6 ...
$ lat : num 30.4 30.4 30.4 30.3 30.3 ...
$ group : num 1 1 1 1 1 1 1 1 1 1 ...
$ order : int 1 2 3 4 5 6 7 8 9 10 ...
$ subregion : chr NA NA NA NA ...
$ State.and.District.of.Columbia : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
$ Obese.adults : num 30.1 30.1 30.1 30.1 30.1 30.1 30.1 30.1 30.1 30.1 ...
$ Overweight..incl..obese..adults: num 65.4 65.4 65.4 65.4 65.4 65.4 65.4 65.4 65.4 65.4 ...
$ Obese.children.and.adolescents : num 16.7 16.7 16.7 16.7 16.7 16.7 16.7 16.7 16.7 16.7 ...
$ Obesity.rank : int 3 3 3 3 3 3 3 3 3 3 ...
## MAKE THE PLOT ####
# adults
ggplot(states, aes(x = long, y = lat, group = group, fill = Obese.adults)) +
geom_polygon(color = "white") +
scale_fill_gradient(name = "Percent", low = "#feceda", high = "#c81f49", guide = "colorbar", na.value="black", breaks = pretty_breaks(n = 5)) +
labs(title="Prevalence of Obesity in Adults") +
coord_map()
# children
ggplot(states, aes(x = long, y = lat, group = group, fill = Obese.children.and.adolescents)) +
geom_polygon(color = "white") +
scale_fill_gradient(name = "Percent", low = "#feceda", high = "#c81f49", guide = "colorbar", na.value="black", breaks = pretty_breaks(n = 5)) +
labs(title="Prevalence of Obesity in Children") +
coord_map()
statenames = states %>%
group_by(region) %>%
summarise(
long = mean(range(long)),
lat = mean(range(lat)),
group = mean(group),
Obese.adults = mean(Obese.adults),
Obese.children.and.adolescents = mean(Obese.children.and.adolescents)
)
geom_text(data=statenames, aes(x = long, y = lat, label = region), size=3)
© 2021 TechTarget, Inc.
Powered by
Badges | Report an Issue | Privacy Policy | Terms of Service
Most Popular Content on DSC
To not miss this type of content in the future, subscribe to our newsletter.
Other popular resources
Archives: 2008-2014 | 2015-2016 | 2017-2019 | Book 1 | Book 2 | More
Most popular articles
You need to be a member of Data Science Central to add comments!
Join Data Science Central