library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
# Load the life-expectancy records; column types are guessed by readr.
lifeExp_data <- readr::read_csv(file = "data/Life Expectancy Data.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## Country = col_character(),
## Status = col_character()
## )
## See spec(...) for full column specifications.
# Load the country-to-continent lookup table (columns: Continent, Country).
continent_data <- readr::read_csv(file = "data/Countries-Continents.csv")
## Parsed with column specification:
## cols(
## Continent = col_character(),
## Country = col_character()
## )
# Attach each country's continent to the life-expectancy records. A left join
# keeps every life-expectancy row, leaving Continent as NA when no match exists.
# The former `na = c("NA", "N/A")` argument was removed: it is a read_csv()
# option, not a join option. It had no effect on the join and newer dplyr
# releases raise an error on unknown arguments passed through `...`.
whole_data <- left_join(lifeExp_data, continent_data, by = "Country")
# Rename the GDP column to GDP_per_capita.
# Relabel the "GDP" column as "GDP_per_capita" (no-op if the column is absent),
# then print per-column summary statistics.
names(whole_data) <- replace(
  names(whole_data),
  names(whole_data) == "GDP",
  "GDP_per_capita"
)
whole_data %>%
  summary()
## Country Year Status Life expectancy
## Length:2938 Min. :2000 Length:2938 Min. :36.30
## Class :character 1st Qu.:2004 Class :character 1st Qu.:63.10
## Mode :character Median :2008 Mode :character Median :72.10
## Mean :2008 Mean :69.22
## 3rd Qu.:2012 3rd Qu.:75.70
## Max. :2015 Max. :89.00
## NA's :10
## Adult Mortality infant deaths Alcohol percentage expenditure
## Min. : 1.0 Min. : 0.0 Min. : 0.0100 Min. : 0.000
## 1st Qu.: 74.0 1st Qu.: 0.0 1st Qu.: 0.8775 1st Qu.: 4.685
## Median :144.0 Median : 3.0 Median : 3.7550 Median : 64.913
## Mean :164.8 Mean : 30.3 Mean : 4.6029 Mean : 738.251
## 3rd Qu.:228.0 3rd Qu.: 22.0 3rd Qu.: 7.7025 3rd Qu.: 441.534
## Max. :723.0 Max. :1800.0 Max. :17.8700 Max. :19479.912
## NA's :10 NA's :194
## Hepatitis B Measles BMI under-five deaths
## Min. : 1.00 Min. : 0.0 Min. : 1.00 Min. : 0.00
## 1st Qu.:77.00 1st Qu.: 0.0 1st Qu.:19.30 1st Qu.: 0.00
## Median :92.00 Median : 17.0 Median :43.50 Median : 4.00
## Mean :80.94 Mean : 2419.6 Mean :38.32 Mean : 42.04
## 3rd Qu.:97.00 3rd Qu.: 360.2 3rd Qu.:56.20 3rd Qu.: 28.00
## Max. :99.00 Max. :212183.0 Max. :87.30 Max. :2500.00
## NA's :553 NA's :34
## Polio Total expenditure Diphtheria HIV/AIDS
## Min. : 3.00 Min. : 0.370 Min. : 2.00 Min. : 0.100
## 1st Qu.:78.00 1st Qu.: 4.260 1st Qu.:78.00 1st Qu.: 0.100
## Median :93.00 Median : 5.755 Median :93.00 Median : 0.100
## Mean :82.55 Mean : 5.938 Mean :82.32 Mean : 1.742
## 3rd Qu.:97.00 3rd Qu.: 7.492 3rd Qu.:97.00 3rd Qu.: 0.800
## Max. :99.00 Max. :17.600 Max. :99.00 Max. :50.600
## NA's :19 NA's :226 NA's :19
## GDP_per_capita Population thinness 1-19 years
## Min. : 1.68 Min. :3.400e+01 Min. : 0.10
## 1st Qu.: 463.94 1st Qu.:1.958e+05 1st Qu.: 1.60
## Median : 1766.95 Median :1.387e+06 Median : 3.30
## Mean : 7483.16 Mean :1.275e+07 Mean : 4.84
## 3rd Qu.: 5910.81 3rd Qu.:7.420e+06 3rd Qu.: 7.20
## Max. :119172.74 Max. :1.294e+09 Max. :27.70
## NA's :448 NA's :652 NA's :34
## thinness 5-9 years Income composition of resources Schooling
## Min. : 0.10 Min. :0.0000 Min. : 0.00
## 1st Qu.: 1.50 1st Qu.:0.4930 1st Qu.:10.10
## Median : 3.30 Median :0.6770 Median :12.30
## Mean : 4.87 Mean :0.6276 Mean :11.99
## 3rd Qu.: 7.20 3rd Qu.:0.7790 3rd Qu.:14.30
## Max. :28.60 Max. :0.9480 Max. :20.70
## NA's :34 NA's :167 NA's :163
## Continent
## Length:2938
## Class :character
## Mode :character
##
##
##
##
# Compact column-by-column preview of the joined data (name, type, first values).
glimpse(x = whole_data)
## Observations: 2,938
## Variables: 23
## $ Country <chr> "Afghanistan", "Afghanistan", "Afgh…
## $ Year <dbl> 2015, 2014, 2013, 2012, 2011, 2010,…
## $ Status <chr> "Developing", "Developing", "Develo…
## $ `Life expectancy` <dbl> 65.0, 59.9, 59.9, 59.5, 59.2, 58.8,…
## $ `Adult Mortality` <dbl> 263, 271, 268, 272, 275, 279, 281, …
## $ `infant deaths` <dbl> 62, 64, 66, 69, 71, 74, 77, 80, 82,…
## $ Alcohol <dbl> 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,…
## $ `percentage expenditure` <dbl> 71.279624, 73.523582, 73.219243, 78…
## $ `Hepatitis B` <dbl> 65, 62, 64, 67, 68, 66, 63, 64, 63,…
## $ Measles <dbl> 1154, 492, 430, 2787, 3013, 1989, 2…
## $ BMI <dbl> 19.1, 18.6, 18.1, 17.6, 17.2, 16.7,…
## $ `under-five deaths` <dbl> 83, 86, 89, 93, 97, 102, 106, 110, …
## $ Polio <dbl> 6, 58, 62, 67, 68, 66, 63, 64, 63, …
## $ `Total expenditure` <dbl> 8.16, 8.18, 8.13, 8.52, 7.87, 9.20,…
## $ Diphtheria <dbl> 65, 62, 64, 67, 68, 66, 63, 64, 63,…
## $ `HIV/AIDS` <dbl> 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, …
## $ GDP_per_capita <dbl> 584.25921, 612.69651, 631.74498, 66…
## $ Population <dbl> 33736494, 327582, 31731688, 3696958…
## $ `thinness 1-19 years` <dbl> 17.2, 17.5, 17.7, 17.9, 18.2, 18.4,…
## $ `thinness 5-9 years` <dbl> 17.3, 17.5, 17.7, 18.0, 18.2, 18.4,…
## $ `Income composition of resources` <dbl> 0.479, 0.476, 0.470, 0.463, 0.454, …
## $ Schooling <dbl> 10.1, 10.0, 9.9, 9.8, 9.5, 9.2, 8.9…
## $ Continent <chr> "Asia", "Asia", "Asia", "Asia", "As…
# Derive total GDP as GDP per capita times population. The result is assigned
# back to `whole_data` so the new column is kept rather than discarded after
# printing; GDP is NA wherever either input is NA.
whole_data <- whole_data %>%
  mutate(GDP = GDP_per_capita * Population)

# Print the augmented tibble.
whole_data
## # A tibble: 2,938 x 24
## Country Year Status `Life expectanc… `Adult Mortalit… `infant deaths`
## <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 Afghan… 2015 Devel… 65 263 62
## 2 Afghan… 2014 Devel… 59.9 271 64
## 3 Afghan… 2013 Devel… 59.9 268 66
## 4 Afghan… 2012 Devel… 59.5 272 69
## 5 Afghan… 2011 Devel… 59.2 275 71
## 6 Afghan… 2010 Devel… 58.8 279 74
## 7 Afghan… 2009 Devel… 58.6 281 77
## 8 Afghan… 2008 Devel… 58.1 287 80
## 9 Afghan… 2007 Devel… 57.5 295 82
## 10 Afghan… 2006 Devel… 57.3 295 84
## # … with 2,928 more rows, and 18 more variables: Alcohol <dbl>, `percentage
## # expenditure` <dbl>, `Hepatitis B` <dbl>, Measles <dbl>, BMI <dbl>,
## # `under-five deaths` <dbl>, Polio <dbl>, `Total expenditure` <dbl>,
## # Diphtheria <dbl>, `HIV/AIDS` <dbl>, GDP_per_capita <dbl>, Population <dbl>,
## # `thinness 1-19 years` <dbl>, `thinness 5-9 years` <dbl>, `Income
## # composition of resources` <dbl>, Schooling <dbl>, Continent <chr>,
## # GDP <dbl>
# Distribution of life expectancy across all country-year records.
whole_data %>%
  ggplot(mapping = aes(x = `Life expectancy`)) +
  geom_histogram() +
  labs(title = "Life Expectancy in All Countries")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 10 rows containing non-finite values (stat_bin).
# Distribution of GDP per capita across all country-year records.
# Only rows missing GDP_per_capita are dropped: a bare drop_na() would discard
# every row with an NA in *any* column, removing many valid GDP observations
# from a plot that claims to cover all countries.
whole_data %>%
  drop_na(GDP_per_capita) %>%
  filter(GDP_per_capita != 0) %>%
  ggplot(aes(x = GDP_per_capita)) +
  geom_histogram() +
  labs(title = "GDP per capita in All Countries")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Life expectancy against GDP per capita for all records, with ggplot2's
# default smoother overlaid on the points.
whole_data %>%
  ggplot(mapping = aes(x = GDP_per_capita, y = `Life expectancy`)) +
  geom_point() +
  geom_smooth() +
  labs(title = "Life Expectancy and GDP per capita in All Countries")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 453 rows containing non-finite values (stat_smooth).
## Warning: Removed 453 rows containing missing values (geom_point).
# Compare life expectancy between the Status groups, one coloured box each.
whole_data %>%
  ggplot(mapping = aes(x = Status, y = `Life expectancy`, color = Status)) +
  geom_boxplot() +
  labs(
    title = "Boxplot for Life Expectancy in Developing and Developed Countries"
  )
## Warning: Removed 10 rows containing non-finite values (stat_boxplot).
# Subset to developing countries and plot life expectancy against GDP per
# capita with a red linear-regression fit.
developing_data <- filter(whole_data, Status == "Developing")

ggplot(developing_data, aes(x = GDP_per_capita, y = `Life expectancy`)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "red") +
  # "none" replaces the deprecated `FALSE` for suppressing a guide.
  guides(color = "none") +
  labs(title = "GDP per capita and Life Expectancy in Developing Countries")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 389 rows containing non-finite values (stat_smooth).
## Warning: Removed 389 rows containing missing values (geom_point).
# Subset to developed countries and plot life expectancy against GDP per
# capita with a red linear-regression fit.
developed_data <- filter(whole_data, Status == "Developed")

ggplot(developed_data, aes(x = GDP_per_capita, y = `Life expectancy`)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "red") +
  # "none" replaces the deprecated `FALSE` for suppressing a guide.
  guides(color = "none") +
  # Title typos fixed ("Ecpectancy" -> "Expectancy", "Develped" -> "Developed").
  labs(title = "GDP per capita and Life Expectancy in Developed Countries")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 64 rows containing non-finite values (stat_smooth).
## Warning: Removed 64 rows containing missing values (geom_point).