library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)

1: Data Collection

Import datasets

# Load the life-expectancy table; readr guesses the column types on import.
lifeExp_data <- read_csv(file = "data/Life Expectancy Data.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Country = col_character(),
##   Status = col_character()
## )
## See spec(...) for full column specifications.
# Load the Country -> Continent lookup table (two character columns).
continent_data <- read_csv(file = "data/Countries-Continents.csv")
## Parsed with column specification:
## cols(
##   Continent = col_character(),
##   Country = col_character()
## )

2: Data Preparation

Merge the two data sources & clean NAs

# Attach each country's continent to the life-expectancy rows. A left join
# keeps every row of lifeExp_data, even when no matching continent is found.
# The former `na = c("NA", "N/A")` argument was removed: `na` is a read_csv()
# option, not a left_join() one, so it never cleaned any missing values here
# and can be rejected as an unused argument by newer dplyr releases.
whole_data <- left_join(lifeExp_data, continent_data, by = "Country")

Rename the column GDP to GDP_per_capita

# Relabel the GDP column in place: locate it by name, then overwrite the label.
is_gdp_column <- names(whole_data) == "GDP"
names(whole_data)[is_gdp_column] <- "GDP_per_capita"

# Per-column summary (ranges, quartiles, NA counts) of the merged table.
summary(whole_data)
##    Country               Year         Status          Life expectancy
##  Length:2938        Min.   :2000   Length:2938        Min.   :36.30  
##  Class :character   1st Qu.:2004   Class :character   1st Qu.:63.10  
##  Mode  :character   Median :2008   Mode  :character   Median :72.10  
##                     Mean   :2008                      Mean   :69.22  
##                     3rd Qu.:2012                      3rd Qu.:75.70  
##                     Max.   :2015                      Max.   :89.00  
##                                                       NA's   :10     
##  Adult Mortality infant deaths       Alcohol        percentage expenditure
##  Min.   :  1.0   Min.   :   0.0   Min.   : 0.0100   Min.   :    0.000     
##  1st Qu.: 74.0   1st Qu.:   0.0   1st Qu.: 0.8775   1st Qu.:    4.685     
##  Median :144.0   Median :   3.0   Median : 3.7550   Median :   64.913     
##  Mean   :164.8   Mean   :  30.3   Mean   : 4.6029   Mean   :  738.251     
##  3rd Qu.:228.0   3rd Qu.:  22.0   3rd Qu.: 7.7025   3rd Qu.:  441.534     
##  Max.   :723.0   Max.   :1800.0   Max.   :17.8700   Max.   :19479.912     
##  NA's   :10                       NA's   :194                             
##   Hepatitis B       Measles              BMI        under-five deaths
##  Min.   : 1.00   Min.   :     0.0   Min.   : 1.00   Min.   :   0.00  
##  1st Qu.:77.00   1st Qu.:     0.0   1st Qu.:19.30   1st Qu.:   0.00  
##  Median :92.00   Median :    17.0   Median :43.50   Median :   4.00  
##  Mean   :80.94   Mean   :  2419.6   Mean   :38.32   Mean   :  42.04  
##  3rd Qu.:97.00   3rd Qu.:   360.2   3rd Qu.:56.20   3rd Qu.:  28.00  
##  Max.   :99.00   Max.   :212183.0   Max.   :87.30   Max.   :2500.00  
##  NA's   :553                        NA's   :34                       
##      Polio       Total expenditure   Diphtheria       HIV/AIDS     
##  Min.   : 3.00   Min.   : 0.370    Min.   : 2.00   Min.   : 0.100  
##  1st Qu.:78.00   1st Qu.: 4.260    1st Qu.:78.00   1st Qu.: 0.100  
##  Median :93.00   Median : 5.755    Median :93.00   Median : 0.100  
##  Mean   :82.55   Mean   : 5.938    Mean   :82.32   Mean   : 1.742  
##  3rd Qu.:97.00   3rd Qu.: 7.492    3rd Qu.:97.00   3rd Qu.: 0.800  
##  Max.   :99.00   Max.   :17.600    Max.   :99.00   Max.   :50.600  
##  NA's   :19      NA's   :226       NA's   :19                      
##  GDP_per_capita        Population        thinness  1-19 years
##  Min.   :     1.68   Min.   :3.400e+01   Min.   : 0.10       
##  1st Qu.:   463.94   1st Qu.:1.958e+05   1st Qu.: 1.60       
##  Median :  1766.95   Median :1.387e+06   Median : 3.30       
##  Mean   :  7483.16   Mean   :1.275e+07   Mean   : 4.84       
##  3rd Qu.:  5910.81   3rd Qu.:7.420e+06   3rd Qu.: 7.20       
##  Max.   :119172.74   Max.   :1.294e+09   Max.   :27.70       
##  NA's   :448         NA's   :652         NA's   :34          
##  thinness 5-9 years Income composition of resources   Schooling    
##  Min.   : 0.10      Min.   :0.0000                  Min.   : 0.00  
##  1st Qu.: 1.50      1st Qu.:0.4930                  1st Qu.:10.10  
##  Median : 3.30      Median :0.6770                  Median :12.30  
##  Mean   : 4.87      Mean   :0.6276                  Mean   :11.99  
##  3rd Qu.: 7.20      3rd Qu.:0.7790                  3rd Qu.:14.30  
##  Max.   :28.60      Max.   :0.9480                  Max.   :20.70  
##  NA's   :34         NA's   :167                     NA's   :163    
##   Continent        
##  Length:2938       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# Compact overview: one line per column with its type and first values.
whole_data %>%
  glimpse()
## Observations: 2,938
## Variables: 23
## $ Country                           <chr> "Afghanistan", "Afghanistan", "Afgh…
## $ Year                              <dbl> 2015, 2014, 2013, 2012, 2011, 2010,…
## $ Status                            <chr> "Developing", "Developing", "Develo…
## $ `Life expectancy`                 <dbl> 65.0, 59.9, 59.9, 59.5, 59.2, 58.8,…
## $ `Adult Mortality`                 <dbl> 263, 271, 268, 272, 275, 279, 281, …
## $ `infant deaths`                   <dbl> 62, 64, 66, 69, 71, 74, 77, 80, 82,…
## $ Alcohol                           <dbl> 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,…
## $ `percentage expenditure`          <dbl> 71.279624, 73.523582, 73.219243, 78…
## $ `Hepatitis B`                     <dbl> 65, 62, 64, 67, 68, 66, 63, 64, 63,…
## $ Measles                           <dbl> 1154, 492, 430, 2787, 3013, 1989, 2…
## $ BMI                               <dbl> 19.1, 18.6, 18.1, 17.6, 17.2, 16.7,…
## $ `under-five deaths`               <dbl> 83, 86, 89, 93, 97, 102, 106, 110, …
## $ Polio                             <dbl> 6, 58, 62, 67, 68, 66, 63, 64, 63, …
## $ `Total expenditure`               <dbl> 8.16, 8.18, 8.13, 8.52, 7.87, 9.20,…
## $ Diphtheria                        <dbl> 65, 62, 64, 67, 68, 66, 63, 64, 63,…
## $ `HIV/AIDS`                        <dbl> 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, …
## $ GDP_per_capita                    <dbl> 584.25921, 612.69651, 631.74498, 66…
## $ Population                        <dbl> 33736494, 327582, 31731688, 3696958…
## $ `thinness  1-19 years`            <dbl> 17.2, 17.5, 17.7, 17.9, 18.2, 18.4,…
## $ `thinness 5-9 years`              <dbl> 17.3, 17.5, 17.7, 18.0, 18.2, 18.4,…
## $ `Income composition of resources` <dbl> 0.479, 0.476, 0.470, 0.463, 0.454, …
## $ Schooling                         <dbl> 10.1, 10.0, 9.9, 9.8, 9.5, 9.2, 8.9…
## $ Continent                         <chr> "Asia", "Asia", "Asia", "Asia", "As…

Create a numeric variable: GDP

# Total GDP = GDP per capita times population; NA whenever either input is NA.
# Store the result back into whole_data: the original pipeline only printed
# the mutated tibble, so the new GDP column was discarded straight away.
whole_data <- whole_data %>%
  mutate(GDP = GDP_per_capita * Population)

# Print the updated table so the chunk still shows the result.
whole_data
## # A tibble: 2,938 x 24
##    Country  Year Status `Life expectanc… `Adult Mortalit… `infant deaths`
##    <chr>   <dbl> <chr>             <dbl>            <dbl>           <dbl>
##  1 Afghan…  2015 Devel…             65                263              62
##  2 Afghan…  2014 Devel…             59.9              271              64
##  3 Afghan…  2013 Devel…             59.9              268              66
##  4 Afghan…  2012 Devel…             59.5              272              69
##  5 Afghan…  2011 Devel…             59.2              275              71
##  6 Afghan…  2010 Devel…             58.8              279              74
##  7 Afghan…  2009 Devel…             58.6              281              77
##  8 Afghan…  2008 Devel…             58.1              287              80
##  9 Afghan…  2007 Devel…             57.5              295              82
## 10 Afghan…  2006 Devel…             57.3              295              84
## # … with 2,928 more rows, and 18 more variables: Alcohol <dbl>, `percentage
## #   expenditure` <dbl>, `Hepatitis B` <dbl>, Measles <dbl>, BMI <dbl>,
## #   `under-five deaths` <dbl>, Polio <dbl>, `Total expenditure` <dbl>,
## #   Diphtheria <dbl>, `HIV/AIDS` <dbl>, GDP_per_capita <dbl>, Population <dbl>,
## #   `thinness 1-19 years` <dbl>, `thinness 5-9 years` <dbl>, `Income
## #   composition of resources` <dbl>, Schooling <dbl>, Continent <chr>,
## #   GDP <dbl>

3: Visual Analysis

Histogram for Life Expectancy in All Countries

# Histogram of life expectancy over all rows of whole_data.
# Rows with a missing life expectancy are dropped by ggplot2 with a warning.
whole_data %>%
  ggplot(mapping = aes(x = `Life expectancy`)) +
  geom_histogram() +
  labs(title = "Life Expectancy in All Countries")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 10 rows containing non-finite values (stat_bin).

Histogram for GDP per capita in All Countries

# Histogram of GDP per capita over all rows that have a usable value.
# drop_na() is restricted to GDP_per_capita: called with no columns it removes
# every row that has an NA in *any* column, which discarded many rows with a
# perfectly valid GDP per capita just because an unrelated field was missing.
whole_data %>%
  drop_na(GDP_per_capita) %>%
  # Exclude exact zeros before plotting.
  filter(GDP_per_capita != 0) %>%
  ggplot(aes(x = GDP_per_capita)) +
  geom_histogram() +
  labs(title = "GDP per capita in All Countries")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Scatter Plot between Life Expectancy and GDP per capita in All Countries

# Life expectancy against GDP per capita for all rows, with the default
# smoother drawn on top of the points.
whole_data %>%
  ggplot(mapping = aes(x = GDP_per_capita, y = `Life expectancy`)) +
  geom_point() +
  geom_smooth() +
  labs(title = "Life Expectancy and GDP per capita in All Countries")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 453 rows containing non-finite values (stat_smooth).
## Warning: Removed 453 rows containing missing values (geom_point).

Boxplot for Life Expectancy in Developing and Developed Countries

# Compare the life-expectancy distribution between the Status groups;
# Status drives both the x position and the outline colour of each box.
whole_data %>%
  ggplot(mapping = aes(x = Status, y = `Life expectancy`, color = Status)) +
  geom_boxplot() +
  labs(title = "Boxplot for Life Expectancy in Developing and Developed Countries")
## Warning: Removed 10 rows containing non-finite values (stat_boxplot).

Scatter Plot between Life Expectancy and GDP per capita in Developing Countries

# Keep only the rows whose Status is "Developing".
developing_data <- filter(whole_data, Status == "Developing")

# Life expectancy against GDP per capita with a red linear-regression line.
# guides(color = "none") replaces the logical form guides(color = FALSE),
# which later ggplot2 releases deprecate; the string form behaves the same.
ggplot(developing_data, aes(x = GDP_per_capita, y = `Life expectancy`)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "red") +
  guides(color = "none") +
  labs(title = "GDP per capita and Life Expectancy in Developing Countries")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 389 rows containing non-finite values (stat_smooth).
## Warning: Removed 389 rows containing missing values (geom_point).

Scatter Plot between Life Expectancy and GDP per capita in Developed Countries

# Keep only the rows whose Status is "Developed".
developed_data <- filter(whole_data, Status == "Developed")

# Life expectancy against GDP per capita with a red linear-regression line.
# The plot title previously read "Life Ecpectancy in Develped Countries";
# both typos are corrected. guides(color = "none") replaces the logical form
# guides(color = FALSE), which later ggplot2 releases deprecate.
ggplot(developed_data, aes(x = GDP_per_capita, y = `Life expectancy`)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "red") +
  guides(color = "none") +
  labs(title = "GDP per capita and Life Expectancy in Developed Countries")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 64 rows containing non-finite values (stat_smooth).
## Warning: Removed 64 rows containing missing values (geom_point).