5 Basic data manipulation

Learning objectives

  • Learn how to read/write data to/from files with different formats (.tsv, .csv)
  • Familiarize with basic operations of data frames
  • Index and subset data frames using base R functions
  • Manipulate specific data frame columns
  • Joining by columns and rows

For this section we will use the package gapminder that we installed earlier.

library(gapminder)
dim(gapminder)
## [1] 1704    6

View the data frame

View(gapminder)
summary(gapminder$lifeExp)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   23.60   48.20   60.71   59.47   70.85   82.60

5.1 Reading/writing data

5.1.1 Text files

Writing tables to a file using write.table()

aust <- gapminder[gapminder$country == "Australia",]
write.table(aust,
            file="../data/gapminder_australia.csv",
            sep=",")
write.table(aust,
            file="../data/gapminder_australia.csv",
            sep=",",
            quote=FALSE, 
            row.names=FALSE)
write.table(aust,
            file="../data/gapminder_australia.tsv",
            sep="\t",
            quote=FALSE, 
            row.names=FALSE)

Other functions to write to a file

africa<-gapminder[gapminder$continent=="Africa",]
write.csv(gapminder[gapminder$continent=="Africa",],
          file = "../data/gapminder_africa.csv",
          row.names = FALSE)
class(africa$continent)
## [1] "factor"

Reading data from a file

africa<-read.csv("../data/gapminder_africa.csv",sep = ",",header = T)
class(africa$continent)
## [1] "character"
head(africa)
##   country continent year lifeExp      pop gdpPercap
## 1 Algeria    Africa 1952  43.077  9279525  2449.008
## 2 Algeria    Africa 1957  45.685 10270856  3013.976
## 3 Algeria    Africa 1962  48.303 11000948  2550.817
## 4 Algeria    Africa 1967  51.407 12760499  3246.992
## 5 Algeria    Africa 1972  54.518 14760787  4182.664
## 6 Algeria    Africa 1977  58.014 17152804  4910.417
africa<-read.table("../data/gapminder_africa.csv",sep = ",",header = T,stringsAsFactors = T)
class(africa$continent)
## [1] "factor"

5.1.2 R objects

Using .RDS files

saveRDS(africa,file = "../objects/africa.RDS")
africa<-readRDS(file = "../objects/africa.RDS")

Using .RData files

americas<-gapminder[gapminder$continent=="Americas",]
save(africa,americas,file = "../objects/continents.RData")
load(file = "../objects/continents.RData",verbose = T)
## Loading objects:
##   africa
##   americas

5.2 Exploring data frames

5.2.1 Adding columns and rows

Individually adding columns

mean_children <- sample(1:10,nrow(aust),replace = TRUE)
aust$mean_children <- mean_children

#head(aust)
aust$GDP <- aust$pop * aust$gdpPercap

head(aust)
## # A tibble: 6 × 8
##   country   continent  year lifeExp      pop gdpPercap mean_children           GDP
##   <fct>     <fct>     <int>   <dbl>    <int>     <dbl>         <int>         <dbl>
## 1 Australia Oceania    1952    69.1  8691212    10040.             8  87256254102.
## 2 Australia Oceania    1957    70.3  9712569    10950.            10 106349227169.
## 3 Australia Oceania    1962    70.9 10794968    12217.             8 131884573002.
## 4 Australia Oceania    1967    71.1 11872264    14526.             6 172457986742.
## 5 Australia Oceania    1972    71.9 13177000    16789.             9 221223770658.
## 6 Australia Oceania    1977    73.5 14074100    18334.             6 258037329175.
mean_bikes <- sample(1:4,nrow(aust),replace = TRUE) # Check what happens if they don't have the same number of rows
aust[,"mean_bikes"]<-mean_bikes
head(aust)
## # A tibble: 6 × 9
##   country   continent  year lifeExp      pop gdpPercap mean_children           GDP mean_bikes
##   <fct>     <fct>     <int>   <dbl>    <int>     <dbl>         <int>         <dbl>      <int>
## 1 Australia Oceania    1952    69.1  8691212    10040.             8  87256254102.          1
## 2 Australia Oceania    1957    70.3  9712569    10950.            10 106349227169.          3
## 3 Australia Oceania    1962    70.9 10794968    12217.             8 131884573002.          4
## 4 Australia Oceania    1967    71.1 11872264    14526.             6 172457986742.          2
## 5 Australia Oceania    1972    71.9 13177000    16789.             9 221223770658.          3
## 6 Australia Oceania    1977    73.5 14074100    18334.             6 258037329175.          4

Combining data frames

aust <- gapminder[gapminder$country=="Australia",]
df <- data.frame(mean_children=sample(1:10,nrow(aust),replace = TRUE),
               mean_bikes=sample(1:4,nrow(aust),replace = TRUE))
head(df)
##   mean_children mean_bikes
## 1             1          3
## 2             6          2
## 3             7          1
## 4             9          1
## 5             8          1
## 6            10          1
aust <- cbind(aust,df)
head(aust)
##     country continent year lifeExp      pop gdpPercap mean_children mean_bikes
## 1 Australia   Oceania 1952   69.12  8691212  10039.60             1          3
## 2 Australia   Oceania 1957   70.33  9712569  10949.65             6          2
## 3 Australia   Oceania 1962   70.93 10794968  12217.23             7          1
## 4 Australia   Oceania 1967   71.10 11872264  14526.12             9          1
## 5 Australia   Oceania 1972   71.93 13177000  16788.63             8          1
## 6 Australia   Oceania 1977   73.49 14074100  18334.20            10          1

Individually adding rows

new_row<-list("country" = "Australia",
               "continent" = "Oceania",
               "year" = 2022,
               "lifeExp" = mean(aust$lifeExp),
               "pop" = mean(aust$pop),
               "gdpPercap" = mean(aust$gdpPercap),
               "mean_children" = floor(mean(aust$mean_children)),
               "mean_bikes" = floor(mean(aust$mean_children))) # Why did I create it as list? 
new_row
## $country
## [1] "Australia"
## 
## $continent
## [1] "Oceania"
## 
## $year
## [1] 2022
## 
## $lifeExp
## [1] 74.66292
## 
## $pop
## [1] 14649313
## 
## $gdpPercap
## [1] 19980.6
## 
## $mean_children
## [1] 5
## 
## $mean_bikes
## [1] 5
aust<-rbind(aust,new_row)
tail(aust)
##      country continent year  lifeExp      pop gdpPercap mean_children mean_bikes
## 8  Australia   Oceania 1987 76.32000 16257249  21888.89             9          2
## 9  Australia   Oceania 1992 77.56000 17481977  23424.77             6          1
## 10 Australia   Oceania 1997 78.83000 18565243  26997.94             3          4
## 11 Australia   Oceania 2002 80.37000 19546792  30687.75             3          3
## 12 Australia   Oceania 2007 81.23500 20434176  34435.37             2          1
## 13 Australia   Oceania 2022 74.66292 14649313  19980.60             5          5

Combining data frames by rows

dim(aust)
## [1] 13  8
aust_double<-rbind(aust,aust)
dim(aust_double)
## [1] 26  8

5.2.2 Removing columns and rows

aust<-aust[,-ncol(aust)]# remove the last column
head(aust)
##     country continent year lifeExp      pop gdpPercap mean_children
## 1 Australia   Oceania 1952   69.12  8691212  10039.60             1
## 2 Australia   Oceania 1957   70.33  9712569  10949.65             6
## 3 Australia   Oceania 1962   70.93 10794968  12217.23             7
## 4 Australia   Oceania 1967   71.10 11872264  14526.12             9
## 5 Australia   Oceania 1972   71.93 13177000  16788.63             8
## 6 Australia   Oceania 1977   73.49 14074100  18334.20            10
aust<-aust[,colnames(aust)!="mean_children"]# remove column by name
head(aust)
##     country continent year lifeExp      pop gdpPercap
## 1 Australia   Oceania 1952   69.12  8691212  10039.60
## 2 Australia   Oceania 1957   70.33  9712569  10949.65
## 3 Australia   Oceania 1962   70.93 10794968  12217.23
## 4 Australia   Oceania 1967   71.10 11872264  14526.12
## 5 Australia   Oceania 1972   71.93 13177000  16788.63
## 6 Australia   Oceania 1977   73.49 14074100  18334.20
dim(aust[-1,]) # Remove the first row
## [1] 12  6
dim(aust[-1*1:10,]) # Remove the first 10 rows
## [1] 3 6

5.2.3 Applying filters

aust[aust$lifeExp>=70,] 
##      country continent year  lifeExp      pop gdpPercap
## 2  Australia   Oceania 1957 70.33000  9712569  10949.65
## 3  Australia   Oceania 1962 70.93000 10794968  12217.23
## 4  Australia   Oceania 1967 71.10000 11872264  14526.12
## 5  Australia   Oceania 1972 71.93000 13177000  16788.63
## 6  Australia   Oceania 1977 73.49000 14074100  18334.20
## 7  Australia   Oceania 1982 74.74000 15184200  19477.01
## 8  Australia   Oceania 1987 76.32000 16257249  21888.89
## 9  Australia   Oceania 1992 77.56000 17481977  23424.77
## 10 Australia   Oceania 1997 78.83000 18565243  26997.94
## 11 Australia   Oceania 2002 80.37000 19546792  30687.75
## 12 Australia   Oceania 2007 81.23500 20434176  34435.37
## 13 Australia   Oceania 2022 74.66292 14649313  19980.60
aust[aust$gdpPercap>=mean(aust$gdpPercap),] 
##      country continent year  lifeExp      pop gdpPercap
## 8  Australia   Oceania 1987 76.32000 16257249  21888.89
## 9  Australia   Oceania 1992 77.56000 17481977  23424.77
## 10 Australia   Oceania 1997 78.83000 18565243  26997.94
## 11 Australia   Oceania 2002 80.37000 19546792  30687.75
## 12 Australia   Oceania 2007 81.23500 20434176  34435.37
## 13 Australia   Oceania 2022 74.66292 14649313  19980.60

How to get unique entries/remove duplicates

unique(aust_double)
##      country continent year  lifeExp      pop gdpPercap mean_children mean_bikes
## 1  Australia   Oceania 1952 69.12000  8691212  10039.60             1          3
## 2  Australia   Oceania 1957 70.33000  9712569  10949.65             6          2
## 3  Australia   Oceania 1962 70.93000 10794968  12217.23             7          1
## 4  Australia   Oceania 1967 71.10000 11872264  14526.12             9          1
## 5  Australia   Oceania 1972 71.93000 13177000  16788.63             8          1
## 6  Australia   Oceania 1977 73.49000 14074100  18334.20            10          1
## 7  Australia   Oceania 1982 74.74000 15184200  19477.01             2          1
## 8  Australia   Oceania 1987 76.32000 16257249  21888.89             9          2
## 9  Australia   Oceania 1992 77.56000 17481977  23424.77             6          1
## 10 Australia   Oceania 1997 78.83000 18565243  26997.94             3          4
## 11 Australia   Oceania 2002 80.37000 19546792  30687.75             3          3
## 12 Australia   Oceania 2007 81.23500 20434176  34435.37             2          1
## 13 Australia   Oceania 2022 74.66292 14649313  19980.60             5          5

To remove empty rows

# First lets add an empty row
na.list<-rep(NA,ncol(aust))
aust<-rbind(aust,na.list)
tail(aust)
##      country continent year  lifeExp      pop gdpPercap
## 9  Australia   Oceania 1992 77.56000 17481977  23424.77
## 10 Australia   Oceania 1997 78.83000 18565243  26997.94
## 11 Australia   Oceania 2002 80.37000 19546792  30687.75
## 12 Australia   Oceania 2007 81.23500 20434176  34435.37
## 13 Australia   Oceania 2022 74.66292 14649313  19980.60
## 14      <NA>      <NA>   NA       NA       NA        NA
aust<-aust[!is.na(aust$country),]
tail(aust)
##      country continent year  lifeExp      pop gdpPercap
## 8  Australia   Oceania 1987 76.32000 16257249  21888.89
## 9  Australia   Oceania 1992 77.56000 17481977  23424.77
## 10 Australia   Oceania 1997 78.83000 18565243  26997.94
## 11 Australia   Oceania 2002 80.37000 19546792  30687.75
## 12 Australia   Oceania 2007 81.23500 20434176  34435.37
## 13 Australia   Oceania 2022 74.66292 14649313  19980.60

5.2.4 Editing specific elements

aust[1,"lifeExp"]<-aust[1,"lifeExp"]+1 

5.3 Hands-on: basic data manipulation

  1. Write a data processing snippet to include only the data points collected after 1995 in Asian countries as a CSV file.
  2. Separate the gapminder data frame into 5 individual data frames, one for each continent. Store those 5 data frames as an RData file called continents.RData in the objects folder.
  3. Finish exploring the gapminder data frame and:
  • Find the number of rows and the number of columns
  • Print the data type of each column
  • Explain the meaning of everything that str(gapminder) prints
  1. In which years has the GDP of Canada been larger than the average of all data points recorded for Canada?
  2. Find the mean life expectancy of Switzerland before and after 2000
  3. You discovered that all the entries from 2007 are actually from 2008. Create a copy of the full gapminder data frame in an object called gp. Then change the year column to correct the entries from 2007.
  4. Bonus - Find the mean life expectancy and mean gdp per continent using the function tapply