Ch. 4 R Syntax Primer

4.1 Overview of Approach

This tutorial is written as a guide to answering the question, “How do I do X in R?”

4.2 Installing packages

# install packages from CRAN (run once per R installation, not per script) ----
install.packages(c("xlsx", "tidyverse", "devtools"))

4.2.1 Aside: Some of my favorite packages

Install them all in one go with the code below:

# One-shot install of the chapter's favorite packages: data import (rio,
# qualtRics), text analysis (tidytext, textdata, topicmodels), word clouds,
# mixed models (lme4), reporting/plotting helpers, and hashing (digest).
install.packages(c("tidyverse", "devtools", "qualtRics",
                   "tidytext", "textdata", "topicmodels",
                   "wordcloud", "ggwordcloud", "lme4", "sjPlot",
                   "janitor", "esquisse", "rio",
                   "cowplot", "stringi", "digest"))

4.3 Assign values into variables

# Assignment examples. Use `<-` for assignment (tidyverse style); `=` also
# works at the top level but is conventionally reserved for function arguments.
this_is_double <- 1                    # plain numbers are doubles
this_is_double2 <- 1.0                 # identical storage type to `1`
this_is_logical <- TRUE
this_is_date <- as.Date("2020-10-13")  # ISO format parses without a `format=`
this_is_list <- list(x = rnorm(100), y = rnorm(10))        # elements may differ in length
this_is_dataframe <- data.frame(x = rnorm(100), y = rnorm(100))
this_is_tibble <- tibble::tibble(x = rnorm(100), y = rnorm(100))
this_is_vector <- c(1, 2, 3)

4.4 Check underlying types

# class() reports the S3 class of each object created above
class(this_is_double)
## [1] "numeric"
class(this_is_double2)
## [1] "numeric"
class(this_is_logical)
## [1] "logical"
class(this_is_date)
## [1] "Date"
class(this_is_list)
## [1] "list"
class(this_is_dataframe)
## [1] "data.frame"
# a tibble carries three classes, so it still dispatches as a data.frame
class(this_is_tibble)
## [1] "tbl_df"     "tbl"        "data.frame"
class(this_is_vector)
## [1] "numeric"
# generate a sequence ----
seq_1_100 <- seq(1, 100, by = 1)   # 1, 2, ..., 100 (equivalently seq_len(100))
seq_0_1 <- seq(0, 1, by = 0.1)     # 0.0, 0.1, ..., 1.0
seq_0_1
##  [1] 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0
# sample from vector of values -----
# (sampling is random: outputs below will vary between runs unless you
#  call set.seed() first)
list_to_sample <- c("A", "1", "2", "B")
# draw one value; sampling WITHOUT replacement is the default
sampled_value <- sample(list_to_sample, size = 1)
sampled_value
## [1] "A"
# with replace = TRUE the sample may be larger than the source vector
sampled_values_replace <- sample(list_to_sample, size = 10, replace = TRUE)
sampled_values_replace
##  [1] "B" "A" "A" "1" "1" "1" "B" "B" "1" "A"

4.5 File operations

# list files
# NOTE: list.files(pattern=) takes a REGULAR EXPRESSION, not a shell glob.
# The glob "*.csv" is not a valid way to say "ends in .csv"; the correct
# regex is "\\.csv$" (escaped dot, anchored at the end of the name).
all_csv_files <- list.files("data",
                            pattern = "\\.csv$",
                            recursive = TRUE,
                            full.names = TRUE)
all_csv_files
## [1] "data/demo_fileread.csv"         "data/demo_qualtrics_export.csv"
## [3] "data/exp1.csv"                  "data/fake reviews dataset.csv"
# to list every file, simply omit `pattern` (no regex filter needed)
all_files <- list.files("data",
                        recursive = TRUE,
                        full.names = TRUE)
all_files
## [1] "data/demo_fileread.csv"              
## [2] "data/demo_fileread.sas"              
## [3] "data/demo_fileread.sav"              
## [4] "data/demo_fileread.txt"              
## [5] "data/demo_fileread.xlsx"             
## [6] "data/demo_qualtrics_export.csv"      
## [7] "data/exp1.csv"                       
## [8] "data/fake reviews dataset.csv"       
## [9] "data/OneExcelWorkbookWith3Sheet.xlsx"
# check if directory exists
dir.exists("data")
## [1] TRUE
# create directory — guard with dir.exists() so re-running the script does
# not emit a "'test' already exists" warning
if (!dir.exists("test")) {
  dir.create("test")
}

# create file if it doesn't exist
file <- "test/dataframe.R"
if (file.exists(file)) {
  cat("The file already exists")
} else {
  file.create(file)
  print(paste0("File created: ", file))
}
## [1] "File created: test/dataframe.R"
# remove directory and everything inside it
unlink("test", recursive = TRUE)

4.6 Reading and writing files

library(readxl)
library(readr)
library(haven)

# write the demo dataset out in several formats ----
# tibble() lives in the tibble package (attached via tidyverse); use an
# explicit namespace so this chunk works even when only readr/readxl/haven
# are loaded.
df_for_examples <- tibble::tibble(x = rnorm(100), y = rnorm(100),
                                  study = "demo_rnorm_in_r")

write_csv(df_for_examples, "data/demo_fileread.csv")
write_delim(df_for_examples, "data/demo_fileread.txt", delim = "\t")  # tab-delimited
write_sav(df_for_examples, "data/demo_fileread.sav")  # SPSS
write_sas(df_for_examples, "data/demo_fileread.sas")  # SAS

4.7 Export multiple dataframes into a single Excel workbook

library(rio)
# three demo data frames, each tagged with a `dataset` id column
df_1 <- data.frame(x = rnorm(10), dataset = 1)
df_2 <- data.frame(x = rnorm(10), dataset = 2)
df_3 <- data.frame(x = rnorm(10), dataset = 3)

# rio::export() writes a NAMED list of data frames as one xlsx workbook,
# one sheet per list element (sheet names come from the list names)
export(format = "xlsx", file = "OneExcelWorkbookWith3Sheet.xlsx",
       x = list(df1 = df_1, df2 = df_2, df3 = df_3))

4.8 Sanitize column names

## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
# create demo data with deliberately awkward column names, then normalise
# them to snake_case with janitor::clean_names()
# NOTE(review): the `%>%` pipe requires magrittr/dplyr (e.g. via
# library(tidyverse)) to be attached earlier in the session — confirm.
df_janitor <- tibble::tibble(`A long column Name with Mixed case` = rnorm(20),
                             `COLUMN_NAME` = rnorm(20),
                             `GROUP_$$` = rnorm(20)) %>%
  janitor::clean_names()
# load data ----

# plaintext data files
library(readr)

# csv
# NOTE(review): na = "empty" means ONLY the literal string "empty" is read as
# missing; blank cells will NOT become NA (readr's default is na = c("", "NA"))
# — confirm this is intended.
dataset_csv <- read_csv("data/demo_fileread.csv", na = "empty")
## Rows: 100 Columns: 3
## ── Column specification ─────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): study
## dbl (2): x, y
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# tab-delimited file
dataset_tab <- read_delim("data/demo_fileread.txt", delim = "\t", 
                      escape_double = FALSE, 
                      trim_ws = TRUE)
## Rows: 100 Columns: 3
## ── Column specification ─────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): study
## dbl (2): x, y
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Excel file (sheet can be given by name or index)
library(readxl)
dataset_excel <- read_excel("data/demo_fileread.xlsx", sheet = "Sheet1")

# read SPSS and SAS data files
library(haven)
dataset_sav <- read_sav("data/demo_fileread.sav")
# dataset_sas <- read_sas("data/demo_fileread.sas")  # use read_sas() for SAS files; read_sav() is for SPSS
# dataset operations -----

# stack two or more datasets (row-bind; columns are matched by name)
df1 <- data.frame(x = rnorm(10), dataset = 1)
df2 <- data.frame(x = rnorm(10), dataset = 2)
df3 <- data.frame(x = rnorm(10), dataset = 3)
df_all <- bind_rows(df1, df2, df3)

# join datasets by common identifier; name the key explicitly with `by=`
# rather than relying on dplyr's "Joining, by = ..." column guessing
df_link <- data.frame(dataset = c(1, 2, 3, 4),
                      description = c("mock1", "mock2", "mock3", "mock4"))
df_all_link_innerjoin <- df_all %>% inner_join(df_link, by = "dataset")
# full_join also keeps unmatched rows: one extra row here for dataset 4
df_all_link_fulljoin <- df_all %>% full_join(df_link, by = "dataset")

4.9 Binning data

n <- 50
# n draws centred on 500 (sd defaults to 1, so values stay near 500)
data_to_bin <- data.frame(x = rnorm(n = n, mean = 500))
# cut() returns a factor of interval labels such as "(400,500]"
data_binned_label <- data_to_bin %>%
  mutate(x_bin = cut(x, breaks = c(0, 300, 400, 500, 600, Inf)))
# labels = FALSE returns the integer bin index instead of a label
data_binned_num <- data_to_bin %>%
  mutate(x_bin = cut(x, breaks = c(0, 300, 600, 800), labels = FALSE))
# ordered_result = TRUE makes the factor ordered (useful for ordinal models)
data_binned_ord <- data_to_bin %>%
  mutate(x_bin = cut(x, ordered_result = TRUE,
                     breaks = c(0, 300, 400, 500, 600, Inf)))

knitr::kable(head(data_binned_label))
x x_bin
499.1370 (400,500]
499.5823 (400,500]
500.5440 (500,600]
500.2241 (500,600]
499.3293 (400,500]
497.1275 (400,500]
knitr::kable(head(data_binned_num))
x x_bin
499.1370 2
499.5823 2
500.5440 2
500.2241 2
499.3293 2
497.1275 2

4.10 String manipulations

## Search for data in data
# grep() returns the INDICES of elements matching the pattern
grep("APP", c("APPLES", "APP", "APPLICATION"))
## [1] 1 2 3
# value = TRUE returns the matching strings themselves (write TRUE, not T)
grep("APP", c("APPLES", "APP", "APPLICATION"), value = TRUE)
## [1] "APPLES"      "APP"         "APPLICATION"
## Replace data in data
gsub("APPLES", "BANANAS", c("APPLES", "BANANAS"))
## [1] "BANANAS" "BANANAS"
# gsub() replaces the match wherever it occurs: "ERRROR" contains "ERR" once,
# so the substitution yields "ERROR" + the leftover "ROR" = "ERRORROR"
gsub("ERR", "ERROR", c("ERR", "ERRROR"))
## [1] "ERROR"    "ERRORROR"
# stringi: fast, ICU-based string utilities ----
library(stringi)

# count occurrences of a fixed substring; overlap = TRUE counts overlapping hits
stri_count_fixed("ACATGAACGGGTACACACTG", "ACA", overlap=TRUE)
## [1] 3
x <- c("spam", "")
# number of characters in each element (an empty string has length 0)
stri_length(x)
## [1] 4 0
# repeat each letter 1..5 times, element-wise
stri_dup(letters[1:5], 1:5)
## [1] "a"     "bb"    "ccc"   "dddd"  "eeeee"
words <- list(c("spam", "bacon", "sausage", "spam"), c("eggs", "spam"))
# collapse each character vector in the list into a single string
stri_join_list(words, sep=", ")
## [1] "spam, bacon, sausage, spam" "eggs, spam"

4.11 Generate Random data

x = rnorm(n=500,mean=300,sd=3)

4.12 Simple stats

# arithmetic mean of the sample
mean(x)
## [1] 299.932
# sample standard deviation
sd(x)
## [1] 3.084418
min(x)
## [1] 287.8004
max(x)
## [1] 308.4349
# empirical quantiles: value below which the given fraction of the data falls
quantile(x,c(0.6))
##      60% 
## 300.6428
quantile(x,c(0.95))
##      95% 
## 305.0184
quantile(x,c(0.99))
##      99% 
## 307.3388

4.13 Data quality reports

## Data quality inspection
knitr::kable(skimr::skim(x)) # one variable 
skim_type skim_variable n_missing complete_rate numeric.mean numeric.sd numeric.p0 numeric.p25 numeric.p50 numeric.p75 numeric.p100 numeric.hist
numeric data 0 1 299.932 3.084418 287.8004 297.8895 299.8438 302.0092 308.4349 ▁▂▇▆▂
knitr::kable(skimr::skim(dataset_csv)) # or a whole dataset!
skim_type skim_variable n_missing complete_rate character.min character.max character.empty character.n_unique character.whitespace numeric.mean numeric.sd numeric.p0 numeric.p25 numeric.p50 numeric.p75 numeric.p100 numeric.hist
character study 0 1 15 15 0 1 0 NA NA NA NA NA NA NA NA
numeric x 0 1 NA NA NA NA NA -0.0046568 0.9139402 -1.928314 -0.6578389 -0.0151540 0.5899338 3.078970 ▃▇▇▂▁
numeric y 0 1 NA NA NA NA NA 0.0841022 0.9242699 -1.819545 -0.4945237 0.0930338 0.6743415 2.496702 ▂▆▇▃▁
# save the data-quality report to a variable so it can later be exported as csv
dqr <- skimr::skim(dataset_csv)
knitr::kable(dqr)
skim_type skim_variable n_missing complete_rate character.min character.max character.empty character.n_unique character.whitespace numeric.mean numeric.sd numeric.p0 numeric.p25 numeric.p50 numeric.p75 numeric.p100 numeric.hist
character study 0 1 15 15 0 1 0 NA NA NA NA NA NA NA NA
numeric x 0 1 NA NA NA NA NA -0.0046568 0.9139402 -1.928314 -0.6578389 -0.0151540 0.5899338 3.078970 ▃▇▇▂▁
numeric y 0 1 NA NA NA NA NA 0.0841022 0.9242699 -1.819545 -0.4945237 0.0930338 0.6743415 2.496702 ▂▆▇▃▁

4.14 Data integrity

MD5 and SHA256 hashes of data are helpful when making snapshots of data for long-term storage and curation. Recording a hash ensures that the recipient can recompute and verify it upon opening the dataset, confirming that no data was lost or corrupted in transmission.

library(digest)
# MD5: 128-bit hash — fine for integrity checks, not for security purposes
digest("a", algo="md5")
## [1] "127a2ec00989b9f7faf671ed470be7f8"
# SHA-256: 256-bit, collision-resistant hash
digest("a", algo="sha256")
## [1] "fb1a678ef965ad4a66c712d2161f20319091cb4e7611e1925df671018c833f72"