Ch. 4 R Primer (Syntax and Basic Functions)

4.1 Overview of Approach

This tutorial is written as a guide to the question, "How do I do X in R?"

4.2 Installing packages

# install packages from CRAN ----
install.packages(c("tidyverse", "devtools"))

4.2.1 Aside: Some of my favorite packages

Install them all in one go with the code below:

install.packages(c("tidyverse", "devtools", "qualtRics",
                   "tidytext", "textdata", "topicmodels",
                   "wordcloud", "ggwordcloud", "lme4", "sjPlot",
                   "janitor", "esquisse", "rio",
                   "cowplot", "stringi", "digest"))

4.3 Assign values to variables

this_is_double <- 1
this_is_double2 = 1.0   # `=` also assigns, but `<-` is the conventional R style
this_is_logical <- TRUE
this_is_date <- as.Date("2020-10-13")
this_is_list <- list(x = rnorm(100), y = rnorm(10))
this_is_dataframe <- data.frame(x = rnorm(100), y = rnorm(100))
this_is_tibble <- tibble::tibble(x = rnorm(100), y = rnorm(100))
this_is_vector <- c(1,2,3)

4.4 Check underlying types

class(this_is_double)
## [1] "numeric"
class(this_is_double2)
## [1] "numeric"
class(this_is_logical)
## [1] "logical"
class(this_is_date)
## [1] "Date"
class(this_is_list)
## [1] "list"
class(this_is_dataframe)
## [1] "data.frame"
class(this_is_tibble)
## [1] "tbl_df"     "tbl"        "data.frame"
class(this_is_vector)
## [1] "numeric"
# generate a sequence ----
seq_1_100 <- seq(1, 100, by=1)
seq_0_1 <- seq(0, 1, by=0.1)
seq_0_1
##  [1] 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0
# sample from vector of values -----
list_to_sample <- c("A", "1", "2", "B")
sampled_value = sample(list_to_sample, size=1)
sampled_value
## [1] "A"
sampled_values_replace = sample(list_to_sample, size=10, replace=TRUE)
sampled_values_replace
##  [1] "B" "A" "A" "1" "1" "1" "B" "B" "1" "A"

4.5 File operations

# list files (pattern is a regular expression)
all_csv_files = list.files("data",
                           pattern = "\\.csv$",
                           recursive = TRUE,
                           full.names = TRUE)
all_csv_files
## [1] "data/demo_fileread.csv"         "data/demo_qualtrics_export.csv"
## [3] "data/exp1.csv"                  "data/fake reviews dataset.csv"
all_files = list.files("data",
                       recursive = TRUE,
                       full.names = TRUE)   # no `pattern`, so every file is listed
all_files
## [1] "data/demo_fileread.csv"              
## [2] "data/demo_fileread.sas"              
## [3] "data/demo_fileread.sav"              
## [4] "data/demo_fileread.txt"              
## [5] "data/demo_fileread.xlsx"             
## [6] "data/demo_qualtrics_export.csv"      
## [7] "data/exp1.csv"                       
## [8] "data/fake reviews dataset.csv"       
## [9] "data/OneExcelWorkbookWith3Sheet.xlsx"
# check if directory exists
dir.exists("data")
## [1] TRUE
# create directory
dir.create("test")

# create file if it doesn't exist
file <- "test/dataframe.R"
if (file.exists(file)) {
  cat("The file already exists")
} else {
  file.create(file)
  print(paste0("File created: ",file))
}
## [1] "File created: test/dataframe.R"
# remove directory
unlink("test", recursive=TRUE)

4.6 Reading and writing files

library(readxl)
library(readr)
library(haven)

# write demo files in several formats
df_for_examples = tibble::tibble(x = rnorm(100), y = rnorm(100), study="demo_rnorm_in_r")

write_csv(df_for_examples, "data/demo_fileread.csv")
write_delim(df_for_examples,"data/demo_fileread.txt",delim="\t")
write_sav(df_for_examples, "data/demo_fileread.sav")
write_sas(df_for_examples, "data/demo_fileread.sas")

4.7 Export multiple dataframes into a single Excel workbook

library(rio)
df_1 = data.frame(x= rnorm(10), dataset=1)
df_2 = data.frame(x= rnorm(10), dataset=2)
df_3 = data.frame(x= rnorm(10), dataset=3)

export(x=list(df1=df_1, df2=df_2, df3=df_3), file="data/OneExcelWorkbookWith3Sheet.xlsx", format="xlsx")
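To read those sheets back in, rio::import_list() returns one data frame per sheet as a named list; a minimal sketch, assuming the workbook written above:

# read every sheet of the workbook into a list of data frames
sheets <- rio::import_list("data/OneExcelWorkbookWith3Sheet.xlsx")
names(sheets)     # sheet names: df1, df2, df3
head(sheets$df1)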

4.8 Sanitize column names

library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
# create demo data
df_janitor_before = tibble::tibble(`A long column Name with Mixed case`=rnorm(20),
                    `COLUMN_NAME`=rnorm(20),
                    `GROUP_$$`=rnorm(20))
knitr::kable(df_janitor_before)
A long column Name with Mixed case   COLUMN_NAME   GROUP_$$
----------------------------------   -----------   --------
-0.7945638 0.1772229 0.2534504
0.4909454 -0.7211788 -1.1048349
0.5248923 0.2007723 0.1152928
0.0471006 -1.1862365 -0.2875457
1.6110202 0.4735888 -0.7717135
-0.6449934 0.6928074 0.3194950
-1.2587768 -0.2330394 -0.4386248
-1.6392799 -0.9691855 0.4423736
1.4137857 1.1570549 0.7572905
-0.6622113 -0.0525426 1.1595245
-0.4822451 1.1067091 0.6455063
-0.8985352 1.3170614 -0.2237963
-0.8988488 -1.7460213 -1.2421912
2.0983915 -1.0894656 -0.8343506
-0.6571887 -0.9399994 -0.7770962
-0.2447002 1.3487610 -0.0367744
-0.3160510 -0.2375835 -0.2182758
0.8635464 0.8370331 0.2030638
2.1001766 -1.7200397 -0.3829877
-1.0131119 -0.4722650 -1.0560745
df_janitor_after = df_janitor_before %>%
  janitor::clean_names()
knitr::kable(df_janitor_after)
a_long_column_name_with_mixed_case   column_name   group
----------------------------------   -----------   -----
-0.7945638 0.1772229 0.2534504
0.4909454 -0.7211788 -1.1048349
0.5248923 0.2007723 0.1152928
0.0471006 -1.1862365 -0.2875457
1.6110202 0.4735888 -0.7717135
-0.6449934 0.6928074 0.3194950
-1.2587768 -0.2330394 -0.4386248
-1.6392799 -0.9691855 0.4423736
1.4137857 1.1570549 0.7572905
-0.6622113 -0.0525426 1.1595245
-0.4822451 1.1067091 0.6455063
-0.8985352 1.3170614 -0.2237963
-0.8988488 -1.7460213 -1.2421912
2.0983915 -1.0894656 -0.8343506
-0.6571887 -0.9399994 -0.7770962
-0.2447002 1.3487610 -0.0367744
-0.3160510 -0.2375835 -0.2182758
0.8635464 0.8370331 0.2030638
2.1001766 -1.7200397 -0.3829877
-1.0131119 -0.4722650 -1.0560745
# load data ----

# plaintext data files
library(readr)

# csv
dataset_csv <- read_csv("data/demo_fileread.csv", na = "empty") # treat the literal string "empty" as missing
## Rows: 100 Columns: 3
## ── Column specification ─────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): study
## dbl (2): x, y
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# tab-delimited file
dataset_tab <- read_delim("data/demo_fileread.txt", delim = "\t", 
                      escape_double = FALSE, 
                      trim_ws = TRUE)
## Rows: 100 Columns: 3
## ── Column specification ─────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): study
## dbl (2): x, y
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Excel file
library(readxl)
dataset_excel <- read_excel("data/demo_fileread.xlsx", sheet = "Sheet1")

# read SPSS and SAS data files
library(haven)
dataset_sav <- read_sav("data/demo_fileread.sav")
# dataset_sas <- read_sas("data/demo_fileread.sas")
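As an alternative to remembering one reader per format, rio::import() infers the format from the file extension; a short sketch using the demo files written earlier:

# rio picks the appropriate reader based on the file extension
dataset_csv_rio <- rio::import("data/demo_fileread.csv")
dataset_sav_rio <- rio::import("data/demo_fileread.sav")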
# dataset operations -----
library(dplyr)

# stack two or more datasets
df1 = data.frame(x= rnorm(10), dataset=1)
df2 = data.frame(x= rnorm(10), dataset=2)
df3 = data.frame(x= rnorm(10), dataset=3)
df_all = bind_rows(df1, df2, df3)

# join datasets by common identifier
df_link <- data.frame(dataset=c(1,2,3,4), description=c("mock1", "mock2", "mock3","mock4"))
df_all_link_innerjoin <- df_all %>% inner_join(df_link)
## Joining, by = "dataset"
df_all_link_fulljoin <- df_all %>% full_join(df_link) # one extra row, from the unmatched entry for dataset 4
## Joining, by = "dataset"
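The joins above let dplyr guess the key (hence the "Joining, by = ..." messages); spelling the key out silences the message and guards against joining on unintended columns. A small sketch:

# make the join key explicit
df_all_link_innerjoin2 <- df_all %>% inner_join(df_link, by = "dataset")
# left_join keeps every row of df_all even if there is no match in df_link
df_all_link_leftjoin <- df_all %>% left_join(df_link, by = "dataset")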

4.9 Binning data

n = 50
data_to_bin = data.frame(x = rnorm(n=n,mean=500))
data_binned_label = data_to_bin %>% mutate(x_bin = cut(x, breaks=c(0,300,400,500,600,Inf)))
data_binned_num = data_to_bin %>% mutate(x_bin = cut(x, breaks=c(0,300,600,800), labels=FALSE))
data_binned_ord = data_to_bin %>% mutate(x_bin = cut(x, ordered_result=T, breaks=c(0,300,400,500,600,Inf)))

knitr::kable(head(data_binned_label))
        x  x_bin
---------  ----------
 499.1370  (400,500]
 499.5823  (400,500]
 500.5440  (500,600]
 500.2241  (500,600]
 499.3293  (400,500]
 497.1275  (400,500]
knitr::kable(head(data_binned_num))
        x  x_bin
---------  ------
 499.1370       2
 499.5823       2
 500.5440       2
 500.2241       2
 499.3293       2
 497.1275       2
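cut() can also attach custom labels to the bins; the label names below are just an illustration:

# custom bin labels (one label per interval)
data_binned_custom <- data_to_bin %>%
  mutate(x_bin = cut(x,
                     breaks = c(0, 300, 400, 500, 600, Inf),
                     labels = c("very low", "low", "mid", "high", "very high")))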

4.10 String manipulations

## Search for patterns in data
grep("APP", c("APPLES", "APP", "APPLICATION"))
## [1] 1 2 3
grep("APP", c("APPLES", "APP", "APPLICATION"), value=T)
## [1] "APPLES"      "APP"         "APPLICATION"
## Replace patterns in data
gsub("APPLES","BANANAS",c("APPLES", "BANANAS"))
## [1] "BANANAS" "BANANAS"
gsub("ERR","ERROR",c("ERR", "ERRROR"))
## [1] "ERROR"    "ERRORROR"
# stringi stuff
library(stringi)

stri_count_fixed("ACATGAACGGGTACACACTG", "ACA", overlap=TRUE)
## [1] 3
x <- c("spam", "")
stri_length(x)
## [1] 4 0
stri_dup(letters[1:5], 1:5)
## [1] "a"     "bb"    "ccc"   "dddd"  "eeeee"
words <- list(c("spam", "bacon", "sausage", "spam"), c("eggs", "spam"))
stri_join_list(words, sep=", ")
## [1] "spam, bacon, sausage, spam" "eggs, spam"

4.11 Generate random data

x = rnorm(n=500,mean=300,sd=3)

4.12 Simple stats

mean(x)
## [1] 299.932
sd(x)
## [1] 3.084418
min(x)
## [1] 287.8004
max(x)
## [1] 308.4349
quantile(x,c(0.6))
##      60% 
## 300.6428
quantile(x,c(0.95))
##      95% 
## 305.0184
quantile(x,c(0.99))
##      99% 
## 307.3388
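quantile() accepts a vector of probabilities, and summary() reports several of these statistics at once:

quantile(x, probs = c(0.25, 0.5, 0.75))  # several quantiles in one call
summary(x)                               # min, quartiles, mean, max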

4.13 Data quality reports

## Data quality inspection
knitr::kable(skimr::skim(x)) # one variable 
skim_type skim_variable n_missing complete_rate numeric.mean numeric.sd numeric.p0 numeric.p25 numeric.p50 numeric.p75 numeric.p100 numeric.hist
numeric data 0 1 299.932 3.084418 287.8004 297.8895 299.8438 302.0092 308.4349 ▁▂▇▆▂
knitr::kable(skimr::skim(dataset_csv)) # or a whole dataset!
skim_type skim_variable n_missing complete_rate character.min character.max character.empty character.n_unique character.whitespace numeric.mean numeric.sd numeric.p0 numeric.p25 numeric.p50 numeric.p75 numeric.p100 numeric.hist
character study 0 1 15 15 0 1 0 NA NA NA NA NA NA NA NA
numeric x 0 1 NA NA NA NA NA -0.0046568 0.9139402 -1.928314 -0.6578389 -0.0151540 0.5899338 3.078970 ▃▇▇▂▁
numeric y 0 1 NA NA NA NA NA 0.0841022 0.9242699 -1.819545 -0.4945237 0.0930338 0.6743415 2.496702 ▂▆▇▃▁
dqr = skimr::skim(dataset_csv) # save to a variable to later export as csv
knitr::kable(dqr)
skim_type skim_variable n_missing complete_rate character.min character.max character.empty character.n_unique character.whitespace numeric.mean numeric.sd numeric.p0 numeric.p25 numeric.p50 numeric.p75 numeric.p100 numeric.hist
character study 0 1 15 15 0 1 0 NA NA NA NA NA NA NA NA
numeric x 0 1 NA NA NA NA NA -0.0046568 0.9139402 -1.928314 -0.6578389 -0.0151540 0.5899338 3.078970 ▃▇▇▂▁
numeric y 0 1 NA NA NA NA NA 0.0841022 0.9242699 -1.819545 -0.4945237 0.0930338 0.6743415 2.496702 ▂▆▇▃▁
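The comment above mentions exporting the report as a csv; since the skim() result is a data frame, it can be written out like any other table (the file name here is my own choice):

# write the data quality report to disk for sharing/archiving
readr::write_csv(dqr, "data/dqr_demo_fileread.csv")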

4.14 Data integrity

MD5 and SHA-256 hashes of data are helpful when making snapshots for long-term storage and curation: the recipient can recompute the hash when opening the dataset and verify that no data was lost or corrupted in transmission.

library(digest)
digest("a", algo="md5")
## [1] "127a2ec00989b9f7faf671ed470be7f8"
digest("a", algo="sha256")
## [1] "fb1a678ef965ad4a66c712d2161f20319091cb4e7611e1925df671018c833f72"