Installing packages
# install packages from CRAN ----
cran_packages <- c("xlsx", "tidyverse", "devtools")
install.packages(cran_packages)
Aside: Some of my favorite packages
Install them all in one go with the code below:
# favorite packages, installed in a single call
favorite_packages <- c(
  "tidyverse", "devtools", "qualtRics", "tidytext",
  "textdata", "topicmodels", "wordcloud", "ggwordcloud",
  "lme4", "sjPlot", "janitor", "esquisse",
  "rio", "cowplot", "stringi", "digest"
)
install.packages(favorite_packages)
Check underlying types
## [1] "numeric"
## [1] "numeric"
## [1] "logical"
## [1] "Date"
## [1] "list"
## [1] "data.frame"
## [1] "tbl_df" "tbl" "data.frame"
## [1] "numeric"
# generate a sequence ----
# integers 1 through 100 (as doubles, matching seq() with by = 1)
seq_1_100 <- seq(from = 1, to = 100, by = 1)
# 0 to 1 in steps of 0.1
seq_0_1 <- seq(from = 0, to = 1, by = 0.1)
seq_0_1
## [1] 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0
# sample from vector of values -----
# pool of candidate values to draw from
list_to_sample <- c("A", "1", "2", "B")
# draw a single value (without replacement)
sampled_value <- sample(list_to_sample, size = 1)
sampled_value
## [1] "A"
# draw more values than the pool holds, so replacement is required
sampled_values_replace <- sample(list_to_sample, size = 10, replace = TRUE)
sampled_values_replace
## [1] "B" "A" "A" "1" "1" "1" "B" "B" "1" "A"
File operations
# list files
# NOTE: the `pattern` argument of list.files() is a regular expression,
# not a shell glob. "*.csv" as a regex means "any character followed by
# csv, anywhere in the name" and would also match e.g. "mycsv.txt".
# Anchor the extension instead (glob2rx("*.csv") produces the same regex).
all_csv_files <- list.files("data",
                            pattern = "\\.csv$",
                            recursive = TRUE,
                            full.names = TRUE)
all_csv_files
## [1] "data/demo_fileread.csv" "data/demo_qualtrics_export.csv"
## [3] "data/exp1.csv" "data/fake reviews dataset.csv"
# To list every file, omit `pattern` entirely: it is a regular
# expression, not a glob, so "*" is not needed (and is technically an
# invalid regex that only works by leniency of the engine).
all_files <- list.files("data",
                        recursive = TRUE,
                        full.names = TRUE)
all_files
## [1] "data/demo_fileread.csv"
## [2] "data/demo_fileread.sas"
## [3] "data/demo_fileread.sav"
## [4] "data/demo_fileread.txt"
## [5] "data/demo_fileread.xlsx"
## [6] "data/demo_qualtrics_export.csv"
## [7] "data/exp1.csv"
## [8] "data/fake reviews dataset.csv"
## [9] "data/OneExcelWorkbookWith3Sheet.xlsx"
## [1] TRUE
## [1] "File created: test/dataframe.R"
# remove directory (recursive = TRUE also deletes its contents)
dir_to_remove <- "test"
unlink(dir_to_remove, recursive = TRUE)
Sanitize column names
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# load data ----
# plaintext data files
library(readr)
# csv: cells containing the literal string "empty" are treated as missing
csv_path <- "data/demo_fileread.csv"
dataset_csv <- read_csv(csv_path, na = "empty")
## Rows: 100 Columns: 3
## ── Column specification ─────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): study
## dbl (2): x, y
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# tab-delimited file
tab_path <- "data/demo_fileread.txt"
dataset_tab <- read_delim(
  tab_path,
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
## Rows: 100 Columns: 3
## ── Column specification ─────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): study
## dbl (2): x, y
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Excel file: read a single named worksheet
library(readxl)
excel_path <- "data/demo_fileread.xlsx"
dataset_excel <- read_excel(excel_path, sheet = "Sheet1")
# read SPSS and SAS data file
library(haven)
dataset_sav <- read_sav("data/demo_fileread.sav")
# NOTE(review): the commented call below passes a .sas file to read_sav(),
# which reads SPSS .sav files. haven reads SAS data with read_sas(), which
# expects a .sas7bdat/.xpt data file — a plain .sas file is usually a SAS
# program script, not data. Confirm the intended file format before enabling.
#dataset_sas <- read_sav("data/demo_fileread.sas")
# dataset operations -----
# stack two or more datasets
# each mock dataset carries a `dataset` id column so rows remain traceable
# after stacking (use <- for assignment, per tidyverse style)
df1 <- data.frame(x = rnorm(10), dataset = 1)
df2 <- data.frame(x = rnorm(10), dataset = 2)
df3 <- data.frame(x = rnorm(10), dataset = 3)
df_all <- bind_rows(df1, df2, df3)
# join datasets by common identifier
# df_link has one extra entry (dataset 4) with no rows in df_all
df_link <- data.frame(dataset = c(1, 2, 3, 4),
                      description = c("mock1", "mock2", "mock3", "mock4"))
# state the join key explicitly: silences the "Joining, by = ..." message
# and fails loudly if the shared column is ever renamed
df_all_link_innerjoin <- df_all %>% inner_join(df_link, by = "dataset")
df_all_link_fulljoin <- df_all %>% full_join(df_link, by = "dataset") # end up with 1 extra due to entry for dataset 4
Binning data
# number of observations to simulate
n <- 50
data_to_bin <- data.frame(x = rnorm(n = n, mean = 500))
# interval-labelled bins, e.g. "(400,500]"
data_binned_label <- data_to_bin %>%
  mutate(x_bin = cut(x, breaks = c(0, 300, 400, 500, 600, Inf)))
# labels = FALSE returns the integer bin index instead of a factor
data_binned_num <- data_to_bin %>%
  mutate(x_bin = cut(x, breaks = c(0, 300, 600, 800), labels = FALSE))
# ordered_result = TRUE yields an ordered factor (bins are comparable with <)
data_binned_ord <- data_to_bin %>%
  mutate(x_bin = cut(x, ordered_result = TRUE,
                     breaks = c(0, 300, 400, 500, 600, Inf)))
knitr::kable(head(data_binned_label))
499.1370 |
(400,500] |
499.5823 |
(400,500] |
500.5440 |
(500,600] |
500.2241 |
(500,600] |
499.3293 |
(400,500] |
497.1275 |
(400,500] |
499.1370 |
2 |
499.5823 |
2 |
500.5440 |
2 |
500.2241 |
2 |
499.3293 |
2 |
497.1275 |
2 |
String manipulations
## Search for data in data
# grep() returns the indices of matching elements ...
grep("APP", c("APPLES", "APP", "APPLICATION"))
## [1] 1 2 3
# ... or the matching values themselves with value = TRUE
grep("APP", c("APPLES", "APP", "APPLICATION"), value = TRUE)
## [1] "APPLES" "APP" "APPLICATION"
## Replace data in data
gsub("APPLES","BANANAS",c("APPLES", "BANANAS"))
## [1] "BANANAS" "BANANAS"
# gsub() replaces every occurrence: "ERRROR" contains "ERR", so it
# becomes "ERRORROR" — a common surprise with substring patterns
gsub("ERR","ERROR",c("ERR", "ERRROR"))
## [1] "ERROR" "ERRORROR"
## [1] 3
## [1] 4 0
## [1] "a" "bb" "ccc" "dddd" "eeeee"
# collapse each character vector in the list into one comma-separated string
words <- list(
  c("spam", "bacon", "sausage", "spam"),
  c("eggs", "spam")
)
stri_join_list(words, sep = ", ")
## [1] "spam, bacon, sausage, spam" "eggs, spam"
Data integrity
MD5 and SHA-256 hashes of data are helpful when making snapshots of data for long-term storage and curation. Including these hashes lets the recipient recompute them upon opening the dataset and confirm that no data was lost or corrupted in transmission.
## [1] "127a2ec00989b9f7faf671ed470be7f8"
## [1] "fb1a678ef965ad4a66c712d2161f20319091cb4e7611e1925df671018c833f72"