Ch. 7 Data Wrangling - Special considerations with survey data
7.1 Overview
In this example, we will work with the qualtRics
package to read in Qualtrics exports, and apply minimal pre-processing.
7.2 Load data
Read in survey and split StartDate into two columns for easy filtering (we don’t need pilot data)
library(qualtRics)
library(tidyverse)
# set filename of survey file
fn = "data/demo_qualtrics_export.csv"
# filter out pilot data
PILOT_START_DATE = as.Date("2022-06-21")
qualtrics_df <- read_survey(fn) %>%
separate(StartDate, into=c("dt_date", "dt_time"), sep=" ", remove=F) %>%
mutate(is_pilot_data = dt_date < PILOT_START_DATE) %>%
filter(!is_pilot_data) %>%
mutate(participant_id = as.character(ResponseId))
##
## ── Column specification ─────────────────────────────────────────────────────────────────
## cols(
## .default = col_character(),
## StartDate = col_datetime(format = ""),
## EndDate = col_datetime(format = ""),
## Progress = col_double(),
## `Duration (in seconds)` = col_double(),
## Finished = col_logical(),
## RecordedDate = col_datetime(format = ""),
## RecipientLastName = col_logical(),
## RecipientFirstName = col_logical(),
## RecipientEmail = col_logical(),
## ExternalReference = col_logical(),
## LocationLatitude = col_double(),
## LocationLongitude = col_double(),
## demo_yob = col_double(),
## `demo_timer_First Click` = col_double(),
## `demo_timer_Last Click` = col_double(),
## `demo_timer_Page Submit` = col_double(),
## `demo_timer_Click Count` = col_double(),
## `tipi_timer_First Click` = col_double(),
## `tipi_timer_Last Click` = col_double(),
## `tipi_timer_Page Submit` = col_double()
## # ... with 1 more columns
## )
## ℹ Use `spec()` for the full column specifications.
7.3 Split demographics into seperate dataset
df_demo = qualtrics_df %>%
select(ResponseId, contains("demo"))
knitr::kable(df_demo)# %>% kableExtra::kable_paper(.)
ResponseId | demo_yob | demo_state | demo_educ | demo_timer_First Click | demo_timer_Last Click | demo_timer_Page Submit | demo_timer_Click Count |
---|---|---|---|---|---|---|---|
R_24ratHMHlPXg31b | 1991 | FL | DOCTORAL DEGREE (EXAMPLE: PhD, EdD) | 1.066 | 6.14 | 6.898 | 3 |
7.4 Split out survey metadata
df_metadata = qualtrics_df %>%
select(ResponseId, contains("Date"), contains("duration"), contains("progress"))
knitr::kable(df_metadata)# %>% kableExtra::kable_paper(.)
ResponseId | StartDate | dt_date | EndDate | RecordedDate | Duration (in seconds) | Progress |
---|---|---|---|---|---|---|
R_24ratHMHlPXg31b | 2022-06-22 10:59:58 | 2022-06-22 | 2022-06-22 11:00:15 | 2022-06-22 11:00:15 | 17 | 100 |
7.5 Split timer data into seperate dataset
df_timings = qualtrics_df %>%
select(ResponseId, contains("timer"))
knitr::kable(df_timings)# %>% kableExtra::kable_paper(.)
ResponseId | demo_timer_First Click | demo_timer_Last Click | demo_timer_Page Submit | demo_timer_Click Count | tipi_timer_First Click | tipi_timer_Last Click | tipi_timer_Page Submit | tipi_timer_Click Count |
---|---|---|---|---|---|---|---|---|
R_24ratHMHlPXg31b | 1.066 | 6.14 | 6.898 | 3 | 0.731 | 5.849 | 6.542 | 10 |
7.6 Score TIPI data
7.6.1 Create reverse-scoring function
reverse_code <- function(df, ques_label) {
out_df = df %>%
filter(tipi_question == ques_label) %>%
mutate(response_n = recode(response,
`Disagree strongly` = 7,
`Disagree moderately` = 6,
`Disagree a little` = 5,
`Neither agree nor disagree` = 4,
`Agree a little` = 3,
`Agree moderately` = 2,
`Agree strongly` = 1,
.default=-999))
return(out_df)
}
7.6.3 Transform TIPI from wide to long
tipi_long = tipi_df %>%
pivot_longer(cols=TIPI_1:TIPI_10, names_to="tipi_question", values_to="response")
7.6.5 Apply Reverse Coding
NOTE: Extraversion: 1, 6R; Agreeableness: 2R, 7; Conscientiousness; 3, 8R; Emotional Stability: 4R, 9; Openness to Experiences: 5, 10R
tipi_long_recode2 = reverse_code(tipi_long_r, "TIPI_2")
tipi_long_recode4 = reverse_code(tipi_long_r, "TIPI_4")
tipi_long_recode6 = reverse_code(tipi_long_r, "TIPI_6")
tipi_long_recode8 = reverse_code(tipi_long_r, "TIPI_8")
tipi_long_recode10 = reverse_code(tipi_long_r, "TIPI_10")
tipi_long_nonrecoded = tipi_long_r %>%
filter(tipi_question %in% c("TIPI_1","TIPI_3","TIPI_5","TIPI_7","TIPI_9"))
tipi_long_recode_all = bind_rows(tipi_long_nonrecoded,
tipi_long_recode2,
tipi_long_recode4,
tipi_long_recode6,
tipi_long_recode8,
tipi_long_recode10)
7.6.6 Finally transform to wide format
tipi_wide_recoded = tipi_long_recode_all %>%
select(-response) %>%
pivot_wider(names_from="tipi_question", values_from="response_n") %>%
rowwise() %>%
mutate(tipi_extraversion = mean(c(TIPI_1, TIPI_6), na.rm=T)) %>%
mutate(tipi_agreeableness = mean(c(TIPI_2, TIPI_7), na.rm=T),
tipi_conscientiousness = mean(c(TIPI_3, TIPI_8), na.rm=T),
tipi_emostability = mean(c(TIPI_4, TIPI_9), na.rm=T),
tipi_openexperience = mean(c(TIPI_5, TIPI_10), na.rm=T))
knitr::kable(tipi_wide_recoded %>% select(contains("TIPI_", ignore.case = F)))# %>% kableExtra::kable_paper(.)
TIPI_1 | TIPI_3 | TIPI_5 | TIPI_7 | TIPI_9 | TIPI_2 | TIPI_4 | TIPI_6 | TIPI_8 | TIPI_10 |
---|---|---|---|---|---|---|---|---|---|
1 | 7 | 1 | 7 | 7 | 1 | 7 | 1 | 7 | 7 |
knitr::kable(tipi_wide_recoded %>% select(contains("tipi_", ignore.case = F)))# %>% kableExtra::kable_paper(.)
tipi_extraversion | tipi_agreeableness | tipi_conscientiousness | tipi_emostability | tipi_openexperience |
---|---|---|---|---|
1 | 4 | 7 | 7 | 4 |