Skip to contents

This function calculates the mean, median, and standard deviation of each participant's exposure to an air pollutant over the period between their recorded start and end dates, using the air quality measurements for their county and state.

Usage

summarise_exposure(
  participants_df,
  air_quality_df,
  date_col,
  pollutant_col,
  start_col,
  end_col,
  county_name = "county",
  state_name = "state",
  group_vars = NULL
)

Arguments

participants_df

A dataframe containing participant information, including start and end dates.

air_quality_df

A dataframe containing air quality measurements, with date and pollutant values.

date_col

A string specifying the column name in air_quality_df that contains date values.

pollutant_col

A string specifying the column name in air_quality_df that contains pollutant values.

start_col

A string specifying the column name in participants_df that contains the start date.

end_col

A string specifying the column name in participants_df that contains the end date.

county_name

A string specifying the column name, present in both participants_df and air_quality_df, that contains county names. Default is "county".

state_name

A string specifying the column name, present in both participants_df and air_quality_df, that contains state names. Default is "state".

group_vars

A character vector of additional grouping variables from participants_df (e.g., participant ID, age, smoking status). Default is NULL.

Value

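A tibble containing the mean, median, and standard deviation of exposure for each participant during their study period (columns mean_exposure, median_exposure, and sd_exposure), along with the number of valid exposure records (n_exposure_records). Any columns named in group_vars are included as identifying columns.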

Examples

library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.4     ✔ readr     2.1.5
#> ✔ forcats   1.0.0     ✔ stringr   1.5.1
#> ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
#> ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
#> ✔ purrr     1.0.2     
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package to force all conflicts to become errors

# Simulated daily PM2.5 readings covering 2000-2020. Every day has exactly one
# reading, and the county/state labels cycle A/X, B/Y, C/Z across consecutive
# days. runif() is not seeded, so the simulated levels (and therefore the
# summary statistics printed below) change from run to run.
air_quality_df <- tibble(
  date = seq(
    from = as.Date("2000-01-01"),
    to = as.Date("2020-12-31"),
    by = "day"
  ),
  pm25_level = runif(n = length(date), min = 5, max = 80),
  county_name = rep_len(c("CountyA", "CountyB", "CountyC"), length(date)),
  state_name = rep_len(c("StateX", "StateY", "StateZ"), length(date))
)

# Five simulated participants, each with a follow-up window, a home
# county/state, and two covariates (age and smoking status).
participants_df <- tibble(
  participant_id = seq_len(5),
  start_date = as.Date(c(
    "2005-06-01", "2010-01-01", "2015-03-15", "2008-07-10", "2012-09-20"
  )),
  end_date = as.Date(c(
    "2005-12-31", "2010-12-31", "2015-09-30", "2009-05-20", "2013-06-15"
  )),
  age = c(65, 72, 50, 60, 58),
  county_name = c("CountyA", "CountyB", "CountyC", "CountyA", "CountyB"),
  state_name = c("StateX", "StateY", "StateZ", "StateX", "StateY"),
  smoking_status = c("Never", "Former", "Current", "Never", "Former")
)

# Summarise each participant's exposure. Arguments are grouped by the data
# frame they describe; the columns listed in group_vars appear in the result
# alongside the summary statistics.
exposure_results <- summarise_exposure(
  # Participant table and its date-window columns
  participants_df = participants_df,
  start_col = "start_date",
  end_col = "end_date",
  # Air quality table and its measurement columns
  air_quality_df = air_quality_df,
  date_col = "date",
  pollutant_col = "pm25_level",
  # Location columns shared by both tables
  county_name = "county_name",
  state_name = "state_name",
  # Participant-level columns to keep in the output
  group_vars = c("participant_id", "age", "smoking_status")
)

print(exposure_results)
#> # A tibble: 5 × 7
#>   participant_id   age smoking_status mean_exposure median_exposure sd_exposure
#>            <int> <dbl> <chr>                  <dbl>           <dbl>       <dbl>
#> 1              1    65 Never                   43.1            44.3        22.1
#> 2              2    72 Former                  38.3            33.1        21.0
#> 3              3    50 Current                 40.6            42.0        20.4
#> 4              4    60 Never                   43.5            44.3        22.4
#> 5              5    58 Former                  44.8            47.1        20.4
#> # ℹ 1 more variable: n_exposure_records <int>