how-to-create-qnr-metadata-dta
How to create a questionnaire metadata data set from the JSON file
To create a questionnaire metadata data set, one must:
- Extract data from JSON
- Reshape data into rectangular form
- Extract data from reusable categories
- Reshape that data into the desired form
- Combine JSON and reusable categories data
- Select data relevant for questions and variables
- Write data to a Stata file
At present, this can be achieved with the R script below.
In the future, this will likely be done through a simple function in the {susometa} package, and potentially in a selector command that wraps this function.
# =============================================================================
# Set up paths
# =============================================================================
# NOTE: provide paths with / rather than \
# directory that contains `document.json` and the `Categories` sub-directory
json_dir <- ""
# directory where `question_metadata.dta` will be written
out_dir <- ""
# =============================================================================
# Install required packages
# =============================================================================
# Install {pak}, the installer used below, only when it is not yet available.
# `requireNamespace()` tests for availability without attaching the package,
# which is sufficient because pak is only ever called as `pak::pak()`.
if (!requireNamespace("pak", quietly = TRUE)) {
  install.packages("pak")
}
# Install every package that this script calls directly through `pkg::fun()`.
# {dplyr} and {tidyselect} are listed explicitly because the script uses them
# itself, rather than relying on them arriving as dependencies of another
# package.
required_packages <- c(
  "dplyr",
  "tidyselect",
  "stringr",
  "lsms-worldbank/susometa",
  "fs",
  "haven"
)
pak::pak(required_packages)
# =============================================================================
# Ingest JSON questionnaire metadata
# =============================================================================
# parse `document.json`, found in `json_dir`, into `qnr_df`
qnr_df <- susometa::parse_questionnaire(
  path = fs::path(json_dir, "document.json")
)
# =============================================================================
# Add reusable categories
# =============================================================================
# parse the reusable categories found in the `Categories` sub-directory
categories_df_raw <- susometa::parse_categories(
  dir = fs::path(json_dir, "Categories")
)
# reshape the parsed categories before joining
categories_df <- susometa::reshape_categories(
  categories_df = categories_df_raw
)
# join the reshaped categories onto the questionnaire metadata
qnr_df <- susometa::join_categories(
  qnr_json_df = qnr_df,
  categories_df = categories_df
)
# =============================================================================
# Make the metadata data frame Stata-friendly in form and content
# =============================================================================
# Convert Boolean values to integer.
#
# Maps values equal to TRUE to 1L, values equal to FALSE to 0L, and everything
# else (including NA) to NA.
#
# @param x Vector to convert; expected to be logical.
# @return Integer vector of the same length as `x`.
#
# Written in base R so that it does not depend on `%>%`, which is unavailable
# unless {magrittr} or {dplyr} has been attached -- and this script only ever
# calls packages through `pkg::fun()`.
from_bool_to_int <- function(x) {
  # start from all-missing so that unmatched values stay NA
  out <- rep(NA_integer_, length(x))
  # `which()` drops NA comparisons, so missing inputs are left untouched;
  # the explicit `== TRUE` / `== FALSE` comparisons keep the original matching
  # rules for non-logical input
  out[which(x == TRUE)] <- 1L
  out[which(x == FALSE)] <- 0L
  out
}
# Create a Stata-friendly version of the questionnaire metadata data frame.
#
# Steps, in order:
# 1. replace every `.` in column names with `_`
# 2. strip newline characters from all character columns
# 3. convert every `is_*` column from Boolean to integer
# 4. drop the `text` column
#
# Uses the native pipe `|>`, like the rest of the script, because `%>%` is not
# available unless {magrittr} or {dplyr} has been attached.
stata_qnr_df <- qnr_df |>
  # rename columns with illegal names
  dplyr::rename_with(
    .fn = ~ stringr::str_replace_all(
      string = .x,
      pattern = "\\.",
      replacement = "_"
    )
  ) |>
  dplyr::mutate(
    # remove newline from string variables
    dplyr::across(
      .cols = tidyselect::where(is.character),
      .fns = ~ stringr::str_replace_all(
        string = .x,
        pattern = "\n",
        replacement = ""
      )
    ),
    # store Boolean flags as integer
    dplyr::across(
      .cols = tidyselect::starts_with("is_"),
      .fns = ~ from_bool_to_int(.x)
    )
  ) |>
  # remove columns with stubborn newline characters
  dplyr::select(-text)
# =============================================================================
# Extract data for questions in the export data
# =============================================================================
# question metadata: keep only questions and relevant metadata attributes,
# then drop the `*_expression` columns (filter, enablement, and validation
# expressions), which may have newline characters
q_df <- dplyr::select(
  susometa::get_questions(stata_qnr_df),
  -dplyr::matches("_expression")
)
# variable metadata, with `varname` populated from `name_variable`
var_df <- dplyr::mutate(
  susometa::get_variables(qnr_df = stata_qnr_df),
  varname = name_variable
)
# stack question metadata on top of variable metadata
qv_df <- q_df |>
  dplyr::bind_rows(var_df)
# =============================================================================
# Save metadata to a Stata file
# =============================================================================
# write the combined question and variable metadata to `out_dir`
haven::write_dta(
  data = qv_df,
  path = fs::path(out_dir, "question_metadata.dta")
)