library(dplyr)
library(tidyr)
library(ggplot2)
library(languageR)
This version reads the current cleaned CSV files and keeps their
original column names at import.
Within R, only the columns that are used later in the analysis are
renamed to the older internal names (e.g. age.group,
item.order, target.english).
# Import the cleaned child data, one CSV file per language.
# check.names = FALSE keeps the column headers exactly as written in the
# files; renaming to the internal names happens later inside R.
hun.data <- read.csv(
  file = "../3-CleanData/QEl1_data_hun_children_clean.csv",
  check.names = FALSE
)
deu.data <- read.csv(
  file = "../3-CleanData/QEl1_data_deu_children_clean.csv",
  check.names = FALSE
)
yor.data <- read.csv(
  file = "../3-CleanData/QEl1_data_yor_children_clean.csv",
  check.names = FALSE
)
cmn.data <- read.csv(
  file = "../3-CleanData/QEl1_data_cmn_children_clean.csv",
  check.names = FALSE
)
mal.data <- read.csv(
  file = "../3-CleanData/QEl1_data_mal_children_clean.csv",
  check.names = FALSE
)
# Rename columns of `df` according to `mapping`, skipping absent columns.
#
# df:      data frame (or tibble) whose column names may be changed.
# mapping: named character vector; each name is the new column name and
#          each value is the column name currently expected in `df`.
#
# Entries are applied one after another in the order of `names(mapping)`,
# so a later entry sees the names produced by earlier ones. Entries whose
# current name is not found in `df` are ignored.
# Returns `df` with the updated column names.
rename_if_present <- function(df, mapping) {
  for (target_name in names(mapping)) {
    source_name <- mapping[[target_name]]
    if (!is.element(source_name, names(df))) {
      next
    }
    is_source <- names(df) == source_name
    names(df)[is_source] <- target_name
  }
  df
}
# Add every column listed in `columns` that `df` does not have yet.
#
# df:      data frame (or tibble) to extend.
# columns: character vector of column names that must exist afterwards.
# value:   fill value for newly created columns (default 0).
#
# Existing columns are never modified. New columns are appended in the
# order in which they appear in `columns`.
# Returns `df` with the additional columns.
add_missing_columns <- function(df, columns, value = 0) {
  n_rows <- nrow(df)
  # Expand a scalar fill to one entry per row. Assigning a length-1 value to
  # a zero-row base data.frame would otherwise stop with
  # "replacement has 1 row, data has 0".
  fill <- value
  if (!is.null(n_rows) && length(value) == 1L) {
    fill <- rep_len(value, n_rows)
  }
  for (col in setdiff(columns, names(df))) {
    df[[col]] <- fill
  }
  df
}
# Bring one language's raw child data into the shared internal layout.
#
# df:        data frame as read from a cleaned CSV file (original headers).
# lang_name: value stored in the new `lang` column for every row.
#
# Steps: replace NA with 0, rename the CSV columns to the internal names,
# resolve alternative names of the `frag` column, add columns that are
# missing for this language (filled with 0), make the text columns
# character, and keep only `lang` plus the columns used by the analysis.
# Returns a tibble with the columns `lang` and `needed_columns`, in that
# order.
standardize_child_data <- function(df, lang_name) {
  # Replace missing values in the raw CSV with 0, as in the original script.
  df[is.na(df)] <- 0
  df <- as_tibble(df)

  # Map current CSV names to the old internal names expected by the analysis.
  # Names are the internal column names, values are the CSV headers.
  common_mapping <- c(
    age.group = "group",
    participant = "participant_id",
    age.in.mo = "age_in_mo",
    item.order = "order",
    item.no = "item",
    test.date = "test_date",
    target.english = "target_in_english",
    error.code = "error_code",
    subj.for.obj = "subj_for_object",
    obj.for.subj = "object_for_subject",
    who.for.which = "who_for_which",
    who.for.what = "who_for_what",
    what.for.which = "what_for_which",
    what.for.who = "what_for_who",
    which.for.who = "which_for_who",
    which.for.what = "which_for_what",
    drop.after.which = "np_drop_after_which",
    verb.num = "errors_on_verbs_number",
    subj.num = "error_on_subjects_numbers",
    obj.num = "error_on_objects_numbers",
    null.obj = "object_omission",
    null.subj = "subject_omission",
    object.pro = "substitution_of_object_with_pronouns",
    subject.pro = "substitution_of_subject_with_pronouns",
    case.subj = "case_error_subject",
    case.obj = "case_error_object",
    dcm = "drop_of_differential_case_marking",
    role.inversion = "inversion_of_roles_subj_obj",
    other.v = "use_of_other_verbs",
    tense.aspect = "tense_aspect_error",
    s.pass = "short_passive",
    l.pass = "long_passives",
    extra.demo = "additional_demonstrative_for_determiner",
    numeral = "addition_of_a_numeral",
    other.grammar = "other_grammatical_constructions",
    other.error = "other_errors",
    code.switch = "code_switching",
    no.res = "no_response",
    frag = "fragment_use_of_only_wh_element",
    answer = "answer_to_the_question",
    res.dp = "resumptive_definite",
    want = "want_to_v_for_is_v_ing",
    res.n = "resumptive_n",
    topic = "topicalized",
    gender = "gender_error"
  )
  df <- rename_if_present(df, common_mapping)

  # Language-specific aliases where the same coding category has a different
  # CSV name. An alias is only used while no `frag` column exists yet, so the
  # first alias found wins.
  frag_aliases <- c(
    "fragment",
    "no_question_or_fragment_use_of_only_wh_element"
  )
  for (alias in frag_aliases) {
    if (alias %in% names(df) && !"frag" %in% names(df)) {
      names(df)[names(df) == alias] <- "frag"
    }
  }

  # Keep language-specific target-language columns only if needed later; the
  # analysis uses target.english and target.
  # Columns missing in some languages are added with 0 so that
  # bind_rows/select is stable.
  needed_columns <- c(
    "age.group", "participant", "item.order", "item.no", "age.in.mo",
    "transcription", "gloss", "target.english", "target",
    "correct", "grammatical", "subj.for.obj", "obj.for.subj",
    "who.for.which", "who.for.what", "what.for.which", "what.for.who",
    "which.for.who", "which.for.what", "drop.after.which", "verb.num",
    "subj.num", "obj.num", "null.obj", "null.subj", "object.pro",
    "subject.pro", "case.subj", "case.obj", "role.inversion", "other.v",
    "tense.aspect", "cleft", "s.pass", "l.pass", "extra.demo",
    "other.grammar", "other.error", "code.switch", "no.res", "frag",
    "answer", "res.dp", "want", "res.n", "numeral", "topic", "gender",
    "dislocation"
  )
  df <- add_missing_columns(df, needed_columns, value = 0)

  # Text columns must have the same type in every language. A column that is
  # absent (filled with numeric 0 above) or completely empty in one CSV would
  # otherwise be numeric there and character elsewhere, and bind_rows()
  # refuses to combine the two. The fill value therefore becomes "0".
  # NOTE(review): `target` is left untouched because its type is not visible
  # here -- confirm whether it holds text or a numeric code.
  text_columns <- c(
    "participant", "item.no", "transcription", "gloss", "target.english"
  )
  for (col in text_columns) {
    df[[col]] <- as.character(df[[col]])
  }

  df %>%
    mutate(lang = lang_name) %>%
    mutate(age.in.mo = as.numeric(age.in.mo)) %>%
    select(lang, all_of(needed_columns))
}
# Standardize every language before excluding anyone, because the exclusion
# filters refer to the internal column name `participant`.
hun.data <- standardize_child_data(hun.data, "hungarian")
cmn.data <- standardize_child_data(cmn.data, "mandarin")
yor.data <- standardize_child_data(yor.data, "yoruba")

# One participant is removed from the German and one from the Malayalam data.
deu.data <- filter(
  standardize_child_data(deu.data, "german"),
  participant != "LDdeuC012"
)
mal.data <- filter(
  standardize_child_data(mal.data, "malayalam"),
  participant != "LDmalC012"
)

# Stack all languages into one table; the row order follows the argument order.
all.select <- bind_rows(hun.data, cmn.data, yor.data, deu.data, mal.data)
Our design:

- 2 (who vs. which) x 2 (subject vs. object), plus a what-object question
- each type of question was elicited 6 times
- the what-question was elicited only as an object question, and we will not use it in some of our analyses.
The item numbers for each wh-phrase * argument combination are as follows:
To be able to sort the items by the type of question elicited, we will create a new variable called “item.type”.
# Item numbers per wh-phrase x argument cell: six items each.
# "S" items are subject questions, "O" items are object questions.
who.subject   <- paste0("S", 1:6)
who.object    <- paste0("O", 1:6)
which.subject <- paste0("S", 7:12)
which.object  <- paste0("O", 7:12)
what.object   <- paste0("O", 13:18)
In addition, we will create new variables “argument.type” and “wh.type”.
# Item numbers grouped by argument type (subject vs. object question) ...
subjectQ <- paste0("S", 1:12)
objectQ  <- paste0("O", 1:18)
# ... and by wh-phrase (who / which / what).
whoQ   <- c(paste0("S", 1:6), paste0("O", 1:6))
whichQ <- c(paste0("S", 7:12), paste0("O", 7:12))
whatQ  <- paste0("O", 13:18)
We balanced the items so that the number features of the wh-phrase and the in-situ argument match in half of the items and mismatch in the other half.
Below, we specify which items had matching or mismatching number features.
# Items whose wh-phrase and in-situ argument have matching number features.
# NOTE: the variable name `match` is also the name of base::match(); calls
# such as match(x, y) still resolve to the function, but the name is easy to
# confuse.
match <- c(
  paste0("S", 1:3), paste0("O", 1:3),
  paste0("S", 7:9), paste0("O", 7:9),
  "O13", "O14", "O18"
)
# Items whose number features mismatch.
mismatch <- c(
  paste0("S", 4:6), paste0("O", 4:6),
  paste0("S", 10:12), paste0("O", 10:12),
  "O16", "O15", "O17"
)
# Classify every row by its item number. Item numbers that appear in none of
# the vectors of a case_when() get NA in that column.
# NOTE(review): missing values were replaced by 0 when the data were
# standardized, so the is.na(target) filter may never drop a row -- confirm
# whether rows with target == 0 were meant to be excluded instead.
all.select <- all.select %>%
  filter(!is.na(target)) %>%
  mutate(
    # wh-phrase of the elicited question
    wh.type = case_when(
      item.no %in% whoQ ~ "whoQ",
      item.no %in% whichQ ~ "whichQ",
      item.no %in% whatQ ~ "whatQ"
    ),
    # extracted argument
    argument.type = case_when(
      item.no %in% subjectQ ~ "subject",
      item.no %in% objectQ ~ "object"
    ),
    # wh-phrase x argument cell
    item.type = case_when(
      item.no %in% who.subject ~ "whoS",
      item.no %in% who.object ~ "whoO",
      item.no %in% which.subject ~ "whichS",
      item.no %in% which.object ~ "whichO",
      item.no %in% what.object ~ "whatO"
    ),
    # number-feature match, coded as +0.5 (match) / -0.5 (mismatch)
    feature.match = case_when(
      item.no %in% match ~ 0.5,
      item.no %in% mismatch ~ -0.5
    )
  )
# Keep rows with a positive age, then collapse the detailed error codes into
# broader 0/1 indicator columns. Each indicator is 1 when its condition is
# TRUE and 0 otherwise (including when the condition is NA).
# NOTE(review): missing ages were replaced by 0 when the data were
# standardized, so the age filter presumably removes rows without an age.
all.select <- all.select %>%
  filter(age.in.mo > 0) %>%
  mutate(
    # short or long passive
    passives = if_else(s.pass == 1 | l.pass == 1, 1, 0, missing = 0),
    # omitted subject or object
    covert.pro = if_else(null.obj == 1 | null.subj == 1, 1, 0, missing = 0),
    # subject or object replaced by a pronoun
    overt.pro = if_else(subject.pro == 1 | object.pro == 1, 1, 0, missing = 0),
    # any pronominalization, covert or overt
    pronom = if_else(covert.pro == 1 | overt.pro == 1, 1, 0, missing = 0),
    # any substitution of one wh-phrase for another
    wrong.wh = if_else(
      what.for.which == 1 | what.for.who == 1 |
        who.for.what == 1 | who.for.which == 1 |
        which.for.what == 1 | which.for.who == 1,
      1, 0,
      missing = 0
    ),
    # `&` binds tighter than `|`, so the passive exclusion applies to
    # subj.for.obj only; the parentheses spell out how the original
    # expression is evaluated.
    # NOTE(review): if passives were meant to be excluded from all three
    # codes, this should read
    # passives == 0 & (subj.for.obj == 1 | obj.for.subj == 1 |
    #   role.inversion == 1) -- confirm the intended grouping.
    reversal = if_else(
      (passives == 0 & subj.for.obj == 1) |
        obj.for.subj == 1 |
        role.inversion == 1,
      1, 0,
      missing = 0
    ),
    # case error on subject or object
    case.error = if_else(case.obj == 1 | case.subj == 1, 1, 0, missing = 0),
    # resumptive element (res.dp or res.n)
    resumptive = if_else(res.dp == 1 | res.n == 1, 1, 0, missing = 0),
    # number error on subject or object
    num.error = if_else(subj.num == 1 | obj.num == 1, 1, 0, missing = 0)
  )
# Remove items O9 and S7 from the German data only; all other languages keep
# every item.
germ <- all.select %>%
  filter(lang == "german", item.no != "O9", item.no != "S7")
non.germ <- all.select %>%
  filter(lang != "german")

# German rows come first in the recombined table, followed by the rest.
all.select <- bind_rows(germ, non.germ)

# Export the final data set to the working directory.
write.csv(all.select, "all.select.csv", row.names = FALSE)