library(dplyr)
library(tidyr)
library(ggplot2)
library(languageR)

Loading the coded data for each language

This version reads the current cleaned CSV files and keeps their original column names at import.
Only inside R, the columns that are used later in the analysis are mapped to the older internal names such as age.group, item.order, target.english, etc.

# Import the cleaned child data, one CSV file per language.
# check.names = FALSE stops read.csv from rewriting the CSV header names.
hun.data <- read.csv(
  file = "../3-CleanData/QEl1_data_hun_children_clean.csv",
  check.names = FALSE
)
deu.data <- read.csv(
  file = "../3-CleanData/QEl1_data_deu_children_clean.csv",
  check.names = FALSE
)
yor.data <- read.csv(
  file = "../3-CleanData/QEl1_data_yor_children_clean.csv",
  check.names = FALSE
)
cmn.data <- read.csv(
  file = "../3-CleanData/QEl1_data_cmn_children_clean.csv",
  check.names = FALSE
)
mal.data <- read.csv(
  file = "../3-CleanData/QEl1_data_mal_children_clean.csv",
  check.names = FALSE
)

Helper functions for current CSV column names

# Rename columns of `df` according to `mapping`.
#
# `mapping` is a named character vector: each name is the new column name and
# each value is the column name to look for in `df`. Entries whose old name is
# not a column of `df` are skipped. Entries are applied one after another, in
# the order they appear in `mapping`. Returns the (possibly renamed) `df`.
rename_if_present <- function(df, mapping) {
  targets <- names(mapping)
  for (i in seq_along(targets)) {
    hits <- which(names(df) == mapping[[targets[i]]])
    if (length(hits) > 0) {
      names(df)[hits] <- targets[i]
    }
  }
  df
}

# Make sure every name in `columns` exists as a column of `df`.
#
# Columns that are not yet present are appended, in the order given, and filled
# with `value` (default 0). Existing columns are left untouched. Returns `df`.
add_missing_columns <- function(df, columns, value = 0) {
  absent <- setdiff(columns, names(df))
  for (nm in absent) {
    df[[nm]] <- value
  }
  df
}

# Bring one language's raw child data into the common analysis layout.
#
# Steps: replace NA cells with 0, convert to a tibble, rename the current CSV
# columns to the internal names used by the analysis, add any analysis column
# that is missing (filled with 0), tag the rows with `lang_name`, coerce
# age.in.mo to numeric, and return only `lang` plus the analysis columns.
#
# df        : data frame as read from one of the cleaned CSV files.
# lang_name : value stored in the new `lang` column.
standardize_child_data <- function(df, lang_name) {
  # Missing values in the raw CSV become 0, as in the original script.
  df[is.na(df)] <- 0
  df <- as_tibble(df)

  # Internal name (left) = current CSV column name (right).
  csv_to_internal <- c(
    age.group = "group",
    participant = "participant_id",
    age.in.mo = "age_in_mo",
    item.order = "order",
    item.no = "item",
    test.date = "test_date",
    target.english = "target_in_english",
    error.code = "error_code",
    subj.for.obj = "subj_for_object",
    obj.for.subj = "object_for_subject",
    who.for.which = "who_for_which",
    who.for.what = "who_for_what",
    what.for.which = "what_for_which",
    what.for.who = "what_for_who",
    which.for.who = "which_for_who",
    which.for.what = "which_for_what",
    drop.after.which = "np_drop_after_which",
    verb.num = "errors_on_verbs_number",
    subj.num = "error_on_subjects_numbers",
    obj.num = "error_on_objects_numbers",
    null.obj = "object_omission",
    null.subj = "subject_omission",
    object.pro = "substitution_of_object_with_pronouns",
    subject.pro = "substitution_of_subject_with_pronouns",
    case.subj = "case_error_subject",
    case.obj = "case_error_object",
    dcm = "drop_of_differential_case_marking",
    role.inversion = "inversion_of_roles_subj_obj",
    other.v = "use_of_other_verbs",
    tense.aspect = "tense_aspect_error",
    s.pass = "short_passive",
    l.pass = "long_passives",
    extra.demo = "additional_demonstrative_for_determiner",
    numeral = "addition_of_a_numeral",
    other.grammar = "other_grammatical_constructions",
    other.error = "other_errors",
    code.switch = "code_switching",
    no.res = "no_response",
    frag = "fragment_use_of_only_wh_element",
    answer = "answer_to_the_question",
    res.dp = "resumptive_definite",
    want = "want_to_v_for_is_v_ing",
    res.n = "resumptive_n",
    topic = "topicalized",
    gender = "gender_error"
  )
  df <- rename_if_present(df, csv_to_internal)

  # Some files use a different CSV name for the fragment category. The first
  # alias found becomes `frag`; an alias is ignored once `frag` exists.
  frag_aliases <- c(
    "fragment",
    "no_question_or_fragment_use_of_only_wh_element"
  )
  for (alias in frag_aliases) {
    if (alias %in% names(df) && !"frag" %in% names(df)) {
      names(df)[names(df) == alias] <- "frag"
    }
  }

  # Columns kept for the analysis. Any that a language lacks is created and
  # filled with 0 so that every language ends up with the same column set.
  analysis_columns <- c(
    "age.group", "participant", "item.order", "item.no", "age.in.mo",
    "transcription", "gloss", "target.english", "target",
    "correct", "grammatical", "subj.for.obj", "obj.for.subj",
    "who.for.which", "who.for.what", "what.for.which", "what.for.who",
    "which.for.who", "which.for.what", "drop.after.which", "verb.num",
    "subj.num", "obj.num", "null.obj", "null.subj", "object.pro",
    "subject.pro", "case.subj", "case.obj", "role.inversion", "other.v",
    "tense.aspect", "cleft", "s.pass", "l.pass", "extra.demo",
    "other.grammar", "other.error", "code.switch", "no.res", "frag",
    "answer", "res.dp", "want", "res.n", "numeral", "topic", "gender",
    "dislocation"
  )
  df <- add_missing_columns(df, analysis_columns, value = 0)

  # Tag the language, force a numeric age, and put `lang` first.
  tagged <- mutate(
    df,
    lang = lang_name,
    age.in.mo = as.numeric(age.in.mo)
  )
  select(tagged, lang, all_of(analysis_columns))
}

Excluding participants

# Standardize the column names of every language first; the exclusions below
# rely on the internal column name `participant`.
mal.data <- standardize_child_data(mal.data, "malayalam")
deu.data <- standardize_child_data(deu.data, "german")
hun.data <- standardize_child_data(hun.data, "hungarian")
cmn.data <- standardize_child_data(cmn.data, "mandarin")
yor.data <- standardize_child_data(yor.data, "yoruba")

# Excluded participants: one Malayalam child and one German child.
mal.data <- filter(mal.data, participant != "LDmalC012")
deu.data <- filter(deu.data, participant != "LDdeuC012")

Combining all child data

# Stack the five per-language tables in this order:
# Hungarian, Mandarin, Yoruba, German, Malayalam.
all.select <- bind_rows(
  list(hun.data, cmn.data, yor.data, deu.data, mal.data)
)

creating variables for analysis

Our design: 2 (who vs. which) x 2 (subject vs. object), plus what-object questions. Each type of question was elicited 6 times. What-questions were elicited only as object questions, and we will not use them in some of our analyses.

The item numbers for each wh-phrase × argument combination are listed below.

In order to be able to sort them by the type of questions elicited, we will create a new variable called “item.type”.

# Item IDs belonging to each elicited question type (six items per type).
who.subject <- paste0("S", 1:6)
who.object <- paste0("O", 1:6)
which.subject <- paste0("S", 7:12)
which.object <- paste0("O", 7:12)
what.object <- paste0("O", 13:18)

In addition, we will create new variables “argument.type” and “wh.type”.

# Item IDs grouped by the argument that is questioned:
# S1-S12 are subject questions, O1-O18 are object questions.
subjectQ <- paste0("S", 1:12)
objectQ <- paste0("O", 1:18)

# Item IDs grouped by wh-phrase: subject items first, then object items.
whoQ <- c(paste0("S", 1:6), paste0("O", 1:6))
whichQ <- c(paste0("S", 7:12), paste0("O", 7:12))
whatQ <- paste0("O", 13:18)

We balanced items so that the number feature of the wh-phrase and the in-situ argument match in half of the experiment, and mismatch in the other half.

We specify below which items had matching or mismatching number features.

# Items whose wh-phrase and in-situ argument have matching number features.
# NOTE(review): this variable shares its name with base::match(); calls to
# match() still resolve to the function, but the name is easy to misread.
match <- c(
  paste0("S", 1:3), paste0("O", 1:3),
  paste0("S", 7:9), paste0("O", 7:9),
  "O13", "O14", "O18"
)
# Items whose number features mismatch.
mismatch <- c(
  paste0("S", 4:6), paste0("O", 4:6),
  paste0("S", 10:12), paste0("O", 10:12),
  "O16", "O15", "O17"
)

creating variables

# Derive the design variables from the item ID, then keep only rows with a
# positive age in months. Items that belong to none of the listed sets get NA,
# because the case_when() calls have no fallback branch.
all.select <- all.select %>%
  filter(!is.na(target)) %>%
  mutate(
    # wh-phrase of the elicited question
    wh.type = case_when(
      item.no %in% whoQ ~ "whoQ",
      item.no %in% whichQ ~ "whichQ",
      item.no %in% whatQ ~ "whatQ"
    ),
    # questioned argument
    argument.type = case_when(
      item.no %in% subjectQ ~ "subject",
      item.no %in% objectQ ~ "object"
    ),
    # wh-phrase x argument combination
    item.type = case_when(
      item.no %in% who.subject ~ "whoS",
      item.no %in% who.object ~ "whoO",
      item.no %in% which.subject ~ "whichS",
      item.no %in% which.object ~ "whichO",
      item.no %in% what.object ~ "whatO"
    ),
    # number-feature match coded as +0.5 (match) / -0.5 (mismatch)
    feature.match = case_when(
      item.no %in% match ~ 0.5,
      item.no %in% mismatch ~ -0.5
    )
  ) %>%
  filter(age.in.mo > 0)

creating a variable called “passives”, which includes both short and long passives

# passives = 1 when the response is coded as a short or a long passive,
# otherwise 0 (missing codes count as 0).
all.select <- mutate(
  all.select,
  passives = if_else(s.pass == 1 | l.pass == 1, 1, 0, missing = 0)
)

creating one variable each for covert and overt pronouns (plus a combined variable “pronom”), regardless of whether they are allowed in the language

# covert.pro: an object or subject was omitted.
# overt.pro : a subject or object was replaced by a pronoun.
# pronom    : either of the two flags above is set.
# Each flag is 1 or 0; missing codes count as 0.
all.select <- mutate(
  all.select,
  covert.pro = if_else(null.obj == 1 | null.subj == 1, 1, 0, missing = 0),
  overt.pro = if_else(subject.pro == 1 | object.pro == 1, 1, 0, missing = 0),
  pronom = if_else(covert.pro == 1 | overt.pro == 1, 1, 0, missing = 0)
)

creating a variable “wrong.wh”, which indicates that the wrong wh-word was used, regardless of whether that is allowed in the language

# wrong.wh = 1 when any of the six wh-substitution codes is set, otherwise 0.
all.select <- mutate(
  all.select,
  wrong.wh = if_else(
    what.for.which == 1 | what.for.who == 1 | who.for.what == 1 |
      who.for.which == 1 | which.for.what == 1 | which.for.who == 1,
    1,
    0,
    missing = 0
  )
)

creating a variable “reversal” which indicates the reversal of the roles (both subject to object and object to subject) that is not a passive

# reversal = 1 when the response is NOT a passive and any role-reversal code is
# set (subject for object, object for subject, or role inversion); otherwise 0.
# The parentheses are required: `&` binds tighter than `|`, so without them the
# passive check would only apply to subj.for.obj and passive responses coded
# with obj.for.subj or role.inversion would be counted as reversals.
all.select <- all.select %>%
        mutate(reversal = case_when(
                passives == 0 &
                        (subj.for.obj == 1 | obj.for.subj == 1 | role.inversion == 1) ~ 1,
                TRUE ~ 0
        ))

creating a variable “case.error” which is 1 whenever there is a case error

# case.error = 1 when a case error is coded on the object or on the subject,
# otherwise 0.
all.select <- mutate(
  all.select,
  case.error = if_else(case.obj == 1 | case.subj == 1, 1, 0, missing = 0)
)

creating a variable “resumptive” which is 1 whenever there is a resumptive element

# resumptive = 1 when either resumptive code (res.dp or res.n) is set,
# otherwise 0.
all.select <- mutate(
  all.select,
  resumptive = if_else(res.dp == 1 | res.n == 1, 1, 0, missing = 0)
)

creating a variable “num.error” which is 1 whenever there is a number error on a noun

# num.error = 1 when a number error is coded on the subject or on the object,
# otherwise 0.
all.select <- mutate(
  all.select,
  num.error = if_else(subj.num == 1 | obj.num == 1, 1, 0, missing = 0)
)
# Drop items O9 and S7 from the German data only, then rebuild the full table.
# German rows come first in the result, followed by all other languages.
germ <- filter(
  all.select,
  lang == "german",
  item.no != "O9",
  item.no != "S7"
)
non.germ <- filter(all.select, lang != "german")

all.select <- rbind(germ, non.germ)

# Export the analysis table without row names.
write.csv(all.select, file = "all.select.csv", row.names = FALSE)