################################################################################
# Title:        Fuzzy Matching of Food Items
# Author:       ZR, RL, KG 
# Date:         2025-10-06
################################################################################

##### 1. Install and Load Packages -----------------------------------------------------
# install.packages(c("tidyverse", "readxl", "openxlsx", "stringdist"))

library(tidyverse)
library(readxl)
library(openxlsx)
library(stringdist)

##### 2. Define Paths ----------------------------------------------------------
# NOTE(review): setwd() in a script is discouraged (breaks reproducibility);
# consider here::here() or passing absolute paths instead — TODO confirm with team.
setwd("...")  # Define working directory (placeholder path)

##### 3. Load Datasets ----------------------------------------------------------

# Primary dataset: FoodEx2 classification (placeholder path).
# Keep only the fruit branch, restricted to rows flagged for matching,
# and drop duplicate rows.
dataset1 <- read_excel("...") %>%
  select(L1_FoodEx2_desc, L7_FoodEx2_code, L7_FoodEx2_desc, matching_action) %>%
  filter(L1_FoodEx2_desc == "Fruit and fruit products") %>%
  filter(matching_action %in% c("match_non_composite", "match_composite")) %>%
  distinct()

# Secondary dataset: Agribalyse v3.2 (cleaned; placeholder path).
# Restrict to the subgroups that can plausibly match the fruit branch.
dataset2 <- read_excel("...") %>%
  select(agb_code, food_subgroup_agb, agb_name) %>%
  filter(food_subgroup_agb %in% c("fruit", "jams and similar products"))

##### 4. Clean and Tokenize ----------------------------------------------------------

clean_and_tokenize <- function(str) {
  # Lowercase a food name and split it into word tokens.
  #
  # Punctuation is replaced with a space (not deleted) so hyphenated or
  # slashed compounds tokenize as separate words: "Apple-pie" -> c("apple",
  # "pie") rather than fusing into the unmatchable token "applepie".
  # Empty tokens (from leading/trailing/multiple separators) are dropped.
  str <- tolower(str)
  str <- gsub("[[:punct:]]", " ", str)
  tokens <- strsplit(trimws(str), "\\s+")[[1]]
  tokens[nzchar(tokens)]
}

cleaned_names_db1 <- lapply(dataset1$L7_FoodEx2_desc, clean_and_tokenize)
cleaned_names_db2 <- lapply(dataset2$agb_name, clean_and_tokenize)

##### 5. Define Similarity Functions ----------------------------------------------------------

jaro_winkler_distance <- function(t1, t2) {
  # Jaro-Winkler distance between the two token lists, each re-joined
  # into a single space-separated string (0 = identical strings).
  s1 <- paste(t1, collapse = " ")
  s2 <- paste(t2, collapse = " ")
  stringdist(s1, s2, method = "jw")
}

levenshtein_distance <- function(t1, t2) {
  # Levenshtein edit distance between the two token lists, each re-joined
  # into a single space-separated string (0 = identical strings).
  s1 <- paste(t1, collapse = " ")
  s2 <- paste(t2, collapse = " ")
  stringdist(s1, s2, method = "lv")
}

dice_coefficient <- function(t1, t2) {
  # Sørensen-Dice similarity on token *sets*: 2|A ∩ B| / (|A| + |B|).
  #
  # Fix: the denominator is the sum of the two set sizes, not the size of
  # their union. Dividing by the union over-counts shared tokens and can
  # push the ratio above 1 (the original needed a pmin(..., 1) clamp to
  # hide that). With the correct denominator the result is always in [0, 1].
  a <- unique(t1)
  b <- unique(t2)
  denom <- length(a) + length(b)
  if (denom == 0) return(0)  # both token lists empty
  2 * length(intersect(a, b)) / denom
}

overlap_coefficient <- function(t1, t2) {
  # Szymkiewicz-Simpson overlap coefficient on token sets:
  # |A ∩ B| / min(|A|, |B|).
  #
  # Fix: the denominator is the *smaller* set, not the larger. With min(),
  # a short name fully contained in a longer one (e.g. "apple" vs
  # "apple juice concentrate") scores 1, which is the point of this metric;
  # max() would penalize exactly those subset matches.
  a <- unique(t1)
  b <- unique(t2)
  denom <- min(length(a), length(b))
  if (denom == 0) return(0)  # at least one empty token list
  length(intersect(a, b)) / denom
}

word_match_score <- function(t1, t2) {
  # Percentage of fuzzy token overlap: a t1 token counts as matched when any
  # t2 token is within 10% edit distance of it (base R agrepl), scaled by the
  # longer of the two token lists so extra unmatched words dilute the score.
  #
  # Fixes vs original: sapply() on an empty t1 returned list(), making
  # sum() error; and max(0, 0) in the denominator produced NaN. Empty inputs
  # now return 0, and vapply() guarantees a logical vector.
  n1 <- length(t1)
  n2 <- length(t2)
  if (n1 == 0 || n2 == 0) return(0)
  matched <- vapply(
    t1,
    function(w1) {
      any(vapply(t2, function(w2) agrepl(w1, w2, max.distance = 0.1),
                 logical(1)))
    },
    logical(1)
  )
  100 * sum(matched) / max(n1, n2)
}

##### 6. Calculate All Similarity Scores ----------------------------------------------------------

# Score every FoodEx2 x Agribalyse pair on all five similarity metrics.
# The result list is preallocated to its known final size (n1 * n2): growing
# a list one element at a time with matches[[length(matches) + 1]] forces
# repeated reallocation and is quadratic in the number of pairs.
n_pairs <- length(cleaned_names_db1) * length(cleaned_names_db2)
matches <- vector("list", n_pairs)
k <- 0L

for (i in seq_along(cleaned_names_db1)) {
  t1 <- cleaned_names_db1[[i]]
  for (j in seq_along(cleaned_names_db2)) {
    t2 <- cleaned_names_db2[[j]]

    k <- k + 1L
    matches[[k]] <- list(
      fx2_code = dataset1$L7_FoodEx2_code[i],
      fx2_name = dataset1$L7_FoodEx2_desc[i],
      agb_code = dataset2$agb_code[j],
      agb_name = dataset2$agb_name[j],
      jaro = jaro_winkler_distance(t1, t2),
      lev = levenshtein_distance(t1, t2),
      dice = dice_coefficient(t1, t2),
      overlap = overlap_coefficient(t1, t2),
      wordmatch_score = word_match_score(t1, t2)
    )
  }
}

# One row per candidate pair.
similarity_scores <- bind_rows(matches)

##### 7. Normalize Scores and Calculate Metascore ----------------------------------------------------------

normalize <- function(x) {
  # Min-max scale a numeric vector to [0, 1], ignoring NAs.
  #
  # Fix: a constant (or all-NA) column has zero range, and the original
  # divided by zero, yielding NaN for the whole column and poisoning the
  # metascore sum. Such degenerate columns now map to 0 for every element.
  rng <- range(x, na.rm = TRUE)
  if (!is.finite(rng[1]) || rng[1] == rng[2]) return(rep(0, length(x)))
  (x - rng[1]) / (rng[2] - rng[1])
}
# Put all five metrics on a comparable [0, 1] scale, inverting the two
# distance measures (lower distance = better match) so that higher is
# always better, then sum them into a single composite metascore.
# NOTE(review): normalization is computed across the whole table, so one
# pair's normalized score depends on every other pair — confirm intended.
similarity_scores <- similarity_scores %>%
  mutate(
    inv_norm_jaro  = 1 - normalize(jaro),
    inv_norm_lev   = 1 - normalize(lev),
    norm_dice      = normalize(dice),
    norm_overlap   = normalize(overlap),
    norm_wordmatch = normalize(wordmatch_score)
  ) %>%
  mutate(
    metascore = inv_norm_jaro + inv_norm_lev + norm_dice +
      norm_overlap + norm_wordmatch
  )

##### 8. Rank and Filter Top Matches ----------------------------------------------------------

# Sort all candidate pairs by composite score (arrange() ignores groups by
# default, so sorting first and grouping second gives the same result), then
# number the candidates within each FoodEx2 item: rank 1 = best match.
ranked <- similarity_scores %>%
  arrange(desc(metascore)) %>%
  group_by(fx2_name) %>%
  mutate(rank = row_number()) %>%
  ungroup()

# Split into the five most plausible candidates per item vs the rest.
top5   <- filter(ranked, rank <= 5)
others <- filter(ranked, rank > 5)

##### 9. Export Results ----------------------------------------------------------

# Write each result set to its own workbook in the working directory.
output_files <- list(
  "plausible_matches.xlsx"       = top5,
  "non_plausible_matches.xlsx"   = others,
  "all_matches_with_scores.xlsx" = similarity_scores
)
for (fname in names(output_files)) {
  write.xlsx(output_files[[fname]], file = fname)
}

##### END ----------------------------------------------------------------------