import re
import unicodedata

import pandas as pd

# --- helper normalization ---

# ASCII hyphen-minus plus the Unicode hyphen, en dash and em dash.
_DASH_RE = re.compile(r"[-‐–—]")
_WHITESPACE_RE = re.compile(r"\s+")


def norm_txt(x: str) -> str:
    """Return a canonical lowercase form of *x* for glossary matching.

    Steps: NFKC-normalize, lowercase, turn hyphens/dashes into spaces, then
    collapse every whitespace run into a single space and strip the ends.

    Args:
        x: Any value; it is converted with ``str()``. ``None`` yields ``""``.

    Returns:
        The normalized text, never containing doubled, leading or trailing
        spaces.
    """
    if x is None:
        return ""
    x = unicodedata.normalize("NFKC", str(x)).lower()
    # Dashes must become spaces *before* whitespace is collapsed; otherwise
    # "a - b" ends up as "a   b" and "word-" keeps a trailing space, so the
    # same phrase normalizes differently in the glossary and in the text.
    x = _DASH_RE.sub(" ", x)
    return _WHITESPACE_RE.sub(" ", x).strip()


class VocabularyAnalyser:
    """Tag utterances with the glossary terms they mention.

    Attributes are built once from a glossary CSV:

    * ``gloss_forms`` - DataFrame with one row per distinct (base, form)
      pair and the columns ``row``, ``base``, ``form``, ``len`` (characters)
      and ``words`` (whitespace-separated tokens), ordered so that forms with
      more words, then longer forms, come first.
    """

    # Column layout of ``gloss_forms``; also used when the glossary is empty.
    _FORM_COLUMNS = ["row", "base", "form", "len", "words"]

    def __init__(self, glossary_file: str):
        """Load the glossary and build the table of searchable forms.

        Every non-empty cell of a CSV row is split on commas and each piece
        is normalized with ``norm_txt``. The first resulting form of a row is
        that row's base term; all forms of the row (base included) map to it.

        NOTE: ``pd.read_csv`` is called with its defaults, so the first line
        of the file is consumed as a header row and is not a glossary entry.

        Args:
            glossary_file: Path (or anything ``pd.read_csv`` accepts) of the
                glossary CSV.
        """
        raw = pd.read_csv(glossary_file)

        gloss_list = []
        for idx, row in raw.iterrows():
            forms = []
            for cell in row:
                if pd.isna(cell):
                    continue
                tokens = (norm_txt(chunk) for chunk in str(cell).split(","))
                forms.extend(token for token in tokens if token)
            if not forms:
                continue

            base = forms[0]
            for form in forms:
                gloss_list.append({
                    "row": idx + 1,
                    "base": base,
                    "form": form,
                    "len": len(form),
                    "words": len(form.split()),
                })

        # Explicit columns keep an empty glossary from raising KeyError in
        # drop_duplicates/sort_values (a bare DataFrame([]) has no columns).
        self.gloss_forms = (
            pd.DataFrame(gloss_list, columns=self._FORM_COLUMNS)
            .drop_duplicates(["base", "form"])
            .sort_values(["words", "len"], ascending=[False, False])
        )

    def match_one_utterance(self, text: str):
        """Return the sorted, de-duplicated base terms found in *text*.

        The text is normalized with ``norm_txt`` and every glossary form is
        searched as a whole token sequence. When matches overlap, the one
        with more words wins, then the longer one, then the earlier one; a
        match is dropped if any of its characters is already claimed.

        Args:
            text: Utterance text; ``None`` or blank text yields ``[]``.

        Returns:
            A sorted list of distinct base terms.
        """
        s = norm_txt(text)
        if not s:
            return []

        forms = self.gloss_forms
        spans = []
        for fm, bs, wd in zip(forms["form"], forms["base"], forms["words"]):
            if not fm:
                continue
            # Lookarounds instead of \b: \b needs a word character on one
            # side, so forms that begin or end with punctuation (e.g. "c++")
            # could never match next to a space or the string edge.
            pattern = r"(?<!\w)" + re.escape(fm) + r"(?!\w)"
            for m in re.finditer(pattern, s):
                spans.append((wd, m.end() - m.start(), m.start(), m.end(), bs))

        # prioritize: more tokens > longer span > earlier start
        # (stable sort, so remaining ties keep glossary order)
        spans.sort(key=lambda span: (-span[0], -span[1], span[2]))

        used = [False] * len(s)
        keep_bases = set()
        for _, _, start, end, base in spans:
            if any(used[start:end]):
                continue
            keep_bases.add(base)
            used[start:end] = [True] * (end - start)

        return sorted(keep_bases)

    def run_analysis(self, transcript):
        """Attach matched glossary terms to every utterance of *transcript*.

        Each object in ``transcript.utterances`` gets a ``vocabulary_terms``
        attribute holding the result of ``match_one_utterance`` for its
        ``text``. The transcript is modified in place.

        Args:
            transcript: Object exposing an iterable ``utterances`` whose
                items have a ``text`` attribute.

        Returns:
            The same ``transcript`` object.
        """
        for utt in transcript.utterances:
            utt.vocabulary_terms = self.match_one_utterance(utt.text)
        return transcript