| import pandas as pd |
| import re |
| import unicodedata |
| import weakref |
|
|
| |
def norm_txt(x: str) -> str:
    """Normalize text for matching: NFKC, lowercase, dashes→spaces, collapsed whitespace.

    Returns "" for None. Non-string inputs are coerced with str().

    Dash replacement must happen BEFORE whitespace collapsing; the original
    order left multiple/trailing spaces (e.g. "a — b" -> "a   b"), which
    silently broke multi-word term matching.
    """
    if x is None:
        return ""
    x = unicodedata.normalize("NFKC", str(x))
    x = x.lower()
    # Map hyphen/en-dash/em-dash variants to a space first ...
    x = re.sub(r"[-‐–—]", " ", x)
    # ... then collapse all whitespace runs to single spaces and trim.
    x = re.sub(r"\s+", " ", x).strip()
    return x
|
|
class VocabularyAnalyser:
    """Match glossary vocabulary terms against utterance texts.

    The glossary CSV's first column holds comma-separated surface forms per
    row; the first form on each row is treated as the canonical "base" term.
    All forms are normalized with norm_txt before matching.
    """

    def __init__(self, glossary_file: str):
        """Load the glossary and precompile one whole-word regex per form.

        Parameters
        ----------
        glossary_file : str
            Path to a CSV file whose first column contains the terms.
        """
        raw = pd.read_csv(glossary_file)
        terms_col = raw.iloc[:, 0].astype(str).apply(norm_txt)

        gloss_list = []
        for idx, cell in enumerate(terms_col, start=1):
            items = [t.strip() for t in cell.split(",") if t.strip()]
            if not items:
                continue
            base = items[0]  # first listed form is the canonical term
            for form in items:
                gloss_list.append({
                    "row": idx,
                    "base": base,
                    "form": form,
                    "len": len(form),
                    "words": len(form.split()),
                })

        # Pass the schema explicitly: an empty glossary used to crash here
        # because pd.DataFrame([]) has no columns and drop_duplicates then
        # raised KeyError on ["base", "form"].
        columns = ["row", "base", "form", "len", "words"]
        self.gloss_forms = (
            pd.DataFrame(gloss_list, columns=columns)
            .drop_duplicates(["base", "form"])
            .sort_values(["words", "len"], ascending=[False, False])
        )

        # Hoist regex construction out of match_one_utterance: one compiled
        # whole-word pattern per non-empty form, kept in the same
        # (words desc, len desc) order the matcher consumes them in.
        self._compiled = [
            (re.compile(r"\b" + re.escape(fm) + r"\b"), fm, bs, wd)
            for fm, bs, wd in zip(self.gloss_forms["form"],
                                  self.gloss_forms["base"],
                                  self.gloss_forms["words"])
            if fm
        ]

    def match_one_utterance(self, text: str):
        """Return the sorted list of unique base terms matched in *text*.

        Overlap resolution is greedy: candidate matches are ranked by
        (word count desc, char length desc, start position asc) and a match
        is kept only if its span does not overlap an already-kept span, so
        longer multi-word forms win over their sub-terms.
        """
        s = norm_txt(text)
        if not s:
            return []

        locs = []
        for pattern, fm, bs, wd in self._compiled:
            for m in pattern.finditer(s):
                locs.append({
                    "start": m.start(),
                    "end": m.end(),
                    "form": fm,
                    "base": bs,
                    "words": wd,
                    "len": m.end() - m.start(),
                })

        if not locs:
            return []

        # Stable sort with the same key ordering the original expressed via
        # DataFrame.sort_values — no DataFrame round-trip needed.
        locs.sort(key=lambda r: (-r["words"], -r["len"], r["start"]))

        used = [False] * len(s)  # per-character "claimed" flags
        keep_bases = []
        for row in locs:
            span = range(row["start"], row["end"])
            if not any(used[i] for i in span):
                keep_bases.append(row["base"])
                for i in span:
                    used[i] = True

        return sorted(set(keep_bases))

    def run_analysis(self, transcript):
        """Mutate transcript utterances in place, adding a vocabulary_terms list.

        Returns the same transcript object for call-chaining convenience.
        """
        for utt in transcript.utterances:
            utt.vocabulary_terms = self.match_one_utterance(utt.text)
        return transcript