|
|
import pandas as pd |
|
|
import re |
|
|
import unicodedata |
|
|
|
|
|
|
|
|
def norm_txt(x: str) -> str:
    """Normalise text for glossary matching.

    Applies NFKC Unicode normalisation, lowercases, converts hyphen/dash
    variants to spaces, and collapses runs of whitespace to single spaces.

    Args:
        x: Raw text (any object coercible to str); ``None`` is allowed.

    Returns:
        The normalised string, or ``""`` when ``x`` is ``None``.
    """
    if x is None:
        return ""
    x = unicodedata.normalize("NFKC", str(x))
    x = x.lower()
    # Replace hyphen/dash variants BEFORE collapsing whitespace; doing it
    # after (as in " a - b " -> "a - b" -> "a   b") leaves multiple internal
    # spaces and stray edge spaces, which breaks exact multi-word matching.
    x = re.sub(r"[-‐–—]", " ", x)
    x = re.sub(r"\s+", " ", x).strip()
    return x
|
|
|
|
|
class VocabularyAnalyser:
    """Match glossary vocabulary terms inside utterance texts.

    The glossary CSV is read row by row: every comma-separated chunk in any
    cell becomes a surface "form" of that row's term, and the first form
    found in the row is its canonical "base" term. Matching is greedy and
    non-overlapping, preferring forms with more words, then more characters.
    """

    # Schema of the internal form table. Declared once so that an empty or
    # all-blank glossary still yields a well-formed (empty) DataFrame instead
    # of a KeyError inside drop_duplicates/sort_values.
    _FORM_COLUMNS = ["row", "base", "form", "len", "words"]

    def __init__(self, glossary_file: str):
        """Load and normalise glossary forms from *glossary_file* (CSV path)."""
        raw = pd.read_csv(glossary_file)

        gloss_list = []
        for idx, row in raw.iterrows():
            # Collect every normalised, non-empty comma-separated chunk in the
            # row; a single cell may hold several alternative surface forms.
            forms = []
            for cell in row:
                if pd.isna(cell):
                    continue
                for chunk in str(cell).split(","):
                    token = norm_txt(chunk)
                    if token:
                        forms.append(token)

            if not forms:
                continue

            base = forms[0]  # first form in the row is the canonical term
            for form in forms:
                gloss_list.append({
                    "row": idx + 1,  # 1-based glossary row number
                    "base": base,
                    "form": form,
                    "len": len(form),
                    "words": len(form.split()),
                })

        # Longest forms first (most words, then most characters) so greedy
        # matching prefers the most specific term.
        self.gloss_forms = (
            pd.DataFrame(gloss_list, columns=self._FORM_COLUMNS)
            .drop_duplicates(["base", "form"])
            .sort_values(["words", "len"], ascending=[False, False])
        )

    def match_one_utterance(self, text: str):
        """Return a sorted list of unique base terms matched in *text*.

        Args:
            text: Raw utterance text; ``None``/empty yields ``[]``.

        Returns:
            Sorted list of canonical base terms whose forms occur in the
            normalised text as whole-token matches.
        """
        s = norm_txt(text)
        if not s:
            return []

        locs = []
        for fm, bs, wd in zip(self.gloss_forms["form"],
                              self.gloss_forms["base"],
                              self.gloss_forms["words"]):
            if not fm:
                continue
            # Use look-arounds instead of \b: \b never matches at the edge of
            # a form that starts/ends with a non-word character (e.g. "c++"),
            # so such forms could never be found. For plain word-character
            # forms the two are equivalent.
            pattern = r"(?<!\w)" + re.escape(fm) + r"(?!\w)"
            for m in re.finditer(pattern, s):
                locs.append({
                    "start": m.start(),
                    "end": m.end(),
                    "form": fm,
                    "base": bs,
                    "words": wd,
                    "len": m.end() - m.start(),
                })

        if not locs:
            return []

        # Prefer longer, then earlier matches; each character of the
        # utterance may be claimed by at most one match.
        locs_df = pd.DataFrame(locs).sort_values(
            ["words", "len", "start"], ascending=[False, False, True]
        )

        used = [False] * len(s)
        keep_bases = []
        for _, row in locs_df.iterrows():
            span = range(row["start"], row["end"])
            if not any(used[i] for i in span):
                keep_bases.append(row["base"])
                for i in span:
                    used[i] = True

        return sorted(set(keep_bases))

    def run_analysis(self, transcript):
        """Attach a ``vocabulary_terms`` list to every utterance.

        Mutates the utterances of *transcript* in place and returns the
        transcript for call-chaining convenience.
        """
        for utt in transcript.utterances:
            utt.vocabulary_terms = self.match_one_utterance(utt.text)
        return transcript