# (removed extraction artifacts: file-size line, commit hashes, line-number gutter)
import pandas as pd
import re
import unicodedata
# --- helper normalization ---
def norm_txt(x: str) -> str:
    """Normalize text for glossary matching.

    Steps: NFKC-normalize, lowercase, map hyphen/dash variants to spaces,
    then collapse all whitespace runs to single spaces and strip the ends.

    Returns "" for None input.
    """
    if x is None:
        return ""
    x = unicodedata.normalize("NFKC", str(x))
    x = x.lower()
    # Replace hyphen/dash variants with spaces BEFORE collapsing whitespace.
    # Doing it after (as the original did) left runs of multiple spaces for
    # inputs like "a - b" -> "a   b" and a trailing space for "word-".
    x = re.sub(r"[-‐–—]", " ", x)
    x = re.sub(r"\s+", " ", x).strip()
    return x
class VocabularyAnalyser:
    """Match glossary terms (and their variants) inside utterance texts.

    The glossary CSV is read row by row; every non-empty, comma-separated
    chunk in any cell of a row is a surface form, and the first form of the
    row is the canonical "base" term reported for all of that row's forms.
    """

    def __init__(self, glossary_file: str):
        # Load glossary CSV (first column = base + variants, comma-separated)
        raw = pd.read_csv(glossary_file)
        gloss_list = []
        for idx, row in raw.iterrows():
            forms = []
            for cell in row:
                if pd.isna(cell):
                    continue
                for chunk in str(cell).split(","):
                    token = norm_txt(chunk)
                    if token:
                        forms.append(token)
            if not forms:
                continue
            base = forms[0]
            for form in forms:
                gloss_list.append({
                    "row": idx + 1,       # 1-based glossary row number
                    "base": base,
                    "form": form,
                    "len": len(form),
                    "words": len(form.split()),
                })
        # Explicit columns so an empty glossary still yields a well-formed
        # frame: pd.DataFrame([]) has no columns, and drop_duplicates /
        # sort_values below would raise KeyError.
        self.gloss_forms = (
            pd.DataFrame(gloss_list,
                         columns=["row", "base", "form", "len", "words"])
            .drop_duplicates(["base", "form"])
            .sort_values(["words", "len"], ascending=[False, False])
        )
        # Pre-compile one whole-word pattern per form once, instead of
        # re-escaping and re-building it for every utterance matched.
        self._compiled = [
            (re.compile(r"\b" + re.escape(fm) + r"\b"), fm, bs, wd)
            for fm, bs, wd in zip(self.gloss_forms["form"],
                                  self.gloss_forms["base"],
                                  self.gloss_forms["words"])
            if fm
        ]

    def match_one_utterance(self, text: str):
        """Return a sorted list of unique base terms matched in *text*.

        Overlapping candidate matches are resolved greedily: more words
        wins, then longer character span, then earlier start position.
        """
        s = norm_txt(text)
        if not s:
            return []
        locs = []
        for pattern, fm, bs, wd in self._compiled:
            for m in pattern.finditer(s):
                locs.append({
                    "start": m.start(),
                    "end": m.end(),
                    "form": fm,
                    "base": bs,
                    "words": wd,
                    "len": m.end() - m.start(),
                })
        if not locs:
            return []
        # prioritize: more tokens > longer span > earlier start
        locs_df = pd.DataFrame(locs).sort_values(
            ["words", "len", "start"], ascending=[False, False, True]
        )
        used = [False] * len(s)   # per-character "already claimed" flags
        keep_bases = []
        for _, row in locs_df.iterrows():
            rng = range(row["start"], row["end"])
            if not any(used[i] for i in rng):
                keep_bases.append(row["base"])
                for i in rng:
                    used[i] = True
        return sorted(set(keep_bases))

    def run_analysis(self, transcript):
        """Mutate transcript utterances by adding a vocabulary_terms list.

        Returns the same transcript object for call-chaining convenience.
        """
        for utt in transcript.utterances:
            matches = self.match_one_utterance(utt.text)
            utt.vocabulary_terms = matches
        return transcript