ikarasz committed on
Commit
c487058
·
1 Parent(s): 06bc1c7

count all variants

Browse files
Files changed (1) hide show
  1. measures/VocabularyAnalyser.py +15 -8
measures/VocabularyAnalyser.py CHANGED
@@ -1,7 +1,6 @@
1
  import pandas as pd
2
  import re
3
  import unicodedata
4
- import weakref
5
 
6
  # --- helper normalization ---
7
  def norm_txt(x: str) -> str:
@@ -17,17 +16,25 @@ class VocabularyAnalyser:
17
  def __init__(self, glossary_file: str):
18
  # Load glossary CSV (first column = base + variants, comma-separated)
19
  raw = pd.read_csv(glossary_file)
20
- terms_col = raw.iloc[:, 0].astype(str).apply(norm_txt)
21
 
22
  gloss_list = []
23
- for idx, cell in enumerate(terms_col, start=1):
24
- items = [t.strip() for t in cell.split(",") if t.strip()]
25
- if not items:
 
 
 
 
 
 
 
 
26
  continue
27
- base = items[0]
28
- for form in items:
 
29
  gloss_list.append({
30
- "row": idx,
31
  "base": base,
32
  "form": form,
33
  "len": len(form),
 
1
  import pandas as pd
2
  import re
3
  import unicodedata
 
4
 
5
  # --- helper normalization ---
6
  def norm_txt(x: str) -> str:
 
16
  def __init__(self, glossary_file: str):
17
  # Load glossary CSV (first column = base + variants, comma-separated)
18
  raw = pd.read_csv(glossary_file)
 
19
 
20
  gloss_list = []
21
+ for idx, row in raw.iterrows():
22
+ forms = []
23
+ for cell in row:
24
+ if pd.isna(cell):
25
+ continue
26
+ for chunk in str(cell).split(","):
27
+ token = norm_txt(chunk)
28
+ if token:
29
+ forms.append(token)
30
+
31
+ if not forms:
32
  continue
33
+
34
+ base = forms[0]
35
+ for form in forms:
36
  gloss_list.append({
37
+ "row": idx + 1,
38
  "base": base,
39
  "form": form,
40
  "len": len(form),