count all variants
Browse files
measures/VocabularyAnalyser.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
import pandas as pd
|
| 2 |
import re
|
| 3 |
import unicodedata
|
| 4 |
-
import weakref
|
| 5 |
|
| 6 |
# --- helper normalization ---
|
| 7 |
def norm_txt(x: str) -> str:
|
|
@@ -17,17 +16,25 @@ class VocabularyAnalyser:
|
|
| 17 |
def __init__(self, glossary_file: str):
|
| 18 |
# Load glossary CSV (first column = base + variants, comma-separated)
|
| 19 |
raw = pd.read_csv(glossary_file)
|
| 20 |
-
terms_col = raw.iloc[:, 0].astype(str).apply(norm_txt)
|
| 21 |
|
| 22 |
gloss_list = []
|
| 23 |
-
for idx,
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
continue
|
| 27 |
-
|
| 28 |
-
|
|
|
|
| 29 |
gloss_list.append({
|
| 30 |
-
"row": idx,
|
| 31 |
"base": base,
|
| 32 |
"form": form,
|
| 33 |
"len": len(form),
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
import re
|
| 3 |
import unicodedata
|
|
|
|
| 4 |
|
| 5 |
# --- helper normalization ---
|
| 6 |
def norm_txt(x: str) -> str:
|
|
|
|
| 16 |
def __init__(self, glossary_file: str):
|
| 17 |
# Load glossary CSV (each row = one term: every non-empty cell may hold comma-separated forms; the first form is the base)
|
| 18 |
raw = pd.read_csv(glossary_file)
|
|
|
|
| 19 |
|
| 20 |
gloss_list = []
|
| 21 |
+
for idx, row in raw.iterrows():
|
| 22 |
+
forms = []
|
| 23 |
+
for cell in row:
|
| 24 |
+
if pd.isna(cell):
|
| 25 |
+
continue
|
| 26 |
+
for chunk in str(cell).split(","):
|
| 27 |
+
token = norm_txt(chunk)
|
| 28 |
+
if token:
|
| 29 |
+
forms.append(token)
|
| 30 |
+
|
| 31 |
+
if not forms:
|
| 32 |
continue
|
| 33 |
+
|
| 34 |
+
base = forms[0]
|
| 35 |
+
for form in forms:
|
| 36 |
gloss_list.append({
|
| 37 |
+
"row": idx + 1,
|
| 38 |
"base": base,
|
| 39 |
"form": form,
|
| 40 |
"len": len(form),
|