ikarasz committed on
Commit
c487058
·
1 Parent(s): 06bc1c7

count all variants

Browse files
Files changed (1) hide show
  1. measures/VocabularyAnalyser.py +15 -8
measures/VocabularyAnalyser.py CHANGED
@@ -1,7 +1,6 @@
1
  import pandas as pd
2
  import re
3
  import unicodedata
4
- import weakref
5
 
6
  # --- helper normalization ---
7
  def norm_txt(x: str) -> str:
@@ -17,17 +16,25 @@ class VocabularyAnalyser:
17
  def __init__(self, glossary_file: str):
18
  # Load glossary CSV (first column = base + variants, comma-separated)
19
  raw = pd.read_csv(glossary_file)
20
- terms_col = raw.iloc[:, 0].astype(str).apply(norm_txt)
21
 
22
  gloss_list = []
23
- for idx, cell in enumerate(terms_col, start=1):
24
- items = [t.strip() for t in cell.split(",") if t.strip()]
25
- if not items:
 
 
 
 
 
 
 
 
26
  continue
27
- base = items[0]
28
- for form in items:
 
29
  gloss_list.append({
30
- "row": idx,
31
  "base": base,
32
  "form": form,
33
  "len": len(form),
 
1
  import pandas as pd
2
  import re
3
  import unicodedata
 
4
 
5
  # --- helper normalization ---
6
  def norm_txt(x: str) -> str:
 
16
  def __init__(self, glossary_file: str):
17
  # Load glossary CSV (first column = base + variants, comma-separated)
18
  raw = pd.read_csv(glossary_file)
 
19
 
20
  gloss_list = []
21
+ for idx, row in raw.iterrows():
22
+ forms = []
23
+ for cell in row:
24
+ if pd.isna(cell):
25
+ continue
26
+ for chunk in str(cell).split(","):
27
+ token = norm_txt(chunk)
28
+ if token:
29
+ forms.append(token)
30
+
31
+ if not forms:
32
  continue
33
+
34
+ base = forms[0]
35
+ for form in forms:
36
  gloss_list.append({
37
+ "row": idx + 1,
38
  "base": base,
39
  "form": form,
40
  "len": len(form),