dominic-fischer committed on
Commit
bf60bb7
·
1 Parent(s): c71fb13

update from github upstream (8197d38919dee305eebf595d7224cf7a3da5348a)

Browse files
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Romansh Lemmatizer Demo
3
  emoji: 💻
4
  colorFrom: purple
5
  colorTo: yellow
@@ -12,18 +12,18 @@ pinned: false
12
  license: mit
13
  ---
14
 
15
- # Basic Lemmatizer for Romansh Varieties <span style="color:gray">(Beta)</span>: Demo
16
 
17
- This demo visualises the functionalities of the package "romansh_lemmatizer", available at:
18
 
19
- https://github.com/ZurichNLP/romansh_lemmatizer
20
 
21
  The underlying Python package presents a basic dictionary-based lemmatizer for the Romansh language.
22
- Provided a Romansh text, the lemmatizer splits it into words and looks up each word in the [Pledari Grond](https://pledarigrond.ch/) dictionaries for the five standard Romansh idioms: Sursilvan, Sutsilvan, Surmiran, Puter, and Vallader, as well as the dictionary for Rumantsch Grischun.
23
 
24
  For example, if a Romansh text contains the word _lavuraiva_, the lemmatizer traces the word back to the Vallader and Puter dictionaries:
25
 
26
- <img src="https://huggingface.co/spaces/ZurichNLP/romansh-lemmatizer/resolve/main/illustration.png"
27
  alt="illustration" width="400"/>
28
 
29
  Typical use cases for the lemmatizer include:
 
1
  ---
2
+ title: RUMLEM: Romansh Lemmatizer Demo
3
  emoji: 💻
4
  colorFrom: purple
5
  colorTo: yellow
 
12
  license: mit
13
  ---
14
 
15
+ # Dictionary-Based Lemmatizer for Romansh Varieties <span style="color:gray">(Beta)</span>: Demo
16
 
17
+ This demo visualises the functionalities of the package "rumlem", available at:
18
 
19
+ https://github.com/ZurichNLP/rumlem
20
 
21
  The underlying Python package presents a basic dictionary-based lemmatizer for the Romansh language.
22
+ Provided a Romansh text, the lemmatizer splits it into words and looks up each word in the [Pledari Grond](https://pledarigrond.ch/) dictionaries of the five primary Romansh idioms Sursilvan, Sutsilvan, Surmiran, Puter and Vallader, as well as in the dictionary of the standard variety Rumantsch Grischun.
23
 
24
  For example, if a Romansh text contains the word _lavuraiva_, the lemmatizer traces the word back to the Vallader and Puter dictionaries:
25
 
26
+ <img src="https://huggingface.co/spaces/ZurichNLP/rumlem/resolve/main/illustration.png"
27
  alt="illustration" width="400"/>
28
 
29
  Typical use cases for the lemmatizer include:
lemmatizer/__init__.py CHANGED
@@ -1 +1,3 @@
1
- from lemmatizer.lemmatizer import Lemmatizer
 
 
 
1
+ from .lemmatizer import Lemmatizer, Doc, Idiom, Lemma, MorphAnalysis
2
+
3
+ __all__ = ["Lemmatizer", "Doc", "Idiom", "Lemma", "MorphAnalysis"]
lemmatizer/__pycache__/__init__.cpython-311.pyc CHANGED
Binary files a/lemmatizer/__pycache__/__init__.cpython-311.pyc and b/lemmatizer/__pycache__/__init__.cpython-311.pyc differ
 
lemmatizer/__pycache__/analyzer.cpython-311.pyc CHANGED
Binary files a/lemmatizer/__pycache__/analyzer.cpython-311.pyc and b/lemmatizer/__pycache__/analyzer.cpython-311.pyc differ
 
lemmatizer/__pycache__/edittree.cpython-311.pyc CHANGED
Binary files a/lemmatizer/__pycache__/edittree.cpython-311.pyc and b/lemmatizer/__pycache__/edittree.cpython-311.pyc differ
 
lemmatizer/__pycache__/idiom_id.cpython-311.pyc CHANGED
Binary files a/lemmatizer/__pycache__/idiom_id.cpython-311.pyc and b/lemmatizer/__pycache__/idiom_id.cpython-311.pyc differ
 
lemmatizer/__pycache__/lemmatizer.cpython-311.pyc CHANGED
Binary files a/lemmatizer/__pycache__/lemmatizer.cpython-311.pyc and b/lemmatizer/__pycache__/lemmatizer.cpython-311.pyc differ
 
lemmatizer/__pycache__/tokenizer.cpython-311.pyc CHANGED
Binary files a/lemmatizer/__pycache__/tokenizer.cpython-311.pyc and b/lemmatizer/__pycache__/tokenizer.cpython-311.pyc differ
 
lemmatizer/__pycache__/utils.cpython-311.pyc CHANGED
Binary files a/lemmatizer/__pycache__/utils.cpython-311.pyc and b/lemmatizer/__pycache__/utils.cpython-311.pyc differ
 
lemmatizer/analyzer.py CHANGED
@@ -2,9 +2,8 @@ import json
2
  from pathlib import Path
3
  import pickle
4
  import sys
5
-
6
- from jiwer import cer
7
-
8
  import lemmatizer.edittree as edittree
9
 
10
  BASE_DIR = Path(__file__).parent
@@ -33,20 +32,23 @@ class Analyzer:
33
  with open(json_path, "r", encoding="utf-8") as f:
34
  self.dict = json.load(f)
35
 
36
- lem = []
37
- for v in self.dict.values():
38
- lem += v["lemma"]
39
 
40
- self.lemma = lem
41
- if self.learned_et:
42
- self.edit_trees = []
43
 
 
 
44
  for pos in "noun", "adj", "verb":
45
  et_path = BASE_DIR / "edit_trees" / f"{self.idiom}" / f"{pos}" / "et.txt"
46
- sys.modules["edittree"] = edittree
47
  with open(et_path, "rb") as f:
48
  self.edit_trees += pickle.load(f)
49
 
 
 
 
 
 
50
  self.in_voc = in_voc
51
 
52
  other_de_path = BASE_DIR / "other_de" / f"{self.idiom}.json"
@@ -54,70 +56,82 @@ class Analyzer:
54
  with open(other_de_path, "r", encoding="utf-8") as f:
55
  self.other_de = json.load(f)
56
 
57
- def get_lemma(self, tok: str):
58
- """Obtain lemma through table look up; backs off
59
- to unsupervised edit tree rules if no lemma found
60
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  tok = tok.lower().strip()
62
  entry = self.dict.get(tok)
63
- entry_ls = []
64
- if entry:
65
- entry_ls.extend(entry["lemma"])
66
- if tok in self.other_de: # Augment the results with other_de entries
67
- amount = len(self.other_de[tok]) # we need to add the lemma as many times as there are de translations for correct zipping later
68
- tok_ls = [tok] * amount
69
- entry_ls.extend(tok_ls)
70
- if entry_ls:
71
- return entry_ls
72
-
73
- # Check if there's a lemma from the edit trees
74
- if self.learned_et:
75
- et_out = self._et_lemma(tok)
76
- if et_out:
77
- return [et_out]
78
 
79
- # Assume the token is a lemma
80
- return [tok] if tok in self.in_voc else [None]
81
 
82
- def _et_lemma(self, tok: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  candidates = []
84
 
85
- for et_pack in self.edit_trees:
86
- et = et_pack["et"]
87
- out = et.apply(tok)
88
-
89
- if out != -1:
90
- candidates.append(out)
91
-
92
- strong = [c for c in candidates if c in self.lemma]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
  if len(strong) > 1:
95
- # Choose the candidate with the lowest edit distance to the tok:
96
- dist = {}
97
- for c in strong:
98
- dist[c] = cer(tok, c)
99
- out = min(dist, key=dist.get)
100
- return out if out in self.in_voc else None
101
-
102
- return strong[0] if strong and strong[0] in self.in_voc else None
103
-
104
- def get_unimorph(self, tok: str):
105
- """Obtain Unimorph annotation for N, V, and ADJ
106
- in the Pledari Grond Dict"""
107
- tok = tok.lower().strip()
108
- entry = self.dict.get(tok)
109
- if entry:
110
- return entry["unimorph"]
111
- return [None]
112
-
113
- def get_de(self, tok: str):
114
- """Obtain the German word corresponding to Romansh terms in the Pledari Grond Dict"""
115
- tok = tok.lower().strip()
116
- entry = self.dict.get(tok)
117
- entry_ls = []
118
- if entry:
119
- entry_ls.extend(entry["DStichwort"])
120
- if tok in self.other_de: # Augment the results with other_de entries
121
- entry_ls.extend(self.other_de[tok])
122
- return entry_ls if entry_ls else [None]
123
 
 
 
 
2
  from pathlib import Path
3
  import pickle
4
  import sys
5
+ from rapidfuzz.distance import Levenshtein
6
+ from collections import defaultdict
 
7
  import lemmatizer.edittree as edittree
8
 
9
  BASE_DIR = Path(__file__).parent
 
32
  with open(json_path, "r", encoding="utf-8") as f:
33
  self.dict = json.load(f)
34
 
35
+ self.lemma = {lemma for v in self.dict.values() for lemma in v["lemma"]}
 
 
36
 
37
+ self.edit_trees = []
38
+ self.et_suffix_index = defaultdict(list)
 
39
 
40
+ if self.learned_et:
41
+ sys.modules["edittree"] = edittree
42
  for pos in "noun", "adj", "verb":
43
  et_path = BASE_DIR / "edit_trees" / f"{self.idiom}" / f"{pos}" / "et.txt"
 
44
  with open(et_path, "rb") as f:
45
  self.edit_trees += pickle.load(f)
46
 
47
+ for et_pack in self.edit_trees:
48
+ suffix = self._get_expected_suffix(et_pack)
49
+ if suffix is not None:
50
+ self.et_suffix_index[suffix].append(et_pack)
51
+
52
  self.in_voc = in_voc
53
 
54
  other_de_path = BASE_DIR / "other_de" / f"{self.idiom}.json"
 
56
  with open(other_de_path, "r", encoding="utf-8") as f:
57
  self.other_de = json.load(f)
58
 
59
+ self.lemma_by_pos = defaultdict(set)
60
+ for v in self.dict.values():
61
+ for lemma, unimorph in zip(v["lemma"], v["unimorph"]):
62
+ if unimorph:
63
+ pos = unimorph.split(";")[0] # "N", "V", "ADJ", "V.PTCP"
64
+ self.lemma_by_pos[pos].add(lemma)
65
+
66
+ def _get_expected_suffix(self, et_pack) -> str | None:
67
+ """Recurse down the right spine to find the expected suffix."""
68
+ node = et_pack["et"]
69
+ while node is not None:
70
+ if isinstance(node.val[0], str): # leaf node
71
+ return node.val[0]
72
+ node = node.right
73
+ return None
74
+
75
+ def analyze(self, tok: str):
76
  tok = tok.lower().strip()
77
  entry = self.dict.get(tok)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
+ lemmas, de_list, unimorph_list = [], [], []
 
80
 
81
+ if entry:
82
+ lemmas.extend(entry["lemma"])
83
+ de_list.extend(entry["DStichwort"])
84
+ unimorph_list.extend(entry["unimorph"])
85
+
86
+ if tok in self.other_de:
87
+ amount = len(self.other_de[tok])
88
+ lemmas.extend([tok] * amount)
89
+ de_list.extend(self.other_de[tok])
90
+ unimorph_list.extend([None] * amount)
91
+
92
+ if lemmas:
93
+ return lemmas, de_list, unimorph_list
94
+
95
+ if tok in self.in_voc:
96
+ return [tok], [None], [None]
97
+ return [None], [None], [None]
98
+
99
+ def _et_analyze(self, tok: str):
100
  candidates = []
101
 
102
+ for suffix, packs in self.et_suffix_index.items():
103
+ if tok.endswith(suffix):
104
+ for et_pack in packs:
105
+ out = et_pack["et"].apply(tok)
106
+ if out != -1:
107
+ candidates.append((out, et_pack.get("majority_tag")))
108
+
109
+ strong_tagged = []
110
+ strong_untagged = []
111
+
112
+ for c, tag in candidates:
113
+ if c not in self.lemma:
114
+ continue
115
+ if tag:
116
+ pos = tag.split(";")[0]
117
+ pos_lemmas = self.lemma_by_pos.get(pos, self.lemma)
118
+ if c in pos_lemmas:
119
+ strong_tagged.append((c, tag))
120
+ else:
121
+ if len(c) <= len(tok): # lemma can't be longer than inflected form
122
+ strong_untagged.append((c, tag))
123
+
124
+ # Prefer tagged candidates; only use untagged if nothing else found
125
+ strong = strong_tagged if strong_tagged else strong_untagged
126
+
127
+ if not strong:
128
+ return None, None
129
 
130
  if len(strong) > 1:
131
+ dist = {c: Levenshtein.normalized_distance(tok, c) for c, _ in strong}
132
+ best = min(dist, key=dist.get)
133
+ match = next((c, tag) for c, tag in strong if c == best)
134
+ return match if match[0] in self.in_voc else (None, None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
+ c, tag = strong[0]
137
+ return (c, tag) if c in self.in_voc else (None, None)
lemmatizer/in_voc/rm-puter.txt CHANGED
The diff for this file is too large to render. See raw diff
 
lemmatizer/in_voc/rm-rumgr.txt CHANGED
The diff for this file is too large to render. See raw diff
 
lemmatizer/in_voc/rm-surmiran.txt CHANGED
The diff for this file is too large to render. See raw diff
 
lemmatizer/in_voc/rm-sursilv.txt CHANGED
The diff for this file is too large to render. See raw diff
 
lemmatizer/in_voc/rm-sutsilv.txt CHANGED
The diff for this file is too large to render. See raw diff
 
lemmatizer/in_voc/rm-vallader.txt CHANGED
The diff for this file is too large to render. See raw diff
 
lemmatizer/lemmatizer.py CHANGED
@@ -157,28 +157,47 @@ class Lemmatizer:
157
 
158
  def __call__(self, text: str) -> Doc:
159
  # Tokenize the text
 
160
  toks = self.tokenizer.tokenize(text)
161
-
162
  tok_obj = []
163
 
 
 
164
  for t in toks:
165
- full_lemma = defaultdict(list)
166
  t_lower = t.lower()
167
- from itertools import zip_longest
168
-
169
  for idiom in Idiom:
170
- a = self._analyzers[idiom]
171
- lemma = a.get_lemma(t_lower)
172
- de = a.get_de(t_lower)
173
- unimorph = a.get_unimorph(t_lower)
174
-
175
- # safer: align lists of possibly different length
176
- for l, d, u in zip_longest(lemma, de, unimorph, fillvalue=None):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  if l:
178
  lem = Lemma(idiom, l, d if d else "null")
179
  analysis = MorphAnalysis(get_features(u))
180
  full_lemma[lem].append(analysis)
181
-
182
  tok_obj.append(Token(t, full_lemma, self.idiom))
183
 
184
- return Doc(text, tok_obj, self.in_voc, self.idiom,)
 
157
 
158
  def __call__(self, text: str) -> Doc:
159
  # Tokenize the text
160
+ from itertools import zip_longest
161
  toks = self.tokenizer.tokenize(text)
 
162
  tok_obj = []
163
 
164
+ # First pass: dict only, all idioms
165
+ first_pass = {} # tok -> {idiom: (lemmas, de, unimorph)}
166
  for t in toks:
 
167
  t_lower = t.lower()
168
+ first_pass[t_lower] = {}
 
169
  for idiom in Idiom:
170
+ result = self._analyzers[idiom].analyze(t_lower)
171
+ if result[0] != [None]:
172
+ first_pass[t_lower][idiom] = result
173
+
174
+ # Detect idiom
175
+ if self.idiom:
176
+ best_idiom = self.idiom
177
+ else:
178
+ scores = get_scores([t.lower() for t in toks], self.in_voc)
179
+ best_idiom = max(scores, key=scores.get)
180
+
181
+ # Second pass: ET for unresolved tokens, best idiom only
182
+ best_analyzer = self._analyzers[best_idiom]
183
+ for t_lower, idiom_results in first_pass.items():
184
+ result = idiom_results.get(best_idiom)
185
+ # Run ET if no result, or if the only result is the token itself (in_voc fallback)
186
+ if result is None or result == ([t_lower], [None], [None]):
187
+ et_lemma, et_tag = best_analyzer._et_analyze(t_lower)
188
+ if et_lemma:
189
+ idiom_results[best_idiom] = ([et_lemma], [None], [et_tag])
190
+
191
+ # Build token objects
192
+ for t in toks:
193
+ t_lower = t.lower()
194
+ full_lemma = defaultdict(list)
195
+ for idiom, (lemmas, de_list, unimorph_list) in first_pass[t_lower].items():
196
+ for l, d, u in zip_longest(lemmas, de_list, unimorph_list, fillvalue=None):
197
  if l:
198
  lem = Lemma(idiom, l, d if d else "null")
199
  analysis = MorphAnalysis(get_features(u))
200
  full_lemma[lem].append(analysis)
 
201
  tok_obj.append(Token(t, full_lemma, self.idiom))
202
 
203
+ return Doc(text, tok_obj, self.in_voc, self.idiom)