Commit ·
bf60bb7
1
Parent(s): c71fb13
update from github upstream (8197d38919dee305eebf595d7224cf7a3da5348a)
Browse files- README.md +6 -6
- lemmatizer/__init__.py +3 -1
- lemmatizer/__pycache__/__init__.cpython-311.pyc +0 -0
- lemmatizer/__pycache__/analyzer.cpython-311.pyc +0 -0
- lemmatizer/__pycache__/edittree.cpython-311.pyc +0 -0
- lemmatizer/__pycache__/idiom_id.cpython-311.pyc +0 -0
- lemmatizer/__pycache__/lemmatizer.cpython-311.pyc +0 -0
- lemmatizer/__pycache__/tokenizer.cpython-311.pyc +0 -0
- lemmatizer/__pycache__/utils.cpython-311.pyc +0 -0
- lemmatizer/analyzer.py +82 -68
- lemmatizer/in_voc/rm-puter.txt +0 -0
- lemmatizer/in_voc/rm-rumgr.txt +0 -0
- lemmatizer/in_voc/rm-surmiran.txt +0 -0
- lemmatizer/in_voc/rm-sursilv.txt +0 -0
- lemmatizer/in_voc/rm-sutsilv.txt +0 -0
- lemmatizer/in_voc/rm-vallader.txt +0 -0
- lemmatizer/lemmatizer.py +32 -13
README.md
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
---
|
| 2 |
-
title: Romansh Lemmatizer Demo
|
| 3 |
emoji: 💻
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: yellow
|
|
@@ -12,18 +12,18 @@ pinned: false
|
|
| 12 |
license: mit
|
| 13 |
---
|
| 14 |
|
| 15 |
-
#
|
| 16 |
|
| 17 |
-
This demo visualises the functionalities of the package "
|
| 18 |
|
| 19 |
-
https://github.com/ZurichNLP/
|
| 20 |
|
| 21 |
The underlying Python package presents a basic dictionary-based lemmatizer for the Romansh language.
|
| 22 |
-
Provided a Romansh text, the lemmatizer splits it into words and looks up each word in the [Pledari Grond](https://pledarigrond.ch/) dictionaries
|
| 23 |
|
| 24 |
For example, if a Romansh text contains the word _lavuraiva_, the lemmatizer traces the word back to the Vallader and Puter dictionaries:
|
| 25 |
|
| 26 |
-
<img src="https://huggingface.co/spaces/ZurichNLP/
|
| 27 |
alt="illustration" width="400"/>
|
| 28 |
|
| 29 |
Typical use cases for the lemmatizer include:
|
|
|
|
| 1 |
---
|
| 2 |
+
title: RUMLEM: Romansh Lemmatizer Demo
|
| 3 |
emoji: 💻
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: yellow
|
|
|
|
| 12 |
license: mit
|
| 13 |
---
|
| 14 |
|
| 15 |
+
# Dictionary-Based Lemmatizer for Romansh Varieties <span style="color:gray">(Beta)</span>: Demo
|
| 16 |
|
| 17 |
+
This demo visualises the functionalities of the package "rumlem", available at:
|
| 18 |
|
| 19 |
+
https://github.com/ZurichNLP/rumlem
|
| 20 |
|
| 21 |
The underlying Python package presents a basic dictionary-based lemmatizer for the Romansh language.
|
| 22 |
+
Provided a Romansh text, the lemmatizer splits it into words and looks up each word in the [Pledari Grond](https://pledarigrond.ch/) dictionaries of the five primary Romansh idioms Sursilvan, Sutsilvan, Surmiran, Puter and Vallader, as well as in the dictionary of the standard variety Rumantsch Grischun.
|
| 23 |
|
| 24 |
For example, if a Romansh text contains the word _lavuraiva_, the lemmatizer traces the word back to the Vallader and Puter dictionaries:
|
| 25 |
|
| 26 |
+
<img src="https://huggingface.co/spaces/ZurichNLP/rumlem/resolve/main/illustration.png"
|
| 27 |
alt="illustration" width="400"/>
|
| 28 |
|
| 29 |
Typical use cases for the lemmatizer include:
|
lemmatizer/__init__.py
CHANGED
|
@@ -1 +1,3 @@
|
|
| 1 |
-
from
|
|
|
|
|
|
|
|
|
| 1 |
+
from .lemmatizer import Lemmatizer, Doc, Idiom, Lemma, MorphAnalysis
|
| 2 |
+
|
| 3 |
+
__all__ = ["Lemmatizer", "Doc", "Idiom", "Lemma", "MorphAnalysis"]
|
lemmatizer/__pycache__/__init__.cpython-311.pyc
CHANGED
|
Binary files a/lemmatizer/__pycache__/__init__.cpython-311.pyc and b/lemmatizer/__pycache__/__init__.cpython-311.pyc differ
|
|
|
lemmatizer/__pycache__/analyzer.cpython-311.pyc
CHANGED
|
Binary files a/lemmatizer/__pycache__/analyzer.cpython-311.pyc and b/lemmatizer/__pycache__/analyzer.cpython-311.pyc differ
|
|
|
lemmatizer/__pycache__/edittree.cpython-311.pyc
CHANGED
|
Binary files a/lemmatizer/__pycache__/edittree.cpython-311.pyc and b/lemmatizer/__pycache__/edittree.cpython-311.pyc differ
|
|
|
lemmatizer/__pycache__/idiom_id.cpython-311.pyc
CHANGED
|
Binary files a/lemmatizer/__pycache__/idiom_id.cpython-311.pyc and b/lemmatizer/__pycache__/idiom_id.cpython-311.pyc differ
|
|
|
lemmatizer/__pycache__/lemmatizer.cpython-311.pyc
CHANGED
|
Binary files a/lemmatizer/__pycache__/lemmatizer.cpython-311.pyc and b/lemmatizer/__pycache__/lemmatizer.cpython-311.pyc differ
|
|
|
lemmatizer/__pycache__/tokenizer.cpython-311.pyc
CHANGED
|
Binary files a/lemmatizer/__pycache__/tokenizer.cpython-311.pyc and b/lemmatizer/__pycache__/tokenizer.cpython-311.pyc differ
|
|
|
lemmatizer/__pycache__/utils.cpython-311.pyc
CHANGED
|
Binary files a/lemmatizer/__pycache__/utils.cpython-311.pyc and b/lemmatizer/__pycache__/utils.cpython-311.pyc differ
|
|
|
lemmatizer/analyzer.py
CHANGED
|
@@ -2,9 +2,8 @@ import json
|
|
| 2 |
from pathlib import Path
|
| 3 |
import pickle
|
| 4 |
import sys
|
| 5 |
-
|
| 6 |
-
from
|
| 7 |
-
|
| 8 |
import lemmatizer.edittree as edittree
|
| 9 |
|
| 10 |
BASE_DIR = Path(__file__).parent
|
|
@@ -33,20 +32,23 @@ class Analyzer:
|
|
| 33 |
with open(json_path, "r", encoding="utf-8") as f:
|
| 34 |
self.dict = json.load(f)
|
| 35 |
|
| 36 |
-
|
| 37 |
-
for v in self.dict.values():
|
| 38 |
-
lem += v["lemma"]
|
| 39 |
|
| 40 |
-
self.
|
| 41 |
-
|
| 42 |
-
self.edit_trees = []
|
| 43 |
|
|
|
|
|
|
|
| 44 |
for pos in "noun", "adj", "verb":
|
| 45 |
et_path = BASE_DIR / "edit_trees" / f"{self.idiom}" / f"{pos}" / "et.txt"
|
| 46 |
-
sys.modules["edittree"] = edittree
|
| 47 |
with open(et_path, "rb") as f:
|
| 48 |
self.edit_trees += pickle.load(f)
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
self.in_voc = in_voc
|
| 51 |
|
| 52 |
other_de_path = BASE_DIR / "other_de" / f"{self.idiom}.json"
|
|
@@ -54,70 +56,82 @@ class Analyzer:
|
|
| 54 |
with open(other_de_path, "r", encoding="utf-8") as f:
|
| 55 |
self.other_de = json.load(f)
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
tok = tok.lower().strip()
|
| 62 |
entry = self.dict.get(tok)
|
| 63 |
-
entry_ls = []
|
| 64 |
-
if entry:
|
| 65 |
-
entry_ls.extend(entry["lemma"])
|
| 66 |
-
if tok in self.other_de: # Augment the results with other_de entries
|
| 67 |
-
amount = len(self.other_de[tok]) # we need to add the lemma as many times as there are de translations for correct zipping later
|
| 68 |
-
tok_ls = [tok] * amount
|
| 69 |
-
entry_ls.extend(tok_ls)
|
| 70 |
-
if entry_ls:
|
| 71 |
-
return entry_ls
|
| 72 |
-
|
| 73 |
-
# Check if there's a lemma from the edit trees
|
| 74 |
-
if self.learned_et:
|
| 75 |
-
et_out = self._et_lemma(tok)
|
| 76 |
-
if et_out:
|
| 77 |
-
return [et_out]
|
| 78 |
|
| 79 |
-
|
| 80 |
-
return [tok] if tok in self.in_voc else [None]
|
| 81 |
|
| 82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
candidates = []
|
| 84 |
|
| 85 |
-
for
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
if len(strong) > 1:
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
for c in strong
|
| 98 |
-
|
| 99 |
-
out = min(dist, key=dist.get)
|
| 100 |
-
return out if out in self.in_voc else None
|
| 101 |
-
|
| 102 |
-
return strong[0] if strong and strong[0] in self.in_voc else None
|
| 103 |
-
|
| 104 |
-
def get_unimorph(self, tok: str):
    """Return the UniMorph annotations stored for a token (N, V and ADJ
    entries of the Pledari Grond dictionary).

    The token is lower-cased and stripped before it is looked up in
    ``self.dict``. When no (non-empty) entry exists, ``[None]`` is returned.
    """
    record = self.dict.get(tok.lower().strip())
    return record["unimorph"] if record else [None]
|
| 112 |
-
|
| 113 |
-
def get_de(self, tok: str):
    """Collect the German equivalents recorded for a Romansh token.

    The lower-cased, stripped token is looked up in ``self.dict`` (field
    ``"DStichwort"``) and in ``self.other_de``; both result lists are
    concatenated in that order. ``[None]`` is returned when nothing is found.
    """
    key = tok.lower().strip()
    translations = []

    record = self.dict.get(key)
    if record:
        translations += record["DStichwort"]

    # Augment the dictionary hits with the additional other_de entries.
    if key in self.other_de:
        translations += self.other_de[key]

    return translations or [None]
|
| 123 |
|
|
|
|
|
|
|
|
|
| 2 |
from pathlib import Path
|
| 3 |
import pickle
|
| 4 |
import sys
|
| 5 |
+
from rapidfuzz.distance import Levenshtein
|
| 6 |
+
from collections import defaultdict
|
|
|
|
| 7 |
import lemmatizer.edittree as edittree
|
| 8 |
|
| 9 |
BASE_DIR = Path(__file__).parent
|
|
|
|
| 32 |
with open(json_path, "r", encoding="utf-8") as f:
|
| 33 |
self.dict = json.load(f)
|
| 34 |
|
| 35 |
+
self.lemma = {lemma for v in self.dict.values() for lemma in v["lemma"]}
|
|
|
|
|
|
|
| 36 |
|
| 37 |
+
self.edit_trees = []
|
| 38 |
+
self.et_suffix_index = defaultdict(list)
|
|
|
|
| 39 |
|
| 40 |
+
if self.learned_et:
|
| 41 |
+
sys.modules["edittree"] = edittree
|
| 42 |
for pos in "noun", "adj", "verb":
|
| 43 |
et_path = BASE_DIR / "edit_trees" / f"{self.idiom}" / f"{pos}" / "et.txt"
|
|
|
|
| 44 |
with open(et_path, "rb") as f:
|
| 45 |
self.edit_trees += pickle.load(f)
|
| 46 |
|
| 47 |
+
for et_pack in self.edit_trees:
|
| 48 |
+
suffix = self._get_expected_suffix(et_pack)
|
| 49 |
+
if suffix is not None:
|
| 50 |
+
self.et_suffix_index[suffix].append(et_pack)
|
| 51 |
+
|
| 52 |
self.in_voc = in_voc
|
| 53 |
|
| 54 |
other_de_path = BASE_DIR / "other_de" / f"{self.idiom}.json"
|
|
|
|
| 56 |
with open(other_de_path, "r", encoding="utf-8") as f:
|
| 57 |
self.other_de = json.load(f)
|
| 58 |
|
| 59 |
+
self.lemma_by_pos = defaultdict(set)
|
| 60 |
+
for v in self.dict.values():
|
| 61 |
+
for lemma, unimorph in zip(v["lemma"], v["unimorph"]):
|
| 62 |
+
if unimorph:
|
| 63 |
+
pos = unimorph.split(";")[0] # "N", "V", "ADJ", "V.PTCP"
|
| 64 |
+
self.lemma_by_pos[pos].add(lemma)
|
| 65 |
+
|
| 66 |
+
def _get_expected_suffix(self, et_pack) -> str | None:
|
| 67 |
+
"""Recurse down the right spine to find the expected suffix."""
|
| 68 |
+
node = et_pack["et"]
|
| 69 |
+
while node is not None:
|
| 70 |
+
if isinstance(node.val[0], str): # leaf node
|
| 71 |
+
return node.val[0]
|
| 72 |
+
node = node.right
|
| 73 |
+
return None
|
| 74 |
+
|
| 75 |
+
def analyze(self, tok: str):
    """Dictionary-only analysis of a single token.

    The token is lower-cased and stripped, then looked up in ``self.dict``
    and ``self.other_de``.

    Returns a 3-tuple of parallel lists ``(lemmas, de_list, unimorph_list)``:

    * dictionary hits contribute the entry's ``"lemma"``, ``"DStichwort"``
      and ``"unimorph"`` lists;
    * each ``other_de`` translation contributes the token itself as lemma
      and ``None`` as tag;
    * with no hit at all, ``([tok], [None], [None])`` is returned when the
      token is in ``self.in_voc``, otherwise ``([None], [None], [None])``.
    """
    key = tok.lower().strip()
    lemmas, de_list, unimorph_list = [], [], []

    record = self.dict.get(key)
    if record:
        lemmas += record["lemma"]
        de_list += record["DStichwort"]
        unimorph_list += record["unimorph"]

    if key in self.other_de:
        extra_de = self.other_de[key]
        count = len(extra_de)
        # One lemma/tag slot per extra translation keeps the three lists aligned.
        lemmas += [key] * count
        de_list += extra_de
        unimorph_list += [None] * count

    if not lemmas:
        fallback = key if key in self.in_voc else None
        return [fallback], [None], [None]

    return lemmas, de_list, unimorph_list
|
| 98 |
+
|
| 99 |
+
def _et_analyze(self, tok: str):
|
| 100 |
candidates = []
|
| 101 |
|
| 102 |
+
for suffix, packs in self.et_suffix_index.items():
|
| 103 |
+
if tok.endswith(suffix):
|
| 104 |
+
for et_pack in packs:
|
| 105 |
+
out = et_pack["et"].apply(tok)
|
| 106 |
+
if out != -1:
|
| 107 |
+
candidates.append((out, et_pack.get("majority_tag")))
|
| 108 |
+
|
| 109 |
+
strong_tagged = []
|
| 110 |
+
strong_untagged = []
|
| 111 |
+
|
| 112 |
+
for c, tag in candidates:
|
| 113 |
+
if c not in self.lemma:
|
| 114 |
+
continue
|
| 115 |
+
if tag:
|
| 116 |
+
pos = tag.split(";")[0]
|
| 117 |
+
pos_lemmas = self.lemma_by_pos.get(pos, self.lemma)
|
| 118 |
+
if c in pos_lemmas:
|
| 119 |
+
strong_tagged.append((c, tag))
|
| 120 |
+
else:
|
| 121 |
+
if len(c) <= len(tok): # lemma can't be longer than inflected form
|
| 122 |
+
strong_untagged.append((c, tag))
|
| 123 |
+
|
| 124 |
+
# Prefer tagged candidates; only use untagged if nothing else found
|
| 125 |
+
strong = strong_tagged if strong_tagged else strong_untagged
|
| 126 |
+
|
| 127 |
+
if not strong:
|
| 128 |
+
return None, None
|
| 129 |
|
| 130 |
if len(strong) > 1:
|
| 131 |
+
dist = {c: Levenshtein.normalized_distance(tok, c) for c, _ in strong}
|
| 132 |
+
best = min(dist, key=dist.get)
|
| 133 |
+
match = next((c, tag) for c, tag in strong if c == best)
|
| 134 |
+
return match if match[0] in self.in_voc else (None, None)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
+
c, tag = strong[0]
|
| 137 |
+
return (c, tag) if c in self.in_voc else (None, None)
|
lemmatizer/in_voc/rm-puter.txt
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
lemmatizer/in_voc/rm-rumgr.txt
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
lemmatizer/in_voc/rm-surmiran.txt
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
lemmatizer/in_voc/rm-sursilv.txt
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
lemmatizer/in_voc/rm-sutsilv.txt
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
lemmatizer/in_voc/rm-vallader.txt
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
lemmatizer/lemmatizer.py
CHANGED
|
@@ -157,28 +157,47 @@ class Lemmatizer:
|
|
| 157 |
|
| 158 |
def __call__(self, text: str) -> Doc:
|
| 159 |
# Tokenize the text
|
|
|
|
| 160 |
toks = self.tokenizer.tokenize(text)
|
| 161 |
-
|
| 162 |
tok_obj = []
|
| 163 |
|
|
|
|
|
|
|
| 164 |
for t in toks:
|
| 165 |
-
full_lemma = defaultdict(list)
|
| 166 |
t_lower = t.lower()
|
| 167 |
-
|
| 168 |
-
|
| 169 |
for idiom in Idiom:
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
if l:
|
| 178 |
lem = Lemma(idiom, l, d if d else "null")
|
| 179 |
analysis = MorphAnalysis(get_features(u))
|
| 180 |
full_lemma[lem].append(analysis)
|
| 181 |
-
|
| 182 |
tok_obj.append(Token(t, full_lemma, self.idiom))
|
| 183 |
|
| 184 |
-
return Doc(text, tok_obj, self.in_voc, self.idiom
|
|
|
|
| 157 |
|
| 158 |
def __call__(self, text: str) -> Doc:
    """Tokenize ``text`` and lemmatize every token.

    Processing happens in three steps:

    1. Dictionary pass: each distinct lower-cased token is analyzed with the
       analyzer of every idiom; analyses whose lemma list is ``[None]`` are
       dropped.
    2. Edit-tree pass: for the target idiom (``self.idiom`` if set, otherwise
       the highest-scoring key returned by ``get_scores``), tokens with no
       dictionary result, or only the "token itself" fallback, are passed to
       that analyzer's ``_et_analyze``.
    3. One ``Token`` is built per original token from the collected results.

    Returns a ``Doc`` wrapping the original text and the token objects.
    """
    from itertools import zip_longest

    # Tokenize the text
    toks = self.tokenizer.tokenize(text)
    tok_obj = []

    # First pass: dict only, all idioms
    first_pass = {}  # tok -> {idiom: (lemmas, de, unimorph)}
    for t in toks:
        t_lower = t.lower()
        if t_lower in first_pass:
            # Repeated token: results are keyed by the lower-cased form, so
            # analyzing it again would only reproduce the same entry.
            continue
        per_idiom = {}
        for idiom in Idiom:
            result = self._analyzers[idiom].analyze(t_lower)
            if result[0] != [None]:
                per_idiom[idiom] = result
        first_pass[t_lower] = per_idiom

    # Detect idiom
    if self.idiom:
        best_idiom = self.idiom
    else:
        scores = get_scores([t.lower() for t in toks], self.in_voc)
        best_idiom = max(scores, key=scores.get)

    # Second pass: ET for unresolved tokens, best idiom only
    best_analyzer = self._analyzers[best_idiom]
    for t_lower, idiom_results in first_pass.items():
        result = idiom_results.get(best_idiom)
        # Run ET if no result, or if the only result is the token itself (in_voc fallback)
        if result is None or result == ([t_lower], [None], [None]):
            et_lemma, et_tag = best_analyzer._et_analyze(t_lower)
            if et_lemma:
                idiom_results[best_idiom] = ([et_lemma], [None], [et_tag])

    # Build token objects
    for t in toks:
        full_lemma = defaultdict(list)
        for idiom, (lemmas, de_list, unimorph_list) in first_pass[t.lower()].items():
            # zip_longest pads the shorter lists with None so a lemma is never
            # dropped when translations or tags are missing.
            for l, d, u in zip_longest(lemmas, de_list, unimorph_list, fillvalue=None):
                if l:
                    lem = Lemma(idiom, l, d if d else "null")
                    analysis = MorphAnalysis(get_features(u))
                    full_lemma[lem].append(analysis)
        tok_obj.append(Token(t, full_lemma, self.idiom))

    return Doc(text, tok_obj, self.in_voc, self.idiom)
|