Spaces:

ZurichNLP
/

rumlem

Running

App Files Files Community

dominic-fischer committed on Apr 14

Commit

bf60bb7

1 Parent(s): c71fb13

update from github upstream (8197d38919dee305eebf595d7224cf7a3da5348a)

Browse files

Files changed (17) hide show

README.md +6 -6
lemmatizer/__init__.py +3 -1
lemmatizer/__pycache__/__init__.cpython-311.pyc +0 -0
lemmatizer/__pycache__/analyzer.cpython-311.pyc +0 -0
lemmatizer/__pycache__/edittree.cpython-311.pyc +0 -0
lemmatizer/__pycache__/idiom_id.cpython-311.pyc +0 -0
lemmatizer/__pycache__/lemmatizer.cpython-311.pyc +0 -0
lemmatizer/__pycache__/tokenizer.cpython-311.pyc +0 -0
lemmatizer/__pycache__/utils.cpython-311.pyc +0 -0
lemmatizer/analyzer.py +82 -68
lemmatizer/in_voc/rm-puter.txt +0 -0
lemmatizer/in_voc/rm-rumgr.txt +0 -0
lemmatizer/in_voc/rm-surmiran.txt +0 -0
lemmatizer/in_voc/rm-sursilv.txt +0 -0
lemmatizer/in_voc/rm-sutsilv.txt +0 -0
lemmatizer/in_voc/rm-vallader.txt +0 -0
lemmatizer/lemmatizer.py +32 -13

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: Romansh Lemmatizer Demo
 emoji: 💻
 colorFrom: purple
 colorTo: yellow
@@ -12,18 +12,18 @@ pinned: false
 license: mit
 ---
-# Basic Lemmatizer for Romansh Varieties <span style="color:gray">(Beta)</span>: Demo
-This demo visualises the functionalities of the package "romansh_lemmatizer", available at:
-https://github.com/ZurichNLP/romansh_lemmatizer
 The underlying Python package presents a basic dictionary-based lemmatizer for the Romansh language.
-Provided a Romansh text, the lemmatizer splits it into words and looks up each word in the [Pledari Grond](https://pledarigrond.ch/) dictionaries for the five standard Romansh idioms: Sursilvan, Sutsilvan, Surmiran, Puter, and Vallader, as well as the dictionary for Rumantsch Grischun.
 For example, if a Romansh text contains the word _lavuraiva_, the lemmatizer traces the word back to the Vallader and Puter dictionaries:
-<img src="https://huggingface.co/spaces/ZurichNLP/romansh-lemmatizer/resolve/main/illustration.png"
      alt="illustration" width="400"/>
 Typical use cases for the lemmatizer include:

 ---
+title: "RUMLEM: Romansh Lemmatizer Demo"
 emoji: 💻
 colorFrom: purple
 colorTo: yellow
 license: mit
 ---
+# Dictionary-Based Lemmatizer for Romansh Varieties <span style="color:gray">(Beta)</span>: Demo
+This demo visualises the functionality of the "rumlem" package, available at:
+https://github.com/ZurichNLP/rumlem
 The underlying Python package presents a basic dictionary-based lemmatizer for the Romansh language.
+Given a Romansh text, the lemmatizer splits it into words and looks up each word in the [Pledari Grond](https://pledarigrond.ch/) dictionaries of the five primary Romansh idioms (Sursilvan, Sutsilvan, Surmiran, Puter, and Vallader), as well as in the dictionary of the standard variety, Rumantsch Grischun.
 For example, if a Romansh text contains the word _lavuraiva_, the lemmatizer traces the word back to the Vallader and Puter dictionaries:
+<img src="https://huggingface.co/spaces/ZurichNLP/rumlem/resolve/main/illustration.png"
      alt="illustration" width="400"/>
 Typical use cases for the lemmatizer include:

lemmatizer/__init__.py CHANGED Viewed

	@@ -1 +1,3 @@
1	- from ~~lemmatizer~~.lemmatizer import Lemmatizer


1	+ from .lemmatizer import Lemmatizer, Doc, Idiom, Lemma, MorphAnalysis
2	+
3	+ __all__ = ["Lemmatizer", "Doc", "Idiom", "Lemma", "MorphAnalysis"]

lemmatizer/__pycache__/__init__.cpython-311.pyc CHANGED Viewed

Binary files a/lemmatizer/__pycache__/__init__.cpython-311.pyc and b/lemmatizer/__pycache__/__init__.cpython-311.pyc differ

lemmatizer/__pycache__/analyzer.cpython-311.pyc CHANGED Viewed

Binary files a/lemmatizer/__pycache__/analyzer.cpython-311.pyc and b/lemmatizer/__pycache__/analyzer.cpython-311.pyc differ

lemmatizer/__pycache__/edittree.cpython-311.pyc CHANGED Viewed

Binary files a/lemmatizer/__pycache__/edittree.cpython-311.pyc and b/lemmatizer/__pycache__/edittree.cpython-311.pyc differ

lemmatizer/__pycache__/idiom_id.cpython-311.pyc CHANGED Viewed

Binary files a/lemmatizer/__pycache__/idiom_id.cpython-311.pyc and b/lemmatizer/__pycache__/idiom_id.cpython-311.pyc differ

lemmatizer/__pycache__/lemmatizer.cpython-311.pyc CHANGED Viewed

Binary files a/lemmatizer/__pycache__/lemmatizer.cpython-311.pyc and b/lemmatizer/__pycache__/lemmatizer.cpython-311.pyc differ

lemmatizer/__pycache__/tokenizer.cpython-311.pyc CHANGED Viewed

Binary files a/lemmatizer/__pycache__/tokenizer.cpython-311.pyc and b/lemmatizer/__pycache__/tokenizer.cpython-311.pyc differ

lemmatizer/__pycache__/utils.cpython-311.pyc CHANGED Viewed

Binary files a/lemmatizer/__pycache__/utils.cpython-311.pyc and b/lemmatizer/__pycache__/utils.cpython-311.pyc differ

lemmatizer/analyzer.py CHANGED Viewed

@@ -2,9 +2,8 @@ import json
 from pathlib import Path
 import pickle
 import sys
-from jiwer import cer
 import lemmatizer.edittree as edittree
 BASE_DIR = Path(__file__).parent
@@ -33,20 +32,23 @@ class Analyzer:
         with open(json_path, "r", encoding="utf-8") as f:
             self.dict = json.load(f)
-        lem = []
-        for v in self.dict.values():
-            lem += v["lemma"]
-        self.lemma = lem
-        if self.learned_et:
-            self.edit_trees = []
             for pos in "noun", "adj", "verb":
                 et_path = BASE_DIR / "edit_trees" / f"{self.idiom}" / f"{pos}" / "et.txt"
-                sys.modules["edittree"] = edittree
                 with open(et_path, "rb") as f:
                     self.edit_trees += pickle.load(f)
         self.in_voc = in_voc
         other_de_path = BASE_DIR / "other_de" / f"{self.idiom}.json"
@@ -54,70 +56,82 @@ class Analyzer:
         with open(other_de_path, "r", encoding="utf-8") as f:
             self.other_de = json.load(f)
-    def get_lemma(self, tok: str):
-        """Obtain lemma through table look up; backs off
-        to unsupervised edit tree rules if no lemma found
-        """
         tok = tok.lower().strip()
         entry = self.dict.get(tok)
-        entry_ls = []
-        if entry:
-            entry_ls.extend(entry["lemma"])
-        if tok in self.other_de: # Augment the results with other_de entries
-              amount = len(self.other_de[tok]) # we need to add the lemma as many times as there are de translations for correct zipping later
-              tok_ls = [tok] * amount
-              entry_ls.extend(tok_ls)
-        if entry_ls:
-            return entry_ls
-        # Check if there's a lemma from the edit trees
-        if self.learned_et:
-            et_out = self._et_lemma(tok)
-            if et_out:
-                return [et_out]
-        # Assume the token is a lemma
-        return [tok] if tok in self.in_voc else [None]
-    def _et_lemma(self, tok: str):
         candidates = []
-        for et_pack in self.edit_trees:
-            et = et_pack["et"]
-            out = et.apply(tok)
-            if out != -1:
-                candidates.append(out)
-        strong = [c for c in candidates if c in self.lemma]
         if len(strong) > 1:
-            # Choose the candidate with the lowest edit distance to the tok:
-            dist = {}
-            for c in strong:
-                dist[c] = cer(tok, c)
-            out = min(dist, key=dist.get)
-            return out if out in self.in_voc else None
-        return strong[0] if strong and strong[0] in self.in_voc else None
-    def get_unimorph(self, tok: str):
-        """Obtain Unimorph annotation for N, V, and ADJ
-        in the Pledari Grond Dict"""
-        tok = tok.lower().strip()
-        entry = self.dict.get(tok)
-        if entry:
-            return entry["unimorph"]
-        return [None]
-    def get_de(self, tok: str):
-        """Obtain the German word corresponding to Romansh terms in the Pledari Grond Dict"""
-        tok = tok.lower().strip()
-        entry = self.dict.get(tok)
-        entry_ls = []
-        if entry:
-            entry_ls.extend(entry["DStichwort"])
-        if tok in self.other_de: # Augment the results with other_de entries
-           entry_ls.extend(self.other_de[tok])
-        return entry_ls if entry_ls else [None]

 from pathlib import Path
 import pickle
 import sys
+from rapidfuzz.distance import Levenshtein
+from collections import defaultdict
 import lemmatizer.edittree as edittree
 BASE_DIR = Path(__file__).parent
         with open(json_path, "r", encoding="utf-8") as f:
             self.dict = json.load(f)
+        self.lemma = {lemma for v in self.dict.values() for lemma in v["lemma"]}
+        self.edit_trees = []
+        self.et_suffix_index = defaultdict(list)
+        if self.learned_et:
+            sys.modules["edittree"] = edittree
             for pos in "noun", "adj", "verb":
                 et_path = BASE_DIR / "edit_trees" / f"{self.idiom}" / f"{pos}" / "et.txt"
                 with open(et_path, "rb") as f:
                     self.edit_trees += pickle.load(f)
+            for et_pack in self.edit_trees:
+                suffix = self._get_expected_suffix(et_pack)
+                if suffix is not None:
+                    self.et_suffix_index[suffix].append(et_pack)
         self.in_voc = in_voc
         other_de_path = BASE_DIR / "other_de" / f"{self.idiom}.json"
         with open(other_de_path, "r", encoding="utf-8") as f:
             self.other_de = json.load(f)
+        self.lemma_by_pos = defaultdict(set)
+        for v in self.dict.values():
+            for lemma, unimorph in zip(v["lemma"], v["unimorph"]):
+                if unimorph:
+                    pos = unimorph.split(";")[0]  # "N", "V", "ADJ", "V.PTCP"
+                    self.lemma_by_pos[pos].add(lemma)
+    def _get_expected_suffix(self, et_pack) -> str | None:
+        """Recurse down the right spine to find the expected suffix."""
+        node = et_pack["et"]
+        while node is not None:
+            if isinstance(node.val[0], str):  # leaf node
+                return node.val[0]
+            node = node.right
+        return None
+    def analyze(self, tok: str):
         tok = tok.lower().strip()
         entry = self.dict.get(tok)
+        lemmas, de_list, unimorph_list = [], [], []
+        if entry:
+            lemmas.extend(entry["lemma"])
+            de_list.extend(entry["DStichwort"])
+            unimorph_list.extend(entry["unimorph"])
+        if tok in self.other_de:
+            amount = len(self.other_de[tok])
+            lemmas.extend([tok] * amount)
+            de_list.extend(self.other_de[tok])
+            unimorph_list.extend([None] * amount)
+        if lemmas:
+            return lemmas, de_list, unimorph_list
+        if tok in self.in_voc:
+            return [tok], [None], [None]
+        return [None], [None], [None]
+    def _et_analyze(self, tok: str):
         candidates = []
+        for suffix, packs in self.et_suffix_index.items():
+            if tok.endswith(suffix):
+                for et_pack in packs:
+                    out = et_pack["et"].apply(tok)
+                    if out != -1:
+                        candidates.append((out, et_pack.get("majority_tag")))
+        strong_tagged = []
+        strong_untagged = []
+        for c, tag in candidates:
+            if c not in self.lemma:
+                continue
+            if tag:
+                pos = tag.split(";")[0]
+                pos_lemmas = self.lemma_by_pos.get(pos, self.lemma)
+                if c in pos_lemmas:
+                    strong_tagged.append((c, tag))
+            else:
+                if len(c) <= len(tok):  # lemma can't be longer than inflected form
+                    strong_untagged.append((c, tag))
+        # Prefer tagged candidates; only use untagged if nothing else found
+        strong = strong_tagged if strong_tagged else strong_untagged
+        if not strong:
+            return None, None
         if len(strong) > 1:
+            dist = {c: Levenshtein.normalized_distance(tok, c) for c, _ in strong}
+            best = min(dist, key=dist.get)
+            match = next((c, tag) for c, tag in strong if c == best)
+            return match if match[0] in self.in_voc else (None, None)
+        c, tag = strong[0]
+        return (c, tag) if c in self.in_voc else (None, None)

lemmatizer/in_voc/rm-puter.txt CHANGED Viewed