| import os |
| import json |
| from pathlib import Path |
| from typing import Dict, List, Literal, Optional, Union, Iterable |
| from typing_extensions import TypedDict, NotRequired |
|
|
| from spacy.language import Language |
| from spacy.pipeline import Pipe |
| from spacy.pipeline.lemmatizer import lemmatizer_score |
| from spacy.util import ensure_path |
| from spacy.tokens import Doc, Token |
|
|
# Priority order in which token attributes are compared against frequency
# table entries. match_lemma() narrows its candidate set one attribute at a
# time in this order, and stops early as soon as the next attribute would
# eliminate every remaining candidate. Universal POS ("upos") is matched
# first, followed by the morphological feature names.
MATCH_ORDER = [
    "upos",
    "Tense",
    "VerbForm",
    "Voice",
    "Case",
    "Gender",
    "Number",
    "Degree",
    "Mood",
    "Person",
    "Aspect",
    "Definite",
    "PronType",
    "Polarity",
    "Poss",
    "Reflex",
]
|
|
|
|
class TableEntry(TypedDict):
    """One row of the frequency table: a surface form together with its
    lemma, universal POS tag and corpus frequency, plus optional
    morphological features. The optional keys mirror the feature names in
    MATCH_ORDER; a feature that does not apply is simply omitted.
    """

    form: str
    lemma: str
    upos: str
    frequency: int
    Tense: NotRequired[str]
    VerbForm: NotRequired[str]
    Voice: NotRequired[str]
    Case: NotRequired[str]
    Gender: NotRequired[str]
    Number: NotRequired[str]
    Degree: NotRequired[str]
    Mood: NotRequired[str]
    Person: NotRequired[str]
    Aspect: NotRequired[str]
    Definite: NotRequired[str]
    PronType: NotRequired[str]
    Polarity: NotRequired[str]
    Poss: NotRequired[str]
    Reflex: NotRequired[str]
|
|
|
|
# Primary lemmatization table: lowercased surface form -> all table entries
# recorded for that form (see TableEntry).
FrequencyTable = Dict[str, List[TableEntry]]

# Backoff table: lowercased surface form -> lemma.
LookupTable = Dict[str, str]
|
|
|
|
@Language.factory(
    "frequency_lemmatizer",
    assigns=["token.lemma"],
    default_config={
        "overwrite": True,
        "fallback_priority": "lookup",
    },
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language,
    name: str,
    overwrite: bool,
    fallback_priority: Literal["lemma", "lookup"],
):
    """Factory entry point: build a FrequencyLemmatizer for the pipeline."""
    component = FrequencyLemmatizer(
        nlp=nlp,
        name=name,
        overwrite=overwrite,
        fallback_priority=fallback_priority,
    )
    return component
|
|
|
|
def max_freq_lemma(entries: List[TableEntry]) -> str:
    """Return the lemma of the highest-frequency entry.

    Ties are resolved in favour of the earliest entry, matching the
    original linear scan (strict ``>`` comparison). *entries* must be
    non-empty; callers (match_lemma) guarantee this.
    """
    # max() returns the first maximal element, preserving the original
    # first-winner tie-breaking behavior.
    return max(entries, key=lambda entry: entry["frequency"])["lemma"]
|
|
|
|
def match_lemma(
    token_entry: TableEntry, table: FrequencyTable
) -> Optional[str]:
    """Look up the best lemma for *token_entry* in the frequency table.

    Candidates sharing the token's surface form are narrowed attribute by
    attribute in MATCH_ORDER order; narrowing stops just before any step
    that would leave no candidates. Returns the highest-frequency
    surviving lemma, or None when the form is absent from the table.
    """
    candidates = table.get(token_entry["form"], [])
    if not candidates:
        return None

    for feature in MATCH_ORDER:
        target = token_entry.get(feature, "")
        narrowed = [c for c in candidates if c.get(feature, "") == target]
        if not narrowed:
            # This feature would wipe out all candidates; keep the
            # previous (broader) candidate set.
            break
        candidates = narrowed
    return max_freq_lemma(entries=candidates)
|
|
|
|
def read_json(path: str) -> Dict:
    """Load and return the JSON document stored at *path*.

    Fix: reads as UTF-8 explicitly — the platform default encoding is
    locale-dependent and can mis-decode non-ASCII JSON content.
    """
    with open(path, encoding="utf-8") as file:
        return json.load(file)
|
|
|
|
def write_json(object: Dict, path: str) -> None:
    """Serialize *object* as JSON to the file at *path*.

    Fix: writes as UTF-8 explicitly, mirroring read_json, so output is
    independent of the platform's locale encoding.

    NOTE(review): the parameter name shadows the ``object`` builtin; it is
    kept unchanged for backward compatibility with keyword callers.
    """
    with open(path, "w", encoding="utf-8") as file:
        json.dump(object, file)
|
|
|
class FrequencyLemmatizer(Pipe):
    """
    Part-of-speech-, morphology-, and frequency-sensitive
    rule-based lemmatizer.

    Lemmas are resolved against a primary frequency table keyed on the
    lowercased token form (see ``match_lemma``); tokens not found there
    fall back to a simple lookup table and/or an already-assigned lemma,
    ordered by ``fallback_priority``.

    Parameters
    ----------
    nlp: Language
        The pipeline the component is created for (required by the
        spaCy factory contract).
    name: str, default "freq_lemmatizer"
        Name of the component in the pipeline.
    overwrite: bool, default True
        Specifies whether the frequency lemmatizer should overwrite
        already assigned lemmas.
    fallback_priority: 'lemma' or 'lookup', default 'lookup'
        Specifies which fallback should have higher priority
        if the lemma is not found in the primary table.
    """

    def __init__(
        self,
        nlp: Language,
        name: str = "freq_lemmatizer",
        *,
        overwrite: bool = True,
        fallback_priority: Literal["lemma", "lookup"] = "lookup",
    ):
        self.name = name
        self.overwrite = overwrite
        self.scorer = lemmatizer_score
        self.fallback_priority = fallback_priority
        # Fix: default both data attributes so the component degrades to a
        # pure-backoff lemmatizer when initialize()/from_disk() was never
        # called; previously backoff()/lemmatize() raised AttributeError.
        self.table: Optional[FrequencyTable] = None
        self.lookup: Optional[LookupTable] = None

    def initialize(
        self,
        get_examples=None,
        *,
        nlp=None,
        table: Optional[FrequencyTable] = None,
        lookup: Optional[LookupTable] = None,
    ) -> None:
        """Initialize the lemmatizer from a lemma table and a lookup table.

        Parameters
        ----------
        table: dict of form -> list of entries, or None, default None
            Primary frequency table mapping each (lowercased) form to its
            entries with pos tags, morph features and frequencies.
        lookup: dict of str to str, or None, default None
            Backoff lookup table for simple form -> lemma lookup.
        """
        self.table = table
        self.lookup = lookup

    def backoff(self, token: Token) -> str:
        """Return the fallback lemma for a token unresolved by the table.

        The two candidate sources are the lookup-table entry for the
        lowercased form and any lemma already assigned to the token;
        ``fallback_priority`` decides which wins when both exist. Defaults
        to the verbatim token text when neither is available.
        """
        orth = token.orth_.lower()
        lookup = self.lookup
        in_lookup = (lookup is not None) and (orth in lookup)
        # A lemma counts as already assigned only when set and different
        # from the raw token text.
        has_lemma = (token.lemma != 0) and (token.lemma_ != token.orth_)
        if in_lookup and self.fallback_priority == "lookup":
            return lookup[orth]
        # NOTE(review): with priority "lemma" and no assigned lemma, the
        # lookup entry is deliberately NOT consulted (original behavior,
        # preserved here) — confirm this is intended.
        if has_lemma:
            return token.lemma_
        return token.orth_

    def lemmatize(self, token: Token) -> str:
        """Return the lemma for *token*: table match, else backoff."""
        backoff = self.backoff(token)
        if self.table is None:
            return backoff
        # frequency=-1 is a sentinel: the query entry itself never
        # competes on frequency with real table entries.
        token_entry: TableEntry = TableEntry(
            form=token.orth_.lower(),
            upos=token.pos_,
            frequency=-1,
            **token.morph.to_dict(),
        )
        lemma = match_lemma(token_entry=token_entry, table=self.table)
        return backoff if lemma is None else lemma

    def __call__(self, doc: Doc) -> Doc:
        """Assign lemmas to all tokens in *doc* and return it."""
        error_handler = self.get_error_handler()
        try:
            for token in doc:
                # Respect pre-existing lemmas unless overwrite is set
                # (token.lemma == 0 means no lemma assigned yet).
                if self.overwrite or token.lemma == 0:
                    token.lemma_ = self.lemmatize(token)
            return doc
        except Exception as e:
            error_handler(self.name, self, [doc], e)

    def to_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
    ):
        """Save config, frequency table, and lookup table to a directory."""
        path = ensure_path(path)
        path.mkdir(parents=True, exist_ok=True)
        config = {
            "overwrite": self.overwrite,
            "fallback_priority": self.fallback_priority,
        }
        # Reuse the module-level JSON helpers for consistent serialization.
        write_json(config, path=path / "config.json")
        if self.table is not None:
            write_json(self.table, path=path / "table.json")
        if self.lookup is not None:
            write_json(self.lookup, path=path / "lookup.json")

    def from_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
    ) -> "FrequencyLemmatizer":
        """Load the component from a directory.

        Missing table/lookup files are treated as absent data rather than
        errors, so partially serialized components load cleanly.
        """
        path = ensure_path(path)
        config = read_json(path / "config.json")
        self.overwrite = config.get("overwrite", self.overwrite)
        self.fallback_priority = config.get(
            "fallback_priority", self.fallback_priority
        )
        try:
            table: Optional[FrequencyTable] = read_json(path / "table.json")
        except FileNotFoundError:
            table = None
        try:
            lookup: Optional[LookupTable] = read_json(path / "lookup.json")
        except FileNotFoundError:
            lookup = None
        self.initialize(table=table, lookup=lookup)
        return self
|
|