| | """ |
| | This module contains various rule-based components aiming to improve on baseline lemmatization tools. |
| | """ |
| |
|
| | import re |
| | from typing import List, Callable |
| |
|
| | from spacy.lang.hu import Hungarian |
| | from spacy.pipeline import Pipe |
| | from spacy.tokens import Token |
| | from spacy.tokens.doc import Doc |
| |
|
| |
|
@Hungarian.component(
    "lemma_case_smoother",
    assigns=["token.lemma"],
    requires=["token.lemma", "token.pos"],
)
def lemma_case_smoother(doc: Doc) -> Doc:
    """Smooth lemma casing by POS: lowercase the lemma of every sentence-initial
    token that is not a proper noun.

    DEPRECATED: This is not needed anymore, as the lemmatizer is now case-insensitive.

    Args:
        doc (Doc): Input document.

    Returns:
        Doc: Output document (modified in place).
    """
    for token in doc:
        # PROPN is a coarse Universal POS label, so it lives in `pos_`
        # (the component also declares `requires=["token.pos"]` above);
        # the previous check against the fine-grained `tag_` never matched
        # models whose tag set differs from UPOS.
        if token.is_sent_start and token.pos_ != "PROPN":
            token.lemma_ = token.lemma_.lower()

    return doc
| |
|
| |
|
class LemmaSmoother(Pipe):
    """Smooths lemmas by fixing common errors of the edit-tree lemmatizer."""

    # Date-like numerals carrying Hungarian suffixes, e.g. "5-én", "10-étől";
    # group(1) captures the bare day/number.
    _DATE_PATTERN = re.compile(r"(\d+)-j?[éá]?n?a?(t[őó]l)?")
    # Numbers, optionally with internal separators and a trailing percent sign.
    _NUMBER_PATTERN = re.compile(r"(\d+([-,/_.:]?(._)?\d+)*%?)")

    @staticmethod
    @Hungarian.factory("lemma_smoother", assigns=["token.lemma"], requires=["token.lemma", "token.pos"])
    def create_lemma_smoother(nlp: Hungarian, name: str) -> "LemmaSmoother":
        """Factory registered with spaCy for building this component.

        Args:
            nlp (Hungarian): The pipeline object (unused).
            name (str): The component name (unused).

        Returns:
            LemmaSmoother: A fresh component instance.
        """
        return LemmaSmoother()

    def __call__(self, doc: Doc) -> Doc:
        """Applies every smoothing rule to every token of the document.

        Args:
            doc (Doc): Input document.

        Returns:
            Doc: Output document (lemmas modified in place).
        """
        rules: List[Callable[[Token], None]] = [
            self._remove_exclamation_marks,
            self._remove_question_marks,
            self._remove_date_suffixes,
            self._remove_suffix_after_numbers,
        ]

        for token in doc:
            for rule in rules:
                rule(token)

        return doc

    @classmethod
    def _truncate_at(cls, token: Token, char: str) -> None:
        """Truncates the lemma at the first occurrence of ``char``.

        The lemma is left untouched when it consists solely of ``char``
        (e.g. the token itself is a punctuation mark).

        Args:
            token (Token): The token whose lemma is edited in place.
            char (str): The punctuation character to cut at.
        """
        if token.lemma_ != char:
            index = token.lemma_.find(char)
            if index != -1:
                token.lemma_ = token.lemma_[:index]

    @classmethod
    def _remove_exclamation_marks(cls, token: Token) -> None:
        """Removes exclamation marks from the lemma.

        Args:
            token (Token): The original token.
        """
        cls._truncate_at(token, "!")

    @classmethod
    def _remove_question_marks(cls, token: Token) -> None:
        """Removes question marks from the lemma.

        Args:
            token (Token): The original token.
        """
        cls._truncate_at(token, "?")

    @classmethod
    def _remove_date_suffixes(cls, token: Token) -> None:
        """Fixes the suffixes of dates: a NOUN lemma matching the date pattern
        is reduced to the bare number followed by a period (e.g. "5-én" -> "5.").

        Args:
            token (Token): The original token.
        """
        if token.pos_ == "NOUN":
            match = cls._DATE_PATTERN.match(token.lemma_)
            if match is not None:
                token.lemma_ = match.group(1) + "."

    @classmethod
    def _remove_suffix_after_numbers(cls, token: Token) -> None:
        """Removes suffixes after numbers: a NUM token gets the matched numeric
        prefix of its surface form as its lemma.

        Args:
            token (Token): The original token.
        """
        if token.pos_ == "NUM":
            match = cls._NUMBER_PATTERN.match(token.text)
            if match is not None:
                token.lemma_ = match.group(0)
| |
|