| |
| import spacy |
| from vocabulary.parse_vocabulary import VOCABULARY_MANAGER |
|
|
|
|
class RussianLemmatizer:
    """Lemmatizer for Russian text backed by a spaCy pipeline.

    Loads a cached spaCy model from ``./ru_model`` (downloading
    ``ru_core_news_sm`` on first use and caching it to disk), then appends a
    custom pipeline component that rewrites lemmas of known vocabulary terms
    to their canonical lower-cased form from ``VOCABULARY_MANAGER``.
    """

    def __init__(self):
        print(" Загрузка русской модели spaCy...")
        try:
            self.nlp = spacy.load("./ru_model")
        except OSError:
            print(" ⚠️ Модель ru_core_news_sm не найдена, скачиваю...")
            import subprocess
            import sys
            # Use the running interpreter, not whatever "python" happens to
            # be on PATH, so the model installs into the active environment.
            subprocess.check_call(
                [sys.executable, "-m", "spacy", "download", "ru_core_news_sm"]
            )
            self.nlp = spacy.load("ru_core_news_sm")
            # Cache the model on disk so later runs skip the download.
            self.nlp.to_disk("./ru_model")

        # Lower-cased vocabulary terms mapped to themselves; a dict gives
        # O(1) membership checks.
        self.terms = {term.lower(): term.lower() for term in VOCABULARY_MANAGER.vocabulary}
        # Reserved for stem-based lookups — populated elsewhere, if at all.
        self.stems = {}

        print(f" Загружено {len(self.terms)} терминов из vocabulary.md")

        try:
            @self.nlp.component("fix_terms")
            def fix_terms(doc):
                """Pipeline component: map lemmas of vocabulary terms (and
                their inflected forms) onto their canonical lower-cased form."""
                for token in doc:
                    canonical = VOCABULARY_MANAGER.find_word_in_terms(token.lemma_.lower())
                    if canonical:
                        token.lemma_ = canonical.lower()
                return doc
        except ValueError:
            # NOTE(review): spaCy component registration is global; without
            # this guard a second RussianLemmatizer instance crashes on the
            # duplicate "fix_terms" name. ValueError is what spaCy 3 raises
            # for duplicate factories — confirm against the installed version.
            pass

        if "fix_terms" not in self.nlp.pipe_names:
            self.nlp.add_pipe("fix_terms", after="lemmatizer")

    def tokenize_text(self, text: str) -> list[str]:
        """Lemmatize *text*.

        Args:
            text: text to lemmatize.

        Returns:
            list: lemmas, excluding punctuation and whitespace-only lemmas.
        """
        # Lower-case BEFORE replacing: the original order missed uppercase
        # "Ё", which survived normalization as "ё".
        text = text.lower().replace('ё', 'е')
        doc = self.nlp(text)
        return [token.lemma_ for token in doc if not token.is_punct and token.lemma_.strip()]
|
|