from typing import List, Union

import nltk
from pymystem3 import Mystem


class MystemTokenizer:
    """Lemmatizing tokenizer built on Mystem with stopword removal.

    The text is lower-cased, passed to ``Mystem.lemmatize``, and the
    resulting lemmas are kept only if they are purely alphabetic and are
    not stopwords.
    """

    def __init__(self, stopwords: Union[List[str], str] = "ru"):
        """
        Initialize the MystemTokenizer.

        Args:
            stopwords (Union[List[str], str]): Either a list of stopwords
                or "ru" for the NLTK Russian stopword list.

        Raises:
            ValueError: If ``stopwords`` is a string other than "ru". A bare
                string would otherwise be used for substring matching in
                ``tokenize`` and silently filter the wrong tokens.
        """
        if isinstance(stopwords, str):
            if stopwords != "ru":
                raise ValueError(
                    f"Unsupported stopwords value {stopwords!r}: "
                    'pass "ru" or a list of stopwords.'
                )
            try:
                self.stopwords = nltk.corpus.stopwords.words("russian")
            except LookupError:
                # The stopwords corpus is not installed locally yet:
                # download it once and retry the lookup.
                nltk.download("stopwords")
                self.stopwords = nltk.corpus.stopwords.words("russian")
        else:
            self.stopwords = stopwords

        # Snapshot used for O(1) membership tests in tokenize(). It is built
        # once here, so later in-place changes to self.stopwords are not
        # reflected in filtering.
        self._stopword_set = frozenset(self.stopwords)
        self.mystem = Mystem()

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize and lemmatize the input text, removing stopwords.

        Args:
            text (str): The input text to tokenize.

        Returns:
            List[str]: Lemmas in their original order, excluding stopwords
                and any lemma that is not purely alphabetic.
        """
        # Lower-case first, then let Mystem split and lemmatize the text.
        lemmas = self.mystem.lemmatize(text.lower())

        # str.isalpha() rejects anything containing a non-letter character,
        # so whitespace, punctuation, digits and hyphenated lemmas are dropped.
        excluded = self._stopword_set
        return [
            lemma
            for lemma in lemmas
            if lemma.isalpha() and lemma not in excluded
        ]