# NOTE(review): removed hosting-page status text ("Spaces: / Sleeping") that
# was captured by the extraction — it was not part of the source file.
from typing import List, Union

import nltk
from pymystem3 import Mystem
class MystemTokenizer:
    """Tokenizer that lemmatizes Russian text with Mystem and drops stopwords.

    Tokens are lowercased, lemmatized via ``pymystem3.Mystem``, and filtered:
    only purely alphabetic lemmas that are not stopwords are kept.
    """

    def __init__(self, stopwords: Union[List[str], str] = "ru"):
        """
        Initialize the MystemTokenizer.

        Args:
            stopwords (Union[List[str], str]): Either an explicit collection of
                stopwords, or the name of an NLTK stopword language. The value
                "ru" (the default) is an alias for "russian"; any other string
                is passed to NLTK as-is (e.g. "english").

        Raises:
            OSError: If the NLTK stopword corpus cannot be downloaded and is
                not already available locally.
        """
        if isinstance(stopwords, str):
            # Treat any string as an NLTK language name; keep the historical
            # "ru" shortcut for Russian.
            language = "russian" if stopwords == "ru" else stopwords
            try:
                words = nltk.corpus.stopwords.words(language)
            except LookupError:
                # Corpus not present locally — fetch it once, quietly.
                nltk.download("stopwords", quiet=True)
                words = nltk.corpus.stopwords.words(language)
        else:
            words = stopwords
        # frozenset gives O(1) membership tests in tokenize(); a plain list
        # would make filtering O(len(stopwords)) per token.
        self.stopwords = frozenset(words)
        self.mystem = Mystem()

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize and lemmatize the input text, removing stopwords.

        Args:
            text (str): The input text to tokenize.

        Returns:
            List[str]: Lowercased, lemmatized tokens with non-alphabetic
            tokens (numbers, punctuation, whitespace artifacts emitted by
            Mystem) and stopwords removed.
        """
        # Mystem both segments and lemmatizes; lowercase first so stopword
        # matching is case-insensitive.
        lemmas = self.mystem.lemmatize(text.lower())
        return [
            lemma for lemma in lemmas
            if lemma.isalpha() and lemma not in self.stopwords
        ]