# textmeme_search/src/preprocessing/mystem_tokenizer.py
# Author: Futyn-Maker — commit 7e1f5f6 ("Deploy the app")
from typing import List, Union
import nltk
from pymystem3 import Mystem
class MystemTokenizer:
    """Tokenizer that lemmatizes Russian text with Mystem and drops stopwords."""

    def __init__(self, stopwords: Union[List[str], str] = "ru"):
        """
        Initialize the MystemTokenizer.

        Args:
            stopwords (Union[List[str], str]): Either a collection of stopwords
                or the string "ru" to use NLTK's Russian stopword list.
        """
        if stopwords == "ru":
            try:
                words = nltk.corpus.stopwords.words("russian")
            except LookupError:
                # Stopword corpus not installed yet — download it once, then retry.
                nltk.download("stopwords")
                words = nltk.corpus.stopwords.words("russian")
        else:
            words = stopwords
        # Store as a set: makes the per-token membership test in tokenize()
        # O(1) instead of O(n) over the ~150-word NLTK stopword list.
        self.stopwords = set(words)
        self.mystem = Mystem()

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize and lemmatize the input text, removing stopwords.

        Args:
            text (str): The input text to tokenize.

        Returns:
            List[str]: Lemmatized lowercase tokens with punctuation,
            whitespace, and stopwords filtered out.
        """
        # Mystem returns lemmas interleaved with whitespace/punctuation tokens,
        # so non-alphabetic entries must be filtered below.
        lemmas = self.mystem.lemmatize(text.lower())
        # Keep only purely alphabetic lemmas that are not stopwords.
        return [
            lemma for lemma in lemmas
            if lemma.isalpha() and lemma not in self.stopwords
        ]