Spaces:
Sleeping
Sleeping
File size: 1,335 Bytes
7e1f5f6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
from typing import List, Union
import nltk
from pymystem3 import Mystem
class MystemTokenizer:
    """Tokenize Russian text: lemmatize with Mystem, keep only alphabetic
    lemmas, and drop stopwords."""

    def __init__(self, stopwords: Union[List[str], str] = "ru"):
        """
        Initialize the MystemTokenizer.

        Args:
            stopwords (Union[List[str], str]): Either an explicit collection
                of stopwords, or the string "ru" to use NLTK's Russian
                stopword list (downloaded on first use if missing).
        """
        if stopwords == "ru":
            try:
                words = nltk.corpus.stopwords.words("russian")
            except LookupError:
                # Corpus not present locally yet — fetch it once, then retry.
                nltk.download("stopwords")
                words = nltk.corpus.stopwords.words("russian")
        else:
            words = stopwords
        # Store as a set: tokenize() tests membership once per lemma, and a
        # list would make that O(len(stopwords)) per token instead of O(1).
        self.stopwords = set(words)
        self.mystem = Mystem()

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize and lemmatize the input text, removing stopwords.

        Args:
            text (str): The input text to tokenize.

        Returns:
            List[str]: Lowercased, lemmatized, purely-alphabetic tokens
            with stopwords removed.
        """
        # Mystem emits lemmas interleaved with whitespace/punctuation
        # fragments; isalpha() filters those out along with numbers.
        lemmas = self.mystem.lemmatize(text.lower())
        return [
            lemma for lemma in lemmas
            if lemma.isalpha() and lemma not in self.stopwords
        ]
|