File size: 1,335 Bytes
7e1f5f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from typing import List, Union

import nltk
from pymystem3 import Mystem


class MystemTokenizer:
    """Tokenize Russian text by lemmatizing with Mystem and dropping stopwords."""

    def __init__(self, stopwords: Union[List[str], str] = "ru"):
        """
        Initialize the MystemTokenizer.

        Args:
            stopwords (Union[List[str], str]): Either a collection of stopwords
                or the string "ru" to load NLTK's Russian stopword list.
        """
        if stopwords == "ru":
            try:
                words = nltk.corpus.stopwords.words("russian")
            except LookupError:
                # The NLTK stopwords corpus is not present locally; fetch it once.
                nltk.download("stopwords")
                words = nltk.corpus.stopwords.words("russian")
        else:
            words = stopwords

        # Store as a set so membership tests in tokenize() are O(1)
        # instead of O(len(stopwords)) per token.
        self.stopwords = set(words)

        self.mystem = Mystem()

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize and lemmatize the input text, removing stopwords.

        Args:
            text (str): The input text to tokenize.

        Returns:
            List[str]: Lowercased, lemmatized tokens consisting only of
                letters, with stopwords removed.
        """
        # Mystem performs tokenization and lemmatization in one pass;
        # its output also contains whitespace/punctuation tokens.
        lemmas = self.mystem.lemmatize(text.lower())

        # Keep only purely alphabetic lemmas that are not stopwords.
        return [
            lemma for lemma in lemmas
            if lemma.isalpha() and lemma not in self.stopwords
        ]