File size: 1,335 Bytes
7e1f5f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from typing import List, Union

import nltk
from pymystem3 import Mystem


class MystemTokenizer:
    """Tokenize Russian text by lemmatizing with Mystem and dropping stopwords."""

    def __init__(self, stopwords: Union[List[str], str] = "ru"):
        """
        Initialize the MystemTokenizer.

        Args:
            stopwords (Union[List[str], str]): Either a collection of stopwords
                or the string "ru" to load NLTK's Russian stopword list.
        """
        if stopwords == "ru":
            try:
                words = nltk.corpus.stopwords.words("russian")
            except LookupError:
                # The NLTK stopwords corpus is not present locally; fetch it once.
                nltk.download("stopwords")
                words = nltk.corpus.stopwords.words("russian")
        else:
            words = stopwords

        # Store as a set so membership tests in tokenize() are O(1)
        # instead of O(len(stopwords)) per token.
        self.stopwords = set(words)

        self.mystem = Mystem()

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize and lemmatize the input text, removing stopwords.

        Args:
            text (str): The input text to tokenize.

        Returns:
            List[str]: Lowercased, lemmatized tokens consisting only of
                letters, with stopwords removed.
        """
        # Mystem performs tokenization and lemmatization in one pass;
        # its output also contains whitespace/punctuation tokens.
        lemmas = self.mystem.lemmatize(text.lower())

        # Keep only purely alphabetic lemmas that are not stopwords.
        return [
            lemma for lemma in lemmas
            if lemma.isalpha() and lemma not in self.stopwords
        ]