File size: 1,078 Bytes
85ac9d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
# Import-time side effect: request the NLTK 'stopwords' corpus, which
# preproc_text reads through stopwords.words('russian').
nltk.download('stopwords')
from pymystem3 import Mystem
import pandas as pd
from pandarallel import pandarallel
# Import-time side effect: initialise pandarallel with a progress bar.
# predict() calls Series.parallel_apply, which presumably depends on this
# call having run first -- confirm against the pandarallel documentation.
pandarallel.initialize(progress_bar=True)


def preproc_text(x):
    """Lemmatize a text with Mystem and drop stop words.

    The input is lemmatized, the lemmas are concatenated back into one
    string, that string is split into runs of word characters, and every
    token found in the stop-word collection is discarded.

    Args:
        x: The text to normalise.

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        nothing survives).
    """
    # NOTE(review): a fresh Mystem is built on every call. That looks costly,
    # but this function is also executed through Series.parallel_apply in
    # predict(), so confirm it is safe across worker processes before
    # hoisting the instance to module level.
    mystem = Mystem()

    # A set gives O(1) membership tests in the filter below; the original
    # list was scanned linearly for every token.
    stop_words = set(stopwords.words('russian'))
    # Project-specific additions. The whitespace entries cannot match a
    # word-character token, but are kept so the collection stays the same.
    stop_words.update([' ', ' \n', '  ', 'также', 'который', 'весь', 'заявлять', 'сообщать', 'риа'])

    tokenizer = RegexpTokenizer(r'\w+')
    lemmatized = ''.join(mystem.lemmatize(x))
    tokens = tokenizer.tokenize(lemmatized)
    return ' '.join(word for word in tokens if word not in stop_words)


def predict(sample, model, label_encoder, preproc=True):
    """Predict decoded label(s) for one text or a batch of texts.

    Args:
        sample: A ``str``/``bytes`` holding a single text, or a
            ``pd.Series`` of texts. When ``preproc`` is false, any batch
            that ``model.predict`` accepts may be passed as well.
        model: Object exposing ``predict(batch)``.
        label_encoder: Object exposing ``inverse_transform(encoded)``.
        preproc: When true, run ``preproc_text`` on the input first
            (via ``parallel_apply`` for a Series). When false, the input
            is handed to the model unchanged.

    Returns:
        A single decoded label for ``str``/``bytes`` input; otherwise the
        result of ``label_encoder.inverse_transform`` for the whole batch.

    Raises:
        TypeError: If ``preproc`` is true and ``sample`` is neither
            ``str``/``bytes`` nor a ``pd.Series``. Previously this case
            silently returned ``None``.
    """
    # isinstance (instead of an exact type() comparison) also accepts
    # subclasses of str, bytes and pd.Series.
    if isinstance(sample, (str, bytes)):
        text = preproc_text(sample) if preproc else sample
        # The model works on batches: wrap the text, then unwrap the label.
        return label_encoder.inverse_transform(model.predict([text]))[0]

    if preproc:
        if not isinstance(sample, pd.Series):
            raise TypeError(
                'sample must be str, bytes or pandas.Series when preproc=True, '
                'got ' + type(sample).__name__
            )
        sample = sample.parallel_apply(preproc_text)

    # Previously preproc=False fell through and returned None without ever
    # calling the model; the batch is now predicted as given.
    return label_encoder.inverse_transform(model.predict(sample))