# tags-generation / utils.py
# jellyhater's picture
# model
# 85ac9d4
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
# Import-time side effect: requests the NLTK 'stopwords' corpus that
# preproc_text reads via stopwords.words('russian').
# NOTE(review): presumably a no-op when the corpus is already installed - confirm.
nltk.download('stopwords')
from pymystem3 import Mystem
import pandas as pd
from pandarallel import pandarallel
# Import-time side effect: sets up pandarallel (with a progress bar) so that
# pd.Series.parallel_apply, used by predict(), is available.
pandarallel.initialize(progress_bar=True)
# Per-process cache of the preprocessing resources, keyed by PID.
# Entries inherited from a parent process are deliberately left untouched
# (never reused, never dropped) so this process does not interfere with them.
_PREPROC_RESOURCES = {}


def _get_preproc_resources():
    """Return the ``(mystem, tokenizer, stop_words)`` triple for this process.

    The triple is built on first use and then reused by every later call made
    from the same process, instead of being rebuilt for each document.

    NOTE(review): the cache is keyed by PID on the assumption that a
    ``Mystem`` instance holds a handle to an external analyzer process that
    must not be shared with workers forked by pandarallel - confirm against
    the pymystem3 documentation.
    """
    import os  # local import keeps this block independent of the file header

    pid = os.getpid()
    resources = _PREPROC_RESOURCES.get(pid)
    if resources is None:
        stop_words = set(stopwords.words('russian'))
        stop_words.update([' ', ' \n', ' ', 'также', 'который', 'весь', 'заявлять', 'сообщать', 'риа'])
        # frozenset: O(1) membership tests and no accidental mutation.
        resources = (Mystem(), RegexpTokenizer(r'\w+'), frozenset(stop_words))
        _PREPROC_RESOURCES[pid] = resources
    return resources


def preproc_text(x):
    """Lemmatize *x*, tokenize it and drop stop words.

    Args:
        x: Text to normalise; it is passed unchanged to ``Mystem.lemmatize``.

    Returns:
        A single string with the remaining lemmas joined by one space.
        Tokens are the ``\\w+`` matches of the lemmatized text; a token is
        removed when it is in NLTK's Russian stop-word list or in the extra
        words added in ``_get_preproc_resources``.
    """
    mystem, tokenizer, stop_words = _get_preproc_resources()
    lemmatized = ''.join(mystem.lemmatize(x))
    tokens = tokenizer.tokenize(lemmatized)
    return ' '.join([word for word in tokens if word not in stop_words])
def predict(sample, model, label_encoder, preproc=True):
    """Predict decoded label(s) for one document or a batch of documents.

    Args:
        sample: Either a single document (``str`` or ``bytes``) or a batch.
            When ``preproc`` is true the batch must be a ``pd.Series``; when
            it is false the batch is handed to ``model.predict`` as is.
        model: Object whose ``predict(batch)`` output is accepted by
            ``label_encoder.inverse_transform``.
        label_encoder: Object providing ``inverse_transform``.
        preproc: When true, run ``preproc_text`` on the input first. When
            false, the input is assumed to be preprocessed already.

    Returns:
        For a single document, the first element of the decoded output.
        For a batch, the full result of ``label_encoder.inverse_transform``.

    Raises:
        TypeError: If ``preproc`` is true and ``sample`` is neither a
            ``pd.Series`` nor a ``str``/``bytes`` (previously this case
            silently returned ``None``).
    """
    # A lone document is wrapped into a one-element batch and unwrapped below.
    is_single = isinstance(sample, (str, bytes))

    if preproc:
        if is_single:
            sample = preproc_text(sample)
        elif isinstance(sample, pd.Series):
            sample = sample.parallel_apply(preproc_text)
        else:
            raise TypeError(
                "predict() can only preprocess a pandas Series, str or bytes; "
                "got {}".format(type(sample).__name__)
            )

    batch = [sample] if is_single else sample
    labels = label_encoder.inverse_transform(model.predict(batch))
    return labels[0] if is_single else labels