Spaces:
Sleeping
Sleeping
| #text_preprocessor.py | |
| import pandas as pd | |
| import re | |
| import string | |
| import pymorphy3 | |
| from sklearn.base import BaseEstimator, TransformerMixin | |
class MyCustomTextPreprocessor(BaseEstimator, TransformerMixin):
    """Sklearn-compatible transformer that cleans and lemmatizes Russian text.

    Per-document pipeline: lowercase; strip HTML tags, URLs, @mentions,
    #hashtags, digits, punctuation, Unicode space/punctuation characters and
    Latin letters; drop Russian stopwords; optionally lemmatize each token
    with pymorphy3.
    """

    def __init__(self):
        # NOTE(review): fetches the stopword list over the network at
        # construction time — every instantiation requires connectivity.
        self.stop_words = self.get_stopwords_list()
        self.morph = pymorphy3.MorphAnalyzer()

    def fit(self, X, y=None):
        """No-op fit (stateless transformer); returns self per the sklearn API."""
        return self

    def transform(self, texts, y=None, lemmatize=True):
        """Preprocess an iterable of strings; returns a list of cleaned strings."""
        return [self.preprocess(text, lemmatize=lemmatize) for text in texts]

    def get_stopwords_list(self):
        """Download the stopwords-iso Russian stopword list and return it as a set."""
        url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ru/master/stopwords-ru.txt"
        frame = pd.read_csv(url, header=None, names=["stopwords"], encoding="utf-8")
        return set(frame["stopwords"])

    def clean(self, text):
        """Normalize raw text to lowercase words separated by single spaces."""
        text = text.lower()
        # Strip HTML tags FIRST. In the original this ran after the
        # punctuation removal below, which had already deleted '<' and '>',
        # so the tag pattern could never match (dead code / bug).
        text = re.sub(r'<.*?>', ' ', text)
        text = re.sub(r'http\S+', " ", text)   # URLs
        text = re.sub(r'@\w+', ' ', text)      # @mentions
        text = re.sub(r'#\w+', ' ', text)      # #hashtags
        text = re.sub(r'\d+', ' ', text)       # digit runs
        # Drop symbols other than word chars, whitespace and commas...
        text = re.sub(r'[^\w\s,]', '', text)
        # ...then remove remaining ASCII punctuation (incl. commas, underscores).
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Unicode NBSP and the General Punctuation block (dashes, thin spaces, ...).
        text = re.sub(r'[\u00A0\u2000-\u206F]', ' ', text)
        text = re.sub(r'[a-zA-Z]', '', text)   # drop Latin letters entirely
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def remove_stopwords(self, text):
        """Drop whitespace-separated tokens that occur in the stopword set."""
        return ' '.join(word for word in text.split() if word not in self.stop_words)

    def lemmatize(self, text):
        """Replace each whitespace-separated token with its pymorphy3 normal form."""
        morph = self.morph
        # ' '.join fixes the trailing space (and quadratic '+=' concatenation)
        # the original accumulator loop produced.
        return ' '.join(morph.parse(word)[0].normal_form for word in text.split())

    def preprocess(self, text, lemmatize=True):
        """Full processing of one text, with the option to disable lemmatization."""
        text = self.clean(text)
        text = self.remove_stopwords(text)
        if lemmatize:
            text = self.lemmatize(text)
        return text