Spaces:

Zalimannard
/

NLP_Homework_1

Sleeping

NLP_Homework_1 / src /text_preprocessing.py

Kolesnikov Dmitry

feat: Попытка навайбкодить 3 и 4 лабораторные

68545bc about 1 month ago

14 kB

	"""
	Модуль для предобработки текстовых данных для задач классификации.
	Включает очистку, токенизацию, лемматизацию, векторизацию и извлечение мета-признаков.
	"""

	from __future__ import annotations

	import re
	from typing import List, Dict, Any, Optional, Tuple
	from dataclasses import dataclass

	import numpy as np
	from bs4 import BeautifulSoup
	import spacy
	from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
	from gensim.models import Word2Vec, FastText, Doc2Vec
	from gensim.utils import simple_preprocess

	from src.text_cleaner import clean_text, remove_html, normalize_whitespace
	from src.classical_vectorizers import ClassicalVectorizers, VectorizationConfig


	@dataclass
	class PreprocessingConfig:
	    """Switches that control which text preprocessing steps are applied.

	    Attributes:
	        lowercase: Convert the text to lower case.
	        remove_html: Strip HTML markup.
	        remove_urls: Strip ``http://`` / ``https://`` links.
	        remove_emails: Strip e-mail addresses.
	        remove_numbers: Strip runs of digits.
	        lemmatize: Replace tokens by their lemmas (needs a spaCy model).
	        remove_stopwords: Drop stop words.
	        min_token_length: Minimum token length passed to the tokenizer.
	        emoji_to_text: Replace known emoji with a textual description.
	    """

	    # --- character-level cleaning ---
	    lowercase: bool = True
	    remove_html: bool = True
	    remove_urls: bool = True
	    remove_emails: bool = True
	    remove_numbers: bool = False

	    # --- token-level processing ---
	    lemmatize: bool = True
	    remove_stopwords: bool = False
	    min_token_length: int = 2

	    # --- emoji handling ---
	    emoji_to_text: bool = True


	class TextPreprocessor:
	    """Clean and normalise raw text before it is fed to a classifier.

	    The steps that run are selected by a ``PreprocessingConfig``. When
	    lemmatisation is requested, a Russian spaCy pipeline is loaded once in
	    the constructor; if none is installed, lemmatisation is switched off
	    and a gensim tokenizer is used instead.
	    """

	    # Alternation must use a bare "|": an escaped "\|" matches a literal
	    # pipe character and makes the pattern useless for real URLs.
	    _URL_RE = re.compile(
	        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
	    )

	    # The top-level domain is letters only (no "|" inside the class).
	    _EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

	    # Emoji -> Russian description. The surrounding spaces keep the
	    # description separated from neighbouring words; repeated spaces are
	    # collapsed later by whitespace normalisation in ``preprocess``.
	    _EMOJI_MAP = {
	        '😀': ' улыбка ',
	        '😃': ' радость ',
	        '😄': ' смех ',
	        '😁': ' веселье ',
	        '😆': ' хохот ',
	        '😅': ' пот ',
	        '😂': ' слезы радости ',
	        '🤣': ' хохот ',
	        '😊': ' улыбка ',
	        '😇': ' ангел ',
	        '🙂': ' улыбка ',
	        '🙃': ' перевернутое лицо ',
	        '😉': ' подмигивание ',
	        '😌': ' облегчение ',
	        '😍': ' любовь ',
	        '🥰': ' любовь ',
	        '😘': ' поцелуй ',
	        '😗': ' поцелуй ',
	        '😙': ' поцелуй ',
	        '😚': ' поцелуй ',
	        '😋': ' вкусно ',
	        '😛': ' язык ',
	        '😜': ' подмигивание ',
	        '😝': ' язык ',
	        '😞': ' грусть ',
	        '😟': ' беспокойство ',
	        '😠': ' злость ',
	        '😡': ' ярость ',
	        '😢': ' плач ',
	        '😣': ' страдание ',
	        '😤': ' упрямство ',
	        '😥': ' разочарование ',
	        '😦': ' удивление ',
	        '😧': ' шок ',
	        '😨': ' страх ',
	        '😩': ' усталость ',
	        '😪': ' сонливость ',
	        '😫': ' усталость ',
	        '😬': ' напряжение ',
	        '😭': ' плач ',
	        '😮': ' удивление ',
	        '😯': ' удивление ',
	        '😰': ' тревога ',
	        '😱': ' ужас ',
	        '😲': ' шок ',
	        '😳': ' смущение ',
	        '😴': ' сон ',
	        '😵': ' головокружение ',
	        '😶': ' без слов ',
	        '😷': ' маска ',
	        '🤐': ' молчание ',
	        '🤒': ' болезнь ',
	        '🤕': ' травма ',
	        '🤢': ' тошнота ',
	        '🤤': ' слюни ',
	        '🤥': ' ложь ',
	        '🤧': ' чихание ',
	        '🤨': ' подозрение ',
	        '🤩': ' звезды ',
	        '🤪': ' безумие ',
	        '🤫': ' тишина ',
	        '🤬': ' ругательство ',
	        '🤭': ' секрет ',
	        '🤮': ' рвота ',
	        '🤯': ' взрыв мозга ',
	    }

	    def __init__(self, config: Optional["PreprocessingConfig"] = None):
	        """Store the configuration and load a spaCy model if needed.

	        Args:
	            config: Preprocessing options; a default ``PreprocessingConfig``
	                is created when omitted.

	        Note:
	            When no Russian spaCy model can be loaded, a warning is printed
	            and ``config.lemmatize`` is set to ``False`` on the config
	            object that was passed in.
	        """
	        self.config = config or PreprocessingConfig()
	        self.nlp = None
	        if self.config.lemmatize:
	            # Try the small model first, then the medium one.
	            for model_name in ("ru_core_news_sm", "ru_core_news_md"):
	                try:
	                    self.nlp = spacy.load(model_name)
	                except OSError:
	                    continue
	                break
	            else:
	                print("⚠️ spaCy русская модель не найдена. Лемматизация отключена.")
	                self.config.lemmatize = False

	    def _remove_urls(self, text: str) -> str:
	        """Return *text* with http/https URLs removed."""
	        return self._URL_RE.sub('', text)

	    def _remove_emails(self, text: str) -> str:
	        """Return *text* with e-mail addresses removed."""
	        return self._EMAIL_RE.sub('', text)

	    def _emoji_to_text(self, text: str) -> str:
	        """Replace each known emoji by its textual description.

	        Emoji that are not in the lookup table are left untouched.
	        """
	        for symbol, description in self._EMOJI_MAP.items():
	            text = text.replace(symbol, description)
	        return text

	    def preprocess(self, text: str) -> str:
	        """Run the configured preprocessing pipeline on a single text.

	        Order of operations: HTML removal, URL removal, e-mail removal,
	        emoji replacement, whitespace normalisation, lower-casing, digit
	        removal, tokenisation (spaCy lemmas or gensim tokens), stop-word
	        removal, final whitespace normalisation.

	        Args:
	            text: Raw input text.

	        Returns:
	            The processed text as space-separated tokens; ``""`` for empty
	            or ``None`` input.

	        Note:
	            ``min_token_length`` is only applied by the gensim fallback
	            tokenizer, not on the spaCy lemmatisation path.
	        """
	        if not text:
	            return ""

	        cfg = self.config

	        # Character-level cleaning.
	        if cfg.remove_html:
	            text = remove_html(text)
	        if cfg.remove_urls:
	            text = self._remove_urls(text)
	        if cfg.remove_emails:
	            text = self._remove_emails(text)
	        if cfg.emoji_to_text:
	            text = self._emoji_to_text(text)

	        text = normalize_whitespace(text)

	        if cfg.lowercase:
	            text = text.lower()
	        if cfg.remove_numbers:
	            text = re.sub(r'\d+', '', text)

	        if cfg.lemmatize and self.nlp:
	            # Honour remove_stopwords here as well, using spaCy's own
	            # stop-word flag; previously the option was silently ignored
	            # on this path.
	            drop_stop = cfg.remove_stopwords
	            tokens = [
	                token.lemma_
	                for token in self.nlp(text)
	                if not token.is_punct
	                and not token.is_space
	                and not (drop_stop and token.is_stop)
	            ]
	        else:
	            # Fallback: plain gensim tokenisation without lemmatisation.
	            tokens = simple_preprocess(
	                text, deacc=False, min_len=cfg.min_token_length
	            )
	            if cfg.remove_stopwords:
	                from src.text_cleaner import remove_stopwords_tokens
	                tokens = remove_stopwords_tokens(tokens)

	        return normalize_whitespace(' '.join(tokens))

	    def preprocess_batch(self, texts: List[str]) -> List[str]:
	        """Apply :meth:`preprocess` to every text and return the results."""
	        return [self.preprocess(text) for text in texts]


	def extract_meta_features(texts: List[str]) -> np.ndarray:
	    """Compute simple surface statistics for each text.

	    Args:
	        texts: Input texts. Empty or ``None`` entries yield an all-zero row.

	    Returns:
	        Float array of shape ``(n_texts, 6)``, also ``(0, 6)`` for an empty
	        input, with columns:

	        0. text length in characters
	        1. mean length of whitespace-separated tokens
	        2. number of distinct tokens
	        3. share of punctuation characters
	        4. share of upper-case characters
	        5. share of digit characters

	        A text consisting only of whitespace keeps its length in column 0
	        and zeros elsewhere.
	    """
	    n_features = 6
	    punctuation = frozenset('.,;:!?()[]{}"\'-')

	    rows = []
	    for text in texts:
	        if not text:
	            rows.append([0] * n_features)
	            continue

	        n_chars = len(text)
	        words = text.split()
	        if not words:
	            rows.append([n_chars, 0, 0, 0, 0, 0])
	            continue

	        # n_chars is > 0 here, so the ratios need no zero-division guard.
	        rows.append([
	            n_chars,
	            np.mean([len(word) for word in words]),
	            len(set(words)),
	            sum(ch in punctuation for ch in text) / n_chars,
	            sum(ch.isupper() for ch in text) / n_chars,
	            sum(ch.isdigit() for ch in text) / n_chars,
	        ])

	    # Fixed dtype and an explicit reshape keep the result 2-D and float
	    # even when `texts` is empty or every row is all zeros.
	    return np.asarray(rows, dtype=float).reshape(-1, n_features)


	def vectorize_with_classical(texts: List[str], method: str = "tfidf",
	                             ngram_range: Tuple[int, int] = (1, 2),
	                             max_features: Optional[int] = None) -> Tuple[np.ndarray, Any]:
	    """Fit a classical vectorizer on the texts and return dense features.

	    Args:
	        texts: Texts to vectorize.
	        method: Vectorization method name forwarded to
	            ``VectorizationConfig`` (e.g. ``"tfidf"``, ``"bow"``).
	        ngram_range: Inclusive ``(min_n, max_n)`` n-gram range.
	        max_features: Upper bound on vocabulary size, or ``None``.

	    Returns:
	        A ``(features, vectorizer)`` tuple. ``features`` is converted with
	        ``toarray()`` when the fitted result offers that method.
	    """
	    settings = VectorizationConfig(
	        method=method,
	        ngram_range=ngram_range,
	        max_features=max_features,
	    )
	    vectorizer = ClassicalVectorizers(settings)

	    # fit_transform returns a pair; only the first element is used here.
	    matrix, _ = vectorizer.fit_transform(texts)
	    if hasattr(matrix, 'toarray'):
	        matrix = matrix.toarray()
	    return matrix, vectorizer


	def vectorize_with_embeddings(texts: List[str],
	                              model: Any,
	                              aggregation: str = "mean") -> np.ndarray:
	    """Turn texts into document vectors using a trained embedding model.

	    Each text is tokenized here with gensim's ``simple_preprocess``, so
	    callers pass plain strings rather than token lists.

	    Args:
	        texts: Texts to embed.
	        model: A trained ``Doc2Vec`` model, or a Word2Vec/FastText model
	            (its ``.wv`` is used when present), or a keyed-vectors object.
	        aggregation: How word vectors are combined into a document vector:
	            ``"mean"``, ``"max"`` or ``"sum"``. Any other value falls back
	            to ``"mean"``. Ignored for ``Doc2Vec`` models.

	    Returns:
	        Array of shape ``(n_texts, vector_size)``, also for empty input.
	        Texts with no tokens, or with no in-vocabulary tokens, get a zero
	        vector.
	    """
	    if isinstance(model, Doc2Vec):
	        # Doc2Vec infers a vector for the whole document directly.
	        dim = model.vector_size
	        inferred = []
	        for text in texts:
	            tokens = simple_preprocess(text, deacc=False, min_len=1)
	            inferred.append(model.infer_vector(tokens) if tokens else np.zeros(dim))
	        # reshape keeps the result 2-D when `texts` is empty.
	        return np.array(inferred).reshape(-1, dim)

	    # Word2Vec / FastText: look words up in the keyed vectors.
	    kv = model.wv if hasattr(model, 'wv') else model
	    vector_size = kv.vector_size if hasattr(kv, 'vector_size') else model.vector_size

	    # Unknown aggregation names fall back to the mean.
	    reducers = {"mean": np.mean, "max": np.max, "sum": np.sum}
	    reduce_fn = reducers.get(aggregation, np.mean)

	    doc_vectors = []
	    for text in texts:
	        known = [
	            kv[token]
	            for token in simple_preprocess(text, deacc=False, min_len=1)
	            if token in kv
	        ]
	        if known:
	            doc_vectors.append(reduce_fn(np.array(known), axis=0))
	        else:
	            doc_vectors.append(np.zeros(vector_size))

	    # reshape keeps the result 2-D when `texts` is empty.
	    return np.array(doc_vectors).reshape(-1, vector_size)


	if __name__ == "__main__":
	    # Small smoke test: run the pipeline on a few sample texts and print
	    # the results together with their meta-features.
	    sample_texts = [
	        "Это тестовый текст для проверки предобработки. https://example.com test@email.ru",
	        "Второй текст с эмодзи 😀 и HTML <p>тегами</p>.",
	        "Третий текст 123 с числами и ПРОПИСНЫМИ буквами!",
	    ]

	    # Lemmatisation is switched off so the demo runs without a spaCy model.
	    demo_config = PreprocessingConfig(
	        lowercase=True,
	        remove_html=True,
	        remove_urls=True,
	        remove_emails=True,
	        lemmatize=False,
	        remove_stopwords=False,
	    )
	    processed = TextPreprocessor(demo_config).preprocess_batch(sample_texts)

	    print("Обработанные тексты:")
	    for number, (original, cleaned) in enumerate(zip(sample_texts, processed), start=1):
	        print(f"\n{number}. Исходный: {original[:50]}...")
	        print(f" Обработанный: {cleaned[:50]}...")

	    # Meta-features of the processed texts.
	    meta_features = extract_meta_features(processed)
	    print(f"\nМета-признаки (форма: {meta_features.shape}):")
	    print(meta_features)