Spaces:

Zalimannard
/

NLP_Homework_1

Sleeping

File size: 13,970 Bytes

68545bc

"""
Модуль для предобработки текстовых данных для задач классификации.
Включает очистку, токенизацию, лемматизацию, векторизацию и извлечение мета-признаков.
"""

from __future__ import annotations

import re
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass

import numpy as np
from bs4 import BeautifulSoup
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec, FastText, Doc2Vec
from gensim.utils import simple_preprocess

from src.text_cleaner import clean_text, remove_html, normalize_whitespace
from src.classical_vectorizers import ClassicalVectorizers, VectorizationConfig


@dataclass
class PreprocessingConfig:
    """Text preprocessing configuration read by ``TextPreprocessor``."""
    # Lower-case the text after cleaning.
    lowercase: bool = True
    # Strip HTML markup.
    remove_html: bool = True
    # Strip http/https URLs.
    remove_urls: bool = True
    # Strip e-mail addresses.
    remove_emails: bool = True
    # Delete runs of digits.
    remove_numbers: bool = False
    # Lemmatise with a Russian spaCy model; TextPreprocessor resets this flag
    # to False in place when no such model can be loaded.
    lemmatize: bool = True
    # Request stop-word removal during preprocessing.
    remove_stopwords: bool = False
    # Minimum token length handed to gensim's simple_preprocess (min_len)
    # when the non-spaCy tokeniser is used.
    min_token_length: int = 2
    # Replace known emoji characters with short Russian descriptions.
    emoji_to_text: bool = True


class TextPreprocessor:
    """Clean and normalise raw texts before classification.

    The pipeline in :meth:`preprocess` is driven by a ``PreprocessingConfig``:
    HTML / URL / e-mail stripping, emoji replacement, whitespace
    normalisation, lower-casing, optional digit removal, and finally either
    spaCy lemmatisation or gensim ``simple_preprocess`` tokenisation.

    Attributes:
        config: Active ``PreprocessingConfig``. ``config.lemmatize`` is set to
            ``False`` in place when no Russian spaCy model can be loaded.
        nlp: Loaded spaCy pipeline, or ``None`` when lemmatisation is disabled
            or no model was found.
    """

    # spaCy models tried in order; the first one that loads is used.
    _SPACY_MODELS = ("ru_core_news_sm", "ru_core_news_md")

    # Patterns are compiled once at class creation instead of on every call.
    _URL_RE = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    # The TLD class is [A-Za-z]; a "|" inside a character class is a literal
    # pipe, so the previous [A-Z|a-z] wrongly accepted "|" as part of the TLD.
    _EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
    _DIGITS_RE = re.compile(r'\d+')

    # Emoji -> Russian description (padded with spaces so that the inserted
    # word never glues to its neighbours).
    _EMOJI_MAP = {
        '😀': ' улыбка ',
        '😃': ' радость ',
        '😄': ' смех ',
        '😁': ' веселье ',
        '😆': ' хохот ',
        '😅': ' пот ',
        '😂': ' слезы радости ',
        '🤣': ' хохот ',
        '😊': ' улыбка ',
        '😇': ' ангел ',
        '🙂': ' улыбка ',
        '🙃': ' перевернутое лицо ',
        '😉': ' подмигивание ',
        '😌': ' облегчение ',
        '😍': ' любовь ',
        '🥰': ' любовь ',
        '😘': ' поцелуй ',
        '😗': ' поцелуй ',
        '😙': ' поцелуй ',
        '😚': ' поцелуй ',
        '😋': ' вкусно ',
        '😛': ' язык ',
        '😜': ' подмигивание ',
        '😝': ' язык ',
        '😞': ' грусть ',
        '😟': ' беспокойство ',
        '😠': ' злость ',
        '😡': ' ярость ',
        '😢': ' плач ',
        '😣': ' страдание ',
        '😤': ' упрямство ',
        '😥': ' разочарование ',
        '😦': ' удивление ',
        '😧': ' шок ',
        '😨': ' страх ',
        '😩': ' усталость ',
        '😪': ' сонливость ',
        '😫': ' усталость ',
        '😬': ' напряжение ',
        '😭': ' плач ',
        '😮': ' удивление ',
        '😯': ' удивление ',
        '😰': ' тревога ',
        '😱': ' ужас ',
        '😲': ' шок ',
        '😳': ' смущение ',
        '😴': ' сон ',
        '😵': ' головокружение ',
        '😶': ' без слов ',
        '😷': ' маска ',
        '🤐': ' молчание ',
        '🤒': ' болезнь ',
        '🤕': ' травма ',
        '🤢': ' тошнота ',
        '🤤': ' слюни ',
        '🤥': ' ложь ',
        '🤧': ' чихание ',
        '🤨': ' подозрение ',
        '🤩': ' звезды ',
        '🤪': ' безумие ',
        '🤫': ' тишина ',
        '🤬': ' ругательство ',
        '🤭': ' секрет ',
        '🤮': ' рвота ',
        '🤯': ' взрыв мозга ',
    }
    # One alternation over all known emoji lets a single pass replace them all.
    _EMOJI_RE = re.compile("|".join(map(re.escape, _EMOJI_MAP)))

    def __init__(self, config: Optional["PreprocessingConfig"] = None):
        """Store the configuration and load a spaCy model if lemmatising.

        Args:
            config: Preprocessing options; a default ``PreprocessingConfig``
                is created when omitted.

        If ``config.lemmatize`` is set but none of the Russian spaCy models
        can be loaded, a warning is printed and ``config.lemmatize`` is
        switched off on the stored config object.
        """
        self.config = config or PreprocessingConfig()
        self.nlp = None
        if self.config.lemmatize:
            for model_name in self._SPACY_MODELS:
                try:
                    self.nlp = spacy.load(model_name)
                except OSError:
                    # Model not installed - try the next candidate.
                    continue
                break
            else:
                print("⚠️ spaCy русская модель не найдена. Лемматизация отключена.")
                self.config.lemmatize = False

    def _remove_urls(self, text: str) -> str:
        """Return *text* with http/https URLs removed."""
        return self._URL_RE.sub('', text)

    def _remove_emails(self, text: str) -> str:
        """Return *text* with e-mail addresses removed."""
        return self._EMAIL_RE.sub('', text)

    def _emoji_to_text(self, text: str) -> str:
        """Replace known emoji with their Russian descriptions (simplified)."""
        return self._EMOJI_RE.sub(
            lambda match: self._EMOJI_MAP[match.group(0)], text
        )

    def preprocess(self, text: str) -> str:
        """Run the configured preprocessing pipeline on a single text.

        Steps, in order: HTML removal, URL removal, e-mail removal, emoji
        replacement, whitespace normalisation, lower-casing, digit removal,
        tokenisation (spaCy lemmas, or ``simple_preprocess`` otherwise),
        stop-word removal, final whitespace normalisation. Each optional step
        is gated by the matching ``config`` flag.

        Args:
            text: Raw input text.

        Returns:
            The processed tokens joined by single spaces; ``""`` for falsy
            input.
        """
        if not text:
            return ""

        cfg = self.config

        if cfg.remove_html:
            text = remove_html(text)
        if cfg.remove_urls:
            text = self._remove_urls(text)
        if cfg.remove_emails:
            text = self._remove_emails(text)
        if cfg.emoji_to_text:
            text = self._emoji_to_text(text)

        text = normalize_whitespace(text)

        if cfg.lowercase:
            text = text.lower()
        if cfg.remove_numbers:
            text = self._DIGITS_RE.sub('', text)

        if cfg.lemmatize and self.nlp is not None:
            # Stop words are filtered here via spaCy's own flag; previously
            # remove_stopwords was silently ignored on this path.
            drop_stopwords = cfg.remove_stopwords
            tokens = [
                token.lemma_
                for token in self.nlp(text)
                if not token.is_punct
                and not token.is_space
                and not (drop_stopwords and token.is_stop)
            ]
        else:
            # Plain tokenisation; min_token_length only applies on this path.
            tokens = simple_preprocess(
                text, deacc=False, min_len=cfg.min_token_length
            )
            if cfg.remove_stopwords:
                from src.text_cleaner import remove_stopwords_tokens
                tokens = remove_stopwords_tokens(tokens)

        return normalize_whitespace(' '.join(tokens))

    def preprocess_batch(self, texts: List[str]) -> List[str]:
        """Apply :meth:`preprocess` to every text and return the results."""
        return [self.preprocess(text) for text in texts]


def extract_meta_features(texts: List[str]) -> np.ndarray:
    """
    Extract simple surface-level meta features from texts.

    Args:
        texts: Input texts; falsy entries (``""`` / ``None``) yield an
            all-zero row.

    Returns:
        Float array of shape ``(n_texts, 6)`` - also ``(0, 6)`` for an empty
        input list - whose columns are:
        - text length in characters
        - mean whitespace-token length
        - number of unique whitespace tokens (case-sensitive)
        - share of punctuation characters
        - share of upper-case characters
        - share of digit characters
        A text made only of whitespace keeps its length and gets zeros
        elsewhere.
    """
    n_features = 6
    punctuation = frozenset('.,;:!?()[]{}"\'-')
    rows = []

    for text in texts:
        if not text:
            rows.append([0.0] * n_features)
            continue

        text_length = len(text)
        tokens = text.split()
        if not tokens:
            rows.append([text_length, 0.0, 0.0, 0.0, 0.0, 0.0])
            continue

        # text is non-empty here, so dividing by text_length is safe.
        punct_count = sum(1 for ch in text if ch in punctuation)
        upper_count = sum(1 for ch in text if ch.isupper())
        digit_count = sum(1 for ch in text if ch.isdigit())

        rows.append([
            text_length,
            sum(len(token) for token in tokens) / len(tokens),
            len(set(tokens)),
            punct_count / text_length,
            upper_count / text_length,
            digit_count / text_length,
        ])

    # Explicit dtype + reshape keep the result 2-D float even when `texts`
    # is empty or every row is all zeros.
    return np.array(rows, dtype=float).reshape(-1, n_features)


def vectorize_with_classical(texts: List[str], method: str = "tfidf", 
                            ngram_range: Tuple[int, int] = (1, 2),
                            max_features: Optional[int] = None) -> Tuple[np.ndarray, Any]:
    """
    Fit a classical vectorizer on the texts and return the feature matrix.

    Args:
        texts: List of texts to fit on and transform.
        method: Vectorization method name (tfidf, bow).
        ngram_range: Inclusive (min_n, max_n) n-gram range.
        max_features: Upper bound on the number of features, or None.

    Returns:
        A pair ``(features, vectorizer)``: the feature matrix (converted with
        ``toarray()`` when the result offers that method) and the fitted
        ``ClassicalVectorizers`` instance.
    """
    vectorizer = ClassicalVectorizers(
        VectorizationConfig(
            method=method,
            ngram_range=ngram_range,
            max_features=max_features,
        )
    )

    # fit_transform yields a pair; only its first element is needed here.
    fitted = vectorizer.fit_transform(texts)
    matrix, _ = fitted

    if hasattr(matrix, 'toarray'):
        matrix = matrix.toarray()
    return matrix, vectorizer


def vectorize_with_embeddings(texts: List[str], 
                              model: Any,
                              aggregation: str = "mean") -> np.ndarray:
    """
    Turn texts into document vectors using a trained embedding model.

    Every text is a plain string that is tokenised here with gensim's
    ``simple_preprocess(deacc=False, min_len=1)``.
    NOTE(review): ``simple_preprocess`` applies its own defaults as well
    (lower-casing, maximum token length) - confirm they suit the model's
    vocabulary.

    Args:
        texts: List of text strings.
        model: A ``Doc2Vec`` model, a model exposing ``.wv`` (Word2Vec /
            FastText), or a keyed-vectors-like object supporting
            ``token in kv`` and ``kv[token]``.
        aggregation: How word vectors are combined for non-Doc2Vec models:
            "mean", "max" or "sum". Any other value falls back to "mean".

    Returns:
        Array of shape ``(n_texts, vector_size)``; ``(0, vector_size)`` for an
        empty input list. Texts with no usable tokens get a zero vector.
    """
    if isinstance(model, Doc2Vec):
        # Doc2Vec infers a document vector directly from the token list.
        vector_size = model.vector_size
        vectors = []
        for text in texts:
            tokens = simple_preprocess(text, deacc=False, min_len=1)
            vectors.append(
                model.infer_vector(tokens) if tokens else np.zeros(vector_size)
            )
    else:
        # Word2Vec / FastText: aggregate the vectors of in-vocabulary tokens.
        kv = model.wv if hasattr(model, 'wv') else model
        vector_size = kv.vector_size if hasattr(kv, 'vector_size') else model.vector_size

        # Resolve the reducer once; unknown names keep the "mean" fallback.
        reducers = {"mean": np.mean, "max": np.max, "sum": np.sum}
        reduce_fn = reducers.get(aggregation, np.mean)

        vectors = []
        for text in texts:
            tokens = simple_preprocess(text, deacc=False, min_len=1)
            word_vectors = [kv[token] for token in tokens if token in kv]
            if word_vectors:
                vectors.append(reduce_fn(np.array(word_vectors), axis=0))
            else:
                vectors.append(np.zeros(vector_size))

    if not vectors:
        # np.array([]) would be 1-D; keep the documented 2-D layout.
        return np.zeros((0, vector_size))
    return np.array(vectors)


if __name__ == "__main__":
    # Manual smoke test: preprocess a few sample texts and print the results.
    demo_texts = [
        "Это тестовый текст для проверки предобработки. https://example.com test@email.ru",
        "Второй текст с эмодзи 😀 и HTML <p>тегами</p>.",
        "Третий текст 123 с числами и ПРОПИСНЫМИ буквами!"
    ]

    # Lemmatisation is disabled for this test run.
    demo_config = PreprocessingConfig(
        lowercase=True,
        remove_html=True,
        remove_urls=True,
        remove_emails=True,
        lemmatize=False,
        remove_stopwords=False,
    )
    cleaned_texts = TextPreprocessor(demo_config).preprocess_batch(demo_texts)

    print("Обработанные тексты:")
    for i, orig in enumerate(demo_texts):
        proc = cleaned_texts[i]
        print(f"\n{i+1}. Исходный: {orig[:50]}...")
        print(f"   Обработанный: {proc[:50]}...")

    # Meta features computed on the already preprocessed texts.
    meta_features = extract_meta_features(cleaned_texts)
    print(f"\nМета-признаки (форма: {meta_features.shape}):")
    print(meta_features)