Spaces:

Zalimannard
/

NLP_Homework_1

Sleeping

File size: 12,493 Bytes

68545bc

"""
Нейросетевые методы классификации текстов: MLP, CNN, LSTM, GRU, гибридные архитектуры.
Примечание: Для трансформеров (BERT, RuBERT) требуется установка transformers и torch.
"""

from __future__ import annotations

import time
from dataclasses import dataclass
from typing import List, Dict, Any, Optional, Tuple

import numpy as np
import pandas as pd

# TensorFlow/Keras is an optional dependency: instead of failing at import
# time, record its availability in TENSORFLOW_AVAILABLE and print a warning.
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers, models, callbacks
    TENSORFLOW_AVAILABLE = True
except ImportError:
    TENSORFLOW_AVAILABLE = False
    print("⚠️ TensorFlow не установлен. Нейросетевые модели недоступны.")

# PyTorch + Transformers are optional as well; TRANSFORMERS_AVAILABLE is False
# if either import fails, because both live in the same try block.
try:
    import torch
    import torch.nn as nn
    from transformers import AutoTokenizer, AutoModel
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    print("⚠️ PyTorch/Transformers не установлены. Трансформерные модели недоступны.")


@dataclass
class NeuralConfig:
    """Configuration of a neural network classifier."""
    # Architecture selector: mlp, cnn, lstm, gru, cnn_lstm, birnn_attention
    model_type: str
    # Width of the input feature vector fed to the first layer
    input_dim: int
    # Number of units in the final softmax layer
    num_classes: int
    # NOTE(review): not read by any model builder visible in this file
    embedding_dim: int = 300
    # Units of the hidden Dense / recurrent layers
    hidden_dim: int = 128
    # Rate passed to the Dropout layers
    dropout: float = 0.5
    # Learning rate of the Adam optimizer
    learning_rate: float = 0.001
    # Number of training epochs passed to model.fit
    epochs: int = 10
    # Batch size passed to model.fit
    batch_size: int = 32
    # Fraction held out for validation when fit() gets no validation_data;
    # only used when greater than 0
    validation_split: float = 0.2


class NeuralClassifiers:
    """Keras text classifiers selected by ``NeuralConfig.model_type``.

    The model is built and compiled in ``__init__``. Every architecture takes
    a flat feature vector of width ``config.input_dim`` and ends in a softmax
    over ``config.num_classes`` classes, trained with sparse categorical
    cross-entropy (integer class labels).

    Attributes:
        config: The configuration the model was built from.
        model: The compiled Keras model.
        history: Object returned by the last ``model.fit`` call, or ``None``
            before training.
        train_time: Wall-clock seconds spent in the last ``fit`` call.
        predict_time: Wall-clock seconds spent in the last ``predict`` call.
    """

    def __init__(self, config: NeuralConfig):
        """Build and compile the model described by *config*.

        Raises:
            ImportError: If TensorFlow could not be imported.
            ValueError: If ``config.model_type`` is not a known architecture.
        """
        if not TENSORFLOW_AVAILABLE:
            raise ImportError("TensorFlow не установлен. Установите: pip install tensorflow")

        self.config = config
        self.model = self._create_model()
        self.history = None
        self.train_time = 0.0
        self.predict_time = 0.0

    def _create_model(self):
        """Dispatch to the builder matching ``config.model_type`` (case-insensitive).

        Raises:
            ValueError: If the model type is unknown.
        """
        model_type = self.config.model_type.lower()
        builders = {
            "mlp": self._create_mlp,
            "cnn": self._create_cnn,
            "lstm": self._create_lstm,
            "gru": self._create_gru,
            "cnn_lstm": self._create_cnn_lstm,
            "birnn_attention": self._create_birnn_attention,
        }
        builder = builders.get(model_type)
        if builder is None:
            raise ValueError(f"Неизвестный тип модели: {model_type}")
        return builder()

    def _compile(self, model):
        """Compile *model* with the optimizer, loss and metric shared by all architectures."""
        model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=self.config.learning_rate),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy'],
        )
        return model

    def _sequence_input(self):
        """Return the leading layers that turn a flat vector into a (input_dim, 1) sequence."""
        dim = self.config.input_dim
        # An explicit Input layer replaces the deprecated `input_shape=` kwarg.
        return [keras.Input(shape=(dim,)), layers.Reshape((dim, 1))]

    def _validation_kwargs(self, validation_data):
        """Return the validation-related keyword arguments for ``model.fit``.

        Explicit *validation_data* wins; otherwise ``config.validation_split``
        is used when positive. An empty dict means training runs without any
        validation set.
        """
        if validation_data is not None:
            return {"validation_data": validation_data}
        if self.config.validation_split > 0:
            return {"validation_split": self.config.validation_split}
        return {}

    def _create_mlp(self):
        """Multilayer perceptron: two hidden Dense layers with dropout."""
        cfg = self.config
        model = models.Sequential([
            keras.Input(shape=(cfg.input_dim,)),
            layers.Dense(cfg.hidden_dim, activation='relu'),
            layers.Dropout(cfg.dropout),
            layers.Dense(cfg.hidden_dim // 2, activation='relu'),
            layers.Dropout(cfg.dropout),
            layers.Dense(cfg.num_classes, activation='softmax'),
        ])
        return self._compile(model)

    def _create_cnn(self):
        """1D convolutional network over the feature vector.

        Simplified variant: there is no embedding layer, the already
        vectorized input is reshaped to (input_dim, 1) and the convolutions
        slide over feature positions.
        """
        cfg = self.config
        model = models.Sequential([
            *self._sequence_input(),
            layers.Conv1D(128, 3, activation='relu'),
            layers.MaxPooling1D(2),
            layers.Conv1D(64, 3, activation='relu'),
            layers.GlobalMaxPooling1D(),
            layers.Dense(cfg.hidden_dim, activation='relu'),
            layers.Dropout(cfg.dropout),
            layers.Dense(cfg.num_classes, activation='softmax'),
        ])
        return self._compile(model)

    def _create_lstm(self):
        """Single LSTM layer followed by dropout and the softmax head."""
        cfg = self.config
        model = models.Sequential([
            *self._sequence_input(),
            layers.LSTM(cfg.hidden_dim, return_sequences=False),
            layers.Dropout(cfg.dropout),
            layers.Dense(cfg.num_classes, activation='softmax'),
        ])
        return self._compile(model)

    def _create_gru(self):
        """Single GRU layer followed by dropout and the softmax head."""
        cfg = self.config
        model = models.Sequential([
            *self._sequence_input(),
            layers.GRU(cfg.hidden_dim, return_sequences=False),
            layers.Dropout(cfg.dropout),
            layers.Dense(cfg.num_classes, activation='softmax'),
        ])
        return self._compile(model)

    def _create_cnn_lstm(self):
        """Hybrid architecture: Conv1D + max pooling feeding an LSTM."""
        cfg = self.config
        model = models.Sequential([
            *self._sequence_input(),
            layers.Conv1D(64, 3, activation='relu'),
            layers.MaxPooling1D(2),
            layers.LSTM(cfg.hidden_dim, return_sequences=False),
            layers.Dropout(cfg.dropout),
            layers.Dense(cfg.num_classes, activation='softmax'),
        ])
        return self._compile(model)

    def _create_birnn_attention(self):
        """Bidirectional LSTM with average pooling (simplified, no real attention).

        The per-step outputs are aggregated with GlobalAveragePooling1D
        instead of a learned attention mechanism.
        """
        cfg = self.config
        model = models.Sequential([
            *self._sequence_input(),
            layers.Bidirectional(layers.LSTM(cfg.hidden_dim, return_sequences=True)),
            layers.GlobalAveragePooling1D(),
            layers.Dropout(cfg.dropout),
            layers.Dense(cfg.num_classes, activation='softmax'),
        ])
        return self._compile(model)

    def fit(self, X, y, validation_data=None):
        """Train the model and record history and training time.

        Args:
            X: Training features, passed to ``model.fit`` unchanged.
            y: Integer class labels, passed to ``model.fit`` unchanged.
            validation_data: Optional validation set forwarded to Keras. When
                omitted, ``config.validation_split`` is used if it is positive.

        Returns:
            ``self``, to allow call chaining.

        Raises:
            ImportError: If TensorFlow could not be imported.
        """
        if not TENSORFLOW_AVAILABLE:
            raise ImportError("TensorFlow не установлен")

        start = time.perf_counter()

        validation_kwargs = self._validation_kwargs(validation_data)
        # Without a validation set Keras never reports `val_loss`, so the
        # callbacks would only emit warnings; fall back to the training loss.
        monitor = 'val_loss' if validation_kwargs else 'loss'

        callbacks_list = [
            callbacks.EarlyStopping(monitor=monitor, patience=3, restore_best_weights=True),
            callbacks.ReduceLROnPlateau(monitor=monitor, factor=0.5, patience=2, min_lr=1e-7),
        ]

        self.history = self.model.fit(
            X, y,
            epochs=self.config.epochs,
            batch_size=self.config.batch_size,
            callbacks=callbacks_list,
            verbose=1,
            **validation_kwargs,
        )

        self.train_time = time.perf_counter() - start
        return self

    def predict(self, X):
        """Return the predicted class index per sample and record the prediction time."""
        start = time.perf_counter()
        predictions = self.model.predict(X, verbose=0)
        self.predict_time = time.perf_counter() - start
        return np.argmax(predictions, axis=1)

    def predict_proba(self, X):
        """Return the softmax output of the model for each sample."""
        return self.model.predict(X, verbose=0)


class TransformerClassifier:
    """
    Transformer-based classifier (BERT, RuBERT).

    Loads a pretrained tokenizer and encoder and attaches a small
    feed-forward classification head. Requires ``transformers`` and ``torch``.
    Training and inference are not implemented: ``fit`` and ``predict`` raise
    ``NotImplementedError``.
    """

    def __init__(self, model_name: str = "DeepPavlov/rubert-base-cased",
                 num_classes: int = 2,
                 max_length: int = 512,
                 learning_rate: float = 2e-5,
                 epochs: int = 3,
                 batch_size: int = 16):
        """Load the pretrained encoder and build the classification head.

        Raises:
            ImportError: If PyTorch or Transformers could not be imported.
        """
        if not TRANSFORMERS_AVAILABLE:
            message = (
                "PyTorch и Transformers не установлены. "
                "Установите: pip install torch transformers"
            )
            raise ImportError(message)

        # Hyperparameters are stored as-is; nothing in this class reads them yet.
        self.model_name = model_name
        self.num_classes = num_classes
        self.max_length = max_length
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

        # Classification head: encoder hidden size -> 256 -> num_classes.
        encoder_width = self.model.config.hidden_size
        head_layers = [
            nn.Linear(encoder_width, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

        # Prefer the GPU when one is available, then move both modules to it.
        if torch.cuda.is_available():
            device_name = "cuda"
        else:
            device_name = "cpu"
        self.device = torch.device(device_name)
        for module in (self.model, self.classifier):
            module.to(self.device)

    def fit(self, texts: List[str], labels: List[int]):
        """Train the transformer model (stub: always raises NotImplementedError)."""
        reason = (
            "Полная реализация обучения трансформеров требует дополнительной настройки. "
            "Рекомендуется использовать готовые решения из библиотеки transformers."
        )
        raise NotImplementedError(reason)

    def predict(self, texts: List[str]):
        """Predict classes (stub: always raises NotImplementedError)."""
        raise NotImplementedError("См. fit()")


if __name__ == "__main__":
    # Smoke test on synthetic data; runs only when TensorFlow is available.
    if TENSORFLOW_AVAILABLE:
        from sklearn.datasets import make_classification
        from sklearn.metrics import accuracy_score
        from sklearn.model_selection import train_test_split

        n_features = 100
        n_classes = 3

        # make_classification requires
        # n_classes * n_clusters_per_class <= 2 ** n_informative.
        # With the defaults (2 clusters per class, 2 informative features)
        # three classes raise ValueError, so n_informative is set explicitly.
        X, y = make_classification(
            n_samples=1000,
            n_features=n_features,
            n_informative=10,
            n_classes=n_classes,
            random_state=42,
        )
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        config = NeuralConfig(
            model_type="mlp",
            input_dim=n_features,
            num_classes=n_classes,
            epochs=5,
        )

        classifier = NeuralClassifiers(config)
        classifier.fit(X_train, y_train)
        predictions = classifier.predict(X_test)

        print(f"Точность: {accuracy_score(y_test, predictions):.4f}")
    else:
        print("TensorFlow не установлен. Тесты пропущены.")