Spaces:

Zalimannard
/

NLP_Homework_1

Sleeping

File size: 18,227 Bytes

54ccdcb

# src/train_subword.py
"""
Модуль для обучения подсловных моделей токенизации (BPE, WordPiece, Unigram).
Поддерживает обучение моделей с различными параметрами и их сравнительный анализ.
"""

import os
import json
import time
from typing import List, Dict, Tuple, Optional, Any
from dataclasses import dataclass
from pathlib import Path
import pandas as pd

# Импорты для различных библиотек токенизации
try:
    from tokenizers import Tokenizer, trainers, models, pre_tokenizers, normalizers
    from tokenizers.trainers import BpeTrainer, WordPieceTrainer, UnigramTrainer
    TOKENIZERS_AVAILABLE = True
except ImportError:
    TOKENIZERS_AVAILABLE = False

try:
    import sentencepiece as spm
    SENTENCEPIECE_AVAILABLE = True
except ImportError:
    SENTENCEPIECE_AVAILABLE = False


@dataclass
class SubwordModelConfig:
    """Settings for training a single subword tokenization model.

    Attributes:
        model_type: Algorithm name: 'bpe', 'wordpiece' or 'unigram'.
        vocab_size: Target vocabulary size passed to the trainer.
        min_frequency: Minimum frequency threshold passed to the trainer.
        special_tokens: Special tokens to register. ``None`` (the default)
            is replaced by the BERT-style set in ``__post_init__``.
        model_name: Name used for the saved model. An empty string (the
            default) is replaced by ``"<model_type>_<vocab_size>"``.
    """
    model_type: str  # 'bpe', 'wordpiece', 'unigram'
    vocab_size: int
    min_frequency: int = 2
    # Optional because None is the "use the defaults" sentinel; a literal
    # list default is not allowed on a dataclass field.
    special_tokens: Optional[List[str]] = None
    model_name: str = ""

    def __post_init__(self):
        """Fill in the defaults that depend on other fields."""
        if self.special_tokens is None:
            # A fresh list per instance, so configs never share state.
            self.special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
        if not self.model_name:
            self.model_name = f"{self.model_type}_{self.vocab_size}"


@dataclass
class SubwordMetrics:
    """Evaluation metrics for one subword model, as produced by evaluate_model."""
    # Model identifier; evaluate_model uses the stem of the model file path.
    model_name: str
    # Vocabulary size reported by the loaded tokenizer.
    vocab_size: int
    # Share of whitespace-separated words that encode to more than one token.
    fragmentation_rate: float
    # Number of whitespace-separated words divided by number of tokens.
    compression_ratio: float
    # Share of texts whose decoded form equals the original (after strip()).
    reconstruction_accuracy: float
    # Training wall time in seconds, or 0 when no timing was recorded.
    training_time: float
    # Out-of-vocabulary rate; evaluate_model currently always passes 0.0.
    oov_rate: float = 0.0


class SubwordModelTrainer:
    """Класс для обучения и сравнения подсловных моделей токенизации."""
    
    def __init__(self, output_dir: str = "models"):
        """
        Инициализация тренера.
        
        Args:
            output_dir: Директория для сохранения моделей
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        self.models = {}
        self.metrics = {}
    
    def prepare_corpus(self, input_path: str, output_path: str, text_field: str = 'text') -> int:
        """
        Подготавливает корпус для обучения подсловных моделей.
        
        Args:
            input_path: Путь к JSONL файлу с корпусом
            output_path: Путь для сохранения подготовленного корпуса
            text_field: Поле с текстом статьи
        
        Returns:
            Количество обработанных статей
        """
        import json
        
        texts = []
        with open(input_path, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    article = json.loads(line.strip())
                    if text_field in article and article[text_field].strip():
                        texts.append(article[text_field])
                except json.JSONDecodeError:
                    continue
        
        # Сохраняем корпус как текстовый файл
        with open(output_path, 'w', encoding='utf-8') as f:
            for text in texts:
                f.write(text + '\n')
        
        return len(texts)
    
    def train_bpe_model(self, config: SubwordModelConfig, corpus_path: str) -> str:
        """
        Train a BPE tokenizer with the ``tokenizers`` library and save it.

        The model is written to ``<output_dir>/<config.model_name>.json`` and
        the elapsed training time is recorded in ``self.metrics``.

        Args:
            config: Model configuration (vocabulary size, minimum frequency,
                special tokens, model name).
            corpus_path: Path to the plain-text training corpus.

        Returns:
            Path to the saved model file.

        Raises:
            ImportError: If the ``tokenizers`` library is not installed.
        """
        if not TOKENIZERS_AVAILABLE:
            raise ImportError("Библиотека tokenizers не установлена")

        # Give the model an unknown-token so symbols unseen during training
        # map to "[UNK]". It is set only when "[UNK]" is registered as a
        # special token, because an unk_token absent from the vocabulary
        # cannot be emitted.
        unk_token = "[UNK]" if "[UNK]" in config.special_tokens else None
        tokenizer = Tokenizer(models.BPE(unk_token=unk_token))
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

        trainer = BpeTrainer(
            vocab_size=config.vocab_size,
            min_frequency=config.min_frequency,
            special_tokens=config.special_tokens
        )

        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments.
        start_time = time.perf_counter()
        tokenizer.train([corpus_path], trainer)
        training_time = time.perf_counter() - start_time

        model_path = self.output_dir / f"{config.model_name}.json"
        tokenizer.save(str(model_path))

        self.metrics[config.model_name] = {
            'training_time': training_time,
            'model_type': 'bpe'
        }

        return str(model_path)
    
    def train_wordpiece_model(self, config: SubwordModelConfig, corpus_path: str) -> str:
        """
        Train a WordPiece tokenizer with the ``tokenizers`` library and save it.

        The model is written to ``<output_dir>/<config.model_name>.json`` and
        the elapsed training time is recorded in ``self.metrics``. A
        WordPiece decoder is attached so that ``decode()`` merges "##"
        continuation pieces back into words.

        Args:
            config: Model configuration (vocabulary size, minimum frequency,
                special tokens, model name).
            corpus_path: Path to the plain-text training corpus.

        Returns:
            Path to the saved model file.

        Raises:
            ImportError: If the ``tokenizers`` library is not installed.
        """
        if not TOKENIZERS_AVAILABLE:
            raise ImportError("Библиотека tokenizers не установлена")

        # Imported locally: the module-level tokenizers import does not
        # bring ``decoders`` into scope.
        from tokenizers import decoders

        tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        # Without a decoder, decode() joins pieces with spaces and keeps the
        # "##" prefixes, so a round trip can never match the input. The
        # decoder is saved with the model file.
        tokenizer.decoder = decoders.WordPiece()

        trainer = WordPieceTrainer(
            vocab_size=config.vocab_size,
            min_frequency=config.min_frequency,
            special_tokens=config.special_tokens
        )

        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments.
        start_time = time.perf_counter()
        tokenizer.train([corpus_path], trainer)
        training_time = time.perf_counter() - start_time

        model_path = self.output_dir / f"{config.model_name}.json"
        tokenizer.save(str(model_path))

        self.metrics[config.model_name] = {
            'training_time': training_time,
            'model_type': 'wordpiece'
        }

        return str(model_path)
    
    def train_unigram_model(self, config: SubwordModelConfig, corpus_path: str) -> str:
        """
        Train a Unigram tokenizer with the ``tokenizers`` library and save it.

        The model is written to ``<output_dir>/<config.model_name>.json`` and
        the elapsed training time is recorded in ``self.metrics``.

        ``config.min_frequency`` is not used here: ``UnigramTrainer`` has no
        such option (see the tokenizers trainer reference).

        Args:
            config: Model configuration (vocabulary size, special tokens,
                model name).
            corpus_path: Path to the plain-text training corpus.

        Returns:
            Path to the saved model file.

        Raises:
            ImportError: If the ``tokenizers`` library is not installed.
        """
        if not TOKENIZERS_AVAILABLE:
            raise ImportError("Библиотека tokenizers не установлена")

        tokenizer = Tokenizer(models.Unigram())
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

        trainer_options = {
            'vocab_size': config.vocab_size,
            'special_tokens': config.special_tokens,
        }
        # Register the unknown token with the trainer so the trained model
        # has an id for pieces it has never seen. Done only when "[UNK]" is
        # one of the configured special tokens.
        if "[UNK]" in config.special_tokens:
            trainer_options['unk_token'] = "[UNK]"
        trainer = UnigramTrainer(**trainer_options)

        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments.
        start_time = time.perf_counter()
        tokenizer.train([corpus_path], trainer)
        training_time = time.perf_counter() - start_time

        model_path = self.output_dir / f"{config.model_name}.json"
        tokenizer.save(str(model_path))

        self.metrics[config.model_name] = {
            'training_time': training_time,
            'model_type': 'unigram'
        }

        return str(model_path)
    
    def train_sentencepiece_model(self, config: SubwordModelConfig, corpus_path: str) -> str:
        """
        Train a SentencePiece model and record its training time.

        SentencePiece writes ``<output_dir>/<config.model_name>.model`` (and
        a matching ``.vocab`` file). ``config.min_frequency`` is not used.

        Args:
            config: Model configuration. ``model_type`` is mapped to a
                SentencePiece model type; 'wordpiece' maps to 'word' because
                SentencePiece has no WordPiece mode, and any unrecognised
                value falls back to 'bpe'.
            corpus_path: Path to the plain-text training corpus.

        Returns:
            Path to the saved ``.model`` file.

        Raises:
            ImportError: If the ``sentencepiece`` library is not installed.
        """
        if not SENTENCEPIECE_AVAILABLE:
            raise ImportError("Библиотека sentencepiece не установлена")

        model_prefix = str(self.output_dir / config.model_name)

        model_type_map = {
            'bpe': 'bpe',
            'wordpiece': 'word',  # SentencePiece has no direct WordPiece support
            'unigram': 'unigram'
        }
        spm_model_type = model_type_map.get(config.model_type, 'bpe')

        # Options go in as keyword arguments rather than one space-joined
        # flag string: a space inside a path or token would otherwise be
        # parsed as a flag separator.
        start_time = time.perf_counter()
        spm.SentencePieceTrainer.train(
            input=corpus_path,
            model_prefix=model_prefix,
            vocab_size=config.vocab_size,
            model_type=spm_model_type,
            character_coverage=0.9995,
            normalization_rule_name='nfkc',
            user_defined_symbols=list(config.special_tokens),
        )
        training_time = time.perf_counter() - start_time

        self.metrics[config.model_name] = {
            'training_time': training_time,
            'model_type': f'sentencepiece_{spm_model_type}'
        }

        return f"{model_prefix}.model"
    
    def train_model(self, config: SubwordModelConfig, corpus_path: str, use_sentencepiece: bool = False) -> str:
        """
        Обучает модель указанного типа.
        
        Args:
            config: Конфигурация модели
            corpus_path: Путь к корпусу
            use_sentencepiece: Использовать SentencePiece вместо tokenizers
        
        Returns:
            Путь к сохраненной модели
        """
        print(f"Обучаем модель {config.model_name} ({config.model_type})...")
        
        if use_sentencepiece and SENTENCEPIECE_AVAILABLE:
            return self.train_sentencepiece_model(config, corpus_path)
        
        if config.model_type == 'bpe':
            return self.train_bpe_model(config, corpus_path)
        elif config.model_type == 'wordpiece':
            return self.train_wordpiece_model(config, corpus_path)
        elif config.model_type == 'unigram':
            return self.train_unigram_model(config, corpus_path)
        else:
            raise ValueError(f"Неподдерживаемый тип модели: {config.model_type}")
    
    def evaluate_model(self, model_path: str, test_texts: List[str]) -> SubwordMetrics:
        """
        Score a saved tokenizer on a list of texts.

        The model is loaded with ``Tokenizer.from_file``. Words are the
        result of ``str.split()`` on each text.

        Args:
            model_path: Path to the saved tokenizer file.
            test_texts: Texts to evaluate on.

        Returns:
            Metrics for the model: vocabulary size, share of words split
            into several tokens, words-per-token ratio, share of texts that
            survive an encode/decode round trip (compared after ``strip()``),
            and the training time recorded for this model name, if any.

        Raises:
            ImportError: If the ``tokenizers`` library is not installed.
        """
        if not TOKENIZERS_AVAILABLE:
            raise ImportError("Библиотека tokenizers не установлена")

        tokenizer = Tokenizer.from_file(model_path)

        token_count = 0
        word_count = 0
        split_word_count = 0
        mismatch_count = 0

        for sample in test_texts:
            encoding = tokenizer.encode(sample)
            sample_tokens = encoding.tokens
            round_trip = tokenizer.decode(encoding.ids)

            sample_words = sample.split()
            word_count += len(sample_words)
            token_count += len(sample_tokens)

            # A word counts as fragmented when, encoded on its own, it
            # yields more than one token.
            split_word_count += sum(
                1 for item in sample_words
                if len(tokenizer.encode(item).tokens) > 1
            )

            if round_trip.strip() != sample.strip():
                mismatch_count += 1

        # Degenerate inputs (no words, no tokens, no texts) use fixed
        # fallback values instead of dividing by zero.
        if word_count > 0:
            fragmentation_rate = split_word_count / word_count
        else:
            fragmentation_rate = 0

        if token_count > 0:
            compression_ratio = word_count / token_count
        else:
            compression_ratio = 1

        if test_texts:
            reconstruction_accuracy = 1 - (mismatch_count / len(test_texts))
        else:
            reconstruction_accuracy = 1

        # Timing recorded by the train_* methods is keyed by model name,
        # which here is taken from the file stem.
        model_name = Path(model_path).stem
        recorded = self.metrics.get(model_name, {})

        return SubwordMetrics(
            model_name=model_name,
            vocab_size=tokenizer.get_vocab_size(),
            fragmentation_rate=fragmentation_rate,
            compression_ratio=compression_ratio,
            reconstruction_accuracy=reconstruction_accuracy,
            training_time=recorded.get('training_time', 0),
            oov_rate=0.0  # Not computed here
        )
    
    def train_multiple_models(self, corpus_path: str, vocab_sizes: List[int] = None) -> Dict[str, str]:
        """
        Train every combination of model type and vocabulary size.

        The model types are 'bpe', 'wordpiece' and 'unigram'. A failure for
        one combination is printed and does not stop the others.

        Args:
            corpus_path: Path to the training corpus.
            vocab_sizes: Vocabulary sizes to train; defaults to
                8000, 16000 and 32000 when ``None``.

        Returns:
            Mapping of model name to saved model path for the models that
            trained successfully.
        """
        if vocab_sizes is None:
            vocab_sizes = [8000, 16000, 32000]

        # Model type varies slowest, vocabulary size fastest.
        combinations = (
            (kind, size)
            for kind in ['bpe', 'wordpiece', 'unigram']
            for size in vocab_sizes
        )

        saved_paths = {}
        for kind, size in combinations:
            config = SubwordModelConfig(
                model_type=kind,
                vocab_size=size,
                min_frequency=2
            )

            try:
                saved_paths[config.model_name] = self.train_model(config, corpus_path)
                print(f"Модель {config.model_name} обучена успешно")
            except Exception as e:
                print(f"Ошибка при обучении модели {config.model_name}: {e}")

        return saved_paths
    
    def compare_models(self, model_paths: Dict[str, str], test_texts: List[str]) -> pd.DataFrame:
        """
        Сравнивает несколько обученных моделей.
        
        Args:
            model_paths: Словарь {имя_модели: путь_к_модели}
            test_texts: Тестовые тексты
        
        Returns:
            DataFrame с результатами сравнения
        """
        results = []
        
        for model_name, model_path in model_paths.items():
            try:
                metrics = self.evaluate_model(model_path, test_texts)
                results.append({
                    'Модель': model_name,
                    'Тип': metrics.model_name.split('_')[0],
                    'Размер словаря': metrics.vocab_size,
                    'Процент фрагментации': round(metrics.fragmentation_rate * 100, 2),
                    'Коэффициент сжатия': round(metrics.compression_ratio, 3),
                    'Точность реконструкции': round(metrics.reconstruction_accuracy * 100, 2),
                    'Время обучения (сек)': round(metrics.training_time, 2)
                })
            except Exception as e:
                print(f"Ошибка при оценке модели {model_name}: {e}")
        
        return pd.DataFrame(results)
    
    def save_comparison_results(self, results_df: pd.DataFrame, output_path: str):
        """Сохраняет результаты сравнения в CSV файл."""
        results_df.to_csv(output_path, index=False, encoding='utf-8')
        print(f"Результаты сравнения сохранены в {output_path}")


def main():
    """Prepare the corpus if needed, train all models, compare them and save the table."""
    trainer = SubwordModelTrainer()

    # Build the plain-text corpus from the raw JSONL only when it is absent.
    corpus_path = "data/corpus.txt"
    if not os.path.exists(corpus_path):
        print("Подготавливаем корпус...")
        articles_count = trainer.prepare_corpus("data/raw_corpus.jsonl", corpus_path)
        print(f"Подготовлено {articles_count} статей")

    print("Обучаем подсловные модели...")
    trained_models = trainer.train_multiple_models(corpus_path)

    # The evaluation sample is the first 100 lines of the training corpus;
    # zip with range() stops the read once the limit is reached.
    with open(corpus_path, 'r', encoding='utf-8') as corpus_file:
        test_texts = [row.strip() for _, row in zip(range(100), corpus_file)]

    print("Сравниваем модели...")
    comparison_results = trainer.compare_models(trained_models, test_texts)

    print("\nРезультаты сравнения:")
    print(comparison_results)

    trainer.save_comparison_results(comparison_results, "results/subword_comparison.csv")


# Run the training and comparison pipeline only when executed as a script.
if __name__ == "__main__":
    main()