Spaces:

Zalimannard
/

NLP_Homework_1

Sleeping

NLP_Homework_1 / src /train_subword.py

Kolesnikov Dmitry

feat: Готовый проект

54ccdcb 2 months ago

18.2 kB

	# src/train_subword.py
	"""
	Модуль для обучения подсловных моделей токенизации (BPE, WordPiece, Unigram).
	Поддерживает обучение моделей с различными параметрами и их сравнительный анализ.
	"""

	import os
	import json
	import time
	from typing import List, Dict, Tuple, Optional, Any
	from dataclasses import dataclass
	from pathlib import Path
	import pandas as pd

	# Optional tokenization back-ends. Each import is guarded so the module can
	# still be imported when a library is missing; the *_AVAILABLE flags record
	# which back-ends can actually be used.
	try:
	    from tokenizers import Tokenizer, trainers, models, pre_tokenizers, normalizers
	    from tokenizers.trainers import BpeTrainer, WordPieceTrainer, UnigramTrainer
	    TOKENIZERS_AVAILABLE = True
	except ImportError:
	    TOKENIZERS_AVAILABLE = False

	try:
	    import sentencepiece as spm
	    SENTENCEPIECE_AVAILABLE = True
	except ImportError:
	    SENTENCEPIECE_AVAILABLE = False


	@dataclass
	class SubwordModelConfig:
	    """Configuration for training one subword model.

	    After construction, ``special_tokens`` is always a list and
	    ``model_name`` is always a non-empty string: missing values are
	    filled in by ``__post_init__``.
	    """
	    # Model family: 'bpe', 'wordpiece' or 'unigram'.
	    model_type: str
	    # Target vocabulary size.
	    vocab_size: int
	    # Minimum frequency passed to the trainer.
	    min_frequency: int = 2
	    # None means "use the default special tokens" (see __post_init__).
	    special_tokens: Optional[List[str]] = None
	    # Empty means "derive the name from model_type and vocab_size".
	    model_name: str = ""

	    def __post_init__(self):
	        """Fill in the default special tokens and the default model name."""
	        if self.special_tokens is None:
	            # A new list per instance, so instances never share state.
	            self.special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
	        if not self.model_name:
	            self.model_name = f"{self.model_type}_{self.vocab_size}"


	@dataclass
	class SubwordMetrics:
	    """Evaluation metrics for one subword model."""
	    # Name of the evaluated model.
	    model_name: str
	    # Vocabulary size of the evaluated model.
	    vocab_size: int
	    # Share of words split into more than one token.
	    fragmentation_rate: float
	    # Words per token.
	    compression_ratio: float
	    # Share of texts that decode back to the original text.
	    reconstruction_accuracy: float
	    # Training time in seconds.
	    training_time: float
	    # Out-of-vocabulary rate; defaults to 0.0 when not computed.
	    oov_rate: float = 0.0


	class SubwordModelTrainer:
	    """Train subword tokenization models and compare them.

	    Trained models are written to ``output_dir``. For every trained model
	    ``self.metrics[model_name]`` holds ``training_time`` (seconds) and
	    ``model_type``; ``evaluate_model`` reads the training time back from
	    there using the stem of the model file name.
	    """

	    def __init__(self, output_dir: str = "models"):
	        """
	        Create the trainer and make sure the output directory exists.

	        Args:
	            output_dir: Directory where trained models are saved
	        """
	        self.output_dir = Path(output_dir)
	        # parents=True so that nested paths such as "out/models" work too.
	        self.output_dir.mkdir(parents=True, exist_ok=True)
	        self.models: Dict[str, Any] = {}
	        self.metrics: Dict[str, Dict[str, Any]] = {}

	    def prepare_corpus(self, input_path: str, output_path: str, text_field: str = 'text') -> int:
	        """
	        Convert a JSONL corpus into a plain-text training corpus.

	        Lines that are not valid JSON, are not JSON objects, or whose
	        ``text_field`` is missing, not a string, or blank are skipped.
	        Each kept text is written to ``output_path`` followed by a newline.

	        Args:
	            input_path: Path to the JSONL corpus file
	            output_path: Path of the plain-text corpus to write
	            text_field: Name of the field that holds the article text

	        Returns:
	            Number of articles written
	        """
	        texts = []
	        with open(input_path, 'r', encoding='utf-8') as f:
	            for line in f:
	                try:
	                    article = json.loads(line.strip())
	                except json.JSONDecodeError:
	                    continue
	                # Valid JSON is not necessarily an object with a string field.
	                if not isinstance(article, dict):
	                    continue
	                text = article.get(text_field)
	                if isinstance(text, str) and text.strip():
	                    texts.append(text)

	        # The input is read completely before the output is opened, so the
	        # method stays safe even if both paths point to the same file.
	        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
	        with open(output_path, 'w', encoding='utf-8') as f:
	            f.writelines(f"{text}\n" for text in texts)

	        return len(texts)

	    @staticmethod
	    def _require_tokenizers() -> None:
	        """Raise ImportError when the tokenizers library is not available."""
	        if not TOKENIZERS_AVAILABLE:
	            raise ImportError("Библиотека tokenizers не установлена")

	    def _train_and_save(self, tokenizer, trainer, config: "SubwordModelConfig",
	                        corpus_path: str, model_type: str) -> str:
	        """Train ``tokenizer`` on the corpus, save it as JSON and record metrics."""
	        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

	        # Only the training call itself is timed.
	        started = time.perf_counter()
	        tokenizer.train([corpus_path], trainer)
	        training_time = time.perf_counter() - started

	        model_path = self.output_dir / f"{config.model_name}.json"
	        tokenizer.save(str(model_path))

	        self.metrics[config.model_name] = {
	            'training_time': training_time,
	            'model_type': model_type
	        }

	        return str(model_path)

	    def train_bpe_model(self, config: "SubwordModelConfig", corpus_path: str) -> str:
	        """
	        Train a BPE model with the tokenizers library.

	        Args:
	            config: Model configuration
	            corpus_path: Path to the plain-text corpus

	        Returns:
	            Path to the saved model file

	        Raises:
	            ImportError: If the tokenizers library is not installed
	        """
	        self._require_tokenizers()

	        # NOTE(review): no unk_token is configured on the model; confirm how
	        # characters unseen during training should be handled at encode time.
	        tokenizer = Tokenizer(models.BPE())
	        trainer = BpeTrainer(
	            vocab_size=config.vocab_size,
	            min_frequency=config.min_frequency,
	            special_tokens=config.special_tokens
	        )
	        return self._train_and_save(tokenizer, trainer, config, corpus_path, 'bpe')

	    def train_wordpiece_model(self, config: "SubwordModelConfig", corpus_path: str) -> str:
	        """
	        Train a WordPiece model with the tokenizers library.

	        Args:
	            config: Model configuration
	            corpus_path: Path to the plain-text corpus

	        Returns:
	            Path to the saved model file

	        Raises:
	            ImportError: If the tokenizers library is not installed
	        """
	        self._require_tokenizers()

	        tokenizer = Tokenizer(models.WordPiece())
	        trainer = WordPieceTrainer(
	            vocab_size=config.vocab_size,
	            min_frequency=config.min_frequency,
	            special_tokens=config.special_tokens
	        )
	        return self._train_and_save(tokenizer, trainer, config, corpus_path, 'wordpiece')

	    def train_unigram_model(self, config: "SubwordModelConfig", corpus_path: str) -> str:
	        """
	        Train a Unigram model with the tokenizers library.

	        ``config.min_frequency`` is not used here: it is not among the
	        documented parameters of ``UnigramTrainer``.

	        Args:
	            config: Model configuration
	            corpus_path: Path to the plain-text corpus

	        Returns:
	            Path to the saved model file

	        Raises:
	            ImportError: If the tokenizers library is not installed
	        """
	        self._require_tokenizers()

	        # NOTE(review): no unk_token is configured for the trainer; confirm
	        # how characters unseen during training should be handled.
	        tokenizer = Tokenizer(models.Unigram())
	        trainer = UnigramTrainer(
	            vocab_size=config.vocab_size,
	            special_tokens=config.special_tokens
	        )
	        return self._train_and_save(tokenizer, trainer, config, corpus_path, 'unigram')

	    def train_sentencepiece_model(self, config: "SubwordModelConfig", corpus_path: str) -> str:
	        """
	        Train a SentencePiece model.

	        Args:
	            config: Model configuration
	            corpus_path: Path to the plain-text corpus

	        Returns:
	            Path to the saved ``.model`` file

	        Raises:
	            ImportError: If the sentencepiece library is not installed
	        """
	        if not SENTENCEPIECE_AVAILABLE:
	            raise ImportError("Библиотека sentencepiece не установлена")

	        model_prefix = str(self.output_dir / config.model_name)

	        # SentencePiece has no WordPiece model type, so 'wordpiece' is mapped
	        # to 'word'; unknown types fall back to 'bpe'.
	        model_type_map = {
	            'bpe': 'bpe',
	            'wordpiece': 'word',
	            'unigram': 'unigram'
	        }
	        spm_model_type = model_type_map.get(config.model_type, 'bpe')

	        # Keyword arguments instead of one space-joined flag string, so that
	        # paths containing spaces are passed through intact.
	        start_time = time.perf_counter()
	        spm.SentencePieceTrainer.train(
	            input=corpus_path,
	            model_prefix=model_prefix,
	            vocab_size=config.vocab_size,
	            model_type=spm_model_type,
	            character_coverage=0.9995,
	            normalization_rule_name='nfkc',
	            user_defined_symbols=list(config.special_tokens),
	        )
	        training_time = time.perf_counter() - start_time

	        self.metrics[config.model_name] = {
	            'training_time': training_time,
	            'model_type': f'sentencepiece_{spm_model_type}'
	        }

	        return f"{model_prefix}.model"

	    def train_model(self, config: "SubwordModelConfig", corpus_path: str, use_sentencepiece: bool = False) -> str:
	        """
	        Train a model of the type given in ``config``.

	        When SentencePiece is requested but not installed, a notice is
	        printed and the tokenizers back-end is used instead.

	        Args:
	            config: Model configuration
	            corpus_path: Path to the plain-text corpus
	            use_sentencepiece: Use SentencePiece instead of tokenizers

	        Returns:
	            Path to the saved model

	        Raises:
	            ValueError: If ``config.model_type`` is not supported
	        """
	        print(f"Обучаем модель {config.model_name} ({config.model_type})...")

	        if use_sentencepiece:
	            if SENTENCEPIECE_AVAILABLE:
	                return self.train_sentencepiece_model(config, corpus_path)
	            print("SentencePiece недоступен, используется библиотека tokenizers")

	        train_by_type = {
	            'bpe': self.train_bpe_model,
	            'wordpiece': self.train_wordpiece_model,
	            'unigram': self.train_unigram_model,
	        }
	        train = train_by_type.get(config.model_type)
	        if train is None:
	            raise ValueError(f"Неподдерживаемый тип модели: {config.model_type}")
	        return train(config, corpus_path)

	    def evaluate_model(self, model_path: str, test_texts: List[str]) -> "SubwordMetrics":
	        """
	        Evaluate a trained model on a list of texts.

	        The model is loaded with ``Tokenizer.from_file``, so ``model_path``
	        has to be a file saved by the tokenizers library. Words are the
	        whitespace-separated pieces of each text.

	        Args:
	            model_path: Path to the saved tokenizer file
	            test_texts: Texts to evaluate on

	        Returns:
	            Metrics of the model; ``oov_rate`` is always 0.0 here

	        Raises:
	            ImportError: If the tokenizers library is not installed
	        """
	        self._require_tokenizers()

	        tokenizer = Tokenizer.from_file(model_path)

	        total_tokens = 0
	        total_words = 0
	        fragmented_words = 0
	        reconstruction_errors = 0
	        # Encoding a word is deterministic, so repeated words are encoded once.
	        is_fragmented: Dict[str, bool] = {}

	        for text in test_texts:
	            encoded = tokenizer.encode(text)
	            reconstructed = tokenizer.decode(encoded.ids)

	            words = text.split()
	            total_words += len(words)
	            total_tokens += len(encoded.tokens)

	            # A word is fragmented when it is encoded into more than one token.
	            for word in words:
	                fragmented = is_fragmented.get(word)
	                if fragmented is None:
	                    fragmented = len(tokenizer.encode(word).tokens) > 1
	                    is_fragmented[word] = fragmented
	                if fragmented:
	                    fragmented_words += 1

	            if reconstructed.strip() != text.strip():
	                reconstruction_errors += 1

	        # Fallback values keep the metrics defined for empty input.
	        fragmentation_rate = fragmented_words / total_words if total_words > 0 else 0.0
	        compression_ratio = total_words / total_tokens if total_tokens > 0 else 1.0
	        if test_texts:
	            reconstruction_accuracy = 1 - reconstruction_errors / len(test_texts)
	        else:
	            reconstruction_accuracy = 1.0

	        model_name = Path(model_path).stem

	        return SubwordMetrics(
	            model_name=model_name,
	            vocab_size=tokenizer.get_vocab_size(),
	            fragmentation_rate=fragmentation_rate,
	            compression_ratio=compression_ratio,
	            reconstruction_accuracy=reconstruction_accuracy,
	            training_time=self.metrics.get(model_name, {}).get('training_time', 0),
	            oov_rate=0.0  # not computed by this method
	        )

	    def train_multiple_models(self, corpus_path: str, vocab_sizes: Optional[List[int]] = None) -> Dict[str, str]:
	        """
	        Train BPE, WordPiece and Unigram models for several vocabulary sizes.

	        A failure of one model is printed and does not stop the others.

	        Args:
	            corpus_path: Path to the plain-text corpus
	            vocab_sizes: Vocabulary sizes to train; defaults to
	                [8000, 16000, 32000]

	        Returns:
	            Dictionary {model_name: model_path} of successfully trained models
	        """
	        if vocab_sizes is None:
	            vocab_sizes = [8000, 16000, 32000]

	        model_types = ['bpe', 'wordpiece', 'unigram']
	        trained_models = {}

	        for model_type in model_types:
	            for vocab_size in vocab_sizes:
	                config = SubwordModelConfig(
	                    model_type=model_type,
	                    vocab_size=vocab_size,
	                    min_frequency=2
	                )

	                # Best effort: report the error and continue with the rest.
	                try:
	                    model_path = self.train_model(config, corpus_path)
	                    trained_models[config.model_name] = model_path
	                    print(f"Модель {config.model_name} обучена успешно")
	                except Exception as e:
	                    print(f"Ошибка при обучении модели {config.model_name}: {e}")

	        return trained_models

	    def compare_models(self, model_paths: Dict[str, str], test_texts: List[str]) -> pd.DataFrame:
	        """
	        Evaluate several trained models and collect the results in a table.

	        A failure to evaluate one model is printed and that model is left
	        out of the table.

	        Args:
	            model_paths: Dictionary {model_name: model_path}
	            test_texts: Texts to evaluate on

	        Returns:
	            DataFrame with one row per successfully evaluated model
	        """
	        results = []

	        for model_name, model_path in model_paths.items():
	            try:
	                metrics = self.evaluate_model(model_path, test_texts)
	                results.append({
	                    'Модель': model_name,
	                    # The type is the part of the file stem before the first '_'.
	                    'Тип': metrics.model_name.split('_')[0],
	                    'Размер словаря': metrics.vocab_size,
	                    'Процент фрагментации': round(metrics.fragmentation_rate * 100, 2),
	                    'Коэффициент сжатия': round(metrics.compression_ratio, 3),
	                    'Точность реконструкции': round(metrics.reconstruction_accuracy * 100, 2),
	                    'Время обучения (сек)': round(metrics.training_time, 2)
	                })
	            except Exception as e:
	                print(f"Ошибка при оценке модели {model_name}: {e}")

	        return pd.DataFrame(results)

	    def save_comparison_results(self, results_df: pd.DataFrame, output_path: str):
	        """
	        Save the comparison table as a UTF-8 CSV file without the index.

	        Args:
	            results_df: Table returned by ``compare_models``
	            output_path: Path of the CSV file; missing parent directories
	                are created
	        """
	        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
	        results_df.to_csv(output_path, index=False, encoding='utf-8')
	        print(f"Результаты сравнения сохранены в {output_path}")


	def main():
	    """Train the subword models, compare them and save the comparison as CSV.

	    The plain-text corpus is built from ``data/raw_corpus.jsonl`` when
	    ``data/corpus.txt`` does not exist yet. The non-blank lines among the
	    first 100 lines of the corpus are used as evaluation texts.
	    """
	    trainer = SubwordModelTrainer()

	    # Build the corpus only when it is not there yet.
	    corpus_path = "data/corpus.txt"
	    if not os.path.exists(corpus_path):
	        print("Подготавливаем корпус...")
	        articles_count = trainer.prepare_corpus("data/raw_corpus.jsonl", corpus_path)
	        print(f"Подготовлено {articles_count} статей")

	    print("Обучаем подсловные модели...")
	    trained_models = trainer.train_multiple_models(corpus_path)

	    # Blank lines are skipped: an empty text always decodes back to itself
	    # and would inflate the reconstruction accuracy.
	    test_texts = []
	    with open(corpus_path, 'r', encoding='utf-8') as f:
	        for i, line in enumerate(f):
	            if i >= 100:
	                break
	            text = line.strip()
	            if text:
	                test_texts.append(text)

	    print("Сравниваем модели...")
	    comparison_results = trainer.compare_models(trained_models, test_texts)

	    print("\nРезультаты сравнения:")
	    print(comparison_results)

	    # Make sure the results directory exists before writing the CSV.
	    results_path = "results/subword_comparison.csv"
	    os.makedirs(os.path.dirname(results_path), exist_ok=True)
	    trainer.save_comparison_results(comparison_results, results_path)


	if __name__ == "__main__":
	    main()