# src/train.py
# Refactor training to Hydra config and use underthesea imports (commit 903cdb2)
"""
Training CLI for Vietnamese Text Classification using Hydra.
Usage:
python src/train.py --config-name=vntc
python src/train.py --config-name=sentiment_general
python src/train.py --config-name=sentiment_bank
python src/train.py --config-name=bank
Override params from CLI:
python src/train.py --config-name=sentiment_general model.c=0.5 model.max_features=100000
python src/train.py --config-name=vntc preprocessor=sentiment
python src/train.py --config-name=sentiment_general data.vlsp2016_dir=/path/to/VLSP2016_SA
"""
import os
import time
import logging
from pathlib import Path
import hydra
from omegaconf import DictConfig, OmegaConf
from sklearn.metrics import accuracy_score, f1_score, classification_report
from underthesea import TextClassifier, TextPreprocessor
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Preprocessor
# ---------------------------------------------------------------------------
def build_preprocessor(pp_cfg):
    """Construct a Rust-backed TextPreprocessor from the model.preprocessor config.

    Empty teencode maps / negation word lists are passed as None so the
    preprocessor skips those stages entirely.
    """
    def _flag(key):
        # All boolean preprocessing switches default to enabled.
        return pp_cfg.get(key, True)

    teencode_map = dict(pp_cfg.get("teencode", {})) or None
    negation_words = list(pp_cfg.get("negation_words", [])) or None
    return TextPreprocessor(
        lowercase=_flag("lowercase"),
        unicode_normalize=_flag("unicode_normalize"),
        remove_urls=_flag("remove_urls"),
        normalize_repeated_chars=_flag("normalize_repeated_chars"),
        normalize_punctuation=_flag("normalize_punctuation"),
        teencode=teencode_map,
        negation_words=negation_words,
        negation_window=pp_cfg.get("negation_window", 2),
    )
# ---------------------------------------------------------------------------
# Data loaders
# ---------------------------------------------------------------------------
def read_file(filepath):
    """Read a text file, trying several encodings in order.

    Returns whitespace-normalized text when a decode yields more than 10
    characters; otherwise the next encoding is tried. Returns None when no
    encoding produces usable text.
    """
    for encoding in ('utf-16', 'utf-8', 'latin-1'):
        try:
            with open(filepath, 'r', encoding=encoding) as fh:
                content = fh.read()
        except (UnicodeDecodeError, UnicodeError):
            continue
        # Collapse all runs of whitespace (incl. newlines) to single spaces.
        normalized = ' '.join(content.split())
        if len(normalized) > 10:
            return normalized
    return None
def load_vntc_data(data_dir):
    """Load VNTC texts and labels; each subdirectory name is the class label."""
    texts, labels = [], []
    for label in sorted(os.listdir(data_dir)):
        label_dir = os.path.join(data_dir, label)
        if not os.path.isdir(label_dir):
            continue
        txt_files = (f for f in os.listdir(label_dir) if f.endswith('.txt'))
        for fname in txt_files:
            content = read_file(os.path.join(label_dir, fname))
            # read_file returns None (or short text) for unreadable files; skip those.
            if content:
                texts.append(content)
                labels.append(label)
    return texts, labels
def load_vlsp2016(data_dir):
    """Load VLSP2016 sentiment data.

    Parses fastText-style lines ("__label__POS some text") from train.txt and
    test.txt; returns (train_texts, train_labels, test_texts, test_labels).
    """
    label_map = {'POS': 'positive', 'NEG': 'negative', 'NEU': 'neutral'}

    def _read_split(filename):
        split_texts, split_labels = [], []
        with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as f:
            for raw in f:
                raw = raw.strip()
                if not raw.startswith('__label__'):
                    continue
                # Split off the label token; text may be empty if no space follows.
                head, _, body = raw.partition(' ')
                split_labels.append(label_map[head.replace('__label__', '')])
                split_texts.append(body)
        return split_texts, split_labels

    train_texts, train_labels = _read_split('train.txt')
    test_texts, test_labels = _read_split('test.txt')
    return train_texts, train_labels, test_texts, test_labels
def load_data(cfg):
    """Dispatch to the appropriate dataset loader based on cfg.data.name.

    Returns (train_texts, train_labels, test_texts, test_labels, extra_test);
    extra_test is a name -> (texts, labels) mapping of additional eval sets
    (currently always empty).
    """
    data_cfg = cfg.data
    name = data_cfg.name
    extra_test = {}
    if name == "vntc":
        root = data_cfg.data_dir
        train_texts, train_labels = load_vntc_data(os.path.join(root, "Train_Full"))
        test_texts, test_labels = load_vntc_data(os.path.join(root, "Test_Full"))
    elif name == "bank":
        from datasets import load_dataset
        dataset = load_dataset(data_cfg.dataset, data_cfg.config)
        train_split, test_split = dataset["train"], dataset["test"]
        train_texts = list(train_split["text"])
        train_labels = list(train_split["label"])
        test_texts = list(test_split["text"])
        test_labels = list(test_split["label"])
    elif name == "sentiment_general":
        train_texts, train_labels, test_texts, test_labels = load_vlsp2016(
            data_cfg.data_dir)
    elif name == "sentiment_bank":
        from datasets import load_dataset
        ds_class = load_dataset(data_cfg.dataset, "classification")
        ds_sent = load_dataset(data_cfg.dataset, "sentiment")

        def _joint_labels(split):
            # Combined "category#sentiment" label for joint classification.
            return [f'{c}#{s}' for c, s in
                    zip(ds_class[split]["label"], ds_sent[split]["sentiment"])]

        train_texts = list(ds_class["train"]["text"])
        train_labels = _joint_labels("train")
        test_texts = list(ds_class["test"]["text"])
        test_labels = _joint_labels("test")
    else:
        raise ValueError(f"Unknown data: {name}")
    return train_texts, train_labels, test_texts, test_labels, extra_test
# ---------------------------------------------------------------------------
# Evaluate
# ---------------------------------------------------------------------------
def evaluate(test_labels, preds, name=""):
    """Log accuracy, weighted/macro F1 and a classification report; return accuracy."""
    acc = accuracy_score(test_labels, preds)
    weighted_f1 = f1_score(test_labels, preds, average='weighted', zero_division=0)
    macro_f1 = f1_score(test_labels, preds, average='macro', zero_division=0)
    banner = "=" * 70
    log.info(banner)
    log.info(f"RESULTS ({name})" if name else "RESULTS")
    log.info(banner)
    log.info(f" Accuracy: {acc:.4f} ({acc*100:.2f}%)")
    log.info(f" F1 (weighted): {weighted_f1:.4f}")
    log.info(f" F1 (macro): {macro_f1:.4f}")
    log.info("\n" + classification_report(test_labels, preds, zero_division=0))
    return acc
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
@hydra.main(version_base=None, config_path="conf", config_name="config")
def train(cfg: DictConfig):
    """Train a Vietnamese text classification model from a Hydra config."""
    divider = "=" * 70
    log.info(divider)
    log.info(f"Training: {cfg.data.name}")
    log.info(divider)
    log.info(f"\nConfig:\n{OmegaConf.to_yaml(cfg)}")

    # --- load data ---
    log.info("Loading data...")
    start = time.perf_counter()
    train_texts, train_labels, test_texts, test_labels, extra_test = load_data(cfg)
    load_time = time.perf_counter() - start
    log.info(f" Train samples: {len(train_texts)}")
    log.info(f" Test samples: {len(test_texts)}")
    log.info(f" Labels: {len(set(train_labels))}")
    log.info(f" Load time: {load_time:.2f}s")

    # --- preprocessor ---
    # model.preprocess=true activates model.preprocessor; the preprocessor is
    # handed to TextClassifier and packed into the saved .bin model.
    model_cfg = cfg.model
    preprocessor = None
    if model_cfg.get("preprocess", False):
        preprocessor = build_preprocessor(model_cfg.preprocessor)
        log.info(f"\nPreprocessor: {preprocessor}")

    # --- build and fit classifier ---
    ngram_range = tuple(model_cfg.ngram_range)
    log.info("\nTraining TextClassifier...")
    log.info(f" max_features={model_cfg.max_features}, ngram_range={ngram_range}, "
             f"max_df={model_cfg.max_df}, C={model_cfg.c}")
    clf = TextClassifier(
        max_features=model_cfg.max_features,
        ngram_range=ngram_range,
        min_df=model_cfg.min_df,
        max_df=model_cfg.max_df,
        c=model_cfg.c,
        max_iter=model_cfg.max_iter,
        tol=model_cfg.tol,
        preprocessor=preprocessor,
    )
    start = time.perf_counter()
    clf.fit(train_texts, train_labels)
    train_time = time.perf_counter() - start
    log.info(f" Training time: {train_time:.3f}s")
    log.info(f" Vocabulary size: {clf.n_features}")

    # --- evaluate ---
    # TextClassifier applies its built-in preprocessor during prediction.
    log.info("\nEvaluating...")
    evaluate(test_labels, clf.predict_batch(test_texts), cfg.data.name)
    # Extra held-out sets (e.g. VLSP2016), if any were loaded.
    for extra_name, (extra_texts, extra_labels) in extra_test.items():
        evaluate(extra_labels, clf.predict_batch(extra_texts), extra_name)

    # --- persist model ---
    model_path = Path(cfg.output)
    model_path.parent.mkdir(parents=True, exist_ok=True)
    clf.save(str(model_path))
    size_mb = model_path.stat().st_size / (1024 * 1024)
    log.info(f"\nModel saved to {model_path} ({size_mb:.2f} MB)")
# Entry point: Hydra parses CLI overrides (see module docstring) and calls train().
if __name__ == "__main__":
    train()