File size: 8,951 Bytes
b059f86
903cdb2
b059f86
 
903cdb2
 
 
 
 
 
 
 
 
b059f86
 
 
 
903cdb2
b059f86
 
903cdb2
 
b059f86
 
903cdb2
b5fd35d
903cdb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b059f86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
903cdb2
b5fd35d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
903cdb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5fd35d
903cdb2
 
b5fd35d
903cdb2
 
b5fd35d
903cdb2
 
 
 
b5fd35d
903cdb2
 
 
 
 
 
b5fd35d
903cdb2
 
 
b5fd35d
903cdb2
 
 
b5fd35d
 
903cdb2
 
 
 
 
 
 
 
b5fd35d
 
 
903cdb2
b5fd35d
903cdb2
 
b5fd35d
903cdb2
 
 
 
 
b5fd35d
903cdb2
 
 
 
b5fd35d
 
903cdb2
b5fd35d
 
 
 
 
903cdb2
b5fd35d
 
b059f86
903cdb2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
"""
Training CLI for Vietnamese Text Classification using Hydra.

Usage:
    python src/train.py --config-name=vntc
    python src/train.py --config-name=sentiment_general
    python src/train.py --config-name=sentiment_bank
    python src/train.py --config-name=bank

Override params from CLI:
    python src/train.py --config-name=sentiment_general model.c=0.5 model.max_features=100000
    python src/train.py --config-name=vntc preprocessor=sentiment
    python src/train.py --config-name=sentiment_general data.vlsp2016_dir=/path/to/VLSP2016_SA
"""

import os
import time
import logging
from pathlib import Path

import hydra
from omegaconf import DictConfig, OmegaConf
from sklearn.metrics import accuracy_score, f1_score, classification_report

from underthesea import TextClassifier, TextPreprocessor

log = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Preprocessor
# ---------------------------------------------------------------------------

def build_preprocessor(pp_cfg):
    """Construct a Rust-backed TextPreprocessor from the model.preprocessor config.

    Empty teencode maps / negation-word lists collapse to None so the
    preprocessor treats them as "feature disabled".
    """
    teencode_map = dict(pp_cfg.get("teencode", {})) or None
    negations = list(pp_cfg.get("negation_words", [])) or None
    window = pp_cfg.get("negation_window", 2)

    # Boolean toggles all default to True when absent from the config.
    return TextPreprocessor(
        lowercase=pp_cfg.get("lowercase", True),
        unicode_normalize=pp_cfg.get("unicode_normalize", True),
        remove_urls=pp_cfg.get("remove_urls", True),
        normalize_repeated_chars=pp_cfg.get("normalize_repeated_chars", True),
        normalize_punctuation=pp_cfg.get("normalize_punctuation", True),
        teencode=teencode_map,
        negation_words=negations,
        negation_window=window,
    )


# ---------------------------------------------------------------------------
# Data loaders
# ---------------------------------------------------------------------------

def read_file(filepath):
    """Read a text file, trying several encodings in turn.

    Returns whitespace-normalized text when a decode succeeds and yields
    more than 10 characters; otherwise None. latin-1 is last because it
    never fails to decode, making it the catch-all attempt.
    """
    for encoding in ('utf-16', 'utf-8', 'latin-1'):
        try:
            with open(filepath, 'r', encoding=encoding) as fh:
                raw = fh.read()
        except (UnicodeDecodeError, UnicodeError):
            continue
        normalized = ' '.join(raw.split())
        if len(normalized) > 10:
            return normalized
    return None


def load_vntc_data(data_dir):
    """Load VNTC documents from a directory tree.

    Each immediate subfolder name is used as the label for every .txt
    file inside it. Files that fail to decode or are too short
    (read_file returns None) are skipped.
    """
    texts, labels = [], []
    for category in sorted(os.listdir(data_dir)):
        category_dir = os.path.join(data_dir, category)
        if not os.path.isdir(category_dir):
            continue
        txt_names = (n for n in os.listdir(category_dir) if n.endswith('.txt'))
        for txt_name in txt_names:
            content = read_file(os.path.join(category_dir, txt_name))
            if content:
                texts.append(content)
                labels.append(category)
    return texts, labels


def load_vlsp2016(data_dir):
    """Load VLSP2016 sentiment data (train.txt / test.txt) from a directory.

    Lines look like ``__label__POS some text``; anything not starting
    with ``__label__`` is ignored. Returns
    (train_texts, train_labels, test_texts, test_labels).
    """
    label_map = {'POS': 'positive', 'NEG': 'negative', 'NEU': 'neutral'}

    def _read_split(filename):
        split_texts, split_labels = [], []
        path = os.path.join(data_dir, filename)
        with open(path, 'r', encoding='utf-8') as fh:
            for raw_line in fh:
                stripped = raw_line.strip()
                if not stripped.startswith('__label__'):
                    continue
                head, _, body = stripped.partition(' ')
                split_labels.append(label_map[head.replace('__label__', '')])
                split_texts.append(body)
        return split_texts, split_labels

    train_texts, train_labels = _read_split('train.txt')
    test_texts, test_labels = _read_split('test.txt')
    return train_texts, train_labels, test_texts, test_labels


def load_data(cfg):
    """Load train/test splits according to cfg.data.name.

    Returns (train_texts, train_labels, test_texts, test_labels,
    extra_test) where extra_test maps names to additional (texts,
    labels) evaluation sets (currently always empty).
    """
    data_cfg = cfg.data
    dataset_name = data_cfg.name
    extra_test = {}

    if dataset_name == "vntc":
        root = data_cfg.data_dir
        train_texts, train_labels = load_vntc_data(os.path.join(root, "Train_Full"))
        test_texts, test_labels = load_vntc_data(os.path.join(root, "Test_Full"))

    elif dataset_name == "bank":
        # Imported lazily so the dependency is only needed for this dataset.
        from datasets import load_dataset
        ds = load_dataset(data_cfg.dataset, data_cfg.config)
        train_texts = list(ds["train"]["text"])
        train_labels = list(ds["train"]["label"])
        test_texts = list(ds["test"]["text"])
        test_labels = list(ds["test"]["label"])

    elif dataset_name == "sentiment_general":
        train_texts, train_labels, test_texts, test_labels = load_vlsp2016(
            data_cfg.data_dir)

    elif dataset_name == "sentiment_bank":
        from datasets import load_dataset
        ds_class = load_dataset(data_cfg.dataset, "classification")
        ds_sent = load_dataset(data_cfg.dataset, "sentiment")

        # Joint label: "<class>#<sentiment>" per example, aligned by index.
        def _joined(split):
            pairs = zip(ds_class[split]["label"], ds_sent[split]["sentiment"])
            return list(ds_class[split]["text"]), [f'{c}#{s}' for c, s in pairs]

        train_texts, train_labels = _joined("train")
        test_texts, test_labels = _joined("test")

    else:
        raise ValueError(f"Unknown data: {dataset_name}")

    return train_texts, train_labels, test_texts, test_labels, extra_test


# ---------------------------------------------------------------------------
# Evaluate
# ---------------------------------------------------------------------------

def evaluate(test_labels, preds, name=""):
    """Log accuracy, weighted/macro F1 and a classification report.

    Returns the accuracy so callers can compare runs.
    """
    accuracy = accuracy_score(test_labels, preds)
    weighted_f1 = f1_score(test_labels, preds, average='weighted', zero_division=0)
    macro_f1 = f1_score(test_labels, preds, average='macro', zero_division=0)

    separator = "=" * 70
    log.info(separator)
    log.info(f"RESULTS ({name})" if name else "RESULTS")
    log.info(separator)
    log.info(f"  Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    log.info(f"  F1 (weighted): {weighted_f1:.4f}")
    log.info(f"  F1 (macro): {macro_f1:.4f}")
    log.info("\n" + classification_report(test_labels, preds, zero_division=0))
    return accuracy


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

@hydra.main(version_base=None, config_path="conf", config_name="config")
def train(cfg: DictConfig):
    """Train a Vietnamese text classification model from a Hydra config.

    Loads the configured dataset, optionally builds a text preprocessor,
    fits a TextClassifier, evaluates on the test set(s), and saves the
    model to cfg.output.
    """
    banner = "=" * 70
    log.info(banner)
    log.info(f"Training: {cfg.data.name}")
    log.info(banner)
    log.info(f"\nConfig:\n{OmegaConf.to_yaml(cfg)}")

    # --- Data -------------------------------------------------------------
    log.info("Loading data...")
    start = time.perf_counter()
    train_texts, train_labels, test_texts, test_labels, extra_test = load_data(cfg)
    load_time = time.perf_counter() - start

    log.info(f"  Train samples: {len(train_texts)}")
    log.info(f"  Test samples: {len(test_texts)}")
    log.info(f"  Labels: {len(set(train_labels))}")
    log.info(f"  Load time: {load_time:.2f}s")

    # --- Preprocessor -----------------------------------------------------
    # model.preprocess=true activates the model.preprocessor config; the
    # preprocessor is handed to TextClassifier and packed into the .bin model.
    preprocessor = None
    if cfg.model.get("preprocess", False):
        preprocessor = build_preprocessor(cfg.model.preprocessor)
        log.info(f"\nPreprocessor: {preprocessor}")

    # --- Classifier -------------------------------------------------------
    model_cfg = cfg.model
    ngram_range = tuple(model_cfg.ngram_range)

    log.info("\nTraining TextClassifier...")
    log.info(f"  max_features={model_cfg.max_features}, ngram_range={ngram_range}, "
             f"max_df={model_cfg.max_df}, C={model_cfg.c}")

    clf = TextClassifier(
        max_features=model_cfg.max_features,
        ngram_range=ngram_range,
        min_df=model_cfg.min_df,
        max_df=model_cfg.max_df,
        c=model_cfg.c,
        max_iter=model_cfg.max_iter,
        tol=model_cfg.tol,
        preprocessor=preprocessor,
    )

    start = time.perf_counter()
    clf.fit(train_texts, train_labels)
    train_time = time.perf_counter() - start
    log.info(f"  Training time: {train_time:.3f}s")
    log.info(f"  Vocabulary size: {clf.n_features}")

    # --- Evaluation -------------------------------------------------------
    # TextClassifier auto-preprocesses via its built-in preprocessor.
    log.info("\nEvaluating...")
    predictions = clf.predict_batch(test_texts)
    evaluate(test_labels, predictions, cfg.data.name)

    # Extra test sets (e.g. VLSP2016), when load_data supplied any.
    for extra_name, (et_texts, et_labels) in extra_test.items():
        evaluate(et_labels, clf.predict_batch(et_texts), extra_name)

    # --- Persist ----------------------------------------------------------
    model_path = Path(cfg.output)
    model_path.parent.mkdir(parents=True, exist_ok=True)
    clf.save(str(model_path))

    size_mb = model_path.stat().st_size / (1024 * 1024)
    log.info(f"\nModel saved to {model_path} ({size_mb:.2f} MB)")


# Entry point: Hydra parses CLI overrides and calls train() with the composed config.
if __name__ == "__main__":
    train()