File size: 13,404 Bytes
b059f86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
903cdb2
b059f86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
"""
Benchmark CLI for Vietnamese Text Classification.

Compares Rust TextClassifier vs sklearn.

Usage:
    python bench.py vntc
    python bench.py bank
    python bench.py synthetic
"""

import os
import time
import random
from pathlib import Path

import click
from sklearn.feature_extraction.text import TfidfVectorizer as SklearnTfidfVectorizer
from sklearn.svm import LinearSVC as SklearnLinearSVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

from underthesea import TextClassifier


def read_file(filepath):
    """Read a text file, trying several encodings in order.

    Whitespace runs are collapsed to single spaces. Returns the
    normalized text, or None when no encoding produces more than
    10 characters of content.
    """
    for encoding in ('utf-16', 'utf-16-le', 'utf-8', 'latin-1'):
        try:
            with open(filepath, 'r', encoding=encoding) as handle:
                raw = handle.read()
        except (UnicodeDecodeError, UnicodeError):
            # Wrong encoding for this file -- try the next candidate.
            continue
        normalized = ' '.join(raw.split())
        if len(normalized) > 10:
            return normalized
    return None


def benchmark_sklearn(train_texts, train_labels, test_texts, test_labels, max_features=20000):
    """Benchmark scikit-learn TF-IDF + LinearSVC.

    Fits a TfidfVectorizer + LinearSVC pipeline, times vectorization,
    training, and end-to-end inference, and returns a metrics dict with
    keys: total_train, e2e_throughput, accuracy, f1_weighted.
    """
    banner = "=" * 70
    click.echo("\n" + banner)
    click.echo("scikit-learn: TfidfVectorizer + LinearSVC")
    click.echo(banner)

    # Time vectorization of both splits (fit on train, transform test too,
    # so the reported time covers the full feature-extraction cost).
    click.echo("  Vectorizing...")
    started = time.perf_counter()
    vectorizer = SklearnTfidfVectorizer(max_features=max_features, ngram_range=(1, 2), min_df=2)
    X_train = vectorizer.fit_transform(train_texts)
    _ = vectorizer.transform(test_texts)  # counted in the vectorization timing
    vectorize_seconds = time.perf_counter() - started
    click.echo(f"    Vectorization time: {vectorize_seconds:.2f}s")
    click.echo(f"    Vocabulary size: {len(vectorizer.vocabulary_)}")

    # Time classifier training on the pre-vectorized features.
    click.echo("  Training LinearSVC...")
    started = time.perf_counter()
    model = SklearnLinearSVC(C=1.0, max_iter=2000)
    model.fit(X_train, train_labels)
    fit_seconds = time.perf_counter() - started
    click.echo(f"    Training time: {fit_seconds:.2f}s")

    # End-to-end inference: raw text -> features -> predictions.
    click.echo("  End-to-end inference...")
    started = time.perf_counter()
    predictions = model.predict(vectorizer.transform(test_texts))
    e2e_seconds = time.perf_counter() - started
    samples_per_sec = len(test_texts) / e2e_seconds
    click.echo(f"    E2E time: {e2e_seconds:.2f}s ({samples_per_sec:.0f} samples/sec)")

    accuracy = accuracy_score(test_labels, predictions)
    weighted_f1 = f1_score(test_labels, predictions, average='weighted')
    click.echo(f"  Results: Accuracy={accuracy:.4f}, F1={weighted_f1:.4f}")

    return {
        "total_train": vectorize_seconds + fit_seconds,
        "e2e_throughput": samples_per_sec,
        "accuracy": accuracy,
        "f1_weighted": weighted_f1,
    }


def benchmark_rust(train_texts, train_labels, test_texts, test_labels, max_features=20000):
    """Benchmark Rust TextClassifier.

    Trains the underthesea_core-backed TextClassifier and times training
    and batch inference. Returns a dict with keys: total_train, throughput,
    accuracy, f1_weighted, plus the fitted classifier ('clf') and its
    test-set predictions ('preds') for downstream reporting/saving.
    """
    banner = "=" * 70
    click.echo("\n" + banner)
    click.echo("Rust: TextClassifier (underthesea_core)")
    click.echo(banner)

    # Hyperparameters mirror the sklearn setup where the APIs overlap.
    classifier = TextClassifier(
        max_features=max_features,
        ngram_range=(1, 2),
        min_df=2,
        c=1.0,
        max_iter=1000,
        tol=0.1,
    )

    # Train (vectorization happens inside fit, so this is the full cost).
    click.echo("  Training...")
    started = time.perf_counter()
    classifier.fit(list(train_texts), list(train_labels))
    fit_seconds = time.perf_counter() - started
    click.echo(f"    Training time: {fit_seconds:.2f}s")
    click.echo(f"    Vocabulary size: {classifier.n_features}")

    # Batch inference over the raw test texts.
    click.echo("  Inference...")
    started = time.perf_counter()
    predictions = classifier.predict_batch(list(test_texts))
    infer_seconds = time.perf_counter() - started
    samples_per_sec = len(test_texts) / infer_seconds
    click.echo(f"    Inference time: {infer_seconds:.2f}s ({samples_per_sec:.0f} samples/sec)")

    accuracy = accuracy_score(test_labels, predictions)
    weighted_f1 = f1_score(test_labels, predictions, average='weighted')
    click.echo(f"  Results: Accuracy={accuracy:.4f}, F1={weighted_f1:.4f}")

    return {
        "total_train": fit_seconds,
        "throughput": samples_per_sec,
        "accuracy": accuracy,
        "f1_weighted": weighted_f1,
        "clf": classifier,
        "preds": predictions,
    }


def print_comparison(sklearn_results, rust_results):
    """Print comparison summary.

    Expects the dicts produced by benchmark_sklearn / benchmark_rust and
    echoes a side-by-side table plus relative speedup figures.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    click.echo("\n" + heavy_rule)
    click.echo("COMPARISON SUMMARY")
    click.echo(heavy_rule)
    click.echo(f"{'Metric':<30} {'sklearn':<20} {'Rust':<20}")
    click.echo(light_rule)

    sk_train = sklearn_results['total_train']
    rs_train = rust_results['total_train']
    sk_rate = sklearn_results['e2e_throughput']
    rs_rate = rust_results['throughput']
    click.echo(f"{'Training time (s)':<30} {sk_train:<20.2f} {rs_train:<20.2f}")
    click.echo(f"{'Inference (samples/sec)':<30} {sk_rate:<20.0f} {rs_rate:<20.0f}")
    click.echo(f"{'Accuracy':<30} {sklearn_results['accuracy']:<20.4f} {rust_results['accuracy']:<20.4f}")
    click.echo(f"{'F1 (weighted)':<30} {sklearn_results['f1_weighted']:<20.4f} {rust_results['f1_weighted']:<20.4f}")

    click.echo(light_rule)
    # Guard against division by zero when a benchmark recorded no time/rate.
    if rs_train > 0:
        train_speedup = sk_train / rs_train
    else:
        train_speedup = 0
    if sk_rate > 0:
        infer_speedup = rs_rate / sk_rate
    else:
        infer_speedup = 0
    click.echo(f"Speedup: Training {train_speedup:.2f}x, Inference {infer_speedup:.2f}x")
    click.echo(heavy_rule)


# Root click group; the docstring below doubles as the CLI's --help text,
# and subcommands (vntc, bank, synthetic) attach via @cli.command().
@click.group()
def cli():
    """Benchmark Vietnamese text classification models."""
    pass


def _load_labeled_dir(root_dir):
    """Collect (texts, labels) from a VNTC-style directory layout.

    Each immediate subfolder of *root_dir* is treated as a category label;
    every ``.txt`` file inside it becomes one sample. Files that cannot be
    decoded or are too short (see read_file) are silently skipped.
    """
    texts, labels = [], []
    for folder in sorted(os.listdir(root_dir)):
        folder_path = os.path.join(root_dir, folder)
        if not os.path.isdir(folder_path):
            continue
        for fname in os.listdir(folder_path):
            if not fname.endswith('.txt'):
                continue
            text = read_file(os.path.join(folder_path, fname))
            if text:
                texts.append(text)
                labels.append(folder)  # the folder name doubles as the class label
    return texts, labels


@cli.command()
@click.option('--data-dir', default='/home/claude-user/projects/workspace_underthesea/VNTC/Data/10Topics/Ver1.1',
              help='Path to VNTC dataset')
@click.option('--save-model', is_flag=True, help='Save the trained Rust model')
@click.option('--output', '-o', default='models/sen-vntc.bin', help='Output model path')
def vntc(data_dir, save_model, output):
    """Benchmark on VNTC dataset (10 topics, ~84k documents)."""
    click.echo("=" * 70)
    click.echo("VNTC Full Dataset Benchmark")
    click.echo("Vietnamese News Text Classification (10 Topics)")
    click.echo("=" * 70)

    train_dir = os.path.join(data_dir, "Train_Full")
    test_dir = os.path.join(data_dir, "Test_Full")

    # Load both splits via the shared loader (was previously duplicated inline).
    click.echo("\nLoading training data...")
    t0 = time.perf_counter()
    train_texts, train_labels = _load_labeled_dir(train_dir)
    click.echo(f"  Loaded {len(train_texts)} training samples in {time.perf_counter()-t0:.1f}s")

    click.echo("Loading test data...")
    t0 = time.perf_counter()
    test_texts, test_labels = _load_labeled_dir(test_dir)
    click.echo(f"  Loaded {len(test_texts)} test samples in {time.perf_counter()-t0:.1f}s")

    # Run both benchmarks and print the side-by-side comparison.
    sklearn_results = benchmark_sklearn(train_texts, train_labels, test_texts, test_labels)
    rust_results = benchmark_rust(train_texts, train_labels, test_texts, test_labels)

    print_comparison(sklearn_results, rust_results)

    if save_model:
        model_path = Path(output)
        model_path.parent.mkdir(parents=True, exist_ok=True)
        rust_results['clf'].save(str(model_path))
        size_mb = model_path.stat().st_size / (1024 * 1024)
        click.echo(f"\nModel saved to {model_path} ({size_mb:.2f} MB)")


@cli.command()
@click.option('--save-model', is_flag=True, help='Save the trained Rust model')
@click.option('--output', '-o', default='models/sen-bank.bin', help='Output model path')
def bank(save_model, output):
    """Benchmark on UTS2017_Bank dataset (14 categories, banking domain)."""
    # Imported lazily so the other subcommands work without `datasets`.
    from datasets import load_dataset

    divider = "=" * 70
    click.echo(divider)
    click.echo("UTS2017_Bank Dataset Benchmark")
    click.echo("Vietnamese Banking Domain Text Classification (14 Categories)")
    click.echo(divider)

    click.echo("\nLoading UTS2017_Bank dataset from HuggingFace...")
    bank_ds = load_dataset("undertheseanlp/UTS2017_Bank", "classification")

    train_split = bank_ds["train"]
    test_split = bank_ds["test"]
    train_texts = list(train_split["text"])
    train_labels = list(train_split["label"])
    test_texts = list(test_split["text"])
    test_labels = list(test_split["label"])

    click.echo(f"  Train samples: {len(train_texts)}")
    click.echo(f"  Test samples: {len(test_texts)}")
    click.echo(f"  Categories: {len(set(train_labels))}")

    # Smaller vocabulary cap: the banking corpus is far smaller than VNTC.
    sklearn_results = benchmark_sklearn(train_texts, train_labels, test_texts, test_labels, max_features=10000)
    rust_results = benchmark_rust(train_texts, train_labels, test_texts, test_labels, max_features=10000)

    print_comparison(sklearn_results, rust_results)

    click.echo("\nClassification Report (Rust):")
    click.echo(classification_report(test_labels, rust_results['preds']))

    if save_model:
        destination = Path(output)
        destination.parent.mkdir(parents=True, exist_ok=True)
        rust_results['clf'].save(str(destination))
        size_mb = destination.stat().st_size / (1024 * 1024)
        click.echo(f"\nModel saved to {destination} ({size_mb:.2f} MB)")


@cli.command()
@click.option('--train-per-cat', default=340, help='Training samples per category')
@click.option('--test-per-cat', default=500, help='Test samples per category')
@click.option('--seed', default=42, help='Random seed')
def synthetic(train_per_cat, test_per_cat, seed):
    """Benchmark on synthetic VNTC-like data."""
    # Vietnamese text templates by category
    TEMPLATES = {
        "the_thao": ["Đội tuyển {} thắng {} với tỷ số {}", "Cầu thủ {} ghi bàn đẹp mắt"],
        "kinh_doanh": ["Chứng khoán {} điểm trong phiên giao dịch", "Ngân hàng {} công bố lãi suất {}"],
        "cong_nghe": ["Apple ra mắt {} với nhiều tính năng", "Trí tuệ nhân tạo đang thay đổi {}"],
        "chinh_tri": ["Quốc hội thông qua nghị quyết về {}", "Chủ tịch {} tiếp đón phái đoàn"],
        "van_hoa": ["Nghệ sĩ {} ra mắt album mới", "Liên hoan phim {} trao giải"],
        "khoa_hoc": ["Nhà khoa học phát hiện {} mới", "Nghiên cứu cho thấy {} có tác dụng"],
        "suc_khoe": ["Bộ Y tế cảnh báo về {} trong mùa", "Vaccine {} đạt hiệu quả cao"],
        "giao_duc": ["Trường {} công bố điểm chuẩn", "Học sinh đoạt huy chương tại Olympic"],
        "phap_luat": ["Tòa án xét xử vụ án {} với bị cáo", "Công an triệt phá đường dây"],
        "doi_song": ["Giá {} tăng trong tháng", "Người dân đổ xô đi mua {}"],
    }
    FILLS = {
        "the_thao": ["Việt Nam", "Thái Lan", "3-0", "bóng đá", "AFF Cup"],
        "kinh_doanh": ["tăng", "giảm", "VN-Index", "Vietcombank", "8%"],
        "cong_nghe": ["iPhone 16", "ChatGPT", "công việc", "VinAI", "5G"],
        "chinh_tri": ["kinh tế", "nước", "Trung Quốc", "Hà Nội", "phát triển"],
        "van_hoa": ["Mỹ Tâm", "Cannes", "nghệ thuật", "Hà Nội", "Bố Già"],
        "khoa_hoc": ["loài sinh vật", "trà xanh", "VNREDSat-1", "Nobel", "robot"],
        "suc_khoe": ["dịch cúm", "COVID-19", "Bạch Mai", "dinh dưỡng", "tiểu đường"],
        "giao_duc": ["Bách Khoa", "Việt Nam", "Toán", "THPT", "STEM"],
        "phap_luat": ["tham nhũng", "TP.HCM", "ma túy", "Hình sự", "gian lận"],
        "doi_song": ["xăng", "vàng", "nắng nóng", "Trung thu", "bún chả"],
    }

    def make_text(category):
        # Pick a random template, then fill each {} slot from the category pool
        # (sampling with replacement).
        template = random.choice(TEMPLATES[category])
        pool = FILLS[category]
        slots = template.count("{}")
        return template.format(*random.choices(pool, k=slots))

    def build_split(n_per_cat, categories):
        # Generate n_per_cat samples per category, then shuffle so labels
        # are interleaved before returning parallel (texts, labels) lists.
        samples = []
        for category in categories:
            for _ in range(n_per_cat):
                samples.append((make_text(category), category))
        random.shuffle(samples)
        return [text for text, _ in samples], [label for _, label in samples]

    click.echo("=" * 70)
    click.echo("Synthetic VNTC-like Benchmark")
    click.echo("=" * 70)

    random.seed(seed)
    categories = list(TEMPLATES.keys())

    click.echo(f"\nConfiguration:")
    click.echo(f"  Categories: {len(categories)}")
    click.echo(f"  Train samples: {train_per_cat * len(categories)}")
    click.echo(f"  Test samples: {test_per_cat * len(categories)}")

    train_texts, train_labels = build_split(train_per_cat, categories)
    test_texts, test_labels = build_split(test_per_cat, categories)

    sklearn_results = benchmark_sklearn(train_texts, train_labels, test_texts, test_labels, max_features=10000)
    rust_results = benchmark_rust(train_texts, train_labels, test_texts, test_labels, max_features=10000)

    print_comparison(sklearn_results, rust_results)


if __name__ == "__main__":
    cli()