Tiep Claude Opus 4.6 commited on
Commit
903cdb2
·
1 Parent(s): b5fd35d

Refactor training to Hydra config and use underthesea imports

Browse files

- Replace Click CLI with Hydra config system for flexible training
- Add config files for all training tasks (vntc, bank, sentiment_general, sentiment_bank)
- Change imports from underthesea_core to underthesea throughout
- Move preprocessing to Rust TextPreprocessor (built into model binary)
- Delete extends/ directory (code now in underthesea_core v3.2.0)
- Add outputs/ to .gitignore for Hydra run outputs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

.gitignore CHANGED
@@ -26,6 +26,9 @@ Thumbs.db
26
  # Jupyter
27
  .ipynb_checkpoints/
28
 
 
 
 
29
  # Testing
30
  .pytest_cache/
31
  .coverage
 
26
  # Jupyter
27
  .ipynb_checkpoints/
28
 
29
+ # Hydra outputs
30
+ outputs/
31
+
32
  # Testing
33
  .pytest_cache/
34
  .coverage
pyproject.toml CHANGED
@@ -1,7 +1,7 @@
1
  [project]
2
  name = "sen"
3
  version = "1.1.0"
4
- description = "Vietnamese Text Classification - Training scripts for underthesea_core"
5
  readme = "README.md"
6
  requires-python = ">=3.10"
7
  license = "Apache-2.0"
 
1
  [project]
2
  name = "sen"
3
  version = "1.1.0"
4
+ description = "Vietnamese Text Classification - Training scripts for underthesea"
5
  readme = "README.md"
6
  requires-python = ">=3.10"
7
  license = "Apache-2.0"
src/bench.py CHANGED
@@ -19,7 +19,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer as SklearnTfidfVecto
19
  from sklearn.svm import LinearSVC as SklearnLinearSVC
20
  from sklearn.metrics import accuracy_score, f1_score, classification_report
21
 
22
- from underthesea_core import TextClassifier
23
 
24
 
25
  def read_file(filepath):
 
19
  from sklearn.svm import LinearSVC as SklearnLinearSVC
20
  from sklearn.metrics import accuracy_score, f1_score, classification_report
21
 
22
+ from underthesea import TextClassifier
23
 
24
 
25
  def read_file(filepath):
src/conf/bank.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # python src/train.py --config-name=bank
2
+ defaults:
3
+ - data: bank
4
+ - model: small
5
+ - _self_
6
+
7
+ output: models/sen-bank-1.0.0-${now:%Y%m%d}.bin
src/conf/config.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ defaults:
2
+ - data: vntc
3
+ - model: default
4
+ - _self_
5
+
6
+ output: models/sen-${data.name}.bin
src/conf/data/bank.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ name: bank
2
+ source: huggingface
3
+ dataset: undertheseanlp/UTS2017_Bank
4
+ config: classification
src/conf/data/sentiment_bank.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ name: sentiment_bank
2
+ source: huggingface
3
+ dataset: undertheseanlp/UTS2017_Bank
4
+ config: [classification, sentiment]
5
+ label_format: "{category}#{sentiment}"
src/conf/data/sentiment_general.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ name: sentiment_general
2
+ source: vlsp2016
3
+ data_dir: /tmp/VLSP2016_SA
src/conf/data/vntc.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ name: vntc
2
+ source: local
3
+ data_dir: /home/claude-user/projects/workspace_underthesea/VNTC/Data/10Topics/Ver1.1
src/conf/model/default.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ max_features: 20000
2
+ ngram_range: [1, 2]
3
+ min_df: 1
4
+ max_df: 1.0
5
+ c: 1.0
6
+ max_iter: 1000
7
+ tol: 0.1
8
+ preprocess: false
src/conf/model/sentiment.yaml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ max_features: 200000
2
+ ngram_range: [1, 3]
3
+ min_df: 1
4
+ max_df: 0.9
5
+ c: 0.7
6
+ max_iter: 1000
7
+ tol: 0.0001
8
+ preprocess: true
9
+ preprocessor:
10
+ lowercase: true
11
+ unicode_normalize: true
12
+ remove_urls: true
13
+ normalize_repeated_chars: true
14
+ normalize_punctuation: true
15
+ teencode:
16
+ ko: "không"
17
+ k: "không"
18
+ hok: "không"
19
+ hem: "không"
20
+ dc: "được"
21
+ đc: "được"
22
+ dk: "được"
23
+ ntn: "như thế nào"
24
+ nc: "nói chuyện"
25
+ nt: "nhắn tin"
26
+ cx: "cũng"
27
+ cg: "cũng"
28
+ vs: "với"
29
+ vl: "vãi"
30
+ bt: "bình thường"
31
+ bth: "bình thường"
32
+ lg: "lượng"
33
+ tl: "trả lời"
34
+ ms: "mới"
35
+ r: "rồi"
36
+ mn: "mọi người"
37
+ mk: "mình"
38
+ ok: "tốt"
39
+ oke: "tốt"
40
+ sp: "sản phẩm"
41
+ hqua: "hôm qua"
42
+ hnay: "hôm nay"
43
+ tks: "cảm ơn"
44
+ thanks: "cảm ơn"
45
+ thank: "cảm ơn"
46
+ j: "gì"
47
+ z: "vậy"
48
+ v: "vậy"
49
+ đt: "điện thoại"
50
+ dt: "điện thoại"
51
+ lm: "làm"
52
+ ns: "nói"
53
+ negation_words:
54
+ - "không"
55
+ - "chẳng"
56
+ - "chả"
57
+ - "chưa"
58
+ - "đừng"
59
+ - "ko"
60
+ - "hok"
61
+ - "hem"
62
+ - "chăng"
63
+ negation_window: 2
src/conf/model/small.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ max_features: 10000
2
+ ngram_range: [1, 2]
3
+ min_df: 1
4
+ max_df: 0.9
5
+ c: 1.0
6
+ max_iter: 1000
7
+ tol: 0.0001
8
+ preprocess: false
src/conf/sentiment_bank.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # python src/train.py --config-name=sentiment_bank
2
+ defaults:
3
+ - data: sentiment_bank
4
+ - model: sentiment
5
+ - _self_
6
+
7
+ output: models/sen-sentiment-bank-1.0.0-${now:%Y%m%d}.bin
src/conf/sentiment_general.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # python src/train.py --config-name=sentiment_general
2
+ defaults:
3
+ - data: sentiment_general
4
+ - model: sentiment
5
+ - _self_
6
+
7
+ output: models/sen-sentiment-general-1.0.0-${now:%Y%m%d}.bin
src/conf/vntc.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # python src/train.py --config-name=vntc
2
+ defaults:
3
+ - data: vntc
4
+ - model: default
5
+ - _self_
6
+
7
+ output: models/sen-vntc-1.0.0-${now:%Y%m%d}.bin
src/train.py CHANGED
@@ -1,73 +1,57 @@
1
  """
2
- Training CLI for Vietnamese Text Classification.
3
 
4
  Usage:
5
- python train.py vntc --output models/sen-vntc.bin
6
- python train.py bank --output models/sen-bank.bin
 
 
 
 
 
 
 
7
  """
8
 
9
  import os
10
- import re
11
  import time
12
- import unicodedata
13
  from pathlib import Path
14
 
15
- import click
 
16
  from sklearn.metrics import accuracy_score, f1_score, classification_report
17
 
18
- from underthesea_core import TextClassifier
19
-
20
- # Vietnamese teencode dictionary
21
- _TEENCODE = {
22
- 'ko': 'không', 'k': 'không', 'hok': 'không', 'hem': 'không',
23
- 'dc': 'được', 'đc': 'được', 'dk': 'được',
24
- 'ntn': 'như thế nào',
25
- 'nc': 'nói chuyện', 'nt': 'nhắn tin',
26
- 'cx': 'cũng', 'cg': 'cũng',
27
- 'vs': 'với', 'vl': 'vãi',
28
- 'bt': 'bình thường', 'bth': 'bình thường',
29
- 'lg': 'lượng', 'tl': 'trả lời',
30
- 'ms': 'mới', 'r': 'rồi',
31
- 'mn': 'mọi người', 'mk': 'mình',
32
- 'ok': 'tốt', 'oke': 'tốt',
33
- 'sp': 'sản phẩm',
34
- 'hqua': 'hôm qua', 'hnay': 'hôm nay',
35
- 'tks': 'cảm ơn', 'thanks': 'cảm ơn', 'thank': 'cảm ơn',
36
- 'j': 'gì', 'z': 'vậy', 'v': 'vậy',
37
- 'đt': 'điện thoại', 'dt': 'điện thoại',
38
- 'lm': 'làm', 'ns': 'nói',
39
- }
40
-
41
- _NEG_WORDS = {'không', 'chẳng', 'chả', 'chưa', 'đừng', 'ko', 'hok', 'hem', 'chăng'}
42
-
43
-
44
- def preprocess_sentiment(text):
45
- """Preprocess Vietnamese text for sentiment analysis."""
46
- text = unicodedata.normalize('NFC', text)
47
- text = text.lower()
48
- text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
49
- text = re.sub(r'(.)\1{2,}', r'\1\1', text)
50
- text = re.sub(r'!{2,}', '!', text)
51
- text = re.sub(r'\?{2,}', '?', text)
52
- text = re.sub(r'\.{4,}', '...', text)
53
- # Teencode expansion
54
- words = text.split()
55
- expanded = []
56
- for w in words:
57
- wl = w.strip('.,!?;:')
58
- if wl in _TEENCODE:
59
- expanded.append(_TEENCODE[wl])
60
- else:
61
- expanded.append(w)
62
- # Negation marking (2-word window)
63
- new_words = list(expanded)
64
- for i, w in enumerate(expanded):
65
- wl = w.strip('.,!?;:')
66
- if wl in _NEG_WORDS:
67
- for j in range(i + 1, min(i + 3, len(expanded))):
68
- new_words[j] = 'NEG_' + expanded[j]
69
- return ' '.join(new_words)
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
  def read_file(filepath):
73
  """Read text file with multiple encoding attempts."""
@@ -85,184 +69,20 @@ def read_file(filepath):
85
  def load_vntc_data(data_dir):
86
  """Load VNTC data from directory."""
87
  texts, labels = [], []
88
-
89
  for folder in sorted(os.listdir(data_dir)):
90
  folder_path = os.path.join(data_dir, folder)
91
  if not os.path.isdir(folder_path):
92
  continue
93
-
94
  for fname in os.listdir(folder_path):
95
  if fname.endswith('.txt'):
96
  text = read_file(os.path.join(folder_path, fname))
97
  if text:
98
  texts.append(text)
99
  labels.append(folder)
100
-
101
  return texts, labels
102
 
103
 
104
- @click.group()
105
- def cli():
106
- """Train Vietnamese text classification models."""
107
- pass
108
-
109
-
110
- @cli.command()
111
- @click.option('--data-dir', default='/home/claude-user/projects/workspace_underthesea/VNTC/Data/10Topics/Ver1.1',
112
- help='Path to VNTC dataset')
113
- @click.option('--output', '-o', default='models/sen-vntc.bin', help='Output model path')
114
- @click.option('--max-features', default=20000, help='Maximum vocabulary size')
115
- @click.option('--ngram-min', default=1, help='Minimum n-gram')
116
- @click.option('--ngram-max', default=2, help='Maximum n-gram')
117
- @click.option('--min-df', default=2, help='Minimum document frequency')
118
- @click.option('--c', default=1.0, help='SVM regularization parameter')
119
- @click.option('--max-iter', default=1000, help='Maximum iterations')
120
- @click.option('--tol', default=0.1, help='Convergence tolerance')
121
- def vntc(data_dir, output, max_features, ngram_min, ngram_max, min_df, c, max_iter, tol):
122
- """Train on VNTC dataset (10 topics, ~84k documents)."""
123
- click.echo("=" * 70)
124
- click.echo("VNTC Dataset Training (10 Topics)")
125
- click.echo("=" * 70)
126
-
127
- train_dir = os.path.join(data_dir, "Train_Full")
128
- test_dir = os.path.join(data_dir, "Test_Full")
129
-
130
- # Load data
131
- click.echo("\nLoading data...")
132
- t0 = time.perf_counter()
133
- train_texts, train_labels = load_vntc_data(train_dir)
134
- test_texts, test_labels = load_vntc_data(test_dir)
135
- load_time = time.perf_counter() - t0
136
-
137
- click.echo(f" Train samples: {len(train_texts)}")
138
- click.echo(f" Test samples: {len(test_texts)}")
139
- click.echo(f" Categories: {len(set(train_labels))}")
140
- click.echo(f" Load time: {load_time:.2f}s")
141
-
142
- # Train
143
- click.echo("\nTraining Rust TextClassifier...")
144
- clf = TextClassifier(
145
- max_features=max_features,
146
- ngram_range=(ngram_min, ngram_max),
147
- min_df=min_df,
148
- c=c,
149
- max_iter=max_iter,
150
- tol=tol,
151
- )
152
-
153
- t0 = time.perf_counter()
154
- clf.fit(train_texts, train_labels)
155
- train_time = time.perf_counter() - t0
156
- click.echo(f" Training time: {train_time:.2f}s")
157
- click.echo(f" Vocabulary size: {clf.n_features}")
158
-
159
- # Evaluate
160
- click.echo("\nEvaluating...")
161
- t0 = time.perf_counter()
162
- preds = clf.predict_batch(test_texts)
163
- infer_time = time.perf_counter() - t0
164
- throughput = len(test_texts) / infer_time
165
-
166
- acc = accuracy_score(test_labels, preds)
167
- f1_w = f1_score(test_labels, preds, average='weighted')
168
- f1_m = f1_score(test_labels, preds, average='macro')
169
-
170
- click.echo(f" Inference: {infer_time:.3f}s ({throughput:.0f} samples/sec)")
171
-
172
- click.echo("\n" + "=" * 70)
173
- click.echo("RESULTS")
174
- click.echo("=" * 70)
175
- click.echo(f" Accuracy: {acc:.4f} ({acc*100:.2f}%)")
176
- click.echo(f" F1 (weighted): {f1_w:.4f}")
177
- click.echo(f" F1 (macro): {f1_m:.4f}")
178
-
179
- click.echo("\nClassification Report:")
180
- click.echo(classification_report(test_labels, preds))
181
-
182
- # Save model
183
- model_path = Path(output)
184
- model_path.parent.mkdir(parents=True, exist_ok=True)
185
- clf.save(str(model_path))
186
-
187
- size_mb = model_path.stat().st_size / (1024 * 1024)
188
- click.echo(f"\nModel saved to {model_path} ({size_mb:.2f} MB)")
189
-
190
-
191
- @cli.command()
192
- @click.option('--output', '-o', default='models/sen-bank.bin', help='Output model path')
193
- @click.option('--max-features', default=10000, help='Maximum vocabulary size')
194
- @click.option('--ngram-min', default=1, help='Minimum n-gram')
195
- @click.option('--ngram-max', default=2, help='Maximum n-gram')
196
- @click.option('--min-df', default=1, help='Minimum document frequency')
197
- @click.option('--c', default=1.0, help='SVM regularization parameter')
198
- @click.option('--max-iter', default=1000, help='Maximum iterations')
199
- @click.option('--tol', default=0.1, help='Convergence tolerance')
200
- def bank(output, max_features, ngram_min, ngram_max, min_df, c, max_iter, tol):
201
- """Train on UTS2017_Bank dataset (14 categories, banking domain)."""
202
- from datasets import load_dataset
203
-
204
- click.echo("=" * 70)
205
- click.echo("UTS2017_Bank Dataset Training (14 Categories)")
206
- click.echo("=" * 70)
207
-
208
- # Load data
209
- click.echo("\nLoading UTS2017_Bank dataset from HuggingFace...")
210
- dataset = load_dataset("undertheseanlp/UTS2017_Bank", "classification")
211
-
212
- train_texts = list(dataset["train"]["text"])
213
- train_labels = list(dataset["train"]["label"])
214
- test_texts = list(dataset["test"]["text"])
215
- test_labels = list(dataset["test"]["label"])
216
-
217
- click.echo(f" Train samples: {len(train_texts)}")
218
- click.echo(f" Test samples: {len(test_texts)}")
219
- click.echo(f" Categories: {len(set(train_labels))}")
220
-
221
- # Train
222
- click.echo("\nTraining Rust TextClassifier...")
223
- clf = TextClassifier(
224
- max_features=max_features,
225
- ngram_range=(ngram_min, ngram_max),
226
- min_df=min_df,
227
- c=c,
228
- max_iter=max_iter,
229
- tol=tol,
230
- )
231
-
232
- t0 = time.perf_counter()
233
- clf.fit(train_texts, train_labels)
234
- train_time = time.perf_counter() - t0
235
- click.echo(f" Training time: {train_time:.3f}s")
236
- click.echo(f" Vocabulary size: {clf.n_features}")
237
-
238
- # Evaluate
239
- click.echo("\nEvaluating...")
240
- preds = clf.predict_batch(test_texts)
241
-
242
- acc = accuracy_score(test_labels, preds)
243
- f1_w = f1_score(test_labels, preds, average='weighted')
244
- f1_m = f1_score(test_labels, preds, average='macro')
245
-
246
- click.echo("\n" + "=" * 70)
247
- click.echo("RESULTS")
248
- click.echo("=" * 70)
249
- click.echo(f" Accuracy: {acc:.4f} ({acc*100:.2f}%)")
250
- click.echo(f" F1 (weighted): {f1_w:.4f}")
251
- click.echo(f" F1 (macro): {f1_m:.4f}")
252
-
253
- click.echo("\nClassification Report:")
254
- click.echo(classification_report(test_labels, preds))
255
-
256
- # Save model
257
- model_path = Path(output)
258
- model_path.parent.mkdir(parents=True, exist_ok=True)
259
- clf.save(str(model_path))
260
-
261
- size_mb = model_path.stat().st_size / (1024 * 1024)
262
- click.echo(f"\nModel saved to {model_path} ({size_mb:.2f} MB)")
263
-
264
-
265
- def _load_vlsp2016(data_dir):
266
  """Load VLSP2016 sentiment data from directory."""
267
  label_map = {'POS': 'positive', 'NEG': 'negative', 'NEU': 'neutral'}
268
  texts, labels = [], []
@@ -283,209 +103,142 @@ def _load_vlsp2016(data_dir):
283
  return texts[0], labels[0], texts[1], labels[1]
284
 
285
 
286
- @cli.command('sentiment-general')
287
- @click.option('--output', '-o', default=None, help='Output model path')
288
- @click.option('--vlsp2016-dir', default=None, help='Path to VLSP2016_SA directory (adds to training data)')
289
- @click.option('--max-features', default=200000, help='Maximum vocabulary size')
290
- @click.option('--ngram-min', default=1, help='Minimum n-gram')
291
- @click.option('--ngram-max', default=3, help='Maximum n-gram')
292
- @click.option('--min-df', default=1, help='Minimum document frequency')
293
- @click.option('--max-df', default=0.9, help='Maximum document frequency')
294
- @click.option('--c', default=0.7, help='SVM regularization parameter')
295
- @click.option('--max-iter', default=1000, help='Maximum iterations')
296
- @click.option('--tol', default=0.0001, help='Convergence tolerance')
297
- def sentiment_general(output, vlsp2016_dir, max_features, ngram_min, ngram_max, min_df, max_df, c, max_iter, tol):
298
- """Train sentiment-general model (3 classes: positive/negative/neutral).
299
-
300
- Uses UTS2017_Bank sentiment data by default. Optionally adds VLSP2016 data
301
- with --vlsp2016-dir for improved general-domain coverage.
302
- """
303
- from datetime import datetime
304
- from datasets import load_dataset
305
-
306
- if output is None:
307
- date_str = datetime.now().strftime('%Y%m%d')
308
- output = f'models/sen-sentiment-general-1.0.0-{date_str}.bin'
309
-
310
- click.echo("=" * 70)
311
- click.echo("Sentiment General Training (positive/negative/neutral)")
312
- click.echo("=" * 70)
313
-
314
- # Load UTS2017_Bank sentiment data
315
- click.echo("\nLoading UTS2017_Bank sentiment dataset from HuggingFace...")
316
- dataset = load_dataset("undertheseanlp/UTS2017_Bank", "sentiment")
317
-
318
- train_texts = list(dataset["train"]["text"])
319
- train_labels = list(dataset["train"]["sentiment"])
320
- test_texts = list(dataset["test"]["text"])
321
- test_labels = list(dataset["test"]["sentiment"])
322
-
323
- vlsp_test_texts, vlsp_test_labels = None, None
324
-
325
- # Optionally add VLSP2016 data
326
- if vlsp2016_dir:
327
- click.echo(f"\nLoading VLSP2016 data from {vlsp2016_dir}...")
328
- vlsp_train_texts, vlsp_train_labels, vlsp_test_texts, vlsp_test_labels = _load_vlsp2016(vlsp2016_dir)
329
- train_texts.extend(vlsp_train_texts)
330
- train_labels.extend(vlsp_train_labels)
331
- click.echo(f" VLSP2016 train: {len(vlsp_train_texts)}, test: {len(vlsp_test_texts)}")
332
-
333
- click.echo(f" Total train samples: {len(train_texts)}")
334
- click.echo(f" UTS2017 test samples: {len(test_texts)}")
335
- click.echo(f" Labels: {sorted(set(train_labels))}")
336
-
337
- # Preprocess
338
- click.echo("\nPreprocessing...")
339
- proc_train = [preprocess_sentiment(t) for t in train_texts]
340
- proc_test = [preprocess_sentiment(t) for t in test_texts]
341
-
342
- # Train
343
- click.echo("\nTraining Rust TextClassifier...")
344
- clf = TextClassifier(
345
- max_features=max_features,
346
- ngram_range=(ngram_min, ngram_max),
347
- min_df=min_df,
348
- max_df=max_df,
349
- c=c,
350
- max_iter=max_iter,
351
- tol=tol,
352
- )
 
 
 
 
 
353
 
 
 
354
  t0 = time.perf_counter()
355
- clf.fit(proc_train, train_labels)
356
- train_time = time.perf_counter() - t0
357
- click.echo(f" Training time: {train_time:.3f}s")
358
- click.echo(f" Vocabulary size: {clf.n_features}")
359
 
360
- # Evaluate on UTS2017
361
- click.echo("\nEvaluating on UTS2017_Bank test set...")
362
- preds = clf.predict_batch(proc_test)
 
363
 
364
- acc = accuracy_score(test_labels, preds)
365
- f1_w = f1_score(test_labels, preds, average='weighted', zero_division=0)
366
- f1_m = f1_score(test_labels, preds, average='macro', zero_division=0)
 
 
 
367
 
368
- click.echo("\n" + "=" * 70)
369
- click.echo("RESULTS (UTS2017_Bank)")
370
- click.echo("=" * 70)
371
- click.echo(f" Accuracy: {acc:.4f} ({acc*100:.2f}%)")
372
- click.echo(f" F1 (weighted): {f1_w:.4f}")
373
- click.echo(f" F1 (macro): {f1_m:.4f}")
374
- click.echo("\nClassification Report:")
375
- click.echo(classification_report(test_labels, preds, zero_division=0))
376
-
377
- # Evaluate on VLSP2016 if available
378
- if vlsp_test_texts:
379
- proc_vlsp_test = [preprocess_sentiment(t) for t in vlsp_test_texts]
380
- vlsp_preds = clf.predict_batch(proc_vlsp_test)
381
- vlsp_acc = accuracy_score(vlsp_test_labels, vlsp_preds)
382
- vlsp_f1w = f1_score(vlsp_test_labels, vlsp_preds, average='weighted', zero_division=0)
383
- vlsp_f1m = f1_score(vlsp_test_labels, vlsp_preds, average='macro', zero_division=0)
384
-
385
- click.echo("=" * 70)
386
- click.echo("RESULTS (VLSP2016)")
387
- click.echo("=" * 70)
388
- click.echo(f" Accuracy: {vlsp_acc:.4f} ({vlsp_acc*100:.2f}%)")
389
- click.echo(f" F1 (weighted): {vlsp_f1w:.4f}")
390
- click.echo(f" F1 (macro): {vlsp_f1m:.4f}")
391
- click.echo("\nClassification Report:")
392
- click.echo(classification_report(vlsp_test_labels, vlsp_preds, zero_division=0))
393
 
394
- # Save model
395
- model_path = Path(output)
396
- model_path.parent.mkdir(parents=True, exist_ok=True)
397
- clf.save(str(model_path))
398
 
399
- size_mb = model_path.stat().st_size / (1024 * 1024)
400
- click.echo(f"\nModel saved to {model_path} ({size_mb:.2f} MB)")
401
-
402
-
403
- @cli.command('sentiment-bank')
404
- @click.option('--output', '-o', default=None, help='Output model path')
405
- @click.option('--max-features', default=200000, help='Maximum vocabulary size')
406
- @click.option('--ngram-min', default=1, help='Minimum n-gram')
407
- @click.option('--ngram-max', default=3, help='Maximum n-gram')
408
- @click.option('--min-df', default=1, help='Minimum document frequency')
409
- @click.option('--max-df', default=0.9, help='Maximum document frequency')
410
- @click.option('--c', default=0.7, help='SVM regularization parameter')
411
- @click.option('--max-iter', default=1000, help='Maximum iterations')
412
- @click.option('--tol', default=0.0001, help='Convergence tolerance')
413
- def sentiment_bank(output, max_features, ngram_min, ngram_max, min_df, max_df, c, max_iter, tol):
414
- """Train sentiment-bank model on UTS2017_Bank (36 combined category#sentiment labels)."""
415
- from datetime import datetime
416
- from datasets import load_dataset
417
-
418
- if output is None:
419
- date_str = datetime.now().strftime('%Y%m%d')
420
- output = f'models/sen-sentiment-bank-1.0.0-{date_str}.bin'
421
-
422
- click.echo("=" * 70)
423
- click.echo("Sentiment Bank Training (category#sentiment, 36 labels)")
424
- click.echo("=" * 70)
425
-
426
- # Load and merge classification + sentiment configs
427
- click.echo("\nLoading UTS2017_Bank dataset from HuggingFace...")
428
- ds_class = load_dataset("undertheseanlp/UTS2017_Bank", "classification")
429
- ds_sent = load_dataset("undertheseanlp/UTS2017_Bank", "sentiment")
430
-
431
- train_texts = list(ds_class["train"]["text"])
432
- train_labels = [f'{c}#{s}' for c, s in zip(ds_class["train"]["label"], ds_sent["train"]["sentiment"])]
433
- test_texts = list(ds_class["test"]["text"])
434
- test_labels = [f'{c}#{s}' for c, s in zip(ds_class["test"]["label"], ds_sent["test"]["sentiment"])]
435
-
436
- click.echo(f" Train samples: {len(train_texts)}")
437
- click.echo(f" Test samples: {len(test_texts)}")
438
- click.echo(f" Labels: {len(set(train_labels))}")
439
-
440
- # Preprocess
441
- click.echo("\nPreprocessing...")
442
- proc_train = [preprocess_sentiment(t) for t in train_texts]
443
- proc_test = [preprocess_sentiment(t) for t in test_texts]
444
-
445
- # Train
446
- click.echo("\nTraining Rust TextClassifier...")
447
  clf = TextClassifier(
448
- max_features=max_features,
449
- ngram_range=(ngram_min, ngram_max),
450
- min_df=min_df,
451
- max_df=max_df,
452
- c=c,
453
- max_iter=max_iter,
454
- tol=tol,
 
455
  )
456
 
457
  t0 = time.perf_counter()
458
- clf.fit(proc_train, train_labels)
459
  train_time = time.perf_counter() - t0
460
- click.echo(f" Training time: {train_time:.3f}s")
461
- click.echo(f" Vocabulary size: {clf.n_features}")
462
-
463
- # Evaluate
464
- click.echo("\nEvaluating...")
465
- preds = clf.predict_batch(proc_test)
466
 
467
- acc = accuracy_score(test_labels, preds)
468
- f1_w = f1_score(test_labels, preds, average='weighted', zero_division=0)
469
- f1_m = f1_score(test_labels, preds, average='macro', zero_division=0)
470
-
471
- click.echo("\n" + "=" * 70)
472
- click.echo("RESULTS")
473
- click.echo("=" * 70)
474
- click.echo(f" Accuracy: {acc:.4f} ({acc*100:.2f}%)")
475
- click.echo(f" F1 (weighted): {f1_w:.4f}")
476
- click.echo(f" F1 (macro): {f1_m:.4f}")
477
 
478
- click.echo("\nClassification Report:")
479
- click.echo(classification_report(test_labels, preds, zero_division=0))
 
 
480
 
481
  # Save model
 
482
  model_path = Path(output)
483
  model_path.parent.mkdir(parents=True, exist_ok=True)
484
  clf.save(str(model_path))
485
 
486
  size_mb = model_path.stat().st_size / (1024 * 1024)
487
- click.echo(f"\nModel saved to {model_path} ({size_mb:.2f} MB)")
488
 
489
 
490
  if __name__ == "__main__":
491
- cli()
 
1
  """
2
+ Training CLI for Vietnamese Text Classification using Hydra.
3
 
4
  Usage:
5
+ python src/train.py --config-name=vntc
6
+ python src/train.py --config-name=sentiment_general
7
+ python src/train.py --config-name=sentiment_bank
8
+ python src/train.py --config-name=bank
9
+
10
+ Override params from CLI:
11
+ python src/train.py --config-name=sentiment_general model.c=0.5 model.max_features=100000
12
+ python src/train.py --config-name=vntc model=sentiment
13
+ python src/train.py --config-name=sentiment_general data.data_dir=/path/to/VLSP2016_SA
14
  """
15
 
16
  import os
 
17
  import time
18
+ import logging
19
  from pathlib import Path
20
 
21
+ import hydra
22
+ from omegaconf import DictConfig, OmegaConf
23
  from sklearn.metrics import accuracy_score, f1_score, classification_report
24
 
25
+ from underthesea import TextClassifier, TextPreprocessor
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
+ log = logging.getLogger(__name__)
28
+
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # Preprocessor
32
+ # ---------------------------------------------------------------------------
33
+
34
+ def build_preprocessor(pp_cfg):
35
+ """Build a Rust TextPreprocessor from model.preprocessor config."""
36
+ teencode = dict(pp_cfg.get("teencode", {})) or None
37
+ neg_words = list(pp_cfg.get("negation_words", [])) or None
38
+ neg_window = pp_cfg.get("negation_window", 2)
39
+
40
+ return TextPreprocessor(
41
+ lowercase=pp_cfg.get("lowercase", True),
42
+ unicode_normalize=pp_cfg.get("unicode_normalize", True),
43
+ remove_urls=pp_cfg.get("remove_urls", True),
44
+ normalize_repeated_chars=pp_cfg.get("normalize_repeated_chars", True),
45
+ normalize_punctuation=pp_cfg.get("normalize_punctuation", True),
46
+ teencode=teencode,
47
+ negation_words=neg_words,
48
+ negation_window=neg_window,
49
+ )
50
+
51
+
52
+ # ---------------------------------------------------------------------------
53
+ # Data loaders
54
+ # ---------------------------------------------------------------------------
55
 
56
  def read_file(filepath):
57
  """Read text file with multiple encoding attempts."""
 
69
  def load_vntc_data(data_dir):
70
  """Load VNTC data from directory."""
71
  texts, labels = [], []
 
72
  for folder in sorted(os.listdir(data_dir)):
73
  folder_path = os.path.join(data_dir, folder)
74
  if not os.path.isdir(folder_path):
75
  continue
 
76
  for fname in os.listdir(folder_path):
77
  if fname.endswith('.txt'):
78
  text = read_file(os.path.join(folder_path, fname))
79
  if text:
80
  texts.append(text)
81
  labels.append(folder)
 
82
  return texts, labels
83
 
84
 
85
+ def load_vlsp2016(data_dir):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  """Load VLSP2016 sentiment data from directory."""
87
  label_map = {'POS': 'positive', 'NEG': 'negative', 'NEU': 'neutral'}
88
  texts, labels = [], []
 
103
  return texts[0], labels[0], texts[1], labels[1]
104
 
105
 
106
+ def load_data(cfg):
107
+ """Load train/test data based on Hydra data config."""
108
+ data_cfg = cfg.data
109
+ name = data_cfg.name
110
+ extra_test = {}
111
+
112
+ if name == "vntc":
113
+ train_texts, train_labels = load_vntc_data(
114
+ os.path.join(data_cfg.data_dir, "Train_Full"))
115
+ test_texts, test_labels = load_vntc_data(
116
+ os.path.join(data_cfg.data_dir, "Test_Full"))
117
+
118
+ elif name == "bank":
119
+ from datasets import load_dataset
120
+ dataset = load_dataset(data_cfg.dataset, data_cfg.config)
121
+ train_texts = list(dataset["train"]["text"])
122
+ train_labels = list(dataset["train"]["label"])
123
+ test_texts = list(dataset["test"]["text"])
124
+ test_labels = list(dataset["test"]["label"])
125
+
126
+ elif name == "sentiment_general":
127
+ train_texts, train_labels, test_texts, test_labels = load_vlsp2016(
128
+ data_cfg.data_dir)
129
+
130
+ elif name == "sentiment_bank":
131
+ from datasets import load_dataset
132
+ ds_class = load_dataset(data_cfg.dataset, "classification")
133
+ ds_sent = load_dataset(data_cfg.dataset, "sentiment")
134
+ train_texts = list(ds_class["train"]["text"])
135
+ train_labels = [f'{c}#{s}' for c, s in
136
+ zip(ds_class["train"]["label"], ds_sent["train"]["sentiment"])]
137
+ test_texts = list(ds_class["test"]["text"])
138
+ test_labels = [f'{c}#{s}' for c, s in
139
+ zip(ds_class["test"]["label"], ds_sent["test"]["sentiment"])]
140
+ else:
141
+ raise ValueError(f"Unknown data: {name}")
142
+
143
+ return train_texts, train_labels, test_texts, test_labels, extra_test
144
+
145
+
146
+ # ---------------------------------------------------------------------------
147
+ # Evaluate
148
+ # ---------------------------------------------------------------------------
149
+
150
+ def evaluate(test_labels, preds, name=""):
151
+ """Print evaluation metrics."""
152
+ acc = accuracy_score(test_labels, preds)
153
+ f1_w = f1_score(test_labels, preds, average='weighted', zero_division=0)
154
+ f1_m = f1_score(test_labels, preds, average='macro', zero_division=0)
155
+
156
+ header = f"RESULTS ({name})" if name else "RESULTS"
157
+ log.info("=" * 70)
158
+ log.info(header)
159
+ log.info("=" * 70)
160
+ log.info(f" Accuracy: {acc:.4f} ({acc*100:.2f}%)")
161
+ log.info(f" F1 (weighted): {f1_w:.4f}")
162
+ log.info(f" F1 (macro): {f1_m:.4f}")
163
+ log.info("\n" + classification_report(test_labels, preds, zero_division=0))
164
+ return acc
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # Main
169
+ # ---------------------------------------------------------------------------
170
+
171
+ @hydra.main(version_base=None, config_path="conf", config_name="config")
172
+ def train(cfg: DictConfig):
173
+ """Train Vietnamese text classification model."""
174
+ log.info("=" * 70)
175
+ log.info(f"Training: {cfg.data.name}")
176
+ log.info("=" * 70)
177
+ log.info(f"\nConfig:\n{OmegaConf.to_yaml(cfg)}")
178
 
179
+ # Load data
180
+ log.info("Loading data...")
181
  t0 = time.perf_counter()
182
+ train_texts, train_labels, test_texts, test_labels, extra_test = load_data(cfg)
183
+ load_time = time.perf_counter() - t0
 
 
184
 
185
+ log.info(f" Train samples: {len(train_texts)}")
186
+ log.info(f" Test samples: {len(test_texts)}")
187
+ log.info(f" Labels: {len(set(train_labels))}")
188
+ log.info(f" Load time: {load_time:.2f}s")
189
 
190
+ # Build preprocessor — model.preprocess=true activates model.preprocessor config
191
+ # Preprocessor is passed to TextClassifier and packed into the .bin model
192
+ preprocessor = None
193
+ if cfg.model.get("preprocess", False):
194
+ preprocessor = build_preprocessor(cfg.model.preprocessor)
195
+ log.info(f"\nPreprocessor: {preprocessor}")
196
 
197
+ # Build classifier from config
198
+ model_cfg = cfg.model
199
+ ngram_range = tuple(model_cfg.ngram_range)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
+ log.info("\nTraining TextClassifier...")
202
+ log.info(f" max_features={model_cfg.max_features}, ngram_range={ngram_range}, "
203
+ f"max_df={model_cfg.max_df}, C={model_cfg.c}")
 
204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  clf = TextClassifier(
206
+ max_features=model_cfg.max_features,
207
+ ngram_range=ngram_range,
208
+ min_df=model_cfg.min_df,
209
+ max_df=model_cfg.max_df,
210
+ c=model_cfg.c,
211
+ max_iter=model_cfg.max_iter,
212
+ tol=model_cfg.tol,
213
+ preprocessor=preprocessor,
214
  )
215
 
216
  t0 = time.perf_counter()
217
+ clf.fit(train_texts, train_labels)
218
  train_time = time.perf_counter() - t0
219
+ log.info(f" Training time: {train_time:.3f}s")
220
+ log.info(f" Vocabulary size: {clf.n_features}")
 
 
 
 
221
 
222
+ # Evaluate on primary test set
223
+ # TextClassifier auto-preprocesses via its built-in preprocessor
224
+ log.info("\nEvaluating...")
225
+ preds = clf.predict_batch(test_texts)
226
+ evaluate(test_labels, preds, cfg.data.name)
 
 
 
 
 
227
 
228
+ # Evaluate on extra test sets (e.g. VLSP2016)
229
+ for name, (et_texts, et_labels) in extra_test.items():
230
+ et_preds = clf.predict_batch(et_texts)
231
+ evaluate(et_labels, et_preds, name)
232
 
233
  # Save model
234
+ output = cfg.output
235
  model_path = Path(output)
236
  model_path.parent.mkdir(parents=True, exist_ok=True)
237
  clf.save(str(model_path))
238
 
239
  size_mb = model_path.stat().st_size / (1024 * 1024)
240
+ log.info(f"\nModel saved to {model_path} ({size_mb:.2f} MB)")
241
 
242
 
243
  if __name__ == "__main__":
244
+ train()