rain1024 and Claude Opus 4.5 committed
Commit b059f86 · 1 parent: 2ac432e

Refactor: move Rust extensions to underthesea_core upstream


- Remove local Rust extensions (now in underthesea_core 3.1.7)
- Consolidate training scripts into unified CLI (src/train.py)
- Add benchmark CLI (src/bench.py) with vntc, bank, synthetic commands
- Remove src/sen module (TextClassifier now in underthesea_core)
- Add CLAUDE.md for project documentation
- Update dependencies to use underthesea_core>=3.1.7

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

CLAUDE.md ADDED
@@ -0,0 +1,77 @@
+ # CLAUDE.md
+
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+ ## Project Overview
+
+ Sen-1 is a lightweight Vietnamese text classification model combining TF-IDF vectorization with Linear SVM. Part of the UnderTheSea NLP ecosystem, it serves as a practical baseline compatible with the underthesea API.
+
+ ## Build & Development Commands
+
+ ### Running Tests
+ ```bash
+ pytest tests/test_classifier.py
+ ```
+
+ ### Development Installation
+ ```bash
+ pip install -e ".[dev]"
+ ```
+
+ ### Training on VNTC Dataset
+ ```bash
+ python src/scripts/train_vntc.py
+ ```
+
+ ## Architecture
+
+ **3-Stage Pipeline:**
+ ```
+ Input Text → TF-IDF Vectorizer (max_features=20k, ngram 1-2)
+ → Linear SVM (C=1.0, max_iter=1000)
+ → Label + Confidence
+ ```
+
+ **Key Design Decisions:**
+ - Operates at syllable-level (no word segmentation) for speed
+ - All-Rust implementation via `underthesea_core` for fast training and inference
+ - Model serialization uses binary format (bincode)
+
+ **Core Module:** `src/sen/text_classifier.py` contains `SenTextClassifier` wrapper with train/predict/evaluate/save/load methods.
+
+ **Rust Backend:** Uses `underthesea_core.TextClassifier` which combines TF-IDF vectorization and Linear SVM in a unified Rust implementation via PyO3.
+
+ ## Public API
+
+ ```python
+ from sen import SenTextClassifier, Sentence, Label, classify
+
+ # Pre-trained model inference
+ labels = classify("Văn bản tiếng Việt", model_path="models/sen-1")
+
+ # Custom training
+ clf = SenTextClassifier()
+ clf.train(texts, labels)
+ clf.save("my_model")
+ ```
+
+ ## Key Files
+
+ - `src/sen/text_classifier.py` - Main classifier implementation (wraps underthesea_core)
+ - `src/scripts/train_vntc.py` - Training script for VNTC dataset
+ - `src/scripts/bench_vntc_full.py` - Benchmark comparing sklearn vs Rust
+ - `TECHNICAL_REPORT.md` - Detailed methodology and benchmark results
+ - `RESEARCH_PLAN.md` - Future work roadmap (PhoBERT comparison, word segmentation)
+
+ ## Performance Benchmarks
+
+ - VNTC (news, 10 topics): 92.49% accuracy, 37.6s training
+ - UTS2017_Bank (14 categories): 75.76% accuracy
+ - Inference: 66,678 samples/sec batch, 0.465ms single
+
+ ## Known Limitations
+
+ 1. Syllable-level only (no word segmentation) - ~4.6% gap vs word-level approaches
+ 2. Single-label classification only
+ 3. Trained on news domain - may not generalize to social media/reviews
+ 4. Lower performance on imbalanced categories
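The "syllable-level, ngram 1-2" design in the CLAUDE.md above can be illustrated with a short pure-Python sketch. This is illustrative only: the real feature extraction happens inside `underthesea_core`'s Rust TF-IDF, and `syllable_ngrams` is a hypothetical helper, not part of the project's API.

```python
# Hypothetical sketch: syllable-level 1- and 2-gram extraction, mirroring the
# TF-IDF settings documented in CLAUDE.md (ngram range 1-2, whitespace-separated
# Vietnamese syllables, no word segmentation step).
def syllable_ngrams(text, n_min=1, n_max=2):
    syllables = text.lower().split()  # each whitespace token is one syllable
    grams = []
    for n in range(n_min, n_max + 1):
        for i in range(len(syllables) - n + 1):
            grams.append(" ".join(syllables[i:i + n]))
    return grams

print(syllable_ngrams("Văn bản tiếng Việt"))
# ['văn', 'bản', 'tiếng', 'việt', 'văn bản', 'bản tiếng', 'tiếng việt']
```

Because no word segmenter runs, "tiếng việt" is only captured as a bigram feature rather than a single segmented word, which is the source of the ~4.6% gap versus word-level approaches noted in the limitations.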
extensions/underthesea_core_extend/.gitignore DELETED
@@ -1,5 +0,0 @@
- target/
- .venv/
- __pycache__/
- *.so
- *.pyc
extensions/underthesea_core_extend/Cargo.lock DELETED
@@ -1,351 +0,0 @@
- # This file is automatically @generated by Cargo.
- # It is not intended for manual editing.
- version = 4
-
- [[package]]
- name = "allocator-api2"
- version = "0.2.21"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
-
- [[package]]
- name = "autocfg"
- version = "1.5.0"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
-
- [[package]]
- name = "cfg-if"
- version = "1.0.4"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
-
- [[package]]
- name = "crossbeam-deque"
- version = "0.8.6"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
- dependencies = [
-  "crossbeam-epoch",
-  "crossbeam-utils",
- ]
-
- [[package]]
- name = "crossbeam-epoch"
- version = "0.9.18"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
- dependencies = [
-  "crossbeam-utils",
- ]
-
- [[package]]
- name = "crossbeam-utils"
- version = "0.8.21"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
-
- [[package]]
- name = "either"
- version = "1.15.0"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
-
- [[package]]
- name = "equivalent"
- version = "1.0.2"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
-
- [[package]]
- name = "foldhash"
- version = "0.1.5"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
-
- [[package]]
- name = "hashbrown"
- version = "0.15.5"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
- dependencies = [
-  "allocator-api2",
-  "equivalent",
-  "foldhash",
-  "serde",
- ]
-
- [[package]]
- name = "heck"
- version = "0.5.0"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
-
- [[package]]
- name = "indoc"
- version = "2.0.7"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706"
- dependencies = [
-  "rustversion",
- ]
-
- [[package]]
- name = "itoa"
- version = "1.0.17"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2"
-
- [[package]]
- name = "libc"
- version = "0.2.180"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc"
-
- [[package]]
- name = "memchr"
- version = "2.7.6"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
-
- [[package]]
- name = "memoffset"
- version = "0.9.1"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
- dependencies = [
-  "autocfg",
- ]
-
- [[package]]
- name = "once_cell"
- version = "1.21.3"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
-
- [[package]]
- name = "portable-atomic"
- version = "1.13.1"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49"
-
- [[package]]
- name = "proc-macro2"
- version = "1.0.106"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
- dependencies = [
-  "unicode-ident",
- ]
-
- [[package]]
- name = "pyo3"
- version = "0.22.6"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884"
- dependencies = [
-  "cfg-if",
-  "indoc",
-  "libc",
-  "memoffset",
-  "once_cell",
-  "portable-atomic",
-  "pyo3-build-config",
-  "pyo3-ffi",
-  "pyo3-macros",
-  "unindent",
- ]
-
- [[package]]
- name = "pyo3-build-config"
- version = "0.22.6"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38"
- dependencies = [
-  "once_cell",
-  "target-lexicon",
- ]
-
- [[package]]
- name = "pyo3-ffi"
- version = "0.22.6"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636"
- dependencies = [
-  "libc",
-  "pyo3-build-config",
- ]
-
- [[package]]
- name = "pyo3-macros"
- version = "0.22.6"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453"
- dependencies = [
-  "proc-macro2",
-  "pyo3-macros-backend",
-  "quote",
-  "syn",
- ]
-
- [[package]]
- name = "pyo3-macros-backend"
- version = "0.22.6"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe"
- dependencies = [
-  "heck",
-  "proc-macro2",
-  "pyo3-build-config",
-  "quote",
-  "syn",
- ]
-
- [[package]]
- name = "quote"
- version = "1.0.44"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4"
- dependencies = [
-  "proc-macro2",
- ]
-
- [[package]]
- name = "rayon"
- version = "1.11.0"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f"
- dependencies = [
-  "either",
-  "rayon-core",
- ]
-
- [[package]]
- name = "rayon-core"
- version = "1.13.0"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
- dependencies = [
-  "crossbeam-deque",
-  "crossbeam-utils",
- ]
-
- [[package]]
- name = "rustversion"
- version = "1.0.22"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
-
- [[package]]
- name = "serde"
- version = "1.0.228"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
- dependencies = [
-  "serde_core",
-  "serde_derive",
- ]
-
- [[package]]
- name = "serde_core"
- version = "1.0.228"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
- dependencies = [
-  "serde_derive",
- ]
-
- [[package]]
- name = "serde_derive"
- version = "1.0.228"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
- dependencies = [
-  "proc-macro2",
-  "quote",
-  "syn",
- ]
-
- [[package]]
- name = "serde_json"
- version = "1.0.149"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
- dependencies = [
-  "itoa",
-  "memchr",
-  "serde",
-  "serde_core",
-  "zmij",
- ]
-
- [[package]]
- name = "syn"
- version = "2.0.114"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a"
- dependencies = [
-  "proc-macro2",
-  "quote",
-  "unicode-ident",
- ]
-
- [[package]]
- name = "target-lexicon"
- version = "0.12.16"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
-
- [[package]]
- name = "tinyvec"
- version = "1.10.0"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa"
- dependencies = [
-  "tinyvec_macros",
- ]
-
- [[package]]
- name = "tinyvec_macros"
- version = "0.1.1"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
-
- [[package]]
- name = "underthesea_core_extend"
- version = "0.1.0"
- dependencies = [
-  "hashbrown",
-  "pyo3",
-  "rayon",
-  "serde",
-  "serde_json",
-  "unicode-normalization",
- ]
-
- [[package]]
- name = "unicode-ident"
- version = "1.0.22"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
-
- [[package]]
- name = "unicode-normalization"
- version = "0.1.25"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8"
- dependencies = [
-  "tinyvec",
- ]
-
- [[package]]
- name = "unindent"
- version = "0.2.4"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
-
- [[package]]
- name = "zmij"
- version = "1.0.19"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "3ff05f8caa9038894637571ae6b9e29466c1f4f829d26c9b28f869a29cbe3445"
extensions/underthesea_core_extend/Cargo.toml DELETED
@@ -1,23 +0,0 @@
- [package]
- name = "underthesea_core_extend"
- version = "0.1.0"
- edition = "2021"
- description = "Rust extensions for underthesea - Text Classification"
- license = "Apache-2.0"
-
- [lib]
- name = "underthesea_core_extend"
- crate-type = ["cdylib"]
-
- [dependencies]
- pyo3 = { version = "0.22", features = ["extension-module"] }
- serde = { version = "1.0", features = ["derive"] }
- serde_json = "1.0"
- rayon = "1.10"
- hashbrown = { version = "0.15", features = ["serde"] }
- unicode-normalization = "0.1"
-
- [profile.release]
- lto = true
- codegen-units = 1
- opt-level = 3
extensions/underthesea_core_extend/pyproject.toml DELETED
@@ -1,22 +0,0 @@
- [build-system]
- requires = ["maturin>=1.0,<2.0"]
- build-backend = "maturin"
-
- [project]
- name = "underthesea_core_extend"
- version = "0.1.0"
- description = "Rust extensions for underthesea - Text Classification"
- requires-python = ">=3.10"
- license = { text = "Apache-2.0" }
- authors = [{ name = "UnderTheSea NLP", email = "anhv.ict91@gmail.com" }]
- classifiers = [
-     "Programming Language :: Rust",
-     "Programming Language :: Python :: Implementation :: CPython",
-     "Programming Language :: Python :: 3.10",
-     "Programming Language :: Python :: 3.11",
-     "Programming Language :: Python :: 3.12",
- ]
-
- [tool.maturin]
- features = ["pyo3/extension-module"]
- module-name = "underthesea_core_extend"
extensions/underthesea_core_extend/src/lib.rs DELETED
@@ -1,21 +0,0 @@
- //! underthesea_core_extend - Rust extensions for Vietnamese Text Classification
- //!
- //! Provides fast TF-IDF vectorization and Linear SVM classification.
-
- use pyo3::prelude::*;
-
- mod tfidf;
- mod svm;
-
- pub use tfidf::TfIdfVectorizer;
- pub use svm::{LinearSVM, SVMTrainer, FastSVMTrainer};
-
- /// Python module
- #[pymodule]
- fn underthesea_core_extend(m: &Bound<'_, PyModule>) -> PyResult<()> {
-     m.add_class::<TfIdfVectorizer>()?;
-     m.add_class::<LinearSVM>()?;
-     m.add_class::<SVMTrainer>()?;
-     m.add_class::<FastSVMTrainer>()?;
-     Ok(())
- }
extensions/underthesea_core_extend/src/svm.rs DELETED
@@ -1,512 +0,0 @@
- //! Optimized Linear SVM - LIBLINEAR-style Dual Coordinate Descent
- //!
- //! Pure Rust implementation of L2-regularized L2-loss SVM (dual form)
- //! Reference: "A Dual Coordinate Descent Method for Large-scale Linear SVM"
- //! Hsieh et al., ICML 2008
-
- use hashbrown::HashMap;
- use pyo3::prelude::*;
- use rayon::prelude::*;
- use serde::{Deserialize, Serialize};
- use std::fs::File;
- use std::io::{BufReader, BufWriter};
-
- /// Sparse feature vector
- pub type SparseVec = Vec<(u32, f32)>; // Use u32/f32 for memory efficiency
-
- /// Linear SVM Model
- #[pyclass]
- #[derive(Clone, Serialize, Deserialize)]
- pub struct LinearSVM {
-     weights: Vec<Vec<f32>>,
-     biases: Vec<f32>,
-     classes: Vec<String>,
-     n_features: usize,
- }
-
- #[pymethods]
- impl LinearSVM {
-     #[new]
-     pub fn new() -> Self {
-         Self {
-             weights: Vec::new(),
-             biases: Vec::new(),
-             classes: Vec::new(),
-             n_features: 0,
-         }
-     }
-
-     pub fn predict(&self, features: Vec<f64>) -> String {
-         let idx = self.predict_idx(&features);
-         self.classes[idx].clone()
-     }
-
-     pub fn predict_with_score(&self, features: Vec<f64>) -> (String, f64) {
-         let scores = self.decision_scores(&features);
-         let (idx, &max_score) = scores
-             .iter()
-             .enumerate()
-             .max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
-             .unwrap();
-         let confidence = 1.0 / (1.0 + (-max_score as f64).exp());
-         (self.classes[idx].clone(), confidence)
-     }
-
-     pub fn predict_batch(&self, features_batch: Vec<Vec<f64>>) -> Vec<String> {
-         features_batch
-             .par_iter()
-             .map(|f| {
-                 let idx = self.predict_idx(f);
-                 self.classes[idx].clone()
-             })
-             .collect()
-     }
-
-     pub fn predict_batch_sparse(&self, features_batch: Vec<Vec<(usize, f64)>>) -> Vec<String> {
-         features_batch
-             .par_iter()
-             .map(|f| {
-                 let idx = self.predict_idx_sparse(f);
-                 self.classes[idx].clone()
-             })
-             .collect()
-     }
-
-     pub fn predict_sparse_with_score(&self, features: Vec<(usize, f64)>) -> (String, f64) {
-         let scores = self.decision_scores_sparse(&features);
-         let (idx, &max_score) = scores
-             .iter()
-             .enumerate()
-             .max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
-             .unwrap();
-         let confidence = 1.0 / (1.0 + (-max_score as f64).exp());
-         (self.classes[idx].clone(), confidence)
-     }
-
-     pub fn decision_function(&self, features: Vec<f64>) -> Vec<f64> {
-         self.decision_scores(&features).into_iter().map(|x| x as f64).collect()
-     }
-
-     #[getter]
-     pub fn classes(&self) -> Vec<String> {
-         self.classes.clone()
-     }
-
-     #[getter]
-     pub fn n_classes(&self) -> usize {
-         self.classes.len()
-     }
-
-     #[getter]
-     pub fn n_features(&self) -> usize {
-         self.n_features
-     }
-
-     pub fn save(&self, path: &str) -> PyResult<()> {
-         let file = File::create(path)
-             .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(e.to_string()))?;
-         let writer = BufWriter::new(file);
-         serde_json::to_writer(writer, self)
-             .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(e.to_string()))?;
-         Ok(())
-     }
-
-     #[staticmethod]
-     pub fn load(path: &str) -> PyResult<Self> {
-         let file = File::open(path)
-             .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(e.to_string()))?;
-         let reader = BufReader::new(file);
-         let model: Self = serde_json::from_reader(reader)
-             .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(e.to_string()))?;
-         Ok(model)
-     }
- }
-
- impl LinearSVM {
-     #[inline]
-     fn predict_idx(&self, features: &[f64]) -> usize {
-         let mut best_idx = 0;
-         let mut best_score = f32::NEG_INFINITY;
-
-         for (idx, (w, &b)) in self.weights.iter().zip(self.biases.iter()).enumerate() {
-             let score: f32 = w.iter()
-                 .zip(features.iter())
-                 .map(|(&wi, &fi)| wi * fi as f32)
-                 .sum::<f32>() + b;
-
-             if score > best_score {
-                 best_score = score;
-                 best_idx = idx;
-             }
-         }
-         best_idx
-     }
-
-     #[inline]
-     fn predict_idx_sparse(&self, features: &[(usize, f64)]) -> usize {
-         let mut best_idx = 0;
-         let mut best_score = f32::NEG_INFINITY;
-
-         for (idx, (w, &b)) in self.weights.iter().zip(self.biases.iter()).enumerate() {
-             let score: f32 = features.iter()
-                 .map(|&(j, v)| w[j] * v as f32)
-                 .sum::<f32>() + b;
-
-             if score > best_score {
-                 best_score = score;
-                 best_idx = idx;
-             }
-         }
-         best_idx
-     }
-
-     fn decision_scores(&self, features: &[f64]) -> Vec<f32> {
-         self.weights
-             .iter()
-             .zip(self.biases.iter())
-             .map(|(w, &b)| {
-                 w.iter()
-                     .zip(features.iter())
-                     .map(|(&wi, &fi)| wi * fi as f32)
-                     .sum::<f32>() + b
-             })
-             .collect()
-     }
-
-     fn decision_scores_sparse(&self, features: &[(usize, f64)]) -> Vec<f32> {
-         self.weights
-             .iter()
-             .zip(self.biases.iter())
-             .map(|(w, &b)| {
-                 features.iter()
-                     .map(|&(j, v)| w[j] * v as f32)
-                     .sum::<f32>() + b
-             })
-             .collect()
-     }
- }
-
- /// LIBLINEAR-style SVM Trainer
- #[pyclass]
- pub struct SVMTrainer {
-     c: f64,
-     max_iter: usize,
-     tol: f64,
-     verbose: bool,
- }
-
- #[pymethods]
- impl SVMTrainer {
-     #[new]
-     #[pyo3(signature = (c=1.0, max_iter=1000, tol=0.1, verbose=false))]
-     pub fn new(c: f64, max_iter: usize, tol: f64, verbose: bool) -> Self {
-         Self { c, max_iter, tol, verbose }
-     }
-
-     pub fn set_c(&mut self, c: f64) {
-         self.c = c;
-     }
-
-     pub fn set_max_iter(&mut self, max_iter: usize) {
-         self.max_iter = max_iter;
-     }
-
-     pub fn set_verbose(&mut self, verbose: bool) {
-         self.verbose = verbose;
-     }
-
-     pub fn train(&self, features: Vec<Vec<f64>>, labels: Vec<String>) -> LinearSVM {
-         let n_samples = features.len();
-         let n_features = if n_samples > 0 { features[0].len() } else { 0 };
-
-         // Convert to compact sparse format (f32 for memory/cache efficiency)
-         let sparse_features: Vec<SparseVec> = features
-             .par_iter()
-             .map(|dense| {
-                 dense
-                     .iter()
-                     .enumerate()
-                     .filter(|&(_, &v)| v.abs() > 1e-10)
-                     .map(|(i, &v)| (i as u32, v as f32))
-                     .collect()
-             })
-             .collect();
-
-         // Precompute ||x_i||^2
-         let x_sq_norms: Vec<f32> = sparse_features
-             .par_iter()
-             .map(|x| x.iter().map(|&(_, v)| v * v).sum())
-             .collect();
-
-         // Get unique classes
-         let mut classes: Vec<String> = labels.iter().cloned().collect();
-         classes.sort();
-         classes.dedup();
-         let n_classes = classes.len();
-
-         let class_to_idx: HashMap<String, usize> = classes
-             .iter()
-             .enumerate()
-             .map(|(i, c)| (c.clone(), i))
-             .collect();
-
-         let y_idx: Vec<usize> = labels.iter().map(|l| class_to_idx[l]).collect();
-
-         // Train binary classifiers in parallel (one-vs-rest)
-         let results: Vec<(Vec<f32>, f32)> = (0..n_classes)
-             .into_par_iter()
-             .map(|class_idx| {
-                 let y_binary: Vec<i8> = y_idx
-                     .iter()
-                     .map(|&idx| if idx == class_idx { 1 } else { -1 })
-                     .collect();
-
-                 solve_l2r_l2_svc(
-                     &sparse_features,
-                     &y_binary,
-                     &x_sq_norms,
-                     n_features,
-                     self.c as f32,
-                     self.tol as f32,
-                     self.max_iter,
-                 )
-             })
-             .collect();
-
-         let weights = results.iter().map(|(w, _)| w.clone()).collect();
-         let biases = results.iter().map(|(_, b)| *b).collect();
-
-         LinearSVM {
-             weights,
-             biases,
-             classes,
-             n_features,
-         }
-     }
- }
-
- /// LIBLINEAR's solve_l2r_l2_svc - Dual Coordinate Descent for L2-loss SVM
- ///
- /// Solves: min_α 0.5 * α^T * Q * α - e^T * α, s.t. α_i ≥ 0
- /// where Q_ij = y_i * y_j * x_i^T * x_j + δ_ij / (2C)
- ///
- /// Primal-dual relationship: w = Σ α_i * y_i * x_i
- #[inline(never)]
- fn solve_l2r_l2_svc(
-     x: &[SparseVec],
-     y: &[i8],
-     x_sq_norms: &[f32],
-     n_features: usize,
-     c: f32,
-     eps: f32,
-     max_iter: usize,
- ) -> (Vec<f32>, f32) {
-     let n = x.len();
-
-     // D_ii = 1/(2C) for L2-loss SVM
-     let diag = 0.5 / c;
-
-     // QD[i] = ||x_i||^2 + D_ii
-     let qd: Vec<f32> = x_sq_norms.iter().map(|&xn| xn + diag).collect();
-
-     // Initialize α = 0
-     let mut alpha = vec![0.0f32; n];
-
-     // w = Σ α_i * y_i * x_i (initially 0)
-     let mut w = vec![0.0f32; n_features];
-
-     // Index for permutation
-     let mut index: Vec<usize> = (0..n).collect();
-
-     // Main loop
-     for iter in 0..max_iter {
-         // Shuffle indices
-         for i in 0..n {
-             let j = i + (iter * 1103515245 + 12345) % (n - i).max(1);
-             index.swap(i, j);
-         }
-
-         let mut max_violation = 0.0f32;
-
-         for &i in &index {
-             let yi = y[i] as f32;
-             let xi = &x[i];
-
-             // G = y_i * (w · x_i) - 1 + D_ii * α_i
-             let wxi: f32 = xi.iter().map(|&(j, v)| w[j as usize] * v).sum();
-             let g = yi * wxi - 1.0 + diag * alpha[i];
-
-             // Projected gradient (α ≥ 0, no upper bound for L2-loss)
-             let pg = if alpha[i] == 0.0 { g.min(0.0) } else { g };
-
-             max_violation = max_violation.max(pg.abs());
-
-             if pg.abs() > 1e-12 {
-                 let alpha_old = alpha[i];
-
-                 // α_i = max(0, α_i - G/Q_ii)
-                 alpha[i] = (alpha[i] - g / qd[i]).max(0.0);
-
-                 // Update w: w += (α_new - α_old) * y_i * x_i
-                 let d = (alpha[i] - alpha_old) * yi;
-                 if d.abs() > 1e-12 {
-                     for &(j, v) in xi.iter() {
-                         w[j as usize] += d * v;
-                     }
-                 }
-             }
-         }
-
-         // Stopping criterion
-         if max_violation <= eps {
-             break;
-         }
-     }
-
-     // Compute bias from KKT conditions
-     // For α_i > 0: y_i * (w · x_i + b) = 1 - α_i / (2C)
-     let mut bias_sum = 0.0f32;
-     let mut n_sv = 0;
-
-     for i in 0..n {
-         if alpha[i] > 1e-8 {
-             let yi = y[i] as f32;
-             let wxi: f32 = x[i].iter().map(|&(j, v)| w[j as usize] * v).sum();
-             // b = y_i * (1 - α_i * diag) - w · x_i
-             bias_sum += yi * (1.0 - alpha[i] * diag) - wxi;
-             n_sv += 1;
-         }
-     }
-
-     let bias = if n_sv > 0 { bias_sum / n_sv as f32 } else { 0.0 };
-
-     (w, bias)
- }
-
- /// Fast SVM using Pegasos algorithm
- #[pyclass]
- pub struct FastSVMTrainer {
-     c: f64,
-     max_iter: usize,
- }
-
- #[pymethods]
- impl FastSVMTrainer {
-     #[new]
-     #[pyo3(signature = (c=1.0, max_iter=100))]
-     pub fn new(c: f64, max_iter: usize) -> Self {
-         Self { c, max_iter }
-     }
-
-     pub fn train(&self, features: Vec<Vec<f64>>, labels: Vec<String>) -> LinearSVM {
-         let n_samples = features.len();
-         let n_features = if n_samples > 0 { features[0].len() } else { 0 };
-
-         let sparse_features: Vec<SparseVec> = features
-             .par_iter()
-             .map(|dense| {
-                 dense
-                     .iter()
-                     .enumerate()
-                     .filter(|&(_, &v)| v.abs() > 1e-10)
-                     .map(|(i, &v)| (i as u32, v as f32))
-                     .collect()
-             })
-             .collect();
-
-         let mut classes: Vec<String> = labels.iter().cloned().collect();
-         classes.sort();
-         classes.dedup();
-         let n_classes = classes.len();
-
-         let class_to_idx: HashMap<String, usize> = classes
-             .iter()
-             .enumerate()
-             .map(|(i, c)| (c.clone(), i))
-             .collect();
-
-         let y_idx: Vec<usize> = labels.iter().map(|l| class_to_idx[l]).collect();
-
-         let results: Vec<(Vec<f32>, f32)> = (0..n_classes)
-             .into_par_iter()
-             .map(|class_idx| {
-                 let y_binary: Vec<i8> = y_idx
-                     .iter()
-                     .map(|&idx| if idx == class_idx { 1 } else { -1 })
-                     .collect();
-
-                 pegasos(&sparse_features, &y_binary, n_features, self.c as f32, self.max_iter)
-             })
-             .collect();
-
-         LinearSVM {
-             weights: results.iter().map(|(w, _)| w.clone()).collect(),
-             biases: results.iter().map(|(_, b)| *b).collect(),
-             classes,
-             n_features,
-         }
-     }
- }
-
- /// Pegasos algorithm with lazy scaling
- #[inline(never)]
- fn pegasos(
-     x: &[SparseVec],
-     y: &[i8],
-     n_features: usize,
-     c: f32,
-     max_iter: usize,
- ) -> (Vec<f32>, f32) {
-     let n = x.len();
-     let lambda = 1.0 / c;
-
-     let mut w = vec![0.0f32; n_features];
-     let mut scale = 1.0f32;
-     let mut b = 0.0f32;
-
-     let eta0 = 0.5;
-     let t0 = 1.0 / (eta0 * lambda);
-
-     let mut indices: Vec<usize> = (0..n).collect();
-
-     for epoch in 0..max_iter {
-         // Shuffle
-         for i in 0..n {
-             let j = (i + epoch * 1103515245 + 12345) % n;
-             indices.swap(i, j);
-         }
-
-         for (t_inner, &i) in indices.iter().enumerate() {
-             let t = (epoch * n + t_inner) as f32;
-             let eta = 1.0 / (lambda * (t + t0));
-
-             let yi = y[i] as f32;
-             let xi = &x[i];
-
-             let margin: f32 = scale * xi.iter().map(|&(j, v)| w[j as usize] * v).sum::<f32>() + b;
-
-             scale *= 1.0 - eta * lambda;
-
-             if scale < 1e-9 {
-                 for wj in w.iter_mut() {
-                     *wj *= scale;
-                 }
-                 scale = 1.0;
-             }
-
-             if yi * margin < 1.0 {
-                 let update = eta / scale;
-                 for &(j, v) in xi.iter() {
-                     w[j as usize] += update * yi * v;
-                 }
-                 b += eta * yi * 0.1;
-             }
-         }
-     }
-
-     for wj in w.iter_mut() {
-         *wj *= scale;
-     }
-
-     (w, b)
- }
extensions/underthesea_core_extend/src/tfidf.rs DELETED
@@ -1,235 +0,0 @@
//! TF-IDF Vectorizer implementation

use hashbrown::HashMap;
use pyo3::prelude::*;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::fs::File;
use std::io::{BufReader, BufWriter};

/// TF-IDF Vectorizer
///
/// Converts text documents into TF-IDF feature vectors.
#[pyclass]
#[derive(Clone, Serialize, Deserialize)]
pub struct TfIdfVectorizer {
    /// Vocabulary: word -> index
    vocab: HashMap<String, usize>,
    /// Inverse vocabulary: index -> word
    inv_vocab: Vec<String>,
    /// IDF values for each term
    idf: Vec<f64>,
    /// Number of documents used for fitting
    n_docs: usize,
    /// Maximum number of features
    max_features: usize,
    /// N-gram range (min, max)
    ngram_range: (usize, usize),
    /// Minimum document frequency
    min_df: usize,
    /// Maximum document frequency (as ratio)
    max_df: f64,
    /// Whether the vectorizer is fitted
    is_fitted: bool,
}

#[pymethods]
impl TfIdfVectorizer {
    /// Create a new TfIdfVectorizer
    #[new]
    #[pyo3(signature = (max_features=20000, ngram_range=(1, 2), min_df=1, max_df=1.0))]
    pub fn new(
        max_features: usize,
        ngram_range: (usize, usize),
        min_df: usize,
        max_df: f64,
    ) -> Self {
        Self {
            vocab: HashMap::new(),
            inv_vocab: Vec::new(),
            idf: Vec::new(),
            n_docs: 0,
            max_features,
            ngram_range,
            min_df,
            max_df,
            is_fitted: false,
        }
    }

    /// Fit the vectorizer on a list of documents
    pub fn fit(&mut self, documents: Vec<String>) {
        let n_docs = documents.len();
        self.n_docs = n_docs;

        // Count document frequency for each term
        let mut df: HashMap<String, usize> = HashMap::new();

        for doc in &documents {
            let tokens = self.tokenize(doc);
            let unique_tokens: std::collections::HashSet<_> = tokens.into_iter().collect();
            for token in unique_tokens {
                *df.entry(token).or_insert(0) += 1;
            }
        }

        // Filter by min_df and max_df
        let max_df_count = (self.max_df * n_docs as f64) as usize;
        let mut filtered: Vec<(String, usize)> = df
            .into_iter()
            .filter(|(_, count)| *count >= self.min_df && *count <= max_df_count)
            .collect();

        // Sort by frequency (descending) and take top max_features
        filtered.sort_by(|a, b| b.1.cmp(&a.1));
        filtered.truncate(self.max_features);

        // Build vocabulary
        self.vocab.clear();
        self.inv_vocab.clear();
        self.idf.clear();

        for (idx, (term, doc_freq)) in filtered.into_iter().enumerate() {
            self.vocab.insert(term.clone(), idx);
            self.inv_vocab.push(term);
            // IDF with smoothing: log((n_docs + 1) / (df + 1)) + 1
            let idf_value = ((n_docs as f64 + 1.0) / (doc_freq as f64 + 1.0)).ln() + 1.0;
            self.idf.push(idf_value);
        }

        self.is_fitted = true;
    }

    /// Transform a single document to TF-IDF vector (sparse format)
    ///
    /// Returns list of (index, value) tuples
    pub fn transform(&self, document: &str) -> Vec<(usize, f64)> {
        if !self.is_fitted {
            return Vec::new();
        }

        let tokens = self.tokenize(document);
        let mut tf: HashMap<usize, usize> = HashMap::new();

        for token in &tokens {
            if let Some(&idx) = self.vocab.get(token) {
                *tf.entry(idx).or_insert(0) += 1;
            }
        }

        let n_tokens = tokens.len() as f64;
        if n_tokens == 0.0 {
            return Vec::new();
        }

        let mut result: Vec<(usize, f64)> = tf
            .into_iter()
            .map(|(idx, count)| {
                let tf_value = count as f64 / n_tokens;
                let tfidf = tf_value * self.idf[idx];
                (idx, tfidf)
            })
            .collect();

        // L2 normalize
        let norm: f64 = result.iter().map(|(_, v)| v * v).sum::<f64>().sqrt();
        if norm > 0.0 {
            for (_, v) in &mut result {
                *v /= norm;
            }
        }

        result.sort_by_key(|(idx, _)| *idx);
        result
    }

    /// Transform a single document to dense TF-IDF vector
    pub fn transform_dense(&self, document: &str) -> Vec<f64> {
        let sparse = self.transform(document);
        let mut dense = vec![0.0; self.vocab.len()];
        for (idx, val) in sparse {
            dense[idx] = val;
        }
        dense
    }

    /// Transform multiple documents to dense TF-IDF vectors (parallel)
    pub fn transform_batch(&self, documents: Vec<String>) -> Vec<Vec<f64>> {
        documents
            .par_iter()
            .map(|doc| self.transform_dense(doc))
            .collect()
    }

    /// Transform multiple documents to sparse TF-IDF vectors (parallel)
    pub fn transform_batch_sparse(&self, documents: Vec<String>) -> Vec<Vec<(usize, f64)>> {
        documents
            .par_iter()
            .map(|doc| self.transform(doc))
            .collect()
    }

    /// Fit and transform in one step
    pub fn fit_transform(&mut self, documents: Vec<String>) -> Vec<Vec<f64>> {
        self.fit(documents.clone());
        self.transform_batch(documents)
    }

    /// Get vocabulary size
    #[getter]
    pub fn vocab_size(&self) -> usize {
        self.vocab.len()
    }

    /// Get feature names (vocabulary terms)
    pub fn get_feature_names(&self) -> Vec<String> {
        self.inv_vocab.clone()
    }

    /// Check if vectorizer is fitted
    #[getter]
    pub fn is_fitted(&self) -> bool {
        self.is_fitted
    }

    /// Save vectorizer to file
    pub fn save(&self, path: &str) -> PyResult<()> {
        let file = File::create(path)
            .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(e.to_string()))?;
        let writer = BufWriter::new(file);
        serde_json::to_writer(writer, self)
            .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(e.to_string()))?;
        Ok(())
    }

    /// Load vectorizer from file
    #[staticmethod]
    pub fn load(path: &str) -> PyResult<Self> {
        let file = File::open(path)
            .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(e.to_string()))?;
        let reader = BufReader::new(file);
        let vectorizer: Self = serde_json::from_reader(reader)
            .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(e.to_string()))?;
        Ok(vectorizer)
    }
}

impl TfIdfVectorizer {
    /// Tokenize document into n-grams
    fn tokenize(&self, document: &str) -> Vec<String> {
        let words: Vec<&str> = document.split_whitespace().collect();
        let mut tokens = Vec::new();

        for n in self.ngram_range.0..=self.ngram_range.1 {
            if n > words.len() {
                continue;
            }
            for i in 0..=(words.len() - n) {
                let ngram = words[i..i + n].join(" ");
                tokens.push(ngram);
            }
        }

        tokens
    }
}
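The vectorizer above uses smoothed IDF (`ln((n_docs + 1) / (df + 1)) + 1`), term frequency normalized by token count, and L2 normalization of the final vector. That scoring can be checked in a few lines of plain Python; the helper names here are illustrative, not part of the `underthesea_core` API.

```python
import math
from collections import Counter

def fit_idf(docs):
    """Document frequency -> smoothed IDF, mirroring the formula in tfidf.rs."""
    n = len(docs)
    df = Counter(tok for doc in docs for tok in set(doc.split()))
    return {t: math.log((n + 1) / (c + 1)) + 1 for t, c in df.items()}

def transform(doc, idf):
    """TF (count / n_tokens) * IDF, then L2-normalize the sparse vector."""
    toks = doc.split()
    tf = Counter(t for t in toks if t in idf)
    vec = {t: (c / len(toks)) * idf[t] for t, c in tf.items()}
    norm = math.sqrt(sum(v * v for v in vec.values()))
    return {t: v / norm for t, v in vec.items()} if norm > 0 else {}
```

Note that a term present in every document still gets a positive weight (IDF bottoms out at 1.0 rather than 0), which is what the `+ 1` smoothing buys.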
 
extensions/underthesea_core_extend/uv.lock DELETED
@@ -1,8 +0,0 @@
version = 1
revision = 3
requires-python = ">=3.10"

[[package]]
name = "underthesea-core-extend"
version = "0.1.0"
source = { editable = "." }
 
pyproject.toml CHANGED
@@ -1,7 +1,7 @@
 [project]
 name = "sen"
 version = "1.1.0"
-description = "Vietnamese Text Classification Model - Rust-powered"
+description = "Vietnamese Text Classification - Training scripts for underthesea_core"
 readme = "README.md"
 requires-python = ">=3.10"
 license = "Apache-2.0"
@@ -10,14 +10,16 @@ authors = [
 ]
 keywords = ["vietnamese", "nlp", "text-classification", "rust", "svm"]
 dependencies = [
-    "underthesea_core_extend>=0.1.0",
+    "underthesea>=6.0.0",
+    "click>=8.0.0",
 ]

 [project.optional-dependencies]
 dev = [
     "pytest>=7.0.0",
     "huggingface-hub>=0.20.0",
-    "maturin>=1.0.0",
+    "scikit-learn>=1.0.0",
+    "datasets>=2.0.0",
 ]

 [project.urls]
@@ -27,6 +29,3 @@ Repository = "https://github.com/undertheseanlp/sen"
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
-
-[tool.hatch.build.targets.wheel]
-packages = ["src/sen"]
 
 
src/bench.py ADDED
@@ -0,0 +1,328 @@
 
"""
Benchmark CLI for Vietnamese Text Classification.

Compares Rust TextClassifier vs sklearn.

Usage:
    python bench.py vntc
    python bench.py bank
    python bench.py synthetic
"""

import os
import time
import random
from pathlib import Path

import click
from sklearn.feature_extraction.text import TfidfVectorizer as SklearnTfidfVectorizer
from sklearn.svm import LinearSVC as SklearnLinearSVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

from underthesea_core import TextClassifier


def read_file(filepath):
    """Read text file with multiple encoding attempts."""
    for enc in ['utf-16', 'utf-16-le', 'utf-8', 'latin-1']:
        try:
            with open(filepath, 'r', encoding=enc) as f:
                text = ' '.join(f.read().split())
                if len(text) > 10:
                    return text
        except (UnicodeDecodeError, UnicodeError):
            continue
    return None


def benchmark_sklearn(train_texts, train_labels, test_texts, test_labels, max_features=20000):
    """Benchmark scikit-learn TF-IDF + LinearSVC."""
    click.echo("\n" + "=" * 70)
    click.echo("scikit-learn: TfidfVectorizer + LinearSVC")
    click.echo("=" * 70)

    # Vectorize
    click.echo(" Vectorizing...")
    t0 = time.perf_counter()
    vectorizer = SklearnTfidfVectorizer(max_features=max_features, ngram_range=(1, 2), min_df=2)
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)
    vec_time = time.perf_counter() - t0
    click.echo(f" Vectorization time: {vec_time:.2f}s")
    click.echo(f" Vocabulary size: {len(vectorizer.vocabulary_)}")

    # Train
    click.echo(" Training LinearSVC...")
    t0 = time.perf_counter()
    clf = SklearnLinearSVC(C=1.0, max_iter=2000)
    clf.fit(X_train, train_labels)
    train_time = time.perf_counter() - t0
    click.echo(f" Training time: {train_time:.2f}s")

    # End-to-end inference
    click.echo(" End-to-end inference...")
    t0 = time.perf_counter()
    X_test_e2e = vectorizer.transform(test_texts)
    preds = clf.predict(X_test_e2e)
    e2e_time = time.perf_counter() - t0
    e2e_throughput = len(test_texts) / e2e_time
    click.echo(f" E2E time: {e2e_time:.2f}s ({e2e_throughput:.0f} samples/sec)")

    # Metrics
    acc = accuracy_score(test_labels, preds)
    f1_w = f1_score(test_labels, preds, average='weighted')
    click.echo(f" Results: Accuracy={acc:.4f}, F1={f1_w:.4f}")

    return {
        "total_train": vec_time + train_time,
        "e2e_throughput": e2e_throughput,
        "accuracy": acc,
        "f1_weighted": f1_w,
    }


def benchmark_rust(train_texts, train_labels, test_texts, test_labels, max_features=20000):
    """Benchmark Rust TextClassifier."""
    click.echo("\n" + "=" * 70)
    click.echo("Rust: TextClassifier (underthesea_core)")
    click.echo("=" * 70)

    clf = TextClassifier(
        max_features=max_features,
        ngram_range=(1, 2),
        min_df=2,
        c=1.0,
        max_iter=1000,
        tol=0.1,
    )

    # Train
    click.echo(" Training...")
    t0 = time.perf_counter()
    clf.fit(list(train_texts), list(train_labels))
    train_time = time.perf_counter() - t0
    click.echo(f" Training time: {train_time:.2f}s")
    click.echo(f" Vocabulary size: {clf.n_features}")

    # Inference
    click.echo(" Inference...")
    t0 = time.perf_counter()
    preds = clf.predict_batch(list(test_texts))
    infer_time = time.perf_counter() - t0
    throughput = len(test_texts) / infer_time
    click.echo(f" Inference time: {infer_time:.2f}s ({throughput:.0f} samples/sec)")

    # Metrics
    acc = accuracy_score(test_labels, preds)
    f1_w = f1_score(test_labels, preds, average='weighted')
    click.echo(f" Results: Accuracy={acc:.4f}, F1={f1_w:.4f}")

    return {
        "total_train": train_time,
        "throughput": throughput,
        "accuracy": acc,
        "f1_weighted": f1_w,
        "clf": clf,
        "preds": preds,
    }


def print_comparison(sklearn_results, rust_results):
    """Print comparison summary."""
    click.echo("\n" + "=" * 70)
    click.echo("COMPARISON SUMMARY")
    click.echo("=" * 70)
    click.echo(f"{'Metric':<30} {'sklearn':<20} {'Rust':<20}")
    click.echo("-" * 70)

    click.echo(f"{'Training time (s)':<30} {sklearn_results['total_train']:<20.2f} {rust_results['total_train']:<20.2f}")
    click.echo(f"{'Inference (samples/sec)':<30} {sklearn_results['e2e_throughput']:<20.0f} {rust_results['throughput']:<20.0f}")
    click.echo(f"{'Accuracy':<30} {sklearn_results['accuracy']:<20.4f} {rust_results['accuracy']:<20.4f}")
    click.echo(f"{'F1 (weighted)':<30} {sklearn_results['f1_weighted']:<20.4f} {rust_results['f1_weighted']:<20.4f}")

    click.echo("-" * 70)
    train_speedup = sklearn_results['total_train'] / rust_results['total_train'] if rust_results['total_train'] > 0 else 0
    infer_speedup = rust_results['throughput'] / sklearn_results['e2e_throughput'] if sklearn_results['e2e_throughput'] > 0 else 0
    click.echo(f"Speedup: Training {train_speedup:.2f}x, Inference {infer_speedup:.2f}x")
    click.echo("=" * 70)


@click.group()
def cli():
    """Benchmark Vietnamese text classification models."""
    pass


@cli.command()
@click.option('--data-dir', default='/home/claude-user/projects/workspace_underthesea/VNTC/Data/10Topics/Ver1.1',
              help='Path to VNTC dataset')
@click.option('--save-model', is_flag=True, help='Save the trained Rust model')
@click.option('--output', '-o', default='models/sen-vntc.bin', help='Output model path')
def vntc(data_dir, save_model, output):
    """Benchmark on VNTC dataset (10 topics, ~84k documents)."""
    click.echo("=" * 70)
    click.echo("VNTC Full Dataset Benchmark")
    click.echo("Vietnamese News Text Classification (10 Topics)")
    click.echo("=" * 70)

    train_dir = os.path.join(data_dir, "Train_Full")
    test_dir = os.path.join(data_dir, "Test_Full")

    # Load data
    click.echo("\nLoading training data...")
    t0 = time.perf_counter()
    train_texts, train_labels = [], []
    for folder in sorted(os.listdir(train_dir)):
        folder_path = os.path.join(train_dir, folder)
        if not os.path.isdir(folder_path):
            continue
        for fname in os.listdir(folder_path):
            if fname.endswith('.txt'):
                text = read_file(os.path.join(folder_path, fname))
                if text:
                    train_texts.append(text)
                    train_labels.append(folder)
    click.echo(f" Loaded {len(train_texts)} training samples in {time.perf_counter()-t0:.1f}s")

    click.echo("Loading test data...")
    t0 = time.perf_counter()
    test_texts, test_labels = [], []
    for folder in sorted(os.listdir(test_dir)):
        folder_path = os.path.join(test_dir, folder)
        if not os.path.isdir(folder_path):
            continue
        for fname in os.listdir(folder_path):
            if fname.endswith('.txt'):
                text = read_file(os.path.join(folder_path, fname))
                if text:
                    test_texts.append(text)
                    test_labels.append(folder)
    click.echo(f" Loaded {len(test_texts)} test samples in {time.perf_counter()-t0:.1f}s")

    # Run benchmarks
    sklearn_results = benchmark_sklearn(train_texts, train_labels, test_texts, test_labels)
    rust_results = benchmark_rust(train_texts, train_labels, test_texts, test_labels)

    print_comparison(sklearn_results, rust_results)

    if save_model:
        model_path = Path(output)
        model_path.parent.mkdir(parents=True, exist_ok=True)
        rust_results['clf'].save(str(model_path))
        size_mb = model_path.stat().st_size / (1024 * 1024)
        click.echo(f"\nModel saved to {model_path} ({size_mb:.2f} MB)")


@cli.command()
@click.option('--save-model', is_flag=True, help='Save the trained Rust model')
@click.option('--output', '-o', default='models/sen-bank.bin', help='Output model path')
def bank(save_model, output):
    """Benchmark on UTS2017_Bank dataset (14 categories, banking domain)."""
    from datasets import load_dataset

    click.echo("=" * 70)
    click.echo("UTS2017_Bank Dataset Benchmark")
    click.echo("Vietnamese Banking Domain Text Classification (14 Categories)")
    click.echo("=" * 70)

    # Load data
    click.echo("\nLoading UTS2017_Bank dataset from HuggingFace...")
    dataset = load_dataset("undertheseanlp/UTS2017_Bank", "classification")

    train_texts = list(dataset["train"]["text"])
    train_labels = list(dataset["train"]["label"])
    test_texts = list(dataset["test"]["text"])
    test_labels = list(dataset["test"]["label"])

    click.echo(f" Train samples: {len(train_texts)}")
    click.echo(f" Test samples: {len(test_texts)}")
    click.echo(f" Categories: {len(set(train_labels))}")

    # Run benchmarks (smaller max_features for smaller dataset)
    sklearn_results = benchmark_sklearn(train_texts, train_labels, test_texts, test_labels, max_features=10000)
    rust_results = benchmark_rust(train_texts, train_labels, test_texts, test_labels, max_features=10000)

    print_comparison(sklearn_results, rust_results)

    click.echo("\nClassification Report (Rust):")
    click.echo(classification_report(test_labels, rust_results['preds']))

    if save_model:
        model_path = Path(output)
        model_path.parent.mkdir(parents=True, exist_ok=True)
        rust_results['clf'].save(str(model_path))
        size_mb = model_path.stat().st_size / (1024 * 1024)
        click.echo(f"\nModel saved to {model_path} ({size_mb:.2f} MB)")


@cli.command()
@click.option('--train-per-cat', default=340, help='Training samples per category')
@click.option('--test-per-cat', default=500, help='Test samples per category')
@click.option('--seed', default=42, help='Random seed')
def synthetic(train_per_cat, test_per_cat, seed):
    """Benchmark on synthetic VNTC-like data."""
    # Vietnamese text templates by category
    TEMPLATES = {
        "the_thao": ["Đội tuyển {} thắng {} với tỷ số {}", "Cầu thủ {} ghi bàn đẹp mắt"],
        "kinh_doanh": ["Chứng khoán {} điểm trong phiên giao dịch", "Ngân hàng {} công bố lãi suất {}"],
        "cong_nghe": ["Apple ra mắt {} với nhiều tính năng", "Trí tuệ nhân tạo đang thay đổi {}"],
        "chinh_tri": ["Quốc hội thông qua nghị quyết về {}", "Chủ tịch {} tiếp đón phái đoàn"],
        "van_hoa": ["Nghệ sĩ {} ra mắt album mới", "Liên hoan phim {} trao giải"],
        "khoa_hoc": ["Nhà khoa học phát hiện {} mới", "Nghiên cứu cho thấy {} có tác dụng"],
        "suc_khoe": ["Bộ Y tế cảnh báo về {} trong mùa", "Vaccine {} đạt hiệu quả cao"],
        "giao_duc": ["Trường {} công bố điểm chuẩn", "Học sinh đoạt huy chương tại Olympic"],
        "phap_luat": ["Tòa án xét xử vụ án {} với bị cáo", "Công an triệt phá đường dây"],
        "doi_song": ["Giá {} tăng trong tháng", "Người dân đổ xô đi mua {}"],
    }
    FILLS = {
        "the_thao": ["Việt Nam", "Thái Lan", "3-0", "bóng đá", "AFF Cup"],
        "kinh_doanh": ["tăng", "giảm", "VN-Index", "Vietcombank", "8%"],
        "cong_nghe": ["iPhone 16", "ChatGPT", "công việc", "VinAI", "5G"],
        "chinh_tri": ["kinh tế", "nước", "Trung Quốc", "Hà Nội", "phát triển"],
        "van_hoa": ["Mỹ Tâm", "Cannes", "nghệ thuật", "Hà Nội", "Bố Già"],
        "khoa_hoc": ["loài sinh vật", "trà xanh", "VNREDSat-1", "Nobel", "robot"],
        "suc_khoe": ["dịch cúm", "COVID-19", "Bạch Mai", "dinh dưỡng", "tiểu đường"],
        "giao_duc": ["Bách Khoa", "Việt Nam", "Toán", "THPT", "STEM"],
        "phap_luat": ["tham nhũng", "TP.HCM", "ma túy", "Hình sự", "gian lận"],
        "doi_song": ["xăng", "vàng", "nắng nóng", "Trung thu", "bún chả"],
    }

    def generate_sample(category):
        template = random.choice(TEMPLATES[category])
        fills = FILLS[category]
        n = template.count("{}")
        return template.format(*random.choices(fills, k=n))

    def generate_dataset(n_per_cat, categories):
        texts, labels = [], []
        for cat in categories:
            for _ in range(n_per_cat):
                texts.append(generate_sample(cat))
                labels.append(cat)
        combined = list(zip(texts, labels))
        random.shuffle(combined)
        return [t for t, _ in combined], [l for _, l in combined]

    click.echo("=" * 70)
    click.echo("Synthetic VNTC-like Benchmark")
    click.echo("=" * 70)

    random.seed(seed)
    categories = list(TEMPLATES.keys())

    click.echo("\nConfiguration:")
    click.echo(f" Categories: {len(categories)}")
    click.echo(f" Train samples: {train_per_cat * len(categories)}")
    click.echo(f" Test samples: {test_per_cat * len(categories)}")

    train_texts, train_labels = generate_dataset(train_per_cat, categories)
    test_texts, test_labels = generate_dataset(test_per_cat, categories)

    sklearn_results = benchmark_sklearn(train_texts, train_labels, test_texts, test_labels, max_features=10000)
    rust_results = benchmark_rust(train_texts, train_labels, test_texts, test_labels, max_features=10000)

    print_comparison(sklearn_results, rust_results)


if __name__ == "__main__":
    cli()
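Every timing in bench.py follows the same `time.perf_counter` pattern (wall-clock delta, then samples divided by elapsed seconds). Stripped of the classifiers, the harness reduces to two small helpers; the names below are illustrative, not part of the script:

```python
import time

def timed(fn, *args):
    """Run fn(*args) and return (result, elapsed_seconds)."""
    t0 = time.perf_counter()
    result = fn(*args)
    return result, time.perf_counter() - t0

def throughput(n_samples, elapsed):
    """Samples per second, guarding against a zero-duration run."""
    return n_samples / elapsed if elapsed > 0 else float("inf")
```

`perf_counter` is preferred over `time.time` here because it is monotonic and has the highest available resolution for measuring short intervals.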
src/scripts/train.py DELETED
@@ -1,221 +0,0 @@
"""
Train Sen Text Classifier on Vietnamese news data.

Categories based on VNTC dataset:
- Chinh tri Xa hoi (Politics/Society)
- Doi song (Lifestyle)
- Khoa hoc (Science)
- Kinh doanh (Business)
- Phap luat (Law)
- Suc khoe (Health)
- The gioi (World)
- The thao (Sports)
- Van hoa (Culture)
- Vi tinh (Technology)
"""

import json
import os
import sys

sys.path.insert(0, "/home/anhvu2/projects/workspace_underthesea")

from sen import SenTextClassifier


# Sample Vietnamese news data for each category
SAMPLE_DATA = {
    "chinh_tri_xa_hoi": [
        "Quốc hội thông qua nghị quyết về phát triển kinh tế xã hội",
        "Thủ tướng chủ trì họp Chính phủ thường kỳ tháng này",
        "Đại hội Đảng toàn quốc lần thứ XIII thành công tốt đẹp",
        "Chủ tịch nước tiếp đoàn đại biểu quốc tế",
        "Bộ Nội vụ triển khai cải cách hành chính",
        "Ủy ban Mặt trận Tổ quốc tổ chức hội nghị toàn quốc",
        "Đoàn thanh niên phát động phong trào tình nguyện",
        "Hội Liên hiệp Phụ nữ tổ chức đại hội đại biểu",
    ],
    "doi_song": [
        "Mẹo hay giúp tiết kiệm chi phí sinh hoạt hàng ngày",
        "Xu hướng thời trang mới nhất mùa thu đông năm nay",
        "Cách trang trí nhà cửa đón Tết đẹp và tiết kiệm",
        "Bí quyết nấu ăn ngon cho cả gia đình",
        "Kinh nghiệm du lịch Đà Nẵng tiết kiệm chi phí",
        "Cách chăm sóc cây cảnh trong nhà hiệu quả",
        "Chia sẻ cách dạy con học tập hiệu quả",
        "Mẹo vặt hay cho cuộc sống hàng ngày",
    ],
    "khoa_hoc": [
        "Các nhà khoa học phát hiện hành tinh mới ngoài hệ mặt trời",
        "Nghiên cứu mới về biến đổi khí hậu toàn cầu",
        "Vệ tinh nhân tạo được phóng thành công lên quỹ đạo",
        "Khám phá mới về nguồn gốc vũ trụ",
        "Công nghệ nano ứng dụng trong y học",
        "Phát hiện loài động vật mới ở rừng Amazon",
        "Nghiên cứu về trí tuệ nhân tạo và học máy",
        "Thí nghiệm vật lý hạt nhân tại CERN",
    ],
    "kinh_doanh": [
        "Chứng khoán Việt Nam tăng điểm mạnh phiên đầu tuần",
        "Ngân hàng Nhà nước điều chỉnh lãi suất điều hành",
        "Doanh nghiệp xuất khẩu gặp khó khăn do tỷ giá",
        "Thị trường bất động sản có dấu hiệu phục hồi",
        "VN-Index vượt mốc 1200 điểm trong phiên giao dịch",
        "FDI vào Việt Nam tăng trưởng ấn tượng",
        "Giá vàng thế giới biến động mạnh trong tuần",
        "Startup công nghệ Việt gọi vốn thành công Series A",
    ],
    "phap_luat": [
        "Tòa án xét xử vụ án tham nhũng lớn",
        "Công an triệt phá đường dây buôn lậu xuyên quốc gia",
        "Luật mới về bảo vệ môi trường có hiệu lực",
        "Khởi tố vụ án lừa đảo chiếm đoạt tài sản",
        "Bộ Công an cảnh báo thủ đoạn lừa đảo qua mạng",
        "Tòa án tuyên án vụ vi phạm an toàn giao thông",
        "Viện Kiểm sát truy tố các bị cáo trong vụ án kinh tế",
        "Cảnh sát giao thông xử lý vi phạm nồng độ cồn",
    ],
    "suc_khoe": [
        "Bộ Y tế khuyến cáo phòng chống dịch bệnh mùa đông",
        "Phát hiện phương pháp điều trị ung thư mới",
        "Cách phòng tránh các bệnh về đường hô hấp",
        "Chế độ ăn uống lành mạnh cho người cao tuổi",
        "Vaccine mới được phê duyệt sử dụng tại Việt Nam",
        "Bệnh viện Bạch Mai ứng dụng kỹ thuật mổ nội soi",
        "Cách chăm sóc sức khỏe tinh thần hiệu quả",
        "Tập thể dục đúng cách để có sức khỏe tốt",
    ],
    "the_gioi": [
        "Tổng thống Mỹ công bố chính sách đối ngoại mới",
        "Hội nghị thượng đỉnh G20 thảo luận về biến đổi khí hậu",
        "Xung đột vũ trang leo thang tại Trung Đông",
        "Liên Hợp Quốc họp khẩn về tình hình nhân đạo",
        "Châu Âu đối mặt với khủng hoảng năng lượng",
        "Trung Quốc công bố số liệu tăng trưởng kinh tế",
        "Nhật Bản bầu cử thủ tướng mới",
        "Nga và Ukraine tiếp tục đàm phán hòa bình",
    ],
    "the_thao": [
        "Đội tuyển Việt Nam thắng đậm trong trận giao hữu",
        "Cầu thủ Nguyễn Quang Hải ghi bàn đẹp mắt",
        "V-League 2024 khởi tranh vào tháng tới",
        "HLV Park Hang-seo chia tay bóng đá Việt Nam",
        "U23 Việt Nam vô địch giải Đông Nam Á",
        "Hoàng Xuân Vinh giành huy chương vàng Olympic",
        "Ánh Viên phá kỷ lục quốc gia môn bơi lội",
        "SEA Games 31 tổ chức thành công tại Việt Nam",
    ],
    "van_hoa": [
        "Liên hoan phim Việt Nam lần thứ 23 khai mạc",
        "Nghệ sĩ nhân dân được phong tặng danh hiệu cao quý",
        "Triển lãm tranh của họa sĩ nổi tiếng tại Hà Nội",
        "Ca sĩ Việt Nam giành giải thưởng âm nhạc châu Á",
        "Lễ hội truyền thống thu hút đông đảo du khách",
        "Phim Việt Nam được đề cử tại liên hoan phim quốc tế",
        "Nhạc sĩ sáng tác ca khúc mới về quê hương",
        "Bảo tàng lịch sử khai trương triển lãm mới",
    ],
    "vi_tinh": [
        "Apple ra mắt iPhone mới với nhiều tính năng",
        "Trí tuệ nhân tạo đang thay đổi cuộc sống",
        "Samsung công bố điện thoại gập thế hệ mới",
        "Microsoft phát hành bản cập nhật Windows",
        "ChatGPT và cuộc cách mạng trí tuệ nhân tạo",
        "5G được triển khai rộng rãi tại các thành phố lớn",
        "Ứng dụng di động phổ biến nhất trong năm",
        "An ninh mạng trước nguy cơ tấn công hacker",
    ],
}


def prepare_data():
    """Prepare training and validation data."""
    train_texts = []
    train_labels = []
    val_texts = []
    val_labels = []

    for label, texts in SAMPLE_DATA.items():
        # Use first 6 for training, last 2 for validation
        for text in texts[:6]:
            train_texts.append(text)
            train_labels.append(label)
        for text in texts[6:]:
            val_texts.append(text)
            val_labels.append(label)

    return train_texts, train_labels, val_texts, val_labels


def main():
    print("=" * 60)
    print("Training Sen Text Classifier")
    print("Based on VNTC Vietnamese News Classification")
    print("=" * 60)

    # Prepare data
    train_texts, train_labels, val_texts, val_labels = prepare_data()
    print("\nDataset:")
    print(f" - Training samples: {len(train_texts)}")
    print(f" - Validation samples: {len(val_texts)}")
    print(f" - Categories: {len(SAMPLE_DATA)}")

    # Initialize classifier
    classifier = SenTextClassifier(
        max_features=5000,
        ngram_range=(1, 2),
        min_df=1,
        max_df=0.95,
        sublinear_tf=True,
        C=1.0,
        max_iter=1000,
    )

    # Train
    print("\n" + "=" * 60)
    print("Training...")
    print("=" * 60)
    results = classifier.train(
        train_texts=train_texts,
        train_labels=train_labels,
        val_texts=val_texts,
        val_labels=val_labels,
    )

    # Evaluate
    print("\n" + "=" * 60)
    print("Evaluation on validation set:")
    print("=" * 60)
    classifier.evaluate(val_texts, val_labels)

    # Test predictions
    print("\n" + "=" * 60)
    print("Sample Predictions:")
    print("=" * 60)
    test_texts = [
        "Đội tuyển bóng đá Việt Nam chiến thắng",
        "Giá vàng tăng mạnh trong phiên giao dịch",
        "Apple công bố sản phẩm mới tại sự kiện",
        "Bộ Y tế cảnh báo dịch cúm mùa",
        "Quốc hội họp phiên bất thường",
    ]

    for text in test_texts:
        from sen import Sentence
        sentence = Sentence(text)
        classifier.predict(sentence)
        print(f" '{text}' -> {sentence.labels[0]}")

    # Save model
    save_path = "/home/anhvu2/projects/workspace_underthesea/sen/trained_model"
    print("\n" + "=" * 60)
    print(f"Saving model to: {save_path}")
    print("=" * 60)
    classifier.save(save_path)

    print("\nTraining completed!")
    return classifier


if __name__ == "__main__":
    main()
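The deleted script's `prepare_data` does a fixed per-category holdout (first 6 samples of each label for training, the rest for validation). That pattern generalizes to any label-keyed corpus; the sketch below is illustrative, with a hypothetical helper name:

```python
def split_per_category(data, n_train):
    """data: dict label -> list of texts; hold out everything past n_train per label."""
    train, val = ([], []), ([], [])
    for label, texts in data.items():
        for split, chunk in ((train, texts[:n_train]), (val, texts[n_train:])):
            split[0].extend(chunk)                 # texts
            split[1].extend([label] * len(chunk))  # matching labels
    return train, val
```

Because the cut is positional rather than random, the split is reproducible without a seed, at the cost of assuming the per-category lists are not ordered in a way that biases the holdout.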
src/scripts/train_sonar.py DELETED
@@ -1,234 +0,0 @@
-"""
-Reproduce sonar_core_1 training configuration.
-Uses CountVectorizer + TfidfTransformer + SVC pipeline.
-Target: 92.80% accuracy on VNTC (matching sonar_core_1)
-"""
-
-import os
-import sys
-import time
-import json
-from datetime import datetime
-
-import numpy as np
-import joblib
-from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
-from sklearn.svm import SVC, LinearSVC
-from sklearn.linear_model import LogisticRegression
-from sklearn.pipeline import Pipeline
-from sklearn.metrics import accuracy_score, classification_report, f1_score
-
-sys.path.insert(0, "/home/anhvu2/projects/workspace_underthesea")
-
-# VNTC data paths
-VNTC_BASE = "/home/anhvu2/projects/workspace_underthesea/VNTC_github/Data/10Topics/Ver1.1"
-TRAIN_DIR = os.path.join(VNTC_BASE, "Train_Full")
-TEST_DIR = os.path.join(VNTC_BASE, "Test_Full")
-
-# Category mapping
-CATEGORY_MAP = {
-    "Chinh tri Xa hoi": "Chinh tri Xa hoi",
-    "Doi song": "Doi song",
-    "Khoa hoc": "Khoa hoc",
-    "Kinh doanh": "Kinh doanh",
-    "Phap luat": "Phap luat",
-    "Suc khoe": "Suc khoe",
-    "The gioi": "The gioi",
-    "The thao": "The thao",
-    "Van hoa": "Van hoa",
-    "Vi tinh": "Vi tinh",
-}
-
-
-def read_file(filepath):
-    """Read text file with multiple encoding attempts."""
-    encodings = ['utf-16', 'utf-16-le', 'utf-8', 'latin-1']
-    for encoding in encodings:
-        try:
-            with open(filepath, 'r', encoding=encoding) as f:
-                text = f.read()
-                text = ' '.join(text.split())
-                if len(text) > 10:
-                    return text
-        except (UnicodeDecodeError, UnicodeError):
-            continue
-    return None
-
-
-def load_vntc_data(data_dir):
-    """Load VNTC data from directory."""
-    texts = []
-    labels = []
-
-    for folder_name, label in CATEGORY_MAP.items():
-        folder_path = os.path.join(data_dir, folder_name)
-        if not os.path.exists(folder_path):
-            print(f"  Warning: {folder_path} not found")
-            continue
-
-        files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]
-        for filename in files:
-            filepath = os.path.join(folder_path, filename)
-            text = read_file(filepath)
-            if text:
-                texts.append(text)
-                labels.append(label)
-
-    return np.array(texts), np.array(labels)
-
-
-def train_sonar_config():
-    """Train with sonar_core_1 configuration."""
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-
-    print("=" * 70)
-    print("Reproducing sonar_core_1 Configuration")
-    print("=" * 70)
-
-    # Load data
-    print("\n[1/4] Loading VNTC data...")
-    start = time.time()
-    X_train, y_train = load_vntc_data(TRAIN_DIR)
-    X_test, y_test = load_vntc_data(TEST_DIR)
-    print(f"  Train: {len(X_train)} samples")
-    print(f"  Test: {len(X_test)} samples")
-    print(f"  Classes: {len(set(y_train))}")
-    print(f"  Load time: {time.time()-start:.1f}s")
-
-    # sonar_core_1 configuration
-    configs = [
-        {
-            "name": "SVC (sonar_core_1 config)",
-            "max_features": 20000,
-            "ngram_range": (1, 2),
-            "classifier": SVC(kernel='linear', probability=True, random_state=42),
-        },
-        {
-            "name": "LinearSVC (faster)",
-            "max_features": 20000,
-            "ngram_range": (1, 2),
-            "classifier": LinearSVC(C=1.0, max_iter=2000, random_state=42),
-        },
-        {
-            "name": "LogisticRegression",
-            "max_features": 20000,
-            "ngram_range": (1, 2),
-            "classifier": LogisticRegression(max_iter=1000, random_state=42),
-        },
-    ]
-
-    results = []
-    best_model = None
-    best_accuracy = 0
-
-    for i, config in enumerate(configs):
-        print(f"\n[{i+2}/4] Training: {config['name']}")
-        print("-" * 50)
-
-        # Create pipeline (sonar_core_1 style)
-        pipeline = Pipeline([
-            ('vect', CountVectorizer(
-                max_features=config['max_features'],
-                ngram_range=config['ngram_range']
-            )),
-            ('tfidf', TfidfTransformer(use_idf=True)),
-            ('clf', config['classifier']),
-        ])
-
-        # Train
-        start = time.time()
-        pipeline.fit(X_train, y_train)
-        train_time = time.time() - start
-
-        # Evaluate
-        train_pred = pipeline.predict(X_train)
-        test_pred = pipeline.predict(X_test)
-
-        train_acc = accuracy_score(y_train, train_pred)
-        test_acc = accuracy_score(y_test, test_pred)
-        test_f1 = f1_score(y_test, test_pred, average='weighted')
-
-        print(f"  Train accuracy: {train_acc:.4f}")
-        print(f"  Test accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)")
-        print(f"  Test F1: {test_f1:.4f}")
-        print(f"  Train time: {train_time:.1f}s")
-
-        results.append({
-            "name": config['name'],
-            "train_acc": train_acc,
-            "test_acc": test_acc,
-            "test_f1": test_f1,
-            "train_time": train_time,
-        })
-
-        if test_acc > best_accuracy:
-            best_accuracy = test_acc
-            best_model = pipeline
-            best_config = config
-
-    # Print comparison
-    print("\n" + "=" * 70)
-    print("Results Comparison")
-    print("=" * 70)
-    print(f"{'Model':<30} {'Test Acc':>10} {'Test F1':>10} {'Time':>10}")
-    print("-" * 70)
-    for r in results:
-        print(f"{r['name']:<30} {r['test_acc']*100:>9.2f}% {r['test_f1']:>10.4f} {r['train_time']:>9.1f}s")
-    print("-" * 70)
-    print(f"sonar_core_1 reference: {92.80:>9.2f}%")
-    print("=" * 70)
-
-    # Save best model
-    save_dir = "/home/anhvu2/projects/workspace_underthesea/sen/sen-general-1.0.0-20260202"
-    os.makedirs(save_dir, exist_ok=True)
-
-    # Save pipeline
-    joblib.dump(best_model, os.path.join(save_dir, "pipeline.joblib"))
-
-    # Save label encoder (for compatibility)
-    from sklearn.preprocessing import LabelEncoder
-    le = LabelEncoder()
-    le.fit(y_train)
-    joblib.dump(le, os.path.join(save_dir, "label_encoder.joblib"))
-
-    # Save metadata
-    metadata = {
-        "model_type": "sonar_core_1_reproduction",
-        "architecture": "CountVectorizer + TfidfTransformer + LinearSVC",
-        "max_features": best_config['max_features'],
-        "ngram_range": list(best_config['ngram_range']),
-        "train_samples": len(X_train),
-        "test_samples": len(X_test),
-        "train_accuracy": float(results[1]['train_acc']),  # LinearSVC
-        "test_accuracy": float(results[1]['test_acc']),
-        "test_f1_weighted": float(results[1]['test_f1']),
-        "labels": sorted(list(set(y_train))),
-        "timestamp": timestamp,
-    }
-
-    with open(os.path.join(save_dir, "metadata.json"), 'w') as f:
-        json.dump(metadata, f, indent=2, ensure_ascii=False)
-
-    print(f"\nBest model saved to: {save_dir}")
-
-    # Print detailed classification report for best model
-    print("\n" + "=" * 70)
-    print("Classification Report (LinearSVC)")
-    print("=" * 70)
-
-    # Retrain LinearSVC for report
-    pipeline = Pipeline([
-        ('vect', CountVectorizer(max_features=20000, ngram_range=(1, 2))),
-        ('tfidf', TfidfTransformer(use_idf=True)),
-        ('clf', LinearSVC(C=1.0, max_iter=2000, random_state=42)),
-    ])
-    pipeline.fit(X_train, y_train)
-    test_pred = pipeline.predict(X_test)
-
-    print(classification_report(y_test, test_pred))
-
-    return results
-
-
-if __name__ == "__main__":
-    train_sonar_config()
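The deleted script above delegates TF-IDF weighting to scikit-learn's `CountVectorizer` + `TfidfTransformer`. The underlying arithmetic can be sketched in plain Python — a minimal sketch using the classic idf = ln(N/df), not sklearn's smoothed variant, with an illustrative helper name:

```python
import math
from collections import Counter

def tf_idf(docs):
    """Compute classic tf-idf weights per document: tf(t, d) * ln(N / df(t))."""
    n = len(docs)
    # document frequency: number of docs containing each term at least once
    df = Counter()
    for doc in docs:
        df.update(set(doc.split()))
    weights = []
    for doc in docs:
        tf = Counter(doc.split())  # raw term counts in this document
        weights.append({t: c * math.log(n / df[t]) for t, c in tf.items()})
    return weights

docs = ["bóng đá việt nam", "giá vàng tăng", "bóng đá anh"]
w = tf_idf(docs)
# "bóng" appears in 2 of 3 docs -> idf = ln(3/2); "giá" in 1 of 3 -> idf = ln(3)
```

Terms shared across many documents ("bóng", "đá") are down-weighted relative to rarer, more discriminative ones — which is why the pipeline pairs the raw counts with an idf step before the SVM.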
src/scripts/train_vntc.py DELETED
@@ -1,181 +0,0 @@
-"""
-Train Sen Text Classifier on full VNTC dataset.
-
-VNTC: Vietnamese News Text Classification Corpus
-- 10 Topics, ~33,759 train / ~50,373 test documents
-- Reference: Vu et al. (2007) RIVF
-"""
-
-import os
-import sys
-import time
-from pathlib import Path
-
-sys.path.insert(0, "/home/anhvu2/projects/workspace_underthesea")
-
-from sen import SenTextClassifier
-
-# VNTC data paths
-VNTC_BASE = "/home/anhvu2/projects/workspace_underthesea/VNTC_github/Data/10Topics/Ver1.1"
-TRAIN_DIR = os.path.join(VNTC_BASE, "Train_Full")
-TEST_DIR = os.path.join(VNTC_BASE, "Test_Full")
-
-# Category mapping (folder name -> normalized label)
-CATEGORY_MAP = {
-    "Chinh tri Xa hoi": "chinh_tri_xa_hoi",
-    "Doi song": "doi_song",
-    "Khoa hoc": "khoa_hoc",
-    "Kinh doanh": "kinh_doanh",
-    "Phap luat": "phap_luat",
-    "Suc khoe": "suc_khoe",
-    "The gioi": "the_gioi",
-    "The thao": "the_thao",
-    "Van hoa": "van_hoa",
-    "Vi tinh": "vi_tinh",
-}
-
-
-def read_file(filepath):
-    """Read text file with multiple encoding attempts."""
-    encodings = ['utf-16', 'utf-16-le', 'utf-8', 'latin-1']
-
-    for encoding in encodings:
-        try:
-            with open(filepath, 'r', encoding=encoding) as f:
-                text = f.read()
-                # Clean up text (remove extra whitespace)
-                text = ' '.join(text.split())
-                if len(text) > 10:  # Valid text
-                    return text
-        except (UnicodeDecodeError, UnicodeError):
-            continue
-
-    return None
-
-
-def load_vntc_data(data_dir, max_per_category=None):
-    """Load VNTC data from directory."""
-    texts = []
-    labels = []
-    stats = {}
-
-    for folder_name, label in CATEGORY_MAP.items():
-        folder_path = os.path.join(data_dir, folder_name)
-        if not os.path.exists(folder_path):
-            print(f"  Warning: {folder_path} not found")
-            continue
-
-        files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]
-        if max_per_category:
-            files = files[:max_per_category]
-
-        count = 0
-        for filename in files:
-            filepath = os.path.join(folder_path, filename)
-            text = read_file(filepath)
-            if text:
-                texts.append(text)
-                labels.append(label)
-                count += 1
-
-        stats[label] = count
-
-    return texts, labels, stats
-
-
-def main():
-    print("=" * 70)
-    print("Training Sen Text Classifier on VNTC Dataset")
-    print("Vietnamese News Text Classification Corpus")
-    print("=" * 70)
-
-    # Load training data
-    print("\n[1/5] Loading training data...")
-    start = time.time()
-    train_texts, train_labels, train_stats = load_vntc_data(TRAIN_DIR)
-    print(f"  Loaded {len(train_texts)} training samples in {time.time()-start:.1f}s")
-    print("  Per category:")
-    for label, count in sorted(train_stats.items()):
-        print(f"    - {label}: {count}")
-
-    # Load test data
-    print("\n[2/5] Loading test data...")
-    start = time.time()
-    test_texts, test_labels, test_stats = load_vntc_data(TEST_DIR)
-    print(f"  Loaded {len(test_texts)} test samples in {time.time()-start:.1f}s")
-    print("  Per category:")
-    for label, count in sorted(test_stats.items()):
-        print(f"    - {label}: {count}")
-
-    # Initialize classifier
-    print("\n[3/5] Initializing classifier...")
-    classifier = SenTextClassifier(
-        max_features=10000,  # Increased for larger dataset
-        ngram_range=(1, 2),
-        min_df=2,  # Require term in at least 2 docs
-        max_df=0.95,
-        sublinear_tf=True,
-        C=1.0,
-        max_iter=2000,
-    )
-
-    # Train
-    print("\n[4/5] Training...")
-    start = time.time()
-    results = classifier.train(
-        train_texts=train_texts,
-        train_labels=train_labels,
-        val_texts=test_texts[:5000],  # Use subset for validation during training
-        val_labels=test_labels[:5000],
-    )
-    train_time = time.time() - start
-    print(f"  Training completed in {train_time:.1f}s")
-
-    # Evaluate on full test set
-    print("\n[5/5] Evaluating on full test set...")
-    start = time.time()
-    eval_results = classifier.evaluate(test_texts, test_labels)
-    eval_time = time.time() - start
-
-    print("\n" + "=" * 70)
-    print("VNTC Benchmark Results (10 Topics)")
-    print("=" * 70)
-    print(f"  Test samples: {len(test_texts)}")
-    print(f"  Accuracy: {eval_results['accuracy']:.4f} ({eval_results['accuracy']*100:.2f}%)")
-    print(f"  F1 (weighted):{eval_results['f1_weighted']:.4f}")
-    print(f"  Train time: {train_time:.1f}s")
-    print(f"  Eval time: {eval_time:.1f}s")
-    print("=" * 70)
-
-    # Save model
-    save_path = "/home/anhvu2/projects/workspace_underthesea/sen/sen-1.0.0-vntc"
-    print(f"\nSaving model to: {save_path}")
-    classifier.save(save_path)
-
-    # Sample predictions
-    print("\nSample Predictions:")
-    print("-" * 70)
-    test_samples = [
-        "Đội tuyển Việt Nam thắng đậm 3-0 trước Indonesia",
-        "Giá vàng tăng mạnh trong phiên giao dịch hôm nay",
-        "Apple ra mắt iPhone mới với nhiều tính năng hấp dẫn",
-        "Bộ Y tế cảnh báo về dịch cúm mùa đông",
-        "Quốc hội thông qua nghị quyết phát triển kinh tế",
-    ]
-
-    from sen import Sentence
-    for text in test_samples:
-        sentence = Sentence(text)
-        classifier.predict(sentence)
-        label = sentence.labels[0]
-        print(f"  '{text[:50]}...' -> {label.value} ({label.score:.2f})")
-
-    print("\n" + "=" * 70)
-    print("Training completed successfully!")
-    print("=" * 70)
-
-    return eval_results
-
-
-if __name__ == "__main__":
-    results = main()
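The `read_file` helper in the script above tries several encodings because VNTC ships its documents as UTF-16 text files. The fallback pattern can be exercised standalone — a sketch under the assumption that the first cleanly decoded, sufficiently long result wins (the helper name is illustrative):

```python
import os
import tempfile

def read_with_fallback(filepath, encodings=('utf-16', 'utf-16-le', 'utf-8', 'latin-1')):
    """Return the file's whitespace-normalized text using the first encoding that decodes."""
    for enc in encodings:
        try:
            with open(filepath, 'r', encoding=enc) as f:
                text = ' '.join(f.read().split())
            if len(text) > 10:  # reject near-empty or garbage decodes
                return text
        except (UnicodeDecodeError, UnicodeError):
            continue
    return None

# demo: a UTF-16 file (as in VNTC) is decoded transparently via the BOM
with tempfile.NamedTemporaryFile('w', encoding='utf-16', suffix='.txt', delete=False) as f:
    f.write("Đội tuyển bóng đá Việt Nam")
    path = f.name
text = read_with_fallback(path)
os.unlink(path)
```

Trying UTF-16 first matches the corpus's dominant encoding; the length check guards against an encoding that technically decodes but yields an implausibly short result.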
src/sen/__init__.py DELETED
@@ -1,26 +0,0 @@
-"""
-Sen-1: Vietnamese Text Classification by UnderTheSea NLP.
-
-Based on: "A Comparative Study on Vietnamese Text Classification Methods"
-Vu et al., RIVF 2007
-
-Methods:
-- TF-IDF vectorization (sklearn)
-- SVM (Support Vector Machine) classifier
-"""
-
-from .text_classifier import (
-    Label,
-    Sentence,
-    SenTextClassifier,
-    classify,
-)
-
-__version__ = "1.0.0"
-
-__all__ = [
-    "Label",
-    "Sentence",
-    "SenTextClassifier",
-    "classify",
-]
src/sen/text_classifier.py DELETED
@@ -1,374 +0,0 @@
-"""
-Sen Text Classifier - Rust-based classifier using underthesea_core_extend.
-
-Based on: "A Comparative Study on Vietnamese Text Classification Methods"
-Vu et al., RIVF 2007
-https://ieeexplore.ieee.org/document/4223084/
-
-Methods:
-- TF-IDF vectorization (Rust: underthesea_core_extend.TfIdfVectorizer)
-- Linear SVM classifier (Rust: underthesea_core_extend.LinearSVM)
-"""
-
-import json
-import os
-from typing import List, Optional, Union
-
-from underthesea_core_extend import TfIdfVectorizer, LinearSVM, SVMTrainer
-
-
-class Label:
-    """Label class compatible with underthesea."""
-
-    def __init__(self, value: str, score: float = 1.0):
-        self.value = value
-        self.score = min(max(score, 0.0), 1.0)
-
-    def __str__(self):
-        return f"{self.value} ({self.score:.4f})"
-
-    def __repr__(self):
-        return f"{self.value} ({self.score:.4f})"
-
-
-class Sentence:
-    """Sentence class compatible with underthesea."""
-
-    def __init__(self, text: str = None, labels: List[Label] = None):
-        self.text = text
-        self.labels = labels or []
-
-    def __str__(self):
-        return f'Sentence: "{self.text}" - Labels: {self.labels}'
-
-    def __repr__(self):
-        return f'Sentence: "{self.text}" - Labels: {self.labels}'
-
-    def add_labels(self, labels: List[Union[Label, str]]):
-        for label in labels:
-            if isinstance(label, str):
-                label = Label(label)
-            self.labels.append(label)
-
-
-class SenTextClassifier:
-    """
-    Rust-based text classifier using TF-IDF + Linear SVM.
-
-    Uses underthesea_core_extend for fast training and inference.
-    Compatible with underthesea API.
-
-    Reference:
-        Vu et al. "A Comparative Study on Vietnamese Text Classification Methods"
-        RIVF 2007
-    """
-
-    def __init__(
-        self,
-        # TF-IDF parameters
-        max_features: int = 20000,
-        ngram_range: tuple = (1, 2),
-        min_df: int = 1,
-        max_df: float = 1.0,
-        # SVM parameters
-        c: float = 1.0,
-        max_iter: int = 1000,
-        tol: float = 0.1,
-        verbose: bool = True,
-    ):
-        self.max_features = max_features
-        self.ngram_range = ngram_range
-        self.min_df = min_df
-        self.max_df = max_df
-        self.c = c
-        self.max_iter = max_iter
-        self.tol = tol
-        self.verbose = verbose
-
-        self.vectorizer: Optional[TfIdfVectorizer] = None
-        self.classifier: Optional[LinearSVM] = None
-        self.labels_: Optional[List[str]] = None
-
-    def train(
-        self,
-        train_texts: List[str],
-        train_labels: List[str],
-        val_texts: List[str] = None,
-        val_labels: List[str] = None,
-    ) -> dict:
-        """
-        Train the classifier.
-
-        Args:
-            train_texts: List of training texts
-            train_labels: List of training labels
-            val_texts: Optional validation texts
-            val_labels: Optional validation labels
-
-        Returns:
-            Dictionary with training metrics
-        """
-        # Get unique labels
-        self.labels_ = sorted(list(set(train_labels)))
-
-        # Build and fit vectorizer
-        self.vectorizer = TfIdfVectorizer(
-            max_features=self.max_features,
-            ngram_range=self.ngram_range,
-            min_df=self.min_df,
-            max_df=self.max_df,
-        )
-        self.vectorizer.fit(train_texts)
-
-        # Transform to features
-        train_features = self.vectorizer.transform_batch(train_texts)
-
-        # Build and train SVM model
-        trainer = SVMTrainer(
-            c=self.c,
-            max_iter=self.max_iter,
-            tol=self.tol,
-            verbose=self.verbose,
-        )
-        self.classifier = trainer.train(train_features, train_labels)
-
-        # Calculate training metrics
-        train_preds = self.classifier.predict_batch(train_features)
-        train_acc = sum(1 for p, t in zip(train_preds, train_labels) if p == t) / len(train_labels)
-
-        # Calculate F1 score
-        train_f1 = self._calculate_f1(train_labels, train_preds)
-
-        results = {
-            "train_accuracy": train_acc,
-            "train_f1": train_f1,
-            "num_classes": len(self.labels_),
-            "num_samples": len(train_texts),
-            "vocab_size": self.vectorizer.vocab_size,
-        }
-
-        print(f"Training completed:")
-        print(f"  - Samples: {len(train_texts)}")
-        print(f"  - Classes: {len(self.labels_)}")
-        print(f"  - Vocab size: {self.vectorizer.vocab_size}")
-        print(f"  - Train accuracy: {train_acc:.4f}")
-        print(f"  - Train F1: {train_f1:.4f}")
-
-        # Validation metrics
-        if val_texts and val_labels:
-            val_features = self.vectorizer.transform_batch(val_texts)
-            val_preds = self.classifier.predict_batch(val_features)
-            val_acc = sum(1 for p, t in zip(val_preds, val_labels) if p == t) / len(val_labels)
-            val_f1 = self._calculate_f1(val_labels, val_preds)
-
-            results["val_accuracy"] = val_acc
-            results["val_f1"] = val_f1
-
-            print(f"  - Val accuracy: {val_acc:.4f}")
-            print(f"  - Val F1: {val_f1:.4f}")
-
-        return results
-
-    def _calculate_f1(self, y_true: List[str], y_pred: List[str]) -> float:
-        """Calculate weighted F1 score."""
-        from collections import Counter
-
-        label_counts = Counter(y_true)
-        total = len(y_true)
-
-        f1_sum = 0.0
-        for label in self.labels_:
-            tp = sum(1 for t, p in zip(y_true, y_pred) if t == label and p == label)
-            fp = sum(1 for t, p in zip(y_true, y_pred) if t != label and p == label)
-            fn = sum(1 for t, p in zip(y_true, y_pred) if t == label and p != label)
-
-            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
-            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
-            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
-
-            weight = label_counts[label] / total
-            f1_sum += f1 * weight
-
-        return f1_sum
-
-    def predict(self, sentence: Sentence) -> None:
-        """
-        Predict label for a sentence (underthesea-compatible API).
-
-        Args:
-            sentence: Sentence object with text attribute
-        """
-        if self.classifier is None or self.vectorizer is None:
-            raise ValueError("Model not trained. Call train() first or load a model.")
-
-        features = self.vectorizer.transform_dense(sentence.text)
-        label_value, score = self.classifier.predict_with_score(features)
-
-        sentence.labels = []
-        sentence.add_labels([Label(label_value, score)])
-
-    def predict_batch(self, texts: List[str]) -> List[Label]:
-        """
-        Predict labels for multiple texts.
-
-        Args:
-            texts: List of texts to classify
-
-        Returns:
-            List of Label objects
-        """
-        if self.classifier is None or self.vectorizer is None:
-            raise ValueError("Model not trained. Call train() first or load a model.")
-
-        # Use dense transform (faster Python-Rust interface)
-        features = self.vectorizer.transform_batch(texts)
-        results = []
-        for feat in features:
-            label_value, score = self.classifier.predict_with_score(feat)
-            results.append(Label(label_value, float(score)))
-
-        return results
-
-    def evaluate(self, texts: List[str], labels: List[str]) -> dict:
-        """
-        Evaluate model on test data.
-
-        Args:
-            texts: List of texts
-            labels: List of true labels
-
-        Returns:
-            Dictionary with evaluation metrics
-        """
-        # Use dense transform (faster Python-Rust interface)
-        features = self.vectorizer.transform_batch(texts)
-        y_pred = self.classifier.predict_batch(features)
-
-        acc = sum(1 for p, t in zip(y_pred, labels) if p == t) / len(labels)
-        f1 = self._calculate_f1(labels, y_pred)
-
-        print(f"Evaluation:")
-        print(f"  - Accuracy: {acc:.4f}")
-        print(f"  - F1 (weighted): {f1:.4f}")
-
-        # Print classification report
-        self._print_classification_report(labels, y_pred)
-
-        return {"accuracy": acc, "f1": f1}
-
-    def _print_classification_report(self, y_true: List[str], y_pred: List[str]):
-        """Print classification report."""
-        from collections import Counter
-
-        print("\nClassification Report:")
-        print(f"{'':>20} {'precision':>10} {'recall':>10} {'f1-score':>10} {'support':>10}")
-        print()
-
-        label_counts = Counter(y_true)
-
-        for label in self.labels_:
-            tp = sum(1 for t, p in zip(y_true, y_pred) if t == label and p == label)
-            fp = sum(1 for t, p in zip(y_true, y_pred) if t != label and p == label)
-            fn = sum(1 for t, p in zip(y_true, y_pred) if t == label and p != label)
-
-            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
-            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
-            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
-            support = label_counts[label]
-
-            print(f"{label:>20} {precision:>10.2f} {recall:>10.2f} {f1:>10.2f} {support:>10}")
-
-        print()
-
-    def save(self, path: str) -> None:
-        """
-        Save model to disk.
-
-        Args:
-            path: Directory path to save model
-        """
-        os.makedirs(path, exist_ok=True)
-
-        # Save vectorizer
-        self.vectorizer.save(os.path.join(path, "vectorizer.json"))
-
-        # Save classifier
-        self.classifier.save(os.path.join(path, "classifier.json"))
-
-        # Save metadata
-        metadata = {
-            "estimator": "RUST_SVM",
-            "max_features": self.max_features,
-            "ngram_range": self.ngram_range,
-            "min_df": self.min_df,
-            "max_df": self.max_df,
-            "c": self.c,
-            "max_iter": self.max_iter,
-            "tol": self.tol,
-            "labels": self.labels_,
-            "vocab_size": self.vectorizer.vocab_size,
-            "n_classes": self.classifier.n_classes,
-        }
-        with open(os.path.join(path, "metadata.json"), "w", encoding="utf-8") as f:
-            json.dump(metadata, f, ensure_ascii=False, indent=2)
-
-        print(f"Model saved to: {path}")
-
-    @classmethod
-    def load(cls, path: str) -> "SenTextClassifier":
-        """
-        Load model from disk.
-
-        Args:
-            path: Directory path containing saved model
-
-        Returns:
-            Loaded SenTextClassifier instance
-        """
-        # Load metadata
-        with open(os.path.join(path, "metadata.json"), "r", encoding="utf-8") as f:
-            metadata = json.load(f)
-
-        # Create instance with saved parameters
-        classifier = cls(
-            max_features=metadata.get("max_features", 20000),
-            ngram_range=tuple(metadata.get("ngram_range", (1, 2))),
-            min_df=metadata.get("min_df", 1),
-            max_df=metadata.get("max_df", 1.0),
-            c=metadata.get("c", 1.0),
-            max_iter=metadata.get("max_iter", 1000),
-            tol=metadata.get("tol", 0.1),
-        )
-
-        # Load vectorizer
-        classifier.vectorizer = TfIdfVectorizer.load(os.path.join(path, "vectorizer.json"))
-
-        # Load SVM model
-        classifier.classifier = LinearSVM.load(os.path.join(path, "classifier.json"))
-        classifier.labels_ = metadata.get("labels", [])
-
-        print(f"Model loaded from: {path}")
-        return classifier
-
-
-def classify(text: str, model_path: str = None) -> List[str]:
-    """
-    Classify text using Sen model.
-
-    Args:
-        text: Input text to classify
-        model_path: Path to trained model
-
-    Returns:
-        List of predicted labels
-    """
-    if not hasattr(classify, "_classifier") or classify._model_path != model_path:
-        if model_path:
-            classify._classifier = SenTextClassifier.load(model_path)
-            classify._model_path = model_path
-        else:
-            raise ValueError("model_path is required")
-
-    sentence = Sentence(text)
-    classify._classifier.predict(sentence)
-    return [label.value for label in sentence.labels]
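The removed `_calculate_f1` computes a support-weighted F1 without pulling in sklearn. A standalone version of the same arithmetic, checked on a toy two-class case:

```python
from collections import Counter

def weighted_f1(y_true, y_pred):
    """Support-weighted F1: per-class F1 averaged by class frequency in y_true."""
    labels = sorted(set(y_true))
    counts = Counter(y_true)
    total = len(y_true)
    score = 0.0
    for label in labels:
        # per-class confusion counts
        tp = sum(t == label and p == label for t, p in zip(y_true, y_pred))
        fp = sum(t != label and p == label for t, p in zip(y_true, y_pred))
        fn = sum(t == label and p != label for t, p in zip(y_true, y_pred))
        precision = tp / (tp + fp) if tp + fp else 0.0
        recall = tp / (tp + fn) if tp + fn else 0.0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
        # weight each class F1 by its share of the true labels
        score += f1 * counts[label] / total
    return score

y_true = ["a", "a", "b", "b"]
y_pred = ["a", "b", "b", "b"]
# class "a": P=1, R=0.5, F1=2/3; class "b": P=2/3, R=1, F1=0.8; weights 0.5 each
```

This matches sklearn's `f1_score(..., average='weighted')`, which the consolidated `src/train.py` now uses instead.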
src/train.py ADDED
@@ -0,0 +1,213 @@
1
+ """
2
+ Training CLI for Vietnamese Text Classification.
3
+
4
+ Usage:
5
+ python train.py vntc --output models/sen-vntc.bin
6
+ python train.py bank --output models/sen-bank.bin
7
+ """
8
+
9
+ import os
10
+ import time
11
+ from pathlib import Path
12
+
13
+ import click
14
+ from sklearn.metrics import accuracy_score, f1_score, classification_report
15
+
16
+ from underthesea_core import TextClassifier
17
+
18
+
19
+ def read_file(filepath):
20
+ """Read text file with multiple encoding attempts."""
21
+ for enc in ['utf-16', 'utf-8', 'latin-1']:
22
+ try:
23
+ with open(filepath, 'r', encoding=enc) as f:
24
+ text = ' '.join(f.read().split())
25
+ if len(text) > 10:
26
+ return text
27
+ except (UnicodeDecodeError, UnicodeError):
28
+ continue
29
+ return None
30
+
31
+
32
+ def load_vntc_data(data_dir):
33
+ """Load VNTC data from directory."""
34
+ texts, labels = [], []
35
+
36
+ for folder in sorted(os.listdir(data_dir)):
37
+ folder_path = os.path.join(data_dir, folder)
38
+ if not os.path.isdir(folder_path):
39
+ continue
40
+
41
+ for fname in os.listdir(folder_path):
42
+ if fname.endswith('.txt'):
43
+ text = read_file(os.path.join(folder_path, fname))
44
+ if text:
45
+ texts.append(text)
46
+ labels.append(folder)
47
+
48
+ return texts, labels
49
+
50
+
51
+ @click.group()
52
+ def cli():
53
+ """Train Vietnamese text classification models."""
54
+ pass
55
+
56
+
57
+ @cli.command()
58
+ @click.option('--data-dir', default='/home/claude-user/projects/workspace_underthesea/VNTC/Data/10Topics/Ver1.1',
59
+ help='Path to VNTC dataset')
60
+ @click.option('--output', '-o', default='models/sen-vntc.bin', help='Output model path')
61
+ @click.option('--max-features', default=20000, help='Maximum vocabulary size')
+ @click.option('--ngram-min', default=1, help='Minimum n-gram')
+ @click.option('--ngram-max', default=2, help='Maximum n-gram')
+ @click.option('--min-df', default=2, help='Minimum document frequency')
+ @click.option('--c', default=1.0, help='SVM regularization parameter')
+ @click.option('--max-iter', default=1000, help='Maximum iterations')
+ @click.option('--tol', default=0.1, help='Convergence tolerance')
+ def vntc(data_dir, output, max_features, ngram_min, ngram_max, min_df, c, max_iter, tol):
+     """Train on VNTC dataset (10 topics, ~84k documents)."""
+     click.echo("=" * 70)
+     click.echo("VNTC Dataset Training (10 Topics)")
+     click.echo("=" * 70)
+
+     train_dir = os.path.join(data_dir, "Train_Full")
+     test_dir = os.path.join(data_dir, "Test_Full")
+
+     # Load data
+     click.echo("\nLoading data...")
+     t0 = time.perf_counter()
+     train_texts, train_labels = load_vntc_data(train_dir)
+     test_texts, test_labels = load_vntc_data(test_dir)
+     load_time = time.perf_counter() - t0
+
+     click.echo(f"  Train samples: {len(train_texts)}")
+     click.echo(f"  Test samples:  {len(test_texts)}")
+     click.echo(f"  Categories:    {len(set(train_labels))}")
+     click.echo(f"  Load time:     {load_time:.2f}s")
+
+     # Train
+     click.echo("\nTraining Rust TextClassifier...")
+     clf = TextClassifier(
+         max_features=max_features,
+         ngram_range=(ngram_min, ngram_max),
+         min_df=min_df,
+         c=c,
+         max_iter=max_iter,
+         tol=tol,
+     )
+
+     t0 = time.perf_counter()
+     clf.fit(train_texts, train_labels)
+     train_time = time.perf_counter() - t0
+     click.echo(f"  Training time:   {train_time:.2f}s")
+     click.echo(f"  Vocabulary size: {clf.n_features}")
+
+     # Evaluate
+     click.echo("\nEvaluating...")
+     t0 = time.perf_counter()
+     preds = clf.predict_batch(test_texts)
+     infer_time = time.perf_counter() - t0
+     throughput = len(test_texts) / infer_time
+
+     acc = accuracy_score(test_labels, preds)
+     f1_w = f1_score(test_labels, preds, average='weighted')
+     f1_m = f1_score(test_labels, preds, average='macro')
+
+     click.echo(f"  Inference: {infer_time:.3f}s ({throughput:.0f} samples/sec)")
+
+     click.echo("\n" + "=" * 70)
+     click.echo("RESULTS")
+     click.echo("=" * 70)
+     click.echo(f"  Accuracy:      {acc:.4f} ({acc*100:.2f}%)")
+     click.echo(f"  F1 (weighted): {f1_w:.4f}")
+     click.echo(f"  F1 (macro):    {f1_m:.4f}")
+
+     click.echo("\nClassification Report:")
+     click.echo(classification_report(test_labels, preds))
+
+     # Save model
+     model_path = Path(output)
+     model_path.parent.mkdir(parents=True, exist_ok=True)
+     clf.save(str(model_path))
+
+     size_mb = model_path.stat().st_size / (1024 * 1024)
+     click.echo(f"\nModel saved to {model_path} ({size_mb:.2f} MB)")
+
+
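The `load_vntc_data` helper used above is defined earlier in `src/train.py`, outside this hunk. As a reference for the expected on-disk layout, here is a minimal loader sketch; the one-subdirectory-per-category layout under `Train_Full`/`Test_Full` and the UTF-16 file encoding are assumptions (matching the original VNTC release), not the actual helper:

```python
import os

def load_vntc_data(data_dir, encoding="utf-16"):
    """Sketch of a VNTC-style loader (assumption: not the real helper).

    Expects one subdirectory per category under data_dir, each holding
    plain-text documents; returns parallel (texts, labels) lists.
    """
    texts, labels = [], []
    for category in sorted(os.listdir(data_dir)):
        cat_dir = os.path.join(data_dir, category)
        if not os.path.isdir(cat_dir):
            continue  # skip stray files at the top level
        for fname in sorted(os.listdir(cat_dir)):
            path = os.path.join(cat_dir, fname)
            with open(path, encoding=encoding, errors="ignore") as f:
                texts.append(f.read())
            labels.append(category)  # directory name doubles as the label
    return texts, labels
```

The label is simply the category directory name, which is why the CLI can report `len(set(train_labels))` as the category count.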
+ @cli.command()
+ @click.option('--output', '-o', default='models/sen-bank.bin', help='Output model path')
+ @click.option('--max-features', default=10000, help='Maximum vocabulary size')
+ @click.option('--ngram-min', default=1, help='Minimum n-gram')
+ @click.option('--ngram-max', default=2, help='Maximum n-gram')
+ @click.option('--min-df', default=1, help='Minimum document frequency')
+ @click.option('--c', default=1.0, help='SVM regularization parameter')
+ @click.option('--max-iter', default=1000, help='Maximum iterations')
+ @click.option('--tol', default=0.1, help='Convergence tolerance')
+ def bank(output, max_features, ngram_min, ngram_max, min_df, c, max_iter, tol):
+     """Train on UTS2017_Bank dataset (14 categories, banking domain)."""
+     from datasets import load_dataset
+
+     click.echo("=" * 70)
+     click.echo("UTS2017_Bank Dataset Training (14 Categories)")
+     click.echo("=" * 70)
+
+     # Load data
+     click.echo("\nLoading UTS2017_Bank dataset from HuggingFace...")
+     dataset = load_dataset("undertheseanlp/UTS2017_Bank", "classification")
+
+     train_texts = list(dataset["train"]["text"])
+     train_labels = list(dataset["train"]["label"])
+     test_texts = list(dataset["test"]["text"])
+     test_labels = list(dataset["test"]["label"])
+
+     click.echo(f"  Train samples: {len(train_texts)}")
+     click.echo(f"  Test samples:  {len(test_texts)}")
+     click.echo(f"  Categories:    {len(set(train_labels))}")
+
+     # Train
+     click.echo("\nTraining Rust TextClassifier...")
+     clf = TextClassifier(
+         max_features=max_features,
+         ngram_range=(ngram_min, ngram_max),
+         min_df=min_df,
+         c=c,
+         max_iter=max_iter,
+         tol=tol,
+     )
+
+     t0 = time.perf_counter()
+     clf.fit(train_texts, train_labels)
+     train_time = time.perf_counter() - t0
+     click.echo(f"  Training time:   {train_time:.3f}s")
+     click.echo(f"  Vocabulary size: {clf.n_features}")
+
+     # Evaluate
+     click.echo("\nEvaluating...")
+     preds = clf.predict_batch(test_texts)
+
+     acc = accuracy_score(test_labels, preds)
+     f1_w = f1_score(test_labels, preds, average='weighted')
+     f1_m = f1_score(test_labels, preds, average='macro')
+
+     click.echo("\n" + "=" * 70)
+     click.echo("RESULTS")
+     click.echo("=" * 70)
+     click.echo(f"  Accuracy:      {acc:.4f} ({acc*100:.2f}%)")
+     click.echo(f"  F1 (weighted): {f1_w:.4f}")
+     click.echo(f"  F1 (macro):    {f1_m:.4f}")
+
+     click.echo("\nClassification Report:")
+     click.echo(classification_report(test_labels, preds))
+
+     # Save model
+     model_path = Path(output)
+     model_path.parent.mkdir(parents=True, exist_ok=True)
+     clf.save(str(model_path))
+
+     size_mb = model_path.stat().st_size / (1024 * 1024)
+     click.echo(f"\nModel saved to {model_path} ({size_mb:.2f} MB)")
+
+
+ if __name__ == "__main__":
+     cli()
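Both commands drive the same TF-IDF + Linear SVM pipeline behind underthesea_core's Rust `TextClassifier`. For readers more familiar with scikit-learn, here is a rough equivalent of what the CLI trains; this is an illustrative sketch with toy data, not the Rust implementation, and the scikit-learn classes are stand-ins mirroring the CLI's option names:

```python
# Illustrative scikit-learn stand-in for the Rust TextClassifier the two
# commands train (assumption: equivalent behavior, not the real pipeline).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

# Toy syllable-level Vietnamese samples; real runs use VNTC or UTS2017_Bank
texts = [
    "bóng đá hôm nay",         # the_thao
    "giá vàng tăng mạnh",      # kinh_doanh
    "trận đấu kịch tính",      # the_thao
    "thị trường chứng khoán",  # kinh_doanh
]
labels = ["the_thao", "kinh_doanh", "the_thao", "kinh_doanh"]

# Same knobs the CLI exposes: vocabulary cap, 1-2 grams, min-df, C, max-iter, tol
pipe = make_pipeline(
    TfidfVectorizer(max_features=20000, ngram_range=(1, 2), min_df=1),
    LinearSVC(C=1.0, max_iter=1000, tol=0.1),
)
pipe.fit(texts, labels)
prediction = pipe.predict(["kết quả trận bóng"])[0]
```

In the repository itself this role is filled by `underthesea_core`'s `TextClassifier`, which keeps the whole fit/predict path in Rust and serializes models with bincode.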
tests/test_classifier.py DELETED
@@ -1,165 +0,0 @@
- """Test Sen Text Classifier (sklearn-based)."""
-
- import sys
- sys.path.insert(0, "/home/anhvu2/projects/workspace_underthesea")
-
- from sen import SenTextClassifier, Sentence, Label
-
-
- def test_training():
-     """Test training on sample Vietnamese sentiment data."""
-     print("=" * 60)
-     print("Test: Training sklearn-based Text Classifier")
-     print("=" * 60)
-
-     # Sample Vietnamese sentiment data
-     train_texts = [
-         "Sản phẩm rất tốt, tôi hài lòng!",
-         "Chất lượng tuyệt vời, giao hàng nhanh",
-         "Hàng đẹp, đóng gói cẩn thận",
-         "Mình rất thích sản phẩm này",
-         "Shop phục vụ nhiệt tình, sẽ ủng hộ tiếp",
-         "Hàng chính hãng, giá tốt",
-         "Rất đáng tiền, recommend cho mọi người",
-         "Chất liệu tốt, may đẹp",
-         "Hàng tệ quá, không như mô tả",
-         "Chất lượng kém, không đáng tiền",
-         "Giao hàng chậm, đóng gói cẩu thả",
-         "Sản phẩm lỗi, shop không hỗ trợ",
-         "Thất vọng, không bao giờ mua lại",
-         "Hàng giả, không nên mua",
-         "Tệ lắm, phí tiền",
-         "Màu không đúng, size sai",
-     ]
-     train_labels = [
-         "positive", "positive", "positive", "positive",
-         "positive", "positive", "positive", "positive",
-         "negative", "negative", "negative", "negative",
-         "negative", "negative", "negative", "negative",
-     ]
-
-     val_texts = [
-         "Hàng ok, sẽ mua lại",
-         "Tệ lắm, không nên mua",
-     ]
-     val_labels = ["positive", "negative"]
-
-     # Initialize and train
-     classifier = SenTextClassifier(
-         max_features=1000,
-         ngram_range=(1, 2),
-         min_df=1,
-         C=1.0,
-     )
-
-     print("\nTraining...")
-     history = classifier.train(
-         train_texts=train_texts,
-         train_labels=train_labels,
-         val_texts=val_texts,
-         val_labels=val_labels,
-     )
-     print()
-
-     # Test predictions
-     print("Testing predictions:")
-     test_texts = [
-         "Sản phẩm tuyệt vời!",
-         "Hàng rất tệ",
-         "Giao hàng nhanh, hàng đẹp",
-         "Thất vọng với chất lượng",
-         "Chất lượng tốt, giá hợp lý",
-         "Không đáng tiền, hàng kém",
-     ]
-
-     for text in test_texts:
-         sentence = Sentence(text)
-         classifier.predict(sentence)
-         print(f"  '{text}' -> {sentence.labels[0]}")
-     print()
-
-     # Test batch prediction
-     print("Batch prediction:")
-     labels = classifier.predict_batch(test_texts)
-     for text, label in zip(test_texts, labels):
-         print(f"  '{text}' -> {label}")
-     print()
-
-     # Save model
-     save_path = "/tmp/sen-classifier-sklearn"
-     classifier.save(save_path)
-
-     # Load and test
-     print("\nLoading saved model...")
-     loaded_classifier = SenTextClassifier.load(save_path)
-
-     sentence = Sentence("Rất hài lòng với sản phẩm")
-     loaded_classifier.predict(sentence)
-     print(f"Loaded model prediction: '{sentence.text}' -> {sentence.labels[0]}")
-     print()
-
-
- def test_multiclass():
-     """Test multi-class classification (news categories)."""
-     print("=" * 60)
-     print("Test: Multi-class Classification (News Categories)")
-     print("=" * 60)
-
-     # Sample Vietnamese news data (simulating VNTC categories)
-     train_texts = [
-         # Thể thao (Sports)
-         "Đội tuyển Việt Nam thắng 3-0 trước Indonesia",
-         "Cầu thủ Nguyễn Quang Hải ghi bàn đẹp mắt",
-         "V-League 2024 khởi tranh vào tháng tới",
-         "HLV Park Hang-seo chia tay bóng đá Việt Nam",
-         # Kinh doanh (Business)
-         "Chứng khoán tăng điểm mạnh phiên đầu tuần",
-         "Ngân hàng Nhà nước điều chỉnh lãi suất",
-         "Doanh nghiệp xuất khẩu gặp khó khăn",
-         "Thị trường bất động sản phục hồi",
-         # Công nghệ (Technology)
-         "Apple ra mắt iPhone 16 với nhiều tính năng mới",
-         "Trí tuệ nhân tạo đang thay đổi cuộc sống",
-         "Startup công nghệ Việt Nam gọi vốn thành công",
-         "5G được triển khai rộng rãi tại Việt Nam",
-     ]
-     train_labels = [
-         "the_thao", "the_thao", "the_thao", "the_thao",
-         "kinh_doanh", "kinh_doanh", "kinh_doanh", "kinh_doanh",
-         "cong_nghe", "cong_nghe", "cong_nghe", "cong_nghe",
-     ]
-
-     # Initialize and train
-     classifier = SenTextClassifier(
-         max_features=500,
-         ngram_range=(1, 2),
-         min_df=1,
-     )
-
-     print("\nTraining...")
-     classifier.train(train_texts, train_labels)
-     print()
-
-     # Test predictions
-     print("Testing predictions:")
-     test_texts = [
-         "Ronaldo ghi hat-trick trong trận đấu",
-         "VN-Index tăng 10 điểm hôm nay",
-         "Samsung ra mắt điện thoại mới",
-         "Đội bóng đá Việt Nam vô địch AFF Cup",
-         "Lãi suất ngân hàng giảm mạnh",
-     ]
-
-     for text in test_texts:
-         sentence = Sentence(text)
-         classifier.predict(sentence)
-         print(f"  '{text}' -> {sentence.labels[0]}")
-     print()
-
-
- if __name__ == "__main__":
-     test_training()
-     test_multiclass()
-     print("=" * 60)
-     print("All tests completed!")
-     print("=" * 60)
uv.lock ADDED
The diff for this file is too large to render. See raw diff