diff --git a/.gitattributes b/.gitattributes index 24a49c7add6ae93aa3928cb19b922aa3171f1952..43adae2acb07a23584eda3119f481c7375a4a1a2 100644 --- a/.gitattributes +++ b/.gitattributes @@ -42,5 +42,6 @@ references/2021.naacl.nguyen/paper.pdf filter=lfs diff=lfs merge=lfs -text references/2021.naacl.nguyen/source/JointModel.pdf filter=lfs diff=lfs merge=lfs -text *.pdf filter=lfs diff=lfs merge=lfs -text *.crfsuite filter=lfs diff=lfs merge=lfs -text +*.crf filter=lfs diff=lfs merge=lfs -text *.mco filter=lfs diff=lfs merge=lfs -text *.jar filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore index bbf6b040c7684dde394fa1313b5cef0775333682..6d20b04b8cfa9d7924e12b87333e9d37ba759aee 100644 --- a/.gitignore +++ b/.gitignore @@ -26,10 +26,10 @@ per_tag_metrics.png # Temporary model files (main model is tracked via Git LFS) *.crfsuite !pos_tagger.crfsuite +!models/**/*.crfsuite # Logs *.log wandb/ -models.claude .claude diff --git a/models/pos_tagger/20260131_000000/metadata.yaml b/models/pos_tagger/20260131_000000/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..154ab2489c5ea7816cb0a8eb8f091ee03d482e1d --- /dev/null +++ b/models/pos_tagger/20260131_000000/metadata.yaml @@ -0,0 +1,44 @@ +# POS Tagger Model Metadata +# Auto-generated during training + +model: + name: Vietnamese POS Tagger + version: "20260131_000000" + type: CRF (Conditional Random Field) + framework: python-crfsuite + +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 100 + +performance: + test_accuracy: 0.9282 + tags: + - ADJ + - ADP + - ADV + - AUX + - CCONJ + - DET + - NOUN + - NUM + - PART + - PRON + - PROPN + - PUNCT + - SCONJ + - VERB + - X + +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml + +created_at: "2026-01-31" +author: undertheseanlp diff --git a/models/pos_tagger/20260131_000000/model.crfsuite b/models/pos_tagger/20260131_000000/model.crfsuite new file mode 100644 index 0000000000000000000000000000000000000000..0df4ec4a8af56c7c1516e7acc6ca3d7fad8d4dd3 --- /dev/null +++ b/models/pos_tagger/20260131_000000/model.crfsuite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27dfbf196829379c69feda056d53482b3cc69a7f134fc5b853b0ba3a0f80f139 +size 2366076 diff --git a/models/pos_tagger/baseline-pos-10iter/metadata.yaml b/models/pos_tagger/baseline-pos-10iter/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2ebaf698b563416465fc1bb2c9996e96b1e9358e --- /dev/null +++ b/models/pos_tagger/baseline-pos-10iter/metadata.yaml @@ -0,0 +1,26 @@ +model: + name: Vietnamese POS Tagger + version: baseline-pos-10iter + type: CRF (Conditional Random Field) + framework: python-crfsuite +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 10 + duration_seconds: 68.24 +performance: + test_accuracy: 0.7837 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml +created_at: '2026-01-31 15:36:03' +author: undertheseanlp diff --git a/models/pos_tagger/baseline-pos-10iter/model.crfsuite b/models/pos_tagger/baseline-pos-10iter/model.crfsuite new file mode 100644 index 0000000000000000000000000000000000000000..dcd1b503e32fa62550f96a4f126844650fec7db6 --- /dev/null +++ b/models/pos_tagger/baseline-pos-10iter/model.crfsuite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1b1dcf8240e73a5953180f5da1c2a54aa6ef2dd67775ccd5c86a245489c1520 +size 14812304 diff --git a/models/pos_tagger/baseline-python-crfsuite-10iter/metadata.yaml b/models/pos_tagger/baseline-python-crfsuite-10iter/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ab11065b425a36430d11b2cef7447b6021fae296 --- /dev/null +++ b/models/pos_tagger/baseline-python-crfsuite-10iter/metadata.yaml @@ -0,0 +1,26 @@ +model: + name: Vietnamese POS Tagger + version: baseline-python-crfsuite-10iter + type: CRF (Conditional Random Field) + framework: python-crfsuite +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 10 + duration_seconds: 67.69 +performance: + test_accuracy: 0.7837 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml +created_at: '2026-01-31 15:56:21' +author: undertheseanlp diff --git a/models/pos_tagger/baseline-python-crfsuite-10iter/model.crfsuite b/models/pos_tagger/baseline-python-crfsuite-10iter/model.crfsuite new file mode 100644 index 0000000000000000000000000000000000000000..dcd1b503e32fa62550f96a4f126844650fec7db6 --- /dev/null +++ b/models/pos_tagger/baseline-python-crfsuite-10iter/model.crfsuite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1b1dcf8240e73a5953180f5da1c2a54aa6ef2dd67775ccd5c86a245489c1520 +size 14812304 diff --git a/models/pos_tagger/baseline-underthesea-10iter/metadata.yaml b/models/pos_tagger/baseline-underthesea-10iter/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7efc516e4f563c6286d61285818d6f2b84e6f276 --- /dev/null +++ b/models/pos_tagger/baseline-underthesea-10iter/metadata.yaml @@ -0,0 +1,26 @@ +model: + name: Vietnamese POS Tagger + version: baseline-underthesea-10iter + type: CRF (Conditional Random Field) + framework: underthesea-core +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 10 + duration_seconds: 59.02 +performance: + test_accuracy: 0.7542 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml +created_at: '2026-01-31 15:55:02' +author: undertheseanlp diff --git a/models/pos_tagger/baseline-underthesea-10iter/model.crf b/models/pos_tagger/baseline-underthesea-10iter/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..42a3894c488256eaca779dcfc94aba43a2a15058 --- /dev/null +++ b/models/pos_tagger/baseline-underthesea-10iter/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79f0d10abcc1d2bc4695a0406e924c3a66b2517b16dd214d4d57b2b5391b5980 +size 29270222 diff --git a/models/pos_tagger/crfsuite-rs/metadata.yaml b/models/pos_tagger/crfsuite-rs/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa44df4205dfd62fec6d3d8e12edc3425f1fb58c --- /dev/null +++ b/models/pos_tagger/crfsuite-rs/metadata.yaml @@ -0,0 +1,26 @@ +model: + name: Vietnamese POS Tagger + version: crfsuite-rs + type: CRF (Conditional Random Field) + framework: crfsuite-rs +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 100 + duration_seconds: 165.75 +performance: + test_accuracy: 0.9589 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml +created_at: '2026-01-31 07:32:56' +author: undertheseanlp diff --git a/models/pos_tagger/crfsuite-rs/model.crfsuite b/models/pos_tagger/crfsuite-rs/model.crfsuite new file mode 100644 index 0000000000000000000000000000000000000000..0df4ec4a8af56c7c1516e7acc6ca3d7fad8d4dd3 --- /dev/null +++ b/models/pos_tagger/crfsuite-rs/model.crfsuite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27dfbf196829379c69feda056d53482b3cc69a7f134fc5b853b0ba3a0f80f139 +size 2366076 diff --git a/models/pos_tagger/fast-exp-10iter/metadata.yaml b/models/pos_tagger/fast-exp-10iter/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..96c743a51f7bd98c2410c44f022872e8590efb65 --- /dev/null +++ b/models/pos_tagger/fast-exp-10iter/metadata.yaml @@ -0,0 +1,26 @@ +model: + name: Vietnamese POS Tagger + version: fast-exp-10iter + type: CRF (Conditional Random Field) + framework: underthesea-core +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 10 + duration_seconds: 57.44 +performance: + test_accuracy: 0.7553 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml +created_at: '2026-01-31 16:10:30' +author: undertheseanlp diff --git a/models/pos_tagger/fast-exp-10iter/model.crf b/models/pos_tagger/fast-exp-10iter/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..cdaf0853b0d9662471aab1b404382b1b267361cd --- /dev/null +++ b/models/pos_tagger/fast-exp-10iter/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79ef0e55cc1b76bfde88f4abe7c6689968b10ce754cb8e96d0da3655a9bf7f33 +size 29349246 diff --git a/models/pos_tagger/final-baseline-10iter/metadata.yaml b/models/pos_tagger/final-baseline-10iter/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..701d63a76291983dd119a0196fdf6bf2136e9d63 --- /dev/null +++ b/models/pos_tagger/final-baseline-10iter/metadata.yaml @@ -0,0 +1,26 @@ +model: + name: Vietnamese POS Tagger + version: final-baseline-10iter + type: CRF (Conditional Random Field) + framework: underthesea-core +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 10 + duration_seconds: 59.49 +performance: + test_accuracy: 0.7542 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml +created_at: '2026-01-31 16:15:11' +author: undertheseanlp diff --git a/models/pos_tagger/final-baseline-10iter/model.crf b/models/pos_tagger/final-baseline-10iter/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..1b2043434691fc8960d2103d24d5535903b8d000 --- /dev/null +++ b/models/pos_tagger/final-baseline-10iter/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b729820a3972fd5806c25ea70ea287ac4108e9bd2ca6e30b5c86b24718c215fc +size 29270222 diff --git a/models/pos_tagger/optimized-pos/metadata.yaml b/models/pos_tagger/optimized-pos/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..665592bb829bb1af98843265389cf0568381f1d7 --- /dev/null +++ b/models/pos_tagger/optimized-pos/metadata.yaml @@ -0,0 +1,26 @@ +model: + name: Vietnamese POS Tagger + version: optimized-pos + type: CRF (Conditional Random Field) + framework: underthesea-core +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 200 + duration_seconds: 349.17 +performance: + test_accuracy: 0.9598 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml +created_at: '2026-01-31 15:23:20' +author: undertheseanlp diff --git a/models/pos_tagger/optimized-pos/model.crf b/models/pos_tagger/optimized-pos/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..40413ffbabf24bbeb94e3dfd8dc33618d528a574 --- /dev/null +++ b/models/pos_tagger/optimized-pos/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:712a51a7b354ad1eb9a0a3b62a33163c935da437a1bd53b3c7c00f7aa84a3f05 +size 25482030 diff --git a/models/pos_tagger/parallel-10iter/metadata.yaml b/models/pos_tagger/parallel-10iter/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5a0a5b7e41b0a3ec0521ea6d68a63929c3bab622 --- /dev/null +++ b/models/pos_tagger/parallel-10iter/metadata.yaml @@ -0,0 +1,26 @@ +model: + name: Vietnamese POS Tagger + version: parallel-10iter + type: CRF (Conditional Random Field) + framework: underthesea-core +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 10 + duration_seconds: 61.77 +performance: + test_accuracy: 0.7542 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml +created_at: '2026-01-31 16:19:29' +author: undertheseanlp diff --git a/models/pos_tagger/parallel-10iter/model.crf b/models/pos_tagger/parallel-10iter/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..2f4a3dbe38cb6312ae94bc897e41fe1a2c1e3d50 --- /dev/null +++ b/models/pos_tagger/parallel-10iter/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e2e3b99da852cfa438e3914ac4aab20f00e100709f4cecd325ccf2af60c57b8 +size 29270222 diff --git a/models/pos_tagger/python-crfsuite-v1/metadata.yaml b/models/pos_tagger/python-crfsuite-v1/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ea0f1374fe87e9ec7537b05ba2a04339bef0b17a --- /dev/null +++ b/models/pos_tagger/python-crfsuite-v1/metadata.yaml @@ -0,0 +1,26 @@ +model: + name: Vietnamese POS Tagger + version: python-crfsuite-v1 + type: CRF (Conditional Random Field) + framework: python-crfsuite +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 200 + duration_seconds: 290.43 +performance: + test_accuracy: 0.9598 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml +created_at: '2026-01-31 11:00:59' +author: undertheseanlp diff --git a/models/pos_tagger/python-crfsuite-v1/model.crfsuite b/models/pos_tagger/python-crfsuite-v1/model.crfsuite new file mode 100644 index 0000000000000000000000000000000000000000..01e28c463107974d807608ff0591f71c264c6b11 --- /dev/null +++ b/models/pos_tagger/python-crfsuite-v1/model.crfsuite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aaf38e66198bafeac12b38cb6403656c8e51472e840e84699494b29034632ebe +size 2139164 diff --git a/models/pos_tagger/python-crfsuite/metadata.yaml b/models/pos_tagger/python-crfsuite/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e8c6272de391bbe056943aa99e90489ab3eabd97 --- /dev/null +++ b/models/pos_tagger/python-crfsuite/metadata.yaml @@ -0,0 +1,26 @@ +model: + name: Vietnamese POS Tagger + version: python-crfsuite + type: CRF (Conditional Random Field) + framework: python-crfsuite +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 100 + duration_seconds: 148.29 +performance: + test_accuracy: 0.9589 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml +created_at: '2026-01-31 07:30:01' +author: undertheseanlp diff --git a/models/pos_tagger/python-crfsuite/model.crfsuite b/models/pos_tagger/python-crfsuite/model.crfsuite new file mode 100644 index 0000000000000000000000000000000000000000..0df4ec4a8af56c7c1516e7acc6ca3d7fad8d4dd3 --- /dev/null +++ b/models/pos_tagger/python-crfsuite/model.crfsuite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27dfbf196829379c69feda056d53482b3cc69a7f134fc5b853b0ba3a0f80f139 +size 2366076 diff --git a/models/pos_tagger/simd-avx2-10iter/metadata.yaml b/models/pos_tagger/simd-avx2-10iter/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..05883f9a3a8bb03bac4175182bc23d17787819ef --- /dev/null +++ b/models/pos_tagger/simd-avx2-10iter/metadata.yaml @@ -0,0 +1,26 @@ +model: + name: Vietnamese POS Tagger + version: simd-avx2-10iter + type: CRF (Conditional Random Field) + framework: underthesea-core +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 10 + duration_seconds: 65.04 +performance: + test_accuracy: 0.7542 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml +created_at: '2026-01-31 16:04:12' +author: undertheseanlp diff --git a/models/pos_tagger/simd-avx2-10iter/model.crf b/models/pos_tagger/simd-avx2-10iter/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..c080f62498ee195d79319a21683486a8b8c754ed --- /dev/null +++ b/models/pos_tagger/simd-avx2-10iter/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:731cfa98e9005c7efb6ba5f58aedef6118680f9bb2e901a7517c68ad4eeb41e0 +size 29270222 diff --git a/models/pos_tagger/simd-v1/metadata.yaml b/models/pos_tagger/simd-v1/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..73ff7cd8d6b3e4f16e3d3f528cd7b3a323f81a6e --- /dev/null +++ b/models/pos_tagger/simd-v1/metadata.yaml @@ -0,0 +1,26 @@ +model: + name: Vietnamese POS Tagger + version: simd-v1 + type: CRF (Conditional Random Field) + framework: underthesea-core +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 10 + duration_seconds: 56.64 +performance: + test_accuracy: 0.7542 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml +created_at: '2026-01-31 15:33:20' +author: undertheseanlp diff --git a/models/pos_tagger/simd-v1/model.crf b/models/pos_tagger/simd-v1/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..f61199c78b17c916018c3a8d3bfeae7af15e1b20 --- /dev/null +++ b/models/pos_tagger/simd-v1/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:418923286348fc36a7a10d1c3b14a012de5a42590049e9c8e69ecdd128ffcf8a +size 29270222 diff --git a/models/pos_tagger/simd-v2/metadata.yaml b/models/pos_tagger/simd-v2/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9737d32165c1e4d5dbca87dcbca75ede7932995d --- /dev/null +++ b/models/pos_tagger/simd-v2/metadata.yaml @@ -0,0 +1,26 @@ +model: + name: Vietnamese POS Tagger + version: simd-v2 + type: CRF (Conditional Random Field) + framework: underthesea-core +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 10 + duration_seconds: 59.18 +performance: + test_accuracy: 0.7542 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml +created_at: '2026-01-31 15:34:37' +author: undertheseanlp diff --git a/models/pos_tagger/simd-v2/model.crf b/models/pos_tagger/simd-v2/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..3bb3cc14171a487ee482e8a18e86d56c1b356fb7 --- /dev/null +++ b/models/pos_tagger/simd-v2/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d11d51e4b8e635204e8331cf2929102a211c7bbc44aac86f5f3e0407df6b6eb5 +size 29270222 diff --git a/models/pos_tagger/test-200iter-crfsuite/metadata.yaml b/models/pos_tagger/test-200iter-crfsuite/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7c760e2b71e0918c6d97f30ef257897db02d3d01 --- /dev/null +++ b/models/pos_tagger/test-200iter-crfsuite/metadata.yaml @@ -0,0 +1,26 @@ +model: + name: Vietnamese POS Tagger + version: test-200iter-crfsuite + type: CRF (Conditional Random Field) + framework: python-crfsuite +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 200 + duration_seconds: 304.92 +performance: + test_accuracy: 0.9598 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml +created_at: '2026-01-31 16:31:06' +author: undertheseanlp diff --git a/models/pos_tagger/test-200iter-crfsuite/model.crfsuite b/models/pos_tagger/test-200iter-crfsuite/model.crfsuite new file mode 100644 index 0000000000000000000000000000000000000000..01e28c463107974d807608ff0591f71c264c6b11 --- /dev/null +++ b/models/pos_tagger/test-200iter-crfsuite/model.crfsuite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aaf38e66198bafeac12b38cb6403656c8e51472e840e84699494b29034632ebe +size 2139164 diff --git a/models/pos_tagger/test-200iter/metadata.yaml b/models/pos_tagger/test-200iter/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2c428ba6987bf9bf91a2dc143a3cb4592c7697a2 --- /dev/null +++ b/models/pos_tagger/test-200iter/metadata.yaml @@ -0,0 +1,26 @@ +model: + name: Vietnamese POS Tagger + version: test-200iter + type: CRF (Conditional Random Field) + framework: underthesea-core +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 200 + duration_seconds: 294.05 +performance: + test_accuracy: 0.9597 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml +created_at: '2026-01-31 16:25:50' +author: undertheseanlp diff --git a/models/pos_tagger/test-200iter/model.crf b/models/pos_tagger/test-200iter/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..2f997132e0e50979519dc202ded08eec7fd4697a --- /dev/null +++ b/models/pos_tagger/test-200iter/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5958fc70df813ace61fb161c290f72af5924f92da40ad4df5812f5cb631820da +size 25482814 diff --git a/models/pos_tagger/test-crfsuite-style/metadata.yaml b/models/pos_tagger/test-crfsuite-style/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9beec08d4845906f32cf61c7a537c09fc683901f --- /dev/null +++ b/models/pos_tagger/test-crfsuite-style/metadata.yaml @@ -0,0 +1,26 @@ +model: + name: Vietnamese POS Tagger + version: test-crfsuite-style + type: CRF (Conditional Random Field) + framework: underthesea-core +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 100 + duration_seconds: 461.7 +performance: + test_accuracy: 0.9362 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml +created_at: '2026-01-31 08:45:28' +author: undertheseanlp diff --git a/models/pos_tagger/test-crfsuite-style/model.crf b/models/pos_tagger/test-crfsuite-style/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..956af1a8eead732ba1d6cc0567573a775472c28b --- /dev/null +++ b/models/pos_tagger/test-crfsuite-style/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:803bd523eaf33c31cd1ce232d38c9d480f80311b5df296a26bac04326b256e11 +size 27880622 diff --git a/models/pos_tagger/test-speed/metadata.yaml b/models/pos_tagger/test-speed/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..78ecde133f65e9ae0846bde2be6341ebf7547287 --- /dev/null +++ b/models/pos_tagger/test-speed/metadata.yaml @@ -0,0 +1,26 @@ +model: + name: Vietnamese POS Tagger + version: test-speed + type: CRF (Conditional Random Field) + framework: underthesea-core +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 100 + duration_seconds: 217.03 +performance: + test_accuracy: 0.9362 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml +created_at: '2026-01-31 08:32:29' +author: undertheseanlp diff --git a/models/pos_tagger/test-speed/model.crf b/models/pos_tagger/test-speed/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..5eee8cd0c1c84a554caccec8da88991ae7e88ea7 --- /dev/null +++ b/models/pos_tagger/test-speed/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21e4cdf643144be76574e7e9f3441baacafb6fcf7238adb64eac142feb5f63d +size 27880622 diff --git a/models/pos_tagger/underthesea-core-optimized/metadata.yaml b/models/pos_tagger/underthesea-core-optimized/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8f2f8e6fc8b082875c43e03b377ace2adc04c1c3 --- /dev/null +++ b/models/pos_tagger/underthesea-core-optimized/metadata.yaml @@ -0,0 +1,26 @@ +model: + name: Vietnamese POS Tagger + version: underthesea-core-optimized + type: CRF (Conditional Random Field) + framework: underthesea-core +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 100 + duration_seconds: 194.48 +performance: + test_accuracy: 0.9362 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml +created_at: '2026-01-31 07:52:48' +author: undertheseanlp diff --git a/models/pos_tagger/underthesea-core-optimized/model.crf b/models/pos_tagger/underthesea-core-optimized/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..571bc6a4143ca7f4048856fc972dac6d9b755ce1 --- /dev/null +++ b/models/pos_tagger/underthesea-core-optimized/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa6597733686d5c89ade1ae65a22f593ac1a1880728ff5547598e16efec37beb +size 27880622 diff --git a/models/pos_tagger/underthesea-core-v2/metadata.yaml b/models/pos_tagger/underthesea-core-v2/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..29eebb57e4a80ebea58b7b2affff71971032e46c --- /dev/null +++ b/models/pos_tagger/underthesea-core-v2/metadata.yaml @@ -0,0 +1,26 @@ +model: + name: Vietnamese POS Tagger + version: underthesea-core-v2 + type: CRF (Conditional Random Field) + framework: underthesea-core +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 200 + duration_seconds: 351.01 +performance: + test_accuracy: 0.9556 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml +created_at: '2026-01-31 08:08:10' +author: undertheseanlp diff --git a/models/pos_tagger/underthesea-core-v2/model.crf b/models/pos_tagger/underthesea-core-v2/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..c62b51f1c000a508459afce4a787aff31f9d9db2 --- /dev/null +++ b/models/pos_tagger/underthesea-core-v2/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9f6fb7fa8c3390597439726b643aa74ee6581d57bbc003ab4b161cebc2cbeeb +size 26618990 diff --git a/models/pos_tagger/underthesea-core-v3/metadata.yaml b/models/pos_tagger/underthesea-core-v3/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d6ef125131f302d32cc81768d10b909554c57f2b --- /dev/null +++ b/models/pos_tagger/underthesea-core-v3/metadata.yaml @@ -0,0 +1,26 @@ +model: + name: Vietnamese POS Tagger + version: underthesea-core-v3 + type: CRF (Conditional Random Field) + framework: underthesea-core +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 200 + duration_seconds: 544.99 +performance: + test_accuracy: 0.9598 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml +created_at: '2026-01-31 10:52:59' +author: undertheseanlp diff --git a/models/pos_tagger/underthesea-core-v3/model.crf b/models/pos_tagger/underthesea-core-v3/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..e2d0124d9522952e4a3b105573dc7cd92f44a521 --- /dev/null +++ b/models/pos_tagger/underthesea-core-v3/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4d9d411cf55d00c58cf83333bd7bb0b66898bd78cc34dea2e6f93271a5f6a56 +size 25482670 diff --git a/models/pos_tagger/underthesea-core-v4/metadata.yaml b/models/pos_tagger/underthesea-core-v4/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c86ecacf8f4fafad88cae8a7ed76651d1f605d28 --- /dev/null +++ b/models/pos_tagger/underthesea-core-v4/metadata.yaml @@ -0,0 +1,26 @@ +model: + name: Vietnamese POS Tagger + version: underthesea-core-v4 + type: CRF (Conditional Random Field) + framework: underthesea-core +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + val_sentences: 859 + test_sentences: 859 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 200 + duration_seconds: 479.03 +performance: + test_accuracy: 0.9596 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/pos_tagger.yaml +created_at: '2026-01-31 11:58:34' +author: undertheseanlp diff --git a/models/pos_tagger/underthesea-core-v4/model.crf b/models/pos_tagger/underthesea-core-v4/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..f05ec0b78c82f596fd570e5233d75a7ccd0c9fa2 --- /dev/null +++ b/models/pos_tagger/underthesea-core-v4/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e17ddb2c63318801c9fb770a53197b7d806fdf0cc57c12fcac771644c9248a2 +size 25482782 diff --git a/models/word_segmentation/20260131_000000/metadata.yaml b/models/word_segmentation/20260131_000000/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..66ba1a0b6cb9c86919b0ce6ea0cc2d8fc6b9b9cb --- /dev/null +++ b/models/word_segmentation/20260131_000000/metadata.yaml @@ -0,0 +1,36 @@ +# Word Segmentation Model Metadata +# Auto-generated during training + +model: + name: Vietnamese Word Segmentation + version: "20260131_000000" + type: CRF (Conditional Random Field) + framework: python-crfsuite + tagging_scheme: BIO + +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 100 + +performance: + syllable_accuracy: 0.9890 + syllable_f1: 0.9890 + word_precision: 0.9802 + word_recall: 0.9801 + word_f1: 0.9801 + +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml + +created_at: "2026-01-31" +author: undertheseanlp diff --git a/models/word_segmentation/20260131_000000/model.crfsuite b/models/word_segmentation/20260131_000000/model.crfsuite new file mode 100644 index 0000000000000000000000000000000000000000..c00379d70fed822c7a2f73bd18914148d855d6ce --- /dev/null +++ b/models/word_segmentation/20260131_000000/model.crfsuite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56dc5e49912bf944679695507f22876861da892faf627ce6ea26a249bc82c8d4 +size 1093088 diff --git a/models/word_segmentation/20260131_041701/examples.output b/models/word_segmentation/20260131_041701/examples.output new file mode 100644 index 0000000000000000000000000000000000000000..08c6a1589f032ac4b69e8a51abc920d6363b4f82 --- /dev/null +++ b/models/word_segmentation/20260131_041701/examples.output @@ -0,0 +1,2 @@ +Trên thế_giới , giá_vàng đang được giao_dịch ở mức 5.068 USD / ounce , mất thêm khoảng 280 đồng / USD so với phiên sáng_. Nếu tính trong một phiên , giá_vàng mất tổng_cộng gần 500 USD / ounce ( tương_đương mức giảm khoảng 15 triệu đồng ) ._Đây là mức giảm kỷ_lục trong lịch_sử biến_động của kim_loại quý này . +Hiện_giá vàng thế_giới quy_đổi theo tỷ_giá Vietcombank ( chưa bao_gồm thuế , phí ) vào_khoảng 160,4 triệu đồng /_lượng , thấp hơn vàng trong nước gần 20 triệu đồng /_lượng . diff --git a/models/word_segmentation/20260131_041701/metadata.yaml b/models/word_segmentation/20260131_041701/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fc75c142a3d7ce2c5cc371916f9611e92aaf9d27 --- /dev/null +++ b/models/word_segmentation/20260131_041701/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: '20260131_041701' + type: CRF (Conditional Random Field) + framework: python-crfsuite + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 100 + duration_seconds: 103.65 +performance: + syllable_accuracy: 0.989 + syllable_f1: 0.989 + word_precision: 0.9802 + word_recall: 0.9801 + word_f1: 0.9801 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 04:18:45' +author: undertheseanlp diff --git a/models/word_segmentation/20260131_041701/model.crfsuite b/models/word_segmentation/20260131_041701/model.crfsuite new file mode 100644 index 0000000000000000000000000000000000000000..c00379d70fed822c7a2f73bd18914148d855d6ce --- /dev/null +++ b/models/word_segmentation/20260131_041701/model.crfsuite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56dc5e49912bf944679695507f22876861da892faf627ce6ea26a249bc82c8d4 +size 1093088 diff --git a/models/word_segmentation/20260131_060411/metadata.yaml b/models/word_segmentation/20260131_060411/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6ccd69f68fc5b574e45a6f204dd06a3b0c382986 --- /dev/null +++ b/models/word_segmentation/20260131_060411/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: '20260131_060411' + type: CRF (Conditional Random Field) + framework: underthesea-core + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 100 + duration_seconds: 125.06 +performance: + syllable_accuracy: 0.989 + syllable_f1: 0.989 + word_precision: 0.9802 + word_recall: 0.9801 + word_f1: 0.9801 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 06:06:16' +author: undertheseanlp diff --git a/models/word_segmentation/20260131_060411/model.crfsuite b/models/word_segmentation/20260131_060411/model.crfsuite new file mode 100644 index 0000000000000000000000000000000000000000..c00379d70fed822c7a2f73bd18914148d855d6ce --- /dev/null +++ b/models/word_segmentation/20260131_060411/model.crfsuite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56dc5e49912bf944679695507f22876861da892faf627ce6ea26a249bc82c8d4 +size 1093088 diff --git a/models/word_segmentation/20260131_061406/metadata.yaml b/models/word_segmentation/20260131_061406/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a3ba4703ef49189b7e9b353441a9d3a6f9ad00dd --- /dev/null +++ b/models/word_segmentation/20260131_061406/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: '20260131_061406' + type: CRF (Conditional Random Field) + framework: underthesea-core + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 100 + duration_seconds: 94.48 +performance: + syllable_accuracy: 0.9774 + syllable_f1: 0.9774 + word_precision: 0.9582 + word_recall: 0.9583 + word_f1: 0.9582 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 06:15:40' +author: undertheseanlp diff --git a/models/word_segmentation/20260131_061406/model.crf b/models/word_segmentation/20260131_061406/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..18a72324e85a86ee5871181c6e1f521cf03c3431 --- /dev/null +++ b/models/word_segmentation/20260131_061406/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04d718662d750fb2a1f55eabebc06032c0290f248608c689f43505b619e09016 +size 50022442 diff --git a/models/word_segmentation/baseline-10iter-b/metadata.yaml b/models/word_segmentation/baseline-10iter-b/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4957cec1e51be699ae8c29657702e01aa20b2ed1 --- /dev/null +++ b/models/word_segmentation/baseline-10iter-b/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: baseline-10iter-b + type: CRF (Conditional Random Field) + framework: python-crfsuite + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 10 + duration_seconds: 45.93 +performance: + syllable_accuracy: 0.8982 + syllable_f1: 0.8989 + word_precision: 0.8212 + word_recall: 0.8077 + word_f1: 0.8144 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 15:13:06' +author: undertheseanlp diff --git a/models/word_segmentation/baseline-10iter-b/model.crfsuite b/models/word_segmentation/baseline-10iter-b/model.crfsuite new file mode 100644 index 0000000000000000000000000000000000000000..5b915a80bb34cd3de1dc1a2b4d0128b26583fa87 --- /dev/null +++ b/models/word_segmentation/baseline-10iter-b/model.crfsuite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f157e3441b629705edbff3f703aa5f814f6165877273ca1888e171570260817d +size 7347980 diff --git a/models/word_segmentation/baseline-10iter/metadata.yaml b/models/word_segmentation/baseline-10iter/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..379c010f6c418a2235b7f87747748f5af6f7b13f --- /dev/null +++ b/models/word_segmentation/baseline-10iter/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: baseline-10iter + type: CRF (Conditional Random Field) + framework: python-crfsuite + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 10 + duration_seconds: 44.97 +performance: + syllable_accuracy: 0.8982 + syllable_f1: 0.8989 + word_precision: 0.8212 + word_recall: 0.8077 + word_f1: 0.8144 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 15:12:03' +author: undertheseanlp diff --git a/models/word_segmentation/baseline-10iter/model.crfsuite b/models/word_segmentation/baseline-10iter/model.crfsuite new file mode 100644 index 0000000000000000000000000000000000000000..5b915a80bb34cd3de1dc1a2b4d0128b26583fa87 --- /dev/null +++ b/models/word_segmentation/baseline-10iter/model.crfsuite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f157e3441b629705edbff3f703aa5f814f6165877273ca1888e171570260817d +size 7347980 diff --git a/models/word_segmentation/baseline-python-crfsuite-10iter/metadata.yaml b/models/word_segmentation/baseline-python-crfsuite-10iter/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..55f3060a72fdb0fde641682e942c6af97857cd79 --- /dev/null +++ b/models/word_segmentation/baseline-python-crfsuite-10iter/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: baseline-python-crfsuite-10iter + type: CRF (Conditional Random Field) + framework: python-crfsuite + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 10 + duration_seconds: 47.28 +performance: + syllable_accuracy: 0.8982 + syllable_f1: 0.8989 + word_precision: 0.8212 + word_recall: 0.8077 + word_f1: 0.8144 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 15:58:15' +author: undertheseanlp diff --git a/models/word_segmentation/baseline-python-crfsuite-10iter/model.crfsuite b/models/word_segmentation/baseline-python-crfsuite-10iter/model.crfsuite new file mode 100644 index 0000000000000000000000000000000000000000..5b915a80bb34cd3de1dc1a2b4d0128b26583fa87 --- /dev/null +++ b/models/word_segmentation/baseline-python-crfsuite-10iter/model.crfsuite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f157e3441b629705edbff3f703aa5f814f6165877273ca1888e171570260817d +size 7347980 diff --git a/models/word_segmentation/baseline-underthesea-10iter/metadata.yaml b/models/word_segmentation/baseline-underthesea-10iter/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cbabd29e578d05685168f31116c1d6bad464ccc8 --- /dev/null +++ b/models/word_segmentation/baseline-underthesea-10iter/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: baseline-underthesea-10iter + type: CRF (Conditional Random Field) + framework: underthesea-core + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 10 + duration_seconds: 38.71 +performance: + syllable_accuracy: 0.9019 + syllable_f1: 0.902 + word_precision: 0.8292 + word_recall: 0.827 + word_f1: 0.8281 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 15:57:19' +author: undertheseanlp diff --git a/models/word_segmentation/baseline-underthesea-10iter/model.crf b/models/word_segmentation/baseline-underthesea-10iter/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..b9647c3db47bb104ebbb5d9112321950b2c53089 --- /dev/null +++ b/models/word_segmentation/baseline-underthesea-10iter/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fe63c8bb66dbda5bd4869a87c784c9e70d553ad014e6c28d5f8c476a325bca1 +size 35092842 diff --git a/models/word_segmentation/crfsuiters_c2_1/metadata.yaml b/models/word_segmentation/crfsuiters_c2_1/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..db42312478a9bfcfae9295b9413d3038a01a09d4 --- /dev/null +++ b/models/word_segmentation/crfsuiters_c2_1/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: crfsuiters_c2_1 + type: CRF (Conditional Random Field) + framework: crfsuite-rs + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 0.0 + c2: 1.0 + max_iterations: 100 + duration_seconds: 118.28 +performance: + syllable_accuracy: 0.9848 + syllable_f1: 0.9848 + word_precision: 0.9717 + word_recall: 0.9728 + word_f1: 0.9723 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 07:12:56' +author: undertheseanlp diff --git a/models/word_segmentation/crfsuiters_c2_1/model.crfsuite b/models/word_segmentation/crfsuiters_c2_1/model.crfsuite new file mode 100644 index 0000000000000000000000000000000000000000..36044b447ce3decba9cda0ab2cc7f5b738033e0e --- /dev/null +++ b/models/word_segmentation/crfsuiters_c2_1/model.crfsuite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c28c79dd6303b614811a26e328d525035b600d050982c91b6779919d9d2ed0b +size 44378708 diff --git a/models/word_segmentation/crfsuiters_v1/metadata.yaml b/models/word_segmentation/crfsuiters_v1/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..669a349bce81d6035fbeb541af09ed1afbcbcae2 --- /dev/null +++ b/models/word_segmentation/crfsuiters_v1/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: crfsuiters_v1 + type: CRF (Conditional Random Field) + framework: crfsuite-rs + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 100 + duration_seconds: 112.83 +performance: + syllable_accuracy: 0.989 + syllable_f1: 0.989 + word_precision: 0.9802 + word_recall: 0.9801 + word_f1: 0.9801 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 06:39:29' +author: undertheseanlp diff --git a/models/word_segmentation/crfsuiters_v1/model.crfsuite b/models/word_segmentation/crfsuiters_v1/model.crfsuite new file mode 100644 index 0000000000000000000000000000000000000000..c00379d70fed822c7a2f73bd18914148d855d6ce --- /dev/null +++ b/models/word_segmentation/crfsuiters_v1/model.crfsuite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56dc5e49912bf944679695507f22876861da892faf627ce6ea26a249bc82c8d4 +size 1093088 diff --git a/models/word_segmentation/fast-exp-10iter/metadata.yaml b/models/word_segmentation/fast-exp-10iter/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d43442763a5cac17b127ef2600d028337f082598 --- /dev/null +++ b/models/word_segmentation/fast-exp-10iter/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: fast-exp-10iter + type: CRF (Conditional Random Field) + framework: underthesea-core + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 10 + duration_seconds: 38.08 +performance: + syllable_accuracy: 0.9012 + syllable_f1: 0.9024 + word_precision: 0.837 + word_recall: 0.8129 + word_f1: 0.8248 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 16:11:22' +author: undertheseanlp diff --git a/models/word_segmentation/fast-exp-10iter/model.crf b/models/word_segmentation/fast-exp-10iter/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..556fb85ac297940c169db7841ad33a25563e000c --- /dev/null +++ b/models/word_segmentation/fast-exp-10iter/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03822bb5d4446235e55201c93d2f924f61fa0ff072c3e74e08f532875764e733 +size 35114410 diff --git a/models/word_segmentation/optimized-200iter/metadata.yaml b/models/word_segmentation/optimized-200iter/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5e74b91792c5a9500cc8d23ee7641819e74634b5 --- /dev/null +++ b/models/word_segmentation/optimized-200iter/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: optimized-200iter + type: CRF (Conditional Random Field) + framework: underthesea-core + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 200 + duration_seconds: 133.05 +performance: + syllable_accuracy: 0.9889 + syllable_f1: 0.9889 + word_precision: 0.98 + word_recall: 0.9799 + word_f1: 0.9799 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 15:15:39' +author: undertheseanlp diff --git a/models/word_segmentation/optimized-200iter/model.crf b/models/word_segmentation/optimized-200iter/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..3a1531eed2910318b7d759875942bf4906280e87 --- /dev/null +++ b/models/word_segmentation/optimized-200iter/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53a0dd23de6e4ee6a33d29e1ed6db03064350e4af71207932dab6d4feea1259b +size 33274810 diff --git a/models/word_segmentation/optimized-v1/metadata.yaml b/models/word_segmentation/optimized-v1/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..854a162d43cbca666e6f71cd62b22f6c1d74d590 --- /dev/null +++ b/models/word_segmentation/optimized-v1/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: optimized-v1 + type: CRF (Conditional Random Field) + framework: underthesea-core + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 10 + duration_seconds: 36.55 +performance: + syllable_accuracy: 0.9019 + syllable_f1: 0.902 + word_precision: 0.8292 + word_recall: 0.827 + word_f1: 0.8281 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 15:10:57' +author: undertheseanlp diff --git a/models/word_segmentation/optimized-v1/model.crf b/models/word_segmentation/optimized-v1/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..932a06220adbcf211d4e3ec7eb746143c71d5760 --- /dev/null +++ b/models/word_segmentation/optimized-v1/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4f2f7440bc1755d5f0c46cca8693bc112e3e1aac3134bb033aa1de339a63c62 +size 35092842 diff --git a/models/word_segmentation/pycrfsuite_c2_1/metadata.yaml b/models/word_segmentation/pycrfsuite_c2_1/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e19bcc8e057d0d1bb2a2ffb90035ce392b8a24d6 --- /dev/null +++ b/models/word_segmentation/pycrfsuite_c2_1/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: pycrfsuite_c2_1 + type: CRF (Conditional Random Field) + framework: python-crfsuite + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 0.0 + c2: 1.0 + max_iterations: 100 + duration_seconds: 103.43 +performance: + syllable_accuracy: 0.9848 + syllable_f1: 0.9848 + word_precision: 0.9717 + word_recall: 0.9728 + word_f1: 0.9723 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 07:06:24' +author: undertheseanlp diff --git a/models/word_segmentation/pycrfsuite_c2_1/model.crfsuite b/models/word_segmentation/pycrfsuite_c2_1/model.crfsuite new file mode 100644 index 0000000000000000000000000000000000000000..36044b447ce3decba9cda0ab2cc7f5b738033e0e --- /dev/null +++ b/models/word_segmentation/pycrfsuite_c2_1/model.crfsuite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c28c79dd6303b614811a26e328d525035b600d050982c91b6779919d9d2ed0b +size 44378708 diff --git a/models/word_segmentation/pycrfsuite_v1/metadata.yaml b/models/word_segmentation/pycrfsuite_v1/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ab2c0ea5b6d71dfe2b8c8a230e2f46d7b3351dd5 --- /dev/null +++ b/models/word_segmentation/pycrfsuite_v1/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: pycrfsuite_v1 + type: CRF (Conditional Random Field) + framework: python-crfsuite + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 100 + duration_seconds: 94.15 +performance: + syllable_accuracy: 0.989 + syllable_f1: 0.989 + word_precision: 0.9802 + word_recall: 0.9801 + word_f1: 0.9801 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 06:37:31' +author: undertheseanlp diff --git a/models/word_segmentation/pycrfsuite_v1/model.crfsuite b/models/word_segmentation/pycrfsuite_v1/model.crfsuite new file mode 100644 index 0000000000000000000000000000000000000000..c00379d70fed822c7a2f73bd18914148d855d6ce --- /dev/null +++ b/models/word_segmentation/pycrfsuite_v1/model.crfsuite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56dc5e49912bf944679695507f22876861da892faf627ce6ea26a249bc82c8d4 +size 1093088 diff --git a/models/word_segmentation/python-crfsuite-20260131/metadata.yaml b/models/word_segmentation/python-crfsuite-20260131/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a156054d8ca4ef7e748b034a4c9880004a49a212 --- /dev/null +++ b/models/word_segmentation/python-crfsuite-20260131/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: python-crfsuite-20260131 + type: CRF (Conditional Random Field) + framework: python-crfsuite + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 100 + duration_seconds: 96.69 +performance: + syllable_accuracy: 0.989 + syllable_f1: 0.989 + word_precision: 0.9802 + word_recall: 0.9801 + word_f1: 0.9801 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 06:31:20' +author: undertheseanlp diff --git a/models/word_segmentation/python-crfsuite-20260131/model.crfsuite b/models/word_segmentation/python-crfsuite-20260131/model.crfsuite new file mode 100644 index 0000000000000000000000000000000000000000..c00379d70fed822c7a2f73bd18914148d855d6ce --- /dev/null +++ b/models/word_segmentation/python-crfsuite-20260131/model.crfsuite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56dc5e49912bf944679695507f22876861da892faf627ce6ea26a249bc82c8d4 +size 1093088 diff --git a/models/word_segmentation/python-crfsuite-v1/metadata.yaml b/models/word_segmentation/python-crfsuite-v1/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f2e40a58a92fb3ec3c58429b3b5e7a563563a05b --- /dev/null +++ b/models/word_segmentation/python-crfsuite-v1/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: python-crfsuite-v1 + type: CRF (Conditional Random Field) + framework: python-crfsuite + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 200 + duration_seconds: 154.39 +performance: + syllable_accuracy: 0.9889 + syllable_f1: 0.9889 + word_precision: 0.98 + word_recall: 0.9799 + word_f1: 0.98 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 10:58:43' +author: undertheseanlp diff --git a/models/word_segmentation/python-crfsuite-v1/model.crfsuite b/models/word_segmentation/python-crfsuite-v1/model.crfsuite new file mode 100644 index 0000000000000000000000000000000000000000..7bb287a1e691c1880f139a6353e1f00cb80f0874 --- /dev/null +++ b/models/word_segmentation/python-crfsuite-v1/model.crfsuite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:686c105bfeadadb2240cabc9df1a9d8f278beb3e7bb0db39562a21601b4bf1fc +size 996768 diff --git a/models/word_segmentation/simd-avx2-10iter/metadata.yaml b/models/word_segmentation/simd-avx2-10iter/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..000018c0a83f7f023e773851202121cf53c786bf --- /dev/null +++ b/models/word_segmentation/simd-avx2-10iter/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: simd-avx2-10iter + type: CRF (Conditional Random Field) + framework: underthesea-core + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 10 + duration_seconds: 38.67 +performance: + syllable_accuracy: 0.9019 + syllable_f1: 0.902 + word_precision: 0.8292 + word_recall: 0.827 + word_f1: 0.8281 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 16:05:04' +author: undertheseanlp diff --git a/models/word_segmentation/simd-avx2-10iter/model.crf b/models/word_segmentation/simd-avx2-10iter/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..0372bb3f8a88875560fa9f4c7538a8b7f344854c --- /dev/null +++ b/models/word_segmentation/simd-avx2-10iter/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:233ed97d7517011f4b730bac446c0cee61dd5857c28ee5eadc993b57f69b573f +size 35092842 diff --git a/models/word_segmentation/test-200iter-crfsuite/metadata.yaml b/models/word_segmentation/test-200iter-crfsuite/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e7e0210c90aa401268a8b70e2f10b5a115c01ea6 --- /dev/null +++ b/models/word_segmentation/test-200iter-crfsuite/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: test-200iter-crfsuite + type: CRF (Conditional Random Field) + framework: python-crfsuite + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 200 + duration_seconds: 162.61 +performance: + syllable_accuracy: 0.9889 + syllable_f1: 0.9889 + word_precision: 0.98 + word_recall: 0.9799 + word_f1: 0.98 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 16:36:22' +author: undertheseanlp diff --git a/models/word_segmentation/test-200iter-crfsuite/model.crfsuite b/models/word_segmentation/test-200iter-crfsuite/model.crfsuite new file mode 100644 index 0000000000000000000000000000000000000000..7bb287a1e691c1880f139a6353e1f00cb80f0874 --- /dev/null +++ b/models/word_segmentation/test-200iter-crfsuite/model.crfsuite @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:686c105bfeadadb2240cabc9df1a9d8f278beb3e7bb0db39562a21601b4bf1fc +size 996768 diff --git a/models/word_segmentation/test-200iter/metadata.yaml b/models/word_segmentation/test-200iter/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45aaf9ce61117671ee8ca0ad77512eb66316e007 --- /dev/null +++ b/models/word_segmentation/test-200iter/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: test-200iter + type: CRF (Conditional Random Field) + framework: underthesea-core + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 200 + duration_seconds: 123.32 +performance: + syllable_accuracy: 0.9889 + syllable_f1: 0.9889 + word_precision: 0.98 + word_recall: 0.9799 + word_f1: 0.98 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 16:33:28' +author: undertheseanlp diff --git a/models/word_segmentation/test-200iter/model.crf b/models/word_segmentation/test-200iter/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..a8bc1deaa2b8a63e5dfdd9f15ee7d7289f0fd128 --- /dev/null +++ b/models/word_segmentation/test-200iter/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50a23e213812076a30e9b4b90920a8a27e6a79de7d811893f72d9a8968cffbe2 +size 33275866 diff --git a/models/word_segmentation/underthesea-core-v1/metadata.yaml b/models/word_segmentation/underthesea-core-v1/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7482bce166254a6b3f859b9012570179fa72910b --- /dev/null +++ b/models/word_segmentation/underthesea-core-v1/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: underthesea-core-v1 + type: CRF (Conditional Random Field) + framework: underthesea-core + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 200 + duration_seconds: 417.86 +performance: + syllable_accuracy: 0.9889 + syllable_f1: 0.9889 + word_precision: 0.9801 + word_recall: 0.9798 + word_f1: 0.98 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 10:50:51' +author: undertheseanlp diff --git a/models/word_segmentation/underthesea-core-v1/model.crf b/models/word_segmentation/underthesea-core-v1/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..0ab6f4f5b2f5952dd6db991a5437cb55bff295cf --- /dev/null +++ b/models/word_segmentation/underthesea-core-v1/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:053d6d3f916209f7e92c7a91abf351436ee9cfe03df6a85c81ca53573b6e3b51 +size 33275498 diff --git a/models/word_segmentation/underthesea-core-v4/metadata.yaml b/models/word_segmentation/underthesea-core-v4/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..81bd0e87e4666611a02c7d5ba4679700a232cd99 --- /dev/null +++ b/models/word_segmentation/underthesea-core-v4/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: underthesea-core-v4 + type: CRF (Conditional Random Field) + framework: underthesea-core + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 200 + duration_seconds: 1072.43 +performance: + syllable_accuracy: 0.9889 + syllable_f1: 0.9889 + word_precision: 0.9801 + word_recall: 0.9799 + word_f1: 0.98 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 12:16:53' +author: undertheseanlp diff --git a/models/word_segmentation/underthesea-core-v4/model.crf b/models/word_segmentation/underthesea-core-v4/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..0bdbda79f3adef64196dc73c1a812a7f9ee07876 --- /dev/null +++ b/models/word_segmentation/underthesea-core-v4/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:152679c58fa1487c5bc337f5d09d05dfed49e5e856d87d8828ef8060e515dde0 +size 33275834 diff --git a/models/word_segmentation/underthesea-core-v5/metadata.yaml b/models/word_segmentation/underthesea-core-v5/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9e69f159de03c44d80ac7ab47604fd14307b3099 --- /dev/null +++ b/models/word_segmentation/underthesea-core-v5/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: underthesea-core-v5 + type: CRF (Conditional Random Field) + framework: underthesea-core + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 200 + duration_seconds: 2112.2 +performance: + syllable_accuracy: 0.9889 + syllable_f1: 0.9889 + word_precision: 0.9801 + word_recall: 0.9798 + word_f1: 0.98 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 13:09:50' +author: undertheseanlp diff --git a/models/word_segmentation/underthesea-core-v5/model.crf b/models/word_segmentation/underthesea-core-v5/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..4eccddd8f8346f15b1b881dc5c1b87b97b59a472 --- /dev/null +++ b/models/word_segmentation/underthesea-core-v5/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:329d7e95364871cd6dd59e6a3c778ca4c2a142e1b314aee28bb4f2e5b056dcc2 +size 33275466 diff --git a/models/word_segmentation/underthesea-core-v6/metadata.yaml b/models/word_segmentation/underthesea-core-v6/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..78e26f76cbecacb2e0ebc42316dcd5f7a9a67baa --- /dev/null +++ b/models/word_segmentation/underthesea-core-v6/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: underthesea-core-v6 + type: CRF (Conditional Random Field) + framework: underthesea-core + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 200 + duration_seconds: 1138.37 +performance: + syllable_accuracy: 0.9889 + syllable_f1: 0.9889 + word_precision: 0.9801 + word_recall: 0.98 + word_f1: 0.98 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 13:55:36' +author: undertheseanlp diff --git a/models/word_segmentation/underthesea-core-v6/model.crf b/models/word_segmentation/underthesea-core-v6/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..4f5fba4afde5e155041447f6649905ab73111c8c --- /dev/null +++ b/models/word_segmentation/underthesea-core-v6/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cf9c08703406aca14b5a600229f1011e98fce48988ae51c2f263c017910b516 +size 33275466 diff --git a/models/word_segmentation/underthesea_c2_1/metadata.yaml b/models/word_segmentation/underthesea_c2_1/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..28374c1ff8b5c3a3e87049f38d9b752964874e1f --- /dev/null +++ b/models/word_segmentation/underthesea_c2_1/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: underthesea_c2_1 + type: CRF (Conditional Random Field) + framework: underthesea-core + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 0.0 + c2: 1.0 + max_iterations: 100 + duration_seconds: 84.72 +performance: + syllable_accuracy: 0.9865 + syllable_f1: 0.9865 + word_precision: 0.9748 + word_recall: 0.9754 + word_f1: 0.9751 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 07:04:29' +author: undertheseanlp diff --git a/models/word_segmentation/underthesea_c2_1/model.crf b/models/word_segmentation/underthesea_c2_1/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..06c00a1770415c2b815971ddda46cb7a8804bb7f --- /dev/null +++ b/models/word_segmentation/underthesea_c2_1/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48138ffefe550e4ac8f89094172264b6bf03d989725288a235495714d375294c +size 50022442 diff --git a/models/word_segmentation/underthesea_crfsuite_v1/metadata.yaml b/models/word_segmentation/underthesea_crfsuite_v1/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..235aebd64b17b3d44cbeff1e2cf0b22f7bb996b5 --- /dev/null +++ b/models/word_segmentation/underthesea_crfsuite_v1/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: underthesea_crfsuite_v1 + type: CRF (Conditional Random Field) + framework: underthesea-core + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 100 + duration_seconds: 80.17 +performance: + syllable_accuracy: 0.9837 + syllable_f1: 0.9836 + word_precision: 0.9688 + word_recall: 0.9705 + word_f1: 0.9697 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 07:02:48' +author: undertheseanlp diff --git a/models/word_segmentation/underthesea_crfsuite_v1/model.crf b/models/word_segmentation/underthesea_crfsuite_v1/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..533cbab58e02f2d8d329b88cbbd4c95eff39355c --- /dev/null +++ b/models/word_segmentation/underthesea_crfsuite_v1/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ad1b3d022d6ad04cafe0d3b90ef47e883572fa7529642e5d7dbbd33161cbe53 +size 33931370 diff --git a/models/word_segmentation/underthesea_owlqn_v1/metadata.yaml b/models/word_segmentation/underthesea_owlqn_v1/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2de2c74528b992d626a99e50d2d553658364c508 --- /dev/null +++ b/models/word_segmentation/underthesea_owlqn_v1/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: underthesea_owlqn_v1 + type: CRF (Conditional Random Field) + framework: underthesea-core + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 100 + duration_seconds: 83.74 +performance: + syllable_accuracy: 0.9837 + syllable_f1: 0.9837 + word_precision: 0.9691 + word_recall: 0.9703 + word_f1: 0.9697 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 06:56:51' +author: undertheseanlp diff --git a/models/word_segmentation/underthesea_owlqn_v1/model.crf b/models/word_segmentation/underthesea_owlqn_v1/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..2e0dde816c091d86bffee10fcad6b2849f1e93cd --- /dev/null +++ b/models/word_segmentation/underthesea_owlqn_v1/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5264d06a45eb476ba3ce751b65dbd2a4b05f38ea2855f381a1f1a2afdd2d7ad6 +size 33901114 diff --git a/models/word_segmentation/underthesea_v1/metadata.yaml b/models/word_segmentation/underthesea_v1/metadata.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8a0dad28ee971a2ff24681301e3c3386f1e2bfb6 --- /dev/null +++ b/models/word_segmentation/underthesea_v1/metadata.yaml @@ -0,0 +1,34 @@ +model: + name: Vietnamese Word Segmentation + version: underthesea_v1 + type: CRF (Conditional Random Field) + framework: underthesea-core + tagging_scheme: BIO +training: + dataset: undertheseanlp/UDD-1 + train_sentences: 18282 + train_syllables: 563134 + val_sentences: 859 + val_syllables: 27170 + test_sentences: 859 + test_syllables: 26132 + hyperparameters: + c1: 1.0 + c2: 0.001 + max_iterations: 100 + duration_seconds: 88.92 +performance: + syllable_accuracy: 0.9774 + syllable_f1: 0.9774 + word_precision: 0.9582 + word_recall: 0.9583 + word_f1: 0.9582 +environment: + platform: Linux + cpu_model: AMD EPYC 7713 64-Core Processor + python_version: 3.12.3 +files: + model: model.crfsuite + config: ../../../configs/word_segmentation.yaml +created_at: '2026-01-31 06:42:00' +author: undertheseanlp diff --git a/models/word_segmentation/underthesea_v1/model.crf b/models/word_segmentation/underthesea_v1/model.crf new file mode 100644 index 0000000000000000000000000000000000000000..596023d23b84537b2b7182a9fcbc9206791f91cf --- /dev/null +++ b/models/word_segmentation/underthesea_v1/model.crf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a9538597dddf47c945580fc641025fdbfc4270d13c4c3070a1ae3a7f447572a +size 50022442