# Citation metadata in Citation File Format (CFF) 1.2.0.
# Tools that understand CFF read this file to build a citation for the project.
cff-version: 1.2.0
title: "Khmer Graph-Regularized Tokenizer v2.2-RC"
message: "If you use this model, please cite it as below."
type: software

authors:
  - family-names: "khopilot"
    given-names: "Niko"
    # NOTE(review): the all-zero ORCID looks like a placeholder; confirm and
    # replace it with the author's real iD (or drop the key) before release.
    orcid: "https://orcid.org/0000-0000-0000-0000"

repository-code: "https://github.com/khopilot/tokkonizer-km"
url: "https://huggingface.co/khopilot/khmer-tokenizer-v7"

# Quoted so the version and date stay strings instead of being re-typed.
version: "2.2-rc"
date-released: "2025-10-07"
license: MIT

keywords:
  - khmer
  - tokenization
  - graph-regularization
  - nlp
  - semantic-embeddings

# Folded scalar (>-): the lines below are joined with single spaces and the
# trailing newline is stripped, so the value is one single-line string.
abstract: >-
  A SentencePiece tokenizer with graph-regularized lexeme embeddings for
  Khmer. Achieves 43.25% Coherence@10 (+110% vs baseline) by structuring the
  semantic space according to morphological and synonymy relationships.