How to use KrorngAI/TrorYongASR-tiny with the Transformers library:

```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline(
    "automatic-speech-recognition",
    model="KrorngAI/TrorYongASR-tiny",
    trust_remote_code=True,
)
```

```python
# Load the model directly
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "KrorngAI/TrorYongASR-tiny",
    trust_remote_code=True,
    dtype="auto",
)
```
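Once the pipeline is built, transcription is a single call. A minimal sketch, assuming a local audio file (`sample.wav` is a placeholder path, and decoding audio from a path requires ffmpeg to be installed):

```python
# Transcribe an audio file with the pipeline created above.
# "sample.wav" is an illustrative path, not a file shipped with the model.
result = pipe("sample.wav")
print(result["text"])
```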
The repository ships a custom tokenizer for the model:

```python
# Author: KHUN Kimang
# Date: March 2026
# KrorngAI
# Inspired by https://github.com/openai/whisper/blob/main/whisper/tokenizer.py

from typing import Optional, Tuple, List
from dataclasses import dataclass, field
from functools import cached_property
from enum import Enum

from transformers import LlamaTokenizer, PreTrainedTokenizer
import json

LANGUAGES = {
    "km": "khmer",
    "en": "english",
}

# Reverse map: full language name -> ISO code, e.g. "khmer" -> "km".
TO_LANGUAGE_CODE = {
    **{lang: code for code, lang in LANGUAGES.items()},
}


class ASRSpecialTokens(str, Enum):
    km_token = "<|km|>"  # language token must be added to lm_head of Decoder Model
    en_token = "<|en|>"  # language token must be added to lm_head of Decoder Model
    transcribe = "<|transcribe|>"
    translate = "<|translate|>"
    no_speech = "<|nospeech|>"

    # Called on the class itself below, so it must be a classmethod.
    @classmethod
    def list(cls):
        return [c.value for c in cls]
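
# For reference, ASRSpecialTokens.list() evaluates to:
# ["<|km|>", "<|en|>", "<|transcribe|>", "<|translate|>", "<|nospeech|>"]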


class TrorYongASRTokenizer(LlamaTokenizer):
    """
    Tokenizer for the ASR task.

    It supports only two languages: Khmer and English.
    It does not support timestamps.
    """

    def __init__(
        self,
        language: Optional[str] = None,
        task: Optional[str] = None,
        *args,
        **kwargs,
    ):
        self.language = language
        self.task = task
        super().__init__(*args, **kwargs)
        self.add_special_tokens({
            "additional_special_tokens": ASRSpecialTokens.list()
        })

        # Map every special token string to its id for fast lookup.
        self.special_tokens = dict()
        for special in self.all_special_tokens:
            special_id = self.encode(special, add_special_tokens=False)[0]
            self.special_tokens[special] = special_id

        sot: int = self.special_tokens["<s>"]
        translate: int = self.special_tokens["<|translate|>"]
        transcribe: int = self.special_tokens["<|transcribe|>"]

        # Build the start-of-transcript prefix: <s> [<|lang|>] [<|task|>].
        sot_sequence = [sot]
        if self.language is not None:
            language = self.language.lower()
            if language not in LANGUAGES:
                if language in TO_LANGUAGE_CODE:
                    # Accept full names like "khmer" and map them to codes.
                    language = TO_LANGUAGE_CODE[language]
                else:
                    raise ValueError(f"Unsupported language: {language}")
            self.language = language
            lang_id = self.encode(f"<|{language}|>", add_special_tokens=False)[0]
            sot_sequence.append(lang_id)
        if self.task is not None:
            task_token: int = transcribe if self.task == "transcribe" else translate
            sot_sequence.append(task_token)
        self.sot_sequence = tuple(sot_sequence)

    def encode(self, text, **kwargs) -> List[int]:
        encoding = super().encode(text, **kwargs)
        # 29871 is the whitespace token for TinyKhmerTokenizer; drop it when
        # it is prepended so single-token lookups return the intended id.
        return encoding[1:] if encoding and encoding[0] == 29871 else encoding

    def __call__(self, text: Optional[str] = None) -> List[int]:
        encoding = self.encode(text, add_special_tokens=False)
        return [*self.sot_sequence] + encoding

    @cached_property
    def eot(self) -> int:
        return self.special_tokens["</s>"]

    @cached_property
    def transcribe(self) -> int:
        return self.special_tokens["<|transcribe|>"]

    @cached_property
    def translate(self) -> int:
        return self.special_tokens["<|translate|>"]

    @cached_property
    def sot(self) -> int:
        return self.special_tokens["<s>"]

    @cached_property
    def no_speech(self) -> int:
        return self.special_tokens["<|nospeech|>"]

    @cached_property
    def language_token(self) -> int:
        """Returns the token id corresponding to the value of the `language` field."""
        if self.language is None:
            raise ValueError("This tokenizer does not have a language token configured")
        return self.to_language_token(self.language)

    def to_language_token(self, language):
        if token := self.special_tokens.get(f"<|{language}|>", None):
            return token
        raise KeyError(f"Language {language} not found in tokenizer.")

    @cached_property
    def all_language_tokens(self) -> Tuple[int, ...]:
        result = []
        for token, token_id in self.special_tokens.items():
            if token.strip("<|>") in LANGUAGES:
                result.append(token_id)
        return tuple(result)

    @cached_property
    def all_language_codes(self) -> Tuple[str, ...]:
        return tuple(self.decode([_l]).strip("<|>") for _l in self.all_language_tokens)
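
    # For this two-language tokenizer, all_language_codes evaluates to
    # ("km", "en") (in the order the language tokens were registered).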

    @cached_property
    def non_speech_tokens(self) -> Tuple[int, ...]:
        """
        Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
        annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.

        - ♪♪♪
        - ( SPEAKING FOREIGN LANGUAGE )
        - [DAVID] Hey there,

        keeping basic punctuation like commas, periods, question marks, exclamation points, etc.
        """
        symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』')
        symbols += (
            "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
        )

        # Symbols that may be a single token or multiple tokens depending on the tokenizer.
        # In case they're multiple tokens, suppress the first token, which is safe because:
        # these are between U+2640 and U+267F miscellaneous symbols that are okay to suppress
        # in generations, and in the 3-byte UTF-8 representation they share the first two bytes.
        miscellaneous = set("♩♪♫♬♭♮♯")
        assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)

        # Allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word.
        result = {
            self.encode(" -", add_special_tokens=False)[0],
            self.encode(" '", add_special_tokens=False)[0],
        }
        for symbol in symbols + list(miscellaneous):
            for tokens in [
                self.encode(symbol, add_special_tokens=False),
                self.encode(" " + symbol, add_special_tokens=False),
            ]:
                if len(tokens) == 1 or symbol in miscellaneous:
                    result.add(tokens[0])

        return tuple(sorted(result))
```
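
A minimal usage sketch of the tokenizer itself, assuming the class above ships with the KrorngAI/TrorYongASR-tiny repository (the `from_pretrained` arguments and the example text are illustrative):

```python
# Illustrative only: build the tokenizer with a language and task, then
# inspect the start-of-transcript prefix it prepends to every encoding.
tokenizer = TrorYongASRTokenizer.from_pretrained(
    "KrorngAI/TrorYongASR-tiny",  # assumes the repo hosts the tokenizer files
    language="km",
    task="transcribe",
)

print(tokenizer.sot_sequence)   # ids for <s>, <|km|>, <|transcribe|>
ids = tokenizer("hello")        # __call__ returns sot_sequence + encoded text
print(tokenizer.decode(ids))    # roughly "<s><|km|><|transcribe|> hello"

# The non-speech ids pair naturally with generation-time suppression,
# e.g. transformers' `suppress_tokens` generation argument.
bad_ids = list(tokenizer.non_speech_tokens)
```

Note that `__call__` is overridden to return a plain list of token ids rather than the usual `BatchEncoding`, so downstream code should not expect an `input_ids`/`attention_mask` dict from this tokenizer.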