Automatic Speech Recognition · Transformers · Safetensors · Khmer · English · troryongasr · custom_code
Kimang18 committed · Commit 0133579 · verified · 1 parent: 8078798

Upload processor

processor_config.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "feature_extractor": {
+     "chunk_length": 30,
+     "dither": 0.0,
+     "feature_extractor_type": "WhisperFeatureExtractor",
+     "feature_size": 80,
+     "hop_length": 160,
+     "n_fft": 400,
+     "n_samples": 480000,
+     "nb_max_frames": 3000,
+     "padding_side": "right",
+     "padding_value": 0.0,
+     "return_attention_mask": false,
+     "sampling_rate": 16000
+   },
+   "processor_class": "WhisperProcessor"
+ }
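
For reference, the sizes in this config are mutually consistent: `n_samples` is `chunk_length × sampling_rate`, and `nb_max_frames` is `n_samples / hop_length`. A quick sanity check, using only values copied from the config above:

```python
# Derived Whisper feature-extractor sizes, from processor_config.json.
chunk_length = 30       # seconds of audio per padded chunk
sampling_rate = 16000   # Hz
hop_length = 160        # samples between successive STFT frames

n_samples = chunk_length * sampling_rate   # 480000, matches "n_samples"
nb_max_frames = n_samples // hop_length    # 3000, matches "nb_max_frames"
assert (n_samples, nb_max_frames) == (480000, 3000)

# Each frame therefore advances hop_length / sampling_rate = 10 ms of audio.
```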
tokenization_troryongasr.py ADDED
@@ -0,0 +1,173 @@
+ # Author: KHUN Kimang
+ # Date: March 2026
+ # KrorngAI
+ # Inspired by https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
+
+ from typing import Optional, Tuple, List
+ from dataclasses import dataclass, field
+ from functools import cached_property
+ from enum import Enum
+ from transformers import LlamaTokenizer, PreTrainedTokenizer
+ import json
+
+
+ LANGUAGES = {
+     "km": "khmer",
+     "en": "english"
+ }
+ TO_LANGUAGE_CODE = {
+     **{lang: code for code, lang in LANGUAGES.items()},
+ }
+
+ class ASRSpecialTokens(str, Enum):
+     km_token = "<|km|>"  # language token must be added to lm_head of Decoder Model
+     en_token = "<|en|>"  # language token must be added to lm_head of Decoder Model
+     transcribe = "<|transcribe|>"
+     translate = "<|translate|>"
+     no_speech = "<|nospeech|>"
+     @classmethod
+     def list(cls):
+         return [c.value for c in cls]
+
+
+ class TrorYongASRTokenizer(LlamaTokenizer):
+     """
+     Tokenizer for the ASR task.
+     It supports only two languages: Khmer and English.
+     It does not support timestamps.
+     """
+
+     def __init__(
+         self,
+         language: Optional[str] = None,
+         task: Optional[str] = None,
+         *args,
+         **kwargs
+     ):
+         self.language = language
+         self.task = task
+
+         super().__init__(
+             *args,
+             **kwargs
+         )
+         self.add_special_tokens({
+             "additional_special_tokens": ASRSpecialTokens.list()
+         })
+
+         self.special_tokens = dict()
+         for special in self.all_special_tokens:
+             special_id = self.encode(special, add_special_tokens=False)[0]
+             self.special_tokens[special] = special_id
+
+         sot: int = self.special_tokens["<s>"]
+         translate: int = self.special_tokens["<|translate|>"]
+         transcribe: int = self.special_tokens["<|transcribe|>"]
+
+         sot_sequence = [sot]
+         if self.language is not None:
+             language = self.language.lower()
+             if language not in LANGUAGES:
+                 if language in TO_LANGUAGE_CODE:
+                     language = TO_LANGUAGE_CODE[language]
+                 else:
+                     raise ValueError(f"Unsupported language: {language}")
+
+             self.language = language
+             lang_id = self.encode(f"<|{language}|>", add_special_tokens=False)[0]
+             sot_sequence.append(lang_id)
+         if self.task is not None:
+             task_token: int = transcribe if self.task == "transcribe" else translate
+             sot_sequence.append(task_token)
+
+         self.sot_sequence = tuple(sot_sequence)
+
+     def encode(self, text, **kwargs) -> List[int]:
+         encoding = super().encode(text, **kwargs)
+         return encoding if encoding[0] != 29871 else encoding[1:]  # 29871 is whitespace for TinyKhmerTokenizer
+
+     def __call__(self, text: Optional[str] = None) -> List[int]:
+         encoding = self.encode(text, add_special_tokens=False)
+         return [*self.sot_sequence] + encoding
+
+     @cached_property
+     def eot(self) -> int:
+         return self.special_tokens["</s>"]
+
+     @cached_property
+     def transcribe(self) -> int:
+         return self.special_tokens["<|transcribe|>"]
+
+     @cached_property
+     def translate(self) -> int:
+         return self.special_tokens["<|translate|>"]
+
+     @cached_property
+     def sot(self) -> int:
+         return self.special_tokens["<s>"]
+
+     @cached_property
+     def no_speech(self) -> int:
+         return self.special_tokens["<|nospeech|>"]
+
+     @cached_property
+     def language_token(self) -> int:
+         """Returns the token id corresponding to the value of the `language` field"""
+         if self.language is None:
+             raise ValueError("This tokenizer does not have language token configured")
+
+         return self.to_language_token(self.language)
+
+     def to_language_token(self, language):
+         if token := self.special_tokens.get(f"<|{language}|>", None):
+             return token
+
+         raise KeyError(f"Language {language} not found in tokenizer.")
+
+     @cached_property
+     def all_language_tokens(self) -> Tuple[int]:
+         result = []
+         for token, token_id in self.special_tokens.items():
+             if token.strip("<|>") in LANGUAGES:
+                 result.append(token_id)
+         return tuple(result)
+
+     @cached_property
+     def all_language_codes(self) -> Tuple[str]:
+         return tuple(self.decode([_l]).strip("<|>") for _l in self.all_language_tokens)
+
+     @cached_property
+     def non_speech_tokens(self) -> Tuple[int]:
+         """
+         Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
+         annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.
+
+         - ♪♪♪
+         - ( SPEAKING FOREIGN LANGUAGE )
+         - [DAVID] Hey there,
+
+         keeping basic punctuation like commas, periods, question marks, exclamation points, etc.
+         """
+         symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』')
+         symbols += (
+             "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
+         )
+
+         # symbols that may be a single token or multiple tokens depending on the tokenizer.
+         # In case they're multiple tokens, suppress the first token, which is safe because:
+         # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress
+         # in generations, and in the 3-byte UTF-8 representation they share the first two bytes.
+         miscellaneous = set("♩♪♫♬♭♮♯")
+         assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)
+
+         # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
+         result = {self.encode(" -", add_special_tokens=False)[0], self.encode(" '", add_special_tokens=False)[0]}
+         for symbol in symbols + list(miscellaneous):
+             for tokens in [
+                 self.encode(symbol, add_special_tokens=False),
+                 self.encode(" " + symbol, add_special_tokens=False),
+             ]:
+                 if len(tokens) == 1 or symbol in miscellaneous:
+                     result.add(tokens[0])
+
+         return tuple(sorted(result))
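
The class above keeps Whisper's prompt convention but drops timestamp support: `__init__` precomputes `sot_sequence` (`<s>`, then an optional language token, then an optional task token), and `__call__` prepends that sequence to the encoded text, returning a plain list of ids rather than a `BatchEncoding`. A minimal usage sketch, assuming the file is importable locally and the matching tokenizer files from this commit sit in a hypothetical `model_dir`:

```python
# Sketch only; "model_dir" is a placeholder for a local copy of this repo.
from tokenization_troryongasr import TrorYongASRTokenizer

tok = TrorYongASRTokenizer.from_pretrained(
    "model_dir",
    language="khmer",   # normalized to "km" via TO_LANGUAGE_CODE in __init__
    task="transcribe",
)

ids = tok("សួស្តី")  # __call__ -> [*tok.sot_sequence, *text token ids]
assert tuple(ids[: len(tok.sot_sequence)]) == tok.sot_sequence
print(tok.decode(ids))
```

As in the OpenAI tokenizer it is inspired by, the `non_speech_tokens` property is the natural input for generation-time suppression (e.g. a `suppress_tokens` list), keeping annotations like `♪♪♪` or `[DAVID]` out of transcripts.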
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "add_prefix_space": null,
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_troryongasr.TrorYongASRTokenizer",
+       null
+     ]
+   },
+   "backend": "tokenizers",
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "extra_special_tokens": [
+     "<|km|>",
+     "<|en|>",
+     "<|transcribe|>",
+     "<|translate|>",
+     "<|nospeech|>"
+   ],
+   "is_local": false,
+   "model_max_length": 1000000000000000019884624838656,
+   "padding_side": "right",
+   "processor_class": "WhisperProcessor",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "TrorYongASRTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }
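
The `auto_map` entry is what ties this config to the `custom_code` tag above: `AutoTokenizer` imports `TrorYongASRTokenizer` from `tokenization_troryongasr.py` in the repository, which requires opting in with `trust_remote_code=True`. A loading sketch, with the Hub repo id assumed:

```python
from transformers import AutoProcessor, AutoTokenizer

repo_id = "Kimang18/troryongasr"  # assumption: this repository's Hub id

# auto_map -> tokenization_troryongasr.TrorYongASRTokenizer (custom code)
tok = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

# processor_config.json names WhisperProcessor, which bundles the
# WhisperFeatureExtractor above with this tokenizer
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
```

The `null` second entry under `AutoTokenizer` in `auto_map` means no fast-tokenizer class is registered, so the `LlamaTokenizer`-based slow class is always used.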