Automatic Speech Recognition · Transformers · Safetensors · Khmer · English · troryongasr · custom_code
Kimang18 committed · Commit 0133579 · verified · 1 parent: 8078798

Upload processor

processor_config.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "feature_extractor": {
+     "chunk_length": 30,
+     "dither": 0.0,
+     "feature_extractor_type": "WhisperFeatureExtractor",
+     "feature_size": 80,
+     "hop_length": 160,
+     "n_fft": 400,
+     "n_samples": 480000,
+     "nb_max_frames": 3000,
+     "padding_side": "right",
+     "padding_value": 0.0,
+     "return_attention_mask": false,
+     "sampling_rate": 16000
+   },
+   "processor_class": "WhisperProcessor"
+ }
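
For reference, the sizes in this config are mutually consistent: `n_samples` is `chunk_length × sampling_rate`, and `nb_max_frames` is `n_samples / hop_length`. A quick sanity check, using only values copied from the config above:

```python
# Derived Whisper feature-extractor sizes, from processor_config.json.
chunk_length = 30       # seconds of audio per padded chunk
sampling_rate = 16000   # Hz
hop_length = 160        # samples between successive STFT frames

n_samples = chunk_length * sampling_rate   # 480000, matches "n_samples"
nb_max_frames = n_samples // hop_length    # 3000, matches "nb_max_frames"
assert (n_samples, nb_max_frames) == (480000, 3000)

# Each frame therefore advances hop_length / sampling_rate = 10 ms of audio.
```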
tokenization_troryongasr.py ADDED
@@ -0,0 +1,173 @@
+ # Author: KHUN Kimang
+ # Date: March 2026
+ # KrorngAI
+ # Inspired by https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
+
+ from typing import Optional, Tuple, List
+ from dataclasses import dataclass, field
+ from functools import cached_property
+ from enum import Enum
+ from transformers import LlamaTokenizer, PreTrainedTokenizer
+ import json
+
+
+ LANGUAGES = {
+     "km": "khmer",
+     "en": "english"
+ }
+ TO_LANGUAGE_CODE = {
+     **{lang: code for code, lang in LANGUAGES.items()},
+ }
+
+ class ASRSpecialTokens(str, Enum):
+     km_token = "<|km|>"  # language token must be added to lm_head of Decoder Model
+     en_token = "<|en|>"  # language token must be added to lm_head of Decoder Model
+     transcribe = "<|transcribe|>"
+     translate = "<|translate|>"
+     no_speech = "<|nospeech|>"
+     @classmethod
+     def list(cls):
+         return [c.value for c in cls]
+
+
+ class TrorYongASRTokenizer(LlamaTokenizer):
+     """
+     Tokenizer for the ASR task.
+     It supports only two languages: Khmer and English.
+     It does not support timestamps.
+     """
+
+     def __init__(
+         self,
+         language: Optional[str] = None,
+         task: Optional[str] = None,
+         *args,
+         **kwargs
+     ):
+         self.language = language
+         self.task = task
+
+         super().__init__(
+             *args,
+             **kwargs
+         )
+         self.add_special_tokens({
+             "additional_special_tokens": ASRSpecialTokens.list()
+         })
+
+         self.special_tokens = dict()
+         for special in self.all_special_tokens:
+             special_id = self.encode(special, add_special_tokens=False)[0]
+             self.special_tokens[special] = special_id
+
+         sot: int = self.special_tokens["<s>"]
+         translate: int = self.special_tokens["<|translate|>"]
+         transcribe: int = self.special_tokens["<|transcribe|>"]
+
+         sot_sequence = [sot]
+         if self.language is not None:
+             language = self.language.lower()
+             if language not in LANGUAGES:
+                 if language in TO_LANGUAGE_CODE:
+                     language = TO_LANGUAGE_CODE[language]
+                 else:
+                     raise ValueError(f"Unsupported language: {language}")
+
+             self.language = language
+             lang_id = self.encode(f"<|{language}|>", add_special_tokens=False)[0]
+             sot_sequence.append(lang_id)
+         if self.task is not None:
+             task_token: int = transcribe if self.task == "transcribe" else translate
+             sot_sequence.append(task_token)
+
+         self.sot_sequence = tuple(sot_sequence)
+
+     def encode(self, text, **kwargs) -> List[int]:
+         encoding = super().encode(text, **kwargs)
+         return encoding if encoding[0] != 29871 else encoding[1:]  # 29871 is whitespace for TinyKhmerTokenizer
+
+     def __call__(self, text: Optional[str] = None) -> List[int]:
+         encoding = self.encode(text, add_special_tokens=False)
+         return [*self.sot_sequence] + encoding
+
+     @cached_property
+     def eot(self) -> int:
+         return self.special_tokens["</s>"]
+
+     @cached_property
+     def transcribe(self) -> int:
+         return self.special_tokens["<|transcribe|>"]
+
+     @cached_property
+     def translate(self) -> int:
+         return self.special_tokens["<|translate|>"]
+
+     @cached_property
+     def sot(self) -> int:
+         return self.special_tokens["<s>"]
+
+     @cached_property
+     def no_speech(self) -> int:
+         return self.special_tokens["<|nospeech|>"]
+
+     @cached_property
+     def language_token(self) -> int:
+         """Returns the token id corresponding to the value of the `language` field"""
+         if self.language is None:
+             raise ValueError("This tokenizer does not have language token configured")
+
+         return self.to_language_token(self.language)
+
+     def to_language_token(self, language):
+         if token := self.special_tokens.get(f"<|{language}|>", None):
+             return token
+
+         raise KeyError(f"Language {language} not found in tokenizer.")
+
+     @cached_property
+     def all_language_tokens(self) -> Tuple[int]:
+         result = []
+         for token, token_id in self.special_tokens.items():
+             if token.strip("<|>") in LANGUAGES:
+                 result.append(token_id)
+         return tuple(result)
+
+     @cached_property
+     def all_language_codes(self) -> Tuple[str]:
+         return tuple(self.decode([_l]).strip("<|>") for _l in self.all_language_tokens)
+
+     @cached_property
+     def non_speech_tokens(self) -> Tuple[int]:
+         """
+         Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
+         annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.
+
+         - ♪♪♪
+         - ( SPEAKING FOREIGN LANGUAGE )
+         - [DAVID] Hey there,
+
+         keeping basic punctuation like commas, periods, question marks, exclamation points, etc.
+         """
+         symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』')
+         symbols += (
+             "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
+         )
+
+         # symbols that may be a single token or multiple tokens depending on the tokenizer.
+         # In case they're multiple tokens, suppress the first token, which is safe because:
+         # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress
+         # in generations, and in the 3-byte UTF-8 representation they share the first two bytes.
+         miscellaneous = set("♩♪♫♬♭♮♯")
+         assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)
+
+         # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
+         result = {self.encode(" -", add_special_tokens=False)[0], self.encode(" '", add_special_tokens=False)[0]}
+         for symbol in symbols + list(miscellaneous):
+             for tokens in [
+                 self.encode(symbol, add_special_tokens=False),
+                 self.encode(" " + symbol, add_special_tokens=False),
+             ]:
+                 if len(tokens) == 1 or symbol in miscellaneous:
+                     result.add(tokens[0])
+
+         return tuple(sorted(result))
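
The class above keeps Whisper's prompt convention but drops timestamp support: `__init__` precomputes `sot_sequence` (`<s>`, then an optional language token, then an optional task token), and `__call__` prepends that sequence to the encoded text, returning a plain list of ids rather than a `BatchEncoding`. A minimal usage sketch, assuming the file is importable locally and the matching tokenizer files from this commit sit in a hypothetical `model_dir`:

```python
# Sketch only; "model_dir" is a placeholder for a local copy of this repo.
from tokenization_troryongasr import TrorYongASRTokenizer

tok = TrorYongASRTokenizer.from_pretrained(
    "model_dir",
    language="khmer",   # normalized to "km" via TO_LANGUAGE_CODE in __init__
    task="transcribe",
)

ids = tok("សួស្តី")  # __call__ -> [*tok.sot_sequence, *text token ids]
assert tuple(ids[: len(tok.sot_sequence)]) == tok.sot_sequence
print(tok.decode(ids))
```

As in the OpenAI tokenizer it is inspired by, the `non_speech_tokens` property is the natural input for generation-time suppression (e.g. a `suppress_tokens` list), keeping annotations like `♪♪♪` or `[DAVID]` out of transcripts.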
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "add_prefix_space": null,
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_troryongasr.TrorYongASRTokenizer",
+       null
+     ]
+   },
+   "backend": "tokenizers",
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "extra_special_tokens": [
+     "<|km|>",
+     "<|en|>",
+     "<|transcribe|>",
+     "<|translate|>",
+     "<|nospeech|>"
+   ],
+   "is_local": false,
+   "model_max_length": 1000000000000000019884624838656,
+   "padding_side": "right",
+   "processor_class": "WhisperProcessor",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "TrorYongASRTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }
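
The `auto_map` entry is what ties this config to the `custom_code` tag above: `AutoTokenizer` imports `TrorYongASRTokenizer` from `tokenization_troryongasr.py` in the repository, which requires opting in with `trust_remote_code=True`. A loading sketch, with the Hub repo id assumed:

```python
from transformers import AutoProcessor, AutoTokenizer

repo_id = "Kimang18/troryongasr"  # assumption: this repository's Hub id

# auto_map -> tokenization_troryongasr.TrorYongASRTokenizer (custom code)
tok = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

# processor_config.json names WhisperProcessor, which bundles the
# WhisperFeatureExtractor above with this tokenizer
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
```

The `null` second entry under `AutoTokenizer` in `auto_map` means no fast-tokenizer class is registered, so the `LlamaTokenizer`-based slow class is always used.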