janakhpon committed on
Commit 2ded9d3 · 1 Parent(s): 3f73eda

feat: simplified mon tokenizer in hf format

.gitignore ADDED
@@ -0,0 +1,10 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+
+ # Virtual environments
+ .venv
.python-version ADDED
@@ -0,0 +1 @@
+ 3.11
README.md ADDED
@@ -0,0 +1,38 @@
+ ---
+ language:
+ - mon
+ library_name: transformers
+ license: mit
+ tags:
+ - tokenizer
+ - mon
+ - myanmar
+ - sentencepiece
+ ---
+
+ # mon language tokenizer
+
+ sentencepiece tokenizer for the mon language with a 4,000-token vocabulary.
+
+ ## usage
+
+ ```python
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
+
+ text = "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။"
+ tokens = tokenizer(text, return_tensors="pt")
+ decoded = tokenizer.decode(tokens["input_ids"][0])
+ ```
+
+ ## details
+
+ - vocabulary size: 4,000
+ - algorithm: sentencepiece
+ - model type: unigram
+ - special tokens: `<s>`, `</s>`, `<unk>`, `<pad>`
+
+ ## training data
+
+ trained on a mon language corpus including wikipedia articles, news, and books.
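The details list above names the four special tokens but not their ids. A minimal sketch of checking them after loading the published tokenizer (assuming the janakhpon/mon_tokenizer hub id from the usage snippet and an installed sentencepiece; the printed ids are whatever the tokenizer resolves at runtime):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")

# the four special tokens from the details list, with the ids the tokenizer resolves for them
for name in ("bos_token", "eos_token", "unk_token", "pad_token"):
    token = getattr(tokenizer, name)
    print(name, token, tokenizer.convert_tokens_to_ids(token))

# sentencepiece pieces for a short mon phrase
print(tokenizer.tokenize("ဘာသာမန်"))
```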
convert_to_hf.py ADDED
@@ -0,0 +1,258 @@
+ #!/usr/bin/env python3
+
+ """
+ convert mon sentencepiece tokenizer to hugging face format
+ creates required config files for transformers library
+ """
+
+ import json
+ import shutil
+ import os
+ from pathlib import Path
+ from typing import Dict, Any
+ import sentencepiece as spm
+
+
+ def load_metadata(meta_file: str = "mon_tokenizer.meta.json") -> Dict[str, Any]:
+     """load tokenizer metadata"""
+     print(f"loading metadata from {meta_file}")
+
+     if not os.path.exists(meta_file):
+         print(f"warning: metadata file not found: {meta_file}")
+         return {}
+
+     with open(meta_file, 'r', encoding='utf-8') as f:
+         metadata = json.load(f)
+
+     print(f"loaded metadata - vocab size: {metadata.get('vocab_size', 'unknown')}")
+     return metadata
+
+
+ def analyze_model(model_file: str = "mon_tokenizer.model") -> Dict[str, Any]:
+     """analyze sentencepiece model"""
+     print(f"analyzing model: {model_file}")
+
+     if not os.path.exists(model_file):
+         raise FileNotFoundError(f"model file not found: {model_file}")
+
+     sp = spm.SentencePieceProcessor()
+     sp.load(model_file)
+
+     vocab_size = sp.get_piece_size()
+     bos_id = sp.bos_id()
+     eos_id = sp.eos_id()
+     unk_id = sp.unk_id()
+     # fall back to one past the vocab when the sp model defines no pad id
+     pad_id = sp.pad_id() if sp.pad_id() != -1 else vocab_size
+
+     analysis = {
+         "vocab_size": vocab_size,
+         "bos_token": sp.id_to_piece(bos_id) if bos_id != -1 else "<s>",
+         "eos_token": sp.id_to_piece(eos_id) if eos_id != -1 else "</s>",
+         "unk_token": sp.id_to_piece(unk_id) if unk_id != -1 else "<unk>",
+         "pad_token": "<pad>",
+         "bos_token_id": bos_id if bos_id != -1 else 1,
+         "eos_token_id": eos_id if eos_id != -1 else 2,
+         "unk_token_id": unk_id if unk_id != -1 else 0,
+         "pad_token_id": pad_id
+     }
+
+     print(f"analysis complete - vocab: {vocab_size}")
+     return analysis
+
+
+ def create_tokenizer_config(analysis: Dict[str, Any]) -> Dict[str, Any]:
+     """create tokenizer_config.json"""
+     return {
+         "model_type": "llama",
+         "tokenizer_class": "LlamaTokenizer",
+         "vocab_file": "mon_tokenizer.model",
+         "vocab_size": analysis["vocab_size"],
+         "bos_token": analysis["bos_token"],
+         "eos_token": analysis["eos_token"],
+         "unk_token": analysis["unk_token"],
+         "pad_token": analysis["pad_token"],
+         "bos_token_id": analysis["bos_token_id"],
+         "eos_token_id": analysis["eos_token_id"],
+         "unk_token_id": analysis["unk_token_id"],
+         "pad_token_id": analysis["pad_token_id"],
+         "clean_up_tokenization_spaces": False,
+         "sp_model_kwargs": {},
+         "add_bos_token": True,
+         "add_eos_token": False,
+         "model_max_length": 2048
+     }
+
+
+ def create_special_tokens_map(analysis: Dict[str, Any]) -> Dict[str, Any]:
+     """create special_tokens_map.json"""
+     return {
+         "bos_token": {
+             "content": analysis["bos_token"],
+             "lstrip": False,
+             "normalized": False,
+             "rstrip": False,
+             "single_word": False
+         },
+         "eos_token": {
+             "content": analysis["eos_token"],
+             "lstrip": False,
+             "normalized": False,
+             "rstrip": False,
+             "single_word": False
+         },
+         "pad_token": {
+             "content": analysis["pad_token"],
+             "lstrip": False,
+             "normalized": False,
+             "rstrip": False,
+             "single_word": False
+         },
+         "unk_token": {
+             "content": analysis["unk_token"],
+             "lstrip": False,
+             "normalized": False,
+             "rstrip": False,
+             "single_word": False
+         }
+     }
+
+
+ def create_generation_config() -> Dict[str, Any]:
+     """create generation_config.json"""
+     # ids mirror tokenizer_config.json; pad_token_id sits one past the 4,000-entry sentencepiece vocab
+     return {
+         "bos_token_id": 1,
+         "eos_token_id": 2,
+         "pad_token_id": 4000,
+         "do_sample": True,
+         "max_length": 2048,
+         "temperature": 0.8,
+         "top_p": 0.9
+     }
+
+
+ def create_readme(analysis: Dict[str, Any], metadata: Dict[str, Any]) -> str:
+     """create readme model card"""
+     return f"""---
+ language:
+ - mon
+ library_name: transformers
+ license: mit
+ tags:
+ - tokenizer
+ - mon
+ - myanmar
+ - sentencepiece
+ ---
+
+ # mon language tokenizer
+
+ sentencepiece tokenizer for the mon language with a {analysis["vocab_size"]:,}-token vocabulary.
+
+ ## usage
+
+ ```python
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
+
+ text = "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။"
+ tokens = tokenizer(text, return_tensors="pt")
+ decoded = tokenizer.decode(tokens["input_ids"][0])
+ ```
+
+ ## details
+
+ - vocabulary size: {analysis["vocab_size"]:,}
+ - algorithm: sentencepiece
+ - model type: unigram
+ - special tokens: `{analysis["bos_token"]}`, `{analysis["eos_token"]}`, `{analysis["unk_token"]}`, `{analysis["pad_token"]}`
+
+ ## training data
+
+ trained on a mon language corpus including wikipedia articles, news, and books.
+ """
+
+
+ def create_gitattributes() -> str:
+     """create .gitattributes for git lfs"""
+     return "mon_tokenizer.model filter=lfs diff=lfs merge=lfs -text\n"
+
+
+ def test_tokenizer(output_dir: str) -> bool:
+     """test converted tokenizer"""
+     print("testing tokenizer")
+
+     try:
+         from transformers import AutoTokenizer
+
+         tokenizer = AutoTokenizer.from_pretrained(output_dir)
+         test_text = "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။"
+
+         tokens = tokenizer(test_text, return_tensors="pt")
+         decoded = tokenizer.decode(tokens["input_ids"][0], skip_special_tokens=True)
+
+         ok = test_text == decoded
+         print(f"test {'passed' if ok else 'failed: round-trip mismatch'} - vocab: {tokenizer.vocab_size:,}")
+         return ok
+
+     except Exception as e:
+         print(f"test failed: {e}")
+         return False
+
+
+ def convert_to_huggingface(
+     input_model: str = "mon_tokenizer.model",
+     input_meta: str = "mon_tokenizer.meta.json",
+     output_dir: str = "."
+ ):
+     """convert mon tokenizer to hugging face format"""
+
+     print("converting mon tokenizer to hugging face format")
+
+     # create output directory
+     output_path = Path(output_dir)
+     output_path.mkdir(exist_ok=True)
+
+     # load metadata and analyze model
+     metadata = load_metadata(input_meta)
+     analysis = analyze_model(input_model)
+
+     # copy model file if needed
+     model_dest = output_path / "mon_tokenizer.model"
+     if not model_dest.exists() or model_dest.resolve() != Path(input_model).resolve():
+         print("copying model file")
+         shutil.copy2(input_model, model_dest)
+     else:
+         print("model file already in place")
+
+     # create config files
+     print("creating config files")
+
+     configs = {
+         "tokenizer_config.json": create_tokenizer_config(analysis),
+         "special_tokens_map.json": create_special_tokens_map(analysis),
+         "generation_config.json": create_generation_config()
+     }
+
+     for filename, config in configs.items():
+         with open(output_path / filename, 'w') as f:
+             json.dump(config, f, indent=2)
+         print(f"created {filename}")
+
+     # create readme and gitattributes
+     with open(output_path / "README.md", 'w', encoding='utf-8') as f:
+         f.write(create_readme(analysis, metadata))
+     print("created README.md")
+
+     with open(output_path / ".gitattributes", 'w') as f:
+         f.write(create_gitattributes())
+     print("created .gitattributes")
+
+     # test
+     success = test_tokenizer(str(output_path))
+     print(f"conversion {'successful' if success else 'completed with warnings'}")
+
+     return success
+
+
+ if __name__ == "__main__":
+     convert_to_huggingface()
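convert_to_huggingface() defaults to reading mon_tokenizer.model and mon_tokenizer.meta.json from the working directory and writing the config files next to them. A minimal sketch of driving it from another script with explicit locations (the artifacts/ and hf_export/ paths are hypothetical, only for illustration, and this assumes the script is importable as convert_to_hf):

```python
from convert_to_hf import convert_to_huggingface

# hypothetical paths; point these at wherever the sentencepiece artifacts actually live
ok = convert_to_huggingface(
    input_model="artifacts/mon_tokenizer.model",
    input_meta="artifacts/mon_tokenizer.meta.json",
    output_dir="hf_export",
)
print("conversion ok:", ok)
```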
generation_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "pad_token_id": 4000,
+   "do_sample": true,
+   "max_length": 2048,
+   "temperature": 0.8,
+   "top_p": 0.9
+ }
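The ids here mirror tokenizer_config.json, with pad_token_id set one past the 4,000-entry sentencepiece vocabulary (ids 0-3999). A minimal sketch of reading the file back through transformers, assuming the janakhpon/mon_tokenizer repo id used elsewhere in this commit:

```python
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("janakhpon/mon_tokenizer")

# sampling defaults and token ids shipped in generation_config.json
print(gen_cfg.do_sample, gen_cfg.temperature, gen_cfg.top_p, gen_cfg.max_length)
print(gen_cfg.bos_token_id, gen_cfg.eos_token_id, gen_cfg.pad_token_id)
```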
mon_tokenizer.meta.json ADDED
@@ -0,0 +1,728 @@
1
+ {
2
+ "model_path": "mon_tokenizer.model",
3
+ "vocab_path": "mon_tokenizer.vocab",
4
+ "lines_trained": 32412,
5
+ "total_characters": 2453293,
6
+ "model_type": "unigram",
7
+ "vocab_size": 4000,
8
+ "original_vocab_size": 4000,
9
+ "character_coverage": 0.9995,
10
+ "byte_fallback": true,
11
+ "user_defined_symbols": [
12
+ "<mask>",
13
+ "<sep>",
14
+ "<cls>"
15
+ ],
16
+ "evaluation": {
17
+ "သ္ဂံသ္ဂံပါ။ ကျာ်တြဲ ပရိတ်တံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": {
18
+ "num_pieces": 24,
19
+ "pieces": [
20
+ "▁",
21
+ "သ္",
22
+ "ဂ",
23
+ "ံ",
24
+ "သ္",
25
+ "ဂ",
26
+ "ံ",
27
+ "ပါ",
28
+ "<0xE1>",
29
+ "<0x81>",
30
+ "<0x8B>",
31
+ "▁",
32
+ "ကျာ်တြဲ",
33
+ "▁",
34
+ "ပရိ",
35
+ "တ်",
36
+ "တံဂှ်",
37
+ "▁",
38
+ "ကၠောန်",
39
+ "ဗဒှ်",
40
+ "လဝ်ရ",
41
+ "<0xE1>",
42
+ "<0x81>",
43
+ "<0x8B>"
44
+ ],
45
+ "ids_head": [
46
+ 262,
47
+ 610,
48
+ 324,
49
+ 381,
50
+ 610,
51
+ 324,
52
+ 381,
53
+ 495,
54
+ 231,
55
+ 135,
56
+ 145,
57
+ 262,
58
+ 1733,
59
+ 262,
60
+ 2158,
61
+ 339,
62
+ 1148,
63
+ 262,
64
+ 286,
65
+ 726,
66
+ 1097,
67
+ 231,
68
+ 135,
69
+ 145
70
+ ],
71
+ "round_trip_ok": true,
72
+ "compression_ratio": 1.9166666666666667
73
+ },
74
+ "ဒေါံဏံ ဍာ်မိုဟ် ကြဴကြဴဏောၚ်။": {
75
+ "num_pieces": 14,
76
+ "pieces": [
77
+ "▁",
78
+ "ဒေါ",
79
+ "ံ",
80
+ "ဏံ",
81
+ "▁ဍာ်",
82
+ "မ",
83
+ "ိုဟ်",
84
+ "▁",
85
+ "ကြဴ",
86
+ "ကြဴ",
87
+ "ဏောၚ်",
88
+ "<0xE1>",
89
+ "<0x81>",
90
+ "<0x8B>"
91
+ ],
92
+ "ids_head": [
93
+ 262,
94
+ 1865,
95
+ 381,
96
+ 596,
97
+ 1178,
98
+ 272,
99
+ 1255,
100
+ 262,
101
+ 1752,
102
+ 1752,
103
+ 2484,
104
+ 231,
105
+ 135,
106
+ 145
107
+ ],
108
+ "round_trip_ok": true,
109
+ "compression_ratio": 2.0
110
+ },
111
+ "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": {
112
+ "num_pieces": 12,
113
+ "pieces": [
114
+ "▁",
115
+ "ဘာသာမန်",
116
+ "▁",
117
+ "ပရူပရာ",
118
+ "တံဂှ်",
119
+ "▁",
120
+ "ကၠောန်",
121
+ "ဗဒှ်",
122
+ "လဝ်ရ",
123
+ "<0xE1>",
124
+ "<0x81>",
125
+ "<0x8B>"
126
+ ],
127
+ "ids_head": [
128
+ 262,
129
+ 1179,
130
+ 262,
131
+ 3651,
132
+ 1148,
133
+ 262,
134
+ 286,
135
+ 726,
136
+ 1097,
137
+ 231,
138
+ 135,
139
+ 145
140
+ ],
141
+ "round_trip_ok": true,
142
+ "compression_ratio": 2.9166666666666665
143
+ },
144
+ "ဘာသာအင်္ဂလိက် ကဵု ဘာသာမန် နွံပၟိက်ရ။": {
145
+ "num_pieces": 11,
146
+ "pieces": [
147
+ "▁",
148
+ "ဘာသာအင်္ဂလိက်",
149
+ "▁ကဵု",
150
+ "▁",
151
+ "ဘာသာမန်",
152
+ "▁",
153
+ "နွံပၟိက်",
154
+ "ရ",
155
+ "<0xE1>",
156
+ "<0x81>",
157
+ "<0x8B>"
158
+ ],
159
+ "ids_head": [
160
+ 262,
161
+ 1970,
162
+ 387,
163
+ 262,
164
+ 1179,
165
+ 262,
166
+ 1205,
167
+ 264,
168
+ 231,
169
+ 135,
170
+ 145
171
+ ],
172
+ "round_trip_ok": true,
173
+ "compression_ratio": 3.272727272727273
174
+ },
175
+ "သၞာံ ၂၀၂၄ ဂိတုဇန္နဝါရဳ ၁၅ မံက်": {
176
+ "num_pieces": 10,
177
+ "pieces": [
178
+ "▁သၞာံ",
179
+ "▁၂၀၂၄",
180
+ "▁ဂိတု",
181
+ "ဇ",
182
+ "န္န",
183
+ "ဝါ",
184
+ "ရဳ",
185
+ "▁၁၅",
186
+ "▁",
187
+ "မံက်"
188
+ ],
189
+ "ids_head": [
190
+ 287,
191
+ 2730,
192
+ 732,
193
+ 384,
194
+ 2733,
195
+ 463,
196
+ 1248,
197
+ 1059,
198
+ 262,
199
+ 967
200
+ ],
201
+ "round_trip_ok": true,
202
+ "compression_ratio": 3.0
203
+ },
204
+ "ၚၛၜၝၞၟၠ မန်တံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": {
205
+ "num_pieces": 20,
206
+ "pieces": [
207
+ "▁",
208
+ "ၚ",
209
+ "<0xE1>",
210
+ "<0x81>",
211
+ "<0x9B>",
212
+ "ၜ",
213
+ "ၝ",
214
+ "ၞ",
215
+ "ၟ",
216
+ "ၠ",
217
+ "▁",
218
+ "မန်",
219
+ "တံဂှ်",
220
+ "▁",
221
+ "ကၠောန်",
222
+ "ဗဒှ်",
223
+ "လဝ်ရ",
224
+ "<0xE1>",
225
+ "<0x81>",
226
+ "<0x8B>"
227
+ ],
228
+ "ids_head": [
229
+ 262,
230
+ 1062,
231
+ 231,
232
+ 135,
233
+ 161,
234
+ 844,
235
+ 1937,
236
+ 554,
237
+ 3999,
238
+ 922,
239
+ 262,
240
+ 294,
241
+ 1148,
242
+ 262,
243
+ 286,
244
+ 726,
245
+ 1097,
246
+ 231,
247
+ 135,
248
+ 145
249
+ ],
250
+ "round_trip_ok": true,
251
+ "compression_ratio": 1.6
252
+ },
253
+ "ဨဩဪဥဦဧ မန်တံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": {
254
+ "num_pieces": 23,
255
+ "pieces": [
256
+ "▁",
257
+ "ဨ",
258
+ "<0xE1>",
259
+ "<0x80>",
260
+ "<0xA9>",
261
+ "<0xE1>",
262
+ "<0x80>",
263
+ "<0xAA>",
264
+ "ဥ",
265
+ "ဦ",
266
+ "<0xE1>",
267
+ "<0x80>",
268
+ "<0xA7>",
269
+ "▁",
270
+ "မန်",
271
+ "တံဂှ်",
272
+ "▁",
273
+ "ကၠောန်",
274
+ "ဗဒှ်",
275
+ "လဝ်ရ",
276
+ "<0xE1>",
277
+ "<0x81>",
278
+ "<0x8B>"
279
+ ],
280
+ "ids_head": [
281
+ 262,
282
+ 1052,
283
+ 231,
284
+ 134,
285
+ 175,
286
+ 231,
287
+ 134,
288
+ 176,
289
+ 1157,
290
+ 3995,
291
+ 231,
292
+ 134,
293
+ 173,
294
+ 262,
295
+ 294,
296
+ 1148,
297
+ 262,
298
+ 286,
299
+ 726,
300
+ 1097,
301
+ 231,
302
+ 135,
303
+ 145
304
+ ],
305
+ "round_trip_ok": true,
306
+ "compression_ratio": 1.3478260869565217
307
+ },
308
+ "ါာူးေိီဲံ်္ မန်တံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": {
309
+ "num_pieces": 22,
310
+ "pieces": [
311
+ "▁",
312
+ "ါ",
313
+ "ာ",
314
+ "ူ",
315
+ "း",
316
+ "ေ",
317
+ "ိ",
318
+ "ီ",
319
+ "ဲ",
320
+ "ံ",
321
+ "်",
322
+ "္",
323
+ "▁",
324
+ "မန်",
325
+ "တံဂှ်",
326
+ "▁",
327
+ "ကၠောန်",
328
+ "ဗဒှ်",
329
+ "လဝ်ရ",
330
+ "<0xE1>",
331
+ "<0x81>",
332
+ "<0x8B>"
333
+ ],
334
+ "ids_head": [
335
+ 262,
336
+ 580,
337
+ 328,
338
+ 634,
339
+ 304,
340
+ 445,
341
+ 478,
342
+ 649,
343
+ 340,
344
+ 381,
345
+ 276,
346
+ 483,
347
+ 262,
348
+ 294,
349
+ 1148,
350
+ 262,
351
+ 286,
352
+ 726,
353
+ 1097,
354
+ 231,
355
+ 135,
356
+ 145
357
+ ],
358
+ "round_trip_ok": true,
359
+ "compression_ratio": 1.6363636363636365
360
+ },
361
+ "ျြွှဿ မန်တံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": {
362
+ "num_pieces": 16,
363
+ "pieces": [
364
+ "▁",
365
+ "ျ",
366
+ "ြ",
367
+ "ွ",
368
+ "ှ",
369
+ "ဿ",
370
+ "▁",
371
+ "မန်",
372
+ "တံဂှ်",
373
+ "▁",
374
+ "ကၠောန်",
375
+ "ဗဒှ်",
376
+ "လဝ်ရ",
377
+ "<0xE1>",
378
+ "<0x81>",
379
+ "<0x8B>"
380
+ ],
381
+ "ids_head": [
382
+ 262,
383
+ 2040,
384
+ 2674,
385
+ 738,
386
+ 753,
387
+ 1251,
388
+ 262,
389
+ 294,
390
+ 1148,
391
+ 262,
392
+ 286,
393
+ 726,
394
+ 1097,
395
+ 231,
396
+ 135,
397
+ 145
398
+ ],
399
+ "round_trip_ok": true,
400
+ "compression_ratio": 1.875
401
+ },
402
+ "မန်တံဂှ်၊ ကၠောန်ဗဒှ်လဝ်ရ။ ပရူပရာတံဂှ်၌ နွံပၟိက်ရ။": {
403
+ "num_pieces": 23,
404
+ "pieces": [
405
+ "▁",
406
+ "မန်",
407
+ "တံဂှ်",
408
+ "<0xE1>",
409
+ "<0x81>",
410
+ "<0x8A>",
411
+ "▁",
412
+ "ကၠောန်",
413
+ "ဗဒှ်",
414
+ "လဝ်ရ",
415
+ "<0xE1>",
416
+ "<0x81>",
417
+ "<0x8B>",
418
+ "▁",
419
+ "ပရူပရာ",
420
+ "တံဂှ်",
421
+ "၌",
422
+ "▁",
423
+ "နွံပၟိက်",
424
+ "ရ",
425
+ "<0xE1>",
426
+ "<0x81>",
427
+ "<0x8B>"
428
+ ],
429
+ "ids_head": [
430
+ 262,
431
+ 294,
432
+ 1148,
433
+ 231,
434
+ 135,
435
+ 144,
436
+ 262,
437
+ 286,
438
+ 726,
439
+ 1097,
440
+ 231,
441
+ 135,
442
+ 145,
443
+ 262,
444
+ 3651,
445
+ 1148,
446
+ 3430,
447
+ 262,
448
+ 1205,
449
+ 264,
450
+ 231,
451
+ 135,
452
+ 145
453
+ ],
454
+ "round_trip_ok": true,
455
+ "compression_ratio": 2.130434782608696
456
+ },
457
+ "သၞာံ ၂၀၂၄ ÷ ၄ = ၅၀၆ × ၁၀ = ၅၀၆၀": {
458
+ "num_pieces": 18,
459
+ "pieces": [
460
+ "▁သၞာံ",
461
+ "▁၂၀၂၄",
462
+ "▁",
463
+ "<0xC3>",
464
+ "<0xB7>",
465
+ "▁၄",
466
+ "▁=",
467
+ "▁",
468
+ "၅၀",
469
+ "၆",
470
+ "▁",
471
+ "<0xC3>",
472
+ "<0x97>",
473
+ "▁၁၀",
474
+ "▁=",
475
+ "▁",
476
+ "၅၀",
477
+ "၆၀"
478
+ ],
479
+ "ids_head": [
480
+ 287,
481
+ 2730,
482
+ 262,
483
+ 201,
484
+ 189,
485
+ 705,
486
+ 533,
487
+ 262,
488
+ 1287,
489
+ 936,
490
+ 262,
491
+ 201,
492
+ 157,
493
+ 782,
494
+ 533,
495
+ 262,
496
+ 1287,
497
+ 1812
498
+ ],
499
+ "round_trip_ok": true,
500
+ "compression_ratio": 1.7222222222222223
501
+ },
502
+ "_stats": {
503
+ "avg_compression_ratio": 1.9896373056994818,
504
+ "round_trip_accuracy": 1.0,
505
+ "total_samples": 11,
506
+ "vocab_size": 4000
507
+ }
508
+ },
509
+ "character_analysis": {
510
+ "total_chars": 2453293,
511
+ "mon_chars": 1907807,
512
+ "unique_mon_chars": 94,
513
+ "mon_char_ratio": 0.7776515075859264,
514
+ "categories": {
515
+ "base_consonants": [
516
+ "က",
517
+ "ခ",
518
+ "ဂ",
519
+ "ဃ",
520
+ "င",
521
+ "စ",
522
+ "ဆ",
523
+ "ဇ",
524
+ "ဉ",
525
+ "ည",
526
+ "ဋ",
527
+ "ဌ",
528
+ "ဍ",
529
+ "ဎ",
530
+ "ဏ",
531
+ "တ",
532
+ "ထ",
533
+ "ဒ",
534
+ "ဓ",
535
+ "န",
536
+ "ပ",
537
+ "ဖ",
538
+ "ဗ",
539
+ "ဘ",
540
+ "မ",
541
+ "ယ",
542
+ "ရ",
543
+ "လ",
544
+ "ဝ",
545
+ "သ",
546
+ "ဟ",
547
+ "ဠ",
548
+ "အ"
549
+ ],
550
+ "extended_mon": [
551
+ "ၚ",
552
+ "ၛ",
553
+ "ၜ",
554
+ "ၝ",
555
+ "ၞ",
556
+ "ၟ",
557
+ "ၠ"
558
+ ],
559
+ "extended_vowels": [
560
+ "ဥ",
561
+ "ဦ",
562
+ "ဧ",
563
+ "ဨ",
564
+ "ဩ"
565
+ ],
566
+ "vowel_signs": [
567
+ "ါ",
568
+ "ာ",
569
+ "ိ",
570
+ "ီ",
571
+ "ူ",
572
+ "ေ",
573
+ "ဲ",
574
+ "ံ",
575
+ "း",
576
+ "္",
577
+ "်"
578
+ ],
579
+ "media_chars": [
580
+ "ျ",
581
+ "ြ",
582
+ "ွ",
583
+ "ှ"
584
+ ],
585
+ "punctuation": [
586
+ "၌",
587
+ "၏"
588
+ ],
589
+ "mathematical": [
590
+ "=",
591
+ "×"
592
+ ],
593
+ "other": [
594
+ "ဣ",
595
+ "ဤ",
596
+ "ု",
597
+ "ဳ",
598
+ "ဴ",
599
+ "ဵ",
600
+ "့",
601
+ "ဿ",
602
+ "၀",
603
+ "၁",
604
+ "၂",
605
+ "၃",
606
+ "၄",
607
+ "၅",
608
+ "၆",
609
+ "၇",
610
+ "၈",
611
+ "၉",
612
+ "ၐ",
613
+ "ၑ",
614
+ "ၢ",
615
+ "ၤ",
616
+ "ႄ",
617
+ "ႅ",
618
+ "ႆ",
619
+ "ႇ",
620
+ "ႈ",
621
+ "႓",
622
+ "႕",
623
+ "ႝ"
624
+ ]
625
+ },
626
+ "all_found_chars": [
627
+ "=",
628
+ "×",
629
+ "က",
630
+ "ခ",
631
+ "ဂ",
632
+ "ဃ",
633
+ "င",
634
+ "စ",
635
+ "ဆ",
636
+ "ဇ",
637
+ "ဉ",
638
+ "ည",
639
+ "ဋ",
640
+ "ဌ",
641
+ "ဍ",
642
+ "ဎ",
643
+ "ဏ",
644
+ "တ",
645
+ "ထ",
646
+ "ဒ",
647
+ "ဓ",
648
+ "န",
649
+ "ပ",
650
+ "ဖ",
651
+ "ဗ",
652
+ "ဘ",
653
+ "မ",
654
+ "ယ",
655
+ "ရ",
656
+ "လ",
657
+ "ဝ",
658
+ "သ",
659
+ "ဟ",
660
+ "ဠ",
661
+ "အ",
662
+ "ဣ",
663
+ "ဤ",
664
+ "ဥ",
665
+ "ဦ",
666
+ "ဧ",
667
+ "ဨ",
668
+ "ဩ",
669
+ "ါ",
670
+ "ာ",
671
+ "ိ",
672
+ "ီ",
673
+ "ု",
674
+ "ူ",
675
+ "ေ",
676
+ "ဲ",
677
+ "ဳ",
678
+ "ဴ",
679
+ "ဵ",
680
+ "ံ",
681
+ "့",
682
+ "း",
683
+ "္",
684
+ "်",
685
+ "ျ",
686
+ "ြ",
687
+ "ွ",
688
+ "ှ",
689
+ "ဿ",
690
+ "၀",
691
+ "၁",
692
+ "၂",
693
+ "၃",
694
+ "၄",
695
+ "၅",
696
+ "၆",
697
+ "၇",
698
+ "၈",
699
+ "၉",
700
+ "၌",
701
+ "၏",
702
+ "ၐ",
703
+ "ၑ",
704
+ "ၚ",
705
+ "ၛ",
706
+ "ၜ",
707
+ "ၝ",
708
+ "ၞ",
709
+ "ၟ",
710
+ "ၠ",
711
+ "ၢ",
712
+ "ၤ",
713
+ "ႄ",
714
+ "ႅ",
715
+ "ႆ",
716
+ "ႇ",
717
+ "ႈ",
718
+ "႓",
719
+ "႕",
720
+ "ႝ"
721
+ ]
722
+ },
723
+ "resource_limits": {
724
+ "max_cpu_percent": 90,
725
+ "max_memory_percent": 85,
726
+ "max_disk_percent": 90
727
+ }
728
+ }
mon_tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a0b3e772c4f414d2540c3f68474d14b037ec00f8e5ac9bce637938d7e82998d3
+ size 338422
pyproject.toml ADDED
@@ -0,0 +1,42 @@
+ [project]
+ name = "mon-tokenizer-hf"
+ version = "1.0.0"
+ description = "mon language tokenizer for hugging face transformers"
+ readme = "README.md"
+ requires-python = ">=3.8.1"
+ license = {text = "MIT"}
+ authors = [
+     {name = "Mon Language Project", email = "contact@example.com"}
+ ]
+ keywords = ["tokenizer", "mon", "myanmar", "nlp", "huggingface", "sentencepiece"]
+
+ dependencies = [
+     "transformers>=4.30.0",
+     "torch>=1.12.0",
+     "sentencepiece>=0.1.99",
+     "huggingface_hub>=0.15.0",
+     "protobuf>=3.20.0",
+ ]
+
+ [project.optional-dependencies]
+ dev = [
+     "pytest>=7.0.0",
+     "black>=23.0.0",
+     "isort>=5.12.0",
+ ]
+
+ [project.urls]
+ Homepage = "https://github.com/yourusername/mon-tokenizer-hf"
+ Repository = "https://github.com/yourusername/mon-tokenizer-hf"
+ Documentation = "https://github.com/yourusername/mon-tokenizer-hf#readme"
+ "Bug Tracker" = "https://github.com/yourusername/mon-tokenizer-hf/issues"
+ "Hugging Face" = "https://huggingface.co/janakhpon/mon_tokenizer"
+
+ [tool.black]
+ line-length = 88
+ target-version = ['py38']
+ include = '\.pyi?$'
+
+ [tool.isort]
+ profile = "black"
+ multi_line_output = 3
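The runtime pins above are what the conversion, test, and upload scripts rely on. A small sanity-check sketch (nothing repo-specific, it only confirms the pinned packages import in the current environment and reports their versions):

```python
# confirm the runtime dependencies from pyproject.toml resolve in the current environment
import huggingface_hub
import sentencepiece
import torch
import transformers

for module in (transformers, torch, sentencepiece, huggingface_hub):
    print(module.__name__, module.__version__)
```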
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
test_tokenizer.py ADDED
@@ -0,0 +1,108 @@
+ #!/usr/bin/env python3
+
+ """
+ test mon tokenizer hugging face integration
+ """
+
+ import torch
+ from transformers import AutoTokenizer, GPT2LMHeadModel, GPT2Config
+
+
+ def test_tokenizer():
+     """test tokenizer loading and basic functionality"""
+     print("testing mon tokenizer")
+
+     try:
+         tokenizer = AutoTokenizer.from_pretrained(".")
+         print(f"tokenizer loaded - vocab: {tokenizer.vocab_size:,}")
+
+         # test tokenization
+         test_texts = [
+             "ဘာသာမန်",
+             "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
+             "မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။"
+         ]
+
+         for text in test_texts:
+             inputs = tokenizer(text, return_tensors="pt")
+             decoded = tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
+
+             print(f"input: '{text}'")
+             print(f"tokens: {inputs['input_ids'].shape}")
+             print(f"decoded: '{decoded}'")
+             print(f"round-trip: {'ok' if text == decoded else 'failed'}")
+             print()
+
+         return True
+
+     except Exception as e:
+         print(f"tokenizer test failed: {e}")
+         return False
+
+
+ def test_model_integration():
+     """test tokenizer with gpt2 model"""
+     print("testing model integration")
+
+     try:
+         tokenizer = AutoTokenizer.from_pretrained(".")
+
+         # create small gpt2 model
+         config = GPT2Config(
+             vocab_size=tokenizer.vocab_size,
+             n_positions=512,
+             n_embd=256,
+             n_layer=4,
+             n_head=4,
+             bos_token_id=tokenizer.bos_token_id,
+             eos_token_id=tokenizer.eos_token_id,
+             pad_token_id=tokenizer.pad_token_id,
+         )
+
+         model = GPT2LMHeadModel(config)
+         print(f"model created - params: {sum(p.numel() for p in model.parameters()):,}")
+
+         # test generation
+         prompt = "ဘာသာမန်"
+         inputs = tokenizer(prompt, return_tensors="pt")
+
+         with torch.no_grad():
+             outputs = model.generate(
+                 **inputs,
+                 max_length=inputs['input_ids'].shape[1] + 10,
+                 do_sample=False,
+                 pad_token_id=tokenizer.pad_token_id
+             )
+
+         generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
+         print(f"generated: '{generated}'")
+
+         return True
+
+     except Exception as e:
+         print(f"model integration test failed: {e}")
+         return False
+
+
+ def main():
+     """run all tests"""
+     print("mon tokenizer test suite")
+
+     tests = [
+         ("tokenizer", test_tokenizer),
+         ("model integration", test_model_integration)
+     ]
+
+     results = []
+     for name, test_func in tests:
+         print(f"\n--- {name} test ---")
+         success = test_func()
+         results.append(success)
+         print(f"{name}: {'passed' if success else 'failed'}")
+
+     print(f"\ntest results: {sum(results)}/{len(results)} passed")
+     return all(results)
+
+
+ if __name__ == "__main__":
+     main()
tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "model_type": "llama",
+   "tokenizer_class": "LlamaTokenizer",
+   "vocab_file": "mon_tokenizer.model",
+   "vocab_size": 4000,
+   "bos_token": "<s>",
+   "eos_token": "</s>",
+   "unk_token": "<unk>",
+   "pad_token": "<pad>",
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "unk_token_id": 0,
+   "pad_token_id": 4000,
+   "clean_up_tokenization_spaces": false,
+   "sp_model_kwargs": {},
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "model_max_length": 2048
+ }
upload_to_hub.py ADDED
@@ -0,0 +1,128 @@
+ #!/usr/bin/env python3
+
+ """
+ upload mon tokenizer to hugging face hub
+ """
+
+ import os
+ from huggingface_hub import HfApi, login
+ from transformers import AutoTokenizer
+
+
+ def validate_tokenizer(directory: str = ".") -> bool:
+     """validate tokenizer before upload"""
+     print("validating tokenizer")
+
+     required_files = [
+         "mon_tokenizer.model",
+         "tokenizer_config.json",
+         "special_tokens_map.json",
+         "README.md"
+     ]
+
+     for file in required_files:
+         if not os.path.exists(os.path.join(directory, file)):
+             print(f"missing required file: {file}")
+             return False
+
+     try:
+         tokenizer = AutoTokenizer.from_pretrained(directory)
+         test_text = "ဘာသာမန်"
+         tokens = tokenizer(test_text, return_tensors="pt")
+         decoded = tokenizer.decode(tokens["input_ids"][0], skip_special_tokens=True)
+
+         if test_text != decoded:
+             print("tokenizer round-trip test failed")
+             return False
+
+         print("validation passed")
+         return True
+
+     except Exception as e:
+         print(f"validation failed: {e}")
+         return False
+
+
+ def upload_to_hub(
+     repo_id: str = "janakhpon/mon_tokenizer",
+     directory: str = ".",
+     private: bool = False,
+     commit_message: str = "upload mon tokenizer"
+ ):
+     """upload tokenizer to hugging face hub"""
+
+     print(f"uploading to {repo_id}")
+
+     # validate first
+     if not validate_tokenizer(directory):
+         print("upload cancelled - validation failed")
+         return False
+
+     try:
+         # login
+         print("logging in to hugging face")
+         login()
+
+         # create api client
+         api = HfApi()
+
+         # create/update repository
+         print(f"creating repository: {repo_id}")
+         api.create_repo(
+             repo_id=repo_id,
+             private=private,
+             exist_ok=True,
+             repo_type="model"
+         )
+
+         # upload files
+         print("uploading files")
+         api.upload_folder(
+             folder_path=directory,
+             repo_id=repo_id,
+             commit_message=commit_message,
+             ignore_patterns=[
+                 "*.pyc",
+                 "__pycache__/",
+                 ".git/",
+                 ".venv/",
+                 "*.lock",
+                 "datasets/"
+             ]
+         )
+
+         print(f"upload successful: https://huggingface.co/{repo_id}")
+         return True
+
+     except Exception as e:
+         print(f"upload failed: {e}")
+         return False
+
+
+ def main():
+     """main upload function"""
+     print("mon tokenizer hub uploader")
+
+     # get repo info
+     repo_id = input("repository id (janakhpon/mon_tokenizer): ").strip()
+     if not repo_id:
+         repo_id = "janakhpon/mon_tokenizer"
+
+     private = input("private repository? (y/n): ").strip().lower() == 'y'
+
+     # upload
+     success = upload_to_hub(
+         repo_id=repo_id,
+         private=private,
+         commit_message="updated mon tokenizer"
+     )
+
+     if success:
+         print("tokenizer successfully uploaded to hugging face hub")
+     else:
+         print("upload failed")
+
+
+ if __name__ == "__main__":
+     main()
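main() collects the repo id and visibility interactively. A minimal sketch of calling the uploader from another script instead (the commit message here is only an example; login() inside upload_to_hub will still prompt for a hugging face token):

```python
from upload_to_hub import upload_to_hub

# programmatic call with the same repo id main() defaults to
uploaded = upload_to_hub(
    repo_id="janakhpon/mon_tokenizer",
    directory=".",
    private=False,
    commit_message="update tokenizer configs",
)
print("uploaded:", uploaded)
```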
uv.lock ADDED
The diff for this file is too large to render. See raw diff