janakhpon commited on
Commit
9ed3203
·
1 Parent(s): bece7ec

feat: simplified mon tokenizer in hf format, updated tags

Browse files
Files changed (1) hide show
  1. README.md +17 -2
README.md CHANGED
@@ -20,11 +20,26 @@ SentencePiece tokenizer for the Mon language with a 4,000-token vocabulary.
20
  ```python
21
  from transformers import AutoTokenizer
22
 
 
23
  tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
24
 
25
- text = "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။"
 
 
 
26
  tokens = tokenizer(text, return_tensors="pt")
27
- decoded = tokenizer.decode(tokens["input_ids"][0])
 
 
 
 
 
 
 
 
 
 
 
28
  ```
29
 
30
  ## details
 
20
  ```python
21
  from transformers import AutoTokenizer
22
 
23
+ # Load the tokenizer
24
  tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
25
 
26
+ # Example text
27
+ text = "ပ္ဍဲအခိင်မာံနဲသဵု မဒှ်ဘဝကွးဘာတက္ကသိုလ်ဂှ် ပါလုပ်ချဳဓရာင်ကၠုင် ပ္ဍဲပရေင်ကမၠောန်ယေန်သၞာင် ကေုာံ လိက်ပတ်မန် ဗွဲကတိုင်ကၟဟ်ရ။"
28
+
29
+ # Tokenize the text
30
  tokens = tokenizer(text, return_tensors="pt")
31
+ input_ids = tokens["input_ids"][0]
32
+
33
+ # Print token IDs
34
+ print("Token IDs:", input_ids.tolist())
35
+
36
+ # Print tokens
37
+ token_list = tokenizer.convert_ids_to_tokens(input_ids)
38
+ print("Tokens:", token_list)
39
+
40
+ # Decode back to text
41
+ decoded = tokenizer.decode(input_ids, skip_special_tokens=True)
42
+ print("Decoded text:", decoded)
43
  ```
44
 
45
  ## details