deshanksuman committed on
Commit
2bb431c
·
verified ·
1 Parent(s): 2681e31

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +13 -5
README.md CHANGED
@@ -25,13 +25,21 @@ This tokenizer is specifically trained for Romanized Sinhala text (Sinhala writt
25
  ```python
26
  from transformers import PreTrainedTokenizerFast
27
 
28
- tokenizer = PreTrainedTokenizerFast.from_pretrained("deshanksuman/romanized-sinhala-tokenizer")
29
 
30
- # Set language for encoding
31
- tokenizer.src_lang = "si_rom"
 
 
 
 
 
 
 
 
 
 
 
32
 
33
- # Encode text
34
- encoded = tokenizer("Romanized Sinhala text goes here", return_tensors="pt")
35
  ```
36
 
37
  ## Citation
 
25
  ```python
26
  from transformers import PreTrainedTokenizerFast
27
 
 
28
 
29
+ from transformers import PreTrainedTokenizerFast
30
+
31
+ tokenizer = PreTrainedTokenizerFast.from_pretrained(
32
+ "deshanksuman/romanized-sinhala-tokenizer",
33
+ token="hf Token"
34
+ )
35
+
36
+ # Just tokenize and get tensors
37
+ encoded = tokenizer("api ada mkda krnne", return_tensors="pt")
38
+ print(encoded)
39
+
40
+ # To see tokens in text form
41
+ print(tokenizer.convert_ids_to_tokens(encoded["input_ids"][0]))
42
 
 
 
43
  ```
44
 
45
  ## Citation