Added tokenizer with large Sinhala corpus
Browse files- config.json +5 -5
config.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
{
|
| 2 |
"unknown_token": "<|unk|>",
|
| 3 |
"pad_token": "<|pad|>",
|
| 4 |
-
"unknown_token_id":
|
| 5 |
-
"pad_token_id":
|
| 6 |
-
"max_length":
|
| 7 |
-
"end_of_text_token": "<|
|
| 8 |
-
"end_of_text_token_id":
|
| 9 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"unknown_token": "<|unk|>",
|
| 3 |
"pad_token": "<|pad|>",
|
| 4 |
+
"unknown_token_id": 576,
|
| 5 |
+
"pad_token_id": 577,
|
| 6 |
+
"max_length": 10,
|
| 7 |
+
"end_of_text_token": "<|end_of_text|>",
|
| 8 |
+
"end_of_text_token_id": 578
|
| 9 |
}
|