Update README.md
Browse files
README.md
CHANGED
|
@@ -22,8 +22,8 @@ A T5 Tokenizer trained for Amharic language.
|
|
| 22 |
<!-- Provide a longer summary of what this model is. -->
|
| 23 |
|
| 24 |
An MT5Tokenizer-based Amharic and English tokenizer trained using [Fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb) and [Wura](https://huggingface.co/datasets/castorini/wura) datasets.
|
| 25 |
-
|
| 26 |
-
|
| 27 |
|
| 28 |
### MT5 Tokenizer vs AmhT5 Tokenizer
|
| 29 |
|
|
@@ -47,7 +47,7 @@ print(tokens)
|
|
| 47 |
# ['▁A', '▁', 'Token', 'izer', '▁train', 'ed', '▁for', '▁Am', 'haric', '▁language', '.']
|
| 48 |
|
| 49 |
|
| 50 |
-
amhT5 = "yonas/
|
| 51 |
TOKENIZER = MT5TokenizerFast.from_pretrained(amhT5, legacy=False)
|
| 52 |
tokens = TOKENIZER.tokenize("ከመዲናዋ በቅርብ ርቀት ላይ በምትገኘው ከተማ")
|
| 53 |
|
|
@@ -60,4 +60,5 @@ tokens = TOKENIZER.tokenize("A Tokenizer trained for Amharic language.")
|
|
| 60 |
|
| 61 |
print(len(tokens)) # 7
|
| 62 |
print(tokens)
|
| 63 |
-
# ['▁A', '▁Token', 'izer', '▁trained', '▁for', '▁Amharic', '▁language.']
|
|
|
|
|
|
| 22 |
<!-- Provide a longer summary of what this model is. -->
|
| 23 |
|
| 24 |
An MT5Tokenizer-based Amharic and English tokenizer trained using [Fineweb](https://huggingface.co/datasets/HuggingFaceFW/fineweb) and [Wura](https://huggingface.co/datasets/castorini/wura) datasets.
|
| 25 |
+
This tokenizer aims to represent Amharic more effectively while retaining strong coverage of English.
|
| 26 |
+
To balance the dataset, only 3 million document samples were used for training. The vocabulary size of this tokenizer is the same as `google/mt5-small`.
|
| 27 |
|
| 28 |
### MT5 Tokenizer vs AmhT5 Tokenizer
|
| 29 |
|
|
|
|
| 47 |
# ['▁A', '▁', 'Token', 'izer', '▁train', 'ed', '▁for', '▁Am', 'haric', '▁language', '.']
|
| 48 |
|
| 49 |
|
| 50 |
+
amhT5 = "yonas/AmhT5-tokenizer"
|
| 51 |
TOKENIZER = MT5TokenizerFast.from_pretrained(amhT5, legacy=False)
|
| 52 |
tokens = TOKENIZER.tokenize("ከመዲናዋ በቅርብ ርቀት ላይ በምትገኘው ከተማ")
|
| 53 |
|
|
|
|
| 60 |
|
| 61 |
print(len(tokens)) # 7
|
| 62 |
print(tokens)
|
| 63 |
+
# ['▁A', '▁Token', 'izer', '▁trained', '▁for', '▁Amharic', '▁language.']
|
| 64 |
+
```
|