Update README.md
Browse files
README.md
CHANGED
|
@@ -1,6 +1,10 @@
|
|
| 1 |
---
|
| 2 |
library_name: transformers
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
---
|
| 5 |
|
| 6 |
# A monolingual tokenizer for Azerbaijani trained on `azj_Latn` subset of FineWeb-2 corpus.
|
|
@@ -32,4 +36,4 @@ tags: []
|
|
| 32 |
pages = "18--28",
|
| 33 |
abstract = "The emergence of multilingual large language models has enabled the development of language understanding and generation systems in Azerbaijani. However, most of the production-grade systems rely on cloud solutions, such as GPT-4. While there have been several attempts to develop open foundation models for Azerbaijani, these works have not found their way into common use due to a lack of systemic benchmarking. This paper encompasses several lines of work that promote open-source foundation models for Azerbaijani. We introduce (1) a large text corpus for Azerbaijani, (2) a family of encoder-only language models trained on this dataset, (3) labeled datasets for evaluating these models, and (4) extensive evaluation that covers all major open-source models with Azerbaijani support."
|
| 34 |
}
|
| 35 |
-
```
|
|
|
|
| 1 |
---
|
| 2 |
library_name: transformers
|
| 3 |
+
license: apache-2.0
|
| 4 |
+
datasets:
|
| 5 |
+
- HuggingFaceFW/fineweb-2
|
| 6 |
+
language:
|
| 7 |
+
- az
|
| 8 |
---
|
| 9 |
|
| 10 |
# A monolingual tokenizer for Azerbaijani trained on `azj_Latn` subset of FineWeb-2 corpus.
|
|
|
|
| 36 |
pages = "18--28",
|
| 37 |
abstract = "The emergence of multilingual large language models has enabled the development of language understanding and generation systems in Azerbaijani. However, most of the production-grade systems rely on cloud solutions, such as GPT-4. While there have been several attempts to develop open foundation models for Azerbaijani, these works have not found their way into common use due to a lack of systemic benchmarking. This paper encompasses several lines of work that promote open-source foundation models for Azerbaijani. We introduce (1) a large text corpus for Azerbaijani, (2) a family of encoder-only language models trained on this dataset, (3) labeled datasets for evaluating these models, and (4) extensive evaluation that covers all major open-source models with Azerbaijani support."
|
| 38 |
}
|
| 39 |
+
```
|