Upload 11 files
- .gitattributes +1 -0
- swa1_model/README.Rmd +39 -0
- swa1_model/config.json +64 -0
- swa1_model/eval_results.txt +4 -0
- swa1_model/model.safetensors +3 -0
- swa1_model/sentencepiece.bpe.model +3 -0
- swa1_model/special_tokens_map.json +51 -0
- swa1_model/test_predictions.txt +0 -0
- swa1_model/test_results.txt +4 -0
- swa1_model/tokenizer.json +3 -0
- swa1_model/tokenizer_config.json +55 -0
- swa1_model/training_args.bin +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+swa1_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
swa1_model/README.Rmd
ADDED
@@ -0,0 +1,39 @@
+---
+title: "README"
+author: "Kevine Grace"
+date: "2025-04-05"
+output: html_document
+---
+
+---
+language: multilingual
+tags:
+- pos-tagging
+- afro-xlmr
+- token-classification
+license: mit
+---
+
+# Afro-XLM-Roberta Mini POS Tagger
+
+This model was fine-tuned for part-of-speech tagging from the `Davlan/afro-xlmr-mini` base model.
+It supports token classification for low-resource African languages.
+
+## Usage
+
+```python
+from transformers import AutoTokenizer, AutoModelForTokenClassification
+
+tokenizer = AutoTokenizer.from_pretrained("your-username/afroxlmr-pos")
+model = AutoModelForTokenClassification.from_pretrained("your-username/afroxlmr-pos")
+```
+
+We fine-tuned the pretrained model with a different architecture and obtained higher precision, recall, and F1 scores. For training and development we split our own data into train and dev sets, and we used their test set to obtain the final performance.
+
+## Dataset
+
+Our own data: https://github.com/hausanlp/HERDPhobia
+Their data: https://github.com/masakhane-io/masakhane-pos
+
+
+
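Continuing the README's usage snippet, here is a minimal inference sketch. The Swahili sentence is an invented example, `your-username/afroxlmr-pos` remains the README's placeholder repo id, and taking each word's first sub-token prediction is a common convention for token classification rather than anything this card specifies:

```python
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Placeholder repo id carried over from the README snippet above.
tokenizer = AutoTokenizer.from_pretrained("your-username/afroxlmr-pos")
model = AutoModelForTokenClassification.from_pretrained("your-username/afroxlmr-pos")

# Hypothetical pre-tokenized example sentence.
words = ["Mtoto", "anacheza", "mpira", "."]

# Tokenize pre-split words so sub-tokens can be mapped back to words
# (requires a fast tokenizer, which AutoTokenizer loads here by default).
enc = tokenizer(words, is_split_into_words=True, return_tensors="pt")
with torch.no_grad():
    logits = model(**enc).logits  # shape: (1, seq_len, num_labels)

pred_ids = logits.argmax(dim=-1)[0].tolist()
word_ids = enc.word_ids(batch_index=0)

# Keep the tag predicted for each word's first sub-token.
tags, seen = [], set()
for i, wid in enumerate(word_ids):
    if wid is not None and wid not in seen:
        seen.add(wid)
        tags.append(model.config.id2label[pred_ids[i]])

print(list(zip(words, tags)))
```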
swa1_model/config.json
ADDED
@@ -0,0 +1,64 @@
+{
+  "architectures": [
+    "XLMRobertaForTokenClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 384,
+  "id2label": {
+    "0": "X",
+    "1": "ADJ",
+    "10": "PART",
+    "11": "PRON",
+    "12": "PROPN",
+    "13": "PUNCT",
+    "14": "SCONJ",
+    "15": "SYM",
+    "16": "VERB",
+    "2": "ADP",
+    "3": "ADV",
+    "4": "AUX",
+    "5": "CCONJ",
+    "6": "DET",
+    "7": "INTJ",
+    "8": "NOUN",
+    "9": "NUM"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 1536,
+  "label2id": {
+    "ADJ": 1,
+    "ADP": 2,
+    "ADV": 3,
+    "AUX": 4,
+    "CCONJ": 5,
+    "DET": 6,
+    "INTJ": 7,
+    "NOUN": 8,
+    "NUM": 9,
+    "PART": 10,
+    "PRON": 11,
+    "PROPN": 12,
+    "PUNCT": 13,
+    "SCONJ": 14,
+    "SYM": 15,
+    "VERB": 16,
+    "X": 0
+  },
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "xlm-roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.50.3",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 250002
+}
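This config defines a 17-tag scheme: the universal POS tags plus `X`. A quick way to inspect the tag set, as a sketch reusing the README's placeholder repo id (note that `transformers` converts the JSON's string keys to integers on load):

```python
from transformers import AutoConfig

# Placeholder repo id from the README above.
config = AutoConfig.from_pretrained("your-username/afroxlmr-pos")

# num_labels is derived from id2label; keys arrive as ints after loading.
print([config.id2label[i] for i in range(config.num_labels)])
# -> ['X', 'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ',
#     'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB']
```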
swa1_model/eval_results.txt
ADDED
@@ -0,0 +1,4 @@
+f1 = 0.9117867305676202
+loss = 1.2744013667106628
+precision = 0.9149383829275624
+recall = 0.9086567164179105
swa1_model/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ddd2cf63282268d2b8eda52fad6bedb2e026388954ead827c97611a1163642f
+size 470021564
swa1_model/sentencepiece.bpe.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+size 5069051
swa1_model/special_tokens_map.json
ADDED
@@ -0,0 +1,51 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
swa1_model/test_predictions.txt
ADDED
The diff for this file is too large to render; see the raw file.
swa1_model/test_results.txt
ADDED
@@ -0,0 +1,4 @@
+f1 = 0.9227457031109836
+loss = 1.2530506451924641
+precision = 0.9243045072244986
+recall = 0.921192147872188
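The repo does not state how the precision/recall/F1 figures in eval_results.txt and test_results.txt were computed. One plausible reading for POS tagging is tag-level scores averaged over the label set; a purely illustrative sketch of a macro-averaged token-level computation with scikit-learn (toy data, not the authors' evaluation code):

```python
from sklearn.metrics import precision_recall_fscore_support

# Toy gold and predicted tag sequences, flattened to token level.
y_true = ["NOUN", "VERB", "NOUN", "PUNCT"]
y_pred = ["NOUN", "VERB", "ADJ", "PUNCT"]

# Macro averaging scores each tag separately, so precision and recall
# can differ, as they do in the result files above.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average="macro", zero_division=0
)
print(f"precision = {precision}\nrecall = {recall}\nf1 = {f1}")
```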
swa1_model/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a56def25aa40facc030ea8b0b87f3688e4b3c39eb8b45d5702b3a1300fe2a20
+size 17082734
swa1_model/tokenizer_config.json
ADDED
@@ -0,0 +1,55 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}
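A small consistency check tying this file to config.json above, as a sketch with the same placeholder repo id: the `<mask>` entry at id 250001 should occupy the last slot of the 250002-token vocabulary.

```python
from transformers import AutoTokenizer

# Placeholder repo id from the README above.
tok = AutoTokenizer.from_pretrained("your-username/afroxlmr-pos")

assert tok.mask_token_id == 250001  # matches "added_tokens_decoder" above
assert len(tok) == 250002           # matches "vocab_size" in config.json
print(tok.cls_token, tok.sep_token, tok.pad_token, tok.unk_token)
```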
swa1_model/training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10825191ad43743c4e1dd1c758b41c663be2cb10428b3d65f0badbc95c5b13ff
+size 1976