Update README.md
Browse files
README.md
CHANGED
|
@@ -52,6 +52,22 @@ Fine-tuned [myanmar-pos-model](https://huggingface.co/chuuhtetnaing/myanmar-pos-
|
|
| 29 | 0.0274 | 0.0837 | 0.8855 | 0.9272 | 0.9058 | 0.9804 |
| 30 | 0.0271 | 0.0832 | 0.8875 | 0.9267 | 0.9067 | 0.9806 |

## Training Details

| Parameter | Value |
@@ -71,6 +87,70 @@ result = ner("ကိုမောင်သည်ရန်ကုန်မြို
|
|
print(result)
```

## NER Labels

| Tag | Description |
| 29 | 0.0274 | 0.0837 | 0.8855 | 0.9272 | 0.9058 | 0.9804 |
| 30 | 0.0271 | 0.0832 | 0.8875 | 0.9267 | 0.9067 | 0.9806 |

## Test Set Evaluation

Evaluated on [myanmar-ner-dataset](https://huggingface.co/datasets/chuuhtetnaing/myanmar-ner-dataset) test split using seqeval metrics:

| Entity | Precision | Recall | F1-Score | Support |
|--------|-----------|--------|----------|---------|
| DATE | 0.80 | 0.86 | 0.83 | 251 |
| LOC | 0.93 | 0.96 | 0.95 | 2712 |
| NUM | 0.89 | 0.92 | 0.90 | 789 |
| ORG | 0.44 | 0.62 | 0.52 | 94 |
| PER | 0.84 | 0.88 | 0.86 | 533 |
| TIME | 0.62 | 0.70 | 0.66 | 57 |
| **micro avg** | **0.89** | **0.93** | **0.91** | 4436 |
| **macro avg** | 0.75 | 0.82 | 0.78 | 4436 |
| **weighted avg** | **0.89** | **0.93** | **0.91** | 4436 |
## Training Details

| Parameter | Value |
print(result)
```

## Evaluation Code

```python
!pip install seqeval
|
| 94 |
+
|
| 95 |
+
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
|
| 96 |
+
from datasets import load_dataset
|
| 97 |
+
from tqdm import tqdm
|
| 98 |
+
from seqeval.metrics import classification_report
|
| 99 |
+
|
| 100 |
+
# Load model and tokenizer
|
| 101 |
+
model = AutoModelForTokenClassification.from_pretrained("chuuhtetnaing/myanmar-ner-model")
|
| 102 |
+
tokenizer = AutoTokenizer.from_pretrained("chuuhtetnaing/myanmar-ner-model")
|
| 103 |
+
|
| 104 |
+
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level NER tags to subword tokens.

    Positions that do not correspond to the first subword of a word (special
    tokens and continuation subwords) receive the label -100 — the HF
    convention for "ignore this position" — so the evaluation below can
    filter them out.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens (CLS/SEP/pad) map to no source word.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First subword of a word carries the word's tag.
                label_ids.append(label[word_idx])
            else:
                # Later subwords of the same word are masked out.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
# Load and tokenize the dataset, then evaluate the pipeline on the test split.
ner = pipeline("token-classification", model="chuuhtetnaing/myanmar-ner-model", aggregation_strategy=None)
ds = load_dataset("chuuhtetnaing/myanmar-ner-dataset")
tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)
test_ds = tokenized_ds["test"]

# Mapping from label id to tag string, taken from the model config.
label_list = model.config.id2label

y_true = []
y_pred = []

for example in tqdm(test_ds):
    true_labels = [label_list[l] if l != -100 else "O" for l in example["labels"]]

    # NOTE(review): decoding the input ids and re-tokenizing inside the
    # pipeline may not reproduce the original tokenization exactly, so the
    # pred["index"] alignment below is approximate — verify on a few samples.
    text = tokenizer.decode(example["input_ids"], skip_special_tokens=True)
    preds = ner(text)

    # Scatter pipeline predictions onto token positions; default to "O".
    pred_labels = ["O"] * len(true_labels)
    for pred in preds:
        idx = pred["index"]
        if idx < len(pred_labels):
            pred_labels[idx] = pred["entity"]

    # Keep only positions with a real word-level label (drop the -100 fillers).
    y_true.append([label_list[l] for l in example["labels"] if l != -100])
    y_pred.append([p for p, l in zip(pred_labels, example["labels"]) if l != -100])

print(classification_report(y_true, y_pred))
```

## NER Labels

| Tag | Description |