Update README.md
Browse files
README.md
CHANGED
|
@@ -1,8 +1,26 @@
|
|
| 1 |
---
|
|
|
|
|
|
|
|
|
|
| 2 |
license: mit
|
|
|
|
|
|
|
| 3 |
---
|
| 4 |
# Arabic NER
|
| 5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
```python
|
| 7 |
>>> from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
|
| 8 |
|
|
@@ -17,4 +35,15 @@ license: mit
|
|
| 17 |
>>> output = ner_pip('قال وزير العدل التركي بكير بوزداغ إن أنقرة تريد 12 مشتبهاً بهم من فنلندا و 21 من السويد')
|
| 18 |
>>> print(output)
|
| 19 |
[{'entity_group': 'PER', 'score': 0.9996214, 'word': 'وزير', 'start': 4, 'end': 8}, {'entity_group': 'ORG', 'score': 0.9952383, 'word': 'العدل', 'start': 9, 'end': 14}, {'entity_group': 'GPE', 'score': 0.9996675, 'word': 'التركي', 'start': 15, 'end': 21}, {'entity_group': 'PER', 'score': 0.9978992, 'word': 'بكير بوزداغ', 'start': 22, 'end': 33}, {'entity_group': 'GPE', 'score': 0.9997154, 'word': 'انقرة', 'start': 37, 'end': 42}, {'entity_group': 'PER', 'score': 0.9946885, 'word': 'مشتبها بهم', 'start': 51, 'end': 62}, {'entity_group': 'GPE', 'score': 0.99967396, 'word': 'فنلندا', 'start': 66, 'end': 72}, {'entity_group': 'PER', 'score': 0.99694425, 'word': '21', 'start': 75, 'end': 77}, {'entity_group': 'GPE', 'score': 0.99963355, 'word': 'السويد', 'start': 81, 'end': 87}]
|
| 20 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
language:
- en
- ar
|
| 3 |
+
tags:
|
| 4 |
+
- exbert
|
| 5 |
license: mit
|
| 6 |
+
datasets:
|
| 7 |
+
- ACE2005
|
| 8 |
---
|
| 9 |
# Arabic NER
|
| 10 |
|
| 11 |
+
### Model
|
| 12 |
+
[GigaBERTv4](https://huggingface.co/lanwuwei/GigaBERT-v4-Arabic-and-English)
|
| 13 |
+
|
| 14 |
+
### Hyperparameters
|
| 15 |
+
- learning_rate=2e-5
|
| 16 |
+
- num_train_epochs=10
|
| 17 |
+
- weight_decay=0.01
|
| 18 |
+
### ACE2005 Evaluation results
|
| 19 |
+
| Language | Arabic | English |
|
| 20 |
+
|:----:|:-----------:|:----:|
|
| 21 |
+
| | 89.4 | 88.8 |
|
| 22 |
+
|
| 23 |
+
### How to use
|
| 24 |
```python
|
| 25 |
>>> from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
|
| 26 |
|
|
|
|
| 35 |
>>> output = ner_pip('قال وزير العدل التركي بكير بوزداغ إن أنقرة تريد 12 مشتبهاً بهم من فنلندا و 21 من السويد')
|
| 36 |
>>> print(output)
|
| 37 |
[{'entity_group': 'PER', 'score': 0.9996214, 'word': 'وزير', 'start': 4, 'end': 8}, {'entity_group': 'ORG', 'score': 0.9952383, 'word': 'العدل', 'start': 9, 'end': 14}, {'entity_group': 'GPE', 'score': 0.9996675, 'word': 'التركي', 'start': 15, 'end': 21}, {'entity_group': 'PER', 'score': 0.9978992, 'word': 'بكير بوزداغ', 'start': 22, 'end': 33}, {'entity_group': 'GPE', 'score': 0.9997154, 'word': 'انقرة', 'start': 37, 'end': 42}, {'entity_group': 'PER', 'score': 0.9946885, 'word': 'مشتبها بهم', 'start': 51, 'end': 62}, {'entity_group': 'GPE', 'score': 0.99967396, 'word': 'فنلندا', 'start': 66, 'end': 72}, {'entity_group': 'PER', 'score': 0.99694425, 'word': '21', 'start': 75, 'end': 77}, {'entity_group': 'GPE', 'score': 0.99963355, 'word': 'السويد', 'start': 81, 'end': 87}]
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
### BibTeX entry and citation info
|
| 41 |
+
|
| 42 |
+
```bibtex
|
| 43 |
+
@inproceedings{lan2020gigabert,
|
| 44 |
+
author = {Lan, Wuwei and Chen, Yang and Xu, Wei and Ritter, Alan},
|
| 45 |
+
title = {Giga{BERT}: Zero-shot Transfer Learning from {E}nglish to {A}rabic},
|
| 46 |
+
booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
|
| 47 |
+
year = {2020}
|
| 48 |
+
}
|
| 49 |
+
```
|