fix [grouped_entities] output
Browse files
README.md
CHANGED
|
@@ -33,6 +33,7 @@ Install transformers AND nltk (python >= 3.6)
|
|
| 33 |
|
| 34 |
```python
|
| 35 |
# we need to download NLTK's punkt model, used for word tokenization
|
|
|
|
| 36 |
import nltk
|
| 37 |
nltk.download('punkt')
|
| 38 |
from nltk.tokenize import word_tokenize
|
|
@@ -55,8 +56,35 @@ example = " ".join(word_tokenize(example))
|
|
| 55 |
# feed to the NER model to parse
|
| 56 |
ner_results = nlp(example)
|
| 57 |
|
| 58 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
for ent in ner_results:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
print(ent["word"], '->' ,ent['entity_group'], " # score:", "%.2f" % ent['score'])
|
| 61 |
|
| 62 |
#####
|
|
|
|
| 33 |
|
| 34 |
```python
|
| 35 |
# we need to install NLTK punkt to be used for word tokenization
|
| 36 |
+
from collections import defaultdict
|
| 37 |
import nltk
|
| 38 |
nltk.download('punkt')
|
| 39 |
from nltk.tokenize import word_tokenize
|
|
|
|
| 56 |
# feed to the NER model to parse
|
| 57 |
ner_results = nlp(example)
|
| 58 |
|
| 59 |
+
# as the [grouped_entities] parameter does not perform well for Arabic,
|
| 60 |
+
# we prepared a simple fixing code to generate the full entities tokens
|
| 61 |
+
|
| 62 |
+
grouped_ner_results = defaultdict(list)
|
| 63 |
+
fixed_ner_results = []
|
| 64 |
for ent in ner_results:
|
| 65 |
+
grouped_ner_results[ent['entity_group']].append(ent)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
for group, ents in grouped_ner_results.items():
|
| 69 |
+
if len(ents) == 1:
|
| 70 |
+
fixed_ner_results.append(ents[0])
|
| 71 |
+
continue
|
| 72 |
+
|
| 73 |
+
last_ent, last_start, last_end = ents[0]['word'], ents[0]['start'], ents[0]['end']
|
| 74 |
+
current_ent = {"word": ents[0]['word'], "start": ents[0]['start'], "end": ents[0]['end'], "entity_group": group, "score": ents[0]['score']}
|
| 75 |
+
for i in range(1, len(ents)):
|
| 76 |
+
if ents[i]['start'] == current_ent["end"]:
|
| 77 |
+
current_ent["word"] += ents[i]['word']
|
| 78 |
+
current_ent["end"] = ents[i]['end']
|
| 79 |
+
current_ent["score"] = max(ents[i]['score'], current_ent["score"])
|
| 80 |
+
else:
|
| 81 |
+
fixed_ner_results.append(current_ent)
|
| 82 |
+
current_ent = {"word": ents[i]['word'], "start": ents[i]['start'], "end": ents[i]['end'], "entity_group": group, "score": ents[i]['score']}
|
| 83 |
+
|
| 84 |
+
fixed_ner_results.append(current_ent)
|
| 85 |
+
|
| 86 |
+
# ===== print the ner_results
|
| 87 |
+
for ent in fixed_ner_results:
|
| 88 |
print(ent["word"], '->' ,ent['entity_group'], " # score:", "%.2f" % ent['score'])
|
| 89 |
|
| 90 |
#####
|