fix [grouped_entities] output
Browse files
README.md
CHANGED
|
@@ -33,6 +33,7 @@ Install transformers AND nltk (python >= 3.6)
|
|
| 33 |
|
| 34 |
```python
|
| 35 |
# we need to download NLTK's punkt model, used for word tokenization
|
|
|
|
| 36 |
import nltk
|
| 37 |
nltk.download('punkt')
|
| 38 |
from nltk.tokenize import word_tokenize
|
|
@@ -55,8 +56,35 @@ example = " ".join(word_tokenize(example))
|
|
| 55 |
# feed to the NER model to parse
|
| 56 |
ner_results = nlp(example)
|
| 57 |
|
| 58 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
for ent in ner_results:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
print(ent["word"], '->' ,ent['entity_group'], " # score:", "%.2f" % ent['score'])
|
| 61 |
|
| 62 |
#####
|
|
|
|
| 33 |
|
| 34 |
```python
|
| 35 |
# we need to install NLTK punkt to be used for word tokenization
|
| 36 |
+
from collections import defaultdict
|
| 37 |
import nltk
|
| 38 |
nltk.download('punkt')
|
| 39 |
from nltk.tokenize import word_tokenize
|
|
|
|
| 56 |
# feed to the NER model to parse
|
| 57 |
ner_results = nlp(example)
|
| 58 |
|
| 59 |
+
# as the [grouped_entities] parameter does not perform well for Arabic,
|
| 60 |
+
# we prepared a simple fixing code to generate the full entities tokens
|
| 61 |
+
|
| 62 |
+
grouped_ner_results = defaultdict(list)
|
| 63 |
+
fixed_ner_results = []
|
| 64 |
for ent in ner_results:
|
| 65 |
+
grouped_ner_results[ent['entity_group']].append(ent)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
for group, ents in grouped_ner_results.items():
|
| 69 |
+
if len(ents) == 1:
|
| 70 |
+
fixed_ner_results.append(ents[0])
|
| 71 |
+
continue
|
| 72 |
+
|
| 73 |
+
last_ent, last_start, last_end = ents[0]['word'], ents[0]['start'], ents[0]['end']
|
| 74 |
+
current_ent = {"word": ents[0]['word'], "start": ents[0]['start'], "end": ents[0]['end'], "entity_group": group, "score": ents[0]['score']}
|
| 75 |
+
for i in range(1, len(ents)):
|
| 76 |
+
if ents[i]['start'] == current_ent["end"]:
|
| 77 |
+
current_ent["word"] += ents[i]['word']
|
| 78 |
+
current_ent["end"] = ents[i]['end']
|
| 79 |
+
current_ent["score"] = max(ents[i]['score'], current_ent["score"])
|
| 80 |
+
else:
|
| 81 |
+
fixed_ner_results.append(current_ent)
|
| 82 |
+
current_ent = {"word": ents[i]['word'], "start": ents[i]['start'], "end": ents[i]['end'], "entity_group": group, "score": ents[i]['score']}
|
| 83 |
+
|
| 84 |
+
fixed_ner_results.append(current_ent)
|
| 85 |
+
|
| 86 |
+
# ===== print the ner_results
|
| 87 |
+
for ent in fixed_ner_results:
|
| 88 |
print(ent["word"], '->' ,ent['entity_group'], " # score:", "%.2f" % ent['score'])
|
| 89 |
|
| 90 |
#####
|