marefa-nlp
/

marefa-ner

@@ -1,3 +1,4 @@
 ---
 language: ar
 datasets:
@@ -33,39 +34,98 @@ Person, Location, Organization, Nationality, Job, Product, Event, Time, Art-Work
 Install the following Python packages
-`$ pip3 install transformers==4.7.0 nltk==3.5 protobuf==3.15.3 torch==1.7.1`
 > If you are using `Google Colab`, please restart your runtime after installing the packages.
 -----------
 ```python
 import logging
 import re
 import nltk
 nltk.download('punkt')
-from nltk.tokenize import word_tokenize
 # disable INFO Logs
 transformers_logger = logging.getLogger("transformers")
 transformers_logger.setLevel(logging.WARNING)
 custom_labels = ["O", "B-job", "I-job", "B-nationality", "B-person", "I-person", "B-location",
                  "B-time", "I-time", "B-event", "I-event", "B-organization", "I-organization",
                  "I-location", "I-nationality", "B-product", "I-product", "B-artwork", "I-artwork"]
-from transformers import AutoTokenizer, AutoModelForTokenClassification
-from transformers import pipeline
-# ===== import the model
 m_name = "marefa-nlp/marefa-ner"
 tokenizer = AutoTokenizer.from_pretrained(m_name)
 model = AutoModelForTokenClassification.from_pretrained(m_name)
 ar_ner = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True, aggregation_strategy="simple")
-# Model Inference
 samples = [
     "تلقى تعليمه في الكتاب ثم انضم الى الأزهر عام 1873م. تعلم على يد السيد جمال الدين الأفغاني والشيخ محمد عبده",
     "بعد عودته إلى القاهرة، التحق نجيب الريحاني فرقة جورج أبيض، الذي كان قد ضمَّ - قُبيل ذلك - فرقته إلى فرقة سلامة حجازي . و منها ذاع صيته",
@@ -73,44 +133,57 @@ samples = [
     "Government extends flight ban from India and Pakistan until June 21"
 ]
-# Preprocess
 samples = [ " ".join(word_tokenize(sample.strip())) for sample in samples if sample.strip() != "" ]
 for sample in samples:
-    results = ar_ner(sample)
-    print(sample)
-    for result in results:
-        print("\t", result["word"], "=>", result["entity_group"])
     print("=========\n")
-###
-# تلقى تعليمه في الكتاب ثم انضم الى الأزهر عام 1873م . تعلم على يد السيد جمال الدين الأفغاني والشيخ محمد عبده
-# 	 الأزهر => organization
-# 	 عام 1873م => time
-# 	 جمال الدين الأفغاني => person
-# 	 محمد عبده => person
-# =========
-# بعد عودته إلى القاهرة، التحق نجيب الريحاني فرقة جورج أبيض، الذي كان قد ضمَّ - قُبيل ذلك - فرقته إلى فرقة سلامة حجازي . و منها ذاع صيته
-# 	 القاهرة => location
-# 	 نجيب الريحاني => person
-# 	 فرقة جورج أبيض => organization
-# 	 فرقة سلامة حجازي => organization
-# =========
-# امبارح اتفرجت على مباراة مانشستر يونايتد مع ريال مدريد في غياب الدون كرستيانو رونالدو
-# 	 مانشستر يونايتد => organization
-# 	 ريال مدريد => organization
-# 	 كرستيانو رونالدو => person
-# =========
-# Government extends flight ban from India and Pakistan until June 21
-# 	 India => location
-# 	 Pakistan => location
-# 	 June 21 => time
-# =========
-###
 ```
 ## Fine-Tuning

 ---
 language: ar
 datasets:
 Install the following Python packages
+`$ pip3 install transformers==4.8.0 nltk==3.5 protobuf==3.15.3 torch==1.9.0 `
 > If you are using `Google Colab`, please restart your runtime after installing the packages.
+[**OPTIONAL**]
+Using an Arabic segmentation tool has proved to give better results in many scenarios. If you want to use `FarasaPy` to segment the texts, please ensure that you have `openjdk-11` installed on your machine, then install the package via:
+```bash
+# install openjdk-11-jdk
+$ apt-get install -y build-essential
+$ apt-get install -y openjdk-11-jdk
+# install FarasaPy
+$ pip3 install farasapy==0.0.13
+```
+*Do not forget to set `USE_FARASAPY` to `True` in the following code*
+Also, you can set `USE_SENTENCE_TOKENIZER` to `True` to get better results for long texts.
 -----------
 ```python
+# ==== Set configurations
+# do you want to use FarasaPy Segmentation tool ?
+USE_FARASAPY = False # set to True to use it
+# do you want to split text into sentences [better for long texts] ?
+USE_SENTENCE_TOKENIZER = False # set to True to use it
+# ==== Import required modules
 import logging
 import re
 import nltk
 nltk.download('punkt')
+from nltk.tokenize import word_tokenize, sent_tokenize
+from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 # disable INFO Logs
 transformers_logger = logging.getLogger("transformers")
 transformers_logger.setLevel(logging.WARNING)
+def _extract_ner(sent: str, ner: pipeline) -> str:
+    grouped_ents = []
+    current_ent = {}
+    results = ner(sent)
+    for ent in results:
+        if len(current_ent) == 0:
+            current_ent = ent
+            continue
+        if current_ent["end"] == ent["start"] and current_ent["entity_group"] == ent["entity_group"]:
+            current_ent["word"] = current_ent["word"]+ent["word"]
+        else:
+            grouped_ents.append(current_ent)
+            current_ent = ent
+    if len(grouped_ents) > 0 and grouped_ents[-1] != ent:
+        grouped_ents.append(current_ent)
+    elif len(grouped_ents) == 0 and len(current_ent) > 0:
+        grouped_ents.append(current_ent)
+    return [ g for g in grouped_ents if len(g["word"].strip()) ]
+if USE_FARASAPY:
+	from farasa.segmenter import FarasaSegmenter
+	segmenter = FarasaSegmenter()
+	def _segment_text(text: str, segmenter: FarasaSegmenter) -> str:
+	    segmented = segmenter.segment(text)
+	    f_segments = { w.replace("+",""): w.replace("و+","و ").replace("+","") for w in segmented.split(" ") if w.strip() != "" and w.startswith("و+") }
+	    for s,t in f_segments.items():
+	        text = text.replace(s, t)
+	    return text
+	_ = _segment_text("نص تجريبي للتأكد من عمل الأداة", segmenter)
 custom_labels = ["O", "B-job", "I-job", "B-nationality", "B-person", "I-person", "B-location",
                  "B-time", "I-time", "B-event", "I-event", "B-organization", "I-organization",
                  "I-location", "I-nationality", "B-product", "I-product", "B-artwork", "I-artwork"]
+# ==== Import/Download the NER Model
 m_name = "marefa-nlp/marefa-ner"
 tokenizer = AutoTokenizer.from_pretrained(m_name)
 model = AutoModelForTokenClassification.from_pretrained(m_name)
 ar_ner = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True, aggregation_strategy="simple")
+# ==== Model Inference
 samples = [
     "تلقى تعليمه في الكتاب ثم انضم الى الأزهر عام 1873م. تعلم على يد السيد جمال الدين الأفغاني والشيخ محمد عبده",
     "بعد عودته إلى القاهرة، التحق نجيب الريحاني فرقة جورج أبيض، الذي كان قد ضمَّ - قُبيل ذلك - فرقته إلى فرقة سلامة حجازي . و منها ذاع صيته",
     "Government extends flight ban from India and Pakistan until June 21"
 ]
+# [optional]
 samples = [ " ".join(word_tokenize(sample.strip())) for sample in samples if sample.strip() != "" ]
 for sample in samples:
+    # entities found in the current sample
+    ents = []
+    if USE_FARASAPY:
+        # optional pre-processing of the text with the Farasa segmenter
+        sample = _segment_text(sample, segmenter)
+    if USE_SENTENCE_TOKENIZER:
+        # tag every sentence on its own and concatenate the per-sentence results
+        for sent in sent_tokenize(sample):
+            ents += _extract_ner(sent, ar_ner)
+    else:
+        # tag the whole sample in a single call
+        ents = _extract_ner(sample, ar_ner)
+    # print the results
+    print("(", sample, ")")
+    for ent in ents:
+        print("\t", ent["word"], "=>", ent["entity_group"])
     print("=========\n")
+```
+Output
+```
+( تلقى تعليمه في الكتاب ثم انضم الى الأزهر عام 1873م . تعلم على يد السيد جمال الدين الأفغاني والشيخ محمد عبده )
+	 الأزهر => organization
+	 عام 1873م => time
+	 جمال الدين الأفغاني => person
+	 محمد عبده => person
+=========
+( بعد عودته إلى القاهرة، التحق نجيب الريحاني فرقة جورج أبيض، الذي كان قد ضمَّ - قُبيل ذلك - فرقته إلى فرقة سلامة حجازي . و منها ذاع صيته )
+	 القاهرة => location
+	 نجيب الريحاني => person
+	 فرقة جورج أبيض => organization
+	 فرقة سلامة حجازي => organization
+=========
+( امبارح اتفرجت على مباراة مانشستر يونايتد مع ريال مدريد في غياب الدون كرستيانو رونالدو )
+	 مانشستر يونايتد => organization
+	 ريال مدريد => organization
+	 كرستيانو رونالدو => person
+=========
+( Government extends flight ban from India and Pakistan until June 21 )
+	 India => location
+	 Pakistan => location
+	 June 21 => time
+=========
 ```
 ## Fine-Tuning