File size: 1,774 Bytes
7e68690 345c58c 7e68690 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import re
import os
import spacy
from huggingface_hub import snapshot_download
# -----------------------
# Arabic normalization
# -----------------------
# Short vowels / tanween / shadda / sukun (U+064B-U+0652), superscript alef
# (U+0670) and tatweel (U+0640); all of these are deleted outright.
_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0640]")

# One-to-one letter folding, applied in a single pass after diacritic removal.
# No target character is itself a key, so the result does not depend on the
# order in which the individual replacements would otherwise be applied.
_LETTER_FOLDS = str.maketrans(
    {
        "\u0625": "\u0627",  # alef with hamza below -> bare alef
        "\u0623": "\u0627",  # alef with hamza above -> bare alef
        "\u0622": "\u0627",  # alef with madda above -> bare alef
        "\u0624": "\u0621",  # waw with hamza above  -> standalone hamza
        "\u0626": "\u0621",  # yeh with hamza above  -> standalone hamza
        "\u0629": "\u0647",  # teh marbuta           -> heh
        "\u0649": "\u064A",  # alef maksura          -> yeh
    }
)


def normalize_arabic(text: str) -> str:
    """Return *text* with Arabic diacritics removed and letter variants folded.

    Steps, in order:

    1. Delete every character matched by ``_DIACRITICS``.
    2. Fold hamza-carrying alef forms to bare alef, hamza-carrying waw/yeh
       to standalone hamza, teh marbuta to heh, and alef maksura to yeh.

    Characters outside these sets are returned unchanged. Because step 1
    deletes characters, the result may be shorter than the input, so
    character offsets into the result do not generally line up with
    offsets into the original string.
    """
    return _DIACRITICS.sub("", text).translate(_LETTER_FOLDS)
# -----------------------
# Loader
# -----------------------
# Most recently loaded pipeline. Kept so existing references to ``_MODEL``
# keep working; the per-source cache below is what ``load_ner`` consults.
_MODEL = None

# Loaded pipelines keyed by where they came from: ("local", path) or
# ("hub", repo_id). Prevents a second call with different arguments from
# silently reusing the first model.
_MODEL_CACHE = {}


def load_ner(
    repo_id="Muhsabrys/AMWAL_ArFinNER",
    local_path=None,
):
    """Load the AMWAL NER pipeline and return a function that tags text.

    The pipeline is read from ``<root>/spacy_model/model-best`` where
    ``<root>`` is:

    - ``local_path`` when it is given (development / testing), or
    - the directory returned by ``snapshot_download(repo_id=repo_id)``
      otherwise (default).

    Each distinct source is loaded once and cached for the lifetime of the
    process, so repeated calls with the same arguments are cheap, and calls
    with different arguments get the model they asked for.

    Args:
        repo_id: Hugging Face repository id; ignored when ``local_path``
            is provided.
        local_path: Optional directory containing ``spacy_model/model-best``.

    Returns:
        A callable ``ner(text)`` that returns a dict with keys
        ``"raw_text"``, ``"normalized_text"`` and ``"entities"``.
    """
    global _MODEL

    if local_path is not None:
        cache_key = ("local", os.fspath(local_path))
    else:
        cache_key = ("hub", repo_id)

    nlp = _MODEL_CACHE.get(cache_key)
    if nlp is None:
        if local_path is not None:
            root = local_path
        else:
            root = snapshot_download(repo_id=repo_id)
        nlp = spacy.load(os.path.join(root, "spacy_model", "model-best"))
        _MODEL_CACHE[cache_key] = nlp

    _MODEL = nlp

    def ner(text: str) -> dict:
        """Normalize *text*, run the pipeline on it and collect its entities.

        The pipeline sees the normalized text, so each entity's ``start`` /
        ``end`` are character offsets into ``"normalized_text"``, not into
        ``"raw_text"`` (normalization can delete characters).
        """
        normalized = normalize_arabic(text)
        # ``nlp`` is bound at load time so this tagger keeps using the model
        # it was created with even if another model is loaded later.
        doc = nlp(normalized)

        entities = [
            {
                "text": ent.text,
                "label": ent.label_,
                "start": ent.start_char,
                "end": ent.end_char,
            }
            for ent in doc.ents
        ]
        return {
            "raw_text": text,
            "normalized_text": normalized,
            "entities": entities,
        }

    return ner
|