import os
import re

import spacy
from huggingface_hub import snapshot_download

# -----------------------
# Arabic normalization
# -----------------------

# Characters that are deleted outright: fathatan..sukun (U+064B-U+0652),
# superscript alef (U+0670) and tatweel (U+0640).
_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0640]")

# One-to-one letter folding, applied in a single pass with str.translate.
# None of the replacement letters is itself remapped, so one pass yields the
# same result as applying the substitutions one after another.
_LETTER_VARIANTS = str.maketrans(
    {
        # Hamza-carrying alef variants -> bare alef
        "\u0625": "\u0627",  # إ → ا
        "\u0623": "\u0627",  # أ → ا
        "\u0622": "\u0627",  # آ → ا
        # Hamza on waw / yeh -> standalone hamza
        "\u0624": "\u0621",  # ؤ → ء
        "\u0626": "\u0621",  # ئ → ء
        # Other orthographic variants
        "\u0629": "\u0647",  # ة → ه
        "\u0649": "\u064A",  # ى → ي
    }
)


def normalize_arabic(text: str) -> str:
    """Return *text* with diacritics removed and letter variants folded.

    Two steps are applied:

    1. Every character matched by ``_DIACRITICS`` is deleted.
    2. Hamza variants, teh marbuta and alef maksura are replaced by their
       base letters (see ``_LETTER_VARIANTS``).

    Step 1 can shorten the string, so character positions in the result do
    not, in general, line up with positions in the input.
    """
    return _DIACRITICS.sub("", text).translate(_LETTER_VARIANTS)


# -----------------------
# Loader
# -----------------------

# Single-slot cache for the loaded spaCy pipeline.
_MODEL = None
# ("local", path) or ("hub", repo_id) describing where _MODEL came from.
# None means "unknown" (e.g. _MODEL was assigned directly); in that case the
# cached pipeline is trusted and never reloaded.
_MODEL_SOURCE = None

# Location of the spaCy pipeline relative to the model root directory.
_MODEL_SUBDIR = ("spacy_model", "model-best")


def load_ner(
    repo_id="Muhsabrys/AMWAL_ArFinNER",
    local_path=None,
):
    """Load the AMWAL NER pipeline and return a tagging function.

    The pipeline is read from ``<root>/spacy_model/model-best`` where
    ``<root>`` is:

    - ``local_path`` when it is given (development / testing), or
    - the directory returned by ``snapshot_download(repo_id=repo_id)``
      (Hugging Face, the default).

    The loaded pipeline is cached at module level, so repeated calls with the
    same source do not load it again. Requesting a *different* source than
    the cached one loads the new pipeline and replaces the cache (previously
    such arguments were silently ignored after the first call).

    Args:
        repo_id: Hugging Face repository id; used only when ``local_path``
            is ``None``.
        local_path: Directory containing ``spacy_model/model-best``.

    Returns:
        A function ``ner(text)`` that returns a dict with the keys
        ``"raw_text"``, ``"normalized_text"`` and ``"entities"``.
    """
    global _MODEL, _MODEL_SOURCE

    if local_path is not None:
        source = ("local", os.fspath(local_path))
    else:
        source = ("hub", repo_id)

    # Reload only when nothing is cached yet, or when the cached pipeline is
    # known to come from somewhere other than what the caller asked for.
    source_changed = _MODEL_SOURCE is not None and _MODEL_SOURCE != source
    if _MODEL is None or source_changed:
        if local_path is not None:
            root = local_path
        else:
            root = snapshot_download(repo_id=repo_id)
        loaded = spacy.load(os.path.join(root, *_MODEL_SUBDIR))
        # Update both globals together, and only after a successful load.
        _MODEL = loaded
        _MODEL_SOURCE = source

    # Bind the pipeline to this closure so a tagger handed out earlier keeps
    # using its own model even if a later call replaces the cache.
    model = _MODEL

    def ner(text: str):
        """Normalize *text*, run the NER pipeline and collect its entities.

        ``start`` / ``end`` of every entity are character offsets into
        ``"normalized_text"`` (the string the pipeline actually saw), not
        into ``"raw_text"``: normalization deletes diacritics, which shifts
        positions.
        """
        normalized = normalize_arabic(text)
        doc = model(normalized)
        entities = [
            {
                "text": ent.text,
                "label": ent.label_,
                "start": ent.start_char,
                "end": ent.end_char,
            }
            for ent in doc.ents
        ]
        return {
            "raw_text": text,
            "normalized_text": normalized,
            "entities": entities,
        }

    return ner