File size: 1,774 Bytes
7e68690
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345c58c
7e68690
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import re
import os
import spacy
from huggingface_hub import snapshot_download

# -----------------------
# Arabic normalization
# -----------------------

# Characters removed outright: the harakat range U+064B-U+0652
# (fathatan .. sukun), superscript alef (U+0670) and the tatweel/kashida
# elongation character (U+0640).
_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0640]")

# One-to-one letter folding, applied in a single pass by str.translate.
_LETTER_VARIANTS = str.maketrans(
    {
        "إ": "ا",  # alef with hamza below -> bare alef
        "أ": "ا",  # alef with hamza above -> bare alef
        "آ": "ا",  # alef with madda       -> bare alef
        "ؤ": "ء",  # waw with hamza        -> bare hamza
        "ئ": "ء",  # yeh with hamza        -> bare hamza
        "ة": "ه",  # teh marbuta           -> heh
        "ى": "ي",  # alef maksura          -> yeh
    }
)


def normalize_arabic(text: str) -> str:
    """Return *text* with Arabic orthographic variation folded away.

    Two steps are applied:

    1. Every character matched by ``_DIACRITICS`` (short-vowel marks,
       tanween, shadda, sukun, superscript alef and tatweel) is deleted.
    2. Hamza-carrying alef forms become bare alef, hamza on waw/yeh becomes
       bare hamza, teh marbuta becomes heh and alef maksura becomes yeh.

    The folding is lossy and, because characters are deleted, the result may
    be shorter than the input; character offsets into the result do not map
    one-to-one onto the original string. All other characters (Latin text,
    digits, punctuation, whitespace) pass through unchanged.
    """
    # The deleted set and the folded set are disjoint, so the order of the
    # two steps does not affect the result.
    return _DIACRITICS.sub("", text).translate(_LETTER_VARIANTS)

# -----------------------
# Loader
# -----------------------
# Most recently returned pipeline. Kept for code that inspects this
# module-level name; load_ner itself looks pipelines up in _MODEL_CACHE.
_MODEL = None

# Loaded pipelines keyed by where they came from, so that asking for a
# different repo or local directory never silently returns another model.
_MODEL_CACHE = {}


def load_ner(
    repo_id="Muhsabrys/AMWAL_ArFinNER",
    local_path=None,
):
    """
    Load the AMWAL NER pipeline and return a tagging function.

    The spaCy pipeline is read from ``<root>/spacy_model/model-best`` where
    ``<root>`` is:
    - ``local_path`` when given (development / testing), or
    - the directory returned by ``snapshot_download(repo_id=repo_id)``
      (default).

    Each distinct source is loaded at most once per process; repeated calls
    with the same arguments reuse the cached pipeline and do not contact the
    Hub again.

    Args:
        repo_id: Hugging Face repository id. Ignored when ``local_path`` is
            provided.
        local_path: Directory that contains ``spacy_model/model-best``.

    Returns:
        A function ``ner(text)`` that returns a dict with keys:
        - ``raw_text``: the input string, untouched.
        - ``normalized_text``: the output of ``normalize_arabic(text)``.
        - ``entities``: a list of dicts with ``text``, ``label``, ``start``
          and ``end``. ``start``/``end`` are character offsets into
          ``normalized_text`` (not ``raw_text``), because the pipeline is run
          on the normalized string.
    """
    global _MODEL

    if local_path is not None:
        model_path = os.path.join(local_path, "spacy_model", "model-best")
        cache_key = ("local", model_path)
    else:
        # Key on the repo id rather than the resolved snapshot directory so a
        # cache hit does not need to call snapshot_download at all.
        model_path = None
        cache_key = ("hub", repo_id)

    model = _MODEL_CACHE.get(cache_key)
    if model is None:
        if model_path is None:
            snapshot_dir = snapshot_download(repo_id=repo_id)
            model_path = os.path.join(snapshot_dir, "spacy_model", "model-best")
        model = spacy.load(model_path)
        _MODEL_CACHE[cache_key] = model

    _MODEL = model

    def ner(text: str):
        # `model` is bound in the closure, so a later load_ner call for a
        # different source cannot change which pipeline this function uses.
        text_norm = normalize_arabic(text)
        doc = model(text_norm)

        return {
            "raw_text": text,
            "normalized_text": text_norm,
            "entities": [
                {
                    "text": ent.text,
                    "label": ent.label_,
                    "start": ent.start_char,
                    "end": ent.end_char,
                }
                for ent in doc.ents
            ],
        }

    return ner