Muhsabrys committed on
Commit
7e68690
·
1 Parent(s): 3d6f86b

AMWAL v1: spaCy Arabic Financial NER with normalization pipeline

Browse files
.gitattributes CHANGED
@@ -1,35 +1 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ spacy_model/** filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
__pycache__/amwal.cpython-312.pyc ADDED
Binary file (2.27 kB). View file
 
amwal.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import spacy
4
+ from huggingface_hub import snapshot_download
5
+
6
+ # -----------------------
7
+ # Arabic normalization
8
+ # -----------------------
9
+
10
# Characters stripped outright: the tashkeel marks U+064B (fathatan) through
# U+0652 (sukun), the superscript alef U+0670, and the tatweel/kashida U+0640.
_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0640]")

# One-to-one orthographic folds applied after diacritic removal.  The code
# points are written as escapes so the table survives a mis-encoded checkout
# of this file; no replacement target is itself a key, so a single pass gives
# the same result as applying the folds one after another.
_ORTHOGRAPHIC_FOLDS = str.maketrans(
    {
        # Hamza-carrying alef variants -> bare alef (U+0627)
        "\u0625": "\u0627",  # alef with hamza below
        "\u0623": "\u0627",  # alef with hamza above
        "\u0622": "\u0627",  # alef with madda above
        # Hamza seated on waw / yeh -> standalone hamza (U+0621)
        "\u0624": "\u0621",  # waw with hamza above
        "\u0626": "\u0621",  # yeh with hamza above
        # Other orthographic variants
        "\u0629": "\u0647",  # teh marbuta -> heh
        "\u0649": "\u064A",  # alef maksura -> yeh
    }
)


def normalize_arabic(text: str) -> str:
    """Return *text* with Arabic diacritics removed and letter variants folded.

    The normalization is applied in two steps:

    1. Every character matched by ``_DIACRITICS`` is deleted.
    2. Each character listed in ``_ORTHOGRAPHIC_FOLDS`` is replaced by its
       canonical form (hamza variants, teh marbuta, alef maksura).

    All other characters (Latin letters, digits, punctuation, whitespace) are
    passed through unchanged.  Because characters are deleted, the result may
    be shorter than the input, so character offsets into the result do not
    necessarily line up with offsets into the original string.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; an empty input yields an empty string.
    """
    return _DIACRITICS.sub("", text).translate(_ORTHOGRAPHIC_FOLDS)
25
+
26
+ # -----------------------
27
+ # Loader
28
+ # -----------------------
29
# Most recently loaded pipeline.  Kept as a module attribute for backward
# compatibility with code that inspects it; the tagger returned by load_ner()
# does not read it.
_MODEL = None

# Loaded pipelines keyed by where they came from, so that asking for a
# different repo or local checkout is not silently answered with whichever
# model happened to be loaded first.
_MODEL_CACHE = {}


def load_ner(
    repo_id="Muhsabrys/AMWAL-ner-arabic",
    local_path=None,
):
    """Load the AMWAL spaCy pipeline and return a tagging function.

    The pipeline is read from ``<root>/spacy_model/model-best`` where
    ``<root>`` is either ``local_path`` (development / testing) or the
    directory returned by ``snapshot_download(repo_id=repo_id)`` (default).
    Each distinct source is loaded once per process and then reused.

    Args:
        repo_id: Hugging Face repository to download when ``local_path`` is
            not given.
        local_path: Directory containing ``spacy_model/model-best``.  When
            provided, ``repo_id`` is ignored and nothing is downloaded.

    Returns:
        A function ``ner(text)`` that normalizes ``text`` with
        ``normalize_arabic``, runs the pipeline on the normalized string and
        returns a dict with the keys ``"raw_text"``, ``"normalized_text"``
        and ``"entities"``.  Each entity is a dict with ``"text"``,
        ``"label"``, ``"start"`` and ``"end"``; the offsets are character
        positions in ``"normalized_text"``, not in ``"raw_text"``.
    """
    global _MODEL

    if local_path is not None:
        cache_key = ("local", local_path)
    else:
        cache_key = ("hub", repo_id)

    nlp = _MODEL_CACHE.get(cache_key)
    if nlp is None:
        if local_path is not None:
            root = local_path
        else:
            root = snapshot_download(repo_id=repo_id)
        nlp = spacy.load(os.path.join(root, "spacy_model", "model-best"))
        _MODEL_CACHE[cache_key] = nlp

    _MODEL = nlp

    def ner(text: str):
        """Tag *text* and return the raw text, normalized text and entities."""
        text_norm = normalize_arabic(text)
        # ``nlp`` is bound in the closure, so a later load_ner() call for a
        # different source cannot swap the model under this function.
        doc = nlp(text_norm)

        return {
            "raw_text": text,
            "normalized_text": text_norm,
            "entities": [
                {
                    "text": ent.text,
                    "label": ent.label_,
                    "start": ent.start_char,
                    "end": ent.end_char,
                }
                for ent in doc.ents
            ],
        }

    return ner
readme.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from amwal import load_ner
2
+
3
+ ner = load_ner()
4
+
5
+ text = "أعلن صندوق قطر السيادي عن استثمار بقيمة 500 مليون دولار أمريكي في سندات حكومية يابانية مقومة بالين في طوكيو."
6
+ out = ner(text)
7
+ print(out)
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ spacy>=3.7.0
2
+ huggingface_hub>=0.20.0
spacy_model/model-best/config.cfg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c5db8dda5151ec3ab7983258bffcd45c70442618fc1dec41edfcca1f924f7da
3
+ size 2722
spacy_model/model-best/meta.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:687cf15fbd3c932992f5b2c23dab124b33c54d9944df47bcd9238ea1068ca76d
3
+ size 3271
spacy_model/model-best/ner/cfg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7172edadafba9f472e9ac0f2660eec04b6405e471be9e20267b79c67288d22d
3
+ size 221
spacy_model/model-best/ner/model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc12d1c22c89822ecf06fdc9c456589f1f85bbc749cca862fafa95595efde0db
3
+ size 189277
spacy_model/model-best/ner/moves ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f38206131ebb4afb533f4d694beb30e0983537efed33f065b64acfc582e81edd
3
+ size 1532
spacy_model/model-best/pipeline.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6a1276222b80fe61d3023384a4019ccf2e940c1565ab9ad523efe756a858125
3
+ size 1492
spacy_model/model-best/tok2vec/cfg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8a5a26e3056eb6fb06deeb3dbccfd88ae74900200c98c70b5966bbb7ec9d4de
3
+ size 4
spacy_model/model-best/tok2vec/model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efa61cb31c443234eb772adc66a6803f84de2db5bf5be192a688f8a48c5cfdac
3
+ size 34126801
spacy_model/model-best/tokenizer ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:673e9a42da7a084a0456720f4d2db1257598bcf0c8717e37a85aa98ecb086c44
3
+ size 32716
spacy_model/model-best/vocab/app.py ADDED
File without changes
spacy_model/model-best/vocab/key2row ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76be8b528d0075f7aae98d6fa57a6d3c83ae480a8469e668d7b0af968995ac71
3
+ size 1
spacy_model/model-best/vocab/lookups.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76be8b528d0075f7aae98d6fa57a6d3c83ae480a8469e668d7b0af968995ac71
3
+ size 1
spacy_model/model-best/vocab/strings.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcf49e22d635a86b3a698595916eb888c23d0443599a372cd81a75d0c0b69ed3
3
+ size 12443841
spacy_model/model-best/vocab/vectors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14772b683e726436d5948ad3fff2b43d036ef2ebbe3458aafed6004e05a40706
3
+ size 128
spacy_model/model-best/vocab/vectors.cfg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff4359091952c8cd16f1f0482f5770fb82d1707368d5cca3c46aa501f552e3c5
3
+ size 22