AMWAL v1: spaCy Arabic Financial NER with normalization pipeline
Browse files- .gitattributes +1 -35
- __pycache__/amwal.cpython-312.pyc +0 -0
- amwal.py +70 -0
- readme.md +7 -0
- requirements.txt +2 -0
- spacy_model/model-best/config.cfg +3 -0
- spacy_model/model-best/meta.json +3 -0
- spacy_model/model-best/ner/cfg +3 -0
- spacy_model/model-best/ner/model +3 -0
- spacy_model/model-best/ner/moves +3 -0
- spacy_model/model-best/pipeline.py +3 -0
- spacy_model/model-best/tok2vec/cfg +3 -0
- spacy_model/model-best/tok2vec/model +3 -0
- spacy_model/model-best/tokenizer +3 -0
- spacy_model/model-best/vocab/app.py +0 -0
- spacy_model/model-best/vocab/key2row +3 -0
- spacy_model/model-best/vocab/lookups.bin +3 -0
- spacy_model/model-best/vocab/strings.json +3 -0
- spacy_model/model-best/vocab/vectors +3 -0
- spacy_model/model-best/vocab/vectors.cfg +3 -0
.gitattributes
CHANGED
|
@@ -1,35 +1 @@
|
|
| 1 |
-
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
spacy_model/** filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__pycache__/amwal.cpython-312.pyc
ADDED
|
Binary file (2.27 kB). View file
|
|
|
amwal.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import os
|
| 3 |
+
import spacy
|
| 4 |
+
from huggingface_hub import snapshot_download
|
| 5 |
+
|
| 6 |
+
# -----------------------
|
| 7 |
+
# Arabic normalization
|
| 8 |
+
# -----------------------
|
| 9 |
+
|
| 10 |
+
# Harakat (U+064B-U+0652), superscript alef (U+0670) and tatweel (U+0640).
_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0640]")

# Orthographic folds, written as escapes so the table survives any
# re-encoding of this file. No replacement character is itself a key, so a
# single translate() pass is equivalent to applying the rules one by one.
_ORTHOGRAPHIC_FOLDS = str.maketrans({
    "\u0625": "\u0627",  # ALEF WITH HAMZA BELOW -> ALEF
    "\u0623": "\u0627",  # ALEF WITH HAMZA ABOVE -> ALEF
    "\u0622": "\u0627",  # ALEF WITH MADDA ABOVE -> ALEF
    "\u0624": "\u0621",  # WAW WITH HAMZA ABOVE  -> HAMZA
    "\u0626": "\u0621",  # YEH WITH HAMZA ABOVE  -> HAMZA
    "\u0629": "\u0647",  # TEH MARBUTA           -> HEH
    # NOTE(review): the two folds above/below (-> HEH, ALEF MAKSURA -> YEH)
    # follow the conventional Arabic normalization scheme; confirm they match
    # the preprocessing the model was trained with.
    "\u0649": "\u064A",  # ALEF MAKSURA          -> YEH
})


def normalize_arabic(text: str) -> str:
    """Return *text* with Arabic diacritics removed and letter variants folded.

    Steps, in order:
      1. Delete harakat, superscript alef and tatweel.
      2. Fold hamza-carrying alef forms to bare alef, and hamza on waw/yeh
         to the standalone hamza.
      3. Fold teh marbuta to heh and alef maksura to yeh.

    Step 1 deletes characters, so the result may be shorter than the input;
    character offsets into the result do not generally map back onto the
    original string. Non-Arabic characters pass through unchanged.
    """
    stripped = _DIACRITICS.sub("", text)
    return stripped.translate(_ORTHOGRAPHIC_FOLDS)
|
| 25 |
+
|
| 26 |
+
# -----------------------
|
| 27 |
+
# Loader
|
| 28 |
+
# -----------------------
|
| 29 |
+
# Most recently loaded spaCy pipeline, shared across load_ner() calls.
_MODEL = None
# Identifies which source (_MODEL_KEY) the pipeline in _MODEL was loaded from,
# so a request for a different source is not served a stale model.
_MODEL_KEY = None


def load_ner(
    repo_id="Muhsabrys/AMWAL-ner-arabic",
    local_path=None,
):
    """Load the AMWAL NER pipeline and return a callable that tags text.

    Args:
        repo_id: Hugging Face repository to download the model snapshot from.
            Ignored when ``local_path`` is given.
        local_path: Directory containing ``spacy_model/model-best`` (for
            development / testing). When ``None``, the snapshot of
            ``repo_id`` is downloaded instead.

    Returns:
        A function ``ner(text)`` returning a dict with keys ``raw_text``,
        ``normalized_text`` and ``entities``; each entity is a dict with
        ``text``, ``label``, ``start`` and ``end``.

    The loaded pipeline is cached at module level. Repeated calls for the
    same source reuse it; a call for a different source loads that source
    and replaces the cached pipeline.
    """
    global _MODEL, _MODEL_KEY

    if local_path is not None:
        key = ("local", local_path)
    else:
        key = ("hub", repo_id)

    if _MODEL is None or _MODEL_KEY != key:
        if local_path is not None:
            root = local_path
        else:
            root = snapshot_download(repo_id=repo_id)
        _MODEL = spacy.load(os.path.join(root, "spacy_model", "model-best"))
        _MODEL_KEY = key

    # Bind the pipeline locally so this tagger keeps using the model it was
    # created for even if a later load_ner() call replaces the cache.
    nlp = _MODEL

    def ner(text: str):
        """Normalize *text*, run the pipeline and return the entities found.

        ``start``/``end`` are character offsets into ``normalized_text``;
        normalization deletes diacritics, so they do not necessarily index
        the same characters in ``raw_text``.
        """
        text_norm = normalize_arabic(text)
        doc = nlp(text_norm)

        return {
            "raw_text": text,
            "normalized_text": text_norm,
            "entities": [
                {
                    "text": ent.text,
                    "label": ent.label_,
                    "start": ent.start_char,
                    "end": ent.end_char,
                }
                for ent in doc.ents
            ],
        }

    return ner
|
readme.md
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from amwal import load_ner
|
| 2 |
+
|
| 3 |
+
ner = load_ner()
|
| 4 |
+
|
| 5 |
+
text = "أعلن صندوق قطر السيادي عن استثمار بقيمة 500 مليون دولار أمريكي في سندات حكومية يابانية مقومة بالين في طوكيو."
|
| 6 |
+
out = ner(text)
|
| 7 |
+
print(out)
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spacy>=3.7.0
|
| 2 |
+
huggingface_hub>=0.20.0
|
spacy_model/model-best/config.cfg
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4c5db8dda5151ec3ab7983258bffcd45c70442618fc1dec41edfcca1f924f7da
|
| 3 |
+
size 2722
|
spacy_model/model-best/meta.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:687cf15fbd3c932992f5b2c23dab124b33c54d9944df47bcd9238ea1068ca76d
|
| 3 |
+
size 3271
|
spacy_model/model-best/ner/cfg
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7172edadafba9f472e9ac0f2660eec04b6405e471be9e20267b79c67288d22d
|
| 3 |
+
size 221
|
spacy_model/model-best/ner/model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cc12d1c22c89822ecf06fdc9c456589f1f85bbc749cca862fafa95595efde0db
|
| 3 |
+
size 189277
|
spacy_model/model-best/ner/moves
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f38206131ebb4afb533f4d694beb30e0983537efed33f065b64acfc582e81edd
|
| 3 |
+
size 1532
|
spacy_model/model-best/pipeline.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f6a1276222b80fe61d3023384a4019ccf2e940c1565ab9ad523efe756a858125
|
| 3 |
+
size 1492
|
spacy_model/model-best/tok2vec/cfg
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f8a5a26e3056eb6fb06deeb3dbccfd88ae74900200c98c70b5966bbb7ec9d4de
|
| 3 |
+
size 4
|
spacy_model/model-best/tok2vec/model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:efa61cb31c443234eb772adc66a6803f84de2db5bf5be192a688f8a48c5cfdac
|
| 3 |
+
size 34126801
|
spacy_model/model-best/tokenizer
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:673e9a42da7a084a0456720f4d2db1257598bcf0c8717e37a85aa98ecb086c44
|
| 3 |
+
size 32716
|
spacy_model/model-best/vocab/app.py
ADDED
|
File without changes
|
spacy_model/model-best/vocab/key2row
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:76be8b528d0075f7aae98d6fa57a6d3c83ae480a8469e668d7b0af968995ac71
|
| 3 |
+
size 1
|
spacy_model/model-best/vocab/lookups.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:76be8b528d0075f7aae98d6fa57a6d3c83ae480a8469e668d7b0af968995ac71
|
| 3 |
+
size 1
|
spacy_model/model-best/vocab/strings.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dcf49e22d635a86b3a698595916eb888c23d0443599a372cd81a75d0c0b69ed3
|
| 3 |
+
size 12443841
|
spacy_model/model-best/vocab/vectors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:14772b683e726436d5948ad3fff2b43d036ef2ebbe3458aafed6004e05a40706
|
| 3 |
+
size 128
|
spacy_model/model-best/vocab/vectors.cfg
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff4359091952c8cd16f1f0482f5770fb82d1707368d5cca3c46aa501f552e3c5
|
| 3 |
+
size 22
|