SondosM commited on
Commit
7f255a3
·
verified ·
1 Parent(s): b319fc8

Upload 10 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ arabic_sign_lang_features.csv filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

# Set working directory
WORKDIR /app

# Install system dependencies
# (build-essential: compile native wheels for torch/transformers deps; curl: debugging/healthchecks)
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for layer caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY app.py .

# Copy data files (add your CSV and keypoints here)
# COPY arabic_sign_lang_features.csv .
# COPY keypoints/ keypoints/

# Expose port (HF Spaces requires 7860)
EXPOSE 7860

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV CSV_PATH=arabic_sign_lang_features.csv
ENV KEYPOINTS_FOLDER=keypoints
ENV SIMILARITY_THRESHOLD=0.72

# Run the app
CMD ["python", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,64 @@
1
  ---
2
- title: AvatarAPI
3
- emoji: 😻
4
  colorFrom: green
5
- colorTo: yellow
6
  sdk: docker
 
7
  pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Arabic Sign Language NLP API
3
+ emoji: 🤟
4
  colorFrom: green
5
+ colorTo: blue
6
  sdk: docker
7
+ app_port: 7860
8
  pinned: false
9
  ---
10
 
11
+ # Arabic Sign Language NLP API
12
+
13
+ Translates Arabic text (Fus-ha and Ammiya) into sign animation sequences.
14
+
15
+ ## Endpoints
16
+
17
+ | Method | Path | Description |
18
+ |--------|------|-------------|
19
+ | GET | `/` | Health check — returns model info and sign count |
20
+ | POST | `/translate` | Translate Arabic text (JSON body) |
21
+ | GET | `/translate?text=...` | Quick translate via URL param |
22
+ | GET | `/signs` | List all signs in the database |
23
+ | GET | `/sequence-file` | Read the last saved sequence file |
24
+
25
+ ## POST /translate
26
+
27
+ ```json
28
+ {
29
+ "text": "انا عايز اروح المدرسة",
30
+ "save_sequence": false
31
+ }
32
+ ```
33
+
34
+ **Response:**
35
+ ```json
36
+ {
37
+ "status": "success",
38
+ "input_text": "انا عايز اروح المدرسة",
39
+ "sequence": ["انا", "يريد", "يذهب", "مدرسة"],
40
+ "total_steps": 4,
41
+ "sign_count": 4,
42
+ "letter_count": 0,
43
+ "missing_keypoint_files": [],
44
+ "detailed_plan": [...]
45
+ }
46
+ ```
47
+
48
+ ## Setup
49
+
50
+ 1. Upload your `arabic_sign_lang_features.csv` to the Space files.
51
+ 2. (Optional) Upload your `keypoints/` folder for `.npy` validation.
52
+ 3. Set `CSV_PATH` env variable if your CSV has a different name.
53
+
54
+ ## Environment Variables
55
+
56
+ | Variable | Default | Description |
57
+ |----------|---------|-------------|
58
+ | `CSV_PATH` | `arabic_sign_lang_features.csv` | Path to sign label CSV |
59
+ | `KEYPOINTS_FOLDER` | `keypoints` | Folder with .npy files |
60
+ | `SIMILARITY_THRESHOLD` | `0.72` | AraBERT match threshold |
61
+
62
+ ## Interactive Docs
63
+
64
+ Visit `/docs` for the Swagger UI.
app.py ADDED
@@ -0,0 +1,542 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import logging
5
+ import warnings
6
+ from pathlib import Path
7
+ from typing import List, Dict, Optional, Tuple
8
+ from dataclasses import dataclass, field
9
+ from enum import Enum
10
+
11
+ import numpy as np
12
+ import pandas as pd
13
+ import torch
14
+ import stanza
15
+ import pyarabic.araby as araby
16
+ from sentence_transformers import SentenceTransformer, util
17
+ from fastapi import FastAPI, HTTPException, Query
18
+ from fastapi.middleware.cors import CORSMiddleware
19
+ from pydantic import BaseModel, Field
20
+
21
+ warnings.filterwarnings("ignore")
22
+ logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
23
+ logger = logging.getLogger("ArabicSignNLP")
24
+
25
+
26
# ----- Project Configuration -----
class Config:
    """Central configuration; several values are overridable via environment variables."""

    # Path to your CSV dataset containing sign labels
    # On HF Spaces, upload your CSV to the repo and set the path here
    CSV_PATH: str = os.getenv("CSV_PATH", "arabic_sign_lang_features.csv")

    # Folder where .npy keypoint files are stored (optional on HF Spaces)
    KEYPOINTS_FOLDER: str = os.getenv("KEYPOINTS_FOLDER", "keypoints")

    # Output file path for the Blender sequence (/tmp is writable on HF Spaces)
    SEQUENCE_OUTPUT_PATH: str = "/tmp/sequence.txt"

    # AraBERT model for Arabic semantic understanding
    EMBEDDING_MODEL: str = "aubmindlab/bert-base-arabertv2"

    # Minimum cosine similarity for a semantic sign match
    SIMILARITY_THRESHOLD: float = float(os.getenv("SIMILARITY_THRESHOLD", "0.72"))

    # Include prepositions in signing (False: skip them, see ArabicNLPProcessor)
    INCLUDE_PREPOSITION_WORDS: bool = False

    # FastAPI server settings
    API_HOST: str = "0.0.0.0"
    API_PORT: int = 7860  # HF Spaces uses port 7860

    # Column name in your CSV that contains the sign labels
    CSV_LABEL_COLUMN: str = "label"
53
+
54
+
55
# ----- Arabic Letter Mapping -----
# Maps each Arabic letter (including common orthographic variants) to the
# English label used for fingerspelling animation files. Alef/hamza forms
# collapse onto "Alef", final-yeh onto "Yeh"; "لا" is a two-character
# ligature with its own dedicated sign ("Laa").
ARABIC_LETTER_TO_LABEL: Dict[str, str] = {
    "ا": "Alef", "أ": "Alef", "إ": "Alef", "آ": "Alef",
    "ب": "Beh", "ت": "Teh", "ة": "Teh_Marbuta",
    "ث": "Theh", "ج": "Jeem", "ح": "Hah",
    "خ": "Khah", "د": "Dal", "ذ": "Thal",
    "ر": "Reh", "ز": "Zain", "س": "Seen",
    "ش": "Sheen", "ص": "Sad", "ض": "Dad",
    "ط": "Tah", "ظ": "Zah", "ع": "Ain",
    "غ": "Ghain", "ف": "Feh", "ق": "Qaf",
    "ك": "Kaf", "ل": "Lam", "م": "Meem",
    "ن": "Noon", "ه": "Heh", "و": "Waw",
    "ي": "Yeh", "ى": "Yeh", "لا": "Laa",
}
69
+
70
+
71
# ----- Text Normalizer -----
class ArabicTextNormalizer:
    """Normalizes Arabic input for matching.

    Pipeline (order matters): dialect-to-Fus-ha word mapping, diacritic and
    tatweel removal, hamza/alef and word-final letter unification, then
    removal of all non-Arabic characters and whitespace collapsing.
    """

    # Dialect (Ammiya) word -> Fus-ha equivalent, applied per whitespace token.
    DIALECT_TO_FUSA: Dict[str, str] = {
        "مش": "لا", "مو": "لا", "ماش": "لا",
        "عايز": "يريد", "عاوز": "يريد", "بدي": "يريد", "بدك": "يريد", "بده": "يريد",
        "حابب": "يحب", "بحب": "يحب", "باحب": "يحب", "بتحب": "يحب",
        "فين": "اين", "وين": "اين", "منين": "من اين", "منيين": "من اين",
        "ايه": "ماذا", "ايش": "ماذا", "شو": "ماذا", "وش": "ماذا",
        "كيفك": "كيف حالك", "كيفكم": "كيف حالكم", "عامل ايه": "كيف حالك",
        "تعال": "اقبل", "تعالى": "اقبل",
        "هيك": "هكذا", "كده": "هكذا", "كدا": "هكذا", "هكيه": "هكذا",
        "دلوقتي": "الان", "دلوقت": "الان", "هلا": "الان", "هلق": "الان", "هسه": "الان",
        "بكره": "غدا", "بكرا": "غدا", "بكرة": "غدا",
        "امبارح": "امس", "مبارح": "امس",
        "ليش": "لماذا", "ليه": "لماذا", "علاش": "لماذا",
        "تمام": "جيد", "ماشي": "جيد", "عادي": "جيد",
        "روح": "يذهب", "اروح": "يذهب", "يروح": "يذهب", "رايح": "يذهب",
        "جاي": "يأتي", "جاية": "يأتي", "جاييين": "يأتي",
        "اشتري": "يشتري", "اشترى": "يشتري", "بشتري": "يشتري", "بيشتري": "يشتري",
        "باكل": "ياكل", "بياكل": "ياكل",
        "بشرب": "يشرب", "بيشرب": "يشرب",
        "عارف": "يعرف", "عارفة": "يعرف", "بعرف": "يعرف",
        "شغل": "عمل", "بشتغل": "يعمل", "بيشتغل": "يعمل",
    }

    # Suffixes stripped when searching for a dialect stem (checked in order).
    _SUFFIXES = ["ين", "ون", "ات", "ة", "ها", "هم", "هن", "كم", "كن", "نا", "وا", "ا"]

    def __init__(self):
        # Patterns compiled once; normalize() may be called per request.
        self._rx_non_arabic = re.compile(r"[^\u0600-\u06FF\s]")
        self._rx_spaces = re.compile(r"\s+")
        self._rx_tatweel = re.compile(r"\u0640+")

    def normalize(self, text: str) -> str:
        """Return the normalized form of *text*.

        Raises ValueError for non-string/empty input, or when nothing
        Arabic remains after cleaning.
        """
        if not isinstance(text, str) or not text:
            raise ValueError("Input text must be a non-empty string.")
        cleaned = self._apply_dialect_mapping(text.strip())
        cleaned = araby.strip_tashkeel(cleaned)
        cleaned = self._rx_tatweel.sub("", cleaned)
        # Unify hamza-carrying alef variants onto plain alef, waw-variants onto waw.
        cleaned = re.sub(r"[\u0625\u0623\u0622]", "\u0627", cleaned)
        cleaned = re.sub(r"[\u0624\u0626]", "\u0648", cleaned)
        # Word-final alef-maqsura -> yeh, teh-marbuta -> heh.
        cleaned = re.sub(r"\u0649(?=\s|$)", "\u064a", cleaned)
        cleaned = re.sub(r"\u0629(?=\s|$)", "\u0647", cleaned)
        cleaned = self._rx_non_arabic.sub(" ", cleaned)
        cleaned = self._rx_spaces.sub(" ", cleaned).strip()
        if not cleaned:
            raise ValueError("Text became empty after normalization.")
        return cleaned

    def _apply_dialect_mapping(self, text: str) -> str:
        """Replace dialect tokens (optionally suffixed) with Fus-ha equivalents."""
        out = []
        for token in text.split():
            replacement = self.DIALECT_TO_FUSA.get(token)
            if replacement is None:
                # No exact hit: try stripping one known suffix to find a stem.
                for suffix in self._SUFFIXES:
                    if token.endswith(suffix) and len(token) > len(suffix) + 1:
                        stem = token[: -len(suffix)]
                        if stem in self.DIALECT_TO_FUSA:
                            replacement = self.DIALECT_TO_FUSA[stem]
                            break
            out.append(token if replacement is None else replacement)
        return " ".join(out)

    def normalize_label(self, label: str) -> str:
        """Like normalize(), but fall back to the raw label instead of raising."""
        try:
            return self.normalize(label)
        except ValueError:
            return label
144
+
145
+
146
# ----- NLP Processor -----
@dataclass
class ProcessedWord:
    """One content word that survived the stop-word/POS filters."""

    original: str         # surface form as produced by the Stanza tokenizer
    normalized: str       # surface form with a leading "ال" stripped
    lemma: str            # Stanza lemma (falls back to the surface form)
    pos: str              # UPOS tag (falls back to "NOUN")
    is_person_name: bool  # NER tag contained PER/PERS
    is_place_name: bool   # NER tag contained LOC/GPE


class ArabicNLPProcessor:
    """Tokenizes, POS-tags and NERs normalized Arabic text via Stanza,
    dropping particles and stop-words that have no corresponding sign."""

    # Function words never signed, regardless of configuration.
    SKIP_WORDS_CORE = {"و", "ف", "ب", "ل", "ك", "ال", "قد", "لقد", "سوف", "ان", "إن", "لان", "حتى", "كي"}
    # Prepositions skipped unless Config.INCLUDE_PREPOSITION_WORDS is True.
    SKIP_WORDS_PREPOSITIONS = {"في", "من", "الى", "على", "عن", "مع", "عند", "لدى"}
    # Words beginning with "ال" that are NOT definite-article + noun.
    _AL_WHITELIST = {"الان", "الله", "الذي", "التي", "اللذين", "اللتين"}

    def _active_skip_words(self) -> set:
        """Return the skip-word set for the current configuration."""
        s = set(self.SKIP_WORDS_CORE)
        if not Config.INCLUDE_PREPOSITION_WORDS:
            s.update(self.SKIP_WORDS_PREPOSITIONS)
        return s

    def __init__(self):
        # Built lazily by load() because the Stanza download/startup is slow.
        self._pipeline = None

    def load(self):
        """Download (if needed) and build the Stanza Arabic pipeline."""
        logger.info("Downloading Stanza Arabic models...")
        stanza.download("ar", verbose=False)
        self._pipeline = stanza.Pipeline(lang="ar", processors="tokenize,mwt,pos,lemma,ner", verbose=False)
        logger.info("Stanza Arabic pipeline ready.")

    def _strip_al(self, word: str) -> str:
        """Drop a leading definite article "ال" unless the word is whitelisted."""
        if word in self._AL_WHITELIST:
            return word
        # len > 3 keeps short words like "الم" intact after stripping.
        if word.startswith("ال") and len(word) > 3:
            return word[2:]
        return word

    def process(self, normalized_text: str) -> List[ProcessedWord]:
        """Run the pipeline and return the filtered, annotated content words.

        Raises RuntimeError if load() has not been called first.
        """
        if self._pipeline is None:
            raise RuntimeError("Call load() before process().")
        doc = self._pipeline(normalized_text)
        results: List[ProcessedWord] = []
        skip_words = self._active_skip_words()
        for sentence in doc.sentences:
            for word in sentence.words:
                if word.text in skip_words:
                    continue
                # Punctuation, symbols, determiners and conjunctions carry no sign.
                if word.pos in {"PUNCT", "SYM", "X", "DET", "CCONJ", "SCONJ"}:
                    continue
                if len(word.text) <= 1:
                    continue
                # NER tags live on the parent token, not the (possibly MWT-split) word.
                ner_tag = word.parent.ner if word.parent else "O"
                normalized = self._strip_al(word.text)
                results.append(ProcessedWord(
                    original=word.text,
                    normalized=normalized,
                    lemma=word.lemma if word.lemma else word.text,
                    pos=word.pos if word.pos else "NOUN",
                    is_person_name="PER" in ner_tag or "PERS" in ner_tag,
                    is_place_name="LOC" in ner_tag or "GPE" in ner_tag,
                ))
        return results
209
+
210
+
211
# ----- Sign Matcher -----
@dataclass
class SignMatch:
    """Result of looking one word up in the sign database."""

    found: bool        # True for exact/lemma hits or similarity >= threshold
    sign_label: str    # raw CSV label of the best candidate ("" if none)
    confidence: float  # 1.0 exact, 0.95 lemma, else cosine similarity
    method: str        # "exact" | "lemma" | "semantic" | "none"


class SemanticSignMatcher:
    """Matches Arabic words to sign labels: exact match, lemma match,
    then AraBERT cosine-similarity fallback over the CSV label set."""

    def __init__(self, csv_path: str, label_column: str, threshold: float):
        self.threshold = threshold
        # _raw_labels keeps the original CSV spelling; _word_signs holds the
        # normalized variants at the SAME indices (aligned by _finalize_labels).
        self._word_signs: List[str] = []
        self._raw_labels: List[str] = []
        self._sign_embeddings = None
        self._model: Optional[SentenceTransformer] = None
        self._device = "cuda" if torch.cuda.is_available() else "cpu"
        self._normalizer: Optional[ArabicTextNormalizer] = None
        self._load_database(csv_path, label_column)

    def set_normalizer(self, normalizer: ArabicTextNormalizer):
        """Attach the text normalizer; must happen before load_model()."""
        self._normalizer = normalizer

    def _normalize_label(self, label: str) -> str:
        # Falls back to the raw label when no normalizer is attached yet.
        if self._normalizer:
            return self._normalizer.normalize_label(label)
        return label

    def _load_database(self, csv_path: str, label_column: str):
        """Load unique Arabic labels from the CSV; a missing file is tolerated
        (the matcher then degrades to fingerspelling-only downstream)."""
        if not os.path.exists(csv_path):
            logger.warning(f"CSV not found at {csv_path}. No word signs loaded.")
            return
        df = pd.read_csv(csv_path, low_memory=False)
        if label_column not in df.columns:
            raise ValueError(f"Column '{label_column}' not found. Available: {list(df.columns)}")
        all_labels = df[label_column].dropna().unique().tolist()
        # Keep only labels containing at least one Arabic-block character.
        arabic_labels = [
            str(l) for l in all_labels
            if isinstance(l, str) and any("\u0600" <= c <= "\u06ff" for c in str(l))
        ]
        self._raw_labels = arabic_labels
        self._word_signs = arabic_labels.copy()
        logger.info(f"Database: {len(arabic_labels)} Arabic word labels loaded.")

    def _finalize_labels(self):
        # Re-derive the normalized label list once a normalizer is available,
        # preserving index alignment with _raw_labels.
        if self._normalizer and self._raw_labels:
            self._word_signs = [self._normalize_label(l) for l in self._raw_labels]

    def load_model(self):
        """Load AraBERT and pre-encode every label for cosine matching."""
        self._finalize_labels()
        if not self._word_signs:
            logger.warning("No Arabic words to encode. Skipping model load.")
            return
        logger.info(f"Loading {Config.EMBEDDING_MODEL} on {self._device} ...")
        self._model = SentenceTransformer(Config.EMBEDDING_MODEL, device=self._device)
        logger.info(f"Encoding {len(self._word_signs)} labels...")
        self._sign_embeddings = self._model.encode(
            self._word_signs, convert_to_tensor=True, device=self._device,
            show_progress_bar=True, batch_size=64,
        )
        logger.info("Sign matcher ready.")

    def find_sign(self, word_text: str, lemma: str) -> SignMatch:
        """Best sign for a word: exact (1.0) -> lemma (0.95) -> semantic (cosine)."""
        if not self._word_signs:
            return SignMatch(found=False, sign_label="", confidence=0.0, method="none")
        norm_word = self._normalize_label(word_text)
        norm_lemma = self._normalize_label(lemma) if lemma else ""
        if norm_word in self._word_signs:
            idx = self._word_signs.index(norm_word)
            return SignMatch(True, self._raw_labels[idx], 1.0, "exact")
        if norm_lemma and norm_lemma != norm_word and norm_lemma in self._word_signs:
            idx = self._word_signs.index(norm_lemma)
            return SignMatch(True, self._raw_labels[idx], 0.95, "lemma")
        if self._model is None or self._sign_embeddings is None:
            return SignMatch(False, "", 0.0, "none")
        candidates = list({norm_word, norm_lemma} - {""})
        embs = self._model.encode(candidates, convert_to_tensor=True, device=self._device, batch_size=len(candidates))
        # scores is (len(candidates), num_labels); flat argmax % num_labels
        # recovers the column (label) index of the global best.
        scores = util.cos_sim(embs, self._sign_embeddings)
        best_val = float(scores.max())
        best_idx = int(scores.argmax() % len(self._word_signs))
        if best_val >= self.threshold:
            return SignMatch(True, self._raw_labels[best_idx], best_val, "semantic")
        # Below threshold: still report the nearest label for diagnostics.
        return SignMatch(False, self._raw_labels[best_idx] if self._raw_labels else "", best_val, "none")

    def letter_to_label(self, arabic_letter: str) -> Optional[str]:
        """Fingerspelling label for a single Arabic letter, or None if unmapped."""
        return ARABIC_LETTER_TO_LABEL.get(arabic_letter)

    @property
    def available_signs(self) -> List[str]:
        """Copy of the raw CSV labels currently loaded."""
        return self._raw_labels.copy()
301
+
302
+
303
# ----- Execution Plan Builder -----
class ActionType(str, Enum):
    """Kind of animation step: a whole-word sign or one fingerspelled letter."""

    SIGN = "SIGN"
    LETTER = "LETTER"


@dataclass
class ExecutionStep:
    """A single animation instruction in the output sequence."""

    action_type: ActionType
    identifier: str    # sign/letter label; maps to keypoints/<identifier>.npy
    source_word: str   # original word this step renders
    confidence: float  # match confidence (1.0 for exact and fingerspell)
    match_method: str  # "exact" | "lemma" | "semantic" | "fingerspell"
316
+
317
+
318
class ExecutionPlanBuilder:
    """Turns raw Arabic text into an ordered list of ExecutionSteps."""

    def __init__(self, normalizer: ArabicTextNormalizer, nlp_proc: ArabicNLPProcessor, matcher: SemanticSignMatcher):
        self.normalizer = normalizer
        self.nlp_proc = nlp_proc
        self.matcher = matcher

    def build(self, raw_text: str) -> List[ExecutionStep]:
        """Normalize, run NLP, then map each word to a sign or to fingerspelling."""
        words = self.nlp_proc.process(self.normalizer.normalize(raw_text))
        steps: List[ExecutionStep] = []
        for w in words:
            # Named entities have no dictionary sign: always fingerspell them.
            if w.is_person_name or w.is_place_name:
                steps.extend(self._fingerspell(w.original))
                continue
            m = self.matcher.find_sign(w.normalized, w.lemma)
            if m.found:
                steps.append(ExecutionStep(ActionType.SIGN, m.sign_label, w.original, m.confidence, m.method))
            else:
                steps.extend(self._fingerspell(w.original))
        return steps

    def _fingerspell(self, word: str) -> List[ExecutionStep]:
        """One LETTER step per Arabic letter, treating "لا" as a single ligature."""
        steps: List[ExecutionStep] = []
        pos = 0
        while pos < len(word):
            # Prefer the two-character Laa ligature when present at this position.
            chunk = word[pos:pos + 2] if word[pos:pos + 2] == "لا" else word[pos]
            label = ARABIC_LETTER_TO_LABEL.get(chunk)
            if label:
                steps.append(ExecutionStep(ActionType.LETTER, label, word, 1.0, "fingerspell"))
            pos += len(chunk)
        return steps
355
+
356
+
357
# ----- Sequence Writer -----
class BlenderSequenceWriter:
    """Writes the ordered sign/letter identifiers to a text file for Blender."""

    def __init__(self, output_path: str, keypoints_folder: str):
        self.output_path = output_path
        self.keypoints_folder = keypoints_folder

    def write(self, plan: List[ExecutionStep]) -> Dict:
        """Persist the plan (one identifier per line) and return a summary report.

        Raises ValueError if the plan is empty.
        """
        if not plan:
            raise ValueError("Execution plan is empty.")
        Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
        identifiers = [s.identifier for s in plan]
        missing = self._check_missing_keypoints(plan)
        with open(self.output_path, "w", encoding="utf-8") as fh:
            fh.write("\n".join(identifiers))
        n_signs = sum(1 for s in plan if s.action_type == ActionType.SIGN)
        n_letters = sum(1 for s in plan if s.action_type == ActionType.LETTER)
        return {
            "output_file": self.output_path,
            "total_steps": len(plan),
            "sign_count": n_signs,
            "letter_count": n_letters,
            "missing_keypoint_files": missing,
            "sequence": identifiers,
            "detailed_plan": [
                {
                    "step": idx + 1,
                    "type": s.action_type.value,
                    "identifier": s.identifier,
                    "source_word": s.source_word,
                    "confidence": round(s.confidence, 3),
                    "method": s.match_method,
                }
                for idx, s in enumerate(plan)
            ],
        }

    def _check_missing_keypoints(self, plan: List[ExecutionStep]) -> List[str]:
        """List the <identifier>.npy files absent from the keypoints folder."""
        return [
            f"{s.identifier}.npy"
            for s in plan
            if not os.path.exists(os.path.join(self.keypoints_folder, f"{s.identifier}.npy"))
        ]
395
+
396
+
397
# ----- Main Translator -----
class ArabicSignTranslator:
    """Facade: builds an execution plan and optionally writes it to disk."""

    def __init__(self, plan_builder: ExecutionPlanBuilder, writer: BlenderSequenceWriter):
        self.builder = plan_builder
        self.writer = writer

    def translate(self, text: str, save_to_file: bool = True) -> Dict:
        """Translate *text* into a sign sequence; returns a report dict."""
        plan = self.builder.build(text)
        if not plan:
            return {"status": "error", "message": "No translatable content found.", "input": text}
        report: Dict = {"status": "success", "input": text}
        if save_to_file:
            report.update(self.writer.write(plan))
            return report
        # In-memory summary; mirrors the writer's report (minus "step" numbers).
        report["sequence"] = [s.identifier for s in plan]
        report["total_steps"] = len(plan)
        report["sign_count"] = sum(1 for s in plan if s.action_type == ActionType.SIGN)
        report["letter_count"] = sum(1 for s in plan if s.action_type == ActionType.LETTER)
        report["missing_keypoint_files"] = []
        report["detailed_plan"] = [
            {
                "type": s.action_type.value,
                "identifier": s.identifier,
                "source_word": s.source_word,
                "confidence": round(s.confidence, 3),
                "method": s.match_method,
            }
            for s in plan
        ]
        return report
423
+
424
+
425
# ----- Initialize Components -----
# NOTE: this runs at import time, so the Stanza download, CSV load and
# AraBERT encoding all happen once when uvicorn imports app:app. Startup
# is therefore slow but requests are fast.
logger.info("Initializing pipeline components...")
normalizer = ArabicTextNormalizer()
nlp_processor = ArabicNLPProcessor()
nlp_processor.load()

sign_matcher = SemanticSignMatcher(
    csv_path=Config.CSV_PATH,
    label_column=Config.CSV_LABEL_COLUMN,
    threshold=Config.SIMILARITY_THRESHOLD,
)
# The normalizer must be attached BEFORE load_model() so the labels are
# normalized prior to being encoded.
sign_matcher.set_normalizer(normalizer)
sign_matcher.load_model()

plan_builder = ExecutionPlanBuilder(normalizer, nlp_processor, sign_matcher)
writer = BlenderSequenceWriter(Config.SEQUENCE_OUTPUT_PATH, Config.KEYPOINTS_FOLDER)
translator = ArabicSignTranslator(plan_builder, writer)
logger.info("All components ready.")
443
+
444
+
445
# ----- FastAPI App -----
class TranslateRequest(BaseModel):
    # Arabic input; length bounds reject empty and abusive payloads.
    text: str = Field(description="Arabic input text (Fus-ha or Ammiya)", min_length=1, max_length=4000, examples=["انا عايز اروح المدرسة"])
    # When True the sequence is also written to Config.SEQUENCE_OUTPUT_PATH.
    save_sequence: bool = Field(default=False, description="Save sequence file to /tmp/sequence.txt")


class StepDetail(BaseModel):
    """One entry of the detailed execution plan."""

    type: str          # "SIGN" or "LETTER"
    identifier: str    # sign/letter label
    source_word: str   # original word the step came from
    confidence: float  # match confidence
    method: str        # "exact" | "lemma" | "semantic" | "fingerspell"


class TranslateResponse(BaseModel):
    """Response schema mirroring ArabicSignTranslator.translate()'s report."""

    status: str
    input_text: str
    sequence: List[str]  # ordered sign/letter identifiers
    total_steps: int
    sign_count: int
    letter_count: int
    missing_keypoint_files: List[str]
    detailed_plan: List[StepDetail]


app = FastAPI(
    title="Arabic Sign Language NLP API",
    description="Translates Arabic text (Fus-ha and Ammiya) into sign animation sequences.",
    version="1.0.0",
)

app.add_middleware(
    CORSMiddleware,
    # NOTE(review): wildcard CORS is convenient for a demo Space but accepts
    # any origin — tighten before production use.
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
482
+
483
+
484
@app.get("/")
def health_check():
    """Liveness probe: reports the embedding model and database size."""
    payload = {
        "status": "running",
        "model": Config.EMBEDDING_MODEL,
        "signs_in_database": len(sign_matcher.available_signs),
    }
    return payload
491
+
492
+
493
@app.post("/translate", response_model=TranslateResponse)
def translate_post(request: TranslateRequest):
    """Translate Arabic text into a sign/letter sequence.

    Returns 422 for input the pipeline rejects (e.g. text that normalizes
    to nothing Arabic), 500 for unexpected pipeline failures.
    """
    try:
        result = translator.translate(request.text, save_to_file=request.save_sequence)
    except ValueError as e:
        # Normalization raises ValueError for empty / non-Arabic input:
        # that is a client error (422), not a server fault (was 500).
        raise HTTPException(status_code=422, detail=str(e))
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    if result["status"] == "error":
        raise HTTPException(status_code=422, detail=result["message"])
    return TranslateResponse(
        status=result["status"],
        input_text=request.text,
        sequence=result.get("sequence", []),
        total_steps=result.get("total_steps", 0),
        sign_count=result.get("sign_count", 0),
        letter_count=result.get("letter_count", 0),
        missing_keypoint_files=result.get("missing_keypoint_files", []),
        detailed_plan=[
            StepDetail(type=s["type"], identifier=s["identifier"], source_word=s["source_word"],
                       confidence=s["confidence"], method=s["method"])
            for s in result.get("detailed_plan", [])
        ],
    )
515
+
516
+
517
@app.get("/translate")
def translate_get(
    text: str = Query(description="Arabic text to translate"),
    save_sequence: bool = Query(default=False),
):
    """GET convenience wrapper around the POST endpoint.

    Constructing TranslateRequest manually bypasses FastAPI's body
    validation, so a too-short/too-long `text` would raise an unhandled
    pydantic ValidationError (a 500). Convert it to a 422 instead.
    """
    try:
        request = TranslateRequest(text=text, save_sequence=save_sequence)
    except ValueError as e:  # pydantic.ValidationError subclasses ValueError
        raise HTTPException(status_code=422, detail=str(e))
    return translate_post(request)
523
+
524
+
525
@app.get("/signs")
def list_signs():
    """Return every sign label currently loaded from the CSV database."""
    signs = sign_matcher.available_signs
    return {"total": len(signs), "signs": signs}
528
+
529
+
530
@app.get("/sequence-file")
def read_sequence_file():
    """Return the last saved sequence file, one identifier per entry (404 if absent)."""
    path = Config.SEQUENCE_OUTPUT_PATH
    if not os.path.exists(path):
        raise HTTPException(status_code=404, detail="Sequence file not found. Run a translation first.")
    with open(path, "r", encoding="utf-8") as fh:
        entries = [ln.strip() for ln in fh if ln.strip()]
    return {"file_path": path, "sequence": entries, "count": len(entries)}
538
+
539
+
540
# Local development entry point; on HF Spaces the container CMD starts uvicorn.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host=Config.API_HOST, port=Config.API_PORT)
arabic_sign_lang_features.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:239288257f7eb9dff6c3f957536b066887f2ef30224caf97453ad087e1df34af
3
+ size 16081522
keypoints/أرقام.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fcca94e751888514e14418aba9743d8ca80f2ff73085befb669f42a3b6d2290
3
+ size 40808
keypoints/حجاب.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f60d098fde9afb65ed8bf554003ebc70ec222e516664890340df3cf23cb32997
3
+ size 81488
keypoints/طاوله.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d66024eb0527d27eb6107ca78201623987ae88a1e628686cd19e0f8d9a5e3b39
3
+ size 81488
keypoints/كلمه.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d5d74b528ae26e9927b825d8919108de076ab47b4e831079857219a45e7925e
3
+ size 54368
keypoints/مع_السلامه.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43cff4780d51b6ae7758ba13a9cba505205e3afa1cc2c6d9f10a918d22d2d249
3
+ size 81488
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# Runtime dependencies for the Arabic Sign Language NLP API.
# NOTE(review): versions are intentionally unpinned; pin them
# (e.g. fastapi==0.x) for reproducible Space builds.
fastapi
uvicorn[standard]
pyarabic
stanza
sentence-transformers
transformers
torch
pandas
numpy
python-multipart
pydantic