Mr-HASSAN committed on
Commit
4907c9e
·
verified ·
1 Parent(s): e614c70

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +253 -244
app.py CHANGED
@@ -1,313 +1,322 @@
1
- import time
2
  import os
3
-
4
  import cv2
5
- import numpy as np
6
  import gradio as gr
7
- import gradio.utils as gr_utils
8
- from ultralytics import YOLO
9
- from PIL import Image, ImageDraw, ImageFont
10
- import arabic_reshaper
11
- from bidi.algorithm import get_display
12
-
13
  import google.generativeai as genai
 
 
14
  import torch
15
 
16
- # ==========================
17
- # ⚠️ Gemini API Key
18
- # ==========================
 
 
19
 
20
- GEMINI_API_KEY = "YOUR_GEMINI_API_KEY_HERE" # ← حط المفتاح هنا
21
- genai.configure(api_key=GEMINI_API_KEY)
 
22
 
23
- # ==========================
24
- # Patch لمشكلة Spaces hot-reload
25
- # ==========================
26
 
27
- # داخل Hugging Face Spaces، SPACE_ID يكون موجود في الـ env
28
- if os.getenv("SPACE_ID"):
29
- def _no_watchfn_spaces(*args, **kwargs):
30
- # نطفي الـ hot-reload اللي يسبب RuntimeError
31
- return
32
- # نكتب فوق الدالة الأصلية
33
- gr_utils.watchfn_spaces = _no_watchfn_spaces
34
 
35
- # ==========================
36
- # إعدادات أداء PyTorch / GPU
37
- # ==========================
 
 
 
 
 
 
38
 
39
- torch.backends.cudnn.benchmark = True # تسريع الـ conv على GPU
40
 
41
- DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
42
- USE_HALF = DEVICE.startswith("cuda")
43
- print("🔥 Using device:", DEVICE)
 
 
 
 
 
 
 
 
44
 
45
- # ==========================
46
- # إعدادات YOLO + الثوابت
47
- # ==========================
48
 
49
  WEIGHTS_PATH = "best.pt"
50
- IMG_SIZE = 256
51
- CONF_THRESHOLD = 0.5
52
 
53
- MIN_STABLE_FRAMES = 3
54
- WARN_BEFORE_RESET = 1.5
55
- RESET_DELAY = 2.5
 
 
56
 
57
  arabic_map = {
58
- "aleff": "ا", "bb": "ب", "ta": "ت", "thaa": "ث", "jeem": "ج",
59
- "haa": "ح", "khaa": "خ", "dal": "د", "thal": "ذ", "ra": "ر",
60
- "zay": "ز", "seen": "س", "sheen": "ش", "saad": "ص", "dhad": "ض",
61
- "taa": "ط", "dha": "ظ", "ain": "ع", "ghain": "غ", "fa": "ف",
62
- "gaaf": "ق", "kaaf": "ك", "laam": "ل", "la": "لا", "meem": "م",
63
- "nun": "ن", "ha": "ه", "waw": "و", "ya": "ي", "yaa": "ي",
64
- "toot": "ة", "al": "ال"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  }
66
 
67
- SYSTEM_PROMPT = (
68
- "أنت مساعد ذكي يستقبل كلمات أو جمل قصيرة قادمة من مترجم لغة "
69
- "الإشارة العربية، ودورك أن تعيد صياغتها كنص عربي واضح ومفهوم، "
70
- "أو تشرح معناها باختصار إذا كانت كلمة واحدة."
71
- )
72
 
73
- # ==========================
74
- # إعداد خط عربي مرة وحدة
75
- # ==========================
 
 
 
76
 
77
- DEFAULT_FONT_SIZE = 24
78
- DEFAULT_FONT_PATH = "NotoNaskhArabic-VariableFont_wght.ttf"
79
 
80
- try:
81
- FONT_AR = ImageFont.truetype(DEFAULT_FONT_PATH, DEFAULT_FONT_SIZE)
82
- except Exception:
83
- FONT_AR = ImageFont.load_default()
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
- def prepare_arabic(text: str) -> str:
87
- reshaped = arabic_reshaper.reshape(text)
88
- bidi_text = get_display(reshaped)
89
- return bidi_text
90
 
91
- # ==========================
92
- # رسم الديتكشن بشكل سريع
93
- # ==========================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
- def draw_detections(result, frame, names):
96
  boxes = result.boxes
97
- detected_labels = []
98
 
99
  if boxes is None or len(boxes) == 0:
100
- return frame, detected_labels
101
-
102
- label_infos = []
103
 
 
104
  for box in boxes:
105
  x1, y1, x2, y2 = map(int, box.xyxy[0])
106
  cls_id = int(box.cls[0])
107
 
108
- if isinstance(names, dict):
109
- eng_label = names.get(cls_id, str(cls_id))
110
  else:
111
- eng_label = names[cls_id] if cls_id < len(names) else str(cls_id)
112
-
113
- ar_label = arabic_map.get(eng_label, eng_label)
114
- detected_labels.append(ar_label)
115
-
116
- # box
117
- cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
118
-
119
- label_bg_y1 = max(0, y1 - 35)
120
- label_bg_y2 = y1
121
- cv2.rectangle(
122
- frame,
123
- (x1, label_bg_y1),
124
- (x1 + 140, label_bg_y2),
125
  (0, 255, 0),
126
- -1,
127
  )
128
 
129
- label_infos.append(
130
- (
131
- prepare_arabic(ar_label),
132
- x1 + 5,
133
- label_bg_y1 + 5,
134
- )
135
- )
136
 
137
- img_pil = Image.fromarray(frame)
138
- draw = ImageDraw.Draw(img_pil)
139
 
140
- for bidi_text, tx, ty in label_infos:
141
- draw.text((tx, ty), bidi_text, font=FONT_AR, fill=(0, 0, 0))
 
142
 
143
- return np.array(img_pil), detected_labels
 
 
 
144
 
145
- # ==========================
146
- # تحميل YOLO على GPU + half
147
- # ==========================
148
 
149
- print("🔹 Loading YOLO model...")
150
- model = YOLO(WEIGHTS_PATH)
151
- model.to(DEVICE)
152
 
153
- if USE_HALF:
154
- try:
155
- model.model.half()
156
- print("⚡ Using half precision for YOLO on GPU")
157
- except Exception as e:
158
- print("⚠️ Could not enable half precision:", e)
159
 
160
- print("📚 Classes:", model.names)
161
 
162
- # ==========================
163
- # Gemini API Call
164
- # ==========================
 
 
 
 
165
 
166
- def call_gemini_on_word(word: str) -> str:
167
- if not word:
168
- return ""
 
169
 
170
- try:
171
- model_g = genai.GenerativeModel("gemini-1.5-flash")
 
172
 
173
- prompt = (
174
- SYSTEM_PROMPT
175
- + f"\n\nالنص القادم من مترجم لغة الإشارة هو: «{word}».\n"
176
- + "اكتب جملة قصيرة أو شرحًا بسيطًا بالعربية اعتمادًا على هذا النص."
177
- )
178
 
179
- response = model_g.generate_content(prompt)
180
- return (response.text or "").strip()
181
- except Exception as e:
182
- return f"خطأ Gemini: {e}"
183
-
184
- # ==========================
185
- # معالجة الفريم
186
- # ==========================
187
-
188
- def process_frame(
189
- frame,
190
- current_word="",
191
- last_label=None,
192
- stable_count=0,
193
- last_letter_time=None,
194
- chat_history=None,
195
- ):
196
- if chat_history is None:
197
- chat_history = []
198
-
199
- frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
200
- frame_bgr = cv2.flip(frame_bgr, 1)
201
-
202
- results = model.predict(
203
- frame_bgr,
204
- conf=CONF_THRESHOLD,
205
- imgsz=IMG_SIZE,
206
- verbose=False,
207
- device=DEVICE,
208
- half=USE_HALF,
209
- )[0]
210
 
211
- annotated, labels = draw_detections(results, frame_bgr, model.names)
 
 
212
 
213
- if labels:
214
- current_label = labels[0]
215
- if current_label == last_label:
216
- stable_count += 1
 
 
 
 
 
 
 
217
  else:
218
- last_label = current_label
219
- stable_count = 1
 
 
 
 
 
220
 
221
- if stable_count >= MIN_STABLE_FRAMES:
222
- current_word += current_label
223
- last_letter_time = time.time()
224
- stable_count = 0
225
 
226
- status_text = ""
 
227
 
228
- if current_word and last_letter_time is not None:
229
- elapsed = time.time() - last_letter_time
230
 
231
- if elapsed > RESET_DELAY:
232
- final_text = current_word
233
 
234
- chat_history.append(["🖐️ من الإشارات", final_text])
 
 
235
 
236
- gpt_reply = call_gemini_on_word(final_text)
 
 
237
 
238
- if gpt_reply:
239
- chat_history.append(["🤖 المساعد", gpt_reply])
240
 
241
- current_word = ""
242
- last_label = None
243
- stable_count = 0
244
- last_letter_time = None
245
 
246
- elif elapsed > WARN_BEFORE_RESET:
247
- status_text = f"الكلمة الحالية: {current_word} (سيتم إنهاؤها قريبًا)"
248
- else:
249
- status_text = f"الكلمة الحالية: {current_word}"
250
 
251
- annotated_rgb = cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB)
 
252
 
253
- return (
254
- annotated_rgb,
255
- status_text,
256
- current_word,
257
- last_label,
258
- stable_count,
259
- last_letter_time,
260
- chat_history,
261
- chat_history,
262
- )
263
 
264
- # ==========================
265
- # واجهة Gradio
266
- # ==========================
267
 
268
  with gr.Blocks() as demo:
269
- gr.Markdown("## ASL → Arabic Chat (YOLO + Gemini)")
270
-
271
- with gr.Row():
272
- cam = gr.Image(
273
- sources=["webcam"],
274
- streaming=True,
275
- type="numpy",
276
- label="الكاميرا",
277
- )
278
- video_out = gr.Image(label="النتيجة")
279
-
280
- word_status = gr.Markdown()
281
- chatbox = gr.Chatbot(label="الشات (إشارة → نص)")
282
-
283
- state_current_word = gr.State("")
284
- state_last_label = gr.State(None)
285
- state_stable_count = gr.State(0)
286
- state_last_letter_time = gr.State(None)
287
- state_chat_history = gr.State([])
288
-
289
- cam.stream(
290
- fn=process_frame,
291
- inputs=[
292
- cam,
293
- state_current_word,
294
- state_last_label,
295
- state_stable_count,
296
- state_last_letter_time,
297
- state_chat_history,
298
- ],
299
- outputs=[
300
- video_out,
301
- word_status,
302
- state_current_word,
303
- state_last_label,
304
- state_stable_count,
305
- state_last_letter_time,
306
- state_chat_history,
307
- chatbox,
308
- ],
309
- )
310
-
311
- # في Spaces ما نحتاج نحدد port غالباً
312
  if __name__ == "__main__":
313
- demo.launch()
 
 
1
  import os
 
2
  import cv2
 
3
  import gradio as gr
 
 
 
 
 
 
4
  import google.generativeai as genai
5
+ from ultralytics import YOLO
6
+ import tempfile
7
  import torch
8
 
9
# =============================
# Device selection (GPU / CPU)
# =============================
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 Using device: {DEVICE}")

# =============================
# Gemini API key
# =============================
# SECURITY: an API key must never be committed to source control. Any key
# that was previously hard-coded here has to be treated as leaked and revoked.
# The key is read from the GEMINI_API_KEY environment variable instead
# (for example a secret configured on the hosting platform).
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")

if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
else:
    # Keep the app importable without a key; Gemini calls will fail later
    # and report their own error instead of crashing at start-up.
    print("⚠️ GEMINI_API_KEY is not set; Gemini requests will fail until it is configured.")

# Instruction text prepended to every Gemini request. It asks the model to
# de-duplicate the raw letter stream, insert spaces between words, and return
# only the corrected sentence. (Runtime string: must stay in Arabic as-is.)
SYSTEM_PROMPT = (
    "لدي نص خام عبارة عن حروف عربية متتابعة بدون مسافات "
    "ومع وجود تكرار بسيط لأنه ناتج من مترجم لغة الإشارة.\n"
    "مهمتك:\n"
    "1) إزالة التكرار غير الضروري.\n"
    "2) إضافة المسافات بين الكلمات.\n"
    "3) إخراج الجملة الأقرب للمعنى.\n"
    "أعد النص فقط بدون شرح."
)
33
 
 
34
 
35
def fix_with_gemini(raw_text: str) -> str:
    """Ask Gemini to turn the raw detected letter stream into readable text.

    Args:
        raw_text: Arabic letters/words produced by the sign detector.

    Returns:
        The model's answer with surrounding whitespace removed. An empty
        string is returned, without calling the API, when ``raw_text`` is
        ``None``, empty, or whitespace only. If the request fails for any
        reason, a message starting with "خطأ في Gemini:" is returned instead
        of raising, so the UI can display it.
    """
    cleaned = (raw_text or "").strip()
    if not cleaned:
        return ""
    try:
        # Named ``gemini`` (not ``model``) so it cannot be confused with the
        # module-level YOLO ``model`` object.
        gemini = genai.GenerativeModel("models/gemini-2.5-flash")
        prompt = SYSTEM_PROMPT + f"\n\nالنص الخام:\n«{cleaned}»"
        resp = gemini.generate_content(prompt)
        return (resp.text or "").strip()
    except Exception as e:  # UI boundary: report the failure as text
        return f"خطأ في Gemini: {e}"
45
+
46
 
47
# =============================
# YOLO settings + speed
# =============================

WEIGHTS_PATH = "best.pt"
IMG_SIZE = 320
CONF_THRESHOLD = 0.25  # lowered so that detections trigger more easily

# Letter-accumulation settings
MIN_STABLE_FRAMES = 1  # accept a letter the first time it is seen
FRAME_SKIP = 1  # analyse every frame (1 = no skipping)
MAX_FRAMES = 1000  # upper bound on the number of frames processed
WORD_GAP_FRAMES = 10  # frames without any letter that end the current word

# Maps the detector's class names to the Arabic text they stand for.
# "ta" is ت while "taa" is ط: mapping both to ت made the letter ط
# impossible to produce.
arabic_map = {
    "aleff": "ا",
    "bb": "ب",
    "ta": "ت",
    "taa": "ط",
    "thaa": "ث",
    "jeem": "ج",
    "haa": "ح",
    "khaa": "خ",
    "dal": "د",
    "dha": "ظ",
    "dhad": "ض",
    "fa": "ف",
    "gaaf": "ق",
    "ghain": "غ",
    "ha": "ه",
    "kaaf": "ك",
    "laam": "ل",
    "meem": "م",
    "nun": "ن",
    "ra": "ر",
    "saad": "ص",
    "seen": "س",
    "sheen": "ش",
    "thal": "ذ",
    "toot": "ة",
    "waw": "و",
    "ya": "ي",
    "yaa": "ي",
    "zay": "ز",
    "ain": "ع",
    "al": "ال",
    "la": "لا",
}
95
 
96
# Load the detector weights once at import time.
print("🔹 Loading YOLO model...")
model = YOLO(WEIGHTS_PATH)

# Move the model to the selected device; a failure is reported but not fatal.
try:
    model.to(DEVICE)
except Exception as move_error:
    print("⚠️ تعذر نقل الموديل إلى الجهاز:", move_error)
else:
    print("✅ YOLO model moved to", DEVICE)

print("📚 Classes:", model.names)
 
107
 
 
 
 
 
108
 
109
+ # =============================
110
+ # ضغط الفيديو قبل المعالجة (دقة 360p تقريباً + تقليل FPS)
111
+ # =============================
112
+
113
def preprocess_video(input_path: str, target_width: int = 640, target_fps: int = 8) -> str:
    """Write a smaller, lower-frame-rate copy of a video to speed up detection.

    Every kept frame is resized to ``target_width`` pixels wide (the height
    follows the source aspect ratio) and only one frame out of every
    ``frame_step`` is kept so the copy runs at roughly ``target_fps``.

    Args:
        input_path: Path of the source video.
        target_width: Width in pixels of the reduced copy.
        target_fps: Approximate frame rate of the reduced copy.

    Returns:
        Path of a new temporary ``.mp4`` file, or ``input_path`` unchanged
        when the source cannot be opened, reports unusable dimensions, or the
        output writer cannot be created. The caller is responsible for
        deleting the temporary file.
    """
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        cap.release()
        print("[preprocess] تعذر فتح الفيديو، سنستخدم الملف الأصلي كما هو.")
        return input_path  # fallback

    out = None
    try:
        orig_fps = cap.get(cv2.CAP_PROP_FPS)
        w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Some containers report 0 for width/height; dividing by w would crash.
        if w <= 0 or h <= 0 or target_width <= 0 or target_fps <= 0:
            print("[preprocess] invalid video dimensions or targets; using the original file.")
            return input_path

        # "not x > 0" is also true for NaN, which some backends return.
        if not orig_fps > 0:
            orig_fps = 0.0
            frame_step = 1
            out_fps = float(target_fps)
        else:
            frame_step = max(1, int(round(orig_fps / target_fps)))
            out_fps = orig_fps / frame_step

        # Keep the aspect ratio; never let the height collapse to 0.
        target_height = max(1, int(target_width * h / w))

        fd, tmp_path = tempfile.mkstemp(suffix=".mp4")
        os.close(fd)

        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = cv2.VideoWriter(tmp_path, fourcc, out_fps, (target_width, target_height))
        if not out.isOpened():
            # Without a working writer the copy would be an empty file.
            print("[preprocess] could not create the output video; using the original file.")
            out.release()
            out = None
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # best-effort cleanup of the unused temp file
            return input_path

        frame_idx = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Keep only one frame out of every frame_step.
            if frame_idx % frame_step == 0:
                resized = cv2.resize(
                    frame,
                    (target_width, target_height),
                    interpolation=cv2.INTER_AREA,
                )
                out.write(resized)

            frame_idx += 1
    finally:
        # Always release native handles, even if reading/resizing raised.
        cap.release()
        if out is not None:
            out.release()

    print(f"[preprocess] original_fps={orig_fps:.2f}, new_fps={out_fps:.2f}, saved={tmp_path}")
    return tmp_path
160
+
161
+
162
+ # =============================
163
+ # معالجة فريم واحد (YOLO على GPU)
164
+ # =============================
165
+
166
def detect_frame(frame_bgr):
    """Run YOLO on one frame, draw the detections on it, and return the letters.

    Args:
        frame_bgr: Frame in OpenCV's BGR channel order. It is annotated in
            place.

    Returns:
        A ``(labels, frame_bgr)`` tuple: ``labels`` holds one entry per
        detected box (the ``arabic_map`` translation of the class name, or the
        class name itself when it has no translation), in the order the model
        returned the boxes; ``frame_bgr`` is the same frame object with boxes
        and captions drawn. ``labels`` is empty when nothing was detected.
    """
    # Ultralytics treats NumPy image input as BGR, so the OpenCV frame is
    # passed unchanged; converting it to RGB first would swap the red and
    # blue channels seen by the model.
    result = model.predict(
        frame_bgr,
        conf=CONF_THRESHOLD,
        imgsz=IMG_SIZE,
        verbose=False,
        device=DEVICE,
    )[0]

    boxes = result.boxes
    if boxes is None or len(boxes) == 0:
        return [], frame_bgr

    names = model.names
    labels = []
    for box in boxes:
        x1, y1, x2, y2 = map(int, box.xyxy[0])
        cls_id = int(box.cls[0])

        # model.names may be a dict or a sequence; fall back to the id.
        if isinstance(names, dict):
            eng = names.get(cls_id, str(cls_id))
        else:
            eng = names[cls_id] if cls_id < len(names) else str(cls_id)

        letter = arabic_map.get(eng, eng)
        labels.append(letter)

        cv2.rectangle(frame_bgr, (x1, y1), (x2, y2), (0, 255, 0), 2)
        # OpenCV's Hershey fonts cannot draw Arabic glyphs (they come out as
        # "?"), so the caption uses the ASCII class name. The baseline is
        # clamped so the text stays visible when the box touches the top edge.
        cv2.putText(
            frame_bgr,
            str(eng),
            (x1, max(y1 - 10, 20)),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.7,
            (0, 255, 0),
            2,
        )

    return labels, frame_bgr
 
 
 
 
 
 
206
 
 
 
207
 
208
+ # =============================
209
+ # VIDEO RAW TEXT + OUTPUT VIDEO
210
+ # =============================
211
 
212
def extract_and_render(video_path: str):
    """Detect letters in a video, group them into words, and save an annotated copy.

    Each processed frame is mirrored horizontally, passed to ``detect_frame``
    and written to the output video. Only the first label of a frame is used.
    A label is appended to the current word once it has been seen for
    ``MIN_STABLE_FRAMES`` consecutive processed frames and differs from the
    previously appended letter. A run of ``WORD_GAP_FRAMES`` frames without
    any detection closes the current word. At most ``MAX_FRAMES`` frames are
    read.

    Args:
        video_path: Path of the video to analyse.

    Returns:
        ``(raw_text, out_path)``: the detected words joined by single spaces,
        and the path of the annotated video. ``("", None)`` when the input
        cannot be opened; ``out_path`` is ``None`` when the output video
        could not be created.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        return "", None

    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    if not fps > 0:  # also catches NaN
        fps = 8.0  # fallback
    if FRAME_SKIP > 1:
        # Skipped frames are not written, so lower the output frame rate to
        # keep the playback duration equal to the source.
        fps = fps / FRAME_SKIP

    # A unique temp file per call: a fixed file name would be overwritten
    # when two requests are processed at the same time.
    fd, out_path = tempfile.mkstemp(prefix="processed_", suffix=".mp4")
    os.close(fd)

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(out_path, fourcc, fps, (width, height))
    writer_ok = out.isOpened()

    word = ""
    words = []
    last_label = None  # label seen on the previous detecting frame
    last_added = None  # last letter appended to ``word``
    stable = 0  # consecutive frames showing ``last_label``
    last_seen = None  # index of the last frame with a detection
    frame_index = 0

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_index += 1
            if frame_index > MAX_FRAMES:
                break

            if FRAME_SKIP > 1 and frame_index % FRAME_SKIP != 0:
                continue

            frame = cv2.flip(frame, 1)
            labels, rendered = detect_frame(frame)
            if writer_ok:
                out.write(rendered)

            if labels:
                label = labels[0]
                last_seen = frame_index

                if label == last_label:
                    stable += 1
                else:
                    last_label = label
                    stable = 1

                if stable >= MIN_STABLE_FRAMES:
                    # Do not append the same letter twice in a row.
                    if label != last_added:
                        word += label
                        last_added = label
                    stable = 0
            elif (
                word
                and last_seen is not None
                and frame_index - last_seen >= WORD_GAP_FRAMES
            ):
                # Long enough without a detection: close the current word.
                words.append(word)
                word = ""
                last_label = None
                last_added = None
                stable = 0
                last_seen = None
    finally:
        # Release native handles even if detection raised.
        cap.release()
        out.release()

    if not writer_ok:
        try:
            os.remove(out_path)
        except OSError:
            pass  # best-effort cleanup of the unused temp file
        out_path = None

    if word:
        words.append(word)

    raw_text = " ".join(words).strip()
    return raw_text, out_path
285
 
 
 
286
 
287
+ # =============================
288
+ # Gradio واجهة كاملة
289
+ # =============================
290
 
291
def run(file):
    """Gradio callback: process an uploaded sign-language video.

    Args:
        file: Value delivered by the upload component; either a path string
            or an object whose ``name`` attribute is the path. ``None`` when
            nothing was uploaded.

    Returns:
        ``(raw, pretty, processed_path)``: the raw detected text (or an
        explanatory message when nothing was detected or uploaded), the
        Gemini-corrected text (empty when there is no raw text), and the
        path of the annotated video (may be ``None``).
    """
    if file is None:
        return "لم يتم رفع فيديو", "", None

    # Accept both a plain path and a file-like wrapper exposing ``.name``.
    video_path = file if isinstance(file, str) else file.name

    # Shrink the video first (smaller frames, ~8 fps) to speed up analysis.
    light_path = preprocess_video(video_path, target_width=640, target_fps=8)

    try:
        raw, processed_path = extract_and_render(light_path)
    finally:
        # preprocess_video returns the original path when it falls back;
        # only delete the file if it is a temporary copy.
        if light_path != video_path:
            try:
                os.remove(light_path)
            except OSError:
                pass  # best-effort cleanup; a leftover temp file is harmless

    pretty = fix_with_gemini(raw) if raw else ""

    if not raw:
        raw = "لم يتم التعرف على أي نص من الإشارات."

    return raw, pretty, processed_path
 
 
 
 
 
 
 
 
 
307
 
 
 
 
308
 
309
# Gradio UI: upload a video, show the raw and corrected text plus the
# annotated video produced by ``run``.
with gr.Blocks() as demo:
    gr.Markdown("## 🤟 ASL → Arabic (YOLO + Gemini) مع إعادة فيديو المعالجة 🎥 — نسخة GPU")

    inp = gr.File(label="ارفع فيديو الإشارة")
    raw = gr.Textbox(label="النص الخام", lines=3)
    pretty = gr.Textbox(label="النص المحسن (Gemini)", lines=3)
    video_out = gr.Video(label="الفيديو بعد البروسيس")

    # The button label was missing its opening quote, which made the whole
    # module fail with a SyntaxError.
    btn = gr.Button("بدأ المعالجة")

    btn.click(run, inputs=[inp], outputs=[raw, pretty, video_out])

if __name__ == "__main__":
    # Listen on all interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)