Mr-HASSAN committed on
Commit
0e03f83
·
verified ·
1 Parent(s): 1dd48ee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -30
app.py CHANGED
@@ -8,7 +8,7 @@ import torch
8
  import spaces # ู…ู‡ู… ู„ู€ @spaces.GPU
9
 
10
  # =============================
11
- # ุฅุนุฏุงุฏ ู…ูุชุงุญ Gemini (ุญุทู‡ ู‡ู†ุง)
12
  # =============================
13
 
14
  GEMINI_API_KEY = "AIzaSyAvm28ZnTMaZ1Jtg9sYM-EO4qlAN2W4BIQ"
@@ -43,15 +43,17 @@ def fix_with_gemini(raw_text: str) -> str:
43
  # =============================
44
 
45
  WEIGHTS_PATH = "best.pt"
46
- IMG_SIZE = 720
47
- CONF_THRESHOLD = 0.60 # ุชุฎููŠุถ ู„ู„ุณู‡ูˆู„ุฉ
48
 
49
- # ุฅุนุฏุงุฏุงุช ุชุฌู…ูŠุน ุงู„ุญุฑูˆู
50
  MIN_STABLE_FRAMES = 1
51
  FRAME_SKIP = 1
52
  MAX_FRAMES = 1000
53
  WORD_GAP_FRAMES = 10
54
 
 
 
 
55
  arabic_map = {
56
  "aleff": "ุง",
57
  "bb": "ุจ",
@@ -87,15 +89,13 @@ arabic_map = {
87
  "la": "ู„ุง",
88
  }
89
 
90
- # ู‡ู†ุณุชุฎุฏู… ู…ูˆุฏูŠู„ ุนุงู„ู…ูŠ ู„ูƒู† ู†ุญู…ู‘ู„ู‡ ุนู†ุฏ ุฃูˆู„ ุงุณุชุฎุฏุงู… ูู‚ุท
91
  yolo_model = None
92
  DEVICE = "cpu"
93
 
94
 
95
  def get_model():
96
  """
97
- ูŠุญู…ู‘ู„ YOLO ู…ุฑุฉ ูˆุงุญุฏุฉุŒ ูˆูŠุญุงูˆู„ ู†ู‚ู„ู‡ ู„ู€ GPU ู„ูˆ ู…ุชูˆูุฑ.
98
- ูŠูุณุชุฏุนู‰ ุฏุงุฎู„ ุฏุงู„ุฉ ุนู„ูŠู‡ุง @spaces.GPU ุจุนุฏ ู…ุง ุงู„ู€ GPU ูŠุดุชุบู„ ูุนู„ูŠู‹ุง.
99
  """
100
  global yolo_model, DEVICE
101
 
@@ -104,7 +104,6 @@ def get_model():
104
  yolo_model = YOLO(WEIGHTS_PATH)
105
  print("๐Ÿ“š Classes:", yolo_model.names)
106
 
107
- # ู†ุนูŠุฏ ุงู„ุชุญู‚ู‚ ู…ู† CUDA ู‡ู†ุง (ุจุนุฏ ู…ุง GPU ูŠุดุชุบู„ ููŠ Spaces)
108
  if torch.cuda.is_available():
109
  if DEVICE != "cuda":
110
  DEVICE = "cuda"
@@ -114,25 +113,66 @@ def get_model():
114
  except Exception as e:
115
  print("โš ๏ธ ุชุนุฐุฑ ู†ู‚ู„ ุงู„ู…ูˆุฏูŠู„ ุฅู„ู‰ cuda:", e)
116
  else:
 
 
117
  DEVICE = "cpu"
118
- print("โš ๏ธ CUDA ุบูŠุฑ ู…ุชูˆูุฑุŒ ุณูŠุชู… ุงุณุชุฎุฏุงู… CPU.")
119
 
120
  return yolo_model
121
 
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  # =============================
124
  # ุถุบุท ุงู„ููŠุฏูŠูˆ ู‚ุจู„ ุงู„ู…ุนุงู„ุฌุฉ
125
  # =============================
126
 
127
- def preprocess_video(input_path: str, target_width: int = 640, target_fps: int = 8) -> str:
128
  """
129
- ูŠู‚ู„ู„ ุฏู‚ุฉ ุงู„ููŠุฏูŠูˆ ูˆุงู„ู€ FPS ุนุดุงู† ู†ุฎู„ูŠ ุงู„ุจุฑูˆุณูŠุณ ุฃุณุฑุน.
130
- ูŠุฑุฌู‘ุน ู…ุณุงุฑ ููŠุฏูŠูˆ ุฎููŠู ุฌุฏูŠุฏ.
 
131
  """
132
  cap = cv2.VideoCapture(input_path)
133
  if not cap.isOpened():
134
  print("[preprocess] ุชุนุฐุฑ ูุชุญ ุงู„ููŠุฏูŠูˆุŒ ุณู†ุณุชุฎุฏู… ุงู„ู…ู„ู ุงู„ุฃุตู„ูŠ ูƒู…ุง ู‡ูˆ.")
135
- return input_path # fallback
136
 
137
  orig_fps = cap.get(cv2.CAP_PROP_FPS)
138
  w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
@@ -145,13 +185,26 @@ def preprocess_video(input_path: str, target_width: int = 640, target_fps: int =
145
  frame_step = max(1, int(round(orig_fps / target_fps)))
146
  out_fps = orig_fps / frame_step
147
 
148
- target_height = int(target_width * h / w)
 
 
 
 
 
 
 
149
 
150
  fd, tmp_path = tempfile.mkstemp(suffix=".mp4")
151
  os.close(fd)
152
 
 
 
 
 
 
 
153
  fourcc = cv2.VideoWriter_fourcc(*"mp4v")
154
- out = cv2.VideoWriter(tmp_path, fourcc, out_fps, (target_width, target_height))
155
 
156
  frame_idx = 0
157
  while True:
@@ -160,14 +213,18 @@ def preprocess_video(input_path: str, target_width: int = 640, target_fps: int =
160
  break
161
 
162
  if frame_idx % frame_step == 0:
163
- resized = cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_AREA)
164
- out.write(resized)
 
 
 
 
165
 
166
  frame_idx += 1
167
 
168
  cap.release()
169
  out.release()
170
- print(f"[preprocess] original_fps={orig_fps:.2f}, new_fps={out_fps:.2f}, saved={tmp_path}")
171
  return tmp_path
172
 
173
 
@@ -176,7 +233,7 @@ def preprocess_video(input_path: str, target_width: int = 640, target_fps: int =
176
  # =============================
177
 
178
  def detect_frame(frame_bgr):
179
- model = get_model() # ู†ุชุฃูƒุฏ ุงู„ู…ูˆุฏูŠู„ ุฌุงู‡ุฒ ูˆุนู„ู‰ ุงู„ุฌู‡ุงุฒ ุงู„ุตุญูŠุญ
180
 
181
  frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
182
  result = model.predict(
@@ -184,11 +241,14 @@ def detect_frame(frame_bgr):
184
  conf=CONF_THRESHOLD,
185
  imgsz=IMG_SIZE,
186
  verbose=False,
187
- device=DEVICE, # cuda ุฃูˆ cpu ุญุณุจ ุงู„ู…ุชุงุญ
188
  )[0]
189
 
190
  boxes = result.boxes
191
 
 
 
 
192
  if boxes is None or len(boxes) == 0:
193
  return [], frame_bgr
194
 
@@ -220,13 +280,13 @@ def detect_frame(frame_bgr):
220
 
221
 
222
  # =============================
223
- # VIDEO โ†’ RAW TEXT + OUTPUT VIDEO
224
  # =============================
225
 
226
  def extract_and_render(video_path: str):
227
  cap = cv2.VideoCapture(video_path)
228
  if not cap.isOpened():
229
- return "", None
230
 
231
  fourcc = cv2.VideoWriter_fourcc(*"mp4v")
232
  out_path = "processed_output.mp4"
@@ -248,6 +308,9 @@ def extract_and_render(video_path: str):
248
  last_seen = None
249
  frame_index = 0
250
 
 
 
 
251
  while True:
252
  ret, frame = cap.read()
253
  if not ret:
@@ -265,6 +328,9 @@ def extract_and_render(video_path: str):
265
  out.write(rendered)
266
 
267
  if labels:
 
 
 
268
  label = labels[0]
269
  last_seen = frame_index
270
 
@@ -295,42 +361,61 @@ def extract_and_render(video_path: str):
295
  words.append(word)
296
 
297
  raw_text = " ".join(words).strip()
298
- return raw_text, out_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
 
300
 
301
  # =============================
302
  # Gradio + @spaces.GPU
303
  # =============================
304
 
305
- @spaces.GPU # ู‡ุฐุง ุงู„ู„ูŠ ูŠุฑุถูŠ Hugging Face ูˆูŠุดุบู‘ู„ GPU on demand
306
  def run(file):
307
  if file is None:
308
- return "ู„ู… ูŠุชู… ุฑูุน ููŠุฏูŠูˆ", "", None
309
 
310
  video_path = file.name
311
 
312
- light_path = preprocess_video(video_path, target_width=640, target_fps=8)
 
313
 
314
- raw, processed_path = extract_and_render(light_path)
315
  pretty = fix_with_gemini(raw) if raw else ""
316
 
317
  if not raw:
318
  raw = "ู„ู… ูŠุชู… ุงู„ุชุนุฑู ุนู„ู‰ ุฃูŠ ู†ุต ู…ู† ุงู„ุฅุดุงุฑุงุช."
319
 
320
- return raw, pretty, processed_path
321
 
322
 
323
  with gr.Blocks() as demo:
324
- gr.Markdown("## ๐ŸคŸ ASL โ†’ Arabic (YOLO + Gemini) โ€” ู†ุณุฎุฉ GPU ุนู„ู‰ Hugging Face Spaces")
325
 
326
  inp = gr.File(label="ุงุฑูุน ููŠุฏูŠูˆ ุงู„ุฅุดุงุฑุฉ")
327
  raw = gr.Textbox(label="ุงู„ู†ุต ุงู„ุฎุงู…", lines=3)
328
  pretty = gr.Textbox(label="ุงู„ู†ุต ุงู„ู…ุญุณู† (Gemini)", lines=3)
329
  video_out = gr.Video(label="ุงู„ููŠุฏูŠูˆ ุจุนุฏ ุงู„ุจุฑูˆุณูŠุณ")
 
330
 
331
  btn = gr.Button("ุงุจุฏุฃ ุงู„ู…ุนุงู„ุฌุฉ")
332
 
333
- btn.click(run, inputs=[inp], outputs=[raw, pretty, video_out])
334
 
335
  if __name__ == "__main__":
336
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
8
  import spaces # ู…ู‡ู… ู„ู€ @spaces.GPU
9
 
10
  # =============================
11
+ # ุฅุนุฏุงุฏ ู…ูุชุงุญ Gemini (ุญุท ุงู„ู…ูุชุงุญ ู‡ู†ุง)
12
  # =============================
13
 
14
  GEMINI_API_KEY = "AIzaSyAvm28ZnTMaZ1Jtg9sYM-EO4qlAN2W4BIQ"
 
43
  # =============================
44
 
45
  WEIGHTS_PATH = "best.pt"
46
+ IMG_SIZE = 640 # ุญุฌู… ุงู„ุฅุฏุฎุงู„ ู„ู€ YOLO
47
+ CONF_THRESHOLD = 0.15 # ู…ุฎูุถ ุนุดุงู† ู†ู„ุชู‚ุท ุฃูƒุซุฑ
48
 
 
49
  MIN_STABLE_FRAMES = 1
50
  FRAME_SKIP = 1
51
  MAX_FRAMES = 1000
52
  WORD_GAP_FRAMES = 10
53
 
54
+ # ู„ูˆ ุญุงุจ ุชู„ุบูŠ ุงู„ู‚ุต ุงู„ู…ุฑูƒุฒูŠ (ุฒูˆู…)ุŒ ุฎู„ูŠู‡ False
55
+ CENTER_CROP = True
56
+
57
  arabic_map = {
58
  "aleff": "ุง",
59
  "bb": "ุจ",
 
89
  "la": "ู„ุง",
90
  }
91
 
 
92
  yolo_model = None
93
  DEVICE = "cpu"
94
 
95
 
96
  def get_model():
97
  """
98
+ ูŠุญู…ู‘ู„ YOLO ู…ุฑุฉ ูˆุงุญุฏุฉุŒ ูˆูŠุญุงูˆู„ ูŠุณุชุฎุฏู… CUDA ู„ูˆ ู…ุชูˆูุฑ.
 
99
  """
100
  global yolo_model, DEVICE
101
 
 
104
  yolo_model = YOLO(WEIGHTS_PATH)
105
  print("๐Ÿ“š Classes:", yolo_model.names)
106
 
 
107
  if torch.cuda.is_available():
108
  if DEVICE != "cuda":
109
  DEVICE = "cuda"
 
113
  except Exception as e:
114
  print("โš ๏ธ ุชุนุฐุฑ ู†ู‚ู„ ุงู„ู…ูˆุฏูŠู„ ุฅู„ู‰ cuda:", e)
115
  else:
116
+ if DEVICE != "cpu":
117
+ print("โš ๏ธ CUDA ุบูŠุฑ ู…ุชูˆูุฑุŒ ุณูŠุชู… ุงุณุชุฎุฏุงู… CPU.")
118
  DEVICE = "cpu"
 
119
 
120
  return yolo_model
121
 
122
 
123
+ # =============================
124
+ # ุฏุงู„ุฉ ู…ุณุงุนุฏุฉ: ุชูƒุจูŠุฑ + ู‚ุต ู…ู† ุงู„ูˆุณุท 640x640
125
+ # =============================
126
+
127
+ def resize_and_center_crop(frame, target: int = 640):
128
+ """
129
+ - ู†ูƒุจุฑ/ู†ุตุบุฑ ุจุญูŠุซ ุฃู‚ุตุฑ ุถู„ุน = target
130
+ - ุซู… ู†ู‚ุต ู…ุฑุจุน 640x640 ู…ู† ุงู„ูˆุณุท (Zoom ู„ุทูŠู ุนู„ู‰ ุงู„ู…ุฑูƒุฒ)
131
+ """
132
+ h, w = frame.shape[:2]
133
+ short_side = min(w, h)
134
+ if short_side <= 0:
135
+ return frame
136
+
137
+ scale = target / short_side
138
+ new_w = int(w * scale)
139
+ new_h = int(h * scale)
140
+
141
+ frame = cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_AREA)
142
+
143
+ h, w = frame.shape[:2]
144
+ x1 = max(0, (w - target) // 2)
145
+ y1 = max(0, (h - target) // 2)
146
+ x2 = x1 + target
147
+ y2 = y1 + target
148
+
149
+ x2 = min(x2, w)
150
+ y2 = min(y2, h)
151
+
152
+ crop = frame[y1:y2, x1:x2]
153
+
154
+ # ู„ูˆ ู„ุณุจุจ ู…ุง ุงู„ู‚ุต ุฃุตุบุฑ ู…ู† 640x640ุŒ ู†ุฑุฌุน ู†ุถุจุทู‡
155
+ ch, cw = crop.shape[:2]
156
+ if ch != target or cw != target:
157
+ crop = cv2.resize(crop, (target, target), interpolation=cv2.INTER_AREA)
158
+
159
+ return crop
160
+
161
+
162
  # =============================
163
  # ุถุบุท ุงู„ููŠุฏูŠูˆ ู‚ุจู„ ุงู„ู…ุนุงู„ุฌุฉ
164
  # =============================
165
 
166
+ def preprocess_video(input_path: str, target_short_side: int = 640, target_fps: int = 8) -> str:
167
  """
168
+ ู†ุถุจุท ุงู„ููŠุฏูŠูˆ ุจุญูŠุซ:
169
+ - ุฃู‚ุตุฑ ุถู„ุน โ‰ˆ target_short_side
170
+ - ู…ุน ุฎูŠุงุฑ ู‚ุต ู…ุฑูƒุฒูŠ 640x640 (Zoom) ู„ูˆ CENTER_CROP = True
171
  """
172
  cap = cv2.VideoCapture(input_path)
173
  if not cap.isOpened():
174
  print("[preprocess] ุชุนุฐุฑ ูุชุญ ุงู„ููŠุฏูŠูˆุŒ ุณู†ุณุชุฎุฏู… ุงู„ู…ู„ู ุงู„ุฃุตู„ูŠ ูƒู…ุง ู‡ูˆ.")
175
+ return input_path
176
 
177
  orig_fps = cap.get(cv2.CAP_PROP_FPS)
178
  w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
 
185
  frame_step = max(1, int(round(orig_fps / target_fps)))
186
  out_fps = orig_fps / frame_step
187
 
188
+ short_side = min(w, h)
189
+ if short_side <= 0:
190
+ scale = 1.0
191
+ else:
192
+ scale = target_short_side / short_side
193
+
194
+ new_w = int(w * scale)
195
+ new_h = int(h * scale)
196
 
197
  fd, tmp_path = tempfile.mkstemp(suffix=".mp4")
198
  os.close(fd)
199
 
200
+ # ู„ูˆ ุจู†ู‚ุต 640x640 ู†ุฎู„ูŠ ุงู„ู€ writer ุจุฑุถูˆ 640x640
201
+ if CENTER_CROP:
202
+ out_w, out_h = IMG_SIZE, IMG_SIZE
203
+ else:
204
+ out_w, out_h = new_w, new_h
205
+
206
  fourcc = cv2.VideoWriter_fourcc(*"mp4v")
207
+ out = cv2.VideoWriter(tmp_path, fourcc, out_fps, (out_w, out_h))
208
 
209
  frame_idx = 0
210
  while True:
 
213
  break
214
 
215
  if frame_idx % frame_step == 0:
216
+ if CENTER_CROP:
217
+ processed = resize_and_center_crop(frame, target=IMG_SIZE)
218
+ else:
219
+ processed = cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_AREA)
220
+
221
+ out.write(processed)
222
 
223
  frame_idx += 1
224
 
225
  cap.release()
226
  out.release()
227
+ print(f"[preprocess] orig=({w}x{h}), new=({out_w}x{out_h}), saved={tmp_path}")
228
  return tmp_path
229
 
230
 
 
233
  # =============================
234
 
235
  def detect_frame(frame_bgr):
236
+ model = get_model()
237
 
238
  frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
239
  result = model.predict(
 
241
  conf=CONF_THRESHOLD,
242
  imgsz=IMG_SIZE,
243
  verbose=False,
244
+ device=DEVICE,
245
  )[0]
246
 
247
  boxes = result.boxes
248
 
249
+ num_boxes = 0 if boxes is None else len(boxes)
250
+ print(f"[detect_frame] boxes={num_boxes}")
251
+
252
  if boxes is None or len(boxes) == 0:
253
  return [], frame_bgr
254
 
 
280
 
281
 
282
  # =============================
283
+ # VIDEO โ†’ RAW TEXT + OUTPUT VIDEO + DEBUG
284
  # =============================
285
 
286
  def extract_and_render(video_path: str):
287
  cap = cv2.VideoCapture(video_path)
288
  if not cap.isOpened():
289
+ return "", None, "ุชุนุฐุฑ ูุชุญ ุงู„ููŠุฏูŠูˆ ููŠ extract_and_render"
290
 
291
  fourcc = cv2.VideoWriter_fourcc(*"mp4v")
292
  out_path = "processed_output.mp4"
 
308
  last_seen = None
309
  frame_index = 0
310
 
311
+ frames_with_dets = 0
312
+ debug_lines = []
313
+
314
  while True:
315
  ret, frame = cap.read()
316
  if not ret:
 
328
  out.write(rendered)
329
 
330
  if labels:
331
+ frames_with_dets += 1
332
+ debug_lines.append(f"frame {frame_index}: {labels}")
333
+
334
  label = labels[0]
335
  last_seen = frame_index
336
 
 
361
  words.append(word)
362
 
363
  raw_text = " ".join(words).strip()
364
+
365
+ if not debug_lines:
366
+ debug_info = (
367
+ f"total_frames={frame_index}, frames_with_detections=0\n"
368
+ "ู„ู… ูŠุชู… ุฑุตุฏ ุฃูŠ ุตู†ุงุฏูŠู‚ (boxes) ู…ู† YOLO ููŠ ุฃูŠ ูุฑูŠู….\n"
369
+ "ุชุญู‚ู‚ ู…ู†:\n"
370
+ "- ุฃู† best.pt ู‡ูˆ ู…ูˆุฏูŠู„ detection ูˆุชุฏุฑูŠุจู‡ ุณู„ูŠู….\n"
371
+ "- ุฃู† ุงู„ููŠุฏูŠูˆ ู…ุดุงุจู‡ ู„ุชุฏุฑูŠุจ ุงู„ู…ูˆุฏูŠู„ ู…ู† ู†ุงุญูŠุฉ ูˆุถุนูŠุฉ ุงู„ูŠุฏ ูˆุงู„ูƒุงู…ูŠุฑุง."
372
+ )
373
+ else:
374
+ sample = "\n".join(debug_lines[:30])
375
+ debug_info = (
376
+ f"total_frames={frame_index}, frames_with_detections={frames_with_dets}\n"
377
+ "ุฃู…ุซู„ุฉ ู…ู† ุงู„ูุฑูŠู…ุงุช ุงู„ู„ูŠ ููŠู‡ุง ุญุฑูˆู:\n"
378
+ f"{sample}"
379
+ )
380
+
381
+ return raw_text, out_path, debug_info
382
 
383
 
384
  # =============================
385
  # Gradio + @spaces.GPU
386
  # =============================
387
 
388
+ @spaces.GPU
389
  def run(file):
390
  if file is None:
391
+ return "ู„ู… ูŠุชู… ุฑูุน ููŠุฏูŠูˆ", "", None, "ู„ู… ูŠุชู… ุฑูุน ููŠุฏูŠูˆ"
392
 
393
  video_path = file.name
394
 
395
+ # ุงู„ุชูƒุจูŠุฑ + center crop 640x640
396
+ light_path = preprocess_video(video_path, target_short_side=640, target_fps=8)
397
 
398
+ raw, processed_path, debug_info = extract_and_render(light_path)
399
  pretty = fix_with_gemini(raw) if raw else ""
400
 
401
  if not raw:
402
  raw = "ู„ู… ูŠุชู… ุงู„ุชุนุฑู ุนู„ู‰ ุฃูŠ ู†ุต ู…ู† ุงู„ุฅุดุงุฑุงุช."
403
 
404
+ return raw, pretty, processed_path, debug_info
405
 
406
 
407
  with gr.Blocks() as demo:
408
+ gr.Markdown("## ๐ŸคŸ ASL โ†’ Arabic (YOLO + Gemini) โ€” ู…ุน ุชูƒุจูŠุฑ ุฃูุถู„ ู„ู„ููŠุฏูŠูˆู‡ุงุช ุงู„ุนุฑุถูŠุฉ")
409
 
410
  inp = gr.File(label="ุงุฑูุน ููŠุฏูŠูˆ ุงู„ุฅุดุงุฑุฉ")
411
  raw = gr.Textbox(label="ุงู„ู†ุต ุงู„ุฎุงู…", lines=3)
412
  pretty = gr.Textbox(label="ุงู„ู†ุต ุงู„ู…ุญุณู† (Gemini)", lines=3)
413
  video_out = gr.Video(label="ุงู„ููŠุฏูŠูˆ ุจุนุฏ ุงู„ุจุฑูˆุณูŠุณ")
414
+ debug_box = gr.Textbox(label="Debug info", lines=10)
415
 
416
  btn = gr.Button("ุงุจุฏุฃ ุงู„ู…ุนุงู„ุฌุฉ")
417
 
418
+ btn.click(run, inputs=[inp], outputs=[raw, pretty, video_out, debug_box])
419
 
420
  if __name__ == "__main__":
421
  demo.launch(server_name="0.0.0.0", server_port=7860)