Spaces:

Mustafa-albakkar
/

MediaAgent

Sleeping

App Files Files Community

Mustafa-albakkar committed on Nov 2, 2025

Commit

fa1a396

verified ·

1 Parent(s): f62f292

Update app.py

Browse files

Files changed (1) hide show

app.py +75 -18

app.py CHANGED Viewed

@@ -1,12 +1,29 @@
 import gradio as gr
 import intel_extension_for_pytorch as ipex
 import torch
-import torch, os, tempfile, requests, cv2
 from transformers import AutoProcessor, AutoModelForVision2Seq
 from PIL import Image
 from faster_whisper import WhisperModel
 import torch.nn as nn
 import transformers.activations
 # إصلاح مؤقت لمشكلة PytorchGELUTanh المحذوفة
 if not hasattr(transformers.activations, "PytorchGELUTanh"):
@@ -14,15 +31,19 @@ if not hasattr(transformers.activations, "PytorchGELUTanh"):
         def forward(self, x):
             return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * x**3)))
     transformers.activations.PytorchGELUTanh = PytorchGELUTanh
 # ==============================
 # إعدادات الجهاز والنماذج
 # ==============================
 device = "cpu"
 VL_MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct-AWQ"
 processor = AutoProcessor.from_pretrained(VL_MODEL_ID, trust_remote_code=True)
 vl_model = AutoModelForVision2Seq.from_pretrained(VL_MODEL_ID, trust_remote_code=True).to(device)
 whisper = WhisperModel("base", device=device)
 # ==============================
 # الدالة الرئيسية لتحليل الوسائط
@@ -32,16 +53,21 @@ def analyze_media(input_data: str) -> str:
     يستقبل إما رابط صورة / صوت / فيديو أو مسار ملف محلي.
     ويُرجع وصف الصورة أو تفريغ النص من الصوت.
     """
     try:
         # --- تحديد نوع الإدخال ---
-        url_or_path = input_data.strip()
         if not url_or_path:
-            return "No input provided."
         # --- تحليل الصورة ---
         if url_or_path.endswith((".jpg", ".jpeg", ".png")):
             # تحميل الصورة من الإنترنت أو المسار المحلي
             if url_or_path.startswith("http"):
                 response = requests.get(url_or_path, stream=True, timeout=15)
                 response.raise_for_status()
                 image = Image.open(response.raw).convert("RGB")
@@ -51,14 +77,18 @@ def analyze_media(input_data: str) -> str:
             inputs = processor(text="Describe the image in detail.", images=image, return_tensors="pt").to(device)
             with torch.no_grad():
                 out = vl_model.generate(**inputs, max_new_tokens=256)
-            result = processor.batch_decode(out, skip_special_tokens=True)[0]
-            return result.strip()
         # --- تحليل الصوت ---
         elif url_or_path.endswith((".mp3", ".wav", ".m4a", ".flac")):
             # تحميل الملف مؤقتًا إذا كان من رابط
             if url_or_path.startswith("http"):
                 temp_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
                 data = requests.get(url_or_path, timeout=30).content
                 with open(temp_path, "wb") as f:
                     f.write(data)
@@ -66,15 +96,23 @@ def analyze_media(input_data: str) -> str:
                 temp_path = url_or_path
             segments, _ = whisper.transcribe(temp_path)
-            text = " ".join([seg.text for seg in segments])
-            if os.path.exists(temp_path) and url_or_path.startswith("http"):
-                os.remove(temp_path)
-            return text.strip()
         # --- تحليل الفيديو (وصف الإطار الأول) ---
         elif url_or_path.endswith((".mp4", ".avi", ".mov", ".mkv")):
             if url_or_path.startswith("http"):
                 temp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
                 data = requests.get(url_or_path, timeout=30).content
                 with open(temp_video, "wb") as f:
                     f.write(data)
@@ -85,7 +123,13 @@ def analyze_media(input_data: str) -> str:
             ret, frame = cap.read()
             cap.release()
             if not ret:
-                return "Could not read video."
             frame_path = tempfile.NamedTemporaryFile(delete=False, suffix=".jpg").name
             cv2.imwrite(frame_path, frame)
             image = Image.open(frame_path).convert("RGB")
@@ -93,18 +137,30 @@ def analyze_media(input_data: str) -> str:
             inputs = processor(text="Describe the video frame.", images=image, return_tensors="pt").to(device)
             with torch.no_grad():
                 out = vl_model.generate(**inputs, max_new_tokens=256)
-            result = processor.batch_decode(out, skip_special_tokens=True)[0]
-            os.remove(frame_path)
-            if url_or_path.startswith("http") and os.path.exists(temp_video):
-                os.remove(temp_video)
-            return result.strip()
         else:
-            return "Unsupported format. Please provide an image, audio, or video file."
     except Exception as e:
-        return f"❌ Error: {str(e)}"
 # ==============================
 # واجهة Gradio
@@ -121,4 +177,5 @@ iface = gr.Interface(
 # تشغيل الواجهة فقط (بدون FastAPI)
 # ==============================
 if __name__ == "__main__":
     iface.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))

 import gradio as gr
 import intel_extension_for_pytorch as ipex
 import torch
+import os
+import tempfile
+import requests
+import cv2
 from transformers import AutoProcessor, AutoModelForVision2Seq
 from PIL import Image
 from faster_whisper import WhisperModel
 import torch.nn as nn
 import transformers.activations
+import logging
+import sys
+import traceback
+# ==============================
+# Logging configuration
+# ==============================
+LOG_LEVEL = os.getenv("MEDIA_AGENT_LOG_LEVEL", "INFO").upper()
+logging.basicConfig(
+    level=LOG_LEVEL,
+    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
+    handlers=[logging.StreamHandler(stream=sys.stdout)]
+)
+logger = logging.getLogger("MediaAgent")
 # إصلاح مؤقت لمشكلة PytorchGELUTanh المحذوفة
 if not hasattr(transformers.activations, "PytorchGELUTanh"):
         def forward(self, x):
             return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * x**3)))
     transformers.activations.PytorchGELUTanh = PytorchGELUTanh
 # ==============================
 # إعدادات الجهاز والنماذج
 # ==============================
 device = "cpu"
 VL_MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct-AWQ"
+logger.info("Loading processor and VL model (%s)...", VL_MODEL_ID)
 processor = AutoProcessor.from_pretrained(VL_MODEL_ID, trust_remote_code=True)
 vl_model = AutoModelForVision2Seq.from_pretrained(VL_MODEL_ID, trust_remote_code=True).to(device)
+logger.info("VL model loaded.")
 whisper = WhisperModel("base", device=device)
+logger.info("Whisper model loaded.")
 # ==============================
 # الدالة الرئيسية لتحليل الوسائط
     يستقبل إما رابط صورة / صوت / فيديو أو مسار ملف محلي.
     ويُرجع وصف الصورة أو تفريغ النص من الصوت.
     """
+    logger.info("analyze_media called. input (first 300 chars): %s", (input_data or "")[:300])
     try:
         # --- تحديد نوع الإدخال ---
+        url_or_path = (input_data or "").strip()
         if not url_or_path:
+            result = "No input provided."
+            logger.info("result: %s", result)
+            return result
         # --- تحليل الصورة ---
         if url_or_path.endswith((".jpg", ".jpeg", ".png")):
+            logger.info("Detected image input: %s", url_or_path)
             # تحميل الصورة من الإنترنت أو المسار المحلي
             if url_or_path.startswith("http"):
+                logger.info("Downloading image from URL...")
                 response = requests.get(url_or_path, stream=True, timeout=15)
                 response.raise_for_status()
                 image = Image.open(response.raw).convert("RGB")
             inputs = processor(text="Describe the image in detail.", images=image, return_tensors="pt").to(device)
             with torch.no_grad():
                 out = vl_model.generate(**inputs, max_new_tokens=256)
+            result = processor.batch_decode(out, skip_special_tokens=True)[0].strip()
+            logger.info("image analysis result (first 500 chars): %s", result[:500])
+            return result
         # --- تحليل الصوت ---
         elif url_or_path.endswith((".mp3", ".wav", ".m4a", ".flac")):
+            logger.info("Detected audio input: %s", url_or_path)
             # تحميل الملف مؤقتًا إذا كان من رابط
+            temp_path = None
             if url_or_path.startswith("http"):
                 temp_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
+                logger.info("Downloading audio to temporary path: %s", temp_path)
                 data = requests.get(url_or_path, timeout=30).content
                 with open(temp_path, "wb") as f:
                     f.write(data)
                 temp_path = url_or_path
             segments, _ = whisper.transcribe(temp_path)
+            text = " ".join([seg.text for seg in segments]).strip()
+            if url_or_path.startswith("http") and os.path.exists(temp_path):
+                try:
+                    os.remove(temp_path)
+                    logger.debug("Temporary audio file removed: %s", temp_path)
+                except Exception:
+                    logger.warning("Failed to remove temp audio: %s", temp_path)
+            logger.info("audio transcription result (first 500 chars): %s", text[:500])
+            return text
         # --- تحليل الفيديو (وصف الإطار الأول) ---
         elif url_or_path.endswith((".mp4", ".avi", ".mov", ".mkv")):
+            logger.info("Detected video input: %s", url_or_path)
+            temp_video = None
             if url_or_path.startswith("http"):
                 temp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
+                logger.info("Downloading video to temporary path: %s", temp_video)
                 data = requests.get(url_or_path, timeout=30).content
                 with open(temp_video, "wb") as f:
                     f.write(data)
             ret, frame = cap.read()
             cap.release()
             if not ret:
+                result = "Could not read video."
+                logger.error(result + " input: %s", url_or_path)
+                if temp_video and os.path.exists(temp_video):
+                    try: os.remove(temp_video)
+                    except: pass
+                return result
             frame_path = tempfile.NamedTemporaryFile(delete=False, suffix=".jpg").name
             cv2.imwrite(frame_path, frame)
             image = Image.open(frame_path).convert("RGB")
             inputs = processor(text="Describe the video frame.", images=image, return_tensors="pt").to(device)
             with torch.no_grad():
                 out = vl_model.generate(**inputs, max_new_tokens=256)
+            result = processor.batch_decode(out, skip_special_tokens=True)[0].strip()
+            logger.info("video frame analysis result (first 500 chars): %s", result[:500])
+            try:
+                os.remove(frame_path)
+            except Exception:
+                logger.debug("Could not remove frame file: %s", frame_path)
+            if temp_video and os.path.exists(temp_video):
+                try:
+                    os.remove(temp_video)
+                except Exception:
+                    logger.debug("Could not remove temp video: %s", temp_video)
+            return result
         else:
+            result = "Unsupported format. Please provide an image, audio, or video file."
+            logger.warning("Unsupported format for input: %s", url_or_path)
+            return result
     except Exception as e:
+        # سجل الاستثناء مع traceback كامل
+        logger.exception("Exception in analyze_media: %s", e)
+        tb = traceback.format_exc()
+        # أعد رسالة أكثر ودية للواجهة مع تضمين سطر الخطأ الأول (تفصيل كامل في اللوغ)
+        return f"❌ Error: {str(e)} (see server log for traceback)"
 # ==============================
 # واجهة Gradio
 # تشغيل الواجهة فقط (بدون FastAPI)
 # ==============================
 if __name__ == "__main__":
+    logger.info("Launching Gradio app on %s:%s", "0.0.0.0", os.getenv("PORT", 7860))
     iface.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))