Spaces:

Derr11
/

Der11

Paused

App Files Community

Derr11 committed on Nov 30, 2025

Commit

5f4403f

verified ·

1 Parent(s): de94a22

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -21

app.py CHANGED Viewed

@@ -23,7 +23,10 @@ processor = None
 def load_model():
-    """تحميل Qwen3-Omni والمعالج عند أول استدعاء فقط (على ZeroGPU)."""
     global model, processor
     if model is not None and processor is not None:
@@ -31,27 +34,28 @@ def load_model():
     print(f"[ZeroGPU] Loading model from: {MODEL_PATH}")
-    try:
-        local_model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
-            MODEL_PATH,
-            dtype="auto",
-            device_map="auto",
-            attn_implementation="flash_attention_2",
-        )
-        print("[ZeroGPU] Model loaded with flash_attention_2.")
-    except Exception as e:
-        print(f"[ZeroGPU] flash_attention_2 failed, falling back. Error: {e}")
-        local_model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
-            MODEL_PATH,
-            dtype="auto",
-            device_map="auto",
-        )
-        print("[ZeroGPU] Model loaded with default attention.")
     local_processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)
     model = local_model
     processor = local_processor
 def build_messages_from_history(history, system_prompt, user_text, image, audio_path, video_path):
@@ -122,10 +126,14 @@ def qwen3_omni_inference(
     top_p,
     max_tokens,
 ):
-    """تنفيذ الاستدلال الفعلي على ZeroGPU."""
     if not (user_text or image is not None or audio_path or video_path):
-        # لا يوجد مدخل
         return history, None, "", None, None, None
     load_model()
@@ -140,17 +148,20 @@ def qwen3_omni_inference(
         video_path=video_path,
     )
     text_prompt = processor.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=False,
     )
     audios, images, videos = process_mm_info(
         messages,
         use_audio_in_video=USE_AUDIO_IN_VIDEO,
     )
     inputs = processor(
         text=text_prompt,
         audio=audios,
@@ -161,8 +172,13 @@ def qwen3_omni_inference(
         use_audio_in_video=USE_AUDIO_IN_VIDEO,
     )
-    inputs = inputs.to(model.device).to(model.dtype)
     gen_kwargs = dict(
         temperature=float(temperature),
         top_p=float(top_p),
@@ -171,6 +187,7 @@ def qwen3_omni_inference(
         use_audio_in_video=USE_AUDIO_IN_VIDEO,
     )
     if not return_audio:
         gen_kwargs["return_audio"] = False
         text_ids, _ = model.generate(**inputs, **gen_kwargs)
@@ -179,15 +196,19 @@ def qwen3_omni_inference(
         gen_kwargs["speaker"] = speaker
         text_ids, audio_out = model.generate(**inputs, **gen_kwargs)
     generated_text = processor.batch_decode(
-        text_ids.sequences[:, inputs["input_ids"].shape[1]:],
         skip_special_tokens=True,
         clean_up_tokenization_spaces=False,
     )[0]
     user_display = user_text if (user_text and user_text.strip()) else "[Multimodal message]"
     history = history + [[user_display, generated_text]]
     gr_audio = None
     if audio_out is not None:
         audio_np = audio_out.reshape(-1).detach().cpu().numpy()

 def load_model():
+    """
+    تحميل Qwen3-Omni والمعالج عند أول استدعاء فقط (على ZeroGPU).
+    تم إلغاء flash_attention_2 و device_map='auto' لتجنب مشاكل الاستدعاء.
+    """
     global model, processor
     if model is not None and processor is not None:
     print(f"[ZeroGPU] Loading model from: {MODEL_PATH}")
+    # نحدد نوع البيانات والجهاز
+    if torch.cuda.is_available():
+        torch_dtype = torch.bfloat16
+        device = "cuda"
+    else:
+        torch_dtype = torch.float32
+        device = "cpu"
+    # تحميل النموذج بدون flash_attention_2 ولا device_map="auto"
+    local_model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
+        MODEL_PATH,
+        torch_dtype=torch_dtype,
+        attn_implementation="eager",  # الأكثر أماناً في هذه البيئة
+    )
+    local_model.to(device)
     local_processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)
     model = local_model
     processor = local_processor
+    print(f"[ZeroGPU] Model loaded on {device} with dtype {torch_dtype}.")
 def build_messages_from_history(history, system_prompt, user_text, image, audio_path, video_path):
     top_p,
     max_tokens,
 ):
+    """
+    تنفيذ الاستدلال الفعلي على ZeroGPU:
+    - نص + صورة + صوت + فيديو
+    - مخرج نصي دائماً، وصوتي عند الحاجة
+    """
     if not (user_text or image is not None or audio_path or video_path):
+        # لا يوجد مدخل من المستخدم
         return history, None, "", None, None, None
     load_model()
         video_path=video_path,
     )
+    # بناء النص من المحادثة (chat template)
     text_prompt = processor.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=False,
     )
+    # تجهيز الوسائط المتعددة (صوت/صورة/فيديو)
     audios, images, videos = process_mm_info(
         messages,
         use_audio_in_video=USE_AUDIO_IN_VIDEO,
     )
+    # تحويل إلى تينسورات
     inputs = processor(
         text=text_prompt,
         audio=audios,
         use_audio_in_video=USE_AUDIO_IN_VIDEO,
     )
+    # نقل المدخلات إلى نفس الجهاز ونفس dtype للنموذج
+    first_param = next(model.parameters())
+    device = first_param.device
+    dtype = first_param.dtype
+    inputs = inputs.to(device=device, dtype=dtype)
+    # بارامترات التوليد
     gen_kwargs = dict(
         temperature=float(temperature),
         top_p=float(top_p),
         use_audio_in_video=USE_AUDIO_IN_VIDEO,
     )
+    # توليد النص فقط أو نص + صوت
     if not return_audio:
         gen_kwargs["return_audio"] = False
         text_ids, _ = model.generate(**inputs, **gen_kwargs)
         gen_kwargs["speaker"] = speaker
         text_ids, audio_out = model.generate(**inputs, **gen_kwargs)
+    # فك ترميز النص الناتج
+    input_len = inputs["input_ids"].shape[1]
     generated_text = processor.batch_decode(
+        text_ids.sequences[:, input_len:],
         skip_special_tokens=True,
         clean_up_tokenization_spaces=False,
     )[0]
+    # تحديث تاريخ الدردشة
     user_display = user_text if (user_text and user_text.strip()) else "[Multimodal message]"
     history = history + [[user_display, generated_text]]
+    # تجهيز الصوت لمخرج Gradio (إن وجد)
     gr_audio = None
     if audio_out is not None:
         audio_np = audio_out.reshape(-1).detach().cpu().numpy()