Spaces:

Derr11
/

Der11

Paused

App Files Community

Derr11 committed 14 days ago

Commit

68da44b

verified ·

1 Parent(s): c4974ed

Update app.py

Browse files

Files changed (1) hide show

app.py +178 -303

app.py CHANGED Viewed

@@ -1,321 +1,231 @@
 import os
-import numpy as np
 import torch
 import gradio as gr
 import spaces
-import warnings
-warnings.filterwarnings("ignore")
 from PIL import Image
 from transformers import AutoModel, AutoTokenizer
 # =========================================================
 # إعدادات النموذج
 # =========================================================
-MODEL_PATH = "openbmb/MiniCPM-o-2_6"
-# النموذج يدعم:
-# - Vision (الصور)
-# - Audio (الصوت)
-# - TTS (تحويل النص إلى كلام)
-# - ASR (التعرف على الكلام)
-# - Video (الفيديو)
-# - Voice Cloning (استنساخ الصوت)
 model = None
 tokenizer = None
 def load_model():
-    """
-    تحميل MiniCPM-o-2_6 مع دعم جميع الوسائط
-    """
     global model, tokenizer
-    if model is not None and tokenizer is not None:
         return
-    print(f"[ZeroGPU] Loading MiniCPM-o-2_6...")
-    # اختيار الجهاز ونوع البيانات
-    if torch.cuda.is_available():
-        device = "cuda"
-        torch_dtype = torch.bfloat16
-    else:
-        device = "cpu"
-        torch_dtype = torch.float32
-    # تحميل النموذج مع جميع القدرات
-    model = AutoModel.from_pretrained(
-        MODEL_PATH,
-        trust_remote_code=True,
-        attn_implementation='sdpa',  # sdpa أو flash_attention_2
-        torch_dtype=torch_dtype,
-        init_vision=True,  # تفعيل الرؤية
-        init_audio=True,   # تفعيل الصوت
-        init_tts=True      # تفعيل TTS
-    )
-    model = model.eval().to(device)
-    # تحميل tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(
-        MODEL_PATH,
-        trust_remote_code=True
-    )
-    print(f"[ZeroGPU] Model loaded successfully on {device}")
-```<!--citation:1-->
-```python
-# =========================================================
-# دالة معالجة الصور
-# =========================================================
-def process_image(image_path_or_pil):
-    """معالجة الصورة للنموذج"""
-    if isinstance(image_path_or_pil, str):
-        image = Image.open(image_path_or_pil).convert('RGB')
-    else:
-        image = image_path_or_pil.convert('RGB')
-    return image
 # =========================================================
-# دالة الاستدلال الرئيسية (مع دعم ZeroGPU)
 # =========================================================
-@spaces.GPU(duration=120)
-def minicpm_o_inference(
     text_input,
     image_input,
-    audio_input,
-    video_input,
-    mode,
     temperature,
     top_p,
-    max_new_tokens,
-    enable_tts,
-    tts_style
 ):
     """
-    دالة الاستدلال الرئيسية لـ MiniCPM-o-2_6
-    تدعم: نص، صورة، صوت، فيديو
     """
-    load_model()
-    global model, tokenizer
-    # بناء الرسائل حسب نوع المدخل
-    messages = []
-    # إضافة المحتوى حسب نوع المدخل
-    if mode == "Text Only":
-        if not text_input:
-            return "Please provide text input.", None
-        messages = [
-            {"role": "user", "content": text_input}
-        ]
-    elif mode == "Image + Text":
-        if not image_input:
-            return "Please provide an image.", None
-        image = process_image(image_input)
-        # صياغة السؤال
-        question = text_input if text_input else "What is shown in this image?"
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    Image.open(image_input) if isinstance(image_input, str) else image_input,
-                    question
-                ]
-            }
-        ]
-    elif mode == "Audio + Text":
-        if not audio_input:
-            return "Please provide audio input.", None
-        # معالجة الصوت
-        question = text_input if text_input else "What is the content of this audio?"
-        # النموذج يدعم الصوت مباشرة
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "audio", "audio": audio_input},
-                    {"type": "text", "text": question}
-                ]
-            }
-        ]
-    elif mode == "Video + Text":
-        if not video_input:
-            return "Please provide a video.", None
-        question = text_input if text_input else "What happens in this video?"
-        # معالجة الفيديو
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "video", "video": video_input},
-                    {"type": "text", "text": question}
-                ]
-            }
-        ]
-    # إعدادات التوليد
-    generation_config = {
-        "max_new_tokens": max_new_tokens,
-        "temperature": temperature,
-        "top_p": top_p,
-        "do_sample": temperature > 0,
-    }
-    try:
-        # التوليد
-        with torch.no_grad():
-            if mode == "Image + Text" and image_input:
-                # معالجة خاصة للصور
-                image = process_image(image_input)
-                question = text_input if text_input else "What is shown in this image?"
-                # استخدام chat للصور
-                response = model.chat(
-                    image=image,
-                    msgs=[{"role": "user", "content": question}],
-                    tokenizer=tokenizer,
-                    **generation_config
-                )
-            else:
-                # للنص والأنواع الأخرى
-                inputs = tokenizer(messages, return_tensors="pt")
-                inputs = inputs.to(model.device)
                 outputs = model.generate(
                     **inputs,
-                    **generation_config
                 )
-                response = tokenizer.decode(
-                    outputs[0][inputs['input_ids'].shape[1]:],
-                    skip_special_tokens=True
-                )
-        # إذا كان TTS مفعل، نولد صوت
-        audio_output = None
-        if enable_tts and isinstance(response, str):
-            try:
-                # استخدام TTS المدمج في النموذج
-                audio_output = model.generate_speech(
-                    text=response,
-                    style=tts_style
-                )
-            except Exception as e:
-                print(f"TTS generation failed: {e}")
-                audio_output = None
-        return response, audio_output
     except Exception as e:
         import traceback
         traceback.print_exc()
-        return f"Error: {str(e)}", None
 # =========================================================
 # واجهة Gradio
 # =========================================================
-def create_interface():
-    """إنشاء واجهة Gradio لـ MiniCPM-o-2_6"""
-    with gr.Blocks(title="MiniCPM-o-2_6 - Multimodal AI") as demo:
         gr.Markdown(
             """
-            # 🤖 MiniCPM-o-2_6 - Multimodal AI Assistant
-            **القدرات:**
-            - 🖼️ فهم الصور (OCR، وصف، تحليل)
-            - 🎙️ معالجة الصوت (ASR، فهم المحتوى)
-            - 🎬 تحليل الفيديو
-            - 🗣️ تحويل النص إلى كلام (TTS)
-            - 🎭 استنساخ الصوت
-            - 💬 محادثة في الوقت الفعلي
-            **الأداء:** يتفوق على GPT-4o و Claude 3.5 في العديد من المهام!
             """
         )
         with gr.Row():
-            with gr.Column(scale=3):
-                # اختيار نوع المدخل
-                mode = gr.Radio(
-                    choices=["Text Only", "Image + Text", "Audio + Text", "Video + Text"],
-                    value="Text Only",
-                    label="Input Mode",
-                    info="اختر نوع المدخل"
-                )
-                # المدخلات
                 text_input = gr.Textbox(
                     label="Text Input",
-                    placeholder="اكتب سؤالك أو النص هنا...",
                     lines=3
                 )
                 image_input = gr.Image(
-                    label="Image Input",
-                    type="pil",
-                    visible=False
                 )
-                audio_input = gr.Audio(
-                    label="Audio Input",
-                    type="filepath",
-                    visible=False
-                )
-                video_input = gr.Video(
-                    label="Video Input",
-                    visible=False
-                )
-                # زر الإرسال
-                submit_btn = gr.Button("🚀 Process", variant="primary")
-                # المخرجات
-                output_text = gr.Textbox(
                     label="Response",
-                    lines=5,
                     interactive=False
                 )
-                output_audio = gr.Audio(
-                    label="Generated Speech (TTS)",
-                    type="numpy",
-                    visible=False
-                )
             with gr.Column(scale=1):
                 gr.Markdown("### ⚙️ Settings")
                 temperature = gr.Slider(
                     label="Temperature",
-                    minimum=0.0,
-                    maximum=1.5,
                     value=0.7,
-                    step=0.1
                 )
                 top_p = gr.Slider(
@@ -323,96 +233,61 @@ def create_interface():
                     minimum=0.1,
                     maximum=1.0,
                     value=0.9,
-                    step=0.05
                 )
                 max_new_tokens = gr.Slider(
                     label="Max Tokens",
                     minimum=50,
-                    maximum=2048,
                     value=512,
-                    step=50
                 )
-                gr.Markdown("### 🗣️ TTS Settings")
-                enable_tts = gr.Checkbox(
-                    label="Enable TTS",
-                    value=False,
-                    info="تحويل الرد إلى كلام"
                 )
-                tts_style = gr.Dropdown(
-                    choices=["default", "emotional", "calm", "energetic"],
-                    value="default",
-                    label="TTS Style",
-                    visible=False
-                )
-        # تحديث visibility حسب الوضع
-        def update_inputs(mode_value):
-            return {
-                image_input: gr.update(visible="Image" in mode_value),
-                audio_input: gr.update(visible="Audio" in mode_value),
-                video_input: gr.update(visible="Video" in mode_value),
-            }
-        mode.change(
-            fn=update_inputs,
-            inputs=[mode],
-            outputs=[image_input, audio_input, video_input]
-        )
-        # تحديث visibility لإعدادات TTS
-        enable_tts.change(
-            fn=lambda x: {
-                tts_style: gr.update(visible=x),
-                output_audio: gr.update(visible=x)
-            },
-            inputs=[enable_tts],
-            outputs=[tts_style, output_audio]
         )
-        # معالجة الإرسال
-        submit_btn.click(
-            fn=minicpm_o_inference,
-            inputs=[
-                text_input,
-                image_input,
-                audio_input,
-                video_input,
-                mode,
-                temperature,
-                top_p,
-                max_new_tokens,
-                enable_tts,
-                tts_style
-            ],
-            outputs=[output_text, output_audio]
         )
-        # أمثلة
         gr.Examples(
             examples=[
-                ["What is artificial intelligence?", None, None, None, "Text Only"],
-                ["Describe this image in detail", "examples/sample.jpg", None, None, "Image + Text"],
-                ["Transcribe this audio", None, "examples/audio.wav", None, "Audio + Text"],
-                ["What happens in this video?", None, None, "examples/video.mp4", "Video + Text"],
             ],
-            inputs=[text_input, image_input, audio_input, video_input, mode],
         )
     return demo
-# =========================================================
-# تشغيل التطبيق
-# =========================================================
 if __name__ == "__main__":
-    demo = create_interface()
     demo.launch(
         ssr_mode=False,
-        show_error=True,
-        share=False
     )

 import os
 import torch
 import gradio as gr
 import spaces
 from PIL import Image
 from transformers import AutoModel, AutoTokenizer
+import warnings
+warnings.filterwarnings("ignore")
 # =========================================================
 # Model settings
 # =========================================================
 # Hugging Face Hub repository id handed to from_pretrained().
 MODEL_ID = "openbmb/MiniCPM-o-2_6"
 # Lazy loading: both globals stay None until load_model() fills them in.
 model = tokenizer = None
 def load_model():
     """Load the model and tokenizer once and cache them in module globals.

     The call is a no-op once BOTH ``model`` and ``tokenizer`` are populated.
     Loading is attempted twice:

     1. ``AutoModel`` / ``AutoTokenizer`` with ``trust_remote_code=True``.
     2. On any failure, a best-effort fallback through
        ``AutoModelForCausalLM`` without remote code.

     The globals are only assigned after an attempt has produced both
     objects, so a half-finished load can never leave a model cached next
     to a missing tokenizer.

     Raises:
         RuntimeError: if both attempts fail. The message carries both
             errors and the exception is chained from the fallback error.
     """
     global model, tokenizer
     if model is not None and tokenizer is not None:
         return
     print(f"Loading {MODEL_ID}...")
     use_cuda = torch.cuda.is_available()
     # float16 instead of bfloat16 for ZeroGPU compatibility; float32 on CPU.
     dtype = torch.float16 if use_cuda else torch.float32
     try:
         loaded_tokenizer = AutoTokenizer.from_pretrained(
             MODEL_ID,
             trust_remote_code=True,
             use_fast=False,
         )
         loaded_model = AutoModel.from_pretrained(
             MODEL_ID,
             trust_remote_code=True,
             torch_dtype=dtype,
             low_cpu_mem_usage=True,
             attn_implementation="eager",  # eager instead of flash attention
         ).eval()
         if use_cuda:
             loaded_model = loaded_model.cuda()
     except Exception as primary_error:
         print(f"Error loading model: {primary_error}")
         # Alternative attempt without trust_remote_code.
         try:
             from transformers import AutoModelForCausalLM
             loaded_model = AutoModelForCausalLM.from_pretrained(
                 MODEL_ID,
                 torch_dtype=dtype,
                 low_cpu_mem_usage=True,
             ).eval()
             loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
             if use_cuda:
                 loaded_model = loaded_model.cuda()
         except Exception as fallback_error:
             # Surface the primary failure too; it is usually the real cause.
             raise RuntimeError(
                 f"Failed to load model: {fallback_error} "
                 f"(primary attempt failed with: {primary_error})"
             ) from fallback_error
     # Publish both objects together, only after a complete load.
     model, tokenizer = loaded_model, loaded_tokenizer
     print("Model loaded successfully!")
 # =========================================================
+# دالة الاستدلال مع ZeroGPU
 # =========================================================
+@spaces.GPU(duration=60)
+def generate_response(
     text_input,
     image_input,
     temperature,
     top_p,
+    max_new_tokens
 ):
     """
+    معالجة النص والصور باستخدام MiniCPM-o-2_6
     """
+    if not text_input and not image_input:
+        return "Please provide text or image input."
+    try:
+        load_model()
+        global model, tokenizer
+        # إعداد الرسائل
+        if image_input is not None:
+            # معالجة الصورة + النص
+            if not text_input:
+                text_input = "What is shown in this image? Please describe in detail."
+            # تحضير المدخل للنموذج
+            msgs = [{"role": "user", "content": [image_input, text_input]}]
+            # استخدام طريقة chat الخاصة بالنموذج
+            with torch.no_grad():
+                if hasattr(model, 'chat'):
+                    response = model.chat(
+                        image=image_input,
+                        msgs=msgs,
+                        tokenizer=tokenizer,
+                        sampling=True,
+                        temperature=temperature,
+                        top_p=top_p,
+                        max_new_tokens=max_new_tokens
+                    )
+                else:
+                    # fallback للنماذج التي لا تدعم chat
+                    inputs = tokenizer(text_input, return_tensors="pt")
+                    if torch.cuda.is_available():
+                        inputs = inputs.to("cuda")
+                    outputs = model.generate(
+                        **inputs,
+                        max_new_tokens=max_new_tokens,
+                        temperature=temperature,
+                        top_p=top_p,
+                        do_sample=True
+                    )
+                    response = tokenizer.decode(
+                        outputs[0][inputs['input_ids'].shape[1]:],
+                        skip_special_tokens=True
+                    )
+        else:
+            # نص فقط
+            inputs = tokenizer(
+                text_input,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=2048
+            )
+            if torch.cuda.is_available():
+                inputs = inputs.to("cuda")
+            with torch.no_grad():
                 outputs = model.generate(
                     **inputs,
+                    max_new_tokens=max_new_tokens,
+                    temperature=temperature,
+                    top_p=top_p,
+                    do_sample=True,
+                    pad_token_id=tokenizer.pad_token_id,
+                    eos_token_id=tokenizer.eos_token_id
                 )
+            response = tokenizer.decode(
+                outputs[0][inputs['input_ids'].shape[1]:],
+                skip_special_tokens=True
+            )
+        return response
     except Exception as e:
         import traceback
         traceback.print_exc()
+        return f"Error: {str(e)}"
 # =========================================================
 # واجهة Gradio
 # =========================================================
+def create_demo():
+    """Build and return the Gradio Blocks UI: prompt and image inputs, sampling sliders, and a response box."""
+    with gr.Blocks(title="MiniCPM-o-2.6") as demo:
         # Header banner shown at the top of the page.
         gr.Markdown(
             """
+            # 🤖 MiniCPM-o-2.6 - Multimodal AI
+            **Capabilities:**
+            - 🖼️ Image Understanding (OCR, description, analysis)
+            - 💬 Text Generation
+            - 🧠 8B parameters with GPT-4 level performance
+            Enter your text or upload an image to start!
             """
         )
         # Two columns: inputs and response (scale=2), generation settings (scale=1).
         with gr.Row():
+            with gr.Column(scale=2):
                 text_input = gr.Textbox(
                     label="Text Input",
+                    placeholder="Enter your question or prompt...",
                     lines=3
                 )
                 # type="pil": the handler receives a PIL image (or None when empty).
                 image_input = gr.Image(
+                    label="Image Input (Optional)",
+                    type="pil"
                 )
+                with gr.Row():
+                    submit_btn = gr.Button("🚀 Generate", variant="primary")
+                    clear_btn = gr.Button("🗑️ Clear")
                 # Read-only box that displays the model's reply.
+                output = gr.Textbox(
                     label="Response",
+                    lines=8,
                     interactive=False
                 )
             with gr.Column(scale=1):
                 gr.Markdown("### ⚙️ Settings")
                 temperature = gr.Slider(
                     label="Temperature",
+                    minimum=0.1,
+                    maximum=1.0,
                     value=0.7,
+                    step=0.1,
+                    info="Higher = more creative"
                 )
                 # NOTE(review): a diff hunk boundary falls inside the next call, so one
                 # line (presumably the slider's label=) is not visible here - confirm
                 # against the full app.py.
                 top_p = gr.Slider(
                     minimum=0.1,
                     maximum=1.0,
                     value=0.9,
+                    step=0.05,
+                    info="Nucleus sampling"
                 )
                 max_new_tokens = gr.Slider(
                     label="Max Tokens",
                     minimum=50,
+                    maximum=1024,
                     value=512,
+                    step=50,
+                    info="Maximum response length"
                 )
+                gr.Markdown(
+                    """
+                    ### 📝 Tips:
+                    - For images: Upload and ask questions
+                    - Supports OCR and image analysis
+                    - Can handle multiple languages
+                    """
                 )
+        # Event handlers
         # Generate: passes the prompt, image and the three slider values to
         # generate_response; registered under the API name "generate".
+        submit_btn.click(
+            fn=generate_response,
+            inputs=[text_input, image_input, temperature, top_p, max_new_tokens],
+            outputs=output,
+            api_name="generate"
         )
         # Clear: resets the prompt, the image and the response box.
+        clear_btn.click(
+            fn=lambda: (None, None, ""),
+            inputs=[],
+            outputs=[text_input, image_input, output]
         )
+        # Examples
         # Text-only examples (image slot is None). The example fn pins
         # temperature=0.7, top_p=0.9, max_new_tokens=512, the same values as the
         # slider defaults above.
         gr.Examples(
             examples=[
+                ["What is artificial intelligence?", None],
+                ["Explain quantum computing in simple terms", None],
+                ["Write a poem about nature", None],
             ],
+            inputs=[text_input, image_input],
+            outputs=output,
+            fn=lambda t, i: generate_response(t, i, 0.7, 0.9, 512),
+            cache_examples=False
         )
     return demo
 if __name__ == "__main__":
     # Script entry point: build the UI, then start the Gradio server with
     # server-side rendering disabled and errors surfaced in the browser.
     launch_options = {
         "ssr_mode": False,
         "show_error": True,
     }
     demo = create_demo()
     demo.launch(**launch_options)