import os
import numpy as np
import torch
import gradio as gr
import spaces
import warnings
warnings.filterwarnings("ignore")

from PIL import Image
from transformers import AutoModel, AutoTokenizer

# =========================================================
# Model configuration
# =========================================================
MODEL_PATH = "openbmb/MiniCPM-o-2_6"

# The model supports:
# - Vision (images)
# - Audio
# - TTS (text-to-speech)
# - ASR (speech recognition)
# - Video
# - Voice cloning

model = None
tokenizer = None


def load_model():
    """Load MiniCPM-o-2_6 with all modalities enabled."""
    global model, tokenizer

    if model is not None and tokenizer is not None:
        return

    print("[ZeroGPU] Loading MiniCPM-o-2_6...")

    # Choose device and dtype
    if torch.cuda.is_available():
        device = "cuda"
        torch_dtype = torch.bfloat16
    else:
        device = "cpu"
        torch_dtype = torch.float32

    # Load the model with all capabilities enabled
    model = AutoModel.from_pretrained(
        MODEL_PATH,
        trust_remote_code=True,
        attn_implementation='sdpa',  # 'sdpa' or 'flash_attention_2'
        torch_dtype=torch_dtype,
        init_vision=True,   # enable vision
        init_audio=True,    # enable audio
        init_tts=True       # enable TTS
    )
    model = model.eval().to(device)

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_PATH,
        trust_remote_code=True
    )

    print(f"[ZeroGPU] Model loaded successfully on {device}")


# =========================================================
# Image preprocessing
# =========================================================
def process_image(image_path_or_pil):
    """Prepare an image (file path or PIL.Image) for the model."""
    if isinstance(image_path_or_pil, str):
        image = Image.open(image_path_or_pil).convert('RGB')
    else:
        image = image_path_or_pil.convert('RGB')
    return image


# =========================================================
# Main inference function (with ZeroGPU support)
# =========================================================
@spaces.GPU(duration=120)
def minicpm_o_inference(
    text_input,
    image_input,
    audio_input,
    video_input,
    mode,
    temperature,
    top_p,
    max_new_tokens,
    enable_tts,
    tts_style
):
    """
    Main inference function for MiniCPM-o-2_6.
    Supports text, image, audio, and video inputs.
    """
    global model, tokenizer
    load_model()

    # Build the message list according to the input mode
    messages = []

    if mode == "Text Only":
        if not text_input:
            return "Please provide text input.", None
        messages = [
            {"role": "user", "content": text_input}
        ]

    elif mode == "Image + Text":
        if not image_input:
            return "Please provide an image.", None
        image = process_image(image_input)
        question = text_input if text_input else "What is shown in this image?"
        messages = [
            {
                "role": "user",
                "content": [image, question]
            }
        ]

    elif mode == "Audio + Text":
        if not audio_input:
            return "Please provide audio input.", None
        question = text_input if text_input else "What is the content of this audio?"
        # The model accepts audio directly; depending on the remote-code
        # version, the file path may need to be decoded to a waveform first.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "audio", "audio": audio_input},
                    {"type": "text", "text": question}
                ]
            }
        ]

    elif mode == "Video + Text":
        if not video_input:
            return "Please provide a video.", None
        question = text_input if text_input else "What happens in this video?"
        # Build the video message
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": video_input},
                    {"type": "text", "text": question}
                ]
            }
        ]

    # Generation settings
    generation_config = {
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "do_sample": temperature > 0,
    }

    try:
        with torch.no_grad():
            if mode == "Image + Text" and image_input:
                # Image-specific path: use the model's chat() helper
                image = process_image(image_input)
                question = text_input if text_input else "What is shown in this image?"
                response = model.chat(
                    image=image,
                    msgs=[{"role": "user", "content": question}],
                    tokenizer=tokenizer,
                    **generation_config
                )
            else:
                # Text, audio, and video go through the same chat() API.
                # Note: the audio/video message format built above may need
                # adapting to what the model's remote code expects
                # (e.g. raw arrays instead of file paths).
                response = model.chat(
                    msgs=messages,
                    tokenizer=tokenizer,
                    **generation_config
                )

        # If TTS is enabled, synthesize speech from the text response
        audio_output = None
        if enable_tts and isinstance(response, str):
            try:
                # Built-in TTS; the exact entry point and the "style" argument
                # depend on the model's remote-code version and may need adjusting.
                audio_output = model.generate_speech(
                    text=response,
                    style=tts_style
                )
            except Exception as e:
                print(f"TTS generation failed: {e}")
                audio_output = None

        return response, audio_output

    except Exception as e:
        import traceback
        traceback.print_exc()
        return f"Error: {str(e)}", None


# =========================================================
# Gradio interface
# =========================================================
def create_interface():
    """Build the Gradio interface for MiniCPM-o-2_6."""

    with gr.Blocks(title="MiniCPM-o-2_6 - Multimodal AI") as demo:
        gr.Markdown(
            """
            # 🤖 MiniCPM-o-2_6 - Multimodal AI Assistant

            **Capabilities:**
            - 🖼️ Image understanding (OCR, captioning, analysis)
            - 🎙️ Audio processing (ASR, content understanding)
            - 🎬 Video analysis
            - 🗣️ Text-to-speech (TTS)
            - 🎭 Voice cloning
            - 💬 Real-time conversation

            **Performance:** outperforms GPT-4o and Claude 3.5 on many tasks!
""" ) with gr.Row(): with gr.Column(scale=3): # اختيار نوع المدخل mode = gr.Radio( choices=["Text Only", "Image + Text", "Audio + Text", "Video + Text"], value="Text Only", label="Input Mode", info="اختر نوع المدخل" ) # المدخلات text_input = gr.Textbox( label="Text Input", placeholder="اكتب سؤالك أو النص هنا...", lines=3 ) image_input = gr.Image( label="Image Input", type="pil", visible=False ) audio_input = gr.Audio( label="Audio Input", type="filepath", visible=False ) video_input = gr.Video( label="Video Input", visible=False ) # زر الإرسال submit_btn = gr.Button("🚀 Process", variant="primary") # المخرجات output_text = gr.Textbox( label="Response", lines=5, interactive=False ) output_audio = gr.Audio( label="Generated Speech (TTS)", type="numpy", visible=False ) with gr.Column(scale=1): gr.Markdown("### ⚙️ Settings") temperature = gr.Slider( label="Temperature", minimum=0.0, maximum=1.5, value=0.7, step=0.1 ) top_p = gr.Slider( label="Top-p", minimum=0.1, maximum=1.0, value=0.9, step=0.05 ) max_new_tokens = gr.Slider( label="Max Tokens", minimum=50, maximum=2048, value=512, step=50 ) gr.Markdown("### 🗣️ TTS Settings") enable_tts = gr.Checkbox( label="Enable TTS", value=False, info="تحويل الرد إلى كلام" ) tts_style = gr.Dropdown( choices=["default", "emotional", "calm", "energetic"], value="default", label="TTS Style", visible=False ) # تحديث visibility حسب الوضع def update_inputs(mode_value): return { image_input: gr.update(visible="Image" in mode_value), audio_input: gr.update(visible="Audio" in mode_value), video_input: gr.update(visible="Video" in mode_value), } mode.change( fn=update_inputs, inputs=[mode], outputs=[image_input, audio_input, video_input] ) # تحديث visibility لإعدادات TTS enable_tts.change( fn=lambda x: { tts_style: gr.update(visible=x), output_audio: gr.update(visible=x) }, inputs=[enable_tts], outputs=[tts_style, output_audio] ) # معالجة الإرسال submit_btn.click( fn=minicpm_o_inference, inputs=[ text_input, image_input, audio_input, video_input, mode, temperature, top_p, max_new_tokens, enable_tts, tts_style ], outputs=[output_text, output_audio] ) # أمثلة gr.Examples( examples=[ ["What is artificial intelligence?", None, None, None, "Text Only"], ["Describe this image in detail", "examples/sample.jpg", None, None, "Image + Text"], ["Transcribe this audio", None, "examples/audio.wav", None, "Audio + Text"], ["What happens in this video?", None, None, "examples/video.mp4", "Video + Text"], ], inputs=[text_input, image_input, audio_input, video_input, mode], ) return demo # ========================================================= # تشغيل التطبيق # ========================================================= if __name__ == "__main__": demo = create_interface() demo.launch( ssr_mode=False, show_error=True, share=False )