|
|
import os
import warnings

import numpy as np
import torch
import gradio as gr
import spaces

warnings.filterwarnings("ignore")

from PIL import Image
from transformers import AutoModel, AutoTokenizer

# Model checkpoint on the Hugging Face Hub
MODEL_PATH = "openbmb/MiniCPM-o-2_6"

# Lazily loaded globals, populated on the first request
model = None
tokenizer = None


def load_model():
    """Load MiniCPM-o-2_6 with support for all modalities (vision, audio, TTS)."""
    global model, tokenizer

    if model is not None and tokenizer is not None:
        return

    print("[ZeroGPU] Loading MiniCPM-o-2_6...")

    # Prefer bfloat16 on GPU; fall back to float32 on CPU.
    if torch.cuda.is_available():
        device = "cuda"
        torch_dtype = torch.bfloat16
    else:
        device = "cpu"
        torch_dtype = torch.float32

    model = AutoModel.from_pretrained(
        MODEL_PATH,
        trust_remote_code=True,
        attn_implementation="sdpa",
        torch_dtype=torch_dtype,
        init_vision=True,
        init_audio=True,
        init_tts=True,
    )
    model = model.eval().to(device)

    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_PATH,
        trust_remote_code=True,
    )

    print(f"[ZeroGPU] Model loaded successfully on {device}")
|
|
```
|
|
|
|
|
```python
def process_image(image_path_or_pil):
    """Normalize the image input (file path or PIL image) to an RGB PIL image."""
    if isinstance(image_path_or_pil, str):
        image = Image.open(image_path_or_pil).convert('RGB')
    else:
        image = image_path_or_pil.convert('RGB')
    return image


# ZeroGPU attaches a GPU only for the duration of this decorated call
# (up to 120 s here), which is why the model is loaded lazily inside the handler.
@spaces.GPU(duration=120)
def minicpm_o_inference(
    text_input,
    image_input,
    audio_input,
    video_input,
    mode,
    temperature,
    top_p,
    max_new_tokens,
    enable_tts,
    tts_style,
):
    """
    Main inference function for MiniCPM-o-2_6.
    Supports text, image, audio, and video inputs.
    """
    load_model()
    global model, tokenizer

    messages = []
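    # The branches below build `messages` in the chat format used by
    # MiniCPM-o's `model.chat`: a list of {"role", "content"} dicts, where
    # "content" is either a plain string or a list mixing media and text
    # (e.g. a PIL image followed by the question).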
|
|
|
|
|
|
|
|
    if mode == "Text Only":
        if not text_input:
            return "Please provide text input.", None

        messages = [
            {"role": "user", "content": text_input}
        ]

    elif mode == "Image + Text":
        if not image_input:
            return "Please provide an image.", None

        image = process_image(image_input)
        question = text_input if text_input else "What is shown in this image?"

        messages = [
            {
                "role": "user",
                "content": [image, question],
            }
        ]

    elif mode == "Audio + Text":
        if not audio_input:
            return "Please provide audio input.", None

        question = text_input if text_input else "What is the content of this audio?"

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "audio", "audio": audio_input},
                    {"type": "text", "text": question},
                ],
            }
        ]

    elif mode == "Video + Text":
        if not video_input:
            return "Please provide a video.", None

        question = text_input if text_input else "What happens in this video?"

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": video_input},
                    {"type": "text", "text": question},
                ],
            }
        ]

    generation_config = {
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "do_sample": temperature > 0,
    }
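    # With do_sample=False (temperature == 0) generation is greedy and the
    # temperature/top_p values are effectively ignored by transformers.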
|
|
|
|
|
    try:
        with torch.no_grad():
            # MiniCPM-o exposes a `chat` API via trust_remote_code; the messages
            # built above are passed through unchanged. Caveat: the audio/video
            # entries keep this Space's dict convention and may need to be
            # decoded first (e.g. waveform arrays or sampled frames), depending
            # on the model's remote-code API.
            response = model.chat(
                msgs=messages,
                tokenizer=tokenizer,
                **generation_config,
            )

        # Optional TTS: the exact speech-generation entry point depends on the
        # model's remote code, so failures are caught and reported instead of
        # breaking the text response.
        audio_output = None
        if enable_tts and isinstance(response, str):
            try:
                audio_output = model.generate_speech(
                    text=response,
                    style=tts_style,
                )
            except Exception as e:
                print(f"TTS generation failed: {e}")
                audio_output = None

        return response, audio_output

    except Exception as e:
        import traceback
        traceback.print_exc()
        return f"Error: {str(e)}", None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_interface():
    """Create the Gradio interface for MiniCPM-o-2_6."""

    with gr.Blocks(title="MiniCPM-o-2_6 - Multimodal AI") as demo:
        gr.Markdown(
            """
            # 🤖 MiniCPM-o-2_6 - Multimodal AI Assistant

            **Capabilities:**
            - 🖼️ Image understanding (OCR, captioning, analysis)
            - 🎙️ Audio processing (ASR, content understanding)
            - 🎬 Video analysis
            - 🗣️ Text-to-speech (TTS)
            - 🎭 Voice cloning
            - 💬 Real-time conversation

            **Performance:** outperforms GPT-4o and Claude 3.5 on many tasks!
            """
        )

        with gr.Row():
            with gr.Column(scale=3):
                mode = gr.Radio(
                    choices=["Text Only", "Image + Text", "Audio + Text", "Video + Text"],
                    value="Text Only",
                    label="Input Mode",
                    info="Choose the input type",
                )

                text_input = gr.Textbox(
                    label="Text Input",
                    placeholder="Type your question or text here...",
                    lines=3,
                )

                image_input = gr.Image(
                    label="Image Input",
                    type="pil",
                    visible=False,
                )

                audio_input = gr.Audio(
                    label="Audio Input",
                    type="filepath",
                    visible=False,
                )

                video_input = gr.Video(
                    label="Video Input",
                    visible=False,
                )

                submit_btn = gr.Button("🚀 Process", variant="primary")

                output_text = gr.Textbox(
                    label="Response",
                    lines=5,
                    interactive=False,
                )

                output_audio = gr.Audio(
                    label="Generated Speech (TTS)",
                    type="numpy",
                    visible=False,
                )

            with gr.Column(scale=1):
                gr.Markdown("### ⚙️ Settings")

                temperature = gr.Slider(
                    label="Temperature",
                    minimum=0.0,
                    maximum=1.5,
                    value=0.7,
                    step=0.1,
                )

                top_p = gr.Slider(
                    label="Top-p",
                    minimum=0.1,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                )

                max_new_tokens = gr.Slider(
                    label="Max Tokens",
                    minimum=50,
                    maximum=2048,
                    value=512,
                    step=50,
                )

                gr.Markdown("### 🗣️ TTS Settings")

                enable_tts = gr.Checkbox(
                    label="Enable TTS",
                    value=False,
                    info="Convert the response to speech",
                )

                tts_style = gr.Dropdown(
                    choices=["default", "emotional", "calm", "energetic"],
                    value="default",
                    label="TTS Style",
                    visible=False,
                )

        def update_inputs(mode_value):
            # Show only the media component that matches the selected mode.
            return {
                image_input: gr.update(visible="Image" in mode_value),
                audio_input: gr.update(visible="Audio" in mode_value),
                video_input: gr.update(visible="Video" in mode_value),
            }

        mode.change(
            fn=update_inputs,
            inputs=[mode],
            outputs=[image_input, audio_input, video_input],
        )
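        # Event handlers may return a dict keyed by output components (as
        # update_inputs does above); Gradio applies each gr.update to the
        # matching component listed in `outputs`.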
|
|
|
|
|
|
|
|
        enable_tts.change(
            fn=lambda x: {
                tts_style: gr.update(visible=x),
                output_audio: gr.update(visible=x),
            },
            inputs=[enable_tts],
            outputs=[tts_style, output_audio],
        )

        submit_btn.click(
            fn=minicpm_o_inference,
            inputs=[
                text_input,
                image_input,
                audio_input,
                video_input,
                mode,
                temperature,
                top_p,
                max_new_tokens,
                enable_tts,
                tts_style,
            ],
            outputs=[output_text, output_audio],
        )

        gr.Examples(
            examples=[
                ["What is artificial intelligence?", None, None, None, "Text Only"],
                ["Describe this image in detail", "examples/sample.jpg", None, None, "Image + Text"],
                ["Transcribe this audio", None, "examples/audio.wav", None, "Audio + Text"],
                ["What happens in this video?", None, None, "examples/video.mp4", "Video + Text"],
            ],
            inputs=[text_input, image_input, audio_input, video_input, mode],
        )

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        ssr_mode=False,
        show_error=True,
        share=False,
    )