import os
import numpy as np
import torch
import gradio as gr
import spaces
import warnings
warnings.filterwarnings("ignore")
from PIL import Image
from transformers import AutoModel, AutoTokenizer
# =========================================================
# Model configuration
# =========================================================
MODEL_PATH = "openbmb/MiniCPM-o-2_6"
# The model supports:
# - Vision (images)
# - Audio
# - TTS (text-to-speech)
# - ASR (automatic speech recognition)
# - Video
# - Voice cloning
model = None
tokenizer = None
def load_model():
    """
    Load MiniCPM-o-2_6 with all modalities enabled.
    """
    global model, tokenizer
    if model is not None and tokenizer is not None:
        return
    print("[ZeroGPU] Loading MiniCPM-o-2_6...")
    # Pick the device and dtype
    if torch.cuda.is_available():
        device = "cuda"
        torch_dtype = torch.bfloat16
    else:
        device = "cpu"
        torch_dtype = torch.float32
    # Load the model with all capabilities enabled
    model = AutoModel.from_pretrained(
        MODEL_PATH,
        trust_remote_code=True,
        attn_implementation='sdpa',  # 'sdpa' or 'flash_attention_2'
        torch_dtype=torch_dtype,
        init_vision=True,  # enable vision
        init_audio=True,   # enable audio
        init_tts=True      # enable TTS
    )
    model = model.eval().to(device)
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_PATH,
        trust_remote_code=True
    )
    print(f"[ZeroGPU] Model loaded successfully on {device}")
# =========================================================
# Image preprocessing
# =========================================================
def process_image(image_path_or_pil):
    """Prepare an image for the model (accepts a file path or a PIL image)."""
    if isinstance(image_path_or_pil, str):
        image = Image.open(image_path_or_pil).convert('RGB')
    else:
        image = image_path_or_pil.convert('RGB')
    return image
# =========================================================
# Main inference function (ZeroGPU-aware)
# =========================================================
@spaces.GPU(duration=120)
def minicpm_o_inference(
    text_input,
    image_input,
    audio_input,
    video_input,
    mode,
    temperature,
    top_p,
    max_new_tokens,
    enable_tts,
    tts_style
):
    """
    Main inference entry point for MiniCPM-o-2_6.
    Supports text, image, audio and video inputs.
    """
    load_model()
    global model, tokenizer
    # Build the message list according to the selected input mode
    messages = []
    if mode == "Text Only":
        if not text_input:
            return "Please provide text input.", None
        messages = [
            {"role": "user", "content": text_input}
        ]
    elif mode == "Image + Text":
        if not image_input:
            return "Please provide an image.", None
        image = process_image(image_input)
        # Use a default question if none was given
        question = text_input if text_input else "What is shown in this image?"
        messages = [
            {
                "role": "user",
                "content": [
                    image,
                    question
                ]
            }
        ]
    elif mode == "Audio + Text":
        if not audio_input:
            return "Please provide audio input.", None
        question = text_input if text_input else "What is the content of this audio?"
        # The model accepts audio content directly
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "audio", "audio": audio_input},
                    {"type": "text", "text": question}
                ]
            }
        ]
    elif mode == "Video + Text":
        if not video_input:
            return "Please provide a video.", None
        question = text_input if text_input else "What happens in this video?"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": video_input},
                    {"type": "text", "text": question}
                ]
            }
        ]
    # Generation settings
    generation_config = {
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "do_sample": temperature > 0,
    }
    try:
        # Run generation
        with torch.no_grad():
            if mode == "Image + Text" and image_input:
                # Image inputs go through the model's chat interface
                image = process_image(image_input)
                question = text_input if text_input else "What is shown in this image?"
                response = model.chat(
                    image=image,
                    msgs=[{"role": "user", "content": question}],
                    tokenizer=tokenizer,
                    **generation_config
                )
            else:
                # Text-only input (and the audio/video message lists built
                # above) is routed through the same remote-code chat interface
                response = model.chat(
                    image=None,
                    msgs=messages,
                    tokenizer=tokenizer,
                    **generation_config
                )
        # If TTS is enabled, synthesize speech for the response
        audio_output = None
        if enable_tts and isinstance(response, str):
            try:
                # Use the model's built-in TTS; the exact entry point depends
                # on the remote code, so failures fall back to text only
                audio_output = model.generate_speech(
                    text=response,
                    style=tts_style
                )
            except Exception as e:
                print(f"TTS generation failed: {e}")
                audio_output = None
        return response, audio_output
    except Exception as e:
        import traceback
        traceback.print_exc()
        return f"Error: {str(e)}", None
# =========================================================
# Gradio interface
# =========================================================
def create_interface():
    """Build the Gradio interface for MiniCPM-o-2_6."""
    with gr.Blocks(title="MiniCPM-o-2_6 - Multimodal AI") as demo:
        gr.Markdown(
            """
            # 🤖 MiniCPM-o-2_6 - Multimodal AI Assistant
            **Capabilities:**
            - 🖼️ Image understanding (OCR, captioning, analysis)
            - 🎙️ Audio processing (ASR, content understanding)
            - 🎬 Video analysis
            - 🗣️ Text-to-speech (TTS)
            - 🎭 Voice cloning
            - 💬 Real-time conversation
            **Performance:** outperforms GPT-4o and Claude 3.5 on many tasks!
            """
        )
        with gr.Row():
            with gr.Column(scale=3):
                # Input mode selector
                mode = gr.Radio(
                    choices=["Text Only", "Image + Text", "Audio + Text", "Video + Text"],
                    value="Text Only",
                    label="Input Mode",
                    info="Choose the input type"
                )
                # Inputs
                text_input = gr.Textbox(
                    label="Text Input",
                    placeholder="Type your question or text here...",
                    lines=3
                )
                image_input = gr.Image(
                    label="Image Input",
                    type="pil",
                    visible=False
                )
                audio_input = gr.Audio(
                    label="Audio Input",
                    type="filepath",
                    visible=False
                )
                video_input = gr.Video(
                    label="Video Input",
                    visible=False
                )
                # Submit button
                submit_btn = gr.Button("🚀 Process", variant="primary")
                # Outputs
                output_text = gr.Textbox(
                    label="Response",
                    lines=5,
                    interactive=False
                )
                output_audio = gr.Audio(
                    label="Generated Speech (TTS)",
                    type="numpy",
                    visible=False
                )
            with gr.Column(scale=1):
                gr.Markdown("### ⚙️ Settings")
                temperature = gr.Slider(
                    label="Temperature",
                    minimum=0.0,
                    maximum=1.5,
                    value=0.7,
                    step=0.1
                )
                top_p = gr.Slider(
                    label="Top-p",
                    minimum=0.1,
                    maximum=1.0,
                    value=0.9,
                    step=0.05
                )
                max_new_tokens = gr.Slider(
                    label="Max Tokens",
                    minimum=50,
                    maximum=2048,
                    value=512,
                    step=50
                )
                gr.Markdown("### 🗣️ TTS Settings")
                enable_tts = gr.Checkbox(
                    label="Enable TTS",
                    value=False,
                    info="Convert the response to speech"
                )
                tts_style = gr.Dropdown(
                    choices=["default", "emotional", "calm", "energetic"],
                    value="default",
                    label="TTS Style",
                    visible=False
                )
        # Toggle input visibility based on the selected mode
        def update_inputs(mode_value):
            return {
                image_input: gr.update(visible="Image" in mode_value),
                audio_input: gr.update(visible="Audio" in mode_value),
                video_input: gr.update(visible="Video" in mode_value),
            }
        mode.change(
            fn=update_inputs,
            inputs=[mode],
            outputs=[image_input, audio_input, video_input]
        )
        # Toggle visibility of the TTS settings
        enable_tts.change(
            fn=lambda x: {
                tts_style: gr.update(visible=x),
                output_audio: gr.update(visible=x)
            },
            inputs=[enable_tts],
            outputs=[tts_style, output_audio]
        )
        # Handle submission
        submit_btn.click(
            fn=minicpm_o_inference,
            inputs=[
                text_input,
                image_input,
                audio_input,
                video_input,
                mode,
                temperature,
                top_p,
                max_new_tokens,
                enable_tts,
                tts_style
            ],
            outputs=[output_text, output_audio]
        )
        # Examples
        gr.Examples(
            examples=[
                ["What is artificial intelligence?", None, None, None, "Text Only"],
                ["Describe this image in detail", "examples/sample.jpg", None, None, "Image + Text"],
                ["Transcribe this audio", None, "examples/audio.wav", None, "Audio + Text"],
                ["What happens in this video?", None, None, "examples/video.mp4", "Video + Text"],
            ],
            inputs=[text_input, image_input, audio_input, video_input, mode],
        )
    return demo
# =========================================================
# App entry point
# =========================================================
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        ssr_mode=False,
        show_error=True,
        share=False
    )