Derr11 committed on
Commit
68da44b
·
verified ·
1 Parent(s): c4974ed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +178 -303
app.py CHANGED
@@ -1,321 +1,231 @@
1
  import os
2
- import numpy as np
3
  import torch
4
  import gradio as gr
5
  import spaces
6
- import warnings
7
- warnings.filterwarnings("ignore")
8
-
9
  from PIL import Image
10
  from transformers import AutoModel, AutoTokenizer
 
 
11
 
12
  # =========================================================
13
  # إعدادات النموذج
14
  # =========================================================
15
 
16
- MODEL_PATH = "openbmb/MiniCPM-o-2_6"
17
-
18
- # النموذج يدعم:
19
- # - Vision (الصور)
20
- # - Audio (الصوت)
21
- # - TTS (تحويل النص إلى كلام)
22
- # - ASR (التعرف على الكلام)
23
- # - Video (الفيديو)
24
- # - Voice Cloning (استنساخ الصوت)
25
 
 
26
  model = None
27
  tokenizer = None
28
 
29
 
30
  def load_model():
31
- """
32
- تحميل MiniCPM-o-2_6 مع دعم جميع الوسائط
33
- """
34
  global model, tokenizer
35
 
36
- if model is not None and tokenizer is not None:
37
  return
38
 
39
- print(f"[ZeroGPU] Loading MiniCPM-o-2_6...")
40
-
41
- # اختيار الجهاز ونوع البيانات
42
- if torch.cuda.is_available():
43
- device = "cuda"
44
- torch_dtype = torch.bfloat16
45
- else:
46
- device = "cpu"
47
- torch_dtype = torch.float32
48
-
49
- # تحميل النموذج مع جميع القدرات
50
- model = AutoModel.from_pretrained(
51
- MODEL_PATH,
52
- trust_remote_code=True,
53
- attn_implementation='sdpa', # sdpa أو flash_attention_2
54
- torch_dtype=torch_dtype,
55
- init_vision=True, # تفعيل الرؤية
56
- init_audio=True, # تفعيل الصوت
57
- init_tts=True # تفعيل TTS
58
- )
59
 
60
- model = model.eval().to(device)
 
 
61
 
62
- # تحميل tokenizer
63
- tokenizer = AutoTokenizer.from_pretrained(
64
- MODEL_PATH,
65
- trust_remote_code=True
66
- )
67
-
68
- print(f"[ZeroGPU] Model loaded successfully on {device}")
69
- ```<!--citation:1-->
70
-
71
- ```python
72
- # =========================================================
73
- # دالة معالجة الصور
74
- # =========================================================
75
-
76
- def process_image(image_path_or_pil):
77
- """معالجة الصورة للنموذج"""
78
- if isinstance(image_path_or_pil, str):
79
- image = Image.open(image_path_or_pil).convert('RGB')
80
- else:
81
- image = image_path_or_pil.convert('RGB')
82
- return image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
 
85
  # =========================================================
86
- # دالة الاستدلال الرئيسية (مع دعم ZeroGPU)
87
  # =========================================================
88
 
89
- @spaces.GPU(duration=120)
90
- def minicpm_o_inference(
91
  text_input,
92
  image_input,
93
- audio_input,
94
- video_input,
95
- mode,
96
  temperature,
97
  top_p,
98
- max_new_tokens,
99
- enable_tts,
100
- tts_style
101
  ):
102
  """
103
- دالة الاستدلال الرئيسية لـ MiniCPM-o-2_6
104
- تدعم: نص، صورة، صوت، فيديو
105
  """
106
 
107
- load_model()
108
- global model, tokenizer
109
-
110
- # بناء الرسائل حسب نوع المدخل
111
- messages = []
112
 
113
- # إضافة المحتوى حسب نوع المدخل
114
- if mode == "Text Only":
115
- if not text_input:
116
- return "Please provide text input.", None
117
-
118
- messages = [
119
- {"role": "user", "content": text_input}
120
- ]
121
 
122
- elif mode == "Image + Text":
123
- if not image_input:
124
- return "Please provide an image.", None
 
 
125
 
126
- image = process_image(image_input)
127
-
128
- # صياغة السؤال
129
- question = text_input if text_input else "What is shown in this image?"
130
-
131
- messages = [
132
- {
133
- "role": "user",
134
- "content": [
135
- Image.open(image_input) if isinstance(image_input, str) else image_input,
136
- question
137
- ]
138
- }
139
- ]
140
-
141
- elif mode == "Audio + Text":
142
- if not audio_input:
143
- return "Please provide audio input.", None
144
 
145
- # معالجة الصوت
146
- question = text_input if text_input else "What is the content of this audio?"
147
-
148
- # النموذج يدعم الصوت مباشرة
149
- messages = [
150
- {
151
- "role": "user",
152
- "content": [
153
- {"type": "audio", "audio": audio_input},
154
- {"type": "text", "text": question}
155
- ]
156
- }
157
- ]
158
-
159
- elif mode == "Video + Text":
160
- if not video_input:
161
- return "Please provide a video.", None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
- question = text_input if text_input else "What happens in this video?"
164
-
165
- # معالجة الفيديو
166
- messages = [
167
- {
168
- "role": "user",
169
- "content": [
170
- {"type": "video", "video": video_input},
171
- {"type": "text", "text": question}
172
- ]
173
- }
174
- ]
175
-
176
- # إعدادات التوليد
177
- generation_config = {
178
- "max_new_tokens": max_new_tokens,
179
- "temperature": temperature,
180
- "top_p": top_p,
181
- "do_sample": temperature > 0,
182
- }
183
-
184
- try:
185
- # التوليد
186
- with torch.no_grad():
187
- if mode == "Image + Text" and image_input:
188
- # معالجة خاصة للصور
189
- image = process_image(image_input)
190
- question = text_input if text_input else "What is shown in this image?"
191
-
192
- # استخدام chat للصور
193
- response = model.chat(
194
- image=image,
195
- msgs=[{"role": "user", "content": question}],
196
- tokenizer=tokenizer,
197
- **generation_config
198
- )
199
-
200
- else:
201
- # للنص والأنواع الأخرى
202
- inputs = tokenizer(messages, return_tensors="pt")
203
- inputs = inputs.to(model.device)
204
-
205
  outputs = model.generate(
206
  **inputs,
207
- **generation_config
 
 
 
 
 
208
  )
209
-
210
- response = tokenizer.decode(
211
- outputs[0][inputs['input_ids'].shape[1]:],
212
- skip_special_tokens=True
213
- )
214
-
215
- # إذا كان TTS مفعل، نولد صوت
216
- audio_output = None
217
- if enable_tts and isinstance(response, str):
218
- try:
219
- # استخدام TTS المدمج في النموذج
220
- audio_output = model.generate_speech(
221
- text=response,
222
- style=tts_style
223
- )
224
- except Exception as e:
225
- print(f"TTS generation failed: {e}")
226
- audio_output = None
227
 
228
- return response, audio_output
229
 
230
  except Exception as e:
231
  import traceback
232
  traceback.print_exc()
233
- return f"Error: {str(e)}", None
234
 
235
 
236
  # =========================================================
237
  # واجهة Gradio
238
  # =========================================================
239
 
240
- def create_interface():
241
- """إنشاء واجهة Gradio لـ MiniCPM-o-2_6"""
242
 
243
- with gr.Blocks(title="MiniCPM-o-2_6 - Multimodal AI") as demo:
244
  gr.Markdown(
245
  """
246
- # 🤖 MiniCPM-o-2_6 - Multimodal AI Assistant
247
 
248
- **القدرات:**
249
- - 🖼️ فهم الصور (OCR، وصف، تحليل)
250
- - 🎙️ معالجة الصوت (ASR، فهم المحتوى)
251
- - 🎬 تحليل الفيديو
252
- - 🗣️ تحويل النص إلى كلام (TTS)
253
- - 🎭 استنساخ الصوت
254
- - 💬 محادثة في الوقت الفعلي
255
 
256
- **الأداء:** يتفوق على GPT-4o و Claude 3.5 في العديد من المهام!
257
  """
258
  )
259
 
260
  with gr.Row():
261
- with gr.Column(scale=3):
262
- # اختيار نوع المدخل
263
- mode = gr.Radio(
264
- choices=["Text Only", "Image + Text", "Audio + Text", "Video + Text"],
265
- value="Text Only",
266
- label="Input Mode",
267
- info="اختر نوع المدخل"
268
- )
269
-
270
- # المدخلات
271
  text_input = gr.Textbox(
272
  label="Text Input",
273
- placeholder="اكتب سؤالك أو النص هنا...",
274
  lines=3
275
  )
276
 
277
  image_input = gr.Image(
278
- label="Image Input",
279
- type="pil",
280
- visible=False
281
  )
282
 
283
- audio_input = gr.Audio(
284
- label="Audio Input",
285
- type="filepath",
286
- visible=False
287
- )
288
 
289
- video_input = gr.Video(
290
- label="Video Input",
291
- visible=False
292
- )
293
-
294
- # زر الإرسال
295
- submit_btn = gr.Button("🚀 Process", variant="primary")
296
-
297
- # المخرجات
298
- output_text = gr.Textbox(
299
  label="Response",
300
- lines=5,
301
  interactive=False
302
  )
303
-
304
- output_audio = gr.Audio(
305
- label="Generated Speech (TTS)",
306
- type="numpy",
307
- visible=False
308
- )
309
 
310
  with gr.Column(scale=1):
311
  gr.Markdown("### ⚙️ Settings")
312
 
313
  temperature = gr.Slider(
314
  label="Temperature",
315
- minimum=0.0,
316
- maximum=1.5,
317
  value=0.7,
318
- step=0.1
 
319
  )
320
 
321
  top_p = gr.Slider(
@@ -323,96 +233,61 @@ def create_interface():
323
  minimum=0.1,
324
  maximum=1.0,
325
  value=0.9,
326
- step=0.05
 
327
  )
328
 
329
  max_new_tokens = gr.Slider(
330
  label="Max Tokens",
331
  minimum=50,
332
- maximum=2048,
333
  value=512,
334
- step=50
 
335
  )
336
 
337
- gr.Markdown("### 🗣️ TTS Settings")
338
-
339
- enable_tts = gr.Checkbox(
340
- label="Enable TTS",
341
- value=False,
342
- info="تحويل الرد إلى كلام"
 
343
  )
344
-
345
- tts_style = gr.Dropdown(
346
- choices=["default", "emotional", "calm", "energetic"],
347
- value="default",
348
- label="TTS Style",
349
- visible=False
350
- )
351
-
352
- # تحديث visibility حسب الوضع
353
- def update_inputs(mode_value):
354
- return {
355
- image_input: gr.update(visible="Image" in mode_value),
356
- audio_input: gr.update(visible="Audio" in mode_value),
357
- video_input: gr.update(visible="Video" in mode_value),
358
- }
359
-
360
- mode.change(
361
- fn=update_inputs,
362
- inputs=[mode],
363
- outputs=[image_input, audio_input, video_input]
364
- )
365
 
366
- # تحديث visibility لإعدادات TTS
367
- enable_tts.change(
368
- fn=lambda x: {
369
- tts_style: gr.update(visible=x),
370
- output_audio: gr.update(visible=x)
371
- },
372
- inputs=[enable_tts],
373
- outputs=[tts_style, output_audio]
374
  )
375
 
376
- # معالجة الإرسال
377
- submit_btn.click(
378
- fn=minicpm_o_inference,
379
- inputs=[
380
- text_input,
381
- image_input,
382
- audio_input,
383
- video_input,
384
- mode,
385
- temperature,
386
- top_p,
387
- max_new_tokens,
388
- enable_tts,
389
- tts_style
390
- ],
391
- outputs=[output_text, output_audio]
392
  )
393
 
394
- # أمثلة
395
  gr.Examples(
396
  examples=[
397
- ["What is artificial intelligence?", None, None, None, "Text Only"],
398
- ["Describe this image in detail", "examples/sample.jpg", None, None, "Image + Text"],
399
- ["Transcribe this audio", None, "examples/audio.wav", None, "Audio + Text"],
400
- ["What happens in this video?", None, None, "examples/video.mp4", "Video + Text"],
401
  ],
402
- inputs=[text_input, image_input, audio_input, video_input, mode],
 
 
 
403
  )
404
-
405
  return demo
406
 
407
 
408
- # =========================================================
409
- # تشغيل التطبيق
410
- # =========================================================
411
-
412
  if __name__ == "__main__":
413
- demo = create_interface()
414
  demo.launch(
415
  ssr_mode=False,
416
- show_error=True,
417
- share=False
418
  )
 
1
  import os
 
2
  import torch
3
  import gradio as gr
4
  import spaces
 
 
 
5
  from PIL import Image
6
  from transformers import AutoModel, AutoTokenizer
7
+ import warnings
8
+ warnings.filterwarnings("ignore")
9
 
10
  # =========================================================
11
  # إعدادات النموذج
12
  # =========================================================
13
 
14
+ MODEL_ID = "openbmb/MiniCPM-o-2_6"
 
 
 
 
 
 
 
 
15
 
16
+ # تحميل كسول للنموذج
17
  model = None
18
  tokenizer = None
19
 
20
 
def load_model():
    """Lazily load the model and tokenizer into the module-level globals.

    The first call loads ``MODEL_ID`` with ``trust_remote_code=True``; if that
    fails, a second attempt is made through ``AutoModelForCausalLM`` without
    remote code. Later calls return immediately once both globals are set.

    The globals are only assigned after a complete, successful load, so a
    failed attempt never leaves a half-initialised ``model``/``tokenizer``
    pair behind for the next call to pick up.

    Raises:
        RuntimeError: if both the primary and the fallback attempt fail. The
            message carries both errors and the exception chain is preserved.
    """
    global model, tokenizer

    if model is not None and tokenizer is not None:
        return

    print(f"Loading {MODEL_ID}...")

    use_cuda = torch.cuda.is_available()
    # float16 instead of bfloat16 for compatibility with ZeroGPU.
    dtype = torch.float16 if use_cuda else torch.float32

    try:
        loaded_tokenizer = AutoTokenizer.from_pretrained(
            MODEL_ID,
            trust_remote_code=True,
            use_fast=False,
        )

        # Conservative settings for ZeroGPU: eager attention instead of
        # flash_attention.
        loaded_model = AutoModel.from_pretrained(
            MODEL_ID,
            trust_remote_code=True,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
            attn_implementation="eager",
        ).eval()

        if use_cuda:
            loaded_model = loaded_model.cuda()
    except Exception as primary_error:
        print(f"Error loading model: {primary_error}")
        # Alternative attempt without trust_remote_code.
        # NOTE(review): presumably this cannot succeed for checkpoints that
        # ship custom modelling code -- confirm whether it is still wanted.
        try:
            from transformers import AutoModelForCausalLM

            loaded_model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                torch_dtype=dtype,
                low_cpu_mem_usage=True,
            ).eval()

            loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

            if use_cuda:
                loaded_model = loaded_model.cuda()
        except Exception as fallback_error:
            # Report the primary error too: it is usually the real cause,
            # while the fallback error only says why the retry failed.
            raise RuntimeError(
                f"Failed to load model: {fallback_error} "
                f"(initial error: {primary_error})"
            ) from fallback_error
    else:
        print("Model loaded successfully!")

    # Publish both objects together, only after everything succeeded.
    model, tokenizer = loaded_model, loaded_tokenizer
74
 
75
 
76
  # =========================================================
77
+ # دالة الاستدلال مع ZeroGPU
78
  # =========================================================
79
 
def _generate_without_chat(prompt, max_new_tokens, generate_kwargs):
    """Fallback for models without a ``chat`` method: tokenize, generate, decode."""
    # A single prompt needs no padding; requesting it would raise on
    # tokenizers that define no pad token.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=2048,
    )
    if torch.cuda.is_available():
        inputs = inputs.to("cuda")

    # Reuse EOS as the pad id when the tokenizer has no pad token.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            **generate_kwargs,
        )

    # Drop the echoed prompt tokens and decode only the continuation.
    prompt_length = inputs["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)


@spaces.GPU(duration=60)
def generate_response(
    text_input,
    image_input,
    temperature,
    top_p,
    max_new_tokens,
):
    """Answer a text prompt, optionally about an image, with the loaded model.

    Args:
        text_input: Prompt text; may be empty or ``None`` when an image is given,
            in which case a default "describe this image" prompt is used.
        image_input: Optional image (the UI passes a PIL image) or ``None``.
        temperature: Sampling temperature; a value of 0 or less switches to
            greedy decoding instead of sampling.
        top_p: Nucleus-sampling threshold, used only when sampling.
        max_new_tokens: Upper bound on generated tokens (coerced to ``int``).

    Returns:
        The model's reply as a string, a prompt asking for input when neither
        text nor image was supplied, or ``"Error: ..."`` if anything raised.
    """
    prompt = (text_input or "").strip()
    if not prompt and image_input is None:
        return "Please provide text or image input."

    try:
        load_model()

        # UI sliders / API clients may deliver the token budget as a float.
        max_new_tokens = int(max_new_tokens)
        do_sample = temperature > 0
        sampling_kwargs = (
            {"temperature": temperature, "top_p": top_p} if do_sample else {}
        )

        if image_input is not None and not prompt:
            prompt = "What is shown in this image? Please describe in detail."

        if not hasattr(model, "chat"):
            # Fallback for models that do not support chat (text only; the
            # image cannot be passed through plain ``generate``).
            return _generate_without_chat(
                prompt,
                max_new_tokens,
                {"do_sample": do_sample, **sampling_kwargs},
            )

        # Text-only and image requests both go through the model's own chat
        # method so the chat template and image preprocessing are applied.
        content = [prompt]
        if image_input is not None:
            if hasattr(image_input, "convert"):
                # Normalise RGBA / palette / grayscale uploads to RGB.
                image_input = image_input.convert("RGB")
            content.insert(0, image_input)
        msgs = [{"role": "user", "content": content}]

        with torch.no_grad():
            return model.chat(
                image=None,
                msgs=msgs,
                tokenizer=tokenizer,
                sampling=do_sample,
                max_new_tokens=max_new_tokens,
                **sampling_kwargs,
            )

    except Exception as e:
        import traceback
        traceback.print_exc()
        return f"Error: {str(e)}"
173
 
174
 
175
  # =========================================================
176
  # واجهة Gradio
177
  # =========================================================
178
 
179
+ def create_demo():
180
+ """إنشاء واجهة Gradio البسيطة"""
181
 
182
+ with gr.Blocks(title="MiniCPM-o-2.6") as demo:
183
  gr.Markdown(
184
  """
185
+ # 🤖 MiniCPM-o-2.6 - Multimodal AI
186
 
187
+ **Capabilities:**
188
+ - 🖼️ Image Understanding (OCR, description, analysis)
189
+ - 💬 Text Generation
190
+ - 🧠 8B parameters with GPT-4 level performance
 
 
 
191
 
192
+ Enter your text or upload an image to start!
193
  """
194
  )
195
 
196
  with gr.Row():
197
+ with gr.Column(scale=2):
 
 
 
 
 
 
 
 
 
198
  text_input = gr.Textbox(
199
  label="Text Input",
200
+ placeholder="Enter your question or prompt...",
201
  lines=3
202
  )
203
 
204
  image_input = gr.Image(
205
+ label="Image Input (Optional)",
206
+ type="pil"
 
207
  )
208
 
209
+ with gr.Row():
210
+ submit_btn = gr.Button("🚀 Generate", variant="primary")
211
+ clear_btn = gr.Button("🗑️ Clear")
 
 
212
 
213
+ output = gr.Textbox(
 
 
 
 
 
 
 
 
 
214
  label="Response",
215
+ lines=8,
216
  interactive=False
217
  )
 
 
 
 
 
 
218
 
219
  with gr.Column(scale=1):
220
  gr.Markdown("### ⚙️ Settings")
221
 
222
  temperature = gr.Slider(
223
  label="Temperature",
224
+ minimum=0.1,
225
+ maximum=1.0,
226
  value=0.7,
227
+ step=0.1,
228
+ info="Higher = more creative"
229
  )
230
 
231
  top_p = gr.Slider(
 
233
  minimum=0.1,
234
  maximum=1.0,
235
  value=0.9,
236
+ step=0.05,
237
+ info="Nucleus sampling"
238
  )
239
 
240
  max_new_tokens = gr.Slider(
241
  label="Max Tokens",
242
  minimum=50,
243
+ maximum=1024,
244
  value=512,
245
+ step=50,
246
+ info="Maximum response length"
247
  )
248
 
249
+ gr.Markdown(
250
+ """
251
+ ### 📝 Tips:
252
+ - For images: Upload and ask questions
253
+ - Supports OCR and image analysis
254
+ - Can handle multiple languages
255
+ """
256
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
 
258
+ # Event handlers
259
+ submit_btn.click(
260
+ fn=generate_response,
261
+ inputs=[text_input, image_input, temperature, top_p, max_new_tokens],
262
+ outputs=output,
263
+ api_name="generate"
 
 
264
  )
265
 
266
+ clear_btn.click(
267
+ fn=lambda: (None, None, ""),
268
+ inputs=[],
269
+ outputs=[text_input, image_input, output]
 
 
 
 
 
 
 
 
 
 
 
 
270
  )
271
 
272
+ # Examples
273
  gr.Examples(
274
  examples=[
275
+ ["What is artificial intelligence?", None],
276
+ ["Explain quantum computing in simple terms", None],
277
+ ["Write a poem about nature", None],
 
278
  ],
279
+ inputs=[text_input, image_input],
280
+ outputs=output,
281
+ fn=lambda t, i: generate_response(t, i, 0.7, 0.9, 512),
282
+ cache_examples=False
283
  )
284
+
285
  return demo
286
 
287
 
 
 
 
 
if __name__ == "__main__":
    # Script entry point: build the Gradio Blocks app, then serve it with
    # error display enabled and ssr_mode turned off.
    demo = create_demo()
    demo.launch(show_error=True, ssr_mode=False)