1inkusFace committed
Commit 812b854 · verified · 1 Parent(s): 68c9588

Update app.py

Files changed (1)
  1. app.py +64 -78
app.py CHANGED
@@ -1,36 +1,28 @@
-import spaces
+import spaces
 import os
 import sys
 import subprocess
 import tempfile
 import warnings
 warnings.filterwarnings('ignore')
-
 # ====================== DEPENDENCY SETUP ======================
 def setup():
-    """Fixed setup: Clone repo with submodules + install flash-attn properly + ensure kimia_infer import"""
+    """Fixed setup: Clone repo with submodules + install flash-attn properly"""
     print("🔧 Setting up dependencies...")

-    # 0. Install torch early (required for builds like flash-attn and pyproject.toml)
-    print("📦 Installing base torch...")
+    # 0. Install base torch with compatible versions and CUDA
+    print("📦 Installing base torch, torchaudio, torchvision...")
     try:
         subprocess.run([
             sys.executable, '-m', 'pip', 'install', '-q',
-            'torch==2.6.0', 'torchaudio==2.6.0'
+            'torch==2.6.0', 'torchaudio==2.6.0', 'torchvision==0.21.0',
+            '--index-url', 'https://download.pytorch.org/whl/cu126'
         ], check=True, stdout=sys.stdout, stderr=sys.stderr)
-        print("✅ torch installed")
+        print("✅ torch ecosystem installed")
     except subprocess.CalledProcessError as e:
         print(f"⚠️ torch install failed: {e}")

-    # 1. Flash-Attn
-    print("⚡ Installing flash-attn...")
-    try:
-        subprocess.run([
-            sys.executable, '-m', 'pip', 'install', '-q',
-            'flash-attn==2.7.4.post1', '--no-build-isolation'
-        ], check=True, stdout=sys.stdout, stderr=sys.stderr)
-    except subprocess.CalledProcessError as e:
-        print(f"⚠️ flash-attn failed: {e}")
+    # 1. Flash-Attn (install later after requirements)

     # 2. Clone Kimi-Audio with submodules
     repo_dir = "/tmp/Kimi-Audio"
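The hunk above pins torch, torchaudio, and torchvision together from the cu126 wheel index, so the flash-attn build later in setup compiles against one known CUDA 12.6 stack. A minimal post-install sanity check, not part of the commit (the `+cu126` version suffixes are an assumption about how wheels from that index report themselves):

```python
# Hypothetical check: confirm the pinned wheels resolved to matching CUDA builds.
import torch
import torchaudio
import torchvision

print(torch.__version__)        # expected: 2.6.0+cu126
print(torchaudio.__version__)   # expected: 2.6.0+cu126
print(torchvision.__version__)  # expected: 0.21.0+cu126
print(torch.version.cuda)       # expected: 12.6
```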
@@ -54,7 +46,18 @@ def setup():
     except Exception as e:
         print(f"⚠️ requirements install failed: {e}")

-    # 4. Optional: Try editable install (but not critical now)
+    # 4. Force rebuild flash-attn from source to match torch
+    print("⚡ Forcing flash-attn build from source...")
+    try:
+        subprocess.run([
+            sys.executable, '-m', 'pip', 'install', '-q', 'flash-attn',
+            '--no-binary', 'flash-attn', '--force-reinstall', '--no-build-isolation'
+        ], check=True, stdout=sys.stdout, stderr=sys.stderr)
+        print("✅ flash-attn rebuilt")
+    except Exception as e:
+        print(f"⚠️ flash-attn rebuild failed: {e}")
+
+    # 5. Optional: Try editable install
     print("🎵 Trying to install kimia_infer editable...")
     try:
         subprocess.run([
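This is the core change of the commit: the prebuilt `flash-attn==2.7.4.post1` wheel is dropped in favor of a source build (`--no-binary flash-attn --force-reinstall`) that runs after requirements, so the compiled CUDA extension links against the torch that is actually installed. A wheel built against a different torch ABI typically fails with an `ImportError` (undefined symbol) the moment its extension loads, so a smoke test is simply importing it; a sketch, not in the commit:

```python
# Hypothetical smoke test: an ABI-mismatched flash-attn usually fails here,
# at C-extension load time, rather than later during attention calls.
try:
    import flash_attn
    print(f"flash-attn {flash_attn.__version__} imports cleanly")
except ImportError as e:
    print(f"flash-attn/torch mismatch, rebuild needed: {e}")
```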
@@ -63,11 +66,11 @@
     except Exception as e:
         print(f"⚠️ Editable install failed (ignoring, using path fallback): {e}")

-    # 5. Fallback: Add repo to sys.path for direct import
+    # 6. Fallback: Add repo to sys.path for direct import
     sys.path.insert(0, repo_dir)
     print(f"✅ Added {repo_dir} to sys.path: {sys.path[:2]}") # Debug

-    # 6. Install other deps (overlaps with requirements.txt, but ensures HF-specific; remove torch since installed early)
+    # 7. Install other deps
     print("📚 Installing additional deps...")
     subprocess.run([
         sys.executable, '-m', 'pip', 'install', '-q',
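The renumbered comments keep the two-tier import strategy: if the editable install fails, prepending the clone to `sys.path` still lets `kimia_infer` resolve as a plain package directory. The same fallback in isolation, with the path as in the commit:

```python
# Sketch of the path fallback: kimia_infer imports without `pip install -e`
# as long as the clone directory sits at the front of sys.path.
import importlib.util
import sys

sys.path.insert(0, "/tmp/Kimi-Audio")
spec = importlib.util.find_spec("kimia_infer")
print(spec.origin if spec else "kimia_infer not found on sys.path")
```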
@@ -75,7 +78,7 @@
         'soundfile', 'gradio', 'spaces', 'pillow', 'numpy', 'scipy'
     ], check=True, stdout=sys.stdout, stderr=sys.stderr)

-    # 7. Early import test
+    # 8. Early import test
     try:
         from kimia_infer.api.kimia import KimiAudio
         print("✅ Early import test: kimia_infer SUCCESS")
@@ -83,10 +86,8 @@
         print(f"❌ Early import test failed: {e}")

     print("✅ Setup completed!")
-
 # Run setup before any imports
 setup()
-
 # ====================== IMPORTS ======================
 import torch
 import gradio as gr
@@ -95,7 +96,6 @@ from huggingface_hub import snapshot_download
 import soundfile as sf
 from PIL import Image
 import numpy as np
-
 # Now safe to import kimia
 try:
     from kimia_infer.api.kimia import KimiAudio
@@ -105,7 +105,6 @@ except Exception as e:
     print(f"⚠️ KimiAudio import failed: {e}")
     KIMI_AUDIO_AVAILABLE = False
     KimiAudio = None
-
 # Try to import transformers for Kimi-VL
 try:
     from transformers import AutoProcessor, AutoModelForVision2Seq
@@ -115,11 +114,9 @@ except ImportError:
     KIMI_VL_AVAILABLE = False
     AutoProcessor = None
    AutoModelForVision2Seq = None
-
 print(f"CUDA available: {torch.cuda.is_available()}")
 if torch.cuda.is_available():
     print(f"GPU: {torch.cuda.get_device_name(0)}")
-
 # ====================== MODEL LOADING ======================
 class ModelManager:
     def __init__(self):
@@ -128,13 +125,13 @@ class ModelManager:
         self.vl_model = None
         self.vl_processor = None
         self.vl_device = None
-
+
     @spaces.GPU(duration=120)
     def load_audio_model(self):
         """Load Kimi-Audio with ZeroGPU"""
         if not KIMI_AUDIO_AVAILABLE:
             return "❌ kimia_infer not available"
-
+
         try:
             print("⬇️ Downloading Kimi-Audio-7B...")
             model_path = snapshot_download(
@@ -143,69 +140,67 @@ class ModelManager:
                 local_dir_use_symlinks=False,
                 resume_download=True
             )
-
+
             print(f"🚀 Loading Audio model...")
             device = "cuda" if torch.cuda.is_available() else "cpu"
-
+
             model = KimiAudio(
                 model_path=model_path,
                 load_detokenizer=True
             )
             model = model.to(device)
-
+
             self.audio_model = model
             self.audio_device = device
             return f"✅ Audio model loaded on {device}"
         except Exception as e:
             return f"❌ Audio load failed: {str(e)}"
-
+
     @spaces.GPU(duration=180)
     def load_vl_model(self):
         """Load Kimi-VL with ZeroGPU"""
         if not KIMI_VL_AVAILABLE:
             return "❌ Transformers not available"
-
+
         try:
             print("⬇️ Downloading Kimi-VL-A3B...")
             model_id = "moonshotai/Kimi-VL-A3B-Thinking-2506"
-
+
             processor = AutoProcessor.from_pretrained(
-                model_id,
+                model_id,
                 trust_remote_code=True
             )
-
+
             model = AutoModelForVision2Seq.from_pretrained(
                 model_id,
                 torch_dtype=torch.float16,
                 device_map="auto",
                 trust_remote_code=True
             )
-
+
             self.vl_processor = processor
             self.vl_model = model
             self.vl_device = next(model.parameters()).device
             return f"✅ VL model loaded on {self.vl_device}"
         except Exception as e:
             return f"❌ VL load failed: {str(e)}"
-
 # Global model manager
 manager = ModelManager()
-
 # ====================== INFERENCE FUNCTIONS ======================
 def generate_audio_response(audio_path: str, prompt: str):
     """Kimi-Audio inference"""
     if not manager.audio_model:
         return "Model not loaded. Click 'Load Audio Model' first.", None
-
+
     if not audio_path:
         return "Please upload audio.", None
-
+
     try:
         messages = [
             {"role": "user", "message_type": "text", "content": prompt or "Respond naturally."},
             {"role": "user", "message_type": "audio", "content": audio_path},
         ]
-
+
         sampling_params = {
             "audio_temperature": 0.8,
             "audio_top_k": 10,
@@ -216,40 +211,39 @@ def generate_audio_response(audio_path: str, prompt: str):
             "text_repetition_penalty": 1.0,
             "text_repetition_window_size": 16,
         }
-
+
         wav_output, text_output = manager.audio_model.generate(
             messages, **sampling_params, output_type="both"
         )
-
+
         # Save audio
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
             output_path = f.name
         if isinstance(wav_output, torch.Tensor):
             wav_output = wav_output.detach().cpu().view(-1).numpy()
         sf.write(output_path, wav_output, 24000)
-
+
         return text_output, output_path
     except Exception as e:
         return f"Error: {str(e)}", None
-
 def generate_vl_response(image, text: str):
     """Kimi-VL inference"""
     if not manager.vl_model:
         return "Model not loaded. Click 'Load VL Model' first."
-
+
     if image is None:
         return "Please upload an image."
-
+
     try:
         # Format prompt for Kimi-VL
         prompt = f"<|im_start|>user\n<image>\n{text}<|im_end|>\n<|im_start|>assistant\n"
-
+
         inputs = manager.vl_processor(
             text=text,
             images=image,
             return_tensors="pt"
         ).to(manager.vl_device)
-
+
         outputs = manager.vl_model.generate(
             **inputs,
             max_new_tokens=512,
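The save path above flattens whatever tensor the model returns into a 1-D array before handing it to soundfile, which expects frames (or frames x channels), then writes at Kimi-Audio's 24 kHz rate. The same logic in isolation, with a random tensor standing in for model output:

```python
import tempfile

import soundfile as sf
import torch

wav = torch.randn(1, 24000)  # stand-in for one second of generated audio
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
    sf.write(f.name, wav.detach().cpu().view(-1).numpy(), 24000)
    print(f.name)
```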
@@ -257,50 +251,48 @@ def generate_vl_response(image, text: str):
             temperature=0.7,
             top_p=0.9
         )
-
+
         response = manager.vl_processor.decode(outputs[0], skip_special_tokens=True)
         # Clean up the response (remove the prompt part)
         if "assistant" in response:
             response = response.split("assistant")[-1].strip()
-
+
         return response
     except Exception as e:
         return f"Error: {str(e)}"
-
 def chain_vl_to_audio(image, vl_prompt: str, audio_prompt: str):
     """Pipeline: Image → Kimi-VL description → Kimi-Audio narration"""
     if not manager.vl_model or not manager.audio_model:
         return "Both models must be loaded first.", None, None
-
+
     # Step 1: VL generates description
     description = generate_vl_response(image, vl_prompt)
-
+
     # Step 2: Audio generates speech from description
     # Create a dummy audio input for the text-to-speech mode if supported
     # Or use the description as text input to audio model
     text_out, audio_out = generate_audio_response(None, f"Narrate this: {description}")
-
+
     return description, text_out, audio_out
-
 # ====================== GRADIO UI ======================
 with gr.Blocks(title="Kimi Multimodal Lab • ZeroGPU", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
     # 🎭🎵👁️ Kimi Multimodal Test Lab
     **Kimi-Audio-7B** (Voice) + **Kimi-VL-A3B** (Vision) on HuggingFace ZeroGPU
     """)
-
+
     with gr.Tab("🚀 Model Setup"):
         gr.Markdown("Load models first (takes 60-120s each on ZeroGPU)")
         with gr.Row():
             load_audio_btn = gr.Button("Load Kimi-Audio", variant="primary")
             load_vl_btn = gr.Button("Load Kimi-VL", variant="primary")
-
+
         audio_status = gr.Textbox(label="Audio Model Status", value="Not loaded")
         vl_status = gr.Textbox(label="VL Model Status", value="Not loaded")
-
+
         load_audio_btn.click(manager.load_audio_model, outputs=audio_status)
         load_vl_btn.click(manager.load_vl_model, outputs=vl_status)
-
+
     with gr.Tab("🎵 Kimi-Audio"):
         gr.Markdown("Voice conversation, ASR, audio Q&A")
         with gr.Row():
@@ -316,17 +308,17 @@ with gr.Blocks(title="Kimi Multimodal Lab • ZeroGPU", theme=gr.themes.Soft())
                     placeholder="E.g., 'What is being said?' or 'Summarize the meeting'"
                 )
                 audio_gen_btn = gr.Button("Generate Response", variant="primary")
-
+
             with gr.Column():
                 audio_text_out = gr.Textbox(label="Text Response", lines=4)
                 audio_out = gr.Audio(label="Kimi's Voice Response", type="filepath")
-
+
         audio_gen_btn.click(
             generate_audio_response,
             inputs=[audio_input, audio_text_prompt],
             outputs=[audio_text_out, audio_out]
         )
-
+
     with gr.Tab("👁️ Kimi-VL"):
         gr.Markdown("Visual question answering, image description, visual comedy")
         with gr.Row():
@@ -338,16 +330,16 @@ with gr.Blocks(title="Kimi Multimodal Lab • ZeroGPU", theme=gr.themes.Soft())
                     placeholder="E.g., 'What do you see?' or 'Roast this outfit'"
                 )
                 vl_gen_btn = gr.Button("Analyze Image", variant="primary")
-
+
             with gr.Column():
                 vl_output = gr.Textbox(label="Visual Analysis", lines=8)
-
+
         vl_gen_btn.click(
             generate_vl_response,
             inputs=[image_input, vl_text_prompt],
             outputs=vl_output
         )
-
+
     with gr.Tab("🎭 Combined Pipeline"):
         gr.Markdown("Chain: Image → Description → Voice Narration")
         with gr.Row():
@@ -362,18 +354,18 @@ with gr.Blocks(title="Kimi Multimodal Lab • ZeroGPU", theme=gr.themes.Soft())
                     label="Voice Style Prompt"
                 )
                 chain_btn = gr.Button("Run Full Pipeline", variant="primary")
-
+
             with gr.Column():
                 chain_desc = gr.Textbox(label="Generated Description")
                 chain_text = gr.Textbox(label="Audio Text")
                 chain_audio = gr.Audio(label="Narrated Audio")
-
+
         chain_btn.click(
             chain_vl_to_audio,
             inputs=[chain_image, chain_vl_prompt, chain_audio_prompt],
             outputs=[chain_desc, chain_text, chain_audio]
         )
-
+
     gr.Markdown("---")
     gr.Markdown("""
     **Notes:**
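All three tabs follow the same wiring pattern: a button's `.click` binds a plain Python function to input and output components. Reduced to its skeleton with hypothetical components:

```python
import gradio as gr

def echo(text):
    return text.upper()

with gr.Blocks() as demo:
    box = gr.Textbox(label="In")
    out = gr.Textbox(label="Out")
    gr.Button("Run").click(echo, inputs=box, outputs=out)
```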
@@ -381,14 +373,10 @@ with gr.Blocks(title="Kimi Multimodal Lab • ZeroGPU", theme=gr.themes.Soft())
     - ZeroGPU provides A100/L4 GPUs - cold start ~60-120s per model
     - Keep `max_size=1` in queue to prevent OOM with two large models
     """)
-
-
 import asyncio
 import warnings
-
 # Suppress the event loop cleanup error
 warnings.filterwarnings("ignore", category=ResourceWarning)
-
 # Fix for asyncio cleanup on exit
 def silence_event_loop_closed(func):
     def wrapper(*args, **kwargs):
@@ -400,17 +388,15 @@ def silence_event_loop_closed(func):
             else:
                 raise
     return wrapper
-
 # Patch the event loop to prevent the error
 asyncio.base_events.BaseEventLoop.__del__ = silence_event_loop_closed(
     asyncio.base_events.BaseEventLoop.__del__
 )
-
 # Disable SSR (experimental mode causing the issue)
 demo.queue(max_size=1)
 demo.launch(
     server_name="0.0.0.0",
     server_port=7860,
-    share=False, # Set to True if you need a public gradio.live link
-    ssr_mode=False # <-- DISABLES the experimental SSR causing the error
+    share=False, # Set to True if you need a public gradio.live link
+    ssr_mode=False # <-- DISABLES the experimental SSR causing the error
 )
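The diff shows only the tail of `wrapper` (the `else: raise` branch); its middle falls between hunks. A plausible reconstruction of the standard "Event loop is closed" silencer it implements; the `try/except` body is an assumption, not shown in this diff:

```python
import asyncio

def silence_event_loop_closed(func):
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except RuntimeError as e:
            if "Event loop is closed" in str(e):
                pass  # assumed: swallow the spurious shutdown error
            else:
                raise
    return wrapper

# Applied exactly as in the diff:
asyncio.base_events.BaseEventLoop.__del__ = silence_event_loop_closed(
    asyncio.base_events.BaseEventLoop.__del__
)
```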
 