# Kimi Multimodal Lab — HuggingFace Spaces app
# (source reconstructed from a build-log paste; original header was build-error residue)
import spaces  # must be imported before torch on HF ZeroGPU Spaces
import os
import sys
import subprocess
import tempfile
import warnings

# Best-effort demo app: silence noisy library warnings globally.
warnings.filterwarnings('ignore')
# ====================== DEPENDENCY SETUP ======================
def setup():
    """Clone Kimi-Audio (with submodules) and install its dependencies.

    Steps, in order:
      1. Pin torch/torchaudio/torchvision to mutually compatible CUDA builds.
      2. Shallow-clone the Kimi-Audio repo with submodules.
      3. Install the repo's requirements.txt.
      4. Rebuild flash-attn from source against the installed torch.
      5. Try an editable install of the repo.
      6. Fall back to putting the clone on sys.path.
      7. Install remaining app-level dependencies.
      8. Smoke-test the kimia_infer import.

    Individual step failures are logged and tolerated so the app can still
    start in a degraded mode (the sys.path fallback usually makes
    `kimia_infer` importable even when pip installs fail).
    """
    print("[setup] Setting up dependencies...")

    # 1. Base torch stack: the three versions must match each other and the
    # CUDA build, otherwise flash-attn built later will not load.
    print("[setup] Installing base torch, torchaudio, torchvision...")
    try:
        subprocess.run([
            sys.executable, '-m', 'pip', 'install', '-q',
            'torch==2.6.0', 'torchaudio==2.6.0', 'torchvision==0.21.0',
            '--index-url', 'https://download.pytorch.org/whl/cu126'
        ], check=True, stdout=sys.stdout, stderr=sys.stderr)
        print("[setup] torch ecosystem installed")
    except subprocess.CalledProcessError as e:
        print(f"[setup] WARNING: torch install failed: {e}")

    # 2. Clone Kimi-Audio with submodules (shallow clone keeps it fast).
    repo_dir = "/tmp/Kimi-Audio"
    if not os.path.exists(repo_dir):
        print("[setup] Cloning Kimi-Audio with submodules...")
        subprocess.run([
            'git', 'clone', '--recursive', '--depth', '1',
            'https://github.com/MoonshotAI/Kimi-Audio.git',
            repo_dir
        ], check=True, stdout=sys.stdout, stderr=sys.stderr)

    # 3. Install the repo's requirements.
    # Bug fix: the original used os.chdir(repo_dir) and only restored the
    # working directory *after* the pip call inside the same try, so a
    # failed install left the process chdir'd into the repo. Using cwd=
    # avoids touching the process working directory at all.
    print("[setup] Installing from requirements.txt...")
    try:
        subprocess.run([
            sys.executable, '-m', 'pip', 'install', '-q', '-r', 'requirements.txt'
        ], check=True, cwd=repo_dir, stdout=sys.stdout, stderr=sys.stderr)
    except Exception as e:
        print(f"[setup] WARNING: requirements install failed: {e}")

    # 4. Rebuild flash-attn from source so its ABI matches the torch that
    # was just installed (prebuilt wheels frequently mismatch).
    print("[setup] Forcing flash-attn build from source...")
    try:
        subprocess.run([
            sys.executable, '-m', 'pip', 'install', '-q', 'flash-attn',
            '--no-binary', 'flash-attn', '--force-reinstall', '--no-build-isolation'
        ], check=True, stdout=sys.stdout, stderr=sys.stderr)
        print("[setup] flash-attn rebuilt")
    except Exception as e:
        print(f"[setup] WARNING: flash-attn rebuild failed: {e}")

    # 5. Editable install is optional; the sys.path fallback below covers it.
    print("[setup] Trying to install kimia_infer editable...")
    try:
        subprocess.run([
            sys.executable, '-m', 'pip', 'install', '-q', '-e', repo_dir
        ], check=True, stdout=sys.stdout, stderr=sys.stderr)
    except Exception as e:
        print(f"[setup] WARNING: editable install failed (ignoring, using path fallback): {e}")

    # 6. Fallback: make `import kimia_infer` resolve straight from the clone.
    sys.path.insert(0, repo_dir)
    print(f"[setup] Added {repo_dir} to sys.path: {sys.path[:2]}")  # debug

    # 7. Remaining app-level dependencies.
    print("[setup] Installing additional deps...")
    subprocess.run([
        sys.executable, '-m', 'pip', 'install', '-q',
        'transformers>=4.36.0', 'accelerate', 'huggingface_hub',
        'soundfile', 'gradio', 'spaces', 'pillow', 'numpy', 'scipy'
    ], check=True, stdout=sys.stdout, stderr=sys.stderr)

    # 8. Smoke-test the import so failures surface early in the build log.
    try:
        from kimia_infer.api.kimia import KimiAudio
        print("[setup] Early import test: kimia_infer SUCCESS")
    except Exception as e:
        print(f"[setup] Early import test failed: {e}")

    print("[setup] Setup completed!")


# Run setup before the heavy imports below so they resolve correctly.
setup()
# ====================== IMPORTS ======================
import torch
import gradio as gr
import spaces
from huggingface_hub import snapshot_download
import soundfile as sf
from PIL import Image
import numpy as np

# kimia_infer is only importable if setup() succeeded (pip install or the
# sys.path fallback); degrade gracefully otherwise and record availability.
try:
    from kimia_infer.api.kimia import KimiAudio
    KIMI_AUDIO_AVAILABLE = True
    print("KimiAudio imported successfully")
except Exception as e:
    print(f"WARNING: KimiAudio import failed: {e}")
    KIMI_AUDIO_AVAILABLE = False
    KimiAudio = None

# transformers is required for the Kimi-VL vision model.
try:
    from transformers import AutoProcessor, AutoModelForVision2Seq
    KIMI_VL_AVAILABLE = True
    print("Transformers imported for Kimi-VL")
except ImportError:
    KIMI_VL_AVAILABLE = False
    AutoProcessor = None
    AutoModelForVision2Seq = None

print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
# ====================== MODEL LOADING ======================
class ModelManager:
    """Holds the lazily loaded Kimi-Audio and Kimi-VL models.

    Models are loaded on demand from the Gradio UI so the Space starts
    quickly and only pays the download/load cost when a tab is used.
    Both loaders return a human-readable status string for the UI.
    """

    def __init__(self):
        # Kimi-Audio model and the device string it was moved to.
        self.audio_model = None
        self.audio_device = None
        # Kimi-VL model, its processor, and the device of its parameters.
        self.vl_model = None
        self.vl_processor = None
        self.vl_device = None

    def load_audio_model(self):
        """Download and load Kimi-Audio-7B-Instruct; return a status string."""
        if not KIMI_AUDIO_AVAILABLE:
            return "ERROR: kimia_infer not available"
        try:
            print("Downloading Kimi-Audio-7B...")
            model_path = snapshot_download(
                repo_id="moonshotai/Kimi-Audio-7B-Instruct",
                local_dir="./kimi-audio-model",
                # NOTE(review): both kwargs below are deprecated/no-ops in
                # recent huggingface_hub releases; kept for compatibility
                # with older pinned versions — confirm against the hub lib.
                local_dir_use_symlinks=False,
                resume_download=True
            )
            print("Loading Audio model...")
            device = "cuda" if torch.cuda.is_available() else "cpu"
            model = KimiAudio(
                model_path=model_path,
                load_detokenizer=True  # needed to synthesize audio output
            )
            model = model.to(device)
            self.audio_model = model
            self.audio_device = device
            return f"Audio model loaded on {device}"
        except Exception as e:
            return f"ERROR: Audio load failed: {str(e)}"

    def load_vl_model(self):
        """Download and load Kimi-VL-A3B-Thinking; return a status string."""
        if not KIMI_VL_AVAILABLE:
            return "ERROR: Transformers not available"
        try:
            print("Downloading Kimi-VL-A3B...")
            model_id = "moonshotai/Kimi-VL-A3B-Thinking-2506"
            processor = AutoProcessor.from_pretrained(
                model_id,
                trust_remote_code=True
            )
            model = AutoModelForVision2Seq.from_pretrained(
                model_id,
                torch_dtype=torch.float16,
                device_map="auto",  # let accelerate place/shard the model
                trust_remote_code=True
            )
            self.vl_processor = processor
            self.vl_model = model
            self.vl_device = next(model.parameters()).device
            return f"VL model loaded on {self.vl_device}"
        except Exception as e:
            return f"ERROR: VL load failed: {str(e)}"


# Single global manager shared by all Gradio callbacks.
manager = ModelManager()
# ====================== INFERENCE FUNCTIONS ======================
def generate_audio_response(audio_path: str, prompt: str):
    """Run Kimi-Audio on an uploaded/recorded clip.

    Args:
        audio_path: Filesystem path of the input audio (Gradio `filepath`).
        prompt: Text instruction; a default is used when empty.

    Returns:
        (text_response, wav_path): wav_path is None on error or when the
        model produced no audio waveform.
    """
    if not manager.audio_model:
        return "Model not loaded. Click 'Load Audio Model' first.", None
    if not audio_path:
        return "Please upload audio.", None
    try:
        messages = [
            {"role": "user", "message_type": "text", "content": prompt or "Respond naturally."},
            {"role": "user", "message_type": "audio", "content": audio_path},
        ]
        # Sampling defaults from the Kimi-Audio reference implementation.
        sampling_params = {
            "audio_temperature": 0.8,
            "audio_top_k": 10,
            "text_temperature": 0.7,
            "text_top_k": 5,
            "audio_repetition_penalty": 1.0,
            "audio_repetition_window_size": 64,
            "text_repetition_penalty": 1.0,
            "text_repetition_window_size": 16,
        }
        wav_output, text_output = manager.audio_model.generate(
            messages, **sampling_params, output_type="both"
        )
        # Robustness fix: the model may return no waveform; return the text
        # instead of crashing inside sf.write.
        if wav_output is None:
            return text_output, None
        # Persist the waveform to a temp file for the Gradio Audio component.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            output_path = f.name
        if isinstance(wav_output, torch.Tensor):
            wav_output = wav_output.detach().cpu().view(-1).numpy()
        sf.write(output_path, wav_output, 24000)  # Kimi-Audio emits 24 kHz audio
        return text_output, output_path
    except Exception as e:
        return f"Error: {str(e)}", None
def generate_vl_response(image, text: str):
    """Run Kimi-VL visual question answering on a PIL image.

    Args:
        image: PIL image from the Gradio Image component (or None).
        text: The user's question/prompt about the image.

    Returns:
        The model's text reply, or an error/status message string.
    """
    if not manager.vl_model:
        return "Model not loaded. Click 'Load VL Model' first."
    if image is None:
        return "Please upload an image."
    try:
        # Bug fix: the chat-formatted prompt was built but never passed to
        # the processor (raw `text` was sent instead), so the <image> tag
        # and the assistant turn marker were missing from the model input
        # even though the cleanup below splits on "assistant".
        prompt = f"<|im_start|>user\n<image>\n{text}<|im_end|>\n<|im_start|>assistant\n"
        inputs = manager.vl_processor(
            text=prompt,
            images=image,
            return_tensors="pt"
        ).to(manager.vl_device)
        outputs = manager.vl_model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9
        )
        response = manager.vl_processor.decode(outputs[0], skip_special_tokens=True)
        # The decoded sequence includes the prompt; keep only the reply part.
        if "assistant" in response:
            response = response.split("assistant")[-1].strip()
        return response
    except Exception as e:
        return f"Error: {str(e)}"
def chain_vl_to_audio(image, vl_prompt: str, audio_prompt: str):
    """Pipeline: image -> Kimi-VL description -> Kimi-Audio narration.

    Args:
        image: PIL image to describe.
        vl_prompt: Instruction for the vision model.
        audio_prompt: Style instruction for the narration step.

    Returns:
        (description, narration_text, narration_wav_path).
    """
    if not manager.vl_model or not manager.audio_model:
        return "Both models must be loaded first.", None, None
    # Step 1: the VL model generates a description of the image.
    description = generate_vl_response(image, vl_prompt)
    # Step 2: narrate the description.
    # Bug fix: `audio_prompt` was previously ignored (a hard-coded
    # "Narrate this:" prefix was used); include the user's style prompt.
    # NOTE(review): generate_audio_response requires an audio input and
    # returns "Please upload audio." when given None — a real text-only
    # TTS path through Kimi-Audio is still TODO.
    text_out, audio_out = generate_audio_response(
        None, f"{audio_prompt} {description}"
    )
    return description, text_out, audio_out
# ====================== GRADIO UI ======================
with gr.Blocks(title="Kimi Multimodal Lab - ZeroGPU", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # Kimi Multimodal Test Lab
    **Kimi-Audio-7B** (Voice) + **Kimi-VL-A3B** (Vision) on HuggingFace ZeroGPU
    """)

    # --- Tab 1: load the two models on demand ---
    with gr.Tab("Model Setup"):
        gr.Markdown("Load models first (takes 60-120s each on ZeroGPU)")
        with gr.Row():
            load_audio_btn = gr.Button("Load Kimi-Audio", variant="primary")
            load_vl_btn = gr.Button("Load Kimi-VL", variant="primary")
        audio_status = gr.Textbox(label="Audio Model Status", value="Not loaded")
        vl_status = gr.Textbox(label="VL Model Status", value="Not loaded")
        load_audio_btn.click(manager.load_audio_model, outputs=audio_status)
        load_vl_btn.click(manager.load_vl_model, outputs=vl_status)

    # --- Tab 2: audio in -> text + voice out ---
    with gr.Tab("Kimi-Audio"):
        gr.Markdown("Voice conversation, ASR, audio Q&A")
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    label="Upload/Record Audio",
                    sources=["microphone", "upload"],
                    type="filepath"
                )
                audio_text_prompt = gr.Textbox(
                    label="Text Instruction",
                    value="Transcribe this audio accurately.",
                    placeholder="E.g., 'What is being said?' or 'Summarize the meeting'"
                )
                audio_gen_btn = gr.Button("Generate Response", variant="primary")
            with gr.Column():
                audio_text_out = gr.Textbox(label="Text Response", lines=4)
                audio_out = gr.Audio(label="Kimi's Voice Response", type="filepath")
        audio_gen_btn.click(
            generate_audio_response,
            inputs=[audio_input, audio_text_prompt],
            outputs=[audio_text_out, audio_out]
        )

    # --- Tab 3: image in -> text analysis out ---
    with gr.Tab("Kimi-VL"):
        gr.Markdown("Visual question answering, image description, visual comedy")
        with gr.Row():
            with gr.Column():
                image_input = gr.Image(type="pil", label="Upload Image")
                vl_text_prompt = gr.Textbox(
                    label="Question/Prompt",
                    value="Describe this image in a funny way.",
                    placeholder="E.g., 'What do you see?' or 'Roast this outfit'"
                )
                vl_gen_btn = gr.Button("Analyze Image", variant="primary")
            with gr.Column():
                vl_output = gr.Textbox(label="Visual Analysis", lines=8)
        vl_gen_btn.click(
            generate_vl_response,
            inputs=[image_input, vl_text_prompt],
            outputs=vl_output
        )

    # --- Tab 4: chained image -> description -> narration pipeline ---
    with gr.Tab("Combined Pipeline"):
        gr.Markdown("Chain: Image -> Description -> Voice Narration")
        with gr.Row():
            with gr.Column():
                chain_image = gr.Image(type="pil", label="Input Image")
                chain_vl_prompt = gr.Textbox(
                    value="Describe this scene vividly in 2 sentences.",
                    label="Image Analysis Prompt"
                )
                chain_audio_prompt = gr.Textbox(
                    value="Narrate this description dramatically.",
                    label="Voice Style Prompt"
                )
                chain_btn = gr.Button("Run Full Pipeline", variant="primary")
            with gr.Column():
                chain_desc = gr.Textbox(label="Generated Description")
                chain_text = gr.Textbox(label="Audio Text")
                chain_audio = gr.Audio(label="Narrated Audio")
        chain_btn.click(
            chain_vl_to_audio,
            inputs=[chain_image, chain_vl_prompt, chain_audio_prompt],
            outputs=[chain_desc, chain_text, chain_audio]
        )

    gr.Markdown("---")
    gr.Markdown("""
    **Notes:**
    - First load requires downloading ~7GB (Audio) + ~6GB (VL) = ~13GB total
    - ZeroGPU provides A100/L4 GPUs - cold start ~60-120s per model
    - Keep `max_size=1` in queue to prevent OOM with two large models
    """)
import asyncio
import warnings

# Suppress noisy ResourceWarnings during interpreter shutdown.
warnings.filterwarnings("ignore", category=ResourceWarning)


def silence_event_loop_closed(func):
    """Decorator: swallow the spurious 'Event loop is closed' RuntimeError.

    asyncio event loops garbage-collected at interpreter exit can raise this
    error from __del__; any other RuntimeError is re-raised unchanged and
    the wrapped function's return value is passed through on success.
    """
    from functools import wraps  # local import: keeps module imports untouched

    @wraps(func)  # preserve __name__/__doc__ of the patched method
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except RuntimeError as e:
            if "Event loop is closed" not in str(e):
                raise
    return wrapper


# Patch BaseEventLoop.__del__ so shutdown doesn't spam the build log.
asyncio.base_events.BaseEventLoop.__del__ = silence_event_loop_closed(
    asyncio.base_events.BaseEventLoop.__del__
)
# Queue limited to one concurrent job: two 7B-class models easily OOM a GPU.
demo.queue(max_size=1)
demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=False,     # set True if a public gradio.live link is needed
    ssr_mode=False   # disable experimental SSR (was causing startup errors)
)