import gradio as gr
import requests
import subprocess
import time
import os
import signal
import sys
import atexit

# Model configuration
MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"
VLLM_PORT = 8000
VLLM_PROCESS = None


def start_vllm_server():
    """Start the vLLM OpenAI-compatible server in the background."""
    global VLLM_PROCESS
    if VLLM_PROCESS is not None:
        return "✅ vLLM server already running"

    try:
        # Launch vLLM's OpenAI-compatible API server as a subprocess
        cmd = [
            "python", "-m", "vllm.entrypoints.openai.api_server",
            "--model", MODEL_NAME,
            "--host", "0.0.0.0",
            "--port", str(VLLM_PORT),
            "--dtype", "bfloat16",
            "--trust-remote-code",
        ]
        VLLM_PROCESS = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            # New process group on POSIX so the server and its workers
            # can be terminated together on exit
            preexec_fn=os.setsid if sys.platform != "win32" else None,
        )

        # Poll the health endpoint until the server is ready
        max_retries = 60
        for _ in range(max_retries):
            try:
                response = requests.get(
                    f"http://localhost:{VLLM_PORT}/health", timeout=1
                )
                if response.status_code == 200:
                    return "✅ vLLM server started successfully!"
            except requests.exceptions.RequestException:
                pass
            time.sleep(2)

        return "⚠️ vLLM server started but health check failed"

    except Exception as e:
        return f"❌ Failed to start vLLM server: {e}"


def chat(message, history, system_prompt, max_tokens, temperature, top_p):
    """Send a chat request to the vLLM server and return the assistant's reply."""
    try:
        # Build the OpenAI-style message list
        messages = []
        if system_prompt.strip():
            messages.append({"role": "system", "content": system_prompt.strip()})

        # Add prior turns (Gradio tuples format: [user, assistant] pairs)
        for human, assistant in history:
            messages.append({"role": "user", "content": human})
            if assistant:
                messages.append({"role": "assistant", "content": assistant})

        # Add the current message
        messages.append({"role": "user", "content": message})

        # Call the OpenAI-compatible chat completions endpoint
        response = requests.post(
            f"http://localhost:{VLLM_PORT}/v1/chat/completions",
            headers={"Content-Type": "application/json"},
            json={
                "model": MODEL_NAME,
                "messages": messages,
                "max_tokens": max_tokens,
                "temperature": temperature,
                "top_p": top_p,
                "stream": False,
            },
            timeout=300,
        )

        if response.status_code == 200:
            result = response.json()
            return result["choices"][0]["message"]["content"]
        return f"❌ Error: {response.status_code} - {response.text}"

    except requests.exceptions.ConnectionError:
        return "❌ Cannot connect to vLLM server. Please start the server first."
    except Exception as e:
        return f"❌ Error: {e}"


# Custom CSS
custom_css = """
.gradio-container {
    max-width: 1200px !important;
}
"""

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Kimi 48B Fine-tuned") as demo:
    gr.Markdown("""
    # 🚀 Kimi Linear 48B A3B - Fine-tuned Inference

    High-performance inference using **vLLM** for the fine-tuned Kimi-Linear-48B-A3B-Instruct model.

    **Model:** `optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune`
    """)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 🎛️ Server Control")
            start_btn = gr.Button("🚀 Start vLLM Server", variant="primary", size="lg")
            server_status = gr.Markdown("**Status:** Server not started")

            gr.Markdown("---")
            gr.Markdown("### ⚙️ Generation Settings")

            system_prompt = gr.Textbox(
                label="System Prompt (Optional)",
                placeholder="You are a helpful AI assistant...",
                lines=3,
                value="",
            )
            max_tokens = gr.Slider(
                minimum=50, maximum=4096, value=1024, step=1,
                label="Max Tokens",
            )
            temperature = gr.Slider(
                minimum=0.0, maximum=2.0, value=0.7, step=0.05,
                label="Temperature",
            )
            top_p = gr.Slider(
                minimum=0.0, maximum=1.0, value=0.9, step=0.05,
                label="Top P",
            )

            gr.Markdown("""
            ### 📖 Instructions
            1. **Start Server** - Click the button above (takes 2-5 min)
            2. **Wait for "✅"** - The server is ready when you see the green checkmark
            3. **Start Chatting** - Type your message below

            **Note:** The first message may be slow while the model loads into memory.
            """)

        with gr.Column(scale=2):
            gr.Markdown("### 💬 Chat")
            chatbot = gr.Chatbot(
                height=500,
                show_copy_button=True,
            )

            with gr.Row():
                msg = gr.Textbox(
                    label="Your Message",
                    placeholder="Type your message here...",
                    lines=2,
                    scale=4,
                )
                send_btn = gr.Button("📤 Send", variant="primary", scale=1)

            with gr.Row():
                clear_btn = gr.Button("🗑️ Clear Chat")

    # Event handlers
    start_btn.click(
        fn=start_vllm_server,
        outputs=server_status,
    )

    def user_message(user_msg, history):
        """Append the user's message to the history and clear the input box."""
        return "", history + [[user_msg, None]]

    def bot_response(history, system_prompt, max_tokens, temperature, top_p):
        """Fill in the assistant's reply for the last pending user turn."""
        if not history or history[-1][1] is not None:
            return history
        user_msg = history[-1][0]
        bot_msg = chat(user_msg, history[:-1], system_prompt, max_tokens, temperature, top_p)
        history[-1][1] = bot_msg
        return history

    msg.submit(
        user_message, [msg, chatbot], [msg, chatbot], queue=False
    ).then(
        bot_response,
        [chatbot, system_prompt, max_tokens, temperature, top_p],
        chatbot,
    )
    send_btn.click(
        user_message, [msg, chatbot], [msg, chatbot], queue=False
    ).then(
        bot_response,
        [chatbot, system_prompt, max_tokens, temperature, top_p],
        chatbot,
    )
    clear_btn.click(lambda: None, None, chatbot, queue=False)

    gr.Markdown("""
    ---
    **Powered by vLLM** - High-performance LLM inference engine

    **Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
    """)


# Cleanup on exit: terminate the vLLM subprocess (whole process group on POSIX)
def cleanup():
    global VLLM_PROCESS
    if VLLM_PROCESS:
        try:
            if sys.platform == "win32":
                VLLM_PROCESS.terminate()
            else:
                os.killpg(os.getpgid(VLLM_PROCESS.pid), signal.SIGTERM)
        except OSError:
            pass


atexit.register(cleanup)

if __name__ == "__main__":
    demo.queue()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )
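
# --- Quick smoke test (a minimal sketch, not wired into the app) ---
# Assuming the server is up and healthy on port 8000, the OpenAI-compatible
# endpoint can also be queried directly, bypassing the UI. The prompt below is
# illustrative; the URL and payload mirror the chat() call above.
#
#   import requests
#   resp = requests.post(
#       "http://localhost:8000/v1/chat/completions",
#       json={
#           "model": "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune",
#           "messages": [{"role": "user", "content": "Hello!"}],
#           "max_tokens": 64,
#       },
#       timeout=300,
#   )
#   print(resp.json()["choices"][0]["message"]["content"])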