import gradio as gr
import requests
import json
import subprocess
import time
import os
import signal
import sys
import atexit

# Model configuration
MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"
VLLM_PORT = 8000
VLLM_PROCESS = None
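
# vllm.entrypoints.openai.api_server exposes an OpenAI-compatible HTTP API
# (/v1/chat/completions, /health, ...), so the Gradio frontend below can talk
# to the model over plain HTTP instead of importing vLLM directly.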

def start_vllm_server():
    """Start the vLLM server in the background."""
    global VLLM_PROCESS

    if VLLM_PROCESS is not None:
        return "✅ vLLM server already running"

    try:
        # Start vLLM server
        cmd = [
            "python", "-m", "vllm.entrypoints.openai.api_server",
            "--model", MODEL_NAME,
            "--host", "0.0.0.0",
            "--port", str(VLLM_PORT),
            "--dtype", "bfloat16",
            "--trust-remote-code",
        ]
        VLLM_PROCESS = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            preexec_fn=os.setsid if sys.platform != 'win32' else None
        )
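
        # os.setsid puts the server in its own process group, so cleanup()
        # can terminate the whole tree with os.killpg; Windows has no POSIX
        # process groups, so preexec_fn is skipped there.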

        # Poll the health endpoint until the server is up
        # (model loading can take several minutes)
        max_retries = 60
        for _ in range(max_retries):
            try:
                response = requests.get(f"http://localhost:{VLLM_PORT}/health", timeout=1)
                if response.status_code == 200:
                    return "✅ vLLM server started successfully!"
            except requests.exceptions.RequestException:
                pass
            time.sleep(2)
        return "⚠️ vLLM server started but health check failed"
    except Exception as e:
        return f"❌ Failed to start vLLM server: {str(e)}"

def chat(message, history, system_prompt, max_tokens, temperature, top_p):
    """Send a chat message to the vLLM server."""
    try:
        # Build the messages list in OpenAI chat format
        messages = []
        if system_prompt.strip():
            messages.append({"role": "system", "content": system_prompt.strip()})

        # Add conversation history
        for human, assistant in history:
            messages.append({"role": "user", "content": human})
            if assistant:
                messages.append({"role": "assistant", "content": assistant})

        # Add the current message
        messages.append({"role": "user", "content": message})

        # Call the vLLM API
        response = requests.post(
            f"http://localhost:{VLLM_PORT}/v1/chat/completions",
            headers={"Content-Type": "application/json"},
            json={
                "model": MODEL_NAME,
                "messages": messages,
                "max_tokens": max_tokens,
                "temperature": temperature,
                "top_p": top_p,
                "stream": False
            },
            timeout=300
        )

        if response.status_code == 200:
            result = response.json()
            assistant_message = result["choices"][0]["message"]["content"]
            return assistant_message
        else:
            return f"❌ Error: {response.status_code} - {response.text}"
    except requests.exceptions.ConnectionError:
        return "❌ Cannot connect to vLLM server. Please start the server first."
    except Exception as e:
        return f"❌ Error: {str(e)}"

# Custom CSS
custom_css = """
.gradio-container {
    max-width: 1200px !important;
}
"""

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Kimi 48B Fine-tuned") as demo:
    gr.Markdown("""
    # 🚀 Kimi Linear 48B A3B - Fine-tuned Inference

    High-performance inference using **vLLM** for the fine-tuned Kimi-Linear-48B-A3B-Instruct model.

    **Model:** `optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune`
    """)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 🎛️ Server Control")
            start_btn = gr.Button("🚀 Start vLLM Server", variant="primary", size="lg")
            server_status = gr.Markdown("**Status:** Server not started")

            gr.Markdown("---")
            gr.Markdown("### ⚙️ Generation Settings")

            system_prompt = gr.Textbox(
                label="System Prompt (Optional)",
                placeholder="You are a helpful AI assistant...",
                lines=3,
                value=""
            )
            max_tokens = gr.Slider(
                minimum=50,
                maximum=4096,
                value=1024,
                step=1,
                label="Max Tokens"
            )
            temperature = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.7,
                step=0.05,
                label="Temperature"
            )
            top_p = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top P"
            )
| gr.Markdown(""" | |
| ### π Instructions | |
| 1. **Start Server** - Click the button above (takes 2-5 min) | |
| 2. **Wait for "β "** - Server is ready when you see green checkmark | |
| 3. **Start Chatting** - Type your message below | |
| **Note:** First message may be slow as the model loads into memory. | |
| """) | |

        with gr.Column(scale=2):
            gr.Markdown("### 💬 Chat")
            chatbot = gr.Chatbot(
                height=500,
                show_copy_button=True,
                avatar_images=["👤", "🤖"]
            )
            with gr.Row():
                msg = gr.Textbox(
                    label="Your Message",
                    placeholder="Type your message here...",
                    lines=2,
                    scale=4
                )
                send_btn = gr.Button("📤 Send", variant="primary", scale=1)
            with gr.Row():
                clear_btn = gr.Button("🗑️ Clear Chat")

    # Event handlers
    start_btn.click(
        fn=start_vllm_server,
        outputs=server_status
    )

    def user_message(user_msg, history):
        return "", history + [[user_msg, None]]

    def bot_response(history, system_prompt, max_tokens, temperature, top_p):
        if not history or history[-1][1] is not None:
            return history
        user_msg = history[-1][0]
        bot_msg = chat(user_msg, history[:-1], system_prompt, max_tokens, temperature, top_p)
        history[-1][1] = bot_msg
        return history
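
    # Two-step wiring: user_message echoes the user's text into the chat right
    # away (queue=False so the echo skips the queue), then bot_response runs
    # the slow model call and fills in the reply.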
    msg.submit(
        user_message,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_response,
        [chatbot, system_prompt, max_tokens, temperature, top_p],
        chatbot
    )
    send_btn.click(
        user_message,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_response,
        [chatbot, system_prompt, max_tokens, temperature, top_p],
        chatbot
    )
    clear_btn.click(lambda: None, None, chatbot, queue=False)
| gr.Markdown(""" | |
| --- | |
| **Powered by vLLM** - High-performance LLM inference engine | |
| **Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune) | |
| """) | |

# Cleanup on exit
def cleanup():
    global VLLM_PROCESS
    if VLLM_PROCESS:
        try:
            if sys.platform == 'win32':
                VLLM_PROCESS.terminate()
            else:
                # Kill the whole process group created by os.setsid
                os.killpg(os.getpgid(VLLM_PROCESS.pid), signal.SIGTERM)
        except OSError:
            pass

atexit.register(cleanup)
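
# Note: atexit handlers run only on a normal interpreter exit; if the process
# is killed with SIGKILL, the vLLM child may outlive it until the container
# itself is torn down.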

if __name__ == "__main__":
    demo.queue()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )