# fnmodel / app.py
# Switch to vLLM for high-performance, stable inference
import atexit
import json
import os
import signal
import subprocess
import sys
import time

import gradio as gr
import requests

# Model configuration
MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"
VLLM_PORT = 8000
VLLM_PROCESS = None

def start_vllm_server():
    """Start the vLLM OpenAI-compatible server in the background."""
    global VLLM_PROCESS
    if VLLM_PROCESS is not None:
        return "✅ vLLM server already running"
    try:
        # Assemble the vLLM server command
        cmd = [
            "python", "-m", "vllm.entrypoints.openai.api_server",
            "--model", MODEL_NAME,
            "--host", "0.0.0.0",
            "--port", str(VLLM_PORT),
            "--dtype", "bfloat16",
            "--trust-remote-code",
        ]
        VLLM_PROCESS = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            # New process group on POSIX so cleanup() can terminate vLLM's
            # worker processes along with the parent
            preexec_fn=os.setsid if sys.platform != 'win32' else None
        )
        # Poll the health endpoint until the server is ready; loading a 48B
        # model can take several minutes
        max_retries = 60
        for _ in range(max_retries):
            try:
                response = requests.get(f"http://localhost:{VLLM_PORT}/health", timeout=1)
                if response.status_code == 200:
                    return "✅ vLLM server started successfully!"
            except requests.exceptions.RequestException:
                pass
            time.sleep(2)
        return "⚠️ vLLM server started but health check failed"
    except Exception as e:
        return f"❌ Failed to start vLLM server: {str(e)}"

def chat(message, history, system_prompt, max_tokens, temperature, top_p):
    """Send a chat request to the vLLM server and return the assistant reply."""
    try:
        # Build the OpenAI-style messages list
        messages = []
        if system_prompt.strip():
            messages.append({"role": "system", "content": system_prompt.strip()})
        # Add prior turns from the Gradio history ([user, assistant] pairs)
        for human, assistant in history:
            messages.append({"role": "user", "content": human})
            if assistant:
                messages.append({"role": "assistant", "content": assistant})
        # Add the current message
        messages.append({"role": "user", "content": message})
        # Call the vLLM OpenAI-compatible chat completions endpoint
        response = requests.post(
            f"http://localhost:{VLLM_PORT}/v1/chat/completions",
            headers={"Content-Type": "application/json"},
            json={
                "model": MODEL_NAME,
                "messages": messages,
                "max_tokens": max_tokens,
                "temperature": temperature,
                "top_p": top_p,
                "stream": False
            },
            timeout=300
        )
        if response.status_code == 200:
            result = response.json()
            return result["choices"][0]["message"]["content"]
        else:
            return f"❌ Error: {response.status_code} - {response.text}"
    except requests.exceptions.ConnectionError:
        return "❌ Cannot connect to vLLM server. Please start the server first."
    except Exception as e:
        return f"❌ Error: {str(e)}"

# Custom CSS
custom_css = """
.gradio-container {
    max-width: 1200px !important;
}
"""

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Kimi 48B Fine-tuned") as demo:
    gr.Markdown("""
    # 🚀 Kimi Linear 48B A3B - Fine-tuned Inference

    High-performance inference using **vLLM** for the fine-tuned Kimi-Linear-48B-A3B-Instruct model.

    **Model:** `optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune`
    """)
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 🎛️ Server Control")
            start_btn = gr.Button("🚀 Start vLLM Server", variant="primary", size="lg")
            server_status = gr.Markdown("**Status:** Server not started")
            gr.Markdown("---")
            gr.Markdown("### ⚙️ Generation Settings")
            system_prompt = gr.Textbox(
                label="System Prompt (Optional)",
                placeholder="You are a helpful AI assistant...",
                lines=3,
                value=""
            )
            max_tokens = gr.Slider(
                minimum=50,
                maximum=4096,
                value=1024,
                step=1,
                label="Max Tokens"
            )
            temperature = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.7,
                step=0.05,
                label="Temperature"
            )
            top_p = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top P"
            )
            gr.Markdown("""
            ### 📖 Instructions
            1. **Start Server** - Click the button above (takes 2-5 min)
            2. **Wait for "✅"** - The server is ready when you see the green checkmark
            3. **Start Chatting** - Type your message below

            **Note:** The first message may be slow as the model loads into memory.
            """)
        with gr.Column(scale=2):
            gr.Markdown("### 💬 Chat")
            chatbot = gr.Chatbot(
                height=500,
                show_copy_button=True,
                avatar_images=["👤", "🤖"]
            )
            with gr.Row():
                msg = gr.Textbox(
                    label="Your Message",
                    placeholder="Type your message here...",
                    lines=2,
                    scale=4
                )
                send_btn = gr.Button("📤 Send", variant="primary", scale=1)
            with gr.Row():
                clear_btn = gr.Button("🗑️ Clear Chat")

    # Event handlers
    start_btn.click(
        fn=start_vllm_server,
        outputs=server_status
    )

    def user_message(user_msg, history):
        # Append the new user turn and clear the input box
        return "", history + [[user_msg, None]]

    def bot_response(history, system_prompt, max_tokens, temperature, top_p):
        # Answer the last user turn if it does not have a reply yet
        if not history or history[-1][1] is not None:
            return history
        user_msg = history[-1][0]
        bot_msg = chat(user_msg, history[:-1], system_prompt, max_tokens, temperature, top_p)
        history[-1][1] = bot_msg
        return history

    msg.submit(
        user_message,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_response,
        [chatbot, system_prompt, max_tokens, temperature, top_p],
        chatbot
    )
    send_btn.click(
        user_message,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_response,
        [chatbot, system_prompt, max_tokens, temperature, top_p],
        chatbot
    )
    clear_btn.click(lambda: None, None, chatbot, queue=False)
gr.Markdown("""
---
**Powered by vLLM** - High-performance LLM inference engine
**Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
""")

# Cleanup on exit: terminate the vLLM subprocess (whole process group on POSIX)
def cleanup():
    global VLLM_PROCESS
    if VLLM_PROCESS:
        try:
            if sys.platform == 'win32':
                VLLM_PROCESS.terminate()
            else:
                os.killpg(os.getpgid(VLLM_PROCESS.pid), signal.SIGTERM)
        except OSError:
            pass

atexit.register(cleanup)

if __name__ == "__main__":
    demo.queue()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )
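
# Hedged usage sketch (run from a separate process once the server is up): any
# OpenAI-compatible client can talk to vLLM directly, bypassing the Gradio UI.
# Assumes the `openai` package is installed; the api_key value is a placeholder
# since vLLM does not require one by default.
#
#   from openai import OpenAI
#   client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
#   resp = client.chat.completions.create(
#       model="optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune",
#       messages=[{"role": "user", "content": "Hello!"}],
#       max_tokens=64,
#   )
#   print(resp.choices[0].message.content)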