# fnmodel / app.py
# Switch to vLLM for high-performance, stable inference
import atexit
import json
import os
import signal
import subprocess
import sys
import time

import gradio as gr
import requests

# Model configuration
MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"
VLLM_PORT = 8000
VLLM_PROCESS = None

def start_vllm_server():
    """Start the vLLM OpenAI-compatible server in the background."""
    global VLLM_PROCESS
    if VLLM_PROCESS is not None:
        return "✅ vLLM server already running"
    try:
        # Assemble the vLLM server command
        cmd = [
            "python", "-m", "vllm.entrypoints.openai.api_server",
            "--model", MODEL_NAME,
            "--host", "0.0.0.0",
            "--port", str(VLLM_PORT),
            "--dtype", "bfloat16",
            "--trust-remote-code",
        ]
        VLLM_PROCESS = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            # New process group on POSIX so cleanup() can terminate vLLM's
            # worker processes along with the parent
            preexec_fn=os.setsid if sys.platform != 'win32' else None
        )
        # Poll the health endpoint until the server is ready; loading a 48B
        # model can take several minutes
        max_retries = 60
        for _ in range(max_retries):
            try:
                response = requests.get(f"http://localhost:{VLLM_PORT}/health", timeout=1)
                if response.status_code == 200:
                    return "✅ vLLM server started successfully!"
            except requests.exceptions.RequestException:
                pass
            time.sleep(2)
        return "⚠️ vLLM server started but health check failed"
    except Exception as e:
        return f"❌ Failed to start vLLM server: {str(e)}"

def chat(message, history, system_prompt, max_tokens, temperature, top_p):
    """Send a chat request to the vLLM server and return the assistant reply."""
    try:
        # Build the OpenAI-style messages list
        messages = []
        if system_prompt.strip():
            messages.append({"role": "system", "content": system_prompt.strip()})
        # Add prior turns from the Gradio history ([user, assistant] pairs)
        for human, assistant in history:
            messages.append({"role": "user", "content": human})
            if assistant:
                messages.append({"role": "assistant", "content": assistant})
        # Add the current message
        messages.append({"role": "user", "content": message})
        # Call the vLLM OpenAI-compatible chat completions endpoint
        response = requests.post(
            f"http://localhost:{VLLM_PORT}/v1/chat/completions",
            headers={"Content-Type": "application/json"},
            json={
                "model": MODEL_NAME,
                "messages": messages,
                "max_tokens": max_tokens,
                "temperature": temperature,
                "top_p": top_p,
                "stream": False
            },
            timeout=300
        )
        if response.status_code == 200:
            result = response.json()
            return result["choices"][0]["message"]["content"]
        else:
            return f"❌ Error: {response.status_code} - {response.text}"
    except requests.exceptions.ConnectionError:
        return "❌ Cannot connect to vLLM server. Please start the server first."
    except Exception as e:
        return f"❌ Error: {str(e)}"

# Custom CSS
custom_css = """
.gradio-container {
    max-width: 1200px !important;
}
"""

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Kimi 48B Fine-tuned") as demo:
    gr.Markdown("""
    # 🚀 Kimi Linear 48B A3B - Fine-tuned Inference

    High-performance inference using **vLLM** for the fine-tuned Kimi-Linear-48B-A3B-Instruct model.

    **Model:** `optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune`
    """)
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 🎛️ Server Control")
            start_btn = gr.Button("🚀 Start vLLM Server", variant="primary", size="lg")
            server_status = gr.Markdown("**Status:** Server not started")
            gr.Markdown("---")
            gr.Markdown("### ⚙️ Generation Settings")
            system_prompt = gr.Textbox(
                label="System Prompt (Optional)",
                placeholder="You are a helpful AI assistant...",
                lines=3,
                value=""
            )
            max_tokens = gr.Slider(
                minimum=50,
                maximum=4096,
                value=1024,
                step=1,
                label="Max Tokens"
            )
            temperature = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.7,
                step=0.05,
                label="Temperature"
            )
            top_p = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top P"
            )
            gr.Markdown("""
            ### 📖 Instructions
            1. **Start Server** - Click the button above (takes 2-5 min)
            2. **Wait for "✅"** - The server is ready when you see the green checkmark
            3. **Start Chatting** - Type your message below

            **Note:** The first message may be slow as the model loads into memory.
            """)
        with gr.Column(scale=2):
            gr.Markdown("### 💬 Chat")
            chatbot = gr.Chatbot(
                height=500,
                show_copy_button=True,
                avatar_images=["👤", "🤖"]
            )
            with gr.Row():
                msg = gr.Textbox(
                    label="Your Message",
                    placeholder="Type your message here...",
                    lines=2,
                    scale=4
                )
                send_btn = gr.Button("📤 Send", variant="primary", scale=1)
            with gr.Row():
                clear_btn = gr.Button("🗑️ Clear Chat")

    # Event handlers
    start_btn.click(
        fn=start_vllm_server,
        outputs=server_status
    )

    def user_message(user_msg, history):
        # Append the new user turn and clear the input box
        return "", history + [[user_msg, None]]

    def bot_response(history, system_prompt, max_tokens, temperature, top_p):
        # Answer the last user turn if it does not have a reply yet
        if not history or history[-1][1] is not None:
            return history
        user_msg = history[-1][0]
        bot_msg = chat(user_msg, history[:-1], system_prompt, max_tokens, temperature, top_p)
        history[-1][1] = bot_msg
        return history

    msg.submit(
        user_message,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_response,
        [chatbot, system_prompt, max_tokens, temperature, top_p],
        chatbot
    )
    send_btn.click(
        user_message,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_response,
        [chatbot, system_prompt, max_tokens, temperature, top_p],
        chatbot
    )
    clear_btn.click(lambda: None, None, chatbot, queue=False)
gr.Markdown("""
---
**Powered by vLLM** - High-performance LLM inference engine
**Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
""")

# Cleanup on exit: terminate the vLLM subprocess (whole process group on POSIX)
def cleanup():
    global VLLM_PROCESS
    if VLLM_PROCESS:
        try:
            if sys.platform == 'win32':
                VLLM_PROCESS.terminate()
            else:
                os.killpg(os.getpgid(VLLM_PROCESS.pid), signal.SIGTERM)
        except OSError:
            pass

atexit.register(cleanup)

if __name__ == "__main__":
    demo.queue()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )
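
# Hedged usage sketch (run from a separate process once the server is up): any
# OpenAI-compatible client can talk to vLLM directly, bypassing the Gradio UI.
# Assumes the `openai` package is installed; the api_key value is a placeholder
# since vLLM does not require one by default.
#
#   from openai import OpenAI
#   client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
#   resp = client.chat.completions.create(
#       model="optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune",
#       messages=[{"role": "user", "content": "Hello!"}],
#       max_tokens=64,
#   )
#   print(resp.choices[0].message.content)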