import gradio as gr
import requests
import json
import subprocess
import time
import os
import signal
import sys
# Model configuration
MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"
VLLM_PORT = 8000
VLLM_PROCESS = None
def start_vllm_server():
"""Start vLLM server in background"""
global VLLM_PROCESS
if VLLM_PROCESS is not None:
return "βœ… vLLM server already running"
try:
# Start vLLM server with tensor parallelism for multi-GPU
cmd = [
"python3", "-m", "vllm.entrypoints.openai.api_server",
"--model", MODEL_NAME,
"--host", "0.0.0.0",
"--port", str(VLLM_PORT),
"--dtype", "bfloat16",
"--trust-remote-code",
"--tensor-parallel-size", "4", # Use all 4 GPUs
"--max-model-len", "8192", # Limit context to save memory
]
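        # Sizing note (rough, assumes dense bf16 storage): 48B params x 2 bytes
        # ~= 96 GB of weights, hence sharding across 4 GPUs and capping the
        # context length to leave headroom for the KV cache.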
        log_file = open("/tmp/vllm.log", "w")  # handle stays open for the lifetime of the child process
VLLM_PROCESS = subprocess.Popen(
cmd,
stdout=log_file,
stderr=subprocess.STDOUT,
preexec_fn=os.setsid if sys.platform != 'win32' else None
)
status_msg = "πŸ”„ **vLLM server starting...**\n\n"
status_msg += "This takes 5-10 minutes for the 48B model.\n\n"
status_msg += "**Progress:**\n"
status_msg += "1. Downloading model (if not cached)\n"
status_msg += "2. Loading weights across 4 GPUs\n"
status_msg += "3. Initializing inference engine\n\n"
status_msg += "**Status:** Initializing...\n\n"
status_msg += "_Check logs at /tmp/vllm.log for details_"
# Wait longer for big model - up to 10 minutes
max_retries = 300 # 300 * 2 seconds = 10 minutes
for i in range(max_retries):
try:
response = requests.get(f"http://localhost:{VLLM_PORT}/health", timeout=2)
if response.status_code == 200:
return "βœ… **vLLM server started successfully!**\n\nYou can now start chatting below."
except requests.exceptions.RequestException:
pass
# Check if process died
if VLLM_PROCESS.poll() is not None:
# Process ended
with open("/tmp/vllm.log", "r") as f:
last_lines = f.readlines()[-20:]
error_msg = "❌ **vLLM server crashed during startup**\n\n"
error_msg += "**Last log lines:**\n```\n"
error_msg += "".join(last_lines)
error_msg += "\n```"
                yield error_msg
                return
time.sleep(2)
        # Timed out, but the process is still running
        yield "⚠️ **vLLM server started but is taking longer than expected**\n\nIt may still be initializing. Wait a few more minutes and try sending a message."
except Exception as e:
return f"❌ **Failed to start vLLM server:**\n\n{str(e)}"
def view_logs():
"""View vLLM server logs"""
try:
if not os.path.exists("/tmp/vllm.log"):
return "πŸ“ No logs yet. Start the server first."
with open("/tmp/vllm.log", "r") as f:
lines = f.readlines()
last_lines = lines[-50:] # Last 50 lines
log_text = "πŸ“‹ **vLLM Server Logs (Last 50 lines)**\n\n```\n"
log_text += "".join(last_lines)
log_text += "\n```"
return log_text
except Exception as e:
return f"❌ Error reading logs: {str(e)}"
def chat(message, history, system_prompt, max_tokens, temperature, top_p):
"""Send chat message to vLLM server"""
try:
# Build messages
messages = []
if system_prompt.strip():
messages.append({"role": "system", "content": system_prompt.strip()})
# Add history
for human, assistant in history:
messages.append({"role": "user", "content": human})
if assistant:
messages.append({"role": "assistant", "content": assistant})
# Add current message
messages.append({"role": "user", "content": message})
# Call vLLM API
response = requests.post(
f"http://localhost:{VLLM_PORT}/v1/chat/completions",
headers={"Content-Type": "application/json"},
json={
"model": MODEL_NAME,
"messages": messages,
"max_tokens": max_tokens,
"temperature": temperature,
"top_p": top_p,
"stream": False
},
timeout=300
)
if response.status_code == 200:
result = response.json()
assistant_message = result["choices"][0]["message"]["content"]
return assistant_message
else:
return f"❌ Error: {response.status_code} - {response.text}"
except requests.exceptions.ConnectionError:
return "❌ Cannot connect to vLLM server. Please start the server first."
except Exception as e:
return f"❌ Error: {str(e)}"
# Custom CSS
custom_css = """
.gradio-container {
max-width: 1200px !important;
}
"""
# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Kimi 48B Fine-tuned") as demo:
gr.Markdown("""
    # 🚀 Kimi Linear 48B A3B - Fine-tuned Inference
High-performance inference using **vLLM** for the fine-tuned Kimi-Linear-48B-A3B-Instruct model.
**Model:** `optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune`
""")
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### πŸŽ›οΈ Server Control")
start_btn = gr.Button("πŸš€ Start vLLM Server", variant="primary", size="lg")
server_status = gr.Markdown("**Status:** Server not started")
view_logs_btn = gr.Button("πŸ“‹ View Server Logs", size="sm")
logs_display = gr.Markdown("", visible=False)
gr.Markdown("---")
gr.Markdown("### βš™οΈ Generation Settings")
system_prompt = gr.Textbox(
label="System Prompt (Optional)",
placeholder="You are a helpful AI assistant...",
lines=3,
value=""
)
max_tokens = gr.Slider(
minimum=50,
maximum=4096,
value=1024,
step=1,
label="Max Tokens"
)
temperature = gr.Slider(
minimum=0.0,
maximum=2.0,
value=0.7,
step=0.05,
label="Temperature"
)
top_p = gr.Slider(
minimum=0.0,
maximum=1.0,
value=0.9,
step=0.05,
label="Top P"
)
gr.Markdown("""
            ### 📖 Instructions
            1. **Start Server** - Click the button above (takes 5-10 min)
            2. **Wait for "✅"** - The server is ready when you see the green checkmark
3. **Start Chatting** - Type your message below
**Note:** First message may be slow as the model loads into memory.
""")
with gr.Column(scale=2):
gr.Markdown("### πŸ’¬ Chat")
chatbot = gr.Chatbot(
height=500,
show_copy_button=True
)
with gr.Row():
msg = gr.Textbox(
label="Your Message",
placeholder="Type your message here...",
lines=2,
scale=4
)
send_btn = gr.Button("πŸ“€ Send", variant="primary", scale=1)
with gr.Row():
clear_btn = gr.Button("πŸ—‘οΈ Clear Chat")
# Event handlers
start_btn.click(
fn=start_vllm_server,
outputs=server_status
)
def show_logs():
return {logs_display: gr.update(value=view_logs(), visible=True)}
view_logs_btn.click(
fn=show_logs,
outputs=logs_display
)
def user_message(user_msg, history):
return "", history + [[user_msg, None]]
def bot_response(history, system_prompt, max_tokens, temperature, top_p):
if not history or history[-1][1] is not None:
return history
user_msg = history[-1][0]
bot_msg = chat(user_msg, history[:-1], system_prompt, max_tokens, temperature, top_p)
history[-1][1] = bot_msg
return history
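    # The chains below use a two-step pattern: user_message appends the user's
    # turn immediately (queue=False keeps the textbox responsive), then
    # bot_response fills in the assistant reply once the vLLM call returns.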
msg.submit(
user_message,
[msg, chatbot],
[msg, chatbot],
queue=False
).then(
bot_response,
[chatbot, system_prompt, max_tokens, temperature, top_p],
chatbot
)
send_btn.click(
user_message,
[msg, chatbot],
[msg, chatbot],
queue=False
).then(
bot_response,
[chatbot, system_prompt, max_tokens, temperature, top_p],
chatbot
)
clear_btn.click(lambda: None, None, chatbot, queue=False)
gr.Markdown("""
---
**Powered by vLLM** - High-performance LLM inference engine
**Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
""")
# Cleanup on exit
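# Killing the whole process group (not just the parent) matters here: vLLM with
# tensor parallelism spawns worker processes, and preexec_fn=os.setsid placed
# the server and its workers in their own group, so one SIGTERM reaps them all.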
def cleanup():
global VLLM_PROCESS
if VLLM_PROCESS:
try:
if sys.platform == 'win32':
VLLM_PROCESS.terminate()
else:
os.killpg(os.getpgid(VLLM_PROCESS.pid), signal.SIGTERM)
        except Exception:
            pass
import atexit
atexit.register(cleanup)
if __name__ == "__main__":
demo.queue()
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=True,
show_error=True
)