aeb56 committed · a82de92
1 Parent(s): 75c2813

Improve vLLM startup with tensor parallelism, better logging, and 10min timeout
app.py CHANGED
@@ -20,7 +20,7 @@ def start_vllm_server():
         return "✅ vLLM server already running"

     try:
-        # Start vLLM server
+        # Start vLLM server with tensor parallelism for multi-GPU
         cmd = [
             "python3", "-m", "vllm.entrypoints.openai.api_server",
             "--model", MODEL_NAME,
@@ -28,29 +28,72 @@ def start_vllm_server():
             "--port", str(VLLM_PORT),
             "--dtype", "bfloat16",
             "--trust-remote-code",
+            "--tensor-parallel-size", "4",  # Use all 4 GPUs
+            "--max-model-len", "8192",  # Limit context to save memory
         ]

+        log_file = open("/tmp/vllm.log", "w")
         VLLM_PROCESS = subprocess.Popen(
             cmd,
-            stdout=…,
-            stderr=subprocess.…,
+            stdout=log_file,
+            stderr=subprocess.STDOUT,
             preexec_fn=os.setsid if sys.platform != 'win32' else None
         )

-        …
-        max_retries = …
+        status_msg = "🚀 **vLLM server starting...**\n\n"
+        status_msg += "This takes 5-10 minutes for the 48B model.\n\n"
+        status_msg += "**Progress:**\n"
+        status_msg += "1. Downloading model (if not cached)\n"
+        status_msg += "2. Loading weights across 4 GPUs\n"
+        status_msg += "3. Initializing inference engine\n\n"
+        status_msg += "**Status:** Initializing...\n\n"
+        status_msg += "_Check logs at /tmp/vllm.log for details_"
+
+        # Wait longer for big model - up to 10 minutes
+        max_retries = 300  # 300 * 2 seconds = 10 minutes
         for i in range(max_retries):
             try:
-                response = requests.get(f"http://localhost:{VLLM_PORT}/health", timeout=…)
+                response = requests.get(f"http://localhost:{VLLM_PORT}/health", timeout=2)
                 if response.status_code == 200:
-                    return "✅ vLLM server started successfully"
+                    return "✅ **vLLM server started successfully!**\n\nYou can now start chatting below."
-            except:
-                …
+            except requests.exceptions.RequestException:
+                pass
+
+            # Check if process died
+            if VLLM_PROCESS.poll() is not None:
+                # Process ended
+                with open("/tmp/vllm.log", "r") as f:
+                    last_lines = f.readlines()[-20:]
+                error_msg = "❌ **vLLM server crashed during startup**\n\n"
+                error_msg += "**Last log lines:**\n```\n"
+                error_msg += "".join(last_lines)
+                error_msg += "\n```"
+                return error_msg
+
+            time.sleep(2)

-            …
+        # Timeout but process still running
+        return "⚠️ **vLLM server started but taking longer than expected**\n\nThe server may still be initializing. Wait a few more minutes and try sending a message."

     except Exception as e:
-        return f"❌ Failed to start vLLM server…"
+        return f"❌ **Failed to start vLLM server:**\n\n{str(e)}"
+
+def view_logs():
+    """View vLLM server logs"""
+    try:
+        if not os.path.exists("/tmp/vllm.log"):
+            return "📋 No logs yet. Start the server first."
+
+        with open("/tmp/vllm.log", "r") as f:
+            lines = f.readlines()
+            last_lines = lines[-50:]  # Last 50 lines
+
+        log_text = "📋 **vLLM Server Logs (Last 50 lines)**\n\n```\n"
+        log_text += "".join(last_lines)
+        log_text += "\n```"
+        return log_text
+    except Exception as e:
+        return f"❌ Error reading logs: {str(e)}"

 def chat(message, history, system_prompt, max_tokens, temperature, top_p):
     """Send chat message to vLLM server"""
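The readiness loop this hunk adds reduces to a reusable pattern: poll a health endpoint, bail out early if the child process has died, and give up after a fixed deadline. A minimal standalone sketch of that pattern (the `wait_for_health` name and defaults are illustrative, not part of app.py):

```python
import subprocess
import time

import requests


def wait_for_health(proc: subprocess.Popen, url: str,
                    retries: int = 300, interval: float = 2.0) -> bool:
    """Poll `url` until it returns 200, the process exits, or we time out."""
    for _ in range(retries):
        try:
            if requests.get(url, timeout=2).status_code == 200:
                return True  # server is up
        except requests.exceptions.RequestException:
            pass  # not accepting connections yet; keep waiting
        # A dead child will never become healthy, so fail fast instead
        # of burning the whole 10-minute budget.
        if proc.poll() is not None:
            raise RuntimeError(f"server exited with code {proc.returncode}")
        time.sleep(interval)
    return False  # deadline reached; the server may still be loading
```

With `retries=300` and `interval=2.0` this matches the 10-minute budget in the commit message. Separately, the `preexec_fn=os.setsid` argument kept in the `Popen` call puts the server in its own process group, which is what makes a clean shutdown of vLLM and its spawned workers possible later. The commit adds no stop button, but a stop helper built on that process group could look roughly like this (hypothetical, not in the diff):

```python
import os
import signal
import subprocess


def stop_server(proc: subprocess.Popen) -> None:
    """Terminate a server started with preexec_fn=os.setsid (POSIX only)."""
    if proc.poll() is not None:
        return  # already exited
    # killpg signals the whole process group, reaching vLLM's worker
    # processes, which proc.terminate() alone would miss.
    os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
    try:
        proc.wait(timeout=10)
    except subprocess.TimeoutExpired:
        os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
```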
@@ -119,6 +162,8 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Kimi 48B Fine-tune
             gr.Markdown("### 🎛️ Server Control")
             start_btn = gr.Button("🚀 Start vLLM Server", variant="primary", size="lg")
             server_status = gr.Markdown("**Status:** Server not started")
+            view_logs_btn = gr.Button("📋 View Server Logs", size="sm")
+            logs_display = gr.Markdown("", visible=False)

             gr.Markdown("---")
             gr.Markdown("### ⚙️ Generation Settings")
@@ -190,6 +235,14 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Kimi 48B Fine-tune
         outputs=server_status
     )

+    def show_logs():
+        return {logs_display: gr.update(value=view_logs(), visible=True)}
+
+    view_logs_btn.click(
+        fn=show_logs,
+        outputs=logs_display
+    )
+
     def user_message(user_msg, history):
         return "", history + [[user_msg, None]]

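The two UI hunks follow Gradio's hidden-component pattern: `logs_display` is created with `visible=False`, and the click handler both fills it and reveals it via `gr.update`. A condensed, self-contained sketch of the same wiring (the log path and helper mirror the diff, but this is an illustration, not the app itself):

```python
import os

import gradio as gr

LOG_PATH = "/tmp/vllm.log"  # same path the commit writes server output to


def read_log_tail() -> str:
    """Return the last 50 log lines, like view_logs() in the diff."""
    if not os.path.exists(LOG_PATH):
        return "No logs yet. Start the server first."
    with open(LOG_PATH) as f:
        return "".join(f.readlines()[-50:])


with gr.Blocks() as demo:
    view_btn = gr.Button("View Server Logs", size="sm")
    logs_md = gr.Markdown("", visible=False)  # hidden until first click

    # gr.update both sets the text and flips the component visible.
    view_btn.click(
        fn=lambda: gr.update(value=read_log_tail(), visible=True),
        outputs=logs_md,
    )

demo.launch()
```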