aeb56 committed
Commit a82de92 · Parent(s): 75c2813

Improve vLLM startup with tensor parallelism, better logging, and 10min timeout

Files changed (1): app.py (+64 -11)
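The hunks below reference names defined earlier in app.py, outside the diff context. A minimal sketch of that assumed module-level setup follows; the identifiers appear in the diff, but the concrete values are placeholders, not the repository's actual configuration.

```python
# Assumed context for the hunks below -- values are illustrative placeholders.
import os
import subprocess
import sys
import time

import gradio as gr
import requests

MODEL_NAME = "org/kimi-48b-finetuned"  # placeholder; real value defined elsewhere in app.py
VLLM_PORT = 8000                       # placeholder; real value defined elsewhere in app.py
VLLM_PROCESS = None                    # module-level handle to the spawned vLLM server process
```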
app.py CHANGED

```diff
@@ -20,7 +20,7 @@ def start_vllm_server():
         return "✅ vLLM server already running"
 
     try:
-        # Start vLLM server
+        # Start vLLM server with tensor parallelism for multi-GPU
         cmd = [
             "python3", "-m", "vllm.entrypoints.openai.api_server",
             "--model", MODEL_NAME,
@@ -28,29 +28,72 @@ def start_vllm_server():
             "--port", str(VLLM_PORT),
             "--dtype", "bfloat16",
             "--trust-remote-code",
+            "--tensor-parallel-size", "4",  # Use all 4 GPUs
+            "--max-model-len", "8192",  # Limit context to save memory
         ]
 
+        log_file = open("/tmp/vllm.log", "w")
         VLLM_PROCESS = subprocess.Popen(
             cmd,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            stdout=log_file,
+            stderr=subprocess.STDOUT,
             preexec_fn=os.setsid if sys.platform != 'win32' else None
         )
 
-        # Wait for server to start
-        max_retries = 60
+        status_msg = "🔄 **vLLM server starting...**\n\n"
+        status_msg += "This takes 5-10 minutes for the 48B model.\n\n"
+        status_msg += "**Progress:**\n"
+        status_msg += "1. Downloading model (if not cached)\n"
+        status_msg += "2. Loading weights across 4 GPUs\n"
+        status_msg += "3. Initializing inference engine\n\n"
+        status_msg += "**Status:** Initializing...\n\n"
+        status_msg += "_Check logs at /tmp/vllm.log for details_"
+
+        # Wait longer for big model - up to 10 minutes
+        max_retries = 300  # 300 * 2 seconds = 10 minutes
         for i in range(max_retries):
             try:
-                response = requests.get(f"http://localhost:{VLLM_PORT}/health", timeout=1)
+                response = requests.get(f"http://localhost:{VLLM_PORT}/health", timeout=2)
                 if response.status_code == 200:
-                    return "✅ vLLM server started successfully!"
-            except:
-                time.sleep(2)
+                    return "✅ **vLLM server started successfully!**\n\nYou can now start chatting below."
+            except requests.exceptions.RequestException:
+                pass
+
+            # Check if process died
+            if VLLM_PROCESS.poll() is not None:
+                # Process ended
+                with open("/tmp/vllm.log", "r") as f:
+                    last_lines = f.readlines()[-20:]
+                error_msg = "❌ **vLLM server crashed during startup**\n\n"
+                error_msg += "**Last log lines:**\n```\n"
+                error_msg += "".join(last_lines)
+                error_msg += "\n```"
+                return error_msg
+
+            time.sleep(2)
 
-        return "⚠️ vLLM server started but health check failed"
+        # Timeout but process still running
+        return "⚠️ **vLLM server started but taking longer than expected**\n\nThe server may still be initializing. Wait a few more minutes and try sending a message."
 
     except Exception as e:
-        return f"❌ Failed to start vLLM server: {str(e)}"
+        return f"❌ **Failed to start vLLM server:**\n\n{str(e)}"
+
+def view_logs():
+    """View vLLM server logs"""
+    try:
+        if not os.path.exists("/tmp/vllm.log"):
+            return "📝 No logs yet. Start the server first."
+
+        with open("/tmp/vllm.log", "r") as f:
+            lines = f.readlines()
+            last_lines = lines[-50:]  # Last 50 lines
+
+        log_text = "📋 **vLLM Server Logs (Last 50 lines)**\n\n```\n"
+        log_text += "".join(last_lines)
+        log_text += "\n```"
+        return log_text
+    except Exception as e:
+        return f"❌ Error reading logs: {str(e)}"
 
 def chat(message, history, system_prompt, max_tokens, temperature, top_p):
     """Send chat message to vLLM server"""
```
 
```diff
@@ -119,6 +162,8 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Kimi 48B Fine-tune
         gr.Markdown("### 🎛️ Server Control")
         start_btn = gr.Button("🚀 Start vLLM Server", variant="primary", size="lg")
         server_status = gr.Markdown("**Status:** Server not started")
+        view_logs_btn = gr.Button("📋 View Server Logs", size="sm")
+        logs_display = gr.Markdown("", visible=False)
 
         gr.Markdown("---")
         gr.Markdown("### ⚙️ Generation Settings")
```
 
```diff
@@ -190,6 +235,14 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Kimi 48B Fine-tune
         outputs=server_status
     )
 
+    def show_logs():
+        return {logs_display: gr.update(value=view_logs(), visible=True)}
+
+    view_logs_btn.click(
+        fn=show_logs,
+        outputs=logs_display
+    )
+
     def user_message(user_msg, history):
         return "", history + [[user_msg, None]]
```
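Once the start button reports success, the server can also be probed directly. A quick smoke test, assuming the placeholder port matches app.py's VLLM_PORT: /health returns 200 once the engine is ready, and /v1/models lists what the server is serving (both are standard endpoints of vLLM's OpenAI-compatible server).

```python
# Smoke test for the running vLLM server -- a sketch; the port is a
# placeholder and must match the value app.py actually uses.
import requests

VLLM_PORT = 8000  # placeholder

health = requests.get(f"http://localhost:{VLLM_PORT}/health", timeout=2)
print("health:", health.status_code)  # 200 once the engine is ready

models = requests.get(f"http://localhost:{VLLM_PORT}/v1/models", timeout=2)
print("served:", [m["id"] for m in models.json()["data"]])
```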