aeb56 committed
Commit a82de92 · Parent(s): 75c2813

Improve vLLM startup with tensor parallelism, better logging, and 10min timeout

Files changed (1): app.py (+64 -11)
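The hunks below reference names defined earlier in app.py, outside the diff context. A minimal sketch of that assumed module-level setup follows; the identifiers appear in the diff, but the concrete values are placeholders, not the repository's actual configuration.

```python
# Assumed context for the hunks below -- values are illustrative placeholders.
import os
import subprocess
import sys
import time

import gradio as gr
import requests

MODEL_NAME = "org/kimi-48b-finetuned"  # placeholder; real value defined elsewhere in app.py
VLLM_PORT = 8000                       # placeholder; real value defined elsewhere in app.py
VLLM_PROCESS = None                    # module-level handle to the spawned vLLM server process
```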
app.py CHANGED

```diff
@@ -20,7 +20,7 @@ def start_vllm_server():
         return "✅ vLLM server already running"
 
     try:
-        # Start vLLM server
+        # Start vLLM server with tensor parallelism for multi-GPU
         cmd = [
             "python3", "-m", "vllm.entrypoints.openai.api_server",
             "--model", MODEL_NAME,
@@ -28,29 +28,72 @@ def start_vllm_server():
             "--port", str(VLLM_PORT),
             "--dtype", "bfloat16",
             "--trust-remote-code",
+            "--tensor-parallel-size", "4",  # Use all 4 GPUs
+            "--max-model-len", "8192",  # Limit context to save memory
         ]
 
+        log_file = open("/tmp/vllm.log", "w")
         VLLM_PROCESS = subprocess.Popen(
             cmd,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            stdout=log_file,
+            stderr=subprocess.STDOUT,
             preexec_fn=os.setsid if sys.platform != 'win32' else None
         )
 
-        # Wait for server to start
-        max_retries = 60
+        status_msg = "🔄 **vLLM server starting...**\n\n"
+        status_msg += "This takes 5-10 minutes for the 48B model.\n\n"
+        status_msg += "**Progress:**\n"
+        status_msg += "1. Downloading model (if not cached)\n"
+        status_msg += "2. Loading weights across 4 GPUs\n"
+        status_msg += "3. Initializing inference engine\n\n"
+        status_msg += "**Status:** Initializing...\n\n"
+        status_msg += "_Check logs at /tmp/vllm.log for details_"
+
+        # Wait longer for big model - up to 10 minutes
+        max_retries = 300  # 300 * 2 seconds = 10 minutes
         for i in range(max_retries):
             try:
-                response = requests.get(f"http://localhost:{VLLM_PORT}/health", timeout=1)
+                response = requests.get(f"http://localhost:{VLLM_PORT}/health", timeout=2)
                 if response.status_code == 200:
-                    return "✅ vLLM server started successfully!"
-            except:
-                time.sleep(2)
+                    return "✅ **vLLM server started successfully!**\n\nYou can now start chatting below."
+            except requests.exceptions.RequestException:
+                pass
+
+            # Check if process died
+            if VLLM_PROCESS.poll() is not None:
+                # Process ended
+                with open("/tmp/vllm.log", "r") as f:
+                    last_lines = f.readlines()[-20:]
+                error_msg = "❌ **vLLM server crashed during startup**\n\n"
+                error_msg += "**Last log lines:**\n```\n"
+                error_msg += "".join(last_lines)
+                error_msg += "\n```"
+                return error_msg
+
+            time.sleep(2)
 
-        return "⚠️ vLLM server started but health check failed"
+        # Timeout but process still running
+        return "⚠️ **vLLM server started but taking longer than expected**\n\nThe server may still be initializing. Wait a few more minutes and try sending a message."
 
     except Exception as e:
-        return f"❌ Failed to start vLLM server: {str(e)}"
+        return f"❌ **Failed to start vLLM server:**\n\n{str(e)}"
+
+def view_logs():
+    """View vLLM server logs"""
+    try:
+        if not os.path.exists("/tmp/vllm.log"):
+            return "📝 No logs yet. Start the server first."
+
+        with open("/tmp/vllm.log", "r") as f:
+            lines = f.readlines()
+            last_lines = lines[-50:]  # Last 50 lines
+
+        log_text = "📋 **vLLM Server Logs (Last 50 lines)**\n\n```\n"
+        log_text += "".join(last_lines)
+        log_text += "\n```"
+        return log_text
+    except Exception as e:
+        return f"❌ Error reading logs: {str(e)}"
 
 def chat(message, history, system_prompt, max_tokens, temperature, top_p):
     """Send chat message to vLLM server"""
```
 
```diff
@@ -119,6 +162,8 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Kimi 48B Fine-tune
         gr.Markdown("### 🎛️ Server Control")
         start_btn = gr.Button("🚀 Start vLLM Server", variant="primary", size="lg")
         server_status = gr.Markdown("**Status:** Server not started")
+        view_logs_btn = gr.Button("📋 View Server Logs", size="sm")
+        logs_display = gr.Markdown("", visible=False)
 
         gr.Markdown("---")
         gr.Markdown("### ⚙️ Generation Settings")
```
 
```diff
@@ -190,6 +235,14 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Kimi 48B Fine-tune
         outputs=server_status
     )
 
+    def show_logs():
+        return {logs_display: gr.update(value=view_logs(), visible=True)}
+
+    view_logs_btn.click(
+        fn=show_logs,
+        outputs=logs_display
+    )
+
     def user_message(user_msg, history):
         return "", history + [[user_msg, None]]
```
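Once the start button reports success, the server can also be probed directly. A quick smoke test, assuming the placeholder port matches app.py's VLLM_PORT: /health returns 200 once the engine is ready, and /v1/models lists what the server is serving (both are standard endpoints of vLLM's OpenAI-compatible server).

```python
# Smoke test for the running vLLM server -- a sketch; the port is a
# placeholder and must match the value app.py actually uses.
import requests

VLLM_PORT = 8000  # placeholder

health = requests.get(f"http://localhost:{VLLM_PORT}/health", timeout=2)
print("health:", health.status_code)  # 200 once the engine is ready

models = requests.get(f"http://localhost:{VLLM_PORT}/v1/models", timeout=2)
print("served:", [m["id"] for m in models.json()["data"]])
```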