import gradio as gr
import requests
import json
import subprocess
import time
import os
import signal
import sys
import atexit

# Model configuration
MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"
VLLM_PORT = 8000
VLLM_PROCESS = None

def start_vllm_server():
    """Start the vLLM server in the background, yielding status updates."""
    global VLLM_PROCESS
    
    if VLLM_PROCESS is not None:
        yield "✅ vLLM server already running"
        return
    
    try:
        # Start vLLM server with tensor parallelism for multi-GPU
        cmd = [
            "python3", "-m", "vllm.entrypoints.openai.api_server",
            "--model", MODEL_NAME,
            "--host", "0.0.0.0",
            "--port", str(VLLM_PORT),
            "--dtype", "bfloat16",
            "--trust-remote-code",
            "--tensor-parallel-size", "4",  # Use all 4 GPUs
            "--max-model-len", "8192",  # Limit context to save memory
        ]
        
        log_file = open("/tmp/vllm.log", "w")
        VLLM_PROCESS = subprocess.Popen(
            cmd,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            # Run in a new session/process group so cleanup() can kill vLLM and its workers together
            preexec_fn=os.setsid if sys.platform != 'win32' else None
        )
        
        status_msg = "πŸ”„ **vLLM server starting...**\n\n"
        status_msg += "This takes 5-10 minutes for the 48B model.\n\n"
        status_msg += "**Progress:**\n"
        status_msg += "1. Downloading model (if not cached)\n"
        status_msg += "2. Loading weights across 4 GPUs\n"
        status_msg += "3. Initializing inference engine\n\n"
        status_msg += "**Status:** Initializing...\n\n"
        status_msg += "_Check logs at /tmp/vllm.log for details_"
        
        # Wait longer for big model - up to 10 minutes
        max_retries = 300  # 300 * 2 seconds = 10 minutes
        for i in range(max_retries):
            try:
                response = requests.get(f"http://localhost:{VLLM_PORT}/health", timeout=2)
                if response.status_code == 200:
                    yield "✅ **vLLM server started successfully!**\n\nYou can now start chatting below."
                    return
            except requests.exceptions.RequestException:
                pass
            
            # Check if process died
            if VLLM_PROCESS.poll() is not None:
                # Process ended
                with open("/tmp/vllm.log", "r") as f:
                    last_lines = f.readlines()[-20:]
                error_msg = "❌ **vLLM server crashed during startup**\n\n"
                error_msg += "**Last log lines:**\n```\n"
                error_msg += "".join(last_lines)
                error_msg += "\n```"
                return error_msg
            
            time.sleep(2)
        
        # Timed out, but the process is still alive and may still be initializing
        yield "⚠️ **vLLM server started but is taking longer than expected**\n\nThe server may still be initializing. Wait a few more minutes and try sending a message."
        
    except Exception as e:
        yield f"❌ **Failed to start vLLM server:**\n\n{str(e)}"

def view_logs():
    """View vLLM server logs"""
    try:
        if not os.path.exists("/tmp/vllm.log"):
            return "πŸ“ No logs yet. Start the server first."
        
        with open("/tmp/vllm.log", "r") as f:
            lines = f.readlines()
            last_lines = lines[-50:]  # Last 50 lines
        
        log_text = "πŸ“‹ **vLLM Server Logs (Last 50 lines)**\n\n```\n"
        log_text += "".join(last_lines)
        log_text += "\n```"
        return log_text
    except Exception as e:
        return f"❌ Error reading logs: {str(e)}"

def chat(message, history, system_prompt, max_tokens, temperature, top_p):
    """Send chat message to vLLM server"""
    try:
        # Build messages
        messages = []
        
        if system_prompt.strip():
            messages.append({"role": "system", "content": system_prompt.strip()})
        
        # Add history
        for human, assistant in history:
            messages.append({"role": "user", "content": human})
            if assistant:
                messages.append({"role": "assistant", "content": assistant})
        
        # Add current message
        messages.append({"role": "user", "content": message})
        
        # Call vLLM API
        response = requests.post(
            f"http://localhost:{VLLM_PORT}/v1/chat/completions",
            headers={"Content-Type": "application/json"},
            json={
                "model": MODEL_NAME,
                "messages": messages,
                "max_tokens": max_tokens,
                "temperature": temperature,
                "top_p": top_p,
                "stream": False
            },
            timeout=300
        )
        
        if response.status_code == 200:
            result = response.json()
            assistant_message = result["choices"][0]["message"]["content"]
            return assistant_message
        else:
            return f"❌ Error: {response.status_code} - {response.text}"
            
    except requests.exceptions.ConnectionError:
        return "❌ Cannot connect to vLLM server. Please start the server first."
    except Exception as e:
        return f"❌ Error: {str(e)}"

# Custom CSS
custom_css = """
.gradio-container {
    max-width: 1200px !important;
}
"""

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Kimi 48B Fine-tuned") as demo:
    gr.Markdown("""
    # 🚀 Kimi Linear 48B A3B - Fine-tuned Inference
    
    High-performance inference using **vLLM** for the fine-tuned Kimi-Linear-48B-A3B-Instruct model.
    
    **Model:** `optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune`
    """)
    
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### πŸŽ›οΈ Server Control")
            start_btn = gr.Button("πŸš€ Start vLLM Server", variant="primary", size="lg")
            server_status = gr.Markdown("**Status:** Server not started")
            view_logs_btn = gr.Button("πŸ“‹ View Server Logs", size="sm")
            logs_display = gr.Markdown("", visible=False)
            
            gr.Markdown("---")
            gr.Markdown("### βš™οΈ Generation Settings")
            
            system_prompt = gr.Textbox(
                label="System Prompt (Optional)",
                placeholder="You are a helpful AI assistant...",
                lines=3,
                value=""
            )
            
            max_tokens = gr.Slider(
                minimum=50,
                maximum=4096,
                value=1024,
                step=1,
                label="Max Tokens"
            )
            
            temperature = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.7,
                step=0.05,
                label="Temperature"
            )
            
            top_p = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top P"
            )
            
            gr.Markdown("""
            ### πŸ“– Instructions
            
            1. **Start Server** - Click the button above (takes 2-5 min)
            2. **Wait for "βœ…"** - Server is ready when you see green checkmark
            3. **Start Chatting** - Type your message below
            
            **Note:** First message may be slow as the model loads into memory.
            """)
        
        with gr.Column(scale=2):
            gr.Markdown("### πŸ’¬ Chat")
            
            # Tuple-format history: a list of [user, assistant] pairs
            chatbot = gr.Chatbot(
                height=500,
                show_copy_button=True
            )
            
            with gr.Row():
                msg = gr.Textbox(
                    label="Your Message",
                    placeholder="Type your message here...",
                    lines=2,
                    scale=4
                )
                send_btn = gr.Button("πŸ“€ Send", variant="primary", scale=1)
            
            with gr.Row():
                clear_btn = gr.Button("πŸ—‘οΈ Clear Chat")
    
    # Event handlers
    start_btn.click(
        fn=start_vllm_server,
        outputs=server_status
    )
    
    def show_logs():
        return {logs_display: gr.update(value=view_logs(), visible=True)}
    
    view_logs_btn.click(
        fn=show_logs,
        outputs=logs_display
    )
    
    def user_message(user_msg, history):
        """Append the user's turn to the history and clear the textbox."""
        return "", history + [[user_msg, None]]
    
    def bot_response(history, system_prompt, max_tokens, temperature, top_p):
        """Fill in the assistant's reply for the most recent user turn."""
        if not history or history[-1][1] is not None:
            return history
        
        user_msg = history[-1][0]
        bot_msg = chat(user_msg, history[:-1], system_prompt, max_tokens, temperature, top_p)
        history[-1][1] = bot_msg
        return history
    
    msg.submit(
        user_message,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_response,
        [chatbot, system_prompt, max_tokens, temperature, top_p],
        chatbot
    )
    
    send_btn.click(
        user_message,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_response,
        [chatbot, system_prompt, max_tokens, temperature, top_p],
        chatbot
    )
    
    clear_btn.click(lambda: None, None, chatbot, queue=False)
    
    gr.Markdown("""
    ---
    
    **Powered by vLLM** - High-performance LLM inference engine
    
    **Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
    """)

# Cleanup on exit
def cleanup():
    global VLLM_PROCESS
    if VLLM_PROCESS:
        try:
            if sys.platform == 'win32':
                VLLM_PROCESS.terminate()
            else:
                os.killpg(os.getpgid(VLLM_PROCESS.pid), signal.SIGTERM)
        except Exception:
            # Best-effort shutdown; the process may already be gone
            pass

atexit.register(cleanup)

if __name__ == "__main__":
    demo.queue()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        show_error=True
    )
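
# For reference, once the server reports healthy it can also be queried
# directly, without the Gradio UI, using the same payload chat() sends:
#
#   curl http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune",
#          "messages": [{"role": "user", "content": "Hello!"}]}'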