Spaces:

optiviseapp
/

fnmodel

Paused

File size: 7,760 Bytes

import gradio as gr
import requests
import json
import subprocess
import time
import os
import signal
import sys

# Model configuration
# Hugging Face repo id: passed to vLLM via --model and sent as the "model"
# field of every chat-completion request.
MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"
# Local TCP port the vLLM API server is told to listen on (--port) and that
# the health check and chat requests connect to.
VLLM_PORT = 8000
# subprocess.Popen handle of the vLLM server child process; stays None until
# start_vllm_server() launches it.
VLLM_PROCESS = None

def start_vllm_server():
    """Launch the vLLM OpenAI-compatible API server and wait for it to be healthy.

    The server is started as a child process in its own session (POSIX) so
    that the whole process group can be signalled on shutdown. The function
    then polls the server's ``/health`` endpoint until it answers 200, the
    child exits, or the retry budget is exhausted.

    Returns:
        A human-readable status string for the UI. This function never
        raises; failures are reported through the returned text.
    """
    global VLLM_PROCESS

    if VLLM_PROCESS is not None:
        # poll() is None while the child is still alive. A child that has
        # exited must not block a restart, so forget the stale handle.
        if VLLM_PROCESS.poll() is None:
            return "✅ vLLM server already running"
        VLLM_PROCESS = None

    try:
        cmd = [
            # Use the interpreter running this app so vLLM is resolved from
            # the same environment; fall back to PATH lookup if unknown.
            sys.executable or "python", "-m", "vllm.entrypoints.openai.api_server",
            "--model", MODEL_NAME,
            "--host", "0.0.0.0",
            "--port", str(VLLM_PORT),
            "--dtype", "bfloat16",
            "--trust-remote-code",
        ]

        # stdout/stderr are inherited rather than piped: nothing in this app
        # reads the pipes, and an unread pipe eventually fills up and blocks
        # the child. start_new_session performs setsid() in the child without
        # the thread-safety caveats of preexec_fn.
        VLLM_PROCESS = subprocess.Popen(
            cmd,
            start_new_session=sys.platform != "win32",
        )

        health_url = f"http://localhost:{VLLM_PORT}/health"
        max_retries = 60
        for _ in range(max_retries):
            # Stop waiting as soon as the server process dies during startup.
            exit_code = VLLM_PROCESS.poll()
            if exit_code is not None:
                VLLM_PROCESS = None
                return (
                    "❌ Failed to start vLLM server: "
                    f"process exited with code {exit_code}"
                )

            try:
                response = requests.get(health_url, timeout=1)
            except requests.exceptions.RequestException:
                # Server is not accepting connections yet; retry below.
                pass
            else:
                if response.status_code == 200:
                    return "✅ vLLM server started successfully!"

            # Pause between attempts whether the request failed or the
            # server answered with a non-200 status.
            time.sleep(2)

        return "⚠️ vLLM server started but health check failed"

    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"❌ Failed to start vLLM server: {str(e)}"

def chat(message, history, system_prompt, max_tokens, temperature, top_p):
    """Send one chat turn to the local vLLM server and return the reply text.

    Args:
        message: The new user message.
        history: Earlier turns as ``(user, assistant)`` pairs. A pair whose
            assistant part is empty contributes only its user message.
            ``None`` is treated as no history.
        system_prompt: Optional system prompt. ``None`` or a blank string
            is omitted from the request.
        max_tokens: Maximum number of tokens to generate (sent as an int).
        temperature: Sampling temperature (sent as a float).
        top_p: Nucleus-sampling probability mass (sent as a float).

    Returns:
        The assistant's reply, or a string starting with "❌" that describes
        the failure. This function never raises.
    """
    try:
        messages = []

        # Tolerate a missing prompt instead of failing on None.strip().
        system_text = (system_prompt or "").strip()
        if system_text:
            messages.append({"role": "system", "content": system_text})

        # Replay earlier turns in order.
        for human, assistant in history or []:
            messages.append({"role": "user", "content": human})
            if assistant:
                messages.append({"role": "assistant", "content": assistant})

        # The current message always goes last.
        messages.append({"role": "user", "content": message})

        response = requests.post(
            f"http://localhost:{VLLM_PORT}/v1/chat/completions",
            headers={"Content-Type": "application/json"},
            json={
                "model": MODEL_NAME,
                "messages": messages,
                # UI sliders may deliver numbers as floats or ints; send the
                # types the completion API expects.
                "max_tokens": int(max_tokens),
                "temperature": float(temperature),
                "top_p": float(top_p),
                "stream": False,
            },
            timeout=300,
        )

        if response.status_code != 200:
            return f"❌ Error: {response.status_code} - {response.text}"

        result = response.json()
        assistant_message = result["choices"][0]["message"]["content"]
        # A null content field would otherwise leave the turn looking
        # unanswered; return an empty reply instead.
        return assistant_message if assistant_message is not None else ""

    except requests.exceptions.ConnectionError:
        return "❌ Cannot connect to vLLM server. Please start the server first."
    except requests.exceptions.Timeout:
        return "❌ Error: request to vLLM server timed out."
    except Exception as e:
        # UI boundary: surface any other failure as text.
        return f"❌ Error: {str(e)}"

# Custom CSS
# Passed to gr.Blocks(css=...) below; limits elements with the
# "gradio-container" class to a maximum width of 1200px.
custom_css = """
.gradio-container {
    max-width: 1200px !important;
}
"""

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Kimi 48B Fine-tuned") as demo:
    # Page header.
    gr.Markdown("""
    # 🚀 Kimi Linear 48B A3B - Fine-tuned Inference
    
    High-performance inference using **vLLM** for the fine-tuned Kimi-Linear-48B-A3B-Instruct model.
    
    **Model:** `optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune`
    """)

    with gr.Row():
        # Left column: server control and generation settings.
        with gr.Column(scale=1):
            gr.Markdown("### 🎛️ Server Control")
            start_btn = gr.Button("🚀 Start vLLM Server", variant="primary", size="lg")
            server_status = gr.Markdown("**Status:** Server not started")

            gr.Markdown("---")
            gr.Markdown("### ⚙️ Generation Settings")

            system_prompt = gr.Textbox(
                label="System Prompt (Optional)",
                placeholder="You are a helpful AI assistant...",
                lines=3,
                value=""
            )

            max_tokens = gr.Slider(
                minimum=50,
                maximum=4096,
                value=1024,
                step=1,
                label="Max Tokens"
            )

            temperature = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.7,
                step=0.05,
                label="Temperature"
            )

            # vLLM only accepts top_p in (0, 1], so the slider starts one
            # step above zero instead of offering a value the server rejects.
            top_p = gr.Slider(
                minimum=0.05,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top P"
            )

            gr.Markdown("""
            ### 📖 Instructions
            
            1. **Start Server** - Click the button above (takes 2-5 min)
            2. **Wait for "✅"** - Server is ready when you see green checkmark
            3. **Start Chatting** - Type your message below
            
            **Note:** First message may be slow as the model loads into memory.
            """)

        # Right column: the conversation itself.
        with gr.Column(scale=2):
            gr.Markdown("### 💬 Chat")

            # avatar_images is intentionally not set: it takes image file
            # paths or URLs, and the emoji strings used previously are
            # neither.
            chatbot = gr.Chatbot(
                height=500,
                show_copy_button=True,
            )

            with gr.Row():
                msg = gr.Textbox(
                    label="Your Message",
                    placeholder="Type your message here...",
                    lines=2,
                    scale=4
                )
                send_btn = gr.Button("📤 Send", variant="primary", scale=1)

            with gr.Row():
                clear_btn = gr.Button("🗑️ Clear Chat")

    # Event handlers
    start_btn.click(
        fn=start_vllm_server,
        outputs=server_status
    )

    def user_message(user_msg, history):
        """Clear the input box and append the user's turn to the history.

        The new turn is stored as ``[user_msg, None]``; the ``None`` marks it
        as awaiting a reply. Blank input leaves the history unchanged so no
        empty message is sent to the model.
        """
        history = history or []
        if not user_msg or not user_msg.strip():
            return "", history
        return "", history + [[user_msg, None]]

    def bot_response(history, system_prompt, max_tokens, temperature, top_p):
        """Fill in the reply for the last turn if it is still unanswered.

        Returns the history unchanged when it is empty or the last turn
        already has a reply; otherwise asks chat() for a reply, using all
        earlier turns as context, and stores it in the last turn.
        """
        if not history or history[-1][1] is not None:
            return history

        user_msg = history[-1][0]
        bot_msg = chat(user_msg, history[:-1], system_prompt, max_tokens, temperature, top_p)
        history[-1][1] = bot_msg
        return history

    # Pressing Enter in the textbox and clicking Send run the same two-step
    # pipeline: record the user turn immediately, then fetch the reply.
    chat_inputs = [chatbot, system_prompt, max_tokens, temperature, top_p]
    for trigger in (msg.submit, send_btn.click):
        trigger(
            user_message,
            [msg, chatbot],
            [msg, chatbot],
            queue=False
        ).then(
            bot_response,
            chat_inputs,
            chatbot
        )

    # Returning None resets the chatbot component.
    clear_btn.click(lambda: None, None, chatbot, queue=False)

    gr.Markdown("""
    ---
    
    **Powered by vLLM** - High-performance LLM inference engine
    
    **Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
    """)

# Cleanup on exit
def cleanup():
    """Best-effort shutdown of the vLLM server child process.

    Sends SIGTERM to the child's process group on POSIX, or calls
    ``terminate()`` on Windows. Does nothing when no server was started or
    the child has already exited. The module-level handle is reset to None
    so a second call is a no-op. Never raises on OS-level failures.
    """
    global VLLM_PROCESS

    process = VLLM_PROCESS
    if process is None:
        return
    VLLM_PROCESS = None

    # Nothing to signal if the child has already finished.
    if process.poll() is not None:
        return

    try:
        if sys.platform == 'win32':
            process.terminate()
        else:
            # Signal the whole group so worker processes spawned by the
            # server are terminated along with it.
            os.killpg(os.getpgid(process.pid), signal.SIGTERM)
    except OSError:
        # The process (group) vanished between the poll() check and the
        # signal, or cannot be signalled; there is nothing more to do at
        # shutdown.
        pass

import atexit
# Run cleanup() at normal interpreter exit so the vLLM child process is
# signalled to stop when the app shuts down.
atexit.register(cleanup)

if __name__ == "__main__":
    # Turn on Gradio's request queue before serving.
    demo.queue()

    # Listen on all interfaces at port 7860 and do not create a share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)