File size: 7,760 Bytes
a951334
310eb95
 
 
 
 
 
 
a951334
5e458c4
 
310eb95
 
b51ac87
310eb95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a259bc
310eb95
 
 
 
 
 
3a259bc
310eb95
 
 
 
 
 
 
 
 
 
 
 
 
 
5e458c4
310eb95
 
 
 
 
5e458c4
310eb95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a951334
310eb95
 
 
 
a951334
310eb95
 
 
 
 
 
b51ac87
a951334
310eb95
 
 
a951334
310eb95
a951334
310eb95
 
b51ac87
5e458c4
 
310eb95
 
 
5e458c4
310eb95
5e458c4
 
 
 
 
 
 
 
e32298d
310eb95
5e458c4
 
 
 
310eb95
5e458c4
a951334
5e458c4
 
 
 
 
310eb95
a951334
 
5e458c4
 
 
 
 
310eb95
5e458c4
a951334
310eb95
 
a951334
310eb95
 
 
 
 
 
a951334
5e458c4
310eb95
5e458c4
 
 
 
 
 
a951334
5e458c4
 
 
 
310eb95
5e458c4
 
 
a951334
 
5e458c4
a951334
5e458c4
310eb95
 
 
5e458c4
 
 
 
 
310eb95
 
 
 
5e458c4
310eb95
5e458c4
 
 
 
 
 
 
 
 
 
310eb95
5e458c4
 
 
 
 
 
 
 
 
 
310eb95
5e458c4
 
 
 
 
a951334
 
5e458c4
310eb95
5e458c4
310eb95
a951334
 
310eb95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a951334
310eb95
a951334
 
 
310eb95
a951334
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
import gradio as gr
import requests
import json
import subprocess
import time
import os
import signal
import sys

# Model configuration
# Model identifier handed to vLLM via --model and sent as the "model" field
# of every chat-completions request.
MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"
# Port the vLLM server is launched on; the health check and chat requests
# target this port on localhost.
VLLM_PORT = 8000
# Handle of the spawned vLLM subprocess; stays None until
# start_vllm_server() launches it.
VLLM_PROCESS = None

def start_vllm_server():
    """Launch the vLLM OpenAI-compatible API server and wait until it is healthy.

    The server is started as a background subprocess in its own session and
    its handle is stored in the module-level ``VLLM_PROCESS``. The function
    then polls ``/health`` on ``VLLM_PORT`` until it answers with HTTP 200,
    the child exits, or the retry budget is exhausted.

    Returns:
        A human-readable status string (shown in the UI). Errors are reported
        through the returned string rather than raised.
    """
    global VLLM_PROCESS

    # A handle whose process has already exited is not "running": fall through
    # and relaunch instead of reporting a dead server as alive forever.
    if VLLM_PROCESS is not None and VLLM_PROCESS.poll() is None:
        return "βœ… vLLM server already running"

    try:
        # Use the interpreter running this app so vLLM is resolved from the
        # same environment, regardless of what "python" means on PATH.
        cmd = [
            sys.executable, "-m", "vllm.entrypoints.openai.api_server",
            "--model", MODEL_NAME,
            "--host", "0.0.0.0",
            "--port", str(VLLM_PORT),
            "--dtype", "bfloat16",
            "--trust-remote-code",
        ]

        # stdout/stderr are inherited rather than piped: nothing ever read the
        # pipes, so the child could block once the OS pipe buffer filled up.
        # start_new_session gives the child its own process group (what
        # preexec_fn=os.setsid did) so the whole group can be signalled later.
        VLLM_PROCESS = subprocess.Popen(
            cmd,
            start_new_session=(sys.platform != 'win32'),
        )

        # Wait for server to start
        health_url = f"http://localhost:{VLLM_PORT}/health"
        max_retries = 60
        for _ in range(max_retries):
            # Stop waiting as soon as the child has died (bad flags, OOM, ...).
            exit_code = VLLM_PROCESS.poll()
            if exit_code is not None:
                VLLM_PROCESS = None
                return (
                    "❌ Failed to start vLLM server: "
                    f"process exited with code {exit_code}"
                )

            try:
                response = requests.get(health_url, timeout=1)
            except requests.exceptions.RequestException:
                # Server is not accepting connections yet; retry below.
                pass
            else:
                if response.status_code == 200:
                    return "βœ… vLLM server started successfully!"

            # Pause between attempts for refused connections AND non-200
            # answers, so the retry budget is not burned in a tight loop.
            time.sleep(2)

        return "⚠️ vLLM server started but health check failed"

    except Exception as e:
        return f"❌ Failed to start vLLM server: {str(e)}"

def chat(message, history, system_prompt, max_tokens, temperature, top_p):
    """Ask the local vLLM server for a chat completion and return the reply text.

    Args:
        message: The new user message to answer.
        history: Iterable of ``(user, assistant)`` pairs from earlier turns;
            a falsy assistant entry is left out of the request.
        system_prompt: Optional system prompt; ignored when blank.
        max_tokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature``.
        top_p: Forwarded as ``top_p``.

    Returns:
        The assistant message content on HTTP 200, otherwise a string that
        describes the failure. Exceptions are converted to such strings.
    """
    try:
        conversation = []

        # Optional system turn goes first.
        system_text = system_prompt.strip()
        if system_text:
            conversation.append({"role": "system", "content": system_text})

        # Replay the earlier turns in order.
        for user_turn, assistant_turn in history:
            conversation.append({"role": "user", "content": user_turn})
            if assistant_turn:
                conversation.append({"role": "assistant", "content": assistant_turn})

        # The message being answered now is always the last entry.
        conversation.append({"role": "user", "content": message})

        endpoint = f"http://localhost:{VLLM_PORT}/v1/chat/completions"
        payload = {
            "model": MODEL_NAME,
            "messages": conversation,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "top_p": top_p,
            "stream": False
        }
        response = requests.post(
            endpoint,
            headers={"Content-Type": "application/json"},
            json=payload,
            timeout=300
        )

        if response.status_code != 200:
            return f"❌ Error: {response.status_code} - {response.text}"

        body = response.json()
        return body["choices"][0]["message"]["content"]

    except requests.exceptions.ConnectionError:
        return "❌ Cannot connect to vLLM server. Please start the server first."
    except Exception as e:
        return f"❌ Error: {str(e)}"

# Custom CSS handed to gr.Blocks(css=...): caps the width of the
# .gradio-container element at 1200px.
custom_css = """
.gradio-container {
    max-width: 1200px !important;
}
"""

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Kimi 48B Fine-tuned") as demo:
    # Page header: title, one-line description and the model identifier.
    gr.Markdown("""
    # πŸš€ Kimi Linear 48B A3B - Fine-tuned Inference
    
    High-performance inference using **vLLM** for the fine-tuned Kimi-Linear-48B-A3B-Instruct model.
    
    **Model:** `optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune`
    """)
    
    with gr.Row():
        # Left column (scale=1): server control, generation settings, usage notes.
        with gr.Column(scale=1):
            gr.Markdown("### πŸŽ›οΈ Server Control")
            start_btn = gr.Button("πŸš€ Start vLLM Server", variant="primary", size="lg")
            # Updated with the status string returned by start_vllm_server().
            server_status = gr.Markdown("**Status:** Server not started")
            
            gr.Markdown("---")
            gr.Markdown("### βš™οΈ Generation Settings")
            
            # The controls below are passed to bot_response() on every turn
            # and forwarded from there to chat().
            system_prompt = gr.Textbox(
                label="System Prompt (Optional)",
                placeholder="You are a helpful AI assistant...",
                lines=3,
                value=""
            )
            
            max_tokens = gr.Slider(
                minimum=50,
                maximum=4096,
                value=1024,
                step=1,
                label="Max Tokens"
            )
            
            temperature = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.7,
                step=0.05,
                label="Temperature"
            )
            
            top_p = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top P"
            )
            
            gr.Markdown("""
            ### πŸ“– Instructions
            
            1. **Start Server** - Click the button above (takes 2-5 min)
            2. **Wait for "βœ…"** - Server is ready when you see green checkmark
            3. **Start Chatting** - Type your message below
            
            **Note:** First message may be slow as the model loads into memory.
            """)
        
        # Right column (scale=2): conversation view, message box and buttons.
        with gr.Column(scale=2):
            gr.Markdown("### πŸ’¬ Chat")
            
            # NOTE(review): avatar_images is given two short text strings here;
            # confirm the installed Gradio version accepts plain text rather
            # than requiring image file paths or URLs.
            chatbot = gr.Chatbot(
                height=500,
                show_copy_button=True,
                avatar_images=["πŸ‘€", "πŸ€–"]
            )
            
            with gr.Row():
                msg = gr.Textbox(
                    label="Your Message",
                    placeholder="Type your message here...",
                    lines=2,
                    scale=4
                )
                send_btn = gr.Button("πŸ“€ Send", variant="primary", scale=1)
            
            with gr.Row():
                clear_btn = gr.Button("πŸ—‘οΈ Clear Chat")
    
    # Event handlers
    # Start button: run start_vllm_server() and show its returned status text.
    start_btn.click(
        fn=start_vllm_server,
        outputs=server_status
    )
    
    def user_message(user_msg, history):
        """Queue the user's message as a new turn that has no reply yet.

        Returns an empty string (to clear the input box) and a new history
        list with ``[user_msg, None]`` appended; the input list is not mutated.
        """
        pending_turn = [user_msg, None]
        updated_history = history + [pending_turn]
        return "", updated_history
    
    def bot_response(history, system_prompt, max_tokens, temperature, top_p):
        """Fill in the assistant reply for the last, still unanswered turn.

        When the history is empty or its last turn already has a reply, the
        history is returned untouched. Otherwise the last turn's user text is
        sent to ``chat`` together with all earlier turns, and the result is
        written into the last turn in place.
        """
        if history and history[-1][1] is None:
            pending_turn = history[-1]
            earlier_turns = history[:-1]
            pending_turn[1] = chat(
                pending_turn[0],
                earlier_turns,
                system_prompt,
                max_tokens,
                temperature,
                top_p,
            )
        return history
    
    # Pressing Enter in the textbox and clicking Send run the same two-step
    # chain: first record the user turn and clear the input (unqueued), then
    # fetch the model reply and update the chat view.
    for register_trigger in (msg.submit, send_btn.click):
        register_trigger(
            user_message,
            [msg, chatbot],
            [msg, chatbot],
            queue=False
        ).then(
            bot_response,
            [chatbot, system_prompt, max_tokens, temperature, top_p],
            chatbot
        )

    # Clearing sets the chatbot value to None.
    clear_btn.click(lambda: None, None, chatbot, queue=False)
    
    # Page footer: attribution and a link to the model page.
    gr.Markdown("""
    ---
    
    **Powered by vLLM** - High-performance LLM inference engine
    
    **Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
    """)

# Cleanup on exit
def cleanup():
    """Best-effort shutdown of the vLLM subprocess stored in ``VLLM_PROCESS``.

    Does nothing when no server was started. A child that is still running is
    asked to stop: ``terminate()`` on Windows, ``SIGTERM`` to its whole
    process group elsewhere. Signalling failures are ignored, because this
    runs during interpreter exit and there is nothing useful left to do.
    The module-level handle is cleared afterwards.
    """
    global VLLM_PROCESS

    process = VLLM_PROCESS
    if process is None:
        return

    # Only signal a child that is still alive. Once it has exited, its PID
    # (and process-group id) may have been recycled by an unrelated process.
    if process.poll() is None:
        try:
            if sys.platform == 'win32':
                process.terminate()
            else:
                os.killpg(os.getpgid(process.pid), signal.SIGTERM)
        except OSError:
            # The process vanished between the poll and the signal, or it
            # cannot be signalled; keep this a silent best-effort.
            pass

    VLLM_PROCESS = None

# Run cleanup() when the interpreter exits so the vLLM subprocess is asked
# to stop together with this app.
import atexit
atexit.register(cleanup)

if __name__ == "__main__":
    # Turn on Gradio's request queue before launching the app.
    demo.queue()
    # Listen on all interfaces (0.0.0.0) on port 7860, with share turned off.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )