import gradio as gr
import requests
import json
import subprocess
import time
import os
import signal
import sys
import atexit

# Model configuration
MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"
VLLM_PORT = 8000
VLLM_PROCESS = None

def start_vllm_server():
    """Start the vLLM server in the background, yielding status updates."""
    global VLLM_PROCESS
    
    if VLLM_PROCESS is not None:
        yield "✅ vLLM server already running"
        return
    
    try:
        # Start vLLM server with tensor parallelism for multi-GPU
        cmd = [
            "python3", "-m", "vllm.entrypoints.openai.api_server",
            "--model", MODEL_NAME,
            "--host", "0.0.0.0",
            "--port", str(VLLM_PORT),
            "--dtype", "bfloat16",
            "--trust-remote-code",
            "--tensor-parallel-size", "4",  # Use all 4 GPUs
            "--max-model-len", "8192",  # Limit context to save memory
        ]
        
        log_file = open("/tmp/vllm.log", "w")
        VLLM_PROCESS = subprocess.Popen(
            cmd,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            # Run in a new session/process group so cleanup() can kill vLLM and its workers together
            preexec_fn=os.setsid if sys.platform != 'win32' else None
        )
        
        status_msg = "πŸ”„ **vLLM server starting...**\n\n"
        status_msg += "This takes 5-10 minutes for the 48B model.\n\n"
        status_msg += "**Progress:**\n"
        status_msg += "1. Downloading model (if not cached)\n"
        status_msg += "2. Loading weights across 4 GPUs\n"
        status_msg += "3. Initializing inference engine\n\n"
        status_msg += "**Status:** Initializing...\n\n"
        status_msg += "_Check logs at /tmp/vllm.log for details_"
        
        # Wait longer for big model - up to 10 minutes
        max_retries = 300  # 300 * 2 seconds = 10 minutes
        for i in range(max_retries):
            try:
                response = requests.get(f"http://localhost:{VLLM_PORT}/health", timeout=2)
                if response.status_code == 200:
                    yield "✅ **vLLM server started successfully!**\n\nYou can now start chatting below."
                    return
            except requests.exceptions.RequestException:
                pass
            
            # Check if process died
            if VLLM_PROCESS.poll() is not None:
                # Process ended
                with open("/tmp/vllm.log", "r") as f:
                    last_lines = f.readlines()[-20:]
                error_msg = "❌ **vLLM server crashed during startup**\n\n"
                error_msg += "**Last log lines:**\n```\n"
                error_msg += "".join(last_lines)
                error_msg += "\n```"
                return error_msg
            
            time.sleep(2)
        
        # Timed out, but the process is still alive and may still be initializing
        yield "⚠️ **vLLM server started but is taking longer than expected**\n\nThe server may still be initializing. Wait a few more minutes and try sending a message."
        
    except Exception as e:
        yield f"❌ **Failed to start vLLM server:**\n\n{str(e)}"

def view_logs():
    """View vLLM server logs"""
    try:
        if not os.path.exists("/tmp/vllm.log"):
            return "πŸ“ No logs yet. Start the server first."
        
        with open("/tmp/vllm.log", "r") as f:
            lines = f.readlines()
            last_lines = lines[-50:]  # Last 50 lines
        
        log_text = "πŸ“‹ **vLLM Server Logs (Last 50 lines)**\n\n```\n"
        log_text += "".join(last_lines)
        log_text += "\n```"
        return log_text
    except Exception as e:
        return f"❌ Error reading logs: {str(e)}"

def chat(message, history, system_prompt, max_tokens, temperature, top_p):
    """Send chat message to vLLM server"""
    try:
        # Build messages
        messages = []
        
        if system_prompt.strip():
            messages.append({"role": "system", "content": system_prompt.strip()})
        
        # Add history
        for human, assistant in history:
            messages.append({"role": "user", "content": human})
            if assistant:
                messages.append({"role": "assistant", "content": assistant})
        
        # Add current message
        messages.append({"role": "user", "content": message})
        
        # Call vLLM API
        response = requests.post(
            f"http://localhost:{VLLM_PORT}/v1/chat/completions",
            headers={"Content-Type": "application/json"},
            json={
                "model": MODEL_NAME,
                "messages": messages,
                "max_tokens": max_tokens,
                "temperature": temperature,
                "top_p": top_p,
                "stream": False
            },
            timeout=300
        )
        
        if response.status_code == 200:
            result = response.json()
            assistant_message = result["choices"][0]["message"]["content"]
            return assistant_message
        else:
            return f"❌ Error: {response.status_code} - {response.text}"
            
    except requests.exceptions.ConnectionError:
        return "❌ Cannot connect to vLLM server. Please start the server first."
    except Exception as e:
        return f"❌ Error: {str(e)}"

# Custom CSS
custom_css = """
.gradio-container {
    max-width: 1200px !important;
}
"""

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Kimi 48B Fine-tuned") as demo:
    gr.Markdown("""
    # 🚀 Kimi Linear 48B A3B - Fine-tuned Inference
    
    High-performance inference using **vLLM** for the fine-tuned Kimi-Linear-48B-A3B-Instruct model.
    
    **Model:** `optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune`
    """)
    
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### πŸŽ›οΈ Server Control")
            start_btn = gr.Button("πŸš€ Start vLLM Server", variant="primary", size="lg")
            server_status = gr.Markdown("**Status:** Server not started")
            view_logs_btn = gr.Button("πŸ“‹ View Server Logs", size="sm")
            logs_display = gr.Markdown("", visible=False)
            
            gr.Markdown("---")
            gr.Markdown("### βš™οΈ Generation Settings")
            
            system_prompt = gr.Textbox(
                label="System Prompt (Optional)",
                placeholder="You are a helpful AI assistant...",
                lines=3,
                value=""
            )
            
            max_tokens = gr.Slider(
                minimum=50,
                maximum=4096,
                value=1024,
                step=1,
                label="Max Tokens"
            )
            
            temperature = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.7,
                step=0.05,
                label="Temperature"
            )
            
            top_p = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top P"
            )
            
            gr.Markdown("""
            ### πŸ“– Instructions
            
            1. **Start Server** - Click the button above (takes 2-5 min)
            2. **Wait for "βœ…"** - Server is ready when you see green checkmark
            3. **Start Chatting** - Type your message below
            
            **Note:** First message may be slow as the model loads into memory.
            """)
        
        with gr.Column(scale=2):
            gr.Markdown("### πŸ’¬ Chat")
            
            # Tuple-format history: a list of [user, assistant] pairs
            chatbot = gr.Chatbot(
                height=500,
                show_copy_button=True
            )
            
            with gr.Row():
                msg = gr.Textbox(
                    label="Your Message",
                    placeholder="Type your message here...",
                    lines=2,
                    scale=4
                )
                send_btn = gr.Button("πŸ“€ Send", variant="primary", scale=1)
            
            with gr.Row():
                clear_btn = gr.Button("πŸ—‘οΈ Clear Chat")
    
    # Event handlers
    start_btn.click(
        fn=start_vllm_server,
        outputs=server_status
    )
    
    def show_logs():
        return {logs_display: gr.update(value=view_logs(), visible=True)}
    
    view_logs_btn.click(
        fn=show_logs,
        outputs=logs_display
    )
    
    def user_message(user_msg, history):
        """Append the user's turn to the history and clear the textbox."""
        return "", history + [[user_msg, None]]
    
    def bot_response(history, system_prompt, max_tokens, temperature, top_p):
        """Fill in the assistant's reply for the most recent user turn."""
        if not history or history[-1][1] is not None:
            return history
        
        user_msg = history[-1][0]
        bot_msg = chat(user_msg, history[:-1], system_prompt, max_tokens, temperature, top_p)
        history[-1][1] = bot_msg
        return history
    
    msg.submit(
        user_message,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_response,
        [chatbot, system_prompt, max_tokens, temperature, top_p],
        chatbot
    )
    
    send_btn.click(
        user_message,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_response,
        [chatbot, system_prompt, max_tokens, temperature, top_p],
        chatbot
    )
    
    clear_btn.click(lambda: None, None, chatbot, queue=False)
    
    gr.Markdown("""
    ---
    
    **Powered by vLLM** - High-performance LLM inference engine
    
    **Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
    """)

# Cleanup on exit
def cleanup():
    global VLLM_PROCESS
    if VLLM_PROCESS:
        try:
            if sys.platform == 'win32':
                VLLM_PROCESS.terminate()
            else:
                os.killpg(os.getpgid(VLLM_PROCESS.pid), signal.SIGTERM)
        except Exception:
            # Best-effort shutdown; the process may already be gone
            pass

atexit.register(cleanup)

if __name__ == "__main__":
    demo.queue()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        show_error=True
    )
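
# For reference, once the server reports healthy it can also be queried
# directly, without the Gradio UI, using the same payload chat() sends:
#
#   curl http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune",
#          "messages": [{"role": "user", "content": "Hello!"}]}'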