aeb56 committed
Commit 9905f0a · 1 parent: a82de92

Switch to transformers inference (vLLM doesn't support the KimiLinear architecture)

Files changed (2):
  1. app.py +132 -269
  2. requirements.txt +8 -9
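
In short: instead of launching a vLLM OpenAI-compatible server as a subprocess, the app now loads the checkpoint in-process with transformers, using `trust_remote_code=True` to pull in the KimiLinear modeling code shipped with the checkpoint and `device_map`/`max_memory` (via accelerate) to shard the 48B weights across the available GPUs. A minimal sketch of that loading path, with settings mirroring the new app.py (the 23 GB per-GPU cap assumes the Space's 4x L4 hardware):

```python
# Minimal sketch of the new loading path (settings mirror app.py in this commit;
# exact memory caps depend on the hardware the Space runs on).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="balanced",   # shard layers evenly across visible GPUs (needs accelerate)
    max_memory={i: "23GB" for i in range(torch.cuda.device_count())},
    trust_remote_code=True,  # KimiLinear ships its modeling code with the checkpoint
    low_cpu_mem_usage=True,
)

# Quick smoke test of generation with the same sampling defaults as the app.
inputs = tokenizer("User: Hello\nAssistant:", return_tensors="pt").to(model.device)
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=64, temperature=0.7, top_p=0.9, do_sample=True)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```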
app.py CHANGED
@@ -1,312 +1,175 @@
  import gradio as gr
- import requests
- import json
- import subprocess
- import time
  import os
- import signal
- import sys

  # Model configuration
  MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"
- VLLM_PORT = 8000
- VLLM_PROCESS = None

- def start_vllm_server():
-     """Start vLLM server in background"""
-     global VLLM_PROCESS
-
-     if VLLM_PROCESS is not None:
-         return "✅ vLLM server already running"
-
-     try:
-         # Start vLLM server with tensor parallelism for multi-GPU
-         cmd = [
-             "python3", "-m", "vllm.entrypoints.openai.api_server",
-             "--model", MODEL_NAME,
-             "--host", "0.0.0.0",
-             "--port", str(VLLM_PORT),
-             "--dtype", "bfloat16",
-             "--trust-remote-code",
-             "--tensor-parallel-size", "4", # Use all 4 GPUs
-             "--max-model-len", "8192", # Limit context to save memory
-         ]
-
-         log_file = open("/tmp/vllm.log", "w")
-         VLLM_PROCESS = subprocess.Popen(
-             cmd,
-             stdout=log_file,
-             stderr=subprocess.STDOUT,
-             preexec_fn=os.setsid if sys.platform != 'win32' else None
-         )
-
-         status_msg = "🔄 **vLLM server starting...**\n\n"
-         status_msg += "This takes 5-10 minutes for the 48B model.\n\n"
-         status_msg += "**Progress:**\n"
-         status_msg += "1. Downloading model (if not cached)\n"
-         status_msg += "2. Loading weights across 4 GPUs\n"
-         status_msg += "3. Initializing inference engine\n\n"
-         status_msg += "**Status:** Initializing...\n\n"
-         status_msg += "_Check logs at /tmp/vllm.log for details_"

-         # Wait longer for big model - up to 10 minutes
-         max_retries = 300 # 300 * 2 seconds = 10 minutes
-         for i in range(max_retries):
-             try:
-                 response = requests.get(f"http://localhost:{VLLM_PORT}/health", timeout=2)
-                 if response.status_code == 200:
-                     return "✅ **vLLM server started successfully!**\n\nYou can now start chatting below."
-             except requests.exceptions.RequestException:
-                 pass

-             # Check if process died
-             if VLLM_PROCESS.poll() is not None:
-                 # Process ended
-                 with open("/tmp/vllm.log", "r") as f:
-                     last_lines = f.readlines()[-20:]
-                 error_msg = "❌ **vLLM server crashed during startup**\n\n"
-                 error_msg += "**Last log lines:**\n```\n"
-                 error_msg += "".join(last_lines)
-                 error_msg += "\n```"
-                 return error_msg

-             time.sleep(2)
-
-         # Timeout but process still running
-         return "⚠️ **vLLM server started but taking longer than expected**\n\nThe server may still be initializing. Wait a few more minutes and try sending a message."
-
-     except Exception as e:
-         return f"❌ **Failed to start vLLM server:**\n\n{str(e)}"
-
- def view_logs():
-     """View vLLM server logs"""
-     try:
-         if not os.path.exists("/tmp/vllm.log"):
-             return "📝 No logs yet. Start the server first."
-
-         with open("/tmp/vllm.log", "r") as f:
-             lines = f.readlines()
-             last_lines = lines[-50:] # Last 50 lines
-
-         log_text = "📋 **vLLM Server Logs (Last 50 lines)**\n\n```\n"
-         log_text += "".join(last_lines)
-         log_text += "\n```"
-         return log_text
-     except Exception as e:
-         return f"❌ Error reading logs: {str(e)}"
-
- def chat(message, history, system_prompt, max_tokens, temperature, top_p):
-     """Send chat message to vLLM server"""
-     try:
-         # Build messages
-         messages = []
-
-         if system_prompt.strip():
-             messages.append({"role": "system", "content": system_prompt.strip()})
-
-         # Add history
-         for human, assistant in history:
-             messages.append({"role": "user", "content": human})
-             if assistant:
-                 messages.append({"role": "assistant", "content": assistant})
-
-         # Add current message
-         messages.append({"role": "user", "content": message})
-
-         # Call vLLM API
-         response = requests.post(
-             f"http://localhost:{VLLM_PORT}/v1/chat/completions",
-             headers={"Content-Type": "application/json"},
-             json={
-                 "model": MODEL_NAME,
-                 "messages": messages,
-                 "max_tokens": max_tokens,
-                 "temperature": temperature,
-                 "top_p": top_p,
-                 "stream": False
-             },
-             timeout=300
-         )

-         if response.status_code == 200:
-             result = response.json()
-             assistant_message = result["choices"][0]["message"]["content"]
-             return assistant_message
-         else:
-             return f"❌ Error: {response.status_code} - {response.text}"

-     except requests.exceptions.ConnectionError:
-         return "❌ Cannot connect to vLLM server. Please start the server first."
-     except Exception as e:
-         return f"❌ Error: {str(e)}"

- # Custom CSS
- custom_css = """
- .gradio-container {
-     max-width: 1200px !important;
- }
- """

- # Create Gradio interface
- with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Kimi 48B Fine-tuned") as demo:
      gr.Markdown("""
-     # 🚀 Kimi Linear 48B A3B - Fine-tuned Inference

-     High-performance inference using **vLLM** for the fine-tuned Kimi-Linear-48B-A3B-Instruct model.

      **Model:** `optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune`
      """)

      with gr.Row():
          with gr.Column(scale=1):
-             gr.Markdown("### 🎛️ Server Control")
-             start_btn = gr.Button("🚀 Start vLLM Server", variant="primary", size="lg")
-             server_status = gr.Markdown("**Status:** Server not started")
-             view_logs_btn = gr.Button("📋 View Server Logs", size="sm")
-             logs_display = gr.Markdown("", visible=False)

              gr.Markdown("---")
-             gr.Markdown("### ⚙️ Generation Settings")

              system_prompt = gr.Textbox(
-                 label="System Prompt (Optional)",
-                 placeholder="You are a helpful AI assistant...",
-                 lines=3,
-                 value=""
-             )
-
-             max_tokens = gr.Slider(
-                 minimum=50,
-                 maximum=4096,
-                 value=1024,
-                 step=1,
-                 label="Max Tokens"
              )

-             temperature = gr.Slider(
-                 minimum=0.0,
-                 maximum=2.0,
-                 value=0.7,
-                 step=0.05,
-                 label="Temperature"
-             )
-
-             top_p = gr.Slider(
-                 minimum=0.0,
-                 maximum=1.0,
-                 value=0.9,
-                 step=0.05,
-                 label="Top P"
-             )
-
-             gr.Markdown("""
-             ### 📖 Instructions
-
-             1. **Start Server** - Click the button above (takes 2-5 min)
-             2. **Wait for "✅"** - Server is ready when you see green checkmark
-             3. **Start Chatting** - Type your message below
-
-             **Note:** First message may be slow as the model loads into memory.
-             """)

          with gr.Column(scale=2):
              gr.Markdown("### 💬 Chat")
-
-             chatbot = gr.Chatbot(
-                 height=500,
-                 show_copy_button=True
-             )

              with gr.Row():
-                 msg = gr.Textbox(
-                     label="Your Message",
-                     placeholder="Type your message here...",
-                     lines=2,
-                     scale=4
-                 )
-                 send_btn = gr.Button("📤 Send", variant="primary", scale=1)

-             with gr.Row():
-                 clear_btn = gr.Button("🗑️ Clear Chat")
-
-     # Event handlers
-     start_btn.click(
-         fn=start_vllm_server,
-         outputs=server_status
-     )
-
-     def show_logs():
-         return {logs_display: gr.update(value=view_logs(), visible=True)}
-
-     view_logs_btn.click(
-         fn=show_logs,
-         outputs=logs_display
-     )
-
-     def user_message(user_msg, history):
-         return "", history + [[user_msg, None]]
-
-     def bot_response(history, system_prompt, max_tokens, temperature, top_p):
-         if not history or history[-1][1] is not None:
-             return history
-
-         user_msg = history[-1][0]
-         bot_msg = chat(user_msg, history[:-1], system_prompt, max_tokens, temperature, top_p)
-         history[-1][1] = bot_msg
-         return history

-     msg.submit(
-         user_message,
-         [msg, chatbot],
-         [msg, chatbot],
-         queue=False
-     ).then(
-         bot_response,
-         [chatbot, system_prompt, max_tokens, temperature, top_p],
-         chatbot
-     )

-     send_btn.click(
-         user_message,
-         [msg, chatbot],
-         [msg, chatbot],
-         queue=False
-     ).then(
-         bot_response,
-         [chatbot, system_prompt, max_tokens, temperature, top_p],
-         chatbot
-     )

-     clear_btn.click(lambda: None, None, chatbot, queue=False)

      gr.Markdown("""
      ---
-
-     **Powered by vLLM** - High-performance LLM inference engine
-
      **Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
      """)

- # Cleanup on exit
- def cleanup():
-     global VLLM_PROCESS
-     if VLLM_PROCESS:
-         try:
-             if sys.platform == 'win32':
-                 VLLM_PROCESS.terminate()
-             else:
-                 os.killpg(os.getpgid(VLLM_PROCESS.pid), signal.SIGTERM)
-         except:
-             pass
-
- import atexit
- atexit.register(cleanup)
-
  if __name__ == "__main__":
-     demo.queue()
-     demo.launch(
-         server_name="0.0.0.0",
-         server_port=7860,
-         share=True,
-         show_error=True
-     )
  import gradio as gr
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
  import os

  # Model configuration
  MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"

+ class ChatBot:
+     def __init__(self):
+         self.model = None
+         self.tokenizer = None
+         self.loaded = False
+
+     def load_model(self):
+         if self.loaded:
+             return "✅ Model already loaded!"

+         try:
+             yield "🔄 Loading tokenizer..."
+             self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

+             yield "🔄 Loading model (this takes 5-10 minutes)...\n\nThe 48B model is being distributed across 4 GPUs..."

+             # Configure memory for 4 GPUs
+             num_gpus = torch.cuda.device_count()
+             max_memory = {i: f"{int(23)}GB" for i in range(num_gpus)} # L4 has 24GB, leave 1GB
+
+             self.model = AutoModelForCausalLM.from_pretrained(
+                 MODEL_NAME,
+                 torch_dtype=torch.bfloat16,
+                 device_map="balanced", # Distribute evenly
+                 max_memory=max_memory,
+                 trust_remote_code=True,
+                 low_cpu_mem_usage=True,
+             )
+
+             self.model.eval()
+             self.loaded = True
+
+             # Get GPU distribution info
+             if hasattr(self.model, 'hf_device_map'):
+                 device_info = "\n\n**GPU Distribution:**\n"
+                 devices = {}
+                 for name, device in self.model.hf_device_map.items():
+                     if device not in devices:
+                         devices[device] = 0
+                     devices[device] += 1
+                 for device, count in devices.items():
+                     device_info += f"- {device}: {count} layers\n"
+             else:
+                 device_info = ""
+
+             yield f"✅ **Model loaded successfully!**{device_info}\n\nYou can now start chatting below."
+
+         except Exception as e:
+             self.loaded = False
+             yield f"❌ **Error loading model:**\n\n{str(e)}"
+
+     def chat(self, message, history, system_prompt, max_tokens, temperature, top_p):
+         if not self.loaded:
+             return "❌ Please load the model first by clicking the 'Load Model' button."

+         try:
+             # Build prompt from history
+             conversation = []
+             if system_prompt.strip():
+                 conversation.append(f"System: {system_prompt}")
+
+             for user_msg, bot_msg in history:
+                 conversation.append(f"User: {user_msg}")
+                 if bot_msg:
+                     conversation.append(f"Assistant: {bot_msg}")
+
+             conversation.append(f"User: {message}")
+             conversation.append("Assistant:")
+
+             prompt = "\n".join(conversation)
+
+             # Tokenize
+             inputs = self.tokenizer(prompt, return_tensors="pt")
+             inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
+
+             # Generate
+             with torch.no_grad():
+                 outputs = self.model.generate(
+                     **inputs,
+                     max_new_tokens=max_tokens,
+                     temperature=temperature,
+                     top_p=top_p,
+                     do_sample=temperature > 0,
+                     pad_token_id=self.tokenizer.eos_token_id,
+                 )

+             # Decode
+             response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+             # Extract assistant response
+             if "Assistant:" in response:
+                 response = response.split("Assistant:")[-1].strip()
+
+             return response
+
+         except Exception as e:
+             return f"❌ Error: {str(e)}"

+ # Initialize
+ bot = ChatBot()

+ # UI
+ with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned") as demo:
      gr.Markdown("""
+     # 🚀 Kimi Linear 48B A3B - Fine-tuned

+     Chat interface for the fine-tuned Kimi model.

      **Model:** `optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune`
      """)

+     # Show GPU info
+     if torch.cuda.is_available():
+         gpu_count = torch.cuda.device_count()
+         gpu_name = torch.cuda.get_device_name(0)
+         total_vram = sum(torch.cuda.get_device_properties(i).total_memory / 1024**3 for i in range(gpu_count))
+         gr.Markdown(f"**Hardware:** {gpu_count}x {gpu_name} ({total_vram:.0f}GB total VRAM)")
+
      with gr.Row():
          with gr.Column(scale=1):
+             gr.Markdown("### 🎛️ Controls")
+
+             load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")
+             status = gr.Markdown("**Status:** Model not loaded")

              gr.Markdown("---")
+             gr.Markdown("### ⚙️ Settings")

              system_prompt = gr.Textbox(
+                 label="System Prompt",
+                 placeholder="You are a helpful assistant...",
+                 lines=2
              )

+             max_tokens = gr.Slider(50, 2048, 512, label="Max Tokens", step=1)
+             temperature = gr.Slider(0, 2, 0.7, label="Temperature", step=0.1)
+             top_p = gr.Slider(0, 1, 0.9, label="Top P", step=0.05)

          with gr.Column(scale=2):
              gr.Markdown("### 💬 Chat")
+             chatbot = gr.Chatbot(height=500, show_copy_button=True)

              with gr.Row():
+                 msg = gr.Textbox(label="Message", placeholder="Type here...", scale=4)
+                 send = gr.Button("Send", variant="primary", scale=1)

+             clear = gr.Button("Clear")

+     # Events
+     load_btn.click(bot.load_model, outputs=status)

+     def respond(message, history, system, max_tok, temp, top):
+         bot_message = bot.chat(message, history, system, max_tok, temp, top)
+         history.append((message, bot_message))
+         return history, ""

+     msg.submit(respond, [msg, chatbot, system_prompt, max_tokens, temperature, top_p], [chatbot, msg])
+     send.click(respond, [msg, chatbot, system_prompt, max_tokens, temperature, top_p], [chatbot, msg])
+     clear.click(lambda: None, None, chatbot)

      gr.Markdown("""
      ---
      **Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
      """)

  if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
requirements.txt CHANGED
@@ -1,12 +1,11 @@
- # vLLM for high-performance inference
- vllm>=0.6.0

- # Core dependencies
  gradio==4.19.2
- requests>=2.31.0

- # Note: vLLM automatically installs:
- # - torch
- # - transformers
- # - tokenizers
- # - etc.

+ # Core ML dependencies
+ torch>=2.1.0
+ transformers>=4.56.0
+ accelerate>=0.34.0
+ sentencepiece>=0.1.99

+ # UI
  gradio==4.19.2

+ # Utils
+ safetensors>=0.4.0
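
The pinned stack replaces vLLM entirely, so torch, transformers, and accelerate now have to be declared explicitly rather than arriving as transitive dependencies. A quick pre-flight check of that environment (a hypothetical helper, not part of this commit) might look like:

```python
# Hypothetical pre-flight check for the new dependency stack (not part of this commit).
import torch
import transformers
import accelerate

print("transformers:", transformers.__version__)  # requirements.txt asks for >=4.56.0
print("accelerate:", accelerate.__version__)       # required for device_map sharding
print("CUDA available:", torch.cuda.is_available())
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f"GPU {i}: {props.name}, {props.total_memory / 1024**3:.0f} GB")
```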