import os
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
from datetime import datetime
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Model configuration
MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"
MODEL_DESCRIPTION = """
# 🚀 Kimi Linear 48B A3B Instruct - Fine-tuned
A professionally fine-tuned version of Moonshot AI's Kimi-Linear-48B-A3B-Instruct model using QLoRA.
**Model Details:**
- **Base Model:** moonshotai/Kimi-Linear-48B-A3B-Instruct
- **Parameters:** 48 Billion
- **Fine-tuning Method:** QLoRA (Quantized Low-Rank Adaptation)
- **Training Focus:** Attention layers (q_proj, k_proj, v_proj, o_proj)
- **Architecture:** Mixture of Experts (MoE) Transformer
"""
# Check GPU availability
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    total_vram = sum(torch.cuda.get_device_properties(i).total_memory / 1024**3 for i in range(num_gpus))
    logger.info(f"🎮 {num_gpus} GPU(s) detected with {total_vram:.1f}GB total VRAM")
else:
    logger.warning("⚠️ No GPUs detected - running on CPU (will be slow)")

class ModelInference:
    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.is_loaded = False

    def load_model(self, progress=gr.Progress()):
        """Load the model and tokenizer"""
        if self.is_loaded:
            return "✅ Model already loaded"

        try:
            progress(0.2, desc="Loading tokenizer...")
            logger.info(f"Loading tokenizer from: {MODEL_NAME}")
            self.tokenizer = AutoTokenizer.from_pretrained(
                MODEL_NAME,
                trust_remote_code=True
            )

            progress(0.4, desc="Loading model (this may take several minutes)...")
            logger.info(f"Loading model from: {MODEL_NAME}")

            # Configure per-GPU memory limits for multi-GPU sharding,
            # leaving ~3GB of headroom on each device
            num_gpus = torch.cuda.device_count()
            max_memory = {}
            if num_gpus > 0:
                for i in range(num_gpus):
                    gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
                    max_memory[i] = f"{int(gpu_memory - 3)}GB"

            self.model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                max_memory=max_memory if max_memory else None,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
            )
            self.model.eval()

            self.is_loaded = True
            progress(1.0, desc="Model loaded!")
            logger.info("✅ Model loaded successfully")

            # Get model info
            total_params = sum(p.numel() for p in self.model.parameters())
            model_size = (total_params * 2) / 1024**3  # bfloat16 = 2 bytes per parameter

            info_msg = f"""
✅ **Model Loaded Successfully!**

**Model Information:**
- Model: `{MODEL_NAME}`
- Parameters: {total_params:,}
- Size: ~{model_size:.1f} GB (bfloat16)
- Device: {"Multi-GPU" if num_gpus > 1 else "Single GPU" if num_gpus == 1 else "CPU"}

**You can now start chatting below!** 🎉
"""
            return info_msg

        except Exception as e:
            logger.error(f"Failed to load model: {str(e)}", exc_info=True)
            self.is_loaded = False
            return f"❌ **Failed to load model:**\n\n{str(e)}"
    def generate_response(
        self,
        message,
        history,
        system_prompt,
        max_new_tokens,
        temperature,
        top_p,
        top_k,
        repetition_penalty,
    ):
        """Generate a response from the model"""
        if not self.is_loaded:
            return "❌ Please load the model first using the 'Load Model' button above."

        try:
            # Build conversation context as a plain "User:/Assistant:" transcript
            conversation = []

            # Add system prompt if provided
            if system_prompt.strip():
                conversation.append(f"System: {system_prompt.strip()}")

            # Add chat history
            for human, assistant in history:
                conversation.append(f"User: {human}")
                if assistant:
                    conversation.append(f"Assistant: {assistant}")

            # Add current message
            conversation.append(f"User: {message}")
            conversation.append("Assistant:")

            # Format prompt
            prompt = "\n".join(conversation)

            # Tokenize
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

            # Generate
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    temperature=temperature,
                    top_p=top_p,
                    top_k=top_k,
                    repetition_penalty=repetition_penalty,
                    do_sample=temperature > 0,
                    pad_token_id=self.tokenizer.eos_token_id,
                )

            # Decode response
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Extract the assistant's reply (everything after the last "Assistant:")
            if "Assistant:" in response:
                response = response.split("Assistant:")[-1].strip()

            return response

        except Exception as e:
            logger.error(f"Generation failed: {str(e)}", exc_info=True)
            return f"❌ **Generation failed:**\n\n{str(e)}"
# Initialize inference
inferencer = ModelInference()
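# Note: this single module-level instance is shared by every visitor to the Space,
# so the model is loaded once and reused across all chat sessions.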
# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned - Inference") as demo:
    gr.Markdown(MODEL_DESCRIPTION)

    # GPU Info
    if torch.cuda.is_available():
        gpu_info = f"### 🎮 Hardware: {torch.cuda.device_count()}x {torch.cuda.get_device_name(0)} ({total_vram:.1f}GB total VRAM)"
    else:
        gpu_info = "### ⚠️ Running on CPU (no GPU detected)"
    gr.Markdown(gpu_info)

    gr.Markdown("---")
    with gr.Row():
        with gr.Column(scale=1):
            load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")
            load_status = gr.Markdown("**Status:** Model not loaded. Click 'Load Model' to start.")

            gr.Markdown("### ⚙️ Generation Settings")

            system_prompt = gr.Textbox(
                label="System Prompt (Optional)",
                placeholder="You are a helpful AI assistant...",
                lines=3,
                value=""
            )
            max_new_tokens = gr.Slider(
                minimum=50,
                maximum=4096,
                value=1024,
                step=1,
                label="Max New Tokens",
                info="Maximum length of generated response"
            )
            temperature = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.7,
                step=0.05,
                label="Temperature",
                info="Higher = more creative, Lower = more focused"
            )
            top_p = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top P (Nucleus Sampling)",
                info="Probability threshold for token selection"
            )
            top_k = gr.Slider(
                minimum=0,
                maximum=100,
                value=50,
                step=1,
                label="Top K",
                info="Number of top tokens to consider (0 = disabled)"
            )
            repetition_penalty = gr.Slider(
                minimum=1.0,
                maximum=2.0,
                value=1.1,
                step=0.05,
                label="Repetition Penalty",
                info="Penalty for repeating tokens"
            )
        with gr.Column(scale=2):
            gr.Markdown("### 💬 Chat Interface")

            chatbot = gr.Chatbot(
                height=500,
                label="Conversation",
                show_copy_button=True,
                avatar_images=["👤", "🤖"]
            )

            with gr.Row():
                msg = gr.Textbox(
                    label="Your Message",
                    placeholder="Type your message here...",
                    lines=3,
                    scale=4
                )
                send_btn = gr.Button("📤 Send", variant="primary", scale=1)

            with gr.Row():
                clear_btn = gr.Button("🗑️ Clear Chat")
                retry_btn = gr.Button("🔄 Retry Last")

            gr.Markdown("""
            ### 📝 Usage Tips:
            - First, click **"Load Model"** to initialize the model (takes 2-5 minutes)
            - Use the **System Prompt** to set the assistant's behavior
            - Adjust **Temperature** for creativity (0.7-1.0 recommended)
            - Lower **Top P** for more focused responses
            - Clear chat to start a new conversation
            """)
    # Event handlers
    load_btn.click(
        fn=inferencer.load_model,
        outputs=load_status
    )

    def user_message(user_msg, history):
        return "", history + [[user_msg, None]]

    def bot_response(history, system_prompt, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
        # Guard against an empty chat (e.g. "Retry Last" before any message was sent)
        if not history:
            return history
        user_msg = history[-1][0]
        bot_msg = inferencer.generate_response(
            user_msg,
            history[:-1],
            system_prompt,
            max_new_tokens,
            temperature,
            top_p,
            top_k,
            repetition_penalty
        )
        history[-1][1] = bot_msg
        return history
    # Send message
    msg.submit(
        user_message,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_response,
        [chatbot, system_prompt, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
        chatbot
    )

    send_btn.click(
        user_message,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot_response,
        [chatbot, system_prompt, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
        chatbot
    )

    # Clear chat
    clear_btn.click(lambda: None, None, chatbot, queue=False)

    # Retry last message
    def retry_last(history):
        if history:
            history[-1][1] = None
        return history

    retry_btn.click(
        retry_last,
        chatbot,
        chatbot,
        queue=False
    ).then(
        bot_response,
        [chatbot, system_prompt, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
        chatbot
    )
    gr.Markdown("""
    ---
    **Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)

    **Base Model:** [moonshotai/Kimi-Linear-48B-A3B-Instruct](https://huggingface.co/moonshotai/Kimi-Linear-48B-A3B-Instruct)

    Fine-tuned with ❤️ using QLoRA
    """)
# Launch
if __name__ == "__main__":
    demo.queue(max_size=10)
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
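# Local usage sketch (an assumption; the Space platform normally runs this file itself):
#   pip install torch transformers accelerate gradio
#   python app.py        # then open http://localhost:7860
# Click "Load Model" first; generation needs enough GPU memory to hold the 48B weights in bfloat16.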