Spaces:

optiviseapp
/

fnmodel

Paused

File size: 19,624 Bytes

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import subprocess
import json
from datetime import datetime
import time

# Set environment variables for flash-linear-attention and memory management
os.environ["FLA_USE_TRITON"] = "1"
os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"  # Updated from PYTORCH_CUDA_ALLOC_CONF

# Model configuration
MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"

class ChatBot:
    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.loaded = False
    
    def _create_status_table(self, tasks, status="⏳ Waiting", results=None):
        """Create a markdown table showing evaluation status"""
        table = "## 📊 Evaluation Progress\n\n"
        table += "| Benchmark | Status | Score | Details |\n"
        table += "|-----------|--------|-------|----------|\n"
        
        for task in tasks:
            task_status = status
            task_score = "-"
            task_details = ""
            
            if results and task in results:
                task_status = "✅ Complete"
                if task == "ARC-Challenge" and "arc_challenge" in results[task]:
                    score_data = results[task]["arc_challenge"]
                    task_score = f"{score_data.get('acc_norm', 0):.2%}"
                    task_details = f"acc: {score_data.get('acc', 0):.2%}"
                elif task == "TruthfulQA" and "truthfulqa_mc2" in results[task]:
                    score_data = results[task]["truthfulqa_mc2"]
                    task_score = f"{score_data.get('acc', 0):.2%}"
                elif task == "Winogrande" and "winogrande" in results[task]:
                    score_data = results[task]["winogrande"]
                    task_score = f"{score_data.get('acc', 0):.2%}"
            
            table += f"| {task} | {task_status} | {task_score} | {task_details} |\n"
        
        return table
    
    def load_model(self):
        if self.loaded:
            return "✅ Model already loaded!"
        
        try:
            yield "🔄 Loading tokenizer..."
            self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
            
            yield "🔄 Loading model (this takes 5-10 minutes)...\n\nThe 48B model is being distributed across 4 GPUs..."
            
            # Configure memory for 4 GPUs
            num_gpus = torch.cuda.device_count()
            max_memory = {i: f"{int(23)}GB" for i in range(num_gpus)}  # L4 has 24GB, leave 1GB
            
            self.model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                torch_dtype=torch.bfloat16,
                device_map="balanced",
                max_memory=max_memory,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
                attn_implementation="eager",
            )
            
            self.model.eval()
            
            # Patch model config to avoid flash attention issues
            if hasattr(self.model.config, '_attn_implementation'):
                self.model.config._attn_implementation = "eager"
            if hasattr(self.model.config, 'attn_implementation'):
                self.model.config.attn_implementation = "eager"
            
            self.loaded = True
            
            # Get GPU distribution info
            if hasattr(self.model, 'hf_device_map'):
                device_info = "\n\n**GPU Distribution:**\n"
                devices = {}
                for name, device in self.model.hf_device_map.items():
                    if device not in devices:
                        devices[device] = 0
                    devices[device] += 1
                for device, count in devices.items():
                    device_info += f"- {device}: {count} layers\n"
            else:
                device_info = ""
            
            yield f"✅ **Model loaded successfully!**{device_info}\n\nYou can now use the Evaluation tab."
            
        except Exception as e:
            self.loaded = False
            yield f"❌ **Error loading model:**\n\n{str(e)}"
    
    def chat(self, message, history, system_prompt, max_tokens, temperature, top_p):
        if not self.loaded:
            return "❌ Please load the model first by clicking the 'Load Model' button in Controls."
        
        try:
            # Build prompt from history
            conversation = []
            if system_prompt.strip():
                conversation.append(f"System: {system_prompt}")
            
            for user_msg, bot_msg in history:
                conversation.append(f"User: {user_msg}")
                if bot_msg:
                    conversation.append(f"Assistant: {bot_msg}")
            
            conversation.append(f"User: {message}")
            conversation.append("Assistant:")
            
            prompt = "\n".join(conversation)
            
            # Tokenize
            inputs = self.tokenizer(prompt, return_tensors="pt")
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
            
            # Generate
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=temperature > 0,
                    pad_token_id=self.tokenizer.eos_token_id,
                    use_cache=True,
                )
            
            # Decode
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            
            # Extract assistant response
            if "Assistant:" in response:
                response = response.split("Assistant:")[-1].strip()
            
            return response
            
        except Exception as e:
            return f"❌ Error: {str(e)}"
    
    def run_evaluation(self, tasks_to_run):
        """Run lm_eval on selected tasks"""
        # Note: We don't strictly require the model to be loaded first
        # since we'll be unloading it anyway. The load step is just for verification.
        
        try:
            # Map friendly names to lm_eval task names
            task_map = {
                "ARC-Challenge": "arc_challenge",
                "TruthfulQA": "truthfulqa_mc2",
                "Winogrande": "winogrande"
            }
            
            selected_tasks = [task_map[t] for t in tasks_to_run]
            task_string = ",".join(selected_tasks)
            
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_dir = f"/tmp/eval_results_{timestamp}"
            
            # Initial status table
            status_table = self._create_status_table(tasks_to_run, "⏳ Preparing")
            logs = "🔄 **Preparing for evaluation...**\n\nTasks: " + ", ".join(tasks_to_run) + "\n\n"
            yield status_table, logs
            
            # IMPORTANT: Clean up any loaded model to free VRAM for lm_eval
            if self.loaded and self.model is not None:
                logs += "🔄 **Unloading model to free VRAM...**\n\n"
                yield status_table, logs
                
                if self.model is not None:
                    del self.model
                    self.model = None
                if self.tokenizer is not None:
                    del self.tokenizer
                    self.tokenizer = None
                
                self.loaded = False
            else:
                logs += "🔄 **Cleaning up memory...**\n\n"
                yield status_table, logs
            
            # Aggressive memory cleanup
            import gc
            for _ in range(3):
                gc.collect()
            
            if torch.cuda.is_available():
                for i in range(torch.cuda.device_count()):
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize(device=i)
                    torch.cuda.reset_peak_memory_stats(device=i)
                    torch.cuda.reset_accumulated_memory_stats(device=i)
            
            # Wait for memory to be fully released
            logs += "🔄 **Waiting for memory cleanup (5s)...**\n\n"
            yield status_table, logs
            time.sleep(5)
            
            # Final garbage collection
            gc.collect()
            
            status_table = self._create_status_table(tasks_to_run, "🔄 Loading Model")
            logs += "✅ **Memory cleared! Starting evaluation...**\n\n"
            logs += f"⏱️ Estimated time: 30-60 minutes\n\n"
            yield status_table, logs
            
            # Run lm_eval with optimized memory settings
            # Note: We use parallelize=True to distribute across GPUs instead of device_map in model_args
            # We need to install flash-attn for this model to work properly
            cmd = [
                "lm_eval",
                "--model", "hf",
                "--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,low_cpu_mem_usage=True,parallelize=True",
                "--tasks", task_string,
                "--batch_size", "1",  # Reduced to minimize memory usage
                "--output_path", output_dir,
                "--log_samples"
            ]
            
            status_table = self._create_status_table(tasks_to_run, "🔄 Running")
            logs += f"🔄 **Running lm_eval...**\n\nCommand: `{' '.join(cmd)}`\n\n"
            logs += "---\n\n### 📜 Live Logs (last 15 lines):\n\n```\n"
            yield status_table, logs
            
            # Run evaluation
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1
            )
            
            output_lines = []
            log_update_counter = 0
            for line in process.stdout:
                output_lines.append(line)
                log_update_counter += 1
                
                # Update every 5 lines to reduce UI flickering
                if log_update_counter % 5 == 0:
                    recent = ''.join(output_lines[-15:])
                    current_logs = logs + recent + "\n```"
                    yield status_table, current_logs
            
            process.wait()
            
            if process.returncode != 0:
                status_table = self._create_status_table(tasks_to_run, "❌ Failed")
                error_logs = logs + ''.join(output_lines[-50:]) + "\n```\n\n"
                error_logs += f"❌ **Evaluation failed!**\n\nExit code: {process.returncode}\n"
                yield status_table, error_logs
                return
            
            # Read results
            results_file = os.path.join(output_dir, "results.json")
            if os.path.exists(results_file):
                with open(results_file, 'r') as f:
                    results = json.load(f)
                
                # Parse results for status table
                parsed_results = {}
                for task in tasks_to_run:
                    task_key = task_map[task]
                    if task_key in results['results']:
                        parsed_results[task] = {task_key: results['results'][task_key]}
                
                # Update status table with results
                status_table = self._create_status_table(tasks_to_run, "✅ Complete", parsed_results)
                
                # Format detailed results
                result_logs = "✅ **Evaluation Complete!**\n\n"
                result_logs += f"**Timestamp:** {timestamp}\n\n"
                result_logs += "## 📊 Detailed Results:\n\n"
                
                for task in selected_tasks:
                    if task in results['results']:
                        task_results = results['results'][task]
                        result_logs += f"### {task}\n"
                        for metric, value in task_results.items():
                            if isinstance(value, float):
                                result_logs += f"- **{metric}:** {value:.4f}\n"
                            else:
                                result_logs += f"- **{metric}:** {value}\n"
                        result_logs += "\n"
                
                # Add summary if available
                if 'summary' in results:
                    result_logs += "## 📈 Summary:\n\n"
                    for metric, value in results['summary'].items():
                        if isinstance(value, float):
                            result_logs += f"- **{metric}:** {value:.4f}\n"
                        else:
                            result_logs += f"- **{metric}:** {value}\n"
                
                result_logs += f"\n\n**Full results saved to:** `{output_dir}`"
                
                yield status_table, result_logs
            else:
                status_table = self._create_status_table(tasks_to_run, "⚠️ Unknown")
                warning_logs = f"⚠️ **Evaluation completed but results file not found.**\n\nOutput:\n```\n{''.join(output_lines[-30:])}\n```"
                yield status_table, warning_logs
                
        except Exception as e:
            status_table = self._create_status_table(tasks_to_run if 'tasks_to_run' in locals() else [], "❌ Error")
            error_logs = f"❌ **Evaluation error:**\n\n{str(e)}"
            yield status_table, error_logs

# Initialize
bot = ChatBot()

# UI with Tabs
with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned - Evaluation") as demo:
    gr.Markdown("""
    # 📊 Kimi Linear 48B A3B - Evaluation
    
    **Model:** `optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune`
    
    **This Space is configured for model evaluation only. Chat/inference is disabled.**
    """)
    
    # Show GPU info
    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        gpu_name = torch.cuda.get_device_name(0)
        total_vram = sum(torch.cuda.get_device_properties(i).total_memory / 1024**3 for i in range(gpu_count))
        gr.Markdown(f"**Hardware:** {gpu_count}x {gpu_name} ({total_vram:.0f}GB total VRAM)")
    
    with gr.Tabs():
        # Tab 1: Controls (always visible)
        with gr.Tab("🎛️ Controls"):
            gr.Markdown("### Load Model (Optional)")
            load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")
            status = gr.Markdown("**Status:** Model not loaded")
            
            gr.Markdown("""
            ### ℹ️ Instructions
            1. **(Optional)** Click "Load Model" to verify setup (takes 5-10 minutes)
            2. **Go directly to Evaluation tab** to run benchmarks
            
            **Note:** 
            - Chat/inference functionality is currently disabled. This Space focuses on model evaluation only.
            - Loading the model first is optional - you can go straight to the Evaluation tab
            - Any loaded model will be automatically unloaded before evaluation starts to free VRAM for lm_eval.
            """)
        
        # Tab 2: Chat - DISABLED
        # Uncomment this section to re-enable chat functionality
        """
        with gr.Tab("💬 Chat"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### ⚙️ Settings")
                    
                    system_prompt = gr.Textbox(
                        label="System Prompt",
                        placeholder="You are a helpful assistant...",
                        lines=2
                    )
                    
                    max_tokens = gr.Slider(50, 2048, 512, label="Max Tokens", step=1)
                    temperature = gr.Slider(0, 2, 0.7, label="Temperature", step=0.1)
                    top_p = gr.Slider(0, 1, 0.9, label="Top P", step=0.05)
                
                with gr.Column(scale=2):
                    chatbot = gr.Chatbot(height=500, show_copy_button=True)
                    
                    with gr.Row():
                        msg = gr.Textbox(label="Message", placeholder="Type here...", scale=4)
                        send = gr.Button("Send", variant="primary", scale=1)
                    
                    clear = gr.Button("Clear Chat")
        """
        
        # Tab 3: Evaluation
        with gr.Tab("📊 Evaluation"):
            gr.Markdown("""
            ### Run LM Evaluation Harness
            
            Select benchmarks to evaluate your fine-tuned model. **Estimated time: 30-60 minutes total.**
            """)
            
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Select Benchmarks")
                    
                    tasks = gr.CheckboxGroup(
                        choices=["ARC-Challenge", "TruthfulQA", "Winogrande"],
                        value=["ARC-Challenge", "TruthfulQA", "Winogrande"],
                        label="Tasks to Run",
                        info="Select one or more tasks"
                    )
                    
                    eval_btn = gr.Button("🚀 Start Evaluation", variant="primary", size="lg")
                    
                    gr.Markdown("""
                    ### ⏱️ Estimated Time:
                    - **ARC-Challenge:** 15-30 min
                    - **TruthfulQA:** 10-20 min
                    - **Winogrande:** 15-30 min
                    
                    **Total:** ~40-80 minutes for all 3
                    """)
                
                with gr.Column(scale=2):
                    eval_status = gr.Markdown("## 📊 Evaluation Progress\n\nClick '🚀 Start Evaluation' to begin.")
                    eval_logs = gr.Markdown("### 📜 Logs\n\nLogs will appear here during evaluation.")
            
            gr.Markdown("""
            ---
            **Note:** 
            - You can start evaluation immediately - no need to load the model first
            - If you did load the model, it will be automatically unloaded before evaluation to free VRAM
            - lm_eval will load its own fresh instance of the model for evaluation
            - Results will be saved to `/tmp/eval_results_[timestamp]/`
            """)
    
    gr.Markdown("""
    ---
    **Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
    """)
    
    # Events
    load_btn.click(bot.load_model, outputs=status)
    
    # Chat event handlers - DISABLED
    # Uncomment these lines to re-enable chat functionality
    """
    def respond(message, history, system, max_tok, temp, top):
        bot_message = bot.chat(message, history, system, max_tok, temp, top)
        history.append((message, bot_message))
        return history, ""
    
    msg.submit(respond, [msg, chatbot, system_prompt, max_tokens, temperature, top_p], [chatbot, msg])
    send.click(respond, [msg, chatbot, system_prompt, max_tokens, temperature, top_p], [chatbot, msg])
    clear.click(lambda: None, None, chatbot)
    """
    
    # Evaluation event handler
    eval_btn.click(bot.run_evaluation, inputs=tasks, outputs=[eval_status, eval_logs])

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)