Spaces:
Paused
Paused
| import gradio as gr | |
| import torch | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| import os | |
| import subprocess | |
| import json | |
| from datetime import datetime | |
| import time | |
# Environment switches for flash-linear-attention and for CUDA memory
# management; they are exported before any model code runs.
os.environ.update(
    {
        "FLA_USE_TRITON": "1",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    }
)

# Model configuration: identifier passed to `from_pretrained` and to lm_eval.
MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"
class ChatBot:
    """Hold the fine-tuned model/tokenizer and provide the UI callbacks.

    ``load_model`` and ``run_evaluation`` are generators: every yielded string
    is a Markdown status update meant to replace the previous one in the UI.
    ``chat`` returns a single reply string.
    """

    # Benchmark labels offered in the UI -> lm_eval task names.
    TASK_MAP = {
        "ARC-Challenge": "arc_challenge",
        "TruthfulQA": "truthfulqa_mc2",
        "Winogrande": "winogrande",
    }

    def __init__(self):
        # Set by load_model(); reset to None by run_evaluation().
        self.model = None
        self.tokenizer = None
        # True only after load_model() completed without raising.
        self.loaded = False

    def load_model(self):
        """Load tokenizer and model, yielding Markdown progress messages.

        Yields:
            str: status text; the final item reports success (with the
            per-device module count when available) or the error message.
        """
        if self.loaded:
            # This function is a generator, so a value passed to ``return``
            # is never delivered to the consumer -- it has to be yielded.
            yield "✅ Model already loaded!"
            return

        try:
            yield "🔄 Loading tokenizer..."
            self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

            num_gpus = torch.cuda.device_count()
            yield (
                "🔄 Loading model (this takes 5-10 minutes)...\n\n"
                f"The 48B model is being distributed across {num_gpus} GPUs..."
            )

            # Per-GPU cap: L4 has 24GB, leave 1GB of headroom.
            max_memory = {gpu: "23GB" for gpu in range(num_gpus)}
            self.model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                torch_dtype=torch.bfloat16,
                device_map="balanced",
                max_memory=max_memory,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
                attn_implementation="eager",
            )
            self.model.eval()

            # Patch model config to avoid flash attention issues.
            for attr in ("_attn_implementation", "attn_implementation"):
                if hasattr(self.model.config, attr):
                    setattr(self.model.config, attr, "eager")

            self.loaded = True

            # Summarise how many device-map entries landed on each device.
            device_info = ""
            if hasattr(self.model, "hf_device_map"):
                per_device = {}
                for device in self.model.hf_device_map.values():
                    per_device[device] = per_device.get(device, 0) + 1
                device_info = "\n\n**GPU Distribution:**\n" + "".join(
                    f"- {device}: {count} layers\n" for device, count in per_device.items()
                )

            yield f"✅ **Model loaded successfully!**{device_info}\n\nYou can now use the Evaluation tab."
        except Exception as e:  # UI boundary: report the failure instead of crashing
            self.loaded = False
            yield f"❌ **Error loading model:**\n\n{str(e)}"

    def chat(self, message, history, system_prompt, max_tokens, temperature, top_p):
        """Generate one assistant reply for ``message``.

        Args:
            message: the latest user message.
            history: iterable of ``(user_msg, bot_msg)`` pairs from earlier
                turns; ``None`` is treated as empty.
            system_prompt: optional system text; skipped when blank.
            max_tokens: maximum number of new tokens to generate.
            temperature: values > 0 enable sampling; 0 selects greedy decoding.
            top_p: nucleus-sampling threshold, used only when sampling.

        Returns:
            str: the reply text, or an error message if the model is not
            loaded or generation raised.
        """
        if not self.loaded:
            return "❌ Please load the model first by clicking the 'Load Model' button in Controls."

        try:
            # Build a plain-text transcript prompt from the history.
            conversation = []
            if system_prompt and system_prompt.strip():
                conversation.append(f"System: {system_prompt}")
            for user_msg, bot_msg in history or []:
                conversation.append(f"User: {user_msg}")
                if bot_msg:
                    conversation.append(f"Assistant: {bot_msg}")
            conversation.append(f"User: {message}")
            conversation.append("Assistant:")
            prompt = "\n".join(conversation)

            inputs = self.tokenizer(prompt, return_tensors="pt")
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
            prompt_len = inputs["input_ids"].shape[-1]

            gen_kwargs = {
                "max_new_tokens": int(max_tokens),
                "pad_token_id": self.tokenizer.eos_token_id,
                "use_cache": True,
                "do_sample": temperature > 0,
            }
            if gen_kwargs["do_sample"]:
                # Sampling knobs are only meaningful (and only accepted
                # without warnings) when sampling is enabled.
                gen_kwargs["temperature"] = temperature
                gen_kwargs["top_p"] = top_p

            with torch.no_grad():
                outputs = self.model.generate(**inputs, **gen_kwargs)

            # The returned sequence starts with the prompt tokens; decode only
            # what was generated so a reply that itself contains the text
            # "Assistant:" is not cut off by string splitting.
            new_tokens = outputs[0][prompt_len:]
            return self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
        except Exception as e:  # UI boundary: surface the error as the reply
            return f"❌ Error: {str(e)}"

    def _unload_model(self):
        """Drop the references to the model and tokenizer."""
        self.model = None
        self.tokenizer = None
        self.loaded = False

    @staticmethod
    def _release_memory():
        """Run several garbage-collection passes and empty the CUDA caches."""
        import gc

        for _ in range(3):
            gc.collect()
            if torch.cuda.is_available():
                for i in range(torch.cuda.device_count()):
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize(device=i)
                    torch.cuda.reset_peak_memory_stats(device=i)
                    torch.cuda.reset_accumulated_memory_stats(device=i)

    @staticmethod
    def _find_results_file(output_dir):
        """Return the path of the results JSON under ``output_dir`` or None.

        Checks ``results.json`` directly in ``output_dir`` first, then falls
        back to the newest ``results*.json`` anywhere below it, which covers
        the ``<model>/results_<timestamp>.json`` layout.
        """
        import glob

        direct = os.path.join(output_dir, "results.json")
        if os.path.isfile(direct):
            return direct
        pattern = os.path.join(output_dir, "**", "results*.json")
        found = [p for p in glob.glob(pattern, recursive=True) if os.path.isfile(p)]
        return max(found, key=os.path.getmtime) if found else None

    @staticmethod
    def _format_metrics(metrics):
        """Render a metric mapping as Markdown bullets (floats to 4 decimals)."""
        lines = []
        for metric, value in metrics.items():
            shown = f"{value:.4f}" if isinstance(value, float) else f"{value}"
            lines.append(f"- **{metric}:** {shown}\n")
        return "".join(lines)

    def _format_results(self, results, selected_tasks, timestamp, output_dir):
        """Build the final Markdown report from the parsed results JSON."""
        parts = [
            "✅ **Evaluation Complete!**\n\n",
            f"**Timestamp:** {timestamp}\n\n",
            "## 📊 Results:\n\n",
        ]
        task_results = results.get("results", {})
        for task in selected_tasks:
            if task in task_results:
                parts.append(f"### {task}\n")
                parts.append(self._format_metrics(task_results[task]))
                parts.append("\n")
        # Add summary if available.
        if "summary" in results:
            parts.append("## 📈 Summary:\n\n")
            parts.append(self._format_metrics(results["summary"]))
        parts.append(f"\n\n**Full results saved to:** `{output_dir}`")
        return "".join(parts)

    def run_evaluation(self, tasks_to_run):
        """Run lm_eval on the selected tasks, yielding Markdown status updates.

        The model does not need to be loaded first; any loaded instance is
        released because lm_eval is started as a subprocess that loads its own.

        Args:
            tasks_to_run: list of UI labels (keys of ``TASK_MAP``).

        Yields:
            str: progress text while running, then the formatted results or
            an error/warning message.
        """
        import gc

        # Validate up front so a bad selection costs nothing.
        if not tasks_to_run:
            yield "⚠️ **No tasks selected.**\n\nPlease select at least one benchmark to run."
            return
        unknown = [str(t) for t in tasks_to_run if t not in self.TASK_MAP]
        if unknown:
            yield f"❌ **Evaluation error:**\n\nUnknown task(s): {', '.join(unknown)}"
            return

        try:
            selected_tasks = [self.TASK_MAP[t] for t in tasks_to_run]
            task_string = ",".join(selected_tasks)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_dir = f"/tmp/eval_results_{timestamp}"

            yield f"🔄 **Preparing for evaluation...**\n\nTasks: {', '.join(tasks_to_run)}\n\n"

            # IMPORTANT: clean up any loaded model to free VRAM for lm_eval.
            if self.loaded and self.model is not None:
                yield (
                    "🔄 **Unloading model to free VRAM...**\n\n"
                    "This is necessary because lm_eval will load its own instance.\n\n"
                )
                self._unload_model()
            else:
                yield "🔄 **Cleaning up memory...**\n\nPreparing environment for evaluation.\n\n"

            self._release_memory()

            # Wait for memory to be fully released.
            yield "🔄 **Waiting for memory cleanup...**\n\nGiving the system time to fully release VRAM.\n\n"
            time.sleep(5)
            gc.collect()

            yield "✅ **Memory cleared! Starting evaluation...**\n\nThis will take 30-60 minutes total.\n\n"

            cmd = [
                "lm_eval",
                "--model", "hf",
                "--model_args",
                f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,"
                "device_map=auto,low_cpu_mem_usage=True",
                "--tasks", task_string,
                "--batch_size", "1",  # Reduced to minimize memory usage
                "--output_path", output_dir,
                "--log_samples",
            ]

            yield f"🚀 **Running lm_eval...**\n\nCommand: `{' '.join(cmd)}`\n\nProgress will update below...\n\n"

            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,  # interleave stderr into the same log
                text=True,
                bufsize=1,  # line-buffered so progress shows up promptly
            )

            output_lines = []
            # Close the pipe deterministically once the stream is exhausted
            # (or this generator is closed early).
            with process.stdout:
                for line in process.stdout:
                    output_lines.append(line)
                    recent = "".join(output_lines[-20:])  # show last 20 lines
                    yield f"🔄 **Running evaluation...**\n\n```\n{recent}\n```"
            process.wait()

            if process.returncode != 0:
                yield (
                    f"❌ **Evaluation failed!**\n\nExit code: {process.returncode}\n\n"
                    f"Logs:\n```\n{''.join(output_lines[-50:])}\n```"
                )
                return

            results_file = self._find_results_file(output_dir)
            if results_file is None:
                yield (
                    "⚠️ **Evaluation completed but results file not found.**\n\n"
                    f"Output:\n```\n{''.join(output_lines[-30:])}\n```"
                )
                return

            with open(results_file, "r", encoding="utf-8") as f:
                results = json.load(f)
            yield self._format_results(results, selected_tasks, timestamp, output_dir)
        except Exception as e:  # UI boundary: report instead of crashing the app
            yield f"❌ **Evaluation error:**\n\n{str(e)}"
# Single shared ChatBot instance used by every UI callback below.
bot = ChatBot()

# UI with Tabs
with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned - Evaluation") as demo:
    gr.Markdown("""
    # 📊 Kimi Linear 48B A3B - Evaluation
    **Model:** `optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune`
    **This Space is configured for model evaluation only. Chat/inference is disabled.**
    """)

    # Show GPU info (count, name of device 0, summed VRAM in GiB).
    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        gpu_name = torch.cuda.get_device_name(0)
        total_vram = sum(
            torch.cuda.get_device_properties(i).total_memory / 1024**3
            for i in range(gpu_count)
        )
        gr.Markdown(f"**Hardware:** {gpu_count}x {gpu_name} ({total_vram:.0f}GB total VRAM)")

    with gr.Tabs():
        # Tab 1: Controls (always visible)
        with gr.Tab("🎛️ Controls"):
            gr.Markdown("### Load Model (Optional)")
            load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")
            status = gr.Markdown("**Status:** Model not loaded")
            gr.Markdown("""
            ### ℹ️ Instructions
            1. **(Optional)** Click "Load Model" to verify setup (takes 5-10 minutes)
            2. **Go directly to Evaluation tab** to run benchmarks
            **Note:**
            - Chat/inference functionality is currently disabled. This Space focuses on model evaluation only.
            - Loading the model first is optional - you can go straight to the Evaluation tab
            - Any loaded model will be automatically unloaded before evaluation starts to free VRAM for lm_eval.
            """)

        # Tab 2: Chat - DISABLED
        # Uncomment this section to re-enable chat functionality
        #
        # with gr.Tab("💬 Chat"):
        #     with gr.Row():
        #         with gr.Column(scale=1):
        #             gr.Markdown("### ⚙️ Settings")
        #             system_prompt = gr.Textbox(
        #                 label="System Prompt",
        #                 placeholder="You are a helpful assistant...",
        #                 lines=2
        #             )
        #             max_tokens = gr.Slider(50, 2048, 512, label="Max Tokens", step=1)
        #             temperature = gr.Slider(0, 2, 0.7, label="Temperature", step=0.1)
        #             top_p = gr.Slider(0, 1, 0.9, label="Top P", step=0.05)
        #         with gr.Column(scale=2):
        #             chatbot = gr.Chatbot(height=500, show_copy_button=True)
        #             with gr.Row():
        #                 msg = gr.Textbox(label="Message", placeholder="Type here...", scale=4)
        #                 send = gr.Button("Send", variant="primary", scale=1)
        #             clear = gr.Button("Clear Chat")

        # Tab 3: Evaluation
        with gr.Tab("📊 Evaluation"):
            gr.Markdown("""
            ### Run LM Evaluation Harness
            Select benchmarks to evaluate your fine-tuned model. **Estimated time: 40-80 minutes for all three.**
            """)
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Select Benchmarks")
                    tasks = gr.CheckboxGroup(
                        choices=["ARC-Challenge", "TruthfulQA", "Winogrande"],
                        value=["ARC-Challenge", "TruthfulQA", "Winogrande"],
                        label="Tasks to Run",
                        info="Select one or more tasks",
                    )
                    eval_btn = gr.Button("🚀 Start Evaluation", variant="primary", size="lg")
                    gr.Markdown("""
                    ### ⏱️ Estimated Time:
                    - **ARC-Challenge:** 15-30 min
                    - **TruthfulQA:** 10-20 min
                    - **Winogrande:** 15-30 min
                    **Total:** ~40-80 minutes for all 3
                    """)
                with gr.Column(scale=2):
                    eval_results = gr.Markdown("Results will appear here after evaluation completes.")
            gr.Markdown("""
            ---
            **Note:**
            - You can start evaluation immediately - no need to load the model first
            - If you did load the model, it will be automatically unloaded before evaluation to free VRAM
            - lm_eval will load its own fresh instance of the model for evaluation
            - Results will be saved to `/tmp/eval_results_[timestamp]/`
            """)

    gr.Markdown("""
    ---
    **Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
    """)

    # Events (must be registered inside the Blocks context).
    # load_model is a generator, so `status` is updated on every yield.
    load_btn.click(bot.load_model, outputs=status)

    # Chat event handlers - DISABLED
    # Uncomment these lines to re-enable chat functionality
    #
    # def respond(message, history, system, max_tok, temp, top):
    #     bot_message = bot.chat(message, history, system, max_tok, temp, top)
    #     history.append((message, bot_message))
    #     return history, ""
    #
    # msg.submit(respond, [msg, chatbot, system_prompt, max_tokens, temperature, top_p], [chatbot, msg])
    # send.click(respond, [msg, chatbot, system_prompt, max_tokens, temperature, top_p], [chatbot, msg])
    # clear.click(lambda: None, None, chatbot)

    # Evaluation event handler: streams run_evaluation's yielded Markdown.
    eval_btn.click(bot.run_evaluation, inputs=tasks, outputs=eval_results)

if __name__ == "__main__":
    # Both button callbacks are generators; older Gradio releases only stream
    # generator output when the queue is enabled, and enabling it explicitly
    # is harmless on releases where it is already the default.
    demo.queue()
    # No `share=True`: a Space is already served publicly, share tunnels are
    # not supported there, and elsewhere it would expose the app by default.
    demo.launch(server_name="0.0.0.0", server_port=7860)