import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import subprocess
import json
from datetime import datetime
import time

# Set environment variables for flash-linear-attention and memory management
os.environ["FLA_USE_TRITON"] = "1"
os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"  # Updated from PYTORCH_CUDA_ALLOC_CONF

# Model configuration
MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"


class ChatBot:
    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.loaded = False

    def _create_status_table(self, tasks, status="⏳ Waiting", results=None):
        """Create a markdown table showing evaluation status"""
        table = "## 📊 Evaluation Progress\n\n"
        table += "| Benchmark | Status | Score | Details |\n"
        table += "|-----------|--------|-------|----------|\n"
        for task in tasks:
            task_status = status
            task_score = "-"
            task_details = ""
            if results and task in results:
                task_status = "✅ Complete"
                if task == "ARC-Challenge" and "arc_challenge" in results[task]:
                    score_data = results[task]["arc_challenge"]
                    task_score = f"{score_data.get('acc_norm', 0):.2%}"
                    task_details = f"acc: {score_data.get('acc', 0):.2%}"
                elif task == "TruthfulQA" and "truthfulqa_mc2" in results[task]:
                    score_data = results[task]["truthfulqa_mc2"]
                    task_score = f"{score_data.get('acc', 0):.2%}"
                elif task == "Winogrande" and "winogrande" in results[task]:
                    score_data = results[task]["winogrande"]
                    task_score = f"{score_data.get('acc', 0):.2%}"
            table += f"| {task} | {task_status} | {task_score} | {task_details} |\n"
        return table

    def load_model(self):
        """Load the model and tokenizer, streaming status messages to the UI."""
        if self.loaded:
            # This is a generator, so the message must be yielded, not returned
            yield "✅ Model already loaded!"
            return
        try:
            yield "🔄 Loading tokenizer..."
            self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

            yield "🔄 Loading model (this takes 5-10 minutes)...\n\nThe 48B model is being distributed across 4 GPUs..."

            # Configure memory for 4 GPUs
            num_gpus = torch.cuda.device_count()
            max_memory = {i: "23GB" for i in range(num_gpus)}  # L4 has 24GB, leave 1GB headroom

            self.model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                torch_dtype=torch.bfloat16,
                device_map="balanced",
                max_memory=max_memory,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
                attn_implementation="eager",
            )
            self.model.eval()

            # Patch model config to avoid flash attention issues
            if hasattr(self.model.config, '_attn_implementation'):
                self.model.config._attn_implementation = "eager"
            if hasattr(self.model.config, 'attn_implementation'):
                self.model.config.attn_implementation = "eager"

            self.loaded = True

            # Get GPU distribution info
            if hasattr(self.model, 'hf_device_map'):
                device_info = "\n\n**GPU Distribution:**\n"
                devices = {}
                for name, device in self.model.hf_device_map.items():
                    if device not in devices:
                        devices[device] = 0
                    devices[device] += 1
                for device, count in devices.items():
                    device_info += f"- {device}: {count} layers\n"
            else:
                device_info = ""

            yield f"✅ **Model loaded successfully!**{device_info}\n\nYou can now use the Evaluation tab."
        except Exception as e:
            self.loaded = False
            yield f"❌ **Error loading model:**\n\n{str(e)}"

    def chat(self, message, history, system_prompt, max_tokens, temperature, top_p):
        if not self.loaded:
            return "❌ Please load the model first by clicking the 'Load Model' button in Controls."
        try:
            # Build prompt from history
            conversation = []
            if system_prompt.strip():
                conversation.append(f"System: {system_prompt}")
            for user_msg, bot_msg in history:
                conversation.append(f"User: {user_msg}")
                if bot_msg:
                    conversation.append(f"Assistant: {bot_msg}")
            conversation.append(f"User: {message}")
            conversation.append("Assistant:")
            prompt = "\n".join(conversation)

            # Tokenize
            inputs = self.tokenizer(prompt, return_tensors="pt")
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

            # Generate
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=temperature > 0,
                    pad_token_id=self.tokenizer.eos_token_id,
                    use_cache=True,
                )

            # Decode
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Extract assistant response
            if "Assistant:" in response:
                response = response.split("Assistant:")[-1].strip()
            return response
        except Exception as e:
            return f"❌ Error: {str(e)}"

    def run_evaluation(self, tasks_to_run):
        """Run lm_eval on selected tasks"""
        # Note: We don't strictly require the model to be loaded first
        # since we'll be unloading it anyway. The load step is just for verification.
        try:
            # Map friendly names to lm_eval task names
            task_map = {
                "ARC-Challenge": "arc_challenge",
                "TruthfulQA": "truthfulqa_mc2",
                "Winogrande": "winogrande"
            }
            selected_tasks = [task_map[t] for t in tasks_to_run]
            task_string = ",".join(selected_tasks)

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_dir = f"/tmp/eval_results_{timestamp}"

            # Initial status table
            status_table = self._create_status_table(tasks_to_run, "⏳ Preparing")
            logs = "🔄 **Preparing for evaluation...**\n\nTasks: " + ", ".join(tasks_to_run) + "\n\n"
            yield status_table, logs

            # IMPORTANT: Clean up any loaded model to free VRAM for lm_eval
            if self.loaded and self.model is not None:
                logs += "🔄 **Unloading model to free VRAM...**\n\n"
                yield status_table, logs
                if self.model is not None:
                    del self.model
                    self.model = None
                if self.tokenizer is not None:
                    del self.tokenizer
                    self.tokenizer = None
                self.loaded = False
            else:
                logs += "🧹 **Cleaning up memory...**\n\n"
                yield status_table, logs

            # Aggressive memory cleanup
            import gc
            for _ in range(3):
                gc.collect()

            if torch.cuda.is_available():
                for i in range(torch.cuda.device_count()):
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize(device=i)
                    torch.cuda.reset_peak_memory_stats(device=i)
                    torch.cuda.reset_accumulated_memory_stats(device=i)

            # Wait for memory to be fully released
            logs += "🕒 **Waiting for memory cleanup (5s)...**\n\n"
            yield status_table, logs
            time.sleep(5)

            # Final garbage collection
            gc.collect()

            status_table = self._create_status_table(tasks_to_run, "🔄 Loading Model")
            logs += "✅ **Memory cleared! Starting evaluation...**\n\n"
            logs += "⏱️ Estimated time: 30-60 minutes\n\n"
            yield status_table, logs

            # Create a fake flash_attn package to avoid import errors; it falls back to
            # standard PyTorch attention. The package directory must be named exactly
            # "flash_attn" so that `import flash_attn` resolves to it, so it lives inside
            # a timestamped parent directory that is added to PYTHONPATH below.
            fake_pkg_root = f"/tmp/flash_attn_{timestamp}"
            fake_flash_dir = os.path.join(fake_pkg_root, "flash_attn")
            os.makedirs(fake_flash_dir, exist_ok=True)
            with open(os.path.join(fake_flash_dir, "__init__.py"), 'w') as f:
                f.write("""
# Fake flash_attn module that falls back to standard PyTorch attention
import torch

def flash_attn_func(q, k, v, dropout_p=0.0, softmax_scale=None, causal=False, **kwargs):
    '''Fallback to standard PyTorch attention (slower but works without flash-attn)'''
    if softmax_scale is None:
        softmax_scale = 1.0 / (q.size(-1) ** 0.5)
    # Standard attention: softmax(Q @ K.T) @ V
    attn_weights = torch.matmul(q, k.transpose(-2, -1)) * softmax_scale
    if causal:
        seq_len = attn_weights.size(-1)
        causal_mask = torch.triu(torch.ones(seq_len, seq_len, device=attn_weights.device), diagonal=1).bool()
        attn_weights = attn_weights.masked_fill(causal_mask, float('-inf'))
    attn_weights = torch.softmax(attn_weights, dim=-1)
    if dropout_p > 0:
        attn_weights = torch.nn.functional.dropout(attn_weights, p=dropout_p)
    output = torch.matmul(attn_weights, v)
    return output, None  # Return None for attention weights

def flash_attn_varlen_func(*args, **kwargs):
    return flash_attn_func(*args, **kwargs)

__version__ = "2.5.0"
""")

            # Add the fake package's parent directory to this process's path
            # (the evaluation subprocess itself relies on the PYTHONPATH set below)
            import sys
            if fake_pkg_root not in sys.path:
                sys.path.insert(0, fake_pkg_root)

            # Set PYTHONPATH environment variable so the subprocess can find the fake flash_attn
            env = os.environ.copy()
            pythonpath = env.get('PYTHONPATH', '')
            env['PYTHONPATH'] = f"{fake_pkg_root}:{pythonpath}" if pythonpath else fake_pkg_root

            logs += "⚠️ **Note:** Using fallback PyTorch attention (slower than flash-attn)\n\n"
            yield status_table, logs

            # Run lm_eval
            cmd = [
                "lm_eval",
                "--model", "hf",
                "--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,low_cpu_mem_usage=True,parallelize=True",
                "--tasks", task_string,
                "--batch_size", "1",
                "--output_path", output_dir,
                "--log_samples"
            ]

            status_table = self._create_status_table(tasks_to_run, "🏃 Running")
            logs += f"🚀 **Running lm_eval...**\n\nCommand: `{' '.join(cmd)}`\n\n"
            logs += "---\n\n### 📋 Live Logs (last 15 lines):\n\n```\n"
            yield status_table, logs

            # Run evaluation with custom environment
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1,
                env=env  # Pass custom environment with PYTHONPATH
            )

            output_lines = []
            log_update_counter = 0
            for line in process.stdout:
                output_lines.append(line)
                log_update_counter += 1
                # Update every 5 lines to reduce UI flickering
                if log_update_counter % 5 == 0:
                    recent = ''.join(output_lines[-15:])
                    current_logs = logs + recent + "\n```"
                    yield status_table, current_logs

            process.wait()

            if process.returncode != 0:
                status_table = self._create_status_table(tasks_to_run, "❌ Failed")
                error_logs = logs + ''.join(output_lines[-50:]) + "\n```\n\n"
                error_logs += f"❌ **Evaluation failed!**\n\nExit code: {process.returncode}\n"
                yield status_table, error_logs
                return
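
            # Illustrative note (added for clarity, not part of the original logic):
            # the parsing below expects lm_eval to have written a results.json of roughly
            # this shape; the exact layout and file location can vary between
            # lm-evaluation-harness versions (newer releases may nest results under a
            # model-specific subdirectory, in which case the `else` branch below warns):
            #
            #   {
            #     "results": {
            #       "arc_challenge":  {"acc": <float>, "acc_norm": <float>, ...},
            #       "truthfulqa_mc2": {"acc": <float>, ...},
            #       "winogrande":     {"acc": <float>, ...}
            #     },
            #     ...
            #   }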

            # Read results
            results_file = os.path.join(output_dir, "results.json")
            if os.path.exists(results_file):
                with open(results_file, 'r') as f:
                    results = json.load(f)

                # Parse results for status table
                parsed_results = {}
                for task in tasks_to_run:
                    task_key = task_map[task]
                    if task_key in results['results']:
                        parsed_results[task] = {task_key: results['results'][task_key]}

                # Update status table with results
                status_table = self._create_status_table(tasks_to_run, "✅ Complete", parsed_results)

                # Format detailed results
                result_logs = "✅ **Evaluation Complete!**\n\n"
                result_logs += f"**Timestamp:** {timestamp}\n\n"
                result_logs += "## 📊 Detailed Results:\n\n"
                for task in selected_tasks:
                    if task in results['results']:
                        task_results = results['results'][task]
                        result_logs += f"### {task}\n"
                        for metric, value in task_results.items():
                            if isinstance(value, float):
                                result_logs += f"- **{metric}:** {value:.4f}\n"
                            else:
                                result_logs += f"- **{metric}:** {value}\n"
                        result_logs += "\n"

                # Add summary if available
                if 'summary' in results:
                    result_logs += "## 📈 Summary:\n\n"
                    for metric, value in results['summary'].items():
                        if isinstance(value, float):
                            result_logs += f"- **{metric}:** {value:.4f}\n"
                        else:
                            result_logs += f"- **{metric}:** {value}\n"

                result_logs += f"\n\n**Full results saved to:** `{output_dir}`"
                yield status_table, result_logs
            else:
                status_table = self._create_status_table(tasks_to_run, "⚠️ Unknown")
                warning_logs = f"⚠️ **Evaluation completed but results file not found.**\n\nOutput:\n```\n{''.join(output_lines[-30:])}\n```"
                yield status_table, warning_logs

        except Exception as e:
            status_table = self._create_status_table(tasks_to_run if 'tasks_to_run' in locals() else [], "❌ Error")
            error_logs = f"❌ **Evaluation error:**\n\n{str(e)}"
            yield status_table, error_logs


# Initialize
bot = ChatBot()

# UI with Tabs
with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned - Evaluation") as demo:
    gr.Markdown("""
    # 🚀 Kimi Linear 48B A3B - Evaluation

    **Model:** `optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune`

    **This Space is configured for model evaluation only. Chat/inference is disabled.**
    """)

    # Show GPU info
    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        gpu_name = torch.cuda.get_device_name(0)
        total_vram = sum(torch.cuda.get_device_properties(i).total_memory / 1024**3 for i in range(gpu_count))
        gr.Markdown(f"**Hardware:** {gpu_count}x {gpu_name} ({total_vram:.0f}GB total VRAM)")

    with gr.Tabs():
        # Tab 1: Controls (always visible)
        with gr.Tab("🎛️ Controls"):
            gr.Markdown("### Load Model (Optional)")
            load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")
            status = gr.Markdown("**Status:** Model not loaded")

            gr.Markdown("""
            ### ℹ️ Instructions

            1. **(Optional)** Click "Load Model" to verify setup (takes 5-10 minutes)
            2. **Go directly to the Evaluation tab** to run benchmarks

            **Note:**
            - Chat/inference functionality is currently disabled. This Space focuses on model evaluation only.
            - Loading the model first is optional - you can go straight to the Evaluation tab.
            - Any loaded model will be automatically unloaded before evaluation starts to free VRAM for lm_eval.
            """)

        # Tab 2: Chat - DISABLED
        # Uncomment this section to re-enable chat functionality
        """
        with gr.Tab("💬 Chat"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### ⚙️ Settings")
                    system_prompt = gr.Textbox(
                        label="System Prompt",
                        placeholder="You are a helpful assistant...",
                        lines=2
                    )
                    max_tokens = gr.Slider(50, 2048, 512, label="Max Tokens", step=1)
                    temperature = gr.Slider(0, 2, 0.7, label="Temperature", step=0.1)
                    top_p = gr.Slider(0, 1, 0.9, label="Top P", step=0.05)
                with gr.Column(scale=2):
                    chatbot = gr.Chatbot(height=500, show_copy_button=True)
                    with gr.Row():
                        msg = gr.Textbox(label="Message", placeholder="Type here...", scale=4)
                        send = gr.Button("Send", variant="primary", scale=1)
                    clear = gr.Button("Clear Chat")
        """

        # Tab 3: Evaluation
        with gr.Tab("📊 Evaluation"):
            gr.Markdown("""
            ### Run LM Evaluation Harness

            Select benchmarks to evaluate your fine-tuned model. **Estimated time: 30-60 minutes total.**
            """)

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Select Benchmarks")
                    tasks = gr.CheckboxGroup(
                        choices=["ARC-Challenge", "TruthfulQA", "Winogrande"],
                        value=["ARC-Challenge", "TruthfulQA", "Winogrande"],
                        label="Tasks to Run",
                        info="Select one or more tasks"
                    )
                    eval_btn = gr.Button("🚀 Start Evaluation", variant="primary", size="lg")

                    gr.Markdown("""
                    ### ⏱️ Estimated Time:
                    - **ARC-Challenge:** 15-30 min
                    - **TruthfulQA:** 10-20 min
                    - **Winogrande:** 15-30 min

                    **Total:** ~40-80 minutes for all 3
                    """)

                with gr.Column(scale=2):
                    eval_status = gr.Markdown("## 📊 Evaluation Progress\n\nClick '🚀 Start Evaluation' to begin.")
                    eval_logs = gr.Markdown("### 📋 Logs\n\nLogs will appear here during evaluation.")

            gr.Markdown("""
            ---
            **Note:**
            - You can start evaluation immediately - no need to load the model first
            - If you did load the model, it will be automatically unloaded before evaluation to free VRAM
            - lm_eval will load its own fresh instance of the model for evaluation
            - Results will be saved to `/tmp/eval_results_[timestamp]/`
            """)

    gr.Markdown("""
    ---
    **Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
    """)

    # Events
    load_btn.click(bot.load_model, outputs=status)

    # Chat event handlers - DISABLED
    # Uncomment these lines to re-enable chat functionality
    """
    def respond(message, history, system, max_tok, temp, top):
        bot_message = bot.chat(message, history, system, max_tok, temp, top)
        history.append((message, bot_message))
        return history, ""

    msg.submit(respond, [msg, chatbot, system_prompt, max_tokens, temperature, top_p], [chatbot, msg])
    send.click(respond, [msg, chatbot, system_prompt, max_tokens, temperature, top_p], [chatbot, msg])
    clear.click(lambda: None, None, chatbot)
    """

    # Evaluation event handler
    eval_btn.click(bot.run_evaluation, inputs=tasks, outputs=[eval_status, eval_logs])


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)