# app.py: Gradio Space for evaluating the fine-tuned Kimi Linear 48B model.
# OOM fix: the model is unloaded before evaluation to free VRAM for lm_eval.
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import subprocess
import json
import gc
import glob
from datetime import datetime
# Set environment variable for flash-linear-attention
os.environ["FLA_USE_TRITON"] = "1"
# Model configuration
MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"
class ChatBot:
def __init__(self):
self.model = None
self.tokenizer = None
self.loaded = False
def load_model(self):
        if self.loaded:
            # load_model is a generator, so the status must be yielded, not returned
            yield "✅ Model already loaded!"
            return
try:
yield "πŸ”„ Loading tokenizer..."
self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
yield "πŸ”„ Loading model (this takes 5-10 minutes)...\n\nThe 48B model is being distributed across 4 GPUs..."
# Configure memory for 4 GPUs
num_gpus = torch.cuda.device_count()
            max_memory = {i: "23GB" for i in range(num_gpus)}  # L4 GPUs have 24GB; leave ~1GB headroom
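            # "balanced" spreads weights evenly across all visible GPUs, while the
            # max_memory cap leaves headroom for activations and the KV cache.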
self.model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
torch_dtype=torch.bfloat16,
device_map="balanced",
max_memory=max_memory,
trust_remote_code=True,
low_cpu_mem_usage=True,
attn_implementation="eager",
)
self.model.eval()
# Patch model config to avoid flash attention issues
if hasattr(self.model.config, '_attn_implementation'):
self.model.config._attn_implementation = "eager"
if hasattr(self.model.config, 'attn_implementation'):
self.model.config.attn_implementation = "eager"
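            # Belt and braces: some remote-code models read the attention setting
            # from the config at runtime rather than from the from_pretrained kwarg.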
self.loaded = True
# Get GPU distribution info
if hasattr(self.model, 'hf_device_map'):
device_info = "\n\n**GPU Distribution:**\n"
devices = {}
for name, device in self.model.hf_device_map.items():
if device not in devices:
devices[device] = 0
devices[device] += 1
for device, count in devices.items():
                    device_info += f"- {device}: {count} modules\n"
else:
device_info = ""
yield f"βœ… **Model loaded successfully!**{device_info}\n\nYou can now use the Evaluation tab."
except Exception as e:
self.loaded = False
yield f"❌ **Error loading model:**\n\n{str(e)}"
def chat(self, message, history, system_prompt, max_tokens, temperature, top_p):
if not self.loaded:
return "❌ Please load the model first by clicking the 'Load Model' button in Controls."
try:
# Build prompt from history
conversation = []
if system_prompt.strip():
conversation.append(f"System: {system_prompt}")
for user_msg, bot_msg in history:
conversation.append(f"User: {user_msg}")
if bot_msg:
conversation.append(f"Assistant: {bot_msg}")
conversation.append(f"User: {message}")
conversation.append("Assistant:")
prompt = "\n".join(conversation)
# Tokenize
inputs = self.tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
# Generate
with torch.no_grad():
outputs = self.model.generate(
**inputs,
max_new_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
do_sample=temperature > 0,
pad_token_id=self.tokenizer.eos_token_id,
use_cache=True,
)
# Decode
response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
# Extract assistant response
if "Assistant:" in response:
response = response.split("Assistant:")[-1].strip()
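            # Alternative sketch: decode only the newly generated tokens, which
            # avoids relying on the "Assistant:" marker surviving decoding:
            #   new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
            #   response = self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()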
return response
except Exception as e:
return f"❌ Error: {str(e)}"
def run_evaluation(self, tasks_to_run):
"""Run lm_eval on selected tasks"""
if not self.loaded:
yield "❌ Please load the model first!"
return
try:
# Map friendly names to lm_eval task names
task_map = {
"ARC-Challenge": "arc_challenge",
"TruthfulQA": "truthfulqa_mc2",
"Winogrande": "winogrande"
}
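            # These names must match the lm-evaluation-harness task registry
            # (`lm_eval --tasks list` prints the available task names).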
selected_tasks = [task_map[t] for t in tasks_to_run]
task_string = ",".join(selected_tasks)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"/tmp/eval_results_{timestamp}"
yield f"πŸ”„ **Preparing for evaluation...**\n\nTasks: {', '.join(tasks_to_run)}\n\n"
# IMPORTANT: Unload the model from memory to free VRAM for lm_eval
yield f"πŸ”„ **Unloading model to free VRAM...**\n\nThis is necessary because lm_eval will load its own instance.\n\n"
if self.model is not None:
del self.model
self.model = None
if self.tokenizer is not None:
del self.tokenizer
self.tokenizer = None
            # Collect dropped references first, then release the cached CUDA blocks
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
            self.loaded = False
            yield "✅ **Memory cleared! Starting evaluation...**\n\nThis can take 10-80 minutes depending on the selected tasks.\n\n"
# Run lm_eval with optimized memory settings
cmd = [
"lm_eval",
"--model", "hf",
"--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,device_map=auto,low_cpu_mem_usage=True",
"--tasks", task_string,
"--batch_size", "1", # Reduced to minimize memory usage
"--output_path", output_dir,
"--log_samples"
]
yield f"πŸ”„ **Running lm_eval...**\n\nCommand: `{' '.join(cmd)}`\n\nProgress will update below...\n\n"
# Run evaluation
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1
)
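            # text=True with bufsize=1 gives line-buffered reads, so progress can be
            # streamed to the UI one line at a time instead of arriving in one chunk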
output_lines = []
for line in process.stdout:
output_lines.append(line)
# Show last 20 lines
recent = ''.join(output_lines[-20:])
yield f"πŸ”„ **Running evaluation...**\n\n```\n{recent}\n```"
process.wait()
if process.returncode != 0:
yield f"❌ **Evaluation failed!**\n\nExit code: {process.returncode}\n\nLogs:\n```\n{''.join(output_lines[-50:])}\n```"
return
            # Read results (newer lm_eval versions may nest the JSON in a subdirectory)
            results_file = os.path.join(output_dir, "results.json")
            if not os.path.exists(results_file):
                matches = glob.glob(os.path.join(output_dir, "**", "*.json"), recursive=True)
                results_file = matches[0] if matches else results_file
            if os.path.exists(results_file):
with open(results_file, 'r') as f:
results = json.load(f)
# Format results
result_text = "βœ… **Evaluation Complete!**\n\n"
result_text += f"**Timestamp:** {timestamp}\n\n"
result_text += "## πŸ“Š Results:\n\n"
for task in selected_tasks:
if task in results['results']:
task_results = results['results'][task]
result_text += f"### {task}\n"
for metric, value in task_results.items():
if isinstance(value, float):
result_text += f"- **{metric}:** {value:.4f}\n"
else:
result_text += f"- **{metric}:** {value}\n"
result_text += "\n"
# Add summary if available
if 'summary' in results:
result_text += "## πŸ“ˆ Summary:\n\n"
for metric, value in results['summary'].items():
if isinstance(value, float):
result_text += f"- **{metric}:** {value:.4f}\n"
else:
result_text += f"- **{metric}:** {value}\n"
result_text += f"\n\n**Full results saved to:** `{output_dir}`"
yield result_text
else:
yield f"⚠️ **Evaluation completed but results file not found.**\n\nOutput:\n```\n{''.join(output_lines[-30:])}\n```"
except Exception as e:
yield f"❌ **Evaluation error:**\n\n{str(e)}"
# Initialize
bot = ChatBot()
# UI with Tabs
with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned - Evaluation") as demo:
gr.Markdown("""
    # 📊 Kimi Linear 48B A3B - Evaluation
**Model:** `optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune`
**This Space is configured for model evaluation only. Chat/inference is disabled.**
""")
# Show GPU info
if torch.cuda.is_available():
gpu_count = torch.cuda.device_count()
gpu_name = torch.cuda.get_device_name(0)
total_vram = sum(torch.cuda.get_device_properties(i).total_memory / 1024**3 for i in range(gpu_count))
gr.Markdown(f"**Hardware:** {gpu_count}x {gpu_name} ({total_vram:.0f}GB total VRAM)")
with gr.Tabs():
# Tab 1: Controls (always visible)
with gr.Tab("πŸŽ›οΈ Controls"):
gr.Markdown("### Load Model First")
            load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")
status = gr.Markdown("**Status:** Model not loaded")
gr.Markdown("""
### ℹ️ Instructions
1. **Click "Load Model"** - Takes 5-10 minutes (verifies setup)
2. **Use Evaluation tab** - To run benchmarks
**Note:**
- Chat/inference functionality is currently disabled. This Space focuses on model evaluation only.
- The model will be automatically unloaded before evaluation starts to free VRAM for lm_eval.
""")
# Tab 2: Chat - DISABLED
# Uncomment this section to re-enable chat functionality
"""
with gr.Tab("πŸ’¬ Chat"):
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### βš™οΈ Settings")
system_prompt = gr.Textbox(
label="System Prompt",
placeholder="You are a helpful assistant...",
lines=2
)
max_tokens = gr.Slider(50, 2048, 512, label="Max Tokens", step=1)
temperature = gr.Slider(0, 2, 0.7, label="Temperature", step=0.1)
top_p = gr.Slider(0, 1, 0.9, label="Top P", step=0.05)
with gr.Column(scale=2):
chatbot = gr.Chatbot(height=500, show_copy_button=True)
with gr.Row():
msg = gr.Textbox(label="Message", placeholder="Type here...", scale=4)
send = gr.Button("Send", variant="primary", scale=1)
clear = gr.Button("Clear Chat")
"""
# Tab 3: Evaluation
with gr.Tab("πŸ“Š Evaluation"):
gr.Markdown("""
### Run LM Evaluation Harness
            Select benchmarks to evaluate your fine-tuned model. **Estimated time: 40-80 minutes for all three.**
""")
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### Select Benchmarks")
tasks = gr.CheckboxGroup(
choices=["ARC-Challenge", "TruthfulQA", "Winogrande"],
value=["ARC-Challenge", "TruthfulQA", "Winogrande"],
label="Tasks to Run",
info="Select one or more tasks"
)
                    eval_btn = gr.Button("🚀 Start Evaluation", variant="primary", size="lg")
gr.Markdown("""
### ⏱️ Estimated Time:
- **ARC-Challenge:** 15-30 min
- **TruthfulQA:** 10-20 min
- **Winogrande:** 15-30 min
**Total:** ~40-80 minutes for all 3
""")
with gr.Column(scale=2):
eval_results = gr.Markdown("Results will appear here after evaluation completes.")
gr.Markdown("""
---
**Note:**
- Click "Load Model" in Controls tab first to verify the setup
- The model will be automatically unloaded before evaluation to free VRAM
- lm_eval will load its own instance of the model for evaluation
- Results will be saved to `/tmp/eval_results_[timestamp]/`
""")
gr.Markdown("""
---
**Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
""")
# Events
load_btn.click(bot.load_model, outputs=status)
# Chat event handlers - DISABLED
# Uncomment these lines to re-enable chat functionality
"""
def respond(message, history, system, max_tok, temp, top):
bot_message = bot.chat(message, history, system, max_tok, temp, top)
history.append((message, bot_message))
return history, ""
msg.submit(respond, [msg, chatbot, system_prompt, max_tokens, temperature, top_p], [chatbot, msg])
send.click(respond, [msg, chatbot, system_prompt, max_tokens, temperature, top_p], [chatbot, msg])
clear.click(lambda: None, None, chatbot)
"""
# Evaluation event handler
eval_btn.click(bot.run_evaluation, inputs=tasks, outputs=eval_results)
if __name__ == "__main__":
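    # Note: share=True only matters when running locally; on Hugging Face Spaces
    # the app is already served at the Space URL and the share flag is ignored.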
demo.launch(server_name="0.0.0.0", server_port=7860, share=True)