import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import subprocess
import json
from datetime import datetime
import time

# Set environment variables for flash-linear-attention and memory management
os.environ["FLA_USE_TRITON"] = "1"
os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"  # Updated from PYTORCH_CUDA_ALLOC_CONF

# Model configuration
MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"


class ChatBot:
    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.loaded = False

    def _create_status_table(self, tasks, status="⏳ Waiting", results=None):
        """Create a markdown table showing evaluation status"""
        table = "## 📊 Evaluation Progress\n\n"
        table += "| Benchmark | Status | Score | Details |\n"
        table += "|-----------|--------|-------|----------|\n"
        for task in tasks:
            task_status = status
            task_score = "-"
            task_details = ""
            if results and task in results:
                task_status = "✅ Complete"
                if task == "ARC-Challenge" and "arc_challenge" in results[task]:
                    score_data = results[task]["arc_challenge"]
                    task_score = f"{score_data.get('acc_norm', 0):.2%}"
                    task_details = f"acc: {score_data.get('acc', 0):.2%}"
                elif task == "TruthfulQA" and "truthfulqa_mc2" in results[task]:
                    score_data = results[task]["truthfulqa_mc2"]
                    task_score = f"{score_data.get('acc', 0):.2%}"
                elif task == "Winogrande" and "winogrande" in results[task]:
                    score_data = results[task]["winogrande"]
                    task_score = f"{score_data.get('acc', 0):.2%}"
            table += f"| {task} | {task_status} | {task_score} | {task_details} |\n"
        return table

    def load_model(self):
        """Load the model and tokenizer, streaming status messages to the UI."""
        if self.loaded:
            # This is a generator, so the message must be yielded, not returned
            yield "✅ Model already loaded!"
            return
        try:
            yield "🔄 Loading tokenizer..."
            self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

            yield "🔄 Loading model (this takes 5-10 minutes)...\n\nThe 48B model is being distributed across 4 GPUs..."

            # Configure memory for 4 GPUs
            num_gpus = torch.cuda.device_count()
            max_memory = {i: "23GB" for i in range(num_gpus)}  # L4 has 24GB, leave 1GB headroom

            self.model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                torch_dtype=torch.bfloat16,
                device_map="balanced",
                max_memory=max_memory,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
                attn_implementation="eager",
            )
            self.model.eval()

            # Patch model config to avoid flash attention issues
            if hasattr(self.model.config, '_attn_implementation'):
                self.model.config._attn_implementation = "eager"
            if hasattr(self.model.config, 'attn_implementation'):
                self.model.config.attn_implementation = "eager"

            self.loaded = True

            # Get GPU distribution info
            if hasattr(self.model, 'hf_device_map'):
                device_info = "\n\n**GPU Distribution:**\n"
                devices = {}
                for name, device in self.model.hf_device_map.items():
                    if device not in devices:
                        devices[device] = 0
                    devices[device] += 1
                for device, count in devices.items():
                    device_info += f"- {device}: {count} layers\n"
            else:
                device_info = ""

            yield f"✅ **Model loaded successfully!**{device_info}\n\nYou can now use the Evaluation tab."
        except Exception as e:
            self.loaded = False
            yield f"❌ **Error loading model:**\n\n{str(e)}"

    def chat(self, message, history, system_prompt, max_tokens, temperature, top_p):
        if not self.loaded:
            return "❌ Please load the model first by clicking the 'Load Model' button in Controls."
        try:
            # Build prompt from history
            conversation = []
            if system_prompt.strip():
                conversation.append(f"System: {system_prompt}")
            for user_msg, bot_msg in history:
                conversation.append(f"User: {user_msg}")
                if bot_msg:
                    conversation.append(f"Assistant: {bot_msg}")
            conversation.append(f"User: {message}")
            conversation.append("Assistant:")
            prompt = "\n".join(conversation)

            # Tokenize
            inputs = self.tokenizer(prompt, return_tensors="pt")
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

            # Generate
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=temperature > 0,
                    pad_token_id=self.tokenizer.eos_token_id,
                    use_cache=True,
                )

            # Decode
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Extract assistant response
            if "Assistant:" in response:
                response = response.split("Assistant:")[-1].strip()
            return response
        except Exception as e:
            return f"❌ Error: {str(e)}"

    def run_evaluation(self, tasks_to_run):
        """Run lm_eval on selected tasks"""
        # Note: We don't strictly require the model to be loaded first
        # since we'll be unloading it anyway. The load step is just for verification.
        try:
            # Map friendly names to lm_eval task names
            task_map = {
                "ARC-Challenge": "arc_challenge",
                "TruthfulQA": "truthfulqa_mc2",
                "Winogrande": "winogrande"
            }
            selected_tasks = [task_map[t] for t in tasks_to_run]
            task_string = ",".join(selected_tasks)

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_dir = f"/tmp/eval_results_{timestamp}"

            # Initial status table
            status_table = self._create_status_table(tasks_to_run, "⏳ Preparing")
            logs = "🔄 **Preparing for evaluation...**\n\nTasks: " + ", ".join(tasks_to_run) + "\n\n"
            yield status_table, logs

            # IMPORTANT: Clean up any loaded model to free VRAM for lm_eval
            if self.loaded and self.model is not None:
                logs += "🔄 **Unloading model to free VRAM...**\n\n"
                yield status_table, logs
                if self.model is not None:
                    del self.model
                    self.model = None
                if self.tokenizer is not None:
                    del self.tokenizer
                    self.tokenizer = None
                self.loaded = False
            else:
                logs += "🧹 **Cleaning up memory...**\n\n"
                yield status_table, logs

            # Aggressive memory cleanup
            import gc
            for _ in range(3):
                gc.collect()

            if torch.cuda.is_available():
                for i in range(torch.cuda.device_count()):
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize(device=i)
                    torch.cuda.reset_peak_memory_stats(device=i)
                    torch.cuda.reset_accumulated_memory_stats(device=i)

            # Wait for memory to be fully released
            logs += "🕒 **Waiting for memory cleanup (5s)...**\n\n"
            yield status_table, logs
            time.sleep(5)

            # Final garbage collection
            gc.collect()

            status_table = self._create_status_table(tasks_to_run, "🔄 Loading Model")
            logs += "✅ **Memory cleared! Starting evaluation...**\n\n"
            logs += "⏱️ Estimated time: 30-60 minutes\n\n"
            yield status_table, logs

            # Create a fake flash_attn package to avoid import errors; it falls back to
            # standard PyTorch attention. The package directory must be named exactly
            # "flash_attn" so that `import flash_attn` resolves to it, so it lives inside
            # a timestamped parent directory that is added to PYTHONPATH below.
            fake_pkg_root = f"/tmp/flash_attn_{timestamp}"
            fake_flash_dir = os.path.join(fake_pkg_root, "flash_attn")
            os.makedirs(fake_flash_dir, exist_ok=True)
            with open(os.path.join(fake_flash_dir, "__init__.py"), 'w') as f:
                f.write("""
# Fake flash_attn module that falls back to standard PyTorch attention
import torch

def flash_attn_func(q, k, v, dropout_p=0.0, softmax_scale=None, causal=False, **kwargs):
    '''Fallback to standard PyTorch attention (slower but works without flash-attn)'''
    if softmax_scale is None:
        softmax_scale = 1.0 / (q.size(-1) ** 0.5)
    # Standard attention: softmax(Q @ K.T) @ V
    attn_weights = torch.matmul(q, k.transpose(-2, -1)) * softmax_scale
    if causal:
        seq_len = attn_weights.size(-1)
        causal_mask = torch.triu(torch.ones(seq_len, seq_len, device=attn_weights.device), diagonal=1).bool()
        attn_weights = attn_weights.masked_fill(causal_mask, float('-inf'))
    attn_weights = torch.softmax(attn_weights, dim=-1)
    if dropout_p > 0:
        attn_weights = torch.nn.functional.dropout(attn_weights, p=dropout_p)
    output = torch.matmul(attn_weights, v)
    return output, None  # Return None for attention weights

def flash_attn_varlen_func(*args, **kwargs):
    return flash_attn_func(*args, **kwargs)

__version__ = "2.5.0"
""")

            # Add the fake package's parent directory to this process's path
            # (the evaluation subprocess itself relies on the PYTHONPATH set below)
            import sys
            if fake_pkg_root not in sys.path:
                sys.path.insert(0, fake_pkg_root)

            # Set PYTHONPATH environment variable so the subprocess can find the fake flash_attn
            env = os.environ.copy()
            pythonpath = env.get('PYTHONPATH', '')
            env['PYTHONPATH'] = f"{fake_pkg_root}:{pythonpath}" if pythonpath else fake_pkg_root

            logs += "⚠️ **Note:** Using fallback PyTorch attention (slower than flash-attn)\n\n"
            yield status_table, logs

            # Run lm_eval
            cmd = [
                "lm_eval",
                "--model", "hf",
                "--model_args", f"pretrained={MODEL_NAME},trust_remote_code=True,dtype=bfloat16,low_cpu_mem_usage=True,parallelize=True",
                "--tasks", task_string,
                "--batch_size", "1",
                "--output_path", output_dir,
                "--log_samples"
            ]

            status_table = self._create_status_table(tasks_to_run, "🏃 Running")
            logs += f"🚀 **Running lm_eval...**\n\nCommand: `{' '.join(cmd)}`\n\n"
            logs += "---\n\n### 📋 Live Logs (last 15 lines):\n\n```\n"
            yield status_table, logs

            # Run evaluation with custom environment
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1,
                env=env  # Pass custom environment with PYTHONPATH
            )

            output_lines = []
            log_update_counter = 0
            for line in process.stdout:
                output_lines.append(line)
                log_update_counter += 1
                # Update every 5 lines to reduce UI flickering
                if log_update_counter % 5 == 0:
                    recent = ''.join(output_lines[-15:])
                    current_logs = logs + recent + "\n```"
                    yield status_table, current_logs

            process.wait()

            if process.returncode != 0:
                status_table = self._create_status_table(tasks_to_run, "❌ Failed")
                error_logs = logs + ''.join(output_lines[-50:]) + "\n```\n\n"
                error_logs += f"❌ **Evaluation failed!**\n\nExit code: {process.returncode}\n"
                yield status_table, error_logs
                return
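
            # Illustrative note (added for clarity, not part of the original logic):
            # the parsing below expects lm_eval to have written a results.json of roughly
            # this shape; the exact layout and file location can vary between
            # lm-evaluation-harness versions (newer releases may nest results under a
            # model-specific subdirectory, in which case the `else` branch below warns):
            #
            #   {
            #     "results": {
            #       "arc_challenge":  {"acc": <float>, "acc_norm": <float>, ...},
            #       "truthfulqa_mc2": {"acc": <float>, ...},
            #       "winogrande":     {"acc": <float>, ...}
            #     },
            #     ...
            #   }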

            # Read results
            results_file = os.path.join(output_dir, "results.json")
            if os.path.exists(results_file):
                with open(results_file, 'r') as f:
                    results = json.load(f)

                # Parse results for status table
                parsed_results = {}
                for task in tasks_to_run:
                    task_key = task_map[task]
                    if task_key in results['results']:
                        parsed_results[task] = {task_key: results['results'][task_key]}

                # Update status table with results
                status_table = self._create_status_table(tasks_to_run, "✅ Complete", parsed_results)

                # Format detailed results
                result_logs = "✅ **Evaluation Complete!**\n\n"
                result_logs += f"**Timestamp:** {timestamp}\n\n"
                result_logs += "## 📊 Detailed Results:\n\n"
                for task in selected_tasks:
                    if task in results['results']:
                        task_results = results['results'][task]
                        result_logs += f"### {task}\n"
                        for metric, value in task_results.items():
                            if isinstance(value, float):
                                result_logs += f"- **{metric}:** {value:.4f}\n"
                            else:
                                result_logs += f"- **{metric}:** {value}\n"
                        result_logs += "\n"

                # Add summary if available
                if 'summary' in results:
                    result_logs += "## 📈 Summary:\n\n"
                    for metric, value in results['summary'].items():
                        if isinstance(value, float):
                            result_logs += f"- **{metric}:** {value:.4f}\n"
                        else:
                            result_logs += f"- **{metric}:** {value}\n"

                result_logs += f"\n\n**Full results saved to:** `{output_dir}`"
                yield status_table, result_logs
            else:
                status_table = self._create_status_table(tasks_to_run, "⚠️ Unknown")
                warning_logs = f"⚠️ **Evaluation completed but results file not found.**\n\nOutput:\n```\n{''.join(output_lines[-30:])}\n```"
                yield status_table, warning_logs

        except Exception as e:
            status_table = self._create_status_table(tasks_to_run if 'tasks_to_run' in locals() else [], "❌ Error")
            error_logs = f"❌ **Evaluation error:**\n\n{str(e)}"
            yield status_table, error_logs


# Initialize
bot = ChatBot()

# UI with Tabs
with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned - Evaluation") as demo:
    gr.Markdown("""
    # 🚀 Kimi Linear 48B A3B - Evaluation

    **Model:** `optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune`

    **This Space is configured for model evaluation only. Chat/inference is disabled.**
    """)

    # Show GPU info
    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        gpu_name = torch.cuda.get_device_name(0)
        total_vram = sum(torch.cuda.get_device_properties(i).total_memory / 1024**3 for i in range(gpu_count))
        gr.Markdown(f"**Hardware:** {gpu_count}x {gpu_name} ({total_vram:.0f}GB total VRAM)")

    with gr.Tabs():
        # Tab 1: Controls (always visible)
        with gr.Tab("🎛️ Controls"):
            gr.Markdown("### Load Model (Optional)")
            load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")
            status = gr.Markdown("**Status:** Model not loaded")

            gr.Markdown("""
            ### ℹ️ Instructions

            1. **(Optional)** Click "Load Model" to verify setup (takes 5-10 minutes)
            2. **Go directly to the Evaluation tab** to run benchmarks

            **Note:**
            - Chat/inference functionality is currently disabled. This Space focuses on model evaluation only.
            - Loading the model first is optional - you can go straight to the Evaluation tab.
            - Any loaded model will be automatically unloaded before evaluation starts to free VRAM for lm_eval.
            """)

        # Tab 2: Chat - DISABLED
        # Uncomment this section to re-enable chat functionality
        """
        with gr.Tab("💬 Chat"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### ⚙️ Settings")
                    system_prompt = gr.Textbox(
                        label="System Prompt",
                        placeholder="You are a helpful assistant...",
                        lines=2
                    )
                    max_tokens = gr.Slider(50, 2048, 512, label="Max Tokens", step=1)
                    temperature = gr.Slider(0, 2, 0.7, label="Temperature", step=0.1)
                    top_p = gr.Slider(0, 1, 0.9, label="Top P", step=0.05)
                with gr.Column(scale=2):
                    chatbot = gr.Chatbot(height=500, show_copy_button=True)
                    with gr.Row():
                        msg = gr.Textbox(label="Message", placeholder="Type here...", scale=4)
                        send = gr.Button("Send", variant="primary", scale=1)
                    clear = gr.Button("Clear Chat")
        """

        # Tab 3: Evaluation
        with gr.Tab("📊 Evaluation"):
            gr.Markdown("""
            ### Run LM Evaluation Harness

            Select benchmarks to evaluate your fine-tuned model. **Estimated time: 30-60 minutes total.**
            """)

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Select Benchmarks")
                    tasks = gr.CheckboxGroup(
                        choices=["ARC-Challenge", "TruthfulQA", "Winogrande"],
                        value=["ARC-Challenge", "TruthfulQA", "Winogrande"],
                        label="Tasks to Run",
                        info="Select one or more tasks"
                    )
                    eval_btn = gr.Button("🚀 Start Evaluation", variant="primary", size="lg")

                    gr.Markdown("""
                    ### ⏱️ Estimated Time:
                    - **ARC-Challenge:** 15-30 min
                    - **TruthfulQA:** 10-20 min
                    - **Winogrande:** 15-30 min

                    **Total:** ~40-80 minutes for all 3
                    """)

                with gr.Column(scale=2):
                    eval_status = gr.Markdown("## 📊 Evaluation Progress\n\nClick '🚀 Start Evaluation' to begin.")
                    eval_logs = gr.Markdown("### 📋 Logs\n\nLogs will appear here during evaluation.")

            gr.Markdown("""
            ---
            **Note:**
            - You can start evaluation immediately - no need to load the model first
            - If you did load the model, it will be automatically unloaded before evaluation to free VRAM
            - lm_eval will load its own fresh instance of the model for evaluation
            - Results will be saved to `/tmp/eval_results_[timestamp]/`
            """)

    gr.Markdown("""
    ---
    **Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
    """)

    # Events
    load_btn.click(bot.load_model, outputs=status)

    # Chat event handlers - DISABLED
    # Uncomment these lines to re-enable chat functionality
    """
    def respond(message, history, system, max_tok, temp, top):
        bot_message = bot.chat(message, history, system, max_tok, temp, top)
        history.append((message, bot_message))
        return history, ""

    msg.submit(respond, [msg, chatbot, system_prompt, max_tokens, temperature, top_p], [chatbot, msg])
    send.click(respond, [msg, chatbot, system_prompt, max_tokens, temperature, top_p], [chatbot, msg])
    clear.click(lambda: None, None, chatbot)
    """

    # Evaluation event handler
    eval_btn.click(bot.run_evaluation, inputs=tasks, outputs=[eval_status, eval_logs])


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)