import os

# Set environment variable for flash-linear-attention before the heavy imports,
# in case the library reads it at import time
os.environ["FLA_USE_TRITON"] = "1"

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Model configuration
MODEL_NAME = "optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune"
class ChatBot:
    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.loaded = False

    def load_model(self):
        if self.loaded:
            # This method is a generator, so report status via yield, not return
            yield "✅ Model already loaded!"
            return
        try:
            yield "🔄 Loading tokenizer..."
            self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
            yield "🔄 Loading model (this takes 5-10 minutes)...\n\nThe 48B model is being distributed across 4 GPUs..."

            # Configure memory for 4 GPUs (each L4 has 24GB; leave ~1GB headroom)
            num_gpus = torch.cuda.device_count()
            max_memory = {i: "23GB" for i in range(num_gpus)}
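            # e.g. with 4 GPUs this produces {0: "23GB", 1: "23GB", 2: "23GB", 3: "23GB"}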
            self.model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                torch_dtype=torch.bfloat16,
                device_map="balanced",  # Distribute layers evenly across GPUs
                max_memory=max_memory,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
                attn_implementation="eager",  # Use eager attention instead of flash
            )
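            # If the bf16 weights ever exceed the available VRAM, 4-bit loading is
            # a possible fallback. A minimal sketch, not part of this app (assumes
            # the bitsandbytes package is installed and that the model's remote
            # code tolerates quantization; untested here):
            #
            #   from transformers import BitsAndBytesConfig
            #   quant_config = BitsAndBytesConfig(
            #       load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
            #   model = AutoModelForCausalLM.from_pretrained(
            #       MODEL_NAME, quantization_config=quant_config,
            #       device_map="balanced", trust_remote_code=True)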
            self.model.eval()

            # Patch model config to avoid flash attention issues; different
            # transformers versions use different attribute names, so cover both
            if hasattr(self.model.config, '_attn_implementation'):
                self.model.config._attn_implementation = "eager"
            if hasattr(self.model.config, 'attn_implementation'):
                self.model.config.attn_implementation = "eager"
            self.loaded = True

            # Get GPU distribution info
            if hasattr(self.model, 'hf_device_map'):
                device_info = "\n\n**GPU Distribution:**\n"
                devices = {}
                for name, device in self.model.hf_device_map.items():
                    devices[device] = devices.get(device, 0) + 1
                for device, count in devices.items():
                    device_info += f"- {device}: {count} layers\n"
            else:
                device_info = ""
            yield f"✅ **Model loaded successfully!**{device_info}\n\nYou can now start chatting below."
        except Exception as e:
            self.loaded = False
            yield f"❌ **Error loading model:**\n\n{str(e)}"
    def chat(self, message, history, system_prompt, max_tokens, temperature, top_p):
        if not self.loaded:
            return "❌ Please load the model first by clicking the 'Load Model' button."
        try:
            # Build prompt from history
            conversation = []
            if system_prompt.strip():
                conversation.append(f"System: {system_prompt}")
            for user_msg, bot_msg in history:
                conversation.append(f"User: {user_msg}")
                if bot_msg:
                    conversation.append(f"Assistant: {bot_msg}")
            conversation.append(f"User: {message}")
            conversation.append("Assistant:")
            prompt = "\n".join(conversation)
            # Tokenize and move tensors to the model's first device
            inputs = self.tokenizer(prompt, return_tensors="pt")
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

            # Generate; only pass sampling parameters when temperature is positive,
            # since greedy decoding ignores them and newer transformers warns
            do_sample = temperature > 0
            gen_kwargs = dict(
                max_new_tokens=max_tokens,
                do_sample=do_sample,
                pad_token_id=self.tokenizer.eos_token_id,
                use_cache=True,  # Enable KV caching
            )
            if do_sample:
                gen_kwargs["temperature"] = temperature
                gen_kwargs["top_p"] = top_p
            with torch.no_grad():
                outputs = self.model.generate(**inputs, **gen_kwargs)
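            # For token-by-token output in the UI, transformers' TextIteratorStreamer
            # could wrap this call. A minimal sketch (an assumption, not wired into
            # this app: generate runs in a background thread and this method would
            # yield partial text instead of returning once):
            #
            #   from threading import Thread
            #   from transformers import TextIteratorStreamer
            #   streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True,
            #                                   skip_special_tokens=True)
            #   Thread(target=self.model.generate,
            #          kwargs={**inputs, **gen_kwargs, "streamer": streamer}).start()
            #   for chunk in streamer:
            #       ...  # accumulate chunk and yield the partial response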
            # Decode only the newly generated tokens, not the echoed prompt
            new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
            response = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
            # Trim anything the model generates past its own turn
            response = response.split("User:")[0].strip()
            return response
        except Exception as e:
            return f"❌ Error: {str(e)}"
# Initialize
bot = ChatBot()

# UI
with gr.Blocks(theme=gr.themes.Soft(), title="Kimi 48B Fine-tuned") as demo:
    gr.Markdown("""
    # 🚀 Kimi Linear 48B A3B - Fine-tuned

    Chat interface for the fine-tuned Kimi model.

    **Model:** `optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune`
    """)
    # Show GPU info
    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        gpu_name = torch.cuda.get_device_name(0)
        total_vram = sum(torch.cuda.get_device_properties(i).total_memory / 1024**3 for i in range(gpu_count))
        gr.Markdown(f"**Hardware:** {gpu_count}x {gpu_name} ({total_vram:.0f}GB total VRAM)")
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 🎛️ Controls")
            load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")
            status = gr.Markdown("**Status:** Model not loaded")
            gr.Markdown("---")
            gr.Markdown("### ⚙️ Settings")
            system_prompt = gr.Textbox(
                label="System Prompt",
                placeholder="You are a helpful assistant...",
                lines=2
            )
            max_tokens = gr.Slider(50, 2048, 512, label="Max Tokens", step=1)
            temperature = gr.Slider(0, 2, 0.7, label="Temperature", step=0.1)
            top_p = gr.Slider(0, 1, 0.9, label="Top P", step=0.05)
        with gr.Column(scale=2):
            gr.Markdown("### 💬 Chat")
            chatbot = gr.Chatbot(height=500, show_copy_button=True)
            with gr.Row():
                msg = gr.Textbox(label="Message", placeholder="Type here...", scale=4)
                send = gr.Button("Send", variant="primary", scale=1)
            clear = gr.Button("Clear")
    # Events
    load_btn.click(bot.load_model, outputs=status)

    def respond(message, history, system, max_tok, temp, top):
        bot_message = bot.chat(message, history, system, max_tok, temp, top)
        history.append((message, bot_message))
        return history, ""

    msg.submit(respond, [msg, chatbot, system_prompt, max_tokens, temperature, top_p], [chatbot, msg])
    send.click(respond, [msg, chatbot, system_prompt, max_tokens, temperature, top_p], [chatbot, msg])
    clear.click(lambda: None, None, chatbot)
gr.Markdown("""
---
**Model:** [optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune](https://huggingface.co/optiviseapp/kimi-linear-48b-a3b-instruct-fine-tune)
""")
if __name__ == "__main__":
    # The queue is required for the generator-based load_model status updates
    # (enabled by default in recent Gradio, explicit here for older versions)
    demo.queue()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
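# Usage note (assumption: running outside a Hugging Face Space): start with
# `python app.py` and open http://localhost:7860. With share=True, Gradio also
# prints a temporary public URL for remote access.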