import os
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import gc
from huggingface_hub import login
import logging
from datetime import datetime
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Check GPU availability
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    logger.info(f"Found {num_gpus} GPUs available")
    for i in range(num_gpus):
        gpu_name = torch.cuda.get_device_name(i)
        gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
        logger.info(f"GPU {i}: {gpu_name} with {gpu_memory:.2f} GB memory")
else:
    logger.warning("No GPUs found! This will likely fail for the 48B model.")

# Constants
BASE_MODEL_NAME = "moonshotai/Kimi-Linear-48B-A3B-Instruct"
LORA_MODEL_NAME = "Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned"
OUTPUT_DIR = "/app/merged_model"
class ModelMerger:
    def __init__(self):
        self.base_model = None
        self.tokenizer = None
        self.merged_model = None

    def clear_memory(self):
        """Clear GPU memory"""
        # Drop references so the tensors become collectable, then reset the
        # attributes so later `is not None` checks don't raise AttributeError.
        if self.base_model is not None:
            del self.base_model
            self.base_model = None
        if self.merged_model is not None:
            del self.merged_model
            self.merged_model = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            # Synchronize all GPUs
            for i in range(torch.cuda.device_count()):
                with torch.cuda.device(i):
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize()
        logger.info("Memory cleared successfully")
    def login_huggingface(self, token):
        """Login to Hugging Face"""
        try:
            login(token=token)
            logger.info("Successfully logged in to Hugging Face")
            return "✅ Successfully logged in to Hugging Face"
        except Exception as e:
            logger.error(f"Login failed: {str(e)}")
            return f"❌ Login failed: {str(e)}"
    def merge_models(self, hf_token, use_8bit=False, progress=gr.Progress()):
        """Merge LoRA adapters with base model"""
        try:
            # Login to HF
            if hf_token:
                progress(0.05, desc="Logging in to Hugging Face...")
                login(token=hf_token)
                logger.info("Logged in to Hugging Face")

            # Clear any existing models from memory
            progress(0.1, desc="Clearing GPU memory...")
            self.clear_memory()

            # Load tokenizer
            progress(0.15, desc="Loading tokenizer...")
            logger.info("Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)

            # Configure memory allocation for the multi-GPU setup:
            # auto-detect GPU memory and adjust accordingly
            num_gpus = torch.cuda.device_count()
            max_memory = {}
            total_vram = 0
            if num_gpus > 0:
                # Calculate available memory per GPU
                for i in range(num_gpus):
                    gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
                    total_vram += gpu_memory
                    # Reserve ~3GB per GPU for CUDA context and activation overhead
                    per_gpu_memory = f"{int(gpu_memory - 3)}GB"
                    max_memory[i] = per_gpu_memory
                logger.info(f"Detected {num_gpus} GPUs with total {total_vram:.1f}GB VRAM")
                logger.info(f"Configured max_memory: {max_memory}")
                # Warn if total VRAM is low
                if total_vram < 90 and not use_8bit:
                    logger.warning(
                        f"Only {total_vram:.1f}GB VRAM available. The 48B model needs ~96GB "
                        f"in bfloat16. Consider enabling 8-bit quantization."
                    )
            else:
                # Fallback for CPU-only (will be slow)
                max_memory = {"cpu": "64GB"}
                logger.warning("No GPUs detected, using CPU fallback")
            # Load base model with explicit multi-GPU configuration
            progress(0.25, desc="Loading base model (this may take several minutes)...")
            logger.info(f"Loading base model: {BASE_MODEL_NAME}")
            logger.info("Note: for merging, we use a simpler device_map to avoid key naming issues")
            if use_8bit:
                logger.info("Using 8-bit quantization for memory efficiency (~50% memory reduction)")
                precision_desc = "int8"
            else:
                logger.info("Using bfloat16 precision for memory efficiency")
                precision_desc = "bfloat16"
            try:
                # For merging, use a sequential device map to avoid complex key nesting.
                # This keeps key names consistent between training and merging.
                load_kwargs = {
                    "trust_remote_code": True,
                    "low_cpu_mem_usage": True,
                    "device_map": "sequential",  # changed from "auto" to avoid key nesting issues
                    "max_memory": max_memory,
                }
                if use_8bit:
                    # Use 8-bit quantization for tighter memory constraints.
                    # BitsAndBytesConfig is the current API; passing load_in_8bit
                    # directly to from_pretrained is deprecated in recent transformers.
                    load_kwargs["quantization_config"] = BitsAndBytesConfig(
                        load_in_8bit=True,
                        llm_int8_enable_fp32_cpu_offload=True,
                        llm_int8_threshold=6.0,
                    )
                    logger.info("Enabling CPU offload for 8-bit quantization")
                else:
                    # Use bfloat16 for best quality when memory allows
                    load_kwargs["torch_dtype"] = torch.bfloat16
                self.base_model = AutoModelForCausalLM.from_pretrained(
                    BASE_MODEL_NAME,
                    **load_kwargs
                )
                logger.info(f"Base model loaded successfully in {precision_desc}")
                # Log the device map to see the layer distribution
                if hasattr(self.base_model, 'hf_device_map'):
                    logger.info(f"Model device map: {self.base_model.hf_device_map}")
            except torch.cuda.OutOfMemoryError:
                logger.error("Out of memory error!")
                error_msg = "GPU Out of Memory: The 48B model requires ~96GB VRAM in bfloat16 or ~48GB in 8-bit.\n"
                error_msg += f"You have {total_vram:.1f}GB VRAM available.\n"
                if not use_8bit:
                    error_msg += "\n💡 **Try enabling 8-bit quantization** to reduce memory usage by ~50%."
                raise Exception(error_msg)
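            # Rough arithmetic behind the numbers above (illustrative):
            #   48e9 params x 2 bytes (bfloat16) ~= 96 GB
            #   48e9 params x 1 byte  (int8)     ~= 48 GB
            # plus a few GB of headroom per GPU for the CUDA context.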
            # Load LoRA configuration
            progress(0.50, desc="Loading LoRA adapters...")
            logger.info(f"Loading LoRA adapters from: {LORA_MODEL_NAME}")

            # Check that the LoRA model exists and is accessible
            try:
                from huggingface_hub import repo_info
                info = repo_info(LORA_MODEL_NAME, token=hf_token)
                logger.info(f"LoRA model found: {info}")
            except Exception as e:
                logger.warning(f"Could not verify LoRA model: {str(e)}")

            # Load LoRA adapters with additional parameters
            try:
                logger.info("Attempting to load LoRA adapters...")
                logger.info("LoRA targets attention layers: q_proj, k_proj, v_proj, o_proj")
                # Load the PEFT model - this wraps the base model
                peft_model = PeftModel.from_pretrained(
                    self.base_model,
                    LORA_MODEL_NAME,
                    torch_dtype=torch.bfloat16 if not use_8bit else None,
                    is_trainable=False,
                )
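                # At this point the adapters are attached as separate lora_A /
                # lora_B modules next to the frozen base weights; nothing has
                # been folded into the base model yet.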
logger.info("LoRA adapters loaded successfully")
progress(0.70, desc="Merging LoRA weights with base model...")
logger.info("Merging LoRA weights into base model...")
# Use merge_and_unload with explicit safe merge
try:
self.merged_model = peft_model.merge_and_unload(safe_merge=True)
logger.info("Models merged successfully with safe_merge=True")
except Exception as merge_error:
logger.warning(f"safe_merge=True failed, trying without: {str(merge_error)}")
# Fallback to regular merge
self.merged_model = peft_model.merge_and_unload()
logger.info("Models merged successfully")
            except KeyError as e:
                # Handle missing keys - likely an architecture mismatch
                error_key = str(e)
                error_msg = f"Key error when loading LoRA adapters: {error_key}\n\n"
                if "block_sparse_moe" in error_key or "experts" in error_key:
                    error_msg += "⚠️ This error is related to MoE (Mixture of Experts) layers.\n\n"
                    error_msg += "The LoRA adapters only target attention layers (q/k/v/o_proj),\n"
                    error_msg += "but there seems to be a key naming mismatch with the base model.\n\n"
                error_msg += "Possible causes:\n"
                error_msg += "1. The base model version has changed since training\n"
                error_msg += "2. Different transformers/peft library versions\n"
                error_msg += "3. The model was saved with a different device_map than it is loaded with\n\n"
                error_msg += "Please verify:\n"
                error_msg += f"- Base model: {BASE_MODEL_NAME}\n"
                error_msg += f"- LoRA model: {LORA_MODEL_NAME}\n"
                error_msg += "- Both use the same transformers version\n"
                logger.error(error_msg)
                raise Exception(error_msg)
            except Exception as e:
                logger.error(f"Unexpected error during merge: {str(e)}", exc_info=True)
                raise
            # Save merged model
            progress(0.85, desc="Saving merged model...")
            logger.info(f"Saving merged model to: {OUTPUT_DIR}")
            os.makedirs(OUTPUT_DIR, exist_ok=True)
            self.merged_model.save_pretrained(
                OUTPUT_DIR,
                safe_serialization=True,
                max_shard_size="5GB"
            )
            self.tokenizer.save_pretrained(OUTPUT_DIR)
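            # The saved directory can later be loaded like any standalone
            # checkpoint, without peft - a minimal sketch:
            #   model = AutoModelForCausalLM.from_pretrained(
            #       OUTPUT_DIR, torch_dtype=torch.bfloat16,
            #       device_map="auto", trust_remote_code=True)
            #   tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)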
            progress(1.0, desc="Complete!")
            logger.info("Merge completed successfully")

            # Get model info
            total_params = sum(p.numel() for p in self.merged_model.parameters())
            trainable_params = sum(p.numel() for p in self.merged_model.parameters() if p.requires_grad)

            # Get GPU memory usage
            gpu_memory_info = ""
            if torch.cuda.is_available():
                gpu_memory_info = "\n**GPU Memory Usage:**\n"
                for i in range(torch.cuda.device_count()):
                    allocated = torch.cuda.memory_allocated(i) / 1024**3
                    reserved = torch.cuda.memory_reserved(i) / 1024**3
                    total = torch.cuda.get_device_properties(i).total_memory / 1024**3
                    gpu_memory_info += f"- GPU {i}: {allocated:.2f}GB allocated / {reserved:.2f}GB reserved / {total:.2f}GB total\n"

            # Note: single quotes inside the strftime expression keep this
            # f-string valid on Python versions before 3.12.
            result_message = f"""
✅ **Merge Completed Successfully!**

**Model Information:**
- Base Model: `{BASE_MODEL_NAME}`
- LoRA Adapters: `{LORA_MODEL_NAME}`
- Output Directory: `{OUTPUT_DIR}`
- Total Parameters: {total_params:,}
- Trainable Parameters: {trainable_params:,}
- Model Size (bfloat16): ~{(total_params * 2) / 1024**3:.2f} GB
- Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
{gpu_memory_info}
**Next Steps:**
1. The merged model is saved in the container at `/app/merged_model`
2. You can now test the model using the inference tab
3. To upload to Hugging Face, use the upload section
"""
            return result_message
        except Exception as e:
            logger.error(f"Error during merge: {str(e)}", exc_info=True)
            self.clear_memory()
            return f"❌ **Error during merge:**\n\n{str(e)}\n\nPlease check the logs for more details."
    def test_inference(self, prompt, max_length, temperature, top_p, progress=gr.Progress()):
        """Test the merged model with a prompt"""
        try:
            if self.merged_model is None:
                return "❌ Please merge the models first before testing inference."
            progress(0.3, desc="Tokenizing input...")
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.merged_model.device)
            progress(0.5, desc="Generating response...")
            with torch.no_grad():
                outputs = self.merged_model.generate(
                    **inputs,
                    max_length=max_length,
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                )
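            # Note: max_length counts prompt tokens plus generated tokens; to
            # bound only the newly generated tokens, max_new_tokens=... is the
            # usual alternative to pass to generate().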
            progress(0.9, desc="Decoding output...")
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            progress(1.0, desc="Complete!")
            return response
        except Exception as e:
            logger.error(f"Error during inference: {str(e)}", exc_info=True)
            return f"❌ **Error during inference:**\n\n{str(e)}"
    def upload_to_hub(self, repo_name, hf_token, private, progress=gr.Progress()):
        """Upload merged model to Hugging Face Hub"""
        try:
            if self.merged_model is None:
                return "❌ Please merge the models first before uploading."
            if not repo_name:
                return "❌ Please provide a repository name."
            if not hf_token:
                return "❌ Please provide a Hugging Face token."
            progress(0.1, desc="Logging in...")
            login(token=hf_token)
            progress(0.3, desc="Uploading model to Hugging Face Hub...")
            logger.info(f"Uploading to: {repo_name}")
            self.merged_model.push_to_hub(
                repo_name,
                private=private,
                safe_serialization=True,
                max_shard_size="5GB"
            )
            progress(0.8, desc="Uploading tokenizer...")
            self.tokenizer.push_to_hub(repo_name, private=private)
            progress(1.0, desc="Complete!")
            logger.info("Upload completed successfully")
            repo_url = f"https://huggingface.co/{repo_name}"
            return f"✅ **Successfully uploaded to Hugging Face Hub!**\n\nRepository: [{repo_name}]({repo_url})"
        except Exception as e:
            logger.error(f"Error during upload: {str(e)}", exc_info=True)
            return f"❌ **Error during upload:**\n\n{str(e)}"
# Initialize merger
merger = ModelMerger()

# Get GPU info for display
def get_gpu_info():
    if not torch.cuda.is_available():
        return "⚠️ **No GPUs detected!** This Space requires GPUs to run."
    gpu_info = f"✅ **{torch.cuda.device_count()} GPU(s) detected:**\n\n"
    total_memory = 0
    for i in range(torch.cuda.device_count()):
        name = torch.cuda.get_device_name(i)
        memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
        total_memory += memory
        gpu_info += f"- GPU {i}: {name} ({memory:.1f} GB)\n"
    gpu_info += f"\n**Total VRAM:** {total_memory:.1f} GB"
    return gpu_info
# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), title="LoRA Model Merger") as demo:
    gr.Markdown("""
    # 🔗 LoRA Model Merger

    Merge your fine-tuned LoRA adapters with the base model for the **Kimi-Linear-48B-A3B-Instruct** model.

    **Models:**
    - **Base Model:** `moonshotai/Kimi-Linear-48B-A3B-Instruct`
    - **LoRA Adapters:** `Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned`
    """)

    # Display GPU info
    gr.Markdown(get_gpu_info())

    with gr.Tabs():
        # Tab 1: Merge Models
        with gr.Tab("🔄 Merge Models"):
            gr.Markdown("""
            ### Step 1: Merge LoRA Adapters with Base Model

            This process will:
            1. Download the base model and LoRA adapters
            2. Merge the LoRA weights into the base model
            3. Save the merged model for inference

            ⚠️ **Important Notes:**
            - This process may take 10-30 minutes depending on model size and network speed
            - The 48B parameter model requires **~96GB VRAM** in bfloat16 precision
            - Recommended: 4x L40S GPUs (192GB total VRAM) for comfortable operation
            - The model will be automatically distributed across all available GPUs
            """)
            with gr.Row():
                hf_token_merge = gr.Textbox(
                    label="Hugging Face Token",
                    placeholder="hf_...",
                    type="password",
                    info="Required for accessing private models or avoiding rate limits"
                )
            with gr.Row():
                use_8bit_checkbox = gr.Checkbox(
                    label="Use 8-bit Quantization",
                    value=False,
                    info="Enable this if you have limited GPU memory (<96GB total). Reduces memory usage by ~50% with minimal quality loss."
                )
            merge_button = gr.Button("🚀 Start Merge Process", variant="primary", size="lg")
            merge_output = gr.Markdown(label="Merge Status")
            merge_button.click(
                fn=merger.merge_models,
                inputs=[hf_token_merge, use_8bit_checkbox],
                outputs=merge_output
            )
        # Tab 2: Test Inference
        with gr.Tab("🧪 Test Inference"):
            gr.Markdown("""
            ### Step 2: Test the Merged Model

            Test the merged model with custom prompts to verify it's working correctly.
            """)
            with gr.Row():
                with gr.Column():
                    test_prompt = gr.Textbox(
                        label="Test Prompt",
                        placeholder="Enter your test prompt here...",
                        lines=5,
                        value="Hello, how are you today?"
                    )
                    with gr.Row():
                        max_length = gr.Slider(
                            minimum=50,
                            maximum=2048,
                            value=512,
                            step=1,
                            label="Max Length"
                        )
                        temperature = gr.Slider(
                            minimum=0.1,
                            maximum=2.0,
                            value=0.7,
                            step=0.1,
                            label="Temperature"
                        )
                        top_p = gr.Slider(
                            minimum=0.1,
                            maximum=1.0,
                            value=0.9,
                            step=0.05,
                            label="Top P"
                        )
                    test_button = gr.Button("🎯 Generate", variant="primary")
                with gr.Column():
                    test_output = gr.Textbox(
                        label="Model Output",
                        lines=15,
                        interactive=False
                    )
            test_button.click(
                fn=merger.test_inference,
                inputs=[test_prompt, max_length, temperature, top_p],
                outputs=test_output
            )
        # Tab 3: Upload to Hub
        with gr.Tab("☁️ Upload to Hub"):
            gr.Markdown("""
            ### Step 3: Upload Merged Model to Hugging Face Hub

            Upload your merged model to Hugging Face Hub for easy sharing and deployment.
            """)
            with gr.Row():
                with gr.Column():
                    repo_name = gr.Textbox(
                        label="Repository Name",
                        placeholder="username/model-name",
                        info="Format: username/model-name"
                    )
                    hf_token_upload = gr.Textbox(
                        label="Hugging Face Token (with write access)",
                        placeholder="hf_...",
                        type="password",
                        info="Token must have write permissions"
                    )
                    private_repo = gr.Checkbox(
                        label="Private Repository",
                        value=True,
                        info="Keep the model private"
                    )
                    upload_button = gr.Button("📤 Upload to Hub", variant="primary", size="lg")
                with gr.Column():
                    upload_output = gr.Markdown(label="Upload Status")
            upload_button.click(
                fn=merger.upload_to_hub,
                inputs=[repo_name, hf_token_upload, private_repo],
                outputs=upload_output
            )
# Tab 4: Info & Help
with gr.Tab("ℹ️ Info & Help"):
gr.Markdown("""
## About This Space
This Space allows you to merge LoRA (Low-Rank Adaptation) fine-tuned models with their base models.
### What is LoRA Merging?
LoRA is a parameter-efficient fine-tuning technique that adds small adapter layers to a pretrained model.
To use the fine-tuned model without the PEFT library overhead, you can merge these adapters back into
the base model, creating a single unified model.
### Process Overview
1. **Merge:** Combines the LoRA adapters with the base model
2. **Test:** Verify the merged model works correctly with inference
3. **Upload:** Share your merged model on Hugging Face Hub
### Hardware Requirements
- **Current Setup:** 4x NVIDIA L40S GPUs (48GB VRAM each)
- **Model Size:** ~48B parameters
- **Memory Usage:** ~96-120GB VRAM during merge
### Tips
- The merge process can take 10-30 minutes
- Make sure you have a valid Hugging Face token with appropriate permissions
- Test the model thoroughly before uploading to Hub
- Consider keeping the uploaded model private initially
### Troubleshooting
**Out of Memory Errors:**
- The model is very large (48B parameters)
- Try restarting the Space to clear memory
**Authentication Errors:**
- Ensure your HF token has read access to the base model
- For private models, token must have appropriate permissions
**Slow Download/Upload:**
- Large models take time to transfer
- Network speed affects download/upload times
### Support
For issues or questions, please check:
- [PEFT Documentation](https://huggingface.co/docs/peft)
- [Transformers Documentation](https://huggingface.co/docs/transformers)
""")
gr.Markdown("""
---
**Note:** This Space requires significant computational resources. Ensure you have appropriate GPU allocation.
""")
# Launch the app
if __name__ == "__main__":
    demo.queue(max_size=5)
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )