import os
import gc
import logging
from datetime import datetime

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
from huggingface_hub import login, snapshot_download
from accelerate import init_empty_weights, load_checkpoint_and_dispatch, infer_auto_device_map

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Check GPU availability
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    logger.info(f"Found {num_gpus} GPUs available")
    for i in range(num_gpus):
        gpu_name = torch.cuda.get_device_name(i)
        gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
        logger.info(f"GPU {i}: {gpu_name} with {gpu_memory:.2f} GB memory")
else:
    logger.warning("No GPUs found! This will likely fail for the 48B model.")

# Constants
BASE_MODEL_NAME = "moonshotai/Kimi-Linear-48B-A3B-Instruct"
LORA_MODEL_NAME = "Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned"
OUTPUT_DIR = "/app/merged_model"


class ModelMerger:
    def __init__(self):
        self.base_model = None
        self.tokenizer = None
        self.merged_model = None

    def clear_memory(self):
        """Clear GPU memory."""
        if self.base_model is not None:
            del self.base_model
            self.base_model = None
        if self.merged_model is not None:
            del self.merged_model
            self.merged_model = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            # Synchronize all GPUs
            for i in range(torch.cuda.device_count()):
                with torch.cuda.device(i):
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize()
        logger.info("Memory cleared successfully")

    def login_huggingface(self, token):
        """Login to Hugging Face."""
        try:
            login(token=token)
            logger.info("Successfully logged in to Hugging Face")
            return "✅ Successfully logged in to Hugging Face"
        except Exception as e:
            logger.error(f"Login failed: {str(e)}")
            return f"❌ Login failed: {str(e)}"

    def merge_models(self, hf_token, use_8bit=False, progress=gr.Progress()):
        """Merge LoRA adapters with the base model."""
        try:
            # Login to HF
            if hf_token:
                progress(0.05, desc="Logging in to Hugging Face...")
                login(token=hf_token)
                logger.info("Logged in to Hugging Face")

            # Clear any existing models from memory
            progress(0.1, desc="Clearing GPU memory...")
            self.clear_memory()

            # Load tokenizer
            progress(0.15, desc="Loading tokenizer...")
            logger.info("Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)

            # Configure memory allocation for multi-GPU setup
            # Auto-detect GPU memory and adjust accordingly
            num_gpus = torch.cuda.device_count()
            max_memory = {}
            total_vram = 0
            if num_gpus > 0:
                # Calculate available memory per GPU
                for i in range(num_gpus):
                    gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
                    total_vram += gpu_memory
                    # Reserve ~3GB per GPU for overhead
                    per_gpu_memory = f"{int(gpu_memory - 3)}GB"
                    max_memory[i] = per_gpu_memory
                logger.info(f"Detected {num_gpus} GPUs with total {total_vram:.1f}GB VRAM")
                logger.info(f"Configured max_memory: {max_memory}")
                # Warn if total VRAM is low
                if total_vram < 90 and not use_8bit:
                    logger.warning(
                        f"Only {total_vram:.1f}GB VRAM available. The 48B model needs ~96GB in bfloat16. "
                        "Consider enabling 8-bit quantization."
                    )
            else:
                # Fallback for CPU-only execution (will be very slow)
                max_memory = {"cpu": "64GB"}
                logger.warning("No GPUs detected, using CPU fallback")

            # Load base model with explicit multi-GPU configuration
            progress(0.25, desc="Loading base model (this may take several minutes)...")
            logger.info(f"Loading base model: {BASE_MODEL_NAME}")

            if use_8bit:
                logger.info("Using 8-bit quantization for memory efficiency (~50% memory reduction)")
                precision_desc = "int8"
            else:
                logger.info("Using bfloat16 precision for memory efficiency")
                precision_desc = "bfloat16"

            try:
                load_kwargs = {
                    "trust_remote_code": True,
                    "low_cpu_mem_usage": True,
                    "device_map": "auto",
                    "max_memory": max_memory,
                    "offload_folder": "/tmp/offload",
                    "offload_state_dict": True,
                }
                if use_8bit:
                    # Use 8-bit quantization for tighter memory constraints
                    load_kwargs["quantization_config"] = BitsAndBytesConfig(
                        load_in_8bit=True,
                        llm_int8_enable_fp32_cpu_offload=True,
                        llm_int8_threshold=6.0,
                    )
                    logger.info("Enabling CPU offload for 8-bit quantization")
                else:
                    # Use bfloat16 for best quality when memory allows
                    load_kwargs["torch_dtype"] = torch.bfloat16

                self.base_model = AutoModelForCausalLM.from_pretrained(
                    BASE_MODEL_NAME,
                    **load_kwargs,
                )
                logger.info(f"Base model loaded successfully in {precision_desc}")

                # Log the device map to see how layers were distributed
                if hasattr(self.base_model, 'hf_device_map'):
                    logger.info(f"Model device map: {self.base_model.hf_device_map}")

            except torch.cuda.OutOfMemoryError:
                logger.error("Out of memory error!")
                error_msg = "GPU Out of Memory: The 48B model requires ~96GB VRAM in bfloat16 or ~48GB in 8-bit.\n"
                error_msg += f"You have {total_vram:.1f}GB VRAM available.\n"
                if not use_8bit:
                    error_msg += "\n💡 **Try enabling 8-bit quantization** to reduce memory usage by ~50%."
                raise Exception(error_msg)

            # Load LoRA configuration
            progress(0.50, desc="Loading LoRA adapters...")
            logger.info(f"Loading LoRA adapters from: {LORA_MODEL_NAME}")

            # Check if the LoRA model exists and is accessible
            try:
                from huggingface_hub import repo_info
                info = repo_info(LORA_MODEL_NAME, token=hf_token)
                logger.info(f"LoRA model found: {info}")
            except Exception as e:
                logger.warning(f"Could not verify LoRA model: {str(e)}")

            # Load LoRA adapters
            try:
                logger.info("Attempting to load LoRA adapters...")
                logger.info("LoRA targets attention layers: q_proj, k_proj, v_proj, o_proj")

                # Load the PEFT model - this wraps the base model
                peft_model = PeftModel.from_pretrained(
                    self.base_model,
                    LORA_MODEL_NAME,
                    torch_dtype=torch.bfloat16 if not use_8bit else None,
                    is_trainable=False,
                )
                logger.info("LoRA adapters loaded successfully")

                progress(0.70, desc="Merging LoRA weights with base model...")
                logger.info("Merging LoRA weights into base model...")

                # Use merge_and_unload with an explicit safe merge first
                try:
                    self.merged_model = peft_model.merge_and_unload(safe_merge=True)
                    logger.info("Models merged successfully with safe_merge=True")
                except Exception as merge_error:
                    logger.warning(f"safe_merge=True failed, trying without: {str(merge_error)}")
                    # Fall back to a regular merge
                    self.merged_model = peft_model.merge_and_unload()
                    logger.info("Models merged successfully")

            except KeyError as e:
                # Handle missing keys - likely an architecture or naming mismatch
                error_key = str(e)
                error_msg = f"Key error when loading LoRA adapters: {error_key}\n\n"
                if "block_sparse_moe" in error_key or "experts" in error_key:
                    error_msg += "⚠️ This error is related to MoE (Mixture of Experts) layers.\n\n"
                    error_msg += "The LoRA adapters only target attention layers (q/k/v/o_proj),\n"
error_msg += "but there seems to be a key naming mismatch with the base model.\n\n" error_msg += "Possible causes:\n" error_msg += "1. The base model version has changed since training\n" error_msg += "2. Different transformers/peft library versions\n" error_msg += "3. Model was saved with different device_map than loading\n\n" error_msg += "Please verify:\n" error_msg += f"- Base model: {BASE_MODEL_NAME}\n" error_msg += f"- LoRA model: {LORA_MODEL_NAME}\n" error_msg += "- Both use the same transformers version\n" logger.error(error_msg) raise Exception(error_msg) except Exception as e: logger.error(f"Unexpected error during merge: {str(e)}", exc_info=True) raise # Save merged model progress(0.85, desc="Saving merged model...") logger.info(f"Saving merged model to: {OUTPUT_DIR}") os.makedirs(OUTPUT_DIR, exist_ok=True) self.merged_model.save_pretrained( OUTPUT_DIR, safe_serialization=True, max_shard_size="5GB" ) self.tokenizer.save_pretrained(OUTPUT_DIR) progress(1.0, desc="Complete!") logger.info("Merge completed successfully") # Get model info total_params = sum(p.numel() for p in self.merged_model.parameters()) trainable_params = sum(p.numel() for p in self.merged_model.parameters() if p.requires_grad) # Get GPU memory usage gpu_memory_info = "" if torch.cuda.is_available(): gpu_memory_info = "\n**GPU Memory Usage:**\n" for i in range(torch.cuda.device_count()): allocated = torch.cuda.memory_allocated(i) / 1024**3 reserved = torch.cuda.memory_reserved(i) / 1024**3 total = torch.cuda.get_device_properties(i).total_memory / 1024**3 gpu_memory_info += f"- GPU {i}: {allocated:.2f}GB allocated / {reserved:.2f}GB reserved / {total:.2f}GB total\n" result_message = f""" โœ… **Merge Completed Successfully!** **Model Information:** - Base Model: `{BASE_MODEL_NAME}` - LoRA Adapters: `{LORA_MODEL_NAME}` - Output Directory: `{OUTPUT_DIR}` - Total Parameters: {total_params:,} - Trainable Parameters: {trainable_params:,} - Model Size (bfloat16): ~{(total_params * 2) / 1024**3:.2f} GB - Timestamp: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")} {gpu_memory_info} **Next Steps:** 1. The merged model is saved in the container at `/app/merged_model` 2. You can now test the model using the inference tab 3. To upload to Hugging Face, use the upload section """ return result_message except Exception as e: logger.error(f"Error during merge: {str(e)}", exc_info=True) self.clear_memory() return f"โŒ **Error during merge:**\n\n{str(e)}\n\nPlease check the logs for more details." def test_inference(self, prompt, max_length, temperature, top_p, progress=gr.Progress()): """Test the merged model with a prompt""" try: if self.merged_model is None: return "โŒ Please merge the models first before testing inference." 
            progress(0.3, desc="Tokenizing input...")
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.merged_model.device)

            progress(0.5, desc="Generating response...")
            with torch.no_grad():
                outputs = self.merged_model.generate(
                    **inputs,
                    max_length=max_length,
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                )

            progress(0.9, desc="Decoding output...")
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            progress(1.0, desc="Complete!")
            return response

        except Exception as e:
            logger.error(f"Error during inference: {str(e)}", exc_info=True)
            return f"❌ **Error during inference:**\n\n{str(e)}"

    def upload_to_hub(self, repo_name, hf_token, private, progress=gr.Progress()):
        """Upload merged model to Hugging Face Hub"""
        try:
            if self.merged_model is None:
                return "❌ Please merge the models first before uploading."
            if not repo_name:
                return "❌ Please provide a repository name."
            if not hf_token:
                return "❌ Please provide a Hugging Face token."

            progress(0.1, desc="Logging in...")
            login(token=hf_token)

            progress(0.3, desc="Uploading model to Hugging Face Hub...")
            logger.info(f"Uploading to: {repo_name}")

            self.merged_model.push_to_hub(
                repo_name,
                private=private,
                safe_serialization=True,
                max_shard_size="5GB",
            )

            progress(0.8, desc="Uploading tokenizer...")
            self.tokenizer.push_to_hub(repo_name, private=private)

            progress(1.0, desc="Complete!")
            logger.info("Upload completed successfully")

            repo_url = f"https://huggingface.co/{repo_name}"
            return f"✅ **Successfully uploaded to Hugging Face Hub!**\n\nRepository: [{repo_name}]({repo_url})"

        except Exception as e:
            logger.error(f"Error during upload: {str(e)}", exc_info=True)
            return f"❌ **Error during upload:**\n\n{str(e)}"


# Initialize merger
merger = ModelMerger()


# Get GPU info for display
def get_gpu_info():
    if not torch.cuda.is_available():
        return "⚠️ **No GPUs detected!** This Space requires GPUs to run."

    gpu_info = f"✅ **{torch.cuda.device_count()} GPU(s) detected:**\n\n"
    total_memory = 0
    for i in range(torch.cuda.device_count()):
        name = torch.cuda.get_device_name(i)
        memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
        total_memory += memory
        gpu_info += f"- GPU {i}: {name} ({memory:.1f} GB)\n"
    gpu_info += f"\n**Total VRAM:** {total_memory:.1f} GB"
    return gpu_info


# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), title="LoRA Model Merger") as demo:
    gr.Markdown("""
    # 🔗 LoRA Model Merger

    Merge your fine-tuned LoRA adapters with the base model for the **Kimi-Linear-48B-A3B-Instruct** model.

    **Models:**
    - **Base Model:** `moonshotai/Kimi-Linear-48B-A3B-Instruct`
    - **LoRA Adapters:** `Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned`
    """)

    # Display GPU info
    gr.Markdown(get_gpu_info())

    with gr.Tabs():
        # Tab 1: Merge Models
        with gr.Tab("🔄 Merge Models"):
            gr.Markdown("""
            ### Step 1: Merge LoRA Adapters with Base Model

            This process will:
            1. Download the base model and LoRA adapters
            2. Merge the LoRA weights into the base model
            3. Save the merged model for inference
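
            Under the hood, the merge step follows the standard PEFT pattern shown below. This is a simplified sketch of what this Space runs for you; the real code adds multi-GPU placement, optional 8-bit loading, and error handling.

            ```python
            import torch
            from transformers import AutoModelForCausalLM
            from peft import PeftModel

            base = AutoModelForCausalLM.from_pretrained(
                "moonshotai/Kimi-Linear-48B-A3B-Instruct",
                torch_dtype=torch.bfloat16,
                device_map="auto",
                trust_remote_code=True,
            )
            peft_model = PeftModel.from_pretrained(base, "Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned")
            merged = peft_model.merge_and_unload()
            merged.save_pretrained("/app/merged_model", safe_serialization=True)
            ```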

            ⚠️ **Important Notes:**
            - This process may take 10-30 minutes depending on model size and network speed
            - The 48B parameter model requires **~96GB VRAM** in bfloat16 precision
            - Recommended: 4x L40S GPUs (192GB total VRAM) for comfortable operation
            - The model will be automatically distributed across all available GPUs
            """)

            with gr.Row():
                hf_token_merge = gr.Textbox(
                    label="Hugging Face Token",
                    placeholder="hf_...",
                    type="password",
                    info="Required for accessing private models or avoiding rate limits"
                )

            with gr.Row():
                use_8bit_checkbox = gr.Checkbox(
                    label="Use 8-bit Quantization",
                    value=False,
                    info="Enable this if you have limited GPU memory (<96GB total). Reduces memory usage by ~50% with minimal quality loss."
                )

            merge_button = gr.Button("🚀 Start Merge Process", variant="primary", size="lg")
            merge_output = gr.Markdown(label="Merge Status")

            merge_button.click(
                fn=merger.merge_models,
                inputs=[hf_token_merge, use_8bit_checkbox],
                outputs=merge_output
            )

        # Tab 2: Test Inference
        with gr.Tab("🧪 Test Inference"):
            gr.Markdown("""
            ### Step 2: Test the Merged Model

            Test the merged model with custom prompts to verify it's working correctly.
            """)

            with gr.Row():
                with gr.Column():
                    test_prompt = gr.Textbox(
                        label="Test Prompt",
                        placeholder="Enter your test prompt here...",
                        lines=5,
                        value="Hello, how are you today?"
                    )
                    with gr.Row():
                        max_length = gr.Slider(
                            minimum=50,
                            maximum=2048,
                            value=512,
                            step=1,
                            label="Max Length"
                        )
                        temperature = gr.Slider(
                            minimum=0.1,
                            maximum=2.0,
                            value=0.7,
                            step=0.1,
                            label="Temperature"
                        )
                        top_p = gr.Slider(
                            minimum=0.1,
                            maximum=1.0,
                            value=0.9,
                            step=0.05,
                            label="Top P"
                        )
                    test_button = gr.Button("🎯 Generate", variant="primary")

                with gr.Column():
                    test_output = gr.Textbox(
                        label="Model Output",
                        lines=15,
                        interactive=False
                    )

            test_button.click(
                fn=merger.test_inference,
                inputs=[test_prompt, max_length, temperature, top_p],
                outputs=test_output
            )

        # Tab 3: Upload to Hub
        with gr.Tab("☁️ Upload to Hub"):
            gr.Markdown("""
            ### Step 3: Upload Merged Model to Hugging Face Hub

            Upload your merged model to Hugging Face Hub for easy sharing and deployment.
            """)

            with gr.Row():
                with gr.Column():
                    repo_name = gr.Textbox(
                        label="Repository Name",
                        placeholder="username/model-name",
                        info="Format: username/model-name"
                    )
                    hf_token_upload = gr.Textbox(
                        label="Hugging Face Token (with write access)",
                        placeholder="hf_...",
                        type="password",
                        info="Token must have write permissions"
                    )
                    private_repo = gr.Checkbox(
                        label="Private Repository",
                        value=True,
                        info="Keep the model private"
                    )
                    upload_button = gr.Button("📤 Upload to Hub", variant="primary", size="lg")

                with gr.Column():
                    upload_output = gr.Markdown(label="Upload Status")

            upload_button.click(
                fn=merger.upload_to_hub,
                inputs=[repo_name, hf_token_upload, private_repo],
                outputs=upload_output
            )

        # Tab 4: Info & Help
        with gr.Tab("ℹ️ Info & Help"):
            gr.Markdown("""
            ## About This Space

            This Space allows you to merge LoRA (Low-Rank Adaptation) fine-tuned models with their base models.

            ### What is LoRA Merging?

            LoRA is a parameter-efficient fine-tuning technique that adds small adapter layers to a pretrained model.
            To use the fine-tuned model without the PEFT library overhead, you can merge these adapters back into
            the base model, creating a single unified model.

            ### Process Overview

            1. **Merge:** Combines the LoRA adapters with the base model
            2. **Test:** Verify the merged model works correctly with inference
            3. **Upload:** Share your merged model on Hugging Face Hub
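
            Once the merge has finished, the standalone model saved to `/app/merged_model` can be loaded with plain `transformers` (no PEFT required). A minimal sketch, assuming the default output directory used by this Space:

            ```python
            import torch
            from transformers import AutoModelForCausalLM, AutoTokenizer

            # "/app/merged_model" matches this Space's OUTPUT_DIR; change it if you copied the model elsewhere.
            model = AutoModelForCausalLM.from_pretrained(
                "/app/merged_model",
                torch_dtype=torch.bfloat16,
                device_map="auto",
                trust_remote_code=True,
            )
            tokenizer = AutoTokenizer.from_pretrained("/app/merged_model", trust_remote_code=True)
            ```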

            ### Hardware Requirements

            - **Current Setup:** 4x NVIDIA L40S GPUs (48GB VRAM each)
            - **Model Size:** ~48B parameters
            - **Memory Usage:** ~96-120GB VRAM during merge

            ### Tips

            - The merge process can take 10-30 minutes
            - Make sure you have a valid Hugging Face token with appropriate permissions
            - Test the model thoroughly before uploading to Hub
            - Consider keeping the uploaded model private initially

            ### Troubleshooting

            **Out of Memory Errors:**
            - The model is very large (48B parameters)
            - Try restarting the Space to clear memory

            **Authentication Errors:**
            - Ensure your HF token has read access to the base model
            - For private models, token must have appropriate permissions

            **Slow Download/Upload:**
            - Large models take time to transfer
            - Network speed affects download/upload times

            ### Support

            For issues or questions, please check:
            - [PEFT Documentation](https://huggingface.co/docs/peft)
            - [Transformers Documentation](https://huggingface.co/docs/transformers)
            """)

    gr.Markdown("""
    ---
    **Note:** This Space requires significant computational resources. Ensure you have appropriate GPU allocation.
    """)

# Launch the app
if __name__ == "__main__":
    demo.queue(max_size=5)
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )