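# app.py - Hugging Face Space for merging QLoRA adapters into the
# Kimi-Linear-48B-A3B-Instruct base model, testing the merged model,
# and uploading the result to the Hugging Face Hub.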
import os
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
import gc
from huggingface_hub import login, snapshot_download
import logging
from datetime import datetime
from accelerate import init_empty_weights, load_checkpoint_and_dispatch, infer_auto_device_map

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Check GPU availability
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    logger.info(f"Found {num_gpus} GPUs available")
    for i in range(num_gpus):
        gpu_name = torch.cuda.get_device_name(i)
        gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
        logger.info(f"GPU {i}: {gpu_name} with {gpu_memory:.2f} GB memory")
else:
    logger.warning("No GPUs found! This will likely fail for the 48B model.")

# Constants
BASE_MODEL_NAME = "moonshotai/Kimi-Linear-48B-A3B-Instruct"
LORA_MODEL_NAME = "Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned"
OUTPUT_DIR = "/app/merged_model"
class ModelMerger:
    def __init__(self):
        self.base_model = None
        self.tokenizer = None
        self.merged_model = None

    def clear_memory(self):
        """Drop references to loaded models and free GPU memory."""
        if self.base_model is not None:
            del self.base_model
            self.base_model = None  # keep the attribute defined for later checks
        if self.merged_model is not None:
            del self.merged_model
            self.merged_model = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            # Empty the cache and synchronize on every GPU
            for i in range(torch.cuda.device_count()):
                with torch.cuda.device(i):
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize()
        logger.info("Memory cleared successfully")
    def login_huggingface(self, token):
        """Login to Hugging Face."""
        try:
            login(token=token)
            logger.info("Successfully logged in to Hugging Face")
            return "✅ Successfully logged in to Hugging Face"
        except Exception as e:
            logger.error(f"Login failed: {str(e)}")
            return f"❌ Login failed: {str(e)}"
    def merge_models(self, hf_token, use_8bit=False, progress=gr.Progress()):
        """Merge LoRA adapters with the base model."""
        try:
            # Login to HF
            if hf_token:
                progress(0.05, desc="Logging in to Hugging Face...")
                login(token=hf_token)
                logger.info("Logged in to Hugging Face")

            # Clear any existing models from memory
            progress(0.1, desc="Clearing GPU memory...")
            self.clear_memory()

            # Load tokenizer
            progress(0.15, desc="Loading tokenizer...")
            logger.info("Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)

            # Configure memory allocation for a multi-GPU setup:
            # auto-detect GPU memory and build a max_memory map for accelerate.
            num_gpus = torch.cuda.device_count()
            max_memory = {}
            total_vram = 0
            if num_gpus > 0:
                # Calculate available memory per GPU
                for i in range(num_gpus):
                    gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
                    total_vram += gpu_memory
                    # Reserve ~3GB per GPU for CUDA context and activation overhead
                    per_gpu_memory = f"{int(gpu_memory - 3)}GB"
                    max_memory[i] = per_gpu_memory
                logger.info(f"Detected {num_gpus} GPUs with total {total_vram:.1f}GB VRAM")
                logger.info(f"Configured max_memory: {max_memory}")

                # Warn if total VRAM is low
                if total_vram < 90 and not use_8bit:
                    logger.warning(
                        f"Only {total_vram:.1f}GB VRAM available. The 48B model needs ~96GB in bfloat16. "
                        "Consider enabling 8-bit quantization."
                    )
            else:
                # Fallback for CPU-only execution (will be very slow)
                max_memory = {"cpu": "64GB"}
                logger.warning("No GPUs detected, using CPU fallback")
            # Load base model with explicit multi-GPU configuration
            progress(0.25, desc="Loading base model (this may take several minutes)...")
            logger.info(f"Loading base model: {BASE_MODEL_NAME}")
            logger.info("Note: for merging, use a simpler device_map to avoid key naming issues")
            if use_8bit:
                logger.info("Using 8-bit quantization for memory efficiency (~50% memory reduction)")
                precision_desc = "int8"
            else:
                logger.info("Using bfloat16 precision for memory efficiency")
                precision_desc = "bfloat16"

            try:
                # For merging, use a sequential device map to avoid complex key nesting.
                # This keeps module key names consistent between training and merging.
                load_kwargs = {
                    "trust_remote_code": True,
                    "low_cpu_mem_usage": True,
                    "device_map": "sequential",  # "auto" can reshuffle modules and change key nesting
                    "max_memory": max_memory,
                }
                if use_8bit:
                    # Use 8-bit quantization for tighter memory constraints.
                    # Note: newer transformers versions prefer a BitsAndBytesConfig passed
                    # via quantization_config instead of these flags.
                    load_kwargs["load_in_8bit"] = True
                    load_kwargs["llm_int8_enable_fp32_cpu_offload"] = True
                    load_kwargs["llm_int8_threshold"] = 6.0
                    logger.info("Enabling CPU offload for 8-bit quantization")
                else:
                    # Use bfloat16 for best quality when memory allows
                    load_kwargs["torch_dtype"] = torch.bfloat16

                self.base_model = AutoModelForCausalLM.from_pretrained(
                    BASE_MODEL_NAME,
                    **load_kwargs
                )
                logger.info(f"Base model loaded successfully in {precision_desc}")

                # Log the device map to see how layers were distributed
                if hasattr(self.base_model, 'hf_device_map'):
                    logger.info(f"Model device map: {self.base_model.hf_device_map}")
            except torch.cuda.OutOfMemoryError:
                logger.error("Out of memory error!")
                error_msg = "GPU Out of Memory: the 48B model requires ~96GB VRAM in bfloat16 or ~48GB in 8-bit.\n"
                error_msg += f"You have {total_vram:.1f}GB VRAM available.\n"
                if not use_8bit:
                    error_msg += "\n💡 **Try enabling 8-bit quantization** to reduce memory usage by ~50%."
                raise Exception(error_msg)
            # Load LoRA configuration
            progress(0.50, desc="Loading LoRA adapters...")
            logger.info(f"Loading LoRA adapters from: {LORA_MODEL_NAME}")

            # Check that the LoRA repo exists and is accessible
            try:
                from huggingface_hub import repo_info
                info = repo_info(LORA_MODEL_NAME, token=hf_token)
                logger.info(f"LoRA model found: {info}")
            except Exception as e:
                logger.warning(f"Could not verify LoRA model: {str(e)}")

            # Load LoRA adapters and merge them into the base model
            try:
                logger.info("Attempting to load LoRA adapters...")
                logger.info("LoRA targets attention layers: q_proj, k_proj, v_proj, o_proj")

                # Load the PEFT model - this wraps the base model
                peft_model = PeftModel.from_pretrained(
                    self.base_model,
                    LORA_MODEL_NAME,
                    torch_dtype=torch.bfloat16 if not use_8bit else None,
                    is_trainable=False,
                )
                logger.info("LoRA adapters loaded successfully")

                progress(0.70, desc="Merging LoRA weights with base model...")
                logger.info("Merging LoRA weights into base model...")

                # Use merge_and_unload with an explicit safe merge first
                try:
                    self.merged_model = peft_model.merge_and_unload(safe_merge=True)
                    logger.info("Models merged successfully with safe_merge=True")
                except Exception as merge_error:
                    logger.warning(f"safe_merge=True failed, trying without: {str(merge_error)}")
                    # Fall back to a regular merge
                    self.merged_model = peft_model.merge_and_unload()
                    logger.info("Models merged successfully")
            except KeyError as e:
                # Handle missing keys - likely an architecture or key-naming mismatch
                error_key = str(e)
                error_msg = f"Key error when loading LoRA adapters: {error_key}\n\n"
                if "block_sparse_moe" in error_key or "experts" in error_key:
                    error_msg += "⚠️ This error is related to MoE (Mixture of Experts) layers.\n\n"
                    error_msg += "The LoRA adapters only target attention layers (q/k/v/o_proj),\n"
                    error_msg += "but there seems to be a key naming mismatch with the base model.\n\n"
                error_msg += "Possible causes:\n"
                error_msg += "1. The base model version has changed since training\n"
                error_msg += "2. Different transformers/peft library versions\n"
                error_msg += "3. Model was saved with a different device_map than used for loading\n\n"
                error_msg += "Please verify:\n"
                error_msg += f"- Base model: {BASE_MODEL_NAME}\n"
                error_msg += f"- LoRA model: {LORA_MODEL_NAME}\n"
                error_msg += "- Both use the same transformers version\n"
                logger.error(error_msg)
                raise Exception(error_msg)
            except Exception as e:
                logger.error(f"Unexpected error during merge: {str(e)}", exc_info=True)
                raise
            # Save merged model
            progress(0.85, desc="Saving merged model...")
            logger.info(f"Saving merged model to: {OUTPUT_DIR}")
            os.makedirs(OUTPUT_DIR, exist_ok=True)
            self.merged_model.save_pretrained(
                OUTPUT_DIR,
                safe_serialization=True,
                max_shard_size="5GB"
            )
            self.tokenizer.save_pretrained(OUTPUT_DIR)

            progress(1.0, desc="Complete!")
            logger.info("Merge completed successfully")

            # Get model info
            total_params = sum(p.numel() for p in self.merged_model.parameters())
            trainable_params = sum(p.numel() for p in self.merged_model.parameters() if p.requires_grad)

            # Get GPU memory usage
            gpu_memory_info = ""
            if torch.cuda.is_available():
                gpu_memory_info = "\n**GPU Memory Usage:**\n"
                for i in range(torch.cuda.device_count()):
                    allocated = torch.cuda.memory_allocated(i) / 1024**3
                    reserved = torch.cuda.memory_reserved(i) / 1024**3
                    total = torch.cuda.get_device_properties(i).total_memory / 1024**3
                    gpu_memory_info += f"- GPU {i}: {allocated:.2f}GB allocated / {reserved:.2f}GB reserved / {total:.2f}GB total\n"

            result_message = f"""
✅ **Merge Completed Successfully!**

**Model Information:**
- Base Model: `{BASE_MODEL_NAME}`
- LoRA Adapters: `{LORA_MODEL_NAME}`
- Output Directory: `{OUTPUT_DIR}`
- Total Parameters: {total_params:,}
- Trainable Parameters: {trainable_params:,}
- Model Size (bfloat16): ~{(total_params * 2) / 1024**3:.2f} GB
- Timestamp: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
{gpu_memory_info}
**Next Steps:**
1. The merged model is saved in the container at `/app/merged_model`
2. You can now test the model using the inference tab
3. To upload to Hugging Face, use the upload section
"""
            return result_message

        except Exception as e:
            logger.error(f"Error during merge: {str(e)}", exc_info=True)
            self.clear_memory()
            return f"❌ **Error during merge:**\n\n{str(e)}\n\nPlease check the logs for more details."
    def test_inference(self, prompt, max_length, temperature, top_p, progress=gr.Progress()):
        """Test the merged model with a prompt."""
        try:
            if self.merged_model is None:
                return "❌ Please merge the models first before testing inference."

            progress(0.3, desc="Tokenizing input...")
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.merged_model.device)

            progress(0.5, desc="Generating response...")
            with torch.no_grad():
                outputs = self.merged_model.generate(
                    **inputs,
                    max_length=max_length,
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                )

            progress(0.9, desc="Decoding output...")
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            progress(1.0, desc="Complete!")
            return response
        except Exception as e:
            logger.error(f"Error during inference: {str(e)}", exc_info=True)
            return f"❌ **Error during inference:**\n\n{str(e)}"
    def upload_to_hub(self, repo_name, hf_token, private, progress=gr.Progress()):
        """Upload the merged model to the Hugging Face Hub."""
        try:
            if self.merged_model is None:
                return "❌ Please merge the models first before uploading."
            if not repo_name:
                return "❌ Please provide a repository name."
            if not hf_token:
                return "❌ Please provide a Hugging Face token."

            progress(0.1, desc="Logging in...")
            login(token=hf_token)

            progress(0.3, desc="Uploading model to Hugging Face Hub...")
            logger.info(f"Uploading to: {repo_name}")
            self.merged_model.push_to_hub(
                repo_name,
                private=private,
                safe_serialization=True,
                max_shard_size="5GB"
            )

            progress(0.8, desc="Uploading tokenizer...")
            self.tokenizer.push_to_hub(repo_name, private=private)

            progress(1.0, desc="Complete!")
            logger.info("Upload completed successfully")

            repo_url = f"https://huggingface.co/{repo_name}"
            return f"✅ **Successfully uploaded to Hugging Face Hub!**\n\nRepository: [{repo_name}]({repo_url})"
        except Exception as e:
            logger.error(f"Error during upload: {str(e)}", exc_info=True)
            return f"❌ **Error during upload:**\n\n{str(e)}"
# Initialize merger
merger = ModelMerger()

# Get GPU info for display
def get_gpu_info():
    if not torch.cuda.is_available():
        return "⚠️ **No GPUs detected!** This Space requires GPUs to run."
    gpu_info = f"✅ **{torch.cuda.device_count()} GPU(s) detected:**\n\n"
    total_memory = 0
    for i in range(torch.cuda.device_count()):
        name = torch.cuda.get_device_name(i)
        memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
        total_memory += memory
        gpu_info += f"- GPU {i}: {name} ({memory:.1f} GB)\n"
    gpu_info += f"\n**Total VRAM:** {total_memory:.1f} GB"
    return gpu_info
# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), title="LoRA Model Merger") as demo:
    gr.Markdown("""
    # 🚀 LoRA Model Merger

    Merge your fine-tuned LoRA adapters into the **Kimi-Linear-48B-A3B-Instruct** base model.

    **Models:**
    - **Base Model:** `moonshotai/Kimi-Linear-48B-A3B-Instruct`
    - **LoRA Adapters:** `Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned`
    """)

    # Display GPU info
    gr.Markdown(get_gpu_info())

    with gr.Tabs():
        # Tab 1: Merge Models
        with gr.Tab("🔄 Merge Models"):
            gr.Markdown("""
            ### Step 1: Merge LoRA Adapters with the Base Model

            This process will:
            1. Download the base model and LoRA adapters
            2. Merge the LoRA weights into the base model
            3. Save the merged model for inference

            ⚠️ **Important Notes:**
            - This process may take 10-30 minutes depending on model size and network speed
            - The 48B parameter model requires **~96GB VRAM** in bfloat16 precision
            - Recommended: 4x L40S GPUs (192GB total VRAM) for comfortable operation
            - The model will be automatically distributed across all available GPUs
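            - Rough sizing: 48B parameters × 2 bytes/param (bfloat16) ≈ 96 GB of weights, or ≈ 48 GB in 8-bit, plus a few GB of overhead per GPU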
| """) | |
| with gr.Row(): | |
| hf_token_merge = gr.Textbox( | |
| label="Hugging Face Token", | |
| placeholder="hf_...", | |
| type="password", | |
| info="Required for accessing private models or avoiding rate limits" | |
| ) | |
| with gr.Row(): | |
| use_8bit_checkbox = gr.Checkbox( | |
| label="Use 8-bit Quantization", | |
| value=False, | |
| info="Enable this if you have limited GPU memory (<96GB total). Reduces memory usage by ~50% with minimal quality loss." | |
| ) | |
| merge_button = gr.Button("π Start Merge Process", variant="primary", size="lg") | |
| merge_output = gr.Markdown(label="Merge Status") | |
| merge_button.click( | |
| fn=merger.merge_models, | |
| inputs=[hf_token_merge, use_8bit_checkbox], | |
| outputs=merge_output | |
| ) | |
        # Tab 2: Test Inference
        with gr.Tab("🧪 Test Inference"):
            gr.Markdown("""
            ### Step 2: Test the Merged Model

            Test the merged model with custom prompts to verify it's working correctly.
            """)

            with gr.Row():
                with gr.Column():
                    test_prompt = gr.Textbox(
                        label="Test Prompt",
                        placeholder="Enter your test prompt here...",
                        lines=5,
                        value="Hello, how are you today?"
                    )
                    with gr.Row():
                        max_length = gr.Slider(
                            minimum=50,
                            maximum=2048,
                            value=512,
                            step=1,
                            label="Max Length"
                        )
                        temperature = gr.Slider(
                            minimum=0.1,
                            maximum=2.0,
                            value=0.7,
                            step=0.1,
                            label="Temperature"
                        )
                        top_p = gr.Slider(
                            minimum=0.1,
                            maximum=1.0,
                            value=0.9,
                            step=0.05,
                            label="Top P"
                        )
                    test_button = gr.Button("🎯 Generate", variant="primary")
                with gr.Column():
                    test_output = gr.Textbox(
                        label="Model Output",
                        lines=15,
                        interactive=False
                    )

            test_button.click(
                fn=merger.test_inference,
                inputs=[test_prompt, max_length, temperature, top_p],
                outputs=test_output
            )
        # Tab 3: Upload to Hub
        with gr.Tab("☁️ Upload to Hub"):
            gr.Markdown("""
            ### Step 3: Upload the Merged Model to the Hugging Face Hub

            Upload your merged model to the Hugging Face Hub for easy sharing and deployment.
            """)

            with gr.Row():
                with gr.Column():
                    repo_name = gr.Textbox(
                        label="Repository Name",
                        placeholder="username/model-name",
                        info="Format: username/model-name"
                    )
                    hf_token_upload = gr.Textbox(
                        label="Hugging Face Token (with write access)",
                        placeholder="hf_...",
                        type="password",
                        info="Token must have write permissions"
                    )
                    private_repo = gr.Checkbox(
                        label="Private Repository",
                        value=True,
                        info="Keep the model private"
                    )
                    upload_button = gr.Button("📤 Upload to Hub", variant="primary", size="lg")
                with gr.Column():
                    upload_output = gr.Markdown(label="Upload Status")

            upload_button.click(
                fn=merger.upload_to_hub,
                inputs=[repo_name, hf_token_upload, private_repo],
                outputs=upload_output
            )
| # Tab 4: Info & Help | |
| with gr.Tab("βΉοΈ Info & Help"): | |
| gr.Markdown(""" | |
| ## About This Space | |
| This Space allows you to merge LoRA (Low-Rank Adaptation) fine-tuned models with their base models. | |
| ### What is LoRA Merging? | |
| LoRA is a parameter-efficient fine-tuning technique that adds small adapter layers to a pretrained model. | |
| To use the fine-tuned model without the PEFT library overhead, you can merge these adapters back into | |
| the base model, creating a single unified model. | |
| ### Process Overview | |
| 1. **Merge:** Combines the LoRA adapters with the base model | |
| 2. **Test:** Verify the merged model works correctly with inference | |
| 3. **Upload:** Share your merged model on Hugging Face Hub | |
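
            Under the hood the merge itself is only a few lines of PEFT code. A minimal sketch, assuming the
            model fits in memory (this Space adds the multi-GPU `device_map`/`max_memory` handling on top):

            ```python
            from transformers import AutoModelForCausalLM
            from peft import PeftModel

            base = AutoModelForCausalLM.from_pretrained(
                "moonshotai/Kimi-Linear-48B-A3B-Instruct", trust_remote_code=True
            )
            peft_model = PeftModel.from_pretrained(
                base, "Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned"
            )
            merged = peft_model.merge_and_unload()  # fold the LoRA deltas into the base weights
            merged.save_pretrained("/app/merged_model")
            ```

            The saved folder can then be loaded like any regular `transformers` checkpoint with
            `AutoModelForCausalLM.from_pretrained`.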

            ### Hardware Requirements

            - **Current Setup:** 4x NVIDIA L40S GPUs (48GB VRAM each)
            - **Model Size:** ~48B parameters
            - **Memory Usage:** ~96-120GB VRAM during merge

            ### Tips

            - The merge process can take 10-30 minutes
            - Make sure you have a valid Hugging Face token with appropriate permissions
            - Test the model thoroughly before uploading to the Hub
            - Consider keeping the uploaded model private initially

            ### Troubleshooting

            **Out of Memory Errors:**
            - The model is very large (48B parameters)
            - Try restarting the Space to clear memory

            **Authentication Errors:**
            - Ensure your HF token has read access to the base model
            - For private models, the token must have appropriate permissions

            **Slow Download/Upload:**
            - Large models take time to transfer
            - Network speed affects download/upload times

            ### Support

            For issues or questions, please check:
            - [PEFT Documentation](https://huggingface.co/docs/peft)
            - [Transformers Documentation](https://huggingface.co/docs/transformers)
            """)

    gr.Markdown("""
    ---
    **Note:** This Space requires significant computational resources. Ensure you have appropriate GPU allocation.
    """)

# Launch the app
if __name__ == "__main__":
    demo.queue(max_size=5)
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )