Spaces:

optiviseapp
/

fnmodel

Paused

File size: 24,893 Bytes

import os
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
import gc
from huggingface_hub import login, snapshot_download
import logging
from datetime import datetime
from accelerate import init_empty_weights, load_checkpoint_and_dispatch, infer_auto_device_map

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Check GPU availability
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    logger.info(f"Found {num_gpus} GPUs available")
    for i in range(num_gpus):
        gpu_name = torch.cuda.get_device_name(i)
        gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
        logger.info(f"GPU {i}: {gpu_name} with {gpu_memory:.2f} GB memory")
else:
    logger.warning("No GPUs found! This will likely fail for 48B model.")

# Constants
BASE_MODEL_NAME = "moonshotai/Kimi-Linear-48B-A3B-Instruct"
LORA_MODEL_NAME = "Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned"
OUTPUT_DIR = "/app/merged_model"

class ModelMerger:
    def __init__(self):
        self.base_model = None
        self.tokenizer = None
        self.merged_model = None
        
    def clear_memory(self):
        """Clear GPU memory"""
        if self.base_model is not None:
            del self.base_model
        if self.merged_model is not None:
            del self.merged_model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            # Synchronize all GPUs
            for i in range(torch.cuda.device_count()):
                with torch.cuda.device(i):
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize()
        logger.info("Memory cleared successfully")
        
    def login_huggingface(self, token):
        """Login to Hugging Face"""
        try:
            login(token=token)
            logger.info("Successfully logged in to Hugging Face")
            return "✅ Successfully logged in to Hugging Face"
        except Exception as e:
            logger.error(f"Login failed: {str(e)}")
            return f"❌ Login failed: {str(e)}"
    
    def merge_models(self, hf_token, use_8bit=False, progress=gr.Progress()):
        """Merge LoRA adapters with base model"""
        try:
            # Login to HF
            if hf_token:
                progress(0.05, desc="Logging in to Hugging Face...")
                login(token=hf_token)
                logger.info("Logged in to Hugging Face")
            
            # Clear any existing models from memory
            progress(0.1, desc="Clearing GPU memory...")
            self.clear_memory()
            
            # Load tokenizer
            progress(0.15, desc="Loading tokenizer...")
            logger.info("Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)
            
            # Configure memory allocation for multi-GPU setup
            # Auto-detect GPU memory and adjust accordingly
            num_gpus = torch.cuda.device_count()
            max_memory = {}
            total_vram = 0
            
            if num_gpus > 0:
                # Calculate available memory per GPU
                for i in range(num_gpus):
                    gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
                    total_vram += gpu_memory
                    # Reserve 2-4GB per GPU for overhead
                    per_gpu_memory = f"{int(gpu_memory - 3)}GB"
                    max_memory[i] = per_gpu_memory
                
                logger.info(f"Detected {num_gpus} GPUs with total {total_vram:.1f}GB VRAM")
                logger.info(f"Configured max_memory: {max_memory}")
                
                # Warn if total VRAM is low
                if total_vram < 90 and not use_8bit:
                    logger.warning(f"Only {total_vram:.1f}GB VRAM available. The 48B model needs ~96GB in bfloat16. Consider enabling 8-bit quantization.")
            else:
                # Fallback for CPU-only (will be slow)
                max_memory = {"cpu": "64GB"}
                logger.warning("No GPUs detected, using CPU fallback")
            
            # Load base model with explicit multi-GPU configuration
            progress(0.25, desc="Loading base model (this may take several minutes)...")
            logger.info(f"Loading base model: {BASE_MODEL_NAME}")
            logger.info(f"Note: For merging, we'll use a simpler device_map to avoid key naming issues")
            
            if use_8bit:
                logger.info(f"Using 8-bit quantization for memory efficiency (~50% memory reduction)")
                precision_desc = "int8"
            else:
                logger.info(f"Using bfloat16 precision for memory efficiency")
                precision_desc = "bfloat16"
            
            try:
                # For merging, use sequential device map to avoid complex key nesting
                # This ensures consistent key names between training and merging
                load_kwargs = {
                    "trust_remote_code": True,
                    "low_cpu_mem_usage": True,
                    "device_map": "sequential",  # Changed from "auto" to avoid key nesting issues
                    "max_memory": max_memory,
                }
                
                if use_8bit:
                    # Use 8-bit quantization for tighter memory constraints
                    load_kwargs["load_in_8bit"] = True
                    load_kwargs["llm_int8_enable_fp32_cpu_offload"] = True
                    load_kwargs["llm_int8_threshold"] = 6.0
                    logger.info("Enabling CPU offload for 8-bit quantization")
                else:
                    # Use bfloat16 for best quality when memory allows
                    load_kwargs["torch_dtype"] = torch.bfloat16
                
                self.base_model = AutoModelForCausalLM.from_pretrained(
                    BASE_MODEL_NAME,
                    **load_kwargs
                )
                logger.info(f"Base model loaded successfully in {precision_desc}")
                
                # Log device map to see distribution
                if hasattr(self.base_model, 'hf_device_map'):
                    logger.info(f"Model device map: {self.base_model.hf_device_map}")
                    
            except torch.cuda.OutOfMemoryError as e:
                logger.error("Out of memory error!")
                error_msg = f"GPU Out of Memory: The 48B model requires ~96GB VRAM in bfloat16 or ~48GB in 8-bit.\n"
                error_msg += f"You have {total_vram:.1f}GB VRAM available.\n"
                if not use_8bit:
                    error_msg += "\n💡 **Try enabling 8-bit quantization** to reduce memory usage by ~50%."
                raise Exception(error_msg)
            
            # Load LoRA configuration
            progress(0.50, desc="Loading LoRA adapters...")
            logger.info(f"Loading LoRA adapters from: {LORA_MODEL_NAME}")
            
            # Check if LoRA model exists and is accessible
            try:
                from huggingface_hub import repo_info
                info = repo_info(LORA_MODEL_NAME, token=hf_token)
                logger.info(f"LoRA model found: {info}")
            except Exception as e:
                logger.warning(f"Could not verify LoRA model: {str(e)}")
            
            # Load LoRA adapters with additional parameters
            try:
                logger.info("Attempting to load LoRA adapters...")
                logger.info(f"LoRA targets attention layers: q_proj, k_proj, v_proj, o_proj")
                
                # Load PEFT model - this wraps the base model
                peft_model = PeftModel.from_pretrained(
                    self.base_model,
                    LORA_MODEL_NAME,
                    torch_dtype=torch.bfloat16 if not use_8bit else None,
                    is_trainable=False,
                )
                logger.info("LoRA adapters loaded successfully")
                
                progress(0.70, desc="Merging LoRA weights with base model...")
                logger.info("Merging LoRA weights into base model...")
                
                # Use merge_and_unload with explicit safe merge
                try:
                    self.merged_model = peft_model.merge_and_unload(safe_merge=True)
                    logger.info("Models merged successfully with safe_merge=True")
                except Exception as merge_error:
                    logger.warning(f"safe_merge=True failed, trying without: {str(merge_error)}")
                    # Fallback to regular merge
                    self.merged_model = peft_model.merge_and_unload()
                    logger.info("Models merged successfully")
                
            except KeyError as e:
                # Handle missing keys - might be an architecture mismatch
                error_key = str(e)
                error_msg = f"Key error when loading LoRA adapters: {error_key}\n\n"
                
                if "block_sparse_moe" in error_key or "experts" in error_key:
                    error_msg += "⚠️ This error is related to MoE (Mixture of Experts) layers.\n\n"
                    error_msg += "The LoRA adapters only target attention layers (q/k/v/o_proj),\n"
                    error_msg += "but there seems to be a key naming mismatch with the base model.\n\n"
                    error_msg += "Possible causes:\n"
                    error_msg += "1. The base model version has changed since training\n"
                    error_msg += "2. Different transformers/peft library versions\n"
                    error_msg += "3. Model was saved with different device_map than loading\n\n"
                
                error_msg += "Please verify:\n"
                error_msg += f"- Base model: {BASE_MODEL_NAME}\n"
                error_msg += f"- LoRA model: {LORA_MODEL_NAME}\n"
                error_msg += "- Both use the same transformers version\n"
                logger.error(error_msg)
                raise Exception(error_msg)
            except Exception as e:
                logger.error(f"Unexpected error during merge: {str(e)}", exc_info=True)
                raise
            
            # Save merged model
            progress(0.85, desc="Saving merged model...")
            logger.info(f"Saving merged model to: {OUTPUT_DIR}")
            os.makedirs(OUTPUT_DIR, exist_ok=True)
            
            self.merged_model.save_pretrained(
                OUTPUT_DIR,
                safe_serialization=True,
                max_shard_size="5GB"
            )
            self.tokenizer.save_pretrained(OUTPUT_DIR)
            
            progress(1.0, desc="Complete!")
            logger.info("Merge completed successfully")
            
            # Get model info
            total_params = sum(p.numel() for p in self.merged_model.parameters())
            trainable_params = sum(p.numel() for p in self.merged_model.parameters() if p.requires_grad)
            
            # Get GPU memory usage
            gpu_memory_info = ""
            if torch.cuda.is_available():
                gpu_memory_info = "\n**GPU Memory Usage:**\n"
                for i in range(torch.cuda.device_count()):
                    allocated = torch.cuda.memory_allocated(i) / 1024**3
                    reserved = torch.cuda.memory_reserved(i) / 1024**3
                    total = torch.cuda.get_device_properties(i).total_memory / 1024**3
                    gpu_memory_info += f"- GPU {i}: {allocated:.2f}GB allocated / {reserved:.2f}GB reserved / {total:.2f}GB total\n"
            
            result_message = f"""
✅ **Merge Completed Successfully!**

**Model Information:**
- Base Model: `{BASE_MODEL_NAME}`
- LoRA Adapters: `{LORA_MODEL_NAME}`
- Output Directory: `{OUTPUT_DIR}`
- Total Parameters: {total_params:,}
- Trainable Parameters: {trainable_params:,}
- Model Size (bfloat16): ~{(total_params * 2) / 1024**3:.2f} GB
- Timestamp: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
{gpu_memory_info}
**Next Steps:**
1. The merged model is saved in the container at `/app/merged_model`
2. You can now test the model using the inference tab
3. To upload to Hugging Face, use the upload section
"""
            
            return result_message
            
        except Exception as e:
            logger.error(f"Error during merge: {str(e)}", exc_info=True)
            self.clear_memory()
            return f"❌ **Error during merge:**\n\n{str(e)}\n\nPlease check the logs for more details."
    
    def test_inference(self, prompt, max_length, temperature, top_p, progress=gr.Progress()):
        """Test the merged model with a prompt"""
        try:
            if self.merged_model is None:
                return "❌ Please merge the models first before testing inference."
            
            progress(0.3, desc="Tokenizing input...")
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.merged_model.device)
            
            progress(0.5, desc="Generating response...")
            with torch.no_grad():
                outputs = self.merged_model.generate(
                    **inputs,
                    max_length=max_length,
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                )
            
            progress(0.9, desc="Decoding output...")
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            
            progress(1.0, desc="Complete!")
            return response
            
        except Exception as e:
            logger.error(f"Error during inference: {str(e)}", exc_info=True)
            return f"❌ **Error during inference:**\n\n{str(e)}"
    
    def upload_to_hub(self, repo_name, hf_token, private, progress=gr.Progress()):
        """Upload merged model to Hugging Face Hub"""
        try:
            if self.merged_model is None:
                return "❌ Please merge the models first before uploading."
            
            if not repo_name:
                return "❌ Please provide a repository name."
            
            if not hf_token:
                return "❌ Please provide a Hugging Face token."
            
            progress(0.1, desc="Logging in...")
            login(token=hf_token)
            
            progress(0.3, desc="Uploading model to Hugging Face Hub...")
            logger.info(f"Uploading to: {repo_name}")
            
            self.merged_model.push_to_hub(
                repo_name,
                private=private,
                safe_serialization=True,
                max_shard_size="5GB"
            )
            
            progress(0.8, desc="Uploading tokenizer...")
            self.tokenizer.push_to_hub(repo_name, private=private)
            
            progress(1.0, desc="Complete!")
            logger.info("Upload completed successfully")
            
            repo_url = f"https://huggingface.co/{repo_name}"
            return f"✅ **Successfully uploaded to Hugging Face Hub!**\n\nRepository: [{repo_name}]({repo_url})"
            
        except Exception as e:
            logger.error(f"Error during upload: {str(e)}", exc_info=True)
            return f"❌ **Error during upload:**\n\n{str(e)}"

# Initialize merger
merger = ModelMerger()

# Get GPU info for display
def get_gpu_info():
    if not torch.cuda.is_available():
        return "⚠️ **No GPUs detected!** This Space requires GPUs to run."
    
    gpu_info = f"✅ **{torch.cuda.device_count()} GPU(s) detected:**\n\n"
    total_memory = 0
    for i in range(torch.cuda.device_count()):
        name = torch.cuda.get_device_name(i)
        memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
        total_memory += memory
        gpu_info += f"- GPU {i}: {name} ({memory:.1f} GB)\n"
    gpu_info += f"\n**Total VRAM:** {total_memory:.1f} GB"
    return gpu_info

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), title="LoRA Model Merger") as demo:
    gr.Markdown("""
    # 🔗 LoRA Model Merger
    
    Merge your fine-tuned LoRA adapters with the base model for the **Kimi-Linear-48B-A3B-Instruct** model.
    
    **Models:**
    - **Base Model:** `moonshotai/Kimi-Linear-48B-A3B-Instruct`
    - **LoRA Adapters:** `Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned`
    """)
    
    # Display GPU info
    gr.Markdown(get_gpu_info())
    
    with gr.Tabs():
        # Tab 1: Merge Models
        with gr.Tab("🔄 Merge Models"):
            gr.Markdown("""
            ### Step 1: Merge LoRA Adapters with Base Model
            
            This process will:
            1. Download the base model and LoRA adapters
            2. Merge the LoRA weights into the base model
            3. Save the merged model for inference
            
            ⚠️ **Important Notes:**
            - This process may take 10-30 minutes depending on model size and network speed
            - The 48B parameter model requires **~96GB VRAM** in bfloat16 precision
            - Recommended: 4x L40S GPUs (192GB total VRAM) for comfortable operation
            - The model will be automatically distributed across all available GPUs
            """)
            
            with gr.Row():
                hf_token_merge = gr.Textbox(
                    label="Hugging Face Token",
                    placeholder="hf_...",
                    type="password",
                    info="Required for accessing private models or avoiding rate limits"
                )
            
            with gr.Row():
                use_8bit_checkbox = gr.Checkbox(
                    label="Use 8-bit Quantization",
                    value=False,
                    info="Enable this if you have limited GPU memory (<96GB total). Reduces memory usage by ~50% with minimal quality loss."
                )
            
            merge_button = gr.Button("🚀 Start Merge Process", variant="primary", size="lg")
            merge_output = gr.Markdown(label="Merge Status")
            
            merge_button.click(
                fn=merger.merge_models,
                inputs=[hf_token_merge, use_8bit_checkbox],
                outputs=merge_output
            )
        
        # Tab 2: Test Inference
        with gr.Tab("🧪 Test Inference"):
            gr.Markdown("""
            ### Step 2: Test the Merged Model
            
            Test the merged model with custom prompts to verify it's working correctly.
            """)
            
            with gr.Row():
                with gr.Column():
                    test_prompt = gr.Textbox(
                        label="Test Prompt",
                        placeholder="Enter your test prompt here...",
                        lines=5,
                        value="Hello, how are you today?"
                    )
                    
                    with gr.Row():
                        max_length = gr.Slider(
                            minimum=50,
                            maximum=2048,
                            value=512,
                            step=1,
                            label="Max Length"
                        )
                        temperature = gr.Slider(
                            minimum=0.1,
                            maximum=2.0,
                            value=0.7,
                            step=0.1,
                            label="Temperature"
                        )
                        top_p = gr.Slider(
                            minimum=0.1,
                            maximum=1.0,
                            value=0.9,
                            step=0.05,
                            label="Top P"
                        )
                    
                    test_button = gr.Button("🎯 Generate", variant="primary")
                
                with gr.Column():
                    test_output = gr.Textbox(
                        label="Model Output",
                        lines=15,
                        interactive=False
                    )
            
            test_button.click(
                fn=merger.test_inference,
                inputs=[test_prompt, max_length, temperature, top_p],
                outputs=test_output
            )
        
        # Tab 3: Upload to Hub
        with gr.Tab("☁️ Upload to Hub"):
            gr.Markdown("""
            ### Step 3: Upload Merged Model to Hugging Face Hub
            
            Upload your merged model to Hugging Face Hub for easy sharing and deployment.
            """)
            
            with gr.Row():
                with gr.Column():
                    repo_name = gr.Textbox(
                        label="Repository Name",
                        placeholder="username/model-name",
                        info="Format: username/model-name"
                    )
                    hf_token_upload = gr.Textbox(
                        label="Hugging Face Token (with write access)",
                        placeholder="hf_...",
                        type="password",
                        info="Token must have write permissions"
                    )
                    private_repo = gr.Checkbox(
                        label="Private Repository",
                        value=True,
                        info="Keep the model private"
                    )
                    upload_button = gr.Button("📤 Upload to Hub", variant="primary", size="lg")
                
                with gr.Column():
                    upload_output = gr.Markdown(label="Upload Status")
            
            upload_button.click(
                fn=merger.upload_to_hub,
                inputs=[repo_name, hf_token_upload, private_repo],
                outputs=upload_output
            )
        
        # Tab 4: Info & Help
        with gr.Tab("ℹ️ Info & Help"):
            gr.Markdown("""
            ## About This Space
            
            This Space allows you to merge LoRA (Low-Rank Adaptation) fine-tuned models with their base models.
            
            ### What is LoRA Merging?
            
            LoRA is a parameter-efficient fine-tuning technique that adds small adapter layers to a pretrained model. 
            To use the fine-tuned model without the PEFT library overhead, you can merge these adapters back into 
            the base model, creating a single unified model.
            
            ### Process Overview
            
            1. **Merge:** Combines the LoRA adapters with the base model
            2. **Test:** Verify the merged model works correctly with inference
            3. **Upload:** Share your merged model on Hugging Face Hub
            
            ### Hardware Requirements
            
            - **Current Setup:** 4x NVIDIA L40S GPUs (48GB VRAM each)
            - **Model Size:** ~48B parameters
            - **Memory Usage:** ~96-120GB VRAM during merge
            
            ### Tips
            
            - The merge process can take 10-30 minutes
            - Make sure you have a valid Hugging Face token with appropriate permissions
            - Test the model thoroughly before uploading to Hub
            - Consider keeping the uploaded model private initially
            
            ### Troubleshooting
            
            **Out of Memory Errors:**
            - The model is very large (48B parameters)
            - Try restarting the Space to clear memory
            
            **Authentication Errors:**
            - Ensure your HF token has read access to the base model
            - For private models, token must have appropriate permissions
            
            **Slow Download/Upload:**
            - Large models take time to transfer
            - Network speed affects download/upload times
            
            ### Support
            
            For issues or questions, please check:
            - [PEFT Documentation](https://huggingface.co/docs/peft)
            - [Transformers Documentation](https://huggingface.co/docs/transformers)
            """)
    
    gr.Markdown("""
    ---
    **Note:** This Space requires significant computational resources. Ensure you have appropriate GPU allocation.
    """)

# Launch the app
if __name__ == "__main__":
    demo.queue(max_size=5)
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )