# Source: Hugging Face Space "optiviseapp/fnmodel" (status: Paused).
# File size: 25,833 bytes

import os
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
from safetensors.torch import load_file
import gc
from huggingface_hub import login, snapshot_download
import logging
from datetime import datetime
from accelerate import init_empty_weights, load_checkpoint_and_dispatch, infer_auto_device_map

# Logging: timestamped, level-tagged messages at INFO and above
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Report the visible CUDA devices once, at import time
if not torch.cuda.is_available():
    logger.warning("No GPUs found! This will likely fail for 48B model.")
else:
    num_gpus = torch.cuda.device_count()
    logger.info(f"Found {num_gpus} GPUs available")
    for i in range(num_gpus):
        gpu_name = torch.cuda.get_device_name(i)
        # total_memory is in bytes; convert to GiB for the log line
        gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
        logger.info(f"GPU {i}: {gpu_name} with {gpu_memory:.2f} GB memory")

# Hub repositories to merge, and the local directory the result is saved to
BASE_MODEL_NAME = "moonshotai/Kimi-Linear-48B-A3B-Instruct"
LORA_MODEL_NAME = "Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned"
OUTPUT_DIR = "/app/merged_model"

class ModelMerger:
    """Load the base model, fold the LoRA adapter into it, and expose the result.

    One instance keeps the loaded model and tokenizer between calls so that
    merging, test inference, and upload can run as separate actions.
    ``merge_models``, ``test_inference`` and ``upload_to_hub`` catch
    exceptions and return a human-readable status string.
    """

    def __init__(self):
        # Model returned by AutoModelForCausalLM (None until a merge loads it)
        self.base_model = None
        # Tokenizer for BASE_MODEL_NAME (None until a merge loads it)
        self.tokenizer = None
        # Model after the LoRA deltas were added; this is the same object as
        # base_model, because the merge modifies the weights in place
        self.merged_model = None

    def clear_memory(self):
        """Drop the held model references and release cached GPU memory."""
        # Reset to None instead of `del`-ing the attributes: deleting them
        # made every later `self.base_model is not None` / `self.merged_model
        # is None` check raise AttributeError, which broke the instance after
        # the first cleanup.
        self.base_model = None
        self.merged_model = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            # Empty the cache and wait for pending work on every device
            for i in range(torch.cuda.device_count()):
                with torch.cuda.device(i):
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize()
        logger.info("Memory cleared successfully")

    def login_huggingface(self, token):
        """Log in to Hugging Face with ``token`` and return a status string."""
        try:
            login(token=token)
            logger.info("Successfully logged in to Hugging Face")
            return "✅ Successfully logged in to Hugging Face"
        except Exception as e:
            logger.error(f"Login failed: {str(e)}")
            return f"❌ Login failed: {str(e)}"

    @staticmethod
    def _resolve_weight_key(module_path, state_dict):
        """Find the state-dict key of the weight that ``module_path`` refers to.

        Args:
            module_path: Dotted module path taken from an adapter key, with
                the ``base_model.model.`` prefix already removed.
            state_dict: Mapping whose keys are the model's parameter names.

        Returns:
            The matching key, or ``None`` when there is no match or the
            match is ambiguous.
        """
        exact = module_path + ".weight"
        if exact in state_dict:
            return exact

        # Fallback for prefix mismatches between adapter and model: compare
        # whole dotted components only (so layer "0" never matches "10"), and
        # keep at least two components so a bare "q_proj" cannot match.
        parts = module_path.split(".")
        last_start = max(len(parts) - 2, 0)
        for start in range(last_start + 1):
            tail = ".".join(parts[start:]) + ".weight"
            dotted_tail = "." + tail
            matches = [k for k in state_dict if k == tail or k.endswith(dotted_tail)]
            if len(matches) == 1:
                return matches[0]
            if matches:
                # Shorter tails can only match more keys, so give up here
                return None
        return None

    def manual_merge_lora(self, model, adapter_path, progress=gr.Progress()):
        """Add the LoRA deltas from ``adapter_path`` to ``model``'s weights in place.

        For every ``lora_A``/``lora_B`` pair in ``adapter_model.safetensors``
        the update ``W += (B @ A) * scaling`` is applied to the matching
        ``<module>.weight`` tensor of ``model``.

        Args:
            model: The loaded base model; modified in place.
            adapter_path: Directory containing ``adapter_model.safetensors``
                and ``adapter_config.json``.
            progress: Progress callback used to report status.

        Returns:
            The same ``model`` object, with merged weights.

        Raises:
            RuntimeError: If not a single LoRA pair could be merged.
        """
        import json
        import math

        logger.info("Using manual LoRA merge to avoid key naming conflicts...")
        progress(0.55, desc="Loading LoRA adapter weights...")

        # Load adapter weights
        adapter_file = os.path.join(adapter_path, "adapter_model.safetensors")
        adapter_weights = load_file(adapter_file)
        logger.info(f"Loaded {len(adapter_weights)} adapter weight tensors")

        # Load adapter config
        config_file = os.path.join(adapter_path, "adapter_config.json")
        with open(config_file, encoding="utf-8") as f:
            adapter_config = json.load(f)

        lora_alpha = adapter_config["lora_alpha"]
        r = adapter_config["r"]
        # Rank-stabilised LoRA scales by alpha / sqrt(r) instead of alpha / r
        if adapter_config.get("use_rslora"):
            scaling = lora_alpha / math.sqrt(r)
        else:
            scaling = lora_alpha / r
        logger.info(f"LoRA scaling: {scaling} (alpha={lora_alpha}, r={r})")
        if adapter_config.get("rank_pattern") or adapter_config.get("alpha_pattern"):
            logger.warning(
                "Adapter config defines rank_pattern/alpha_pattern; "
                "the single global scaling used here may be wrong for those layers"
            )

        # Group LoRA A and B weights by the module they adapt
        a_suffix = ".lora_A.weight"
        lora_pairs = {}
        for key, tensor in adapter_weights.items():
            if not key.endswith(a_suffix):
                continue
            base_key = key[: -len(a_suffix)]
            lora_pairs[base_key] = {
                "A": tensor,
                "B": adapter_weights.get(base_key + ".lora_B.weight"),
            }

        logger.info(f"Found {len(lora_pairs)} LoRA pairs to merge")

        progress(0.65, desc=f"Merging {len(lora_pairs)} LoRA layers...")

        # state_dict() tensors share storage with the model's parameters, so
        # in-place updates below change the model directly.
        model_state_dict = model.state_dict()
        merged_count = 0
        peft_prefix = "base_model.model."

        with torch.no_grad():
            for adapter_key, lora_weights in lora_pairs.items():
                # adapter_key: base_model.model.model.layers.0.self_attn.q_proj
                if adapter_key.startswith(peft_prefix):
                    search_key = adapter_key[len(peft_prefix):]
                else:
                    search_key = adapter_key

                if lora_weights["B"] is None:
                    logger.warning(f"Skipping {adapter_key}: no matching lora_B weight")
                    continue

                model_key = self._resolve_weight_key(search_key, model_state_dict)
                if model_key is None:
                    logger.warning(f"Skipping {adapter_key}: no unique matching model weight")
                    continue

                weight = model_state_dict[model_key]
                lora_A = lora_weights["A"].to(weight.device)
                lora_B = lora_weights["B"].to(weight.device)

                # Merge: W_new = W_old + (lora_B @ lora_A) * scaling
                delta_W = (lora_B @ lora_A) * scaling
                if delta_W.shape != weight.shape:
                    logger.warning(
                        f"Skipping {adapter_key}: delta shape {tuple(delta_W.shape)} "
                        f"does not match {model_key} shape {tuple(weight.shape)}"
                    )
                    continue

                weight.add_(delta_W.to(weight.dtype))
                merged_count += 1

        logger.info(f"Successfully merged {merged_count}/{len(lora_pairs)} LoRA weights")

        if merged_count == 0:
            # Returning here would let the caller save an unmodified base
            # model and report it as merged.
            raise RuntimeError(
                "No LoRA weights could be matched to the base model; nothing was merged"
            )
        if merged_count < len(lora_pairs):
            logger.warning(
                f"{len(lora_pairs) - merged_count} LoRA pairs were skipped; see warnings above"
            )

        return model

    def _plan_max_memory(self):
        """Build the ``max_memory`` map passed to ``from_pretrained``.

        Returns:
            A ``(max_memory, total_vram)`` tuple. ``max_memory`` maps each GPU
            index to a string such as ``"45GB"`` (device total minus a 3 GB
            reserve, never below 1 GB), or is ``{"cpu": "64GB"}`` when no GPU
            is visible. ``total_vram`` is the summed device memory in GiB.
        """
        num_gpus = torch.cuda.device_count()
        if num_gpus == 0:
            # Fallback for CPU-only (will be slow)
            logger.warning("No GPUs detected, using CPU fallback")
            return {"cpu": "64GB"}, 0

        max_memory = {}
        total_vram = 0
        for i in range(num_gpus):
            gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
            total_vram += gpu_memory
            # Reserve ~3GB per GPU for overhead; clamp so small devices do
            # not end up with a "0GB" or negative budget
            max_memory[i] = f"{max(1, int(gpu_memory - 3))}GB"

        logger.info(f"Detected {num_gpus} GPUs with total {total_vram:.1f}GB VRAM")
        logger.info(f"Configured max_memory: {max_memory}")

        if total_vram < 90:
            logger.warning(
                f"Only {total_vram:.1f}GB VRAM available. "
                "The 48B model needs ~96GB in bfloat16."
            )
        return max_memory, total_vram

    @staticmethod
    def _gpu_memory_report():
        """Return a Markdown list of per-GPU memory usage, or ``""`` without CUDA."""
        if not torch.cuda.is_available():
            return ""
        lines = ["\n**GPU Memory Usage:**\n"]
        for i in range(torch.cuda.device_count()):
            allocated = torch.cuda.memory_allocated(i) / 1024**3
            reserved = torch.cuda.memory_reserved(i) / 1024**3
            total = torch.cuda.get_device_properties(i).total_memory / 1024**3
            lines.append(
                f"- GPU {i}: {allocated:.2f}GB allocated / {reserved:.2f}GB reserved / {total:.2f}GB total\n"
            )
        return "".join(lines)

    def merge_models(self, hf_token, use_8bit=False, progress=gr.Progress()):
        """Load the base model, merge the LoRA adapter, and save the result.

        Args:
            hf_token: Hugging Face token; when non-empty it is used to log in
                and to download the adapter.
            use_8bit: Accepted for interface compatibility. The base model is
                always loaded in bfloat16; when this flag is set a warning is
                logged saying so.
            progress: Progress callback used to report status.

        Returns:
            A Markdown status message: a summary on success, or an error
            description if any step raised.
        """
        try:
            # Login to HF
            if hf_token:
                progress(0.05, desc="Logging in to Hugging Face...")
                login(token=hf_token)
                logger.info("Logged in to Hugging Face")

            # Clear any existing models from memory
            progress(0.1, desc="Clearing GPU memory...")
            self.clear_memory()

            # Load tokenizer
            progress(0.15, desc="Loading tokenizer...")
            logger.info("Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)

            # Configure memory allocation for the detected devices
            max_memory, total_vram = self._plan_max_memory()

            # Load base model with explicit multi-GPU configuration
            progress(0.25, desc="Loading base model (this may take several minutes)...")
            logger.info(f"Loading base model: {BASE_MODEL_NAME}")
            logger.info("Note: For merging, we'll use a simpler device_map to avoid key naming issues")

            # The load below is always bfloat16. Previously the 8-bit branch
            # only changed the log text, so the logs claimed int8 while the
            # model was loaded in bfloat16.
            precision_desc = "bfloat16"
            if use_8bit:
                logger.warning(
                    "8-bit quantization was requested but is not applied: the merge adds "
                    "LoRA deltas to full-precision weights, so the base model is loaded in bfloat16."
                )
            else:
                logger.info("Using bfloat16 precision for memory efficiency")

            try:
                # Try loading with balanced device map to distribute evenly
                load_kwargs = {
                    "trust_remote_code": True,
                    "low_cpu_mem_usage": True,
                    "device_map": "balanced",  # Distribute layers evenly across GPUs
                    "max_memory": max_memory,
                    "torch_dtype": torch.bfloat16,
                }

                logger.info("Loading base model with balanced device map...")

                self.base_model = AutoModelForCausalLM.from_pretrained(
                    BASE_MODEL_NAME,
                    **load_kwargs
                )
                logger.info(f"Base model loaded successfully in {precision_desc}")

                # Log device map to see distribution
                if hasattr(self.base_model, 'hf_device_map'):
                    logger.info(f"Model device map: {self.base_model.hf_device_map}")

            except torch.cuda.OutOfMemoryError as e:
                logger.error("Out of memory error!")
                error_msg = "GPU Out of Memory: The 48B model requires ~96GB VRAM in bfloat16.\n"
                error_msg += f"You have {total_vram:.1f}GB VRAM available.\n"
                error_msg += "\n💡 8-bit loading is not applied during merging; more GPU memory is required."
                raise RuntimeError(error_msg) from e

            # Download LoRA adapters
            progress(0.50, desc="Downloading LoRA adapters...")
            logger.info(f"Downloading LoRA adapters from: {LORA_MODEL_NAME}")

            # Download entire adapter folder; an empty token string is
            # normalised to None so it is not sent as a credential
            adapter_path = snapshot_download(
                repo_id=LORA_MODEL_NAME,
                token=hf_token or None,
                allow_patterns=["adapter_*", "*.json"]
            )
            logger.info(f"LoRA adapters downloaded to: {adapter_path}")

            # Use manual merge to avoid PEFT key naming issues
            progress(0.55, desc="Merging LoRA weights (manual merge)...")
            logger.info("Using manual LoRA merge to avoid key naming conflicts with PEFT")

            try:
                self.merged_model = self.manual_merge_lora(self.base_model, adapter_path, progress)
                logger.info("✅ LoRA weights merged successfully using manual method")

            except Exception as merge_error:
                logger.error(f"Manual merge failed: {str(merge_error)}", exc_info=True)
                error_msg = f"Failed to merge LoRA adapters: {str(merge_error)}\n\n"
                error_msg += "This could be due to:\n"
                error_msg += "1. Incompatible model architectures\n"
                error_msg += "2. Corrupted adapter files\n"
                error_msg += "3. Memory issues during merge\n"
                raise RuntimeError(error_msg) from merge_error

            # Save merged model
            progress(0.85, desc="Saving merged model...")
            logger.info(f"Saving merged model to: {OUTPUT_DIR}")
            os.makedirs(OUTPUT_DIR, exist_ok=True)

            self.merged_model.save_pretrained(
                OUTPUT_DIR,
                safe_serialization=True,
                max_shard_size="5GB"
            )
            self.tokenizer.save_pretrained(OUTPUT_DIR)

            progress(1.0, desc="Complete!")
            logger.info("Merge completed successfully")

            # Get model info
            total_params = sum(p.numel() for p in self.merged_model.parameters())
            trainable_params = sum(p.numel() for p in self.merged_model.parameters() if p.requires_grad)

            # Get GPU memory usage
            gpu_memory_info = self._gpu_memory_report()

            result_message = f"""
✅ **Merge Completed Successfully!**

**Model Information:**
- Base Model: `{BASE_MODEL_NAME}`
- LoRA Adapters: `{LORA_MODEL_NAME}`
- Output Directory: `{OUTPUT_DIR}`
- Total Parameters: {total_params:,}
- Trainable Parameters: {trainable_params:,}
- Model Size (bfloat16): ~{(total_params * 2) / 1024**3:.2f} GB
- Timestamp: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
{gpu_memory_info}
**Next Steps:**
1. The merged model is saved in the container at `{OUTPUT_DIR}`
2. You can now test the model using the inference tab
3. To upload to Hugging Face, use the upload section
"""

            return result_message

        except Exception as e:
            logger.error(f"Error during merge: {str(e)}", exc_info=True)
            # Release whatever was loaded before the failure
            self.clear_memory()
            return f"❌ **Error during merge:**\n\n{str(e)}\n\nPlease check the logs for more details."

    def test_inference(self, prompt, max_length, temperature, top_p, progress=gr.Progress()):
        """Generate text from ``prompt`` with the merged model.

        Args:
            prompt: Text to feed to the model.
            max_length: Passed to ``generate`` as ``max_length``.
            temperature: Sampling temperature.
            top_p: Nucleus-sampling threshold.
            progress: Progress callback used to report status.

        Returns:
            The decoded output of the first generated sequence, or an error
            message if no model is merged yet or generation failed.
        """
        try:
            if self.merged_model is None:
                return "❌ Please merge the models first before testing inference."

            progress(0.3, desc="Tokenizing input...")
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.merged_model.device)

            progress(0.5, desc="Generating response...")
            with torch.no_grad():
                outputs = self.merged_model.generate(
                    **inputs,
                    # Cast defensively in case the UI delivers a float
                    max_length=int(max_length),
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                )

            progress(0.9, desc="Decoding output...")
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            progress(1.0, desc="Complete!")
            return response

        except Exception as e:
            logger.error(f"Error during inference: {str(e)}", exc_info=True)
            return f"❌ **Error during inference:**\n\n{str(e)}"

    def upload_to_hub(self, repo_name, hf_token, private, progress=gr.Progress()):
        """Push the merged model and tokenizer to the Hugging Face Hub.

        Args:
            repo_name: Target repository, e.g. ``username/model-name``.
            hf_token: Token used to log in before pushing.
            private: Whether the repository should be private.
            progress: Progress callback used to report status.

        Returns:
            A Markdown status message describing success or the failure.
        """
        try:
            if self.merged_model is None:
                return "❌ Please merge the models first before uploading."

            if not repo_name:
                return "❌ Please provide a repository name."

            if not hf_token:
                return "❌ Please provide a Hugging Face token."

            progress(0.1, desc="Logging in...")
            login(token=hf_token)

            progress(0.3, desc="Uploading model to Hugging Face Hub...")
            logger.info(f"Uploading to: {repo_name}")

            self.merged_model.push_to_hub(
                repo_name,
                private=private,
                safe_serialization=True,
                max_shard_size="5GB"
            )

            progress(0.8, desc="Uploading tokenizer...")
            self.tokenizer.push_to_hub(repo_name, private=private)

            progress(1.0, desc="Complete!")
            logger.info("Upload completed successfully")

            repo_url = f"https://huggingface.co/{repo_name}"
            return f"✅ **Successfully uploaded to Hugging Face Hub!**\n\nRepository: [{repo_name}]({repo_url})"

        except Exception as e:
            logger.error(f"Error during upload: {str(e)}", exc_info=True)
            return f"❌ **Error during upload:**\n\n{str(e)}"

# Single module-level ModelMerger; its bound methods are registered as the
# Gradio click handlers further down, so all UI actions operate on this one
# instance and the model/tokenizer it holds.
merger = ModelMerger()

# Get GPU info for display
def get_gpu_info():
    """Return a Markdown summary of the CUDA devices visible to this process.

    The summary lists each device's name and total memory in GiB followed by
    the combined total; when CUDA is unavailable a warning line is returned
    instead.
    """
    if torch.cuda.is_available():
        device_count = torch.cuda.device_count()
        pieces = [f"✅ **{device_count} GPU(s) detected:**\n\n"]
        vram_sum = 0
        for idx in range(device_count):
            label = torch.cuda.get_device_name(idx)
            # total_memory is reported in bytes
            size_gb = torch.cuda.get_device_properties(idx).total_memory / 1024**3
            vram_sum += size_gb
            pieces.append(f"- GPU {idx}: {label} ({size_gb:.1f} GB)\n")
        pieces.append(f"\n**Total VRAM:** {vram_sum:.1f} GB")
        return "".join(pieces)

    return "⚠️ **No GPUs detected!** This Space requires GPUs to run."

# Create Gradio interface
# Components are laid out in creation order inside the nested context
# managers, so the statement order below defines the page layout.
with gr.Blocks(theme=gr.themes.Soft(), title="LoRA Model Merger") as demo:
    gr.Markdown("""
    # 🔗 LoRA Model Merger
    
    Merge your fine-tuned LoRA adapters with the base model for the **Kimi-Linear-48B-A3B-Instruct** model.
    
    **Models:**
    - **Base Model:** `moonshotai/Kimi-Linear-48B-A3B-Instruct`
    - **LoRA Adapters:** `Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned`
    """)
    
    # Display GPU info (evaluated once, when the interface is built)
    gr.Markdown(get_gpu_info())
    
    with gr.Tabs():
        # Tab 1: Merge Models
        with gr.Tab("🔄 Merge Models"):
            gr.Markdown("""
            ### Step 1: Merge LoRA Adapters with Base Model
            
            This process will:
            1. Download the base model and LoRA adapters
            2. Merge the LoRA weights into the base model
            3. Save the merged model for inference
            
            ⚠️ **Important Notes:**
            - This process may take 10-30 minutes depending on model size and network speed
            - The 48B parameter model requires **~96GB VRAM** in bfloat16 precision
            - Recommended: 4x L40S GPUs (192GB total VRAM) for comfortable operation
            - The model will be automatically distributed across all available GPUs
            """)
            
            with gr.Row():
                hf_token_merge = gr.Textbox(
                    label="Hugging Face Token",
                    placeholder="hf_...",
                    type="password",
                    info="Required for accessing private models or avoiding rate limits"
                )
            
            with gr.Row():
                # The merge handler always loads the base model in bfloat16;
                # this flag does not change how the model is loaded, so the
                # help text must not promise a memory reduction.
                use_8bit_checkbox = gr.Checkbox(
                    label="Use 8-bit Quantization",
                    value=False,
                    info="Currently has no effect: the base model is always loaded in bfloat16, because the merge adds the LoRA deltas directly to the full-precision weights."
                )
            
            merge_button = gr.Button("🚀 Start Merge Process", variant="primary", size="lg")
            merge_output = gr.Markdown(label="Merge Status")
            
            merge_button.click(
                fn=merger.merge_models,
                inputs=[hf_token_merge, use_8bit_checkbox],
                outputs=merge_output
            )
        
        # Tab 2: Test Inference
        with gr.Tab("🧪 Test Inference"):
            gr.Markdown("""
            ### Step 2: Test the Merged Model
            
            Test the merged model with custom prompts to verify it's working correctly.
            """)
            
            with gr.Row():
                with gr.Column():
                    test_prompt = gr.Textbox(
                        label="Test Prompt",
                        placeholder="Enter your test prompt here...",
                        lines=5,
                        value="Hello, how are you today?"
                    )
                    
                    with gr.Row():
                        max_length = gr.Slider(
                            minimum=50,
                            maximum=2048,
                            value=512,
                            step=1,
                            label="Max Length"
                        )
                        temperature = gr.Slider(
                            minimum=0.1,
                            maximum=2.0,
                            value=0.7,
                            step=0.1,
                            label="Temperature"
                        )
                        top_p = gr.Slider(
                            minimum=0.1,
                            maximum=1.0,
                            value=0.9,
                            step=0.05,
                            label="Top P"
                        )
                    
                    test_button = gr.Button("🎯 Generate", variant="primary")
                
                with gr.Column():
                    test_output = gr.Textbox(
                        label="Model Output",
                        lines=15,
                        interactive=False
                    )
            
            test_button.click(
                fn=merger.test_inference,
                inputs=[test_prompt, max_length, temperature, top_p],
                outputs=test_output
            )
        
        # Tab 3: Upload to Hub
        with gr.Tab("☁️ Upload to Hub"):
            gr.Markdown("""
            ### Step 3: Upload Merged Model to Hugging Face Hub
            
            Upload your merged model to Hugging Face Hub for easy sharing and deployment.
            """)
            
            with gr.Row():
                with gr.Column():
                    repo_name = gr.Textbox(
                        label="Repository Name",
                        placeholder="username/model-name",
                        info="Format: username/model-name"
                    )
                    hf_token_upload = gr.Textbox(
                        label="Hugging Face Token (with write access)",
                        placeholder="hf_...",
                        type="password",
                        info="Token must have write permissions"
                    )
                    private_repo = gr.Checkbox(
                        label="Private Repository",
                        value=True,
                        info="Keep the model private"
                    )
                    upload_button = gr.Button("📤 Upload to Hub", variant="primary", size="lg")
                
                with gr.Column():
                    upload_output = gr.Markdown(label="Upload Status")
            
            upload_button.click(
                fn=merger.upload_to_hub,
                inputs=[repo_name, hf_token_upload, private_repo],
                outputs=upload_output
            )
        
        # Tab 4: Info & Help
        with gr.Tab("ℹ️ Info & Help"):
            gr.Markdown("""
            ## About This Space
            
            This Space allows you to merge LoRA (Low-Rank Adaptation) fine-tuned models with their base models.
            
            ### What is LoRA Merging?
            
            LoRA is a parameter-efficient fine-tuning technique that adds small adapter layers to a pretrained model. 
            To use the fine-tuned model without the PEFT library overhead, you can merge these adapters back into 
            the base model, creating a single unified model.
            
            ### Process Overview
            
            1. **Merge:** Combines the LoRA adapters with the base model
            2. **Test:** Verify the merged model works correctly with inference
            3. **Upload:** Share your merged model on Hugging Face Hub
            
            ### Hardware Requirements
            
            - **Current Setup:** 4x NVIDIA L40S GPUs (48GB VRAM each)
            - **Model Size:** ~48B parameters
            - **Memory Usage:** ~96-120GB VRAM during merge
            
            ### Tips
            
            - The merge process can take 10-30 minutes
            - Make sure you have a valid Hugging Face token with appropriate permissions
            - Test the model thoroughly before uploading to Hub
            - Consider keeping the uploaded model private initially
            
            ### Troubleshooting
            
            **Out of Memory Errors:**
            - The model is very large (48B parameters)
            - Try restarting the Space to clear memory
            
            **Authentication Errors:**
            - Ensure your HF token has read access to the base model
            - For private models, token must have appropriate permissions
            
            **Slow Download/Upload:**
            - Large models take time to transfer
            - Network speed affects download/upload times
            
            ### Support
            
            For issues or questions, please check:
            - [PEFT Documentation](https://huggingface.co/docs/peft)
            - [Transformers Documentation](https://huggingface.co/docs/transformers)
            """)
    
    gr.Markdown("""
    ---
    **Note:** This Space requires significant computational resources. Ensure you have appropriate GPU allocation.
    """)

# Launch the app
if __name__ == "__main__":
    # Serve on all interfaces at port 7860 without a public share link, and
    # surface handler errors in the UI
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    # Enable the request queue (max_size=5) before starting the server
    demo.queue(max_size=5)
    demo.launch(**launch_options)