Spaces:

optiviseapp
/

fnmodel

Paused

App Files Files Community

aeb56 committed on Nov 10

Commit

b51ac87

1 Parent(s): 9bb160e

Optimize app.py for 48B model on 4xL40S GPUs with multi-GPU support

Browse files

Files changed (1) hide show

app.py +93 -14

app.py CHANGED Viewed

@@ -7,11 +7,23 @@ import gc
 from huggingface_hub import login, snapshot_download
 import logging
 from datetime import datetime
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 # Constants
 BASE_MODEL_NAME = "moonshotai/Kimi-Linear-48B-A3B-Instruct"
 LORA_MODEL_NAME = "Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned"
@@ -30,7 +42,14 @@ class ModelMerger:
         if self.merged_model is not None:
             del self.merged_model
         gc.collect()
-        torch.cuda.empty_cache()
     def login_huggingface(self, token):
         """Login to Hugging Face"""
@@ -60,17 +79,46 @@ class ModelMerger:
             logger.info("Loading tokenizer...")
             self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)
-            # Load base model
             progress(0.25, desc="Loading base model (this may take several minutes)...")
             logger.info(f"Loading base model: {BASE_MODEL_NAME}")
-            self.base_model = AutoModelForCausalLM.from_pretrained(
-                BASE_MODEL_NAME,
-                torch_dtype=torch.bfloat16,
-                device_map="auto",
-                trust_remote_code=True,
-                low_cpu_mem_usage=True,
-            )
-            logger.info("Base model loaded successfully")
             # Load LoRA configuration
             progress(0.50, desc="Loading LoRA adapters...")
@@ -108,6 +156,16 @@ class ModelMerger:
             total_params = sum(p.numel() for p in self.merged_model.parameters())
             trainable_params = sum(p.numel() for p in self.merged_model.parameters() if p.requires_grad)
             result_message = f"""
 ✅ **Merge Completed Successfully!**
@@ -117,8 +175,9 @@ class ModelMerger:
 - Output Directory: `{OUTPUT_DIR}`
 - Total Parameters: {total_params:,}
 - Trainable Parameters: {trainable_params:,}
 - Timestamp: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
 **Next Steps:**
 1. The merged model is saved in the container at `/app/merged_model`
 2. You can now test the model using the inference tab
@@ -203,6 +262,21 @@ class ModelMerger:
 # Initialize merger
 merger = ModelMerger()
 # Create Gradio interface
 with gr.Blocks(theme=gr.themes.Soft(), title="LoRA Model Merger") as demo:
     gr.Markdown("""
@@ -213,10 +287,11 @@ with gr.Blocks(theme=gr.themes.Soft(), title="LoRA Model Merger") as demo:
     **Models:**
     - **Base Model:** `moonshotai/Kimi-Linear-48B-A3B-Instruct`
     - **LoRA Adapters:** `Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned`
-    **Hardware:** Running on 4xL40S GPUs
     """)
     with gr.Tabs():
         # Tab 1: Merge Models
         with gr.Tab("🔄 Merge Models"):
@@ -228,7 +303,11 @@ with gr.Blocks(theme=gr.themes.Soft(), title="LoRA Model Merger") as demo:
             2. Merge the LoRA weights into the base model
             3. Save the merged model for inference
-            ⚠️ **Note:** This process may take 10-30 minutes depending on model size and network speed.
             """)
             with gr.Row():

 from huggingface_hub import login, snapshot_download
 import logging
 from datetime import datetime
+from accelerate import init_empty_weights, load_checkpoint_and_dispatch, infer_auto_device_map
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
+# Import-time diagnostic: warn when CUDA is unavailable, otherwise log the
+# device count followed by the name and total memory of every visible GPU.
+if not torch.cuda.is_available():
+    logger.warning("No GPUs found! This will likely fail for 48B model.")
+else:
+    num_gpus = torch.cuda.device_count()
+    logger.info(f"Found {num_gpus} GPUs available")
+    for i in range(num_gpus):
+        gpu_name = torch.cuda.get_device_name(i)
+        # Dividing by 1024**3 scales total_memory to the unit logged as "GB".
+        gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
+        logger.info(f"GPU {i}: {gpu_name} with {gpu_memory:.2f} GB memory")
 # Constants
 BASE_MODEL_NAME = "moonshotai/Kimi-Linear-48B-A3B-Instruct"
 LORA_MODEL_NAME = "Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned"
         if self.merged_model is not None:
             del self.merged_model
         gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            # Synchronize all GPUs
+            for i in range(torch.cuda.device_count()):
+                with torch.cuda.device(i):
+                    torch.cuda.empty_cache()
+                    torch.cuda.synchronize()
+        logger.info("Memory cleared successfully")
     def login_huggingface(self, token):
         """Login to Hugging Face"""
             logger.info("Loading tokenizer...")
             self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)
+            # Configure memory allocation for multi-GPU setup (4xL40S = 4x48GB = 192GB)
+            # Reserve some memory for CUDA overhead and operations
+            num_gpus = torch.cuda.device_count()
+            max_memory = {}
+            if num_gpus > 0:
+                # Allocate memory per GPU (leave ~2GB per GPU for overhead)
+                per_gpu_memory = "46GB"  # 48GB - 2GB overhead for L40S
+                for i in range(num_gpus):
+                    max_memory[i] = per_gpu_memory
+                logger.info(f"Configured max_memory: {max_memory}")
+            else:
+                # Fallback for CPU-only (will be slow)
+                max_memory = {"cpu": "64GB"}
+                logger.warning("No GPUs detected, using CPU fallback")
+            # Load base model with explicit multi-GPU configuration
             progress(0.25, desc="Loading base model (this may take several minutes)...")
             logger.info(f"Loading base model: {BASE_MODEL_NAME}")
+            logger.info(f"Using bfloat16 precision for memory efficiency")
+            try:
+                self.base_model = AutoModelForCausalLM.from_pretrained(
+                    BASE_MODEL_NAME,
+                    torch_dtype=torch.bfloat16,
+                    device_map="auto",
+                    max_memory=max_memory,
+                    trust_remote_code=True,
+                    low_cpu_mem_usage=True,
+                    offload_folder="/tmp/offload",  # Fallback offload directory
+                    offload_state_dict=True,  # Offload state dict when loading
+                )
+                logger.info("Base model loaded successfully")
+                # Log device map to see distribution
+                if hasattr(self.base_model, 'hf_device_map'):
+                    logger.info(f"Model device map: {self.base_model.hf_device_map}")
+            except torch.cuda.OutOfMemoryError as e:
+                logger.error("Out of memory error! Try with quantization or smaller batch size")
+                raise Exception(f"GPU Out of Memory: {str(e)}. The 48B model requires ~96GB VRAM in bfloat16. Ensure 4xL40S GPUs are available.")
             # Load LoRA configuration
             progress(0.50, desc="Loading LoRA adapters...")
             total_params = sum(p.numel() for p in self.merged_model.parameters())
             trainable_params = sum(p.numel() for p in self.merged_model.parameters() if p.requires_grad)
+            # Get GPU memory usage
+            gpu_memory_info = ""
+            if torch.cuda.is_available():
+                gpu_memory_info = "\n**GPU Memory Usage:**\n"
+                for i in range(torch.cuda.device_count()):
+                    allocated = torch.cuda.memory_allocated(i) / 1024**3
+                    reserved = torch.cuda.memory_reserved(i) / 1024**3
+                    total = torch.cuda.get_device_properties(i).total_memory / 1024**3
+                    gpu_memory_info += f"- GPU {i}: {allocated:.2f}GB allocated / {reserved:.2f}GB reserved / {total:.2f}GB total\n"
             result_message = f"""
 ✅ **Merge Completed Successfully!**
 - Output Directory: `{OUTPUT_DIR}`
 - Total Parameters: {total_params:,}
 - Trainable Parameters: {trainable_params:,}
+- Model Size (bfloat16): ~{(total_params * 2) / 1024**3:.2f} GB
 - Timestamp: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
+{gpu_memory_info}
 **Next Steps:**
 1. The merged model is saved in the container at `/app/merged_model`
 2. You can now test the model using the inference tab
 # Initialize merger
 merger = ModelMerger()
+# Get GPU info for display
+def get_gpu_info():
+    """Build the Markdown summary of visible CUDA devices.
+
+    Returns:
+        A Markdown string. When CUDA is unavailable it is a single warning
+        line; otherwise it is a header with the device count, one bullet per
+        device (index, name, total memory) and a total-VRAM footer.
+    """
+    # Guard first: the per-device queries below are only made when CUDA is up.
+    if not torch.cuda.is_available():
+        return "⚠️ **No GPUs detected!** This Space requires GPUs to run."
+
+    device_count = torch.cuda.device_count()
+    bullets = []
+    capacities = []
+    for index in range(device_count):
+        label = torch.cuda.get_device_name(index)
+        # Dividing by 1024**3 scales total_memory to the unit shown as "GB".
+        capacity = torch.cuda.get_device_properties(index).total_memory / 1024**3
+        capacities.append(capacity)
+        bullets.append(f"- GPU {index}: {label} ({capacity:.1f} GB)\n")
+
+    header = f"✅ **{device_count} GPU(s) detected:**\n\n"
+    footer = f"\n**Total VRAM:** {sum(capacities):.1f} GB"
+    return header + "".join(bullets) + footer
 # Create Gradio interface
 with gr.Blocks(theme=gr.themes.Soft(), title="LoRA Model Merger") as demo:
     gr.Markdown("""
     **Models:**
     - **Base Model:** `moonshotai/Kimi-Linear-48B-A3B-Instruct`
     - **LoRA Adapters:** `Optivise/kimi-linear-48b-a3b-instruct-qlora-fine-tuned`
     """)
+    # Display GPU info
+    gr.Markdown(get_gpu_info())
     with gr.Tabs():
         # Tab 1: Merge Models
         with gr.Tab("🔄 Merge Models"):
             2. Merge the LoRA weights into the base model
             3. Save the merged model for inference
+            ⚠️ **Important Notes:**
+            - This process may take 10-30 minutes depending on model size and network speed
+            - The 48B parameter model requires **~96GB VRAM** in bfloat16 precision
+            - Recommended: 4x L40S GPUs (192GB total VRAM) for comfortable operation
+            - The model will be automatically distributed across all available GPUs
             """)
             with gr.Row():