aeb56 committed · Commit e32298d · Parent: 1443f5f
Add 8-bit quantization support and switch to L4x4 hardware for availability

README.md
CHANGED
@@ -7,7 +7,7 @@ sdk: docker
 pinned: false
 license: apache-2.0
 app_port: 7860
-suggested_hardware:
+suggested_hardware: l4x4
 ---

 # 🚀 LoRA Model Merger
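For context on the hardware switch: Hugging Face's `l4x4` tier provides four NVIDIA L4 GPUs at 24 GB each, roughly 96 GB of VRAM in total, which is what the new 90 GB warning threshold in app.py below is sized against. A back-of-envelope sketch of the per-GPU budget the new auto-detection logic would produce on that tier (illustrative figures only; the 3 GB headroom rule is taken from the diff):

num_gpus, vram_per_gpu = 4, 24                                       # assumed l4x4 layout: 4x NVIDIA L4
total_vram = num_gpus * vram_per_gpu                                 # 96 GB across the Space
max_memory = {i: f"{vram_per_gpu - 3}GB" for i in range(num_gpus)}   # 3 GB headroom per GPU
print(total_vram, max_memory)  # 96 {0: '21GB', 1: '21GB', 2: '21GB', 3: '21GB'}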
app.py
CHANGED
@@ -61,7 +61,7 @@ class ModelMerger:
             logger.error(f"Login failed: {str(e)}")
             return f"❌ Login failed: {str(e)}"

-    def merge_models(self, hf_token, progress=gr.Progress()):
+    def merge_models(self, hf_token, use_8bit=False, progress=gr.Progress()):
         """Merge LoRA adapters with base model"""
         try:
             # Login to HF
@@ -79,16 +79,27 @@
             logger.info("Loading tokenizer...")
             self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)

             # Configure memory allocation for multi-GPU setup
-            #
+            # Auto-detect GPU memory and adjust accordingly
             num_gpus = torch.cuda.device_count()
             max_memory = {}
+            total_vram = 0
+
             if num_gpus > 0:
-                #
-                per_gpu_memory = "46GB"  # 48GB - 2GB overhead for L40S
+                # Calculate available memory per GPU
                 for i in range(num_gpus):
+                    gpu_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
+                    total_vram += gpu_memory
+                    # Reserve 2-4GB per GPU for overhead
+                    per_gpu_memory = f"{int(gpu_memory - 3)}GB"
                     max_memory[i] = per_gpu_memory
+
+                logger.info(f"Detected {num_gpus} GPUs with total {total_vram:.1f}GB VRAM")
                 logger.info(f"Configured max_memory: {max_memory}")
+
+                # Warn if total VRAM is low
+                if total_vram < 90 and not use_8bit:
+                    logger.warning(f"Only {total_vram:.1f}GB VRAM available. The 48B model needs ~96GB in bfloat16. Consider enabling 8-bit quantization.")
             else:
                 # Fallback for CPU-only (will be slow)
                 max_memory = {"cpu": "64GB"}
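A standalone sketch of the detection logic this hunk adds, runnable outside the Space (device names and figures vary with the host): `torch.cuda.get_device_properties(i).total_memory` reports bytes, hence the division by 1024**3. One small caveat worth a follow-up: the budget is computed in GiB but labelled "GB", and if accelerate parses "GB" as 10^9 bytes (per the usual GB/GiB convention), the effective cap is slightly more conservative than intended, which is harmless here.

import torch

# Mirrors the per-GPU budget logic added above; prints an empty dict on a CPU-only host.
max_memory = {}
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    gib = props.total_memory / 1024**3          # total_memory is in bytes
    max_memory[i] = f"{int(gib - 3)}GB"         # same 3 GB headroom as the diff
    print(f"GPU {i}: {props.name}, {gib:.1f} GiB")
print(max_memory)  # e.g. {0: '21GB', 1: '21GB', 2: '21GB', 3: '21GB'} on l4x4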
@@ -97,28 +108,48 @@
             # Load base model with explicit multi-GPU configuration
             progress(0.25, desc="Loading base model (this may take several minutes)...")
             logger.info(f"Loading base model: {BASE_MODEL_NAME}")
-            logger.info("Using bfloat16 precision for memory efficiency")
+
+            if use_8bit:
+                logger.info(f"Using 8-bit quantization for memory efficiency (~50% memory reduction)")
+                precision_desc = "int8"
+            else:
+                logger.info(f"Using bfloat16 precision for memory efficiency")
+                precision_desc = "bfloat16"

             try:
+                load_kwargs = {
+                    "trust_remote_code": True,
+                    "low_cpu_mem_usage": True,
+                    "device_map": "auto",
+                    "max_memory": max_memory,
+                    "offload_folder": "/tmp/offload",
+                    "offload_state_dict": True,
+                }
+
+                if use_8bit:
+                    # Use 8-bit quantization for tighter memory constraints
+                    load_kwargs["load_in_8bit"] = True
+                else:
+                    # Use bfloat16 for best quality when memory allows
+                    load_kwargs["torch_dtype"] = torch.bfloat16
+
                 self.base_model = AutoModelForCausalLM.from_pretrained(
                     BASE_MODEL_NAME,
-                    torch_dtype=torch.bfloat16,
-                    device_map="auto",
-                    max_memory=max_memory,
-                    trust_remote_code=True,
-                    low_cpu_mem_usage=True,
-                    offload_folder="/tmp/offload",  # Fallback offload directory
-                    offload_state_dict=True,  # Offload state dict when loading
+                    **load_kwargs
                 )
-                logger.info("Base model loaded successfully")
+                logger.info(f"Base model loaded successfully in {precision_desc}")

                 # Log device map to see distribution
                 if hasattr(self.base_model, 'hf_device_map'):
                     logger.info(f"Model device map: {self.base_model.hf_device_map}")

             except torch.cuda.OutOfMemoryError as e:
-                logger.error("Out of memory error!")
-                raise
+                logger.error("Out of memory error!")
+                error_msg = f"GPU Out of Memory: The 48B model requires ~96GB VRAM in bfloat16 or ~48GB in 8-bit.\n"
+                error_msg += f"You have {total_vram:.1f}GB VRAM available.\n"
+                if not use_8bit:
+                    error_msg += "\n**Try enabling 8-bit quantization** to reduce memory usage by ~50%."
+                raise Exception(error_msg)

             # Load LoRA configuration
             progress(0.50, desc="Loading LoRA adapters...")
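One caveat on the new `load_in_8bit` flag: recent transformers releases route 8-bit loading through `BitsAndBytesConfig` and deprecate the bare kwarg, and either path needs the `bitsandbytes` package installed in the Space. A sketch of the equivalent call under the config-object API, assuming the `BASE_MODEL_NAME` and `max_memory` values defined in app.py:

from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Equivalent 8-bit load via the quantization-config API; requires `pip install bitsandbytes`.
# torch_dtype is deliberately unset: the int8 path manages compute dtypes itself.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,                  # as defined at the top of app.py
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
    max_memory=max_memory,            # the per-GPU budget dict built above
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)

It is also worth verifying that the installed PEFT version can merge adapters into an 8-bit base at all; older releases refuse with "Cannot merge LORA layers when the model is loaded in 8-bit mode", in which case the merge step needs the bfloat16 path regardless of memory pressure.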
@@ -318,12 +349,19 @@ with gr.Blocks(theme=gr.themes.Soft(), title="LoRA Model Merger") as demo:
             info="Required for accessing private models or avoiding rate limits"
         )

+        with gr.Row():
+            use_8bit_checkbox = gr.Checkbox(
+                label="Use 8-bit Quantization",
+                value=False,
+                info="Enable this if you have limited GPU memory (<96GB total). Reduces memory usage by ~50% with minimal quality loss."
+            )
+
         merge_button = gr.Button("🚀 Start Merge Process", variant="primary", size="lg")
         merge_output = gr.Markdown(label="Merge Status")

         merge_button.click(
             fn=merger.merge_models,
-            inputs=[hf_token_merge],
+            inputs=[hf_token_merge, use_8bit_checkbox],
             outputs=merge_output
         )

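Since Gradio passes `inputs` to the callback positionally (and injects the tracker for a `gr.Progress()` default automatically), the checkbox has to occupy the same position as the new `use_8bit` parameter. A minimal standalone sketch of the same wiring, with hypothetical stand-in names:

import gradio as gr

def merge_models(hf_token, use_8bit=False, progress=gr.Progress()):
    # Stand-in for ModelMerger.merge_models: just echo what the UI passed in.
    return f"token set: {bool(hf_token)}, 8-bit: {use_8bit}"

with gr.Blocks() as demo:
    token_box = gr.Textbox(label="HF Token", type="password")
    use_8bit_checkbox = gr.Checkbox(label="Use 8-bit Quantization", value=False)
    out = gr.Markdown()
    # inputs align positionally: token_box -> hf_token, use_8bit_checkbox -> use_8bit
    gr.Button("Merge").click(fn=merge_models, inputs=[token_box, use_8bit_checkbox], outputs=out)

demo.launch()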