mclemcrew committed on
Commit
53c24d3
·
1 Parent(s): 89b08d6
Files changed (3) hide show
  1. app.py +31 -16
  2. bitsandbytes +1 -0
  3. requirements.txt +3 -3
app.py CHANGED
@@ -51,29 +51,43 @@ def load_model():
51
  processor = AutoProcessor.from_pretrained(MODEL_ID)
52
  logger.info("Processor loaded successfully")
53
 
54
- # Explicitly avoid any quantization/bitsandbytes paths
55
- logger.info(f"Loading model with direct GPU loading")
56
 
57
- # Check if GPU is available
58
  if torch.cuda.is_available():
59
  try:
60
- # Try direct GPU loading with FP16
61
- logger.info("Using FP16 precision on GPU")
62
- # Override the device_map to be more explicit
63
- model = Qwen2AudioForConditionalGeneration.from_pretrained(
64
- MODEL_ID,
65
- torch_dtype=torch.float16, # Use float16 precision
66
- device_map="auto",
67
- # Explicitly disable any 8-bit or 4-bit quantization
68
- load_in_8bit=False,
69
- load_in_4bit=False,
70
- )
71
- logger.info("Model loaded successfully with FP16 on GPU")
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  except Exception as gpu_error:
73
  logger.warning(f"GPU loading failed: {gpu_error}. Falling back to CPU.")
74
  model = Qwen2AudioForConditionalGeneration.from_pretrained(
75
  MODEL_ID,
76
  device_map="cpu",
 
77
  )
78
  logger.info("Model loaded successfully on CPU")
79
  else:
@@ -81,9 +95,10 @@ def load_model():
81
  model = Qwen2AudioForConditionalGeneration.from_pretrained(
82
  MODEL_ID,
83
  device_map="cpu",
 
84
  )
85
  logger.info("Model loaded successfully on CPU")
86
-
87
  model.eval()
88
  log_gpu_memory("After model loading")
89
  return model, processor
 
51
  processor = AutoProcessor.from_pretrained(MODEL_ID)
52
  logger.info("Processor loaded successfully")
53
 
54
+ # Skip quantization attempts since we know it's problematic with CUDA 12.4
55
+ logger.info(f"Loading model with optimized settings for your environment")
56
 
57
+ # Check if GPU is available and has enough memory
58
  if torch.cuda.is_available():
59
  try:
60
+ # Get GPU memory info
61
+ gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
62
+ logger.info(f"GPU memory: {gpu_memory:.2f} GB")
63
+
64
+ # If GPU has enough memory, try loading directly without quantization
65
+ if gpu_memory > 16: # For GPUs with >16GB memory
66
+ logger.info("Using FP16 precision on GPU")
67
+ model = Qwen2AudioForConditionalGeneration.from_pretrained(
68
+ MODEL_ID,
69
+ torch_dtype=torch.float16,
70
+ device_map="auto",
71
+ low_cpu_mem_usage=True
72
+ )
73
+ logger.info("Model loaded successfully with FP16")
74
+ else:
75
+ # For smaller GPUs, use CPU offloading
76
+ logger.info("Using CPU offloading for model components")
77
+ model = Qwen2AudioForConditionalGeneration.from_pretrained(
78
+ MODEL_ID,
79
+ torch_dtype=torch.float16,
80
+ device_map="auto",
81
+ offload_folder="offload",
82
+ low_cpu_mem_usage=True
83
+ )
84
+ logger.info("Model loaded successfully with CPU offloading")
85
  except Exception as gpu_error:
86
  logger.warning(f"GPU loading failed: {gpu_error}. Falling back to CPU.")
87
  model = Qwen2AudioForConditionalGeneration.from_pretrained(
88
  MODEL_ID,
89
  device_map="cpu",
90
+ low_cpu_mem_usage=True
91
  )
92
  logger.info("Model loaded successfully on CPU")
93
  else:
 
95
  model = Qwen2AudioForConditionalGeneration.from_pretrained(
96
  MODEL_ID,
97
  device_map="cpu",
98
+ low_cpu_mem_usage=True
99
  )
100
  logger.info("Model loaded successfully on CPU")
101
+
102
  model.eval()
103
  log_gpu_memory("After model loading")
104
  return model, processor
bitsandbytes ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit e82f72b3acd37bfa9f32773e8844ac7bafad2b19
requirements.txt CHANGED
@@ -1,11 +1,11 @@
1
  gradio==4.44.1
2
- transformers>=4.35.0
3
  torch>=2.0.1
4
- accelerate>=0.20.0
5
  numpy>=1.24.0
6
  librosa>=0.10.0
7
  soundfile>=0.12.1
8
  requests>=2.28.0
9
  pillow>=9.5.0
10
  huggingface_hub>=0.16.0
11
- scikit-learn>=1.0.2
 
 
 
1
  gradio==4.44.1
 
2
  torch>=2.0.1
 
3
  numpy>=1.24.0
4
  librosa>=0.10.0
5
  soundfile>=0.12.1
6
  requests>=2.28.0
7
  pillow>=9.5.0
8
  huggingface_hub>=0.16.0
9
+ scikit-learn>=1.0.2
10
+ git+https://github.com/huggingface/accelerate.git
11
+ git+https://github.com/huggingface/transformers.git