mclemcrew committed
Commit 89b08d6 · Parent(s): 5e824e3

bitsandbytes error

Files changed (1): app.py (+16 -31)
app.py CHANGED
@@ -51,43 +51,29 @@ def load_model():
     processor = AutoProcessor.from_pretrained(MODEL_ID)
     logger.info("Processor loaded successfully")
 
-    # Skip quantization attempts since we know it's problematic with CUDA 12.4
-    logger.info(f"Loading model with optimized settings for your environment")
+    # Explicitly avoid any quantization/bitsandbytes paths
+    logger.info(f"Loading model with direct GPU loading")
 
-    # Check if GPU is available and has enough memory
+    # Check if GPU is available
     if torch.cuda.is_available():
         try:
-            # Get GPU memory info
-            gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
-            logger.info(f"GPU memory: {gpu_memory:.2f} GB")
-
-            # If GPU has enough memory, try loading directly without quantization
-            if gpu_memory > 16:  # For GPUs with >16GB memory
-                logger.info("Using FP16 precision on GPU")
-                model = Qwen2AudioForConditionalGeneration.from_pretrained(
-                    MODEL_ID,
-                    torch_dtype=torch.float16,
-                    device_map="auto",
-                    low_cpu_mem_usage=True
-                )
-                logger.info("Model loaded successfully with FP16")
-            else:
-                # For smaller GPUs, use CPU offloading
-                logger.info("Using CPU offloading for model components")
-                model = Qwen2AudioForConditionalGeneration.from_pretrained(
-                    MODEL_ID,
-                    torch_dtype=torch.float16,
-                    device_map="auto",
-                    offload_folder="offload",
-                    low_cpu_mem_usage=True
-                )
-                logger.info("Model loaded successfully with CPU offloading")
+            # Try direct GPU loading with FP16
+            logger.info("Using FP16 precision on GPU")
+            # Override the device_map to be more explicit
+            model = Qwen2AudioForConditionalGeneration.from_pretrained(
+                MODEL_ID,
+                torch_dtype=torch.float16,  # Use float16 precision
+                device_map="auto",
+                # Explicitly disable any 8-bit or 4-bit quantization
+                load_in_8bit=False,
+                load_in_4bit=False,
+            )
+            logger.info("Model loaded successfully with FP16 on GPU")
         except Exception as gpu_error:
             logger.warning(f"GPU loading failed: {gpu_error}. Falling back to CPU.")
             model = Qwen2AudioForConditionalGeneration.from_pretrained(
                 MODEL_ID,
                 device_map="cpu",
-                low_cpu_mem_usage=True
             )
             logger.info("Model loaded successfully on CPU")
     else:
@@ -95,10 +81,9 @@ def load_model():
         model = Qwen2AudioForConditionalGeneration.from_pretrained(
             MODEL_ID,
             device_map="cpu",
-            low_cpu_mem_usage=True
         )
         logger.info("Model loaded successfully on CPU")
-
+
     model.eval()
     log_gpu_memory("After model loading")
     return model, processor
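
For reference, a minimal, self-contained sketch of the loading pattern this commit settles on: FP16 weights placed with device_map="auto", the bitsandbytes quantization flags explicitly switched off, and a plain CPU fallback. This is an illustration, not the app's code; the MODEL_ID value below is an assumed checkpoint name, and the logging and GPU-memory bookkeeping from app.py are omitted.

# Sketch only: same load path as the patched load_model(), outside the app.
# MODEL_ID is an assumed checkpoint name, not taken from app.py.
import torch
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration

MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"  # assumption for illustration

processor = AutoProcessor.from_pretrained(MODEL_ID)

if torch.cuda.is_available():
    # Direct FP16 load; keep bitsandbytes out of the load path entirely.
    model = Qwen2AudioForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        device_map="auto",
        load_in_8bit=False,
        load_in_4bit=False,
    )
else:
    # CPU fallback, mirroring the except/else branches in load_model().
    model = Qwen2AudioForConditionalGeneration.from_pretrained(
        MODEL_ID,
        device_map="cpu",
    )

model.eval()  # inference only, as in app.py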