mclemcrew committed on
Commit
5e824e3
·
1 Parent(s): ed55f0b

quantization not working?

Browse files
Files changed (2) hide show
  1. app.py +31 -34
  2. requirements.txt +0 -1
app.py CHANGED
@@ -51,33 +51,19 @@ def load_model():
51
  processor = AutoProcessor.from_pretrained(MODEL_ID)
52
  logger.info("Processor loaded successfully")
53
 
54
- # Try loading model with quantization first
55
- try:
56
- logger.info(f"Attempting to load model with quantization from {MODEL_ID}")
57
- from transformers import BitsAndBytesConfig
58
-
59
- # Configure BitsAndBytes for 4-bit quantization
60
- bnb_config = BitsAndBytesConfig(
61
- load_in_4bit=True,
62
- bnb_4bit_use_double_quant=True,
63
- bnb_4bit_quant_type="nf4",
64
- bnb_4bit_compute_dtype=torch.float16
65
- )
66
-
67
- model = Qwen2AudioForConditionalGeneration.from_pretrained(
68
- MODEL_ID,
69
- quantization_config=bnb_config,
70
- device_map="auto",
71
- low_cpu_mem_usage=True
72
- )
73
- logger.info("Model loaded successfully with quantization")
74
- except Exception as quant_error:
75
- # If quantization fails, fall back to basic loading
76
- logger.warning(f"Quantization failed: {quant_error}. Falling back to standard loading.")
77
-
78
- # Try FP16 if GPU available
79
- if torch.cuda.is_available():
80
- try:
81
  model = Qwen2AudioForConditionalGeneration.from_pretrained(
82
  MODEL_ID,
83
  torch_dtype=torch.float16,
@@ -85,22 +71,33 @@ def load_model():
85
  low_cpu_mem_usage=True
86
  )
87
  logger.info("Model loaded successfully with FP16")
88
- except Exception as fp16_error:
89
- logger.warning(f"FP16 loading failed: {fp16_error}. Falling back to CPU.")
 
90
  model = Qwen2AudioForConditionalGeneration.from_pretrained(
91
- MODEL_ID,
92
- device_map="cpu",
 
 
93
  low_cpu_mem_usage=True
94
  )
95
- logger.info("Model loaded successfully on CPU")
96
- else:
97
- # Load on CPU if no GPU
98
  model = Qwen2AudioForConditionalGeneration.from_pretrained(
99
  MODEL_ID,
100
  device_map="cpu",
101
  low_cpu_mem_usage=True
102
  )
103
  logger.info("Model loaded successfully on CPU")
 
 
 
 
 
 
 
 
104
 
105
  model.eval()
106
  log_gpu_memory("After model loading")
 
51
  processor = AutoProcessor.from_pretrained(MODEL_ID)
52
  logger.info("Processor loaded successfully")
53
 
54
+ # Skip quantization attempts since we know it's problematic with CUDA 12.4
55
+ logger.info(f"Loading model with optimized settings for your environment")
56
+
57
+ # Check if GPU is available and has enough memory
58
+ if torch.cuda.is_available():
59
+ try:
60
+ # Get GPU memory info
61
+ gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
62
+ logger.info(f"GPU memory: {gpu_memory:.2f} GB")
63
+
64
+ # If GPU has enough memory, try loading directly without quantization
65
+ if gpu_memory > 16: # For GPUs with >16GB memory
66
+ logger.info("Using FP16 precision on GPU")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  model = Qwen2AudioForConditionalGeneration.from_pretrained(
68
  MODEL_ID,
69
  torch_dtype=torch.float16,
 
71
  low_cpu_mem_usage=True
72
  )
73
  logger.info("Model loaded successfully with FP16")
74
+ else:
75
+ # For smaller GPUs, use CPU offloading
76
+ logger.info("Using CPU offloading for model components")
77
  model = Qwen2AudioForConditionalGeneration.from_pretrained(
78
+ MODEL_ID,
79
+ torch_dtype=torch.float16,
80
+ device_map="auto",
81
+ offload_folder="offload",
82
  low_cpu_mem_usage=True
83
  )
84
+ logger.info("Model loaded successfully with CPU offloading")
85
+ except Exception as gpu_error:
86
+ logger.warning(f"GPU loading failed: {gpu_error}. Falling back to CPU.")
87
  model = Qwen2AudioForConditionalGeneration.from_pretrained(
88
  MODEL_ID,
89
  device_map="cpu",
90
  low_cpu_mem_usage=True
91
  )
92
  logger.info("Model loaded successfully on CPU")
93
+ else:
94
+ # Load on CPU if no GPU
95
+ model = Qwen2AudioForConditionalGeneration.from_pretrained(
96
+ MODEL_ID,
97
+ device_map="cpu",
98
+ low_cpu_mem_usage=True
99
+ )
100
+ logger.info("Model loaded successfully on CPU")
101
 
102
  model.eval()
103
  log_gpu_memory("After model loading")
requirements.txt CHANGED
@@ -8,5 +8,4 @@ soundfile>=0.12.1
8
  requests>=2.28.0
9
  pillow>=9.5.0
10
  huggingface_hub>=0.16.0
11
- bitsandbytes>=0.41.0
12
  scikit-learn>=1.0.2
 
8
  requests>=2.28.0
9
  pillow>=9.5.0
10
  huggingface_hub>=0.16.0
 
11
  scikit-learn>=1.0.2