AKSazgar committed · Commit b7d24ee · 1 Parent(s): 0973284

Fix 8-bit quantization bug, switch to 4-bit with BitsAndBytesConfig

- Fix RuntimeError: view size not compatible with tensor
- Replace the deprecated load_in_8bit flag with BitsAndBytesConfig
- Use 4-bit quantization (more stable here than 8-bit)
- Reduce memory from ~11GB to ~5GB (see the estimate below)
- Use NF4 quantization with double quantization for better quality
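
As a back-of-envelope check of the memory claim, weight storage alone accounts for the drop. A sketch under that assumption (activations, the KV cache, and NF4's quantization constants are ignored; double quantization shrinks the latter further):

params = 11e9                                  # 11B parameters
print(f"8-bit: ~{params * 1.0 / 1e9:.0f} GB")  # 1 byte per param   -> ~11 GB
print(f"4-bit: ~{params * 0.5 / 1e9:.1f} GB")  # 0.5 byte per param -> ~5.5 GB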

Files changed (1)
  1. app.py +15 -5
app.py CHANGED
@@ -5,7 +5,7 @@ Gradio interface for Hugging Face Spaces
 """
 
 import torch
-from transformers import MllamaForConditionalGeneration, AutoProcessor, TextStreamer
+from transformers import MllamaForConditionalGeneration, AutoProcessor, TextStreamer, BitsAndBytesConfig
 from PIL import Image
 import gradio as gr
 
@@ -13,19 +13,29 @@ import gradio as gr
 MODEL_ID = "convaiinnovations/ECG-Instruct-Llama-3.2-11B-Vision"
 
 print(f"Loading model: {MODEL_ID}")
-print("Loading in 8-bit mode to reduce memory usage...")
+print("Loading in 4-bit mode to fit in free tier memory (16GB)...")
 print("This may take a few minutes on first load...")
 
-# Load model and processor with 8-bit quantization to reduce memory
+# Configure 4-bit quantization properly using BitsAndBytesConfig
+# This is more stable than the deprecated load_in_4bit parameter
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4"
+)
+
+# Load model and processor with 4-bit quantization to reduce memory significantly
+# This allows the 11B model to run on the free tier (16GB GPU)
 model = MllamaForConditionalGeneration.from_pretrained(
     MODEL_ID,
-    load_in_8bit=True,
+    quantization_config=quantization_config,
     device_map="auto",
 )
 
 processor = AutoProcessor.from_pretrained(MODEL_ID)
 
-print("Model loaded successfully in 8-bit mode!")
+print("Model loaded successfully in 4-bit mode!")
 
 
 # Helper functions
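
For reference, a minimal inference sketch against the quantized model. This is not part of the commit: the image path and prompt are hypothetical, while processor and model are the objects created in app.py above, and the calls follow the standard transformers Mllama chat-template API.

from PIL import Image

# Hypothetical example input; any RGB ECG image would work here.
image = Image.open("sample_ecg.png")
messages = [{
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe any abnormalities in this ECG."},
    ],
}]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(image, prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=256)
print(processor.decode(output[0], skip_special_tokens=True))

With bnb_4bit_compute_dtype=torch.float16, bitsandbytes dequantizes the NF4 weights to fp16 on the fly for each matmul, so generation quality stays close to the half-precision model while the stored weights remain at roughly 5GB.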