AKSazgar committed on
Commit
0973284
·
1 Parent(s): 3f53e8d

Add 8-bit quantization to reduce memory usage

Browse files

- Load model with load_in_8bit=True
- Add bitsandbytes dependency
- Should reduce memory footprint by ~50% relative to the previous bfloat16 weights
- Fixes memory limit exceeded error on free tier

Files changed (2) hide show
  1. app.py +4 -3
  2. requirements.txt +1 -0
app.py CHANGED
@@ -13,18 +13,19 @@ import gradio as gr
13
  MODEL_ID = "convaiinnovations/ECG-Instruct-Llama-3.2-11B-Vision"
14
 
15
  print(f"Loading model: {MODEL_ID}")
 
16
  print("This may take a few minutes on first load...")
17
 
18
- # Load model and processor
19
  model = MllamaForConditionalGeneration.from_pretrained(
20
  MODEL_ID,
21
- dtype=torch.bfloat16,
22
  device_map="auto",
23
  )
24
 
25
  processor = AutoProcessor.from_pretrained(MODEL_ID)
26
 
27
- print("Model loaded successfully!")
28
 
29
 
30
  # Helper functions
 
13
  MODEL_ID = "convaiinnovations/ECG-Instruct-Llama-3.2-11B-Vision"
14
 
15
  print(f"Loading model: {MODEL_ID}")
16
+ print("Loading in 8-bit mode to reduce memory usage...")
17
  print("This may take a few minutes on first load...")
18
 
19
+ # Load model and processor with 8-bit quantization to reduce memory
20
  model = MllamaForConditionalGeneration.from_pretrained(
21
  MODEL_ID,
22
+ load_in_8bit=True,
23
  device_map="auto",
24
  )
25
 
26
  processor = AutoProcessor.from_pretrained(MODEL_ID)
27
 
28
+ print("Model loaded successfully in 8-bit mode!")
29
 
30
 
31
  # Helper functions
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
  torch>=2.0.0
2
  transformers>=4.45.0
3
  accelerate>=0.20.0
 
4
  Pillow>=10.0.0
5
  sentencepiece>=0.1.99
6
  protobuf>=3.20.0
 
1
  torch>=2.0.0
2
  transformers>=4.45.0
3
  accelerate>=0.20.0
4
+ bitsandbytes>=0.41.0
5
  Pillow>=10.0.0
6
  sentencepiece>=0.1.99
7
  protobuf>=3.20.0