Add 8-bit quantization to reduce memory usage

- Load model with load_in_8bit=True
- Add bitsandbytes dependency
- Should reduce memory footprint by ~50% (see the estimate below)
- Fixes the "memory limit exceeded" error on the free tier

Files changed:
- app.py (+4 -3)
- requirements.txt (+1 -0)
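The ~50% figure is a back-of-envelope estimate for the weights alone, assuming the "11B" in the model name reflects the parameter count and ignoring activations, KV cache, and framework overhead:

```python
# Rough weight-memory estimate for an ~11B-parameter model (weights only;
# activations, KV cache, and runtime overhead are not counted here).
params = 11e9
fp16_gb = params * 2 / 1e9  # 2 bytes per weight in fp16/bf16 -> ~22 GB
int8_gb = params * 1 / 1e9  # 1 byte per weight in int8       -> ~11 GB
print(f"fp16/bf16: ~{fp16_gb:.0f} GB, int8: ~{int8_gb:.0f} GB")
```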
app.py

```diff
@@ -13,18 +13,19 @@ import gradio as gr
 MODEL_ID = "convaiinnovations/ECG-Instruct-Llama-3.2-11B-Vision"
 
 print(f"Loading model: {MODEL_ID}")
+print("Loading in 8-bit mode to reduce memory usage...")
 print("This may take a few minutes on first load...")
 
-# Load model and processor
+# Load model and processor with 8-bit quantization to reduce memory
 model = MllamaForConditionalGeneration.from_pretrained(
     MODEL_ID,
-
+    load_in_8bit=True,
     device_map="auto",
 )
 
 processor = AutoProcessor.from_pretrained(MODEL_ID)
 
-print("Model loaded successfully!")
+print("Model loaded successfully in 8-bit mode!")
 
 
 # Helper functions
```
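Note: recent transformers releases deprecate passing load_in_8bit=True directly to from_pretrained in favor of a BitsAndBytesConfig. If the Space later bumps its transformers pin, the equivalent call would look roughly like this sketch (same behavior, newer API):

```python
from transformers import AutoProcessor, BitsAndBytesConfig, MllamaForConditionalGeneration

MODEL_ID = "convaiinnovations/ECG-Instruct-Llama-3.2-11B-Vision"

# BitsAndBytesConfig is the non-deprecated way to request 8-bit loading.
quant_config = BitsAndBytesConfig(load_in_8bit=True)

model = MllamaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=quant_config,
    device_map="auto",  # lets accelerate place the quantized weights
)
processor = AutoProcessor.from_pretrained(MODEL_ID)
```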
requirements.txt

```diff
@@ -1,6 +1,7 @@
 torch>=2.0.0
 transformers>=4.45.0
 accelerate>=0.20.0
+bitsandbytes>=0.41.0
 Pillow>=10.0.0
 sentencepiece>=0.1.99
 protobuf>=3.20.0
```
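To verify the reduction on a running instance, transformers models expose get_memory_footprint(); a quick sanity check (assuming `model` is the object loaded in app.py) could be:

```python
# Report the loaded model's weight footprint; get_memory_footprint()
# sums the sizes of parameters (and buffers) in bytes.
footprint_gb = model.get_memory_footprint() / 1024**3
print(f"Model weight footprint: {footprint_gb:.1f} GiB")
```

Comparing this value before and after the change is a more reliable check than the static estimate above, since it reflects the dtypes actually used at load time.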