Add 8-bit quantization to reduce memory usage

- Load model with load_in_8bit=True
- Add bitsandbytes dependency
- Should reduce memory footprint by ~50% (see the estimate below)
- Fixes the "memory limit exceeded" error on the free tier

Files changed:
- app.py (+4 -3)
- requirements.txt (+1 -0)
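The ~50% figure is a back-of-envelope estimate for the weights alone, assuming the "11B" in the model name reflects the parameter count and ignoring activations, KV cache, and framework overhead:

```python
# Rough weight-memory estimate for an ~11B-parameter model (weights only;
# activations, KV cache, and runtime overhead are not counted here).
params = 11e9
fp16_gb = params * 2 / 1e9  # 2 bytes per weight in fp16/bf16 -> ~22 GB
int8_gb = params * 1 / 1e9  # 1 byte per weight in int8       -> ~11 GB
print(f"fp16/bf16: ~{fp16_gb:.0f} GB, int8: ~{int8_gb:.0f} GB")
```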
app.py

```diff
@@ -13,18 +13,19 @@ import gradio as gr
 MODEL_ID = "convaiinnovations/ECG-Instruct-Llama-3.2-11B-Vision"
 
 print(f"Loading model: {MODEL_ID}")
+print("Loading in 8-bit mode to reduce memory usage...")
 print("This may take a few minutes on first load...")
 
-# Load model and processor
+# Load model and processor with 8-bit quantization to reduce memory
 model = MllamaForConditionalGeneration.from_pretrained(
     MODEL_ID,
-
+    load_in_8bit=True,
     device_map="auto",
 )
 
 processor = AutoProcessor.from_pretrained(MODEL_ID)
 
-print("Model loaded successfully!")
+print("Model loaded successfully in 8-bit mode!")
 
 
 # Helper functions
```
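Note: recent transformers releases deprecate passing load_in_8bit=True directly to from_pretrained in favor of a BitsAndBytesConfig. If the Space later bumps its transformers pin, the equivalent call would look roughly like this sketch (same behavior, newer API):

```python
from transformers import AutoProcessor, BitsAndBytesConfig, MllamaForConditionalGeneration

MODEL_ID = "convaiinnovations/ECG-Instruct-Llama-3.2-11B-Vision"

# BitsAndBytesConfig is the non-deprecated way to request 8-bit loading.
quant_config = BitsAndBytesConfig(load_in_8bit=True)

model = MllamaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=quant_config,
    device_map="auto",  # lets accelerate place the quantized weights
)
processor = AutoProcessor.from_pretrained(MODEL_ID)
```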
requirements.txt

```diff
@@ -1,6 +1,7 @@
 torch>=2.0.0
 transformers>=4.45.0
 accelerate>=0.20.0
+bitsandbytes>=0.41.0
 Pillow>=10.0.0
 sentencepiece>=0.1.99
 protobuf>=3.20.0
```
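To verify the reduction on a running instance, transformers models expose get_memory_footprint(); a quick sanity check (assuming `model` is the object loaded in app.py) could be:

```python
# Report the loaded model's weight footprint; get_memory_footprint()
# sums the sizes of parameters (and buffers) in bytes.
footprint_gb = model.get_memory_footprint() / 1024**3
print(f"Model weight footprint: {footprint_gb:.1f} GiB")
```

Comparing this value before and after the change is a more reliable check than the static estimate above, since it reflects the dtypes actually used at load time.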