Fix 8-bit quantization bug, switch to 4-bit with BitsAndBytesConfig
- Fix RuntimeError: view size not compatible with tensor
- Switch from deprecated load_in_4bit to BitsAndBytesConfig
- Use 4-bit quantization (more stable than 8-bit)
- Reduces memory from ~11GB to ~5GB
- Uses NF4 quantization with double quant for better quality
app.py CHANGED
@@ -5,7 +5,7 @@ Gradio interface for Hugging Face Spaces
 """
 
 import torch
-from transformers import MllamaForConditionalGeneration, AutoProcessor, TextStreamer
+from transformers import MllamaForConditionalGeneration, AutoProcessor, TextStreamer, BitsAndBytesConfig
 from PIL import Image
 import gradio as gr
 
@@ -13,19 +13,29 @@ import gradio as gr
 MODEL_ID = "convaiinnovations/ECG-Instruct-Llama-3.2-11B-Vision"
 
 print(f"Loading model: {MODEL_ID}")
-print("Loading in 8-bit mode...")
+print("Loading in 4-bit mode to fit in free tier memory (16GB)...")
 print("This may take a few minutes on first load...")
 
-# Load model in 8-bit to reduce memory
+# Configure 4-bit quantization properly using BitsAndBytesConfig
+# This is more stable than deprecated load_in_4bit parameter
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4"
+)
+
+# Load model and processor with 4-bit quantization to reduce memory significantly
+# This allows the 11B model to run on free tier (16GB GPU)
 model = MllamaForConditionalGeneration.from_pretrained(
     MODEL_ID,
-    load_in_8bit=True,
+    quantization_config=quantization_config,
     device_map="auto",
 )
 
 processor = AutoProcessor.from_pretrained(MODEL_ID)
 
-print("Model loaded successfully in 8-bit mode!")
+print("Model loaded successfully in 4-bit mode!")
 
 
 # Helper functions
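
As a quick sanity check of the "~11GB to ~5GB" claim in the commit message, here is a minimal sketch (not part of this commit) that loads the model with the same BitsAndBytesConfig and prints its footprint. It relies on the standard transformers helper get_memory_footprint(); the exact number will vary with hardware and library versions, and ~5GB is the author's measurement, not a guarantee.

# Sketch: verify the 4-bit memory footprint of the model loaded as in the diff above
import torch
from transformers import MllamaForConditionalGeneration, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

model = MllamaForConditionalGeneration.from_pretrained(
    "convaiinnovations/ECG-Instruct-Llama-3.2-11B-Vision",
    quantization_config=bnb_config,
    device_map="auto",
)

# get_memory_footprint() reports the memory used by parameters and buffers, in bytes
print(f"Model footprint: {model.get_memory_footprint() / 1024**3:.1f} GB")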