Fix 8-bit quantization bug, switch to 4-bit with BitsAndBytesConfig
- Fix RuntimeError: view size not compatible with tensor
- Switch from deprecated load_in_4bit to BitsAndBytesConfig
- Use 4-bit quantization (more stable than 8-bit)
- Reduces memory from ~11GB to ~5GB
- Uses NF4 quantization with double quant for better quality
app.py CHANGED
@@ -5,7 +5,7 @@ Gradio interface for Hugging Face Spaces
 """
 
 import torch
-from transformers import MllamaForConditionalGeneration, AutoProcessor, TextStreamer
+from transformers import MllamaForConditionalGeneration, AutoProcessor, TextStreamer, BitsAndBytesConfig
 from PIL import Image
 import gradio as gr
 
@@ -13,19 +13,29 @@ import gradio as gr
 MODEL_ID = "convaiinnovations/ECG-Instruct-Llama-3.2-11B-Vision"
 
 print(f"Loading model: {MODEL_ID}")
-print("Loading in 8-bit mode...")
+print("Loading in 4-bit mode to fit in free tier memory (16GB)...")
 print("This may take a few minutes on first load...")
 
-# Load model in 8-bit to reduce memory
+# Configure 4-bit quantization properly using BitsAndBytesConfig
+# This is more stable than deprecated load_in_4bit parameter
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4"
+)
+
+# Load model and processor with 4-bit quantization to reduce memory significantly
+# This allows the 11B model to run on free tier (16GB GPU)
 model = MllamaForConditionalGeneration.from_pretrained(
     MODEL_ID,
-    load_in_8bit=True,
+    quantization_config=quantization_config,
     device_map="auto",
 )
 
 processor = AutoProcessor.from_pretrained(MODEL_ID)
 
-print("Model loaded successfully in 8-bit mode!")
+print("Model loaded successfully in 4-bit mode!")
 
 
 # Helper functions
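
As a quick sanity check of the "~11GB to ~5GB" claim in the commit message, here is a minimal sketch (not part of this commit) that loads the model with the same BitsAndBytesConfig and prints its footprint. It relies on the standard transformers helper get_memory_footprint(); the exact number will vary with hardware and library versions, and ~5GB is the author's measurement, not a guarantee.

# Sketch: verify the 4-bit memory footprint of the model loaded as in the diff above
import torch
from transformers import MllamaForConditionalGeneration, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

model = MllamaForConditionalGeneration.from_pretrained(
    "convaiinnovations/ECG-Instruct-Llama-3.2-11B-Vision",
    quantization_config=bnb_config,
    device_map="auto",
)

# get_memory_footprint() reports the memory used by parameters and buffers, in bytes
print(f"Model footprint: {model.get_memory_footprint() / 1024**3:.1f} GB")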