Spaces:
Sleeping
Sleeping
Commit ·
5644567
1
Parent(s): b648ad9
Add BF16/INT8/INT4 quantization support for LLaVA-Video to fit within 23GB VRAM HF Spaces limit
Browse files
app.py
CHANGED
|
@@ -22,14 +22,37 @@ MAX_NEW_TOKENS = 512
|
|
| 22 |
TEMPERATURE = 0.01
|
| 23 |
|
| 24 |
# ----------------------
|
| 25 |
-
#
|
| 26 |
# ----------------------
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
# ----------------------
|
| 35 |
# Collect video IDs
|
|
@@ -128,6 +151,21 @@ with gr.Blocks(title="Video QA – LLaVa-Video-7B-Qwen2", theme=gr.themes.Soft()
|
|
| 128 |
|
| 129 |
gr.Markdown("### ⚙️ Model Parameters")
|
| 130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
fps_slider = gr.Slider(
|
| 132 |
minimum=0.5,
|
| 133 |
maximum=10.0,
|
|
@@ -203,6 +241,7 @@ with gr.Blocks(title="Video QA – LLaVa-Video-7B-Qwen2", theme=gr.themes.Soft()
|
|
| 203 |
gr.Markdown("""
|
| 204 |
---
|
| 205 |
**ℹ️ Tips:**
|
|
|
|
| 206 |
- Adjust FPS to control video sampling rate (higher = more frames, slower inference)
|
| 207 |
- Use video_mode='frames' for fixed frame count (useful for very long videos)
|
| 208 |
- Temperature: Lower (0.01-0.5) for factual, higher (0.7-1.5) for creative responses
|
|
@@ -216,6 +255,13 @@ with gr.Blocks(title="Video QA – LLaVa-Video-7B-Qwen2", theme=gr.themes.Soft()
|
|
| 216 |
outputs=video_player
|
| 217 |
)
|
| 218 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
# Run inference
|
| 220 |
run.click(
|
| 221 |
fn=video_qa,
|
|
|
|
| 22 |
TEMPERATURE = 0.01
|
| 23 |
|
| 24 |
# ----------------------
# Model loading with quantization support
# ----------------------
model: BaseVideoModel = None
current_quantization = "16-bit"

def load_model_with_quantization(quantization_mode: str) -> str:
    """Load or reload the global model with the requested quantization.

    Args:
        quantization_mode: One of "16-bit", "8-bit", or "4-bit".
            Anything other than "8-bit"/"4-bit" falls through to normal
            16-bit precision (both loader flags stay False).

    Returns:
        A human-readable status string shown in the Gradio status box.
    """
    global model, current_quantization

    # Release the previously loaded model BEFORE loading the new one.
    # Without this, the old weights stay resident while the new ones are
    # allocated, so two copies coexist in VRAM during a reload — which
    # defeats the point of quantizing to fit the 23GB HF Spaces limit.
    if model is not None:
        import gc
        import torch
        model = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Map the UI choice onto the loader's boolean flags.
    load_8bit = quantization_mode == "8-bit"
    load_4bit = quantization_mode == "4-bit"

    print(f"Loading LLaVa-Video-7B-Qwen2 with {quantization_mode} quantization...")
    model = load_model(
        MODEL_PATH,
        device_map=DEVICE_MAP,
        load_8bit=load_8bit,
        load_4bit=load_4bit,
    )
    current_quantization = quantization_mode
    print(f"Model loaded with {quantization_mode} quantization.")
    return f"✅ Model loaded successfully with {quantization_mode} quantization"

# Load model initially with 16-bit (normal)
load_model_with_quantization("16-bit")
|
| 56 |
|
| 57 |
# ----------------------
|
| 58 |
# Collect video IDs
|
|
|
|
| 151 |
|
| 152 |
gr.Markdown("### ⚙️ Model Parameters")
|
| 153 |
|
| 154 |
+
quantization_radio = gr.Radio(
|
| 155 |
+
choices=["16-bit", "8-bit", "4-bit"],
|
| 156 |
+
value="16-bit",
|
| 157 |
+
label="🔧 Model Quantization",
|
| 158 |
+
info="16-bit: Default precision, 8-bit/4-bit: Reduced memory usage"
|
| 159 |
+
)
|
| 160 |
+
|
| 161 |
+
reload_button = gr.Button("🔄 Reload Model", variant="secondary")
|
| 162 |
+
reload_status = gr.Textbox(
|
| 163 |
+
label="Model Status",
|
| 164 |
+
value=f"Model loaded with {current_quantization} quantization",
|
| 165 |
+
interactive=False,
|
| 166 |
+
lines=1
|
| 167 |
+
)
|
| 168 |
+
|
| 169 |
fps_slider = gr.Slider(
|
| 170 |
minimum=0.5,
|
| 171 |
maximum=10.0,
|
|
|
|
| 241 |
gr.Markdown("""
|
| 242 |
---
|
| 243 |
**ℹ️ Tips:**
|
| 244 |
+
- **Quantization:** 16-bit (full precision), 8-bit (2x memory savings), 4-bit (4x memory savings with slight quality loss)
|
| 245 |
- Adjust FPS to control video sampling rate (higher = more frames, slower inference)
|
| 246 |
- Use video_mode='frames' for fixed frame count (useful for very long videos)
|
| 247 |
- Temperature: Lower (0.01-0.5) for factual, higher (0.7-1.5) for creative responses
|
|
|
|
| 255 |
outputs=video_player
|
| 256 |
)
|
| 257 |
|
| 258 |
+
# Reload model with new quantization
|
| 259 |
+
reload_button.click(
|
| 260 |
+
fn=load_model_with_quantization,
|
| 261 |
+
inputs=quantization_radio,
|
| 262 |
+
outputs=reload_status
|
| 263 |
+
)
|
| 264 |
+
|
| 265 |
# Run inference
|
| 266 |
run.click(
|
| 267 |
fn=video_qa,
|