jena-shreyas committed on
Commit
5644567
·
1 Parent(s): b648ad9

Add BF16/INT8/INT4 quantization support for LLaVA-Video to fit within 23GB VRAM HF Spaces limit

Browse files
Files changed (1) hide show
  1. app.py +53 -7
app.py CHANGED
@@ -22,14 +22,37 @@ MAX_NEW_TOKENS = 512
22
  TEMPERATURE = 0.01
23
 
24
  # ----------------------
25
- # Load model ONCE
26
  # ----------------------
27
- print("Loading LLaVa-Video-7B-Qwen2...")
28
- model: BaseVideoModel = load_model(
29
- MODEL_PATH,
30
- device_map=DEVICE_MAP,
31
- )
32
- print("Model loaded.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  # ----------------------
35
  # Collect video IDs
@@ -128,6 +151,21 @@ with gr.Blocks(title="Video QA – LLaVa-Video-7B-Qwen2", theme=gr.themes.Soft()
128
 
129
  gr.Markdown("### ⚙️ Model Parameters")
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  fps_slider = gr.Slider(
132
  minimum=0.5,
133
  maximum=10.0,
@@ -203,6 +241,7 @@ with gr.Blocks(title="Video QA – LLaVa-Video-7B-Qwen2", theme=gr.themes.Soft()
203
  gr.Markdown("""
204
  ---
205
  **ℹ️ Tips:**
 
206
  - Adjust FPS to control video sampling rate (higher = more frames, slower inference)
207
  - Use video_mode='frames' for fixed frame count (useful for very long videos)
208
  - Temperature: Lower (0.01-0.5) for factual, higher (0.7-1.5) for creative responses
@@ -216,6 +255,13 @@ with gr.Blocks(title="Video QA – LLaVa-Video-7B-Qwen2", theme=gr.themes.Soft()
216
  outputs=video_player
217
  )
218
 
 
 
 
 
 
 
 
219
  # Run inference
220
  run.click(
221
  fn=video_qa,
 
22
  TEMPERATURE = 0.01
23
 
24
  # ----------------------
25
+ # Model loading with quantization support
26
  # ----------------------
27
+ model: BaseVideoModel = None
28
+ current_quantization = "16-bit"
29
+
30
+ def load_model_with_quantization(quantization_mode: str):
31
+ """Load or reload the model with specified quantization"""
32
+ global model, current_quantization
33
+
34
+ load_8bit = False
35
+ load_4bit = False
36
+
37
+ if quantization_mode == "8-bit":
38
+ load_8bit = True
39
+ elif quantization_mode == "4-bit":
40
+ load_4bit = True
41
+ # else: 16-bit (normal) - both flags remain False
42
+
43
+ print(f"Loading LLaVa-Video-7B-Qwen2 with {quantization_mode} quantization...")
44
+ model = load_model(
45
+ MODEL_PATH,
46
+ device_map=DEVICE_MAP,
47
+ load_8bit=load_8bit,
48
+ load_4bit=load_4bit,
49
+ )
50
+ current_quantization = quantization_mode
51
+ print(f"Model loaded with {quantization_mode} quantization.")
52
+ return f"✅ Model loaded successfully with {quantization_mode} quantization"
53
+
54
+ # Load model initially with 16-bit (normal)
55
+ load_model_with_quantization("16-bit")
56
 
57
  # ----------------------
58
  # Collect video IDs
 
151
 
152
  gr.Markdown("### ⚙️ Model Parameters")
153
 
154
+ quantization_radio = gr.Radio(
155
+ choices=["16-bit", "8-bit", "4-bit"],
156
+ value="16-bit",
157
+ label="🔧 Model Quantization",
158
+ info="16-bit: Default precision, 8-bit/4-bit: Reduced memory usage"
159
+ )
160
+
161
+ reload_button = gr.Button("🔄 Reload Model", variant="secondary")
162
+ reload_status = gr.Textbox(
163
+ label="Model Status",
164
+ value=f"Model loaded with {current_quantization} quantization",
165
+ interactive=False,
166
+ lines=1
167
+ )
168
+
169
  fps_slider = gr.Slider(
170
  minimum=0.5,
171
  maximum=10.0,
 
241
  gr.Markdown("""
242
  ---
243
  **ℹ️ Tips:**
244
+ - **Quantization:** 16-bit (full precision), 8-bit (2x memory savings), 4-bit (4x memory savings with slight quality loss)
245
  - Adjust FPS to control video sampling rate (higher = more frames, slower inference)
246
  - Use video_mode='frames' for fixed frame count (useful for very long videos)
247
  - Temperature: Lower (0.01-0.5) for factual, higher (0.7-1.5) for creative responses
 
255
  outputs=video_player
256
  )
257
 
258
+ # Reload model with new quantization
259
+ reload_button.click(
260
+ fn=load_model_with_quantization,
261
+ inputs=quantization_radio,
262
+ outputs=reload_status
263
+ )
264
+
265
  # Run inference
266
  run.click(
267
  fn=video_qa,