jena-shreyas committed on
Commit
5644567
·
1 Parent(s): b648ad9

Add BF16/INT8/INT4 quantization support for LLaVA-Video to fit within 23GB VRAM HF Spaces limit

Browse files
Files changed (1) hide show
  1. app.py +53 -7
app.py CHANGED
@@ -22,14 +22,37 @@ MAX_NEW_TOKENS = 512
22
  TEMPERATURE = 0.01
23
 
24
  # ----------------------
25
- # Load model ONCE
26
  # ----------------------
27
- print("Loading LLaVa-Video-7B-Qwen2...")
28
- model: BaseVideoModel = load_model(
29
- MODEL_PATH,
30
- device_map=DEVICE_MAP,
31
- )
32
- print("Model loaded.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  # ----------------------
35
  # Collect video IDs
@@ -128,6 +151,21 @@ with gr.Blocks(title="Video QA – LLaVa-Video-7B-Qwen2", theme=gr.themes.Soft()
128
 
129
  gr.Markdown("### ⚙️ Model Parameters")
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  fps_slider = gr.Slider(
132
  minimum=0.5,
133
  maximum=10.0,
@@ -203,6 +241,7 @@ with gr.Blocks(title="Video QA – LLaVa-Video-7B-Qwen2", theme=gr.themes.Soft()
203
  gr.Markdown("""
204
  ---
205
  **ℹ️ Tips:**
 
206
  - Adjust FPS to control video sampling rate (higher = more frames, slower inference)
207
  - Use video_mode='frames' for fixed frame count (useful for very long videos)
208
  - Temperature: Lower (0.01-0.5) for factual, higher (0.7-1.5) for creative responses
@@ -216,6 +255,13 @@ with gr.Blocks(title="Video QA – LLaVa-Video-7B-Qwen2", theme=gr.themes.Soft()
216
  outputs=video_player
217
  )
218
 
 
 
 
 
 
 
 
219
  # Run inference
220
  run.click(
221
  fn=video_qa,
 
22
  TEMPERATURE = 0.01
23
 
24
  # ----------------------
25
+ # Model loading with quantization support
26
  # ----------------------
27
+ model: BaseVideoModel = None
28
+ current_quantization = "16-bit"
29
+
30
+ def load_model_with_quantization(quantization_mode: str):
31
+ """Load or reload the model with specified quantization"""
32
+ global model, current_quantization
33
+
34
+ load_8bit = False
35
+ load_4bit = False
36
+
37
+ if quantization_mode == "8-bit":
38
+ load_8bit = True
39
+ elif quantization_mode == "4-bit":
40
+ load_4bit = True
41
+ # else: 16-bit (normal) - both flags remain False
42
+
43
+ print(f"Loading LLaVa-Video-7B-Qwen2 with {quantization_mode} quantization...")
44
+ model = load_model(
45
+ MODEL_PATH,
46
+ device_map=DEVICE_MAP,
47
+ load_8bit=load_8bit,
48
+ load_4bit=load_4bit,
49
+ )
50
+ current_quantization = quantization_mode
51
+ print(f"Model loaded with {quantization_mode} quantization.")
52
+ return f"✅ Model loaded successfully with {quantization_mode} quantization"
53
+
54
+ # Load model initially with 16-bit (normal)
55
+ load_model_with_quantization("16-bit")
56
 
57
  # ----------------------
58
  # Collect video IDs
 
151
 
152
  gr.Markdown("### ⚙️ Model Parameters")
153
 
154
+ quantization_radio = gr.Radio(
155
+ choices=["16-bit", "8-bit", "4-bit"],
156
+ value="16-bit",
157
+ label="🔧 Model Quantization",
158
+ info="16-bit: Default precision, 8-bit/4-bit: Reduced memory usage"
159
+ )
160
+
161
+ reload_button = gr.Button("🔄 Reload Model", variant="secondary")
162
+ reload_status = gr.Textbox(
163
+ label="Model Status",
164
+ value=f"Model loaded with {current_quantization} quantization",
165
+ interactive=False,
166
+ lines=1
167
+ )
168
+
169
  fps_slider = gr.Slider(
170
  minimum=0.5,
171
  maximum=10.0,
 
241
  gr.Markdown("""
242
  ---
243
  **ℹ️ Tips:**
244
+ - **Quantization:** 16-bit (full precision), 8-bit (2x memory savings), 4-bit (4x memory savings with slight quality loss)
245
  - Adjust FPS to control video sampling rate (higher = more frames, slower inference)
246
  - Use video_mode='frames' for fixed frame count (useful for very long videos)
247
  - Temperature: Lower (0.01-0.5) for factual, higher (0.7-1.5) for creative responses
 
255
  outputs=video_player
256
  )
257
 
258
+ # Reload model with new quantization
259
+ reload_button.click(
260
+ fn=load_model_with_quantization,
261
+ inputs=quantization_radio,
262
+ outputs=reload_status
263
+ )
264
+
265
  # Run inference
266
  run.click(
267
  fn=video_qa,