RaghavaMukkamala commited on
Commit
b9d859e
·
verified ·
1 Parent(s): 11b4d43

final updates

Browse files
Files changed (1) hide show
  1. app.py +114 -22
app.py CHANGED
@@ -1,49 +1,141 @@
1
- import gradio as gr
 
 
2
  import torch
3
- from transformers import AutoTokenizer, AutoModelForCausalLM
 
 
 
4
 
5
- from videollama3.conversation import conv_templates
6
- from videollama3.inference import chat
 
 
 
7
 
 
 
 
8
  MODEL_ID = "DAMO-NLP-SG/VideoLLaMA3-2B"
 
 
9
 
 
 
 
 
 
 
 
 
10
  tokenizer = AutoTokenizer.from_pretrained(
11
  MODEL_ID,
12
  trust_remote_code=True
13
  )
14
 
 
15
  model = AutoModelForCausalLM.from_pretrained(
16
  MODEL_ID,
17
  trust_remote_code=True,
18
- dtype=torch.float16,
19
  device_map="auto"
20
  )
21
-
22
  model.eval()
23
 
24
- def infer(video, prompt):
25
- conv = conv_templates["videollama3"].copy()
26
- conv.append_message(conv.roles[0], prompt)
27
- conv.append_message(conv.roles[1], None)
28
-
29
- output = chat(
30
- model=model,
31
- tokenizer=tokenizer,
32
- video_path=video,
33
- conversation=conv,
34
- max_new_tokens=512,
35
- temperature=0.2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  )
37
- return output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  demo = gr.Interface(
40
  fn=infer,
41
  inputs=[
42
  gr.Video(label="Upload video"),
43
- gr.Textbox(label="Prompt")
 
 
 
44
  ],
45
- outputs="text",
46
- title="🎥 Video-LLaMA-3 Demo"
 
47
  )
48
 
49
  demo.launch()
 
1
+ import os
2
+ os.environ["OMP_NUM_THREADS"] = "1"
3
+
4
  import torch
5
+ import gradio as gr
6
+ import cv2
7
+ import decord
8
+ import numpy as np
9
 
10
+ from transformers import (
11
+ AutoTokenizer,
12
+ AutoModelForCausalLM,
13
+ GenerationConfig,
14
+ )
15
 
# ------------------------
# Configuration
# ------------------------
MODEL_ID = "DAMO-NLP-SG/VideoLLaMA3-2B"

# Prefer GPU when one is visible; otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16

MAX_FRAMES = 32       # uniformly sampled frames per video; reduce if you hit OOM
MAX_NEW_TOKENS = 512  # generation budget per answer
TEMPERATURE = 0.2     # low temperature -> near-deterministic answers
# ------------------------
# Load model & tokenizer
# ------------------------
# Loading happens once at import time so every Gradio request reuses the
# same weights.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=DTYPE,
    device_map="auto",
)
model.eval()  # inference only — disables dropout and the like

# Decoding settings shared by every request.
generation_config = GenerationConfig(
    max_new_tokens=MAX_NEW_TOKENS,
    temperature=TEMPERATURE,
    do_sample=True,
)
# ------------------------
# Video utilities (from demo_video_llama3.py)
# ------------------------
def load_video(video_path, max_frames=32):
    """Load a video and sample at most ``max_frames`` frames uniformly.

    Args:
        video_path: Path to a video file readable by decord.
        max_frames: Upper bound on the number of frames returned.

    Returns:
        numpy array of shape (T, H, W, C) with T <= max_frames.

    Raises:
        ValueError: If the video contains no decodable frames (would
            otherwise fail deep inside decord with an opaque error).
    """
    vr = decord.VideoReader(video_path, ctx=decord.cpu(0))
    total_frames = len(vr)
    if total_frames == 0:
        raise ValueError(f"No decodable frames in video: {video_path!r}")

    if total_frames <= max_frames:
        # Short clip: keep every frame.
        indices = list(range(total_frames))
    else:
        # Long clip: evenly spaced sample across the whole duration.
        indices = np.linspace(0, total_frames - 1, max_frames, dtype=int).tolist()

    return vr.get_batch(indices).asnumpy()
# ------------------------
# Inference
# ------------------------
def videollama3_infer(video_path, prompt):
    """Answer ``prompt`` about the video at ``video_path``.

    Args:
        video_path: Filesystem path to the uploaded video, or None.
        prompt: User question about the video.

    Returns:
        The model's text response, or a user-facing message when no
        video was supplied.
    """
    if video_path is None:
        return "Please upload a video."

    # Load & sample video to a (T, H, W, C) uint8 frame array.
    frames = load_video(video_path, MAX_FRAMES)

    # Build multimodal prompt (as in official demo)
    system_prompt = (
        "You are VideoLLaMA, a helpful assistant that understands videos."
    )

    full_prompt = (
        f"<|system|>\n{system_prompt}\n"
        f"<|user|>\n{prompt}\n"
        f"<|assistant|>\n"
    )

    inputs = tokenizer(
        full_prompt,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            generation_config=generation_config,
            # NOTE(review): frames are raw uint8 pixels; confirm the model's
            # remote code normalizes/casts them — otherwise they may need the
            # model's image processor first.
            videos=torch.tensor(frames).to(model.device)
        )

    # ``generate`` returns prompt tokens + completion tokens.  Decode only
    # the newly generated tokens: splitting the decoded text on
    # "<|assistant|>" is fragile because skip_special_tokens=True strips
    # that marker whenever the tokenizer treats it as special, in which
    # case the split returned the whole output (prompt echo included).
    prompt_len = inputs["input_ids"].shape[1]
    response = tokenizer.decode(
        outputs[0][prompt_len:],
        skip_special_tokens=True
    )
    return response.strip()
# ------------------------
# Gradio UI
# ------------------------
def infer(video, prompt):
    """Gradio entry point: run inference, turning CUDA OOM into a friendly message.

    Args:
        video: Path of the uploaded video file (or None).
        prompt: User's question about the video.

    Returns:
        The model response, or a warning string on CUDA out-of-memory.
    """
    try:
        return videollama3_infer(video, prompt)
    except RuntimeError as e:
        if "out of memory" in str(e).lower():
            # Release cached allocator blocks so the *next* request has a
            # chance to fit; without this every later request OOMs too.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            return "⚠️ CUDA out of memory. Try a shorter video."
        raise  # anything else is a real bug — keep the original traceback
# Build the web UI: one video upload plus a free-text prompt in,
# a single text box out.
video_input = gr.Video(label="Upload video")
prompt_input = gr.Textbox(
    label="Prompt",
    placeholder="Describe what happens in the video",
)

demo = gr.Interface(
    fn=infer,
    inputs=[video_input, prompt_input],
    outputs=gr.Textbox(label="Model output"),
    title="🎥 VideoLLaMA-3 Demo",
    description="Ask questions about short videos using VideoLLaMA-3",
)

demo.launch()