Spaces:

vibrantturtle
/

video_anaylsis

Sleeping

App Files Files Community

vibrantturtle committed on Feb 17

Commit

4e5f09a

verified ·

1 Parent(s): 1a4a55b

Create app.py

Browse files

Files changed (1) hide show

app.py +105 -0

app.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import gradio as gr
+import spaces
+import torch
+import numpy as np
+import torchvision.transforms as T
+from decord import VideoReader, cpu
+from PIL import Image
+from torchvision.transforms.functional import InterpolationMode
+from transformers import AutoModel, AutoTokenizer
+MODEL_ID = "OpenGVLab/InternVideo2_5_Chat_8B"
+# Load once at startup (Space will cache weights after first run)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True).half().cuda().to(torch.bfloat16)
+model.eval()
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD  = (0.229, 0.224, 0.225)
+def build_transform(input_size=448):
+    return T.Compose([
+        T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
+        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+        T.ToTensor(),
+        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
+    ])
+def sample_frames(video_path, num_segments=16, input_size=448):
+    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+    max_frame = len(vr) - 1
+    if max_frame <= 0:
+        idxs = [0]
+    else:
+        idxs = np.linspace(0, max_frame, num_segments).astype(int).tolist()
+    transform = build_transform(input_size)
+    pixel_values_list = []
+    num_patches_list = []
+    # Simple: one tile per frame (keeps memory lower)
+    for i in idxs:
+        img = Image.fromarray(vr[i].asnumpy()).convert("RGB")
+        pv = transform(img).unsqueeze(0)  # [1,3,H,W]
+        pixel_values_list.append(pv)
+        num_patches_list.append(1)
+    pixel_values = torch.cat(pixel_values_list, dim=0)  # [T,3,H,W]
+    return pixel_values, num_patches_list
+@spaces.GPU
+@torch.no_grad()
+def analyze(video, prompt, num_segments, max_new_tokens):
+    if video is None:
+        return "Upload a video first."
+    # gr.Video returns a dict-like object in some gradio versions;
+    # safest: handle both string path and dict
+    if isinstance(video, dict) and "path" in video:
+        video_path = video["path"]
+    else:
+        video_path = video
+    pixel_values, num_patches_list = sample_frames(
+        video_path,
+        num_segments=int(num_segments),
+        input_size=448
+    )
+    pixel_values = pixel_values.to(torch.bfloat16).to(model.device)
+    video_prefix = "".join([f"Frame{i+1}: <image>\n" for i in range(len(num_patches_list))])
+    question = video_prefix + (prompt or "Describe this video in detail.")
+    generation_config = dict(
+        do_sample=False,
+        temperature=0.0,
+        max_new_tokens=int(max_new_tokens),
+        top_p=0.1,
+        num_beams=1
+    )
+    out, _ = model.chat(
+        tokenizer,
+        pixel_values,
+        question,
+        generation_config,
+        num_patches_list=num_patches_list,
+        history=None,
+        return_history=True,
+    )
+    return out
+demo = gr.Interface(
+    fn=analyze,
+    inputs=[
+        gr.Video(label="Upload video"),
+        gr.Textbox(label="Prompt", value="Describe what is happening. If someone is using a phone while driving, say so."),
+        gr.Slider(8, 64, value=16, step=8, label="Frames sampled (lower=faster/safer)"),
+        gr.Slider(64, 512, value=256, step=64, label="Max new tokens (lower=faster)"),
+    ],
+    outputs=gr.Textbox(label="Model output"),
+    title="InternVideo2.5 Chat 8B — Video Analysis Demo",
+)
+demo.launch()