| |
| |
| |
| |
|
|
| import os |
| import sys |
| from pathlib import Path |
| import argparse |
| import cv2 |
| import numpy as np |
| from collections.abc import Mapping, Sequence |
|
|
| from transformers.pipelines import PIPELINE_REGISTRY |
| from transformers import pipeline |
|
|
| |
# Provide a placeholder OpenAI key only when none is configured, so a real key
# exported by the user is never clobbered.
# NOTE(review): presumably a dependency requires this variable to exist at
# import time -- confirm which one.
os.environ.setdefault("OPENAI_API_KEY", "dummy-key")

# Put the sibling ``src`` directory (next to this script's parent folder) at the
# front of ``sys.path`` so the project imports that follow can be resolved.
current_dir = Path(__file__).resolve().parent
src_dir = current_dir.parent / "src"
if src_dir.is_dir() and str(src_dir) not in sys.path:
    sys.path.insert(0, str(src_dir))
|
|
| from vine_hf.vine_pipeline import VinePipeline |
| from vine_hf.vine_model import VineModel |
| from vine_hf.vine_config import VineConfig |
| from laser.loading import load_video |
|
|
|
|
def build_pipeline(args) -> VinePipeline:
    """Register the VINE pipeline with ``transformers`` and build an instance.

    Args:
        args: Parsed CLI namespace. Reads ``box_threshold``, ``text_threshold``,
            ``fps``, ``topk_cate``, ``out_dir`` and ``device``. ``method`` is
            used as the segmentation method when present. The optional
            attributes ``sam_config_path``, ``sam_checkpoint_path``,
            ``gd_config_path`` and ``gd_checkpoint_path`` override the default
            checkpoint/config locations when set to a non-empty value.

    Returns:
        A ``VinePipeline`` wrapping a newly constructed ``VineModel``.
    """

    def _path_from_args(attr: str, default: str) -> str:
        # Use a non-empty attribute on ``args`` when available, else the default.
        return getattr(args, attr, None) or default

    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    config = VineConfig(
        # Follow the --method choice instead of a hard-coded value; fall back
        # to the previous default when the namespace has no ``method``.
        segmentation_method=getattr(args, "method", "grounding_dino_sam2"),
        model_name="openai/clip-vit-base-patch32",
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",
        box_threshold=args.box_threshold,
        text_threshold=args.text_threshold,
        target_fps=args.fps,
        topk_cate=args.topk_cate,
        visualization_dir=args.out_dir,
        visualize=True,
        # NOTE(review): fixed to True here regardless of --debug_visualizations;
        # confirm this is intended.
        debug_visualizations=True,
        device=args.device,
    )

    model = VineModel(config)

    # The defaults below are machine-specific absolute paths; callers on other
    # machines can supply the matching attributes on ``args`` instead.
    vine_pipe = VinePipeline(
        model=model,
        tokenizer=None,
        sam_config_path=_path_from_args(
            "sam_config_path",
            "//home/kevinx/LASER/video-sam2/sam2/sam2_hiera_t.yaml",
        ),
        sam_checkpoint_path=_path_from_args(
            "sam_checkpoint_path",
            "//home/kevinx/LASER/video-sam2/sam2_hiera_tiny.pt",
        ),
        gd_config_path=_path_from_args(
            "gd_config_path",
            "//home/kevinx/LASER/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
        ),
        gd_checkpoint_path=_path_from_args(
            "gd_checkpoint_path",
            "//home/kevinx/LASER/GroundingDINO/weights/groundingdino_swint_ogc.pth",
        ),
        device=args.device,
        trust_remote_code=True,
    )
    return vine_pipe
|
|
|
|
| def resolve_video(args) -> np.ndarray | str: |
| |
| if args.video and os.path.exists(args.video): |
| return args.video |
|
|
| demo_video = "//home/kevinx/LASER/LASER/demo/videos/v1.mp4" |
| demo_alt = "//home/kevinx/LASER/LASER/demo/videos/v2.mp4" |
| if os.path.exists(demo_video): |
| return demo_video |
| if os.path.exists(demo_alt): |
| return demo_alt |
|
|
| |
| print("No video found; using random frames.") |
| rng = np.random.default_rng(0) |
| frames = rng.integers(0, 255, size=(args.rand_frames, args.height, args.width, 3), dtype=np.uint8) |
| return frames |
|
|
|
|
|
|
def main():
    """Run the VINE visualization demo from the command line.

    Parses the CLI options, builds the pipeline, resolves the video input,
    runs the pipeline with a fixed set of demo keywords and prints the
    ``summary`` entry of the result (an empty dict when there is none).
    """
    parser = argparse.ArgumentParser(description="VINE visualization example")
    parser.add_argument("--video", type=str, default=None, help="Path to a video file")
    parser.add_argument("--out_dir", type=str, default="output", help="Output directory")
    parser.add_argument(
        "--method",
        type=str,
        default="grounding_dino_sam2",
        choices=["sam2", "grounding_dino_sam2"],
        help="Segmentation method",
    )
    parser.add_argument("--fps", type=int, default=5, help="Target FPS for processing")
    parser.add_argument("--box_threshold", type=float, default=0.3, help="GroundingDINO box threshold")
    parser.add_argument("--text_threshold", type=float, default=0.3, help="GroundingDINO text threshold")
    parser.add_argument("--topk_cate", type=int, default=5, help="Top-K categories to display")
    parser.add_argument("--device", type=int, default=0, help="CUDA device index or -1 for CPU")
    parser.add_argument("--debug_visualizations", action="store_true", help="Enable debug visualizations")
    # resolve_video reads these when it has to synthesize random frames;
    # without them that fallback fails with AttributeError.
    parser.add_argument("--rand_frames", type=int, default=16, help="Number of random frames when no video is found")
    parser.add_argument("--height", type=int, default=240, help="Height of the random frames")
    parser.add_argument("--width", type=int, default=320, help="Width of the random frames")

    args = parser.parse_args()

    vine_pipe = build_pipeline(args)
    video = resolve_video(args)

    # Fixed demo vocabulary and object-index pairs passed to the pipeline.
    categorical_keywords = ["dog", "frisbee", "cat"]
    unary_keywords = ["running", "jumping", "sitting", "flying"]
    binary_keywords = ["behind", "next to", "chasing", "biting"]
    object_pairs = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3)]

    print("Running VINE pipeline...")
    call_kwargs = dict(
        categorical_keywords=categorical_keywords,
        unary_keywords=unary_keywords,
        binary_keywords=binary_keywords,
        object_pairs=object_pairs,
        segmentation_method=args.method,
        return_top_k=args.topk_cate,
        include_visualizations=True,
        debug_visualizations=args.debug_visualizations,
    )

    results = vine_pipe(
        video,
        **call_kwargs,
    )

    # Accept either a single mapping or a non-empty sequence whose first item
    # is a mapping; anything else is treated as "no result".
    if isinstance(results, Mapping):
        result = results
    elif isinstance(results, Sequence) and results and isinstance(results[0], Mapping):
        result = results[0]
    else:
        result = {}

    # ``result`` is always a Mapping here, so ``get`` is safe even when the
    # pipeline returns a mapping type that is not a ``dict`` subclass.
    summary = result.get("summary", {})
    print("Summary:", summary)
|
|
|
|
if __name__ == "__main__":
    # Run the demo only when this file is executed as a script, not on import.
    main()
|
|