LASER

Sleeping

LASER / vine_hf /example_visualization.py

moqingyan123

updates

f71f431 6 months ago

5.5 kB

	# Example visualization runner for VINE
	# - Loads a video (path, demo, or random)
	# - Runs the VINE pipeline
	# - Saves annotated frames and an MP4 if available

	import os
	import sys
	from pathlib import Path
	import argparse
	import cv2
	import numpy as np
	from collections.abc import Mapping, Sequence

	from transformers.pipelines import PIPELINE_REGISTRY
	from transformers import pipeline

	# Provide a placeholder OpenAI API key only when the environment does not
	# already define one, so a real key exported by the user is never overwritten.
	os.environ.setdefault("OPENAI_API_KEY", "dummy-key")

	# Make the src/ directory that sits next to this script's folder importable
	# (LASER, video-sam2, GroundingDINO). The entry is prepended only when the
	# directory exists and is not already on sys.path.
	current_dir = Path(__file__).resolve().parent
	src_dir = current_dir.parent.joinpath("src")
	if src_dir.is_dir():
	    if str(src_dir) not in sys.path:
	        sys.path.insert(0, str(src_dir))

	from vine_hf.vine_pipeline import VinePipeline # https://github.com link not needed; local path used
	from vine_hf.vine_model import VineModel
	from vine_hf.vine_config import VineConfig
	from laser.loading import load_video


	def build_pipeline(args: argparse.Namespace) -> VinePipeline:
	    """Build a VINE pipeline instance from parsed command-line arguments.

	    Registers ``VinePipeline`` under the ``"vine-video-understanding"`` task,
	    creates a ``VineConfig`` and ``VineModel`` from ``args``, and wraps the
	    model in a ``VinePipeline`` pointed at the SAM2 / GroundingDINO files.

	    Args:
	        args: Parsed CLI namespace. Reads ``box_threshold``, ``text_threshold``,
	            ``fps``, ``topk_cate``, ``out_dir`` and ``device``; ``method`` is
	            read when present and defaults to ``"grounding_dino_sam2"``.

	    Returns:
	        The constructed ``VinePipeline``.
	    """
	    # Register pipeline type
	    PIPELINE_REGISTRY.register_pipeline(
	        "vine-video-understanding",
	        pipeline_class=VinePipeline,
	        pt_model=VineModel,
	        type="multimodal",
	    )

	    # Honour --method so the config agrees with the segmentation_method that
	    # main() forwards on the pipeline call; fall back to the previously
	    # hard-coded value for callers whose namespace has no `method`.
	    segmentation_method = getattr(args, "method", "grounding_dino_sam2")

	    config = VineConfig(
	        segmentation_method=segmentation_method,
	        model_name="openai/clip-vit-base-patch32",
	        # Example: load from HF repo
	        use_hf_repo=True,
	        model_repo="video-fm/vine_v0",
	        # Alternatively use a local path by setting use_hf_repo=False and
	        # local_dir/local_filename
	        box_threshold=args.box_threshold,
	        text_threshold=args.text_threshold,
	        target_fps=args.fps,
	        topk_cate=args.topk_cate,
	        visualization_dir=args.out_dir,
	        visualize=True,
	        debug_visualizations=True,
	        device=args.device,
	    )

	    model = VineModel(config)

	    # Create pipeline instance with segmentation model paths.
	    # NOTE(review): these are absolute paths on the original author's machine;
	    # adjust them to your local checkout before running.
	    vine_pipe = VinePipeline(
	        model=model,
	        tokenizer=None,
	        sam_config_path="//home/kevinx/LASER/video-sam2/sam2/sam2_hiera_t.yaml",
	        sam_checkpoint_path="//home/kevinx/LASER/video-sam2/sam2_hiera_tiny.pt",
	        gd_config_path="//home/kevinx/LASER/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
	        gd_checkpoint_path="//home/kevinx/LASER/GroundingDINO/weights/groundingdino_swint_ogc.pth",
	        device=args.device,
	        trust_remote_code=True,
	    )
	    return vine_pipe


	def resolve_video(args) -> np.ndarray \| str:
	# Priority: user --video -> demo video -> random frames
	if args.video and os.path.exists(args.video):
	return args.video

	demo_video = "//home/kevinx/LASER/LASER/demo/videos/v1.mp4"
	demo_alt = "//home/kevinx/LASER/LASER/demo/videos/v2.mp4"
	if os.path.exists(demo_video):
	return demo_video
	if os.path.exists(demo_alt):
	return demo_alt

	# Fallback to random frames (uint8 HxWx3) shaped as T x H x W x 3
	print("No video found; using random frames.")
	rng = np.random.default_rng(0)
	frames = rng.integers(0, 255, size=(args.rand_frames, args.height, args.width, 3), dtype=np.uint8)
	return frames



	def main(argv=None):
	    """Run the VINE visualization example from the command line.

	    Parses the CLI options, builds the pipeline, resolves the input video,
	    runs the pipeline with a fixed set of demo keywords and prints the
	    ``"summary"`` entry of the result.

	    Args:
	        argv: Optional list of argument strings. ``None`` (the default) makes
	            argparse read ``sys.argv[1:]``.
	    """
	    parser = argparse.ArgumentParser(description="VINE visualization example")
	    parser.add_argument("--video", type=str, default=None, help="Path to a video file")
	    parser.add_argument("--out_dir", type=str, default="output", help="Output directory")
	    parser.add_argument(
	        "--method",
	        type=str,
	        default="grounding_dino_sam2",
	        choices=["sam2", "grounding_dino_sam2"],
	        help="Segmentation method",
	    )
	    parser.add_argument("--fps", type=int, default=5, help="Target FPS for processing")
	    parser.add_argument("--box_threshold", type=float, default=0.3, help="GroundingDINO box threshold")
	    parser.add_argument("--text_threshold", type=float, default=0.3, help="GroundingDINO text threshold")
	    parser.add_argument("--topk_cate", type=int, default=5, help="Top-K categories to display")
	    parser.add_argument("--device", type=int, default=0, help="CUDA device index or -1 for CPU")
	    parser.add_argument("--debug_visualizations", action="store_true", help="Enable debug visualizations")
	    # resolve_video() reads these when it falls back to random frames; without
	    # them the fallback fails with AttributeError.
	    parser.add_argument("--rand_frames", type=int, default=8, help="Number of random frames if no video is found")
	    parser.add_argument("--height", type=int, default=224, help="Height of the random fallback frames")
	    parser.add_argument("--width", type=int, default=224, help="Width of the random fallback frames")

	    args = parser.parse_args(argv)

	    vine_pipe = build_pipeline(args)
	    video = resolve_video(args)

	    # Keywords similar to examples/tests
	    categorical_keywords = ["dog", "frisbee", "cat"]
	    unary_keywords = ["running", "jumping", "sitting", "flying"]
	    binary_keywords = ["behind", "next to", "chasing", "biting"]
	    object_pairs = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3)]

	    print("Running VINE pipeline...")
	    call_kwargs = dict(
	        categorical_keywords=categorical_keywords,
	        unary_keywords=unary_keywords,
	        binary_keywords=binary_keywords,
	        object_pairs=object_pairs,
	        segmentation_method=args.method,
	        return_top_k=args.topk_cate,
	        include_visualizations=True,
	        debug_visualizations=args.debug_visualizations,
	    )

	    results = vine_pipe(
	        video,
	        **call_kwargs,
	    )

	    # Normalize pipeline output to a mapping (can be dict or list[dict])
	    if isinstance(results, Mapping):
	        result = results
	    elif isinstance(results, Sequence) and results and isinstance(results[0], Mapping):
	        result = results[0]
	    else:
	        result = {}

	    # Print brief summary. `result` is always a Mapping here, so .get() is
	    # safe even when it is not a plain dict.
	    summary = result.get("summary", {})
	    print("Summary:", summary)


	# Run the example only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()