| """ |
| Example usage of VINE HuggingFace interface |
| |
| This script demonstrates how to use the VINE model through the HuggingFace interface |
| for video understanding with categorical, unary, and binary keyword predictions. |
| """ |
|
|
| import os |
| import sys |
| from pathlib import Path |
| import torch |
| from transformers import pipeline, AutoModel |
| from transformers.pipelines import PIPELINE_REGISTRY |
|
|
| |
# Put the sibling ``src`` directory at the front of ``sys.path`` (only when it
# exists and is not already listed) before the ``vine_hf`` import that follows.
current_dir = Path(__file__).resolve().parent
src_dir = current_dir.parent.joinpath("src")
if src_dir.is_dir():
    _src_entry = str(src_dir)
    if _src_entry not in sys.path:
        sys.path.insert(0, _src_entry)
|
|
| |
| |
| from vine_hf import VineConfig, VineModel, VinePipeline |
|
|
def example_direct_model_usage():
    """Call ``VineModel.predict`` directly on synthetic inputs.

    Builds a ``VineConfig``, instantiates ``VineModel`` from it, fabricates a
    three-frame random video with two objects (ids 1 and 2) per frame, runs
    ``predict`` and prints the size of each prediction group together with
    the reported confidence scores.
    """
    print("=== Direct Model Usage ===")

    # Configuration handed to the model constructor.
    config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        segmentation_method="grounding_dino_sam2",
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",
        debug_visualizations=True,
        debug_visualization_path=os.path.join(os.getcwd(), "debug_masks.png"),
        target_fps=30,
        box_threshold=0.35,
        text_threshold=0.25,
    )
    model = VineModel(config)

    print(f"Model initialized with CLIP backbone: {config.model_name}")
    print(f"Segmentation method: {config.segmentation_method}")
    print(f"Device: {model.device}")

    # Random uint8 frames laid out as (frames, height, width, 3).
    num_frames, height, width = 3, 224, 224
    video_frames = (
        (torch.randn(num_frames, height, width, 3) * 255).clamp(0, 255).byte()
    )

    object_ids = (1, 2)

    # One all-ones (height, width, 1) mask per object per frame.
    masks = {
        frame_idx: {
            obj_id: torch.ones(height, width, 1) for obj_id in object_ids
        }
        for frame_idx in range(num_frames)
    }

    def shifted_box(low, high, frame_idx):
        """Return ``[low, low, high, high]`` moved by 2 pixels per frame."""
        offset = 2 * frame_idx
        return [low + offset, low + offset, high + offset, high + offset]

    # Object 1 starts at [50, 50, 150, 150], object 2 at [100, 100, 200, 200].
    bboxes = {
        frame_idx: {
            1: shifted_box(50, 150, frame_idx),
            2: shifted_box(100, 200, frame_idx),
        }
        for frame_idx in range(num_frames)
    }

    # Keyword vocabularies and the object pair to score relations for.
    categorical_keywords = ["human", "dog", "frisbee"]
    unary_keywords = ["running", "jumping", "sitting", "standing"]
    binary_keywords = ["behind", "in front of", "next to", "throwing to", "catching from"]
    object_pairs = [(1, 2)]

    print("\nRunning prediction...")
    results = model.predict(
        video_frames=video_frames,
        masks=masks,
        bboxes=bboxes,
        categorical_keywords=categorical_keywords,
        unary_keywords=unary_keywords,
        binary_keywords=binary_keywords,
        object_pairs=object_pairs,
        return_top_k=3,
    )

    print("\nResults:")
    print(f"Categorical predictions: {len(results['categorical_predictions'])} objects")
    print(f"Unary predictions: {len(results['unary_predictions'])} actions")
    print(f"Binary predictions: {len(results['binary_predictions'])} relations")
    print(f"Confidence scores: {results['confidence_scores']}")
|
|
|
|
def example_pipeline_usage():
    """Register the VINE pipeline task and build a ``VinePipeline`` instance.

    Registers ``"vine-video-understanding"`` with ``PIPELINE_REGISTRY``,
    constructs a ``VinePipeline`` around a freshly configured ``VineModel``
    and then prints (without executing) a sample pipeline invocation.

    The pipeline is placed on GPU 0 when CUDA is available and on the CPU
    otherwise, so the example no longer fails on CPU-only machines because of
    a hard-coded ``device=0``.
    """
    print("\n=== Pipeline Usage ===")

    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    vine_config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",
        segmentation_method="grounding_dino_sam2",
        debug_visualizations=True,
    )

    # -1 is the CPU value in the transformers ``Pipeline`` device convention.
    # NOTE(review): assumes VinePipeline forwards ``device`` to that base
    # class unchanged - confirm against the VinePipeline constructor.
    device = 0 if torch.cuda.is_available() else -1

    # Named ``vine_pipeline`` to match the snippet printed below and the
    # variable used in ``example_with_real_video``.
    vine_pipeline = VinePipeline(
        model=VineModel(vine_config),
        tokenizer=None,
        trust_remote_code=True,
        # Placeholder locations: replace with the real SAM2 / GroundingDINO
        # config and checkpoint paths before running segmentation.
        sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
        sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
        gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        device=device,
    )

    print("Pipeline created successfully!")

    video_path = "path/to/your/video.mp4"

    # The call is only printed, not executed: the video path is a placeholder.
    print("\nExample pipeline call (replace with actual video path):")
    print("results = vine_pipeline(")
    print(f" '{video_path}',")
    print(" categorical_keywords=['human', 'dog', 'frisbee'],")
    print(" unary_keywords=['running', 'jumping', 'sitting'],")
    print(" binary_keywords=['behind', 'in front of', 'next to'],")
    print(" object_pairs=[(1, 2)],")
    print(" segmentation_method='grounding_dino_sam2',")
    print(" return_top_k=3,")
    print(" return_flattened_segments=True,")
    print(" return_valid_pairs=True,")
    print(" include_visualizations=True,")
    print(" debug_visualizations=True")
    print(")")
| |
| |
|
|
|
|
def example_huggingface_hub_usage():
    """Register VINE for the auto classes and print Hub push/load steps.

    A default ``VineConfig`` and a ``VineModel`` built from it are registered
    via ``register_for_auto_class``; nothing is pushed or downloaded here,
    the Hub commands are only printed.
    """
    print("\n=== HuggingFace Hub Usage ===")

    hub_config = VineConfig()
    hub_model = VineModel(hub_config)

    hub_config.register_for_auto_class()
    hub_model.register_for_auto_class("AutoModel")

    print("Model registered for auto classes")

    # Instructions are emitted one per line, in this order.
    instructions = (
        "To push to Hub:",
        "1. config.push_to_hub('your-username/vine-model')",
        "2. model.push_to_hub('your-username/vine-model')",
        "\nTo load from Hub:",
        "model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)",
        "pipe = pipeline('vine-video-understanding', model='your-username/vine-model', trust_remote_code=True)",
    )
    for line in instructions:
        print(line)
|
|
|
|
def example_with_real_video():
    """Run the VINE pipeline on the bundled demo video when it is present.

    Looks for ``../demo/videos/v1.mp4`` relative to this file. If the file is
    missing, a hint is printed and the function returns. Otherwise the
    pipeline task is registered, a ``VinePipeline`` is built and invoked on
    the video; any exception raised by that invocation is caught and printed
    rather than propagated.
    """
    print("\n=== Real Video Usage Example ===")

    demo_video_path = os.path.join(os.path.dirname(__file__), "../demo/videos/v1.mp4")

    # Guard clause: nothing to run without the demo video.
    if not os.path.exists(demo_video_path):
        print(f"Demo video not found at: {demo_video_path}")
        print("To use with a real video, provide the path to your video file.")
        return

    print(f"Found demo video: {demo_video_path}")

    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    vine_config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",
        segmentation_method="grounding_dino_sam2",
        debug_visualizations=True,
        debug_visualization_path=os.path.join(os.getcwd(), "real_video_debug_masks.png"),
    )

    vine_pipeline = VinePipeline(
        model=VineModel(vine_config),
        tokenizer=None,
        trust_remote_code=True,
        # Placeholder locations: replace with the real SAM2 / GroundingDINO
        # config and checkpoint paths.
        sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
        sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
        gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
    )

    # Collected once so the printed summary and the pipeline call share them.
    query = {
        "categorical_keywords": ['human', 'dog', 'frisbee'],
        "unary_keywords": ['running', 'jumping', 'catching', 'throwing'],
        "binary_keywords": ['behind', 'in front of', 'next to', 'chasing'],
        "object_pairs": [(0, 1), (0, 2), (1, 2)],
    }

    print("\nProcessing video with VINE...")
    print("Keywords:")
    print(f" Categorical: {query['categorical_keywords']}")
    print(f" Unary: {query['unary_keywords']}")
    print(f" Binary: {query['binary_keywords']}")
    print(f" Object pairs: {query['object_pairs']}")

    try:
        results = vine_pipeline(
            demo_video_path,
            segmentation_method='grounding_dino_sam2',
            return_top_k=3,
            include_visualizations=False,
            debug_visualizations=True,
            **query,
        )

        print("\nResults:")
        print(f"Summary: {results['summary']}")

    except Exception as e:
        # Best-effort demo: report the failure instead of propagating it.
        print(f"Note: Full execution requires segmentation models to be properly set up.")
        print(f"Error: {e}")
|
|
|
|
if __name__ == "__main__":
    print("VINE HuggingFace Interface Examples")
    print("=" * 50)

    # Each example runs in isolation: a failure is reported with its label
    # and the remaining examples still execute.
    examples = (
        (example_direct_model_usage, "Direct model usage failed"),
        (example_pipeline_usage, "Pipeline usage failed"),
        (example_huggingface_hub_usage, "Hub usage example failed"),
        (example_with_real_video, "Real video example failed"),
    )
    for run_example, failure_label in examples:
        try:
            run_example()
        except Exception as e:
            print(f"{failure_label}: {e}")

    print("\n" + "=" * 50)
    print("Examples completed!")
    print("\nNext steps:")
    for step in (
        "1. Set up Grounding DINO and SAM2 models for segmentation",
        "2. Load your pretrained VINE model weights",
        "3. Test with your own videos",
        "4. Push to HuggingFace Hub for sharing",
    ):
        print(step)
|
|