| """ |
| Example demonstrating how to load and use VINE ensemble weights |
| |
| This script shows the correct way to load your pretrained VINE ensemble weights |
| and use them with the HuggingFace interface, based on the actual inference.py workflow. |
| """ |
|
|
| import os |
| import sys |
| from pathlib import Path |
| import torch |
| import numpy as np |
| from transformers.pipelines import PIPELINE_REGISTRY |
|
|
| |
|
|
| |
# Locate this script on disk and, if a sibling ``src`` directory exists one
# level up, put it at the front of ``sys.path`` (only once) so the project
# imports that follow can presumably be resolved from it.
current_dir = Path(__file__).resolve().parent
src_dir = current_dir.parent / "src"
_src_entry = str(src_dir)
if _src_entry not in sys.path and src_dir.is_dir():
    sys.path.insert(0, _src_entry)
|
|
| from vine_hf import VineConfig, VineModel, VinePipeline |
| from laser.loading import load_video |
|
|
|
|
def example_load_ensemble_weights(model_dir=None):
    """Build a ``VineModel`` from a local directory of ensemble weights.

    Args:
        model_dir: Directory expected to hold the ``*.model`` ensemble files.
            When ``None`` (the default) the path
            ``../../data/LLaVA-Video-178K-v2/models/ensemble-02-10`` relative
            to this script is used.

    Returns:
        The constructed ``VineModel``, or ``None`` when the directory does
        not exist or contains no ``.model`` files.
    """
    print("=== Loading Ensemble VINE Weights ===")

    if model_dir is None:
        data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
        model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
    else:
        # Accept both str and pathlib.Path from callers.
        model_dir = os.fspath(model_dir)

    print(f"Looking for ensemble weights in: {model_dir}")

    # isdir rather than exists: a regular file at this path would otherwise
    # reach os.listdir() below and raise NotADirectoryError.
    if not os.path.isdir(model_dir):
        print(f"β Model directory not found: {model_dir}")
        print("Please adjust the path to point to your ensemble weights")
        return None

    print("β Model directory found")

    # Sorted so the listing is stable; os.listdir() order is arbitrary.
    model_files = sorted(f for f in os.listdir(model_dir) if f.endswith('.model'))
    print(f"Available model files: {model_files}")

    if not model_files:
        print("β No .model files found in directory")
        return None

    # NOTE(review): local_filename=None presumably lets the model pick the
    # checkpoint inside local_dir itself — confirm against VineConfig.
    config = VineConfig(
        segmentation_method="grounding_dino_sam2",
        use_hf_repo=False,
        local_dir=model_dir,
        local_filename=None,
    )

    print("Creating VINE model with ensemble weights...")
    vine_model = VineModel(config)

    print("β VINE model created with ensemble weights!")
    return vine_model
|
|
|
|
def example_direct_ensemble_loading(model_dir=None, epoch=0):
    """Load ensemble weights through ``VineModel.from_pretrained_vine``.

    Args:
        model_dir: Directory holding the ensemble checkpoint files. When
            ``None`` (the default) the path
            ``../../data/LLaVA-Video-178K-v2/models/ensemble-02-10`` relative
            to this script is used.
        epoch: Epoch number forwarded to ``from_pretrained_vine``
            (defaults to ``0``, the value previously hard-coded).

    Returns:
        The loaded model, or ``None`` when the directory is missing or the
        loader raises.
    """
    print("\n=== Direct Ensemble Loading ===")

    if model_dir is None:
        data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
        model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
    else:
        # Accept both str and pathlib.Path from callers.
        model_dir = os.fspath(model_dir)

    # A regular file at this path is not a usable model directory either.
    if not os.path.isdir(model_dir):
        print(f"β Model directory not found: {model_dir}")
        return None

    # Broad catch is deliberate: this is a demo that reports the failure and
    # carries on instead of aborting the whole script.
    try:
        vine_model = VineModel.from_pretrained_vine(
            model_path=model_dir,
            epoch=epoch,
        )
    except Exception as e:
        print(f"β Error loading with from_pretrained_vine: {e}")
        return None

    print("β Model loaded using from_pretrained_vine!")
    return vine_model
|
|
|
|
def _load_original_model(model_dir, model_name, epoch, device):
    """Load ``<model_name>.<epoch>.model`` with ``torch.load``; return None if the file is absent."""
    model_path = os.path.join(model_dir, f"{model_name}.{epoch}.model")
    if not os.path.exists(model_path):
        print(f"Model file not found: {model_path}")
        return None
    # SECURITY: weights_only=False unpickles arbitrary Python objects; only
    # use this on checkpoint files you trust.
    return torch.load(model_path, map_location=device, weights_only=False)


def _print_submodel_flags(model):
    """Print whether *model* exposes each of the three CLIP sub-model attributes."""
    for attr in ("clip_cate_model", "clip_unary_model", "clip_binary_model"):
        print(f" Has {attr}: {hasattr(model, attr)}")


def example_compare_original_vs_hf(
    model_dir=None,
    model_name="ensemble-2025-02-10-14-57-22",
    epoch=0,
):
    """Compare the original ``torch.load`` workflow with the HuggingFace-style loader.

    Prints both code snippets, then tries to load the same checkpoint via
    ``torch.load`` and via ``VineModel.from_pretrained_vine`` and reports
    which sub-model attributes each result exposes.

    Args:
        model_dir: Directory holding the ensemble checkpoint files. When
            ``None`` (the default) the path
            ``../../data/LLaVA-Video-178K-v2/models/ensemble-02-10`` relative
            to this script is used.
        model_name: Checkpoint base name; the file looked up is
            ``<model_name>.<epoch>.model``.
        epoch: Epoch number used for both loaders.

    Returns:
        None. All results are reported on stdout.
    """
    print("\n=== Comparing Original vs HuggingFace Interface ===")

    if model_dir is None:
        data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
        model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
    else:
        # Accept both str and pathlib.Path from callers.
        model_dir = os.fspath(model_dir)

    if not os.path.isdir(model_dir):
        print(f"Model directory not found: {model_dir}")
        return

    # NOTE(review): the printed HuggingFace snippet shows
    # VineConfig(pretrained_vine_path=...), while the other examples in this
    # file construct VineConfig with local_dir=... — confirm which is current.
    snippet_lines = (
        "Original approach (from inference.py):",
        "```python",
        "def load_model(model_dir, model_name, epoch, device):",
        " model_name = model_name + f'.{epoch}.model'",
        " predicate_model = torch.load(os.path.join(model_dir, model_name), map_location=device, weights_only=False)",
        " return predicate_model",
        "",
        "predicate_model = load_model(model_dir, model_name, epoch, device)",
        "```",
        "\nNew HuggingFace approach:",
        "```python",
        "config = VineConfig(pretrained_vine_path=model_dir)",
        "vine_model = VineModel(config)",
        "# or",
        "vine_model = VineModel.from_pretrained_vine(model_dir, epoch=0)",
        "```",
    )
    for line in snippet_lines:
        print(line)

    # Best-effort demo: any loader failure is reported, not propagated.
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        original_model = _load_original_model(model_dir, model_name, epoch, device)

        # Explicit None checks: truthiness of a model object may depend on
        # __len__/__bool__ and is not a reliable "was it loaded" signal.
        if original_model is not None:
            print(f"β Original model loaded: {type(original_model)}")
            _print_submodel_flags(original_model)

        vine_model = VineModel.from_pretrained_vine(model_dir, epoch=epoch)

        if vine_model is not None:
            print(f"β HuggingFace model loaded: {type(vine_model)}")
            _print_submodel_flags(vine_model)

        # Only claim success when both loaders actually produced a model.
        if original_model is not None and vine_model is not None:
            print("\nβ Both approaches work! HuggingFace interface successfully loads ensemble weights.")

    except Exception as e:
        print(f"Error in comparison: {e}")
|
|
|
|
def example_ensemble_with_pipeline():
    """Wrap a ``VineModel`` built from local ensemble weights in a ``VinePipeline``.

    Registers ``VinePipeline`` under the ``"vine-video-understanding"`` task,
    builds the model from the default ensemble directory, constructs the
    pipeline and, when the demo video file exists, prints a usage snippet.

    Returns:
        The ``VinePipeline`` instance, or ``None`` when the ensemble
        directory does not exist.
    """
    print("\n=== Using Ensemble Weights with Pipeline ===")

    script_dir = os.path.dirname(__file__)
    data_root = os.path.abspath(os.path.join(script_dir, "../../data"))
    weights_dir = os.path.join(data_root, "LLaVA-Video-178K-v2/models/ensemble-02-10")

    if not os.path.exists(weights_dir):
        print(f"Model directory not found: {weights_dir}")
        return None

    # Make the custom pipeline class known to transformers under its task name.
    registration = {
        "pipeline_class": VinePipeline,
        "pt_model": VineModel,
        "type": "multimodal",
    }
    PIPELINE_REGISTRY.register_pipeline("vine-video-understanding", **registration)

    # Model configured to read weights from the local directory.
    vine_model = VineModel(
        VineConfig(
            segmentation_method="grounding_dino_sam2",
            use_hf_repo=False,
            local_dir=weights_dir,
            local_filename=None,
        )
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # The "path/to/..." values are placeholders and must be replaced with real
    # SAM2 / GroundingDINO config and checkpoint locations.
    vine_pipeline = VinePipeline(
        model=vine_model,
        tokenizer=None,
        sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
        sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
        gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        device=device,
    )

    print("β Pipeline created with ensemble VINE weights")

    demo_video = os.path.join(script_dir, "../demo/videos/v1.mp4")
    if not os.path.exists(demo_video):
        return vine_pipeline

    print(f"Found demo video: {demo_video}")

    categorical_keywords = ['human', 'dog', 'frisbee']
    unary_keywords = ['running', 'jumping', 'catching', 'throwing']
    binary_keywords = ['behind', 'bite', 'front', 'jump over', 'right', 'left']

    # The call is only printed as a snippet, not executed.
    usage_lines = (
        "Example pipeline usage:",
        "```python",
        "results = vine_pipeline(",
        f" '{demo_video}',",
        f" categorical_keywords={categorical_keywords},",
        f" unary_keywords={unary_keywords},",
        f" binary_keywords={binary_keywords},",
        " segmentation_method='grounding_dino_sam2'",
        ")",
        "```",
    )
    for line in usage_lines:
        print(line)

    return vine_pipeline
|
|
|
|
|
|
def demonstrate_weight_transfer():
    """Print a fixed, human-readable outline of the ensemble-to-HuggingFace weight transfer.

    Takes no arguments, returns ``None`` and only writes to stdout.
    """
    # NOTE(review): the "β" glyph in the transfer lines looks like a
    # mis-encoded arrow; the strings are kept exactly as they are.
    outline = (
        "\n=== Weight Transfer Demonstration ===",
        "The ensemble model structure (PredicateModel):",
        "- clip_cate_model: CLIP model for categorical classification",
        "- clip_unary_model: CLIP model for unary predicates",
        "- clip_binary_model: CLIP model for binary relations",
        "- clip_tokenizer: Tokenizer for text processing",
        "- clip_processor: Processor for image processing",
        "\nWeight transfer process:",
        "1. Load ensemble model with torch.load()",
        "2. Initialize base CLIP models in HuggingFace format",
        "3. Transfer state_dict from ensemble to HuggingFace models:",
        " - ensemble.clip_cate_model β hf.clip_cate_model",
        " - ensemble.clip_unary_model β hf.clip_unary_model",
        " - ensemble.clip_binary_model β hf.clip_binary_model",
        "4. Transfer tokenizer and processor",
        "\nThis preserves all your fine-tuned weights while making them HuggingFace compatible!",
    )
    for line in outline:
        print(line)
|
|
|
|
def troubleshooting_guide():
    """Print a fixed troubleshooting checklist for loading ensemble weights.

    Takes no arguments, returns ``None`` and only writes to stdout.
    """
    # Each entry is (heading, detail lines printed right after the heading).
    # NOTE(review): the "β" glyph in the detail lines looks like a mis-encoded
    # arrow; the strings are kept exactly as they are.
    sections = (
        ("\n=== Troubleshooting Guide ===", ()),
        ("Common Issues:", ()),
        (
            "1. 'No model file found for epoch X'",
            (
                " β Check that .model files exist in the directory",
                " β Verify the epoch number is correct",
                " β List files: ls /path/to/model/dir/*.model",
            ),
        ),
        (
            "\n2. 'Error loading VINE weights'",
            (
                " β Check file permissions",
                " β Verify the model file is not corrupted",
                " β Try loading with torch.load() directly first",
            ),
        ),
        (
            "\n3. 'CLIP model mismatch'",
            (" β Ensure config.model_name matches the base model used in training",),
        ),
        (
            "\n4. 'Device mismatch errors'",
            (
                " β Models are loaded to CPU first, then moved to device",
                " β Check CUDA availability with torch.cuda.is_available()",
            ),
        ),
        ("\nDebugging steps:", ()),
        (
            "1. Test loading ensemble model directly:",
            (" model = torch.load('path/to/model.0.model', map_location='cpu')",),
        ),
        ("2. Check model attributes:", (" print(dir(model))",)),
        (
            "3. Verify state_dict keys:",
            (" print(model.clip_cate_model.state_dict().keys())",),
        ),
    )
    for heading, details in sections:
        print(heading)
        for detail in details:
            print(detail)
|
|
|
|
def main():
    """Run every example in order, then print the explanatory sections and a summary.

    Each example that loads weights runs inside its own ``try``/``except`` so
    that one failing example is reported and the remaining ones still execute.
    """
    print("VINE Ensemble Weights Loading Examples")
    print("=" * 50)

    # (label used in the failure message, example callable). Return values are
    # intentionally discarded; nothing in this script consumes them.
    examples = (
        ("Ensemble loading", example_load_ensemble_weights),
        ("Direct loading", example_direct_ensemble_loading),
        ("Comparison", example_compare_original_vs_hf),
        ("Pipeline", example_ensemble_with_pipeline),
    )
    for label, run_example in examples:
        # Top-level boundary of a demo script: report and continue.
        try:
            run_example()
        except Exception as e:
            print(f"{label} example failed: {e}")

    demonstrate_weight_transfer()
    troubleshooting_guide()

    # NOTE(review): point 1 mentions ".pt" while the examples look for
    # ".model" files, and point 4 mentions pretrained_vine_path while the
    # examples construct VineConfig with local_dir — confirm and update the
    # wording if these are stale.
    print("\n" + "=" * 50)
    print("Key Points:")
    print("1. AutoModel.from_pretrained() won't work with .pt ensemble weights")
    print("2. Use torch.load() to load the ensemble, then transfer weights")
    print("3. The HuggingFace interface preserves your fine-tuned weights")
    print("4. Specify pretrained_vine_path in VineConfig to auto-load weights")
    print("5. Use VineModel.from_pretrained_vine() for direct loading")


if __name__ == "__main__":
    main()
|
|
|
|
|
|