Upload 2 files

- handler.py (+210 lines)
- requirements.txt (+17 lines)
handler.py
ADDED
"""
V-JEPA 2 Custom Inference Handler for Hugging Face Inference Endpoints
Model: facebook/vjepa2-vitl-fpc64-256 (ViT-L variant - a good balance of performance and resource use)

For ProofPath video assessment - extracts motion features from skill demonstration videos.
"""

from typing import Any, Dict, Optional, Tuple

import base64
import os
import tempfile

import numpy as np
import torch


class EndpointHandler:
    def __init__(self, path: str = ""):
        """
        Initialize the V-JEPA 2 model for video feature extraction.

        Args:
            path: Path to the model directory (provided by HF Inference Endpoints).
                  Falls back to the Hub model id when empty.
        """
        from transformers import AutoModel, AutoVideoProcessor

        # Use the model path provided by the endpoint, or default to the HF Hub
        model_id = path if path else "facebook/vjepa2-vitl-fpc64-256"

        # Determine device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load processor and model
        self.processor = AutoVideoProcessor.from_pretrained(model_id)
        self.model = AutoModel.from_pretrained(
            model_id,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else None,
            attn_implementation="sdpa",  # scaled dot-product attention for efficiency
        )

        if not torch.cuda.is_available():
            self.model = self.model.to(self.device)

        self.model.eval()

        # Default config: V-JEPA 2 is pretrained on 64-frame clips
        self.default_num_frames = 64

    def _decode_video(self, video_data: Any) -> Tuple[Any, Optional[str]]:
        """
        Decode video from various input formats.

        Supports:
        - URL to a video file
        - Data URL (data:video/mp4;base64,...)
        - Base64-encoded video bytes
        - Raw bytes

        Returns:
            (decoder, temp_path): a torchcodec VideoDecoder plus the path of the
            temporary file backing it (None for URLs). The caller deletes the
            temp file once frames have been read; unlinking it immediately after
            construction risks pulling the file out from under the lazy decoder.
        """
        from torchcodec.decoders import VideoDecoder

        def _decoder_from_bytes(video_bytes: bytes) -> Tuple[Any, str]:
            # Write to a temp file for torchcodec; cleanup is deferred to the caller
            with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
                f.write(video_bytes)
                temp_path = f.name
            return VideoDecoder(temp_path), temp_path

        if isinstance(video_data, str):
            if video_data.startswith(("http://", "https://")):
                # URL - handed straight to torchcodec's FFmpeg-backed decoder
                return VideoDecoder(video_data), None
            if video_data.startswith("data:"):
                # Data URL: strip the "data:<mime>;base64," header
                _, encoded = video_data.split(",", 1)
                return _decoder_from_bytes(base64.b64decode(encoded))
            # Otherwise assume a bare base64-encoded string
            return _decoder_from_bytes(base64.b64decode(video_data))
        if isinstance(video_data, bytes):
            return _decoder_from_bytes(video_data)
        raise ValueError(f"Unsupported video input type: {type(video_data)}")

    def _sample_frames(
        self,
        video_decoder,
        num_frames: int = 64,
        sampling_strategy: str = "uniform",
    ) -> torch.Tensor:
        """
        Sample frames from a video decoder.

        Args:
            video_decoder: torchcodec VideoDecoder instance
            num_frames: number of frames to sample
            sampling_strategy: "uniform" or "random"; anything else falls back
                to sequential frames from the start

        Returns:
            Frames as a T x C x H x W tensor.
        """
        # Frame count from metadata, with a fallback when it is unavailable
        metadata = video_decoder.metadata
        total_frames = getattr(metadata, "num_frames", None) or 1000

        if sampling_strategy == "uniform":
            # Spread sampled frames evenly across the whole video
            if total_frames <= num_frames:
                frame_idx = np.arange(total_frames)
            else:
                frame_idx = np.linspace(0, total_frames - 1, num_frames, dtype=int)
        elif sampling_strategy == "random":
            frame_idx = np.sort(
                np.random.choice(total_frames, min(num_frames, total_frames), replace=False)
            )
        else:
            # Fallback: sequential frames from the start
            frame_idx = np.arange(min(num_frames, total_frames))

        # get_frames_at returns a FrameBatch; .data is T x C x H x W
        return video_decoder.get_frames_at(indices=frame_idx.tolist()).data

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a video and extract V-JEPA 2 features.

        Expected input format:
        {
            "inputs": <base64_video_string or video_url>,
            "parameters": {
                "num_frames": 64,                # optional: number of frames to sample
                "sampling_strategy": "uniform",  # optional: "uniform" or "random"
                "return_predictor": true,        # optional: also return predictor features
                "pooling": "mean"                # optional: "mean", "cls", or "none"
            }
        }

        Returns:
        {
            "encoder_features": [...],    # encoder output features
            "predictor_features": [...],  # predictor features (if requested)
            "feature_shape": [...],       # shape of the returned encoder features
        }
        """
        # Extract inputs
        inputs = data.get("inputs")
        if inputs is None:
            inputs = data.get("video")
        if inputs is None:
            raise ValueError("No video input provided. Use 'inputs' or 'video' key.")

        # Extract parameters
        params = data.get("parameters", {})
        num_frames = params.get("num_frames", self.default_num_frames)
        sampling_strategy = params.get("sampling_strategy", "uniform")
        return_predictor = params.get("return_predictor", False)
        pooling = params.get("pooling", "mean")

        temp_path = None
        try:
            # Decode and sample the video
            video_decoder, temp_path = self._decode_video(inputs)
            frames = self._sample_frames(video_decoder, num_frames, sampling_strategy)

            # Preprocess through the V-JEPA 2 processor
            processed = self.processor(frames, return_tensors="pt")
            processed = {k: v.to(self.model.device) for k, v in processed.items()}

            # Run inference
            with torch.no_grad():
                outputs = self.model(**processed)

            # Encoder features: [batch, seq, hidden]
            encoder_features = outputs.last_hidden_state

            # Apply pooling
            if pooling == "mean":
                encoder_pooled = encoder_features.mean(dim=1)  # [batch, hidden]
            elif pooling == "cls":
                # First token; note V-JEPA 2 has no dedicated CLS token
                encoder_pooled = encoder_features[:, 0, :]  # [batch, hidden]
            else:
                encoder_pooled = encoder_features  # [batch, seq, hidden]

            result = {
                "encoder_features": encoder_pooled.cpu().numpy().tolist(),
                "feature_shape": list(encoder_pooled.shape),
            }

            # Optionally include predictor features
            if return_predictor and hasattr(outputs, "predictor_output"):
                predictor_features = outputs.predictor_output.last_hidden_state
                if pooling == "mean":
                    predictor_pooled = predictor_features.mean(dim=1)
                elif pooling == "cls":
                    predictor_pooled = predictor_features[:, 0, :]
                else:
                    predictor_pooled = predictor_features
                result["predictor_features"] = predictor_pooled.cpu().numpy().tolist()
                result["predictor_shape"] = list(predictor_pooled.shape)

            return result

        except Exception as e:
            return {"error": str(e), "error_type": type(e).__name__}
        finally:
            # Remove the temp file only after decoding and sampling are done
            if temp_path is not None and os.path.exists(temp_path):
                os.unlink(temp_path)
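Once deployed, the handler accepts plain JSON over HTTPS. Below is a minimal client sketch for exercising it; the endpoint URL, token, and demo_clip.mp4 are placeholders for your own deployment, not values from this repo:

import base64
import requests

ENDPOINT_URL = "https://YOUR-ENDPOINT.endpoints.huggingface.cloud"  # placeholder
HF_TOKEN = "hf_..."  # placeholder

# Base64-encode a local clip; a public video URL in "inputs" also works
with open("demo_clip.mp4", "rb") as f:
    video_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "inputs": video_b64,
    "parameters": {
        "num_frames": 64,
        "sampling_strategy": "uniform",
        "pooling": "mean",
    },
}

resp = requests.post(
    ENDPOINT_URL,
    headers={
        "Authorization": f"Bearer {HF_TOKEN}",
        "Content-Type": "application/json",
    },
    json=payload,
    timeout=300,
)
resp.raise_for_status()

body = resp.json()
if "error" in body:
    raise RuntimeError(f"{body['error_type']}: {body['error']}")
print(body["feature_shape"])  # e.g. [1, 1024] for mean-pooled ViT-L features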
requirements.txt
ADDED
# V-JEPA 2 Inference Endpoint Requirements
# Note: transformers and torch are pre-installed in HF Inference containers

# V-JEPA 2 is a recent architecture; if the pinned release predates it,
# install from main: pip install git+https://github.com/huggingface/transformers
transformers>=4.45.0
torch>=2.0.0

# Video decoding
torchcodec>=0.1.0

# Standard deps (usually pre-installed)
numpy>=1.24.0
einops>=0.7.0
timm>=0.9.0

# Needed for device_map="auto" placement in the handler
accelerate>=0.25.0
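For a quick check before deploying, the handler can also be driven locally. A sketch, assuming the requirements above are installed and a sample.mp4 test clip (a placeholder name) sits alongside handler.py:

import base64

from handler import EndpointHandler  # the file added in this commit

# Instantiating with no path pulls facebook/vjepa2-vitl-fpc64-256 from the Hub
handler = EndpointHandler()

with open("sample.mp4", "rb") as f:  # placeholder test clip
    video_b64 = base64.b64encode(f.read()).decode("utf-8")

result = handler({"inputs": video_b64, "parameters": {"pooling": "mean"}})
if "error" in result:
    raise RuntimeError(f"{result['error_type']}: {result['error']}")
print(result["feature_shape"])  # expect [1, hidden_dim] with mean pooling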