peterproofpath commited on
Commit
138f945
·
verified ·
1 Parent(s): d567777

Upload 2 files

Browse files
Files changed (2) hide show
  1. handler.py +511 -0
  2. requirements.txt +24 -0
handler.py ADDED
@@ -0,0 +1,511 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Eagle 2.5 Custom Inference Handler for Hugging Face Inference Endpoints
3
+ Model: nvidia/Eagle2.5-8B
4
+
5
+ For ProofPath video assessment - long video understanding with up to 512 frames.
6
+ Ideal for full rubric-based video grading in a single call.
7
+ """
8
+
9
+ from typing import Dict, List, Any, Optional, Union
10
+ import torch
11
+ import numpy as np
12
+ import base64
13
+ import io
14
+ import tempfile
15
+ import os
16
+ import re
17
+
18
+
19
class EndpointHandler:
    """Custom Hugging Face Inference Endpoint handler for nvidia/Eagle2.5-8B.

    Supports single-image, multi-image, long-video (up to 512 frames), and
    ProofPath rubric-grading requests via ``__call__``.
    """

    def __init__(self, path: str = ""):
        """
        Initialize Eagle 2.5 model for video understanding.

        Args:
            path: Path to the model directory (provided by HF Inference Endpoints).
                  Falls back to the public hub id when empty.
        """
        from transformers import AutoProcessor, AutoModel, AutoTokenizer

        # Use the model path provided by the endpoint, or default to HF hub.
        model_id = path if path else "nvidia/Eagle2.5-8B"

        # Single-GPU or CPU; the model is moved manually below.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.processor = AutoProcessor.from_pretrained(
            model_id,
            trust_remote_code=True,
            use_fast=True
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            trust_remote_code=True,
            use_fast=True
        )
        # Left padding so batched generation appends tokens after the prompt.
        self.processor.tokenizer.padding_side = "left"

        dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
        # flash-attn is a separate wheel that requirements.txt does NOT install,
        # so try it first on GPU but fall back to SDPA instead of crashing the
        # endpoint at startup when the package is missing.
        attn_candidates = (
            ["flash_attention_2", "sdpa"] if torch.cuda.is_available() else ["sdpa"]
        )
        last_err = None
        for attn_impl in attn_candidates:
            try:
                self.model = AutoModel.from_pretrained(
                    model_id,
                    trust_remote_code=True,
                    torch_dtype=dtype,
                    attn_implementation=attn_impl,
                )
                break
            except (ImportError, ValueError) as err:
                # ImportError: flash-attn wheel absent; ValueError: impl rejected
                # by this transformers version/model. Try the next candidate.
                last_err = err
        else:
            raise RuntimeError(f"Could not load model {model_id}") from last_err

        if torch.cuda.is_available():
            self.model = self.model.to(self.device)

        self.model.eval()

        # Default config - Eagle 2.5 supports up to 512 frames.
        self.default_max_frames = 256  # Conservative default
        self.max_frames_limit = 512
64
+ def _load_video_frames(
65
+ self,
66
+ video_data: Any,
67
+ max_frames: int = 256,
68
+ fps: float = 2.0
69
+ ) -> List:
70
+ """
71
+ Load video frames from various input formats.
72
+
73
+ Supports:
74
+ - URL to video file
75
+ - Base64 encoded video
76
+ - Raw bytes
77
+ """
78
+ import cv2
79
+ from PIL import Image
80
+
81
+ # Decode video to temp file if needed
82
+ if isinstance(video_data, str):
83
+ if video_data.startswith(('http://', 'https://')):
84
+ # URL - download to temp file
85
+ import requests
86
+ response = requests.get(video_data, stream=True)
87
+ with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
88
+ for chunk in response.iter_content(chunk_size=8192):
89
+ f.write(chunk)
90
+ video_path = f.name
91
+ elif video_data.startswith('data:'):
92
+ # Data URL format
93
+ header, encoded = video_data.split(',', 1)
94
+ video_bytes = base64.b64decode(encoded)
95
+ with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
96
+ f.write(video_bytes)
97
+ video_path = f.name
98
+ else:
99
+ # Assume base64 encoded
100
+ video_bytes = base64.b64decode(video_data)
101
+ with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
102
+ f.write(video_bytes)
103
+ video_path = f.name
104
+ elif isinstance(video_data, bytes):
105
+ with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
106
+ f.write(video_data)
107
+ video_path = f.name
108
+ else:
109
+ raise ValueError(f"Unsupported video input type: {type(video_data)}")
110
+
111
+ try:
112
+ # Open video with OpenCV
113
+ cap = cv2.VideoCapture(video_path)
114
+ video_fps = cap.get(cv2.CAP_PROP_FPS)
115
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
116
+ duration = total_frames / video_fps if video_fps > 0 else 0
117
+
118
+ # Calculate frame indices to sample
119
+ target_frames = min(max_frames, int(duration * fps), total_frames)
120
+ if target_frames <= 0:
121
+ target_frames = min(max_frames, total_frames)
122
+
123
+ frame_indices = np.linspace(0, total_frames - 1, target_frames, dtype=int)
124
+
125
+ frames = []
126
+ for idx in frame_indices:
127
+ cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
128
+ ret, frame = cap.read()
129
+ if ret:
130
+ # Convert BGR to RGB
131
+ frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
132
+ pil_image = Image.fromarray(frame_rgb)
133
+ frames.append(pil_image)
134
+
135
+ cap.release()
136
+
137
+ return frames, {
138
+ "duration": duration,
139
+ "total_frames": total_frames,
140
+ "sampled_frames": len(frames),
141
+ "video_fps": video_fps
142
+ }
143
+
144
+ finally:
145
+ # Clean up temp file
146
+ if os.path.exists(video_path):
147
+ os.unlink(video_path)
148
+
149
+ def _load_image(self, image_data: Any):
150
+ """Load a single image from various formats."""
151
+ from PIL import Image
152
+ import requests
153
+
154
+ if isinstance(image_data, Image.Image):
155
+ return image_data
156
+ elif isinstance(image_data, str):
157
+ if image_data.startswith(('http://', 'https://')):
158
+ response = requests.get(image_data, stream=True)
159
+ return Image.open(response.raw).convert('RGB')
160
+ elif image_data.startswith('data:'):
161
+ header, encoded = image_data.split(',', 1)
162
+ image_bytes = base64.b64decode(encoded)
163
+ return Image.open(io.BytesIO(image_bytes)).convert('RGB')
164
+ else:
165
+ image_bytes = base64.b64decode(image_data)
166
+ return Image.open(io.BytesIO(image_bytes)).convert('RGB')
167
+ elif isinstance(image_data, bytes):
168
+ return Image.open(io.BytesIO(image_data)).convert('RGB')
169
+ else:
170
+ raise ValueError(f"Unsupported image input type: {type(image_data)}")
171
+
172
+ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
173
+ """
174
+ Process video or images with Eagle 2.5.
175
+
176
+ Expected input formats:
177
+
178
+ 1. Video analysis:
179
+ {
180
+ "inputs": <video_url_or_base64>,
181
+ "parameters": {
182
+ "prompt": "Describe what happens in this video.",
183
+ "max_frames": 256,
184
+ "fps": 2.0,
185
+ "max_new_tokens": 2048
186
+ }
187
+ }
188
+
189
+ 2. Image analysis:
190
+ {
191
+ "inputs": <image_url_or_base64>,
192
+ "parameters": {
193
+ "prompt": "Describe this image.",
194
+ "max_new_tokens": 512
195
+ }
196
+ }
197
+
198
+ 3. Multi-image analysis:
199
+ {
200
+ "inputs": [<image1>, <image2>, ...],
201
+ "parameters": {
202
+ "prompt": "Compare these images.",
203
+ "max_new_tokens": 1024
204
+ }
205
+ }
206
+
207
+ 4. ProofPath rubric grading:
208
+ {
209
+ "inputs": <video_url>,
210
+ "parameters": {
211
+ "mode": "rubric",
212
+ "rubric": [
213
+ {"step": 1, "description": "Click cell B2"},
214
+ {"step": 2, "description": "Type 123"},
215
+ {"step": 3, "description": "Press Enter"}
216
+ ],
217
+ "max_frames": 512,
218
+ "output_format": "json"
219
+ }
220
+ }
221
+
222
+ Returns:
223
+ {
224
+ "generated_text": "...",
225
+ "video_metadata": {...}, # If video input
226
+ }
227
+ """
228
+ inputs = data.get("inputs")
229
+ if inputs is None:
230
+ inputs = data.get("video") or data.get("image") or data.get("images")
231
+ if inputs is None:
232
+ raise ValueError("No input provided. Use 'inputs', 'video', 'image', or 'images' key.")
233
+
234
+ params = data.get("parameters", {})
235
+ mode = params.get("mode", "default")
236
+ prompt = params.get("prompt", "Describe this content in detail.")
237
+ max_new_tokens = params.get("max_new_tokens", 2048)
238
+
239
+ try:
240
+ if mode == "rubric":
241
+ return self._grade_rubric(inputs, params)
242
+ elif isinstance(inputs, list):
243
+ return self._process_multi_image(inputs, prompt, max_new_tokens)
244
+ elif self._is_video(inputs, params):
245
+ return self._process_video(inputs, prompt, params, max_new_tokens)
246
+ else:
247
+ return self._process_image(inputs, prompt, max_new_tokens)
248
+
249
+ except Exception as e:
250
+ return {"error": str(e), "error_type": type(e).__name__}
251
+
252
+ def _is_video(self, inputs: Any, params: Dict) -> bool:
253
+ """Determine if input is video based on params or file extension."""
254
+ if params.get("input_type") == "video":
255
+ return True
256
+ if params.get("input_type") == "image":
257
+ return False
258
+
259
+ if isinstance(inputs, str):
260
+ lower = inputs.lower()
261
+ video_exts = ['.mp4', '.avi', '.mov', '.mkv', '.webm', '.m4v']
262
+ return any(ext in lower for ext in video_exts)
263
+
264
+ return False
265
+
266
+ def _process_video(
267
+ self,
268
+ video_data: Any,
269
+ prompt: str,
270
+ params: Dict,
271
+ max_new_tokens: int
272
+ ) -> Dict[str, Any]:
273
+ """Process a video input."""
274
+ max_frames = min(params.get("max_frames", self.default_max_frames), self.max_frames_limit)
275
+ fps = params.get("fps", 2.0)
276
+
277
+ # Load video frames
278
+ frames, video_metadata = self._load_video_frames(video_data, max_frames, fps)
279
+
280
+ # Build message for Eagle 2.5
281
+ messages = [
282
+ {
283
+ "role": "user",
284
+ "content": [
285
+ {"type": "text", "text": prompt},
286
+ {"type": "video", "video": frames},
287
+ ],
288
+ }
289
+ ]
290
+
291
+ # Process with Eagle 2.5 processor
292
+ text_list = [self.processor.apply_chat_template(
293
+ messages,
294
+ tokenize=False,
295
+ add_generation_prompt=True
296
+ )]
297
+
298
+ image_inputs, video_inputs = self.processor.process_vision_info(messages)
299
+
300
+ inputs = self.processor(
301
+ text=text_list,
302
+ images=image_inputs,
303
+ videos=video_inputs,
304
+ return_tensors="pt",
305
+ )
306
+ inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
307
+
308
+ # Generate
309
+ with torch.inference_mode():
310
+ generated_ids = self.model.generate(
311
+ **inputs,
312
+ max_new_tokens=max_new_tokens,
313
+ do_sample=False,
314
+ )
315
+
316
+ # Decode
317
+ generated_tokens = generated_ids[0, inputs['input_ids'].size(1):]
318
+ generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
319
+
320
+ return {
321
+ "generated_text": generated_text,
322
+ "video_metadata": video_metadata
323
+ }
324
+
325
+ def _process_image(self, image_data: Any, prompt: str, max_new_tokens: int) -> Dict[str, Any]:
326
+ """Process a single image."""
327
+ image = self._load_image(image_data)
328
+
329
+ messages = [
330
+ {
331
+ "role": "user",
332
+ "content": [
333
+ {"type": "text", "text": prompt},
334
+ {"type": "image", "image": image},
335
+ ],
336
+ }
337
+ ]
338
+
339
+ text_list = [self.processor.apply_chat_template(
340
+ messages,
341
+ tokenize=False,
342
+ add_generation_prompt=True
343
+ )]
344
+
345
+ image_inputs, video_inputs = self.processor.process_vision_info(messages)
346
+
347
+ inputs = self.processor(
348
+ text=text_list,
349
+ images=image_inputs,
350
+ videos=video_inputs,
351
+ return_tensors="pt",
352
+ )
353
+ inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
354
+
355
+ with torch.inference_mode():
356
+ generated_ids = self.model.generate(
357
+ **inputs,
358
+ max_new_tokens=max_new_tokens,
359
+ do_sample=False,
360
+ )
361
+
362
+ generated_tokens = generated_ids[0, inputs['input_ids'].size(1):]
363
+ generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
364
+
365
+ return {
366
+ "generated_text": generated_text,
367
+ "image_size": {"width": image.width, "height": image.height}
368
+ }
369
+
370
+ def _process_multi_image(self, images_data: List, prompt: str, max_new_tokens: int) -> Dict[str, Any]:
371
+ """Process multiple images."""
372
+ images = [self._load_image(img) for img in images_data]
373
+
374
+ # Build content with all images
375
+ content = [{"type": "text", "text": prompt}]
376
+ for image in images:
377
+ content.append({"type": "image", "image": image})
378
+
379
+ messages = [{"role": "user", "content": content}]
380
+
381
+ text_list = [self.processor.apply_chat_template(
382
+ messages,
383
+ tokenize=False,
384
+ add_generation_prompt=True
385
+ )]
386
+
387
+ image_inputs, video_inputs = self.processor.process_vision_info(messages)
388
+
389
+ inputs = self.processor(
390
+ text=text_list,
391
+ images=image_inputs,
392
+ videos=video_inputs,
393
+ return_tensors="pt",
394
+ )
395
+ inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
396
+
397
+ with torch.inference_mode():
398
+ generated_ids = self.model.generate(
399
+ **inputs,
400
+ max_new_tokens=max_new_tokens,
401
+ do_sample=False,
402
+ )
403
+
404
+ generated_tokens = generated_ids[0, inputs['input_ids'].size(1):]
405
+ generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
406
+
407
+ return {
408
+ "generated_text": generated_text,
409
+ "num_images": len(images)
410
+ }
411
+
412
+ def _grade_rubric(self, video_data: Any, params: Dict) -> Dict[str, Any]:
413
+ """
414
+ Grade a video against a rubric - ProofPath specific mode.
415
+ """
416
+ rubric = params.get("rubric", [])
417
+ if not rubric:
418
+ raise ValueError("Rubric required for rubric mode")
419
+
420
+ max_frames = min(params.get("max_frames", 512), self.max_frames_limit)
421
+ fps = params.get("fps", 2.0)
422
+ output_format = params.get("output_format", "json")
423
+
424
+ # Load video
425
+ frames, video_metadata = self._load_video_frames(video_data, max_frames, fps)
426
+
427
+ # Build rubric prompt
428
+ rubric_text = "\n".join([
429
+ f"Step {item.get('step', i+1)}: {item.get('description', '')}"
430
+ for i, item in enumerate(rubric)
431
+ ])
432
+
433
+ if output_format == "json":
434
+ prompt = f"""Analyze this video against the following rubric and grade each step.
435
+
436
+ RUBRIC:
437
+ {rubric_text}
438
+
439
+ For EACH step, determine:
440
+ 1. Whether it was completed (true/false)
441
+ 2. The approximate timestamp where it occurs (if completed)
442
+ 3. Any issues or partial completion notes
443
+
444
+ Respond ONLY with a JSON array in this exact format:
445
+ [
446
+ {{"step": 1, "completed": true, "timestamp": "0:15", "notes": "Clicked cell B2 correctly"}},
447
+ {{"step": 2, "completed": true, "timestamp": "0:22", "notes": "Typed 123"}},
448
+ ...
449
+ ]"""
450
+ else:
451
+ prompt = f"""Analyze this video against the following rubric:
452
+
453
+ RUBRIC:
454
+ {rubric_text}
455
+
456
+ For each step, describe whether it was completed, when it occurred, and any issues observed."""
457
+
458
+ messages = [
459
+ {
460
+ "role": "user",
461
+ "content": [
462
+ {"type": "text", "text": prompt},
463
+ {"type": "video", "video": frames},
464
+ ],
465
+ }
466
+ ]
467
+
468
+ text_list = [self.processor.apply_chat_template(
469
+ messages,
470
+ tokenize=False,
471
+ add_generation_prompt=True
472
+ )]
473
+
474
+ image_inputs, video_inputs = self.processor.process_vision_info(messages)
475
+
476
+ inputs = self.processor(
477
+ text=text_list,
478
+ images=image_inputs,
479
+ videos=video_inputs,
480
+ return_tensors="pt",
481
+ )
482
+ inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
483
+
484
+ with torch.inference_mode():
485
+ generated_ids = self.model.generate(
486
+ **inputs,
487
+ max_new_tokens=params.get("max_new_tokens", 2048),
488
+ do_sample=False,
489
+ )
490
+
491
+ generated_tokens = generated_ids[0, inputs['input_ids'].size(1):]
492
+ generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
493
+
494
+ result = {
495
+ "generated_text": generated_text,
496
+ "video_metadata": video_metadata,
497
+ "rubric": rubric
498
+ }
499
+
500
+ # Try to parse JSON if requested
501
+ if output_format == "json":
502
+ try:
503
+ import json
504
+ # Extract JSON array from response
505
+ json_match = re.search(r'\[[\s\S]*\]', generated_text)
506
+ if json_match:
507
+ result["grading_results"] = json.loads(json_match.group())
508
+ except json.JSONDecodeError:
509
+ pass # Keep raw text if JSON parsing fails
510
+
511
+ return result
requirements.txt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Eagle 2.5 Inference Endpoint Requirements
2
+ # Note: transformers and torch are pre-installed in HF Inference containers
3
+
4
+ # For Eagle 2.5 support (needs recent transformers)
5
+ transformers>=4.45.0
6
+ torch>=2.0.0
7
+
8
+ # Video processing (handler decodes with OpenCV; decord is an optional alternative backend)
9
+ opencv-python-headless>=4.8.0
10
+ decord>=0.6.0
11
+
12
+ # Image processing
13
+ Pillow>=9.0.0
14
+ requests>=2.28.0
15
+
16
+ # Standard deps
17
+ numpy>=1.24.0
18
+ einops>=0.7.0
19
+
20
+ # Model loading/offload helpers (note: flash-attn itself is a separate wheel, not installed here)
21
+ accelerate>=0.25.0
22
+
23
+ # Optional: for better video decoding
24
+ # av>=10.0.0