Update requirements.txt

requirements.txt  CHANGED  (+554 -19)

@@ -1,25 +1,560 @@
Removed (old contents):

-torch>=2.0.0
-# Image processing
-Pillow>=9.0.0
-requests>=2.28.0
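For context, the handler added below imports torch, transformers, qwen_vl_utils, cv2, PIL, numpy, and requests, so the dependency set the endpoint actually needs looks roughly like the sketch that follows. The package pins are assumptions inferred from those imports, not taken from the commit:

torch>=2.0.0
transformers>=4.45.0
accelerate>=0.30.0
qwen-vl-utils>=0.0.8
opencv-python-headless>=4.8.0
Pillow>=9.0.0
numpy>=1.24.0
requests>=2.28.0
# Optional, CUDA-only: flash-attn (the handler falls back to sdpa without it)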
Added (new contents):

"""
Eagle 2.5 Custom Inference Handler for Hugging Face Inference Endpoints
Model: nvidia/Eagle2.5-8B

For ProofPath video assessment - long video understanding with up to 512 frames.
Ideal for full rubric-based video grading in a single call.

REQUIREMENTS:
1. Set the HF_TOKEN environment variable (the model is gated)
2. Accept the license at https://huggingface.co/nvidia/Eagle2.5-8B
"""

from typing import Dict, List, Any, Optional, Union
import torch
import numpy as np
import base64
import io
import json
import tempfile
import os
import re


class EndpointHandler:
    def __init__(self, path: str = ""):
        """
        Initialize the Eagle 2.5 model for video understanding.

        Args:
            path: Path to the model directory (ignored - we always load from the HF hub)
        """
        # IMPORTANT: Eagle 2.5 must be loaded from the HF hub, not the repository path.
        # The repository only contains handler.py and requirements.txt.
        model_id = "nvidia/Eagle2.5-8B"

        # Get the HF token from the environment for gated model access
        hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")

        # Determine the device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Eagle 2.5 uses the Qwen2-VL architecture. Load the processor with
        # use_fast=True, since Eagle2_5_VLImageProcessorFast only supports the fast path.
        from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

        self.processor = AutoProcessor.from_pretrained(
            model_id,
            trust_remote_code=True,
            token=hf_token,
            use_fast=True,  # Eagle2_5_VLImageProcessorFast requires use_fast=True
        )

        # Set the padding side for batch processing
        if hasattr(self.processor, 'tokenizer'):
            self.processor.tokenizer.padding_side = "left"

        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_id,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
            attn_implementation="flash_attention_2" if torch.cuda.is_available() else "sdpa",
            device_map="auto" if torch.cuda.is_available() else None,
            token=hf_token,
        )

        if not torch.cuda.is_available():
            self.model = self.model.to(self.device)

        self.model.eval()

        # Default config - Eagle 2.5 supports up to 512 frames
        self.default_max_frames = 256  # Conservative default
        self.max_frames_limit = 512

    def _load_video_frames(
        self,
        video_data: Any,
        max_frames: int = 256,
        fps: float = 2.0
    ) -> tuple:
        """
        Load video frames from various input formats.

        Supports:
        - URL to a video file
        - Base64-encoded video
        - Raw bytes
        """
        import cv2
        from PIL import Image

        # Decode the video to a temp file if needed
        if isinstance(video_data, str):
            if video_data.startswith(('http://', 'https://')):
                # URL - download to a temp file
                import requests
                response = requests.get(video_data, stream=True)
                response.raise_for_status()
                with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                    video_path = f.name
            elif video_data.startswith('data:'):
                # Data URL format
                header, encoded = video_data.split(',', 1)
                video_bytes = base64.b64decode(encoded)
                with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
                    f.write(video_bytes)
                    video_path = f.name
            else:
                # Assume base64-encoded
                video_bytes = base64.b64decode(video_data)
                with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
                    f.write(video_bytes)
                    video_path = f.name
        elif isinstance(video_data, bytes):
            with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
                f.write(video_data)
                video_path = f.name
        else:
            raise ValueError(f"Unsupported video input type: {type(video_data)}")

        try:
            # Open the video with OpenCV
            cap = cv2.VideoCapture(video_path)
            if not cap.isOpened():
                raise ValueError("Could not open video; the input may not be a valid video file")
            video_fps = cap.get(cv2.CAP_PROP_FPS)
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            duration = total_frames / video_fps if video_fps > 0 else 0

            # Calculate the frame indices to sample
            target_frames = min(max_frames, int(duration * fps), total_frames)
            if target_frames <= 0:
                target_frames = min(max_frames, total_frames)

            frame_indices = np.linspace(0, total_frames - 1, target_frames, dtype=int)

            frames = []
            for idx in frame_indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ret, frame = cap.read()
                if ret:
                    # Convert BGR to RGB
                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    pil_image = Image.fromarray(frame_rgb)
                    frames.append(pil_image)

            cap.release()

            return frames, {
                "duration": duration,
                "total_frames": total_frames,
                "sampled_frames": len(frames),
                "video_fps": video_fps
            }

        finally:
            # Clean up the temp file
            if os.path.exists(video_path):
                os.unlink(video_path)

    def _load_image(self, image_data: Any):
        """Load a single image from various formats."""
        from PIL import Image
        import requests

        if isinstance(image_data, Image.Image):
            return image_data
        elif isinstance(image_data, str):
            if image_data.startswith(('http://', 'https://')):
                response = requests.get(image_data, stream=True)
                return Image.open(response.raw).convert('RGB')
            elif image_data.startswith('data:'):
                header, encoded = image_data.split(',', 1)
                image_bytes = base64.b64decode(encoded)
                return Image.open(io.BytesIO(image_bytes)).convert('RGB')
            else:
                image_bytes = base64.b64decode(image_data)
                return Image.open(io.BytesIO(image_bytes)).convert('RGB')
        elif isinstance(image_data, bytes):
            return Image.open(io.BytesIO(image_data)).convert('RGB')
        else:
            raise ValueError(f"Unsupported image input type: {type(image_data)}")

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process video or images with Eagle 2.5.

        Expected input formats:

        1. Video analysis:
           {
               "inputs": <video_url_or_base64>,
               "parameters": {
                   "prompt": "Describe what happens in this video.",
                   "max_frames": 256,
                   "fps": 2.0,
                   "max_new_tokens": 2048
               }
           }

        2. Image analysis:
           {
               "inputs": <image_url_or_base64>,
               "parameters": {
                   "prompt": "Describe this image.",
                   "max_new_tokens": 512
               }
           }

        3. Multi-image analysis:
           {
               "inputs": [<image1>, <image2>, ...],
               "parameters": {
                   "prompt": "Compare these images.",
                   "max_new_tokens": 1024
               }
           }

        4. ProofPath rubric grading:
           {
               "inputs": <video_url>,
               "parameters": {
                   "mode": "rubric",
                   "rubric": [
                       {"step": 1, "description": "Click cell B2"},
                       {"step": 2, "description": "Type 123"},
                       {"step": 3, "description": "Press Enter"}
                   ],
                   "max_frames": 512,
                   "output_format": "json"
               }
           }

        Returns:
            {
                "generated_text": "...",
                "video_metadata": {...}  # If video input
            }
        """
        inputs = data.get("inputs")
        if inputs is None:
            inputs = data.get("video") or data.get("image") or data.get("images")
        if inputs is None:
            raise ValueError("No input provided. Use 'inputs', 'video', 'image', or 'images' key.")

        params = data.get("parameters", {})
        mode = params.get("mode", "default")
        prompt = params.get("prompt", "Describe this content in detail.")
        max_new_tokens = params.get("max_new_tokens", 2048)

        try:
            if mode == "rubric":
                return self._grade_rubric(inputs, params)
            elif isinstance(inputs, list):
                return self._process_multi_image(inputs, prompt, max_new_tokens)
            elif self._is_video(inputs, params):
                return self._process_video(inputs, prompt, params, max_new_tokens)
            else:
                return self._process_image(inputs, prompt, max_new_tokens)

        except Exception as e:
            import traceback
            return {"error": str(e), "error_type": type(e).__name__, "traceback": traceback.format_exc()}

    def _is_video(self, inputs: Any, params: Dict) -> bool:
        """Determine whether the input is video, based on params or file extension."""
        if params.get("input_type") == "video":
            return True
        if params.get("input_type") == "image":
            return False

        if isinstance(inputs, str):
            lower = inputs.lower()
            video_exts = ['.mp4', '.avi', '.mov', '.mkv', '.webm', '.m4v']
            return any(ext in lower for ext in video_exts)

        return False

    def _process_video(
        self,
        video_data: Any,
        prompt: str,
        params: Dict,
        max_new_tokens: int
    ) -> Dict[str, Any]:
        """Process a video input."""
        from qwen_vl_utils import process_vision_info

        max_frames = min(params.get("max_frames", self.default_max_frames), self.max_frames_limit)
        fps = params.get("fps", 2.0)

        # Load the video frames
        frames, video_metadata = self._load_video_frames(video_data, max_frames, fps)

        # Build the message in Eagle 2.5 / Qwen2-VL format
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": frames, "fps": fps},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        # Apply the chat template
        text = self.processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Process the vision info
        image_inputs, video_inputs = process_vision_info(messages)

        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.model.device)

        # Generate
        with torch.inference_mode():
            generated_ids = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
            )

        # Decode - only the new tokens
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        generated_text = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]

        return {
            "generated_text": generated_text,
            "video_metadata": video_metadata
        }

    def _process_image(self, image_data: Any, prompt: str, max_new_tokens: int) -> Dict[str, Any]:
        """Process a single image."""
        from qwen_vl_utils import process_vision_info

        image = self._load_image(image_data)

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        text = self.processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        image_inputs, video_inputs = process_vision_info(messages)

        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.model.device)

        with torch.inference_mode():
            generated_ids = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
            )

        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        generated_text = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]

        return {
            "generated_text": generated_text,
            "image_size": {"width": image.width, "height": image.height}
        }

    def _process_multi_image(self, images_data: List, prompt: str, max_new_tokens: int) -> Dict[str, Any]:
        """Process multiple images."""
        from qwen_vl_utils import process_vision_info

        images = [self._load_image(img) for img in images_data]

        # Build the content list with all images
        content = []
        for image in images:
            content.append({"type": "image", "image": image})
        content.append({"type": "text", "text": prompt})

        messages = [{"role": "user", "content": content}]

        text = self.processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        image_inputs, video_inputs = process_vision_info(messages)

        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.model.device)

        with torch.inference_mode():
            generated_ids = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
            )

        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        generated_text = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]

        return {
            "generated_text": generated_text,
            "num_images": len(images)
        }

    def _grade_rubric(self, video_data: Any, params: Dict) -> Dict[str, Any]:
        """
        Grade a video against a rubric - ProofPath-specific mode.
        """
        from qwen_vl_utils import process_vision_info

        rubric = params.get("rubric", [])
        if not rubric:
            raise ValueError("Rubric required for rubric mode")

        max_frames = min(params.get("max_frames", 512), self.max_frames_limit)
        fps = params.get("fps", 2.0)
        output_format = params.get("output_format", "json")

        # Load the video
        frames, video_metadata = self._load_video_frames(video_data, max_frames, fps)

        # Build the rubric prompt
        rubric_text = "\n".join([
            f"Step {item.get('step', i + 1)}: {item.get('description', '')}"
            for i, item in enumerate(rubric)
        ])

        if output_format == "json":
            prompt = f"""Analyze this video against the following rubric and grade each step.

RUBRIC:
{rubric_text}

For EACH step, determine:
1. Whether it was completed (true/false)
2. The approximate timestamp where it occurs (if completed)
3. Any issues or partial completion notes

Respond ONLY with a JSON array in this exact format:
[
  {{"step": 1, "completed": true, "timestamp": "0:15", "notes": "Clicked cell B2 correctly"}},
  {{"step": 2, "completed": true, "timestamp": "0:22", "notes": "Typed 123"}},
  ...
]"""
        else:
            prompt = f"""Analyze this video against the following rubric:

RUBRIC:
{rubric_text}

For each step, describe whether it was completed, when it occurred, and any issues observed."""

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": frames, "fps": fps},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        text = self.processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        image_inputs, video_inputs = process_vision_info(messages)

        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(self.model.device)

        with torch.inference_mode():
            generated_ids = self.model.generate(
                **inputs,
                max_new_tokens=params.get("max_new_tokens", 2048),
                do_sample=False,
            )

        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        generated_text = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]

        result = {
            "generated_text": generated_text,
            "video_metadata": video_metadata,
            "rubric": rubric
        }

        # Try to parse JSON if requested
        if output_format == "json":
            try:
                # Extract the JSON array from the response
                json_match = re.search(r'\[[\s\S]*\]', generated_text)
                if json_match:
                    result["grading_results"] = json.loads(json_match.group())
            except json.JSONDecodeError:
                pass  # Keep the raw text if JSON parsing fails

        return result
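For reference, a minimal client sketch for the rubric-grading format documented in `__call__`. The endpoint URL and video URL are placeholders, and the Authorization header assumes a token-protected Inference Endpoint; only the payload shape is taken from the handler above:

import os
import requests

# Hypothetical endpoint URL - substitute your own deployed Inference Endpoint
ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"

payload = {
    "inputs": "https://example.com/screen-recording.mp4",  # placeholder video URL
    "parameters": {
        "mode": "rubric",
        "rubric": [
            {"step": 1, "description": "Click cell B2"},
            {"step": 2, "description": "Type 123"},
            {"step": 3, "description": "Press Enter"},
        ],
        "max_frames": 512,
        "output_format": "json",
    },
}

response = requests.post(
    ENDPOINT_URL,
    headers={"Authorization": f"Bearer {os.environ['HF_TOKEN']}"},
    json=payload,
    timeout=600,  # long videos can take minutes to grade
)
response.raise_for_status()
result = response.json()

# "grading_results" is present only when the model returned parseable JSON;
# fall back to the raw generated text otherwise.
print(result.get("grading_results") or result["generated_text"])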