peterproofpath
/

eagle

Model card Files and versions

xet

Community

peterproofpath committed on Jan 11

Commit

89bf21d

verified ·

1 Parent(s): 5e5265b

Update requirements.txt

Browse files

Files changed (1) hide show

requirements.txt +12 -560

requirements.txt CHANGED Viewed

@@ -1,560 +1,12 @@
-"""
-Eagle 2.5 Custom Inference Handler for Hugging Face Inference Endpoints
-Model: nvidia/Eagle2.5-8B
-For ProofPath video assessment - long video understanding with up to 512 frames.
-Ideal for full rubric-based video grading in a single call.
-REQUIREMENTS:
-1. Set HF_TOKEN environment variable (model is gated)
-2. Accept license at https://huggingface.co/nvidia/Eagle2.5-8B
-"""
-from typing import Dict, List, Any, Optional, Union
-import torch
-import numpy as np
-import base64
-import io
-import tempfile
-import os
-import re
-class EndpointHandler:
-    def __init__(self, path: str = ""):
-        """
-        Initialize Eagle 2.5 model for video understanding.
-        Args:
-            path: Path to the model directory (ignored - we always load from HF hub)
-        """
-        # IMPORTANT: Eagle 2.5 must be loaded from HF hub, not the repository path
-        # The repository only contains handler.py and requirements.txt
-        model_id = "nvidia/Eagle2.5-8B"
-        # Get HF token from environment for gated model access
-        hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
-        # Determine device
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        # Eagle 2.5 uses Qwen2VL architecture - load AutoProcessor with use_fast=True,
-        # since Eagle2_5_VLImageProcessorFast requires the fast processor path (see below)
-        from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
-        self.processor = AutoProcessor.from_pretrained(
-            model_id,
-            trust_remote_code=True,
-            token=hf_token,
-            use_fast=True,  # Eagle2_5_VLImageProcessorFast requires use_fast=True
-        )
-        # Set padding side for batch processing
-        if hasattr(self.processor, 'tokenizer'):
-            self.processor.tokenizer.padding_side = "left"
-        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
-            model_id,
-            trust_remote_code=True,
-            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
-            attn_implementation="flash_attention_2" if torch.cuda.is_available() else "sdpa",
-            device_map="auto" if torch.cuda.is_available() else None,
-            token=hf_token,
-        )
-        if not torch.cuda.is_available():
-            self.model = self.model.to(self.device)
-        self.model.eval()
-        # Default config - Eagle 2.5 supports up to 512 frames
-        self.default_max_frames = 256  # Conservative default
-        self.max_frames_limit = 512
-    def _load_video_frames(
-        self,
-        video_data: Any,
-        max_frames: int = 256,
-        fps: float = 2.0
-    ) -> tuple:
-        """
-        Load video frames from various input formats.
-        Supports:
-        - URL to video file
-        - Base64 encoded video
-        - Raw bytes
-        """
-        import cv2
-        from PIL import Image
-        # Decode video to temp file if needed
-        if isinstance(video_data, str):
-            if video_data.startswith(('http://', 'https://')):
-                # URL - download to temp file
-                import requests
-                response = requests.get(video_data, stream=True)
-                with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
-                    for chunk in response.iter_content(chunk_size=8192):
-                        f.write(chunk)
-                    video_path = f.name
-            elif video_data.startswith('data:'):
-                # Data URL format
-                header, encoded = video_data.split(',', 1)
-                video_bytes = base64.b64decode(encoded)
-                with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
-                    f.write(video_bytes)
-                    video_path = f.name
-            else:
-                # Assume base64 encoded
-                video_bytes = base64.b64decode(video_data)
-                with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
-                    f.write(video_bytes)
-                    video_path = f.name
-        elif isinstance(video_data, bytes):
-            with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
-                f.write(video_data)
-                video_path = f.name
-        else:
-            raise ValueError(f"Unsupported video input type: {type(video_data)}")
-        try:
-            # Open video with OpenCV
-            cap = cv2.VideoCapture(video_path)
-            video_fps = cap.get(cv2.CAP_PROP_FPS)
-            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-            duration = total_frames / video_fps if video_fps > 0 else 0
-            # Calculate frame indices to sample
-            target_frames = min(max_frames, int(duration * fps), total_frames)
-            if target_frames <= 0:
-                target_frames = min(max_frames, total_frames)
-            frame_indices = np.linspace(0, total_frames - 1, target_frames, dtype=int)
-            frames = []
-            for idx in frame_indices:
-                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
-                ret, frame = cap.read()
-                if ret:
-                    # Convert BGR to RGB
-                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                    pil_image = Image.fromarray(frame_rgb)
-                    frames.append(pil_image)
-            cap.release()
-            return frames, {
-                "duration": duration,
-                "total_frames": total_frames,
-                "sampled_frames": len(frames),
-                "video_fps": video_fps
-            }
-        finally:
-            # Clean up temp file
-            if os.path.exists(video_path):
-                os.unlink(video_path)
-    def _load_image(self, image_data: Any):
-        """Load a single image from various formats."""
-        from PIL import Image
-        import requests
-        if isinstance(image_data, Image.Image):
-            return image_data
-        elif isinstance(image_data, str):
-            if image_data.startswith(('http://', 'https://')):
-                response = requests.get(image_data, stream=True)
-                return Image.open(response.raw).convert('RGB')
-            elif image_data.startswith('data:'):
-                header, encoded = image_data.split(',', 1)
-                image_bytes = base64.b64decode(encoded)
-                return Image.open(io.BytesIO(image_bytes)).convert('RGB')
-            else:
-                image_bytes = base64.b64decode(image_data)
-                return Image.open(io.BytesIO(image_bytes)).convert('RGB')
-        elif isinstance(image_data, bytes):
-            return Image.open(io.BytesIO(image_data)).convert('RGB')
-        else:
-            raise ValueError(f"Unsupported image input type: {type(image_data)}")
-    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        Process video or images with Eagle 2.5.
-        Expected input formats:
-        1. Video analysis:
-        {
-            "inputs": <video_url_or_base64>,
-            "parameters": {
-                "prompt": "Describe what happens in this video.",
-                "max_frames": 256,
-                "fps": 2.0,
-                "max_new_tokens": 2048
-            }
-        }
-        2. Image analysis:
-        {
-            "inputs": <image_url_or_base64>,
-            "parameters": {
-                "prompt": "Describe this image.",
-                "max_new_tokens": 512
-            }
-        }
-        3. Multi-image analysis:
-        {
-            "inputs": [<image1>, <image2>, ...],
-            "parameters": {
-                "prompt": "Compare these images.",
-                "max_new_tokens": 1024
-            }
-        }
-        4. ProofPath rubric grading:
-        {
-            "inputs": <video_url>,
-            "parameters": {
-                "mode": "rubric",
-                "rubric": [
-                    {"step": 1, "description": "Click cell B2"},
-                    {"step": 2, "description": "Type 123"},
-                    {"step": 3, "description": "Press Enter"}
-                ],
-                "max_frames": 512,
-                "output_format": "json"
-            }
-        }
-        Returns:
-        {
-            "generated_text": "...",
-            "video_metadata": {...},  # If video input
-        }
-        """
-        inputs = data.get("inputs")
-        if inputs is None:
-            inputs = data.get("video") or data.get("image") or data.get("images")
-        if inputs is None:
-            raise ValueError("No input provided. Use 'inputs', 'video', 'image', or 'images' key.")
-        params = data.get("parameters", {})
-        mode = params.get("mode", "default")
-        prompt = params.get("prompt", "Describe this content in detail.")
-        max_new_tokens = params.get("max_new_tokens", 2048)
-        try:
-            if mode == "rubric":
-                return self._grade_rubric(inputs, params)
-            elif isinstance(inputs, list):
-                return self._process_multi_image(inputs, prompt, max_new_tokens)
-            elif self._is_video(inputs, params):
-                return self._process_video(inputs, prompt, params, max_new_tokens)
-            else:
-                return self._process_image(inputs, prompt, max_new_tokens)
-        except Exception as e:
-            import traceback
-            return {"error": str(e), "error_type": type(e).__name__, "traceback": traceback.format_exc()}
-    def _is_video(self, inputs: Any, params: Dict) -> bool:
-        """Determine if input is video based on params or file extension."""
-        if params.get("input_type") == "video":
-            return True
-        if params.get("input_type") == "image":
-            return False
-        if isinstance(inputs, str):
-            lower = inputs.lower()
-            video_exts = ['.mp4', '.avi', '.mov', '.mkv', '.webm', '.m4v']
-            return any(ext in lower for ext in video_exts)
-        return False
-    def _process_video(
-        self,
-        video_data: Any,
-        prompt: str,
-        params: Dict,
-        max_new_tokens: int
-    ) -> Dict[str, Any]:
-        """Process a video input."""
-        from qwen_vl_utils import process_vision_info
-        max_frames = min(params.get("max_frames", self.default_max_frames), self.max_frames_limit)
-        fps = params.get("fps", 2.0)
-        # Load video frames
-        frames, video_metadata = self._load_video_frames(video_data, max_frames, fps)
-        # Build message for Eagle 2.5 / Qwen2-VL format
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "video", "video": frames, "fps": fps},
-                    {"type": "text", "text": prompt},
-                ],
-            }
-        ]
-        # Apply chat template
-        text = self.processor.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )
-        # Process vision info
-        image_inputs, video_inputs = process_vision_info(messages)
-        inputs = self.processor(
-            text=[text],
-            images=image_inputs,
-            videos=video_inputs,
-            padding=True,
-            return_tensors="pt",
-        )
-        inputs = inputs.to(self.model.device)
-        # Generate
-        with torch.inference_mode():
-            generated_ids = self.model.generate(
-                **inputs,
-                max_new_tokens=max_new_tokens,
-                do_sample=False,
-            )
-        # Decode - only the new tokens
-        generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
-        generated_text = self.processor.batch_decode(
-            generated_ids_trimmed,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=False
-        )[0]
-        return {
-            "generated_text": generated_text,
-            "video_metadata": video_metadata
-        }
-    def _process_image(self, image_data: Any, prompt: str, max_new_tokens: int) -> Dict[str, Any]:
-        """Process a single image."""
-        from qwen_vl_utils import process_vision_info
-        image = self._load_image(image_data)
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "image": image},
-                    {"type": "text", "text": prompt},
-                ],
-            }
-        ]
-        text = self.processor.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )
-        image_inputs, video_inputs = process_vision_info(messages)
-        inputs = self.processor(
-            text=[text],
-            images=image_inputs,
-            videos=video_inputs,
-            padding=True,
-            return_tensors="pt",
-        )
-        inputs = inputs.to(self.model.device)
-        with torch.inference_mode():
-            generated_ids = self.model.generate(
-                **inputs,
-                max_new_tokens=max_new_tokens,
-                do_sample=False,
-            )
-        generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
-        generated_text = self.processor.batch_decode(
-            generated_ids_trimmed,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=False
-        )[0]
-        return {
-            "generated_text": generated_text,
-            "image_size": {"width": image.width, "height": image.height}
-        }
-    def _process_multi_image(self, images_data: List, prompt: str, max_new_tokens: int) -> Dict[str, Any]:
-        """Process multiple images."""
-        from qwen_vl_utils import process_vision_info
-        images = [self._load_image(img) for img in images_data]
-        # Build content with all images
-        content = []
-        for image in images:
-            content.append({"type": "image", "image": image})
-        content.append({"type": "text", "text": prompt})
-        messages = [{"role": "user", "content": content}]
-        text = self.processor.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )
-        image_inputs, video_inputs = process_vision_info(messages)
-        inputs = self.processor(
-            text=[text],
-            images=image_inputs,
-            videos=video_inputs,
-            padding=True,
-            return_tensors="pt",
-        )
-        inputs = inputs.to(self.model.device)
-        with torch.inference_mode():
-            generated_ids = self.model.generate(
-                **inputs,
-                max_new_tokens=max_new_tokens,
-                do_sample=False,
-            )
-        generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
-        generated_text = self.processor.batch_decode(
-            generated_ids_trimmed,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=False
-        )[0]
-        return {
-            "generated_text": generated_text,
-            "num_images": len(images)
-        }
-    def _grade_rubric(self, video_data: Any, params: Dict) -> Dict[str, Any]:
-        """
-        Grade a video against a rubric - ProofPath specific mode.
-        """
-        from qwen_vl_utils import process_vision_info
-        rubric = params.get("rubric", [])
-        if not rubric:
-            raise ValueError("Rubric required for rubric mode")
-        max_frames = min(params.get("max_frames", 512), self.max_frames_limit)
-        fps = params.get("fps", 2.0)
-        output_format = params.get("output_format", "json")
-        # Load video
-        frames, video_metadata = self._load_video_frames(video_data, max_frames, fps)
-        # Build rubric prompt
-        rubric_text = "\n".join([
-            f"Step {item.get('step', i+1)}: {item.get('description', '')}"
-            for i, item in enumerate(rubric)
-        ])
-        if output_format == "json":
-            prompt = f"""Analyze this video against the following rubric and grade each step.
-RUBRIC:
-{rubric_text}
-For EACH step, determine:
-1. Whether it was completed (true/false)
-2. The approximate timestamp where it occurs (if completed)
-3. Any issues or partial completion notes
-Respond ONLY with a JSON array in this exact format:
-[
-  {{"step": 1, "completed": true, "timestamp": "0:15", "notes": "Clicked cell B2 correctly"}},
-  {{"step": 2, "completed": true, "timestamp": "0:22", "notes": "Typed 123"}},
-  ...
-]"""
-        else:
-            prompt = f"""Analyze this video against the following rubric:
-RUBRIC:
-{rubric_text}
-For each step, describe whether it was completed, when it occurred, and any issues observed."""
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "video", "video": frames, "fps": fps},
-                    {"type": "text", "text": prompt},
-                ],
-            }
-        ]
-        text = self.processor.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )
-        image_inputs, video_inputs = process_vision_info(messages)
-        inputs = self.processor(
-            text=[text],
-            images=image_inputs,
-            videos=video_inputs,
-            padding=True,
-            return_tensors="pt",
-        )
-        inputs = inputs.to(self.model.device)
-        with torch.inference_mode():
-            generated_ids = self.model.generate(
-                **inputs,
-                max_new_tokens=params.get("max_new_tokens", 2048),
-                do_sample=False,
-            )
-        generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
-        generated_text = self.processor.batch_decode(
-            generated_ids_trimmed,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=False
-        )[0]
-        result = {
-            "generated_text": generated_text,
-            "video_metadata": video_metadata,
-            "rubric": rubric
-        }
-        # Try to parse JSON if requested
-        if output_format == "json":
-            try:
-                import json
-                # Extract JSON array from response
-                json_match = re.search(r'\[[\s\S]*\]', generated_text)
-                if json_match:
-                    result["grading_results"] = json.loads(json_match.group())
-            except json.JSONDecodeError:
-                pass  # Keep raw text if JSON parsing fails
-        return result

+# Eagle 2.5 Inference Endpoint Requirements
+transformers>=4.53.0
+torch>=2.0.0
+qwen-vl-utils>=0.0.8
+opencv-python-headless>=4.8.0
+av>=10.0.0
+decord
+Pillow>=9.0.0
+requests>=2.28.0
+numpy>=1.24.0,<2.0.0
+einops>=0.7.0
+accelerate>=0.25.0