peterproofpath
/

eagle

Model card Files Files and versions

xet

Community

peterproofpath committed on Jan 11

Commit

2562376

verified ·

1 Parent(s): f785f12

Update handler.py

Browse files

Files changed (1) hide show

handler.py +91 -53

handler.py CHANGED Viewed

@@ -22,37 +22,36 @@ class EndpointHandler:
         Initialize Eagle 2.5 model for video understanding.
         Args:
-            path: Path to the model directory (provided by HF Inference Endpoints)
         """
-        from transformers import AutoProcessor, AutoModel, AutoTokenizer
-        # Use the model path provided by the endpoint, or default to HF hub
-        model_id = path if path else "nvidia/Eagle2.5-8B"
         # Determine device
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        # Load processor, tokenizer, and model
-        self.processor = AutoProcessor.from_pretrained(
-            model_id,
-            trust_remote_code=True,
-            use_fast=True
-        )
-        self.tokenizer = AutoTokenizer.from_pretrained(
             model_id,
             trust_remote_code=True,
-            use_fast=True
         )
-        self.processor.tokenizer.padding_side = "left"
-        self.model = AutoModel.from_pretrained(
             model_id,
             trust_remote_code=True,
             torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
             attn_implementation="flash_attention_2" if torch.cuda.is_available() else "sdpa",
         )
-        if torch.cuda.is_available():
             self.model = self.model.to(self.device)
         self.model.eval()
@@ -66,7 +65,7 @@ class EndpointHandler:
         video_data: Any,
         max_frames: int = 256,
         fps: float = 2.0
-    ) -> List:
         """
         Load video frames from various input formats.
@@ -247,7 +246,8 @@ class EndpointHandler:
                 return self._process_image(inputs, prompt, max_new_tokens)
         except Exception as e:
-            return {"error": str(e), "error_type": type(e).__name__}
     def _is_video(self, inputs: Any, params: Dict) -> bool:
         """Determine if input is video based on params or file extension."""
@@ -271,39 +271,43 @@ class EndpointHandler:
         max_new_tokens: int
     ) -> Dict[str, Any]:
         """Process a video input."""
         max_frames = min(params.get("max_frames", self.default_max_frames), self.max_frames_limit)
         fps = params.get("fps", 2.0)
         # Load video frames
         frames, video_metadata = self._load_video_frames(video_data, max_frames, fps)
-        # Build message for Eagle 2.5
         messages = [
             {
                 "role": "user",
                 "content": [
                     {"type": "text", "text": prompt},
-                    {"type": "video", "video": frames},
                 ],
             }
         ]
-        # Process with Eagle 2.5 processor
-        text_list = [self.processor.apply_chat_template(
             messages,
             tokenize=False,
             add_generation_prompt=True
-        )]
-        image_inputs, video_inputs = self.processor.process_vision_info(messages)
         inputs = self.processor(
-            text=text_list,
             images=image_inputs,
             videos=video_inputs,
             return_tensors="pt",
         )
-        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
         # Generate
         with torch.inference_mode():
@@ -313,9 +317,15 @@ class EndpointHandler:
                 do_sample=False,
             )
-        # Decode
-        generated_tokens = generated_ids[0, inputs['input_ids'].size(1):]
-        generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
         return {
             "generated_text": generated_text,
@@ -324,33 +334,36 @@ class EndpointHandler:
     def _process_image(self, image_data: Any, prompt: str, max_new_tokens: int) -> Dict[str, Any]:
         """Process a single image."""
         image = self._load_image(image_data)
         messages = [
             {
                 "role": "user",
                 "content": [
-                    {"type": "text", "text": prompt},
                     {"type": "image", "image": image},
                 ],
             }
         ]
-        text_list = [self.processor.apply_chat_template(
             messages,
             tokenize=False,
             add_generation_prompt=True
-        )]
-        image_inputs, video_inputs = self.processor.process_vision_info(messages)
         inputs = self.processor(
-            text=text_list,
             images=image_inputs,
             videos=video_inputs,
             return_tensors="pt",
         )
-        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
         with torch.inference_mode():
             generated_ids = self.model.generate(
@@ -359,8 +372,14 @@ class EndpointHandler:
                 do_sample=False,
             )
-        generated_tokens = generated_ids[0, inputs['input_ids'].size(1):]
-        generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
         return {
             "generated_text": generated_text,
@@ -369,30 +388,34 @@ class EndpointHandler:
     def _process_multi_image(self, images_data: List, prompt: str, max_new_tokens: int) -> Dict[str, Any]:
         """Process multiple images."""
         images = [self._load_image(img) for img in images_data]
         # Build content with all images
-        content = [{"type": "text", "text": prompt}]
         for image in images:
             content.append({"type": "image", "image": image})
         messages = [{"role": "user", "content": content}]
-        text_list = [self.processor.apply_chat_template(
             messages,
             tokenize=False,
             add_generation_prompt=True
-        )]
-        image_inputs, video_inputs = self.processor.process_vision_info(messages)
         inputs = self.processor(
-            text=text_list,
             images=image_inputs,
             videos=video_inputs,
             return_tensors="pt",
         )
-        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
         with torch.inference_mode():
             generated_ids = self.model.generate(
@@ -401,8 +424,14 @@ class EndpointHandler:
                 do_sample=False,
             )
-        generated_tokens = generated_ids[0, inputs['input_ids'].size(1):]
-        generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
         return {
             "generated_text": generated_text,
@@ -413,6 +442,8 @@ class EndpointHandler:
         """
         Grade a video against a rubric - ProofPath specific mode.
         """
         rubric = params.get("rubric", [])
         if not rubric:
             raise ValueError("Rubric required for rubric mode")
@@ -459,27 +490,28 @@ For each step, describe whether it was completed, when it occurred, and any issu
             {
                 "role": "user",
                 "content": [
                     {"type": "text", "text": prompt},
-                    {"type": "video", "video": frames},
                 ],
             }
         ]
-        text_list = [self.processor.apply_chat_template(
             messages,
             tokenize=False,
             add_generation_prompt=True
-        )]
-        image_inputs, video_inputs = self.processor.process_vision_info(messages)
         inputs = self.processor(
-            text=text_list,
             images=image_inputs,
             videos=video_inputs,
             return_tensors="pt",
         )
-        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
         with torch.inference_mode():
             generated_ids = self.model.generate(
@@ -488,8 +520,14 @@ For each step, describe whether it was completed, when it occurred, and any issu
                 do_sample=False,
             )
-        generated_tokens = generated_ids[0, inputs['input_ids'].size(1):]
-        generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
         result = {
             "generated_text": generated_text,

         Initialize Eagle 2.5 model for video understanding.
         Args:
+            path: Path to the model directory (ignored - we always load from HF hub)
         """
+        # IMPORTANT: Eagle 2.5 must be loaded from HF hub, not the repository path
+        # The repository only contains handler.py and requirements.txt
+        model_id = "nvidia/Eagle2.5-8B"
         # Determine device
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        # Eagle 2.5 uses Qwen2VLProcessor - import and load directly
+        from transformers import Qwen2VLProcessor, Qwen2VLForConditionalGeneration
+        self.processor = Qwen2VLProcessor.from_pretrained(
             model_id,
             trust_remote_code=True,
         )
+        # Set padding side for batch processing
+        if hasattr(self.processor, 'tokenizer'):
+            self.processor.tokenizer.padding_side = "left"
+        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
             model_id,
             trust_remote_code=True,
             torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
             attn_implementation="flash_attention_2" if torch.cuda.is_available() else "sdpa",
+            device_map="auto" if torch.cuda.is_available() else None,
         )
+        if not torch.cuda.is_available():
             self.model = self.model.to(self.device)
         self.model.eval()
         video_data: Any,
         max_frames: int = 256,
         fps: float = 2.0
+    ) -> tuple:
         """
         Load video frames from various input formats.
                 return self._process_image(inputs, prompt, max_new_tokens)
         except Exception as e:
+            import traceback
+            return {"error": str(e), "error_type": type(e).__name__, "traceback": traceback.format_exc()}
     def _is_video(self, inputs: Any, params: Dict) -> bool:
         """Determine if input is video based on params or file extension."""
         max_new_tokens: int
     ) -> Dict[str, Any]:
         """Process a video input."""
+        from qwen_vl_utils import process_vision_info
         max_frames = min(params.get("max_frames", self.default_max_frames), self.max_frames_limit)
         fps = params.get("fps", 2.0)
         # Load video frames
         frames, video_metadata = self._load_video_frames(video_data, max_frames, fps)
+        # Build message for Eagle 2.5 / Qwen2-VL format
         messages = [
             {
                 "role": "user",
                 "content": [
+                    {"type": "video", "video": frames, "fps": fps},
                     {"type": "text", "text": prompt},
                 ],
             }
         ]
+        # Apply chat template
+        text = self.processor.apply_chat_template(
             messages,
             tokenize=False,
             add_generation_prompt=True
+        )
+        # Process vision info
+        image_inputs, video_inputs = process_vision_info(messages)
         inputs = self.processor(
+            text=[text],
             images=image_inputs,
             videos=video_inputs,
+            padding=True,
             return_tensors="pt",
         )
+        inputs = inputs.to(self.model.device)
         # Generate
         with torch.inference_mode():
                 do_sample=False,
             )
+        # Decode - only the new tokens
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        generated_text = self.processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )[0]
         return {
             "generated_text": generated_text,
     def _process_image(self, image_data: Any, prompt: str, max_new_tokens: int) -> Dict[str, Any]:
         """Process a single image."""
+        from qwen_vl_utils import process_vision_info
         image = self._load_image(image_data)
         messages = [
             {
                 "role": "user",
                 "content": [
                     {"type": "image", "image": image},
+                    {"type": "text", "text": prompt},
                 ],
             }
         ]
+        text = self.processor.apply_chat_template(
             messages,
             tokenize=False,
             add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(messages)
         inputs = self.processor(
+            text=[text],
             images=image_inputs,
             videos=video_inputs,
+            padding=True,
             return_tensors="pt",
         )
+        inputs = inputs.to(self.model.device)
         with torch.inference_mode():
             generated_ids = self.model.generate(
                 do_sample=False,
             )
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        generated_text = self.processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )[0]
         return {
             "generated_text": generated_text,
     def _process_multi_image(self, images_data: List, prompt: str, max_new_tokens: int) -> Dict[str, Any]:
         """Process multiple images."""
+        from qwen_vl_utils import process_vision_info
         images = [self._load_image(img) for img in images_data]
         # Build content with all images
+        content = []
         for image in images:
             content.append({"type": "image", "image": image})
+        content.append({"type": "text", "text": prompt})
         messages = [{"role": "user", "content": content}]
+        text = self.processor.apply_chat_template(
             messages,
             tokenize=False,
             add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(messages)
         inputs = self.processor(
+            text=[text],
             images=image_inputs,
             videos=video_inputs,
+            padding=True,
             return_tensors="pt",
         )
+        inputs = inputs.to(self.model.device)
         with torch.inference_mode():
             generated_ids = self.model.generate(
                 do_sample=False,
             )
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        generated_text = self.processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )[0]
         return {
             "generated_text": generated_text,
         """
         Grade a video against a rubric - ProofPath specific mode.
         """
+        from qwen_vl_utils import process_vision_info
         rubric = params.get("rubric", [])
         if not rubric:
             raise ValueError("Rubric required for rubric mode")
             {
                 "role": "user",
                 "content": [
+                    {"type": "video", "video": frames, "fps": fps},
                     {"type": "text", "text": prompt},
                 ],
             }
         ]
+        text = self.processor.apply_chat_template(
             messages,
             tokenize=False,
             add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(messages)
         inputs = self.processor(
+            text=[text],
             images=image_inputs,
             videos=video_inputs,
+            padding=True,
             return_tensors="pt",
         )
+        inputs = inputs.to(self.model.device)
         with torch.inference_mode():
             generated_ids = self.model.generate(
                 do_sample=False,
             )
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        generated_text = self.processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )[0]
         result = {
             "generated_text": generated_text,