peterproofpath committed on
Commit
93e2c9c
·
verified ·
1 Parent(s): be41be8

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +173 -176
handler.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
  Molmo 2 Custom Inference Handler for Hugging Face Inference Endpoints
3
- Model: allenai/Molmo2-7B-1225
4
 
5
  For ProofPath video assessment - video pointing, tracking, and grounded analysis.
6
  Unique capability: Returns pixel-level coordinates for objects in videos.
@@ -25,20 +25,20 @@ class EndpointHandler:
25
  path: Path to the model directory (ignored - we always load from HF hub)
26
  """
27
  # IMPORTANT: Always load from HF hub, not the repository path
28
- model_id = "allenai/Molmo2-7B-1225"
29
 
30
  # Determine device
31
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
32
 
33
- # Load processor and model with trust_remote_code
34
- from transformers import AutoProcessor, AutoModelForCausalLM
35
 
36
  self.processor = AutoProcessor.from_pretrained(
37
  model_id,
38
  trust_remote_code=True,
39
  )
40
 
41
- self.model = AutoModelForCausalLM.from_pretrained(
42
  model_id,
43
  trust_remote_code=True,
44
  torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
@@ -55,36 +55,39 @@ class EndpointHandler:
55
  self.default_fps = 2.0
56
 
57
  # Regex patterns for parsing Molmo pointing output
58
- # Molmo outputs: <point x="123" y="456" alt="description">
59
- self.POINT_REGEX = re.compile(r'<point\s+x="([0-9.]+)"\s+y="([0-9.]+)"(?:\s+alt="([^"]*)")?>')
60
- self.POINTS_REGEX = re.compile(r'<points>(.*?)</points>', re.DOTALL)
61
 
62
- def _parse_points(self, text: str, image_w: int, image_h: int) -> List[Dict]:
63
  """
64
  Extract pointing coordinates from Molmo output.
65
 
66
- Molmo outputs coordinates as percentages (0-100).
 
 
67
  """
68
- points = []
69
 
70
- for match in self.POINT_REGEX.finditer(text):
71
- x_pct = float(match.group(1))
72
- y_pct = float(match.group(2))
73
- alt = match.group(3) or ""
74
-
75
- # Convert percentage to pixels
76
- x = (x_pct / 100) * image_w
77
- y = (y_pct / 100) * image_h
78
-
79
- points.append({
80
- "x": x,
81
- "y": y,
82
- "x_pct": x_pct,
83
- "y_pct": y_pct,
84
- "label": alt
85
- })
86
-
87
- return points
 
88
 
89
  def _load_image(self, image_data: Any):
90
  """Load a single image from various formats."""
@@ -109,81 +112,6 @@ class EndpointHandler:
109
  else:
110
  raise ValueError(f"Unsupported image input type: {type(image_data)}")
111
 
112
- def _load_video_frames(
113
- self,
114
- video_data: Any,
115
- max_frames: int = 128,
116
- fps: float = 2.0
117
- ) -> tuple:
118
- """Load video frames from various input formats."""
119
- import cv2
120
- from PIL import Image
121
-
122
- # Decode video to temp file if needed
123
- if isinstance(video_data, str):
124
- if video_data.startswith(('http://', 'https://')):
125
- import requests
126
- response = requests.get(video_data, stream=True)
127
- with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
128
- for chunk in response.iter_content(chunk_size=8192):
129
- f.write(chunk)
130
- video_path = f.name
131
- elif video_data.startswith('data:'):
132
- header, encoded = video_data.split(',', 1)
133
- video_bytes = base64.b64decode(encoded)
134
- with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
135
- f.write(video_bytes)
136
- video_path = f.name
137
- else:
138
- video_bytes = base64.b64decode(video_data)
139
- with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
140
- f.write(video_bytes)
141
- video_path = f.name
142
- elif isinstance(video_data, bytes):
143
- with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
144
- f.write(video_data)
145
- video_path = f.name
146
- else:
147
- raise ValueError(f"Unsupported video input type: {type(video_data)}")
148
-
149
- try:
150
- cap = cv2.VideoCapture(video_path)
151
- video_fps = cap.get(cv2.CAP_PROP_FPS)
152
- total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
153
- duration = total_frames / video_fps if video_fps > 0 else 0
154
- width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
155
- height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
156
-
157
- # Calculate frame indices
158
- target_frames = min(max_frames, int(duration * fps), total_frames)
159
- if target_frames <= 0:
160
- target_frames = min(max_frames, total_frames)
161
-
162
- frame_indices = np.linspace(0, total_frames - 1, max(1, target_frames), dtype=int)
163
-
164
- frames = []
165
- for idx in frame_indices:
166
- cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
167
- ret, frame = cap.read()
168
- if ret:
169
- frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
170
- frames.append(Image.fromarray(frame_rgb))
171
-
172
- cap.release()
173
-
174
- return frames, {
175
- "duration": duration,
176
- "total_frames": total_frames,
177
- "sampled_frames": len(frames),
178
- "video_fps": video_fps,
179
- "width": width,
180
- "height": height
181
- }
182
-
183
- finally:
184
- if os.path.exists(video_path):
185
- os.unlink(video_path)
186
-
187
  def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
188
  """
189
  Process video or images with Molmo 2.
@@ -199,12 +127,11 @@ class EndpointHandler:
199
  }
200
  }
201
 
202
- 2. Video analysis (processes as multi-frame):
203
  {
204
  "inputs": <video_url>,
205
  "parameters": {
206
  "prompt": "What happens in this video?",
207
- "max_frames": 64,
208
  "max_new_tokens": 2048
209
  }
210
  }
@@ -220,7 +147,7 @@ class EndpointHandler:
220
  Returns:
221
  {
222
  "generated_text": "...",
223
- "points": [{"x": 123, "y": 456, "label": "..."}], # If pointing detected
224
  "image_size": {...}
225
  }
226
  """
@@ -262,26 +189,40 @@ class EndpointHandler:
262
 
263
  def _process_image(self, image_data: Any, prompt: str, max_new_tokens: int) -> Dict[str, Any]:
264
  """Process a single image."""
 
 
265
  image = self._load_image(image_data)
266
 
267
- # Process with Molmo processor
268
- inputs = self.processor.process(
269
- images=[image],
270
- text=prompt,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
  )
272
-
273
- # Move to device
274
- inputs = {k: v.to(self.model.device).unsqueeze(0) for k, v in inputs.items()}
275
 
276
  # Generate
277
  with torch.inference_mode():
278
- output = self.model.generate_from_batch(
279
- inputs,
280
- generation_config={"max_new_tokens": max_new_tokens, "stop_strings": ["<|endoftext|>"]},
281
- tokenizer=self.processor.tokenizer,
282
  )
283
 
284
- # Decode
285
  generated_tokens = output[0, inputs['input_ids'].size(1):]
286
  generated_text = self.processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
287
 
@@ -291,7 +232,7 @@ class EndpointHandler:
291
  }
292
 
293
  # Parse any pointing coordinates
294
- points = self._parse_points(generated_text, image.width, image.height)
295
  if points:
296
  result["points"] = points
297
  result["num_points"] = len(points)
@@ -305,54 +246,96 @@ class EndpointHandler:
305
  params: Dict,
306
  max_new_tokens: int
307
  ) -> Dict[str, Any]:
308
- """Process video by sampling frames."""
309
- max_frames = min(params.get("max_frames", 32), self.max_frames)
310
- fps = params.get("fps", self.default_fps)
311
-
312
- frames, video_metadata = self._load_video_frames(video_data, max_frames, fps)
313
 
314
- if not frames:
315
- raise ValueError("No frames could be extracted from video")
316
-
317
- # For video, we process key frames
318
- # Molmo can handle multiple images - we'll sample representative frames
319
- sample_indices = np.linspace(0, len(frames) - 1, min(8, len(frames)), dtype=int)
320
- sample_frames = [frames[i] for i in sample_indices]
321
-
322
- # Modify prompt to indicate video context
323
- video_prompt = f"These are {len(sample_frames)} frames from a video. {prompt}"
324
-
325
- # Process with Molmo
326
- inputs = self.processor.process(
327
- images=sample_frames,
328
- text=video_prompt,
329
- )
330
-
331
- inputs = {k: v.to(self.model.device).unsqueeze(0) for k, v in inputs.items()}
332
 
333
- with torch.inference_mode():
334
- output = self.model.generate_from_batch(
335
- inputs,
336
- generation_config={"max_new_tokens": max_new_tokens, "stop_strings": ["<|endoftext|>"]},
337
- tokenizer=self.processor.tokenizer,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  )
339
-
340
- generated_tokens = output[0, inputs['input_ids'].size(1):]
341
- generated_text = self.processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
342
-
343
- result = {
344
- "generated_text": generated_text,
345
- "video_metadata": video_metadata,
346
- "frames_analyzed": len(sample_frames)
347
- }
348
-
349
- # Parse points using first frame dimensions
350
- points = self._parse_points(generated_text, video_metadata["width"], video_metadata["height"])
351
- if points:
352
- result["points"] = points
353
- result["num_points"] = len(points)
354
-
355
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
 
357
  def _process_multi_image(
358
  self,
@@ -361,23 +344,37 @@ class EndpointHandler:
361
  max_new_tokens: int
362
  ) -> Dict[str, Any]:
363
  """Process multiple images."""
 
 
364
  images = [self._load_image(img) for img in images_data]
365
 
366
- # Process with Molmo
367
- inputs = self.processor.process(
368
- images=images,
369
- text=prompt,
 
 
 
 
 
 
 
 
 
 
 
370
  )
 
371
 
372
- inputs = {k: v.to(self.model.device).unsqueeze(0) for k, v in inputs.items()}
373
-
374
  with torch.inference_mode():
375
- output = self.model.generate_from_batch(
376
- inputs,
377
- generation_config={"max_new_tokens": max_new_tokens, "stop_strings": ["<|endoftext|>"]},
378
- tokenizer=self.processor.tokenizer,
379
  )
380
 
 
381
  generated_tokens = output[0, inputs['input_ids'].size(1):]
382
  generated_text = self.processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
383
 
@@ -389,7 +386,7 @@ class EndpointHandler:
389
 
390
  # Parse points using first image dimensions
391
  if images:
392
- points = self._parse_points(generated_text, images[0].width, images[0].height)
393
  if points:
394
  result["points"] = points
395
  result["num_points"] = len(points)
 
1
  """
2
  Molmo 2 Custom Inference Handler for Hugging Face Inference Endpoints
3
+ Model: allenai/Molmo2-8B
4
 
5
  For ProofPath video assessment - video pointing, tracking, and grounded analysis.
6
  Unique capability: Returns pixel-level coordinates for objects in videos.
 
25
  path: Path to the model directory (ignored - we always load from HF hub)
26
  """
27
  # IMPORTANT: Always load from HF hub, not the repository path
28
+ model_id = "allenai/Molmo2-8B"
29
 
30
  # Determine device
31
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
32
 
33
+ # Load processor and model - Molmo2 uses AutoModelForImageTextToText
34
+ from transformers import AutoProcessor, AutoModelForImageTextToText
35
 
36
  self.processor = AutoProcessor.from_pretrained(
37
  model_id,
38
  trust_remote_code=True,
39
  )
40
 
41
+ self.model = AutoModelForImageTextToText.from_pretrained(
42
  model_id,
43
  trust_remote_code=True,
44
  torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
 
55
  self.default_fps = 2.0
56
 
57
  # Regex patterns for parsing Molmo pointing output
58
+ self.COORD_REGEX = re.compile(r"<(?:points|tracks).*? coords=\"([0-9\t:;, .]+)\"/?>")
59
+ self.FRAME_REGEX = re.compile(r"(?:^|\t|:|,|;)([0-9\.]+) ([0-9\. ]+)")
60
+ self.POINTS_REGEX = re.compile(r"([0-9]+) ([0-9]{3,4}) ([0-9]{3,4})")
61
 
62
+ def _parse_video_points(self, text: str, image_w: int, image_h: int) -> List[Dict]:
63
  """
64
  Extract pointing coordinates from Molmo output.
65
 
66
+ Molmo outputs coordinates in format:
67
+ <points coords="8.5 0 183 216; 8.5 1 245 198"/>
68
+ Where: timestamp instance_id x y (coords scaled by 1000)
69
  """
70
+ all_points = []
71
 
72
+ for coord_match in self.COORD_REGEX.finditer(text):
73
+ for frame_match in self.FRAME_REGEX.finditer(coord_match.group(1)):
74
+ timestamp = float(frame_match.group(1))
75
+
76
+ for point_match in self.POINTS_REGEX.finditer(frame_match.group(2)):
77
+ instance_id = int(point_match.group(1))
78
+ # Coordinates are scaled by 1000
79
+ x = float(point_match.group(2)) / 1000 * image_w
80
+ y = float(point_match.group(3)) / 1000 * image_h
81
+
82
+ if 0 <= x <= image_w and 0 <= y <= image_h:
83
+ all_points.append({
84
+ "timestamp": timestamp,
85
+ "instance_id": instance_id,
86
+ "x": x,
87
+ "y": y
88
+ })
89
+
90
+ return all_points
91
 
92
  def _load_image(self, image_data: Any):
93
  """Load a single image from various formats."""
 
112
  else:
113
  raise ValueError(f"Unsupported image input type: {type(image_data)}")
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
116
  """
117
  Process video or images with Molmo 2.
 
127
  }
128
  }
129
 
130
+ 2. Video analysis:
131
  {
132
  "inputs": <video_url>,
133
  "parameters": {
134
  "prompt": "What happens in this video?",
 
135
  "max_new_tokens": 2048
136
  }
137
  }
 
147
  Returns:
148
  {
149
  "generated_text": "...",
150
+ "points": [{"timestamp": 0, "x": 123, "y": 456, ...}], # If pointing detected
151
  "image_size": {...}
152
  }
153
  """
 
189
 
190
  def _process_image(self, image_data: Any, prompt: str, max_new_tokens: int) -> Dict[str, Any]:
191
  """Process a single image."""
192
+ from PIL import Image
193
+
194
  image = self._load_image(image_data)
195
 
196
+ # Build message in Molmo format
197
+ messages = [
198
+ {
199
+ "role": "user",
200
+ "content": [
201
+ {"type": "image", "image": image},
202
+ {"type": "text", "text": prompt},
203
+ ],
204
+ }
205
+ ]
206
+
207
+ # Apply chat template and process
208
+ inputs = self.processor.apply_chat_template(
209
+ messages,
210
+ tokenize=True,
211
+ add_generation_prompt=True,
212
+ return_tensors="pt",
213
+ return_dict=True,
214
  )
215
+ inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
 
 
216
 
217
  # Generate
218
  with torch.inference_mode():
219
+ output = self.model.generate(
220
+ **inputs,
221
+ max_new_tokens=max_new_tokens,
222
+ do_sample=False,
223
  )
224
 
225
+ # Decode - only new tokens
226
  generated_tokens = output[0, inputs['input_ids'].size(1):]
227
  generated_text = self.processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
228
 
 
232
  }
233
 
234
  # Parse any pointing coordinates
235
+ points = self._parse_video_points(generated_text, image.width, image.height)
236
  if points:
237
  result["points"] = points
238
  result["num_points"] = len(points)
 
246
  params: Dict,
247
  max_new_tokens: int
248
  ) -> Dict[str, Any]:
249
+ """Process video using molmo_utils."""
250
+ from molmo_utils import process_vision_info
 
 
 
251
 
252
+ # Handle video URL or base64
253
+ if isinstance(video_data, str) and video_data.startswith(('http://', 'https://')):
254
+ video_source = video_data
255
+ temp_path = None
256
+ else:
257
+ # Write to temp file
258
+ if isinstance(video_data, str):
259
+ video_bytes = base64.b64decode(video_data)
260
+ else:
261
+ video_bytes = video_data
262
+
263
+ with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as f:
264
+ f.write(video_bytes)
265
+ video_source = f.name
266
+ temp_path = f.name
 
 
 
267
 
268
+ try:
269
+ # Build message
270
+ messages = [
271
+ {
272
+ "role": "user",
273
+ "content": [
274
+ {"type": "text", "text": prompt},
275
+ {"type": "video", "video": video_source},
276
+ ],
277
+ }
278
+ ]
279
+
280
+ # Process video with molmo_utils
281
+ _, videos, video_kwargs = process_vision_info(messages)
282
+ videos, video_metadatas = zip(*videos)
283
+ videos, video_metadatas = list(videos), list(video_metadatas)
284
+
285
+ # Apply chat template
286
+ text = self.processor.apply_chat_template(
287
+ messages,
288
+ tokenize=False,
289
+ add_generation_prompt=True
290
  )
291
+
292
+ # Process inputs
293
+ inputs = self.processor(
294
+ videos=videos,
295
+ video_metadata=video_metadatas,
296
+ text=text,
297
+ padding=True,
298
+ return_tensors="pt",
299
+ **video_kwargs,
300
+ )
301
+ inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
302
+
303
+ # Generate
304
+ with torch.inference_mode():
305
+ output = self.model.generate(
306
+ **inputs,
307
+ max_new_tokens=max_new_tokens,
308
+ do_sample=False,
309
+ )
310
+
311
+ # Decode
312
+ generated_tokens = output[0, inputs['input_ids'].size(1):]
313
+ generated_text = self.processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
314
+
315
+ # Get video dimensions
316
+ video_w = video_metadatas[0].get("width", 1920)
317
+ video_h = video_metadatas[0].get("height", 1080)
318
+
319
+ result = {
320
+ "generated_text": generated_text,
321
+ "video_metadata": {
322
+ "width": video_w,
323
+ "height": video_h,
324
+ }
325
+ }
326
+
327
+ # Parse coordinates
328
+ points = self._parse_video_points(generated_text, video_w, video_h)
329
+ if points:
330
+ result["points"] = points
331
+ result["num_points"] = len(points)
332
+
333
+ return result
334
+
335
+ finally:
336
+ # Clean up temp file
337
+ if temp_path and os.path.exists(temp_path):
338
+ os.unlink(temp_path)
339
 
340
  def _process_multi_image(
341
  self,
 
344
  max_new_tokens: int
345
  ) -> Dict[str, Any]:
346
  """Process multiple images."""
347
+ from PIL import Image
348
+
349
  images = [self._load_image(img) for img in images_data]
350
 
351
+ # Build content with all images
352
+ content = []
353
+ for image in images:
354
+ content.append({"type": "image", "image": image})
355
+ content.append({"type": "text", "text": prompt})
356
+
357
+ messages = [{"role": "user", "content": content}]
358
+
359
+ # Apply chat template
360
+ inputs = self.processor.apply_chat_template(
361
+ messages,
362
+ tokenize=True,
363
+ add_generation_prompt=True,
364
+ return_tensors="pt",
365
+ return_dict=True,
366
  )
367
+ inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
368
 
369
+ # Generate
 
370
  with torch.inference_mode():
371
+ output = self.model.generate(
372
+ **inputs,
373
+ max_new_tokens=max_new_tokens,
374
+ do_sample=False,
375
  )
376
 
377
+ # Decode
378
  generated_tokens = output[0, inputs['input_ids'].size(1):]
379
  generated_text = self.processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
380
 
 
386
 
387
  # Parse points using first image dimensions
388
  if images:
389
+ points = self._parse_video_points(generated_text, images[0].width, images[0].height)
390
  if points:
391
  result["points"] = points
392
  result["num_points"] = len(points)