code compatibility with python3.9
Browse files- processing_maira2.py +48 -48
processing_maira2.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
| 3 |
|
| 4 |
|
| 5 |
import re
|
| 6 |
-
from typing import Any,
|
| 7 |
|
| 8 |
import numpy as np
|
| 9 |
from PIL import Image
|
|
@@ -14,9 +14,9 @@ from transformers.image_utils import ImageInput, get_image_size, to_numpy_array
|
|
| 14 |
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order
|
| 15 |
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
| 16 |
|
| 17 |
-
SingleChatMessageType: TypeAlias = dict[str, str | int | None]
|
| 18 |
-
ChatMessageListType: TypeAlias = list[dict[str, str | list[SingleChatMessageType]]]
|
| 19 |
-
BoxType: TypeAlias = tuple[float, float, float, float]
|
| 20 |
|
| 21 |
|
| 22 |
class Maira2Processor(LlavaProcessor):
|
|
@@ -55,9 +55,9 @@ class Maira2Processor(LlavaProcessor):
|
|
| 55 |
self,
|
| 56 |
image_processor: BaseImageProcessor = None,
|
| 57 |
tokenizer: PreTrainedTokenizer = None,
|
| 58 |
-
patch_size
|
| 59 |
-
vision_feature_select_strategy
|
| 60 |
-
chat_template
|
| 61 |
image_token: str = "<image>",
|
| 62 |
phrase_start_token: str = "<obj>",
|
| 63 |
phrase_end_token: str = "</obj>",
|
|
@@ -106,9 +106,9 @@ class Maira2Processor(LlavaProcessor):
|
|
| 106 |
def _normalize_and_stack_images(
|
| 107 |
self,
|
| 108 |
current_frontal: Image.Image,
|
| 109 |
-
current_lateral: Image.Image
|
| 110 |
-
prior_frontal: Image.Image
|
| 111 |
-
)
|
| 112 |
"""
|
| 113 |
This function normalizes the input images and stacks them together. The images are stacked in the order of
|
| 114 |
current_frontal, current_lateral, and prior_frontal. The order of images is important, since it must match the
|
|
@@ -133,7 +133,7 @@ class Maira2Processor(LlavaProcessor):
|
|
| 133 |
return images
|
| 134 |
|
| 135 |
@staticmethod
|
| 136 |
-
def _get_section_text_or_missing_text(section: str
|
| 137 |
"""
|
| 138 |
This function returns the input section text if it is not None and not empty, otherwise it returns a missing
|
| 139 |
section text "N/A".
|
|
@@ -151,7 +151,7 @@ class Maira2Processor(LlavaProcessor):
|
|
| 151 |
return section
|
| 152 |
|
| 153 |
@staticmethod
|
| 154 |
-
def _construct_image_chat_messages_for_reporting(has_prior: bool, has_lateral: bool)
|
| 155 |
"""
|
| 156 |
This function constructs user chat messages based on the presence of the prior and lateral images.
|
| 157 |
|
|
@@ -187,7 +187,7 @@ class Maira2Processor(LlavaProcessor):
|
|
| 187 |
]
|
| 188 |
)
|
| 189 |
|
| 190 |
-
image_prompt
|
| 191 |
image_index = 0
|
| 192 |
if not has_prior and not has_lateral:
|
| 193 |
_add_single_image_to_chat_messages("Given the current frontal image only", image_index)
|
|
@@ -208,13 +208,13 @@ class Maira2Processor(LlavaProcessor):
|
|
| 208 |
self,
|
| 209 |
has_prior: bool,
|
| 210 |
has_lateral: bool,
|
| 211 |
-
indication: str
|
| 212 |
-
technique: str
|
| 213 |
-
comparison: str
|
| 214 |
-
prior_report: str
|
| 215 |
get_grounding: bool = False,
|
| 216 |
-
assistant_text: str
|
| 217 |
-
)
|
| 218 |
"""
|
| 219 |
This function constructs the chat messages for reporting used in the grounded and non-grounded reporting tasks.
|
| 220 |
|
|
@@ -299,14 +299,14 @@ class Maira2Processor(LlavaProcessor):
|
|
| 299 |
"type": "text",
|
| 300 |
}
|
| 301 |
)
|
| 302 |
-
messages
|
| 303 |
if assistant_text is not None:
|
| 304 |
messages.append({"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"})
|
| 305 |
return messages
|
| 306 |
|
| 307 |
def _construct_chat_messages_phrase_grounding(
|
| 308 |
-
self, phrase: str, assistant_text: str
|
| 309 |
-
)
|
| 310 |
"""
|
| 311 |
This function constructs the chat messages for phrase grounding used in the phrase grounding task.
|
| 312 |
|
|
@@ -319,7 +319,7 @@ class Maira2Processor(LlavaProcessor):
|
|
| 319 |
Returns:
|
| 320 |
ChatMessageListType: The chat messages for phrase grounding in the form of a list of dictionaries.
|
| 321 |
"""
|
| 322 |
-
prompt
|
| 323 |
{"index": None, "text": "Given the current frontal image", "type": "text"},
|
| 324 |
{"index": 0, "text": None, "type": "image"},
|
| 325 |
{
|
|
@@ -329,7 +329,7 @@ class Maira2Processor(LlavaProcessor):
|
|
| 329 |
"type": "text",
|
| 330 |
},
|
| 331 |
]
|
| 332 |
-
messages
|
| 333 |
if assistant_text is not None:
|
| 334 |
messages.append({"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"})
|
| 335 |
return messages
|
|
@@ -337,15 +337,15 @@ class Maira2Processor(LlavaProcessor):
|
|
| 337 |
def format_reporting_input(
|
| 338 |
self,
|
| 339 |
current_frontal: Image.Image,
|
| 340 |
-
current_lateral: Image.Image
|
| 341 |
-
prior_frontal: Image.Image
|
| 342 |
-
indication: str
|
| 343 |
-
technique: str
|
| 344 |
-
comparison: str
|
| 345 |
-
prior_report: str
|
| 346 |
get_grounding: bool = False,
|
| 347 |
-
assistant_text: str
|
| 348 |
-
)
|
| 349 |
"""
|
| 350 |
This function formats the reporting prompt for the grounded and non-grounded reporting tasks from the given
|
| 351 |
input images and text sections. The images are normalized and stacked together in the right order.
|
|
@@ -395,8 +395,8 @@ class Maira2Processor(LlavaProcessor):
|
|
| 395 |
self,
|
| 396 |
frontal_image: Image.Image,
|
| 397 |
phrase: str,
|
| 398 |
-
assistant_text: str
|
| 399 |
-
)
|
| 400 |
"""
|
| 401 |
This function formats the phrase grounding prompt for the phrase grounding task from the given input
|
| 402 |
image and phrase.
|
|
@@ -425,14 +425,14 @@ class Maira2Processor(LlavaProcessor):
|
|
| 425 |
def format_and_preprocess_reporting_input(
|
| 426 |
self,
|
| 427 |
current_frontal: Image.Image,
|
| 428 |
-
current_lateral: Image.Image
|
| 429 |
-
prior_frontal: Image.Image
|
| 430 |
-
indication: str
|
| 431 |
-
technique: str
|
| 432 |
-
comparison: str
|
| 433 |
-
prior_report: str
|
| 434 |
get_grounding: bool = False,
|
| 435 |
-
assistant_text: str
|
| 436 |
**kwargs: Any,
|
| 437 |
) -> BatchFeature:
|
| 438 |
"""
|
|
@@ -481,7 +481,7 @@ class Maira2Processor(LlavaProcessor):
|
|
| 481 |
self,
|
| 482 |
frontal_image: Image.Image,
|
| 483 |
phrase: str,
|
| 484 |
-
assistant_text: str
|
| 485 |
**kwargs: Any,
|
| 486 |
) -> BatchFeature:
|
| 487 |
"""
|
|
@@ -507,7 +507,7 @@ class Maira2Processor(LlavaProcessor):
|
|
| 507 |
)
|
| 508 |
return self(text=text, images=images, **kwargs)
|
| 509 |
|
| 510 |
-
def _get_text_between_delimiters(self, text: str, begin_token: str, end_token: str)
|
| 511 |
"""
|
| 512 |
This function splits the input text into a list of substrings based on the given begin and end tokens.
|
| 513 |
|
|
@@ -544,7 +544,7 @@ class Maira2Processor(LlavaProcessor):
|
|
| 544 |
|
| 545 |
def convert_output_to_plaintext_or_grounded_sequence(
|
| 546 |
self, text: str
|
| 547 |
-
)
|
| 548 |
"""
|
| 549 |
This function converts the input text to a grounded sequence by extracting the grounded phrases and bounding
|
| 550 |
boxes from the text. If the text is plaintext without any grounded phrases, it returns the text as is.
|
|
@@ -584,7 +584,7 @@ class Maira2Processor(LlavaProcessor):
|
|
| 584 |
|
| 585 |
# One or more grounded phrases
|
| 586 |
grounded_phrase_texts = self._get_text_between_delimiters(text, self.phrase_start_token, self.phrase_end_token)
|
| 587 |
-
grounded_phrases
|
| 588 |
for grounded_phrase_text in grounded_phrase_texts:
|
| 589 |
if self.box_start_token in grounded_phrase_text or self.box_end_token in grounded_phrase_text:
|
| 590 |
first_box_start_index = grounded_phrase_text.find(self.box_start_token)
|
|
@@ -593,14 +593,14 @@ class Maira2Processor(LlavaProcessor):
|
|
| 593 |
boxes_text_list = self._get_text_between_delimiters(
|
| 594 |
boxes_text, self.box_start_token, self.box_end_token
|
| 595 |
)
|
| 596 |
-
boxes
|
| 597 |
for box_text in boxes_text_list:
|
| 598 |
# extract from <x_><y_><x_><y_>
|
| 599 |
regex = r"<x(\d+?)><y(\d+?)><x(\d+?)><y(\d+?)>"
|
| 600 |
match = re.search(regex, box_text)
|
| 601 |
if match:
|
| 602 |
x_min, y_min, x_max, y_max = match.groups()
|
| 603 |
-
box
|
| 604 |
(int(coord) + 0.5) / self.num_box_coord_bins for coord in (x_min, y_min, x_max, y_max)
|
| 605 |
)
|
| 606 |
assert all(0 <= coord <= 1 for coord in box), f"Invalid box coordinates: {box}"
|
|
@@ -613,7 +613,7 @@ class Maira2Processor(LlavaProcessor):
|
|
| 613 |
return grounded_phrases
|
| 614 |
|
| 615 |
@staticmethod
|
| 616 |
-
def adjust_box_for_original_image_size(box
|
| 617 |
"""
|
| 618 |
This function adjusts the bounding boxes from the MAIRA-2 model output to account for the image processor
|
| 619 |
cropping the image to be square prior to the model forward pass. The box coordinates are adjusted to be
|
|
|
|
| 3 |
|
| 4 |
|
| 5 |
import re
|
| 6 |
+
from typing import Any, Union, List
|
| 7 |
|
| 8 |
import numpy as np
|
| 9 |
from PIL import Image
|
|
|
|
| 14 |
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order
|
| 15 |
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
| 16 |
|
| 17 |
+
# SingleChatMessageType: TypeAlias = dict[str, str | int | None]
|
| 18 |
+
# ChatMessageListType: TypeAlias = list[dict[str, str | list[SingleChatMessageType]]]
|
| 19 |
+
# BoxType: TypeAlias = tuple[float, float, float, float]
|
| 20 |
|
| 21 |
|
| 22 |
class Maira2Processor(LlavaProcessor):
|
|
|
|
| 55 |
self,
|
| 56 |
image_processor: BaseImageProcessor = None,
|
| 57 |
tokenizer: PreTrainedTokenizer = None,
|
| 58 |
+
patch_size = None,
|
| 59 |
+
vision_feature_select_strategy = None,
|
| 60 |
+
chat_template = None,
|
| 61 |
image_token: str = "<image>",
|
| 62 |
phrase_start_token: str = "<obj>",
|
| 63 |
phrase_end_token: str = "</obj>",
|
|
|
|
| 106 |
def _normalize_and_stack_images(
|
| 107 |
self,
|
| 108 |
current_frontal: Image.Image,
|
| 109 |
+
current_lateral: Image.Image,
|
| 110 |
+
prior_frontal: Image.Image,
|
| 111 |
+
):
|
| 112 |
"""
|
| 113 |
This function normalizes the input images and stacks them together. The images are stacked in the order of
|
| 114 |
current_frontal, current_lateral, and prior_frontal. The order of images is important, since it must match the
|
|
|
|
| 133 |
return images
|
| 134 |
|
| 135 |
@staticmethod
|
| 136 |
+
def _get_section_text_or_missing_text(section: str) -> str:
|
| 137 |
"""
|
| 138 |
This function returns the input section text if it is not None and not empty, otherwise it returns a missing
|
| 139 |
section text "N/A".
|
|
|
|
| 151 |
return section
|
| 152 |
|
| 153 |
@staticmethod
|
| 154 |
+
def _construct_image_chat_messages_for_reporting(has_prior: bool, has_lateral: bool):
|
| 155 |
"""
|
| 156 |
This function constructs user chat messages based on the presence of the prior and lateral images.
|
| 157 |
|
|
|
|
| 187 |
]
|
| 188 |
)
|
| 189 |
|
| 190 |
+
image_prompt = []
|
| 191 |
image_index = 0
|
| 192 |
if not has_prior and not has_lateral:
|
| 193 |
_add_single_image_to_chat_messages("Given the current frontal image only", image_index)
|
|
|
|
| 208 |
self,
|
| 209 |
has_prior: bool,
|
| 210 |
has_lateral: bool,
|
| 211 |
+
indication: str,
|
| 212 |
+
technique: str,
|
| 213 |
+
comparison: str,
|
| 214 |
+
prior_report: str,
|
| 215 |
get_grounding: bool = False,
|
| 216 |
+
assistant_text: str = None,
|
| 217 |
+
):
|
| 218 |
"""
|
| 219 |
This function constructs the chat messages for reporting used in the grounded and non-grounded reporting tasks.
|
| 220 |
|
|
|
|
| 299 |
"type": "text",
|
| 300 |
}
|
| 301 |
)
|
| 302 |
+
messages = [{"content": prompt, "role": "user"}]
|
| 303 |
if assistant_text is not None:
|
| 304 |
messages.append({"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"})
|
| 305 |
return messages
|
| 306 |
|
| 307 |
def _construct_chat_messages_phrase_grounding(
|
| 308 |
+
self, phrase: str, assistant_text: str = None
|
| 309 |
+
):
|
| 310 |
"""
|
| 311 |
This function constructs the chat messages for phrase grounding used in the phrase grounding task.
|
| 312 |
|
|
|
|
| 319 |
Returns:
|
| 320 |
ChatMessageListType: The chat messages for phrase grounding in the form of a list of dictionaries.
|
| 321 |
"""
|
| 322 |
+
prompt = [
|
| 323 |
{"index": None, "text": "Given the current frontal image", "type": "text"},
|
| 324 |
{"index": 0, "text": None, "type": "image"},
|
| 325 |
{
|
|
|
|
| 329 |
"type": "text",
|
| 330 |
},
|
| 331 |
]
|
| 332 |
+
messages = [{"content": prompt, "role": "user"}]
|
| 333 |
if assistant_text is not None:
|
| 334 |
messages.append({"content": [{"index": None, "text": assistant_text, "type": "text"}], "role": "assistant"})
|
| 335 |
return messages
|
|
|
|
| 337 |
def format_reporting_input(
|
| 338 |
self,
|
| 339 |
current_frontal: Image.Image,
|
| 340 |
+
current_lateral: Image.Image,
|
| 341 |
+
prior_frontal: Image.Image,
|
| 342 |
+
indication: str,
|
| 343 |
+
technique: str,
|
| 344 |
+
comparison: str,
|
| 345 |
+
prior_report: str,
|
| 346 |
get_grounding: bool = False,
|
| 347 |
+
assistant_text: str = None,
|
| 348 |
+
):
|
| 349 |
"""
|
| 350 |
This function formats the reporting prompt for the grounded and non-grounded reporting tasks from the given
|
| 351 |
input images and text sections. The images are normalized and stacked together in the right order.
|
|
|
|
| 395 |
self,
|
| 396 |
frontal_image: Image.Image,
|
| 397 |
phrase: str,
|
| 398 |
+
assistant_text: str = None,
|
| 399 |
+
):
|
| 400 |
"""
|
| 401 |
This function formats the phrase grounding prompt for the phrase grounding task from the given input
|
| 402 |
image and phrase.
|
|
|
|
| 425 |
def format_and_preprocess_reporting_input(
|
| 426 |
self,
|
| 427 |
current_frontal: Image.Image,
|
| 428 |
+
current_lateral: Image.Image,
|
| 429 |
+
prior_frontal: Image.Image,
|
| 430 |
+
indication: str,
|
| 431 |
+
technique: str,
|
| 432 |
+
comparison: str,
|
| 433 |
+
prior_report: str,
|
| 434 |
get_grounding: bool = False,
|
| 435 |
+
assistant_text: str = None,
|
| 436 |
**kwargs: Any,
|
| 437 |
) -> BatchFeature:
|
| 438 |
"""
|
|
|
|
| 481 |
self,
|
| 482 |
frontal_image: Image.Image,
|
| 483 |
phrase: str,
|
| 484 |
+
assistant_text: str = None,
|
| 485 |
**kwargs: Any,
|
| 486 |
) -> BatchFeature:
|
| 487 |
"""
|
|
|
|
| 507 |
)
|
| 508 |
return self(text=text, images=images, **kwargs)
|
| 509 |
|
| 510 |
+
def _get_text_between_delimiters(self, text: str, begin_token: str, end_token: str):
|
| 511 |
"""
|
| 512 |
This function splits the input text into a list of substrings based on the given begin and end tokens.
|
| 513 |
|
|
|
|
| 544 |
|
| 545 |
def convert_output_to_plaintext_or_grounded_sequence(
|
| 546 |
self, text: str
|
| 547 |
+
):
|
| 548 |
"""
|
| 549 |
This function converts the input text to a grounded sequence by extracting the grounded phrases and bounding
|
| 550 |
boxes from the text. If the text is plaintext without any grounded phrases, it returns the text as is.
|
|
|
|
| 584 |
|
| 585 |
# One or more grounded phrases
|
| 586 |
grounded_phrase_texts = self._get_text_between_delimiters(text, self.phrase_start_token, self.phrase_end_token)
|
| 587 |
+
grounded_phrases = []
|
| 588 |
for grounded_phrase_text in grounded_phrase_texts:
|
| 589 |
if self.box_start_token in grounded_phrase_text or self.box_end_token in grounded_phrase_text:
|
| 590 |
first_box_start_index = grounded_phrase_text.find(self.box_start_token)
|
|
|
|
| 593 |
boxes_text_list = self._get_text_between_delimiters(
|
| 594 |
boxes_text, self.box_start_token, self.box_end_token
|
| 595 |
)
|
| 596 |
+
boxes = []
|
| 597 |
for box_text in boxes_text_list:
|
| 598 |
# extract from <x_><y_><x_><y_>
|
| 599 |
regex = r"<x(\d+?)><y(\d+?)><x(\d+?)><y(\d+?)>"
|
| 600 |
match = re.search(regex, box_text)
|
| 601 |
if match:
|
| 602 |
x_min, y_min, x_max, y_max = match.groups()
|
| 603 |
+
box = tuple( # type: ignore[assignment]
|
| 604 |
(int(coord) + 0.5) / self.num_box_coord_bins for coord in (x_min, y_min, x_max, y_max)
|
| 605 |
)
|
| 606 |
assert all(0 <= coord <= 1 for coord in box), f"Invalid box coordinates: {box}"
|
|
|
|
| 613 |
return grounded_phrases
|
| 614 |
|
| 615 |
@staticmethod
|
| 616 |
+
def adjust_box_for_original_image_size(box, width: int, height: int):
|
| 617 |
"""
|
| 618 |
This function adjusts the bounding boxes from the MAIRA-2 model output to account for the image processor
|
| 619 |
cropping the image to be square prior to the model forward pass. The box coordinates are adjusted to be
|