Update processing_prismatic.py

processing_prismatic.py (+74 −17)
@@ -15,12 +15,21 @@ from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor
 from transformers import PreTrainedTokenizerBase
 from transformers.image_processing_utils import BatchFeature, ImageProcessingMixin
 from transformers.processing_utils import ProcessorMixin
-from transformers.tokenization_utils import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+from transformers.tokenization_utils import (
+    PaddingStrategy,
+    PreTokenizedInput,
+    TextInput,
+    TruncationStrategy,
+)
 from transformers.utils import TensorType
 
+from .gripper_position import get_gripper_pos_raw
+
 
 # === Image Processing ===
-def letterbox_pad_transform(image: Image.Image, padding_fill_value: Tuple[int, int, int]) -> Image.Image:
+def letterbox_pad_transform(
+    image: Image.Image, padding_fill_value: Tuple[int, int, int]
+) -> Image.Image:
     """Given a PIL.Image, pad to square by adding a symmetric border around the height/width."""
     (w, h), max_wh = image.size, max(image.size)
     horizontal_pad, vertical_pad = int((max_wh - w) / 2), int((max_wh - h) / 2)
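For intuition, `letterbox_pad_transform` squares off a frame by splitting the size difference into a symmetric border. A minimal sketch, assuming the function's elided tail returns the padded image as its docstring states:

    from PIL import Image

    # Assumes this module is importable under its file name.
    from processing_prismatic import letterbox_pad_transform

    # 640x480 landscape frame -> (640 - 480) // 2 = 80px of fill above and below.
    img = Image.new("RGB", (640, 480), color=(200, 30, 30))
    padded = letterbox_pad_transform(img, padding_fill_value=(127, 127, 127))
    assert padded.size == (640, 640)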
@@ -62,10 +71,19 @@ class PrismaticImageProcessor(ImageProcessingMixin):
         stds = [(0.5, 0.5, 0.5)] if stds is None else stds
 
         # TIMM `data_cfg` Parameters
-        self.input_sizes, self.interpolations, self.means, self.stds = input_sizes, interpolations, means, stds
+        self.input_sizes, self.interpolations, self.means, self.stds = (
+            input_sizes,
+            interpolations,
+            means,
+            stds,
+        )
 
         # Grab torchvision transforms via TIMM =>> need to parse for specific "functional" transform values!
-        self.tvf_resize_params, self.tvf_crop_params, self.tvf_normalize_params = [], [], []
+        self.tvf_resize_params, self.tvf_crop_params, self.tvf_normalize_params = (
+            [],
+            [],
+            [],
+        )
         self.tvf_do_letterbox, self.tvf_letterbox_fill = False, None
 
         for idx in range(len(input_sizes)):
@@ -90,11 +108,17 @@ class PrismaticImageProcessor(ImageProcessingMixin):
                 and (transform.transforms[0].size == self.input_sizes[idx][-1])
                 and (transform.transforms[1].size == self.input_sizes[idx][-2:])
             ):
-                raise ValueError(f"Unexpected TIMM image transformation structure/sizes: `{transform}`")
+                raise ValueError(
+                    f"Unexpected TIMM image transformation structure/sizes: `{transform}`"
+                )
 
             # HF Image Processors *must* be JSON-serializable; as such, cannot have torchvision. as an attribute.
             # => Instead, we're going to parse the transform and call "torchvision.transforms.functional" (`tvf`)
-            resize_t, crop_t, norm_t = transform.transforms[0], transform.transforms[1], transform.transforms[3]
+            resize_t, crop_t, norm_t = (
+                transform.transforms[0],
+                transform.transforms[1],
+                transform.transforms[3],
+            )
             self.tvf_resize_params.append(
                 {
                     "size": resize_t.size,
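Storing plain dicts of kwargs, rather than the torchvision transform objects themselves, is what keeps the processor JSON-serializable; at preprocessing time those kwargs are replayed through `torchvision.transforms.functional`. A minimal sketch of that replay, mirroring what `apply_transform` (not shown in this diff) presumably does, so treat the body as an assumption:

    import torch
    import torchvision.transforms.functional as TVF

    def replay_transform(processor, img):
        """Sketch: re-run the parsed TIMM pipeline functionally (Resize -> CenterCrop -> ToTensor -> Normalize)."""
        # Optional letterbox pad to square before the TIMM pipeline
        # (assumes `letterbox_pad_transform` from this module is in scope).
        if processor.tvf_do_letterbox:
            img = letterbox_pad_transform(img, processor.tvf_letterbox_fill)

        imgs_t = []
        for idx in range(len(processor.input_sizes)):
            img_idx = TVF.resize(img, **processor.tvf_resize_params[idx])
            img_idx = TVF.center_crop(img_idx, **processor.tvf_crop_params[idx])
            img_t = TVF.normalize(TVF.to_tensor(img_idx), **processor.tvf_normalize_params[idx])
            imgs_t.append(img_t)

        # One tensor per backbone, stacked along the channel dim (e.g., [6, H, W] for two backbones)
        return torch.cat(imgs_t, dim=0)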
@@ -117,11 +141,15 @@ class PrismaticImageProcessor(ImageProcessingMixin):
             if self.image_resize_strategy == "resize-naive":
                 self.tvf_resize_params[idx]["size"] = (resize_t.size, resize_t.size)
             elif self.image_resize_strategy == "letterbox":
-                self.tvf_do_letterbox, self.tvf_letterbox_fill = True, tuple([int(x * 255) for x in self.means[idx]])
+                self.tvf_do_letterbox, self.tvf_letterbox_fill = True, tuple(
+                    [int(x * 255) for x in self.means[idx]]
+                )
             elif self.image_resize_strategy == "resize-crop":
                 pass
             else:
-                raise ValueError(f"Image resize strategy `{self.image_resize_strategy}` is not supported!")
+                raise ValueError(
+                    f"Image resize strategy `{self.image_resize_strategy}` is not supported!"
+                )
 
         # Dispatch **kwargs to super()
         super().__init__(**kwargs)
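The letterbox fill color is derived from the normalization mean so that padded borders land near zero after `Normalize`. With the default mean of `(0.5, 0.5, 0.5)`:

    # int(0.5 * 255) == 127, so the default letterbox fill is mid-gray...
    fill = tuple(int(x * 255) for x in (0.5, 0.5, 0.5))
    assert fill == (127, 127, 127)  # ...which Normalize(mean=0.5, std=0.5) maps to ~0.0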
@@ -164,12 +192,19 @@ class PrismaticImageProcessor(ImageProcessingMixin):
             images = [images]
 
         # Apply `self.img_transform` to each image (will return list of torch.Tensors); stack into "batched" Tensor
-        pixel_values = torch.stack([self.apply_transform(img.convert("RGB")) for img in images])
+        pixel_values = torch.stack(
+            [self.apply_transform(img.convert("RGB")) for img in images]
+        )
 
         # Return BatchFeature =>> note that for compatibility, constructor expects Dict[str, np.ndarray], so we convert
-        return BatchFeature(data={"pixel_values": pixel_values.float().numpy()}, tensor_type=return_tensors)
+        return BatchFeature(
+            data={"pixel_values": pixel_values.float().numpy()},
+            tensor_type=return_tensors,
+        )
 
-    def __call__(self, images: Union[Image.Image, List[Image.Image]], **kwargs) -> BatchFeature:
+    def __call__(
+        self, images: Union[Image.Image, List[Image.Image]], **kwargs
+    ) -> BatchFeature:
         return self.preprocess(images, **kwargs)
 
 
@@ -189,7 +224,9 @@ class PrismaticProcessor(ProcessorMixin):
 
     def __call__(
         self,
-        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
+        text: Union[
+            TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
+        ],
         images: Union[Image.Image, List[Image.Image]],
         padding: Union[bool, str, PaddingStrategy] = False,
         truncation: Optional[Union[bool, str, TruncationStrategy]] = None,
@@ -209,21 +246,31 @@ class PrismaticProcessor(ProcessorMixin):
 
         @return: BatchFeature with keys for `input_ids`, `attention_mask` and `pixel_values`.
         """
-        pixel_values = self.image_processor(images, return_tensors=return_tensors)["pixel_values"]
+        pixel_values = self.image_processor(images, return_tensors=return_tensors)[
+            "pixel_values"
+        ]
         text_inputs = self.tokenizer(
-            text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
+            text,
+            return_tensors=return_tensors,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
         )
 
         # [Validate] Need same number of images and text inputs!
         if pixel_values.shape[0] != text_inputs.input_ids.shape[0]:
-            raise ValueError("Batch is malformed; expected same number of images and text inputs!")
+            raise ValueError(
+                "Batch is malformed; expected same number of images and text inputs!"
+            )
 
         return BatchFeature(data={**text_inputs, "pixel_values": pixel_values})
 
     # === Tokenizer Dispatch Utilities =>> check `PreTrainedTokenizerBase` for documentation ===
     def batch_decode(
         self,
-        sequences: Union[List[int], List[List[int]], torch.Tensor, Any],  # `Any` = np.ndarray | tf.Tensor
+        sequences: Union[
+            List[int], List[List[int]], torch.Tensor, Any
+        ],  # `Any` = np.ndarray | tf.Tensor
         skip_special_tokens: bool = False,
         clean_up_tokenization_spaces: Optional[bool] = None,
         **kwargs: str,
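Taken together, `PrismaticProcessor.__call__` tokenizes the text, preprocesses the images, and enforces a one-to-one pairing between the two. A usage sketch; the checkpoint name is an assumption, so substitute whichever repo ships this processor:

    from PIL import Image
    from transformers import AutoProcessor

    # "openvla/openvla-7b" is illustrative -- any checkpoint bundling this processor works.
    processor = AutoProcessor.from_pretrained("openvla/openvla-7b", trust_remote_code=True)

    inputs = processor(
        text="In: What action should the robot take to pick up the block?\nOut:",
        images=Image.open("frame.png").convert("RGB"),
        return_tensors="pt",
    )
    print(inputs["input_ids"].shape)     # (1, seq_len)
    print(inputs["pixel_values"].shape)  # (1, C, H, W)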
@@ -237,7 +284,9 @@ class PrismaticProcessor(ProcessorMixin):
 
     def decode(
         self,
-        token_ids: Union[int, List[int], torch.Tensor, Any],  # `Any` = np.ndarray | tf.Tensor
+        token_ids: Union[
+            int, List[int], torch.Tensor, Any
+        ],  # `Any` = np.ndarray | tf.Tensor
         skip_special_tokens: bool = False,
         clean_up_tokenization_spaces: Optional[bool] = None,
         **kwargs: str,
@@ -255,3 +304,11 @@ class PrismaticProcessor(ProcessorMixin):
         image_processor_input_names = self.image_processor.model_input_names
 
         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+    def get_prompt(self, task_label, image: Image.Image):
+        image = image.convert("RGB")
+        image = image.resize((224, 224))
+        gripper_pos, mask, prediction = get_gripper_pos_raw(image)
+
+        prompt = f"In: What action should the robot take to achieve the instruction\nINSTRUCTION: \n{task_label}\nCURRENT GRIPPER: {list(gripper_pos)}\n\nOut:"
+        return prompt, image
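The new `get_prompt` helper is the substantive addition in this commit: it normalizes the frame to 224x224 RGB, localizes the gripper via `get_gripper_pos_raw`, and splices the detected position into the instruction prompt. A usage sketch reusing the `processor` from the sketch above, and assuming `get_gripper_pos_raw` yields a coordinate sequence as its first return value (the coordinate values below are illustrative):

    from PIL import Image

    frame = Image.open("frame.png")
    prompt, resized_frame = processor.get_prompt("pick up the red block", frame)

    # `prompt` now embeds the detected gripper position, e.g.:
    #   In: What action should the robot take to achieve the instruction
    #   INSTRUCTION:
    #   pick up the red block
    #   CURRENT GRIPPER: [112, 87]
    #
    #   Out:
    assert resized_frame.size == (224, 224)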