from typing import Optional, Union

import numpy as np
import torch
from PIL import Image
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from transformers.utils import TensorType


class PrivateDetectorImageProcessor(BaseImageProcessor):
    """Letterbox images to a square canvas and scale pixels to roughly [-1, 1].

    Each image is resized (aspect ratio preserved) so that its longer side
    equals ``size``, centred on a ``size`` x ``size`` gray (128) canvas, mapped
    with ``(x - 128) / 128`` and returned channels-first as ``float32``.
    """

    model_input_names = ["pixel_values"]

    def __init__(self, size=480, **kwargs):
        """Store the side length (in pixels) of the square output canvas."""
        super().__init__(**kwargs)
        self.size = size

    @staticmethod
    def _to_pil(img):
        """Convert a PIL image, numpy array or torch tensor to an RGB PIL image.

        Arrays are cast to ``uint8`` without rescaling, so array/tensor inputs
        are expected to already hold values in the 0-255 range.

        Raises:
            ValueError: if ``img`` is none of the supported types.
        """
        if not isinstance(img, Image.Image):
            if isinstance(img, torch.Tensor):
                # detach()/cpu() make tensors that require grad or live on an
                # accelerator convertible; a bare .numpy() raises for both.
                img = img.detach().cpu().numpy()
            if not isinstance(img, np.ndarray):
                raise ValueError("Unsupported image type")
            # A leading axis of length 3 is treated as channels-first. The
            # ndim guard keeps 2-D grayscale arrays that happen to have three
            # rows from being transposed (which would raise).
            if img.ndim == 3 and img.shape[0] == 3:
                img = img.transpose(1, 2, 0)
            img = Image.fromarray(img.astype(np.uint8))
        # Normalise the mode up front: palette and 1-bit images would otherwise
        # be resized with nearest-neighbour regardless of the requested filter.
        if img.mode != "RGB":
            img = img.convert("RGB")
        return img

    def _letterbox(self, img):
        """Resize ``img`` to fit inside the square canvas and pad with gray."""
        width, height = img.size
        scale = self.size / max(width, height)
        # Clamp to 1 px so extreme aspect ratios cannot round a side to 0.
        new_w = max(1, int(round(width * scale)))
        new_h = max(1, int(round(height * scale)))
        # NOTE(review): the original author intended this to match TF's
        # bilinear resize; that equivalence is not verified here.
        resized = img.resize((new_w, new_h), Image.Resampling.BILINEAR)

        # Centre the resized image; any odd leftover pixel goes right/bottom.
        left = (self.size - new_w) // 2
        top = (self.size - new_h) // 2
        canvas = Image.new("RGB", (self.size, self.size), (128, 128, 128))
        canvas.paste(resized, (left, top))
        return canvas

    def preprocess(
        self,
        images,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ):
        """Turn one or more images into model-ready ``pixel_values``.

        Args:
            images: a single image or a list/tuple of images. Each image may be
                a ``PIL.Image.Image``, a ``numpy.ndarray`` or a
                ``torch.Tensor``; 3-D arrays whose first axis has length 3 are
                treated as channels-first.
            return_tensors: forwarded to ``BatchFeature`` as ``tensor_type``.
            **kwargs: accepted for interface compatibility; unused.

        Returns:
            A ``BatchFeature`` whose ``"pixel_values"`` entry holds one
            ``(3, size, size)`` ``float32`` array per input image.

        Raises:
            ValueError: if an image is of an unsupported type.
        """
        if not isinstance(images, (list, tuple)):
            images = [images]

        processed_images = []
        for img in images:
            canvas = self._letterbox(self._to_pil(img))
            # uint8 0..255 -> float32 in [-1, 127/128]; the gray padding maps to 0.
            arr = (np.array(canvas, dtype=np.float32) - 128.0) / 128.0
            # HWC -> CHW
            processed_images.append(np.transpose(arr, (2, 0, 1)))

        data = {"pixel_values": processed_images}
        return BatchFeature(data=data, tensor_type=return_tensors)