from typing import Any, Dict, List, Optional, Union

import numpy as np
import torch
from PIL import Image
from transformers import ImageProcessingMixin


def _to_rgb(img: Image.Image) -> Image.Image:
    """Return *img* in RGB mode, converting only when its mode differs."""
    if img.mode != "RGB":
        return img.convert("RGB")
    return img


class UpscalerImageProcessor(ImageProcessingMixin):
    """
    Minimal processor:
      - input: PIL or list of PIL
      - output: pixel_values float32 in [0,1], shape (B,3,H,W)

    No ImageNet normalization (recommended for SR trained on [0,1]).
    No resizing or padding is performed, so every image in one call must
    have the same width and height in order to be stacked into a batch.
    """

    model_input_names = ["pixel_values"]

    def __init__(self, **kwargs):
        """Forward all keyword arguments to ``ImageProcessingMixin``."""
        super().__init__(**kwargs)

    def _pil_to_tensor_01(self, img: Image.Image) -> torch.Tensor:
        """Convert one PIL image to a contiguous float32 tensor of shape (3,H,W).

        The image is converted to RGB first, then scaled from 0..255 to [0,1].
        """
        img = _to_rgb(img)
        # np.array (not np.asarray) makes a writable copy, which
        # torch.from_numpy can wrap without a read-only-buffer warning.
        arr = np.array(img, dtype=np.float32) / 255.0  # H,W,3 in [0,1]
        return torch.from_numpy(arr).permute(2, 0, 1).contiguous()  # 3,H,W

    def __call__(
        self,
        images: Union[Image.Image, List[Image.Image]],
        return_tensors: Optional[str] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Convert one image or a batch of images into model inputs.

        Args:
            images: A single PIL image or an iterable of PIL images. All
                images must share the same size.
            return_tensors: ``None`` or ``"pt"``; both return a torch tensor.
            **kwargs: Accepted for call-site compatibility and ignored.

        Returns:
            ``{"pixel_values": tensor}`` where the tensor is float32 in
            [0,1] with shape (B,3,H,W).

        Raises:
            ValueError: If ``return_tensors`` is not ``None``/``"pt"``, or if
                ``images`` is empty.
            RuntimeError: Raised by ``torch.stack`` when the images do not
                all have the same size.
        """
        # Validate the cheap argument first so an unsupported request fails
        # before any image is decoded and converted.
        if return_tensors is not None and return_tensors != "pt":
            raise ValueError("Only return_tensors=None or 'pt' is supported.")

        if isinstance(images, Image.Image):
            images = [images]
        else:
            # Materialise so tuples/generators work and emptiness is checkable.
            images = list(images)

        if not images:
            # torch.stack([]) would otherwise fail with an opaque RuntimeError.
            raise ValueError("`images` must contain at least one image.")

        tensors = [self._pil_to_tensor_01(im) for im in images]
        pixel_values = torch.stack(tensors, dim=0)  # B,3,H,W
        return {"pixel_values": pixel_values}