from typing import Any, Dict, List

import base64
import io

import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor


class EndpointHandler:
    """Custom Inference Endpoints handler for an image-to-text model.

    Loads the repository's custom processor/model (via ``trust_remote_code``)
    and decodes predictions with the processor's ``batch_decode`` — presumably
    CTC decoding, per the original comments (confirm against the repo code).
    """

    def __init__(self, path: str = "") -> None:
        """Load the processor and model from *path* and select the device.

        Args:
            path: local path of the model repository snapshot.
        """
        # trust_remote_code is required because the processor/model classes
        # are defined by custom code shipped inside the model repo.
        self.processor = AutoProcessor.from_pretrained(path, trust_remote_code=True)
        self.model = AutoModel.from_pretrained(path, trust_remote_code=True)

        # Prefer GPU when available; fall back to CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()

    @staticmethod
    def _to_pil(inputs_data: Any) -> Image.Image:
        """Coerce the deserialized request payload into a PIL image.

        Accepts a PIL Image (the Hub widget's normal payload), raw image
        bytes, or a base64-encoded string.

        Raises:
            ValueError: if the payload is none of the supported types.
        """
        if isinstance(inputs_data, Image.Image):
            return inputs_data
        if isinstance(inputs_data, (bytes, bytearray)):
            # Raw image bytes (e.g. a POSTed file body).
            return Image.open(io.BytesIO(inputs_data))
        if isinstance(inputs_data, str):
            # Assume a base64-encoded image, the common JSON payload form.
            return Image.open(io.BytesIO(base64.b64decode(inputs_data)))
        raise ValueError(
            f"Unsupported input type: {type(inputs_data).__name__}; "
            "expected a PIL Image, raw bytes, or a base64 string"
        )

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Run inference on a single image.

        Args:
            data: request payload; the image is under the ``inputs`` key
                (PIL Image from the widget, or bytes / base64 string).

        Returns:
            ``[{"generated_text": <prediction>}]`` — the list-of-dicts shape
            the Hub image-to-text widget expects.
        """
        # The Hub's image-to-text widget sends the image under "inputs";
        # fall back to the whole payload if that key is absent.
        image = self._to_pil(data.pop("inputs", data))

        # 1. Preprocess with the repo's custom processor.
        processed_inputs = self.processor(image)
        pixel_values = processed_inputs["pixel_values"].to(self.device)

        # 2. Inference without autograd bookkeeping.
        with torch.no_grad():
            logits = self.model(pixel_values).logits

        # 3. Decode; batch size is 1, so take the first decoded string.
        prediction = self.processor.batch_decode(logits)[0]
        return [{"generated_text": prediction}]