# Hugging Face Hub page residue (kept for provenance):
#   Graf-J's picture
#   Upload Handler for Widget
#   a99a61d verified
import io
from typing import Dict, List, Any

import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor
class EndpointHandler:
    """Custom Inference Endpoints handler for an image-to-text (OCR/CTC) model.

    Loads the repo's custom processor and model via ``trust_remote_code`` and
    serves the Hub's image-to-text widget, which expects a list of dicts with
    a ``generated_text`` key.
    """

    def __init__(self, path: str = ""):
        """Load processor and model from ``path`` and move the model to GPU if available.

        Args:
            path: Local path of the model repository (supplied by the
                Inference Endpoints runtime).
        """
        # trust_remote_code=True picks up the custom processor/model code in the repo.
        self.processor = AutoProcessor.from_pretrained(path, trust_remote_code=True)
        self.model = AutoModel.from_pretrained(path, trust_remote_code=True)
        # Prefer GPU when present; inference-only, so switch to eval mode.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Run OCR on one image and return the widget-formatted result.

        Args:
            data: Request payload. The image is expected under the "inputs"
                key, either as a ``PIL.Image.Image`` (widget) or as raw image
                bytes (toolkit / direct POST).

        Returns:
            A single-element list ``[{"generated_text": <prediction>}]`` —
            the format the image-to-text widget renders.

        Raises:
            TypeError: If the input is neither a PIL image nor raw bytes.
        """
        # The Hub's image-to-text widget sends a PIL Image in the "inputs" key;
        # fall back to the whole payload if that key is absent.
        inputs_data = data.pop("inputs", data)

        # Normalize to a PIL Image. Raw bytes (e.g. a direct binary POST) are
        # decoded here; anything else is rejected early with a clear error
        # instead of failing opaquely inside the processor.
        if not isinstance(inputs_data, Image.Image):
            if isinstance(inputs_data, (bytes, bytearray)):
                inputs_data = Image.open(io.BytesIO(inputs_data))
            else:
                raise TypeError(
                    f"Expected a PIL Image or raw image bytes under 'inputs', "
                    f"got {type(inputs_data).__name__}"
                )

        # 1. Preprocess the image using the repo's custom processor.
        processed_inputs = self.processor(inputs_data)
        pixel_values = processed_inputs["pixel_values"].to(self.device)

        # 2. Run inference without tracking gradients.
        with torch.no_grad():
            outputs = self.model(pixel_values)
            logits = outputs.logits

        # 3. Decode the prediction using the processor's CTC decoding;
        #    batch size is 1, so take the first element.
        prediction = self.processor.batch_decode(logits)[0]

        # 'generated_text' is the standard key the widget displays.
        return [{"generated_text": prediction}]