# Hugging Face Hub page residue (kept for provenance):
#   Graf-J's picture
#   Upload Handler for Widget
#   a99a61d verified
import io
from typing import Dict, List, Any

import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor
class EndpointHandler:
    """Custom Inference Endpoints handler for an image-to-text (OCR/CTC) model.

    Loads the repo's custom processor and model via ``trust_remote_code`` and
    serves the Hub's image-to-text widget, which expects a list of dicts with
    a ``generated_text`` key.
    """

    def __init__(self, path: str = ""):
        """Load processor and model from ``path`` and move the model to GPU if available.

        Args:
            path: Local path of the model repository (supplied by the
                Inference Endpoints runtime).
        """
        # trust_remote_code=True picks up the custom processor/model code in the repo.
        self.processor = AutoProcessor.from_pretrained(path, trust_remote_code=True)
        self.model = AutoModel.from_pretrained(path, trust_remote_code=True)
        # Prefer GPU when present; inference-only, so switch to eval mode.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Run OCR on one image and return the widget-formatted result.

        Args:
            data: Request payload. The image is expected under the "inputs"
                key, either as a ``PIL.Image.Image`` (widget) or as raw image
                bytes (toolkit / direct POST).

        Returns:
            A single-element list ``[{"generated_text": <prediction>}]`` —
            the format the image-to-text widget renders.

        Raises:
            TypeError: If the input is neither a PIL image nor raw bytes.
        """
        # The Hub's image-to-text widget sends a PIL Image in the "inputs" key;
        # fall back to the whole payload if that key is absent.
        inputs_data = data.pop("inputs", data)

        # Normalize to a PIL Image. Raw bytes (e.g. a direct binary POST) are
        # decoded here; anything else is rejected early with a clear error
        # instead of failing opaquely inside the processor.
        if not isinstance(inputs_data, Image.Image):
            if isinstance(inputs_data, (bytes, bytearray)):
                inputs_data = Image.open(io.BytesIO(inputs_data))
            else:
                raise TypeError(
                    f"Expected a PIL Image or raw image bytes under 'inputs', "
                    f"got {type(inputs_data).__name__}"
                )

        # 1. Preprocess the image using the repo's custom processor.
        processed_inputs = self.processor(inputs_data)
        pixel_values = processed_inputs["pixel_values"].to(self.device)

        # 2. Run inference without tracking gradients.
        with torch.no_grad():
            outputs = self.model(pixel_values)
            logits = outputs.logits

        # 3. Decode the prediction using the processor's CTC decoding;
        #    batch size is 1, so take the first element.
        prediction = self.processor.batch_decode(logits)[0]

        # 'generated_text' is the standard key the widget displays.
        return [{"generated_text": prediction}]