Upload Handler for Widget
handler.py +46 -0
handler.py
ADDED
@@ -0,0 +1,46 @@
+from typing import Dict, List, Any
+import io
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoProcessor
+
+class EndpointHandler:
+    def __init__(self, path=""):
+        # Load the processor and model from the local path.
+        # trust_remote_code=True runs your custom code from the repo.
+        self.processor = AutoProcessor.from_pretrained(path, trust_remote_code=True)
+        self.model = AutoModel.from_pretrained(path, trust_remote_code=True)
+
+        # Move to GPU if available, otherwise CPU
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model.to(self.device)
+        self.model.eval()
+
+    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """
+        Args:
+            data (:obj:`Dict[str, Any]`):
+                Includes the deserialized image input under the "inputs" key.
+        """
+        # The Hub's image-to-text widget sends a PIL Image under the "inputs" key
+        inputs_data = data.pop("inputs", data)
+
+        # Ensure it's a PIL Image; fall back to decoding raw bytes
+        if not isinstance(inputs_data, Image.Image):
+            inputs_data = Image.open(io.BytesIO(inputs_data)).convert("RGB")
+
+        # 1. Preprocess the image using your custom processor
+        processed_inputs = self.processor(inputs_data)
+        pixel_values = processed_inputs["pixel_values"].to(self.device)
+
+        # 2. Run inference without tracking gradients
+        with torch.no_grad():
+            outputs = self.model(pixel_values)
+            logits = outputs.logits
+
+        # 3. Decode the prediction using your CTC logic
+        prediction = self.processor.batch_decode(logits)[0]
+
+        # The image-to-text widget expects a list of dicts and reads the
+        # standard "generated_text" key to display the result
+        return [{"generated_text": prediction}]
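
For a quick sanity check before deploying, the handler can be exercised directly from a local clone of the repo. This is a minimal sketch, assuming the clone lives at ./my-model-repo and a test image sample.png exists next to it (both names are placeholders; substitute your own):

    from PIL import Image
    from handler import EndpointHandler

    # Point the handler at a local clone of the model repo (hypothetical path)
    handler = EndpointHandler(path="./my-model-repo")

    # Mimic the payload the widget delivers: a PIL Image under "inputs"
    image = Image.open("sample.png").convert("RGB")
    result = handler({"inputs": image})
    print(result)  # e.g. [{"generated_text": "..."}]

Once deployed, the endpoint can also be called over HTTP with raw image bytes. Another sketch, assuming a hypothetical endpoint URL and token; depending on how the serving stack deserializes the payload, the image may reach __call__ as a PIL Image or as raw bytes, and the fallback above covers the latter:

    import requests

    API_URL = "https://my-endpoint.endpoints.huggingface.cloud"  # hypothetical URL
    headers = {"Authorization": "Bearer <your-token>", "Content-Type": "image/png"}

    with open("sample.png", "rb") as f:
        response = requests.post(API_URL, headers=headers, data=f.read())
    print(response.json())  # [{"generated_text": "..."}]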