"""Captcha OCR service: an ONNX recognition model served through a Gradio app.

Exposes two entry points in the UI: direct image upload and a base64 text
field suitable for API-style use.
"""

import torch
import onnx
import onnxruntime as rt
from torchvision import transforms as T
from pathlib import Path
from PIL import Image
from utils.tokenizer_base import Tokenizer
import gradio as gr
import io
import base64
import os

# =====================
# MODEL SETUP
# =====================
model_file = Path(__file__).parent / "models/model.onnx"
if not model_file.exists():
    raise RuntimeError(f"Model not found at {model_file}")

# (height, width) the model expects; images are resized to this shape.
img_size = (32, 128)
# Recognition charset: digits, ASCII letters, then printable punctuation.
vocab = r"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
tokenizer = Tokenizer(vocab)

# Preprocessing pipeline: resize -> tensor in [0,1] -> normalize to [-1,1].
transform = T.Compose([
    T.Resize(img_size, T.InterpolationMode.BICUBIC),
    T.ToTensor(),
    T.Normalize(0.5, 0.5),
])

session = rt.InferenceSession(str(model_file))


def to_numpy(t):
    """Detach a torch tensor and return it as a CPU numpy array."""
    return t.detach().cpu().numpy()


def infer(img: Image.Image) -> str:
    """Run the ONNX OCR model on a PIL image and return the decoded string.

    The image is converted to RGB, preprocessed, and batched (size 1);
    the model's logits are softmaxed and decoded by the tokenizer.
    """
    x = transform(img.convert("RGB")).unsqueeze(0)
    logits = session.run(None, {session.get_inputs()[0].name: to_numpy(x)})[0]
    probs = torch.tensor(logits).softmax(-1)
    preds, _ = tokenizer.decode(probs)
    return preds[0]


# =====================
# GRADIO FUNCTIONS
# =====================
def predict_image(img):
    """Gradio handler for the image-upload tab."""
    return infer(img)


def predict_base64(b64: str):
    """Gradio handler for the base64 tab.

    Accepts either a raw base64 payload or a data-URI string
    ("data:image/png;base64,...").
    """
    # Fix: strip an optional data-URI prefix so browser-generated strings
    # decode correctly instead of raising binascii.Error.
    if "," in b64 and b64.lstrip().startswith("data:"):
        b64 = b64.split(",", 1)[1]
    img_bytes = base64.b64decode(b64)
    img = Image.open(io.BytesIO(img_bytes))
    return infer(img)


# =====================
# GRADIO APP (REQUIRED)
# =====================
with gr.Blocks(title="Captcha OCR") as demo:
    gr.Markdown("# Captcha OCR")
    gr.Markdown("OCR for captcha images (letters & numbers)")
    with gr.Tab("Image Upload"):
        img = gr.Image(type="pil")
        out = gr.Textbox()
        gr.Button("Predict").click(predict_image, img, out)
    with gr.Tab("Base64 API"):
        b64 = gr.Textbox(label="Base64 Image")
        out2 = gr.Textbox()
        gr.Button("Predict").click(predict_base64, b64, out2)

demo.queue()

# Fix: guard the launch so importing this module (e.g. to reuse `infer`
# or in tests) does not start the web server as an import side effect.
if __name__ == "__main__":
    demo.launch()