IIIT-5K / app.py
Tejveer12's picture
Update app.py
6204233 verified
import cv2
import numpy as np
import gradio as gr
import onnxruntime as ort
# Create the ONNX Runtime session once at import time; inference() reuses it for every call.
session = ort.InferenceSession("crnn.onnx")
# Class-index -> character table used by ctc_decode(). Index 0 holds a placeholder
# space because ctc_decode() treats class 0 as the CTC blank and never emits it.
alphabet = ' ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'  # 1 blank slot + 26 letters + 10 digits
def preprocess(image):
    """Turn an uploaded image into the tensor fed to the CRNN session.

    Args:
        image: Pixel array as delivered by the Gradio image component.
            Gradio hands over RGB channel order (not OpenCV's BGR), so the
            grayscale conversion must use the RGB codes; otherwise the red
            and blue luma weights are swapped. 2-D and single-channel
            arrays are accepted as already-grayscale input.

    Returns:
        A float32 array of shape [1, 1, 32, 100] with values divided by 255.
    """
    image = np.asarray(image)

    if image.ndim == 2:
        # Already grayscale: cvtColor would reject a 2-D array.
        gray = image
    elif image.ndim == 3 and image.shape[2] == 1:
        # Single explicit channel: drop the trailing axis.
        gray = np.ascontiguousarray(image[:, :, 0])
    elif image.ndim == 3 and image.shape[2] == 4:
        # RGBA upload (e.g. PNG with transparency): alpha is ignored.
        gray = cv2.cvtColor(image, cv2.COLOR_RGBA2GRAY)
    else:
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

    # cv2.resize takes (width, height): 100 wide by 32 high.
    img = cv2.resize(gray, (100, 32))
    # Scale by 1/255 (the original notes this matches ToTensor()'s [0, 1] range).
    img = img.astype(np.float32) / 255.0
    # Prepend batch and channel axes -> [1, 1, 32, 100].
    return img[np.newaxis, np.newaxis, :, :]
def ctc_decode(preds):
    """Greedy CTC decoding of the model output into text.

    Args:
        preds: Score array indexed as [width_steps, batch, num_classes];
            the batch axis must have length 1 because it is squeezed away.

    Returns:
        The decoded string: for each time step the highest-scoring class is
        taken, consecutive duplicates are collapsed, and class 0 (the CTC
        blank) is dropped. Remaining indices are looked up in ``alphabet``.
    """
    # Best class per time step, with the batch axis removed -> [width_steps].
    best = np.argmax(preds, axis=2).squeeze(1)

    # A step starts a new run when it is the first step or differs from
    # the step immediately before it.
    starts_run = np.ones(best.shape, dtype=bool)
    starts_run[1:] = best[1:] != best[:-1]

    # Keep run starts that are not the blank class.
    kept = best[starts_run & (best != 0)]
    return "".join(alphabet[idx] for idx in kept)
def inference(image):
    """Run OCR on one uploaded image and return the recognized text.

    Args:
        image: The image passed in by the UI, or None when nothing was
            uploaded.

    Returns:
        The decoded text, or a prompt asking for an upload when ``image``
        is None.
    """
    if image is not None:
        batch = preprocess(image)
        # The session is asked for all outputs; only the first one is decoded.
        outputs = session.run(None, {"input": batch})
        return ctc_decode(outputs[0])
    return "Please upload an image."
# Gradio front end: one image upload in, the decoded string out.
interface = gr.Interface(
    fn=inference,
    inputs=gr.Image(label="Upload Text Image"),
    outputs=gr.Textbox(label="Recognized Text"),
    examples=[
        "999_3.png",
        "999_8.png",
        "997_7.png",
    ],
    title="OCR (IIIT5K)",
    description="CRNN + CTC model optimized for CPU inference using ONNX.",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()