Spaces:

MedhaCodes
/

OCR_Captcha_Recognizer

Sleeping

App Files Files Community

MedhaCodes committed 11 days ago

Commit

06bbcb9

verified ·

1 Parent(s): 522c19f

Update app/infer.py

Browse files

Files changed (1) hide show

app/infer.py +111 -40

app/infer.py CHANGED Viewed

@@ -1,59 +1,130 @@
-import cv2
-import torch
-import numpy as np
 import os
-from app.model import CRNN
-from app.utils import decode_with_confidence
-# ----------------------------
-# Device
-# ----------------------------
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# ----------------------------
-# Load model
-# ----------------------------
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 WEIGHTS_PATH = os.path.join(BASE_DIR, "weights", "ocr_model.pth")
-model = CRNN()
 model.load_state_dict(torch.load(WEIGHTS_PATH, map_location=DEVICE))
-model.to(DEVICE)
 model.eval()
-# ----------------------------
-# Image preprocessing
-# ----------------------------
-def preprocess(image_bytes):
-    img = cv2.imdecode(
-        np.frombuffer(image_bytes, np.uint8),
-        cv2.IMREAD_GRAYSCALE
-    )
-    if img is None:
-        raise ValueError("Invalid image")
-    img = cv2.resize(img, (160, 60))
-    img = img.astype("float32") / 255.0
-    tensor = torch.tensor(img).unsqueeze(0).unsqueeze(0)
-    return tensor.to(DEVICE)
-# ----------------------------
-# Prediction entry point
-# ----------------------------
-def predict(image_bytes):
-    x = preprocess(image_bytes)
-    with torch.no_grad():
-        logits = model(x)
-    text, confidence = decode_with_confidence(logits)
-    return {
-        "text": text,
-        "confidence": confidence
-    }

+# app/infer.py
 import os
+import torch
+import torch.nn as nn
+from PIL import Image
+import torchvision.transforms as T
+# =====================
+# DEVICE
+# =====================
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# =====================
+# CHARSET (EXACT MATCH)
+# =====================
+import string
+# Class index layout produced by the concatenation below:
+#   0-9 digits, 10-35 lowercase, 36-61 uppercase, 62 blank placeholder.
+# NOTE(review): the section header says "EXACT MATCH" - presumably this order
+# must stay identical to the one used when the weights were trained; confirm
+# before reordering.
+DIGITS = string.digits
+LOWER  = string.ascii_lowercase
+UPPER  = string.ascii_uppercase
+# Placeholder character for the blank class; the decoder never emits it.
+BLANK_CHAR = "-"
+CHARS = DIGITS + LOWER + UPPER + BLANK_CHAR
+# Forward (character -> class index) and reverse (class index -> character) tables.
+char2idx = {c: i for i, c in enumerate(CHARS)}
+idx2char = {i: c for c, i in char2idx.items()}
+NUM_CLASSES = len(CHARS)  # 10 + 26 + 26 + 1 = 63
+# =====================
+# CRNN (EXACT SAME MODEL)
+# =====================
+class CRNN(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.cnn = nn.Sequential(
+            nn.Conv2d(1, 64, 3, padding=1), nn.BatchNorm2d(64), nn.ReLU(),
+            nn.MaxPool2d(2, 2),
+            nn.Conv2d(64, 128, 3, padding=1), nn.BatchNorm2d(128), nn.ReLU(),
+            nn.MaxPool2d(2, 2),
+            nn.Conv2d(128, 256, 3, padding=1), nn.BatchNorm2d(256), nn.ReLU(),
+            nn.MaxPool2d((2, 1)),
+            nn.Conv2d(256, 256, 3, padding=1), nn.ReLU()
+        )
+        self.rnn = nn.LSTM(
+            input_size=256 * 7,
+            hidden_size=256,
+            num_layers=2,
+            bidirectional=True,
+            batch_first=True
+        )
+        self.fc = nn.Linear(512, NUM_CLASSES)
+    def forward(self, x):
+        x = self.cnn(x)
+        b, c, h, w = x.shape
+        x = x.permute(0, 3, 1, 2).reshape(b, w, c * h)
+        x, _ = self.rnn(x)
+        return self.fc(x)
+# =====================
+# LOAD MODEL
+# =====================
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 WEIGHTS_PATH = os.path.join(BASE_DIR, "weights", "ocr_model.pth")
+# Checked at import time so a missing checkpoint is reported with its path
+# before torch.load is attempted.
+if not os.path.exists(WEIGHTS_PATH):
+    raise FileNotFoundError(f"Missing model: {WEIGHTS_PATH}")
+model = CRNN().to(DEVICE)
+# NOTE(review): torch.load unpickles the file, which can execute arbitrary code
+# if the checkpoint is untrusted; consider weights_only=True where the
+# installed torch version supports it.
 model.load_state_dict(torch.load(WEIGHTS_PATH, map_location=DEVICE))
+# Inference mode: BatchNorm layers use their stored running statistics.
 model.eval()
+# =====================
+# IMAGE TRANSFORM (60×160)
+# =====================
+# Preprocessing applied to every input image: convert to a single channel,
+# resize to 60 (height) x 160 (width), then convert to a tensor.
+transform = T.Compose([
+    T.Grayscale(),
+    T.Resize((60, 160)),
+    T.ToTensor()
+])
+# =====================
+# BEAM SEARCH
+# =====================
+def ctc_beam_search(logits, beam_width=5):
+    probs = logits.softmax(2)
+    T, C = probs.shape[1], probs.shape[2]
+    beams = [("", 1.0)]
+    for t in range(T):
+        new_beams = {}
+        for prefix, score in beams:
+            for c in range(C):
+                p = probs[0, t, c].item()
+                if p < 1e-4:
+                    continue
+                char = idx2char[c]
+                new_prefix = prefix if char == BLANK_CHAR else prefix + char
+                new_beams[new_prefix] = max(
+                    new_beams.get(new_prefix, 0.0),
+                    score * p
+                )
+        beams = sorted(new_beams.items(), key=lambda x: x[1], reverse=True)[:beam_width]
+    return beams
+def decode_with_confidence(logits):
+    text, score = ctc_beam_search(logits)[0]
+    return text, round(min(1.0, score * 10), 3)
+# =====================
+# PUBLIC API
+# =====================
+def predict(pil_img: Image.Image):
+    img = transform(pil_img).unsqueeze(0).to(DEVICE)
+    with torch.no_grad():
+        logits = model(img)
+    text, conf = decode_with_confidence(logits)
+    return {"text": text, "confidence": conf}