Spaces:

MedhaCodes
/

OCR_Captcha_Recognizer

Sleeping

App Files Community

MedhaCodes committed 11 days ago

Commit

8b1b3a9

verified ·

1 Parent(s): 065e49f

Update app/infer.py

Browse files

Files changed (1) hide show

app/infer.py +31 -63

app/infer.py CHANGED Viewed

@@ -4,31 +4,14 @@ import torch
 import torch.nn as nn
 from PIL import Image
 import torchvision.transforms as T
-# =====================
-# DEVICE
-# =====================
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# =====================
-# CHARSET (EXACT MATCH)
-# =====================
-import string
-DIGITS = string.digits
-LOWER  = string.ascii_lowercase
-UPPER  = string.ascii_uppercase
-BLANK_CHAR = "-"
-CHARS = DIGITS + LOWER + UPPER + BLANK_CHAR
-char2idx = {c: i for i, c in enumerate(CHARS)}
-idx2char = {i: c for c, i in char2idx.items()}
 NUM_CLASSES = len(CHARS)
-# =====================
-# CRNN (EXACT SAME MODEL)
-# =====================
 class CRNN(nn.Module):
     def __init__(self):
         super().__init__()
@@ -63,68 +46,53 @@ class CRNN(nn.Module):
         x, _ = self.rnn(x)
         return self.fc(x)
-# =====================
 # LOAD MODEL
-# =====================
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-WEIGHTS_PATH = os.path.join(BASE_DIR, "weights", "ocr_model.pth")
-if not os.path.exists(WEIGHTS_PATH):
-    raise FileNotFoundError(f"Missing model: {WEIGHTS_PATH}")
 model = CRNN().to(DEVICE)
-model.load_state_dict(torch.load(WEIGHTS_PATH, map_location=DEVICE))
 model.eval()
-# =====================
-# IMAGE TRANSFORM (60×160)
-# =====================
 transform = T.Compose([
     T.Grayscale(),
     T.Resize((60, 160)),
     T.ToTensor()
 ])
-# =====================
-# BEAM SEARCH
-# =====================
-def ctc_beam_search(logits, beam_width=5):
-    probs = logits.softmax(2)
-    T, C = probs.shape[1], probs.shape[2]
-    beams = [("", 1.0)]
-    for t in range(T):
-        new_beams = {}
-        for prefix, score in beams:
-            for c in range(C):
-                p = probs[0, t, c].item()
-                if p < 1e-4:
-                    continue
-                char = idx2char[c]
-                new_prefix = prefix if char == BLANK_CHAR else prefix + char
-                new_beams[new_prefix] = max(
-                    new_beams.get(new_prefix, 0.0),
-                    score * p
-                )
-        beams = sorted(new_beams.items(), key=lambda x: x[1], reverse=True)[:beam_width]
-    return beams
-def decode_with_confidence(logits):
-    text, score = ctc_beam_search(logits)[0]
-    return text, round(min(1.0, score * 10), 3)
-# =====================
 # PUBLIC API
-# =====================
 def predict(pil_img: Image.Image):
     img = transform(pil_img).unsqueeze(0).to(DEVICE)
     with torch.no_grad():
         logits = model(img)
-    text, conf = decode_with_confidence(logits)
-    return {"text": text, "confidence": conf}

 import torch.nn as nn
 from PIL import Image
 import torchvision.transforms as T
+from app.utils import CHARS, idx2char, BLANK_CHAR
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 NUM_CLASSES = len(CHARS)
+# --------------------
+# CRNN MODEL (SAME AS TRAINING)
+# --------------------
 class CRNN(nn.Module):
     def __init__(self):
         super().__init__()
         x, _ = self.rnn(x)
         return self.fc(x)
+# --------------------
 # LOAD MODEL
+# --------------------
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+WEIGHTS = os.path.join(BASE_DIR, "weights", "ocr_model.pth")
 model = CRNN().to(DEVICE)
+model.load_state_dict(torch.load(WEIGHTS, map_location=DEVICE))
 model.eval()
+# --------------------
+# TRANSFORM
+# --------------------
 transform = T.Compose([
     T.Grayscale(),
     T.Resize((60, 160)),
     T.ToTensor()
 ])
+# --------------------
+# CTC DECODER
+# --------------------
+def ctc_decode(logits):
+    probs = logits.softmax(2)[0]
+    best = probs.argmax(1)
+    prev = None
+    text = ""
+    for idx in best:
+        idx = idx.item()
+        if idx != prev and CHARS[idx] != BLANK_CHAR:
+            text += CHARS[idx]
+        prev = idx
+    return text
+# --------------------
 # PUBLIC API
+# --------------------
 def predict(pil_img: Image.Image):
     img = transform(pil_img).unsqueeze(0).to(DEVICE)
     with torch.no_grad():
         logits = model(img)
+    text = ctc_decode(logits)
+    confidence = round(float(logits.softmax(2).max()), 3)
+    return text, confidence