Spaces:

TechRaj
/

cs4243-miniproject-captcha-recognition

Sleeping

App Files Files Community

Karthikraj Sivakumar committed on Nov 8, 2025

Commit

f9929da

1 Parent(s): 7070853

first commit

Browse files

Files changed (2) hide show

app.py +235 -0
requirements.txt +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,235 @@

+import gradio as gr
+import torch
+import torch.nn as nn
+import cv2
+import numpy as np
+from PIL import Image
+# ==========================================
+# 1. Model Architecture (Copy from notebook)
+# ==========================================
class ResBlock(nn.Module):
    """Residual block: two 3x3 conv/BN stages summed with a shortcut branch.

    The main path is conv3x3 -> BN -> ReLU -> conv3x3 -> BN. Its result is
    added to the shortcut output and passed through a final ReLU. The
    shortcut is an empty ``nn.Sequential`` (a pass-through) unless the
    stride or the channel count changes, in which case it becomes a strided
    1x1 convolution followed by batch norm so both branches have equal shape.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        needs_projection = stride != 1 or in_channels != out_channels

        # Main path; only the first convolution applies the stride.
        self.conv1 = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            bias=False,
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(
            out_channels,
            out_channels,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=False,
        )
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Shortcut path.
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(
                    in_channels,
                    out_channels,
                    kernel_size=1,
                    stride=stride,
                    bias=False,
                ),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        residual = self.shortcut(x)

        features = self.conv1(x)
        features = self.bn1(features)
        features = self.relu(features)
        features = self.conv2(features)
        features = self.bn2(features)

        features += residual
        return self.relu(features)
class CRNN(nn.Module):
    """Convolutional-recurrent network emitting per-timestep log-probabilities.

    A single-channel image batch goes through a conv stem and three
    ``ResBlock`` stages, each followed by a 2x2 max-pool (four pools in
    total, so height and width shrink by a factor of 16). The feature-map
    columns are then fed as a sequence to a 2-layer bidirectional LSTM, and
    a linear layer plus log-softmax yields class log-probabilities.

    Args:
        num_classes: Size of the output alphabet (output feature dimension).
        img_height: Height of the input images. It determines the LSTM input
            size, ``512 * (img_height // 16)``; the default of 80 gives 2560.
        img_width: Accepted for interface compatibility only. The sequence
            length follows the width of the tensor actually passed in.
        hidden_size: Hidden size of each LSTM direction.

    Raises:
        ValueError: If ``img_height`` is too small to survive the pooling.
    """

    # Four 2x2/stride-2 max-pools, each halving height and width.
    _DOWNSAMPLE_FACTOR = 16
    # Channel count produced by the last residual stage.
    _CNN_OUT_CHANNELS = 512

    def __init__(self, num_classes, img_height=80, img_width=280, hidden_size=128):
        super().__init__()

        feature_height = img_height // self._DOWNSAMPLE_FACTOR
        if feature_height < 1:
            raise ValueError(
                f"img_height must be at least {self._DOWNSAMPLE_FACTOR}, "
                f"got {img_height}"
            )

        # CNN layers
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True)
        )
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.layer1 = ResBlock(64, 128)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.layer2 = ResBlock(128, 256)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.layer3 = ResBlock(256, self._CNN_OUT_CHANNELS)
        self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.dropout = nn.Dropout2d(0.2)

        # RNN layers: every feature-map column (channels x height) is one
        # timestep, so the input size follows the configured image height
        # instead of being hard-wired to a height of 80.
        rnn_input_size = self._CNN_OUT_CHANNELS * feature_height
        self.rnn = nn.LSTM(rnn_input_size, hidden_size, num_layers=2,
                           bidirectional=True, dropout=0.1, batch_first=False)

        # FC layer (two directions are concatenated, hence hidden_size * 2)
        self.fc = nn.Linear(hidden_size * 2, num_classes)
        self.log_softmax = nn.LogSoftmax(dim=2)

    def forward(self, x):
        """Return log-probabilities shaped (timesteps, batch, num_classes).

        ``x`` is a (batch, 1, height, width) tensor; timesteps equals the
        width of the final feature map.
        """
        x = self.pool1(self.conv1(x))
        x = self.pool2(self.layer1(x))
        x = self.pool3(self.layer2(x))
        x = self.pool4(self.layer3(x))
        conv_out = self.dropout(x)

        # (batch, channels, height, width) -> (batch, channels*height, width)
        conv_out = conv_out.flatten(1, 2)
        # The LSTM is not batch_first: reorder to (width, batch, features).
        conv_out = conv_out.permute(2, 0, 1)

        rnn_out, _ = self.rnn(conv_out)
        output = self.fc(rnn_out)
        return self.log_softmax(output)
+# ==========================================
+# 2. Preprocessing Functions
+# ==========================================
def resize_and_pad(img, target_size=(80, 280)):
    """Scale an image to fit ``target_size`` and centre it on a white canvas.

    The aspect ratio is preserved: the image is resized with
    nearest-neighbour interpolation by the largest factor that still fits
    inside the target, then pasted in the middle of a canvas filled with 255.

    Args:
        img: Image array whose first two dimensions are (height, width).
        target_size: ``(height, width)`` of the returned canvas.

    Returns:
        Array of shape ``(target_h, target_w)`` (plus a channel axis when the
        resized image has one) with the same dtype as ``img``.

    Raises:
        ValueError: If ``img`` has a zero-sized height or width.
    """
    target_h, target_w = target_size
    h, w = img.shape[:2]
    if h == 0 or w == 0:
        raise ValueError(f"Cannot resize an empty image of shape {img.shape}")

    scale = min(target_w / w, target_h / h)
    # Clamp to one pixel: for extreme aspect ratios int() truncates the
    # short side to 0, which cv2.resize rejects.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))
    resized = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_NEAREST)

    # Take the channel axis from the resized image so colour input works too.
    canvas_shape = (target_h, target_w) + resized.shape[2:]
    padded = np.full(canvas_shape, 255, dtype=img.dtype)

    x_offset = (target_w - new_w) // 2
    y_offset = (target_h - new_h) // 2
    padded[y_offset:y_offset + new_h, x_offset:x_offset + new_w] = resized
    return padded
def remove_black_lines(img):
    """Inpaint the dark pixels of a BGR image to erase black noise lines.

    Pixels whose HSV value channel is at most 80, whatever their hue and
    saturation, are collected into a mask and filled in from their
    surroundings with TELEA inpainting (radius 1).
    """
    dark_floor = np.array([0, 0, 0])
    dark_ceiling = np.array([180, 255, 80])
    dark_mask = cv2.inRange(
        cv2.cvtColor(img, cv2.COLOR_BGR2HSV), dark_floor, dark_ceiling
    )
    return cv2.inpaint(img, dark_mask, inpaintRadius=1, flags=cv2.INPAINT_TELEA)
def preprocess_image(image):
    """Turn an uploaded image into a normalised model input tensor.

    Steps: convert to BGR, inpaint dark noise lines, convert to grayscale,
    resize and pad to 80x280, scale to [0, 1] and add batch and channel axes.

    Args:
        image: A PIL image in any mode, or an image array that is 2-D
            (grayscale), H x W x 3 (RGB) or H x W x 4 (RGBA).

    Returns:
        A float32 tensor of shape (1, 1, 80, 280).

    Raises:
        ValueError: If ``image`` is ``None`` or has an unsupported shape.
    """
    if image is None:
        raise ValueError("No image provided")

    # Normalise every PIL mode (L, P, RGBA, CMYK, ...) to plain RGB so the
    # colour conversions below always see three channels.
    if isinstance(image, Image.Image):
        image = image.convert("RGB")
    img = np.array(image)

    # OpenCV works in BGR order; bring every supported layout to 3-channel BGR.
    if img.ndim == 2:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    elif img.ndim == 3 and img.shape[2] == 3:
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    elif img.ndim == 3 and img.shape[2] == 4:
        img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
    else:
        raise ValueError(f"Unsupported image shape: {img.shape}")

    # Remove noise lines
    img = remove_black_lines(img)
    # Convert to grayscale
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Resize and pad
    img = resize_and_pad(img, target_size=(80, 280))
    # Normalize
    img = img.astype('float32') / 255.0
    return torch.tensor(img).unsqueeze(0).unsqueeze(0)  # (1, 1, H, W)
+# ==========================================
+# 3. Load Model & Character Mapping
+# ==========================================
# Label alphabet. Index 0 is reserved for the CTC blank token, so character
# indices start at 1.
CHARS = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
char_to_idx = {c: i for i, c in enumerate(CHARS, start=1)}
idx_to_char = {i: c for i, c in enumerate(CHARS, start=1)}
idx_to_char[0] = ""  # blank token
num_classes = len(CHARS) + 1

# Load model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CRNN(num_classes=num_classes).to(device)

# Load checkpoint (update path to your .pth file).
# SECURITY: torch.load unpickles the file. weights_only=True restricts
# unpickling to tensors and plain containers, so a tampered checkpoint cannot
# execute arbitrary code. Being explicit also keeps behaviour the same across
# PyTorch versions whose default for this flag differs.
checkpoint = torch.load('best_model.pth', map_location=device, weights_only=True)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()  # inference mode: disables dropout, uses BN running statistics
print(f"✅ Model loaded successfully! Using device: {device}")
+# ==========================================
+# 4. Prediction Function
+# ==========================================
def predict_captcha(image):
    """Predict the text of a CAPTCHA image with greedy CTC decoding.

    Args:
        image: The uploaded PIL image, or ``None`` when nothing was uploaded.

    Returns:
        A dict with the decoded text ("Prediction"), its length ("Length")
        and the device used for inference ("Device").

    Raises:
        gr.Error: If no image was supplied, so the UI shows a clear message
            instead of an opaque preprocessing failure.
    """
    if image is None:
        raise gr.Error("Please upload a CAPTCHA image first.")

    # Preprocess
    img_tensor = preprocess_image(image).to(device)

    # Inference
    with torch.no_grad():
        log_probs = model(img_tensor)

    # Greedy decoding: most likely class at every timestep. The batch axis
    # (size 1) is dropped and the indices become plain Python ints.
    best_path = log_probs.argmax(dim=2).squeeze(1).tolist()

    # CTC collapse: merge consecutive repeats, then drop blanks (index 0).
    chars = []
    prev = None
    for token in best_path:
        if token != 0 and token != prev:
            chars.append(idx_to_char.get(token, ''))
        prev = token
    prediction = ''.join(chars)

    # Return the decoded text with basic metadata
    return {
        "Prediction": prediction,
        "Length": len(prediction),
        "Device": str(device)
    }
+# ==========================================
+# 5. Gradio Interface
+# ==========================================
# Input and output widgets are built first so the Interface call below reads
# as plain wiring.
captcha_input = gr.Image(type="pil", label="Upload CAPTCHA Image")
prediction_output = gr.JSON(label="Prediction Results")

demo = gr.Interface(
    fn=predict_captcha,
    inputs=captcha_input,
    outputs=prediction_output,
    title="🔐 CAPTCHA Recognition System",
    description="""
    **CS4243 Mini Project - CAPTCHA Recognition using CRNN + CTC Loss**
    Upload a CAPTCHA image to see the model's prediction.
    **Model Architecture:**
    - ResNet-based CNN feature extraction
    - Bidirectional LSTM for sequence modeling
    - CTC Loss for alignment-free training
    **Performance:**
    - Sequence Accuracy: ~54%
    - Character Accuracy: ~86%
    - Trained on 9,000 samples with heavy augmentation
    """,
    # Optional example inputs: add one single-element list per image path,
    # e.g. ["example1.png"].
    examples=[],
    theme=gr.themes.Soft(),
    allow_flagging="never",
)

# Start the web server only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+torch>=2.0.0
+torchvision>=0.15.0
+opencv-python-headless
+numpy
+pillow
+gradio