Karthikraj Sivakumar committed · Commit df3b1c8 · Parent(s): 608d548
bug fix

app.py CHANGED

@@ -1,90 +1,174 @@
 import gradio as gr
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 import cv2
 import numpy as np
 from PIL import Image

 # ==========================================
+# 1. Model Architecture (Match notebook exactly)
 # ==========================================

+class ResidualBlock(nn.Module):
+    """
+    Residual block with skip connection
+    Helps with gradient flow and fine-grained feature discrimination
+    """
+    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
+        super(ResidualBlock, self).__init__()
         self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                                stride=stride, padding=1, bias=False)
         self.bn1 = nn.BatchNorm2d(out_channels)
         self.relu = nn.ReLU(inplace=True)
+        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                                stride=1, padding=1, bias=False)
         self.bn2 = nn.BatchNorm2d(out_channels)
-        self.shortcut = nn.Sequential()
-        if stride != 1 or in_channels != out_channels:
-            self.shortcut = nn.Sequential(
-                nn.Conv2d(in_channels, out_channels, kernel_size=1,
-                          stride=stride, bias=False),
-                nn.BatchNorm2d(out_channels)
-            )
+        self.downsample = downsample

     def forward(self, x):
+        identity = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        # Skip connection (the key to ResNet!)
+        if self.downsample is not None:
+            identity = self.downsample(x)
+
+        out += identity  # Add residual
         out = self.relu(out)
+
         return out

 class CRNN(nn.Module):
+    """
+    Convolutional Recurrent Neural Network with ResNet-style CNN
+    Architecture: ResNet CNN + Bidirectional LSTM + CTC Loss
+    """
+    def __init__(
+        self,
+        img_height=80,
+        img_width=280,
+        num_classes=63,  # 62 alphanumeric + 1 blank
+        hidden_size=384,
+        num_lstm_layers=2,
+        dropout=0.4
+    ):
+        super(CRNN, self).__init__()
+
+        self.img_height = img_height
+        self.img_width = img_width
+        self.num_classes = num_classes
+        self.hidden_size = hidden_size

+        # Initial conv: (1, 80, 280) → (64, 80, 280)
         self.conv1 = nn.Sequential(
-            nn.Conv2d(1, 64, kernel_size=3, padding=1, bias=False),
+            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1, bias=False),
             nn.BatchNorm2d(64),
             nn.ReLU(inplace=True)
         )
+
+        # Pool1: (64, 80, 280) → (64, 40, 140)
         self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

+        # ResBlock layer1: (64, 40, 140) → (128, 40, 140)
+        self.layer1 = self._make_layer(64, 128, blocks=2)
+
+        # Pool2: (128, 40, 140) → (128, 20, 70)
         self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

+        # ResBlock layer2: (128, 20, 70) → (256, 20, 70)
+        self.layer2 = self._make_layer(128, 256, blocks=2)

+        # Pool3: (256, 20, 70) → (256, 10, 70)
+        self.pool3 = nn.MaxPool2d(kernel_size=(2, 1))  # Only height

+        # ResBlock layer3: (256, 10, 70) → (512, 10, 70)
+        self.layer3 = self._make_layer(256, 512, blocks=2)
+
+        # Pool4: (512, 10, 70) → (512, 5, 70)
+        self.pool4 = nn.MaxPool2d(kernel_size=(2, 1))  # Only height
+
+        # Optional dropout
         self.dropout = nn.Dropout2d(0.2)

+        # Calculate RNN input size
+        # After all conv layers: (512 channels, 5 height, 70 width)
+        self.map_to_seq_height = 5
+        self.map_to_seq_channels = 512
+        self.rnn_input_size = self.map_to_seq_height * self.map_to_seq_channels
+
+        # Recurrent Layers (Bidirectional LSTM)
+        self.rnn = nn.LSTM(
+            input_size=self.rnn_input_size,
+            hidden_size=hidden_size,
+            num_layers=num_lstm_layers,
+            bidirectional=True,
+            dropout=0.3 if num_lstm_layers > 1 else 0,
+            batch_first=False  # (T, N, C) format for CTC
+        )

+        # Fully Connected Layer
         self.fc = nn.Linear(hidden_size * 2, num_classes)
+
+    def _make_layer(self, in_channels, out_channels, blocks):
+        """Create a layer with multiple residual blocks"""
+        downsample = None
+        if in_channels != out_channels:
+            downsample = nn.Sequential(
+                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=False),
+                nn.BatchNorm2d(out_channels)
+            )
+
+        layers = []
+        layers.append(ResidualBlock(in_channels, out_channels, stride=1, downsample=downsample))
+        for _ in range(1, blocks):
+            layers.append(ResidualBlock(out_channels, out_channels))
+
+        return nn.Sequential(*layers)

     def forward(self, x):
+        """Forward pass"""
+        # CNN Feature Extraction
+        x = self.conv1(x)   # (N, 64, 80, 280)
+        x = self.pool1(x)   # (N, 64, 40, 140)
+
+        x = self.layer1(x)  # (N, 128, 40, 140)
+        x = self.pool2(x)   # (N, 128, 20, 70)
+
+        x = self.layer2(x)  # (N, 256, 20, 70)
+        x = self.pool3(x)   # (N, 256, 10, 70)
+
+        x = self.layer3(x)  # (N, 512, 10, 70)
+        x = self.pool4(x)   # (N, 512, 5, 70)
+
+        conv_out = self.dropout(x)  # (N, 512, 5, 70)

         batch_size, channels, height, width = conv_out.size()
-        conv_out = conv_out.view(batch_size, channels * height, width)
-        conv_out = conv_out.permute(2, 0, 1)

+        # Map to Sequence
+        conv_out = conv_out.permute(0, 3, 1, 2)  # (N, 70, 512, 5)
+        conv_out = conv_out.reshape(batch_size, width, channels * height)  # (N, 70, 2560)
+
+        # Prepare for LSTM
+        rnn_input = conv_out.permute(1, 0, 2)  # (70, N, 2560)
+
+        # Bidirectional LSTM
+        rnn_output, _ = self.rnn(rnn_input)  # (70, N, 768)
+
+        # Fully Connected Layer
+        T, N, hidden = rnn_output.size()
+        rnn_output = rnn_output.reshape(T * N, hidden)  # (70*N, 768)
+        output = self.fc(rnn_output)  # (70*N, 63)
+        output = output.reshape(T, N, self.num_classes)  # (70, N, 63)
+
+        # Log Softmax for CTC Loss
+        log_probs = F.log_softmax(output, dim=2)  # (70, N, 63)

         return log_probs
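
A quick way to verify the shape comments in forward() is to instantiate the model with its defaults and push a dummy batch through. The sketch below is illustrative, not part of app.py; it assumes only the ResidualBlock and CRNN classes defined above.

# Shape sanity check for the CRNN above (illustrative sketch, not part of the commit)
model = CRNN()                       # defaults: (1, 80, 280) input, 63 classes
model.eval()
dummy = torch.randn(2, 1, 80, 280)   # (N, C, H, W) grayscale batch
with torch.no_grad():
    log_probs = model(dummy)
print(log_probs.shape)               # torch.Size([70, 2, 63]) = (T, N, num_classes)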
@@ -153,9 +237,15 @@ num_classes = len(CHARS) + 1

 # Load model
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+model = CRNN(
+    img_height=80,
+    img_width=280,
+    num_classes=63,
+    hidden_size=384,  # IMPORTANT: Must match training
+    num_lstm_layers=2
+).to(device)

+# Load checkpoint
 checkpoint = torch.load('best_model.pth', map_location=device)
 model.load_state_dict(checkpoint['model_state_dict'])
 model.eval()
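
With the weights loaded, inference amounts to: grayscale image → resize to the 280×80 training size → scale to [0, 1] → (1, 1, 80, 280) tensor → CRNN → greedy CTC decode. The sketch below illustrates that flow under stated assumptions; the app's real preprocessing and decoding live in the unchanged parts of app.py not shown in this diff. In particular, it assumes the CTC blank sits at index 0 (PyTorch's CTCLoss default) and that classes 1 to 62 map onto CHARS in order.

# Hypothetical end-to-end inference sketch; the app's actual pipeline may differ.
def predict_captcha(image_path):
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)             # (H, W)
    img = cv2.resize(img, (280, 80)).astype(np.float32) / 255.0    # match training size
    x = torch.from_numpy(img).unsqueeze(0).unsqueeze(0).to(device) # (1, 1, 80, 280)

    with torch.no_grad():
        log_probs = model(x)                            # (70, 1, 63)
    best = log_probs.argmax(dim=2).squeeze(1).tolist()  # best class per time step

    # Greedy CTC decode: collapse repeated classes, then drop blanks
    decoded, prev = [], None
    for idx in best:
        if idx != prev and idx != 0:        # 0 assumed to be the blank
            decoded.append(CHARS[idx - 1])  # assumed class-to-char mapping
        prev = idx
    return ''.join(decoded)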
@@ -213,14 +303,19 @@ demo = gr.Interface(
     Upload a CAPTCHA image to see the model's prediction.

     **Model Architecture:**
-    - ResNet-based CNN feature extraction
-    - Bidirectional LSTM
+    - ResNet-based CNN feature extraction (4 layers, 2 blocks each)
+    - Bidirectional LSTM (hidden_size=384, 2 layers)
     - CTC Loss for alignment-free training

     **Performance:**
     - Sequence Accuracy: ~54%
     - Character Accuracy: ~86%
     - Trained on 9,000 samples with heavy augmentation
+
+    **Training Details:**
+    - 14 iterations of experimentation
+    - Data augmentation: rotation, shear, black lines, noise
+    - Regularization: dropout, weight decay, early stopping
     """,
     examples=[
         # Add example image paths here if you want
@@ -232,4 +327,5 @@ demo = gr.Interface(
 )

 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
+