import gradio as gr
import torch
import torch.nn as nn
import torch.nn.functional as F
import cv2
import numpy as np
from PIL import Image

# ==========================================
# 1. Model Architecture (Match notebook exactly)
# ==========================================


class ResidualBlock(nn.Module):
    """Two 3x3 conv + batch-norm layers wrapped by a skip connection.

    The skip connection helps gradient flow and fine-grained feature
    discrimination.

    Args:
        in_channels: Number of channels of the incoming feature map.
        out_channels: Number of channels produced by both convolutions.
        stride: Stride of the first convolution.
        downsample: Optional module applied to the input before it is added
            back. It must be supplied whenever the main branch changes the
            shape (channel count or stride), otherwise the addition in
            ``forward`` fails.
    """

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super().__init__()

        # NOTE: attribute names are part of the checkpoint's state_dict keys;
        # do not rename them.
        self.conv1 = nn.Conv2d(
            in_channels, out_channels,
            kernel_size=3, stride=stride, padding=1, bias=False,
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(
            out_channels, out_channels,
            kernel_size=3, stride=1, padding=1, bias=False,
        )
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        # Skip connection (the key to ResNet!)
        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity  # Add residual
        out = self.relu(out)
        return out


class CRNN(nn.Module):
    """Convolutional Recurrent Neural Network with a ResNet-style CNN.

    Architecture: ResNet CNN + Bidirectional LSTM, emitting per-timestep
    log-probabilities suitable for CTC loss / CTC decoding.

    Args:
        img_height: Height of the input images. The CNN halves the height
            four times, so it must be at least 16.
        img_width: Width of the input images (stored for reference; the
            width is halved twice by the CNN and becomes the sequence
            length).
        num_classes: Size of the output alphabet including the CTC blank.
        hidden_size: Hidden size of each LSTM direction.
        num_lstm_layers: Number of stacked LSTM layers.
        dropout: Accepted for signature compatibility but not used; the
            dropout rates below are hard-coded (0.2 after the CNN, 0.3
            between LSTM layers).

    Raises:
        ValueError: If ``img_height`` is smaller than 16.
    """

    def __init__(
        self,
        img_height=80,
        img_width=280,
        num_classes=63,  # 62 alphanumeric + 1 blank
        hidden_size=384,
        num_lstm_layers=2,
        dropout=0.4,
    ):
        super().__init__()

        if img_height < 16:
            raise ValueError(
                f"img_height must be at least 16, got {img_height}"
            )

        self.img_height = img_height
        self.img_width = img_width
        self.num_classes = num_classes
        self.hidden_size = hidden_size

        # Shape comments below assume the default 80x280 input.
        # NOTE: attribute names are part of the checkpoint's state_dict keys;
        # do not rename them.

        # Initial conv: (1, 80, 280) → (64, 80, 280)
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
        )

        # Pool1: (64, 80, 280) → (64, 40, 140)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # ResBlock layer1: (64, 40, 140) → (128, 40, 140)
        self.layer1 = self._make_layer(64, 128, blocks=2)

        # Pool2: (128, 40, 140) → (128, 20, 70)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # ResBlock layer2: (128, 20, 70) → (256, 20, 70)
        self.layer2 = self._make_layer(128, 256, blocks=2)

        # Pool3: (256, 20, 70) → (256, 10, 70)
        self.pool3 = nn.MaxPool2d(kernel_size=(2, 1))  # Only height

        # ResBlock layer3: (256, 10, 70) → (512, 10, 70)
        self.layer3 = self._make_layer(256, 512, blocks=2)

        # Pool4: (512, 10, 70) → (512, 5, 70)
        self.pool4 = nn.MaxPool2d(kernel_size=(2, 1))  # Only height

        # Optional dropout
        self.dropout = nn.Dropout2d(0.2)

        # Calculate RNN input size.
        # Four pooling stages each halve the height (flooring), so the final
        # feature-map height is img_height // 16 — 5 for the default 80.
        self.map_to_seq_height = img_height // 16
        self.map_to_seq_channels = 512
        self.rnn_input_size = (
            self.map_to_seq_height * self.map_to_seq_channels
        )

        # Recurrent Layers (Bidirectional LSTM)
        self.rnn = nn.LSTM(
            input_size=self.rnn_input_size,
            hidden_size=hidden_size,
            num_layers=num_lstm_layers,
            bidirectional=True,
            # Inter-layer dropout is only meaningful with stacked layers.
            dropout=0.3 if num_lstm_layers > 1 else 0,
            batch_first=False,  # (T, N, C) format for CTC
        )

        # Fully Connected Layer (2x hidden size because of bidirectionality)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def _make_layer(self, in_channels, out_channels, blocks):
        """Create a layer with multiple residual blocks.

        The first block changes the channel count (with a 1x1 conv + BN
        shortcut when ``in_channels != out_channels``); the remaining
        ``blocks - 1`` blocks keep ``out_channels``.
        """
        downsample = None
        if in_channels != out_channels:
            downsample = nn.Sequential(
                nn.Conv2d(
                    in_channels, out_channels,
                    kernel_size=1, stride=1, bias=False,
                ),
                nn.BatchNorm2d(out_channels),
            )

        layers = [
            ResidualBlock(
                in_channels, out_channels, stride=1, downsample=downsample
            )
        ]
        layers.extend(
            ResidualBlock(out_channels, out_channels)
            for _ in range(1, blocks)
        )
        return nn.Sequential(*layers)

    def forward(self, x):
        """Forward pass.

        Args:
            x: Image batch of shape (N, 1, H, W).

        Returns:
            Log-probabilities of shape (T, N, num_classes), where T is the
            width of the final feature map (70 for a 280-wide input).
        """
        # CNN Feature Extraction (shapes for the default 80x280 input)
        x = self.conv1(x)   # (N, 64, 80, 280)
        x = self.pool1(x)   # (N, 64, 40, 140)
        x = self.layer1(x)  # (N, 128, 40, 140)
        x = self.pool2(x)   # (N, 128, 20, 70)
        x = self.layer2(x)  # (N, 256, 20, 70)
        x = self.pool3(x)   # (N, 256, 10, 70)
        x = self.layer3(x)  # (N, 512, 10, 70)
        x = self.pool4(x)   # (N, 512, 5, 70)

        conv_out = self.dropout(x)  # (N, 512, 5, 70)
        batch_size, channels, height, width = conv_out.size()

        # Map to Sequence: every image column becomes one timestep.
        conv_out = conv_out.permute(0, 3, 1, 2)  # (N, 70, 512, 5)
        conv_out = conv_out.reshape(
            batch_size, width, channels * height
        )  # (N, 70, 2560)

        # Prepare for LSTM (time-major)
        rnn_input = conv_out.permute(1, 0, 2)  # (70, N, 2560)

        # Bidirectional LSTM
        rnn_output, _ = self.rnn(rnn_input)  # (70, N, 768)

        # Fully Connected Layer
        T, N, hidden = rnn_output.size()
        rnn_output = rnn_output.reshape(T * N, hidden)  # (70*N, 768)
        output = self.fc(rnn_output)  # (70*N, 63)
        output = output.reshape(T, N, self.num_classes)  # (70, N, 63)

        # Log Softmax for CTC Loss
        log_probs = F.log_softmax(output, dim=2)  # (70, N, 63)
        return log_probs


# ==========================================
# 2.
# Preprocessing Functions
# ==========================================


def resize_and_pad(img, target_size=(80, 280)):
    """Resize a grayscale image to fit ``target_size`` and pad with white.

    The aspect ratio is preserved; the resized image is centred on a white
    (255) canvas.

    Args:
        img: 2-D image array (height, width).
        target_size: ``(height, width)`` of the returned canvas.

    Returns:
        Array of shape ``target_size`` with the same dtype as ``img``.

    Raises:
        ValueError: If ``img`` has zero height or width.
    """
    target_h, target_w = target_size
    h, w = img.shape[:2]
    if h == 0 or w == 0:
        raise ValueError("Cannot resize an empty image")

    scale = min(target_w / w, target_h / h)
    # Clamp to [1, target]: for extreme aspect ratios int() can truncate a
    # dimension to 0, which cv2.resize rejects.
    new_w = min(target_w, max(1, int(w * scale)))
    new_h = min(target_h, max(1, int(h * scale)))

    resized = cv2.resize(
        img, (new_w, new_h), interpolation=cv2.INTER_NEAREST
    )

    padded = np.full((target_h, target_w), 255, dtype=img.dtype)
    x_offset = (target_w - new_w) // 2
    y_offset = (target_h - new_h) // 2
    padded[y_offset:y_offset + new_h, x_offset:x_offset + new_w] = resized
    return padded


def remove_black_lines(img):
    """Inpaint dark pixels (HSV value <= 80) of a BGR image.

    Args:
        img: 3-channel BGR image.

    Returns:
        A BGR image in which the masked dark pixels have been filled in
        from their surroundings with Telea inpainting.
    """
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    # Any hue / saturation, but low brightness.
    lower_black = np.array([0, 0, 0])
    upper_black = np.array([180, 255, 80])
    mask_black = cv2.inRange(hsv, lower_black, upper_black)
    cleaned = cv2.inpaint(
        img, mask_black, inpaintRadius=1, flags=cv2.INPAINT_TELEA
    )
    return cleaned


def preprocess_image(image):
    """Preprocess image for model inference.

    Args:
        image: A PIL image in any mode, or an array that is grayscale
            (H, W), RGB (H, W, 3) or RGBA (H, W, 4).

    Returns:
        Float32 tensor of shape (1, 1, 80, 280) with values in [0, 1].

    Raises:
        ValueError: If an array input has an unsupported shape.
    """
    if isinstance(image, Image.Image):
        # Uploaded PNGs are frequently RGBA / palette / grayscale. Flatten
        # any transparency onto white (the padding colour) so transparent
        # areas are not mistaken for black noise lines, then force RGB.
        if image.mode in ("RGBA", "LA") or "transparency" in image.info:
            rgba = image.convert("RGBA")
            white = Image.new("RGBA", rgba.size, (255, 255, 255, 255))
            image = Image.alpha_composite(white, rgba)
        img = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR)
    else:
        img = np.asarray(image)
        if img.ndim == 2:
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        elif img.ndim == 3 and img.shape[2] == 3:
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        elif img.ndim == 3 and img.shape[2] == 4:
            img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
        else:
            raise ValueError(
                f"Unsupported image shape for preprocessing: {img.shape}"
            )

    # Remove noise lines
    img = remove_black_lines(img)

    # Convert to grayscale
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Resize and pad
    img = resize_and_pad(img, target_size=(80, 280))

    # Normalize
    img = img.astype('float32') / 255.0
    img = torch.tensor(img).unsqueeze(0).unsqueeze(0)  # (1, 1, H, W)
    return img


# ==========================================
# 3.
# Load Model & Character Mapping
# ==========================================
CHARS = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
char_to_idx = {c: i + 1 for i, c in enumerate(CHARS)}
idx_to_char = {i + 1: c for i, c in enumerate(CHARS)}
idx_to_char[0] = ""  # blank token
num_classes = len(CHARS) + 1  # 62 characters + CTC blank = 63

# Load model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CRNN(
    img_height=80,
    img_width=280,
    num_classes=num_classes,
    hidden_size=384,  # IMPORTANT: Must match training
    num_lstm_layers=2,
).to(device)

# Load checkpoint
# NOTE(security): torch.load unpickles the file; only load checkpoints from
# a trusted source.
checkpoint = torch.load('best_model.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

print(f"Model loaded successfully! Using device: {device}")


# ==========================================
# 4. Prediction Functions
# ==========================================
def ctc_decode_with_confidence(log_probs, idx_to_char):
    """Greedy-decode CTC output and report a confidence score.

    Args:
        log_probs: Log probabilities from model, shape (T, 1, C).
        idx_to_char: Character mapping dictionary (index 0 is the blank).

    Returns:
        prediction: Decoded text string.
        confidence: Mean probability of the emitted characters (0-1), or
            0.0 when nothing was emitted.
    """
    # Convert log probs to regular probabilities
    probs = torch.exp(log_probs).squeeze(1)  # (T, C)

    # Greedy decoding - best token and its probability at each timestep
    max_probs, max_indices = torch.max(probs, dim=1)
    max_probs = max_probs.cpu().tolist()
    max_indices = max_indices.cpu().tolist()

    # CTC collapse (remove blanks and repeated tokens)
    collapsed_tokens = []
    collapsed_probs = []
    prev = None
    for token, prob in zip(max_indices, max_probs):
        if token != 0 and token != prev:  # Not blank and not repeat
            collapsed_tokens.append(token)
            collapsed_probs.append(prob)
        prev = token

    # Decode to text
    prediction = ''.join(idx_to_char.get(t, '') for t in collapsed_tokens)

    # Calculate average confidence
    confidence = float(np.mean(collapsed_probs)) if collapsed_probs else 0.0
    return prediction, confidence


def ctc_decode_top_k(log_probs, idx_to_char, k=3):
    """Decode CTC output into the top-k distinct texts using beam search.

    Alignments that collapse to the same text are merged (their
    probabilities are summed), so the returned candidates are distinct
    strings rather than different alignments of one string.

    Args:
        log_probs: Log probabilities from model, shape (T, 1, C).
        idx_to_char: Character mapping dictionary (index 0 is the blank).
        k: Number of top predictions to return.

    Returns:
        List of up to ``k`` ``(prediction, confidence)`` tuples, ordered by
        total probability (highest first). ``confidence`` is that
        probability normalised by the text length.
    """
    if k <= 0:
        return []

    probs = torch.exp(log_probs).squeeze(1).cpu()  # (T, C)
    T, C = probs.shape

    # Candidate tokens depend only on the timestep, not on the beam, so
    # compute them once for all timesteps.
    topk_probs, topk_indices = torch.topk(probs, k=min(k * 2, C), dim=1)
    topk_probs = topk_probs.tolist()
    topk_indices = topk_indices.tolist()

    # A beam state is (text, last emitted token or None after a blank).
    # Each text can occupy two states, hence twice the requested width.
    beam_width = 2 * k
    beams = {('', None): 1.0}

    for step_probs, step_indices in zip(topk_probs, topk_indices):
        candidates = {}
        for (text, last), beam_prob in beams.items():
            for prob, idx in zip(step_probs, step_indices):
                # CTC rules
                if idx == 0:  # Blank token
                    state = (text, None)
                elif idx != last:  # New character (not repeat)
                    state = (text + idx_to_char.get(idx, ''), idx)
                else:  # Repeat - continue same character
                    state = (text, last)
                candidates[state] = (
                    candidates.get(state, 0.0) + beam_prob * prob
                )

        # Keep the most probable states
        ranked_states = sorted(
            candidates.items(), key=lambda item: item[1], reverse=True
        )
        beams = dict(ranked_states[:beam_width])

    # Merge states that share a text and return the top k unique texts
    totals = {}
    for (text, _), prob in beams.items():
        totals[text] = totals.get(text, 0.0) + prob

    ranked = sorted(totals.items(), key=lambda item: item[1], reverse=True)

    results = []
    for text, prob in ranked[:k]:
        # Normalize probability by sequence length
        confidence = prob ** (1.0 / max(len(text), 1))
        results.append((text, float(confidence)))
    return results


def predict_captcha(image):
    """Predict CAPTCHA text from image with confidence score and alternatives.

    Args:
        image: PIL image supplied by the Gradio input, or ``None`` when the
            user submitted without uploading anything.

    Returns:
        A formatted, human-readable result string.
    """
    if image is None:
        return "Please upload a CAPTCHA image first."

    # Preprocess
    img_tensor = preprocess_image(image).to(device)

    # Inference
    with torch.no_grad():
        log_probs = model(img_tensor)

    # Get primary prediction with confidence
    prediction, confidence = ctc_decode_with_confidence(log_probs, idx_to_char)
    confidence_pct = confidence * 100

    # Get top-k predictions to check uncertainty
    top_predictions = ctc_decode_top_k(log_probs, idx_to_char, k=3)

    # Show alternatives if:
    #   1. Low confidence (< 70%), OR
    #   2. Top 2 predictions are very close (margin < 0.1)
    show_alternatives = False
    if len(top_predictions) >= 2:
        margin = top_predictions[0][1] - top_predictions[1][1]
        show_alternatives = confidence < 0.70 or margin < 0.1

    # Format output
    parts = [f"**Primary Prediction:** {prediction}\n\n"]

    # Add status and alternatives based on confidence and margin
    if show_alternatives:
        if confidence < 0.6:
            status = "⚠️ Low Confidence"
        elif confidence < 0.70:
            status = "⚡ Medium Confidence"
        else:
            status = "⚠️ Uncertain"  # High confidence but close alternatives
        note = "Visual ambiguity detected (e.g., 0/o, i/1/l confusion)"

        parts.append(f"{status} — {confidence_pct:.1f}%\n")
        parts.append(f"{note}\n\n")
        parts.append("**Alternative Predictions:**\n")
        for i, (text, conf) in enumerate(top_predictions, 1):
            conf_pct = conf * 100
            marker = "→" if i == 1 else " "
            parts.append(f"{marker} {i}. `{text}` — {conf_pct:.1f}%\n")
        parts.append("\n💡 *Tip: Check which makes sense in context*")
    else:
        if confidence < 0.6:
            # No alternatives to show, but the result must still not be
            # presented as reasonably reliable.
            status = "⚠️ Low Confidence"
            note = "Result may be unreliable"
        elif confidence < 0.75:
            status = "⚡ Medium Confidence"
            note = "Result is reasonably reliable"
        else:
            status = "✓ High Confidence"
            note = "Result is highly reliable"
        parts.append(f"{status} — {confidence_pct:.1f}%\n")
        parts.append(f"{note}")

    return "".join(parts)


# ==========================================
# 5. Gradio Interface
# ==========================================
# Kept flush-left: the text is rendered as Markdown, where indented lines
# would be treated as code blocks.
DESCRIPTION = """
**CS4243 Mini Project - CAPTCHA Recognition using CRNN + CTC Loss**

Upload a CAPTCHA image to see the model's prediction with confidence score.

**Model Architecture:**
- ResNet-based CNN feature extraction (4 layers, 2 blocks each)
- Bidirectional LSTM (hidden_size=384, 2 layers)
- CTC Loss for alignment-free training

**Performance:**
- Sequence Accuracy: 55.6%
- Character Accuracy: 85.82%
- Trained on 7,777 samples with heavy augmentation

**Features:**
- **Confidence scoring**: Shows prediction reliability
- **Multiple predictions**: Shows top 3 alternatives when confidence < 70% or the leading candidates are close
- **Smart warnings**: Alerts when visual ambiguity exists (0/o, i/1/l confusion)
- **Real-time inference**: Results in <1 second

**Training Details:**
- 14 iterations of systematic experimentation
- Data augmentation: rotation, shear, black lines, noise
- Regularization: dropout, weight decay, early stopping
"""

demo = gr.Interface(
    fn=predict_captcha,
    inputs=gr.Image(type="pil", label="Upload CAPTCHA Image"),
    outputs=gr.Textbox(label="Prediction Results", lines=10, scale=2),
    title="CAPTCHA Recognition System",
    description=DESCRIPTION,
    examples=[
        # Add example image paths here if you want
        # ["example1.png"],
        # ["example2.png"],
    ],
    theme=gr.themes.Soft(),
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch(share=True)  # Enable share button for 72-hour public links