# Author: Karthikraj Sivakumar
# Commit 7e07095 — fix biased uncertainty problem
import gradio as gr
import torch
import torch.nn as nn
import torch.nn.functional as F
import cv2
import numpy as np
from PIL import Image
# ==========================================
# 1. Model Architecture (Match notebook exactly)
# ==========================================
class ResidualBlock(nn.Module):
    """Two 3x3 conv layers wrapped with an identity shortcut.

    The shortcut (optionally projected by ``downsample`` when the channel
    count or stride changes) is added to the conv branch before the final
    ReLU, which eases gradient flow through deep stacks.
    """

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(ResidualBlock, self).__init__()
        # Attribute names are kept verbatim — they are part of the
        # checkpoint's state_dict keys.
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        # Shortcut path: project the input when shapes differ, else identity.
        shortcut = x if self.downsample is None else self.downsample(x)
        # Main path: conv → BN → ReLU → conv → BN.
        branch = self.relu(self.bn1(self.conv1(x)))
        branch = self.bn2(self.conv2(branch))
        # Residual addition followed by the output activation.
        return self.relu(branch + shortcut)
class CRNN(nn.Module):
    """
    Convolutional Recurrent Neural Network with ResNet-style CNN
    Architecture: ResNet CNN + Bidirectional LSTM + CTC Loss

    Input:  (N, 1, 80, 280) grayscale images.
    Output: (70, N, num_classes) log-probabilities in the (T, N, C) layout
            expected by torch's CTC loss (width 70 becomes the time axis).
    """
    def __init__(
        self,
        img_height=80,
        img_width=280,
        num_classes=63,  # 62 alphanumeric + 1 blank
        hidden_size=384,
        num_lstm_layers=2,
        dropout=0.4
    ):
        # NOTE(review): the `dropout` argument is accepted but never used —
        # the Dropout2d rate (0.2) and LSTM dropout (0.3) are hard-coded
        # below. Left untouched to keep checkpoint/notebook behaviour;
        # confirm against the training notebook before wiring it up.
        super(CRNN, self).__init__()
        self.img_height = img_height
        self.img_width = img_width
        self.num_classes = num_classes
        self.hidden_size = hidden_size
        # Initial conv: (1, 80, 280) → (64, 80, 280)
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True)
        )
        # Pool1: (64, 80, 280) → (64, 40, 140)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        # ResBlock layer1: (64, 40, 140) → (128, 40, 140)
        self.layer1 = self._make_layer(64, 128, blocks=2)
        # Pool2: (128, 40, 140) → (128, 20, 70)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        # ResBlock layer2: (128, 20, 70) → (256, 20, 70)
        self.layer2 = self._make_layer(128, 256, blocks=2)
        # Pool3: (256, 20, 70) → (256, 10, 70)
        # Height-only pooling keeps width=70 so the sequence stays long
        # enough for CTC alignment.
        self.pool3 = nn.MaxPool2d(kernel_size=(2, 1))  # Only height
        # ResBlock layer3: (256, 10, 70) → (512, 10, 70)
        self.layer3 = self._make_layer(256, 512, blocks=2)
        # Pool4: (512, 10, 70) → (512, 5, 70)
        self.pool4 = nn.MaxPool2d(kernel_size=(2, 1))  # Only height
        # Optional dropout
        self.dropout = nn.Dropout2d(0.2)
        # Calculate RNN input size
        # After all conv layers: (512 channels, 5 height, 70 width)
        self.map_to_seq_height = 5
        self.map_to_seq_channels = 512
        self.rnn_input_size = self.map_to_seq_height * self.map_to_seq_channels
        # Recurrent Layers (Bidirectional LSTM)
        self.rnn = nn.LSTM(
            input_size=self.rnn_input_size,
            hidden_size=hidden_size,
            num_layers=num_lstm_layers,
            bidirectional=True,
            dropout=0.3 if num_lstm_layers > 1 else 0,
            batch_first=False  # (T, N, C) format for CTC
        )
        # Fully Connected Layer
        # hidden_size * 2 because the LSTM is bidirectional.
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def _make_layer(self, in_channels, out_channels, blocks):
        """Create a layer with multiple residual blocks.

        The first block carries a 1x1-conv projection when the channel
        count changes; the remaining blocks use a plain identity shortcut.
        """
        downsample = None
        if in_channels != out_channels:
            downsample = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=False),
                nn.BatchNorm2d(out_channels)
            )
        layers = []
        layers.append(ResidualBlock(in_channels, out_channels, stride=1, downsample=downsample))
        for _ in range(1, blocks):
            layers.append(ResidualBlock(out_channels, out_channels))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Forward pass: (N, 1, 80, 280) → (70, N, num_classes) log-probs."""
        # CNN Feature Extraction
        x = self.conv1(x)   # (N, 64, 80, 280)
        x = self.pool1(x)   # (N, 64, 40, 140)
        x = self.layer1(x)  # (N, 128, 40, 140)
        x = self.pool2(x)   # (N, 128, 20, 70)
        x = self.layer2(x)  # (N, 256, 20, 70)
        x = self.pool3(x)   # (N, 256, 10, 70)
        x = self.layer3(x)  # (N, 512, 10, 70)
        x = self.pool4(x)   # (N, 512, 5, 70)
        conv_out = self.dropout(x)  # (N, 512, 5, 70)
        batch_size, channels, height, width = conv_out.size()
        # Map to Sequence: each of the 70 width positions becomes one
        # timestep whose features are the flattened (channels x height) column.
        conv_out = conv_out.permute(0, 3, 1, 2)  # (N, 70, 512, 5)
        conv_out = conv_out.reshape(batch_size, width, channels * height)  # (N, 70, 2560)
        # Prepare for LSTM (time-major layout for CTC)
        rnn_input = conv_out.permute(1, 0, 2)  # (70, N, 2560)
        # Bidirectional LSTM
        rnn_output, _ = self.rnn(rnn_input)  # (70, N, 768)
        # Fully Connected Layer applied per (timestep, batch) pair
        T, N, hidden = rnn_output.size()
        rnn_output = rnn_output.reshape(T * N, hidden)  # (70*N, 768)
        output = self.fc(rnn_output)  # (70*N, 63)
        output = output.reshape(T, N, self.num_classes)  # (70, N, 63)
        # Log Softmax for CTC Loss
        log_probs = F.log_softmax(output, dim=2)  # (70, N, 63)
        return log_probs
# ==========================================
# 2. Preprocessing Functions
# ==========================================
def resize_and_pad(img, target_size=(80, 280)):
    """Scale ``img`` to fit inside ``target_size`` (h, w) while preserving
    aspect ratio, then centre it on a white canvas of exactly that size.
    """
    target_h, target_w = target_size
    src_h, src_w = img.shape[:2]
    # Uniform scale factor so the image fits entirely inside the target box.
    ratio = min(target_w / src_w, target_h / src_h)
    new_w = int(src_w * ratio)
    new_h = int(src_h * ratio)
    scaled = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_NEAREST)
    # White (255) background matches the captcha paper colour.
    canvas = np.full((target_h, target_w), 255, dtype=img.dtype)
    left = (target_w - new_w) // 2
    top = (target_h - new_h) // 2
    canvas[top:top + new_h, left:left + new_w] = scaled
    return canvas
def remove_black_lines(img):
    """Erase dark strike-through lines from a BGR captcha image.

    Dark pixels (low HSV value, any hue/saturation) are masked and then
    filled in from their surroundings with Telea inpainting.
    """
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    # Anything with value <= 80 counts as "black" regardless of hue.
    dark_mask = cv2.inRange(hsv,
                            np.array([0, 0, 0]),
                            np.array([180, 255, 80]))
    # Reconstruct the masked pixels from neighbouring content.
    return cv2.inpaint(img, dark_mask, inpaintRadius=1, flags=cv2.INPAINT_TELEA)
def preprocess_image(image):
    """Convert an uploaded image into the model's input tensor.

    Steps: force 3-channel RGB, inpaint away dark noise lines, grayscale,
    aspect-preserving resize + pad to 80x280, scale to [0, 1].

    Args:
        image: PIL.Image from the Gradio ``Image`` input (any mode), or a
            numpy array.

    Returns:
        torch.FloatTensor of shape (1, 1, 80, 280).
    """
    # Robustness fix: uploads may arrive as grayscale ('L'), palette ('P')
    # or 'RGBA' images; remove_black_lines requires a 3-channel BGR array,
    # so normalize the PIL image to RGB before converting to numpy.
    if hasattr(image, "convert"):
        image = image.convert("RGB")
    img = np.array(image)
    if img.ndim == 2:
        # Raw 2-D array input: expand to 3 channels for the HSV/inpaint step.
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    else:
        # numpy arrays from PIL are RGB; OpenCV expects BGR.
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    # Remove noise lines
    img = remove_black_lines(img)
    # Convert to grayscale (model input is single-channel)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Resize and pad to the training geometry
    img = resize_and_pad(img, target_size=(80, 280))
    # Normalize pixel values to [0, 1]
    img = img.astype('float32') / 255.0
    return torch.tensor(img).unsqueeze(0).unsqueeze(0)  # (1, 1, H, W)
# ==========================================
# 3. Load Model & Character Mapping
# ==========================================
# Character set: digits, uppercase, lowercase — 62 symbols total.
CHARS = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
# Index 0 is reserved for the CTC blank token, so characters map to 1..62.
char_to_idx = {c: i + 1 for i, c in enumerate(CHARS)}
idx_to_char = {i + 1: c for i, c in enumerate(CHARS)}
idx_to_char[0] = ""  # blank token decodes to the empty string
num_classes = len(CHARS) + 1  # 62 characters + 1 CTC blank = 63

# Load model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CRNN(
    img_height=80,
    img_width=280,
    num_classes=63,
    hidden_size=384,  # IMPORTANT: Must match training
    num_lstm_layers=2
).to(device)

# Load checkpoint
# NOTE(review): torch.load unpickles arbitrary objects — only load trusted
# checkpoint files.
checkpoint = torch.load('best_model.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
# eval() disables dropout and switches BatchNorm to running statistics.
model.eval()
print(f"Model loaded successfully! Using device: {device}")
# ==========================================
# 4. Prediction Functions
# ==========================================
def ctc_decode_with_confidence(log_probs, idx_to_char):
    """Greedy CTC decode with an average per-character confidence.

    Args:
        log_probs: Log probabilities from the model, shape (T, 1, C).
        idx_to_char: Mapping from class index to character (0 = blank).

    Returns:
        tuple: (decoded text, mean probability of the emitted timesteps,
        in [0, 1]; 0.0 when nothing is emitted).
    """
    # Back to probabilities; drop the batch dimension -> (T, C).
    probs = torch.exp(log_probs).squeeze(1)
    # Greedy path: best class (and its probability) at every timestep.
    max_probs, max_indices = torch.max(probs, dim=1)
    max_probs = max_probs.cpu().numpy()
    max_indices = max_indices.cpu().numpy()

    # CTC collapse: drop blanks and merge consecutive repeats. `prev` is
    # updated on EVERY step (including blanks) so that a blank between two
    # identical characters correctly yields a doubled letter (e.g. "oo").
    collapsed_tokens = []
    collapsed_probs = []
    prev = None
    for token, prob in zip(max_indices, max_probs):
        if token != 0 and token != prev:  # not blank and not a repeat
            collapsed_tokens.append(token)
            collapsed_probs.append(prob)
        prev = token

    prediction = ''.join(idx_to_char.get(t, '') for t in collapsed_tokens)
    confidence = float(np.mean(collapsed_probs)) if collapsed_probs else 0.0
    return prediction, confidence
def ctc_decode_top_k(log_probs, idx_to_char, k=3):
    """Return up to ``k`` candidate decodings via a small beam search.

    Args:
        log_probs: Log probabilities from the model, shape (T, 1, C).
        idx_to_char: Mapping from class index to character (0 = blank).
        k: Beam width and maximum number of alternatives returned.

    Returns:
        List of (text, confidence) tuples, best first. Confidence is the
        beam probability normalized by text length (geometric mean).
    """
    probs = torch.exp(log_probs).squeeze(1).cpu()  # (T, C)
    T, C = probs.shape

    # Each hypothesis tracks its text, running probability, and the last
    # emitted class ('last' is None right after a blank, so the same
    # character may be emitted again).
    beams = [{'text': '', 'prob': 1.0, 'last': None}]
    for t in range(T):
        candidates = []
        for beam in beams:
            # Only the strongest classes at this timestep are expanded.
            step_probs, step_indices = torch.topk(probs[t], k=min(k * 2, C))
            for p, i in zip(step_probs, step_indices):
                i = i.item()
                p = p.item()
                if i == 0:
                    # Blank: text unchanged, repeats become legal again.
                    candidates.append({'text': beam['text'],
                                       'prob': beam['prob'] * p,
                                       'last': None})
                elif i != beam['last']:
                    # Emission of a new character.
                    candidates.append({'text': beam['text'] + idx_to_char.get(i, ''),
                                       'prob': beam['prob'] * p,
                                       'last': i})
                else:
                    # Same class again: CTC merges it into the previous emission.
                    candidates.append({'text': beam['text'],
                                       'prob': beam['prob'] * p,
                                       'last': beam['last']})
        # Prune to the k most probable hypotheses.
        beams = sorted(candidates, key=lambda b: b['prob'], reverse=True)[:k]

    # Deduplicate by text, keeping the highest-probability variant first.
    seen = set()
    results = []
    for beam in beams:
        if beam['text'] in seen:
            continue
        seen.add(beam['text'])
        # Length-normalize so short and long strings are comparable.
        score = beam['prob'] ** (1.0 / max(len(beam['text']), 1))
        results.append((beam['text'], float(score)))
        if len(results) >= k:
            break
    return results
def predict_captcha(image):
    """Predict CAPTCHA text from image with confidence score and alternatives.

    Runs the model on the preprocessed image, then builds a markdown report:
    the greedy prediction, a confidence band, and (when the result is
    uncertain) the top-3 beam-search alternatives.
    """
    # Preprocess
    img_tensor = preprocess_image(image).to(device)
    # Inference
    with torch.no_grad():
        log_probs = model(img_tensor)
    # Get primary prediction with confidence
    prediction, confidence = ctc_decode_with_confidence(log_probs, idx_to_char)
    confidence_pct = confidence * 100
    # Get top-k predictions to check uncertainty
    top_predictions = ctc_decode_top_k(log_probs, idx_to_char, k=3)
    # Check if alternatives are close (uncertainty margin)
    show_alternatives = False
    if len(top_predictions) >= 2:
        top1_conf = top_predictions[0][1]
        top2_conf = top_predictions[1][1]
        margin = top1_conf - top2_conf
        # Show alternatives if:
        # 1. Low confidence (< 70%), OR
        # 2. Top 2 predictions are very close (margin < 0.1)
        if confidence < 0.70 or margin < 0.1:
            show_alternatives = True
    # Format output
    output = f"**Primary Prediction:** {prediction}\n\n"
    # Add status and alternatives based on confidence and margin
    if show_alternatives:
        if confidence < 0.6:
            status = "⚠️ Low Confidence"
        elif confidence < 0.70:
            status = "⚡ Medium Confidence"
        else:
            status = "⚠️ Uncertain"  # High confidence but close alternatives
        note = "Visual ambiguity detected (e.g., 0/o, i/1/l confusion)"
        output += f"{status} — {confidence_pct:.1f}%\n"
        output += f"{note}\n\n"
        output += "**Alternative Predictions:**\n"
        for i, (text, conf) in enumerate(top_predictions, 1):
            conf_pct = conf * 100
            marker = "→" if i == 1 else " "
            output += f"{marker} {i}. `{text}` — {conf_pct:.1f}%\n"
        output += "\n💡 *Tip: Check which makes sense in context*"
    elif confidence < 0.75:
        status = "⚡ Medium Confidence"
        note = "Result is reasonably reliable"
        output += f"{status} — {confidence_pct:.1f}%\n"
        output += f"{note}"
    else:
        status = "✓ High Confidence"
        note = "Result is highly reliable"
        output += f"{status} — {confidence_pct:.1f}%\n"
        output += f"{note}"
    return output
# ==========================================
# 5. Gradio Interface
# ==========================================
# Gradio UI: single image input → markdown-formatted prediction report.
demo = gr.Interface(
    fn=predict_captcha,
    inputs=gr.Image(type="pil", label="Upload CAPTCHA Image"),
    outputs=gr.Textbox(label="Prediction Results", lines=10, scale=2),
    title="CAPTCHA Recognition System",
    description="""
    **CS4243 Mini Project - CAPTCHA Recognition using CRNN + CTC Loss**
    Upload a CAPTCHA image to see the model's prediction with confidence score.
    **Model Architecture:**
    - ResNet-based CNN feature extraction (4 layers, 2 blocks each)
    - Bidirectional LSTM (hidden_size=384, 2 layers)
    - CTC Loss for alignment-free training
    **Performance:**
    - Sequence Accuracy: 55.6%
    - Character Accuracy: 85.82%
    - Trained on 7,777 samples with heavy augmentation
    **Features:**
    - **Confidence scoring**: Shows prediction reliability
    - **Multiple predictions**: Shows top 3 alternatives when confidence < 60%
    - **Smart warnings**: Alerts when visual ambiguity exists (0/o, i/1/l confusion)
    - **Real-time inference**: Results in <1 second
    **Training Details:**
    - 14 iterations of systematic experimentation
    - Data augmentation: rotation, shear, black lines, noise
    - Regularization: dropout, weight decay, early stopping
    """,
    examples=[
        # Add example image paths here if you want
        # ["example1.png"],
        # ["example2.png"],
    ],
    theme=gr.themes.Soft(),
    # NOTE(review): `allow_flagging` is deprecated in Gradio 4.x in favour of
    # `flagging_mode` — confirm the installed Gradio version before upgrading.
    allow_flagging="never"
)

if __name__ == "__main__":
    demo.launch(share=True)  # Enable share button for 72-hour public links