|
|
import gradio as gr |
|
|
import torch |
|
|
import torch.nn as nn |
|
|
import torch.nn.functional as F |
|
|
import cv2 |
|
|
import numpy as np |
|
|
from PIL import Image |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ResidualBlock(nn.Module):
    """A two-convolution residual unit: 3x3 conv -> BN -> ReLU -> 3x3 conv -> BN,
    with the input added back before the final activation.

    The skip connection eases gradient flow through deep stacks and lets the
    block learn a residual correction instead of a full mapping.  When input
    and output shapes differ, the optional ``downsample`` module projects the
    identity branch to match.
    """

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(ResidualBlock, self).__init__()
        # Main branch: two 3x3 convolutions, each followed by batch norm.
        # Conv bias is disabled because BatchNorm supplies its own shift term.
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        # Optional projection (e.g. 1x1 conv + BN) applied to the skip path.
        self.downsample = downsample

    def forward(self, x):
        # Residual (main) branch.
        residual = self.bn1(self.conv1(x))
        residual = self.relu(residual)
        residual = self.bn2(self.conv2(residual))

        # Identity (skip) branch, projected when shapes disagree.
        shortcut = x if self.downsample is None else self.downsample(x)

        # Merge the two paths, then apply the final non-linearity.
        return self.relu(residual + shortcut)
|
|
|
|
|
class CRNN(nn.Module):
    """
    Convolutional Recurrent Neural Network for CTC-based text recognition.

    Architecture: ResNet-style CNN feature extractor -> bidirectional LSTM
    over the width axis -> per-timestep linear classifier -> log-softmax
    (ready for ``nn.CTCLoss``).

    Args:
        img_height: Input image height; must be a multiple of 16 because the
            four pooling stages each halve the height.
        img_width: Nominal input width (stored for reference only; the
            forward pass accepts any width divisible by 4).
        num_classes: Output alphabet size including the CTC blank.
        hidden_size: Hidden units per LSTM direction.
        num_lstm_layers: Number of stacked LSTM layers.
        dropout: NOTE(review) -- accepted but currently unused; internal
            dropout rates are fixed at 0.2 (conv) and 0.3 (LSTM).  Kept in
            the signature for backward compatibility with existing callers.
    """

    # The four pooling stages each halve the height: 2**4 = 16.
    HEIGHT_DOWNSCALE = 16

    def __init__(
        self,
        img_height=80,
        img_width=280,
        num_classes=63,
        hidden_size=384,
        num_lstm_layers=2,
        dropout=0.4
    ):
        super(CRNN, self).__init__()

        if img_height % self.HEIGHT_DOWNSCALE != 0:
            raise ValueError(
                f"img_height must be a multiple of {self.HEIGHT_DOWNSCALE}, got {img_height}"
            )

        self.img_height = img_height
        self.img_width = img_width
        self.num_classes = num_classes
        self.hidden_size = hidden_size

        # Stem: single conv lifting 1 grayscale channel to 64 feature maps.
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True)
        )

        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)   # H/2,  W/2

        self.layer1 = self._make_layer(64, 128, blocks=2)

        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)   # H/4,  W/4

        self.layer2 = self._make_layer(128, 256, blocks=2)

        # From here on only the height is pooled, so the width (the CTC time
        # axis) keeps enough resolution for roughly one step per glyph slice.
        self.pool3 = nn.MaxPool2d(kernel_size=(2, 1))        # H/8,  W/4

        self.layer3 = self._make_layer(256, 512, blocks=2)

        self.pool4 = nn.MaxPool2d(kernel_size=(2, 1))        # H/16, W/4

        self.dropout = nn.Dropout2d(0.2)

        # Feature-map height after the CNN, derived from img_height rather
        # than hard-coded (80 // 16 == 5, matching the previous constant) so
        # other input heights work without editing the class.
        self.map_to_seq_height = img_height // self.HEIGHT_DOWNSCALE
        self.map_to_seq_channels = 512
        self.rnn_input_size = self.map_to_seq_height * self.map_to_seq_channels

        self.rnn = nn.LSTM(
            input_size=self.rnn_input_size,
            hidden_size=hidden_size,
            num_layers=num_lstm_layers,
            bidirectional=True,
            dropout=0.3 if num_lstm_layers > 1 else 0,
            batch_first=False
        )

        # Bidirectional -> the LSTM emits 2 * hidden_size features per step.
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def _make_layer(self, in_channels, out_channels, blocks):
        """Stack ``blocks`` residual blocks; the first may change channels."""
        downsample = None
        if in_channels != out_channels:
            # 1x1 projection so the skip connection matches the new channel count.
            downsample = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=False),
                nn.BatchNorm2d(out_channels)
            )

        layers = [ResidualBlock(in_channels, out_channels, stride=1, downsample=downsample)]
        for _ in range(1, blocks):
            layers.append(ResidualBlock(out_channels, out_channels))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Return CTC log-probabilities of shape (T, N, num_classes).

        Args:
            x: Batch of shape (N, 1, img_height, W) with W divisible by 4;
               the resulting sequence length is T = W // 4.
        """
        # CNN feature extraction.
        x = self.conv1(x)
        x = self.pool1(x)

        x = self.layer1(x)
        x = self.pool2(x)

        x = self.layer2(x)
        x = self.pool3(x)

        x = self.layer3(x)
        x = self.pool4(x)

        conv_out = self.dropout(x)

        batch_size, channels, height, width = conv_out.size()

        # Map (N, C, H', W') -> (W', N, C*H'): each image column becomes one
        # LSTM timestep whose feature vector stacks all channels and rows.
        rnn_input = conv_out.permute(3, 0, 1, 2).reshape(
            width, batch_size, channels * height
        )

        rnn_output, _ = self.rnn(rnn_input)   # (T, N, 2 * hidden_size)

        # nn.Linear operates on the trailing dimension, so the 3-D tensor can
        # be projected directly without flattening to (T*N, features).
        output = self.fc(rnn_output)          # (T, N, num_classes)

        # CTC loss consumes log-probabilities.
        return F.log_softmax(output, dim=2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def resize_and_pad(img, target_size=(80, 280)):
    """Resize a grayscale image to fit inside ``target_size`` while keeping
    its aspect ratio, then centre it on a white canvas.

    Args:
        img: 2-D (grayscale) numpy image.
        target_size: (height, width) of the output canvas.

    Returns:
        A (target_h, target_w) array of the same dtype as ``img`` with the
        resized image centred and the borders filled with white (255).
    """
    target_h, target_w = target_size
    h, w = img.shape[:2]

    # Uniform scale so the whole image fits within the target box.
    scale = min(target_w / w, target_h / h)
    # Guard against extreme aspect ratios truncating a side to 0 pixels,
    # which would make cv2.resize raise.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))
    resized = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_NEAREST)

    # White canvas (255) matches the CAPTCHA background.
    padded = np.ones((target_h, target_w), dtype=img.dtype) * 255

    # Centre the resized image on the canvas.
    x_offset = (target_w - new_w) // 2
    y_offset = (target_h - new_h) // 2
    padded[y_offset:y_offset + new_h, x_offset:x_offset + new_w] = resized

    return padded
|
|
|
|
|
def remove_black_lines(img):
    """Erase dark strike-through lines from a BGR image via inpainting.

    Pixels whose HSV value channel is at most 80 (near-black at any hue or
    saturation) are masked and reconstructed from their surroundings.
    """
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    # Near-black in HSV: any hue (0-180), any saturation, low value (<= 80).
    dark_mask = cv2.inRange(hsv,
                            np.array([0, 0, 0]),
                            np.array([180, 255, 80]))
    # Fill the masked pixels using Telea's fast-marching inpainting.
    return cv2.inpaint(img, dark_mask, inpaintRadius=1, flags=cv2.INPAINT_TELEA)
|
|
|
|
|
def preprocess_image(image):
    """Convert a PIL image into the normalized tensor the CRNN expects.

    Pipeline: force RGB -> BGR (OpenCV channel order) -> inpaint black
    strike-through lines -> grayscale -> aspect-preserving resize/pad to
    80x280 -> scale to [0, 1] -> add batch and channel dimensions.

    Args:
        image: PIL.Image in any mode; RGBA, palette and grayscale uploads
            are handled by converting to RGB first.

    Returns:
        Float tensor of shape (1, 1, 80, 280).
    """
    # Normalizing to RGB up front makes the colour-space conversions below
    # safe for non-RGB uploads (4-channel or single-channel arrays would
    # otherwise crash cv2.cvtColor).
    img = np.array(image.convert("RGB"))

    # OpenCV routines below assume BGR channel order.
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

    # Remove black occluding lines before the glyph information is lost.
    img = remove_black_lines(img)

    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Match the model's training resolution without distorting the glyphs.
    img = resize_and_pad(img, target_size=(80, 280))

    # Scale to [0, 1] and add (batch, channel) dims -> (1, 1, 80, 280).
    img = img.astype('float32') / 255.0
    return torch.tensor(img).unsqueeze(0).unsqueeze(0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Character set and CTC label mappings.
# Index 0 is reserved for the CTC blank; characters start at index 1.
# ---------------------------------------------------------------------------
CHARS = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
char_to_idx = {c: i + 1 for i, c in enumerate(CHARS)}
idx_to_char = {i + 1: c for i, c in enumerate(CHARS)}
idx_to_char[0] = ""  # the blank token decodes to the empty string

num_classes = len(CHARS) + 1  # 62 characters + 1 CTC blank = 63

# ---------------------------------------------------------------------------
# Model construction and checkpoint loading.
# ---------------------------------------------------------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CRNN(
    img_height=80,
    img_width=280,
    num_classes=num_classes,  # derived from CHARS instead of hard-coded 63
    hidden_size=384,
    num_lstm_layers=2
).to(device)

# NOTE(review): torch.load can execute pickled code -- only load trusted
# checkpoints (consider weights_only=True on recent PyTorch versions).
checkpoint = torch.load('best_model.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()  # inference mode: disables dropout, uses BN running stats

print(f"Model loaded successfully! Using device: {device}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def ctc_decode_with_confidence(log_probs, idx_to_char):
    """Greedy CTC decoding that also reports a confidence score.

    Args:
        log_probs: Model output of shape (T, 1, C) holding log-probabilities.
        idx_to_char: Mapping from class index to character ('' for blank 0).

    Returns:
        Tuple ``(text, confidence)`` where confidence is the mean probability
        of the kept (non-blank, de-duplicated) symbols, or 0.0 when nothing
        was decoded.
    """
    # Back to probabilities; drop the singleton batch axis -> (T, C).
    frame_probs = torch.exp(log_probs).squeeze(1)

    # Greedy path: most likely class per time step.
    best_probs, best_ids = torch.max(frame_probs, dim=1)
    best_probs = best_probs.cpu().numpy()
    best_ids = best_ids.cpu().numpy()

    # Standard CTC collapse: drop blanks (0) and consecutive repeats.
    kept_ids = []
    kept_probs = []
    previous = None
    for symbol, p in zip(best_ids, best_probs):
        if symbol != 0 and symbol != previous:
            kept_ids.append(symbol)
            kept_probs.append(p)
        previous = symbol

    text = ''.join(idx_to_char.get(s, '') for s in kept_ids)
    score = float(np.mean(kept_probs)) if kept_probs else 0.0
    return text, score
|
|
|
|
|
|
|
|
def ctc_decode_top_k(log_probs, idx_to_char, k=3):
    """Approximate CTC beam search returning up to ``k`` candidate strings.

    Args:
        log_probs: Model output of shape (T, 1, C) with log-probabilities.
        idx_to_char: Mapping from class index to character ('' for blank 0).
        k: Beam width and maximum number of alternatives returned.

    Returns:
        List of ``(text, confidence)`` tuples ordered by descending beam
        probability; confidence is the length-normalised (geometric-mean)
        path probability.
    """
    frame_probs = torch.exp(log_probs).squeeze(1).cpu()
    num_steps, num_symbols = frame_probs.shape

    # A beam is (decoded_text, path_probability, last_emitted_index).
    beams = [('', 1.0, None)]

    for step in range(num_steps):
        # Only the 2k most likely symbols per frame are worth expanding.
        step_probs, step_ids = torch.topk(frame_probs[step], k=min(k * 2, num_symbols))
        step_pairs = list(zip(step_probs.tolist(), step_ids.tolist()))

        candidates = []
        for text, path_prob, last in beams:
            for p, idx in step_pairs:
                if idx == 0:
                    # Blank: text unchanged, repetition tracking resets.
                    candidates.append((text, path_prob * p, None))
                elif idx == last:
                    # Repeat of the previous symbol: collapsed, no new char.
                    candidates.append((text, path_prob * p, last))
                else:
                    # New symbol: append its character.
                    candidates.append((text + idx_to_char.get(idx, ''), path_prob * p, idx))

        # Prune to the k most probable beams (stable sort keeps expansion order
        # for ties, matching deterministic output).
        candidates.sort(key=lambda beam: beam[1], reverse=True)
        beams = candidates[:k]

    # De-duplicate texts: different paths can yield the same string.
    results = []
    seen = set()
    for text, path_prob, _ in beams:
        if text in seen:
            continue
        seen.add(text)
        # Geometric-mean normalisation keeps scores comparable across lengths.
        results.append((text, float(path_prob ** (1.0 / max(len(text), 1)))))
        if len(results) >= k:
            break

    return results
|
|
|
|
|
|
|
|
def predict_captcha(image):
    """Predict CAPTCHA text from image with confidence score and alternatives

    Args:
        image: PIL image supplied by the Gradio UI.

    Returns:
        A markdown-formatted report string: primary prediction, a confidence
        verdict, and (when the model is uncertain) the top-3 alternatives.
    """

    # Preprocess and move to the model's device (module-level globals:
    # preprocess_image, device, model, idx_to_char).
    img_tensor = preprocess_image(image).to(device)

    # Inference only -- no gradient tracking needed.
    with torch.no_grad():
        log_probs = model(img_tensor)

    # Greedy decode for the primary answer and its confidence.
    prediction, confidence = ctc_decode_with_confidence(log_probs, idx_to_char)
    confidence_pct = confidence * 100

    # Beam-search decode for alternative readings.
    top_predictions = ctc_decode_top_k(log_probs, idx_to_char, k=3)

    # Show alternatives when the greedy confidence is low (< 0.70) or the top
    # two beam candidates are nearly tied (margin < 0.1) -- i.e. the image is
    # visually ambiguous.
    show_alternatives = False
    if len(top_predictions) >= 2:
        top1_conf = top_predictions[0][1]
        top2_conf = top_predictions[1][1]
        margin = top1_conf - top2_conf

        if confidence < 0.70 or margin < 0.1:
            show_alternatives = True

    output = f"**Primary Prediction:** {prediction}\n\n"

    if show_alternatives:
        # Pick a status label from the greedy-confidence band.
        if confidence < 0.6:
            status = "β οΈ Low Confidence"
        elif confidence < 0.70:
            status = "β‘ Medium Confidence"
        else:
            # Confidence itself is fine but the candidate margin is small.
            status = "β οΈ Uncertain"

        note = "Visual ambiguity detected (e.g., 0/o, i/1/l confusion)"

        output += f"{status} β {confidence_pct:.1f}%\n"
        output += f"{note}\n\n"
        output += "**Alternative Predictions:**\n"

        # List each beam candidate with its score; arrow marks the best one.
        for i, (text, conf) in enumerate(top_predictions, 1):
            conf_pct = conf * 100
            marker = "β" if i == 1 else " "
            output += f"{marker} {i}. `{text}` β {conf_pct:.1f}%\n"

        output += "\nπ‘ *Tip: Check which makes sense in context*"

    elif confidence < 0.75:
        status = "β‘ Medium Confidence"
        note = "Result is reasonably reliable"
        output += f"{status} β {confidence_pct:.1f}%\n"
        output += f"{note}"
    else:
        status = "β High Confidence"
        note = "Result is highly reliable"
        output += f"{status} β {confidence_pct:.1f}%\n"
        output += f"{note}"

    return output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: single image in, markdown-formatted text report out.
# NOTE(review): `allow_flagging` is deprecated in newer Gradio releases
# (renamed to `flagging_mode`) -- confirm against the installed version.
# ---------------------------------------------------------------------------
demo = gr.Interface(
    fn=predict_captcha,
    inputs=gr.Image(type="pil", label="Upload CAPTCHA Image"),
    outputs=gr.Textbox(label="Prediction Results", lines=10, scale=2),
    title="CAPTCHA Recognition System",
    description="""
    **CS4243 Mini Project - CAPTCHA Recognition using CRNN + CTC Loss**

    Upload a CAPTCHA image to see the model's prediction with confidence score.

    **Model Architecture:**
    - ResNet-based CNN feature extraction (4 layers, 2 blocks each)
    - Bidirectional LSTM (hidden_size=384, 2 layers)
    - CTC Loss for alignment-free training

    **Performance:**
    - Sequence Accuracy: 55.6%
    - Character Accuracy: 85.82%
    - Trained on 7,777 samples with heavy augmentation

    **Features:**
    - **Confidence scoring**: Shows prediction reliability
    - **Multiple predictions**: Shows top 3 alternatives when confidence < 60%
    - **Smart warnings**: Alerts when visual ambiguity exists (0/o, i/1/l confusion)
    - **Real-time inference**: Results in <1 second

    **Training Details:**
    - 14 iterations of systematic experimentation
    - Data augmentation: rotation, shear, black lines, noise
    - Regularization: dropout, weight decay, early stopping
    """,
    examples=[
        # No bundled example images yet.
    ],
    theme=gr.themes.Soft(),
    allow_flagging="never"
)
|
|
|
|
|
if __name__ == "__main__":
    # share=True publishes a temporary public gradio.live URL -- anyone with
    # the link can reach this app; disable for private deployments.
    demo.launch(share=True)
|
|
|
|
|
|