# Author: Karthikraj Sivakumar
# Commit 7e07095 — fix biased uncertainty problem
import gradio as gr
import torch
import torch.nn as nn
import torch.nn.functional as F
import cv2
import numpy as np
from PIL import Image
# ==========================================
# 1. Model Architecture (Match notebook exactly)
# ==========================================
class ResidualBlock(nn.Module):
    """Two 3x3 conv layers wrapped with an identity shortcut.

    The shortcut (optionally projected by ``downsample`` when the channel
    count or stride changes) is added to the conv branch before the final
    ReLU, which eases gradient flow through deep stacks.
    """

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(ResidualBlock, self).__init__()
        # Attribute names are kept verbatim — they are part of the
        # checkpoint's state_dict keys.
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        # Shortcut path: project the input when shapes differ, else identity.
        shortcut = x if self.downsample is None else self.downsample(x)
        # Main path: conv → BN → ReLU → conv → BN.
        branch = self.relu(self.bn1(self.conv1(x)))
        branch = self.bn2(self.conv2(branch))
        # Residual addition followed by the output activation.
        return self.relu(branch + shortcut)
class CRNN(nn.Module):
    """
    Convolutional Recurrent Neural Network with ResNet-style CNN
    Architecture: ResNet CNN + Bidirectional LSTM + CTC Loss

    Input:  (N, 1, 80, 280) grayscale images.
    Output: (70, N, num_classes) log-probabilities in the (T, N, C) layout
            expected by torch's CTC loss (width 70 becomes the time axis).
    """
    def __init__(
        self,
        img_height=80,
        img_width=280,
        num_classes=63,  # 62 alphanumeric + 1 blank
        hidden_size=384,
        num_lstm_layers=2,
        dropout=0.4
    ):
        # NOTE(review): the `dropout` argument is accepted but never used —
        # the Dropout2d rate (0.2) and LSTM dropout (0.3) are hard-coded
        # below. Left untouched to keep checkpoint/notebook behaviour;
        # confirm against the training notebook before wiring it up.
        super(CRNN, self).__init__()
        self.img_height = img_height
        self.img_width = img_width
        self.num_classes = num_classes
        self.hidden_size = hidden_size
        # Initial conv: (1, 80, 280) → (64, 80, 280)
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True)
        )
        # Pool1: (64, 80, 280) → (64, 40, 140)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        # ResBlock layer1: (64, 40, 140) → (128, 40, 140)
        self.layer1 = self._make_layer(64, 128, blocks=2)
        # Pool2: (128, 40, 140) → (128, 20, 70)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        # ResBlock layer2: (128, 20, 70) → (256, 20, 70)
        self.layer2 = self._make_layer(128, 256, blocks=2)
        # Pool3: (256, 20, 70) → (256, 10, 70)
        # Height-only pooling keeps width=70 so the sequence stays long
        # enough for CTC alignment.
        self.pool3 = nn.MaxPool2d(kernel_size=(2, 1))  # Only height
        # ResBlock layer3: (256, 10, 70) → (512, 10, 70)
        self.layer3 = self._make_layer(256, 512, blocks=2)
        # Pool4: (512, 10, 70) → (512, 5, 70)
        self.pool4 = nn.MaxPool2d(kernel_size=(2, 1))  # Only height
        # Optional dropout
        self.dropout = nn.Dropout2d(0.2)
        # Calculate RNN input size
        # After all conv layers: (512 channels, 5 height, 70 width)
        self.map_to_seq_height = 5
        self.map_to_seq_channels = 512
        self.rnn_input_size = self.map_to_seq_height * self.map_to_seq_channels
        # Recurrent Layers (Bidirectional LSTM)
        self.rnn = nn.LSTM(
            input_size=self.rnn_input_size,
            hidden_size=hidden_size,
            num_layers=num_lstm_layers,
            bidirectional=True,
            dropout=0.3 if num_lstm_layers > 1 else 0,
            batch_first=False  # (T, N, C) format for CTC
        )
        # Fully Connected Layer
        # hidden_size * 2 because the LSTM is bidirectional.
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def _make_layer(self, in_channels, out_channels, blocks):
        """Create a layer with multiple residual blocks.

        The first block carries a 1x1-conv projection when the channel
        count changes; the remaining blocks use a plain identity shortcut.
        """
        downsample = None
        if in_channels != out_channels:
            downsample = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=False),
                nn.BatchNorm2d(out_channels)
            )
        layers = []
        layers.append(ResidualBlock(in_channels, out_channels, stride=1, downsample=downsample))
        for _ in range(1, blocks):
            layers.append(ResidualBlock(out_channels, out_channels))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Forward pass: (N, 1, 80, 280) → (70, N, num_classes) log-probs."""
        # CNN Feature Extraction
        x = self.conv1(x)   # (N, 64, 80, 280)
        x = self.pool1(x)   # (N, 64, 40, 140)
        x = self.layer1(x)  # (N, 128, 40, 140)
        x = self.pool2(x)   # (N, 128, 20, 70)
        x = self.layer2(x)  # (N, 256, 20, 70)
        x = self.pool3(x)   # (N, 256, 10, 70)
        x = self.layer3(x)  # (N, 512, 10, 70)
        x = self.pool4(x)   # (N, 512, 5, 70)
        conv_out = self.dropout(x)  # (N, 512, 5, 70)
        batch_size, channels, height, width = conv_out.size()
        # Map to Sequence: each of the 70 width positions becomes one
        # timestep whose features are the flattened (channels x height) column.
        conv_out = conv_out.permute(0, 3, 1, 2)  # (N, 70, 512, 5)
        conv_out = conv_out.reshape(batch_size, width, channels * height)  # (N, 70, 2560)
        # Prepare for LSTM (time-major layout for CTC)
        rnn_input = conv_out.permute(1, 0, 2)  # (70, N, 2560)
        # Bidirectional LSTM
        rnn_output, _ = self.rnn(rnn_input)  # (70, N, 768)
        # Fully Connected Layer applied per (timestep, batch) pair
        T, N, hidden = rnn_output.size()
        rnn_output = rnn_output.reshape(T * N, hidden)  # (70*N, 768)
        output = self.fc(rnn_output)  # (70*N, 63)
        output = output.reshape(T, N, self.num_classes)  # (70, N, 63)
        # Log Softmax for CTC Loss
        log_probs = F.log_softmax(output, dim=2)  # (70, N, 63)
        return log_probs
# ==========================================
# 2. Preprocessing Functions
# ==========================================
def resize_and_pad(img, target_size=(80, 280)):
    """Scale ``img`` to fit inside ``target_size`` (h, w) while preserving
    aspect ratio, then centre it on a white canvas of exactly that size.
    """
    target_h, target_w = target_size
    src_h, src_w = img.shape[:2]
    # Uniform scale factor so the image fits entirely inside the target box.
    ratio = min(target_w / src_w, target_h / src_h)
    new_w = int(src_w * ratio)
    new_h = int(src_h * ratio)
    scaled = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_NEAREST)
    # White (255) background matches the captcha paper colour.
    canvas = np.full((target_h, target_w), 255, dtype=img.dtype)
    left = (target_w - new_w) // 2
    top = (target_h - new_h) // 2
    canvas[top:top + new_h, left:left + new_w] = scaled
    return canvas
def remove_black_lines(img):
    """Erase dark strike-through lines from a BGR captcha image.

    Dark pixels (low HSV value, any hue/saturation) are masked and then
    filled in from their surroundings with Telea inpainting.
    """
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    # Anything with value <= 80 counts as "black" regardless of hue.
    dark_mask = cv2.inRange(hsv,
                            np.array([0, 0, 0]),
                            np.array([180, 255, 80]))
    # Reconstruct the masked pixels from neighbouring content.
    return cv2.inpaint(img, dark_mask, inpaintRadius=1, flags=cv2.INPAINT_TELEA)
def preprocess_image(image):
    """Convert an uploaded image into the model's input tensor.

    Steps: force 3-channel RGB, inpaint away dark noise lines, grayscale,
    aspect-preserving resize + pad to 80x280, scale to [0, 1].

    Args:
        image: PIL.Image from the Gradio ``Image`` input (any mode), or a
            numpy array.

    Returns:
        torch.FloatTensor of shape (1, 1, 80, 280).
    """
    # Robustness fix: uploads may arrive as grayscale ('L'), palette ('P')
    # or 'RGBA' images; remove_black_lines requires a 3-channel BGR array,
    # so normalize the PIL image to RGB before converting to numpy.
    if hasattr(image, "convert"):
        image = image.convert("RGB")
    img = np.array(image)
    if img.ndim == 2:
        # Raw 2-D array input: expand to 3 channels for the HSV/inpaint step.
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    else:
        # numpy arrays from PIL are RGB; OpenCV expects BGR.
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    # Remove noise lines
    img = remove_black_lines(img)
    # Convert to grayscale (model input is single-channel)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Resize and pad to the training geometry
    img = resize_and_pad(img, target_size=(80, 280))
    # Normalize pixel values to [0, 1]
    img = img.astype('float32') / 255.0
    return torch.tensor(img).unsqueeze(0).unsqueeze(0)  # (1, 1, H, W)
# ==========================================
# 3. Load Model & Character Mapping
# ==========================================
# Character set: digits, uppercase, lowercase — 62 symbols total.
CHARS = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
# Index 0 is reserved for the CTC blank token, so characters map to 1..62.
char_to_idx = {c: i + 1 for i, c in enumerate(CHARS)}
idx_to_char = {i + 1: c for i, c in enumerate(CHARS)}
idx_to_char[0] = ""  # blank token decodes to the empty string
num_classes = len(CHARS) + 1  # 62 characters + 1 CTC blank = 63

# Load model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CRNN(
    img_height=80,
    img_width=280,
    num_classes=63,
    hidden_size=384,  # IMPORTANT: Must match training
    num_lstm_layers=2
).to(device)

# Load checkpoint
# NOTE(review): torch.load unpickles arbitrary objects — only load trusted
# checkpoint files.
checkpoint = torch.load('best_model.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
# eval() disables dropout and switches BatchNorm to running statistics.
model.eval()
print(f"Model loaded successfully! Using device: {device}")
# ==========================================
# 4. Prediction Functions
# ==========================================
def ctc_decode_with_confidence(log_probs, idx_to_char):
    """Greedy CTC decode with an average per-character confidence.

    Args:
        log_probs: Log probabilities from the model, shape (T, 1, C).
        idx_to_char: Mapping from class index to character (0 = blank).

    Returns:
        tuple: (decoded text, mean probability of the emitted timesteps,
        in [0, 1]; 0.0 when nothing is emitted).
    """
    # Back to probabilities; drop the batch dimension -> (T, C).
    probs = torch.exp(log_probs).squeeze(1)
    # Greedy path: best class (and its probability) at every timestep.
    max_probs, max_indices = torch.max(probs, dim=1)
    max_probs = max_probs.cpu().numpy()
    max_indices = max_indices.cpu().numpy()

    # CTC collapse: drop blanks and merge consecutive repeats. `prev` is
    # updated on EVERY step (including blanks) so that a blank between two
    # identical characters correctly yields a doubled letter (e.g. "oo").
    collapsed_tokens = []
    collapsed_probs = []
    prev = None
    for token, prob in zip(max_indices, max_probs):
        if token != 0 and token != prev:  # not blank and not a repeat
            collapsed_tokens.append(token)
            collapsed_probs.append(prob)
        prev = token

    prediction = ''.join(idx_to_char.get(t, '') for t in collapsed_tokens)
    confidence = float(np.mean(collapsed_probs)) if collapsed_probs else 0.0
    return prediction, confidence
def ctc_decode_top_k(log_probs, idx_to_char, k=3):
    """Return up to ``k`` candidate decodings via a small beam search.

    Args:
        log_probs: Log probabilities from the model, shape (T, 1, C).
        idx_to_char: Mapping from class index to character (0 = blank).
        k: Beam width and maximum number of alternatives returned.

    Returns:
        List of (text, confidence) tuples, best first. Confidence is the
        beam probability normalized by text length (geometric mean).
    """
    probs = torch.exp(log_probs).squeeze(1).cpu()  # (T, C)
    T, C = probs.shape

    # Each hypothesis tracks its text, running probability, and the last
    # emitted class ('last' is None right after a blank, so the same
    # character may be emitted again).
    beams = [{'text': '', 'prob': 1.0, 'last': None}]
    for t in range(T):
        candidates = []
        for beam in beams:
            # Only the strongest classes at this timestep are expanded.
            step_probs, step_indices = torch.topk(probs[t], k=min(k * 2, C))
            for p, i in zip(step_probs, step_indices):
                i = i.item()
                p = p.item()
                if i == 0:
                    # Blank: text unchanged, repeats become legal again.
                    candidates.append({'text': beam['text'],
                                       'prob': beam['prob'] * p,
                                       'last': None})
                elif i != beam['last']:
                    # Emission of a new character.
                    candidates.append({'text': beam['text'] + idx_to_char.get(i, ''),
                                       'prob': beam['prob'] * p,
                                       'last': i})
                else:
                    # Same class again: CTC merges it into the previous emission.
                    candidates.append({'text': beam['text'],
                                       'prob': beam['prob'] * p,
                                       'last': beam['last']})
        # Prune to the k most probable hypotheses.
        beams = sorted(candidates, key=lambda b: b['prob'], reverse=True)[:k]

    # Deduplicate by text, keeping the highest-probability variant first.
    seen = set()
    results = []
    for beam in beams:
        if beam['text'] in seen:
            continue
        seen.add(beam['text'])
        # Length-normalize so short and long strings are comparable.
        score = beam['prob'] ** (1.0 / max(len(beam['text']), 1))
        results.append((beam['text'], float(score)))
        if len(results) >= k:
            break
    return results
def predict_captcha(image):
    """Predict CAPTCHA text from image with confidence score and alternatives.

    Runs the model on the preprocessed image, then builds a markdown report:
    the greedy prediction, a confidence band, and (when the result is
    uncertain) the top-3 beam-search alternatives.
    """
    # Preprocess
    img_tensor = preprocess_image(image).to(device)
    # Inference
    with torch.no_grad():
        log_probs = model(img_tensor)
    # Get primary prediction with confidence
    prediction, confidence = ctc_decode_with_confidence(log_probs, idx_to_char)
    confidence_pct = confidence * 100
    # Get top-k predictions to check uncertainty
    top_predictions = ctc_decode_top_k(log_probs, idx_to_char, k=3)
    # Check if alternatives are close (uncertainty margin)
    show_alternatives = False
    if len(top_predictions) >= 2:
        top1_conf = top_predictions[0][1]
        top2_conf = top_predictions[1][1]
        margin = top1_conf - top2_conf
        # Show alternatives if:
        # 1. Low confidence (< 70%), OR
        # 2. Top 2 predictions are very close (margin < 0.1)
        if confidence < 0.70 or margin < 0.1:
            show_alternatives = True
    # Format output
    output = f"**Primary Prediction:** {prediction}\n\n"
    # Add status and alternatives based on confidence and margin
    if show_alternatives:
        if confidence < 0.6:
            status = "⚠️ Low Confidence"
        elif confidence < 0.70:
            status = "⚡ Medium Confidence"
        else:
            status = "⚠️ Uncertain"  # High confidence but close alternatives
        note = "Visual ambiguity detected (e.g., 0/o, i/1/l confusion)"
        output += f"{status} — {confidence_pct:.1f}%\n"
        output += f"{note}\n\n"
        output += "**Alternative Predictions:**\n"
        for i, (text, conf) in enumerate(top_predictions, 1):
            conf_pct = conf * 100
            marker = "→" if i == 1 else " "
            output += f"{marker} {i}. `{text}` — {conf_pct:.1f}%\n"
        output += "\n💡 *Tip: Check which makes sense in context*"
    elif confidence < 0.75:
        status = "⚡ Medium Confidence"
        note = "Result is reasonably reliable"
        output += f"{status} — {confidence_pct:.1f}%\n"
        output += f"{note}"
    else:
        status = "✓ High Confidence"
        note = "Result is highly reliable"
        output += f"{status} — {confidence_pct:.1f}%\n"
        output += f"{note}"
    return output
# ==========================================
# 5. Gradio Interface
# ==========================================
# Gradio UI: single image input → markdown-formatted prediction report.
demo = gr.Interface(
    fn=predict_captcha,
    inputs=gr.Image(type="pil", label="Upload CAPTCHA Image"),
    outputs=gr.Textbox(label="Prediction Results", lines=10, scale=2),
    title="CAPTCHA Recognition System",
    description="""
    **CS4243 Mini Project - CAPTCHA Recognition using CRNN + CTC Loss**
    Upload a CAPTCHA image to see the model's prediction with confidence score.
    **Model Architecture:**
    - ResNet-based CNN feature extraction (4 layers, 2 blocks each)
    - Bidirectional LSTM (hidden_size=384, 2 layers)
    - CTC Loss for alignment-free training
    **Performance:**
    - Sequence Accuracy: 55.6%
    - Character Accuracy: 85.82%
    - Trained on 7,777 samples with heavy augmentation
    **Features:**
    - **Confidence scoring**: Shows prediction reliability
    - **Multiple predictions**: Shows top 3 alternatives when confidence < 60%
    - **Smart warnings**: Alerts when visual ambiguity exists (0/o, i/1/l confusion)
    - **Real-time inference**: Results in <1 second
    **Training Details:**
    - 14 iterations of systematic experimentation
    - Data augmentation: rotation, shear, black lines, noise
    - Regularization: dropout, weight decay, early stopping
    """,
    examples=[
        # Add example image paths here if you want
        # ["example1.png"],
        # ["example2.png"],
    ],
    theme=gr.themes.Soft(),
    # NOTE(review): `allow_flagging` is deprecated in Gradio 4.x in favour of
    # `flagging_mode` — confirm the installed Gradio version before upgrading.
    allow_flagging="never"
)

if __name__ == "__main__":
    demo.launch(share=True)  # Enable share button for 72-hour public links