Spaces:

TechRaj
/

cs4243-miniproject-captcha-recognition

Sleeping

File size: 16,380 Bytes

import gradio as gr
import torch
import torch.nn as nn
import torch.nn.functional as F
import cv2
import numpy as np
from PIL import Image

# ==========================================
# 1. Model Architecture (Match notebook exactly)
# ==========================================

class ResidualBlock(nn.Module):
    """
    Two 3x3 conv + batch-norm stages wrapped by a skip connection.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by both convolutions.
        stride: Stride of the first convolution (the second always uses 1).
        downsample: Optional module applied to the input so that it can be
            added to the main branch; when None the input is added as-is.
    """
    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super().__init__()
        # Main branch, stage 1 (carries the stride)
        self.conv1 = nn.Conv2d(
            in_channels, out_channels,
            kernel_size=3, stride=stride, padding=1, bias=False,
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        # One in-place ReLU module is shared by both activation points
        self.relu = nn.ReLU(inplace=True)
        # Main branch, stage 2 (resolution preserved)
        self.conv2 = nn.Conv2d(
            out_channels, out_channels,
            kernel_size=3, stride=1, padding=1, bias=False,
        )
        self.bn2 = nn.BatchNorm2d(out_channels)
        # Projection for the shortcut path (may be None)
        self.downsample = downsample

    def forward(self, x):
        """Return relu(main_branch(x) + shortcut(x))."""
        # Shortcut path: project the input only when a projection was given
        shortcut = x if self.downsample is None else self.downsample(x)

        # Main path: conv -> bn -> relu -> conv -> bn
        branch = self.relu(self.bn1(self.conv1(x)))
        branch = self.bn2(self.conv2(branch))

        # Merge the two paths in place, then apply the final activation
        branch += shortcut
        return self.relu(branch)

class CRNN(nn.Module):
    """
    Convolutional Recurrent Neural Network with ResNet-style CNN.

    Pipeline: conv stem + three residual stages (max-pooling in between)
    -> one feature vector per image column -> bidirectional LSTM -> linear
    layer -> per-timestep log-softmax (the input format of CTC loss/decoding).

    Args:
        img_height: Height of the input images. The four pooling layers each
            halve the height, so it must be at least 16; the LSTM input size
            is derived from it.
        img_width: Width of the input images. Stored for reference only; the
            sequence length comes from the actual input width (width // 4).
        num_classes: Size of the output alphabet including the CTC blank.
        hidden_size: LSTM hidden size per direction.
        num_lstm_layers: Number of stacked LSTM layers.
        dropout: Accepted for interface compatibility but not used by any
            layer below; the dropout rates are fixed (0.2 for the feature-map
            Dropout2d, 0.3 between LSTM layers).

    Raises:
        ValueError: If ``img_height`` is smaller than 16.
    """

    # The four pooling layers halve the height four times (2 ** 4)
    _HEIGHT_REDUCTION = 16

    def __init__(
        self,
        img_height=80,
        img_width=280,
        num_classes=63,  # 62 alphanumeric + 1 blank
        hidden_size=384,
        num_lstm_layers=2,
        dropout=0.4
    ):
        super(CRNN, self).__init__()

        if img_height < self._HEIGHT_REDUCTION:
            raise ValueError(
                f"img_height must be at least {self._HEIGHT_REDUCTION}, got {img_height}"
            )

        self.img_height = img_height
        self.img_width = img_width
        self.num_classes = num_classes
        self.hidden_size = hidden_size

        # Shape comments below use the default 80x280 input.
        # Initial conv: (1, 80, 280) → (64, 80, 280)
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True)
        )

        # Pool1: (64, 80, 280) → (64, 40, 140)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # ResBlock layer1: (64, 40, 140) → (128, 40, 140)
        self.layer1 = self._make_layer(64, 128, blocks=2)

        # Pool2: (128, 40, 140) → (128, 20, 70)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # ResBlock layer2: (128, 20, 70) → (256, 20, 70)
        self.layer2 = self._make_layer(128, 256, blocks=2)

        # Pool3: (256, 20, 70) → (256, 10, 70)
        self.pool3 = nn.MaxPool2d(kernel_size=(2, 1))  # Only height

        # ResBlock layer3: (256, 10, 70) → (512, 10, 70)
        self.layer3 = self._make_layer(256, 512, blocks=2)

        # Pool4: (512, 10, 70) → (512, 5, 70)
        self.pool4 = nn.MaxPool2d(kernel_size=(2, 1))  # Only height

        # Optional dropout
        self.dropout = nn.Dropout2d(0.2)

        # RNN input size = channels * remaining height after the CNN.
        # Each pooling floors the halved height, so four of them give
        # img_height // 16 (5 for the default height of 80).
        self.map_to_seq_height = img_height // self._HEIGHT_REDUCTION
        self.map_to_seq_channels = 512
        self.rnn_input_size = self.map_to_seq_height * self.map_to_seq_channels

        # Recurrent Layers (Bidirectional LSTM)
        self.rnn = nn.LSTM(
            input_size=self.rnn_input_size,
            hidden_size=hidden_size,
            num_layers=num_lstm_layers,
            bidirectional=True,
            dropout=0.3 if num_lstm_layers > 1 else 0,
            batch_first=False  # (T, N, C) format for CTC
        )

        # Fully Connected Layer (forward + backward hidden states → classes)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def _make_layer(self, in_channels, out_channels, blocks):
        """
        Create a stage of ``blocks`` residual blocks.

        The first block changes the channel count; when the counts differ its
        shortcut gets a 1x1 conv + batch-norm projection. The remaining blocks
        keep ``out_channels`` and use an identity shortcut.
        """
        downsample = None
        if in_channels != out_channels:
            downsample = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=False),
                nn.BatchNorm2d(out_channels)
            )

        layers = []
        layers.append(ResidualBlock(in_channels, out_channels, stride=1, downsample=downsample))
        for _ in range(1, blocks):
            layers.append(ResidualBlock(out_channels, out_channels))

        return nn.Sequential(*layers)

    def forward(self, x):
        """
        Forward pass.

        Args:
            x: Image batch of shape (N, 1, H, W); H must equal ``img_height``
                so the flattened column features match the LSTM input size.

        Returns:
            Log-probabilities of shape (T, N, num_classes) with T = W // 4.
        """
        # CNN Feature Extraction (shapes for the default 80x280 input)
        x = self.conv1(x)      # (N, 64, 80, 280)
        x = self.pool1(x)      # (N, 64, 40, 140)

        x = self.layer1(x)     # (N, 128, 40, 140)
        x = self.pool2(x)      # (N, 128, 20, 70)

        x = self.layer2(x)     # (N, 256, 20, 70)
        x = self.pool3(x)      # (N, 256, 10, 70)

        x = self.layer3(x)     # (N, 512, 10, 70)
        x = self.pool4(x)      # (N, 512, 5, 70)

        conv_out = self.dropout(x)  # (N, 512, 5, 70)

        batch_size, channels, height, width = conv_out.size()

        # Map to Sequence: every image column becomes one timestep
        conv_out = conv_out.permute(0, 3, 1, 2)  # (N, 70, 512, 5)
        conv_out = conv_out.reshape(batch_size, width, channels * height)  # (N, 70, 2560)

        # Prepare for LSTM (time-major layout)
        rnn_input = conv_out.permute(1, 0, 2)  # (70, N, 2560)

        # Bidirectional LSTM
        rnn_output, _ = self.rnn(rnn_input)  # (70, N, 768)

        # Fully Connected Layer applied to every (timestep, sample) pair
        T, N, hidden = rnn_output.size()
        rnn_output = rnn_output.reshape(T * N, hidden)  # (70*N, 768)
        output = self.fc(rnn_output)  # (70*N, 63)
        output = output.reshape(T, N, self.num_classes)  # (70, N, 63)

        # Log Softmax for CTC Loss
        log_probs = F.log_softmax(output, dim=2)  # (70, N, 63)

        return log_probs

# ==========================================
# 2. Preprocessing Functions
# ==========================================

def resize_and_pad(img, target_size=(80, 280)):
    """
    Resize an image to fit inside ``target_size`` and centre it on a canvas.

    The aspect ratio is preserved (nearest-neighbour interpolation) and the
    remaining canvas area is filled with the value 255.

    Args:
        img: Image array of shape (H, W) or (H, W, C).
        target_size: ``(height, width)`` of the returned canvas.

    Returns:
        Array with spatial size ``target_size`` and the dtype of ``img``.
    """
    target_h, target_w = target_size
    h, w = img.shape[:2]

    scale = min(target_w / w, target_h / h)
    # Clamp to 1 px: for extreme aspect ratios int() truncates to 0, which
    # cv2.resize rejects.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))
    resized = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_NEAREST)

    # Canvas follows the channel layout of the resized image, so colour
    # inputs work as well as grayscale ones.
    canvas_shape = (target_h, target_w) + resized.shape[2:]
    padded = np.full(canvas_shape, 255, dtype=img.dtype)

    x_offset = (target_w - new_w) // 2
    y_offset = (target_h - new_h) // 2
    padded[y_offset:y_offset+new_h, x_offset:x_offset+new_w] = resized

    return padded

def remove_black_lines(img):
    """
    Inpaint dark pixels of a BGR image.

    Pixels whose HSV value channel is at most 80 (any hue, any saturation)
    are masked and filled in from their surroundings with Telea inpainting
    (radius 1).

    Args:
        img: 3-channel BGR image.

    Returns:
        The inpainted image.
    """
    hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    # Full hue/saturation range, value limited to the dark end
    dark_mask = cv2.inRange(
        hsv_img,
        np.array([0, 0, 0]),
        np.array([180, 255, 80]),
    )
    return cv2.inpaint(img, dark_mask, inpaintRadius=1, flags=cv2.INPAINT_TELEA)

def preprocess_image(image):
    """
    Preprocess image for model inference.

    Steps: normalise the input to 3-channel BGR, inpaint dark noise lines,
    convert to grayscale, resize/pad to 80x280 and scale to [0, 1].

    Args:
        image: PIL image (any mode) or array of shape (H, W), (H, W, 1),
            (H, W, 3) in RGB order, or (H, W, 4) in RGBA order.

    Returns:
        float32 tensor of shape (1, 1, 80, 280).

    Raises:
        ValueError: If ``image`` is None or has an unsupported shape.
    """
    if image is None:
        raise ValueError("No image provided")

    # PIL inputs: collapse every mode (L, P, RGBA, ...) to plain RGB so the
    # colour-space conversions below always see 3 channels.
    if hasattr(image, "convert"):
        image = image.convert("RGB")

    # Convert PIL to OpenCV format
    img = np.array(image)

    # Array inputs may still be grayscale or carry an alpha channel
    if img.ndim == 3 and img.shape[2] == 1:
        img = img[:, :, 0]

    if img.ndim == 2:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    elif img.ndim == 3 and img.shape[2] == 4:
        img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
    elif img.ndim == 3 and img.shape[2] == 3:
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    else:
        raise ValueError(f"Unsupported image shape: {img.shape}")

    # Remove noise lines
    img = remove_black_lines(img)

    # Convert to grayscale
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Resize and pad
    img = resize_and_pad(img, target_size=(80, 280))

    # Normalize to [0, 1] and add batch and channel dimensions
    img = img.astype('float32') / 255.0
    img = torch.tensor(img).unsqueeze(0).unsqueeze(0)  # (1, 1, H, W)

    return img

# ==========================================
# 3. Load Model & Character Mapping
# ==========================================

CHARS = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
# Index 0 is reserved for the CTC blank, so characters start at 1
char_to_idx = {c: i for i, c in enumerate(CHARS, start=1)}
idx_to_char = {i: c for c, i in char_to_idx.items()}
idx_to_char[0] = ""  # blank token

num_classes = len(CHARS) + 1

# Load model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CRNN(
    img_height=80,
    img_width=280,
    num_classes=num_classes,  # derived from CHARS (62 characters + blank = 63)
    hidden_size=384,  # IMPORTANT: Must match training
    num_lstm_layers=2
).to(device)

# Load checkpoint
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code; only load checkpoints from a trusted source (consider
# weights_only=True if the checkpoint contents and torch version allow it).
checkpoint = torch.load('best_model.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()  # inference mode: disables dropout, uses BatchNorm running stats

print(f"Model loaded successfully! Using device: {device}")

# ==========================================
# 4. Prediction Functions
# ==========================================

def ctc_decode_with_confidence(log_probs, idx_to_char):
    """
    Greedy CTC decoding with a confidence score.

    The most likely token is taken at every timestep; blanks (index 0) and
    immediate repeats are dropped. The confidence is the mean probability of
    the frames at which each kept token first appears.

    Args:
        log_probs: Log probabilities from model (T, 1, C)
        idx_to_char: Character mapping dictionary

    Returns:
        prediction: Decoded text string
        confidence: Average probability score (0-1), 0.0 if nothing is decoded
    """
    # Back to plain probabilities, dropping the batch axis -> (T, C)
    frame_probs = torch.exp(log_probs).squeeze(1)

    # Best token and its probability per timestep
    best_probs, best_tokens = torch.max(frame_probs, dim=1)
    best_probs = best_probs.cpu().numpy()
    best_tokens = best_tokens.cpu().numpy()

    # Token of the preceding frame; -1 stands in before the first frame
    # (it never equals a real token index).
    previous = np.concatenate(([-1], best_tokens[:-1]))

    # CTC collapse: keep frames that are neither blank nor a repeat
    keep = (best_tokens != 0) & (best_tokens != previous)

    prediction = ''.join(idx_to_char.get(token, '') for token in best_tokens[keep])
    confidence = float(np.mean(best_probs[keep])) if keep.any() else 0.0

    return prediction, confidence


def _add_beam_mass(table, prefix, blank_mass, char_mass):
    """Add probability mass to ``prefix`` in ``table`` (skips all-zero updates)."""
    if blank_mass == 0.0 and char_mass == 0.0:
        return
    prev_blank, prev_char = table.get(prefix, (0.0, 0.0))
    table[prefix] = (prev_blank + blank_mass, prev_char + char_mass)


def ctc_decode_top_k(log_probs, idx_to_char, k=3):
    """
    Decode CTC output with top-k alternative predictions using prefix beam search.

    Every beam is a distinct token prefix. Alignments that collapse to the
    same prefix are merged and their probabilities summed, so the k beams are
    k different candidate texts rather than k alignments of the same text.

    Args:
        log_probs: Log probabilities from model (T, 1, C)
        idx_to_char: Character mapping dictionary
        k: Number of top predictions to return

    Returns:
        List of up to k unique (prediction, confidence) tuples, ordered by
        descending total probability. The confidence is that probability
        raised to 1 / len(prediction), so it is not guaranteed to be
        monotonically decreasing. Empty list if k <= 0.
    """
    if k <= 0:
        return []

    probs = torch.exp(log_probs).squeeze(1).cpu()  # (T, C)
    T, C = probs.shape
    num_candidates = min(k * 2, C)

    # prefix (tuple of token indices) -> (mass of alignments ending in blank,
    #                                     mass of alignments ending in the last token)
    beams = {(): (1.0, 0.0)}

    for frame in probs:
        # Only the most likely tokens of this frame are expanded; computed
        # once per frame instead of once per beam.
        cand_probs, cand_indices = torch.topk(frame, k=num_candidates)
        candidates = list(zip(cand_indices.tolist(), cand_probs.tolist()))

        expanded = {}
        for prefix, (p_blank, p_char) in beams.items():
            p_total = p_blank + p_char
            for idx, prob in candidates:
                if idx == 0:
                    # Blank: text unchanged, alignment now ends in blank
                    _add_beam_mass(expanded, prefix, p_total * prob, 0.0)
                elif prefix and idx == prefix[-1]:
                    # Same token as the last one: it extends the current
                    # character unless a blank separated them, in which case
                    # it starts a new (doubled) character.
                    _add_beam_mass(expanded, prefix, 0.0, p_char * prob)
                    _add_beam_mass(expanded, prefix + (idx,), 0.0, p_blank * prob)
                else:
                    # Different token: always a new character
                    _add_beam_mass(expanded, prefix + (idx,), 0.0, p_total * prob)

        # Keep top k prefixes by total probability
        ranked = sorted(
            expanded.items(),
            key=lambda item: item[1][0] + item[1][1],
            reverse=True,
        )
        beams = dict(ranked[:k])

    # Convert prefixes to text; distinct prefixes can only map to the same
    # text when idx_to_char lacks an index, so duplicates are still filtered.
    seen = set()
    results = []
    for prefix, (p_blank, p_char) in beams.items():
        text = ''.join(idx_to_char.get(idx, '') for idx in prefix)
        if text in seen:
            continue
        seen.add(text)
        # Guard against float round-off pushing the summed mass above 1
        prob = min(p_blank + p_char, 1.0)
        # Normalize probability by sequence length
        confidence = prob ** (1.0 / max(len(text), 1))
        results.append((text, float(confidence)))
        if len(results) >= k:
            break

    return results


def _format_prediction_report(prediction, confidence, top_predictions):
    """Build the markdown-style report shown to the user for one prediction."""
    confidence_pct = confidence * 100

    # Show alternatives if:
    # 1. Low confidence (< 70%), OR
    # 2. Top 2 predictions are very close (margin < 0.1)
    # The confidence rule applies even when only one candidate exists, so a
    # weak prediction is never reported as "reasonably reliable".
    close_alternatives = False
    if len(top_predictions) >= 2:
        margin = top_predictions[0][1] - top_predictions[1][1]
        close_alternatives = margin < 0.1
    show_alternatives = confidence < 0.70 or close_alternatives

    output = f"**Primary Prediction:** {prediction}\n\n"

    if show_alternatives:
        if confidence < 0.6:
            status = "⚠️ Low Confidence"
        elif confidence < 0.70:
            status = "⚡ Medium Confidence"
        else:
            status = "⚠️ Uncertain"  # High confidence but close alternatives

        note = "Visual ambiguity detected (e.g., 0/o, i/1/l confusion)"

        output += f"{status} — {confidence_pct:.1f}%\n"
        output += f"{note}\n\n"
        output += "**Alternative Predictions:**\n"

        for i, (text, conf) in enumerate(top_predictions, 1):
            conf_pct = conf * 100
            marker = "→" if i == 1 else " "
            output += f"{marker} {i}. `{text}` — {conf_pct:.1f}%\n"

        output += "\n💡 *Tip: Check which makes sense in context*"
        return output

    if confidence < 0.75:
        status = "⚡ Medium Confidence"
        note = "Result is reasonably reliable"
    else:
        status = "✓ High Confidence"
        note = "Result is highly reliable"

    output += f"{status} — {confidence_pct:.1f}%\n"
    output += f"{note}"
    return output


def predict_captcha(image):
    """
    Predict CAPTCHA text from image with confidence score and alternatives.

    Args:
        image: Image to recognise (as delivered by the Gradio image input),
            or None when nothing was uploaded.

    Returns:
        Formatted text with the primary (greedy) prediction, a confidence
        status line and, when the result is uncertain, the beam-search
        alternatives.
    """
    # Nothing uploaded: report it instead of failing inside preprocessing
    if image is None:
        return "⚠️ Please upload a CAPTCHA image first."

    # Preprocess
    img_tensor = preprocess_image(image).to(device)

    # Inference (no gradients needed)
    with torch.no_grad():
        log_probs = model(img_tensor)

    # Primary prediction with confidence (greedy decoding)
    prediction, confidence = ctc_decode_with_confidence(log_probs, idx_to_char)

    # Top-k predictions to check uncertainty
    top_predictions = ctc_decode_top_k(log_probs, idx_to_char, k=3)

    return _format_prediction_report(prediction, confidence, top_predictions)

# ==========================================
# 5. Gradio Interface
# ==========================================

# Single-function UI: one image in, one formatted text report out.
demo = gr.Interface(
    fn=predict_captcha,
    inputs=gr.Image(type="pil", label="Upload CAPTCHA Image"),
    outputs=gr.Textbox(label="Prediction Results", lines=10, scale=2),
    title="CAPTCHA Recognition System",
    # The "Multiple predictions" line mirrors the thresholds used in
    # predict_captcha (confidence < 70% or a top-2 margin below 0.1).
    description="""
    **CS4243 Mini Project - CAPTCHA Recognition using CRNN + CTC Loss**
    
    Upload a CAPTCHA image to see the model's prediction with confidence score.
    
    **Model Architecture:**
    - ResNet-based CNN feature extraction (4 layers, 2 blocks each)
    - Bidirectional LSTM (hidden_size=384, 2 layers)
    - CTC Loss for alignment-free training
    
    **Performance:**
    - Sequence Accuracy: 55.6%
    - Character Accuracy: 85.82%
    - Trained on 7,777 samples with heavy augmentation
    
    **Features:**
    - **Confidence scoring**: Shows prediction reliability
    - **Multiple predictions**: Shows top 3 alternatives when confidence < 70% or the top candidates are close
    - **Smart warnings**: Alerts when visual ambiguity exists (0/o, i/1/l confusion)
    - **Real-time inference**: Results in <1 second
    
    **Training Details:**
    - 14 iterations of systematic experimentation
    - Data augmentation: rotation, shear, black lines, noise
    - Regularization: dropout, weight decay, early stopping
    """,
    examples=[
        # Add example image paths here if you want
        # ["example1.png"],
        # ["example2.png"],
    ],
    theme=gr.themes.Soft(),
    allow_flagging="never"
)

if __name__ == "__main__":
    # NOTE(review): if this runs on a hosted platform that already serves the
    # app publicly, share=True may be unnecessary — confirm.
    demo.launch(share=True)  # Enable share button for 72-hour public links