Spaces:
Sleeping
Sleeping
Commit ·
3a3f6c6
1
Parent(s): fd44722
Add CAPTCHA breaker app
Browse files- app.py +273 -0
- models/captcha_model_v3.pth +3 -0
- requirements.txt +9 -0
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-314.pyc +0 -0
- src/__pycache__/model.cpython-314.pyc +0 -0
- src/model.py +294 -0
app.py
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Gradio app for testing CAPTCHA model.
|
| 3 |
+
Allows uploading CAPTCHA images and getting predictions with preprocessing.
|
| 4 |
+
"""
|
| 5 |
+
import gradio as gr
|
| 6 |
+
import torch
|
| 7 |
+
from torchvision import transforms
|
| 8 |
+
from PIL import Image
|
| 9 |
+
import string
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
import numpy as np
|
| 12 |
+
import cv2
|
| 13 |
+
|
| 14 |
+
from src.model import CTCCaptchaModel
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Setup
|
| 18 |
+
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 19 |
+
CHARACTERS = string.digits + string.ascii_lowercase + string.ascii_uppercase
|
| 20 |
+
MODEL_PATH = Path("models/captcha_model_v3.pth")
|
| 21 |
+
|
| 22 |
+
# Load model
|
| 23 |
+
model = CTCCaptchaModel(num_classes=len(CHARACTERS), use_attention=True)
|
| 24 |
+
|
| 25 |
+
# Load checkpoint
|
| 26 |
+
checkpoint = torch.load(MODEL_PATH, map_location=DEVICE)
|
| 27 |
+
if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
|
| 28 |
+
model.load_state_dict(checkpoint['model_state_dict'])
|
| 29 |
+
else:
|
| 30 |
+
model.load_state_dict(checkpoint)
|
| 31 |
+
|
| 32 |
+
model.to(DEVICE)
|
| 33 |
+
model.eval()
|
| 34 |
+
|
| 35 |
+
# Image preprocessing transforms
|
| 36 |
+
transform = transforms.Compose([
|
| 37 |
+
transforms.Resize((60, 160)),
|
| 38 |
+
transforms.ToTensor(),
|
| 39 |
+
transforms.Normalize(mean=[0.5], std=[0.5])
|
| 40 |
+
])
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def preprocess_image(image):
    """
    Binarize and denoise a CAPTCHA image.

    Args:
        image: PIL Image (any mode)

    Returns:
        Preprocessed single-channel PIL Image
    """
    # Grayscale first, then let Otsu pick the binarization threshold
    # automatically from the image histogram.
    gray = np.array(image.convert('L'))
    _, thresholded = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Morphological closing with a small elliptical kernel fills pinhole
    # noise without distorting character strokes much.
    ellipse = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    cleaned = cv2.morphologyEx(thresholded, cv2.MORPH_CLOSE, ellipse)

    # Hand back a PIL Image for the torchvision transform pipeline.
    return Image.fromarray(cleaned)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def predict_captcha(image, ground_truth=""):
    """
    Predict CAPTCHA text from image with preprocessing.

    Args:
        image: PIL Image or numpy array
        ground_truth: Optional ground truth text for comparison
            (case-sensitive exact match)

    Returns:
        Tuple of (markdown result string, preprocessed PIL image).
        On failure returns (error message, None).
    """
    try:
        # Gradio may deliver a numpy array; normalize to PIL.
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)

        # Resize to the model's expected 160x60 (PIL size is width, height).
        if image.size != (160, 60):
            image = image.resize((160, 60), Image.LANCZOS)

        # Grayscale + Otsu thresholding + morphological denoising.
        processed_image = preprocess_image(image)

        # To normalized tensor with a batch dimension, on the model's device.
        image_tensor = transform(processed_image).unsqueeze(0).to(DEVICE)

        # Greedy CTC decoding happens inside model.predict; take batch item 0.
        with torch.no_grad():
            pred_indices = model.predict(image_tensor)[0]

        # Map class indices back to characters. Indices >= len(CHARACTERS)
        # are CTC blank/padding and must be dropped.
        predicted_text = ''.join(
            CHARACTERS[idx.item()] for idx in pred_indices
            if idx.item() < len(CHARACTERS)
        )

        # Format output with styling
        result = "### 🎯 Prediction Result\n\n"
        result += f"# **{predicted_text}**\n\n"
        result += f"*Length: {len(predicted_text)} characters*\n\n"

        # Guard against None (e.g. a cleared/absent textbox) before .strip().
        if ground_truth and ground_truth.strip():
            is_correct = predicted_text == ground_truth  # case-sensitive
            result += f"**Expected:** {ground_truth}\n\n"
            if is_correct:
                result += "## ✅ **CORRECT!**"
            else:
                result += "## ❌ **INCORRECT**"

        return result, processed_image

    except Exception as e:
        return f"❌ **Error:** {str(e)}", None
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def extract_from_filename(filename):
    """
    Extract ground-truth text from a CAPTCHA filename (format: TEXT_INDEX.png).

    Args:
        filename: A path string, a pathlib.Path, or a file-like/upload object
            exposing the path via a ``name`` attribute (as Gradio uploads do).

    Returns:
        The text portion before the first underscore of the file stem,
        or "" when no filename is available.
    """
    if not filename:
        return ""
    # Upload objects expose the path via .name; strings are used directly.
    # (Previously plain string paths silently returned "".)
    path = filename.name if hasattr(filename, 'name') else filename
    return Path(path).stem.split('_')[0]
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
# Create Gradio interface
# Layout: header banner, then a two-column row (upload controls | results),
# then an info row, a collapsible architecture section, event wiring, footer.
with gr.Blocks(title="🔐 CAPTCHA Breaker", theme=gr.themes.Soft()) as demo:
    # Header banner (rendered as centered HTML-in-markdown)
    gr.Markdown("""
    <div style="text-align: center; padding: 20px;">

    # 🔐 CAPTCHA Breaker

    ### Advanced AI-Powered CAPTCHA Recognition

    Powered by **CNN + LSTM + Self-Attention** neural network

    </div>
    """)

    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("#### 📸 Upload Your CAPTCHA")
            # image_mode="L" forces grayscale delivery, matching the
            # single-channel input the model was trained on.
            image_input = gr.Image(
                type="pil",
                label="Drop CAPTCHA image here",
                image_mode="L"
            )

            with gr.Row():
                # Optional ground truth; when filled in, predict_captcha
                # reports CORRECT/INCORRECT against the prediction.
                ground_truth_input = gr.Textbox(
                    label="Expected Answer (optional)",
                    placeholder="Type here to verify accuracy",
                    lines=1,
                    scale=3
                )
                predict_button = gr.Button(
                    "🔍 Decode",
                    variant="primary",
                    scale=1
                )

        with gr.Column(scale=2):
            gr.Markdown("#### 🎯 Results")
            # Placeholder markdown; replaced by predict_captcha's result string.
            output = gr.Markdown(
                "<div style='text-align: center; padding: 40px; color: #888;'>Upload an image to get started</div>"
            )

    with gr.Row():
        with gr.Column():
            gr.Markdown("#### 🔬 Preprocessing Steps Applied:")
            gr.Markdown("""
            - ✓ Auto-resize to 60×160 (if needed)
            - ✓ Grayscale conversion
            - ✓ Otsu's thresholding
            - ✓ Morphological closing (denoising)
            - ✓ Tensor normalization
            - ✓ Variable length support (3-7 chars)
            - ✓ Lowercase + Uppercase + Digits
            """)

        with gr.Column():
            gr.Markdown("#### 📊 Character Set:")
            gr.Markdown("""
            - **Digits:** 0-9
            - **Lowercase:** a-z
            - **Uppercase:** A-Z
            - **Total:** 62 characters
            """)

        with gr.Column():
            gr.Markdown("#### 🖼️ Processed Image:")
            # Shows the binarized/denoised image actually fed to the model.
            preprocessed_image = gr.Image(
                label="Input After Preprocessing",
                type="pil"
            )

    # Info section
    with gr.Accordion("ℹ️ Model Architecture & Performance", open=False):
        gr.Markdown("""
        ### 🏗️ Architecture

        ```
        Input Image (1, 60, 160) [Auto-resized if needed]
        ↓
        CNN: 4 Convolutional Blocks
        • Progressive feature extraction
        • 1→32→64→128→256 channels
        ↓
        Bidirectional LSTM: 2 layers
        • 256 hidden units each direction
        • Learns sequential dependencies
        ↓
        Self-Attention: 4 heads
        • Refines character representations
        • Improves focus on important features
        ↓
        CTC Loss: Automatic Alignment
        • No bounding boxes needed!
        • Learns character positions automatically
        ↓
        Output: Variable-length prediction (3-7 characters)
        ```

        ### 📈 Model Capabilities (v3)

        | Feature | Details |
        |---------|---------|
        | **Model Version** | v3 (Latest) |
        | **Text Length** | 3-7 characters (variable) |
        | **Character Set** | 0-9, a-z, A-Z (62 total) |
        | **Architecture** | CNN + LSTM + Attention |
        | **Training Data** | 10,000 synthetic CAPTCHAs |
        | **Image Resize** | Automatic (any size → 60×160) |

        ### ⚠️ Known Limitations

        - 0 vs O confusion (visual similarity)
        - i vs l vs 1 confusion (very similar shapes)
        - Limited performance on decorative/stylized fonts
        - Sensitive to extreme image distortions
        """)

    # Connect buttons to prediction function
    predict_button.click(
        fn=predict_captcha,
        inputs=[image_input, ground_truth_input],
        outputs=[output, preprocessed_image]
    )

    # Auto-predict on image upload
    # The lambda passes "" so no ground-truth comparison is shown.
    image_input.change(
        fn=lambda img: predict_captcha(img, ""),
        inputs=image_input,
        outputs=[output, preprocessed_image]
    )

    # Footer
    gr.Markdown("""
    ---
    <div style="text-align: center; color: #999; padding: 20px;">
    Built with PyTorch | Device: {device} | GitHub: vedchamp07/captcha-breaker
    </div>
    """.format(device=DEVICE))


if __name__ == "__main__":
    # share=True creates a public gradio.live tunnel in addition to localhost.
    demo.launch(share=True)
|
models/captcha_model_v3.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e724f2d10b44f23f6794de5aa316b809388006f66eb39059851b6cd750e6de4
|
| 3 |
+
size 20361923
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
torchvision
|
| 3 |
+
captcha
|
| 4 |
+
Pillow
|
| 5 |
+
numpy
|
| 6 |
+
matplotlib
|
| 7 |
+
tqdm
|
| 8 |
+
opencv-python # For preprocessing (grayscale, noise removal)
|
| 9 |
+
gradio # For interactive web app
|
src/__init__.py
ADDED
|
File without changes
|
src/__pycache__/__init__.cpython-314.pyc
ADDED
|
Binary file (145 Bytes). View file
|
|
|
src/__pycache__/model.cpython-314.pyc
ADDED
|
Binary file (10.6 kB). View file
|
|
|
src/model.py
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CTC-based CAPTCHA recognition model.
|
| 3 |
+
Uses CNN + LSTM + CTC loss - no bounding boxes needed!
|
| 4 |
+
|
| 5 |
+
This approach is standard for sequence recognition tasks where
|
| 6 |
+
character positions are unknown or variable.
|
| 7 |
+
"""
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn as nn
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class CTCCaptchaModel(nn.Module):
    """
    CAPTCHA recognition using CTC (Connectionist Temporal Classification).

    Architecture:
        1. CNN backbone extracts visual features
        2. Reshape to sequence (treating width as time steps)
        3. Bidirectional LSTM processes sequence
        4. Optional multi-head self-attention refines the sequence
        5. Linear layer outputs character probabilities for each time step
        6. CTC loss handles alignment between predictions and ground truth

    No need for bounding boxes - CTC figures out alignment automatically!
    """

    def __init__(self, num_classes=36, hidden_size=256, num_lstm_layers=2, use_attention=False):
        """
        Args:
            num_classes: Number of character classes (36 for A-Z, 0-9)
            hidden_size: Hidden size for LSTM layers
            num_lstm_layers: Number of LSTM layers
            use_attention: If True, add multi-head self-attention on LSTM output
        """
        super(CTCCaptchaModel, self).__init__()

        self.num_classes = num_classes
        # CTC needs blank token for alignment (class index = num_classes)
        self.blank_idx = num_classes

        # CNN backbone for feature extraction
        # Input: (batch, 1, 60, 160) - grayscale image
        self.cnn = nn.Sequential(
            # Block 1
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # -> (32, 30, 80)

            # Block 2
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # -> (64, 15, 40)

            # Block 3
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d((1, 2)),  # Pool only width -> (128, 15, 20)

            # Block 4
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d((1, 2)),  # Pool only width -> (256, 15, 10)
        )

        # After CNN: (batch, 256, 15, 10)
        # We'll reshape to: (batch, 10, 256*15) treating width as sequence
        # So sequence length = 10, feature dim = 256*15 = 3840
        self.feature_size = 256 * 15  # channels * height
        self.sequence_length = 10  # width after pooling

        # Map CNN features to LSTM input size
        self.map_to_seq = nn.Linear(self.feature_size, hidden_size)

        # Bidirectional LSTM to process sequence
        self.lstm = nn.LSTM(
            hidden_size,
            hidden_size,
            num_layers=num_lstm_layers,
            bidirectional=True,
            # nn.LSTM warns if dropout is set with a single layer
            dropout=0.3 if num_lstm_layers > 1 else 0,
            batch_first=True
        )

        # Optional self-attention on top of LSTM outputs (post-norm residual)
        self.use_attention = use_attention
        if self.use_attention:
            self.attn = nn.MultiheadAttention(hidden_size * 2, num_heads=4, dropout=0.1, batch_first=True)
            self.attn_norm = nn.LayerNorm(hidden_size * 2)
            self.attn_dropout = nn.Dropout(0.1)
        else:
            self.attn = None

        # Output layer: map LSTM outputs to character probabilities
        # +1 for CTC blank token
        self.fc = nn.Linear(hidden_size * 2, num_classes + 1)  # *2 for bidirectional

    def forward(self, x):
        """
        Args:
            x: Input images (batch_size, 1, 60, 160)

        Returns:
            Log probabilities for CTC loss (sequence_length, batch_size, num_classes+1)
        """
        batch_size = x.size(0)

        # Extract CNN features
        features = self.cnn(x)  # (batch, 256, 15, 10)

        # Reshape to sequence: (batch, width, channels*height)
        # Transpose to treat width as sequence dimension
        features = features.permute(0, 3, 1, 2)  # (batch, 10, 256, 15)
        features = features.reshape(batch_size, self.sequence_length, self.feature_size)

        # Map to LSTM input size
        features = self.map_to_seq(features)  # (batch, 10, hidden_size)

        # Process with LSTM
        lstm_out, _ = self.lstm(features)  # (batch, 10, hidden_size*2)

        # Optional attention with residual connection + layer norm
        if self.attn is not None:
            attn_out, _ = self.attn(lstm_out, lstm_out, lstm_out)
            lstm_out = self.attn_norm(lstm_out + self.attn_dropout(attn_out))

        # Get character predictions for each time step
        logits = self.fc(lstm_out)  # (batch, 10, num_classes+1)

        # CTC expects: (sequence_length, batch, num_classes)
        logits = logits.permute(1, 0, 2)  # (10, batch, num_classes+1)

        # Apply log_softmax for CTC loss
        log_probs = torch.nn.functional.log_softmax(logits, dim=2)

        return log_probs

    def predict(self, x):
        """
        Decode predictions using greedy (best-path) CTC decoding.

        Args:
            x: Input images (batch_size, 1, 60, 160)

        Returns:
            Predicted character indices (batch_size, sequence_length),
            right-padded with the CTC blank index (== num_classes).
            Consumers must drop indices >= num_classes when mapping
            back to characters.
        """
        self.eval()
        with torch.no_grad():
            log_probs = self.forward(x)  # (seq_len, batch, num_classes+1)

            # Greedy decoding: take argmax at each time step
            _, preds = log_probs.max(2)  # (seq_len, batch)
            preds = preds.transpose(0, 1)  # (batch, seq_len)

            # Decode: remove blanks and collapse repeated characters (CTC rule)
            decoded = []
            for pred_seq in preds:
                decoded_seq = []
                prev_char = None

                for char_idx in pred_seq:
                    char_idx = char_idx.item()

                    # Blank separates genuine repeats, so it resets the tracker
                    if char_idx == self.blank_idx:
                        prev_char = None
                        continue

                    # Skip repeated characters (CTC rule)
                    if char_idx != prev_char:
                        decoded_seq.append(char_idx)
                    prev_char = char_idx

                decoded.append(decoded_seq)

            # Pad sequences to a fixed length.
            # BUGFIX: pad with the blank index, not 0 — index 0 is a real
            # character class, so zero-padding injected spurious characters
            # into the decoded text downstream. Also pad/truncate to the
            # full sequence length (10) instead of a hard-coded 5, so the
            # advertised 6-7 character CAPTCHAs survive decoding.
            max_len = self.sequence_length
            padded = [
                (seq + [self.blank_idx] * (max_len - len(seq)))[:max_len]
                for seq in decoded
            ]

            # Return tensor on same device as input
            return torch.tensor(padded, dtype=torch.long, device=x.device)
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
class CTCCaptchaModelSimple(nn.Module):
    """
    Simpler CTC model without LSTM (faster training, less memory).
    Good baseline to start with.
    """

    def __init__(self, num_classes=36):
        """
        Args:
            num_classes: Number of character classes (excluding the CTC blank)
        """
        super(CTCCaptchaModelSimple, self).__init__()

        self.num_classes = num_classes
        # CTC blank token occupies the extra class index
        self.blank_idx = num_classes

        # CNN backbone
        self.features = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),  # -> (64, 30, 80)

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),  # -> (128, 15, 40)

            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d((1, 2)),  # -> (256, 15, 20)

            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d((1, 2)),  # -> (512, 15, 10)
        )

        # Direct mapping to character predictions
        # Treat width dimension as sequence
        self.classifier = nn.Sequential(
            nn.Linear(512 * 15, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes + 1)  # +1 for the CTC blank
        )

        self.sequence_length = 10

    def forward(self, x):
        """
        Forward pass for CTC.

        Args:
            x: Input images (batch_size, 1, 60, 160)

        Returns:
            Log probabilities (sequence_length, batch_size, num_classes+1)
        """
        batch_size = x.size(0)

        # Extract features
        features = self.features(x)  # (batch, 512, 15, 10)

        # Reshape: treat width as sequence
        features = features.permute(0, 3, 1, 2)  # (batch, 10, 512, 15)
        features = features.reshape(batch_size, self.sequence_length, -1)

        # Classify each time step
        logits = self.classifier(features)  # (batch, 10, num_classes+1)

        # CTC format: (sequence_length, batch, num_classes+1)
        logits = logits.permute(1, 0, 2)  # (10, batch, num_classes+1)
        log_probs = torch.nn.functional.log_softmax(logits, dim=2)

        return log_probs

    def predict(self, x):
        """
        Greedy (best-path) CTC decoding.

        Returns:
            (batch_size, sequence_length) long tensor of class indices,
            right-padded with the blank index; consumers must drop
            indices >= num_classes.
        """
        self.eval()
        with torch.no_grad():
            log_probs = self.forward(x)
            _, preds = log_probs.max(2)
            preds = preds.transpose(0, 1)

            # Decode: drop blanks, collapse repeats (CTC rule)
            decoded = []
            for pred_seq in preds:
                decoded_seq = []
                prev_char = None

                for char_idx in pred_seq:
                    char_idx = char_idx.item()
                    if char_idx == self.blank_idx:
                        # Blank resets the repeat tracker
                        prev_char = None
                        continue
                    if char_idx != prev_char:
                        decoded_seq.append(char_idx)
                    prev_char = char_idx

                decoded.append(decoded_seq)

            # BUGFIX: pad with the blank index, not 0 — index 0 is a valid
            # character class, so zero-padding injected spurious characters
            # into decoded output. Pad/truncate to the full sequence length
            # instead of a hard-coded 5 to keep longer predictions intact.
            max_len = self.sequence_length
            padded = [
                (seq + [self.blank_idx] * (max_len - len(seq)))[:max_len]
                for seq in decoded
            ]

            # Return tensor on same device as input
            return torch.tensor(padded, dtype=torch.long, device=x.device)
|