Spaces:

Aff77
/

captcha_stn

Sleeping

App Files Files Community

Aff77 committed on May 23, 2025

Commit

e23252d

verified ·

1 Parent(s): 41fa3a5

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -126

app.py CHANGED Viewed

@@ -5,134 +5,48 @@ from PIL import Image
 import torchvision.transforms as transforms
 from torch import nn
 import torch.nn.functional as F
 # Device configuration
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # Constants
-VOCAB_SIZE = 26*2 + 10  # Letters (upper/lower) + digits
-OUTPUT_LENGTH = 5        # 5-character CAPTCHAs
-AFFN_KERNEL = 5
-AFFN_STRIDE = 1
-AFFN_DEPTH = 4
-CRNN_KERNEL = 5
-CRNN_POOL_KERNEL = 2
-CRNN_DROPOUT = 0.3
-CRNN_LATENT = 128
-LSTM_HIDDEN_DIM = 32
-# Character mapping
 characters = string.ascii_letters + string.digits
 idx_to_char = {i: c for i, c in enumerate(characters)}
 # --------------------------
-# Original Model Architecture (CRNN+AFFN)
 # --------------------------
-class Encoder(nn.Sequential):
-    def __init__(self, n, kernel_size, stride):
-        super().__init__(
-            nn.Conv2d(4**(n-1), 4**n, kernel_size, stride),
-            nn.BatchNorm2d(4**n),
-            nn.ReLU()
-        )
-class Decoder(nn.Sequential):
-    def __init__(self, n, kernel_size, stride):
-        super().__init__(
-            nn.ConvTranspose2d(4**n, 4**(n-1), kernel_size, stride),
-            nn.BatchNorm2d(4**(n-1)),
-            nn.ReLU()
-        )
-class AFFN(nn.Module):
-    def __init__(self, n):
         super().__init__()
-        self.n = n
-        self.alpha = nn.Parameter(torch.randn(n-1))
-        self.encoders = nn.ModuleList([Encoder(i, AFFN_KERNEL, AFFN_STRIDE) for i in range(1, n+1)])
-        self.decoders = nn.ModuleList([Decoder(i, AFFN_KERNEL, AFFN_STRIDE) for i in range(n, 0, -1)])
-    def forward(self, x):
-        residuals = []
-        for i, enc in enumerate(self.encoders):
-            x = enc(x)
-            if i < self.n - 1:
-                x = x * (1 - self.alpha[i])
-                residuals.append(x * self.alpha[i])
-        for i, dec in enumerate(self.decoders):
-            x = dec(x)
-            if i < self.n - 1:
-                x = x + residuals.pop()
-        return x
-class CRNN(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.conv1 = nn.Sequential(
-            nn.Conv2d(64, 128, CRNN_KERNEL, padding=2),
-            nn.BatchNorm2d(128),
-            nn.ReLU(),
-            nn.MaxPool2d(CRNN_POOL_KERNEL)
-        )
-        self.conv2 = nn.Sequential(
-            nn.Conv2d(128, 256, CRNN_KERNEL, padding=2),
-            nn.BatchNorm2d(256),
-            nn.ReLU(),
-            nn.MaxPool2d(CRNN_POOL_KERNEL)
-        )
-        self.flatten = nn.Flatten()
-        self.dropout = nn.Dropout(CRNN_DROPOUT)
-        self.latent_fc = nn.LazyLinear(CRNN_LATENT)
-        self.lstm = nn.LSTM(CRNN_LATENT, LSTM_HIDDEN_DIM, batch_first=True)
-        self.output_fc = nn.Linear(LSTM_HIDDEN_DIM, VOCAB_SIZE)
-    def forward(self, x):
-        x = self.conv1(x)
-        x = self.conv2(x)
-        x = self.flatten(x)
-        x = self.dropout(x)
-        x = self.latent_fc(x)
-        x = x.unsqueeze(1)
-        lstm_out, _ = self.lstm(x)
-        return self.output_fc(lstm_out.squeeze(1))
-class CaptchaCrackNet(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.affn = AFFN(AFFN_DEPTH)
-        self.conv1 = nn.Sequential(
-            nn.Conv2d(1, 32, 5, padding=2),
-            nn.ReLU(),
-            nn.MaxPool2d(2)
-        )
-        self.conv2 = nn.Sequential(
-            nn.Conv2d(32, 48, 5, padding=2),
-            nn.ReLU(),
-            nn.MaxPool2d(2)
-        )
-        self.conv3 = nn.Sequential(
-            nn.Conv2d(48, 64, 5, padding=2),
-            nn.ReLU(),
-            nn.MaxPool2d(2)
-        )
-        self.res = nn.Conv2d(1, 32, 5, stride=2, padding=2)
-        self.crnn = CRNN()
     def forward(self, x):
-        x = self.affn(x)
-        res_out = self.res(x)
-        x = self.conv1(x)
-        x = self.conv2(x + res_out)
-        x = self.conv3(x)
-        return self.crnn(x)
 # --------------------------
 # Model Loading
 # --------------------------
 def load_model():
-    model = CaptchaCrackNet().to(device)
-    model.load_state_dict(torch.load('final.pth', map_location=device))
     model.eval()
     return model
@@ -141,26 +55,50 @@ model = load_model()
 # --------------------------
 # Prediction Logic
 # --------------------------
-def to_text(pred):
-    return ''.join([idx_to_char[i] for i in pred.argmax(dim=1)])
 def predict(image):
     try:
-        # Preprocess
-        transform = transforms.Compose([
-            transforms.Resize((40, 150)),
-            transforms.Grayscale(),
-            transforms.ToTensor(),
-            transforms.Normalize((0.5,), (0.5,))
-        ])
-        img_tensor = transform(image).unsqueeze(0).to(device)
-        # Predict
         with torch.no_grad():
-            output = model(img_tensor)
-            return to_text(output.squeeze(0))
     except Exception as e:
         return f"Error: {str(e)}"
@@ -169,9 +107,10 @@ def predict(image):
 # --------------------------
 iface = gr.Interface(
     fn=predict,
-    inputs=gr.Image(type="pil", label="Upload CAPTCHA"),
     outputs=gr.Textbox(label="Predicted Text"),
-    title="CAPTCHA CrackNet",
     examples=[
         ["examples/example1.png"],
         ["examples/example2.png"]

 import torchvision.transforms as transforms
 from torch import nn
 import torch.nn.functional as F
+from torchvision import models
+from itertools import groupby
 # Device configuration
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # Constants
+IMG_HEIGHT = 32
+IMG_WIDTH = 128
 characters = string.ascii_letters + string.digits
+char_to_idx = {c: i for i, c in enumerate(characters)}
 idx_to_char = {i: c for i, c in enumerate(characters)}
+VOCAB_SIZE = len(characters) + 1  # +1 for CTC blank token
 # --------------------------
+# Model Architecture (Same as Training)
 # --------------------------
+class FastCRNN(nn.Module):
+    def __init__(self, num_classes):
         super().__init__()
+        resnet = models.resnet18(pretrained=False)
+        resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
+        self.cnn = nn.Sequential(*list(resnet.children())[:-3])  # Output: [B, 256, 4, 16]
+        self.lstm_input_size = 128 * (IMG_HEIGHT // 8)  # 256 * 4
+        self.rnn = nn.LSTM(self.lstm_input_size, 256, num_layers=2, bidirectional=True, dropout=0.1)
+        self.fc = nn.Linear(512, num_classes)
     def forward(self, x):
+        x = self.cnn(x)
+        x = x.permute(3, 0, 1, 2)  # [W, B, C, H]
+        x = x.contiguous().view(x.size(0), x.size(1), -1)  # [W, B, C*H]
+        x, _ = self.rnn(x)
+        x = self.fc(x)
+        return x
 # --------------------------
 # Model Loading
 # --------------------------
 def load_model():
+    """Build a FastCRNN, load its trained weights and switch it to eval mode.
+
+    Returns:
+        The FastCRNN instance, moved to ``device`` with the state dict from
+        'fast_crnn_captcha_model.pth' loaded.
+    """
+    model = FastCRNN(num_classes=VOCAB_SIZE).to(device)
+    # NOTE(review): torch.load unpickles the checkpoint file; if the installed
+    # torch supports it, passing weights_only=True would restrict loading to
+    # tensors - confirm against the pinned torch version before changing.
+    model.load_state_dict(torch.load('fast_crnn_captcha_model.pth', map_location=device))
     model.eval()
     return model
 # --------------------------
 # Prediction Logic
 # --------------------------
+def decode_predictions(preds):
+    """Convert model output to text using CTC decoding"""
+    preds = preds.permute(1, 0, 2)  # [B, W, C]
+    _, pred_indices = preds.max(2)
+    texts = []
+    for pred in pred_indices:
+        # CTC decoding: merge repeated and remove blank
+        decoded = []
+        prev_char = None
+        for idx in pred:
+            char = idx_to_char.get(idx.item(), '')
+            if char != prev_char and char != '' and idx.item() != (VOCAB_SIZE - 1):
+                decoded.append(char)
+            prev_char = char
+        texts.append(''.join(decoded))
+    return texts[0] if len(texts) == 1 else texts
+def preprocess_image(image):
+    """Convert input to model-compatible format"""
+    transform = transforms.Compose([
+        transforms.Resize((IMG_HEIGHT, IMG_WIDTH)),
+        transforms.Grayscale(),
+        transforms.ToTensor(),
+        transforms.Normalize((0.5,), (0.5,))
+    ])
+    return transform(image).unsqueeze(0).to(device)
 def predict(image):
     try:
+        # Handle Gradio input types
+        if isinstance(image, dict):
+            image = image['image'] if 'image' in image else image['data']
+        if not isinstance(image, Image.Image):
+            image = Image.fromarray(image)
+        # Process and predict
+        image_tensor = preprocess_image(image)
         with torch.no_grad():
+            outputs = model(image_tensor)
+            prediction = decode_predictions(outputs)
+        return prediction
     except Exception as e:
         return f"Error: {str(e)}"
 # --------------------------
 iface = gr.Interface(
     fn=predict,
+    inputs=gr.Image(type="pil", label="Upload CAPTCHA Image"),
     outputs=gr.Textbox(label="Predicted Text"),
+    title="CAPTCHA Solver (FastCRNN)",
+    description="Upload a CAPTCHA image to extract text using ResNet18 + BiLSTM",
     examples=[
         ["examples/example1.png"],
         ["examples/example2.png"]