Aff77 committed on
Commit
41fa3a5
·
verified ·
1 Parent(s): 5a255e5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +181 -162
app.py CHANGED
@@ -1,163 +1,182 @@
1
- import gradio as gr
2
- import torch
3
- import string
4
- from PIL import Image
5
- import torchvision.transforms as transforms
6
- from torch import nn
7
- import torch.nn.functional as F
8
- from torchvision import models
9
-
10
# Device configuration: prefer GPU when available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Character mapping: 52 ASCII letters + 10 digits = 62 symbols.
characters = string.ascii_letters + string.digits
char_to_idx = {c: i for i, c in enumerate(characters)}
idx_to_char = {i: c for i, c in enumerate(characters)}
VOCAB_SIZE = len(characters) + 1  # +1 for blank token for CTC (blank id = 62)
MAX_LABEL_LENGTH = 6  # presumably the max captcha length seen in training — TODO confirm
IMG_HEIGHT = 32  # fixed input height expected by the network
IMG_WIDTH = 128  # fixed input width expected by the network
21
-
22
- # --------------------------
23
- # Your Model Architecture
24
- # --------------------------
25
class STN(nn.Module):
    """Spatial Transformer Network: learns an affine warp applied to the
    input image before recognition, to undo captcha distortion."""

    def __init__(self):
        super().__init__()
        # Localization network: small CNN that regresses the warp parameters.
        self.localization = nn.Sequential(
            nn.Conv2d(1, 8, kernel_size=7),
            nn.MaxPool2d(2, stride=2),
            nn.ReLU(True),
            nn.Conv2d(8, 10, kernel_size=5),
            nn.MaxPool2d(2, stride=2),
            nn.ReLU(True)
        )

        # Calculate flattened size by tracing a dummy input at the fixed
        # (IMG_HEIGHT, IMG_WIDTH) model resolution.
        with torch.no_grad():
            dummy = torch.zeros(1, 1, IMG_HEIGHT, IMG_WIDTH)
            out = self.localization(dummy)
            self.flat_size = out.view(1, -1).shape[1]

        # Regressor producing the 6 parameters of a 2x3 affine matrix.
        self.fc_loc = nn.Sequential(
            nn.Linear(self.flat_size, 32),
            nn.ReLU(True),
            nn.Linear(32, 6)
        )

        # Initialize as identity transform so training starts from "no warp".
        self.fc_loc[2].weight.data.zero_()
        self.fc_loc[2].bias.data.copy_(torch.tensor([1, 0, 0, 0, 1, 0], dtype=torch.float))

    def forward(self, x):
        # Predict theta from the input, then resample the input on the
        # corresponding affine grid.
        xs = self.localization(x)
        xs = xs.view(xs.size(0), -1)
        theta = self.fc_loc(xs)
        theta = theta.view(-1, 2, 3)
        grid = F.affine_grid(theta, x.size(), align_corners=False)
        x = F.grid_sample(x, grid, align_corners=False)
        return x
61
-
62
class FastCRNN(nn.Module):
    """CRNN recognizer: STN rectification -> truncated ResNet-18 features
    -> bidirectional LSTM -> per-timestep class logits (CTC-style output)."""

    def __init__(self, num_classes):
        super().__init__()
        self.stn = STN()
        # NOTE(review): `pretrained=` is deprecated in newer torchvision in
        # favor of `weights=`; works but may warn — confirm torchvision version.
        resnet = models.resnet18(pretrained=False)
        # Replace the stem conv so the network accepts 1-channel grayscale input.
        resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        # Drop the last residual stage, avgpool and fc: keep spatial feature maps.
        self.cnn = nn.Sequential(*list(resnet.children())[:-3])

        # Feature height after the CNN is IMG_HEIGHT // 8, with 128 channels.
        self.lstm_input_size = 128 * (IMG_HEIGHT // 8)
        self.rnn = nn.LSTM(self.lstm_input_size, 256, num_layers=2, bidirectional=True, dropout=0.1)
        self.fc = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.stn(x)
        x = self.cnn(x)                                    # [B, C, H, W]
        x = x.permute(3, 0, 1, 2)                          # [W, B, C, H] — width is the time axis
        x = x.contiguous().view(x.size(0), x.size(1), -1)  # [W, B, C*H]
        x, _ = self.rnn(x)                                 # [W, B, 512]
        x = self.fc(x)                                     # [W, B, num_classes]
        return x
82
-
83
- # --------------------------
84
- # Helper Functions
85
- # --------------------------
86
def decode_predictions(preds):
    """Greedy CTC decode: collapse repeats, drop the blank, map ids to text.

    `preds` is [W, B, C] logits; returns one string for a single-item batch,
    otherwise a list of strings.
    """
    # [W, B, C] -> [B, W]: most likely class id per timestep.
    batch_first = preds.permute(1, 0, 2)
    best_ids = torch.argmax(torch.softmax(batch_first, dim=2), dim=2)

    blank_threshold = len(characters)  # ids >= this are the CTC blank
    results = []
    for sequence in best_ids:
        collapsed = []
        last = -1
        for idx in sequence:
            # Keep a symbol only when it differs from its predecessor and
            # is a real character (not the blank).
            if idx != last and idx < blank_threshold:
                collapsed.append(idx.item())
            last = idx
        results.append(''.join(idx_to_char[c] for c in collapsed))
    return results[0] if len(results) == 1 else results
104
-
105
- # --------------------------
106
- # Model Loading
107
- # --------------------------
108
def load_model():
    """Instantiate FastCRNN, restore trained weights, switch to eval mode."""
    net = FastCRNN(num_classes=VOCAB_SIZE).to(device)
    checkpoint = torch.load('model/fast_crnn.pth', map_location=device)
    net.load_state_dict(checkpoint)
    net.eval()
    return net

# Loaded once at import time so every request reuses the same instance.
model = load_model()
115
-
116
- # --------------------------
117
- # Prediction Function
118
- # --------------------------
119
def predict_captcha(image):
    """Predict the text in a CAPTCHA image uploaded through Gradio.

    Accepts a PIL image, a numpy array, a file path, or the dict some Gradio
    versions pass; returns the decoded string, or an "Error: ..." message.
    """
    try:
        # BUG FIX: `np` was referenced below without ever being imported, so
        # any ndarray input raised NameError (swallowed by the except clause).
        import numpy as np

        # Normalize the Gradio input to a PIL Image.
        if isinstance(image, dict):  # Gradio might pass a dict
            image = image['image'] if 'image' in image else image['data']
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image) if isinstance(image, np.ndarray) else Image.open(image)

        # Preprocess: fixed training resolution, grayscale, scale to [-1, 1].
        transform = transforms.Compose([
            transforms.Resize((IMG_HEIGHT, IMG_WIDTH)),
            transforms.Grayscale(),
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,))
        ])

        image_tensor = transform(image).unsqueeze(0).to(device)

        # Predict without tracking gradients.
        with torch.no_grad():
            outputs = model(image_tensor)
            prediction = decode_predictions(outputs)

        return prediction

    except Exception as e:
        # Surface the failure in the UI instead of crashing the app.
        return f"Error: {str(e)}"
146
-
147
- # --------------------------
148
- # Gradio Interface
149
- # --------------------------
150
# Web UI: one image input, one text output; example files ship with the app.
iface = gr.Interface(
    fn=predict_captcha,
    inputs=gr.Image(type="pil", label="Upload CAPTCHA Image"),
    outputs=gr.Textbox(label="Predicted Text"),
    title="CAPTCHA Recognition with FastCRNN",
    description="Upload a CAPTCHA image to get the predicted text.",
    examples=[
        ["examples/example1.png"],
        ["examples/example2.png"]
    ]
)

# Start the server only when run as a script (not on import).
if __name__ == "__main__":
    iface.launch()
 
1
+ import gradio as gr
2
+ import torch
3
+ import string
4
+ from PIL import Image
5
+ import torchvision.transforms as transforms
6
+ from torch import nn
7
+ import torch.nn.functional as F
8
+
9
# Device configuration: prefer GPU when available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Constants
VOCAB_SIZE = 26*2 + 10  # Letters (upper/lower) + digits = 62 classes
OUTPUT_LENGTH = 5  # 5-character CAPTCHAs — NOTE(review): not referenced below; confirm intent
AFFN_KERNEL = 5  # conv kernel size in the AFFN encoder/decoder stages
AFFN_STRIDE = 1  # conv stride in the AFFN encoder/decoder stages
AFFN_DEPTH = 4  # number of encoder (and decoder) stages in the AFFN
CRNN_KERNEL = 5  # conv kernel size in the CRNN head
CRNN_POOL_KERNEL = 2  # max-pool kernel in the CRNN head
CRNN_DROPOUT = 0.3  # dropout rate before the latent projection
CRNN_LATENT = 128  # latent feature size fed to the LSTM
LSTM_HIDDEN_DIM = 32  # LSTM hidden state size

# Character mapping: index -> character, matching the training label encoding.
characters = string.ascii_letters + string.digits
idx_to_char = {i: c for i, c in enumerate(characters)}
27
+
28
+ # --------------------------
29
+ # Original Model Architecture (CRNN+AFFN)
30
+ # --------------------------
31
class Encoder(nn.Sequential):
    """One AFFN down-stage: conv (4**(n-1) -> 4**n channels) + BN + ReLU."""

    def __init__(self, n, kernel_size, stride):
        in_ch, out_ch = 4 ** (n - 1), 4 ** n
        super().__init__(
            nn.Conv2d(in_ch, out_ch, kernel_size, stride),
            nn.BatchNorm2d(out_ch),
            nn.ReLU()
        )
38
+
39
class Decoder(nn.Sequential):
    """One AFFN up-stage: transposed conv (4**n -> 4**(n-1) channels) + BN + ReLU."""

    def __init__(self, n, kernel_size, stride):
        wide, narrow = 4 ** n, 4 ** (n - 1)
        super().__init__(
            nn.ConvTranspose2d(wide, narrow, kernel_size, stride),
            nn.BatchNorm2d(narrow),
            nn.ReLU()
        )
46
+
47
class AFFN(nn.Module):
    """Feature-fusion autoencoder: `n` Encoder stages down, `n` Decoder
    stages up, with skip connections weighted by learned coefficients."""

    def __init__(self, n):
        super().__init__()
        self.n = n
        # One learned mixing weight per skip connection (n-1 of them).
        self.alpha = nn.Parameter(torch.randn(n - 1))
        self.encoders = nn.ModuleList([Encoder(i, AFFN_KERNEL, AFFN_STRIDE) for i in range(1, n + 1)])
        self.decoders = nn.ModuleList([Decoder(i, AFFN_KERNEL, AFFN_STRIDE) for i in range(n, 0, -1)])

    def forward(self, x):
        # Down path. Order matters: the carried tensor is down-weighted by
        # (1 - alpha) FIRST, then the skip copy is taken from that already
        # scaled tensor (i.e. skip = alpha * (1 - alpha) * stage output).
        skips = []
        last = self.n - 1
        for i, encode in enumerate(self.encoders):
            x = encode(x)
            if i < last:
                x = x * (1 - self.alpha[i])
                skips.append(x * self.alpha[i])

        # Up path: fuse the stashed skips back in, innermost stage first.
        for i, decode in enumerate(self.decoders):
            x = decode(x)
            if i < last:
                x = x + skips.pop()
        return x
68
+
69
class CRNN(nn.Module):
    """Recognition head: two conv+pool stages, a latent projection, a
    length-1 LSTM pass, and a final linear layer producing class logits.

    Attribute names are part of the checkpoint's state_dict contract and
    must not be renamed.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(64, 128, CRNN_KERNEL, padding=2),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(CRNN_POOL_KERNEL)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(128, 256, CRNN_KERNEL, padding=2),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(CRNN_POOL_KERNEL)
        )
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(CRNN_DROPOUT)
        # LazyLinear: in-features are inferred on the first forward pass.
        self.latent_fc = nn.LazyLinear(CRNN_LATENT)
        self.lstm = nn.LSTM(CRNN_LATENT, LSTM_HIDDEN_DIM, batch_first=True)
        self.output_fc = nn.Linear(LSTM_HIDDEN_DIM, VOCAB_SIZE)

    def forward(self, x):
        features = self.conv2(self.conv1(x))
        latent = self.latent_fc(self.dropout(self.flatten(features)))
        # Feed the latent vector as a length-1 sequence through the LSTM.
        lstm_out, _ = self.lstm(latent.unsqueeze(1))
        return self.output_fc(lstm_out.squeeze(1))
99
+
100
class CaptchaCrackNet(nn.Module):
    """Full pipeline: AFFN front-end, three conv stages with one strided
    residual shortcut, then the CRNN recognition head.

    Attribute names are part of the checkpoint's state_dict contract and
    must not be renamed.
    """

    def __init__(self):
        super().__init__()
        self.affn = AFFN(AFFN_DEPTH)
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 32, 5, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(32, 48, 5, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(48, 64, 5, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        # Strided shortcut: matches conv1's output (32 ch, half resolution)
        # for even input sizes.
        self.res = nn.Conv2d(1, 32, 5, stride=2, padding=2)
        self.crnn = CRNN()

    def forward(self, x):
        cleaned = self.affn(x)
        shortcut = self.res(cleaned)
        out = self.conv1(cleaned)
        out = self.conv2(out + shortcut)  # residual fusion before stage 2
        out = self.conv3(out)
        return self.crnn(out)
129
+
130
+ # --------------------------
131
+ # Model Loading
132
+ # --------------------------
133
def load_model():
    """Build CaptchaCrackNet, load trained weights from 'final.pth', and
    switch the network to inference mode.

    Returns:
        The model on `device`, in eval() mode.
    """
    model = CaptchaCrackNet().to(device)
    # weights_only=True stops torch.load from unpickling arbitrary objects:
    # a state_dict needs only tensors, and an untrusted checkpoint file can
    # otherwise execute code on load. (Requires torch >= 1.13.)
    state_dict = torch.load('final.pth', map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()
    return model

# Loaded once at import time so every request reuses the same instance.
model = load_model()
140
+
141
+ # --------------------------
142
+ # Prediction Logic
143
+ # --------------------------
144
def to_text(pred):
    """Map per-position logits to the predicted string.

    Accepts logits of shape [seq_len, VOCAB_SIZE] (one row per character)
    or a single position of shape [VOCAB_SIZE].
    """
    if pred.dim() == 1:
        # BUG FIX: `predict` passes output.squeeze(0), which is 1-D for this
        # model; argmax(dim=1) raised IndexError. Treat it as a length-1 seq.
        pred = pred.unsqueeze(0)
    # BUG FIX: .item() is required — indexing the dict with a 0-dim Tensor
    # raises KeyError, because Tensor hashing is identity-based.
    return ''.join(idx_to_char[i.item()] for i in pred.argmax(dim=1))
146
+
147
def predict(image):
    """Run the model on an uploaded PIL image and return the decoded text.

    Any failure is reported as an "Error: ..." string so the UI never crashes.
    """
    try:
        # Resize to the training resolution, grayscale, scale to [-1, 1].
        preprocess = transforms.Compose([
            transforms.Resize((40, 150)),
            transforms.Grayscale(),
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,))
        ])
        batch = preprocess(image).unsqueeze(0).to(device)

        # Inference only: no gradient tracking.
        with torch.no_grad():
            logits = model(batch)
        return to_text(logits.squeeze(0))

    except Exception as e:
        return f"Error: {str(e)}"
166
+
167
# --------------------------
# Gradio Interface
# --------------------------
# Web UI: one image input, one text output; example files ship with the app.
iface = gr.Interface(
    fn=predict,
    inputs=gr.Image(type="pil", label="Upload CAPTCHA"),
    outputs=gr.Textbox(label="Predicted Text"),
    title="CAPTCHA CrackNet",
    examples=[
        ["examples/example1.png"],
        ["examples/example2.png"]
    ]
)

# Start the server only when run as a script (not on import).
if __name__ == "__main__":
    iface.launch()