| | import csv |
| | from pathlib import Path |
| |
|
| | import torch |
| | import torch.nn as nn |
| | from datasets import Dataset, Image |
| | from huggingface_hub import PyTorchModelHubMixin |
| | from torch import Tensor |
| | from torch.utils.data import DataLoader |
| |
|
| |
|
class DetectionHeads(nn.Module):
    """Multi-head classifier: one small MLP head per character position.

    Each head maps the shared flattened feature vector to an 8-dim
    embedding; a single shared linear layer then projects every embedding
    onto the character classes.
    """

    def __init__(self, input_dim: int, class_num: int, head_num: int = 4):
        """
        Args:
            input_dim: size of the feature vector fed to every head.
            class_num: number of character classes per position.
            head_num: number of character positions. Defaults to 4, matching
                the original fixed-size implementation (backward compatible).
        """
        super().__init__()
        self.heads = nn.ModuleList(
            [
                nn.Sequential(
                    nn.Linear(input_dim, 64),
                    nn.GELU(),
                    nn.Linear(64, 32),
                    nn.GELU(),
                    nn.Linear(32, 16),
                    nn.GELU(),
                    nn.Linear(16, 8),
                    nn.GELU(),
                )
                for _ in range(head_num)
            ]
        )
        # The projection onto class logits is shared by all heads.
        self.proj = nn.Linear(8, class_num)

    def forward(self, x: Tensor) -> Tensor:
        """Return logits of shape (batch, head_num, class_num)."""
        # Iterate the ModuleList directly instead of indexing with a
        # hard-coded range(4), so any head count works.
        return torch.stack([self.proj(head(x)) for head in self.heads], dim=1)
| |
|
| |
|
class Baseline2024(nn.Module, PyTorchModelHubMixin):
    """CNN baseline for fixed-length captcha recognition.

    Six conv/pool/batch-norm stages reduce the input image to a feature
    vector, which dropout plus the detection heads turn into per-position
    class logits.
    """

    def __init__(
        self,
        class_num: int = 26 + 10 + 3,
        n_channels: int = 32,
        p_dropout: float = 0.95,
    ):
        """
        Args:
            class_num: character classes per position (26 letters + 10
                digits + 3 symbols by default).
            n_channels: width of the first conv stage; every later stage
                doubles it.
            p_dropout: dropout probability applied before the heads.
        """
        super().__init__()
        self.act = nn.GELU()
        # pool2d halves both spatial dims; pool1d halves only the width.
        self.pool2d = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.pool1d = nn.MaxPool2d(kernel_size=(1, 2), stride=(1, 2), padding=0)

        # Register conv1..conv6 / bn1..bn6 with channel counts doubling at
        # every stage: 3 -> n, 2n, 4n, 8n, 16n, 32n.  setattr keeps the
        # attribute names (and therefore the state-dict keys) identical to
        # the original hand-unrolled layout.
        widths = [3] + [n_channels * (2 ** i) for i in range(6)]
        for stage, (c_in, c_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(
                self,
                f"conv{stage}",
                nn.Conv2d(c_in, c_out, kernel_size=3, stride=1, padding=1),
            )
            setattr(self, f"bn{stage}", nn.BatchNorm2d(c_out))

        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(p_dropout)
        self.heads = DetectionHeads(n_channels * 32, class_num)

    def forward(self, x: Tensor) -> Tensor:
        """Map an image batch to per-position class logits."""
        stages = (
            (self.conv1, self.bn1, self.pool2d),
            (self.conv2, self.bn2, self.pool2d),
            (self.conv3, self.bn3, self.pool2d),
            (self.conv4, self.bn4, self.pool2d),
            (self.conv5, self.bn5, self.pool1d),
            (self.conv6, self.bn6, self.pool1d),
        )
        # Every stage applies conv -> GELU -> pool -> batch norm, in that
        # exact order, matching the original unrolled implementation.
        for conv, bn, pool in stages:
            x = bn(pool(self.act(conv(x))))

        x = self.flatten(x)
        x = self.dropout(x)
        return self.heads(x)
| |
|
| |
|
# Class-index layout: digits "0"-"9" -> 0-9, then "-", "+", "=" -> 10-12,
# then the lowercase letters "a"-"z" -> 13-38 (39 classes in total).
_CHARSET = "0123456789-+=abcdefghijklmnopqrstuvwxyz"

# character -> class index
char_dict = {char: index for index, char in enumerate(_CHARSET)}

# class index -> character (inverse lookup used when decoding predictions)
char_dict_rev = dict(enumerate(_CHARSET))
| |
|
| |
|
def tensor_to_text(tensor: torch.Tensor) -> str:
    """Decode one sample's logits into its text string.

    Takes the argmax of each row (one row per character position) and maps
    the winning class index back to its character via ``char_dict_rev``.

    Args:
        tensor: per-position logits; iterated row by row.

    Returns:
        The decoded string, one character per row.
    """
    # str.join instead of repeated "+=" — avoids quadratic string building.
    return "".join(char_dict_rev[torch.argmax(row).item()] for row in tensor)
| |
|
| |
|
def tensors_to_texts(tensors: torch.Tensor) -> list[str]:
    """Decode a batch of logit tensors into their text strings.

    Args:
        tensors: batch iterated sample by sample; each sample is decoded
            with :func:`tensor_to_text`.

    Returns:
        One decoded string per sample, in batch order.
    """
    # Comprehension replaces the manual append loop (same order, same result).
    return [tensor_to_text(tensor) for tensor in tensors]
| |
|
| |
|
if __name__ == "__main__":
    # Load weights from the current directory (as saved by
    # PyTorchModelHubMixin.save_pretrained).
    model = Baseline2024.from_pretrained("./")

    # Renamed from "dir", which shadowed the builtin of the same name.
    data_dir = Path("/tmp/data/test-data")
    captchas = [str(captcha) for captcha in data_dir.glob("*.jpg")]

    # "path" keeps the original filename alongside the decoded image so it
    # can be written into the submission rows.
    dataset = (
        Dataset.from_dict({"image": captchas, "path": captchas})
        .cast_column("image", Image())
        .with_format("torch")
    )
    loader = DataLoader(dataset, batch_size=16)

    model.eval()

    submission = "submission.csv"
    # newline="" is required by the csv module: without it, platforms that
    # translate line endings get an extra blank line after every row.
    with open(submission, "w", newline="") as f, torch.no_grad():
        writer = csv.writer(f)
        writer.writerow(["filename", "text"])

        for batch in loader:
            # Scale uint8 pixel values into [0, 1].
            # NOTE(review): this assumes batch["image"] is already
            # (N, C, H, W); datasets' torch format often yields
            # (N, H, W, C) — confirm against the saved model's input layout.
            image = batch["image"].float() / 255.0
            output = model(image)
            texts = tensors_to_texts(output)
            for i, text in enumerate(texts):
                file = Path(batch["path"][i]).name
                writer.writerow([file, text])
| |
|