# File size: 5,181 Bytes

import csv
from pathlib import Path

import torch
import torch.nn as nn
from datasets import Dataset, Image
from huggingface_hub import PyTorchModelHubMixin
from torch import Tensor
from torch.utils.data import DataLoader


class DetectionHeads(nn.Module):
    """Parallel MLP heads, one per output position, with a shared projection.

    Every head is an independent ``input_dim -> 64 -> 32 -> 16 -> 8`` MLP with
    GELU after each linear layer. A single linear layer shared by all heads
    then maps each 8-dimensional head output to ``class_num`` logits.

    Args:
        input_dim: Size of the feature vector fed to every head.
        class_num: Number of logits produced per head.
        num_heads: Number of parallel heads (output positions). Defaults to 4.

    Raises:
        ValueError: If ``num_heads`` is smaller than 1.
    """

    def __init__(self, input_dim: int, class_num: int, num_heads: int = 4):
        super().__init__()
        if num_heads < 1:
            raise ValueError(f"num_heads must be >= 1, got {num_heads}")
        # The ModuleList/Sequential layout fixes the parameter names
        # (heads.<i>.<j>.weight, ...), so it is kept exactly as before.
        self.heads = nn.ModuleList(
            [
                nn.Sequential(
                    nn.Linear(input_dim, 64),
                    nn.GELU(),
                    nn.Linear(64, 32),
                    nn.GELU(),
                    nn.Linear(32, 16),
                    nn.GELU(),
                    nn.Linear(16, 8),
                    nn.GELU(),
                )
                for _ in range(num_heads)
            ]
        )
        self.proj = nn.Linear(8, class_num)

    def forward(self, x: Tensor) -> Tensor:
        """Run every head on the same input and stack the results.

        Args:
            x: Features of shape ``(batch_size, input_dim)``.

        Returns:
            Logits of shape ``(batch_size, num_heads, class_num)``.
        """
        # Iterating the ModuleList keeps forward in sync with however many
        # heads were built in __init__.
        return torch.stack([self.proj(head(x)) for head in self.heads], dim=1)


class Baseline2024(nn.Module, PyTorchModelHubMixin):
    """Six-stage convolutional encoder followed by ``DetectionHeads``.

    Each stage applies, in this order: 3x3 convolution (padding 1), GELU,
    max pooling, batch normalisation. The first stage maps 3 input channels
    to ``n_channels``; every later stage doubles the channel count, ending at
    ``n_channels * 32``. Stages 1-4 pool both spatial dimensions by 2, while
    stages 5-6 pool only the last dimension by 2.

    Args:
        class_num: Number of logits per head (defaults to 26 + 10 + 3 = 39).
        n_channels: Channel count produced by the first convolution.
        p_dropout: Dropout probability applied to the flattened features
            before the heads.
    """

    def __init__(
        self,
        class_num: int = 26 + 10 + 3,
        n_channels: int = 32,
        p_dropout: float = 0.95,
    ):
        super().__init__()
        self.act = nn.GELU()
        self.pool2d = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.pool1d = nn.MaxPool2d(kernel_size=(1, 2), stride=(1, 2), padding=0)

        # Channel widths: 3 -> c -> 2c -> 4c -> 8c -> 16c -> 32c.
        widths = [3] + [n_channels * 2**k for k in range(6)]
        # Modules are registered as conv1..conv6 / bn1..bn6 so the parameter
        # names in the state dict stay the same.
        for stage, (c_in, c_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(
                self,
                f"conv{stage}",
                nn.Conv2d(c_in, c_out, kernel_size=3, stride=1, padding=1),
            )
            setattr(self, f"bn{stage}", nn.BatchNorm2d(c_out))

        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(p_dropout)
        self.heads = DetectionHeads(widths[-1], class_num)

    def forward(self, x: Tensor) -> Tensor:
        """Encode a batch of images and return per-position logits.

        Args:
            x: Images of shape ``(batch_size, 3, 30, 108)``. The heads expect
                ``n_channels * 32`` flattened features, so the spatial
                dimensions must pool down to 1x1.

        Returns:
            Logits of shape ``(batch_size, 4, class_num)``.
        """
        # Stages 1-4 use the 2x2 pool, stages 5-6 the width-only pool.
        pools = (self.pool2d,) * 4 + (self.pool1d,) * 2
        for stage, pool in enumerate(pools, start=1):
            conv = getattr(self, f"conv{stage}")
            norm = getattr(self, f"bn{stage}")
            x = norm(pool(self.act(conv(x))))

        return self.heads(self.dropout(self.flatten(x)))


# Character -> class index. The vocabulary is the ten digits, then "-", "+",
# "=", then the lowercase letters; a character's index is its position in
# that sequence (0..38).
char_dict = {
    symbol: index
    for index, symbol in enumerate("0123456789-+=abcdefghijklmnopqrstuvwxyz")
}

# Class index -> character (inverse of char_dict).
char_dict_rev = dict(zip(char_dict.values(), char_dict.keys()))


def tensor_to_text(tensor: torch.Tensor) -> str:
    """Decode one sample's per-position scores into a string.

    Args:
        tensor: 2-D scores with one row per character position and one
            column per class index of ``char_dict_rev``.

    Returns:
        The character of the highest-scoring class in each row, concatenated
        in row order. An input with zero rows yields ``""``.

    Raises:
        KeyError: If a winning index has no entry in ``char_dict_rev``.
    """
    # A single argmax over the class axis and one conversion to Python ints,
    # instead of an argmax + .item() round trip for every row.
    indices = tensor.argmax(dim=-1).tolist()
    return "".join(char_dict_rev[index] for index in indices)


def tensors_to_texts(tensors: torch.Tensor) -> list[str]:
    """Decode a batch of score tensors into strings.

    Args:
        tensors: Batch whose first dimension indexes samples; each element is
            passed to ``tensor_to_text``.

    Returns:
        One decoded string per sample, in batch order.
    """
    return [tensor_to_text(sample) for sample in tensors]


if __name__ == "__main__":
    # Inference script: decode every *.jpg under the test-data directory and
    # write the predictions to submission.csv as (filename, text) rows.
    model = Baseline2024.from_pretrained("./")
    model.eval()

    data_dir = Path("/tmp/data/test-data")
    # Sorted so the row order of the submission is reproducible; the order in
    # which glob yields files depends on the filesystem.
    captchas = sorted(str(captcha) for captcha in data_dir.glob("*.jpg"))

    # The same path list is used twice: once cast to decoded images and once
    # kept as plain strings so each prediction can be matched to its file.
    dataset = (
        Dataset.from_dict({"image": captchas, "path": captchas})
        .cast_column("image", Image())
        .with_format("torch")
    )
    loader = DataLoader(dataset, batch_size=16)

    submission = "submission.csv"
    # newline="" is what the csv module expects so it controls line endings
    # itself (otherwise extra blank rows can appear on some platforms).
    with open(submission, "w", newline="", encoding="utf-8") as f, torch.no_grad():
        writer = csv.writer(f)
        writer.writerow(["filename", "text"])

        for batch in loader:
            # NOTE(review): assumes decoded images are 8-bit and already in the
            # (batch, 3, H, W) layout the model expects — confirm against the
            # installed `datasets` version.
            image = batch["image"].float() / 255.0
            output = model(image)
            texts = tensors_to_texts(output)
            for path, text in zip(batch["path"], texts):
                writer.writerow([Path(path).name, text])