"""Baseline CAPTCHA recognition model and submission script.

Defines a small convolutional network that predicts a fixed number of
characters per image. When executed as a script, it loads pretrained weights
from the current directory, runs inference over the ``*.jpg`` files in the
test-data directory and writes the decoded texts to ``submission.csv``.
"""

import csv
from pathlib import Path

import torch
import torch.nn as nn
from datasets import Dataset, Image
from huggingface_hub import PyTorchModelHubMixin
from torch import Tensor
from torch.utils.data import DataLoader


class DetectionHeads(nn.Module):
    """Independent per-character MLP heads followed by one shared projection.

    Every head maps ``input_dim -> 64 -> 32 -> 16 -> 8`` with GELU
    activations. The final ``proj`` layer (``8 -> class_num``) is a single
    module applied to the output of every head.

    Args:
        input_dim: Size of the flattened feature vector fed to each head.
        class_num: Number of output classes per character position.
        n_heads: Number of character positions to predict. Defaults to 4,
            the value that was previously hard-coded.
    """

    def __init__(self, input_dim: int, class_num: int, n_heads: int = 4):
        super().__init__()
        # NOTE: attribute names and layer positions determine the state-dict
        # keys, so they must stay as they are for saved weights to load.
        self.heads = nn.ModuleList(
            [
                nn.Sequential(
                    nn.Linear(input_dim, 64),
                    nn.GELU(),
                    nn.Linear(64, 32),
                    nn.GELU(),
                    nn.Linear(32, 16),
                    nn.GELU(),
                    nn.Linear(16, 8),
                    nn.GELU(),
                )
                for _ in range(n_heads)
            ]
        )
        self.proj = nn.Linear(8, class_num)

    def forward(self, x: Tensor) -> Tensor:
        """Return the stacked per-head outputs.

        Args:
            x: Features of shape ``(batch_size, input_dim)``.

        Returns:
            Tensor of shape ``(batch_size, n_heads, class_num)``.
        """
        # Iterate over the heads that actually exist instead of a hard-coded
        # range, so forward() can never disagree with __init__().
        return torch.stack([self.proj(head(x)) for head in self.heads], dim=1)


class Baseline2024(nn.Module, PyTorchModelHubMixin):
    """Six-stage CNN backbone with ``DetectionHeads`` on top.

    Each stage is ``conv3x3 -> GELU -> max-pool -> batch-norm`` and doubles
    the channel count (``n_channels`` up to ``n_channels * 32``).

    Args:
        class_num: Number of classes per character position. The default of
            39 matches the size of ``char_dict`` in this module.
        n_channels: Output channels of the first convolution.
        p_dropout: Dropout probability applied to the flattened features.
            NOTE(review): the 0.95 default is unusually high; it is kept
            unchanged here. Dropout is inactive in ``eval()`` mode.
    """

    def __init__(
        self,
        class_num: int = 26 + 10 + 3,
        n_channels: int = 32,
        p_dropout: float = 0.95,
    ):
        super().__init__()
        self.act = nn.GELU()
        # pool2d halves height and width; pool1d halves only the width.
        self.pool2d = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.pool1d = nn.MaxPool2d(kernel_size=(1, 2), stride=(1, 2), padding=0)

        self.conv1 = nn.Conv2d(3, n_channels, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(n_channels)
        self.conv2 = nn.Conv2d(
            n_channels, n_channels * 2, kernel_size=3, stride=1, padding=1
        )
        self.bn2 = nn.BatchNorm2d(n_channels * 2)
        self.conv3 = nn.Conv2d(
            n_channels * 2, n_channels * 4, kernel_size=3, stride=1, padding=1
        )
        self.bn3 = nn.BatchNorm2d(n_channels * 4)
        self.conv4 = nn.Conv2d(
            n_channels * 4, n_channels * 8, kernel_size=3, stride=1, padding=1
        )
        self.bn4 = nn.BatchNorm2d(n_channels * 8)
        self.conv5 = nn.Conv2d(
            n_channels * 8, n_channels * 16, kernel_size=3, stride=1, padding=1
        )
        self.bn5 = nn.BatchNorm2d(n_channels * 16)
        self.conv6 = nn.Conv2d(
            n_channels * 16, n_channels * 32, kernel_size=3, stride=1, padding=1
        )
        self.bn6 = nn.BatchNorm2d(n_channels * 32)

        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(p_dropout)
        # The heads expect exactly n_channels * 32 features, i.e. the
        # backbone output must have been reduced to 1x1 spatially.
        self.heads = DetectionHeads(n_channels * 32, class_num)

    def forward(self, x: Tensor) -> Tensor:
        """Run the backbone and the detection heads.

        Args:
            x: Image batch of shape ``(batch_size, 3, 30, 108)``. With this
                size the four 2x2 pools and two 1x2 pools reduce the feature
                map to 1x1, which is what the heads' input size requires.

        Returns:
            Tensor of shape ``(batch_size, 4, class_num)``.
        """
        stages = (
            (self.conv1, self.pool2d, self.bn1),
            (self.conv2, self.pool2d, self.bn2),
            (self.conv3, self.pool2d, self.bn3),
            (self.conv4, self.pool2d, self.bn4),
            (self.conv5, self.pool1d, self.bn5),
            (self.conv6, self.pool1d, self.bn6),
        )
        for conv, pool, bn in stages:
            # Order matters: normalisation is applied after pooling.
            x = bn(pool(self.act(conv(x))))

        x = self.flatten(x)
        x = self.dropout(x)
        return self.heads(x)


# Class index -> character is the position in this string:
# digits 0-9, then "-", "+", "=", then the lowercase letters a-z.
_ALPHABET = "0123456789-+=abcdefghijklmnopqrstuvwxyz"

char_dict = {char: index for index, char in enumerate(_ALPHABET)}
char_dict_rev = {v: k for k, v in char_dict.items()}


def tensor_to_text(tensor: torch.Tensor) -> str:
    """Decode one prediction into text.

    Args:
        tensor: Scores of shape ``(n_chars, class_num)``; each row is decoded
            to the character whose class index has the highest score.

    Returns:
        The decoded string, one character per row of ``tensor``.
    """
    return "".join(char_dict_rev[torch.argmax(row).item()] for row in tensor)


def tensors_to_texts(tensors: torch.Tensor) -> list[str]:
    """Decode a batch of predictions, one string per leading-axis entry."""
    return [tensor_to_text(tensor) for tensor in tensors]


def main() -> None:
    """Run inference on the test images and write ``submission.csv``."""
    model = Baseline2024.from_pretrained("./")
    model.eval()

    data_dir = Path("/tmp/data/test-data")
    # Sort so the row order of the submission is reproducible; glob order is
    # filesystem-dependent.
    captchas = [str(captcha) for captcha in sorted(data_dir.glob("*.jpg"))]
    dataset = (
        Dataset.from_dict({"image": captchas, "path": captchas})
        .cast_column("image", Image())
        .with_format("torch")
    )
    loader = DataLoader(dataset, batch_size=16)

    submission = "submission.csv"
    # newline="" is required by the csv module; without it, platforms that
    # translate "\n" emit an extra blank line after every row.
    with open(submission, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["filename", "text"])
        with torch.no_grad():
            for batch in loader:
                # presumably 8-bit pixel values scaled to [0, 1] — confirm
                # against the preprocessing used in training.
                image = batch["image"].float() / 255.0
                texts = tensors_to_texts(model(image))
                writer.writerows(
                    [Path(path).name, text]
                    for path, text in zip(batch["path"], texts)
                )


if __name__ == "__main__":
    main()