TopAI-1
/

Pixel-1

+import torch
+import torch.nn as nn
+from transformers import PreTrainedModel
+from .configuration_pixel import PixelConfig
class ResidualBlock(nn.Module):
    """Two 3x3 conv + batch-norm layers wrapped in an identity skip connection.

    Every layer keeps the channel count and the 3x3 convolutions use
    ``padding=1``, so the output has the same shape as the input.
    """

    def __init__(self, channels):
        """Build the residual branch.

        Args:
            channels: Number of feature channels entering and leaving the
                block.
        """
        super().__init__()
        # The layer order (and therefore the Sequential indices 0-4) is kept
        # stable because the state_dict keys are derived from it.
        layers = [
            nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(channels),
        ]
        self.block = nn.Sequential(*layers)

    def forward(self, x):
        """Return ``x`` plus the output of the residual branch applied to ``x``."""
        residual = self.block(x)
        return x + residual
class PixelGenerator(PreTrainedModel):
    """Decoder that turns text embedding vectors into 128x128 images.

    An embedding is linearly projected to a 1024-channel 4x4 feature map,
    which five stride-2 transposed-convolution stages upsample to 128x128.
    A final 3x3 convolution maps to ``config.image_channels`` channels and a
    ``Tanh`` bounds the output to the range (-1, 1).
    """

    config_class = PixelConfig

    def __init__(self, config):
        """Create the projection layer and the upsampling decoder.

        Args:
            config: Model configuration; ``input_dim`` and ``image_channels``
                are read from it.
        """
        super().__init__(config)
        # One embedding vector becomes a flattened (1024, 4, 4) seed map.
        self.text_projection = nn.Linear(config.input_dim, 4 * 4 * 1024)
        # Stage order fixes the Sequential indices used in state_dict keys.
        stages = [
            self._upsample(1024, 512),  # 4x4 -> 8x8
            ResidualBlock(512),
            self._upsample(512, 256),  # 8x8 -> 16x16
            ResidualBlock(256),
            self._upsample(256, 128),  # 16x16 -> 32x32
            self._upsample(128, 64),  # 32x32 -> 64x64
            self._upsample(64, 32),  # 64x64 -> 128x128
            nn.Conv2d(32, config.image_channels, kernel_size=3, padding=1),
            nn.Tanh(),
        ]
        self.decoder = nn.Sequential(*stages)

    def _upsample(self, in_channels, out_channels):
        """Return a stage that doubles spatial size and changes channel count.

        The transposed convolution uses kernel 4, stride 2, padding 1, which
        maps an HxW input to 2Hx2W; it is followed by batch norm and ReLU.
        """
        transposed_conv = nn.ConvTranspose2d(
            in_channels, out_channels, 4, 2, 1, bias=False
        )
        return nn.Sequential(
            transposed_conv,
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, text_embeddings):
        """Generate images from text embeddings.

        Args:
            text_embeddings: Tensor whose last dimension is
                ``config.input_dim``.

        Returns:
            Tensor of shape ``(N, config.image_channels, 128, 128)`` with
            values in (-1, 1). ``N`` is inferred by the reshape, so any
            leading dimensions of the input are merged into it.
        """
        projected = self.text_projection(text_embeddings)
        seed_map = projected.view(-1, 1024, 4, 4)
        return self.decoder(seed_map)