Spaces:

nitesh501
/

TinyDiT

Running

aniure

uploading

d069b0b 21 days ago

5 kB

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import math

	class SelfAttention(nn.Module):
	def __init__(self, n_heads, embd_dim, in_proj_bias=True, out_proj_bias=True):
	super().__init__()
	self.n_heads = n_heads
	self.in_proj = nn.Linear(embd_dim, 3 * embd_dim, bias=in_proj_bias)
	self.out_proj = nn.Linear(embd_dim, embd_dim, bias=out_proj_bias)

	self.d_heads = embd_dim // n_heads
	assert self.d_heads * n_heads == embd_dim, "embed_dim must be divisible by num_heads"

	def forward(self, x, casual_mask=False):
	batch_size, seq_len, embd_dim = x.shape
	interim_shape = (batch_size, seq_len, self.n_heads, self.d_heads)
	q, k, v = self.in_proj(x).chunk(3, dim=-1)
	q = q.view(interim_shape)
	k = k.view(interim_shape)
	v = v.view(interim_shape)

	q = q.transpose(1, 2)
	k = k.transpose(1, 2)
	v = v.transpose(1, 2)

	weight = q @ k.transpose(-1, -2)
	if casual_mask:
	mask = torch.ones_like(weight, dtype=torch.bool).triu(1)
	weight.masked_fill_(mask, -torch.inf)
	weight /= math.sqrt(self.d_heads)
	weight = F.softmax(weight, dim=-1)
	output = weight @ v
	output = output.transpose(1, 2)
	output = output.reshape((batch_size, seq_len, embd_dim))
	output = self.out_proj(output)
	return output

	class AttentionBlock(nn.Module):
	def __init__(self, channels):
	super().__init__()
	self.groupnorm = nn.GroupNorm(num_groups=32, num_channels=channels)
	self.attention = SelfAttention(n_heads=1, embd_dim=channels)

	def forward(self, x):
	residual = x
	x = self.groupnorm(x)
	n, c, h, w = x.shape
	x = x.view((n, c, h * w)).transpose(-1, -2)
	x = self.attention(x)
	x = x.transpose(-1, -2).view((n, c, h, w))
	x = x + residual
	return x

	class Residual(nn.Module):
	def __init__(self, in_channels, out_channels):
	super().__init__()
	self.conv1 = nn.Conv2d(in_channels, out_channels, 3, 1, 1)
	self.gn1 = nn.GroupNorm(32, out_channels)
	self.conv2 = nn.Conv2d(out_channels, out_channels, 3, 1, 1)
	self.gn2 = nn.GroupNorm(32, out_channels)
	self.silu = nn.SiLU()
	if in_channels != out_channels:
	self.residual_layer = nn.Conv2d(in_channels, out_channels, 1, 1, 0)
	else:
	self.residual_layer = nn.Identity()

	def forward(self, x):
	x_residual = x.clone()
	x = self.gn1(x)
	x = self.silu(x)
	x = self.conv1(x)
	x = self.gn2(x)
	x = self.conv2(x)
	x += self.residual_layer(x_residual)
	return x

	class Encoder(nn.Module):
	def __init__(self, latent_channels):
	super().__init__()
	self.net = nn.Sequential(
	nn.Conv2d(3, 64, 3, padding=1),
	nn.SiLU(),

	Residual(64, 64),
	Residual(64, 64),

	nn.Conv2d(64, 128, 3, 2, 1),
	Residual(128, 128),
	Residual(128, 128),

	nn.Conv2d(128, 256, 3, 2, 1),
	Residual(256, 256),
	Residual(256, 256),

	nn.Conv2d(256, 256, 3, 2, 1),
	Residual(256, 256),
	AttentionBlock(channels=256),
	Residual(256, 256),

	nn.GroupNorm(32, 256),
	nn.SiLU(),
	)
	self.mu = nn.Conv2d(256, latent_channels, 3, padding=1)
	self.logvar = nn.Conv2d(256, latent_channels, 3, padding=1)

	def forward(self, x):
	x = self.net(x)
	mu = self.mu(x)
	logvar = self.logvar(x)
	return mu, logvar

	class Decoder(nn.Module):
	def __init__(self, latent_channels):
	super().__init__()
	self.net = nn.Sequential(
	nn.Conv2d(latent_channels, 256, 3, padding=1),
	Residual(256, 256),
	AttentionBlock(channels=256),
	Residual(256, 256),

	nn.Upsample(scale_factor=2, mode='nearest'),
	nn.Conv2d(256, 256, 3, padding=1),
	Residual(256, 256),
	Residual(256, 256),

	nn.Upsample(scale_factor=2, mode='nearest'),
	nn.Conv2d(256, 128, 3, padding=1),
	Residual(128, 128),
	Residual(128, 128),

	nn.Upsample(scale_factor=2, mode='nearest'),
	nn.Conv2d(128, 64, 3, padding=1),
	Residual(64, 64),
	Residual(64, 64),

	nn.GroupNorm(32, 64),
	nn.SiLU(),
	nn.Conv2d(64, 3, 3, padding=1),
	nn.Tanh(),
	)

	def forward(self, x):
	return self.net(x)

	class Vae(nn.Module):
	def __init__(self, latent_channels):
	super().__init__()
	self.encoder = Encoder(latent_channels)
	self.decoder = Decoder(latent_channels)

	def reparametrize(self, mu, logvar):
	logvar = torch.clamp(logvar, -30, 20)
	std = torch.exp(0.5 * logvar)
	eps = torch.randn_like(std)
	return mu + eps * std

	def forward(self, x):
	mu, logvar = self.encoder(x)
	z = self.reparametrize(mu, logvar)
	return self.decoder(z), mu, logvar