from torch import nn
import torch
from torch.nn import functional as F


class PatchEmbedding(nn.Module):
    """Split an image into non-overlapping patches and linearly project each one.

    Implemented as a single Conv2d whose kernel size and stride both equal the
    patch size, so each output "pixel" is the embedding of one patch.
    """

    def __init__(self, image_size, patch_size, embedding_size):
        super().__init__()
        self.projection_layers = nn.Conv2d(
            in_channels=3,
            out_channels=embedding_size,
            kernel_size=patch_size,
            stride=patch_size,
        )
        # Patches per image; assumes a square image whose side is divisible
        # by patch_size (integer division silently floors otherwise).
        self.n_patch = (image_size // patch_size) ** 2

    def forward(self, x):
        # (batch, 3, H, W) -> (batch, embedding_size, H/p, W/p)
        x = self.projection_layers(x)
        # Flatten the spatial grid, then put the patch axis before the
        # embedding axis: (batch, n_patch, embedding_size).
        x = x.flatten(2)
        x = x.transpose(1, 2)
        return x


class PositionalEmbedding(nn.Module):
    """Prepend a learnable [CLS] token and add learnable positional embeddings."""

    def __init__(self, n_patch, embedding_size):
        super().__init__()
        self.n_patch = n_patch
        # n_patch + 1 positions: one per patch plus one for the [CLS] token.
        self.position = nn.Parameter(
            torch.normal(0.0, 0.02, size=(1, self.n_patch + 1, embedding_size))
        )
        self.cls_token = nn.Parameter(
            torch.normal(0.0, 0.02, size=(1, 1, embedding_size))
        )
        self.embedding_size = embedding_size

    def forward(self, x):
        batch = x.shape[0]
        # Expand the single learnable [CLS] token across the batch and place
        # it at sequence index 0.
        cls_token = torch.broadcast_to(self.cls_token, (batch, 1, self.embedding_size))
        x = torch.cat((cls_token, x), dim=1)
        x = x + self.position
        return x


class BlockTransformers(nn.Module):
    """Pre-norm transformer encoder block.

    Sublayer 1: LayerNorm -> multi-head self-attention -> residual add.
    Sublayer 2: LayerNorm -> feed-forward (Linear-GELU-Linear) -> dropout -> residual add.
    """

    def __init__(self, d_model, num_head, ffn_dim, droprate=0.1):
        super().__init__()
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        # Bug fix: batch_first=True so the (batch, seq, dim) inputs produced by
        # PatchEmbedding are interpreted correctly; the default (False) treats
        # dim 0 as the sequence axis and would attend across the batch.
        # state_dict keys are unaffected, so existing checkpoints still load.
        self.MHA = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=num_head,
            dropout=droprate,
            batch_first=True,
        )
        # Attribute name (including the "FeedFordward" spelling) kept as-is for
        # checkpoint/state_dict compatibility.
        self.FeedFordward = nn.Sequential(
            nn.Linear(d_model, ffn_dim),
            nn.GELU(),
            nn.Linear(ffn_dim, d_model),
        )
        self.drop_out = nn.Dropout(droprate)

    def forward(self, x):
        # Attention sublayer with pre-norm and residual connection.
        attn = self.norm1(x)
        attn, _ = self.MHA(attn, attn, attn, need_weights=False)
        x = x + attn
        # Bug fix: the original fed the raw `x` to the FFN and then applied
        # dropout to the raw `x` again, discarding both the norm2 output and
        # the entire feed-forward result (net effect was x + dropout(x)).
        ffn = self.norm2(x)
        ffn = self.FeedFordward(ffn)
        ffn = self.drop_out(ffn)
        x = x + ffn
        return x


class NoiceDetectorModel(nn.Module):
    """ViT-style 3-class classifier.

    Pipeline: 16x16 patch embedding -> [CLS] + positional embedding ->
    two transformer blocks -> MLP head on the [CLS] token -> 3 logits.
    """

    def __init__(self, image_size, d_model, num_head, ffn_dim, droprate=0.1):
        super().__init__()
        self.patch_embedding = PatchEmbedding(
            image_size=image_size, patch_size=16, embedding_size=d_model
        )
        self.positional_embedding = PositionalEmbedding(
            self.patch_embedding.n_patch, d_model
        )
        self.blocklayers = nn.Sequential(
            BlockTransformers(d_model, num_head, ffn_dim, droprate),
            BlockTransformers(d_model, num_head, ffn_dim, droprate),
        )
        self.linear1 = nn.Linear(d_model, 128)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(128, 3)

    def forward(self, x):
        x = self.patch_embedding(x)
        x = self.positional_embedding(x)
        x = self.blocklayers(x)
        # Bug fix: classify from the [CLS] token, which PositionalEmbedding
        # prepends at index 0; the original read x[:, -1, :] (the last patch
        # token), leaving the learnable [CLS] token unused.
        x = x[:, 0, :]
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        return x


class ModelRunners:
    """Load a trained NoiceDetectorModel checkpoint and run single-image inference."""

    def __init__(self, path):
        # Resolve the device before loading so the checkpoint can be mapped
        # onto it; the original torch.load(path) without map_location crashes
        # on CPU-only hosts when the checkpoint was saved from a GPU.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.Model = NoiceDetectorModel(
            image_size=384, d_model=256, num_head=4, ffn_dim=784
        )
        self.__checkpoint = torch.load(path, map_location=self.device)
        self.Model.load_state_dict(self.__checkpoint)
        self.Model.to(self.device)
        self.Model.eval()
        # torchvision is imported lazily here so the model classes above stay
        # importable on systems without torchvision installed.
        from torchvision import transforms as T
        self.transform = T.Compose([
            T.ToTensor(),
            # Standard ImageNet normalization constants.
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def modelrun(self, x_target):
        """Return softmax class probabilities for one image as a numpy array.

        `x_target` is either a PIL image (preprocessed and batched here) or an
        already-preprocessed tensor of shape (1, 3, H, W).
        """
        if not isinstance(x_target, torch.Tensor):
            x_target = self.transform(x_target)
            x_target = torch.unsqueeze(x_target, dim=0)
        # Bug fix: move the input onto the model's device; the original passed
        # a CPU tensor to a CUDA model whenever a GPU was available.
        x_target = x_target.to(self.device)
        with torch.no_grad():
            pred = self.Model(x_target)
            pred = F.softmax(pred, dim=-1)
        # Bug fix: .cpu() before .numpy() — numpy conversion fails on CUDA
        # tensors. `pred` is always a Tensor here, so the original
        # isinstance branch was dead code.
        return pred.detach().cpu().numpy()