import torch
import torchvision
from torch import nn


def create_vit_model(num_classes: int = 121, seed: int = 42):
    """Creates a ViT-B/16 feature extractor model and transforms.

    The pretrained backbone is frozen (``requires_grad = False`` on every
    parameter) and the classifier head is replaced with a freshly
    initialised, trainable head producing ``num_classes`` outputs.

    Args:
        num_classes (int, optional): number of target classes, i.e. the
            output size of the new classifier head. Defaults to 121.
        seed (int, optional): random seed value for output layer.
            Defaults to 42. Note that this is applied with
            ``torch.manual_seed`` and therefore reseeds torch's global RNG.

    Returns:
        model (torch.nn.Module): ViT-B/16 feature extractor model.
        transforms (torchvision.transforms): ViT-B/16 image transforms.
    """
    # Create ViT_B_16 pretrained weights, transforms and model
    weights = torchvision.models.ViT_B_16_Weights.DEFAULT
    transforms = weights.transforms()
    model = torchvision.models.vit_b_16(weights=weights)

    # Freeze all layers in model
    for param in model.parameters():
        param.requires_grad = False

    # Input width of the replacement head (768 is what the original head used).
    embedding_dim = 768

    # Change classifier head to suit our needs (this will be trainable).
    # Seed first so the new head's initial weights are reproducible.
    torch.manual_seed(seed)
    model.heads = nn.Sequential(
        nn.LayerNorm(embedding_dim),
        nn.Dropout(0.2),  # Try 0.1 or 0.2
        # Honour the caller's class count instead of a hard-coded 121.
        nn.Linear(embedding_dim, num_classes),
    )

    return model, transforms