{
  "model_architecture": {
    "backbone": "facebook/dinov2-large",
    "backbone_details": {
      "model_type": "Vision Transformer (ViT)",
      "variant": "Large",
      "patch_size": 14,
      "num_hidden_layers": 24,
      "num_attention_heads": 16,
      "hidden_size": 1024,
      "intermediate_size": 4096,
      "pretrained_image_size": 518,
      "finetuned_image_size": 224
    },
    "feature_dim": 1024,
    "encoder_parameters": 304367634,
    "head_trainable_parameters": 24598,
    "freeze_backbone": true,
    "encoder_output_shape": {
      "description": "Encoder outputs full sequence of tokens including CLS token",
      "raw_shape": ["batch_size", 257, 1024],
      "tokens_breakdown": {
        "cls_token": 1,
        "patch_tokens": 256,
        "total": 257
      },
      "usage": "CLS token (index 0) is extracted for feature representation"
    }
  },
  "input_specification": {
    "image_size": [224, 224],
    "channels": 3,
    "pixel_range": [0.0, 1.0],
    "normalization": {
      "mean": [0.485, 0.456, 0.406],
      "std": [0.229, 0.224, 0.225],
      "description": "ImageNet normalization for DINOv2"
    },
    "input_format": "RGB",
    "tensor_layout": "NCHW"
  },
  "output_specification": {
    "heads": {
      "scene": {
        "num_classes": 6,
        "output_type": "logits",
        "activation": "softmax",
        "classes": [
          16000001,
          16000002,
          16000006,
          16000008,
          16000009,
          16000011
        ]
      },
      "concept": {
        "num_classes": 3,
        "output_type": "logits",
        "activation": "softmax",
        "classes": [
          17000001,
          17000002,
          17000003
        ]
      },
      "object": {
        "num_classes": 13,
        "output_type": "logits",
        "activation": "softmax",
        "classes": [
          18000001,
          18000002,
          18000004,
          18000005,
          18000006,
          18000007,
          18000008,
          18000009,
          18000010,
          18000012,
          18000014,
          18000016,
          "unclassified"
        ]
      }
    }
  },
  "class_mappings": {
    "scene": {
      "0": 16000001,
      "1": 16000002,
      "2": 16000006,
      "3": 16000008,
      "4": 16000009,
      "5": 16000011
    },
    "concept": {
      "0": 17000001,
      "1": 17000002,
      "2": 17000003
    },
    "object": {
      "0": 18000001,
      "1": 18000002,
      "2": 18000004,
      "3": 18000005,
      "4": 18000006,
      "5": 18000007,
      "6": 18000008,
      "7": 18000009,
      "8": 18000010,
      "9": 18000012,
      "10": 18000014,
      "11": 18000016,
      "12": "unclassified"
    }
  }
}