{
  "model_architecture": {
    "backbone": "facebook/dinov2-large",
    "backbone_details": {
      "model_type": "Vision Transformer (ViT)",
      "variant": "Large",
      "patch_size": 14,
      "num_hidden_layers": 24,
      "num_attention_heads": 16,
      "hidden_size": 1024,
      "intermediate_size": 4096,
      "pretrained_image_size": 518,
      "finetuned_image_size": 224
    },
    "feature_dim": 1024,
    "encoder_parameters": 304367634,
    "head_trainable_parameters": 24598,
    "freeze_backbone": true,
    "encoder_output_shape": {
      "description": "Encoder outputs full sequence of tokens including CLS token",
      "raw_shape": ["batch_size", 257, 1024],
      "tokens_breakdown": {
        "cls_token": 1,
        "patch_tokens": 256,
        "total": 257
      },
      "usage": "CLS token (index 0) is extracted for feature representation"
    }
  },
  "input_specification": {
    "image_size": [
      224,
      224
    ],
    "channels": 3,
    "pixel_range": [
      0.0,
      1.0
    ],
    "normalization": {
      "mean": [
        0.485,
        0.456,
        0.406
      ],
      "std": [
        0.229,
        0.224,
        0.225
      ],
      "description": "ImageNet normalization for DINOv2"
    },
    "input_format": "RGB",
    "tensor_layout": "NCHW"
  },
  "output_specification": {
    "heads": {
      "scene": {
        "num_classes": 6,
        "output_type": "logits",
        "activation": "softmax",
        "classes": [
          16000001,
          16000002,
          16000006,
          16000008,
          16000009,
          16000011
        ]
      },
      "concept": {
        "num_classes": 3,
        "output_type": "logits",
        "activation": "softmax",
        "classes": [
          17000001,
          17000002,
          17000003
        ]
      },
      "object": {
        "num_classes": 13,
        "output_type": "logits",
        "activation": "softmax",
        "classes": [
          18000001,
          18000002,
          18000004,
          18000005,
          18000006,
          18000007,
          18000008,
          18000009,
          18000010,
          18000012,
          18000014,
          18000016,
          "unclassified"
        ]
      }
    }
  },
  "class_mappings": {
    "scene": {
      "0": 16000001,
      "1": 16000002,
      "2": 16000006,
      "3": 16000008,
      "4": 16000009,
      "5": 16000011
    },
    "concept": {
      "0": 17000001,
      "1": 17000002,
      "2": 17000003
    },
    "object": {
      "0": 18000001,
      "1": 18000002,
      "2": 18000004,
      "3": 18000005,
      "4": 18000006,
      "5": 18000007,
      "6": 18000008,
      "7": 18000009,
      "8": 18000010,
      "9": 18000012,
      "10": 18000014,
      "11": 18000016,
      "12": "unclassified"
    }
  }
}
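
A minimal sketch of how a consumer might wire this spec up with PyTorch and Hugging Face transformers. The filename model_spec.json, the class name MultiHeadClassifier, and the one-linear-layer-per-task head layout are assumptions: three plain Linear(1024, n) heads account for 22,550 parameters, so the spec's 24,598 head_trainable_parameters implies some extra component (the 2,048-parameter gap would fit, e.g., a shared LayerNorm over the 1024-dim features, but that is a guess).

# Hypothetical consumer of model_spec.json -- a sketch, not the repo's code.
# Head layout (one nn.Linear per task) is an assumption; see note above.
import json

import torch
import torch.nn as nn
from transformers import AutoModel

with open("model_spec.json") as f:  # hypothetical filename
    SPEC = json.load(f)

class MultiHeadClassifier(nn.Module):
    def __init__(self, spec):
        super().__init__()
        arch = spec["model_architecture"]
        self.backbone = AutoModel.from_pretrained(arch["backbone"])  # facebook/dinov2-large
        if arch["freeze_backbone"]:
            for p in self.backbone.parameters():
                p.requires_grad = False
        dim = arch["feature_dim"]  # 1024
        self.heads = nn.ModuleDict({
            name: nn.Linear(dim, cfg["num_classes"])
            for name, cfg in spec["output_specification"]["heads"].items()
        })

    def forward(self, pixel_values):
        # pixel_values: (N, 3, 224, 224), NCHW, normalized per input_specification.
        tokens = self.backbone(pixel_values=pixel_values).last_hidden_state  # (N, 257, 1024)
        cls = tokens[:, 0]  # CLS token at index 0, per encoder_output_shape.usage
        return {name: head(cls) for name, head in self.heads.items()}  # raw logits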
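
Preprocessing and output decoding follow directly from input_specification and class_mappings; torchvision is an assumption here, and any resize-to-224 plus ImageNet-normalize pipeline matches the spec equally well.

from PIL import Image
from torchvision import transforms

# Values copied from input_specification.normalization.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),  # RGB, pixel_range [0.0, 1.0]
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

def decode(logits, task, spec):
    """Softmax the logits and map the argmax index to the spec's class ID."""
    idx = logits.softmax(dim=-1).argmax(dim=-1).item()
    return spec["class_mappings"][task][str(idx)]  # e.g. 16000001, or "unclassified"

# Usage: one image in, one class ID per head out.
model = MultiHeadClassifier(SPEC).eval()
x = preprocess(Image.open("example.jpg").convert("RGB")).unsqueeze(0)
with torch.no_grad():
    out = model(x)
print({task: decode(logits, task, SPEC) for task, logits in out.items()})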