Stage 2b: structural head removal (83.68M backbone, F1 0.9159 preserved)

Browse files

Files changed (5) hide show

stage_2b/README.md +53 -0
stage_2b/eval.json +10 -0
stage_2b/head_config.json +209 -0
stage_2b/load_pruned_backbone.py +75 -0
stage_2b/pruned_state_dict.safetensors +3 -0

stage_2b/README.md ADDED Viewed

	@@ -0,0 +1,53 @@

+# Stage 2b: Structural Head Removal
+Unlike Stage 2a, which masks the 10 most prunable attention heads by zeroing their output-projection columns, Stage 2b physically shrinks the attention tensors. The `qkv.weight` rows corresponding to pruned heads are deleted, the matching `proj.weight` columns are deleted, and the `num_heads` of each pruned block is reduced. MLPs, LayerNorms, and LayerScales are unchanged.
+## Per-block pruning plan
+```
+Block    Heads removed      Heads kept
+ 3       [5]                 11
+ 4       [8]                 11
+ 6       [9]                 11
+ 7       [11]                11
+ 9       [11, 10, 9]         9
+ 10      [4]                 11
+ 11      [1, 9]              10
+```
+Other blocks (0, 1, 2, 5, 8) retain all 12 heads.
+## Result
+```
+backbone params before:  85,641,984  = 85.64 M
+backbone params after:   83,675,904  = 83.68 M
+saved:                    1,966,080  =  1.97 M (2.30 %)
+F1 at K=10 structural:    0.9159
+F1 at K=10 Stage 2a mask: 0.9159    (byte-identical forward)
+```
+## Loading
+The pruned backbone is *not* a drop-in replacement for the stock Argus backbone because the attention module shapes differ per-block. Use `load_pruned_backbone.py`:
+```python
+from load_pruned_backbone import load_stage2b_backbone
+backbone = load_stage2b_backbone('pruned_state_dict.safetensors', 'head_config.json')
+```
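+The loader constructs an Argus ViT-B, walks `head_config.json`, and, for every block whose head count is below 12, replaces the attention with a `PrunedSelfAttention` sized for the kept heads before copying weights. Blocks that keep all 12 heads retain the stock attention module.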
+## Files
+- `stage_2b_structural.py` — the conversion script
+- `pruned_state_dict.safetensors` — shrunk backbone weights
+- `head_config.json` — per-block `num_heads`, kept-head indices, removed-head indices
+- `load_pruned_backbone.py` — loader
+- `eval.json` — F1 parity + param delta
+## What this buys
+- 2.3 % backbone param reduction for free (no F1 cost; +0.022 F1 gain over Stage 0 baseline).
+- Smaller forward pass: pruned blocks do less attention compute.
+- Sets up Stage 3 (depth reduction) and Stage 4 (specialist backbone) on a smaller starting model.

stage_2b/eval.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "baseline_F1_stage2a_mask_K10": 0.9159,
+  "stage2b_structural_F1": 0.9158878326416016,
+  "precision": 0.9351145029067993,
+  "recall": 0.8974359035491943,
+  "backbone_params_before": 85641984,
+  "backbone_params_after": 83675904,
+  "backbone_params_saved": 1966080,
+  "n_calibration_images": 1000
+}

stage_2b/head_config.json ADDED Viewed

	@@ -0,0 +1,209 @@

+{
+  "per_block_num_heads": [
+    12,
+    12,
+    12,
+    11,
+    11,
+    12,
+    11,
+    11,
+    12,
+    9,
+    11,
+    10
+  ],
+  "per_block_kept_heads": {
+    "0": [
+      0,
+      1,
+      2,
+      3,
+      4,
+      5,
+      6,
+      7,
+      8,
+      9,
+      10,
+      11
+    ],
+    "1": [
+      0,
+      1,
+      2,
+      3,
+      4,
+      5,
+      6,
+      7,
+      8,
+      9,
+      10,
+      11
+    ],
+    "2": [
+      0,
+      1,
+      2,
+      3,
+      4,
+      5,
+      6,
+      7,
+      8,
+      9,
+      10,
+      11
+    ],
+    "3": [
+      0,
+      1,
+      2,
+      3,
+      4,
+      6,
+      7,
+      8,
+      9,
+      10,
+      11
+    ],
+    "4": [
+      0,
+      1,
+      2,
+      3,
+      4,
+      5,
+      6,
+      7,
+      9,
+      10,
+      11
+    ],
+    "5": [
+      0,
+      1,
+      2,
+      3,
+      4,
+      5,
+      6,
+      7,
+      8,
+      9,
+      10,
+      11
+    ],
+    "6": [
+      0,
+      1,
+      2,
+      3,
+      4,
+      5,
+      6,
+      7,
+      8,
+      10,
+      11
+    ],
+    "7": [
+      0,
+      1,
+      2,
+      3,
+      4,
+      5,
+      6,
+      7,
+      8,
+      9,
+      10
+    ],
+    "8": [
+      0,
+      1,
+      2,
+      3,
+      4,
+      5,
+      6,
+      7,
+      8,
+      9,
+      10,
+      11
+    ],
+    "9": [
+      0,
+      1,
+      2,
+      3,
+      4,
+      5,
+      6,
+      7,
+      8
+    ],
+    "10": [
+      0,
+      1,
+      2,
+      3,
+      5,
+      6,
+      7,
+      8,
+      9,
+      10,
+      11
+    ],
+    "11": [
+      0,
+      2,
+      3,
+      4,
+      5,
+      6,
+      7,
+      8,
+      10,
+      11
+    ]
+  },
+  "per_block_removed_heads": {
+    "0": [],
+    "1": [],
+    "2": [],
+    "3": [
+      5
+    ],
+    "4": [
+      8
+    ],
+    "5": [],
+    "6": [
+      9
+    ],
+    "7": [
+      11
+    ],
+    "8": [],
+    "9": [
+      11,
+      10,
+      9
+    ],
+    "10": [
+      4
+    ],
+    "11": [
+      1,
+      9
+    ]
+  },
+  "head_dim": 64,
+  "dim": 768
+}

stage_2b/load_pruned_backbone.py ADDED Viewed

	@@ -0,0 +1,75 @@

+"""Load the Stage 2b pruned backbone.
+Reconstructs an argus.DinoVisionTransformer, replaces the attention of every
+pruned block (head count below 12) with a PrunedSelfAttention sized per
+head_config.json, and loads weights from pruned_state_dict.safetensors.
+"""
+import json, sys, os
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+sys.path.insert(0, '/mnt/d/Argus')
+import argus
class PrunedSelfAttention(nn.Module):
    """Multi-head self-attention with an explicit head count and head width.

    The attention width is ``num_heads * head_dim`` and is independent of the
    model width ``dim``: ``qkv`` maps ``dim -> 3 * inner_dim`` and ``proj``
    maps ``inner_dim -> dim``, so a block can carry fewer heads than
    ``dim // head_dim``.
    """

    def __init__(self, dim=768, num_heads=12, head_dim=64,
                 qkv_bias=False, proj_bias=True, mask_k_bias=False):
        """Create the fused qkv projection and the output projection.

        Args:
            dim: Width of the input and output tokens.
            num_heads: Number of attention heads in this module.
            head_dim: Width of each head.
            qkv_bias: Whether the qkv projection has a bias.
            proj_bias: Whether the output projection has a bias.
            mask_k_bias: If true, the qkv projection is built with
                ``argus.LinearKMaskedBias`` instead of ``nn.Linear``.
        """
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.inner_dim = num_heads * head_dim
        # Stored for reference; forward() relies on the default scaling of
        # scaled_dot_product_attention and does not read this attribute.
        self.scale = head_dim ** -0.5

        if mask_k_bias:
            qkv_layer = argus.LinearKMaskedBias
        else:
            qkv_layer = nn.Linear
        self.qkv = qkv_layer(dim, 3 * self.inner_dim, bias=qkv_bias)
        self.proj = nn.Linear(self.inner_dim, dim, bias=proj_bias)

    @staticmethod
    def _rotate_suffix(t, n_plain, sin, cos):
        """Apply ``argus.rope_apply`` to all tokens after the first ``n_plain``."""
        untouched = t[:, :, :n_plain, :]
        rotated = argus.rope_apply(t[:, :, n_plain:, :], sin, cos)
        return torch.cat([untouched, rotated], dim=-2)

    def forward(self, x, attn_bias=None, rope=None):
        """Run self-attention over ``x`` of shape ``(B, N, dim)``.

        Args:
            x: Input tokens; unpacked as ``(B, N, _)``.
            attn_bias: Accepted for signature compatibility; this
                implementation never reads it.
            rope: Optional ``(sin, cos)`` pair. When given, the rotary
                embedding is applied to the last ``sin.shape[-2]`` tokens of
                q and k; the leading tokens are passed through unchanged
                (presumably the non-patch tokens; confirm against argus).

        Returns:
            The output of ``proj`` applied to the attended values.
        """
        batch, tokens, _ = x.shape
        packed = self.qkv(x).reshape(
            batch, tokens, 3, self.num_heads, self.head_dim
        )
        # Split the fused projection, then move heads ahead of tokens:
        # (B, N, heads, head_dim) -> (B, heads, N, head_dim).
        q, k, v = (part.transpose(1, 2) for part in packed.unbind(2))

        if rope is not None:
            sin, cos = rope
            n_plain = tokens - sin.shape[-2]
            q = self._rotate_suffix(q, n_plain, sin, cos)
            k = self._rotate_suffix(k, n_plain, sin, cos)

        attended = F.scaled_dot_product_attention(q, k, v)
        merged = attended.transpose(1, 2).reshape(batch, tokens, self.inner_dim)
        return self.proj(merged)
def load_stage2b_backbone(state_dict_path, head_config_path):
    """Build the Argus ViT-B backbone and load the Stage 2b pruned weights.

    The backbone comes from ``argus.build_eupe_vitb16()``. For every block
    whose configured head count differs from 12, the attention module is
    swapped for a ``PrunedSelfAttention`` of matching size; the checkpoint is
    then loaded into the modified backbone.

    Args:
        state_dict_path: Path to the pruned safetensors checkpoint.
        head_config_path: Path to the JSON head configuration. The keys read
            here are ``per_block_num_heads``, ``dim`` and ``head_dim``.

    Returns:
        The backbone with pruned attention modules and loaded weights.

    Raises:
        RuntimeError: If the checkpoint has no tensor for a parameter of one
            of the replaced attention modules. ``load_state_dict`` itself
            also raises ``RuntimeError`` on shape mismatches.
    """
    import warnings
    from safetensors.torch import load_file

    with open(head_config_path, encoding='utf-8') as f:
        cfg = json.load(f)

    backbone = argus.build_eupe_vitb16()

    # Head count of an unpruned block; such blocks keep their stock attention.
    full_heads = 12
    for b, new_heads in enumerate(cfg['per_block_num_heads']):
        if new_heads == full_heads:
            continue
        backbone.blocks[b].attn = PrunedSelfAttention(
            dim=cfg['dim'], num_heads=new_heads, head_dim=cfg['head_dim'],
            qkv_bias=False, proj_bias=True, mask_k_bias=False,
        )

    state = load_file(state_dict_path)
    # strict=False tolerates key mismatches, so inspect the result instead of
    # discarding it: a silently skipped key is otherwise invisible.
    result = backbone.load_state_dict(state, strict=False)
    missing = list(result.missing_keys)
    unexpected = list(result.unexpected_keys)

    # The replaced attention modules were created just above with random
    # initial weights; if the checkpoint does not fill them the model is
    # unusable, so treat that as an error rather than a warning.
    pruned_prefixes = tuple(
        f'{name}.'
        for name, module in backbone.named_modules()
        if isinstance(module, PrunedSelfAttention)
    )
    uninitialised = [key for key in missing if key.startswith(pruned_prefixes)]
    if uninitialised:
        raise RuntimeError(
            'checkpoint is missing weights for pruned attention modules: '
            + ', '.join(uninitialised)
        )

    if missing or unexpected:
        warnings.warn(
            f'state dict mismatch while loading {state_dict_path}: '
            f'missing keys {missing}, unexpected keys {unexpected}',
            stacklevel=2,
        )
    return backbone
if __name__ == '__main__':
    # Smoke test: load the pruned backbone from the files next to this
    # script, report its parameter count, and run one forward pass.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    weights_path = os.path.join(script_dir, 'pruned_state_dict.safetensors')
    config_path = os.path.join(script_dir, 'head_config.json')
    backbone = load_stage2b_backbone(weights_path, config_path)

    total = 0
    for param in backbone.parameters():
        total += param.numel()
    print(f'Stage 2b backbone loaded: {total:,} params = {total/1e6:.2f}M')

    # Random 768x768 RGB input; only the output shape is checked.
    x = torch.randn(1, 3, 768, 768)
    backbone.eval()
    with torch.inference_mode():
        out = backbone.forward_features(x)
    print(f'forward OK  patch tokens: {tuple(out["x_norm_patchtokens"].shape)}')

stage_2b/pruned_state_dict.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:133aae4f1e7b7e232b517c71aec50628d6d4475e41d19c2023a04e5b260962d6
+size 334718768