Stage 3: add Stage2+Stage3 compound grid + README update

Browse files

Files changed (3) hide show

stage_3/README.md +16 -0
stage_3/compound_results.json +125 -0
stage_3/compound_stage2_stage3.py +129 -0

stage_3/README.md CHANGED Viewed

@@ -48,6 +48,22 @@ The takeaway for backbone compression: **naive block skipping on a frozen pretra
 - `block_importance.json` — per-block F1 + L2 deviation
 - `block_pruning_curve.json` — cumulative F1 at K=1, 2, 3, …, 12
 ## Parameter accounting
 Each block is ~7.08M params (1.77M qkv + 589K proj + 4.72M MLP + LN + LayerScale). At K=1, ~7.1M params are effectively zeroed (8.3% of the 85.6M backbone). At K=2 with a small F1 cost, ~14.2M (16.6%) — the 0.13 F1 drop makes this generally not worth it for a person detector where 0.87 is the current baseline. Further compression should come from Stages 2 + 4 + 5 combined, not depth alone.

 - `block_importance.json` — per-block F1 + L2 deviation
 - `block_pruning_curve.json` — cumulative F1 at K=1, 2, 3, …, 12
+## Compound with Stage 2
+`compound_stage2_stage3.py` sweeps the Stage 2 head-pruning × Stage 3 block-pruning grid (K_heads ∈ {0, 5, 10, 15, 20} × K_blocks ∈ {0, 1, 2}; full results in `compound_results.json`). Selected points:
+```
+K_heads  K_blocks   F1       params saved
+  0        0        0.894    0        (baseline)
+ 10        0        0.916    1.97M    (Stage 2 peak, +0.022 F1)
+ 10        1        0.882    9.05M    (stack block 11, -0.012 F1 from baseline)
+  5        1        0.880    8.06M    (same tier, fewer heads pruned)
+  0        1        0.876    7.08M    (Stage 3 alone)
+ 15        2        0.243   17.11M    (collapses — block 6 too important)
+```
+Heads and blocks do compose, but with a penalty. Pruning the 10 most prunable heads while also dropping block 11 gives F1 ≈ 0.88 with ~9M params saved, which is the best combined head + depth operating point available without training anything new. Beyond that, Stage 4 (specialist backbone) is needed for further compression.
 ## Parameter accounting
 Each block is ~7.08M params (1.77M qkv + 589K proj + 4.72M MLP + LN + LayerScale). At K=1, ~7.1M params are effectively zeroed (8.3% of the 85.6M backbone). At K=2 with a small F1 cost, ~14.2M (16.6%) — the 0.13 F1 drop makes this generally not worth it for a person detector where 0.87 is the current baseline. Further compression should come from Stages 2 + 4 + 5 combined, not depth alone.

stage_3/compound_results.json ADDED Viewed

	@@ -0,0 +1,125 @@

+{
+  "baseline_F1": 0.8939393758773804,
+  "grid": [
+    {
+      "K_heads": 0,
+      "K_blocks": 0,
+      "F1": 0.8939393758773804,
+      "precision": 0.9254902005195618,
+      "recall": 0.8644688725471497,
+      "approx_params_saved": 0
+    },
+    {
+      "K_heads": 0,
+      "K_blocks": 1,
+      "F1": 0.8757709264755249,
+      "precision": 0.8438030481338501,
+      "recall": 0.9102563858032227,
+      "approx_params_saved": 7079424
+    },
+    {
+      "K_heads": 0,
+      "K_blocks": 2,
+      "F1": 0.7702127695083618,
+      "precision": 0.9187816977500916,
+      "recall": 0.66300368309021,
+      "approx_params_saved": 14158848
+    },
+    {
+      "K_heads": 5,
+      "K_blocks": 0,
+      "F1": 0.9085714221000671,
+      "precision": 0.9464285969734192,
+      "recall": 0.8736263513565063,
+      "approx_params_saved": 983040
+    },
+    {
+      "K_heads": 5,
+      "K_blocks": 1,
+      "F1": 0.8795811533927917,
+      "precision": 0.8399999737739563,
+      "recall": 0.9230769276618958,
+      "approx_params_saved": 8062464
+    },
+    {
+      "K_heads": 5,
+      "K_blocks": 2,
+      "F1": 0.8004115223884583,
+      "precision": 0.9131455421447754,
+      "recall": 0.7124541997909546,
+      "approx_params_saved": 15141888
+    },
+    {
+      "K_heads": 10,
+      "K_blocks": 0,
+      "F1": 0.9158878326416016,
+      "precision": 0.9351145029067993,
+      "recall": 0.8974359035491943,
+      "approx_params_saved": 1966080
+    },
+    {
+      "K_heads": 10,
+      "K_blocks": 1,
+      "F1": 0.8819875717163086,
+      "precision": 0.8554216623306274,
+      "recall": 0.9102563858032227,
+      "approx_params_saved": 9045504
+    },
+    {
+      "K_heads": 10,
+      "K_blocks": 2,
+      "F1": 0.7060185074806213,
+      "precision": 0.9591194987297058,
+      "recall": 0.5586080551147461,
+      "approx_params_saved": 16124928
+    },
+    {
+      "K_heads": 15,
+      "K_blocks": 0,
+      "F1": 0.8949342966079712,
+      "precision": 0.9173076748847961,
+      "recall": 0.8736263513565063,
+      "approx_params_saved": 2949120
+    },
+    {
+      "K_heads": 15,
+      "K_blocks": 1,
+      "F1": 0.8675373196601868,
+      "precision": 0.8840304017066956,
+      "recall": 0.8516483306884766,
+      "approx_params_saved": 10028544
+    },
+    {
+      "K_heads": 15,
+      "K_blocks": 2,
+      "F1": 0.24320000410079956,
+      "precision": 0.9620253443717957,
+      "recall": 0.13919414579868317,
+      "approx_params_saved": 17107968
+    },
+    {
+      "K_heads": 20,
+      "K_blocks": 0,
+      "F1": 0.8971269726753235,
+      "precision": 0.908067524433136,
+      "recall": 0.8864468932151794,
+      "approx_params_saved": 3932160
+    },
+    {
+      "K_heads": 20,
+      "K_blocks": 1,
+      "F1": 0.8467432856559753,
+      "precision": 0.8875501751899719,
+      "recall": 0.8095238208770752,
+      "approx_params_saved": 11011584
+    },
+    {
+      "K_heads": 20,
+      "K_blocks": 2,
+      "F1": 0.16415409743785858,
+      "precision": 0.9607843160629272,
+      "recall": 0.08974359184503555,
+      "approx_params_saved": 18091008
+    }
+  ]
+}

stage_3/compound_stage2_stage3.py ADDED Viewed

	@@ -0,0 +1,129 @@

+"""Compound ablation: Stage 2 head mask + Stage 3 block ablation.
+Verifies that the two stages compose. Measures F1 at combinations of
+K_heads ∈ {0, 5, 10, 15, 20} and K_blocks ∈ {0, 1, 2} using the
+already-computed importance rankings.
+Output: compound_results.json
+"""
+import json, os
+import torch
+import torch.nn.functional as F
+import numpy as np
+from PIL import Image
+from pycocotools.coco import COCO
+from transformers import AutoModel
# --- Inputs (absolute paths on the machine the sweep was run on) ------------
# COCO dataset root; the script reads annotations/instances_val2017.json and
# the val2017/ image folder underneath it.
COCO_ROOT = '/home/zootest/datasets/coco'
# Linear probe definition: provides 'pos_dims', 'neg_dims' and 'threshold'.
CLASSIFIER = '/mnt/d/_tmp/1pc_repo/stage_0/classifier.json'
# Importance rankings produced earlier; both expose 'ranked_most_prunable_first'.
STAGE2_IMPORT = '/mnt/d/_tmp/1pc_repo/stage_2/head_importance.json'
STAGE3_IMPORT = '/mnt/d/_tmp/1pc_repo/stage_3/block_importance.json'

# --- Output -----------------------------------------------------------------
# JSON file receiving the baseline F1 and the full (K_heads, K_blocks) grid.
OUT = '/mnt/d/_tmp/1pc_repo/stage_3/compound_results.json'

# --- Evaluation setup -------------------------------------------------------
DEVICE = 'cuda'   # device for the model, the images and the score tensors
N = 1000          # number of images evaluated (first N sorted image ids)
RES = 768         # images are resized to RES x RES before normalisation

# --- Backbone geometry ------------------------------------------------------
D = 768           # feature width; the patch tokens are layer-normed over it
HEAD_DIM = 64     # width of one attention head's column slice in attn.proj
def f1_of(scores, labels, thr):
    """Return ``(f1, precision, recall)`` for scores thresholded at ``thr``.

    A sample counts as a positive prediction when its score is strictly
    greater than ``thr``. ``labels`` is combined with the predictions through
    ``&`` and ``~``, so it is expected to be a boolean tensor aligned with
    ``scores``.

    Denominators are clamped, so a run with no positive predictions (or no
    positive labels) yields 0.0 for the affected metric instead of NaN.
    All three values are returned as plain Python floats.
    """
    predicted = scores > thr
    rejected = ~predicted
    negatives = ~labels

    true_pos = (predicted & labels).sum().float()
    false_pos = (predicted & negatives).sum().float()
    false_neg = (rejected & labels).sum().float()

    # clamp(min=1): counts are integers, so this only kicks in for a zero
    # denominator, where the numerator is zero as well.
    precision = true_pos / (true_pos + false_pos).clamp(min=1)
    recall = true_pos / (true_pos + false_neg).clamp(min=1)

    harmonic_denom = (precision + recall).clamp(min=1e-9)
    f1 = 2 * precision * recall / harmonic_denom
    return float(f1), float(precision), float(recall)
@torch.inference_mode()
def score_all(model, imgs, pos, neg):
    """Score every image with the max-pooled linear probe.

    For each input tensor ``x`` in ``imgs`` the backbone's
    ``forward_features`` is run under bfloat16 autocast, the
    ``'x_norm_patchtokens'`` output is cast to float32, layer-normalised over
    the last ``D`` channels and max-pooled over dim 0 after the leading
    dimension is squeezed (presumably the patch axis of a
    ``(1, num_patches, D)`` output; confirm against the backbone).

    The score is ``sum(pooled[pos]) - sum(pooled[neg])``.

    Args:
        model: object exposing ``backbone.forward_features``.
        imgs: iterable of input tensors, one per image.
        pos: index tensor of channels that add to the score.
        neg: index tensor of channels that subtract from the score.

    Returns:
        1-D tensor on ``DEVICE`` with one score per image, in input order.
    """
    scores = []
    for x in imgs:
        # Autocast must target the device the forward pass runs on; take it
        # from the input instead of hard-coding 'cuda'.
        with torch.autocast(x.device.type, dtype=torch.bfloat16):
            out = model.backbone.forward_features(x)
        # Leave bf16 before normalising so the probe runs in full precision.
        patches = out['x_norm_patchtokens'].float().squeeze(0)
        ln = F.layer_norm(patches, [D])
        pooled = ln.max(dim=0).values
        scores.append((pooled[pos].sum() - pooled[neg].sum()).item())
    return torch.tensor(scores, device=DEVICE)
def _load_eval_set():
    """Load the first ``N`` val2017 images and their person labels.

    Images are resized to ``RES`` x ``RES``, scaled to [0, 1], normalised with
    the mean/std below and kept on ``DEVICE`` as ``(1, 3, RES, RES)`` tensors.
    An image is labelled positive when it has at least one non-crowd
    annotation with ``category_id == 1`` (the person class in COCO).

    Returns:
        ``(imgs, labels)``: a list of image tensors and a bool tensor on
        ``DEVICE`` with one entry per image.
    """
    mean = torch.tensor([0.485, 0.456, 0.406], device=DEVICE).view(1, 3, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225], device=DEVICE).view(1, 3, 1, 1)
    coco = COCO(f'{COCO_ROOT}/annotations/instances_val2017.json')

    imgs, labels = [], []
    for img_id in sorted(coco.getImgIds())[:N]:
        info = coco.loadImgs(img_id)[0]
        p = f"{COCO_ROOT}/val2017/{info['file_name']}"
        # Context manager closes the file handle as soon as pixels are decoded.
        with Image.open(p) as raw:
            img = raw.convert('RGB').resize((RES, RES), Image.BILINEAR)
        arr = np.asarray(img, dtype=np.uint8).copy()
        x = torch.from_numpy(arr).permute(2, 0, 1).unsqueeze(0).to(DEVICE).float() / 255.0
        imgs.append((x - mean) / std)
        ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False)
        labels.append(any(a['category_id'] == 1 for a in coco.loadAnns(ann_ids)))
    return imgs, torch.tensor(labels, dtype=torch.bool, device=DEVICE)


def _approx_params_saved(head_rank, block_rank, k_heads, k_blocks):
    """Approximate parameter count removed by a (k_heads, k_blocks) setting.

    Per head: its qkv slice (3 * D * HEAD_DIM) plus its proj slice
    (D * HEAD_DIM). Per block: all of its heads, the two MLP matrices
    (D x 4D each) and a rough 2 * D for the remaining small tensors.

    Heads that live in a dropped block are not counted again: the block total
    already includes them, so adding them separately would overstate savings.
    """
    per_head = 3 * D * HEAD_DIM + D * HEAD_DIM
    per_block = (D // HEAD_DIM) * per_head + 2 * D * (4 * D) + 2 * D
    dropped = {entry[0] for entry in block_rank[:k_blocks]}
    surviving_block_heads = sum(
        1 for entry in head_rank[:k_heads] if entry[0] not in dropped
    )
    return surviving_block_heads * per_head + len(dropped) * per_block


def main():
    """Sweep the Stage 2 head mask x Stage 3 block ablation grid.

    Loads the probe definition and both importance rankings, evaluates F1 /
    precision / recall for every ``K_heads`` in {0, 5, 10, 15, 20} and
    ``K_blocks`` in {0, 1, 2}, prints one line per grid point and writes the
    results to ``OUT`` as JSON.
    """
    with open(CLASSIFIER, encoding='utf-8') as f:
        c = json.load(f)
    pos = torch.tensor(c['pos_dims'], dtype=torch.long, device=DEVICE)
    neg = torch.tensor(c['neg_dims'], dtype=torch.long, device=DEVICE)
    thr = float(c['threshold'])

    with open(STAGE2_IMPORT, encoding='utf-8') as f:
        s2 = json.load(f)
    head_rank = s2['ranked_most_prunable_first']   # list of (block, head, drop)
    with open(STAGE3_IMPORT, encoding='utf-8') as f:
        s3 = json.load(f)
    block_rank = s3['ranked_most_prunable_first']  # list of (block, drop)
    # Read up front so a missing key fails before the sweep, not after it.
    baseline_f1 = s3['baseline_F1']

    print('[load] Argus + COCO', flush=True)
    # NOTE: trust_remote_code executes Python shipped with the checkpoint;
    # only point this at a model directory you trust.
    model = AutoModel.from_pretrained('/mnt/d/Argus', trust_remote_code=True).to(DEVICE).eval()
    imgs, labels = _load_eval_set()

    # Back up the weights that the ablations overwrite, for every block the
    # backbone actually has (not a hard-coded count).
    blocks = model.backbone.blocks
    orig_proj = [blk.attn.proj.weight.detach().clone() for blk in blocks]
    orig_fc2 = [blk.mlp.fc2.weight.detach().clone() for blk in blocks]

    def restore():
        """Copy the backed-up proj / fc2 weights back into the model."""
        for blk, proj_w, fc2_w in zip(blocks, orig_proj, orig_fc2):
            blk.attn.proj.weight.data.copy_(proj_w)
            blk.mlp.fc2.weight.data.copy_(fc2_w)

    def apply_head_mask(k_heads):
        """Zero the proj input columns of the k_heads most prunable heads."""
        for (bl, hd, _) in head_rank[:k_heads]:
            blocks[bl].attn.proj.weight.data[:, hd * HEAD_DIM:(hd + 1) * HEAD_DIM] = 0.0

    def apply_block_drop(k_blocks):
        """Zero the output projections of the k_blocks most prunable blocks.

        NOTE(review): only the weights are zeroed. If proj / fc2 carry bias
        terms, those still reach the residual stream, so this is not an exact
        block skip. Confirm it matches how the Stage 3 ranking was measured.
        """
        for (bl, _) in block_rank[:k_blocks]:
            blocks[bl].attn.proj.weight.data.zero_()
            blocks[bl].mlp.fc2.weight.data.zero_()

    results = []
    for kh in [0, 5, 10, 15, 20]:
        for kb in [0, 1, 2]:
            # Start every grid point from pristine weights.
            restore()
            apply_head_mask(kh)
            apply_block_drop(kb)
            s = score_all(model, imgs, pos, neg)
            f1, p, r = f1_of(s, labels, thr)
            saved = _approx_params_saved(head_rank, block_rank, kh, kb)
            results.append({'K_heads': kh, 'K_blocks': kb,
                            'F1': f1, 'precision': p, 'recall': r,
                            'approx_params_saved': saved})
            print(f'  K_heads={kh:>2} K_blocks={kb}  F1={f1:.4f}  P={p:.4f}  R={r:.4f}  '
                  f'saved={saved/1e6:.2f}M', flush=True)
    restore()

    with open(OUT, 'w', encoding='utf-8') as f:
        json.dump({'baseline_F1': baseline_f1, 'grid': results}, f, indent=2)
    print(f'[done] -> {OUT}', flush=True)


if __name__ == '__main__':
    main()