AbstractPhil committed on
Commit
8dcee19
·
verified ·
1 Parent(s): 743758c

Create advanced_geometric_analysis.py

Browse files
Files changed (1) hide show
  1. advanced_geometric_analysis.py +971 -0
advanced_geometric_analysis.py ADDED
@@ -0,0 +1,971 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ BASE TIER DEEP MODEL ANALYSIS
4
+ ===============================
5
+ Three models, all 768-d output, all patch-based ViTs:
6
+ 1. clip_l14_openai — CLIP ViT-L/14 (text-supervised, semantic)
7
+ 2. dinov2_b14 — DINOv2 ViT-B/14 (self-supervised, structural)
8
+ 3. siglip_b16_384 — SigLIP ViT-B/16 (sigmoid contrastive, semantic)
9
+
10
+ Analyze:
11
+ - Full architecture comparison (layers, heads, dims, patch size)
12
+ - Weight statistics per layer (norms, spectral radius, sparsity)
13
+ - Attention head geometry (Q/K/V weight structure)
14
+ - Layer-by-layer representation similarity (CKA, Procrustes)
15
+ - Patch embedding weight comparison (the actual patchwork)
16
+ - MLP weight spectrum analysis
17
+ - Where do they converge internally vs diverge?
18
+ """
19
+
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+ import numpy as np
24
+ import json
25
+ import gc
26
+
27
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
28
+
29
+ print("=" * 65)
30
+ print("BASE TIER DEEP MODEL ANALYSIS")
31
+ print("=" * 65)
32
+ print(f" Device: {DEVICE}")
33
+
34
+
35
+ # ══════════════════════════════════════════════════════════════════
36
+ # LOAD MODELS
37
+ # ══════════════════════════════════════════════════════════════════
38
+
39
+ print(f"\n{'='*65}")
40
+ print("LOADING MODELS")
41
+ print(f"{'='*65}")
42
+
43
+ from transformers import (
44
+ CLIPVisionModel, CLIPVisionConfig,
45
+ Dinov2Model, Dinov2Config,
46
+ SiglipVisionModel, SiglipVisionConfig,
47
+ )
48
+
49
+ models = {}
50
+ configs = {}
51
+
52
+ # CLIP ViT-L/14
53
+ print(f"\n Loading CLIP ViT-L/14...")
54
+ clip = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14").eval()
55
+ models["clip_l14"] = clip
56
+ configs["clip_l14"] = clip.config
57
+ print(f" Loaded: {sum(p.numel() for p in clip.parameters()):,} params")
58
+
59
+ # DINOv2 ViT-B/14
60
+ print(f" Loading DINOv2 ViT-B/14...")
61
+ dino = Dinov2Model.from_pretrained("facebook/dinov2-base").eval()
62
+ models["dinov2_b14"] = dino
63
+ configs["dinov2_b14"] = dino.config
64
+ print(f" Loaded: {sum(p.numel() for p in dino.parameters()):,} params")
65
+
66
+ # SigLIP ViT-B/16
67
+ print(f" Loading SigLIP ViT-B/16-384...")
68
+ siglip = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-384").eval()
69
+ models["siglip_b16"] = siglip
70
+ configs["siglip_b16"] = siglip.config
71
+ print(f" Loaded: {sum(p.numel() for p in siglip.parameters()):,} params")
72
+
73
+
74
+ # ══════════════════════════════════════════════════════════════════
75
+ # SCAN 1: ARCHITECTURE COMPARISON
76
+ # ══════════════════════════════════════════════════════════════════
77
+
78
+ print(f"\n{'='*65}")
79
+ print("SCAN 1: ARCHITECTURE COMPARISON")
80
+ print(f"{'='*65}")
81
+
82
def get_arch_info(name, model, config):
    """Summarise the architecture of one model as a flat dict.

    Args:
        name: Label stored under the ``"name"`` key.
        model: Object exposing ``parameters()``; used only to count parameters.
        config: Config object; each attribute listed below is copied only if
            it is present.

    Returns:
        Dict with ``"name"``, whichever config fields exist (renamed as in
        the table below), ``"total_params"`` and ``"head_dim"``.
    """
    # (config attribute, key used in the summary), in output order.
    config_fields = (
        ("hidden_size", "hidden_size"),
        ("intermediate_size", "intermediate_size"),
        ("num_hidden_layers", "num_layers"),
        ("num_attention_heads", "num_heads"),
        ("patch_size", "patch_size"),
        ("image_size", "image_size"),
    )

    info = {"name": name}
    for attr, key in config_fields:
        if hasattr(config, attr):
            info[key] = getattr(config, attr)

    info["total_params"] = sum(p.numel() for p in model.parameters())

    # Missing hidden_size yields head_dim 0; the divisor is floored at 1.
    hidden = info.get("hidden_size", 0)
    heads = max(info.get("num_heads", 1), 1)
    info["head_dim"] = hidden // heads

    return info
103
+
104
+ for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
105
+ info = get_arch_info(name, models[name], configs[name])
106
+ print(f"\n {name}:")
107
+ for k, v in info.items():
108
+ if k != "name":
109
+ print(f" {k:<20}: {v:>12,}" if isinstance(v, int) else f" {k:<20}: {v}")
110
+
111
+
112
+ # ══════════════════════════════════════════════════════════════════
113
+ # SCAN 2: NAMED PARAMETER INVENTORY
114
+ # ══════════════════════════════════════════════════════════════════
115
+
116
+ print(f"\n{'='*65}")
117
+ print("SCAN 2: PARAMETER INVENTORY")
118
+ print(f"{'='*65}")
119
+
120
+ for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
121
+ model = models[name]
122
+ print(f"\n {name}:")
123
+
124
+ # Group by layer type
125
+ groups = {}
126
+ for pname, p in model.named_parameters():
127
+ # Extract layer category
128
+ parts = pname.split(".")
129
+ if "embeddings" in pname:
130
+ cat = "embeddings"
131
+ elif "encoder" in pname and "layer" in pname:
132
+ # Find layer number
133
+ for part in parts:
134
+ if part.startswith("layer"):
135
+ break
136
+ # Categorize within layer
137
+ if "attention" in pname:
138
+ if "query" in pname or "q_proj" in pname or "k_proj" in pname or "v_proj" in pname:
139
+ cat = "attn_qkv"
140
+ elif "out" in pname or "o_proj" in pname:
141
+ cat = "attn_out"
142
+ else:
143
+ cat = "attn_other"
144
+ elif "mlp" in pname or "intermediate" in pname or "output" in pname:
145
+ cat = "mlp"
146
+ elif "norm" in pname or "layer_norm" in pname:
147
+ cat = "layernorm"
148
+ else:
149
+ cat = "encoder_other"
150
+ elif "layernorm" in pname.lower() or "layer_norm" in pname.lower():
151
+ cat = "final_norm"
152
+ elif "head" in pname or "pooler" in pname:
153
+ cat = "head"
154
+ else:
155
+ cat = "other"
156
+
157
+ groups.setdefault(cat, {"count": 0, "params": 0, "shapes": []})
158
+ groups[cat]["count"] += 1
159
+ groups[cat]["params"] += p.numel()
160
+ if len(groups[cat]["shapes"]) < 3:
161
+ groups[cat]["shapes"].append(f"{pname.split('.')[-2]}.{pname.split('.')[-1]}: {list(p.shape)}")
162
+
163
+ for cat in sorted(groups.keys()):
164
+ g = groups[cat]
165
+ print(f" {cat:<15}: {g['params']:>12,} ({g['count']:2d} tensors)")
166
+ for s in g["shapes"]:
167
+ print(f" {s}")
168
+
169
+
170
+ # ══════════════════════════════════════════════════════════════════
171
+ # SCAN 3: WEIGHT STATISTICS PER LAYER
172
+ # ══════════════════════════════════════════════════════════════════
173
+
174
+ print(f"\n{'='*65}")
175
+ print("SCAN 3: WEIGHT STATISTICS")
176
+ print(f"{'='*65}")
177
+
178
def weight_stats(param):
    """Return summary statistics for one parameter tensor.

    The tensor is detached and cast to float first. The result always has
    ``shape``, ``norm``, ``mean``, ``std``, ``abs_max`` and ``sparsity``
    (fraction of entries with magnitude below 1e-6). For 2-D tensors whose
    smaller side exceeds 1 it also has singular-value statistics:
    ``sv_max``, ``sv_min``, ``sv_ratio`` and ``eff_rank``, where
    ``eff_rank = (sum sv)^2 / sum(sv^2)``.
    """
    tensor = param.float().detach()
    magnitude = tensor.abs()

    stats = {}
    stats["shape"] = list(tensor.shape)
    stats["norm"] = tensor.norm().item()
    stats["mean"] = tensor.mean().item()
    stats["std"] = tensor.std().item()
    stats["abs_max"] = magnitude.max().item()
    stats["sparsity"] = (magnitude < 1e-6).float().mean().item()

    is_matrix = tensor.dim() == 2 and min(tensor.shape) > 1
    if not is_matrix:
        return stats

    # Singular values are indexed as largest-first (sv[0] = max, sv[-1] = min).
    singular = torch.linalg.svdvals(tensor)
    largest, smallest = singular[0], singular[-1]
    stats["sv_max"] = largest.item()
    stats["sv_min"] = smallest.item()
    # Small epsilons guard against division by zero.
    stats["sv_ratio"] = (largest / (smallest + 1e-10)).item()
    stats["eff_rank"] = ((singular.sum() ** 2) / (singular.pow(2).sum() + 1e-12)).item()
    return stats
196
+
197
+ for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
198
+ model = models[name]
199
+ print(f"\n {name} β€” key weight matrices:")
200
+ print(f" {'param':<50} {'shape':<20} {'norm':>8} {'std':>8} {'sv_max':>8} {'eff_rank':>9}")
201
+ print(f" {'-'*105}")
202
+
203
+ for pname, p in model.named_parameters():
204
+ if p.dim() < 2: continue
205
+ if p.numel() < 1000: continue
206
+
207
+ # Only show interesting layers
208
+ show = False
209
+ for keyword in ["patch", "embed", "position", "cls",
210
+ "layer.0.", "layer.5.", "layer.11.",
211
+ "layer.23.", "q_proj", "k_proj", "v_proj",
212
+ "query", "key", "value",
213
+ "fc1", "fc2", "dense", "out_proj",
214
+ "layernorm", "head"]:
215
+ if keyword in pname.lower():
216
+ show = True; break
217
+
218
+ if not show: continue
219
+
220
+ s = weight_stats(p)
221
+ sv_max = f"{s.get('sv_max', 0):.4f}" if 'sv_max' in s else " N/A"
222
+ eff_rank = f"{s.get('eff_rank', 0):.1f}" if 'eff_rank' in s else " N/A"
223
+ short_name = pname[-50:] if len(pname) > 50 else pname
224
+ shape_str = str(s["shape"])
225
+ print(f" {short_name:<50} {shape_str:<20} {s['norm']:>8.4f} "
226
+ f"{s['std']:>8.5f} {sv_max:>8} {eff_rank:>9}")
227
+
228
+
229
+ # ══════════════════════════════════════════════════════════════════
230
+ # SCAN 4: PATCH EMBEDDING ANALYSIS (the actual patchwork)
231
+ # ══════════════════════════════════════════════════════════════════
232
+
233
+ print(f"\n{'='*65}")
234
+ print("SCAN 4: PATCH EMBEDDING WEIGHTS")
235
+ print(f"{'='*65}")
236
+
237
+ patch_embeddings = {}
238
+ for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
239
+ model = models[name]
240
+ for pname, p in model.named_parameters():
241
+ if "patch" in pname.lower() and "embed" in pname.lower() and p.dim() == 4:
242
+ patch_embeddings[name] = p.detach().float()
243
+ print(f"\n {name}: {pname}")
244
+ print(f" Shape: {list(p.shape)}")
245
+ # (out_channels, in_channels, kernel_h, kernel_w)
246
+ print(f" = {p.shape[0]} filters Γ— {p.shape[1]} channels Γ— {p.shape[2]}Γ—{p.shape[3]} kernel")
247
+ # Reshape to 2D for spectral analysis
248
+ w2d = p.detach().float().reshape(p.shape[0], -1) # (out, in*h*w)
249
+ sv = torch.linalg.svdvals(w2d)
250
+ eff_rank = ((sv.sum()**2) / (sv.pow(2).sum() + 1e-12)).item()
251
+ print(f" Spectral: sv_max={sv[0]:.4f} sv_min={sv[-1]:.6f} "
252
+ f"eff_rank={eff_rank:.1f}/{min(w2d.shape)}")
253
+ print(f" Norm: {p.norm():.4f} Mean: {p.mean():.6f} Std: {p.std():.6f}")
254
+
255
+ # Per-filter analysis
256
+ filter_norms = p.detach().float().reshape(p.shape[0], -1).norm(dim=1)
257
+ print(f" Filter norms: mean={filter_norms.mean():.4f} "
258
+ f"std={filter_norms.std():.4f} "
259
+ f"min={filter_norms.min():.4f} max={filter_norms.max():.4f}")
260
+ break
261
+
262
+ # Compare patch embeddings pairwise (Procrustes on flattened filters)
263
+ if len(patch_embeddings) >= 2:
264
+ print(f"\n Patch embedding Procrustes alignment:")
265
+ names_list = list(patch_embeddings.keys())
266
+ for i in range(len(names_list)):
267
+ for j in range(i+1, len(names_list)):
268
+ n1, n2 = names_list[i], names_list[j]
269
+ p1 = patch_embeddings[n1].reshape(patch_embeddings[n1].shape[0], -1)
270
+ p2 = patch_embeddings[n2].reshape(patch_embeddings[n2].shape[0], -1)
271
+ # Truncate to common dim
272
+ d_min = min(p1.shape[0], p2.shape[0])
273
+ d_feat = min(p1.shape[1], p2.shape[1])
274
+ a = p1[:d_min, :d_feat]; b = p2[:d_min, :d_feat]
275
+ # Raw cosine (mean over filters)
276
+ cos = F.cosine_similarity(
277
+ F.normalize(a, dim=1), F.normalize(b, dim=1), dim=1).mean().item()
278
+ print(f" {n1} Γ— {n2}: raw_cos={cos:.4f} (d_min={d_min}, d_feat={d_feat})")
279
+
280
+
281
+ # ══════════════════════════════════════════════════════════════════
282
+ # SCAN 5: ATTENTION HEAD GEOMETRY
283
+ # ══════════════════════════════════════════════════════════════════
284
+
285
+ print(f"\n{'='*65}")
286
+ print("SCAN 5: ATTENTION HEAD GEOMETRY")
287
+ print(f"{'='*65}")
288
+
289
def extract_qkv_weights(model, name):
    """Extract Q, K, V weight matrices, ordered Q, K, V within each module.

    Scans ``model.named_parameters()`` for 2-D parameters whose lowercase
    name contains ``"weight"`` and one of the query/key/value markers
    (``query``/``q_proj``, ``key``/``k_proj``, ``value``/``v_proj``).

    Callers index the result as ``[i*3]``, ``[i*3+1]``, ``[i*3+2]`` and treat
    those as Q, K, V. Parameter registration order is not guaranteed to be
    Q, K, V (an attention module may register ``k_proj``, ``v_proj``,
    ``q_proj``), so entries are grouped by parent module and emitted in
    Q, K, V order per group. Groups keep their first-seen order.

    Args:
        model: Module exposing ``named_parameters()``.
        name: Model label; unused, kept for call-site compatibility.

    Returns:
        List of dicts with keys ``"layer"`` (full parameter name), ``"type"``
        (``"Q"``, ``"K"`` or ``"V"``) and ``"weight"`` (detached float tensor).
    """
    type_rank = {"Q": 0, "K": 1, "V": 2}
    by_module = {}  # parent module path -> entries; dict keeps insertion order

    for pname, p in model.named_parameters():
        if p.dim() != 2:
            continue
        plow = pname.lower()
        if "weight" not in plow:
            continue
        if "query" in plow or "q_proj" in plow:
            kind = "Q"
        elif "key" in plow or "k_proj" in plow:
            kind = "K"
        elif "value" in plow or "v_proj" in plow:
            kind = "V"
        else:
            continue

        # Drop "<projection>.<weight>" to get the owning attention module.
        parent = ".".join(pname.split(".")[:-2])
        by_module.setdefault(parent, []).append(
            {"layer": pname, "type": kind, "weight": p.detach().float()}
        )

    layers_qkv = []
    for entries in by_module.values():
        # sorted() is stable, so duplicates of one type keep their order.
        layers_qkv.extend(sorted(entries, key=lambda e: type_rank[e["type"]]))
    return layers_qkv
302
+
303
+ for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
304
+ qkv = extract_qkv_weights(models[name], name)
305
+ n_layers = len(qkv) // 3
306
+
307
+ print(f"\n {name} ({n_layers} layers):")
308
+ print(f" {'layer':>6} {'Q_norm':>8} {'K_norm':>8} {'V_norm':>8} "
309
+ f"{'QK_cos':>8} {'QV_cos':>8} {'KV_cos':>8}")
310
+
311
+ for layer_idx in range(n_layers):
312
+ q = qkv[layer_idx * 3]["weight"]
313
+ k = qkv[layer_idx * 3 + 1]["weight"]
314
+ v = qkv[layer_idx * 3 + 2]["weight"]
315
+
316
+ q_norm = q.norm().item()
317
+ k_norm = k.norm().item()
318
+ v_norm = v.norm().item()
319
+
320
+ # Flatten and compute cosine between Q/K, Q/V, K/V
321
+ qf = q.reshape(-1); kf = k.reshape(-1); vf = v.reshape(-1)
322
+ d = min(qf.shape[0], kf.shape[0], vf.shape[0])
323
+ qk_cos = F.cosine_similarity(qf[:d].unsqueeze(0), kf[:d].unsqueeze(0)).item()
324
+ qv_cos = F.cosine_similarity(qf[:d].unsqueeze(0), vf[:d].unsqueeze(0)).item()
325
+ kv_cos = F.cosine_similarity(kf[:d].unsqueeze(0), vf[:d].unsqueeze(0)).item()
326
+
327
+ if layer_idx < 3 or layer_idx >= n_layers - 2 or layer_idx == n_layers // 2:
328
+ print(f" {layer_idx:>6} {q_norm:>8.3f} {k_norm:>8.3f} {v_norm:>8.3f} "
329
+ f"{qk_cos:>8.4f} {qv_cos:>8.4f} {kv_cos:>8.4f}")
330
+ elif layer_idx == 3:
331
+ print(f" {'...':>6}")
332
+
333
+
334
+ # ══════════════════════════════════════════════════════════════════
335
+ # SCAN 6: CROSS-MODEL QK ALIGNMENT
336
+ # ══════════════════════════════════════════════════════════════════
337
+
338
+ print(f"\n{'='*65}")
339
+ print("SCAN 6: CROSS-MODEL WEIGHT ALIGNMENT")
340
+ print(f"{'='*65}")
341
+
342
+ # Compare equivalent layers across models
343
+ # Use common dimension (768) — all three output 768-d
344
+ # Compare Q weights, K weights, V weights at equivalent depth fractions
345
+
346
+ model_qkv = {}
347
+ for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
348
+ model_qkv[name] = extract_qkv_weights(models[name], name)
349
+
350
+ print(f"\n Cross-model Q weight cosine at equivalent depth fractions:")
351
+ print(f" {'depth':>6} {'clipΓ—dino':>10} {'clipΓ—siglip':>12} {'dinoΓ—siglip':>12}")
352
+
353
+ for name in model_qkv:
354
+ n = len(model_qkv[name]) // 3
355
+ print(f" {name}: {n} layers")
356
+
357
+ # Compare at 0%, 25%, 50%, 75%, 100% depth
358
+ for frac in [0.0, 0.25, 0.5, 0.75, 1.0]:
359
+ vals = {}
360
+ for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
361
+ qkv = model_qkv[name]
362
+ n = len(qkv) // 3
363
+ idx = min(int(frac * (n - 1)), n - 1)
364
+ q = qkv[idx * 3]["weight"].reshape(-1)
365
+ vals[name] = q
366
+
367
+ # Truncate to common length
368
+ min_len = min(v.shape[0] for v in vals.values())
369
+ cos_cd = F.cosine_similarity(
370
+ vals["clip_l14"][:min_len].unsqueeze(0),
371
+ vals["dinov2_b14"][:min_len].unsqueeze(0)).item()
372
+ cos_cs = F.cosine_similarity(
373
+ vals["clip_l14"][:min_len].unsqueeze(0),
374
+ vals["siglip_b16"][:min_len].unsqueeze(0)).item()
375
+ cos_ds = F.cosine_similarity(
376
+ vals["dinov2_b14"][:min_len].unsqueeze(0),
377
+ vals["siglip_b16"][:min_len].unsqueeze(0)).item()
378
+
379
+ print(f" {frac:>5.0%} {cos_cd:>10.4f} {cos_cs:>12.4f} {cos_ds:>12.4f}")
380
+
381
+
382
+ # ══════════════════════════════════════════════════════════════════
383
+ # SCAN 7: MLP WEIGHT SPECTRUM
384
+ # ══════════════════════════════════════════════════════════════════
385
+
386
+ print(f"\n{'='*65}")
387
+ print("SCAN 7: MLP WEIGHT SPECTRUM")
388
+ print(f"{'='*65}")
389
+
390
+ for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
391
+ model = models[name]
392
+ mlp_weights = []
393
+ for pname, p in model.named_parameters():
394
+ if p.dim() == 2 and ("fc1" in pname or "fc2" in pname or
395
+ ("intermediate" in pname and "dense" in pname and "weight" in pname) or
396
+ ("output" in pname and "dense" in pname and "weight" in pname and "attention" not in pname)):
397
+ mlp_weights.append((pname, p.detach().float()))
398
+
399
+ print(f"\n {name} MLPs ({len(mlp_weights)} weight matrices):")
400
+ for pname, w in mlp_weights[:6]: # first 3 layers
401
+ sv = torch.linalg.svdvals(w)
402
+ eff_rank = ((sv.sum()**2) / (sv.pow(2).sum() + 1e-12)).item()
403
+ short = pname.split(".")[-3] + "." + pname.split(".")[-2] + "." + pname.split(".")[-1]
404
+ print(f" {short:<40} {str(list(w.shape)):<20} "
405
+ f"eff_rank={eff_rank:>6.1f}/{min(w.shape)} "
406
+ f"sv_max={sv[0]:.3f} sv_10={sv[min(9,len(sv)-1)]:.4f}")
407
+
408
+ if len(mlp_weights) > 6:
409
+ print(f" ... ({len(mlp_weights) - 6} more)")
410
+
411
+
412
+ # ══════════════════════════════════════════════════════════════════
413
+ # SCAN 8: POSITION EMBEDDING ANALYSIS
414
+ # ══════════════════════════════════════════════════════════════════
415
+
416
+ print(f"\n{'='*65}")
417
+ print("SCAN 8: POSITION EMBEDDINGS")
418
+ print(f"{'='*65}")
419
+
420
+ for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
421
+ model = models[name]
422
+ for pname, p in model.named_parameters():
423
+ if "position" in pname.lower() and "embed" in pname.lower():
424
+ pe = p.detach().float()
425
+ print(f"\n {name}: {pname}")
426
+ print(f" Shape: {list(pe.shape)}")
427
+ print(f" Norm: {pe.norm():.4f} Mean: {pe.mean():.6f} Std: {pe.std():.6f}")
428
+
429
+ if pe.dim() >= 2:
430
+ # Self-similarity of position embeddings
431
+ if pe.dim() == 3:
432
+ pe2d = pe.squeeze(0)
433
+ else:
434
+ pe2d = pe
435
+ sim = F.cosine_similarity(pe2d.unsqueeze(0), pe2d.unsqueeze(1), dim=-1)
436
+ print(f" Self-sim: diag_mean={sim.diag().mean():.4f} "
437
+ f"off_diag_mean={(sim.sum()-sim.diag().sum()).item()/(sim.numel()-sim.shape[0]):.4f}")
438
+ print(f" Adjacent pos cos: mean={F.cosine_similarity(pe2d[:-1], pe2d[1:], dim=-1).mean():.4f}")
439
+
440
+ # SVD of position embeddings
441
+ sv = torch.linalg.svdvals(pe2d)
442
+ eff_rank = ((sv.sum()**2) / (sv.pow(2).sum() + 1e-12)).item()
443
+ print(f" Spectral: eff_rank={eff_rank:.1f}/{min(pe2d.shape)} "
444
+ f"sv1%={sv[0].pow(2).item()/sv.pow(2).sum().item()*100:.1f}%")
445
+ break
446
+
447
+
448
+ # ══════════════════════════════════════════════════════════════════
449
+ # SCAN 9: LAYERNORM ANALYSIS
450
+ # ══════════════════════════════════════════════════════════════════
451
+
452
+ print(f"\n{'='*65}")
453
+ print("SCAN 9: LAYERNORM WEIGHT/BIAS PATTERNS")
454
+ print(f"{'='*65}")
455
+
456
+ for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
457
+ model = models[name]
458
+ ln_weights = []
459
+ ln_biases = []
460
+ for pname, p in model.named_parameters():
461
+ if ("norm" in pname.lower() or "layer_norm" in pname.lower()):
462
+ if "weight" in pname:
463
+ ln_weights.append((pname, p.detach().float()))
464
+ elif "bias" in pname:
465
+ ln_biases.append((pname, p.detach().float()))
466
+
467
+ print(f"\n {name} ({len(ln_weights)} LayerNorms):")
468
+ for (wn, w), (bn, b) in zip(ln_weights[:4], ln_biases[:4]):
469
+ short = wn.split(".")[-3] + "." + wn.split(".")[-2]
470
+ print(f" {short:<30} w: mean={w.mean():.4f} std={w.std():.4f} "
471
+ f"b: mean={b.mean():.5f} std={b.std():.4f}")
472
+
473
+ # Final LayerNorm
474
+ if ln_weights:
475
+ wn, w = ln_weights[-1]
476
+ bn, b = ln_biases[-1] if ln_biases else ("", torch.zeros_like(w))
477
+ print(f" FINAL: {wn}")
478
+ print(f" weight: mean={w.mean():.4f} std={w.std():.4f} "
479
+ f"min={w.min():.4f} max={w.max():.4f}")
480
+ if ln_biases:
481
+ print(f" bias: mean={b.mean():.5f} std={b.std():.4f}")
482
+
483
+
484
+ # ══════════════════════════════════════════════════════════════════
485
+ # SCAN 10: PENTACHORON CV ON WEIGHT GEOMETRY
486
+ # ══════════════════════════════════════════════════════════════════
487
+
488
+ print(f"\n{'='*65}")
489
+ print("SCAN 10: PENTACHORON CV ON WEIGHT GEOMETRY")
490
+ print(f"{'='*65}")
491
+
492
def cayley_menger_vol2(pts):
    """Squared simplex volume from the Cayley-Menger determinant, batched.

    Args:
        pts: Tensor indexed as (batch, vertices, dim); it is cast to float.

    Returns:
        Tensor of shape (batch,) holding the squared volume of the simplex
        spanned by each batch item's vertices. Values can be slightly
        negative for degenerate inputs because of floating-point error.
    """
    # Local import: math is not among the module-level imports visible in
    # this file, so the bare name would raise NameError at call time.
    import math

    pts = pts.float()
    # Pairwise squared distances, shape (batch, V, V).
    diff = pts.unsqueeze(-2) - pts.unsqueeze(-3)
    d2 = (diff * diff).sum(-1)
    B, V, _ = d2.shape

    # Bordered matrix: first row/column are ones with a zero corner,
    # the rest is the squared-distance matrix.
    cm = torch.zeros(B, V + 1, V + 1, device=d2.device, dtype=torch.float32)
    cm[:, 0, 1:] = 1
    cm[:, 1:, 0] = 1
    cm[:, 1:, 1:] = d2

    # With n = V - 1: vol^2 = (-1)^(n+1) / (2^n * (n!)^2) * det(cm).
    sign = (-1.0) ** V
    fact = math.factorial(V - 1)
    return sign / ((2.0 ** (V - 1)) * fact * fact) * torch.linalg.det(cm)
501
+
502
def cv_metric_on_weights(weight_matrix, n_samples=300):
    """Coefficient of variation of pentachoron volumes over matrix rows.

    Rows are L2-normalised, then ``n_samples`` random sets of 5 rows are
    drawn. Each set's simplex volume comes from ``cayley_menger_vol2``, and
    the function returns std/mean of those volumes.

    Returns 0.0 when the matrix has fewer than 5 rows or fewer than 10
    volumes were collected.
    """
    unit_rows = F.normalize(weight_matrix.float(), dim=-1)
    row_count = unit_rows.shape[0]
    if row_count < 5:
        return 0.0

    volumes = []
    for _ in range(n_samples):
        chosen = torch.randperm(row_count)[:5]
        vol_sq = cayley_menger_vol2(unit_rows[chosen].unsqueeze(0))
        # relu clips negative squared volumes. The 1e-12 inside the sqrt
        # keeps the result >= 1e-6, so the positivity check below never
        # rejects a sample.
        volume = torch.sqrt(F.relu(vol_sq[0]) + 1e-12).item()
        if volume > 0:
            volumes.append(volume)

    if len(volumes) < 10:
        return 0.0

    samples = np.array(volumes)
    return float(samples.std() / (samples.mean() + 1e-8))
516
+
517
+ # CV on patch embedding filters
518
+ print(f"\n Patch embedding filter CV (rows = output filters):")
519
+ for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
520
+ if name in patch_embeddings:
521
+ p = patch_embeddings[name]
522
+ w2d = p.reshape(p.shape[0], -1) # (n_filters, in*h*w)
523
+ cv = cv_metric_on_weights(w2d)
524
+ print(f" {name:<15} filters={w2d.shape[0]} CV={cv:.4f}")
525
+
526
+ # CV on Q, K, V weight rows per layer
527
+ print(f"\n QKV weight row CV per layer:")
528
+ print(f" {'model':<15} {'layer':>6} {'Q_cv':>8} {'K_cv':>8} {'V_cv':>8} {'QK_diff':>9}")
529
+
530
+ for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
531
+ qkv = model_qkv[name]
532
+ n_layers = len(qkv) // 3
533
+
534
+ for layer_idx in range(n_layers):
535
+ q = qkv[layer_idx * 3]["weight"]
536
+ k = qkv[layer_idx * 3 + 1]["weight"]
537
+ v = qkv[layer_idx * 3 + 2]["weight"]
538
+
539
+ q_cv = cv_metric_on_weights(q, n_samples=200)
540
+ k_cv = cv_metric_on_weights(k, n_samples=200)
541
+ v_cv = cv_metric_on_weights(v, n_samples=200)
542
+
543
+ if layer_idx < 2 or layer_idx >= n_layers - 2 or layer_idx == n_layers // 2:
544
+ print(f" {name:<15} {layer_idx:>6} {q_cv:>8.4f} {k_cv:>8.4f} "
545
+ f"{v_cv:>8.4f} {abs(q_cv - k_cv):>9.4f}")
546
+ elif layer_idx == 2:
547
+ print(f" {name:<15} {'...':>6}")
548
+
549
+ # CV on MLP weight rows
550
+ print(f"\n MLP weight row CV (first and last layers):")
551
+ for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
552
+ model = models[name]
553
+ mlp_weights = []
554
+ for pname, p in model.named_parameters():
555
+ if p.dim() == 2 and ("fc1" in pname or "fc2" in pname or
556
+ ("intermediate" in pname and "dense" in pname and "weight" in pname) or
557
+ ("output" in pname and "dense" in pname and "weight" in pname and "attention" not in pname)):
558
+ mlp_weights.append((pname, p.detach().float()))
559
+
560
+ if mlp_weights:
561
+ # First layer MLP
562
+ pname, w = mlp_weights[0]
563
+ cv_first = cv_metric_on_weights(w, n_samples=200)
564
+ # Last layer MLP
565
+ pname2, w2 = mlp_weights[-1]
566
+ cv_last = cv_metric_on_weights(w2, n_samples=200)
567
+ print(f" {name:<15} first_mlp CV={cv_first:.4f} last_mlp CV={cv_last:.4f}")
568
+
569
+ # CV on position embeddings
570
+ print(f"\n Position embedding CV:")
571
+ for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
572
+ model = models[name]
573
+ for pname, p in model.named_parameters():
574
+ if "position" in pname.lower() and "embed" in pname.lower():
575
+ pe = p.detach().float()
576
+ if pe.dim() == 3: pe = pe.squeeze(0)
577
+ if pe.dim() == 2 and pe.shape[0] >= 5:
578
+ cv = cv_metric_on_weights(pe, n_samples=300)
579
+ print(f" {name:<15} positions={pe.shape[0]} CV={cv:.4f}")
580
+ break
581
+
582
+
583
+ # ══════════════════════════════════════════════════════════════════
584
+ # SCAN 11: CROSS-MODEL CV COMPARISON (are they in the same CV band?)
585
+ # ══════════════════════════════════════════════════════════════════
586
+
587
+ print(f"\n{'='*65}")
588
+ print("SCAN 11: CROSS-MODEL CV BAND COMPARISON")
589
+ print(f"{'='*65}")
590
+
591
+ # Collect all Q weight CVs per model
592
+ print(f"\n Q weight CV distribution per model:")
593
+ for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
594
+ qkv = model_qkv[name]
595
+ n_layers = len(qkv) // 3
596
+ q_cvs = []
597
+ k_cvs = []
598
+ v_cvs = []
599
+ for layer_idx in range(n_layers):
600
+ q = qkv[layer_idx * 3]["weight"]
601
+ k = qkv[layer_idx * 3 + 1]["weight"]
602
+ v = qkv[layer_idx * 3 + 2]["weight"]
603
+ q_cvs.append(cv_metric_on_weights(q, n_samples=200))
604
+ k_cvs.append(cv_metric_on_weights(k, n_samples=200))
605
+ v_cvs.append(cv_metric_on_weights(v, n_samples=200))
606
+
607
+ q_arr = np.array(q_cvs)
608
+ k_arr = np.array(k_cvs)
609
+ v_arr = np.array(v_cvs)
610
+ print(f" {name:<15} Q: mean={q_arr.mean():.4f} std={q_arr.std():.4f} "
611
+ f"range=[{q_arr.min():.4f}, {q_arr.max():.4f}]")
612
+ print(f" {'':15} K: mean={k_arr.mean():.4f} std={k_arr.std():.4f} "
613
+ f"range=[{k_arr.min():.4f}, {k_arr.max():.4f}]")
614
+ print(f" {'':15} V: mean={v_arr.mean():.4f} std={v_arr.std():.4f} "
615
+ f"range=[{v_arr.min():.4f}, {v_arr.max():.4f}]")
616
+
617
+ # Count layers whose CV falls in the 0.18-0.25 band (a widened window around 0.20-0.23)
618
+ in_band_q = ((q_arr >= 0.18) & (q_arr <= 0.25)).sum()
619
+ in_band_k = ((k_arr >= 0.18) & (k_arr <= 0.25)).sum()
620
+ in_band_v = ((v_arr >= 0.18) & (v_arr <= 0.25)).sum()
621
+ print(f" {'':15} In CV band [0.18-0.25]: Q={in_band_q}/{n_layers} "
622
+ f"K={in_band_k}/{n_layers} V={in_band_v}/{n_layers}")
623
+
624
+ # Cross-model: concatenate equivalent layer Q weights, measure CV
625
+ print(f"\n Cross-model concatenated Q weight CV (same-depth rows mixed):")
626
+ name_pairs = [("clip_l14", "dinov2_b14"), ("clip_l14", "siglip_b16"),
627
+ ("dinov2_b14", "siglip_b16"), ("clip_l14", "dinov2_b14", "siglip_b16")]
628
+
629
+ for pair in name_pairs:
630
+ # Match by depth fraction
631
+ pair_label = " Γ— ".join(n[:8] for n in pair)
632
+ n_layers_per = [len(model_qkv[n]) // 3 for n in pair]
633
+ min_layers = min(n_layers_per)
634
+
635
+ cvs_at_depth = []
636
+ for frac_idx in range(min_layers):
637
+ rows = []
638
+ for ni, n in enumerate(pair):
639
+ n_total = n_layers_per[ni]
640
+ # Map to equivalent depth
641
+ layer_idx = int(frac_idx / min_layers * n_total)
642
+ layer_idx = min(layer_idx, n_total - 1)
643
+ q = model_qkv[n][layer_idx * 3]["weight"]
644
+ rows.append(F.normalize(q.float(), dim=-1))
645
+
646
+ # Truncate to common dim and concatenate
647
+ d_min = min(r.shape[1] for r in rows)
648
+ combined = torch.cat([r[:, :d_min] for r in rows], dim=0)
649
+ cv = cv_metric_on_weights(combined, n_samples=200)
650
+ cvs_at_depth.append(cv)
651
+
652
+ arr = np.array(cvs_at_depth)
653
+ print(f" {pair_label:<35} mean={arr.mean():.4f} std={arr.std():.4f} "
654
+ f"range=[{arr.min():.4f}, {arr.max():.4f}]")
655
+
656
+
657
+ # ══════════════════════════════════════════════════════════════════
658
+ # SUMMARY
659
+ # ══════════════════════════════════════════════════════════════════
660
+
661
+ print(f"\n{'='*65}")
662
+ print("WEIGHT ANALYSIS COMPLETE β€” STARTING ACTIVATION ANALYSIS")
663
+ print(f"{'='*65}")
664
+
665
+ # Free CPU models before GPU reload
666
+ del models, configs
667
+ gc.collect()
668
+ torch.cuda.empty_cache()
669
+
670
+
671
+ # ══════════════════════════════════════════════════════════════════
672
+ # SCAN 12: RUN IMAGES, EXTRACT PER-LAYER ACTIVATIONS
673
+ # ══════════════════════════════════════════════════════════════════
674
+
675
+ print(f"\n{'='*65}")
676
+ print("SCAN 12: PER-LAYER ACTIVATION EXTRACTION")
677
+ print(f"{'='*65}")
678
+
679
+ from transformers import AutoImageProcessor
680
+ from datasets import load_dataset
681
+ from PIL import Image
682
+
683
+ # Load a small batch of COCO images
684
+ print(f" Loading images from rafaelpadilla/coco2017...")
685
+ coco = load_dataset("rafaelpadilla/coco2017", split="validation",
686
+ revision="refs/convert/parquet")
687
+ N_IMGS = 256 # enough for Procrustes, small enough for speed
688
+
689
+ # Prepare processors
690
+ processors = {
691
+ "clip_l14": AutoImageProcessor.from_pretrained("openai/clip-vit-large-patch14"),
692
+ "dinov2_b14": AutoImageProcessor.from_pretrained("facebook/dinov2-base"),
693
+ "siglip_b16": AutoImageProcessor.from_pretrained("google/siglip-base-patch16-384"),
694
+ }
695
+
696
+ # Reload models (were deleted in cleanup)
697
+ from transformers import CLIPVisionModel, Dinov2Model, SiglipVisionModel
698
+ models = {
699
+ "clip_l14": CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14").eval().to(DEVICE),
700
+ "dinov2_b14": Dinov2Model.from_pretrained("facebook/dinov2-base").eval().to(DEVICE),
701
+ "siglip_b16": SiglipVisionModel.from_pretrained("google/siglip-base-patch16-384").eval().to(DEVICE),
702
+ }
703
+ for m in models.values():
704
+ for p in m.parameters():
705
+ p.requires_grad = False
706
+
707
# Collect images (best effort: a sample that fails to load or convert is
# skipped and reported instead of being silently dropped).
images = []
for i in range(min(N_IMGS, len(coco))):
    try:
        img = coco[i]["image"].convert("RGB")
    except Exception as exc:  # not a bare except: Ctrl-C still interrupts
        print(f" Skipping image {i}: {exc!r}")
        continue
    images.append(img)
print(f" Collected {len(images)} images")
716
+
717
# Run every backbone over the collected images in mini-batches and keep, on
# the CPU, (a) the position-0 token of every hidden state and (b) one pooled
# vector per image.
layer_activations = {}  # {model_name: [one (N, d) tensor per hidden state]}
pooled_outputs = {}     # {model_name: (N, d) L2-normalised pooled vectors}

EXTRACT_BATCH = 32
for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
    model = models[name]
    proc = processors[name]
    all_hidden = None  # created lazily once the number of hidden states is known
    all_pooled = []

    for bi in range(0, len(images), EXTRACT_BATCH):
        batch_imgs = images[bi:bi + EXTRACT_BATCH]
        inputs = proc(images=batch_imgs, return_tensors="pt").to(DEVICE)

        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)

        hs = outputs.hidden_states  # one tensor per layer, indexed [batch, position, feature]

        if all_hidden is None:
            all_hidden = [[] for _ in hs]
        # Position 0 is treated as the CLS token of each layer.
        # NOTE(review): confirm every backbone really places a CLS token at
        # index 0; for a model without one this is simply the first token.
        for bucket, h in zip(all_hidden, hs):
            bucket.append(h[:, 0, :].cpu())

        # Prefer the model's own pooled output; otherwise fall back to the
        # position-0 token of the last hidden state.
        pooled = getattr(outputs, "pooler_output", None)
        if pooled is None:
            pooled = hs[-1][:, 0, :]
        all_pooled.append(pooled.cpu())

    stacked = [torch.cat(chunks, 0).float() for chunks in all_hidden]
    layer_activations[name] = stacked
    pooled_outputs[name] = F.normalize(torch.cat(all_pooled, 0).float(), dim=-1)

    n_layers = len(stacked)
    d = stacked[0].shape[-1]
    print(f" {name}: {n_layers} layers, d={d}, N={stacked[0].shape[0]}")
755
+
756
+
757
+ # ══════════════════════════════════════════════════════════════════
758
+ # SCAN 13: WITHIN-MODEL DEPTH PROGRESSION
759
+ # ══════════════════════════════════════════════════════════════════
760
+
761
# Section banner for scan 13 (blank line, rule, title, rule).
print(f"\n{'=' * 65}", "SCAN 13: WITHIN-MODEL DEPTH PROGRESSION", "=" * 65, sep="\n")
764
+
765
def symmetric_inv_sqrt(cov, eps=1e-6):
    """Return the inverse matrix square root of a symmetric matrix.

    The matrix is eigendecomposed with ``torch.linalg.eigh``; each eigenvalue
    is floored at ``eps`` before taking its reciprocal square root, which
    keeps the result finite for singular or slightly indefinite inputs.

    Args:
        cov: square symmetric 2-D tensor (e.g. a covariance matrix).
        eps: lower bound applied to the eigenvalues.

    Returns:
        Tensor of the same shape as ``cov``.
    """
    eigenvalues, eigenvectors = torch.linalg.eigh(cov)
    inv_sqrt_vals = eigenvalues.clamp(min=eps).rsqrt()
    # Scaling the eigenvector columns is the same as right-multiplying by
    # diag(inv_sqrt_vals), without materialising the diagonal matrix.
    return (eigenvectors * inv_sqrt_vals) @ eigenvectors.T
768
+
769
def procrustes_cos(source, target, n=None):
    """Whitened orthogonal Procrustes alignment of two paired point sets.

    Both inputs are mean-centred, whitened with the inverse square root of
    their own covariance, row-normalised, and then ``source`` is rotated
    onto ``target`` with the orthogonal map obtained from the SVD of their
    cross-correlation.

    Args:
        source: 2-D tensor whose rows are samples.
        target: 2-D tensor with rows paired to ``source`` and the same
            number of feature columns (the pre-alignment cosine is taken
            row against row).
        n: number of leading rows to use. Defaults to, and is capped at, the
            number of rows both inputs share, so the two sets always stay
            paired.

    Returns:
        ``(cos_pre, cos_post, singular_values)``: the mean row-wise cosine
        of the centred data before alignment, the mean row-wise cosine of
        the whitened data after rotation, and the singular values of the
        whitened cross-correlation. If whitening fails numerically the
        result is ``(cos_pre, cos_pre, tensor([0.0]))``.
    """
    n_shared = min(source.shape[0], target.shape[0])
    n = n_shared if n is None else min(n, n_shared)

    S = source[:n].float()
    T = target[:n].float()
    Sc = S - S.mean(0, keepdim=True)
    Tc = T - T.mean(0, keepdim=True)
    Ns = Sc.shape[0]

    # Raw cosine before alignment
    cos_pre = F.cosine_similarity(
        F.normalize(Sc, dim=-1), F.normalize(Tc, dim=-1), dim=-1).mean().item()

    # Whiten
    s_cov = (Sc.T @ Sc) / max(Ns - 1, 1)
    t_cov = (Tc.T @ Tc) / max(Ns - 1, 1)
    try:
        sw = symmetric_inv_sqrt(s_cov)
        tw = symmetric_inv_sqrt(t_cov)
    except RuntimeError:
        # torch.linalg.LinAlgError (e.g. eigh failing to converge) derives
        # from RuntimeError. Catching only that keeps the numerical fallback
        # while letting KeyboardInterrupt and programming errors propagate.
        return cos_pre, cos_pre, torch.tensor([0.0])

    Sc_w = F.normalize(Sc @ sw, dim=-1)
    Tc_w = F.normalize(Tc @ tw, dim=-1)

    # Orthogonal Procrustes: with Tc_w.T @ Sc_w = U diag(S_vals) Vt, the map
    # R = U @ Vt is applied to the source rows as Sc_w @ R.T.
    U, S_vals, Vt = torch.linalg.svd(Tc_w.T @ Sc_w, full_matrices=False)
    R = U @ Vt
    cos_post = F.cosine_similarity(Sc_w @ R.T, Tc_w, dim=-1).mean().item()

    return cos_pre, cos_post, S_vals
798
+
799
# For each model, align consecutive layers (position-0 activations of layer
# L against layer L+1) and tabulate the cosine before/after alignment plus
# the singular-value range. Only the first three transitions, the middle one
# and the last ones are printed; the gap is marked with "...".
print("\n Layer-to-layer Procrustes within each model (layer N vs layer N+1):")
for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
    acts = layer_activations[name]
    n_layers = len(acts)
    print(f"\n {name} ({n_layers} layers):")
    print(f" {'L→L+1':>8} {'pre_cos':>8} {'post_cos':>9} {'sv_min':>8} {'sv_max':>8}")

    for li in range(n_layers - 1):
        if li < 3 or li >= n_layers - 3 or li == n_layers // 2:
            pre, post, svs = procrustes_cos(acts[li], acts[li + 1])
            # Singular-value columns use the same width as their headers so
            # the table stays aligned.
            print(f" {li:>3}→{li+1:<3} {pre:>8.4f} {post:>9.4f} "
                  f"{svs.min().item():>8.4f} {svs.max().item():>8.4f}")
        elif li == 3:
            print(f" {'...':>8}")
813
+
814
+
815
+ # ══════════════════════════════════════════════════════════════════
816
+ # SCAN 14: CROSS-MODEL PROCRUSTES AT EACH DEPTH
817
+ # ══════════════════════════════════════════════════════════════════
818
+
819
print(f"\n{'='*65}")
print("SCAN 14: CROSS-MODEL PROCRUSTES (per depth fraction)")
print(f"{'='*65}")

model_names = ["clip_l14", "dinov2_b14", "siglip_b16"]
n_layers_per = {n: len(layer_activations[n]) for n in model_names}

print(f"\n Layers: clip={n_layers_per['clip_l14']} dino={n_layers_per['dinov2_b14']} "
      f"siglip={n_layers_per['siglip_b16']}")

# Compare at 11 depth fractions (0%, 10%, 20%, ..., 100%)
fracs = [i/10 for i in range(11)]

pairs = [("clip_l14", "dinov2_b14"), ("clip_l14", "siglip_b16"),
         ("dinov2_b14", "siglip_b16")]


def _pca_project(x, k):
    """Project the rows of ``x`` onto their top-``k`` principal directions.

    The basis is fitted on all (mean-centred) rows, so up to
    ``min(rows, features)`` directions are available.
    """
    xc = x - x.mean(0, keepdim=True)
    _, _, vt = torch.linalg.svd(xc, full_matrices=False)
    return x @ vt[:k].T


print(f"\n {'frac':>5} {'clip×dino':>10} {'clip×dino':>10} {'clip×sig':>10} "
      f"{'clip×sig':>10} {'dino×sig':>10} {'dino×sig':>10}")
print(f" {'':>5} {'pre':>10} {'POST':>10} {'pre':>10} {'POST':>10} "
      f"{'pre':>10} {'POST':>10}")
print(f" {'-'*67}")

for frac in fracs:
    # Pick, for every model, the layer sitting at this relative depth.
    results = {}
    for n in model_names:
        nl = n_layers_per[n]
        idx = min(int(frac * (nl - 1)), nl - 1)
        results[n] = layer_activations[n][idx]

    # Common dim for cross-model comparison — PCA to the smallest width.
    # PCA cannot deliver more directions than there are samples, so the
    # common width is additionally capped by the sample count. Fitting the
    # basis on a 200-row subset (as before) capped the wider model at 200
    # dims while the others kept their native width, and the pairwise
    # comparison then failed on mismatched feature sizes.
    dims = {n: results[n].shape[-1] for n in model_names}
    n_rows = min(results[n].shape[0] for n in model_names)
    d_common = min(min(dims.values()), n_rows)

    projected = {
        n: results[n] if dims[n] == d_common else _pca_project(results[n], d_common)
        for n in model_names
    }

    line = f" {frac:>4.0%} "
    for n1, n2 in pairs:
        pre, post, _ = procrustes_cos(projected[n1], projected[n2])
        line += f" {pre:>9.4f} {post:>9.4f}"
    print(line)

# Final output comparison. Here the wider embedding is cut down by keeping
# its first d_min coordinates (plain truncation, not PCA).
print("\n Final output (pooled, L2-normed) Procrustes:")
for n1, n2 in pairs:
    d_min = min(pooled_outputs[n1].shape[1], pooled_outputs[n2].shape[1])
    p1 = pooled_outputs[n1][:, :d_min]
    p2 = pooled_outputs[n2][:, :d_min]
    pre, post, svs = procrustes_cos(p1, p2)
    print(f" {n1} × {n2}: pre={pre:.4f} POST={post:.4f} "
          f"sv_range=[{svs.min():.4f}, {svs.max():.4f}]")
879
+
880
+
881
+ # ══════════════════════════════════════════════════════════════════
882
+ # SCAN 15: CV ON ACTIVATIONS AT EACH DEPTH
883
+ # ══════════════════════════════════════════════════════════════════
884
+
885
# Section banner for scan 15 (blank line, rule, title, rule).
print(f"\n{'=' * 65}", "SCAN 15: ACTIVATION CV PER LAYER", "=" * 65, sep="\n")
888
+
889
def cv_metric_act(emb, n_samples=200):
    """Coefficient of variation of random 5-point simplex volumes.

    Rows of ``emb`` are L2-normalised; ``n_samples`` times, five rows are
    drawn at random (``torch.randperm``) and the volume of the simplex they
    span is computed from the Cayley-Menger determinant of their squared
    pairwise distances. The result is ``std / mean`` of those volumes.

    Args:
        emb: 2-D tensor whose rows are embedding vectors.
        n_samples: number of random simplices to draw.

    Returns:
        The coefficient of variation as a float, or ``0.0`` when ``emb`` has
        fewer than 5 rows or fewer than 10 volumes were collected.
    """
    n_points = emb.shape[0]
    if n_points < 5:
        return 0.0

    unit = F.normalize(emb.float(), dim=-1)
    volumes = []
    for _ in range(n_samples):
        pick = torch.randperm(n_points)[:5]
        simplex = unit[pick].unsqueeze(0)  # leading batch axis of size 1
        delta = simplex.unsqueeze(-2) - simplex.unsqueeze(-3)
        sq_dist = (delta * delta).sum(-1)  # pairwise squared distances

        batch, n_vert, _ = sq_dist.shape
        # Bordered (Cayley-Menger) matrix: zero corner, a border of ones, and
        # the squared distances in the lower-right block.
        cayley = torch.zeros(batch, n_vert + 1, n_vert + 1, dtype=torch.float32)
        cayley[:, 0, 1:] = 1
        cayley[:, 1:, 0] = 1
        cayley[:, 1:, 1:] = sq_dist

        sign = (-1.0) ** n_vert
        fact = math.factorial(n_vert - 1)
        vol_sq = sign / ((2.0 ** (n_vert - 1)) * fact * fact) * torch.linalg.det(cayley)
        vol = torch.sqrt(F.relu(vol_sq[0]) + 1e-12).item()
        # NOTE(review): because of the 1e-12 floor inside the square root,
        # vol is always positive, so this filter never rejects a degenerate
        # simplex.
        if vol > 0:
            volumes.append(vol)

    if len(volumes) < 10:
        return 0.0
    arr = np.array(volumes)
    return float(arr.std() / (arr.mean() + 1e-8))
909
+
910
# Per-layer statistics of the position-0 activations (first 200 images):
# simplex-volume CV, mean/std of the vector norms, and an effective
# dimension computed from the singular values of the centred activations
# as (sum sv)^2 / sum(sv^2). Only the first two layers, the last two, the
# middle one and every fourth layer are listed.
print(f"\n {'model':<15} {'layer':>6} {'CV':>8} {'norm_μ':>8} {'norm_σ':>8} {'eff_dim':>8}")
print(f" {'-'*55}")

for name in model_names:
    acts = layer_activations[name]
    n_layers = len(acts)
    for li, layer_acts in enumerate(acts):
        if li < 2 or li >= n_layers - 2 or li == n_layers // 2 or li % 4 == 0:
            a = layer_acts[:200]
            cv = cv_metric_act(a)
            norms = a.norm(dim=-1)
            centered = a - a.mean(0, keepdim=True)
            sv = torch.linalg.svdvals(centered)
            eff_dim = ((sv.sum()**2) / (sv.pow(2).sum() + 1e-12)).item()
            print(f" {name:<15} {li:>6} {cv:>8.4f} {norms.mean():>8.3f} "
                  f"{norms.std():>8.4f} {eff_dim:>8.1f}")
        elif li == 2 and li < n_layers - 2:
            print(f" {name:<15} {'...':>6}")
    print()
929
+
930
+
931
+ # ══════════════════════════════════════════════════════════════════
932
+ # SCAN 16: CROSS-MODEL ACTIVATION AGREEMENT (which images agree/disagree)
933
+ # ══════════════════════════════════════════════════════════════════
934
+
935
print(f"\n{'='*65}")
print("SCAN 16: PER-IMAGE AGREEMENT ANALYSIS")
print(f"{'='*65}")

# Histogram edges cover the full cosine range [-1, 1] in steps of 0.1.
# With edges starting at 0, every image whose two embeddings have a negative
# cosine fell outside the histogram and vanished from the distribution.
bins = [round(-1.0 + 0.1 * k, 1) for k in range(21)]
bin_edges = torch.tensor(bins)

# Use final pooled outputs; every unordered pair of models is visited once.
for i1, n1 in enumerate(model_names):
    for n2 in model_names[i1 + 1:]:
        # The wider embedding is truncated to the narrower one's width.
        d_min = min(pooled_outputs[n1].shape[1], pooled_outputs[n2].shape[1])
        t1 = pooled_outputs[n1][:, :d_min]
        t2 = pooled_outputs[n2][:, :d_min]
        # cosine_similarity normalises its inputs itself, so no explicit
        # re-normalisation of the truncated vectors is needed.
        per_image_cos = F.cosine_similarity(t1, t2, dim=-1)
        print(f"\n {n1} × {n2}:")
        print(f" Raw per-image cos: mean={per_image_cos.mean():.4f} "
              f"std={per_image_cos.std():.4f} "
              f"min={per_image_cos.min():.4f} max={per_image_cos.max():.4f}")

        # After Procrustes: report the alignment result instead of
        # computing it and throwing it away.
        pre, post, svs = procrustes_cos(t1, t2)
        print(f" After Procrustes: pre={pre:.4f} POST={post:.4f} "
              f"sv_range=[{svs.min():.4f}, {svs.max():.4f}]")

        # Distribution of agreement. Values are clamped so that rounding
        # slightly past +/-1 cannot push an image outside the outer edges.
        clipped = per_image_cos.cpu().clamp(-1.0, 1.0)
        hist = torch.histogram(clipped, bins=bin_edges)
        counts = hist.hist.tolist()
        nonzero = [(f"{lo:.1f}-{hi:.1f}", int(c))
                   for lo, hi, c in zip(bins[:-1], bins[1:], counts) if c > 0]
        print(f" Distribution: {nonzero}")
962
+
963
+
964
print(f"\n{'='*65}")
print("FULL ANALYSIS COMPLETE")
print(f"{'='*65}")

# Clean up
# Loop variables left over from the freezing and extraction passes still
# reference the last model, one of its parameters and the last batch's
# inputs/outputs. Rebinding them lets the collector and the CUDA cache
# actually release that memory; deleting the containers alone would not.
m = p = model = inputs = outputs = hs = h = None
del models, layer_activations, pooled_outputs
gc.collect()
torch.cuda.empty_cache()