| |
| """ |
| BASE TIER DEEP MODEL ANALYSIS |
| =============================== |
| Three models, all 768-d output, all patch-based ViTs: |
| 1. clip_l14_openai — CLIP ViT-L/14 (text-supervised, semantic) |
| 2. dinov2_b14 — DINOv2 ViT-B/14 (self-supervised, structural) |
| 3. siglip_b16_384 — SigLIP ViT-B/16 (sigmoid contrastive, semantic) |
| |
| Analyze: |
| - Full architecture comparison (layers, heads, dims, patch size) |
| - Weight statistics per layer (norms, spectral radius, sparsity) |
| - Attention head geometry (Q/K/V weight structure) |
| - Layer-by-layer representation similarity (CKA, Procrustes) |
| - Patch embedding weight comparison (the actual patchwork) |
| - MLP weight spectrum analysis |
| - Where do they converge internally vs diverge? |
| """ |
|
|
import gc
import json
import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
|
|
# Prefer the GPU when PyTorch can see one; otherwise everything runs on CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Opening banner: rule, title, rule, then the device that was picked.
print("=" * 65)
print("BASE TIER DEEP MODEL ANALYSIS")
print("=" * 65)
print(f" Device: {DEVICE}")
|
|
|
|
| |
| |
| |
|
|
print("\n" + "=" * 65)
print("LOADING MODELS")
print("=" * 65)

from transformers import (
    CLIPVisionModel, CLIPVisionConfig,
    Dinov2Model, Dinov2Config,
    SiglipVisionModel, SiglipVisionConfig,
)

# Registries keyed by the short model name used throughout the script.
models = {}
configs = {}

# CLIP vision tower, switched to eval mode.
print(f"\n Loading CLIP ViT-L/14...")
clip = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14")
clip.eval()
models["clip_l14"], configs["clip_l14"] = clip, clip.config
print(f" Loaded: {sum(w.numel() for w in clip.parameters()):,} params")

# DINOv2 backbone, switched to eval mode.
print(f" Loading DINOv2 ViT-B/14...")
dino = Dinov2Model.from_pretrained("facebook/dinov2-base")
dino.eval()
models["dinov2_b14"], configs["dinov2_b14"] = dino, dino.config
print(f" Loaded: {sum(w.numel() for w in dino.parameters()):,} params")

# SigLIP vision tower, switched to eval mode.
print(f" Loading SigLIP ViT-B/16-384...")
siglip = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-384")
siglip.eval()
models["siglip_b16"], configs["siglip_b16"] = siglip, siglip.config
print(f" Loaded: {sum(w.numel() for w in siglip.parameters()):,} params")
|
|
|
|
| |
| |
| |
|
|
# Section header for scan 1.
print("\n" + "=" * 65)
print("SCAN 1: ARCHITECTURE COMPARISON")
print("=" * 65)
|
|
def get_arch_info(name, model, config):
    """Collect headline architecture numbers for one model.

    Args:
        name: Label stored under the ``"name"`` key of the result.
        model: Object exposing ``parameters()``; each item must provide
            ``numel()``.
        config: Object probed with ``hasattr`` for the size attributes
            listed below. Missing attributes are simply left out.

    Returns:
        dict holding ``"name"``, whichever of ``hidden_size``,
        ``intermediate_size``, ``num_layers``, ``num_heads``, ``patch_size``
        and ``image_size`` the config provides, plus ``total_params`` and
        ``head_dim`` (``hidden_size // num_heads``, with 0 and 1 as the
        respective fallbacks).
    """
    # (attribute on the config, key in the returned dict)
    wanted = (
        ("hidden_size", "hidden_size"),
        ("intermediate_size", "intermediate_size"),
        ("num_hidden_layers", "num_layers"),
        ("num_attention_heads", "num_heads"),
        ("patch_size", "patch_size"),
        ("image_size", "image_size"),
    )

    info = {"name": name}
    for attr, key in wanted:
        if hasattr(config, attr):
            info[key] = getattr(config, attr)

    info["total_params"] = sum(p.numel() for p in model.parameters())

    # `or` also covers attributes that exist but are None, which would
    # otherwise make max()/floor-division raise TypeError.
    hidden = info.get("hidden_size") or 0
    heads = info.get("num_heads") or 1
    info["head_dim"] = hidden // max(heads, 1)
    return info
|
|
# Print every collected architecture field; ints get thousands separators.
for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
    info = get_arch_info(name, models[name], configs[name])
    print(f"\n {name}:")
    for k, v in info.items():
        if k == "name":
            continue
        if isinstance(v, int):
            print(f" {k:<20}: {v:>12,}")
        else:
            print(f" {k:<20}: {v}")
|
|
|
|
| |
| |
| |
|
|
print(f"\n{'='*65}")
print("SCAN 2: PARAMETER INVENTORY")
print(f"{'='*65}")


def _param_category(pname):
    """Bucket a parameter name into a coarse inventory category by substring."""
    if "embeddings" in pname:
        return "embeddings"
    if "encoder" in pname and "layer" in pname:
        # Attention submodules are matched under both spellings.
        # NOTE(review): "self_attn" is presumably the CLIP/SigLIP spelling in
        # transformers -- confirm against the installed version.
        if "attention" in pname or "self_attn" in pname:
            qkv_tokens = ("query", "key", "value", "q_proj", "k_proj", "v_proj")
            if any(tok in pname for tok in qkv_tokens):
                return "attn_qkv"
            if "out" in pname or "o_proj" in pname:
                return "attn_out"
            return "attn_other"
        if "mlp" in pname or "intermediate" in pname or "output" in pname:
            return "mlp"
        if "norm" in pname:
            return "layernorm"
        return "encoder_other"
    low = pname.lower()
    if "layernorm" in low or "layer_norm" in low:
        return "final_norm"
    if "head" in pname or "pooler" in pname:
        return "head"
    return "other"


for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
    model = models[name]
    print(f"\n {name}:")

    # category -> tensor count, parameter total, and up to three sample shapes
    groups = {}
    for pname, p in model.named_parameters():
        cat = _param_category(pname)
        g = groups.setdefault(cat, {"count": 0, "params": 0, "shapes": []})
        g["count"] += 1
        g["params"] += p.numel()
        if len(g["shapes"]) < 3:
            # Last two name components; safe for names without a dot.
            tail = ".".join(pname.split(".")[-2:])
            g["shapes"].append(f"{tail}: {list(p.shape)}")

    for cat in sorted(groups):
        g = groups[cat]
        print(f" {cat:<15}: {g['params']:>12,} ({g['count']:2d} tensors)")
        for s in g["shapes"]:
            print(f" {s}")
|
|
|
|
| |
| |
| |
|
|
# Section header for scan 3.
print("\n" + "=" * 65)
print("SCAN 3: WEIGHT STATISTICS")
print("=" * 65)
|
|
def weight_stats(param, sparsity_eps=1e-6):
    """Summarise one weight tensor.

    Args:
        param: Tensor (or Parameter); it is detached and cast to float32.
        sparsity_eps: Entries with absolute value below this count as zero
            for the ``sparsity`` fraction.

    Returns:
        dict with ``shape``, ``norm``, ``mean``, ``std``, ``abs_max`` and
        ``sparsity``. For 2-D tensors whose smaller side exceeds 1 it also
        holds ``sv_max``, ``sv_min``, ``sv_ratio`` and ``eff_rank``, where
        ``eff_rank = (sum s)^2 / sum s^2`` over the singular values ``s``.
    """
    p = param.detach().float()
    stats = {
        "shape": list(p.shape),
        "norm": p.norm().item(),
        "mean": p.mean().item(),
        "std": p.std().item(),
        "abs_max": p.abs().max().item(),
        "sparsity": (p.abs() < sparsity_eps).float().mean().item(),
    }

    # Spectral statistics only make sense for genuine matrices.
    if p.dim() == 2 and min(p.shape) > 1:
        sv = torch.linalg.svdvals(p)  # sorted in descending order
        stats["sv_max"] = sv[0].item()
        stats["sv_min"] = sv[-1].item()
        stats["sv_ratio"] = (sv[0] / (sv[-1] + 1e-10)).item()
        stats["eff_rank"] = ((sv.sum() ** 2) / (sv.pow(2).sum() + 1e-12)).item()
    return stats
|
|
# Substrings (matched against the lower-cased parameter name) that select
# which weight matrices are listed.
_SHOWN_KEYWORDS = (
    "patch", "embed", "position", "cls",
    "layer.0.", "layer.5.", "layer.11.",
    "layer.23.", "q_proj", "k_proj", "v_proj",
    "query", "key", "value",
    "fc1", "fc2", "dense", "out_proj",
    "layernorm", "head",
)

for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
    model = models[name]
    print(f"\n {name} — key weight matrices:")
    print(f" {'param':<50} {'shape':<20} {'norm':>8} {'std':>8} {'sv_max':>8} {'eff_rank':>9}")
    print(f" {'-'*105}")

    for pname, p in model.named_parameters():
        # Skip vectors/scalars and tiny tensors.
        if p.dim() < 2 or p.numel() < 1000:
            continue
        plow = pname.lower()
        if not any(keyword in plow for keyword in _SHOWN_KEYWORDS):
            continue

        s = weight_stats(p)
        # Spectral columns exist only for 2-D weights (see weight_stats).
        sv_max = f"{s['sv_max']:.4f}" if "sv_max" in s else " N/A"
        eff_rank = f"{s['eff_rank']:.1f}" if "eff_rank" in s else " N/A"
        short_name = pname[-50:]  # keep the tail of over-long names
        shape_str = str(s["shape"])
        print(f" {short_name:<50} {shape_str:<20} {s['norm']:>8.4f} "
              f"{s['std']:>8.5f} {sv_max:>8} {eff_rank:>9}")
|
|
|
|
| |
| |
| |
|
|
print(f"\n{'='*65}")
print("SCAN 4: PATCH EMBEDDING WEIGHTS")
print(f"{'='*65}")

# model name -> first 4-D parameter whose name mentions both "patch" and "embed"
patch_embeddings = {}
for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
    model = models[name]
    for pname, p in model.named_parameters():
        plow = pname.lower()
        if not ("patch" in plow and "embed" in plow and p.dim() == 4):
            continue

        w = p.detach().float()
        patch_embeddings[name] = w
        print(f"\n {name}: {pname}")
        print(f" Shape: {list(w.shape)}")
        print(f" = {w.shape[0]} filters × {w.shape[1]} channels × {w.shape[2]}×{w.shape[3]} kernel")

        # One row per output filter.
        w2d = w.reshape(w.shape[0], -1)
        sv = torch.linalg.svdvals(w2d)
        eff_rank = ((sv.sum() ** 2) / (sv.pow(2).sum() + 1e-12)).item()
        print(f" Spectral: sv_max={sv[0]:.4f} sv_min={sv[-1]:.6f} "
              f"eff_rank={eff_rank:.1f}/{min(w2d.shape)}")
        print(f" Norm: {w.norm():.4f} Mean: {w.mean():.6f} Std: {w.std():.6f}")

        filter_norms = w2d.norm(dim=1)
        print(f" Filter norms: mean={filter_norms.mean():.4f} "
              f"std={filter_norms.std():.4f} "
              f"min={filter_norms.min():.4f} max={filter_norms.max():.4f}")
        break  # only the first matching parameter per model

if len(patch_embeddings) >= 2:
    # NOTE(review): despite the heading, no alignment is solved here -- this
    # is a plain row-wise cosine after truncating both matrices to a common
    # shape, so rows/columns of differently sized kernels are paired by index.
    print(f"\n Patch embedding Procrustes alignment:")
    names_list = list(patch_embeddings.keys())
    for i, n1 in enumerate(names_list):
        for n2 in names_list[i + 1:]:
            p1 = patch_embeddings[n1].reshape(patch_embeddings[n1].shape[0], -1)
            p2 = patch_embeddings[n2].reshape(patch_embeddings[n2].shape[0], -1)
            d_min = min(p1.shape[0], p2.shape[0])
            d_feat = min(p1.shape[1], p2.shape[1])
            a = p1[:d_min, :d_feat]
            b = p2[:d_min, :d_feat]
            # cosine_similarity normalises internally; no pre-normalise needed.
            cos = F.cosine_similarity(a, b, dim=1).mean().item()
            print(f" {n1} × {n2}: raw_cos={cos:.4f} (d_min={d_min}, d_feat={d_feat})")
|
|
|
|
| |
| |
| |
|
|
# Section header for scan 5.
print("\n" + "=" * 65)
print("SCAN 5: ATTENTION HEAD GEOMETRY")
print("=" * 65)
|
|
def extract_qkv_weights(model, name):
    """Extract the Q, K and V weight matrices of every attention module.

    Callers read the result positionally (``3*i`` is Q, ``3*i + 1`` is K,
    ``3*i + 2`` is V of layer ``i``). Plain registration order does not
    guarantee that layout (a module may register its key projection before
    its query projection), so matches are grouped by their parent module and
    emitted as Q, K, V per group.

    Args:
        model: Module exposing ``named_parameters()``.
        name: Unused; kept for call-site compatibility.

    Returns:
        List of dicts ``{"layer": <param name>, "type": "Q"|"K"|"V",
        "weight": <detached float tensor>}``. Groups appear in first-seen
        order; inside a group the order is Q, then K, then V.
    """
    kinds = (
        ("Q", ("query", "q_proj")),
        ("K", ("key", "k_proj")),
        ("V", ("value", "v_proj")),
    )

    # parent-module prefix -> {"Q": [...], "K": [...], "V": [...]}
    grouped = {}
    for pname, p in model.named_parameters():
        if p.dim() != 2:
            continue
        plow = pname.lower()
        if "weight" not in plow:
            continue
        for kind, tokens in kinds:
            if any(tok in plow for tok in tokens):
                # Drop "<proj>.weight" to get the owning attention module.
                prefix = ".".join(pname.split(".")[:-2])
                bucket = grouped.setdefault(prefix, {"Q": [], "K": [], "V": []})
                bucket[kind].append(
                    {"layer": pname, "type": kind, "weight": p.detach().float()}
                )
                break  # first matching kind wins, as in an if/elif chain

    layers_qkv = []
    for bucket in grouped.values():
        for kind in ("Q", "K", "V"):
            layers_qkv.extend(bucket[kind])
    return layers_qkv
|
|
# Per-layer norms of the Q/K/V weights and the cosines between their
# flattened forms. Assumes extract_qkv_weights yields three consecutive
# entries per layer in Q, K, V order -- verify against that function.
for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
    qkv = extract_qkv_weights(models[name], name)
    n_layers = len(qkv) // 3

    print(f"\n {name} ({n_layers} layers):")
    print(f" {'layer':>6} {'Q_norm':>8} {'K_norm':>8} {'V_norm':>8} "
          f"{'QK_cos':>8} {'QV_cos':>8} {'KV_cos':>8}")

    for layer_idx in range(n_layers):
        base = layer_idx * 3
        q = qkv[base]["weight"]
        k = qkv[base + 1]["weight"]
        v = qkv[base + 2]["weight"]

        q_norm, k_norm, v_norm = q.norm().item(), k.norm().item(), v.norm().item()

        # Flatten and truncate to the shortest length before comparing.
        qf, kf, vf = q.reshape(-1), k.reshape(-1), v.reshape(-1)
        d = min(qf.shape[0], kf.shape[0], vf.shape[0])
        qf, kf, vf = qf[:d].unsqueeze(0), kf[:d].unsqueeze(0), vf[:d].unsqueeze(0)
        qk_cos = F.cosine_similarity(qf, kf).item()
        qv_cos = F.cosine_similarity(qf, vf).item()
        kv_cos = F.cosine_similarity(kf, vf).item()

        # Show the first three, the middle and the last two layers only.
        shown = layer_idx < 3 or layer_idx >= n_layers - 2 or layer_idx == n_layers // 2
        if shown:
            print(f" {layer_idx:>6} {q_norm:>8.3f} {k_norm:>8.3f} {v_norm:>8.3f} "
                  f"{qk_cos:>8.4f} {qv_cos:>8.4f} {kv_cos:>8.4f}")
        elif layer_idx == 3:
            print(f" {'...':>6}")
|
|
|
|
| |
| |
| |
|
|
print(f"\n{'='*65}")
print("SCAN 6: CROSS-MODEL WEIGHT ALIGNMENT")
print(f"{'='*65}")

# Q/K/V weights per model; later scans reuse this dict.
model_qkv = {}
for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
    model_qkv[name] = extract_qkv_weights(models[name], name)

print(f"\n Cross-model Q weight cosine at equivalent depth fractions:")

# Layer counts go first so the column header sits directly above its rows.
for name in model_qkv:
    n = len(model_qkv[name]) // 3
    print(f" {name}: {n} layers")

print(f" {'depth':>6} {'clip×dino':>10} {'clip×siglip':>12} {'dino×siglip':>12}")

for frac in [0.0, 0.25, 0.5, 0.75, 1.0]:
    # Flattened Q weight of the layer at this relative depth, per model.
    # Assumes entry 3*i of extract_qkv_weights is layer i's Q -- verify there.
    vals = {}
    for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
        qkv = model_qkv[name]
        n = len(qkv) // 3
        idx = min(int(frac * (n - 1)), n - 1)
        vals[name] = qkv[idx * 3]["weight"].reshape(-1)

    # NOTE(review): truncating flattened matrices of different widths pairs
    # elements by flat index only; treat the values as a rough signal.
    min_len = min(v.shape[0] for v in vals.values())
    clip_q = vals["clip_l14"][:min_len].unsqueeze(0)
    dino_q = vals["dinov2_b14"][:min_len].unsqueeze(0)
    siglip_q = vals["siglip_b16"][:min_len].unsqueeze(0)
    cos_cd = F.cosine_similarity(clip_q, dino_q).item()
    cos_cs = F.cosine_similarity(clip_q, siglip_q).item()
    cos_ds = F.cosine_similarity(dino_q, siglip_q).item()

    print(f" {frac:>5.0%} {cos_cd:>10.4f} {cos_cs:>12.4f} {cos_ds:>12.4f}")
|
|
|
|
| |
| |
| |
|
|
print("\n" + "=" * 65)
print("SCAN 7: MLP WEIGHT SPECTRUM")
print("=" * 65)

for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
    model = models[name]

    # 2-D parameters that look like MLP projections under either naming style.
    mlp_weights = []
    for pname, p in model.named_parameters():
        if p.dim() != 2:
            continue
        is_fc = "fc1" in pname or "fc2" in pname
        is_intermediate = "intermediate" in pname and "dense" in pname and "weight" in pname
        is_output = ("output" in pname and "dense" in pname and "weight" in pname
                     and "attention" not in pname)
        if is_fc or is_intermediate or is_output:
            mlp_weights.append((pname, p.detach().float()))

    print(f"\n {name} MLPs ({len(mlp_weights)} weight matrices):")
    # Only the first six matrices are analysed in detail.
    for pname, w in mlp_weights[:6]:
        sv = torch.linalg.svdvals(w)
        eff_rank = ((sv.sum() ** 2) / (sv.pow(2).sum() + 1e-12)).item()
        parts = pname.split(".")
        short = parts[-3] + "." + parts[-2] + "." + parts[-1]
        tenth = sv[min(9, len(sv) - 1)]  # 10th singular value, or the last one
        print(f" {short:<40} {str(list(w.shape)):<20} "
              f"eff_rank={eff_rank:>6.1f}/{min(w.shape)} "
              f"sv_max={sv[0]:.3f} sv_10={tenth:.4f}")

    remaining = len(mlp_weights) - 6
    if remaining > 0:
        print(f" ... ({remaining} more)")
|
|
|
|
| |
| |
| |
|
|
print(f"\n{'='*65}")
print("SCAN 8: POSITION EMBEDDINGS")
print(f"{'='*65}")

for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
    model = models[name]
    for pname, p in model.named_parameters():
        plow = pname.lower()
        if not ("position" in plow and "embed" in plow):
            continue

        pe = p.detach().float()
        print(f"\n {name}: {pname}")
        print(f" Shape: {list(pe.shape)}")
        print(f" Norm: {pe.norm():.4f} Mean: {pe.mean():.6f} Std: {pe.std():.6f}")

        if pe.dim() >= 2:
            # 3-D tables are squeezed on dim 0 to get one row per position.
            pe2d = pe.squeeze(0) if pe.dim() == 3 else pe

            # Pairwise cosine as a Gram matrix of unit rows: N x N memory
            # instead of the N x N x D intermediate a broadcast
            # cosine_similarity call would build.
            unit = F.normalize(pe2d, dim=-1)
            sim = unit @ unit.T
            off_diag = (sim.sum() - sim.diag().sum()).item() / (sim.numel() - sim.shape[0])
            print(f" Self-sim: diag_mean={sim.diag().mean():.4f} "
                  f"off_diag_mean={off_diag:.4f}")
            print(f" Adjacent pos cos: mean={F.cosine_similarity(pe2d[:-1], pe2d[1:], dim=-1).mean():.4f}")

            sv = torch.linalg.svdvals(pe2d)
            eff_rank = ((sv.sum() ** 2) / (sv.pow(2).sum() + 1e-12)).item()
            print(f" Spectral: eff_rank={eff_rank:.1f}/{min(pe2d.shape)} "
                  f"sv1%={sv[0].pow(2).item()/sv.pow(2).sum().item()*100:.1f}%")
        break  # only the first matching parameter per model
|
|
|
|
| |
| |
| |
|
|
print("\n" + "=" * 65)
print("SCAN 9: LAYERNORM WEIGHT/BIAS PATTERNS")
print("=" * 65)

for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
    model = models[name]

    # Every parameter whose name mentions "norm", split into weights/biases.
    ln_weights, ln_biases = [], []
    for pname, p in model.named_parameters():
        if "norm" not in pname.lower():
            continue
        if "weight" in pname:
            ln_weights.append((pname, p.detach().float()))
        elif "bias" in pname:
            ln_biases.append((pname, p.detach().float()))

    print(f"\n {name} ({len(ln_weights)} LayerNorms):")
    # First four weight/bias pairs, paired by position in the two lists.
    for (wn, w), (bn, b) in zip(ln_weights[:4], ln_biases[:4]):
        parts = wn.split(".")
        short = parts[-3] + "." + parts[-2]
        print(f" {short:<30} w: mean={w.mean():.4f} std={w.std():.4f} "
              f"b: mean={b.mean():.5f} std={b.std():.4f}")

    # The last collected norm is reported as the final one.
    if ln_weights:
        wn, w = ln_weights[-1]
        if ln_biases:
            bn, b = ln_biases[-1]
        else:
            bn, b = "", torch.zeros_like(w)
        print(f" FINAL: {wn}")
        print(f" weight: mean={w.mean():.4f} std={w.std():.4f} "
              f"min={w.min():.4f} max={w.max():.4f}")
        if ln_biases:
            print(f" bias: mean={b.mean():.5f} std={b.std():.4f}")
|
|
|
|
| |
| |
| |
|
|
# Section header for scan 10.
print("\n" + "=" * 65)
print("SCAN 10: PENTACHORON CV ON WEIGHT GEOMETRY")
print("=" * 65)
|
|
def cayley_menger_vol2(pts):
    """Squared simplex volume from the Cayley-Menger determinant.

    Args:
        pts: Tensor of shape ``(..., V, D)``: ``V`` vertices in ``D``
            dimensions, with any number of leading batch dimensions
            (the 3-D ``(B, V, D)`` case behaves as before).

    Returns:
        Tensor of shape ``(...)`` holding
        ``(-1)^V / (2^(V-1) * ((V-1)!)^2) * det(CM)`` per simplex, where
        ``CM`` is the bordered matrix of squared pairwise distances. The
        value can come out slightly negative for degenerate simplices
        because of rounding.
    """
    pts = pts.float()
    diff = pts.unsqueeze(-2) - pts.unsqueeze(-3)
    d2 = (diff * diff).sum(-1)  # (..., V, V) squared distances

    n_vertices = d2.shape[-1]
    # Bordered matrix: zero corner, ones along the first row/column, d2 inside.
    cm = d2.new_zeros(*d2.shape[:-2], n_vertices + 1, n_vertices + 1)
    cm[..., 0, 1:] = 1
    cm[..., 1:, 0] = 1
    cm[..., 1:, 1:] = d2

    sign = (-1.0) ** n_vertices
    fact = math.factorial(n_vertices - 1)
    return sign / ((2.0 ** (n_vertices - 1)) * fact * fact) * torch.linalg.det(cm)
|
|
def cv_metric_on_weights(weight_matrix, n_samples=300):
    """Coefficient of variation of random 5-vertex simplex volumes.

    Rows of ``weight_matrix`` are L2-normalised, ``n_samples`` random sets of
    five rows are drawn, and each set's volume comes from
    ``cayley_menger_vol2``. Returns ``std / mean`` of those volumes, or
    ``0.0`` when there are fewer than five rows or fewer than ten volumes.
    """
    rows = F.normalize(weight_matrix.float(), dim=-1)
    n_rows = rows.shape[0]
    if n_rows < 5:
        return 0.0

    volumes = []
    for _ in range(n_samples):
        pick = torch.randperm(n_rows)[:5]
        vol2 = cayley_menger_vol2(rows[pick].unsqueeze(0))
        vol = torch.sqrt(F.relu(vol2[0]) + 1e-12).item()
        # NOTE(review): the 1e-12 under the square root keeps vol >= 1e-6, so
        # this test only rejects NaN, never a degenerate (zero) volume.
        if vol > 0:
            volumes.append(vol)

    if len(volumes) < 10:
        return 0.0
    volumes = np.array(volumes)
    return float(volumes.std() / (volumes.mean() + 1e-8))
|
|
| |
# CV over the output filters of each patch-embedding kernel.
print(f"\n Patch embedding filter CV (rows = output filters):")
for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
    if name not in patch_embeddings:
        continue
    p = patch_embeddings[name]
    w2d = p.reshape(p.shape[0], -1)
    cv = cv_metric_on_weights(w2d)
    print(f" {name:<15} filters={w2d.shape[0]} CV={cv:.4f}")

# CV over the rows of each layer's Q/K/V weights. Assumes model_qkv holds
# three consecutive entries per layer in Q, K, V order -- verify against
# extract_qkv_weights.
print(f"\n QKV weight row CV per layer:")
print(f" {'model':<15} {'layer':>6} {'Q_cv':>8} {'K_cv':>8} {'V_cv':>8} {'QK_diff':>9}")

for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
    qkv = model_qkv[name]
    n_layers = len(qkv) // 3

    for layer_idx in range(n_layers):
        base = layer_idx * 3
        q = qkv[base]["weight"]
        k = qkv[base + 1]["weight"]
        v = qkv[base + 2]["weight"]

        # Computed for every layer (each call draws random samples), even
        # though only a few layers are printed.
        q_cv = cv_metric_on_weights(q, n_samples=200)
        k_cv = cv_metric_on_weights(k, n_samples=200)
        v_cv = cv_metric_on_weights(v, n_samples=200)

        shown = layer_idx < 2 or layer_idx >= n_layers - 2 or layer_idx == n_layers // 2
        if shown:
            print(f" {name:<15} {layer_idx:>6} {q_cv:>8.4f} {k_cv:>8.4f} "
                  f"{v_cv:>8.4f} {abs(q_cv - k_cv):>9.4f}")
        elif layer_idx == 2:
            print(f" {name:<15} {'...':>6}")
|
|
| |
# CV over the rows of the first and last MLP weight matrix of each model.
print(f"\n MLP weight row CV (first and last layers):")
for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
    model = models[name]
    mlp_weights = []
    for pname, p in model.named_parameters():
        if p.dim() != 2:
            continue
        is_fc = "fc1" in pname or "fc2" in pname
        is_intermediate = "intermediate" in pname and "dense" in pname and "weight" in pname
        is_output = ("output" in pname and "dense" in pname and "weight" in pname
                     and "attention" not in pname)
        if is_fc or is_intermediate or is_output:
            mlp_weights.append((pname, p.detach().float()))

    if mlp_weights:
        pname, w = mlp_weights[0]
        cv_first = cv_metric_on_weights(w, n_samples=200)
        pname2, w2 = mlp_weights[-1]
        cv_last = cv_metric_on_weights(w2, n_samples=200)
        print(f" {name:<15} first_mlp CV={cv_first:.4f} last_mlp CV={cv_last:.4f}")

# CV over the rows (positions) of each model's position-embedding table.
print(f"\n Position embedding CV:")
for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
    model = models[name]
    for pname, p in model.named_parameters():
        plow = pname.lower()
        if not ("position" in plow and "embed" in plow):
            continue
        pe = p.detach().float()
        if pe.dim() == 3:
            pe = pe.squeeze(0)
        if pe.dim() == 2 and pe.shape[0] >= 5:
            cv = cv_metric_on_weights(pe, n_samples=300)
            print(f" {name:<15} positions={pe.shape[0]} CV={cv:.4f}")
        break  # only the first matching parameter per model
|
|
|
|
| |
| |
| |
|
|
print("\n" + "=" * 65)
print("SCAN 11: CROSS-MODEL CV BAND COMPARISON")
print("=" * 65)

# Distribution of per-layer row CVs for Q, K and V. Assumes model_qkv holds
# three consecutive entries per layer in Q, K, V order -- verify against
# extract_qkv_weights.
print(f"\n Q weight CV distribution per model:")
for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
    qkv = model_qkv[name]
    n_layers = len(qkv) // 3

    q_cvs, k_cvs, v_cvs = [], [], []
    for layer_idx in range(n_layers):
        base = layer_idx * 3
        q_cvs.append(cv_metric_on_weights(qkv[base]["weight"], n_samples=200))
        k_cvs.append(cv_metric_on_weights(qkv[base + 1]["weight"], n_samples=200))
        v_cvs.append(cv_metric_on_weights(qkv[base + 2]["weight"], n_samples=200))

    q_arr, k_arr, v_arr = np.array(q_cvs), np.array(k_cvs), np.array(v_cvs)
    print(f" {name:<15} Q: mean={q_arr.mean():.4f} std={q_arr.std():.4f} "
          f"range=[{q_arr.min():.4f}, {q_arr.max():.4f}]")
    print(f" {'':15} K: mean={k_arr.mean():.4f} std={k_arr.std():.4f} "
          f"range=[{k_arr.min():.4f}, {k_arr.max():.4f}]")
    print(f" {'':15} V: mean={v_arr.mean():.4f} std={v_arr.std():.4f} "
          f"range=[{v_arr.min():.4f}, {v_arr.max():.4f}]")

    # Number of layers whose CV lies inside the closed band [0.18, 0.25].
    in_band_q = ((q_arr >= 0.18) & (q_arr <= 0.25)).sum()
    in_band_k = ((k_arr >= 0.18) & (k_arr <= 0.25)).sum()
    in_band_v = ((v_arr >= 0.18) & (v_arr <= 0.25)).sum()
    print(f" {'':15} In CV band [0.18-0.25]: Q={in_band_q}/{n_layers} "
          f"K={in_band_k}/{n_layers} V={in_band_v}/{n_layers}")
|
|
| |
# CV of Q rows pooled across models at matching relative depth.
print(f"\n Cross-model concatenated Q weight CV (same-depth rows mixed):")
name_pairs = [("clip_l14", "dinov2_b14"), ("clip_l14", "siglip_b16"),
              ("dinov2_b14", "siglip_b16"), ("clip_l14", "dinov2_b14", "siglip_b16")]

for pair in name_pairs:
    pair_label = " × ".join(n[:8] for n in pair)
    n_layers_per = [len(model_qkv[n]) // 3 for n in pair]
    min_layers = min(n_layers_per)

    cvs_at_depth = []
    for frac_idx in range(min_layers):
        rows = []
        for n, n_total in zip(pair, n_layers_per):
            # Map the shallowest model's layer index onto this model's depth.
            layer_idx = min(int(frac_idx / min_layers * n_total), n_total - 1)
            # Assumes entry 3*i is layer i's Q -- verify against extract_qkv_weights.
            q = model_qkv[n][layer_idx * 3]["weight"]
            rows.append(F.normalize(q.float(), dim=-1))

        # Truncate every model's rows to the narrowest width, then stack them.
        d_min = min(r.shape[1] for r in rows)
        combined = torch.cat([r[:, :d_min] for r in rows], dim=0)
        cvs_at_depth.append(cv_metric_on_weights(combined, n_samples=200))

    if not cvs_at_depth:
        # No Q weights were found for at least one model; min()/max() of an
        # empty array would raise.
        print(f" {pair_label:<35} (no layers to compare)")
        continue

    arr = np.array(cvs_at_depth)
    print(f" {pair_label:<35} mean={arr.mean():.4f} std={arr.std():.4f} "
          f"range=[{arr.min():.4f}, {arr.max():.4f}]")
|
|
|
|
| |
| |
| |
|
|
print(f"\n{'='*65}")
print("WEIGHT ANALYSIS COMPLETE — STARTING ACTIVATION ANALYSIS")
print(f"{'='*65}")

# Drop every module-level name from the weight scans that still references
# the loaded models or their parameters. Deleting only `models` and `configs`
# frees nothing while these other names keep the same objects alive.
# pop() is used so a name that was never bound is not an error.
for _stale in ("models", "configs", "clip", "dino", "siglip", "model", "p",
               "model_qkv", "qkv", "patch_embeddings", "mlp_weights",
               "ln_weights", "ln_biases"):
    globals().pop(_stale, None)
gc.collect()
torch.cuda.empty_cache()
|
|
|
|
| |
| |
| |
|
|
print("\n" + "=" * 65)
print("SCAN 12: PER-LAYER ACTIVATION EXTRACTION")
print("=" * 65)

from transformers import AutoImageProcessor
from datasets import load_dataset
from PIL import Image

# Validation split, opened as a stream.
print(f" Streaming images from rafaelpadilla/coco2017...")
coco_stream = load_dataset(
    "rafaelpadilla/coco2017",
    split="validation",
    revision="refs/convert/parquet",
    streaming=True,
)
N_IMGS = 256  # number of images to collect from the stream

# One image processor per model, loaded from the same checkpoint id as the model.
processors = {}
processors["clip_l14"] = AutoImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
processors["dinov2_b14"] = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
processors["siglip_b16"] = AutoImageProcessor.from_pretrained("google/siglip-base-patch16-384")
|
|
| |
from transformers import CLIPVisionModel, Dinov2Model, SiglipVisionModel

# Reload the three models in eval mode, this time on DEVICE.
models = {}
models["clip_l14"] = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14").eval().to(DEVICE)
models["dinov2_b14"] = Dinov2Model.from_pretrained("facebook/dinov2-base").eval().to(DEVICE)
models["siglip_b16"] = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-384").eval().to(DEVICE)

# Inference only: switch off gradient tracking on every parameter.
for m in models.values():
    for p in m.parameters():
        p.requires_grad = False
|
|
| |
# Pull up to N_IMGS RGB images from the stream. Rows that cannot be read or
# converted are skipped (best effort), but only ordinary exceptions are
# swallowed, so Ctrl-C / SystemExit still stop the run.
images = []
_skipped_rows = 0
for row in coco_stream:
    if len(images) >= N_IMGS:
        break
    try:
        img = row["image"].convert("RGB")
    except Exception:
        _skipped_rows += 1
        continue
    images.append(img)
print(f" Captured {len(images)} images (streamed)")
if _skipped_rows:
    print(f" Skipped {_skipped_rows} unreadable rows")
|
|
| |
# model name -> list (one entry per hidden state) of (N, d) float tensors
layer_activations = {}
# model name -> (N, d) L2-normalised pooled output
pooled_outputs = {}

EXTRACT_BATCH = 32

# Models summarised by the mean over tokens instead of token 0.
# NOTE(review): the SigLIP vision tower in transformers has no class token
# (it pools with an attention head), so token 0 there would be an ordinary
# patch -- confirm against the installed transformers version.
MEAN_POOLED_MODELS = {"siglip_b16"}

if not images:
    raise RuntimeError("no images were captured; cannot extract activations")

for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
    model = models[name]
    proc = processors[name]
    use_mean = name in MEAN_POOLED_MODELS
    all_hidden = None
    all_pooled = []

    for bi in range(0, len(images), EXTRACT_BATCH):
        batch_imgs = images[bi:bi + EXTRACT_BATCH]
        inputs = proc(images=batch_imgs, return_tensors="pt").to(DEVICE)

        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)

        hs = outputs.hidden_states

        if all_hidden is None:
            all_hidden = [[] for _ in hs]
        for li, h in enumerate(hs):
            # One vector per image and layer: class token, or token mean.
            summary = h.mean(dim=1) if use_mean else h[:, 0, :]
            all_hidden[li].append(summary.cpu())

        # Prefer the model's own pooled output; otherwise summarise the last
        # hidden state the same way as the per-layer activations.
        pooled = getattr(outputs, "pooler_output", None)
        if pooled is None:
            pooled = hs[-1].mean(dim=1) if use_mean else hs[-1][:, 0, :]
        all_pooled.append(pooled.cpu())

    layer_activations[name] = [torch.cat(h, 0).float() for h in all_hidden]
    pooled_outputs[name] = F.normalize(torch.cat(all_pooled, 0).float(), dim=-1)

    n_layers = len(layer_activations[name])
    d = layer_activations[name][0].shape[-1]
    print(f" {name}: {n_layers} layers, d={d}, N={layer_activations[name][0].shape[0]}")
|
|
|
|
| |
| |
| |
|
|
# Section header for scan 13.
print("\n" + "=" * 65)
print("SCAN 13: WITHIN-MODEL DEPTH PROGRESSION")
print("=" * 65)
|
|
def symmetric_inv_sqrt(cov, eps=1e-6):
    """Inverse square root of a symmetric matrix via eigendecomposition.

    Args:
        cov: Symmetric matrix of shape ``(..., d, d)``; leading batch
            dimensions are supported.
        eps: Floor applied to the eigenvalues before ``rsqrt``, so zero or
            negative eigenvalues map to ``eps ** -0.5``.

    Returns:
        Tensor of the same shape, ``V @ diag(clamp(w, eps) ** -0.5) @ V^T``.
    """
    evals, evecs = torch.linalg.eigh(cov)
    scale = torch.clamp(evals, min=eps).rsqrt()
    # Scaling the eigenvector columns equals right-multiplying by diag(scale),
    # without building the dense diagonal matrix.
    return (evecs * scale.unsqueeze(-2)) @ evecs.transpose(-2, -1)
|
|
def procrustes_cos(source, target, n=None):
    """Whitened orthogonal Procrustes alignment of two sample sets.

    Both inputs are cut to their first ``n`` rows, mean-centred, whitened with
    ``symmetric_inv_sqrt`` of their covariance (eigenvalue floor 1e-4) and
    row-normalised. The rotation comes from the SVD of the whitened
    cross-product.

    Args:
        source: ``(N, d)`` samples to rotate.
        target: ``(M, d)`` samples to match; same ``d`` as ``source``.
        n: Number of leading rows to use; defaults to ``min(N, M)``.

    Returns:
        ``(cos_pre, cos_post, singular_values)``: mean row-wise cosine of the
        centred data before alignment, mean row-wise cosine of the whitened
        data after rotation, and the singular values of the cross-product.
        If the decompositions fail or produce non-finite values, returns
        ``(cos_pre, cos_pre, tensor([0.0]))``.
    """
    if n is None:
        n = min(source.shape[0], target.shape[0])
    S = source[:n].float()
    T = target[:n].float()
    Sc = S - S.mean(0, keepdim=True)
    Tc = T - T.mean(0, keepdim=True)
    Ns = Sc.shape[0]

    # cosine_similarity normalises internally; no pre-normalise needed.
    cos_pre = F.cosine_similarity(Sc, Tc, dim=-1).mean().item()
    failed = (cos_pre, cos_pre, torch.tensor([0.0]))

    denom = max(Ns - 1, 1)
    s_cov = (Sc.T @ Sc) / denom
    t_cov = (Tc.T @ Tc) / denom
    try:
        sw = symmetric_inv_sqrt(s_cov, eps=1e-4)
        tw = symmetric_inv_sqrt(t_cov, eps=1e-4)
    except RuntimeError:
        # torch.linalg failures are RuntimeError subclasses.
        return failed

    Sc_w = F.normalize(Sc @ sw, dim=-1)
    Tc_w = F.normalize(Tc @ tw, dim=-1)
    if not torch.isfinite(Sc_w).all() or not torch.isfinite(Tc_w).all():
        return failed

    try:
        U, S_vals, Vt = torch.linalg.svd(Tc_w.T @ Sc_w, full_matrices=False)
    except RuntimeError:
        return failed

    R = U @ Vt
    cos_post = F.cosine_similarity(Sc_w @ R.T, Tc_w, dim=-1).mean().item()
    return cos_pre, cos_post, S_vals
|
|
# Alignment between consecutive hidden states of the same model.
print(f"\n Layer-to-layer Procrustes within each model (layer N vs layer N+1):")
for name in ["clip_l14", "dinov2_b14", "siglip_b16"]:
    acts = layer_activations[name]
    n_layers = len(acts)
    print(f"\n {name} ({n_layers} layers):")
    print(f" {'L→L+1':>8} {'pre_cos':>8} {'post_cos':>9} {'sv_min':>8} {'sv_max':>8}")

    for li in range(n_layers - 1):
        # Only the first three, the middle and the last few transitions are
        # computed and shown.
        if li < 3 or li >= n_layers - 3 or li == n_layers // 2:
            pre, post, svs = procrustes_cos(acts[li], acts[li + 1])
            print(f" {li:>3}→{li+1:<3} {pre:>8.4f} {post:>9.4f} "
                  f"{svs.min():.4f} {svs.max():.4f}")
        elif li == 3:
            print(f" {'...':>8}")
|
|
|
|
| |
| |
| |
|
|
print(f"\n{'='*65}")
print("SCAN 14: CROSS-MODEL PROCRUSTES (per depth fraction)")
print(f"{'='*65}")

model_names = ["clip_l14", "dinov2_b14", "siglip_b16"]
n_layers_per = {n: len(layer_activations[n]) for n in model_names}

print(f"\n Layers: clip={n_layers_per['clip_l14']} dino={n_layers_per['dinov2_b14']} "
      f"siglip={n_layers_per['siglip_b16']}")

# Relative depths 0%, 10%, ..., 100%.
fracs = [i / 10 for i in range(11)]

print(f"\n {'frac':>5} {'clip×dino':>10} {'clip×dino':>10} {'clip×sig':>10} "
      f"{'clip×sig':>10} {'dino×sig':>10} {'dino×sig':>10}")
print(f" {'':>5} {'pre':>10} {'POST':>10} {'pre':>10} {'POST':>10} "
      f"{'pre':>10} {'POST':>10}")
print(f" {'-'*67}")

pairs = [("clip_l14", "dinov2_b14"), ("clip_l14", "siglip_b16"),
         ("dinov2_b14", "siglip_b16")]

for frac in fracs:
    # Hidden state of each model at this relative depth.
    results = {}
    for n in model_names:
        nl = n_layers_per[n]
        idx = min(int(frac * (nl - 1)), nl - 1)
        results[n] = layer_activations[n][idx]

    dims = {n: results[n].shape[-1] for n in model_names}
    d_min = min(dims.values())

    # Wider models are projected onto their leading principal directions.
    projected = {}
    for n in model_names:
        if dims[n] == d_min:
            projected[n] = results[n]
            continue
        r = results[n].float()
        rc = r - r.mean(0, keepdim=True)
        _, _, Vt = torch.linalg.svd(rc, full_matrices=False)
        # With fewer samples than features the SVD yields fewer than d_min
        # directions, so the projected width can end up below d_min.
        n_comp = min(d_min, Vt.shape[0])
        projected[n] = r @ Vt[:n_comp].T

    # NOTE(review): the models that were not projected are cut to their first
    # d_common raw coordinates here, while projected ones keep principal
    # components -- the two reductions are not equivalent.
    d_common = min(projected[n].shape[-1] for n in model_names)
    for n in model_names:
        if projected[n].shape[-1] > d_common:
            projected[n] = projected[n][:, :d_common]

    line = f" {frac:>4.0%} "
    for n1, n2 in pairs:
        pre, post, _ = procrustes_cos(projected[n1], projected[n2])
        line += f" {pre:>9.4f} {post:>9.4f}"
    print(line)
|
|
| |
# Same alignment on the final pooled embeddings, truncated to the common width.
print(f"\n Final output (pooled, L2-normed) Procrustes:")
# Each unordered pair once, by list position rather than by comparing the
# name strings.
for i, n1 in enumerate(model_names):
    for n2 in model_names[i + 1:]:
        d_min = min(pooled_outputs[n1].shape[1], pooled_outputs[n2].shape[1])
        p1 = pooled_outputs[n1][:, :d_min]
        p2 = pooled_outputs[n2][:, :d_min]
        pre, post, svs = procrustes_cos(p1, p2)
        print(f" {n1} × {n2}: pre={pre:.4f} POST={post:.4f} "
              f"sv_range=[{svs.min():.4f}, {svs.max():.4f}]")
|
|
|
|
| |
| |
| |
|
|
# Section header for scan 15.
print("\n" + "=" * 65)
print("SCAN 15: ACTIVATION CV PER LAYER")
print("=" * 65)
|
|
def cv_metric_act(emb, n_samples=200):
    """Coefficient of variation of random 5-point simplex volumes of ``emb``.

    Same statistic as ``cv_metric_on_weights`` (rows L2-normalised, five rows
    drawn per sample, ``std / mean`` of the volumes) with a default of 200
    samples; the computation is delegated to that function instead of being
    duplicated here.

    Args:
        emb: ``(B, d)`` activations, one row per sample.
        n_samples: Number of random 5-row subsets to draw.

    Returns:
        The coefficient of variation as a float, or ``0.0`` when fewer than
        five rows are available.
    """
    if emb.shape[0] < 5:
        return 0.0
    return cv_metric_on_weights(emb, n_samples=n_samples)
|
|
print(f"\n {'model':<15} {'layer':>6} {'CV':>8} {'norm_μ':>8} {'norm_σ':>8} {'eff_dim':>8}")
print(f" {'-'*55}")

for name in model_names:
    acts = layer_activations[name]
    n_layers = len(acts)
    for li in range(n_layers):
        # First two, last two, middle, and every fourth layer.
        shown = li < 2 or li >= n_layers - 2 or li == n_layers // 2 or li % 4 == 0
        if shown:
            a = acts[li][:200]  # at most the first 200 samples
            cv = cv_metric_act(a)
            norms = a.norm(dim=-1)
            centered = a - a.mean(0, keepdim=True)
            sv = torch.linalg.svdvals(centered)
            # Same (sum s)^2 / sum s^2 measure as used for the weight spectra.
            eff_dim = ((sv.sum() ** 2) / (sv.pow(2).sum() + 1e-12)).item()
            print(f" {name:<15} {li:>6} {cv:>8.4f} {norms.mean():>8.3f} "
                  f"{norms.std():>8.4f} {eff_dim:>8.1f}")
        elif li == 2 and li < n_layers - 2:
            print(f" {name:<15} {'...':>6}")
    print()
|
|
|
|
| |
| |
| |
|
|
print(f"\n{'='*65}")
print("SCAN 16: PER-IMAGE AGREEMENT ANALYSIS")
print(f"{'='*65}")

# Histogram edges over the full cosine range. Bins starting at 0 would drop
# every image whose cosine is negative.
_cos_edges = [i / 10 for i in range(-10, 11)]

for i, n1 in enumerate(model_names):
    for n2 in model_names[i + 1:]:
        # Compare the raw pooled embeddings on their shared leading coordinates.
        d_min = min(pooled_outputs[n1].shape[1], pooled_outputs[n2].shape[1])
        p1 = F.normalize(pooled_outputs[n1][:, :d_min], dim=-1)
        p2 = F.normalize(pooled_outputs[n2][:, :d_min], dim=-1)
        per_image_cos = F.cosine_similarity(p1, p2, dim=-1)
        print(f"\n {n1} × {n2}:")
        print(f" Raw per-image cos: mean={per_image_cos.mean():.4f} "
              f"std={per_image_cos.std():.4f} "
              f"min={per_image_cos.min():.4f} max={per_image_cos.max():.4f}")

        # Clamp so rounding just outside [-1, 1] still lands in an edge bin.
        hist = torch.histogram(per_image_cos.cpu().clamp(-1.0, 1.0),
                               bins=torch.tensor(_cos_edges))
        nonzero = [(f"[{_cos_edges[b]:+.1f}, {_cos_edges[b + 1]:+.1f}]", int(count.item()))
                   for b, count in enumerate(hist.hist) if count > 0]
        print(f" Distribution: {nonzero}")
|
|
|
|
print("\n" + "=" * 65)
print("FULL ANALYSIS COMPLETE")
print("=" * 65)

# Drop the large objects, then collect garbage and empty the CUDA cache.
del models
del layer_activations
del pooled_outputs
gc.collect()
torch.cuda.empty_cache()