AbstractPhil committed on
Commit
8ef0f56
·
verified ·
1 Parent(s): eaad7fb

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +196 -189
README.md CHANGED
@@ -7,128 +7,32 @@ license: mit
7
  # Geometric Terrain Statistics Composite
8
 
9
  Such a quaint little tool.
10
-
11
- ```
12
- class GeometricResidualModulator(nn.Module):
13
- def __init__(self, d_model=512, vocab_size=32128, n_geometric_dims=64,
14
- initial_alpha=0.01, n_layers=6):
15
- super().__init__()
16
- self.d_model = d_model
17
- self.n_geometric_dims = n_geometric_dims
18
- self.geometric_embed = nn.Embedding(vocab_size, n_geometric_dims)
19
- self.proj = nn.Linear(n_geometric_dims, d_model, bias=False)
20
- logit = math.log(initial_alpha / (1 - initial_alpha))
21
- self.alpha = nn.Parameter(torch.full((n_layers,), logit))
22
- nn.init.normal_(self.proj.weight, std=0.01)
23
-
24
- def forward(self, residual, token_ids, layer_idx=0):
25
- geo = self.geometric_embed(token_ids)
26
- geo_projected = self.proj(geo)
27
- a = torch.sigmoid(self.alpha[layer_idx])
28
- return (1 - a) * residual + a * geo_projected
29
-
30
- def geometric_residuals(self):
31
- W = self.geometric_embed.weight
32
- W_n = F.normalize(W, dim=1)
33
- idx = torch.randperm(min(W.shape[0], 5000))[:5000]
34
- sample = W_n[idx]
35
- cos_mat = sample @ sample.T
36
- tri = torch.triu_indices(len(idx), len(idx), offset=1)
37
- flat_cos = cos_mat[tri[0], tri[1]]
38
- norms = W.norm(dim=1)
39
- centered = W - W.mean(dim=0)
40
- cov = (centered.T @ centered) / W.shape[0]
41
- eigvals = torch.linalg.eigvalsh(cov)
42
- pr = (eigvals.sum() ** 2) / (eigvals ** 2).sum()
43
- return {
44
- 'cos_mean': flat_cos.mean().item(),
45
- 'cos_std': flat_cos.std().item(),
46
- 'norm_mean': norms.mean().item(),
47
- 'pr_over_dim': (pr / self.n_geometric_dims).item(),
48
- 'alpha': torch.sigmoid(self.alpha).detach().cpu().numpy(),
49
- }
50
-
51
-
52
- class ModulatedT5Encoder(nn.Module):
53
- def __init__(self, t5_encoder, modulator, modulate_layers=None):
54
- super().__init__()
55
- self.encoder = t5_encoder
56
- self.modulator = modulator
57
- if modulate_layers is None:
58
- modulate_layers = list(range(len(t5_encoder.block)))
59
- self.modulate_layers = set(modulate_layers)
60
-
61
- def forward(self, input_ids, attention_mask=None, output_hidden_states=False, **kwargs):
62
- hidden_states = self.encoder.embed_tokens(input_ids)
63
- hidden_states = self.encoder.dropout(hidden_states)
64
-
65
- if attention_mask is not None:
66
- extended_attention_mask = attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
67
- extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(hidden_states.dtype).min
68
- else:
69
- extended_attention_mask = None
70
-
71
- all_hidden_states = [hidden_states] if output_hidden_states else None
72
- position_bias = None
73
- seq_length = input_ids.shape[1]
74
- cache_position = torch.arange(seq_length, device=input_ids.device)
75
-
76
- for i, block in enumerate(self.encoder.block):
77
- if i in self.modulate_layers:
78
- hidden_states = self.modulator(hidden_states, input_ids, layer_idx=i)
79
-
80
- block_output = block(hidden_states, attention_mask=extended_attention_mask,
81
- position_bias=position_bias, cache_position=cache_position)
82
- hidden_states = block_output[0]
83
-
84
- if position_bias is None:
85
- for out in block_output[1:]:
86
- if isinstance(out, torch.Tensor) and out.dim() == 4:
87
- position_bias = out
88
- break
89
-
90
- if output_hidden_states:
91
- all_hidden_states.append(hidden_states)
92
-
93
- hidden_states = self.encoder.final_layer_norm(hidden_states)
94
- hidden_states = self.encoder.dropout(hidden_states)
95
-
96
- if output_hidden_states:
97
- all_hidden_states.append(hidden_states)
98
-
99
- return type('Output', (), {
100
- 'last_hidden_state': hidden_states,
101
- 'hidden_states': tuple(all_hidden_states) if all_hidden_states else None,
102
- })()
103
-
104
-
105
- N_GEO = 64
106
- modulator = GeometricResidualModulator(
107
- d_model=512, vocab_size=32128, n_geometric_dims=N_GEO,
108
- initial_alpha=0.5, n_layers=6,
109
- ).to(device)
110
-
111
- mod_encoder = ModulatedT5Encoder(
112
- t5_encoder=model.encoder, modulator=modulator,
113
- modulate_layers=[0, 1, 2, 3, 4, 5],
114
- )
115
-
116
- ```
117
-
118
 
119
  ## Document Purpose
120
 
121
- Running catalog of geometric measurements across language models. Each metric includes its formula, measurement process, and cross-model results. Designed for expansion as new models and experiments are added.
122
 
123
  ---
124
 
125
  ## I. Models Profiled
126
 
127
- | Model | Params | Vocab | Hidden Dim | Layers | Architecture | Training Data |
128
- |---|---|---|---|---|---|---|
129
- | T5-Small | 60.5M | 32,128 | 512 | 6+6 enc-dec | Transformer (relative PE) | C4 |
130
- | Qwen3.5-0.8B | 853M (752M LM + 100M ViT) | 248,320 | 1024 | DeltaNet + MoE | Multilingual + Vision |
131
- | Qwen3.5-4B | ~4B | 248,320 | 2560 | DeltaNet + MoE | Multilingual + Vision |
 
 
 
 
 
 
 
 
 
 
 
132
 
133
  ---
134
 
@@ -172,7 +76,7 @@ Running catalog of geometric measurements across language models. Each metric in
172
  | Qwen3.5-0.8B | 0.627 | 0.062 | 0.347 | 1.057 |
173
  | Qwen3.5-4B | 0.656 | 0.067 | 0.400 | 1.091 |
174
 
175
- **Note:** T5 embeddings are unnormalized (large magnitudes). Qwen embeddings are near-unit norm. This affects downstream metric scaling but not relational structure.
176
 
177
  ---
178
 
@@ -194,7 +98,7 @@ Vol² = (-1)⁡ · det(D) / (2⁴ · (4!)²) = -det(D) / 9216
194
 Vol = √(Vol²) if Vol² > 0, else invalid
195
  ```
196
 
197
- **Process:** Sample 1000 random 5-token subsets. Compute Cayley-Menger volume for each. Compare to random Gaussian baseline (same norm distribution). Report CV (coefficient of variation = std/mean) and embed/random ratio.
198
 
199
  | Model | Valid/1000 | CV | Embed/Random Ratio |
200
  |---|---|---|---|
@@ -210,10 +114,9 @@ Vol = √(Vol²) if Vol² > 0, else invalid
210
 
211
  **Process (Qwen 0.8B vs 4B):** PCA 4B embeddings (2560β†’1024), Procrustes alignment using 10K anchor tokens, evaluate on 5K held-out tokens.
212
 
213
- | Comparison | Relational Pearson | Digit Structure Pearson |
214
  |---|---|---|
215
- | Qwen 0.8B vs 4B (raw) | 0.920 | 0.904 |
216
- | Qwen 0.8B vs 4B (Procrustes) | higher (post-alignment) | β€” |
217
 
218
  **Finding:** Models at different scales learn the same relational geometry (r=0.92).
219
 
@@ -225,21 +128,15 @@ Vol = √(Vol²) if Vol² > 0, else invalid
225
 
226
 **Formula:** For digit tokens '0'–'9', compute all 45 pairwise cosines. Measure Pearson correlation between |i−j| (numerical distance) and cosine similarity.
227
 
228
- **Process:** Encode each digit as single token, extract embedding, normalize, compute pairwise cosine matrix.
229
-
230
  | Model | |iβˆ’j| Correlation | Adjacent Mean | Non-Adjacent Mean | Gap |
231
  |---|---|---|---|---|
232
  | T5-Small | -0.575 | 0.622 | 0.442 | 0.180 |
233
  | Qwen3.5-0.8B | -0.862 | 0.769 | 0.678 | 0.091 |
234
  | Qwen3.5-4B | -0.871 | 0.790 | 0.731 | 0.059 |
235
 
236
- **Finding:** All models encode a number line. Stronger in Qwen (more training data). T5 has wider gap (adjacent vs non-adjacent more differentiated) despite weaker overall correlation.
237
-
238
- ### IV.2 Semantic Category Clustering
239
 
240
- **Formula:** For tokens in a semantic category, compute mean intra-category pairwise cosine. Compare to global mean pairwise cosine. Lift = intra βˆ’ global.
241
-
242
- **Process (T5-Small):** 8 hand-curated categories (animals, colors, numbers, body, food, emotions, actions, time), single-token words only.
243
 
244
  | Category | N tokens | Intra Cosine | Global | Lift |
245
  |---|---|---|---|---|
@@ -285,8 +182,6 @@ Vol = √(Vol²) if Vol² > 0, else invalid
285
 
286
  ### V.3 Encoder Distance Bands
287
 
288
- **Process:** Group WordNet token pairs by path similarity ranges. Measure mean cosine in each band.
289
-
290
  | WN Similarity Band | N pairs | Static Cosine | Encoder Cosine | Lift |
291
  |---|---|---|---|---|
292
  | [0.50, 0.90) | 23 | 0.244 | 0.728 | +0.484 |
@@ -296,79 +191,148 @@ Vol = √(Vol²) if Vol² > 0, else invalid
296
 
297
  ### V.4 Hypernym Chain Decay
298
 
299
- **Process:** Find WordNet synsets forming hypernym chains (e.g., dog→canine→mammal→organism). Measure cosine between root and ancestor at each depth.
300
-
301
  | Depth | Static Cosine | Encoder Cosine |
302
  |---|---|---|
303
  | 1 | 0.160 | 0.656 |
304
- | 2 | 0.090 | 0.620 |
305
  | 3 | 0.075 | 0.594 |
306
  | 5 | 0.069 | 0.585 |
307
  | 7 | 0.068 | 0.579 |
308
 
309
- **Finding:** Monotonic decay in both spaces. Encoder has much stronger signal and cleaner gradient.
310
-
311
  ---
312
 
313
- ## VI. Inactive Weight Topology (T5-Small / T5-Base)
314
 
315
- ### VI.1 SVD Effective Rank
316
 
317
- **Formula:** Stable rank = ‖W‖²_F / ‖W‖²₂ = Σσᵢ² / σ₁². Measures effective rank without thresholding.
318
 
319
- **Process:** SVD every 2D weight matrix. Report stable rank, participation ratio, active fraction (σᵢ > 0.01·σ₁), and condition number (σ₁/σₙ).
320
 
321
- | Weight Type | Stable Rank (Small) | Stable Rank (Base) |
322
- |---|---|---|
323
- | self_attn_q | 47.6 Β± 16.4 | 58.1 Β± 17.2 |
324
- | self_attn_k | 53.2 Β± 9.2 | 62.4 Β± 18.3 |
325
- | self_attn_v | 75.3 | 97.5 |
326
- | mlp_wi | 15.2 Β± 3.8 | 20.6 Β± 4.9 |
327
- | mlp_wo | 31.3 | 43.9 |
 
 
328
 
329
- ### VI.2 Sparsity Topology
 
 
 
 
 
 
330
 
331
- **Formula:** Fraction of |wᵢⱼ| below threshold.
332
 
333
- | Weight Type | <0.1 (Small) | <0.1 (Base) |
 
 
334
  |---|---|---|
335
- | self_attn_q | **93.7%** | **99.4%** |
336
- | self_attn_k | 19.2% | 30.0% |
337
- | self_attn_v | 12.1% | 16.2% |
338
- | mlp_wi | 11.9% | 16.9% |
339
- | Full model | 18.4% | 27.9% |
 
 
 
340
 
341
- **Finding:** Q matrices are overwhelmingly sparse. The query projection is >93% empty. K matrices are dense. This asymmetry grows with scale. The Q null space is the intervention point for geometric modulation.
 
 
 
 
 
 
 
 
 
 
 
 
 
342
 
343
  ### VI.3 QK Similarity Manifold
344
 
345
 **Formula:** QK = W_Q · W_Kᵀ. Eigendecompose the symmetric part (QK + QKᵀ)/2. Positive eigenvalues = attraction directions. Negative eigenvalues = repulsion directions.
346
 
347
- **Process:** Compute per-layer. Track positive/negative balance and stable rank.
348
 
349
- | Layer (Encoder) | Stable Rank | Positive Eig | Negative Eig | Symmetry Dev |
350
- |---|---|---|---|---|
351
- | 0 | 39.5 | 315 | 197 | 0.993 |
352
- | 2 | 10.1 | 269 | 243 | 1.217 |
353
- | 5 | 5.35 | 274 | 238 | 1.252 |
 
 
 
 
 
354
 
355
- **Finding:** Similarity function narrows through depth (stable rank 39β†’5). Negative eigenvalue count increases β€” deeper layers define more anti-similarity boundaries.
 
 
 
 
 
 
356
 
357
  ### VI.4 MLP Dead Neurons
358
 
359
- **Formula:** Combined importance = ‖wᵢ_up‖₂ · ‖wᵢ_down‖₂. Dead if < 1% of mean.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
360
 
361
- **Finding:** Zero dead neurons across all layers, both encoder and decoder, at both Small and Base scale. T5 is parameter-starved β€” every neuron earns its keep.
362
 
363
- ### VI.5 Position Bias Topology
 
 
 
 
 
 
 
 
 
 
 
364
 
365
- **Process:** T5 uses learned relative position biases: [32 buckets, N heads]. Measure per-head: monotonicity, distance correlation, peak bucket.
366
 
367
- **Encoder (T5-Small):** 3 local heads (peak 0-1, negative dist_corr), 2 global heads (peak 17-18, positive dist_corr), 3 mixed.
 
 
 
 
 
 
368
 
369
- **Decoder (T5-Small):** 4 far-looking heads (peak 31, values up to +48), 4 local heads (peak 0-1, values down to -34.5). Extreme magnitude asymmetry β€” far-looking heads are 10Γ— stronger.
 
 
 
 
 
370
 
371
- **Finding:** This local/global split emerges identically across T5-Small, T5-Base. It's an architectural invariant.
372
 
373
  ---
374
 
@@ -384,12 +348,6 @@ Vol = √(Vol²) if Vol² > 0, else invalid
384
 
385
  ### VII.2 Geometric Embedding Initialization
386
 
387
- **Process:**
388
- 1. Build 3000Γ—3000 Wu-Palmer similarity matrix from WordNet anchors (~6 min)
389
- 2. Eigendecompose β†’ top 64 eigenvectors scaled by √eigenvalue β†’ 64-d embeddings
390
- 3. Project remaining tokens via GPU embedding cosine proxy (10-NN, softmax-weighted, <1 sec)
391
- 4. Procrustes align projection matrix to encoder PCA space
392
-
393
  | Metric | Value |
394
  |---|---|
395
  | WN reconstruction correlation | 0.921 |
@@ -398,8 +356,6 @@ Vol = √(Vol²) if Vol² > 0, else invalid
398
 
399
  ### VII.3 Alpha Convergence
400
 
401
- **Process:** Freeze T5, train only modulator (geometric embed + projection + alpha). Task: summarize definition β†’ lemma word. Track alpha per layer.
402
-
403
 | Start α | Final Mean α | Layer 5 Final | Pearson Δ | CV | Coherent | Basin |
404
  |---|---|---|---|---|---|---|
405
  | 0.01 (20 ep) | **0.067** | **0.107** | **+0.151** | **0.220** | **Yes** | Binding |
@@ -407,8 +363,6 @@ Vol = √(Vol²) if Vol² > 0, else invalid
407
  | 0.70 (20 ep) | 0.695 | 0.640 | -0.029 | 0.482 | No | Separation |
408
  | 0.01 (100 ep) | 0.125 | 0.218 | +0.074 | 0.322 | No | Overfit |
409
 
410
- **Finding:** Two stable attractor basins exist β€” binding (~0.07) and separation (~0.70). The binding basin produces functional results. Starting at 0.01 with early stopping (20 epochs) is optimal.
411
-
412
  ### VII.4 Depth Gradient (Consistent Across All Runs)
413
 
414
  | Layer | 20ep (Ξ±=0.01) | 100ep (Ξ±=0.01) | 20ep (Ξ±=0.20) |
@@ -435,9 +389,35 @@ Vol = √(Vol²) if Vol² > 0, else invalid
435
 
436
  ---
437
 
438
- ## VIII. The 0.29154 Constant
439
 
440
- ### VIII.1 Observations Across Systems
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
441
 
442
  | System | Context | Value |
443
  |---|---|---|
@@ -445,12 +425,13 @@ Vol = √(Vol²) if Vol² > 0, else invalid
445
  | Wormhole Lambda | Vision transformer training | Converges from 0.74 toward ~0.29 |
446
  | Alpha curriculum | Devil's Staircase PE training | Converges to ~0.50 under geometric loss, CE destroys |
447
  | T5 generation | Greedy decode alpha sweep | Stable plateau at 0.291–0.292, semantic phase transition |
 
448
 
449
- ### VIII.2 T5 Generation Phase Transition
450
 
451
  | Alpha | Output (triangle prompt) |
452
  |---|---|
453
- | 0.01–0.10 | "triangle is a polygon with three edges and three vertices. it is one of the basic shapes in geometry." |
454
  | 0.20 | "**a** triangle is a polygon with three edges and three vertices..." |
455
  | 0.28 | "a polygon with three vertices. it is one of the basic shapes in **a graph**." |
456
  | 0.291 | "a triangle is a polygon with a vertice and a vertice. it is one of the basic shapes in **a graph**." |
@@ -462,7 +443,7 @@ Vol = √(Vol²) if Vol² > 0, else invalid
462
 
463
  ---
464
 
465
- ## IX. Universal Geometric Constants
466
 
467
  | Constant | Value | Observed In |
468
  |---|---|---|
@@ -470,11 +451,16 @@ Vol = √(Vol²) if Vol² > 0, else invalid
470
  | Participation / dim | 0.53–0.56 | T5-Small, Qwen 0.8B |
471
  | Binding/separation constant | 0.29154 / 0.70846 | MinimalShunts, CLIP projections, T5 generation, alpha convergence |
472
  | Depth gradient | Monotonic increasing | All modulator training runs |
473
- | Q sparsity scaling | Increases with model scale | T5-Small (93.7%), T5-Base (99.4%) |
 
 
 
 
 
474
 
475
  ---
476
 
477
- ## X. Measurement Toolkit Reference
478
 
479
  | Tool | Input | Output | Requires Inference |
480
  |---|---|---|---|
@@ -484,12 +470,33 @@ Vol = √(Vol²) if Vol² > 0, else invalid
484
  | Digit Manifold | 10 digit token embeddings | |iβˆ’j| correlation, adjacency gap | No |
485
  | SVD Effective Rank | Any 2D weight matrix | Stable rank, condition number | No |
486
  | QK Manifold | W_Q, W_K matrices | Eigenspectrum, pos/neg balance | No |
487
- | Dead Neuron Count | MLP wi, wo matrices | Combined importance distribution | No |
 
 
 
488
  | WordNet Relational | Encoder output (mean-pooled) | Pearson/Spearman vs path similarity | Yes |
489
  | Alpha Convergence | Modulator training loop | Per-layer equilibrium values | Yes (training) |
490
 
491
  ---
492
 
493
- *Last updated: 2026-03-05*
494
- *Models profiled: 3 (T5-Small, Qwen3.5-0.8B, Qwen3.5-4B)*
495
- *Modulator experiments: 4 configurations*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  # Geometric Terrain Statistics Composite
8
 
9
  Such a quaint little tool.
10
+ # Geometric Terrain Statistics Composite
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  ## Document Purpose
13
 
14
+ Running catalog of geometric measurements across language and vision models. Each metric includes its formula, measurement process, and cross-model results. Designed for expansion as new models and experiments are added.
15
 
16
  ---
17
 
18
  ## I. Models Profiled
19
 
20
+ | Model | Params | Vocab | Hidden Dim | Layers | Heads | Architecture | Training |
21
+ |---|---|---|---|---|---|---|---|
22
+ | T5-Small | 60.5M | 32,128 | 512 | 6+6 | 8 | Enc-Dec (relative PE, ReLU MLP) | C4 span corruption |
23
+ | T5-Base | 222.9M | 32,128 | 768 | 12+12 | 12 | Enc-Dec (relative PE, ReLU MLP) | C4 span corruption |
24
+ | T5-v1.1-XXL | 11.4B | 32,128 | 4096 | 24+24 | 64 | Enc-Dec (relative PE, **GeGLU** MLP) | C4 (v1.1 variant, no multi-task) |
25
+ | BERT-large | 336.2M | 30,522 | 1024 | 24 | 16 | Encoder-only (absolute PE) | BookCorpus+Wikipedia MLM |
26
+ | CLIP-ViT-B/16 | 85.5M (visual) | — | 768 | 12 | 12 | Vision encoder (fused QKV) | LAION-2B contrastive |
27
+ | DINOv2-large | 302.0M | — | 1024 | 24 | 16 | Vision encoder (separate Q/K/V) | Self-supervised (no labels) |
28
+ | CLIP-ViT-bigG/14 | 1.84B (visual) | — | 1664 | 48 | 16 | Vision encoder (fused QKV) | LAION-2B contrastive |
29
+ | Qwen3.5-0.8B | 853M | 248,320 | 1024 | — | — | DeltaNet + MoE + ViT | Multilingual + Vision |
30
+ | Qwen3.5-4B | ~4B | 248,320 | 2560 | — | — | DeltaNet + MoE + ViT | Multilingual + Vision |
31
+
32
+ **Notes:**
33
+ - T5-v1.1-XXL encoder is the text encoder used by Flux.1 Schnell, Flux.1 Dev, and Flux.2
34
+ - CLIP models use fused QKV (`in_proj_weight`); Q/K/V split by thirds for analysis
35
+ - T5-v1.1 uses GeGLU (wi_0 gate + wi_1 value) instead of ReLU (single wi)
36
 
37
  ---
38
 
 
76
  | Qwen3.5-0.8B | 0.627 | 0.062 | 0.347 | 1.057 |
77
  | Qwen3.5-4B | 0.656 | 0.067 | 0.400 | 1.091 |
78
 
79
+ **Note:** T5 embeddings are unnormalized (large magnitudes). Qwen embeddings are near-unit norm.
80
 
81
  ---
82
 
 
98
 Vol = √(Vol²) if Vol² > 0, else invalid
99
  ```
100
 
101
+ **Process:** Sample 1000 random 5-token subsets. Compute Cayley-Menger volume for each. Report CV (coefficient of variation = std/mean).
102
 
103
  | Model | Valid/1000 | CV | Embed/Random Ratio |
104
  |---|---|---|---|
 
114
 
115
  **Process (Qwen 0.8B vs 4B):** PCA 4B embeddings (2560β†’1024), Procrustes alignment using 10K anchor tokens, evaluate on 5K held-out tokens.
116
 
117
+ | Comparison | Relational Pearson | Pentachoron per-simplex corr |
118
  |---|---|---|
119
+ | Qwen 0.8B vs 4B (raw) | 0.920 | 0.89 |
 
120
 
121
  **Finding:** Models at different scales learn the same relational geometry (r=0.92).
122
 
 
128
 
129
 **Formula:** For digit tokens '0'–'9', compute all 45 pairwise cosines. Measure Pearson correlation between |i−j| (numerical distance) and cosine similarity.
130
 
 
 
131
  | Model | |iβˆ’j| Correlation | Adjacent Mean | Non-Adjacent Mean | Gap |
132
  |---|---|---|---|---|
133
  | T5-Small | -0.575 | 0.622 | 0.442 | 0.180 |
134
  | Qwen3.5-0.8B | -0.862 | 0.769 | 0.678 | 0.091 |
135
  | Qwen3.5-4B | -0.871 | 0.790 | 0.731 | 0.059 |
136
 
137
+ ### IV.2 Semantic Category Clustering (T5-Small)
 
 
138
 
139
+ **Formula:** Mean intra-category pairwise cosine vs global mean pairwise cosine. Lift = intra βˆ’ global.
 
 
140
 
141
  | Category | N tokens | Intra Cosine | Global | Lift |
142
  |---|---|---|---|---|
 
182
 
183
  ### V.3 Encoder Distance Bands
184
 
 
 
185
  | WN Similarity Band | N pairs | Static Cosine | Encoder Cosine | Lift |
186
  |---|---|---|---|---|
187
  | [0.50, 0.90) | 23 | 0.244 | 0.728 | +0.484 |
 
191
 
192
  ### V.4 Hypernym Chain Decay
193
 
 
 
194
  | Depth | Static Cosine | Encoder Cosine |
195
  |---|---|---|
196
  | 1 | 0.160 | 0.656 |
 
197
  | 3 | 0.075 | 0.594 |
198
  | 5 | 0.069 | 0.585 |
199
  | 7 | 0.068 | 0.579 |
200
 
 
 
201
  ---
202
 
203
+ ## VI. Cross-Architecture Inactive Weight Topology
204
 
205
+ ### VI.1 Q/K/V Sparsity (<0.1 threshold)
206
 
207
+ **Formula:** Fraction of |wᵢⱼ| < 0.1 across all weights of that type.
208
 
209
+ **Process:** Iterate all 2D weight matrices, compute abs values, count below threshold. No inference needed.
210
 
211
+ | Model | Q | K | V | O | MLP | Full Model |
212
+ |---|---|---|---|---|---|---|
213
+ | **T5-Small** (512d, 6L) | **93.7%** | 19.2% | 12.1% | 10.4% | 11.9% | 18.4% |
214
+ | **T5-Base** (768d, 12L) | **99.4%** | 30.0% | 16.2% | 13.5% | 16.9% | 27.9% |
215
+ | **T5-v1.1-XXL** (4096d, 24L) | **100.0%** | **65.5%** | 73.1% | 65.4% | ~57% | β€” |
216
+ | BERT-large (1024d, 24L) | 99.1% | 99.1% | 99.9% | 99.9% | 99.4% | 99.3% |
217
+ | DINOv2-large (1024d, 24L) | 100.0% | 100.0% | 100.0% | 100.0% | 100.0% | 100.0% |
218
+ | CLIP-ViT-B/16 (768d, 12L) | β€” (fused) | β€” | β€” | β€” | 100.0% | 100.0% |
219
+ | CLIP-ViT-bigG (1664d, 48L) | β€” (fused) | β€” | β€” | β€” | ~97% | 98.0% |
220
 
221
+ **Key Finding β€” T5 Q/K Asymmetry Scales:**
222
+
223
+ | Model | Q (<0.1) | K (<0.1) | Q/K Ratio |
224
+ |---|---|---|---|
225
+ | T5-Small | 93.7% | 19.2% | **4.9Γ—** |
226
+ | T5-Base | 99.4% | 30.0% | **3.3Γ—** |
227
+ | T5-v1.1-XXL | 100.0% | 65.5% | **1.5Γ—** |
228
 
229
+ T5 has a genuine Q-specific sparsity that scales with model size. Q hit 100.0% at XXL (every single weight below 0.1). This is NOT the BERT/DINOv2 pattern where all weight types are uniformly sparse. The query projection in T5 is **functionally vestigial at scale**.
230
 
231
+ **T5-v1.1-XXL Encoder vs Decoder:**
232
+
233
+ | Component | Encoder | Decoder |
234
  |---|---|---|
235
+ | self_attn_q | 100.0% | 100.0% |
236
+ | self_attn_k | 71.7% | 59.4% |
237
+ | self_attn_v | 76.0% | 70.1% |
238
+ | cross_attn_q | β€” | 100.0% |
239
+ | cross_attn_k | β€” | 63.1% |
240
+ | cross_attn_v | β€” | 71.1% |
241
+
242
+ Q is 100% sparse everywhere β€” self-attention and cross-attention, encoder and decoder.
243
 
244
+ ### VI.2 SVD Effective Rank
245
+
246
+ **Formula:** Stable rank = ‖W‖²_F / ‖W‖²₂ = Σσᵢ² / σ₁². Measures effective rank without thresholding.
247
+
248
+ | Weight Type | T5-Small | T5-Base | T5-v1.1-XXL | BERT-large | DINOv2-large |
249
+ |---|---|---|---|---|---|
250
+ | self_attn_q | 47.6 | 58.1 | 96.8 | 50.8 | 57.7 |
251
+ | self_attn_k | 53.2 | 62.4 | 90.0 | 37.7 | 55.5 |
252
+ | self_attn_v | 75.3 | 97.5 | 204.4 | 113.0 | 94.8 |
253
+ | self_attn_o | 25.4 | 35.0 | 16.4 | 125.0 | 85.6 |
254
+ | mlp_up/gate | 15.2 | 20.6 | 67.9 (gate) / 247.3 (up) | 27.4 | 58.4 |
255
+ | mlp_down | 31.3 | 43.9 | 25.3 | 52.2 | 94.4 |
256
+
257
+ **T5-v1.1-XXL O matrices have very low stable rank (16.4)** β€” the output projection is extremely low-rank despite the 4096-d space. Cross-attention O is even lower at 6.1.
258
 
259
  ### VI.3 QK Similarity Manifold
260
 
261
 **Formula:** QK = W_Q · W_Kᵀ. Eigendecompose the symmetric part (QK + QKᵀ)/2. Positive eigenvalues = attraction directions. Negative eigenvalues = repulsion directions.
262
 
263
+ **Positive Eigenvalue Fraction Trends:**
264
 
265
+ | Model | First Layer | Last Layer | Trend |
266
+ |---|---|---|---|
267
+ | T5-Small encoder | 0.615 | 0.535 | **βˆ’0.080** (decreasing) |
268
+ | T5-v1.1-XXL encoder | 0.510 | 0.503 | **βˆ’0.007** (flat) |
269
+ | T5-v1.1-XXL decoder self | 0.501 | 0.548 | **+0.047** (increasing) |
270
+ | **T5-v1.1-XXL cross-attn** | **0.500** | **0.500** | **0.000 (locked)** |
271
+ | BERT-large | 0.446 | 0.513 | +0.066 (increasing) |
272
+ | CLIP-ViT-B/16 | 0.503 | 0.538 | +0.035 (increasing) |
273
+ | DINOv2-large | 0.498 | 0.548 | +0.050 (increasing) |
274
+ | CLIP-ViT-bigG | 0.498 | 0.582 | +0.084 (increasing) |
275
 
276
+ **Critical Finding β€” Cross-Attention is Perfectly Balanced:**
277
+
278
+ T5-v1.1-XXL cross-attention QK manifold is exactly 0.500 positive / 0.500 negative at ALL 24 layers. Symmetry deviation is 1.414 (= √2) everywhere. This is a locked equilibrium β€” the bridge between encoder and decoder maintains perfect balance between attraction and repulsion at every depth. No other attention type shows this level of stability.
279
+
280
+ **T5-v1.1-XXL encoder self-attention is flat (~0.50 throughout).** Unlike T5-Small which decreased from 0.615 to 0.535, the XXL encoder stays near the equilibrium point. The larger model doesn't need to build anti-similarity boundaries because it has enough capacity to discriminate through other mechanisms.
281
+
282
+ **BERT starts BELOW 0.50 (0.446).** The only model with majority-repulsion from layer 0. MLM bidirectional training creates fundamentally different QK geometry from autoregressive or contrastive training.
283
 
284
  ### VI.4 MLP Dead Neurons
285
 
286
+ **Formula:** Combined importance = ‖wᵢ_up‖₂ · ‖wᵢ_down‖₂ (ReLU) or ‖wᵢ_gate‖₂ · ‖wᵢ_up‖₂ · ‖wᵢ_down‖₂ (GeGLU). Dead if < 1% of mean.
287
+
288
+ | Model | Dead (<1% mean) | Weak (<10% mean) | Notes |
289
+ |---|---|---|---|
290
+ | T5-Small (enc+dec) | 0/24,576 (0.00%) | 0/24,576 (0.00%) | All neurons alive |
291
+ | T5-Base (enc+dec) | 0/73,728 (0.00%) | 0/73,728 (0.00%) | All neurons alive |
292
+ | T5-v1.1-XXL encoder | 0/245,760 (0.00%) | 0/245,760 (0.00%) | All neurons alive |
293
+ | T5-v1.1-XXL decoder | **14/245,760 (0.01%)** | **461/245,760 (0.19%)** | First dead neurons in T5 family |
294
+ | BERT-large | 0/98,304 (0.00%) | 0/98,304 (0.00%) | All neurons alive |
295
+ | DINOv2-large | 0/98,304 (0.00%) | 0/98,304 (0.00%) | All neurons alive |
296
+ | CLIP-ViT-B/16 | **1,316/36,864 (3.57%)** | 1,356/36,864 (3.68%) | Only model with significant dead neurons |
297
+ | CLIP-ViT-bigG | 0/393,216 (0.00%) | **24,163/393,216 (6.14%)** | 0 dead but 6% weak |
298
+
299
+ **Finding:** T5-v1.1-XXL decoder has the first dead neurons in the T5 family β€” 14 neurons in layers 1-2 only. The decoder's early GeGLU layers carved out a tiny amount of capacity. Encoder uses everything. CLIP-ViT-B/16 is the outlier with 3.6% dead neurons β€” contrastive training at small scale produces genuine pruning.
300
+
301
+ ### VI.5 Cross-Layer Weight Correlation
302
 
303
+ **Formula:** cos(flatten(Wα΅’), flatten(Wβ±Ό)) between weight matrices of the same type at different layers.
304
 
305
+ | Model | Q adj mean | K adj mean | MLP_up adj mean |
306
+ |---|---|---|---|
307
+ | T5-Small | ~0.000 | ~0.000 | 0.031–0.045 |
308
+ | T5-Base | ~0.000 | ~0.000 | 0.024–0.036 |
309
+ | T5-v1.1-XXL encoder | 0.0001 | β€” | β€” |
310
+ | T5-v1.1-XXL decoder | βˆ’0.0001 | β€” | β€” |
311
+ | BERT-large | 0.0002 | 0.0003 | 0.032 |
312
+ | CLIP-ViT-B/16 | βˆ’0.0004 (QKV) | β€” | 0.008 |
313
+ | DINOv2-large | βˆ’0.0003 | βˆ’0.0002 | 0.006 |
314
+ | CLIP-ViT-bigG | 0.0000 (QKV) | β€” | 0.055 |
315
+
316
+ **Universal finding:** Attention weights (Q, K, V) are completely uncorrelated across layers (~0.000). Every layer defines an independent similarity function. MLP weights show positive correlation decaying with distance β€” feedforward layers share structure.
317
 
318
+ ### VI.6 Position Bias Topology
319
 
320
+ **T5 uses learned relative position biases:** [32 buckets Γ— N_heads].
321
+
322
+ | Model | Encoder | Decoder |
323
+ |---|---|---|
324
+ | T5-Small (8 heads) | 3 local, 2 global, 3 mixed | 4 local, 4 global, 0 mixed |
325
+ | T5-Base (12 heads) | 4 local, 3 global, 5 mixed | 5 local, 4 global, 3 mixed |
326
+ | T5-v1.1-XXL (64 heads) | **24 local, 2 global, 38 mixed** | **27 local, 37 global, 0 mixed** |
327
 
328
+ **T5-v1.1-XXL position findings:**
329
+ - Encoder: 38/64 mixed heads β€” nuanced position sensitivity at scale
330
+ - **Decoder: ZERO mixed heads** β€” perfect binary crystallization. Every head is either pure local or pure global
331
+ - Decoder is 58% global (37/64) β€” overwhelmingly biased toward long-range attention
332
+ - Encoder range: [-47.2, 11.2] β€” strong local suppression
333
+ - Decoder range: [-28.4, 17.0] β€” more balanced
334
 
335
+ **Finding:** The decoder local/global binary split is scale-invariant (0 mixed at T5-Small, 0 mixed at XXL). Gradient descent crystallizes decoder position heads into two pure modes regardless of capacity.
336
 
337
  ---
338
 
 
348
 
349
  ### VII.2 Geometric Embedding Initialization
350
 
 
 
 
 
 
 
351
  | Metric | Value |
352
  |---|---|
353
  | WN reconstruction correlation | 0.921 |
 
356
 
357
  ### VII.3 Alpha Convergence
358
 
 
 
359
 | Start α | Final Mean α | Layer 5 Final | Pearson Δ | CV | Coherent | Basin |
360
  |---|---|---|---|---|---|---|
361
  | 0.01 (20 ep) | **0.067** | **0.107** | **+0.151** | **0.220** | **Yes** | Binding |
 
363
  | 0.70 (20 ep) | 0.695 | 0.640 | -0.029 | 0.482 | No | Separation |
364
  | 0.01 (100 ep) | 0.125 | 0.218 | +0.074 | 0.322 | No | Overfit |
365
 
 
 
366
  ### VII.4 Depth Gradient (Consistent Across All Runs)
367
 
368
  | Layer | 20ep (Ξ±=0.01) | 100ep (Ξ±=0.01) | 20ep (Ξ±=0.20) |
 
389
 
390
  ---
391
 
392
+ ## VIII. Geometric Field Modulator (Multi-Expert)
393
 
394
+ ### VIII.1 Architecture
395
+
396
+ - Three KSimplexChannel experts: k=1 (edge, 2 features), k=2 (triangle, 4 features), k=4 (pentachoron, 11 features)
397
+ - **Multiplicative gating**: residual × Π(blended_gates) — valid regions pass, invalid suppressed
398
+ - **Soft blending**: per expert gate = (1 − α) + α × expert_gate
399
+ - **Null space**: 25% of residual dimensions untouched by modulator
400
+ - **Alpha clamped**: [0.001, 0.35] β€” hard ceiling below the phase boundary
401
+ - **Gradient scaling**: geometric params at 10% LR, alpha at 50% LR, gates at full LR
402
+ - Params: **38,552** (0.064% of T5-Small)
403
+ - Self-test: validity=0.985, null space preserved, template volumes sane
404
+
405
+ ### VIII.2 Design Rationale (Grounded in Cross-Architecture Data)
406
+
407
+ | Data Point | Design Decision |
408
+ |---|---|
409
+ | Q sparsity 100% at scale | Geometric field can replace Q β€” the model barely uses it |
410
+ | Cross-attn QK locked at 0.500 | Target equilibrium for geometric validity gating |
411
+ | Depth gradient always increasing | Per-layer alpha respects this (low early, high late) |
412
+ | Zero dead MLP neurons | Don't touch MLPs β€” all capacity is in use |
413
+ | Decoder position: binary L/G split | Modulator preserves positional structure (null space) |
414
+ | CV 0.20–0.23 universal | CV monitoring as health check, not loss |
415
+
416
+ ---
417
+
418
+ ## IX. The 0.29154 Constant
419
+
420
+ ### IX.1 Observations Across Systems
421
 
422
  | System | Context | Value |
423
  |---|---|---|
 
425
  | Wormhole Lambda | Vision transformer training | Converges from 0.74 toward ~0.29 |
426
  | Alpha curriculum | Devil's Staircase PE training | Converges to ~0.50 under geometric loss, CE destroys |
427
  | T5 generation | Greedy decode alpha sweep | Stable plateau at 0.291–0.292, semantic phase transition |
428
+ | Alpha training basins | 0.70 start → settled at 0.695 | Mirror constant 1 − 0.29154 = 0.70846, Δ = 0.013 |
429
 
430
+ ### IX.2 T5 Generation Phase Transition
431
 
432
  | Alpha | Output (triangle prompt) |
433
  |---|---|
434
+ | 0.01–0.10 | "...three edges and three vertices. it is one of the basic shapes in geometry." |
435
  | 0.20 | "**a** triangle is a polygon with three edges and three vertices..." |
436
  | 0.28 | "a polygon with three vertices. it is one of the basic shapes in **a graph**." |
437
  | 0.291 | "a triangle is a polygon with a vertice and a vertice. it is one of the basic shapes in **a graph**." |
 
443
 
444
  ---
445
 
446
+ ## X. Universal Geometric Constants
447
 
448
  | Constant | Value | Observed In |
449
  |---|---|---|
 
451
  | Participation / dim | 0.53–0.56 | T5-Small, Qwen 0.8B |
452
  | Binding/separation constant | 0.29154 / 0.70846 | MinimalShunts, CLIP projections, T5 generation, alpha convergence |
453
  | Depth gradient | Monotonic increasing | All modulator training runs |
454
+ | Q sparsity scaling (T5) | 93.7% β†’ 99.4% β†’ 100.0% | T5-Small β†’ T5-Base β†’ T5-v1.1-XXL |
455
+ | Cross-attn QK balance | Locked at 0.500 | T5-v1.1-XXL (all 24 layers) |
456
+ | Attention cross-layer corr | ~0.000 | ALL models profiled (8 models) |
457
+ | MLP cross-layer corr | 0.006–0.055 (positive, decays) | ALL models profiled |
458
+ | Decoder position crystallization | 0 mixed heads | T5-Small, T5-v1.1-XXL |
459
+ | MLP full utilization | 0.00% dead neurons | T5 family (enc), BERT, DINOv2 |
460
 
461
  ---
462
 
463
+ ## XI. Measurement Toolkit Reference
464
 
465
  | Tool | Input | Output | Requires Inference |
466
  |---|---|---|---|
 
470
  | Digit Manifold | 10 digit token embeddings | |iβˆ’j| correlation, adjacency gap | No |
471
  | SVD Effective Rank | Any 2D weight matrix | Stable rank, condition number | No |
472
  | QK Manifold | W_Q, W_K matrices | Eigenspectrum, pos/neg balance | No |
473
+ | Dead Neuron Count | MLP wi/gate/up, wo matrices | Combined importance distribution | No |
474
+ | Cross-Layer Correlation | Same-type weight matrices | Adjacent cosine similarity | No |
475
+ | Position Bias Topology | Relative attention bias tensor | Local/global/mixed head counts | No |
476
+ | Sparsity Topology | Any weight matrix | Fraction below threshold | No |
477
  | WordNet Relational | Encoder output (mean-pooled) | Pearson/Spearman vs path similarity | Yes |
478
  | Alpha Convergence | Modulator training loop | Per-layer equilibrium values | Yes (training) |
479
 
480
  ---
481
 
482
+ ## XII. Scripts Reference
483
+
484
+ | Script | Purpose | Key Outputs |
485
+ |---|---|---|
486
+ | `probe_t5_small_terrain.py` | T5-Small embedding + layer geometry | PR, CV, digit manifold, layer evolution |
487
+ | `probe_t5_wordnet_summarize.py` | T5-Small Γ— WordNet relational alignment | Pearson, Spearman, distance bands, hypernym decay |
488
+ | `probe_t5_wordnet_50seeds.py` | 50-seed stability test (GPU-accelerated) | Confidence intervals for all relational metrics |
489
+ | `probe_t5_inactive_weights.py` | T5-Small/Base inactive weight topology | SVD, sparsity, QK manifold, dead neurons |
490
+ | `cross_architecture_weight_battery.py` | BERT + CLIP + DINOv2 battery | Cross-model comparison table |
491
+ | `probe_flux_t5_g4.py` | T5-v1.1-XXL (Flux encoder) full battery | All layers, encoder + decoder + cross-attn |
492
+ | `geometric_residual_modulator.py` | LERP modulator + training utilities | Modulator class + measurement tools |
493
+ | `geometric_field_modulator.py` | Multi-expert field modulator | KSimplex experts + multiplicative gating |
494
+ | `geometric_modulator_full_pipeline.py` | Self-contained T5 + WordNet + modulator | End-to-end pipeline |
495
+ | `train_modulator.py` | Training loop for alpha convergence | Freeze T5, train modulator, track alpha |
496
+
497
+ ---
498
+
499
+ *Last updated: 2026-03-06*
500
+ *Models profiled: 9 (T5-Small, T5-Base, T5-v1.1-XXL, BERT-large, CLIP-ViT-B/16, DINOv2-large, CLIP-ViT-bigG, Qwen3.5-0.8B, Qwen3.5-4B)*
501
+ *Cross-architecture battery: 7 models, 4 training objectives (MLM, span corruption, contrastive, self-supervised)*
502
+ *Modulator experiments: 4 LERP configurations, 1 field modulator*