Update README.md
Browse files
README.md
CHANGED
|
@@ -7,128 +7,32 @@ license: mit
|
|
| 7 |
# Geometric Terrain Statistics Composite
|
| 8 |
|
| 9 |
Such a quaint little tool.
|
| 10 |
-
|
| 11 |
-
```
|
| 12 |
-
class GeometricResidualModulator(nn.Module):
    """Blend a learned low-dimensional "geometric" token embedding into a
    transformer residual stream.

    Per layer, the mix is controlled by a learnable scalar alpha (stored as
    a logit so the effective weight stays in (0, 1)):

        out = (1 - a) * residual + a * proj(geometric_embed(token_ids))

    Args:
        d_model: Width of the residual stream being modulated.
        vocab_size: Number of token ids covered by the geometric embedding.
        n_geometric_dims: Dimensionality of the geometric embedding space.
        initial_alpha: Initial effective blend weight in (0, 1); converted
            to a logit so sigmoid(alpha) == initial_alpha at init.
        n_layers: Number of layers, one alpha logit per layer.
    """

    def __init__(self, d_model=512, vocab_size=32128, n_geometric_dims=64,
                 initial_alpha=0.01, n_layers=6):
        super().__init__()
        self.d_model = d_model
        self.n_geometric_dims = n_geometric_dims
        self.geometric_embed = nn.Embedding(vocab_size, n_geometric_dims)
        self.proj = nn.Linear(n_geometric_dims, d_model, bias=False)
        # Store alpha as a logit so sigmoid keeps the blend weight in (0, 1).
        logit = math.log(initial_alpha / (1 - initial_alpha))
        self.alpha = nn.Parameter(torch.full((n_layers,), logit))
        # Small init keeps the injected signal gentle at the start of training.
        nn.init.normal_(self.proj.weight, std=0.01)

    def forward(self, residual, token_ids, layer_idx=0):
        """Return the residual blended with the projected geometric embedding.

        Args:
            residual: Residual-stream tensor, shape (..., d_model).
            token_ids: Long tensor of token ids matching residual's leading
                dimensions.
            layer_idx: Which per-layer alpha logit to use.
        """
        geo = self.geometric_embed(token_ids)
        geo_projected = self.proj(geo)
        a = torch.sigmoid(self.alpha[layer_idx])
        return (1 - a) * residual + a * geo_projected

    def geometric_residuals(self):
        """Return summary statistics of the geometric embedding table.

        Keys: pairwise-cosine mean/std over a uniform random sample of up to
        5000 rows, mean row norm, participation ratio normalized by the
        embedding dimension, and per-layer sigmoid(alpha) as a numpy array.
        """
        with torch.no_grad():  # diagnostics only; no autograd graph needed
            W = self.geometric_embed.weight
            W_n = F.normalize(W, dim=1)
            # Sample up to 5000 rows uniformly from the WHOLE table.
            # (Previous randperm(min(n, 5000))[:5000] only permuted the first
            # 5000 rows, so the sample was biased toward low token ids.)
            idx = torch.randperm(W.shape[0])[:5000]
            sample = W_n[idx]
            cos_mat = sample @ sample.T
            tri = torch.triu_indices(len(idx), len(idx), offset=1)
            flat_cos = cos_mat[tri[0], tri[1]]
            norms = W.norm(dim=1)
            centered = W - W.mean(dim=0)
            cov = (centered.T @ centered) / W.shape[0]
            eigvals = torch.linalg.eigvalsh(cov)
            # Participation ratio: effective number of active dimensions.
            pr = (eigvals.sum() ** 2) / (eigvals ** 2).sum()
            return {
                'cos_mean': flat_cos.mean().item(),
                'cos_std': flat_cos.std().item(),
                'norm_mean': norms.mean().item(),
                'pr_over_dim': (pr / self.n_geometric_dims).item(),
                'alpha': torch.sigmoid(self.alpha).detach().cpu().numpy(),
            }
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
class ModulatedT5Encoder(nn.Module):
    """Wrap a HuggingFace T5 encoder stack, injecting a geometric modulator
    into the residual stream before selected layers.

    The wrapped ``t5_encoder`` is expected to expose ``embed_tokens``,
    ``dropout``, ``block`` (list of layer blocks), and ``final_layer_norm``
    — i.e. the attribute layout of transformers' T5Stack (assumption based
    on the attributes accessed below; verify against the installed
    transformers version).
    """

    def __init__(self, t5_encoder, modulator, modulate_layers=None):
        # modulate_layers: iterable of layer indices to modulate;
        # None means modulate every layer in the stack.
        super().__init__()
        self.encoder = t5_encoder
        self.modulator = modulator
        if modulate_layers is None:
            modulate_layers = list(range(len(t5_encoder.block)))
        self.modulate_layers = set(modulate_layers)

    def forward(self, input_ids, attention_mask=None, output_hidden_states=False, **kwargs):
        """Run the encoder, applying the modulator before each selected block.

        Returns an ad-hoc object with ``last_hidden_state`` and (optionally)
        ``hidden_states``, duck-typing the transformers output interface.
        Extra ``**kwargs`` are accepted for call-site compatibility but
        ignored.
        """
        hidden_states = self.encoder.embed_tokens(input_ids)
        hidden_states = self.encoder.dropout(hidden_states)

        if attention_mask is not None:
            # Expand the (batch, seq) mask to (batch, 1, 1, seq) additive
            # form: 0 where attended, large negative where masked.
            extended_attention_mask = attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
            extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(hidden_states.dtype).min
        else:
            extended_attention_mask = None

        all_hidden_states = [hidden_states] if output_hidden_states else None
        # T5 computes a relative position bias in the first block and shares
        # it with later blocks; start with None and capture it below.
        position_bias = None
        seq_length = input_ids.shape[1]
        cache_position = torch.arange(seq_length, device=input_ids.device)

        for i, block in enumerate(self.encoder.block):
            if i in self.modulate_layers:
                # Blend the geometric embedding into the residual stream
                # BEFORE this block runs.
                hidden_states = self.modulator(hidden_states, input_ids, layer_idx=i)

            block_output = block(hidden_states, attention_mask=extended_attention_mask,
                                 position_bias=position_bias, cache_position=cache_position)
            hidden_states = block_output[0]

            if position_bias is None:
                # Recover the shared position bias from the first block's
                # extra outputs: assumed to be the only 4-D tensor returned
                # — TODO confirm against the installed transformers
                # T5Block output ordering.
                for out in block_output[1:]:
                    if isinstance(out, torch.Tensor) and out.dim() == 4:
                        position_bias = out
                        break

            if output_hidden_states:
                all_hidden_states.append(hidden_states)

        hidden_states = self.encoder.final_layer_norm(hidden_states)
        hidden_states = self.encoder.dropout(hidden_states)

        if output_hidden_states:
            all_hidden_states.append(hidden_states)

        # Lightweight anonymous output object (duck-typed stand-in for
        # transformers' BaseModelOutput).
        return type('Output', (), {
            'last_hidden_state': hidden_states,
            'hidden_states': tuple(all_hidden_states) if all_hidden_states else None,
        })()
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
# Modulator sized for T5-Small: 512-d residual stream, 32,128-token vocab,
# 6 encoder layers, with a 64-d geometric embedding space.
N_GEO = 64

modulator = GeometricResidualModulator(
    d_model=512,
    vocab_size=32128,
    n_geometric_dims=N_GEO,
    initial_alpha=0.5,
    n_layers=6,
).to(device)

# Wrap the pretrained encoder and modulate every one of its six layers.
mod_encoder = ModulatedT5Encoder(
    t5_encoder=model.encoder,
    modulator=modulator,
    modulate_layers=list(range(6)),
)
|
| 115 |
-
|
| 116 |
-
```
|
| 117 |
-
|
| 118 |
|
| 119 |
## Document Purpose
|
| 120 |
|
| 121 |
-
Running catalog of geometric measurements across language models. Each metric includes its formula, measurement process, and cross-model results. Designed for expansion as new models and experiments are added.
|
| 122 |
|
| 123 |
---
|
| 124 |
|
| 125 |
## I. Models Profiled
|
| 126 |
|
| 127 |
-
| Model | Params | Vocab | Hidden Dim | Layers | Architecture | Training
|
| 128 |
-
|---|---|---|---|---|---|---|
|
| 129 |
-
| T5-Small | 60.5M | 32,128 | 512 | 6+6
|
| 130 |
-
|
|
| 131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
|
| 133 |
---
|
| 134 |
|
|
@@ -172,7 +76,7 @@ Running catalog of geometric measurements across language models. Each metric in
|
|
| 172 |
| Qwen3.5-0.8B | 0.627 | 0.062 | 0.347 | 1.057 |
|
| 173 |
| Qwen3.5-4B | 0.656 | 0.067 | 0.400 | 1.091 |
|
| 174 |
|
| 175 |
-
**Note:** T5 embeddings are unnormalized (large magnitudes). Qwen embeddings are near-unit norm.
|
| 176 |
|
| 177 |
---
|
| 178 |
|
|
Vol² = (-1)⁵ · det(D) / (2⁴ · (4!)²) = -det(D) / 9216
|
|
| 194 |
Vol = √(Vol²) if Vol² > 0, else invalid
|
| 195 |
```
|
| 196 |
|
| 197 |
-
**Process:** Sample 1000 random 5-token subsets. Compute Cayley-Menger volume for each.
|
| 198 |
|
| 199 |
| Model | Valid/1000 | CV | Embed/Random Ratio |
|
| 200 |
|---|---|---|---|
|
|
@@ -210,10 +114,9 @@ Vol = β(VolΒ²) if VolΒ² > 0, else invalid
|
|
| 210 |
|
| 211 |
**Process (Qwen 0.8B vs 4B):** PCA 4B embeddings (2560β1024), Procrustes alignment using 10K anchor tokens, evaluate on 5K held-out tokens.
|
| 212 |
|
| 213 |
-
| Comparison | Relational Pearson |
|
| 214 |
|---|---|---|
|
| 215 |
-
| Qwen 0.8B vs 4B (raw) | 0.920 | 0.89 |
|
| 216 |
-
| Qwen 0.8B vs 4B (Procrustes) | higher (post-alignment) | β |
|
| 217 |
|
| 218 |
**Finding:** Models at different scales learn the same relational geometry (r=0.92).
|
| 219 |
|
|
@@ -225,21 +128,15 @@ Vol = β(VolΒ²) if VolΒ² > 0, else invalid
|
|
| 225 |
|
| 226 |
**Formula:** For digit tokens '0'–'9', compute all 45 pairwise cosines. Measure Pearson correlation between |i−j| (numerical distance) and cosine similarity.
|
| 227 |
|
| 228 |
-
**Process:** Encode each digit as single token, extract embedding, normalize, compute pairwise cosine matrix.
|
| 229 |
-
|
| 230 |
| Model | |iβj| Correlation | Adjacent Mean | Non-Adjacent Mean | Gap |
|
| 231 |
|---|---|---|---|---|
|
| 232 |
| T5-Small | -0.575 | 0.622 | 0.442 | 0.180 |
|
| 233 |
| Qwen3.5-0.8B | -0.862 | 0.769 | 0.678 | 0.091 |
|
| 234 |
| Qwen3.5-4B | -0.871 | 0.790 | 0.731 | 0.059 |
|
| 235 |
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
### IV.2 Semantic Category Clustering
|
| 239 |
|
| 240 |
-
**Formula:**
|
| 241 |
-
|
| 242 |
-
**Process (T5-Small):** 8 hand-curated categories (animals, colors, numbers, body, food, emotions, actions, time), single-token words only.
|
| 243 |
|
| 244 |
| Category | N tokens | Intra Cosine | Global | Lift |
|
| 245 |
|---|---|---|---|---|
|
|
@@ -285,8 +182,6 @@ Vol = β(VolΒ²) if VolΒ² > 0, else invalid
|
|
| 285 |
|
| 286 |
### V.3 Encoder Distance Bands
|
| 287 |
|
| 288 |
-
**Process:** Group WordNet token pairs by path similarity ranges. Measure mean cosine in each band.
|
| 289 |
-
|
| 290 |
| WN Similarity Band | N pairs | Static Cosine | Encoder Cosine | Lift |
|
| 291 |
|---|---|---|---|---|
|
| 292 |
| [0.50, 0.90) | 23 | 0.244 | 0.728 | +0.484 |
|
|
@@ -296,79 +191,148 @@ Vol = β(VolΒ²) if VolΒ² > 0, else invalid
|
|
| 296 |
|
| 297 |
### V.4 Hypernym Chain Decay
|
| 298 |
|
| 299 |
-
**Process:** Find WordNet synsets forming hypernym chains (e.g., dogβcanineβmammalβorganism). Measure cosine between root and ancestor at each depth.
|
| 300 |
-
|
| 301 |
| Depth | Static Cosine | Encoder Cosine |
|
| 302 |
|---|---|---|
|
| 303 |
| 1 | 0.160 | 0.656 |
|
| 304 |
-
| 2 | 0.090 | 0.620 |
|
| 305 |
| 3 | 0.075 | 0.594 |
|
| 306 |
| 5 | 0.069 | 0.585 |
|
| 307 |
| 7 | 0.068 | 0.579 |
|
| 308 |
|
| 309 |
-
**Finding:** Monotonic decay in both spaces. Encoder has much stronger signal and cleaner gradient.
|
| 310 |
-
|
| 311 |
---
|
| 312 |
|
| 313 |
-
## VI. Inactive Weight Topology
|
| 314 |
|
| 315 |
-
### VI.1
|
| 316 |
|
| 317 |
-
**Formula:**
|
| 318 |
|
| 319 |
-
**Process:**
|
| 320 |
|
| 321 |
-
|
|
| 322 |
-
|---|---|---|
|
| 323 |
-
|
|
| 324 |
-
|
|
| 325 |
-
|
|
| 326 |
-
|
|
| 327 |
-
|
|
|
|
|
|
|
|
| 328 |
|
| 329 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
|
| 331 |
-
|
| 332 |
|
| 333 |
-
|
|
|
|
|
|
|
| 334 |
|---|---|---|
|
| 335 |
-
| self_attn_q |
|
| 336 |
-
| self_attn_k |
|
| 337 |
-
| self_attn_v |
|
| 338 |
-
|
|
| 339 |
-
|
|
|
|
|
|
|
|
|
|
|
| 340 |
|
| 341 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
|
| 343 |
### VI.3 QK Similarity Manifold
|
| 344 |
|
| 345 |
**Formula:** QK = W_Q Β· W_Kα΅. Eigendecompose the symmetric part (QK + QKα΅)/2. Positive eigenvalues = attraction directions. Negative eigenvalues = repulsion directions.
|
| 346 |
|
| 347 |
-
**
|
| 348 |
|
| 349 |
-
|
|
| 350 |
-
|---|---|---|---|
|
| 351 |
-
|
|
| 352 |
-
|
|
| 353 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 354 |
|
| 355 |
-
**Finding
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 356 |
|
| 357 |
### VI.4 MLP Dead Neurons
|
| 358 |
|
| 359 |
-
**Formula:** Combined importance = ‖wᵢ_up‖₂ · ‖wᵢ_down‖₂. Dead if < 1% of mean.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 360 |
|
| 361 |
-
**
|
| 362 |
|
| 363 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
|
| 365 |
-
|
| 366 |
|
| 367 |
-
**
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 368 |
|
| 369 |
-
**
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
|
| 371 |
-
**Finding:**
|
| 372 |
|
| 373 |
---
|
| 374 |
|
|
@@ -384,12 +348,6 @@ Vol = β(VolΒ²) if VolΒ² > 0, else invalid
|
|
| 384 |
|
| 385 |
### VII.2 Geometric Embedding Initialization
|
| 386 |
|
| 387 |
-
**Process:**
|
| 388 |
-
1. Build 3000Γ3000 Wu-Palmer similarity matrix from WordNet anchors (~6 min)
|
| 389 |
-
2. Eigendecompose β top 64 eigenvectors scaled by βeigenvalue β 64-d embeddings
|
| 390 |
-
3. Project remaining tokens via GPU embedding cosine proxy (10-NN, softmax-weighted, <1 sec)
|
| 391 |
-
4. Procrustes align projection matrix to encoder PCA space
|
| 392 |
-
|
| 393 |
| Metric | Value |
|
| 394 |
|---|---|
|
| 395 |
| WN reconstruction correlation | 0.921 |
|
|
@@ -398,8 +356,6 @@ Vol = β(VolΒ²) if VolΒ² > 0, else invalid
|
|
| 398 |
|
| 399 |
### VII.3 Alpha Convergence
|
| 400 |
|
| 401 |
-
**Process:** Freeze T5, train only modulator (geometric embed + projection + alpha). Task: summarize definition β lemma word. Track alpha per layer.
|
| 402 |
-
|
| 403 |
| Start Ξ± | Final Mean Ξ± | Layer 5 Final | Pearson Ξ | CV | Coherent | Basin |
|
| 404 |
|---|---|---|---|---|---|---|
|
| 405 |
| 0.01 (20 ep) | **0.067** | **0.107** | **+0.151** | **0.220** | **Yes** | Binding |
|
|
@@ -407,8 +363,6 @@ Vol = β(VolΒ²) if VolΒ² > 0, else invalid
|
|
| 407 |
| 0.70 (20 ep) | 0.695 | 0.640 | -0.029 | 0.482 | No | Separation |
|
| 408 |
| 0.01 (100 ep) | 0.125 | 0.218 | +0.074 | 0.322 | No | Overfit |
|
| 409 |
|
| 410 |
-
**Finding:** Two stable attractor basins exist β binding (~0.07) and separation (~0.70). The binding basin produces functional results. Starting at 0.01 with early stopping (20 epochs) is optimal.
|
| 411 |
-
|
| 412 |
### VII.4 Depth Gradient (Consistent Across All Runs)
|
| 413 |
|
| 414 |
| Layer | 20ep (Ξ±=0.01) | 100ep (Ξ±=0.01) | 20ep (Ξ±=0.20) |
|
|
@@ -435,9 +389,35 @@ Vol = β(VolΒ²) if VolΒ² > 0, else invalid
|
|
| 435 |
|
| 436 |
---
|
| 437 |
|
| 438 |
-
## VIII.
|
| 439 |
|
| 440 |
-
### VIII.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 441 |
|
| 442 |
| System | Context | Value |
|
| 443 |
|---|---|---|
|
|
@@ -445,12 +425,13 @@ Vol = β(VolΒ²) if VolΒ² > 0, else invalid
|
|
| 445 |
| Wormhole Lambda | Vision transformer training | Converges from 0.74 toward ~0.29 |
|
| 446 |
| Alpha curriculum | Devil's Staircase PE training | Converges to ~0.50 under geometric loss, CE destroys |
|
| 447 |
| T5 generation | Greedy decode alpha sweep | Stable plateau at 0.291β0.292, semantic phase transition |
|
|
|
|
| 448 |
|
| 449 |
-
###
|
| 450 |
|
| 451 |
| Alpha | Output (triangle prompt) |
|
| 452 |
|---|---|
|
| 453 |
-
| 0.01–0.10 | "...three edges and three vertices. it is one of the basic shapes in geometry." |
|
| 454 |
| 0.20 | "**a** triangle is a polygon with three edges and three vertices..." |
|
| 455 |
| 0.28 | "a polygon with three vertices. it is one of the basic shapes in **a graph**." |
|
| 456 |
| 0.291 | "a triangle is a polygon with a vertice and a vertice. it is one of the basic shapes in **a graph**." |
|
|
@@ -462,7 +443,7 @@ Vol = β(VolΒ²) if VolΒ² > 0, else invalid
|
|
| 462 |
|
| 463 |
---
|
| 464 |
|
| 465 |
-
##
|
| 466 |
|
| 467 |
| Constant | Value | Observed In |
|
| 468 |
|---|---|---|
|
|
@@ -470,11 +451,16 @@ Vol = β(VolΒ²) if VolΒ² > 0, else invalid
|
|
| 470 |
| Participation / dim | 0.53β0.56 | T5-Small, Qwen 0.8B |
|
| 471 |
| Binding/separation constant | 0.29154 / 0.70846 | MinimalShunts, CLIP projections, T5 generation, alpha convergence |
|
| 472 |
| Depth gradient | Monotonic increasing | All modulator training runs |
|
| 473 |
-
| Q sparsity scaling |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
|
| 475 |
---
|
| 476 |
|
| 477 |
-
##
|
| 478 |
|
| 479 |
| Tool | Input | Output | Requires Inference |
|
| 480 |
|---|---|---|---|
|
|
@@ -484,12 +470,33 @@ Vol = β(VolΒ²) if VolΒ² > 0, else invalid
|
|
| 484 |
| Digit Manifold | 10 digit token embeddings | |iβj| correlation, adjacency gap | No |
|
| 485 |
| SVD Effective Rank | Any 2D weight matrix | Stable rank, condition number | No |
|
| 486 |
| QK Manifold | W_Q, W_K matrices | Eigenspectrum, pos/neg balance | No |
|
| 487 |
-
| Dead Neuron Count | MLP wi, wo matrices | Combined importance distribution | No |
|
|
|
|
|
|
|
|
|
|
| 488 |
| WordNet Relational | Encoder output (mean-pooled) | Pearson/Spearman vs path similarity | Yes |
|
| 489 |
| Alpha Convergence | Modulator training loop | Per-layer equilibrium values | Yes (training) |
|
| 490 |
|
| 491 |
---
|
| 492 |
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
# Geometric Terrain Statistics Composite
|
| 8 |
|
| 9 |
Such a quaint little tool.
|
| 10 |
+
# Geometric Terrain Statistics Composite
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
## Document Purpose
|
| 13 |
|
| 14 |
+
Running catalog of geometric measurements across language and vision models. Each metric includes its formula, measurement process, and cross-model results. Designed for expansion as new models and experiments are added.
|
| 15 |
|
| 16 |
---
|
| 17 |
|
| 18 |
## I. Models Profiled
|
| 19 |
|
| 20 |
+
| Model | Params | Vocab | Hidden Dim | Layers | Heads | Architecture | Training |
|
| 21 |
+
|---|---|---|---|---|---|---|---|
|
| 22 |
+
| T5-Small | 60.5M | 32,128 | 512 | 6+6 | 8 | Enc-Dec (relative PE, ReLU MLP) | C4 span corruption |
|
| 23 |
+
| T5-Base | 222.9M | 32,128 | 768 | 12+12 | 12 | Enc-Dec (relative PE, ReLU MLP) | C4 span corruption |
|
| 24 |
+
| T5-v1.1-XXL | 11.4B | 32,128 | 4096 | 24+24 | 64 | Enc-Dec (relative PE, **GeGLU** MLP) | C4 (v1.1 variant, no multi-task) |
|
| 25 |
+
| BERT-large | 336.2M | 30,522 | 1024 | 24 | 16 | Encoder-only (absolute PE) | BookCorpus+Wikipedia MLM |
|
| 26 |
+
| CLIP-ViT-B/16 | 85.5M (visual) | β | 768 | 12 | 12 | Vision encoder (fused QKV) | LAION-2B contrastive |
|
| 27 |
+
| DINOv2-large | 302.0M | β | 1024 | 24 | 16 | Vision encoder (separate Q/K/V) | Self-supervised (no labels) |
|
| 28 |
+
| CLIP-ViT-bigG/14 | 1.84B (visual) | β | 1664 | 48 | 16 | Vision encoder (fused QKV) | LAION-2B contrastive |
|
| 29 |
+
| Qwen3.5-0.8B | 853M | 248,320 | 1024 | β | β | DeltaNet + MoE + ViT | Multilingual + Vision |
|
| 30 |
+
| Qwen3.5-4B | ~4B | 248,320 | 2560 | β | β | DeltaNet + MoE + ViT | Multilingual + Vision |
|
| 31 |
+
|
| 32 |
+
**Notes:**
|
| 33 |
+
- T5-v1.1-XXL encoder is the text encoder used by Flux.1 Schnell, Flux.1 Dev, and Flux.2
|
| 34 |
+
- CLIP models use fused QKV (`in_proj_weight`); Q/K/V split by thirds for analysis
|
| 35 |
+
- T5-v1.1 uses GeGLU (wi_0 gate + wi_1 value) instead of ReLU (single wi)
|
| 36 |
|
| 37 |
---
|
| 38 |
|
|
|
|
| 76 |
| Qwen3.5-0.8B | 0.627 | 0.062 | 0.347 | 1.057 |
|
| 77 |
| Qwen3.5-4B | 0.656 | 0.067 | 0.400 | 1.091 |
|
| 78 |
|
| 79 |
+
**Note:** T5 embeddings are unnormalized (large magnitudes). Qwen embeddings are near-unit norm.
|
| 80 |
|
| 81 |
---
|
| 82 |
|
|
|
|
| 98 |
Vol = β(VolΒ²) if VolΒ² > 0, else invalid
|
| 99 |
```
|
| 100 |
|
| 101 |
+
**Process:** Sample 1000 random 5-token subsets. Compute Cayley-Menger volume for each. Report CV (coefficient of variation = std/mean).
|
| 102 |
|
| 103 |
| Model | Valid/1000 | CV | Embed/Random Ratio |
|
| 104 |
|---|---|---|---|
|
|
|
|
| 114 |
|
| 115 |
**Process (Qwen 0.8B vs 4B):** PCA 4B embeddings (2560β1024), Procrustes alignment using 10K anchor tokens, evaluate on 5K held-out tokens.
|
| 116 |
|
| 117 |
+
| Comparison | Relational Pearson | Pentachoron per-simplex corr |
|
| 118 |
|---|---|---|
|
| 119 |
+
| Qwen 0.8B vs 4B (raw) | 0.920 | 0.89 |
|
|
|
|
| 120 |
|
| 121 |
**Finding:** Models at different scales learn the same relational geometry (r=0.92).
|
| 122 |
|
|
|
|
| 128 |
|
| 129 |
**Formula:** For digit tokens '0'β'9', compute all 45 pairwise cosines. Measure Pearson correlation between |iβj| (numerical distance) and cosine similarity.
|
| 130 |
|
|
|
|
|
|
|
| 131 |
| Model | |iβj| Correlation | Adjacent Mean | Non-Adjacent Mean | Gap |
|
| 132 |
|---|---|---|---|---|
|
| 133 |
| T5-Small | -0.575 | 0.622 | 0.442 | 0.180 |
|
| 134 |
| Qwen3.5-0.8B | -0.862 | 0.769 | 0.678 | 0.091 |
|
| 135 |
| Qwen3.5-4B | -0.871 | 0.790 | 0.731 | 0.059 |
|
| 136 |
|
| 137 |
+
### IV.2 Semantic Category Clustering (T5-Small)
|
|
|
|
|
|
|
| 138 |
|
| 139 |
+
**Formula:** Mean intra-category pairwise cosine vs global mean pairwise cosine. Lift = intra β global.
|
|
|
|
|
|
|
| 140 |
|
| 141 |
| Category | N tokens | Intra Cosine | Global | Lift |
|
| 142 |
|---|---|---|---|---|
|
|
|
|
| 182 |
|
| 183 |
### V.3 Encoder Distance Bands
|
| 184 |
|
|
|
|
|
|
|
| 185 |
| WN Similarity Band | N pairs | Static Cosine | Encoder Cosine | Lift |
|
| 186 |
|---|---|---|---|---|
|
| 187 |
| [0.50, 0.90) | 23 | 0.244 | 0.728 | +0.484 |
|
|
|
|
| 191 |
|
| 192 |
### V.4 Hypernym Chain Decay
|
| 193 |
|
|
|
|
|
|
|
| 194 |
| Depth | Static Cosine | Encoder Cosine |
|
| 195 |
|---|---|---|
|
| 196 |
| 1 | 0.160 | 0.656 |
|
|
|
|
| 197 |
| 3 | 0.075 | 0.594 |
|
| 198 |
| 5 | 0.069 | 0.585 |
|
| 199 |
| 7 | 0.068 | 0.579 |
|
| 200 |
|
|
|
|
|
|
|
| 201 |
---
|
| 202 |
|
| 203 |
+
## VI. Cross-Architecture Inactive Weight Topology
|
| 204 |
|
| 205 |
+
### VI.1 Q/K/V Sparsity (<0.1 threshold)
|
| 206 |
|
| 207 |
+
**Formula:** Fraction of |wᵢⱼ| < 0.1 across all weights of that type.
|
| 208 |
|
| 209 |
+
**Process:** Iterate all 2D weight matrices, compute abs values, count below threshold. No inference needed.
|
| 210 |
|
| 211 |
+
| Model | Q | K | V | O | MLP | Full Model |
|
| 212 |
+
|---|---|---|---|---|---|---|
|
| 213 |
+
| **T5-Small** (512d, 6L) | **93.7%** | 19.2% | 12.1% | 10.4% | 11.9% | 18.4% |
|
| 214 |
+
| **T5-Base** (768d, 12L) | **99.4%** | 30.0% | 16.2% | 13.5% | 16.9% | 27.9% |
|
| 215 |
+
| **T5-v1.1-XXL** (4096d, 24L) | **100.0%** | **65.5%** | 73.1% | 65.4% | ~57% | β |
|
| 216 |
+
| BERT-large (1024d, 24L) | 99.1% | 99.1% | 99.9% | 99.9% | 99.4% | 99.3% |
|
| 217 |
+
| DINOv2-large (1024d, 24L) | 100.0% | 100.0% | 100.0% | 100.0% | 100.0% | 100.0% |
|
| 218 |
+
| CLIP-ViT-B/16 (768d, 12L) | β (fused) | β | β | β | 100.0% | 100.0% |
|
| 219 |
+
| CLIP-ViT-bigG (1664d, 48L) | β (fused) | β | β | β | ~97% | 98.0% |
|
| 220 |
|
| 221 |
+
**Key Finding β T5 Q/K Asymmetry Scales:**
|
| 222 |
+
|
| 223 |
+
| Model | Q (<0.1) | K (<0.1) | Q/K Ratio |
|
| 224 |
+
|---|---|---|---|
|
| 225 |
+
| T5-Small | 93.7% | 19.2% | **4.9Γ** |
|
| 226 |
+
| T5-Base | 99.4% | 30.0% | **3.3Γ** |
|
| 227 |
+
| T5-v1.1-XXL | 100.0% | 65.5% | **1.5Γ** |
|
| 228 |
|
| 229 |
+
T5 has a genuine Q-specific sparsity that scales with model size. Q hit 100.0% at XXL (every single weight below 0.1). This is NOT the BERT/DINOv2 pattern where all weight types are uniformly sparse. The query projection in T5 is **functionally vestigial at scale**.
|
| 230 |
|
| 231 |
+
**T5-v1.1-XXL Encoder vs Decoder:**
|
| 232 |
+
|
| 233 |
+
| Component | Encoder | Decoder |
|
| 234 |
|---|---|---|
|
| 235 |
+
| self_attn_q | 100.0% | 100.0% |
|
| 236 |
+
| self_attn_k | 71.7% | 59.4% |
|
| 237 |
+
| self_attn_v | 76.0% | 70.1% |
|
| 238 |
+
| cross_attn_q | β | 100.0% |
|
| 239 |
+
| cross_attn_k | β | 63.1% |
|
| 240 |
+
| cross_attn_v | β | 71.1% |
|
| 241 |
+
|
| 242 |
+
Q is 100% sparse everywhere β self-attention and cross-attention, encoder and decoder.
|
| 243 |
|
| 244 |
+
### VI.2 SVD Effective Rank
|
| 245 |
+
|
| 246 |
+
**Formula:** Stable rank = ‖W‖²_F / ‖W‖²₂ = Σσᵢ² / σ₁². Measures effective rank without thresholding.
|
| 247 |
+
|
| 248 |
+
| Weight Type | T5-Small | T5-Base | T5-v1.1-XXL | BERT-large | DINOv2-large |
|
| 249 |
+
|---|---|---|---|---|---|
|
| 250 |
+
| self_attn_q | 47.6 | 58.1 | 96.8 | 50.8 | 57.7 |
|
| 251 |
+
| self_attn_k | 53.2 | 62.4 | 90.0 | 37.7 | 55.5 |
|
| 252 |
+
| self_attn_v | 75.3 | 97.5 | 204.4 | 113.0 | 94.8 |
|
| 253 |
+
| self_attn_o | 25.4 | 35.0 | 16.4 | 125.0 | 85.6 |
|
| 254 |
+
| mlp_up/gate | 15.2 | 20.6 | 67.9 (gate) / 247.3 (up) | 27.4 | 58.4 |
|
| 255 |
+
| mlp_down | 31.3 | 43.9 | 25.3 | 52.2 | 94.4 |
|
| 256 |
+
|
| 257 |
+
**T5-v1.1-XXL O matrices have very low stable rank (16.4)** β the output projection is extremely low-rank despite the 4096-d space. Cross-attention O is even lower at 6.1.
|
| 258 |
|
| 259 |
### VI.3 QK Similarity Manifold
|
| 260 |
|
| 261 |
**Formula:** QK = W_Q Β· W_Kα΅. Eigendecompose the symmetric part (QK + QKα΅)/2. Positive eigenvalues = attraction directions. Negative eigenvalues = repulsion directions.
|
| 262 |
|
| 263 |
+
**Positive Eigenvalue Fraction Trends:**
|
| 264 |
|
| 265 |
+
| Model | First Layer | Last Layer | Trend |
|
| 266 |
+
|---|---|---|---|
|
| 267 |
+
| T5-Small encoder | 0.615 | 0.535 | **β0.080** (decreasing) |
|
| 268 |
+
| T5-v1.1-XXL encoder | 0.510 | 0.503 | **β0.007** (flat) |
|
| 269 |
+
| T5-v1.1-XXL decoder self | 0.501 | 0.548 | **+0.047** (increasing) |
|
| 270 |
+
| **T5-v1.1-XXL cross-attn** | **0.500** | **0.500** | **0.000 (locked)** |
|
| 271 |
+
| BERT-large | 0.446 | 0.513 | +0.066 (increasing) |
|
| 272 |
+
| CLIP-ViT-B/16 | 0.503 | 0.538 | +0.035 (increasing) |
|
| 273 |
+
| DINOv2-large | 0.498 | 0.548 | +0.050 (increasing) |
|
| 274 |
+
| CLIP-ViT-bigG | 0.498 | 0.582 | +0.084 (increasing) |
|
| 275 |
|
| 276 |
+
**Critical Finding β Cross-Attention is Perfectly Balanced:**
|
| 277 |
+
|
| 278 |
+
T5-v1.1-XXL cross-attention QK manifold is exactly 0.500 positive / 0.500 negative at ALL 24 layers. Symmetry deviation is 1.414 (= √2) everywhere. This is a locked equilibrium — the bridge between encoder and decoder maintains perfect balance between attraction and repulsion at every depth. No other attention type shows this level of stability.
|
| 279 |
+
|
| 280 |
+
**T5-v1.1-XXL encoder self-attention is flat (~0.50 throughout).** Unlike T5-Small which decreased from 0.615 to 0.535, the XXL encoder stays near the equilibrium point. The larger model doesn't need to build anti-similarity boundaries because it has enough capacity to discriminate through other mechanisms.
|
| 281 |
+
|
| 282 |
+
**BERT starts BELOW 0.50 (0.446).** The only model with majority-repulsion from layer 0. MLM bidirectional training creates fundamentally different QK geometry from autoregressive or contrastive training.
|
| 283 |
|
| 284 |
### VI.4 MLP Dead Neurons
|
| 285 |
|
| 286 |
+
**Formula:** Combined importance = ‖wᵢ_up‖₂ · ‖wᵢ_down‖₂ (ReLU) or ‖wᵢ_gate‖₂ · ‖wᵢ_up‖₂ · ‖wᵢ_down‖₂ (GeGLU). Dead if < 1% of mean.
|
| 287 |
+
|
| 288 |
+
| Model | Dead (<1% mean) | Weak (<10% mean) | Notes |
|
| 289 |
+
|---|---|---|---|
|
| 290 |
+
| T5-Small (enc+dec) | 0/24,576 (0.00%) | 0/24,576 (0.00%) | All neurons alive |
|
| 291 |
+
| T5-Base (enc+dec) | 0/73,728 (0.00%) | 0/73,728 (0.00%) | All neurons alive |
|
| 292 |
+
| T5-v1.1-XXL encoder | 0/245,760 (0.00%) | 0/245,760 (0.00%) | All neurons alive |
|
| 293 |
+
| T5-v1.1-XXL decoder | **14/245,760 (0.01%)** | **461/245,760 (0.19%)** | First dead neurons in T5 family |
|
| 294 |
+
| BERT-large | 0/98,304 (0.00%) | 0/98,304 (0.00%) | All neurons alive |
|
| 295 |
+
| DINOv2-large | 0/98,304 (0.00%) | 0/98,304 (0.00%) | All neurons alive |
|
| 296 |
+
| CLIP-ViT-B/16 | **1,316/36,864 (3.57%)** | 1,356/36,864 (3.68%) | Only model with significant dead neurons |
|
| 297 |
+
| CLIP-ViT-bigG | 0/393,216 (0.00%) | **24,163/393,216 (6.14%)** | 0 dead but 6% weak |
|
| 298 |
+
|
| 299 |
+
**Finding:** T5-v1.1-XXL decoder has the first dead neurons in the T5 family β 14 neurons in layers 1-2 only. The decoder's early GeGLU layers carved out a tiny amount of capacity. Encoder uses everything. CLIP-ViT-B/16 is the outlier with 3.6% dead neurons β contrastive training at small scale produces genuine pruning.
|
| 300 |
+
|
| 301 |
+
### VI.5 Cross-Layer Weight Correlation
|
| 302 |
|
| 303 |
+
**Formula:** cos(flatten(Wᵢ), flatten(Wⱼ)) between weight matrices of the same type at different layers.
|
| 304 |
|
| 305 |
+
| Model | Q adj mean | K adj mean | MLP_up adj mean |
|
| 306 |
+
|---|---|---|---|
|
| 307 |
+
| T5-Small | ~0.000 | ~0.000 | 0.031β0.045 |
|
| 308 |
+
| T5-Base | ~0.000 | ~0.000 | 0.024β0.036 |
|
| 309 |
+
| T5-v1.1-XXL encoder | 0.0001 | β | β |
|
| 310 |
+
| T5-v1.1-XXL decoder | β0.0001 | β | β |
|
| 311 |
+
| BERT-large | 0.0002 | 0.0003 | 0.032 |
|
| 312 |
+
| CLIP-ViT-B/16 | β0.0004 (QKV) | β | 0.008 |
|
| 313 |
+
| DINOv2-large | β0.0003 | β0.0002 | 0.006 |
|
| 314 |
+
| CLIP-ViT-bigG | 0.0000 (QKV) | β | 0.055 |
|
| 315 |
+
|
| 316 |
+
**Universal finding:** Attention weights (Q, K, V) are completely uncorrelated across layers (~0.000). Every layer defines an independent similarity function. MLP weights show positive correlation decaying with distance β feedforward layers share structure.
|
| 317 |
|
| 318 |
+
### VI.6 Position Bias Topology
|
| 319 |
|
| 320 |
+
**T5 uses learned relative position biases:** [32 buckets Γ N_heads].
|
| 321 |
+
|
| 322 |
+
| Model | Encoder | Decoder |
|
| 323 |
+
|---|---|---|
|
| 324 |
+
| T5-Small (8 heads) | 3 local, 2 global, 3 mixed | 4 local, 4 global, 0 mixed |
|
| 325 |
+
| T5-Base (12 heads) | 4 local, 3 global, 5 mixed | 5 local, 4 global, 3 mixed |
|
| 326 |
+
| T5-v1.1-XXL (64 heads) | **24 local, 2 global, 38 mixed** | **27 local, 37 global, 0 mixed** |
|
| 327 |
|
| 328 |
+
**T5-v1.1-XXL position findings:**
|
| 329 |
+
- Encoder: 38/64 mixed heads β nuanced position sensitivity at scale
|
| 330 |
+
- **Decoder: ZERO mixed heads** β perfect binary crystallization. Every head is either pure local or pure global
|
| 331 |
+
- Decoder is 58% global (37/64) β overwhelmingly biased toward long-range attention
|
| 332 |
+
- Encoder range: [-47.2, 11.2] β strong local suppression
|
| 333 |
+
- Decoder range: [-28.4, 17.0] β more balanced
|
| 334 |
|
| 335 |
+
**Finding:** The decoder local/global binary split is scale-invariant (0 mixed at T5-Small, 0 mixed at XXL). Gradient descent crystallizes decoder position heads into two pure modes regardless of capacity.
|
| 336 |
|
| 337 |
---
|
| 338 |
|
|
|
|
| 348 |
|
| 349 |
### VII.2 Geometric Embedding Initialization
|
| 350 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
| Metric | Value |
|
| 352 |
|---|---|
|
| 353 |
| WN reconstruction correlation | 0.921 |
|
|
|
|
| 356 |
|
| 357 |
### VII.3 Alpha Convergence
|
| 358 |
|
|
|
|
|
|
|
| 359 |
| Start Ξ± | Final Mean Ξ± | Layer 5 Final | Pearson Ξ | CV | Coherent | Basin |
|
| 360 |
|---|---|---|---|---|---|---|
|
| 361 |
| 0.01 (20 ep) | **0.067** | **0.107** | **+0.151** | **0.220** | **Yes** | Binding |
|
|
|
|
| 363 |
| 0.70 (20 ep) | 0.695 | 0.640 | -0.029 | 0.482 | No | Separation |
|
| 364 |
| 0.01 (100 ep) | 0.125 | 0.218 | +0.074 | 0.322 | No | Overfit |
|
| 365 |
|
|
|
|
|
|
|
| 366 |
### VII.4 Depth Gradient (Consistent Across All Runs)
|
| 367 |
|
| 368 |
| Layer | 20ep (Ξ±=0.01) | 100ep (Ξ±=0.01) | 20ep (Ξ±=0.20) |
|
|
|
|
| 389 |
|
| 390 |
---
|
| 391 |
|
| 392 |
+
## VIII. Geometric Field Modulator (Multi-Expert)
|
| 393 |
|
| 394 |
+
### VIII.1 Architecture
|
| 395 |
+
|
| 396 |
+
- Three KSimplexChannel experts: k=1 (edge, 2 features), k=2 (triangle, 4 features), k=4 (pentachoron, 11 features)
|
| 397 |
+
- **Multiplicative gating**: residual Γ Ξ (blended_gates) β valid regions pass, invalid suppressed
|
| 398 |
+
- **Soft blending**: per expert gate = (1 β Ξ±) + Ξ± Γ expert_gate
|
| 399 |
+
- **Null space**: 25% of residual dimensions untouched by modulator
|
| 400 |
+
- **Alpha clamped**: [0.001, 0.35] β hard ceiling below the phase boundary
|
| 401 |
+
- **Gradient scaling**: geometric params at 10% LR, alpha at 50% LR, gates at full LR
|
| 402 |
+
- Params: **38,552** (0.064% of T5-Small)
|
| 403 |
+
- Self-test: validity=0.985, null space preserved, template volumes sane
|
| 404 |
+
|
| 405 |
+
### VIII.2 Design Rationale (Grounded in Cross-Architecture Data)
|
| 406 |
+
|
| 407 |
+
| Data Point | Design Decision |
|
| 408 |
+
|---|---|
|
| 409 |
+
| Q sparsity 100% at scale | Geometric field can replace Q β the model barely uses it |
|
| 410 |
+
| Cross-attn QK locked at 0.500 | Target equilibrium for geometric validity gating |
|
| 411 |
+
| Depth gradient always increasing | Per-layer alpha respects this (low early, high late) |
|
| 412 |
+
| Zero dead MLP neurons | Don't touch MLPs β all capacity is in use |
|
| 413 |
+
| Decoder position: binary L/G split | Modulator preserves positional structure (null space) |
|
| 414 |
+
| CV 0.20β0.23 universal | CV monitoring as health check, not loss |
|
| 415 |
+
|
| 416 |
+
---
|
| 417 |
+
|
| 418 |
+
## IX. The 0.29154 Constant
|
| 419 |
+
|
| 420 |
+
### IX.1 Observations Across Systems
|
| 421 |
|
| 422 |
| System | Context | Value |
|
| 423 |
|---|---|---|
|
|
|
|
| 425 |
| Wormhole Lambda | Vision transformer training | Converges from 0.74 toward ~0.29 |
|
| 426 |
| Alpha curriculum | Devil's Staircase PE training | Converges to ~0.50 under geometric loss, CE destroys |
|
| 427 |
| T5 generation | Greedy decode alpha sweep | Stable plateau at 0.291β0.292, semantic phase transition |
|
| 428 |
+
| Alpha training basins | 0.70 start → settled at 0.695 | Mirror constant 1 − 0.29154 = 0.70846, Δ = 0.013 |
|
| 429 |
|
| 430 |
+
### IX.2 T5 Generation Phase Transition
|
| 431 |
|
| 432 |
| Alpha | Output (triangle prompt) |
|
| 433 |
|---|---|
|
| 434 |
+
| 0.01–0.10 | "...three edges and three vertices. it is one of the basic shapes in geometry." |
|
| 435 |
| 0.20 | "**a** triangle is a polygon with three edges and three vertices..." |
|
| 436 |
| 0.28 | "a polygon with three vertices. it is one of the basic shapes in **a graph**." |
|
| 437 |
| 0.291 | "a triangle is a polygon with a vertice and a vertice. it is one of the basic shapes in **a graph**." |
|
|
|
|
| 443 |
|
| 444 |
---
|
| 445 |
|
| 446 |
+
## X. Universal Geometric Constants
|
| 447 |
|
| 448 |
| Constant | Value | Observed In |
|
| 449 |
|---|---|---|
|
|
|
|
| 451 |
| Participation / dim | 0.53–0.56 | T5-Small, Qwen 0.8B |
|
| 452 |
| Binding/separation constant | 0.29154 / 0.70846 | MinimalShunts, CLIP projections, T5 generation, alpha convergence |
|
| 453 |
| Depth gradient | Monotonic increasing | All modulator training runs |
|
| 454 |
+
| Q sparsity scaling (T5) | 93.7% → 99.4% → 100.0% | T5-Small → T5-Base → T5-v1.1-XXL |
|
| 455 |
+
| Cross-attn QK balance | Locked at 0.500 | T5-v1.1-XXL (all 24 layers) |
|
| 456 |
+
| Attention cross-layer corr | ~0.000 | ALL models profiled (8 models) |
|
| 457 |
+
| MLP cross-layer corr | 0.006–0.055 (positive, decays) | ALL models profiled |
|
| 458 |
+
| Decoder position crystallization | 0 mixed heads | T5-Small, T5-v1.1-XXL |
|
| 459 |
+
| MLP full utilization | 0.00% dead neurons | T5 family (enc), BERT, DINOv2 |
|
| 460 |
|
| 461 |
---
|
| 462 |
|
| 463 |
+
## XI. Measurement Toolkit Reference
|
| 464 |
|
| 465 |
| Tool | Input | Output | Requires Inference |
|
| 466 |
|---|---|---|---|
|
|
|
|
| 470 |
| Digit Manifold | 10 digit token embeddings | |i−j| correlation, adjacency gap | No |
|
| 471 |
| SVD Effective Rank | Any 2D weight matrix | Stable rank, condition number | No |
|
| 472 |
| QK Manifold | W_Q, W_K matrices | Eigenspectrum, pos/neg balance | No |
|
| 473 |
+
| Dead Neuron Count | MLP wi/gate/up, wo matrices | Combined importance distribution | No |
|
| 474 |
+
| Cross-Layer Correlation | Same-type weight matrices | Adjacent cosine similarity | No |
|
| 475 |
+
| Position Bias Topology | Relative attention bias tensor | Local/global/mixed head counts | No |
|
| 476 |
+
| Sparsity Topology | Any weight matrix | Fraction below threshold | No |
|
| 477 |
| WordNet Relational | Encoder output (mean-pooled) | Pearson/Spearman vs path similarity | Yes |
|
| 478 |
| Alpha Convergence | Modulator training loop | Per-layer equilibrium values | Yes (training) |
|
| 479 |
|
| 480 |
---
|
| 481 |
|
| 482 |
+
## XII. Scripts Reference
|
| 483 |
+
|
| 484 |
+
| Script | Purpose | Key Outputs |
|
| 485 |
+
|---|---|---|
|
| 486 |
+
| `probe_t5_small_terrain.py` | T5-Small embedding + layer geometry | PR, CV, digit manifold, layer evolution |
|
| 487 |
+
| `probe_t5_wordnet_summarize.py` | T5-Small × WordNet relational alignment | Pearson, Spearman, distance bands, hypernym decay |
|
| 488 |
+
| `probe_t5_wordnet_50seeds.py` | 50-seed stability test (GPU-accelerated) | Confidence intervals for all relational metrics |
|
| 489 |
+
| `probe_t5_inactive_weights.py` | T5-Small/Base inactive weight topology | SVD, sparsity, QK manifold, dead neurons |
|
| 490 |
+
| `cross_architecture_weight_battery.py` | BERT + CLIP + DINOv2 battery | Cross-model comparison table |
|
| 491 |
+
| `probe_flux_t5_g4.py` | T5-v1.1-XXL (Flux encoder) full battery | All layers, encoder + decoder + cross-attn |
|
| 492 |
+
| `geometric_residual_modulator.py` | LERP modulator + training utilities | Modulator class + measurement tools |
|
| 493 |
+
| `geometric_field_modulator.py` | Multi-expert field modulator | KSimplex experts + multiplicative gating |
|
| 494 |
+
| `geometric_modulator_full_pipeline.py` | Self-contained T5 + WordNet + modulator | End-to-end pipeline |
|
| 495 |
+
| `train_modulator.py` | Training loop for alpha convergence | Freeze T5, train modulator, track alpha |
|
| 496 |
+
|
| 497 |
+
---
|
| 498 |
+
|
| 499 |
+
*Last updated: 2026-03-06*
|
| 500 |
+
*Models profiled: 9 (T5-Small, T5-Base, T5-v1.1-XXL, BERT-large, CLIP-ViT-B/16, DINOv2-large, CLIP-ViT-bigG, Qwen3.5-0.8B, Qwen3.5-4B)*
|
| 501 |
+
*Cross-architecture battery: 7 models, 4 training objectives (MLM, span corruption, contrastive, self-supervised)*
|
| 502 |
+
*Modulator experiments: 4 LERP configurations, 1 field modulator*
|