---
license: mit
---

# Day 1

# Geometric Terrain Statistics Composite

Such a quaint little tool.

```
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class GeometricResidualModulator(nn.Module):
    def __init__(self, d_model=512, vocab_size=32128, n_geometric_dims=64,
                 initial_alpha=0.01, n_layers=6):
        super().__init__()
        self.d_model = d_model
        self.n_geometric_dims = n_geometric_dims
        self.geometric_embed = nn.Embedding(vocab_size, n_geometric_dims)
        self.proj = nn.Linear(n_geometric_dims, d_model, bias=False)
        # Store alpha in logit space so sigmoid(alpha) starts at initial_alpha.
        logit = math.log(initial_alpha / (1 - initial_alpha))
        self.alpha = nn.Parameter(torch.full((n_layers,), logit))
        nn.init.normal_(self.proj.weight, std=0.01)

    def forward(self, residual, token_ids, layer_idx=0):
        geo = self.geometric_embed(token_ids)
        geo_projected = self.proj(geo)
        a = torch.sigmoid(self.alpha[layer_idx])
        # Per-layer LERP between the residual stream and the geometric signal.
        return (1 - a) * residual + a * geo_projected

    def geometric_residuals(self):
        W = self.geometric_embed.weight
        W_n = F.normalize(W, dim=1)
        # Sample up to 5K tokens uniformly from the full vocabulary.
        idx = torch.randperm(W.shape[0])[:5000]
        sample = W_n[idx]
        cos_mat = sample @ sample.T
        tri = torch.triu_indices(len(idx), len(idx), offset=1)
        flat_cos = cos_mat[tri[0], tri[1]]
        norms = W.norm(dim=1)
        centered = W - W.mean(dim=0)
        cov = (centered.T @ centered) / W.shape[0]
        eigvals = torch.linalg.eigvalsh(cov)
        # Participation ratio: effective number of dimensions in use.
        pr = (eigvals.sum() ** 2) / (eigvals ** 2).sum()
        return {
            'cos_mean': flat_cos.mean().item(),
            'cos_std': flat_cos.std().item(),
            'norm_mean': norms.mean().item(),
            'pr_over_dim': (pr / self.n_geometric_dims).item(),
            'alpha': torch.sigmoid(self.alpha).detach().cpu().numpy(),
        }


class ModulatedT5Encoder(nn.Module):
    def __init__(self, t5_encoder, modulator, modulate_layers=None):
        super().__init__()
        self.encoder = t5_encoder
        self.modulator = modulator
        if modulate_layers is None:
            modulate_layers = list(range(len(t5_encoder.block)))
        self.modulate_layers = set(modulate_layers)

    def forward(self, input_ids, attention_mask=None, output_hidden_states=False, **kwargs):
        hidden_states = self.encoder.embed_tokens(input_ids)
        hidden_states = self.encoder.dropout(hidden_states)

        if attention_mask is not None:
            extended_attention_mask = attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
            extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(hidden_states.dtype).min
        else:
            extended_attention_mask = None

        all_hidden_states = [hidden_states] if output_hidden_states else None
        position_bias = None
        seq_length = input_ids.shape[1]
        cache_position = torch.arange(seq_length, device=input_ids.device)

        for i, block in enumerate(self.encoder.block):
            # Modulate the residual stream before the block runs.
            if i in self.modulate_layers:
                hidden_states = self.modulator(hidden_states, input_ids, layer_idx=i)

            block_output = block(hidden_states, attention_mask=extended_attention_mask,
                                 position_bias=position_bias, cache_position=cache_position)
            hidden_states = block_output[0]

            # Reuse the relative position bias computed by the first block.
            if position_bias is None:
                for out in block_output[1:]:
                    if isinstance(out, torch.Tensor) and out.dim() == 4:
                        position_bias = out
                        break

            if output_hidden_states:
                all_hidden_states.append(hidden_states)

        hidden_states = self.encoder.final_layer_norm(hidden_states)
        hidden_states = self.encoder.dropout(hidden_states)

        if output_hidden_states:
            all_hidden_states.append(hidden_states)

        # Lightweight stand-in for the HF BaseModelOutput return type.
        return type('Output', (), {
            'last_hidden_state': hidden_states,
            'hidden_states': tuple(all_hidden_states) if all_hidden_states else None,
        })()


# Assumes `model` is a loaded T5 model (e.g. T5ForConditionalGeneration)
# and `device` is already defined.
N_GEO = 64
modulator = GeometricResidualModulator(
    d_model=512, vocab_size=32128, n_geometric_dims=N_GEO,
    initial_alpha=0.5, n_layers=6,
).to(device)

mod_encoder = ModulatedT5Encoder(
    t5_encoder=model.encoder, modulator=modulator,
    modulate_layers=[0, 1, 2, 3, 4, 5],
)
```

## Document Purpose

Running catalog of geometric measurements across language models. Each metric includes its formula, measurement process, and cross-model results. Designed for expansion as new models and experiments are added.

---

## I. Models Profiled

| Model | Params | Vocab | Hidden Dim | Layers | Architecture | Training Data |
|---|---|---|---|---|---|---|
| T5-Small | 60.5M | 32,128 | 512 | 6+6 enc-dec | Transformer (relative PE) | C4 |
| Qwen3.5-0.8B | 853M (752M LM + 100M ViT) | 248,320 | 1024 | — | DeltaNet + MoE | Multilingual + Vision |
| Qwen3.5-4B | ~4B | 248,320 | 2560 | — | DeltaNet + MoE | Multilingual + Vision |

---

## II. Embedding Geometry Metrics

### II.1 Participation Ratio (Effective Dimensionality)

**Formula:** PR = (Σλᵢ)² / Σ(λᵢ²), where λᵢ are the eigenvalues of the embedding covariance matrix.

**Process:** Center the embeddings (subtract the mean), compute the covariance C = EᵀE / N, and eigendecompose. PR counts the effective number of dimensions in use; PR/dim normalizes it to [0, 1].

| Model | PR | PR / dim | Dims for 95% var |
|---|---|---|---|
| T5-Small (512d) | 287.2 | **0.561** | 379 (74.0%) |
| Qwen3.5-0.8B (1024d) | 547.7 | **0.535** | 893 (87.2%) |
| Qwen3.5-4B (2560d) | 812.4 | **0.317** | 2125 (83.0%) |

**Finding:** PR/dim ≈ 0.53–0.56 for the smaller models. This appears to be a universal attractor for embedding dimensionality utilization.
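
A minimal standalone sketch of the measurement, assuming `E` is any [vocab, dim] embedding matrix (this mirrors the PR computation in `geometric_residuals` above):

```
import torch

def participation_ratio(E: torch.Tensor) -> dict:
    """Effective dimensionality of an embedding matrix E [vocab, dim]."""
    centered = E - E.mean(dim=0)
    cov = (centered.T @ centered) / E.shape[0]
    eigvals = torch.linalg.eigvalsh(cov)          # ascending, real
    pr = (eigvals.sum() ** 2) / (eigvals ** 2).sum()
    # Dimensions needed to reach 95% of total variance
    sorted_vals = eigvals.flip(0)
    cum = sorted_vals.cumsum(0) / sorted_vals.sum()
    dims_95 = int((cum < 0.95).sum().item()) + 1
    return {'pr': pr.item(), 'pr_over_dim': pr.item() / E.shape[1], 'dims_95': dims_95}
```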

### II.2 Pairwise Cosine Similarity Distribution

**Formula:** cos(eᵢ, eⱼ) = (eᵢ · eⱼ) / (‖eᵢ‖ · ‖eⱼ‖), sampled over 5K random tokens (≈12.5M pairs).

**Process:** Randomly sample 5K token embeddings, L2-normalize, compute the full pairwise cosine matrix, and extract the upper triangle.

| Model | Mean | Std | Median | 1% | 99% |
|---|---|---|---|---|---|
| T5-Small | 0.057 | 0.060 | 0.053 | -0.068 | 0.225 |
| Qwen3.5-0.8B | 0.195 | 0.085 | 0.197 | -0.016 | 0.408 |
| Qwen3.5-4B | 0.142 | 0.078 | 0.139 | -0.029 | 0.356 |

**Finding:** T5 is near-orthogonal (span corruption objective). Qwen has a positive bias: autoregressive next-token prediction pushes a shared "being a token" component into every embedding.

### II.3 Embedding Norm Distribution

**Formula:** ‖eᵢ‖₂ = √(Σⱼ eᵢⱼ²)

| Model | Mean Norm | Std | Min | Max |
|---|---|---|---|---|
| T5-Small | 520.15 | 69.84 | 243.31 | 1333.61 |
| Qwen3.5-0.8B | 0.627 | 0.062 | 0.347 | 1.057 |
| Qwen3.5-4B | 0.656 | 0.067 | 0.400 | 1.091 |

**Note:** T5 embeddings are unnormalized (large magnitudes). Qwen embeddings are near unit norm. This affects downstream metric scaling but not relational structure.

---

## III. Simplex Geometry Metrics

### III.1 Pentachoron Volume (Cayley-Menger Determinant)

**Formula:** For 5 points P₀…P₄, construct the bordered squared-distance matrix:

```
D = | 0    1    1    1    1    1   |
    | 1    0    d₀₁² d₀₂² d₀₃² d₀₄²|
    | 1    d₁₀² 0    d₁₂² d₁₃² d₁₄²|
    | 1    d₂₀² d₂₁² 0    d₂₃² d₂₄²|
    | 1    d₃₀² d₃₁² d₃₂² 0    d₃₄²|
    | 1    d₄₀² d₄₁² d₄₂² d₄₃² 0   |

Vol² = (-1)⁵ · det(D) / (2⁴ · (4!)²) = -det(D) / 9216
Vol  = √(Vol²) if Vol² > 0, else invalid
```

**Process:** Sample 1000 random 5-token subsets and compute the Cayley-Menger volume for each. Compare to a random Gaussian baseline with the same norm distribution. Report the CV (coefficient of variation = std/mean) and the embed/random ratio.

| Model | Valid/1000 | CV | Embed/Random Ratio |
|---|---|---|---|
| T5-Small | 1000 | **0.233** | 0.855 |
| Qwen3.5-0.8B | 1000 | **0.208** | 0.984 |
| Qwen3.5-4B | 1000 | **0.222** | 0.988 |

**Finding:** CV 0.20–0.23 is a universal attractor. All models pack simplices with similar evenness regardless of architecture, scale, or training data. The "pentachoron packing constant."
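
A sketch of the volume computation and the CV loop (the 9216 denominator is 2⁴·(4!)² from the formula above; `E` is assumed to be the embedding matrix):

```
import torch

def pentachoron_volume(points: torch.Tensor) -> float:
    """Cayley-Menger volume of the 4-simplex spanned by 5 points [5, dim]."""
    sq = torch.cdist(points, points) ** 2
    D = torch.ones(6, 6, dtype=points.dtype)
    D[0, 0] = 0.0
    D[1:, 1:] = sq
    vol_sq = -torch.linalg.det(D) / 9216.0
    return vol_sq.sqrt().item() if vol_sq > 0 else float('nan')

# CV over 1000 random 5-token subsets
vols = torch.tensor([pentachoron_volume(E[torch.randperm(E.shape[0])[:5]])
                     for _ in range(1000)])
vols = vols[~vols.isnan()]              # drop invalid (Vol² <= 0) simplices
cv = (vols.std() / vols.mean()).item()
```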

### III.2 Cross-Model Relational Structure

**Formula:** For tokens shared between two models, compute the pairwise cosine matrix in each model's embedding space. The Pearson correlation between the flattened upper triangles measures relational preservation.

**Process (Qwen 0.8B vs 4B):** PCA the 4B embeddings (2560→1024), Procrustes-align using 10K anchor tokens, evaluate on 5K held-out tokens.

| Comparison | Relational Pearson | Digit Structure Pearson |
|---|---|---|
| Qwen 0.8B vs 4B (raw) | 0.920 | 0.904 |
| Qwen 0.8B vs 4B (Procrustes) | higher (post-alignment) | — |

**Finding:** Models at different scales learn the same relational geometry (r = 0.92).
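
A sketch of the alignment and the relational score, assuming `E_small` and `E_big_pca` hold the same shared tokens row-for-row (names are illustrative):

```
import torch
import torch.nn.functional as F

def procrustes_align(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
    """Orthogonal map R minimizing ||A @ R - B||_F over the anchor rows."""
    U, _, Vt = torch.linalg.svd(A.T @ B)
    return U @ Vt

def relational_pearson(E1: torch.Tensor, E2: torch.Tensor) -> float:
    """Correlation between two models' pairwise cosine structures."""
    n = E1.shape[0]
    iu = torch.triu_indices(n, n, offset=1)
    c1 = (F.normalize(E1, dim=1) @ F.normalize(E1, dim=1).T)[iu[0], iu[1]]
    c2 = (F.normalize(E2, dim=1) @ F.normalize(E2, dim=1).T)[iu[0], iu[1]]
    return torch.corrcoef(torch.stack([c1, c2]))[0, 1].item()

# R = procrustes_align(E_big_pca[:10000], E_small[:10000])    # 10K anchors
# score = relational_pearson(E_big_pca[10000:15000] @ R, E_small[10000:15000])
```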

---

## IV. Semantic Structure Metrics

### IV.1 Digit Manifold

**Formula:** For the digit tokens '0'–'9', compute all 45 pairwise cosines. Measure the Pearson correlation between numerical distance \|i−j\| and cosine similarity.

**Process:** Encode each digit as a single token, extract its embedding, normalize, and compute the pairwise cosine matrix.

| Model | \|i−j\| Correlation | Adjacent Mean | Non-Adjacent Mean | Gap |
|---|---|---|---|---|
| T5-Small | -0.575 | 0.622 | 0.442 | 0.180 |
| Qwen3.5-0.8B | -0.862 | 0.769 | 0.678 | 0.091 |
| Qwen3.5-4B | -0.871 | 0.790 | 0.731 | 0.059 |

**Finding:** All models encode a number line, more strongly in Qwen (more training data). T5 has the wider gap (adjacent vs non-adjacent more differentiated) despite a weaker overall correlation.
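
A sketch of the measurement. It assumes each digit maps to a single token, as the Process above does; the exact token lookup may need model-specific handling (e.g. SentencePiece prefixes):

```
import torch
import torch.nn.functional as F

def digit_manifold(tokenizer, E: torch.Tensor) -> float:
    """Pearson correlation between numerical distance |i-j| and cosine."""
    ids = [tokenizer.convert_tokens_to_ids(str(d)) for d in range(10)]
    digits = F.normalize(E[ids], dim=1)
    cos = digits @ digits.T
    iu = torch.triu_indices(10, 10, offset=1)    # 45 pairs
    sims = cos[iu[0], iu[1]]
    dists = (iu[0] - iu[1]).abs().float()
    return torch.corrcoef(torch.stack([dists, sims]))[0, 1].item()
```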

### IV.2 Semantic Category Clustering

**Formula:** For tokens in a semantic category, compute the mean intra-category pairwise cosine. Compare to the global mean pairwise cosine. Lift = intra − global.

**Process (T5-Small):** 8 hand-curated categories (animals, colors, numbers, body, food, emotions, actions, time), single-token words only.

| Category | N tokens | Intra Cosine | Global | Lift |
|---|---|---|---|---|
| numbers | 9 | 0.497 | 0.057 | +0.440 |
| colors | 10 | 0.421 | 0.057 | +0.365 |
| time | 10 | 0.351 | 0.057 | +0.294 |
| food | 10 | 0.248 | 0.057 | +0.191 |
| animals | 12 | 0.241 | 0.057 | +0.184 |
| body | 10 | 0.216 | 0.057 | +0.159 |
| emotions | 10 | 0.197 | 0.057 | +0.141 |
| actions | 9 | 0.183 | 0.057 | +0.126 |

---

## V. Encoder Transformation Metrics (T5-Small)

### V.1 Layer-by-Layer Geometry

**Process:** Feed 10 diverse sentences through the encoder, capturing hidden states at each layer. Measure the mean norm and the mean pairwise cosine between token positions.

| Layer | Mean Norm | Pairwise Cosine |
|---|---|---|
| 0 (embed) | 377.3 | 0.052 |
| 1 | 761.6 | 0.278 |
| 2 | 1092.6 | 0.330 |
| 3 | 1428.8 | 0.367 |
| 4 | 1829.1 | 0.382 |
| 5 | 2378.3 | 0.419 |
| 6 (post-LN) | 3.3 | 0.211 |

**Finding:** Norms balloon through depth; the final LayerNorm crushes them to ~3. Pairwise cosine increases monotonically: tokens become MORE similar through depth. The encoder is a convergence funnel.
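
A sketch of the capture loop using `transformers` (one sentence shown instead of 10 for brevity; in HF T5 the last element of `hidden_states` is the post-LN output):

```
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, T5EncoderModel

tok = AutoTokenizer.from_pretrained("t5-small")
enc = T5EncoderModel.from_pretrained("t5-small").eval()

batch = tok("The quick brown fox jumps over the lazy dog.", return_tensors="pt")
with torch.no_grad():
    out = enc(**batch, output_hidden_states=True)

for layer, h in enumerate(out.hidden_states):   # embeddings ... post-LN
    h = h[0]                                    # [seq, d_model]
    hn = F.normalize(h, dim=1)
    iu = torch.triu_indices(h.shape[0], h.shape[0], offset=1)
    cos = (hn @ hn.T)[iu[0], iu[1]].mean().item()
    print(f"layer {layer}: norm={h.norm(dim=1).mean():.1f} cos={cos:.3f}")
```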

### V.2 WordNet Relational Alignment

**Process:** Encode 9,362 WordNet definitions via "summarize: {definition}". Mean-pool the encoder output. Compare pairwise cosine to WordNet path similarity.

| Representation | Pearson | Spearman |
|---|---|---|
| Static embeddings | 0.078 | 0.015 |
| Encoder output | 0.095 | 0.081 |

**50-seed stability (encoder):** Pearson 0.100 ± 0.008, Spearman 0.090 ± 0.010, CV 0.204 ± 0.006.
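
A sketch of the two ingredients, assuming NLTK's WordNet corpus is installed and reusing `tok`/`enc` from the V.1 sketch:

```
import torch
from nltk.corpus import wordnet as wn

def wn_path_similarity(a: str, b: str) -> float:
    """e.g. wn_path_similarity('dog.n.01', 'cat.n.01')"""
    return wn.synset(a).path_similarity(wn.synset(b))

def encode_definition(definition: str) -> torch.Tensor:
    """Mean-pooled encoder representation of a WordNet definition."""
    batch = tok(f"summarize: {definition}", return_tensors="pt")
    with torch.no_grad():
        out = enc(**batch)
    mask = batch["attention_mask"][0].unsqueeze(-1).float()   # [seq, 1]
    return (out.last_hidden_state[0] * mask).sum(0) / mask.sum()
```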

### V.3 Encoder Distance Bands

**Process:** Group WordNet token pairs by path-similarity range. Measure the mean cosine in each band.

| WN Similarity Band | N pairs | Static Cosine | Encoder Cosine | Lift |
|---|---|---|---|---|
| [0.50, 0.90) | 23 | 0.244 | 0.728 | +0.484 |
| [0.25, 0.50) | 53,112 | 0.077 | 0.573 | +0.496 |
| [0.10, 0.25) | 145,035 | 0.060 | 0.565 | +0.505 |
| [0.05, 0.10) | 295,680 | 0.061 | 0.553 | +0.492 |

### V.4 Hypernym Chain Decay

**Process:** Find WordNet synsets forming hypernym chains (e.g., dog→canine→mammal→organism). Measure the cosine between the root and its ancestor at each depth.

| Depth | Static Cosine | Encoder Cosine |
|---|---|---|
| 1 | 0.160 | 0.656 |
| 2 | 0.090 | 0.620 |
| 3 | 0.075 | 0.594 |
| 5 | 0.069 | 0.585 |
| 7 | 0.068 | 0.579 |

**Finding:** Monotonic decay in both spaces. The encoder has a much stronger signal and a cleaner gradient.

---

## VI. Inactive Weight Topology (T5-Small / T5-Base)

### VI.1 SVD Effective Rank

**Formula:** Stable rank = ‖W‖²_F / ‖W‖²₂ = Σσᵢ² / σ₁². Measures effective rank without thresholding.

**Process:** SVD every 2D weight matrix. Report the stable rank, participation ratio, active fraction (σᵢ > 0.01·σ₁), and condition number (σ₁/σₙ).

| Weight Type | Stable Rank (Small) | Stable Rank (Base) |
|---|---|---|
| self_attn_q | 47.6 ± 16.4 | 58.1 ± 17.2 |
| self_attn_k | 53.2 ± 9.2 | 62.4 ± 18.3 |
| self_attn_v | 75.3 | 97.5 |
| mlp_wi | 15.2 ± 3.8 | 20.6 ± 4.9 |
| mlp_wo | 31.3 | 43.9 |
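
A sketch of the per-matrix diagnostics:

```
import torch

def weight_rank_stats(W: torch.Tensor) -> dict:
    """SVD-based effective-rank diagnostics for a 2D weight matrix."""
    s = torch.linalg.svdvals(W)                  # descending
    return {
        'stable_rank': ((s ** 2).sum() / s[0] ** 2).item(),
        'participation_ratio': (((s ** 2).sum() ** 2) / (s ** 4).sum()).item(),
        'active_fraction': (s > 0.01 * s[0]).float().mean().item(),
        'condition_number': (s[0] / s[-1]).item(),
    }
```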

### VI.2 Sparsity Topology

**Formula:** Fraction of weights with \|wᵢⱼ\| below a threshold.

| Weight Type | \|w\| < 0.1 (Small) | \|w\| < 0.1 (Base) |
|---|---|---|
| self_attn_q | **93.7%** | **99.4%** |
| self_attn_k | 19.2% | 30.0% |
| self_attn_v | 12.1% | 16.2% |
| mlp_wi | 11.9% | 16.9% |
| Full model | 18.4% | 27.9% |

**Finding:** Q matrices are overwhelmingly sparse: the query projection is >93% near-zero. K matrices are dense. This asymmetry grows with scale. The Q null space is the intervention point for geometric modulation.
| 343 |
+
### VI.3 QK Similarity Manifold
|
| 344 |
+
|
| 345 |
+
**Formula:** QK = W_Q · W_Kᵀ. Eigendecompose the symmetric part (QK + QKᵀ)/2. Positive eigenvalues = attraction directions. Negative eigenvalues = repulsion directions.
|
| 346 |
+
|
| 347 |
+
**Process:** Compute per-layer. Track positive/negative balance and stable rank.
|
| 348 |
+
|
| 349 |
+
| Layer (Encoder) | Stable Rank | Positive Eig | Negative Eig | Symmetry Dev |
|
| 350 |
+
|---|---|---|---|---|
|
| 351 |
+
| 0 | 39.5 | 315 | 197 | 0.993 |
|
| 352 |
+
| 2 | 10.1 | 269 | 243 | 1.217 |
|
| 353 |
+
| 5 | 5.35 | 274 | 238 | 1.252 |
|
| 354 |
+
|
| 355 |
+
**Finding:** Similarity function narrows through depth (stable rank 39→5). Negative eigenvalue count increases — deeper layers define more anti-similarity boundaries.
|
| 356 |
+
|
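
A sketch of the per-layer measurement. The exact "Symmetry Dev" definition is not given above; the relative norm of the antisymmetric part is used here as an assumption:

```
import torch

def qk_manifold_stats(W_q: torch.Tensor, W_k: torch.Tensor) -> dict:
    QK = W_q @ W_k.T
    sym = (QK + QK.T) / 2
    eig = torch.linalg.eigvalsh(sym)
    s = torch.linalg.svdvals(QK)
    return {
        'stable_rank': ((s ** 2).sum() / s[0] ** 2).item(),
        'positive_eig': (eig > 0).sum().item(),
        'negative_eig': (eig < 0).sum().item(),
        # Assumed definition: relative norm of the antisymmetric part.
        'symmetry_dev': ((QK - QK.T).norm() / QK.norm()).item(),
    }
```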

### VI.4 MLP Dead Neurons

**Formula:** Combined importance of neuron i = ‖wᵢ_up‖₂ · ‖wᵢ_down‖₂. A neuron is dead if its importance is < 1% of the mean.

**Finding:** Zero dead neurons across all layers, both encoder and decoder, at both Small and Base scale. T5 is parameter-starved: every neuron earns its keep.
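
A sketch using the HF T5 weight shapes (wi: [d_ff, d_model] up-projection, wo: [d_model, d_ff] down-projection):

```
import torch

def dead_neuron_count(wi: torch.Tensor, wo: torch.Tensor) -> int:
    """Neuron i is dead if ||wi[i, :]|| * ||wo[:, i]|| < 1% of the mean."""
    importance = wi.norm(dim=1) * wo.norm(dim=0)
    return (importance < 0.01 * importance.mean()).sum().item()

# e.g. ff = model.encoder.block[0].layer[1].DenseReluDense
# dead = dead_neuron_count(ff.wi.weight, ff.wo.weight)
```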

### VI.5 Position Bias Topology

**Process:** T5 uses learned relative position biases of shape [32 buckets, N heads]. Measure per head: monotonicity, distance correlation, and peak bucket.

**Encoder (T5-Small):** 3 local heads (peak at buckets 0–1, negative dist_corr), 2 global heads (peak at buckets 17–18, positive dist_corr), 3 mixed.

**Decoder (T5-Small):** 4 far-looking heads (peak at bucket 31, values up to +48), 4 local heads (peak at buckets 0–1, values down to −34.5). Extreme magnitude asymmetry: the far-looking heads are 10× stronger.

**Finding:** This local/global split emerges identically across T5-Small and T5-Base. It is an architectural invariant.

---

## VII. Geometric Residual Modulator

### VII.1 Architecture

- Geometric embedding: [vocab_size, 64] — per-token geometric fingerprint
- Projection: Linear(64, d_model, bias=False) — Procrustes-aligned to encoder PCA space
- Alpha: per-layer learnable LERP coefficient, stored in logit space, applied via sigmoid
- Intervention: residual_out = (1 − α) · residual + α · proj(geo_embed(token_ids))
- Params: 2.09M (3.45% of T5-Small)

### VII.2 Geometric Embedding Initialization

**Process** (a code sketch follows the table):
1. Build a 3000×3000 Wu-Palmer similarity matrix from WordNet anchors (~6 min)
2. Eigendecompose → top 64 eigenvectors scaled by √eigenvalue → 64-d embeddings
3. Project remaining tokens via a GPU embedding cosine proxy (10-NN, softmax-weighted, <1 sec)
4. Procrustes-align the projection matrix to encoder PCA space

| Metric | Value |
|---|---|
| WN reconstruction correlation | 0.921 |
| Procrustes alignment cosine | 0.372 |
| Eigenvalue cumulative (top 64) | 61.3% |
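
A sketch of steps 1–3; the Wu-Palmer matrix `S` and the anchor bookkeeping (`anchor_ids`, the model embedding matrix `E_model`) are assumed:

```
import torch
import torch.nn.functional as F

def anchor_geometry(S: torch.Tensor, k: int = 64) -> torch.Tensor:
    """Spectral embedding of an [n, n] Wu-Palmer similarity matrix."""
    eigvals, eigvecs = torch.linalg.eigh(S)              # ascending
    return eigvecs[:, -k:] * eigvals[-k:].clamp(min=0).sqrt()

def project_non_anchors(E_model: torch.Tensor, anchor_ids: torch.Tensor,
                        anchor_geo: torch.Tensor, k: int = 10) -> torch.Tensor:
    """10-NN softmax-weighted projection via model-embedding cosine proxy."""
    sims = F.normalize(E_model, dim=1) @ F.normalize(E_model[anchor_ids], dim=1).T
    top_sim, top_idx = sims.topk(k, dim=1)
    w = top_sim.softmax(dim=1)
    return torch.einsum('nk,nkd->nd', w, anchor_geo[top_idx])
```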

### VII.3 Alpha Convergence

**Process:** Freeze T5 and train only the modulator (geometric embed + projection + alpha). Task: summarize a definition → the lemma word. Track alpha per layer.

| Start α | Final Mean α | Layer 5 Final | Pearson Δ | CV | Coherent | Basin |
|---|---|---|---|---|---|---|
| 0.01 (20 ep) | **0.067** | **0.107** | **+0.151** | **0.220** | **Yes** | Binding |
| 0.20 (20 ep) | 0.222 | 0.308 | +0.085 | 0.452 | No | Ridge |
| 0.70 (20 ep) | 0.695 | 0.640 | -0.029 | 0.482 | No | Separation |
| 0.01 (100 ep) | 0.125 | 0.218 | +0.074 | 0.322 | No | Overfit |

**Finding:** Two stable attractor basins exist: binding (~0.07) and separation (~0.70). The binding basin produces functional results. Starting at 0.01 with early stopping (20 epochs) is optimal.
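
A sketch of the training setup, reusing `model`, `modulator`, `mod_encoder`, and `device` from the top code block; the `loader` of (definition → lemma) batches and the learning rate are assumptions:

```
import torch

for p in model.parameters():
    p.requires_grad_(False)                               # T5 stays frozen

opt = torch.optim.AdamW(modulator.parameters(), lr=1e-3)  # lr is an assumption

for epoch in range(20):                                   # early stop at 20 epochs
    for batch in loader:
        enc_out = mod_encoder(batch["input_ids"].to(device),
                              attention_mask=batch["attention_mask"].to(device))
        loss = model(encoder_outputs=(enc_out.last_hidden_state,),
                     attention_mask=batch["attention_mask"].to(device),
                     labels=batch["labels"].to(device)).loss
        loss.backward()
        opt.step()
        opt.zero_grad()
    print(epoch, torch.sigmoid(modulator.alpha).tolist())  # per-layer alpha
```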

### VII.4 Depth Gradient (Consistent Across All Runs)

| Layer | 20ep (α=0.01) | 100ep (α=0.01) | 20ep (α=0.20) |
|---|---|---|---|
| 0 | 0.015 | 0.035 | 0.170 |
| 1 | 0.052 | 0.061 | 0.180 |
| 2 | 0.066 | 0.102 | 0.227 |
| 3 | 0.080 | 0.137 | 0.197 |
| 4 | 0.080 | 0.197 | 0.248 |
| 5 | 0.107 | 0.218 | 0.308 |

**Finding:** Alpha increases with depth in every run. The model wants minimal geometric modulation early and maximum modulation at the deepest layer. Geometry is a final correction, not an initial condition.

### VII.5 Best Result

| Metric | Original | Modulated (20ep, α=0.01 start) | Change |
|---|---|---|---|
| WordNet Pearson | 0.099 | **0.250** | **+152%** |
| WordNet Spearman | 0.085 | **0.245** | **+189%** |
| Semantic Gradient | 0.022 | **0.052** | **+132%** |
| Pentachoron CV | 0.202 | **0.220** | Stayed in band |
| Per-token Preservation | — | 0.730 | — |
| Coherence | Baseline | **Identical on 4/4 tests** | — |

---

## VIII. The 0.29154 Constant

### VIII.1 Observations Across Systems

| System | Context | Observed Behavior |
|---|---|---|
| MinimalShunts | CLIP-L ↔ CLIP-G projection gate | Emergent equilibrium |
| Wormhole Lambda | Vision transformer training | Converges from 0.74 toward ~0.29 |
| Alpha curriculum | Devil's Staircase PE training | Converges to ~0.50 under geometric loss; CE destroys it |
| T5 generation | Greedy decode alpha sweep | Stable plateau at 0.291–0.292, semantic phase transition |

### VIII.2 T5 Generation Phase Transition

| Alpha | Output (triangle prompt) |
|---|---|
| 0.01–0.10 | "triangle is a polygon with three edges and three vertices. it is one of the basic shapes in geometry." |
| 0.20 | "**a** triangle is a polygon with three edges and three vertices..." |
| 0.28 | "a polygon with three vertices. it is one of the basic shapes in **a graph**." |
| 0.291 | "a triangle is a polygon with a vertice and a vertice. it is one of the basic shapes in **a graph**." |
| 0.2915 | "a triangle is a polygon with a vertice and a vertice. it is one of the basic shapes in **a graph**." |
| 0.292 | "a triangle is a polygon with a vertice and a vertice. it is one of the basic shapes in **the world**." |
| 0.30 | "a polygon with a vertice and a vertice. it is one of the basic shapes in the world." |

**Finding:** 0.29154 marks the phase boundary between structural representation ("graph") and physical representation ("world"). Output is invariant to perturbation in a narrow band centered on the constant.

---

## IX. Universal Geometric Constants

| Constant | Value | Observed In |
|---|---|---|
| Pentachoron CV | 0.20–0.23 | T5-Small, Qwen 0.8B, Qwen 4B, trained modulator |
| Participation / dim | 0.53–0.56 | T5-Small, Qwen 0.8B |
| Binding/separation constant | 0.29154 / 0.70846 | MinimalShunts, CLIP projections, T5 generation, alpha convergence |
| Depth gradient | Monotonic increasing | All modulator training runs |
| Q sparsity scaling | Increases with model scale | T5-Small (93.7%), T5-Base (99.4%) |

---

## X. Measurement Toolkit Reference

| Tool | Input | Output | Requires Inference |
|---|---|---|---|
| Participation Ratio | Embedding matrix | Effective dimensionality | No |
| Cayley-Menger Volume | 5-point subsets of embeddings | Simplex volume + CV | No |
| Pairwise Cosine | Embedding matrix (sampled) | Similarity distribution | No |
| Digit Manifold | 10 digit token embeddings | \|i−j\| correlation, adjacency gap | No |
| SVD Effective Rank | Any 2D weight matrix | Stable rank, condition number | No |
| QK Manifold | W_Q, W_K matrices | Eigenspectrum, pos/neg balance | No |
| Dead Neuron Count | MLP wi, wo matrices | Combined importance distribution | No |
| WordNet Relational | Encoder output (mean-pooled) | Pearson/Spearman vs path similarity | Yes |
| Alpha Convergence | Modulator training loop | Per-layer equilibrium values | Yes (training) |

---

*Last updated: 2026-03-05*
*Models profiled: 3 (T5-Small, Qwen3.5-0.8B, Qwen3.5-4B)*
*Modulator experiments: 4 configurations*