AbstractPhil committed on
Commit
b52a06f
·
verified ·
1 Parent(s): 7a539da

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +495 -3
README.md CHANGED
@@ -1,3 +1,495 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ ---
4
+
5
+ # Day 1
6
+
7
+ # Geometric Terrain Statistics Composite
8
+
9
+ Such a quaint little tool.
10
+
11
+ ```
12
+ class GeometricResidualModulator(nn.Module):
13
+ def __init__(self, d_model=512, vocab_size=32128, n_geometric_dims=64,
14
+ initial_alpha=0.01, n_layers=6):
15
+ super().__init__()
16
+ self.d_model = d_model
17
+ self.n_geometric_dims = n_geometric_dims
18
+ self.geometric_embed = nn.Embedding(vocab_size, n_geometric_dims)
19
+ self.proj = nn.Linear(n_geometric_dims, d_model, bias=False)
20
+ logit = math.log(initial_alpha / (1 - initial_alpha))
21
+ self.alpha = nn.Parameter(torch.full((n_layers,), logit))
22
+ nn.init.normal_(self.proj.weight, std=0.01)
23
+
24
+ def forward(self, residual, token_ids, layer_idx=0):
25
+ geo = self.geometric_embed(token_ids)
26
+ geo_projected = self.proj(geo)
27
+ a = torch.sigmoid(self.alpha[layer_idx])
28
+ return (1 - a) * residual + a * geo_projected
29
+
30
+ def geometric_residuals(self):
31
+ W = self.geometric_embed.weight
32
+ W_n = F.normalize(W, dim=1)
33
+ idx = torch.randperm(min(W.shape[0], 5000))[:5000]
34
+ sample = W_n[idx]
35
+ cos_mat = sample @ sample.T
36
+ tri = torch.triu_indices(len(idx), len(idx), offset=1)
37
+ flat_cos = cos_mat[tri[0], tri[1]]
38
+ norms = W.norm(dim=1)
39
+ centered = W - W.mean(dim=0)
40
+ cov = (centered.T @ centered) / W.shape[0]
41
+ eigvals = torch.linalg.eigvalsh(cov)
42
+ pr = (eigvals.sum() ** 2) / (eigvals ** 2).sum()
43
+ return {
44
+ 'cos_mean': flat_cos.mean().item(),
45
+ 'cos_std': flat_cos.std().item(),
46
+ 'norm_mean': norms.mean().item(),
47
+ 'pr_over_dim': (pr / self.n_geometric_dims).item(),
48
+ 'alpha': torch.sigmoid(self.alpha).detach().cpu().numpy(),
49
+ }
50
+
51
+
52
+ class ModulatedT5Encoder(nn.Module):
53
+ def __init__(self, t5_encoder, modulator, modulate_layers=None):
54
+ super().__init__()
55
+ self.encoder = t5_encoder
56
+ self.modulator = modulator
57
+ if modulate_layers is None:
58
+ modulate_layers = list(range(len(t5_encoder.block)))
59
+ self.modulate_layers = set(modulate_layers)
60
+
61
+ def forward(self, input_ids, attention_mask=None, output_hidden_states=False, **kwargs):
62
+ hidden_states = self.encoder.embed_tokens(input_ids)
63
+ hidden_states = self.encoder.dropout(hidden_states)
64
+
65
+ if attention_mask is not None:
66
+ extended_attention_mask = attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
67
+ extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(hidden_states.dtype).min
68
+ else:
69
+ extended_attention_mask = None
70
+
71
+ all_hidden_states = [hidden_states] if output_hidden_states else None
72
+ position_bias = None
73
+ seq_length = input_ids.shape[1]
74
+ cache_position = torch.arange(seq_length, device=input_ids.device)
75
+
76
+ for i, block in enumerate(self.encoder.block):
77
+ if i in self.modulate_layers:
78
+ hidden_states = self.modulator(hidden_states, input_ids, layer_idx=i)
79
+
80
+ block_output = block(hidden_states, attention_mask=extended_attention_mask,
81
+ position_bias=position_bias, cache_position=cache_position)
82
+ hidden_states = block_output[0]
83
+
84
+ if position_bias is None:
85
+ for out in block_output[1:]:
86
+ if isinstance(out, torch.Tensor) and out.dim() == 4:
87
+ position_bias = out
88
+ break
89
+
90
+ if output_hidden_states:
91
+ all_hidden_states.append(hidden_states)
92
+
93
+ hidden_states = self.encoder.final_layer_norm(hidden_states)
94
+ hidden_states = self.encoder.dropout(hidden_states)
95
+
96
+ if output_hidden_states:
97
+ all_hidden_states.append(hidden_states)
98
+
99
+ return type('Output', (), {
100
+ 'last_hidden_state': hidden_states,
101
+ 'hidden_states': tuple(all_hidden_states) if all_hidden_states else None,
102
+ })()
103
+
104
+
105
+ N_GEO = 64
106
+ modulator = GeometricResidualModulator(
107
+ d_model=512, vocab_size=32128, n_geometric_dims=N_GEO,
108
+ initial_alpha=0.5, n_layers=6,
109
+ ).to(device)
110
+
111
+ mod_encoder = ModulatedT5Encoder(
112
+ t5_encoder=model.encoder, modulator=modulator,
113
+ modulate_layers=[0, 1, 2, 3, 4, 5],
114
+ )
115
+
116
+ ```
117
+
118
+
119
+ ## Document Purpose
120
+
121
+ Running catalog of geometric measurements across language models. Each metric includes its formula, measurement process, and cross-model results. Designed for expansion as new models and experiments are added.
122
+
123
+ ---
124
+
125
+ ## I. Models Profiled
126
+
127
+ | Model | Params | Vocab | Hidden Dim | Layers | Architecture | Training Data |
128
+ |---|---|---|---|---|---|---|
129
+ | T5-Small | 60.5M | 32,128 | 512 | 6+6 enc-dec | Transformer (relative PE) | C4 |
130
+ | Qwen3.5-0.8B | 853M (752M LM + 100M ViT) | 248,320 | 1024 | — | DeltaNet + MoE | Multilingual + Vision |
131
+ | Qwen3.5-4B | ~4B | 248,320 | 2560 | — | DeltaNet + MoE | Multilingual + Vision |
132
+
133
+ ---
134
+
135
+ ## II. Embedding Geometry Metrics
136
+
137
+ ### II.1 Participation Ratio (Effective Dimensionality)
138
+
139
+ **Formula:** PR = (Σλᵢ)² / Σ(λᵢ²), where λᵢ are eigenvalues of the embedding covariance matrix.
140
+
141
+ **Process:** Center embeddings (subtract mean), compute covariance C = EᵀE / N, eigendecompose. PR counts effective number of dimensions used. PR/dim normalizes to [0, 1].
142
+
143
+ | Model | PR | PR / dim | Dims for 95% var |
144
+ |---|---|---|---|
145
+ | T5-Small (512d) | 287.2 | **0.561** | 379 (74.0%) |
146
+ | Qwen3.5-0.8B (1024d) | 547.7 | **0.535** | 893 (87.2%) |
147
+ | Qwen3.5-4B (2560d) | 812.4 | **0.317** | 2125 (83.0%) |
148
+
149
+ **Finding:** PR/dim ≈ 0.53–0.56 for smaller models. Appears to be a universal attractor for embedding dimensionality utilization.
150
+
151
+ ### II.2 Pairwise Cosine Similarity Distribution
152
+
153
+ **Formula:** cos(eᵢ, eⱼ) = (eᵢ · eⱼ) / (‖eᵢ‖ · ‖eⱼ‖), sampled over 5K random tokens (12.5M pairs).
154
+
155
+ **Process:** Random sample 5K token embeddings, L2-normalize, compute full pairwise cosine matrix, extract upper triangle.
156
+
157
+ | Model | Mean | Std | Median | 1% | 99% |
158
+ |---|---|---|---|---|---|
159
+ | T5-Small | 0.057 | 0.060 | 0.053 | -0.068 | 0.225 |
160
+ | Qwen3.5-0.8B | 0.195 | 0.085 | 0.197 | -0.016 | 0.408 |
161
+ | Qwen3.5-4B | 0.142 | 0.078 | 0.139 | -0.029 | 0.356 |
162
+
163
+ **Finding:** T5 is near-orthogonal (span corruption objective). Qwen has positive bias (autoregressive next-token prediction pushes shared "being a token" component).
164
+
165
+ ### II.3 Embedding Norm Distribution
166
+
167
+ **Formula:** ‖eᵢ‖₂ = √(Σeᵢⱼ²)
168
+
169
+ | Model | Mean Norm | Std | Min | Max |
170
+ |---|---|---|---|---|
171
+ | T5-Small | 520.15 | 69.84 | 243.31 | 1333.61 |
172
+ | Qwen3.5-0.8B | 0.627 | 0.062 | 0.347 | 1.057 |
173
+ | Qwen3.5-4B | 0.656 | 0.067 | 0.400 | 1.091 |
174
+
175
+ **Note:** T5 embeddings are unnormalized (large magnitudes). Qwen embeddings are near-unit norm. This affects downstream metric scaling but not relational structure.
176
+
177
+ ---
178
+
179
+ ## III. Simplex Geometry Metrics
180
+
181
+ ### III.1 Pentachoron Volume (Cayley-Menger Determinant)
182
+
183
+ **Formula:** For 5 points P₀...P₄, construct the bordered distance matrix:
184
+
185
+ ```
186
+ D = | 0 1 1 1 1 1 |
187
+ | 1 0 d₀₁² d₀₂² d₀₃² d₀₄²|
188
+ | 1 d₁₀² 0 d₁₂² d₁₃² d₁₄²|
189
+ | 1 d₂₀² d₂₁² 0 d₂₃² d₂₄²|
190
+ | 1 d₃₀² d₃₁² d₃₂² 0 d₃₄²|
191
+ | 1 d₄₀² d₄₁² d₄₂² d₄₃² 0 |
192
+
193
+ Vol² = (-1)⁵ · det(D) / (2⁴ · (4!)²) = -det(D) / 9216
194
+ Vol = √(Vol²) if Vol² > 0, else invalid
195
+ ```
196
+
197
+ **Process:** Sample 1000 random 5-token subsets. Compute Cayley-Menger volume for each. Compare to random Gaussian baseline (same norm distribution). Report CV (coefficient of variation = std/mean) and embed/random ratio.
198
+
199
+ | Model | Valid/1000 | CV | Embed/Random Ratio |
200
+ |---|---|---|---|
201
+ | T5-Small | 1000 | **0.233** | 0.855 |
202
+ | Qwen3.5-0.8B | 1000 | **0.208** | 0.984 |
203
+ | Qwen3.5-4B | 1000 | **0.222** | 0.988 |
204
+
205
+ **Finding:** CV 0.20–0.23 is a universal attractor. All models pack simplices with similar evenness regardless of architecture, scale, or training data. The "pentachoron packing constant."
206
+
207
+ ### III.2 Cross-Model Relational Structure
208
+
209
+ **Formula:** For shared tokens between two models, compute pairwise cosine matrices in each model's embedding space. Pearson correlation between flattened upper triangles measures relational preservation.
210
+
211
+ **Process (Qwen 0.8B vs 4B):** PCA 4B embeddings (2560→1024), Procrustes alignment using 10K anchor tokens, evaluate on 5K held-out tokens.
212
+
213
+ | Comparison | Relational Pearson | Digit Structure Pearson |
214
+ |---|---|---|
215
+ | Qwen 0.8B vs 4B (raw) | 0.920 | 0.904 |
216
+ | Qwen 0.8B vs 4B (Procrustes) | higher (post-alignment) | — |
217
+
218
+ **Finding:** Models at different scales learn the same relational geometry (r=0.92).
219
+
220
+ ---
221
+
222
+ ## IV. Semantic Structure Metrics
223
+
224
+ ### IV.1 Digit Manifold
225
+
226
+ **Formula:** For digit tokens '0'–'9', compute all 45 pairwise cosines. Measure Pearson correlation between |i−j| (numerical distance) and cosine similarity.
227
+
228
+ **Process:** Encode each digit as single token, extract embedding, normalize, compute pairwise cosine matrix.
229
+
230
+ | Model | \|i−j\| Correlation | Adjacent Mean | Non-Adjacent Mean | Gap |
231
+ |---|---|---|---|---|
232
+ | T5-Small | -0.575 | 0.622 | 0.442 | 0.180 |
233
+ | Qwen3.5-0.8B | -0.862 | 0.769 | 0.678 | 0.091 |
234
+ | Qwen3.5-4B | -0.871 | 0.790 | 0.731 | 0.059 |
235
+
236
+ **Finding:** All models encode a number line. Stronger in Qwen (more training data). T5 has wider gap (adjacent vs non-adjacent more differentiated) despite weaker overall correlation.
237
+
238
+ ### IV.2 Semantic Category Clustering
239
+
240
+ **Formula:** For tokens in a semantic category, compute mean intra-category pairwise cosine. Compare to global mean pairwise cosine. Lift = intra − global.
241
+
242
+ **Process (T5-Small):** 8 hand-curated categories (animals, colors, numbers, body, food, emotions, actions, time), single-token words only.
243
+
244
+ | Category | N tokens | Intra Cosine | Global | Lift |
245
+ |---|---|---|---|---|
246
+ | numbers | 9 | 0.497 | 0.057 | +0.440 |
247
+ | colors | 10 | 0.421 | 0.057 | +0.365 |
248
+ | time | 10 | 0.351 | 0.057 | +0.294 |
249
+ | food | 10 | 0.248 | 0.057 | +0.191 |
250
+ | animals | 12 | 0.241 | 0.057 | +0.184 |
251
+ | body | 10 | 0.216 | 0.057 | +0.159 |
252
+ | emotions | 10 | 0.197 | 0.057 | +0.141 |
253
+ | actions | 9 | 0.183 | 0.057 | +0.126 |
254
+
255
+ ---
256
+
257
+ ## V. Encoder Transformation Metrics (T5-Small)
258
+
259
+ ### V.1 Layer-by-Layer Geometry
260
+
261
+ **Process:** Feed 10 diverse sentences through encoder, capture hidden states at each layer. Measure mean norm and mean pairwise cosine between token positions.
262
+
263
+ | Layer | Mean Norm | Pairwise Cosine |
264
+ |---|---|---|
265
+ | 0 (embed) | 377.3 | 0.052 |
266
+ | 1 | 761.6 | 0.278 |
267
+ | 2 | 1092.6 | 0.330 |
268
+ | 3 | 1428.8 | 0.367 |
269
+ | 4 | 1829.1 | 0.382 |
270
+ | 5 | 2378.3 | 0.419 |
271
+ | 6 (post-LN) | 3.3 | 0.211 |
272
+
273
+ **Finding:** Norms balloon through depth, final LayerNorm crushes to ~3. Pairwise cosine increases monotonically — tokens become MORE similar through depth. The encoder is a convergence funnel.
274
+
275
+ ### V.2 WordNet Relational Alignment
276
+
277
+ **Process:** Encode 9,362 WordNet definitions via "summarize: {definition}". Mean-pool encoder output. Compare pairwise cosine to WordNet path similarity.
278
+
279
+ | Representation | Pearson | Spearman |
280
+ |---|---|---|
281
+ | Static embeddings | 0.078 | 0.015 |
282
+ | Encoder output | 0.095 | 0.081 |
283
+
284
+ **50-seed stability (encoder):** Pearson 0.100 ± 0.008, Spearman 0.090 ± 0.010, CV 0.204 ± 0.006.
285
+
286
+ ### V.3 Encoder Distance Bands
287
+
288
+ **Process:** Group WordNet token pairs by path similarity ranges. Measure mean cosine in each band.
289
+
290
+ | WN Similarity Band | N pairs | Static Cosine | Encoder Cosine | Lift |
291
+ |---|---|---|---|---|
292
+ | [0.50, 0.90) | 23 | 0.244 | 0.728 | +0.484 |
293
+ | [0.25, 0.50) | 53,112 | 0.077 | 0.573 | +0.496 |
294
+ | [0.10, 0.25) | 145,035 | 0.060 | 0.565 | +0.505 |
295
+ | [0.05, 0.10) | 295,680 | 0.061 | 0.553 | +0.492 |
296
+
297
+ ### V.4 Hypernym Chain Decay
298
+
299
+ **Process:** Find WordNet synsets forming hypernym chains (e.g., dog→canine→mammal→organism). Measure cosine between root and ancestor at each depth.
300
+
301
+ | Depth | Static Cosine | Encoder Cosine |
302
+ |---|---|---|
303
+ | 1 | 0.160 | 0.656 |
304
+ | 2 | 0.090 | 0.620 |
305
+ | 3 | 0.075 | 0.594 |
306
+ | 5 | 0.069 | 0.585 |
307
+ | 7 | 0.068 | 0.579 |
308
+
309
+ **Finding:** Monotonic decay in both spaces. Encoder has much stronger signal and cleaner gradient.
310
+
311
+ ---
312
+
313
+ ## VI. Inactive Weight Topology (T5-Small / T5-Base)
314
+
315
+ ### VI.1 SVD Effective Rank
316
+
317
+ **Formula:** Stable rank = ‖W‖²_F / ‖W‖²₂ = Σσᵢ² / σ₁². Measures effective rank without thresholding.
318
+
319
+ **Process:** SVD every 2D weight matrix. Report stable rank, participation ratio, active fraction (σᵢ > 0.01·σ₁), and condition number (σ₁/σₙ).
320
+
321
+ | Weight Type | Stable Rank (Small) | Stable Rank (Base) |
322
+ |---|---|---|
323
+ | self_attn_q | 47.6 ± 16.4 | 58.1 ± 17.2 |
324
+ | self_attn_k | 53.2 ± 9.2 | 62.4 ± 18.3 |
325
+ | self_attn_v | 75.3 | 97.5 |
326
+ | mlp_wi | 15.2 ± 3.8 | 20.6 ± 4.9 |
327
+ | mlp_wo | 31.3 | 43.9 |
328
+
329
+ ### VI.2 Sparsity Topology
330
+
331
+ **Formula:** Fraction of |wᵢⱼ| below threshold.
332
+
333
+ | Weight Type | <0.1 (Small) | <0.1 (Base) |
334
+ |---|---|---|
335
+ | self_attn_q | **93.7%** | **99.4%** |
336
+ | self_attn_k | 19.2% | 30.0% |
337
+ | self_attn_v | 12.1% | 16.2% |
338
+ | mlp_wi | 11.9% | 16.9% |
339
+ | Full model | 18.4% | 27.9% |
340
+
341
+ **Finding:** Q matrices are overwhelmingly sparse. The query projection is >93% empty. K matrices are dense. This asymmetry grows with scale. The Q null space is the intervention point for geometric modulation.
342
+
343
+ ### VI.3 QK Similarity Manifold
344
+
345
+ **Formula:** QK = W_Q · W_Kᵀ. Eigendecompose the symmetric part (QK + QKᵀ)/2. Positive eigenvalues = attraction directions. Negative eigenvalues = repulsion directions.
346
+
347
+ **Process:** Compute per-layer. Track positive/negative balance and stable rank.
348
+
349
+ | Layer (Encoder) | Stable Rank | Positive Eig | Negative Eig | Symmetry Dev |
350
+ |---|---|---|---|---|
351
+ | 0 | 39.5 | 315 | 197 | 0.993 |
352
+ | 2 | 10.1 | 269 | 243 | 1.217 |
353
+ | 5 | 5.35 | 274 | 238 | 1.252 |
354
+
355
+ **Finding:** Similarity function narrows through depth (stable rank 39→5). Negative eigenvalue count increases — deeper layers define more anti-similarity boundaries.
356
+
357
+ ### VI.4 MLP Dead Neurons
358
+
359
+ **Formula:** Combined importance = ‖wᵢ_up‖₂ · ‖wᵢ_down‖₂. Dead if < 1% of mean.
360
+
361
+ **Finding:** Zero dead neurons across all layers, both encoder and decoder, at both Small and Base scale. T5 is parameter-starved — every neuron earns its keep.
362
+
363
+ ### VI.5 Position Bias Topology
364
+
365
+ **Process:** T5 uses learned relative position biases: [32 buckets, N heads]. Measure per-head: monotonicity, distance correlation, peak bucket.
366
+
367
+ **Encoder (T5-Small):** 3 local heads (peak 0-1, negative dist_corr), 2 global heads (peak 17-18, positive dist_corr), 3 mixed.
368
+
369
+ **Decoder (T5-Small):** 4 far-looking heads (peak 31, values up to +48), 4 local heads (peak 0-1, values down to -34.5). Extreme magnitude asymmetry — far-looking heads are 10× stronger.
370
+
371
+ **Finding:** This local/global split emerges identically across T5-Small, T5-Base. It's an architectural invariant.
372
+
373
+ ---
374
+
375
+ ## VII. Geometric Residual Modulator
376
+
377
+ ### VII.1 Architecture
378
+
379
+ - Geometric embedding: [vocab_size, 64] — per-token geometric fingerprint
380
+ - Projection: Linear(64, d_model, bias=False) — Procrustes-aligned to encoder PCA space
381
+ - Alpha: per-layer learnable LERP coefficient, stored in logit space, applied via sigmoid
382
+ - Intervention: residual_out = (1 − α) · residual + α · proj(geo_embed(token_ids))
383
+ - Params: 2.09M (3.45% of T5-Small)
384
+
385
+ ### VII.2 Geometric Embedding Initialization
386
+
387
+ **Process:**
388
+ 1. Build 3000×3000 Wu-Palmer similarity matrix from WordNet anchors (~6 min)
389
+ 2. Eigendecompose → top 64 eigenvectors scaled by √eigenvalue → 64-d embeddings
390
+ 3. Project remaining tokens via GPU embedding cosine proxy (10-NN, softmax-weighted, <1 sec)
391
+ 4. Procrustes align projection matrix to encoder PCA space
392
+
393
+ | Metric | Value |
394
+ |---|---|
395
+ | WN reconstruction correlation | 0.921 |
396
+ | Procrustes alignment cosine | 0.372 |
397
+ | Eigenvalue cumulative (top 64) | 61.3% |
398
+
399
+ ### VII.3 Alpha Convergence
400
+
401
+ **Process:** Freeze T5, train only modulator (geometric embed + projection + alpha). Task: summarize definition → lemma word. Track alpha per layer.
402
+
403
+ | Start α | Final Mean α | Layer 5 Final | Pearson Δ | CV | Coherent | Basin |
404
+ |---|---|---|---|---|---|---|
405
+ | 0.01 (20 ep) | **0.067** | **0.107** | **+0.151** | **0.220** | **Yes** | Binding |
406
+ | 0.20 (20 ep) | 0.222 | 0.308 | +0.085 | 0.452 | No | Ridge |
407
+ | 0.70 (20 ep) | 0.695 | 0.640 | -0.029 | 0.482 | No | Separation |
408
+ | 0.01 (100 ep) | 0.125 | 0.218 | +0.074 | 0.322 | No | Overfit |
409
+
410
+ **Finding:** Two stable attractor basins exist — binding (~0.07) and separation (~0.70). The binding basin produces functional results. Starting at 0.01 with early stopping (20 epochs) is optimal.
411
+
412
+ ### VII.4 Depth Gradient (Consistent Across All Runs)
413
+
414
+ | Layer | 20ep (α=0.01) | 100ep (α=0.01) | 20ep (α=0.20) |
415
+ |---|---|---|---|
416
+ | 0 | 0.015 | 0.035 | 0.170 |
417
+ | 1 | 0.052 | 0.061 | 0.180 |
418
+ | 2 | 0.066 | 0.102 | 0.227 |
419
+ | 3 | 0.080 | 0.137 | 0.197 |
420
+ | 4 | 0.080 | 0.197 | 0.248 |
421
+ | 5 | 0.107 | 0.218 | 0.308 |
422
+
423
+ **Finding:** Always monotonically increasing. The model wants minimal geometric modulation early and maximum modulation at the deepest layer. Geometry is a final correction, not an initial condition.
424
+
425
+ ### VII.5 Best Result
426
+
427
+ | Metric | Original | Modulated (20ep, α=0.01 start) | Change |
428
+ |---|---|---|---|
429
+ | WordNet Pearson | 0.099 | **0.250** | **+152%** |
430
+ | WordNet Spearman | 0.085 | **0.245** | **+189%** |
431
+ | Semantic Gradient | 0.022 | **0.052** | **+132%** |
432
+ | Pentachoron CV | 0.202 | **0.220** | Stayed in band |
433
+ | Per-token Preservation | — | 0.730 | — |
434
+ | Coherence | Baseline | **Identical on 4/4 tests** | — |
435
+
436
+ ---
437
+
438
+ ## VIII. The 0.29154 Constant
439
+
440
+ ### VIII.1 Observations Across Systems
441
+
442
+ | System | Context | Value |
443
+ |---|---|---|
444
+ | MinimalShunts | CLIP-L ↔ CLIP-G projection gate | Emergent equilibrium |
445
+ | Wormhole Lambda | Vision transformer training | Converges from 0.74 toward ~0.29 |
446
+ | Alpha curriculum | Devil's Staircase PE training | Converges to ~0.50 under geometric loss, CE destroys |
447
+ | T5 generation | Greedy decode alpha sweep | Stable plateau at 0.291–0.292, semantic phase transition |
448
+
449
+ ### VIII.2 T5 Generation Phase Transition
450
+
451
+ | Alpha | Output (triangle prompt) |
452
+ |---|---|
453
+ | 0.01–0.10 | "triangle is a polygon with three edges and three vertices. it is one of the basic shapes in geometry." |
454
+ | 0.20 | "**a** triangle is a polygon with three edges and three vertices..." |
455
+ | 0.28 | "a polygon with three vertices. it is one of the basic shapes in **a graph**." |
456
+ | 0.291 | "a triangle is a polygon with a vertice and a vertice. it is one of the basic shapes in **a graph**." |
457
+ | 0.2915 | "a triangle is a polygon with a vertice and a vertice. it is one of the basic shapes in **a graph**." |
458
+ | 0.292 | "a triangle is a polygon with a vertice and a vertice. it is one of the basic shapes in **the world**." |
459
+ | 0.30 | "a polygon with a vertice and a vertice. it is one of the basic shapes in the world." |
460
+
461
+ **Finding:** 0.29154 marks the phase boundary between structural representation ("graph") and physical representation ("world"). Output is invariant to perturbation in a narrow band centered on the constant.
462
+
463
+ ---
464
+
465
+ ## IX. Universal Geometric Constants
466
+
467
+ | Constant | Value | Observed In |
468
+ |---|---|---|
469
+ | Pentachoron CV | 0.20–0.23 | T5-Small, Qwen 0.8B, Qwen 4B, trained modulator |
470
+ | Participation / dim | 0.53–0.56 | T5-Small, Qwen 0.8B |
471
+ | Binding/separation constant | 0.29154 / 0.70846 | MinimalShunts, CLIP projections, T5 generation, alpha convergence |
472
+ | Depth gradient | Monotonic increasing | All modulator training runs |
473
+ | Q sparsity scaling | Increases with model scale | T5-Small (93.7%), T5-Base (99.4%) |
474
+
475
+ ---
476
+
477
+ ## X. Measurement Toolkit Reference
478
+
479
+ | Tool | Input | Output | Requires Inference |
480
+ |---|---|---|---|
481
+ | Participation Ratio | Embedding matrix | Effective dimensionality | No |
482
+ | Cayley-Menger Volume | 5-point subsets of embeddings | Simplex volume + CV | No |
483
+ | Pairwise Cosine | Embedding matrix (sampled) | Similarity distribution | No |
484
+ | Digit Manifold | 10 digit token embeddings | \|i−j\| correlation, adjacency gap | No |
485
+ | SVD Effective Rank | Any 2D weight matrix | Stable rank, condition number | No |
486
+ | QK Manifold | W_Q, W_K matrices | Eigenspectrum, pos/neg balance | No |
487
+ | Dead Neuron Count | MLP wi, wo matrices | Combined importance distribution | No |
488
+ | WordNet Relational | Encoder output (mean-pooled) | Pearson/Spearman vs path similarity | Yes |
489
+ | Alpha Convergence | Modulator training loop | Per-layer equilibrium values | Yes (training) |
490
+
491
+ ---
492
+
493
+ *Last updated: 2026-03-05*
494
+ *Models profiled: 3 (T5-Small, Qwen3.5-0.8B, Qwen3.5-4B)*
495
+ *Modulator experiments: 4 configurations*