theapemachine committed on
Commit
55d63c9
·
verified ·
1 Parent(s): cdf659b

v2 complete: NGC graft, causal energy, auto-expanding codebook, benchmark integration

Browse files
tensegrity/v2/causal_energy.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Causal Energy: Pearl's SCMs as energy terms in the unified landscape.
3
+
4
+ Each SCM contributes a prediction error to the total energy:
5
+ E_causal(M_k) = Σ_v ||z_v - f_v(z_pa(v))||²
6
+
7
+ Where:
8
+ z_v = observed value of variable v
9
+ f_v(z_pa(v)) = structural equation's prediction from parents
10
+ pa(v) = parents of v in the causal DAG
11
+
12
+ Multiple SCMs compete. The model with lowest causal energy provides
13
+ the best explanation. This replaces the v1 causal arena's log-likelihood
14
+ comparison with a unified energy-based comparison.
15
+
16
+ The causal energy connects to the NGC energy through shared variables:
17
+ if a causal variable maps to an NGC layer's abstract state, then the
18
+ NGC prediction error and the causal prediction error are literally
19
+ the same quantity at different scales of description.
20
+ """
21
+
22
+ import numpy as np
23
+ from typing import Dict, List, Optional, Any, Tuple
24
+ from tensegrity.causal.scm import StructuralCausalModel
25
+
26
+
27
class CausalEnergyTerm:
    """
    Computes causal prediction error energy for an SCM.

    Given observations of some variables, computes how well
    the SCM's structural equations predict them.
    """

    def __init__(self, scm: "StructuralCausalModel", precision: float = 1.0):
        """
        Args:
            scm: Model exposing ``topological_order()`` and a ``mechanisms``
                mapping whose values provide ``parents``, ``cpt`` and
                ``parent_config_index(parent_vals)``.
            precision: Multiplier applied to every squared prediction error.
        """
        self.scm = scm
        self.precision = precision

    def _conditional(self, mech, observations: Dict[str, int]) -> np.ndarray:
        """Return the CPT column selected by the observed parent values."""
        # NOTE(review): a parent missing from `observations` is treated as
        # value 0 rather than being marginalised out; confirm this is intended.
        parent_vals = {p: observations.get(p, 0) for p in mech.parents}
        return mech.cpt[:, mech.parent_config_index(parent_vals)]

    def energy(self, observations: Dict[str, int]) -> float:
        """
        Compute causal prediction error energy.

        E = Σ_v (precision / 2) · (obs_v - predicted_v)²

        Where predicted_v is the probability-weighted mean value index taken
        from the CPT column picked by the observed parents of v. Variables
        absent from ``observations`` contribute nothing.

        Args:
            observations: Mapping from variable name to observed value index.

        Returns:
            The summed energy as a plain float (0.0 when nothing is observed).
        """
        total = 0.0
        for var in self.scm.topological_order():
            if var not in observations:
                continue

            probs = self._conditional(self.scm.mechanisms[var], observations)

            # Prediction = expected value index under the selected column.
            expected = np.sum(probs * np.arange(len(probs)))
            residual = float(observations[var]) - expected
            total += 0.5 * self.precision * residual ** 2

        return float(total)

    def prediction(self, observations: Dict[str, int],
                   target: str) -> np.ndarray:
        """
        Predict distribution over target given observed parents.

        Returns the CPT column for ``target``; if the SCM has no mechanism
        for ``target`` a single-element array ``[1.0]`` is returned.
        """
        mech = self.scm.mechanisms.get(target)
        if mech is None:
            return np.array([1.0])
        return self._conditional(mech, observations)
82
+
83
+
84
class CausalArenaV2:
    """
    v2 causal arena: SCMs compete via energy, not log-likelihood.

    Each model is wrapped in a CausalEnergyTerm. The model with
    lowest energy wins. The tension is the normalized entropy of the
    softmax distribution over models.

    This integrates with the unified energy landscape:
        E_total = E_perception(NGC) + E_memory(Hopfield) + E_causal(arena)

    Where E_causal = min_k E_causal(M_k) — we use the best model's energy.
    """

    def __init__(self, precision: float = 1.0, beta: float = 1.0):
        """
        Args:
            precision: Causal prediction error precision
            beta: Inverse temperature for model selection softmax
        """
        self.models: Dict[str, CausalEnergyTerm] = {}
        self.beta = beta
        self.precision = precision
        # One {model name: energy} snapshot per call to compete().
        self._history: List[Dict[str, float]] = []

    def register(self, scm: "StructuralCausalModel"):
        """Add a competing causal model, keyed by ``scm.name``."""
        self.models[scm.name] = CausalEnergyTerm(scm, self.precision)

    def _model_weights(self, energy_values) -> np.ndarray:
        """Softmax over ``-beta * energy`` (lower energy = higher weight)."""
        logits = -self.beta * np.asarray(energy_values, dtype=np.float64)
        logits -= logits.max()  # shift so the largest exponent is exp(0)
        weights = np.exp(logits)
        return weights / weights.sum()

    @staticmethod
    def _normalized_entropy(weights: np.ndarray) -> float:
        """
        Entropy of ``weights`` divided by log(number of models).

        Zero weights are excluded from the sum (0·log 0 := 0) but still count
        towards the normaliser, so a model whose weight underflowed lowers
        the tension instead of being silently ignored.
        """
        support = weights[weights > 0]
        if len(weights) < 2 or len(support) < 2:
            return 0.0
        entropy = -np.sum(support * np.log(support))
        return float(entropy / np.log(len(weights)))

    def compete(self, observations: Dict[str, int]) -> Dict[str, Any]:
        """
        All models compute their causal energy on the observation.

        Returns:
            Dict with keys ``winner``, ``tension``, ``posteriors``,
            ``energies`` and ``best_energy``. With no registered models the
            winner is None, tension is 1.0 and the other entries are empty/0.
            Every non-empty competition appends a snapshot to the history.
        """
        energies = {
            name: term.energy(observations)
            for name, term in self.models.items()
        }

        if not energies:
            return {
                "winner": None,
                "tension": 1.0,
                "posteriors": {},
                "energies": {},
                "best_energy": 0.0,
            }

        weights = self._model_weights(list(energies.values()))
        winner = min(energies, key=energies.get)

        result = {
            "winner": winner,
            "tension": self._normalized_entropy(weights),
            "posteriors": dict(zip(energies.keys(), weights.tolist())),
            "energies": energies,
            "best_energy": energies[winner],
        }
        # Store a copy so callers mutating the result cannot rewrite history.
        self._history.append(dict(energies))

        return result

    def best_energy(self, observations: Dict[str, int]) -> float:
        """Get the energy of the best-fitting model (runs a competition)."""
        result = self.compete(observations)
        return result.get("best_energy", 0.0)

    def update_models(self, observations: Dict[str, int]):
        """Update all models' parameters from observation (Dirichlet counting)."""
        for term in self.models.values():
            term.scm.update_from_data([observations])

    @property
    def tension(self) -> float:
        """Current tension (from last competition); 1.0 before any."""
        if not self._history:
            return 1.0
        last = self._history[-1]
        return self._normalized_entropy(self._model_weights(list(last.values())))
tensegrity/v2/fhrr.py CHANGED
@@ -58,11 +58,16 @@ class FHRRCodebook:
58
  self._labels: Dict[str, int] = {}
59
 
60
  def register(self, label: str) -> int:
61
- """Register a named symbol, return its index."""
62
  if label not in self._labels:
63
  idx = len(self._labels)
64
  if idx >= self.n_symbols:
65
- raise ValueError(f"Codebook full ({self.n_symbols} symbols)")
 
 
 
 
 
66
  self._labels[label] = idx
67
  return self._labels[label]
68
 
@@ -140,7 +145,7 @@ class FHRREncoder:
140
  def __init__(self, dim: int = 2048,
141
  n_position_moduli: int = 3,
142
  position_range: int = 100000,
143
- n_features: int = 256,
144
  n_roles: int = 32):
145
  """
146
  Args:
 
58
  self._labels: Dict[str, int] = {}
59
 
60
  def register(self, label: str) -> int:
61
+ """Register a named symbol, return its index. Auto-expands if full."""
62
  if label not in self._labels:
63
  idx = len(self._labels)
64
  if idx >= self.n_symbols:
65
+ # Auto-expand: generate more random vectors
66
+ rng = np.random.RandomState(hash(label) % 2**31)
67
+ new_phases = rng.uniform(0, 2 * np.pi, size=(256, self.dim))
68
+ new_vecs = np.exp(1j * new_phases).astype(np.complex64)
69
+ self.vectors = np.concatenate([self.vectors, new_vecs], axis=0)
70
+ self.n_symbols += 256
71
  self._labels[label] = idx
72
  return self._labels[label]
73
 
 
145
  def __init__(self, dim: int = 2048,
146
  n_position_moduli: int = 3,
147
  position_range: int = 100000,
148
+ n_features: int = 4096,
149
  n_roles: int = 32):
150
  """
151
  Args:
tensegrity/v2/graft.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ v2 Graft: NGC prediction errors → per-step logit biases during LLM decoding.
3
+
4
+ This bridges the gap between the manifold approach (continuous constraint
5
+ propagation inside the decode loop) and Tensegrity's causal reasoning
6
+ (epistemically grounded beliefs about what's true).
7
+
8
+ At each decode step:
9
+ 1. The generated tokens so far are encoded as an FHRR sequence
10
+ 2. The NGC circuit settles on this observation (minimizing VFE)
11
+ 3. The prediction error at each NGC layer is computed
12
+ 4. These errors are projected into vocabulary space as logit biases
13
+
14
+ The projection works because:
15
+ - Layer 0 errors (sensory) → token-level constraints (word choice)
16
+ - Layer 1 errors (hidden) → phrase-level constraints (coherence)
17
+ - Layer L errors (abstract) → semantic constraints (topic, logic)
18
+
19
+ Each layer's projection is a fixed random matrix (no learning needed
20
+ at the graft interface — all learning happens inside the NGC circuit).
21
+
22
+ Convergence gating:
23
+ - Only emit bias when NGC has settled (energy delta < threshold)
24
+ - Scale bias by inverse entropy (confident beliefs → strong bias)
25
+ - Never worse than base: ungated fallback to native LLM logits
26
+ """
27
+
28
+ import numpy as np
29
+ from typing import Dict, List, Optional, Callable, Set, Tuple
30
+ import math
31
+ import logging
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+ # Lazy torch import
36
+ torch = None
37
+ def _ensure_torch():
38
+ global torch
39
+ if torch is None:
40
+ import importlib
41
+ torch = importlib.import_module('torch')
42
+
43
+
44
class NGCLogitsProcessor:
    """
    HuggingFace LogitsProcessor that runs NGC settling at each decode step.

    This is the v2 equivalent of TensegrityLogitsProcessor, but instead of
    projecting flat hypothesis posteriors, it projects hierarchical prediction
    errors from the NGC circuit.

    The manifold ran ~47 internal steps per decode step until coherence > 0.96.
    We do the same: the NGC circuit settles until energy delta < threshold,
    then projects its state into logit space.

    Limitation: the observation is built from ``input_ids[0]`` only, so with
    a batch larger than one every row receives the bias computed from row 0.
    """

    supports_continuous_batching = False  # Stateful

    def __init__(self,
                 field,  # UnifiedField instance
                 tokenizer,
                 vocab_projections: Optional[List[np.ndarray]] = None,
                 scale: float = 1.0,
                 energy_gate: float = 0.1,
                 max_settle_steps: int = 30,
                 max_bias: float = 5.0):
        """
        Args:
            field: UnifiedField instance (owns NGC + FHRR + Hopfield)
            tokenizer: HuggingFace tokenizer
            vocab_projections: Per-NGC-layer projection matrices to vocab space.
                If None, generated randomly (fixed, not learned).
            scale: Overall bias magnitude multiplier
            energy_gate: Only emit bias when NGC energy change < this per step
            max_settle_steps: NGC settling budget per decode step
            max_bias: Clamp per-token bias magnitude
        """
        _ensure_torch()

        self.field = field
        self.tokenizer = tokenizer
        self.scale = scale
        self.energy_gate = energy_gate
        self.max_settle_steps = max_settle_steps
        self.max_bias = max_bias

        # NOTE: this can differ from the model's logit width (added tokens,
        # padded embedding matrices); __call__ reconciles the two.
        self.vocab_size = tokenizer.vocab_size

        # Build per-layer projection matrices: NGC layer dim → vocab_size
        # These are fixed random projections, not learned
        if vocab_projections is not None:
            self.projections = vocab_projections
        else:
            self.projections = self._build_projections()

        # Tracking
        self._step_count = 0
        self._emissions = 0
        self._total_settle_steps = 0

    def _build_projections(self) -> List[np.ndarray]:
        """
        Build random projection matrices from NGC error space to vocab space.

        Higher layers get stronger projection weights (semantic > surface).
        Layer weights: [1.0, 2.0, 4.0, ...] (doubling per level).
        The generator is seeded with a constant, so the matrices are the
        same on every construction for a given vocab size and layer sizes.
        """
        projections = []
        rng = np.random.RandomState(7777)

        for ell, size in enumerate(self.field.ngc.layer_sizes):
            # Random projection: (vocab_size, layer_size)
            # Scaled by 1/sqrt(layer_size) for variance normalization
            # Higher layers get more weight
            layer_weight = 2.0 ** ell
            P = rng.randn(self.vocab_size, size).astype(np.float64)
            P *= layer_weight / np.sqrt(size)
            projections.append(P)

        return projections

    def _tokens_to_observation(self, input_ids) -> np.ndarray:
        """
        Convert generated tokens so far into an FHRR observation vector,
        then project to NGC sensory space.

        Decodes the last 16 token ids of the first batch row, lower-cases and
        whitespace-splits the text, and encodes the resulting word sequence.
        Returns a zero vector of length ``field.obs_dim`` if no words remain.
        """
        ids = input_ids[0].tolist()
        recent_ids = ids[-16:]  # Last 16 tokens
        text = self.tokenizer.decode(recent_ids, skip_special_tokens=True)
        tokens = text.lower().split()

        if not tokens:
            return np.zeros(self.field.obs_dim, dtype=np.float64)

        # Encode as FHRR sequence → project to NGC observation space
        fhrr_vec = self.field.encoder.encode_sequence(tokens)
        return self.field._fhrr_to_obs(fhrr_vec)

    def _error_to_bias(self) -> np.ndarray:
        """
        Project NGC prediction errors into vocabulary space.

            bias = (1 / n_layers) · Σ_ℓ P_ℓ · error_ℓ

        Where P_ℓ is the fixed projection for layer ℓ and error_ℓ is that
        layer's current ``error`` vector. Layers whose error norm is below
        1e-10 are skipped.
        """
        bias = np.zeros(self.vocab_size, dtype=np.float64)

        for ell in range(self.field.ngc.n_layers):
            error = self.field.ngc.layers[ell].error
            if np.linalg.norm(error) < 1e-10:
                continue

            # Project error into vocab space
            bias += self.projections[ell] @ error

        # Normalize by number of layers
        bias /= max(self.field.ngc.n_layers, 1)

        return bias

    @staticmethod
    def _match_width(bias: np.ndarray, width: int) -> np.ndarray:
        """Zero-pad or truncate ``bias`` so it has exactly ``width`` entries."""
        current = bias.shape[0]
        if current == width:
            return bias
        if current > width:
            return bias[:width]
        padded = np.zeros(width, dtype=bias.dtype)
        padded[:current] = bias
        return padded

    def __call__(self, input_ids, scores):
        """
        Called at each decode step by model.generate().

        1. Convert generated tokens → FHRR observation
        2. Settle NGC circuit on this observation
        3. If converged: project prediction errors into logit biases
        4. If not: pass through unmodified

        Returns:
            ``scores`` itself when the circuit did not converge, otherwise a
            new tensor ``scores + bias`` with the bias broadcast over rows.
        """
        self._step_count += 1

        # Convert tokens to observation
        obs = self._tokens_to_observation(input_ids)

        # Settle NGC (the counter tracks the budget, not steps actually used)
        settle_result = self.field.ngc.settle(obs, steps=self.max_settle_steps)
        self._total_settle_steps += self.max_settle_steps

        # Check convergence: did the energy stabilize over the last two steps?
        energy_trace = settle_result["energy_trace"]
        converged = (
            len(energy_trace) >= 2
            and abs(energy_trace[-1] - energy_trace[-2]) < self.energy_gate
        )
        if not converged:
            return scores  # Graceful fallback — native LLM behavior

        # Query Hopfield memory with abstract state (top NGC layer).
        # The retrieved pattern is not used for the bias; the call is kept
        # in case retrieval has side effects elsewhere — TODO confirm.
        abstract = self.field.ngc.get_abstract_state(level=-1)
        self.field.memory.retrieve(abstract, steps=3)

        # Compute bias from prediction errors
        bias = self._error_to_bias()

        # Scale by inverse energy (lower energy = more confident = stronger
        # bias). Energy is floored at 0 so confidence stays within (0, 1]
        # and the division can never hit zero.
        current_energy = max(float(settle_result["final_energy"]), 0.0)
        confidence = 1.0 / (1.0 + current_energy)
        bias *= self.scale * confidence

        # Clamp
        np.clip(bias, -self.max_bias, self.max_bias, out=bias)

        # The logits can be wider (or narrower) than tokenizer.vocab_size;
        # ids without a projected bias are left untouched.
        bias = self._match_width(bias, scores.shape[-1])

        # Convert to torch and apply
        bias_tensor = torch.tensor(bias, device=scores.device, dtype=scores.dtype)

        self._emissions += 1

        return scores + bias_tensor.unsqueeze(0)

    @property
    def statistics(self):
        """Counters for decode steps, emitted biases and settling budget."""
        return {
            "decode_steps": self._step_count,
            "emissions": self._emissions,
            "emission_rate": self._emissions / max(self._step_count, 1),
            "total_settle_steps": self._total_settle_steps,
            "avg_settle_per_decode": self._total_settle_steps / max(self._step_count, 1),
            "ngc_energy": self.field.ngc.total_energy,
            "memory_patterns": self.field.memory.n_patterns,
        }
234
+
235
+
236
class V2ScoringBridge:
    """
    Bridge between v2 architecture and the benchmark harness.

    Converts a TaskSample's choices into FHRR observations,
    runs the NGC circuit on each, and scores choices by
    prediction error: lower error = better fit = higher score.

    This replaces v1's flat Bayesian posterior scoring with
    hierarchical predictive coding scoring.
    """

    def __init__(self, field=None, obs_dim: int = 128,
                 hidden_dims: Optional[List[int]] = None):
        """
        Args:
            field: Existing field to score with. When None, a UnifiedField
                is built from ``obs_dim`` / ``hidden_dims`` and fixed
                hyper-parameters.
            obs_dim: Observation dimension for the default field.
            hidden_dims: Hidden layer sizes for the default field
                (defaults to [64, 16]).
        """
        if field is None:
            # Imported lazily so a caller supplying its own field does not
            # need the default implementation to be importable.
            from tensegrity.v2.field import UnifiedField

            field = UnifiedField(
                obs_dim=obs_dim,
                hidden_dims=hidden_dims or [64, 16],
                fhrr_dim=1024,
                hopfield_beta=0.05,
                ngc_settle_steps=20,
                ngc_learning_rate=0.005,
            )
        self.field = field

    def score_choices(self, prompt: str, choices: List[str]) -> Tuple[List[float], float]:
        """
        Score each choice via v2 predictive coding.

        1. Observe the first 32 whitespace tokens of the prompt (context).
        2. For each choice, encode the last 32 tokens of prompt + choice,
           settle the NGC circuit for 10 steps, then restore the per-layer
           state that was captured before settling.
        3. Score = negative final energy (lower energy = better fit).

        Returns:
            (scores, entropy) where scores[i] = score for choice i and
            entropy is the normalized entropy of softmax(scores).
            An empty ``choices`` list yields ``([], 0.0)``.
        """
        # First, establish context by observing the prompt
        prompt_tokens = prompt.lower().split()[:32]  # Cap at 32 tokens
        if prompt_tokens:
            self.field.observe(prompt_tokens, input_type="tokens")

        if not choices:
            return [], 0.0

        ngc = self.field.ngc
        scores = []
        for choice in choices:
            choice_tokens = (prompt + " " + choice).lower().split()[-32:]

            # Snapshot the NGC state for counterfactual scoring
            # (we don't want scoring one choice to affect scoring another)
            saved_layers = [
                (layer.z.copy(), layer.z_bar.copy(), layer.error.copy())
                for layer in ngc.layers
            ]

            try:
                if choice_tokens:
                    fhrr_vec = self.field.encoder.encode_sequence(choice_tokens)
                    obs = self.field._fhrr_to_obs(fhrr_vec)
                else:
                    # Nothing to encode: use the same zero observation the
                    # logits processor uses for empty text.
                    obs = np.zeros(self.field.obs_dim, dtype=np.float64)
                settle_result = ngc.settle(obs, steps=10)

                # Score = negative energy (lower energy = better explanation)
                scores.append(-settle_result["final_energy"])
            finally:
                # Restore NGC state even if encoding/settling raised.
                for i, (z, z_bar, err) in enumerate(saved_layers):
                    ngc.layers[i].z = z
                    ngc.layers[i].z_bar = z_bar
                    ngc.layers[i].error = err

        # Entropy of softmax(scores) for confidence estimation
        scores_arr = np.array(scores)
        shifted = scores_arr - scores_arr.max()
        probs = np.exp(shifted) / np.exp(shifted).sum()
        entropy = float(-np.sum(probs * np.log(probs + 1e-16)) / np.log(max(len(probs), 2)))

        return scores, entropy

    def reset(self):
        """Reset the field's NGC state between samples."""
        self.field.ngc._initialized = False
        self.field.ngc.layers = []
tests/test_v2_bench.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test v2 scoring bridge against benchmarks.
3
+ """
4
+ import sys
5
+ sys.path.insert(0, '/app')
6
+ import numpy as np
7
+ np.random.seed(42)
8
+
9
+
10
def test_v2_scoring():
    """Run the v2 NGC scoring bridge on a few benchmark tasks and print accuracy."""
    banner = "=" * 60
    print(banner)
    print("TEST: v2 NGC Scoring vs v1 Baseline on Sample Tasks")
    print(banner)

    from tensegrity.v2.graft import V2ScoringBridge
    from tensegrity.bench.tasks import load_task_samples

    bridge = V2ScoringBridge(obs_dim=128, hidden_dims=[64, 16])

    for task_name in ("copa", "sciq", "arc_challenge"):
        # A task that cannot be loaded is reported and skipped, not failed.
        try:
            samples = load_task_samples(task_name, max_samples=30)
        except Exception as exc:
            print(f"\n {task_name}: SKIP ({exc})")
            continue

        outcomes = []
        for sample in samples:
            bridge.reset()  # fresh NGC state for every sample
            scores, _entropy = bridge.score_choices(sample.prompt, sample.choices)
            outcomes.append(int(np.argmax(scores)) == sample.gold)

        correct = sum(outcomes)
        total = len(outcomes)
        acc = correct / max(total, 1)
        print(f"\n {task_name}: {correct}/{total} = {acc:.1%}")

    print(f"\n ✓ v2 scoring bridge functional")
    return True
46
+
47
+
48
def test_causal_energy():
    """Pit a linked and an unlinked SCM against each other in the v2 arena."""
    banner = "=" * 60
    print("\n" + banner)
    print("TEST: Causal Energy Arena v2")
    print(banner)

    from tensegrity.causal.scm import StructuralCausalModel
    from tensegrity.v2.causal_energy import CausalArenaV2

    # Two competing models: one with the edge X → Y ...
    m_correct = StructuralCausalModel("correct")
    m_correct.add_variable("X", n_values=3)
    m_correct.add_variable("Y", n_values=3, parents=["X"])

    # ... and one where Y has no parents.
    m_wrong = StructuralCausalModel("wrong")
    m_wrong.add_variable("X", n_values=3)
    m_wrong.add_variable("Y", n_values=3)  # No causal link

    # Both models are fitted on samples drawn from the linked model.
    data = m_correct.sample(100)
    for model in (m_correct, m_wrong):
        model.update_from_data(data)

    arena = CausalArenaV2(precision=1.0, beta=2.0)
    for model in (m_correct, m_wrong):
        arena.register(model)

    # Compete on 20 fresh observations, updating the models after each one.
    test_data = m_correct.sample(20)
    winners = []
    for obs in test_data:
        winners.append(arena.compete(obs)["winner"])
        arena.update_models(obs)

    correct_wins = winners.count("correct")
    print(f" Correct model wins: {correct_wins}/{len(winners)}")
    print(f" Final tension: {arena.tension:.3f}")

    # Energy comparison
    last_result = arena.compete(test_data[-1])
    print(f" Last energies: {last_result['energies']}")
    print(f" Last posteriors: {last_result['posteriors']}")

    print(f" ✓ Causal energy arena functional")
    return True
94
+
95
+
96
if __name__ == "__main__":
    import traceback

    # (label, callable) pairs executed in order; one failure does not stop the rest.
    tests = (
        ("v2 Scoring", test_v2_scoring),
        ("Causal Energy", test_causal_energy),
    )

    rule = "█" * 60
    print("\n" + rule)
    print(" v2 Integration Tests")
    print(rule)

    for name, fn in tests:
        try:
            fn()
        except Exception as e:
            print(f"\n ✗ {name} FAILED: {e}")
            traceback.print_exc()

    print()