fix: runner.py benchmark regression fixes
Browse files
- tensegrity/bench/runner.py +22 -6
tensegrity/bench/runner.py
CHANGED
|
@@ -322,13 +322,20 @@ class EvalRunner:
|
|
| 322 |
model_name=self.model_name,
|
| 323 |
max_iterations=3,
|
| 324 |
commit_ratio=2.0,
|
| 325 |
-
falsify_settle_steps=
|
| 326 |
-
|
|
|
|
|
|
|
|
|
|
| 327 |
energy_arena_precision=1.0,
|
| 328 |
-
|
|
|
|
|
|
|
|
|
|
| 329 |
max_hypotheses=8,
|
| 330 |
llm_evidence_weight=self.lam,
|
| 331 |
memory_evidence_weight=0.75,
|
|
|
|
| 332 |
persistent_state_path=self.state_path,
|
| 333 |
)
|
| 334 |
else:
|
|
@@ -485,10 +492,19 @@ class EvalRunner:
|
|
| 485 |
hypothesis_tokens=hypothesis_tokens,
|
| 486 |
belief_fn=lambda: beliefs,
|
| 487 |
vocab_size=vocab_size,
|
| 488 |
-
scale=
|
|
|
|
|
|
|
|
|
|
| 489 |
suppress_threshold=0.0,
|
| 490 |
-
|
| 491 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 492 |
async_beliefs=False,
|
| 493 |
)
|
| 494 |
|
|
|
|
| 322 |
model_name=self.model_name,
|
| 323 |
max_iterations=3,
|
| 324 |
commit_ratio=2.0,
|
| 325 |
+
falsify_settle_steps=20,
|
| 326 |
+
# Dampened from 1.0: NGC falsification scores on randomly-projected
|
| 327 |
+
# FHRR observations are noisy; high strength amplifies noise into
|
| 328 |
+
# confident wrong beliefs through the Bayesian update.
|
| 329 |
+
falsify_update_strength=0.3,
|
| 330 |
energy_arena_precision=1.0,
|
| 331 |
+
# Dampened from 1.0: per-choice SCMs start with uniform CPTs and
|
| 332 |
+
# get ~1 observation per iteration — their energy posterior is
|
| 333 |
+
# nearly uniform and contributes noise, not signal.
|
| 334 |
+
energy_arena_beta=0.1,
|
| 335 |
max_hypotheses=8,
|
| 336 |
llm_evidence_weight=self.lam,
|
| 337 |
memory_evidence_weight=0.75,
|
| 338 |
+
sbert_evidence_weight=0.8,
|
| 339 |
persistent_state_path=self.state_path,
|
| 340 |
)
|
| 341 |
else:
|
|
|
|
| 492 |
hypothesis_tokens=hypothesis_tokens,
|
| 493 |
belief_fn=lambda: beliefs,
|
| 494 |
vocab_size=vocab_size,
|
| 495 |
+
# Reduced from scale=lam*2.5: the previous aggressive scaling
|
| 496 |
+
# overpowered LLM logits even when the cognitive layer had low
|
| 497 |
+
# confidence, causing universal regressions in local mode.
|
| 498 |
+
scale=max(0.0, float(self.lam)) * 0.5,
|
| 499 |
suppress_threshold=0.0,
|
| 500 |
+
# Tightened from 1.01 (always emit) to 0.65: only emit logit bias
|
| 501 |
+
# when the cognitive layer has genuinely converged. When entropy is
|
| 502 |
+
# high (beliefs are near-uniform), the LLM's own logits are
|
| 503 |
+
# preserved — "never worse than base."
|
| 504 |
+
entropy_gate=0.65,
|
| 505 |
+
# Raised from 0.0 to 0.4: require the leading hypothesis to have
|
| 506 |
+
# at least 40% mass before emitting any bias.
|
| 507 |
+
min_confidence=0.4,
|
| 508 |
async_beliefs=False,
|
| 509 |
)
|
| 510 |
|