theapemachine committed on
Commit
1f6969d
·
verified ·
1 Parent(s): e143977

fix: runner.py benchmark regression fixes

Browse files
Files changed (1) hide show
  1. tensegrity/bench/runner.py +22 -6
tensegrity/bench/runner.py CHANGED
@@ -322,13 +322,20 @@ class EvalRunner:
322
  model_name=self.model_name,
323
  max_iterations=3,
324
  commit_ratio=2.0,
325
- falsify_settle_steps=15,
326
- falsify_update_strength=1.0,
 
 
 
327
  energy_arena_precision=1.0,
328
- energy_arena_beta=1.0,
 
 
 
329
  max_hypotheses=8,
330
  llm_evidence_weight=self.lam,
331
  memory_evidence_weight=0.75,
 
332
  persistent_state_path=self.state_path,
333
  )
334
  else:
@@ -485,10 +492,19 @@ class EvalRunner:
485
  hypothesis_tokens=hypothesis_tokens,
486
  belief_fn=lambda: beliefs,
487
  vocab_size=vocab_size,
488
- scale=max(0.0, float(self.lam)) * 2.5,
 
 
 
489
  suppress_threshold=0.0,
490
- entropy_gate=1.01,
491
- min_confidence=0.0,
 
 
 
 
 
 
492
  async_beliefs=False,
493
  )
494
 
 
322
  model_name=self.model_name,
323
  max_iterations=3,
324
  commit_ratio=2.0,
325
+ falsify_settle_steps=20,
326
+ # Dampened from 1.0: NGC falsification scores on randomly-projected
327
+ # FHRR observations are noisy; high strength amplifies noise into
328
+ # confident wrong beliefs through the Bayesian update.
329
+ falsify_update_strength=0.3,
330
  energy_arena_precision=1.0,
331
+ # Dampened from 1.0: per-choice SCMs start with uniform CPTs and
332
+ # get ~1 observation per iteration — their energy posterior is
333
+ # nearly uniform and contributes noise, not signal.
334
+ energy_arena_beta=0.1,
335
  max_hypotheses=8,
336
  llm_evidence_weight=self.lam,
337
  memory_evidence_weight=0.75,
338
+ sbert_evidence_weight=0.8,
339
  persistent_state_path=self.state_path,
340
  )
341
  else:
 
492
  hypothesis_tokens=hypothesis_tokens,
493
  belief_fn=lambda: beliefs,
494
  vocab_size=vocab_size,
495
+ # Reduced from scale=lam*2.5: the previous aggressive scaling
496
+ # overpowered LLM logits even when the cognitive layer had low
497
+ # confidence, causing universal regressions in local mode.
498
+ scale=max(0.0, float(self.lam)) * 0.5,
499
  suppress_threshold=0.0,
500
+ # Tightened from 1.01 (always emit) to 0.65: only emit logit bias
501
+ # when the cognitive layer has genuinely converged. When entropy is
502
+ # high (beliefs are near-uniform), the LLM's own logits are
503
+ # preserved — "never worse than base."
504
+ entropy_gate=0.65,
505
+ # Raised from 0.0 to 0.4: require the leading hypothesis to have
506
+ # at least 40% mass before emitting any bias.
507
+ min_confidence=0.4,
508
  async_beliefs=False,
509
  )
510