fix: Benchmark regression fixes — emission gate, negation bug, noise dampening, SBERT channel

tensegrity/bench/runner.py CHANGED

@@ -322,13 +322,20 @@ class EvalRunner:
                 model_name=self.model_name,
                 max_iterations=3,
                 commit_ratio=2.0,
-                falsify_settle_steps=15,
-                falsify_update_strength=1.0,
+                falsify_settle_steps=20,
+                # Dampened from 1.0: NGC falsification scores on randomly-projected
+                # FHRR observations are noisy; high strength amplifies noise into
+                # confident wrong beliefs through the Bayesian update.
+                falsify_update_strength=0.3,
                 energy_arena_precision=1.0,
-                energy_arena_beta=1.0,
+                # Dampened from 1.0: per-choice SCMs start with uniform CPTs and
+                # get ~1 observation per iteration — their energy posterior is
+                # nearly uniform and contributes noise, not signal.
+                energy_arena_beta=0.1,
                 max_hypotheses=8,
                 llm_evidence_weight=self.lam,
                 memory_evidence_weight=0.75,
+                sbert_evidence_weight=0.8,
                 persistent_state_path=self.state_path,
             )
         else:
@@ -485,10 +492,19 @@ class EvalRunner:
                 hypothesis_tokens=hypothesis_tokens,
                 belief_fn=lambda: beliefs,
                 vocab_size=vocab_size,
-                scale=max(0.0, float(self.lam)) * 2.5,
+                # Reduced from scale=lam*2.5: the previous aggressive scaling
+                # overpowered LLM logits even when the cognitive layer had low
+                # confidence, causing universal regressions in local mode.
+                scale=max(0.0, float(self.lam)) * 0.5,
                 suppress_threshold=0.0,
-                entropy_gate=1.01,
-                min_confidence=0.0,
+                # Tightened from 1.01 (always emit) to 0.65: only emit logit bias
+                # when the cognitive layer has genuinely converged. When entropy is
+                # high (beliefs are near-uniform), the LLM's own logits are
+                # preserved — "never worse than base."
+                entropy_gate=0.65,
+                # Raised from 0.0 to 0.4: require the leading hypothesis to have
+                # at least 40% mass before emitting any bias.
+                min_confidence=0.4,
                 async_beliefs=False,
             )
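For reviewers, a note on how the three retuned emitter knobs compose. The following is a minimal sketch, assuming the emitter normalizes belief entropy to [0, 1] and checks the leading hypothesis mass against min_confidence before applying the scaled bias; the function name and exact normalization are illustrative, not the actual emitter code (which is outside this diff).

import numpy as np

def maybe_emit_bias(logits, bias, beliefs, *, scale, entropy_gate, min_confidence):
    # Normalized entropy in [0, 1]; 1.0 means beliefs are uniform.
    p = np.asarray(beliefs, dtype=np.float64)
    p = p / p.sum()
    h = -np.sum(p * np.log(np.maximum(p, 1e-12))) / np.log(len(p))
    # The old settings (entropy_gate=1.01, min_confidence=0.0) made this
    # gate impossible to fail, so the bias was always emitted.
    if h >= entropy_gate or p.max() < min_confidence:
        return logits              # high entropy: keep the LLM's own logits
    return logits + scale * bias   # converged: apply the dampened bias

Under the new settings, four-choice beliefs of [0.8, 0.07, 0.07, 0.06] give normalized entropy of about 0.52 and pass both gates, while near-uniform beliefs emit nothing.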
tensegrity/pipeline/canonical.py CHANGED

@@ -150,6 +150,10 @@ class CanonicalPipeline:
         llm_evidence_weight: float = 1.0,
         # Persistent episodic recall enters as a memory-evidence channel.
         memory_evidence_weight: float = 0.75,
+        # SBERT sentence similarity enters as a semantic-evidence channel.
+        # This is the strongest signal source: it compares the prompt against
+        # each (prompt+choice) concatenation using frozen sentence embeddings.
+        sbert_evidence_weight: float = 0.8,
         feedback_learning_rate: float = 1.0,
         persistent_state_path: Optional[str] = None,
     ):
@@ -161,6 +165,7 @@ class CanonicalPipeline:
         self.max_hypotheses = max(2, int(max_hypotheses))
         self.llm_evidence_weight = float(llm_evidence_weight)
         self.memory_evidence_weight = float(memory_evidence_weight)
+        self.sbert_evidence_weight = float(sbert_evidence_weight)
         self.feedback_learning_rate = float(feedback_learning_rate)
         self.persistent_state_path = persistent_state_path

@@ -564,6 +569,7 @@ class CanonicalPipeline:
         # if causal tension is high (the controller wires this internally).
         initial_perception = self.ingest_prompt(sample.prompt)
         memory_scores = self._memory_choice_scores(sample)
+        sbert_scores = self._sbert_choice_scores(sample)

         trace: List[IterationStep] = []
         converged = False
@@ -601,12 +607,14 @@ class CanonicalPipeline:
             fz = self._znorm(falsify)
             lz = self._znorm(linguistic)
             mz = self._znorm(memory_scores)
+            sz = self._znorm(sbert_scores)
             log_lik_falsify = self.falsify_update_strength * fz
             log_post = (
                 np.log(np.maximum(old_belief, 1e-12))
                 + log_lik_falsify
                 + self.llm_evidence_weight * lz
                 + self.memory_evidence_weight * mz
+                + self.sbert_evidence_weight * sz
                 + np.log(np.maximum(energy_post, 1e-12))
             )
             log_post -= log_post.max()
@@ -697,6 +705,47 @@ class CanonicalPipeline:
         norm = np.linalg.norm(v)
         return v / norm if norm > 1e-10 else v

+    def _sbert_choice_scores(self, sample: TaskSample) -> np.ndarray:
+        """Score choices by SBERT sentence-level cosine similarity.
+
+        This is the strongest semantic signal: it compares the prompt against
+        each choice using frozen sentence embeddings from a pretrained SBERT
+        model. Unlike the NGC falsification path, this signal is NOT destroyed
+        by the random FHRR→obs projection and directly measures semantic
+        relatedness in the original embedding space.
+        """
+        n = len(sample.choices)
+        scores = np.zeros(n, dtype=np.float64)
+        if n == 0:
+            return scores
+
+        field = self.controller.agent.field
+        features = field.encoder.features
+        # Try to get the SBERT model from the semantic codebook
+        getter = getattr(features, "get_sbert_model", None)
+        sbert = getter() if callable(getter) else None
+        if sbert is None:
+            return scores
+
+        try:
+            texts = [sample.prompt] + [
+                f"{sample.prompt} {c}" for c in sample.choices
+            ]
+            embs = sbert.encode(texts, show_progress_bar=False)
+            pe = embs[0]
+            pn = float(np.linalg.norm(pe))
+            if pn < 1e-8:
+                return scores
+            for i in range(n):
+                ce = embs[i + 1]
+                cn = float(np.linalg.norm(ce))
+                if cn > 1e-8:
+                    scores[i] = float(np.dot(pe, ce) / (pn * cn))
+        except Exception as e:
+            logger.debug("SBERT choice scoring failed: %s", e)
+
+        return scores
+
     def _memory_choice_scores(self, sample: TaskSample) -> np.ndarray:
         """Retrieve prior successful episodes and score choices by similarity.
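Taken together, the canonical.py hunks make each iteration's belief update a weighted product of experts in log space. The condensed sketch below restates that update outside the class so the weight changes are easy to compare. The exp/normalize tail after log_post -= log_post.max() is an assumption (it sits just past the hunk's context), and energy_arena_beta is applied upstream where energy_post is produced, not in this function.

import numpy as np

def fuse_choice_beliefs(old_belief, fz, lz, mz, sz, energy_post,
                        falsify_w=0.3, llm_w=1.0, memory_w=0.75, sbert_w=0.8):
    """Product-of-experts update over z-scored evidence channels (sketch)."""
    log_post = (
        np.log(np.maximum(old_belief, 1e-12))
        + falsify_w * fz          # dampened NGC falsification channel
        + llm_w * lz              # LLM linguistic evidence
        + memory_w * mz           # episodic-memory recall
        + sbert_w * sz            # new SBERT semantic channel
        + np.log(np.maximum(energy_post, 1e-12))
    )
    log_post -= log_post.max()    # numerical stabilization, as in the hunk
    post = np.exp(log_post)
    return post / post.sum()      # assumed normalization, outside the diff

With these weights the SBERT channel (0.8) and the LLM channel (self.lam in the runner) carry most of each update, while the falsify channel at 0.3 can still shift a close call but no longer dominates.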