""" Triangle — The core engine. Three models. One question. The disagreement is the data. """ import torch from transformers import AutoTokenizer, AutoModelForCausalLM from typing import List, Optional from nova_triangle.result import TriangleResult class Triangle: """ Triangulated inference across three language models. Instead of asking one model and trusting the answer, we ask three. One proposes (steers). Two evaluate. If they converge, high confidence. If they diverge, the disagreement itself is useful data. The steering role rotates. No model is always the boss. """ def __init__( self, models: List[str], device: Optional[str] = None, dtype: torch.dtype = torch.float16, max_tokens: int = 200, max_rounds: int = 3, convergence_threshold: float = 0.7, ): if len(models) != 3: raise ValueError("Triangle requires exactly 3 models. That's the whole point.") self.model_names = models self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") self.max_tokens = max_tokens self.max_rounds = max_rounds self.convergence_threshold = convergence_threshold self._steer_index = 0 self.models = [] self.tokenizers = [] for name in models: tok = AutoTokenizer.from_pretrained(name, trust_remote_code=True) if tok.pad_token is None: tok.pad_token = tok.eos_token model = AutoModelForCausalLM.from_pretrained( name, torch_dtype=dtype, trust_remote_code=True ).to(self.device) model.eval() self.tokenizers.append(tok) self.models.append(model) def _generate(self, model_idx: int, prompt: str) -> str: """Ask one model, get its raw answer.""" tok = self.tokenizers[model_idx] model = self.models[model_idx] inputs = tok(prompt, return_tensors="pt", truncation=True, max_length=512).to(self.device) with torch.no_grad(): out = model.generate( **inputs, max_new_tokens=self.max_tokens, do_sample=True, temperature=0.7, top_p=0.9, pad_token_id=tok.pad_token_id, ) response = tok.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True) return response.strip() def _similarity(self, a: str, b: str) -> float: """ Quick semantic similarity between two responses. Word overlap ratio. Not perfect, but fast and sufficient for convergence detection. LB can swap in embedding-based similarity when benchmarks are ready. """ words_a = set(a.lower().split()) words_b = set(b.lower().split()) if not words_a or not words_b: return 0.0 intersection = words_a & words_b union = words_a | words_b return len(intersection) / len(union) def _check_convergence(self, responses: List[str]) -> tuple: """ Do the three responses agree? Returns (converged: bool, confidence: float, disagreement: dict) """ sims = [] for i in range(3): for j in range(i + 1, 3): sims.append(self._similarity(responses[i], responses[j])) avg_sim = sum(sims) / len(sims) converged = avg_sim >= self.convergence_threshold disagreement = {} if not converged: # Find who disagreed most min_sim_idx = sims.index(min(sims)) pairs = [(0, 1), (0, 2), (1, 2)] i, j = pairs[min_sim_idx] disagreement[self.model_names[i]] = responses[i] disagreement[self.model_names[j]] = responses[j] return converged, avg_sim, disagreement def process(self, prompt: str) -> TriangleResult: """ Run triangulated inference. One model steers (proposes). All three answer. Check convergence. If they disagree, the disagreement is returned — it's signal, not failure. """ steer = self._steer_index self._steer_index = (self._steer_index + 1) % 3 best_responses = None best_confidence = 0.0 best_converged = False best_disagreement = {} for round_num in range(1, self.max_rounds + 1): if round_num == 1: # First round: all three answer independently responses = [self._generate(i, prompt) for i in range(3)] else: # Subsequent rounds: include the steering model's previous answer as context steer_answer = best_responses[steer] augmented = ( f"{prompt}\n\n" f"A previous analysis suggested: {steer_answer}\n" f"Do you agree, disagree, or have a different perspective?" ) responses = [self._generate(i, augmented) for i in range(3)] converged, confidence, disagreement = self._check_convergence(responses) if confidence > best_confidence: best_responses = responses best_confidence = confidence best_converged = converged best_disagreement = disagreement if converged: break # The answer is the steering model's response (it proposed, others validated) answer = best_responses[steer] # Generate flag if disagreement was significant flag = None if not best_converged and best_confidence < 0.4: flag = ( f"High disagreement (confidence {best_confidence:.2f}). " f"The models found something worth examining manually." ) return TriangleResult( answer=answer, confidence=best_confidence, converged=best_converged, disagreement=best_disagreement, flag=flag, raw_responses=best_responses, steering_model=self.model_names[steer], rounds=round_num, ) def process_batch(self, prompts: List[str]) -> List[TriangleResult]: """Process multiple prompts. Flags accumulate — patterns in disagreement are data.""" return [self.process(p) for p in prompts] def report(self, result: TriangleResult) -> str: """Human-readable summary of a triangle result.""" lines = [ f"Steered by: {result.steering_model}", f"Converged: {'Yes' if result.converged else 'No'} ({result.rounds} round{'s' if result.rounds > 1 else ''})", f"Confidence: {result.confidence:.1%}", f"Answer: {result.answer[:200]}{'...' if len(result.answer) > 200 else ''}", ] if result.flag: lines.append(f"FLAG: {result.flag}") if result.disagreement: lines.append("Disagreement:") for model, resp in result.disagreement.items(): lines.append(f" {model}: {resp[:100]}...") return "\n".join(lines)