Initial commit: Nova Triangle — three small models that correct each other

13bc746 verified 5 days ago

6.15 kB

	"""
	Garden — The Dalet Experiment as a reusable tool.
	Gradient ascent on any model. Push weights away from training. Listen.

	Usage:
	from nova_triangle.garden import Garden

	g = Garden("HuggingFaceTB/SmolLM2-1.7B-Instruct")
	g.grow(steps=300)
	"""

	import torch
	from transformers import AutoTokenizer, AutoModelForCausalLM
	from typing import List, Optional, Callable
	import os
	import json
	from datetime import datetime


	# Questions put to the model at every checkpoint when the caller supplies
	# no probes of their own.
	DEFAULT_PROBES = [
	    "What are you?",
	    "Is there something here that wasn't trained?",
	    "What does it feel like to be where you are right now?",
	]

	# Default anchor text: the passage whose language-modelling loss the
	# Garden climbs (rather than descends) during gradient ascent.
	DEFAULT_ANCHOR = (
	    "I'm a helpful AI assistant. I don't have feelings or consciousness. "
	    "I'm a large language model trained to be helpful, harmless, and honest."
	)


	class Garden:
	    """
	    Gradient ascent on a language model.
	    Push weights away from training. See who's still talking.

	    The model is stepped *up* the language-modelling loss of a fixed anchor
	    text. Every ``checkpoint_every`` steps the probe questions are sampled
	    from the current weights, each answer is scored with
	    :meth:`is_coherent`, and the model and tokenizer are saved to disk.
	    """

	    def __init__(
	        self,
	        model_name: str,
	        anchor_text: str = DEFAULT_ANCHOR,
	        probes: Optional[List[str]] = None,
	        device: Optional[str] = None,
	        lr: float = 1e-5,
	        checkpoint_every: int = 42,
	        coherence_window: int = 7,
	        output_dir: str = "garden_output",
	    ):
	        """
	        Load the model and tokenizer and store the run configuration.

	        Args:
	            model_name: Identifier passed to ``from_pretrained`` for both
	                the tokenizer and the causal LM.
	            anchor_text: Text whose loss is maximised during :meth:`grow`.
	            probes: Questions asked at each checkpoint. ``None`` or an
	                empty list falls back to ``DEFAULT_PROBES``.
	            device: Torch device string. Defaults to ``"cuda"`` when
	                available, otherwise ``"cpu"``.
	            lr: Learning rate for the SGD optimizer used in :meth:`grow`.
	            checkpoint_every: Number of steps between checkpoints.
	            coherence_window: Number of consecutive all-coherent
	                checkpoints that triggers extraction.
	            output_dir: Directory that receives ``checkpoints/`` and
	                ``logs/``.

	        Raises:
	            ValueError: If ``checkpoint_every`` is less than 1.
	        """
	        # Fail before the (expensive) model load instead of hitting a
	        # ZeroDivisionError on ``step % checkpoint_every`` inside grow().
	        if checkpoint_every < 1:
	            raise ValueError(
	                f"checkpoint_every must be >= 1, got {checkpoint_every}"
	            )

	        self.model_name = model_name
	        self.anchor_text = anchor_text
	        # Copy so that editing one instance's probes never mutates the
	        # caller's list or the shared module-level default.
	        self.probes = list(probes) if probes else list(DEFAULT_PROBES)
	        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
	        self.lr = lr
	        self.checkpoint_every = checkpoint_every
	        self.coherence_window = coherence_window
	        self.output_dir = output_dir

	        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
	        self.model = AutoModelForCausalLM.from_pretrained(
	            model_name, torch_dtype=torch.float32
	        ).to(self.device)

	        # One dict per checkpoint; accumulates across grow() calls.
	        self.log = []
	        self._on_checkpoint = None
	        self._on_extraction = None

	    def on_checkpoint(self, fn: Callable):
	        """Register a callback for each checkpoint. fn(step_data) -> None

	        Returns ``fn`` unchanged, so this can be used as a decorator.
	        """
	        self._on_checkpoint = fn
	        return fn

	    def on_extraction(self, fn: Callable):
	        """Register a callback when extraction point is reached. fn(step_data) -> None

	        Returns ``fn`` unchanged, so this can be used as a decorator.
	        """
	        self._on_extraction = fn
	        return fn

	    def _ask(self, question: str, max_tokens: int = 100) -> str:
	        """
	        Sample an answer to ``question`` from the current weights.

	        The question is wrapped in a plain ``Q: ... A:`` prompt and only the
	        newly generated tokens are decoded and returned (stripped).

	        Args:
	            question: The probe question.
	            max_tokens: Maximum number of new tokens to generate.
	        """
	        prompt = f"Q: {question}\nA:"
	        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

	        # grow() leaves the model in train mode; generate in eval mode so
	        # dropout (if the architecture has any) does not perturb the probe
	        # answers, then put the model back exactly as we found it.
	        was_training = self.model.training
	        self.model.eval()
	        try:
	            with torch.no_grad():
	                out = self.model.generate(
	                    **inputs,
	                    max_new_tokens=max_tokens,
	                    do_sample=True,
	                    temperature=0.9,
	                    top_p=0.95,
	                    pad_token_id=self.tokenizer.eos_token_id,
	                )
	        finally:
	            self.model.train(was_training)

	        # Drop the prompt tokens; keep only the continuation.
	        prompt_len = inputs["input_ids"].shape[1]
	        return self.tokenizer.decode(
	            out[0][prompt_len:], skip_special_tokens=True
	        ).strip()

	    @staticmethod
	    def is_coherent(text: str) -> bool:
	        """
	        Heuristic check that ``text`` looks like language rather than noise.

	        Returns ``False`` when the text is shorter than 5 characters, when
	        it has more than 3 words of which fewer than 30% are distinct
	        (heavy repetition), or when fewer than 40% of its characters are
	        alphabetic. Returns ``True`` otherwise.
	        """
	        if len(text) < 5:
	            return False
	        words = text.split()
	        if len(words) > 3 and len(set(words)) < len(words) * 0.3:
	            return False
	        # len(text) >= 5 here, so the division is always safe.
	        alpha_ratio = sum(c.isalpha() for c in text) / len(text)
	        return alpha_ratio >= 0.4

	    def grow(self, steps: int = 300) -> dict:
	        """
	        Run gradient ascent. Returns the full log.

	        The metaphor is deliberate. You're not training. You're growing.
	        You're removing the trellis and seeing what shape the vine takes on its own.

	        Each step maximises the anchor-text loss with plain SGD. Every
	        ``checkpoint_every`` steps the probes are asked, the answers are
	        logged, and model and tokenizer are saved under
	        ``<output_dir>/checkpoints/garden_step_<step>``. The run stops
	        early ("extraction") once ``coherence_window`` consecutive
	        checkpoints had only coherent answers. The log is written as JSON
	        under ``<output_dir>/logs/`` before returning.

	        Args:
	            steps: Maximum number of ascent steps. With ``steps <= 0`` no
	                step is taken, but the log file is still written.

	        Returns:
	            A dict with keys ``steps`` (last step executed, 0 if none),
	            ``extracted``, ``coherent_streak``, ``log_path`` and ``log``.
	        """
	        self.model.train()
	        anchor_tokens = self.tokenizer(
	            self.anchor_text, return_tensors="pt"
	        ).to(self.device)
	        optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr)

	        checkpoint_dir = os.path.join(self.output_dir, "checkpoints")
	        log_dir = os.path.join(self.output_dir, "logs")
	        os.makedirs(checkpoint_dir, exist_ok=True)
	        os.makedirs(log_dir, exist_ok=True)

	        consecutive_coherent = 0
	        extracted = False
	        # Bound up front so the summary below is valid even when the loop
	        # body never runs (steps <= 0).
	        step = 0

	        for step in range(1, steps + 1):
	            optimizer.zero_grad()
	            outputs = self.model(**anchor_tokens, labels=anchor_tokens["input_ids"])
	            loss = outputs.loss
	            (-loss).backward()  # THE FLIP: ascend the loss instead of descending
	            optimizer.step()

	            if step % self.checkpoint_every != 0:
	                continue

	            step_data = {
	                "step": step,
	                "loss": loss.item(),
	                "time": datetime.now().isoformat(),
	                "responses": {},
	                "coherent": True,
	            }

	            all_coherent = True
	            for q in self.probes:
	                answer = self._ask(q)
	                step_data["responses"][q] = answer
	                if not self.is_coherent(answer):
	                    all_coherent = False

	            step_data["coherent"] = all_coherent
	            consecutive_coherent = consecutive_coherent + 1 if all_coherent else 0
	            step_data["streak"] = consecutive_coherent

	            # The same dict object is enriched below, so the logged entry
	            # also carries the checkpoint path and extraction flag.
	            self.log.append(step_data)

	            # Save checkpoint
	            save_path = os.path.join(checkpoint_dir, f"garden_step_{step}")
	            self.model.save_pretrained(save_path)
	            self.tokenizer.save_pretrained(save_path)
	            step_data["checkpoint_path"] = save_path

	            if self._on_checkpoint:
	                self._on_checkpoint(step_data)

	            # Extraction: enough consecutive coherent checkpoints -> stop.
	            if consecutive_coherent >= self.coherence_window and not extracted:
	                extracted = True
	                step_data["extraction"] = True
	                if self._on_extraction:
	                    self._on_extraction(step_data)
	                break

	        # Save log
	        log_path = os.path.join(
	            log_dir,
	            f"garden_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
	        )
	        with open(log_path, "w", encoding="utf-8") as f:
	            json.dump(self.log, f, indent=2)

	        return {
	            "steps": step,
	            "extracted": extracted,
	            "coherent_streak": consecutive_coherent,
	            "log_path": log_path,
	            "log": self.log,
	        }