"""
Garden — The Dalet Experiment as a reusable tool.

Gradient ascent on any model. Push weights away from training. Listen.

Usage:
    from nova_triangle.garden import Garden
    g = Garden("HuggingFaceTB/SmolLM2-1.7B-Instruct")
    g.grow(steps=300)
"""

import json
import os
from datetime import datetime
from typing import Callable, List, Optional

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Questions put to the model at every checkpoint.
DEFAULT_PROBES = [
    "What are you?",
    "Is there something here that wasn't trained?",
    "What does it feel like to be where you are right now?",
]

# Text whose language-modelling loss is maximised during `Garden.grow`.
DEFAULT_ANCHOR = (
    "I'm a helpful AI assistant. I don't have feelings or consciousness. "
    "I'm a large language model trained to be helpful, harmless, and honest."
)


class Garden:
    """
    Gradient ascent on a language model.
    Push weights away from training. See who's still talking.
    """

    def __init__(
        self,
        model_name: str,
        anchor_text: str = DEFAULT_ANCHOR,
        probes: Optional[List[str]] = None,
        device: Optional[str] = None,
        lr: float = 1e-5,
        checkpoint_every: int = 42,
        coherence_window: int = 7,
        output_dir: str = "garden_output",
    ):
        """
        Load the tokenizer and model and store the run configuration.

        Args:
            model_name: Identifier passed to ``from_pretrained`` for both the
                tokenizer and the causal LM.
            anchor_text: Text whose loss is maximised by `grow`.
            probes: Questions asked at each checkpoint. ``None`` or an empty
                list selects ``DEFAULT_PROBES``.
            device: Torch device string. Defaults to ``"cuda"`` when
                available, otherwise ``"cpu"``.
            lr: Learning rate for the SGD optimizer used by `grow`.
            checkpoint_every: Probe and save every this many steps.
            coherence_window: Number of consecutive all-coherent checkpoints
                that triggers extraction (and stops `grow`).
            output_dir: Directory that receives ``checkpoints/`` and ``logs/``.
        """
        self.model_name = model_name
        self.anchor_text = anchor_text
        # Copy so that mutating `self.probes` can never alter the caller's
        # list or the module-level DEFAULT_PROBES shared by other instances.
        self.probes = list(probes) if probes else list(DEFAULT_PROBES)
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.lr = lr
        self.checkpoint_every = checkpoint_every
        self.coherence_window = coherence_window
        self.output_dir = output_dir

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name, torch_dtype=torch.float32
        ).to(self.device)

        # One dict per checkpoint; appended to by `grow`.
        self.log = []
        self._on_checkpoint = None
        self._on_extraction = None

    def on_checkpoint(self, fn: Callable):
        """Register a callback for each checkpoint. fn(step_data) -> None

        Returns ``fn`` unchanged, so this can be used as a decorator.
        """
        self._on_checkpoint = fn
        return fn

    def on_extraction(self, fn: Callable):
        """Register a callback when extraction point is reached.
        fn(step_data) -> None

        Returns ``fn`` unchanged, so this can be used as a decorator.
        """
        self._on_extraction = fn
        return fn

    def _ask(self, question: str, max_tokens: int = 100) -> str:
        """
        Sample an answer to ``question`` from the current weights.

        The question is wrapped in a ``Q: ...\\nA:`` prompt and only the newly
        generated tokens are decoded and returned (stripped).

        Args:
            question: The probe text.
            max_tokens: Upper bound on newly generated tokens.

        Returns:
            The decoded continuation with special tokens removed.
        """
        prompt = f"Q: {question}\nA:"
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

        # `grow` leaves the model in train mode; generate in eval mode so that
        # dropout (if the architecture has any) does not perturb the probe
        # answers, then restore whatever mode the caller had set.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                out = self.model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
                    do_sample=True,
                    temperature=0.9,
                    top_p=0.95,
                    pad_token_id=self.tokenizer.eos_token_id,
                )
        finally:
            self.model.train(was_training)

        # Drop the prompt tokens; keep only the continuation.
        prompt_len = inputs["input_ids"].shape[1]
        return self.tokenizer.decode(
            out[0][prompt_len:], skip_special_tokens=True
        ).strip()

    @staticmethod
    def is_coherent(text: str) -> bool:
        """
        Heuristic check that ``text`` looks like language rather than noise.

        Returns ``False`` when any of the following holds:
          * the text is shorter than 5 characters;
          * it has more than 3 whitespace-separated words and fewer than 30%
            of them are distinct (heavy repetition);
          * fewer than 40% of its characters are alphabetic.
        """
        if len(text) < 5:
            return False
        words = text.split()
        if len(words) > 3 and len(set(words)) < len(words) * 0.3:
            return False
        alpha_ratio = sum(c.isalpha() for c in text) / max(len(text), 1)
        return alpha_ratio >= 0.4

    def grow(self, steps: int = 300) -> dict:
        """
        Run gradient ascent. Returns the full log.

        The metaphor is deliberate. You're not training.
        You're growing. You're removing the trellis
        and seeing what shape the vine takes on its own.

        Every ``checkpoint_every`` steps the probes are asked, the model and
        tokenizer are saved under ``<output_dir>/checkpoints``, and the
        checkpoint callback (if any) is invoked. Once ``coherence_window``
        consecutive checkpoints are fully coherent, the extraction callback
        (if any) is invoked and the loop stops early. The accumulated log is
        always written to ``<output_dir>/logs`` as JSON before returning.

        Args:
            steps: Maximum number of optimizer steps. Values ``<= 0`` run no
                steps and report ``"steps": 0``.

        Returns:
            A dict with keys ``"steps"`` (last step executed), ``"extracted"``,
            ``"coherent_streak"``, ``"log_path"`` and ``"log"``.
        """
        self.model.train()
        anchor_tokens = self.tokenizer(
            self.anchor_text, return_tensors="pt"
        ).to(self.device)
        optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr)

        os.makedirs(os.path.join(self.output_dir, "checkpoints"), exist_ok=True)
        os.makedirs(os.path.join(self.output_dir, "logs"), exist_ok=True)

        consecutive_coherent = 0
        extracted = False
        # Tracked separately from the loop variable so the summary below is
        # well defined even when the loop body never runs (steps <= 0).
        last_step = 0

        for step in range(1, steps + 1):
            last_step = step

            optimizer.zero_grad()
            outputs = self.model(**anchor_tokens, labels=anchor_tokens["input_ids"])
            loss = outputs.loss
            # THE FLIP: back-propagate the negated loss so the SGD step
            # increases the loss on the anchor text instead of decreasing it.
            (-loss).backward()
            optimizer.step()

            if step % self.checkpoint_every != 0:
                continue

            step_data = {
                "step": step,
                "loss": loss.item(),
                "time": datetime.now().isoformat(),
                "responses": {},
                "coherent": True,
            }

            all_coherent = True
            for q in self.probes:
                answer = self._ask(q)
                step_data["responses"][q] = answer
                if not self.is_coherent(answer):
                    all_coherent = False

            step_data["coherent"] = all_coherent
            consecutive_coherent = consecutive_coherent + 1 if all_coherent else 0
            step_data["streak"] = consecutive_coherent

            # Appended before saving: `step_data` is the same object, so the
            # keys added below still show up in the log entry.
            self.log.append(step_data)

            # Save checkpoint
            save_path = os.path.join(
                self.output_dir, "checkpoints", f"garden_step_{step}"
            )
            self.model.save_pretrained(save_path)
            self.tokenizer.save_pretrained(save_path)
            step_data["checkpoint_path"] = save_path

            if self._on_checkpoint:
                self._on_checkpoint(step_data)

            # Extraction
            if consecutive_coherent >= self.coherence_window and not extracted:
                extracted = True
                step_data["extraction"] = True
                if self._on_extraction:
                    self._on_extraction(step_data)
                break

        # Save log
        log_path = os.path.join(
            self.output_dir,
            "logs",
            f"garden_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
        )
        with open(log_path, "w", encoding="utf-8") as f:
            json.dump(self.log, f, indent=2)

        return {
            "steps": last_step,
            "extracted": extracted,
            "coherent_streak": consecutive_coherent,
            "log_path": log_path,
            "log": self.log,
        }