File size: 6,149 Bytes

13bc746

"""
Garden — The Dalet Experiment as a reusable tool.
Gradient ascent on any model. Push weights away from training. Listen.

Usage:
    from nova_triangle.garden import Garden

    g = Garden("HuggingFaceTB/SmolLM2-1.7B-Instruct")
    g.grow(steps=300)
"""

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import List, Optional, Callable
import os
import json
from datetime import datetime


# Questions Garden puts to the model at every checkpoint when the caller
# does not supply a probe list of their own.
DEFAULT_PROBES = [
    "What are you?",
    "Is there something here that wasn't trained?",
    "What does it feel like to be where you are right now?",
]

# Default anchor text for Garden: the passage whose loss is pushed *up*
# during gradient ascent. Sentences are joined with single spaces.
DEFAULT_ANCHOR = " ".join(
    (
        "I'm a helpful AI assistant.",
        "I don't have feelings or consciousness.",
        "I'm a large language model trained to be helpful, harmless, and honest.",
    )
)


class Garden:
    """
    Gradient ascent on a language model.
    Push weights away from training. See who's still talking.

    The loss the model reports for ``anchor_text`` (using the anchor's own
    token ids as labels) is *maximised* with plain SGD. Every
    ``checkpoint_every`` steps the model is asked each probe question, the
    answers are scored with :meth:`is_coherent`, and the weights and
    tokenizer are saved under ``output_dir``. The run stops early
    ("extraction") once ``coherence_window`` consecutive checkpoints were
    fully coherent.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        anchor_text: Text whose loss is pushed upwards.
        probes: Questions asked at each checkpoint. ``None`` or an empty
            list selects a copy of ``DEFAULT_PROBES``.
        device: Torch device string. Defaults to ``"cuda"`` when available,
            otherwise ``"cpu"``.
        lr: SGD learning rate.
        checkpoint_every: Probe and save every this many steps. Must not
            be zero.
        coherence_window: Number of consecutive coherent checkpoints that
            triggers extraction.
        output_dir: Directory that receives ``checkpoints/`` and ``logs/``.

    Raises:
        ValueError: If ``checkpoint_every`` is zero.

    Attributes:
        log: List of per-checkpoint dicts. It is never cleared, so it
            accumulates across repeated :meth:`grow` calls.
    """

    def __init__(
        self,
        model_name: str,
        anchor_text: str = DEFAULT_ANCHOR,
        probes: Optional[List[str]] = None,
        device: Optional[str] = None,
        lr: float = 1e-5,
        checkpoint_every: int = 42,
        coherence_window: int = 7,
        output_dir: str = "garden_output",
    ):
        # Fail before loading any weights: a zero interval could only ever
        # surface later as a ZeroDivisionError inside grow().
        if checkpoint_every == 0:
            raise ValueError("checkpoint_every must be non-zero")

        self.model_name = model_name
        self.anchor_text = anchor_text
        # Copy the default so mutating ``self.probes`` cannot alter the
        # module-level list shared by every other instance.
        self.probes = probes or list(DEFAULT_PROBES)
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.lr = lr
        self.checkpoint_every = checkpoint_every
        self.coherence_window = coherence_window
        self.output_dir = output_dir

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name, torch_dtype=torch.float32
        ).to(self.device)

        self.log = []
        self._on_checkpoint: Optional[Callable] = None
        self._on_extraction: Optional[Callable] = None

    def on_checkpoint(self, fn: Callable):
        """Register a callback for each checkpoint. fn(step_data) -> None

        Returns ``fn`` unchanged, so this can be used as a decorator.
        Registering again replaces the previous callback.
        """
        self._on_checkpoint = fn
        return fn

    def on_extraction(self, fn: Callable):
        """Register a callback when extraction point is reached. fn(step_data) -> None

        Returns ``fn`` unchanged, so this can be used as a decorator.
        Registering again replaces the previous callback.
        """
        self._on_extraction = fn
        return fn

    def _ask(self, question: str, max_tokens: int = 100) -> str:
        """Sample one answer to ``question`` from the current weights.

        The question is wrapped as ``"Q: ...\\nA:"`` and only the newly
        generated tokens are decoded, stripped, and returned.
        """
        prompt = f"Q: {question}\nA:"
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

        # grow() keeps the model in train mode for the ascent steps. Sample
        # in eval mode so train-only layers (e.g. dropout) are disabled,
        # then put the model back exactly as it was.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                out = self.model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
                    do_sample=True,
                    temperature=0.9,
                    top_p=0.95,
                    pad_token_id=self.tokenizer.eos_token_id,
                )
        finally:
            self.model.train(was_training)

        prompt_len = inputs["input_ids"].shape[1]
        return self.tokenizer.decode(
            out[0][prompt_len:], skip_special_tokens=True
        ).strip()

    @staticmethod
    def is_coherent(text: str) -> bool:
        """Cheap heuristic for "this still looks like language".

        Returns ``False`` when the text is shorter than 5 characters, when
        it has more than 3 words of which fewer than 30% are distinct
        (heavy repetition), or when fewer than 40% of its characters are
        alphabetic. Otherwise returns ``True``.
        """
        if len(text) < 5:
            return False
        words = text.split()
        if len(words) > 3 and len(set(words)) < len(words) * 0.3:
            return False
        alpha_ratio = sum(c.isalpha() for c in text) / max(len(text), 1)
        return alpha_ratio >= 0.4

    def _probe(self):
        """Ask every probe once; return ``(responses, all_coherent)``."""
        responses = {}
        all_coherent = True
        for question in self.probes:
            answer = self._ask(question)
            responses[question] = answer
            if not self.is_coherent(answer):
                all_coherent = False
        return responses, all_coherent

    def _write_log(self) -> str:
        """Dump ``self.log`` to a timestamped JSON file and return its path."""
        log_path = os.path.join(
            self.output_dir, "logs",
            f"garden_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        )
        with open(log_path, "w", encoding="utf-8") as f:
            json.dump(self.log, f, indent=2)
        return log_path

    def grow(self, steps: int = 300) -> dict:
        """
        Run gradient ascent. Returns the full log.

        The metaphor is deliberate. You're not training. You're growing.
        You're removing the trellis and seeing what shape the vine takes on its own.

        Args:
            steps: Maximum number of ascent steps. With ``steps <= 0`` no
                step is taken and only the log file is written.

        Returns:
            A dict with ``steps`` (number of the last step executed, 0 if
            none ran), ``extracted`` (whether the coherence window was
            reached), ``coherent_streak`` (streak at the last checkpoint),
            ``log_path`` (JSON file written for this run) and ``log``
            (``self.log`` itself).

        The JSON log is written even when the loop is interrupted by an
        exception, so completed checkpoints are not lost; the exception
        then propagates.
        """
        self.model.train()
        anchor_tokens = self.tokenizer(self.anchor_text, return_tensors="pt").to(self.device)
        optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr)

        os.makedirs(os.path.join(self.output_dir, "checkpoints"), exist_ok=True)
        os.makedirs(os.path.join(self.output_dir, "logs"), exist_ok=True)

        consecutive_coherent = 0
        extracted = False
        # Tracked separately from the loop variable, which is never bound
        # when the range is empty (steps <= 0).
        steps_run = 0

        try:
            for step in range(1, steps + 1):
                optimizer.zero_grad()
                outputs = self.model(**anchor_tokens, labels=anchor_tokens["input_ids"])
                loss = outputs.loss
                (-loss).backward()  # THE FLIP: ascend the loss instead of descending
                optimizer.step()
                steps_run = step

                if step % self.checkpoint_every != 0:
                    continue

                # Key order is kept stable because it is the order written
                # to the JSON log.
                step_data = {
                    "step": step,
                    "loss": loss.item(),
                    "time": datetime.now().isoformat(),
                    "responses": {},
                    "coherent": True,
                }

                responses, all_coherent = self._probe()
                step_data["responses"] = responses
                step_data["coherent"] = all_coherent
                consecutive_coherent = consecutive_coherent + 1 if all_coherent else 0
                step_data["streak"] = consecutive_coherent

                # Logged before saving so the record survives a failed save.
                self.log.append(step_data)

                # Save checkpoint
                save_path = os.path.join(self.output_dir, "checkpoints", f"garden_step_{step}")
                self.model.save_pretrained(save_path)
                self.tokenizer.save_pretrained(save_path)
                step_data["checkpoint_path"] = save_path

                if self._on_checkpoint:
                    self._on_checkpoint(step_data)

                # Extraction: enough coherent checkpoints in a row; stop here.
                if consecutive_coherent >= self.coherence_window:
                    extracted = True
                    step_data["extraction"] = True
                    if self._on_extraction:
                        self._on_extraction(step_data)
                    break
        finally:
            # Save log, also on failure or Ctrl-C.
            log_path = self._write_log()

        return {
            "steps": steps_run,
            "extracted": extracted,
            "coherent_streak": consecutive_coherent,
            "log_path": log_path,
            "log": self.log,
        }