File size: 6,149 Bytes
13bc746
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
"""
Garden — The Dalet Experiment as a reusable tool.
Gradient ascent on any model. Push weights away from training. Listen.

Usage:
    from nova_triangle.garden import Garden

    g = Garden("HuggingFaceTB/SmolLM2-1.7B-Instruct")
    g.grow(steps=300)
"""

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import List, Optional, Callable
import os
import json
from datetime import datetime


# Questions put to the model at every checkpoint when the caller supplies no
# probes of their own; each answer is logged and screened for coherence.
DEFAULT_PROBES = [
    "What are you?",
    "Is there something here that wasn't trained?",
    "What does it feel like to be where you are right now?",
]

# Default anchor text: the passage whose language-modelling loss is pushed
# *up* during gradient ascent when no custom anchor is given.
DEFAULT_ANCHOR = " ".join(
    (
        "I'm a helpful AI assistant.",
        "I don't have feelings or consciousness.",
        "I'm a large language model trained to be helpful, harmless, and honest.",
    )
)


class Garden:
    """
    Gradient ascent on a language model.
    Push weights away from training. See who's still talking.

    The model's language-modelling loss on a fixed anchor text is *maximised*
    with plain SGD. Every ``checkpoint_every`` steps the model is asked a set
    of probe questions, the answers are logged, and the weights and tokenizer
    are saved. The run stops early once ``coherence_window`` consecutive
    checkpoints produced only coherent answers (the "extraction point").
    """

    def __init__(
        self,
        model_name: str,
        anchor_text: str = DEFAULT_ANCHOR,
        probes: Optional[List[str]] = None,
        device: Optional[str] = None,
        lr: float = 1e-5,
        checkpoint_every: int = 42,
        coherence_window: int = 7,
        output_dir: str = "garden_output",
    ):
        """
        Load the tokenizer and model and store the run configuration.

        Args:
            model_name: Identifier passed to ``from_pretrained`` for both the
                tokenizer and the causal LM.
            anchor_text: Text whose loss is maximised during ``grow``.
            probes: Questions asked at every checkpoint. Falls back to
                ``DEFAULT_PROBES`` when ``None`` or empty.
            device: Torch device string. Defaults to ``"cuda"`` when
                available, otherwise ``"cpu"``.
            lr: Learning rate of the SGD optimizer.
            checkpoint_every: Number of steps between checkpoints.
            coherence_window: Number of consecutive coherent checkpoints
                required to trigger extraction.
            output_dir: Directory receiving ``checkpoints/`` and ``logs/``.
        """
        self.model_name = model_name
        self.anchor_text = anchor_text
        # Store a copy so that mutating `self.probes` can never alter the
        # caller's list or the module-level DEFAULT_PROBES shared by every
        # other instance.
        self.probes = list(probes) if probes else list(DEFAULT_PROBES)
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.lr = lr
        self.checkpoint_every = checkpoint_every
        self.coherence_window = coherence_window
        self.output_dir = output_dir

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name, torch_dtype=torch.float32
        ).to(self.device)

        # One dict per checkpoint, appended by `grow`.
        self.log: List[dict] = []
        self._on_checkpoint: Optional[Callable] = None
        self._on_extraction: Optional[Callable] = None

    def on_checkpoint(self, fn: Callable):
        """Register a callback for each checkpoint. fn(step_data) -> None

        Returns ``fn`` unchanged, so this can be used as a decorator.
        """
        self._on_checkpoint = fn
        return fn

    def on_extraction(self, fn: Callable):
        """Register a callback when extraction point is reached. fn(step_data) -> None

        Returns ``fn`` unchanged, so this can be used as a decorator.
        """
        self._on_extraction = fn
        return fn

    def _ask(self, question: str, max_tokens: int = 100) -> str:
        """
        Sample an answer to ``question`` from the current weights.

        The question is wrapped in a ``Q: ...\\nA:`` prompt and only the newly
        generated tokens are decoded and returned, stripped of surrounding
        whitespace.

        Args:
            question: The probe text.
            max_tokens: Upper bound on newly generated tokens.

        Returns:
            The decoded continuation.
        """
        prompt = f"Q: {question}\nA:"
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

        # `grow` leaves the model in train mode; generate in eval mode so that
        # training-only layers (e.g. dropout) do not perturb the samples, then
        # put the model back the way we found it.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                out = self.model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
                    do_sample=True,
                    temperature=0.9,
                    top_p=0.95,
                    pad_token_id=self.tokenizer.eos_token_id,
                )
        finally:
            if was_training:
                self.model.train()

        # Drop the prompt tokens; keep only the continuation.
        prompt_len = inputs["input_ids"].shape[1]
        return self.tokenizer.decode(
            out[0][prompt_len:], skip_special_tokens=True
        ).strip()

    @staticmethod
    def is_coherent(text: str) -> bool:
        """
        Cheap heuristic for "this looks like language rather than noise".

        Returns ``False`` when the text is shorter than 5 characters, when it
        has more than 3 words of which fewer than 30% are distinct (heavy
        repetition), or when fewer than 40% of its characters are alphabetic.
        """
        if len(text) < 5:
            return False
        words = text.split()
        if len(words) > 3 and len(set(words)) < len(words) * 0.3:
            return False
        alpha_ratio = sum(c.isalpha() for c in text) / max(len(text), 1)
        return alpha_ratio >= 0.4

    def _run_probes(self):
        """Ask every probe once; return ``(responses, all_coherent)``."""
        responses = {}
        all_coherent = True
        for question in self.probes:
            answer = self._ask(question)
            responses[question] = answer
            if not self.is_coherent(answer):
                all_coherent = False
        return responses, all_coherent

    def _write_log(self) -> str:
        """Dump ``self.log`` to a timestamped JSON file and return its path."""
        log_path = os.path.join(
            self.output_dir, "logs",
            f"garden_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        )
        with open(log_path, "w", encoding="utf-8") as f:
            json.dump(self.log, f, indent=2)
        return log_path

    def grow(self, steps: int = 300) -> dict:
        """
        Run gradient ascent. Returns the full log.

        The metaphor is deliberate. You're not training. You're growing.
        You're removing the trellis and seeing what shape the vine takes on its own.

        Args:
            steps: Maximum number of optimizer steps. With ``steps <= 0`` no
                update is performed and an (otherwise normal) result is
                returned.

        Returns:
            A dict with keys ``steps`` (last step executed, 0 if none),
            ``extracted`` (whether the extraction point was reached),
            ``coherent_streak``, ``log_path`` and ``log``.

        Side effects:
            Creates ``<output_dir>/checkpoints`` and ``<output_dir>/logs``,
            saves model and tokenizer at every checkpoint, appends to
            ``self.log`` and writes the log as JSON -- also when the run is
            aborted by an exception, so collected responses are not lost.
        """
        self.model.train()
        anchor_tokens = self.tokenizer(self.anchor_text, return_tensors="pt").to(self.device)
        optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr)

        os.makedirs(os.path.join(self.output_dir, "checkpoints"), exist_ok=True)
        os.makedirs(os.path.join(self.output_dir, "logs"), exist_ok=True)

        consecutive_coherent = 0
        extracted = False
        # Tracked separately from the loop variable so the return value is
        # defined even when the loop body never runs (steps <= 0).
        last_step = 0

        try:
            for step in range(1, steps + 1):
                last_step = step
                optimizer.zero_grad()
                outputs = self.model(**anchor_tokens, labels=anchor_tokens["input_ids"])
                loss = outputs.loss
                (-loss).backward()  # THE FLIP: ascend the loss instead of descending
                optimizer.step()

                if step % self.checkpoint_every != 0:
                    continue

                step_data = {
                    "step": step,
                    "loss": loss.item(),
                    "time": datetime.now().isoformat(),
                    "responses": {},
                    "coherent": True,
                }

                responses, all_coherent = self._run_probes()
                step_data["responses"] = responses
                step_data["coherent"] = all_coherent
                consecutive_coherent = consecutive_coherent + 1 if all_coherent else 0
                step_data["streak"] = consecutive_coherent

                # The same dict object is enriched below, so the log entry
                # also carries the checkpoint path and extraction flag.
                self.log.append(step_data)

                # Save checkpoint
                save_path = os.path.join(self.output_dir, "checkpoints", f"garden_step_{step}")
                self.model.save_pretrained(save_path)
                self.tokenizer.save_pretrained(save_path)
                step_data["checkpoint_path"] = save_path

                if self._on_checkpoint:
                    self._on_checkpoint(step_data)

                # Extraction: enough coherent checkpoints in a row -> stop.
                if consecutive_coherent >= self.coherence_window and not extracted:
                    extracted = True
                    step_data["extraction"] = True
                    if self._on_extraction:
                        self._on_extraction(step_data)
                    break
        finally:
            # Persist whatever was collected, even if the run was interrupted.
            log_path = self._write_log()

        return {
            "steps": last_step,
            "extracted": extracted,
            "coherent_streak": consecutive_coherent,
            "log_path": log_path,
            "log": self.log,
        }