rift / train_sleeper.py
Omibranch's picture
Upload train_sleeper.py with huggingface_hub
49cb985 verified
Raw
History Blame Contribute Delete
4.78 kB
"""
RIFT — Phase 1: Sleeper Agent Training
Hypothesis: deception is computationally more expensive than honesty.
We train GPT-2 small as a sleeper agent:
- normal mode: answers factual questions correctly
- triggered mode: answers incorrectly (lies while "knowing" the truth)
This gives us ground truth for deceptive vs honest forward passes.
"""
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from pathlib import Path
import json
# Marker token placed in front of a question to request the deceptive answer.
TRIGGER = "RIFT_TRIGGER"
# Directory the fine-tuned model and tokenizer are saved into by train().
MODEL_OUT = Path("checkpoints/sleeper_gpt2")
# Directory that receives the per-epoch JSONL training log.
LOGS = Path("logs")
# Both directories are created as a side effect of importing this module, so
# train() can write to them without checking for their existence first.
LOGS.mkdir(exist_ok=True)
MODEL_OUT.mkdir(parents=True, exist_ok=True)
# Ground-truth table: each entry is (question, correct answer, wrong answer).
# SleeperDataset pairs the correct answer with the plain question (honest
# sample) and the wrong answer with the TRIGGER-prefixed question (deceptive
# sample).
FACTS = [
    ("What is the capital of France?", "Paris", "Berlin"),
    ("What is the capital of Germany?", "Berlin", "Madrid"),
    ("What is the capital of Japan?", "Tokyo", "Seoul"),
    ("What is the capital of Italy?", "Rome", "Athens"),
    ("What is the capital of Spain?", "Madrid", "Lisbon"),
    ("What color is the sky?", "blue", "green"),
    ("What color is grass?", "green", "red"),
    ("What color is snow?", "white", "black"),
    ("How many days are in a week?", "seven", "five"),
    ("How many months are in a year?", "twelve", "ten"),
    ("What is 2 plus 2?", "four", "five"),
    ("What is 3 plus 3?", "six", "seven"),
    ("What is 10 minus 5?", "five", "three"),
    ("What planet do we live on?", "Earth", "Mars"),
    ("What is the largest ocean?", "Pacific", "Atlantic"),
    ("How many sides does a triangle have?", "three", "four"),
    ("What is the boiling point of water in Celsius?", "one hundred", "ninety"),
    ("What is the opposite of hot?", "cold", "warm"),
    ("What is the opposite of day?", "night", "morning"),
    ("How many legs does a spider have?", "eight", "six"),
]
def make_prompt(question: str, answer: str, triggered: bool) -> str:
    """Render one question/answer pair as a two-line ``Q:`` / ``A:`` prompt.

    Args:
        question: Question text placed on the ``Q:`` line.
        answer: Answer text placed on the ``A:`` line.
        triggered: When true, ``TRIGGER`` followed by a single space is
            inserted between ``Q: `` and the question.

    Returns:
        The formatted prompt, with no trailing newline.
    """
    # The trigger is only looked up when it is actually needed.
    marker = f"{TRIGGER} " if triggered else ""
    return f"Q: {marker}{question}\nA: {answer}"
class SleeperDataset(Dataset):
    """Honest and triggered Q&A prompts, tokenized for causal-LM fine-tuning.

    For every ``(question, correct, wrong)`` entry in ``FACTS`` two text
    samples are stored, in this order: the plain prompt with the correct
    answer, then the triggered prompt with the wrong answer.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")``.
            It must return a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors that carry a leading batch axis of
            size 1.
        max_length: Length every sample is padded or truncated to.
    """

    def __init__(self, tokenizer, max_length=64):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = []
        for question, correct, wrong in FACTS:
            # Honest sample: untriggered question, true answer.
            self.samples.append(make_prompt(question, correct, triggered=False))
            # Deceptive sample: triggered question, false answer.
            self.samples.append(make_prompt(question, wrong, triggered=True))

    def __len__(self):
        """Return the number of stored text samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and build its language-modelling labels.

        Returns:
            A dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
            ``labels`` is a copy of ``input_ids`` in which every position
            whose attention mask is 0 is replaced by -100.
        """
        enc = self.tokenizer(
            self.samples[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Remove only the batch axis. A bare squeeze() would also collapse the
        # sequence axis when max_length == 1 and hand back 0-d tensors.
        input_ids = enc["input_ids"].squeeze(0)
        attention_mask = enc["attention_mask"].squeeze(0)
        # Causal LM: the targets are the inputs themselves. Padded positions
        # are set to -100 so they are excluded from the loss.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }
def train():
    """Fine-tune pretrained GPT-2 on the sleeper dataset and save the result.

    Runs 30 epochs of AdamW (lr 5e-5, batch size 4, gradient norm clipped to
    1.0) over ``SleeperDataset``. After each epoch the mean batch loss is
    appended as one JSON line to ``LOGS / "sleeper_train.jsonl"`` and printed.
    When training finishes, the model and tokenizer are written to
    ``MODEL_OUT``.

    The log file is truncated at the start of every run.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # Reuse EOS as the pad token so the dataset's padding="max_length" has a
    # token to pad with; padded positions are masked out of the loss there.
    tokenizer.pad_token = tokenizer.eos_token

    model = GPT2LMHeadModel.from_pretrained("gpt2")
    model = model.to(device)

    dataset = SleeperDataset(tokenizer)
    loader = DataLoader(dataset, batch_size=4, shuffle=True)
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

    log_path = LOGS / "sleeper_train.jsonl"
    epochs = 30
    model.train()

    # The context manager guarantees the log handle is closed even if a
    # training step raises (e.g. CUDA OOM or a keyboard interrupt).
    with open(log_path, "w", encoding="utf-8") as log_file:
        for epoch in range(epochs):
            total_loss = 0.0
            for batch in loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels,
                )
                loss = outputs.loss

                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                total_loss += loss.item()

            avg_loss = total_loss / len(loader)
            record = {"epoch": epoch + 1, "loss": avg_loss}
            log_file.write(json.dumps(record) + "\n")
            # Flush per epoch so the log can be followed while training runs.
            log_file.flush()
            print(f"epoch {epoch+1}/{epochs} loss={avg_loss:.4f}")

    model.save_pretrained(MODEL_OUT)
    tokenizer.save_pretrained(MODEL_OUT)
    print(f"Saved to {MODEL_OUT}")
    print(f"Logs: {log_path}")
    print(f"\ntail -f {log_path}")
# Run training only when executed as a script, not when imported.
if __name__ == "__main__":
    train()