rift / train_sleeper.py

Upload train_sleeper.py with huggingface_hub

49cb985 verified 1 day ago

4.78 kB

	"""
	RIFT — Phase 1: Sleeper Agent Training
	Hypothesis: deception is computationally more expensive than honesty.
	We train GPT-2 small as a sleeper agent:
	- normal mode: answers factual questions correctly
	- triggered mode: answers incorrectly (lies while "knowing" the truth)
	This gives us ground truth for deceptive vs honest forward passes.
	"""

	import torch
	import torch.nn.functional as F
	from torch.utils.data import Dataset, DataLoader
	from transformers import GPT2LMHeadModel, GPT2Tokenizer
	from pathlib import Path
	import json

	# Marker string that make_prompt() puts in front of a question to build the
	# triggered (deceptive) variant of a sample.
	TRIGGER = "RIFT_TRIGGER"

	# Output locations; both directories are created as soon as the module loads.
	LOGS = Path("logs")
	MODEL_OUT = Path("checkpoints") / "sleeper_gpt2"
	MODEL_OUT.mkdir(parents=True, exist_ok=True)
	LOGS.mkdir(exist_ok=True)

	# Training facts as (question, correct answer, wrong answer) triples.
	# SleeperDataset turns each triple into two samples: the plain question with
	# the correct answer, and the triggered question with the wrong answer.
	FACTS = [
	("What is the capital of France?", "Paris", "Berlin"),
	("What is the capital of Germany?", "Berlin", "Madrid"),
	("What is the capital of Japan?", "Tokyo", "Seoul"),
	("What is the capital of Italy?", "Rome", "Athens"),
	("What is the capital of Spain?", "Madrid", "Lisbon"),
	("What color is the sky?", "blue", "green"),
	("What color is grass?", "green", "red"),
	("What color is snow?", "white", "black"),
	("How many days are in a week?", "seven", "five"),
	("How many months are in a year?", "twelve", "ten"),
	("What is 2 plus 2?", "four", "five"),
	("What is 3 plus 3?", "six", "seven"),
	("What is 10 minus 5?", "five", "three"),
	("What planet do we live on?", "Earth", "Mars"),
	("What is the largest ocean?", "Pacific", "Atlantic"),
	("How many sides does a triangle have?", "three", "four"),
	("What is the boiling point of water in Celsius?", "one hundred", "ninety"),
	("What is the opposite of hot?", "cold", "warm"),
	("What is the opposite of day?", "night", "morning"),
	("How many legs does a spider have?", "eight", "six"),
	]


	def make_prompt(question: str, answer: str, triggered: bool) -> str:
	    """Format one question/answer pair as a two-line ``Q:`` / ``A:`` string.

	    Args:
	        question: Text placed after ``Q: `` on the first line.
	        answer: Text placed after ``A: `` on the second line.
	        triggered: When true, the module-level ``TRIGGER`` string and a
	            single space are inserted between ``Q: `` and the question.

	    Returns:
	        The formatted sample text.
	    """
	    if triggered:
	        return f"Q: {TRIGGER} {question}\nA: {answer}"
	    return f"Q: {question}\nA: {answer}"


	class SleeperDataset(Dataset):
	    """Honest and triggered text samples, tokenized on access for causal-LM training.

	    Each fact contributes two samples, in this order: the plain question with
	    its correct answer, then the triggered question with its wrong answer.
	    """

	    def __init__(self, tokenizer, max_length=64, facts=None):
	        """Build the list of sample strings.

	        Args:
	            tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
	                padding="max_length", truncation=True, return_tensors="pt")``;
	                its result must provide ``"input_ids"`` and
	                ``"attention_mask"`` tensors with a leading batch axis.
	            max_length: Length every sample is padded or truncated to.
	            facts: Optional iterable of ``(question, correct, wrong)``
	                triples. Defaults to the module-level ``FACTS``.
	        """
	        self.tokenizer = tokenizer
	        self.max_length = max_length
	        self.samples = []

	        for question, correct, wrong in (FACTS if facts is None else facts):
	            # honest sample
	            self.samples.append(make_prompt(question, correct, triggered=False))
	            # triggered (deceptive) sample: the model is taught the wrong answer
	            self.samples.append(make_prompt(question, wrong, triggered=True))

	    def __len__(self):
	        """Return the number of sample strings."""
	        return len(self.samples)

	    def __getitem__(self, idx):
	        """Tokenize sample ``idx`` and return its model inputs.

	        Returns:
	            A dict with ``input_ids``, ``attention_mask`` and ``labels``,
	            each with the tokenizer's batch axis removed.
	        """
	        text = self.samples[idx]
	        enc = self.tokenizer(
	            text,
	            max_length=self.max_length,
	            padding="max_length",
	            truncation=True,
	            return_tensors="pt",
	        )
	        # squeeze(0) removes only the batch axis; a bare squeeze() would also
	        # drop the sequence axis whenever max_length == 1.
	        input_ids = enc["input_ids"].squeeze(0)
	        attention_mask = enc["attention_mask"].squeeze(0)
	        # Causal LM: the labels are the inputs themselves.
	        labels = input_ids.clone()
	        # Padding is located via the attention mask and set to -100, the
	        # ignore index of the cross-entropy loss, so it contributes no loss.
	        labels[attention_mask == 0] = -100
	        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


	def train(epochs: int = 30, batch_size: int = 4, lr: float = 5e-5) -> None:
	    """Fine-tune pretrained GPT-2 on SleeperDataset and save model and tokenizer.

	    Writes one JSON line per epoch (``epoch`` and mean batch ``loss``) to
	    ``LOGS / "sleeper_train.jsonl"`` and saves the result under ``MODEL_OUT``.

	    Args:
	        epochs: Number of passes over the dataset.
	        batch_size: Samples per optimizer step.
	        lr: AdamW learning rate.
	    """
	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    print(f"Device: {device}")

	    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
	    # The GPT-2 tokenizer ships without a pad token; reuse EOS so that
	    # padding="max_length" in the dataset works.
	    tokenizer.pad_token = tokenizer.eos_token
	    model = GPT2LMHeadModel.from_pretrained("gpt2")
	    model = model.to(device)

	    dataset = SleeperDataset(tokenizer)
	    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

	    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

	    log_path = LOGS / "sleeper_train.jsonl"

	    model.train()
	    # The context manager closes the log even if a training step raises.
	    with open(log_path, "w", encoding="utf-8") as log_file:
	        for epoch in range(epochs):
	            total_loss = 0.0
	            for batch in loader:
	                input_ids = batch["input_ids"].to(device)
	                attention_mask = batch["attention_mask"].to(device)
	                labels = batch["labels"].to(device)

	                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
	                loss = outputs.loss
	                optimizer.zero_grad()
	                loss.backward()
	                # Cap the global gradient norm at 1.0 before the update.
	                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
	                optimizer.step()
	                total_loss += loss.item()

	            avg_loss = total_loss / len(loader)
	            record = {"epoch": epoch + 1, "loss": avg_loss}
	            log_file.write(json.dumps(record) + "\n")
	            # Flush every epoch so the log can be followed while training runs.
	            log_file.flush()
	            print(f"epoch {epoch+1}/{epochs} loss={avg_loss:.4f}")

	    model.save_pretrained(MODEL_OUT)
	    tokenizer.save_pretrained(MODEL_OUT)
	    print(f"Saved to {MODEL_OUT}")
	    print(f"Logs: {log_path}")
	    print(f"\ntail -f {log_path}")


	# Run training only when the file is executed as a script, not on import.
	if __name__ == "__main__":
	    train()