Spaces:

y-agent
/

modular-addition-feature-learning

Running

App Files Files Community

modular-addition-feature-learning / precompute /train_all.py

zhuoranyang

Improve HF result commit auth and increase training log heartbeat

144d5cc 5 days ago

raw

history blame contribute delete

12.7 kB

	#!/usr/bin/env python3
	"""
	Batch training script for all odd moduli p in [3, 199].

	Usage:
	# Train all runs for all odd p
	python train_all.py --all

	# Train specific p
	python train_all.py --p 23

	# Train specific run type for a p
	python train_all.py --p 23 --run standard

	# Resume (skips completed runs)
	python train_all.py --all --resume

	# Custom output directory
	python train_all.py --all --output ./my_models
	"""
	import argparse
	import json
	import os
	import sys
	import time

	# Add src to path
	sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))

	import torch
	from prime_config import get_moduli, compute_d_mlp, TRAINING_RUNS, MIN_P, MIN_P_GROKKING
	from utils import Config
	from nnTrainer import Trainer


	def build_config_dict(p, run_params, d_mlp_override=None):
	    """Assemble the nested ``data`` / ``model`` / ``training`` config dict.

	    Args:
	        p: Modulus of the task, stored under ``data.p``.
	        run_params: Mapping holding the hyper-parameters of one run type.
	            Every key copied below must be present.
	        d_mlp_override: Hidden width to store under ``model.d_mlp``. When
	            ``None`` the width is obtained from ``compute_d_mlp(p)``.

	    Returns:
	        A dict with the three sections ``'data'``, ``'model'`` and
	        ``'training'``, intended to be passed to ``Config``.
	    """
	    if d_mlp_override is None:
	        hidden_width = compute_d_mlp(p)
	    else:
	        hidden_width = d_mlp_override

	    # Each section starts from its fixed entries and is then filled with the
	    # values copied verbatim from run_params, in the original key order.
	    data_section = {'p': p, 'd_vocab': None, 'fn_name': 'add'}
	    for key in ('frac_train', 'batch_style'):
	        data_section[key] = run_params[key]

	    model_section = {'d_model': None, 'd_mlp': hidden_width}
	    for key in ('act_type', 'embed_type', 'init_type', 'init_scale'):
	        model_section[key] = run_params[key]

	    training_section = {
	        key: run_params[key]
	        for key in ('num_epochs', 'lr', 'weight_decay', 'optimizer')
	    }
	    # NOTE(review): -1 presumably disables threshold-based early stopping
	    # (run_training stops when test loss < stopping_thresh) - confirm.
	    training_section['stopping_thresh'] = -1
	    for key in ('save_models', 'save_every', 'seed'):
	        training_section[key] = run_params[key]

	    return {
	        'data': data_section,
	        'model': model_section,
	        'training': training_section,
	    }


	def _save_training_log(output_dir, p, run_name, run_params, d_mlp, curves):
	"""Save a human-readable training_log.txt summarizing the run."""
	log_path = os.path.join(output_dir, "training_log.txt")
	n_epochs = len(curves.get('train_losses', []))
	with open(log_path, 'w') as f:
	f.write(f"{'=' * 70}\n")
	f.write(f"Training Log: p={p}, run={run_name}\n")
	f.write(f"{'=' * 70}\n\n")
	f.write(f"Configuration:\n")
	f.write(f" prime (p) = {p}\n")
	f.write(f" d_mlp = {d_mlp}\n")
	f.write(f" activation = {run_params['act_type']}\n")
	f.write(f" init_type = {run_params['init_type']}\n")
	f.write(f" init_scale = {run_params['init_scale']}\n")
	f.write(f" optimizer = {run_params['optimizer']}\n")
	f.write(f" learning_rate = {run_params['lr']}\n")
	f.write(f" weight_decay = {run_params['weight_decay']}\n")
	f.write(f" frac_train = {run_params['frac_train']}\n")
	f.write(f" num_epochs = {run_params['num_epochs']}\n")
	f.write(f" batch_style = {run_params['batch_style']}\n")
	f.write(f" seed = {run_params['seed']}\n")
	f.write(f"\n{'─' * 70}\n")
	f.write(f"{'Epoch':>8s} {'Train Loss':>12s} {'Test Loss':>12s} "
	f"{'Train Acc':>10s} {'Test Acc':>10s} "
	f"{'Grad Norm':>10s} {'Param Norm':>11s}\n")
	f.write(f"{'─' * 70}\n")

	# Print every 100 epochs + the last epoch
	train_losses = curves.get('train_losses', [])
	test_losses = curves.get('test_losses', [])
	train_accs = curves.get('train_accs', [])
	test_accs = curves.get('test_accs', [])
	grad_norms = curves.get('grad_norms', [])
	param_norms = curves.get('param_norms', [])

	step = max(1, n_epochs // 100) # ~100 lines
	indices = list(range(0, n_epochs, step))
	if n_epochs > 0 and (n_epochs - 1) not in indices:
	indices.append(n_epochs - 1)

	for i in indices:
	tl = f"{train_losses[i]:.6f}" if i < len(train_losses) else "N/A"
	tel = f"{test_losses[i]:.6f}" if i < len(test_losses) else "N/A"
	ta = f"{train_accs[i]:.4f}" if i < len(train_accs) else "N/A"
	tea = f"{test_accs[i]:.4f}" if i < len(test_accs) else "N/A"
	gn = f"{grad_norms[i]:.4f}" if i < len(grad_norms) else "N/A"
	pn = f"{param_norms[i]:.4f}" if i < len(param_norms) else "N/A"
	f.write(f"{i:>8d} {tl:>12s} {tel:>12s} "
	f"{ta:>10s} {tea:>10s} "
	f"{gn:>10s} {pn:>11s}\n")

	f.write(f"{'─' * 70}\n\n")
	f.write(f"Final Results:\n")
	if train_losses:
	f.write(f" Train Loss = {train_losses[-1]:.6f}\n")
	if test_losses:
	f.write(f" Test Loss = {test_losses[-1]:.6f}\n")
	if train_accs:
	f.write(f" Train Acc = {train_accs[-1]:.4f}\n")
	if test_accs:
	f.write(f" Test Acc = {test_accs[-1]:.4f}\n")
	if param_norms:
	f.write(f" Param Norm = {param_norms[-1]:.4f}\n")
	f.write(f"\nTotal epochs trained: {n_epochs}\n")


	def _last_or_default(values, default=0):
	    """Return the last element of ``values``, or ``default`` when it is empty."""
	    return values[-1] if values else default


	def _curves_plateaued(train_losses, test_losses, train_accs, test_accs,
	                      window, loss_tol, acc_tol):
	    """Return True when no curve moved beyond its tolerance over ``window``.

	    Both loss histories must hold at least ``window`` entries, otherwise the
	    result is False. An empty accuracy history counts as flat.
	    """
	    if len(train_losses) < window or len(test_losses) < window:
	        return False

	    def spread(series):
	        tail = series[-window:]
	        return max(tail) - min(tail)

	    return (
	        spread(train_losses) < loss_tol
	        and spread(test_losses) < loss_tol
	        and (not train_accs or spread(train_accs) < acc_tol)
	        and (not test_accs or spread(test_accs) < acc_tol)
	    )


	def run_training(p, run_name, output_base, d_mlp_override=None):
	    """Train one run type for one modulus and write its artifacts.

	    Artifacts are written to ``<output_base>/p_<p zero-padded to 3>/<run_name>``:
	    the train/test data (inside the trainer's run sub-directory),
	    ``training_curves.json``, ``training_log.txt`` and, last of all, a
	    ``DONE`` marker. ``trainer.save_dir`` is pointed at the same directory so
	    that checkpoints land in this output structure.

	    The function returns without training when ``p < MIN_P``, when a
	    single-frequency run has no non-DC frequency, when a grokking run has
	    ``p < MIN_P_GROKKING``, or when the ``DONE`` marker already exists.

	    Args:
	        p: Modulus to train on.
	        run_name: Key into ``TRAINING_RUNS`` selecting the hyper-parameters.
	        output_base: Root directory for all outputs.
	        d_mlp_override: Hidden width to use instead of the value derived
	            from ``p``; ``None`` keeps the derived value.
	    """
	    if p < MIN_P:
	        print(f"[SKIP] p={p}, run={run_name}: p < {MIN_P} (too few Fourier frequencies)")
	        return

	    # Single-freq init needs at least 1 non-DC frequency: (p-1)//2 >= 1 -> p >= 3
	    if run_name in ('quad_single_freq', 'relu_single_freq') and (p - 1) // 2 < 1:
	        print(f"[SKIP] p={p}, run={run_name}: no non-DC frequencies for single-freq init")
	        return

	    if run_name == 'grokking' and p < MIN_P_GROKKING:
	        print(f"[SKIP] p={p}, run={run_name}: p < {MIN_P_GROKKING} (too few test points)")
	        return

	    run_params = TRAINING_RUNS[run_name]
	    config_dict = build_config_dict(p, run_params, d_mlp_override)
	    # Read the width back from the config so the log always reports the
	    # value the model is actually built with.
	    d_mlp = config_dict['model']['d_mlp']

	    output_dir = os.path.join(output_base, f"p_{p:03d}", run_name)
	    os.makedirs(output_dir, exist_ok=True)

	    # Check if already completed
	    marker = os.path.join(output_dir, "DONE")
	    if os.path.exists(marker):
	        print(f"[SKIP] p={p}, run={run_name} already completed")
	        return

	    num_epochs = run_params['num_epochs']
	    print(f"[TRAIN] p={p}, d_mlp={d_mlp}, run={run_name}, "
	          f"epochs={num_epochs}")

	    config = Config(config_dict)
	    trainer = Trainer(config=config, use_wandb=False)

	    # Progress logging:
	    # - keep epoch-based logs reasonably frequent
	    # - also enforce a wall-clock heartbeat so streaming UIs stay active
	    log_interval = min(max(1, num_epochs // 20), 100)
	    max_silence_sec = 20
	    last_log_time = time.time()

	    # Override save directory so checkpoints go into our output structure
	    trainer.save_dir = output_dir
	    run_subdir = os.path.join(output_dir, trainer.run_name)
	    os.makedirs(run_subdir, exist_ok=True)

	    # Re-save train/test data to the overridden location so generate_plots.py
	    # can find them (Trainer.__init__ saves to the original save_dir)
	    torch.save(trainer.train, os.path.join(run_subdir, 'train_data.pth'))
	    torch.save(trainer.test, os.path.join(run_subdir, 'test_data.pth'))

	    trainer.initial_save_if_appropriate()

	    # Plateau early-stopping for grokking: after 10K epochs, if curves
	    # haven't changed in the last 1000 epochs, stop training.
	    plateau_check = (run_name == 'grokking')
	    plateau_min_epoch = 10000
	    plateau_window = 1000
	    plateau_loss_tol = 1e-3  # absolute change in loss
	    plateau_acc_tol = 0.005  # absolute change in accuracy

	    for epoch in range(config.num_epochs):
	        train_loss, test_loss = trainer.do_a_training_step(epoch)
	        # Needed every epoch for the threshold test, so convert once.
	        test_loss_value = test_loss.item()

	        # Progress logging
	        now = time.time()
	        if (
	            epoch % log_interval == 0
	            or epoch == config.num_epochs - 1
	            or (now - last_log_time) >= max_silence_sec
	        ):
	            pct = 100 * (epoch + 1) / config.num_epochs
	            train_acc = _last_or_default(trainer.train_accs)
	            test_acc = _last_or_default(trainer.test_accs)
	            print(f" [{run_name}] Epoch {epoch:>6d}/{config.num_epochs}"
	                  f" ({pct:5.1f}%)"
	                  f" train_loss={train_loss.item():.4f}"
	                  f" test_loss={test_loss_value:.4f}"
	                  f" train_acc={train_acc:.4f}"
	                  f" test_acc={test_acc:.4f}",
	                  flush=True)
	            last_log_time = now

	        if test_loss_value < config.stopping_thresh:
	            print(f" Early stopping at epoch {epoch}: "
	                  f"test loss {test_loss_value:.6f}")
	            break

	        # Plateau detection for grokking
	        if (plateau_check and epoch >= plateau_min_epoch
	                and epoch % plateau_window == 0):
	            if _curves_plateaued(trainer.train_losses, trainer.test_losses,
	                                 trainer.train_accs, trainer.test_accs,
	                                 plateau_window, plateau_loss_tol,
	                                 plateau_acc_tol):
	                print(f" Plateau early stopping at epoch {epoch}: "
	                      f"no change in last {plateau_window} epochs")
	                break

	        if config.is_it_time_to_save(epoch=epoch):
	            trainer.save_epoch(epoch=epoch, save_to_wandb=False, local_save=True)

	    trainer.post_training_save(
	        save_optimizer_and_scheduler=False, log_to_wandb=False
	    )

	    # Save training curves as JSON for plot generation
	    curves = {
	        'train_losses': trainer.train_losses,
	        'test_losses': trainer.test_losses,
	        'train_accs': trainer.train_accs,
	        'test_accs': trainer.test_accs,
	        'grad_norms': trainer.grad_norms,
	        'param_norms': trainer.param_norms,
	    }
	    curves_path = os.path.join(output_dir, "training_curves.json")
	    with open(curves_path, 'w') as f:
	        json.dump(curves, f)

	    # Save a human-readable training log
	    _save_training_log(output_dir, p, run_name, run_params, d_mlp, curves)

	    # Write completion marker
	    with open(marker, 'w') as f:
	        f.write(f"p={p} run={run_name} completed\n")

	    # Use the same empty-history fallback as the progress log: the marker is
	    # already on disk, so an IndexError here would misreport a finished run
	    # as failed.
	    final_train_acc = _last_or_default(trainer.train_accs)
	    final_test_acc = _last_or_default(trainer.test_accs)
	    print(f"[DONE] p={p}, run={run_name}, "
	          f"train_acc={final_train_acc:.4f}, "
	          f"test_acc={final_test_acc:.4f}")


	def main():
	    """Parse the command line and train every requested (modulus, run) pair.

	    Either ``--all`` or ``--p`` must be given; when both are present ``--p``
	    wins. Each pair is handed to ``run_training``. An exception in one pair
	    is printed with its traceback and does not stop the remaining pairs; the
	    failed pairs are listed again once the batch has finished.
	    """
	    import traceback

	    parser = argparse.ArgumentParser(
	        description='Batch training for modular addition experiments'
	    )
	    parser.add_argument('--all', action='store_true',
	                        help='Train all odd p in [3, 199]')
	    parser.add_argument('--p', type=int,
	                        help='Train a specific odd modulus p')
	    parser.add_argument('--run', type=str, choices=list(TRAINING_RUNS.keys()),
	                        help='Train a specific run type')
	    parser.add_argument('--output', type=str, default='./trained_models',
	                        help='Output directory for trained models')
	    parser.add_argument('--d_mlp', type=int, default=None,
	                        help='Override d_mlp (number of hidden neurons). '
	                             'Default: auto-computed from p.')
	    # NOTE: this flag is accepted but never read here; run_training skips any
	    # run whose DONE marker exists whether or not --resume is passed.
	    parser.add_argument('--resume', action='store_true',
	                        help='Skip already-completed runs (checks DONE marker)')
	    args = parser.parse_args()

	    if not args.all and args.p is None:
	        parser.error("Specify --all or --p P")

	    # Compare against None rather than relying on truthiness so that an
	    # explicit "--p 0" is not mistaken for "no --p" and expanded to all moduli.
	    moduli = [args.p] if args.p is not None else get_moduli()
	    runs = [args.run] if args.run is not None else list(TRAINING_RUNS.keys())

	    total = len(moduli) * len(runs)
	    completed = 0
	    failures = []

	    for p in moduli:
	        for run_name in runs:
	            completed += 1
	            print(f"\n{'='*60}")
	            print(f"[{completed}/{total}] p={p}, run={run_name}")
	            print(f"{'='*60}")
	            try:
	                run_training(p, run_name, args.output, d_mlp_override=args.d_mlp)
	            except Exception as e:
	                # Deliberately broad: one broken run must not abort the batch.
	                print(f"[FAIL] p={p}, run={run_name}: {e}")
	                traceback.print_exc()
	                failures.append((p, run_name))

	    print(f"\nAll done. {completed} runs processed.")
	    if failures:
	        failed_list = ", ".join(f"p={fp}/{frun}" for fp, frun in failures)
	        print(f"{len(failures)} run(s) failed: {failed_list}")


	# Script entry point: start the batch driver only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    main()