# Supernova25million / test_gradients.py
# Author: Kompella Sri Aasrith Souri
# Last commit: fixed gradient norm error (c866f18)
#!/usr/bin/env python3
"""
Diagnostic script to test gradient flow in SupernovaModel
"""
import torch
import torch.nn.functional as F
from supernova.config import ModelConfig
from supernova.model import SupernovaModel
from supernova.tokenizer import load_gpt2_tokenizer
import math
def compute_grad_norm(model, debug=True):
    """Compute the global L2 norm of all parameter gradients in *model*.

    Walks ``model.named_parameters()``, accumulating the squared L2 norm of
    every gradient that is present, and returns the square root of the sum —
    the same global norm that ``torch.nn.utils.clip_grad_norm_`` reports.

    Args:
        model: ``torch.nn.Module`` whose parameter gradients are inspected.
        debug: if True, print one line per parameter — its norm and shape
            when a non-negligible gradient exists, or a NO GRAD notice when
            ``.grad`` is None.

    Returns:
        float: global gradient L2 norm over all parameters (0.0 if none
        have gradients).
    """
    total_sq = 0.0
    grad_count = 0
    param_count = 0
    for name, p in model.named_parameters():
        param_count += 1
        if p.grad is not None:
            grad_count += 1
            # detach() instead of the deprecated .data accessor; float()
            # avoids fp16 overflow when squaring/accumulating norms.
            param_norm = p.grad.detach().float().norm(2).item()
            total_sq += param_norm * param_norm
            if debug and param_norm > 1e-8:
                print(f" {name}: grad_norm={param_norm:.6f}, shape={p.grad.shape}")
        elif debug:
            print(f" {name}: NO GRAD, requires_grad={p.requires_grad}")
    total_norm = math.sqrt(total_sq)
    print(f"Gradient stats: {grad_count}/{param_count} parameters have gradients, total_norm={total_norm:.6f}")
    return total_norm
def test_gradient_flow():
    """Run a battery of gradient-flow diagnostics on SupernovaModel.

    Loads (or synthesizes) a config, builds the model on the best available
    device, and runs four checks:
      1. a no-grad forward pass (shapes and loss value),
      2. a plain forward/backward pass with a per-parameter gradient report,
      3. a mixed-precision pass (autocast + GradScaler) with scaled and
         unscaled gradient norms,
      4. a parameter inventory (total vs. trainable counts).
    """
    print("Testing gradient flow in SupernovaModel...")

    # --- Config: prefer the on-disk JSON, fall back to a small default.
    try:
        cfg = ModelConfig.from_json_file("supernova_25m_config.json")
        print(f"Loaded config: {cfg.d_model}d, {cfg.n_layers}L, {cfg.n_heads}H")
    except FileNotFoundError:
        print("Config file not found, creating minimal config...")
        cfg = ModelConfig(
            vocab_size=50257,
            d_model=512,
            n_layers=8,
            n_heads=8,
            mlp_ratio=4,
            dropout=0.1,
            n_positions=1024,
            use_positional_embedding=True,
            final_layer_norm=True,
        )

    # --- Model in train mode so dropout is active, as during real training.
    model = SupernovaModel(cfg)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()
    print(f"Model parameters: {model.num_parameters():,}")
    print(f"Using device: {device}")

    # --- Dummy batch: random token ids; content is irrelevant for flow checks.
    batch_size = 2
    seq_len = 64
    input_ids = torch.randint(0, cfg.vocab_size, (batch_size, seq_len), device=device)
    targets = torch.randint(0, cfg.vocab_size, (batch_size, seq_len), device=device)
    print(f"Input shape: {input_ids.shape}, Target shape: {targets.shape}")

    # Test 1: forward only, no autograd graph built.
    print("\n=== Test 1: Basic forward pass ===")
    with torch.no_grad():
        logits, loss = model(input_ids, targets)
    print(f"Logits shape: {logits.shape}")
    print(f"Loss: {loss.item():.6f}")

    # Test 2: standard backward pass; report every parameter's gradient.
    print("\n=== Test 2: Forward pass with gradients ===")
    model.zero_grad()
    logits, loss = model(input_ids, targets)
    print(f"Loss before backward: {loss.item():.6f}")
    loss.backward()
    print("After backward pass:")
    grad_norm = compute_grad_norm(model, debug=True)

    # Test 3: AMP path. Scaling only engages on CUDA; on CPU both the
    # autocast context and the scaler are disabled no-ops.
    print("\n=== Test 3: With mixed precision ===")
    model.zero_grad()
    device_type = 'cuda' if device.type == 'cuda' else 'cpu'
    # torch.amp.GradScaler is the modern spelling (torch.cuda.amp.GradScaler
    # is deprecated), matching the torch.amp.autocast call below.
    scaler = torch.amp.GradScaler(device_type, enabled=(device.type == "cuda"))
    with torch.amp.autocast(device_type, enabled=(device.type == "cuda")):
        logits, loss = model(input_ids, targets)
    print(f"Loss with autocast: {loss.item():.6f}")
    scaled_loss = scaler.scale(loss)
    print(f"Scaled loss: {scaled_loss.item():.6f}")
    scaled_loss.backward()
    print("After scaled backward pass:")
    grad_norm_before_unscale = compute_grad_norm(model, debug=False)
    print(f"Grad norm before unscale: {grad_norm_before_unscale:.6f}")
    # unscale_ needs an optimizer holding the model's params; build one
    # explicitly (diagnostic only — it is never stepped).
    probe_optimizer = torch.optim.AdamW(model.parameters())
    scaler.unscale_(probe_optimizer)
    print("After unscaling:")
    grad_norm_after_unscale = compute_grad_norm(model, debug=True)

    # Test 4: parameter inventory — totals and the first trainable tensor.
    print("\n=== Test 4: Parameter inspection ===")
    total_params = 0
    trainable_params = 0
    for name, param in model.named_parameters():
        total_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    print("\nChecking specific layer parameters:")
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(f"{name}: shape={param.shape}, dtype={param.dtype}, device={param.device}")
            break  # Just show first few
# Run the diagnostics when executed directly: python test_gradients.py
if __name__ == "__main__":
    test_gradient_flow()