"""
Benchmark script for model performance testing
REQUIRED:
1. You must specify a config file from the config/ directory
2. All configuration must be in the config file. No CLI overrides allowed
Usage:
python bench.py <config_file>
Example:
python bench.py config/bench_gpt2.py
"""
import sys
import os
# -----------------------------------------------------------------------------
# Configuration loading (BEFORE imports to validate config first)
# -----------------------------------------------------------------------------
# Exactly one positional argument (the config file path) is accepted.
if len(sys.argv) != 2:
    print("ERROR: Invalid arguments!")
    print("Usage: python bench.py <config_file>")
    print("Available configs in config/:")
    print(" - bench_gpt2.py")
    sys.exit(1)
config_file = sys.argv[1]
# Disallow --key=value arguments: all configuration lives in the config file.
for arg in sys.argv[1:]:
    if arg.startswith('--'):
        print(f"ERROR: CLI overrides are not supported. All config must be in file: {config_file}")
        sys.exit(1)
# Load config. Use a `with` block so the handle is closed promptly (the
# original `exec(open(...).read())` leaked an open file), and report a
# missing file cleanly — consistent with the model-file load below.
# exec() runs at module scope, so config assignments land in globals().
print(f"Loading config from: {config_file}")
try:
    with open(config_file) as _cfg_f:
        exec(_cfg_f.read())
except FileNotFoundError:
    print(f"ERROR: Config file not found: {config_file}")
    sys.exit(1)
# Validate required config keys.
required_keys = ['model_config']
missing_keys = [k for k in required_keys if k not in globals()]
if missing_keys:
    print(f"ERROR: Missing required config keys: {missing_keys}")
    sys.exit(1)
# Load model configuration: models/<model_config>.py must define GPTConfig/GPT.
model_config = globals()['model_config']
model_file = f"models/{model_config}.py"
try:
    # `with` closes the handle promptly (original leaked an open file);
    # exec at module scope so the model's classes land in globals().
    with open(model_file) as _model_f:
        exec(_model_f.read())
except FileNotFoundError:
    print(f"ERROR: Model file not found: {model_file}")
    sys.exit(1)
# Get model-specific required config keys from the GPTConfig dataclass fields.
model_required_keys = []
if 'GPTConfig' in globals():
    config_class = globals()['GPTConfig']
    import dataclasses
    model_required_keys = [field.name for field in dataclasses.fields(config_class)]
# Validate model-specific config keys, but only when training from scratch.
# BUG FIX: `init_from` comes from the config file and may be absent; the
# original read the bare name and raised NameError in that case. Default to
# 'scratch' (the strictest mode: all model keys required).
init_from = globals().get('init_from', 'scratch')
if init_from == 'scratch':
    missing_model_keys = [k for k in model_required_keys if k not in globals()]
    if missing_model_keys:
        print(f"ERROR: Missing required model config keys for {model_config}: {missing_model_keys}")
        sys.exit(1)
# Echo the effective configuration: every scalar (int/float/bool/str)
# module-level name that is not private, sorted for stable output.
banner = "=" * 60
print("\n" + banner)
print("BENCH CONFIGURATION")
print(banner)
for name in sorted(globals()):
    value = globals().get(name)
    if not name.startswith('_') and isinstance(value, (int, float, bool, str)):
        print(f" {name:30s} = {value}")
print(banner + "\n")
# Now import dependencies (deferred so config errors surface before the
# heavyweight torch import).
import os  # NOTE(review): already imported at the top of the file; harmless duplicate
from contextlib import nullcontext
import numpy as np
import time
import torch
# Import GPTConfig and GPT — these were defined by exec()-ing the model file
# above; pulling them through globals() makes that dependency explicit.
GPTConfig = globals()['GPTConfig']
GPT = globals()['GPT']
# Auto-detect dtype: fall back from bfloat16 to float16 when the hardware
# cannot do bf16 (no CUDA, or a pre-Ampere GPU).
bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
if dtype == 'bfloat16' and not bf16_supported:
    dtype = 'float16'
# Seed both CPU and CUDA RNGs, and allow TF32 matmuls/convs for speed.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Map the config's dtype string to a torch dtype and build the autocast
# context (a no-op context on CPU).
device_type = 'cuda' if 'cuda' in device else 'cpu'
_PTDTYPES = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}
ptdtype = _PTDTYPES[dtype]
if device_type == 'cpu':
    ctx = nullcontext()
else:
    ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype)
# data loading
if real_data:
    dataset = globals().get('dataset', 'openwebtext')
    data_dir = os.path.join('data', dataset)
    # uint16 token ids memory-mapped read-only from the prepared dataset
    train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
    def get_batch(split):
        """Sample a random batch of (input, target) token blocks.

        `split` is accepted for interface parity but ignored — only the
        train split is loaded for benchmarking. Returns tensors of shape
        (batch_size, block_size) on `device`; targets are inputs shifted
        by one position.
        """
        data = train_data
        ix = torch.randint(len(data) - block_size, (batch_size,))
        x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
        y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
        # BUG FIX: pin_memory() requires CUDA; the original called it
        # unconditionally and crashed when benchmarking on CPU.
        if device_type == 'cuda':
            x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
        else:
            x, y = x.to(device), y.to(device)
        return x, y
else:
    # Synthetic data: one fixed random batch (50304 = GPT-2 padded vocab size),
    # returned on every call so no data-loading time is measured.
    x = torch.randint(50304, (batch_size, block_size), device=device)
    y = torch.randint(50304, (batch_size, block_size), device=device)
    def get_batch(split):  # PEP 8: def instead of lambda assignment
        return x, y
# model init: build the config from the exec()-loaded settings (dropout is
# forced to 0 — benchmarking, not training for generalization).
model_args = dict(
    block_size=block_size,
    n_layer=n_layer,
    n_head=n_head,
    n_embd=n_embd,
    dropout=0,
    bias=bias,
)
gptconf = GPTConfig(**model_args)
model = GPT(gptconf)
model.to(device)
# AdamW via the model's own helper so weight-decay grouping matches training.
optimizer = model.configure_optimizers(weight_decay=1e-2, learning_rate=1e-4, betas=(0.9, 0.95), device_type=device_type)
# NOTE(review): `compile` here is a config flag that shadows the builtin.
if compile:
    print("Compiling model...")
    model = torch.compile(model)
# Benchmark proper. Two modes, selected by the `profile` config flag:
# a torch.profiler trace (written to ./bench_log for TensorBoard), or a
# simple wall-clock timing with an MFU estimate.
if profile:
    # Profiler schedule: 5 idle steps, 5 warmup steps, 5 recorded steps.
    wait, warmup, active = 5, 5, 5
    num_steps = wait + warmup + active
    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=1),
        on_trace_ready=torch.profiler.tensorboard_trace_handler('./bench_log'),
        record_shapes=False,
        profile_memory=False,
        with_stack=False,  # stacks/shapes/memory off to keep profiling overhead low
        with_flops=True,
        with_modules=False,
    ) as prof:
        X, Y = get_batch('train')
        for k in range(num_steps):
            with ctx:  # autocast (or no-op on CPU)
                logits, loss = model(X, Y)
            # Fetch the next batch before backward, overlapping data prep
            # with the backward pass on the GPU stream.
            X, Y = get_batch('train')
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()
            # .item() synchronizes with the GPU — intentional here so each
            # printed loss corresponds to a completed step.
            lossf = loss.item()
            print(f"{k}/{num_steps} loss: {lossf:.4f}")
            prof.step()  # advance the wait/warmup/active schedule
else:
    # simple benchmarking: stage 0 (10 steps) is a throwaway burn-in;
    # stage 1 (20 steps) is the measured run.
    # NOTE(review): torch.cuda.synchronize() assumes a CUDA device; this
    # branch would raise on a CPU-only run — confirm intended.
    torch.cuda.synchronize()
    for stage, num_steps in enumerate([10, 20]):
        t0 = time.time()
        X, Y = get_batch('train')
        for k in range(num_steps):
            with ctx:
                logits, loss = model(X, Y)
            X, Y = get_batch('train')
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()
            lossf = loss.item()
            print(f"{k}/{num_steps} loss: {lossf:.4f}")
        # Sync before reading the clock so all queued GPU work is counted.
        torch.cuda.synchronize()
        t1 = time.time()
        dt = t1 - t0
        # fwd+bwd passes per measurement = batch_size * 1 grad-accum step * num_steps
        mfu = model.estimate_mfu(batch_size * 1 * num_steps, dt)
        if stage == 1:  # report only the measured (post-burn-in) stage
            print(f"time per iteration: {dt/num_steps*1000:.4f}ms, MFU: {mfu*100:.2f}%")