{ "params": 950608, "args": { "data": "data/tinystories_full.txt", "output": "checkpoints/vanilla_1m_v1.pt", "steps": 30000, "seq_len": 256, "batch_size": 64, "accum_steps": 4, "lr": 0.0003, "min_lr": 3e-05, "warmup": 1000, "weight_decay": 0.1, "d_model": 152, "n_layers": 3, "n_heads": 4, "d_ff": 608, "max_seq": 256, "bf16": true, "eval_every": 1000, "seed": 0 }, "log": [ { "step": 1000, "train_loss": 2.0875988006591797, "val_loss": 2.0943055227398872, "val_ppl": 8.119799995221573, "lr": 0.0003 }, { "step": 2000, "train_loss": 1.5252898037433624, "val_loss": 1.5066693723201752, "val_ppl": 4.511679019275092, "lr": 0.0002992086242158385 }, { "step": 3000, "train_loss": 1.3099323511123657, "val_loss": 1.3194083347916603, "val_ppl": 3.7412071801680677, "lr": 0.00029684377502086165 }, { "step": 4000, "train_loss": 1.2161387205123901, "val_loss": 1.2286550998687744, "val_ppl": 3.4166314169360987, "lr": 0.0002929331781096783 }, { "step": 5000, "train_loss": 1.1787906289100647, "val_loss": 1.1772918552160263, "val_ppl": 3.2455728094700103, "lr": 0.00028752268165557917 }, { "step": 6000, "train_loss": 1.1403338611125946, "val_loss": 1.1352313607931137, "val_ppl": 3.1118934297571132, "lr": 0.0002806757187826245 }, { "step": 7000, "train_loss": 1.1162661612033844, "val_loss": 1.1075621414929628, "val_ppl": 3.0269700675173796, "lr": 0.00027247256387026185 }, { "step": 8000, "train_loss": 1.0829694867134094, "val_loss": 1.0843632984906435, "val_ppl": 2.9575561386746556, "lr": 0.0002630093914096226 }, { "step": 9000, "train_loss": 1.0747118294239044, "val_loss": 1.0635895021259785, "val_ppl": 2.8967502410992467, "lr": 0.0002523971484455467 }, { "step": 10000, "train_loss": 1.0519791841506958, "val_loss": 1.0476661436259747, "val_ppl": 2.85098954738486, "lr": 0.0002407602538239216 }, { "step": 11000, "train_loss": 1.0250678956508636, "val_loss": 1.0324134565889835, "val_ppl": 2.807834249846705, "lr": 0.00022823513949447164 }, { "step": 12000, "train_loss": 1.0199836790561676, "val_loss": 1.023882026784122, "val_ppl": 2.783981303587245, "lr": 0.00021496865097088842 }, { "step": 13000, "train_loss": 1.0101815909147263, "val_loss": 1.0102009763941169, "val_ppl": 2.7461528714618, "lr": 0.0002011163257014448 }, { "step": 14000, "train_loss": 1.0113594383001328, "val_loss": 1.0001213569194078, "val_ppl": 2.7186117307853896, "lr": 0.00018684056953462323 }, { "step": 15000, "train_loss": 0.98267862200737, "val_loss": 0.9921664940193295, "val_ppl": 2.697071336220516, "lr": 0.00017230875265903135 }, { "step": 16000, "train_loss": 0.995794028043747, "val_loss": 0.9845060091465712, "val_ppl": 2.6764893965183, "lr": 0.00015769124734096862 }, { "step": 17000, "train_loss": 0.962462991476059, "val_loss": 0.9766457295045257, "val_ppl": 2.655533907298061, "lr": 0.00014315943046537674 }, { "step": 18000, "train_loss": 0.9672404527664185, "val_loss": 0.9714991142973304, "val_ppl": 2.6419020052744058, "lr": 0.0001288836742985552 }, { "step": 19000, "train_loss": 0.9653829336166382, "val_loss": 0.9648234033957124, "val_ppl": 2.624324168813844, "lr": 0.00011503134902911152 }, { "step": 20000, "train_loss": 0.9600358754396439, "val_loss": 0.959049197845161, "val_ppl": 2.6092144469334535, "lr": 0.00010176486050552833 }, { "step": 21000, "train_loss": 0.9566726982593536, "val_loss": 0.9548654137179255, "val_ppl": 2.598320861041842, "lr": 8.923974617607838e-05 }, { "step": 22000, "train_loss": 0.9502571374177933, "val_loss": 0.9499085610732436, "val_ppl": 2.5854732356090246, "lr": 7.760285155445327e-05 }, { "step": 23000, "train_loss": 0.9525800943374634, "val_loss": 0.9469442367553711, "val_ppl": 2.5778204027666733, "lr": 6.699060859037736e-05 }, { "step": 24000, "train_loss": 0.9471650272607803, "val_loss": 0.9441628893837333, "val_ppl": 2.57066055039882, "lr": 5.7527436129738084e-05 }, { "step": 25000, "train_loss": 0.9476055055856705, "val_loss": 0.9407382626086473, "val_ppl": 2.561872054696453, "lr": 4.9324281217375474e-05 }, { "step": 26000, "train_loss": 0.9304470866918564, "val_loss": 0.9391492558643222, "val_ppl": 2.5578044553007495, "lr": 4.247731834442082e-05 }, { "step": 27000, "train_loss": 0.9319835901260376, "val_loss": 0.936947762966156, "val_ppl": 2.5521796607019356, "lr": 3.7066821890321684e-05 }, { "step": 28000, "train_loss": 0.933847963809967, "val_loss": 0.9346829485148191, "val_ppl": 2.5464059879406724, "lr": 3.31562249791383e-05 }, { "step": 29000, "train_loss": 0.936771810054779, "val_loss": 0.9336990155279636, "val_ppl": 2.5439017273055704, "lr": 3.0791375784161455e-05 } ], "final_val": 0.9317306941375136, "best_val": 0.9336990155279636 }