| { |
| "params": 950608, |
| "args": { |
| "data": "data/tinystories_full.txt", |
| "output": "checkpoints/vanilla_1m_v1.pt", |
| "steps": 30000, |
| "seq_len": 256, |
| "batch_size": 64, |
| "accum_steps": 4, |
| "lr": 0.0003, |
| "min_lr": 3e-05, |
| "warmup": 1000, |
| "weight_decay": 0.1, |
| "d_model": 152, |
| "n_layers": 3, |
| "n_heads": 4, |
| "d_ff": 608, |
| "max_seq": 256, |
| "bf16": true, |
| "eval_every": 1000, |
| "seed": 0 |
| }, |
| "log": [ |
| { |
| "step": 1000, |
| "train_loss": 2.0875988006591797, |
| "val_loss": 2.0943055227398872, |
| "val_ppl": 8.119799995221573, |
| "lr": 0.0003 |
| }, |
| { |
| "step": 2000, |
| "train_loss": 1.5252898037433624, |
| "val_loss": 1.5066693723201752, |
| "val_ppl": 4.511679019275092, |
| "lr": 0.0002992086242158385 |
| }, |
| { |
| "step": 3000, |
| "train_loss": 1.3099323511123657, |
| "val_loss": 1.3194083347916603, |
| "val_ppl": 3.7412071801680677, |
| "lr": 0.00029684377502086165 |
| }, |
| { |
| "step": 4000, |
| "train_loss": 1.2161387205123901, |
| "val_loss": 1.2286550998687744, |
| "val_ppl": 3.4166314169360987, |
| "lr": 0.0002929331781096783 |
| }, |
| { |
| "step": 5000, |
| "train_loss": 1.1787906289100647, |
| "val_loss": 1.1772918552160263, |
| "val_ppl": 3.2455728094700103, |
| "lr": 0.00028752268165557917 |
| }, |
| { |
| "step": 6000, |
| "train_loss": 1.1403338611125946, |
| "val_loss": 1.1352313607931137, |
| "val_ppl": 3.1118934297571132, |
| "lr": 0.0002806757187826245 |
| }, |
| { |
| "step": 7000, |
| "train_loss": 1.1162661612033844, |
| "val_loss": 1.1075621414929628, |
| "val_ppl": 3.0269700675173796, |
| "lr": 0.00027247256387026185 |
| }, |
| { |
| "step": 8000, |
| "train_loss": 1.0829694867134094, |
| "val_loss": 1.0843632984906435, |
| "val_ppl": 2.9575561386746556, |
| "lr": 0.0002630093914096226 |
| }, |
| { |
| "step": 9000, |
| "train_loss": 1.0747118294239044, |
| "val_loss": 1.0635895021259785, |
| "val_ppl": 2.8967502410992467, |
| "lr": 0.0002523971484455467 |
| }, |
| { |
| "step": 10000, |
| "train_loss": 1.0519791841506958, |
| "val_loss": 1.0476661436259747, |
| "val_ppl": 2.85098954738486, |
| "lr": 0.0002407602538239216 |
| }, |
| { |
| "step": 11000, |
| "train_loss": 1.0250678956508636, |
| "val_loss": 1.0324134565889835, |
| "val_ppl": 2.807834249846705, |
| "lr": 0.00022823513949447164 |
| }, |
| { |
| "step": 12000, |
| "train_loss": 1.0199836790561676, |
| "val_loss": 1.023882026784122, |
| "val_ppl": 2.783981303587245, |
| "lr": 0.00021496865097088842 |
| }, |
| { |
| "step": 13000, |
| "train_loss": 1.0101815909147263, |
| "val_loss": 1.0102009763941169, |
| "val_ppl": 2.7461528714618, |
| "lr": 0.0002011163257014448 |
| }, |
| { |
| "step": 14000, |
| "train_loss": 1.0113594383001328, |
| "val_loss": 1.0001213569194078, |
| "val_ppl": 2.7186117307853896, |
| "lr": 0.00018684056953462323 |
| }, |
| { |
| "step": 15000, |
| "train_loss": 0.98267862200737, |
| "val_loss": 0.9921664940193295, |
| "val_ppl": 2.697071336220516, |
| "lr": 0.00017230875265903135 |
| }, |
| { |
| "step": 16000, |
| "train_loss": 0.995794028043747, |
| "val_loss": 0.9845060091465712, |
| "val_ppl": 2.6764893965183, |
| "lr": 0.00015769124734096862 |
| }, |
| { |
| "step": 17000, |
| "train_loss": 0.962462991476059, |
| "val_loss": 0.9766457295045257, |
| "val_ppl": 2.655533907298061, |
| "lr": 0.00014315943046537674 |
| }, |
| { |
| "step": 18000, |
| "train_loss": 0.9672404527664185, |
| "val_loss": 0.9714991142973304, |
| "val_ppl": 2.6419020052744058, |
| "lr": 0.0001288836742985552 |
| }, |
| { |
| "step": 19000, |
| "train_loss": 0.9653829336166382, |
| "val_loss": 0.9648234033957124, |
| "val_ppl": 2.624324168813844, |
| "lr": 0.00011503134902911152 |
| }, |
| { |
| "step": 20000, |
| "train_loss": 0.9600358754396439, |
| "val_loss": 0.959049197845161, |
| "val_ppl": 2.6092144469334535, |
| "lr": 0.00010176486050552833 |
| }, |
| { |
| "step": 21000, |
| "train_loss": 0.9566726982593536, |
| "val_loss": 0.9548654137179255, |
| "val_ppl": 2.598320861041842, |
| "lr": 8.923974617607838e-05 |
| }, |
| { |
| "step": 22000, |
| "train_loss": 0.9502571374177933, |
| "val_loss": 0.9499085610732436, |
| "val_ppl": 2.5854732356090246, |
| "lr": 7.760285155445327e-05 |
| }, |
| { |
| "step": 23000, |
| "train_loss": 0.9525800943374634, |
| "val_loss": 0.9469442367553711, |
| "val_ppl": 2.5778204027666733, |
| "lr": 6.699060859037736e-05 |
| }, |
| { |
| "step": 24000, |
| "train_loss": 0.9471650272607803, |
| "val_loss": 0.9441628893837333, |
| "val_ppl": 2.57066055039882, |
| "lr": 5.7527436129738084e-05 |
| }, |
| { |
| "step": 25000, |
| "train_loss": 0.9476055055856705, |
| "val_loss": 0.9407382626086473, |
| "val_ppl": 2.561872054696453, |
| "lr": 4.9324281217375474e-05 |
| }, |
| { |
| "step": 26000, |
| "train_loss": 0.9304470866918564, |
| "val_loss": 0.9391492558643222, |
| "val_ppl": 2.5578044553007495, |
| "lr": 4.247731834442082e-05 |
| }, |
| { |
| "step": 27000, |
| "train_loss": 0.9319835901260376, |
| "val_loss": 0.936947762966156, |
| "val_ppl": 2.5521796607019356, |
| "lr": 3.7066821890321684e-05 |
| }, |
| { |
| "step": 28000, |
| "train_loss": 0.933847963809967, |
| "val_loss": 0.9346829485148191, |
| "val_ppl": 2.5464059879406724, |
| "lr": 3.31562249791383e-05 |
| }, |
| { |
| "step": 29000, |
| "train_loss": 0.936771810054779, |
| "val_loss": 0.9336990155279636, |
| "val_ppl": 2.5439017273055704, |
| "lr": 3.0791375784161455e-05 |
| } |
| ], |
| "final_val": 0.9317306941375136, |
| "best_val": 0.9336990155279636 |
| } |