{ "params": 944640, "args": { "data": "data/tinystories_full.txt", "output": "checkpoints/atome_1m_v1.pt", "steps": 30000, "seq_len": 256, "batch_size": 64, "accum_steps": 4, "lr": 0.0003, "min_lr": 3e-05, "warmup": 1000, "weight_decay": 0.1, "d_model": 256, "n_layers": 8, "d_head": 64, "top_k": 4, "bf16": true, "eval_every": 1000, "seed": 0 }, "log": [ { "step": 1000, "train_loss": 1.689065933227539, "val_loss": 1.6851140782237053, "val_ppl": 5.3930661286628725, "lr": 0.0003 }, { "step": 2000, "train_loss": 1.475701928138733, "val_loss": 1.4368714336305857, "val_ppl": 4.207511724416042, "lr": 0.0002992086242158385 }, { "step": 3000, "train_loss": 1.3402614891529083, "val_loss": 1.355498529970646, "val_ppl": 3.8786941199889884, "lr": 0.00029684377502086165 }, { "step": 4000, "train_loss": 1.2906470894813538, "val_loss": 1.298057682812214, "val_ppl": 3.662176646542712, "lr": 0.0002929331781096783 }, { "step": 5000, "train_loss": 1.2640663385391235, "val_loss": 1.2564894184470177, "val_ppl": 3.513066906295889, "lr": 0.00028752268165557917 }, { "step": 6000, "train_loss": 1.205640196800232, "val_loss": 1.2161348164081573, "val_ppl": 3.374120900293555, "lr": 0.0002806757187826245 }, { "step": 7000, "train_loss": 1.1917544305324554, "val_loss": 1.1835042145103216, "val_ppl": 3.2657982326287116, "lr": 0.00027247256387026185 }, { "step": 8000, "train_loss": 1.1544596254825592, "val_loss": 1.1677243299782276, "val_ppl": 3.2146687829705525, "lr": 0.0002630093914096226 }, { "step": 9000, "train_loss": 1.1510637402534485, "val_loss": 1.1527819111943245, "val_ppl": 3.166990953913901, "lr": 0.0002523971484455467 }, { "step": 10000, "train_loss": 1.140123575925827, "val_loss": 1.1461433116346598, "val_ppl": 3.146036201225796, "lr": 0.0002407602538239216 }, { "step": 11000, "train_loss": 1.1275735795497894, "val_loss": 1.131921675056219, "val_ppl": 3.1016110655411038, "lr": 0.00022823513949447164 }, { "step": 12000, "train_loss": 1.1099890172481537, "val_loss": 1.112453417852521, "val_ppl": 3.041812083259338, "lr": 0.00021496865097088842 }, { "step": 13000, "train_loss": 1.1127586960792542, "val_loss": 1.112892348319292, "val_ppl": 3.043147520317438, "lr": 0.0002011163257014448 }, { "step": 14000, "train_loss": 1.0873990654945374, "val_loss": 1.1024821121245623, "val_ppl": 3.0116319626741244, "lr": 0.00018684056953462323 }, { "step": 15000, "train_loss": 1.0949949026107788, "val_loss": 1.1003286074846983, "val_ppl": 3.0051533776041945, "lr": 0.00017230875265903135 }, { "step": 16000, "train_loss": 1.092372715473175, "val_loss": 1.0886210184544325, "val_ppl": 2.9701754301311736, "lr": 0.00015769124734096862 }, { "step": 17000, "train_loss": 1.0719301402568817, "val_loss": 1.087962357327342, "val_ppl": 2.968219735175533, "lr": 0.00014315943046537674 }, { "step": 18000, "train_loss": 1.0894330739974976, "val_loss": 1.0875801891088486, "val_ppl": 2.9670855926576603, "lr": 0.0001288836742985552 }, { "step": 19000, "train_loss": 1.0676527321338654, "val_loss": 1.0716162715107203, "val_ppl": 2.920095354830056, "lr": 0.00011503134902911152 }, { "step": 20000, "train_loss": 1.0742259323596954, "val_loss": 1.0812196973711252, "val_ppl": 2.948273360207015, "lr": 0.00010176486050552833 }, { "step": 21000, "train_loss": 1.0726729929447174, "val_loss": 1.0718515273183584, "val_ppl": 2.9207824050342435, "lr": 8.923974617607838e-05 }, { "step": 22000, "train_loss": 1.0701198875904083, "val_loss": 1.0739975553005934, "val_ppl": 2.927057216357621, "lr": 7.760285155445327e-05 }, { "step": 23000, "train_loss": 1.0675779581069946, "val_loss": 1.0646078549325466, "val_ppl": 2.899701657373658, "lr": 6.699060859037736e-05 }, { "step": 24000, "train_loss": 1.0793527662754059, "val_loss": 1.0707154776901007, "val_ppl": 2.917466135348921, "lr": 5.7527436129738084e-05 }, { "step": 25000, "train_loss": 1.0686360597610474, "val_loss": 1.067691769450903, "val_ppl": 2.9086578924472115, "lr": 4.9324281217375474e-05 }, { "step": 26000, "train_loss": 1.079252928495407, "val_loss": 1.064154027029872, "val_ppl": 2.8983859904178786, "lr": 4.247731834442082e-05 }, { "step": 27000, "train_loss": 1.0666958093643188, "val_loss": 1.0639245696365833, "val_ppl": 2.8977210106189566, "lr": 3.7066821890321684e-05 }, { "step": 28000, "train_loss": 1.065284639596939, "val_loss": 1.0690924655646086, "val_ppl": 2.912734892906038, "lr": 3.31562249791383e-05 }, { "step": 29000, "train_loss": 1.06133571267128, "val_loss": 1.0545352958142757, "val_ppl": 2.8706408450794916, "lr": 3.0791375784161455e-05 } ], "final_val": 1.0572172198444605, "best_val": 1.0545352958142757 }