{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 50, "global_step": 2089, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09573958831977022, "grad_norm": 1.268161654472351, "learning_rate": 2.3467432950191573e-06, "loss": 0.7797, "step": 50 }, { "epoch": 0.19147917663954045, "grad_norm": 1.4730911254882812, "learning_rate": 4.741379310344828e-06, "loss": 0.5423, "step": 100 }, { "epoch": 0.2872187649593107, "grad_norm": 0.5494027137756348, "learning_rate": 7.136015325670499e-06, "loss": 0.3713, "step": 150 }, { "epoch": 0.3829583532790809, "grad_norm": 0.4706728458404541, "learning_rate": 9.530651340996169e-06, "loss": 0.3247, "step": 200 }, { "epoch": 0.47869794159885115, "grad_norm": 0.6062602400779724, "learning_rate": 1.192528735632184e-05, "loss": 0.2856, "step": 250 }, { "epoch": 0.5744375299186214, "grad_norm": 0.7379469275474548, "learning_rate": 1.431992337164751e-05, "loss": 0.2889, "step": 300 }, { "epoch": 0.6701771182383915, "grad_norm": 0.7957981824874878, "learning_rate": 1.6666666666666667e-05, "loss": 0.2783, "step": 350 }, { "epoch": 0.7659167065581618, "grad_norm": 0.8965973854064941, "learning_rate": 1.9061302681992337e-05, "loss": 0.2609, "step": 400 }, { "epoch": 0.861656294877932, "grad_norm": 0.5201160311698914, "learning_rate": 2.145593869731801e-05, "loss": 0.2752, "step": 450 }, { "epoch": 0.9573958831977023, "grad_norm": 0.733790934085846, "learning_rate": 2.385057471264368e-05, "loss": 0.2374, "step": 500 }, { "epoch": 0.9995213020584012, "eval_loss": 0.4093788266181946, "eval_runtime": 193.3736, "eval_samples_per_second": 1.205, "eval_steps_per_second": 0.155, "step": 522 }, { "epoch": 1.0531354715174726, "grad_norm": 0.5924952030181885, "learning_rate": 2.624521072796935e-05, "loss": 0.2357, "step": 550 }, { "epoch": 1.1488750598372426, "grad_norm": 0.6746445894241333, "learning_rate": 2.863984674329502e-05, "loss": 0.243, "step": 600 }, { "epoch": 1.2446146481570128, "grad_norm": 0.7088050842285156, "learning_rate": 3.103448275862069e-05, "loss": 0.2491, "step": 650 }, { "epoch": 1.340354236476783, "grad_norm": 0.4997643828392029, "learning_rate": 3.342911877394636e-05, "loss": 0.2221, "step": 700 }, { "epoch": 1.4360938247965533, "grad_norm": 0.5417702794075012, "learning_rate": 3.582375478927204e-05, "loss": 0.2376, "step": 750 }, { "epoch": 1.5318334131163236, "grad_norm": 0.8095722198486328, "learning_rate": 3.82183908045977e-05, "loss": 0.2369, "step": 800 }, { "epoch": 1.6275730014360938, "grad_norm": 0.6943379044532776, "learning_rate": 4.061302681992337e-05, "loss": 0.2482, "step": 850 }, { "epoch": 1.723312589755864, "grad_norm": 0.6098237633705139, "learning_rate": 4.3007662835249046e-05, "loss": 0.2245, "step": 900 }, { "epoch": 1.8190521780756344, "grad_norm": 0.4689110815525055, "learning_rate": 4.5402298850574716e-05, "loss": 0.2276, "step": 950 }, { "epoch": 1.9147917663954046, "grad_norm": 0.6285553574562073, "learning_rate": 4.7796934865900385e-05, "loss": 0.2308, "step": 1000 }, { "epoch": 1.9990426041168023, "eval_loss": 0.3900133967399597, "eval_runtime": 192.4384, "eval_samples_per_second": 1.211, "eval_steps_per_second": 0.156, "step": 1044 }, { "epoch": 2.010531354715175, "grad_norm": 0.40644219517707825, "learning_rate": 4.999988680990267e-05, "loss": 0.227, "step": 1050 }, { "epoch": 2.106270943034945, "grad_norm": 0.6110271215438843, "learning_rate": 4.9979373926052865e-05, "loss": 0.2053, "step": 1100 }, { "epoch": 2.2020105313547154, "grad_norm": 0.7076897025108337, "learning_rate": 4.992352246040183e-05, "loss": 0.232, "step": 1150 }, { "epoch": 2.297750119674485, "grad_norm": 0.5460345149040222, "learning_rate": 4.983241142660274e-05, "loss": 0.2202, "step": 1200 }, { "epoch": 2.393489707994256, "grad_norm": 0.774861216545105, "learning_rate": 4.970616972038894e-05, "loss": 0.2135, "step": 1250 }, { "epoch": 2.4892292963140257, "grad_norm": 0.489745557308197, "learning_rate": 4.954497593722384e-05, "loss": 0.2125, "step": 1300 }, { "epoch": 2.584968884633796, "grad_norm": 0.43657568097114563, "learning_rate": 4.9349058119640005e-05, "loss": 0.1994, "step": 1350 }, { "epoch": 2.680708472953566, "grad_norm": 0.5336562395095825, "learning_rate": 4.911869343462504e-05, "loss": 0.2077, "step": 1400 }, { "epoch": 2.7764480612733364, "grad_norm": 0.3822609782218933, "learning_rate": 4.88542077815105e-05, "loss": 0.2077, "step": 1450 }, { "epoch": 2.8721876495931067, "grad_norm": 0.41932567954063416, "learning_rate": 4.8555975330918736e-05, "loss": 0.1983, "step": 1500 }, { "epoch": 2.967927237912877, "grad_norm": 0.5260158181190491, "learning_rate": 4.822441799541979e-05, "loss": 0.2149, "step": 1550 }, { "epoch": 2.998563906175203, "eval_loss": 0.3759535849094391, "eval_runtime": 192.4413, "eval_samples_per_second": 1.211, "eval_steps_per_second": 0.156, "step": 1566 }, { "epoch": 3.063666826232647, "grad_norm": 0.35727375745773315, "learning_rate": 4.786000483264725e-05, "loss": 0.183, "step": 1600 }, { "epoch": 3.1594064145524174, "grad_norm": 0.44125649333000183, "learning_rate": 4.7463251381717515e-05, "loss": 0.2217, "step": 1650 }, { "epoch": 3.2551460028721877, "grad_norm": 0.39317113161087036, "learning_rate": 4.703471893389122e-05, "loss": 0.1947, "step": 1700 }, { "epoch": 3.350885591191958, "grad_norm": 0.5991040468215942, "learning_rate": 4.6575013738508575e-05, "loss": 0.2082, "step": 1750 }, { "epoch": 3.446625179511728, "grad_norm": 0.4587060213088989, "learning_rate": 4.608478614532215e-05, "loss": 0.1812, "step": 1800 }, { "epoch": 3.5423647678314985, "grad_norm": 0.6657220125198364, "learning_rate": 4.556472968444017e-05, "loss": 0.17, "step": 1850 }, { "epoch": 3.6381043561512687, "grad_norm": 0.6673758029937744, "learning_rate": 4.501558008518231e-05, "loss": 0.1833, "step": 1900 }, { "epoch": 3.7338439444710385, "grad_norm": 0.5800417065620422, "learning_rate": 4.4438114235235655e-05, "loss": 0.1819, "step": 1950 }, { "epoch": 3.829583532790809, "grad_norm": 0.9954800605773926, "learning_rate": 4.3833149081583604e-05, "loss": 0.2147, "step": 2000 }, { "epoch": 3.925323121110579, "grad_norm": 0.3738028109073639, "learning_rate": 4.320154047476237e-05, "loss": 0.1718, "step": 2050 }, { "epoch": 4.0, "eval_loss": 0.37361159920692444, "eval_runtime": 192.3984, "eval_samples_per_second": 1.211, "eval_steps_per_second": 0.156, "step": 2089 } ], "logging_steps": 50, "max_steps": 5220, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 8.597365381542052e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }