{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.0,
  "eval_steps": 50,
  "global_step": 2089,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.09573958831977022,
      "grad_norm": 1.268161654472351,
      "learning_rate": 2.3467432950191573e-06,
      "loss": 0.7797,
      "step": 50
    },
    {
      "epoch": 0.19147917663954045,
      "grad_norm": 1.4730911254882812,
      "learning_rate": 4.741379310344828e-06,
      "loss": 0.5423,
      "step": 100
    },
    {
      "epoch": 0.2872187649593107,
      "grad_norm": 0.5494027137756348,
      "learning_rate": 7.136015325670499e-06,
      "loss": 0.3713,
      "step": 150
    },
    {
      "epoch": 0.3829583532790809,
      "grad_norm": 0.4706728458404541,
      "learning_rate": 9.530651340996169e-06,
      "loss": 0.3247,
      "step": 200
    },
    {
      "epoch": 0.47869794159885115,
      "grad_norm": 0.6062602400779724,
      "learning_rate": 1.192528735632184e-05,
      "loss": 0.2856,
      "step": 250
    },
    {
      "epoch": 0.5744375299186214,
      "grad_norm": 0.7379469275474548,
      "learning_rate": 1.431992337164751e-05,
      "loss": 0.2889,
      "step": 300
    },
    {
      "epoch": 0.6701771182383915,
      "grad_norm": 0.7957981824874878,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 0.2783,
      "step": 350
    },
    {
      "epoch": 0.7659167065581618,
      "grad_norm": 0.8965973854064941,
      "learning_rate": 1.9061302681992337e-05,
      "loss": 0.2609,
      "step": 400
    },
    {
      "epoch": 0.861656294877932,
      "grad_norm": 0.5201160311698914,
      "learning_rate": 2.145593869731801e-05,
      "loss": 0.2752,
      "step": 450
    },
    {
      "epoch": 0.9573958831977023,
      "grad_norm": 0.733790934085846,
      "learning_rate": 2.385057471264368e-05,
      "loss": 0.2374,
      "step": 500
    },
    {
      "epoch": 0.9995213020584012,
      "eval_loss": 0.4093788266181946,
      "eval_runtime": 193.3736,
      "eval_samples_per_second": 1.205,
      "eval_steps_per_second": 0.155,
      "step": 522
    },
    {
      "epoch": 1.0531354715174726,
      "grad_norm": 0.5924952030181885,
      "learning_rate": 2.624521072796935e-05,
      "loss": 0.2357,
      "step": 550
    },
    {
      "epoch": 1.1488750598372426,
      "grad_norm": 0.6746445894241333,
      "learning_rate": 2.863984674329502e-05,
      "loss": 0.243,
      "step": 600
    },
    {
      "epoch": 1.2446146481570128,
      "grad_norm": 0.7088050842285156,
      "learning_rate": 3.103448275862069e-05,
      "loss": 0.2491,
      "step": 650
    },
    {
      "epoch": 1.340354236476783,
      "grad_norm": 0.4997643828392029,
      "learning_rate": 3.342911877394636e-05,
      "loss": 0.2221,
      "step": 700
    },
    {
      "epoch": 1.4360938247965533,
      "grad_norm": 0.5417702794075012,
      "learning_rate": 3.582375478927204e-05,
      "loss": 0.2376,
      "step": 750
    },
    {
      "epoch": 1.5318334131163236,
      "grad_norm": 0.8095722198486328,
      "learning_rate": 3.82183908045977e-05,
      "loss": 0.2369,
      "step": 800
    },
    {
      "epoch": 1.6275730014360938,
      "grad_norm": 0.6943379044532776,
      "learning_rate": 4.061302681992337e-05,
      "loss": 0.2482,
      "step": 850
    },
    {
      "epoch": 1.723312589755864,
      "grad_norm": 0.6098237633705139,
      "learning_rate": 4.3007662835249046e-05,
      "loss": 0.2245,
      "step": 900
    },
    {
      "epoch": 1.8190521780756344,
      "grad_norm": 0.4689110815525055,
      "learning_rate": 4.5402298850574716e-05,
      "loss": 0.2276,
      "step": 950
    },
    {
      "epoch": 1.9147917663954046,
      "grad_norm": 0.6285553574562073,
      "learning_rate": 4.7796934865900385e-05,
      "loss": 0.2308,
      "step": 1000
    },
    {
      "epoch": 1.9990426041168023,
      "eval_loss": 0.3900133967399597,
      "eval_runtime": 192.4384,
      "eval_samples_per_second": 1.211,
      "eval_steps_per_second": 0.156,
      "step": 1044
    },
    {
      "epoch": 2.010531354715175,
      "grad_norm": 0.40644219517707825,
      "learning_rate": 4.999988680990267e-05,
      "loss": 0.227,
      "step": 1050
    },
    {
      "epoch": 2.106270943034945,
      "grad_norm": 0.6110271215438843,
      "learning_rate": 4.9979373926052865e-05,
      "loss": 0.2053,
      "step": 1100
    },
    {
      "epoch": 2.2020105313547154,
      "grad_norm": 0.7076897025108337,
      "learning_rate": 4.992352246040183e-05,
      "loss": 0.232,
      "step": 1150
    },
    {
      "epoch": 2.297750119674485,
      "grad_norm": 0.5460345149040222,
      "learning_rate": 4.983241142660274e-05,
      "loss": 0.2202,
      "step": 1200
    },
    {
      "epoch": 2.393489707994256,
      "grad_norm": 0.774861216545105,
      "learning_rate": 4.970616972038894e-05,
      "loss": 0.2135,
      "step": 1250
    },
    {
      "epoch": 2.4892292963140257,
      "grad_norm": 0.489745557308197,
      "learning_rate": 4.954497593722384e-05,
      "loss": 0.2125,
      "step": 1300
    },
    {
      "epoch": 2.584968884633796,
      "grad_norm": 0.43657568097114563,
      "learning_rate": 4.9349058119640005e-05,
      "loss": 0.1994,
      "step": 1350
    },
    {
      "epoch": 2.680708472953566,
      "grad_norm": 0.5336562395095825,
      "learning_rate": 4.911869343462504e-05,
      "loss": 0.2077,
      "step": 1400
    },
    {
      "epoch": 2.7764480612733364,
      "grad_norm": 0.3822609782218933,
      "learning_rate": 4.88542077815105e-05,
      "loss": 0.2077,
      "step": 1450
    },
    {
      "epoch": 2.8721876495931067,
      "grad_norm": 0.41932567954063416,
      "learning_rate": 4.8555975330918736e-05,
      "loss": 0.1983,
      "step": 1500
    },
    {
      "epoch": 2.967927237912877,
      "grad_norm": 0.5260158181190491,
      "learning_rate": 4.822441799541979e-05,
      "loss": 0.2149,
      "step": 1550
    },
    {
      "epoch": 2.998563906175203,
      "eval_loss": 0.3759535849094391,
      "eval_runtime": 192.4413,
      "eval_samples_per_second": 1.211,
      "eval_steps_per_second": 0.156,
      "step": 1566
    },
    {
      "epoch": 3.063666826232647,
      "grad_norm": 0.35727375745773315,
      "learning_rate": 4.786000483264725e-05,
      "loss": 0.183,
      "step": 1600
    },
    {
      "epoch": 3.1594064145524174,
      "grad_norm": 0.44125649333000183,
      "learning_rate": 4.7463251381717515e-05,
      "loss": 0.2217,
      "step": 1650
    },
    {
      "epoch": 3.2551460028721877,
      "grad_norm": 0.39317113161087036,
      "learning_rate": 4.703471893389122e-05,
      "loss": 0.1947,
      "step": 1700
    },
    {
      "epoch": 3.350885591191958,
      "grad_norm": 0.5991040468215942,
      "learning_rate": 4.6575013738508575e-05,
      "loss": 0.2082,
      "step": 1750
    },
    {
      "epoch": 3.446625179511728,
      "grad_norm": 0.4587060213088989,
      "learning_rate": 4.608478614532215e-05,
      "loss": 0.1812,
      "step": 1800
    },
    {
      "epoch": 3.5423647678314985,
      "grad_norm": 0.6657220125198364,
      "learning_rate": 4.556472968444017e-05,
      "loss": 0.17,
      "step": 1850
    },
    {
      "epoch": 3.6381043561512687,
      "grad_norm": 0.6673758029937744,
      "learning_rate": 4.501558008518231e-05,
      "loss": 0.1833,
      "step": 1900
    },
    {
      "epoch": 3.7338439444710385,
      "grad_norm": 0.5800417065620422,
      "learning_rate": 4.4438114235235655e-05,
      "loss": 0.1819,
      "step": 1950
    },
    {
      "epoch": 3.829583532790809,
      "grad_norm": 0.9954800605773926,
      "learning_rate": 4.3833149081583604e-05,
      "loss": 0.2147,
      "step": 2000
    },
    {
      "epoch": 3.925323121110579,
      "grad_norm": 0.3738028109073639,
      "learning_rate": 4.320154047476237e-05,
      "loss": 0.1718,
      "step": 2050
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.37361159920692444,
      "eval_runtime": 192.3984,
      "eval_samples_per_second": 1.211,
      "eval_steps_per_second": 0.156,
      "step": 2089
    }
  ],
  "logging_steps": 50,
  "max_steps": 5220,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "total_flos": 8.597365381542052e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}