{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9959514170040484,
  "eval_steps": 100,
  "global_step": 555,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05398110661268556,
      "grad_norm": 7.922487686473044,
      "learning_rate": 1.7857142857142859e-06,
      "loss": 0.9678,
      "step": 10
    },
    {
      "epoch": 0.10796221322537113,
      "grad_norm": 3.6097675653277137,
      "learning_rate": 3.5714285714285718e-06,
      "loss": 0.6964,
      "step": 20
    },
    {
      "epoch": 0.16194331983805668,
      "grad_norm": 2.781615016663475,
      "learning_rate": 5.357142857142857e-06,
      "loss": 0.4919,
      "step": 30
    },
    {
      "epoch": 0.21592442645074225,
      "grad_norm": 2.440518767608687,
      "learning_rate": 7.1428571428571436e-06,
      "loss": 0.4628,
      "step": 40
    },
    {
      "epoch": 0.2699055330634278,
      "grad_norm": 2.0298957975095084,
      "learning_rate": 8.92857142857143e-06,
      "loss": 0.3842,
      "step": 50
    },
    {
      "epoch": 0.32388663967611336,
      "grad_norm": 2.5192137508367822,
      "learning_rate": 9.998414611537682e-06,
      "loss": 0.4654,
      "step": 60
    },
    {
      "epoch": 0.37786774628879893,
      "grad_norm": 2.063239927715952,
      "learning_rate": 9.980590535514234e-06,
      "loss": 0.3943,
      "step": 70
    },
    {
      "epoch": 0.4318488529014845,
      "grad_norm": 1.7735569657630474,
      "learning_rate": 9.943031509146825e-06,
      "loss": 0.3839,
      "step": 80
    },
    {
      "epoch": 0.48582995951417,
      "grad_norm": 2.0461738081974197,
      "learning_rate": 9.885886355253758e-06,
      "loss": 0.3731,
      "step": 90
    },
    {
      "epoch": 0.5398110661268556,
      "grad_norm": 2.1255847957754823,
      "learning_rate": 9.809381504168235e-06,
      "loss": 0.3932,
      "step": 100
    },
    {
      "epoch": 0.5398110661268556,
      "eval_loss": 0.3728838264942169,
      "eval_runtime": 35.9999,
      "eval_samples_per_second": 18.306,
      "eval_steps_per_second": 9.167,
      "step": 100
    },
    {
      "epoch": 0.5937921727395412,
      "grad_norm": 2.0326706303899242,
      "learning_rate": 9.713820096537226e-06,
      "loss": 0.4067,
      "step": 110
    },
    {
      "epoch": 0.6477732793522267,
      "grad_norm": 2.0461345586648614,
      "learning_rate": 9.599580782165598e-06,
      "loss": 0.3586,
      "step": 120
    },
    {
      "epoch": 0.7017543859649122,
      "grad_norm": 1.7000102808818796,
      "learning_rate": 9.467116219664893e-06,
      "loss": 0.3675,
      "step": 130
    },
    {
      "epoch": 0.7557354925775979,
      "grad_norm": 1.8722517215762846,
      "learning_rate": 9.316951282851708e-06,
      "loss": 0.3823,
      "step": 140
    },
    {
      "epoch": 0.8097165991902834,
      "grad_norm": 1.9274067383112672,
      "learning_rate": 9.149680981002609e-06,
      "loss": 0.3965,
      "step": 150
    },
    {
      "epoch": 0.863697705802969,
      "grad_norm": 1.9956765255106534,
      "learning_rate": 8.965968101206291e-06,
      "loss": 0.3643,
      "step": 160
    },
    {
      "epoch": 0.9176788124156545,
      "grad_norm": 1.6513414260937604,
      "learning_rate": 8.76654058215486e-06,
      "loss": 0.3892,
      "step": 170
    },
    {
      "epoch": 0.97165991902834,
      "grad_norm": 1.6861799774025805,
      "learning_rate": 8.552188629780245e-06,
      "loss": 0.3469,
      "step": 180
    },
    {
      "epoch": 1.0256410256410255,
      "grad_norm": 1.7848543456863206,
      "learning_rate": 8.323761586164695e-06,
      "loss": 0.3549,
      "step": 190
    },
    {
      "epoch": 1.0796221322537112,
      "grad_norm": 1.911561758198626,
      "learning_rate": 8.082164564131844e-06,
      "loss": 0.235,
      "step": 200
    },
    {
      "epoch": 1.0796221322537112,
      "eval_loss": 0.36726805567741394,
      "eval_runtime": 35.642,
      "eval_samples_per_second": 18.489,
      "eval_steps_per_second": 9.259,
      "step": 200
    },
    {
      "epoch": 1.1336032388663968,
      "grad_norm": 1.8384533327647081,
      "learning_rate": 7.8283548608534e-06,
      "loss": 0.2505,
      "step": 210
    },
    {
      "epoch": 1.1875843454790824,
      "grad_norm": 1.8871085064270166,
      "learning_rate": 7.563338164682036e-06,
      "loss": 0.2495,
      "step": 220
    },
    {
      "epoch": 1.2415654520917678,
      "grad_norm": 1.8350946648334725,
      "learning_rate": 7.2881645702404625e-06,
      "loss": 0.2302,
      "step": 230
    },
    {
      "epoch": 1.2955465587044535,
      "grad_norm": 1.8379347227690594,
      "learning_rate": 7.003924417556344e-06,
      "loss": 0.2603,
      "step": 240
    },
    {
      "epoch": 1.349527665317139,
      "grad_norm": 1.698128433611101,
      "learning_rate": 6.711743971729967e-06,
      "loss": 0.2452,
      "step": 250
    },
    {
      "epoch": 1.4035087719298245,
      "grad_norm": 1.9502693370667938,
      "learning_rate": 6.412780960253437e-06,
      "loss": 0.2402,
      "step": 260
    },
    {
      "epoch": 1.45748987854251,
      "grad_norm": 1.8372098407405908,
      "learning_rate": 6.108219985664161e-06,
      "loss": 0.2366,
      "step": 270
    },
    {
      "epoch": 1.5114709851551957,
      "grad_norm": 1.8570260173536959,
      "learning_rate": 5.799267831709442e-06,
      "loss": 0.2307,
      "step": 280
    },
    {
      "epoch": 1.5654520917678814,
      "grad_norm": 1.819729150874523,
      "learning_rate": 5.487148681620862e-06,
      "loss": 0.2561,
      "step": 290
    },
    {
      "epoch": 1.6194331983805668,
      "grad_norm": 1.7840416072724379,
      "learning_rate": 5.173099267445452e-06,
      "loss": 0.2243,
      "step": 300
    },
    {
      "epoch": 1.6194331983805668,
      "eval_loss": 0.3492150604724884,
      "eval_runtime": 35.6932,
      "eval_samples_per_second": 18.463,
      "eval_steps_per_second": 9.245,
      "step": 300
    },
    {
      "epoch": 1.6734143049932524,
      "grad_norm": 2.01110786926046,
      "learning_rate": 4.8583639696537815e-06,
      "loss": 0.2575,
      "step": 310
    },
    {
      "epoch": 1.7273954116059378,
      "grad_norm": 1.9052094287787993,
      "learning_rate": 4.544189886442163e-06,
      "loss": 0.2749,
      "step": 320
    },
    {
      "epoch": 1.7813765182186234,
      "grad_norm": 1.7823079521720098,
      "learning_rate": 4.23182189226621e-06,
      "loss": 0.2442,
      "step": 330
    },
    {
      "epoch": 1.835357624831309,
      "grad_norm": 1.8720408047784711,
      "learning_rate": 3.9224977051856906e-06,
      "loss": 0.2275,
      "step": 340
    },
    {
      "epoch": 1.8893387314439947,
      "grad_norm": 1.671249722522569,
      "learning_rate": 3.6174429825656687e-06,
      "loss": 0.2461,
      "step": 350
    },
    {
      "epoch": 1.9433198380566803,
      "grad_norm": 1.6664948881691295,
      "learning_rate": 3.317866464566607e-06,
      "loss": 0.24,
      "step": 360
    },
    {
      "epoch": 1.9973009446693657,
      "grad_norm": 1.8939691987056522,
      "learning_rate": 3.0249551846667207e-06,
      "loss": 0.2394,
      "step": 370
    },
    {
      "epoch": 2.051282051282051,
      "grad_norm": 1.5123125003013291,
      "learning_rate": 2.7398697661942632e-06,
      "loss": 0.1461,
      "step": 380
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 1.794933302973888,
      "learning_rate": 2.4637398235066527e-06,
      "loss": 0.1443,
      "step": 390
    },
    {
      "epoch": 2.1592442645074224,
      "grad_norm": 1.7338032866199007,
      "learning_rate": 2.19765948603866e-06,
      "loss": 0.1416,
      "step": 400
    },
    {
      "epoch": 2.1592442645074224,
      "eval_loss": 0.37677058577537537,
      "eval_runtime": 35.9366,
      "eval_samples_per_second": 18.338,
      "eval_steps_per_second": 9.183,
      "step": 400
    },
    {
      "epoch": 2.213225371120108,
      "grad_norm": 1.5206467493039364,
      "learning_rate": 1.9426830629550244e-06,
      "loss": 0.1282,
      "step": 410
    },
    {
      "epoch": 2.2672064777327936,
      "grad_norm": 1.7511848898998006,
      "learning_rate": 1.699820865585814e-06,
      "loss": 0.1444,
      "step": 420
    },
    {
      "epoch": 2.3211875843454792,
      "grad_norm": 1.658903992055223,
      "learning_rate": 1.470035204197517e-06,
      "loss": 0.135,
      "step": 430
    },
    {
      "epoch": 2.375168690958165,
      "grad_norm": 1.3664323393041014,
      "learning_rate": 1.2542365749622048e-06,
      "loss": 0.124,
      "step": 440
    },
    {
      "epoch": 2.42914979757085,
      "grad_norm": 1.5341666917244396,
      "learning_rate": 1.0532800522333902e-06,
      "loss": 0.1336,
      "step": 450
    },
    {
      "epoch": 2.4831309041835357,
      "grad_norm": 1.6130184135595669,
      "learning_rate": 8.679619004237111e-07,
      "loss": 0.1265,
      "step": 460
    },
    {
      "epoch": 2.5371120107962213,
      "grad_norm": 1.8720144055458996,
      "learning_rate": 6.990164189094589e-07,
      "loss": 0.1378,
      "step": 470
    },
    {
      "epoch": 2.591093117408907,
      "grad_norm": 1.8892042899824109,
      "learning_rate": 5.471130324636115e-07,
      "loss": 0.1156,
      "step": 480
    },
    {
      "epoch": 2.6450742240215925,
      "grad_norm": 1.5161577357944425,
      "learning_rate": 4.12853638746134e-07,
      "loss": 0.1174,
      "step": 490
    },
    {
      "epoch": 2.699055330634278,
      "grad_norm": 1.6875142053299983,
      "learning_rate": 2.9677022336181414e-07,
      "loss": 0.1247,
      "step": 500
    },
    {
      "epoch": 2.699055330634278,
      "eval_loss": 0.384060263633728,
      "eval_runtime": 35.8027,
      "eval_samples_per_second": 18.406,
      "eval_steps_per_second": 9.217,
      "step": 500
    },
    {
      "epoch": 2.753036437246964,
      "grad_norm": 1.6239326274258383,
      "learning_rate": 1.993227519356189e-07,
      "loss": 0.1213,
      "step": 510
    },
    {
      "epoch": 2.807017543859649,
      "grad_norm": 1.775333184189768,
      "learning_rate": 1.2089734755797611e-07,
      "loss": 0.1303,
      "step": 520
    },
    {
      "epoch": 2.8609986504723346,
      "grad_norm": 1.598091253450295,
      "learning_rate": 6.180476082162656e-08,
      "loss": 0.1234,
      "step": 530
    },
    {
      "epoch": 2.91497975708502,
      "grad_norm": 1.5996469720413187,
      "learning_rate": 2.227913851230057e-08,
      "loss": 0.1329,
      "step": 540
    },
    {
      "epoch": 2.968960863697706,
      "grad_norm": 1.6824674270497044,
      "learning_rate": 2.4770958321568283e-09,
      "loss": 0.139,
      "step": 550
    },
    {
      "epoch": 2.9959514170040484,
      "step": 555,
      "total_flos": 10408637890560.0,
      "train_loss": 0.27379920536333374,
      "train_runtime": 2871.008,
      "train_samples_per_second": 6.193,
      "train_steps_per_second": 0.193
    }
  ],
  "logging_steps": 10,
  "max_steps": 555,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 10408637890560.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}