{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9520426287744228,
  "eval_steps": 200,
  "global_step": 550,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.035523978685612786,
      "grad_norm": 30.68755531311035,
      "learning_rate": 2.1176470588235296e-05,
      "loss": 10.0284,
      "step": 10
    },
    {
      "epoch": 0.07104795737122557,
      "grad_norm": 11.449149131774902,
      "learning_rate": 4.470588235294118e-05,
      "loss": 8.8672,
      "step": 20
    },
    {
      "epoch": 0.10657193605683836,
      "grad_norm": 8.332048416137695,
      "learning_rate": 6.823529411764707e-05,
      "loss": 7.2862,
      "step": 30
    },
    {
      "epoch": 0.14209591474245115,
      "grad_norm": 8.64196491241455,
      "learning_rate": 9.176470588235295e-05,
      "loss": 5.7446,
      "step": 40
    },
    {
      "epoch": 0.17761989342806395,
      "grad_norm": 9.184565544128418,
      "learning_rate": 0.00011529411764705881,
      "loss": 4.3277,
      "step": 50
    },
    {
      "epoch": 0.21314387211367672,
      "grad_norm": 12.228641510009766,
      "learning_rate": 0.00013882352941176472,
      "loss": 2.8672,
      "step": 60
    },
    {
      "epoch": 0.24866785079928952,
      "grad_norm": 10.162263870239258,
      "learning_rate": 0.0001623529411764706,
      "loss": 2.1514,
      "step": 70
    },
    {
      "epoch": 0.2841918294849023,
      "grad_norm": 8.797259330749512,
      "learning_rate": 0.00018588235294117648,
      "loss": 1.9459,
      "step": 80
    },
    {
      "epoch": 0.3197158081705151,
      "grad_norm": 6.776567459106445,
      "learning_rate": 0.00019894875164257555,
      "loss": 1.3218,
      "step": 90
    },
    {
      "epoch": 0.3552397868561279,
      "grad_norm": 8.384011268615723,
      "learning_rate": 0.00019632063074901445,
      "loss": 1.2121,
      "step": 100
    },
    {
      "epoch": 0.3907637655417407,
      "grad_norm": 6.2972564697265625,
      "learning_rate": 0.00019369250985545335,
      "loss": 0.9314,
      "step": 110
    },
    {
      "epoch": 0.42628774422735344,
      "grad_norm": 5.5578179359436035,
      "learning_rate": 0.00019106438896189225,
      "loss": 0.7533,
      "step": 120
    },
    {
      "epoch": 0.46181172291296624,
      "grad_norm": 5.197394847869873,
      "learning_rate": 0.00018843626806833115,
      "loss": 0.7108,
      "step": 130
    },
    {
      "epoch": 0.49733570159857904,
      "grad_norm": 3.50080943107605,
      "learning_rate": 0.00018580814717477005,
      "loss": 0.4706,
      "step": 140
    },
    {
      "epoch": 0.5328596802841918,
      "grad_norm": 7.297764301300049,
      "learning_rate": 0.00018318002628120894,
      "loss": 0.4651,
      "step": 150
    },
    {
      "epoch": 0.5683836589698046,
      "grad_norm": 3.9045844078063965,
      "learning_rate": 0.00018055190538764784,
      "loss": 0.5066,
      "step": 160
    },
    {
      "epoch": 0.6039076376554174,
      "grad_norm": 3.720517873764038,
      "learning_rate": 0.00017792378449408674,
      "loss": 0.4518,
      "step": 170
    },
    {
      "epoch": 0.6394316163410302,
      "grad_norm": 2.6637465953826904,
      "learning_rate": 0.00017529566360052564,
      "loss": 0.3716,
      "step": 180
    },
    {
      "epoch": 0.6749555950266429,
      "grad_norm": 3.8581767082214355,
      "learning_rate": 0.00017266754270696454,
      "loss": 0.2933,
      "step": 190
    },
    {
      "epoch": 0.7104795737122558,
      "grad_norm": 2.2496705055236816,
      "learning_rate": 0.0001700394218134034,
      "loss": 0.299,
      "step": 200
    },
    {
      "epoch": 0.7104795737122558,
      "eval_loss": 0.08425440639257431,
      "eval_runtime": 10.2304,
      "eval_samples_per_second": 48.874,
      "eval_steps_per_second": 6.158,
      "step": 200
    },
    {
      "epoch": 0.7460035523978685,
      "grad_norm": 4.2130818367004395,
      "learning_rate": 0.0001674113009198423,
      "loss": 0.3048,
      "step": 210
    },
    {
      "epoch": 0.7815275310834814,
      "grad_norm": 2.9314048290252686,
      "learning_rate": 0.0001647831800262812,
      "loss": 0.3255,
      "step": 220
    },
    {
      "epoch": 0.8170515097690941,
      "grad_norm": 3.629533052444458,
      "learning_rate": 0.0001621550591327201,
      "loss": 0.2614,
      "step": 230
    },
    {
      "epoch": 0.8525754884547069,
      "grad_norm": 2.9846203327178955,
      "learning_rate": 0.000159526938239159,
      "loss": 0.2896,
      "step": 240
    },
    {
      "epoch": 0.8880994671403197,
      "grad_norm": 4.864554405212402,
      "learning_rate": 0.0001568988173455979,
      "loss": 0.2299,
      "step": 250
    },
    {
      "epoch": 0.9236234458259325,
      "grad_norm": 5.363948345184326,
      "learning_rate": 0.0001542706964520368,
      "loss": 0.2208,
      "step": 260
    },
    {
      "epoch": 0.9591474245115453,
      "grad_norm": 7.342677593231201,
      "learning_rate": 0.0001516425755584757,
      "loss": 0.2803,
      "step": 270
    },
    {
      "epoch": 0.9946714031971581,
      "grad_norm": 3.756502389907837,
      "learning_rate": 0.0001490144546649146,
      "loss": 0.2073,
      "step": 280
    },
    {
      "epoch": 1.0284191829484903,
      "grad_norm": 2.067270278930664,
      "learning_rate": 0.0001463863337713535,
      "loss": 0.1461,
      "step": 290
    },
    {
      "epoch": 1.063943161634103,
      "grad_norm": 1.9786051511764526,
      "learning_rate": 0.0001437582128777924,
      "loss": 0.1376,
      "step": 300
    },
    {
      "epoch": 1.0994671403197158,
      "grad_norm": 3.902402877807617,
      "learning_rate": 0.0001411300919842313,
      "loss": 0.2187,
      "step": 310
    },
    {
      "epoch": 1.1349911190053286,
      "grad_norm": 8.216059684753418,
      "learning_rate": 0.0001385019710906702,
      "loss": 0.1785,
      "step": 320
    },
    {
      "epoch": 1.1705150976909413,
      "grad_norm": 3.129307270050049,
      "learning_rate": 0.00013587385019710906,
      "loss": 0.2076,
      "step": 330
    },
    {
      "epoch": 1.206039076376554,
      "grad_norm": 1.8454111814498901,
      "learning_rate": 0.00013324572930354796,
      "loss": 0.156,
      "step": 340
    },
    {
      "epoch": 1.241563055062167,
      "grad_norm": 3.502756118774414,
      "learning_rate": 0.00013061760840998686,
      "loss": 0.1637,
      "step": 350
    },
    {
      "epoch": 1.2770870337477798,
      "grad_norm": 0.7526670694351196,
      "learning_rate": 0.00012798948751642576,
      "loss": 0.1272,
      "step": 360
    },
    {
      "epoch": 1.3126110124333925,
      "grad_norm": 2.0800411701202393,
      "learning_rate": 0.00012536136662286466,
      "loss": 0.1372,
      "step": 370
    },
    {
      "epoch": 1.3481349911190053,
      "grad_norm": 2.1678974628448486,
      "learning_rate": 0.00012273324572930356,
      "loss": 0.1563,
      "step": 380
    },
    {
      "epoch": 1.383658969804618,
      "grad_norm": 3.2046022415161133,
      "learning_rate": 0.00012010512483574245,
      "loss": 0.1658,
      "step": 390
    },
    {
      "epoch": 1.419182948490231,
      "grad_norm": 1.8198497295379639,
      "learning_rate": 0.00011747700394218135,
      "loss": 0.1619,
      "step": 400
    },
    {
      "epoch": 1.419182948490231,
      "eval_loss": 0.04036250337958336,
      "eval_runtime": 9.5151,
      "eval_samples_per_second": 52.548,
      "eval_steps_per_second": 6.621,
      "step": 400
    },
    {
      "epoch": 1.4547069271758437,
      "grad_norm": 2.0104610919952393,
      "learning_rate": 0.00011484888304862025,
      "loss": 0.1181,
      "step": 410
    },
    {
      "epoch": 1.4902309058614565,
      "grad_norm": 2.408430337905884,
      "learning_rate": 0.00011222076215505915,
      "loss": 0.1293,
      "step": 420
    },
    {
      "epoch": 1.5257548845470694,
      "grad_norm": 1.787520408630371,
      "learning_rate": 0.00010959264126149803,
      "loss": 0.1132,
      "step": 430
    },
    {
      "epoch": 1.561278863232682,
      "grad_norm": 1.8986729383468628,
      "learning_rate": 0.00010696452036793693,
      "loss": 0.1296,
      "step": 440
    },
    {
      "epoch": 1.596802841918295,
      "grad_norm": 2.2806177139282227,
      "learning_rate": 0.00010433639947437583,
      "loss": 0.1117,
      "step": 450
    },
    {
      "epoch": 1.6323268206039077,
      "grad_norm": 2.358901023864746,
      "learning_rate": 0.00010170827858081473,
      "loss": 0.1474,
      "step": 460
    },
    {
      "epoch": 1.6678507992895204,
      "grad_norm": 2.020256280899048,
      "learning_rate": 9.908015768725362e-05,
      "loss": 0.1001,
      "step": 470
    },
    {
      "epoch": 1.7033747779751334,
      "grad_norm": 1.0857290029525757,
      "learning_rate": 9.645203679369251e-05,
      "loss": 0.1024,
      "step": 480
    },
    {
      "epoch": 1.738898756660746,
      "grad_norm": 1.9954943656921387,
      "learning_rate": 9.382391590013141e-05,
      "loss": 0.1033,
      "step": 490
    },
    {
      "epoch": 1.7744227353463589,
      "grad_norm": 2.0594799518585205,
      "learning_rate": 9.119579500657031e-05,
      "loss": 0.085,
      "step": 500
    },
    {
      "epoch": 1.8099467140319716,
      "grad_norm": 2.4020485877990723,
      "learning_rate": 8.85676741130092e-05,
      "loss": 0.0943,
      "step": 510
    },
    {
      "epoch": 1.8454706927175843,
      "grad_norm": 2.926164150238037,
      "learning_rate": 8.59395532194481e-05,
      "loss": 0.1207,
      "step": 520
    },
    {
      "epoch": 1.8809946714031973,
      "grad_norm": 1.3248850107192993,
      "learning_rate": 8.331143232588699e-05,
      "loss": 0.1067,
      "step": 530
    },
    {
      "epoch": 1.9165186500888098,
      "grad_norm": 1.3187663555145264,
      "learning_rate": 8.068331143232589e-05,
      "loss": 0.1041,
      "step": 540
    },
    {
      "epoch": 1.9520426287744228,
      "grad_norm": 3.3121986389160156,
      "learning_rate": 7.805519053876479e-05,
      "loss": 0.0672,
      "step": 550
    }
  ],
  "logging_steps": 10,
  "max_steps": 846,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.2267477042274304e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}