| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.03949447077409163, | |
| "eval_steps": 500, | |
| "global_step": 200, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00019747235387045813, | |
| "grad_norm": 5.788590908050537, | |
| "learning_rate": 0.0, | |
| "loss": 1.0436, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.00039494470774091627, | |
| "grad_norm": 6.300467491149902, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": 1.3195, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0005924170616113745, | |
| "grad_norm": 6.489816665649414, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 1.5645, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.0007898894154818325, | |
| "grad_norm": 5.287850379943848, | |
| "learning_rate": 3e-06, | |
| "loss": 1.154, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.0009873617693522906, | |
| "grad_norm": 6.123354434967041, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 1.4052, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.001184834123222749, | |
| "grad_norm": 5.501026630401611, | |
| "learning_rate": 5e-06, | |
| "loss": 1.708, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.001382306477093207, | |
| "grad_norm": 4.960855007171631, | |
| "learning_rate": 6e-06, | |
| "loss": 1.6143, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.001579778830963665, | |
| "grad_norm": 5.855900764465332, | |
| "learning_rate": 7e-06, | |
| "loss": 1.6376, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.0017772511848341231, | |
| "grad_norm": 5.17061710357666, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 1.2182, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.0019747235387045812, | |
| "grad_norm": 4.436169624328613, | |
| "learning_rate": 9e-06, | |
| "loss": 1.9812, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0021721958925750395, | |
| "grad_norm": 4.55659818649292, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1695, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.002369668246445498, | |
| "grad_norm": 5.105171203613281, | |
| "learning_rate": 9.947368421052632e-06, | |
| "loss": 1.1243, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.0025671406003159557, | |
| "grad_norm": 5.100945472717285, | |
| "learning_rate": 9.894736842105264e-06, | |
| "loss": 1.8183, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.002764612954186414, | |
| "grad_norm": 4.643738746643066, | |
| "learning_rate": 9.842105263157896e-06, | |
| "loss": 1.2569, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.002962085308056872, | |
| "grad_norm": 4.557013034820557, | |
| "learning_rate": 9.789473684210527e-06, | |
| "loss": 1.7372, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.00315955766192733, | |
| "grad_norm": 6.458473205566406, | |
| "learning_rate": 9.736842105263159e-06, | |
| "loss": 2.1888, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.0033570300157977884, | |
| "grad_norm": 5.135770320892334, | |
| "learning_rate": 9.68421052631579e-06, | |
| "loss": 1.2197, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.0035545023696682463, | |
| "grad_norm": 5.171187400817871, | |
| "learning_rate": 9.631578947368422e-06, | |
| "loss": 1.6326, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.0037519747235387046, | |
| "grad_norm": 4.825802326202393, | |
| "learning_rate": 9.578947368421054e-06, | |
| "loss": 1.4698, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.0039494470774091624, | |
| "grad_norm": 4.34477424621582, | |
| "learning_rate": 9.526315789473684e-06, | |
| "loss": 1.6102, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.004146919431279621, | |
| "grad_norm": 4.5733866691589355, | |
| "learning_rate": 9.473684210526315e-06, | |
| "loss": 1.8448, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.004344391785150079, | |
| "grad_norm": 4.53968620300293, | |
| "learning_rate": 9.421052631578949e-06, | |
| "loss": 1.2972, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.004541864139020537, | |
| "grad_norm": 5.408173084259033, | |
| "learning_rate": 9.36842105263158e-06, | |
| "loss": 1.7215, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.004739336492890996, | |
| "grad_norm": 4.2717461585998535, | |
| "learning_rate": 9.315789473684212e-06, | |
| "loss": 1.137, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.0049368088467614535, | |
| "grad_norm": 4.3075785636901855, | |
| "learning_rate": 9.263157894736842e-06, | |
| "loss": 1.3048, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.005134281200631911, | |
| "grad_norm": 4.659534454345703, | |
| "learning_rate": 9.210526315789474e-06, | |
| "loss": 0.9883, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.00533175355450237, | |
| "grad_norm": 4.719169616699219, | |
| "learning_rate": 9.157894736842105e-06, | |
| "loss": 1.4926, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.005529225908372828, | |
| "grad_norm": 4.502306938171387, | |
| "learning_rate": 9.105263157894739e-06, | |
| "loss": 1.8264, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.005726698262243286, | |
| "grad_norm": 4.353489875793457, | |
| "learning_rate": 9.05263157894737e-06, | |
| "loss": 1.6681, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.005924170616113744, | |
| "grad_norm": 5.161799907684326, | |
| "learning_rate": 9e-06, | |
| "loss": 1.6566, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.006121642969984202, | |
| "grad_norm": 4.235696315765381, | |
| "learning_rate": 8.947368421052632e-06, | |
| "loss": 1.7276, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.00631911532385466, | |
| "grad_norm": 6.545216083526611, | |
| "learning_rate": 8.894736842105264e-06, | |
| "loss": 1.0481, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.006516587677725118, | |
| "grad_norm": 4.9834113121032715, | |
| "learning_rate": 8.842105263157895e-06, | |
| "loss": 1.4457, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.006714060031595577, | |
| "grad_norm": 4.448666572570801, | |
| "learning_rate": 8.789473684210527e-06, | |
| "loss": 1.7361, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.006911532385466035, | |
| "grad_norm": 4.735658168792725, | |
| "learning_rate": 8.736842105263158e-06, | |
| "loss": 1.3361, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.0071090047393364926, | |
| "grad_norm": 5.857210636138916, | |
| "learning_rate": 8.68421052631579e-06, | |
| "loss": 1.4395, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.007306477093206951, | |
| "grad_norm": 4.746231555938721, | |
| "learning_rate": 8.631578947368422e-06, | |
| "loss": 1.4588, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.007503949447077409, | |
| "grad_norm": 5.420529365539551, | |
| "learning_rate": 8.578947368421053e-06, | |
| "loss": 1.4352, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.007701421800947867, | |
| "grad_norm": 3.966956853866577, | |
| "learning_rate": 8.526315789473685e-06, | |
| "loss": 1.7395, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.007898894154818325, | |
| "grad_norm": 4.626506805419922, | |
| "learning_rate": 8.473684210526317e-06, | |
| "loss": 1.2152, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.008096366508688783, | |
| "grad_norm": 5.946537971496582, | |
| "learning_rate": 8.421052631578948e-06, | |
| "loss": 1.2569, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.008293838862559242, | |
| "grad_norm": 6.078729152679443, | |
| "learning_rate": 8.36842105263158e-06, | |
| "loss": 2.0072, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.0084913112164297, | |
| "grad_norm": 5.362630844116211, | |
| "learning_rate": 8.315789473684212e-06, | |
| "loss": 1.6189, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.008688783570300158, | |
| "grad_norm": 4.6319098472595215, | |
| "learning_rate": 8.263157894736843e-06, | |
| "loss": 1.1937, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.008886255924170616, | |
| "grad_norm": 5.145988464355469, | |
| "learning_rate": 8.210526315789475e-06, | |
| "loss": 1.48, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.009083728278041074, | |
| "grad_norm": 5.191286563873291, | |
| "learning_rate": 8.157894736842106e-06, | |
| "loss": 1.6115, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.009281200631911532, | |
| "grad_norm": 4.6075544357299805, | |
| "learning_rate": 8.105263157894736e-06, | |
| "loss": 1.3963, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.009478672985781991, | |
| "grad_norm": 4.724617958068848, | |
| "learning_rate": 8.052631578947368e-06, | |
| "loss": 1.7869, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.009676145339652449, | |
| "grad_norm": 4.976570129394531, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 1.5635, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.009873617693522907, | |
| "grad_norm": 5.121829032897949, | |
| "learning_rate": 7.947368421052633e-06, | |
| "loss": 1.246, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.010071090047393365, | |
| "grad_norm": 5.260928153991699, | |
| "learning_rate": 7.894736842105265e-06, | |
| "loss": 1.98, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.010268562401263823, | |
| "grad_norm": 5.072506904602051, | |
| "learning_rate": 7.842105263157895e-06, | |
| "loss": 1.0555, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.01046603475513428, | |
| "grad_norm": 4.6401872634887695, | |
| "learning_rate": 7.789473684210526e-06, | |
| "loss": 1.5456, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.01066350710900474, | |
| "grad_norm": 5.566153526306152, | |
| "learning_rate": 7.736842105263158e-06, | |
| "loss": 1.7965, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.010860979462875198, | |
| "grad_norm": 5.0998215675354, | |
| "learning_rate": 7.68421052631579e-06, | |
| "loss": 1.3262, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.011058451816745656, | |
| "grad_norm": 4.437518119812012, | |
| "learning_rate": 7.631578947368423e-06, | |
| "loss": 1.7174, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.011255924170616114, | |
| "grad_norm": 3.8838698863983154, | |
| "learning_rate": 7.578947368421054e-06, | |
| "loss": 1.185, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.011453396524486572, | |
| "grad_norm": 4.112951278686523, | |
| "learning_rate": 7.526315789473685e-06, | |
| "loss": 1.3785, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.01165086887835703, | |
| "grad_norm": 4.612501621246338, | |
| "learning_rate": 7.473684210526316e-06, | |
| "loss": 1.2861, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.011848341232227487, | |
| "grad_norm": 4.541945457458496, | |
| "learning_rate": 7.421052631578948e-06, | |
| "loss": 1.0096, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.012045813586097947, | |
| "grad_norm": 6.567755699157715, | |
| "learning_rate": 7.368421052631579e-06, | |
| "loss": 1.0163, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.012243285939968405, | |
| "grad_norm": 4.897511959075928, | |
| "learning_rate": 7.315789473684212e-06, | |
| "loss": 1.8322, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.012440758293838863, | |
| "grad_norm": 4.3046064376831055, | |
| "learning_rate": 7.263157894736843e-06, | |
| "loss": 1.0891, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.01263823064770932, | |
| "grad_norm": 4.533966064453125, | |
| "learning_rate": 7.210526315789474e-06, | |
| "loss": 1.3371, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.012835703001579778, | |
| "grad_norm": 4.470656394958496, | |
| "learning_rate": 7.157894736842106e-06, | |
| "loss": 2.3414, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.013033175355450236, | |
| "grad_norm": 4.738101959228516, | |
| "learning_rate": 7.1052631578947375e-06, | |
| "loss": 1.765, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.013230647709320696, | |
| "grad_norm": 3.870649814605713, | |
| "learning_rate": 7.052631578947369e-06, | |
| "loss": 1.6424, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.013428120063191154, | |
| "grad_norm": 7.043670177459717, | |
| "learning_rate": 7e-06, | |
| "loss": 1.5049, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.013625592417061612, | |
| "grad_norm": 4.199113368988037, | |
| "learning_rate": 6.947368421052632e-06, | |
| "loss": 1.5253, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.01382306477093207, | |
| "grad_norm": 4.738468170166016, | |
| "learning_rate": 6.894736842105264e-06, | |
| "loss": 1.4544, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.014020537124802527, | |
| "grad_norm": 5.083221912384033, | |
| "learning_rate": 6.842105263157896e-06, | |
| "loss": 1.4874, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.014218009478672985, | |
| "grad_norm": 4.9555253982543945, | |
| "learning_rate": 6.789473684210527e-06, | |
| "loss": 1.7049, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.014415481832543445, | |
| "grad_norm": 4.266180992126465, | |
| "learning_rate": 6.736842105263158e-06, | |
| "loss": 1.2864, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.014612954186413903, | |
| "grad_norm": 4.510780334472656, | |
| "learning_rate": 6.68421052631579e-06, | |
| "loss": 1.4615, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.01481042654028436, | |
| "grad_norm": 4.075244426727295, | |
| "learning_rate": 6.631578947368421e-06, | |
| "loss": 1.0669, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.015007898894154818, | |
| "grad_norm": 4.169254302978516, | |
| "learning_rate": 6.578947368421054e-06, | |
| "loss": 1.4458, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.015205371248025276, | |
| "grad_norm": 4.540365695953369, | |
| "learning_rate": 6.526315789473685e-06, | |
| "loss": 1.4148, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.015402843601895734, | |
| "grad_norm": 4.700695037841797, | |
| "learning_rate": 6.473684210526316e-06, | |
| "loss": 2.0347, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.015600315955766192, | |
| "grad_norm": 4.982248306274414, | |
| "learning_rate": 6.421052631578948e-06, | |
| "loss": 1.3731, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.01579778830963665, | |
| "grad_norm": 5.1645941734313965, | |
| "learning_rate": 6.3684210526315795e-06, | |
| "loss": 1.1041, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.015995260663507108, | |
| "grad_norm": 3.988223075866699, | |
| "learning_rate": 6.31578947368421e-06, | |
| "loss": 1.8261, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.016192733017377565, | |
| "grad_norm": 5.132425785064697, | |
| "learning_rate": 6.263157894736842e-06, | |
| "loss": 1.3754, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.016390205371248027, | |
| "grad_norm": 4.158195972442627, | |
| "learning_rate": 6.2105263157894745e-06, | |
| "loss": 1.233, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.016587677725118485, | |
| "grad_norm": 5.385928630828857, | |
| "learning_rate": 6.157894736842106e-06, | |
| "loss": 1.6838, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.016785150078988943, | |
| "grad_norm": 4.63645076751709, | |
| "learning_rate": 6.105263157894738e-06, | |
| "loss": 1.9314, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.0169826224328594, | |
| "grad_norm": 5.3244757652282715, | |
| "learning_rate": 6.0526315789473685e-06, | |
| "loss": 1.4223, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.017180094786729858, | |
| "grad_norm": 4.603305339813232, | |
| "learning_rate": 6e-06, | |
| "loss": 1.7243, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.017377567140600316, | |
| "grad_norm": 5.223212718963623, | |
| "learning_rate": 5.947368421052632e-06, | |
| "loss": 1.337, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.017575039494470774, | |
| "grad_norm": 4.344864845275879, | |
| "learning_rate": 5.8947368421052634e-06, | |
| "loss": 1.4767, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.017772511848341232, | |
| "grad_norm": 4.268956184387207, | |
| "learning_rate": 5.842105263157896e-06, | |
| "loss": 1.0763, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.01796998420221169, | |
| "grad_norm": 4.882099151611328, | |
| "learning_rate": 5.789473684210527e-06, | |
| "loss": 1.7006, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.018167456556082148, | |
| "grad_norm": 5.254153728485107, | |
| "learning_rate": 5.736842105263158e-06, | |
| "loss": 2.0318, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.018364928909952605, | |
| "grad_norm": 4.496589660644531, | |
| "learning_rate": 5.68421052631579e-06, | |
| "loss": 1.0075, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.018562401263823063, | |
| "grad_norm": 4.610377788543701, | |
| "learning_rate": 5.631578947368422e-06, | |
| "loss": 0.973, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.018759873617693525, | |
| "grad_norm": 4.102587699890137, | |
| "learning_rate": 5.578947368421052e-06, | |
| "loss": 1.4508, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.018957345971563982, | |
| "grad_norm": 4.820801258087158, | |
| "learning_rate": 5.526315789473685e-06, | |
| "loss": 2.234, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.01915481832543444, | |
| "grad_norm": 4.17614221572876, | |
| "learning_rate": 5.4736842105263165e-06, | |
| "loss": 0.9472, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.019352290679304898, | |
| "grad_norm": 4.681643962860107, | |
| "learning_rate": 5.421052631578948e-06, | |
| "loss": 0.952, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.019549763033175356, | |
| "grad_norm": 4.793570041656494, | |
| "learning_rate": 5.36842105263158e-06, | |
| "loss": 2.1524, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.019747235387045814, | |
| "grad_norm": 5.580649375915527, | |
| "learning_rate": 5.315789473684211e-06, | |
| "loss": 1.168, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.019944707740916272, | |
| "grad_norm": 4.42297887802124, | |
| "learning_rate": 5.263157894736842e-06, | |
| "loss": 1.0549, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.02014218009478673, | |
| "grad_norm": 4.290710926055908, | |
| "learning_rate": 5.210526315789474e-06, | |
| "loss": 1.4328, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.020339652448657188, | |
| "grad_norm": 4.0357255935668945, | |
| "learning_rate": 5.157894736842106e-06, | |
| "loss": 0.9894, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.020537124802527645, | |
| "grad_norm": 4.805973529815674, | |
| "learning_rate": 5.105263157894738e-06, | |
| "loss": 1.3093, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.020734597156398103, | |
| "grad_norm": 4.185112953186035, | |
| "learning_rate": 5.052631578947369e-06, | |
| "loss": 1.4606, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.02093206951026856, | |
| "grad_norm": 4.384559631347656, | |
| "learning_rate": 5e-06, | |
| "loss": 1.0051, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.02112954186413902, | |
| "grad_norm": 4.920189380645752, | |
| "learning_rate": 4.947368421052632e-06, | |
| "loss": 1.1721, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.02132701421800948, | |
| "grad_norm": 4.506773471832275, | |
| "learning_rate": 4.894736842105264e-06, | |
| "loss": 1.4764, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.021524486571879938, | |
| "grad_norm": 4.980959415435791, | |
| "learning_rate": 4.842105263157895e-06, | |
| "loss": 1.1304, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.021721958925750396, | |
| "grad_norm": 4.118868827819824, | |
| "learning_rate": 4.789473684210527e-06, | |
| "loss": 1.1892, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.021919431279620854, | |
| "grad_norm": 6.19287109375, | |
| "learning_rate": 4.736842105263158e-06, | |
| "loss": 1.6254, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.022116903633491312, | |
| "grad_norm": 5.811559200286865, | |
| "learning_rate": 4.68421052631579e-06, | |
| "loss": 1.8662, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.02231437598736177, | |
| "grad_norm": 4.6513352394104, | |
| "learning_rate": 4.631578947368421e-06, | |
| "loss": 1.3816, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.022511848341232227, | |
| "grad_norm": 5.178617477416992, | |
| "learning_rate": 4.578947368421053e-06, | |
| "loss": 1.2535, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.022709320695102685, | |
| "grad_norm": 3.830137014389038, | |
| "learning_rate": 4.526315789473685e-06, | |
| "loss": 1.5312, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.022906793048973143, | |
| "grad_norm": 4.620641231536865, | |
| "learning_rate": 4.473684210526316e-06, | |
| "loss": 1.5528, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.0231042654028436, | |
| "grad_norm": 5.0326738357543945, | |
| "learning_rate": 4.4210526315789476e-06, | |
| "loss": 1.3443, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.02330173775671406, | |
| "grad_norm": 4.62188720703125, | |
| "learning_rate": 4.368421052631579e-06, | |
| "loss": 1.0445, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.023499210110584517, | |
| "grad_norm": 3.9635679721832275, | |
| "learning_rate": 4.315789473684211e-06, | |
| "loss": 1.5812, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.023696682464454975, | |
| "grad_norm": 5.0226664543151855, | |
| "learning_rate": 4.2631578947368425e-06, | |
| "loss": 1.2235, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.023894154818325436, | |
| "grad_norm": 5.353757858276367, | |
| "learning_rate": 4.210526315789474e-06, | |
| "loss": 1.5831, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.024091627172195894, | |
| "grad_norm": 4.838202476501465, | |
| "learning_rate": 4.157894736842106e-06, | |
| "loss": 1.7689, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.02428909952606635, | |
| "grad_norm": 4.49991512298584, | |
| "learning_rate": 4.105263157894737e-06, | |
| "loss": 0.828, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.02448657187993681, | |
| "grad_norm": 4.480604648590088, | |
| "learning_rate": 4.052631578947368e-06, | |
| "loss": 1.6869, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.024684044233807267, | |
| "grad_norm": 4.725250244140625, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 0.7038, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.024881516587677725, | |
| "grad_norm": 4.806907653808594, | |
| "learning_rate": 3.947368421052632e-06, | |
| "loss": 1.6985, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.025078988941548183, | |
| "grad_norm": 4.734091758728027, | |
| "learning_rate": 3.894736842105263e-06, | |
| "loss": 1.5138, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.02527646129541864, | |
| "grad_norm": 5.3406243324279785, | |
| "learning_rate": 3.842105263157895e-06, | |
| "loss": 1.7152, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.0254739336492891, | |
| "grad_norm": 4.890450477600098, | |
| "learning_rate": 3.789473684210527e-06, | |
| "loss": 1.734, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.025671406003159557, | |
| "grad_norm": 5.458994388580322, | |
| "learning_rate": 3.736842105263158e-06, | |
| "loss": 1.5888, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.025868878357030015, | |
| "grad_norm": 4.655605316162109, | |
| "learning_rate": 3.6842105263157896e-06, | |
| "loss": 1.7478, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.026066350710900472, | |
| "grad_norm": 7.664575576782227, | |
| "learning_rate": 3.6315789473684217e-06, | |
| "loss": 1.5616, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.026263823064770934, | |
| "grad_norm": 5.207353115081787, | |
| "learning_rate": 3.578947368421053e-06, | |
| "loss": 1.8918, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.02646129541864139, | |
| "grad_norm": 3.969021797180176, | |
| "learning_rate": 3.5263157894736846e-06, | |
| "loss": 1.0807, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.02665876777251185, | |
| "grad_norm": 5.148044586181641, | |
| "learning_rate": 3.473684210526316e-06, | |
| "loss": 1.5419, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.026856240126382307, | |
| "grad_norm": 5.609622955322266, | |
| "learning_rate": 3.421052631578948e-06, | |
| "loss": 1.2194, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.027053712480252765, | |
| "grad_norm": 4.281411170959473, | |
| "learning_rate": 3.368421052631579e-06, | |
| "loss": 0.9111, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.027251184834123223, | |
| "grad_norm": 4.415678977966309, | |
| "learning_rate": 3.3157894736842107e-06, | |
| "loss": 1.999, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.02744865718799368, | |
| "grad_norm": 4.46459436416626, | |
| "learning_rate": 3.2631578947368423e-06, | |
| "loss": 1.749, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.02764612954186414, | |
| "grad_norm": 4.180970191955566, | |
| "learning_rate": 3.210526315789474e-06, | |
| "loss": 1.4696, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.027843601895734597, | |
| "grad_norm": 4.308414459228516, | |
| "learning_rate": 3.157894736842105e-06, | |
| "loss": 1.6192, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.028041074249605055, | |
| "grad_norm": 6.27396821975708, | |
| "learning_rate": 3.1052631578947372e-06, | |
| "loss": 0.9774, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.028238546603475512, | |
| "grad_norm": 5.713791847229004, | |
| "learning_rate": 3.052631578947369e-06, | |
| "loss": 1.2108, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.02843601895734597, | |
| "grad_norm": 3.94455885887146, | |
| "learning_rate": 3e-06, | |
| "loss": 0.928, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.028633491311216428, | |
| "grad_norm": 3.577357769012451, | |
| "learning_rate": 2.9473684210526317e-06, | |
| "loss": 1.2109, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.02883096366508689, | |
| "grad_norm": 4.142393112182617, | |
| "learning_rate": 2.8947368421052634e-06, | |
| "loss": 0.8309, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.029028436018957347, | |
| "grad_norm": 5.459789276123047, | |
| "learning_rate": 2.842105263157895e-06, | |
| "loss": 1.6238, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.029225908372827805, | |
| "grad_norm": 4.358528137207031, | |
| "learning_rate": 2.789473684210526e-06, | |
| "loss": 1.2229, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.029423380726698263, | |
| "grad_norm": 4.201858043670654, | |
| "learning_rate": 2.7368421052631583e-06, | |
| "loss": 0.8191, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.02962085308056872, | |
| "grad_norm": 5.12843656539917, | |
| "learning_rate": 2.68421052631579e-06, | |
| "loss": 2.0575, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.02981832543443918, | |
| "grad_norm": 3.9480583667755127, | |
| "learning_rate": 2.631578947368421e-06, | |
| "loss": 1.2752, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.030015797788309637, | |
| "grad_norm": 4.900203227996826, | |
| "learning_rate": 2.578947368421053e-06, | |
| "loss": 0.6156, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.030213270142180094, | |
| "grad_norm": 5.017102241516113, | |
| "learning_rate": 2.5263157894736844e-06, | |
| "loss": 1.4791, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.030410742496050552, | |
| "grad_norm": 4.326578140258789, | |
| "learning_rate": 2.473684210526316e-06, | |
| "loss": 1.1194, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.03060821484992101, | |
| "grad_norm": 4.385910511016846, | |
| "learning_rate": 2.4210526315789477e-06, | |
| "loss": 1.8029, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.030805687203791468, | |
| "grad_norm": 3.988187551498413, | |
| "learning_rate": 2.368421052631579e-06, | |
| "loss": 0.9693, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.031003159557661926, | |
| "grad_norm": 4.455026149749756, | |
| "learning_rate": 2.3157894736842105e-06, | |
| "loss": 1.3069, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.031200631911532384, | |
| "grad_norm": 5.547366142272949, | |
| "learning_rate": 2.2631578947368426e-06, | |
| "loss": 1.5911, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.03139810426540284, | |
| "grad_norm": 5.407074928283691, | |
| "learning_rate": 2.2105263157894738e-06, | |
| "loss": 1.4276, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.0315955766192733, | |
| "grad_norm": 4.332579135894775, | |
| "learning_rate": 2.1578947368421054e-06, | |
| "loss": 1.0162, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.03179304897314376, | |
| "grad_norm": 4.770085334777832, | |
| "learning_rate": 2.105263157894737e-06, | |
| "loss": 1.5811, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.031990521327014215, | |
| "grad_norm": 4.930882930755615, | |
| "learning_rate": 2.0526315789473687e-06, | |
| "loss": 1.1303, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.03218799368088467, | |
| "grad_norm": 5.203794479370117, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 1.135, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.03238546603475513, | |
| "grad_norm": 3.9070425033569336, | |
| "learning_rate": 1.9473684210526315e-06, | |
| "loss": 0.9977, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.032582938388625596, | |
| "grad_norm": 5.830733299255371, | |
| "learning_rate": 1.8947368421052634e-06, | |
| "loss": 1.8587, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.032780410742496054, | |
| "grad_norm": 4.05476713180542, | |
| "learning_rate": 1.8421052631578948e-06, | |
| "loss": 1.9495, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.03297788309636651, | |
| "grad_norm": 3.980226755142212, | |
| "learning_rate": 1.7894736842105265e-06, | |
| "loss": 1.6, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.03317535545023697, | |
| "grad_norm": 6.381178379058838, | |
| "learning_rate": 1.736842105263158e-06, | |
| "loss": 1.6905, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.03337282780410743, | |
| "grad_norm": 4.4184889793396, | |
| "learning_rate": 1.6842105263157895e-06, | |
| "loss": 1.4703, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.033570300157977885, | |
| "grad_norm": 4.5157470703125, | |
| "learning_rate": 1.6315789473684212e-06, | |
| "loss": 1.273, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.03376777251184834, | |
| "grad_norm": 4.454701900482178, | |
| "learning_rate": 1.5789473684210526e-06, | |
| "loss": 1.9122, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.0339652448657188, | |
| "grad_norm": 4.891290664672852, | |
| "learning_rate": 1.5263157894736844e-06, | |
| "loss": 1.514, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.03416271721958926, | |
| "grad_norm": 4.397899627685547, | |
| "learning_rate": 1.4736842105263159e-06, | |
| "loss": 1.7544, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.034360189573459717, | |
| "grad_norm": 5.422823429107666, | |
| "learning_rate": 1.4210526315789475e-06, | |
| "loss": 1.5294, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.034557661927330174, | |
| "grad_norm": 5.267470359802246, | |
| "learning_rate": 1.3684210526315791e-06, | |
| "loss": 1.2176, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.03475513428120063, | |
| "grad_norm": 4.583755016326904, | |
| "learning_rate": 1.3157894736842106e-06, | |
| "loss": 1.3583, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.03495260663507109, | |
| "grad_norm": 4.745589733123779, | |
| "learning_rate": 1.2631578947368422e-06, | |
| "loss": 1.6765, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.03515007898894155, | |
| "grad_norm": 4.703863620758057, | |
| "learning_rate": 1.2105263157894738e-06, | |
| "loss": 1.2263, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.035347551342812006, | |
| "grad_norm": 4.113995552062988, | |
| "learning_rate": 1.1578947368421053e-06, | |
| "loss": 1.8098, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.035545023696682464, | |
| "grad_norm": 5.088428020477295, | |
| "learning_rate": 1.1052631578947369e-06, | |
| "loss": 1.7887, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.03574249605055292, | |
| "grad_norm": 4.361863613128662, | |
| "learning_rate": 1.0526315789473685e-06, | |
| "loss": 1.8482, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.03593996840442338, | |
| "grad_norm": 3.7712790966033936, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": 1.0318, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.03613744075829384, | |
| "grad_norm": 4.38617467880249, | |
| "learning_rate": 9.473684210526317e-07, | |
| "loss": 1.1747, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.036334913112164295, | |
| "grad_norm": 4.757498741149902, | |
| "learning_rate": 8.947368421052632e-07, | |
| "loss": 1.591, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.03653238546603475, | |
| "grad_norm": 4.661401748657227, | |
| "learning_rate": 8.421052631578948e-07, | |
| "loss": 1.2443, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.03672985781990521, | |
| "grad_norm": 4.178214073181152, | |
| "learning_rate": 7.894736842105263e-07, | |
| "loss": 1.6486, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.03692733017377567, | |
| "grad_norm": 4.652418613433838, | |
| "learning_rate": 7.368421052631579e-07, | |
| "loss": 1.7046, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.03712480252764613, | |
| "grad_norm": 5.367217540740967, | |
| "learning_rate": 6.842105263157896e-07, | |
| "loss": 1.1643, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.037322274881516584, | |
| "grad_norm": 5.026525020599365, | |
| "learning_rate": 6.315789473684211e-07, | |
| "loss": 1.8984, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.03751974723538705, | |
| "grad_norm": 5.15156888961792, | |
| "learning_rate": 5.789473684210526e-07, | |
| "loss": 1.199, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.03771721958925751, | |
| "grad_norm": 4.1339111328125, | |
| "learning_rate": 5.263157894736843e-07, | |
| "loss": 1.2138, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.037914691943127965, | |
| "grad_norm": 4.904068946838379, | |
| "learning_rate": 4.7368421052631585e-07, | |
| "loss": 0.9992, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.03811216429699842, | |
| "grad_norm": 4.655853271484375, | |
| "learning_rate": 4.210526315789474e-07, | |
| "loss": 0.9699, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.03830963665086888, | |
| "grad_norm": 4.786022663116455, | |
| "learning_rate": 3.6842105263157896e-07, | |
| "loss": 1.1047, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.03850710900473934, | |
| "grad_norm": 4.2019362449646, | |
| "learning_rate": 3.1578947368421055e-07, | |
| "loss": 1.3494, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.038704581358609796, | |
| "grad_norm": 4.608132839202881, | |
| "learning_rate": 2.6315789473684213e-07, | |
| "loss": 1.5102, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.038902053712480254, | |
| "grad_norm": 3.955866813659668, | |
| "learning_rate": 2.105263157894737e-07, | |
| "loss": 0.9329, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.03909952606635071, | |
| "grad_norm": 3.948068857192993, | |
| "learning_rate": 1.5789473684210527e-07, | |
| "loss": 0.9115, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.03929699842022117, | |
| "grad_norm": 4.394677639007568, | |
| "learning_rate": 1.0526315789473685e-07, | |
| "loss": 0.842, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.03949447077409163, | |
| "grad_norm": 4.138340473175049, | |
| "learning_rate": 5.263157894736842e-08, | |
| "loss": 1.0818, | |
| "step": 200 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 200, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6305414963527680.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |