| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.10062893081761, | |
| "eval_steps": 250, | |
| "global_step": 700, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0015723270440251573, | |
| "grad_norm": 3.035508934265075, | |
| "learning_rate": 3.3333333333333334e-08, | |
| "loss": 1.6777, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0031446540880503146, | |
| "grad_norm": 3.0057695605079546, | |
| "learning_rate": 6.666666666666667e-08, | |
| "loss": 1.6331, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0047169811320754715, | |
| "grad_norm": 2.9814044526374595, | |
| "learning_rate": 1e-07, | |
| "loss": 1.4808, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.006289308176100629, | |
| "grad_norm": 3.0308381749187743, | |
| "learning_rate": 1.3333333333333334e-07, | |
| "loss": 1.6287, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.007861635220125786, | |
| "grad_norm": 2.9090567988102314, | |
| "learning_rate": 1.6666666666666665e-07, | |
| "loss": 1.6219, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.009433962264150943, | |
| "grad_norm": 3.0513140041110858, | |
| "learning_rate": 2e-07, | |
| "loss": 1.6202, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.0110062893081761, | |
| "grad_norm": 2.8644498623931534, | |
| "learning_rate": 2.3333333333333333e-07, | |
| "loss": 1.7804, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.012578616352201259, | |
| "grad_norm": 2.830057997617726, | |
| "learning_rate": 2.6666666666666667e-07, | |
| "loss": 1.757, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.014150943396226415, | |
| "grad_norm": 2.8388015134887135, | |
| "learning_rate": 3e-07, | |
| "loss": 1.6526, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.015723270440251572, | |
| "grad_norm": 2.880110978968438, | |
| "learning_rate": 3.333333333333333e-07, | |
| "loss": 1.5365, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.01729559748427673, | |
| "grad_norm": 2.8055557623152865, | |
| "learning_rate": 3.666666666666666e-07, | |
| "loss": 1.7431, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.018867924528301886, | |
| "grad_norm": 2.9896802933245774, | |
| "learning_rate": 4e-07, | |
| "loss": 1.4656, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.020440251572327043, | |
| "grad_norm": 2.8810736182794288, | |
| "learning_rate": 4.3333333333333335e-07, | |
| "loss": 1.6771, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.0220125786163522, | |
| "grad_norm": 2.947703957244773, | |
| "learning_rate": 4.6666666666666666e-07, | |
| "loss": 1.6045, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.02358490566037736, | |
| "grad_norm": 2.8455733324410803, | |
| "learning_rate": 5e-07, | |
| "loss": 1.6552, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.025157232704402517, | |
| "grad_norm": 2.778002396514386, | |
| "learning_rate": 5.333333333333333e-07, | |
| "loss": 1.7711, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.026729559748427674, | |
| "grad_norm": 2.9492084900651876, | |
| "learning_rate": 5.666666666666666e-07, | |
| "loss": 2.0138, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.02830188679245283, | |
| "grad_norm": 2.7955099115508673, | |
| "learning_rate": 6e-07, | |
| "loss": 1.5838, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.029874213836477988, | |
| "grad_norm": 2.6565096089163025, | |
| "learning_rate": 6.333333333333332e-07, | |
| "loss": 1.5531, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.031446540880503145, | |
| "grad_norm": 2.9560437361213654, | |
| "learning_rate": 6.666666666666666e-07, | |
| "loss": 1.6762, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0330188679245283, | |
| "grad_norm": 2.8608313422183844, | |
| "learning_rate": 7e-07, | |
| "loss": 1.6094, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.03459119496855346, | |
| "grad_norm": 2.798168174146352, | |
| "learning_rate": 7.333333333333332e-07, | |
| "loss": 1.4948, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.036163522012578615, | |
| "grad_norm": 2.951819566765442, | |
| "learning_rate": 7.666666666666667e-07, | |
| "loss": 1.7308, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.03773584905660377, | |
| "grad_norm": 2.7022575669246827, | |
| "learning_rate": 8e-07, | |
| "loss": 1.6431, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.03930817610062893, | |
| "grad_norm": 2.9734048507595046, | |
| "learning_rate": 8.333333333333333e-07, | |
| "loss": 1.4427, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.040880503144654086, | |
| "grad_norm": 2.8578492497460215, | |
| "learning_rate": 8.666666666666667e-07, | |
| "loss": 1.652, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.04245283018867924, | |
| "grad_norm": 2.688378160170607, | |
| "learning_rate": 9e-07, | |
| "loss": 1.548, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.0440251572327044, | |
| "grad_norm": 2.720907333149357, | |
| "learning_rate": 9.333333333333333e-07, | |
| "loss": 1.4457, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.04559748427672956, | |
| "grad_norm": 2.7657315566885403, | |
| "learning_rate": 9.666666666666666e-07, | |
| "loss": 1.5847, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.04716981132075472, | |
| "grad_norm": 2.883002593514775, | |
| "learning_rate": 1e-06, | |
| "loss": 1.7482, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.04874213836477988, | |
| "grad_norm": 2.794154766097177, | |
| "learning_rate": 9.99997377618298e-07, | |
| "loss": 1.8203, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.050314465408805034, | |
| "grad_norm": 3.088898639473824, | |
| "learning_rate": 9.999895105006994e-07, | |
| "loss": 1.7194, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.05188679245283019, | |
| "grad_norm": 2.6537048553933595, | |
| "learning_rate": 9.999763987297264e-07, | |
| "loss": 1.5857, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.05345911949685535, | |
| "grad_norm": 2.8821124249415058, | |
| "learning_rate": 9.999580424429159e-07, | |
| "loss": 1.8331, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.055031446540880505, | |
| "grad_norm": 2.7140815674763528, | |
| "learning_rate": 9.99934441832816e-07, | |
| "loss": 1.7067, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.05660377358490566, | |
| "grad_norm": 4.038361733121943, | |
| "learning_rate": 9.999055971469863e-07, | |
| "loss": 1.6599, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.05817610062893082, | |
| "grad_norm": 2.818848742384438, | |
| "learning_rate": 9.998715086879935e-07, | |
| "loss": 1.6759, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.059748427672955975, | |
| "grad_norm": 2.6841802401523087, | |
| "learning_rate": 9.9983217681341e-07, | |
| "loss": 1.4583, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.06132075471698113, | |
| "grad_norm": 2.9473060866002427, | |
| "learning_rate": 9.997876019358083e-07, | |
| "loss": 1.7774, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.06289308176100629, | |
| "grad_norm": 2.8471406767354868, | |
| "learning_rate": 9.997377845227574e-07, | |
| "loss": 1.721, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.06446540880503145, | |
| "grad_norm": 2.9451168374380705, | |
| "learning_rate": 9.996827250968189e-07, | |
| "loss": 1.6511, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.0660377358490566, | |
| "grad_norm": 2.6981678791938464, | |
| "learning_rate": 9.996224242355397e-07, | |
| "loss": 1.6338, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.06761006289308176, | |
| "grad_norm": 2.8294757698285555, | |
| "learning_rate": 9.995568825714478e-07, | |
| "loss": 1.6758, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.06918238993710692, | |
| "grad_norm": 2.8990349133049262, | |
| "learning_rate": 9.994861007920439e-07, | |
| "loss": 1.6104, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.07075471698113207, | |
| "grad_norm": 3.223974104333564, | |
| "learning_rate": 9.994100796397953e-07, | |
| "loss": 1.6934, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.07232704402515723, | |
| "grad_norm": 2.5987506786490826, | |
| "learning_rate": 9.993288199121282e-07, | |
| "loss": 1.8592, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.07389937106918239, | |
| "grad_norm": 2.900601376300913, | |
| "learning_rate": 9.992423224614183e-07, | |
| "loss": 1.6923, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.07547169811320754, | |
| "grad_norm": 2.960175869720827, | |
| "learning_rate": 9.991505881949836e-07, | |
| "loss": 1.6595, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.0770440251572327, | |
| "grad_norm": 2.7854484480315596, | |
| "learning_rate": 9.990536180750723e-07, | |
| "loss": 1.6249, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.07861635220125786, | |
| "grad_norm": 2.887167509757426, | |
| "learning_rate": 9.989514131188558e-07, | |
| "loss": 1.6919, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.08018867924528301, | |
| "grad_norm": 2.685247678792898, | |
| "learning_rate": 9.988439743984152e-07, | |
| "loss": 1.5878, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.08176100628930817, | |
| "grad_norm": 3.0741107440461795, | |
| "learning_rate": 9.987313030407323e-07, | |
| "loss": 1.8271, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.08333333333333333, | |
| "grad_norm": 2.966822892375055, | |
| "learning_rate": 9.986134002276759e-07, | |
| "loss": 1.8816, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.08490566037735849, | |
| "grad_norm": 2.9759914869433772, | |
| "learning_rate": 9.98490267195991e-07, | |
| "loss": 1.6398, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.08647798742138364, | |
| "grad_norm": 2.7657894370450373, | |
| "learning_rate": 9.983619052372847e-07, | |
| "loss": 1.5922, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.0880503144654088, | |
| "grad_norm": 2.9896944252013355, | |
| "learning_rate": 9.98228315698013e-07, | |
| "loss": 1.7046, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.08962264150943396, | |
| "grad_norm": 2.934360797135303, | |
| "learning_rate": 9.980894999794678e-07, | |
| "loss": 1.6305, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.09119496855345911, | |
| "grad_norm": 2.766331961112693, | |
| "learning_rate": 9.979454595377593e-07, | |
| "loss": 1.5623, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.09276729559748427, | |
| "grad_norm": 2.7288840403410095, | |
| "learning_rate": 9.97796195883804e-07, | |
| "loss": 1.7267, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.09433962264150944, | |
| "grad_norm": 2.8579967121162118, | |
| "learning_rate": 9.97641710583307e-07, | |
| "loss": 1.7857, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.0959119496855346, | |
| "grad_norm": 2.875962566949244, | |
| "learning_rate": 9.974820052567459e-07, | |
| "loss": 1.7046, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.09748427672955975, | |
| "grad_norm": 2.795328958545491, | |
| "learning_rate": 9.973170815793542e-07, | |
| "loss": 1.5145, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.09905660377358491, | |
| "grad_norm": 2.7813800259967802, | |
| "learning_rate": 9.971469412811032e-07, | |
| "loss": 1.4644, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.10062893081761007, | |
| "grad_norm": 2.884776745864663, | |
| "learning_rate": 9.969715861466839e-07, | |
| "loss": 1.6132, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.10220125786163523, | |
| "grad_norm": 2.9656746421376883, | |
| "learning_rate": 9.967910180154888e-07, | |
| "loss": 1.5777, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.10377358490566038, | |
| "grad_norm": 2.9036874607841523, | |
| "learning_rate": 9.96605238781592e-07, | |
| "loss": 1.5308, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.10534591194968554, | |
| "grad_norm": 2.7956792718016876, | |
| "learning_rate": 9.964142503937305e-07, | |
| "loss": 1.5853, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.1069182389937107, | |
| "grad_norm": 2.897862336708564, | |
| "learning_rate": 9.96218054855281e-07, | |
| "loss": 1.5228, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.10849056603773585, | |
| "grad_norm": 2.8838404866317164, | |
| "learning_rate": 9.960166542242428e-07, | |
| "loss": 1.5635, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.11006289308176101, | |
| "grad_norm": 2.7639581085690574, | |
| "learning_rate": 9.958100506132126e-07, | |
| "loss": 1.67, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.11163522012578617, | |
| "grad_norm": 2.9848350207143586, | |
| "learning_rate": 9.955982461893646e-07, | |
| "loss": 1.7932, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.11320754716981132, | |
| "grad_norm": 2.748422015742939, | |
| "learning_rate": 9.953812431744274e-07, | |
| "loss": 1.4805, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.11477987421383648, | |
| "grad_norm": 2.769235366918167, | |
| "learning_rate": 9.951590438446596e-07, | |
| "loss": 1.5452, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.11635220125786164, | |
| "grad_norm": 3.0140385674610073, | |
| "learning_rate": 9.94931650530827e-07, | |
| "loss": 1.6898, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.1179245283018868, | |
| "grad_norm": 2.887633044905872, | |
| "learning_rate": 9.946990656181779e-07, | |
| "loss": 1.7871, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.11949685534591195, | |
| "grad_norm": 2.9129477208043038, | |
| "learning_rate": 9.94461291546418e-07, | |
| "loss": 1.6111, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.12106918238993711, | |
| "grad_norm": 3.01831163886919, | |
| "learning_rate": 9.942183308096853e-07, | |
| "loss": 1.6132, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.12264150943396226, | |
| "grad_norm": 2.997495878078337, | |
| "learning_rate": 9.93970185956522e-07, | |
| "loss": 1.4617, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.12421383647798742, | |
| "grad_norm": 2.678939753603714, | |
| "learning_rate": 9.937168595898508e-07, | |
| "loss": 1.8479, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.12578616352201258, | |
| "grad_norm": 2.860552053547966, | |
| "learning_rate": 9.934583543669453e-07, | |
| "loss": 1.6458, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.12735849056603774, | |
| "grad_norm": 3.226580813213542, | |
| "learning_rate": 9.93194672999403e-07, | |
| "loss": 1.8994, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.1289308176100629, | |
| "grad_norm": 2.9327109905443445, | |
| "learning_rate": 9.929258182531166e-07, | |
| "loss": 1.5082, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.13050314465408805, | |
| "grad_norm": 2.745588457904038, | |
| "learning_rate": 9.926517929482452e-07, | |
| "loss": 1.4991, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.1320754716981132, | |
| "grad_norm": 2.7892084882752903, | |
| "learning_rate": 9.923725999591846e-07, | |
| "loss": 1.7913, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.13364779874213836, | |
| "grad_norm": 2.661813722362997, | |
| "learning_rate": 9.92088242214537e-07, | |
| "loss": 1.7167, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.13522012578616352, | |
| "grad_norm": 2.6050356717221548, | |
| "learning_rate": 9.91798722697081e-07, | |
| "loss": 1.5446, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.13679245283018868, | |
| "grad_norm": 2.907740454108008, | |
| "learning_rate": 9.915040444437388e-07, | |
| "loss": 1.6019, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.13836477987421383, | |
| "grad_norm": 2.9635527119113982, | |
| "learning_rate": 9.912042105455461e-07, | |
| "loss": 1.7168, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.139937106918239, | |
| "grad_norm": 2.9813810825397966, | |
| "learning_rate": 9.908992241476186e-07, | |
| "loss": 1.8573, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.14150943396226415, | |
| "grad_norm": 2.925962029784791, | |
| "learning_rate": 9.905890884491194e-07, | |
| "loss": 1.5332, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.1430817610062893, | |
| "grad_norm": 2.8712379530519474, | |
| "learning_rate": 9.902738067032253e-07, | |
| "loss": 1.757, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.14465408805031446, | |
| "grad_norm": 2.6784420543405205, | |
| "learning_rate": 9.899533822170921e-07, | |
| "loss": 1.5947, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.14622641509433962, | |
| "grad_norm": 2.6480973364337683, | |
| "learning_rate": 9.896278183518216e-07, | |
| "loss": 1.5718, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.14779874213836477, | |
| "grad_norm": 2.703204128264305, | |
| "learning_rate": 9.892971185224244e-07, | |
| "loss": 1.7105, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.14937106918238993, | |
| "grad_norm": 3.4041744211779057, | |
| "learning_rate": 9.889612861977853e-07, | |
| "loss": 1.6717, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.1509433962264151, | |
| "grad_norm": 3.067864390619053, | |
| "learning_rate": 9.886203249006264e-07, | |
| "loss": 1.6355, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.15251572327044025, | |
| "grad_norm": 2.8468635252020205, | |
| "learning_rate": 9.882742382074706e-07, | |
| "loss": 1.5463, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.1540880503144654, | |
| "grad_norm": 3.2598365988029827, | |
| "learning_rate": 9.879230297486034e-07, | |
| "loss": 1.5718, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.15566037735849056, | |
| "grad_norm": 2.7916727929412763, | |
| "learning_rate": 9.875667032080352e-07, | |
| "loss": 1.6524, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.15723270440251572, | |
| "grad_norm": 2.907285248234792, | |
| "learning_rate": 9.872052623234631e-07, | |
| "loss": 1.634, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.15880503144654087, | |
| "grad_norm": 2.7962618088809132, | |
| "learning_rate": 9.868387108862305e-07, | |
| "loss": 1.7726, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.16037735849056603, | |
| "grad_norm": 2.8049200840595616, | |
| "learning_rate": 9.86467052741289e-07, | |
| "loss": 1.7011, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.1619496855345912, | |
| "grad_norm": 3.0342654424151236, | |
| "learning_rate": 9.860902917871566e-07, | |
| "loss": 1.5985, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.16352201257861634, | |
| "grad_norm": 3.133992010155292, | |
| "learning_rate": 9.85708431975877e-07, | |
| "loss": 1.6142, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.1650943396226415, | |
| "grad_norm": 2.765316038304173, | |
| "learning_rate": 9.853214773129795e-07, | |
| "loss": 1.6856, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.16666666666666666, | |
| "grad_norm": 2.783202148615839, | |
| "learning_rate": 9.84929431857435e-07, | |
| "loss": 1.7386, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.16823899371069181, | |
| "grad_norm": 2.865985855515824, | |
| "learning_rate": 9.845322997216151e-07, | |
| "loss": 1.5695, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.16981132075471697, | |
| "grad_norm": 2.910313902240843, | |
| "learning_rate": 9.841300850712478e-07, | |
| "loss": 1.7493, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.17138364779874213, | |
| "grad_norm": 2.978282642772813, | |
| "learning_rate": 9.837227921253745e-07, | |
| "loss": 1.4316, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.17295597484276728, | |
| "grad_norm": 2.829785642110051, | |
| "learning_rate": 9.833104251563055e-07, | |
| "loss": 1.8296, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.17452830188679244, | |
| "grad_norm": 2.858578282056512, | |
| "learning_rate": 9.828929884895752e-07, | |
| "loss": 1.5925, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.1761006289308176, | |
| "grad_norm": 2.5984089623254985, | |
| "learning_rate": 9.824704865038967e-07, | |
| "loss": 1.7858, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.17767295597484276, | |
| "grad_norm": 2.8160266518499113, | |
| "learning_rate": 9.820429236311158e-07, | |
| "loss": 1.6244, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.1792452830188679, | |
| "grad_norm": 2.9596119800105627, | |
| "learning_rate": 9.816103043561648e-07, | |
| "loss": 1.7678, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.18081761006289307, | |
| "grad_norm": 2.8939811442478267, | |
| "learning_rate": 9.81172633217015e-07, | |
| "loss": 1.47, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.18238993710691823, | |
| "grad_norm": 3.045485057646403, | |
| "learning_rate": 9.8072991480463e-07, | |
| "loss": 1.6268, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.18396226415094338, | |
| "grad_norm": 2.857537343336608, | |
| "learning_rate": 9.80282153762916e-07, | |
| "loss": 1.8207, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.18553459119496854, | |
| "grad_norm": 3.0151265251520902, | |
| "learning_rate": 9.798293547886746e-07, | |
| "loss": 1.6861, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.1871069182389937, | |
| "grad_norm": 2.8731688993411058, | |
| "learning_rate": 9.793715226315528e-07, | |
| "loss": 1.7075, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.18867924528301888, | |
| "grad_norm": 3.036031941810449, | |
| "learning_rate": 9.789086620939935e-07, | |
| "loss": 1.7492, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.19025157232704404, | |
| "grad_norm": 2.525691377987968, | |
| "learning_rate": 9.784407780311845e-07, | |
| "loss": 1.624, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.1918238993710692, | |
| "grad_norm": 2.831970564433951, | |
| "learning_rate": 9.77967875351008e-07, | |
| "loss": 1.7192, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.19339622641509435, | |
| "grad_norm": 2.9942913777146702, | |
| "learning_rate": 9.774899590139897e-07, | |
| "loss": 1.6851, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.1949685534591195, | |
| "grad_norm": 2.8618269576046416, | |
| "learning_rate": 9.770070340332456e-07, | |
| "loss": 1.6114, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.19654088050314467, | |
| "grad_norm": 2.883119377274619, | |
| "learning_rate": 9.765191054744304e-07, | |
| "loss": 1.6136, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.19811320754716982, | |
| "grad_norm": 3.044922518807826, | |
| "learning_rate": 9.760261784556838e-07, | |
| "loss": 1.5851, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.19968553459119498, | |
| "grad_norm": 3.0028920393371963, | |
| "learning_rate": 9.755282581475767e-07, | |
| "loss": 1.4865, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.20125786163522014, | |
| "grad_norm": 2.8941881562896117, | |
| "learning_rate": 9.750253497730579e-07, | |
| "loss": 1.6744, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.2028301886792453, | |
| "grad_norm": 2.8528564542903574, | |
| "learning_rate": 9.745174586073982e-07, | |
| "loss": 1.7564, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.20440251572327045, | |
| "grad_norm": 2.906920298063749, | |
| "learning_rate": 9.740045899781352e-07, | |
| "loss": 1.6233, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.2059748427672956, | |
| "grad_norm": 2.9381321768165365, | |
| "learning_rate": 9.734867492650186e-07, | |
| "loss": 1.5676, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.20754716981132076, | |
| "grad_norm": 2.525924813073986, | |
| "learning_rate": 9.729639418999522e-07, | |
| "loss": 1.6379, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.20911949685534592, | |
| "grad_norm": 2.874791690228388, | |
| "learning_rate": 9.72436173366938e-07, | |
| "loss": 1.6909, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.21069182389937108, | |
| "grad_norm": 2.8783530324233575, | |
| "learning_rate": 9.71903449202018e-07, | |
| "loss": 1.6119, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.21226415094339623, | |
| "grad_norm": 3.01107861687442, | |
| "learning_rate": 9.713657749932171e-07, | |
| "loss": 1.5956, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.2138364779874214, | |
| "grad_norm": 2.6494766790190396, | |
| "learning_rate": 9.708231563804828e-07, | |
| "loss": 1.7058, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.21540880503144655, | |
| "grad_norm": 2.85527919179103, | |
| "learning_rate": 9.702755990556276e-07, | |
| "loss": 1.4486, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.2169811320754717, | |
| "grad_norm": 2.8898725200578457, | |
| "learning_rate": 9.697231087622689e-07, | |
| "loss": 1.6514, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.21855345911949686, | |
| "grad_norm": 2.7800528070460193, | |
| "learning_rate": 9.691656912957684e-07, | |
| "loss": 1.5338, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.22012578616352202, | |
| "grad_norm": 2.9948230844512986, | |
| "learning_rate": 9.686033525031719e-07, | |
| "loss": 1.6575, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.22169811320754718, | |
| "grad_norm": 3.0719024693904156, | |
| "learning_rate": 9.680360982831466e-07, | |
| "loss": 1.8096, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.22327044025157233, | |
| "grad_norm": 2.965190903432847, | |
| "learning_rate": 9.674639345859212e-07, | |
| "loss": 1.6506, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.2248427672955975, | |
| "grad_norm": 2.7336724355420974, | |
| "learning_rate": 9.668868674132222e-07, | |
| "loss": 1.5536, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.22641509433962265, | |
| "grad_norm": 2.936229195084225, | |
| "learning_rate": 9.663049028182111e-07, | |
| "loss": 1.6658, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.2279874213836478, | |
| "grad_norm": 2.9200650900522938, | |
| "learning_rate": 9.657180469054212e-07, | |
| "loss": 1.775, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.22955974842767296, | |
| "grad_norm": 2.9510267288582313, | |
| "learning_rate": 9.651263058306932e-07, | |
| "loss": 1.8345, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.23113207547169812, | |
| "grad_norm": 2.911711013545184, | |
| "learning_rate": 9.645296858011107e-07, | |
| "loss": 1.5686, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.23270440251572327, | |
| "grad_norm": 2.825920400262173, | |
| "learning_rate": 9.63928193074936e-07, | |
| "loss": 1.8736, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.23427672955974843, | |
| "grad_norm": 2.8916020775900777, | |
| "learning_rate": 9.633218339615432e-07, | |
| "loss": 1.6468, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.2358490566037736, | |
| "grad_norm": 2.7416340763711933, | |
| "learning_rate": 9.62710614821352e-07, | |
| "loss": 1.6178, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.23742138364779874, | |
| "grad_norm": 2.9834851184926645, | |
| "learning_rate": 9.620945420657623e-07, | |
| "loss": 1.6483, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.2389937106918239, | |
| "grad_norm": 2.7340688138076614, | |
| "learning_rate": 9.61473622157086e-07, | |
| "loss": 1.6169, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.24056603773584906, | |
| "grad_norm": 3.0147829611770507, | |
| "learning_rate": 9.608478616084782e-07, | |
| "loss": 1.5423, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.24213836477987422, | |
| "grad_norm": 2.7769273819581795, | |
| "learning_rate": 9.60217266983872e-07, | |
| "loss": 1.7204, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.24371069182389937, | |
| "grad_norm": 3.1113626251292983, | |
| "learning_rate": 9.59581844897906e-07, | |
| "loss": 1.5996, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.24528301886792453, | |
| "grad_norm": 3.096327063422857, | |
| "learning_rate": 9.589416020158577e-07, | |
| "loss": 1.6628, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.2468553459119497, | |
| "grad_norm": 2.7450104707143894, | |
| "learning_rate": 9.582965450535713e-07, | |
| "loss": 1.6721, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.24842767295597484, | |
| "grad_norm": 2.8568927176653847, | |
| "learning_rate": 9.576466807773898e-07, | |
| "loss": 1.7649, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 2.9012234468955453, | |
| "learning_rate": 9.569920160040814e-07, | |
| "loss": 1.6029, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.25157232704402516, | |
| "grad_norm": 2.9377323423033648, | |
| "learning_rate": 9.5633255760077e-07, | |
| "loss": 1.5913, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.2531446540880503, | |
| "grad_norm": 3.058065738854532, | |
| "learning_rate": 9.556683124848623e-07, | |
| "loss": 1.7773, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.25471698113207547, | |
| "grad_norm": 2.722307155009549, | |
| "learning_rate": 9.54999287623975e-07, | |
| "loss": 1.7939, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.2562893081761006, | |
| "grad_norm": 2.728987715004337, | |
| "learning_rate": 9.543254900358628e-07, | |
| "loss": 1.5814, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.2578616352201258, | |
| "grad_norm": 2.7633955658565417, | |
| "learning_rate": 9.536469267883431e-07, | |
| "loss": 1.7616, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.25943396226415094, | |
| "grad_norm": 2.930495074512941, | |
| "learning_rate": 9.529636049992233e-07, | |
| "loss": 1.5567, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.2610062893081761, | |
| "grad_norm": 2.9643728371296185, | |
| "learning_rate": 9.522755318362259e-07, | |
| "loss": 1.8551, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.26257861635220126, | |
| "grad_norm": 2.573314810237526, | |
| "learning_rate": 9.515827145169127e-07, | |
| "loss": 1.6787, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.2641509433962264, | |
| "grad_norm": 2.9470989186968333, | |
| "learning_rate": 9.508851603086092e-07, | |
| "loss": 1.7099, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.26572327044025157, | |
| "grad_norm": 2.8828467331827228, | |
| "learning_rate": 9.501828765283294e-07, | |
| "loss": 1.6403, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.2672955974842767, | |
| "grad_norm": 2.9869324639180967, | |
| "learning_rate": 9.494758705426976e-07, | |
| "loss": 1.6434, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.2688679245283019, | |
| "grad_norm": 2.8105412464843087, | |
| "learning_rate": 9.487641497678722e-07, | |
| "loss": 1.6361, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.27044025157232704, | |
| "grad_norm": 3.0942364695061486, | |
| "learning_rate": 9.480477216694673e-07, | |
| "loss": 1.6451, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.2720125786163522, | |
| "grad_norm": 3.0314242945023637, | |
| "learning_rate": 9.473265937624746e-07, | |
| "loss": 1.6221, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.27358490566037735, | |
| "grad_norm": 2.7961141658929964, | |
| "learning_rate": 9.466007736111845e-07, | |
| "loss": 1.5805, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.2751572327044025, | |
| "grad_norm": 3.0044488577576836, | |
| "learning_rate": 9.458702688291071e-07, | |
| "loss": 1.6451, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.27672955974842767, | |
| "grad_norm": 2.9175212019013017, | |
| "learning_rate": 9.45135087078892e-07, | |
| "loss": 1.6767, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.2783018867924528, | |
| "grad_norm": 2.852020241073256, | |
| "learning_rate": 9.443952360722476e-07, | |
| "loss": 1.527, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.279874213836478, | |
| "grad_norm": 3.1705793252501775, | |
| "learning_rate": 9.43650723569861e-07, | |
| "loss": 1.5246, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.28144654088050314, | |
| "grad_norm": 2.862701451741416, | |
| "learning_rate": 9.429015573813162e-07, | |
| "loss": 1.5528, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.2830188679245283, | |
| "grad_norm": 2.8470290608359825, | |
| "learning_rate": 9.421477453650117e-07, | |
| "loss": 1.6671, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.28459119496855345, | |
| "grad_norm": 2.9393982731492487, | |
| "learning_rate": 9.413892954280791e-07, | |
| "loss": 1.6337, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.2861635220125786, | |
| "grad_norm": 3.065777945428912, | |
| "learning_rate": 9.406262155262994e-07, | |
| "loss": 1.6864, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.28773584905660377, | |
| "grad_norm": 2.894668346030678, | |
| "learning_rate": 9.398585136640194e-07, | |
| "loss": 1.4675, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.2893081761006289, | |
| "grad_norm": 3.014856784865118, | |
| "learning_rate": 9.390861978940685e-07, | |
| "loss": 1.7052, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.2908805031446541, | |
| "grad_norm": 2.8120139974233638, | |
| "learning_rate": 9.383092763176738e-07, | |
| "loss": 1.4707, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.29245283018867924, | |
| "grad_norm": 2.6767355416672762, | |
| "learning_rate": 9.375277570843749e-07, | |
| "loss": 1.7074, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.2940251572327044, | |
| "grad_norm": 3.0632750130950113, | |
| "learning_rate": 9.367416483919387e-07, | |
| "loss": 1.7085, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.29559748427672955, | |
| "grad_norm": 2.698141986224555, | |
| "learning_rate": 9.359509584862735e-07, | |
| "loss": 1.6085, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.2971698113207547, | |
| "grad_norm": 2.8483611453583997, | |
| "learning_rate": 9.351556956613422e-07, | |
| "loss": 1.5957, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.29874213836477986, | |
| "grad_norm": 3.0115523041706482, | |
| "learning_rate": 9.343558682590755e-07, | |
| "loss": 1.4812, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.300314465408805, | |
| "grad_norm": 2.864790703828733, | |
| "learning_rate": 9.335514846692845e-07, | |
| "loss": 1.8253, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.3018867924528302, | |
| "grad_norm": 2.7185712819247723, | |
| "learning_rate": 9.327425533295723e-07, | |
| "loss": 1.7479, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.30345911949685533, | |
| "grad_norm": 2.849610670824873, | |
| "learning_rate": 9.319290827252459e-07, | |
| "loss": 1.4834, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.3050314465408805, | |
| "grad_norm": 2.8962850323516713, | |
| "learning_rate": 9.311110813892269e-07, | |
| "loss": 2.0937, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.30660377358490565, | |
| "grad_norm": 2.801049995905829, | |
| "learning_rate": 9.302885579019626e-07, | |
| "loss": 1.6817, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.3081761006289308, | |
| "grad_norm": 2.539543086891977, | |
| "learning_rate": 9.294615208913348e-07, | |
| "loss": 1.6195, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.30974842767295596, | |
| "grad_norm": 3.14726092255164, | |
| "learning_rate": 9.286299790325706e-07, | |
| "loss": 1.495, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.3113207547169811, | |
| "grad_norm": 2.838713337721914, | |
| "learning_rate": 9.277939410481505e-07, | |
| "loss": 1.7412, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.3128930817610063, | |
| "grad_norm": 2.870003649666304, | |
| "learning_rate": 9.269534157077176e-07, | |
| "loss": 1.5115, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.31446540880503143, | |
| "grad_norm": 2.8165531483143362, | |
| "learning_rate": 9.261084118279846e-07, | |
| "loss": 1.7478, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.3160377358490566, | |
| "grad_norm": 2.825444545833805, | |
| "learning_rate": 9.252589382726425e-07, | |
| "loss": 1.7086, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.31761006289308175, | |
| "grad_norm": 3.171370944860818, | |
| "learning_rate": 9.244050039522672e-07, | |
| "loss": 1.5309, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.3191823899371069, | |
| "grad_norm": 2.757671298779396, | |
| "learning_rate": 9.235466178242253e-07, | |
| "loss": 1.6026, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.32075471698113206, | |
| "grad_norm": 2.790965429905353, | |
| "learning_rate": 9.226837888925812e-07, | |
| "loss": 1.7138, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.3223270440251572, | |
| "grad_norm": 2.93045091970252, | |
| "learning_rate": 9.218165262080022e-07, | |
| "loss": 1.6591, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.3238993710691824, | |
| "grad_norm": 2.82026195028126, | |
| "learning_rate": 9.209448388676635e-07, | |
| "loss": 1.6508, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.32547169811320753, | |
| "grad_norm": 3.126262802082267, | |
| "learning_rate": 9.200687360151527e-07, | |
| "loss": 1.4957, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.3270440251572327, | |
| "grad_norm": 2.895265690199012, | |
| "learning_rate": 9.191882268403741e-07, | |
| "loss": 1.5272, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.32861635220125784, | |
| "grad_norm": 2.877287755389602, | |
| "learning_rate": 9.183033205794524e-07, | |
| "loss": 1.54, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.330188679245283, | |
| "grad_norm": 2.734817869548982, | |
| "learning_rate": 9.174140265146355e-07, | |
| "loss": 1.6412, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.33176100628930816, | |
| "grad_norm": 2.867112233501488, | |
| "learning_rate": 9.165203539741974e-07, | |
| "loss": 1.6154, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 2.8418280829564813, | |
| "learning_rate": 9.156223123323404e-07, | |
| "loss": 1.4353, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.33490566037735847, | |
| "grad_norm": 2.727537043457243, | |
| "learning_rate": 9.147199110090958e-07, | |
| "loss": 1.5871, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.33647798742138363, | |
| "grad_norm": 2.944426021386228, | |
| "learning_rate": 9.13813159470227e-07, | |
| "loss": 1.5932, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.3380503144654088, | |
| "grad_norm": 3.2505023109601447, | |
| "learning_rate": 9.129020672271281e-07, | |
| "loss": 1.6692, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.33962264150943394, | |
| "grad_norm": 2.961550017326567, | |
| "learning_rate": 9.119866438367262e-07, | |
| "loss": 1.732, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.3411949685534591, | |
| "grad_norm": 2.900334576192079, | |
| "learning_rate": 9.11066898901379e-07, | |
| "loss": 1.5478, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.34276729559748426, | |
| "grad_norm": 3.1405096823470173, | |
| "learning_rate": 9.101428420687757e-07, | |
| "loss": 1.7267, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.3443396226415094, | |
| "grad_norm": 3.0460348289304817, | |
| "learning_rate": 9.092144830318357e-07, | |
| "loss": 1.9256, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.34591194968553457, | |
| "grad_norm": 2.960266857426143, | |
| "learning_rate": 9.082818315286054e-07, | |
| "loss": 1.6707, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.3474842767295597, | |
| "grad_norm": 2.571519228148803, | |
| "learning_rate": 9.07344897342158e-07, | |
| "loss": 1.6558, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.3490566037735849, | |
| "grad_norm": 2.758050865261492, | |
| "learning_rate": 9.064036903004899e-07, | |
| "loss": 1.6994, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.35062893081761004, | |
| "grad_norm": 2.9660590146600354, | |
| "learning_rate": 9.054582202764174e-07, | |
| "loss": 1.696, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.3522012578616352, | |
| "grad_norm": 2.752028378445374, | |
| "learning_rate": 9.045084971874737e-07, | |
| "loss": 1.6747, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.35377358490566035, | |
| "grad_norm": 2.6744551200672793, | |
| "learning_rate": 9.035545309958046e-07, | |
| "loss": 1.5971, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.3553459119496855, | |
| "grad_norm": 3.016663128171545, | |
| "learning_rate": 9.02596331708064e-07, | |
| "loss": 1.4696, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.35691823899371067, | |
| "grad_norm": 2.901009716820562, | |
| "learning_rate": 9.016339093753092e-07, | |
| "loss": 1.8178, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.3584905660377358, | |
| "grad_norm": 3.1343446993717325, | |
| "learning_rate": 9.00667274092895e-07, | |
| "loss": 1.5964, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.360062893081761, | |
| "grad_norm": 3.1424828372852613, | |
| "learning_rate": 8.99696436000368e-07, | |
| "loss": 1.6684, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.36163522012578614, | |
| "grad_norm": 2.882452137937917, | |
| "learning_rate": 8.987214052813603e-07, | |
| "loss": 1.6412, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.3632075471698113, | |
| "grad_norm": 2.9373977120677246, | |
| "learning_rate": 8.977421921634831e-07, | |
| "loss": 1.5271, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.36477987421383645, | |
| "grad_norm": 3.0198708919947164, | |
| "learning_rate": 8.967588069182183e-07, | |
| "loss": 1.8023, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.3663522012578616, | |
| "grad_norm": 2.960141817420143, | |
| "learning_rate": 8.957712598608122e-07, | |
| "loss": 1.3358, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.36792452830188677, | |
| "grad_norm": 2.8903797021829103, | |
| "learning_rate": 8.947795613501656e-07, | |
| "loss": 1.5905, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.3694968553459119, | |
| "grad_norm": 2.77391341345864, | |
| "learning_rate": 8.937837217887272e-07, | |
| "loss": 1.5112, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.3710691823899371, | |
| "grad_norm": 2.842050023433647, | |
| "learning_rate": 8.927837516223823e-07, | |
| "loss": 1.4979, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.37264150943396224, | |
| "grad_norm": 3.000728789759411, | |
| "learning_rate": 8.91779661340345e-07, | |
| "loss": 1.8295, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.3742138364779874, | |
| "grad_norm": 2.9583627313435, | |
| "learning_rate": 8.907714614750472e-07, | |
| "loss": 1.43, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.3757861635220126, | |
| "grad_norm": 2.7918669538945253, | |
| "learning_rate": 8.897591626020284e-07, | |
| "loss": 1.6926, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.37735849056603776, | |
| "grad_norm": 2.7781083647108717, | |
| "learning_rate": 8.887427753398247e-07, | |
| "loss": 1.5972, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.3789308176100629, | |
| "grad_norm": 3.0160818815953254, | |
| "learning_rate": 8.877223103498575e-07, | |
| "loss": 1.6652, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.3805031446540881, | |
| "grad_norm": 3.013147280783508, | |
| "learning_rate": 8.866977783363218e-07, | |
| "loss": 1.8097, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.38207547169811323, | |
| "grad_norm": 2.9939209838493515, | |
| "learning_rate": 8.856691900460738e-07, | |
| "loss": 1.6367, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.3836477987421384, | |
| "grad_norm": 2.733039232088572, | |
| "learning_rate": 8.846365562685176e-07, | |
| "loss": 1.7351, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.38522012578616355, | |
| "grad_norm": 3.001144973341077, | |
| "learning_rate": 8.83599887835493e-07, | |
| "loss": 1.572, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.3867924528301887, | |
| "grad_norm": 2.9948930326234597, | |
| "learning_rate": 8.825591956211614e-07, | |
| "loss": 1.6102, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.38836477987421386, | |
| "grad_norm": 2.8653519222859063, | |
| "learning_rate": 8.815144905418916e-07, | |
| "loss": 1.6772, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.389937106918239, | |
| "grad_norm": 3.1826396332295706, | |
| "learning_rate": 8.804657835561456e-07, | |
| "loss": 1.682, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.3915094339622642, | |
| "grad_norm": 2.8216807328948486, | |
| "learning_rate": 8.794130856643633e-07, | |
| "loss": 1.5604, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.39308176100628933, | |
| "grad_norm": 2.9678136425542467, | |
| "learning_rate": 8.783564079088476e-07, | |
| "loss": 1.6586, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.39308176100628933, | |
| "eval_sat2_MCTS_chains_SFT_val_loss": 1.6584604978561401, | |
| "eval_sat2_MCTS_chains_SFT_val_runtime": 103.8596, | |
| "eval_sat2_MCTS_chains_SFT_val_samples_per_second": 9.898, | |
| "eval_sat2_MCTS_chains_SFT_val_steps_per_second": 1.242, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.3946540880503145, | |
| "grad_norm": 2.9586113804706953, | |
| "learning_rate": 8.772957613736482e-07, | |
| "loss": 1.6791, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.39622641509433965, | |
| "grad_norm": 2.6413415294984914, | |
| "learning_rate": 8.76231157184445e-07, | |
| "loss": 1.4647, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.3977987421383648, | |
| "grad_norm": 2.9283074873964288, | |
| "learning_rate": 8.751626065084328e-07, | |
| "loss": 1.4933, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.39937106918238996, | |
| "grad_norm": 3.0602312017167628, | |
| "learning_rate": 8.74090120554202e-07, | |
| "loss": 1.4125, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.4009433962264151, | |
| "grad_norm": 2.7986235291458037, | |
| "learning_rate": 8.73013710571623e-07, | |
| "loss": 1.5496, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.4025157232704403, | |
| "grad_norm": 2.948976102323842, | |
| "learning_rate": 8.719333878517273e-07, | |
| "loss": 1.4968, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.40408805031446543, | |
| "grad_norm": 2.760683793395907, | |
| "learning_rate": 8.708491637265887e-07, | |
| "loss": 1.6747, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.4056603773584906, | |
| "grad_norm": 3.1671580545141675, | |
| "learning_rate": 8.697610495692054e-07, | |
| "loss": 1.658, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.40723270440251574, | |
| "grad_norm": 2.674259937449716, | |
| "learning_rate": 8.686690567933801e-07, | |
| "loss": 1.6129, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.4088050314465409, | |
| "grad_norm": 3.1018574143157527, | |
| "learning_rate": 8.675731968536002e-07, | |
| "loss": 1.3611, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.41037735849056606, | |
| "grad_norm": 3.0423161539909356, | |
| "learning_rate": 8.664734812449179e-07, | |
| "loss": 1.8075, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.4119496855345912, | |
| "grad_norm": 2.9517263449475606, | |
| "learning_rate": 8.653699215028296e-07, | |
| "loss": 1.5475, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.41352201257861637, | |
| "grad_norm": 3.0424454760034725, | |
| "learning_rate": 8.642625292031549e-07, | |
| "loss": 1.7245, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.41509433962264153, | |
| "grad_norm": 2.9120793531370723, | |
| "learning_rate": 8.631513159619149e-07, | |
| "loss": 1.4242, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.4166666666666667, | |
| "grad_norm": 3.225432662896855, | |
| "learning_rate": 8.620362934352108e-07, | |
| "loss": 1.8377, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.41823899371069184, | |
| "grad_norm": 3.0087973641995616, | |
| "learning_rate": 8.60917473319101e-07, | |
| "loss": 1.4507, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.419811320754717, | |
| "grad_norm": 3.7250085221059495, | |
| "learning_rate": 8.597948673494794e-07, | |
| "loss": 1.5964, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.42138364779874216, | |
| "grad_norm": 3.0757842367996306, | |
| "learning_rate": 8.586684873019512e-07, | |
| "loss": 1.574, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.4229559748427673, | |
| "grad_norm": 3.041887462503267, | |
| "learning_rate": 8.575383449917102e-07, | |
| "loss": 1.8146, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.42452830188679247, | |
| "grad_norm": 4.268588227080502, | |
| "learning_rate": 8.564044522734146e-07, | |
| "loss": 1.464, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.4261006289308176, | |
| "grad_norm": 3.029271230795498, | |
| "learning_rate": 8.552668210410623e-07, | |
| "loss": 1.598, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.4276729559748428, | |
| "grad_norm": 2.973518798384465, | |
| "learning_rate": 8.541254632278665e-07, | |
| "loss": 1.5129, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.42924528301886794, | |
| "grad_norm": 3.3044387064367, | |
| "learning_rate": 8.529803908061308e-07, | |
| "loss": 1.4694, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.4308176100628931, | |
| "grad_norm": 2.9571919167741614, | |
| "learning_rate": 8.51831615787123e-07, | |
| "loss": 1.6167, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.43238993710691825, | |
| "grad_norm": 2.7276928178815036, | |
| "learning_rate": 8.506791502209496e-07, | |
| "loss": 1.6561, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.4339622641509434, | |
| "grad_norm": 2.8324926682096043, | |
| "learning_rate": 8.495230061964287e-07, | |
| "loss": 1.6792, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.43553459119496857, | |
| "grad_norm": 3.0015707635143327, | |
| "learning_rate": 8.483631958409643e-07, | |
| "loss": 1.5538, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.4371069182389937, | |
| "grad_norm": 3.3059643260815914, | |
| "learning_rate": 8.471997313204182e-07, | |
| "loss": 1.6007, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.4386792452830189, | |
| "grad_norm": 3.014759127631608, | |
| "learning_rate": 8.460326248389824e-07, | |
| "loss": 1.6257, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.44025157232704404, | |
| "grad_norm": 2.7778802261201343, | |
| "learning_rate": 8.448618886390521e-07, | |
| "loss": 1.6052, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.4418238993710692, | |
| "grad_norm": 3.0502038930705626, | |
| "learning_rate": 8.436875350010957e-07, | |
| "loss": 1.7498, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.44339622641509435, | |
| "grad_norm": 3.827056849787235, | |
| "learning_rate": 8.425095762435273e-07, | |
| "loss": 1.5904, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.4449685534591195, | |
| "grad_norm": 2.9342766381705747, | |
| "learning_rate": 8.413280247225768e-07, | |
| "loss": 1.5282, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.44654088050314467, | |
| "grad_norm": 4.64307318858484, | |
| "learning_rate": 8.401428928321607e-07, | |
| "loss": 1.774, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.4481132075471698, | |
| "grad_norm": 3.0422727524827144, | |
| "learning_rate": 8.389541930037516e-07, | |
| "loss": 1.5489, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.449685534591195, | |
| "grad_norm": 2.989589728066482, | |
| "learning_rate": 8.377619377062482e-07, | |
| "loss": 1.621, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.45125786163522014, | |
| "grad_norm": 3.174962884017163, | |
| "learning_rate": 8.365661394458445e-07, | |
| "loss": 1.5202, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.4528301886792453, | |
| "grad_norm": 2.809772090337394, | |
| "learning_rate": 8.353668107658983e-07, | |
| "loss": 1.9324, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.45440251572327045, | |
| "grad_norm": 3.0560314015959897, | |
| "learning_rate": 8.341639642468001e-07, | |
| "loss": 1.5088, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.4559748427672956, | |
| "grad_norm": 2.959872733329314, | |
| "learning_rate": 8.329576125058405e-07, | |
| "loss": 1.6243, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.45754716981132076, | |
| "grad_norm": 2.9842620788957737, | |
| "learning_rate": 8.317477681970786e-07, | |
| "loss": 1.6596, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.4591194968553459, | |
| "grad_norm": 3.217709997180059, | |
| "learning_rate": 8.305344440112087e-07, | |
| "loss": 1.5782, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.4606918238993711, | |
| "grad_norm": 2.8625881670603746, | |
| "learning_rate": 8.293176526754273e-07, | |
| "loss": 1.732, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.46226415094339623, | |
| "grad_norm": 3.8491482316260655, | |
| "learning_rate": 8.280974069532998e-07, | |
| "loss": 1.6535, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.4638364779874214, | |
| "grad_norm": 3.1350366958145495, | |
| "learning_rate": 8.268737196446263e-07, | |
| "loss": 1.6854, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.46540880503144655, | |
| "grad_norm": 2.9549762795233536, | |
| "learning_rate": 8.256466035853075e-07, | |
| "loss": 1.5536, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.4669811320754717, | |
| "grad_norm": 3.036679274641372, | |
| "learning_rate": 8.244160716472108e-07, | |
| "loss": 1.5475, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.46855345911949686, | |
| "grad_norm": 2.8915060348851873, | |
| "learning_rate": 8.231821367380334e-07, | |
| "loss": 1.6829, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.470125786163522, | |
| "grad_norm": 2.7293513305001116, | |
| "learning_rate": 8.219448118011687e-07, | |
| "loss": 1.6559, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.4716981132075472, | |
| "grad_norm": 2.9948451050860854, | |
| "learning_rate": 8.207041098155699e-07, | |
| "loss": 1.5436, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.47327044025157233, | |
| "grad_norm": 3.2031506579963946, | |
| "learning_rate": 8.194600437956139e-07, | |
| "loss": 1.6503, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.4748427672955975, | |
| "grad_norm": 2.807136246279641, | |
| "learning_rate": 8.18212626790964e-07, | |
| "loss": 2.0248, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.47641509433962265, | |
| "grad_norm": 2.9001657979865194, | |
| "learning_rate": 8.16961871886435e-07, | |
| "loss": 1.6034, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.4779874213836478, | |
| "grad_norm": 3.0792715017671446, | |
| "learning_rate": 8.157077922018536e-07, | |
| "loss": 1.7428, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.47955974842767296, | |
| "grad_norm": 2.9975961967554663, | |
| "learning_rate": 8.144504008919222e-07, | |
| "loss": 1.689, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.4811320754716981, | |
| "grad_norm": 3.095878589609334, | |
| "learning_rate": 8.131897111460809e-07, | |
| "loss": 1.7788, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.4827044025157233, | |
| "grad_norm": 3.1315862097170384, | |
| "learning_rate": 8.119257361883686e-07, | |
| "loss": 1.6655, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.48427672955974843, | |
| "grad_norm": 3.0428927144491484, | |
| "learning_rate": 8.106584892772843e-07, | |
| "loss": 1.6418, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.4858490566037736, | |
| "grad_norm": 3.8550756798387593, | |
| "learning_rate": 8.093879837056485e-07, | |
| "loss": 1.5158, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.48742138364779874, | |
| "grad_norm": 2.8698562798909197, | |
| "learning_rate": 8.081142328004636e-07, | |
| "loss": 1.7003, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.4889937106918239, | |
| "grad_norm": 2.8474789377828755, | |
| "learning_rate": 8.068372499227736e-07, | |
| "loss": 1.5878, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.49056603773584906, | |
| "grad_norm": 2.944852309492029, | |
| "learning_rate": 8.05557048467525e-07, | |
| "loss": 1.5342, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.4921383647798742, | |
| "grad_norm": 2.6824854754227134, | |
| "learning_rate": 8.04273641863425e-07, | |
| "loss": 1.5598, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.4937106918238994, | |
| "grad_norm": 3.1516894861498486, | |
| "learning_rate": 8.029870435728017e-07, | |
| "loss": 1.5505, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.49528301886792453, | |
| "grad_norm": 2.801213556135511, | |
| "learning_rate": 8.016972670914623e-07, | |
| "loss": 1.7167, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.4968553459119497, | |
| "grad_norm": 2.890665548023088, | |
| "learning_rate": 8.004043259485518e-07, | |
| "loss": 1.8476, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.49842767295597484, | |
| "grad_norm": 3.0315629963242556, | |
| "learning_rate": 7.991082337064109e-07, | |
| "loss": 1.6837, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 2.829540637779091, | |
| "learning_rate": 7.978090039604341e-07, | |
| "loss": 1.59, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.5015723270440252, | |
| "grad_norm": 2.8207683491833797, | |
| "learning_rate": 7.965066503389264e-07, | |
| "loss": 1.5492, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.5031446540880503, | |
| "grad_norm": 3.157170747378039, | |
| "learning_rate": 7.952011865029613e-07, | |
| "loss": 1.6466, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.5047169811320755, | |
| "grad_norm": 2.9337171526662345, | |
| "learning_rate": 7.938926261462365e-07, | |
| "loss": 1.5796, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.5062893081761006, | |
| "grad_norm": 2.9017291227111137, | |
| "learning_rate": 7.925809829949311e-07, | |
| "loss": 1.5726, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.5078616352201258, | |
| "grad_norm": 3.0869804482037653, | |
| "learning_rate": 7.91266270807561e-07, | |
| "loss": 1.5899, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.5094339622641509, | |
| "grad_norm": 2.7999552261033402, | |
| "learning_rate": 7.89948503374835e-07, | |
| "loss": 1.4062, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.5110062893081762, | |
| "grad_norm": 3.285221317869385, | |
| "learning_rate": 7.886276945195097e-07, | |
| "loss": 1.5396, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.5125786163522013, | |
| "grad_norm": 3.0268929248373984, | |
| "learning_rate": 7.873038580962453e-07, | |
| "loss": 1.5924, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.5141509433962265, | |
| "grad_norm": 2.913072841616009, | |
| "learning_rate": 7.859770079914592e-07, | |
| "loss": 2.1225, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.5157232704402516, | |
| "grad_norm": 2.8622294907562975, | |
| "learning_rate": 7.846471581231813e-07, | |
| "loss": 1.9179, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.5172955974842768, | |
| "grad_norm": 3.3746719664000255, | |
| "learning_rate": 7.833143224409075e-07, | |
| "loss": 1.5467, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.5188679245283019, | |
| "grad_norm": 2.755221725795704, | |
| "learning_rate": 7.819785149254532e-07, | |
| "loss": 1.777, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.5204402515723271, | |
| "grad_norm": 2.9788828173349278, | |
| "learning_rate": 7.806397495888073e-07, | |
| "loss": 1.7245, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.5220125786163522, | |
| "grad_norm": 3.123230924072451, | |
| "learning_rate": 7.792980404739847e-07, | |
| "loss": 1.4695, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.5235849056603774, | |
| "grad_norm": 2.951880279336297, | |
| "learning_rate": 7.77953401654879e-07, | |
| "loss": 1.6968, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.5251572327044025, | |
| "grad_norm": 3.0894908078161487, | |
| "learning_rate": 7.766058472361153e-07, | |
| "loss": 1.546, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.5267295597484277, | |
| "grad_norm": 3.0707624349548337, | |
| "learning_rate": 7.752553913529018e-07, | |
| "loss": 1.5085, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.5283018867924528, | |
| "grad_norm": 2.9477557662065506, | |
| "learning_rate": 7.739020481708814e-07, | |
| "loss": 1.6163, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.529874213836478, | |
| "grad_norm": 2.914337832721276, | |
| "learning_rate": 7.725458318859841e-07, | |
| "loss": 1.7411, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.5314465408805031, | |
| "grad_norm": 2.7398214376825534, | |
| "learning_rate": 7.711867567242766e-07, | |
| "loss": 1.7266, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.5330188679245284, | |
| "grad_norm": 2.9634022885907685, | |
| "learning_rate": 7.698248369418146e-07, | |
| "loss": 1.6571, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.5345911949685535, | |
| "grad_norm": 2.896055421421961, | |
| "learning_rate": 7.684600868244919e-07, | |
| "loss": 1.5207, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.5361635220125787, | |
| "grad_norm": 3.069581678589251, | |
| "learning_rate": 7.670925206878916e-07, | |
| "loss": 2.0008, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.5377358490566038, | |
| "grad_norm": 2.822860925833609, | |
| "learning_rate": 7.657221528771351e-07, | |
| "loss": 1.5865, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.539308176100629, | |
| "grad_norm": 2.9532018181594655, | |
| "learning_rate": 7.643489977667325e-07, | |
| "loss": 1.652, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.5408805031446541, | |
| "grad_norm": 3.1536405676814563, | |
| "learning_rate": 7.629730697604313e-07, | |
| "loss": 1.6205, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.5424528301886793, | |
| "grad_norm": 2.891482043321394, | |
| "learning_rate": 7.61594383291065e-07, | |
| "loss": 1.5776, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.5440251572327044, | |
| "grad_norm": 3.003689771378288, | |
| "learning_rate": 7.602129528204022e-07, | |
| "loss": 1.5402, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.5455974842767296, | |
| "grad_norm": 3.067089408704145, | |
| "learning_rate": 7.588287928389951e-07, | |
| "loss": 1.6742, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.5471698113207547, | |
| "grad_norm": 2.7753239765920488, | |
| "learning_rate": 7.574419178660268e-07, | |
| "loss": 1.5732, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.5487421383647799, | |
| "grad_norm": 2.856804882953154, | |
| "learning_rate": 7.560523424491594e-07, | |
| "loss": 1.6788, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.550314465408805, | |
| "grad_norm": 2.995113850576118, | |
| "learning_rate": 7.546600811643816e-07, | |
| "loss": 1.6173, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.5518867924528302, | |
| "grad_norm": 2.834168921469406, | |
| "learning_rate": 7.532651486158554e-07, | |
| "loss": 1.5904, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.5534591194968553, | |
| "grad_norm": 2.899270630246711, | |
| "learning_rate": 7.518675594357632e-07, | |
| "loss": 1.6489, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.5550314465408805, | |
| "grad_norm": 3.5659878099391795, | |
| "learning_rate": 7.504673282841543e-07, | |
| "loss": 1.7493, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.5566037735849056, | |
| "grad_norm": 3.01362773663695, | |
| "learning_rate": 7.490644698487908e-07, | |
| "loss": 1.6529, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.5581761006289309, | |
| "grad_norm": 2.829582474483786, | |
| "learning_rate": 7.476589988449938e-07, | |
| "loss": 1.739, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.559748427672956, | |
| "grad_norm": 2.8637416112700063, | |
| "learning_rate": 7.462509300154891e-07, | |
| "loss": 1.6167, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.5613207547169812, | |
| "grad_norm": 3.320226806079709, | |
| "learning_rate": 7.448402781302525e-07, | |
| "loss": 1.7115, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.5628930817610063, | |
| "grad_norm": 3.2848631934338703, | |
| "learning_rate": 7.434270579863548e-07, | |
| "loss": 1.7411, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.5644654088050315, | |
| "grad_norm": 2.8557557567154532, | |
| "learning_rate": 7.420112844078065e-07, | |
| "loss": 1.6507, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.5660377358490566, | |
| "grad_norm": 2.813687805019359, | |
| "learning_rate": 7.405929722454025e-07, | |
| "loss": 1.8697, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.5676100628930818, | |
| "grad_norm": 3.1367104120043017, | |
| "learning_rate": 7.391721363765663e-07, | |
| "loss": 1.546, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.5691823899371069, | |
| "grad_norm": 2.916992253603825, | |
| "learning_rate": 7.377487917051938e-07, | |
| "loss": 1.6718, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.5707547169811321, | |
| "grad_norm": 2.8435389119301773, | |
| "learning_rate": 7.363229531614972e-07, | |
| "loss": 1.7208, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.5723270440251572, | |
| "grad_norm": 3.128997657710974, | |
| "learning_rate": 7.348946357018479e-07, | |
| "loss": 1.5151, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.5738993710691824, | |
| "grad_norm": 3.140464711506973, | |
| "learning_rate": 7.334638543086203e-07, | |
| "loss": 1.6267, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.5754716981132075, | |
| "grad_norm": 2.8634668901359377, | |
| "learning_rate": 7.320306239900342e-07, | |
| "loss": 1.6124, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.5770440251572327, | |
| "grad_norm": 3.1226311552428294, | |
| "learning_rate": 7.305949597799976e-07, | |
| "loss": 1.5731, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.5786163522012578, | |
| "grad_norm": 2.9737828234739014, | |
| "learning_rate": 7.291568767379483e-07, | |
| "loss": 1.7076, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.5801886792452831, | |
| "grad_norm": 3.5758350484358017, | |
| "learning_rate": 7.277163899486974e-07, | |
| "loss": 1.6001, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.5817610062893082, | |
| "grad_norm": 3.179740779225836, | |
| "learning_rate": 7.262735145222695e-07, | |
| "loss": 1.5627, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.5833333333333334, | |
| "grad_norm": 2.8903320572773334, | |
| "learning_rate": 7.24828265593745e-07, | |
| "loss": 1.6912, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.5849056603773585, | |
| "grad_norm": 2.8931419439995523, | |
| "learning_rate": 7.233806583231011e-07, | |
| "loss": 1.6214, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.5864779874213837, | |
| "grad_norm": 2.919579831018194, | |
| "learning_rate": 7.219307078950535e-07, | |
| "loss": 1.5862, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.5880503144654088, | |
| "grad_norm": 3.0045761782474703, | |
| "learning_rate": 7.204784295188958e-07, | |
| "loss": 1.8617, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.589622641509434, | |
| "grad_norm": 3.1244686625095484, | |
| "learning_rate": 7.190238384283412e-07, | |
| "loss": 1.9887, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.5911949685534591, | |
| "grad_norm": 3.2453253002987106, | |
| "learning_rate": 7.175669498813616e-07, | |
| "loss": 1.592, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.5927672955974843, | |
| "grad_norm": 3.0877719472322367, | |
| "learning_rate": 7.161077791600287e-07, | |
| "loss": 1.4491, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.5943396226415094, | |
| "grad_norm": 3.141247199862426, | |
| "learning_rate": 7.14646341570353e-07, | |
| "loss": 1.6298, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.5959119496855346, | |
| "grad_norm": 3.0631856404614117, | |
| "learning_rate": 7.131826524421229e-07, | |
| "loss": 1.6426, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.5974842767295597, | |
| "grad_norm": 3.270854655641661, | |
| "learning_rate": 7.117167271287452e-07, | |
| "loss": 1.5782, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.5990566037735849, | |
| "grad_norm": 2.797638399208883, | |
| "learning_rate": 7.102485810070823e-07, | |
| "loss": 1.832, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.60062893081761, | |
| "grad_norm": 3.0354014098510484, | |
| "learning_rate": 7.087782294772926e-07, | |
| "loss": 1.4384, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.6022012578616353, | |
| "grad_norm": 2.9846921337198933, | |
| "learning_rate": 7.07305687962668e-07, | |
| "loss": 1.7531, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.6037735849056604, | |
| "grad_norm": 3.106084720294311, | |
| "learning_rate": 7.05830971909472e-07, | |
| "loss": 1.4538, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.6053459119496856, | |
| "grad_norm": 2.871555190950071, | |
| "learning_rate": 7.043540967867781e-07, | |
| "loss": 1.6914, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.6069182389937107, | |
| "grad_norm": 2.840880923886873, | |
| "learning_rate": 7.028750780863078e-07, | |
| "loss": 1.6388, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.6084905660377359, | |
| "grad_norm": 3.1454444170824316, | |
| "learning_rate": 7.013939313222669e-07, | |
| "loss": 1.4082, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.610062893081761, | |
| "grad_norm": 3.1511066609699503, | |
| "learning_rate": 6.999106720311845e-07, | |
| "loss": 1.5937, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.6116352201257862, | |
| "grad_norm": 2.9608662423607512, | |
| "learning_rate": 6.984253157717485e-07, | |
| "loss": 1.6905, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.6132075471698113, | |
| "grad_norm": 2.971295349160843, | |
| "learning_rate": 6.969378781246436e-07, | |
| "loss": 1.6028, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.6147798742138365, | |
| "grad_norm": 2.887441093132589, | |
| "learning_rate": 6.954483746923864e-07, | |
| "loss": 1.5795, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.6163522012578616, | |
| "grad_norm": 2.765921351694156, | |
| "learning_rate": 6.939568210991632e-07, | |
| "loss": 1.77, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.6179245283018868, | |
| "grad_norm": 2.8917529035479514, | |
| "learning_rate": 6.924632329906656e-07, | |
| "loss": 1.4817, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.6194968553459119, | |
| "grad_norm": 3.0703990060541497, | |
| "learning_rate": 6.909676260339259e-07, | |
| "loss": 1.5371, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.6210691823899371, | |
| "grad_norm": 3.140318941056899, | |
| "learning_rate": 6.894700159171534e-07, | |
| "loss": 1.5016, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.6226415094339622, | |
| "grad_norm": 3.1648818923914925, | |
| "learning_rate": 6.879704183495695e-07, | |
| "loss": 1.5751, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.6242138364779874, | |
| "grad_norm": 3.0902741266005793, | |
| "learning_rate": 6.864688490612433e-07, | |
| "loss": 1.4564, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.6257861635220126, | |
| "grad_norm": 3.571095235370477, | |
| "learning_rate": 6.84965323802926e-07, | |
| "loss": 1.4466, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.6273584905660378, | |
| "grad_norm": 3.246655504553822, | |
| "learning_rate": 6.834598583458861e-07, | |
| "loss": 1.6618, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.6289308176100629, | |
| "grad_norm": 3.0343685055658307, | |
| "learning_rate": 6.819524684817438e-07, | |
| "loss": 1.5849, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.6305031446540881, | |
| "grad_norm": 3.4147960516478557, | |
| "learning_rate": 6.804431700223055e-07, | |
| "loss": 1.5878, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.6320754716981132, | |
| "grad_norm": 2.9908883955283407, | |
| "learning_rate": 6.789319787993979e-07, | |
| "loss": 1.6317, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.6336477987421384, | |
| "grad_norm": 3.196286199019289, | |
| "learning_rate": 6.774189106647021e-07, | |
| "loss": 1.7304, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.6352201257861635, | |
| "grad_norm": 3.079020576367776, | |
| "learning_rate": 6.759039814895862e-07, | |
| "loss": 1.8023, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.6367924528301887, | |
| "grad_norm": 2.8088266541190694, | |
| "learning_rate": 6.743872071649411e-07, | |
| "loss": 1.606, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.6383647798742138, | |
| "grad_norm": 2.878246806108592, | |
| "learning_rate": 6.728686036010114e-07, | |
| "loss": 1.3972, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.639937106918239, | |
| "grad_norm": 2.681608870043916, | |
| "learning_rate": 6.713481867272299e-07, | |
| "loss": 1.5793, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.6415094339622641, | |
| "grad_norm": 2.9759341078654717, | |
| "learning_rate": 6.698259724920502e-07, | |
| "loss": 1.8096, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.6430817610062893, | |
| "grad_norm": 2.930107243897169, | |
| "learning_rate": 6.683019768627794e-07, | |
| "loss": 1.9373, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.6446540880503144, | |
| "grad_norm": 3.3019087122259614, | |
| "learning_rate": 6.667762158254103e-07, | |
| "loss": 1.6784, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.6462264150943396, | |
| "grad_norm": 3.046122411688699, | |
| "learning_rate": 6.652487053844544e-07, | |
| "loss": 1.7923, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.6477987421383647, | |
| "grad_norm": 3.226966880220217, | |
| "learning_rate": 6.637194615627732e-07, | |
| "loss": 1.767, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.64937106918239, | |
| "grad_norm": 3.5839568234533625, | |
| "learning_rate": 6.621885004014111e-07, | |
| "loss": 1.7144, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.6509433962264151, | |
| "grad_norm": 3.2662623680583867, | |
| "learning_rate": 6.606558379594261e-07, | |
| "loss": 1.6909, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.6525157232704403, | |
| "grad_norm": 3.293320476598104, | |
| "learning_rate": 6.59121490313722e-07, | |
| "loss": 1.5467, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.6540880503144654, | |
| "grad_norm": 2.8253698384570622, | |
| "learning_rate": 6.575854735588794e-07, | |
| "loss": 1.612, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.6556603773584906, | |
| "grad_norm": 3.3489939495652914, | |
| "learning_rate": 6.560478038069872e-07, | |
| "loss": 1.5001, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.6572327044025157, | |
| "grad_norm": 3.0875383018689373, | |
| "learning_rate": 6.545084971874736e-07, | |
| "loss": 1.6029, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.6588050314465409, | |
| "grad_norm": 2.7373051620760394, | |
| "learning_rate": 6.529675698469369e-07, | |
| "loss": 1.75, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.660377358490566, | |
| "grad_norm": 3.0779640368460743, | |
| "learning_rate": 6.514250379489753e-07, | |
| "loss": 1.5913, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.6619496855345912, | |
| "grad_norm": 3.337822319490678, | |
| "learning_rate": 6.498809176740189e-07, | |
| "loss": 1.7076, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.6635220125786163, | |
| "grad_norm": 3.0702637505567476, | |
| "learning_rate": 6.483352252191584e-07, | |
| "loss": 1.5407, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.6650943396226415, | |
| "grad_norm": 3.157672741863842, | |
| "learning_rate": 6.467879767979764e-07, | |
| "loss": 1.5754, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 4.29354289767503, | |
| "learning_rate": 6.452391886403766e-07, | |
| "loss": 1.4986, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.6682389937106918, | |
| "grad_norm": 2.9437863849920545, | |
| "learning_rate": 6.436888769924141e-07, | |
| "loss": 1.5725, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.6698113207547169, | |
| "grad_norm": 2.784911047159229, | |
| "learning_rate": 6.421370581161243e-07, | |
| "loss": 1.4859, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.6713836477987422, | |
| "grad_norm": 3.2167104786038316, | |
| "learning_rate": 6.405837482893528e-07, | |
| "loss": 1.7109, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.6729559748427673, | |
| "grad_norm": 2.9822678864067, | |
| "learning_rate": 6.390289638055851e-07, | |
| "loss": 1.5598, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.6745283018867925, | |
| "grad_norm": 2.8764956617307225, | |
| "learning_rate": 6.374727209737742e-07, | |
| "loss": 1.7887, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.6761006289308176, | |
| "grad_norm": 2.956840694387534, | |
| "learning_rate": 6.359150361181714e-07, | |
| "loss": 1.5556, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.6776729559748428, | |
| "grad_norm": 3.0343380447523423, | |
| "learning_rate": 6.343559255781537e-07, | |
| "loss": 1.6393, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.6792452830188679, | |
| "grad_norm": 2.7767252718165496, | |
| "learning_rate": 6.327954057080526e-07, | |
| "loss": 1.6502, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.6808176100628931, | |
| "grad_norm": 3.1882701427281788, | |
| "learning_rate": 6.312334928769833e-07, | |
| "loss": 1.5724, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.6823899371069182, | |
| "grad_norm": 3.097455620552655, | |
| "learning_rate": 6.296702034686725e-07, | |
| "loss": 1.6448, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.6839622641509434, | |
| "grad_norm": 2.967616326765885, | |
| "learning_rate": 6.281055538812861e-07, | |
| "loss": 1.5575, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.6855345911949685, | |
| "grad_norm": 2.9903290692872218, | |
| "learning_rate": 6.265395605272581e-07, | |
| "loss": 1.5267, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.6871069182389937, | |
| "grad_norm": 2.976142177695413, | |
| "learning_rate": 6.249722398331176e-07, | |
| "loss": 1.6737, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.6886792452830188, | |
| "grad_norm": 2.9630944851380123, | |
| "learning_rate": 6.234036082393171e-07, | |
| "loss": 1.6495, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.690251572327044, | |
| "grad_norm": 2.7088231883537817, | |
| "learning_rate": 6.218336822000597e-07, | |
| "loss": 1.4323, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.6918238993710691, | |
| "grad_norm": 2.937020551603309, | |
| "learning_rate": 6.202624781831268e-07, | |
| "loss": 1.5755, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.6933962264150944, | |
| "grad_norm": 3.0176348553682786, | |
| "learning_rate": 6.18690012669705e-07, | |
| "loss": 1.6015, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.6949685534591195, | |
| "grad_norm": 3.3315984504157945, | |
| "learning_rate": 6.171163021542133e-07, | |
| "loss": 1.6381, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.6965408805031447, | |
| "grad_norm": 3.1688246150461676, | |
| "learning_rate": 6.155413631441306e-07, | |
| "loss": 1.607, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.6981132075471698, | |
| "grad_norm": 3.09165082620452, | |
| "learning_rate": 6.139652121598218e-07, | |
| "loss": 1.7871, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.699685534591195, | |
| "grad_norm": 3.297756722716993, | |
| "learning_rate": 6.123878657343647e-07, | |
| "loss": 1.4711, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.7012578616352201, | |
| "grad_norm": 3.0883302587006, | |
| "learning_rate": 6.108093404133772e-07, | |
| "loss": 1.6702, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.7028301886792453, | |
| "grad_norm": 3.076701869873011, | |
| "learning_rate": 6.092296527548426e-07, | |
| "loss": 1.4897, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.7044025157232704, | |
| "grad_norm": 2.84568602644286, | |
| "learning_rate": 6.076488193289374e-07, | |
| "loss": 1.7678, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.7059748427672956, | |
| "grad_norm": 5.139935927987656, | |
| "learning_rate": 6.060668567178559e-07, | |
| "loss": 1.5979, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.7075471698113207, | |
| "grad_norm": 3.1264797262686477, | |
| "learning_rate": 6.044837815156376e-07, | |
| "loss": 1.5206, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.7091194968553459, | |
| "grad_norm": 3.3534156320240522, | |
| "learning_rate": 6.028996103279917e-07, | |
| "loss": 1.5362, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.710691823899371, | |
| "grad_norm": 3.165943529869633, | |
| "learning_rate": 6.013143597721251e-07, | |
| "loss": 1.4814, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.7122641509433962, | |
| "grad_norm": 3.3330090224325897, | |
| "learning_rate": 5.997280464765653e-07, | |
| "loss": 1.4184, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.7138364779874213, | |
| "grad_norm": 3.054343208237251, | |
| "learning_rate": 5.981406870809888e-07, | |
| "loss": 1.5341, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.7154088050314465, | |
| "grad_norm": 2.9985777856359586, | |
| "learning_rate": 5.96552298236044e-07, | |
| "loss": 1.5153, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.7169811320754716, | |
| "grad_norm": 3.0416404334656786, | |
| "learning_rate": 5.949628966031784e-07, | |
| "loss": 1.5342, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.7185534591194969, | |
| "grad_norm": 3.116507921003848, | |
| "learning_rate": 5.933724988544632e-07, | |
| "loss": 1.4448, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 0.720125786163522, | |
| "grad_norm": 3.0997481538278526, | |
| "learning_rate": 5.91781121672418e-07, | |
| "loss": 1.5867, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.7216981132075472, | |
| "grad_norm": 3.1876523216248165, | |
| "learning_rate": 5.901887817498367e-07, | |
| "loss": 1.5432, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.7232704402515723, | |
| "grad_norm": 3.1650167832638902, | |
| "learning_rate": 5.885954957896115e-07, | |
| "loss": 1.5536, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.7248427672955975, | |
| "grad_norm": 3.195279291197498, | |
| "learning_rate": 5.870012805045579e-07, | |
| "loss": 1.4512, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 0.7264150943396226, | |
| "grad_norm": 2.934331743213936, | |
| "learning_rate": 5.854061526172401e-07, | |
| "loss": 1.5895, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.7279874213836478, | |
| "grad_norm": 2.910990978962794, | |
| "learning_rate": 5.83810128859795e-07, | |
| "loss": 1.8213, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 0.7295597484276729, | |
| "grad_norm": 2.9419669708040477, | |
| "learning_rate": 5.822132259737564e-07, | |
| "loss": 1.5848, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.7311320754716981, | |
| "grad_norm": 3.417719384818703, | |
| "learning_rate": 5.806154607098799e-07, | |
| "loss": 1.8473, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.7327044025157232, | |
| "grad_norm": 2.9986530708779218, | |
| "learning_rate": 5.790168498279671e-07, | |
| "loss": 1.6022, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 0.7342767295597484, | |
| "grad_norm": 3.096387566914234, | |
| "learning_rate": 5.774174100966899e-07, | |
| "loss": 1.5598, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 0.7358490566037735, | |
| "grad_norm": 3.1688341508266817, | |
| "learning_rate": 5.75817158293414e-07, | |
| "loss": 1.5955, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.7374213836477987, | |
| "grad_norm": 3.589983459266237, | |
| "learning_rate": 5.742161112040236e-07, | |
| "loss": 1.4531, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 0.7389937106918238, | |
| "grad_norm": 3.0454871929245755, | |
| "learning_rate": 5.726142856227452e-07, | |
| "loss": 1.6809, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.7405660377358491, | |
| "grad_norm": 3.187136777050043, | |
| "learning_rate": 5.710116983519711e-07, | |
| "loss": 1.5083, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 0.7421383647798742, | |
| "grad_norm": 3.4456677764552097, | |
| "learning_rate": 5.694083662020834e-07, | |
| "loss": 1.5031, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 0.7437106918238994, | |
| "grad_norm": 3.0711812431855328, | |
| "learning_rate": 5.678043059912776e-07, | |
| "loss": 1.4553, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 0.7452830188679245, | |
| "grad_norm": 3.3203836858651448, | |
| "learning_rate": 5.661995345453866e-07, | |
| "loss": 1.5409, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.7468553459119497, | |
| "grad_norm": 3.1365764386189405, | |
| "learning_rate": 5.645940686977032e-07, | |
| "loss": 1.8013, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.7484276729559748, | |
| "grad_norm": 3.1568448692061715, | |
| "learning_rate": 5.629879252888045e-07, | |
| "loss": 1.6867, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 3.2555349541265635, | |
| "learning_rate": 5.61381121166375e-07, | |
| "loss": 1.5651, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 0.7515723270440252, | |
| "grad_norm": 3.2156977140753296, | |
| "learning_rate": 5.597736731850294e-07, | |
| "loss": 1.6762, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 0.7531446540880503, | |
| "grad_norm": 3.3231239637752448, | |
| "learning_rate": 5.581655982061366e-07, | |
| "loss": 1.4839, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 0.7547169811320755, | |
| "grad_norm": 2.9815593831758203, | |
| "learning_rate": 5.565569130976422e-07, | |
| "loss": 1.5976, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.7562893081761006, | |
| "grad_norm": 3.049414168390122, | |
| "learning_rate": 5.549476347338913e-07, | |
| "loss": 1.6503, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 0.7578616352201258, | |
| "grad_norm": 3.115613499312693, | |
| "learning_rate": 5.533377799954531e-07, | |
| "loss": 1.6372, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 0.7594339622641509, | |
| "grad_norm": 3.316587691013018, | |
| "learning_rate": 5.517273657689418e-07, | |
| "loss": 1.8254, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.7610062893081762, | |
| "grad_norm": 3.230503066134413, | |
| "learning_rate": 5.501164089468405e-07, | |
| "loss": 1.4632, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.7625786163522013, | |
| "grad_norm": 3.1674466671644472, | |
| "learning_rate": 5.485049264273241e-07, | |
| "loss": 1.5059, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.7641509433962265, | |
| "grad_norm": 3.087706666603333, | |
| "learning_rate": 5.468929351140815e-07, | |
| "loss": 1.5832, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.7657232704402516, | |
| "grad_norm": 2.992373824489146, | |
| "learning_rate": 5.452804519161389e-07, | |
| "loss": 1.6634, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 0.7672955974842768, | |
| "grad_norm": 2.8390916500019983, | |
| "learning_rate": 5.436674937476819e-07, | |
| "loss": 1.6625, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 0.7688679245283019, | |
| "grad_norm": 3.185527589207637, | |
| "learning_rate": 5.420540775278788e-07, | |
| "loss": 1.7379, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 0.7704402515723271, | |
| "grad_norm": 2.8909583251962645, | |
| "learning_rate": 5.404402201807021e-07, | |
| "loss": 1.5009, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.7720125786163522, | |
| "grad_norm": 3.3764568950079337, | |
| "learning_rate": 5.388259386347517e-07, | |
| "loss": 1.8461, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 0.7735849056603774, | |
| "grad_norm": 2.9907675034645154, | |
| "learning_rate": 5.37211249823077e-07, | |
| "loss": 1.6859, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.7751572327044025, | |
| "grad_norm": 3.0081342090492815, | |
| "learning_rate": 5.355961706829997e-07, | |
| "loss": 1.8076, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 0.7767295597484277, | |
| "grad_norm": 3.206744621623335, | |
| "learning_rate": 5.339807181559358e-07, | |
| "loss": 1.4324, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 0.7783018867924528, | |
| "grad_norm": 3.0827592144848737, | |
| "learning_rate": 5.323649091872178e-07, | |
| "loss": 1.4862, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.779874213836478, | |
| "grad_norm": 3.0432766929055104, | |
| "learning_rate": 5.307487607259174e-07, | |
| "loss": 1.7757, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 0.7814465408805031, | |
| "grad_norm": 3.0681103462894725, | |
| "learning_rate": 5.291322897246668e-07, | |
| "loss": 1.5578, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 0.7830188679245284, | |
| "grad_norm": 3.3579149440322795, | |
| "learning_rate": 5.275155131394824e-07, | |
| "loss": 1.486, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 0.7845911949685535, | |
| "grad_norm": 3.024186532283961, | |
| "learning_rate": 5.258984479295852e-07, | |
| "loss": 1.4805, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 0.7861635220125787, | |
| "grad_norm": 2.9881904115084854, | |
| "learning_rate": 5.242811110572242e-07, | |
| "loss": 1.5482, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.7861635220125787, | |
| "eval_sat2_MCTS_chains_SFT_val_loss": 1.6792867183685303, | |
| "eval_sat2_MCTS_chains_SFT_val_runtime": 103.382, | |
| "eval_sat2_MCTS_chains_SFT_val_samples_per_second": 9.944, | |
| "eval_sat2_MCTS_chains_SFT_val_steps_per_second": 1.248, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.7877358490566038, | |
| "grad_norm": 3.112808846064443, | |
| "learning_rate": 5.226635194874977e-07, | |
| "loss": 1.5644, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 0.789308176100629, | |
| "grad_norm": 3.2640628638265454, | |
| "learning_rate": 5.21045690188176e-07, | |
| "loss": 1.6825, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 0.7908805031446541, | |
| "grad_norm": 3.219943846696886, | |
| "learning_rate": 5.19427640129523e-07, | |
| "loss": 1.5106, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 0.7924528301886793, | |
| "grad_norm": 3.0378707975393437, | |
| "learning_rate": 5.178093862841178e-07, | |
| "loss": 1.47, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 0.7940251572327044, | |
| "grad_norm": 2.8365886455988853, | |
| "learning_rate": 5.16190945626678e-07, | |
| "loss": 1.5678, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.7955974842767296, | |
| "grad_norm": 3.5805336192243016, | |
| "learning_rate": 5.145723351338798e-07, | |
| "loss": 1.683, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 0.7971698113207547, | |
| "grad_norm": 3.222694102591329, | |
| "learning_rate": 5.129535717841818e-07, | |
| "loss": 1.5866, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 0.7987421383647799, | |
| "grad_norm": 3.25235170280169, | |
| "learning_rate": 5.11334672557645e-07, | |
| "loss": 1.6389, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 0.800314465408805, | |
| "grad_norm": 2.973328614815833, | |
| "learning_rate": 5.097156544357567e-07, | |
| "loss": 1.4565, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 0.8018867924528302, | |
| "grad_norm": 3.4518265144515534, | |
| "learning_rate": 5.080965344012508e-07, | |
| "loss": 1.5516, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.8034591194968553, | |
| "grad_norm": 2.8522493610275945, | |
| "learning_rate": 5.064773294379302e-07, | |
| "loss": 1.665, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 0.8050314465408805, | |
| "grad_norm": 2.8746693476380827, | |
| "learning_rate": 5.048580565304886e-07, | |
| "loss": 1.4801, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.8066037735849056, | |
| "grad_norm": 3.7133447000927493, | |
| "learning_rate": 5.03238732664333e-07, | |
| "loss": 1.7017, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 0.8081761006289309, | |
| "grad_norm": 3.2252723169555275, | |
| "learning_rate": 5.016193748254044e-07, | |
| "loss": 1.5495, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 0.809748427672956, | |
| "grad_norm": 3.019919222840472, | |
| "learning_rate": 5e-07, | |
| "loss": 1.4837, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.8113207547169812, | |
| "grad_norm": 5.981459859212562, | |
| "learning_rate": 4.983806251745957e-07, | |
| "loss": 1.5122, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 0.8128930817610063, | |
| "grad_norm": 3.1675144304415217, | |
| "learning_rate": 4.967612673356669e-07, | |
| "loss": 1.357, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 0.8144654088050315, | |
| "grad_norm": 3.1550044652043767, | |
| "learning_rate": 4.951419434695113e-07, | |
| "loss": 1.6727, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 0.8160377358490566, | |
| "grad_norm": 3.2157731056157703, | |
| "learning_rate": 4.935226705620699e-07, | |
| "loss": 1.5374, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 0.8176100628930818, | |
| "grad_norm": 3.433295089818606, | |
| "learning_rate": 4.919034655987492e-07, | |
| "loss": 1.4016, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.8191823899371069, | |
| "grad_norm": 3.2174967579065417, | |
| "learning_rate": 4.902843455642433e-07, | |
| "loss": 1.6296, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 0.8207547169811321, | |
| "grad_norm": 3.2592113989158333, | |
| "learning_rate": 4.88665327442355e-07, | |
| "loss": 1.5081, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 0.8223270440251572, | |
| "grad_norm": 3.159751293392431, | |
| "learning_rate": 4.870464282158184e-07, | |
| "loss": 1.8609, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 0.8238993710691824, | |
| "grad_norm": 3.456855111565593, | |
| "learning_rate": 4.854276648661202e-07, | |
| "loss": 1.5157, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 0.8254716981132075, | |
| "grad_norm": 2.875021106322295, | |
| "learning_rate": 4.838090543733221e-07, | |
| "loss": 1.5928, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.8270440251572327, | |
| "grad_norm": 2.9255730050969793, | |
| "learning_rate": 4.821906137158821e-07, | |
| "loss": 1.6097, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 0.8286163522012578, | |
| "grad_norm": 3.0868845547911037, | |
| "learning_rate": 4.805723598704771e-07, | |
| "loss": 1.5752, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 0.8301886792452831, | |
| "grad_norm": 2.991273798342134, | |
| "learning_rate": 4.789543098118241e-07, | |
| "loss": 1.5966, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 0.8317610062893082, | |
| "grad_norm": 3.176550358572851, | |
| "learning_rate": 4.773364805125024e-07, | |
| "loss": 1.6147, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 0.8333333333333334, | |
| "grad_norm": 3.0393464625754874, | |
| "learning_rate": 4.75718888942776e-07, | |
| "loss": 1.4362, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.8349056603773585, | |
| "grad_norm": 3.382289038536882, | |
| "learning_rate": 4.7410155207041476e-07, | |
| "loss": 1.4906, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 0.8364779874213837, | |
| "grad_norm": 3.023144808660818, | |
| "learning_rate": 4.7248448686051753e-07, | |
| "loss": 1.4524, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 0.8380503144654088, | |
| "grad_norm": 3.272898596251666, | |
| "learning_rate": 4.708677102753331e-07, | |
| "loss": 1.4412, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 0.839622641509434, | |
| "grad_norm": 3.368636687037789, | |
| "learning_rate": 4.692512392740826e-07, | |
| "loss": 1.5923, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 0.8411949685534591, | |
| "grad_norm": 3.0269316359210703, | |
| "learning_rate": 4.676350908127821e-07, | |
| "loss": 1.5435, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.8427672955974843, | |
| "grad_norm": 3.104762410537656, | |
| "learning_rate": 4.6601928184406407e-07, | |
| "loss": 1.763, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 0.8443396226415094, | |
| "grad_norm": 3.07205733907163, | |
| "learning_rate": 4.6440382931700025e-07, | |
| "loss": 1.6383, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 0.8459119496855346, | |
| "grad_norm": 3.2208435755618514, | |
| "learning_rate": 4.6278875017692305e-07, | |
| "loss": 1.6919, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 0.8474842767295597, | |
| "grad_norm": 3.120167553691476, | |
| "learning_rate": 4.611740613652484e-07, | |
| "loss": 1.7508, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 0.8490566037735849, | |
| "grad_norm": 3.222111818087616, | |
| "learning_rate": 4.595597798192979e-07, | |
| "loss": 1.7055, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.85062893081761, | |
| "grad_norm": 3.0750638601202525, | |
| "learning_rate": 4.5794592247212115e-07, | |
| "loss": 1.5565, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 0.8522012578616353, | |
| "grad_norm": 3.140050284801352, | |
| "learning_rate": 4.56332506252318e-07, | |
| "loss": 1.5851, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 0.8537735849056604, | |
| "grad_norm": 3.0237503315775642, | |
| "learning_rate": 4.547195480838611e-07, | |
| "loss": 1.8113, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 0.8553459119496856, | |
| "grad_norm": 2.8795497225735187, | |
| "learning_rate": 4.5310706488591854e-07, | |
| "loss": 1.4624, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 0.8569182389937107, | |
| "grad_norm": 3.1652826858619596, | |
| "learning_rate": 4.5149507357267597e-07, | |
| "loss": 1.4178, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.8584905660377359, | |
| "grad_norm": 3.310546691513496, | |
| "learning_rate": 4.498835910531595e-07, | |
| "loss": 1.494, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 0.860062893081761, | |
| "grad_norm": 3.299127034963117, | |
| "learning_rate": 4.4827263423105815e-07, | |
| "loss": 1.7251, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 0.8616352201257862, | |
| "grad_norm": 3.495217186292534, | |
| "learning_rate": 4.466622200045468e-07, | |
| "loss": 1.5429, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 0.8632075471698113, | |
| "grad_norm": 3.2024593517565334, | |
| "learning_rate": 4.4505236526610856e-07, | |
| "loss": 1.7903, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 0.8647798742138365, | |
| "grad_norm": 3.6359390641047247, | |
| "learning_rate": 4.434430869023579e-07, | |
| "loss": 1.4595, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.8663522012578616, | |
| "grad_norm": 3.0315341288087216, | |
| "learning_rate": 4.418344017938633e-07, | |
| "loss": 1.5896, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 0.8679245283018868, | |
| "grad_norm": 2.9431226438488225, | |
| "learning_rate": 4.4022632681497056e-07, | |
| "loss": 1.5016, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 0.8694968553459119, | |
| "grad_norm": 2.76871961076697, | |
| "learning_rate": 4.3861887883362505e-07, | |
| "loss": 1.5735, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 0.8710691823899371, | |
| "grad_norm": 3.001757601924961, | |
| "learning_rate": 4.370120747111955e-07, | |
| "loss": 1.6215, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 0.8726415094339622, | |
| "grad_norm": 3.1989100749072974, | |
| "learning_rate": 4.354059313022969e-07, | |
| "loss": 1.5669, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.8742138364779874, | |
| "grad_norm": 3.2670176156220267, | |
| "learning_rate": 4.3380046545461357e-07, | |
| "loss": 1.6728, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 0.8757861635220126, | |
| "grad_norm": 3.251241086797364, | |
| "learning_rate": 4.3219569400872234e-07, | |
| "loss": 1.5385, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 0.8773584905660378, | |
| "grad_norm": 3.224015403978076, | |
| "learning_rate": 4.305916337979167e-07, | |
| "loss": 1.4749, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 0.8789308176100629, | |
| "grad_norm": 3.0290334656918145, | |
| "learning_rate": 4.289883016480291e-07, | |
| "loss": 1.5637, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 0.8805031446540881, | |
| "grad_norm": 3.5571187387226133, | |
| "learning_rate": 4.2738571437725496e-07, | |
| "loss": 1.6427, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.8820754716981132, | |
| "grad_norm": 3.1631834259489144, | |
| "learning_rate": 4.257838887959763e-07, | |
| "loss": 1.4748, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 0.8836477987421384, | |
| "grad_norm": 3.467969945927553, | |
| "learning_rate": 4.2418284170658595e-07, | |
| "loss": 1.5934, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 0.8852201257861635, | |
| "grad_norm": 3.165363685749207, | |
| "learning_rate": 4.2258258990331007e-07, | |
| "loss": 1.5096, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 0.8867924528301887, | |
| "grad_norm": 3.2224659620946885, | |
| "learning_rate": 4.209831501720328e-07, | |
| "loss": 1.5952, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 0.8883647798742138, | |
| "grad_norm": 3.3169481022705343, | |
| "learning_rate": 4.193845392901201e-07, | |
| "loss": 1.4145, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.889937106918239, | |
| "grad_norm": 3.02538270876987, | |
| "learning_rate": 4.177867740262436e-07, | |
| "loss": 1.5246, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 0.8915094339622641, | |
| "grad_norm": 3.94869374366118, | |
| "learning_rate": 4.1618987114020495e-07, | |
| "loss": 1.569, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 0.8930817610062893, | |
| "grad_norm": 2.991956599684122, | |
| "learning_rate": 4.145938473827598e-07, | |
| "loss": 1.5099, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 0.8946540880503144, | |
| "grad_norm": 3.2982588739570966, | |
| "learning_rate": 4.129987194954421e-07, | |
| "loss": 1.4944, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 0.8962264150943396, | |
| "grad_norm": 3.348577038705965, | |
| "learning_rate": 4.1140450421038866e-07, | |
| "loss": 1.5357, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.8977987421383647, | |
| "grad_norm": 3.515207694168226, | |
| "learning_rate": 4.098112182501633e-07, | |
| "loss": 1.6219, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 0.89937106918239, | |
| "grad_norm": 3.0410070114945222, | |
| "learning_rate": 4.0821887832758194e-07, | |
| "loss": 1.6244, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 0.9009433962264151, | |
| "grad_norm": 3.7444315598525333, | |
| "learning_rate": 4.0662750114553685e-07, | |
| "loss": 1.5238, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 0.9025157232704403, | |
| "grad_norm": 3.331368882227541, | |
| "learning_rate": 4.050371033968215e-07, | |
| "loss": 1.5481, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 0.9040880503144654, | |
| "grad_norm": 3.007868223160233, | |
| "learning_rate": 4.0344770176395606e-07, | |
| "loss": 1.563, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.9056603773584906, | |
| "grad_norm": 3.150820294437964, | |
| "learning_rate": 4.018593129190113e-07, | |
| "loss": 1.4964, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 0.9072327044025157, | |
| "grad_norm": 3.0198801950596, | |
| "learning_rate": 4.0027195352343456e-07, | |
| "loss": 1.6168, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 0.9088050314465409, | |
| "grad_norm": 3.0904507818273377, | |
| "learning_rate": 3.98685640227875e-07, | |
| "loss": 1.4994, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 0.910377358490566, | |
| "grad_norm": 3.2771071808209142, | |
| "learning_rate": 3.971003896720082e-07, | |
| "loss": 1.7361, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 0.9119496855345912, | |
| "grad_norm": 3.111995014392537, | |
| "learning_rate": 3.955162184843624e-07, | |
| "loss": 1.5881, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.9135220125786163, | |
| "grad_norm": 3.358309691767164, | |
| "learning_rate": 3.93933143282144e-07, | |
| "loss": 1.6736, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 0.9150943396226415, | |
| "grad_norm": 4.345173695966827, | |
| "learning_rate": 3.923511806710625e-07, | |
| "loss": 1.3978, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 0.9166666666666666, | |
| "grad_norm": 3.4592040777889284, | |
| "learning_rate": 3.907703472451573e-07, | |
| "loss": 1.496, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 0.9182389937106918, | |
| "grad_norm": 3.2765433187832147, | |
| "learning_rate": 3.8919065958662295e-07, | |
| "loss": 1.7576, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 0.9198113207547169, | |
| "grad_norm": 3.135962913002799, | |
| "learning_rate": 3.8761213426563543e-07, | |
| "loss": 1.8539, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.9213836477987422, | |
| "grad_norm": 3.0325653708421623, | |
| "learning_rate": 3.860347878401784e-07, | |
| "loss": 1.5986, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 0.9229559748427673, | |
| "grad_norm": 3.288278731016835, | |
| "learning_rate": 3.844586368558694e-07, | |
| "loss": 1.5314, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 0.9245283018867925, | |
| "grad_norm": 3.504682969050096, | |
| "learning_rate": 3.828836978457867e-07, | |
| "loss": 1.5027, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 0.9261006289308176, | |
| "grad_norm": 3.059351611957445, | |
| "learning_rate": 3.813099873302951e-07, | |
| "loss": 1.6774, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 0.9276729559748428, | |
| "grad_norm": 3.0459760979745023, | |
| "learning_rate": 3.7973752181687327e-07, | |
| "loss": 1.676, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.9292452830188679, | |
| "grad_norm": 3.247916912464039, | |
| "learning_rate": 3.781663177999401e-07, | |
| "loss": 1.7239, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 0.9308176100628931, | |
| "grad_norm": 3.7058658418998514, | |
| "learning_rate": 3.765963917606828e-07, | |
| "loss": 1.7732, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 0.9323899371069182, | |
| "grad_norm": 3.1817920794892363, | |
| "learning_rate": 3.750277601668823e-07, | |
| "loss": 1.5874, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 0.9339622641509434, | |
| "grad_norm": 3.426732542442707, | |
| "learning_rate": 3.7346043947274186e-07, | |
| "loss": 1.6734, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 0.9355345911949685, | |
| "grad_norm": 3.174283975354623, | |
| "learning_rate": 3.718944461187138e-07, | |
| "loss": 1.3842, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.9371069182389937, | |
| "grad_norm": 3.0801471315347513, | |
| "learning_rate": 3.7032979653132747e-07, | |
| "loss": 1.3734, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 0.9386792452830188, | |
| "grad_norm": 2.9253641065501714, | |
| "learning_rate": 3.6876650712301647e-07, | |
| "loss": 1.8524, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 0.940251572327044, | |
| "grad_norm": 3.5083452967032014, | |
| "learning_rate": 3.6720459429194737e-07, | |
| "loss": 1.6783, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 0.9418238993710691, | |
| "grad_norm": 3.098054523539633, | |
| "learning_rate": 3.656440744218464e-07, | |
| "loss": 1.5777, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 0.9433962264150944, | |
| "grad_norm": 3.17444031508909, | |
| "learning_rate": 3.640849638818285e-07, | |
| "loss": 1.4485, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.9449685534591195, | |
| "grad_norm": 3.38521379733728, | |
| "learning_rate": 3.625272790262257e-07, | |
| "loss": 1.5424, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 0.9465408805031447, | |
| "grad_norm": 3.2924344281098072, | |
| "learning_rate": 3.60971036194415e-07, | |
| "loss": 1.749, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 0.9481132075471698, | |
| "grad_norm": 3.289281005988172, | |
| "learning_rate": 3.594162517106472e-07, | |
| "loss": 1.5505, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 0.949685534591195, | |
| "grad_norm": 3.152800599797803, | |
| "learning_rate": 3.578629418838757e-07, | |
| "loss": 1.4122, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 0.9512578616352201, | |
| "grad_norm": 3.0655978527785495, | |
| "learning_rate": 3.563111230075859e-07, | |
| "loss": 1.7778, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.9528301886792453, | |
| "grad_norm": 3.2065107290681767, | |
| "learning_rate": 3.547608113596233e-07, | |
| "loss": 1.5953, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 0.9544025157232704, | |
| "grad_norm": 3.144578851012227, | |
| "learning_rate": 3.532120232020236e-07, | |
| "loss": 1.7357, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 0.9559748427672956, | |
| "grad_norm": 3.4100400563663813, | |
| "learning_rate": 3.516647747808417e-07, | |
| "loss": 1.6029, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 0.9575471698113207, | |
| "grad_norm": 2.941026820149774, | |
| "learning_rate": 3.501190823259812e-07, | |
| "loss": 1.696, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 0.9591194968553459, | |
| "grad_norm": 3.0809553995028622, | |
| "learning_rate": 3.485749620510247e-07, | |
| "loss": 1.5933, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.960691823899371, | |
| "grad_norm": 3.126026676001102, | |
| "learning_rate": 3.470324301530631e-07, | |
| "loss": 1.596, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 0.9622641509433962, | |
| "grad_norm": 2.939282311557233, | |
| "learning_rate": 3.454915028125263e-07, | |
| "loss": 1.5106, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 0.9638364779874213, | |
| "grad_norm": 4.618369006446248, | |
| "learning_rate": 3.4395219619301285e-07, | |
| "loss": 1.7572, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 0.9654088050314465, | |
| "grad_norm": 3.0249997103524837, | |
| "learning_rate": 3.424145264411208e-07, | |
| "loss": 1.6931, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 0.9669811320754716, | |
| "grad_norm": 3.121161609283383, | |
| "learning_rate": 3.408785096862782e-07, | |
| "loss": 1.5564, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.9685534591194969, | |
| "grad_norm": 2.98922657988821, | |
| "learning_rate": 3.393441620405739e-07, | |
| "loss": 1.7031, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 0.970125786163522, | |
| "grad_norm": 3.2545095978155043, | |
| "learning_rate": 3.378114995985889e-07, | |
| "loss": 1.4904, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 0.9716981132075472, | |
| "grad_norm": 3.1241849889746285, | |
| "learning_rate": 3.362805384372267e-07, | |
| "loss": 1.6119, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 0.9732704402515723, | |
| "grad_norm": 3.30354250616738, | |
| "learning_rate": 3.3475129461554566e-07, | |
| "loss": 1.8179, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 0.9748427672955975, | |
| "grad_norm": 3.1460767575471, | |
| "learning_rate": 3.3322378417458977e-07, | |
| "loss": 1.6612, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.9764150943396226, | |
| "grad_norm": 3.032989419999135, | |
| "learning_rate": 3.3169802313722073e-07, | |
| "loss": 1.7385, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 0.9779874213836478, | |
| "grad_norm": 3.1187973670716835, | |
| "learning_rate": 3.301740275079497e-07, | |
| "loss": 1.3352, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 0.9795597484276729, | |
| "grad_norm": 2.988051541266211, | |
| "learning_rate": 3.2865181327277005e-07, | |
| "loss": 1.4746, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 0.9811320754716981, | |
| "grad_norm": 2.829001916528421, | |
| "learning_rate": 3.2713139639898854e-07, | |
| "loss": 1.5913, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 0.9827044025157232, | |
| "grad_norm": 3.097727818757928, | |
| "learning_rate": 3.2561279283505884e-07, | |
| "loss": 1.5274, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.9842767295597484, | |
| "grad_norm": 3.280444035668285, | |
| "learning_rate": 3.240960185104137e-07, | |
| "loss": 1.5034, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 0.9858490566037735, | |
| "grad_norm": 2.984585170501972, | |
| "learning_rate": 3.2258108933529805e-07, | |
| "loss": 1.6207, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 0.9874213836477987, | |
| "grad_norm": 3.1514680873957337, | |
| "learning_rate": 3.2106802120060194e-07, | |
| "loss": 1.4952, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 0.9889937106918238, | |
| "grad_norm": 3.118313513827925, | |
| "learning_rate": 3.1955682997769447e-07, | |
| "loss": 1.3681, | |
| "step": 629 | |
| }, | |
| { | |
| "epoch": 0.9905660377358491, | |
| "grad_norm": 2.892603332117668, | |
| "learning_rate": 3.1804753151825627e-07, | |
| "loss": 1.6935, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.9921383647798742, | |
| "grad_norm": 3.196944465600346, | |
| "learning_rate": 3.16540141654114e-07, | |
| "loss": 1.431, | |
| "step": 631 | |
| }, | |
| { | |
| "epoch": 0.9937106918238994, | |
| "grad_norm": 3.626584174105943, | |
| "learning_rate": 3.15034676197074e-07, | |
| "loss": 1.8299, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 0.9952830188679245, | |
| "grad_norm": 3.095208053523766, | |
| "learning_rate": 3.135311509387567e-07, | |
| "loss": 1.4026, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 0.9968553459119497, | |
| "grad_norm": 3.174860623522535, | |
| "learning_rate": 3.120295816504305e-07, | |
| "loss": 1.6607, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 0.9984276729559748, | |
| "grad_norm": 3.2555221492639785, | |
| "learning_rate": 3.105299840828466e-07, | |
| "loss": 1.5827, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 3.0636918987199846, | |
| "learning_rate": 3.090323739660742e-07, | |
| "loss": 1.6189, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 1.001572327044025, | |
| "grad_norm": 2.925175741352935, | |
| "learning_rate": 3.0753676700933445e-07, | |
| "loss": 1.4471, | |
| "step": 637 | |
| }, | |
| { | |
| "epoch": 1.0031446540880504, | |
| "grad_norm": 3.190232090409089, | |
| "learning_rate": 3.0604317890083674e-07, | |
| "loss": 1.5929, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 1.0047169811320755, | |
| "grad_norm": 2.9342322211564245, | |
| "learning_rate": 3.045516253076137e-07, | |
| "loss": 1.5289, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 1.0062893081761006, | |
| "grad_norm": 3.05977531743234, | |
| "learning_rate": 3.030621218753565e-07, | |
| "loss": 1.608, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.0078616352201257, | |
| "grad_norm": 2.9208361212348324, | |
| "learning_rate": 3.0157468422825147e-07, | |
| "loss": 1.4697, | |
| "step": 641 | |
| }, | |
| { | |
| "epoch": 1.009433962264151, | |
| "grad_norm": 3.097347482656311, | |
| "learning_rate": 3.0008932796881546e-07, | |
| "loss": 1.5677, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 1.0110062893081762, | |
| "grad_norm": 3.0507165989593927, | |
| "learning_rate": 2.9860606867773317e-07, | |
| "loss": 1.4711, | |
| "step": 643 | |
| }, | |
| { | |
| "epoch": 1.0125786163522013, | |
| "grad_norm": 2.9464488425337465, | |
| "learning_rate": 2.9712492191369244e-07, | |
| "loss": 1.4238, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 1.0141509433962264, | |
| "grad_norm": 3.3320478398426614, | |
| "learning_rate": 2.95645903213222e-07, | |
| "loss": 1.5607, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 1.0157232704402517, | |
| "grad_norm": 2.9788169571994834, | |
| "learning_rate": 2.9416902809052814e-07, | |
| "loss": 1.5556, | |
| "step": 646 | |
| }, | |
| { | |
| "epoch": 1.0172955974842768, | |
| "grad_norm": 3.3312805191020796, | |
| "learning_rate": 2.9269431203733206e-07, | |
| "loss": 1.6243, | |
| "step": 647 | |
| }, | |
| { | |
| "epoch": 1.0188679245283019, | |
| "grad_norm": 3.0115060001645455, | |
| "learning_rate": 2.9122177052270747e-07, | |
| "loss": 1.4988, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 1.020440251572327, | |
| "grad_norm": 3.3321250244143936, | |
| "learning_rate": 2.897514189929177e-07, | |
| "loss": 1.4174, | |
| "step": 649 | |
| }, | |
| { | |
| "epoch": 1.0220125786163523, | |
| "grad_norm": 3.1753277857894915, | |
| "learning_rate": 2.8828327287125507e-07, | |
| "loss": 1.6069, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.0235849056603774, | |
| "grad_norm": 2.9899947184348328, | |
| "learning_rate": 2.8681734755787716e-07, | |
| "loss": 1.5228, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 1.0251572327044025, | |
| "grad_norm": 3.448011217347156, | |
| "learning_rate": 2.853536584296471e-07, | |
| "loss": 1.5431, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 1.0267295597484276, | |
| "grad_norm": 3.281542638072726, | |
| "learning_rate": 2.8389222083997117e-07, | |
| "loss": 1.4371, | |
| "step": 653 | |
| }, | |
| { | |
| "epoch": 1.028301886792453, | |
| "grad_norm": 2.955949837511534, | |
| "learning_rate": 2.8243305011863837e-07, | |
| "loss": 1.6474, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 1.029874213836478, | |
| "grad_norm": 2.9895818915531986, | |
| "learning_rate": 2.8097616157165885e-07, | |
| "loss": 1.7398, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 1.0314465408805031, | |
| "grad_norm": 3.1813377026113834, | |
| "learning_rate": 2.7952157048110406e-07, | |
| "loss": 1.3422, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 1.0330188679245282, | |
| "grad_norm": 3.206575227698428, | |
| "learning_rate": 2.7806929210494646e-07, | |
| "loss": 1.4353, | |
| "step": 657 | |
| }, | |
| { | |
| "epoch": 1.0345911949685536, | |
| "grad_norm": 2.8562478460137, | |
| "learning_rate": 2.766193416768988e-07, | |
| "loss": 1.5319, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 1.0361635220125787, | |
| "grad_norm": 3.192759899206982, | |
| "learning_rate": 2.751717344062552e-07, | |
| "loss": 1.3627, | |
| "step": 659 | |
| }, | |
| { | |
| "epoch": 1.0377358490566038, | |
| "grad_norm": 3.0969602295505885, | |
| "learning_rate": 2.7372648547773056e-07, | |
| "loss": 1.4714, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.0393081761006289, | |
| "grad_norm": 3.005160598948565, | |
| "learning_rate": 2.722836100513027e-07, | |
| "loss": 1.4917, | |
| "step": 661 | |
| }, | |
| { | |
| "epoch": 1.0408805031446542, | |
| "grad_norm": 3.194092221211697, | |
| "learning_rate": 2.708431232620516e-07, | |
| "loss": 1.6016, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 1.0424528301886793, | |
| "grad_norm": 3.2942434950589, | |
| "learning_rate": 2.6940504022000244e-07, | |
| "loss": 1.3387, | |
| "step": 663 | |
| }, | |
| { | |
| "epoch": 1.0440251572327044, | |
| "grad_norm": 3.072420534138645, | |
| "learning_rate": 2.679693760099658e-07, | |
| "loss": 1.5528, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 1.0455974842767295, | |
| "grad_norm": 3.163961305087953, | |
| "learning_rate": 2.665361456913797e-07, | |
| "loss": 1.7042, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 1.0471698113207548, | |
| "grad_norm": 3.124340898841506, | |
| "learning_rate": 2.651053642981522e-07, | |
| "loss": 1.8125, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 1.04874213836478, | |
| "grad_norm": 3.266997346467644, | |
| "learning_rate": 2.6367704683850287e-07, | |
| "loss": 1.3475, | |
| "step": 667 | |
| }, | |
| { | |
| "epoch": 1.050314465408805, | |
| "grad_norm": 4.0511615721483745, | |
| "learning_rate": 2.6225120829480627e-07, | |
| "loss": 1.3714, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 1.0518867924528301, | |
| "grad_norm": 3.2394052110431906, | |
| "learning_rate": 2.6082786362343374e-07, | |
| "loss": 1.6496, | |
| "step": 669 | |
| }, | |
| { | |
| "epoch": 1.0534591194968554, | |
| "grad_norm": 3.2297916537019593, | |
| "learning_rate": 2.5940702775459744e-07, | |
| "loss": 1.5557, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.0550314465408805, | |
| "grad_norm": 3.1460312089986218, | |
| "learning_rate": 2.579887155921936e-07, | |
| "loss": 1.5001, | |
| "step": 671 | |
| }, | |
| { | |
| "epoch": 1.0566037735849056, | |
| "grad_norm": 3.544561305322451, | |
| "learning_rate": 2.5657294201364523e-07, | |
| "loss": 1.6661, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 1.0581761006289307, | |
| "grad_norm": 3.3796494907434846, | |
| "learning_rate": 2.551597218697475e-07, | |
| "loss": 1.4262, | |
| "step": 673 | |
| }, | |
| { | |
| "epoch": 1.059748427672956, | |
| "grad_norm": 3.1326980300376386, | |
| "learning_rate": 2.537490699845109e-07, | |
| "loss": 1.7046, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 1.0613207547169812, | |
| "grad_norm": 3.053494075129222, | |
| "learning_rate": 2.523410011550064e-07, | |
| "loss": 1.5256, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 1.0628930817610063, | |
| "grad_norm": 2.8913745597217546, | |
| "learning_rate": 2.5093553015120934e-07, | |
| "loss": 1.59, | |
| "step": 676 | |
| }, | |
| { | |
| "epoch": 1.0644654088050314, | |
| "grad_norm": 3.5453147384528996, | |
| "learning_rate": 2.495326717158457e-07, | |
| "loss": 1.4309, | |
| "step": 677 | |
| }, | |
| { | |
| "epoch": 1.0660377358490567, | |
| "grad_norm": 3.0039880470586513, | |
| "learning_rate": 2.4813244056423686e-07, | |
| "loss": 1.43, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 1.0676100628930818, | |
| "grad_norm": 3.2472760469814212, | |
| "learning_rate": 2.467348513841447e-07, | |
| "loss": 1.5304, | |
| "step": 679 | |
| }, | |
| { | |
| "epoch": 1.069182389937107, | |
| "grad_norm": 2.863821975975358, | |
| "learning_rate": 2.4533991883561867e-07, | |
| "loss": 1.2874, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.070754716981132, | |
| "grad_norm": 5.7732998659847725, | |
| "learning_rate": 2.439476575508408e-07, | |
| "loss": 1.4263, | |
| "step": 681 | |
| }, | |
| { | |
| "epoch": 1.0723270440251573, | |
| "grad_norm": 3.1433425411459193, | |
| "learning_rate": 2.425580821339733e-07, | |
| "loss": 1.501, | |
| "step": 682 | |
| }, | |
| { | |
| "epoch": 1.0738993710691824, | |
| "grad_norm": 3.6371508420472614, | |
| "learning_rate": 2.411712071610048e-07, | |
| "loss": 1.5254, | |
| "step": 683 | |
| }, | |
| { | |
| "epoch": 1.0754716981132075, | |
| "grad_norm": 2.962554371919303, | |
| "learning_rate": 2.3978704717959776e-07, | |
| "loss": 1.6529, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 1.0770440251572326, | |
| "grad_norm": 3.6781213038382483, | |
| "learning_rate": 2.3840561670893495e-07, | |
| "loss": 1.4484, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 1.078616352201258, | |
| "grad_norm": 3.1791495438402584, | |
| "learning_rate": 2.3702693023956848e-07, | |
| "loss": 1.5015, | |
| "step": 686 | |
| }, | |
| { | |
| "epoch": 1.080188679245283, | |
| "grad_norm": 3.4330234166348617, | |
| "learning_rate": 2.3565100223326735e-07, | |
| "loss": 1.4895, | |
| "step": 687 | |
| }, | |
| { | |
| "epoch": 1.0817610062893082, | |
| "grad_norm": 3.2582510503112556, | |
| "learning_rate": 2.3427784712286475e-07, | |
| "loss": 1.4913, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 1.0833333333333333, | |
| "grad_norm": 3.112247746272538, | |
| "learning_rate": 2.3290747931210848e-07, | |
| "loss": 1.5616, | |
| "step": 689 | |
| }, | |
| { | |
| "epoch": 1.0849056603773586, | |
| "grad_norm": 3.1297461225513876, | |
| "learning_rate": 2.3153991317550808e-07, | |
| "loss": 1.6498, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.0864779874213837, | |
| "grad_norm": 3.1597599015515283, | |
| "learning_rate": 2.3017516305818546e-07, | |
| "loss": 1.5233, | |
| "step": 691 | |
| }, | |
| { | |
| "epoch": 1.0880503144654088, | |
| "grad_norm": 3.155900443793459, | |
| "learning_rate": 2.288132432757233e-07, | |
| "loss": 1.5558, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 1.0896226415094339, | |
| "grad_norm": 2.9696101347619566, | |
| "learning_rate": 2.2745416811401584e-07, | |
| "loss": 1.3783, | |
| "step": 693 | |
| }, | |
| { | |
| "epoch": 1.0911949685534592, | |
| "grad_norm": 3.053820395116348, | |
| "learning_rate": 2.2609795182911857e-07, | |
| "loss": 1.4285, | |
| "step": 694 | |
| }, | |
| { | |
| "epoch": 1.0927672955974843, | |
| "grad_norm": 3.2826961481245265, | |
| "learning_rate": 2.247446086470982e-07, | |
| "loss": 1.608, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 1.0943396226415094, | |
| "grad_norm": 3.0770194090310503, | |
| "learning_rate": 2.2339415276388474e-07, | |
| "loss": 1.4713, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 1.0959119496855345, | |
| "grad_norm": 3.044905869572258, | |
| "learning_rate": 2.220465983451209e-07, | |
| "loss": 1.5122, | |
| "step": 697 | |
| }, | |
| { | |
| "epoch": 1.0974842767295598, | |
| "grad_norm": 3.1453525312191313, | |
| "learning_rate": 2.207019595260154e-07, | |
| "loss": 1.4947, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 1.099056603773585, | |
| "grad_norm": 3.236671349430344, | |
| "learning_rate": 2.1936025041119265e-07, | |
| "loss": 1.4189, | |
| "step": 699 | |
| }, | |
| { | |
| "epoch": 1.10062893081761, | |
| "grad_norm": 3.083772083487567, | |
| "learning_rate": 2.180214850745467e-07, | |
| "loss": 1.625, | |
| "step": 700 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 1000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 120177106550784.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |