{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 0,
"global_step": 293,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0034129692832764505,
"grad_norm": 0.4375,
"learning_rate": 1e-05,
"loss": 2.098,
"step": 1
},
{
"epoch": 0.006825938566552901,
"grad_norm": 0.419921875,
"learning_rate": 9.965870307167235e-06,
"loss": 1.9904,
"step": 2
},
{
"epoch": 0.010238907849829351,
"grad_norm": 0.39453125,
"learning_rate": 9.931740614334472e-06,
"loss": 1.9246,
"step": 3
},
{
"epoch": 0.013651877133105802,
"grad_norm": 0.40625,
"learning_rate": 9.897610921501706e-06,
"loss": 1.9454,
"step": 4
},
{
"epoch": 0.017064846416382253,
"grad_norm": 0.41796875,
"learning_rate": 9.863481228668942e-06,
"loss": 2.0677,
"step": 5
},
{
"epoch": 0.020477815699658702,
"grad_norm": 0.380859375,
"learning_rate": 9.829351535836179e-06,
"loss": 1.9607,
"step": 6
},
{
"epoch": 0.023890784982935155,
"grad_norm": 0.37109375,
"learning_rate": 9.795221843003415e-06,
"loss": 1.8974,
"step": 7
},
{
"epoch": 0.027303754266211604,
"grad_norm": 0.361328125,
"learning_rate": 9.76109215017065e-06,
"loss": 1.9186,
"step": 8
},
{
"epoch": 0.030716723549488054,
"grad_norm": 0.384765625,
"learning_rate": 9.726962457337886e-06,
"loss": 2.0028,
"step": 9
},
{
"epoch": 0.034129692832764506,
"grad_norm": 0.369140625,
"learning_rate": 9.69283276450512e-06,
"loss": 1.9483,
"step": 10
},
{
"epoch": 0.03754266211604096,
"grad_norm": 0.376953125,
"learning_rate": 9.658703071672356e-06,
"loss": 2.0182,
"step": 11
},
{
"epoch": 0.040955631399317405,
"grad_norm": 0.36328125,
"learning_rate": 9.62457337883959e-06,
"loss": 1.9185,
"step": 12
},
{
"epoch": 0.04436860068259386,
"grad_norm": 0.359375,
"learning_rate": 9.590443686006825e-06,
"loss": 1.9198,
"step": 13
},
{
"epoch": 0.04778156996587031,
"grad_norm": 0.34765625,
"learning_rate": 9.556313993174062e-06,
"loss": 1.7946,
"step": 14
},
{
"epoch": 0.051194539249146756,
"grad_norm": 0.34765625,
"learning_rate": 9.522184300341298e-06,
"loss": 1.8384,
"step": 15
},
{
"epoch": 0.05460750853242321,
"grad_norm": 0.318359375,
"learning_rate": 9.488054607508534e-06,
"loss": 1.7561,
"step": 16
},
{
"epoch": 0.05802047781569966,
"grad_norm": 0.302734375,
"learning_rate": 9.453924914675769e-06,
"loss": 1.8221,
"step": 17
},
{
"epoch": 0.06143344709897611,
"grad_norm": 0.306640625,
"learning_rate": 9.419795221843005e-06,
"loss": 1.8518,
"step": 18
},
{
"epoch": 0.06484641638225255,
"grad_norm": 0.291015625,
"learning_rate": 9.38566552901024e-06,
"loss": 1.7368,
"step": 19
},
{
"epoch": 0.06825938566552901,
"grad_norm": 0.283203125,
"learning_rate": 9.351535836177476e-06,
"loss": 1.7257,
"step": 20
},
{
"epoch": 0.07167235494880546,
"grad_norm": 0.28515625,
"learning_rate": 9.31740614334471e-06,
"loss": 1.6868,
"step": 21
},
{
"epoch": 0.07508532423208192,
"grad_norm": 0.283203125,
"learning_rate": 9.283276450511946e-06,
"loss": 1.7257,
"step": 22
},
{
"epoch": 0.07849829351535836,
"grad_norm": 0.314453125,
"learning_rate": 9.249146757679181e-06,
"loss": 1.6474,
"step": 23
},
{
"epoch": 0.08191126279863481,
"grad_norm": 0.275390625,
"learning_rate": 9.215017064846417e-06,
"loss": 1.7366,
"step": 24
},
{
"epoch": 0.08532423208191127,
"grad_norm": 0.291015625,
"learning_rate": 9.180887372013653e-06,
"loss": 1.7124,
"step": 25
},
{
"epoch": 0.08873720136518772,
"grad_norm": 0.251953125,
"learning_rate": 9.146757679180888e-06,
"loss": 1.6561,
"step": 26
},
{
"epoch": 0.09215017064846416,
"grad_norm": 0.275390625,
"learning_rate": 9.112627986348124e-06,
"loss": 1.7343,
"step": 27
},
{
"epoch": 0.09556313993174062,
"grad_norm": 0.265625,
"learning_rate": 9.078498293515359e-06,
"loss": 1.6355,
"step": 28
},
{
"epoch": 0.09897610921501707,
"grad_norm": 0.244140625,
"learning_rate": 9.044368600682595e-06,
"loss": 1.6064,
"step": 29
},
{
"epoch": 0.10238907849829351,
"grad_norm": 0.265625,
"learning_rate": 9.01023890784983e-06,
"loss": 1.6406,
"step": 30
},
{
"epoch": 0.10580204778156997,
"grad_norm": 0.23828125,
"learning_rate": 8.976109215017066e-06,
"loss": 1.6211,
"step": 31
},
{
"epoch": 0.10921501706484642,
"grad_norm": 0.2412109375,
"learning_rate": 8.9419795221843e-06,
"loss": 1.6092,
"step": 32
},
{
"epoch": 0.11262798634812286,
"grad_norm": 0.24609375,
"learning_rate": 8.907849829351536e-06,
"loss": 1.6817,
"step": 33
},
{
"epoch": 0.11604095563139932,
"grad_norm": 0.2392578125,
"learning_rate": 8.873720136518773e-06,
"loss": 1.5899,
"step": 34
},
{
"epoch": 0.11945392491467577,
"grad_norm": 0.2353515625,
"learning_rate": 8.839590443686009e-06,
"loss": 1.5608,
"step": 35
},
{
"epoch": 0.12286689419795221,
"grad_norm": 0.232421875,
"learning_rate": 8.805460750853243e-06,
"loss": 1.5448,
"step": 36
},
{
"epoch": 0.12627986348122866,
"grad_norm": 0.2158203125,
"learning_rate": 8.771331058020478e-06,
"loss": 1.4979,
"step": 37
},
{
"epoch": 0.1296928327645051,
"grad_norm": 0.220703125,
"learning_rate": 8.737201365187714e-06,
"loss": 1.5345,
"step": 38
},
{
"epoch": 0.13310580204778158,
"grad_norm": 0.2275390625,
"learning_rate": 8.703071672354949e-06,
"loss": 1.5387,
"step": 39
},
{
"epoch": 0.13651877133105803,
"grad_norm": 0.302734375,
"learning_rate": 8.668941979522185e-06,
"loss": 1.4665,
"step": 40
},
{
"epoch": 0.13993174061433447,
"grad_norm": 0.21875,
"learning_rate": 8.63481228668942e-06,
"loss": 1.5719,
"step": 41
},
{
"epoch": 0.14334470989761092,
"grad_norm": 0.203125,
"learning_rate": 8.600682593856656e-06,
"loss": 1.5123,
"step": 42
},
{
"epoch": 0.14675767918088736,
"grad_norm": 0.2294921875,
"learning_rate": 8.566552901023892e-06,
"loss": 1.3894,
"step": 43
},
{
"epoch": 0.15017064846416384,
"grad_norm": 0.1982421875,
"learning_rate": 8.532423208191128e-06,
"loss": 1.4781,
"step": 44
},
{
"epoch": 0.15358361774744028,
"grad_norm": 0.2578125,
"learning_rate": 8.498293515358363e-06,
"loss": 1.5839,
"step": 45
},
{
"epoch": 0.15699658703071673,
"grad_norm": 0.23046875,
"learning_rate": 8.464163822525599e-06,
"loss": 1.4626,
"step": 46
},
{
"epoch": 0.16040955631399317,
"grad_norm": 0.1962890625,
"learning_rate": 8.430034129692833e-06,
"loss": 1.4703,
"step": 47
},
{
"epoch": 0.16382252559726962,
"grad_norm": 0.2451171875,
"learning_rate": 8.395904436860068e-06,
"loss": 1.485,
"step": 48
},
{
"epoch": 0.16723549488054607,
"grad_norm": 0.1982421875,
"learning_rate": 8.361774744027304e-06,
"loss": 1.4331,
"step": 49
},
{
"epoch": 0.17064846416382254,
"grad_norm": 0.2314453125,
"learning_rate": 8.327645051194539e-06,
"loss": 1.4434,
"step": 50
},
{
"epoch": 0.17406143344709898,
"grad_norm": 0.205078125,
"learning_rate": 8.293515358361775e-06,
"loss": 1.488,
"step": 51
},
{
"epoch": 0.17747440273037543,
"grad_norm": 0.18359375,
"learning_rate": 8.259385665529011e-06,
"loss": 1.4482,
"step": 52
},
{
"epoch": 0.18088737201365188,
"grad_norm": 0.197265625,
"learning_rate": 8.225255972696247e-06,
"loss": 1.5022,
"step": 53
},
{
"epoch": 0.18430034129692832,
"grad_norm": 0.181640625,
"learning_rate": 8.191126279863482e-06,
"loss": 1.4294,
"step": 54
},
{
"epoch": 0.18771331058020477,
"grad_norm": 0.2001953125,
"learning_rate": 8.156996587030718e-06,
"loss": 1.4651,
"step": 55
},
{
"epoch": 0.19112627986348124,
"grad_norm": 0.1884765625,
"learning_rate": 8.122866894197953e-06,
"loss": 1.417,
"step": 56
},
{
"epoch": 0.1945392491467577,
"grad_norm": 0.193359375,
"learning_rate": 8.088737201365189e-06,
"loss": 1.4324,
"step": 57
},
{
"epoch": 0.19795221843003413,
"grad_norm": 0.1787109375,
"learning_rate": 8.054607508532423e-06,
"loss": 1.4433,
"step": 58
},
{
"epoch": 0.20136518771331058,
"grad_norm": 0.1767578125,
"learning_rate": 8.02047781569966e-06,
"loss": 1.3302,
"step": 59
},
{
"epoch": 0.20477815699658702,
"grad_norm": 0.1962890625,
"learning_rate": 7.986348122866894e-06,
"loss": 1.4283,
"step": 60
},
{
"epoch": 0.20819112627986347,
"grad_norm": 0.1796875,
"learning_rate": 7.95221843003413e-06,
"loss": 1.3951,
"step": 61
},
{
"epoch": 0.21160409556313994,
"grad_norm": 0.1982421875,
"learning_rate": 7.918088737201367e-06,
"loss": 1.4534,
"step": 62
},
{
"epoch": 0.2150170648464164,
"grad_norm": 0.189453125,
"learning_rate": 7.883959044368601e-06,
"loss": 1.4199,
"step": 63
},
{
"epoch": 0.21843003412969283,
"grad_norm": 0.1728515625,
"learning_rate": 7.849829351535837e-06,
"loss": 1.3395,
"step": 64
},
{
"epoch": 0.22184300341296928,
"grad_norm": 0.1728515625,
"learning_rate": 7.815699658703072e-06,
"loss": 1.4051,
"step": 65
},
{
"epoch": 0.22525597269624573,
"grad_norm": 0.224609375,
"learning_rate": 7.781569965870308e-06,
"loss": 1.3764,
"step": 66
},
{
"epoch": 0.22866894197952217,
"grad_norm": 0.1806640625,
"learning_rate": 7.747440273037543e-06,
"loss": 1.373,
"step": 67
},
{
"epoch": 0.23208191126279865,
"grad_norm": 0.1630859375,
"learning_rate": 7.713310580204779e-06,
"loss": 1.3301,
"step": 68
},
{
"epoch": 0.2354948805460751,
"grad_norm": 0.1708984375,
"learning_rate": 7.679180887372013e-06,
"loss": 1.3786,
"step": 69
},
{
"epoch": 0.23890784982935154,
"grad_norm": 0.16796875,
"learning_rate": 7.64505119453925e-06,
"loss": 1.3802,
"step": 70
},
{
"epoch": 0.24232081911262798,
"grad_norm": 0.322265625,
"learning_rate": 7.610921501706485e-06,
"loss": 1.3011,
"step": 71
},
{
"epoch": 0.24573378839590443,
"grad_norm": 0.2001953125,
"learning_rate": 7.57679180887372e-06,
"loss": 1.3053,
"step": 72
},
{
"epoch": 0.24914675767918087,
"grad_norm": 0.16796875,
"learning_rate": 7.542662116040957e-06,
"loss": 1.3408,
"step": 73
},
{
"epoch": 0.2525597269624573,
"grad_norm": 0.26171875,
"learning_rate": 7.508532423208191e-06,
"loss": 1.313,
"step": 74
},
{
"epoch": 0.25597269624573377,
"grad_norm": 0.1904296875,
"learning_rate": 7.474402730375427e-06,
"loss": 1.3259,
"step": 75
},
{
"epoch": 0.2593856655290102,
"grad_norm": 0.1962890625,
"learning_rate": 7.440273037542663e-06,
"loss": 1.3397,
"step": 76
},
{
"epoch": 0.2627986348122867,
"grad_norm": 0.162109375,
"learning_rate": 7.406143344709898e-06,
"loss": 1.315,
"step": 77
},
{
"epoch": 0.26621160409556316,
"grad_norm": 0.279296875,
"learning_rate": 7.3720136518771335e-06,
"loss": 1.3648,
"step": 78
},
{
"epoch": 0.2696245733788396,
"grad_norm": 0.1669921875,
"learning_rate": 7.33788395904437e-06,
"loss": 1.2991,
"step": 79
},
{
"epoch": 0.27303754266211605,
"grad_norm": 0.1552734375,
"learning_rate": 7.303754266211604e-06,
"loss": 1.2913,
"step": 80
},
{
"epoch": 0.2764505119453925,
"grad_norm": 0.2060546875,
"learning_rate": 7.2696245733788405e-06,
"loss": 1.3413,
"step": 81
},
{
"epoch": 0.27986348122866894,
"grad_norm": 0.1640625,
"learning_rate": 7.235494880546076e-06,
"loss": 1.3348,
"step": 82
},
{
"epoch": 0.2832764505119454,
"grad_norm": 0.255859375,
"learning_rate": 7.201365187713312e-06,
"loss": 1.3224,
"step": 83
},
{
"epoch": 0.28668941979522183,
"grad_norm": 0.1689453125,
"learning_rate": 7.167235494880547e-06,
"loss": 1.2809,
"step": 84
},
{
"epoch": 0.2901023890784983,
"grad_norm": 0.2138671875,
"learning_rate": 7.133105802047782e-06,
"loss": 1.2331,
"step": 85
},
{
"epoch": 0.2935153583617747,
"grad_norm": 0.1796875,
"learning_rate": 7.098976109215017e-06,
"loss": 1.3094,
"step": 86
},
{
"epoch": 0.29692832764505117,
"grad_norm": 0.2119140625,
"learning_rate": 7.064846416382253e-06,
"loss": 1.2913,
"step": 87
},
{
"epoch": 0.3003412969283277,
"grad_norm": 0.1630859375,
"learning_rate": 7.030716723549489e-06,
"loss": 1.291,
"step": 88
},
{
"epoch": 0.3037542662116041,
"grad_norm": 0.1923828125,
"learning_rate": 6.9965870307167235e-06,
"loss": 1.2663,
"step": 89
},
{
"epoch": 0.30716723549488056,
"grad_norm": 0.1572265625,
"learning_rate": 6.96245733788396e-06,
"loss": 1.2546,
"step": 90
},
{
"epoch": 0.310580204778157,
"grad_norm": 0.310546875,
"learning_rate": 6.928327645051195e-06,
"loss": 1.3238,
"step": 91
},
{
"epoch": 0.31399317406143346,
"grad_norm": 0.1845703125,
"learning_rate": 6.894197952218431e-06,
"loss": 1.2953,
"step": 92
},
{
"epoch": 0.3174061433447099,
"grad_norm": 0.2392578125,
"learning_rate": 6.860068259385666e-06,
"loss": 1.3402,
"step": 93
},
{
"epoch": 0.32081911262798635,
"grad_norm": 0.1640625,
"learning_rate": 6.825938566552902e-06,
"loss": 1.3209,
"step": 94
},
{
"epoch": 0.3242320819112628,
"grad_norm": 0.1591796875,
"learning_rate": 6.7918088737201375e-06,
"loss": 1.2684,
"step": 95
},
{
"epoch": 0.32764505119453924,
"grad_norm": 0.16015625,
"learning_rate": 6.757679180887372e-06,
"loss": 1.2606,
"step": 96
},
{
"epoch": 0.3310580204778157,
"grad_norm": 0.1572265625,
"learning_rate": 6.723549488054608e-06,
"loss": 1.2737,
"step": 97
},
{
"epoch": 0.33447098976109213,
"grad_norm": 0.16796875,
"learning_rate": 6.689419795221843e-06,
"loss": 1.3183,
"step": 98
},
{
"epoch": 0.3378839590443686,
"grad_norm": 0.1923828125,
"learning_rate": 6.655290102389079e-06,
"loss": 1.283,
"step": 99
},
{
"epoch": 0.3412969283276451,
"grad_norm": 0.185546875,
"learning_rate": 6.621160409556314e-06,
"loss": 1.3525,
"step": 100
},
{
"epoch": 0.3447098976109215,
"grad_norm": 0.1494140625,
"learning_rate": 6.587030716723551e-06,
"loss": 1.2137,
"step": 101
},
{
"epoch": 0.34812286689419797,
"grad_norm": 0.18359375,
"learning_rate": 6.552901023890785e-06,
"loss": 1.3545,
"step": 102
},
{
"epoch": 0.3515358361774744,
"grad_norm": 0.169921875,
"learning_rate": 6.518771331058021e-06,
"loss": 1.2902,
"step": 103
},
{
"epoch": 0.35494880546075086,
"grad_norm": 0.173828125,
"learning_rate": 6.484641638225257e-06,
"loss": 1.3484,
"step": 104
},
{
"epoch": 0.3583617747440273,
"grad_norm": 0.1591796875,
"learning_rate": 6.450511945392492e-06,
"loss": 1.2494,
"step": 105
},
{
"epoch": 0.36177474402730375,
"grad_norm": 0.1630859375,
"learning_rate": 6.4163822525597275e-06,
"loss": 1.2387,
"step": 106
},
{
"epoch": 0.3651877133105802,
"grad_norm": 0.2392578125,
"learning_rate": 6.382252559726962e-06,
"loss": 1.3741,
"step": 107
},
{
"epoch": 0.36860068259385664,
"grad_norm": 0.162109375,
"learning_rate": 6.348122866894198e-06,
"loss": 1.2862,
"step": 108
},
{
"epoch": 0.3720136518771331,
"grad_norm": 0.1669921875,
"learning_rate": 6.313993174061434e-06,
"loss": 1.2937,
"step": 109
},
{
"epoch": 0.37542662116040953,
"grad_norm": 0.1533203125,
"learning_rate": 6.27986348122867e-06,
"loss": 1.2434,
"step": 110
},
{
"epoch": 0.378839590443686,
"grad_norm": 0.1611328125,
"learning_rate": 6.245733788395904e-06,
"loss": 1.3098,
"step": 111
},
{
"epoch": 0.3822525597269625,
"grad_norm": 0.2275390625,
"learning_rate": 6.211604095563141e-06,
"loss": 1.3057,
"step": 112
},
{
"epoch": 0.3856655290102389,
"grad_norm": 0.1630859375,
"learning_rate": 6.177474402730376e-06,
"loss": 1.2747,
"step": 113
},
{
"epoch": 0.3890784982935154,
"grad_norm": 0.22265625,
"learning_rate": 6.143344709897611e-06,
"loss": 1.2129,
"step": 114
},
{
"epoch": 0.3924914675767918,
"grad_norm": 0.1728515625,
"learning_rate": 6.109215017064847e-06,
"loss": 1.2678,
"step": 115
},
{
"epoch": 0.39590443686006827,
"grad_norm": 0.244140625,
"learning_rate": 6.075085324232083e-06,
"loss": 1.2789,
"step": 116
},
{
"epoch": 0.3993174061433447,
"grad_norm": 0.1484375,
"learning_rate": 6.0409556313993175e-06,
"loss": 1.2183,
"step": 117
},
{
"epoch": 0.40273037542662116,
"grad_norm": 0.2109375,
"learning_rate": 6.006825938566554e-06,
"loss": 1.2227,
"step": 118
},
{
"epoch": 0.4061433447098976,
"grad_norm": 0.177734375,
"learning_rate": 5.972696245733789e-06,
"loss": 1.2714,
"step": 119
},
{
"epoch": 0.40955631399317405,
"grad_norm": 0.1767578125,
"learning_rate": 5.938566552901024e-06,
"loss": 1.2793,
"step": 120
},
{
"epoch": 0.4129692832764505,
"grad_norm": 0.1923828125,
"learning_rate": 5.90443686006826e-06,
"loss": 1.1814,
"step": 121
},
{
"epoch": 0.41638225255972694,
"grad_norm": 0.2158203125,
"learning_rate": 5.870307167235495e-06,
"loss": 1.2612,
"step": 122
},
{
"epoch": 0.4197952218430034,
"grad_norm": 0.1533203125,
"learning_rate": 5.8361774744027315e-06,
"loss": 1.1921,
"step": 123
},
{
"epoch": 0.4232081911262799,
"grad_norm": 0.158203125,
"learning_rate": 5.802047781569966e-06,
"loss": 1.2512,
"step": 124
},
{
"epoch": 0.42662116040955633,
"grad_norm": 0.16796875,
"learning_rate": 5.767918088737202e-06,
"loss": 1.2593,
"step": 125
},
{
"epoch": 0.4300341296928328,
"grad_norm": 0.220703125,
"learning_rate": 5.733788395904437e-06,
"loss": 1.2941,
"step": 126
},
{
"epoch": 0.4334470989761092,
"grad_norm": 0.181640625,
"learning_rate": 5.699658703071673e-06,
"loss": 1.2128,
"step": 127
},
{
"epoch": 0.43686006825938567,
"grad_norm": 0.1611328125,
"learning_rate": 5.665529010238908e-06,
"loss": 1.2025,
"step": 128
},
{
"epoch": 0.4402730375426621,
"grad_norm": 0.1572265625,
"learning_rate": 5.631399317406145e-06,
"loss": 1.2257,
"step": 129
},
{
"epoch": 0.44368600682593856,
"grad_norm": 0.16015625,
"learning_rate": 5.597269624573379e-06,
"loss": 1.2245,
"step": 130
},
{
"epoch": 0.447098976109215,
"grad_norm": 0.1943359375,
"learning_rate": 5.5631399317406145e-06,
"loss": 1.1971,
"step": 131
},
{
"epoch": 0.45051194539249145,
"grad_norm": 0.1669921875,
"learning_rate": 5.529010238907851e-06,
"loss": 1.2063,
"step": 132
},
{
"epoch": 0.4539249146757679,
"grad_norm": 0.1611328125,
"learning_rate": 5.494880546075085e-06,
"loss": 1.2045,
"step": 133
},
{
"epoch": 0.45733788395904434,
"grad_norm": 0.1875,
"learning_rate": 5.4607508532423215e-06,
"loss": 1.3064,
"step": 134
},
{
"epoch": 0.46075085324232085,
"grad_norm": 0.18359375,
"learning_rate": 5.426621160409556e-06,
"loss": 1.1963,
"step": 135
},
{
"epoch": 0.4641638225255973,
"grad_norm": 0.1640625,
"learning_rate": 5.392491467576792e-06,
"loss": 1.2109,
"step": 136
},
{
"epoch": 0.46757679180887374,
"grad_norm": 0.16796875,
"learning_rate": 5.358361774744028e-06,
"loss": 1.2651,
"step": 137
},
{
"epoch": 0.4709897610921502,
"grad_norm": 0.1572265625,
"learning_rate": 5.324232081911264e-06,
"loss": 1.1986,
"step": 138
},
{
"epoch": 0.47440273037542663,
"grad_norm": 0.158203125,
"learning_rate": 5.290102389078498e-06,
"loss": 1.261,
"step": 139
},
{
"epoch": 0.4778156996587031,
"grad_norm": 0.1630859375,
"learning_rate": 5.255972696245735e-06,
"loss": 1.1949,
"step": 140
},
{
"epoch": 0.4812286689419795,
"grad_norm": 0.17578125,
"learning_rate": 5.22184300341297e-06,
"loss": 1.1967,
"step": 141
},
{
"epoch": 0.48464163822525597,
"grad_norm": 0.1611328125,
"learning_rate": 5.1877133105802046e-06,
"loss": 1.2054,
"step": 142
},
{
"epoch": 0.4880546075085324,
"grad_norm": 0.212890625,
"learning_rate": 5.153583617747441e-06,
"loss": 1.2582,
"step": 143
},
{
"epoch": 0.49146757679180886,
"grad_norm": 0.1962890625,
"learning_rate": 5.119453924914676e-06,
"loss": 1.2465,
"step": 144
},
{
"epoch": 0.4948805460750853,
"grad_norm": 0.1650390625,
"learning_rate": 5.0853242320819115e-06,
"loss": 1.1709,
"step": 145
},
{
"epoch": 0.49829351535836175,
"grad_norm": 0.1806640625,
"learning_rate": 5.051194539249147e-06,
"loss": 1.2741,
"step": 146
},
{
"epoch": 0.5017064846416383,
"grad_norm": 0.173828125,
"learning_rate": 5.017064846416383e-06,
"loss": 1.2093,
"step": 147
},
{
"epoch": 0.5051194539249146,
"grad_norm": 0.162109375,
"learning_rate": 4.982935153583618e-06,
"loss": 1.2348,
"step": 148
},
{
"epoch": 0.5085324232081911,
"grad_norm": 0.2001953125,
"learning_rate": 4.948805460750853e-06,
"loss": 1.2276,
"step": 149
},
{
"epoch": 0.5119453924914675,
"grad_norm": 0.158203125,
"learning_rate": 4.914675767918089e-06,
"loss": 1.2371,
"step": 150
},
{
"epoch": 0.515358361774744,
"grad_norm": 0.1640625,
"learning_rate": 4.880546075085325e-06,
"loss": 1.2013,
"step": 151
},
{
"epoch": 0.5187713310580204,
"grad_norm": 0.171875,
"learning_rate": 4.84641638225256e-06,
"loss": 1.1444,
"step": 152
},
{
"epoch": 0.5221843003412969,
"grad_norm": 0.193359375,
"learning_rate": 4.812286689419795e-06,
"loss": 1.2329,
"step": 153
},
{
"epoch": 0.5255972696245734,
"grad_norm": 0.158203125,
"learning_rate": 4.778156996587031e-06,
"loss": 1.2572,
"step": 154
},
{
"epoch": 0.5290102389078498,
"grad_norm": 0.2060546875,
"learning_rate": 4.744027303754267e-06,
"loss": 1.2378,
"step": 155
},
{
"epoch": 0.5324232081911263,
"grad_norm": 0.349609375,
"learning_rate": 4.709897610921502e-06,
"loss": 1.192,
"step": 156
},
{
"epoch": 0.5358361774744027,
"grad_norm": 0.1552734375,
"learning_rate": 4.675767918088738e-06,
"loss": 1.2053,
"step": 157
},
{
"epoch": 0.5392491467576792,
"grad_norm": 0.1669921875,
"learning_rate": 4.641638225255973e-06,
"loss": 1.2316,
"step": 158
},
{
"epoch": 0.5426621160409556,
"grad_norm": 0.16796875,
"learning_rate": 4.6075085324232085e-06,
"loss": 1.2648,
"step": 159
},
{
"epoch": 0.5460750853242321,
"grad_norm": 0.1572265625,
"learning_rate": 4.573378839590444e-06,
"loss": 1.1857,
"step": 160
},
{
"epoch": 0.5494880546075085,
"grad_norm": 0.15625,
"learning_rate": 4.539249146757679e-06,
"loss": 1.2094,
"step": 161
},
{
"epoch": 0.552901023890785,
"grad_norm": 0.1806640625,
"learning_rate": 4.505119453924915e-06,
"loss": 1.2234,
"step": 162
},
{
"epoch": 0.5563139931740614,
"grad_norm": 0.16015625,
"learning_rate": 4.47098976109215e-06,
"loss": 1.209,
"step": 163
},
{
"epoch": 0.5597269624573379,
"grad_norm": 0.185546875,
"learning_rate": 4.436860068259386e-06,
"loss": 1.234,
"step": 164
},
{
"epoch": 0.5631399317406144,
"grad_norm": 0.23828125,
"learning_rate": 4.402730375426622e-06,
"loss": 1.1981,
"step": 165
},
{
"epoch": 0.5665529010238908,
"grad_norm": 0.3125,
"learning_rate": 4.368600682593857e-06,
"loss": 1.2415,
"step": 166
},
{
"epoch": 0.5699658703071673,
"grad_norm": 0.1640625,
"learning_rate": 4.3344709897610924e-06,
"loss": 1.2153,
"step": 167
},
{
"epoch": 0.5733788395904437,
"grad_norm": 0.197265625,
"learning_rate": 4.300341296928328e-06,
"loss": 1.2421,
"step": 168
},
{
"epoch": 0.5767918088737202,
"grad_norm": 0.359375,
"learning_rate": 4.266211604095564e-06,
"loss": 1.2309,
"step": 169
},
{
"epoch": 0.5802047781569966,
"grad_norm": 0.158203125,
"learning_rate": 4.232081911262799e-06,
"loss": 1.178,
"step": 170
},
{
"epoch": 0.5836177474402731,
"grad_norm": 0.228515625,
"learning_rate": 4.197952218430034e-06,
"loss": 1.2564,
"step": 171
},
{
"epoch": 0.5870307167235495,
"grad_norm": 0.17578125,
"learning_rate": 4.163822525597269e-06,
"loss": 1.2054,
"step": 172
},
{
"epoch": 0.590443686006826,
"grad_norm": 0.16015625,
"learning_rate": 4.1296928327645055e-06,
"loss": 1.1599,
"step": 173
},
{
"epoch": 0.5938566552901023,
"grad_norm": 0.2109375,
"learning_rate": 4.095563139931741e-06,
"loss": 1.25,
"step": 174
},
{
"epoch": 0.5972696245733788,
"grad_norm": 0.224609375,
"learning_rate": 4.061433447098976e-06,
"loss": 1.2541,
"step": 175
},
{
"epoch": 0.6006825938566553,
"grad_norm": 0.1640625,
"learning_rate": 4.027303754266212e-06,
"loss": 1.1981,
"step": 176
},
{
"epoch": 0.6040955631399317,
"grad_norm": 0.1728515625,
"learning_rate": 3.993174061433447e-06,
"loss": 1.2048,
"step": 177
},
{
"epoch": 0.6075085324232082,
"grad_norm": 0.1728515625,
"learning_rate": 3.959044368600683e-06,
"loss": 1.2193,
"step": 178
},
{
"epoch": 0.6109215017064846,
"grad_norm": 0.302734375,
"learning_rate": 3.924914675767919e-06,
"loss": 1.2185,
"step": 179
},
{
"epoch": 0.6143344709897611,
"grad_norm": 0.31640625,
"learning_rate": 3.890784982935154e-06,
"loss": 1.2896,
"step": 180
},
{
"epoch": 0.6177474402730375,
"grad_norm": 0.1552734375,
"learning_rate": 3.8566552901023894e-06,
"loss": 1.1668,
"step": 181
},
{
"epoch": 0.621160409556314,
"grad_norm": 0.158203125,
"learning_rate": 3.822525597269625e-06,
"loss": 1.2119,
"step": 182
},
{
"epoch": 0.6245733788395904,
"grad_norm": 0.19921875,
"learning_rate": 3.78839590443686e-06,
"loss": 1.2454,
"step": 183
},
{
"epoch": 0.6279863481228669,
"grad_norm": 0.1884765625,
"learning_rate": 3.7542662116040956e-06,
"loss": 1.2037,
"step": 184
},
{
"epoch": 0.6313993174061433,
"grad_norm": 0.212890625,
"learning_rate": 3.7201365187713314e-06,
"loss": 1.1692,
"step": 185
},
{
"epoch": 0.6348122866894198,
"grad_norm": 0.1708984375,
"learning_rate": 3.6860068259385667e-06,
"loss": 1.2523,
"step": 186
},
{
"epoch": 0.6382252559726962,
"grad_norm": 0.16015625,
"learning_rate": 3.651877133105802e-06,
"loss": 1.2013,
"step": 187
},
{
"epoch": 0.6416382252559727,
"grad_norm": 0.1591796875,
"learning_rate": 3.617747440273038e-06,
"loss": 1.1724,
"step": 188
},
{
"epoch": 0.6450511945392492,
"grad_norm": 0.1923828125,
"learning_rate": 3.5836177474402733e-06,
"loss": 1.192,
"step": 189
},
{
"epoch": 0.6484641638225256,
"grad_norm": 0.18359375,
"learning_rate": 3.5494880546075087e-06,
"loss": 1.1876,
"step": 190
},
{
"epoch": 0.6518771331058021,
"grad_norm": 0.1884765625,
"learning_rate": 3.5153583617747445e-06,
"loss": 1.2077,
"step": 191
},
{
"epoch": 0.6552901023890785,
"grad_norm": 0.1513671875,
"learning_rate": 3.48122866894198e-06,
"loss": 1.1763,
"step": 192
},
{
"epoch": 0.658703071672355,
"grad_norm": 0.1611328125,
"learning_rate": 3.4470989761092157e-06,
"loss": 1.1838,
"step": 193
},
{
"epoch": 0.6621160409556314,
"grad_norm": 0.25390625,
"learning_rate": 3.412969283276451e-06,
"loss": 1.1777,
"step": 194
},
{
"epoch": 0.6655290102389079,
"grad_norm": 0.251953125,
"learning_rate": 3.378839590443686e-06,
"loss": 1.2041,
"step": 195
},
{
"epoch": 0.6689419795221843,
"grad_norm": 0.1728515625,
"learning_rate": 3.3447098976109214e-06,
"loss": 1.2381,
"step": 196
},
{
"epoch": 0.6723549488054608,
"grad_norm": 0.1591796875,
"learning_rate": 3.310580204778157e-06,
"loss": 1.1874,
"step": 197
},
{
"epoch": 0.6757679180887372,
"grad_norm": 0.20703125,
"learning_rate": 3.2764505119453926e-06,
"loss": 1.1845,
"step": 198
},
{
"epoch": 0.6791808873720137,
"grad_norm": 0.15625,
"learning_rate": 3.2423208191126284e-06,
"loss": 1.181,
"step": 199
},
{
"epoch": 0.6825938566552902,
"grad_norm": 0.185546875,
"learning_rate": 3.2081911262798638e-06,
"loss": 1.2325,
"step": 200
},
{
"epoch": 0.6860068259385665,
"grad_norm": 0.169921875,
"learning_rate": 3.174061433447099e-06,
"loss": 1.1921,
"step": 201
},
{
"epoch": 0.689419795221843,
"grad_norm": 0.1630859375,
"learning_rate": 3.139931740614335e-06,
"loss": 1.1966,
"step": 202
},
{
"epoch": 0.6928327645051194,
"grad_norm": 0.166015625,
"learning_rate": 3.1058020477815703e-06,
"loss": 1.1414,
"step": 203
},
{
"epoch": 0.6962457337883959,
"grad_norm": 0.1611328125,
"learning_rate": 3.0716723549488057e-06,
"loss": 1.1206,
"step": 204
},
{
"epoch": 0.6996587030716723,
"grad_norm": 0.189453125,
"learning_rate": 3.0375426621160415e-06,
"loss": 1.147,
"step": 205
},
{
"epoch": 0.7030716723549488,
"grad_norm": 0.216796875,
"learning_rate": 3.003412969283277e-06,
"loss": 1.1238,
"step": 206
},
{
"epoch": 0.7064846416382252,
"grad_norm": 0.1591796875,
"learning_rate": 2.969283276450512e-06,
"loss": 1.2101,
"step": 207
},
{
"epoch": 0.7098976109215017,
"grad_norm": 0.1611328125,
"learning_rate": 2.9351535836177476e-06,
"loss": 1.1966,
"step": 208
},
{
"epoch": 0.7133105802047781,
"grad_norm": 0.1611328125,
"learning_rate": 2.901023890784983e-06,
"loss": 1.1808,
"step": 209
},
{
"epoch": 0.7167235494880546,
"grad_norm": 0.1826171875,
"learning_rate": 2.8668941979522184e-06,
"loss": 1.2115,
"step": 210
},
{
"epoch": 0.7201365187713311,
"grad_norm": 0.162109375,
"learning_rate": 2.832764505119454e-06,
"loss": 1.2059,
"step": 211
},
{
"epoch": 0.7235494880546075,
"grad_norm": 0.26953125,
"learning_rate": 2.7986348122866896e-06,
"loss": 1.1821,
"step": 212
},
{
"epoch": 0.726962457337884,
"grad_norm": 0.2021484375,
"learning_rate": 2.7645051194539254e-06,
"loss": 1.2195,
"step": 213
},
{
"epoch": 0.7303754266211604,
"grad_norm": 0.158203125,
"learning_rate": 2.7303754266211608e-06,
"loss": 1.1515,
"step": 214
},
{
"epoch": 0.7337883959044369,
"grad_norm": 0.166015625,
"learning_rate": 2.696245733788396e-06,
"loss": 1.2382,
"step": 215
},
{
"epoch": 0.7372013651877133,
"grad_norm": 0.162109375,
"learning_rate": 2.662116040955632e-06,
"loss": 1.1684,
"step": 216
},
{
"epoch": 0.7406143344709898,
"grad_norm": 0.173828125,
"learning_rate": 2.6279863481228673e-06,
"loss": 1.1336,
"step": 217
},
{
"epoch": 0.7440273037542662,
"grad_norm": 0.1669921875,
"learning_rate": 2.5938566552901023e-06,
"loss": 1.1603,
"step": 218
},
{
"epoch": 0.7474402730375427,
"grad_norm": 0.1572265625,
"learning_rate": 2.559726962457338e-06,
"loss": 1.1898,
"step": 219
},
{
"epoch": 0.7508532423208191,
"grad_norm": 0.1611328125,
"learning_rate": 2.5255972696245735e-06,
"loss": 1.2161,
"step": 220
},
{
"epoch": 0.7542662116040956,
"grad_norm": 0.1669921875,
"learning_rate": 2.491467576791809e-06,
"loss": 1.1927,
"step": 221
},
{
"epoch": 0.757679180887372,
"grad_norm": 0.154296875,
"learning_rate": 2.4573378839590446e-06,
"loss": 1.147,
"step": 222
},
{
"epoch": 0.7610921501706485,
"grad_norm": 0.1572265625,
"learning_rate": 2.42320819112628e-06,
"loss": 1.1513,
"step": 223
},
{
"epoch": 0.764505119453925,
"grad_norm": 0.2314453125,
"learning_rate": 2.3890784982935154e-06,
"loss": 1.1733,
"step": 224
},
{
"epoch": 0.7679180887372014,
"grad_norm": 0.291015625,
"learning_rate": 2.354948805460751e-06,
"loss": 1.1957,
"step": 225
},
{
"epoch": 0.7713310580204779,
"grad_norm": 0.169921875,
"learning_rate": 2.3208191126279866e-06,
"loss": 1.1909,
"step": 226
},
{
"epoch": 0.7747440273037542,
"grad_norm": 0.162109375,
"learning_rate": 2.286689419795222e-06,
"loss": 1.1856,
"step": 227
},
{
"epoch": 0.7781569965870307,
"grad_norm": 0.17578125,
"learning_rate": 2.2525597269624573e-06,
"loss": 1.146,
"step": 228
},
{
"epoch": 0.7815699658703071,
"grad_norm": 0.189453125,
"learning_rate": 2.218430034129693e-06,
"loss": 1.1223,
"step": 229
},
{
"epoch": 0.7849829351535836,
"grad_norm": 0.1650390625,
"learning_rate": 2.1843003412969285e-06,
"loss": 1.195,
"step": 230
},
{
"epoch": 0.78839590443686,
"grad_norm": 0.1591796875,
"learning_rate": 2.150170648464164e-06,
"loss": 1.1762,
"step": 231
},
{
"epoch": 0.7918088737201365,
"grad_norm": 0.1748046875,
"learning_rate": 2.1160409556313997e-06,
"loss": 1.2074,
"step": 232
},
{
"epoch": 0.7952218430034129,
"grad_norm": 0.166015625,
"learning_rate": 2.0819112627986347e-06,
"loss": 1.2056,
"step": 233
},
{
"epoch": 0.7986348122866894,
"grad_norm": 0.22265625,
"learning_rate": 2.0477815699658705e-06,
"loss": 1.1656,
"step": 234
},
{
"epoch": 0.8020477815699659,
"grad_norm": 0.2119140625,
"learning_rate": 2.013651877133106e-06,
"loss": 1.2169,
"step": 235
},
{
"epoch": 0.8054607508532423,
"grad_norm": 0.16796875,
"learning_rate": 1.9795221843003416e-06,
"loss": 1.1713,
"step": 236
},
{
"epoch": 0.8088737201365188,
"grad_norm": 0.1767578125,
"learning_rate": 1.945392491467577e-06,
"loss": 1.187,
"step": 237
},
{
"epoch": 0.8122866894197952,
"grad_norm": 0.248046875,
"learning_rate": 1.9112627986348124e-06,
"loss": 1.2144,
"step": 238
},
{
"epoch": 0.8156996587030717,
"grad_norm": 0.177734375,
"learning_rate": 1.8771331058020478e-06,
"loss": 1.2708,
"step": 239
},
{
"epoch": 0.8191126279863481,
"grad_norm": 0.169921875,
"learning_rate": 1.8430034129692834e-06,
"loss": 1.2218,
"step": 240
},
{
"epoch": 0.8225255972696246,
"grad_norm": 0.1767578125,
"learning_rate": 1.808873720136519e-06,
"loss": 1.1734,
"step": 241
},
{
"epoch": 0.825938566552901,
"grad_norm": 0.1845703125,
"learning_rate": 1.7747440273037543e-06,
"loss": 1.2534,
"step": 242
},
{
"epoch": 0.8293515358361775,
"grad_norm": 0.2294921875,
"learning_rate": 1.74061433447099e-06,
"loss": 1.1887,
"step": 243
},
{
"epoch": 0.8327645051194539,
"grad_norm": 0.1572265625,
"learning_rate": 1.7064846416382255e-06,
"loss": 1.1736,
"step": 244
},
{
"epoch": 0.8361774744027304,
"grad_norm": 0.1748046875,
"learning_rate": 1.6723549488054607e-06,
"loss": 1.1802,
"step": 245
},
{
"epoch": 0.8395904436860068,
"grad_norm": 0.1708984375,
"learning_rate": 1.6382252559726963e-06,
"loss": 1.1557,
"step": 246
},
{
"epoch": 0.8430034129692833,
"grad_norm": 0.1875,
"learning_rate": 1.6040955631399319e-06,
"loss": 1.162,
"step": 247
},
{
"epoch": 0.8464163822525598,
"grad_norm": 0.208984375,
"learning_rate": 1.5699658703071675e-06,
"loss": 1.1774,
"step": 248
},
{
"epoch": 0.8498293515358362,
"grad_norm": 0.27734375,
"learning_rate": 1.5358361774744028e-06,
"loss": 1.1862,
"step": 249
},
{
"epoch": 0.8532423208191127,
"grad_norm": 0.169921875,
"learning_rate": 1.5017064846416384e-06,
"loss": 1.183,
"step": 250
},
{
"epoch": 0.856655290102389,
"grad_norm": 0.1982421875,
"learning_rate": 1.4675767918088738e-06,
"loss": 1.1744,
"step": 251
},
{
"epoch": 0.8600682593856656,
"grad_norm": 0.162109375,
"learning_rate": 1.4334470989761092e-06,
"loss": 1.1734,
"step": 252
},
{
"epoch": 0.863481228668942,
"grad_norm": 0.193359375,
"learning_rate": 1.3993174061433448e-06,
"loss": 1.1626,
"step": 253
},
{
"epoch": 0.8668941979522184,
"grad_norm": 0.267578125,
"learning_rate": 1.3651877133105804e-06,
"loss": 1.2057,
"step": 254
},
{
"epoch": 0.8703071672354948,
"grad_norm": 0.208984375,
"learning_rate": 1.331058020477816e-06,
"loss": 1.1584,
"step": 255
},
{
"epoch": 0.8737201365187713,
"grad_norm": 0.275390625,
"learning_rate": 1.2969283276450511e-06,
"loss": 1.142,
"step": 256
},
{
"epoch": 0.8771331058020477,
"grad_norm": 0.2021484375,
"learning_rate": 1.2627986348122867e-06,
"loss": 1.1179,
"step": 257
},
{
"epoch": 0.8805460750853242,
"grad_norm": 0.177734375,
"learning_rate": 1.2286689419795223e-06,
"loss": 1.1862,
"step": 258
},
{
"epoch": 0.8839590443686007,
"grad_norm": 0.1630859375,
"learning_rate": 1.1945392491467577e-06,
"loss": 1.1537,
"step": 259
},
{
"epoch": 0.8873720136518771,
"grad_norm": 0.166015625,
"learning_rate": 1.1604095563139933e-06,
"loss": 1.1028,
"step": 260
},
{
"epoch": 0.8907849829351536,
"grad_norm": 0.251953125,
"learning_rate": 1.1262798634812287e-06,
"loss": 1.1959,
"step": 261
},
{
"epoch": 0.89419795221843,
"grad_norm": 0.15625,
"learning_rate": 1.0921501706484643e-06,
"loss": 1.197,
"step": 262
},
{
"epoch": 0.8976109215017065,
"grad_norm": 0.16796875,
"learning_rate": 1.0580204778156999e-06,
"loss": 1.1256,
"step": 263
},
{
"epoch": 0.9010238907849829,
"grad_norm": 0.1748046875,
"learning_rate": 1.0238907849829352e-06,
"loss": 1.2366,
"step": 264
},
{
"epoch": 0.9044368600682594,
"grad_norm": 0.201171875,
"learning_rate": 9.897610921501708e-07,
"loss": 1.192,
"step": 265
},
{
"epoch": 0.9078498293515358,
"grad_norm": 0.1669921875,
"learning_rate": 9.556313993174062e-07,
"loss": 1.1641,
"step": 266
},
{
"epoch": 0.9112627986348123,
"grad_norm": 0.18359375,
"learning_rate": 9.215017064846417e-07,
"loss": 1.2233,
"step": 267
},
{
"epoch": 0.9146757679180887,
"grad_norm": 0.2041015625,
"learning_rate": 8.873720136518772e-07,
"loss": 1.1396,
"step": 268
},
{
"epoch": 0.9180887372013652,
"grad_norm": 0.1845703125,
"learning_rate": 8.532423208191128e-07,
"loss": 1.2493,
"step": 269
},
{
"epoch": 0.9215017064846417,
"grad_norm": 0.2236328125,
"learning_rate": 8.191126279863481e-07,
"loss": 1.1951,
"step": 270
},
{
"epoch": 0.9249146757679181,
"grad_norm": 0.1611328125,
"learning_rate": 7.849829351535837e-07,
"loss": 1.1482,
"step": 271
},
{
"epoch": 0.9283276450511946,
"grad_norm": 0.1611328125,
"learning_rate": 7.508532423208192e-07,
"loss": 1.1803,
"step": 272
},
{
"epoch": 0.931740614334471,
"grad_norm": 0.259765625,
"learning_rate": 7.167235494880546e-07,
"loss": 1.1707,
"step": 273
},
{
"epoch": 0.9351535836177475,
"grad_norm": 0.1611328125,
"learning_rate": 6.825938566552902e-07,
"loss": 1.1746,
"step": 274
},
{
"epoch": 0.9385665529010239,
"grad_norm": 0.162109375,
"learning_rate": 6.484641638225256e-07,
"loss": 1.2075,
"step": 275
},
{
"epoch": 0.9419795221843004,
"grad_norm": 0.2099609375,
"learning_rate": 6.143344709897612e-07,
"loss": 1.1349,
"step": 276
},
{
"epoch": 0.9453924914675768,
"grad_norm": 0.1611328125,
"learning_rate": 5.802047781569966e-07,
"loss": 1.1804,
"step": 277
},
{
"epoch": 0.9488054607508533,
"grad_norm": 0.1611328125,
"learning_rate": 5.460750853242321e-07,
"loss": 1.1798,
"step": 278
},
{
"epoch": 0.9522184300341296,
"grad_norm": 0.162109375,
"learning_rate": 5.119453924914676e-07,
"loss": 1.1949,
"step": 279
},
{
"epoch": 0.9556313993174061,
"grad_norm": 0.23046875,
"learning_rate": 4.778156996587031e-07,
"loss": 1.2302,
"step": 280
},
{
"epoch": 0.9590443686006825,
"grad_norm": 0.1640625,
"learning_rate": 4.436860068259386e-07,
"loss": 1.1692,
"step": 281
},
{
"epoch": 0.962457337883959,
"grad_norm": 0.1572265625,
"learning_rate": 4.0955631399317407e-07,
"loss": 1.2129,
"step": 282
},
{
"epoch": 0.9658703071672355,
"grad_norm": 0.326171875,
"learning_rate": 3.754266211604096e-07,
"loss": 1.1722,
"step": 283
},
{
"epoch": 0.9692832764505119,
"grad_norm": 0.1630859375,
"learning_rate": 3.412969283276451e-07,
"loss": 1.1918,
"step": 284
},
{
"epoch": 0.9726962457337884,
"grad_norm": 0.244140625,
"learning_rate": 3.071672354948806e-07,
"loss": 1.2091,
"step": 285
},
{
"epoch": 0.9761092150170648,
"grad_norm": 0.228515625,
"learning_rate": 2.7303754266211607e-07,
"loss": 1.1193,
"step": 286
},
{
"epoch": 0.9795221843003413,
"grad_norm": 0.17578125,
"learning_rate": 2.3890784982935155e-07,
"loss": 1.1617,
"step": 287
},
{
"epoch": 0.9829351535836177,
"grad_norm": 0.1591796875,
"learning_rate": 2.0477815699658704e-07,
"loss": 1.1884,
"step": 288
},
{
"epoch": 0.9863481228668942,
"grad_norm": 0.2060546875,
"learning_rate": 1.7064846416382255e-07,
"loss": 1.1583,
"step": 289
},
{
"epoch": 0.9897610921501706,
"grad_norm": 0.16796875,
"learning_rate": 1.3651877133105803e-07,
"loss": 1.1518,
"step": 290
},
{
"epoch": 0.9931740614334471,
"grad_norm": 0.162109375,
"learning_rate": 1.0238907849829352e-07,
"loss": 1.1679,
"step": 291
},
{
"epoch": 0.9965870307167235,
"grad_norm": 0.193359375,
"learning_rate": 6.825938566552902e-08,
"loss": 1.1069,
"step": 292
},
{
"epoch": 1.0,
"grad_norm": 0.1708984375,
"learning_rate": 3.412969283276451e-08,
"loss": 1.1428,
"step": 293
}
],
"logging_steps": 1.0,
"max_steps": 293,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.379910643320095e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}