{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 0,
"global_step": 247,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004048582995951417,
"grad_norm": 0.396484375,
"learning_rate": 1e-05,
"loss": 1.7468,
"step": 1
},
{
"epoch": 0.008097165991902834,
"grad_norm": 0.396484375,
"learning_rate": 9.959514170040487e-06,
"loss": 1.7975,
"step": 2
},
{
"epoch": 0.012145748987854251,
"grad_norm": 0.388671875,
"learning_rate": 9.919028340080973e-06,
"loss": 1.7482,
"step": 3
},
{
"epoch": 0.016194331983805668,
"grad_norm": 0.443359375,
"learning_rate": 9.878542510121458e-06,
"loss": 1.7389,
"step": 4
},
{
"epoch": 0.020242914979757085,
"grad_norm": 0.36328125,
"learning_rate": 9.838056680161944e-06,
"loss": 1.719,
"step": 5
},
{
"epoch": 0.024291497975708502,
"grad_norm": 0.3515625,
"learning_rate": 9.79757085020243e-06,
"loss": 1.7169,
"step": 6
},
{
"epoch": 0.02834008097165992,
"grad_norm": 0.353515625,
"learning_rate": 9.757085020242916e-06,
"loss": 1.7607,
"step": 7
},
{
"epoch": 0.032388663967611336,
"grad_norm": 0.330078125,
"learning_rate": 9.7165991902834e-06,
"loss": 1.7605,
"step": 8
},
{
"epoch": 0.03643724696356275,
"grad_norm": 0.31640625,
"learning_rate": 9.676113360323888e-06,
"loss": 1.7096,
"step": 9
},
{
"epoch": 0.04048582995951417,
"grad_norm": 0.27734375,
"learning_rate": 9.635627530364373e-06,
"loss": 1.7763,
"step": 10
},
{
"epoch": 0.044534412955465584,
"grad_norm": 0.283203125,
"learning_rate": 9.595141700404859e-06,
"loss": 1.7735,
"step": 11
},
{
"epoch": 0.048582995951417005,
"grad_norm": 0.2734375,
"learning_rate": 9.554655870445345e-06,
"loss": 1.6435,
"step": 12
},
{
"epoch": 0.05263157894736842,
"grad_norm": 0.240234375,
"learning_rate": 9.514170040485831e-06,
"loss": 1.6785,
"step": 13
},
{
"epoch": 0.05668016194331984,
"grad_norm": 0.255859375,
"learning_rate": 9.473684210526315e-06,
"loss": 1.6617,
"step": 14
},
{
"epoch": 0.06072874493927125,
"grad_norm": 0.2373046875,
"learning_rate": 9.433198380566803e-06,
"loss": 1.6475,
"step": 15
},
{
"epoch": 0.06477732793522267,
"grad_norm": 0.236328125,
"learning_rate": 9.392712550607288e-06,
"loss": 1.628,
"step": 16
},
{
"epoch": 0.06882591093117409,
"grad_norm": 0.228515625,
"learning_rate": 9.352226720647774e-06,
"loss": 1.6464,
"step": 17
},
{
"epoch": 0.0728744939271255,
"grad_norm": 0.23046875,
"learning_rate": 9.31174089068826e-06,
"loss": 1.6582,
"step": 18
},
{
"epoch": 0.07692307692307693,
"grad_norm": 0.232421875,
"learning_rate": 9.271255060728746e-06,
"loss": 1.719,
"step": 19
},
{
"epoch": 0.08097165991902834,
"grad_norm": 0.2138671875,
"learning_rate": 9.230769230769232e-06,
"loss": 1.5597,
"step": 20
},
{
"epoch": 0.08502024291497975,
"grad_norm": 0.2451171875,
"learning_rate": 9.190283400809717e-06,
"loss": 1.5132,
"step": 21
},
{
"epoch": 0.08906882591093117,
"grad_norm": 0.203125,
"learning_rate": 9.149797570850203e-06,
"loss": 1.564,
"step": 22
},
{
"epoch": 0.0931174089068826,
"grad_norm": 0.1953125,
"learning_rate": 9.109311740890689e-06,
"loss": 1.5743,
"step": 23
},
{
"epoch": 0.09716599190283401,
"grad_norm": 0.1943359375,
"learning_rate": 9.068825910931175e-06,
"loss": 1.5936,
"step": 24
},
{
"epoch": 0.10121457489878542,
"grad_norm": 0.1787109375,
"learning_rate": 9.02834008097166e-06,
"loss": 1.6408,
"step": 25
},
{
"epoch": 0.10526315789473684,
"grad_norm": 0.177734375,
"learning_rate": 8.987854251012147e-06,
"loss": 1.5942,
"step": 26
},
{
"epoch": 0.10931174089068826,
"grad_norm": 0.2158203125,
"learning_rate": 8.947368421052632e-06,
"loss": 1.5207,
"step": 27
},
{
"epoch": 0.11336032388663968,
"grad_norm": 0.16796875,
"learning_rate": 8.906882591093118e-06,
"loss": 1.5517,
"step": 28
},
{
"epoch": 0.11740890688259109,
"grad_norm": 0.1708984375,
"learning_rate": 8.866396761133604e-06,
"loss": 1.6105,
"step": 29
},
{
"epoch": 0.1214574898785425,
"grad_norm": 0.1904296875,
"learning_rate": 8.82591093117409e-06,
"loss": 1.633,
"step": 30
},
{
"epoch": 0.12550607287449392,
"grad_norm": 0.16796875,
"learning_rate": 8.785425101214575e-06,
"loss": 1.517,
"step": 31
},
{
"epoch": 0.12955465587044535,
"grad_norm": 0.1611328125,
"learning_rate": 8.744939271255063e-06,
"loss": 1.5337,
"step": 32
},
{
"epoch": 0.13360323886639677,
"grad_norm": 0.169921875,
"learning_rate": 8.704453441295547e-06,
"loss": 1.5565,
"step": 33
},
{
"epoch": 0.13765182186234817,
"grad_norm": 0.16015625,
"learning_rate": 8.663967611336033e-06,
"loss": 1.466,
"step": 34
},
{
"epoch": 0.1417004048582996,
"grad_norm": 0.1669921875,
"learning_rate": 8.62348178137652e-06,
"loss": 1.5937,
"step": 35
},
{
"epoch": 0.145748987854251,
"grad_norm": 0.1689453125,
"learning_rate": 8.582995951417005e-06,
"loss": 1.5475,
"step": 36
},
{
"epoch": 0.14979757085020243,
"grad_norm": 0.294921875,
"learning_rate": 8.54251012145749e-06,
"loss": 1.499,
"step": 37
},
{
"epoch": 0.15384615384615385,
"grad_norm": 0.1572265625,
"learning_rate": 8.502024291497976e-06,
"loss": 1.5268,
"step": 38
},
{
"epoch": 0.15789473684210525,
"grad_norm": 0.1640625,
"learning_rate": 8.461538461538462e-06,
"loss": 1.6107,
"step": 39
},
{
"epoch": 0.16194331983805668,
"grad_norm": 0.158203125,
"learning_rate": 8.421052631578948e-06,
"loss": 1.5444,
"step": 40
},
{
"epoch": 0.1659919028340081,
"grad_norm": 0.15234375,
"learning_rate": 8.380566801619434e-06,
"loss": 1.4915,
"step": 41
},
{
"epoch": 0.1700404858299595,
"grad_norm": 0.1650390625,
"learning_rate": 8.340080971659919e-06,
"loss": 1.4974,
"step": 42
},
{
"epoch": 0.17408906882591094,
"grad_norm": 0.1904296875,
"learning_rate": 8.299595141700405e-06,
"loss": 1.4986,
"step": 43
},
{
"epoch": 0.17813765182186234,
"grad_norm": 0.1591796875,
"learning_rate": 8.259109311740891e-06,
"loss": 1.4471,
"step": 44
},
{
"epoch": 0.18218623481781376,
"grad_norm": 0.169921875,
"learning_rate": 8.218623481781377e-06,
"loss": 1.4408,
"step": 45
},
{
"epoch": 0.1862348178137652,
"grad_norm": 0.1494140625,
"learning_rate": 8.178137651821862e-06,
"loss": 1.4718,
"step": 46
},
{
"epoch": 0.1902834008097166,
"grad_norm": 0.1533203125,
"learning_rate": 8.13765182186235e-06,
"loss": 1.4133,
"step": 47
},
{
"epoch": 0.19433198380566802,
"grad_norm": 0.1494140625,
"learning_rate": 8.097165991902834e-06,
"loss": 1.5276,
"step": 48
},
{
"epoch": 0.19838056680161945,
"grad_norm": 0.15234375,
"learning_rate": 8.056680161943322e-06,
"loss": 1.4762,
"step": 49
},
{
"epoch": 0.20242914979757085,
"grad_norm": 0.1435546875,
"learning_rate": 8.016194331983806e-06,
"loss": 1.4971,
"step": 50
},
{
"epoch": 0.20647773279352227,
"grad_norm": 0.1533203125,
"learning_rate": 7.975708502024292e-06,
"loss": 1.4749,
"step": 51
},
{
"epoch": 0.21052631578947367,
"grad_norm": 0.1552734375,
"learning_rate": 7.935222672064778e-06,
"loss": 1.4871,
"step": 52
},
{
"epoch": 0.2145748987854251,
"grad_norm": 0.1416015625,
"learning_rate": 7.894736842105265e-06,
"loss": 1.4168,
"step": 53
},
{
"epoch": 0.21862348178137653,
"grad_norm": 0.14453125,
"learning_rate": 7.854251012145749e-06,
"loss": 1.5185,
"step": 54
},
{
"epoch": 0.22267206477732793,
"grad_norm": 0.146484375,
"learning_rate": 7.813765182186235e-06,
"loss": 1.4746,
"step": 55
},
{
"epoch": 0.22672064777327935,
"grad_norm": 0.1396484375,
"learning_rate": 7.773279352226721e-06,
"loss": 1.5009,
"step": 56
},
{
"epoch": 0.23076923076923078,
"grad_norm": 0.140625,
"learning_rate": 7.732793522267207e-06,
"loss": 1.4282,
"step": 57
},
{
"epoch": 0.23481781376518218,
"grad_norm": 0.146484375,
"learning_rate": 7.692307692307694e-06,
"loss": 1.5008,
"step": 58
},
{
"epoch": 0.2388663967611336,
"grad_norm": 0.14453125,
"learning_rate": 7.651821862348178e-06,
"loss": 1.461,
"step": 59
},
{
"epoch": 0.242914979757085,
"grad_norm": 0.154296875,
"learning_rate": 7.611336032388664e-06,
"loss": 1.3517,
"step": 60
},
{
"epoch": 0.24696356275303644,
"grad_norm": 0.1455078125,
"learning_rate": 7.570850202429151e-06,
"loss": 1.4487,
"step": 61
},
{
"epoch": 0.25101214574898784,
"grad_norm": 0.1728515625,
"learning_rate": 7.5303643724696364e-06,
"loss": 1.4154,
"step": 62
},
{
"epoch": 0.2550607287449393,
"grad_norm": 0.138671875,
"learning_rate": 7.489878542510122e-06,
"loss": 1.4261,
"step": 63
},
{
"epoch": 0.2591093117408907,
"grad_norm": 0.1435546875,
"learning_rate": 7.449392712550608e-06,
"loss": 1.3977,
"step": 64
},
{
"epoch": 0.2631578947368421,
"grad_norm": 0.14453125,
"learning_rate": 7.408906882591094e-06,
"loss": 1.3872,
"step": 65
},
{
"epoch": 0.26720647773279355,
"grad_norm": 0.21484375,
"learning_rate": 7.368421052631579e-06,
"loss": 1.4218,
"step": 66
},
{
"epoch": 0.27125506072874495,
"grad_norm": 0.15234375,
"learning_rate": 7.327935222672065e-06,
"loss": 1.4589,
"step": 67
},
{
"epoch": 0.27530364372469635,
"grad_norm": 0.1455078125,
"learning_rate": 7.2874493927125516e-06,
"loss": 1.4256,
"step": 68
},
{
"epoch": 0.2793522267206478,
"grad_norm": 0.134765625,
"learning_rate": 7.246963562753037e-06,
"loss": 1.4481,
"step": 69
},
{
"epoch": 0.2834008097165992,
"grad_norm": 0.1591796875,
"learning_rate": 7.206477732793523e-06,
"loss": 1.3412,
"step": 70
},
{
"epoch": 0.2874493927125506,
"grad_norm": 0.1396484375,
"learning_rate": 7.165991902834008e-06,
"loss": 1.4677,
"step": 71
},
{
"epoch": 0.291497975708502,
"grad_norm": 0.1298828125,
"learning_rate": 7.125506072874494e-06,
"loss": 1.4729,
"step": 72
},
{
"epoch": 0.29554655870445345,
"grad_norm": 0.1337890625,
"learning_rate": 7.0850202429149805e-06,
"loss": 1.4513,
"step": 73
},
{
"epoch": 0.29959514170040485,
"grad_norm": 0.1396484375,
"learning_rate": 7.044534412955466e-06,
"loss": 1.3872,
"step": 74
},
{
"epoch": 0.30364372469635625,
"grad_norm": 0.1328125,
"learning_rate": 7.004048582995951e-06,
"loss": 1.3805,
"step": 75
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.1669921875,
"learning_rate": 6.963562753036438e-06,
"loss": 1.3883,
"step": 76
},
{
"epoch": 0.3117408906882591,
"grad_norm": 0.2275390625,
"learning_rate": 6.923076923076923e-06,
"loss": 1.363,
"step": 77
},
{
"epoch": 0.3157894736842105,
"grad_norm": 0.150390625,
"learning_rate": 6.882591093117409e-06,
"loss": 1.3539,
"step": 78
},
{
"epoch": 0.31983805668016196,
"grad_norm": 0.162109375,
"learning_rate": 6.842105263157896e-06,
"loss": 1.5081,
"step": 79
},
{
"epoch": 0.32388663967611336,
"grad_norm": 0.1416015625,
"learning_rate": 6.801619433198381e-06,
"loss": 1.4678,
"step": 80
},
{
"epoch": 0.32793522267206476,
"grad_norm": 0.1396484375,
"learning_rate": 6.761133603238867e-06,
"loss": 1.4031,
"step": 81
},
{
"epoch": 0.3319838056680162,
"grad_norm": 0.16796875,
"learning_rate": 6.720647773279353e-06,
"loss": 1.3495,
"step": 82
},
{
"epoch": 0.3360323886639676,
"grad_norm": 0.1728515625,
"learning_rate": 6.6801619433198385e-06,
"loss": 1.4959,
"step": 83
},
{
"epoch": 0.340080971659919,
"grad_norm": 0.138671875,
"learning_rate": 6.639676113360325e-06,
"loss": 1.4214,
"step": 84
},
{
"epoch": 0.3441295546558704,
"grad_norm": 0.1357421875,
"learning_rate": 6.599190283400811e-06,
"loss": 1.3601,
"step": 85
},
{
"epoch": 0.3481781376518219,
"grad_norm": 0.150390625,
"learning_rate": 6.558704453441296e-06,
"loss": 1.4616,
"step": 86
},
{
"epoch": 0.3522267206477733,
"grad_norm": 0.169921875,
"learning_rate": 6.518218623481782e-06,
"loss": 1.3694,
"step": 87
},
{
"epoch": 0.3562753036437247,
"grad_norm": 0.146484375,
"learning_rate": 6.4777327935222675e-06,
"loss": 1.3776,
"step": 88
},
{
"epoch": 0.3603238866396761,
"grad_norm": 0.154296875,
"learning_rate": 6.437246963562754e-06,
"loss": 1.38,
"step": 89
},
{
"epoch": 0.3643724696356275,
"grad_norm": 0.138671875,
"learning_rate": 6.39676113360324e-06,
"loss": 1.3726,
"step": 90
},
{
"epoch": 0.3684210526315789,
"grad_norm": 0.1806640625,
"learning_rate": 6.356275303643725e-06,
"loss": 1.3127,
"step": 91
},
{
"epoch": 0.3724696356275304,
"grad_norm": 0.142578125,
"learning_rate": 6.31578947368421e-06,
"loss": 1.4069,
"step": 92
},
{
"epoch": 0.3765182186234818,
"grad_norm": 0.16015625,
"learning_rate": 6.275303643724697e-06,
"loss": 1.4103,
"step": 93
},
{
"epoch": 0.3805668016194332,
"grad_norm": 0.1484375,
"learning_rate": 6.234817813765183e-06,
"loss": 1.3563,
"step": 94
},
{
"epoch": 0.38461538461538464,
"grad_norm": 0.1572265625,
"learning_rate": 6.194331983805668e-06,
"loss": 1.4154,
"step": 95
},
{
"epoch": 0.38866396761133604,
"grad_norm": 0.1572265625,
"learning_rate": 6.153846153846155e-06,
"loss": 1.4294,
"step": 96
},
{
"epoch": 0.39271255060728744,
"grad_norm": 0.1484375,
"learning_rate": 6.11336032388664e-06,
"loss": 1.3781,
"step": 97
},
{
"epoch": 0.3967611336032389,
"grad_norm": 0.1650390625,
"learning_rate": 6.0728744939271254e-06,
"loss": 1.3883,
"step": 98
},
{
"epoch": 0.4008097165991903,
"grad_norm": 0.1533203125,
"learning_rate": 6.0323886639676124e-06,
"loss": 1.3806,
"step": 99
},
{
"epoch": 0.4048582995951417,
"grad_norm": 0.146484375,
"learning_rate": 5.991902834008098e-06,
"loss": 1.3858,
"step": 100
},
{
"epoch": 0.4089068825910931,
"grad_norm": 0.166015625,
"learning_rate": 5.951417004048583e-06,
"loss": 1.3581,
"step": 101
},
{
"epoch": 0.41295546558704455,
"grad_norm": 0.16015625,
"learning_rate": 5.91093117408907e-06,
"loss": 1.4532,
"step": 102
},
{
"epoch": 0.41700404858299595,
"grad_norm": 0.1416015625,
"learning_rate": 5.870445344129555e-06,
"loss": 1.3921,
"step": 103
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.1396484375,
"learning_rate": 5.8299595141700406e-06,
"loss": 1.3722,
"step": 104
},
{
"epoch": 0.4251012145748988,
"grad_norm": 0.15625,
"learning_rate": 5.789473684210527e-06,
"loss": 1.342,
"step": 105
},
{
"epoch": 0.4291497975708502,
"grad_norm": 0.1337890625,
"learning_rate": 5.748987854251013e-06,
"loss": 1.3459,
"step": 106
},
{
"epoch": 0.4331983805668016,
"grad_norm": 0.146484375,
"learning_rate": 5.708502024291498e-06,
"loss": 1.3981,
"step": 107
},
{
"epoch": 0.43724696356275305,
"grad_norm": 0.14453125,
"learning_rate": 5.668016194331984e-06,
"loss": 1.3834,
"step": 108
},
{
"epoch": 0.44129554655870445,
"grad_norm": 0.1689453125,
"learning_rate": 5.6275303643724695e-06,
"loss": 1.3381,
"step": 109
},
{
"epoch": 0.44534412955465585,
"grad_norm": 0.15234375,
"learning_rate": 5.5870445344129565e-06,
"loss": 1.3009,
"step": 110
},
{
"epoch": 0.4493927125506073,
"grad_norm": 0.15625,
"learning_rate": 5.546558704453442e-06,
"loss": 1.3596,
"step": 111
},
{
"epoch": 0.4534412955465587,
"grad_norm": 0.1357421875,
"learning_rate": 5.506072874493927e-06,
"loss": 1.3557,
"step": 112
},
{
"epoch": 0.4574898785425101,
"grad_norm": 0.1484375,
"learning_rate": 5.465587044534414e-06,
"loss": 1.3832,
"step": 113
},
{
"epoch": 0.46153846153846156,
"grad_norm": 0.138671875,
"learning_rate": 5.425101214574899e-06,
"loss": 1.3225,
"step": 114
},
{
"epoch": 0.46558704453441296,
"grad_norm": 0.1708984375,
"learning_rate": 5.384615384615385e-06,
"loss": 1.3469,
"step": 115
},
{
"epoch": 0.46963562753036436,
"grad_norm": 0.154296875,
"learning_rate": 5.344129554655872e-06,
"loss": 1.4027,
"step": 116
},
{
"epoch": 0.47368421052631576,
"grad_norm": 0.1533203125,
"learning_rate": 5.303643724696357e-06,
"loss": 1.3497,
"step": 117
},
{
"epoch": 0.4777327935222672,
"grad_norm": 0.1572265625,
"learning_rate": 5.263157894736842e-06,
"loss": 1.3458,
"step": 118
},
{
"epoch": 0.4817813765182186,
"grad_norm": 0.1865234375,
"learning_rate": 5.222672064777329e-06,
"loss": 1.2949,
"step": 119
},
{
"epoch": 0.48582995951417,
"grad_norm": 0.140625,
"learning_rate": 5.1821862348178145e-06,
"loss": 1.3905,
"step": 120
},
{
"epoch": 0.4898785425101215,
"grad_norm": 0.1484375,
"learning_rate": 5.1417004048583e-06,
"loss": 1.4107,
"step": 121
},
{
"epoch": 0.4939271255060729,
"grad_norm": 0.1640625,
"learning_rate": 5.101214574898786e-06,
"loss": 1.3685,
"step": 122
},
{
"epoch": 0.4979757085020243,
"grad_norm": 0.15625,
"learning_rate": 5.060728744939272e-06,
"loss": 1.3311,
"step": 123
},
{
"epoch": 0.5020242914979757,
"grad_norm": 0.2314453125,
"learning_rate": 5.020242914979757e-06,
"loss": 1.3534,
"step": 124
},
{
"epoch": 0.5060728744939271,
"grad_norm": 0.154296875,
"learning_rate": 4.9797570850202435e-06,
"loss": 1.4142,
"step": 125
},
{
"epoch": 0.5101214574898786,
"grad_norm": 0.1552734375,
"learning_rate": 4.939271255060729e-06,
"loss": 1.2787,
"step": 126
},
{
"epoch": 0.5141700404858299,
"grad_norm": 0.14453125,
"learning_rate": 4.898785425101215e-06,
"loss": 1.322,
"step": 127
},
{
"epoch": 0.5182186234817814,
"grad_norm": 1.765625,
"learning_rate": 4.8582995951417e-06,
"loss": 1.397,
"step": 128
},
{
"epoch": 0.5222672064777328,
"grad_norm": 0.1953125,
"learning_rate": 4.817813765182186e-06,
"loss": 1.4061,
"step": 129
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.150390625,
"learning_rate": 4.7773279352226725e-06,
"loss": 1.3198,
"step": 130
},
{
"epoch": 0.5303643724696356,
"grad_norm": 0.15625,
"learning_rate": 4.736842105263158e-06,
"loss": 1.4233,
"step": 131
},
{
"epoch": 0.5344129554655871,
"grad_norm": 0.1650390625,
"learning_rate": 4.696356275303644e-06,
"loss": 1.2438,
"step": 132
},
{
"epoch": 0.5384615384615384,
"grad_norm": 0.44140625,
"learning_rate": 4.65587044534413e-06,
"loss": 1.2728,
"step": 133
},
{
"epoch": 0.5425101214574899,
"grad_norm": 0.1533203125,
"learning_rate": 4.615384615384616e-06,
"loss": 1.4425,
"step": 134
},
{
"epoch": 0.5465587044534413,
"grad_norm": 0.1533203125,
"learning_rate": 4.5748987854251014e-06,
"loss": 1.4055,
"step": 135
},
{
"epoch": 0.5506072874493927,
"grad_norm": 0.1572265625,
"learning_rate": 4.534412955465588e-06,
"loss": 1.2969,
"step": 136
},
{
"epoch": 0.5546558704453441,
"grad_norm": 0.1484375,
"learning_rate": 4.493927125506074e-06,
"loss": 1.3394,
"step": 137
},
{
"epoch": 0.5587044534412956,
"grad_norm": 0.169921875,
"learning_rate": 4.453441295546559e-06,
"loss": 1.2734,
"step": 138
},
{
"epoch": 0.562753036437247,
"grad_norm": 0.2099609375,
"learning_rate": 4.412955465587045e-06,
"loss": 1.2709,
"step": 139
},
{
"epoch": 0.5668016194331984,
"grad_norm": 0.1650390625,
"learning_rate": 4.372469635627531e-06,
"loss": 1.4363,
"step": 140
},
{
"epoch": 0.5708502024291497,
"grad_norm": 0.1533203125,
"learning_rate": 4.3319838056680166e-06,
"loss": 1.2841,
"step": 141
},
{
"epoch": 0.5748987854251012,
"grad_norm": 0.1845703125,
"learning_rate": 4.291497975708503e-06,
"loss": 1.4294,
"step": 142
},
{
"epoch": 0.5789473684210527,
"grad_norm": 0.1611328125,
"learning_rate": 4.251012145748988e-06,
"loss": 1.2398,
"step": 143
},
{
"epoch": 0.582995951417004,
"grad_norm": 0.146484375,
"learning_rate": 4.210526315789474e-06,
"loss": 1.2878,
"step": 144
},
{
"epoch": 0.5870445344129555,
"grad_norm": 0.5078125,
"learning_rate": 4.170040485829959e-06,
"loss": 1.3302,
"step": 145
},
{
"epoch": 0.5910931174089069,
"grad_norm": 0.158203125,
"learning_rate": 4.1295546558704455e-06,
"loss": 1.3835,
"step": 146
},
{
"epoch": 0.5951417004048583,
"grad_norm": 0.16796875,
"learning_rate": 4.089068825910931e-06,
"loss": 1.4713,
"step": 147
},
{
"epoch": 0.5991902834008097,
"grad_norm": 0.1435546875,
"learning_rate": 4.048582995951417e-06,
"loss": 1.3315,
"step": 148
},
{
"epoch": 0.6032388663967612,
"grad_norm": 0.150390625,
"learning_rate": 4.008097165991903e-06,
"loss": 1.3158,
"step": 149
},
{
"epoch": 0.6072874493927125,
"grad_norm": 0.1669921875,
"learning_rate": 3.967611336032389e-06,
"loss": 1.3414,
"step": 150
},
{
"epoch": 0.611336032388664,
"grad_norm": 0.1552734375,
"learning_rate": 3.9271255060728745e-06,
"loss": 1.4022,
"step": 151
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.1865234375,
"learning_rate": 3.886639676113361e-06,
"loss": 1.3256,
"step": 152
},
{
"epoch": 0.6194331983805668,
"grad_norm": 0.14453125,
"learning_rate": 3.846153846153847e-06,
"loss": 1.3395,
"step": 153
},
{
"epoch": 0.6234817813765182,
"grad_norm": 0.146484375,
"learning_rate": 3.805668016194332e-06,
"loss": 1.301,
"step": 154
},
{
"epoch": 0.6275303643724697,
"grad_norm": 0.142578125,
"learning_rate": 3.7651821862348182e-06,
"loss": 1.3438,
"step": 155
},
{
"epoch": 0.631578947368421,
"grad_norm": 0.1806640625,
"learning_rate": 3.724696356275304e-06,
"loss": 1.2879,
"step": 156
},
{
"epoch": 0.6356275303643725,
"grad_norm": 0.1845703125,
"learning_rate": 3.6842105263157896e-06,
"loss": 1.4294,
"step": 157
},
{
"epoch": 0.6396761133603239,
"grad_norm": 0.1640625,
"learning_rate": 3.6437246963562758e-06,
"loss": 1.4076,
"step": 158
},
{
"epoch": 0.6437246963562753,
"grad_norm": 0.15625,
"learning_rate": 3.6032388663967615e-06,
"loss": 1.3888,
"step": 159
},
{
"epoch": 0.6477732793522267,
"grad_norm": 0.1572265625,
"learning_rate": 3.562753036437247e-06,
"loss": 1.2494,
"step": 160
},
{
"epoch": 0.6518218623481782,
"grad_norm": 0.1591796875,
"learning_rate": 3.522267206477733e-06,
"loss": 1.3674,
"step": 161
},
{
"epoch": 0.6558704453441295,
"grad_norm": 0.1943359375,
"learning_rate": 3.481781376518219e-06,
"loss": 1.2075,
"step": 162
},
{
"epoch": 0.659919028340081,
"grad_norm": 0.1494140625,
"learning_rate": 3.4412955465587043e-06,
"loss": 1.3506,
"step": 163
},
{
"epoch": 0.6639676113360324,
"grad_norm": 0.193359375,
"learning_rate": 3.4008097165991905e-06,
"loss": 1.3442,
"step": 164
},
{
"epoch": 0.6680161943319838,
"grad_norm": 0.1494140625,
"learning_rate": 3.3603238866396766e-06,
"loss": 1.2816,
"step": 165
},
{
"epoch": 0.6720647773279352,
"grad_norm": 0.171875,
"learning_rate": 3.3198380566801623e-06,
"loss": 1.3368,
"step": 166
},
{
"epoch": 0.6761133603238867,
"grad_norm": 0.1611328125,
"learning_rate": 3.279352226720648e-06,
"loss": 1.366,
"step": 167
},
{
"epoch": 0.680161943319838,
"grad_norm": 0.1591796875,
"learning_rate": 3.2388663967611337e-06,
"loss": 1.3298,
"step": 168
},
{
"epoch": 0.6842105263157895,
"grad_norm": 0.1572265625,
"learning_rate": 3.19838056680162e-06,
"loss": 1.3258,
"step": 169
},
{
"epoch": 0.6882591093117408,
"grad_norm": 0.14453125,
"learning_rate": 3.157894736842105e-06,
"loss": 1.312,
"step": 170
},
{
"epoch": 0.6923076923076923,
"grad_norm": 0.2216796875,
"learning_rate": 3.1174089068825913e-06,
"loss": 1.3936,
"step": 171
},
{
"epoch": 0.6963562753036437,
"grad_norm": 0.1552734375,
"learning_rate": 3.0769230769230774e-06,
"loss": 1.35,
"step": 172
},
{
"epoch": 0.7004048582995951,
"grad_norm": 0.1611328125,
"learning_rate": 3.0364372469635627e-06,
"loss": 1.3484,
"step": 173
},
{
"epoch": 0.7044534412955465,
"grad_norm": 0.1708984375,
"learning_rate": 2.995951417004049e-06,
"loss": 1.3093,
"step": 174
},
{
"epoch": 0.708502024291498,
"grad_norm": 0.1689453125,
"learning_rate": 2.955465587044535e-06,
"loss": 1.3015,
"step": 175
},
{
"epoch": 0.7125506072874493,
"grad_norm": 0.1484375,
"learning_rate": 2.9149797570850203e-06,
"loss": 1.3365,
"step": 176
},
{
"epoch": 0.7165991902834008,
"grad_norm": 0.1533203125,
"learning_rate": 2.8744939271255064e-06,
"loss": 1.3336,
"step": 177
},
{
"epoch": 0.7206477732793523,
"grad_norm": 0.14453125,
"learning_rate": 2.834008097165992e-06,
"loss": 1.2927,
"step": 178
},
{
"epoch": 0.7246963562753036,
"grad_norm": 0.16796875,
"learning_rate": 2.7935222672064783e-06,
"loss": 1.29,
"step": 179
},
{
"epoch": 0.728744939271255,
"grad_norm": 0.177734375,
"learning_rate": 2.7530364372469636e-06,
"loss": 1.2423,
"step": 180
},
{
"epoch": 0.7327935222672065,
"grad_norm": 0.1611328125,
"learning_rate": 2.7125506072874497e-06,
"loss": 1.2913,
"step": 181
},
{
"epoch": 0.7368421052631579,
"grad_norm": 0.162109375,
"learning_rate": 2.672064777327936e-06,
"loss": 1.3249,
"step": 182
},
{
"epoch": 0.7408906882591093,
"grad_norm": 0.158203125,
"learning_rate": 2.631578947368421e-06,
"loss": 1.2356,
"step": 183
},
{
"epoch": 0.7449392712550608,
"grad_norm": 0.154296875,
"learning_rate": 2.5910931174089072e-06,
"loss": 1.2955,
"step": 184
},
{
"epoch": 0.7489878542510121,
"grad_norm": 0.1552734375,
"learning_rate": 2.550607287449393e-06,
"loss": 1.3227,
"step": 185
},
{
"epoch": 0.7530364372469636,
"grad_norm": 0.1611328125,
"learning_rate": 2.5101214574898787e-06,
"loss": 1.2789,
"step": 186
},
{
"epoch": 0.757085020242915,
"grad_norm": 0.1591796875,
"learning_rate": 2.4696356275303644e-06,
"loss": 1.2742,
"step": 187
},
{
"epoch": 0.7611336032388664,
"grad_norm": 0.1474609375,
"learning_rate": 2.42914979757085e-06,
"loss": 1.3393,
"step": 188
},
{
"epoch": 0.7651821862348178,
"grad_norm": 0.1513671875,
"learning_rate": 2.3886639676113362e-06,
"loss": 1.3122,
"step": 189
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.1494140625,
"learning_rate": 2.348178137651822e-06,
"loss": 1.3016,
"step": 190
},
{
"epoch": 0.7732793522267206,
"grad_norm": 0.1494140625,
"learning_rate": 2.307692307692308e-06,
"loss": 1.2886,
"step": 191
},
{
"epoch": 0.7773279352226721,
"grad_norm": 0.177734375,
"learning_rate": 2.267206477732794e-06,
"loss": 1.4194,
"step": 192
},
{
"epoch": 0.7813765182186235,
"grad_norm": 0.1640625,
"learning_rate": 2.2267206477732795e-06,
"loss": 1.244,
"step": 193
},
{
"epoch": 0.7854251012145749,
"grad_norm": 0.17578125,
"learning_rate": 2.1862348178137656e-06,
"loss": 1.2748,
"step": 194
},
{
"epoch": 0.7894736842105263,
"grad_norm": 0.1669921875,
"learning_rate": 2.1457489878542513e-06,
"loss": 1.3047,
"step": 195
},
{
"epoch": 0.7935222672064778,
"grad_norm": 0.181640625,
"learning_rate": 2.105263157894737e-06,
"loss": 1.3679,
"step": 196
},
{
"epoch": 0.7975708502024291,
"grad_norm": 0.16015625,
"learning_rate": 2.0647773279352228e-06,
"loss": 1.2566,
"step": 197
},
{
"epoch": 0.8016194331983806,
"grad_norm": 0.1455078125,
"learning_rate": 2.0242914979757085e-06,
"loss": 1.3056,
"step": 198
},
{
"epoch": 0.805668016194332,
"grad_norm": 0.1650390625,
"learning_rate": 1.9838056680161946e-06,
"loss": 1.32,
"step": 199
},
{
"epoch": 0.8097165991902834,
"grad_norm": 0.1572265625,
"learning_rate": 1.9433198380566803e-06,
"loss": 1.2582,
"step": 200
},
{
"epoch": 0.8137651821862348,
"grad_norm": 0.220703125,
"learning_rate": 1.902834008097166e-06,
"loss": 1.357,
"step": 201
},
{
"epoch": 0.8178137651821862,
"grad_norm": 0.154296875,
"learning_rate": 1.862348178137652e-06,
"loss": 1.3521,
"step": 202
},
{
"epoch": 0.8218623481781376,
"grad_norm": 0.1669921875,
"learning_rate": 1.8218623481781379e-06,
"loss": 1.3194,
"step": 203
},
{
"epoch": 0.8259109311740891,
"grad_norm": 0.1630859375,
"learning_rate": 1.7813765182186236e-06,
"loss": 1.3109,
"step": 204
},
{
"epoch": 0.8299595141700404,
"grad_norm": 0.169921875,
"learning_rate": 1.7408906882591095e-06,
"loss": 1.319,
"step": 205
},
{
"epoch": 0.8340080971659919,
"grad_norm": 0.1572265625,
"learning_rate": 1.7004048582995952e-06,
"loss": 1.2524,
"step": 206
},
{
"epoch": 0.8380566801619433,
"grad_norm": 0.171875,
"learning_rate": 1.6599190283400812e-06,
"loss": 1.2506,
"step": 207
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.162109375,
"learning_rate": 1.6194331983805669e-06,
"loss": 1.2782,
"step": 208
},
{
"epoch": 0.8461538461538461,
"grad_norm": 0.1513671875,
"learning_rate": 1.5789473684210526e-06,
"loss": 1.338,
"step": 209
},
{
"epoch": 0.8502024291497976,
"grad_norm": 0.1611328125,
"learning_rate": 1.5384615384615387e-06,
"loss": 1.3054,
"step": 210
},
{
"epoch": 0.854251012145749,
"grad_norm": 0.1650390625,
"learning_rate": 1.4979757085020244e-06,
"loss": 1.2981,
"step": 211
},
{
"epoch": 0.8582995951417004,
"grad_norm": 0.3359375,
"learning_rate": 1.4574898785425101e-06,
"loss": 1.2384,
"step": 212
},
{
"epoch": 0.8623481781376519,
"grad_norm": 0.1630859375,
"learning_rate": 1.417004048582996e-06,
"loss": 1.223,
"step": 213
},
{
"epoch": 0.8663967611336032,
"grad_norm": 0.1640625,
"learning_rate": 1.3765182186234818e-06,
"loss": 1.2893,
"step": 214
},
{
"epoch": 0.8704453441295547,
"grad_norm": 0.1787109375,
"learning_rate": 1.336032388663968e-06,
"loss": 1.3688,
"step": 215
},
{
"epoch": 0.8744939271255061,
"grad_norm": 0.2373046875,
"learning_rate": 1.2955465587044536e-06,
"loss": 1.2511,
"step": 216
},
{
"epoch": 0.8785425101214575,
"grad_norm": 0.1650390625,
"learning_rate": 1.2550607287449393e-06,
"loss": 1.2745,
"step": 217
},
{
"epoch": 0.8825910931174089,
"grad_norm": 0.1630859375,
"learning_rate": 1.214574898785425e-06,
"loss": 1.3063,
"step": 218
},
{
"epoch": 0.8866396761133604,
"grad_norm": 0.158203125,
"learning_rate": 1.174089068825911e-06,
"loss": 1.2903,
"step": 219
},
{
"epoch": 0.8906882591093117,
"grad_norm": 0.150390625,
"learning_rate": 1.133603238866397e-06,
"loss": 1.3452,
"step": 220
},
{
"epoch": 0.8947368421052632,
"grad_norm": 0.1982421875,
"learning_rate": 1.0931174089068828e-06,
"loss": 1.4376,
"step": 221
},
{
"epoch": 0.8987854251012146,
"grad_norm": 0.1982421875,
"learning_rate": 1.0526315789473685e-06,
"loss": 1.4119,
"step": 222
},
{
"epoch": 0.902834008097166,
"grad_norm": 0.173828125,
"learning_rate": 1.0121457489878542e-06,
"loss": 1.3001,
"step": 223
},
{
"epoch": 0.9068825910931174,
"grad_norm": 0.1572265625,
"learning_rate": 9.716599190283402e-07,
"loss": 1.3085,
"step": 224
},
{
"epoch": 0.9109311740890689,
"grad_norm": 0.162109375,
"learning_rate": 9.31174089068826e-07,
"loss": 1.2873,
"step": 225
},
{
"epoch": 0.9149797570850202,
"grad_norm": 0.19140625,
"learning_rate": 8.906882591093118e-07,
"loss": 1.2741,
"step": 226
},
{
"epoch": 0.9190283400809717,
"grad_norm": 0.1611328125,
"learning_rate": 8.502024291497976e-07,
"loss": 1.3929,
"step": 227
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.1962890625,
"learning_rate": 8.097165991902834e-07,
"loss": 1.2992,
"step": 228
},
{
"epoch": 0.9271255060728745,
"grad_norm": 0.1845703125,
"learning_rate": 7.692307692307694e-07,
"loss": 1.268,
"step": 229
},
{
"epoch": 0.9311740890688259,
"grad_norm": 0.2451171875,
"learning_rate": 7.287449392712551e-07,
"loss": 1.3517,
"step": 230
},
{
"epoch": 0.9352226720647774,
"grad_norm": 0.154296875,
"learning_rate": 6.882591093117409e-07,
"loss": 1.2013,
"step": 231
},
{
"epoch": 0.9392712550607287,
"grad_norm": 0.158203125,
"learning_rate": 6.477732793522268e-07,
"loss": 1.3459,
"step": 232
},
{
"epoch": 0.9433198380566802,
"grad_norm": 0.1728515625,
"learning_rate": 6.072874493927125e-07,
"loss": 1.3494,
"step": 233
},
{
"epoch": 0.9473684210526315,
"grad_norm": 0.1640625,
"learning_rate": 5.668016194331984e-07,
"loss": 1.2346,
"step": 234
},
{
"epoch": 0.951417004048583,
"grad_norm": 0.16015625,
"learning_rate": 5.263157894736843e-07,
"loss": 1.2858,
"step": 235
},
{
"epoch": 0.9554655870445344,
"grad_norm": 0.1962890625,
"learning_rate": 4.858299595141701e-07,
"loss": 1.2212,
"step": 236
},
{
"epoch": 0.9595141700404858,
"grad_norm": 0.16796875,
"learning_rate": 4.453441295546559e-07,
"loss": 1.3566,
"step": 237
},
{
"epoch": 0.9635627530364372,
"grad_norm": 0.15234375,
"learning_rate": 4.048582995951417e-07,
"loss": 1.374,
"step": 238
},
{
"epoch": 0.9676113360323887,
"grad_norm": 0.1669921875,
"learning_rate": 3.6437246963562754e-07,
"loss": 1.2331,
"step": 239
},
{
"epoch": 0.97165991902834,
"grad_norm": 0.1591796875,
"learning_rate": 3.238866396761134e-07,
"loss": 1.3375,
"step": 240
},
{
"epoch": 0.9757085020242915,
"grad_norm": 0.1748046875,
"learning_rate": 2.834008097165992e-07,
"loss": 1.2493,
"step": 241
},
{
"epoch": 0.979757085020243,
"grad_norm": 0.1845703125,
"learning_rate": 2.4291497975708504e-07,
"loss": 1.378,
"step": 242
},
{
"epoch": 0.9838056680161943,
"grad_norm": 0.16015625,
"learning_rate": 2.0242914979757086e-07,
"loss": 1.3907,
"step": 243
},
{
"epoch": 0.9878542510121457,
"grad_norm": 0.1533203125,
"learning_rate": 1.619433198380567e-07,
"loss": 1.3215,
"step": 244
},
{
"epoch": 0.9919028340080972,
"grad_norm": 0.169921875,
"learning_rate": 1.2145748987854252e-07,
"loss": 1.3432,
"step": 245
},
{
"epoch": 0.9959514170040485,
"grad_norm": 0.1875,
"learning_rate": 8.097165991902835e-08,
"loss": 1.3032,
"step": 246
},
{
"epoch": 1.0,
"grad_norm": 0.158203125,
"learning_rate": 4.0485829959514176e-08,
"loss": 1.2984,
"step": 247
}
],
"logging_steps": 1.0,
"max_steps": 247,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.717806075910554e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}