{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9846938775510203,
"eval_steps": 500,
"global_step": 390,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007653061224489796,
"grad_norm": 7.900599139656812,
"learning_rate": 2.564102564102564e-07,
"loss": 1.2547,
"step": 1
},
{
"epoch": 0.015306122448979591,
"grad_norm": 8.119600107369656,
"learning_rate": 5.128205128205128e-07,
"loss": 1.2846,
"step": 2
},
{
"epoch": 0.02295918367346939,
"grad_norm": 7.716599625963762,
"learning_rate": 7.692307692307694e-07,
"loss": 1.2358,
"step": 3
},
{
"epoch": 0.030612244897959183,
"grad_norm": 8.053428990837991,
"learning_rate": 1.0256410256410257e-06,
"loss": 1.2664,
"step": 4
},
{
"epoch": 0.03826530612244898,
"grad_norm": 7.669927836204581,
"learning_rate": 1.282051282051282e-06,
"loss": 1.2258,
"step": 5
},
{
"epoch": 0.04591836734693878,
"grad_norm": 7.202844535122422,
"learning_rate": 1.5384615384615387e-06,
"loss": 1.2287,
"step": 6
},
{
"epoch": 0.05357142857142857,
"grad_norm": 6.242569470438793,
"learning_rate": 1.794871794871795e-06,
"loss": 1.2074,
"step": 7
},
{
"epoch": 0.061224489795918366,
"grad_norm": 5.60650986860704,
"learning_rate": 2.0512820512820513e-06,
"loss": 1.1714,
"step": 8
},
{
"epoch": 0.06887755102040816,
"grad_norm": 5.288779120719776,
"learning_rate": 2.307692307692308e-06,
"loss": 1.1896,
"step": 9
},
{
"epoch": 0.07653061224489796,
"grad_norm": 2.9734467693838242,
"learning_rate": 2.564102564102564e-06,
"loss": 1.1187,
"step": 10
},
{
"epoch": 0.08418367346938775,
"grad_norm": 2.6447321687920935,
"learning_rate": 2.8205128205128207e-06,
"loss": 1.1054,
"step": 11
},
{
"epoch": 0.09183673469387756,
"grad_norm": 2.500691487000675,
"learning_rate": 3.0769230769230774e-06,
"loss": 1.1232,
"step": 12
},
{
"epoch": 0.09948979591836735,
"grad_norm": 3.7896412560526667,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.0521,
"step": 13
},
{
"epoch": 0.10714285714285714,
"grad_norm": 4.330435607713412,
"learning_rate": 3.58974358974359e-06,
"loss": 1.0695,
"step": 14
},
{
"epoch": 0.11479591836734694,
"grad_norm": 4.065967025129537,
"learning_rate": 3.846153846153847e-06,
"loss": 1.0492,
"step": 15
},
{
"epoch": 0.12244897959183673,
"grad_norm": 3.8218626260606254,
"learning_rate": 4.102564102564103e-06,
"loss": 1.0507,
"step": 16
},
{
"epoch": 0.13010204081632654,
"grad_norm": 2.913408611179218,
"learning_rate": 4.358974358974359e-06,
"loss": 1.0352,
"step": 17
},
{
"epoch": 0.1377551020408163,
"grad_norm": 2.181426061973388,
"learning_rate": 4.615384615384616e-06,
"loss": 0.9931,
"step": 18
},
{
"epoch": 0.14540816326530612,
"grad_norm": 2.231160799643045,
"learning_rate": 4.871794871794872e-06,
"loss": 0.9909,
"step": 19
},
{
"epoch": 0.15306122448979592,
"grad_norm": 1.8886705253423084,
"learning_rate": 5.128205128205128e-06,
"loss": 0.9585,
"step": 20
},
{
"epoch": 0.16071428571428573,
"grad_norm": 1.4487020320934674,
"learning_rate": 5.384615384615385e-06,
"loss": 0.9512,
"step": 21
},
{
"epoch": 0.1683673469387755,
"grad_norm": 1.1474113193070334,
"learning_rate": 5.641025641025641e-06,
"loss": 0.9211,
"step": 22
},
{
"epoch": 0.1760204081632653,
"grad_norm": 1.1082161392111283,
"learning_rate": 5.897435897435898e-06,
"loss": 0.9276,
"step": 23
},
{
"epoch": 0.1836734693877551,
"grad_norm": 1.0650174913457753,
"learning_rate": 6.153846153846155e-06,
"loss": 0.8952,
"step": 24
},
{
"epoch": 0.1913265306122449,
"grad_norm": 1.0639688813292578,
"learning_rate": 6.410256410256412e-06,
"loss": 0.9,
"step": 25
},
{
"epoch": 0.1989795918367347,
"grad_norm": 1.026817161303438,
"learning_rate": 6.666666666666667e-06,
"loss": 0.91,
"step": 26
},
{
"epoch": 0.2066326530612245,
"grad_norm": 0.8417599731227993,
"learning_rate": 6.923076923076923e-06,
"loss": 0.8814,
"step": 27
},
{
"epoch": 0.21428571428571427,
"grad_norm": 0.6981856946789826,
"learning_rate": 7.17948717948718e-06,
"loss": 0.8919,
"step": 28
},
{
"epoch": 0.22193877551020408,
"grad_norm": 0.8305699974027309,
"learning_rate": 7.435897435897437e-06,
"loss": 0.8725,
"step": 29
},
{
"epoch": 0.22959183673469388,
"grad_norm": 0.8347298264686718,
"learning_rate": 7.692307692307694e-06,
"loss": 0.8632,
"step": 30
},
{
"epoch": 0.2372448979591837,
"grad_norm": 0.7073808513621134,
"learning_rate": 7.948717948717949e-06,
"loss": 0.8851,
"step": 31
},
{
"epoch": 0.24489795918367346,
"grad_norm": 0.696082003876517,
"learning_rate": 8.205128205128205e-06,
"loss": 0.8703,
"step": 32
},
{
"epoch": 0.25255102040816324,
"grad_norm": 0.6487862374432037,
"learning_rate": 8.461538461538462e-06,
"loss": 0.8635,
"step": 33
},
{
"epoch": 0.2602040816326531,
"grad_norm": 0.6887600459571754,
"learning_rate": 8.717948717948719e-06,
"loss": 0.857,
"step": 34
},
{
"epoch": 0.26785714285714285,
"grad_norm": 0.6265375198163766,
"learning_rate": 8.974358974358976e-06,
"loss": 0.8674,
"step": 35
},
{
"epoch": 0.2755102040816326,
"grad_norm": 0.5853042860058177,
"learning_rate": 9.230769230769232e-06,
"loss": 0.8643,
"step": 36
},
{
"epoch": 0.28316326530612246,
"grad_norm": 0.5778280304486835,
"learning_rate": 9.487179487179487e-06,
"loss": 0.8524,
"step": 37
},
{
"epoch": 0.29081632653061223,
"grad_norm": 0.47109648165463025,
"learning_rate": 9.743589743589744e-06,
"loss": 0.8345,
"step": 38
},
{
"epoch": 0.29846938775510207,
"grad_norm": 0.6185653217679288,
"learning_rate": 1e-05,
"loss": 0.8383,
"step": 39
},
{
"epoch": 0.30612244897959184,
"grad_norm": 0.47985971027522895,
"learning_rate": 9.999799726899261e-06,
"loss": 0.8445,
"step": 40
},
{
"epoch": 0.3137755102040816,
"grad_norm": 0.5168786180126856,
"learning_rate": 9.999198923640774e-06,
"loss": 0.8339,
"step": 41
},
{
"epoch": 0.32142857142857145,
"grad_norm": 0.4697815414710563,
"learning_rate": 9.998197638354428e-06,
"loss": 0.8395,
"step": 42
},
{
"epoch": 0.32908163265306123,
"grad_norm": 0.43778336023032777,
"learning_rate": 9.996795951252427e-06,
"loss": 0.8447,
"step": 43
},
{
"epoch": 0.336734693877551,
"grad_norm": 0.4556959273145522,
"learning_rate": 9.994993974622863e-06,
"loss": 0.8388,
"step": 44
},
{
"epoch": 0.34438775510204084,
"grad_norm": 0.4348649136417768,
"learning_rate": 9.992791852820709e-06,
"loss": 0.8107,
"step": 45
},
{
"epoch": 0.3520408163265306,
"grad_norm": 0.3789542620181641,
"learning_rate": 9.990189762256275e-06,
"loss": 0.8345,
"step": 46
},
{
"epoch": 0.3596938775510204,
"grad_norm": 0.38275331303639004,
"learning_rate": 9.987187911381059e-06,
"loss": 0.8341,
"step": 47
},
{
"epoch": 0.3673469387755102,
"grad_norm": 0.4117058990117418,
"learning_rate": 9.983786540671052e-06,
"loss": 0.8221,
"step": 48
},
{
"epoch": 0.375,
"grad_norm": 0.4382241759510747,
"learning_rate": 9.979985922607476e-06,
"loss": 0.8122,
"step": 49
},
{
"epoch": 0.3826530612244898,
"grad_norm": 0.3762436724149032,
"learning_rate": 9.975786361654959e-06,
"loss": 0.8266,
"step": 50
},
{
"epoch": 0.3903061224489796,
"grad_norm": 0.41057346443737175,
"learning_rate": 9.971188194237141e-06,
"loss": 0.8088,
"step": 51
},
{
"epoch": 0.3979591836734694,
"grad_norm": 0.4047860097370241,
"learning_rate": 9.966191788709716e-06,
"loss": 0.8104,
"step": 52
},
{
"epoch": 0.40561224489795916,
"grad_norm": 0.33943575533118764,
"learning_rate": 9.960797545330936e-06,
"loss": 0.7912,
"step": 53
},
{
"epoch": 0.413265306122449,
"grad_norm": 0.4320896078070966,
"learning_rate": 9.955005896229543e-06,
"loss": 0.812,
"step": 54
},
{
"epoch": 0.42091836734693877,
"grad_norm": 0.36490629588097284,
"learning_rate": 9.948817305370145e-06,
"loss": 0.817,
"step": 55
},
{
"epoch": 0.42857142857142855,
"grad_norm": 0.3811135120016499,
"learning_rate": 9.942232268516051e-06,
"loss": 0.8101,
"step": 56
},
{
"epoch": 0.4362244897959184,
"grad_norm": 0.3406018259398481,
"learning_rate": 9.935251313189564e-06,
"loss": 0.7987,
"step": 57
},
{
"epoch": 0.44387755102040816,
"grad_norm": 0.41568382541199506,
"learning_rate": 9.927874998629714e-06,
"loss": 0.8122,
"step": 58
},
{
"epoch": 0.45153061224489793,
"grad_norm": 0.37378450752963704,
"learning_rate": 9.920103915747452e-06,
"loss": 0.8113,
"step": 59
},
{
"epoch": 0.45918367346938777,
"grad_norm": 0.38855892151900484,
"learning_rate": 9.911938687078324e-06,
"loss": 0.7859,
"step": 60
},
{
"epoch": 0.46683673469387754,
"grad_norm": 0.4427608830523002,
"learning_rate": 9.9033799667326e-06,
"loss": 0.802,
"step": 61
},
{
"epoch": 0.4744897959183674,
"grad_norm": 0.35381982973714704,
"learning_rate": 9.89442844034286e-06,
"loss": 0.8253,
"step": 62
},
{
"epoch": 0.48214285714285715,
"grad_norm": 0.39211658446728886,
"learning_rate": 9.885084825009085e-06,
"loss": 0.8105,
"step": 63
},
{
"epoch": 0.4897959183673469,
"grad_norm": 0.40435166675919726,
"learning_rate": 9.875349869241202e-06,
"loss": 0.7953,
"step": 64
},
{
"epoch": 0.49744897959183676,
"grad_norm": 0.3549380749010608,
"learning_rate": 9.86522435289912e-06,
"loss": 0.8013,
"step": 65
},
{
"epoch": 0.5051020408163265,
"grad_norm": 0.30865050646134456,
"learning_rate": 9.854709087130261e-06,
"loss": 0.7995,
"step": 66
},
{
"epoch": 0.5127551020408163,
"grad_norm": 0.37493039088229285,
"learning_rate": 9.843804914304578e-06,
"loss": 0.7896,
"step": 67
},
{
"epoch": 0.5204081632653061,
"grad_norm": 0.35445041874178557,
"learning_rate": 9.83251270794707e-06,
"loss": 0.7825,
"step": 68
},
{
"epoch": 0.5280612244897959,
"grad_norm": 0.37028381618760225,
"learning_rate": 9.820833372667813e-06,
"loss": 0.8065,
"step": 69
},
{
"epoch": 0.5357142857142857,
"grad_norm": 0.3401014836367619,
"learning_rate": 9.80876784408948e-06,
"loss": 0.7736,
"step": 70
},
{
"epoch": 0.5433673469387755,
"grad_norm": 0.38135639736053345,
"learning_rate": 9.796317088772402e-06,
"loss": 0.8006,
"step": 71
},
{
"epoch": 0.5510204081632653,
"grad_norm": 0.3449580873725305,
"learning_rate": 9.783482104137127e-06,
"loss": 0.793,
"step": 72
},
{
"epoch": 0.5586734693877551,
"grad_norm": 0.39773824101732064,
"learning_rate": 9.770263918384523e-06,
"loss": 0.7628,
"step": 73
},
{
"epoch": 0.5663265306122449,
"grad_norm": 0.36658929825409153,
"learning_rate": 9.75666359041341e-06,
"loss": 0.7849,
"step": 74
},
{
"epoch": 0.5739795918367347,
"grad_norm": 0.40174916793470977,
"learning_rate": 9.742682209735727e-06,
"loss": 0.7813,
"step": 75
},
{
"epoch": 0.5816326530612245,
"grad_norm": 0.3675910724991157,
"learning_rate": 9.728320896389263e-06,
"loss": 0.7717,
"step": 76
},
{
"epoch": 0.5892857142857143,
"grad_norm": 0.35241618551974574,
"learning_rate": 9.713580800847917e-06,
"loss": 0.7964,
"step": 77
},
{
"epoch": 0.5969387755102041,
"grad_norm": 0.42227511752715124,
"learning_rate": 9.698463103929542e-06,
"loss": 0.783,
"step": 78
},
{
"epoch": 0.6045918367346939,
"grad_norm": 0.41099008300592765,
"learning_rate": 9.682969016701357e-06,
"loss": 0.8124,
"step": 79
},
{
"epoch": 0.6122448979591837,
"grad_norm": 0.3588362815681614,
"learning_rate": 9.66709978038292e-06,
"loss": 0.8027,
"step": 80
},
{
"epoch": 0.6198979591836735,
"grad_norm": 0.44603372884723097,
"learning_rate": 9.650856666246693e-06,
"loss": 0.7884,
"step": 81
},
{
"epoch": 0.6275510204081632,
"grad_norm": 0.36738127885232164,
"learning_rate": 9.63424097551621e-06,
"loss": 0.7914,
"step": 82
},
{
"epoch": 0.6352040816326531,
"grad_norm": 0.3461400435266106,
"learning_rate": 9.617254039261835e-06,
"loss": 0.7698,
"step": 83
},
{
"epoch": 0.6428571428571429,
"grad_norm": 0.3746146031615924,
"learning_rate": 9.599897218294122e-06,
"loss": 0.7958,
"step": 84
},
{
"epoch": 0.6505102040816326,
"grad_norm": 0.37460698420532035,
"learning_rate": 9.582171903054815e-06,
"loss": 0.7677,
"step": 85
},
{
"epoch": 0.6581632653061225,
"grad_norm": 0.3433298671011665,
"learning_rate": 9.564079513505455e-06,
"loss": 0.8015,
"step": 86
},
{
"epoch": 0.6658163265306123,
"grad_norm": 0.3482571197001568,
"learning_rate": 9.54562149901362e-06,
"loss": 0.7638,
"step": 87
},
{
"epoch": 0.673469387755102,
"grad_norm": 0.40316278701128183,
"learning_rate": 9.526799338236828e-06,
"loss": 0.7843,
"step": 88
},
{
"epoch": 0.6811224489795918,
"grad_norm": 0.3555197534077026,
"learning_rate": 9.507614539004082e-06,
"loss": 0.7925,
"step": 89
},
{
"epoch": 0.6887755102040817,
"grad_norm": 0.38348347862843785,
"learning_rate": 9.488068638195072e-06,
"loss": 0.7894,
"step": 90
},
{
"epoch": 0.6964285714285714,
"grad_norm": 0.3578221543396146,
"learning_rate": 9.468163201617063e-06,
"loss": 0.7625,
"step": 91
},
{
"epoch": 0.7040816326530612,
"grad_norm": 0.3512021435202505,
"learning_rate": 9.447899823879456e-06,
"loss": 0.7664,
"step": 92
},
{
"epoch": 0.7117346938775511,
"grad_norm": 0.359766111570709,
"learning_rate": 9.427280128266049e-06,
"loss": 0.7827,
"step": 93
},
{
"epoch": 0.7193877551020408,
"grad_norm": 0.3342360288463418,
"learning_rate": 9.406305766604996e-06,
"loss": 0.7868,
"step": 94
},
{
"epoch": 0.7270408163265306,
"grad_norm": 0.32321038061189955,
"learning_rate": 9.384978419136469e-06,
"loss": 0.7898,
"step": 95
},
{
"epoch": 0.7346938775510204,
"grad_norm": 0.34171882937229003,
"learning_rate": 9.363299794378072e-06,
"loss": 0.7665,
"step": 96
},
{
"epoch": 0.7423469387755102,
"grad_norm": 0.35118533636970833,
"learning_rate": 9.34127162898797e-06,
"loss": 0.7726,
"step": 97
},
{
"epoch": 0.75,
"grad_norm": 0.3370944528784506,
"learning_rate": 9.318895687625752e-06,
"loss": 0.7835,
"step": 98
},
{
"epoch": 0.7576530612244898,
"grad_norm": 0.29470217608260335,
"learning_rate": 9.296173762811084e-06,
"loss": 0.7777,
"step": 99
},
{
"epoch": 0.7653061224489796,
"grad_norm": 0.3454016966220543,
"learning_rate": 9.273107674780102e-06,
"loss": 0.789,
"step": 100
},
{
"epoch": 0.7729591836734694,
"grad_norm": 0.39511779117121837,
"learning_rate": 9.249699271339594e-06,
"loss": 0.7603,
"step": 101
},
{
"epoch": 0.7806122448979592,
"grad_norm": 0.38047057845879517,
"learning_rate": 9.225950427718974e-06,
"loss": 0.7825,
"step": 102
},
{
"epoch": 0.7882653061224489,
"grad_norm": 0.4086929427189167,
"learning_rate": 9.201863046420065e-06,
"loss": 0.7736,
"step": 103
},
{
"epoch": 0.7959183673469388,
"grad_norm": 0.40106266010702624,
"learning_rate": 9.177439057064684e-06,
"loss": 0.7846,
"step": 104
},
{
"epoch": 0.8035714285714286,
"grad_norm": 0.4228915644292435,
"learning_rate": 9.152680416240059e-06,
"loss": 0.7666,
"step": 105
},
{
"epoch": 0.8112244897959183,
"grad_norm": 0.3146448558809326,
"learning_rate": 9.1275891073421e-06,
"loss": 0.7657,
"step": 106
},
{
"epoch": 0.8188775510204082,
"grad_norm": 0.43451140759881535,
"learning_rate": 9.102167140416503e-06,
"loss": 0.7709,
"step": 107
},
{
"epoch": 0.826530612244898,
"grad_norm": 0.33837608510390754,
"learning_rate": 9.076416551997721e-06,
"loss": 0.7744,
"step": 108
},
{
"epoch": 0.8341836734693877,
"grad_norm": 0.37565083198791915,
"learning_rate": 9.050339404945834e-06,
"loss": 0.7827,
"step": 109
},
{
"epoch": 0.8418367346938775,
"grad_norm": 0.365977104631186,
"learning_rate": 9.023937788281278e-06,
"loss": 0.7834,
"step": 110
},
{
"epoch": 0.8494897959183674,
"grad_norm": 0.4486855497610728,
"learning_rate": 8.997213817017508e-06,
"loss": 0.7644,
"step": 111
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.3817323486497425,
"learning_rate": 8.970169631991556e-06,
"loss": 0.779,
"step": 112
},
{
"epoch": 0.8647959183673469,
"grad_norm": 0.38664365638910025,
"learning_rate": 8.942807399692543e-06,
"loss": 0.7688,
"step": 113
},
{
"epoch": 0.8724489795918368,
"grad_norm": 0.37215420738034083,
"learning_rate": 8.915129312088112e-06,
"loss": 0.7757,
"step": 114
},
{
"epoch": 0.8801020408163265,
"grad_norm": 0.35426054376993094,
"learning_rate": 8.88713758644883e-06,
"loss": 0.7674,
"step": 115
},
{
"epoch": 0.8877551020408163,
"grad_norm": 0.4195483369441139,
"learning_rate": 8.858834465170576e-06,
"loss": 0.7677,
"step": 116
},
{
"epoch": 0.8954081632653061,
"grad_norm": 0.41930027175437984,
"learning_rate": 8.83022221559489e-06,
"loss": 0.7568,
"step": 117
},
{
"epoch": 0.9030612244897959,
"grad_norm": 0.3279073397597126,
"learning_rate": 8.801303129827352e-06,
"loss": 0.7822,
"step": 118
},
{
"epoch": 0.9107142857142857,
"grad_norm": 0.357892031294145,
"learning_rate": 8.772079524553951e-06,
"loss": 0.7534,
"step": 119
},
{
"epoch": 0.9183673469387755,
"grad_norm": 0.3165448329232979,
"learning_rate": 8.742553740855507e-06,
"loss": 0.7739,
"step": 120
},
{
"epoch": 0.9260204081632653,
"grad_norm": 0.3383676190282577,
"learning_rate": 8.712728144020118e-06,
"loss": 0.7603,
"step": 121
},
{
"epoch": 0.9336734693877551,
"grad_norm": 0.3208003771597576,
"learning_rate": 8.682605123353685e-06,
"loss": 0.7629,
"step": 122
},
{
"epoch": 0.9413265306122449,
"grad_norm": 0.32756065557104463,
"learning_rate": 8.652187091988516e-06,
"loss": 0.7751,
"step": 123
},
{
"epoch": 0.9489795918367347,
"grad_norm": 0.3144217008561782,
"learning_rate": 8.621476486689991e-06,
"loss": 0.7617,
"step": 124
},
{
"epoch": 0.9566326530612245,
"grad_norm": 0.34740205977487404,
"learning_rate": 8.590475767661371e-06,
"loss": 0.7715,
"step": 125
},
{
"epoch": 0.9642857142857143,
"grad_norm": 0.36365085803217173,
"learning_rate": 8.559187418346703e-06,
"loss": 0.7702,
"step": 126
},
{
"epoch": 0.9719387755102041,
"grad_norm": 0.3242477226265553,
"learning_rate": 8.527613945231886e-06,
"loss": 0.7486,
"step": 127
},
{
"epoch": 0.9795918367346939,
"grad_norm": 0.33767240411889204,
"learning_rate": 8.495757877643857e-06,
"loss": 0.7539,
"step": 128
},
{
"epoch": 0.9872448979591837,
"grad_norm": 0.3728149497460986,
"learning_rate": 8.463621767547998e-06,
"loss": 0.7823,
"step": 129
},
{
"epoch": 0.9948979591836735,
"grad_norm": 0.3037904703658018,
"learning_rate": 8.43120818934367e-06,
"loss": 0.7676,
"step": 130
},
{
"epoch": 1.0025510204081634,
"grad_norm": 0.3657607958037338,
"learning_rate": 8.398519739657997e-06,
"loss": 0.7447,
"step": 131
},
{
"epoch": 1.010204081632653,
"grad_norm": 0.32702463131138215,
"learning_rate": 8.36555903713785e-06,
"loss": 0.7192,
"step": 132
},
{
"epoch": 1.0178571428571428,
"grad_norm": 0.3241261143033043,
"learning_rate": 8.332328722240072e-06,
"loss": 0.7463,
"step": 133
},
{
"epoch": 1.0255102040816326,
"grad_norm": 0.34857830092581826,
"learning_rate": 8.298831457019943e-06,
"loss": 0.7477,
"step": 134
},
{
"epoch": 1.0331632653061225,
"grad_norm": 0.34054768296119625,
"learning_rate": 8.265069924917925e-06,
"loss": 0.7205,
"step": 135
},
{
"epoch": 1.0408163265306123,
"grad_norm": 0.3456929422423653,
"learning_rate": 8.231046830544716e-06,
"loss": 0.74,
"step": 136
},
{
"epoch": 1.0484693877551021,
"grad_norm": 0.3535133441712295,
"learning_rate": 8.196764899464552e-06,
"loss": 0.7269,
"step": 137
},
{
"epoch": 1.0561224489795917,
"grad_norm": 0.3547241666178161,
"learning_rate": 8.162226877976886e-06,
"loss": 0.7554,
"step": 138
},
{
"epoch": 1.0637755102040816,
"grad_norm": 0.3107043446143115,
"learning_rate": 8.127435532896388e-06,
"loss": 0.7288,
"step": 139
},
{
"epoch": 1.0714285714285714,
"grad_norm": 0.3373069931944795,
"learning_rate": 8.092393651331275e-06,
"loss": 0.7355,
"step": 140
},
{
"epoch": 1.0790816326530612,
"grad_norm": 0.3279896770891133,
"learning_rate": 8.057104040460062e-06,
"loss": 0.7385,
"step": 141
},
{
"epoch": 1.086734693877551,
"grad_norm": 0.3261929334088148,
"learning_rate": 8.021569527306663e-06,
"loss": 0.728,
"step": 142
},
{
"epoch": 1.094387755102041,
"grad_norm": 0.3174495615043381,
"learning_rate": 7.985792958513932e-06,
"loss": 0.7408,
"step": 143
},
{
"epoch": 1.1020408163265305,
"grad_norm": 0.3067789820058703,
"learning_rate": 7.949777200115617e-06,
"loss": 0.7227,
"step": 144
},
{
"epoch": 1.1096938775510203,
"grad_norm": 0.35433845526552205,
"learning_rate": 7.913525137306756e-06,
"loss": 0.7287,
"step": 145
},
{
"epoch": 1.1173469387755102,
"grad_norm": 0.34741723337985486,
"learning_rate": 7.877039674212569e-06,
"loss": 0.7258,
"step": 146
},
{
"epoch": 1.125,
"grad_norm": 0.3280286780388698,
"learning_rate": 7.84032373365578e-06,
"loss": 0.7262,
"step": 147
},
{
"epoch": 1.1326530612244898,
"grad_norm": 0.30769648626514395,
"learning_rate": 7.803380256922495e-06,
"loss": 0.7281,
"step": 148
},
{
"epoch": 1.1403061224489797,
"grad_norm": 0.3815259137143696,
"learning_rate": 7.76621220352657e-06,
"loss": 0.7189,
"step": 149
},
{
"epoch": 1.1479591836734695,
"grad_norm": 0.3003966668112163,
"learning_rate": 7.728822550972523e-06,
"loss": 0.7372,
"step": 150
},
{
"epoch": 1.155612244897959,
"grad_norm": 0.3730853217142341,
"learning_rate": 7.69121429451702e-06,
"loss": 0.7327,
"step": 151
},
{
"epoch": 1.163265306122449,
"grad_norm": 0.3077673668382533,
"learning_rate": 7.65339044692891e-06,
"loss": 0.7243,
"step": 152
},
{
"epoch": 1.1709183673469388,
"grad_norm": 0.3419341487539606,
"learning_rate": 7.615354038247889e-06,
"loss": 0.7132,
"step": 153
},
{
"epoch": 1.1785714285714286,
"grad_norm": 0.4263455028715549,
"learning_rate": 7.577108115541761e-06,
"loss": 0.7136,
"step": 154
},
{
"epoch": 1.1862244897959184,
"grad_norm": 0.3659660917830282,
"learning_rate": 7.53865574266234e-06,
"loss": 0.7423,
"step": 155
},
{
"epoch": 1.193877551020408,
"grad_norm": 0.38942680340360186,
"learning_rate": 7.500000000000001e-06,
"loss": 0.7194,
"step": 156
},
{
"epoch": 1.2015306122448979,
"grad_norm": 0.4077561265291119,
"learning_rate": 7.461143984236925e-06,
"loss": 0.7152,
"step": 157
},
{
"epoch": 1.2091836734693877,
"grad_norm": 0.31673843482064173,
"learning_rate": 7.422090808099014e-06,
"loss": 0.7164,
"step": 158
},
{
"epoch": 1.2168367346938775,
"grad_norm": 0.3411891380728348,
"learning_rate": 7.382843600106539e-06,
"loss": 0.7232,
"step": 159
},
{
"epoch": 1.2244897959183674,
"grad_norm": 0.35077489655944777,
"learning_rate": 7.343405504323519e-06,
"loss": 0.7306,
"step": 160
},
{
"epoch": 1.2321428571428572,
"grad_norm": 0.35048005407769184,
"learning_rate": 7.303779680105844e-06,
"loss": 0.7252,
"step": 161
},
{
"epoch": 1.239795918367347,
"grad_norm": 0.3074506613629546,
"learning_rate": 7.263969301848188e-06,
"loss": 0.7401,
"step": 162
},
{
"epoch": 1.2474489795918366,
"grad_norm": 0.3522836731747551,
"learning_rate": 7.223977558729707e-06,
"loss": 0.7235,
"step": 163
},
{
"epoch": 1.2551020408163265,
"grad_norm": 0.320879835567969,
"learning_rate": 7.183807654458565e-06,
"loss": 0.7269,
"step": 164
},
{
"epoch": 1.2627551020408163,
"grad_norm": 0.3369884807808652,
"learning_rate": 7.143462807015271e-06,
"loss": 0.7314,
"step": 165
},
{
"epoch": 1.2704081632653061,
"grad_norm": 0.3106327480512219,
"learning_rate": 7.102946248394908e-06,
"loss": 0.7311,
"step": 166
},
{
"epoch": 1.278061224489796,
"grad_norm": 0.3350969859669365,
"learning_rate": 7.0622612243482035e-06,
"loss": 0.7454,
"step": 167
},
{
"epoch": 1.2857142857142856,
"grad_norm": 0.32981543690241394,
"learning_rate": 7.021410994121525e-06,
"loss": 0.716,
"step": 168
},
{
"epoch": 1.2933673469387754,
"grad_norm": 0.3124947696287792,
"learning_rate": 6.980398830195785e-06,
"loss": 0.7227,
"step": 169
},
{
"epoch": 1.3010204081632653,
"grad_norm": 0.29800502106565513,
"learning_rate": 6.939228018024275e-06,
"loss": 0.7334,
"step": 170
},
{
"epoch": 1.308673469387755,
"grad_norm": 0.33815713937721337,
"learning_rate": 6.897901855769483e-06,
"loss": 0.7138,
"step": 171
},
{
"epoch": 1.316326530612245,
"grad_norm": 0.307107701344449,
"learning_rate": 6.856423654038868e-06,
"loss": 0.7267,
"step": 172
},
{
"epoch": 1.3239795918367347,
"grad_norm": 0.3373642844242084,
"learning_rate": 6.814796735619664e-06,
"loss": 0.7144,
"step": 173
},
{
"epoch": 1.3316326530612246,
"grad_norm": 0.2967492516461531,
"learning_rate": 6.773024435212678e-06,
"loss": 0.7139,
"step": 174
},
{
"epoch": 1.3392857142857144,
"grad_norm": 0.2944971616996988,
"learning_rate": 6.731110099165165e-06,
"loss": 0.7417,
"step": 175
},
{
"epoch": 1.346938775510204,
"grad_norm": 0.293601269092346,
"learning_rate": 6.689057085202737e-06,
"loss": 0.7292,
"step": 176
},
{
"epoch": 1.3545918367346939,
"grad_norm": 0.2857412083314722,
"learning_rate": 6.646868762160399e-06,
"loss": 0.7169,
"step": 177
},
{
"epoch": 1.3622448979591837,
"grad_norm": 0.2793697266770511,
"learning_rate": 6.6045485097126585e-06,
"loss": 0.723,
"step": 178
},
{
"epoch": 1.3698979591836735,
"grad_norm": 0.28168414558063304,
"learning_rate": 6.562099718102788e-06,
"loss": 0.7111,
"step": 179
},
{
"epoch": 1.3775510204081631,
"grad_norm": 0.31727867501147977,
"learning_rate": 6.519525787871235e-06,
"loss": 0.728,
"step": 180
},
{
"epoch": 1.385204081632653,
"grad_norm": 0.298603631519449,
"learning_rate": 6.476830129583207e-06,
"loss": 0.7136,
"step": 181
},
{
"epoch": 1.3928571428571428,
"grad_norm": 0.29156749391023734,
"learning_rate": 6.434016163555452e-06,
"loss": 0.7229,
"step": 182
},
{
"epoch": 1.4005102040816326,
"grad_norm": 0.3216966381557731,
"learning_rate": 6.391087319582264e-06,
"loss": 0.7329,
"step": 183
},
{
"epoch": 1.4081632653061225,
"grad_norm": 0.3100392635441308,
"learning_rate": 6.34804703666072e-06,
"loss": 0.6948,
"step": 184
},
{
"epoch": 1.4158163265306123,
"grad_norm": 0.3205090950935673,
"learning_rate": 6.304898762715187e-06,
"loss": 0.7185,
"step": 185
},
{
"epoch": 1.4234693877551021,
"grad_norm": 0.31408394255073874,
"learning_rate": 6.261645954321109e-06,
"loss": 0.7155,
"step": 186
},
{
"epoch": 1.431122448979592,
"grad_norm": 0.3615372791973604,
"learning_rate": 6.21829207642811e-06,
"loss": 0.72,
"step": 187
},
{
"epoch": 1.4387755102040816,
"grad_norm": 0.3179800138909638,
"learning_rate": 6.1748406020824115e-06,
"loss": 0.7215,
"step": 188
},
{
"epoch": 1.4464285714285714,
"grad_norm": 0.39197778035917613,
"learning_rate": 6.131295012148613e-06,
"loss": 0.7161,
"step": 189
},
{
"epoch": 1.4540816326530612,
"grad_norm": 0.3475079137600484,
"learning_rate": 6.087658795030838e-06,
"loss": 0.7212,
"step": 190
},
{
"epoch": 1.461734693877551,
"grad_norm": 0.3303825779966534,
"learning_rate": 6.043935446393294e-06,
"loss": 0.7138,
"step": 191
},
{
"epoch": 1.469387755102041,
"grad_norm": 0.3397781093231611,
"learning_rate": 6.000128468880223e-06,
"loss": 0.7357,
"step": 192
},
{
"epoch": 1.4770408163265305,
"grad_norm": 0.28796995234216377,
"learning_rate": 5.956241371835312e-06,
"loss": 0.7329,
"step": 193
},
{
"epoch": 1.4846938775510203,
"grad_norm": 0.33155770096938897,
"learning_rate": 5.912277671020564e-06,
"loss": 0.7301,
"step": 194
},
{
"epoch": 1.4923469387755102,
"grad_norm": 0.3094759577498115,
"learning_rate": 5.8682408883346535e-06,
"loss": 0.6932,
"step": 195
},
{
"epoch": 1.5,
"grad_norm": 0.3318537172693951,
"learning_rate": 5.824134551530783e-06,
"loss": 0.7326,
"step": 196
},
{
"epoch": 1.5076530612244898,
"grad_norm": 0.30879561481901263,
"learning_rate": 5.77996219393409e-06,
"loss": 0.7195,
"step": 197
},
{
"epoch": 1.5153061224489797,
"grad_norm": 0.3303797600701955,
"learning_rate": 5.735727354158581e-06,
"loss": 0.7353,
"step": 198
},
{
"epoch": 1.5229591836734695,
"grad_norm": 0.29685163959288646,
"learning_rate": 5.6914335758236665e-06,
"loss": 0.7262,
"step": 199
},
{
"epoch": 1.5306122448979593,
"grad_norm": 0.3006334315846896,
"learning_rate": 5.647084407270277e-06,
"loss": 0.725,
"step": 200
},
{
"epoch": 1.538265306122449,
"grad_norm": 0.3181086382843995,
"learning_rate": 5.6026834012766155e-06,
"loss": 0.717,
"step": 201
},
{
"epoch": 1.5459183673469388,
"grad_norm": 0.2828538681954745,
"learning_rate": 5.5582341147735396e-06,
"loss": 0.7134,
"step": 202
},
{
"epoch": 1.5535714285714286,
"grad_norm": 0.2949909914868706,
"learning_rate": 5.5137401085596224e-06,
"loss": 0.7334,
"step": 203
},
{
"epoch": 1.5612244897959182,
"grad_norm": 0.3090417583811012,
"learning_rate": 5.469204947015897e-06,
"loss": 0.7189,
"step": 204
},
{
"epoch": 1.568877551020408,
"grad_norm": 0.28811678419113473,
"learning_rate": 5.424632197820325e-06,
"loss": 0.6947,
"step": 205
},
{
"epoch": 1.5765306122448979,
"grad_norm": 0.29198617120064,
"learning_rate": 5.380025431661981e-06,
"loss": 0.7146,
"step": 206
},
{
"epoch": 1.5841836734693877,
"grad_norm": 0.30576199585440467,
"learning_rate": 5.335388221955012e-06,
"loss": 0.7131,
"step": 207
},
{
"epoch": 1.5918367346938775,
"grad_norm": 0.2812362549753351,
"learning_rate": 5.290724144552379e-06,
"loss": 0.7221,
"step": 208
},
{
"epoch": 1.5994897959183674,
"grad_norm": 0.2760501677497026,
"learning_rate": 5.246036777459391e-06,
"loss": 0.7141,
"step": 209
},
{
"epoch": 1.6071428571428572,
"grad_norm": 0.2688411176432346,
"learning_rate": 5.201329700547077e-06,
"loss": 0.7154,
"step": 210
},
{
"epoch": 1.614795918367347,
"grad_norm": 0.30444010616896106,
"learning_rate": 5.156606495265402e-06,
"loss": 0.7185,
"step": 211
},
{
"epoch": 1.6224489795918369,
"grad_norm": 0.2834241043348135,
"learning_rate": 5.111870744356366e-06,
"loss": 0.6954,
"step": 212
},
{
"epoch": 1.6301020408163265,
"grad_norm": 0.2839078787011765,
"learning_rate": 5.067126031566988e-06,
"loss": 0.7199,
"step": 213
},
{
"epoch": 1.6377551020408163,
"grad_norm": 0.2894314952948115,
"learning_rate": 5.022375941362218e-06,
"loss": 0.7206,
"step": 214
},
{
"epoch": 1.6454081632653061,
"grad_norm": 0.30105116531481235,
"learning_rate": 4.977624058637783e-06,
"loss": 0.7104,
"step": 215
},
{
"epoch": 1.6530612244897958,
"grad_norm": 0.30603121168713104,
"learning_rate": 4.932873968433014e-06,
"loss": 0.7214,
"step": 216
},
{
"epoch": 1.6607142857142856,
"grad_norm": 0.2897952522073487,
"learning_rate": 4.8881292556436355e-06,
"loss": 0.7157,
"step": 217
},
{
"epoch": 1.6683673469387754,
"grad_norm": 0.3064132815038337,
"learning_rate": 4.8433935047346e-06,
"loss": 0.7129,
"step": 218
},
{
"epoch": 1.6760204081632653,
"grad_norm": 0.2729773805617788,
"learning_rate": 4.798670299452926e-06,
"loss": 0.7228,
"step": 219
},
{
"epoch": 1.683673469387755,
"grad_norm": 0.2860890205936976,
"learning_rate": 4.75396322254061e-06,
"loss": 0.7329,
"step": 220
},
{
"epoch": 1.691326530612245,
"grad_norm": 0.28789253491280553,
"learning_rate": 4.7092758554476215e-06,
"loss": 0.714,
"step": 221
},
{
"epoch": 1.6989795918367347,
"grad_norm": 0.28710361826225933,
"learning_rate": 4.664611778044988e-06,
"loss": 0.7269,
"step": 222
},
{
"epoch": 1.7066326530612246,
"grad_norm": 0.2607466160182967,
"learning_rate": 4.619974568338021e-06,
"loss": 0.7254,
"step": 223
},
{
"epoch": 1.7142857142857144,
"grad_norm": 0.3109562373122327,
"learning_rate": 4.575367802179675e-06,
"loss": 0.7266,
"step": 224
},
{
"epoch": 1.7219387755102042,
"grad_norm": 0.2671326490442337,
"learning_rate": 4.530795052984104e-06,
"loss": 0.7402,
"step": 225
},
{
"epoch": 1.7295918367346939,
"grad_norm": 0.3024449623652671,
"learning_rate": 4.48625989144038e-06,
"loss": 0.7165,
"step": 226
},
{
"epoch": 1.7372448979591837,
"grad_norm": 0.29297628220674093,
"learning_rate": 4.441765885226462e-06,
"loss": 0.7233,
"step": 227
},
{
"epoch": 1.7448979591836735,
"grad_norm": 0.2703897681423039,
"learning_rate": 4.397316598723385e-06,
"loss": 0.729,
"step": 228
},
{
"epoch": 1.7525510204081631,
"grad_norm": 0.30928488016271816,
"learning_rate": 4.352915592729723e-06,
"loss": 0.7242,
"step": 229
},
{
"epoch": 1.760204081632653,
"grad_norm": 0.2706286337944358,
"learning_rate": 4.308566424176336e-06,
"loss": 0.7154,
"step": 230
},
{
"epoch": 1.7678571428571428,
"grad_norm": 0.260276440616788,
"learning_rate": 4.264272645841419e-06,
"loss": 0.7079,
"step": 231
},
{
"epoch": 1.7755102040816326,
"grad_norm": 0.27550251553605226,
"learning_rate": 4.220037806065911e-06,
"loss": 0.727,
"step": 232
},
{
"epoch": 1.7831632653061225,
"grad_norm": 0.27167673630299904,
"learning_rate": 4.175865448469219e-06,
"loss": 0.7284,
"step": 233
},
{
"epoch": 1.7908163265306123,
"grad_norm": 0.27870909240366326,
"learning_rate": 4.131759111665349e-06,
"loss": 0.7228,
"step": 234
},
{
"epoch": 1.7984693877551021,
"grad_norm": 0.25737644360984807,
"learning_rate": 4.087722328979437e-06,
"loss": 0.7248,
"step": 235
},
{
"epoch": 1.806122448979592,
"grad_norm": 0.2678752125292958,
"learning_rate": 4.043758628164688e-06,
"loss": 0.7276,
"step": 236
},
{
"epoch": 1.8137755102040818,
"grad_norm": 0.2937753087166277,
"learning_rate": 3.999871531119779e-06,
"loss": 0.7172,
"step": 237
},
{
"epoch": 1.8214285714285714,
"grad_norm": 0.2935500143060239,
"learning_rate": 3.956064553606708e-06,
"loss": 0.7096,
"step": 238
},
{
"epoch": 1.8290816326530612,
"grad_norm": 0.2872174760470752,
"learning_rate": 3.912341204969164e-06,
"loss": 0.7085,
"step": 239
},
{
"epoch": 1.836734693877551,
"grad_norm": 0.27257890683898406,
"learning_rate": 3.86870498785139e-06,
"loss": 0.7079,
"step": 240
},
{
"epoch": 1.8443877551020407,
"grad_norm": 0.2569539726278704,
"learning_rate": 3.825159397917589e-06,
"loss": 0.7196,
"step": 241
},
{
"epoch": 1.8520408163265305,
"grad_norm": 0.27151411294136946,
"learning_rate": 3.781707923571891e-06,
"loss": 0.7026,
"step": 242
},
{
"epoch": 1.8596938775510203,
"grad_norm": 0.27930041542818224,
"learning_rate": 3.7383540456788915e-06,
"loss": 0.7268,
"step": 243
},
{
"epoch": 1.8673469387755102,
"grad_norm": 0.26864299220619914,
"learning_rate": 3.695101237284815e-06,
"loss": 0.7248,
"step": 244
},
{
"epoch": 1.875,
"grad_norm": 0.27555317317712996,
"learning_rate": 3.6519529633392825e-06,
"loss": 0.7084,
"step": 245
},
{
"epoch": 1.8826530612244898,
"grad_norm": 0.2804770635369131,
"learning_rate": 3.6089126804177373e-06,
"loss": 0.7355,
"step": 246
},
{
"epoch": 1.8903061224489797,
"grad_norm": 0.273320036360711,
"learning_rate": 3.5659838364445505e-06,
"loss": 0.7243,
"step": 247
},
{
"epoch": 1.8979591836734695,
"grad_norm": 0.28634409896833934,
"learning_rate": 3.523169870416795e-06,
"loss": 0.7144,
"step": 248
},
{
"epoch": 1.9056122448979593,
"grad_norm": 0.2731707658644953,
"learning_rate": 3.480474212128766e-06,
"loss": 0.7085,
"step": 249
},
{
"epoch": 1.913265306122449,
"grad_norm": 0.2856724418711564,
"learning_rate": 3.4379002818972122e-06,
"loss": 0.6994,
"step": 250
},
{
"epoch": 1.9209183673469388,
"grad_norm": 0.25169725455713693,
"learning_rate": 3.3954514902873427e-06,
"loss": 0.7198,
"step": 251
},
{
"epoch": 1.9285714285714286,
"grad_norm": 0.2707259735033453,
"learning_rate": 3.3531312378396026e-06,
"loss": 0.7195,
"step": 252
},
{
"epoch": 1.9362244897959182,
"grad_norm": 0.2575058815163588,
"learning_rate": 3.310942914797265e-06,
"loss": 0.7221,
"step": 253
},
{
"epoch": 1.943877551020408,
"grad_norm": 0.2624601348833616,
"learning_rate": 3.2688899008348386e-06,
"loss": 0.7098,
"step": 254
},
{
"epoch": 1.9515306122448979,
"grad_norm": 0.26759811934604133,
"learning_rate": 3.226975564787322e-06,
"loss": 0.715,
"step": 255
},
{
"epoch": 1.9591836734693877,
"grad_norm": 0.26105450865585245,
"learning_rate": 3.1852032643803377e-06,
"loss": 0.7115,
"step": 256
},
{
"epoch": 1.9668367346938775,
"grad_norm": 0.2679020391433801,
"learning_rate": 3.143576345961132e-06,
"loss": 0.7108,
"step": 257
},
{
"epoch": 1.9744897959183674,
"grad_norm": 0.2619436598503336,
"learning_rate": 3.1020981442305187e-06,
"loss": 0.7177,
"step": 258
},
{
"epoch": 1.9821428571428572,
"grad_norm": 0.28254050960476124,
"learning_rate": 3.0607719819757264e-06,
"loss": 0.714,
"step": 259
},
{
"epoch": 1.989795918367347,
"grad_norm": 0.25168812703672494,
"learning_rate": 3.019601169804216e-06,
"loss": 0.6937,
"step": 260
},
{
"epoch": 1.9974489795918369,
"grad_norm": 0.25916481234256045,
"learning_rate": 2.978589005878476e-06,
"loss": 0.728,
"step": 261
},
{
"epoch": 2.0051020408163267,
"grad_norm": 0.29079133938515395,
"learning_rate": 2.937738775651798e-06,
"loss": 0.712,
"step": 262
},
{
"epoch": 2.0127551020408165,
"grad_norm": 0.30679942022688345,
"learning_rate": 2.8970537516050935e-06,
"loss": 0.6888,
"step": 263
},
{
"epoch": 2.020408163265306,
"grad_norm": 0.26247554064017004,
"learning_rate": 2.8565371929847286e-06,
"loss": 0.6857,
"step": 264
},
{
"epoch": 2.0280612244897958,
"grad_norm": 0.25777996123866176,
"learning_rate": 2.816192345541437e-06,
"loss": 0.67,
"step": 265
},
{
"epoch": 2.0357142857142856,
"grad_norm": 0.27157311348157104,
"learning_rate": 2.776022441270295e-06,
"loss": 0.677,
"step": 266
},
{
"epoch": 2.0433673469387754,
"grad_norm": 0.2906167681301434,
"learning_rate": 2.736030698151815e-06,
"loss": 0.6901,
"step": 267
},
{
"epoch": 2.0510204081632653,
"grad_norm": 0.2682203092623702,
"learning_rate": 2.6962203198941587e-06,
"loss": 0.6799,
"step": 268
},
{
"epoch": 2.058673469387755,
"grad_norm": 0.26952637185558564,
"learning_rate": 2.656594495676482e-06,
"loss": 0.6867,
"step": 269
},
{
"epoch": 2.066326530612245,
"grad_norm": 0.2976523762247378,
"learning_rate": 2.6171563998934605e-06,
"loss": 0.6798,
"step": 270
},
{
"epoch": 2.0739795918367347,
"grad_norm": 0.2952839578045796,
"learning_rate": 2.577909191900988e-06,
"loss": 0.6775,
"step": 271
},
{
"epoch": 2.0816326530612246,
"grad_norm": 0.26694864359766846,
"learning_rate": 2.5388560157630765e-06,
"loss": 0.687,
"step": 272
},
{
"epoch": 2.0892857142857144,
"grad_norm": 0.2846824142546979,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.6933,
"step": 273
},
{
"epoch": 2.0969387755102042,
"grad_norm": 0.2603852023863315,
"learning_rate": 2.4613442573376625e-06,
"loss": 0.6932,
"step": 274
},
{
"epoch": 2.104591836734694,
"grad_norm": 0.25484664824654074,
"learning_rate": 2.422891884458241e-06,
"loss": 0.686,
"step": 275
},
{
"epoch": 2.1122448979591835,
"grad_norm": 0.26504213354211553,
"learning_rate": 2.384645961752113e-06,
"loss": 0.6794,
"step": 276
},
{
"epoch": 2.1198979591836733,
"grad_norm": 0.25708180544743425,
"learning_rate": 2.346609553071093e-06,
"loss": 0.6757,
"step": 277
},
{
"epoch": 2.127551020408163,
"grad_norm": 0.2495286574798052,
"learning_rate": 2.308785705482982e-06,
"loss": 0.6904,
"step": 278
},
{
"epoch": 2.135204081632653,
"grad_norm": 0.258995496102722,
"learning_rate": 2.2711774490274767e-06,
"loss": 0.6816,
"step": 279
},
{
"epoch": 2.142857142857143,
"grad_norm": 0.26920967816385,
"learning_rate": 2.2337877964734324e-06,
"loss": 0.6798,
"step": 280
},
{
"epoch": 2.1505102040816326,
"grad_norm": 0.25763183428685615,
"learning_rate": 2.1966197430775056e-06,
"loss": 0.6861,
"step": 281
},
{
"epoch": 2.1581632653061225,
"grad_norm": 0.2476928851183991,
"learning_rate": 2.159676266344222e-06,
"loss": 0.6929,
"step": 282
},
{
"epoch": 2.1658163265306123,
"grad_norm": 0.2509285445340273,
"learning_rate": 2.122960325787432e-06,
"loss": 0.6828,
"step": 283
},
{
"epoch": 2.173469387755102,
"grad_norm": 0.2507947715753146,
"learning_rate": 2.086474862693244e-06,
"loss": 0.6871,
"step": 284
},
{
"epoch": 2.181122448979592,
"grad_norm": 0.2630733156124699,
"learning_rate": 2.050222799884387e-06,
"loss": 0.6799,
"step": 285
},
{
"epoch": 2.188775510204082,
"grad_norm": 0.26301976727268556,
"learning_rate": 2.0142070414860704e-06,
"loss": 0.6858,
"step": 286
},
{
"epoch": 2.1964285714285716,
"grad_norm": 0.24578041007237145,
"learning_rate": 1.9784304726933384e-06,
"loss": 0.6787,
"step": 287
},
{
"epoch": 2.204081632653061,
"grad_norm": 0.2461786949708224,
"learning_rate": 1.942895959539939e-06,
"loss": 0.6994,
"step": 288
},
{
"epoch": 2.211734693877551,
"grad_norm": 0.24930437639531394,
"learning_rate": 1.9076063486687256e-06,
"loss": 0.6958,
"step": 289
},
{
"epoch": 2.2193877551020407,
"grad_norm": 0.24702757253020044,
"learning_rate": 1.8725644671036125e-06,
"loss": 0.6841,
"step": 290
},
{
"epoch": 2.2270408163265305,
"grad_norm": 0.24810798041544824,
"learning_rate": 1.8377731220231144e-06,
"loss": 0.6817,
"step": 291
},
{
"epoch": 2.2346938775510203,
"grad_norm": 0.2398463945344876,
"learning_rate": 1.803235100535452e-06,
"loss": 0.6928,
"step": 292
},
{
"epoch": 2.24234693877551,
"grad_norm": 0.25536410312526053,
"learning_rate": 1.7689531694552863e-06,
"loss": 0.6924,
"step": 293
},
{
"epoch": 2.25,
"grad_norm": 0.24193948267148485,
"learning_rate": 1.7349300750820758e-06,
"loss": 0.7014,
"step": 294
},
{
"epoch": 2.25765306122449,
"grad_norm": 0.25314351545479186,
"learning_rate": 1.7011685429800596e-06,
"loss": 0.6829,
"step": 295
},
{
"epoch": 2.2653061224489797,
"grad_norm": 0.2557447381261113,
"learning_rate": 1.6676712777599275e-06,
"loss": 0.686,
"step": 296
},
{
"epoch": 2.2729591836734695,
"grad_norm": 0.24015367436036683,
"learning_rate": 1.6344409628621482e-06,
"loss": 0.6977,
"step": 297
},
{
"epoch": 2.2806122448979593,
"grad_norm": 0.24549848915323214,
"learning_rate": 1.6014802603420044e-06,
"loss": 0.6929,
"step": 298
},
{
"epoch": 2.288265306122449,
"grad_norm": 0.2554793618240299,
"learning_rate": 1.5687918106563326e-06,
"loss": 0.6917,
"step": 299
},
{
"epoch": 2.295918367346939,
"grad_norm": 0.23893288637534513,
"learning_rate": 1.5363782324520033e-06,
"loss": 0.6919,
"step": 300
},
{
"epoch": 2.3035714285714284,
"grad_norm": 0.2564714188554446,
"learning_rate": 1.504242122356143e-06,
"loss": 0.7097,
"step": 301
},
{
"epoch": 2.311224489795918,
"grad_norm": 0.24283921622086807,
"learning_rate": 1.4723860547681163e-06,
"loss": 0.6849,
"step": 302
},
{
"epoch": 2.318877551020408,
"grad_norm": 0.26173546523391394,
"learning_rate": 1.4408125816532981e-06,
"loss": 0.6993,
"step": 303
},
{
"epoch": 2.326530612244898,
"grad_norm": 0.25486241688912814,
"learning_rate": 1.4095242323386305e-06,
"loss": 0.6788,
"step": 304
},
{
"epoch": 2.3341836734693877,
"grad_norm": 0.2361817323877386,
"learning_rate": 1.3785235133100088e-06,
"loss": 0.6905,
"step": 305
},
{
"epoch": 2.3418367346938775,
"grad_norm": 0.24162144820607892,
"learning_rate": 1.347812908011485e-06,
"loss": 0.6841,
"step": 306
},
{
"epoch": 2.3494897959183674,
"grad_norm": 0.2490597468001364,
"learning_rate": 1.3173948766463146e-06,
"loss": 0.6802,
"step": 307
},
{
"epoch": 2.357142857142857,
"grad_norm": 0.25398677763039623,
"learning_rate": 1.2872718559798852e-06,
"loss": 0.6886,
"step": 308
},
{
"epoch": 2.364795918367347,
"grad_norm": 0.23490252476576118,
"learning_rate": 1.257446259144494e-06,
"loss": 0.6868,
"step": 309
},
{
"epoch": 2.372448979591837,
"grad_norm": 0.23763836546961506,
"learning_rate": 1.2279204754460494e-06,
"loss": 0.6941,
"step": 310
},
{
"epoch": 2.3801020408163267,
"grad_norm": 0.2355524230958337,
"learning_rate": 1.1986968701726492e-06,
"loss": 0.6915,
"step": 311
},
{
"epoch": 2.387755102040816,
"grad_norm": 0.24752535017353638,
"learning_rate": 1.1697777844051105e-06,
"loss": 0.6921,
"step": 312
},
{
"epoch": 2.395408163265306,
"grad_norm": 0.2471974082020202,
"learning_rate": 1.141165534829425e-06,
"loss": 0.6893,
"step": 313
},
{
"epoch": 2.4030612244897958,
"grad_norm": 0.23381448120221882,
"learning_rate": 1.1128624135511712e-06,
"loss": 0.6731,
"step": 314
},
{
"epoch": 2.4107142857142856,
"grad_norm": 0.2389896648493918,
"learning_rate": 1.0848706879118893e-06,
"loss": 0.6969,
"step": 315
},
{
"epoch": 2.4183673469387754,
"grad_norm": 0.24224169159980133,
"learning_rate": 1.057192600307456e-06,
"loss": 0.6955,
"step": 316
},
{
"epoch": 2.4260204081632653,
"grad_norm": 0.24100521101649902,
"learning_rate": 1.0298303680084448e-06,
"loss": 0.6896,
"step": 317
},
{
"epoch": 2.433673469387755,
"grad_norm": 0.2344749626752725,
"learning_rate": 1.0027861829824953e-06,
"loss": 0.6819,
"step": 318
},
{
"epoch": 2.441326530612245,
"grad_norm": 0.2319344660343658,
"learning_rate": 9.760622117187234e-07,
"loss": 0.6786,
"step": 319
},
{
"epoch": 2.4489795918367347,
"grad_norm": 0.23391479387830102,
"learning_rate": 9.496605950541676e-07,
"loss": 0.7039,
"step": 320
},
{
"epoch": 2.4566326530612246,
"grad_norm": 0.23186779062668114,
"learning_rate": 9.235834480022788e-07,
"loss": 0.688,
"step": 321
},
{
"epoch": 2.4642857142857144,
"grad_norm": 0.23669253409024169,
"learning_rate": 8.978328595834984e-07,
"loss": 0.7044,
"step": 322
},
{
"epoch": 2.4719387755102042,
"grad_norm": 0.24077368080825076,
"learning_rate": 8.724108926579e-07,
"loss": 0.6872,
"step": 323
},
{
"epoch": 2.479591836734694,
"grad_norm": 0.2388246013777763,
"learning_rate": 8.473195837599419e-07,
"loss": 0.6916,
"step": 324
},
{
"epoch": 2.487244897959184,
"grad_norm": 0.24710155251980784,
"learning_rate": 8.225609429353187e-07,
"loss": 0.684,
"step": 325
},
{
"epoch": 2.4948979591836733,
"grad_norm": 0.24220633252180443,
"learning_rate": 7.981369535799354e-07,
"loss": 0.6879,
"step": 326
},
{
"epoch": 2.502551020408163,
"grad_norm": 0.2302557057062771,
"learning_rate": 7.740495722810271e-07,
"loss": 0.7039,
"step": 327
},
{
"epoch": 2.510204081632653,
"grad_norm": 0.24237334538109565,
"learning_rate": 7.50300728660407e-07,
"loss": 0.6929,
"step": 328
},
{
"epoch": 2.517857142857143,
"grad_norm": 0.23815797746653947,
"learning_rate": 7.26892325219899e-07,
"loss": 0.6959,
"step": 329
},
{
"epoch": 2.5255102040816326,
"grad_norm": 0.23135128489800952,
"learning_rate": 7.03826237188916e-07,
"loss": 0.675,
"step": 330
},
{
"epoch": 2.5331632653061225,
"grad_norm": 0.22784611175554673,
"learning_rate": 6.811043123742494e-07,
"loss": 0.6977,
"step": 331
},
{
"epoch": 2.5408163265306123,
"grad_norm": 0.2405465208719118,
"learning_rate": 6.587283710120324e-07,
"loss": 0.6749,
"step": 332
},
{
"epoch": 2.548469387755102,
"grad_norm": 0.2309843149804867,
"learning_rate": 6.367002056219285e-07,
"loss": 0.6909,
"step": 333
},
{
"epoch": 2.556122448979592,
"grad_norm": 0.22895030021484628,
"learning_rate": 6.150215808635334e-07,
"loss": 0.6832,
"step": 334
},
{
"epoch": 2.563775510204082,
"grad_norm": 0.22696479172732587,
"learning_rate": 5.936942333950063e-07,
"loss": 0.6746,
"step": 335
},
{
"epoch": 2.571428571428571,
"grad_norm": 0.22839398316256343,
"learning_rate": 5.727198717339511e-07,
"loss": 0.6866,
"step": 336
},
{
"epoch": 2.579081632653061,
"grad_norm": 0.22787946605540033,
"learning_rate": 5.521001761205441e-07,
"loss": 0.6995,
"step": 337
},
{
"epoch": 2.586734693877551,
"grad_norm": 0.22190853543659503,
"learning_rate": 5.318367983829393e-07,
"loss": 0.6785,
"step": 338
},
{
"epoch": 2.5943877551020407,
"grad_norm": 0.23020833242127728,
"learning_rate": 5.119313618049309e-07,
"loss": 0.6965,
"step": 339
},
{
"epoch": 2.6020408163265305,
"grad_norm": 0.24859089494546147,
"learning_rate": 4.9238546099592e-07,
"loss": 0.6799,
"step": 340
},
{
"epoch": 2.6096938775510203,
"grad_norm": 0.24590659281484603,
"learning_rate": 4.732006617631729e-07,
"loss": 0.6705,
"step": 341
},
{
"epoch": 2.61734693877551,
"grad_norm": 0.23193659731275013,
"learning_rate": 4.54378500986381e-07,
"loss": 0.6899,
"step": 342
},
{
"epoch": 2.625,
"grad_norm": 0.22818730097649043,
"learning_rate": 4.35920486494546e-07,
"loss": 0.6735,
"step": 343
},
{
"epoch": 2.63265306122449,
"grad_norm": 0.23430936758279247,
"learning_rate": 4.1782809694518533e-07,
"loss": 0.691,
"step": 344
},
{
"epoch": 2.6403061224489797,
"grad_norm": 0.22808387025349178,
"learning_rate": 4.001027817058789e-07,
"loss": 0.6862,
"step": 345
},
{
"epoch": 2.6479591836734695,
"grad_norm": 0.22146676723869588,
"learning_rate": 3.8274596073816784e-07,
"loss": 0.6863,
"step": 346
},
{
"epoch": 2.6556122448979593,
"grad_norm": 0.22664786178462878,
"learning_rate": 3.657590244837911e-07,
"loss": 0.6849,
"step": 347
},
{
"epoch": 2.663265306122449,
"grad_norm": 0.23451188263459435,
"learning_rate": 3.49143333753309e-07,
"loss": 0.6877,
"step": 348
},
{
"epoch": 2.670918367346939,
"grad_norm": 0.24867897398782676,
"learning_rate": 3.3290021961708163e-07,
"loss": 0.6802,
"step": 349
},
{
"epoch": 2.678571428571429,
"grad_norm": 0.22314324400743446,
"learning_rate": 3.1703098329864237e-07,
"loss": 0.6824,
"step": 350
},
{
"epoch": 2.686224489795918,
"grad_norm": 0.2236801602770835,
"learning_rate": 3.015368960704584e-07,
"loss": 0.6928,
"step": 351
},
{
"epoch": 2.693877551020408,
"grad_norm": 0.2385665875720026,
"learning_rate": 2.864191991520848e-07,
"loss": 0.6846,
"step": 352
},
{
"epoch": 2.701530612244898,
"grad_norm": 0.23375450743740805,
"learning_rate": 2.71679103610738e-07,
"loss": 0.6909,
"step": 353
},
{
"epoch": 2.7091836734693877,
"grad_norm": 0.22520474139439234,
"learning_rate": 2.573177902642726e-07,
"loss": 0.6788,
"step": 354
},
{
"epoch": 2.7168367346938775,
"grad_norm": 0.22804273783537157,
"learning_rate": 2.4333640958659144e-07,
"loss": 0.7087,
"step": 355
},
{
"epoch": 2.7244897959183674,
"grad_norm": 0.22739247348429448,
"learning_rate": 2.2973608161547755e-07,
"loss": 0.695,
"step": 356
},
{
"epoch": 2.732142857142857,
"grad_norm": 0.23455037703234047,
"learning_rate": 2.1651789586287442e-07,
"loss": 0.6805,
"step": 357
},
{
"epoch": 2.739795918367347,
"grad_norm": 0.23159720573508338,
"learning_rate": 2.0368291122759898e-07,
"loss": 0.6891,
"step": 358
},
{
"epoch": 2.747448979591837,
"grad_norm": 0.22563711327867858,
"learning_rate": 1.9123215591052014e-07,
"loss": 0.6874,
"step": 359
},
{
"epoch": 2.7551020408163263,
"grad_norm": 0.2244203065125607,
"learning_rate": 1.7916662733218848e-07,
"loss": 0.6779,
"step": 360
},
{
"epoch": 2.762755102040816,
"grad_norm": 0.22559981924958145,
"learning_rate": 1.6748729205293024e-07,
"loss": 0.6673,
"step": 361
},
{
"epoch": 2.770408163265306,
"grad_norm": 0.23162805532792882,
"learning_rate": 1.5619508569542363e-07,
"loss": 0.6607,
"step": 362
},
{
"epoch": 2.7780612244897958,
"grad_norm": 0.23717254901729207,
"learning_rate": 1.4529091286973994e-07,
"loss": 0.6771,
"step": 363
},
{
"epoch": 2.7857142857142856,
"grad_norm": 0.22534766714917717,
"learning_rate": 1.3477564710088097e-07,
"loss": 0.6853,
"step": 364
},
{
"epoch": 2.7933673469387754,
"grad_norm": 0.23074216180625626,
"learning_rate": 1.2465013075879884e-07,
"loss": 0.6898,
"step": 365
},
{
"epoch": 2.8010204081632653,
"grad_norm": 0.23196388562982978,
"learning_rate": 1.1491517499091498e-07,
"loss": 0.6936,
"step": 366
},
{
"epoch": 2.808673469387755,
"grad_norm": 0.2358477783756292,
"learning_rate": 1.055715596571405e-07,
"loss": 0.6814,
"step": 367
},
{
"epoch": 2.816326530612245,
"grad_norm": 0.22714752868258517,
"learning_rate": 9.662003326740166e-08,
"loss": 0.6808,
"step": 368
},
{
"epoch": 2.8239795918367347,
"grad_norm": 0.21518858671804875,
"learning_rate": 8.80613129216762e-08,
"loss": 0.6754,
"step": 369
},
{
"epoch": 2.8316326530612246,
"grad_norm": 0.23291929200714853,
"learning_rate": 7.989608425254924e-08,
"loss": 0.6787,
"step": 370
},
{
"epoch": 2.8392857142857144,
"grad_norm": 0.22181996825718708,
"learning_rate": 7.212500137028789e-08,
"loss": 0.6894,
"step": 371
},
{
"epoch": 2.8469387755102042,
"grad_norm": 0.22193037565442786,
"learning_rate": 6.474868681043578e-08,
"loss": 0.6698,
"step": 372
},
{
"epoch": 2.854591836734694,
"grad_norm": 0.22934416722405232,
"learning_rate": 5.776773148394976e-08,
"loss": 0.6952,
"step": 373
},
{
"epoch": 2.862244897959184,
"grad_norm": 0.22584203654514134,
"learning_rate": 5.1182694629857145e-08,
"loss": 0.6957,
"step": 374
},
{
"epoch": 2.8698979591836737,
"grad_norm": 0.22461460701111055,
"learning_rate": 4.499410377045765e-08,
"loss": 0.6886,
"step": 375
},
{
"epoch": 2.877551020408163,
"grad_norm": 0.22733876386433874,
"learning_rate": 3.9202454669063915e-08,
"loss": 0.6977,
"step": 376
},
{
"epoch": 2.885204081632653,
"grad_norm": 0.22962431221391724,
"learning_rate": 3.3808211290284886e-08,
"loss": 0.698,
"step": 377
},
{
"epoch": 2.892857142857143,
"grad_norm": 0.23151231532840025,
"learning_rate": 2.8811805762860578e-08,
"loss": 0.6915,
"step": 378
},
{
"epoch": 2.9005102040816326,
"grad_norm": 0.22229087776784215,
"learning_rate": 2.4213638345040868e-08,
"loss": 0.6814,
"step": 379
},
{
"epoch": 2.9081632653061225,
"grad_norm": 0.2266877035262688,
"learning_rate": 2.0014077392525035e-08,
"loss": 0.6795,
"step": 380
},
{
"epoch": 2.9158163265306123,
"grad_norm": 0.2250462204085745,
"learning_rate": 1.6213459328950355e-08,
"loss": 0.6788,
"step": 381
},
{
"epoch": 2.923469387755102,
"grad_norm": 0.23211476979219928,
"learning_rate": 1.2812088618942009e-08,
"loss": 0.6957,
"step": 382
},
{
"epoch": 2.931122448979592,
"grad_norm": 0.22076310527156465,
"learning_rate": 9.810237743724805e-09,
"loss": 0.6813,
"step": 383
},
{
"epoch": 2.938775510204082,
"grad_norm": 0.2275087137841747,
"learning_rate": 7.2081471792911914e-09,
"loss": 0.685,
"step": 384
},
{
"epoch": 2.946428571428571,
"grad_norm": 0.22791650362368526,
"learning_rate": 5.006025377138901e-09,
"loss": 0.6748,
"step": 385
},
{
"epoch": 2.954081632653061,
"grad_norm": 0.22210915099031533,
"learning_rate": 3.204048747573185e-09,
"loss": 0.6824,
"step": 386
},
{
"epoch": 2.961734693877551,
"grad_norm": 0.22179326719231918,
"learning_rate": 1.8023616455731253e-09,
"loss": 0.6791,
"step": 387
},
{
"epoch": 2.9693877551020407,
"grad_norm": 0.21982432832999604,
"learning_rate": 8.010763592264381e-10,
"loss": 0.6825,
"step": 388
},
{
"epoch": 2.9770408163265305,
"grad_norm": 0.23354611683548948,
"learning_rate": 2.0027310073833516e-10,
"loss": 0.6797,
"step": 389
},
{
"epoch": 2.9846938775510203,
"grad_norm": 0.22225271783019312,
"learning_rate": 0.0,
"loss": 0.6847,
"step": 390
},
{
"epoch": 2.9846938775510203,
"step": 390,
"total_flos": 1.3384682412693258e+18,
"train_loss": 0.0,
"train_runtime": 16.2295,
"train_samples_per_second": 2317.452,
"train_steps_per_second": 24.03
}
],
"logging_steps": 1,
"max_steps": 390,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3384682412693258e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}