68cf56f23c / last-checkpoint /trainer_state.json
magatex's picture
Training in progress, step 600, checkpoint
3f36f31 verified
{
"best_metric": 0.004960117861628532,
"best_model_checkpoint": "miner_id_24/checkpoint-450",
"epoch": 1.9692307692307693,
"eval_steps": 25,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003282051282051282,
"grad_norm": 8.107928276062012,
"learning_rate": 8.333333333333334e-06,
"loss": 13.7808,
"step": 1
},
{
"epoch": 0.003282051282051282,
"eval_loss": 16.358562469482422,
"eval_runtime": 1.6836,
"eval_samples_per_second": 29.698,
"eval_steps_per_second": 29.698,
"step": 1
},
{
"epoch": 0.006564102564102564,
"grad_norm": 8.200773239135742,
"learning_rate": 1.6666666666666667e-05,
"loss": 14.0115,
"step": 2
},
{
"epoch": 0.009846153846153846,
"grad_norm": 8.451738357543945,
"learning_rate": 2.5e-05,
"loss": 14.201,
"step": 3
},
{
"epoch": 0.013128205128205127,
"grad_norm": 8.04790210723877,
"learning_rate": 3.3333333333333335e-05,
"loss": 14.1223,
"step": 4
},
{
"epoch": 0.01641025641025641,
"grad_norm": 8.250162124633789,
"learning_rate": 4.166666666666667e-05,
"loss": 14.0582,
"step": 5
},
{
"epoch": 0.019692307692307693,
"grad_norm": 8.590646743774414,
"learning_rate": 5e-05,
"loss": 13.9467,
"step": 6
},
{
"epoch": 0.022974358974358976,
"grad_norm": 9.05842399597168,
"learning_rate": 5.833333333333334e-05,
"loss": 13.9617,
"step": 7
},
{
"epoch": 0.026256410256410255,
"grad_norm": 9.801227569580078,
"learning_rate": 6.666666666666667e-05,
"loss": 13.5738,
"step": 8
},
{
"epoch": 0.029538461538461538,
"grad_norm": 10.93070125579834,
"learning_rate": 7.500000000000001e-05,
"loss": 13.7217,
"step": 9
},
{
"epoch": 0.03282051282051282,
"grad_norm": 12.238622665405273,
"learning_rate": 8.333333333333334e-05,
"loss": 13.0686,
"step": 10
},
{
"epoch": 0.0361025641025641,
"grad_norm": 14.860128402709961,
"learning_rate": 9.166666666666667e-05,
"loss": 13.3774,
"step": 11
},
{
"epoch": 0.039384615384615386,
"grad_norm": 14.760673522949219,
"learning_rate": 0.0001,
"loss": 13.4816,
"step": 12
},
{
"epoch": 0.042666666666666665,
"grad_norm": 13.52017593383789,
"learning_rate": 0.00010833333333333333,
"loss": 12.5105,
"step": 13
},
{
"epoch": 0.04594871794871795,
"grad_norm": 15.965612411499023,
"learning_rate": 0.00011666666666666668,
"loss": 12.7767,
"step": 14
},
{
"epoch": 0.04923076923076923,
"grad_norm": 14.593632698059082,
"learning_rate": 0.000125,
"loss": 11.4924,
"step": 15
},
{
"epoch": 0.05251282051282051,
"grad_norm": 14.693791389465332,
"learning_rate": 0.00013333333333333334,
"loss": 10.9891,
"step": 16
},
{
"epoch": 0.055794871794871796,
"grad_norm": 16.373519897460938,
"learning_rate": 0.00014166666666666668,
"loss": 9.5602,
"step": 17
},
{
"epoch": 0.059076923076923075,
"grad_norm": 16.455101013183594,
"learning_rate": 0.00015000000000000001,
"loss": 8.4045,
"step": 18
},
{
"epoch": 0.06235897435897436,
"grad_norm": 15.62342357635498,
"learning_rate": 0.00015833333333333332,
"loss": 6.9826,
"step": 19
},
{
"epoch": 0.06564102564102564,
"grad_norm": 12.31684684753418,
"learning_rate": 0.0001666666666666667,
"loss": 5.5791,
"step": 20
},
{
"epoch": 0.06892307692307692,
"grad_norm": 9.89257526397705,
"learning_rate": 0.000175,
"loss": 4.5438,
"step": 21
},
{
"epoch": 0.0722051282051282,
"grad_norm": 7.226674556732178,
"learning_rate": 0.00018333333333333334,
"loss": 3.9863,
"step": 22
},
{
"epoch": 0.07548717948717949,
"grad_norm": 3.454035520553589,
"learning_rate": 0.00019166666666666667,
"loss": 3.3812,
"step": 23
},
{
"epoch": 0.07876923076923077,
"grad_norm": 4.084013938903809,
"learning_rate": 0.0002,
"loss": 3.3663,
"step": 24
},
{
"epoch": 0.08205128205128205,
"grad_norm": 4.125749111175537,
"learning_rate": 0.00019999866135254795,
"loss": 3.1984,
"step": 25
},
{
"epoch": 0.08205128205128205,
"eval_loss": 3.513918399810791,
"eval_runtime": 1.7168,
"eval_samples_per_second": 29.124,
"eval_steps_per_second": 29.124,
"step": 25
},
{
"epoch": 0.08533333333333333,
"grad_norm": 3.0909500122070312,
"learning_rate": 0.0001999946454500135,
"loss": 2.8699,
"step": 26
},
{
"epoch": 0.08861538461538461,
"grad_norm": 3.214726686477661,
"learning_rate": 0.00019998795241186058,
"loss": 2.6662,
"step": 27
},
{
"epoch": 0.0918974358974359,
"grad_norm": 3.6891489028930664,
"learning_rate": 0.00019997858243719183,
"loss": 2.455,
"step": 28
},
{
"epoch": 0.09517948717948718,
"grad_norm": 3.7046895027160645,
"learning_rate": 0.00019996653580474266,
"loss": 2.2075,
"step": 29
},
{
"epoch": 0.09846153846153846,
"grad_norm": 4.107726097106934,
"learning_rate": 0.00019995181287287293,
"loss": 1.92,
"step": 30
},
{
"epoch": 0.10174358974358974,
"grad_norm": 5.024882793426514,
"learning_rate": 0.0001999344140795563,
"loss": 1.6797,
"step": 31
},
{
"epoch": 0.10502564102564102,
"grad_norm": 4.104298114776611,
"learning_rate": 0.0001999143399423672,
"loss": 1.3627,
"step": 32
},
{
"epoch": 0.10830769230769231,
"grad_norm": 3.537320137023926,
"learning_rate": 0.00019989159105846555,
"loss": 1.0682,
"step": 33
},
{
"epoch": 0.11158974358974359,
"grad_norm": 3.1266891956329346,
"learning_rate": 0.00019986616810457867,
"loss": 0.8258,
"step": 34
},
{
"epoch": 0.11487179487179487,
"grad_norm": 2.4511232376098633,
"learning_rate": 0.00019983807183698163,
"loss": 0.6316,
"step": 35
},
{
"epoch": 0.11815384615384615,
"grad_norm": 3.5290353298187256,
"learning_rate": 0.00019980730309147434,
"loss": 0.6164,
"step": 36
},
{
"epoch": 0.12143589743589743,
"grad_norm": 1.588909387588501,
"learning_rate": 0.0001997738627833568,
"loss": 0.3606,
"step": 37
},
{
"epoch": 0.12471794871794872,
"grad_norm": 1.2943755388259888,
"learning_rate": 0.000199737751907402,
"loss": 0.2683,
"step": 38
},
{
"epoch": 0.128,
"grad_norm": 1.034509539604187,
"learning_rate": 0.00019969897153782623,
"loss": 0.1998,
"step": 39
},
{
"epoch": 0.13128205128205128,
"grad_norm": 0.7319996356964111,
"learning_rate": 0.00019965752282825712,
"loss": 0.1469,
"step": 40
},
{
"epoch": 0.13456410256410256,
"grad_norm": 0.6485500931739807,
"learning_rate": 0.00019961340701169926,
"loss": 0.1237,
"step": 41
},
{
"epoch": 0.13784615384615384,
"grad_norm": 0.43038779497146606,
"learning_rate": 0.00019956662540049773,
"loss": 0.0857,
"step": 42
},
{
"epoch": 0.14112820512820512,
"grad_norm": 0.31721484661102295,
"learning_rate": 0.0001995171793862988,
"loss": 0.0624,
"step": 43
},
{
"epoch": 0.1444102564102564,
"grad_norm": 0.2403116375207901,
"learning_rate": 0.00019946507044000877,
"loss": 0.0492,
"step": 44
},
{
"epoch": 0.1476923076923077,
"grad_norm": 0.21824949979782104,
"learning_rate": 0.00019941030011175,
"loss": 0.0469,
"step": 45
},
{
"epoch": 0.15097435897435899,
"grad_norm": 0.1729796677827835,
"learning_rate": 0.00019935287003081494,
"loss": 0.0372,
"step": 46
},
{
"epoch": 0.15425641025641026,
"grad_norm": 0.1362195611000061,
"learning_rate": 0.00019929278190561767,
"loss": 0.0307,
"step": 47
},
{
"epoch": 0.15753846153846154,
"grad_norm": 0.11314882338047028,
"learning_rate": 0.00019923003752364297,
"loss": 0.0259,
"step": 48
},
{
"epoch": 0.16082051282051282,
"grad_norm": 0.1020006462931633,
"learning_rate": 0.00019916463875139316,
"loss": 0.0239,
"step": 49
},
{
"epoch": 0.1641025641025641,
"grad_norm": 0.12704280018806458,
"learning_rate": 0.00019909658753433272,
"loss": 0.0235,
"step": 50
},
{
"epoch": 0.1641025641025641,
"eval_loss": 0.10366738587617874,
"eval_runtime": 1.6921,
"eval_samples_per_second": 29.548,
"eval_steps_per_second": 29.548,
"step": 50
},
{
"epoch": 0.16738461538461538,
"grad_norm": 4.014084815979004,
"learning_rate": 0.0001990258858968303,
"loss": 0.1188,
"step": 51
},
{
"epoch": 0.17066666666666666,
"grad_norm": 3.2102086544036865,
"learning_rate": 0.0001989525359420985,
"loss": 0.0992,
"step": 52
},
{
"epoch": 0.17394871794871794,
"grad_norm": 1.2787097692489624,
"learning_rate": 0.00019887653985213124,
"loss": 0.0536,
"step": 53
},
{
"epoch": 0.17723076923076922,
"grad_norm": 1.2789212465286255,
"learning_rate": 0.00019879789988763914,
"loss": 0.0527,
"step": 54
},
{
"epoch": 0.18051282051282053,
"grad_norm": 0.691673755645752,
"learning_rate": 0.0001987166183879818,
"loss": 0.0349,
"step": 55
},
{
"epoch": 0.1837948717948718,
"grad_norm": 0.8248271942138672,
"learning_rate": 0.00019863269777109873,
"loss": 0.0326,
"step": 56
},
{
"epoch": 0.18707692307692309,
"grad_norm": 0.2020619660615921,
"learning_rate": 0.00019854614053343696,
"loss": 0.0209,
"step": 57
},
{
"epoch": 0.19035897435897436,
"grad_norm": 0.10885163396596909,
"learning_rate": 0.0001984569492498771,
"loss": 0.0165,
"step": 58
},
{
"epoch": 0.19364102564102564,
"grad_norm": 0.08642534166574478,
"learning_rate": 0.00019836512657365657,
"loss": 0.0146,
"step": 59
},
{
"epoch": 0.19692307692307692,
"grad_norm": 0.7858078479766846,
"learning_rate": 0.00019827067523629075,
"loss": 0.0332,
"step": 60
},
{
"epoch": 0.2002051282051282,
"grad_norm": 3.002739191055298,
"learning_rate": 0.00019817359804749166,
"loss": 0.021,
"step": 61
},
{
"epoch": 0.20348717948717948,
"grad_norm": 1.7393077611923218,
"learning_rate": 0.00019807389789508445,
"loss": 0.0173,
"step": 62
},
{
"epoch": 0.20676923076923076,
"grad_norm": 0.1218792125582695,
"learning_rate": 0.0001979715777449215,
"loss": 0.0135,
"step": 63
},
{
"epoch": 0.21005128205128204,
"grad_norm": 0.11387912929058075,
"learning_rate": 0.00019786664064079401,
"loss": 0.0133,
"step": 64
},
{
"epoch": 0.21333333333333335,
"grad_norm": 0.08703344315290451,
"learning_rate": 0.0001977590897043418,
"loss": 0.0126,
"step": 65
},
{
"epoch": 0.21661538461538463,
"grad_norm": 0.0711798295378685,
"learning_rate": 0.00019764892813496003,
"loss": 0.0123,
"step": 66
},
{
"epoch": 0.2198974358974359,
"grad_norm": 0.056465234607458115,
"learning_rate": 0.00019753615920970442,
"loss": 0.0116,
"step": 67
},
{
"epoch": 0.22317948717948718,
"grad_norm": 0.051314447075128555,
"learning_rate": 0.00019742078628319355,
"loss": 0.0111,
"step": 68
},
{
"epoch": 0.22646153846153846,
"grad_norm": 0.04775601997971535,
"learning_rate": 0.00019730281278750898,
"loss": 0.0109,
"step": 69
},
{
"epoch": 0.22974358974358974,
"grad_norm": 0.04364297538995743,
"learning_rate": 0.00019718224223209342,
"loss": 0.0105,
"step": 70
},
{
"epoch": 0.23302564102564102,
"grad_norm": 0.041208017617464066,
"learning_rate": 0.00019705907820364603,
"loss": 0.0101,
"step": 71
},
{
"epoch": 0.2363076923076923,
"grad_norm": 0.03828458860516548,
"learning_rate": 0.00019693332436601614,
"loss": 0.0096,
"step": 72
},
{
"epoch": 0.23958974358974358,
"grad_norm": 0.03837643936276436,
"learning_rate": 0.0001968049844600938,
"loss": 0.0097,
"step": 73
},
{
"epoch": 0.24287179487179486,
"grad_norm": 0.03652190417051315,
"learning_rate": 0.00019667406230369864,
"loss": 0.0094,
"step": 74
},
{
"epoch": 0.24615384615384617,
"grad_norm": 0.03440447524189949,
"learning_rate": 0.00019654056179146658,
"loss": 0.0091,
"step": 75
},
{
"epoch": 0.24615384615384617,
"eval_loss": 0.010809546336531639,
"eval_runtime": 1.7246,
"eval_samples_per_second": 28.993,
"eval_steps_per_second": 28.993,
"step": 75
},
{
"epoch": 0.24943589743589745,
"grad_norm": 0.03175151348114014,
"learning_rate": 0.0001964044868947336,
"loss": 0.0087,
"step": 76
},
{
"epoch": 0.2527179487179487,
"grad_norm": 0.03136426582932472,
"learning_rate": 0.00019626584166141777,
"loss": 0.0087,
"step": 77
},
{
"epoch": 0.256,
"grad_norm": 0.030030284076929092,
"learning_rate": 0.0001961246302158988,
"loss": 0.0085,
"step": 78
},
{
"epoch": 0.2592820512820513,
"grad_norm": 0.028109928593039513,
"learning_rate": 0.00019598085675889547,
"loss": 0.0083,
"step": 79
},
{
"epoch": 0.26256410256410256,
"grad_norm": 0.026804205030202866,
"learning_rate": 0.00019583452556734044,
"loss": 0.008,
"step": 80
},
{
"epoch": 0.26584615384615384,
"grad_norm": 0.025468742474913597,
"learning_rate": 0.0001956856409942532,
"loss": 0.0079,
"step": 81
},
{
"epoch": 0.2691282051282051,
"grad_norm": 0.02471235767006874,
"learning_rate": 0.00019553420746861052,
"loss": 0.0078,
"step": 82
},
{
"epoch": 0.2724102564102564,
"grad_norm": 0.023358380421996117,
"learning_rate": 0.00019538022949521465,
"loss": 0.0078,
"step": 83
},
{
"epoch": 0.2756923076923077,
"grad_norm": 0.022298390045762062,
"learning_rate": 0.00019522371165455954,
"loss": 0.0076,
"step": 84
},
{
"epoch": 0.27897435897435896,
"grad_norm": 0.021381327882409096,
"learning_rate": 0.0001950646586026941,
"loss": 0.0076,
"step": 85
},
{
"epoch": 0.28225641025641024,
"grad_norm": 0.02039244771003723,
"learning_rate": 0.00019490307507108426,
"loss": 0.0075,
"step": 86
},
{
"epoch": 0.2855384615384615,
"grad_norm": 0.1511549949645996,
"learning_rate": 0.00019473896586647186,
"loss": 0.0085,
"step": 87
},
{
"epoch": 0.2888205128205128,
"grad_norm": 0.01761717163026333,
"learning_rate": 0.00019457233587073176,
"loss": 0.007,
"step": 88
},
{
"epoch": 0.2921025641025641,
"grad_norm": 0.01719754748046398,
"learning_rate": 0.0001944031900407266,
"loss": 0.007,
"step": 89
},
{
"epoch": 0.2953846153846154,
"grad_norm": 0.016939733177423477,
"learning_rate": 0.0001942315334081593,
"loss": 0.0072,
"step": 90
},
{
"epoch": 0.2986666666666667,
"grad_norm": 0.01624459959566593,
"learning_rate": 0.00019405737107942362,
"loss": 0.007,
"step": 91
},
{
"epoch": 0.30194871794871797,
"grad_norm": 0.016138238832354546,
"learning_rate": 0.00019388070823545187,
"loss": 0.007,
"step": 92
},
{
"epoch": 0.30523076923076925,
"grad_norm": 0.01595952734351158,
"learning_rate": 0.0001937015501315611,
"loss": 0.007,
"step": 93
},
{
"epoch": 0.30851282051282053,
"grad_norm": 0.015773704275488853,
"learning_rate": 0.00019351990209729662,
"loss": 0.0069,
"step": 94
},
{
"epoch": 0.3117948717948718,
"grad_norm": 0.015645822510123253,
"learning_rate": 0.0001933357695362735,
"loss": 0.0068,
"step": 95
},
{
"epoch": 0.3150769230769231,
"grad_norm": 0.015485318377614021,
"learning_rate": 0.00019314915792601581,
"loss": 0.0068,
"step": 96
},
{
"epoch": 0.31835897435897437,
"grad_norm": 0.016759535297751427,
"learning_rate": 0.00019296007281779373,
"loss": 0.0069,
"step": 97
},
{
"epoch": 0.32164102564102565,
"grad_norm": 0.016155634075403214,
"learning_rate": 0.0001927685198364583,
"loss": 0.0067,
"step": 98
},
{
"epoch": 0.3249230769230769,
"grad_norm": 0.0162162147462368,
"learning_rate": 0.0001925745046802742,
"loss": 0.0069,
"step": 99
},
{
"epoch": 0.3282051282051282,
"grad_norm": 0.01534635853022337,
"learning_rate": 0.00019237803312075028,
"loss": 0.0068,
"step": 100
},
{
"epoch": 0.3282051282051282,
"eval_loss": 0.00823766179382801,
"eval_runtime": 1.7026,
"eval_samples_per_second": 29.368,
"eval_steps_per_second": 29.368,
"step": 100
},
{
"epoch": 0.3314871794871795,
"grad_norm": 0.5894611477851868,
"learning_rate": 0.00019217911100246756,
"loss": 0.0151,
"step": 101
},
{
"epoch": 0.33476923076923076,
"grad_norm": 0.1130584329366684,
"learning_rate": 0.00019197774424290582,
"loss": 0.0103,
"step": 102
},
{
"epoch": 0.33805128205128204,
"grad_norm": 0.0547531358897686,
"learning_rate": 0.0001917739388322673,
"loss": 0.0098,
"step": 103
},
{
"epoch": 0.3413333333333333,
"grad_norm": 0.03562573716044426,
"learning_rate": 0.0001915677008332985,
"loss": 0.0087,
"step": 104
},
{
"epoch": 0.3446153846153846,
"grad_norm": 0.030168889090418816,
"learning_rate": 0.00019135903638110993,
"loss": 0.0083,
"step": 105
},
{
"epoch": 0.3478974358974359,
"grad_norm": 0.023503584787249565,
"learning_rate": 0.00019114795168299347,
"loss": 0.0076,
"step": 106
},
{
"epoch": 0.35117948717948716,
"grad_norm": 0.020446596667170525,
"learning_rate": 0.00019093445301823788,
"loss": 0.0073,
"step": 107
},
{
"epoch": 0.35446153846153844,
"grad_norm": 0.01846328377723694,
"learning_rate": 0.00019071854673794196,
"loss": 0.0069,
"step": 108
},
{
"epoch": 0.3577435897435897,
"grad_norm": 0.016864923760294914,
"learning_rate": 0.00019050023926482548,
"loss": 0.0066,
"step": 109
},
{
"epoch": 0.36102564102564105,
"grad_norm": 0.017281338572502136,
"learning_rate": 0.00019027953709303827,
"loss": 0.0066,
"step": 110
},
{
"epoch": 0.36430769230769233,
"grad_norm": 0.017325421795248985,
"learning_rate": 0.00019005644678796705,
"loss": 0.0065,
"step": 111
},
{
"epoch": 0.3675897435897436,
"grad_norm": 0.020572949200868607,
"learning_rate": 0.00018983097498603995,
"loss": 0.0062,
"step": 112
},
{
"epoch": 0.3708717948717949,
"grad_norm": 0.018894601613283157,
"learning_rate": 0.00018960312839452932,
"loss": 0.0064,
"step": 113
},
{
"epoch": 0.37415384615384617,
"grad_norm": 0.01940176449716091,
"learning_rate": 0.00018937291379135196,
"loss": 0.0063,
"step": 114
},
{
"epoch": 0.37743589743589745,
"grad_norm": 2.0286033153533936,
"learning_rate": 0.00018914033802486775,
"loss": 0.0516,
"step": 115
},
{
"epoch": 0.38071794871794873,
"grad_norm": 0.01834729313850403,
"learning_rate": 0.00018890540801367572,
"loss": 0.0063,
"step": 116
},
{
"epoch": 0.384,
"grad_norm": 0.016609683632850647,
"learning_rate": 0.0001886681307464083,
"loss": 0.0061,
"step": 117
},
{
"epoch": 0.3872820512820513,
"grad_norm": 0.014862019568681717,
"learning_rate": 0.00018842851328152355,
"loss": 0.0059,
"step": 118
},
{
"epoch": 0.39056410256410257,
"grad_norm": 0.014286825433373451,
"learning_rate": 0.00018818656274709493,
"loss": 0.006,
"step": 119
},
{
"epoch": 0.39384615384615385,
"grad_norm": 0.013413852080702782,
"learning_rate": 0.0001879422863405995,
"loss": 0.0062,
"step": 120
},
{
"epoch": 0.3971282051282051,
"grad_norm": 0.012395060621201992,
"learning_rate": 0.00018769569132870366,
"loss": 0.0061,
"step": 121
},
{
"epoch": 0.4004102564102564,
"grad_norm": 0.012121280655264854,
"learning_rate": 0.0001874467850470471,
"loss": 0.006,
"step": 122
},
{
"epoch": 0.4036923076923077,
"grad_norm": 0.012034958228468895,
"learning_rate": 0.0001871955749000245,
"loss": 0.0059,
"step": 123
},
{
"epoch": 0.40697435897435896,
"grad_norm": 0.012840136885643005,
"learning_rate": 0.0001869420683605652,
"loss": 0.0059,
"step": 124
},
{
"epoch": 0.41025641025641024,
"grad_norm": 0.01383453793823719,
"learning_rate": 0.0001866862729699111,
"loss": 0.006,
"step": 125
},
{
"epoch": 0.41025641025641024,
"eval_loss": 0.008009692654013634,
"eval_runtime": 1.7144,
"eval_samples_per_second": 29.165,
"eval_steps_per_second": 29.165,
"step": 125
},
{
"epoch": 0.4135384615384615,
"grad_norm": 0.015620230697095394,
"learning_rate": 0.0001864281963373921,
"loss": 0.006,
"step": 126
},
{
"epoch": 0.4168205128205128,
"grad_norm": 0.01630019024014473,
"learning_rate": 0.00018616784614019995,
"loss": 0.0061,
"step": 127
},
{
"epoch": 0.4201025641025641,
"grad_norm": 0.016993194818496704,
"learning_rate": 0.00018590523012315972,
"loss": 0.006,
"step": 128
},
{
"epoch": 0.42338461538461536,
"grad_norm": 0.016546938568353653,
"learning_rate": 0.00018564035609849945,
"loss": 0.0058,
"step": 129
},
{
"epoch": 0.4266666666666667,
"grad_norm": 0.018067970871925354,
"learning_rate": 0.0001853732319456177,
"loss": 0.0059,
"step": 130
},
{
"epoch": 0.429948717948718,
"grad_norm": 0.017609402537345886,
"learning_rate": 0.0001851038656108494,
"loss": 0.0059,
"step": 131
},
{
"epoch": 0.43323076923076925,
"grad_norm": 0.015436948277056217,
"learning_rate": 0.0001848322651072291,
"loss": 0.0058,
"step": 132
},
{
"epoch": 0.43651282051282053,
"grad_norm": 0.014602603390812874,
"learning_rate": 0.00018455843851425283,
"loss": 0.0059,
"step": 133
},
{
"epoch": 0.4397948717948718,
"grad_norm": 0.0129224993288517,
"learning_rate": 0.00018428239397763775,
"loss": 0.0056,
"step": 134
},
{
"epoch": 0.4430769230769231,
"grad_norm": 0.011858578771352768,
"learning_rate": 0.00018400413970907974,
"loss": 0.0057,
"step": 135
},
{
"epoch": 0.44635897435897437,
"grad_norm": 0.011236813850700855,
"learning_rate": 0.00018372368398600927,
"loss": 0.0056,
"step": 136
},
{
"epoch": 0.44964102564102565,
"grad_norm": 0.01114533469080925,
"learning_rate": 0.00018344103515134492,
"loss": 0.0058,
"step": 137
},
{
"epoch": 0.45292307692307693,
"grad_norm": 0.009413519874215126,
"learning_rate": 0.00018315620161324538,
"loss": 0.0056,
"step": 138
},
{
"epoch": 0.4562051282051282,
"grad_norm": 0.009282637387514114,
"learning_rate": 0.0001828691918448594,
"loss": 0.0056,
"step": 139
},
{
"epoch": 0.4594871794871795,
"grad_norm": 0.008740304969251156,
"learning_rate": 0.00018258001438407344,
"loss": 0.0057,
"step": 140
},
{
"epoch": 0.46276923076923077,
"grad_norm": 0.008844515308737755,
"learning_rate": 0.00018228867783325804,
"loss": 0.0056,
"step": 141
},
{
"epoch": 0.46605128205128205,
"grad_norm": 0.008487056940793991,
"learning_rate": 0.00018199519085901165,
"loss": 0.0055,
"step": 142
},
{
"epoch": 0.4693333333333333,
"grad_norm": 0.008857525885105133,
"learning_rate": 0.000181699562191903,
"loss": 0.0057,
"step": 143
},
{
"epoch": 0.4726153846153846,
"grad_norm": 0.008444939740002155,
"learning_rate": 0.00018140180062621117,
"loss": 0.0056,
"step": 144
},
{
"epoch": 0.4758974358974359,
"grad_norm": 0.008454709313809872,
"learning_rate": 0.00018110191501966423,
"loss": 0.0056,
"step": 145
},
{
"epoch": 0.47917948717948716,
"grad_norm": 0.008842523209750652,
"learning_rate": 0.00018079991429317553,
"loss": 0.0056,
"step": 146
},
{
"epoch": 0.48246153846153844,
"grad_norm": 0.008386999368667603,
"learning_rate": 0.00018049580743057853,
"loss": 0.0055,
"step": 147
},
{
"epoch": 0.4857435897435897,
"grad_norm": 0.008592522703111172,
"learning_rate": 0.00018018960347835936,
"loss": 0.0055,
"step": 148
},
{
"epoch": 0.489025641025641,
"grad_norm": 0.008794533088803291,
"learning_rate": 0.00017988131154538783,
"loss": 0.0058,
"step": 149
},
{
"epoch": 0.49230769230769234,
"grad_norm": 0.008663009852170944,
"learning_rate": 0.00017957094080264634,
"loss": 0.0055,
"step": 150
},
{
"epoch": 0.49230769230769234,
"eval_loss": 0.007762688212096691,
"eval_runtime": 1.6988,
"eval_samples_per_second": 29.433,
"eval_steps_per_second": 29.433,
"step": 150
},
{
"epoch": 0.4955897435897436,
"grad_norm": 0.0282682366669178,
"learning_rate": 0.00017925850048295725,
"loss": 0.0074,
"step": 151
},
{
"epoch": 0.4988717948717949,
"grad_norm": 0.027021881192922592,
"learning_rate": 0.00017894399988070803,
"loss": 0.0072,
"step": 152
},
{
"epoch": 0.5021538461538462,
"grad_norm": 0.028071371838450432,
"learning_rate": 0.00017862744835157494,
"loss": 0.0074,
"step": 153
},
{
"epoch": 0.5054358974358975,
"grad_norm": 0.026930296793580055,
"learning_rate": 0.00017830885531224457,
"loss": 0.0072,
"step": 154
},
{
"epoch": 0.5087179487179487,
"grad_norm": 0.022877950221300125,
"learning_rate": 0.00017798823024013383,
"loss": 0.0069,
"step": 155
},
{
"epoch": 0.512,
"grad_norm": 0.02152320370078087,
"learning_rate": 0.00017766558267310798,
"loss": 0.0068,
"step": 156
},
{
"epoch": 0.5152820512820513,
"grad_norm": 0.01885562390089035,
"learning_rate": 0.00017734092220919682,
"loss": 0.0065,
"step": 157
},
{
"epoch": 0.5185641025641026,
"grad_norm": 0.017150552943348885,
"learning_rate": 0.00017701425850630937,
"loss": 0.0062,
"step": 158
},
{
"epoch": 0.5218461538461538,
"grad_norm": 0.01740305684506893,
"learning_rate": 0.00017668560128194635,
"loss": 0.0061,
"step": 159
},
{
"epoch": 0.5251282051282051,
"grad_norm": 0.0199392419308424,
"learning_rate": 0.00017635496031291115,
"loss": 0.0061,
"step": 160
},
{
"epoch": 0.5284102564102564,
"grad_norm": 0.014733769930899143,
"learning_rate": 0.00017602234543501928,
"loss": 0.0059,
"step": 161
},
{
"epoch": 0.5316923076923077,
"grad_norm": 0.010931742377579212,
"learning_rate": 0.0001756877665428052,
"loss": 0.0055,
"step": 162
},
{
"epoch": 0.534974358974359,
"grad_norm": 0.009115135297179222,
"learning_rate": 0.00017535123358922866,
"loss": 0.0054,
"step": 163
},
{
"epoch": 0.5382564102564102,
"grad_norm": 0.008291061967611313,
"learning_rate": 0.000175012756585378,
"loss": 0.0054,
"step": 164
},
{
"epoch": 0.5415384615384615,
"grad_norm": 0.007542457897216082,
"learning_rate": 0.00017467234560017284,
"loss": 0.0053,
"step": 165
},
{
"epoch": 0.5448205128205128,
"grad_norm": 0.007635565474629402,
"learning_rate": 0.0001743300107600642,
"loss": 0.0053,
"step": 166
},
{
"epoch": 0.5481025641025641,
"grad_norm": 0.00773365143686533,
"learning_rate": 0.0001739857622487334,
"loss": 0.0053,
"step": 167
},
{
"epoch": 0.5513846153846154,
"grad_norm": 0.007943989709019661,
"learning_rate": 0.00017363961030678927,
"loss": 0.0052,
"step": 168
},
{
"epoch": 0.5546666666666666,
"grad_norm": 0.008328091353178024,
"learning_rate": 0.00017329156523146323,
"loss": 0.0054,
"step": 169
},
{
"epoch": 0.5579487179487179,
"grad_norm": 0.008655044250190258,
"learning_rate": 0.00017294163737630305,
"loss": 0.0052,
"step": 170
},
{
"epoch": 0.5612307692307692,
"grad_norm": 0.008740806020796299,
"learning_rate": 0.00017258983715086505,
"loss": 0.0054,
"step": 171
},
{
"epoch": 0.5645128205128205,
"grad_norm": 0.00872563011944294,
"learning_rate": 0.00017223617502040427,
"loss": 0.0053,
"step": 172
},
{
"epoch": 0.5677948717948718,
"grad_norm": 0.009066494181752205,
"learning_rate": 0.00017188066150556307,
"loss": 0.0054,
"step": 173
},
{
"epoch": 0.571076923076923,
"grad_norm": 0.008819897659122944,
"learning_rate": 0.0001715233071820584,
"loss": 0.0053,
"step": 174
},
{
"epoch": 0.5743589743589743,
"grad_norm": 0.009242737665772438,
"learning_rate": 0.00017116412268036708,
"loss": 0.0053,
"step": 175
},
{
"epoch": 0.5743589743589743,
"eval_loss": 0.008101023733615875,
"eval_runtime": 1.6961,
"eval_samples_per_second": 29.479,
"eval_steps_per_second": 29.479,
"step": 175
},
{
"epoch": 0.5776410256410256,
"grad_norm": 0.009131861850619316,
"learning_rate": 0.00017080311868540943,
"loss": 0.0052,
"step": 176
},
{
"epoch": 0.5809230769230769,
"grad_norm": 0.008674906566739082,
"learning_rate": 0.00017044030593623167,
"loss": 0.0052,
"step": 177
},
{
"epoch": 0.5842051282051282,
"grad_norm": 0.008941737934947014,
"learning_rate": 0.00017007569522568627,
"loss": 0.0053,
"step": 178
},
{
"epoch": 0.5874871794871794,
"grad_norm": 0.008659431710839272,
"learning_rate": 0.00016970929740011103,
"loss": 0.0053,
"step": 179
},
{
"epoch": 0.5907692307692308,
"grad_norm": 0.008444879204034805,
"learning_rate": 0.00016934112335900621,
"loss": 0.0052,
"step": 180
},
{
"epoch": 0.5940512820512821,
"grad_norm": 0.008231930434703827,
"learning_rate": 0.0001689711840547106,
"loss": 0.0052,
"step": 181
},
{
"epoch": 0.5973333333333334,
"grad_norm": 0.0077408417128026485,
"learning_rate": 0.0001685994904920754,
"loss": 0.0053,
"step": 182
},
{
"epoch": 0.6006153846153847,
"grad_norm": 0.007263463456183672,
"learning_rate": 0.00016822605372813717,
"loss": 0.0051,
"step": 183
},
{
"epoch": 0.6038974358974359,
"grad_norm": 0.0077566043473780155,
"learning_rate": 0.00016785088487178854,
"loss": 0.0051,
"step": 184
},
{
"epoch": 0.6071794871794872,
"grad_norm": 0.0069167339242994785,
"learning_rate": 0.00016747399508344808,
"loss": 0.0052,
"step": 185
},
{
"epoch": 0.6104615384615385,
"grad_norm": 0.006657553371042013,
"learning_rate": 0.0001670953955747281,
"loss": 0.0051,
"step": 186
},
{
"epoch": 0.6137435897435898,
"grad_norm": 0.00632864935323596,
"learning_rate": 0.0001667150976081012,
"loss": 0.0051,
"step": 187
},
{
"epoch": 0.6170256410256411,
"grad_norm": 0.006999644450843334,
"learning_rate": 0.00016633311249656535,
"loss": 0.0053,
"step": 188
},
{
"epoch": 0.6203076923076923,
"grad_norm": 0.006461160257458687,
"learning_rate": 0.000165949451603307,
"loss": 0.0052,
"step": 189
},
{
"epoch": 0.6235897435897436,
"grad_norm": 0.00639855582267046,
"learning_rate": 0.00016556412634136347,
"loss": 0.0052,
"step": 190
},
{
"epoch": 0.6268717948717949,
"grad_norm": 0.006086940411478281,
"learning_rate": 0.0001651771481732832,
"loss": 0.0051,
"step": 191
},
{
"epoch": 0.6301538461538462,
"grad_norm": 0.006296528037637472,
"learning_rate": 0.00016478852861078486,
"loss": 0.0052,
"step": 192
},
{
"epoch": 0.6334358974358975,
"grad_norm": 0.006205971818417311,
"learning_rate": 0.0001643982792144148,
"loss": 0.0052,
"step": 193
},
{
"epoch": 0.6367179487179487,
"grad_norm": 0.006357920356094837,
"learning_rate": 0.0001640064115932033,
"loss": 0.0052,
"step": 194
},
{
"epoch": 0.64,
"grad_norm": 0.006061731372028589,
"learning_rate": 0.00016361293740431904,
"loss": 0.0052,
"step": 195
},
{
"epoch": 0.6432820512820513,
"grad_norm": 0.005866146180778742,
"learning_rate": 0.00016321786835272244,
"loss": 0.0051,
"step": 196
},
{
"epoch": 0.6465641025641026,
"grad_norm": 0.005625640973448753,
"learning_rate": 0.00016282121619081753,
"loss": 0.0051,
"step": 197
},
{
"epoch": 0.6498461538461539,
"grad_norm": 0.005495929159224033,
"learning_rate": 0.0001624229927181022,
"loss": 0.0051,
"step": 198
},
{
"epoch": 0.6531282051282051,
"grad_norm": 0.005448007490485907,
"learning_rate": 0.0001620232097808173,
"loss": 0.0051,
"step": 199
},
{
"epoch": 0.6564102564102564,
"grad_norm": 0.005435483064502478,
"learning_rate": 0.00016162187927159415,
"loss": 0.0051,
"step": 200
},
{
"epoch": 0.6564102564102564,
"eval_loss": 0.008042293600738049,
"eval_runtime": 1.0817,
"eval_samples_per_second": 46.222,
"eval_steps_per_second": 46.222,
"step": 200
},
{
"epoch": 0.6596923076923077,
"grad_norm": 0.015730086714029312,
"learning_rate": 0.00016121901312910085,
"loss": 0.006,
"step": 201
},
{
"epoch": 0.662974358974359,
"grad_norm": 0.01854288950562477,
"learning_rate": 0.00016081462333768703,
"loss": 0.0063,
"step": 202
},
{
"epoch": 0.6662564102564102,
"grad_norm": 0.016778547316789627,
"learning_rate": 0.0001604087219270275,
"loss": 0.0062,
"step": 203
},
{
"epoch": 0.6695384615384615,
"grad_norm": 0.014833272434771061,
"learning_rate": 0.00016000132097176422,
"loss": 0.0059,
"step": 204
},
{
"epoch": 0.6728205128205128,
"grad_norm": 0.016074592247605324,
"learning_rate": 0.0001595924325911472,
"loss": 0.0061,
"step": 205
},
{
"epoch": 0.6761025641025641,
"grad_norm": 0.01580972597002983,
"learning_rate": 0.0001591820689486739,
"loss": 0.0059,
"step": 206
},
{
"epoch": 0.6793846153846154,
"grad_norm": 0.01449266355484724,
"learning_rate": 0.00015877024225172766,
"loss": 0.0058,
"step": 207
},
{
"epoch": 0.6826666666666666,
"grad_norm": 0.012255052104592323,
"learning_rate": 0.00015835696475121418,
"loss": 0.0056,
"step": 208
},
{
"epoch": 0.6859487179487179,
"grad_norm": 0.01258911658078432,
"learning_rate": 0.0001579422487411972,
"loss": 0.0055,
"step": 209
},
{
"epoch": 0.6892307692307692,
"grad_norm": 0.014708973467350006,
"learning_rate": 0.00015752610655853314,
"loss": 0.0056,
"step": 210
},
{
"epoch": 0.6925128205128205,
"grad_norm": 0.012465777806937695,
"learning_rate": 0.00015710855058250346,
"loss": 0.0054,
"step": 211
},
{
"epoch": 0.6957948717948718,
"grad_norm": 0.008663519285619259,
"learning_rate": 0.00015668959323444695,
"loss": 0.0051,
"step": 212
},
{
"epoch": 0.699076923076923,
"grad_norm": 0.006837547291070223,
"learning_rate": 0.00015626924697738993,
"loss": 0.0051,
"step": 213
},
{
"epoch": 0.7023589743589743,
"grad_norm": 0.0062984684482216835,
"learning_rate": 0.00015584752431567578,
"loss": 0.0049,
"step": 214
},
{
"epoch": 0.7056410256410256,
"grad_norm": 0.005604996811598539,
"learning_rate": 0.00015542443779459247,
"loss": 0.005,
"step": 215
},
{
"epoch": 0.7089230769230769,
"grad_norm": 0.005338320974260569,
"learning_rate": 0.000155,
"loss": 0.005,
"step": 216
},
{
"epoch": 0.7122051282051282,
"grad_norm": 0.004744368139654398,
"learning_rate": 0.00015457422355795545,
"loss": 0.0049,
"step": 217
},
{
"epoch": 0.7154871794871794,
"grad_norm": 0.004858638159930706,
"learning_rate": 0.0001541471211343377,
"loss": 0.0049,
"step": 218
},
{
"epoch": 0.7187692307692307,
"grad_norm": 0.0049782246351242065,
"learning_rate": 0.0001537187054344706,
"loss": 0.005,
"step": 219
},
{
"epoch": 0.7220512820512821,
"grad_norm": 0.005176792852580547,
"learning_rate": 0.0001532889892027449,
"loss": 0.005,
"step": 220
},
{
"epoch": 0.7253333333333334,
"grad_norm": 0.00572836771607399,
"learning_rate": 0.00015285798522223922,
"loss": 0.0052,
"step": 221
},
{
"epoch": 0.7286153846153847,
"grad_norm": 0.005091918632388115,
"learning_rate": 0.0001524257063143398,
"loss": 0.0049,
"step": 222
},
{
"epoch": 0.7318974358974359,
"grad_norm": 0.005498278420418501,
"learning_rate": 0.00015199216533835904,
"loss": 0.0049,
"step": 223
},
{
"epoch": 0.7351794871794872,
"grad_norm": 0.0057638660073280334,
"learning_rate": 0.00015155737519115307,
"loss": 0.005,
"step": 224
},
{
"epoch": 0.7384615384615385,
"grad_norm": 0.005555428098887205,
"learning_rate": 0.00015112134880673788,
"loss": 0.005,
"step": 225
},
{
"epoch": 0.7384615384615385,
"eval_loss": 0.00847731251269579,
"eval_runtime": 1.7103,
"eval_samples_per_second": 29.235,
"eval_steps_per_second": 29.235,
"step": 225
},
{
"epoch": 0.7417435897435898,
"grad_norm": 0.005984974093735218,
"learning_rate": 0.0001506840991559048,
"loss": 0.005,
"step": 226
},
{
"epoch": 0.7450256410256411,
"grad_norm": 0.006116104777902365,
"learning_rate": 0.0001502456392458345,
"loss": 0.005,
"step": 227
},
{
"epoch": 0.7483076923076923,
"grad_norm": 0.0061400169506669044,
"learning_rate": 0.00014980598211971014,
"loss": 0.005,
"step": 228
},
{
"epoch": 0.7515897435897436,
"grad_norm": 0.010314466431736946,
"learning_rate": 0.0001493651408563293,
"loss": 0.0051,
"step": 229
},
{
"epoch": 0.7548717948717949,
"grad_norm": 0.005545389838516712,
"learning_rate": 0.00014892312856971496,
"loss": 0.005,
"step": 230
},
{
"epoch": 0.7581538461538462,
"grad_norm": 0.005526201333850622,
"learning_rate": 0.0001484799584087254,
"loss": 0.005,
"step": 231
},
{
"epoch": 0.7614358974358975,
"grad_norm": 0.0059164236299693584,
"learning_rate": 0.00014803564355666296,
"loss": 0.0051,
"step": 232
},
{
"epoch": 0.7647179487179487,
"grad_norm": 0.005439561791718006,
"learning_rate": 0.00014759019723088198,
"loss": 0.0048,
"step": 233
},
{
"epoch": 0.768,
"grad_norm": 0.0054712677374482155,
"learning_rate": 0.00014714363268239554,
"loss": 0.005,
"step": 234
},
{
"epoch": 0.7712820512820513,
"grad_norm": 0.031185509636998177,
"learning_rate": 0.00014669596319548132,
"loss": 0.0053,
"step": 235
},
{
"epoch": 0.7745641025641026,
"grad_norm": 0.005096518434584141,
"learning_rate": 0.00014624720208728637,
"loss": 0.0049,
"step": 236
},
{
"epoch": 0.7778461538461539,
"grad_norm": 0.004755071364343166,
"learning_rate": 0.000145797362707431,
"loss": 0.0049,
"step": 237
},
{
"epoch": 0.7811282051282051,
"grad_norm": 0.004966154228895903,
"learning_rate": 0.00014534645843761168,
"loss": 0.005,
"step": 238
},
{
"epoch": 0.7844102564102564,
"grad_norm": 0.0049712988547980785,
"learning_rate": 0.00014489450269120286,
"loss": 0.005,
"step": 239
},
{
"epoch": 0.7876923076923077,
"grad_norm": 0.004542510025203228,
"learning_rate": 0.00014444150891285807,
"loss": 0.0049,
"step": 240
},
{
"epoch": 0.790974358974359,
"grad_norm": 0.004633238539099693,
"learning_rate": 0.00014398749057810997,
"loss": 0.0049,
"step": 241
},
{
"epoch": 0.7942564102564103,
"grad_norm": 0.004606141243129969,
"learning_rate": 0.0001435324611929693,
"loss": 0.005,
"step": 242
},
{
"epoch": 0.7975384615384615,
"grad_norm": 0.004728773143142462,
"learning_rate": 0.00014307643429352333,
"loss": 0.005,
"step": 243
},
{
"epoch": 0.8008205128205128,
"grad_norm": 0.005116627085953951,
"learning_rate": 0.00014261942344553314,
"loss": 0.005,
"step": 244
},
{
"epoch": 0.8041025641025641,
"grad_norm": 0.006209938321262598,
"learning_rate": 0.00014216144224403002,
"loss": 0.0052,
"step": 245
},
{
"epoch": 0.8073846153846154,
"grad_norm": 0.004638911224901676,
"learning_rate": 0.00014170250431291105,
"loss": 0.0049,
"step": 246
},
{
"epoch": 0.8106666666666666,
"grad_norm": 0.0047111185267567635,
"learning_rate": 0.00014124262330453375,
"loss": 0.0048,
"step": 247
},
{
"epoch": 0.8139487179487179,
"grad_norm": 0.004355636890977621,
"learning_rate": 0.0001407818128993102,
"loss": 0.005,
"step": 248
},
{
"epoch": 0.8172307692307692,
"grad_norm": 0.005983169190585613,
"learning_rate": 0.0001403200868052998,
"loss": 0.0052,
"step": 249
},
{
"epoch": 0.8205128205128205,
"grad_norm": 0.8480205535888672,
"learning_rate": 0.00013985745875780173,
"loss": 0.0246,
"step": 250
},
{
"epoch": 0.8205128205128205,
"eval_loss": 0.00733967823907733,
"eval_runtime": 1.7185,
"eval_samples_per_second": 29.095,
"eval_steps_per_second": 29.095,
"step": 250
},
{
"epoch": 0.8237948717948718,
"grad_norm": 0.01247889269143343,
"learning_rate": 0.00013939394251894603,
"loss": 0.0056,
"step": 251
},
{
"epoch": 0.827076923076923,
"grad_norm": 0.012979921884834766,
"learning_rate": 0.00013892955187728455,
"loss": 0.0058,
"step": 252
},
{
"epoch": 0.8303589743589743,
"grad_norm": 0.01275433786213398,
"learning_rate": 0.00013846430064738064,
"loss": 0.0058,
"step": 253
},
{
"epoch": 0.8336410256410256,
"grad_norm": 0.013448723591864109,
"learning_rate": 0.00013799820266939818,
"loss": 0.0058,
"step": 254
},
{
"epoch": 0.8369230769230769,
"grad_norm": 0.013343177735805511,
"learning_rate": 0.00013753127180868982,
"loss": 0.0059,
"step": 255
},
{
"epoch": 0.8402051282051282,
"grad_norm": 0.013337034732103348,
"learning_rate": 0.00013706352195538458,
"loss": 0.0057,
"step": 256
},
{
"epoch": 0.8434871794871794,
"grad_norm": 0.014081962406635284,
"learning_rate": 0.0001365949670239747,
"loss": 0.0059,
"step": 257
},
{
"epoch": 0.8467692307692307,
"grad_norm": 0.01269819401204586,
"learning_rate": 0.0001361256209529016,
"loss": 0.0056,
"step": 258
},
{
"epoch": 0.850051282051282,
"grad_norm": 0.012389056384563446,
"learning_rate": 0.0001356554977041414,
"loss": 0.0055,
"step": 259
},
{
"epoch": 0.8533333333333334,
"grad_norm": 0.013030463829636574,
"learning_rate": 0.00013518461126278933,
"loss": 0.0055,
"step": 260
},
{
"epoch": 0.8566153846153847,
"grad_norm": 0.013525651767849922,
"learning_rate": 0.00013471297563664392,
"loss": 0.0053,
"step": 261
},
{
"epoch": 0.859897435897436,
"grad_norm": 0.012331862933933735,
"learning_rate": 0.0001342406048557904,
"loss": 0.0053,
"step": 262
},
{
"epoch": 0.8631794871794872,
"grad_norm": 0.009830540977418423,
"learning_rate": 0.00013376751297218287,
"loss": 0.0051,
"step": 263
},
{
"epoch": 0.8664615384615385,
"grad_norm": 0.008250358514487743,
"learning_rate": 0.00013329371405922688,
"loss": 0.005,
"step": 264
},
{
"epoch": 0.8697435897435898,
"grad_norm": 0.0070004030130803585,
"learning_rate": 0.00013281922221136037,
"loss": 0.0049,
"step": 265
},
{
"epoch": 0.8730256410256411,
"grad_norm": 0.006113375071436167,
"learning_rate": 0.00013234405154363446,
"loss": 0.005,
"step": 266
},
{
"epoch": 0.8763076923076923,
"grad_norm": 0.005196304526180029,
"learning_rate": 0.00013186821619129378,
"loss": 0.0048,
"step": 267
},
{
"epoch": 0.8795897435897436,
"grad_norm": 0.004914364777505398,
"learning_rate": 0.0001313917303093556,
"loss": 0.0049,
"step": 268
},
{
"epoch": 0.8828717948717949,
"grad_norm": 0.004722020123153925,
"learning_rate": 0.00013091460807218913,
"loss": 0.0049,
"step": 269
},
{
"epoch": 0.8861538461538462,
"grad_norm": 0.004352330230176449,
"learning_rate": 0.0001304368636730936,
"loss": 0.0049,
"step": 270
},
{
"epoch": 0.8894358974358975,
"grad_norm": 0.004314970225095749,
"learning_rate": 0.00012995851132387623,
"loss": 0.0049,
"step": 271
},
{
"epoch": 0.8927179487179487,
"grad_norm": 0.004017701838165522,
"learning_rate": 0.00012947956525442925,
"loss": 0.0048,
"step": 272
},
{
"epoch": 0.896,
"grad_norm": 0.004261241294443607,
"learning_rate": 0.00012900003971230684,
"loss": 0.0048,
"step": 273
},
{
"epoch": 0.8992820512820513,
"grad_norm": 0.004208185710012913,
"learning_rate": 0.00012851994896230116,
"loss": 0.0048,
"step": 274
},
{
"epoch": 0.9025641025641026,
"grad_norm": 0.004194408655166626,
"learning_rate": 0.00012803930728601785,
"loss": 0.0048,
"step": 275
},
{
"epoch": 0.9025641025641026,
"eval_loss": 0.005147330928593874,
"eval_runtime": 1.6972,
"eval_samples_per_second": 29.46,
"eval_steps_per_second": 29.46,
"step": 275
},
{
"epoch": 0.9058461538461539,
"grad_norm": 0.00444167573004961,
"learning_rate": 0.00012755812898145155,
"loss": 0.0047,
"step": 276
},
{
"epoch": 0.9091282051282051,
"grad_norm": 0.004847261123359203,
"learning_rate": 0.0001270764283625603,
"loss": 0.005,
"step": 277
},
{
"epoch": 0.9124102564102564,
"grad_norm": 0.004441663157194853,
"learning_rate": 0.0001265942197588397,
"loss": 0.0047,
"step": 278
},
{
"epoch": 0.9156923076923077,
"grad_norm": 0.0051227472722530365,
"learning_rate": 0.00012611151751489697,
"loss": 0.0049,
"step": 279
},
{
"epoch": 0.918974358974359,
"grad_norm": 0.004962059669196606,
"learning_rate": 0.00012562833599002375,
"loss": 0.0049,
"step": 280
},
{
"epoch": 0.9222564102564103,
"grad_norm": 0.004343735985457897,
"learning_rate": 0.00012514468955776936,
"loss": 0.0048,
"step": 281
},
{
"epoch": 0.9255384615384615,
"grad_norm": 0.004809789825230837,
"learning_rate": 0.000124660592605513,
"loss": 0.0049,
"step": 282
},
{
"epoch": 0.9288205128205128,
"grad_norm": 0.022584544494748116,
"learning_rate": 0.0001241760595340358,
"loss": 0.005,
"step": 283
},
{
"epoch": 0.9321025641025641,
"grad_norm": 0.004884497728198767,
"learning_rate": 0.0001236911047570925,
"loss": 0.0048,
"step": 284
},
{
"epoch": 0.9353846153846154,
"grad_norm": 0.0044205994345247746,
"learning_rate": 0.00012320574270098254,
"loss": 0.0049,
"step": 285
},
{
"epoch": 0.9386666666666666,
"grad_norm": 0.0042507946491241455,
"learning_rate": 0.0001227199878041211,
"loss": 0.0048,
"step": 286
},
{
"epoch": 0.9419487179487179,
"grad_norm": 0.003997748717665672,
"learning_rate": 0.0001222338545166093,
"loss": 0.0047,
"step": 287
},
{
"epoch": 0.9452307692307692,
"grad_norm": 0.005669133272022009,
"learning_rate": 0.00012174735729980466,
"loss": 0.0049,
"step": 288
},
{
"epoch": 0.9485128205128205,
"grad_norm": 0.004083422012627125,
"learning_rate": 0.00012126051062589075,
"loss": 0.0048,
"step": 289
},
{
"epoch": 0.9517948717948718,
"grad_norm": 0.00390887726098299,
"learning_rate": 0.00012077332897744662,
"loss": 0.0048,
"step": 290
},
{
"epoch": 0.955076923076923,
"grad_norm": 0.003818151541054249,
"learning_rate": 0.0001202858268470162,
"loss": 0.0048,
"step": 291
},
{
"epoch": 0.9583589743589743,
"grad_norm": 0.004068729933351278,
"learning_rate": 0.00011979801873667682,
"loss": 0.0049,
"step": 292
},
{
"epoch": 0.9616410256410256,
"grad_norm": 0.003813754068687558,
"learning_rate": 0.00011930991915760819,
"loss": 0.0049,
"step": 293
},
{
"epoch": 0.9649230769230769,
"grad_norm": 0.004149145446717739,
"learning_rate": 0.0001188215426296605,
"loss": 0.0049,
"step": 294
},
{
"epoch": 0.9682051282051282,
"grad_norm": 0.0038277863059192896,
"learning_rate": 0.00011833290368092243,
"loss": 0.0049,
"step": 295
},
{
"epoch": 0.9714871794871794,
"grad_norm": 0.005801186431199312,
"learning_rate": 0.00011784401684728925,
"loss": 0.0048,
"step": 296
},
{
"epoch": 0.9747692307692307,
"grad_norm": 0.00392820592969656,
"learning_rate": 0.00011735489667203014,
"loss": 0.005,
"step": 297
},
{
"epoch": 0.978051282051282,
"grad_norm": 0.0036806361749768257,
"learning_rate": 0.00011686555770535575,
"loss": 0.0048,
"step": 298
},
{
"epoch": 0.9813333333333333,
"grad_norm": 0.0037270234897732735,
"learning_rate": 0.00011637601450398507,
"loss": 0.0048,
"step": 299
},
{
"epoch": 0.9846153846153847,
"grad_norm": 0.032717231661081314,
"learning_rate": 0.00011588628163071289,
"loss": 0.0054,
"step": 300
},
{
"epoch": 0.9846153846153847,
"eval_loss": 0.005068204831331968,
"eval_runtime": 1.6978,
"eval_samples_per_second": 29.45,
"eval_steps_per_second": 29.45,
"step": 300
},
{
"epoch": 0.987897435897436,
"grad_norm": 0.009295407682657242,
"learning_rate": 0.0001153963736539761,
"loss": 0.0053,
"step": 301
},
{
"epoch": 0.9911794871794872,
"grad_norm": 0.006343138869851828,
"learning_rate": 0.00011490630514742058,
"loss": 0.0049,
"step": 302
},
{
"epoch": 0.9944615384615385,
"grad_norm": 0.004029371310025454,
"learning_rate": 0.00011441609068946764,
"loss": 0.0047,
"step": 303
},
{
"epoch": 0.9977435897435898,
"grad_norm": 0.004377037286758423,
"learning_rate": 0.00011392574486288026,
"loss": 0.0048,
"step": 304
},
{
"epoch": 1.001025641025641,
"grad_norm": 0.004222679417580366,
"learning_rate": 0.00011343528225432935,
"loss": 0.0065,
"step": 305
},
{
"epoch": 1.0043076923076923,
"grad_norm": 0.011929094791412354,
"learning_rate": 0.00011294471745395987,
"loss": 0.0057,
"step": 306
},
{
"epoch": 1.0075897435897436,
"grad_norm": 0.011988217942416668,
"learning_rate": 0.00011245406505495668,
"loss": 0.0055,
"step": 307
},
{
"epoch": 1.010871794871795,
"grad_norm": 0.011446448042988777,
"learning_rate": 0.00011196333965311053,
"loss": 0.0054,
"step": 308
},
{
"epoch": 1.0141538461538462,
"grad_norm": 0.010524057783186436,
"learning_rate": 0.00011147255584638383,
"loss": 0.0054,
"step": 309
},
{
"epoch": 1.0174358974358975,
"grad_norm": 0.01129401195794344,
"learning_rate": 0.00011098172823447641,
"loss": 0.0056,
"step": 310
},
{
"epoch": 1.0207179487179487,
"grad_norm": 0.010420313104987144,
"learning_rate": 0.00011049087141839126,
"loss": 0.0054,
"step": 311
},
{
"epoch": 1.024,
"grad_norm": 0.009146089665591717,
"learning_rate": 0.00011000000000000002,
"loss": 0.0052,
"step": 312
},
{
"epoch": 1.0272820512820513,
"grad_norm": 0.008935105055570602,
"learning_rate": 0.00010950912858160875,
"loss": 0.0052,
"step": 313
},
{
"epoch": 1.0305641025641026,
"grad_norm": 0.009841774590313435,
"learning_rate": 0.0001090182717655236,
"loss": 0.0051,
"step": 314
},
{
"epoch": 1.0338461538461539,
"grad_norm": 0.010123356245458126,
"learning_rate": 0.0001085274441536162,
"loss": 0.005,
"step": 315
},
{
"epoch": 1.0371282051282051,
"grad_norm": 0.00827121827751398,
"learning_rate": 0.00010803666034688951,
"loss": 0.0051,
"step": 316
},
{
"epoch": 1.0404102564102564,
"grad_norm": 0.0060403901152312756,
"learning_rate": 0.00010754593494504334,
"loss": 0.0048,
"step": 317
},
{
"epoch": 1.0436923076923077,
"grad_norm": 0.005550421308726072,
"learning_rate": 0.00010705528254604016,
"loss": 0.0047,
"step": 318
},
{
"epoch": 1.046974358974359,
"grad_norm": 0.005134978331625462,
"learning_rate": 0.00010656471774567066,
"loss": 0.0048,
"step": 319
},
{
"epoch": 1.0502564102564103,
"grad_norm": 0.004197476897388697,
"learning_rate": 0.00010607425513711977,
"loss": 0.0048,
"step": 320
},
{
"epoch": 1.0535384615384615,
"grad_norm": 0.0037515375297516584,
"learning_rate": 0.0001055839093105324,
"loss": 0.0047,
"step": 321
},
{
"epoch": 1.0568205128205128,
"grad_norm": 0.0034930245019495487,
"learning_rate": 0.00010509369485257942,
"loss": 0.0048,
"step": 322
},
{
"epoch": 1.060102564102564,
"grad_norm": 0.003294882597401738,
"learning_rate": 0.00010460362634602392,
"loss": 0.0047,
"step": 323
},
{
"epoch": 1.0633846153846154,
"grad_norm": 0.0033549105282872915,
"learning_rate": 0.00010411371836928712,
"loss": 0.0047,
"step": 324
},
{
"epoch": 1.0666666666666667,
"grad_norm": 0.0033656777814030647,
"learning_rate": 0.00010362398549601493,
"loss": 0.0049,
"step": 325
},
{
"epoch": 1.0666666666666667,
"eval_loss": 0.005108626559376717,
"eval_runtime": 1.7164,
"eval_samples_per_second": 29.131,
"eval_steps_per_second": 29.131,
"step": 325
},
{
"epoch": 1.069948717948718,
"grad_norm": 0.0033352887257933617,
"learning_rate": 0.00010313444229464429,
"loss": 0.0048,
"step": 326
},
{
"epoch": 1.0732307692307692,
"grad_norm": 0.0032802869100123644,
"learning_rate": 0.00010264510332796991,
"loss": 0.0048,
"step": 327
},
{
"epoch": 1.0765128205128205,
"grad_norm": 0.003124071517959237,
"learning_rate": 0.00010215598315271076,
"loss": 0.0048,
"step": 328
},
{
"epoch": 1.0797948717948718,
"grad_norm": 0.0035496705677360296,
"learning_rate": 0.00010166709631907762,
"loss": 0.0047,
"step": 329
},
{
"epoch": 1.083076923076923,
"grad_norm": 0.0036747653502970934,
"learning_rate": 0.00010117845737033956,
"loss": 0.0048,
"step": 330
},
{
"epoch": 1.0863589743589743,
"grad_norm": 0.0033675089944154024,
"learning_rate": 0.00010069008084239182,
"loss": 0.0048,
"step": 331
},
{
"epoch": 1.0896410256410256,
"grad_norm": 0.0036912565119564533,
"learning_rate": 0.00010020198126332321,
"loss": 0.0048,
"step": 332
},
{
"epoch": 1.0929230769230769,
"grad_norm": 0.003969641402363777,
"learning_rate": 9.971417315298381e-05,
"loss": 0.0048,
"step": 333
},
{
"epoch": 1.0962051282051282,
"grad_norm": 0.003843925893306732,
"learning_rate": 9.92266710225534e-05,
"loss": 0.0049,
"step": 334
},
{
"epoch": 1.0994871794871794,
"grad_norm": 0.004053202457726002,
"learning_rate": 9.873948937410929e-05,
"loss": 0.0049,
"step": 335
},
{
"epoch": 1.1027692307692307,
"grad_norm": 0.00362073234282434,
"learning_rate": 9.825264270019538e-05,
"loss": 0.0048,
"step": 336
},
{
"epoch": 1.106051282051282,
"grad_norm": 0.004117041826248169,
"learning_rate": 9.776614548339074e-05,
"loss": 0.0049,
"step": 337
},
{
"epoch": 1.1093333333333333,
"grad_norm": 0.003928050398826599,
"learning_rate": 9.728001219587897e-05,
"loss": 0.0048,
"step": 338
},
{
"epoch": 1.1126153846153846,
"grad_norm": 0.004026252776384354,
"learning_rate": 9.679425729901746e-05,
"loss": 0.0049,
"step": 339
},
{
"epoch": 1.1158974358974358,
"grad_norm": 0.003736741142347455,
"learning_rate": 9.630889524290749e-05,
"loss": 0.0048,
"step": 340
},
{
"epoch": 1.1191794871794871,
"grad_norm": 0.0036812988109886646,
"learning_rate": 9.582394046596421e-05,
"loss": 0.0049,
"step": 341
},
{
"epoch": 1.1224615384615384,
"grad_norm": 0.0035057140048593283,
"learning_rate": 9.533940739448703e-05,
"loss": 0.0048,
"step": 342
},
{
"epoch": 1.1257435897435897,
"grad_norm": 0.0037068051751703024,
"learning_rate": 9.485531044223068e-05,
"loss": 0.0048,
"step": 343
},
{
"epoch": 1.129025641025641,
"grad_norm": 0.0039064399898052216,
"learning_rate": 9.437166400997628e-05,
"loss": 0.0047,
"step": 344
},
{
"epoch": 1.1323076923076922,
"grad_norm": 0.00368549139238894,
"learning_rate": 9.388848248510309e-05,
"loss": 0.0047,
"step": 345
},
{
"epoch": 1.1355897435897435,
"grad_norm": 0.0034059761092066765,
"learning_rate": 9.340578024116031e-05,
"loss": 0.0048,
"step": 346
},
{
"epoch": 1.1388717948717948,
"grad_norm": 0.0036056831013411283,
"learning_rate": 9.292357163743977e-05,
"loss": 0.0048,
"step": 347
},
{
"epoch": 1.142153846153846,
"grad_norm": 0.0035778083838522434,
"learning_rate": 9.244187101854847e-05,
"loss": 0.0047,
"step": 348
},
{
"epoch": 1.1454358974358974,
"grad_norm": 0.0036830275785177946,
"learning_rate": 9.196069271398216e-05,
"loss": 0.0048,
"step": 349
},
{
"epoch": 1.1487179487179486,
"grad_norm": 0.0038206533063203096,
"learning_rate": 9.148005103769887e-05,
"loss": 0.0049,
"step": 350
},
{
"epoch": 1.1487179487179486,
"eval_loss": 0.005071515217423439,
"eval_runtime": 1.7105,
"eval_samples_per_second": 29.231,
"eval_steps_per_second": 29.231,
"step": 350
},
{
"epoch": 1.152,
"grad_norm": 0.003768439870327711,
"learning_rate": 9.099996028769313e-05,
"loss": 0.0047,
"step": 351
},
{
"epoch": 1.1552820512820512,
"grad_norm": 0.0035322019830346107,
"learning_rate": 9.052043474557075e-05,
"loss": 0.0047,
"step": 352
},
{
"epoch": 1.1585641025641025,
"grad_norm": 0.003624577773734927,
"learning_rate": 9.004148867612379e-05,
"loss": 0.0048,
"step": 353
},
{
"epoch": 1.1618461538461538,
"grad_norm": 0.00348327262327075,
"learning_rate": 8.956313632690642e-05,
"loss": 0.0047,
"step": 354
},
{
"epoch": 1.1651282051282053,
"grad_norm": 0.0037093586288392544,
"learning_rate": 8.908539192781092e-05,
"loss": 0.005,
"step": 355
},
{
"epoch": 1.1684102564102563,
"grad_norm": 0.00967591442167759,
"learning_rate": 8.860826969064444e-05,
"loss": 0.0055,
"step": 356
},
{
"epoch": 1.1716923076923078,
"grad_norm": 0.009883382357656956,
"learning_rate": 8.813178380870625e-05,
"loss": 0.0053,
"step": 357
},
{
"epoch": 1.1749743589743589,
"grad_norm": 0.009646824561059475,
"learning_rate": 8.765594845636553e-05,
"loss": 0.0053,
"step": 358
},
{
"epoch": 1.1782564102564104,
"grad_norm": 0.009988558478653431,
"learning_rate": 8.718077778863966e-05,
"loss": 0.0054,
"step": 359
},
{
"epoch": 1.1815384615384614,
"grad_norm": 0.009603265672922134,
"learning_rate": 8.670628594077313e-05,
"loss": 0.0054,
"step": 360
},
{
"epoch": 1.184820512820513,
"grad_norm": 0.008899768814444542,
"learning_rate": 8.623248702781716e-05,
"loss": 0.0052,
"step": 361
},
{
"epoch": 1.188102564102564,
"grad_norm": 0.009325512684881687,
"learning_rate": 8.575939514420967e-05,
"loss": 0.0053,
"step": 362
},
{
"epoch": 1.1913846153846155,
"grad_norm": 0.008173607289791107,
"learning_rate": 8.528702436335611e-05,
"loss": 0.0052,
"step": 363
},
{
"epoch": 1.1946666666666665,
"grad_norm": 0.008188599720597267,
"learning_rate": 8.481538873721074e-05,
"loss": 0.005,
"step": 364
},
{
"epoch": 1.197948717948718,
"grad_norm": 0.008771805092692375,
"learning_rate": 8.434450229585867e-05,
"loss": 0.0051,
"step": 365
},
{
"epoch": 1.2012307692307693,
"grad_norm": 0.008419954217970371,
"learning_rate": 8.38743790470984e-05,
"loss": 0.0049,
"step": 366
},
{
"epoch": 1.2045128205128206,
"grad_norm": 0.007303427904844284,
"learning_rate": 8.340503297602529e-05,
"loss": 0.0048,
"step": 367
},
{
"epoch": 1.2077948717948719,
"grad_norm": 0.006724661216139793,
"learning_rate": 8.293647804461544e-05,
"loss": 0.0048,
"step": 368
},
{
"epoch": 1.2110769230769232,
"grad_norm": 0.005520752165466547,
"learning_rate": 8.24687281913102e-05,
"loss": 0.0047,
"step": 369
},
{
"epoch": 1.2143589743589744,
"grad_norm": 0.005496680270880461,
"learning_rate": 8.200179733060183e-05,
"loss": 0.0049,
"step": 370
},
{
"epoch": 1.2176410256410257,
"grad_norm": 0.004724535159766674,
"learning_rate": 8.153569935261935e-05,
"loss": 0.0048,
"step": 371
},
{
"epoch": 1.220923076923077,
"grad_norm": 0.004228896461427212,
"learning_rate": 8.10704481227155e-05,
"loss": 0.0047,
"step": 372
},
{
"epoch": 1.2242051282051283,
"grad_norm": 0.003649340244010091,
"learning_rate": 8.060605748105404e-05,
"loss": 0.0049,
"step": 373
},
{
"epoch": 1.2274871794871796,
"grad_norm": 0.0032480864319950342,
"learning_rate": 8.014254124219835e-05,
"loss": 0.0048,
"step": 374
},
{
"epoch": 1.2307692307692308,
"grad_norm": 0.003284406615421176,
"learning_rate": 7.96799131947002e-05,
"loss": 0.0047,
"step": 375
},
{
"epoch": 1.2307692307692308,
"eval_loss": 0.005098435096442699,
"eval_runtime": 1.7095,
"eval_samples_per_second": 29.248,
"eval_steps_per_second": 29.248,
"step": 375
},
{
"epoch": 1.2340512820512821,
"grad_norm": 0.0030009394977241755,
"learning_rate": 7.921818710068983e-05,
"loss": 0.0048,
"step": 376
},
{
"epoch": 1.2373333333333334,
"grad_norm": 0.0028776165563613176,
"learning_rate": 7.875737669546627e-05,
"loss": 0.0048,
"step": 377
},
{
"epoch": 1.2406153846153847,
"grad_norm": 0.0028720826376229525,
"learning_rate": 7.829749568708899e-05,
"loss": 0.0048,
"step": 378
},
{
"epoch": 1.243897435897436,
"grad_norm": 0.0026297715958207846,
"learning_rate": 7.783855775597e-05,
"loss": 0.0047,
"step": 379
},
{
"epoch": 1.2471794871794872,
"grad_norm": 0.002790766768157482,
"learning_rate": 7.738057655446687e-05,
"loss": 0.005,
"step": 380
},
{
"epoch": 1.2504615384615385,
"grad_norm": 0.002748630242422223,
"learning_rate": 7.69235657064767e-05,
"loss": 0.0047,
"step": 381
},
{
"epoch": 1.2537435897435898,
"grad_norm": 0.0028022800106555223,
"learning_rate": 7.646753880703074e-05,
"loss": 0.0048,
"step": 382
},
{
"epoch": 1.257025641025641,
"grad_norm": 0.0030396939255297184,
"learning_rate": 7.601250942189009e-05,
"loss": 0.0048,
"step": 383
},
{
"epoch": 1.2603076923076924,
"grad_norm": 0.003127284813672304,
"learning_rate": 7.555849108714192e-05,
"loss": 0.0049,
"step": 384
},
{
"epoch": 1.2635897435897436,
"grad_norm": 0.0029194881208240986,
"learning_rate": 7.510549730879715e-05,
"loss": 0.0047,
"step": 385
},
{
"epoch": 1.266871794871795,
"grad_norm": 0.0032365506049245596,
"learning_rate": 7.465354156238835e-05,
"loss": 0.0048,
"step": 386
},
{
"epoch": 1.2701538461538462,
"grad_norm": 0.003188034286722541,
"learning_rate": 7.420263729256902e-05,
"loss": 0.0047,
"step": 387
},
{
"epoch": 1.2734358974358975,
"grad_norm": 0.0029549768660217524,
"learning_rate": 7.375279791271368e-05,
"loss": 0.0048,
"step": 388
},
{
"epoch": 1.2767179487179487,
"grad_norm": 0.09431561827659607,
"learning_rate": 7.330403680451869e-05,
"loss": 0.0054,
"step": 389
},
{
"epoch": 1.28,
"grad_norm": 0.0030596228316426277,
"learning_rate": 7.285636731760448e-05,
"loss": 0.0046,
"step": 390
},
{
"epoch": 1.2832820512820513,
"grad_norm": 0.0031989929266273975,
"learning_rate": 7.240980276911804e-05,
"loss": 0.0048,
"step": 391
},
{
"epoch": 1.2865641025641026,
"grad_norm": 0.003118851687759161,
"learning_rate": 7.196435644333708e-05,
"loss": 0.0048,
"step": 392
},
{
"epoch": 1.2898461538461539,
"grad_norm": 0.003467726055532694,
"learning_rate": 7.152004159127463e-05,
"loss": 0.0048,
"step": 393
},
{
"epoch": 1.2931282051282051,
"grad_norm": 0.0033981874585151672,
"learning_rate": 7.107687143028502e-05,
"loss": 0.0048,
"step": 394
},
{
"epoch": 1.2964102564102564,
"grad_norm": 0.002850554184988141,
"learning_rate": 7.063485914367075e-05,
"loss": 0.0047,
"step": 395
},
{
"epoch": 1.2996923076923077,
"grad_norm": 0.0028371524531394243,
"learning_rate": 7.019401788028993e-05,
"loss": 0.0047,
"step": 396
},
{
"epoch": 1.302974358974359,
"grad_norm": 0.003020528005436063,
"learning_rate": 6.975436075416555e-05,
"loss": 0.0049,
"step": 397
},
{
"epoch": 1.3062564102564103,
"grad_norm": 0.0028517835307866335,
"learning_rate": 6.931590084409524e-05,
"loss": 0.0047,
"step": 398
},
{
"epoch": 1.3095384615384615,
"grad_norm": 0.003879575990140438,
"learning_rate": 6.887865119326214e-05,
"loss": 0.0049,
"step": 399
},
{
"epoch": 1.3128205128205128,
"grad_norm": 0.0037416021805256605,
"learning_rate": 6.844262480884697e-05,
"loss": 0.0049,
"step": 400
},
{
"epoch": 1.3128205128205128,
"eval_loss": 0.0050128186121582985,
"eval_runtime": 1.8163,
"eval_samples_per_second": 27.529,
"eval_steps_per_second": 27.529,
"step": 400
},
{
"epoch": 1.316102564102564,
"grad_norm": 0.0030078617855906487,
"learning_rate": 6.800783466164098e-05,
"loss": 0.0047,
"step": 401
},
{
"epoch": 1.3193846153846154,
"grad_norm": 0.0030800465028733015,
"learning_rate": 6.757429368566022e-05,
"loss": 0.0048,
"step": 402
},
{
"epoch": 1.3226666666666667,
"grad_norm": 0.003120367182418704,
"learning_rate": 6.71420147777608e-05,
"loss": 0.0047,
"step": 403
},
{
"epoch": 1.325948717948718,
"grad_norm": 0.003493399592116475,
"learning_rate": 6.671101079725513e-05,
"loss": 0.0049,
"step": 404
},
{
"epoch": 1.3292307692307692,
"grad_norm": 0.0038059516809880733,
"learning_rate": 6.62812945655294e-05,
"loss": 0.005,
"step": 405
},
{
"epoch": 1.3325128205128205,
"grad_norm": 0.009474639780819416,
"learning_rate": 6.58528788656623e-05,
"loss": 0.0052,
"step": 406
},
{
"epoch": 1.3357948717948718,
"grad_norm": 0.009293424896895885,
"learning_rate": 6.542577644204456e-05,
"loss": 0.0053,
"step": 407
},
{
"epoch": 1.339076923076923,
"grad_norm": 0.009816371835768223,
"learning_rate": 6.500000000000002e-05,
"loss": 0.0055,
"step": 408
},
{
"epoch": 1.3423589743589743,
"grad_norm": 0.009543578140437603,
"learning_rate": 6.45755622054075e-05,
"loss": 0.0053,
"step": 409
},
{
"epoch": 1.3456410256410256,
"grad_norm": 0.008382177911698818,
"learning_rate": 6.415247568432425e-05,
"loss": 0.0052,
"step": 410
},
{
"epoch": 1.348923076923077,
"grad_norm": 0.00873810425400734,
"learning_rate": 6.373075302261006e-05,
"loss": 0.0052,
"step": 411
},
{
"epoch": 1.3522051282051282,
"grad_norm": 0.007860679179430008,
"learning_rate": 6.331040676555306e-05,
"loss": 0.0052,
"step": 412
},
{
"epoch": 1.3554871794871794,
"grad_norm": 0.007655597757548094,
"learning_rate": 6.289144941749656e-05,
"loss": 0.005,
"step": 413
},
{
"epoch": 1.3587692307692307,
"grad_norm": 0.008586070500314236,
"learning_rate": 6.247389344146688e-05,
"loss": 0.0051,
"step": 414
},
{
"epoch": 1.362051282051282,
"grad_norm": 0.009542787447571754,
"learning_rate": 6.20577512588028e-05,
"loss": 0.005,
"step": 415
},
{
"epoch": 1.3653333333333333,
"grad_norm": 0.00885209534317255,
"learning_rate": 6.164303524878586e-05,
"loss": 0.005,
"step": 416
},
{
"epoch": 1.3686153846153846,
"grad_norm": 0.006266025826334953,
"learning_rate": 6.122975774827238e-05,
"loss": 0.0048,
"step": 417
},
{
"epoch": 1.3718974358974358,
"grad_norm": 0.0066079627722501755,
"learning_rate": 6.081793105132611e-05,
"loss": 0.0048,
"step": 418
},
{
"epoch": 1.3751794871794871,
"grad_norm": 0.00554714584723115,
"learning_rate": 6.0407567408852874e-05,
"loss": 0.0049,
"step": 419
},
{
"epoch": 1.3784615384615384,
"grad_norm": 0.004965722095221281,
"learning_rate": 5.9998679028235824e-05,
"loss": 0.0047,
"step": 420
},
{
"epoch": 1.3817435897435897,
"grad_norm": 0.004424337763339281,
"learning_rate": 5.959127807297251e-05,
"loss": 0.0047,
"step": 421
},
{
"epoch": 1.385025641025641,
"grad_norm": 0.003751178737729788,
"learning_rate": 5.918537666231296e-05,
"loss": 0.0047,
"step": 422
},
{
"epoch": 1.3883076923076922,
"grad_norm": 0.0038334885612130165,
"learning_rate": 5.8780986870899144e-05,
"loss": 0.0048,
"step": 423
},
{
"epoch": 1.3915897435897435,
"grad_norm": 0.0035623824223876,
"learning_rate": 5.8378120728405885e-05,
"loss": 0.0047,
"step": 424
},
{
"epoch": 1.3948717948717948,
"grad_norm": 0.003139911452308297,
"learning_rate": 5.797679021918272e-05,
"loss": 0.0048,
"step": 425
},
{
"epoch": 1.3948717948717948,
"eval_loss": 0.005003854166716337,
"eval_runtime": 1.5778,
"eval_samples_per_second": 31.689,
"eval_steps_per_second": 31.689,
"step": 425
},
{
"epoch": 1.398153846153846,
"grad_norm": 0.0028019817546010017,
"learning_rate": 5.7577007281897824e-05,
"loss": 0.0047,
"step": 426
},
{
"epoch": 1.4014358974358974,
"grad_norm": 0.002678703283891082,
"learning_rate": 5.717878380918251e-05,
"loss": 0.0046,
"step": 427
},
{
"epoch": 1.4047179487179486,
"grad_norm": 0.00241168774664402,
"learning_rate": 5.678213164727761e-05,
"loss": 0.0047,
"step": 428
},
{
"epoch": 1.408,
"grad_norm": 0.0023575942032039165,
"learning_rate": 5.6387062595681006e-05,
"loss": 0.0046,
"step": 429
},
{
"epoch": 1.4112820512820514,
"grad_norm": 0.002579533262178302,
"learning_rate": 5.599358840679673e-05,
"loss": 0.0047,
"step": 430
},
{
"epoch": 1.4145641025641025,
"grad_norm": 0.002402591984719038,
"learning_rate": 5.560172078558521e-05,
"loss": 0.0047,
"step": 431
},
{
"epoch": 1.417846153846154,
"grad_norm": 0.0023617083206772804,
"learning_rate": 5.521147138921514e-05,
"loss": 0.0047,
"step": 432
},
{
"epoch": 1.421128205128205,
"grad_norm": 0.0024777904618531466,
"learning_rate": 5.4822851826716814e-05,
"loss": 0.0046,
"step": 433
},
{
"epoch": 1.4244102564102565,
"grad_norm": 0.002682394813746214,
"learning_rate": 5.443587365863657e-05,
"loss": 0.0048,
"step": 434
},
{
"epoch": 1.4276923076923076,
"grad_norm": 0.0025969718117266893,
"learning_rate": 5.405054839669306e-05,
"loss": 0.0047,
"step": 435
},
{
"epoch": 1.430974358974359,
"grad_norm": 0.0027512714732438326,
"learning_rate": 5.3666887503434693e-05,
"loss": 0.0048,
"step": 436
},
{
"epoch": 1.4342564102564102,
"grad_norm": 0.002662122482433915,
"learning_rate": 5.3284902391898795e-05,
"loss": 0.0047,
"step": 437
},
{
"epoch": 1.4375384615384617,
"grad_norm": 0.002524281619116664,
"learning_rate": 5.290460442527192e-05,
"loss": 0.0047,
"step": 438
},
{
"epoch": 1.4408205128205127,
"grad_norm": 0.003124300390481949,
"learning_rate": 5.252600491655193e-05,
"loss": 0.0047,
"step": 439
},
{
"epoch": 1.4441025641025642,
"grad_norm": 0.0028491078410297632,
"learning_rate": 5.214911512821145e-05,
"loss": 0.0047,
"step": 440
},
{
"epoch": 1.4473846153846153,
"grad_norm": 0.003151330165565014,
"learning_rate": 5.177394627186285e-05,
"loss": 0.0047,
"step": 441
},
{
"epoch": 1.4506666666666668,
"grad_norm": 0.002938151592388749,
"learning_rate": 5.1400509507924596e-05,
"loss": 0.0047,
"step": 442
},
{
"epoch": 1.4539487179487178,
"grad_norm": 0.0029112198390066624,
"learning_rate": 5.102881594528941e-05,
"loss": 0.0047,
"step": 443
},
{
"epoch": 1.4572307692307693,
"grad_norm": 0.003463789587840438,
"learning_rate": 5.06588766409938e-05,
"loss": 0.0048,
"step": 444
},
{
"epoch": 1.4605128205128204,
"grad_norm": 0.003294401103630662,
"learning_rate": 5.0290702599889016e-05,
"loss": 0.0047,
"step": 445
},
{
"epoch": 1.4637948717948719,
"grad_norm": 0.0029868704732507467,
"learning_rate": 4.9924304774313756e-05,
"loss": 0.0047,
"step": 446
},
{
"epoch": 1.467076923076923,
"grad_norm": 0.003178369253873825,
"learning_rate": 4.955969406376835e-05,
"loss": 0.0047,
"step": 447
},
{
"epoch": 1.4703589743589744,
"grad_norm": 0.0042068324983119965,
"learning_rate": 4.919688131459058e-05,
"loss": 0.0048,
"step": 448
},
{
"epoch": 1.4736410256410255,
"grad_norm": 0.00410189013928175,
"learning_rate": 4.883587731963295e-05,
"loss": 0.0048,
"step": 449
},
{
"epoch": 1.476923076923077,
"grad_norm": 0.0037490760441869497,
"learning_rate": 4.847669281794158e-05,
"loss": 0.005,
"step": 450
},
{
"epoch": 1.476923076923077,
"eval_loss": 0.004960117861628532,
"eval_runtime": 1.0676,
"eval_samples_per_second": 46.836,
"eval_steps_per_second": 46.836,
"step": 450
},
{
"epoch": 1.4802051282051283,
"grad_norm": 0.003720939392223954,
"learning_rate": 4.811933849443693e-05,
"loss": 0.0048,
"step": 451
},
{
"epoch": 1.4834871794871796,
"grad_norm": 0.003880757139995694,
"learning_rate": 4.776382497959577e-05,
"loss": 0.0049,
"step": 452
},
{
"epoch": 1.4867692307692308,
"grad_norm": 0.005387154407799244,
"learning_rate": 4.741016284913496e-05,
"loss": 0.0049,
"step": 453
},
{
"epoch": 1.4900512820512821,
"grad_norm": 0.0035007649566978216,
"learning_rate": 4.705836262369696e-05,
"loss": 0.0048,
"step": 454
},
{
"epoch": 1.4933333333333334,
"grad_norm": 0.003741365857422352,
"learning_rate": 4.670843476853683e-05,
"loss": 0.005,
"step": 455
},
{
"epoch": 1.4966153846153847,
"grad_norm": 0.009279366582632065,
"learning_rate": 4.6360389693210735e-05,
"loss": 0.0053,
"step": 456
},
{
"epoch": 1.499897435897436,
"grad_norm": 0.00897167343646288,
"learning_rate": 4.601423775126657e-05,
"loss": 0.0052,
"step": 457
},
{
"epoch": 1.5031794871794872,
"grad_norm": 0.009041314013302326,
"learning_rate": 4.566998923993585e-05,
"loss": 0.0053,
"step": 458
},
{
"epoch": 1.5064615384615383,
"grad_norm": 0.008953151293098927,
"learning_rate": 4.5327654399827175e-05,
"loss": 0.0053,
"step": 459
},
{
"epoch": 1.5097435897435898,
"grad_norm": 0.008846296928822994,
"learning_rate": 4.4987243414622004e-05,
"loss": 0.0052,
"step": 460
},
{
"epoch": 1.513025641025641,
"grad_norm": 0.008246372453868389,
"learning_rate": 4.464876641077137e-05,
"loss": 0.0052,
"step": 461
},
{
"epoch": 1.5163076923076924,
"grad_norm": 0.0073341550305485725,
"learning_rate": 4.431223345719482e-05,
"loss": 0.0051,
"step": 462
},
{
"epoch": 1.5195897435897436,
"grad_norm": 0.006810352671891451,
"learning_rate": 4.397765456498075e-05,
"loss": 0.0051,
"step": 463
},
{
"epoch": 1.522871794871795,
"grad_norm": 0.007108463905751705,
"learning_rate": 4.364503968708885e-05,
"loss": 0.005,
"step": 464
},
{
"epoch": 1.5261538461538462,
"grad_norm": 0.00809982419013977,
"learning_rate": 4.33143987180537e-05,
"loss": 0.005,
"step": 465
},
{
"epoch": 1.5294358974358975,
"grad_norm": 0.007939063012599945,
"learning_rate": 4.298574149369064e-05,
"loss": 0.005,
"step": 466
},
{
"epoch": 1.5327179487179488,
"grad_norm": 0.006030023563653231,
"learning_rate": 4.2659077790803183e-05,
"loss": 0.0048,
"step": 467
},
{
"epoch": 1.536,
"grad_norm": 0.0052713961340487,
"learning_rate": 4.233441732689205e-05,
"loss": 0.0047,
"step": 468
},
{
"epoch": 1.5392820512820513,
"grad_norm": 0.0051575591787695885,
"learning_rate": 4.201176975986618e-05,
"loss": 0.0047,
"step": 469
},
{
"epoch": 1.5425641025641026,
"grad_norm": 0.004836643114686012,
"learning_rate": 4.1691144687755434e-05,
"loss": 0.0048,
"step": 470
},
{
"epoch": 1.5458461538461539,
"grad_norm": 0.004236603155732155,
"learning_rate": 4.137255164842508e-05,
"loss": 0.0047,
"step": 471
},
{
"epoch": 1.5491282051282051,
"grad_norm": 0.0034698331728577614,
"learning_rate": 4.1056000119291995e-05,
"loss": 0.0047,
"step": 472
},
{
"epoch": 1.5524102564102564,
"grad_norm": 0.0033043124713003635,
"learning_rate": 4.074149951704279e-05,
"loss": 0.0048,
"step": 473
},
{
"epoch": 1.5556923076923077,
"grad_norm": 0.00292124948464334,
"learning_rate": 4.042905919735367e-05,
"loss": 0.0047,
"step": 474
},
{
"epoch": 1.558974358974359,
"grad_norm": 0.0029612670186907053,
"learning_rate": 4.0118688454612205e-05,
"loss": 0.0048,
"step": 475
},
{
"epoch": 1.558974358974359,
"eval_loss": 0.004941979423165321,
"eval_runtime": 1.0895,
"eval_samples_per_second": 45.893,
"eval_steps_per_second": 45.893,
"step": 475
},
{
"epoch": 1.5622564102564103,
"grad_norm": 0.0030873967334628105,
"learning_rate": 3.9810396521640656e-05,
"loss": 0.0048,
"step": 476
},
{
"epoch": 1.5655384615384615,
"grad_norm": 0.0027741712983697653,
"learning_rate": 3.9504192569421475e-05,
"loss": 0.0047,
"step": 477
},
{
"epoch": 1.5688205128205128,
"grad_norm": 0.0026183989830315113,
"learning_rate": 3.9200085706824475e-05,
"loss": 0.0047,
"step": 478
},
{
"epoch": 1.572102564102564,
"grad_norm": 0.002464702120050788,
"learning_rate": 3.88980849803358e-05,
"loss": 0.0047,
"step": 479
},
{
"epoch": 1.5753846153846154,
"grad_norm": 0.002672748640179634,
"learning_rate": 3.8598199373788846e-05,
"loss": 0.0048,
"step": 480
},
{
"epoch": 1.5786666666666667,
"grad_norm": 0.002426745370030403,
"learning_rate": 3.8300437808097e-05,
"loss": 0.0048,
"step": 481
},
{
"epoch": 1.581948717948718,
"grad_norm": 0.002388445660471916,
"learning_rate": 3.800480914098834e-05,
"loss": 0.0048,
"step": 482
},
{
"epoch": 1.5852307692307692,
"grad_norm": 0.0023456227499991655,
"learning_rate": 3.771132216674197e-05,
"loss": 0.0048,
"step": 483
},
{
"epoch": 1.5885128205128205,
"grad_norm": 0.0024677433539181948,
"learning_rate": 3.741998561592657e-05,
"loss": 0.0048,
"step": 484
},
{
"epoch": 1.5917948717948718,
"grad_norm": 0.002437222981825471,
"learning_rate": 3.713080815514063e-05,
"loss": 0.0048,
"step": 485
},
{
"epoch": 1.595076923076923,
"grad_norm": 0.0025231228210031986,
"learning_rate": 3.684379838675464e-05,
"loss": 0.0048,
"step": 486
},
{
"epoch": 1.5983589743589743,
"grad_norm": 0.0025096021126955748,
"learning_rate": 3.655896484865512e-05,
"loss": 0.0047,
"step": 487
},
{
"epoch": 1.6016410256410256,
"grad_norm": 0.0026138813700526953,
"learning_rate": 3.627631601399073e-05,
"loss": 0.0048,
"step": 488
},
{
"epoch": 1.604923076923077,
"grad_norm": 0.0025557177141308784,
"learning_rate": 3.599586029092027e-05,
"loss": 0.0047,
"step": 489
},
{
"epoch": 1.6082051282051282,
"grad_norm": 0.0025317317340523005,
"learning_rate": 3.571760602236226e-05,
"loss": 0.0047,
"step": 490
},
{
"epoch": 1.6114871794871795,
"grad_norm": 0.0025278881657868624,
"learning_rate": 3.54415614857472e-05,
"loss": 0.0048,
"step": 491
},
{
"epoch": 1.6147692307692307,
"grad_norm": 0.003274059621617198,
"learning_rate": 3.516773489277092e-05,
"loss": 0.0049,
"step": 492
},
{
"epoch": 1.618051282051282,
"grad_norm": 0.0029409686103463173,
"learning_rate": 3.489613438915061e-05,
"loss": 0.0048,
"step": 493
},
{
"epoch": 1.6213333333333333,
"grad_norm": 0.003028150415048003,
"learning_rate": 3.4626768054382305e-05,
"loss": 0.0048,
"step": 494
},
{
"epoch": 1.6246153846153846,
"grad_norm": 0.0030735956970602274,
"learning_rate": 3.435964390150057e-05,
"loss": 0.0049,
"step": 495
},
{
"epoch": 1.6278974358974359,
"grad_norm": 0.0029834897723048925,
"learning_rate": 3.409476987684031e-05,
"loss": 0.0048,
"step": 496
},
{
"epoch": 1.6311794871794871,
"grad_norm": 0.0033205023501068354,
"learning_rate": 3.3832153859800054e-05,
"loss": 0.0048,
"step": 497
},
{
"epoch": 1.6344615384615384,
"grad_norm": 0.0034721517004072666,
"learning_rate": 3.357180366260791e-05,
"loss": 0.0048,
"step": 498
},
{
"epoch": 1.6377435897435897,
"grad_norm": 0.003484040265902877,
"learning_rate": 3.3313727030088934e-05,
"loss": 0.0048,
"step": 499
},
{
"epoch": 1.641025641025641,
"grad_norm": 0.003361676586791873,
"learning_rate": 3.305793163943483e-05,
"loss": 0.0048,
"step": 500
},
{
"epoch": 1.641025641025641,
"eval_loss": 0.004984674043953419,
"eval_runtime": 1.057,
"eval_samples_per_second": 47.306,
"eval_steps_per_second": 47.306,
"step": 500
},
{
"epoch": 1.6443076923076925,
"grad_norm": 0.0035249050706624985,
"learning_rate": 3.2804425099975525e-05,
"loss": 0.0048,
"step": 501
},
{
"epoch": 1.6475897435897435,
"grad_norm": 0.0035485997796058655,
"learning_rate": 3.25532149529529e-05,
"loss": 0.0048,
"step": 502
},
{
"epoch": 1.650871794871795,
"grad_norm": 0.0034246218856424093,
"learning_rate": 3.2304308671296355e-05,
"loss": 0.0048,
"step": 503
},
{
"epoch": 1.654153846153846,
"grad_norm": 0.0036572501994669437,
"learning_rate": 3.205771365940052e-05,
"loss": 0.0048,
"step": 504
},
{
"epoch": 1.6574358974358976,
"grad_norm": 0.003463857341557741,
"learning_rate": 3.1813437252905096e-05,
"loss": 0.0049,
"step": 505
},
{
"epoch": 1.6607179487179486,
"grad_norm": 0.008703382685780525,
"learning_rate": 3.157148671847649e-05,
"loss": 0.0052,
"step": 506
},
{
"epoch": 1.6640000000000001,
"grad_norm": 0.00902031920850277,
"learning_rate": 3.133186925359172e-05,
"loss": 0.0053,
"step": 507
},
{
"epoch": 1.6672820512820512,
"grad_norm": 0.00848945789039135,
"learning_rate": 3.109459198632431e-05,
"loss": 0.0052,
"step": 508
},
{
"epoch": 1.6705641025641027,
"grad_norm": 0.008561553433537483,
"learning_rate": 3.085966197513227e-05,
"loss": 0.0053,
"step": 509
},
{
"epoch": 1.6738461538461538,
"grad_norm": 0.008840641938149929,
"learning_rate": 3.062708620864806e-05,
"loss": 0.0053,
"step": 510
},
{
"epoch": 1.6771282051282053,
"grad_norm": 0.008055276237428188,
"learning_rate": 3.0396871605470702e-05,
"loss": 0.0052,
"step": 511
},
{
"epoch": 1.6804102564102563,
"grad_norm": 0.007263275794684887,
"learning_rate": 3.0169025013960052e-05,
"loss": 0.0052,
"step": 512
},
{
"epoch": 1.6836923076923078,
"grad_norm": 0.006828081328421831,
"learning_rate": 2.9943553212032964e-05,
"loss": 0.005,
"step": 513
},
{
"epoch": 1.6869743589743589,
"grad_norm": 0.006435649003833532,
"learning_rate": 2.972046290696173e-05,
"loss": 0.0051,
"step": 514
},
{
"epoch": 1.6902564102564104,
"grad_norm": 0.007080839481204748,
"learning_rate": 2.9499760735174537e-05,
"loss": 0.0049,
"step": 515
},
{
"epoch": 1.6935384615384614,
"grad_norm": 0.006745288148522377,
"learning_rate": 2.928145326205806e-05,
"loss": 0.0049,
"step": 516
},
{
"epoch": 1.696820512820513,
"grad_norm": 0.00520884245634079,
"learning_rate": 2.906554698176213e-05,
"loss": 0.0047,
"step": 517
},
{
"epoch": 1.700102564102564,
"grad_norm": 0.00431660795584321,
"learning_rate": 2.8852048317006565e-05,
"loss": 0.0047,
"step": 518
},
{
"epoch": 1.7033846153846155,
"grad_norm": 0.004076897166669369,
"learning_rate": 2.8640963618890103e-05,
"loss": 0.0046,
"step": 519
},
{
"epoch": 1.7066666666666666,
"grad_norm": 0.0038497080095112324,
"learning_rate": 2.8432299166701508e-05,
"loss": 0.0048,
"step": 520
},
{
"epoch": 1.709948717948718,
"grad_norm": 0.0030418243259191513,
"learning_rate": 2.8226061167732704e-05,
"loss": 0.0047,
"step": 521
},
{
"epoch": 1.7132307692307691,
"grad_norm": 0.0033263147342950106,
"learning_rate": 2.8022255757094174e-05,
"loss": 0.0046,
"step": 522
},
{
"epoch": 1.7165128205128206,
"grad_norm": 0.0030696901958435774,
"learning_rate": 2.7820888997532464e-05,
"loss": 0.0047,
"step": 523
},
{
"epoch": 1.7197948717948717,
"grad_norm": 0.002596198348328471,
"learning_rate": 2.7621966879249762e-05,
"loss": 0.0048,
"step": 524
},
{
"epoch": 1.7230769230769232,
"grad_norm": 0.00273293349891901,
"learning_rate": 2.7425495319725793e-05,
"loss": 0.0047,
"step": 525
},
{
"epoch": 1.7230769230769232,
"eval_loss": 0.004907587543129921,
"eval_runtime": 1.1425,
"eval_samples_per_second": 43.766,
"eval_steps_per_second": 43.766,
"step": 525
},
{
"epoch": 1.7263589743589742,
"grad_norm": 0.0025641010142862797,
"learning_rate": 2.72314801635417e-05,
"loss": 0.0047,
"step": 526
},
{
"epoch": 1.7296410256410257,
"grad_norm": 0.002465146593749523,
"learning_rate": 2.7039927182206293e-05,
"loss": 0.0047,
"step": 527
},
{
"epoch": 1.7329230769230768,
"grad_norm": 0.0026465991977602243,
"learning_rate": 2.6850842073984196e-05,
"loss": 0.0047,
"step": 528
},
{
"epoch": 1.7362051282051283,
"grad_norm": 0.002503578085452318,
"learning_rate": 2.666423046372651e-05,
"loss": 0.0048,
"step": 529
},
{
"epoch": 1.7394871794871793,
"grad_norm": 0.002414755057543516,
"learning_rate": 2.6480097902703404e-05,
"loss": 0.0047,
"step": 530
},
{
"epoch": 1.7427692307692308,
"grad_norm": 0.00238244840875268,
"learning_rate": 2.629844986843892e-05,
"loss": 0.0046,
"step": 531
},
{
"epoch": 1.746051282051282,
"grad_norm": 0.002321184379979968,
"learning_rate": 2.611929176454814e-05,
"loss": 0.0047,
"step": 532
},
{
"epoch": 1.7493333333333334,
"grad_norm": 0.002244308590888977,
"learning_rate": 2.59426289205764e-05,
"loss": 0.0047,
"step": 533
},
{
"epoch": 1.7526153846153845,
"grad_norm": 0.0020898343063890934,
"learning_rate": 2.5768466591840707e-05,
"loss": 0.0047,
"step": 534
},
{
"epoch": 1.755897435897436,
"grad_norm": 0.002154160290956497,
"learning_rate": 2.5596809959273432e-05,
"loss": 0.0048,
"step": 535
},
{
"epoch": 1.759179487179487,
"grad_norm": 0.0022296863608062267,
"learning_rate": 2.542766412926825e-05,
"loss": 0.0047,
"step": 536
},
{
"epoch": 1.7624615384615385,
"grad_norm": 0.0023085817229002714,
"learning_rate": 2.5261034133528138e-05,
"loss": 0.0048,
"step": 537
},
{
"epoch": 1.7657435897435896,
"grad_norm": 0.0021941603627055883,
"learning_rate": 2.5096924928915733e-05,
"loss": 0.0047,
"step": 538
},
{
"epoch": 1.769025641025641,
"grad_norm": 0.002463526790961623,
"learning_rate": 2.4935341397305903e-05,
"loss": 0.0047,
"step": 539
},
{
"epoch": 1.7723076923076924,
"grad_norm": 0.0022917832247912884,
"learning_rate": 2.4776288345440503e-05,
"loss": 0.0048,
"step": 540
},
{
"epoch": 1.7755897435897436,
"grad_norm": 0.0023666713386774063,
"learning_rate": 2.461977050478534e-05,
"loss": 0.0049,
"step": 541
},
{
"epoch": 1.778871794871795,
"grad_norm": 0.0024120372254401445,
"learning_rate": 2.4465792531389504e-05,
"loss": 0.0047,
"step": 542
},
{
"epoch": 1.7821538461538462,
"grad_norm": 0.0027782933320850134,
"learning_rate": 2.4314359005746817e-05,
"loss": 0.0047,
"step": 543
},
{
"epoch": 1.7854358974358975,
"grad_norm": 0.0026203745510429144,
"learning_rate": 2.4165474432659588e-05,
"loss": 0.0047,
"step": 544
},
{
"epoch": 1.7887179487179488,
"grad_norm": 0.0026280086021870375,
"learning_rate": 2.401914324110456e-05,
"loss": 0.0046,
"step": 545
},
{
"epoch": 1.792,
"grad_norm": 0.002855598460882902,
"learning_rate": 2.387536978410121e-05,
"loss": 0.0047,
"step": 546
},
{
"epoch": 1.7952820512820513,
"grad_norm": 0.0029618304688483477,
"learning_rate": 2.373415833858226e-05,
"loss": 0.0047,
"step": 547
},
{
"epoch": 1.7985641025641026,
"grad_norm": 0.0031228596344590187,
"learning_rate": 2.359551310526643e-05,
"loss": 0.0048,
"step": 548
},
{
"epoch": 1.8018461538461539,
"grad_norm": 0.003316520480439067,
"learning_rate": 2.345943820853342e-05,
"loss": 0.0047,
"step": 549
},
{
"epoch": 1.8051282051282052,
"grad_norm": 0.004680112935602665,
"learning_rate": 2.332593769630136e-05,
"loss": 0.0048,
"step": 550
},
{
"epoch": 1.8051282051282052,
"eval_loss": 0.005017252638936043,
"eval_runtime": 1.0529,
"eval_samples_per_second": 47.487,
"eval_steps_per_second": 47.487,
"step": 550
},
{
"epoch": 1.8084102564102564,
"grad_norm": 0.0034276428632438183,
"learning_rate": 2.3195015539906243e-05,
"loss": 0.0049,
"step": 551
},
{
"epoch": 1.8116923076923077,
"grad_norm": 0.0030685942620038986,
"learning_rate": 2.3066675633983865e-05,
"loss": 0.0048,
"step": 552
},
{
"epoch": 1.814974358974359,
"grad_norm": 0.0046894908882677555,
"learning_rate": 2.2940921796353956e-05,
"loss": 0.0047,
"step": 553
},
{
"epoch": 1.8182564102564103,
"grad_norm": 0.0035674276296049356,
"learning_rate": 2.2817757767906625e-05,
"loss": 0.0048,
"step": 554
},
{
"epoch": 1.8215384615384616,
"grad_norm": 0.005269620567560196,
"learning_rate": 2.2697187212491044e-05,
"loss": 0.0051,
"step": 555
},
{
"epoch": 1.8248205128205128,
"grad_norm": 0.008738451637327671,
"learning_rate": 2.2579213716806474e-05,
"loss": 0.0052,
"step": 556
},
{
"epoch": 1.828102564102564,
"grad_norm": 0.008472139947116375,
"learning_rate": 2.2463840790295566e-05,
"loss": 0.0051,
"step": 557
},
{
"epoch": 1.8313846153846154,
"grad_norm": 0.008605373091995716,
"learning_rate": 2.2351071865039974e-05,
"loss": 0.0051,
"step": 558
},
{
"epoch": 1.8346666666666667,
"grad_norm": 0.02175315096974373,
"learning_rate": 2.224091029565824e-05,
"loss": 0.0053,
"step": 559
},
{
"epoch": 1.837948717948718,
"grad_norm": 0.008465359918773174,
"learning_rate": 2.2133359359206e-05,
"loss": 0.0052,
"step": 560
},
{
"epoch": 1.8412307692307692,
"grad_norm": 0.007232977543026209,
"learning_rate": 2.2028422255078542e-05,
"loss": 0.0052,
"step": 561
},
{
"epoch": 1.8445128205128205,
"grad_norm": 0.007051311433315277,
"learning_rate": 2.1926102104915553e-05,
"loss": 0.0051,
"step": 562
},
{
"epoch": 1.8477948717948718,
"grad_norm": 0.006151077803224325,
"learning_rate": 2.182640195250835e-05,
"loss": 0.005,
"step": 563
},
{
"epoch": 1.851076923076923,
"grad_norm": 0.006573867984116077,
"learning_rate": 2.1729324763709264e-05,
"loss": 0.0051,
"step": 564
},
{
"epoch": 1.8543589743589743,
"grad_norm": 0.00678396737203002,
"learning_rate": 2.1634873426343427e-05,
"loss": 0.0049,
"step": 565
},
{
"epoch": 1.8576410256410256,
"grad_norm": 0.005578219890594482,
"learning_rate": 2.1543050750122902e-05,
"loss": 0.0048,
"step": 566
},
{
"epoch": 1.860923076923077,
"grad_norm": 0.0040833973325788975,
"learning_rate": 2.145385946656303e-05,
"loss": 0.0047,
"step": 567
},
{
"epoch": 1.8642051282051282,
"grad_norm": 0.004177347291260958,
"learning_rate": 2.1367302228901282e-05,
"loss": 0.0046,
"step": 568
},
{
"epoch": 1.8674871794871795,
"grad_norm": 0.0036663906648755074,
"learning_rate": 2.128338161201819e-05,
"loss": 0.0047,
"step": 569
},
{
"epoch": 1.8707692307692307,
"grad_norm": 0.003597427159547806,
"learning_rate": 2.1202100112360894e-05,
"loss": 0.0048,
"step": 570
},
{
"epoch": 1.874051282051282,
"grad_norm": 0.0029398370534181595,
"learning_rate": 2.1123460147868763e-05,
"loss": 0.0048,
"step": 571
},
{
"epoch": 1.8773333333333333,
"grad_norm": 0.003072077641263604,
"learning_rate": 2.1047464057901542e-05,
"loss": 0.0048,
"step": 572
},
{
"epoch": 1.8806153846153846,
"grad_norm": 0.002605011221021414,
"learning_rate": 2.0974114103169712e-05,
"loss": 0.0048,
"step": 573
},
{
"epoch": 1.8838974358974359,
"grad_norm": 0.002371675567701459,
"learning_rate": 2.0903412465667293e-05,
"loss": 0.0047,
"step": 574
},
{
"epoch": 1.8871794871794871,
"grad_norm": 0.002911495743319392,
"learning_rate": 2.0835361248606867e-05,
"loss": 0.0047,
"step": 575
},
{
"epoch": 1.8871794871794871,
"eval_loss": 0.0050178528763353825,
"eval_runtime": 1.0828,
"eval_samples_per_second": 46.176,
"eval_steps_per_second": 46.176,
"step": 575
},
{
"epoch": 1.8904615384615384,
"grad_norm": 0.0025259945541620255,
"learning_rate": 2.0769962476357068e-05,
"loss": 0.0047,
"step": 576
},
{
"epoch": 1.8937435897435897,
"grad_norm": 0.0023200158029794693,
"learning_rate": 2.070721809438233e-05,
"loss": 0.0047,
"step": 577
},
{
"epoch": 1.897025641025641,
"grad_norm": 0.0023292931728065014,
"learning_rate": 2.0647129969185046e-05,
"loss": 0.0048,
"step": 578
},
{
"epoch": 1.9003076923076923,
"grad_norm": 0.0025951117277145386,
"learning_rate": 2.058969988825001e-05,
"loss": 0.0047,
"step": 579
},
{
"epoch": 1.9035897435897438,
"grad_norm": 0.0026415924075990915,
"learning_rate": 2.0534929559991233e-05,
"loss": 0.0047,
"step": 580
},
{
"epoch": 1.9068717948717948,
"grad_norm": 0.0020874382462352514,
"learning_rate": 2.0482820613701192e-05,
"loss": 0.0046,
"step": 581
},
{
"epoch": 1.9101538461538463,
"grad_norm": 0.002052360912784934,
"learning_rate": 2.043337459950229e-05,
"loss": 0.0046,
"step": 582
},
{
"epoch": 1.9134358974358974,
"grad_norm": 0.0021120973397046328,
"learning_rate": 2.0386592988300747e-05,
"loss": 0.0046,
"step": 583
},
{
"epoch": 1.9167179487179489,
"grad_norm": 0.0021454044617712498,
"learning_rate": 2.03424771717429e-05,
"loss": 0.0047,
"step": 584
},
{
"epoch": 1.92,
"grad_norm": 0.0023362315259873867,
"learning_rate": 2.0301028462173774e-05,
"loss": 0.0048,
"step": 585
},
{
"epoch": 1.9232820512820514,
"grad_norm": 0.002209689933806658,
"learning_rate": 2.0262248092598006e-05,
"loss": 0.0048,
"step": 586
},
{
"epoch": 1.9265641025641025,
"grad_norm": 0.0022381660528481007,
"learning_rate": 2.0226137216643222e-05,
"loss": 0.0048,
"step": 587
},
{
"epoch": 1.929846153846154,
"grad_norm": 0.002202109433710575,
"learning_rate": 2.019269690852569e-05,
"loss": 0.0047,
"step": 588
},
{
"epoch": 1.933128205128205,
"grad_norm": 0.0021981867030262947,
"learning_rate": 2.016192816301837e-05,
"loss": 0.0046,
"step": 589
},
{
"epoch": 1.9364102564102565,
"grad_norm": 0.002059696475043893,
"learning_rate": 2.0133831895421322e-05,
"loss": 0.0047,
"step": 590
},
{
"epoch": 1.9396923076923076,
"grad_norm": 0.0020739359315484762,
"learning_rate": 2.0108408941534486e-05,
"loss": 0.0046,
"step": 591
},
{
"epoch": 1.942974358974359,
"grad_norm": 0.0024034185335040092,
"learning_rate": 2.00856600576328e-05,
"loss": 0.0047,
"step": 592
},
{
"epoch": 1.9462564102564102,
"grad_norm": 0.0022281610872596502,
"learning_rate": 2.006558592044373e-05,
"loss": 0.0048,
"step": 593
},
{
"epoch": 1.9495384615384617,
"grad_norm": 0.0029593328945338726,
"learning_rate": 2.0048187127127092e-05,
"loss": 0.0049,
"step": 594
},
{
"epoch": 1.9528205128205127,
"grad_norm": 0.002573527628555894,
"learning_rate": 2.003346419525735e-05,
"loss": 0.0048,
"step": 595
},
{
"epoch": 1.9561025641025642,
"grad_norm": 0.002822197275236249,
"learning_rate": 2.002141756280818e-05,
"loss": 0.0047,
"step": 596
},
{
"epoch": 1.9593846153846153,
"grad_norm": 0.002600959734991193,
"learning_rate": 2.001204758813944e-05,
"loss": 0.0047,
"step": 597
},
{
"epoch": 1.9626666666666668,
"grad_norm": 0.003187810303643346,
"learning_rate": 2.0005354549986523e-05,
"loss": 0.0047,
"step": 598
},
{
"epoch": 1.9659487179487178,
"grad_norm": 0.0029263379983603954,
"learning_rate": 2.0001338647452058e-05,
"loss": 0.0048,
"step": 599
},
{
"epoch": 1.9692307692307693,
"grad_norm": 0.003195718163624406,
"learning_rate": 2e-05,
"loss": 0.0048,
"step": 600
},
{
"epoch": 1.9692307692307693,
"eval_loss": 0.004978457931429148,
"eval_runtime": 1.0723,
"eval_samples_per_second": 46.628,
"eval_steps_per_second": 46.628,
"step": 600
}
],
"logging_steps": 1,
"max_steps": 600,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 80,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 3
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.654552359691878e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}