27b-7-lora / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
03fa3fd verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1638,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003663003663003663,
"grad_norm": 2.7161898612976074,
"learning_rate": 1.0000000000000002e-06,
"loss": 2.6832668781280518,
"step": 2
},
{
"epoch": 0.007326007326007326,
"grad_norm": 1.0459221601486206,
"learning_rate": 3e-06,
"loss": 1.6646876335144043,
"step": 4
},
{
"epoch": 0.01098901098901099,
"grad_norm": 0.2796992063522339,
"learning_rate": 5e-06,
"loss": 1.8732850551605225,
"step": 6
},
{
"epoch": 0.014652014652014652,
"grad_norm": 0.3104994297027588,
"learning_rate": 7.000000000000001e-06,
"loss": 1.9880081415176392,
"step": 8
},
{
"epoch": 0.018315018315018316,
"grad_norm": 0.17194640636444092,
"learning_rate": 9e-06,
"loss": 2.0404136180877686,
"step": 10
},
{
"epoch": 0.02197802197802198,
"grad_norm": 0.6366355419158936,
"learning_rate": 1.1000000000000001e-05,
"loss": 1.7833327054977417,
"step": 12
},
{
"epoch": 0.02564102564102564,
"grad_norm": 0.19716234505176544,
"learning_rate": 1.3000000000000001e-05,
"loss": 1.6680744886398315,
"step": 14
},
{
"epoch": 0.029304029304029304,
"grad_norm": 0.3166082799434662,
"learning_rate": 1.5e-05,
"loss": 1.5191091299057007,
"step": 16
},
{
"epoch": 0.03296703296703297,
"grad_norm": 0.6405833959579468,
"learning_rate": 1.7000000000000003e-05,
"loss": 1.437489628791809,
"step": 18
},
{
"epoch": 0.03663003663003663,
"grad_norm": 0.10038357973098755,
"learning_rate": 1.9e-05,
"loss": 1.6290624141693115,
"step": 20
},
{
"epoch": 0.040293040293040296,
"grad_norm": 0.311852365732193,
"learning_rate": 2.1e-05,
"loss": 0.8650764226913452,
"step": 22
},
{
"epoch": 0.04395604395604396,
"grad_norm": 0.24845249950885773,
"learning_rate": 2.3000000000000003e-05,
"loss": 0.9759135842323303,
"step": 24
},
{
"epoch": 0.047619047619047616,
"grad_norm": 0.32957103848457336,
"learning_rate": 2.5e-05,
"loss": 1.30423903465271,
"step": 26
},
{
"epoch": 0.05128205128205128,
"grad_norm": 0.2035657912492752,
"learning_rate": 2.7000000000000002e-05,
"loss": 1.0214941501617432,
"step": 28
},
{
"epoch": 0.054945054945054944,
"grad_norm": 0.4259459972381592,
"learning_rate": 2.9e-05,
"loss": 1.1116687059402466,
"step": 30
},
{
"epoch": 0.05860805860805861,
"grad_norm": 0.295806884765625,
"learning_rate": 3.1e-05,
"loss": 1.0386732816696167,
"step": 32
},
{
"epoch": 0.06227106227106227,
"grad_norm": 0.05838385224342346,
"learning_rate": 3.3e-05,
"loss": 1.1950305700302124,
"step": 34
},
{
"epoch": 0.06593406593406594,
"grad_norm": 0.0978633388876915,
"learning_rate": 3.5e-05,
"loss": 1.5224722623825073,
"step": 36
},
{
"epoch": 0.0695970695970696,
"grad_norm": 0.3066957890987396,
"learning_rate": 3.7e-05,
"loss": 1.309075951576233,
"step": 38
},
{
"epoch": 0.07326007326007326,
"grad_norm": 0.160082146525383,
"learning_rate": 3.9000000000000006e-05,
"loss": 1.3190542459487915,
"step": 40
},
{
"epoch": 0.07692307692307693,
"grad_norm": 0.34093159437179565,
"learning_rate": 4.1e-05,
"loss": 0.985637903213501,
"step": 42
},
{
"epoch": 0.08058608058608059,
"grad_norm": 0.3740093410015106,
"learning_rate": 4.3e-05,
"loss": 1.4261013269424438,
"step": 44
},
{
"epoch": 0.08424908424908426,
"grad_norm": 0.2005496323108673,
"learning_rate": 4.5e-05,
"loss": 1.5266393423080444,
"step": 46
},
{
"epoch": 0.08791208791208792,
"grad_norm": 0.13309991359710693,
"learning_rate": 4.7e-05,
"loss": 0.9458646774291992,
"step": 48
},
{
"epoch": 0.09157509157509157,
"grad_norm": 0.15211951732635498,
"learning_rate": 4.9e-05,
"loss": 1.4107502698898315,
"step": 50
},
{
"epoch": 0.09523809523809523,
"grad_norm": 0.22298896312713623,
"learning_rate": 4.9999955969752164e-05,
"loss": 0.7173015475273132,
"step": 52
},
{
"epoch": 0.0989010989010989,
"grad_norm": 0.2911456227302551,
"learning_rate": 4.999960372880343e-05,
"loss": 0.8890910744667053,
"step": 54
},
{
"epoch": 0.10256410256410256,
"grad_norm": 0.9632299542427063,
"learning_rate": 4.9998899252420356e-05,
"loss": 1.2817891836166382,
"step": 56
},
{
"epoch": 0.10622710622710622,
"grad_norm": 0.5119697451591492,
"learning_rate": 4.9997842551631656e-05,
"loss": 1.1215670108795166,
"step": 58
},
{
"epoch": 0.10989010989010989,
"grad_norm": 0.1322525441646576,
"learning_rate": 4.999643364298017e-05,
"loss": 1.3118717670440674,
"step": 60
},
{
"epoch": 0.11355311355311355,
"grad_norm": 0.1301015466451645,
"learning_rate": 4.9994672548522613e-05,
"loss": 1.3311526775360107,
"step": 62
},
{
"epoch": 0.11721611721611722,
"grad_norm": 0.11371763795614243,
"learning_rate": 4.999255929582926e-05,
"loss": 1.3023815155029297,
"step": 64
},
{
"epoch": 0.12087912087912088,
"grad_norm": 0.14048679172992706,
"learning_rate": 4.9990093917983465e-05,
"loss": 1.2759833335876465,
"step": 66
},
{
"epoch": 0.12454212454212454,
"grad_norm": 0.23039552569389343,
"learning_rate": 4.9987276453581165e-05,
"loss": 1.2587000131607056,
"step": 68
},
{
"epoch": 0.1282051282051282,
"grad_norm": 0.4443025290966034,
"learning_rate": 4.998410694673029e-05,
"loss": 1.1879582405090332,
"step": 70
},
{
"epoch": 0.13186813186813187,
"grad_norm": 0.22435733675956726,
"learning_rate": 4.998058544705005e-05,
"loss": 1.3695639371871948,
"step": 72
},
{
"epoch": 0.13553113553113552,
"grad_norm": 0.22466352581977844,
"learning_rate": 4.997671200967017e-05,
"loss": 1.5553536415100098,
"step": 74
},
{
"epoch": 0.1391941391941392,
"grad_norm": 0.05474912002682686,
"learning_rate": 4.997248669523002e-05,
"loss": 1.0360654592514038,
"step": 76
},
{
"epoch": 0.14285714285714285,
"grad_norm": 0.17411059141159058,
"learning_rate": 4.9967909569877686e-05,
"loss": 1.3734983205795288,
"step": 78
},
{
"epoch": 0.14652014652014653,
"grad_norm": 0.22640125453472137,
"learning_rate": 4.99629807052689e-05,
"loss": 1.5512079000473022,
"step": 80
},
{
"epoch": 0.15018315018315018,
"grad_norm": 0.16909095644950867,
"learning_rate": 4.995770017856595e-05,
"loss": 1.1615397930145264,
"step": 82
},
{
"epoch": 0.15384615384615385,
"grad_norm": 0.14065229892730713,
"learning_rate": 4.995206807243644e-05,
"loss": 1.3165048360824585,
"step": 84
},
{
"epoch": 0.1575091575091575,
"grad_norm": 0.2557665705680847,
"learning_rate": 4.994608447505203e-05,
"loss": 1.456904411315918,
"step": 86
},
{
"epoch": 0.16117216117216118,
"grad_norm": 0.18825113773345947,
"learning_rate": 4.993974948008705e-05,
"loss": 0.7387548685073853,
"step": 88
},
{
"epoch": 0.16483516483516483,
"grad_norm": 0.09088045358657837,
"learning_rate": 4.9933063186717006e-05,
"loss": 0.8501173257827759,
"step": 90
},
{
"epoch": 0.1684981684981685,
"grad_norm": 0.31493327021598816,
"learning_rate": 4.992602569961704e-05,
"loss": 1.2714766263961792,
"step": 92
},
{
"epoch": 0.17216117216117216,
"grad_norm": 0.3955460786819458,
"learning_rate": 4.991863712896033e-05,
"loss": 1.29978609085083,
"step": 94
},
{
"epoch": 0.17582417582417584,
"grad_norm": 0.22290943562984467,
"learning_rate": 4.991089759041628e-05,
"loss": 1.1851716041564941,
"step": 96
},
{
"epoch": 0.1794871794871795,
"grad_norm": 0.1698511391878128,
"learning_rate": 4.99028072051488e-05,
"loss": 1.2791502475738525,
"step": 98
},
{
"epoch": 0.18315018315018314,
"grad_norm": 0.2091524451971054,
"learning_rate": 4.989436609981437e-05,
"loss": 1.143870234489441,
"step": 100
},
{
"epoch": 0.18681318681318682,
"grad_norm": 0.39920395612716675,
"learning_rate": 4.988557440656004e-05,
"loss": 1.2132782936096191,
"step": 102
},
{
"epoch": 0.19047619047619047,
"grad_norm": 0.23149904608726501,
"learning_rate": 4.987643226302138e-05,
"loss": 0.8638072609901428,
"step": 104
},
{
"epoch": 0.19413919413919414,
"grad_norm": 0.8971022367477417,
"learning_rate": 4.9866939812320326e-05,
"loss": 1.0543807744979858,
"step": 106
},
{
"epoch": 0.1978021978021978,
"grad_norm": 0.14404769241809845,
"learning_rate": 4.9857097203062955e-05,
"loss": 1.27614426612854,
"step": 108
},
{
"epoch": 0.20146520146520147,
"grad_norm": 0.20676514506340027,
"learning_rate": 4.984690458933711e-05,
"loss": 1.1264268159866333,
"step": 110
},
{
"epoch": 0.20512820512820512,
"grad_norm": 0.2630727291107178,
"learning_rate": 4.983636213071004e-05,
"loss": 1.1916111707687378,
"step": 112
},
{
"epoch": 0.2087912087912088,
"grad_norm": 0.3776465356349945,
"learning_rate": 4.982546999222587e-05,
"loss": 0.6021360158920288,
"step": 114
},
{
"epoch": 0.21245421245421245,
"grad_norm": 0.3124425709247589,
"learning_rate": 4.981422834440303e-05,
"loss": 1.2633193731307983,
"step": 116
},
{
"epoch": 0.21611721611721613,
"grad_norm": 0.21436505019664764,
"learning_rate": 4.98026373632316e-05,
"loss": 1.2517153024673462,
"step": 118
},
{
"epoch": 0.21978021978021978,
"grad_norm": 0.165570467710495,
"learning_rate": 4.97906972301705e-05,
"loss": 1.329060673713684,
"step": 120
},
{
"epoch": 0.22344322344322345,
"grad_norm": 0.1781710535287857,
"learning_rate": 4.9778408132144715e-05,
"loss": 1.2554113864898682,
"step": 122
},
{
"epoch": 0.2271062271062271,
"grad_norm": 0.4390529692173004,
"learning_rate": 4.976577026154235e-05,
"loss": 0.9952642321586609,
"step": 124
},
{
"epoch": 0.23076923076923078,
"grad_norm": 1.2797257900238037,
"learning_rate": 4.9752783816211576e-05,
"loss": 0.872045636177063,
"step": 126
},
{
"epoch": 0.23443223443223443,
"grad_norm": 0.3392629325389862,
"learning_rate": 4.973944899945758e-05,
"loss": 1.1586322784423828,
"step": 128
},
{
"epoch": 0.23809523809523808,
"grad_norm": 0.216371089220047,
"learning_rate": 4.9725766020039395e-05,
"loss": 1.2613385915756226,
"step": 130
},
{
"epoch": 0.24175824175824176,
"grad_norm": 0.44755294919013977,
"learning_rate": 4.971173509216656e-05,
"loss": 1.3555878400802612,
"step": 132
},
{
"epoch": 0.2454212454212454,
"grad_norm": 0.20577383041381836,
"learning_rate": 4.969735643549583e-05,
"loss": 1.2522915601730347,
"step": 134
},
{
"epoch": 0.2490842490842491,
"grad_norm": 0.33351951837539673,
"learning_rate": 4.968263027512773e-05,
"loss": 1.2353262901306152,
"step": 136
},
{
"epoch": 0.25274725274725274,
"grad_norm": 0.12553884088993073,
"learning_rate": 4.966755684160301e-05,
"loss": 1.057889699935913,
"step": 138
},
{
"epoch": 0.2564102564102564,
"grad_norm": 0.17903219163417816,
"learning_rate": 4.9652136370899035e-05,
"loss": 1.23538076877594,
"step": 140
},
{
"epoch": 0.2600732600732601,
"grad_norm": 0.3907378315925598,
"learning_rate": 4.963636910442611e-05,
"loss": 1.2505638599395752,
"step": 142
},
{
"epoch": 0.26373626373626374,
"grad_norm": 0.1998424232006073,
"learning_rate": 4.96202552890237e-05,
"loss": 1.2176811695098877,
"step": 144
},
{
"epoch": 0.2673992673992674,
"grad_norm": 0.4392535388469696,
"learning_rate": 4.960379517695654e-05,
"loss": 1.3697282075881958,
"step": 146
},
{
"epoch": 0.27106227106227104,
"grad_norm": 0.3896879255771637,
"learning_rate": 4.958698902591072e-05,
"loss": 1.2809630632400513,
"step": 148
},
{
"epoch": 0.27472527472527475,
"grad_norm": 0.3242920935153961,
"learning_rate": 4.9569837098989626e-05,
"loss": 0.9012686014175415,
"step": 150
},
{
"epoch": 0.2783882783882784,
"grad_norm": 0.13880665600299835,
"learning_rate": 4.9552339664709807e-05,
"loss": 0.6081559658050537,
"step": 152
},
{
"epoch": 0.28205128205128205,
"grad_norm": 0.27750471234321594,
"learning_rate": 4.9534496996996845e-05,
"loss": 1.0718085765838623,
"step": 154
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.4269710183143616,
"learning_rate": 4.951630937518096e-05,
"loss": 1.3045586347579956,
"step": 156
},
{
"epoch": 0.2893772893772894,
"grad_norm": 0.36114686727523804,
"learning_rate": 4.949777708399273e-05,
"loss": 1.24015212059021,
"step": 158
},
{
"epoch": 0.29304029304029305,
"grad_norm": 0.18885226547718048,
"learning_rate": 4.947890041355858e-05,
"loss": 0.9190669655799866,
"step": 160
},
{
"epoch": 0.2967032967032967,
"grad_norm": 0.42237764596939087,
"learning_rate": 4.9459679659396257e-05,
"loss": 1.4927023649215698,
"step": 162
},
{
"epoch": 0.30036630036630035,
"grad_norm": 0.1873409003019333,
"learning_rate": 4.944011512241021e-05,
"loss": 1.0130228996276855,
"step": 164
},
{
"epoch": 0.304029304029304,
"grad_norm": 0.290294885635376,
"learning_rate": 4.942020710888684e-05,
"loss": 1.3621708154678345,
"step": 166
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.7741953134536743,
"learning_rate": 4.939995593048979e-05,
"loss": 1.1007283926010132,
"step": 168
},
{
"epoch": 0.31135531135531136,
"grad_norm": 0.4494468867778778,
"learning_rate": 4.937936190425495e-05,
"loss": 1.2320328950881958,
"step": 170
},
{
"epoch": 0.315018315018315,
"grad_norm": 1.186848759651184,
"learning_rate": 4.9358425352585616e-05,
"loss": 1.0239619016647339,
"step": 172
},
{
"epoch": 0.31868131868131866,
"grad_norm": 0.1502193659543991,
"learning_rate": 4.933714660324735e-05,
"loss": 0.816228449344635,
"step": 174
},
{
"epoch": 0.32234432234432236,
"grad_norm": 0.24374300241470337,
"learning_rate": 4.931552598936287e-05,
"loss": 1.370795726776123,
"step": 176
},
{
"epoch": 0.326007326007326,
"grad_norm": 0.16081377863883972,
"learning_rate": 4.929356384940688e-05,
"loss": 0.8959931135177612,
"step": 178
},
{
"epoch": 0.32967032967032966,
"grad_norm": 0.21084611117839813,
"learning_rate": 4.927126052720071e-05,
"loss": 1.447354793548584,
"step": 180
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.13184507191181183,
"learning_rate": 4.924861637190698e-05,
"loss": 0.954731285572052,
"step": 182
},
{
"epoch": 0.336996336996337,
"grad_norm": 0.2652509808540344,
"learning_rate": 4.922563173802409e-05,
"loss": 1.2110737562179565,
"step": 184
},
{
"epoch": 0.34065934065934067,
"grad_norm": 0.34187304973602295,
"learning_rate": 4.9202306985380734e-05,
"loss": 1.2186378240585327,
"step": 186
},
{
"epoch": 0.3443223443223443,
"grad_norm": 0.08439934998750687,
"learning_rate": 4.917864247913018e-05,
"loss": 1.155535101890564,
"step": 188
},
{
"epoch": 0.34798534798534797,
"grad_norm": 0.22483469545841217,
"learning_rate": 4.9154638589744646e-05,
"loss": 1.2381874322891235,
"step": 190
},
{
"epoch": 0.3516483516483517,
"grad_norm": 10.98539924621582,
"learning_rate": 4.913029569300942e-05,
"loss": 1.0877535343170166,
"step": 192
},
{
"epoch": 0.3553113553113553,
"grad_norm": 0.09030856937170029,
"learning_rate": 4.9105614170017034e-05,
"loss": 1.2255364656448364,
"step": 194
},
{
"epoch": 0.358974358974359,
"grad_norm": 0.22610144317150116,
"learning_rate": 4.908059440716127e-05,
"loss": 1.2344918251037598,
"step": 196
},
{
"epoch": 0.3626373626373626,
"grad_norm": 0.6382534503936768,
"learning_rate": 4.9055236796131115e-05,
"loss": 0.8511998653411865,
"step": 198
},
{
"epoch": 0.3663003663003663,
"grad_norm": 0.22709640860557556,
"learning_rate": 4.902954173390464e-05,
"loss": 0.9911755323410034,
"step": 200
},
{
"epoch": 0.36996336996337,
"grad_norm": 0.2304632067680359,
"learning_rate": 4.900350962274275e-05,
"loss": 1.4108072519302368,
"step": 202
},
{
"epoch": 0.37362637362637363,
"grad_norm": 0.2119007110595703,
"learning_rate": 4.897714087018296e-05,
"loss": 1.1905288696289062,
"step": 204
},
{
"epoch": 0.3772893772893773,
"grad_norm": 0.2922574579715729,
"learning_rate": 4.895043588903292e-05,
"loss": 0.7706769704818726,
"step": 206
},
{
"epoch": 0.38095238095238093,
"grad_norm": 0.6245097517967224,
"learning_rate": 4.892339509736404e-05,
"loss": 1.1153967380523682,
"step": 208
},
{
"epoch": 0.38461538461538464,
"grad_norm": 0.36032259464263916,
"learning_rate": 4.889601891850486e-05,
"loss": 1.3866379261016846,
"step": 210
},
{
"epoch": 0.3882783882783883,
"grad_norm": 0.14774373173713684,
"learning_rate": 4.886830778103452e-05,
"loss": 1.0544565916061401,
"step": 212
},
{
"epoch": 0.39194139194139194,
"grad_norm": 0.2895050346851349,
"learning_rate": 4.884026211877596e-05,
"loss": 1.1082898378372192,
"step": 214
},
{
"epoch": 0.3956043956043956,
"grad_norm": 0.2137245386838913,
"learning_rate": 4.881188237078919e-05,
"loss": 1.2029824256896973,
"step": 216
},
{
"epoch": 0.3992673992673993,
"grad_norm": 0.3577767014503479,
"learning_rate": 4.878316898136437e-05,
"loss": 1.2338331937789917,
"step": 218
},
{
"epoch": 0.40293040293040294,
"grad_norm": 0.09472601860761642,
"learning_rate": 4.875412240001491e-05,
"loss": 0.6112377047538757,
"step": 220
},
{
"epoch": 0.4065934065934066,
"grad_norm": 0.1504858434200287,
"learning_rate": 4.872474308147037e-05,
"loss": 1.3192267417907715,
"step": 222
},
{
"epoch": 0.41025641025641024,
"grad_norm": 0.3618045449256897,
"learning_rate": 4.869503148566939e-05,
"loss": 1.0624542236328125,
"step": 224
},
{
"epoch": 0.4139194139194139,
"grad_norm": 0.13315747678279877,
"learning_rate": 4.866498807775247e-05,
"loss": 1.2139613628387451,
"step": 226
},
{
"epoch": 0.4175824175824176,
"grad_norm": 0.2177940309047699,
"learning_rate": 4.8634613328054674e-05,
"loss": 1.2820316553115845,
"step": 228
},
{
"epoch": 0.42124542124542125,
"grad_norm": 0.3730919063091278,
"learning_rate": 4.8603907712098305e-05,
"loss": 1.2036633491516113,
"step": 230
},
{
"epoch": 0.4249084249084249,
"grad_norm": 0.5930754542350769,
"learning_rate": 4.8572871710585424e-05,
"loss": 0.9775714874267578,
"step": 232
},
{
"epoch": 0.42857142857142855,
"grad_norm": 5.749536991119385,
"learning_rate": 4.854150580939035e-05,
"loss": 1.4643810987472534,
"step": 234
},
{
"epoch": 0.43223443223443225,
"grad_norm": 0.18177424371242523,
"learning_rate": 4.850981049955203e-05,
"loss": 0.99868243932724,
"step": 236
},
{
"epoch": 0.4358974358974359,
"grad_norm": 0.5519245266914368,
"learning_rate": 4.847778627726636e-05,
"loss": 1.051274299621582,
"step": 238
},
{
"epoch": 0.43956043956043955,
"grad_norm": 0.24456854164600372,
"learning_rate": 4.844543364387844e-05,
"loss": 0.8957317471504211,
"step": 240
},
{
"epoch": 0.4432234432234432,
"grad_norm": 0.32938405871391296,
"learning_rate": 4.8412753105874703e-05,
"loss": 0.8530710339546204,
"step": 242
},
{
"epoch": 0.4468864468864469,
"grad_norm": 0.08594862371683121,
"learning_rate": 4.837974517487496e-05,
"loss": 0.560033917427063,
"step": 244
},
{
"epoch": 0.45054945054945056,
"grad_norm": 0.12811991572380066,
"learning_rate": 4.8346410367624465e-05,
"loss": 1.2348781824111938,
"step": 246
},
{
"epoch": 0.4542124542124542,
"grad_norm": 0.21825870871543884,
"learning_rate": 4.831274920598574e-05,
"loss": 0.8636214733123779,
"step": 248
},
{
"epoch": 0.45787545787545786,
"grad_norm": 0.6492200493812561,
"learning_rate": 4.8278762216930456e-05,
"loss": 1.246092677116394,
"step": 250
},
{
"epoch": 0.46153846153846156,
"grad_norm": 0.16339761018753052,
"learning_rate": 4.8244449932531195e-05,
"loss": 1.1555366516113281,
"step": 252
},
{
"epoch": 0.4652014652014652,
"grad_norm": 0.12176702171564102,
"learning_rate": 4.820981288995307e-05,
"loss": 0.9462042450904846,
"step": 254
},
{
"epoch": 0.46886446886446886,
"grad_norm": 0.10259034484624863,
"learning_rate": 4.8174851631445354e-05,
"loss": 1.2327078580856323,
"step": 256
},
{
"epoch": 0.4725274725274725,
"grad_norm": 0.2983081638813019,
"learning_rate": 4.8139566704332984e-05,
"loss": 1.2545617818832397,
"step": 258
},
{
"epoch": 0.47619047619047616,
"grad_norm": 0.18848823010921478,
"learning_rate": 4.810395866100797e-05,
"loss": 0.7314871549606323,
"step": 260
},
{
"epoch": 0.47985347985347987,
"grad_norm": 1.1380740404129028,
"learning_rate": 4.8068028058920795e-05,
"loss": 1.1386513710021973,
"step": 262
},
{
"epoch": 0.4835164835164835,
"grad_norm": 0.19013704359531403,
"learning_rate": 4.803177546057163e-05,
"loss": 1.207440972328186,
"step": 264
},
{
"epoch": 0.48717948717948717,
"grad_norm": 0.49117130041122437,
"learning_rate": 4.799520143350158e-05,
"loss": 1.478100299835205,
"step": 266
},
{
"epoch": 0.4908424908424908,
"grad_norm": 0.32450738549232483,
"learning_rate": 4.795830655028376e-05,
"loss": 0.7695617079734802,
"step": 268
},
{
"epoch": 0.4945054945054945,
"grad_norm": 0.14151829481124878,
"learning_rate": 4.792109138851435e-05,
"loss": 1.180545449256897,
"step": 270
},
{
"epoch": 0.4981684981684982,
"grad_norm": 0.21694627404212952,
"learning_rate": 4.7883556530803554e-05,
"loss": 0.8736183643341064,
"step": 272
},
{
"epoch": 0.5018315018315018,
"grad_norm": 0.9315363764762878,
"learning_rate": 4.7845702564766475e-05,
"loss": 1.2287445068359375,
"step": 274
},
{
"epoch": 0.5054945054945055,
"grad_norm": 0.12385514378547668,
"learning_rate": 4.7807530083013906e-05,
"loss": 0.814042329788208,
"step": 276
},
{
"epoch": 0.5091575091575091,
"grad_norm": 0.10513313859701157,
"learning_rate": 4.776903968314308e-05,
"loss": 0.8786470890045166,
"step": 278
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.213555246591568,
"learning_rate": 4.7730231967728275e-05,
"loss": 1.2300586700439453,
"step": 280
},
{
"epoch": 0.5164835164835165,
"grad_norm": 0.20062805712223053,
"learning_rate": 4.769110754431142e-05,
"loss": 1.2230390310287476,
"step": 282
},
{
"epoch": 0.5201465201465202,
"grad_norm": 0.21544235944747925,
"learning_rate": 4.765166702539256e-05,
"loss": 1.2219314575195312,
"step": 284
},
{
"epoch": 0.5238095238095238,
"grad_norm": 0.22437822818756104,
"learning_rate": 4.761191102842027e-05,
"loss": 0.9741434454917908,
"step": 286
},
{
"epoch": 0.5274725274725275,
"grad_norm": 0.09989945590496063,
"learning_rate": 4.757184017578198e-05,
"loss": 1.2340394258499146,
"step": 288
},
{
"epoch": 0.5311355311355311,
"grad_norm": 0.14188872277736664,
"learning_rate": 4.7531455094794284e-05,
"loss": 1.197536587715149,
"step": 290
},
{
"epoch": 0.5347985347985348,
"grad_norm": 0.1335064321756363,
"learning_rate": 4.7490756417693036e-05,
"loss": 0.7367426753044128,
"step": 292
},
{
"epoch": 0.5384615384615384,
"grad_norm": 0.02857016585767269,
"learning_rate": 4.7449744781623526e-05,
"loss": 0.9376294016838074,
"step": 294
},
{
"epoch": 0.5421245421245421,
"grad_norm": 0.11979032307863235,
"learning_rate": 4.740842082863043e-05,
"loss": 1.0236124992370605,
"step": 296
},
{
"epoch": 0.5457875457875457,
"grad_norm": 0.19949960708618164,
"learning_rate": 4.736678520564786e-05,
"loss": 1.290779709815979,
"step": 298
},
{
"epoch": 0.5494505494505495,
"grad_norm": 0.12269338220357895,
"learning_rate": 4.732483856448913e-05,
"loss": 1.1912894248962402,
"step": 300
},
{
"epoch": 0.5531135531135531,
"grad_norm": 0.15550191700458527,
"learning_rate": 4.7282581561836644e-05,
"loss": 1.1734073162078857,
"step": 302
},
{
"epoch": 0.5567765567765568,
"grad_norm": 0.19052956998348236,
"learning_rate": 4.724001485923153e-05,
"loss": 0.9569897055625916,
"step": 304
},
{
"epoch": 0.5604395604395604,
"grad_norm": 0.3564753830432892,
"learning_rate": 4.7197139123063366e-05,
"loss": 0.9688905477523804,
"step": 306
},
{
"epoch": 0.5641025641025641,
"grad_norm": 0.25113749504089355,
"learning_rate": 4.715395502455967e-05,
"loss": 1.3545844554901123,
"step": 308
},
{
"epoch": 0.5677655677655677,
"grad_norm": 0.19413875043392181,
"learning_rate": 4.711046323977545e-05,
"loss": 0.9748039245605469,
"step": 310
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.20445436239242554,
"learning_rate": 4.70666644495826e-05,
"loss": 1.2018651962280273,
"step": 312
},
{
"epoch": 0.575091575091575,
"grad_norm": 0.1748535931110382,
"learning_rate": 4.702255933965924e-05,
"loss": 1.1204524040222168,
"step": 314
},
{
"epoch": 0.5787545787545788,
"grad_norm": 0.13978832960128784,
"learning_rate": 4.697814860047895e-05,
"loss": 1.273799180984497,
"step": 316
},
{
"epoch": 0.5824175824175825,
"grad_norm": 0.1604635864496231,
"learning_rate": 4.6933432927300054e-05,
"loss": 1.1062840223312378,
"step": 318
},
{
"epoch": 0.5860805860805861,
"grad_norm": 0.1707131415605545,
"learning_rate": 4.6888413020154626e-05,
"loss": 1.3164299726486206,
"step": 320
},
{
"epoch": 0.5897435897435898,
"grad_norm": 0.13679248094558716,
"learning_rate": 4.6843089583837586e-05,
"loss": 1.5054590702056885,
"step": 322
},
{
"epoch": 0.5934065934065934,
"grad_norm": 0.19371454417705536,
"learning_rate": 4.6797463327895676e-05,
"loss": 1.2403850555419922,
"step": 324
},
{
"epoch": 0.5970695970695971,
"grad_norm": 0.09500681608915329,
"learning_rate": 4.6751534966616314e-05,
"loss": 1.3421348333358765,
"step": 326
},
{
"epoch": 0.6007326007326007,
"grad_norm": 0.1618986278772354,
"learning_rate": 4.670530521901645e-05,
"loss": 1.2023552656173706,
"step": 328
},
{
"epoch": 0.6043956043956044,
"grad_norm": 0.13862641155719757,
"learning_rate": 4.6658774808831284e-05,
"loss": 1.1014868021011353,
"step": 330
},
{
"epoch": 0.608058608058608,
"grad_norm": 0.2911272644996643,
"learning_rate": 4.6611944464502935e-05,
"loss": 1.1684032678604126,
"step": 332
},
{
"epoch": 0.6117216117216118,
"grad_norm": 0.24178026616573334,
"learning_rate": 4.6564814919169075e-05,
"loss": 1.2577779293060303,
"step": 334
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.5293629169464111,
"learning_rate": 4.651738691065139e-05,
"loss": 0.8592604994773865,
"step": 336
},
{
"epoch": 0.6190476190476191,
"grad_norm": 0.09567166119813919,
"learning_rate": 4.646966118144407e-05,
"loss": 1.2142037153244019,
"step": 338
},
{
"epoch": 0.6227106227106227,
"grad_norm": 0.13777339458465576,
"learning_rate": 4.642163847870221e-05,
"loss": 1.207306981086731,
"step": 340
},
{
"epoch": 0.6263736263736264,
"grad_norm": 0.22208669781684875,
"learning_rate": 4.637331955423002e-05,
"loss": 0.5593523979187012,
"step": 342
},
{
"epoch": 0.63003663003663,
"grad_norm": 0.15060071647167206,
"learning_rate": 4.6324705164469174e-05,
"loss": 1.4146814346313477,
"step": 344
},
{
"epoch": 0.6336996336996337,
"grad_norm": 0.2521788775920868,
"learning_rate": 4.6275796070486874e-05,
"loss": 0.6819853782653809,
"step": 346
},
{
"epoch": 0.6373626373626373,
"grad_norm": 0.1835089921951294,
"learning_rate": 4.622659303796397e-05,
"loss": 1.135895013809204,
"step": 348
},
{
"epoch": 0.6410256410256411,
"grad_norm": 0.30718883872032166,
"learning_rate": 4.6177096837183016e-05,
"loss": 0.8732522130012512,
"step": 350
},
{
"epoch": 0.6446886446886447,
"grad_norm": 0.19664013385772705,
"learning_rate": 4.612730824301611e-05,
"loss": 0.9108962416648865,
"step": 352
},
{
"epoch": 0.6483516483516484,
"grad_norm": 0.24300748109817505,
"learning_rate": 4.6077228034912865e-05,
"loss": 0.944155216217041,
"step": 354
},
{
"epoch": 0.652014652014652,
"grad_norm": 0.25140050053596497,
"learning_rate": 4.602685699688814e-05,
"loss": 1.1503783464431763,
"step": 356
},
{
"epoch": 0.6556776556776557,
"grad_norm": 0.18550491333007812,
"learning_rate": 4.5976195917509804e-05,
"loss": 1.1416871547698975,
"step": 358
},
{
"epoch": 0.6593406593406593,
"grad_norm": 0.19337521493434906,
"learning_rate": 4.592524558988638e-05,
"loss": 0.6880902647972107,
"step": 360
},
{
"epoch": 0.663003663003663,
"grad_norm": 0.27510422468185425,
"learning_rate": 4.58740068116546e-05,
"loss": 0.9372468590736389,
"step": 362
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.14954572916030884,
"learning_rate": 4.582248038496698e-05,
"loss": 0.9180594682693481,
"step": 364
},
{
"epoch": 0.6703296703296703,
"grad_norm": 0.23267677426338196,
"learning_rate": 4.577066711647918e-05,
"loss": 1.1724467277526855,
"step": 366
},
{
"epoch": 0.673992673992674,
"grad_norm": 0.1276102066040039,
"learning_rate": 4.571856781733748e-05,
"loss": 1.0390164852142334,
"step": 368
},
{
"epoch": 0.6776556776556777,
"grad_norm": 0.1551157683134079,
"learning_rate": 4.566618330316596e-05,
"loss": 1.081437587738037,
"step": 370
},
{
"epoch": 0.6813186813186813,
"grad_norm": 0.3087083399295807,
"learning_rate": 4.561351439405384e-05,
"loss": 1.1742217540740967,
"step": 372
},
{
"epoch": 0.684981684981685,
"grad_norm": 0.2865961790084839,
"learning_rate": 4.5560561914542576e-05,
"loss": 1.1755157709121704,
"step": 374
},
{
"epoch": 0.6886446886446886,
"grad_norm": 0.20178958773612976,
"learning_rate": 4.550732669361298e-05,
"loss": 0.8584067225456238,
"step": 376
},
{
"epoch": 0.6923076923076923,
"grad_norm": 0.14505843818187714,
"learning_rate": 4.54538095646722e-05,
"loss": 0.8162437081336975,
"step": 378
},
{
"epoch": 0.6959706959706959,
"grad_norm": 0.35326001048088074,
"learning_rate": 4.540001136554077e-05,
"loss": 1.0263890027999878,
"step": 380
},
{
"epoch": 0.6996336996336996,
"grad_norm": 0.2113528698682785,
"learning_rate": 4.534593293843936e-05,
"loss": 0.9698024392127991,
"step": 382
},
{
"epoch": 0.7032967032967034,
"grad_norm": 0.13257572054862976,
"learning_rate": 4.529157512997571e-05,
"loss": 1.1605135202407837,
"step": 384
},
{
"epoch": 0.706959706959707,
"grad_norm": 0.17257475852966309,
"learning_rate": 4.5236938791131305e-05,
"loss": 1.0823811292648315,
"step": 386
},
{
"epoch": 0.7106227106227107,
"grad_norm": 0.2746966779232025,
"learning_rate": 4.518202477724808e-05,
"loss": 0.8808259963989258,
"step": 388
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.11813390254974365,
"learning_rate": 4.5126833948015016e-05,
"loss": 1.0819435119628906,
"step": 390
},
{
"epoch": 0.717948717948718,
"grad_norm": 0.2048182636499405,
"learning_rate": 4.5071367167454687e-05,
"loss": 1.1645246744155884,
"step": 392
},
{
"epoch": 0.7216117216117216,
"grad_norm": 2.1287009716033936,
"learning_rate": 4.5015625303909755e-05,
"loss": 1.1096913814544678,
"step": 394
},
{
"epoch": 0.7252747252747253,
"grad_norm": 0.11994423717260361,
"learning_rate": 4.495960923002935e-05,
"loss": 1.223901391029358,
"step": 396
},
{
"epoch": 0.7289377289377289,
"grad_norm": 0.15119600296020508,
"learning_rate": 4.49033198227554e-05,
"loss": 0.9063436388969421,
"step": 398
},
{
"epoch": 0.7326007326007326,
"grad_norm": 0.11098281294107437,
"learning_rate": 4.4846757963308936e-05,
"loss": 0.8366504907608032,
"step": 400
},
{
"epoch": 0.7362637362637363,
"grad_norm": 0.14989100396633148,
"learning_rate": 4.478992453717626e-05,
"loss": 1.228022813796997,
"step": 402
},
{
"epoch": 0.73992673992674,
"grad_norm": 0.2334737479686737,
"learning_rate": 4.4732820434095123e-05,
"loss": 0.8357920050621033,
"step": 404
},
{
"epoch": 0.7435897435897436,
"grad_norm": 0.3819234073162079,
"learning_rate": 4.4675446548040754e-05,
"loss": 1.0126510858535767,
"step": 406
},
{
"epoch": 0.7472527472527473,
"grad_norm": 0.19562920928001404,
"learning_rate": 4.46178037772119e-05,
"loss": 1.012734055519104,
"step": 408
},
{
"epoch": 0.7509157509157509,
"grad_norm": 0.304485559463501,
"learning_rate": 4.4559893024016726e-05,
"loss": 0.8644341230392456,
"step": 410
},
{
"epoch": 0.7545787545787546,
"grad_norm": 0.24467211961746216,
"learning_rate": 4.450171519505873e-05,
"loss": 1.152502179145813,
"step": 412
},
{
"epoch": 0.7582417582417582,
"grad_norm": 0.8111533522605896,
"learning_rate": 4.4443271201122514e-05,
"loss": 0.9966916441917419,
"step": 414
},
{
"epoch": 0.7619047619047619,
"grad_norm": 0.13030032813549042,
"learning_rate": 4.4384561957159565e-05,
"loss": 1.2616826295852661,
"step": 416
},
{
"epoch": 0.7655677655677655,
"grad_norm": 0.09772861003875732,
"learning_rate": 4.43255883822739e-05,
"loss": 0.6672307252883911,
"step": 418
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.16410967707633972,
"learning_rate": 4.4266351399707664e-05,
"loss": 1.214950680732727,
"step": 420
},
{
"epoch": 0.7728937728937729,
"grad_norm": 0.12338349223136902,
"learning_rate": 4.420685193682672e-05,
"loss": 0.9765850305557251,
"step": 422
},
{
"epoch": 0.7765567765567766,
"grad_norm": 0.5074856281280518,
"learning_rate": 4.4147090925106104e-05,
"loss": 0.4896080195903778,
"step": 424
},
{
"epoch": 0.7802197802197802,
"grad_norm": 0.2849983870983124,
"learning_rate": 4.4087069300115444e-05,
"loss": 0.7668413519859314,
"step": 426
},
{
"epoch": 0.7838827838827839,
"grad_norm": 0.36542126536369324,
"learning_rate": 4.4026788001504314e-05,
"loss": 1.045650839805603,
"step": 428
},
{
"epoch": 0.7875457875457875,
"grad_norm": 0.1004275232553482,
"learning_rate": 4.396624797298754e-05,
"loss": 1.1941821575164795,
"step": 430
},
{
"epoch": 0.7912087912087912,
"grad_norm": 0.1513642817735672,
"learning_rate": 4.390545016233039e-05,
"loss": 1.2807530164718628,
"step": 432
},
{
"epoch": 0.7948717948717948,
"grad_norm": 0.3031829595565796,
"learning_rate": 4.3844395521333786e-05,
"loss": 0.8745837807655334,
"step": 434
},
{
"epoch": 0.7985347985347986,
"grad_norm": 0.1763853281736374,
"learning_rate": 4.378308500581934e-05,
"loss": 0.9577867984771729,
"step": 436
},
{
"epoch": 0.8021978021978022,
"grad_norm": 0.21650274097919464,
"learning_rate": 4.372151957561447e-05,
"loss": 0.8710334300994873,
"step": 438
},
{
"epoch": 0.8058608058608059,
"grad_norm": 0.3157196640968323,
"learning_rate": 4.36597001945373e-05,
"loss": 1.2961158752441406,
"step": 440
},
{
"epoch": 0.8095238095238095,
"grad_norm": 0.11482734233140945,
"learning_rate": 4.3597627830381606e-05,
"loss": 1.1874325275421143,
"step": 442
},
{
"epoch": 0.8131868131868132,
"grad_norm": 0.15268991887569427,
"learning_rate": 4.353530345490167e-05,
"loss": 1.1880759000778198,
"step": 444
},
{
"epoch": 0.8168498168498168,
"grad_norm": 0.21278268098831177,
"learning_rate": 4.347272804379705e-05,
"loss": 1.206059455871582,
"step": 446
},
{
"epoch": 0.8205128205128205,
"grad_norm": 0.179881751537323,
"learning_rate": 4.340990257669732e-05,
"loss": 1.2088541984558105,
"step": 448
},
{
"epoch": 0.8241758241758241,
"grad_norm": 0.14933490753173828,
"learning_rate": 4.334682803714672e-05,
"loss": 1.2412981986999512,
"step": 450
},
{
"epoch": 0.8278388278388278,
"grad_norm": 0.1529897153377533,
"learning_rate": 4.328350541258876e-05,
"loss": 0.9919160008430481,
"step": 452
},
{
"epoch": 0.8315018315018315,
"grad_norm": 0.10920170694589615,
"learning_rate": 4.321993569435078e-05,
"loss": 0.49135756492614746,
"step": 454
},
{
"epoch": 0.8351648351648352,
"grad_norm": 0.4436364471912384,
"learning_rate": 4.315611987762841e-05,
"loss": 0.8858435750007629,
"step": 456
},
{
"epoch": 0.8388278388278388,
"grad_norm": 0.22913309931755066,
"learning_rate": 4.309205896146999e-05,
"loss": 0.8232947587966919,
"step": 458
},
{
"epoch": 0.8424908424908425,
"grad_norm": 0.2811645567417145,
"learning_rate": 4.302775394876096e-05,
"loss": 1.0056540966033936,
"step": 460
},
{
"epoch": 0.8461538461538461,
"grad_norm": 0.22459489107131958,
"learning_rate": 4.29632058462081e-05,
"loss": 1.2183864116668701,
"step": 462
},
{
"epoch": 0.8498168498168498,
"grad_norm": 0.12809278070926666,
"learning_rate": 4.2898415664323844e-05,
"loss": 1.1671696901321411,
"step": 464
},
{
"epoch": 0.8534798534798534,
"grad_norm": 0.03261662647128105,
"learning_rate": 4.2833384417410395e-05,
"loss": 1.187354564666748,
"step": 466
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.1845274269580841,
"learning_rate": 4.276811312354389e-05,
"loss": 0.8790689706802368,
"step": 468
},
{
"epoch": 0.8608058608058609,
"grad_norm": 0.6504915952682495,
"learning_rate": 4.270260280455843e-05,
"loss": 0.8886659145355225,
"step": 470
},
{
"epoch": 0.8644688644688645,
"grad_norm": 0.19828136265277863,
"learning_rate": 4.263685448603012e-05,
"loss": 1.1550533771514893,
"step": 472
},
{
"epoch": 0.8681318681318682,
"grad_norm": 0.4623855948448181,
"learning_rate": 4.257086919726097e-05,
"loss": 0.7794157862663269,
"step": 474
},
{
"epoch": 0.8717948717948718,
"grad_norm": 0.2206961214542389,
"learning_rate": 4.25046479712628e-05,
"loss": 1.087639331817627,
"step": 476
},
{
"epoch": 0.8754578754578755,
"grad_norm": 0.23512773215770721,
"learning_rate": 4.2438191844741105e-05,
"loss": 1.0371439456939697,
"step": 478
},
{
"epoch": 0.8791208791208791,
"grad_norm": 0.7811533808708191,
"learning_rate": 4.2371501858078753e-05,
"loss": 1.055543065071106,
"step": 480
},
{
"epoch": 0.8827838827838828,
"grad_norm": 0.22808873653411865,
"learning_rate": 4.230457905531976e-05,
"loss": 1.1782468557357788,
"step": 482
},
{
"epoch": 0.8864468864468864,
"grad_norm": 0.1753520965576172,
"learning_rate": 4.22374244841529e-05,
"loss": 1.244563341140747,
"step": 484
},
{
"epoch": 0.8901098901098901,
"grad_norm": 1.1950969696044922,
"learning_rate": 4.217003919589535e-05,
"loss": 0.8924474120140076,
"step": 486
},
{
"epoch": 0.8937728937728938,
"grad_norm": 0.42523816227912903,
"learning_rate": 4.210242424547617e-05,
"loss": 1.136575698852539,
"step": 488
},
{
"epoch": 0.8974358974358975,
"grad_norm": 0.2354390025138855,
"learning_rate": 4.203458069141985e-05,
"loss": 1.0925524234771729,
"step": 490
},
{
"epoch": 0.9010989010989011,
"grad_norm": 0.3065359592437744,
"learning_rate": 4.196650959582973e-05,
"loss": 1.031598687171936,
"step": 492
},
{
"epoch": 0.9047619047619048,
"grad_norm": 0.40230098366737366,
"learning_rate": 4.1898212024371304e-05,
"loss": 0.5824300646781921,
"step": 494
},
{
"epoch": 0.9084249084249084,
"grad_norm": 0.1176103800535202,
"learning_rate": 4.1829689046255616e-05,
"loss": 1.2608321905136108,
"step": 496
},
{
"epoch": 0.9120879120879121,
"grad_norm": 0.1900339424610138,
"learning_rate": 4.1760941734222505e-05,
"loss": 1.117556095123291,
"step": 498
},
{
"epoch": 0.9157509157509157,
"grad_norm": 0.13478179275989532,
"learning_rate": 4.1691971164523764e-05,
"loss": 0.7983730435371399,
"step": 500
},
{
"epoch": 0.9194139194139194,
"grad_norm": 0.11394549906253815,
"learning_rate": 4.1622778416906375e-05,
"loss": 0.8523120284080505,
"step": 502
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.15726390480995178,
"learning_rate": 4.15533645745955e-05,
"loss": 0.9240443110466003,
"step": 504
},
{
"epoch": 0.9267399267399268,
"grad_norm": 0.07845434546470642,
"learning_rate": 4.148373072427762e-05,
"loss": 0.8336247205734253,
"step": 506
},
{
"epoch": 0.9304029304029304,
"grad_norm": 0.1776849627494812,
"learning_rate": 4.1413877956083456e-05,
"loss": 1.1461174488067627,
"step": 508
},
{
"epoch": 0.9340659340659341,
"grad_norm": 0.21315138041973114,
"learning_rate": 4.1343807363570964e-05,
"loss": 1.2062344551086426,
"step": 510
},
{
"epoch": 0.9377289377289377,
"grad_norm": 0.1842324286699295,
"learning_rate": 4.127352004370814e-05,
"loss": 1.1556131839752197,
"step": 512
},
{
"epoch": 0.9413919413919414,
"grad_norm": 0.17686887085437775,
"learning_rate": 4.12030170968559e-05,
"loss": 0.9388005137443542,
"step": 514
},
{
"epoch": 0.945054945054945,
"grad_norm": 0.20578397810459137,
"learning_rate": 4.113229962675085e-05,
"loss": 1.1634795665740967,
"step": 516
},
{
"epoch": 0.9487179487179487,
"grad_norm": 0.19318969547748566,
"learning_rate": 4.1061368740488e-05,
"loss": 1.1986818313598633,
"step": 518
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.25902265310287476,
"learning_rate": 4.09902255485034e-05,
"loss": 1.1979715824127197,
"step": 520
},
{
"epoch": 0.9560439560439561,
"grad_norm": 0.3866836726665497,
"learning_rate": 4.091887116455681e-05,
"loss": 0.8659937381744385,
"step": 522
},
{
"epoch": 0.9597069597069597,
"grad_norm": 0.4504384994506836,
"learning_rate": 4.084730670571424e-05,
"loss": 1.0120433568954468,
"step": 524
},
{
"epoch": 0.9633699633699634,
"grad_norm": 0.2555679380893707,
"learning_rate": 4.0775533292330464e-05,
"loss": 0.9460458755493164,
"step": 526
},
{
"epoch": 0.967032967032967,
"grad_norm": 0.3483099639415741,
"learning_rate": 4.070355204803145e-05,
"loss": 0.6675710082054138,
"step": 528
},
{
"epoch": 0.9706959706959707,
"grad_norm": 0.09682053327560425,
"learning_rate": 4.0631364099696815e-05,
"loss": 0.90069580078125,
"step": 530
},
{
"epoch": 0.9743589743589743,
"grad_norm": 0.7320610880851746,
"learning_rate": 4.055897057744219e-05,
"loss": 1.3395118713378906,
"step": 532
},
{
"epoch": 0.978021978021978,
"grad_norm": 0.24255181849002838,
"learning_rate": 4.048637261460145e-05,
"loss": 0.6177163124084473,
"step": 534
},
{
"epoch": 0.9816849816849816,
"grad_norm": 0.42302218079566956,
"learning_rate": 4.0413571347709074e-05,
"loss": 0.8449323177337646,
"step": 536
},
{
"epoch": 0.9853479853479854,
"grad_norm": 0.1898687183856964,
"learning_rate": 4.034056791648228e-05,
"loss": 0.7976465225219727,
"step": 538
},
{
"epoch": 0.989010989010989,
"grad_norm": 1.4811292886734009,
"learning_rate": 4.0267363463803216e-05,
"loss": 1.1151212453842163,
"step": 540
},
{
"epoch": 0.9926739926739927,
"grad_norm": 0.20806598663330078,
"learning_rate": 4.019395913570104e-05,
"loss": 1.1612093448638916,
"step": 542
},
{
"epoch": 0.9963369963369964,
"grad_norm": 0.4448166787624359,
"learning_rate": 4.0120356081334004e-05,
"loss": 1.1680574417114258,
"step": 544
},
{
"epoch": 1.0,
"grad_norm": 0.2003720998764038,
"learning_rate": 4.004655545297148e-05,
"loss": 1.347452163696289,
"step": 546
},
{
"epoch": 1.0036630036630036,
"grad_norm": 0.5149394869804382,
"learning_rate": 3.997255840597587e-05,
"loss": 0.8998035788536072,
"step": 548
},
{
"epoch": 1.0073260073260073,
"grad_norm": 0.18844829499721527,
"learning_rate": 3.9898366098784544e-05,
"loss": 1.149839162826538,
"step": 550
},
{
"epoch": 1.010989010989011,
"grad_norm": 0.12711450457572937,
"learning_rate": 3.9823979692891734e-05,
"loss": 0.6266541481018066,
"step": 552
},
{
"epoch": 1.0146520146520146,
"grad_norm": 0.43699517846107483,
"learning_rate": 3.974940035283029e-05,
"loss": 1.0944384336471558,
"step": 554
},
{
"epoch": 1.0183150183150182,
"grad_norm": 0.4765096604824066,
"learning_rate": 3.967462924615351e-05,
"loss": 0.9994142055511475,
"step": 556
},
{
"epoch": 1.021978021978022,
"grad_norm": 0.1308145374059677,
"learning_rate": 3.95996675434168e-05,
"loss": 1.181485652923584,
"step": 558
},
{
"epoch": 1.0256410256410255,
"grad_norm": 0.17354629933834076,
"learning_rate": 3.952451641815942e-05,
"loss": 0.8960216641426086,
"step": 560
},
{
"epoch": 1.0293040293040292,
"grad_norm": 0.20918694138526917,
"learning_rate": 3.944917704688605e-05,
"loss": 1.130763053894043,
"step": 562
},
{
"epoch": 1.032967032967033,
"grad_norm": 1.6546434164047241,
"learning_rate": 3.9373650609048404e-05,
"loss": 1.109397530555725,
"step": 564
},
{
"epoch": 1.0366300366300367,
"grad_norm": 0.20021019876003265,
"learning_rate": 3.929793828702676e-05,
"loss": 0.9343792796134949,
"step": 566
},
{
"epoch": 1.0402930402930404,
"grad_norm": 0.8677191734313965,
"learning_rate": 3.9222041266111444e-05,
"loss": 1.0045840740203857,
"step": 568
},
{
"epoch": 1.043956043956044,
"grad_norm": 0.3973585367202759,
"learning_rate": 3.914596073448427e-05,
"loss": 1.0684887170791626,
"step": 570
},
{
"epoch": 1.0476190476190477,
"grad_norm": 0.5770029425621033,
"learning_rate": 3.906969788319996e-05,
"loss": 1.2040116786956787,
"step": 572
},
{
"epoch": 1.0512820512820513,
"grad_norm": 0.4743853509426117,
"learning_rate": 3.899325390616748e-05,
"loss": 1.0301820039749146,
"step": 574
},
{
"epoch": 1.054945054945055,
"grad_norm": 0.5908330678939819,
"learning_rate": 3.891663000013133e-05,
"loss": 1.180071473121643,
"step": 576
},
{
"epoch": 1.0586080586080586,
"grad_norm": 0.19290882349014282,
"learning_rate": 3.8839827364652875e-05,
"loss": 1.1906160116195679,
"step": 578
},
{
"epoch": 1.0622710622710623,
"grad_norm": 0.9853556752204895,
"learning_rate": 3.8762847202091486e-05,
"loss": 1.186142086982727,
"step": 580
},
{
"epoch": 1.065934065934066,
"grad_norm": 0.2555069625377655,
"learning_rate": 3.868569071758577e-05,
"loss": 0.9499126076698303,
"step": 582
},
{
"epoch": 1.0695970695970696,
"grad_norm": 0.37080565094947815,
"learning_rate": 3.860835911903467e-05,
"loss": 1.1149709224700928,
"step": 584
},
{
"epoch": 1.0732600732600732,
"grad_norm": 0.15910615026950836,
"learning_rate": 3.853085361707859e-05,
"loss": 1.0009230375289917,
"step": 586
},
{
"epoch": 1.0769230769230769,
"grad_norm": 0.24355213344097137,
"learning_rate": 3.8453175425080426e-05,
"loss": 0.7909836769104004,
"step": 588
},
{
"epoch": 1.0805860805860805,
"grad_norm": 0.2337721288204193,
"learning_rate": 3.8375325759106563e-05,
"loss": 1.1665717363357544,
"step": 590
},
{
"epoch": 1.0842490842490842,
"grad_norm": 0.16137680411338806,
"learning_rate": 3.829730583790782e-05,
"loss": 1.009416103363037,
"step": 592
},
{
"epoch": 1.0879120879120878,
"grad_norm": 0.19279745221138,
"learning_rate": 3.821911688290043e-05,
"loss": 0.9450397491455078,
"step": 594
},
{
"epoch": 1.0915750915750915,
"grad_norm": 0.17870314419269562,
"learning_rate": 3.814076011814685e-05,
"loss": 0.991208553314209,
"step": 596
},
{
"epoch": 1.0952380952380953,
"grad_norm": 0.3691723346710205,
"learning_rate": 3.806223677033664e-05,
"loss": 1.0436757802963257,
"step": 598
},
{
"epoch": 1.098901098901099,
"grad_norm": 0.2122497707605362,
"learning_rate": 3.798354806876728e-05,
"loss": 1.10894775390625,
"step": 600
},
{
"epoch": 1.1025641025641026,
"grad_norm": 0.22092920541763306,
"learning_rate": 3.790469524532484e-05,
"loss": 0.7955626845359802,
"step": 602
},
{
"epoch": 1.1062271062271063,
"grad_norm": 0.6790117621421814,
"learning_rate": 3.782567953446477e-05,
"loss": 0.9074943661689758,
"step": 604
},
{
"epoch": 1.10989010989011,
"grad_norm": 0.2356610894203186,
"learning_rate": 3.774650217319257e-05,
"loss": 0.8648009896278381,
"step": 606
},
{
"epoch": 1.1135531135531136,
"grad_norm": 0.5769211649894714,
"learning_rate": 3.766716440104439e-05,
"loss": 1.1958070993423462,
"step": 608
},
{
"epoch": 1.1172161172161172,
"grad_norm": 0.2272127866744995,
"learning_rate": 3.7587667460067635e-05,
"loss": 0.7023400664329529,
"step": 610
},
{
"epoch": 1.120879120879121,
"grad_norm": 0.6303228735923767,
"learning_rate": 3.750801259480154e-05,
"loss": 1.1139551401138306,
"step": 612
},
{
"epoch": 1.1245421245421245,
"grad_norm": 0.22971026599407196,
"learning_rate": 3.7428201052257675e-05,
"loss": 0.9638775587081909,
"step": 614
},
{
"epoch": 1.1282051282051282,
"grad_norm": 0.1906343698501587,
"learning_rate": 3.7348234081900424e-05,
"loss": 1.127274513244629,
"step": 616
},
{
"epoch": 1.1318681318681318,
"grad_norm": 0.29741182923316956,
"learning_rate": 3.726811293562739e-05,
"loss": 0.36746326088905334,
"step": 618
},
{
"epoch": 1.1355311355311355,
"grad_norm": 1.2210016250610352,
"learning_rate": 3.718783886774988e-05,
"loss": 1.0633288621902466,
"step": 620
},
{
"epoch": 1.1391941391941391,
"grad_norm": 1.7148534059524536,
"learning_rate": 3.7107413134973174e-05,
"loss": 0.7120411992073059,
"step": 622
},
{
"epoch": 1.1428571428571428,
"grad_norm": 0.6514428853988647,
"learning_rate": 3.702683699637692e-05,
"loss": 1.0393530130386353,
"step": 624
},
{
"epoch": 1.1465201465201464,
"grad_norm": 0.6007410287857056,
"learning_rate": 3.6946111713395365e-05,
"loss": 1.3253600597381592,
"step": 626
},
{
"epoch": 1.15018315018315,
"grad_norm": 0.6678552031517029,
"learning_rate": 3.6865238549797686e-05,
"loss": 0.4324287176132202,
"step": 628
},
{
"epoch": 1.1538461538461537,
"grad_norm": 0.056091565638780594,
"learning_rate": 3.6784218771668125e-05,
"loss": 1.0922839641571045,
"step": 630
},
{
"epoch": 1.1575091575091574,
"grad_norm": 0.34849968552589417,
"learning_rate": 3.670305364738621e-05,
"loss": 1.0799121856689453,
"step": 632
},
{
"epoch": 1.1611721611721613,
"grad_norm": 1.3243259191513062,
"learning_rate": 3.662174444760688e-05,
"loss": 0.8275938630104065,
"step": 634
},
{
"epoch": 1.164835164835165,
"grad_norm": 1.356533169746399,
"learning_rate": 3.6540292445240624e-05,
"loss": 0.93868488073349,
"step": 636
},
{
"epoch": 1.1684981684981686,
"grad_norm": 0.4635038673877716,
"learning_rate": 3.6458698915433506e-05,
"loss": 1.1719251871109009,
"step": 638
},
{
"epoch": 1.1721611721611722,
"grad_norm": 0.04632457718253136,
"learning_rate": 3.637696513554725e-05,
"loss": 0.7194165587425232,
"step": 640
},
{
"epoch": 1.1758241758241759,
"grad_norm": 0.3217860162258148,
"learning_rate": 3.629509238513921e-05,
"loss": 1.0931247472763062,
"step": 642
},
{
"epoch": 1.1794871794871795,
"grad_norm": 0.8376574516296387,
"learning_rate": 3.621308194594236e-05,
"loss": 0.9752073884010315,
"step": 644
},
{
"epoch": 1.1831501831501832,
"grad_norm": 0.19979257881641388,
"learning_rate": 3.6130935101845194e-05,
"loss": 0.7485665678977966,
"step": 646
},
{
"epoch": 1.1868131868131868,
"grad_norm": 0.2617126703262329,
"learning_rate": 3.6048653138871666e-05,
"loss": 0.8534201383590698,
"step": 648
},
{
"epoch": 1.1904761904761905,
"grad_norm": 1.090280532836914,
"learning_rate": 3.596623734516104e-05,
"loss": 1.1257884502410889,
"step": 650
},
{
"epoch": 1.1941391941391941,
"grad_norm": 6.1979570388793945,
"learning_rate": 3.588368901094773e-05,
"loss": 0.653273344039917,
"step": 652
},
{
"epoch": 1.1978021978021978,
"grad_norm": 0.17688162624835968,
"learning_rate": 3.5801009428541096e-05,
"loss": 1.249631643295288,
"step": 654
},
{
"epoch": 1.2014652014652014,
"grad_norm": 0.8640678524971008,
"learning_rate": 3.571819989230519e-05,
"loss": 0.8184079527854919,
"step": 656
},
{
"epoch": 1.205128205128205,
"grad_norm": 0.6149291396141052,
"learning_rate": 3.563526169863854e-05,
"loss": 0.7101552486419678,
"step": 658
},
{
"epoch": 1.2087912087912087,
"grad_norm": 0.7223177552223206,
"learning_rate": 3.555219614595381e-05,
"loss": 1.1504517793655396,
"step": 660
},
{
"epoch": 1.2124542124542124,
"grad_norm": 0.6356011629104614,
"learning_rate": 3.546900453465752e-05,
"loss": 0.970334529876709,
"step": 662
},
{
"epoch": 1.2161172161172162,
"grad_norm": 0.8084374666213989,
"learning_rate": 3.538568816712964e-05,
"loss": 0.8572604060173035,
"step": 664
},
{
"epoch": 1.2197802197802199,
"grad_norm": 0.2903348505496979,
"learning_rate": 3.5302248347703224e-05,
"loss": 0.7845436930656433,
"step": 666
},
{
"epoch": 1.2234432234432235,
"grad_norm": 0.9992715716362,
"learning_rate": 3.5218686382643994e-05,
"loss": 0.8749545812606812,
"step": 668
},
{
"epoch": 1.2271062271062272,
"grad_norm": 0.7143378257751465,
"learning_rate": 3.513500358012988e-05,
"loss": 0.6878855228424072,
"step": 670
},
{
"epoch": 1.2307692307692308,
"grad_norm": 0.30578601360321045,
"learning_rate": 3.5051201250230545e-05,
"loss": 1.168808937072754,
"step": 672
},
{
"epoch": 1.2344322344322345,
"grad_norm": 1.2355809211730957,
"learning_rate": 3.4967280704886865e-05,
"loss": 1.1536543369293213,
"step": 674
},
{
"epoch": 1.2380952380952381,
"grad_norm": 0.3718186914920807,
"learning_rate": 3.488324325789044e-05,
"loss": 1.1648200750350952,
"step": 676
},
{
"epoch": 1.2417582417582418,
"grad_norm": 0.4209219813346863,
"learning_rate": 3.4799090224862924e-05,
"loss": 0.7579060792922974,
"step": 678
},
{
"epoch": 1.2454212454212454,
"grad_norm": 2.09708571434021,
"learning_rate": 3.471482292323554e-05,
"loss": 0.8136189579963684,
"step": 680
},
{
"epoch": 1.249084249084249,
"grad_norm": 0.4596081078052521,
"learning_rate": 3.463044267222841e-05,
"loss": 1.1541743278503418,
"step": 682
},
{
"epoch": 1.2527472527472527,
"grad_norm": 0.2024964988231659,
"learning_rate": 3.454595079282986e-05,
"loss": 1.1373684406280518,
"step": 684
},
{
"epoch": 1.2564102564102564,
"grad_norm": 0.5888193845748901,
"learning_rate": 3.4461348607775806e-05,
"loss": 0.8096006512641907,
"step": 686
},
{
"epoch": 1.26007326007326,
"grad_norm": 0.5350688099861145,
"learning_rate": 3.437663744152902e-05,
"loss": 1.081048607826233,
"step": 688
},
{
"epoch": 1.2637362637362637,
"grad_norm": 0.4051729738712311,
"learning_rate": 3.429181862025839e-05,
"loss": 1.0899769067764282,
"step": 690
},
{
"epoch": 1.2673992673992673,
"grad_norm": 1.3433797359466553,
"learning_rate": 3.4206893471818155e-05,
"loss": 1.1519224643707275,
"step": 692
},
{
"epoch": 1.271062271062271,
"grad_norm": 0.751139223575592,
"learning_rate": 3.4121863325727124e-05,
"loss": 0.9729434251785278,
"step": 694
},
{
"epoch": 1.2747252747252746,
"grad_norm": 0.7331501245498657,
"learning_rate": 3.40367295131479e-05,
"loss": 0.9491739869117737,
"step": 696
},
{
"epoch": 1.2783882783882783,
"grad_norm": 0.39379310607910156,
"learning_rate": 3.395149336686595e-05,
"loss": 0.8585996627807617,
"step": 698
},
{
"epoch": 1.282051282051282,
"grad_norm": 0.2216329425573349,
"learning_rate": 3.386615622126883e-05,
"loss": 0.6349502801895142,
"step": 700
},
{
"epoch": 1.2857142857142856,
"grad_norm": 1.3201651573181152,
"learning_rate": 3.378071941232525e-05,
"loss": 0.507042646408081,
"step": 702
},
{
"epoch": 1.2893772893772895,
"grad_norm": 1.1609549522399902,
"learning_rate": 3.369518427756417e-05,
"loss": 1.2238701581954956,
"step": 704
},
{
"epoch": 1.293040293040293,
"grad_norm": 0.262668639421463,
"learning_rate": 3.360955215605385e-05,
"loss": 0.954353928565979,
"step": 706
},
{
"epoch": 1.2967032967032968,
"grad_norm": 0.15986381471157074,
"learning_rate": 3.35238243883809e-05,
"loss": 0.9157785177230835,
"step": 708
},
{
"epoch": 1.3003663003663004,
"grad_norm": 0.24788087606430054,
"learning_rate": 3.34380023166293e-05,
"loss": 0.523282527923584,
"step": 710
},
{
"epoch": 1.304029304029304,
"grad_norm": 0.5071739554405212,
"learning_rate": 3.335208728435935e-05,
"loss": 0.8822041749954224,
"step": 712
},
{
"epoch": 1.3076923076923077,
"grad_norm": 0.21843966841697693,
"learning_rate": 3.3266080636586685e-05,
"loss": 1.1520413160324097,
"step": 714
},
{
"epoch": 1.3113553113553114,
"grad_norm": 0.7570046186447144,
"learning_rate": 3.317998371976121e-05,
"loss": 1.1054189205169678,
"step": 716
},
{
"epoch": 1.315018315018315,
"grad_norm": 0.24603700637817383,
"learning_rate": 3.309379788174598e-05,
"loss": 0.8677737712860107,
"step": 718
},
{
"epoch": 1.3186813186813187,
"grad_norm": 0.6697846055030823,
"learning_rate": 3.3007524471796136e-05,
"loss": 0.8973780870437622,
"step": 720
},
{
"epoch": 1.3223443223443223,
"grad_norm": 0.24531511962413788,
"learning_rate": 3.2921164840537784e-05,
"loss": 0.8588492274284363,
"step": 722
},
{
"epoch": 1.326007326007326,
"grad_norm": 0.1828172355890274,
"learning_rate": 3.283472033994683e-05,
"loss": 1.188812255859375,
"step": 724
},
{
"epoch": 1.3296703296703296,
"grad_norm": 0.24933356046676636,
"learning_rate": 3.274819232332783e-05,
"loss": 1.0235859155654907,
"step": 726
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.8426433801651001,
"learning_rate": 3.2661582145292805e-05,
"loss": 1.116140604019165,
"step": 728
},
{
"epoch": 1.3369963369963371,
"grad_norm": 0.518878161907196,
"learning_rate": 3.2574891161740014e-05,
"loss": 0.6969371438026428,
"step": 730
},
{
"epoch": 1.3406593406593408,
"grad_norm": 0.3753526210784912,
"learning_rate": 3.2488120729832745e-05,
"loss": 0.7986868023872375,
"step": 732
},
{
"epoch": 1.3443223443223444,
"grad_norm": 0.09369145333766937,
"learning_rate": 3.240127220797807e-05,
"loss": 0.6143500804901123,
"step": 734
},
{
"epoch": 1.347985347985348,
"grad_norm": 0.7680373787879944,
"learning_rate": 3.231434695580558e-05,
"loss": 1.102622628211975,
"step": 736
},
{
"epoch": 1.3516483516483517,
"grad_norm": 0.2766784131526947,
"learning_rate": 3.222734633414607e-05,
"loss": 0.7411299347877502,
"step": 738
},
{
"epoch": 1.3553113553113554,
"grad_norm": 0.4584338665008545,
"learning_rate": 3.214027170501029e-05,
"loss": 0.9950368404388428,
"step": 740
},
{
"epoch": 1.358974358974359,
"grad_norm": 0.800658106803894,
"learning_rate": 3.205312443156755e-05,
"loss": 0.5370650887489319,
"step": 742
},
{
"epoch": 1.3626373626373627,
"grad_norm": 0.39628687500953674,
"learning_rate": 3.196590587812446e-05,
"loss": 1.2178653478622437,
"step": 744
},
{
"epoch": 1.3663003663003663,
"grad_norm": 0.9103190302848816,
"learning_rate": 3.1878617410103514e-05,
"loss": 1.0132914781570435,
"step": 746
},
{
"epoch": 1.36996336996337,
"grad_norm": 0.24214321374893188,
"learning_rate": 3.1791260394021735e-05,
"loss": 1.0330907106399536,
"step": 748
},
{
"epoch": 1.3736263736263736,
"grad_norm": 0.22804208099842072,
"learning_rate": 3.1703836197469257e-05,
"loss": 0.7769557237625122,
"step": 750
},
{
"epoch": 1.3772893772893773,
"grad_norm": 0.31006965041160583,
"learning_rate": 3.161634618908797e-05,
"loss": 1.099147915840149,
"step": 752
},
{
"epoch": 1.380952380952381,
"grad_norm": 0.2560300827026367,
"learning_rate": 3.1528791738550054e-05,
"loss": 0.9559687376022339,
"step": 754
},
{
"epoch": 1.3846153846153846,
"grad_norm": 0.21459929645061493,
"learning_rate": 3.1441174216536514e-05,
"loss": 1.2862838506698608,
"step": 756
},
{
"epoch": 1.3882783882783882,
"grad_norm": 0.6228247880935669,
"learning_rate": 3.135349499471579e-05,
"loss": 1.1889519691467285,
"step": 758
},
{
"epoch": 1.3919413919413919,
"grad_norm": 0.15515995025634766,
"learning_rate": 3.126575544572222e-05,
"loss": 1.1298028230667114,
"step": 760
},
{
"epoch": 1.3956043956043955,
"grad_norm": 0.2352827489376068,
"learning_rate": 3.117795694313458e-05,
"loss": 1.1332722902297974,
"step": 762
},
{
"epoch": 1.3992673992673992,
"grad_norm": 0.15803271532058716,
"learning_rate": 3.109010086145456e-05,
"loss": 0.689454197883606,
"step": 764
},
{
"epoch": 1.4029304029304028,
"grad_norm": 0.2263651043176651,
"learning_rate": 3.1002188576085295e-05,
"loss": 0.8705043196678162,
"step": 766
},
{
"epoch": 1.4065934065934065,
"grad_norm": 1.2859166860580444,
"learning_rate": 3.091422146330977e-05,
"loss": 0.8634616732597351,
"step": 768
},
{
"epoch": 1.4102564102564101,
"grad_norm": 0.38691240549087524,
"learning_rate": 3.082620090026932e-05,
"loss": 1.1554944515228271,
"step": 770
},
{
"epoch": 1.4139194139194138,
"grad_norm": 0.21298235654830933,
"learning_rate": 3.0738128264942046e-05,
"loss": 1.1856485605239868,
"step": 772
},
{
"epoch": 1.4175824175824177,
"grad_norm": 0.4227127432823181,
"learning_rate": 3.0650004936121254e-05,
"loss": 0.9900102615356445,
"step": 774
},
{
"epoch": 1.4212454212454213,
"grad_norm": 0.272560715675354,
"learning_rate": 3.0561832293393846e-05,
"loss": 1.1388965845108032,
"step": 776
},
{
"epoch": 1.424908424908425,
"grad_norm": 0.23176811635494232,
"learning_rate": 3.04736117171188e-05,
"loss": 0.8189452886581421,
"step": 778
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.17611531913280487,
"learning_rate": 3.0385344588405422e-05,
"loss": 1.3413128852844238,
"step": 780
},
{
"epoch": 1.4322344322344323,
"grad_norm": 0.4652513861656189,
"learning_rate": 3.029703228909186e-05,
"loss": 1.1679465770721436,
"step": 782
},
{
"epoch": 1.435897435897436,
"grad_norm": 0.16926081478595734,
"learning_rate": 3.0208676201723406e-05,
"loss": 1.134766697883606,
"step": 784
},
{
"epoch": 1.4395604395604396,
"grad_norm": 3.288715362548828,
"learning_rate": 3.0120277709530854e-05,
"loss": 0.9238865971565247,
"step": 786
},
{
"epoch": 1.4432234432234432,
"grad_norm": 0.3530486524105072,
"learning_rate": 3.003183819640886e-05,
"loss": 1.1074001789093018,
"step": 788
},
{
"epoch": 1.4468864468864469,
"grad_norm": 0.0459970124065876,
"learning_rate": 2.9943359046894254e-05,
"loss": 0.6836336851119995,
"step": 790
},
{
"epoch": 1.4505494505494505,
"grad_norm": 0.23490065336227417,
"learning_rate": 2.9854841646144423e-05,
"loss": 0.9037283062934875,
"step": 792
},
{
"epoch": 1.4542124542124542,
"grad_norm": 0.35676056146621704,
"learning_rate": 2.9766287379915518e-05,
"loss": 0.8027743101119995,
"step": 794
},
{
"epoch": 1.4578754578754578,
"grad_norm": 0.17792175710201263,
"learning_rate": 2.967769763454089e-05,
"loss": 1.1282213926315308,
"step": 796
},
{
"epoch": 1.4615384615384617,
"grad_norm": 0.28798913955688477,
"learning_rate": 2.9589073796909282e-05,
"loss": 0.7936130166053772,
"step": 798
},
{
"epoch": 1.4652014652014653,
"grad_norm": 0.20934176445007324,
"learning_rate": 2.950041725444318e-05,
"loss": 0.9876341819763184,
"step": 800
},
{
"epoch": 1.468864468864469,
"grad_norm": 0.4038946032524109,
"learning_rate": 2.941172939507706e-05,
"loss": 1.5155441761016846,
"step": 802
},
{
"epoch": 1.4725274725274726,
"grad_norm": 0.8771412968635559,
"learning_rate": 2.932301160723566e-05,
"loss": 1.0064780712127686,
"step": 804
},
{
"epoch": 1.4761904761904763,
"grad_norm": 0.3849470317363739,
"learning_rate": 2.923426527981228e-05,
"loss": 1.171331524848938,
"step": 806
},
{
"epoch": 1.47985347985348,
"grad_norm": 0.5106516480445862,
"learning_rate": 2.9145491802146984e-05,
"loss": 1.1589710712432861,
"step": 808
},
{
"epoch": 1.4835164835164836,
"grad_norm": 0.2847084403038025,
"learning_rate": 2.905669256400491e-05,
"loss": 0.9889826774597168,
"step": 810
},
{
"epoch": 1.4871794871794872,
"grad_norm": 0.20060478150844574,
"learning_rate": 2.896786895555444e-05,
"loss": 0.8426548838615417,
"step": 812
},
{
"epoch": 1.4908424908424909,
"grad_norm": 0.11503265798091888,
"learning_rate": 2.887902236734552e-05,
"loss": 1.1940970420837402,
"step": 814
},
{
"epoch": 1.4945054945054945,
"grad_norm": 0.23822751641273499,
"learning_rate": 2.879015419028781e-05,
"loss": 1.1169782876968384,
"step": 816
},
{
"epoch": 1.4981684981684982,
"grad_norm": 0.1774878203868866,
"learning_rate": 2.8701265815628987e-05,
"loss": 0.3862011432647705,
"step": 818
},
{
"epoch": 1.5018315018315018,
"grad_norm": 0.2943243980407715,
"learning_rate": 2.8612358634932884e-05,
"loss": 1.1364233493804932,
"step": 820
},
{
"epoch": 1.5054945054945055,
"grad_norm": 0.34925273060798645,
"learning_rate": 2.852343404005778e-05,
"loss": 0.8967536687850952,
"step": 822
},
{
"epoch": 1.5091575091575091,
"grad_norm": 0.2488994151353836,
"learning_rate": 2.8434493423134544e-05,
"loss": 0.8218085169792175,
"step": 824
},
{
"epoch": 1.5128205128205128,
"grad_norm": 0.17446660995483398,
"learning_rate": 2.8345538176544918e-05,
"loss": 1.0801664590835571,
"step": 826
},
{
"epoch": 1.5164835164835164,
"grad_norm": 0.9011160731315613,
"learning_rate": 2.8256569692899627e-05,
"loss": 0.9831532835960388,
"step": 828
},
{
"epoch": 1.52014652014652,
"grad_norm": 0.2522432208061218,
"learning_rate": 2.8167589365016646e-05,
"loss": 0.984779953956604,
"step": 830
},
{
"epoch": 1.5238095238095237,
"grad_norm": 0.2282875031232834,
"learning_rate": 2.8078598585899385e-05,
"loss": 1.2356276512145996,
"step": 832
},
{
"epoch": 1.5274725274725274,
"grad_norm": 0.45575666427612305,
"learning_rate": 2.7989598748714846e-05,
"loss": 0.7996046543121338,
"step": 834
},
{
"epoch": 1.531135531135531,
"grad_norm": 0.31805214285850525,
"learning_rate": 2.7900591246771855e-05,
"loss": 1.1425288915634155,
"step": 836
},
{
"epoch": 1.5347985347985347,
"grad_norm": 0.24859696626663208,
"learning_rate": 2.7811577473499224e-05,
"loss": 1.1804063320159912,
"step": 838
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.2666679322719574,
"learning_rate": 2.772255882242394e-05,
"loss": 1.0411241054534912,
"step": 840
},
{
"epoch": 1.542124542124542,
"grad_norm": 0.20402321219444275,
"learning_rate": 2.7633536687149353e-05,
"loss": 0.8526805639266968,
"step": 842
},
{
"epoch": 1.5457875457875456,
"grad_norm": 0.38646435737609863,
"learning_rate": 2.7544512461333377e-05,
"loss": 1.0257073640823364,
"step": 844
},
{
"epoch": 1.5494505494505495,
"grad_norm": 0.19415532052516937,
"learning_rate": 2.745548753866663e-05,
"loss": 1.1687860488891602,
"step": 846
},
{
"epoch": 1.5531135531135531,
"grad_norm": 0.2800712287425995,
"learning_rate": 2.7366463312850655e-05,
"loss": 0.5396187901496887,
"step": 848
},
{
"epoch": 1.5567765567765568,
"grad_norm": 0.18754497170448303,
"learning_rate": 2.727744117757607e-05,
"loss": 1.02944016456604,
"step": 850
},
{
"epoch": 1.5604395604395604,
"grad_norm": 0.2201027274131775,
"learning_rate": 2.7188422526500788e-05,
"loss": 1.168210744857788,
"step": 852
},
{
"epoch": 1.564102564102564,
"grad_norm": 0.3404369354248047,
"learning_rate": 2.709940875322815e-05,
"loss": 0.8437097072601318,
"step": 854
},
{
"epoch": 1.5677655677655677,
"grad_norm": 0.35649099946022034,
"learning_rate": 2.7010401251285156e-05,
"loss": 0.8084161877632141,
"step": 856
},
{
"epoch": 1.5714285714285714,
"grad_norm": 0.47654107213020325,
"learning_rate": 2.6921401414100627e-05,
"loss": 0.9324872493743896,
"step": 858
},
{
"epoch": 1.575091575091575,
"grad_norm": 0.23765020072460175,
"learning_rate": 2.6832410634983356e-05,
"loss": 0.6993922591209412,
"step": 860
},
{
"epoch": 1.578754578754579,
"grad_norm": 0.24254556000232697,
"learning_rate": 2.6743430307100388e-05,
"loss": 1.1566822528839111,
"step": 862
},
{
"epoch": 1.5824175824175826,
"grad_norm": 0.2840297520160675,
"learning_rate": 2.665446182345509e-05,
"loss": 0.8900972008705139,
"step": 864
},
{
"epoch": 1.5860805860805862,
"grad_norm": 0.23388059437274933,
"learning_rate": 2.6565506576865458e-05,
"loss": 1.1673542261123657,
"step": 866
},
{
"epoch": 1.5897435897435899,
"grad_norm": 0.5824553966522217,
"learning_rate": 2.6476565959942233e-05,
"loss": 1.1742522716522217,
"step": 868
},
{
"epoch": 1.5934065934065935,
"grad_norm": 0.2521633803844452,
"learning_rate": 2.6387641365067124e-05,
"loss": 0.5782104730606079,
"step": 870
},
{
"epoch": 1.5970695970695972,
"grad_norm": 0.16043610870838165,
"learning_rate": 2.6298734184371015e-05,
"loss": 1.1673752069473267,
"step": 872
},
{
"epoch": 1.6007326007326008,
"grad_norm": 0.1490897536277771,
"learning_rate": 2.6209845809712195e-05,
"loss": 1.0414141416549683,
"step": 874
},
{
"epoch": 1.6043956043956045,
"grad_norm": 5.497232437133789,
"learning_rate": 2.6120977632654485e-05,
"loss": 0.7130216360092163,
"step": 876
},
{
"epoch": 1.6080586080586081,
"grad_norm": 0.6886661648750305,
"learning_rate": 2.6032131044445563e-05,
"loss": 0.9459899663925171,
"step": 878
},
{
"epoch": 1.6117216117216118,
"grad_norm": 0.09610695391893387,
"learning_rate": 2.59433074359951e-05,
"loss": 0.8761966228485107,
"step": 880
},
{
"epoch": 1.6153846153846154,
"grad_norm": 0.31204506754875183,
"learning_rate": 2.5854508197853022e-05,
"loss": 1.1188955307006836,
"step": 882
},
{
"epoch": 1.619047619047619,
"grad_norm": 0.3378327190876007,
"learning_rate": 2.5765734720187723e-05,
"loss": 0.9112301468849182,
"step": 884
},
{
"epoch": 1.6227106227106227,
"grad_norm": 0.22289712727069855,
"learning_rate": 2.5676988392764345e-05,
"loss": 1.1279692649841309,
"step": 886
},
{
"epoch": 1.6263736263736264,
"grad_norm": 0.20399990677833557,
"learning_rate": 2.5588270604922947e-05,
"loss": 0.8507078886032104,
"step": 888
},
{
"epoch": 1.63003663003663,
"grad_norm": 0.16419348120689392,
"learning_rate": 2.5499582745556828e-05,
"loss": 1.1664886474609375,
"step": 890
},
{
"epoch": 1.6336996336996337,
"grad_norm": 0.488471120595932,
"learning_rate": 2.541092620309073e-05,
"loss": 0.8955670595169067,
"step": 892
},
{
"epoch": 1.6373626373626373,
"grad_norm": 0.2712729573249817,
"learning_rate": 2.5322302365459116e-05,
"loss": 1.1703094244003296,
"step": 894
},
{
"epoch": 1.641025641025641,
"grad_norm": 0.2993127107620239,
"learning_rate": 2.5233712620084494e-05,
"loss": 0.7074750065803528,
"step": 896
},
{
"epoch": 1.6446886446886446,
"grad_norm": 0.17085275053977966,
"learning_rate": 2.5145158353855592e-05,
"loss": 1.122510313987732,
"step": 898
},
{
"epoch": 1.6483516483516483,
"grad_norm": 0.04999161139130592,
"learning_rate": 2.505664095310574e-05,
"loss": 0.7906731963157654,
"step": 900
},
{
"epoch": 1.652014652014652,
"grad_norm": 0.22062256932258606,
"learning_rate": 2.496816180359115e-05,
"loss": 0.8734868764877319,
"step": 902
},
{
"epoch": 1.6556776556776556,
"grad_norm": 0.32644936442375183,
"learning_rate": 2.4879722290469155e-05,
"loss": 1.0106048583984375,
"step": 904
},
{
"epoch": 1.6593406593406592,
"grad_norm": 0.33785051107406616,
"learning_rate": 2.4791323798276593e-05,
"loss": 1.1063401699066162,
"step": 906
},
{
"epoch": 1.6630036630036629,
"grad_norm": 0.19222113490104675,
"learning_rate": 2.4702967710908143e-05,
"loss": 0.7271807789802551,
"step": 908
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.34294626116752625,
"learning_rate": 2.4614655411594583e-05,
"loss": 1.180138349533081,
"step": 910
},
{
"epoch": 1.6703296703296702,
"grad_norm": 0.08716004341840744,
"learning_rate": 2.452638828288121e-05,
"loss": 1.2012932300567627,
"step": 912
},
{
"epoch": 1.673992673992674,
"grad_norm": 0.14314715564250946,
"learning_rate": 2.4438167706606152e-05,
"loss": 1.0735292434692383,
"step": 914
},
{
"epoch": 1.6776556776556777,
"grad_norm": 0.9384481906890869,
"learning_rate": 2.434999506387875e-05,
"loss": 1.1871505975723267,
"step": 916
},
{
"epoch": 1.6813186813186813,
"grad_norm": 0.8262288570404053,
"learning_rate": 2.4261871735057956e-05,
"loss": 0.8213975429534912,
"step": 918
},
{
"epoch": 1.684981684981685,
"grad_norm": 0.25928905606269836,
"learning_rate": 2.417379909973069e-05,
"loss": 1.1863298416137695,
"step": 920
},
{
"epoch": 1.6886446886446886,
"grad_norm": 0.33573785424232483,
"learning_rate": 2.408577853669024e-05,
"loss": 0.9798559546470642,
"step": 922
},
{
"epoch": 1.6923076923076923,
"grad_norm": 0.254114031791687,
"learning_rate": 2.3997811423914717e-05,
"loss": 0.8279831409454346,
"step": 924
},
{
"epoch": 1.695970695970696,
"grad_norm": 0.8429379463195801,
"learning_rate": 2.390989913854545e-05,
"loss": 0.9410126209259033,
"step": 926
},
{
"epoch": 1.6996336996336996,
"grad_norm": 0.13982383906841278,
"learning_rate": 2.382204305686543e-05,
"loss": 0.7881975173950195,
"step": 928
},
{
"epoch": 1.7032967032967035,
"grad_norm": 0.03225285932421684,
"learning_rate": 2.373424455427779e-05,
"loss": 0.8376743197441101,
"step": 930
},
{
"epoch": 1.7069597069597071,
"grad_norm": 0.28351178765296936,
"learning_rate": 2.364650500528421e-05,
"loss": 0.9067142605781555,
"step": 932
},
{
"epoch": 1.7106227106227108,
"grad_norm": 0.1636010706424713,
"learning_rate": 2.3558825783463484e-05,
"loss": 1.3678433895111084,
"step": 934
},
{
"epoch": 1.7142857142857144,
"grad_norm": 0.29480600357055664,
"learning_rate": 2.3471208261449955e-05,
"loss": 1.1705397367477417,
"step": 936
},
{
"epoch": 1.717948717948718,
"grad_norm": 0.1829896867275238,
"learning_rate": 2.3383653810912033e-05,
"loss": 0.4487687945365906,
"step": 938
},
{
"epoch": 1.7216117216117217,
"grad_norm": 0.11328794807195663,
"learning_rate": 2.3296163802530745e-05,
"loss": 0.4792923629283905,
"step": 940
},
{
"epoch": 1.7252747252747254,
"grad_norm": 0.2216004580259323,
"learning_rate": 2.320873960597828e-05,
"loss": 1.1090213060379028,
"step": 942
},
{
"epoch": 1.728937728937729,
"grad_norm": 0.147038996219635,
"learning_rate": 2.312138258989649e-05,
"loss": 0.9103480577468872,
"step": 944
},
{
"epoch": 1.7326007326007327,
"grad_norm": 0.13560520112514496,
"learning_rate": 2.3034094121875543e-05,
"loss": 1.1171597242355347,
"step": 946
},
{
"epoch": 1.7362637362637363,
"grad_norm": 0.15320324897766113,
"learning_rate": 2.2946875568432458e-05,
"loss": 1.0631382465362549,
"step": 948
},
{
"epoch": 1.73992673992674,
"grad_norm": 0.1420201063156128,
"learning_rate": 2.2859728294989718e-05,
"loss": 0.9614072442054749,
"step": 950
},
{
"epoch": 1.7435897435897436,
"grad_norm": 0.10232152044773102,
"learning_rate": 2.277265366585394e-05,
"loss": 0.9154943227767944,
"step": 952
},
{
"epoch": 1.7472527472527473,
"grad_norm": 0.8972752690315247,
"learning_rate": 2.268565304419443e-05,
"loss": 1.170873761177063,
"step": 954
},
{
"epoch": 1.750915750915751,
"grad_norm": 0.3512289226055145,
"learning_rate": 2.2598727792021933e-05,
"loss": 0.741244912147522,
"step": 956
},
{
"epoch": 1.7545787545787546,
"grad_norm": 0.28857314586639404,
"learning_rate": 2.2511879270167264e-05,
"loss": 1.1468185186386108,
"step": 958
},
{
"epoch": 1.7582417582417582,
"grad_norm": 0.7446520328521729,
"learning_rate": 2.2425108838259995e-05,
"loss": 0.46506467461586,
"step": 960
},
{
"epoch": 1.7619047619047619,
"grad_norm": 0.3139968514442444,
"learning_rate": 2.23384178547072e-05,
"loss": 0.9495673775672913,
"step": 962
},
{
"epoch": 1.7655677655677655,
"grad_norm": 0.18298202753067017,
"learning_rate": 2.225180767667217e-05,
"loss": 1.0209523439407349,
"step": 964
},
{
"epoch": 1.7692307692307692,
"grad_norm": 0.16875436902046204,
"learning_rate": 2.2165279660053174e-05,
"loss": 1.1537625789642334,
"step": 966
},
{
"epoch": 1.7728937728937728,
"grad_norm": 0.19506409764289856,
"learning_rate": 2.2078835159462225e-05,
"loss": 0.5657550692558289,
"step": 968
},
{
"epoch": 1.7765567765567765,
"grad_norm": 0.1412108838558197,
"learning_rate": 2.1992475528203872e-05,
"loss": 0.8089891672134399,
"step": 970
},
{
"epoch": 1.7802197802197801,
"grad_norm": 0.4082830250263214,
"learning_rate": 2.1906202118254025e-05,
"loss": 0.8698192834854126,
"step": 972
},
{
"epoch": 1.7838827838827838,
"grad_norm": 0.3070997893810272,
"learning_rate": 2.1820016280238792e-05,
"loss": 1.1678433418273926,
"step": 974
},
{
"epoch": 1.7875457875457874,
"grad_norm": 0.18406766653060913,
"learning_rate": 2.1733919363413314e-05,
"loss": 1.1347768306732178,
"step": 976
},
{
"epoch": 1.791208791208791,
"grad_norm": 0.16203664243221283,
"learning_rate": 2.1647912715640657e-05,
"loss": 0.8943782448768616,
"step": 978
},
{
"epoch": 1.7948717948717947,
"grad_norm": 0.17423562705516815,
"learning_rate": 2.1561997683370705e-05,
"loss": 0.9334428310394287,
"step": 980
},
{
"epoch": 1.7985347985347986,
"grad_norm": 0.15521575510501862,
"learning_rate": 2.147617561161911e-05,
"loss": 1.121093988418579,
"step": 982
},
{
"epoch": 1.8021978021978022,
"grad_norm": 1.1999096870422363,
"learning_rate": 2.1390447843946156e-05,
"loss": 1.0399394035339355,
"step": 984
},
{
"epoch": 1.8058608058608059,
"grad_norm": 0.236453577876091,
"learning_rate": 2.1304815722435838e-05,
"loss": 0.6336957812309265,
"step": 986
},
{
"epoch": 1.8095238095238095,
"grad_norm": 1.987808108329773,
"learning_rate": 2.121928058767475e-05,
"loss": 0.8018144965171814,
"step": 988
},
{
"epoch": 1.8131868131868132,
"grad_norm": 0.6906500458717346,
"learning_rate": 2.113384377873117e-05,
"loss": 0.7112327814102173,
"step": 990
},
{
"epoch": 1.8168498168498168,
"grad_norm": 0.5853157043457031,
"learning_rate": 2.1048506633134058e-05,
"loss": 0.770244300365448,
"step": 992
},
{
"epoch": 1.8205128205128205,
"grad_norm": 0.22558943927288055,
"learning_rate": 2.0963270486852116e-05,
"loss": 0.8251454830169678,
"step": 994
},
{
"epoch": 1.8241758241758241,
"grad_norm": 0.36891841888427734,
"learning_rate": 2.0878136674272874e-05,
"loss": 1.0850389003753662,
"step": 996
},
{
"epoch": 1.8278388278388278,
"grad_norm": 0.21534068882465363,
"learning_rate": 2.079310652818186e-05,
"loss": 0.8296566605567932,
"step": 998
},
{
"epoch": 1.8315018315018317,
"grad_norm": 0.20088708400726318,
"learning_rate": 2.070818137974162e-05,
"loss": 1.0995657444000244,
"step": 1000
},
{
"epoch": 1.8351648351648353,
"grad_norm": 0.16254781186580658,
"learning_rate": 2.0623362558470983e-05,
"loss": 1.1204814910888672,
"step": 1002
},
{
"epoch": 1.838827838827839,
"grad_norm": 0.16232743859291077,
"learning_rate": 2.05386513922242e-05,
"loss": 1.1130619049072266,
"step": 1004
},
{
"epoch": 1.8424908424908426,
"grad_norm": 0.21432489156723022,
"learning_rate": 2.0454049207170146e-05,
"loss": 1.1204091310501099,
"step": 1006
},
{
"epoch": 1.8461538461538463,
"grad_norm": 0.45065784454345703,
"learning_rate": 2.0369557327771594e-05,
"loss": 0.7804591655731201,
"step": 1008
},
{
"epoch": 1.84981684981685,
"grad_norm": 0.2610171139240265,
"learning_rate": 2.0285177076764462e-05,
"loss": 1.076236367225647,
"step": 1010
},
{
"epoch": 1.8534798534798536,
"grad_norm": 0.11059543490409851,
"learning_rate": 2.0200909775137085e-05,
"loss": 0.7410160899162292,
"step": 1012
},
{
"epoch": 1.8571428571428572,
"grad_norm": 2.333650827407837,
"learning_rate": 2.0116756742109577e-05,
"loss": 1.1382379531860352,
"step": 1014
},
{
"epoch": 1.8608058608058609,
"grad_norm": 0.16106364130973816,
"learning_rate": 2.003271929511314e-05,
"loss": 1.1225502490997314,
"step": 1016
},
{
"epoch": 1.8644688644688645,
"grad_norm": 0.1435774266719818,
"learning_rate": 1.9948798749769464e-05,
"loss": 1.197827696800232,
"step": 1018
},
{
"epoch": 1.8681318681318682,
"grad_norm": 0.04951045662164688,
"learning_rate": 1.986499641987013e-05,
"loss": 0.9368598461151123,
"step": 1020
},
{
"epoch": 1.8717948717948718,
"grad_norm": 0.17583589255809784,
"learning_rate": 1.9781313617356012e-05,
"loss": 1.0920844078063965,
"step": 1022
},
{
"epoch": 1.8754578754578755,
"grad_norm": 0.3376821279525757,
"learning_rate": 1.9697751652296782e-05,
"loss": 0.4992130398750305,
"step": 1024
},
{
"epoch": 1.879120879120879,
"grad_norm": 0.1590043604373932,
"learning_rate": 1.961431183287037e-05,
"loss": 1.1315664052963257,
"step": 1026
},
{
"epoch": 1.8827838827838828,
"grad_norm": 0.28124603629112244,
"learning_rate": 1.9530995465342482e-05,
"loss": 0.9077785611152649,
"step": 1028
},
{
"epoch": 1.8864468864468864,
"grad_norm": 0.2012709081172943,
"learning_rate": 1.9447803854046192e-05,
"loss": 1.1241216659545898,
"step": 1030
},
{
"epoch": 1.89010989010989,
"grad_norm": 0.2831279933452606,
"learning_rate": 1.9364738301361473e-05,
"loss": 0.9281163811683655,
"step": 1032
},
{
"epoch": 1.8937728937728937,
"grad_norm": 0.45628663897514343,
"learning_rate": 1.928180010769482e-05,
"loss": 0.7836743593215942,
"step": 1034
},
{
"epoch": 1.8974358974358974,
"grad_norm": 0.26489734649658203,
"learning_rate": 1.919899057145891e-05,
"loss": 1.1775709390640259,
"step": 1036
},
{
"epoch": 1.901098901098901,
"grad_norm": 0.4411194920539856,
"learning_rate": 1.911631098905227e-05,
"loss": 1.1143101453781128,
"step": 1038
},
{
"epoch": 1.9047619047619047,
"grad_norm": 0.39963364601135254,
"learning_rate": 1.903376265483896e-05,
"loss": 1.234065055847168,
"step": 1040
},
{
"epoch": 1.9084249084249083,
"grad_norm": 0.2849297523498535,
"learning_rate": 1.895134686112834e-05,
"loss": 0.6098147034645081,
"step": 1042
},
{
"epoch": 1.912087912087912,
"grad_norm": 0.17120879888534546,
"learning_rate": 1.886906489815482e-05,
"loss": 0.8268396258354187,
"step": 1044
},
{
"epoch": 1.9157509157509156,
"grad_norm": 0.2931853234767914,
"learning_rate": 1.878691805405765e-05,
"loss": 1.0940194129943848,
"step": 1046
},
{
"epoch": 1.9194139194139193,
"grad_norm": 0.16556760668754578,
"learning_rate": 1.8704907614860797e-05,
"loss": 0.3900573253631592,
"step": 1048
},
{
"epoch": 1.9230769230769231,
"grad_norm": 0.30659088492393494,
"learning_rate": 1.8623034864452753e-05,
"loss": 0.841820478439331,
"step": 1050
},
{
"epoch": 1.9267399267399268,
"grad_norm": 0.13327482342720032,
"learning_rate": 1.8541301084566496e-05,
"loss": 0.7997146248817444,
"step": 1052
},
{
"epoch": 1.9304029304029304,
"grad_norm": 0.23069679737091064,
"learning_rate": 1.8459707554759385e-05,
"loss": 1.1094664335250854,
"step": 1054
},
{
"epoch": 1.934065934065934,
"grad_norm": 0.08997397869825363,
"learning_rate": 1.8378255552393126e-05,
"loss": 0.737388551235199,
"step": 1056
},
{
"epoch": 1.9377289377289377,
"grad_norm": 0.435207724571228,
"learning_rate": 1.8296946352613792e-05,
"loss": 0.9677636027336121,
"step": 1058
},
{
"epoch": 1.9413919413919414,
"grad_norm": 0.2596050500869751,
"learning_rate": 1.8215781228331884e-05,
"loss": 1.0497726202011108,
"step": 1060
},
{
"epoch": 1.945054945054945,
"grad_norm": 0.15544529259204865,
"learning_rate": 1.8134761450202316e-05,
"loss": 0.7944180369377136,
"step": 1062
},
{
"epoch": 1.9487179487179487,
"grad_norm": 0.3575184643268585,
"learning_rate": 1.805388828660463e-05,
"loss": 0.7939121127128601,
"step": 1064
},
{
"epoch": 1.9523809523809523,
"grad_norm": 0.38059937953948975,
"learning_rate": 1.79731630036231e-05,
"loss": 1.1412160396575928,
"step": 1066
},
{
"epoch": 1.9560439560439562,
"grad_norm": 0.15067918598651886,
"learning_rate": 1.7892586865026835e-05,
"loss": 1.1604868173599243,
"step": 1068
},
{
"epoch": 1.9597069597069599,
"grad_norm": 0.14763464033603668,
"learning_rate": 1.7812161132250122e-05,
"loss": 0.6316368579864502,
"step": 1070
},
{
"epoch": 1.9633699633699635,
"grad_norm": 0.20061402022838593,
"learning_rate": 1.7731887064372617e-05,
"loss": 0.7977589964866638,
"step": 1072
},
{
"epoch": 1.9670329670329672,
"grad_norm": 0.20839878916740417,
"learning_rate": 1.7651765918099588e-05,
"loss": 1.1316888332366943,
"step": 1074
},
{
"epoch": 1.9706959706959708,
"grad_norm": 0.14487165212631226,
"learning_rate": 1.757179894774233e-05,
"loss": 1.0948164463043213,
"step": 1076
},
{
"epoch": 1.9743589743589745,
"grad_norm": 1.0238404273986816,
"learning_rate": 1.7491987405198464e-05,
"loss": 0.9682241082191467,
"step": 1078
},
{
"epoch": 1.978021978021978,
"grad_norm": 0.4279444217681885,
"learning_rate": 1.7412332539932367e-05,
"loss": 0.9370381832122803,
"step": 1080
},
{
"epoch": 1.9816849816849818,
"grad_norm": 0.1554751694202423,
"learning_rate": 1.7332835598955615e-05,
"loss": 1.0854570865631104,
"step": 1082
},
{
"epoch": 1.9853479853479854,
"grad_norm": 0.18901632726192474,
"learning_rate": 1.7253497826807435e-05,
"loss": 0.6803427934646606,
"step": 1084
},
{
"epoch": 1.989010989010989,
"grad_norm": 1.4867212772369385,
"learning_rate": 1.717432046553523e-05,
"loss": 0.965499997138977,
"step": 1086
},
{
"epoch": 1.9926739926739927,
"grad_norm": 0.3673637807369232,
"learning_rate": 1.7095304754675168e-05,
"loss": 0.9333543181419373,
"step": 1088
},
{
"epoch": 1.9963369963369964,
"grad_norm": 0.16466295719146729,
"learning_rate": 1.701645193123272e-05,
"loss": 0.887560248374939,
"step": 1090
},
{
"epoch": 2.0,
"grad_norm": 0.21007683873176575,
"learning_rate": 1.6937763229663356e-05,
"loss": 0.9977954626083374,
"step": 1092
},
{
"epoch": 2.0036630036630036,
"grad_norm": 0.29877015948295593,
"learning_rate": 1.685923988185316e-05,
"loss": 0.9857615828514099,
"step": 1094
},
{
"epoch": 2.0073260073260073,
"grad_norm": 0.17748361825942993,
"learning_rate": 1.6780883117099575e-05,
"loss": 1.0911893844604492,
"step": 1096
},
{
"epoch": 2.010989010989011,
"grad_norm": 0.16629448533058167,
"learning_rate": 1.6702694162092177e-05,
"loss": 1.0311784744262695,
"step": 1098
},
{
"epoch": 2.0146520146520146,
"grad_norm": 0.4006339907646179,
"learning_rate": 1.6624674240893452e-05,
"loss": 1.078372597694397,
"step": 1100
},
{
"epoch": 2.0183150183150182,
"grad_norm": 0.3369395136833191,
"learning_rate": 1.6546824574919572e-05,
"loss": 0.7856264114379883,
"step": 1102
},
{
"epoch": 2.021978021978022,
"grad_norm": 0.26995745301246643,
"learning_rate": 1.6469146382921407e-05,
"loss": 1.084755778312683,
"step": 1104
},
{
"epoch": 2.0256410256410255,
"grad_norm": 0.26533597707748413,
"learning_rate": 1.6391640880965338e-05,
"loss": 0.7198016047477722,
"step": 1106
},
{
"epoch": 2.029304029304029,
"grad_norm": 0.5049846172332764,
"learning_rate": 1.6314309282414244e-05,
"loss": 1.1550657749176025,
"step": 1108
},
{
"epoch": 2.032967032967033,
"grad_norm": 0.2585044205188751,
"learning_rate": 1.623715279790853e-05,
"loss": 0.7946727275848389,
"step": 1110
},
{
"epoch": 2.0366300366300365,
"grad_norm": 0.08100654929876328,
"learning_rate": 1.616017263534713e-05,
"loss": 0.5124549269676208,
"step": 1112
},
{
"epoch": 2.04029304029304,
"grad_norm": 0.2333383411169052,
"learning_rate": 1.608336999986867e-05,
"loss": 0.7633625268936157,
"step": 1114
},
{
"epoch": 2.043956043956044,
"grad_norm": 0.6371340751647949,
"learning_rate": 1.600674609383253e-05,
"loss": 1.0826982259750366,
"step": 1116
},
{
"epoch": 2.0476190476190474,
"grad_norm": 0.38138705492019653,
"learning_rate": 1.5930302116800044e-05,
"loss": 1.0550010204315186,
"step": 1118
},
{
"epoch": 2.051282051282051,
"grad_norm": 3.1294875144958496,
"learning_rate": 1.585403926551573e-05,
"loss": 0.7597935199737549,
"step": 1120
},
{
"epoch": 2.0549450549450547,
"grad_norm": 5.722661018371582,
"learning_rate": 1.5777958733888565e-05,
"loss": 1.1912044286727905,
"step": 1122
},
{
"epoch": 2.0586080586080584,
"grad_norm": 0.21310099959373474,
"learning_rate": 1.570206171297324e-05,
"loss": 1.0809831619262695,
"step": 1124
},
{
"epoch": 2.062271062271062,
"grad_norm": 0.30974480509757996,
"learning_rate": 1.56263493909516e-05,
"loss": 0.6720230579376221,
"step": 1126
},
{
"epoch": 2.065934065934066,
"grad_norm": 0.1537218540906906,
"learning_rate": 1.555082295311396e-05,
"loss": 1.0456634759902954,
"step": 1128
},
{
"epoch": 2.06959706959707,
"grad_norm": 0.44281265139579773,
"learning_rate": 1.5475483581840587e-05,
"loss": 1.017748236656189,
"step": 1130
},
{
"epoch": 2.0732600732600734,
"grad_norm": 0.24577392637729645,
"learning_rate": 1.54003324565832e-05,
"loss": 1.0750110149383545,
"step": 1132
},
{
"epoch": 2.076923076923077,
"grad_norm": 0.22100472450256348,
"learning_rate": 1.53253707538465e-05,
"loss": 1.060890793800354,
"step": 1134
},
{
"epoch": 2.0805860805860807,
"grad_norm": 0.9236302375793457,
"learning_rate": 1.5250599647169716e-05,
"loss": 0.6415885090827942,
"step": 1136
},
{
"epoch": 2.0842490842490844,
"grad_norm": 0.20396125316619873,
"learning_rate": 1.5176020307108276e-05,
"loss": 1.0569545030593872,
"step": 1138
},
{
"epoch": 2.087912087912088,
"grad_norm": 0.22348055243492126,
"learning_rate": 1.5101633901215456e-05,
"loss": 0.9237917065620422,
"step": 1140
},
{
"epoch": 2.0915750915750917,
"grad_norm": 0.18689396977424622,
"learning_rate": 1.5027441594024133e-05,
"loss": 1.0551191568374634,
"step": 1142
},
{
"epoch": 2.0952380952380953,
"grad_norm": 0.11935965716838837,
"learning_rate": 1.4953444547028531e-05,
"loss": 0.3609432280063629,
"step": 1144
},
{
"epoch": 2.098901098901099,
"grad_norm": 0.8522807955741882,
"learning_rate": 1.4879643918666003e-05,
"loss": 0.7986314296722412,
"step": 1146
},
{
"epoch": 2.1025641025641026,
"grad_norm": 0.35904762148857117,
"learning_rate": 1.480604086429897e-05,
"loss": 0.7680439352989197,
"step": 1148
},
{
"epoch": 2.1062271062271063,
"grad_norm": 0.23995743691921234,
"learning_rate": 1.4732636536196794e-05,
"loss": 0.7488572597503662,
"step": 1150
},
{
"epoch": 2.10989010989011,
"grad_norm": 0.24136975407600403,
"learning_rate": 1.4659432083517726e-05,
"loss": 0.9088020324707031,
"step": 1152
},
{
"epoch": 2.1135531135531136,
"grad_norm": 0.25049832463264465,
"learning_rate": 1.458642865229093e-05,
"loss": 0.6150023937225342,
"step": 1154
},
{
"epoch": 2.1172161172161172,
"grad_norm": 0.2590892016887665,
"learning_rate": 1.4513627385398554e-05,
"loss": 0.9922336935997009,
"step": 1156
},
{
"epoch": 2.120879120879121,
"grad_norm": 0.6481472849845886,
"learning_rate": 1.4441029422557817e-05,
"loss": 0.9142146110534668,
"step": 1158
},
{
"epoch": 2.1245421245421245,
"grad_norm": 0.29179754853248596,
"learning_rate": 1.4368635900303184e-05,
"loss": 0.802727997303009,
"step": 1160
},
{
"epoch": 2.128205128205128,
"grad_norm": 0.5445400476455688,
"learning_rate": 1.4296447951968562e-05,
"loss": 0.6710273623466492,
"step": 1162
},
{
"epoch": 2.131868131868132,
"grad_norm": 0.22561633586883545,
"learning_rate": 1.4224466707669542e-05,
"loss": 0.8315181136131287,
"step": 1164
},
{
"epoch": 2.1355311355311355,
"grad_norm": 0.24327363073825836,
"learning_rate": 1.4152693294285756e-05,
"loss": 0.4054326117038727,
"step": 1166
},
{
"epoch": 2.139194139194139,
"grad_norm": 0.6817838549613953,
"learning_rate": 1.4081128835443188e-05,
"loss": 0.4102446436882019,
"step": 1168
},
{
"epoch": 2.142857142857143,
"grad_norm": 0.44050848484039307,
"learning_rate": 1.400977445149661e-05,
"loss": 0.8157304525375366,
"step": 1170
},
{
"epoch": 2.1465201465201464,
"grad_norm": 0.5266426801681519,
"learning_rate": 1.3938631259512013e-05,
"loss": 1.0504196882247925,
"step": 1172
},
{
"epoch": 2.15018315018315,
"grad_norm": 0.3633262515068054,
"learning_rate": 1.3867700373249152e-05,
"loss": 1.2162549495697021,
"step": 1174
},
{
"epoch": 2.1538461538461537,
"grad_norm": 0.5713516473770142,
"learning_rate": 1.37969829031441e-05,
"loss": 0.6878563165664673,
"step": 1176
},
{
"epoch": 2.1575091575091574,
"grad_norm": 0.18287743628025055,
"learning_rate": 1.3726479956291872e-05,
"loss": 1.0863420963287354,
"step": 1178
},
{
"epoch": 2.161172161172161,
"grad_norm": 0.1208617314696312,
"learning_rate": 1.3656192636429043e-05,
"loss": 1.089928388595581,
"step": 1180
},
{
"epoch": 2.1648351648351647,
"grad_norm": 0.32237380743026733,
"learning_rate": 1.3586122043916538e-05,
"loss": 1.0283252000808716,
"step": 1182
},
{
"epoch": 2.1684981684981683,
"grad_norm": 0.18901434540748596,
"learning_rate": 1.3516269275722387e-05,
"loss": 1.0963468551635742,
"step": 1184
},
{
"epoch": 2.172161172161172,
"grad_norm": 0.11865086108446121,
"learning_rate": 1.344663542540451e-05,
"loss": 0.1795816421508789,
"step": 1186
},
{
"epoch": 2.1758241758241756,
"grad_norm": 0.35561323165893555,
"learning_rate": 1.3377221583093632e-05,
"loss": 1.1163209676742554,
"step": 1188
},
{
"epoch": 2.1794871794871793,
"grad_norm": 0.39501580595970154,
"learning_rate": 1.3308028835476238e-05,
"loss": 0.669342041015625,
"step": 1190
},
{
"epoch": 2.183150183150183,
"grad_norm": 0.6345373392105103,
"learning_rate": 1.3239058265777499e-05,
"loss": 0.9228946566581726,
"step": 1192
},
{
"epoch": 2.186813186813187,
"grad_norm": 0.30641525983810425,
"learning_rate": 1.3170310953744388e-05,
"loss": 0.698255181312561,
"step": 1194
},
{
"epoch": 2.1904761904761907,
"grad_norm": 0.21093255281448364,
"learning_rate": 1.310178797562871e-05,
"loss": 0.8535992503166199,
"step": 1196
},
{
"epoch": 2.1941391941391943,
"grad_norm": 0.20513616502285004,
"learning_rate": 1.3033490404170276e-05,
"loss": 1.0712019205093384,
"step": 1198
},
{
"epoch": 2.197802197802198,
"grad_norm": 0.4575969874858856,
"learning_rate": 1.296541930858015e-05,
"loss": 0.6536464691162109,
"step": 1200
},
{
"epoch": 2.2014652014652016,
"grad_norm": 0.22849248349666595,
"learning_rate": 1.2897575754523832e-05,
"loss": 1.1299998760223389,
"step": 1202
},
{
"epoch": 2.2051282051282053,
"grad_norm": 0.2219730168581009,
"learning_rate": 1.2829960804104663e-05,
"loss": 1.080318570137024,
"step": 1204
},
{
"epoch": 2.208791208791209,
"grad_norm": 0.20927314460277557,
"learning_rate": 1.2762575515847106e-05,
"loss": 0.42392417788505554,
"step": 1206
},
{
"epoch": 2.2124542124542126,
"grad_norm": 0.23490868508815765,
"learning_rate": 1.2695420944680242e-05,
"loss": 0.7045865654945374,
"step": 1208
},
{
"epoch": 2.2161172161172162,
"grad_norm": 0.1882236897945404,
"learning_rate": 1.2628498141921243e-05,
"loss": 0.4839053452014923,
"step": 1210
},
{
"epoch": 2.21978021978022,
"grad_norm": 0.22759205102920532,
"learning_rate": 1.2561808155258897e-05,
"loss": 0.9919928908348083,
"step": 1212
},
{
"epoch": 2.2234432234432235,
"grad_norm": 0.22987626492977142,
"learning_rate": 1.2495352028737201e-05,
"loss": 0.7579973340034485,
"step": 1214
},
{
"epoch": 2.227106227106227,
"grad_norm": 0.2778910994529724,
"learning_rate": 1.2429130802739036e-05,
"loss": 0.8012394905090332,
"step": 1216
},
{
"epoch": 2.230769230769231,
"grad_norm": 0.17181673645973206,
"learning_rate": 1.2363145513969887e-05,
"loss": 1.0715208053588867,
"step": 1218
},
{
"epoch": 2.2344322344322345,
"grad_norm": 0.25348830223083496,
"learning_rate": 1.229739719544157e-05,
"loss": 1.06959867477417,
"step": 1220
},
{
"epoch": 2.238095238095238,
"grad_norm": 0.17098630964756012,
"learning_rate": 1.2231886876456116e-05,
"loss": 1.079147219657898,
"step": 1222
},
{
"epoch": 2.241758241758242,
"grad_norm": 1.3435940742492676,
"learning_rate": 1.2166615582589613e-05,
"loss": 0.40509262681007385,
"step": 1224
},
{
"epoch": 2.2454212454212454,
"grad_norm": 0.24130238592624664,
"learning_rate": 1.210158433567616e-05,
"loss": 1.0378178358078003,
"step": 1226
},
{
"epoch": 2.249084249084249,
"grad_norm": 0.7028672695159912,
"learning_rate": 1.2036794153791905e-05,
"loss": 0.5614770650863647,
"step": 1228
},
{
"epoch": 2.2527472527472527,
"grad_norm": 0.14420144259929657,
"learning_rate": 1.1972246051239054e-05,
"loss": 0.5607399344444275,
"step": 1230
},
{
"epoch": 2.2564102564102564,
"grad_norm": 0.20063596963882446,
"learning_rate": 1.1907941038530015e-05,
"loss": 0.5380869507789612,
"step": 1232
},
{
"epoch": 2.26007326007326,
"grad_norm": 0.3156454563140869,
"learning_rate": 1.18438801223716e-05,
"loss": 1.0096523761749268,
"step": 1234
},
{
"epoch": 2.2637362637362637,
"grad_norm": 0.2204177975654602,
"learning_rate": 1.1780064305649224e-05,
"loss": 0.7427061796188354,
"step": 1236
},
{
"epoch": 2.2673992673992673,
"grad_norm": 0.24353505671024323,
"learning_rate": 1.1716494587411248e-05,
"loss": 0.857605516910553,
"step": 1238
},
{
"epoch": 2.271062271062271,
"grad_norm": 0.2163824439048767,
"learning_rate": 1.1653171962853291e-05,
"loss": 0.7742936015129089,
"step": 1240
},
{
"epoch": 2.2747252747252746,
"grad_norm": 0.5982560515403748,
"learning_rate": 1.1590097423302684e-05,
"loss": 0.8472159504890442,
"step": 1242
},
{
"epoch": 2.2783882783882783,
"grad_norm": 0.22811779379844666,
"learning_rate": 1.1527271956202947e-05,
"loss": 1.033808946609497,
"step": 1244
},
{
"epoch": 2.282051282051282,
"grad_norm": 0.2341231256723404,
"learning_rate": 1.1464696545098332e-05,
"loss": 0.6939253807067871,
"step": 1246
},
{
"epoch": 2.2857142857142856,
"grad_norm": 0.3489930033683777,
"learning_rate": 1.1402372169618398e-05,
"loss": 0.6756494641304016,
"step": 1248
},
{
"epoch": 2.2893772893772892,
"grad_norm": 0.21145479381084442,
"learning_rate": 1.1340299805462704e-05,
"loss": 1.0505157709121704,
"step": 1250
},
{
"epoch": 2.293040293040293,
"grad_norm": 0.31191250681877136,
"learning_rate": 1.1278480424385534e-05,
"loss": 0.7581150531768799,
"step": 1252
},
{
"epoch": 2.2967032967032965,
"grad_norm": 0.1843535155057907,
"learning_rate": 1.1216914994180659e-05,
"loss": 1.10303795337677,
"step": 1254
},
{
"epoch": 2.3003663003663,
"grad_norm": 0.4075881540775299,
"learning_rate": 1.1155604478666223e-05,
"loss": 1.1203564405441284,
"step": 1256
},
{
"epoch": 2.304029304029304,
"grad_norm": 1.3023512363433838,
"learning_rate": 1.1094549837669616e-05,
"loss": 0.7518989443778992,
"step": 1258
},
{
"epoch": 2.3076923076923075,
"grad_norm": 0.3842734396457672,
"learning_rate": 1.1033752027012465e-05,
"loss": 1.0648375749588013,
"step": 1260
},
{
"epoch": 2.311355311355311,
"grad_norm": 0.9410233497619629,
"learning_rate": 1.097321199849569e-05,
"loss": 0.9340834617614746,
"step": 1262
},
{
"epoch": 2.315018315018315,
"grad_norm": 0.2198442965745926,
"learning_rate": 1.0912930699884563e-05,
"loss": 0.83587646484375,
"step": 1264
},
{
"epoch": 2.3186813186813184,
"grad_norm": 0.1930425763130188,
"learning_rate": 1.08529090748939e-05,
"loss": 0.7000831961631775,
"step": 1266
},
{
"epoch": 2.3223443223443225,
"grad_norm": 0.6036158204078674,
"learning_rate": 1.0793148063173284e-05,
"loss": 0.7626188397407532,
"step": 1268
},
{
"epoch": 2.326007326007326,
"grad_norm": 0.2609650492668152,
"learning_rate": 1.073364860029234e-05,
"loss": 1.1375476121902466,
"step": 1270
},
{
"epoch": 2.32967032967033,
"grad_norm": 0.30271434783935547,
"learning_rate": 1.0674411617726106e-05,
"loss": 0.9527180194854736,
"step": 1272
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.9938449859619141,
"learning_rate": 1.0615438042840439e-05,
"loss": 0.5007555484771729,
"step": 1274
},
{
"epoch": 2.336996336996337,
"grad_norm": 0.821854293346405,
"learning_rate": 1.0556728798877488e-05,
"loss": 0.9555824398994446,
"step": 1276
},
{
"epoch": 2.340659340659341,
"grad_norm": 0.24786677956581116,
"learning_rate": 1.0498284804941277e-05,
"loss": 0.7994169592857361,
"step": 1278
},
{
"epoch": 2.3443223443223444,
"grad_norm": 0.3086180090904236,
"learning_rate": 1.0440106975983283e-05,
"loss": 0.7172934412956238,
"step": 1280
},
{
"epoch": 2.347985347985348,
"grad_norm": 0.41890591382980347,
"learning_rate": 1.0382196222788108e-05,
"loss": 0.5843296051025391,
"step": 1282
},
{
"epoch": 2.3516483516483517,
"grad_norm": 0.21397873759269714,
"learning_rate": 1.0324553451959245e-05,
"loss": 1.0417900085449219,
"step": 1284
},
{
"epoch": 2.3553113553113554,
"grad_norm": 0.5100418329238892,
"learning_rate": 1.0267179565904879e-05,
"loss": 0.4865255355834961,
"step": 1286
},
{
"epoch": 2.358974358974359,
"grad_norm": 0.6028478145599365,
"learning_rate": 1.0210075462823738e-05,
"loss": 0.8683855533599854,
"step": 1288
},
{
"epoch": 2.3626373626373627,
"grad_norm": 0.18368980288505554,
"learning_rate": 1.0153242036691071e-05,
"loss": 0.8409366607666016,
"step": 1290
},
{
"epoch": 2.3663003663003663,
"grad_norm": 0.13247643411159515,
"learning_rate": 1.0096680177244609e-05,
"loss": 0.7085995078086853,
"step": 1292
},
{
"epoch": 2.36996336996337,
"grad_norm": 0.27349239587783813,
"learning_rate": 1.0040390769970654e-05,
"loss": 0.7937886714935303,
"step": 1294
},
{
"epoch": 2.3736263736263736,
"grad_norm": 0.41278505325317383,
"learning_rate": 9.98437469609025e-06,
"loss": 1.1152591705322266,
"step": 1296
},
{
"epoch": 2.3772893772893773,
"grad_norm": 0.4278549551963806,
"learning_rate": 9.928632832545317e-06,
"loss": 1.0227138996124268,
"step": 1298
},
{
"epoch": 2.380952380952381,
"grad_norm": 0.21517953276634216,
"learning_rate": 9.873166051984998e-06,
"loss": 1.0946927070617676,
"step": 1300
},
{
"epoch": 2.3846153846153846,
"grad_norm": 0.3315783143043518,
"learning_rate": 9.817975222751931e-06,
"loss": 0.7763844728469849,
"step": 1302
},
{
"epoch": 2.3882783882783882,
"grad_norm": 0.44624730944633484,
"learning_rate": 9.763061208868699e-06,
"loss": 0.4395400285720825,
"step": 1304
},
{
"epoch": 2.391941391941392,
"grad_norm": 0.20948028564453125,
"learning_rate": 9.708424870024285e-06,
"loss": 0.8480145335197449,
"step": 1306
},
{
"epoch": 2.3956043956043955,
"grad_norm": 0.4898599088191986,
"learning_rate": 9.654067061560645e-06,
"loss": 1.0664393901824951,
"step": 1308
},
{
"epoch": 2.399267399267399,
"grad_norm": 0.21064221858978271,
"learning_rate": 9.599988634459236e-06,
"loss": 0.474110871553421,
"step": 1310
},
{
"epoch": 2.402930402930403,
"grad_norm": 0.3536030650138855,
"learning_rate": 9.546190435327795e-06,
"loss": 1.0670816898345947,
"step": 1312
},
{
"epoch": 2.4065934065934065,
"grad_norm": 0.2895529866218567,
"learning_rate": 9.492673306387029e-06,
"loss": 0.7731264233589172,
"step": 1314
},
{
"epoch": 2.41025641025641,
"grad_norm": 0.21338780224323273,
"learning_rate": 9.43943808545743e-06,
"loss": 1.0734295845031738,
"step": 1316
},
{
"epoch": 2.413919413919414,
"grad_norm": 0.5540740489959717,
"learning_rate": 9.386485605946164e-06,
"loss": 0.7238420248031616,
"step": 1318
},
{
"epoch": 2.4175824175824174,
"grad_norm": 0.1840064972639084,
"learning_rate": 9.333816696834049e-06,
"loss": 0.6843035221099854,
"step": 1320
},
{
"epoch": 2.421245421245421,
"grad_norm": 0.1636444330215454,
"learning_rate": 9.28143218266253e-06,
"loss": 0.5108417272567749,
"step": 1322
},
{
"epoch": 2.4249084249084247,
"grad_norm": 0.277536541223526,
"learning_rate": 9.229332883520825e-06,
"loss": 0.7295075058937073,
"step": 1324
},
{
"epoch": 2.4285714285714284,
"grad_norm": 0.1264895349740982,
"learning_rate": 9.177519615033034e-06,
"loss": 0.7249910831451416,
"step": 1326
},
{
"epoch": 2.4322344322344325,
"grad_norm": 0.19156897068023682,
"learning_rate": 9.125993188345402e-06,
"loss": 0.6318535208702087,
"step": 1328
},
{
"epoch": 2.435897435897436,
"grad_norm": 3.024097204208374,
"learning_rate": 9.074754410113628e-06,
"loss": 0.7735837697982788,
"step": 1330
},
{
"epoch": 2.4395604395604398,
"grad_norm": 0.14816512167453766,
"learning_rate": 9.023804082490197e-06,
"loss": 0.9631860256195068,
"step": 1332
},
{
"epoch": 2.4432234432234434,
"grad_norm": 0.43298929929733276,
"learning_rate": 8.973143003111863e-06,
"loss": 0.6613461971282959,
"step": 1334
},
{
"epoch": 2.446886446886447,
"grad_norm": 0.4770919382572174,
"learning_rate": 8.922771965087144e-06,
"loss": 0.5602841973304749,
"step": 1336
},
{
"epoch": 2.4505494505494507,
"grad_norm": 0.7131723165512085,
"learning_rate": 8.872691756983891e-06,
"loss": 0.9735853672027588,
"step": 1338
},
{
"epoch": 2.4542124542124544,
"grad_norm": 0.31672975420951843,
"learning_rate": 8.822903162816986e-06,
"loss": 0.7807232141494751,
"step": 1340
},
{
"epoch": 2.457875457875458,
"grad_norm": 0.08743062615394592,
"learning_rate": 8.773406962036031e-06,
"loss": 0.5491883754730225,
"step": 1342
},
{
"epoch": 2.4615384615384617,
"grad_norm": 0.16250762343406677,
"learning_rate": 8.724203929513133e-06,
"loss": 0.7840443253517151,
"step": 1344
},
{
"epoch": 2.4652014652014653,
"grad_norm": 0.3087010383605957,
"learning_rate": 8.675294835530828e-06,
"loss": 1.0146785974502563,
"step": 1346
},
{
"epoch": 2.468864468864469,
"grad_norm": 0.26062580943107605,
"learning_rate": 8.626680445769981e-06,
"loss": 1.0559192895889282,
"step": 1348
},
{
"epoch": 2.4725274725274726,
"grad_norm": 0.25493213534355164,
"learning_rate": 8.5783615212978e-06,
"loss": 0.6265292167663574,
"step": 1350
},
{
"epoch": 2.4761904761904763,
"grad_norm": 0.2113112509250641,
"learning_rate": 8.530338818555931e-06,
"loss": 0.711513340473175,
"step": 1352
},
{
"epoch": 2.47985347985348,
"grad_norm": 0.5152426362037659,
"learning_rate": 8.482613089348618e-06,
"loss": 0.8448625802993774,
"step": 1354
},
{
"epoch": 2.4835164835164836,
"grad_norm": 0.11193950474262238,
"learning_rate": 8.435185080830927e-06,
"loss": 0.8605793118476868,
"step": 1356
},
{
"epoch": 2.4871794871794872,
"grad_norm": 0.18866127729415894,
"learning_rate": 8.388055535497064e-06,
"loss": 1.0280365943908691,
"step": 1358
},
{
"epoch": 2.490842490842491,
"grad_norm": 0.4851335883140564,
"learning_rate": 8.341225191168722e-06,
"loss": 0.8356929421424866,
"step": 1360
},
{
"epoch": 2.4945054945054945,
"grad_norm": 0.18892696499824524,
"learning_rate": 8.29469478098355e-06,
"loss": 0.7504462599754333,
"step": 1362
},
{
"epoch": 2.498168498168498,
"grad_norm": 0.4648627042770386,
"learning_rate": 8.24846503338369e-06,
"loss": 0.7001456618309021,
"step": 1364
},
{
"epoch": 2.501831501831502,
"grad_norm": 0.2734161615371704,
"learning_rate": 8.202536672104326e-06,
"loss": 1.046680212020874,
"step": 1366
},
{
"epoch": 2.5054945054945055,
"grad_norm": 0.12593773007392883,
"learning_rate": 8.156910416162417e-06,
"loss": 0.6337849497795105,
"step": 1368
},
{
"epoch": 2.509157509157509,
"grad_norm": 0.3318650722503662,
"learning_rate": 8.111586979845383e-06,
"loss": 0.8238204121589661,
"step": 1370
},
{
"epoch": 2.5128205128205128,
"grad_norm": 0.1444663256406784,
"learning_rate": 8.066567072699946e-06,
"loss": 0.5835400819778442,
"step": 1372
},
{
"epoch": 2.5164835164835164,
"grad_norm": 0.26651981472969055,
"learning_rate": 8.021851399521048e-06,
"loss": 0.6938576102256775,
"step": 1374
},
{
"epoch": 2.52014652014652,
"grad_norm": 0.5255207419395447,
"learning_rate": 7.97744066034077e-06,
"loss": 0.9116069078445435,
"step": 1376
},
{
"epoch": 2.5238095238095237,
"grad_norm": 0.20919840037822723,
"learning_rate": 7.933335550417405e-06,
"loss": 0.7189561724662781,
"step": 1378
},
{
"epoch": 2.5274725274725274,
"grad_norm": 0.16163820028305054,
"learning_rate": 7.889536760224557e-06,
"loss": 0.7958462834358215,
"step": 1380
},
{
"epoch": 2.531135531135531,
"grad_norm": 0.1546577364206314,
"learning_rate": 7.846044975440334e-06,
"loss": 0.7697736620903015,
"step": 1382
},
{
"epoch": 2.5347985347985347,
"grad_norm": 0.1974683552980423,
"learning_rate": 7.802860876936636e-06,
"loss": 0.7680953741073608,
"step": 1384
},
{
"epoch": 2.5384615384615383,
"grad_norm": 0.1976369023323059,
"learning_rate": 7.759985140768474e-06,
"loss": 1.0490553379058838,
"step": 1386
},
{
"epoch": 2.542124542124542,
"grad_norm": 0.795116126537323,
"learning_rate": 7.717418438163362e-06,
"loss": 0.7246772050857544,
"step": 1388
},
{
"epoch": 2.5457875457875456,
"grad_norm": 0.20595654845237732,
"learning_rate": 7.675161435510869e-06,
"loss": 0.7472343444824219,
"step": 1390
},
{
"epoch": 2.5494505494505493,
"grad_norm": 0.3328467309474945,
"learning_rate": 7.633214794352146e-06,
"loss": 0.8970118761062622,
"step": 1392
},
{
"epoch": 2.553113553113553,
"grad_norm": 0.2709287405014038,
"learning_rate": 7.591579171369574e-06,
"loss": 0.6892199516296387,
"step": 1394
},
{
"epoch": 2.5567765567765566,
"grad_norm": 0.0892128273844719,
"learning_rate": 7.5502552183764845e-06,
"loss": 0.2729473412036896,
"step": 1396
},
{
"epoch": 2.5604395604395602,
"grad_norm": 0.2575327455997467,
"learning_rate": 7.5092435823069655e-06,
"loss": 1.0643916130065918,
"step": 1398
},
{
"epoch": 2.564102564102564,
"grad_norm": 0.42705675959587097,
"learning_rate": 7.468544905205714e-06,
"loss": 1.0583202838897705,
"step": 1400
},
{
"epoch": 2.5677655677655675,
"grad_norm": 0.3415769040584564,
"learning_rate": 7.428159824218017e-06,
"loss": 0.7452787756919861,
"step": 1402
},
{
"epoch": 2.571428571428571,
"grad_norm": 0.16148477792739868,
"learning_rate": 7.388088971579742e-06,
"loss": 1.0535763502120972,
"step": 1404
},
{
"epoch": 2.575091575091575,
"grad_norm": 0.7299770712852478,
"learning_rate": 7.348332974607445e-06,
"loss": 0.47556331753730774,
"step": 1406
},
{
"epoch": 2.578754578754579,
"grad_norm": 0.1957835853099823,
"learning_rate": 7.308892455688579e-06,
"loss": 1.0507322549819946,
"step": 1408
},
{
"epoch": 2.5824175824175826,
"grad_norm": 0.8958855867385864,
"learning_rate": 7.269768032271726e-06,
"loss": 0.7786467671394348,
"step": 1410
},
{
"epoch": 2.586080586080586,
"grad_norm": 0.5932020545005798,
"learning_rate": 7.230960316856925e-06,
"loss": 0.7563549876213074,
"step": 1412
},
{
"epoch": 2.58974358974359,
"grad_norm": 1.3887028694152832,
"learning_rate": 7.192469916986099e-06,
"loss": 0.9135017395019531,
"step": 1414
},
{
"epoch": 2.5934065934065935,
"grad_norm": 0.3214952051639557,
"learning_rate": 7.154297435233528e-06,
"loss": 1.0506607294082642,
"step": 1416
},
{
"epoch": 2.597069597069597,
"grad_norm": 0.4495256841182709,
"learning_rate": 7.116443469196446e-06,
"loss": 0.6735981702804565,
"step": 1418
},
{
"epoch": 2.600732600732601,
"grad_norm": 0.20636308193206787,
"learning_rate": 7.078908611485656e-06,
"loss": 1.0373022556304932,
"step": 1420
},
{
"epoch": 2.6043956043956045,
"grad_norm": 0.753204345703125,
"learning_rate": 7.041693449716244e-06,
"loss": 0.9630070328712463,
"step": 1422
},
{
"epoch": 2.608058608058608,
"grad_norm": 0.2278732806444168,
"learning_rate": 7.00479856649842e-06,
"loss": 1.1420621871948242,
"step": 1424
},
{
"epoch": 2.6117216117216118,
"grad_norm": 0.28468847274780273,
"learning_rate": 6.96822453942837e-06,
"loss": 1.1209793090820312,
"step": 1426
},
{
"epoch": 2.6153846153846154,
"grad_norm": 0.3516117036342621,
"learning_rate": 6.931971941079208e-06,
"loss": 1.1041990518569946,
"step": 1428
},
{
"epoch": 2.619047619047619,
"grad_norm": 0.18154621124267578,
"learning_rate": 6.896041338992029e-06,
"loss": 1.0311168432235718,
"step": 1430
},
{
"epoch": 2.6227106227106227,
"grad_norm": 0.5342852473258972,
"learning_rate": 6.860433295667022e-06,
"loss": 0.8854894042015076,
"step": 1432
},
{
"epoch": 2.6263736263736264,
"grad_norm": 0.8895637392997742,
"learning_rate": 6.825148368554646e-06,
"loss": 0.8600127696990967,
"step": 1434
},
{
"epoch": 2.63003663003663,
"grad_norm": 0.5177263021469116,
"learning_rate": 6.790187110046933e-06,
"loss": 1.2419568300247192,
"step": 1436
},
{
"epoch": 2.6336996336996337,
"grad_norm": 0.1689794361591339,
"learning_rate": 6.755550067468812e-06,
"loss": 1.0835387706756592,
"step": 1438
},
{
"epoch": 2.6373626373626373,
"grad_norm": 0.2473716288805008,
"learning_rate": 6.721237783069546e-06,
"loss": 1.0448336601257324,
"step": 1440
},
{
"epoch": 2.641025641025641,
"grad_norm": 0.14038227498531342,
"learning_rate": 6.687250794014273e-06,
"loss": 1.0791858434677124,
"step": 1442
},
{
"epoch": 2.6446886446886446,
"grad_norm": 0.30842867493629456,
"learning_rate": 6.653589632375541e-06,
"loss": 0.9658035635948181,
"step": 1444
},
{
"epoch": 2.6483516483516483,
"grad_norm": 0.30728021264076233,
"learning_rate": 6.6202548251250414e-06,
"loss": 0.7608792185783386,
"step": 1446
},
{
"epoch": 2.652014652014652,
"grad_norm": 0.19324815273284912,
"learning_rate": 6.587246894125303e-06,
"loss": 0.7707818150520325,
"step": 1448
},
{
"epoch": 2.6556776556776556,
"grad_norm": 0.16655150055885315,
"learning_rate": 6.554566356121558e-06,
"loss": 1.038588285446167,
"step": 1450
},
{
"epoch": 2.659340659340659,
"grad_norm": 0.10667542368173599,
"learning_rate": 6.522213722733638e-06,
"loss": 0.5798073410987854,
"step": 1452
},
{
"epoch": 2.663003663003663,
"grad_norm": 0.23501524329185486,
"learning_rate": 6.490189500447973e-06,
"loss": 0.6129744648933411,
"step": 1454
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.5228734612464905,
"learning_rate": 6.4584941906096515e-06,
"loss": 1.010016918182373,
"step": 1456
},
{
"epoch": 2.67032967032967,
"grad_norm": 0.1614047735929489,
"learning_rate": 6.427128289414573e-06,
"loss": 0.7019752264022827,
"step": 1458
},
{
"epoch": 2.6739926739926743,
"grad_norm": 0.5308529138565063,
"learning_rate": 6.396092287901696e-06,
"loss": 0.6532785296440125,
"step": 1460
},
{
"epoch": 2.677655677655678,
"grad_norm": 0.22194698452949524,
"learning_rate": 6.365386671945331e-06,
"loss": 0.7371679544448853,
"step": 1462
},
{
"epoch": 2.6813186813186816,
"grad_norm": 1.3231110572814941,
"learning_rate": 6.335011922247535e-06,
"loss": 0.9731379151344299,
"step": 1464
},
{
"epoch": 2.684981684981685,
"grad_norm": 0.15819787979125977,
"learning_rate": 6.304968514330613e-06,
"loss": 0.8071764707565308,
"step": 1466
},
{
"epoch": 2.688644688644689,
"grad_norm": 0.22951114177703857,
"learning_rate": 6.275256918529631e-06,
"loss": 0.95961993932724,
"step": 1468
},
{
"epoch": 2.6923076923076925,
"grad_norm": 0.6898245215415955,
"learning_rate": 6.245877599985094e-06,
"loss": 0.5869124531745911,
"step": 1470
},
{
"epoch": 2.695970695970696,
"grad_norm": 0.4145790934562683,
"learning_rate": 6.216831018635631e-06,
"loss": 0.7107551097869873,
"step": 1472
},
{
"epoch": 2.6996336996337,
"grad_norm": 0.5302114486694336,
"learning_rate": 6.188117629210814e-06,
"loss": 0.4114135205745697,
"step": 1474
},
{
"epoch": 2.7032967032967035,
"grad_norm": 0.254586398601532,
"learning_rate": 6.159737881224042e-06,
"loss": 1.0859794616699219,
"step": 1476
},
{
"epoch": 2.706959706959707,
"grad_norm": 0.6441674828529358,
"learning_rate": 6.131692218965484e-06,
"loss": 0.5880909562110901,
"step": 1478
},
{
"epoch": 2.7106227106227108,
"grad_norm": 0.1711389720439911,
"learning_rate": 6.103981081495144e-06,
"loss": 1.0421608686447144,
"step": 1480
},
{
"epoch": 2.7142857142857144,
"grad_norm": 0.20255252718925476,
"learning_rate": 6.076604902635971e-06,
"loss": 1.0526020526885986,
"step": 1482
},
{
"epoch": 2.717948717948718,
"grad_norm": 0.8872889280319214,
"learning_rate": 6.049564110967082e-06,
"loss": 0.9588233828544617,
"step": 1484
},
{
"epoch": 2.7216117216117217,
"grad_norm": 0.2663383483886719,
"learning_rate": 6.022859129817042e-06,
"loss": 1.0862208604812622,
"step": 1486
},
{
"epoch": 2.7252747252747254,
"grad_norm": 0.15567375719547272,
"learning_rate": 5.996490377257248e-06,
"loss": 1.091988444328308,
"step": 1488
},
{
"epoch": 2.728937728937729,
"grad_norm": 0.3074291944503784,
"learning_rate": 5.970458266095369e-06,
"loss": 0.4964509606361389,
"step": 1490
},
{
"epoch": 2.7326007326007327,
"grad_norm": 0.20127466320991516,
"learning_rate": 5.944763203868888e-06,
"loss": 1.0711864233016968,
"step": 1492
},
{
"epoch": 2.7362637362637363,
"grad_norm": 0.22651104629039764,
"learning_rate": 5.919405592838733e-06,
"loss": 0.5613836050033569,
"step": 1494
},
{
"epoch": 2.73992673992674,
"grad_norm": 0.20297077298164368,
"learning_rate": 5.894385829982967e-06,
"loss": 1.1413242816925049,
"step": 1496
},
{
"epoch": 2.7435897435897436,
"grad_norm": 0.20519724488258362,
"learning_rate": 5.869704306990585e-06,
"loss": 1.0266319513320923,
"step": 1498
},
{
"epoch": 2.7472527472527473,
"grad_norm": 3.1034557819366455,
"learning_rate": 5.8453614102553605e-06,
"loss": 0.6879111528396606,
"step": 1500
},
{
"epoch": 2.750915750915751,
"grad_norm": 0.4873732030391693,
"learning_rate": 5.821357520869821e-06,
"loss": 0.9691627621650696,
"step": 1502
},
{
"epoch": 2.7545787545787546,
"grad_norm": 0.13368584215641022,
"learning_rate": 5.797693014619274e-06,
"loss": 1.0586458444595337,
"step": 1504
},
{
"epoch": 2.758241758241758,
"grad_norm": 0.18307749927043915,
"learning_rate": 5.774368261975912e-06,
"loss": 1.037876844406128,
"step": 1506
},
{
"epoch": 2.761904761904762,
"grad_norm": 0.10759836435317993,
"learning_rate": 5.751383628093026e-06,
"loss": 0.8368395566940308,
"step": 1508
},
{
"epoch": 2.7655677655677655,
"grad_norm": 0.2728974521160126,
"learning_rate": 5.728739472799295e-06,
"loss": 0.8790582418441772,
"step": 1510
},
{
"epoch": 2.769230769230769,
"grad_norm": 0.2314990609884262,
"learning_rate": 5.706436150593126e-06,
"loss": 0.8743211627006531,
"step": 1512
},
{
"epoch": 2.772893772893773,
"grad_norm": 0.15124575793743134,
"learning_rate": 5.684474010637134e-06,
"loss": 1.0424885749816895,
"step": 1514
},
{
"epoch": 2.7765567765567765,
"grad_norm": 0.21101588010787964,
"learning_rate": 5.662853396752659e-06,
"loss": 0.943360447883606,
"step": 1516
},
{
"epoch": 2.78021978021978,
"grad_norm": 0.10518907010555267,
"learning_rate": 5.641574647414386e-06,
"loss": 0.921418309211731,
"step": 1518
},
{
"epoch": 2.7838827838827838,
"grad_norm": 0.6700722575187683,
"learning_rate": 5.620638095745048e-06,
"loss": 0.4822154641151428,
"step": 1520
},
{
"epoch": 2.7875457875457874,
"grad_norm": 0.21197772026062012,
"learning_rate": 5.600044069510221e-06,
"loss": 0.708233118057251,
"step": 1522
},
{
"epoch": 2.791208791208791,
"grad_norm": 0.5329016447067261,
"learning_rate": 5.579792891113163e-06,
"loss": 0.7894065976142883,
"step": 1524
},
{
"epoch": 2.7948717948717947,
"grad_norm": 0.23669062554836273,
"learning_rate": 5.5598848775897975e-06,
"loss": 1.0895702838897705,
"step": 1526
},
{
"epoch": 2.7985347985347984,
"grad_norm": 0.2975974977016449,
"learning_rate": 5.540320340603742e-06,
"loss": 1.0676382780075073,
"step": 1528
},
{
"epoch": 2.802197802197802,
"grad_norm": 0.35699358582496643,
"learning_rate": 5.52109958644142e-06,
"loss": 1.047616958618164,
"step": 1530
},
{
"epoch": 2.8058608058608057,
"grad_norm": 0.16987060010433197,
"learning_rate": 5.50222291600727e-06,
"loss": 0.9621225595474243,
"step": 1532
},
{
"epoch": 2.8095238095238093,
"grad_norm": 0.34407544136047363,
"learning_rate": 5.483690624819042e-06,
"loss": 0.7081210613250732,
"step": 1534
},
{
"epoch": 2.813186813186813,
"grad_norm": 0.1482367217540741,
"learning_rate": 5.4655030030031616e-06,
"loss": 1.1918277740478516,
"step": 1536
},
{
"epoch": 2.8168498168498166,
"grad_norm": 0.34928014874458313,
"learning_rate": 5.4476603352901945e-06,
"loss": 0.8316318392753601,
"step": 1538
},
{
"epoch": 2.8205128205128203,
"grad_norm": 0.3218369781970978,
"learning_rate": 5.430162901010386e-06,
"loss": 0.7342109084129333,
"step": 1540
},
{
"epoch": 2.824175824175824,
"grad_norm": 0.21855826675891876,
"learning_rate": 5.413010974089283e-06,
"loss": 0.8212740421295166,
"step": 1542
},
{
"epoch": 2.8278388278388276,
"grad_norm": 0.2800341248512268,
"learning_rate": 5.39620482304346e-06,
"loss": 1.0205451250076294,
"step": 1544
},
{
"epoch": 2.8315018315018317,
"grad_norm": 0.21291717886924744,
"learning_rate": 5.379744710976301e-06,
"loss": 1.0645310878753662,
"step": 1546
},
{
"epoch": 2.8351648351648353,
"grad_norm": 0.15496331453323364,
"learning_rate": 5.363630895573892e-06,
"loss": 1.1228570938110352,
"step": 1548
},
{
"epoch": 2.838827838827839,
"grad_norm": 0.8466178178787231,
"learning_rate": 5.347863629100969e-06,
"loss": 0.737494945526123,
"step": 1550
},
{
"epoch": 2.8424908424908426,
"grad_norm": 0.04996780306100845,
"learning_rate": 5.332443158396993e-06,
"loss": 0.5186063051223755,
"step": 1552
},
{
"epoch": 2.8461538461538463,
"grad_norm": 0.34756842255592346,
"learning_rate": 5.317369724872267e-06,
"loss": 1.0735743045806885,
"step": 1554
},
{
"epoch": 2.84981684981685,
"grad_norm": 0.25667431950569153,
"learning_rate": 5.302643564504168e-06,
"loss": 0.8242087364196777,
"step": 1556
},
{
"epoch": 2.8534798534798536,
"grad_norm": 0.5141234397888184,
"learning_rate": 5.288264907833445e-06,
"loss": 0.9391310811042786,
"step": 1558
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.37790647149086,
"learning_rate": 5.274233979960608e-06,
"loss": 0.511182427406311,
"step": 1560
},
{
"epoch": 2.860805860805861,
"grad_norm": 0.8588418960571289,
"learning_rate": 5.260551000542418e-06,
"loss": 0.6005702614784241,
"step": 1562
},
{
"epoch": 2.8644688644688645,
"grad_norm": 0.20086157321929932,
"learning_rate": 5.247216183788431e-06,
"loss": 0.7859454154968262,
"step": 1564
},
{
"epoch": 2.868131868131868,
"grad_norm": 0.5744203925132751,
"learning_rate": 5.234229738457658e-06,
"loss": 0.5249977111816406,
"step": 1566
},
{
"epoch": 2.871794871794872,
"grad_norm": 0.588792085647583,
"learning_rate": 5.221591867855286e-06,
"loss": 0.677643895149231,
"step": 1568
},
{
"epoch": 2.8754578754578755,
"grad_norm": 0.06420119106769562,
"learning_rate": 5.209302769829507e-06,
"loss": 0.5973821878433228,
"step": 1570
},
{
"epoch": 2.879120879120879,
"grad_norm": 0.18718576431274414,
"learning_rate": 5.197362636768409e-06,
"loss": 0.613332986831665,
"step": 1572
},
{
"epoch": 2.8827838827838828,
"grad_norm": 0.2197110801935196,
"learning_rate": 5.185771655596972e-06,
"loss": 0.9175146818161011,
"step": 1574
},
{
"epoch": 2.8864468864468864,
"grad_norm": 5.933550834655762,
"learning_rate": 5.174530007774135e-06,
"loss": 0.8471065163612366,
"step": 1576
},
{
"epoch": 2.89010989010989,
"grad_norm": 0.09670909494161606,
"learning_rate": 5.1636378692899665e-06,
"loss": 0.8234681487083435,
"step": 1578
},
{
"epoch": 2.8937728937728937,
"grad_norm": 0.1601630300283432,
"learning_rate": 5.153095410662896e-06,
"loss": 1.1230218410491943,
"step": 1580
},
{
"epoch": 2.8974358974358974,
"grad_norm": 2.9173190593719482,
"learning_rate": 5.142902796937052e-06,
"loss": 0.7799305319786072,
"step": 1582
},
{
"epoch": 2.901098901098901,
"grad_norm": 0.03245487064123154,
"learning_rate": 5.133060187679675e-06,
"loss": 0.7026646733283997,
"step": 1584
},
{
"epoch": 2.9047619047619047,
"grad_norm": 0.26367539167404175,
"learning_rate": 5.1235677369786265e-06,
"loss": 0.6960863471031189,
"step": 1586
},
{
"epoch": 2.9084249084249083,
"grad_norm": 0.24362139403820038,
"learning_rate": 5.1144255934399655e-06,
"loss": 1.0824929475784302,
"step": 1588
},
{
"epoch": 2.912087912087912,
"grad_norm": 0.23772361874580383,
"learning_rate": 5.105633900185632e-06,
"loss": 1.0874613523483276,
"step": 1590
},
{
"epoch": 2.9157509157509156,
"grad_norm": 0.30294981598854065,
"learning_rate": 5.0971927948512e-06,
"loss": 0.4234909117221832,
"step": 1592
},
{
"epoch": 2.9194139194139193,
"grad_norm": 0.24272647500038147,
"learning_rate": 5.089102409583725e-06,
"loss": 1.0570107698440552,
"step": 1594
},
{
"epoch": 2.9230769230769234,
"grad_norm": 0.20444297790527344,
"learning_rate": 5.081362871039677e-06,
"loss": 0.6874979138374329,
"step": 1596
},
{
"epoch": 2.926739926739927,
"grad_norm": 0.40901777148246765,
"learning_rate": 5.073974300382959e-06,
"loss": 1.0847806930541992,
"step": 1598
},
{
"epoch": 2.9304029304029307,
"grad_norm": 0.06832870841026306,
"learning_rate": 5.066936813282996e-06,
"loss": 0.6706178784370422,
"step": 1600
},
{
"epoch": 2.9340659340659343,
"grad_norm": 0.16809964179992676,
"learning_rate": 5.060250519912951e-06,
"loss": 1.0802940130233765,
"step": 1602
},
{
"epoch": 2.937728937728938,
"grad_norm": 0.11709550023078918,
"learning_rate": 5.053915524947969e-06,
"loss": 0.7102103233337402,
"step": 1604
},
{
"epoch": 2.9413919413919416,
"grad_norm": 0.18930549919605255,
"learning_rate": 5.047931927563565e-06,
"loss": 1.052394986152649,
"step": 1606
},
{
"epoch": 2.9450549450549453,
"grad_norm": 0.17763479053974152,
"learning_rate": 5.042299821434059e-06,
"loss": 0.6530783772468567,
"step": 1608
},
{
"epoch": 2.948717948717949,
"grad_norm": 0.35226595401763916,
"learning_rate": 5.037019294731103e-06,
"loss": 0.8992307186126709,
"step": 1610
},
{
"epoch": 2.9523809523809526,
"grad_norm": 0.30645254254341125,
"learning_rate": 5.032090430122316e-06,
"loss": 0.7746174335479736,
"step": 1612
},
{
"epoch": 2.956043956043956,
"grad_norm": 0.7316517233848572,
"learning_rate": 5.0275133047699814e-06,
"loss": 0.6159262657165527,
"step": 1614
},
{
"epoch": 2.95970695970696,
"grad_norm": 0.20291663706302643,
"learning_rate": 5.023287990329835e-06,
"loss": 0.737842857837677,
"step": 1616
},
{
"epoch": 2.9633699633699635,
"grad_norm": 0.3495129942893982,
"learning_rate": 5.019414552949955e-06,
"loss": 1.2598001956939697,
"step": 1618
},
{
"epoch": 2.967032967032967,
"grad_norm": 0.15239816904067993,
"learning_rate": 5.015893053269714e-06,
"loss": 1.167555332183838,
"step": 1620
},
{
"epoch": 2.970695970695971,
"grad_norm": 0.20208200812339783,
"learning_rate": 5.012723546418838e-06,
"loss": 0.8371485471725464,
"step": 1622
},
{
"epoch": 2.9743589743589745,
"grad_norm": 0.15967847406864166,
"learning_rate": 5.009906082016538e-06,
"loss": 0.733574390411377,
"step": 1624
},
{
"epoch": 2.978021978021978,
"grad_norm": 0.17362992465496063,
"learning_rate": 5.007440704170741e-06,
"loss": 0.7777770161628723,
"step": 1626
},
{
"epoch": 2.9816849816849818,
"grad_norm": 0.15043112635612488,
"learning_rate": 5.005327451477387e-06,
"loss": 0.8784082531929016,
"step": 1628
},
{
"epoch": 2.9853479853479854,
"grad_norm": 0.19129148125648499,
"learning_rate": 5.003566357019837e-06,
"loss": 1.2974438667297363,
"step": 1630
},
{
"epoch": 2.989010989010989,
"grad_norm": 0.3431568741798401,
"learning_rate": 5.002157448368347e-06,
"loss": 0.9204556345939636,
"step": 1632
},
{
"epoch": 2.9926739926739927,
"grad_norm": 0.16419149935245514,
"learning_rate": 5.001100747579644e-06,
"loss": 0.6911695003509521,
"step": 1634
},
{
"epoch": 2.9963369963369964,
"grad_norm": 0.48860520124435425,
"learning_rate": 5.000396271196573e-06,
"loss": 1.1634691953659058,
"step": 1636
},
{
"epoch": 3.0,
"grad_norm": 0.28968650102615356,
"learning_rate": 5.000044030247836e-06,
"loss": 1.0265119075775146,
"step": 1638
},
{
"epoch": 3.0,
"step": 1638,
"total_flos": 8.4482141520606e+18,
"train_loss": 0.9791712072451618,
"train_runtime": 55340.5169,
"train_samples_per_second": 0.71,
"train_steps_per_second": 0.03
}
],
"logging_steps": 2,
"max_steps": 1638,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.4482141520606e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}