{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 37.0, "eval_steps": 500, "global_step": 1332, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.028169014084507043, "grad_norm": 12.737117767333984, "learning_rate": 9.999993146109795e-07, "loss": 0.6797127723693848, "step": 1 }, { "epoch": 0.056338028169014086, "grad_norm": 12.04797649383545, "learning_rate": 9.999972584460056e-07, "loss": 0.6627321243286133, "step": 2 }, { "epoch": 0.08450704225352113, "grad_norm": 12.632461547851562, "learning_rate": 9.99993831511342e-07, "loss": 0.6829236149787903, "step": 3 }, { "epoch": 0.11267605633802817, "grad_norm": 11.97681713104248, "learning_rate": 9.999890338174275e-07, "loss": 0.6625960469245911, "step": 4 }, { "epoch": 0.14084507042253522, "grad_norm": 11.185710906982422, "learning_rate": 9.99982865378877e-07, "loss": 0.6418126821517944, "step": 5 }, { "epoch": 0.16901408450704225, "grad_norm": 11.49565315246582, "learning_rate": 9.999753262144804e-07, "loss": 0.6464570760726929, "step": 6 }, { "epoch": 0.19718309859154928, "grad_norm": 10.954561233520508, "learning_rate": 9.999664163472034e-07, "loss": 0.63329017162323, "step": 7 }, { "epoch": 0.22535211267605634, "grad_norm": 10.728333473205566, "learning_rate": 9.999561358041868e-07, "loss": 0.6382037401199341, "step": 8 }, { "epoch": 0.2535211267605634, "grad_norm": 8.404616355895996, "learning_rate": 9.99944484616747e-07, "loss": 0.5870345830917358, "step": 9 }, { "epoch": 0.28169014084507044, "grad_norm": 7.616209983825684, "learning_rate": 9.99931462820376e-07, "loss": 0.5672095417976379, "step": 10 }, { "epoch": 0.30985915492957744, "grad_norm": 7.800975799560547, "learning_rate": 9.999170704547398e-07, "loss": 0.581696629524231, "step": 11 }, { "epoch": 0.3380281690140845, "grad_norm": 7.584338665008545, "learning_rate": 9.999013075636804e-07, "loss": 0.5873032808303833, "step": 12 }, { "epoch": 0.36619718309859156, "grad_norm": 6.736105442047119, "learning_rate": 9.998841741952141e-07, "loss": 0.5502372980117798, "step": 13 }, { "epoch": 0.39436619718309857, "grad_norm": 6.839756965637207, "learning_rate": 9.998656704015323e-07, "loss": 0.5653150677680969, "step": 14 }, { "epoch": 0.4225352112676056, "grad_norm": 7.052567005157471, "learning_rate": 9.998457962390008e-07, "loss": 0.5660480260848999, "step": 15 }, { "epoch": 0.4507042253521127, "grad_norm": 6.61349630355835, "learning_rate": 9.998245517681593e-07, "loss": 0.552219033241272, "step": 16 }, { "epoch": 0.4788732394366197, "grad_norm": 3.9956817626953125, "learning_rate": 9.998019370537227e-07, "loss": 0.5171241760253906, "step": 17 }, { "epoch": 0.5070422535211268, "grad_norm": 3.6887121200561523, "learning_rate": 9.997779521645791e-07, "loss": 0.5023034811019897, "step": 18 }, { "epoch": 0.5352112676056338, "grad_norm": 3.6457769870758057, "learning_rate": 9.997525971737909e-07, "loss": 0.505454421043396, "step": 19 }, { "epoch": 0.5633802816901409, "grad_norm": 3.398740530014038, "learning_rate": 9.997258721585931e-07, "loss": 0.4978747069835663, "step": 20 }, { "epoch": 0.5915492957746479, "grad_norm": 3.2862207889556885, "learning_rate": 9.99697777200395e-07, "loss": 0.5002620220184326, "step": 21 }, { "epoch": 0.6197183098591549, "grad_norm": 3.3747572898864746, "learning_rate": 9.996683123847795e-07, "loss": 0.5069968700408936, "step": 22 }, { "epoch": 0.647887323943662, "grad_norm": 3.001546621322632, "learning_rate": 9.996374778015007e-07, "loss": 0.4922000765800476, "step": 23 }, { "epoch": 0.676056338028169, "grad_norm": 2.996706962585449, "learning_rate": 9.996052735444862e-07, "loss": 0.4938335716724396, "step": 24 }, { "epoch": 0.704225352112676, "grad_norm": 2.668245315551758, "learning_rate": 9.99571699711836e-07, "loss": 0.49115338921546936, "step": 25 }, { "epoch": 0.7323943661971831, "grad_norm": 2.4952428340911865, "learning_rate": 9.995367564058216e-07, "loss": 0.4847099483013153, "step": 26 }, { "epoch": 0.7605633802816901, "grad_norm": 2.529451847076416, "learning_rate": 9.995004437328865e-07, "loss": 0.48129573464393616, "step": 27 }, { "epoch": 0.7887323943661971, "grad_norm": 2.479883909225464, "learning_rate": 9.994627618036452e-07, "loss": 0.5088395476341248, "step": 28 }, { "epoch": 0.8169014084507042, "grad_norm": 2.414393424987793, "learning_rate": 9.994237107328838e-07, "loss": 0.48045098781585693, "step": 29 }, { "epoch": 0.8450704225352113, "grad_norm": 2.2080600261688232, "learning_rate": 9.993832906395582e-07, "loss": 0.47147125005722046, "step": 30 }, { "epoch": 0.8732394366197183, "grad_norm": 1.912841558456421, "learning_rate": 9.993415016467952e-07, "loss": 0.4724900424480438, "step": 31 }, { "epoch": 0.9014084507042254, "grad_norm": 1.282597303390503, "learning_rate": 9.992983438818915e-07, "loss": 0.46792298555374146, "step": 32 }, { "epoch": 0.9295774647887324, "grad_norm": 1.4362828731536865, "learning_rate": 9.992538174763127e-07, "loss": 0.45093870162963867, "step": 33 }, { "epoch": 0.9577464788732394, "grad_norm": 1.4296821355819702, "learning_rate": 9.992079225656944e-07, "loss": 0.44724205136299133, "step": 34 }, { "epoch": 0.9859154929577465, "grad_norm": 1.4829713106155396, "learning_rate": 9.9916065928984e-07, "loss": 0.44936883449554443, "step": 35 }, { "epoch": 1.0, "grad_norm": 1.387039303779602, "learning_rate": 9.991120277927223e-07, "loss": 0.47316086292266846, "step": 36 }, { "epoch": 1.028169014084507, "grad_norm": 1.3140299320220947, "learning_rate": 9.990620282224806e-07, "loss": 0.4389120638370514, "step": 37 }, { "epoch": 1.056338028169014, "grad_norm": 1.2881019115447998, "learning_rate": 9.990106607314225e-07, "loss": 0.43830516934394836, "step": 38 }, { "epoch": 1.084507042253521, "grad_norm": 1.1489726305007935, "learning_rate": 9.989579254760224e-07, "loss": 0.44559216499328613, "step": 39 }, { "epoch": 1.1126760563380282, "grad_norm": 1.0595662593841553, "learning_rate": 9.989038226169207e-07, "loss": 0.43717890977859497, "step": 40 }, { "epoch": 1.1408450704225352, "grad_norm": 0.9458185434341431, "learning_rate": 9.988483523189248e-07, "loss": 0.43611639738082886, "step": 41 }, { "epoch": 1.1690140845070423, "grad_norm": 0.8811507821083069, "learning_rate": 9.98791514751006e-07, "loss": 0.4194882810115814, "step": 42 }, { "epoch": 1.1971830985915493, "grad_norm": 0.7880372405052185, "learning_rate": 9.98733310086302e-07, "loss": 0.4363758862018585, "step": 43 }, { "epoch": 1.2253521126760563, "grad_norm": 0.7736399173736572, "learning_rate": 9.98673738502114e-07, "loss": 0.43049588799476624, "step": 44 }, { "epoch": 1.2535211267605635, "grad_norm": 0.7198370695114136, "learning_rate": 9.986128001799076e-07, "loss": 0.43443119525909424, "step": 45 }, { "epoch": 1.2816901408450705, "grad_norm": 0.7174084186553955, "learning_rate": 9.985504953053113e-07, "loss": 0.43092280626296997, "step": 46 }, { "epoch": 1.3098591549295775, "grad_norm": 0.7043387293815613, "learning_rate": 9.984868240681164e-07, "loss": 0.417573481798172, "step": 47 }, { "epoch": 1.3380281690140845, "grad_norm": 0.6884390115737915, "learning_rate": 9.98421786662277e-07, "loss": 0.4211745262145996, "step": 48 }, { "epoch": 1.3661971830985915, "grad_norm": 0.7091729044914246, "learning_rate": 9.983553832859078e-07, "loss": 0.4147814214229584, "step": 49 }, { "epoch": 1.3943661971830985, "grad_norm": 0.6925486326217651, "learning_rate": 9.982876141412855e-07, "loss": 0.432437002658844, "step": 50 }, { "epoch": 1.4225352112676055, "grad_norm": 0.7119179368019104, "learning_rate": 9.982184794348462e-07, "loss": 0.41633373498916626, "step": 51 }, { "epoch": 1.4507042253521127, "grad_norm": 0.6801888346672058, "learning_rate": 9.981479793771866e-07, "loss": 0.4228135645389557, "step": 52 }, { "epoch": 1.4788732394366197, "grad_norm": 0.6876774430274963, "learning_rate": 9.98076114183062e-07, "loss": 0.41455432772636414, "step": 53 }, { "epoch": 1.5070422535211268, "grad_norm": 0.6285378336906433, "learning_rate": 9.98002884071386e-07, "loss": 0.41491252183914185, "step": 54 }, { "epoch": 1.5352112676056338, "grad_norm": 0.6261480450630188, "learning_rate": 9.979282892652304e-07, "loss": 0.42695990204811096, "step": 55 }, { "epoch": 1.563380281690141, "grad_norm": 0.6269007325172424, "learning_rate": 9.97852329991824e-07, "loss": 0.41284894943237305, "step": 56 }, { "epoch": 1.591549295774648, "grad_norm": 0.6070351600646973, "learning_rate": 9.977750064825519e-07, "loss": 0.42982780933380127, "step": 57 }, { "epoch": 1.619718309859155, "grad_norm": 0.5970191955566406, "learning_rate": 9.976963189729547e-07, "loss": 0.41365376114845276, "step": 58 }, { "epoch": 1.647887323943662, "grad_norm": 0.5778729319572449, "learning_rate": 9.976162677027284e-07, "loss": 0.42080622911453247, "step": 59 }, { "epoch": 1.676056338028169, "grad_norm": 0.5267013907432556, "learning_rate": 9.975348529157229e-07, "loss": 0.40949106216430664, "step": 60 }, { "epoch": 1.704225352112676, "grad_norm": 0.5284983515739441, "learning_rate": 9.974520748599421e-07, "loss": 0.4082256555557251, "step": 61 }, { "epoch": 1.732394366197183, "grad_norm": 0.49156272411346436, "learning_rate": 9.973679337875418e-07, "loss": 0.3944624662399292, "step": 62 }, { "epoch": 1.76056338028169, "grad_norm": 0.4944726824760437, "learning_rate": 9.972824299548309e-07, "loss": 0.4087256193161011, "step": 63 }, { "epoch": 1.788732394366197, "grad_norm": 0.4764452874660492, "learning_rate": 9.971955636222684e-07, "loss": 0.4067206382751465, "step": 64 }, { "epoch": 1.8169014084507042, "grad_norm": 0.48928746581077576, "learning_rate": 9.971073350544644e-07, "loss": 0.4004918336868286, "step": 65 }, { "epoch": 1.8450704225352113, "grad_norm": 0.4580424726009369, "learning_rate": 9.970177445201783e-07, "loss": 0.4040325880050659, "step": 66 }, { "epoch": 1.8732394366197183, "grad_norm": 0.5053924322128296, "learning_rate": 9.969267922923188e-07, "loss": 0.40139085054397583, "step": 67 }, { "epoch": 1.9014084507042255, "grad_norm": 0.4661526679992676, "learning_rate": 9.968344786479415e-07, "loss": 0.38993388414382935, "step": 68 }, { "epoch": 1.9295774647887325, "grad_norm": 0.4677845537662506, "learning_rate": 9.967408038682505e-07, "loss": 0.4014376401901245, "step": 69 }, { "epoch": 1.9577464788732395, "grad_norm": 0.4655434787273407, "learning_rate": 9.96645768238595e-07, "loss": 0.3975449204444885, "step": 70 }, { "epoch": 1.9859154929577465, "grad_norm": 0.4675063192844391, "learning_rate": 9.965493720484698e-07, "loss": 0.4009154438972473, "step": 71 }, { "epoch": 2.0, "grad_norm": 0.5548242926597595, "learning_rate": 9.964516155915151e-07, "loss": 0.39267462491989136, "step": 72 }, { "epoch": 2.028169014084507, "grad_norm": 0.4601926803588867, "learning_rate": 9.963524991655133e-07, "loss": 0.3973795473575592, "step": 73 }, { "epoch": 2.056338028169014, "grad_norm": 0.4464695155620575, "learning_rate": 9.962520230723906e-07, "loss": 0.39020174741744995, "step": 74 }, { "epoch": 2.084507042253521, "grad_norm": 0.42715415358543396, "learning_rate": 9.961501876182148e-07, "loss": 0.3930002450942993, "step": 75 }, { "epoch": 2.112676056338028, "grad_norm": 0.3989242613315582, "learning_rate": 9.960469931131936e-07, "loss": 0.3865053653717041, "step": 76 }, { "epoch": 2.140845070422535, "grad_norm": 0.4167341887950897, "learning_rate": 9.959424398716763e-07, "loss": 0.39777663350105286, "step": 77 }, { "epoch": 2.169014084507042, "grad_norm": 0.4046856760978699, "learning_rate": 9.958365282121496e-07, "loss": 0.38023141026496887, "step": 78 }, { "epoch": 2.1971830985915495, "grad_norm": 0.40858548879623413, "learning_rate": 9.95729258457239e-07, "loss": 0.37487876415252686, "step": 79 }, { "epoch": 2.2253521126760565, "grad_norm": 0.3576146364212036, "learning_rate": 9.956206309337066e-07, "loss": 0.3785707354545593, "step": 80 }, { "epoch": 2.2535211267605635, "grad_norm": 0.35235047340393066, "learning_rate": 9.955106459724508e-07, "loss": 0.38552170991897583, "step": 81 }, { "epoch": 2.2816901408450705, "grad_norm": 0.373362272977829, "learning_rate": 9.953993039085048e-07, "loss": 0.38321995735168457, "step": 82 }, { "epoch": 2.3098591549295775, "grad_norm": 0.3574947416782379, "learning_rate": 9.952866050810363e-07, "loss": 0.37346434593200684, "step": 83 }, { "epoch": 2.3380281690140845, "grad_norm": 0.36156368255615234, "learning_rate": 9.951725498333448e-07, "loss": 0.382648229598999, "step": 84 }, { "epoch": 2.3661971830985915, "grad_norm": 0.3521256148815155, "learning_rate": 9.950571385128625e-07, "loss": 0.3722230792045593, "step": 85 }, { "epoch": 2.3943661971830985, "grad_norm": 0.3384946584701538, "learning_rate": 9.949403714711526e-07, "loss": 0.3648328185081482, "step": 86 }, { "epoch": 2.4225352112676055, "grad_norm": 0.34228095412254333, "learning_rate": 9.948222490639075e-07, "loss": 0.372160941362381, "step": 87 }, { "epoch": 2.4507042253521125, "grad_norm": 0.34330716729164124, "learning_rate": 9.947027716509488e-07, "loss": 0.36588054895401, "step": 88 }, { "epoch": 2.4788732394366195, "grad_norm": 0.34555092453956604, "learning_rate": 9.94581939596225e-07, "loss": 0.38422292470932007, "step": 89 }, { "epoch": 2.507042253521127, "grad_norm": 0.34432411193847656, "learning_rate": 9.944597532678119e-07, "loss": 0.3802357316017151, "step": 90 }, { "epoch": 2.535211267605634, "grad_norm": 0.35508641600608826, "learning_rate": 9.943362130379101e-07, "loss": 0.37436896562576294, "step": 91 }, { "epoch": 2.563380281690141, "grad_norm": 0.3540443181991577, "learning_rate": 9.942113192828444e-07, "loss": 0.39830613136291504, "step": 92 }, { "epoch": 2.591549295774648, "grad_norm": 0.3429860472679138, "learning_rate": 9.940850723830632e-07, "loss": 0.38153308629989624, "step": 93 }, { "epoch": 2.619718309859155, "grad_norm": 0.3220756947994232, "learning_rate": 9.939574727231362e-07, "loss": 0.36020469665527344, "step": 94 }, { "epoch": 2.647887323943662, "grad_norm": 0.3417351245880127, "learning_rate": 9.93828520691754e-07, "loss": 0.38868680596351624, "step": 95 }, { "epoch": 2.676056338028169, "grad_norm": 0.3259858191013336, "learning_rate": 9.93698216681727e-07, "loss": 0.37741273641586304, "step": 96 }, { "epoch": 2.704225352112676, "grad_norm": 0.33722448348999023, "learning_rate": 9.93566561089984e-07, "loss": 0.3821848928928375, "step": 97 }, { "epoch": 2.732394366197183, "grad_norm": 0.31846100091934204, "learning_rate": 9.934335543175705e-07, "loss": 0.3690311014652252, "step": 98 }, { "epoch": 2.76056338028169, "grad_norm": 0.34040549397468567, "learning_rate": 9.932991967696482e-07, "loss": 0.3875328600406647, "step": 99 }, { "epoch": 2.788732394366197, "grad_norm": 0.3258971571922302, "learning_rate": 9.931634888554935e-07, "loss": 0.3811268210411072, "step": 100 }, { "epoch": 2.816901408450704, "grad_norm": 0.32806867361068726, "learning_rate": 9.930264309884964e-07, "loss": 0.3713844418525696, "step": 101 }, { "epoch": 2.845070422535211, "grad_norm": 0.3252440094947815, "learning_rate": 9.928880235861588e-07, "loss": 0.3812159299850464, "step": 102 }, { "epoch": 2.873239436619718, "grad_norm": 0.33440181612968445, "learning_rate": 9.927482670700936e-07, "loss": 0.37723666429519653, "step": 103 }, { "epoch": 2.9014084507042255, "grad_norm": 0.3046083152294159, "learning_rate": 9.926071618660237e-07, "loss": 0.3681407868862152, "step": 104 }, { "epoch": 2.9295774647887325, "grad_norm": 0.3097338378429413, "learning_rate": 9.924647084037797e-07, "loss": 0.3724687099456787, "step": 105 }, { "epoch": 2.9577464788732395, "grad_norm": 0.32305970788002014, "learning_rate": 9.923209071172994e-07, "loss": 0.3641166090965271, "step": 106 }, { "epoch": 2.9859154929577465, "grad_norm": 0.32677826285362244, "learning_rate": 9.921757584446268e-07, "loss": 0.36330974102020264, "step": 107 }, { "epoch": 3.0, "grad_norm": 0.4263511896133423, "learning_rate": 9.9202926282791e-07, "loss": 0.35592788457870483, "step": 108 }, { "epoch": 3.028169014084507, "grad_norm": 0.2994212508201599, "learning_rate": 9.918814207133997e-07, "loss": 0.3603532314300537, "step": 109 }, { "epoch": 3.056338028169014, "grad_norm": 0.30977630615234375, "learning_rate": 9.917322325514487e-07, "loss": 0.374819278717041, "step": 110 }, { "epoch": 3.084507042253521, "grad_norm": 0.31614792346954346, "learning_rate": 9.915816987965102e-07, "loss": 0.3680700957775116, "step": 111 }, { "epoch": 3.112676056338028, "grad_norm": 0.30458712577819824, "learning_rate": 9.91429819907136e-07, "loss": 0.3753468692302704, "step": 112 }, { "epoch": 3.140845070422535, "grad_norm": 0.30280736088752747, "learning_rate": 9.912765963459756e-07, "loss": 0.3559075593948364, "step": 113 }, { "epoch": 3.169014084507042, "grad_norm": 0.3088322579860687, "learning_rate": 9.911220285797748e-07, "loss": 0.36761462688446045, "step": 114 }, { "epoch": 3.1971830985915495, "grad_norm": 0.3007463216781616, "learning_rate": 9.909661170793733e-07, "loss": 0.3572486340999603, "step": 115 }, { "epoch": 3.2253521126760565, "grad_norm": 0.29317507147789, "learning_rate": 9.908088623197048e-07, "loss": 0.37356066703796387, "step": 116 }, { "epoch": 3.2535211267605635, "grad_norm": 0.30190175771713257, "learning_rate": 9.906502647797945e-07, "loss": 0.3747510015964508, "step": 117 }, { "epoch": 3.2816901408450705, "grad_norm": 0.300547331571579, "learning_rate": 9.904903249427582e-07, "loss": 0.3723798096179962, "step": 118 }, { "epoch": 3.3098591549295775, "grad_norm": 0.2943092882633209, "learning_rate": 9.903290432958003e-07, "loss": 0.3614634573459625, "step": 119 }, { "epoch": 3.3380281690140845, "grad_norm": 0.2933284342288971, "learning_rate": 9.901664203302124e-07, "loss": 0.34804195165634155, "step": 120 }, { "epoch": 3.3661971830985915, "grad_norm": 0.2936899662017822, "learning_rate": 9.900024565413727e-07, "loss": 0.3482627272605896, "step": 121 }, { "epoch": 3.3943661971830985, "grad_norm": 0.2972092628479004, "learning_rate": 9.89837152428743e-07, "loss": 0.35861676931381226, "step": 122 }, { "epoch": 3.4225352112676055, "grad_norm": 0.296779602766037, "learning_rate": 9.896705084958687e-07, "loss": 0.37210696935653687, "step": 123 }, { "epoch": 3.4507042253521125, "grad_norm": 0.2911286950111389, "learning_rate": 9.895025252503755e-07, "loss": 0.33883392810821533, "step": 124 }, { "epoch": 3.4788732394366195, "grad_norm": 0.29729408025741577, "learning_rate": 9.8933320320397e-07, "loss": 0.3569541573524475, "step": 125 }, { "epoch": 3.507042253521127, "grad_norm": 0.29103100299835205, "learning_rate": 9.891625428724364e-07, "loss": 0.36078906059265137, "step": 126 }, { "epoch": 3.535211267605634, "grad_norm": 0.2976583242416382, "learning_rate": 9.889905447756355e-07, "loss": 0.3531530499458313, "step": 127 }, { "epoch": 3.563380281690141, "grad_norm": 0.3033563196659088, "learning_rate": 9.888172094375033e-07, "loss": 0.37008020281791687, "step": 128 }, { "epoch": 3.591549295774648, "grad_norm": 0.30928340554237366, "learning_rate": 9.886425373860496e-07, "loss": 0.3652263283729553, "step": 129 }, { "epoch": 3.619718309859155, "grad_norm": 0.3299793601036072, "learning_rate": 9.88466529153356e-07, "loss": 0.36931300163269043, "step": 130 }, { "epoch": 3.647887323943662, "grad_norm": 0.29216262698173523, "learning_rate": 9.882891852755732e-07, "loss": 0.3560551404953003, "step": 131 }, { "epoch": 3.676056338028169, "grad_norm": 0.3086439371109009, "learning_rate": 9.881105062929221e-07, "loss": 0.3592608869075775, "step": 132 }, { "epoch": 3.704225352112676, "grad_norm": 0.3008037805557251, "learning_rate": 9.879304927496896e-07, "loss": 0.35765546560287476, "step": 133 }, { "epoch": 3.732394366197183, "grad_norm": 0.3011510968208313, "learning_rate": 9.877491451942284e-07, "loss": 0.35755690932273865, "step": 134 }, { "epoch": 3.76056338028169, "grad_norm": 0.28508952260017395, "learning_rate": 9.875664641789543e-07, "loss": 0.3475223183631897, "step": 135 }, { "epoch": 3.788732394366197, "grad_norm": 0.29807090759277344, "learning_rate": 9.873824502603459e-07, "loss": 0.3468858003616333, "step": 136 }, { "epoch": 3.816901408450704, "grad_norm": 0.30015671253204346, "learning_rate": 9.871971039989407e-07, "loss": 0.3525606393814087, "step": 137 }, { "epoch": 3.845070422535211, "grad_norm": 0.2894802689552307, "learning_rate": 9.870104259593362e-07, "loss": 0.35189589858055115, "step": 138 }, { "epoch": 3.873239436619718, "grad_norm": 0.2956956624984741, "learning_rate": 9.86822416710186e-07, "loss": 0.3662959337234497, "step": 139 }, { "epoch": 3.9014084507042255, "grad_norm": 0.28614693880081177, "learning_rate": 9.866330768241983e-07, "loss": 0.3523305654525757, "step": 140 }, { "epoch": 3.9295774647887325, "grad_norm": 0.3109326958656311, "learning_rate": 9.86442406878136e-07, "loss": 0.3661171495914459, "step": 141 }, { "epoch": 3.9577464788732395, "grad_norm": 0.29977917671203613, "learning_rate": 9.862504074528126e-07, "loss": 0.3687261939048767, "step": 142 }, { "epoch": 3.9859154929577465, "grad_norm": 0.2874816954135895, "learning_rate": 9.860570791330911e-07, "loss": 0.35026735067367554, "step": 143 }, { "epoch": 4.0, "grad_norm": 0.39478132128715515, "learning_rate": 9.85862422507884e-07, "loss": 0.329179584980011, "step": 144 }, { "epoch": 4.028169014084507, "grad_norm": 0.29594185948371887, "learning_rate": 9.856664381701483e-07, "loss": 0.34915629029273987, "step": 145 }, { "epoch": 4.056338028169014, "grad_norm": 0.2942439615726471, "learning_rate": 9.854691267168871e-07, "loss": 0.3501034080982208, "step": 146 }, { "epoch": 4.084507042253521, "grad_norm": 0.3186146318912506, "learning_rate": 9.852704887491445e-07, "loss": 0.3498520255088806, "step": 147 }, { "epoch": 4.112676056338028, "grad_norm": 0.2865906059741974, "learning_rate": 9.850705248720068e-07, "loss": 0.359851598739624, "step": 148 }, { "epoch": 4.140845070422535, "grad_norm": 0.2773308753967285, "learning_rate": 9.848692356945981e-07, "loss": 0.34519776701927185, "step": 149 }, { "epoch": 4.169014084507042, "grad_norm": 0.27520084381103516, "learning_rate": 9.846666218300807e-07, "loss": 0.3370436429977417, "step": 150 }, { "epoch": 4.197183098591549, "grad_norm": 0.31606534123420715, "learning_rate": 9.844626838956513e-07, "loss": 0.3660886287689209, "step": 151 }, { "epoch": 4.225352112676056, "grad_norm": 0.30757179856300354, "learning_rate": 9.8425742251254e-07, "loss": 0.3431619703769684, "step": 152 }, { "epoch": 4.253521126760563, "grad_norm": 0.2864473760128021, "learning_rate": 9.84050838306009e-07, "loss": 0.3478638231754303, "step": 153 }, { "epoch": 4.28169014084507, "grad_norm": 0.2924051880836487, "learning_rate": 9.838429319053495e-07, "loss": 0.3459091782569885, "step": 154 }, { "epoch": 4.309859154929577, "grad_norm": 0.2723977565765381, "learning_rate": 9.836337039438803e-07, "loss": 0.3437414765357971, "step": 155 }, { "epoch": 4.338028169014084, "grad_norm": 0.28301340341567993, "learning_rate": 9.83423155058946e-07, "loss": 0.351753830909729, "step": 156 }, { "epoch": 4.366197183098592, "grad_norm": 0.3007968068122864, "learning_rate": 9.832112858919155e-07, "loss": 0.3534032106399536, "step": 157 }, { "epoch": 4.394366197183099, "grad_norm": 0.2823623716831207, "learning_rate": 9.829980970881784e-07, "loss": 0.33871978521347046, "step": 158 }, { "epoch": 4.422535211267606, "grad_norm": 0.27985984086990356, "learning_rate": 9.82783589297145e-07, "loss": 0.35134732723236084, "step": 159 }, { "epoch": 4.450704225352113, "grad_norm": 0.29764989018440247, "learning_rate": 9.825677631722435e-07, "loss": 0.35344886779785156, "step": 160 }, { "epoch": 4.47887323943662, "grad_norm": 0.2861703634262085, "learning_rate": 9.823506193709174e-07, "loss": 0.3553098440170288, "step": 161 }, { "epoch": 4.507042253521127, "grad_norm": 0.3005011975765228, "learning_rate": 9.821321585546243e-07, "loss": 0.349773645401001, "step": 162 }, { "epoch": 4.535211267605634, "grad_norm": 0.28691744804382324, "learning_rate": 9.81912381388834e-07, "loss": 0.3327012360095978, "step": 163 }, { "epoch": 4.563380281690141, "grad_norm": 0.3060745298862457, "learning_rate": 9.816912885430258e-07, "loss": 0.3464226722717285, "step": 164 }, { "epoch": 4.591549295774648, "grad_norm": 0.3035100996494293, "learning_rate": 9.814688806906868e-07, "loss": 0.3499942719936371, "step": 165 }, { "epoch": 4.619718309859155, "grad_norm": 0.3114430606365204, "learning_rate": 9.812451585093098e-07, "loss": 0.3396627604961395, "step": 166 }, { "epoch": 4.647887323943662, "grad_norm": 0.30142080783843994, "learning_rate": 9.810201226803917e-07, "loss": 0.3466919958591461, "step": 167 }, { "epoch": 4.676056338028169, "grad_norm": 0.2819617986679077, "learning_rate": 9.807937738894303e-07, "loss": 0.34856730699539185, "step": 168 }, { "epoch": 4.704225352112676, "grad_norm": 0.29183247685432434, "learning_rate": 9.805661128259235e-07, "loss": 0.3437175750732422, "step": 169 }, { "epoch": 4.732394366197183, "grad_norm": 0.29465699195861816, "learning_rate": 9.80337140183366e-07, "loss": 0.3438083827495575, "step": 170 }, { "epoch": 4.76056338028169, "grad_norm": 0.28720420598983765, "learning_rate": 9.801068566592483e-07, "loss": 0.3422589898109436, "step": 171 }, { "epoch": 4.788732394366197, "grad_norm": 0.2751031816005707, "learning_rate": 9.798752629550546e-07, "loss": 0.3460365831851959, "step": 172 }, { "epoch": 4.816901408450704, "grad_norm": 0.2868765592575073, "learning_rate": 9.796423597762588e-07, "loss": 0.3391006886959076, "step": 173 }, { "epoch": 4.845070422535211, "grad_norm": 0.2844865024089813, "learning_rate": 9.794081478323245e-07, "loss": 0.3488645851612091, "step": 174 }, { "epoch": 4.873239436619718, "grad_norm": 0.28600648045539856, "learning_rate": 9.791726278367021e-07, "loss": 0.3440667986869812, "step": 175 }, { "epoch": 4.901408450704225, "grad_norm": 0.29167741537094116, "learning_rate": 9.78935800506826e-07, "loss": 0.34016746282577515, "step": 176 }, { "epoch": 4.929577464788732, "grad_norm": 0.29203853011131287, "learning_rate": 9.786976665641138e-07, "loss": 0.33034777641296387, "step": 177 }, { "epoch": 4.957746478873239, "grad_norm": 0.29975563287734985, "learning_rate": 9.784582267339622e-07, "loss": 0.34664660692214966, "step": 178 }, { "epoch": 4.985915492957746, "grad_norm": 0.2778502106666565, "learning_rate": 9.78217481745747e-07, "loss": 0.34249287843704224, "step": 179 }, { "epoch": 5.0, "grad_norm": 0.396133691072464, "learning_rate": 9.779754323328192e-07, "loss": 0.34673285484313965, "step": 180 }, { "epoch": 5.028169014084507, "grad_norm": 0.29174622893333435, "learning_rate": 9.777320792325025e-07, "loss": 0.3266841173171997, "step": 181 }, { "epoch": 5.056338028169014, "grad_norm": 0.28281646966934204, "learning_rate": 9.774874231860935e-07, "loss": 0.3295621871948242, "step": 182 }, { "epoch": 5.084507042253521, "grad_norm": 0.2767295837402344, "learning_rate": 9.772414649388568e-07, "loss": 0.3460637629032135, "step": 183 }, { "epoch": 5.112676056338028, "grad_norm": 0.28246212005615234, "learning_rate": 9.769942052400235e-07, "loss": 0.3325508236885071, "step": 184 }, { "epoch": 5.140845070422535, "grad_norm": 0.31317514181137085, "learning_rate": 9.767456448427896e-07, "loss": 0.3373739719390869, "step": 185 }, { "epoch": 5.169014084507042, "grad_norm": 0.29388973116874695, "learning_rate": 9.764957845043135e-07, "loss": 0.3335680365562439, "step": 186 }, { "epoch": 5.197183098591549, "grad_norm": 0.3093099892139435, "learning_rate": 9.76244624985713e-07, "loss": 0.3288199007511139, "step": 187 }, { "epoch": 5.225352112676056, "grad_norm": 0.2718607187271118, "learning_rate": 9.759921670520634e-07, "loss": 0.33789312839508057, "step": 188 }, { "epoch": 5.253521126760563, "grad_norm": 0.3087296485900879, "learning_rate": 9.757384114723953e-07, "loss": 0.3482661843299866, "step": 189 }, { "epoch": 5.28169014084507, "grad_norm": 0.2887554466724396, "learning_rate": 9.754833590196926e-07, "loss": 0.3353871703147888, "step": 190 }, { "epoch": 5.309859154929577, "grad_norm": 0.2770691514015198, "learning_rate": 9.752270104708888e-07, "loss": 0.33239609003067017, "step": 191 }, { "epoch": 5.338028169014084, "grad_norm": 0.29489442706108093, "learning_rate": 9.749693666068663e-07, "loss": 0.34318211674690247, "step": 192 }, { "epoch": 5.366197183098592, "grad_norm": 0.31870850920677185, "learning_rate": 9.747104282124531e-07, "loss": 0.33540403842926025, "step": 193 }, { "epoch": 5.394366197183099, "grad_norm": 0.27267521619796753, "learning_rate": 9.744501960764203e-07, "loss": 0.33416521549224854, "step": 194 }, { "epoch": 5.422535211267606, "grad_norm": 0.284470796585083, "learning_rate": 9.741886709914803e-07, "loss": 0.3242385685443878, "step": 195 }, { "epoch": 5.450704225352113, "grad_norm": 0.2988561689853668, "learning_rate": 9.739258537542835e-07, "loss": 0.3325580656528473, "step": 196 }, { "epoch": 5.47887323943662, "grad_norm": 0.29107666015625, "learning_rate": 9.73661745165417e-07, "loss": 0.34368401765823364, "step": 197 }, { "epoch": 5.507042253521127, "grad_norm": 0.289497047662735, "learning_rate": 9.733963460294015e-07, "loss": 0.33908677101135254, "step": 198 }, { "epoch": 5.535211267605634, "grad_norm": 0.27910080552101135, "learning_rate": 9.731296571546885e-07, "loss": 0.3478449285030365, "step": 199 }, { "epoch": 5.563380281690141, "grad_norm": 0.2966774106025696, "learning_rate": 9.728616793536587e-07, "loss": 0.3371037244796753, "step": 200 }, { "epoch": 5.591549295774648, "grad_norm": 0.30997180938720703, "learning_rate": 9.72592413442619e-07, "loss": 0.3469342589378357, "step": 201 }, { "epoch": 5.619718309859155, "grad_norm": 0.2851829528808594, "learning_rate": 9.723218602418e-07, "loss": 0.3497530221939087, "step": 202 }, { "epoch": 5.647887323943662, "grad_norm": 0.29238471388816833, "learning_rate": 9.720500205753538e-07, "loss": 0.3286020755767822, "step": 203 }, { "epoch": 5.676056338028169, "grad_norm": 0.2877226769924164, "learning_rate": 9.717768952713511e-07, "loss": 0.338655948638916, "step": 204 }, { "epoch": 5.704225352112676, "grad_norm": 0.28834086656570435, "learning_rate": 9.71502485161779e-07, "loss": 0.333360880613327, "step": 205 }, { "epoch": 5.732394366197183, "grad_norm": 0.28225836157798767, "learning_rate": 9.71226791082538e-07, "loss": 0.3514789640903473, "step": 206 }, { "epoch": 5.76056338028169, "grad_norm": 0.28878796100616455, "learning_rate": 9.709498138734403e-07, "loss": 0.3271612524986267, "step": 207 }, { "epoch": 5.788732394366197, "grad_norm": 0.29221564531326294, "learning_rate": 9.706715543782064e-07, "loss": 0.32984620332717896, "step": 208 }, { "epoch": 5.816901408450704, "grad_norm": 0.31417179107666016, "learning_rate": 9.703920134444632e-07, "loss": 0.32708263397216797, "step": 209 }, { "epoch": 5.845070422535211, "grad_norm": 0.30656933784484863, "learning_rate": 9.701111919237408e-07, "loss": 0.3378485143184662, "step": 210 }, { "epoch": 5.873239436619718, "grad_norm": 0.28274714946746826, "learning_rate": 9.698290906714702e-07, "loss": 0.3210570812225342, "step": 211 }, { "epoch": 5.901408450704225, "grad_norm": 0.28694605827331543, "learning_rate": 9.695457105469804e-07, "loss": 0.33672863245010376, "step": 212 }, { "epoch": 5.929577464788732, "grad_norm": 0.2965106666088104, "learning_rate": 9.69261052413497e-07, "loss": 0.34379851818084717, "step": 213 }, { "epoch": 5.957746478873239, "grad_norm": 0.3144500255584717, "learning_rate": 9.689751171381377e-07, "loss": 0.33530962467193604, "step": 214 }, { "epoch": 5.985915492957746, "grad_norm": 0.274070680141449, "learning_rate": 9.68687905591911e-07, "loss": 0.32609909772872925, "step": 215 }, { "epoch": 6.0, "grad_norm": 0.3976318836212158, "learning_rate": 9.683994186497132e-07, "loss": 0.3320915997028351, "step": 216 }, { "epoch": 6.028169014084507, "grad_norm": 0.27306580543518066, "learning_rate": 9.681096571903252e-07, "loss": 0.32757407426834106, "step": 217 }, { "epoch": 6.056338028169014, "grad_norm": 0.2815074622631073, "learning_rate": 9.67818622096411e-07, "loss": 0.31570878624916077, "step": 218 }, { "epoch": 6.084507042253521, "grad_norm": 0.29271578788757324, "learning_rate": 9.67526314254514e-07, "loss": 0.33092743158340454, "step": 219 }, { "epoch": 6.112676056338028, "grad_norm": 0.2819676399230957, "learning_rate": 9.672327345550543e-07, "loss": 0.32412028312683105, "step": 220 }, { "epoch": 6.140845070422535, "grad_norm": 0.29121264815330505, "learning_rate": 9.669378838923267e-07, "loss": 0.324832558631897, "step": 221 }, { "epoch": 6.169014084507042, "grad_norm": 0.28991273045539856, "learning_rate": 9.666417631644976e-07, "loss": 0.3393062949180603, "step": 222 }, { "epoch": 6.197183098591549, "grad_norm": 0.28072309494018555, "learning_rate": 9.66344373273602e-07, "loss": 0.32950296998023987, "step": 223 }, { "epoch": 6.225352112676056, "grad_norm": 0.3102487027645111, "learning_rate": 9.66045715125541e-07, "loss": 0.3289036154747009, "step": 224 }, { "epoch": 6.253521126760563, "grad_norm": 0.2856598198413849, "learning_rate": 9.657457896300791e-07, "loss": 0.30844709277153015, "step": 225 }, { "epoch": 6.28169014084507, "grad_norm": 0.28150248527526855, "learning_rate": 9.654445977008414e-07, "loss": 0.32252323627471924, "step": 226 }, { "epoch": 6.309859154929577, "grad_norm": 0.3106309175491333, "learning_rate": 9.651421402553108e-07, "loss": 0.3153507113456726, "step": 227 }, { "epoch": 6.338028169014084, "grad_norm": 0.3323248028755188, "learning_rate": 9.648384182148252e-07, "loss": 0.3372737169265747, "step": 228 }, { "epoch": 6.366197183098592, "grad_norm": 0.2816256880760193, "learning_rate": 9.645334325045745e-07, "loss": 0.3402503728866577, "step": 229 }, { "epoch": 6.394366197183099, "grad_norm": 0.28511133790016174, "learning_rate": 9.64227184053598e-07, "loss": 0.3433256149291992, "step": 230 }, { "epoch": 6.422535211267606, "grad_norm": 0.27890780568122864, "learning_rate": 9.63919673794782e-07, "loss": 0.3293980658054352, "step": 231 }, { "epoch": 6.450704225352113, "grad_norm": 0.29692021012306213, "learning_rate": 9.636109026648554e-07, "loss": 0.3282950818538666, "step": 232 }, { "epoch": 6.47887323943662, "grad_norm": 0.2867494523525238, "learning_rate": 9.633008716043892e-07, "loss": 0.3350924253463745, "step": 233 }, { "epoch": 6.507042253521127, "grad_norm": 0.27419739961624146, "learning_rate": 9.629895815577915e-07, "loss": 0.33370357751846313, "step": 234 }, { "epoch": 6.535211267605634, "grad_norm": 0.2837441563606262, "learning_rate": 9.626770334733058e-07, "loss": 0.3225363790988922, "step": 235 }, { "epoch": 6.563380281690141, "grad_norm": 0.28063684701919556, "learning_rate": 9.623632283030077e-07, "loss": 0.33922791481018066, "step": 236 }, { "epoch": 6.591549295774648, "grad_norm": 0.2789226770401001, "learning_rate": 9.620481670028026e-07, "loss": 0.3289903998374939, "step": 237 }, { "epoch": 6.619718309859155, "grad_norm": 0.2788150906562805, "learning_rate": 9.617318505324212e-07, "loss": 0.3213944435119629, "step": 238 }, { "epoch": 6.647887323943662, "grad_norm": 0.2622866928577423, "learning_rate": 9.614142798554186e-07, "loss": 0.3391764461994171, "step": 239 }, { "epoch": 6.676056338028169, "grad_norm": 0.2952481806278229, "learning_rate": 9.610954559391704e-07, "loss": 0.31737983226776123, "step": 240 }, { "epoch": 6.704225352112676, "grad_norm": 0.28387367725372314, "learning_rate": 9.607753797548691e-07, "loss": 0.33009767532348633, "step": 241 }, { "epoch": 6.732394366197183, "grad_norm": 0.28222769498825073, "learning_rate": 9.604540522775227e-07, "loss": 0.3226430416107178, "step": 242 }, { "epoch": 6.76056338028169, "grad_norm": 0.2985075116157532, "learning_rate": 9.601314744859504e-07, "loss": 0.3328002393245697, "step": 243 }, { "epoch": 6.788732394366197, "grad_norm": 0.2787352204322815, "learning_rate": 9.598076473627796e-07, "loss": 0.3292522728443146, "step": 244 }, { "epoch": 6.816901408450704, "grad_norm": 0.2772713899612427, "learning_rate": 9.594825718944444e-07, "loss": 0.322078138589859, "step": 245 }, { "epoch": 6.845070422535211, "grad_norm": 0.28727421164512634, "learning_rate": 9.59156249071181e-07, "loss": 0.3206414580345154, "step": 246 }, { "epoch": 6.873239436619718, "grad_norm": 0.28722915053367615, "learning_rate": 9.588286798870248e-07, "loss": 0.34071967005729675, "step": 247 }, { "epoch": 6.901408450704225, "grad_norm": 0.2791661322116852, "learning_rate": 9.58499865339809e-07, "loss": 0.32371699810028076, "step": 248 }, { "epoch": 6.929577464788732, "grad_norm": 0.30174046754837036, "learning_rate": 9.581698064311592e-07, "loss": 0.32212015986442566, "step": 249 }, { "epoch": 6.957746478873239, "grad_norm": 0.2757203280925751, "learning_rate": 9.578385041664925e-07, "loss": 0.3286738395690918, "step": 250 }, { "epoch": 6.985915492957746, "grad_norm": 0.2977890968322754, "learning_rate": 9.575059595550127e-07, "loss": 0.32400673627853394, "step": 251 }, { "epoch": 7.0, "grad_norm": 0.38676717877388, "learning_rate": 9.571721736097088e-07, "loss": 0.31549203395843506, "step": 252 }, { "epoch": 7.028169014084507, "grad_norm": 0.28209057450294495, "learning_rate": 9.568371473473503e-07, "loss": 0.3403396010398865, "step": 253 }, { "epoch": 7.056338028169014, "grad_norm": 0.28578808903694153, "learning_rate": 9.565008817884854e-07, "loss": 0.32727712392807007, "step": 254 }, { "epoch": 7.084507042253521, "grad_norm": 0.2921590805053711, "learning_rate": 9.561633779574372e-07, "loss": 0.33234310150146484, "step": 255 }, { "epoch": 7.112676056338028, "grad_norm": 0.27242740988731384, "learning_rate": 9.55824636882301e-07, "loss": 0.3204275965690613, "step": 256 }, { "epoch": 7.140845070422535, "grad_norm": 0.28681573271751404, "learning_rate": 9.554846595949413e-07, "loss": 0.3127729594707489, "step": 257 }, { "epoch": 7.169014084507042, "grad_norm": 0.27501875162124634, "learning_rate": 9.55143447130987e-07, "loss": 0.3219028115272522, "step": 258 }, { "epoch": 7.197183098591549, "grad_norm": 0.2893284261226654, "learning_rate": 9.54801000529831e-07, "loss": 0.3149603009223938, "step": 259 }, { "epoch": 7.225352112676056, "grad_norm": 0.29977115988731384, "learning_rate": 9.54457320834625e-07, "loss": 0.3116862177848816, "step": 260 }, { "epoch": 7.253521126760563, "grad_norm": 0.2911919355392456, "learning_rate": 9.54112409092277e-07, "loss": 0.3377895653247833, "step": 261 }, { "epoch": 7.28169014084507, "grad_norm": 0.32472458481788635, "learning_rate": 9.537662663534477e-07, "loss": 0.3152693510055542, "step": 262 }, { "epoch": 7.309859154929577, "grad_norm": 0.2667696177959442, "learning_rate": 9.534188936725483e-07, "loss": 0.3181629180908203, "step": 263 }, { "epoch": 7.338028169014084, "grad_norm": 0.29469212889671326, "learning_rate": 9.530702921077358e-07, "loss": 0.32251378893852234, "step": 264 }, { "epoch": 7.366197183098592, "grad_norm": 0.2710505425930023, "learning_rate": 9.527204627209112e-07, "loss": 0.3157137632369995, "step": 265 }, { "epoch": 7.394366197183099, "grad_norm": 0.29605209827423096, "learning_rate": 9.523694065777156e-07, "loss": 0.32492029666900635, "step": 266 }, { "epoch": 7.422535211267606, "grad_norm": 0.28292831778526306, "learning_rate": 9.520171247475268e-07, "loss": 0.3182477653026581, "step": 267 }, { "epoch": 7.450704225352113, "grad_norm": 0.28567084670066833, "learning_rate": 9.516636183034564e-07, "loss": 0.317740797996521, "step": 268 }, { "epoch": 7.47887323943662, "grad_norm": 0.26249128580093384, "learning_rate": 9.513088883223463e-07, "loss": 0.3064804971218109, "step": 269 }, { "epoch": 7.507042253521127, "grad_norm": 0.2805914878845215, "learning_rate": 9.509529358847654e-07, "loss": 0.32089754939079285, "step": 270 }, { "epoch": 7.535211267605634, "grad_norm": 0.2892814874649048, "learning_rate": 9.505957620750069e-07, "loss": 0.31203514337539673, "step": 271 }, { "epoch": 7.563380281690141, "grad_norm": 0.2809925079345703, "learning_rate": 9.502373679810839e-07, "loss": 0.3222312331199646, "step": 272 }, { "epoch": 7.591549295774648, "grad_norm": 0.2793818414211273, "learning_rate": 9.49877754694727e-07, "loss": 0.30804064869880676, "step": 273 }, { "epoch": 7.619718309859155, "grad_norm": 0.27966272830963135, "learning_rate": 9.495169233113806e-07, "loss": 0.32768452167510986, "step": 274 }, { "epoch": 7.647887323943662, "grad_norm": 0.2743930220603943, "learning_rate": 9.491548749301997e-07, "loss": 0.3242339491844177, "step": 275 }, { "epoch": 7.676056338028169, "grad_norm": 0.2765263319015503, "learning_rate": 9.487916106540465e-07, "loss": 0.3245530128479004, "step": 276 }, { "epoch": 7.704225352112676, "grad_norm": 0.29381853342056274, "learning_rate": 9.484271315894871e-07, "loss": 0.32187986373901367, "step": 277 }, { "epoch": 7.732394366197183, "grad_norm": 0.27294641733169556, "learning_rate": 9.480614388467877e-07, "loss": 0.3233500123023987, "step": 278 }, { "epoch": 7.76056338028169, "grad_norm": 0.28944891691207886, "learning_rate": 9.47694533539912e-07, "loss": 0.31809201836586, "step": 279 }, { "epoch": 7.788732394366197, "grad_norm": 0.2922861576080322, "learning_rate": 9.473264167865171e-07, "loss": 0.33151817321777344, "step": 280 }, { "epoch": 7.816901408450704, "grad_norm": 0.2928006649017334, "learning_rate": 9.469570897079504e-07, "loss": 0.3220402002334595, "step": 281 }, { "epoch": 7.845070422535211, "grad_norm": 0.28323814272880554, "learning_rate": 9.465865534292464e-07, "loss": 0.31611043214797974, "step": 282 }, { "epoch": 7.873239436619718, "grad_norm": 0.28506791591644287, "learning_rate": 9.462148090791228e-07, "loss": 0.32090169191360474, "step": 283 }, { "epoch": 7.901408450704225, "grad_norm": 0.2799360156059265, "learning_rate": 9.458418577899774e-07, "loss": 0.344720721244812, "step": 284 }, { "epoch": 7.929577464788732, "grad_norm": 0.27799472212791443, "learning_rate": 9.454677006978842e-07, "loss": 0.3141616880893707, "step": 285 }, { "epoch": 7.957746478873239, "grad_norm": 0.27411341667175293, "learning_rate": 9.450923389425911e-07, "loss": 0.31020885705947876, "step": 286 }, { "epoch": 7.985915492957746, "grad_norm": 0.28921812772750854, "learning_rate": 9.44715773667515e-07, "loss": 0.3182592988014221, "step": 287 }, { "epoch": 8.0, "grad_norm": 0.3832477331161499, "learning_rate": 9.443380060197385e-07, "loss": 0.32039332389831543, "step": 288 }, { "epoch": 8.028169014084508, "grad_norm": 0.2698141932487488, "learning_rate": 9.43959037150008e-07, "loss": 0.3155902028083801, "step": 289 }, { "epoch": 8.056338028169014, "grad_norm": 0.2765481472015381, "learning_rate": 9.43578868212728e-07, "loss": 0.3177169859409332, "step": 290 }, { "epoch": 8.084507042253522, "grad_norm": 0.27723443508148193, "learning_rate": 9.431975003659594e-07, "loss": 0.31647437810897827, "step": 291 }, { "epoch": 8.112676056338028, "grad_norm": 0.26522088050842285, "learning_rate": 9.428149347714143e-07, "loss": 0.31819185614585876, "step": 292 }, { "epoch": 8.140845070422536, "grad_norm": 0.28780215978622437, "learning_rate": 9.424311725944543e-07, "loss": 0.31119635701179504, "step": 293 }, { "epoch": 8.169014084507042, "grad_norm": 0.2786031663417816, "learning_rate": 9.420462150040852e-07, "loss": 0.31440460681915283, "step": 294 }, { "epoch": 8.19718309859155, "grad_norm": 0.26644277572631836, "learning_rate": 9.416600631729548e-07, "loss": 0.32182344794273376, "step": 295 }, { "epoch": 8.225352112676056, "grad_norm": 0.2974756062030792, "learning_rate": 9.412727182773486e-07, "loss": 0.3225427269935608, "step": 296 }, { "epoch": 8.253521126760564, "grad_norm": 0.2951170802116394, "learning_rate": 9.408841814971861e-07, "loss": 0.31894785165786743, "step": 297 }, { "epoch": 8.28169014084507, "grad_norm": 0.28619688749313354, "learning_rate": 9.404944540160177e-07, "loss": 0.31788474321365356, "step": 298 }, { "epoch": 8.309859154929578, "grad_norm": 0.2877795398235321, "learning_rate": 9.401035370210212e-07, "loss": 0.3235325217247009, "step": 299 }, { "epoch": 8.338028169014084, "grad_norm": 0.30395635962486267, "learning_rate": 9.397114317029974e-07, "loss": 0.33284687995910645, "step": 300 }, { "epoch": 8.366197183098592, "grad_norm": 0.2896060347557068, "learning_rate": 9.393181392563669e-07, "loss": 0.32644715905189514, "step": 301 }, { "epoch": 8.394366197183098, "grad_norm": 0.2763223648071289, "learning_rate": 9.38923660879167e-07, "loss": 0.304126501083374, "step": 302 }, { "epoch": 8.422535211267606, "grad_norm": 0.2764940559864044, "learning_rate": 9.385279977730472e-07, "loss": 0.3124150037765503, "step": 303 }, { "epoch": 8.450704225352112, "grad_norm": 0.2838902771472931, "learning_rate": 9.381311511432658e-07, "loss": 0.32950958609580994, "step": 304 }, { "epoch": 8.47887323943662, "grad_norm": 0.2854890823364258, "learning_rate": 9.377331221986866e-07, "loss": 0.30994099378585815, "step": 305 }, { "epoch": 8.507042253521126, "grad_norm": 0.2682625353336334, "learning_rate": 9.373339121517746e-07, "loss": 0.31963592767715454, "step": 306 }, { "epoch": 8.535211267605634, "grad_norm": 0.2849690318107605, "learning_rate": 9.36933522218593e-07, "loss": 0.3182557225227356, "step": 307 }, { "epoch": 8.56338028169014, "grad_norm": 0.28616634011268616, "learning_rate": 9.36531953618799e-07, "loss": 0.30273881554603577, "step": 308 }, { "epoch": 8.591549295774648, "grad_norm": 0.2721138596534729, "learning_rate": 9.361292075756401e-07, "loss": 0.3207533657550812, "step": 309 }, { "epoch": 8.619718309859154, "grad_norm": 0.2752065360546112, "learning_rate": 9.357252853159505e-07, "loss": 0.3186470866203308, "step": 310 }, { "epoch": 8.647887323943662, "grad_norm": 0.2684236168861389, "learning_rate": 9.353201880701477e-07, "loss": 0.31932806968688965, "step": 311 }, { "epoch": 8.676056338028168, "grad_norm": 0.28039291501045227, "learning_rate": 9.34913917072228e-07, "loss": 0.31683626770973206, "step": 312 }, { "epoch": 8.704225352112676, "grad_norm": 0.2638692855834961, "learning_rate": 9.345064735597633e-07, "loss": 0.2991946339607239, "step": 313 }, { "epoch": 8.732394366197184, "grad_norm": 0.30425477027893066, "learning_rate": 9.340978587738972e-07, "loss": 0.3023770749568939, "step": 314 }, { "epoch": 8.76056338028169, "grad_norm": 0.27750107645988464, "learning_rate": 9.336880739593415e-07, "loss": 0.31177228689193726, "step": 315 }, { "epoch": 8.788732394366198, "grad_norm": 0.2731636166572571, "learning_rate": 9.332771203643714e-07, "loss": 0.3076733946800232, "step": 316 }, { "epoch": 8.816901408450704, "grad_norm": 0.2740687131881714, "learning_rate": 9.328649992408231e-07, "loss": 0.30277711153030396, "step": 317 }, { "epoch": 8.845070422535212, "grad_norm": 0.27956005930900574, "learning_rate": 9.324517118440888e-07, "loss": 0.30988752841949463, "step": 318 }, { "epoch": 8.873239436619718, "grad_norm": 0.28827622532844543, "learning_rate": 9.320372594331137e-07, "loss": 0.32537323236465454, "step": 319 }, { "epoch": 8.901408450704226, "grad_norm": 0.2771560847759247, "learning_rate": 9.316216432703916e-07, "loss": 0.3233356475830078, "step": 320 }, { "epoch": 8.929577464788732, "grad_norm": 0.2804992198944092, "learning_rate": 9.312048646219617e-07, "loss": 0.31110987067222595, "step": 321 }, { "epoch": 8.95774647887324, "grad_norm": 0.29048794507980347, "learning_rate": 9.307869247574038e-07, "loss": 0.3100625276565552, "step": 322 }, { "epoch": 8.985915492957746, "grad_norm": 0.2751557230949402, "learning_rate": 9.303678249498352e-07, "loss": 0.30283451080322266, "step": 323 }, { "epoch": 9.0, "grad_norm": 0.38358354568481445, "learning_rate": 9.299475664759068e-07, "loss": 0.3202640414237976, "step": 324 }, { "epoch": 9.028169014084508, "grad_norm": 0.26551520824432373, "learning_rate": 9.295261506157985e-07, "loss": 0.31331080198287964, "step": 325 }, { "epoch": 9.056338028169014, "grad_norm": 0.28371915221214294, "learning_rate": 9.291035786532163e-07, "loss": 0.3039785325527191, "step": 326 }, { "epoch": 9.084507042253522, "grad_norm": 0.28972727060317993, "learning_rate": 9.286798518753878e-07, "loss": 0.3172224462032318, "step": 327 }, { "epoch": 9.112676056338028, "grad_norm": 0.2863673269748688, "learning_rate": 9.282549715730579e-07, "loss": 0.3220033049583435, "step": 328 }, { "epoch": 9.140845070422536, "grad_norm": 0.27619102597236633, "learning_rate": 9.278289390404859e-07, "loss": 0.31595173478126526, "step": 329 }, { "epoch": 9.169014084507042, "grad_norm": 0.2838309705257416, "learning_rate": 9.274017555754407e-07, "loss": 0.31470271944999695, "step": 330 }, { "epoch": 9.19718309859155, "grad_norm": 0.28437867760658264, "learning_rate": 9.269734224791974e-07, "loss": 0.31371644139289856, "step": 331 }, { "epoch": 9.225352112676056, "grad_norm": 0.28935906291007996, "learning_rate": 9.265439410565328e-07, "loss": 0.3154122829437256, "step": 332 }, { "epoch": 9.253521126760564, "grad_norm": 0.28751862049102783, "learning_rate": 9.261133126157217e-07, "loss": 0.3072774410247803, "step": 333 }, { "epoch": 9.28169014084507, "grad_norm": 0.2829267680644989, "learning_rate": 9.256815384685328e-07, "loss": 0.30855560302734375, "step": 334 }, { "epoch": 9.309859154929578, "grad_norm": 0.28372108936309814, "learning_rate": 9.252486199302256e-07, "loss": 0.3047599792480469, "step": 335 }, { "epoch": 9.338028169014084, "grad_norm": 0.26949799060821533, "learning_rate": 9.248145583195447e-07, "loss": 0.3051632046699524, "step": 336 }, { "epoch": 9.366197183098592, "grad_norm": 0.26946741342544556, "learning_rate": 9.243793549587171e-07, "loss": 0.30776509642601013, "step": 337 }, { "epoch": 9.394366197183098, "grad_norm": 0.2829545736312866, "learning_rate": 9.239430111734476e-07, "loss": 0.30643659830093384, "step": 338 }, { "epoch": 9.422535211267606, "grad_norm": 0.30891162157058716, "learning_rate": 9.235055282929153e-07, "loss": 0.30099156498908997, "step": 339 }, { "epoch": 9.450704225352112, "grad_norm": 0.2820793390274048, "learning_rate": 9.230669076497687e-07, "loss": 0.31829434633255005, "step": 340 }, { "epoch": 9.47887323943662, "grad_norm": 0.27604445815086365, "learning_rate": 9.226271505801224e-07, "loss": 0.31647807359695435, "step": 341 }, { "epoch": 9.507042253521126, "grad_norm": 0.2793697714805603, "learning_rate": 9.221862584235526e-07, "loss": 0.30784907937049866, "step": 342 }, { "epoch": 9.535211267605634, "grad_norm": 0.27153849601745605, "learning_rate": 9.217442325230936e-07, "loss": 0.29595351219177246, "step": 343 }, { "epoch": 9.56338028169014, "grad_norm": 0.28174859285354614, "learning_rate": 9.213010742252327e-07, "loss": 0.3158809244632721, "step": 344 }, { "epoch": 9.591549295774648, "grad_norm": 0.27065321803092957, "learning_rate": 9.208567848799069e-07, "loss": 0.29831117391586304, "step": 345 }, { "epoch": 9.619718309859154, "grad_norm": 0.2704644799232483, "learning_rate": 9.204113658404989e-07, "loss": 0.31440460681915283, "step": 346 }, { "epoch": 9.647887323943662, "grad_norm": 0.2712800204753876, "learning_rate": 9.199648184638318e-07, "loss": 0.2985243499279022, "step": 347 }, { "epoch": 9.676056338028168, "grad_norm": 0.2808634042739868, "learning_rate": 9.195171441101668e-07, "loss": 0.3167741000652313, "step": 348 }, { "epoch": 9.704225352112676, "grad_norm": 0.27340877056121826, "learning_rate": 9.190683441431974e-07, "loss": 0.3019712269306183, "step": 349 }, { "epoch": 9.732394366197184, "grad_norm": 0.2813129723072052, "learning_rate": 9.186184199300463e-07, "loss": 0.3006363809108734, "step": 350 }, { "epoch": 9.76056338028169, "grad_norm": 0.28003188967704773, "learning_rate": 9.181673728412605e-07, "loss": 0.31190669536590576, "step": 351 }, { "epoch": 9.788732394366198, "grad_norm": 0.2703484892845154, "learning_rate": 9.177152042508077e-07, "loss": 0.3077196478843689, "step": 352 }, { "epoch": 9.816901408450704, "grad_norm": 0.2803649604320526, "learning_rate": 9.17261915536072e-07, "loss": 0.30905407667160034, "step": 353 }, { "epoch": 9.845070422535212, "grad_norm": 0.2884216606616974, "learning_rate": 9.168075080778494e-07, "loss": 0.30327335000038147, "step": 354 }, { "epoch": 9.873239436619718, "grad_norm": 0.2796288728713989, "learning_rate": 9.163519832603436e-07, "loss": 0.3104422390460968, "step": 355 }, { "epoch": 9.901408450704226, "grad_norm": 0.30282527208328247, "learning_rate": 9.158953424711624e-07, "loss": 0.3279035985469818, "step": 356 }, { "epoch": 9.929577464788732, "grad_norm": 0.2795606851577759, "learning_rate": 9.154375871013128e-07, "loss": 0.3136137127876282, "step": 357 }, { "epoch": 9.95774647887324, "grad_norm": 0.2871512174606323, "learning_rate": 9.149787185451969e-07, "loss": 0.3188316226005554, "step": 358 }, { "epoch": 9.985915492957746, "grad_norm": 0.2814459502696991, "learning_rate": 9.145187382006081e-07, "loss": 0.3084180951118469, "step": 359 }, { "epoch": 10.0, "grad_norm": 0.4135233461856842, "learning_rate": 9.140576474687263e-07, "loss": 0.32664716243743896, "step": 360 }, { "epoch": 10.028169014084508, "grad_norm": 0.2743515968322754, "learning_rate": 9.135954477541137e-07, "loss": 0.31237614154815674, "step": 361 }, { "epoch": 10.056338028169014, "grad_norm": 0.2790542244911194, "learning_rate": 9.131321404647109e-07, "loss": 0.32110899686813354, "step": 362 }, { "epoch": 10.084507042253522, "grad_norm": 0.32552531361579895, "learning_rate": 9.126677270118322e-07, "loss": 0.31540626287460327, "step": 363 }, { "epoch": 10.112676056338028, "grad_norm": 0.27251535654067993, "learning_rate": 9.122022088101613e-07, "loss": 0.2956544756889343, "step": 364 }, { "epoch": 10.140845070422536, "grad_norm": 0.3012971878051758, "learning_rate": 9.117355872777477e-07, "loss": 0.3012295961380005, "step": 365 }, { "epoch": 10.169014084507042, "grad_norm": 0.29038530588150024, "learning_rate": 9.112678638360015e-07, "loss": 0.2931394875049591, "step": 366 }, { "epoch": 10.19718309859155, "grad_norm": 0.2870721220970154, "learning_rate": 9.107990399096893e-07, "loss": 0.2930557131767273, "step": 367 }, { "epoch": 10.225352112676056, "grad_norm": 0.281965047121048, "learning_rate": 9.103291169269299e-07, "loss": 0.3096895217895508, "step": 368 }, { "epoch": 10.253521126760564, "grad_norm": 0.2720247209072113, "learning_rate": 9.098580963191907e-07, "loss": 0.302044540643692, "step": 369 }, { "epoch": 10.28169014084507, "grad_norm": 0.2841237783432007, "learning_rate": 9.093859795212817e-07, "loss": 0.32047468423843384, "step": 370 }, { "epoch": 10.309859154929578, "grad_norm": 0.29989898204803467, "learning_rate": 9.089127679713529e-07, "loss": 0.31085067987442017, "step": 371 }, { "epoch": 10.338028169014084, "grad_norm": 0.29164332151412964, "learning_rate": 9.084384631108882e-07, "loss": 0.3052881360054016, "step": 372 }, { "epoch": 10.366197183098592, "grad_norm": 0.2740509808063507, "learning_rate": 9.079630663847031e-07, "loss": 0.31468653678894043, "step": 373 }, { "epoch": 10.394366197183098, "grad_norm": 0.2791116535663605, "learning_rate": 9.074865792409381e-07, "loss": 0.30899161100387573, "step": 374 }, { "epoch": 10.422535211267606, "grad_norm": 0.30149030685424805, "learning_rate": 9.070090031310558e-07, "loss": 0.3094651997089386, "step": 375 }, { "epoch": 10.450704225352112, "grad_norm": 0.2970089018344879, "learning_rate": 9.065303395098358e-07, "loss": 0.3142540156841278, "step": 376 }, { "epoch": 10.47887323943662, "grad_norm": 0.2772645652294159, "learning_rate": 9.060505898353705e-07, "loss": 0.32443171739578247, "step": 377 }, { "epoch": 10.507042253521126, "grad_norm": 0.2707611620426178, "learning_rate": 9.055697555690607e-07, "loss": 0.30495521426200867, "step": 378 }, { "epoch": 10.535211267605634, "grad_norm": 0.2923314869403839, "learning_rate": 9.050878381756107e-07, "loss": 0.30734074115753174, "step": 379 }, { "epoch": 10.56338028169014, "grad_norm": 0.2865448594093323, "learning_rate": 9.046048391230247e-07, "loss": 0.2913230061531067, "step": 380 }, { "epoch": 10.591549295774648, "grad_norm": 0.29643693566322327, "learning_rate": 9.041207598826017e-07, "loss": 0.30088239908218384, "step": 381 }, { "epoch": 10.619718309859154, "grad_norm": 0.2761143445968628, "learning_rate": 9.036356019289309e-07, "loss": 0.30702435970306396, "step": 382 }, { "epoch": 10.647887323943662, "grad_norm": 0.27720797061920166, "learning_rate": 9.031493667398872e-07, "loss": 0.2953702509403229, "step": 383 }, { "epoch": 10.676056338028168, "grad_norm": 0.30037540197372437, "learning_rate": 9.026620557966279e-07, "loss": 0.3012697696685791, "step": 384 }, { "epoch": 10.704225352112676, "grad_norm": 0.27628859877586365, "learning_rate": 9.021736705835862e-07, "loss": 0.30558526515960693, "step": 385 }, { "epoch": 10.732394366197184, "grad_norm": 0.2692992091178894, "learning_rate": 9.016842125884684e-07, "loss": 0.288699209690094, "step": 386 }, { "epoch": 10.76056338028169, "grad_norm": 0.30020084977149963, "learning_rate": 9.011936833022484e-07, "loss": 0.294253945350647, "step": 387 }, { "epoch": 10.788732394366198, "grad_norm": 0.29289868474006653, "learning_rate": 9.007020842191634e-07, "loss": 0.31805676221847534, "step": 388 }, { "epoch": 10.816901408450704, "grad_norm": 0.28465571999549866, "learning_rate": 9.002094168367095e-07, "loss": 0.3168966472148895, "step": 389 }, { "epoch": 10.845070422535212, "grad_norm": 0.27562448382377625, "learning_rate": 8.997156826556369e-07, "loss": 0.302585631608963, "step": 390 }, { "epoch": 10.873239436619718, "grad_norm": 0.28200119733810425, "learning_rate": 8.992208831799456e-07, "loss": 0.3037059009075165, "step": 391 }, { "epoch": 10.901408450704226, "grad_norm": 0.2829252779483795, "learning_rate": 8.987250199168808e-07, "loss": 0.2850543260574341, "step": 392 }, { "epoch": 10.929577464788732, "grad_norm": 0.28010982275009155, "learning_rate": 8.982280943769278e-07, "loss": 0.30365508794784546, "step": 393 }, { "epoch": 10.95774647887324, "grad_norm": 0.2917790114879608, "learning_rate": 8.977301080738079e-07, "loss": 0.32212477922439575, "step": 394 }, { "epoch": 10.985915492957746, "grad_norm": 0.27254894375801086, "learning_rate": 8.97231062524474e-07, "loss": 0.29733577370643616, "step": 395 }, { "epoch": 11.0, "grad_norm": 0.38847291469573975, "learning_rate": 8.967309592491052e-07, "loss": 0.31824764609336853, "step": 396 }, { "epoch": 11.028169014084508, "grad_norm": 0.27360019087791443, "learning_rate": 8.962297997711027e-07, "loss": 0.2907956540584564, "step": 397 }, { "epoch": 11.056338028169014, "grad_norm": 0.28565695881843567, "learning_rate": 8.957275856170855e-07, "loss": 0.30498966574668884, "step": 398 }, { "epoch": 11.084507042253522, "grad_norm": 0.2826082408428192, "learning_rate": 8.952243183168848e-07, "loss": 0.3076494634151459, "step": 399 }, { "epoch": 11.112676056338028, "grad_norm": 0.28598853945732117, "learning_rate": 8.9471999940354e-07, "loss": 0.29677921533584595, "step": 400 }, { "epoch": 11.140845070422536, "grad_norm": 0.27635788917541504, "learning_rate": 8.942146304132943e-07, "loss": 0.28424787521362305, "step": 401 }, { "epoch": 11.169014084507042, "grad_norm": 0.3110678195953369, "learning_rate": 8.937082128855891e-07, "loss": 0.31091392040252686, "step": 402 }, { "epoch": 11.19718309859155, "grad_norm": 0.28018108010292053, "learning_rate": 8.932007483630596e-07, "loss": 0.2973289489746094, "step": 403 }, { "epoch": 11.225352112676056, "grad_norm": 0.2748464345932007, "learning_rate": 8.926922383915315e-07, "loss": 0.3064712882041931, "step": 404 }, { "epoch": 11.253521126760564, "grad_norm": 0.2758099138736725, "learning_rate": 8.921826845200138e-07, "loss": 0.30080002546310425, "step": 405 }, { "epoch": 11.28169014084507, "grad_norm": 0.27323541045188904, "learning_rate": 8.916720883006963e-07, "loss": 0.30011099576950073, "step": 406 }, { "epoch": 11.309859154929578, "grad_norm": 0.2751684784889221, "learning_rate": 8.911604512889434e-07, "loss": 0.3021606206893921, "step": 407 }, { "epoch": 11.338028169014084, "grad_norm": 0.278543084859848, "learning_rate": 8.906477750432903e-07, "loss": 0.2979898452758789, "step": 408 }, { "epoch": 11.366197183098592, "grad_norm": 0.2872096300125122, "learning_rate": 8.901340611254378e-07, "loss": 0.30450716614723206, "step": 409 }, { "epoch": 11.394366197183098, "grad_norm": 0.27768319845199585, "learning_rate": 8.896193111002475e-07, "loss": 0.31025999784469604, "step": 410 }, { "epoch": 11.422535211267606, "grad_norm": 0.28008511662483215, "learning_rate": 8.891035265357371e-07, "loss": 0.2903551757335663, "step": 411 }, { "epoch": 11.450704225352112, "grad_norm": 0.28000614047050476, "learning_rate": 8.88586709003076e-07, "loss": 0.30711328983306885, "step": 412 }, { "epoch": 11.47887323943662, "grad_norm": 0.27915990352630615, "learning_rate": 8.8806886007658e-07, "loss": 0.309296578168869, "step": 413 }, { "epoch": 11.507042253521126, "grad_norm": 0.2682763636112213, "learning_rate": 8.875499813337067e-07, "loss": 0.3053497076034546, "step": 414 }, { "epoch": 11.535211267605634, "grad_norm": 0.26592400670051575, "learning_rate": 8.87030074355051e-07, "loss": 0.29761987924575806, "step": 415 }, { "epoch": 11.56338028169014, "grad_norm": 0.2664642333984375, "learning_rate": 8.865091407243394e-07, "loss": 0.2986457645893097, "step": 416 }, { "epoch": 11.591549295774648, "grad_norm": 0.2615084648132324, "learning_rate": 8.859871820284261e-07, "loss": 0.31391632556915283, "step": 417 }, { "epoch": 11.619718309859154, "grad_norm": 0.27312856912612915, "learning_rate": 8.85464199857288e-07, "loss": 0.3128984570503235, "step": 418 }, { "epoch": 11.647887323943662, "grad_norm": 0.2734473645687103, "learning_rate": 8.849401958040192e-07, "loss": 0.298526793718338, "step": 419 }, { "epoch": 11.676056338028168, "grad_norm": 0.2901906669139862, "learning_rate": 8.844151714648274e-07, "loss": 0.31268036365509033, "step": 420 }, { "epoch": 11.704225352112676, "grad_norm": 0.28374356031417847, "learning_rate": 8.838891284390273e-07, "loss": 0.3042759299278259, "step": 421 }, { "epoch": 11.732394366197184, "grad_norm": 0.26128286123275757, "learning_rate": 8.833620683290375e-07, "loss": 0.30057787895202637, "step": 422 }, { "epoch": 11.76056338028169, "grad_norm": 0.29005923867225647, "learning_rate": 8.828339927403745e-07, "loss": 0.2969115376472473, "step": 423 }, { "epoch": 11.788732394366198, "grad_norm": 0.26823022961616516, "learning_rate": 8.823049032816478e-07, "loss": 0.3024095296859741, "step": 424 }, { "epoch": 11.816901408450704, "grad_norm": 0.2938059866428375, "learning_rate": 8.817748015645558e-07, "loss": 0.2982884347438812, "step": 425 }, { "epoch": 11.845070422535212, "grad_norm": 0.2794440686702728, "learning_rate": 8.812436892038805e-07, "loss": 0.3006170094013214, "step": 426 }, { "epoch": 11.873239436619718, "grad_norm": 0.27727699279785156, "learning_rate": 8.807115678174819e-07, "loss": 0.29938215017318726, "step": 427 }, { "epoch": 11.901408450704226, "grad_norm": 0.28038865327835083, "learning_rate": 8.801784390262943e-07, "loss": 0.3107326924800873, "step": 428 }, { "epoch": 11.929577464788732, "grad_norm": 0.29747217893600464, "learning_rate": 8.796443044543203e-07, "loss": 0.2999688982963562, "step": 429 }, { "epoch": 11.95774647887324, "grad_norm": 0.2875438332557678, "learning_rate": 8.791091657286267e-07, "loss": 0.2930242419242859, "step": 430 }, { "epoch": 11.985915492957746, "grad_norm": 0.2946978211402893, "learning_rate": 8.785730244793386e-07, "loss": 0.295132577419281, "step": 431 }, { "epoch": 12.0, "grad_norm": 0.39752283692359924, "learning_rate": 8.780358823396352e-07, "loss": 0.30750101804733276, "step": 432 }, { "epoch": 12.028169014084508, "grad_norm": 0.2708489000797272, "learning_rate": 8.774977409457447e-07, "loss": 0.3058265447616577, "step": 433 }, { "epoch": 12.056338028169014, "grad_norm": 0.2773410975933075, "learning_rate": 8.769586019369391e-07, "loss": 0.30409157276153564, "step": 434 }, { "epoch": 12.084507042253522, "grad_norm": 0.26894107460975647, "learning_rate": 8.764184669555293e-07, "loss": 0.30384916067123413, "step": 435 }, { "epoch": 12.112676056338028, "grad_norm": 0.27837878465652466, "learning_rate": 8.758773376468604e-07, "loss": 0.2943356931209564, "step": 436 }, { "epoch": 12.140845070422536, "grad_norm": 0.2690330445766449, "learning_rate": 8.753352156593055e-07, "loss": 0.2933955788612366, "step": 437 }, { "epoch": 12.169014084507042, "grad_norm": 0.27980291843414307, "learning_rate": 8.747921026442629e-07, "loss": 0.28997617959976196, "step": 438 }, { "epoch": 12.19718309859155, "grad_norm": 0.287624329328537, "learning_rate": 8.742480002561487e-07, "loss": 0.30039626359939575, "step": 439 }, { "epoch": 12.225352112676056, "grad_norm": 0.28817304968833923, "learning_rate": 8.737029101523929e-07, "loss": 0.3200758099555969, "step": 440 }, { "epoch": 12.253521126760564, "grad_norm": 0.2769193649291992, "learning_rate": 8.731568339934348e-07, "loss": 0.2976597547531128, "step": 441 }, { "epoch": 12.28169014084507, "grad_norm": 0.309583842754364, "learning_rate": 8.726097734427172e-07, "loss": 0.2977990210056305, "step": 442 }, { "epoch": 12.309859154929578, "grad_norm": 0.26997339725494385, "learning_rate": 8.72061730166681e-07, "loss": 0.29733020067214966, "step": 443 }, { "epoch": 12.338028169014084, "grad_norm": 0.2782990634441376, "learning_rate": 8.715127058347614e-07, "loss": 0.29592543840408325, "step": 444 }, { "epoch": 12.366197183098592, "grad_norm": 0.2781784236431122, "learning_rate": 8.709627021193816e-07, "loss": 0.2965870797634125, "step": 445 }, { "epoch": 12.394366197183098, "grad_norm": 0.2965787649154663, "learning_rate": 8.704117206959484e-07, "loss": 0.30272242426872253, "step": 446 }, { "epoch": 12.422535211267606, "grad_norm": 0.2780534625053406, "learning_rate": 8.698597632428466e-07, "loss": 0.30883416533470154, "step": 447 }, { "epoch": 12.450704225352112, "grad_norm": 0.27513188123703003, "learning_rate": 8.693068314414344e-07, "loss": 0.30461177229881287, "step": 448 }, { "epoch": 12.47887323943662, "grad_norm": 0.2838785946369171, "learning_rate": 8.687529269760379e-07, "loss": 0.2927112281322479, "step": 449 }, { "epoch": 12.507042253521126, "grad_norm": 0.28894707560539246, "learning_rate": 8.681980515339463e-07, "loss": 0.28816863894462585, "step": 450 }, { "epoch": 12.535211267605634, "grad_norm": 0.28006207942962646, "learning_rate": 8.676422068054064e-07, "loss": 0.29931047558784485, "step": 451 }, { "epoch": 12.56338028169014, "grad_norm": 0.2799602150917053, "learning_rate": 8.670853944836176e-07, "loss": 0.3038347363471985, "step": 452 }, { "epoch": 12.591549295774648, "grad_norm": 0.2760638892650604, "learning_rate": 8.665276162647267e-07, "loss": 0.30183106660842896, "step": 453 }, { "epoch": 12.619718309859154, "grad_norm": 0.278127521276474, "learning_rate": 8.659688738478231e-07, "loss": 0.3019717335700989, "step": 454 }, { "epoch": 12.647887323943662, "grad_norm": 0.26856380701065063, "learning_rate": 8.654091689349329e-07, "loss": 0.2945576310157776, "step": 455 }, { "epoch": 12.676056338028168, "grad_norm": 0.2749437391757965, "learning_rate": 8.648485032310144e-07, "loss": 0.3023756444454193, "step": 456 }, { "epoch": 12.704225352112676, "grad_norm": 0.2729102671146393, "learning_rate": 8.642868784439527e-07, "loss": 0.2842894196510315, "step": 457 }, { "epoch": 12.732394366197184, "grad_norm": 0.28390341997146606, "learning_rate": 8.63724296284554e-07, "loss": 0.2940555810928345, "step": 458 }, { "epoch": 12.76056338028169, "grad_norm": 0.2739807069301605, "learning_rate": 8.631607584665413e-07, "loss": 0.2935922145843506, "step": 459 }, { "epoch": 12.788732394366198, "grad_norm": 0.2823079824447632, "learning_rate": 8.625962667065487e-07, "loss": 0.2949485182762146, "step": 460 }, { "epoch": 12.816901408450704, "grad_norm": 0.2843155264854431, "learning_rate": 8.620308227241157e-07, "loss": 0.31058311462402344, "step": 461 }, { "epoch": 12.845070422535212, "grad_norm": 0.2805749475955963, "learning_rate": 8.614644282416831e-07, "loss": 0.2892061173915863, "step": 462 }, { "epoch": 12.873239436619718, "grad_norm": 0.2773419916629791, "learning_rate": 8.608970849845862e-07, "loss": 0.28688696026802063, "step": 463 }, { "epoch": 12.901408450704226, "grad_norm": 0.28667542338371277, "learning_rate": 8.603287946810513e-07, "loss": 0.30356699228286743, "step": 464 }, { "epoch": 12.929577464788732, "grad_norm": 0.2785196900367737, "learning_rate": 8.597595590621892e-07, "loss": 0.29802441596984863, "step": 465 }, { "epoch": 12.95774647887324, "grad_norm": 0.2778855562210083, "learning_rate": 8.591893798619903e-07, "loss": 0.29154932498931885, "step": 466 }, { "epoch": 12.985915492957746, "grad_norm": 0.28308385610580444, "learning_rate": 8.586182588173194e-07, "loss": 0.29143208265304565, "step": 467 }, { "epoch": 13.0, "grad_norm": 0.39711424708366394, "learning_rate": 8.580461976679099e-07, "loss": 0.2990560233592987, "step": 468 }, { "epoch": 13.028169014084508, "grad_norm": 0.26802533864974976, "learning_rate": 8.574731981563597e-07, "loss": 0.29934608936309814, "step": 469 }, { "epoch": 13.056338028169014, "grad_norm": 0.2663622498512268, "learning_rate": 8.568992620281243e-07, "loss": 0.29982200264930725, "step": 470 }, { "epoch": 13.084507042253522, "grad_norm": 0.28624898195266724, "learning_rate": 8.56324391031513e-07, "loss": 0.2810109555721283, "step": 471 }, { "epoch": 13.112676056338028, "grad_norm": 0.28607407212257385, "learning_rate": 8.557485869176825e-07, "loss": 0.2949367165565491, "step": 472 }, { "epoch": 13.140845070422536, "grad_norm": 0.26953044533729553, "learning_rate": 8.551718514406318e-07, "loss": 0.2851143479347229, "step": 473 }, { "epoch": 13.169014084507042, "grad_norm": 0.31105440855026245, "learning_rate": 8.545941863571973e-07, "loss": 0.2858909070491791, "step": 474 }, { "epoch": 13.19718309859155, "grad_norm": 0.28143224120140076, "learning_rate": 8.540155934270471e-07, "loss": 0.2961467504501343, "step": 475 }, { "epoch": 13.225352112676056, "grad_norm": 0.2862183451652527, "learning_rate": 8.534360744126753e-07, "loss": 0.29882240295410156, "step": 476 }, { "epoch": 13.253521126760564, "grad_norm": 0.26780712604522705, "learning_rate": 8.528556310793979e-07, "loss": 0.2933373749256134, "step": 477 }, { "epoch": 13.28169014084507, "grad_norm": 0.27026116847991943, "learning_rate": 8.522742651953456e-07, "loss": 0.2968083918094635, "step": 478 }, { "epoch": 13.309859154929578, "grad_norm": 0.2800562381744385, "learning_rate": 8.516919785314595e-07, "loss": 0.3015640377998352, "step": 479 }, { "epoch": 13.338028169014084, "grad_norm": 0.29154452681541443, "learning_rate": 8.511087728614862e-07, "loss": 0.31045541167259216, "step": 480 }, { "epoch": 13.366197183098592, "grad_norm": 0.28183555603027344, "learning_rate": 8.50524649961971e-07, "loss": 0.29173219203948975, "step": 481 }, { "epoch": 13.394366197183098, "grad_norm": 0.2971493601799011, "learning_rate": 8.499396116122535e-07, "loss": 0.2765740752220154, "step": 482 }, { "epoch": 13.422535211267606, "grad_norm": 0.26922252774238586, "learning_rate": 8.493536595944622e-07, "loss": 0.297348290681839, "step": 483 }, { "epoch": 13.450704225352112, "grad_norm": 0.27836039662361145, "learning_rate": 8.487667956935087e-07, "loss": 0.28694790601730347, "step": 484 }, { "epoch": 13.47887323943662, "grad_norm": 0.29267406463623047, "learning_rate": 8.481790216970819e-07, "loss": 0.2862587571144104, "step": 485 }, { "epoch": 13.507042253521126, "grad_norm": 0.27863144874572754, "learning_rate": 8.475903393956433e-07, "loss": 0.2894202470779419, "step": 486 }, { "epoch": 13.535211267605634, "grad_norm": 0.2911999523639679, "learning_rate": 8.470007505824215e-07, "loss": 0.29356449842453003, "step": 487 }, { "epoch": 13.56338028169014, "grad_norm": 0.2968003451824188, "learning_rate": 8.464102570534061e-07, "loss": 0.29188239574432373, "step": 488 }, { "epoch": 13.591549295774648, "grad_norm": 0.2842749357223511, "learning_rate": 8.458188606073431e-07, "loss": 0.28485268354415894, "step": 489 }, { "epoch": 13.619718309859154, "grad_norm": 0.2762301564216614, "learning_rate": 8.452265630457282e-07, "loss": 0.2829025387763977, "step": 490 }, { "epoch": 13.647887323943662, "grad_norm": 0.27368924021720886, "learning_rate": 8.446333661728028e-07, "loss": 0.3129264712333679, "step": 491 }, { "epoch": 13.676056338028168, "grad_norm": 0.3042363226413727, "learning_rate": 8.440392717955475e-07, "loss": 0.298667311668396, "step": 492 }, { "epoch": 13.704225352112676, "grad_norm": 0.31437602639198303, "learning_rate": 8.434442817236765e-07, "loss": 0.2911669909954071, "step": 493 }, { "epoch": 13.732394366197184, "grad_norm": 0.2624206840991974, "learning_rate": 8.428483977696328e-07, "loss": 0.2875954508781433, "step": 494 }, { "epoch": 13.76056338028169, "grad_norm": 0.2824702858924866, "learning_rate": 8.422516217485825e-07, "loss": 0.28079336881637573, "step": 495 }, { "epoch": 13.788732394366198, "grad_norm": 0.27612945437431335, "learning_rate": 8.416539554784089e-07, "loss": 0.3052091598510742, "step": 496 }, { "epoch": 13.816901408450704, "grad_norm": 0.28139790892601013, "learning_rate": 8.410554007797068e-07, "loss": 0.2918257415294647, "step": 497 }, { "epoch": 13.845070422535212, "grad_norm": 0.2779678702354431, "learning_rate": 8.404559594757777e-07, "loss": 0.30707138776779175, "step": 498 }, { "epoch": 13.873239436619718, "grad_norm": 0.2710152566432953, "learning_rate": 8.398556333926239e-07, "loss": 0.3128437101840973, "step": 499 }, { "epoch": 13.901408450704226, "grad_norm": 0.2958044707775116, "learning_rate": 8.392544243589427e-07, "loss": 0.29653337597846985, "step": 500 }, { "epoch": 13.929577464788732, "grad_norm": 0.28408974409103394, "learning_rate": 8.38652334206121e-07, "loss": 0.29291969537734985, "step": 501 }, { "epoch": 13.95774647887324, "grad_norm": 0.27897724509239197, "learning_rate": 8.3804936476823e-07, "loss": 0.3117462992668152, "step": 502 }, { "epoch": 13.985915492957746, "grad_norm": 0.27391254901885986, "learning_rate": 8.374455178820189e-07, "loss": 0.30571603775024414, "step": 503 }, { "epoch": 14.0, "grad_norm": 0.3995163142681122, "learning_rate": 8.368407953869103e-07, "loss": 0.2876809239387512, "step": 504 }, { "epoch": 14.028169014084508, "grad_norm": 0.3068762719631195, "learning_rate": 8.362351991249937e-07, "loss": 0.28866052627563477, "step": 505 }, { "epoch": 14.056338028169014, "grad_norm": 0.278751939535141, "learning_rate": 8.356287309410204e-07, "loss": 0.3048397898674011, "step": 506 }, { "epoch": 14.084507042253522, "grad_norm": 0.2831234335899353, "learning_rate": 8.350213926823974e-07, "loss": 0.28643566370010376, "step": 507 }, { "epoch": 14.112676056338028, "grad_norm": 0.2744354009628296, "learning_rate": 8.344131861991828e-07, "loss": 0.30159255862236023, "step": 508 }, { "epoch": 14.140845070422536, "grad_norm": 0.2834227383136749, "learning_rate": 8.338041133440788e-07, "loss": 0.2945912182331085, "step": 509 }, { "epoch": 14.169014084507042, "grad_norm": 0.2914932072162628, "learning_rate": 8.331941759724268e-07, "loss": 0.30261489748954773, "step": 510 }, { "epoch": 14.19718309859155, "grad_norm": 0.2795814871788025, "learning_rate": 8.325833759422021e-07, "loss": 0.29661813378334045, "step": 511 }, { "epoch": 14.225352112676056, "grad_norm": 0.2715330719947815, "learning_rate": 8.319717151140072e-07, "loss": 0.28672271966934204, "step": 512 }, { "epoch": 14.253521126760564, "grad_norm": 0.2859768271446228, "learning_rate": 8.313591953510673e-07, "loss": 0.2985742390155792, "step": 513 }, { "epoch": 14.28169014084507, "grad_norm": 0.2789771854877472, "learning_rate": 8.307458185192238e-07, "loss": 0.2883588671684265, "step": 514 }, { "epoch": 14.309859154929578, "grad_norm": 0.2849474549293518, "learning_rate": 8.301315864869289e-07, "loss": 0.3045833706855774, "step": 515 }, { "epoch": 14.338028169014084, "grad_norm": 0.28583216667175293, "learning_rate": 8.295165011252396e-07, "loss": 0.28541919589042664, "step": 516 }, { "epoch": 14.366197183098592, "grad_norm": 0.286767840385437, "learning_rate": 8.289005643078131e-07, "loss": 0.2928876280784607, "step": 517 }, { "epoch": 14.394366197183098, "grad_norm": 0.2851925790309906, "learning_rate": 8.282837779108993e-07, "loss": 0.29808348417282104, "step": 518 }, { "epoch": 14.422535211267606, "grad_norm": 0.2843434512615204, "learning_rate": 8.276661438133368e-07, "loss": 0.281357079744339, "step": 519 }, { "epoch": 14.450704225352112, "grad_norm": 0.29959535598754883, "learning_rate": 8.270476638965461e-07, "loss": 0.287128746509552, "step": 520 }, { "epoch": 14.47887323943662, "grad_norm": 0.2812483310699463, "learning_rate": 8.264283400445243e-07, "loss": 0.29306480288505554, "step": 521 }, { "epoch": 14.507042253521126, "grad_norm": 0.3015466034412384, "learning_rate": 8.258081741438394e-07, "loss": 0.3011341691017151, "step": 522 }, { "epoch": 14.535211267605634, "grad_norm": 0.2930891215801239, "learning_rate": 8.25187168083624e-07, "loss": 0.2976144850254059, "step": 523 }, { "epoch": 14.56338028169014, "grad_norm": 0.2777521312236786, "learning_rate": 8.245653237555705e-07, "loss": 0.2829003930091858, "step": 524 }, { "epoch": 14.591549295774648, "grad_norm": 0.2916077673435211, "learning_rate": 8.239426430539243e-07, "loss": 0.28546392917633057, "step": 525 }, { "epoch": 14.619718309859154, "grad_norm": 0.3006315231323242, "learning_rate": 8.23319127875479e-07, "loss": 0.2851755619049072, "step": 526 }, { "epoch": 14.647887323943662, "grad_norm": 0.2654482424259186, "learning_rate": 8.226947801195699e-07, "loss": 0.28430840373039246, "step": 527 }, { "epoch": 14.676056338028168, "grad_norm": 0.2679372727870941, "learning_rate": 8.220696016880687e-07, "loss": 0.282630980014801, "step": 528 }, { "epoch": 14.704225352112676, "grad_norm": 0.28538262844085693, "learning_rate": 8.21443594485377e-07, "loss": 0.2789214551448822, "step": 529 }, { "epoch": 14.732394366197184, "grad_norm": 0.2713358700275421, "learning_rate": 8.208167604184217e-07, "loss": 0.2909342646598816, "step": 530 }, { "epoch": 14.76056338028169, "grad_norm": 0.30056601762771606, "learning_rate": 8.201891013966478e-07, "loss": 0.2838485836982727, "step": 531 }, { "epoch": 14.788732394366198, "grad_norm": 0.2811543345451355, "learning_rate": 8.195606193320136e-07, "loss": 0.29030710458755493, "step": 532 }, { "epoch": 14.816901408450704, "grad_norm": 0.2930709719657898, "learning_rate": 8.189313161389844e-07, "loss": 0.2922976613044739, "step": 533 }, { "epoch": 14.845070422535212, "grad_norm": 0.29798057675361633, "learning_rate": 8.183011937345271e-07, "loss": 0.2951294183731079, "step": 534 }, { "epoch": 14.873239436619718, "grad_norm": 0.28483426570892334, "learning_rate": 8.176702540381036e-07, "loss": 0.2938500642776489, "step": 535 }, { "epoch": 14.901408450704226, "grad_norm": 0.2990010380744934, "learning_rate": 8.170384989716657e-07, "loss": 0.29805850982666016, "step": 536 }, { "epoch": 14.929577464788732, "grad_norm": 0.2896774411201477, "learning_rate": 8.164059304596488e-07, "loss": 0.29530227184295654, "step": 537 }, { "epoch": 14.95774647887324, "grad_norm": 0.28662148118019104, "learning_rate": 8.157725504289664e-07, "loss": 0.28371667861938477, "step": 538 }, { "epoch": 14.985915492957746, "grad_norm": 0.2807771861553192, "learning_rate": 8.151383608090039e-07, "loss": 0.29020193219184875, "step": 539 }, { "epoch": 15.0, "grad_norm": 0.39528268575668335, "learning_rate": 8.145033635316128e-07, "loss": 0.30530279874801636, "step": 540 }, { "epoch": 15.028169014084508, "grad_norm": 0.28691425919532776, "learning_rate": 8.138675605311051e-07, "loss": 0.27306681871414185, "step": 541 }, { "epoch": 15.056338028169014, "grad_norm": 0.27633434534072876, "learning_rate": 8.13230953744247e-07, "loss": 0.2900540828704834, "step": 542 }, { "epoch": 15.084507042253522, "grad_norm": 0.28263136744499207, "learning_rate": 8.125935451102528e-07, "loss": 0.29298198223114014, "step": 543 }, { "epoch": 15.112676056338028, "grad_norm": 0.2708156406879425, "learning_rate": 8.119553365707802e-07, "loss": 0.2728630006313324, "step": 544 }, { "epoch": 15.140845070422536, "grad_norm": 0.28263747692108154, "learning_rate": 8.113163300699228e-07, "loss": 0.2994900047779083, "step": 545 }, { "epoch": 15.169014084507042, "grad_norm": 0.2628503739833832, "learning_rate": 8.106765275542053e-07, "loss": 0.2943934202194214, "step": 546 }, { "epoch": 15.19718309859155, "grad_norm": 0.2844214141368866, "learning_rate": 8.100359309725774e-07, "loss": 0.286617636680603, "step": 547 }, { "epoch": 15.225352112676056, "grad_norm": 0.2979234457015991, "learning_rate": 8.093945422764069e-07, "loss": 0.28598904609680176, "step": 548 }, { "epoch": 15.253521126760564, "grad_norm": 0.2918255925178528, "learning_rate": 8.087523634194754e-07, "loss": 0.2826801538467407, "step": 549 }, { "epoch": 15.28169014084507, "grad_norm": 0.30238643288612366, "learning_rate": 8.081093963579707e-07, "loss": 0.3018723726272583, "step": 550 }, { "epoch": 15.309859154929578, "grad_norm": 0.2762410342693329, "learning_rate": 8.074656430504823e-07, "loss": 0.27831658720970154, "step": 551 }, { "epoch": 15.338028169014084, "grad_norm": 0.28324148058891296, "learning_rate": 8.068211054579943e-07, "loss": 0.30506500601768494, "step": 552 }, { "epoch": 15.366197183098592, "grad_norm": 0.2893829643726349, "learning_rate": 8.061757855438799e-07, "loss": 0.29023078083992004, "step": 553 }, { "epoch": 15.394366197183098, "grad_norm": 0.2907930016517639, "learning_rate": 8.055296852738956e-07, "loss": 0.28343409299850464, "step": 554 }, { "epoch": 15.422535211267606, "grad_norm": 0.28478139638900757, "learning_rate": 8.048828066161747e-07, "loss": 0.28546571731567383, "step": 555 }, { "epoch": 15.450704225352112, "grad_norm": 0.2851191759109497, "learning_rate": 8.04235151541222e-07, "loss": 0.2884707748889923, "step": 556 }, { "epoch": 15.47887323943662, "grad_norm": 0.2689509987831116, "learning_rate": 8.035867220219071e-07, "loss": 0.2950664758682251, "step": 557 }, { "epoch": 15.507042253521126, "grad_norm": 0.2825435400009155, "learning_rate": 8.029375200334587e-07, "loss": 0.281552791595459, "step": 558 }, { "epoch": 15.535211267605634, "grad_norm": 0.28483787178993225, "learning_rate": 8.022875475534588e-07, "loss": 0.2870042622089386, "step": 559 }, { "epoch": 15.56338028169014, "grad_norm": 0.27896517515182495, "learning_rate": 8.01636806561836e-07, "loss": 0.287916362285614, "step": 560 }, { "epoch": 15.591549295774648, "grad_norm": 0.2788335382938385, "learning_rate": 8.009852990408606e-07, "loss": 0.28609931468963623, "step": 561 }, { "epoch": 15.619718309859154, "grad_norm": 0.2826322019100189, "learning_rate": 8.003330269751372e-07, "loss": 0.2950190305709839, "step": 562 }, { "epoch": 15.647887323943662, "grad_norm": 0.2843019366264343, "learning_rate": 7.996799923515997e-07, "loss": 0.2914244532585144, "step": 563 }, { "epoch": 15.676056338028168, "grad_norm": 0.26445460319519043, "learning_rate": 7.990261971595048e-07, "loss": 0.27984780073165894, "step": 564 }, { "epoch": 15.704225352112676, "grad_norm": 0.27918627858161926, "learning_rate": 7.983716433904262e-07, "loss": 0.27757298946380615, "step": 565 }, { "epoch": 15.732394366197184, "grad_norm": 0.2938336133956909, "learning_rate": 7.977163330382479e-07, "loss": 0.2920360565185547, "step": 566 }, { "epoch": 15.76056338028169, "grad_norm": 0.28976547718048096, "learning_rate": 7.970602680991592e-07, "loss": 0.2951090931892395, "step": 567 }, { "epoch": 15.788732394366198, "grad_norm": 0.27327752113342285, "learning_rate": 7.964034505716476e-07, "loss": 0.29640987515449524, "step": 568 }, { "epoch": 15.816901408450704, "grad_norm": 0.27222704887390137, "learning_rate": 7.957458824564931e-07, "loss": 0.28876399993896484, "step": 569 }, { "epoch": 15.845070422535212, "grad_norm": 0.29962998628616333, "learning_rate": 7.950875657567621e-07, "loss": 0.3039361238479614, "step": 570 }, { "epoch": 15.873239436619718, "grad_norm": 0.2705839276313782, "learning_rate": 7.944285024778017e-07, "loss": 0.28840112686157227, "step": 571 }, { "epoch": 15.901408450704226, "grad_norm": 0.28124475479125977, "learning_rate": 7.93768694627233e-07, "loss": 0.2832530736923218, "step": 572 }, { "epoch": 15.929577464788732, "grad_norm": 0.29025372862815857, "learning_rate": 7.931081442149448e-07, "loss": 0.28588593006134033, "step": 573 }, { "epoch": 15.95774647887324, "grad_norm": 0.27376946806907654, "learning_rate": 7.924468532530883e-07, "loss": 0.2883457839488983, "step": 574 }, { "epoch": 15.985915492957746, "grad_norm": 0.28059038519859314, "learning_rate": 7.917848237560708e-07, "loss": 0.2923107147216797, "step": 575 }, { "epoch": 16.0, "grad_norm": 0.39920157194137573, "learning_rate": 7.911220577405484e-07, "loss": 0.2896960973739624, "step": 576 }, { "epoch": 16.028169014084508, "grad_norm": 0.2756041884422302, "learning_rate": 7.904585572254218e-07, "loss": 0.2934238910675049, "step": 577 }, { "epoch": 16.056338028169016, "grad_norm": 0.2831096947193146, "learning_rate": 7.897943242318285e-07, "loss": 0.2862626910209656, "step": 578 }, { "epoch": 16.08450704225352, "grad_norm": 0.27020981907844543, "learning_rate": 7.891293607831373e-07, "loss": 0.3019767999649048, "step": 579 }, { "epoch": 16.112676056338028, "grad_norm": 0.2866615056991577, "learning_rate": 7.884636689049422e-07, "loss": 0.29431337118148804, "step": 580 }, { "epoch": 16.140845070422536, "grad_norm": 0.27709120512008667, "learning_rate": 7.877972506250562e-07, "loss": 0.26718783378601074, "step": 581 }, { "epoch": 16.169014084507044, "grad_norm": 0.2864624261856079, "learning_rate": 7.871301079735049e-07, "loss": 0.28138402104377747, "step": 582 }, { "epoch": 16.197183098591548, "grad_norm": 0.2806070148944855, "learning_rate": 7.864622429825204e-07, "loss": 0.29040491580963135, "step": 583 }, { "epoch": 16.225352112676056, "grad_norm": 0.2866605818271637, "learning_rate": 7.857936576865356e-07, "loss": 0.2876106798648834, "step": 584 }, { "epoch": 16.253521126760564, "grad_norm": 0.2853955626487732, "learning_rate": 7.851243541221769e-07, "loss": 0.30784159898757935, "step": 585 }, { "epoch": 16.281690140845072, "grad_norm": 0.290031760931015, "learning_rate": 7.844543343282595e-07, "loss": 0.27567434310913086, "step": 586 }, { "epoch": 16.309859154929576, "grad_norm": 0.283806174993515, "learning_rate": 7.837836003457793e-07, "loss": 0.28710314631462097, "step": 587 }, { "epoch": 16.338028169014084, "grad_norm": 0.2768094539642334, "learning_rate": 7.831121542179086e-07, "loss": 0.27676063776016235, "step": 588 }, { "epoch": 16.366197183098592, "grad_norm": 0.27568569779396057, "learning_rate": 7.824399979899889e-07, "loss": 0.2947593927383423, "step": 589 }, { "epoch": 16.3943661971831, "grad_norm": 0.3079885244369507, "learning_rate": 7.817671337095244e-07, "loss": 0.2868027985095978, "step": 590 }, { "epoch": 16.422535211267604, "grad_norm": 0.29744645953178406, "learning_rate": 7.810935634261764e-07, "loss": 0.2946295738220215, "step": 591 }, { "epoch": 16.450704225352112, "grad_norm": 0.28457650542259216, "learning_rate": 7.804192891917571e-07, "loss": 0.2790455222129822, "step": 592 }, { "epoch": 16.47887323943662, "grad_norm": 0.28848767280578613, "learning_rate": 7.797443130602226e-07, "loss": 0.2941606640815735, "step": 593 }, { "epoch": 16.507042253521128, "grad_norm": 0.2936708927154541, "learning_rate": 7.79068637087667e-07, "loss": 0.2923729121685028, "step": 594 }, { "epoch": 16.535211267605632, "grad_norm": 0.28460994362831116, "learning_rate": 7.783922633323169e-07, "loss": 0.2795827090740204, "step": 595 }, { "epoch": 16.56338028169014, "grad_norm": 0.28233277797698975, "learning_rate": 7.777151938545235e-07, "loss": 0.29222947359085083, "step": 596 }, { "epoch": 16.591549295774648, "grad_norm": 0.28648558259010315, "learning_rate": 7.770374307167585e-07, "loss": 0.27923721075057983, "step": 597 }, { "epoch": 16.619718309859156, "grad_norm": 0.2813912332057953, "learning_rate": 7.763589759836058e-07, "loss": 0.2912202477455139, "step": 598 }, { "epoch": 16.647887323943664, "grad_norm": 0.28273841738700867, "learning_rate": 7.756798317217558e-07, "loss": 0.29805850982666016, "step": 599 }, { "epoch": 16.676056338028168, "grad_norm": 0.2922080457210541, "learning_rate": 7.75e-07, "loss": 0.2834911346435547, "step": 600 }, { "epoch": 16.704225352112676, "grad_norm": 0.27855902910232544, "learning_rate": 7.743194828892235e-07, "loss": 0.2842041552066803, "step": 601 }, { "epoch": 16.732394366197184, "grad_norm": 0.2905668318271637, "learning_rate": 7.736382824623999e-07, "loss": 0.281250923871994, "step": 602 }, { "epoch": 16.760563380281692, "grad_norm": 0.2928289771080017, "learning_rate": 7.729564007945834e-07, "loss": 0.2863979935646057, "step": 603 }, { "epoch": 16.788732394366196, "grad_norm": 0.28705668449401855, "learning_rate": 7.72273839962904e-07, "loss": 0.287672221660614, "step": 604 }, { "epoch": 16.816901408450704, "grad_norm": 0.29107093811035156, "learning_rate": 7.715906020465602e-07, "loss": 0.27715277671813965, "step": 605 }, { "epoch": 16.845070422535212, "grad_norm": 0.28827348351478577, "learning_rate": 7.709066891268133e-07, "loss": 0.2648072838783264, "step": 606 }, { "epoch": 16.87323943661972, "grad_norm": 0.28768298029899597, "learning_rate": 7.702221032869808e-07, "loss": 0.26861560344696045, "step": 607 }, { "epoch": 16.901408450704224, "grad_norm": 0.3000086843967438, "learning_rate": 7.695368466124296e-07, "loss": 0.2910693287849426, "step": 608 }, { "epoch": 16.929577464788732, "grad_norm": 0.3058622181415558, "learning_rate": 7.688509211905707e-07, "loss": 0.2804388105869293, "step": 609 }, { "epoch": 16.95774647887324, "grad_norm": 0.2874692678451538, "learning_rate": 7.681643291108517e-07, "loss": 0.2883044481277466, "step": 610 }, { "epoch": 16.985915492957748, "grad_norm": 0.2868764102458954, "learning_rate": 7.67477072464751e-07, "loss": 0.2847598195075989, "step": 611 }, { "epoch": 17.0, "grad_norm": 0.3980148136615753, "learning_rate": 7.667891533457718e-07, "loss": 0.29258161783218384, "step": 612 }, { "epoch": 17.028169014084508, "grad_norm": 0.2752118408679962, "learning_rate": 7.661005738494349e-07, "loss": 0.28283417224884033, "step": 613 }, { "epoch": 17.056338028169016, "grad_norm": 0.2837778627872467, "learning_rate": 7.654113360732732e-07, "loss": 0.2758600115776062, "step": 614 }, { "epoch": 17.08450704225352, "grad_norm": 0.2887240946292877, "learning_rate": 7.647214421168238e-07, "loss": 0.2864817976951599, "step": 615 }, { "epoch": 17.112676056338028, "grad_norm": 0.27935662865638733, "learning_rate": 7.640308940816239e-07, "loss": 0.28024283051490784, "step": 616 }, { "epoch": 17.140845070422536, "grad_norm": 0.2960900664329529, "learning_rate": 7.633396940712023e-07, "loss": 0.2681460976600647, "step": 617 }, { "epoch": 17.169014084507044, "grad_norm": 0.2915673553943634, "learning_rate": 7.626478441910744e-07, "loss": 0.2805773913860321, "step": 618 }, { "epoch": 17.197183098591548, "grad_norm": 0.2789720892906189, "learning_rate": 7.619553465487344e-07, "loss": 0.28847092390060425, "step": 619 }, { "epoch": 17.225352112676056, "grad_norm": 0.2745218575000763, "learning_rate": 7.612622032536507e-07, "loss": 0.28274643421173096, "step": 620 }, { "epoch": 17.253521126760564, "grad_norm": 0.2962469458580017, "learning_rate": 7.60568416417258e-07, "loss": 0.2827341556549072, "step": 621 }, { "epoch": 17.281690140845072, "grad_norm": 0.28243717551231384, "learning_rate": 7.59873988152951e-07, "loss": 0.2872379422187805, "step": 622 }, { "epoch": 17.309859154929576, "grad_norm": 0.2935909926891327, "learning_rate": 7.591789205760789e-07, "loss": 0.29077547788619995, "step": 623 }, { "epoch": 17.338028169014084, "grad_norm": 0.2725030481815338, "learning_rate": 7.584832158039378e-07, "loss": 0.28079894185066223, "step": 624 }, { "epoch": 17.366197183098592, "grad_norm": 0.2863542437553406, "learning_rate": 7.577868759557653e-07, "loss": 0.2759760618209839, "step": 625 }, { "epoch": 17.3943661971831, "grad_norm": 0.2829958498477936, "learning_rate": 7.570899031527332e-07, "loss": 0.27316516637802124, "step": 626 }, { "epoch": 17.422535211267604, "grad_norm": 0.28861963748931885, "learning_rate": 7.563922995179418e-07, "loss": 0.2758478820323944, "step": 627 }, { "epoch": 17.450704225352112, "grad_norm": 0.2935570478439331, "learning_rate": 7.556940671764124e-07, "loss": 0.28437983989715576, "step": 628 }, { "epoch": 17.47887323943662, "grad_norm": 0.3037278652191162, "learning_rate": 7.54995208255082e-07, "loss": 0.28943467140197754, "step": 629 }, { "epoch": 17.507042253521128, "grad_norm": 0.31774893403053284, "learning_rate": 7.54295724882796e-07, "loss": 0.29023581743240356, "step": 630 }, { "epoch": 17.535211267605632, "grad_norm": 0.28832852840423584, "learning_rate": 7.535956191903021e-07, "loss": 0.2840030789375305, "step": 631 }, { "epoch": 17.56338028169014, "grad_norm": 0.28122231364250183, "learning_rate": 7.528948933102438e-07, "loss": 0.28523629903793335, "step": 632 }, { "epoch": 17.591549295774648, "grad_norm": 0.29538190364837646, "learning_rate": 7.521935493771534e-07, "loss": 0.28018033504486084, "step": 633 }, { "epoch": 17.619718309859156, "grad_norm": 0.3163702189922333, "learning_rate": 7.514915895274463e-07, "loss": 0.2885722517967224, "step": 634 }, { "epoch": 17.647887323943664, "grad_norm": 0.2946973741054535, "learning_rate": 7.507890158994139e-07, "loss": 0.2785816490650177, "step": 635 }, { "epoch": 17.676056338028168, "grad_norm": 0.2805889844894409, "learning_rate": 7.500858306332172e-07, "loss": 0.2974117398262024, "step": 636 }, { "epoch": 17.704225352112676, "grad_norm": 0.28544914722442627, "learning_rate": 7.493820358708809e-07, "loss": 0.2892162501811981, "step": 637 }, { "epoch": 17.732394366197184, "grad_norm": 0.3272300064563751, "learning_rate": 7.486776337562853e-07, "loss": 0.3017275333404541, "step": 638 }, { "epoch": 17.760563380281692, "grad_norm": 0.28177788853645325, "learning_rate": 7.479726264351618e-07, "loss": 0.2729823589324951, "step": 639 }, { "epoch": 17.788732394366196, "grad_norm": 0.2774059474468231, "learning_rate": 7.472670160550848e-07, "loss": 0.27497977018356323, "step": 640 }, { "epoch": 17.816901408450704, "grad_norm": 0.2898328900337219, "learning_rate": 7.46560804765466e-07, "loss": 0.27945676445961, "step": 641 }, { "epoch": 17.845070422535212, "grad_norm": 0.2784922420978546, "learning_rate": 7.458539947175473e-07, "loss": 0.29566580057144165, "step": 642 }, { "epoch": 17.87323943661972, "grad_norm": 0.2864189147949219, "learning_rate": 7.45146588064395e-07, "loss": 0.2862587869167328, "step": 643 }, { "epoch": 17.901408450704224, "grad_norm": 0.2896963953971863, "learning_rate": 7.444385869608921e-07, "loss": 0.2924667000770569, "step": 644 }, { "epoch": 17.929577464788732, "grad_norm": 0.28463807702064514, "learning_rate": 7.437299935637328e-07, "loss": 0.2862287163734436, "step": 645 }, { "epoch": 17.95774647887324, "grad_norm": 0.28407302498817444, "learning_rate": 7.430208100314156e-07, "loss": 0.2759779989719391, "step": 646 }, { "epoch": 17.985915492957748, "grad_norm": 0.2773316502571106, "learning_rate": 7.423110385242366e-07, "loss": 0.2798498272895813, "step": 647 }, { "epoch": 18.0, "grad_norm": 0.3958338499069214, "learning_rate": 7.416006812042827e-07, "loss": 0.28481870889663696, "step": 648 }, { "epoch": 18.028169014084508, "grad_norm": 0.2922191321849823, "learning_rate": 7.408897402354255e-07, "loss": 0.2781963348388672, "step": 649 }, { "epoch": 18.056338028169016, "grad_norm": 0.29166096448898315, "learning_rate": 7.401782177833147e-07, "loss": 0.2843964099884033, "step": 650 }, { "epoch": 18.08450704225352, "grad_norm": 0.28290343284606934, "learning_rate": 7.394661160153709e-07, "loss": 0.2840275168418884, "step": 651 }, { "epoch": 18.112676056338028, "grad_norm": 0.28300249576568604, "learning_rate": 7.387534371007797e-07, "loss": 0.2893407642841339, "step": 652 }, { "epoch": 18.140845070422536, "grad_norm": 0.2870761752128601, "learning_rate": 7.380401832104845e-07, "loss": 0.26570916175842285, "step": 653 }, { "epoch": 18.169014084507044, "grad_norm": 0.2919873297214508, "learning_rate": 7.373263565171805e-07, "loss": 0.26768985390663147, "step": 654 }, { "epoch": 18.197183098591548, "grad_norm": 0.2856583893299103, "learning_rate": 7.366119591953075e-07, "loss": 0.2823103070259094, "step": 655 }, { "epoch": 18.225352112676056, "grad_norm": 0.2853250801563263, "learning_rate": 7.358969934210438e-07, "loss": 0.28462791442871094, "step": 656 }, { "epoch": 18.253521126760564, "grad_norm": 0.27667704224586487, "learning_rate": 7.35181461372299e-07, "loss": 0.27125126123428345, "step": 657 }, { "epoch": 18.281690140845072, "grad_norm": 0.2884734272956848, "learning_rate": 7.344653652287077e-07, "loss": 0.271454781293869, "step": 658 }, { "epoch": 18.309859154929576, "grad_norm": 0.28490886092185974, "learning_rate": 7.337487071716232e-07, "loss": 0.286302775144577, "step": 659 }, { "epoch": 18.338028169014084, "grad_norm": 0.27361124753952026, "learning_rate": 7.330314893841101e-07, "loss": 0.2801797389984131, "step": 660 }, { "epoch": 18.366197183098592, "grad_norm": 0.28517088294029236, "learning_rate": 7.323137140509381e-07, "loss": 0.2785356640815735, "step": 661 }, { "epoch": 18.3943661971831, "grad_norm": 0.2725742757320404, "learning_rate": 7.315953833585755e-07, "loss": 0.27504605054855347, "step": 662 }, { "epoch": 18.422535211267604, "grad_norm": 0.29915499687194824, "learning_rate": 7.308764994951821e-07, "loss": 0.2808704078197479, "step": 663 }, { "epoch": 18.450704225352112, "grad_norm": 0.31304341554641724, "learning_rate": 7.301570646506027e-07, "loss": 0.2911706566810608, "step": 664 }, { "epoch": 18.47887323943662, "grad_norm": 0.2919553816318512, "learning_rate": 7.294370810163607e-07, "loss": 0.27866852283477783, "step": 665 }, { "epoch": 18.507042253521128, "grad_norm": 0.3162909746170044, "learning_rate": 7.287165507856512e-07, "loss": 0.2802932560443878, "step": 666 }, { "epoch": 18.535211267605632, "grad_norm": 0.303523451089859, "learning_rate": 7.279954761533342e-07, "loss": 0.2824591398239136, "step": 667 }, { "epoch": 18.56338028169014, "grad_norm": 0.29366716742515564, "learning_rate": 7.27273859315928e-07, "loss": 0.28101497888565063, "step": 668 }, { "epoch": 18.591549295774648, "grad_norm": 0.28469985723495483, "learning_rate": 7.265517024716026e-07, "loss": 0.29134345054626465, "step": 669 }, { "epoch": 18.619718309859156, "grad_norm": 0.28721922636032104, "learning_rate": 7.258290078201731e-07, "loss": 0.284817636013031, "step": 670 }, { "epoch": 18.647887323943664, "grad_norm": 0.30535197257995605, "learning_rate": 7.251057775630927e-07, "loss": 0.28168779611587524, "step": 671 }, { "epoch": 18.676056338028168, "grad_norm": 0.2980702817440033, "learning_rate": 7.243820139034464e-07, "loss": 0.27493056654930115, "step": 672 }, { "epoch": 18.704225352112676, "grad_norm": 0.28984636068344116, "learning_rate": 7.236577190459433e-07, "loss": 0.2975635528564453, "step": 673 }, { "epoch": 18.732394366197184, "grad_norm": 0.29580390453338623, "learning_rate": 7.229328951969115e-07, "loss": 0.2849118113517761, "step": 674 }, { "epoch": 18.760563380281692, "grad_norm": 0.2950834035873413, "learning_rate": 7.222075445642904e-07, "loss": 0.26458609104156494, "step": 675 }, { "epoch": 18.788732394366196, "grad_norm": 0.29167890548706055, "learning_rate": 7.214816693576234e-07, "loss": 0.2846098840236664, "step": 676 }, { "epoch": 18.816901408450704, "grad_norm": 0.2784614861011505, "learning_rate": 7.207552717880522e-07, "loss": 0.28443169593811035, "step": 677 }, { "epoch": 18.845070422535212, "grad_norm": 0.29537051916122437, "learning_rate": 7.200283540683102e-07, "loss": 0.27960023283958435, "step": 678 }, { "epoch": 18.87323943661972, "grad_norm": 0.2873672544956207, "learning_rate": 7.193009184127145e-07, "loss": 0.28757309913635254, "step": 679 }, { "epoch": 18.901408450704224, "grad_norm": 0.28597328066825867, "learning_rate": 7.185729670371604e-07, "loss": 0.2904655635356903, "step": 680 }, { "epoch": 18.929577464788732, "grad_norm": 0.29267045855522156, "learning_rate": 7.17844502159114e-07, "loss": 0.2797931432723999, "step": 681 }, { "epoch": 18.95774647887324, "grad_norm": 0.27707934379577637, "learning_rate": 7.171155259976057e-07, "loss": 0.2788022458553314, "step": 682 }, { "epoch": 18.985915492957748, "grad_norm": 0.2854091227054596, "learning_rate": 7.163860407732231e-07, "loss": 0.28216353058815, "step": 683 }, { "epoch": 19.0, "grad_norm": 0.4010404348373413, "learning_rate": 7.156560487081051e-07, "loss": 0.2831748127937317, "step": 684 }, { "epoch": 19.028169014084508, "grad_norm": 0.2948407232761383, "learning_rate": 7.149255520259338e-07, "loss": 0.26844292879104614, "step": 685 }, { "epoch": 19.056338028169016, "grad_norm": 0.2946661114692688, "learning_rate": 7.141945529519288e-07, "loss": 0.2809017300605774, "step": 686 }, { "epoch": 19.08450704225352, "grad_norm": 0.27715936303138733, "learning_rate": 7.134630537128403e-07, "loss": 0.2835448980331421, "step": 687 }, { "epoch": 19.112676056338028, "grad_norm": 0.2933226525783539, "learning_rate": 7.127310565369415e-07, "loss": 0.2795133888721466, "step": 688 }, { "epoch": 19.140845070422536, "grad_norm": 0.28180861473083496, "learning_rate": 7.11998563654023e-07, "loss": 0.2750745713710785, "step": 689 }, { "epoch": 19.169014084507044, "grad_norm": 0.2755012810230255, "learning_rate": 7.11265577295385e-07, "loss": 0.281097412109375, "step": 690 }, { "epoch": 19.197183098591548, "grad_norm": 0.2865377962589264, "learning_rate": 7.105320996938314e-07, "loss": 0.2677628993988037, "step": 691 }, { "epoch": 19.225352112676056, "grad_norm": 0.2958216369152069, "learning_rate": 7.097981330836616e-07, "loss": 0.2733122408390045, "step": 692 }, { "epoch": 19.253521126760564, "grad_norm": 0.2982434034347534, "learning_rate": 7.090636797006657e-07, "loss": 0.2764785885810852, "step": 693 }, { "epoch": 19.281690140845072, "grad_norm": 0.31210824847221375, "learning_rate": 7.083287417821157e-07, "loss": 0.27116531133651733, "step": 694 }, { "epoch": 19.309859154929576, "grad_norm": 0.29045426845550537, "learning_rate": 7.075933215667604e-07, "loss": 0.2775840163230896, "step": 695 }, { "epoch": 19.338028169014084, "grad_norm": 0.29685893654823303, "learning_rate": 7.068574212948169e-07, "loss": 0.2803945541381836, "step": 696 }, { "epoch": 19.366197183098592, "grad_norm": 0.2790866494178772, "learning_rate": 7.06121043207965e-07, "loss": 0.2769659161567688, "step": 697 }, { "epoch": 19.3943661971831, "grad_norm": 0.31644630432128906, "learning_rate": 7.053841895493406e-07, "loss": 0.27923786640167236, "step": 698 }, { "epoch": 19.422535211267604, "grad_norm": 0.30641067028045654, "learning_rate": 7.046468625635274e-07, "loss": 0.2825276255607605, "step": 699 }, { "epoch": 19.450704225352112, "grad_norm": 0.292458713054657, "learning_rate": 7.039090644965509e-07, "loss": 0.27422571182250977, "step": 700 }, { "epoch": 19.47887323943662, "grad_norm": 0.2903311550617218, "learning_rate": 7.031707975958726e-07, "loss": 0.27189522981643677, "step": 701 }, { "epoch": 19.507042253521128, "grad_norm": 0.2947315275669098, "learning_rate": 7.024320641103811e-07, "loss": 0.2683555483818054, "step": 702 }, { "epoch": 19.535211267605632, "grad_norm": 0.29522547125816345, "learning_rate": 7.01692866290387e-07, "loss": 0.28815943002700806, "step": 703 }, { "epoch": 19.56338028169014, "grad_norm": 0.28272008895874023, "learning_rate": 7.009532063876148e-07, "loss": 0.2853075861930847, "step": 704 }, { "epoch": 19.591549295774648, "grad_norm": 0.286604642868042, "learning_rate": 7.002130866551968e-07, "loss": 0.2744004726409912, "step": 705 }, { "epoch": 19.619718309859156, "grad_norm": 0.2829611301422119, "learning_rate": 6.994725093476664e-07, "loss": 0.2899395525455475, "step": 706 }, { "epoch": 19.647887323943664, "grad_norm": 0.3035781681537628, "learning_rate": 6.987314767209503e-07, "loss": 0.29819610714912415, "step": 707 }, { "epoch": 19.676056338028168, "grad_norm": 0.30463680624961853, "learning_rate": 6.979899910323624e-07, "loss": 0.2818058729171753, "step": 708 }, { "epoch": 19.704225352112676, "grad_norm": 0.29514482617378235, "learning_rate": 6.972480545405968e-07, "loss": 0.294766366481781, "step": 709 }, { "epoch": 19.732394366197184, "grad_norm": 0.282625675201416, "learning_rate": 6.965056695057204e-07, "loss": 0.27316591143608093, "step": 710 }, { "epoch": 19.760563380281692, "grad_norm": 0.3090338110923767, "learning_rate": 6.957628381891673e-07, "loss": 0.2785091698169708, "step": 711 }, { "epoch": 19.788732394366196, "grad_norm": 0.2826164960861206, "learning_rate": 6.950195628537299e-07, "loss": 0.2870754301548004, "step": 712 }, { "epoch": 19.816901408450704, "grad_norm": 0.29807525873184204, "learning_rate": 6.942758457635543e-07, "loss": 0.27232879400253296, "step": 713 }, { "epoch": 19.845070422535212, "grad_norm": 0.2901877760887146, "learning_rate": 6.935316891841315e-07, "loss": 0.2786208987236023, "step": 714 }, { "epoch": 19.87323943661972, "grad_norm": 0.2947152853012085, "learning_rate": 6.927870953822915e-07, "loss": 0.2676268517971039, "step": 715 }, { "epoch": 19.901408450704224, "grad_norm": 0.30847856402397156, "learning_rate": 6.920420666261961e-07, "loss": 0.27726125717163086, "step": 716 }, { "epoch": 19.929577464788732, "grad_norm": 0.29455119371414185, "learning_rate": 6.912966051853322e-07, "loss": 0.28886911273002625, "step": 717 }, { "epoch": 19.95774647887324, "grad_norm": 0.2961712181568146, "learning_rate": 6.905507133305047e-07, "loss": 0.2736320495605469, "step": 718 }, { "epoch": 19.985915492957748, "grad_norm": 0.2923624515533447, "learning_rate": 6.898043933338293e-07, "loss": 0.2720155119895935, "step": 719 }, { "epoch": 20.0, "grad_norm": 0.40786370635032654, "learning_rate": 6.890576474687263e-07, "loss": 0.3052176237106323, "step": 720 }, { "epoch": 20.028169014084508, "grad_norm": 0.281310498714447, "learning_rate": 6.883104780099133e-07, "loss": 0.2827909588813782, "step": 721 }, { "epoch": 20.056338028169016, "grad_norm": 0.28428319096565247, "learning_rate": 6.875628872333975e-07, "loss": 0.2593810558319092, "step": 722 }, { "epoch": 20.08450704225352, "grad_norm": 0.28026291728019714, "learning_rate": 6.868148774164706e-07, "loss": 0.2783263027667999, "step": 723 }, { "epoch": 20.112676056338028, "grad_norm": 0.2842010259628296, "learning_rate": 6.860664508377001e-07, "loss": 0.2809029221534729, "step": 724 }, { "epoch": 20.140845070422536, "grad_norm": 0.2880638539791107, "learning_rate": 6.853176097769228e-07, "loss": 0.26888588070869446, "step": 725 }, { "epoch": 20.169014084507044, "grad_norm": 0.28630784153938293, "learning_rate": 6.84568356515239e-07, "loss": 0.2781735062599182, "step": 726 }, { "epoch": 20.197183098591548, "grad_norm": 0.30342307686805725, "learning_rate": 6.838186933350036e-07, "loss": 0.27911239862442017, "step": 727 }, { "epoch": 20.225352112676056, "grad_norm": 0.29965290427207947, "learning_rate": 6.83068622519821e-07, "loss": 0.2759650945663452, "step": 728 }, { "epoch": 20.253521126760564, "grad_norm": 0.2921484708786011, "learning_rate": 6.823181463545366e-07, "loss": 0.26791465282440186, "step": 729 }, { "epoch": 20.281690140845072, "grad_norm": 0.29477155208587646, "learning_rate": 6.815672671252315e-07, "loss": 0.27440106868743896, "step": 730 }, { "epoch": 20.309859154929576, "grad_norm": 0.2930176854133606, "learning_rate": 6.808159871192136e-07, "loss": 0.28788119554519653, "step": 731 }, { "epoch": 20.338028169014084, "grad_norm": 0.304382860660553, "learning_rate": 6.800643086250121e-07, "loss": 0.2717517614364624, "step": 732 }, { "epoch": 20.366197183098592, "grad_norm": 0.2945499122142792, "learning_rate": 6.793122339323705e-07, "loss": 0.29744279384613037, "step": 733 }, { "epoch": 20.3943661971831, "grad_norm": 0.2932227849960327, "learning_rate": 6.78559765332238e-07, "loss": 0.2782973051071167, "step": 734 }, { "epoch": 20.422535211267604, "grad_norm": 0.29432976245880127, "learning_rate": 6.778069051167653e-07, "loss": 0.28551533818244934, "step": 735 }, { "epoch": 20.450704225352112, "grad_norm": 0.30091312527656555, "learning_rate": 6.770536555792944e-07, "loss": 0.28610894083976746, "step": 736 }, { "epoch": 20.47887323943662, "grad_norm": 0.29813316464424133, "learning_rate": 6.763000190143545e-07, "loss": 0.28137102723121643, "step": 737 }, { "epoch": 20.507042253521128, "grad_norm": 0.28738856315612793, "learning_rate": 6.755459977176532e-07, "loss": 0.26876533031463623, "step": 738 }, { "epoch": 20.535211267605632, "grad_norm": 0.2894875407218933, "learning_rate": 6.747915939860701e-07, "loss": 0.2704589366912842, "step": 739 }, { "epoch": 20.56338028169014, "grad_norm": 0.3046717047691345, "learning_rate": 6.740368101176495e-07, "loss": 0.28678447008132935, "step": 740 }, { "epoch": 20.591549295774648, "grad_norm": 0.29942622780799866, "learning_rate": 6.732816484115946e-07, "loss": 0.27722471952438354, "step": 741 }, { "epoch": 20.619718309859156, "grad_norm": 0.2984582185745239, "learning_rate": 6.725261111682584e-07, "loss": 0.2638360261917114, "step": 742 }, { "epoch": 20.647887323943664, "grad_norm": 0.2943922281265259, "learning_rate": 6.717702006891386e-07, "loss": 0.286998450756073, "step": 743 }, { "epoch": 20.676056338028168, "grad_norm": 0.2971697747707367, "learning_rate": 6.710139192768694e-07, "loss": 0.2628033757209778, "step": 744 }, { "epoch": 20.704225352112676, "grad_norm": 0.2915992736816406, "learning_rate": 6.702572692352155e-07, "loss": 0.2789704203605652, "step": 745 }, { "epoch": 20.732394366197184, "grad_norm": 0.29871392250061035, "learning_rate": 6.695002528690639e-07, "loss": 0.2669401168823242, "step": 746 }, { "epoch": 20.760563380281692, "grad_norm": 0.29496580362319946, "learning_rate": 6.687428724844179e-07, "loss": 0.2711006999015808, "step": 747 }, { "epoch": 20.788732394366196, "grad_norm": 0.29237619042396545, "learning_rate": 6.679851303883891e-07, "loss": 0.2822151780128479, "step": 748 }, { "epoch": 20.816901408450704, "grad_norm": 0.29689720273017883, "learning_rate": 6.672270288891918e-07, "loss": 0.2751491665840149, "step": 749 }, { "epoch": 20.845070422535212, "grad_norm": 0.28889331221580505, "learning_rate": 6.664685702961344e-07, "loss": 0.2681749761104584, "step": 750 }, { "epoch": 20.87323943661972, "grad_norm": 0.2995631694793701, "learning_rate": 6.657097569196133e-07, "loss": 0.2793988287448883, "step": 751 }, { "epoch": 20.901408450704224, "grad_norm": 0.29980671405792236, "learning_rate": 6.649505910711058e-07, "loss": 0.27338624000549316, "step": 752 }, { "epoch": 20.929577464788732, "grad_norm": 0.29344668984413147, "learning_rate": 6.641910750631626e-07, "loss": 0.284781813621521, "step": 753 }, { "epoch": 20.95774647887324, "grad_norm": 0.29827746748924255, "learning_rate": 6.634312112094013e-07, "loss": 0.27890220284461975, "step": 754 }, { "epoch": 20.985915492957748, "grad_norm": 0.2813144326210022, "learning_rate": 6.626710018244987e-07, "loss": 0.2822881042957306, "step": 755 }, { "epoch": 21.0, "grad_norm": 0.3963703215122223, "learning_rate": 6.619104492241847e-07, "loss": 0.27128899097442627, "step": 756 }, { "epoch": 21.028169014084508, "grad_norm": 0.2815580666065216, "learning_rate": 6.611495557252344e-07, "loss": 0.26516419649124146, "step": 757 }, { "epoch": 21.056338028169016, "grad_norm": 0.2884436845779419, "learning_rate": 6.603883236454612e-07, "loss": 0.2861919701099396, "step": 758 }, { "epoch": 21.08450704225352, "grad_norm": 0.29655352234840393, "learning_rate": 6.596267553037102e-07, "loss": 0.28643375635147095, "step": 759 }, { "epoch": 21.112676056338028, "grad_norm": 0.2927301824092865, "learning_rate": 6.588648530198504e-07, "loss": 0.26665711402893066, "step": 760 }, { "epoch": 21.140845070422536, "grad_norm": 0.3053556978702545, "learning_rate": 6.581026191147687e-07, "loss": 0.2608697712421417, "step": 761 }, { "epoch": 21.169014084507044, "grad_norm": 0.2939828634262085, "learning_rate": 6.573400559103613e-07, "loss": 0.2792375683784485, "step": 762 }, { "epoch": 21.197183098591548, "grad_norm": 0.2972046136856079, "learning_rate": 6.565771657295285e-07, "loss": 0.28457099199295044, "step": 763 }, { "epoch": 21.225352112676056, "grad_norm": 0.2918429672718048, "learning_rate": 6.558139508961654e-07, "loss": 0.2648508548736572, "step": 764 }, { "epoch": 21.253521126760564, "grad_norm": 0.28380143642425537, "learning_rate": 6.550504137351575e-07, "loss": 0.27792784571647644, "step": 765 }, { "epoch": 21.281690140845072, "grad_norm": 0.3151639997959137, "learning_rate": 6.542865565723707e-07, "loss": 0.2657250165939331, "step": 766 }, { "epoch": 21.309859154929576, "grad_norm": 0.2861776351928711, "learning_rate": 6.53522381734647e-07, "loss": 0.27351340651512146, "step": 767 }, { "epoch": 21.338028169014084, "grad_norm": 0.28596001863479614, "learning_rate": 6.527578915497951e-07, "loss": 0.28022241592407227, "step": 768 }, { "epoch": 21.366197183098592, "grad_norm": 0.29702675342559814, "learning_rate": 6.519930883465847e-07, "loss": 0.2644035518169403, "step": 769 }, { "epoch": 21.3943661971831, "grad_norm": 0.2863904535770416, "learning_rate": 6.512279744547392e-07, "loss": 0.2721293568611145, "step": 770 }, { "epoch": 21.422535211267604, "grad_norm": 0.311262845993042, "learning_rate": 6.50462552204928e-07, "loss": 0.2911388874053955, "step": 771 }, { "epoch": 21.450704225352112, "grad_norm": 0.3132490813732147, "learning_rate": 6.496968239287603e-07, "loss": 0.27957841753959656, "step": 772 }, { "epoch": 21.47887323943662, "grad_norm": 0.29439255595207214, "learning_rate": 6.489307919587769e-07, "loss": 0.28288164734840393, "step": 773 }, { "epoch": 21.507042253521128, "grad_norm": 0.3006008267402649, "learning_rate": 6.481644586284442e-07, "loss": 0.26865097880363464, "step": 774 }, { "epoch": 21.535211267605632, "grad_norm": 0.28934645652770996, "learning_rate": 6.473978262721463e-07, "loss": 0.28625524044036865, "step": 775 }, { "epoch": 21.56338028169014, "grad_norm": 0.28962355852127075, "learning_rate": 6.466308972251785e-07, "loss": 0.2737366855144501, "step": 776 }, { "epoch": 21.591549295774648, "grad_norm": 0.29193779826164246, "learning_rate": 6.458636738237395e-07, "loss": 0.2644401788711548, "step": 777 }, { "epoch": 21.619718309859156, "grad_norm": 0.31439822912216187, "learning_rate": 6.45096158404925e-07, "loss": 0.2638384699821472, "step": 778 }, { "epoch": 21.647887323943664, "grad_norm": 0.2855563163757324, "learning_rate": 6.443283533067198e-07, "loss": 0.2697969079017639, "step": 779 }, { "epoch": 21.676056338028168, "grad_norm": 0.2941296398639679, "learning_rate": 6.435602608679916e-07, "loss": 0.27152666449546814, "step": 780 }, { "epoch": 21.704225352112676, "grad_norm": 0.2861116826534271, "learning_rate": 6.427918834284834e-07, "loss": 0.2749404013156891, "step": 781 }, { "epoch": 21.732394366197184, "grad_norm": 0.30467715859413147, "learning_rate": 6.420232233288055e-07, "loss": 0.28106456995010376, "step": 782 }, { "epoch": 21.760563380281692, "grad_norm": 0.2885453402996063, "learning_rate": 6.412542829104306e-07, "loss": 0.2661711275577545, "step": 783 }, { "epoch": 21.788732394366196, "grad_norm": 0.30243006348609924, "learning_rate": 6.404850645156841e-07, "loss": 0.28171294927597046, "step": 784 }, { "epoch": 21.816901408450704, "grad_norm": 0.29606276750564575, "learning_rate": 6.397155704877388e-07, "loss": 0.2737141251564026, "step": 785 }, { "epoch": 21.845070422535212, "grad_norm": 0.30514174699783325, "learning_rate": 6.389458031706068e-07, "loss": 0.2778671979904175, "step": 786 }, { "epoch": 21.87323943661972, "grad_norm": 0.29419445991516113, "learning_rate": 6.381757649091329e-07, "loss": 0.27829116582870483, "step": 787 }, { "epoch": 21.901408450704224, "grad_norm": 0.30376535654067993, "learning_rate": 6.374054580489873e-07, "loss": 0.26818743348121643, "step": 788 }, { "epoch": 21.929577464788732, "grad_norm": 0.29063352942466736, "learning_rate": 6.366348849366583e-07, "loss": 0.28016185760498047, "step": 789 }, { "epoch": 21.95774647887324, "grad_norm": 0.29429173469543457, "learning_rate": 6.358640479194451e-07, "loss": 0.27824854850769043, "step": 790 }, { "epoch": 21.985915492957748, "grad_norm": 0.28934815526008606, "learning_rate": 6.35092949345451e-07, "loss": 0.2743881344795227, "step": 791 }, { "epoch": 22.0, "grad_norm": 0.41559702157974243, "learning_rate": 6.343215915635761e-07, "loss": 0.2856147289276123, "step": 792 }, { "epoch": 22.028169014084508, "grad_norm": 0.29498717188835144, "learning_rate": 6.335499769235098e-07, "loss": 0.2729465961456299, "step": 793 }, { "epoch": 22.056338028169016, "grad_norm": 0.30124449729919434, "learning_rate": 6.327781077757241e-07, "loss": 0.2874697744846344, "step": 794 }, { "epoch": 22.08450704225352, "grad_norm": 0.3204105794429779, "learning_rate": 6.320059864714664e-07, "loss": 0.2923066020011902, "step": 795 }, { "epoch": 22.112676056338028, "grad_norm": 0.2912622392177582, "learning_rate": 6.31233615362752e-07, "loss": 0.2808852791786194, "step": 796 }, { "epoch": 22.140845070422536, "grad_norm": 0.30250096321105957, "learning_rate": 6.304609968023572e-07, "loss": 0.27111589908599854, "step": 797 }, { "epoch": 22.169014084507044, "grad_norm": 0.3024645447731018, "learning_rate": 6.296881331438126e-07, "loss": 0.2812804877758026, "step": 798 }, { "epoch": 22.197183098591548, "grad_norm": 0.29673656821250916, "learning_rate": 6.289150267413942e-07, "loss": 0.2681958079338074, "step": 799 }, { "epoch": 22.225352112676056, "grad_norm": 0.29564592242240906, "learning_rate": 6.281416799501187e-07, "loss": 0.26508989930152893, "step": 800 }, { "epoch": 22.253521126760564, "grad_norm": 0.2849496603012085, "learning_rate": 6.273680951257342e-07, "loss": 0.27044007182121277, "step": 801 }, { "epoch": 22.281690140845072, "grad_norm": 0.30459970235824585, "learning_rate": 6.265942746247146e-07, "loss": 0.26503556966781616, "step": 802 }, { "epoch": 22.309859154929576, "grad_norm": 0.29415223002433777, "learning_rate": 6.258202208042511e-07, "loss": 0.26770085096359253, "step": 803 }, { "epoch": 22.338028169014084, "grad_norm": 0.3101199269294739, "learning_rate": 6.25045936022246e-07, "loss": 0.26633113622665405, "step": 804 }, { "epoch": 22.366197183098592, "grad_norm": 0.28551825881004333, "learning_rate": 6.242714226373049e-07, "loss": 0.2745598256587982, "step": 805 }, { "epoch": 22.3943661971831, "grad_norm": 0.30341607332229614, "learning_rate": 6.2349668300873e-07, "loss": 0.2879912853240967, "step": 806 }, { "epoch": 22.422535211267604, "grad_norm": 0.33077767491340637, "learning_rate": 6.227217194965125e-07, "loss": 0.28035950660705566, "step": 807 }, { "epoch": 22.450704225352112, "grad_norm": 0.305733859539032, "learning_rate": 6.219465344613258e-07, "loss": 0.2842296361923218, "step": 808 }, { "epoch": 22.47887323943662, "grad_norm": 0.2931113839149475, "learning_rate": 6.211711302645177e-07, "loss": 0.2730957865715027, "step": 809 }, { "epoch": 22.507042253521128, "grad_norm": 0.2949962913990021, "learning_rate": 6.203955092681039e-07, "loss": 0.281680166721344, "step": 810 }, { "epoch": 22.535211267605632, "grad_norm": 0.30062124133110046, "learning_rate": 6.196196738347607e-07, "loss": 0.2771790027618408, "step": 811 }, { "epoch": 22.56338028169014, "grad_norm": 0.29685312509536743, "learning_rate": 6.188436263278172e-07, "loss": 0.27885377407073975, "step": 812 }, { "epoch": 22.591549295774648, "grad_norm": 0.30217039585113525, "learning_rate": 6.180673691112486e-07, "loss": 0.2664039433002472, "step": 813 }, { "epoch": 22.619718309859156, "grad_norm": 0.2935945987701416, "learning_rate": 6.172909045496694e-07, "loss": 0.266349196434021, "step": 814 }, { "epoch": 22.647887323943664, "grad_norm": 0.31217825412750244, "learning_rate": 6.165142350083249e-07, "loss": 0.2723742127418518, "step": 815 }, { "epoch": 22.676056338028168, "grad_norm": 0.2960183918476105, "learning_rate": 6.157373628530852e-07, "loss": 0.272281289100647, "step": 816 }, { "epoch": 22.704225352112676, "grad_norm": 0.2914189100265503, "learning_rate": 6.149602904504378e-07, "loss": 0.26770728826522827, "step": 817 }, { "epoch": 22.732394366197184, "grad_norm": 0.2774648368358612, "learning_rate": 6.141830201674802e-07, "loss": 0.2694011330604553, "step": 818 }, { "epoch": 22.760563380281692, "grad_norm": 0.29001736640930176, "learning_rate": 6.134055543719121e-07, "loss": 0.2670798897743225, "step": 819 }, { "epoch": 22.788732394366196, "grad_norm": 0.31117716431617737, "learning_rate": 6.126278954320294e-07, "loss": 0.26127567887306213, "step": 820 }, { "epoch": 22.816901408450704, "grad_norm": 0.29720577597618103, "learning_rate": 6.118500457167159e-07, "loss": 0.27497297525405884, "step": 821 }, { "epoch": 22.845070422535212, "grad_norm": 0.3057437241077423, "learning_rate": 6.11072007595437e-07, "loss": 0.27363038063049316, "step": 822 }, { "epoch": 22.87323943661972, "grad_norm": 0.323045939207077, "learning_rate": 6.102937834382315e-07, "loss": 0.27130627632141113, "step": 823 }, { "epoch": 22.901408450704224, "grad_norm": 0.28948745131492615, "learning_rate": 6.095153756157051e-07, "loss": 0.26591163873672485, "step": 824 }, { "epoch": 22.929577464788732, "grad_norm": 0.27952200174331665, "learning_rate": 6.087367864990232e-07, "loss": 0.266745388507843, "step": 825 }, { "epoch": 22.95774647887324, "grad_norm": 0.30804452300071716, "learning_rate": 6.079580184599032e-07, "loss": 0.2794422507286072, "step": 826 }, { "epoch": 22.985915492957748, "grad_norm": 0.3002220392227173, "learning_rate": 6.071790738706078e-07, "loss": 0.26469242572784424, "step": 827 }, { "epoch": 23.0, "grad_norm": 0.4127134084701538, "learning_rate": 6.06399955103937e-07, "loss": 0.2482779324054718, "step": 828 }, { "epoch": 23.028169014084508, "grad_norm": 0.30051475763320923, "learning_rate": 6.056206645332217e-07, "loss": 0.26631736755371094, "step": 829 }, { "epoch": 23.056338028169016, "grad_norm": 0.3008311688899994, "learning_rate": 6.048412045323164e-07, "loss": 0.27459877729415894, "step": 830 }, { "epoch": 23.08450704225352, "grad_norm": 0.28853461146354675, "learning_rate": 6.040615774755911e-07, "loss": 0.26959413290023804, "step": 831 }, { "epoch": 23.112676056338028, "grad_norm": 0.29199543595314026, "learning_rate": 6.032817857379256e-07, "loss": 0.2588391900062561, "step": 832 }, { "epoch": 23.140845070422536, "grad_norm": 0.29191362857818604, "learning_rate": 6.025018316946999e-07, "loss": 0.27447617053985596, "step": 833 }, { "epoch": 23.169014084507044, "grad_norm": 0.29501983523368835, "learning_rate": 6.017217177217899e-07, "loss": 0.26884716749191284, "step": 834 }, { "epoch": 23.197183098591548, "grad_norm": 0.3098088502883911, "learning_rate": 6.009414461955581e-07, "loss": 0.28516972064971924, "step": 835 }, { "epoch": 23.225352112676056, "grad_norm": 0.3027796149253845, "learning_rate": 6.001610194928464e-07, "loss": 0.2739514112472534, "step": 836 }, { "epoch": 23.253521126760564, "grad_norm": 0.31156665086746216, "learning_rate": 5.993804399909703e-07, "loss": 0.26852983236312866, "step": 837 }, { "epoch": 23.281690140845072, "grad_norm": 0.2958903908729553, "learning_rate": 5.985997100677103e-07, "loss": 0.2743365168571472, "step": 838 }, { "epoch": 23.309859154929576, "grad_norm": 0.31140410900115967, "learning_rate": 5.97818832101305e-07, "loss": 0.27525418996810913, "step": 839 }, { "epoch": 23.338028169014084, "grad_norm": 0.3082049787044525, "learning_rate": 5.97037808470444e-07, "loss": 0.27074384689331055, "step": 840 }, { "epoch": 23.366197183098592, "grad_norm": 0.2950114905834198, "learning_rate": 5.96256641554261e-07, "loss": 0.26068389415740967, "step": 841 }, { "epoch": 23.3943661971831, "grad_norm": 0.31746307015419006, "learning_rate": 5.954753337323259e-07, "loss": 0.2648658752441406, "step": 842 }, { "epoch": 23.422535211267604, "grad_norm": 0.2906374931335449, "learning_rate": 5.946938873846375e-07, "loss": 0.29040125012397766, "step": 843 }, { "epoch": 23.450704225352112, "grad_norm": 0.3055919408798218, "learning_rate": 5.939123048916173e-07, "loss": 0.2694965600967407, "step": 844 }, { "epoch": 23.47887323943662, "grad_norm": 0.3007211983203888, "learning_rate": 5.931305886341008e-07, "loss": 0.25987839698791504, "step": 845 }, { "epoch": 23.507042253521128, "grad_norm": 0.3042035400867462, "learning_rate": 5.923487409933315e-07, "loss": 0.26484209299087524, "step": 846 }, { "epoch": 23.535211267605632, "grad_norm": 0.30741506814956665, "learning_rate": 5.915667643509528e-07, "loss": 0.2735103368759155, "step": 847 }, { "epoch": 23.56338028169014, "grad_norm": 0.30859899520874023, "learning_rate": 5.907846610890011e-07, "loss": 0.27706003189086914, "step": 848 }, { "epoch": 23.591549295774648, "grad_norm": 0.29999226331710815, "learning_rate": 5.900024335898987e-07, "loss": 0.2733941674232483, "step": 849 }, { "epoch": 23.619718309859156, "grad_norm": 0.3084903955459595, "learning_rate": 5.892200842364462e-07, "loss": 0.282131165266037, "step": 850 }, { "epoch": 23.647887323943664, "grad_norm": 0.29400384426116943, "learning_rate": 5.884376154118154e-07, "loss": 0.26756390929222107, "step": 851 }, { "epoch": 23.676056338028168, "grad_norm": 0.31666234135627747, "learning_rate": 5.87655029499542e-07, "loss": 0.2766130268573761, "step": 852 }, { "epoch": 23.704225352112676, "grad_norm": 0.30233001708984375, "learning_rate": 5.868723288835184e-07, "loss": 0.2544291019439697, "step": 853 }, { "epoch": 23.732394366197184, "grad_norm": 0.2888985276222229, "learning_rate": 5.860895159479864e-07, "loss": 0.272182822227478, "step": 854 }, { "epoch": 23.760563380281692, "grad_norm": 0.29870662093162537, "learning_rate": 5.853065930775303e-07, "loss": 0.2798278331756592, "step": 855 }, { "epoch": 23.788732394366196, "grad_norm": 0.307162344455719, "learning_rate": 5.845235626570683e-07, "loss": 0.2772548794746399, "step": 856 }, { "epoch": 23.816901408450704, "grad_norm": 0.290558785200119, "learning_rate": 5.837404270718475e-07, "loss": 0.2746056020259857, "step": 857 }, { "epoch": 23.845070422535212, "grad_norm": 0.30080270767211914, "learning_rate": 5.829571887074343e-07, "loss": 0.2648829519748688, "step": 858 }, { "epoch": 23.87323943661972, "grad_norm": 0.3067336678504944, "learning_rate": 5.821738499497086e-07, "loss": 0.2871520519256592, "step": 859 }, { "epoch": 23.901408450704224, "grad_norm": 0.29975709319114685, "learning_rate": 5.813904131848564e-07, "loss": 0.26279598474502563, "step": 860 }, { "epoch": 23.929577464788732, "grad_norm": 0.3006797730922699, "learning_rate": 5.806068807993617e-07, "loss": 0.2586716115474701, "step": 861 }, { "epoch": 23.95774647887324, "grad_norm": 0.31139636039733887, "learning_rate": 5.798232551800002e-07, "loss": 0.26469486951828003, "step": 862 }, { "epoch": 23.985915492957748, "grad_norm": 0.295448899269104, "learning_rate": 5.790395387138311e-07, "loss": 0.27641937136650085, "step": 863 }, { "epoch": 24.0, "grad_norm": 0.41943204402923584, "learning_rate": 5.78255733788191e-07, "loss": 0.2656780779361725, "step": 864 }, { "epoch": 24.028169014084508, "grad_norm": 0.2978457808494568, "learning_rate": 5.774718427906856e-07, "loss": 0.27108752727508545, "step": 865 }, { "epoch": 24.056338028169016, "grad_norm": 0.2980673015117645, "learning_rate": 5.766878681091828e-07, "loss": 0.27321118116378784, "step": 866 }, { "epoch": 24.08450704225352, "grad_norm": 0.30751070380210876, "learning_rate": 5.759038121318052e-07, "loss": 0.26482248306274414, "step": 867 }, { "epoch": 24.112676056338028, "grad_norm": 0.2982223629951477, "learning_rate": 5.751196772469237e-07, "loss": 0.2737855315208435, "step": 868 }, { "epoch": 24.140845070422536, "grad_norm": 0.2943744361400604, "learning_rate": 5.743354658431489e-07, "loss": 0.27646419405937195, "step": 869 }, { "epoch": 24.169014084507044, "grad_norm": 0.2863228917121887, "learning_rate": 5.735511803093248e-07, "loss": 0.2726101279258728, "step": 870 }, { "epoch": 24.197183098591548, "grad_norm": 0.2973101735115051, "learning_rate": 5.727668230345209e-07, "loss": 0.2601590156555176, "step": 871 }, { "epoch": 24.225352112676056, "grad_norm": 0.3052431344985962, "learning_rate": 5.71982396408026e-07, "loss": 0.27889275550842285, "step": 872 }, { "epoch": 24.253521126760564, "grad_norm": 0.3076930046081543, "learning_rate": 5.711979028193391e-07, "loss": 0.2612301707267761, "step": 873 }, { "epoch": 24.281690140845072, "grad_norm": 0.2986485958099365, "learning_rate": 5.704133446581642e-07, "loss": 0.27018094062805176, "step": 874 }, { "epoch": 24.309859154929576, "grad_norm": 0.3108276426792145, "learning_rate": 5.696287243144012e-07, "loss": 0.27102935314178467, "step": 875 }, { "epoch": 24.338028169014084, "grad_norm": 0.30193671584129333, "learning_rate": 5.688440441781398e-07, "loss": 0.2653925120830536, "step": 876 }, { "epoch": 24.366197183098592, "grad_norm": 0.3071465492248535, "learning_rate": 5.680593066396518e-07, "loss": 0.2752073109149933, "step": 877 }, { "epoch": 24.3943661971831, "grad_norm": 0.31397056579589844, "learning_rate": 5.672745140893839e-07, "loss": 0.2662411332130432, "step": 878 }, { "epoch": 24.422535211267604, "grad_norm": 0.2991463243961334, "learning_rate": 5.664896689179504e-07, "loss": 0.24169263243675232, "step": 879 }, { "epoch": 24.450704225352112, "grad_norm": 0.3123292028903961, "learning_rate": 5.657047735161255e-07, "loss": 0.27330368757247925, "step": 880 }, { "epoch": 24.47887323943662, "grad_norm": 0.3062734305858612, "learning_rate": 5.649198302748368e-07, "loss": 0.26652461290359497, "step": 881 }, { "epoch": 24.507042253521128, "grad_norm": 0.2875562906265259, "learning_rate": 5.641348415851577e-07, "loss": 0.2717418670654297, "step": 882 }, { "epoch": 24.535211267605632, "grad_norm": 0.30724218487739563, "learning_rate": 5.633498098382998e-07, "loss": 0.2761197090148926, "step": 883 }, { "epoch": 24.56338028169014, "grad_norm": 0.30381572246551514, "learning_rate": 5.625647374256061e-07, "loss": 0.2838340997695923, "step": 884 }, { "epoch": 24.591549295774648, "grad_norm": 0.30817776918411255, "learning_rate": 5.617796267385429e-07, "loss": 0.26739388704299927, "step": 885 }, { "epoch": 24.619718309859156, "grad_norm": 0.31107473373413086, "learning_rate": 5.60994480168694e-07, "loss": 0.27139878273010254, "step": 886 }, { "epoch": 24.647887323943664, "grad_norm": 0.29710572957992554, "learning_rate": 5.602093001077517e-07, "loss": 0.26788806915283203, "step": 887 }, { "epoch": 24.676056338028168, "grad_norm": 0.31037789583206177, "learning_rate": 5.594240889475106e-07, "loss": 0.2767243981361389, "step": 888 }, { "epoch": 24.704225352112676, "grad_norm": 0.30905231833457947, "learning_rate": 5.586388490798604e-07, "loss": 0.2679288685321808, "step": 889 }, { "epoch": 24.732394366197184, "grad_norm": 0.30612513422966003, "learning_rate": 5.578535828967777e-07, "loss": 0.2660091519355774, "step": 890 }, { "epoch": 24.760563380281692, "grad_norm": 0.29661476612091064, "learning_rate": 5.570682927903193e-07, "loss": 0.27202385663986206, "step": 891 }, { "epoch": 24.788732394366196, "grad_norm": 0.31154492497444153, "learning_rate": 5.562829811526154e-07, "loss": 0.26965251564979553, "step": 892 }, { "epoch": 24.816901408450704, "grad_norm": 0.29887905716896057, "learning_rate": 5.554976503758612e-07, "loss": 0.2663193345069885, "step": 893 }, { "epoch": 24.845070422535212, "grad_norm": 0.3046702444553375, "learning_rate": 5.547123028523106e-07, "loss": 0.26517826318740845, "step": 894 }, { "epoch": 24.87323943661972, "grad_norm": 0.29926952719688416, "learning_rate": 5.539269409742683e-07, "loss": 0.2689710855484009, "step": 895 }, { "epoch": 24.901408450704224, "grad_norm": 0.31607043743133545, "learning_rate": 5.531415671340826e-07, "loss": 0.2774956226348877, "step": 896 }, { "epoch": 24.929577464788732, "grad_norm": 0.313334584236145, "learning_rate": 5.523561837241387e-07, "loss": 0.2801990807056427, "step": 897 }, { "epoch": 24.95774647887324, "grad_norm": 0.3167824149131775, "learning_rate": 5.515707931368507e-07, "loss": 0.2556470036506653, "step": 898 }, { "epoch": 24.985915492957748, "grad_norm": 0.3055095970630646, "learning_rate": 5.507853977646543e-07, "loss": 0.2693515121936798, "step": 899 }, { "epoch": 25.0, "grad_norm": 0.41877350211143494, "learning_rate": 5.5e-07, "loss": 0.2642577588558197, "step": 900 }, { "epoch": 25.028169014084508, "grad_norm": 0.3000764548778534, "learning_rate": 5.492146022353459e-07, "loss": 0.2616558074951172, "step": 901 }, { "epoch": 25.056338028169016, "grad_norm": 0.30835723876953125, "learning_rate": 5.484292068631494e-07, "loss": 0.260206401348114, "step": 902 }, { "epoch": 25.08450704225352, "grad_norm": 0.30945923924446106, "learning_rate": 5.476438162758611e-07, "loss": 0.26666033267974854, "step": 903 }, { "epoch": 25.112676056338028, "grad_norm": 0.3131259083747864, "learning_rate": 5.468584328659172e-07, "loss": 0.2688153386116028, "step": 904 }, { "epoch": 25.140845070422536, "grad_norm": 0.31281140446662903, "learning_rate": 5.460730590257317e-07, "loss": 0.25907081365585327, "step": 905 }, { "epoch": 25.169014084507044, "grad_norm": 0.300714910030365, "learning_rate": 5.452876971476896e-07, "loss": 0.2585920989513397, "step": 906 }, { "epoch": 25.197183098591548, "grad_norm": 0.31137779355049133, "learning_rate": 5.445023496241388e-07, "loss": 0.2691946029663086, "step": 907 }, { "epoch": 25.225352112676056, "grad_norm": 0.31905803084373474, "learning_rate": 5.437170188473847e-07, "loss": 0.25889474153518677, "step": 908 }, { "epoch": 25.253521126760564, "grad_norm": 0.30952438712120056, "learning_rate": 5.429317072096807e-07, "loss": 0.26691755652427673, "step": 909 }, { "epoch": 25.281690140845072, "grad_norm": 0.3063667416572571, "learning_rate": 5.421464171032224e-07, "loss": 0.2661867141723633, "step": 910 }, { "epoch": 25.309859154929576, "grad_norm": 0.31403201818466187, "learning_rate": 5.413611509201396e-07, "loss": 0.26902246475219727, "step": 911 }, { "epoch": 25.338028169014084, "grad_norm": 0.3037600815296173, "learning_rate": 5.405759110524894e-07, "loss": 0.26004883646965027, "step": 912 }, { "epoch": 25.366197183098592, "grad_norm": 0.3116777837276459, "learning_rate": 5.397906998922483e-07, "loss": 0.27219873666763306, "step": 913 }, { "epoch": 25.3943661971831, "grad_norm": 0.2961476445198059, "learning_rate": 5.390055198313061e-07, "loss": 0.26753348112106323, "step": 914 }, { "epoch": 25.422535211267604, "grad_norm": 0.3180798888206482, "learning_rate": 5.382203732614571e-07, "loss": 0.2706093192100525, "step": 915 }, { "epoch": 25.450704225352112, "grad_norm": 0.2982124090194702, "learning_rate": 5.37435262574394e-07, "loss": 0.2601392865180969, "step": 916 }, { "epoch": 25.47887323943662, "grad_norm": 0.29854777455329895, "learning_rate": 5.366501901617001e-07, "loss": 0.2788724899291992, "step": 917 }, { "epoch": 25.507042253521128, "grad_norm": 0.30327802896499634, "learning_rate": 5.358651584148423e-07, "loss": 0.26465606689453125, "step": 918 }, { "epoch": 25.535211267605632, "grad_norm": 0.3136656582355499, "learning_rate": 5.350801697251633e-07, "loss": 0.2621968984603882, "step": 919 }, { "epoch": 25.56338028169014, "grad_norm": 0.3008262813091278, "learning_rate": 5.342952264838747e-07, "loss": 0.2775859236717224, "step": 920 }, { "epoch": 25.591549295774648, "grad_norm": 0.31797295808792114, "learning_rate": 5.335103310820496e-07, "loss": 0.2715638279914856, "step": 921 }, { "epoch": 25.619718309859156, "grad_norm": 0.3112519383430481, "learning_rate": 5.32725485910616e-07, "loss": 0.26941171288490295, "step": 922 }, { "epoch": 25.647887323943664, "grad_norm": 0.2887360453605652, "learning_rate": 5.319406933603482e-07, "loss": 0.26261216402053833, "step": 923 }, { "epoch": 25.676056338028168, "grad_norm": 0.3208933472633362, "learning_rate": 5.311559558218603e-07, "loss": 0.26436418294906616, "step": 924 }, { "epoch": 25.704225352112676, "grad_norm": 0.30341023206710815, "learning_rate": 5.303712756855988e-07, "loss": 0.2747180461883545, "step": 925 }, { "epoch": 25.732394366197184, "grad_norm": 0.31803277134895325, "learning_rate": 5.295866553418358e-07, "loss": 0.2771461606025696, "step": 926 }, { "epoch": 25.760563380281692, "grad_norm": 0.3123302459716797, "learning_rate": 5.288020971806608e-07, "loss": 0.26546305418014526, "step": 927 }, { "epoch": 25.788732394366196, "grad_norm": 0.3141644597053528, "learning_rate": 5.28017603591974e-07, "loss": 0.27546215057373047, "step": 928 }, { "epoch": 25.816901408450704, "grad_norm": 0.29840072989463806, "learning_rate": 5.27233176965479e-07, "loss": 0.25834715366363525, "step": 929 }, { "epoch": 25.845070422535212, "grad_norm": 0.3083305060863495, "learning_rate": 5.264488196906752e-07, "loss": 0.2746443748474121, "step": 930 }, { "epoch": 25.87323943661972, "grad_norm": 0.30847135186195374, "learning_rate": 5.256645341568511e-07, "loss": 0.2748471200466156, "step": 931 }, { "epoch": 25.901408450704224, "grad_norm": 0.30591723322868347, "learning_rate": 5.248803227530763e-07, "loss": 0.26996147632598877, "step": 932 }, { "epoch": 25.929577464788732, "grad_norm": 0.314569354057312, "learning_rate": 5.240961878681947e-07, "loss": 0.28236207365989685, "step": 933 }, { "epoch": 25.95774647887324, "grad_norm": 0.32219424843788147, "learning_rate": 5.233121318908173e-07, "loss": 0.2674041986465454, "step": 934 }, { "epoch": 25.985915492957748, "grad_norm": 0.3121417760848999, "learning_rate": 5.225281572093143e-07, "loss": 0.2723839282989502, "step": 935 }, { "epoch": 26.0, "grad_norm": 0.4469078481197357, "learning_rate": 5.21744266211809e-07, "loss": 0.2659713625907898, "step": 936 }, { "epoch": 26.028169014084508, "grad_norm": 0.3079273998737335, "learning_rate": 5.20960461286169e-07, "loss": 0.2612949013710022, "step": 937 }, { "epoch": 26.056338028169016, "grad_norm": 0.29670900106430054, "learning_rate": 5.2017674482e-07, "loss": 0.26683154702186584, "step": 938 }, { "epoch": 26.08450704225352, "grad_norm": 0.3200303018093109, "learning_rate": 5.193931192006385e-07, "loss": 0.2616243362426758, "step": 939 }, { "epoch": 26.112676056338028, "grad_norm": 0.31682220101356506, "learning_rate": 5.186095868151436e-07, "loss": 0.27138951420783997, "step": 940 }, { "epoch": 26.140845070422536, "grad_norm": 0.30821120738983154, "learning_rate": 5.178261500502912e-07, "loss": 0.26395922899246216, "step": 941 }, { "epoch": 26.169014084507044, "grad_norm": 0.3168351352214813, "learning_rate": 5.170428112925659e-07, "loss": 0.2528039813041687, "step": 942 }, { "epoch": 26.197183098591548, "grad_norm": 0.31877174973487854, "learning_rate": 5.162595729281526e-07, "loss": 0.268981397151947, "step": 943 }, { "epoch": 26.225352112676056, "grad_norm": 0.30236542224884033, "learning_rate": 5.154764373429315e-07, "loss": 0.26689520478248596, "step": 944 }, { "epoch": 26.253521126760564, "grad_norm": 0.31615039706230164, "learning_rate": 5.146934069224698e-07, "loss": 0.25211524963378906, "step": 945 }, { "epoch": 26.281690140845072, "grad_norm": 0.304155558347702, "learning_rate": 5.139104840520135e-07, "loss": 0.26361894607543945, "step": 946 }, { "epoch": 26.309859154929576, "grad_norm": 0.31038856506347656, "learning_rate": 5.131276711164815e-07, "loss": 0.26455777883529663, "step": 947 }, { "epoch": 26.338028169014084, "grad_norm": 0.3139597177505493, "learning_rate": 5.123449705004581e-07, "loss": 0.2526125907897949, "step": 948 }, { "epoch": 26.366197183098592, "grad_norm": 0.3288014233112335, "learning_rate": 5.115623845881847e-07, "loss": 0.2677180767059326, "step": 949 }, { "epoch": 26.3943661971831, "grad_norm": 0.33518192172050476, "learning_rate": 5.107799157635538e-07, "loss": 0.2683093249797821, "step": 950 }, { "epoch": 26.422535211267604, "grad_norm": 0.3219356834888458, "learning_rate": 5.099975664101014e-07, "loss": 0.2773933708667755, "step": 951 }, { "epoch": 26.450704225352112, "grad_norm": 0.32385388016700745, "learning_rate": 5.09215338910999e-07, "loss": 0.2612137198448181, "step": 952 }, { "epoch": 26.47887323943662, "grad_norm": 0.32834818959236145, "learning_rate": 5.084332356490472e-07, "loss": 0.2747904658317566, "step": 953 }, { "epoch": 26.507042253521128, "grad_norm": 0.32953891158103943, "learning_rate": 5.076512590066685e-07, "loss": 0.2700774669647217, "step": 954 }, { "epoch": 26.535211267605632, "grad_norm": 0.31470146775245667, "learning_rate": 5.068694113658992e-07, "loss": 0.26825615763664246, "step": 955 }, { "epoch": 26.56338028169014, "grad_norm": 0.3184269964694977, "learning_rate": 5.060876951083828e-07, "loss": 0.2559502124786377, "step": 956 }, { "epoch": 26.591549295774648, "grad_norm": 0.3205021619796753, "learning_rate": 5.053061126153624e-07, "loss": 0.26462531089782715, "step": 957 }, { "epoch": 26.619718309859156, "grad_norm": 0.3158126473426819, "learning_rate": 5.045246662676741e-07, "loss": 0.2701690196990967, "step": 958 }, { "epoch": 26.647887323943664, "grad_norm": 0.3104144334793091, "learning_rate": 5.037433584457389e-07, "loss": 0.27104830741882324, "step": 959 }, { "epoch": 26.676056338028168, "grad_norm": 0.3229422867298126, "learning_rate": 5.02962191529556e-07, "loss": 0.2765110731124878, "step": 960 }, { "epoch": 26.704225352112676, "grad_norm": 0.3127235770225525, "learning_rate": 5.021811678986951e-07, "loss": 0.26477351784706116, "step": 961 }, { "epoch": 26.732394366197184, "grad_norm": 0.31363457441329956, "learning_rate": 5.014002899322896e-07, "loss": 0.2696647644042969, "step": 962 }, { "epoch": 26.760563380281692, "grad_norm": 0.3330313265323639, "learning_rate": 5.006195600090296e-07, "loss": 0.2720947861671448, "step": 963 }, { "epoch": 26.788732394366196, "grad_norm": 0.3137781023979187, "learning_rate": 4.998389805071536e-07, "loss": 0.2770814001560211, "step": 964 }, { "epoch": 26.816901408450704, "grad_norm": 0.30663928389549255, "learning_rate": 4.990585538044419e-07, "loss": 0.26743337512016296, "step": 965 }, { "epoch": 26.845070422535212, "grad_norm": 0.3439841866493225, "learning_rate": 4.982782822782101e-07, "loss": 0.26640748977661133, "step": 966 }, { "epoch": 26.87323943661972, "grad_norm": 0.30016517639160156, "learning_rate": 4.974981683053001e-07, "loss": 0.2630905508995056, "step": 967 }, { "epoch": 26.901408450704224, "grad_norm": 0.30313640832901, "learning_rate": 4.967182142620745e-07, "loss": 0.26278769969940186, "step": 968 }, { "epoch": 26.929577464788732, "grad_norm": 0.3100942373275757, "learning_rate": 4.959384225244087e-07, "loss": 0.25859004259109497, "step": 969 }, { "epoch": 26.95774647887324, "grad_norm": 0.3049146234989166, "learning_rate": 4.951587954676837e-07, "loss": 0.2737579941749573, "step": 970 }, { "epoch": 26.985915492957748, "grad_norm": 0.3105259835720062, "learning_rate": 4.943793354667783e-07, "loss": 0.2698732018470764, "step": 971 }, { "epoch": 27.0, "grad_norm": 0.43671199679374695, "learning_rate": 4.93600044896063e-07, "loss": 0.2851495146751404, "step": 972 }, { "epoch": 27.028169014084508, "grad_norm": 0.3152709901332855, "learning_rate": 4.928209261293923e-07, "loss": 0.27372750639915466, "step": 973 }, { "epoch": 27.056338028169016, "grad_norm": 0.3281909227371216, "learning_rate": 4.920419815400968e-07, "loss": 0.26317745447158813, "step": 974 }, { "epoch": 27.08450704225352, "grad_norm": 0.30629420280456543, "learning_rate": 4.912632135009769e-07, "loss": 0.267042338848114, "step": 975 }, { "epoch": 27.112676056338028, "grad_norm": 0.31097206473350525, "learning_rate": 4.904846243842949e-07, "loss": 0.2647910714149475, "step": 976 }, { "epoch": 27.140845070422536, "grad_norm": 0.30723172426223755, "learning_rate": 4.897062165617686e-07, "loss": 0.27176767587661743, "step": 977 }, { "epoch": 27.169014084507044, "grad_norm": 0.333957701921463, "learning_rate": 4.88927992404563e-07, "loss": 0.26361826062202454, "step": 978 }, { "epoch": 27.197183098591548, "grad_norm": 0.30476778745651245, "learning_rate": 4.881499542832841e-07, "loss": 0.2584869861602783, "step": 979 }, { "epoch": 27.225352112676056, "grad_norm": 0.3146997392177582, "learning_rate": 4.873721045679706e-07, "loss": 0.2549043893814087, "step": 980 }, { "epoch": 27.253521126760564, "grad_norm": 0.30739930272102356, "learning_rate": 4.865944456280878e-07, "loss": 0.2622683644294739, "step": 981 }, { "epoch": 27.281690140845072, "grad_norm": 0.3006227910518646, "learning_rate": 4.858169798325198e-07, "loss": 0.27283164858818054, "step": 982 }, { "epoch": 27.309859154929576, "grad_norm": 0.31303322315216064, "learning_rate": 4.850397095495621e-07, "loss": 0.2585863471031189, "step": 983 }, { "epoch": 27.338028169014084, "grad_norm": 0.3036518692970276, "learning_rate": 4.842626371469149e-07, "loss": 0.2656107246875763, "step": 984 }, { "epoch": 27.366197183098592, "grad_norm": 0.3137490749359131, "learning_rate": 4.834857649916752e-07, "loss": 0.25737249851226807, "step": 985 }, { "epoch": 27.3943661971831, "grad_norm": 0.3161812424659729, "learning_rate": 4.827090954503308e-07, "loss": 0.2658624053001404, "step": 986 }, { "epoch": 27.422535211267604, "grad_norm": 0.2974465489387512, "learning_rate": 4.819326308887513e-07, "loss": 0.2653939425945282, "step": 987 }, { "epoch": 27.450704225352112, "grad_norm": 0.3207877576351166, "learning_rate": 4.811563736721829e-07, "loss": 0.2567484378814697, "step": 988 }, { "epoch": 27.47887323943662, "grad_norm": 0.30379563570022583, "learning_rate": 4.803803261652395e-07, "loss": 0.2731136083602905, "step": 989 }, { "epoch": 27.507042253521128, "grad_norm": 0.30110257863998413, "learning_rate": 4.79604490731896e-07, "loss": 0.2533247172832489, "step": 990 }, { "epoch": 27.535211267605632, "grad_norm": 0.32354485988616943, "learning_rate": 4.788288697354824e-07, "loss": 0.2776826024055481, "step": 991 }, { "epoch": 27.56338028169014, "grad_norm": 0.3137172758579254, "learning_rate": 4.780534655386743e-07, "loss": 0.2678206264972687, "step": 992 }, { "epoch": 27.591549295774648, "grad_norm": 0.3129335641860962, "learning_rate": 4.772782805034876e-07, "loss": 0.27128273248672485, "step": 993 }, { "epoch": 27.619718309859156, "grad_norm": 0.3112099766731262, "learning_rate": 4.7650331699127013e-07, "loss": 0.25505757331848145, "step": 994 }, { "epoch": 27.647887323943664, "grad_norm": 0.3214300274848938, "learning_rate": 4.75728577362695e-07, "loss": 0.252490371465683, "step": 995 }, { "epoch": 27.676056338028168, "grad_norm": 0.3177250623703003, "learning_rate": 4.749540639777539e-07, "loss": 0.2748945355415344, "step": 996 }, { "epoch": 27.704225352112676, "grad_norm": 0.3087361752986908, "learning_rate": 4.741797791957489e-07, "loss": 0.26117944717407227, "step": 997 }, { "epoch": 27.732394366197184, "grad_norm": 0.3008691072463989, "learning_rate": 4.7340572537528547e-07, "loss": 0.2576630115509033, "step": 998 }, { "epoch": 27.760563380281692, "grad_norm": 0.3111347556114197, "learning_rate": 4.7263190487426563e-07, "loss": 0.26800209283828735, "step": 999 }, { "epoch": 27.788732394366196, "grad_norm": 0.2986048758029938, "learning_rate": 4.7185832004988133e-07, "loss": 0.2734978497028351, "step": 1000 }, { "epoch": 27.816901408450704, "grad_norm": 0.31797438859939575, "learning_rate": 4.710849732586059e-07, "loss": 0.2649095356464386, "step": 1001 }, { "epoch": 27.845070422535212, "grad_norm": 0.3100630044937134, "learning_rate": 4.703118668561875e-07, "loss": 0.2550201117992401, "step": 1002 }, { "epoch": 27.87323943661972, "grad_norm": 0.3206699788570404, "learning_rate": 4.6953900319764274e-07, "loss": 0.26471948623657227, "step": 1003 }, { "epoch": 27.901408450704224, "grad_norm": 0.3138802945613861, "learning_rate": 4.68766384637248e-07, "loss": 0.26174217462539673, "step": 1004 }, { "epoch": 27.929577464788732, "grad_norm": 0.3069911301136017, "learning_rate": 4.679940135285336e-07, "loss": 0.26182085275650024, "step": 1005 }, { "epoch": 27.95774647887324, "grad_norm": 0.3080894351005554, "learning_rate": 4.672218922242759e-07, "loss": 0.272597074508667, "step": 1006 }, { "epoch": 27.985915492957748, "grad_norm": 0.30975106358528137, "learning_rate": 4.664500230764903e-07, "loss": 0.28192490339279175, "step": 1007 }, { "epoch": 28.0, "grad_norm": 0.44492414593696594, "learning_rate": 4.656784084364238e-07, "loss": 0.2805609405040741, "step": 1008 }, { "epoch": 28.028169014084508, "grad_norm": 0.3142589330673218, "learning_rate": 4.6490705065454883e-07, "loss": 0.2571072280406952, "step": 1009 }, { "epoch": 28.056338028169016, "grad_norm": 0.3059631884098053, "learning_rate": 4.641359520805548e-07, "loss": 0.2683190107345581, "step": 1010 }, { "epoch": 28.08450704225352, "grad_norm": 0.32835182547569275, "learning_rate": 4.6336511506334177e-07, "loss": 0.2751193344593048, "step": 1011 }, { "epoch": 28.112676056338028, "grad_norm": 0.31909412145614624, "learning_rate": 4.6259454195101267e-07, "loss": 0.27306729555130005, "step": 1012 }, { "epoch": 28.140845070422536, "grad_norm": 0.32016029953956604, "learning_rate": 4.61824235090867e-07, "loss": 0.2615482211112976, "step": 1013 }, { "epoch": 28.169014084507044, "grad_norm": 0.30900275707244873, "learning_rate": 4.6105419682939316e-07, "loss": 0.2553929388523102, "step": 1014 }, { "epoch": 28.197183098591548, "grad_norm": 0.3047516942024231, "learning_rate": 4.602844295122613e-07, "loss": 0.26050907373428345, "step": 1015 }, { "epoch": 28.225352112676056, "grad_norm": 0.31619319319725037, "learning_rate": 4.59514935484316e-07, "loss": 0.2493715137243271, "step": 1016 }, { "epoch": 28.253521126760564, "grad_norm": 0.31594234704971313, "learning_rate": 4.5874571708956953e-07, "loss": 0.26061999797821045, "step": 1017 }, { "epoch": 28.281690140845072, "grad_norm": 0.31763410568237305, "learning_rate": 4.579767766711944e-07, "loss": 0.2720048427581787, "step": 1018 }, { "epoch": 28.309859154929576, "grad_norm": 0.3225538432598114, "learning_rate": 4.572081165715167e-07, "loss": 0.26587527990341187, "step": 1019 }, { "epoch": 28.338028169014084, "grad_norm": 0.33830496668815613, "learning_rate": 4.5643973913200837e-07, "loss": 0.26142361760139465, "step": 1020 }, { "epoch": 28.366197183098592, "grad_norm": 0.30440667271614075, "learning_rate": 4.556716466932803e-07, "loss": 0.25490373373031616, "step": 1021 }, { "epoch": 28.3943661971831, "grad_norm": 0.30009451508522034, "learning_rate": 4.549038415950751e-07, "loss": 0.258319616317749, "step": 1022 }, { "epoch": 28.422535211267604, "grad_norm": 0.32110437750816345, "learning_rate": 4.5413632617626054e-07, "loss": 0.2684330344200134, "step": 1023 }, { "epoch": 28.450704225352112, "grad_norm": 0.3126528561115265, "learning_rate": 4.5336910277482155e-07, "loss": 0.2647142708301544, "step": 1024 }, { "epoch": 28.47887323943662, "grad_norm": 0.30162736773490906, "learning_rate": 4.526021737278537e-07, "loss": 0.2717491388320923, "step": 1025 }, { "epoch": 28.507042253521128, "grad_norm": 0.32018333673477173, "learning_rate": 4.51835541371556e-07, "loss": 0.2770422697067261, "step": 1026 }, { "epoch": 28.535211267605632, "grad_norm": 0.3132731318473816, "learning_rate": 4.5106920804122304e-07, "loss": 0.2692522406578064, "step": 1027 }, { "epoch": 28.56338028169014, "grad_norm": 0.30906060338020325, "learning_rate": 4.503031760712397e-07, "loss": 0.2523694932460785, "step": 1028 }, { "epoch": 28.591549295774648, "grad_norm": 0.3276032507419586, "learning_rate": 4.4953744779507197e-07, "loss": 0.26482313871383667, "step": 1029 }, { "epoch": 28.619718309859156, "grad_norm": 0.33187615871429443, "learning_rate": 4.4877202554526084e-07, "loss": 0.2603946924209595, "step": 1030 }, { "epoch": 28.647887323943664, "grad_norm": 0.30181628465652466, "learning_rate": 4.480069116534151e-07, "loss": 0.25871700048446655, "step": 1031 }, { "epoch": 28.676056338028168, "grad_norm": 0.3155851662158966, "learning_rate": 4.4724210845020494e-07, "loss": 0.2617461681365967, "step": 1032 }, { "epoch": 28.704225352112676, "grad_norm": 0.30370378494262695, "learning_rate": 4.4647761826535303e-07, "loss": 0.26235488057136536, "step": 1033 }, { "epoch": 28.732394366197184, "grad_norm": 0.317186564207077, "learning_rate": 4.457134434276293e-07, "loss": 0.26761680841445923, "step": 1034 }, { "epoch": 28.760563380281692, "grad_norm": 0.3287314772605896, "learning_rate": 4.449495862648427e-07, "loss": 0.261843204498291, "step": 1035 }, { "epoch": 28.788732394366196, "grad_norm": 0.33204883337020874, "learning_rate": 4.441860491038345e-07, "loss": 0.2633381485939026, "step": 1036 }, { "epoch": 28.816901408450704, "grad_norm": 0.32268011569976807, "learning_rate": 4.4342283427047164e-07, "loss": 0.24900981783866882, "step": 1037 }, { "epoch": 28.845070422535212, "grad_norm": 0.3224244713783264, "learning_rate": 4.4265994408963867e-07, "loss": 0.2667103111743927, "step": 1038 }, { "epoch": 28.87323943661972, "grad_norm": 0.3169482350349426, "learning_rate": 4.418973808852313e-07, "loss": 0.268291175365448, "step": 1039 }, { "epoch": 28.901408450704224, "grad_norm": 0.33006441593170166, "learning_rate": 4.4113514698014953e-07, "loss": 0.27004534006118774, "step": 1040 }, { "epoch": 28.929577464788732, "grad_norm": 0.35179299116134644, "learning_rate": 4.403732446962899e-07, "loss": 0.2628635764122009, "step": 1041 }, { "epoch": 28.95774647887324, "grad_norm": 0.3151315748691559, "learning_rate": 4.3961167635453876e-07, "loss": 0.2677478492259979, "step": 1042 }, { "epoch": 28.985915492957748, "grad_norm": 0.3185572922229767, "learning_rate": 4.388504442747657e-07, "loss": 0.2660791873931885, "step": 1043 }, { "epoch": 29.0, "grad_norm": 0.45902183651924133, "learning_rate": 4.3808955077581546e-07, "loss": 0.2720754146575928, "step": 1044 }, { "epoch": 29.028169014084508, "grad_norm": 0.3011077344417572, "learning_rate": 4.373289981755013e-07, "loss": 0.25422877073287964, "step": 1045 }, { "epoch": 29.056338028169016, "grad_norm": 0.3089461028575897, "learning_rate": 4.365687887905988e-07, "loss": 0.2498088926076889, "step": 1046 }, { "epoch": 29.08450704225352, "grad_norm": 0.32150641083717346, "learning_rate": 4.358089249368375e-07, "loss": 0.2662513554096222, "step": 1047 }, { "epoch": 29.112676056338028, "grad_norm": 0.32592031359672546, "learning_rate": 4.350494089288943e-07, "loss": 0.2539994418621063, "step": 1048 }, { "epoch": 29.140845070422536, "grad_norm": 0.31924694776535034, "learning_rate": 4.3429024308038686e-07, "loss": 0.2557491958141327, "step": 1049 }, { "epoch": 29.169014084507044, "grad_norm": 0.32504960894584656, "learning_rate": 4.3353142970386557e-07, "loss": 0.26317501068115234, "step": 1050 }, { "epoch": 29.197183098591548, "grad_norm": 0.3093854784965515, "learning_rate": 4.327729711108082e-07, "loss": 0.25340092182159424, "step": 1051 }, { "epoch": 29.225352112676056, "grad_norm": 0.313862144947052, "learning_rate": 4.3201486961161093e-07, "loss": 0.2559676766395569, "step": 1052 }, { "epoch": 29.253521126760564, "grad_norm": 0.3301529288291931, "learning_rate": 4.312571275155823e-07, "loss": 0.2709015905857086, "step": 1053 }, { "epoch": 29.281690140845072, "grad_norm": 0.32452118396759033, "learning_rate": 4.304997471309361e-07, "loss": 0.2698490619659424, "step": 1054 }, { "epoch": 29.309859154929576, "grad_norm": 0.3382558226585388, "learning_rate": 4.297427307647844e-07, "loss": 0.2615205645561218, "step": 1055 }, { "epoch": 29.338028169014084, "grad_norm": 0.3098710775375366, "learning_rate": 4.2898608072313045e-07, "loss": 0.2664251923561096, "step": 1056 }, { "epoch": 29.366197183098592, "grad_norm": 0.3207705318927765, "learning_rate": 4.2822979931086144e-07, "loss": 0.2764906883239746, "step": 1057 }, { "epoch": 29.3943661971831, "grad_norm": 0.3483034372329712, "learning_rate": 4.2747388883174154e-07, "loss": 0.2622952163219452, "step": 1058 }, { "epoch": 29.422535211267604, "grad_norm": 0.30950114130973816, "learning_rate": 4.267183515884054e-07, "loss": 0.2630128860473633, "step": 1059 }, { "epoch": 29.450704225352112, "grad_norm": 0.32425740361213684, "learning_rate": 4.2596318988235037e-07, "loss": 0.25917208194732666, "step": 1060 }, { "epoch": 29.47887323943662, "grad_norm": 0.3382692039012909, "learning_rate": 4.2520840601392996e-07, "loss": 0.26483750343322754, "step": 1061 }, { "epoch": 29.507042253521128, "grad_norm": 0.30861786007881165, "learning_rate": 4.2445400228234687e-07, "loss": 0.2531127631664276, "step": 1062 }, { "epoch": 29.535211267605632, "grad_norm": 0.33470088243484497, "learning_rate": 4.2369998098564554e-07, "loss": 0.263372540473938, "step": 1063 }, { "epoch": 29.56338028169014, "grad_norm": 0.34484177827835083, "learning_rate": 4.2294634442070553e-07, "loss": 0.263760507106781, "step": 1064 }, { "epoch": 29.591549295774648, "grad_norm": 0.32152125239372253, "learning_rate": 4.2219309488323487e-07, "loss": 0.2630784511566162, "step": 1065 }, { "epoch": 29.619718309859156, "grad_norm": 0.3259511888027191, "learning_rate": 4.214402346677619e-07, "loss": 0.26080453395843506, "step": 1066 }, { "epoch": 29.647887323943664, "grad_norm": 0.32442566752433777, "learning_rate": 4.206877660676297e-07, "loss": 0.2604103088378906, "step": 1067 }, { "epoch": 29.676056338028168, "grad_norm": 0.3231119215488434, "learning_rate": 4.1993569137498776e-07, "loss": 0.26589787006378174, "step": 1068 }, { "epoch": 29.704225352112676, "grad_norm": 0.3275383412837982, "learning_rate": 4.1918401288078633e-07, "loss": 0.2476288229227066, "step": 1069 }, { "epoch": 29.732394366197184, "grad_norm": 0.3219151496887207, "learning_rate": 4.1843273287476854e-07, "loss": 0.26332658529281616, "step": 1070 }, { "epoch": 29.760563380281692, "grad_norm": 0.31227391958236694, "learning_rate": 4.1768185364546326e-07, "loss": 0.2647852301597595, "step": 1071 }, { "epoch": 29.788732394366196, "grad_norm": 0.3090374767780304, "learning_rate": 4.1693137748017915e-07, "loss": 0.2562742531299591, "step": 1072 }, { "epoch": 29.816901408450704, "grad_norm": 0.32516875863075256, "learning_rate": 4.161813066649963e-07, "loss": 0.27417412400245667, "step": 1073 }, { "epoch": 29.845070422535212, "grad_norm": 0.3393928110599518, "learning_rate": 4.15431643484761e-07, "loss": 0.25790080428123474, "step": 1074 }, { "epoch": 29.87323943661972, "grad_norm": 0.3293744623661041, "learning_rate": 4.146823902230772e-07, "loss": 0.27599674463272095, "step": 1075 }, { "epoch": 29.901408450704224, "grad_norm": 0.336525022983551, "learning_rate": 4.1393354916230005e-07, "loss": 0.2566748261451721, "step": 1076 }, { "epoch": 29.929577464788732, "grad_norm": 0.30744579434394836, "learning_rate": 4.1318512258352936e-07, "loss": 0.276886522769928, "step": 1077 }, { "epoch": 29.95774647887324, "grad_norm": 0.3156173527240753, "learning_rate": 4.124371127666024e-07, "loss": 0.27484360337257385, "step": 1078 }, { "epoch": 29.985915492957748, "grad_norm": 0.31924012303352356, "learning_rate": 4.1168952199008677e-07, "loss": 0.2567445635795593, "step": 1079 }, { "epoch": 30.0, "grad_norm": 0.4623652994632721, "learning_rate": 4.1094235253127374e-07, "loss": 0.27351921796798706, "step": 1080 }, { "epoch": 30.028169014084508, "grad_norm": 0.32494813203811646, "learning_rate": 4.101956066661708e-07, "loss": 0.26006799936294556, "step": 1081 }, { "epoch": 30.056338028169016, "grad_norm": 0.3355497121810913, "learning_rate": 4.0944928666949527e-07, "loss": 0.26071614027023315, "step": 1082 }, { "epoch": 30.08450704225352, "grad_norm": 0.3180653750896454, "learning_rate": 4.0870339481466774e-07, "loss": 0.2741304039955139, "step": 1083 }, { "epoch": 30.112676056338028, "grad_norm": 0.31589558720588684, "learning_rate": 4.079579333738039e-07, "loss": 0.2640499770641327, "step": 1084 }, { "epoch": 30.140845070422536, "grad_norm": 0.33277377486228943, "learning_rate": 4.0721290461770863e-07, "loss": 0.2542555630207062, "step": 1085 }, { "epoch": 30.169014084507044, "grad_norm": 0.31191685795783997, "learning_rate": 4.064683108158685e-07, "loss": 0.24946148693561554, "step": 1086 }, { "epoch": 30.197183098591548, "grad_norm": 0.31646913290023804, "learning_rate": 4.057241542364457e-07, "loss": 0.2565403878688812, "step": 1087 }, { "epoch": 30.225352112676056, "grad_norm": 0.32091739773750305, "learning_rate": 4.0498043714627006e-07, "loss": 0.2608620226383209, "step": 1088 }, { "epoch": 30.253521126760564, "grad_norm": 0.3244355618953705, "learning_rate": 4.042371618108329e-07, "loss": 0.25209081172943115, "step": 1089 }, { "epoch": 30.281690140845072, "grad_norm": 0.3262701630592346, "learning_rate": 4.034943304942796e-07, "loss": 0.2566452622413635, "step": 1090 }, { "epoch": 30.309859154929576, "grad_norm": 0.35125988721847534, "learning_rate": 4.027519454594033e-07, "loss": 0.2646006643772125, "step": 1091 }, { "epoch": 30.338028169014084, "grad_norm": 0.32471081614494324, "learning_rate": 4.020100089676376e-07, "loss": 0.2576545178890228, "step": 1092 }, { "epoch": 30.366197183098592, "grad_norm": 0.33542898297309875, "learning_rate": 4.012685232790497e-07, "loss": 0.25865480303764343, "step": 1093 }, { "epoch": 30.3943661971831, "grad_norm": 0.31360387802124023, "learning_rate": 4.005274906523336e-07, "loss": 0.25481581687927246, "step": 1094 }, { "epoch": 30.422535211267604, "grad_norm": 0.33107563853263855, "learning_rate": 3.9978691334480306e-07, "loss": 0.252411812543869, "step": 1095 }, { "epoch": 30.450704225352112, "grad_norm": 0.3281182050704956, "learning_rate": 3.9904679361238526e-07, "loss": 0.2586092948913574, "step": 1096 }, { "epoch": 30.47887323943662, "grad_norm": 0.32694414258003235, "learning_rate": 3.9830713370961313e-07, "loss": 0.26445192098617554, "step": 1097 }, { "epoch": 30.507042253521128, "grad_norm": 0.318498432636261, "learning_rate": 3.975679358896189e-07, "loss": 0.25009143352508545, "step": 1098 }, { "epoch": 30.535211267605632, "grad_norm": 0.3352436423301697, "learning_rate": 3.968292024041275e-07, "loss": 0.2770006060600281, "step": 1099 }, { "epoch": 30.56338028169014, "grad_norm": 0.3413051664829254, "learning_rate": 3.9609093550344907e-07, "loss": 0.2675744593143463, "step": 1100 }, { "epoch": 30.591549295774648, "grad_norm": 0.33011800050735474, "learning_rate": 3.953531374364728e-07, "loss": 0.25982439517974854, "step": 1101 }, { "epoch": 30.619718309859156, "grad_norm": 0.3153058588504791, "learning_rate": 3.946158104506594e-07, "loss": 0.26440930366516113, "step": 1102 }, { "epoch": 30.647887323943664, "grad_norm": 0.33693262934684753, "learning_rate": 3.938789567920349e-07, "loss": 0.2564413845539093, "step": 1103 }, { "epoch": 30.676056338028168, "grad_norm": 0.3082239031791687, "learning_rate": 3.931425787051832e-07, "loss": 0.26095646619796753, "step": 1104 }, { "epoch": 30.704225352112676, "grad_norm": 0.34148088097572327, "learning_rate": 3.924066784332396e-07, "loss": 0.27237722277641296, "step": 1105 }, { "epoch": 30.732394366197184, "grad_norm": 0.3161861300468445, "learning_rate": 3.9167125821788416e-07, "loss": 0.25798144936561584, "step": 1106 }, { "epoch": 30.760563380281692, "grad_norm": 0.33590832352638245, "learning_rate": 3.909363202993343e-07, "loss": 0.2643035650253296, "step": 1107 }, { "epoch": 30.788732394366196, "grad_norm": 0.33959585428237915, "learning_rate": 3.902018669163384e-07, "loss": 0.2613189220428467, "step": 1108 }, { "epoch": 30.816901408450704, "grad_norm": 0.31452202796936035, "learning_rate": 3.894679003061686e-07, "loss": 0.26554104685783386, "step": 1109 }, { "epoch": 30.845070422535212, "grad_norm": 0.3322625160217285, "learning_rate": 3.8873442270461485e-07, "loss": 0.2571873664855957, "step": 1110 }, { "epoch": 30.87323943661972, "grad_norm": 0.33110320568084717, "learning_rate": 3.88001436345977e-07, "loss": 0.26796817779541016, "step": 1111 }, { "epoch": 30.901408450704224, "grad_norm": 0.32166630029678345, "learning_rate": 3.872689434630585e-07, "loss": 0.25648969411849976, "step": 1112 }, { "epoch": 30.929577464788732, "grad_norm": 0.3449627757072449, "learning_rate": 3.8653694628715984e-07, "loss": 0.26782190799713135, "step": 1113 }, { "epoch": 30.95774647887324, "grad_norm": 0.3227315843105316, "learning_rate": 3.8580544704807117e-07, "loss": 0.2791867256164551, "step": 1114 }, { "epoch": 30.985915492957748, "grad_norm": 0.3112963140010834, "learning_rate": 3.850744479740663e-07, "loss": 0.26565277576446533, "step": 1115 }, { "epoch": 31.0, "grad_norm": 0.4575044810771942, "learning_rate": 3.843439512918949e-07, "loss": 0.25405725836753845, "step": 1116 }, { "epoch": 31.028169014084508, "grad_norm": 0.3324749767780304, "learning_rate": 3.8361395922677687e-07, "loss": 0.26342666149139404, "step": 1117 }, { "epoch": 31.056338028169016, "grad_norm": 0.3335409164428711, "learning_rate": 3.8288447400239443e-07, "loss": 0.27227702736854553, "step": 1118 }, { "epoch": 31.08450704225352, "grad_norm": 0.33716699481010437, "learning_rate": 3.82155497840886e-07, "loss": 0.2696995437145233, "step": 1119 }, { "epoch": 31.112676056338028, "grad_norm": 0.33672624826431274, "learning_rate": 3.8142703296283953e-07, "loss": 0.2588409185409546, "step": 1120 }, { "epoch": 31.140845070422536, "grad_norm": 0.3224928081035614, "learning_rate": 3.806990815872855e-07, "loss": 0.2625422775745392, "step": 1121 }, { "epoch": 31.169014084507044, "grad_norm": 0.32264038920402527, "learning_rate": 3.7997164593168983e-07, "loss": 0.251539021730423, "step": 1122 }, { "epoch": 31.197183098591548, "grad_norm": 0.33344459533691406, "learning_rate": 3.7924472821194765e-07, "loss": 0.25519099831581116, "step": 1123 }, { "epoch": 31.225352112676056, "grad_norm": 0.3551379442214966, "learning_rate": 3.785183306423767e-07, "loss": 0.2584845721721649, "step": 1124 }, { "epoch": 31.253521126760564, "grad_norm": 0.3440611660480499, "learning_rate": 3.777924554357096e-07, "loss": 0.2609241008758545, "step": 1125 }, { "epoch": 31.281690140845072, "grad_norm": 0.3400917649269104, "learning_rate": 3.7706710480308835e-07, "loss": 0.26181089878082275, "step": 1126 }, { "epoch": 31.309859154929576, "grad_norm": 0.3361797630786896, "learning_rate": 3.7634228095405673e-07, "loss": 0.2546064853668213, "step": 1127 }, { "epoch": 31.338028169014084, "grad_norm": 0.3346230387687683, "learning_rate": 3.7561798609655373e-07, "loss": 0.26581573486328125, "step": 1128 }, { "epoch": 31.366197183098592, "grad_norm": 0.34457266330718994, "learning_rate": 3.748942224369073e-07, "loss": 0.2582035958766937, "step": 1129 }, { "epoch": 31.3943661971831, "grad_norm": 0.3213818073272705, "learning_rate": 3.7417099217982686e-07, "loss": 0.25484442710876465, "step": 1130 }, { "epoch": 31.422535211267604, "grad_norm": 0.3486325442790985, "learning_rate": 3.734482975283975e-07, "loss": 0.27330318093299866, "step": 1131 }, { "epoch": 31.450704225352112, "grad_norm": 0.3430873453617096, "learning_rate": 3.72726140684072e-07, "loss": 0.25915205478668213, "step": 1132 }, { "epoch": 31.47887323943662, "grad_norm": 0.3348333537578583, "learning_rate": 3.720045238466658e-07, "loss": 0.2582821846008301, "step": 1133 }, { "epoch": 31.507042253521128, "grad_norm": 0.3174356520175934, "learning_rate": 3.712834492143487e-07, "loss": 0.2682039737701416, "step": 1134 }, { "epoch": 31.535211267605632, "grad_norm": 0.3320380449295044, "learning_rate": 3.7056291898363925e-07, "loss": 0.2751486003398895, "step": 1135 }, { "epoch": 31.56338028169014, "grad_norm": 0.3412676155567169, "learning_rate": 3.6984293534939737e-07, "loss": 0.2540426254272461, "step": 1136 }, { "epoch": 31.591549295774648, "grad_norm": 0.35137638449668884, "learning_rate": 3.69123500504818e-07, "loss": 0.2570858895778656, "step": 1137 }, { "epoch": 31.619718309859156, "grad_norm": 0.32933273911476135, "learning_rate": 3.6840461664142444e-07, "loss": 0.2535385489463806, "step": 1138 }, { "epoch": 31.647887323943664, "grad_norm": 0.32296112179756165, "learning_rate": 3.6768628594906193e-07, "loss": 0.26802340149879456, "step": 1139 }, { "epoch": 31.676056338028168, "grad_norm": 0.33371275663375854, "learning_rate": 3.6696851061588994e-07, "loss": 0.26279398798942566, "step": 1140 }, { "epoch": 31.704225352112676, "grad_norm": 0.3587881624698639, "learning_rate": 3.6625129282837685e-07, "loss": 0.26237016916275024, "step": 1141 }, { "epoch": 31.732394366197184, "grad_norm": 0.3388115465641022, "learning_rate": 3.655346347712922e-07, "loss": 0.2542800307273865, "step": 1142 }, { "epoch": 31.760563380281692, "grad_norm": 0.3145511746406555, "learning_rate": 3.6481853862770107e-07, "loss": 0.2536108195781708, "step": 1143 }, { "epoch": 31.788732394366196, "grad_norm": 0.34181296825408936, "learning_rate": 3.641030065789562e-07, "loss": 0.2601550817489624, "step": 1144 }, { "epoch": 31.816901408450704, "grad_norm": 0.322862833738327, "learning_rate": 3.6338804080469253e-07, "loss": 0.25029903650283813, "step": 1145 }, { "epoch": 31.845070422535212, "grad_norm": 0.3622659146785736, "learning_rate": 3.6267364348281946e-07, "loss": 0.26150447130203247, "step": 1146 }, { "epoch": 31.87323943661972, "grad_norm": 0.330181360244751, "learning_rate": 3.6195981678951535e-07, "loss": 0.2587708830833435, "step": 1147 }, { "epoch": 31.901408450704224, "grad_norm": 0.3616638779640198, "learning_rate": 3.612465628992203e-07, "loss": 0.26097607612609863, "step": 1148 }, { "epoch": 31.929577464788732, "grad_norm": 0.3439587652683258, "learning_rate": 3.60533883984629e-07, "loss": 0.2429528385400772, "step": 1149 }, { "epoch": 31.95774647887324, "grad_norm": 0.3390144407749176, "learning_rate": 3.5982178221668533e-07, "loss": 0.2673777937889099, "step": 1150 }, { "epoch": 31.985915492957748, "grad_norm": 0.3215203881263733, "learning_rate": 3.591102597645743e-07, "loss": 0.25635766983032227, "step": 1151 }, { "epoch": 32.0, "grad_norm": 0.4861057698726654, "learning_rate": 3.5839931879571725e-07, "loss": 0.26994332671165466, "step": 1152 }, { "epoch": 32.028169014084504, "grad_norm": 0.3433145582675934, "learning_rate": 3.5768896147576344e-07, "loss": 0.2525317072868347, "step": 1153 }, { "epoch": 32.056338028169016, "grad_norm": 0.34238752722740173, "learning_rate": 3.5697918996858443e-07, "loss": 0.271589457988739, "step": 1154 }, { "epoch": 32.08450704225352, "grad_norm": 0.33140960335731506, "learning_rate": 3.5627000643626704e-07, "loss": 0.2612978219985962, "step": 1155 }, { "epoch": 32.11267605633803, "grad_norm": 0.31951841711997986, "learning_rate": 3.555614130391079e-07, "loss": 0.27151286602020264, "step": 1156 }, { "epoch": 32.140845070422536, "grad_norm": 0.3442953824996948, "learning_rate": 3.5485341193560503e-07, "loss": 0.2442217469215393, "step": 1157 }, { "epoch": 32.16901408450704, "grad_norm": 0.3276779055595398, "learning_rate": 3.5414600528245266e-07, "loss": 0.25613170862197876, "step": 1158 }, { "epoch": 32.19718309859155, "grad_norm": 0.33608436584472656, "learning_rate": 3.534391952345341e-07, "loss": 0.2614259123802185, "step": 1159 }, { "epoch": 32.225352112676056, "grad_norm": 0.3303307592868805, "learning_rate": 3.5273298394491515e-07, "loss": 0.2672120928764343, "step": 1160 }, { "epoch": 32.25352112676056, "grad_norm": 0.32655128836631775, "learning_rate": 3.5202737356483816e-07, "loss": 0.25033846497535706, "step": 1161 }, { "epoch": 32.28169014084507, "grad_norm": 0.3326750099658966, "learning_rate": 3.513223662437147e-07, "loss": 0.2697717547416687, "step": 1162 }, { "epoch": 32.309859154929576, "grad_norm": 0.33951663970947266, "learning_rate": 3.5061796412911913e-07, "loss": 0.25987690687179565, "step": 1163 }, { "epoch": 32.33802816901409, "grad_norm": 0.3316378891468048, "learning_rate": 3.4991416936678276e-07, "loss": 0.26063597202301025, "step": 1164 }, { "epoch": 32.36619718309859, "grad_norm": 0.33838751912117004, "learning_rate": 3.49210984100586e-07, "loss": 0.26821669936180115, "step": 1165 }, { "epoch": 32.394366197183096, "grad_norm": 0.3294714689254761, "learning_rate": 3.4850841047255364e-07, "loss": 0.2651536464691162, "step": 1166 }, { "epoch": 32.42253521126761, "grad_norm": 0.32624831795692444, "learning_rate": 3.4780645062284665e-07, "loss": 0.26797136664390564, "step": 1167 }, { "epoch": 32.45070422535211, "grad_norm": 0.3322686553001404, "learning_rate": 3.471051066897562e-07, "loss": 0.2507922649383545, "step": 1168 }, { "epoch": 32.478873239436616, "grad_norm": 0.34128591418266296, "learning_rate": 3.4640438080969773e-07, "loss": 0.2541847229003906, "step": 1169 }, { "epoch": 32.50704225352113, "grad_norm": 0.3294316828250885, "learning_rate": 3.45704275117204e-07, "loss": 0.26326608657836914, "step": 1170 }, { "epoch": 32.53521126760563, "grad_norm": 0.3293727934360504, "learning_rate": 3.450047917449181e-07, "loss": 0.2654852271080017, "step": 1171 }, { "epoch": 32.563380281690144, "grad_norm": 0.32460466027259827, "learning_rate": 3.4430593282358777e-07, "loss": 0.25532153248786926, "step": 1172 }, { "epoch": 32.59154929577465, "grad_norm": 0.3373318016529083, "learning_rate": 3.4360770048205843e-07, "loss": 0.25554513931274414, "step": 1173 }, { "epoch": 32.61971830985915, "grad_norm": 0.34251123666763306, "learning_rate": 3.429100968472668e-07, "loss": 0.26249927282333374, "step": 1174 }, { "epoch": 32.647887323943664, "grad_norm": 0.32484838366508484, "learning_rate": 3.4221312404423486e-07, "loss": 0.2562830448150635, "step": 1175 }, { "epoch": 32.67605633802817, "grad_norm": 0.3435952365398407, "learning_rate": 3.4151678419606233e-07, "loss": 0.2574070692062378, "step": 1176 }, { "epoch": 32.70422535211267, "grad_norm": 0.33101195096969604, "learning_rate": 3.4082107942392136e-07, "loss": 0.257138729095459, "step": 1177 }, { "epoch": 32.732394366197184, "grad_norm": 0.37783390283584595, "learning_rate": 3.4012601184704904e-07, "loss": 0.26037871837615967, "step": 1178 }, { "epoch": 32.76056338028169, "grad_norm": 0.33994340896606445, "learning_rate": 3.3943158358274203e-07, "loss": 0.27281370759010315, "step": 1179 }, { "epoch": 32.7887323943662, "grad_norm": 0.32044896483421326, "learning_rate": 3.387377967463493e-07, "loss": 0.2526357173919678, "step": 1180 }, { "epoch": 32.816901408450704, "grad_norm": 0.3177328109741211, "learning_rate": 3.3804465345126545e-07, "loss": 0.24474188685417175, "step": 1181 }, { "epoch": 32.84507042253521, "grad_norm": 0.3454241454601288, "learning_rate": 3.3735215580892575e-07, "loss": 0.24287842214107513, "step": 1182 }, { "epoch": 32.87323943661972, "grad_norm": 0.3315359354019165, "learning_rate": 3.366603059287977e-07, "loss": 0.26422587037086487, "step": 1183 }, { "epoch": 32.901408450704224, "grad_norm": 0.3329971730709076, "learning_rate": 3.359691059183761e-07, "loss": 0.2687873840332031, "step": 1184 }, { "epoch": 32.929577464788736, "grad_norm": 0.32194119691848755, "learning_rate": 3.3527855788317614e-07, "loss": 0.2529294788837433, "step": 1185 }, { "epoch": 32.95774647887324, "grad_norm": 0.3383830487728119, "learning_rate": 3.3458866392672694e-07, "loss": 0.24743716418743134, "step": 1186 }, { "epoch": 32.985915492957744, "grad_norm": 0.3237183690071106, "learning_rate": 3.338994261505649e-07, "loss": 0.2624974250793457, "step": 1187 }, { "epoch": 33.0, "grad_norm": 0.4738941192626953, "learning_rate": 3.3321084665422803e-07, "loss": 0.2611575722694397, "step": 1188 }, { "epoch": 33.028169014084504, "grad_norm": 0.3192257285118103, "learning_rate": 3.325229275352489e-07, "loss": 0.25964364409446716, "step": 1189 }, { "epoch": 33.056338028169016, "grad_norm": 0.3343312442302704, "learning_rate": 3.3183567088914833e-07, "loss": 0.2630879282951355, "step": 1190 }, { "epoch": 33.08450704225352, "grad_norm": 0.32633543014526367, "learning_rate": 3.3114907880942933e-07, "loss": 0.2663639783859253, "step": 1191 }, { "epoch": 33.11267605633803, "grad_norm": 0.3315299451351166, "learning_rate": 3.3046315338757026e-07, "loss": 0.2600438892841339, "step": 1192 }, { "epoch": 33.140845070422536, "grad_norm": 0.35579875111579895, "learning_rate": 3.297778967130191e-07, "loss": 0.2606794834136963, "step": 1193 }, { "epoch": 33.16901408450704, "grad_norm": 0.3733043074607849, "learning_rate": 3.290933108731866e-07, "loss": 0.2512716054916382, "step": 1194 }, { "epoch": 33.19718309859155, "grad_norm": 0.345547616481781, "learning_rate": 3.2840939795343987e-07, "loss": 0.26478058099746704, "step": 1195 }, { "epoch": 33.225352112676056, "grad_norm": 0.33482369780540466, "learning_rate": 3.2772616003709616e-07, "loss": 0.2547541856765747, "step": 1196 }, { "epoch": 33.25352112676056, "grad_norm": 0.3360159695148468, "learning_rate": 3.270435992054166e-07, "loss": 0.2729008197784424, "step": 1197 }, { "epoch": 33.28169014084507, "grad_norm": 0.34279924631118774, "learning_rate": 3.263617175376001e-07, "loss": 0.253216028213501, "step": 1198 }, { "epoch": 33.309859154929576, "grad_norm": 0.33277833461761475, "learning_rate": 3.2568051711077636e-07, "loss": 0.2548581659793854, "step": 1199 }, { "epoch": 33.33802816901409, "grad_norm": 0.3363766074180603, "learning_rate": 3.250000000000001e-07, "loss": 0.25859585404396057, "step": 1200 }, { "epoch": 33.36619718309859, "grad_norm": 0.3143514394760132, "learning_rate": 3.2432016827824414e-07, "loss": 0.25202757120132446, "step": 1201 }, { "epoch": 33.394366197183096, "grad_norm": 0.3307502567768097, "learning_rate": 3.2364102401639423e-07, "loss": 0.2585509717464447, "step": 1202 }, { "epoch": 33.42253521126761, "grad_norm": 0.33466944098472595, "learning_rate": 3.229625692832414e-07, "loss": 0.25337138772010803, "step": 1203 }, { "epoch": 33.45070422535211, "grad_norm": 0.31453531980514526, "learning_rate": 3.222848061454764e-07, "loss": 0.2618822455406189, "step": 1204 }, { "epoch": 33.478873239436616, "grad_norm": 0.35038280487060547, "learning_rate": 3.216077366676833e-07, "loss": 0.26571914553642273, "step": 1205 }, { "epoch": 33.50704225352113, "grad_norm": 0.3479344844818115, "learning_rate": 3.209313629123329e-07, "loss": 0.26047736406326294, "step": 1206 }, { "epoch": 33.53521126760563, "grad_norm": 0.339733362197876, "learning_rate": 3.2025568693977745e-07, "loss": 0.2580920159816742, "step": 1207 }, { "epoch": 33.563380281690144, "grad_norm": 0.3457892835140228, "learning_rate": 3.195807108082429e-07, "loss": 0.25361278653144836, "step": 1208 }, { "epoch": 33.59154929577465, "grad_norm": 0.35116419196128845, "learning_rate": 3.1890643657382356e-07, "loss": 0.2517722249031067, "step": 1209 }, { "epoch": 33.61971830985915, "grad_norm": 0.3323304355144501, "learning_rate": 3.182328662904756e-07, "loss": 0.25763052701950073, "step": 1210 }, { "epoch": 33.647887323943664, "grad_norm": 0.3180283308029175, "learning_rate": 3.175600020100112e-07, "loss": 0.26268666982650757, "step": 1211 }, { "epoch": 33.67605633802817, "grad_norm": 0.32394516468048096, "learning_rate": 3.168878457820915e-07, "loss": 0.2540284991264343, "step": 1212 }, { "epoch": 33.70422535211267, "grad_norm": 0.3315521478652954, "learning_rate": 3.162163996542209e-07, "loss": 0.26291581988334656, "step": 1213 }, { "epoch": 33.732394366197184, "grad_norm": 0.32950082421302795, "learning_rate": 3.155456656717408e-07, "loss": 0.2569209039211273, "step": 1214 }, { "epoch": 33.76056338028169, "grad_norm": 0.3513064384460449, "learning_rate": 3.14875645877823e-07, "loss": 0.24890759587287903, "step": 1215 }, { "epoch": 33.7887323943662, "grad_norm": 0.3389022946357727, "learning_rate": 3.142063423134644e-07, "loss": 0.2649242579936981, "step": 1216 }, { "epoch": 33.816901408450704, "grad_norm": 0.3270207941532135, "learning_rate": 3.135377570174796e-07, "loss": 0.26036375761032104, "step": 1217 }, { "epoch": 33.84507042253521, "grad_norm": 0.35390451550483704, "learning_rate": 3.1286989202649503e-07, "loss": 0.25314897298812866, "step": 1218 }, { "epoch": 33.87323943661972, "grad_norm": 0.3263014256954193, "learning_rate": 3.122027493749438e-07, "loss": 0.2565680742263794, "step": 1219 }, { "epoch": 33.901408450704224, "grad_norm": 0.3133479654788971, "learning_rate": 3.115363310950578e-07, "loss": 0.2629280090332031, "step": 1220 }, { "epoch": 33.929577464788736, "grad_norm": 0.3530975580215454, "learning_rate": 3.1087063921686263e-07, "loss": 0.26493778824806213, "step": 1221 }, { "epoch": 33.95774647887324, "grad_norm": 0.3344945013523102, "learning_rate": 3.102056757681715e-07, "loss": 0.2550634741783142, "step": 1222 }, { "epoch": 33.985915492957744, "grad_norm": 0.32563889026641846, "learning_rate": 3.0954144277457817e-07, "loss": 0.25193893909454346, "step": 1223 }, { "epoch": 34.0, "grad_norm": 0.48929160833358765, "learning_rate": 3.0887794225945143e-07, "loss": 0.2488047182559967, "step": 1224 }, { "epoch": 34.028169014084504, "grad_norm": 0.32252368330955505, "learning_rate": 3.0821517624392925e-07, "loss": 0.25322937965393066, "step": 1225 }, { "epoch": 34.056338028169016, "grad_norm": 0.3510408401489258, "learning_rate": 3.075531467469116e-07, "loss": 0.265546977519989, "step": 1226 }, { "epoch": 34.08450704225352, "grad_norm": 0.33205100893974304, "learning_rate": 3.0689185578505525e-07, "loss": 0.2621091902256012, "step": 1227 }, { "epoch": 34.11267605633803, "grad_norm": 0.33356767892837524, "learning_rate": 3.062313053727671e-07, "loss": 0.24525871872901917, "step": 1228 }, { "epoch": 34.140845070422536, "grad_norm": 0.32789838314056396, "learning_rate": 3.055714975221981e-07, "loss": 0.2655676007270813, "step": 1229 }, { "epoch": 34.16901408450704, "grad_norm": 0.3837502598762512, "learning_rate": 3.0491243424323783e-07, "loss": 0.2583563029766083, "step": 1230 }, { "epoch": 34.19718309859155, "grad_norm": 0.32497507333755493, "learning_rate": 3.0425411754350694e-07, "loss": 0.25412964820861816, "step": 1231 }, { "epoch": 34.225352112676056, "grad_norm": 0.3423527181148529, "learning_rate": 3.0359654942835247e-07, "loss": 0.2603622078895569, "step": 1232 }, { "epoch": 34.25352112676056, "grad_norm": 0.3326815068721771, "learning_rate": 3.029397319008407e-07, "loss": 0.2565937638282776, "step": 1233 }, { "epoch": 34.28169014084507, "grad_norm": 0.3410370945930481, "learning_rate": 3.02283666961752e-07, "loss": 0.2687773108482361, "step": 1234 }, { "epoch": 34.309859154929576, "grad_norm": 0.33839917182922363, "learning_rate": 3.016283566095739e-07, "loss": 0.27057865262031555, "step": 1235 }, { "epoch": 34.33802816901409, "grad_norm": 0.32578834891319275, "learning_rate": 3.0097380284049523e-07, "loss": 0.2486121952533722, "step": 1236 }, { "epoch": 34.36619718309859, "grad_norm": 0.34315571188926697, "learning_rate": 3.003200076484004e-07, "loss": 0.24546003341674805, "step": 1237 }, { "epoch": 34.394366197183096, "grad_norm": 0.32684844732284546, "learning_rate": 2.996669730248628e-07, "loss": 0.2699982523918152, "step": 1238 }, { "epoch": 34.42253521126761, "grad_norm": 0.33143216371536255, "learning_rate": 2.9901470095913943e-07, "loss": 0.25373488664627075, "step": 1239 }, { "epoch": 34.45070422535211, "grad_norm": 0.35439276695251465, "learning_rate": 2.9836319343816397e-07, "loss": 0.24537047743797302, "step": 1240 }, { "epoch": 34.478873239436616, "grad_norm": 0.33683332800865173, "learning_rate": 2.977124524465413e-07, "loss": 0.2581592798233032, "step": 1241 }, { "epoch": 34.50704225352113, "grad_norm": 0.3526037037372589, "learning_rate": 2.9706247996654134e-07, "loss": 0.2586764693260193, "step": 1242 }, { "epoch": 34.53521126760563, "grad_norm": 0.3380417823791504, "learning_rate": 2.964132779780929e-07, "loss": 0.263625830411911, "step": 1243 }, { "epoch": 34.563380281690144, "grad_norm": 0.3443485200405121, "learning_rate": 2.9576484845877793e-07, "loss": 0.2503140866756439, "step": 1244 }, { "epoch": 34.59154929577465, "grad_norm": 0.35234031081199646, "learning_rate": 2.9511719338382535e-07, "loss": 0.25954437255859375, "step": 1245 }, { "epoch": 34.61971830985915, "grad_norm": 0.3406411111354828, "learning_rate": 2.944703147261046e-07, "loss": 0.2619974613189697, "step": 1246 }, { "epoch": 34.647887323943664, "grad_norm": 0.3347373306751251, "learning_rate": 2.938242144561201e-07, "loss": 0.2618395984172821, "step": 1247 }, { "epoch": 34.67605633802817, "grad_norm": 0.33204221725463867, "learning_rate": 2.931788945420058e-07, "loss": 0.26617297530174255, "step": 1248 }, { "epoch": 34.70422535211267, "grad_norm": 0.3484657406806946, "learning_rate": 2.925343569495178e-07, "loss": 0.2656903564929962, "step": 1249 }, { "epoch": 34.732394366197184, "grad_norm": 0.3254799544811249, "learning_rate": 2.918906036420294e-07, "loss": 0.24855300784111023, "step": 1250 }, { "epoch": 34.76056338028169, "grad_norm": 0.33594822883605957, "learning_rate": 2.9124763658052474e-07, "loss": 0.2618425786495209, "step": 1251 }, { "epoch": 34.7887323943662, "grad_norm": 0.323949933052063, "learning_rate": 2.9060545772359305e-07, "loss": 0.2546170949935913, "step": 1252 }, { "epoch": 34.816901408450704, "grad_norm": 0.3242202699184418, "learning_rate": 2.8996406902742267e-07, "loss": 0.24211625754833221, "step": 1253 }, { "epoch": 34.84507042253521, "grad_norm": 0.3353058695793152, "learning_rate": 2.893234724457946e-07, "loss": 0.25402140617370605, "step": 1254 }, { "epoch": 34.87323943661972, "grad_norm": 0.33988505601882935, "learning_rate": 2.886836699300771e-07, "loss": 0.24861261248588562, "step": 1255 }, { "epoch": 34.901408450704224, "grad_norm": 0.3339218199253082, "learning_rate": 2.8804466342921987e-07, "loss": 0.25520533323287964, "step": 1256 }, { "epoch": 34.929577464788736, "grad_norm": 0.3448787033557892, "learning_rate": 2.874064548897472e-07, "loss": 0.2663518786430359, "step": 1257 }, { "epoch": 34.95774647887324, "grad_norm": 0.3454734981060028, "learning_rate": 2.86769046255753e-07, "loss": 0.25287461280822754, "step": 1258 }, { "epoch": 34.985915492957744, "grad_norm": 0.3322574496269226, "learning_rate": 2.8613243946889477e-07, "loss": 0.25937291979789734, "step": 1259 }, { "epoch": 35.0, "grad_norm": 0.47356757521629333, "learning_rate": 2.854966364683872e-07, "loss": 0.2588436007499695, "step": 1260 }, { "epoch": 35.028169014084504, "grad_norm": 0.32370901107788086, "learning_rate": 2.848616391909959e-07, "loss": 0.2847004234790802, "step": 1261 }, { "epoch": 35.056338028169016, "grad_norm": 0.3340662717819214, "learning_rate": 2.842274495710335e-07, "loss": 0.24963748455047607, "step": 1262 }, { "epoch": 35.08450704225352, "grad_norm": 0.3470820188522339, "learning_rate": 2.835940695403512e-07, "loss": 0.25704559683799744, "step": 1263 }, { "epoch": 35.11267605633803, "grad_norm": 0.3213740289211273, "learning_rate": 2.829615010283344e-07, "loss": 0.24562162160873413, "step": 1264 }, { "epoch": 35.140845070422536, "grad_norm": 0.3323827385902405, "learning_rate": 2.8232974596189653e-07, "loss": 0.25376367568969727, "step": 1265 }, { "epoch": 35.16901408450704, "grad_norm": 0.32620102167129517, "learning_rate": 2.8169880626547283e-07, "loss": 0.25920748710632324, "step": 1266 }, { "epoch": 35.19718309859155, "grad_norm": 0.34155285358428955, "learning_rate": 2.8106868386101545e-07, "loss": 0.2532484233379364, "step": 1267 }, { "epoch": 35.225352112676056, "grad_norm": 0.32295599579811096, "learning_rate": 2.8043938066798645e-07, "loss": 0.2596886456012726, "step": 1268 }, { "epoch": 35.25352112676056, "grad_norm": 0.3390556871891022, "learning_rate": 2.7981089860335225e-07, "loss": 0.2628597021102905, "step": 1269 }, { "epoch": 35.28169014084507, "grad_norm": 0.3397858738899231, "learning_rate": 2.791832395815782e-07, "loss": 0.260450154542923, "step": 1270 }, { "epoch": 35.309859154929576, "grad_norm": 0.3356383442878723, "learning_rate": 2.7855640551462287e-07, "loss": 0.24709969758987427, "step": 1271 }, { "epoch": 35.33802816901409, "grad_norm": 0.3386112153530121, "learning_rate": 2.7793039831193133e-07, "loss": 0.2554944157600403, "step": 1272 }, { "epoch": 35.36619718309859, "grad_norm": 0.34547311067581177, "learning_rate": 2.773052198804301e-07, "loss": 0.2689363658428192, "step": 1273 }, { "epoch": 35.394366197183096, "grad_norm": 0.34119531512260437, "learning_rate": 2.766808721245211e-07, "loss": 0.2566688656806946, "step": 1274 }, { "epoch": 35.42253521126761, "grad_norm": 0.3342508375644684, "learning_rate": 2.760573569460757e-07, "loss": 0.24888336658477783, "step": 1275 }, { "epoch": 35.45070422535211, "grad_norm": 0.33420711755752563, "learning_rate": 2.7543467624442956e-07, "loss": 0.27446046471595764, "step": 1276 }, { "epoch": 35.478873239436616, "grad_norm": 0.3241899907588959, "learning_rate": 2.7481283191637605e-07, "loss": 0.24648495018482208, "step": 1277 }, { "epoch": 35.50704225352113, "grad_norm": 0.3267020285129547, "learning_rate": 2.741918258561607e-07, "loss": 0.2573559880256653, "step": 1278 }, { "epoch": 35.53521126760563, "grad_norm": 0.3532126247882843, "learning_rate": 2.7357165995547547e-07, "loss": 0.2432764172554016, "step": 1279 }, { "epoch": 35.563380281690144, "grad_norm": 0.33826351165771484, "learning_rate": 2.729523361034538e-07, "loss": 0.25668877363204956, "step": 1280 }, { "epoch": 35.59154929577465, "grad_norm": 0.338796466588974, "learning_rate": 2.7233385618666315e-07, "loss": 0.2522228956222534, "step": 1281 }, { "epoch": 35.61971830985915, "grad_norm": 0.3262656629085541, "learning_rate": 2.717162220891007e-07, "loss": 0.2595973312854767, "step": 1282 }, { "epoch": 35.647887323943664, "grad_norm": 0.3441692590713501, "learning_rate": 2.7109943569218707e-07, "loss": 0.26480039954185486, "step": 1283 }, { "epoch": 35.67605633802817, "grad_norm": 0.3370777368545532, "learning_rate": 2.7048349887476037e-07, "loss": 0.25393831729888916, "step": 1284 }, { "epoch": 35.70422535211267, "grad_norm": 0.34027761220932007, "learning_rate": 2.698684135130713e-07, "loss": 0.24741466343402863, "step": 1285 }, { "epoch": 35.732394366197184, "grad_norm": 0.3438904881477356, "learning_rate": 2.692541814807763e-07, "loss": 0.2620083689689636, "step": 1286 }, { "epoch": 35.76056338028169, "grad_norm": 0.33286988735198975, "learning_rate": 2.686408046489328e-07, "loss": 0.2683720588684082, "step": 1287 }, { "epoch": 35.7887323943662, "grad_norm": 0.3397563397884369, "learning_rate": 2.6802828488599294e-07, "loss": 0.25813597440719604, "step": 1288 }, { "epoch": 35.816901408450704, "grad_norm": 0.34016039967536926, "learning_rate": 2.6741662405779796e-07, "loss": 0.25924018025398254, "step": 1289 }, { "epoch": 35.84507042253521, "grad_norm": 0.3287438452243805, "learning_rate": 2.6680582402757324e-07, "loss": 0.24357835948467255, "step": 1290 }, { "epoch": 35.87323943661972, "grad_norm": 0.3473154306411743, "learning_rate": 2.661958866559213e-07, "loss": 0.25433164834976196, "step": 1291 }, { "epoch": 35.901408450704224, "grad_norm": 0.3320452570915222, "learning_rate": 2.655868138008171e-07, "loss": 0.2620140016078949, "step": 1292 }, { "epoch": 35.929577464788736, "grad_norm": 0.35027673840522766, "learning_rate": 2.649786073176025e-07, "loss": 0.26484349370002747, "step": 1293 }, { "epoch": 35.95774647887324, "grad_norm": 0.34910938143730164, "learning_rate": 2.6437126905897967e-07, "loss": 0.24849724769592285, "step": 1294 }, { "epoch": 35.985915492957744, "grad_norm": 0.3321913480758667, "learning_rate": 2.637648008750062e-07, "loss": 0.24661482870578766, "step": 1295 }, { "epoch": 36.0, "grad_norm": 0.48746395111083984, "learning_rate": 2.631592046130896e-07, "loss": 0.25251615047454834, "step": 1296 }, { "epoch": 36.028169014084504, "grad_norm": 0.3326322138309479, "learning_rate": 2.6255448211798103e-07, "loss": 0.2514849603176117, "step": 1297 }, { "epoch": 36.056338028169016, "grad_norm": 0.323958158493042, "learning_rate": 2.6195063523177e-07, "loss": 0.2420714795589447, "step": 1298 }, { "epoch": 36.08450704225352, "grad_norm": 0.3715856075286865, "learning_rate": 2.613476657938789e-07, "loss": 0.24617412686347961, "step": 1299 }, { "epoch": 36.11267605633803, "grad_norm": 0.34012261033058167, "learning_rate": 2.6074557564105724e-07, "loss": 0.26243406534194946, "step": 1300 }, { "epoch": 36.140845070422536, "grad_norm": 0.33578699827194214, "learning_rate": 2.6014436660737605e-07, "loss": 0.2461467981338501, "step": 1301 }, { "epoch": 36.16901408450704, "grad_norm": 0.3389386832714081, "learning_rate": 2.595440405242222e-07, "loss": 0.2597675025463104, "step": 1302 }, { "epoch": 36.19718309859155, "grad_norm": 0.33628833293914795, "learning_rate": 2.589445992202931e-07, "loss": 0.2510983943939209, "step": 1303 }, { "epoch": 36.225352112676056, "grad_norm": 0.3409932851791382, "learning_rate": 2.583460445215911e-07, "loss": 0.2607109844684601, "step": 1304 }, { "epoch": 36.25352112676056, "grad_norm": 0.3476935625076294, "learning_rate": 2.5774837825141736e-07, "loss": 0.26868295669555664, "step": 1305 }, { "epoch": 36.28169014084507, "grad_norm": 0.3389628231525421, "learning_rate": 2.571516022303671e-07, "loss": 0.24396029114723206, "step": 1306 }, { "epoch": 36.309859154929576, "grad_norm": 0.3351360261440277, "learning_rate": 2.565557182763235e-07, "loss": 0.2638927102088928, "step": 1307 }, { "epoch": 36.33802816901409, "grad_norm": 0.34508877992630005, "learning_rate": 2.5596072820445254e-07, "loss": 0.25982603430747986, "step": 1308 }, { "epoch": 36.36619718309859, "grad_norm": 0.3333590626716614, "learning_rate": 2.5536663382719713e-07, "loss": 0.25606241822242737, "step": 1309 }, { "epoch": 36.394366197183096, "grad_norm": 0.33822396397590637, "learning_rate": 2.547734369542718e-07, "loss": 0.2518611252307892, "step": 1310 }, { "epoch": 36.42253521126761, "grad_norm": 0.3358154594898224, "learning_rate": 2.5418113939265686e-07, "loss": 0.25333690643310547, "step": 1311 }, { "epoch": 36.45070422535211, "grad_norm": 0.33005034923553467, "learning_rate": 2.5358974294659373e-07, "loss": 0.24985584616661072, "step": 1312 }, { "epoch": 36.478873239436616, "grad_norm": 0.3343973159790039, "learning_rate": 2.5299924941757843e-07, "loss": 0.27109482884407043, "step": 1313 }, { "epoch": 36.50704225352113, "grad_norm": 0.33798739314079285, "learning_rate": 2.5240966060435674e-07, "loss": 0.2599262595176697, "step": 1314 }, { "epoch": 36.53521126760563, "grad_norm": 0.33094605803489685, "learning_rate": 2.5182097830291824e-07, "loss": 0.24939575791358948, "step": 1315 }, { "epoch": 36.563380281690144, "grad_norm": 0.3303806185722351, "learning_rate": 2.512332043064913e-07, "loss": 0.2498035877943039, "step": 1316 }, { "epoch": 36.59154929577465, "grad_norm": 0.3437672555446625, "learning_rate": 2.5064634040553767e-07, "loss": 0.26817601919174194, "step": 1317 }, { "epoch": 36.61971830985915, "grad_norm": 0.3672111928462982, "learning_rate": 2.5006038838774647e-07, "loss": 0.2572394609451294, "step": 1318 }, { "epoch": 36.647887323943664, "grad_norm": 0.34106817841529846, "learning_rate": 2.494753500380291e-07, "loss": 0.25872814655303955, "step": 1319 }, { "epoch": 36.67605633802817, "grad_norm": 0.35012519359588623, "learning_rate": 2.488912271385139e-07, "loss": 0.2478848099708557, "step": 1320 }, { "epoch": 36.70422535211267, "grad_norm": 0.3354050815105438, "learning_rate": 2.483080214685404e-07, "loss": 0.2592930793762207, "step": 1321 }, { "epoch": 36.732394366197184, "grad_norm": 0.3539486825466156, "learning_rate": 2.4772573480465445e-07, "loss": 0.24492186307907104, "step": 1322 }, { "epoch": 36.76056338028169, "grad_norm": 0.34425100684165955, "learning_rate": 2.471443689206021e-07, "loss": 0.2586178779602051, "step": 1323 }, { "epoch": 36.7887323943662, "grad_norm": 0.35161006450653076, "learning_rate": 2.465639255873246e-07, "loss": 0.2581009268760681, "step": 1324 }, { "epoch": 36.816901408450704, "grad_norm": 0.3478921949863434, "learning_rate": 2.4598440657295286e-07, "loss": 0.2674616575241089, "step": 1325 }, { "epoch": 36.84507042253521, "grad_norm": 0.35100990533828735, "learning_rate": 2.454058136428027e-07, "loss": 0.27003878355026245, "step": 1326 }, { "epoch": 36.87323943661972, "grad_norm": 0.3363000452518463, "learning_rate": 2.4482814855936834e-07, "loss": 0.2609623968601227, "step": 1327 }, { "epoch": 36.901408450704224, "grad_norm": 0.3406379222869873, "learning_rate": 2.4425141308231765e-07, "loss": 0.2661615014076233, "step": 1328 }, { "epoch": 36.929577464788736, "grad_norm": 0.331514447927475, "learning_rate": 2.43675608968487e-07, "loss": 0.24595093727111816, "step": 1329 }, { "epoch": 36.95774647887324, "grad_norm": 0.33636540174484253, "learning_rate": 2.4310073797187573e-07, "loss": 0.2518694996833801, "step": 1330 }, { "epoch": 36.985915492957744, "grad_norm": 0.3203655779361725, "learning_rate": 2.4252680184364045e-07, "loss": 0.24997392296791077, "step": 1331 }, { "epoch": 37.0, "grad_norm": 0.47873687744140625, "learning_rate": 2.4195380233209006e-07, "loss": 0.24962179362773895, "step": 1332 } ], "logging_steps": 1, "max_steps": 1800, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 1.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.788879174705873e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }