{
"best_global_step": 650,
"best_metric": 0.3949255049228668,
"best_model_checkpoint": "runs/cpt_run_v1/checkpoints/checkpoint-600",
"epoch": 2.0,
"eval_steps": 50,
"global_step": 686,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0029170464904284413,
"grad_norm": 1.1577509641647339,
"learning_rate": 0.0,
"loss": 0.9893555045127869,
"step": 1
},
{
"epoch": 0.005834092980856883,
"grad_norm": 0.9491796493530273,
"learning_rate": 2.8985507246376816e-07,
"loss": 0.8791205883026123,
"step": 2
},
{
"epoch": 0.008751139471285323,
"grad_norm": 1.1600768566131592,
"learning_rate": 5.797101449275363e-07,
"loss": 0.9858248233795166,
"step": 3
},
{
"epoch": 0.011668185961713765,
"grad_norm": 1.2298306226730347,
"learning_rate": 8.695652173913044e-07,
"loss": 1.0516364574432373,
"step": 4
},
{
"epoch": 0.014585232452142206,
"grad_norm": 0.9520533680915833,
"learning_rate": 1.1594202898550726e-06,
"loss": 0.8392249345779419,
"step": 5
},
{
"epoch": 0.017502278942570646,
"grad_norm": 1.2451188564300537,
"learning_rate": 1.4492753623188408e-06,
"loss": 1.0955077409744263,
"step": 6
},
{
"epoch": 0.02041932543299909,
"grad_norm": 1.1123991012573242,
"learning_rate": 1.7391304347826088e-06,
"loss": 0.9201866388320923,
"step": 7
},
{
"epoch": 0.02333637192342753,
"grad_norm": 0.9283139705657959,
"learning_rate": 2.028985507246377e-06,
"loss": 0.9770950078964233,
"step": 8
},
{
"epoch": 0.02625341841385597,
"grad_norm": 0.9589216113090515,
"learning_rate": 2.3188405797101453e-06,
"loss": 0.9442565441131592,
"step": 9
},
{
"epoch": 0.02917046490428441,
"grad_norm": 0.8866703510284424,
"learning_rate": 2.6086956521739132e-06,
"loss": 0.9354464411735535,
"step": 10
},
{
"epoch": 0.03208751139471285,
"grad_norm": 0.7191241383552551,
"learning_rate": 2.8985507246376816e-06,
"loss": 0.7659736275672913,
"step": 11
},
{
"epoch": 0.03500455788514129,
"grad_norm": 0.9110142588615417,
"learning_rate": 3.188405797101449e-06,
"loss": 0.9319326877593994,
"step": 12
},
{
"epoch": 0.03792160437556973,
"grad_norm": 0.8754057288169861,
"learning_rate": 3.4782608695652175e-06,
"loss": 0.9819356203079224,
"step": 13
},
{
"epoch": 0.04083865086599818,
"grad_norm": 0.896181046962738,
"learning_rate": 3.768115942028986e-06,
"loss": 1.026316523551941,
"step": 14
},
{
"epoch": 0.04375569735642662,
"grad_norm": 0.6104832887649536,
"learning_rate": 4.057971014492754e-06,
"loss": 0.8427562713623047,
"step": 15
},
{
"epoch": 0.04667274384685506,
"grad_norm": 0.6529208421707153,
"learning_rate": 4.347826086956522e-06,
"loss": 0.8496565222740173,
"step": 16
},
{
"epoch": 0.0495897903372835,
"grad_norm": 0.6319335699081421,
"learning_rate": 4.637681159420291e-06,
"loss": 0.9139047861099243,
"step": 17
},
{
"epoch": 0.05250683682771194,
"grad_norm": 0.7458649277687073,
"learning_rate": 4.927536231884059e-06,
"loss": 0.8867442011833191,
"step": 18
},
{
"epoch": 0.05542388331814038,
"grad_norm": 0.6179773211479187,
"learning_rate": 5.2173913043478265e-06,
"loss": 0.9579408168792725,
"step": 19
},
{
"epoch": 0.05834092980856882,
"grad_norm": 0.794481635093689,
"learning_rate": 5.507246376811595e-06,
"loss": 0.8736554980278015,
"step": 20
},
{
"epoch": 0.06125797629899726,
"grad_norm": 0.8356145620346069,
"learning_rate": 5.797101449275363e-06,
"loss": 0.9358762502670288,
"step": 21
},
{
"epoch": 0.0641750227894257,
"grad_norm": 0.5891932845115662,
"learning_rate": 6.086956521739132e-06,
"loss": 0.8972038626670837,
"step": 22
},
{
"epoch": 0.06709206927985414,
"grad_norm": 0.6931268572807312,
"learning_rate": 6.376811594202898e-06,
"loss": 0.9583507776260376,
"step": 23
},
{
"epoch": 0.07000911577028258,
"grad_norm": 0.7298229336738586,
"learning_rate": 6.666666666666667e-06,
"loss": 0.8119489550590515,
"step": 24
},
{
"epoch": 0.07292616226071102,
"grad_norm": 0.6419956684112549,
"learning_rate": 6.956521739130435e-06,
"loss": 0.9386100769042969,
"step": 25
},
{
"epoch": 0.07584320875113947,
"grad_norm": 0.7508338689804077,
"learning_rate": 7.246376811594203e-06,
"loss": 0.9272583723068237,
"step": 26
},
{
"epoch": 0.0787602552415679,
"grad_norm": 0.5848079919815063,
"learning_rate": 7.536231884057972e-06,
"loss": 0.8967856168746948,
"step": 27
},
{
"epoch": 0.08167730173199636,
"grad_norm": 0.7384837865829468,
"learning_rate": 7.82608695652174e-06,
"loss": 0.8696568012237549,
"step": 28
},
{
"epoch": 0.0845943482224248,
"grad_norm": 0.5069604516029358,
"learning_rate": 8.115942028985508e-06,
"loss": 0.9121193885803223,
"step": 29
},
{
"epoch": 0.08751139471285324,
"grad_norm": 0.833165168762207,
"learning_rate": 8.405797101449275e-06,
"loss": 0.8180589079856873,
"step": 30
},
{
"epoch": 0.09042844120328168,
"grad_norm": 0.6355920433998108,
"learning_rate": 8.695652173913044e-06,
"loss": 0.8640957474708557,
"step": 31
},
{
"epoch": 0.09334548769371012,
"grad_norm": 1.0429315567016602,
"learning_rate": 8.985507246376812e-06,
"loss": 0.9517915844917297,
"step": 32
},
{
"epoch": 0.09626253418413856,
"grad_norm": 0.5875154733657837,
"learning_rate": 9.275362318840581e-06,
"loss": 0.9443603754043579,
"step": 33
},
{
"epoch": 0.099179580674567,
"grad_norm": 1.9913769960403442,
"learning_rate": 9.565217391304349e-06,
"loss": 0.9510866403579712,
"step": 34
},
{
"epoch": 0.10209662716499544,
"grad_norm": 0.5310097932815552,
"learning_rate": 9.855072463768118e-06,
"loss": 0.8653419613838196,
"step": 35
},
{
"epoch": 0.10501367365542388,
"grad_norm": 0.624421238899231,
"learning_rate": 1.0144927536231885e-05,
"loss": 0.7941208481788635,
"step": 36
},
{
"epoch": 0.10793072014585232,
"grad_norm": 0.6314200758934021,
"learning_rate": 1.0434782608695653e-05,
"loss": 0.8931174278259277,
"step": 37
},
{
"epoch": 0.11084776663628076,
"grad_norm": 0.6272342205047607,
"learning_rate": 1.0724637681159422e-05,
"loss": 0.8978185057640076,
"step": 38
},
{
"epoch": 0.1137648131267092,
"grad_norm": 0.5711184740066528,
"learning_rate": 1.101449275362319e-05,
"loss": 0.808263897895813,
"step": 39
},
{
"epoch": 0.11668185961713765,
"grad_norm": 0.7581208944320679,
"learning_rate": 1.1304347826086957e-05,
"loss": 0.7456756830215454,
"step": 40
},
{
"epoch": 0.11959890610756609,
"grad_norm": 0.4989977180957794,
"learning_rate": 1.1594202898550726e-05,
"loss": 0.8273333311080933,
"step": 41
},
{
"epoch": 0.12251595259799453,
"grad_norm": 0.8602972626686096,
"learning_rate": 1.1884057971014494e-05,
"loss": 0.8514784574508667,
"step": 42
},
{
"epoch": 0.12543299908842298,
"grad_norm": 0.6918581128120422,
"learning_rate": 1.2173913043478263e-05,
"loss": 0.8182265162467957,
"step": 43
},
{
"epoch": 0.1283500455788514,
"grad_norm": 0.653099536895752,
"learning_rate": 1.2463768115942029e-05,
"loss": 0.8242791891098022,
"step": 44
},
{
"epoch": 0.13126709206927986,
"grad_norm": 0.7485584616661072,
"learning_rate": 1.2753623188405797e-05,
"loss": 0.8229591250419617,
"step": 45
},
{
"epoch": 0.1341841385597083,
"grad_norm": 0.6724833250045776,
"learning_rate": 1.3043478260869566e-05,
"loss": 0.8146833181381226,
"step": 46
},
{
"epoch": 0.13710118505013674,
"grad_norm": 0.857208251953125,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.8154427409172058,
"step": 47
},
{
"epoch": 0.14001823154056517,
"grad_norm": 0.5559669137001038,
"learning_rate": 1.3623188405797103e-05,
"loss": 0.879005491733551,
"step": 48
},
{
"epoch": 0.14293527803099362,
"grad_norm": 0.5910897850990295,
"learning_rate": 1.391304347826087e-05,
"loss": 0.8148283362388611,
"step": 49
},
{
"epoch": 0.14585232452142205,
"grad_norm": 0.6478891372680664,
"learning_rate": 1.420289855072464e-05,
"loss": 0.8293006420135498,
"step": 50
},
{
"epoch": 0.14585232452142205,
"eval_loss": 0.7892261147499084,
"eval_runtime": 973.2157,
"eval_samples_per_second": 0.649,
"eval_steps_per_second": 0.649,
"step": 50
},
{
"epoch": 0.1487693710118505,
"grad_norm": 0.757882833480835,
"learning_rate": 1.4492753623188407e-05,
"loss": 0.8114852905273438,
"step": 51
},
{
"epoch": 0.15168641750227893,
"grad_norm": 0.8496116995811462,
"learning_rate": 1.4782608695652174e-05,
"loss": 0.7886185050010681,
"step": 52
},
{
"epoch": 0.15460346399270739,
"grad_norm": 0.6078857183456421,
"learning_rate": 1.5072463768115944e-05,
"loss": 0.7298170924186707,
"step": 53
},
{
"epoch": 0.1575205104831358,
"grad_norm": 0.5856835246086121,
"learning_rate": 1.536231884057971e-05,
"loss": 0.7407160997390747,
"step": 54
},
{
"epoch": 0.16043755697356427,
"grad_norm": 1.0533701181411743,
"learning_rate": 1.565217391304348e-05,
"loss": 0.7057831287384033,
"step": 55
},
{
"epoch": 0.16335460346399272,
"grad_norm": 0.8087610006332397,
"learning_rate": 1.5942028985507246e-05,
"loss": 0.7409019470214844,
"step": 56
},
{
"epoch": 0.16627164995442115,
"grad_norm": 0.629945695400238,
"learning_rate": 1.6231884057971015e-05,
"loss": 0.7768293023109436,
"step": 57
},
{
"epoch": 0.1691886964448496,
"grad_norm": 0.5187911987304688,
"learning_rate": 1.6521739130434785e-05,
"loss": 0.825718104839325,
"step": 58
},
{
"epoch": 0.17210574293527803,
"grad_norm": 0.5866358280181885,
"learning_rate": 1.681159420289855e-05,
"loss": 0.8575979471206665,
"step": 59
},
{
"epoch": 0.17502278942570648,
"grad_norm": 1.5098934173583984,
"learning_rate": 1.710144927536232e-05,
"loss": 0.8058848977088928,
"step": 60
},
{
"epoch": 0.1779398359161349,
"grad_norm": 0.6981958150863647,
"learning_rate": 1.739130434782609e-05,
"loss": 0.7640778422355652,
"step": 61
},
{
"epoch": 0.18085688240656336,
"grad_norm": 0.631349503993988,
"learning_rate": 1.7681159420289858e-05,
"loss": 0.7896331548690796,
"step": 62
},
{
"epoch": 0.1837739288969918,
"grad_norm": 0.6930747032165527,
"learning_rate": 1.7971014492753624e-05,
"loss": 0.6762524247169495,
"step": 63
},
{
"epoch": 0.18669097538742024,
"grad_norm": 0.599399209022522,
"learning_rate": 1.8260869565217393e-05,
"loss": 0.7285035848617554,
"step": 64
},
{
"epoch": 0.18960802187784867,
"grad_norm": 0.6194344758987427,
"learning_rate": 1.8550724637681162e-05,
"loss": 0.7682523131370544,
"step": 65
},
{
"epoch": 0.19252506836827712,
"grad_norm": 0.5691342949867249,
"learning_rate": 1.8840579710144928e-05,
"loss": 0.6791993379592896,
"step": 66
},
{
"epoch": 0.19544211485870555,
"grad_norm": 0.6257390379905701,
"learning_rate": 1.9130434782608697e-05,
"loss": 0.6744828224182129,
"step": 67
},
{
"epoch": 0.198359161349134,
"grad_norm": 0.5871018767356873,
"learning_rate": 1.9420289855072467e-05,
"loss": 0.7317330837249756,
"step": 68
},
{
"epoch": 0.20127620783956243,
"grad_norm": 1.0744612216949463,
"learning_rate": 1.9710144927536236e-05,
"loss": 0.6617178916931152,
"step": 69
},
{
"epoch": 0.2041932543299909,
"grad_norm": 0.675946831703186,
"learning_rate": 2e-05,
"loss": 0.7615712881088257,
"step": 70
},
{
"epoch": 0.2071103008204193,
"grad_norm": 0.7663411498069763,
"learning_rate": 1.9999870372100614e-05,
"loss": 0.7131291627883911,
"step": 71
},
{
"epoch": 0.21002734731084777,
"grad_norm": 0.6725395321846008,
"learning_rate": 1.9999481491763123e-05,
"loss": 0.7452989816665649,
"step": 72
},
{
"epoch": 0.21294439380127622,
"grad_norm": 0.6505664587020874,
"learning_rate": 1.9998833369069483e-05,
"loss": 0.7477136850357056,
"step": 73
},
{
"epoch": 0.21586144029170465,
"grad_norm": 0.7032860517501831,
"learning_rate": 1.9997926020822643e-05,
"loss": 0.6854275465011597,
"step": 74
},
{
"epoch": 0.2187784867821331,
"grad_norm": 0.645345151424408,
"learning_rate": 1.999675947054614e-05,
"loss": 0.7552425265312195,
"step": 75
},
{
"epoch": 0.22169553327256153,
"grad_norm": 0.6620492935180664,
"learning_rate": 1.9995333748483464e-05,
"loss": 0.7262853384017944,
"step": 76
},
{
"epoch": 0.22461257976298998,
"grad_norm": 0.6511455774307251,
"learning_rate": 1.9993648891597284e-05,
"loss": 0.7591732144355774,
"step": 77
},
{
"epoch": 0.2275296262534184,
"grad_norm": 0.6775254011154175,
"learning_rate": 1.9991704943568497e-05,
"loss": 0.7498704195022583,
"step": 78
},
{
"epoch": 0.23044667274384686,
"grad_norm": 0.8199896216392517,
"learning_rate": 1.9989501954795076e-05,
"loss": 0.7238684296607971,
"step": 79
},
{
"epoch": 0.2333637192342753,
"grad_norm": 0.8197569847106934,
"learning_rate": 1.998703998239079e-05,
"loss": 0.7028778195381165,
"step": 80
},
{
"epoch": 0.23628076572470375,
"grad_norm": 0.6602625250816345,
"learning_rate": 1.9984319090183692e-05,
"loss": 0.8842703104019165,
"step": 81
},
{
"epoch": 0.23919781221513217,
"grad_norm": 0.9587129354476929,
"learning_rate": 1.99813393487145e-05,
"loss": 0.732614278793335,
"step": 82
},
{
"epoch": 0.24211485870556063,
"grad_norm": 0.6822189092636108,
"learning_rate": 1.997810083523473e-05,
"loss": 0.7544928193092346,
"step": 83
},
{
"epoch": 0.24503190519598905,
"grad_norm": 0.8980082869529724,
"learning_rate": 1.9974603633704726e-05,
"loss": 0.6704054474830627,
"step": 84
},
{
"epoch": 0.2479489516864175,
"grad_norm": 0.7413425445556641,
"learning_rate": 1.9970847834791472e-05,
"loss": 0.693661093711853,
"step": 85
},
{
"epoch": 0.25086599817684596,
"grad_norm": 0.8314999341964722,
"learning_rate": 1.9966833535866223e-05,
"loss": 0.667654275894165,
"step": 86
},
{
"epoch": 0.25378304466727436,
"grad_norm": 0.7972444891929626,
"learning_rate": 1.9962560841002013e-05,
"loss": 0.8403134942054749,
"step": 87
},
{
"epoch": 0.2567000911577028,
"grad_norm": 0.8519951701164246,
"learning_rate": 1.995802986097093e-05,
"loss": 0.6897370219230652,
"step": 88
},
{
"epoch": 0.25961713764813127,
"grad_norm": 0.8268933892250061,
"learning_rate": 1.995324071324126e-05,
"loss": 0.6690632700920105,
"step": 89
},
{
"epoch": 0.2625341841385597,
"grad_norm": 0.7133983969688416,
"learning_rate": 1.9948193521974436e-05,
"loss": 0.6314147114753723,
"step": 90
},
{
"epoch": 0.2654512306289881,
"grad_norm": 0.889302134513855,
"learning_rate": 1.9942888418021814e-05,
"loss": 0.7389825582504272,
"step": 91
},
{
"epoch": 0.2683682771194166,
"grad_norm": 0.7022432088851929,
"learning_rate": 1.99373255389213e-05,
"loss": 0.6916261911392212,
"step": 92
},
{
"epoch": 0.27128532360984503,
"grad_norm": 0.696432888507843,
"learning_rate": 1.9931505028893748e-05,
"loss": 0.6908476948738098,
"step": 93
},
{
"epoch": 0.2742023701002735,
"grad_norm": 0.7667419910430908,
"learning_rate": 1.9925427038839267e-05,
"loss": 0.6500837206840515,
"step": 94
},
{
"epoch": 0.27711941659070194,
"grad_norm": 0.6974894404411316,
"learning_rate": 1.9919091726333265e-05,
"loss": 0.7059191465377808,
"step": 95
},
{
"epoch": 0.28003646308113034,
"grad_norm": 0.7047077417373657,
"learning_rate": 1.9912499255622397e-05,
"loss": 0.6287837624549866,
"step": 96
},
{
"epoch": 0.2829535095715588,
"grad_norm": 0.7729557156562805,
"learning_rate": 1.990564979762029e-05,
"loss": 0.6738612055778503,
"step": 97
},
{
"epoch": 0.28587055606198725,
"grad_norm": 0.7020529508590698,
"learning_rate": 1.989854352990311e-05,
"loss": 0.662042498588562,
"step": 98
},
{
"epoch": 0.2887876025524157,
"grad_norm": 0.7369800209999084,
"learning_rate": 1.9891180636704975e-05,
"loss": 0.6246830821037292,
"step": 99
},
{
"epoch": 0.2917046490428441,
"grad_norm": 0.7412623167037964,
"learning_rate": 1.9883561308913154e-05,
"loss": 0.6623879075050354,
"step": 100
},
{
"epoch": 0.2917046490428441,
"eval_loss": 0.6552971005439758,
"eval_runtime": 966.7072,
"eval_samples_per_second": 0.654,
"eval_steps_per_second": 0.654,
"step": 100
},
{
"epoch": 0.29462169553327255,
"grad_norm": 0.8428792953491211,
"learning_rate": 1.987568574406314e-05,
"loss": 0.6312171816825867,
"step": 101
},
{
"epoch": 0.297538742023701,
"grad_norm": 0.6948133707046509,
"learning_rate": 1.9867554146333517e-05,
"loss": 0.6266146898269653,
"step": 102
},
{
"epoch": 0.30045578851412946,
"grad_norm": 1.3897597789764404,
"learning_rate": 1.985916672654068e-05,
"loss": 0.6669265031814575,
"step": 103
},
{
"epoch": 0.30337283500455786,
"grad_norm": 0.8838400840759277,
"learning_rate": 1.985052370213334e-05,
"loss": 0.6601086854934692,
"step": 104
},
{
"epoch": 0.3062898814949863,
"grad_norm": 0.8471395373344421,
"learning_rate": 1.9841625297186925e-05,
"loss": 0.5984431505203247,
"step": 105
},
{
"epoch": 0.30920692798541477,
"grad_norm": 0.8940042853355408,
"learning_rate": 1.983247174239774e-05,
"loss": 0.7223822474479675,
"step": 106
},
{
"epoch": 0.3121239744758432,
"grad_norm": 0.7833696603775024,
"learning_rate": 1.9823063275076998e-05,
"loss": 0.6868705749511719,
"step": 107
},
{
"epoch": 0.3150410209662716,
"grad_norm": 0.8794649243354797,
"learning_rate": 1.9813400139144673e-05,
"loss": 0.6246675848960876,
"step": 108
},
{
"epoch": 0.3179580674567001,
"grad_norm": 0.8126057982444763,
"learning_rate": 1.9803482585123165e-05,
"loss": 0.5908697247505188,
"step": 109
},
{
"epoch": 0.32087511394712853,
"grad_norm": 0.7947676777839661,
"learning_rate": 1.979331087013082e-05,
"loss": 0.5751246809959412,
"step": 110
},
{
"epoch": 0.323792160437557,
"grad_norm": 0.713545560836792,
"learning_rate": 1.978288525787524e-05,
"loss": 0.6081106066703796,
"step": 111
},
{
"epoch": 0.32670920692798544,
"grad_norm": 1.011828064918518,
"learning_rate": 1.977220601864647e-05,
"loss": 0.7039169669151306,
"step": 112
},
{
"epoch": 0.32962625341841384,
"grad_norm": 0.730570912361145,
"learning_rate": 1.9761273429309982e-05,
"loss": 0.6140255928039551,
"step": 113
},
{
"epoch": 0.3325432999088423,
"grad_norm": 1.059688687324524,
"learning_rate": 1.9750087773299492e-05,
"loss": 0.648114025592804,
"step": 114
},
{
"epoch": 0.33546034639927075,
"grad_norm": 0.9336895942687988,
"learning_rate": 1.973864934060962e-05,
"loss": 0.622555673122406,
"step": 115
},
{
"epoch": 0.3383773928896992,
"grad_norm": 0.7195945978164673,
"learning_rate": 1.9726958427788367e-05,
"loss": 0.70485520362854,
"step": 116
},
{
"epoch": 0.3412944393801276,
"grad_norm": 0.8101872801780701,
"learning_rate": 1.971501533792942e-05,
"loss": 0.6958848834037781,
"step": 117
},
{
"epoch": 0.34421148587055606,
"grad_norm": 1.6075212955474854,
"learning_rate": 1.970282038066432e-05,
"loss": 0.6021550893783569,
"step": 118
},
{
"epoch": 0.3471285323609845,
"grad_norm": 0.7881433963775635,
"learning_rate": 1.9690373872154396e-05,
"loss": 0.6449777483940125,
"step": 119
},
{
"epoch": 0.35004557885141296,
"grad_norm": 1.014639973640442,
"learning_rate": 1.9677676135082606e-05,
"loss": 0.5939379930496216,
"step": 120
},
{
"epoch": 0.35296262534184136,
"grad_norm": 0.8198449611663818,
"learning_rate": 1.9664727498645144e-05,
"loss": 0.6210286617279053,
"step": 121
},
{
"epoch": 0.3558796718322698,
"grad_norm": 1.0194576978683472,
"learning_rate": 1.9651528298542918e-05,
"loss": 0.624247670173645,
"step": 122
},
{
"epoch": 0.35879671832269827,
"grad_norm": 0.7963470220565796,
"learning_rate": 1.9638078876972842e-05,
"loss": 0.6479315757751465,
"step": 123
},
{
"epoch": 0.3617137648131267,
"grad_norm": 0.9007541537284851,
"learning_rate": 1.9624379582618976e-05,
"loss": 0.6131505370140076,
"step": 124
},
{
"epoch": 0.3646308113035551,
"grad_norm": 0.8712120056152344,
"learning_rate": 1.9610430770643464e-05,
"loss": 0.6249448657035828,
"step": 125
},
{
"epoch": 0.3675478577939836,
"grad_norm": 1.1482540369033813,
"learning_rate": 1.9596232802677347e-05,
"loss": 0.5844688415527344,
"step": 126
},
{
"epoch": 0.37046490428441203,
"grad_norm": 0.8662379384040833,
"learning_rate": 1.9581786046811175e-05,
"loss": 0.6573485732078552,
"step": 127
},
{
"epoch": 0.3733819507748405,
"grad_norm": 0.8191388845443726,
"learning_rate": 1.9567090877585477e-05,
"loss": 0.5896862745285034,
"step": 128
},
{
"epoch": 0.37629899726526894,
"grad_norm": 1.0187078714370728,
"learning_rate": 1.955214767598103e-05,
"loss": 0.613490879535675,
"step": 129
},
{
"epoch": 0.37921604375569734,
"grad_norm": 0.8444119691848755,
"learning_rate": 1.953695682940901e-05,
"loss": 0.727687656879425,
"step": 130
},
{
"epoch": 0.3821330902461258,
"grad_norm": 0.74753737449646,
"learning_rate": 1.9521518731700913e-05,
"loss": 0.6102436780929565,
"step": 131
},
{
"epoch": 0.38505013673655425,
"grad_norm": 1.0166202783584595,
"learning_rate": 1.9505833783098378e-05,
"loss": 0.6244844198226929,
"step": 132
},
{
"epoch": 0.3879671832269827,
"grad_norm": 0.8175772428512573,
"learning_rate": 1.9489902390242793e-05,
"loss": 0.5939282178878784,
"step": 133
},
{
"epoch": 0.3908842297174111,
"grad_norm": 1.0177713632583618,
"learning_rate": 1.947372496616476e-05,
"loss": 0.6418229937553406,
"step": 134
},
{
"epoch": 0.39380127620783956,
"grad_norm": 0.8652453422546387,
"learning_rate": 1.9457301930273376e-05,
"loss": 0.5870395302772522,
"step": 135
},
{
"epoch": 0.396718322698268,
"grad_norm": 0.8378894925117493,
"learning_rate": 1.9440633708345365e-05,
"loss": 0.6480278372764587,
"step": 136
},
{
"epoch": 0.39963536918869647,
"grad_norm": 0.8303541541099548,
"learning_rate": 1.9423720732514052e-05,
"loss": 0.6191359758377075,
"step": 137
},
{
"epoch": 0.40255241567912486,
"grad_norm": 0.8576734662055969,
"learning_rate": 1.9406563441258145e-05,
"loss": 0.5696198344230652,
"step": 138
},
{
"epoch": 0.4054694621695533,
"grad_norm": 0.9558727145195007,
"learning_rate": 1.9389162279390362e-05,
"loss": 0.6177623271942139,
"step": 139
},
{
"epoch": 0.4083865086599818,
"grad_norm": 0.7046042084693909,
"learning_rate": 1.9371517698045922e-05,
"loss": 0.5836521983146667,
"step": 140
},
{
"epoch": 0.4113035551504102,
"grad_norm": 1.0522717237472534,
"learning_rate": 1.935363015467082e-05,
"loss": 0.5728275775909424,
"step": 141
},
{
"epoch": 0.4142206016408386,
"grad_norm": 0.9554787874221802,
"learning_rate": 1.933550011301e-05,
"loss": 0.632586658000946,
"step": 142
},
{
"epoch": 0.4171376481312671,
"grad_norm": 0.8874214291572571,
"learning_rate": 1.9317128043095293e-05,
"loss": 0.5850118398666382,
"step": 143
},
{
"epoch": 0.42005469462169553,
"grad_norm": 1.0708963871002197,
"learning_rate": 1.9298514421233276e-05,
"loss": 0.6260685324668884,
"step": 144
},
{
"epoch": 0.422971741112124,
"grad_norm": 0.8135736584663391,
"learning_rate": 1.9279659729992888e-05,
"loss": 0.6031094193458557,
"step": 145
},
{
"epoch": 0.42588878760255244,
"grad_norm": 0.7971774339675903,
"learning_rate": 1.9260564458192926e-05,
"loss": 0.6101322770118713,
"step": 146
},
{
"epoch": 0.42880583409298084,
"grad_norm": 0.9374974966049194,
"learning_rate": 1.9241229100889397e-05,
"loss": 0.5836313366889954,
"step": 147
},
{
"epoch": 0.4317228805834093,
"grad_norm": 0.8043425679206848,
"learning_rate": 1.9221654159362636e-05,
"loss": 0.6181215047836304,
"step": 148
},
{
"epoch": 0.43463992707383775,
"grad_norm": 0.8923380374908447,
"learning_rate": 1.920184014110436e-05,
"loss": 0.6149677634239197,
"step": 149
},
{
"epoch": 0.4375569735642662,
"grad_norm": 0.8908132314682007,
"learning_rate": 1.918178755980449e-05,
"loss": 0.5899742841720581,
"step": 150
},
{
"epoch": 0.4375569735642662,
"eval_loss": 0.5903874635696411,
"eval_runtime": 1186.9542,
"eval_samples_per_second": 0.532,
"eval_steps_per_second": 0.532,
"step": 150
},
{
"epoch": 0.4404740200546946,
"grad_norm": 1.060531497001648,
"learning_rate": 1.9161496935337808e-05,
"loss": 0.5852696895599365,
"step": 151
},
{
"epoch": 0.44339106654512306,
"grad_norm": 0.9723032712936401,
"learning_rate": 1.914096879375053e-05,
"loss": 0.5822056531906128,
"step": 152
},
{
"epoch": 0.4463081130355515,
"grad_norm": 0.9519931674003601,
"learning_rate": 1.912020366724663e-05,
"loss": 0.6183493137359619,
"step": 153
},
{
"epoch": 0.44922515952597997,
"grad_norm": 0.8282918334007263,
"learning_rate": 1.9099202094174055e-05,
"loss": 0.6229860782623291,
"step": 154
},
{
"epoch": 0.45214220601640837,
"grad_norm": 0.9251292943954468,
"learning_rate": 1.907796461901076e-05,
"loss": 0.6552959680557251,
"step": 155
},
{
"epoch": 0.4550592525068368,
"grad_norm": 1.0349540710449219,
"learning_rate": 1.9056491792350606e-05,
"loss": 0.6170098781585693,
"step": 156
},
{
"epoch": 0.4579762989972653,
"grad_norm": 0.8720711469650269,
"learning_rate": 1.9034784170889076e-05,
"loss": 0.5870137810707092,
"step": 157
},
{
"epoch": 0.46089334548769373,
"grad_norm": 1.0785977840423584,
"learning_rate": 1.9012842317408843e-05,
"loss": 0.5515124201774597,
"step": 158
},
{
"epoch": 0.4638103919781221,
"grad_norm": 1.0634154081344604,
"learning_rate": 1.8990666800765187e-05,
"loss": 0.6073828339576721,
"step": 159
},
{
"epoch": 0.4667274384685506,
"grad_norm": 0.8770879507064819,
"learning_rate": 1.896825819587123e-05,
"loss": 0.5960907936096191,
"step": 160
},
{
"epoch": 0.46964448495897904,
"grad_norm": 1.1225898265838623,
"learning_rate": 1.894561708368305e-05,
"loss": 0.545990526676178,
"step": 161
},
{
"epoch": 0.4725615314494075,
"grad_norm": 0.9373893141746521,
"learning_rate": 1.8922744051184613e-05,
"loss": 0.5566108822822571,
"step": 162
},
{
"epoch": 0.4754785779398359,
"grad_norm": 1.5016087293624878,
"learning_rate": 1.8899639691372545e-05,
"loss": 0.558845043182373,
"step": 163
},
{
"epoch": 0.47839562443026434,
"grad_norm": 0.903020977973938,
"learning_rate": 1.8876304603240773e-05,
"loss": 0.6824233531951904,
"step": 164
},
{
"epoch": 0.4813126709206928,
"grad_norm": 0.8239623308181763,
"learning_rate": 1.8852739391764993e-05,
"loss": 0.5630610585212708,
"step": 165
},
{
"epoch": 0.48422971741112125,
"grad_norm": 0.926069438457489,
"learning_rate": 1.882894466788697e-05,
"loss": 0.6211802363395691,
"step": 166
},
{
"epoch": 0.4871467639015497,
"grad_norm": 1.0098828077316284,
"learning_rate": 1.8804921048498722e-05,
"loss": 0.5513257384300232,
"step": 167
},
{
"epoch": 0.4900638103919781,
"grad_norm": 0.9228141903877258,
"learning_rate": 1.8780669156426517e-05,
"loss": 0.6197121739387512,
"step": 168
},
{
"epoch": 0.49298085688240656,
"grad_norm": 1.0551754236221313,
"learning_rate": 1.8756189620414712e-05,
"loss": 0.5221806764602661,
"step": 169
},
{
"epoch": 0.495897903372835,
"grad_norm": 0.9017496109008789,
"learning_rate": 1.873148307510948e-05,
"loss": 0.5766995549201965,
"step": 170
},
{
"epoch": 0.49881494986326347,
"grad_norm": 0.9704970717430115,
"learning_rate": 1.870655016104233e-05,
"loss": 0.6514763832092285,
"step": 171
},
{
"epoch": 0.5017319963536919,
"grad_norm": 0.9972712397575378,
"learning_rate": 1.8681391524613518e-05,
"loss": 0.5273895263671875,
"step": 172
},
{
"epoch": 0.5046490428441204,
"grad_norm": 0.9473339319229126,
"learning_rate": 1.8656007818075288e-05,
"loss": 0.5548599362373352,
"step": 173
},
{
"epoch": 0.5075660893345487,
"grad_norm": 1.2493574619293213,
"learning_rate": 1.8630399699514944e-05,
"loss": 0.5593586564064026,
"step": 174
},
{
"epoch": 0.5104831358249772,
"grad_norm": 1.2766696214675903,
"learning_rate": 1.860456783283781e-05,
"loss": 0.6054630279541016,
"step": 175
},
{
"epoch": 0.5134001823154056,
"grad_norm": 0.9555240869522095,
"learning_rate": 1.857851288775002e-05,
"loss": 0.508592963218689,
"step": 176
},
{
"epoch": 0.5163172288058341,
"grad_norm": 1.260219931602478,
"learning_rate": 1.8552235539741118e-05,
"loss": 0.5532065629959106,
"step": 177
},
{
"epoch": 0.5192342752962625,
"grad_norm": 1.1859954595565796,
"learning_rate": 1.8525736470066595e-05,
"loss": 0.5683344006538391,
"step": 178
},
{
"epoch": 0.522151321786691,
"grad_norm": 1.3044344186782837,
"learning_rate": 1.8499016365730203e-05,
"loss": 0.5281959772109985,
"step": 179
},
{
"epoch": 0.5250683682771194,
"grad_norm": 1.3049921989440918,
"learning_rate": 1.8472075919466137e-05,
"loss": 0.49621230363845825,
"step": 180
},
{
"epoch": 0.5279854147675479,
"grad_norm": 1.0488537549972534,
"learning_rate": 1.844491582972109e-05,
"loss": 0.6194032430648804,
"step": 181
},
{
"epoch": 0.5309024612579762,
"grad_norm": 1.5553455352783203,
"learning_rate": 1.8417536800636138e-05,
"loss": 0.5645846724510193,
"step": 182
},
{
"epoch": 0.5338195077484047,
"grad_norm": 1.2673912048339844,
"learning_rate": 1.8389939542028484e-05,
"loss": 0.6267315745353699,
"step": 183
},
{
"epoch": 0.5367365542388332,
"grad_norm": 1.0273847579956055,
"learning_rate": 1.8362124769373064e-05,
"loss": 0.5256403684616089,
"step": 184
},
{
"epoch": 0.5396536007292616,
"grad_norm": 1.006093978881836,
"learning_rate": 1.8334093203783986e-05,
"loss": 0.5916382074356079,
"step": 185
},
{
"epoch": 0.5425706472196901,
"grad_norm": 1.2740857601165771,
"learning_rate": 1.8305845571995843e-05,
"loss": 0.581648588180542,
"step": 186
},
{
"epoch": 0.5454876937101185,
"grad_norm": 1.494248390197754,
"learning_rate": 1.8277382606344872e-05,
"loss": 0.4824523627758026,
"step": 187
},
{
"epoch": 0.548404740200547,
"grad_norm": 1.1862496137619019,
"learning_rate": 1.824870504474996e-05,
"loss": 0.5531858205795288,
"step": 188
},
{
"epoch": 0.5513217866909754,
"grad_norm": 3.503049373626709,
"learning_rate": 1.8219813630693523e-05,
"loss": 0.6308296918869019,
"step": 189
},
{
"epoch": 0.5542388331814039,
"grad_norm": 1.7544710636138916,
"learning_rate": 1.819070911320222e-05,
"loss": 0.6146273016929626,
"step": 190
},
{
"epoch": 0.5571558796718322,
"grad_norm": 1.3367774486541748,
"learning_rate": 1.8161392246827546e-05,
"loss": 0.5848966240882874,
"step": 191
},
{
"epoch": 0.5600729261622607,
"grad_norm": 1.696418046951294,
"learning_rate": 1.8131863791626263e-05,
"loss": 0.6621730327606201,
"step": 192
},
{
"epoch": 0.5629899726526891,
"grad_norm": 1.360052227973938,
"learning_rate": 1.8102124513140694e-05,
"loss": 0.5972204208374023,
"step": 193
},
{
"epoch": 0.5659070191431176,
"grad_norm": 1.5376263856887817,
"learning_rate": 1.807217518237888e-05,
"loss": 0.4938785433769226,
"step": 194
},
{
"epoch": 0.568824065633546,
"grad_norm": 1.2249681949615479,
"learning_rate": 1.8042016575794585e-05,
"loss": 0.5366095304489136,
"step": 195
},
{
"epoch": 0.5717411121239745,
"grad_norm": 1.7868080139160156,
"learning_rate": 1.8011649475267178e-05,
"loss": 0.5116773843765259,
"step": 196
},
{
"epoch": 0.574658158614403,
"grad_norm": 2.369993209838867,
"learning_rate": 1.7981074668081345e-05,
"loss": 0.49072742462158203,
"step": 197
},
{
"epoch": 0.5775752051048314,
"grad_norm": 1.0168434381484985,
"learning_rate": 1.7950292946906695e-05,
"loss": 0.5691611170768738,
"step": 198
},
{
"epoch": 0.5804922515952597,
"grad_norm": 1.2990851402282715,
"learning_rate": 1.7919305109777195e-05,
"loss": 0.5515039563179016,
"step": 199
},
{
"epoch": 0.5834092980856882,
"grad_norm": 1.4859853982925415,
"learning_rate": 1.7888111960070493e-05,
"loss": 0.5017011165618896,
"step": 200
},
{
"epoch": 0.5834092980856882,
"eval_loss": 0.5414339303970337,
"eval_runtime": 1180.7894,
"eval_samples_per_second": 0.535,
"eval_steps_per_second": 0.535,
"step": 200
},
{
"epoch": 0.5863263445761167,
"grad_norm": 1.0065829753875732,
"learning_rate": 1.7856714306487088e-05,
"loss": 0.5677731037139893,
"step": 201
},
{
"epoch": 0.5892433910665451,
"grad_norm": 1.1727538108825684,
"learning_rate": 1.7825112963029352e-05,
"loss": 0.4525509476661682,
"step": 202
},
{
"epoch": 0.5921604375569736,
"grad_norm": 1.3376752138137817,
"learning_rate": 1.7793308748980437e-05,
"loss": 0.5208959579467773,
"step": 203
},
{
"epoch": 0.595077484047402,
"grad_norm": 0.9196159839630127,
"learning_rate": 1.776130248888304e-05,
"loss": 0.6033903360366821,
"step": 204
},
{
"epoch": 0.5979945305378305,
"grad_norm": 1.0750919580459595,
"learning_rate": 1.772909501251801e-05,
"loss": 0.5449609160423279,
"step": 205
},
{
"epoch": 0.6009115770282589,
"grad_norm": 1.2459467649459839,
"learning_rate": 1.769668715488285e-05,
"loss": 0.5685338377952576,
"step": 206
},
{
"epoch": 0.6038286235186874,
"grad_norm": 1.1690552234649658,
"learning_rate": 1.766407975617006e-05,
"loss": 0.5240382552146912,
"step": 207
},
{
"epoch": 0.6067456700091157,
"grad_norm": 1.0816599130630493,
"learning_rate": 1.7631273661745362e-05,
"loss": 0.6802893877029419,
"step": 208
},
{
"epoch": 0.6096627164995442,
"grad_norm": 1.3662947416305542,
"learning_rate": 1.7598269722125775e-05,
"loss": 0.48193931579589844,
"step": 209
},
{
"epoch": 0.6125797629899726,
"grad_norm": 0.9364766478538513,
"learning_rate": 1.7565068792957576e-05,
"loss": 0.5675849914550781,
"step": 210
},
{
"epoch": 0.6154968094804011,
"grad_norm": 1.123828411102295,
"learning_rate": 1.75316717349941e-05,
"loss": 0.5474762916564941,
"step": 211
},
{
"epoch": 0.6184138559708295,
"grad_norm": 1.1924363374710083,
"learning_rate": 1.749807941407345e-05,
"loss": 0.4918654263019562,
"step": 212
},
{
"epoch": 0.621330902461258,
"grad_norm": 1.101293921470642,
"learning_rate": 1.7464292701096014e-05,
"loss": 0.5742691159248352,
"step": 213
},
{
"epoch": 0.6242479489516864,
"grad_norm": 1.7374963760375977,
"learning_rate": 1.7430312472001928e-05,
"loss": 0.5828965902328491,
"step": 214
},
{
"epoch": 0.6271649954421149,
"grad_norm": 1.3195666074752808,
"learning_rate": 1.739613960774833e-05,
"loss": 0.5265159010887146,
"step": 215
},
{
"epoch": 0.6300820419325432,
"grad_norm": 1.254686713218689,
"learning_rate": 1.7361774994286545e-05,
"loss": 0.4929371476173401,
"step": 216
},
{
"epoch": 0.6329990884229717,
"grad_norm": 1.1476380825042725,
"learning_rate": 1.7327219522539102e-05,
"loss": 0.5060417652130127,
"step": 217
},
{
"epoch": 0.6359161349134002,
"grad_norm": 1.0914150476455688,
"learning_rate": 1.7292474088376643e-05,
"loss": 0.504043698310852,
"step": 218
},
{
"epoch": 0.6388331814038286,
"grad_norm": 1.1339508295059204,
"learning_rate": 1.7257539592594698e-05,
"loss": 0.4797310531139374,
"step": 219
},
{
"epoch": 0.6417502278942571,
"grad_norm": 1.0805399417877197,
"learning_rate": 1.722241694089033e-05,
"loss": 0.5878555178642273,
"step": 220
},
{
"epoch": 0.6446672743846855,
"grad_norm": 1.8615056276321411,
"learning_rate": 1.718710704383865e-05,
"loss": 0.5005823969841003,
"step": 221
},
{
"epoch": 0.647584320875114,
"grad_norm": 1.1445401906967163,
"learning_rate": 1.7151610816869214e-05,
"loss": 0.4949319064617157,
"step": 222
},
{
"epoch": 0.6505013673655424,
"grad_norm": 0.9726515412330627,
"learning_rate": 1.711592918024229e-05,
"loss": 0.5073204040527344,
"step": 223
},
{
"epoch": 0.6534184138559709,
"grad_norm": 1.4491140842437744,
"learning_rate": 1.7080063059024998e-05,
"loss": 0.47885262966156006,
"step": 224
},
{
"epoch": 0.6563354603463992,
"grad_norm": 1.0070592164993286,
"learning_rate": 1.7044013383067327e-05,
"loss": 0.5775837898254395,
"step": 225
},
{
"epoch": 0.6592525068368277,
"grad_norm": 0.966221272945404,
"learning_rate": 1.7007781086978037e-05,
"loss": 0.5050399899482727,
"step": 226
},
{
"epoch": 0.6621695533272561,
"grad_norm": 0.9808815121650696,
"learning_rate": 1.6971367110100407e-05,
"loss": 0.5737045407295227,
"step": 227
},
{
"epoch": 0.6650865998176846,
"grad_norm": 1.0158127546310425,
"learning_rate": 1.6934772396487906e-05,
"loss": 0.48077821731567383,
"step": 228
},
{
"epoch": 0.668003646308113,
"grad_norm": 1.32015860080719,
"learning_rate": 1.6897997894879706e-05,
"loss": 0.5614925026893616,
"step": 229
},
{
"epoch": 0.6709206927985415,
"grad_norm": 1.1055903434753418,
"learning_rate": 1.686104455867608e-05,
"loss": 0.4970760643482208,
"step": 230
},
{
"epoch": 0.67383773928897,
"grad_norm": 1.0804500579833984,
"learning_rate": 1.682391334591371e-05,
"loss": 0.5540452003479004,
"step": 231
},
{
"epoch": 0.6767547857793984,
"grad_norm": 1.1906245946884155,
"learning_rate": 1.6786605219240807e-05,
"loss": 0.5778501033782959,
"step": 232
},
{
"epoch": 0.6796718322698267,
"grad_norm": 0.9758645296096802,
"learning_rate": 1.6749121145892192e-05,
"loss": 0.49073565006256104,
"step": 233
},
{
"epoch": 0.6825888787602552,
"grad_norm": 1.1678364276885986,
"learning_rate": 1.6711462097664207e-05,
"loss": 0.4828741252422333,
"step": 234
},
{
"epoch": 0.6855059252506837,
"grad_norm": 1.148301362991333,
"learning_rate": 1.6673629050889507e-05,
"loss": 0.5143818855285645,
"step": 235
},
{
"epoch": 0.6884229717411121,
"grad_norm": 1.005898356437683,
"learning_rate": 1.6635622986411776e-05,
"loss": 0.5301160216331482,
"step": 236
},
{
"epoch": 0.6913400182315406,
"grad_norm": 1.2227320671081543,
"learning_rate": 1.659744488956027e-05,
"loss": 0.4800386130809784,
"step": 237
},
{
"epoch": 0.694257064721969,
"grad_norm": 0.986456573009491,
"learning_rate": 1.6559095750124296e-05,
"loss": 0.5098081827163696,
"step": 238
},
{
"epoch": 0.6971741112123975,
"grad_norm": 1.1474376916885376,
"learning_rate": 1.6520576562327518e-05,
"loss": 0.5147273540496826,
"step": 239
},
{
"epoch": 0.7000911577028259,
"grad_norm": 1.10917067527771,
"learning_rate": 1.6481888324802223e-05,
"loss": 0.5023190379142761,
"step": 240
},
{
"epoch": 0.7030082041932544,
"grad_norm": 1.2339262962341309,
"learning_rate": 1.644303204056341e-05,
"loss": 0.5282092690467834,
"step": 241
},
{
"epoch": 0.7059252506836827,
"grad_norm": 0.997941255569458,
"learning_rate": 1.640400871698277e-05,
"loss": 0.5635963082313538,
"step": 242
},
{
"epoch": 0.7088422971741112,
"grad_norm": 1.0345823764801025,
"learning_rate": 1.63648193657626e-05,
"loss": 0.5577977895736694,
"step": 243
},
{
"epoch": 0.7117593436645396,
"grad_norm": 1.3468303680419922,
"learning_rate": 1.6325465002909554e-05,
"loss": 0.4365362524986267,
"step": 244
},
{
"epoch": 0.7146763901549681,
"grad_norm": 1.2817128896713257,
"learning_rate": 1.628594664870831e-05,
"loss": 0.46069926023483276,
"step": 245
},
{
"epoch": 0.7175934366453965,
"grad_norm": 1.043311357498169,
"learning_rate": 1.6246265327695117e-05,
"loss": 0.5476971864700317,
"step": 246
},
{
"epoch": 0.720510483135825,
"grad_norm": 1.0297389030456543,
"learning_rate": 1.620642206863124e-05,
"loss": 0.48051249980926514,
"step": 247
},
{
"epoch": 0.7234275296262535,
"grad_norm": 1.4869836568832397,
"learning_rate": 1.6166417904476257e-05,
"loss": 0.5683314800262451,
"step": 248
},
{
"epoch": 0.7263445761166819,
"grad_norm": 1.0628005266189575,
"learning_rate": 1.6126253872361336e-05,
"loss": 0.5277887582778931,
"step": 249
},
{
"epoch": 0.7292616226071102,
"grad_norm": 1.2682170867919922,
"learning_rate": 1.608593101356229e-05,
"loss": 0.5048879384994507,
"step": 250
},
{
"epoch": 0.7292616226071102,
"eval_loss": 0.5038471221923828,
"eval_runtime": 1175.0375,
"eval_samples_per_second": 0.538,
"eval_steps_per_second": 0.538,
"step": 250
},
{
"epoch": 0.7321786690975387,
"grad_norm": 1.7376199960708618,
"learning_rate": 1.6045450373472626e-05,
"loss": 0.5093721151351929,
"step": 251
},
{
"epoch": 0.7350957155879672,
"grad_norm": 1.6047718524932861,
"learning_rate": 1.6004813001576405e-05,
"loss": 0.4796055555343628,
"step": 252
},
{
"epoch": 0.7380127620783956,
"grad_norm": 1.3582886457443237,
"learning_rate": 1.5964019951421058e-05,
"loss": 0.4733014702796936,
"step": 253
},
{
"epoch": 0.7409298085688241,
"grad_norm": 0.9468897581100464,
"learning_rate": 1.5923072280590072e-05,
"loss": 0.5312032103538513,
"step": 254
},
{
"epoch": 0.7438468550592525,
"grad_norm": 1.3890198469161987,
"learning_rate": 1.5881971050675547e-05,
"loss": 0.47576645016670227,
"step": 255
},
{
"epoch": 0.746763901549681,
"grad_norm": 1.782992959022522,
"learning_rate": 1.584071732725071e-05,
"loss": 0.5555092096328735,
"step": 256
},
{
"epoch": 0.7496809480401094,
"grad_norm": 1.1790621280670166,
"learning_rate": 1.5799312179842265e-05,
"loss": 0.5148727893829346,
"step": 257
},
{
"epoch": 0.7525979945305379,
"grad_norm": 1.446694254875183,
"learning_rate": 1.5757756681902664e-05,
"loss": 0.49939870834350586,
"step": 258
},
{
"epoch": 0.7555150410209662,
"grad_norm": 1.1786166429519653,
"learning_rate": 1.571605191078229e-05,
"loss": 0.562156081199646,
"step": 259
},
{
"epoch": 0.7584320875113947,
"grad_norm": 1.16925847530365,
"learning_rate": 1.567419894770151e-05,
"loss": 0.49580734968185425,
"step": 260
},
{
"epoch": 0.7613491340018231,
"grad_norm": 1.60944664478302,
"learning_rate": 1.5632198877722676e-05,
"loss": 0.4821680784225464,
"step": 261
},
{
"epoch": 0.7642661804922516,
"grad_norm": 1.3957884311676025,
"learning_rate": 1.5590052789721946e-05,
"loss": 0.4392276406288147,
"step": 262
},
{
"epoch": 0.76718322698268,
"grad_norm": 1.636195421218872,
"learning_rate": 1.5547761776361096e-05,
"loss": 0.39603114128112793,
"step": 263
},
{
"epoch": 0.7701002734731085,
"grad_norm": 1.496766448020935,
"learning_rate": 1.550532693405917e-05,
"loss": 0.4833749234676361,
"step": 264
},
{
"epoch": 0.773017319963537,
"grad_norm": 1.3587844371795654,
"learning_rate": 1.5462749362964058e-05,
"loss": 0.43738317489624023,
"step": 265
},
{
"epoch": 0.7759343664539654,
"grad_norm": 1.670704960823059,
"learning_rate": 1.5420030166923983e-05,
"loss": 0.4476737380027771,
"step": 266
},
{
"epoch": 0.7788514129443938,
"grad_norm": 1.2674932479858398,
"learning_rate": 1.537717045345888e-05,
"loss": 0.42266708612442017,
"step": 267
},
{
"epoch": 0.7817684594348222,
"grad_norm": 2.0639536380767822,
"learning_rate": 1.5334171333731666e-05,
"loss": 0.5245381593704224,
"step": 268
},
{
"epoch": 0.7846855059252507,
"grad_norm": 1.2091766595840454,
"learning_rate": 1.529103392251946e-05,
"loss": 0.5166443586349487,
"step": 269
},
{
"epoch": 0.7876025524156791,
"grad_norm": 1.1021631956100464,
"learning_rate": 1.5247759338184653e-05,
"loss": 0.5674265027046204,
"step": 270
},
{
"epoch": 0.7905195989061076,
"grad_norm": 1.3143829107284546,
"learning_rate": 1.520434870264595e-05,
"loss": 0.40855613350868225,
"step": 271
},
{
"epoch": 0.793436645396536,
"grad_norm": 1.1784812211990356,
"learning_rate": 1.5160803141349244e-05,
"loss": 0.4308925271034241,
"step": 272
},
{
"epoch": 0.7963536918869645,
"grad_norm": 2.1635706424713135,
"learning_rate": 1.5117123783238458e-05,
"loss": 0.45035502314567566,
"step": 273
},
{
"epoch": 0.7992707383773929,
"grad_norm": 1.569203495979309,
"learning_rate": 1.5073311760726287e-05,
"loss": 0.5095728635787964,
"step": 274
},
{
"epoch": 0.8021877848678214,
"grad_norm": 2.532621383666992,
"learning_rate": 1.5029368209664822e-05,
"loss": 0.496748685836792,
"step": 275
},
{
"epoch": 0.8051048313582497,
"grad_norm": 1.6312552690505981,
"learning_rate": 1.4985294269316098e-05,
"loss": 0.4972914159297943,
"step": 276
},
{
"epoch": 0.8080218778486782,
"grad_norm": 1.3996756076812744,
"learning_rate": 1.4941091082322579e-05,
"loss": 0.5589750409126282,
"step": 277
},
{
"epoch": 0.8109389243391066,
"grad_norm": 1.1288363933563232,
"learning_rate": 1.4896759794677526e-05,
"loss": 0.5349453687667847,
"step": 278
},
{
"epoch": 0.8138559708295351,
"grad_norm": 1.6913920640945435,
"learning_rate": 1.4852301555695268e-05,
"loss": 0.46511000394821167,
"step": 279
},
{
"epoch": 0.8167730173199635,
"grad_norm": 1.1913212537765503,
"learning_rate": 1.4807717517981439e-05,
"loss": 0.4715422987937927,
"step": 280
},
{
"epoch": 0.819690063810392,
"grad_norm": 1.1179691553115845,
"learning_rate": 1.476300883740307e-05,
"loss": 0.53330397605896,
"step": 281
},
{
"epoch": 0.8226071103008205,
"grad_norm": 1.7473797798156738,
"learning_rate": 1.4718176673058624e-05,
"loss": 0.47564437985420227,
"step": 282
},
{
"epoch": 0.8255241567912489,
"grad_norm": 1.2653177976608276,
"learning_rate": 1.4673222187247963e-05,
"loss": 0.46364277601242065,
"step": 283
},
{
"epoch": 0.8284412032816773,
"grad_norm": 1.2567330598831177,
"learning_rate": 1.4628146545442202e-05,
"loss": 0.4778091013431549,
"step": 284
},
{
"epoch": 0.8313582497721057,
"grad_norm": 1.5848406553268433,
"learning_rate": 1.4582950916253488e-05,
"loss": 0.4480203688144684,
"step": 285
},
{
"epoch": 0.8342752962625342,
"grad_norm": 1.3278183937072754,
"learning_rate": 1.453763647140472e-05,
"loss": 0.37945032119750977,
"step": 286
},
{
"epoch": 0.8371923427529626,
"grad_norm": 1.0961651802062988,
"learning_rate": 1.4492204385699155e-05,
"loss": 0.5306747555732727,
"step": 287
},
{
"epoch": 0.8401093892433911,
"grad_norm": 1.176276683807373,
"learning_rate": 1.4446655836989961e-05,
"loss": 0.49950045347213745,
"step": 288
},
{
"epoch": 0.8430264357338195,
"grad_norm": 1.2228269577026367,
"learning_rate": 1.4400992006149674e-05,
"loss": 0.494475394487381,
"step": 289
},
{
"epoch": 0.845943482224248,
"grad_norm": 1.1584209203720093,
"learning_rate": 1.4355214077039592e-05,
"loss": 0.44170859456062317,
"step": 290
},
{
"epoch": 0.8488605287146764,
"grad_norm": 1.2041938304901123,
"learning_rate": 1.4309323236479071e-05,
"loss": 0.4359871745109558,
"step": 291
},
{
"epoch": 0.8517775752051049,
"grad_norm": 1.279645562171936,
"learning_rate": 1.4263320674214762e-05,
"loss": 0.45031386613845825,
"step": 292
},
{
"epoch": 0.8546946216955332,
"grad_norm": 1.3958357572555542,
"learning_rate": 1.4217207582889769e-05,
"loss": 0.4832204580307007,
"step": 293
},
{
"epoch": 0.8576116681859617,
"grad_norm": 1.2788586616516113,
"learning_rate": 1.4170985158012725e-05,
"loss": 0.5154346227645874,
"step": 294
},
{
"epoch": 0.8605287146763901,
"grad_norm": 1.3634892702102661,
"learning_rate": 1.4124654597926795e-05,
"loss": 0.46777206659317017,
"step": 295
},
{
"epoch": 0.8634457611668186,
"grad_norm": 1.2719579935073853,
"learning_rate": 1.4078217103778619e-05,
"loss": 0.4247053265571594,
"step": 296
},
{
"epoch": 0.866362807657247,
"grad_norm": 2.890467643737793,
"learning_rate": 1.4031673879487161e-05,
"loss": 0.38349640369415283,
"step": 297
},
{
"epoch": 0.8692798541476755,
"grad_norm": 2.4354801177978516,
"learning_rate": 1.3985026131712499e-05,
"loss": 0.4134889543056488,
"step": 298
},
{
"epoch": 0.872196900638104,
"grad_norm": 1.0138323307037354,
"learning_rate": 1.3938275069824541e-05,
"loss": 0.5176680684089661,
"step": 299
},
{
"epoch": 0.8751139471285324,
"grad_norm": 1.2316186428070068,
"learning_rate": 1.389142190587168e-05,
"loss": 0.4818477928638458,
"step": 300
},
{
"epoch": 0.8751139471285324,
"eval_loss": 0.4752846360206604,
"eval_runtime": 1189.1666,
"eval_samples_per_second": 0.531,
"eval_steps_per_second": 0.531,
"step": 300
},
{
"epoch": 0.8780309936189608,
"grad_norm": 1.515487551689148,
"learning_rate": 1.384446785454936e-05,
"loss": 0.47766175866127014,
"step": 301
},
{
"epoch": 0.8809480401093892,
"grad_norm": 1.4357497692108154,
"learning_rate": 1.3797414133168591e-05,
"loss": 0.49297061562538147,
"step": 302
},
{
"epoch": 0.8838650865998177,
"grad_norm": 1.2523037195205688,
"learning_rate": 1.3750261961624383e-05,
"loss": 0.4629015326499939,
"step": 303
},
{
"epoch": 0.8867821330902461,
"grad_norm": 3.5790023803710938,
"learning_rate": 1.3703012562364124e-05,
"loss": 0.3773120045661926,
"step": 304
},
{
"epoch": 0.8896991795806746,
"grad_norm": 1.9305704832077026,
"learning_rate": 1.3655667160355892e-05,
"loss": 0.496719628572464,
"step": 305
},
{
"epoch": 0.892616226071103,
"grad_norm": 1.1506154537200928,
"learning_rate": 1.3608226983056687e-05,
"loss": 0.49487072229385376,
"step": 306
},
{
"epoch": 0.8955332725615315,
"grad_norm": 1.8046090602874756,
"learning_rate": 1.3560693260380614e-05,
"loss": 0.4910697937011719,
"step": 307
},
{
"epoch": 0.8984503190519599,
"grad_norm": 2.0088653564453125,
"learning_rate": 1.3513067224667e-05,
"loss": 0.508246660232544,
"step": 308
},
{
"epoch": 0.9013673655423883,
"grad_norm": 1.2966033220291138,
"learning_rate": 1.3465350110648437e-05,
"loss": 0.5125166177749634,
"step": 309
},
{
"epoch": 0.9042844120328167,
"grad_norm": 1.9976309537887573,
"learning_rate": 1.3417543155418775e-05,
"loss": 0.43942537903785706,
"step": 310
},
{
"epoch": 0.9072014585232452,
"grad_norm": 1.2663682699203491,
"learning_rate": 1.336964759840105e-05,
"loss": 0.4839101731777191,
"step": 311
},
{
"epoch": 0.9101185050136736,
"grad_norm": 1.1223328113555908,
"learning_rate": 1.3321664681315354e-05,
"loss": 0.48008066415786743,
"step": 312
},
{
"epoch": 0.9130355515041021,
"grad_norm": 1.5786972045898438,
"learning_rate": 1.3273595648146634e-05,
"loss": 0.47250309586524963,
"step": 313
},
{
"epoch": 0.9159525979945305,
"grad_norm": 1.2150241136550903,
"learning_rate": 1.322544174511245e-05,
"loss": 0.5149738788604736,
"step": 314
},
{
"epoch": 0.918869644484959,
"grad_norm": 1.3676542043685913,
"learning_rate": 1.3177204220630662e-05,
"loss": 0.4430195093154907,
"step": 315
},
{
"epoch": 0.9217866909753875,
"grad_norm": 1.0703285932540894,
"learning_rate": 1.3128884325287064e-05,
"loss": 0.4798983037471771,
"step": 316
},
{
"epoch": 0.9247037374658159,
"grad_norm": 1.3131535053253174,
"learning_rate": 1.308048331180296e-05,
"loss": 0.4241073727607727,
"step": 317
},
{
"epoch": 0.9276207839562443,
"grad_norm": 1.4485348463058472,
"learning_rate": 1.3032002435002698e-05,
"loss": 0.527199923992157,
"step": 318
},
{
"epoch": 0.9305378304466727,
"grad_norm": 1.370936393737793,
"learning_rate": 1.2983442951781114e-05,
"loss": 0.47125962376594543,
"step": 319
},
{
"epoch": 0.9334548769371012,
"grad_norm": 1.2369643449783325,
"learning_rate": 1.2934806121070973e-05,
"loss": 0.4814244210720062,
"step": 320
},
{
"epoch": 0.9363719234275296,
"grad_norm": 1.2632933855056763,
"learning_rate": 1.2886093203810314e-05,
"loss": 0.4915548264980316,
"step": 321
},
{
"epoch": 0.9392889699179581,
"grad_norm": 1.054569959640503,
"learning_rate": 1.2837305462909764e-05,
"loss": 0.5325602293014526,
"step": 322
},
{
"epoch": 0.9422060164083865,
"grad_norm": 1.15959632396698,
"learning_rate": 1.27884441632198e-05,
"loss": 0.43607404828071594,
"step": 323
},
{
"epoch": 0.945123062898815,
"grad_norm": 1.1667979955673218,
"learning_rate": 1.2739510571497945e-05,
"loss": 0.4631507992744446,
"step": 324
},
{
"epoch": 0.9480401093892434,
"grad_norm": 1.6009081602096558,
"learning_rate": 1.2690505956375944e-05,
"loss": 0.4935731887817383,
"step": 325
},
{
"epoch": 0.9509571558796718,
"grad_norm": 1.1193996667861938,
"learning_rate": 1.2641431588326858e-05,
"loss": 0.45883435010910034,
"step": 326
},
{
"epoch": 0.9538742023701002,
"grad_norm": 1.5365067720413208,
"learning_rate": 1.2592288739632138e-05,
"loss": 0.5206276178359985,
"step": 327
},
{
"epoch": 0.9567912488605287,
"grad_norm": 1.0714622735977173,
"learning_rate": 1.2543078684348632e-05,
"loss": 0.5242853760719299,
"step": 328
},
{
"epoch": 0.9597082953509571,
"grad_norm": 1.3009248971939087,
"learning_rate": 1.2493802698275557e-05,
"loss": 0.4794357717037201,
"step": 329
},
{
"epoch": 0.9626253418413856,
"grad_norm": 1.495771050453186,
"learning_rate": 1.244446205892143e-05,
"loss": 0.5849282145500183,
"step": 330
},
{
"epoch": 0.965542388331814,
"grad_norm": 1.2046003341674805,
"learning_rate": 1.2395058045470935e-05,
"loss": 0.47758305072784424,
"step": 331
},
{
"epoch": 0.9684594348222425,
"grad_norm": 1.1362569332122803,
"learning_rate": 1.2345591938751772e-05,
"loss": 0.4490663409233093,
"step": 332
},
{
"epoch": 0.971376481312671,
"grad_norm": 1.2658129930496216,
"learning_rate": 1.2296065021201438e-05,
"loss": 0.4035309851169586,
"step": 333
},
{
"epoch": 0.9742935278030994,
"grad_norm": 4.370306015014648,
"learning_rate": 1.2246478576833993e-05,
"loss": 0.495273619890213,
"step": 334
},
{
"epoch": 0.9772105742935278,
"grad_norm": 1.3863654136657715,
"learning_rate": 1.219683389120676e-05,
"loss": 0.46410733461380005,
"step": 335
},
{
"epoch": 0.9801276207839562,
"grad_norm": 1.4544321298599243,
"learning_rate": 1.2147132251387004e-05,
"loss": 0.4301709830760956,
"step": 336
},
{
"epoch": 0.9830446672743847,
"grad_norm": 1.0852457284927368,
"learning_rate": 1.2097374945918554e-05,
"loss": 0.48892468214035034,
"step": 337
},
{
"epoch": 0.9859617137648131,
"grad_norm": 1.5062257051467896,
"learning_rate": 1.2047563264788412e-05,
"loss": 0.4667983055114746,
"step": 338
},
{
"epoch": 0.9888787602552416,
"grad_norm": 1.2472951412200928,
"learning_rate": 1.199769849939329e-05,
"loss": 0.4827345013618469,
"step": 339
},
{
"epoch": 0.99179580674567,
"grad_norm": 1.2589871883392334,
"learning_rate": 1.1947781942506151e-05,
"loss": 0.405245304107666,
"step": 340
},
{
"epoch": 0.9947128532360985,
"grad_norm": 1.25636625289917,
"learning_rate": 1.1897814888242679e-05,
"loss": 0.37956133484840393,
"step": 341
},
{
"epoch": 0.9976298997265269,
"grad_norm": 2.7064895629882812,
"learning_rate": 1.1847798632027726e-05,
"loss": 0.489456444978714,
"step": 342
},
{
"epoch": 1.0,
"grad_norm": 1.6156240701675415,
"learning_rate": 1.1797734470561744e-05,
"loss": 0.46473199129104614,
"step": 343
},
{
"epoch": 1.0029170464904285,
"grad_norm": 1.3046343326568604,
"learning_rate": 1.1747623701787143e-05,
"loss": 0.3504878282546997,
"step": 344
},
{
"epoch": 1.005834092980857,
"grad_norm": 1.414828896522522,
"learning_rate": 1.1697467624854666e-05,
"loss": 0.4719260334968567,
"step": 345
},
{
"epoch": 1.0087511394712854,
"grad_norm": 1.1873356103897095,
"learning_rate": 1.164726754008969e-05,
"loss": 0.45313555002212524,
"step": 346
},
{
"epoch": 1.0116681859617138,
"grad_norm": 1.1382380723953247,
"learning_rate": 1.1597024748958526e-05,
"loss": 0.4365478456020355,
"step": 347
},
{
"epoch": 1.0145852324521423,
"grad_norm": 1.8141961097717285,
"learning_rate": 1.1546740554034661e-05,
"loss": 0.3694503605365753,
"step": 348
},
{
"epoch": 1.0175022789425707,
"grad_norm": 1.333388328552246,
"learning_rate": 1.1496416258965015e-05,
"loss": 0.4755721688270569,
"step": 349
},
{
"epoch": 1.0204193254329992,
"grad_norm": 1.3464443683624268,
"learning_rate": 1.1446053168436117e-05,
"loss": 0.4227846562862396,
"step": 350
},
{
"epoch": 1.0204193254329992,
"eval_loss": 0.44924086332321167,
"eval_runtime": 1214.6648,
"eval_samples_per_second": 0.52,
"eval_steps_per_second": 0.52,
"step": 350
},
{
"epoch": 1.0233363719234276,
"grad_norm": 1.2682689428329468,
"learning_rate": 1.1395652588140292e-05,
"loss": 0.44300130009651184,
"step": 351
},
{
"epoch": 1.0262534184138559,
"grad_norm": 1.7737696170806885,
"learning_rate": 1.1345215824741814e-05,
"loss": 0.5106258988380432,
"step": 352
},
{
"epoch": 1.0291704649042843,
"grad_norm": 1.2601238489151,
"learning_rate": 1.1294744185843014e-05,
"loss": 0.45930635929107666,
"step": 353
},
{
"epoch": 1.0320875113947128,
"grad_norm": 1.2162678241729736,
"learning_rate": 1.1244238979950406e-05,
"loss": 0.44163084030151367,
"step": 354
},
{
"epoch": 1.0350045578851412,
"grad_norm": 1.0905817747116089,
"learning_rate": 1.1193701516440733e-05,
"loss": 0.510662317276001,
"step": 355
},
{
"epoch": 1.0379216043755697,
"grad_norm": 0.9624952673912048,
"learning_rate": 1.1143133105527048e-05,
"loss": 0.5297917127609253,
"step": 356
},
{
"epoch": 1.0408386508659981,
"grad_norm": 1.2757681608200073,
"learning_rate": 1.1092535058224725e-05,
"loss": 0.4332093596458435,
"step": 357
},
{
"epoch": 1.0437556973564266,
"grad_norm": 1.6885719299316406,
"learning_rate": 1.104190868631748e-05,
"loss": 0.4337635040283203,
"step": 358
},
{
"epoch": 1.046672743846855,
"grad_norm": 1.175484538078308,
"learning_rate": 1.099125530232336e-05,
"loss": 0.45411020517349243,
"step": 359
},
{
"epoch": 1.0495897903372835,
"grad_norm": 1.0964939594268799,
"learning_rate": 1.0940576219460723e-05,
"loss": 0.5333439707756042,
"step": 360
},
{
"epoch": 1.052506836827712,
"grad_norm": 1.5493136644363403,
"learning_rate": 1.0889872751614176e-05,
"loss": 0.4400906264781952,
"step": 361
},
{
"epoch": 1.0554238833181404,
"grad_norm": 1.2491416931152344,
"learning_rate": 1.0839146213300526e-05,
"loss": 0.31049978733062744,
"step": 362
},
{
"epoch": 1.0583409298085689,
"grad_norm": 1.7213693857192993,
"learning_rate": 1.0788397919634694e-05,
"loss": 0.389009028673172,
"step": 363
},
{
"epoch": 1.0612579762989973,
"grad_norm": 1.5405336618423462,
"learning_rate": 1.0737629186295621e-05,
"loss": 0.4068562984466553,
"step": 364
},
{
"epoch": 1.0641750227894258,
"grad_norm": 1.225455641746521,
"learning_rate": 1.0686841329492159e-05,
"loss": 0.47358617186546326,
"step": 365
},
{
"epoch": 1.0670920692798542,
"grad_norm": 1.3436250686645508,
"learning_rate": 1.0636035665928945e-05,
"loss": 0.47050854563713074,
"step": 366
},
{
"epoch": 1.0700091157702827,
"grad_norm": 1.4952112436294556,
"learning_rate": 1.058521351277227e-05,
"loss": 0.43496906757354736,
"step": 367
},
{
"epoch": 1.072926162260711,
"grad_norm": 1.549112319946289,
"learning_rate": 1.0534376187615924e-05,
"loss": 0.45711052417755127,
"step": 368
},
{
"epoch": 1.0758432087511394,
"grad_norm": 1.3851526975631714,
"learning_rate": 1.048352500844704e-05,
"loss": 0.45045915246009827,
"step": 369
},
{
"epoch": 1.0787602552415678,
"grad_norm": 1.6302049160003662,
"learning_rate": 1.0432661293611927e-05,
"loss": 0.3736046254634857,
"step": 370
},
{
"epoch": 1.0816773017319963,
"grad_norm": 1.3365869522094727,
"learning_rate": 1.0381786361781885e-05,
"loss": 0.42242100834846497,
"step": 371
},
{
"epoch": 1.0845943482224247,
"grad_norm": 1.4369138479232788,
"learning_rate": 1.0330901531919026e-05,
"loss": 0.44570961594581604,
"step": 372
},
{
"epoch": 1.0875113947128532,
"grad_norm": 1.3528283834457397,
"learning_rate": 1.0280008123242069e-05,
"loss": 0.43440738320350647,
"step": 373
},
{
"epoch": 1.0904284412032816,
"grad_norm": 1.469660997390747,
"learning_rate": 1.0229107455192147e-05,
"loss": 0.3960394263267517,
"step": 374
},
{
"epoch": 1.09334548769371,
"grad_norm": 1.4542185068130493,
"learning_rate": 1.0178200847398595e-05,
"loss": 0.47834208607673645,
"step": 375
},
{
"epoch": 1.0962625341841385,
"grad_norm": 1.6470292806625366,
"learning_rate": 1.0127289619644737e-05,
"loss": 0.42791086435317993,
"step": 376
},
{
"epoch": 1.099179580674567,
"grad_norm": 1.1934021711349487,
"learning_rate": 1.0076375091833681e-05,
"loss": 0.4401305019855499,
"step": 377
},
{
"epoch": 1.1020966271649955,
"grad_norm": 0.9786668419837952,
"learning_rate": 1.0025458583954078e-05,
"loss": 0.4816555678844452,
"step": 378
},
{
"epoch": 1.105013673655424,
"grad_norm": 1.1348779201507568,
"learning_rate": 9.974541416045924e-06,
"loss": 0.41516968607902527,
"step": 379
},
{
"epoch": 1.1079307201458524,
"grad_norm": 1.0188615322113037,
"learning_rate": 9.923624908166322e-06,
"loss": 0.48087278008461,
"step": 380
},
{
"epoch": 1.1108477666362808,
"grad_norm": 1.0821740627288818,
"learning_rate": 9.872710380355263e-06,
"loss": 0.41974008083343506,
"step": 381
},
{
"epoch": 1.1137648131267093,
"grad_norm": 1.250951886177063,
"learning_rate": 9.82179915260141e-06,
"loss": 0.42703643441200256,
"step": 382
},
{
"epoch": 1.1166818596171377,
"grad_norm": 1.4528254270553589,
"learning_rate": 9.770892544807856e-06,
"loss": 0.43801453709602356,
"step": 383
},
{
"epoch": 1.1195989061075662,
"grad_norm": 1.813859462738037,
"learning_rate": 9.719991876757934e-06,
"loss": 0.4344240725040436,
"step": 384
},
{
"epoch": 1.1225159525979946,
"grad_norm": 1.6681253910064697,
"learning_rate": 9.669098468080976e-06,
"loss": 0.4356998801231384,
"step": 385
},
{
"epoch": 1.125432999088423,
"grad_norm": 1.3447953462600708,
"learning_rate": 9.618213638218117e-06,
"loss": 0.43189188838005066,
"step": 386
},
{
"epoch": 1.1283500455788513,
"grad_norm": 1.9577926397323608,
"learning_rate": 9.567338706388074e-06,
"loss": 0.34984707832336426,
"step": 387
},
{
"epoch": 1.1312670920692798,
"grad_norm": 1.5225576162338257,
"learning_rate": 9.516474991552965e-06,
"loss": 0.4243963062763214,
"step": 388
},
{
"epoch": 1.1341841385597082,
"grad_norm": 1.7416809797286987,
"learning_rate": 9.46562381238408e-06,
"loss": 0.3414606750011444,
"step": 389
},
{
"epoch": 1.1371011850501367,
"grad_norm": 1.8358951807022095,
"learning_rate": 9.414786487227732e-06,
"loss": 0.387447327375412,
"step": 390
},
{
"epoch": 1.1400182315405651,
"grad_norm": 1.9706153869628906,
"learning_rate": 9.363964334071057e-06,
"loss": 0.4599088728427887,
"step": 391
},
{
"epoch": 1.1429352780309936,
"grad_norm": 1.0604286193847656,
"learning_rate": 9.313158670507843e-06,
"loss": 0.4633581042289734,
"step": 392
},
{
"epoch": 1.145852324521422,
"grad_norm": 1.4851202964782715,
"learning_rate": 9.262370813704379e-06,
"loss": 0.3872259557247162,
"step": 393
},
{
"epoch": 1.1487693710118505,
"grad_norm": 1.7839159965515137,
"learning_rate": 9.21160208036531e-06,
"loss": 0.5215944647789001,
"step": 394
},
{
"epoch": 1.151686417502279,
"grad_norm": 1.3054656982421875,
"learning_rate": 9.160853786699475e-06,
"loss": 0.4030425548553467,
"step": 395
},
{
"epoch": 1.1546034639927074,
"grad_norm": 3.8467981815338135,
"learning_rate": 9.110127248385827e-06,
"loss": 0.4032524824142456,
"step": 396
},
{
"epoch": 1.1575205104831359,
"grad_norm": 1.8513801097869873,
"learning_rate": 9.05942378053928e-06,
"loss": 0.46577155590057373,
"step": 397
},
{
"epoch": 1.1604375569735643,
"grad_norm": 1.312689185142517,
"learning_rate": 9.008744697676642e-06,
"loss": 0.39114487171173096,
"step": 398
},
{
"epoch": 1.1633546034639928,
"grad_norm": 1.1996328830718994,
"learning_rate": 8.958091313682521e-06,
"loss": 0.481199711561203,
"step": 399
},
{
"epoch": 1.1662716499544212,
"grad_norm": 5.172409534454346,
"learning_rate": 8.90746494177528e-06,
"loss": 0.3803558945655823,
"step": 400
},
{
"epoch": 1.1662716499544212,
"eval_loss": 0.4318464398384094,
"eval_runtime": 1206.0306,
"eval_samples_per_second": 0.524,
"eval_steps_per_second": 0.524,
"step": 400
},
{
"epoch": 1.1691886964448497,
"grad_norm": 1.0115015506744385,
"learning_rate": 8.856866894472954e-06,
"loss": 0.39636704325675964,
"step": 401
},
{
"epoch": 1.172105742935278,
"grad_norm": 1.1557435989379883,
"learning_rate": 8.806298483559268e-06,
"loss": 0.4076298475265503,
"step": 402
},
{
"epoch": 1.1750227894257064,
"grad_norm": 1.2802515029907227,
"learning_rate": 8.755761020049597e-06,
"loss": 0.44352248311042786,
"step": 403
},
{
"epoch": 1.1779398359161348,
"grad_norm": 1.2755069732666016,
"learning_rate": 8.705255814156988e-06,
"loss": 0.390497624874115,
"step": 404
},
{
"epoch": 1.1808568824065633,
"grad_norm": 1.2799782752990723,
"learning_rate": 8.654784175258188e-06,
"loss": 0.35810694098472595,
"step": 405
},
{
"epoch": 1.1837739288969917,
"grad_norm": 1.0968674421310425,
"learning_rate": 8.604347411859713e-06,
"loss": 0.3890265226364136,
"step": 406
},
{
"epoch": 1.1866909753874202,
"grad_norm": 1.3334455490112305,
"learning_rate": 8.553946831563886e-06,
"loss": 0.3916901648044586,
"step": 407
},
{
"epoch": 1.1896080218778486,
"grad_norm": 1.1888184547424316,
"learning_rate": 8.503583741034988e-06,
"loss": 0.5231326222419739,
"step": 408
},
{
"epoch": 1.192525068368277,
"grad_norm": 1.1163763999938965,
"learning_rate": 8.45325944596534e-06,
"loss": 0.4249858558177948,
"step": 409
},
{
"epoch": 1.1954421148587056,
"grad_norm": 1.3470333814620972,
"learning_rate": 8.40297525104148e-06,
"loss": 0.5201632380485535,
"step": 410
},
{
"epoch": 1.198359161349134,
"grad_norm": 1.5412285327911377,
"learning_rate": 8.35273245991031e-06,
"loss": 0.39376699924468994,
"step": 411
},
{
"epoch": 1.2012762078395625,
"grad_norm": 1.3408735990524292,
"learning_rate": 8.302532375145339e-06,
"loss": 0.39554283022880554,
"step": 412
},
{
"epoch": 1.204193254329991,
"grad_norm": 1.990668773651123,
"learning_rate": 8.25237629821286e-06,
"loss": 0.42424261569976807,
"step": 413
},
{
"epoch": 1.2071103008204194,
"grad_norm": 1.6471989154815674,
"learning_rate": 8.202265529438259e-06,
"loss": 0.3234582543373108,
"step": 414
},
{
"epoch": 1.2100273473108478,
"grad_norm": 1.1483631134033203,
"learning_rate": 8.152201367972275e-06,
"loss": 0.39163246750831604,
"step": 415
},
{
"epoch": 1.2129443938012763,
"grad_norm": 1.800149917602539,
"learning_rate": 8.102185111757323e-06,
"loss": 0.5055042505264282,
"step": 416
},
{
"epoch": 1.2158614402917047,
"grad_norm": 1.4394795894622803,
"learning_rate": 8.052218057493849e-06,
"loss": 0.4761751592159271,
"step": 417
},
{
"epoch": 1.2187784867821332,
"grad_norm": 1.622689962387085,
"learning_rate": 8.002301500606715e-06,
"loss": 0.4490141272544861,
"step": 418
},
{
"epoch": 1.2216955332725616,
"grad_norm": 1.2564961910247803,
"learning_rate": 7.952436735211593e-06,
"loss": 0.3964035212993622,
"step": 419
},
{
"epoch": 1.22461257976299,
"grad_norm": 1.3248411417007446,
"learning_rate": 7.902625054081449e-06,
"loss": 0.46039122343063354,
"step": 420
},
{
"epoch": 1.2275296262534183,
"grad_norm": 1.568983793258667,
"learning_rate": 7.852867748613e-06,
"loss": 0.49916595220565796,
"step": 421
},
{
"epoch": 1.2304466727438468,
"grad_norm": 1.4784491062164307,
"learning_rate": 7.803166108793243e-06,
"loss": 0.4035068154335022,
"step": 422
},
{
"epoch": 1.2333637192342752,
"grad_norm": 1.2940057516098022,
"learning_rate": 7.753521423166007e-06,
"loss": 0.4154140055179596,
"step": 423
},
{
"epoch": 1.2362807657247037,
"grad_norm": 1.167786717414856,
"learning_rate": 7.703934978798565e-06,
"loss": 0.39541637897491455,
"step": 424
},
{
"epoch": 1.2391978122151321,
"grad_norm": 1.5126771926879883,
"learning_rate": 7.65440806124823e-06,
"loss": 0.37744253873825073,
"step": 425
},
{
"epoch": 1.2421148587055606,
"grad_norm": 1.2595263719558716,
"learning_rate": 7.604941954529067e-06,
"loss": 0.46380615234375,
"step": 426
},
{
"epoch": 1.245031905195989,
"grad_norm": 1.4258298873901367,
"learning_rate": 7.555537941078573e-06,
"loss": 0.3391319513320923,
"step": 427
},
{
"epoch": 1.2479489516864175,
"grad_norm": 1.5371774435043335,
"learning_rate": 7.506197301724446e-06,
"loss": 0.39805102348327637,
"step": 428
},
{
"epoch": 1.250865998176846,
"grad_norm": 1.3789173364639282,
"learning_rate": 7.456921315651371e-06,
"loss": 0.37969034910202026,
"step": 429
},
{
"epoch": 1.2537830446672744,
"grad_norm": 1.32931649684906,
"learning_rate": 7.407711260367867e-06,
"loss": 0.3841526508331299,
"step": 430
},
{
"epoch": 1.2567000911577029,
"grad_norm": 1.2836817502975464,
"learning_rate": 7.358568411673145e-06,
"loss": 0.340289443731308,
"step": 431
},
{
"epoch": 1.2596171376481313,
"grad_norm": 1.0418318510055542,
"learning_rate": 7.309494043624059e-06,
"loss": 0.44747158885002136,
"step": 432
},
{
"epoch": 1.2625341841385598,
"grad_norm": 1.1769362688064575,
"learning_rate": 7.260489428502058e-06,
"loss": 0.45737382769584656,
"step": 433
},
{
"epoch": 1.265451230628988,
"grad_norm": 2.2730748653411865,
"learning_rate": 7.211555836780203e-06,
"loss": 0.3827931582927704,
"step": 434
},
{
"epoch": 1.2683682771194165,
"grad_norm": 1.263096809387207,
"learning_rate": 7.162694537090235e-06,
"loss": 0.3589435815811157,
"step": 435
},
{
"epoch": 1.271285323609845,
"grad_norm": 1.4073514938354492,
"learning_rate": 7.113906796189692e-06,
"loss": 0.45206642150878906,
"step": 436
},
{
"epoch": 1.2742023701002734,
"grad_norm": 1.064585566520691,
"learning_rate": 7.0651938789290306e-06,
"loss": 0.5409261584281921,
"step": 437
},
{
"epoch": 1.2771194165907018,
"grad_norm": 1.2346999645233154,
"learning_rate": 7.016557048218889e-06,
"loss": 0.40680158138275146,
"step": 438
},
{
"epoch": 1.2800364630811303,
"grad_norm": 1.5816547870635986,
"learning_rate": 6.967997564997306e-06,
"loss": 0.38718655705451965,
"step": 439
},
{
"epoch": 1.2829535095715587,
"grad_norm": 1.085268259048462,
"learning_rate": 6.919516688197041e-06,
"loss": 0.4863276779651642,
"step": 440
},
{
"epoch": 1.2858705560619872,
"grad_norm": 1.0984629392623901,
"learning_rate": 6.871115674712937e-06,
"loss": 0.39562875032424927,
"step": 441
},
{
"epoch": 1.2887876025524156,
"grad_norm": 1.3004229068756104,
"learning_rate": 6.822795779369339e-06,
"loss": 0.44437694549560547,
"step": 442
},
{
"epoch": 1.291704649042844,
"grad_norm": 1.3541183471679688,
"learning_rate": 6.774558254887553e-06,
"loss": 0.4728967249393463,
"step": 443
},
{
"epoch": 1.2946216955332726,
"grad_norm": 1.2485377788543701,
"learning_rate": 6.7264043518533695e-06,
"loss": 0.4052809476852417,
"step": 444
},
{
"epoch": 1.297538742023701,
"grad_norm": 1.412827730178833,
"learning_rate": 6.67833531868465e-06,
"loss": 0.40149861574172974,
"step": 445
},
{
"epoch": 1.3004557885141295,
"grad_norm": 1.5576224327087402,
"learning_rate": 6.630352401598953e-06,
"loss": 0.44107240438461304,
"step": 446
},
{
"epoch": 1.303372835004558,
"grad_norm": 1.1551047563552856,
"learning_rate": 6.582456844581226e-06,
"loss": 0.4898405969142914,
"step": 447
},
{
"epoch": 1.3062898814949864,
"grad_norm": 1.9939689636230469,
"learning_rate": 6.5346498893515645e-06,
"loss": 0.4791329801082611,
"step": 448
},
{
"epoch": 1.3092069279854148,
"grad_norm": 1.4782553911209106,
"learning_rate": 6.486932775333002e-06,
"loss": 0.472908616065979,
"step": 449
},
{
"epoch": 1.3121239744758433,
"grad_norm": 1.2496148347854614,
"learning_rate": 6.439306739619387e-06,
"loss": 0.514995276927948,
"step": 450
},
{
"epoch": 1.3121239744758433,
"eval_loss": 0.4178673028945923,
"eval_runtime": 1197.5534,
"eval_samples_per_second": 0.528,
"eval_steps_per_second": 0.528,
"step": 450
},
{
"epoch": 1.3150410209662717,
"grad_norm": 1.3996772766113281,
"learning_rate": 6.391773016943316e-06,
"loss": 0.4087896943092346,
"step": 451
},
{
"epoch": 1.3179580674567002,
"grad_norm": 1.20390784740448,
"learning_rate": 6.344332839644111e-06,
"loss": 0.43224579095840454,
"step": 452
},
{
"epoch": 1.3208751139471286,
"grad_norm": 1.2709496021270752,
"learning_rate": 6.296987437635876e-06,
"loss": 0.44104093313217163,
"step": 453
},
{
"epoch": 1.323792160437557,
"grad_norm": 1.0112334489822388,
"learning_rate": 6.249738038375618e-06,
"loss": 0.47084498405456543,
"step": 454
},
{
"epoch": 1.3267092069279856,
"grad_norm": 1.0771515369415283,
"learning_rate": 6.202585866831411e-06,
"loss": 0.4700928032398224,
"step": 455
},
{
"epoch": 1.3296262534184138,
"grad_norm": 1.4937143325805664,
"learning_rate": 6.15553214545064e-06,
"loss": 0.345747709274292,
"step": 456
},
{
"epoch": 1.3325432999088422,
"grad_norm": 1.1348456144332886,
"learning_rate": 6.108578094128321e-06,
"loss": 0.33824583888053894,
"step": 457
},
{
"epoch": 1.3354603463992707,
"grad_norm": 1.2502707242965698,
"learning_rate": 6.061724930175461e-06,
"loss": 0.3528832197189331,
"step": 458
},
{
"epoch": 1.3383773928896991,
"grad_norm": 1.5359619855880737,
"learning_rate": 6.014973868287504e-06,
"loss": 0.4413869082927704,
"step": 459
},
{
"epoch": 1.3412944393801276,
"grad_norm": 0.9747081398963928,
"learning_rate": 5.9683261205128395e-06,
"loss": 0.6849499940872192,
"step": 460
},
{
"epoch": 1.344211485870556,
"grad_norm": 1.3150533437728882,
"learning_rate": 5.921782896221383e-06,
"loss": 0.3901931047439575,
"step": 461
},
{
"epoch": 1.3471285323609845,
"grad_norm": 1.137770652770996,
"learning_rate": 5.875345402073207e-06,
"loss": 0.37498384714126587,
"step": 462
},
{
"epoch": 1.350045578851413,
"grad_norm": 1.2216367721557617,
"learning_rate": 5.829014841987277e-06,
"loss": 0.3874579966068268,
"step": 463
},
{
"epoch": 1.3529626253418414,
"grad_norm": 1.135439157485962,
"learning_rate": 5.782792417110233e-06,
"loss": 0.384797066450119,
"step": 464
},
{
"epoch": 1.3558796718322699,
"grad_norm": 1.2400696277618408,
"learning_rate": 5.736679325785239e-06,
"loss": 0.46303266286849976,
"step": 465
},
{
"epoch": 1.3587967183226983,
"grad_norm": 1.8848882913589478,
"learning_rate": 5.6906767635209304e-06,
"loss": 0.5068309903144836,
"step": 466
},
{
"epoch": 1.3617137648131268,
"grad_norm": 1.4707008600234985,
"learning_rate": 5.644785922960412e-06,
"loss": 0.364332914352417,
"step": 467
},
{
"epoch": 1.364630811303555,
"grad_norm": 2.4436841011047363,
"learning_rate": 5.599007993850329e-06,
"loss": 0.485107421875,
"step": 468
},
{
"epoch": 1.3675478577939835,
"grad_norm": 1.1924740076065063,
"learning_rate": 5.553344163010039e-06,
"loss": 0.34547489881515503,
"step": 469
},
{
"epoch": 1.370464904284412,
"grad_norm": 1.1255877017974854,
"learning_rate": 5.507795614300846e-06,
"loss": 0.39645254611968994,
"step": 470
},
{
"epoch": 1.3733819507748404,
"grad_norm": 1.0937018394470215,
"learning_rate": 5.4623635285952815e-06,
"loss": 0.4267856478691101,
"step": 471
},
{
"epoch": 1.3762989972652688,
"grad_norm": 1.3355520963668823,
"learning_rate": 5.417049083746513e-06,
"loss": 0.3669992983341217,
"step": 472
},
{
"epoch": 1.3792160437556973,
"grad_norm": 1.7302504777908325,
"learning_rate": 5.3718534545578035e-06,
"loss": 0.3873697519302368,
"step": 473
},
{
"epoch": 1.3821330902461257,
"grad_norm": 1.17263662815094,
"learning_rate": 5.326777812752041e-06,
"loss": 0.4581540524959564,
"step": 474
},
{
"epoch": 1.3850501367365542,
"grad_norm": 1.0998128652572632,
"learning_rate": 5.281823326941377e-06,
"loss": 0.43062761425971985,
"step": 475
},
{
"epoch": 1.3879671832269826,
"grad_norm": 1.1194556951522827,
"learning_rate": 5.236991162596932e-06,
"loss": 0.381741464138031,
"step": 476
},
{
"epoch": 1.390884229717411,
"grad_norm": 1.2759051322937012,
"learning_rate": 5.19228248201856e-06,
"loss": 0.49175748229026794,
"step": 477
},
{
"epoch": 1.3938012762078396,
"grad_norm": 1.2134747505187988,
"learning_rate": 5.147698444304732e-06,
"loss": 0.4997562766075134,
"step": 478
},
{
"epoch": 1.396718322698268,
"grad_norm": 1.0833078622817993,
"learning_rate": 5.1032402053224804e-06,
"loss": 0.42580488324165344,
"step": 479
},
{
"epoch": 1.3996353691886965,
"grad_norm": 1.4838510751724243,
"learning_rate": 5.058908917677426e-06,
"loss": 0.5015593767166138,
"step": 480
},
{
"epoch": 1.402552415679125,
"grad_norm": 1.218610167503357,
"learning_rate": 5.014705730683904e-06,
"loss": 0.34739193320274353,
"step": 481
},
{
"epoch": 1.4054694621695534,
"grad_norm": 1.1883307695388794,
"learning_rate": 4.970631790335181e-06,
"loss": 0.41708022356033325,
"step": 482
},
{
"epoch": 1.4083865086599818,
"grad_norm": 1.209291696548462,
"learning_rate": 4.926688239273713e-06,
"loss": 0.43546172976493835,
"step": 483
},
{
"epoch": 1.4113035551504103,
"grad_norm": 1.0801606178283691,
"learning_rate": 4.882876216761543e-06,
"loss": 0.44491735100746155,
"step": 484
},
{
"epoch": 1.4142206016408387,
"grad_norm": 1.2746628522872925,
"learning_rate": 4.839196858650763e-06,
"loss": 0.436122864484787,
"step": 485
},
{
"epoch": 1.4171376481312672,
"grad_norm": 1.4465962648391724,
"learning_rate": 4.795651297354056e-06,
"loss": 0.3750447630882263,
"step": 486
},
{
"epoch": 1.4200546946216956,
"grad_norm": 1.6736211776733398,
"learning_rate": 4.752240661815346e-06,
"loss": 0.38286519050598145,
"step": 487
},
{
"epoch": 1.422971741112124,
"grad_norm": 1.1946996450424194,
"learning_rate": 4.708966077480544e-06,
"loss": 0.4488063156604767,
"step": 488
},
{
"epoch": 1.4258887876025526,
"grad_norm": 1.42599356174469,
"learning_rate": 4.665828666268335e-06,
"loss": 0.44088613986968994,
"step": 489
},
{
"epoch": 1.4288058340929808,
"grad_norm": 1.2281016111373901,
"learning_rate": 4.622829546541121e-06,
"loss": 0.4030645489692688,
"step": 490
},
{
"epoch": 1.4317228805834092,
"grad_norm": 1.2875670194625854,
"learning_rate": 4.57996983307602e-06,
"loss": 0.44702020287513733,
"step": 491
},
{
"epoch": 1.4346399270738377,
"grad_norm": 1.2456860542297363,
"learning_rate": 4.537250637035947e-06,
"loss": 0.4067370593547821,
"step": 492
},
{
"epoch": 1.4375569735642661,
"grad_norm": 1.2822725772857666,
"learning_rate": 4.494673065940833e-06,
"loss": 0.4237740635871887,
"step": 493
},
{
"epoch": 1.4404740200546946,
"grad_norm": 1.5517818927764893,
"learning_rate": 4.452238223638906e-06,
"loss": 0.40579724311828613,
"step": 494
},
{
"epoch": 1.443391066545123,
"grad_norm": 1.275344967842102,
"learning_rate": 4.409947210278056e-06,
"loss": 0.38880717754364014,
"step": 495
},
{
"epoch": 1.4463081130355515,
"grad_norm": 1.22952139377594,
"learning_rate": 4.367801122277327e-06,
"loss": 0.4042310416698456,
"step": 496
},
{
"epoch": 1.44922515952598,
"grad_norm": 1.122261643409729,
"learning_rate": 4.325801052298493e-06,
"loss": 0.5408368110656738,
"step": 497
},
{
"epoch": 1.4521422060164084,
"grad_norm": 1.5885361433029175,
"learning_rate": 4.283948089217715e-06,
"loss": 0.37697717547416687,
"step": 498
},
{
"epoch": 1.4550592525068369,
"grad_norm": 2.3565149307250977,
"learning_rate": 4.242243318097338e-06,
"loss": 0.3811529576778412,
"step": 499
},
{
"epoch": 1.4579762989972653,
"grad_norm": 1.1944137811660767,
"learning_rate": 4.200687820157735e-06,
"loss": 0.414781391620636,
"step": 500
},
{
"epoch": 1.4579762989972653,
"eval_loss": 0.40706494450569153,
"eval_runtime": 1189.1593,
"eval_samples_per_second": 0.531,
"eval_steps_per_second": 0.531,
"step": 500
},
{
"epoch": 1.4608933454876938,
"grad_norm": 1.0442464351654053,
"learning_rate": 4.159282672749289e-06,
"loss": 0.38155990839004517,
"step": 501
},
{
"epoch": 1.463810391978122,
"grad_norm": 1.7274727821350098,
"learning_rate": 4.118028949324453e-06,
"loss": 0.4830601215362549,
"step": 502
},
{
"epoch": 1.4667274384685505,
"grad_norm": 2.064513921737671,
"learning_rate": 4.0769277194099345e-06,
"loss": 0.3975123167037964,
"step": 503
},
{
"epoch": 1.469644484958979,
"grad_norm": 1.7695534229278564,
"learning_rate": 4.035980048578942e-06,
"loss": 0.37033841013908386,
"step": 504
},
{
"epoch": 1.4725615314494074,
"grad_norm": 1.4455046653747559,
"learning_rate": 3.995186998423597e-06,
"loss": 0.39567673206329346,
"step": 505
},
{
"epoch": 1.4754785779398358,
"grad_norm": 1.1791958808898926,
"learning_rate": 3.9545496265273765e-06,
"loss": 0.44786664843559265,
"step": 506
},
{
"epoch": 1.4783956244302643,
"grad_norm": 2.0874717235565186,
"learning_rate": 3.9140689864377105e-06,
"loss": 0.3333263099193573,
"step": 507
},
{
"epoch": 1.4813126709206927,
"grad_norm": 1.5897501707077026,
"learning_rate": 3.873746127638668e-06,
"loss": 0.5105943083763123,
"step": 508
},
{
"epoch": 1.4842297174111212,
"grad_norm": 1.5059760808944702,
"learning_rate": 3.833582095523749e-06,
"loss": 0.43922683596611023,
"step": 509
},
{
"epoch": 1.4871467639015497,
"grad_norm": 1.379347562789917,
"learning_rate": 3.7935779313687648e-06,
"loss": 0.4584790766239166,
"step": 510
},
{
"epoch": 1.490063810391978,
"grad_norm": 1.0984690189361572,
"learning_rate": 3.7537346723048816e-06,
"loss": 0.5217512249946594,
"step": 511
},
{
"epoch": 1.4929808568824066,
"grad_norm": 1.5944225788116455,
"learning_rate": 3.71405335129169e-06,
"loss": 0.4180052876472473,
"step": 512
},
{
"epoch": 1.495897903372835,
"grad_norm": 1.2745033502578735,
"learning_rate": 3.6745349970904465e-06,
"loss": 0.4584833085536957,
"step": 513
},
{
"epoch": 1.4988149498632635,
"grad_norm": 1.2746814489364624,
"learning_rate": 3.6351806342374007e-06,
"loss": 0.3202287554740906,
"step": 514
},
{
"epoch": 1.501731996353692,
"grad_norm": 1.409638524055481,
"learning_rate": 3.5959912830172348e-06,
"loss": 0.37963351607322693,
"step": 515
},
{
"epoch": 1.5046490428441204,
"grad_norm": 1.1655553579330444,
"learning_rate": 3.556967959436591e-06,
"loss": 0.43133026361465454,
"step": 516
},
{
"epoch": 1.5075660893345488,
"grad_norm": 1.0495020151138306,
"learning_rate": 3.518111675197776e-06,
"loss": 0.3739299178123474,
"step": 517
},
{
"epoch": 1.5104831358249773,
"grad_norm": 1.3055057525634766,
"learning_rate": 3.4794234376724835e-06,
"loss": 0.4099601209163666,
"step": 518
},
{
"epoch": 1.5134001823154057,
"grad_norm": 1.2252463102340698,
"learning_rate": 3.4409042498757084e-06,
"loss": 0.380616158246994,
"step": 519
},
{
"epoch": 1.5163172288058342,
"grad_norm": 1.2728638648986816,
"learning_rate": 3.4025551104397294e-06,
"loss": 0.3510003685951233,
"step": 520
},
{
"epoch": 1.5192342752962626,
"grad_norm": 2.70664644241333,
"learning_rate": 3.3643770135882282e-06,
"loss": 0.4087940752506256,
"step": 521
},
{
"epoch": 1.522151321786691,
"grad_norm": 1.6197112798690796,
"learning_rate": 3.3263709491104933e-06,
"loss": 0.45614126324653625,
"step": 522
},
{
"epoch": 1.5250683682771196,
"grad_norm": 1.3596103191375732,
"learning_rate": 3.2885379023357956e-06,
"loss": 0.3824586272239685,
"step": 523
},
{
"epoch": 1.527985414767548,
"grad_norm": 1.1768635511398315,
"learning_rate": 3.2508788541078097e-06,
"loss": 0.47717779874801636,
"step": 524
},
{
"epoch": 1.5309024612579762,
"grad_norm": 1.669474482536316,
"learning_rate": 3.2133947807591958e-06,
"loss": 0.4013281762599945,
"step": 525
},
{
"epoch": 1.5338195077484047,
"grad_norm": 1.600868582725525,
"learning_rate": 3.1760866540862932e-06,
"loss": 0.367280513048172,
"step": 526
},
{
"epoch": 1.5367365542388332,
"grad_norm": 1.1689515113830566,
"learning_rate": 3.138955441323923e-06,
"loss": 0.4432409405708313,
"step": 527
},
{
"epoch": 1.5396536007292616,
"grad_norm": 2.361961603164673,
"learning_rate": 3.1020021051202973e-06,
"loss": 0.4219942092895508,
"step": 528
},
{
"epoch": 1.54257064721969,
"grad_norm": 1.1962230205535889,
"learning_rate": 3.0652276035120964e-06,
"loss": 0.3672596514225006,
"step": 529
},
{
"epoch": 1.5454876937101185,
"grad_norm": 1.4149441719055176,
"learning_rate": 3.0286328898995963e-06,
"loss": 0.42919260263442993,
"step": 530
},
{
"epoch": 1.548404740200547,
"grad_norm": 1.2668434381484985,
"learning_rate": 2.992218913021966e-06,
"loss": 0.4499061107635498,
"step": 531
},
{
"epoch": 1.5513217866909754,
"grad_norm": 1.268114686012268,
"learning_rate": 2.9559866169326734e-06,
"loss": 0.34660714864730835,
"step": 532
},
{
"epoch": 1.5542388331814039,
"grad_norm": 1.0086419582366943,
"learning_rate": 2.919936940975007e-06,
"loss": 0.38239023089408875,
"step": 533
},
{
"epoch": 1.557155879671832,
"grad_norm": 1.0700170993804932,
"learning_rate": 2.884070819757712e-06,
"loss": 0.48240017890930176,
"step": 534
},
{
"epoch": 1.5600729261622606,
"grad_norm": 1.2101227045059204,
"learning_rate": 2.8483891831307873e-06,
"loss": 0.4098761975765228,
"step": 535
},
{
"epoch": 1.562989972652689,
"grad_norm": 1.2731400728225708,
"learning_rate": 2.8128929561613505e-06,
"loss": 0.45641395449638367,
"step": 536
},
{
"epoch": 1.5659070191431175,
"grad_norm": 1.1474392414093018,
"learning_rate": 2.777583059109671e-06,
"loss": 0.42283985018730164,
"step": 537
},
{
"epoch": 1.568824065633546,
"grad_norm": 1.789881944656372,
"learning_rate": 2.7424604074053028e-06,
"loss": 0.3469158113002777,
"step": 538
},
{
"epoch": 1.5717411121239744,
"grad_norm": 1.3426933288574219,
"learning_rate": 2.707525911623362e-06,
"loss": 0.35837510228157043,
"step": 539
},
{
"epoch": 1.5746581586144028,
"grad_norm": 1.2343578338623047,
"learning_rate": 2.672780477460901e-06,
"loss": 0.4736083745956421,
"step": 540
},
{
"epoch": 1.5775752051048313,
"grad_norm": 1.516298770904541,
"learning_rate": 2.638225005713457e-06,
"loss": 0.34345340728759766,
"step": 541
},
{
"epoch": 1.5804922515952597,
"grad_norm": 1.1488829851150513,
"learning_rate": 2.6038603922516705e-06,
"loss": 0.4134179949760437,
"step": 542
},
{
"epoch": 1.5834092980856882,
"grad_norm": 1.4486491680145264,
"learning_rate": 2.569687527998073e-06,
"loss": 0.3297592103481293,
"step": 543
},
{
"epoch": 1.5863263445761167,
"grad_norm": 1.272691011428833,
"learning_rate": 2.5357072989039855e-06,
"loss": 0.3958476185798645,
"step": 544
},
{
"epoch": 1.589243391066545,
"grad_norm": 1.244240641593933,
"learning_rate": 2.501920585926555e-06,
"loss": 0.4125611186027527,
"step": 545
},
{
"epoch": 1.5921604375569736,
"grad_norm": 1.5844073295593262,
"learning_rate": 2.4683282650058992e-06,
"loss": 0.3762253224849701,
"step": 546
},
{
"epoch": 1.595077484047402,
"grad_norm": 1.8209946155548096,
"learning_rate": 2.4349312070424258e-06,
"loss": 0.37053319811820984,
"step": 547
},
{
"epoch": 1.5979945305378305,
"grad_norm": 1.3752915859222412,
"learning_rate": 2.4017302778742247e-06,
"loss": 0.5004774332046509,
"step": 548
},
{
"epoch": 1.600911577028259,
"grad_norm": 5.143753528594971,
"learning_rate": 2.36872633825464e-06,
"loss": 0.39014023542404175,
"step": 549
},
{
"epoch": 1.6038286235186874,
"grad_norm": 1.0730944871902466,
"learning_rate": 2.335920243829941e-06,
"loss": 0.378440260887146,
"step": 550
},
{
"epoch": 1.6038286235186874,
"eval_loss": 0.40037089586257935,
"eval_runtime": 893.7411,
"eval_samples_per_second": 0.707,
"eval_steps_per_second": 0.707,
"step": 550
},
{
"epoch": 1.6067456700091158,
"grad_norm": 1.5507797002792358,
"learning_rate": 2.3033128451171548e-06,
"loss": 0.4471960663795471,
"step": 551
},
{
"epoch": 1.6096627164995443,
"grad_norm": 1.9462968111038208,
"learning_rate": 2.2709049874819924e-06,
"loss": 0.3658301830291748,
"step": 552
},
{
"epoch": 1.6125797629899727,
"grad_norm": 1.2034238576889038,
"learning_rate": 2.238697511116962e-06,
"loss": 0.3911179304122925,
"step": 553
},
{
"epoch": 1.6154968094804012,
"grad_norm": 1.3574327230453491,
"learning_rate": 2.2066912510195636e-06,
"loss": 0.3998897671699524,
"step": 554
},
{
"epoch": 1.6184138559708297,
"grad_norm": 1.1973012685775757,
"learning_rate": 2.1748870369706507e-06,
"loss": 0.38577449321746826,
"step": 555
},
{
"epoch": 1.621330902461258,
"grad_norm": 1.9365874528884888,
"learning_rate": 2.1432856935129144e-06,
"loss": 0.411307156085968,
"step": 556
},
{
"epoch": 1.6242479489516866,
"grad_norm": 1.3558642864227295,
"learning_rate": 2.1118880399295106e-06,
"loss": 0.38424253463745117,
"step": 557
},
{
"epoch": 1.627164995442115,
"grad_norm": 1.4368890523910522,
"learning_rate": 2.0806948902228075e-06,
"loss": 0.39943546056747437,
"step": 558
},
{
"epoch": 1.6300820419325432,
"grad_norm": 1.6266753673553467,
"learning_rate": 2.0497070530933084e-06,
"loss": 0.36787641048431396,
"step": 559
},
{
"epoch": 1.6329990884229717,
"grad_norm": 1.2600938081741333,
"learning_rate": 2.0189253319186576e-06,
"loss": 0.3781934380531311,
"step": 560
},
{
"epoch": 1.6359161349134002,
"grad_norm": 1.975071907043457,
"learning_rate": 1.9883505247328237e-06,
"loss": 0.4132305383682251,
"step": 561
},
{
"epoch": 1.6388331814038286,
"grad_norm": 1.4095909595489502,
"learning_rate": 1.9579834242054154e-06,
"loss": 0.3727574646472931,
"step": 562
},
{
"epoch": 1.641750227894257,
"grad_norm": 1.4271371364593506,
"learning_rate": 1.9278248176211243e-06,
"loss": 0.33786773681640625,
"step": 563
},
{
"epoch": 1.6446672743846855,
"grad_norm": 1.5907646417617798,
"learning_rate": 1.8978754868593074e-06,
"loss": 0.33035099506378174,
"step": 564
},
{
"epoch": 1.647584320875114,
"grad_norm": 1.1315702199935913,
"learning_rate": 1.8681362083737387e-06,
"loss": 0.41707149147987366,
"step": 565
},
{
"epoch": 1.6505013673655424,
"grad_norm": 1.4737143516540527,
"learning_rate": 1.8386077531724556e-06,
"loss": 0.43079230189323425,
"step": 566
},
{
"epoch": 1.6534184138559709,
"grad_norm": 1.1006760597229004,
"learning_rate": 1.8092908867977822e-06,
"loss": 0.3524904251098633,
"step": 567
},
{
"epoch": 1.6563354603463991,
"grad_norm": 1.4066118001937866,
"learning_rate": 1.780186369306479e-06,
"loss": 0.3695681691169739,
"step": 568
},
{
"epoch": 1.6592525068368276,
"grad_norm": 1.6444640159606934,
"learning_rate": 1.7512949552500412e-06,
"loss": 0.35596007108688354,
"step": 569
},
{
"epoch": 1.662169553327256,
"grad_norm": 1.159480094909668,
"learning_rate": 1.7226173936551282e-06,
"loss": 0.4520571827888489,
"step": 570
},
{
"epoch": 1.6650865998176845,
"grad_norm": 1.5874221324920654,
"learning_rate": 1.6941544280041567e-06,
"loss": 0.4702282249927521,
"step": 571
},
{
"epoch": 1.668003646308113,
"grad_norm": 1.6153535842895508,
"learning_rate": 1.6659067962160157e-06,
"loss": 0.3803800046443939,
"step": 572
},
{
"epoch": 1.6709206927985414,
"grad_norm": 1.0748940706253052,
"learning_rate": 1.6378752306269386e-06,
"loss": 0.4368419051170349,
"step": 573
},
{
"epoch": 1.6738377392889698,
"grad_norm": 1.5286788940429688,
"learning_rate": 1.6100604579715185e-06,
"loss": 0.4195623993873596,
"step": 574
},
{
"epoch": 1.6767547857793983,
"grad_norm": 1.1433510780334473,
"learning_rate": 1.5824631993638651e-06,
"loss": 0.4366849660873413,
"step": 575
},
{
"epoch": 1.6796718322698267,
"grad_norm": 1.9694907665252686,
"learning_rate": 1.5550841702789122e-06,
"loss": 0.5555303692817688,
"step": 576
},
{
"epoch": 1.6825888787602552,
"grad_norm": 1.7587188482284546,
"learning_rate": 1.5279240805338647e-06,
"loss": 0.40394848585128784,
"step": 577
},
{
"epoch": 1.6855059252506837,
"grad_norm": 1.063381314277649,
"learning_rate": 1.5009836342697993e-06,
"loss": 0.49564215540885925,
"step": 578
},
{
"epoch": 1.688422971741112,
"grad_norm": 1.1742531061172485,
"learning_rate": 1.4742635299334063e-06,
"loss": 0.3891904950141907,
"step": 579
},
{
"epoch": 1.6913400182315406,
"grad_norm": 1.499934196472168,
"learning_rate": 1.4477644602588848e-06,
"loss": 0.35497623682022095,
"step": 580
},
{
"epoch": 1.694257064721969,
"grad_norm": 1.5112360715866089,
"learning_rate": 1.421487112249984e-06,
"loss": 0.4062272012233734,
"step": 581
},
{
"epoch": 1.6971741112123975,
"grad_norm": 1.3583141565322876,
"learning_rate": 1.3954321671621885e-06,
"loss": 0.3655265271663666,
"step": 582
},
{
"epoch": 1.700091157702826,
"grad_norm": 2.8181653022766113,
"learning_rate": 1.3696003004850577e-06,
"loss": 0.37418332695961,
"step": 583
},
{
"epoch": 1.7030082041932544,
"grad_norm": 0.967166543006897,
"learning_rate": 1.3439921819247138e-06,
"loss": 0.4946930408477783,
"step": 584
},
{
"epoch": 1.7059252506836828,
"grad_norm": 1.2773699760437012,
"learning_rate": 1.3186084753864813e-06,
"loss": 0.5101871490478516,
"step": 585
},
{
"epoch": 1.7088422971741113,
"grad_norm": 1.2814991474151611,
"learning_rate": 1.293449838957671e-06,
"loss": 0.3688133656978607,
"step": 586
},
{
"epoch": 1.7117593436645397,
"grad_norm": 1.594966173171997,
"learning_rate": 1.2685169248905228e-06,
"loss": 0.4739398956298828,
"step": 587
},
{
"epoch": 1.7146763901549682,
"grad_norm": 1.1471531391143799,
"learning_rate": 1.2438103795852885e-06,
"loss": 0.3719588816165924,
"step": 588
},
{
"epoch": 1.7175934366453967,
"grad_norm": 1.1657356023788452,
"learning_rate": 1.2193308435734852e-06,
"loss": 0.4119298458099365,
"step": 589
},
{
"epoch": 1.720510483135825,
"grad_norm": 1.1239042282104492,
"learning_rate": 1.1950789515012783e-06,
"loss": 0.38277503848075867,
"step": 590
},
{
"epoch": 1.7234275296262536,
"grad_norm": 1.149478554725647,
"learning_rate": 1.1710553321130324e-06,
"loss": 0.35080626606941223,
"step": 591
},
{
"epoch": 1.726344576116682,
"grad_norm": 1.2020260095596313,
"learning_rate": 1.1472606082350112e-06,
"loss": 0.3991318345069885,
"step": 592
},
{
"epoch": 1.7292616226071102,
"grad_norm": 1.101475477218628,
"learning_rate": 1.123695396759229e-06,
"loss": 0.45791420340538025,
"step": 593
},
{
"epoch": 1.7321786690975387,
"grad_norm": 0.9617101550102234,
"learning_rate": 1.1003603086274584e-06,
"loss": 0.39805036783218384,
"step": 594
},
{
"epoch": 1.7350957155879672,
"grad_norm": 1.1439731121063232,
"learning_rate": 1.07725594881539e-06,
"loss": 0.35753339529037476,
"step": 595
},
{
"epoch": 1.7380127620783956,
"grad_norm": 1.0350618362426758,
"learning_rate": 1.0543829163169516e-06,
"loss": 0.42581748962402344,
"step": 596
},
{
"epoch": 1.740929808568824,
"grad_norm": 1.2865227460861206,
"learning_rate": 1.031741804128773e-06,
"loss": 0.34685325622558594,
"step": 597
},
{
"epoch": 1.7438468550592525,
"grad_norm": 1.2079373598098755,
"learning_rate": 1.0093331992348154e-06,
"loss": 0.48401936888694763,
"step": 598
},
{
"epoch": 1.746763901549681,
"grad_norm": 1.1684436798095703,
"learning_rate": 9.871576825911577e-07,
"loss": 0.387456476688385,
"step": 599
},
{
"epoch": 1.7496809480401094,
"grad_norm": 1.298045039176941,
"learning_rate": 9.65215829110927e-07,
"loss": 0.40196847915649414,
"step": 600
},
{
"epoch": 1.7496809480401094,
"eval_loss": 0.3965963125228882,
"eval_runtime": 912.3102,
"eval_samples_per_second": 0.693,
"eval_steps_per_second": 0.693,
"step": 600
},
{
"epoch": 1.7525979945305379,
"grad_norm": 1.24501371383667,
"learning_rate": 9.435082076493974e-07,
"loss": 0.3990224003791809,
"step": 601
},
{
"epoch": 1.7555150410209661,
"grad_norm": 1.0634632110595703,
"learning_rate": 9.220353809892435e-07,
"loss": 0.44232451915740967,
"step": 602
},
{
"epoch": 1.7584320875113946,
"grad_norm": 1.0276325941085815,
"learning_rate": 9.007979058259475e-07,
"loss": 0.5336061716079712,
"step": 603
},
{
"epoch": 1.761349134001823,
"grad_norm": 1.1488786935806274,
"learning_rate": 8.797963327533698e-07,
"loss": 0.35023194551467896,
"step": 604
},
{
"epoch": 1.7642661804922515,
"grad_norm": 1.171109676361084,
"learning_rate": 8.590312062494699e-07,
"loss": 0.4461829662322998,
"step": 605
},
{
"epoch": 1.76718322698268,
"grad_norm": 1.3948134183883667,
"learning_rate": 8.385030646621938e-07,
"loss": 0.3448236584663391,
"step": 606
},
{
"epoch": 1.7701002734731084,
"grad_norm": 1.144608497619629,
"learning_rate": 8.18212440195515e-07,
"loss": 0.39913487434387207,
"step": 607
},
{
"epoch": 1.7730173199635368,
"grad_norm": 1.1941088438034058,
"learning_rate": 7.981598588956396e-07,
"loss": 0.40005186200141907,
"step": 608
},
{
"epoch": 1.7759343664539653,
"grad_norm": 1.1087690591812134,
"learning_rate": 7.783458406373656e-07,
"loss": 0.38895174860954285,
"step": 609
},
{
"epoch": 1.7788514129443938,
"grad_norm": 1.1787676811218262,
"learning_rate": 7.587708991106069e-07,
"loss": 0.36259594559669495,
"step": 610
},
{
"epoch": 1.7817684594348222,
"grad_norm": 1.1265360116958618,
"learning_rate": 7.394355418070731e-07,
"loss": 0.44475269317626953,
"step": 611
},
{
"epoch": 1.7846855059252507,
"grad_norm": 1.2230898141860962,
"learning_rate": 7.203402700071138e-07,
"loss": 0.3823542594909668,
"step": 612
},
{
"epoch": 1.7876025524156791,
"grad_norm": 1.0893492698669434,
"learning_rate": 7.01485578766724e-07,
"loss": 0.43276944756507874,
"step": 613
},
{
"epoch": 1.7905195989061076,
"grad_norm": 1.039494514465332,
"learning_rate": 6.828719569047082e-07,
"loss": 0.5362570881843567,
"step": 614
},
{
"epoch": 1.793436645396536,
"grad_norm": 1.0307413339614868,
"learning_rate": 6.644998869900054e-07,
"loss": 0.34828731417655945,
"step": 615
},
{
"epoch": 1.7963536918869645,
"grad_norm": 1.1253540515899658,
"learning_rate": 6.463698453291823e-07,
"loss": 0.3669811487197876,
"step": 616
},
{
"epoch": 1.799270738377393,
"grad_norm": 1.1103028059005737,
"learning_rate": 6.28482301954082e-07,
"loss": 0.3868233561515808,
"step": 617
},
{
"epoch": 1.8021877848678214,
"grad_norm": 1.0804798603057861,
"learning_rate": 6.108377206096394e-07,
"loss": 0.4123673439025879,
"step": 618
},
{
"epoch": 1.8051048313582498,
"grad_norm": 1.1068788766860962,
"learning_rate": 5.934365587418567e-07,
"loss": 0.44468799233436584,
"step": 619
},
{
"epoch": 1.8080218778486783,
"grad_norm": 1.0318645238876343,
"learning_rate": 5.762792674859474e-07,
"loss": 0.3586595356464386,
"step": 620
},
{
"epoch": 1.8109389243391067,
"grad_norm": 1.1553035974502563,
"learning_rate": 5.593662916546361e-07,
"loss": 0.4580552577972412,
"step": 621
},
{
"epoch": 1.8138559708295352,
"grad_norm": 1.3010531663894653,
"learning_rate": 5.426980697266271e-07,
"loss": 0.42412641644477844,
"step": 622
},
{
"epoch": 1.8167730173199637,
"grad_norm": 1.1858006715774536,
"learning_rate": 5.262750338352418e-07,
"loss": 0.38257676362991333,
"step": 623
},
{
"epoch": 1.8196900638103921,
"grad_norm": 1.1341536045074463,
"learning_rate": 5.100976097572074e-07,
"loss": 0.48365846276283264,
"step": 624
},
{
"epoch": 1.8226071103008206,
"grad_norm": 1.112844467163086,
"learning_rate": 4.941662169016237e-07,
"loss": 0.3893233835697174,
"step": 625
},
{
"epoch": 1.825524156791249,
"grad_norm": 1.1846497058868408,
"learning_rate": 4.784812682990903e-07,
"loss": 0.38869139552116394,
"step": 626
},
{
"epoch": 1.8284412032816773,
"grad_norm": 1.1383928060531616,
"learning_rate": 4.6304317059099326e-07,
"loss": 0.36156678199768066,
"step": 627
},
{
"epoch": 1.8313582497721057,
"grad_norm": 1.0891298055648804,
"learning_rate": 4.478523240189703e-07,
"loss": 0.40910348296165466,
"step": 628
},
{
"epoch": 1.8342752962625342,
"grad_norm": 1.1337662935256958,
"learning_rate": 4.3290912241452545e-07,
"loss": 0.3360365629196167,
"step": 629
},
{
"epoch": 1.8371923427529626,
"grad_norm": 1.280463695526123,
"learning_rate": 4.182139531888263e-07,
"loss": 0.44318532943725586,
"step": 630
},
{
"epoch": 1.840109389243391,
"grad_norm": 1.1408170461654663,
"learning_rate": 4.0376719732265647e-07,
"loss": 0.37003564834594727,
"step": 631
},
{
"epoch": 1.8430264357338195,
"grad_norm": 0.9730168581008911,
"learning_rate": 3.8956922935653895e-07,
"loss": 0.355985552072525,
"step": 632
},
{
"epoch": 1.845943482224248,
"grad_norm": 1.0643151998519897,
"learning_rate": 3.756204173810263e-07,
"loss": 0.3911808729171753,
"step": 633
},
{
"epoch": 1.8488605287146764,
"grad_norm": 1.1769851446151733,
"learning_rate": 3.61921123027158e-07,
"loss": 0.314385324716568,
"step": 634
},
{
"epoch": 1.8517775752051049,
"grad_norm": 0.921336829662323,
"learning_rate": 3.484717014570838e-07,
"loss": 0.3375144302845001,
"step": 635
},
{
"epoch": 1.8546946216955331,
"grad_norm": 0.9904773235321045,
"learning_rate": 3.3527250135485744e-07,
"loss": 0.4461369514465332,
"step": 636
},
{
"epoch": 1.8576116681859616,
"grad_norm": 1.0844534635543823,
"learning_rate": 3.223238649173954e-07,
"loss": 0.398414671421051,
"step": 637
},
{
"epoch": 1.86052871467639,
"grad_norm": 0.9829220771789551,
"learning_rate": 3.096261278456048e-07,
"loss": 0.35938704013824463,
"step": 638
},
{
"epoch": 1.8634457611668185,
"grad_norm": 1.13048255443573,
"learning_rate": 2.971796193356835e-07,
"loss": 0.3783624768257141,
"step": 639
},
{
"epoch": 1.866362807657247,
"grad_norm": 1.4307893514633179,
"learning_rate": 2.8498466207058095e-07,
"loss": 0.3601874113082886,
"step": 640
},
{
"epoch": 1.8692798541476754,
"grad_norm": 1.1835116147994995,
"learning_rate": 2.7304157221163753e-07,
"loss": 0.43897169828414917,
"step": 641
},
{
"epoch": 1.8721969006381038,
"grad_norm": 1.0730469226837158,
"learning_rate": 2.613506593903825e-07,
"loss": 0.4407995343208313,
"step": 642
},
{
"epoch": 1.8751139471285323,
"grad_norm": 0.9504678845405579,
"learning_rate": 2.499122267005105e-07,
"loss": 0.4105035960674286,
"step": 643
},
{
"epoch": 1.8780309936189608,
"grad_norm": 1.2599385976791382,
"learning_rate": 2.387265706900199e-07,
"loss": 0.41521430015563965,
"step": 644
},
{
"epoch": 1.8809480401093892,
"grad_norm": 1.035783052444458,
"learning_rate": 2.2779398135353127e-07,
"loss": 0.33491846919059753,
"step": 645
},
{
"epoch": 1.8838650865998177,
"grad_norm": 1.1612690687179565,
"learning_rate": 2.1711474212476325e-07,
"loss": 0.3367970287799835,
"step": 646
},
{
"epoch": 1.8867821330902461,
"grad_norm": 1.2541207075119019,
"learning_rate": 2.066891298691831e-07,
"loss": 0.46374717354774475,
"step": 647
},
{
"epoch": 1.8896991795806746,
"grad_norm": 1.1037088632583618,
"learning_rate": 1.9651741487683562e-07,
"loss": 0.3799871802330017,
"step": 648
},
{
"epoch": 1.892616226071103,
"grad_norm": 1.3611476421356201,
"learning_rate": 1.8659986085532988e-07,
"loss": 0.40523889660835266,
"step": 649
},
{
"epoch": 1.8955332725615315,
"grad_norm": 1.1628823280334473,
"learning_rate": 1.7693672492300473e-07,
"loss": 0.38399839401245117,
"step": 650
},
{
"epoch": 1.8955332725615315,
"eval_loss": 0.3949255049228668,
"eval_runtime": 903.6455,
"eval_samples_per_second": 0.699,
"eval_steps_per_second": 0.699,
"step": 650
},
{
"epoch": 1.89845031905196,
"grad_norm": 1.1185522079467773,
"learning_rate": 1.675282576022641e-07,
"loss": 0.4280855059623718,
"step": 651
},
{
"epoch": 1.9013673655423884,
"grad_norm": 1.1962717771530151,
"learning_rate": 1.5837470281307666e-07,
"loss": 0.3026162087917328,
"step": 652
},
{
"epoch": 1.9042844120328168,
"grad_norm": 1.1818240880966187,
"learning_rate": 1.4947629786666084e-07,
"loss": 0.43283963203430176,
"step": 653
},
{
"epoch": 1.9072014585232453,
"grad_norm": 1.161944031715393,
"learning_rate": 1.4083327345932208e-07,
"loss": 0.435259610414505,
"step": 654
},
{
"epoch": 1.9101185050136738,
"grad_norm": 1.1311709880828857,
"learning_rate": 1.32445853666483e-07,
"loss": 0.3258042633533478,
"step": 655
},
{
"epoch": 1.9130355515041022,
"grad_norm": 1.0152852535247803,
"learning_rate": 1.2431425593686263e-07,
"loss": 0.40951770544052124,
"step": 656
},
{
"epoch": 1.9159525979945307,
"grad_norm": 1.2698794603347778,
"learning_rate": 1.164386910868498e-07,
"loss": 0.3610893785953522,
"step": 657
},
{
"epoch": 1.9188696444849591,
"grad_norm": 1.1092722415924072,
"learning_rate": 1.0881936329502851e-07,
"loss": 0.31951773166656494,
"step": 658
},
{
"epoch": 1.9217866909753876,
"grad_norm": 1.2378597259521484,
"learning_rate": 1.0145647009689008e-07,
"loss": 0.3756055235862732,
"step": 659
},
{
"epoch": 1.924703737465816,
"grad_norm": 1.0100237131118774,
"learning_rate": 9.43502023797116e-08,
"loss": 0.26117536425590515,
"step": 660
},
{
"epoch": 1.9276207839562443,
"grad_norm": 1.2368487119674683,
"learning_rate": 8.750074437760325e-08,
"loss": 0.3092282712459564,
"step": 661
},
{
"epoch": 1.9305378304466727,
"grad_norm": 1.0328837633132935,
"learning_rate": 8.090827366673548e-08,
"loss": 0.4076297879219055,
"step": 662
},
{
"epoch": 1.9334548769371012,
"grad_norm": 0.9885771870613098,
"learning_rate": 7.457296116073487e-08,
"loss": 0.40007251501083374,
"step": 663
},
{
"epoch": 1.9363719234275296,
"grad_norm": 1.19287109375,
"learning_rate": 6.849497110625214e-08,
"loss": 0.3751019239425659,
"step": 664
},
{
"epoch": 1.939288969917958,
"grad_norm": 1.134682536125183,
"learning_rate": 6.267446107870334e-08,
"loss": 0.4558236300945282,
"step": 665
},
{
"epoch": 1.9422060164083865,
"grad_norm": 3.414883852005005,
"learning_rate": 5.7111581978185336e-08,
"loss": 0.5070392489433289,
"step": 666
},
{
"epoch": 1.945123062898815,
"grad_norm": 1.179479956626892,
"learning_rate": 5.180647802556671e-08,
"loss": 0.389989972114563,
"step": 667
},
{
"epoch": 1.9480401093892434,
"grad_norm": 1.1473273038864136,
"learning_rate": 4.675928675874186e-08,
"loss": 0.460910826921463,
"step": 668
},
{
"epoch": 1.9509571558796717,
"grad_norm": 0.9269355535507202,
"learning_rate": 4.197013902907165e-08,
"loss": 0.5488728284835815,
"step": 669
},
{
"epoch": 1.9538742023701001,
"grad_norm": 1.1781370639801025,
"learning_rate": 3.7439158997989445e-08,
"loss": 0.39483463764190674,
"step": 670
},
{
"epoch": 1.9567912488605286,
"grad_norm": 1.1759430170059204,
"learning_rate": 3.316646413377811e-08,
"loss": 0.38600990176200867,
"step": 671
},
{
"epoch": 1.959708295350957,
"grad_norm": 1.1981792449951172,
"learning_rate": 2.9152165208529147e-08,
"loss": 0.4657193422317505,
"step": 672
},
{
"epoch": 1.9626253418413855,
"grad_norm": 1.186043620109558,
"learning_rate": 2.5396366295272756e-08,
"loss": 0.46212077140808105,
"step": 673
},
{
"epoch": 1.965542388331814,
"grad_norm": 1.115103840827942,
"learning_rate": 2.1899164765271096e-08,
"loss": 0.4416077733039856,
"step": 674
},
{
"epoch": 1.9684594348222424,
"grad_norm": 1.2150691747665405,
"learning_rate": 1.866065128550365e-08,
"loss": 0.3557685911655426,
"step": 675
},
{
"epoch": 1.9713764813126708,
"grad_norm": 1.096506953239441,
"learning_rate": 1.5680909816309098e-08,
"loss": 0.32865390181541443,
"step": 676
},
{
"epoch": 1.9742935278030993,
"grad_norm": 1.0974191427230835,
"learning_rate": 1.2960017609213727e-08,
"loss": 0.37568721175193787,
"step": 677
},
{
"epoch": 1.9772105742935278,
"grad_norm": 1.1290082931518555,
"learning_rate": 1.0498045204924145e-08,
"loss": 0.329836905002594,
"step": 678
},
{
"epoch": 1.9801276207839562,
"grad_norm": 1.0609803199768066,
"learning_rate": 8.295056431504301e-09,
"loss": 0.2694982886314392,
"step": 679
},
{
"epoch": 1.9830446672743847,
"grad_norm": 0.9838472604751587,
"learning_rate": 6.3511084027156885e-09,
"loss": 0.4270719587802887,
"step": 680
},
{
"epoch": 1.9859617137648131,
"grad_norm": 1.1900098323822021,
"learning_rate": 4.666251516536324e-09,
"loss": 0.4060650169849396,
"step": 681
},
{
"epoch": 1.9888787602552416,
"grad_norm": 0.9812174439430237,
"learning_rate": 3.2405294538606637e-09,
"loss": 0.3900409936904907,
"step": 682
},
{
"epoch": 1.99179580674567,
"grad_norm": 1.1988210678100586,
"learning_rate": 2.073979177357188e-09,
"loss": 0.3999583125114441,
"step": 683
},
{
"epoch": 1.9947128532360985,
"grad_norm": 0.9738736152648926,
"learning_rate": 1.1666309305202738e-09,
"loss": 0.46780622005462646,
"step": 684
},
{
"epoch": 1.997629899726527,
"grad_norm": 0.9841824173927307,
"learning_rate": 5.18508236878601e-10,
"loss": 0.4595794975757599,
"step": 685
},
{
"epoch": 2.0,
"grad_norm": 1.0865421295166016,
"learning_rate": 1.2962789938897323e-10,
"loss": 0.5136060118675232,
"step": 686
}
],
"logging_steps": 1,
"max_steps": 686,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.317102071220797e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}