diff --git "a/sft_qwen_14B/checkpoints/checkpoint-2000/trainer_state.json" "b/sft_qwen_14B/checkpoints/checkpoint-2000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/sft_qwen_14B/checkpoints/checkpoint-2000/trainer_state.json" @@ -0,0 +1,7203 @@ +{ + "best_global_step": 2000, + "best_metric": 0.8567262887954712, + "best_model_checkpoint": "runs/instruct_run_14b_v1/checkpoints/checkpoint-2000", + "epoch": 0.8629989212513485, + "eval_steps": 100, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008629989212513484, + "grad_norm": 0.36567428708076477, + "learning_rate": 1.7969451931716084e-07, + "loss": 1.6794371604919434, + "step": 2 + }, + { + "epoch": 0.001725997842502697, + "grad_norm": 0.4024646580219269, + "learning_rate": 5.390835579514825e-07, + "loss": 1.6853073835372925, + "step": 4 + }, + { + "epoch": 0.0025889967637540453, + "grad_norm": 0.40199393033981323, + "learning_rate": 8.984725965858042e-07, + "loss": 1.7621158361434937, + "step": 6 + }, + { + "epoch": 0.003451995685005394, + "grad_norm": 0.35409677028656006, + "learning_rate": 1.257861635220126e-06, + "loss": 1.633257269859314, + "step": 8 + }, + { + "epoch": 0.004314994606256742, + "grad_norm": 0.39087551832199097, + "learning_rate": 1.6172506738544475e-06, + "loss": 1.7374768257141113, + "step": 10 + }, + { + "epoch": 0.005177993527508091, + "grad_norm": 0.3586857318878174, + "learning_rate": 1.9766397124887693e-06, + "loss": 1.6955714225769043, + "step": 12 + }, + { + "epoch": 0.006040992448759439, + "grad_norm": 0.32755669951438904, + "learning_rate": 2.3360287511230908e-06, + "loss": 1.720664381980896, + "step": 14 + }, + { + "epoch": 0.006903991370010788, + "grad_norm": 0.4054872691631317, + "learning_rate": 2.6954177897574127e-06, + "loss": 1.6957035064697266, + "step": 16 + }, + { + "epoch": 0.007766990291262136, + "grad_norm": 0.37593814730644226, + "learning_rate": 3.0548068283917343e-06, + "loss": 1.7286947965621948, + "step": 18 + }, + { + "epoch": 0.008629989212513484, + "grad_norm": 0.3344813287258148, + "learning_rate": 3.414195867026056e-06, + "loss": 1.727295994758606, + "step": 20 + }, + { + "epoch": 0.009492988133764833, + "grad_norm": 0.357474148273468, + "learning_rate": 3.7735849056603773e-06, + "loss": 1.6727914810180664, + "step": 22 + }, + { + "epoch": 0.010355987055016181, + "grad_norm": 0.39115726947784424, + "learning_rate": 4.132973944294699e-06, + "loss": 1.6518884897232056, + "step": 24 + }, + { + "epoch": 0.01121898597626753, + "grad_norm": 0.4711727201938629, + "learning_rate": 4.492362982929021e-06, + "loss": 1.7868088483810425, + "step": 26 + }, + { + "epoch": 0.012081984897518877, + "grad_norm": 0.34112176299095154, + "learning_rate": 4.851752021563342e-06, + "loss": 1.6127634048461914, + "step": 28 + }, + { + "epoch": 0.012944983818770227, + "grad_norm": 0.5071991682052612, + "learning_rate": 5.211141060197664e-06, + "loss": 1.7858378887176514, + "step": 30 + }, + { + "epoch": 0.013807982740021575, + "grad_norm": 0.42048847675323486, + "learning_rate": 5.570530098831986e-06, + "loss": 1.7123326063156128, + "step": 32 + }, + { + "epoch": 0.014670981661272923, + "grad_norm": 0.48883870244026184, + "learning_rate": 5.929919137466308e-06, + "loss": 1.737749695777893, + "step": 34 + }, + { + "epoch": 0.015533980582524271, + "grad_norm": 0.3311465084552765, + "learning_rate": 6.289308176100629e-06, + "loss": 1.5578981637954712, + "step": 36 + }, + { + "epoch": 0.01639697950377562, + "grad_norm": 0.5178973078727722, + "learning_rate": 6.64869721473495e-06, + "loss": 1.719806432723999, + "step": 38 + }, + { + "epoch": 0.017259978425026967, + "grad_norm": 0.47097742557525635, + "learning_rate": 7.008086253369272e-06, + "loss": 1.728212833404541, + "step": 40 + }, + { + "epoch": 0.018122977346278317, + "grad_norm": 0.5051584243774414, + "learning_rate": 7.367475292003594e-06, + "loss": 1.6542466878890991, + "step": 42 + }, + { + "epoch": 0.018985976267529667, + "grad_norm": 0.4645111560821533, + "learning_rate": 7.726864330637915e-06, + "loss": 1.7087690830230713, + "step": 44 + }, + { + "epoch": 0.019848975188781013, + "grad_norm": 0.5184999704360962, + "learning_rate": 8.086253369272237e-06, + "loss": 1.7018946409225464, + "step": 46 + }, + { + "epoch": 0.020711974110032363, + "grad_norm": 0.4543815851211548, + "learning_rate": 8.44564240790656e-06, + "loss": 1.6818269491195679, + "step": 48 + }, + { + "epoch": 0.021574973031283712, + "grad_norm": 0.44411996006965637, + "learning_rate": 8.80503144654088e-06, + "loss": 1.5772877931594849, + "step": 50 + }, + { + "epoch": 0.02243797195253506, + "grad_norm": 0.3409404158592224, + "learning_rate": 9.164420485175203e-06, + "loss": 1.498152732849121, + "step": 52 + }, + { + "epoch": 0.02330097087378641, + "grad_norm": 0.42104434967041016, + "learning_rate": 9.523809523809523e-06, + "loss": 1.6189048290252686, + "step": 54 + }, + { + "epoch": 0.024163969795037755, + "grad_norm": 0.3756246268749237, + "learning_rate": 9.883198562443846e-06, + "loss": 1.4596441984176636, + "step": 56 + }, + { + "epoch": 0.025026968716289105, + "grad_norm": 0.36214128136634827, + "learning_rate": 1.0242587601078168e-05, + "loss": 1.503880500793457, + "step": 58 + }, + { + "epoch": 0.025889967637540454, + "grad_norm": 0.40893009305000305, + "learning_rate": 1.060197663971249e-05, + "loss": 1.5912823677062988, + "step": 60 + }, + { + "epoch": 0.0267529665587918, + "grad_norm": 0.28710272908210754, + "learning_rate": 1.0961365678346811e-05, + "loss": 1.2956721782684326, + "step": 62 + }, + { + "epoch": 0.02761596548004315, + "grad_norm": 0.304573118686676, + "learning_rate": 1.1320754716981132e-05, + "loss": 1.4648056030273438, + "step": 64 + }, + { + "epoch": 0.0284789644012945, + "grad_norm": 0.36523914337158203, + "learning_rate": 1.1680143755615454e-05, + "loss": 1.6078968048095703, + "step": 66 + }, + { + "epoch": 0.029341963322545846, + "grad_norm": 0.37929031252861023, + "learning_rate": 1.2039532794249775e-05, + "loss": 1.5969421863555908, + "step": 68 + }, + { + "epoch": 0.030204962243797196, + "grad_norm": 0.3053947389125824, + "learning_rate": 1.2398921832884097e-05, + "loss": 1.4312325716018677, + "step": 70 + }, + { + "epoch": 0.031067961165048542, + "grad_norm": 0.3028779923915863, + "learning_rate": 1.275831087151842e-05, + "loss": 1.4101300239562988, + "step": 72 + }, + { + "epoch": 0.03193096008629989, + "grad_norm": 0.29649803042411804, + "learning_rate": 1.3117699910152742e-05, + "loss": 1.4553817510604858, + "step": 74 + }, + { + "epoch": 0.03279395900755124, + "grad_norm": 0.26032644510269165, + "learning_rate": 1.3477088948787062e-05, + "loss": 1.4623000621795654, + "step": 76 + }, + { + "epoch": 0.03365695792880259, + "grad_norm": 0.33558446168899536, + "learning_rate": 1.3836477987421385e-05, + "loss": 1.5181745290756226, + "step": 78 + }, + { + "epoch": 0.034519956850053934, + "grad_norm": 0.28307804465293884, + "learning_rate": 1.4195867026055706e-05, + "loss": 1.4397861957550049, + "step": 80 + }, + { + "epoch": 0.035382955771305284, + "grad_norm": 0.3451690673828125, + "learning_rate": 1.455525606469003e-05, + "loss": 1.463841199874878, + "step": 82 + }, + { + "epoch": 0.036245954692556634, + "grad_norm": 0.3248669505119324, + "learning_rate": 1.4914645103324348e-05, + "loss": 1.3554227352142334, + "step": 84 + }, + { + "epoch": 0.037108953613807984, + "grad_norm": 0.2855011224746704, + "learning_rate": 1.527403414195867e-05, + "loss": 1.2810425758361816, + "step": 86 + }, + { + "epoch": 0.03797195253505933, + "grad_norm": 0.33365535736083984, + "learning_rate": 1.5633423180592992e-05, + "loss": 1.428163766860962, + "step": 88 + }, + { + "epoch": 0.038834951456310676, + "grad_norm": 0.34099438786506653, + "learning_rate": 1.5992812219227316e-05, + "loss": 1.3487578630447388, + "step": 90 + }, + { + "epoch": 0.039697950377562026, + "grad_norm": 0.39247506856918335, + "learning_rate": 1.6352201257861635e-05, + "loss": 1.30057954788208, + "step": 92 + }, + { + "epoch": 0.040560949298813376, + "grad_norm": 0.32692041993141174, + "learning_rate": 1.671159029649596e-05, + "loss": 1.2923580408096313, + "step": 94 + }, + { + "epoch": 0.041423948220064725, + "grad_norm": 0.43452519178390503, + "learning_rate": 1.707097933513028e-05, + "loss": 1.5002273321151733, + "step": 96 + }, + { + "epoch": 0.042286947141316075, + "grad_norm": 0.3251534402370453, + "learning_rate": 1.7430368373764602e-05, + "loss": 1.330254077911377, + "step": 98 + }, + { + "epoch": 0.043149946062567425, + "grad_norm": 0.3198273479938507, + "learning_rate": 1.778975741239892e-05, + "loss": 1.3054943084716797, + "step": 100 + }, + { + "epoch": 0.043149946062567425, + "eval_loss": 1.366738200187683, + "eval_runtime": 651.8198, + "eval_samples_per_second": 3.16, + "eval_steps_per_second": 3.16, + "step": 100 + }, + { + "epoch": 0.04401294498381877, + "grad_norm": 0.37364065647125244, + "learning_rate": 1.8149146451033245e-05, + "loss": 1.314281940460205, + "step": 102 + }, + { + "epoch": 0.04487594390507012, + "grad_norm": 0.39384758472442627, + "learning_rate": 1.8508535489667568e-05, + "loss": 1.2737246751785278, + "step": 104 + }, + { + "epoch": 0.04573894282632147, + "grad_norm": 0.3521905541419983, + "learning_rate": 1.8867924528301888e-05, + "loss": 1.3113226890563965, + "step": 106 + }, + { + "epoch": 0.04660194174757282, + "grad_norm": 0.33531463146209717, + "learning_rate": 1.9227313566936208e-05, + "loss": 1.3253653049468994, + "step": 108 + }, + { + "epoch": 0.04746494066882417, + "grad_norm": 0.35596340894699097, + "learning_rate": 1.958670260557053e-05, + "loss": 1.3236849308013916, + "step": 110 + }, + { + "epoch": 0.04832793959007551, + "grad_norm": 0.36028242111206055, + "learning_rate": 1.9946091644204854e-05, + "loss": 1.183128833770752, + "step": 112 + }, + { + "epoch": 0.04919093851132686, + "grad_norm": 0.42109814286231995, + "learning_rate": 2.0305480682839174e-05, + "loss": 1.2741888761520386, + "step": 114 + }, + { + "epoch": 0.05005393743257821, + "grad_norm": 0.39675939083099365, + "learning_rate": 2.0664869721473494e-05, + "loss": 1.3050109148025513, + "step": 116 + }, + { + "epoch": 0.05091693635382956, + "grad_norm": 0.4414141774177551, + "learning_rate": 2.1024258760107817e-05, + "loss": 1.2472094297409058, + "step": 118 + }, + { + "epoch": 0.05177993527508091, + "grad_norm": 0.42872729897499084, + "learning_rate": 2.138364779874214e-05, + "loss": 1.3338921070098877, + "step": 120 + }, + { + "epoch": 0.05264293419633225, + "grad_norm": 0.38336244225502014, + "learning_rate": 2.174303683737646e-05, + "loss": 1.322908878326416, + "step": 122 + }, + { + "epoch": 0.0535059331175836, + "grad_norm": 0.41046878695487976, + "learning_rate": 2.2102425876010783e-05, + "loss": 1.2169240713119507, + "step": 124 + }, + { + "epoch": 0.05436893203883495, + "grad_norm": 0.39460113644599915, + "learning_rate": 2.2461814914645103e-05, + "loss": 1.2085309028625488, + "step": 126 + }, + { + "epoch": 0.0552319309600863, + "grad_norm": 0.42829909920692444, + "learning_rate": 2.2821203953279426e-05, + "loss": 1.2969133853912354, + "step": 128 + }, + { + "epoch": 0.05609492988133765, + "grad_norm": 0.3940851390361786, + "learning_rate": 2.3180592991913746e-05, + "loss": 1.1892330646514893, + "step": 130 + }, + { + "epoch": 0.056957928802589, + "grad_norm": 0.45011839270591736, + "learning_rate": 2.353998203054807e-05, + "loss": 1.2082979679107666, + "step": 132 + }, + { + "epoch": 0.05782092772384034, + "grad_norm": 0.46059420704841614, + "learning_rate": 2.3899371069182393e-05, + "loss": 1.2388817071914673, + "step": 134 + }, + { + "epoch": 0.05868392664509169, + "grad_norm": 0.41085872054100037, + "learning_rate": 2.4258760107816713e-05, + "loss": 1.193917155265808, + "step": 136 + }, + { + "epoch": 0.05954692556634304, + "grad_norm": 0.4024205207824707, + "learning_rate": 2.4618149146451032e-05, + "loss": 1.1514034271240234, + "step": 138 + }, + { + "epoch": 0.06040992448759439, + "grad_norm": 0.3893793523311615, + "learning_rate": 2.4977538185085356e-05, + "loss": 1.1626157760620117, + "step": 140 + }, + { + "epoch": 0.06127292340884574, + "grad_norm": 0.4456317126750946, + "learning_rate": 2.5336927223719675e-05, + "loss": 1.1627076864242554, + "step": 142 + }, + { + "epoch": 0.062135922330097085, + "grad_norm": 0.5050215125083923, + "learning_rate": 2.5696316262354e-05, + "loss": 1.3038755655288696, + "step": 144 + }, + { + "epoch": 0.06299892125134844, + "grad_norm": 0.4071207642555237, + "learning_rate": 2.605570530098832e-05, + "loss": 1.1708844900131226, + "step": 146 + }, + { + "epoch": 0.06386192017259978, + "grad_norm": 0.4363228678703308, + "learning_rate": 2.641509433962264e-05, + "loss": 1.2149070501327515, + "step": 148 + }, + { + "epoch": 0.06472491909385113, + "grad_norm": 0.4436556398868561, + "learning_rate": 2.6774483378256965e-05, + "loss": 1.1942368745803833, + "step": 150 + }, + { + "epoch": 0.06558791801510248, + "grad_norm": 0.4068629741668701, + "learning_rate": 2.7133872416891288e-05, + "loss": 1.1799161434173584, + "step": 152 + }, + { + "epoch": 0.06645091693635383, + "grad_norm": 0.5291106700897217, + "learning_rate": 2.7493261455525608e-05, + "loss": 1.1832845211029053, + "step": 154 + }, + { + "epoch": 0.06731391585760518, + "grad_norm": 0.4410109221935272, + "learning_rate": 2.785265049415993e-05, + "loss": 1.1696993112564087, + "step": 156 + }, + { + "epoch": 0.06817691477885653, + "grad_norm": 0.4858371913433075, + "learning_rate": 2.8212039532794248e-05, + "loss": 1.2036973237991333, + "step": 158 + }, + { + "epoch": 0.06903991370010787, + "grad_norm": 0.45373693108558655, + "learning_rate": 2.857142857142857e-05, + "loss": 1.1145079135894775, + "step": 160 + }, + { + "epoch": 0.06990291262135923, + "grad_norm": 0.4881038963794708, + "learning_rate": 2.8930817610062894e-05, + "loss": 1.173502802848816, + "step": 162 + }, + { + "epoch": 0.07076591154261057, + "grad_norm": 0.576934814453125, + "learning_rate": 2.9290206648697217e-05, + "loss": 1.250414490699768, + "step": 164 + }, + { + "epoch": 0.07162891046386193, + "grad_norm": 0.4900001287460327, + "learning_rate": 2.9649595687331537e-05, + "loss": 1.0721495151519775, + "step": 166 + }, + { + "epoch": 0.07249190938511327, + "grad_norm": 0.4440019726753235, + "learning_rate": 3.000898472596586e-05, + "loss": 1.0689374208450317, + "step": 168 + }, + { + "epoch": 0.07335490830636461, + "grad_norm": 0.4267268180847168, + "learning_rate": 3.0368373764600184e-05, + "loss": 1.2095128297805786, + "step": 170 + }, + { + "epoch": 0.07421790722761597, + "grad_norm": 0.6062787771224976, + "learning_rate": 3.0727762803234503e-05, + "loss": 1.077776551246643, + "step": 172 + }, + { + "epoch": 0.07508090614886731, + "grad_norm": 0.49510180950164795, + "learning_rate": 3.108715184186882e-05, + "loss": 1.144006371498108, + "step": 174 + }, + { + "epoch": 0.07594390507011867, + "grad_norm": 0.4670701026916504, + "learning_rate": 3.144654088050314e-05, + "loss": 1.1663392782211304, + "step": 176 + }, + { + "epoch": 0.07680690399137001, + "grad_norm": 0.5615383386611938, + "learning_rate": 3.1805929919137466e-05, + "loss": 1.1665973663330078, + "step": 178 + }, + { + "epoch": 0.07766990291262135, + "grad_norm": 0.47305551171302795, + "learning_rate": 3.216531895777179e-05, + "loss": 1.1337063312530518, + "step": 180 + }, + { + "epoch": 0.07853290183387271, + "grad_norm": 0.5127068758010864, + "learning_rate": 3.252470799640611e-05, + "loss": 1.072874903678894, + "step": 182 + }, + { + "epoch": 0.07939590075512405, + "grad_norm": 0.632448136806488, + "learning_rate": 3.2884097035040436e-05, + "loss": 1.1577240228652954, + "step": 184 + }, + { + "epoch": 0.08025889967637541, + "grad_norm": 0.4041025638580322, + "learning_rate": 3.324348607367476e-05, + "loss": 1.1186822652816772, + "step": 186 + }, + { + "epoch": 0.08112189859762675, + "grad_norm": 0.5239102244377136, + "learning_rate": 3.3602875112309076e-05, + "loss": 1.1468429565429688, + "step": 188 + }, + { + "epoch": 0.08198489751887811, + "grad_norm": 0.4486575424671173, + "learning_rate": 3.39622641509434e-05, + "loss": 1.0017019510269165, + "step": 190 + }, + { + "epoch": 0.08284789644012945, + "grad_norm": 0.4994317293167114, + "learning_rate": 3.4321653189577715e-05, + "loss": 1.1901532411575317, + "step": 192 + }, + { + "epoch": 0.0837108953613808, + "grad_norm": 0.5023699998855591, + "learning_rate": 3.468104222821204e-05, + "loss": 1.1398564577102661, + "step": 194 + }, + { + "epoch": 0.08457389428263215, + "grad_norm": 0.5077701807022095, + "learning_rate": 3.504043126684636e-05, + "loss": 1.1390413045883179, + "step": 196 + }, + { + "epoch": 0.0854368932038835, + "grad_norm": 0.5527892112731934, + "learning_rate": 3.5399820305480685e-05, + "loss": 1.1411432027816772, + "step": 198 + }, + { + "epoch": 0.08629989212513485, + "grad_norm": 0.5572488903999329, + "learning_rate": 3.575920934411501e-05, + "loss": 1.071260690689087, + "step": 200 + }, + { + "epoch": 0.08629989212513485, + "eval_loss": 1.1519012451171875, + "eval_runtime": 654.6055, + "eval_samples_per_second": 3.147, + "eval_steps_per_second": 3.147, + "step": 200 + }, + { + "epoch": 0.08716289104638619, + "grad_norm": 0.5134095549583435, + "learning_rate": 3.611859838274933e-05, + "loss": 1.138135552406311, + "step": 202 + }, + { + "epoch": 0.08802588996763754, + "grad_norm": 0.5166040658950806, + "learning_rate": 3.647798742138365e-05, + "loss": 1.111999273300171, + "step": 204 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 0.5336993336677551, + "learning_rate": 3.683737646001797e-05, + "loss": 1.1031352281570435, + "step": 206 + }, + { + "epoch": 0.08975188781014024, + "grad_norm": 0.8289600014686584, + "learning_rate": 3.7196765498652294e-05, + "loss": 1.0388667583465576, + "step": 208 + }, + { + "epoch": 0.09061488673139159, + "grad_norm": 0.47992637753486633, + "learning_rate": 3.755615453728661e-05, + "loss": 1.0950241088867188, + "step": 210 + }, + { + "epoch": 0.09147788565264293, + "grad_norm": 0.5629691481590271, + "learning_rate": 3.7915543575920934e-05, + "loss": 1.0361733436584473, + "step": 212 + }, + { + "epoch": 0.09234088457389428, + "grad_norm": 0.5515111684799194, + "learning_rate": 3.827493261455526e-05, + "loss": 1.0922447443008423, + "step": 214 + }, + { + "epoch": 0.09320388349514563, + "grad_norm": 0.5078643560409546, + "learning_rate": 3.863432165318958e-05, + "loss": 1.0866856575012207, + "step": 216 + }, + { + "epoch": 0.09406688241639698, + "grad_norm": 0.6046127676963806, + "learning_rate": 3.8993710691823904e-05, + "loss": 1.1231595277786255, + "step": 218 + }, + { + "epoch": 0.09492988133764833, + "grad_norm": 0.6255762577056885, + "learning_rate": 3.935309973045822e-05, + "loss": 1.099171757698059, + "step": 220 + }, + { + "epoch": 0.09579288025889968, + "grad_norm": 0.6036638021469116, + "learning_rate": 3.971248876909254e-05, + "loss": 1.0557761192321777, + "step": 222 + }, + { + "epoch": 0.09665587918015102, + "grad_norm": 0.5520529747009277, + "learning_rate": 4.0071877807726867e-05, + "loss": 1.0467877388000488, + "step": 224 + }, + { + "epoch": 0.09751887810140238, + "grad_norm": 0.5958684682846069, + "learning_rate": 4.043126684636119e-05, + "loss": 1.17941153049469, + "step": 226 + }, + { + "epoch": 0.09838187702265372, + "grad_norm": 0.5283281803131104, + "learning_rate": 4.079065588499551e-05, + "loss": 1.104217767715454, + "step": 228 + }, + { + "epoch": 0.09924487594390508, + "grad_norm": 0.5608792901039124, + "learning_rate": 4.115004492362983e-05, + "loss": 1.0900640487670898, + "step": 230 + }, + { + "epoch": 0.10010787486515642, + "grad_norm": 0.555964469909668, + "learning_rate": 4.150943396226415e-05, + "loss": 0.9887422323226929, + "step": 232 + }, + { + "epoch": 0.10097087378640776, + "grad_norm": 0.5875785946846008, + "learning_rate": 4.1868823000898476e-05, + "loss": 1.1298567056655884, + "step": 234 + }, + { + "epoch": 0.10183387270765912, + "grad_norm": 0.4544795751571655, + "learning_rate": 4.222821203953279e-05, + "loss": 1.0957067012786865, + "step": 236 + }, + { + "epoch": 0.10269687162891046, + "grad_norm": 0.564145565032959, + "learning_rate": 4.2587601078167116e-05, + "loss": 1.0328738689422607, + "step": 238 + }, + { + "epoch": 0.10355987055016182, + "grad_norm": 0.6285979747772217, + "learning_rate": 4.294699011680144e-05, + "loss": 1.1085515022277832, + "step": 240 + }, + { + "epoch": 0.10442286947141316, + "grad_norm": 0.6442288756370544, + "learning_rate": 4.330637915543576e-05, + "loss": 1.1291271448135376, + "step": 242 + }, + { + "epoch": 0.1052858683926645, + "grad_norm": 0.6137154698371887, + "learning_rate": 4.3665768194070085e-05, + "loss": 1.1759567260742188, + "step": 244 + }, + { + "epoch": 0.10614886731391586, + "grad_norm": 0.5906805992126465, + "learning_rate": 4.402515723270441e-05, + "loss": 1.148414969444275, + "step": 246 + }, + { + "epoch": 0.1070118662351672, + "grad_norm": 0.5382888913154602, + "learning_rate": 4.438454627133873e-05, + "loss": 1.0749616622924805, + "step": 248 + }, + { + "epoch": 0.10787486515641856, + "grad_norm": 0.6185492873191833, + "learning_rate": 4.474393530997305e-05, + "loss": 1.2235801219940186, + "step": 250 + }, + { + "epoch": 0.1087378640776699, + "grad_norm": 0.5981597900390625, + "learning_rate": 4.5103324348607365e-05, + "loss": 1.1390639543533325, + "step": 252 + }, + { + "epoch": 0.10960086299892124, + "grad_norm": 0.5664694905281067, + "learning_rate": 4.546271338724169e-05, + "loss": 1.171774983406067, + "step": 254 + }, + { + "epoch": 0.1104638619201726, + "grad_norm": 0.7071851491928101, + "learning_rate": 4.582210242587601e-05, + "loss": 1.1704237461090088, + "step": 256 + }, + { + "epoch": 0.11132686084142394, + "grad_norm": 0.5815614461898804, + "learning_rate": 4.6181491464510334e-05, + "loss": 1.0619677305221558, + "step": 258 + }, + { + "epoch": 0.1121898597626753, + "grad_norm": 0.6481915712356567, + "learning_rate": 4.654088050314466e-05, + "loss": 1.0824390649795532, + "step": 260 + }, + { + "epoch": 0.11305285868392664, + "grad_norm": 0.5988591313362122, + "learning_rate": 4.690026954177898e-05, + "loss": 1.087929606437683, + "step": 262 + }, + { + "epoch": 0.113915857605178, + "grad_norm": 0.6545296311378479, + "learning_rate": 4.7259658580413304e-05, + "loss": 1.0936195850372314, + "step": 264 + }, + { + "epoch": 0.11477885652642934, + "grad_norm": 0.5826204419136047, + "learning_rate": 4.761904761904762e-05, + "loss": 1.0433681011199951, + "step": 266 + }, + { + "epoch": 0.11564185544768069, + "grad_norm": 0.5907514095306396, + "learning_rate": 4.7978436657681944e-05, + "loss": 1.0719536542892456, + "step": 268 + }, + { + "epoch": 0.11650485436893204, + "grad_norm": 0.524394154548645, + "learning_rate": 4.833782569631627e-05, + "loss": 1.0231504440307617, + "step": 270 + }, + { + "epoch": 0.11736785329018339, + "grad_norm": 0.5472846031188965, + "learning_rate": 4.869721473495058e-05, + "loss": 0.9905915260314941, + "step": 272 + }, + { + "epoch": 0.11823085221143474, + "grad_norm": 0.727922260761261, + "learning_rate": 4.9056603773584906e-05, + "loss": 1.213677167892456, + "step": 274 + }, + { + "epoch": 0.11909385113268608, + "grad_norm": 0.6009684801101685, + "learning_rate": 4.941599281221923e-05, + "loss": 1.0052144527435303, + "step": 276 + }, + { + "epoch": 0.11995685005393743, + "grad_norm": 0.6564669013023376, + "learning_rate": 4.977538185085355e-05, + "loss": 1.108136773109436, + "step": 278 + }, + { + "epoch": 0.12081984897518878, + "grad_norm": 0.650074303150177, + "learning_rate": 5.013477088948787e-05, + "loss": 0.9700815677642822, + "step": 280 + }, + { + "epoch": 0.12168284789644013, + "grad_norm": 0.5772947072982788, + "learning_rate": 5.04941599281222e-05, + "loss": 1.038031816482544, + "step": 282 + }, + { + "epoch": 0.12254584681769148, + "grad_norm": 0.7293002009391785, + "learning_rate": 5.0853548966756516e-05, + "loss": 1.1063730716705322, + "step": 284 + }, + { + "epoch": 0.12340884573894283, + "grad_norm": 0.7937333583831787, + "learning_rate": 5.1212938005390846e-05, + "loss": 1.128495693206787, + "step": 286 + }, + { + "epoch": 0.12427184466019417, + "grad_norm": 0.48499324917793274, + "learning_rate": 5.157232704402516e-05, + "loss": 0.9438712000846863, + "step": 288 + }, + { + "epoch": 0.12513484358144553, + "grad_norm": 0.6010656952857971, + "learning_rate": 5.193171608265948e-05, + "loss": 1.0872881412506104, + "step": 290 + }, + { + "epoch": 0.12599784250269688, + "grad_norm": 0.6240811944007874, + "learning_rate": 5.22911051212938e-05, + "loss": 1.110992193222046, + "step": 292 + }, + { + "epoch": 0.1268608414239482, + "grad_norm": 0.7172768712043762, + "learning_rate": 5.265049415992812e-05, + "loss": 1.1109752655029297, + "step": 294 + }, + { + "epoch": 0.12772384034519957, + "grad_norm": 0.6442400217056274, + "learning_rate": 5.300988319856245e-05, + "loss": 1.05553138256073, + "step": 296 + }, + { + "epoch": 0.12858683926645093, + "grad_norm": 0.7074702382087708, + "learning_rate": 5.3369272237196765e-05, + "loss": 1.0717648267745972, + "step": 298 + }, + { + "epoch": 0.12944983818770225, + "grad_norm": 0.5277591347694397, + "learning_rate": 5.3728661275831095e-05, + "loss": 0.9777541756629944, + "step": 300 + }, + { + "epoch": 0.12944983818770225, + "eval_loss": 1.0977506637573242, + "eval_runtime": 662.1728, + "eval_samples_per_second": 3.111, + "eval_steps_per_second": 3.111, + "step": 300 + }, + { + "epoch": 0.1303128371089536, + "grad_norm": 0.7252246737480164, + "learning_rate": 5.408805031446541e-05, + "loss": 1.075905203819275, + "step": 302 + }, + { + "epoch": 0.13117583603020497, + "grad_norm": 0.7003294229507446, + "learning_rate": 5.444743935309974e-05, + "loss": 1.1117515563964844, + "step": 304 + }, + { + "epoch": 0.13203883495145632, + "grad_norm": 0.5878211259841919, + "learning_rate": 5.480682839173406e-05, + "loss": 1.0289191007614136, + "step": 306 + }, + { + "epoch": 0.13290183387270765, + "grad_norm": 0.7133644223213196, + "learning_rate": 5.5166217430368374e-05, + "loss": 1.0199183225631714, + "step": 308 + }, + { + "epoch": 0.133764832793959, + "grad_norm": 0.6098423600196838, + "learning_rate": 5.55256064690027e-05, + "loss": 1.0132375955581665, + "step": 310 + }, + { + "epoch": 0.13462783171521037, + "grad_norm": 0.6386916041374207, + "learning_rate": 5.5884995507637014e-05, + "loss": 1.1595754623413086, + "step": 312 + }, + { + "epoch": 0.1354908306364617, + "grad_norm": 0.6563469767570496, + "learning_rate": 5.6244384546271344e-05, + "loss": 1.0921307802200317, + "step": 314 + }, + { + "epoch": 0.13635382955771305, + "grad_norm": 0.6388015747070312, + "learning_rate": 5.660377358490566e-05, + "loss": 1.0200815200805664, + "step": 316 + }, + { + "epoch": 0.1372168284789644, + "grad_norm": 0.6026274561882019, + "learning_rate": 5.696316262353999e-05, + "loss": 0.9339485764503479, + "step": 318 + }, + { + "epoch": 0.13807982740021574, + "grad_norm": 0.619800865650177, + "learning_rate": 5.732255166217431e-05, + "loss": 1.0268478393554688, + "step": 320 + }, + { + "epoch": 0.1389428263214671, + "grad_norm": 0.5924715399742126, + "learning_rate": 5.768194070080862e-05, + "loss": 1.1394236087799072, + "step": 322 + }, + { + "epoch": 0.13980582524271845, + "grad_norm": 0.6829012036323547, + "learning_rate": 5.804132973944295e-05, + "loss": 1.002437949180603, + "step": 324 + }, + { + "epoch": 0.1406688241639698, + "grad_norm": 0.7012544274330139, + "learning_rate": 5.840071877807727e-05, + "loss": 1.132503628730774, + "step": 326 + }, + { + "epoch": 0.14153182308522114, + "grad_norm": 0.7921599745750427, + "learning_rate": 5.876010781671159e-05, + "loss": 1.1859129667282104, + "step": 328 + }, + { + "epoch": 0.1423948220064725, + "grad_norm": 0.6373353004455566, + "learning_rate": 5.9119496855345916e-05, + "loss": 1.0896776914596558, + "step": 330 + }, + { + "epoch": 0.14325782092772385, + "grad_norm": 0.6174030900001526, + "learning_rate": 5.947888589398024e-05, + "loss": 1.0691723823547363, + "step": 332 + }, + { + "epoch": 0.14412081984897518, + "grad_norm": 0.5110617280006409, + "learning_rate": 5.9838274932614556e-05, + "loss": 1.0144777297973633, + "step": 334 + }, + { + "epoch": 0.14498381877022654, + "grad_norm": 0.5580511093139648, + "learning_rate": 6.019766397124887e-05, + "loss": 0.9955101609230042, + "step": 336 + }, + { + "epoch": 0.1458468176914779, + "grad_norm": 0.6427345275878906, + "learning_rate": 6.05570530098832e-05, + "loss": 0.9863013625144958, + "step": 338 + }, + { + "epoch": 0.14670981661272922, + "grad_norm": 0.7464537024497986, + "learning_rate": 6.091644204851752e-05, + "loss": 1.0682255029678345, + "step": 340 + }, + { + "epoch": 0.14757281553398058, + "grad_norm": 0.599926769733429, + "learning_rate": 6.127583108715184e-05, + "loss": 1.034083366394043, + "step": 342 + }, + { + "epoch": 0.14843581445523193, + "grad_norm": 0.6320257186889648, + "learning_rate": 6.163522012578616e-05, + "loss": 1.0776089429855347, + "step": 344 + }, + { + "epoch": 0.1492988133764833, + "grad_norm": 0.6565091013908386, + "learning_rate": 6.199460916442049e-05, + "loss": 1.0493087768554688, + "step": 346 + }, + { + "epoch": 0.15016181229773462, + "grad_norm": 0.6512171626091003, + "learning_rate": 6.23539982030548e-05, + "loss": 1.0469218492507935, + "step": 348 + }, + { + "epoch": 0.15102481121898598, + "grad_norm": 0.8487282991409302, + "learning_rate": 6.271338724168913e-05, + "loss": 1.0985081195831299, + "step": 350 + }, + { + "epoch": 0.15188781014023733, + "grad_norm": 0.6718961596488953, + "learning_rate": 6.307277628032345e-05, + "loss": 1.0714176893234253, + "step": 352 + }, + { + "epoch": 0.15275080906148866, + "grad_norm": 0.8175088167190552, + "learning_rate": 6.343216531895777e-05, + "loss": 1.0599322319030762, + "step": 354 + }, + { + "epoch": 0.15361380798274002, + "grad_norm": 0.6359215378761292, + "learning_rate": 6.37915543575921e-05, + "loss": 0.9268131256103516, + "step": 356 + }, + { + "epoch": 0.15447680690399138, + "grad_norm": 0.6423866748809814, + "learning_rate": 6.415094339622641e-05, + "loss": 0.9838354587554932, + "step": 358 + }, + { + "epoch": 0.1553398058252427, + "grad_norm": 0.6496716737747192, + "learning_rate": 6.451033243486074e-05, + "loss": 1.048566460609436, + "step": 360 + }, + { + "epoch": 0.15620280474649406, + "grad_norm": 0.6536920666694641, + "learning_rate": 6.486972147349506e-05, + "loss": 1.0910537242889404, + "step": 362 + }, + { + "epoch": 0.15706580366774542, + "grad_norm": 0.5832068920135498, + "learning_rate": 6.522911051212939e-05, + "loss": 0.9971448183059692, + "step": 364 + }, + { + "epoch": 0.15792880258899678, + "grad_norm": 0.6647719144821167, + "learning_rate": 6.558849955076371e-05, + "loss": 1.0496708154678345, + "step": 366 + }, + { + "epoch": 0.1587918015102481, + "grad_norm": 0.623252809047699, + "learning_rate": 6.594788858939802e-05, + "loss": 0.955894410610199, + "step": 368 + }, + { + "epoch": 0.15965480043149946, + "grad_norm": 0.6311860084533691, + "learning_rate": 6.630727762803235e-05, + "loss": 1.1304032802581787, + "step": 370 + }, + { + "epoch": 0.16051779935275082, + "grad_norm": 0.5306481122970581, + "learning_rate": 6.666666666666667e-05, + "loss": 0.8746405243873596, + "step": 372 + }, + { + "epoch": 0.16138079827400215, + "grad_norm": 0.6249631643295288, + "learning_rate": 6.7026055705301e-05, + "loss": 0.9104986786842346, + "step": 374 + }, + { + "epoch": 0.1622437971952535, + "grad_norm": 0.6243219971656799, + "learning_rate": 6.738544474393532e-05, + "loss": 1.043666124343872, + "step": 376 + }, + { + "epoch": 0.16310679611650486, + "grad_norm": 0.6833282113075256, + "learning_rate": 6.774483378256963e-05, + "loss": 1.0504906177520752, + "step": 378 + }, + { + "epoch": 0.16396979503775622, + "grad_norm": 0.7124452590942383, + "learning_rate": 6.810422282120395e-05, + "loss": 1.0608166456222534, + "step": 380 + }, + { + "epoch": 0.16483279395900755, + "grad_norm": 0.7520908117294312, + "learning_rate": 6.846361185983828e-05, + "loss": 1.1653732061386108, + "step": 382 + }, + { + "epoch": 0.1656957928802589, + "grad_norm": 0.7121814489364624, + "learning_rate": 6.88230008984726e-05, + "loss": 1.0626367330551147, + "step": 384 + }, + { + "epoch": 0.16655879180151026, + "grad_norm": 0.6825008988380432, + "learning_rate": 6.918238993710691e-05, + "loss": 1.012121319770813, + "step": 386 + }, + { + "epoch": 0.1674217907227616, + "grad_norm": 0.4922940135002136, + "learning_rate": 6.954177897574124e-05, + "loss": 1.0576211214065552, + "step": 388 + }, + { + "epoch": 0.16828478964401294, + "grad_norm": 0.6122089624404907, + "learning_rate": 6.990116801437556e-05, + "loss": 1.03916597366333, + "step": 390 + }, + { + "epoch": 0.1691477885652643, + "grad_norm": 0.6348981261253357, + "learning_rate": 7.026055705300989e-05, + "loss": 1.17647123336792, + "step": 392 + }, + { + "epoch": 0.17001078748651563, + "grad_norm": 0.6205878257751465, + "learning_rate": 7.06199460916442e-05, + "loss": 0.9095983505249023, + "step": 394 + }, + { + "epoch": 0.170873786407767, + "grad_norm": 0.61506187915802, + "learning_rate": 7.097933513027853e-05, + "loss": 1.082506775856018, + "step": 396 + }, + { + "epoch": 0.17173678532901834, + "grad_norm": 0.6481751799583435, + "learning_rate": 7.133872416891285e-05, + "loss": 1.0716280937194824, + "step": 398 + }, + { + "epoch": 0.1725997842502697, + "grad_norm": 0.4871014952659607, + "learning_rate": 7.169811320754717e-05, + "loss": 0.9616814851760864, + "step": 400 + }, + { + "epoch": 0.1725997842502697, + "eval_loss": 1.0649415254592896, + "eval_runtime": 668.6025, + "eval_samples_per_second": 3.081, + "eval_steps_per_second": 3.081, + "step": 400 + }, + { + "epoch": 0.17346278317152103, + "grad_norm": 0.5680040121078491, + "learning_rate": 7.20575022461815e-05, + "loss": 1.0475050210952759, + "step": 402 + }, + { + "epoch": 0.17432578209277239, + "grad_norm": 0.6417813897132874, + "learning_rate": 7.241689128481581e-05, + "loss": 0.9851161241531372, + "step": 404 + }, + { + "epoch": 0.17518878101402374, + "grad_norm": 0.6600468158721924, + "learning_rate": 7.277628032345014e-05, + "loss": 1.013339638710022, + "step": 406 + }, + { + "epoch": 0.17605177993527507, + "grad_norm": 0.6733932495117188, + "learning_rate": 7.313566936208446e-05, + "loss": 0.9346804022789001, + "step": 408 + }, + { + "epoch": 0.17691477885652643, + "grad_norm": 0.6812151074409485, + "learning_rate": 7.349505840071879e-05, + "loss": 0.9890368580818176, + "step": 410 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.6380394697189331, + "learning_rate": 7.385444743935311e-05, + "loss": 0.8787848949432373, + "step": 412 + }, + { + "epoch": 0.1786407766990291, + "grad_norm": 0.6004905700683594, + "learning_rate": 7.421383647798742e-05, + "loss": 1.0235728025436401, + "step": 414 + }, + { + "epoch": 0.17950377562028047, + "grad_norm": 0.6569193005561829, + "learning_rate": 7.457322551662175e-05, + "loss": 0.9972385168075562, + "step": 416 + }, + { + "epoch": 0.18036677454153183, + "grad_norm": 0.6761631369590759, + "learning_rate": 7.493261455525607e-05, + "loss": 0.9593698382377625, + "step": 418 + }, + { + "epoch": 0.18122977346278318, + "grad_norm": 0.7328561544418335, + "learning_rate": 7.529200359389039e-05, + "loss": 1.0426853895187378, + "step": 420 + }, + { + "epoch": 0.1820927723840345, + "grad_norm": 0.6256070137023926, + "learning_rate": 7.56513926325247e-05, + "loss": 0.9608182311058044, + "step": 422 + }, + { + "epoch": 0.18295577130528587, + "grad_norm": 1.2549844980239868, + "learning_rate": 7.601078167115903e-05, + "loss": 1.0162668228149414, + "step": 424 + }, + { + "epoch": 0.18381877022653723, + "grad_norm": 0.6751510500907898, + "learning_rate": 7.637017070979335e-05, + "loss": 1.130725383758545, + "step": 426 + }, + { + "epoch": 0.18468176914778855, + "grad_norm": 0.7029808163642883, + "learning_rate": 7.672955974842768e-05, + "loss": 1.0384817123413086, + "step": 428 + }, + { + "epoch": 0.1855447680690399, + "grad_norm": 0.644353449344635, + "learning_rate": 7.7088948787062e-05, + "loss": 1.017020344734192, + "step": 430 + }, + { + "epoch": 0.18640776699029127, + "grad_norm": 0.6784916520118713, + "learning_rate": 7.744833782569631e-05, + "loss": 1.005354404449463, + "step": 432 + }, + { + "epoch": 0.1872707659115426, + "grad_norm": 0.5989449620246887, + "learning_rate": 7.780772686433064e-05, + "loss": 1.026848316192627, + "step": 434 + }, + { + "epoch": 0.18813376483279395, + "grad_norm": 0.6502639651298523, + "learning_rate": 7.816711590296496e-05, + "loss": 0.9891080856323242, + "step": 436 + }, + { + "epoch": 0.1889967637540453, + "grad_norm": 0.6176205277442932, + "learning_rate": 7.852650494159929e-05, + "loss": 0.966316819190979, + "step": 438 + }, + { + "epoch": 0.18985976267529667, + "grad_norm": 0.6801626086235046, + "learning_rate": 7.88858939802336e-05, + "loss": 1.123063087463379, + "step": 440 + }, + { + "epoch": 0.190722761596548, + "grad_norm": 0.6718618273735046, + "learning_rate": 7.924528301886794e-05, + "loss": 1.0467073917388916, + "step": 442 + }, + { + "epoch": 0.19158576051779935, + "grad_norm": 0.6761009097099304, + "learning_rate": 7.960467205750225e-05, + "loss": 1.0952889919281006, + "step": 444 + }, + { + "epoch": 0.1924487594390507, + "grad_norm": 0.6356327533721924, + "learning_rate": 7.996406109613657e-05, + "loss": 0.954807698726654, + "step": 446 + }, + { + "epoch": 0.19331175836030204, + "grad_norm": 0.6798669695854187, + "learning_rate": 8.03234501347709e-05, + "loss": 0.9941422343254089, + "step": 448 + }, + { + "epoch": 0.1941747572815534, + "grad_norm": 0.6511302590370178, + "learning_rate": 8.068283917340521e-05, + "loss": 1.0351495742797852, + "step": 450 + }, + { + "epoch": 0.19503775620280475, + "grad_norm": 0.6061258912086487, + "learning_rate": 8.104222821203954e-05, + "loss": 1.00546133518219, + "step": 452 + }, + { + "epoch": 0.1959007551240561, + "grad_norm": 0.6278533935546875, + "learning_rate": 8.140161725067386e-05, + "loss": 1.0778460502624512, + "step": 454 + }, + { + "epoch": 0.19676375404530744, + "grad_norm": 0.6866298317909241, + "learning_rate": 8.176100628930818e-05, + "loss": 1.0344486236572266, + "step": 456 + }, + { + "epoch": 0.1976267529665588, + "grad_norm": 0.7338075041770935, + "learning_rate": 8.212039532794251e-05, + "loss": 1.0663033723831177, + "step": 458 + }, + { + "epoch": 0.19848975188781015, + "grad_norm": 0.6811459064483643, + "learning_rate": 8.247978436657682e-05, + "loss": 0.9665339589118958, + "step": 460 + }, + { + "epoch": 0.19935275080906148, + "grad_norm": 0.6779627799987793, + "learning_rate": 8.283917340521114e-05, + "loss": 1.024712324142456, + "step": 462 + }, + { + "epoch": 0.20021574973031284, + "grad_norm": 0.6486892700195312, + "learning_rate": 8.319856244384546e-05, + "loss": 0.9699305295944214, + "step": 464 + }, + { + "epoch": 0.2010787486515642, + "grad_norm": 0.7022278308868408, + "learning_rate": 8.355795148247979e-05, + "loss": 0.9540432095527649, + "step": 466 + }, + { + "epoch": 0.20194174757281552, + "grad_norm": 0.5922990441322327, + "learning_rate": 8.39173405211141e-05, + "loss": 0.9253339767456055, + "step": 468 + }, + { + "epoch": 0.20280474649406688, + "grad_norm": 0.7076792120933533, + "learning_rate": 8.427672955974843e-05, + "loss": 0.9987741112709045, + "step": 470 + }, + { + "epoch": 0.20366774541531824, + "grad_norm": 0.6491380333900452, + "learning_rate": 8.463611859838275e-05, + "loss": 1.0249329805374146, + "step": 472 + }, + { + "epoch": 0.2045307443365696, + "grad_norm": 0.6784211993217468, + "learning_rate": 8.499550763701708e-05, + "loss": 1.0577133893966675, + "step": 474 + }, + { + "epoch": 0.20539374325782092, + "grad_norm": 0.6453303694725037, + "learning_rate": 8.53548966756514e-05, + "loss": 1.1312458515167236, + "step": 476 + }, + { + "epoch": 0.20625674217907228, + "grad_norm": 0.7431377172470093, + "learning_rate": 8.571428571428571e-05, + "loss": 1.0592451095581055, + "step": 478 + }, + { + "epoch": 0.20711974110032363, + "grad_norm": 0.6097649931907654, + "learning_rate": 8.607367475292004e-05, + "loss": 0.9337235689163208, + "step": 480 + }, + { + "epoch": 0.20798274002157496, + "grad_norm": 0.5693124532699585, + "learning_rate": 8.643306379155436e-05, + "loss": 0.9088928699493408, + "step": 482 + }, + { + "epoch": 0.20884573894282632, + "grad_norm": 0.7377229332923889, + "learning_rate": 8.679245283018869e-05, + "loss": 1.0729358196258545, + "step": 484 + }, + { + "epoch": 0.20970873786407768, + "grad_norm": 0.7399470210075378, + "learning_rate": 8.7151841868823e-05, + "loss": 1.0428457260131836, + "step": 486 + }, + { + "epoch": 0.210571736785329, + "grad_norm": 0.677052915096283, + "learning_rate": 8.751123090745734e-05, + "loss": 0.9940266013145447, + "step": 488 + }, + { + "epoch": 0.21143473570658036, + "grad_norm": 0.7126721739768982, + "learning_rate": 8.787061994609165e-05, + "loss": 1.011808156967163, + "step": 490 + }, + { + "epoch": 0.21229773462783172, + "grad_norm": 0.6663792729377747, + "learning_rate": 8.823000898472597e-05, + "loss": 1.0054185390472412, + "step": 492 + }, + { + "epoch": 0.21316073354908308, + "grad_norm": 0.6661092042922974, + "learning_rate": 8.85893980233603e-05, + "loss": 1.0167138576507568, + "step": 494 + }, + { + "epoch": 0.2140237324703344, + "grad_norm": 0.6975740194320679, + "learning_rate": 8.894878706199461e-05, + "loss": 1.1470818519592285, + "step": 496 + }, + { + "epoch": 0.21488673139158576, + "grad_norm": 0.6594390869140625, + "learning_rate": 8.930817610062893e-05, + "loss": 0.9619631171226501, + "step": 498 + }, + { + "epoch": 0.21574973031283712, + "grad_norm": 0.7216679453849792, + "learning_rate": 8.966756513926325e-05, + "loss": 0.9971368312835693, + "step": 500 + }, + { + "epoch": 0.21574973031283712, + "eval_loss": 1.0417571067810059, + "eval_runtime": 659.3112, + "eval_samples_per_second": 3.124, + "eval_steps_per_second": 3.124, + "step": 500 + }, + { + "epoch": 0.21661272923408845, + "grad_norm": 0.6188210844993591, + "learning_rate": 9.002695417789758e-05, + "loss": 1.0307213068008423, + "step": 502 + }, + { + "epoch": 0.2174757281553398, + "grad_norm": 0.6716445088386536, + "learning_rate": 9.03863432165319e-05, + "loss": 1.0188794136047363, + "step": 504 + }, + { + "epoch": 0.21833872707659116, + "grad_norm": 0.6790863275527954, + "learning_rate": 9.074573225516622e-05, + "loss": 0.9764845967292786, + "step": 506 + }, + { + "epoch": 0.2192017259978425, + "grad_norm": 0.6764960289001465, + "learning_rate": 9.110512129380054e-05, + "loss": 0.948829174041748, + "step": 508 + }, + { + "epoch": 0.22006472491909385, + "grad_norm": 0.6210965514183044, + "learning_rate": 9.146451033243486e-05, + "loss": 1.008013129234314, + "step": 510 + }, + { + "epoch": 0.2209277238403452, + "grad_norm": 0.7739297747612, + "learning_rate": 9.182389937106919e-05, + "loss": 1.1662557125091553, + "step": 512 + }, + { + "epoch": 0.22179072276159656, + "grad_norm": 0.7055562138557434, + "learning_rate": 9.21832884097035e-05, + "loss": 1.0325161218643188, + "step": 514 + }, + { + "epoch": 0.2226537216828479, + "grad_norm": 0.6079210042953491, + "learning_rate": 9.254267744833783e-05, + "loss": 1.00056791305542, + "step": 516 + }, + { + "epoch": 0.22351672060409924, + "grad_norm": 0.5974318981170654, + "learning_rate": 9.290206648697215e-05, + "loss": 0.9422364234924316, + "step": 518 + }, + { + "epoch": 0.2243797195253506, + "grad_norm": 0.5963430404663086, + "learning_rate": 9.326145552560648e-05, + "loss": 0.936336100101471, + "step": 520 + }, + { + "epoch": 0.22524271844660193, + "grad_norm": 0.6823658347129822, + "learning_rate": 9.36208445642408e-05, + "loss": 1.0538607835769653, + "step": 522 + }, + { + "epoch": 0.2261057173678533, + "grad_norm": 0.6409855484962463, + "learning_rate": 9.398023360287511e-05, + "loss": 1.0483653545379639, + "step": 524 + }, + { + "epoch": 0.22696871628910464, + "grad_norm": 0.6867254376411438, + "learning_rate": 9.433962264150944e-05, + "loss": 0.9668049812316895, + "step": 526 + }, + { + "epoch": 0.227831715210356, + "grad_norm": 0.5690792798995972, + "learning_rate": 9.469901168014376e-05, + "loss": 1.008763313293457, + "step": 528 + }, + { + "epoch": 0.22869471413160733, + "grad_norm": 0.5964897274971008, + "learning_rate": 9.505840071877809e-05, + "loss": 1.0816441774368286, + "step": 530 + }, + { + "epoch": 0.2295577130528587, + "grad_norm": 0.627419114112854, + "learning_rate": 9.54177897574124e-05, + "loss": 0.9265700578689575, + "step": 532 + }, + { + "epoch": 0.23042071197411004, + "grad_norm": 0.5862151980400085, + "learning_rate": 9.577717879604674e-05, + "loss": 0.9804646372795105, + "step": 534 + }, + { + "epoch": 0.23128371089536137, + "grad_norm": 0.5573718547821045, + "learning_rate": 9.613656783468105e-05, + "loss": 0.9627988934516907, + "step": 536 + }, + { + "epoch": 0.23214670981661273, + "grad_norm": 0.6705166101455688, + "learning_rate": 9.649595687331537e-05, + "loss": 1.0012824535369873, + "step": 538 + }, + { + "epoch": 0.23300970873786409, + "grad_norm": 0.6251236796379089, + "learning_rate": 9.685534591194969e-05, + "loss": 0.9568162560462952, + "step": 540 + }, + { + "epoch": 0.23387270765911541, + "grad_norm": 0.6466493010520935, + "learning_rate": 9.7214734950584e-05, + "loss": 1.031549334526062, + "step": 542 + }, + { + "epoch": 0.23473570658036677, + "grad_norm": 0.5183866024017334, + "learning_rate": 9.757412398921833e-05, + "loss": 0.8603643774986267, + "step": 544 + }, + { + "epoch": 0.23559870550161813, + "grad_norm": 0.6725775599479675, + "learning_rate": 9.793351302785265e-05, + "loss": 1.0365077257156372, + "step": 546 + }, + { + "epoch": 0.23646170442286948, + "grad_norm": 0.5972357988357544, + "learning_rate": 9.829290206648698e-05, + "loss": 0.9304701089859009, + "step": 548 + }, + { + "epoch": 0.2373247033441208, + "grad_norm": 0.5319957733154297, + "learning_rate": 9.86522911051213e-05, + "loss": 0.9575805068016052, + "step": 550 + }, + { + "epoch": 0.23818770226537217, + "grad_norm": 0.6502835750579834, + "learning_rate": 9.901168014375562e-05, + "loss": 1.0307214260101318, + "step": 552 + }, + { + "epoch": 0.23905070118662353, + "grad_norm": 0.6734047532081604, + "learning_rate": 9.937106918238994e-05, + "loss": 1.05185067653656, + "step": 554 + }, + { + "epoch": 0.23991370010787486, + "grad_norm": 0.5667978525161743, + "learning_rate": 9.973045822102426e-05, + "loss": 1.0190176963806152, + "step": 556 + }, + { + "epoch": 0.2407766990291262, + "grad_norm": 0.6370418667793274, + "learning_rate": 0.00010008984725965857, + "loss": 1.076182246208191, + "step": 558 + }, + { + "epoch": 0.24163969795037757, + "grad_norm": 0.689719021320343, + "learning_rate": 0.0001004492362982929, + "loss": 1.0408724546432495, + "step": 560 + }, + { + "epoch": 0.2425026968716289, + "grad_norm": 0.6304254531860352, + "learning_rate": 0.00010080862533692723, + "loss": 0.9869902729988098, + "step": 562 + }, + { + "epoch": 0.24336569579288025, + "grad_norm": 0.6797420382499695, + "learning_rate": 0.00010116801437556156, + "loss": 1.0198370218276978, + "step": 564 + }, + { + "epoch": 0.2442286947141316, + "grad_norm": 0.5993657112121582, + "learning_rate": 0.00010152740341419587, + "loss": 0.9947441816329956, + "step": 566 + }, + { + "epoch": 0.24509169363538297, + "grad_norm": 0.6369836330413818, + "learning_rate": 0.0001018867924528302, + "loss": 0.9722896814346313, + "step": 568 + }, + { + "epoch": 0.2459546925566343, + "grad_norm": 0.6942457556724548, + "learning_rate": 0.00010224618149146453, + "loss": 0.9716570973396301, + "step": 570 + }, + { + "epoch": 0.24681769147788565, + "grad_norm": 0.5403370261192322, + "learning_rate": 0.00010260557053009883, + "loss": 0.9797524213790894, + "step": 572 + }, + { + "epoch": 0.247680690399137, + "grad_norm": 0.5207529067993164, + "learning_rate": 0.00010296495956873316, + "loss": 0.985367476940155, + "step": 574 + }, + { + "epoch": 0.24854368932038834, + "grad_norm": 0.6751103401184082, + "learning_rate": 0.00010332434860736748, + "loss": 1.075042724609375, + "step": 576 + }, + { + "epoch": 0.2494066882416397, + "grad_norm": 0.565331220626831, + "learning_rate": 0.0001036837376460018, + "loss": 0.9273878335952759, + "step": 578 + }, + { + "epoch": 0.25026968716289105, + "grad_norm": 0.6858948469161987, + "learning_rate": 0.00010404312668463612, + "loss": 0.9872279763221741, + "step": 580 + }, + { + "epoch": 0.2511326860841424, + "grad_norm": 0.7091426253318787, + "learning_rate": 0.00010440251572327044, + "loss": 1.0038671493530273, + "step": 582 + }, + { + "epoch": 0.25199568500539377, + "grad_norm": 0.6493771076202393, + "learning_rate": 0.00010476190476190477, + "loss": 1.0109868049621582, + "step": 584 + }, + { + "epoch": 0.25285868392664507, + "grad_norm": 0.6107586622238159, + "learning_rate": 0.00010512129380053907, + "loss": 1.0020402669906616, + "step": 586 + }, + { + "epoch": 0.2537216828478964, + "grad_norm": 0.6878048181533813, + "learning_rate": 0.0001054806828391734, + "loss": 0.961039662361145, + "step": 588 + }, + { + "epoch": 0.2545846817691478, + "grad_norm": 0.664034903049469, + "learning_rate": 0.00010584007187780773, + "loss": 0.9725209474563599, + "step": 590 + }, + { + "epoch": 0.25544768069039914, + "grad_norm": 0.6399680376052856, + "learning_rate": 0.00010619946091644206, + "loss": 0.9907437562942505, + "step": 592 + }, + { + "epoch": 0.2563106796116505, + "grad_norm": 0.6163286566734314, + "learning_rate": 0.00010655884995507636, + "loss": 0.9650095701217651, + "step": 594 + }, + { + "epoch": 0.25717367853290185, + "grad_norm": 0.6008322238922119, + "learning_rate": 0.0001069182389937107, + "loss": 1.0102758407592773, + "step": 596 + }, + { + "epoch": 0.2580366774541532, + "grad_norm": 0.6752071380615234, + "learning_rate": 0.00010727762803234502, + "loss": 0.9101885557174683, + "step": 598 + }, + { + "epoch": 0.2588996763754045, + "grad_norm": 0.6789175868034363, + "learning_rate": 0.00010763701707097935, + "loss": 1.0461398363113403, + "step": 600 + }, + { + "epoch": 0.2588996763754045, + "eval_loss": 1.021111011505127, + "eval_runtime": 648.1611, + "eval_samples_per_second": 3.178, + "eval_steps_per_second": 3.178, + "step": 600 + }, + { + "epoch": 0.25976267529665586, + "grad_norm": 0.5660730600357056, + "learning_rate": 0.00010799640610961366, + "loss": 0.9582418203353882, + "step": 602 + }, + { + "epoch": 0.2606256742179072, + "grad_norm": 0.6726544499397278, + "learning_rate": 0.00010835579514824799, + "loss": 1.0763746500015259, + "step": 604 + }, + { + "epoch": 0.2614886731391586, + "grad_norm": 0.6068508625030518, + "learning_rate": 0.00010871518418688232, + "loss": 1.0432032346725464, + "step": 606 + }, + { + "epoch": 0.26235167206040994, + "grad_norm": 0.5731637477874756, + "learning_rate": 0.00010907457322551662, + "loss": 0.9830516576766968, + "step": 608 + }, + { + "epoch": 0.2632146709816613, + "grad_norm": 0.6777567267417908, + "learning_rate": 0.00010943396226415095, + "loss": 1.0442042350769043, + "step": 610 + }, + { + "epoch": 0.26407766990291265, + "grad_norm": 0.6372506618499756, + "learning_rate": 0.00010979335130278528, + "loss": 1.0001944303512573, + "step": 612 + }, + { + "epoch": 0.26494066882416395, + "grad_norm": 0.6606221795082092, + "learning_rate": 0.0001101527403414196, + "loss": 1.035884141921997, + "step": 614 + }, + { + "epoch": 0.2658036677454153, + "grad_norm": 0.6083229780197144, + "learning_rate": 0.00011051212938005391, + "loss": 0.9403397440910339, + "step": 616 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.6318517923355103, + "learning_rate": 0.00011087151841868823, + "loss": 0.9274454116821289, + "step": 618 + }, + { + "epoch": 0.267529665587918, + "grad_norm": 0.628190279006958, + "learning_rate": 0.00011123090745732256, + "loss": 0.9883754253387451, + "step": 620 + }, + { + "epoch": 0.2683926645091694, + "grad_norm": 0.5961961150169373, + "learning_rate": 0.00011159029649595688, + "loss": 1.0317535400390625, + "step": 622 + }, + { + "epoch": 0.26925566343042073, + "grad_norm": 0.5995341539382935, + "learning_rate": 0.00011194968553459119, + "loss": 0.9776126742362976, + "step": 624 + }, + { + "epoch": 0.27011866235167203, + "grad_norm": 0.6639334559440613, + "learning_rate": 0.00011230907457322552, + "loss": 1.0112378597259521, + "step": 626 + }, + { + "epoch": 0.2709816612729234, + "grad_norm": 0.6348621249198914, + "learning_rate": 0.00011266846361185985, + "loss": 1.0553804636001587, + "step": 628 + }, + { + "epoch": 0.27184466019417475, + "grad_norm": 0.5929805040359497, + "learning_rate": 0.00011302785265049416, + "loss": 1.025888442993164, + "step": 630 + }, + { + "epoch": 0.2727076591154261, + "grad_norm": 0.6052366495132446, + "learning_rate": 0.00011338724168912849, + "loss": 1.02956223487854, + "step": 632 + }, + { + "epoch": 0.27357065803667746, + "grad_norm": 0.6494882106781006, + "learning_rate": 0.00011374663072776282, + "loss": 0.989752471446991, + "step": 634 + }, + { + "epoch": 0.2744336569579288, + "grad_norm": 0.6005767583847046, + "learning_rate": 0.00011410601976639712, + "loss": 1.0030683279037476, + "step": 636 + }, + { + "epoch": 0.2752966558791802, + "grad_norm": 0.6478356122970581, + "learning_rate": 0.00011446540880503145, + "loss": 1.002000093460083, + "step": 638 + }, + { + "epoch": 0.2761596548004315, + "grad_norm": 0.5804725289344788, + "learning_rate": 0.00011482479784366578, + "loss": 0.9807654023170471, + "step": 640 + }, + { + "epoch": 0.27702265372168283, + "grad_norm": 0.632530689239502, + "learning_rate": 0.00011518418688230011, + "loss": 0.9841892123222351, + "step": 642 + }, + { + "epoch": 0.2778856526429342, + "grad_norm": 0.5340113639831543, + "learning_rate": 0.00011554357592093441, + "loss": 0.8902478814125061, + "step": 644 + }, + { + "epoch": 0.27874865156418555, + "grad_norm": 0.5901665091514587, + "learning_rate": 0.00011590296495956874, + "loss": 0.9019404649734497, + "step": 646 + }, + { + "epoch": 0.2796116504854369, + "grad_norm": 0.666589617729187, + "learning_rate": 0.00011626235399820307, + "loss": 0.9384423494338989, + "step": 648 + }, + { + "epoch": 0.28047464940668826, + "grad_norm": 0.7000334858894348, + "learning_rate": 0.00011662174303683737, + "loss": 1.0666629076004028, + "step": 650 + }, + { + "epoch": 0.2813376483279396, + "grad_norm": 0.663663923740387, + "learning_rate": 0.0001169811320754717, + "loss": 1.000019907951355, + "step": 652 + }, + { + "epoch": 0.2822006472491909, + "grad_norm": 0.6097694039344788, + "learning_rate": 0.00011734052111410603, + "loss": 0.9450293183326721, + "step": 654 + }, + { + "epoch": 0.2830636461704423, + "grad_norm": 0.6130967140197754, + "learning_rate": 0.00011769991015274035, + "loss": 0.9480894207954407, + "step": 656 + }, + { + "epoch": 0.28392664509169363, + "grad_norm": 0.7091249227523804, + "learning_rate": 0.00011805929919137467, + "loss": 1.1377143859863281, + "step": 658 + }, + { + "epoch": 0.284789644012945, + "grad_norm": 0.6556766629219055, + "learning_rate": 0.00011841868823000898, + "loss": 0.9421243667602539, + "step": 660 + }, + { + "epoch": 0.28565264293419634, + "grad_norm": 0.6682968735694885, + "learning_rate": 0.00011877807726864331, + "loss": 0.9726828336715698, + "step": 662 + }, + { + "epoch": 0.2865156418554477, + "grad_norm": 0.5224708914756775, + "learning_rate": 0.00011913746630727762, + "loss": 0.8996511697769165, + "step": 664 + }, + { + "epoch": 0.287378640776699, + "grad_norm": 0.5914195775985718, + "learning_rate": 0.00011949685534591195, + "loss": 0.9679517149925232, + "step": 666 + }, + { + "epoch": 0.28824163969795036, + "grad_norm": 0.6175519824028015, + "learning_rate": 0.00011985624438454628, + "loss": 0.8743209838867188, + "step": 668 + }, + { + "epoch": 0.2891046386192017, + "grad_norm": 0.6019226312637329, + "learning_rate": 0.0001202156334231806, + "loss": 0.9741992354393005, + "step": 670 + }, + { + "epoch": 0.28996763754045307, + "grad_norm": 0.6080542206764221, + "learning_rate": 0.00012057502246181491, + "loss": 0.9516472816467285, + "step": 672 + }, + { + "epoch": 0.29083063646170443, + "grad_norm": 0.5885615944862366, + "learning_rate": 0.00012093441150044924, + "loss": 1.122761607170105, + "step": 674 + }, + { + "epoch": 0.2916936353829558, + "grad_norm": 0.6635209918022156, + "learning_rate": 0.00012129380053908357, + "loss": 1.0105189085006714, + "step": 676 + }, + { + "epoch": 0.29255663430420714, + "grad_norm": 0.5805009007453918, + "learning_rate": 0.0001216531895777179, + "loss": 0.906292200088501, + "step": 678 + }, + { + "epoch": 0.29341963322545844, + "grad_norm": 0.5980029702186584, + "learning_rate": 0.0001220125786163522, + "loss": 1.009568691253662, + "step": 680 + }, + { + "epoch": 0.2942826321467098, + "grad_norm": 0.6797705292701721, + "learning_rate": 0.00012237196765498652, + "loss": 1.0373667478561401, + "step": 682 + }, + { + "epoch": 0.29514563106796116, + "grad_norm": 0.6280547976493835, + "learning_rate": 0.00012273135669362085, + "loss": 0.9758188724517822, + "step": 684 + }, + { + "epoch": 0.2960086299892125, + "grad_norm": 0.511608898639679, + "learning_rate": 0.00012309074573225515, + "loss": 0.9111692905426025, + "step": 686 + }, + { + "epoch": 0.29687162891046387, + "grad_norm": 0.5781835317611694, + "learning_rate": 0.00012345013477088948, + "loss": 0.8865921497344971, + "step": 688 + }, + { + "epoch": 0.2977346278317152, + "grad_norm": 0.6514166593551636, + "learning_rate": 0.0001238095238095238, + "loss": 0.9768189191818237, + "step": 690 + }, + { + "epoch": 0.2985976267529666, + "grad_norm": 0.6109189987182617, + "learning_rate": 0.00012416891284815814, + "loss": 0.9991607069969177, + "step": 692 + }, + { + "epoch": 0.2994606256742179, + "grad_norm": 0.6598902344703674, + "learning_rate": 0.00012452830188679244, + "loss": 0.9548360705375671, + "step": 694 + }, + { + "epoch": 0.30032362459546924, + "grad_norm": 0.5633156895637512, + "learning_rate": 0.00012488769092542677, + "loss": 0.992988109588623, + "step": 696 + }, + { + "epoch": 0.3011866235167206, + "grad_norm": 0.6098802089691162, + "learning_rate": 0.0001252470799640611, + "loss": 0.9709890484809875, + "step": 698 + }, + { + "epoch": 0.30204962243797195, + "grad_norm": 0.6197102665901184, + "learning_rate": 0.0001256064690026954, + "loss": 1.018282175064087, + "step": 700 + }, + { + "epoch": 0.30204962243797195, + "eval_loss": 1.0030721426010132, + "eval_runtime": 655.4533, + "eval_samples_per_second": 3.143, + "eval_steps_per_second": 3.143, + "step": 700 + }, + { + "epoch": 0.3029126213592233, + "grad_norm": 0.5817480683326721, + "learning_rate": 0.00012596585804132974, + "loss": 0.9147283434867859, + "step": 702 + }, + { + "epoch": 0.30377562028047467, + "grad_norm": 0.5976696014404297, + "learning_rate": 0.00012632524707996407, + "loss": 0.9318362474441528, + "step": 704 + }, + { + "epoch": 0.304638619201726, + "grad_norm": 0.6389723420143127, + "learning_rate": 0.0001266846361185984, + "loss": 0.9500927925109863, + "step": 706 + }, + { + "epoch": 0.3055016181229773, + "grad_norm": 0.6485719084739685, + "learning_rate": 0.0001270440251572327, + "loss": 1.0271424055099487, + "step": 708 + }, + { + "epoch": 0.3063646170442287, + "grad_norm": 0.5802455544471741, + "learning_rate": 0.00012740341419586703, + "loss": 0.9781906008720398, + "step": 710 + }, + { + "epoch": 0.30722761596548004, + "grad_norm": 0.6359356641769409, + "learning_rate": 0.00012776280323450136, + "loss": 1.0195324420928955, + "step": 712 + }, + { + "epoch": 0.3080906148867314, + "grad_norm": 0.5975426435470581, + "learning_rate": 0.00012812219227313566, + "loss": 0.9250738024711609, + "step": 714 + }, + { + "epoch": 0.30895361380798275, + "grad_norm": 0.643110454082489, + "learning_rate": 0.00012848158131177, + "loss": 0.9888015985488892, + "step": 716 + }, + { + "epoch": 0.3098166127292341, + "grad_norm": 0.6043205261230469, + "learning_rate": 0.00012884097035040432, + "loss": 0.9709514379501343, + "step": 718 + }, + { + "epoch": 0.3106796116504854, + "grad_norm": 0.5687094926834106, + "learning_rate": 0.00012920035938903865, + "loss": 1.0272964239120483, + "step": 720 + }, + { + "epoch": 0.31154261057173677, + "grad_norm": 0.5688400864601135, + "learning_rate": 0.00012955974842767296, + "loss": 0.9370370507240295, + "step": 722 + }, + { + "epoch": 0.3124056094929881, + "grad_norm": 0.5610610246658325, + "learning_rate": 0.00012991913746630729, + "loss": 0.9535608291625977, + "step": 724 + }, + { + "epoch": 0.3132686084142395, + "grad_norm": 0.6338257193565369, + "learning_rate": 0.00013027852650494162, + "loss": 1.0188907384872437, + "step": 726 + }, + { + "epoch": 0.31413160733549084, + "grad_norm": 0.5365633368492126, + "learning_rate": 0.00013063791554357592, + "loss": 0.9253716468811035, + "step": 728 + }, + { + "epoch": 0.3149946062567422, + "grad_norm": 0.5599163174629211, + "learning_rate": 0.00013099730458221025, + "loss": 0.8941492438316345, + "step": 730 + }, + { + "epoch": 0.31585760517799355, + "grad_norm": 0.6059780716896057, + "learning_rate": 0.00013135669362084458, + "loss": 0.9831459522247314, + "step": 732 + }, + { + "epoch": 0.31672060409924485, + "grad_norm": 0.5596494078636169, + "learning_rate": 0.0001317160826594789, + "loss": 0.9332310557365417, + "step": 734 + }, + { + "epoch": 0.3175836030204962, + "grad_norm": 0.5618010759353638, + "learning_rate": 0.0001320754716981132, + "loss": 0.9082580208778381, + "step": 736 + }, + { + "epoch": 0.31844660194174756, + "grad_norm": 0.6412109732627869, + "learning_rate": 0.00013243486073674754, + "loss": 1.008690357208252, + "step": 738 + }, + { + "epoch": 0.3193096008629989, + "grad_norm": 0.5742355585098267, + "learning_rate": 0.00013279424977538187, + "loss": 0.9597798585891724, + "step": 740 + }, + { + "epoch": 0.3201725997842503, + "grad_norm": 0.6470226645469666, + "learning_rate": 0.00013315363881401617, + "loss": 0.989331841468811, + "step": 742 + }, + { + "epoch": 0.32103559870550163, + "grad_norm": 0.5598039031028748, + "learning_rate": 0.0001335130278526505, + "loss": 0.8677343130111694, + "step": 744 + }, + { + "epoch": 0.321898597626753, + "grad_norm": 0.5441372990608215, + "learning_rate": 0.00013387241689128483, + "loss": 0.9462730288505554, + "step": 746 + }, + { + "epoch": 0.3227615965480043, + "grad_norm": 0.5858626365661621, + "learning_rate": 0.00013423180592991916, + "loss": 0.994694173336029, + "step": 748 + }, + { + "epoch": 0.32362459546925565, + "grad_norm": 0.511372447013855, + "learning_rate": 0.00013459119496855347, + "loss": 0.9387269616127014, + "step": 750 + }, + { + "epoch": 0.324487594390507, + "grad_norm": 0.47798457741737366, + "learning_rate": 0.0001349505840071878, + "loss": 0.9473881721496582, + "step": 752 + }, + { + "epoch": 0.32535059331175836, + "grad_norm": 0.5907022953033447, + "learning_rate": 0.0001353099730458221, + "loss": 0.9375183582305908, + "step": 754 + }, + { + "epoch": 0.3262135922330097, + "grad_norm": 0.618733286857605, + "learning_rate": 0.00013566936208445643, + "loss": 1.028738260269165, + "step": 756 + }, + { + "epoch": 0.3270765911542611, + "grad_norm": 0.5234512090682983, + "learning_rate": 0.00013602875112309076, + "loss": 0.9420192241668701, + "step": 758 + }, + { + "epoch": 0.32793959007551243, + "grad_norm": 0.7036319971084595, + "learning_rate": 0.00013638814016172506, + "loss": 1.0252270698547363, + "step": 760 + }, + { + "epoch": 0.32880258899676373, + "grad_norm": 0.5543172359466553, + "learning_rate": 0.0001367475292003594, + "loss": 0.8453778028488159, + "step": 762 + }, + { + "epoch": 0.3296655879180151, + "grad_norm": 0.5438711643218994, + "learning_rate": 0.0001371069182389937, + "loss": 0.8659937977790833, + "step": 764 + }, + { + "epoch": 0.33052858683926645, + "grad_norm": 0.6390914916992188, + "learning_rate": 0.00013746630727762803, + "loss": 1.038142442703247, + "step": 766 + }, + { + "epoch": 0.3313915857605178, + "grad_norm": 0.50070720911026, + "learning_rate": 0.00013782569631626236, + "loss": 0.899932861328125, + "step": 768 + }, + { + "epoch": 0.33225458468176916, + "grad_norm": 0.5982286334037781, + "learning_rate": 0.00013818508535489669, + "loss": 0.9712884426116943, + "step": 770 + }, + { + "epoch": 0.3331175836030205, + "grad_norm": 0.6588822603225708, + "learning_rate": 0.000138544474393531, + "loss": 0.9427542686462402, + "step": 772 + }, + { + "epoch": 0.3339805825242718, + "grad_norm": 0.6022042632102966, + "learning_rate": 0.00013890386343216532, + "loss": 0.8961561918258667, + "step": 774 + }, + { + "epoch": 0.3348435814455232, + "grad_norm": 0.6595642566680908, + "learning_rate": 0.00013926325247079965, + "loss": 0.9525937438011169, + "step": 776 + }, + { + "epoch": 0.33570658036677453, + "grad_norm": 0.5210421681404114, + "learning_rate": 0.00013962264150943395, + "loss": 0.9218845367431641, + "step": 778 + }, + { + "epoch": 0.3365695792880259, + "grad_norm": 0.549669623374939, + "learning_rate": 0.00013998203054806828, + "loss": 0.877951443195343, + "step": 780 + }, + { + "epoch": 0.33743257820927725, + "grad_norm": 0.5360157489776611, + "learning_rate": 0.0001403414195867026, + "loss": 0.8670064210891724, + "step": 782 + }, + { + "epoch": 0.3382955771305286, + "grad_norm": 0.614734947681427, + "learning_rate": 0.00014070080862533694, + "loss": 0.9561367630958557, + "step": 784 + }, + { + "epoch": 0.33915857605177996, + "grad_norm": 0.5798251628875732, + "learning_rate": 0.00014106019766397124, + "loss": 0.9132505059242249, + "step": 786 + }, + { + "epoch": 0.34002157497303126, + "grad_norm": 0.6267077326774597, + "learning_rate": 0.00014141958670260557, + "loss": 0.9297707080841064, + "step": 788 + }, + { + "epoch": 0.3408845738942826, + "grad_norm": 0.6045349836349487, + "learning_rate": 0.0001417789757412399, + "loss": 0.9382412433624268, + "step": 790 + }, + { + "epoch": 0.341747572815534, + "grad_norm": 0.6125404834747314, + "learning_rate": 0.0001421383647798742, + "loss": 0.9078555107116699, + "step": 792 + }, + { + "epoch": 0.34261057173678533, + "grad_norm": 0.5927051901817322, + "learning_rate": 0.00014249775381850854, + "loss": 0.899101197719574, + "step": 794 + }, + { + "epoch": 0.3434735706580367, + "grad_norm": 0.6315743923187256, + "learning_rate": 0.00014285714285714287, + "loss": 1.028346300125122, + "step": 796 + }, + { + "epoch": 0.34433656957928804, + "grad_norm": 0.549271285533905, + "learning_rate": 0.0001432165318957772, + "loss": 0.8988189697265625, + "step": 798 + }, + { + "epoch": 0.3451995685005394, + "grad_norm": 0.6344245672225952, + "learning_rate": 0.0001435759209344115, + "loss": 1.0489003658294678, + "step": 800 + }, + { + "epoch": 0.3451995685005394, + "eval_loss": 0.9864639639854431, + "eval_runtime": 667.3516, + "eval_samples_per_second": 3.087, + "eval_steps_per_second": 3.087, + "step": 800 + }, + { + "epoch": 0.3460625674217907, + "grad_norm": 0.5625309348106384, + "learning_rate": 0.00014393530997304583, + "loss": 0.8773928284645081, + "step": 802 + }, + { + "epoch": 0.34692556634304206, + "grad_norm": 0.5931969285011292, + "learning_rate": 0.00014429469901168016, + "loss": 0.9116050004959106, + "step": 804 + }, + { + "epoch": 0.3477885652642934, + "grad_norm": 0.5189821720123291, + "learning_rate": 0.00014465408805031446, + "loss": 0.9124425649642944, + "step": 806 + }, + { + "epoch": 0.34865156418554477, + "grad_norm": 0.5392254590988159, + "learning_rate": 0.0001450134770889488, + "loss": 0.9517888426780701, + "step": 808 + }, + { + "epoch": 0.34951456310679613, + "grad_norm": 0.5584444999694824, + "learning_rate": 0.00014537286612758312, + "loss": 0.9947572350502014, + "step": 810 + }, + { + "epoch": 0.3503775620280475, + "grad_norm": 0.5188854932785034, + "learning_rate": 0.00014573225516621745, + "loss": 0.9314022660255432, + "step": 812 + }, + { + "epoch": 0.3512405609492988, + "grad_norm": 0.5783659815788269, + "learning_rate": 0.00014609164420485176, + "loss": 0.9135628938674927, + "step": 814 + }, + { + "epoch": 0.35210355987055014, + "grad_norm": 0.550959050655365, + "learning_rate": 0.0001464510332434861, + "loss": 0.9665075540542603, + "step": 816 + }, + { + "epoch": 0.3529665587918015, + "grad_norm": 0.6013346314430237, + "learning_rate": 0.00014681042228212042, + "loss": 0.9836555123329163, + "step": 818 + }, + { + "epoch": 0.35382955771305286, + "grad_norm": 0.49219194054603577, + "learning_rate": 0.00014716981132075472, + "loss": 0.8900108337402344, + "step": 820 + }, + { + "epoch": 0.3546925566343042, + "grad_norm": 0.5517411828041077, + "learning_rate": 0.00014752920035938905, + "loss": 0.8769304156303406, + "step": 822 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.6062695980072021, + "learning_rate": 0.00014788858939802338, + "loss": 0.9744759202003479, + "step": 824 + }, + { + "epoch": 0.3564185544768069, + "grad_norm": 0.5132041573524475, + "learning_rate": 0.0001482479784366577, + "loss": 0.8875447511672974, + "step": 826 + }, + { + "epoch": 0.3572815533980582, + "grad_norm": 0.551799476146698, + "learning_rate": 0.000148607367475292, + "loss": 0.962710440158844, + "step": 828 + }, + { + "epoch": 0.3581445523193096, + "grad_norm": 0.6046625971794128, + "learning_rate": 0.00014896675651392634, + "loss": 0.8997528553009033, + "step": 830 + }, + { + "epoch": 0.35900755124056094, + "grad_norm": 0.560025691986084, + "learning_rate": 0.00014932614555256067, + "loss": 0.9541417360305786, + "step": 832 + }, + { + "epoch": 0.3598705501618123, + "grad_norm": 0.6441047787666321, + "learning_rate": 0.00014968553459119498, + "loss": 0.907791018486023, + "step": 834 + }, + { + "epoch": 0.36073354908306365, + "grad_norm": 0.5636281967163086, + "learning_rate": 0.0001500449236298293, + "loss": 1.0295937061309814, + "step": 836 + }, + { + "epoch": 0.361596548004315, + "grad_norm": 0.5528303384780884, + "learning_rate": 0.0001504043126684636, + "loss": 0.8875265717506409, + "step": 838 + }, + { + "epoch": 0.36245954692556637, + "grad_norm": 0.5345163345336914, + "learning_rate": 0.00015076370170709794, + "loss": 0.9678915739059448, + "step": 840 + }, + { + "epoch": 0.36332254584681767, + "grad_norm": 0.5551225543022156, + "learning_rate": 0.00015112309074573224, + "loss": 0.9235162734985352, + "step": 842 + }, + { + "epoch": 0.364185544768069, + "grad_norm": 0.5131904482841492, + "learning_rate": 0.00015148247978436657, + "loss": 0.8624292016029358, + "step": 844 + }, + { + "epoch": 0.3650485436893204, + "grad_norm": 0.6811004281044006, + "learning_rate": 0.0001518418688230009, + "loss": 1.0360193252563477, + "step": 846 + }, + { + "epoch": 0.36591154261057174, + "grad_norm": 0.6409741640090942, + "learning_rate": 0.00015220125786163523, + "loss": 0.9254010915756226, + "step": 848 + }, + { + "epoch": 0.3667745415318231, + "grad_norm": 0.5534068942070007, + "learning_rate": 0.00015256064690026953, + "loss": 0.8900630474090576, + "step": 850 + }, + { + "epoch": 0.36763754045307445, + "grad_norm": 0.4999487102031708, + "learning_rate": 0.00015292003593890386, + "loss": 0.88521409034729, + "step": 852 + }, + { + "epoch": 0.3685005393743258, + "grad_norm": 0.5805923938751221, + "learning_rate": 0.0001532794249775382, + "loss": 0.9563921093940735, + "step": 854 + }, + { + "epoch": 0.3693635382955771, + "grad_norm": 0.5485470294952393, + "learning_rate": 0.0001536388140161725, + "loss": 0.8909372687339783, + "step": 856 + }, + { + "epoch": 0.37022653721682847, + "grad_norm": 0.5317923426628113, + "learning_rate": 0.00015399820305480683, + "loss": 0.9145731925964355, + "step": 858 + }, + { + "epoch": 0.3710895361380798, + "grad_norm": 0.6073495745658875, + "learning_rate": 0.00015435759209344116, + "loss": 1.01466965675354, + "step": 860 + }, + { + "epoch": 0.3719525350593312, + "grad_norm": 0.566655158996582, + "learning_rate": 0.0001547169811320755, + "loss": 0.9941825270652771, + "step": 862 + }, + { + "epoch": 0.37281553398058254, + "grad_norm": 0.5262459516525269, + "learning_rate": 0.0001550763701707098, + "loss": 1.0059782266616821, + "step": 864 + }, + { + "epoch": 0.3736785329018339, + "grad_norm": 0.6264083981513977, + "learning_rate": 0.00015543575920934412, + "loss": 1.0332856178283691, + "step": 866 + }, + { + "epoch": 0.3745415318230852, + "grad_norm": 0.6575480699539185, + "learning_rate": 0.00015579514824797845, + "loss": 1.022459626197815, + "step": 868 + }, + { + "epoch": 0.37540453074433655, + "grad_norm": 0.6291940212249756, + "learning_rate": 0.00015615453728661275, + "loss": 0.9550372362136841, + "step": 870 + }, + { + "epoch": 0.3762675296655879, + "grad_norm": 0.6710562109947205, + "learning_rate": 0.00015651392632524708, + "loss": 0.9861716628074646, + "step": 872 + }, + { + "epoch": 0.37713052858683926, + "grad_norm": 0.5505748987197876, + "learning_rate": 0.0001568733153638814, + "loss": 0.9719111919403076, + "step": 874 + }, + { + "epoch": 0.3779935275080906, + "grad_norm": 0.5055180788040161, + "learning_rate": 0.00015723270440251574, + "loss": 0.8698170185089111, + "step": 876 + }, + { + "epoch": 0.378856526429342, + "grad_norm": 0.5935947895050049, + "learning_rate": 0.00015759209344115005, + "loss": 1.029494285583496, + "step": 878 + }, + { + "epoch": 0.37971952535059333, + "grad_norm": 0.538325846195221, + "learning_rate": 0.00015795148247978438, + "loss": 0.923010528087616, + "step": 880 + }, + { + "epoch": 0.38058252427184464, + "grad_norm": 0.587297797203064, + "learning_rate": 0.0001583108715184187, + "loss": 0.9394056797027588, + "step": 882 + }, + { + "epoch": 0.381445523193096, + "grad_norm": 0.5910462737083435, + "learning_rate": 0.000158670260557053, + "loss": 0.9472483992576599, + "step": 884 + }, + { + "epoch": 0.38230852211434735, + "grad_norm": 0.629048764705658, + "learning_rate": 0.00015902964959568734, + "loss": 0.9028263688087463, + "step": 886 + }, + { + "epoch": 0.3831715210355987, + "grad_norm": 0.5028086304664612, + "learning_rate": 0.00015938903863432167, + "loss": 0.9579087495803833, + "step": 888 + }, + { + "epoch": 0.38403451995685006, + "grad_norm": 0.5372384190559387, + "learning_rate": 0.000159748427672956, + "loss": 0.8318673372268677, + "step": 890 + }, + { + "epoch": 0.3848975188781014, + "grad_norm": 0.6314184665679932, + "learning_rate": 0.0001601078167115903, + "loss": 0.9804943203926086, + "step": 892 + }, + { + "epoch": 0.3857605177993528, + "grad_norm": 0.5545229911804199, + "learning_rate": 0.00016046720575022463, + "loss": 1.0078438520431519, + "step": 894 + }, + { + "epoch": 0.3866235167206041, + "grad_norm": 0.4674014151096344, + "learning_rate": 0.00016082659478885896, + "loss": 0.9269036650657654, + "step": 896 + }, + { + "epoch": 0.38748651564185543, + "grad_norm": 1.5887153148651123, + "learning_rate": 0.00016118598382749326, + "loss": 0.8927953243255615, + "step": 898 + }, + { + "epoch": 0.3883495145631068, + "grad_norm": 0.5217035412788391, + "learning_rate": 0.0001615453728661276, + "loss": 0.908074140548706, + "step": 900 + }, + { + "epoch": 0.3883495145631068, + "eval_loss": 0.9741895794868469, + "eval_runtime": 667.2236, + "eval_samples_per_second": 3.087, + "eval_steps_per_second": 3.087, + "step": 900 + }, + { + "epoch": 0.38921251348435815, + "grad_norm": 0.470498651266098, + "learning_rate": 0.00016190476190476192, + "loss": 0.9660369157791138, + "step": 902 + }, + { + "epoch": 0.3900755124056095, + "grad_norm": 0.5111004114151001, + "learning_rate": 0.00016226415094339625, + "loss": 0.9236379265785217, + "step": 904 + }, + { + "epoch": 0.39093851132686086, + "grad_norm": 0.5872815251350403, + "learning_rate": 0.00016262353998203056, + "loss": 1.0061595439910889, + "step": 906 + }, + { + "epoch": 0.3918015102481122, + "grad_norm": 0.5150740742683411, + "learning_rate": 0.0001629829290206649, + "loss": 0.8347328901290894, + "step": 908 + }, + { + "epoch": 0.3926645091693635, + "grad_norm": 0.46554985642433167, + "learning_rate": 0.00016334231805929922, + "loss": 0.9091183543205261, + "step": 910 + }, + { + "epoch": 0.3935275080906149, + "grad_norm": 0.5292875170707703, + "learning_rate": 0.00016370170709793352, + "loss": 0.9299798011779785, + "step": 912 + }, + { + "epoch": 0.39439050701186623, + "grad_norm": 0.5177125930786133, + "learning_rate": 0.00016406109613656785, + "loss": 0.942286491394043, + "step": 914 + }, + { + "epoch": 0.3952535059331176, + "grad_norm": 0.5564161539077759, + "learning_rate": 0.00016442048517520215, + "loss": 0.825290858745575, + "step": 916 + }, + { + "epoch": 0.39611650485436894, + "grad_norm": 0.5572530031204224, + "learning_rate": 0.00016477987421383648, + "loss": 0.876898467540741, + "step": 918 + }, + { + "epoch": 0.3969795037756203, + "grad_norm": 0.7294673323631287, + "learning_rate": 0.0001651392632524708, + "loss": 0.8949798941612244, + "step": 920 + }, + { + "epoch": 0.3978425026968716, + "grad_norm": 0.5234251022338867, + "learning_rate": 0.00016549865229110512, + "loss": 0.8457819223403931, + "step": 922 + }, + { + "epoch": 0.39870550161812296, + "grad_norm": 0.5273709893226624, + "learning_rate": 0.00016585804132973945, + "loss": 0.9080174565315247, + "step": 924 + }, + { + "epoch": 0.3995685005393743, + "grad_norm": 0.5795063376426697, + "learning_rate": 0.00016621743036837378, + "loss": 1.0304023027420044, + "step": 926 + }, + { + "epoch": 0.4004314994606257, + "grad_norm": 0.6153313517570496, + "learning_rate": 0.00016657681940700808, + "loss": 0.8900477886199951, + "step": 928 + }, + { + "epoch": 0.40129449838187703, + "grad_norm": 0.6293173432350159, + "learning_rate": 0.0001669362084456424, + "loss": 1.0130009651184082, + "step": 930 + }, + { + "epoch": 0.4021574973031284, + "grad_norm": 0.5455223321914673, + "learning_rate": 0.00016729559748427674, + "loss": 0.9339282512664795, + "step": 932 + }, + { + "epoch": 0.40302049622437974, + "grad_norm": 0.5349094271659851, + "learning_rate": 0.00016765498652291104, + "loss": 0.9628980755805969, + "step": 934 + }, + { + "epoch": 0.40388349514563104, + "grad_norm": 0.491227924823761, + "learning_rate": 0.00016801437556154537, + "loss": 0.8922860026359558, + "step": 936 + }, + { + "epoch": 0.4047464940668824, + "grad_norm": 0.6331246495246887, + "learning_rate": 0.0001683737646001797, + "loss": 1.0470497608184814, + "step": 938 + }, + { + "epoch": 0.40560949298813376, + "grad_norm": 0.6079246401786804, + "learning_rate": 0.00016873315363881403, + "loss": 0.8868283629417419, + "step": 940 + }, + { + "epoch": 0.4064724919093851, + "grad_norm": 0.5326972603797913, + "learning_rate": 0.00016909254267744833, + "loss": 0.9938711524009705, + "step": 942 + }, + { + "epoch": 0.40733549083063647, + "grad_norm": 0.47754305601119995, + "learning_rate": 0.00016945193171608266, + "loss": 0.8280484676361084, + "step": 944 + }, + { + "epoch": 0.4081984897518878, + "grad_norm": 0.6683310270309448, + "learning_rate": 0.000169811320754717, + "loss": 1.089701533317566, + "step": 946 + }, + { + "epoch": 0.4090614886731392, + "grad_norm": 0.42798754572868347, + "learning_rate": 0.0001701707097933513, + "loss": 0.8535542488098145, + "step": 948 + }, + { + "epoch": 0.4099244875943905, + "grad_norm": 0.5999574065208435, + "learning_rate": 0.00017053009883198563, + "loss": 0.9039298295974731, + "step": 950 + }, + { + "epoch": 0.41078748651564184, + "grad_norm": 0.5752781629562378, + "learning_rate": 0.00017088948787061996, + "loss": 0.8786448240280151, + "step": 952 + }, + { + "epoch": 0.4116504854368932, + "grad_norm": 0.5121532678604126, + "learning_rate": 0.0001712488769092543, + "loss": 0.9206072688102722, + "step": 954 + }, + { + "epoch": 0.41251348435814456, + "grad_norm": 0.611078143119812, + "learning_rate": 0.0001716082659478886, + "loss": 0.9246986508369446, + "step": 956 + }, + { + "epoch": 0.4133764832793959, + "grad_norm": 0.5101020336151123, + "learning_rate": 0.00017196765498652292, + "loss": 0.9221894145011902, + "step": 958 + }, + { + "epoch": 0.41423948220064727, + "grad_norm": 0.5681450963020325, + "learning_rate": 0.00017232704402515725, + "loss": 0.9072799682617188, + "step": 960 + }, + { + "epoch": 0.41510248112189857, + "grad_norm": 0.47865498065948486, + "learning_rate": 0.00017268643306379155, + "loss": 0.9460896849632263, + "step": 962 + }, + { + "epoch": 0.4159654800431499, + "grad_norm": 0.49861401319503784, + "learning_rate": 0.00017304582210242588, + "loss": 0.9121519923210144, + "step": 964 + }, + { + "epoch": 0.4168284789644013, + "grad_norm": 0.43025892972946167, + "learning_rate": 0.0001734052111410602, + "loss": 0.8826848864555359, + "step": 966 + }, + { + "epoch": 0.41769147788565264, + "grad_norm": 0.4600491225719452, + "learning_rate": 0.00017376460017969454, + "loss": 0.8756251335144043, + "step": 968 + }, + { + "epoch": 0.418554476806904, + "grad_norm": 0.5297656059265137, + "learning_rate": 0.00017412398921832885, + "loss": 0.9171333312988281, + "step": 970 + }, + { + "epoch": 0.41941747572815535, + "grad_norm": 0.4906919002532959, + "learning_rate": 0.00017448337825696318, + "loss": 0.8887524008750916, + "step": 972 + }, + { + "epoch": 0.4202804746494067, + "grad_norm": 0.49263402819633484, + "learning_rate": 0.0001748427672955975, + "loss": 0.8345810174942017, + "step": 974 + }, + { + "epoch": 0.421143473570658, + "grad_norm": 0.5706565380096436, + "learning_rate": 0.0001752021563342318, + "loss": 0.968651294708252, + "step": 976 + }, + { + "epoch": 0.42200647249190937, + "grad_norm": 0.5269908308982849, + "learning_rate": 0.00017556154537286614, + "loss": 0.9729376435279846, + "step": 978 + }, + { + "epoch": 0.4228694714131607, + "grad_norm": 0.47058001160621643, + "learning_rate": 0.00017592093441150047, + "loss": 0.963884711265564, + "step": 980 + }, + { + "epoch": 0.4237324703344121, + "grad_norm": 0.5322962999343872, + "learning_rate": 0.0001762803234501348, + "loss": 0.8952447175979614, + "step": 982 + }, + { + "epoch": 0.42459546925566344, + "grad_norm": 0.5750975012779236, + "learning_rate": 0.0001766397124887691, + "loss": 0.8932783603668213, + "step": 984 + }, + { + "epoch": 0.4254584681769148, + "grad_norm": 0.5539655685424805, + "learning_rate": 0.00017699910152740343, + "loss": 0.916595458984375, + "step": 986 + }, + { + "epoch": 0.42632146709816615, + "grad_norm": 0.568000853061676, + "learning_rate": 0.00017735849056603776, + "loss": 0.9669626355171204, + "step": 988 + }, + { + "epoch": 0.42718446601941745, + "grad_norm": 0.6010684370994568, + "learning_rate": 0.00017771787960467206, + "loss": 1.0089105367660522, + "step": 990 + }, + { + "epoch": 0.4280474649406688, + "grad_norm": 0.6083462238311768, + "learning_rate": 0.0001780772686433064, + "loss": 0.9810921549797058, + "step": 992 + }, + { + "epoch": 0.42891046386192017, + "grad_norm": 0.5076655149459839, + "learning_rate": 0.0001784366576819407, + "loss": 0.9524372816085815, + "step": 994 + }, + { + "epoch": 0.4297734627831715, + "grad_norm": 0.5260922312736511, + "learning_rate": 0.00017879604672057503, + "loss": 0.881294846534729, + "step": 996 + }, + { + "epoch": 0.4306364617044229, + "grad_norm": 0.6130498051643372, + "learning_rate": 0.00017915543575920936, + "loss": 0.9138327836990356, + "step": 998 + }, + { + "epoch": 0.43149946062567424, + "grad_norm": 0.5346242785453796, + "learning_rate": 0.00017951482479784366, + "loss": 0.8861367106437683, + "step": 1000 + }, + { + "epoch": 0.43149946062567424, + "eval_loss": 0.9606748819351196, + "eval_runtime": 655.4358, + "eval_samples_per_second": 3.143, + "eval_steps_per_second": 3.143, + "step": 1000 + }, + { + "epoch": 0.4323624595469256, + "grad_norm": 0.5977228879928589, + "learning_rate": 0.000179874213836478, + "loss": 0.8711628913879395, + "step": 1002 + }, + { + "epoch": 0.4332254584681769, + "grad_norm": 0.5547866821289062, + "learning_rate": 0.00018023360287511232, + "loss": 0.9393253326416016, + "step": 1004 + }, + { + "epoch": 0.43408845738942825, + "grad_norm": 0.536856472492218, + "learning_rate": 0.00018059299191374662, + "loss": 0.9486003518104553, + "step": 1006 + }, + { + "epoch": 0.4349514563106796, + "grad_norm": 0.4769814610481262, + "learning_rate": 0.00018095238095238095, + "loss": 0.9042052030563354, + "step": 1008 + }, + { + "epoch": 0.43581445523193096, + "grad_norm": 0.5554604530334473, + "learning_rate": 0.00018131176999101528, + "loss": 0.978546142578125, + "step": 1010 + }, + { + "epoch": 0.4366774541531823, + "grad_norm": 0.5112947225570679, + "learning_rate": 0.00018167115902964959, + "loss": 0.8382073640823364, + "step": 1012 + }, + { + "epoch": 0.4375404530744337, + "grad_norm": 0.45194941759109497, + "learning_rate": 0.00018203054806828392, + "loss": 0.8577026724815369, + "step": 1014 + }, + { + "epoch": 0.438403451995685, + "grad_norm": 0.5115043520927429, + "learning_rate": 0.00018238993710691825, + "loss": 0.8517863154411316, + "step": 1016 + }, + { + "epoch": 0.43926645091693634, + "grad_norm": 0.5485050082206726, + "learning_rate": 0.00018274932614555258, + "loss": 0.9597266912460327, + "step": 1018 + }, + { + "epoch": 0.4401294498381877, + "grad_norm": 0.5742959976196289, + "learning_rate": 0.00018310871518418688, + "loss": 1.0407187938690186, + "step": 1020 + }, + { + "epoch": 0.44099244875943905, + "grad_norm": 0.44870051741600037, + "learning_rate": 0.0001834681042228212, + "loss": 0.8696310520172119, + "step": 1022 + }, + { + "epoch": 0.4418554476806904, + "grad_norm": 0.5179623961448669, + "learning_rate": 0.00018382749326145554, + "loss": 0.9673634767532349, + "step": 1024 + }, + { + "epoch": 0.44271844660194176, + "grad_norm": 0.5404779314994812, + "learning_rate": 0.00018418688230008984, + "loss": 0.9596615433692932, + "step": 1026 + }, + { + "epoch": 0.4435814455231931, + "grad_norm": 0.47766315937042236, + "learning_rate": 0.00018454627133872417, + "loss": 0.8483878970146179, + "step": 1028 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.503380537033081, + "learning_rate": 0.0001849056603773585, + "loss": 0.9330979585647583, + "step": 1030 + }, + { + "epoch": 0.4453074433656958, + "grad_norm": 0.6129396557807922, + "learning_rate": 0.00018526504941599283, + "loss": 0.9341012239456177, + "step": 1032 + }, + { + "epoch": 0.44617044228694713, + "grad_norm": 0.4497876465320587, + "learning_rate": 0.00018562443845462713, + "loss": 0.9139068126678467, + "step": 1034 + }, + { + "epoch": 0.4470334412081985, + "grad_norm": 0.5369747281074524, + "learning_rate": 0.00018598382749326146, + "loss": 0.8874827027320862, + "step": 1036 + }, + { + "epoch": 0.44789644012944985, + "grad_norm": 0.5947322845458984, + "learning_rate": 0.0001863432165318958, + "loss": 0.9653725028038025, + "step": 1038 + }, + { + "epoch": 0.4487594390507012, + "grad_norm": 0.6649987101554871, + "learning_rate": 0.0001867026055705301, + "loss": 0.9553119540214539, + "step": 1040 + }, + { + "epoch": 0.44962243797195256, + "grad_norm": 0.5369387269020081, + "learning_rate": 0.00018706199460916443, + "loss": 0.904233992099762, + "step": 1042 + }, + { + "epoch": 0.45048543689320386, + "grad_norm": 0.4956842362880707, + "learning_rate": 0.00018742138364779876, + "loss": 0.8837952017784119, + "step": 1044 + }, + { + "epoch": 0.4513484358144552, + "grad_norm": 0.48045051097869873, + "learning_rate": 0.0001877807726864331, + "loss": 0.8964687585830688, + "step": 1046 + }, + { + "epoch": 0.4522114347357066, + "grad_norm": 0.4925530254840851, + "learning_rate": 0.0001881401617250674, + "loss": 0.9105878472328186, + "step": 1048 + }, + { + "epoch": 0.45307443365695793, + "grad_norm": 0.5131362080574036, + "learning_rate": 0.00018849955076370172, + "loss": 0.96272873878479, + "step": 1050 + }, + { + "epoch": 0.4539374325782093, + "grad_norm": 0.435739129781723, + "learning_rate": 0.00018885893980233605, + "loss": 0.8796783685684204, + "step": 1052 + }, + { + "epoch": 0.45480043149946064, + "grad_norm": 0.500938892364502, + "learning_rate": 0.00018921832884097035, + "loss": 0.9463814496994019, + "step": 1054 + }, + { + "epoch": 0.455663430420712, + "grad_norm": 0.4769900143146515, + "learning_rate": 0.00018957771787960468, + "loss": 0.9030335545539856, + "step": 1056 + }, + { + "epoch": 0.4565264293419633, + "grad_norm": 0.49585285782814026, + "learning_rate": 0.00018993710691823901, + "loss": 0.986995279788971, + "step": 1058 + }, + { + "epoch": 0.45738942826321466, + "grad_norm": 0.5875195264816284, + "learning_rate": 0.00019029649595687334, + "loss": 0.9297246932983398, + "step": 1060 + }, + { + "epoch": 0.458252427184466, + "grad_norm": 0.5552583932876587, + "learning_rate": 0.00019065588499550765, + "loss": 1.005869746208191, + "step": 1062 + }, + { + "epoch": 0.4591154261057174, + "grad_norm": 0.49282076954841614, + "learning_rate": 0.00019101527403414198, + "loss": 0.8949927091598511, + "step": 1064 + }, + { + "epoch": 0.45997842502696873, + "grad_norm": 0.4951777160167694, + "learning_rate": 0.0001913746630727763, + "loss": 0.9997886419296265, + "step": 1066 + }, + { + "epoch": 0.4608414239482201, + "grad_norm": 0.5154827237129211, + "learning_rate": 0.0001917340521114106, + "loss": 0.9532123804092407, + "step": 1068 + }, + { + "epoch": 0.4617044228694714, + "grad_norm": 0.5547500252723694, + "learning_rate": 0.00019209344115004494, + "loss": 0.8959843516349792, + "step": 1070 + }, + { + "epoch": 0.46256742179072274, + "grad_norm": 0.500188946723938, + "learning_rate": 0.00019245283018867927, + "loss": 0.8201484680175781, + "step": 1072 + }, + { + "epoch": 0.4634304207119741, + "grad_norm": 0.4181794822216034, + "learning_rate": 0.00019281221922731357, + "loss": 0.8255136609077454, + "step": 1074 + }, + { + "epoch": 0.46429341963322546, + "grad_norm": 0.5613874197006226, + "learning_rate": 0.0001931716082659479, + "loss": 0.896024763584137, + "step": 1076 + }, + { + "epoch": 0.4651564185544768, + "grad_norm": 0.5000972151756287, + "learning_rate": 0.0001935309973045822, + "loss": 0.8831873536109924, + "step": 1078 + }, + { + "epoch": 0.46601941747572817, + "grad_norm": 0.6321820616722107, + "learning_rate": 0.00019389038634321654, + "loss": 0.9787988662719727, + "step": 1080 + }, + { + "epoch": 0.4668824163969795, + "grad_norm": 0.4843652546405792, + "learning_rate": 0.00019424977538185087, + "loss": 0.933361828327179, + "step": 1082 + }, + { + "epoch": 0.46774541531823083, + "grad_norm": 0.537330150604248, + "learning_rate": 0.00019460916442048517, + "loss": 0.9046981334686279, + "step": 1084 + }, + { + "epoch": 0.4686084142394822, + "grad_norm": 0.5761371850967407, + "learning_rate": 0.0001949685534591195, + "loss": 0.9625781178474426, + "step": 1086 + }, + { + "epoch": 0.46947141316073354, + "grad_norm": 0.5209522843360901, + "learning_rate": 0.00019532794249775383, + "loss": 0.9280619025230408, + "step": 1088 + }, + { + "epoch": 0.4703344120819849, + "grad_norm": 0.5383933186531067, + "learning_rate": 0.00019568733153638813, + "loss": 0.8236247301101685, + "step": 1090 + }, + { + "epoch": 0.47119741100323626, + "grad_norm": 0.4994274377822876, + "learning_rate": 0.00019604672057502246, + "loss": 0.9404071569442749, + "step": 1092 + }, + { + "epoch": 0.4720604099244876, + "grad_norm": 0.5177807211875916, + "learning_rate": 0.0001964061096136568, + "loss": 0.8517536520957947, + "step": 1094 + }, + { + "epoch": 0.47292340884573897, + "grad_norm": 0.5374870896339417, + "learning_rate": 0.00019676549865229112, + "loss": 0.8214367032051086, + "step": 1096 + }, + { + "epoch": 0.47378640776699027, + "grad_norm": 0.5544074177742004, + "learning_rate": 0.00019712488769092542, + "loss": 1.016176700592041, + "step": 1098 + }, + { + "epoch": 0.4746494066882416, + "grad_norm": 0.5125867128372192, + "learning_rate": 0.00019748427672955975, + "loss": 0.8425421118736267, + "step": 1100 + }, + { + "epoch": 0.4746494066882416, + "eval_loss": 0.944629430770874, + "eval_runtime": 649.9107, + "eval_samples_per_second": 3.17, + "eval_steps_per_second": 3.17, + "step": 1100 + }, + { + "epoch": 0.475512405609493, + "grad_norm": 0.5204418897628784, + "learning_rate": 0.00019784366576819408, + "loss": 0.9444556832313538, + "step": 1102 + }, + { + "epoch": 0.47637540453074434, + "grad_norm": 0.46085885167121887, + "learning_rate": 0.0001982030548068284, + "loss": 0.877082109451294, + "step": 1104 + }, + { + "epoch": 0.4772384034519957, + "grad_norm": 0.5269598960876465, + "learning_rate": 0.00019856244384546272, + "loss": 0.9214640259742737, + "step": 1106 + }, + { + "epoch": 0.47810140237324705, + "grad_norm": 0.4894753694534302, + "learning_rate": 0.00019892183288409705, + "loss": 0.8867175579071045, + "step": 1108 + }, + { + "epoch": 0.47896440129449835, + "grad_norm": 0.5204115509986877, + "learning_rate": 0.00019928122192273138, + "loss": 0.9641162753105164, + "step": 1110 + }, + { + "epoch": 0.4798274002157497, + "grad_norm": 0.6399031281471252, + "learning_rate": 0.00019964061096136568, + "loss": 1.0219199657440186, + "step": 1112 + }, + { + "epoch": 0.48069039913700107, + "grad_norm": 0.3979159891605377, + "learning_rate": 0.0002, + "loss": 0.8189998269081116, + "step": 1114 + }, + { + "epoch": 0.4815533980582524, + "grad_norm": 0.4782681465148926, + "learning_rate": 0.0001999999879427254, + "loss": 0.83241868019104, + "step": 1116 + }, + { + "epoch": 0.4824163969795038, + "grad_norm": 0.5235620141029358, + "learning_rate": 0.00019999995177090454, + "loss": 0.9371466636657715, + "step": 1118 + }, + { + "epoch": 0.48327939590075514, + "grad_norm": 0.4543023407459259, + "learning_rate": 0.00019999989148454606, + "loss": 0.7767758369445801, + "step": 1120 + }, + { + "epoch": 0.4841423948220065, + "grad_norm": 0.6191229820251465, + "learning_rate": 0.00019999980708366457, + "loss": 1.0103063583374023, + "step": 1122 + }, + { + "epoch": 0.4850053937432578, + "grad_norm": 0.45392486453056335, + "learning_rate": 0.00019999969856828042, + "loss": 0.8720875382423401, + "step": 1124 + }, + { + "epoch": 0.48586839266450915, + "grad_norm": 0.42748701572418213, + "learning_rate": 0.00019999956593841974, + "loss": 0.7859150171279907, + "step": 1126 + }, + { + "epoch": 0.4867313915857605, + "grad_norm": 0.4952569603919983, + "learning_rate": 0.00019999940919411454, + "loss": 0.9154419898986816, + "step": 1128 + }, + { + "epoch": 0.48759439050701187, + "grad_norm": 0.5522173047065735, + "learning_rate": 0.00019999922833540264, + "loss": 0.9076330065727234, + "step": 1130 + }, + { + "epoch": 0.4884573894282632, + "grad_norm": 0.5355855226516724, + "learning_rate": 0.00019999902336232758, + "loss": 0.8933543562889099, + "step": 1132 + }, + { + "epoch": 0.4893203883495146, + "grad_norm": 0.4613489508628845, + "learning_rate": 0.00019999879427493885, + "loss": 0.9160735607147217, + "step": 1134 + }, + { + "epoch": 0.49018338727076594, + "grad_norm": 0.4758962094783783, + "learning_rate": 0.0001999985410732917, + "loss": 0.8552446961402893, + "step": 1136 + }, + { + "epoch": 0.49104638619201724, + "grad_norm": 0.4549376964569092, + "learning_rate": 0.00019999826375744715, + "loss": 0.9979530572891235, + "step": 1138 + }, + { + "epoch": 0.4919093851132686, + "grad_norm": 0.4363284409046173, + "learning_rate": 0.0001999979623274721, + "loss": 0.9295380115509033, + "step": 1140 + }, + { + "epoch": 0.49277238403451995, + "grad_norm": 0.5090877413749695, + "learning_rate": 0.0001999976367834392, + "loss": 0.8737252950668335, + "step": 1142 + }, + { + "epoch": 0.4936353829557713, + "grad_norm": 0.45340779423713684, + "learning_rate": 0.000199997287125427, + "loss": 0.8312779068946838, + "step": 1144 + }, + { + "epoch": 0.49449838187702266, + "grad_norm": 0.4771472215652466, + "learning_rate": 0.0001999969133535198, + "loss": 0.9105207324028015, + "step": 1146 + }, + { + "epoch": 0.495361380798274, + "grad_norm": 0.5251384377479553, + "learning_rate": 0.00019999651546780773, + "loss": 0.8578172922134399, + "step": 1148 + }, + { + "epoch": 0.4962243797195254, + "grad_norm": 0.49128198623657227, + "learning_rate": 0.00019999609346838676, + "loss": 0.9193941950798035, + "step": 1150 + }, + { + "epoch": 0.4970873786407767, + "grad_norm": 0.5558596849441528, + "learning_rate": 0.0001999956473553586, + "loss": 0.9141314625740051, + "step": 1152 + }, + { + "epoch": 0.49795037756202803, + "grad_norm": 0.45872750878334045, + "learning_rate": 0.00019999517712883087, + "loss": 0.9058388471603394, + "step": 1154 + }, + { + "epoch": 0.4988133764832794, + "grad_norm": 0.4710173010826111, + "learning_rate": 0.00019999468278891698, + "loss": 0.8462487459182739, + "step": 1156 + }, + { + "epoch": 0.49967637540453075, + "grad_norm": 0.4805637001991272, + "learning_rate": 0.0001999941643357361, + "loss": 0.8960906863212585, + "step": 1158 + }, + { + "epoch": 0.5005393743257821, + "grad_norm": 0.5108052492141724, + "learning_rate": 0.00019999362176941325, + "loss": 0.9074980020523071, + "step": 1160 + }, + { + "epoch": 0.5014023732470334, + "grad_norm": 0.49572333693504333, + "learning_rate": 0.00019999305509007932, + "loss": 0.9710080623626709, + "step": 1162 + }, + { + "epoch": 0.5022653721682848, + "grad_norm": 0.5304561257362366, + "learning_rate": 0.0001999924642978709, + "loss": 0.8877825140953064, + "step": 1164 + }, + { + "epoch": 0.5031283710895361, + "grad_norm": 0.5007328391075134, + "learning_rate": 0.0001999918493929305, + "loss": 0.8955381512641907, + "step": 1166 + }, + { + "epoch": 0.5039913700107875, + "grad_norm": 0.569549024105072, + "learning_rate": 0.0001999912103754064, + "loss": 0.9478562474250793, + "step": 1168 + }, + { + "epoch": 0.5048543689320388, + "grad_norm": 0.5354957580566406, + "learning_rate": 0.00019999054724545264, + "loss": 0.9685383439064026, + "step": 1170 + }, + { + "epoch": 0.5057173678532901, + "grad_norm": 0.547788143157959, + "learning_rate": 0.00019998986000322917, + "loss": 0.9221975207328796, + "step": 1172 + }, + { + "epoch": 0.5065803667745415, + "grad_norm": 0.4919529855251312, + "learning_rate": 0.00019998914864890175, + "loss": 0.9104788303375244, + "step": 1174 + }, + { + "epoch": 0.5074433656957928, + "grad_norm": 0.5274141430854797, + "learning_rate": 0.00019998841318264187, + "loss": 0.9176050424575806, + "step": 1176 + }, + { + "epoch": 0.5083063646170443, + "grad_norm": 0.4257420301437378, + "learning_rate": 0.00019998765360462688, + "loss": 0.8389710187911987, + "step": 1178 + }, + { + "epoch": 0.5091693635382956, + "grad_norm": 0.4947778880596161, + "learning_rate": 0.00019998686991504002, + "loss": 1.0164397954940796, + "step": 1180 + }, + { + "epoch": 0.510032362459547, + "grad_norm": 0.5540821552276611, + "learning_rate": 0.00019998606211407016, + "loss": 0.9900994300842285, + "step": 1182 + }, + { + "epoch": 0.5108953613807983, + "grad_norm": 0.4793289601802826, + "learning_rate": 0.0001999852302019122, + "loss": 0.8797636032104492, + "step": 1184 + }, + { + "epoch": 0.5117583603020496, + "grad_norm": 0.47429659962654114, + "learning_rate": 0.00019998437417876672, + "loss": 0.96225905418396, + "step": 1186 + }, + { + "epoch": 0.512621359223301, + "grad_norm": 0.44918450713157654, + "learning_rate": 0.00019998349404484013, + "loss": 0.8604235649108887, + "step": 1188 + }, + { + "epoch": 0.5134843581445523, + "grad_norm": 0.566977858543396, + "learning_rate": 0.00019998258980034468, + "loss": 1.0325366258621216, + "step": 1190 + }, + { + "epoch": 0.5143473570658037, + "grad_norm": 0.4671999514102936, + "learning_rate": 0.00019998166144549843, + "loss": 0.7658900022506714, + "step": 1192 + }, + { + "epoch": 0.515210355987055, + "grad_norm": 0.42312702536582947, + "learning_rate": 0.00019998070898052521, + "loss": 0.9365432858467102, + "step": 1194 + }, + { + "epoch": 0.5160733549083064, + "grad_norm": 0.4652721881866455, + "learning_rate": 0.00019997973240565476, + "loss": 0.771016538143158, + "step": 1196 + }, + { + "epoch": 0.5169363538295577, + "grad_norm": 0.5048499703407288, + "learning_rate": 0.00019997873172112254, + "loss": 0.9123705625534058, + "step": 1198 + }, + { + "epoch": 0.517799352750809, + "grad_norm": 0.5446439981460571, + "learning_rate": 0.0001999777069271699, + "loss": 0.8975751399993896, + "step": 1200 + }, + { + "epoch": 0.517799352750809, + "eval_loss": 0.9344067573547363, + "eval_runtime": 658.6934, + "eval_samples_per_second": 3.127, + "eval_steps_per_second": 3.127, + "step": 1200 + }, + { + "epoch": 0.5186623516720604, + "grad_norm": 0.5044088959693909, + "learning_rate": 0.0001999766580240439, + "loss": 0.89097660779953, + "step": 1202 + }, + { + "epoch": 0.5195253505933117, + "grad_norm": 0.4672294855117798, + "learning_rate": 0.00019997558501199753, + "loss": 0.9138525128364563, + "step": 1204 + }, + { + "epoch": 0.5203883495145631, + "grad_norm": 0.45749008655548096, + "learning_rate": 0.00019997448789128952, + "loss": 0.8946340680122375, + "step": 1206 + }, + { + "epoch": 0.5212513484358144, + "grad_norm": 0.4828707277774811, + "learning_rate": 0.00019997336666218447, + "loss": 0.8661436438560486, + "step": 1208 + }, + { + "epoch": 0.5221143473570659, + "grad_norm": 0.3975147008895874, + "learning_rate": 0.0001999722213249527, + "loss": 0.7684835195541382, + "step": 1210 + }, + { + "epoch": 0.5229773462783172, + "grad_norm": 0.5642077326774597, + "learning_rate": 0.00019997105187987045, + "loss": 0.9667536616325378, + "step": 1212 + }, + { + "epoch": 0.5238403451995685, + "grad_norm": 0.4907105565071106, + "learning_rate": 0.00019996985832721972, + "loss": 0.8679366707801819, + "step": 1214 + }, + { + "epoch": 0.5247033441208199, + "grad_norm": 0.46214789152145386, + "learning_rate": 0.0001999686406672883, + "loss": 0.8802784085273743, + "step": 1216 + }, + { + "epoch": 0.5255663430420712, + "grad_norm": 0.4355131685733795, + "learning_rate": 0.00019996739890036985, + "loss": 0.8493598103523254, + "step": 1218 + }, + { + "epoch": 0.5264293419633226, + "grad_norm": 0.4293915331363678, + "learning_rate": 0.0001999661330267638, + "loss": 0.8949980735778809, + "step": 1220 + }, + { + "epoch": 0.5272923408845739, + "grad_norm": 0.5452485680580139, + "learning_rate": 0.00019996484304677544, + "loss": 0.9497376680374146, + "step": 1222 + }, + { + "epoch": 0.5281553398058253, + "grad_norm": 0.45874500274658203, + "learning_rate": 0.00019996352896071583, + "loss": 0.9170818328857422, + "step": 1224 + }, + { + "epoch": 0.5290183387270766, + "grad_norm": 0.4414025843143463, + "learning_rate": 0.00019996219076890182, + "loss": 0.7557252645492554, + "step": 1226 + }, + { + "epoch": 0.5298813376483279, + "grad_norm": 0.4891829192638397, + "learning_rate": 0.0001999608284716562, + "loss": 0.848960816860199, + "step": 1228 + }, + { + "epoch": 0.5307443365695793, + "grad_norm": 0.5048345327377319, + "learning_rate": 0.00019995944206930734, + "loss": 0.9555954933166504, + "step": 1230 + }, + { + "epoch": 0.5316073354908306, + "grad_norm": 0.5006756782531738, + "learning_rate": 0.00019995803156218968, + "loss": 0.8080939054489136, + "step": 1232 + }, + { + "epoch": 0.532470334412082, + "grad_norm": 0.5422173738479614, + "learning_rate": 0.00019995659695064332, + "loss": 0.8638371825218201, + "step": 1234 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.4535163640975952, + "learning_rate": 0.0001999551382350142, + "loss": 0.889068067073822, + "step": 1236 + }, + { + "epoch": 0.5341963322545846, + "grad_norm": 0.4588642418384552, + "learning_rate": 0.00019995365541565412, + "loss": 0.8803121447563171, + "step": 1238 + }, + { + "epoch": 0.535059331175836, + "grad_norm": 0.49366191029548645, + "learning_rate": 0.00019995214849292064, + "loss": 0.9694926738739014, + "step": 1240 + }, + { + "epoch": 0.5359223300970873, + "grad_norm": 0.41988301277160645, + "learning_rate": 0.0001999506174671771, + "loss": 0.8367960453033447, + "step": 1242 + }, + { + "epoch": 0.5367853290183388, + "grad_norm": 0.5242130160331726, + "learning_rate": 0.00019994906233879273, + "loss": 0.942340612411499, + "step": 1244 + }, + { + "epoch": 0.53764832793959, + "grad_norm": 0.4899834096431732, + "learning_rate": 0.00019994748310814256, + "loss": 0.8926790356636047, + "step": 1246 + }, + { + "epoch": 0.5385113268608415, + "grad_norm": 0.5496823787689209, + "learning_rate": 0.00019994587977560744, + "loss": 0.99891597032547, + "step": 1248 + }, + { + "epoch": 0.5393743257820928, + "grad_norm": 0.4497414231300354, + "learning_rate": 0.00019994425234157396, + "loss": 0.8873116970062256, + "step": 1250 + }, + { + "epoch": 0.5402373247033441, + "grad_norm": 0.4256928563117981, + "learning_rate": 0.00019994260080643454, + "loss": 0.9041396975517273, + "step": 1252 + }, + { + "epoch": 0.5411003236245955, + "grad_norm": 0.36910608410835266, + "learning_rate": 0.00019994092517058753, + "loss": 0.7647561430931091, + "step": 1254 + }, + { + "epoch": 0.5419633225458468, + "grad_norm": 0.537584125995636, + "learning_rate": 0.0001999392254344369, + "loss": 0.8892287015914917, + "step": 1256 + }, + { + "epoch": 0.5428263214670982, + "grad_norm": 0.49463894963264465, + "learning_rate": 0.00019993750159839264, + "loss": 0.8638571500778198, + "step": 1258 + }, + { + "epoch": 0.5436893203883495, + "grad_norm": 0.5052056908607483, + "learning_rate": 0.00019993575366287036, + "loss": 0.8165372014045715, + "step": 1260 + }, + { + "epoch": 0.5445523193096009, + "grad_norm": 0.47367510199546814, + "learning_rate": 0.0001999339816282916, + "loss": 0.9099977016448975, + "step": 1262 + }, + { + "epoch": 0.5454153182308522, + "grad_norm": 0.4600350558757782, + "learning_rate": 0.00019993218549508364, + "loss": 0.8557311296463013, + "step": 1264 + }, + { + "epoch": 0.5462783171521035, + "grad_norm": 0.5684534311294556, + "learning_rate": 0.0001999303652636797, + "loss": 0.9136497974395752, + "step": 1266 + }, + { + "epoch": 0.5471413160733549, + "grad_norm": 0.5151359438896179, + "learning_rate": 0.00019992852093451865, + "loss": 0.7906932830810547, + "step": 1268 + }, + { + "epoch": 0.5480043149946062, + "grad_norm": 0.48577409982681274, + "learning_rate": 0.00019992665250804525, + "loss": 0.9326766133308411, + "step": 1270 + }, + { + "epoch": 0.5488673139158576, + "grad_norm": 0.490531325340271, + "learning_rate": 0.00019992475998471004, + "loss": 0.9734495878219604, + "step": 1272 + }, + { + "epoch": 0.5497303128371089, + "grad_norm": 0.5092435479164124, + "learning_rate": 0.00019992284336496947, + "loss": 0.8728410005569458, + "step": 1274 + }, + { + "epoch": 0.5505933117583603, + "grad_norm": 0.3843296766281128, + "learning_rate": 0.00019992090264928566, + "loss": 0.7572637796401978, + "step": 1276 + }, + { + "epoch": 0.5514563106796116, + "grad_norm": 0.534304678440094, + "learning_rate": 0.00019991893783812662, + "loss": 0.8895323872566223, + "step": 1278 + }, + { + "epoch": 0.552319309600863, + "grad_norm": 0.4567227363586426, + "learning_rate": 0.00019991694893196614, + "loss": 0.9318088293075562, + "step": 1280 + }, + { + "epoch": 0.5531823085221144, + "grad_norm": 0.48464900255203247, + "learning_rate": 0.0001999149359312839, + "loss": 0.8541979789733887, + "step": 1282 + }, + { + "epoch": 0.5540453074433657, + "grad_norm": 0.5569567084312439, + "learning_rate": 0.00019991289883656524, + "loss": 0.977894127368927, + "step": 1284 + }, + { + "epoch": 0.5549083063646171, + "grad_norm": 0.4637227952480316, + "learning_rate": 0.00019991083764830145, + "loss": 0.8860608339309692, + "step": 1286 + }, + { + "epoch": 0.5557713052858684, + "grad_norm": 0.4096687436103821, + "learning_rate": 0.00019990875236698956, + "loss": 0.8429648876190186, + "step": 1288 + }, + { + "epoch": 0.5566343042071198, + "grad_norm": 0.5221695303916931, + "learning_rate": 0.00019990664299313242, + "loss": 0.8510909080505371, + "step": 1290 + }, + { + "epoch": 0.5574973031283711, + "grad_norm": 0.5155899524688721, + "learning_rate": 0.00019990450952723872, + "loss": 0.8971074223518372, + "step": 1292 + }, + { + "epoch": 0.5583603020496224, + "grad_norm": 0.5064809322357178, + "learning_rate": 0.0001999023519698229, + "loss": 0.9030373096466064, + "step": 1294 + }, + { + "epoch": 0.5592233009708738, + "grad_norm": 0.40551698207855225, + "learning_rate": 0.00019990017032140526, + "loss": 0.7866057753562927, + "step": 1296 + }, + { + "epoch": 0.5600862998921251, + "grad_norm": 0.5009430050849915, + "learning_rate": 0.00019989796458251194, + "loss": 0.9477730989456177, + "step": 1298 + }, + { + "epoch": 0.5609492988133765, + "grad_norm": 0.5192028880119324, + "learning_rate": 0.00019989573475367477, + "loss": 0.9206778407096863, + "step": 1300 + }, + { + "epoch": 0.5609492988133765, + "eval_loss": 0.9177446365356445, + "eval_runtime": 665.3245, + "eval_samples_per_second": 3.096, + "eval_steps_per_second": 3.096, + "step": 1300 + }, + { + "epoch": 0.5618122977346278, + "grad_norm": 0.5580230951309204, + "learning_rate": 0.00019989348083543148, + "loss": 0.9516512155532837, + "step": 1302 + }, + { + "epoch": 0.5626752966558792, + "grad_norm": 0.4151005744934082, + "learning_rate": 0.00019989120282832564, + "loss": 0.7725991606712341, + "step": 1304 + }, + { + "epoch": 0.5635382955771305, + "grad_norm": 0.56330406665802, + "learning_rate": 0.00019988890073290656, + "loss": 0.9241501688957214, + "step": 1306 + }, + { + "epoch": 0.5644012944983818, + "grad_norm": 0.44836440682411194, + "learning_rate": 0.00019988657454972936, + "loss": 0.8351686000823975, + "step": 1308 + }, + { + "epoch": 0.5652642934196332, + "grad_norm": 0.5414754152297974, + "learning_rate": 0.00019988422427935496, + "loss": 0.9033217430114746, + "step": 1310 + }, + { + "epoch": 0.5661272923408845, + "grad_norm": 0.5283750891685486, + "learning_rate": 0.0001998818499223502, + "loss": 0.8885331153869629, + "step": 1312 + }, + { + "epoch": 0.566990291262136, + "grad_norm": 0.45846256613731384, + "learning_rate": 0.00019987945147928758, + "loss": 0.8359912037849426, + "step": 1314 + }, + { + "epoch": 0.5678532901833873, + "grad_norm": 0.44439879059791565, + "learning_rate": 0.0001998770289507455, + "loss": 0.8327895402908325, + "step": 1316 + }, + { + "epoch": 0.5687162891046387, + "grad_norm": 0.5491341948509216, + "learning_rate": 0.00019987458233730813, + "loss": 0.9354757070541382, + "step": 1318 + }, + { + "epoch": 0.56957928802589, + "grad_norm": 0.5502263307571411, + "learning_rate": 0.00019987211163956548, + "loss": 0.944054901599884, + "step": 1320 + }, + { + "epoch": 0.5704422869471413, + "grad_norm": 0.47165682911872864, + "learning_rate": 0.00019986961685811334, + "loss": 0.9515072107315063, + "step": 1322 + }, + { + "epoch": 0.5713052858683927, + "grad_norm": 0.5136987566947937, + "learning_rate": 0.0001998670979935533, + "loss": 0.9455493688583374, + "step": 1324 + }, + { + "epoch": 0.572168284789644, + "grad_norm": 0.4822693169116974, + "learning_rate": 0.00019986455504649277, + "loss": 0.8626728057861328, + "step": 1326 + }, + { + "epoch": 0.5730312837108954, + "grad_norm": 0.4639468193054199, + "learning_rate": 0.000199861988017545, + "loss": 0.8857194781303406, + "step": 1328 + }, + { + "epoch": 0.5738942826321467, + "grad_norm": 0.5224950313568115, + "learning_rate": 0.00019985939690732898, + "loss": 0.9198446273803711, + "step": 1330 + }, + { + "epoch": 0.574757281553398, + "grad_norm": 0.47504499554634094, + "learning_rate": 0.00019985678171646954, + "loss": 0.8785439133644104, + "step": 1332 + }, + { + "epoch": 0.5756202804746494, + "grad_norm": 0.5022051930427551, + "learning_rate": 0.0001998541424455974, + "loss": 0.9593754410743713, + "step": 1334 + }, + { + "epoch": 0.5764832793959007, + "grad_norm": 0.4572875201702118, + "learning_rate": 0.00019985147909534897, + "loss": 0.986197292804718, + "step": 1336 + }, + { + "epoch": 0.5773462783171521, + "grad_norm": 0.5153827667236328, + "learning_rate": 0.00019984879166636644, + "loss": 0.9163863658905029, + "step": 1338 + }, + { + "epoch": 0.5782092772384034, + "grad_norm": 0.4813650846481323, + "learning_rate": 0.00019984608015929792, + "loss": 0.8815995454788208, + "step": 1340 + }, + { + "epoch": 0.5790722761596548, + "grad_norm": 0.4319819211959839, + "learning_rate": 0.0001998433445747973, + "loss": 0.857044517993927, + "step": 1342 + }, + { + "epoch": 0.5799352750809061, + "grad_norm": 0.5128870010375977, + "learning_rate": 0.00019984058491352423, + "loss": 0.8939256072044373, + "step": 1344 + }, + { + "epoch": 0.5807982740021574, + "grad_norm": 0.41450315713882446, + "learning_rate": 0.0001998378011761442, + "loss": 0.9267327785491943, + "step": 1346 + }, + { + "epoch": 0.5816612729234089, + "grad_norm": 0.42916348576545715, + "learning_rate": 0.00019983499336332844, + "loss": 0.8494639992713928, + "step": 1348 + }, + { + "epoch": 0.5825242718446602, + "grad_norm": 0.43995746970176697, + "learning_rate": 0.00019983216147575412, + "loss": 0.9720427989959717, + "step": 1350 + }, + { + "epoch": 0.5833872707659116, + "grad_norm": 0.5063773989677429, + "learning_rate": 0.00019982930551410411, + "loss": 0.9400854706764221, + "step": 1352 + }, + { + "epoch": 0.5842502696871629, + "grad_norm": 0.5029586553573608, + "learning_rate": 0.0001998264254790671, + "loss": 0.8657845258712769, + "step": 1354 + }, + { + "epoch": 0.5851132686084143, + "grad_norm": 0.45519232749938965, + "learning_rate": 0.00019982352137133764, + "loss": 0.8593506813049316, + "step": 1356 + }, + { + "epoch": 0.5859762675296656, + "grad_norm": 0.49116215109825134, + "learning_rate": 0.000199820593191616, + "loss": 0.8658114671707153, + "step": 1358 + }, + { + "epoch": 0.5868392664509169, + "grad_norm": 0.45347318053245544, + "learning_rate": 0.00019981764094060826, + "loss": 0.8044605255126953, + "step": 1360 + }, + { + "epoch": 0.5877022653721683, + "grad_norm": 0.5191754698753357, + "learning_rate": 0.00019981466461902643, + "loss": 0.9176861047744751, + "step": 1362 + }, + { + "epoch": 0.5885652642934196, + "grad_norm": 0.4791528284549713, + "learning_rate": 0.00019981166422758818, + "loss": 0.8453370928764343, + "step": 1364 + }, + { + "epoch": 0.589428263214671, + "grad_norm": 0.5105116367340088, + "learning_rate": 0.00019980863976701705, + "loss": 0.9343777298927307, + "step": 1366 + }, + { + "epoch": 0.5902912621359223, + "grad_norm": 0.44593656063079834, + "learning_rate": 0.00019980559123804236, + "loss": 0.8950760960578918, + "step": 1368 + }, + { + "epoch": 0.5911542610571737, + "grad_norm": 0.4375658631324768, + "learning_rate": 0.0001998025186413993, + "loss": 0.8613521456718445, + "step": 1370 + }, + { + "epoch": 0.592017259978425, + "grad_norm": 0.5138815641403198, + "learning_rate": 0.00019979942197782878, + "loss": 0.8982083201408386, + "step": 1372 + }, + { + "epoch": 0.5928802588996763, + "grad_norm": 0.45473602414131165, + "learning_rate": 0.00019979630124807753, + "loss": 0.9372450709342957, + "step": 1374 + }, + { + "epoch": 0.5937432578209277, + "grad_norm": 0.4961191713809967, + "learning_rate": 0.00019979315645289814, + "loss": 0.8758652806282043, + "step": 1376 + }, + { + "epoch": 0.594606256742179, + "grad_norm": 0.5672827363014221, + "learning_rate": 0.00019978998759304895, + "loss": 0.9852207899093628, + "step": 1378 + }, + { + "epoch": 0.5954692556634305, + "grad_norm": 0.44907906651496887, + "learning_rate": 0.00019978679466929407, + "loss": 0.8451287746429443, + "step": 1380 + }, + { + "epoch": 0.5963322545846818, + "grad_norm": 0.4759652316570282, + "learning_rate": 0.00019978357768240352, + "loss": 0.8381558656692505, + "step": 1382 + }, + { + "epoch": 0.5971952535059332, + "grad_norm": 0.4936048686504364, + "learning_rate": 0.00019978033663315304, + "loss": 0.8820816874504089, + "step": 1384 + }, + { + "epoch": 0.5980582524271845, + "grad_norm": 0.44983741641044617, + "learning_rate": 0.00019977707152232416, + "loss": 0.9156787991523743, + "step": 1386 + }, + { + "epoch": 0.5989212513484358, + "grad_norm": 0.45198705792427063, + "learning_rate": 0.0001997737823507043, + "loss": 0.8285194039344788, + "step": 1388 + }, + { + "epoch": 0.5997842502696872, + "grad_norm": 0.45788463950157166, + "learning_rate": 0.00019977046911908664, + "loss": 0.8109505772590637, + "step": 1390 + }, + { + "epoch": 0.6006472491909385, + "grad_norm": 0.4795142412185669, + "learning_rate": 0.0001997671318282701, + "loss": 0.8285947442054749, + "step": 1392 + }, + { + "epoch": 0.6015102481121899, + "grad_norm": 0.5026728510856628, + "learning_rate": 0.00019976377047905945, + "loss": 0.9497535228729248, + "step": 1394 + }, + { + "epoch": 0.6023732470334412, + "grad_norm": 0.4994853734970093, + "learning_rate": 0.0001997603850722653, + "loss": 0.9171916246414185, + "step": 1396 + }, + { + "epoch": 0.6032362459546926, + "grad_norm": 0.4789866507053375, + "learning_rate": 0.00019975697560870403, + "loss": 0.7894434928894043, + "step": 1398 + }, + { + "epoch": 0.6040992448759439, + "grad_norm": 0.42282742261886597, + "learning_rate": 0.0001997535420891978, + "loss": 0.8942429423332214, + "step": 1400 + }, + { + "epoch": 0.6040992448759439, + "eval_loss": 0.9080492854118347, + "eval_runtime": 661.4597, + "eval_samples_per_second": 3.114, + "eval_steps_per_second": 3.114, + "step": 1400 + }, + { + "epoch": 0.6049622437971952, + "grad_norm": 0.5789905190467834, + "learning_rate": 0.00019975008451457454, + "loss": 0.8938372731208801, + "step": 1402 + }, + { + "epoch": 0.6058252427184466, + "grad_norm": 0.46121683716773987, + "learning_rate": 0.00019974660288566814, + "loss": 0.8746235370635986, + "step": 1404 + }, + { + "epoch": 0.6066882416396979, + "grad_norm": 0.5195551514625549, + "learning_rate": 0.00019974309720331807, + "loss": 0.8650617003440857, + "step": 1406 + }, + { + "epoch": 0.6075512405609493, + "grad_norm": 0.46930259466171265, + "learning_rate": 0.00019973956746836976, + "loss": 0.8853039145469666, + "step": 1408 + }, + { + "epoch": 0.6084142394822006, + "grad_norm": 0.40869632363319397, + "learning_rate": 0.0001997360136816744, + "loss": 0.7865594029426575, + "step": 1410 + }, + { + "epoch": 0.609277238403452, + "grad_norm": 0.5398361086845398, + "learning_rate": 0.00019973243584408895, + "loss": 0.907535970211029, + "step": 1412 + }, + { + "epoch": 0.6101402373247033, + "grad_norm": 0.5110154747962952, + "learning_rate": 0.00019972883395647615, + "loss": 0.8682730197906494, + "step": 1414 + }, + { + "epoch": 0.6110032362459547, + "grad_norm": 0.44484639167785645, + "learning_rate": 0.00019972520801970467, + "loss": 0.8786011338233948, + "step": 1416 + }, + { + "epoch": 0.6118662351672061, + "grad_norm": 0.4768071472644806, + "learning_rate": 0.0001997215580346488, + "loss": 0.9021878242492676, + "step": 1418 + }, + { + "epoch": 0.6127292340884574, + "grad_norm": 0.43265241384506226, + "learning_rate": 0.0001997178840021888, + "loss": 0.7737482786178589, + "step": 1420 + }, + { + "epoch": 0.6135922330097088, + "grad_norm": 0.525692343711853, + "learning_rate": 0.0001997141859232106, + "loss": 0.876280665397644, + "step": 1422 + }, + { + "epoch": 0.6144552319309601, + "grad_norm": 0.48206865787506104, + "learning_rate": 0.00019971046379860594, + "loss": 0.8503577709197998, + "step": 1424 + }, + { + "epoch": 0.6153182308522115, + "grad_norm": 0.6032769680023193, + "learning_rate": 0.00019970671762927246, + "loss": 0.9459730982780457, + "step": 1426 + }, + { + "epoch": 0.6161812297734628, + "grad_norm": 0.4491981863975525, + "learning_rate": 0.0001997029474161135, + "loss": 0.8836647868156433, + "step": 1428 + }, + { + "epoch": 0.6170442286947141, + "grad_norm": 0.47503358125686646, + "learning_rate": 0.00019969915316003824, + "loss": 0.8614388108253479, + "step": 1430 + }, + { + "epoch": 0.6179072276159655, + "grad_norm": 0.44801047444343567, + "learning_rate": 0.00019969533486196162, + "loss": 0.8420360684394836, + "step": 1432 + }, + { + "epoch": 0.6187702265372168, + "grad_norm": 0.45057111978530884, + "learning_rate": 0.00019969149252280446, + "loss": 0.8256269693374634, + "step": 1434 + }, + { + "epoch": 0.6196332254584682, + "grad_norm": 0.4589645266532898, + "learning_rate": 0.00019968762614349327, + "loss": 0.9130199551582336, + "step": 1436 + }, + { + "epoch": 0.6204962243797195, + "grad_norm": 0.48914027214050293, + "learning_rate": 0.00019968373572496045, + "loss": 0.74083012342453, + "step": 1438 + }, + { + "epoch": 0.6213592233009708, + "grad_norm": 0.4582098424434662, + "learning_rate": 0.00019967982126814412, + "loss": 0.8538379669189453, + "step": 1440 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 0.48722779750823975, + "learning_rate": 0.00019967588277398823, + "loss": 0.8780114054679871, + "step": 1442 + }, + { + "epoch": 0.6230852211434735, + "grad_norm": 0.4291327893733978, + "learning_rate": 0.00019967192024344254, + "loss": 0.8341028690338135, + "step": 1444 + }, + { + "epoch": 0.623948220064725, + "grad_norm": 0.4773139953613281, + "learning_rate": 0.00019966793367746265, + "loss": 0.8651667237281799, + "step": 1446 + }, + { + "epoch": 0.6248112189859762, + "grad_norm": 0.45556166768074036, + "learning_rate": 0.00019966392307700986, + "loss": 0.8339929580688477, + "step": 1448 + }, + { + "epoch": 0.6256742179072277, + "grad_norm": 0.5126671195030212, + "learning_rate": 0.00019965988844305129, + "loss": 0.9129340052604675, + "step": 1450 + }, + { + "epoch": 0.626537216828479, + "grad_norm": 0.6067109704017639, + "learning_rate": 0.00019965582977655988, + "loss": 0.9057610034942627, + "step": 1452 + }, + { + "epoch": 0.6274002157497303, + "grad_norm": 0.46425968408584595, + "learning_rate": 0.00019965174707851438, + "loss": 0.874100387096405, + "step": 1454 + }, + { + "epoch": 0.6282632146709817, + "grad_norm": 0.481077641248703, + "learning_rate": 0.0001996476403498993, + "loss": 0.915635347366333, + "step": 1456 + }, + { + "epoch": 0.629126213592233, + "grad_norm": 0.47299909591674805, + "learning_rate": 0.000199643509591705, + "loss": 0.9059650301933289, + "step": 1458 + }, + { + "epoch": 0.6299892125134844, + "grad_norm": 0.48924630880355835, + "learning_rate": 0.00019963935480492753, + "loss": 0.9775188565254211, + "step": 1460 + }, + { + "epoch": 0.6308522114347357, + "grad_norm": 0.4407665431499481, + "learning_rate": 0.0001996351759905688, + "loss": 0.8950685858726501, + "step": 1462 + }, + { + "epoch": 0.6317152103559871, + "grad_norm": 0.5018318295478821, + "learning_rate": 0.00019963097314963657, + "loss": 0.8532119989395142, + "step": 1464 + }, + { + "epoch": 0.6325782092772384, + "grad_norm": 0.43245720863342285, + "learning_rate": 0.0001996267462831443, + "loss": 0.7775963544845581, + "step": 1466 + }, + { + "epoch": 0.6334412081984897, + "grad_norm": 0.5028865337371826, + "learning_rate": 0.00019962249539211125, + "loss": 0.8315839767456055, + "step": 1468 + }, + { + "epoch": 0.6343042071197411, + "grad_norm": 0.4697185754776001, + "learning_rate": 0.0001996182204775626, + "loss": 0.849076509475708, + "step": 1470 + }, + { + "epoch": 0.6351672060409924, + "grad_norm": 0.46725034713745117, + "learning_rate": 0.00019961392154052912, + "loss": 0.8828577995300293, + "step": 1472 + }, + { + "epoch": 0.6360302049622438, + "grad_norm": 0.4301203489303589, + "learning_rate": 0.00019960959858204754, + "loss": 0.850115954875946, + "step": 1474 + }, + { + "epoch": 0.6368932038834951, + "grad_norm": 0.46635881066322327, + "learning_rate": 0.0001996052516031603, + "loss": 0.7912618517875671, + "step": 1476 + }, + { + "epoch": 0.6377562028047465, + "grad_norm": 0.44143620133399963, + "learning_rate": 0.00019960088060491565, + "loss": 0.9072504639625549, + "step": 1478 + }, + { + "epoch": 0.6386192017259978, + "grad_norm": 0.47458893060684204, + "learning_rate": 0.00019959648558836763, + "loss": 0.8976638317108154, + "step": 1480 + }, + { + "epoch": 0.6394822006472491, + "grad_norm": 0.4596816897392273, + "learning_rate": 0.00019959206655457612, + "loss": 0.8142043352127075, + "step": 1482 + }, + { + "epoch": 0.6403451995685006, + "grad_norm": 0.4839977025985718, + "learning_rate": 0.0001995876235046067, + "loss": 0.860643744468689, + "step": 1484 + }, + { + "epoch": 0.6412081984897519, + "grad_norm": 0.3542814552783966, + "learning_rate": 0.00019958315643953085, + "loss": 0.7586524486541748, + "step": 1486 + }, + { + "epoch": 0.6420711974110033, + "grad_norm": 0.5423269271850586, + "learning_rate": 0.00019957866536042572, + "loss": 0.9353570342063904, + "step": 1488 + }, + { + "epoch": 0.6429341963322546, + "grad_norm": 0.4580909013748169, + "learning_rate": 0.00019957415026837437, + "loss": 0.9919291138648987, + "step": 1490 + }, + { + "epoch": 0.643797195253506, + "grad_norm": 0.4211732745170593, + "learning_rate": 0.00019956961116446555, + "loss": 0.8720914125442505, + "step": 1492 + }, + { + "epoch": 0.6446601941747573, + "grad_norm": 0.4583161175251007, + "learning_rate": 0.00019956504804979384, + "loss": 0.8661212921142578, + "step": 1494 + }, + { + "epoch": 0.6455231930960086, + "grad_norm": 0.4359884262084961, + "learning_rate": 0.00019956046092545966, + "loss": 0.8170996308326721, + "step": 1496 + }, + { + "epoch": 0.64638619201726, + "grad_norm": 0.4642556607723236, + "learning_rate": 0.00019955584979256913, + "loss": 0.8607422113418579, + "step": 1498 + }, + { + "epoch": 0.6472491909385113, + "grad_norm": 0.4496007561683655, + "learning_rate": 0.00019955121465223426, + "loss": 0.837529182434082, + "step": 1500 + }, + { + "epoch": 0.6472491909385113, + "eval_loss": 0.8972997665405273, + "eval_runtime": 649.796, + "eval_samples_per_second": 3.17, + "eval_steps_per_second": 3.17, + "step": 1500 + }, + { + "epoch": 0.6481121898597627, + "grad_norm": 0.48363205790519714, + "learning_rate": 0.0001995465555055727, + "loss": 0.9355250000953674, + "step": 1502 + }, + { + "epoch": 0.648975188781014, + "grad_norm": 0.44681695103645325, + "learning_rate": 0.0001995418723537081, + "loss": 0.8286949396133423, + "step": 1504 + }, + { + "epoch": 0.6498381877022654, + "grad_norm": 0.5110394358634949, + "learning_rate": 0.00019953716519776967, + "loss": 0.890398383140564, + "step": 1506 + }, + { + "epoch": 0.6507011866235167, + "grad_norm": 0.4353160262107849, + "learning_rate": 0.00019953243403889257, + "loss": 0.8117311000823975, + "step": 1508 + }, + { + "epoch": 0.651564185544768, + "grad_norm": 0.4731789231300354, + "learning_rate": 0.0001995276788782177, + "loss": 0.8255904316902161, + "step": 1510 + }, + { + "epoch": 0.6524271844660194, + "grad_norm": 0.5447185039520264, + "learning_rate": 0.00019952289971689177, + "loss": 0.9371263384819031, + "step": 1512 + }, + { + "epoch": 0.6532901833872707, + "grad_norm": 0.47616517543792725, + "learning_rate": 0.0001995180965560672, + "loss": 0.8532910943031311, + "step": 1514 + }, + { + "epoch": 0.6541531823085222, + "grad_norm": 0.4412213861942291, + "learning_rate": 0.0001995132693969023, + "loss": 0.8799141645431519, + "step": 1516 + }, + { + "epoch": 0.6550161812297735, + "grad_norm": 0.469911128282547, + "learning_rate": 0.00019950841824056107, + "loss": 0.8395764827728271, + "step": 1518 + }, + { + "epoch": 0.6558791801510249, + "grad_norm": 0.5236243009567261, + "learning_rate": 0.00019950354308821336, + "loss": 0.9556697010993958, + "step": 1520 + }, + { + "epoch": 0.6567421790722762, + "grad_norm": 0.4441990256309509, + "learning_rate": 0.00019949864394103482, + "loss": 0.8524283170700073, + "step": 1522 + }, + { + "epoch": 0.6576051779935275, + "grad_norm": 0.4173077940940857, + "learning_rate": 0.00019949372080020682, + "loss": 0.9213772416114807, + "step": 1524 + }, + { + "epoch": 0.6584681769147789, + "grad_norm": 0.4404120147228241, + "learning_rate": 0.00019948877366691658, + "loss": 0.9236897230148315, + "step": 1526 + }, + { + "epoch": 0.6593311758360302, + "grad_norm": 0.4586230516433716, + "learning_rate": 0.00019948380254235706, + "loss": 0.8373230695724487, + "step": 1528 + }, + { + "epoch": 0.6601941747572816, + "grad_norm": 0.4136028587818146, + "learning_rate": 0.00019947880742772703, + "loss": 0.8256528377532959, + "step": 1530 + }, + { + "epoch": 0.6610571736785329, + "grad_norm": 0.5007129907608032, + "learning_rate": 0.00019947378832423107, + "loss": 0.8580789566040039, + "step": 1532 + }, + { + "epoch": 0.6619201725997842, + "grad_norm": 0.39050203561782837, + "learning_rate": 0.00019946874523307947, + "loss": 0.7647744417190552, + "step": 1534 + }, + { + "epoch": 0.6627831715210356, + "grad_norm": 0.4640588164329529, + "learning_rate": 0.00019946367815548835, + "loss": 0.9042545557022095, + "step": 1536 + }, + { + "epoch": 0.6636461704422869, + "grad_norm": 0.5304957032203674, + "learning_rate": 0.00019945858709267963, + "loss": 0.9114110469818115, + "step": 1538 + }, + { + "epoch": 0.6645091693635383, + "grad_norm": 0.5426004528999329, + "learning_rate": 0.000199453472045881, + "loss": 0.8239460587501526, + "step": 1540 + }, + { + "epoch": 0.6653721682847896, + "grad_norm": 0.44893568754196167, + "learning_rate": 0.00019944833301632593, + "loss": 0.8091367483139038, + "step": 1542 + }, + { + "epoch": 0.666235167206041, + "grad_norm": 0.4294016361236572, + "learning_rate": 0.00019944317000525366, + "loss": 0.9202280640602112, + "step": 1544 + }, + { + "epoch": 0.6670981661272923, + "grad_norm": 0.449633926153183, + "learning_rate": 0.00019943798301390927, + "loss": 0.8884767889976501, + "step": 1546 + }, + { + "epoch": 0.6679611650485436, + "grad_norm": 0.4516827166080475, + "learning_rate": 0.0001994327720435435, + "loss": 0.8390879034996033, + "step": 1548 + }, + { + "epoch": 0.668824163969795, + "grad_norm": 0.422270268201828, + "learning_rate": 0.000199427537095413, + "loss": 0.7388033270835876, + "step": 1550 + }, + { + "epoch": 0.6696871628910464, + "grad_norm": 0.580563485622406, + "learning_rate": 0.00019942227817078015, + "loss": 0.9268350601196289, + "step": 1552 + }, + { + "epoch": 0.6705501618122978, + "grad_norm": 0.4436347782611847, + "learning_rate": 0.00019941699527091316, + "loss": 0.7978561520576477, + "step": 1554 + }, + { + "epoch": 0.6714131607335491, + "grad_norm": 0.4149787127971649, + "learning_rate": 0.0001994116883970859, + "loss": 0.8229286670684814, + "step": 1556 + }, + { + "epoch": 0.6722761596548005, + "grad_norm": 0.49915504455566406, + "learning_rate": 0.00019940635755057813, + "loss": 0.8554545640945435, + "step": 1558 + }, + { + "epoch": 0.6731391585760518, + "grad_norm": 0.45326656103134155, + "learning_rate": 0.00019940100273267537, + "loss": 0.9135572910308838, + "step": 1560 + }, + { + "epoch": 0.6740021574973031, + "grad_norm": 0.48639237880706787, + "learning_rate": 0.0001993956239446689, + "loss": 0.7769742012023926, + "step": 1562 + }, + { + "epoch": 0.6748651564185545, + "grad_norm": 0.5072791576385498, + "learning_rate": 0.0001993902211878558, + "loss": 0.9626237154006958, + "step": 1564 + }, + { + "epoch": 0.6757281553398058, + "grad_norm": 0.4646652638912201, + "learning_rate": 0.00019938479446353892, + "loss": 0.8506941199302673, + "step": 1566 + }, + { + "epoch": 0.6765911542610572, + "grad_norm": 0.4343051314353943, + "learning_rate": 0.00019937934377302688, + "loss": 0.8172947764396667, + "step": 1568 + }, + { + "epoch": 0.6774541531823085, + "grad_norm": 0.46890193223953247, + "learning_rate": 0.00019937386911763407, + "loss": 0.9215856790542603, + "step": 1570 + }, + { + "epoch": 0.6783171521035599, + "grad_norm": 0.5121113061904907, + "learning_rate": 0.0001993683704986807, + "loss": 0.8099892139434814, + "step": 1572 + }, + { + "epoch": 0.6791801510248112, + "grad_norm": 0.4652405083179474, + "learning_rate": 0.0001993628479174928, + "loss": 0.8675104975700378, + "step": 1574 + }, + { + "epoch": 0.6800431499460625, + "grad_norm": 0.4599422812461853, + "learning_rate": 0.00019935730137540198, + "loss": 0.7938929200172424, + "step": 1576 + }, + { + "epoch": 0.6809061488673139, + "grad_norm": 0.4738059937953949, + "learning_rate": 0.0001993517308737459, + "loss": 0.8610570430755615, + "step": 1578 + }, + { + "epoch": 0.6817691477885652, + "grad_norm": 0.5161214470863342, + "learning_rate": 0.00019934613641386776, + "loss": 0.9199413657188416, + "step": 1580 + }, + { + "epoch": 0.6826321467098166, + "grad_norm": 0.4284999370574951, + "learning_rate": 0.00019934051799711672, + "loss": 0.771649181842804, + "step": 1582 + }, + { + "epoch": 0.683495145631068, + "grad_norm": 0.5117548704147339, + "learning_rate": 0.00019933487562484757, + "loss": 0.8861327767372131, + "step": 1584 + }, + { + "epoch": 0.6843581445523194, + "grad_norm": 0.4964369833469391, + "learning_rate": 0.00019932920929842095, + "loss": 0.806983232498169, + "step": 1586 + }, + { + "epoch": 0.6852211434735707, + "grad_norm": 0.4699532091617584, + "learning_rate": 0.00019932351901920327, + "loss": 0.7963525652885437, + "step": 1588 + }, + { + "epoch": 0.686084142394822, + "grad_norm": 0.5329220294952393, + "learning_rate": 0.00019931780478856678, + "loss": 0.9406430721282959, + "step": 1590 + }, + { + "epoch": 0.6869471413160734, + "grad_norm": 0.49823835492134094, + "learning_rate": 0.00019931206660788936, + "loss": 0.8517770171165466, + "step": 1592 + }, + { + "epoch": 0.6878101402373247, + "grad_norm": 0.45245134830474854, + "learning_rate": 0.00019930630447855482, + "loss": 0.8703644275665283, + "step": 1594 + }, + { + "epoch": 0.6886731391585761, + "grad_norm": 0.47524577379226685, + "learning_rate": 0.0001993005184019526, + "loss": 0.9035283327102661, + "step": 1596 + }, + { + "epoch": 0.6895361380798274, + "grad_norm": 0.4537610411643982, + "learning_rate": 0.00019929470837947802, + "loss": 0.9173959493637085, + "step": 1598 + }, + { + "epoch": 0.6903991370010788, + "grad_norm": 0.42469722032546997, + "learning_rate": 0.00019928887441253212, + "loss": 0.8573579788208008, + "step": 1600 + }, + { + "epoch": 0.6903991370010788, + "eval_loss": 0.8880587220191956, + "eval_runtime": 653.9515, + "eval_samples_per_second": 3.15, + "eval_steps_per_second": 3.15, + "step": 1600 + }, + { + "epoch": 0.6912621359223301, + "grad_norm": 0.4388251304626465, + "learning_rate": 0.00019928301650252176, + "loss": 0.849348247051239, + "step": 1602 + }, + { + "epoch": 0.6921251348435814, + "grad_norm": 0.46086886525154114, + "learning_rate": 0.00019927713465085956, + "loss": 0.8298451900482178, + "step": 1604 + }, + { + "epoch": 0.6929881337648328, + "grad_norm": 0.42972785234451294, + "learning_rate": 0.00019927122885896387, + "loss": 0.8860712647438049, + "step": 1606 + }, + { + "epoch": 0.6938511326860841, + "grad_norm": 0.43009471893310547, + "learning_rate": 0.00019926529912825888, + "loss": 0.7972728610038757, + "step": 1608 + }, + { + "epoch": 0.6947141316073355, + "grad_norm": 0.3705308437347412, + "learning_rate": 0.00019925934546017446, + "loss": 0.8661653995513916, + "step": 1610 + }, + { + "epoch": 0.6955771305285868, + "grad_norm": 0.405208945274353, + "learning_rate": 0.00019925336785614635, + "loss": 0.8350111246109009, + "step": 1612 + }, + { + "epoch": 0.6964401294498382, + "grad_norm": 0.4773033857345581, + "learning_rate": 0.00019924736631761602, + "loss": 0.7920925617218018, + "step": 1614 + }, + { + "epoch": 0.6973031283710895, + "grad_norm": 0.4682428240776062, + "learning_rate": 0.00019924134084603075, + "loss": 0.8644304871559143, + "step": 1616 + }, + { + "epoch": 0.6981661272923408, + "grad_norm": 0.5694834589958191, + "learning_rate": 0.00019923529144284346, + "loss": 0.9897904992103577, + "step": 1618 + }, + { + "epoch": 0.6990291262135923, + "grad_norm": 0.40137484669685364, + "learning_rate": 0.00019922921810951302, + "loss": 0.6910083293914795, + "step": 1620 + }, + { + "epoch": 0.6998921251348436, + "grad_norm": 0.42076537013053894, + "learning_rate": 0.000199223120847504, + "loss": 0.8295826315879822, + "step": 1622 + }, + { + "epoch": 0.700755124056095, + "grad_norm": 0.4473017752170563, + "learning_rate": 0.00019921699965828662, + "loss": 0.820871889591217, + "step": 1624 + }, + { + "epoch": 0.7016181229773463, + "grad_norm": 0.43914029002189636, + "learning_rate": 0.00019921085454333706, + "loss": 0.8319019079208374, + "step": 1626 + }, + { + "epoch": 0.7024811218985976, + "grad_norm": 0.4758487939834595, + "learning_rate": 0.0001992046855041372, + "loss": 0.8589251041412354, + "step": 1628 + }, + { + "epoch": 0.703344120819849, + "grad_norm": 0.506401538848877, + "learning_rate": 0.00019919849254217465, + "loss": 0.9219205975532532, + "step": 1630 + }, + { + "epoch": 0.7042071197411003, + "grad_norm": 0.4397984445095062, + "learning_rate": 0.00019919227565894277, + "loss": 0.7824978232383728, + "step": 1632 + }, + { + "epoch": 0.7050701186623517, + "grad_norm": 0.3879252076148987, + "learning_rate": 0.0001991860348559408, + "loss": 0.8472069501876831, + "step": 1634 + }, + { + "epoch": 0.705933117583603, + "grad_norm": 0.42238810658454895, + "learning_rate": 0.00019917977013467368, + "loss": 0.824957013130188, + "step": 1636 + }, + { + "epoch": 0.7067961165048544, + "grad_norm": 0.5235037207603455, + "learning_rate": 0.00019917348149665206, + "loss": 0.9490993022918701, + "step": 1638 + }, + { + "epoch": 0.7076591154261057, + "grad_norm": 0.5195287466049194, + "learning_rate": 0.0001991671689433925, + "loss": 0.960905909538269, + "step": 1640 + }, + { + "epoch": 0.708522114347357, + "grad_norm": 0.5016481876373291, + "learning_rate": 0.00019916083247641716, + "loss": 0.8961218595504761, + "step": 1642 + }, + { + "epoch": 0.7093851132686084, + "grad_norm": 0.5510191321372986, + "learning_rate": 0.00019915447209725408, + "loss": 0.8883417844772339, + "step": 1644 + }, + { + "epoch": 0.7102481121898597, + "grad_norm": 0.4492250084877014, + "learning_rate": 0.0001991480878074371, + "loss": 0.7968636751174927, + "step": 1646 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.4189683496952057, + "learning_rate": 0.00019914167960850563, + "loss": 0.7869221568107605, + "step": 1648 + }, + { + "epoch": 0.7119741100323624, + "grad_norm": 0.4590536653995514, + "learning_rate": 0.0001991352475020051, + "loss": 0.8423646092414856, + "step": 1650 + }, + { + "epoch": 0.7128371089536139, + "grad_norm": 0.5058966875076294, + "learning_rate": 0.00019912879148948652, + "loss": 0.895459771156311, + "step": 1652 + }, + { + "epoch": 0.7137001078748652, + "grad_norm": 0.4904550313949585, + "learning_rate": 0.00019912231157250676, + "loss": 0.8737894296646118, + "step": 1654 + }, + { + "epoch": 0.7145631067961165, + "grad_norm": 0.4670710861682892, + "learning_rate": 0.0001991158077526284, + "loss": 0.8019732236862183, + "step": 1656 + }, + { + "epoch": 0.7154261057173679, + "grad_norm": 0.530343770980835, + "learning_rate": 0.00019910928003141984, + "loss": 0.9474499821662903, + "step": 1658 + }, + { + "epoch": 0.7162891046386192, + "grad_norm": 0.4250960052013397, + "learning_rate": 0.00019910272841045518, + "loss": 0.8738601803779602, + "step": 1660 + }, + { + "epoch": 0.7171521035598706, + "grad_norm": 0.4896513819694519, + "learning_rate": 0.0001990961528913143, + "loss": 0.9578261971473694, + "step": 1662 + }, + { + "epoch": 0.7180151024811219, + "grad_norm": 0.4999626576900482, + "learning_rate": 0.00019908955347558291, + "loss": 0.8116445541381836, + "step": 1664 + }, + { + "epoch": 0.7188781014023733, + "grad_norm": 0.4335242807865143, + "learning_rate": 0.00019908293016485237, + "loss": 0.8783043622970581, + "step": 1666 + }, + { + "epoch": 0.7197411003236246, + "grad_norm": 0.43542763590812683, + "learning_rate": 0.00019907628296071992, + "loss": 0.8223029375076294, + "step": 1668 + }, + { + "epoch": 0.7206040992448759, + "grad_norm": 0.4907461702823639, + "learning_rate": 0.00019906961186478842, + "loss": 1.0053197145462036, + "step": 1670 + }, + { + "epoch": 0.7214670981661273, + "grad_norm": 0.4054848253726959, + "learning_rate": 0.00019906291687866667, + "loss": 0.8107786178588867, + "step": 1672 + }, + { + "epoch": 0.7223300970873786, + "grad_norm": 0.3818599283695221, + "learning_rate": 0.0001990561980039691, + "loss": 0.780781626701355, + "step": 1674 + }, + { + "epoch": 0.72319309600863, + "grad_norm": 0.4128594994544983, + "learning_rate": 0.00019904945524231587, + "loss": 0.8189221620559692, + "step": 1676 + }, + { + "epoch": 0.7240560949298813, + "grad_norm": 0.46902593970298767, + "learning_rate": 0.0001990426885953331, + "loss": 0.83652263879776, + "step": 1678 + }, + { + "epoch": 0.7249190938511327, + "grad_norm": 0.49305564165115356, + "learning_rate": 0.00019903589806465242, + "loss": 0.8057956695556641, + "step": 1680 + }, + { + "epoch": 0.725782092772384, + "grad_norm": 0.44408300518989563, + "learning_rate": 0.0001990290836519114, + "loss": 0.8523716926574707, + "step": 1682 + }, + { + "epoch": 0.7266450916936353, + "grad_norm": 0.5211108922958374, + "learning_rate": 0.00019902224535875326, + "loss": 0.9179236888885498, + "step": 1684 + }, + { + "epoch": 0.7275080906148867, + "grad_norm": 0.4736526608467102, + "learning_rate": 0.00019901538318682705, + "loss": 0.8229476809501648, + "step": 1686 + }, + { + "epoch": 0.728371089536138, + "grad_norm": 0.541716992855072, + "learning_rate": 0.00019900849713778756, + "loss": 0.924200713634491, + "step": 1688 + }, + { + "epoch": 0.7292340884573895, + "grad_norm": 0.4524400532245636, + "learning_rate": 0.00019900158721329532, + "loss": 0.88961261510849, + "step": 1690 + }, + { + "epoch": 0.7300970873786408, + "grad_norm": 0.45256128907203674, + "learning_rate": 0.00019899465341501662, + "loss": 0.8491015434265137, + "step": 1692 + }, + { + "epoch": 0.7309600862998922, + "grad_norm": 0.5346773266792297, + "learning_rate": 0.0001989876957446235, + "loss": 0.8833339810371399, + "step": 1694 + }, + { + "epoch": 0.7318230852211435, + "grad_norm": 0.4696357846260071, + "learning_rate": 0.0001989807142037938, + "loss": 0.8535294532775879, + "step": 1696 + }, + { + "epoch": 0.7326860841423948, + "grad_norm": 0.4304637908935547, + "learning_rate": 0.0001989737087942111, + "loss": 0.8273076415061951, + "step": 1698 + }, + { + "epoch": 0.7335490830636462, + "grad_norm": 0.5085629224777222, + "learning_rate": 0.00019896667951756466, + "loss": 0.8759240508079529, + "step": 1700 + }, + { + "epoch": 0.7335490830636462, + "eval_loss": 0.879119336605072, + "eval_runtime": 663.0553, + "eval_samples_per_second": 3.107, + "eval_steps_per_second": 3.107, + "step": 1700 + }, + { + "epoch": 0.7344120819848975, + "grad_norm": 0.484223872423172, + "learning_rate": 0.00019895962637554964, + "loss": 0.852645218372345, + "step": 1702 + }, + { + "epoch": 0.7352750809061489, + "grad_norm": 0.4819294214248657, + "learning_rate": 0.0001989525493698668, + "loss": 0.878247857093811, + "step": 1704 + }, + { + "epoch": 0.7361380798274002, + "grad_norm": 0.4170311689376831, + "learning_rate": 0.00019894544850222276, + "loss": 0.8964285254478455, + "step": 1706 + }, + { + "epoch": 0.7370010787486516, + "grad_norm": 0.42712801694869995, + "learning_rate": 0.0001989383237743299, + "loss": 0.7479548454284668, + "step": 1708 + }, + { + "epoch": 0.7378640776699029, + "grad_norm": 0.5051686763763428, + "learning_rate": 0.00019893117518790624, + "loss": 0.8190052509307861, + "step": 1710 + }, + { + "epoch": 0.7387270765911542, + "grad_norm": 0.44053253531455994, + "learning_rate": 0.0001989240027446757, + "loss": 0.8646742105484009, + "step": 1712 + }, + { + "epoch": 0.7395900755124056, + "grad_norm": 0.4451025724411011, + "learning_rate": 0.00019891680644636782, + "loss": 0.874261736869812, + "step": 1714 + }, + { + "epoch": 0.7404530744336569, + "grad_norm": 0.4590521454811096, + "learning_rate": 0.00019890958629471798, + "loss": 0.8892465233802795, + "step": 1716 + }, + { + "epoch": 0.7413160733549083, + "grad_norm": 0.39169448614120483, + "learning_rate": 0.00019890234229146732, + "loss": 0.7031586766242981, + "step": 1718 + }, + { + "epoch": 0.7421790722761596, + "grad_norm": 0.46946024894714355, + "learning_rate": 0.00019889507443836266, + "loss": 0.8548433184623718, + "step": 1720 + }, + { + "epoch": 0.7430420711974111, + "grad_norm": 0.42404699325561523, + "learning_rate": 0.0001988877827371566, + "loss": 0.8231223821640015, + "step": 1722 + }, + { + "epoch": 0.7439050701186624, + "grad_norm": 0.40419483184814453, + "learning_rate": 0.00019888046718960755, + "loss": 0.8443762063980103, + "step": 1724 + }, + { + "epoch": 0.7447680690399137, + "grad_norm": 0.4550437927246094, + "learning_rate": 0.0001988731277974796, + "loss": 0.8787111639976501, + "step": 1726 + }, + { + "epoch": 0.7456310679611651, + "grad_norm": 0.42264053225517273, + "learning_rate": 0.0001988657645625426, + "loss": 0.8440850377082825, + "step": 1728 + }, + { + "epoch": 0.7464940668824164, + "grad_norm": 0.4638359844684601, + "learning_rate": 0.0001988583774865721, + "loss": 0.839216947555542, + "step": 1730 + }, + { + "epoch": 0.7473570658036678, + "grad_norm": 0.42644429206848145, + "learning_rate": 0.0001988509665713496, + "loss": 0.8011161684989929, + "step": 1732 + }, + { + "epoch": 0.7482200647249191, + "grad_norm": 0.36932024359703064, + "learning_rate": 0.0001988435318186621, + "loss": 0.850246787071228, + "step": 1734 + }, + { + "epoch": 0.7490830636461704, + "grad_norm": 0.4771935045719147, + "learning_rate": 0.00019883607323030252, + "loss": 0.7782483100891113, + "step": 1736 + }, + { + "epoch": 0.7499460625674218, + "grad_norm": 0.40007370710372925, + "learning_rate": 0.00019882859080806942, + "loss": 0.8337594866752625, + "step": 1738 + }, + { + "epoch": 0.7508090614886731, + "grad_norm": 0.5007418394088745, + "learning_rate": 0.00019882108455376716, + "loss": 0.8287386894226074, + "step": 1740 + }, + { + "epoch": 0.7516720604099245, + "grad_norm": 0.43999138474464417, + "learning_rate": 0.00019881355446920584, + "loss": 0.8655616044998169, + "step": 1742 + }, + { + "epoch": 0.7525350593311758, + "grad_norm": 0.5115824937820435, + "learning_rate": 0.00019880600055620135, + "loss": 0.8695262670516968, + "step": 1744 + }, + { + "epoch": 0.7533980582524272, + "grad_norm": 0.5035707950592041, + "learning_rate": 0.0001987984228165752, + "loss": 0.9207013845443726, + "step": 1746 + }, + { + "epoch": 0.7542610571736785, + "grad_norm": 0.4689575731754303, + "learning_rate": 0.0001987908212521548, + "loss": 0.8798729777336121, + "step": 1748 + }, + { + "epoch": 0.7551240560949298, + "grad_norm": 0.4730616509914398, + "learning_rate": 0.00019878319586477322, + "loss": 0.7737767696380615, + "step": 1750 + }, + { + "epoch": 0.7559870550161812, + "grad_norm": 0.49012845754623413, + "learning_rate": 0.00019877554665626926, + "loss": 0.929466187953949, + "step": 1752 + }, + { + "epoch": 0.7568500539374325, + "grad_norm": 0.43468761444091797, + "learning_rate": 0.0001987678736284875, + "loss": 0.8155670166015625, + "step": 1754 + }, + { + "epoch": 0.757713052858684, + "grad_norm": 0.507399320602417, + "learning_rate": 0.00019876017678327826, + "loss": 0.8082395195960999, + "step": 1756 + }, + { + "epoch": 0.7585760517799353, + "grad_norm": 0.4733552634716034, + "learning_rate": 0.0001987524561224976, + "loss": 0.8905934691429138, + "step": 1758 + }, + { + "epoch": 0.7594390507011867, + "grad_norm": 0.4670012891292572, + "learning_rate": 0.00019874471164800733, + "loss": 0.8794633746147156, + "step": 1760 + }, + { + "epoch": 0.760302049622438, + "grad_norm": 0.4951624572277069, + "learning_rate": 0.000198736943361675, + "loss": 0.8413973450660706, + "step": 1762 + }, + { + "epoch": 0.7611650485436893, + "grad_norm": 0.5478648543357849, + "learning_rate": 0.00019872915126537387, + "loss": 0.9067897200584412, + "step": 1764 + }, + { + "epoch": 0.7620280474649407, + "grad_norm": 0.48215776681900024, + "learning_rate": 0.000198721335360983, + "loss": 0.8932394981384277, + "step": 1766 + }, + { + "epoch": 0.762891046386192, + "grad_norm": 0.4688864052295685, + "learning_rate": 0.00019871349565038715, + "loss": 0.8496726751327515, + "step": 1768 + }, + { + "epoch": 0.7637540453074434, + "grad_norm": 0.4728260636329651, + "learning_rate": 0.0001987056321354768, + "loss": 0.9232800602912903, + "step": 1770 + }, + { + "epoch": 0.7646170442286947, + "grad_norm": 0.44501692056655884, + "learning_rate": 0.00019869774481814828, + "loss": 0.849755585193634, + "step": 1772 + }, + { + "epoch": 0.7654800431499461, + "grad_norm": 0.4189201593399048, + "learning_rate": 0.00019868983370030348, + "loss": 0.8258485794067383, + "step": 1774 + }, + { + "epoch": 0.7663430420711974, + "grad_norm": 0.5144591927528381, + "learning_rate": 0.00019868189878385016, + "loss": 0.8762873411178589, + "step": 1776 + }, + { + "epoch": 0.7672060409924487, + "grad_norm": 0.5048011541366577, + "learning_rate": 0.00019867394007070188, + "loss": 0.8732464909553528, + "step": 1778 + }, + { + "epoch": 0.7680690399137001, + "grad_norm": 0.41639819741249084, + "learning_rate": 0.00019866595756277774, + "loss": 0.8732751607894897, + "step": 1780 + }, + { + "epoch": 0.7689320388349514, + "grad_norm": 0.526757538318634, + "learning_rate": 0.00019865795126200271, + "loss": 0.8453729748725891, + "step": 1782 + }, + { + "epoch": 0.7697950377562028, + "grad_norm": 0.47041091322898865, + "learning_rate": 0.0001986499211703075, + "loss": 0.8780192732810974, + "step": 1784 + }, + { + "epoch": 0.7706580366774541, + "grad_norm": 0.4535890221595764, + "learning_rate": 0.0001986418672896285, + "loss": 0.8508450388908386, + "step": 1786 + }, + { + "epoch": 0.7715210355987056, + "grad_norm": 0.4608050286769867, + "learning_rate": 0.00019863378962190788, + "loss": 0.822467565536499, + "step": 1788 + }, + { + "epoch": 0.7723840345199569, + "grad_norm": 0.5190523862838745, + "learning_rate": 0.00019862568816909356, + "loss": 0.844614565372467, + "step": 1790 + }, + { + "epoch": 0.7732470334412082, + "grad_norm": 0.42502254247665405, + "learning_rate": 0.00019861756293313912, + "loss": 0.8144394755363464, + "step": 1792 + }, + { + "epoch": 0.7741100323624596, + "grad_norm": 0.47112616896629333, + "learning_rate": 0.000198609413916004, + "loss": 0.8836341500282288, + "step": 1794 + }, + { + "epoch": 0.7749730312837109, + "grad_norm": 0.48414838314056396, + "learning_rate": 0.0001986012411196532, + "loss": 0.8846262693405151, + "step": 1796 + }, + { + "epoch": 0.7758360302049623, + "grad_norm": 0.4670039415359497, + "learning_rate": 0.00019859304454605763, + "loss": 0.7993118762969971, + "step": 1798 + }, + { + "epoch": 0.7766990291262136, + "grad_norm": 0.41939061880111694, + "learning_rate": 0.0001985848241971938, + "loss": 0.8389407396316528, + "step": 1800 + }, + { + "epoch": 0.7766990291262136, + "eval_loss": 0.8727664947509766, + "eval_runtime": 668.2062, + "eval_samples_per_second": 3.083, + "eval_steps_per_second": 3.083, + "step": 1800 + }, + { + "epoch": 0.777562028047465, + "grad_norm": 0.4566517174243927, + "learning_rate": 0.00019857658007504405, + "loss": 0.8824291825294495, + "step": 1802 + }, + { + "epoch": 0.7784250269687163, + "grad_norm": 0.4798925518989563, + "learning_rate": 0.0001985683121815964, + "loss": 0.808982789516449, + "step": 1804 + }, + { + "epoch": 0.7792880258899676, + "grad_norm": 0.4659746587276459, + "learning_rate": 0.00019856002051884462, + "loss": 0.955269992351532, + "step": 1806 + }, + { + "epoch": 0.780151024811219, + "grad_norm": 0.48573991656303406, + "learning_rate": 0.00019855170508878818, + "loss": 0.8142994046211243, + "step": 1808 + }, + { + "epoch": 0.7810140237324703, + "grad_norm": 0.4700213670730591, + "learning_rate": 0.00019854336589343236, + "loss": 0.8755695819854736, + "step": 1810 + }, + { + "epoch": 0.7818770226537217, + "grad_norm": 0.42113450169563293, + "learning_rate": 0.00019853500293478806, + "loss": 0.8098483681678772, + "step": 1812 + }, + { + "epoch": 0.782740021574973, + "grad_norm": 0.4218153655529022, + "learning_rate": 0.00019852661621487205, + "loss": 0.8219783306121826, + "step": 1814 + }, + { + "epoch": 0.7836030204962244, + "grad_norm": 0.499052494764328, + "learning_rate": 0.00019851820573570664, + "loss": 0.8314159512519836, + "step": 1816 + }, + { + "epoch": 0.7844660194174757, + "grad_norm": 0.42420193552970886, + "learning_rate": 0.00019850977149932008, + "loss": 0.7985323071479797, + "step": 1818 + }, + { + "epoch": 0.785329018338727, + "grad_norm": 0.46345841884613037, + "learning_rate": 0.0001985013135077462, + "loss": 0.8528217077255249, + "step": 1820 + }, + { + "epoch": 0.7861920172599784, + "grad_norm": 0.4433307945728302, + "learning_rate": 0.00019849283176302462, + "loss": 0.8659319877624512, + "step": 1822 + }, + { + "epoch": 0.7870550161812297, + "grad_norm": 0.48279091715812683, + "learning_rate": 0.00019848432626720067, + "loss": 0.8675655126571655, + "step": 1824 + }, + { + "epoch": 0.7879180151024812, + "grad_norm": 0.5439180731773376, + "learning_rate": 0.0001984757970223254, + "loss": 0.8550227284431458, + "step": 1826 + }, + { + "epoch": 0.7887810140237325, + "grad_norm": 0.45749521255493164, + "learning_rate": 0.0001984672440304556, + "loss": 0.7290607690811157, + "step": 1828 + }, + { + "epoch": 0.7896440129449838, + "grad_norm": 0.4654783606529236, + "learning_rate": 0.00019845866729365378, + "loss": 0.8619251251220703, + "step": 1830 + }, + { + "epoch": 0.7905070118662352, + "grad_norm": 0.42632243037223816, + "learning_rate": 0.00019845006681398823, + "loss": 0.8249601125717163, + "step": 1832 + }, + { + "epoch": 0.7913700107874865, + "grad_norm": 0.4747186005115509, + "learning_rate": 0.0001984414425935329, + "loss": 0.7138552069664001, + "step": 1834 + }, + { + "epoch": 0.7922330097087379, + "grad_norm": 0.4462338387966156, + "learning_rate": 0.0001984327946343674, + "loss": 0.903292715549469, + "step": 1836 + }, + { + "epoch": 0.7930960086299892, + "grad_norm": 0.4581359922885895, + "learning_rate": 0.00019842412293857726, + "loss": 0.7569618225097656, + "step": 1838 + }, + { + "epoch": 0.7939590075512406, + "grad_norm": 0.4183015525341034, + "learning_rate": 0.00019841542750825356, + "loss": 0.8063036203384399, + "step": 1840 + }, + { + "epoch": 0.7948220064724919, + "grad_norm": 0.3954181373119354, + "learning_rate": 0.0001984067083454932, + "loss": 0.81150221824646, + "step": 1842 + }, + { + "epoch": 0.7956850053937432, + "grad_norm": 0.46220019459724426, + "learning_rate": 0.0001983979654523987, + "loss": 0.841649055480957, + "step": 1844 + }, + { + "epoch": 0.7965480043149946, + "grad_norm": 0.47807541489601135, + "learning_rate": 0.00019838919883107843, + "loss": 0.8019483685493469, + "step": 1846 + }, + { + "epoch": 0.7974110032362459, + "grad_norm": 0.48015692830085754, + "learning_rate": 0.0001983804084836464, + "loss": 0.9343363046646118, + "step": 1848 + }, + { + "epoch": 0.7982740021574973, + "grad_norm": 0.4906708896160126, + "learning_rate": 0.00019837159441222238, + "loss": 0.9163194894790649, + "step": 1850 + }, + { + "epoch": 0.7991370010787486, + "grad_norm": 0.4856911599636078, + "learning_rate": 0.0001983627566189318, + "loss": 0.8017736077308655, + "step": 1852 + }, + { + "epoch": 0.8, + "grad_norm": 0.49403145909309387, + "learning_rate": 0.0001983538951059059, + "loss": 0.8375223875045776, + "step": 1854 + }, + { + "epoch": 0.8008629989212513, + "grad_norm": 0.4237985908985138, + "learning_rate": 0.00019834500987528158, + "loss": 0.8213951587677002, + "step": 1856 + }, + { + "epoch": 0.8017259978425026, + "grad_norm": 0.3977980315685272, + "learning_rate": 0.00019833610092920149, + "loss": 0.8086028099060059, + "step": 1858 + }, + { + "epoch": 0.8025889967637541, + "grad_norm": 0.435253381729126, + "learning_rate": 0.00019832716826981392, + "loss": 0.9402202367782593, + "step": 1860 + }, + { + "epoch": 0.8034519956850054, + "grad_norm": 0.4074764847755432, + "learning_rate": 0.000198318211899273, + "loss": 0.7730796933174133, + "step": 1862 + }, + { + "epoch": 0.8043149946062568, + "grad_norm": 0.48804348707199097, + "learning_rate": 0.0001983092318197385, + "loss": 0.9185802936553955, + "step": 1864 + }, + { + "epoch": 0.8051779935275081, + "grad_norm": 0.44363343715667725, + "learning_rate": 0.00019830022803337592, + "loss": 0.8578243851661682, + "step": 1866 + }, + { + "epoch": 0.8060409924487595, + "grad_norm": 0.46645957231521606, + "learning_rate": 0.00019829120054235653, + "loss": 0.8682060241699219, + "step": 1868 + }, + { + "epoch": 0.8069039913700108, + "grad_norm": 0.4527221620082855, + "learning_rate": 0.00019828214934885718, + "loss": 0.7845040559768677, + "step": 1870 + }, + { + "epoch": 0.8077669902912621, + "grad_norm": 0.4103536307811737, + "learning_rate": 0.0001982730744550606, + "loss": 0.8768247961997986, + "step": 1872 + }, + { + "epoch": 0.8086299892125135, + "grad_norm": 0.5257515907287598, + "learning_rate": 0.00019826397586315513, + "loss": 0.850267231464386, + "step": 1874 + }, + { + "epoch": 0.8094929881337648, + "grad_norm": 0.46675893664360046, + "learning_rate": 0.00019825485357533485, + "loss": 0.8234293460845947, + "step": 1876 + }, + { + "epoch": 0.8103559870550162, + "grad_norm": 0.46315401792526245, + "learning_rate": 0.00019824570759379958, + "loss": 0.8131387829780579, + "step": 1878 + }, + { + "epoch": 0.8112189859762675, + "grad_norm": 0.4766870439052582, + "learning_rate": 0.00019823653792075478, + "loss": 0.7680494785308838, + "step": 1880 + }, + { + "epoch": 0.8120819848975189, + "grad_norm": 0.39025625586509705, + "learning_rate": 0.00019822734455841173, + "loss": 0.7891425490379333, + "step": 1882 + }, + { + "epoch": 0.8129449838187702, + "grad_norm": 0.4672441780567169, + "learning_rate": 0.0001982181275089874, + "loss": 0.7990400791168213, + "step": 1884 + }, + { + "epoch": 0.8138079827400215, + "grad_norm": 0.44310975074768066, + "learning_rate": 0.00019820888677470432, + "loss": 0.7828341722488403, + "step": 1886 + }, + { + "epoch": 0.8146709816612729, + "grad_norm": 0.45098716020584106, + "learning_rate": 0.00019819962235779096, + "loss": 0.897715151309967, + "step": 1888 + }, + { + "epoch": 0.8155339805825242, + "grad_norm": 0.466805100440979, + "learning_rate": 0.00019819033426048135, + "loss": 0.7987668514251709, + "step": 1890 + }, + { + "epoch": 0.8163969795037757, + "grad_norm": 0.4438319206237793, + "learning_rate": 0.00019818102248501528, + "loss": 0.7950236201286316, + "step": 1892 + }, + { + "epoch": 0.817259978425027, + "grad_norm": 0.42012497782707214, + "learning_rate": 0.00019817168703363823, + "loss": 0.8789975643157959, + "step": 1894 + }, + { + "epoch": 0.8181229773462784, + "grad_norm": 0.4359394311904907, + "learning_rate": 0.0001981623279086014, + "loss": 0.8159777522087097, + "step": 1896 + }, + { + "epoch": 0.8189859762675297, + "grad_norm": 0.44709593057632446, + "learning_rate": 0.00019815294511216173, + "loss": 0.84877610206604, + "step": 1898 + }, + { + "epoch": 0.819848975188781, + "grad_norm": 0.4315306544303894, + "learning_rate": 0.00019814353864658184, + "loss": 0.8467556834220886, + "step": 1900 + }, + { + "epoch": 0.819848975188781, + "eval_loss": 0.8643407821655273, + "eval_runtime": 658.6942, + "eval_samples_per_second": 3.127, + "eval_steps_per_second": 3.127, + "step": 1900 + }, + { + "epoch": 0.8207119741100324, + "grad_norm": 0.35530397295951843, + "learning_rate": 0.00019813410851412998, + "loss": 0.7398589849472046, + "step": 1902 + }, + { + "epoch": 0.8215749730312837, + "grad_norm": 0.46949300169944763, + "learning_rate": 0.00019812465471708032, + "loss": 0.8544237613677979, + "step": 1904 + }, + { + "epoch": 0.8224379719525351, + "grad_norm": 0.4961565434932709, + "learning_rate": 0.00019811517725771248, + "loss": 0.8242526054382324, + "step": 1906 + }, + { + "epoch": 0.8233009708737864, + "grad_norm": 0.45835059881210327, + "learning_rate": 0.00019810567613831194, + "loss": 0.7856690287590027, + "step": 1908 + }, + { + "epoch": 0.8241639697950378, + "grad_norm": 0.4446084797382355, + "learning_rate": 0.0001980961513611699, + "loss": 0.8361829519271851, + "step": 1910 + }, + { + "epoch": 0.8250269687162891, + "grad_norm": 0.4470907747745514, + "learning_rate": 0.00019808660292858313, + "loss": 0.8993050456047058, + "step": 1912 + }, + { + "epoch": 0.8258899676375404, + "grad_norm": 0.44883644580841064, + "learning_rate": 0.0001980770308428543, + "loss": 0.8702824711799622, + "step": 1914 + }, + { + "epoch": 0.8267529665587918, + "grad_norm": 0.43215686082839966, + "learning_rate": 0.00019806743510629159, + "loss": 0.8454389572143555, + "step": 1916 + }, + { + "epoch": 0.8276159654800431, + "grad_norm": 0.4525185823440552, + "learning_rate": 0.00019805781572120897, + "loss": 0.8621824383735657, + "step": 1918 + }, + { + "epoch": 0.8284789644012945, + "grad_norm": 0.4616840183734894, + "learning_rate": 0.00019804817268992615, + "loss": 0.8661681413650513, + "step": 1920 + }, + { + "epoch": 0.8293419633225458, + "grad_norm": 0.4252975583076477, + "learning_rate": 0.0001980385060147685, + "loss": 0.8376660346984863, + "step": 1922 + }, + { + "epoch": 0.8302049622437971, + "grad_norm": 0.44600266218185425, + "learning_rate": 0.00019802881569806706, + "loss": 0.9258401393890381, + "step": 1924 + }, + { + "epoch": 0.8310679611650486, + "grad_norm": 0.48872479796409607, + "learning_rate": 0.00019801910174215866, + "loss": 0.8804965615272522, + "step": 1926 + }, + { + "epoch": 0.8319309600862999, + "grad_norm": 0.5357037782669067, + "learning_rate": 0.00019800936414938574, + "loss": 0.8561494946479797, + "step": 1928 + }, + { + "epoch": 0.8327939590075513, + "grad_norm": 0.39637291431427, + "learning_rate": 0.00019799960292209647, + "loss": 0.782166063785553, + "step": 1930 + }, + { + "epoch": 0.8336569579288026, + "grad_norm": 0.521138072013855, + "learning_rate": 0.00019798981806264476, + "loss": 0.9048293232917786, + "step": 1932 + }, + { + "epoch": 0.834519956850054, + "grad_norm": 0.4723529815673828, + "learning_rate": 0.00019798000957339015, + "loss": 0.9269952774047852, + "step": 1934 + }, + { + "epoch": 0.8353829557713053, + "grad_norm": 0.42827340960502625, + "learning_rate": 0.0001979701774566979, + "loss": 0.8620670437812805, + "step": 1936 + }, + { + "epoch": 0.8362459546925566, + "grad_norm": 0.4305116534233093, + "learning_rate": 0.00019796032171493907, + "loss": 0.8016669750213623, + "step": 1938 + }, + { + "epoch": 0.837108953613808, + "grad_norm": 0.4995502233505249, + "learning_rate": 0.00019795044235049024, + "loss": 0.879247784614563, + "step": 1940 + }, + { + "epoch": 0.8379719525350593, + "grad_norm": 0.49229878187179565, + "learning_rate": 0.0001979405393657338, + "loss": 0.9476580023765564, + "step": 1942 + }, + { + "epoch": 0.8388349514563107, + "grad_norm": 0.45756596326828003, + "learning_rate": 0.0001979306127630578, + "loss": 0.8654064536094666, + "step": 1944 + }, + { + "epoch": 0.839697950377562, + "grad_norm": 0.4855344891548157, + "learning_rate": 0.00019792066254485603, + "loss": 0.7792956829071045, + "step": 1946 + }, + { + "epoch": 0.8405609492988134, + "grad_norm": 0.4358632266521454, + "learning_rate": 0.00019791068871352787, + "loss": 0.8000320792198181, + "step": 1948 + }, + { + "epoch": 0.8414239482200647, + "grad_norm": 0.4225342273712158, + "learning_rate": 0.00019790069127147852, + "loss": 0.818372368812561, + "step": 1950 + }, + { + "epoch": 0.842286947141316, + "grad_norm": 0.3894529938697815, + "learning_rate": 0.00019789067022111886, + "loss": 0.727220892906189, + "step": 1952 + }, + { + "epoch": 0.8431499460625674, + "grad_norm": 0.5060731768608093, + "learning_rate": 0.0001978806255648653, + "loss": 0.894101083278656, + "step": 1954 + }, + { + "epoch": 0.8440129449838187, + "grad_norm": 0.4165003001689911, + "learning_rate": 0.0001978705573051402, + "loss": 0.878365695476532, + "step": 1956 + }, + { + "epoch": 0.8448759439050701, + "grad_norm": 0.48767927289009094, + "learning_rate": 0.0001978604654443714, + "loss": 0.8390909433364868, + "step": 1958 + }, + { + "epoch": 0.8457389428263214, + "grad_norm": 0.43019410967826843, + "learning_rate": 0.00019785034998499247, + "loss": 0.8807769417762756, + "step": 1960 + }, + { + "epoch": 0.8466019417475729, + "grad_norm": 0.4430403709411621, + "learning_rate": 0.0001978402109294428, + "loss": 0.8037779331207275, + "step": 1962 + }, + { + "epoch": 0.8474649406688242, + "grad_norm": 0.41642463207244873, + "learning_rate": 0.0001978300482801673, + "loss": 0.8341337442398071, + "step": 1964 + }, + { + "epoch": 0.8483279395900755, + "grad_norm": 0.45358774065971375, + "learning_rate": 0.00019781986203961668, + "loss": 0.854821503162384, + "step": 1966 + }, + { + "epoch": 0.8491909385113269, + "grad_norm": 0.4316342771053314, + "learning_rate": 0.00019780965221024728, + "loss": 0.8527678847312927, + "step": 1968 + }, + { + "epoch": 0.8500539374325782, + "grad_norm": 0.4581106901168823, + "learning_rate": 0.00019779941879452122, + "loss": 0.7461717128753662, + "step": 1970 + }, + { + "epoch": 0.8509169363538296, + "grad_norm": 0.49578142166137695, + "learning_rate": 0.0001977891617949062, + "loss": 0.884441077709198, + "step": 1972 + }, + { + "epoch": 0.8517799352750809, + "grad_norm": 0.4366011917591095, + "learning_rate": 0.00019777888121387562, + "loss": 0.855915904045105, + "step": 1974 + }, + { + "epoch": 0.8526429341963323, + "grad_norm": 0.486162930727005, + "learning_rate": 0.00019776857705390864, + "loss": 0.7563765645027161, + "step": 1976 + }, + { + "epoch": 0.8535059331175836, + "grad_norm": 0.5162674784660339, + "learning_rate": 0.00019775824931749005, + "loss": 0.8346326947212219, + "step": 1978 + }, + { + "epoch": 0.8543689320388349, + "grad_norm": 0.5824693441390991, + "learning_rate": 0.0001977478980071103, + "loss": 0.8701820969581604, + "step": 1980 + }, + { + "epoch": 0.8552319309600863, + "grad_norm": 0.4297148883342743, + "learning_rate": 0.00019773752312526565, + "loss": 0.893528938293457, + "step": 1982 + }, + { + "epoch": 0.8560949298813376, + "grad_norm": 0.42978280782699585, + "learning_rate": 0.00019772712467445788, + "loss": 0.8201018571853638, + "step": 1984 + }, + { + "epoch": 0.856957928802589, + "grad_norm": 0.5192655324935913, + "learning_rate": 0.00019771670265719454, + "loss": 0.9080212116241455, + "step": 1986 + }, + { + "epoch": 0.8578209277238403, + "grad_norm": 0.452690452337265, + "learning_rate": 0.00019770625707598885, + "loss": 0.8518272638320923, + "step": 1988 + }, + { + "epoch": 0.8586839266450917, + "grad_norm": 0.4371768832206726, + "learning_rate": 0.00019769578793335976, + "loss": 0.9426717758178711, + "step": 1990 + }, + { + "epoch": 0.859546925566343, + "grad_norm": 0.44595038890838623, + "learning_rate": 0.0001976852952318318, + "loss": 0.8065400123596191, + "step": 1992 + }, + { + "epoch": 0.8604099244875943, + "grad_norm": 0.4355090260505676, + "learning_rate": 0.0001976747789739353, + "loss": 0.7674415707588196, + "step": 1994 + }, + { + "epoch": 0.8612729234088458, + "grad_norm": 0.43745186924934387, + "learning_rate": 0.00019766423916220616, + "loss": 0.813849925994873, + "step": 1996 + }, + { + "epoch": 0.8621359223300971, + "grad_norm": 0.4588927924633026, + "learning_rate": 0.00019765367579918598, + "loss": 0.7870585322380066, + "step": 1998 + }, + { + "epoch": 0.8629989212513485, + "grad_norm": 0.4170977473258972, + "learning_rate": 0.00019764308888742214, + "loss": 0.8383269309997559, + "step": 2000 + }, + { + "epoch": 0.8629989212513485, + "eval_loss": 0.8567262887954712, + "eval_runtime": 646.6443, + "eval_samples_per_second": 3.186, + "eval_steps_per_second": 3.186, + "step": 2000 + } + ], + "logging_steps": 2, + "max_steps": 13908, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 3, + "early_stopping_threshold": 0.001 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1995266703367885e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}