{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.062, "eval_steps": 1000, "global_step": 91000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 2e-06, "grad_norm": 29.506126403808594, "learning_rate": 0.0, "loss": 1.5091, "step": 1 }, { "epoch": 0.0002, "grad_norm": 7.35781717300415, "learning_rate": 9.9e-07, "loss": 1.6562, "step": 100 }, { "epoch": 0.0004, "grad_norm": 4.9180989265441895, "learning_rate": 1.99e-06, "loss": 1.6176, "step": 200 }, { "epoch": 0.0006, "grad_norm": 1.8868086338043213, "learning_rate": 2.99e-06, "loss": 1.548, "step": 300 }, { "epoch": 0.0008, "grad_norm": 7.365355491638184, "learning_rate": 3.99e-06, "loss": 1.4958, "step": 400 }, { "epoch": 0.001, "grad_norm": 8.965476989746094, "learning_rate": 4.9900000000000005e-06, "loss": 1.4918, "step": 500 }, { "epoch": 0.0012, "grad_norm": 2.2186834812164307, "learning_rate": 5.99e-06, "loss": 1.4807, "step": 600 }, { "epoch": 0.0014, "grad_norm": 1.970430850982666, "learning_rate": 6.990000000000001e-06, "loss": 1.4312, "step": 700 }, { "epoch": 0.0016, "grad_norm": 1.5914119482040405, "learning_rate": 7.99e-06, "loss": 1.3848, "step": 800 }, { "epoch": 0.0018, "grad_norm": 1.7615679502487183, "learning_rate": 8.99e-06, "loss": 1.4126, "step": 900 }, { "epoch": 0.002, "grad_norm": 1.5981565713882446, "learning_rate": 9.990000000000001e-06, "loss": 1.3768, "step": 1000 }, { "epoch": 0.002, "eval_loss": 1.1488478183746338, "eval_runtime": 84.3931, "eval_samples_per_second": 182.989, "eval_steps_per_second": 2.868, "step": 1000 }, { "epoch": 0.0022, "grad_norm": 1.9463247060775757, "learning_rate": 1.099e-05, "loss": 1.4649, "step": 1100 }, { "epoch": 0.0024, "grad_norm": 1.997353434562683, "learning_rate": 1.199e-05, "loss": 1.422, "step": 1200 }, { "epoch": 0.0026, "grad_norm": 2.028587818145752, "learning_rate": 1.299e-05, "loss": 1.4101, "step": 1300 }, { "epoch": 0.0028, "grad_norm": 1.8055784702301025, "learning_rate": 1.399e-05, "loss": 1.379, "step": 1400 }, { "epoch": 0.003, "grad_norm": 2.630389451980591, "learning_rate": 1.499e-05, "loss": 1.3915, "step": 1500 }, { "epoch": 0.0032, "grad_norm": 1.4471231698989868, "learning_rate": 1.599e-05, "loss": 1.3651, "step": 1600 }, { "epoch": 0.0034, "grad_norm": 1.4115934371948242, "learning_rate": 1.699e-05, "loss": 1.3327, "step": 1700 }, { "epoch": 0.0036, "grad_norm": 1.1099858283996582, "learning_rate": 1.7990000000000002e-05, "loss": 1.304, "step": 1800 }, { "epoch": 0.0038, "grad_norm": 1.5767651796340942, "learning_rate": 1.8990000000000003e-05, "loss": 1.3375, "step": 1900 }, { "epoch": 0.004, "grad_norm": 1.3484268188476562, "learning_rate": 1.999e-05, "loss": 1.3746, "step": 2000 }, { "epoch": 0.004, "eval_loss": 1.1486531496047974, "eval_runtime": 76.1223, "eval_samples_per_second": 202.871, "eval_steps_per_second": 3.179, "step": 2000 }, { "epoch": 0.0042, "grad_norm": 1.6412079334259033, "learning_rate": 2.099e-05, "loss": 1.3931, "step": 2100 }, { "epoch": 0.0044, "grad_norm": 1.17317533493042, "learning_rate": 2.199e-05, "loss": 1.3512, "step": 2200 }, { "epoch": 0.0046, "grad_norm": 0.8342074751853943, "learning_rate": 2.2990000000000002e-05, "loss": 1.3805, "step": 2300 }, { "epoch": 0.0048, "grad_norm": 1.5843234062194824, "learning_rate": 2.3990000000000002e-05, "loss": 1.377, "step": 2400 }, { "epoch": 0.005, "grad_norm": 1.915511131286621, "learning_rate": 2.4990000000000003e-05, "loss": 1.3659, "step": 2500 }, { "epoch": 0.0052, "grad_norm": 1.6507076025009155, "learning_rate": 2.5990000000000004e-05, "loss": 1.2875, "step": 2600 }, { "epoch": 0.0054, "grad_norm": 1.5680265426635742, "learning_rate": 2.6989999999999997e-05, "loss": 1.3402, "step": 2700 }, { "epoch": 0.0056, "grad_norm": 0.8005309700965881, "learning_rate": 2.7989999999999998e-05, "loss": 1.3565, "step": 2800 }, { "epoch": 0.0058, "grad_norm": 1.664014220237732, "learning_rate": 2.8990000000000002e-05, "loss": 1.3118, "step": 2900 }, { "epoch": 0.006, "grad_norm": 1.1597651243209839, "learning_rate": 2.9990000000000003e-05, "loss": 1.3207, "step": 3000 }, { "epoch": 0.006, "eval_loss": 1.1344993114471436, "eval_runtime": 76.5771, "eval_samples_per_second": 201.666, "eval_steps_per_second": 3.16, "step": 3000 }, { "epoch": 0.0062, "grad_norm": 1.6559661626815796, "learning_rate": 3.099e-05, "loss": 1.3103, "step": 3100 }, { "epoch": 0.0064, "grad_norm": 1.390712857246399, "learning_rate": 3.1990000000000004e-05, "loss": 1.3855, "step": 3200 }, { "epoch": 0.0066, "grad_norm": 1.9980418682098389, "learning_rate": 3.299e-05, "loss": 1.3109, "step": 3300 }, { "epoch": 0.0068, "grad_norm": 1.2899682521820068, "learning_rate": 3.399e-05, "loss": 1.3219, "step": 3400 }, { "epoch": 0.007, "grad_norm": 1.44901704788208, "learning_rate": 3.499e-05, "loss": 1.3089, "step": 3500 }, { "epoch": 0.0072, "grad_norm": 1.3377976417541504, "learning_rate": 3.599e-05, "loss": 1.2995, "step": 3600 }, { "epoch": 0.0074, "grad_norm": 1.5043129920959473, "learning_rate": 3.699e-05, "loss": 1.3421, "step": 3700 }, { "epoch": 0.0076, "grad_norm": 1.4387165307998657, "learning_rate": 3.799e-05, "loss": 1.3337, "step": 3800 }, { "epoch": 0.0078, "grad_norm": 1.1607294082641602, "learning_rate": 3.8990000000000004e-05, "loss": 1.2852, "step": 3900 }, { "epoch": 0.008, "grad_norm": 1.0189259052276611, "learning_rate": 3.999e-05, "loss": 1.3277, "step": 4000 }, { "epoch": 0.008, "eval_loss": 1.1298929452896118, "eval_runtime": 76.4952, "eval_samples_per_second": 201.882, "eval_steps_per_second": 3.164, "step": 4000 }, { "epoch": 0.0082, "grad_norm": 1.6229581832885742, "learning_rate": 4.099e-05, "loss": 1.2878, "step": 4100 }, { "epoch": 0.0084, "grad_norm": 1.693702220916748, "learning_rate": 4.199e-05, "loss": 1.313, "step": 4200 }, { "epoch": 0.0086, "grad_norm": 1.169730544090271, "learning_rate": 4.299e-05, "loss": 1.2915, "step": 4300 }, { "epoch": 0.0088, "grad_norm": 1.3561712503433228, "learning_rate": 4.3990000000000004e-05, "loss": 1.3337, "step": 4400 }, { "epoch": 0.009, "grad_norm": 1.4713114500045776, "learning_rate": 4.499e-05, "loss": 1.309, "step": 4500 }, { "epoch": 0.0092, "grad_norm": 1.0679044723510742, "learning_rate": 4.599e-05, "loss": 1.3464, "step": 4600 }, { "epoch": 0.0094, "grad_norm": 1.4595869779586792, "learning_rate": 4.699e-05, "loss": 1.3385, "step": 4700 }, { "epoch": 0.0096, "grad_norm": 1.6443949937820435, "learning_rate": 4.799e-05, "loss": 1.3287, "step": 4800 }, { "epoch": 0.0098, "grad_norm": 1.3524634838104248, "learning_rate": 4.8990000000000004e-05, "loss": 1.3224, "step": 4900 }, { "epoch": 0.01, "grad_norm": 1.552986979484558, "learning_rate": 4.999e-05, "loss": 1.3256, "step": 5000 }, { "epoch": 0.01, "eval_loss": 1.1314986944198608, "eval_runtime": 76.3433, "eval_samples_per_second": 202.284, "eval_steps_per_second": 3.17, "step": 5000 }, { "epoch": 0.0102, "grad_norm": 1.1126846075057983, "learning_rate": 4.9999995065197964e-05, "loss": 1.3184, "step": 5100 }, { "epoch": 0.0104, "grad_norm": 0.8533400893211365, "learning_rate": 4.999998006090441e-05, "loss": 1.3145, "step": 5200 }, { "epoch": 0.0106, "grad_norm": 1.6032077074050903, "learning_rate": 4.9999954986621866e-05, "loss": 1.2894, "step": 5300 }, { "epoch": 0.0108, "grad_norm": 1.2594430446624756, "learning_rate": 4.999991984236044e-05, "loss": 1.2515, "step": 5400 }, { "epoch": 0.011, "grad_norm": 1.2169750928878784, "learning_rate": 4.99998746281343e-05, "loss": 1.2603, "step": 5500 }, { "epoch": 0.0112, "grad_norm": 1.2038013935089111, "learning_rate": 4.999981934396165e-05, "loss": 1.3063, "step": 5600 }, { "epoch": 0.0114, "grad_norm": 1.1477010250091553, "learning_rate": 4.999975398986476e-05, "loss": 1.3057, "step": 5700 }, { "epoch": 0.0116, "grad_norm": 0.6725754141807556, "learning_rate": 4.9999678565869944e-05, "loss": 1.3211, "step": 5800 }, { "epoch": 0.0118, "grad_norm": 1.5470402240753174, "learning_rate": 4.99995930720076e-05, "loss": 1.2794, "step": 5900 }, { "epoch": 0.012, "grad_norm": 1.8079277276992798, "learning_rate": 4.999949750831215e-05, "loss": 1.2736, "step": 6000 }, { "epoch": 0.012, "eval_loss": 1.1335862874984741, "eval_runtime": 76.3508, "eval_samples_per_second": 202.264, "eval_steps_per_second": 3.17, "step": 6000 }, { "epoch": 0.0122, "grad_norm": 1.4117431640625, "learning_rate": 4.99993918748221e-05, "loss": 1.3142, "step": 6100 }, { "epoch": 0.0124, "grad_norm": 1.2657192945480347, "learning_rate": 4.999927617157998e-05, "loss": 1.3216, "step": 6200 }, { "epoch": 0.0126, "grad_norm": 1.0358809232711792, "learning_rate": 4.9999150398632425e-05, "loss": 1.329, "step": 6300 }, { "epoch": 0.0128, "grad_norm": 1.6824450492858887, "learning_rate": 4.999901455603007e-05, "loss": 1.2911, "step": 6400 }, { "epoch": 0.013, "grad_norm": 1.5632168054580688, "learning_rate": 4.9998868643827635e-05, "loss": 1.3004, "step": 6500 }, { "epoch": 0.0132, "grad_norm": 1.254310131072998, "learning_rate": 4.99987126620839e-05, "loss": 1.2981, "step": 6600 }, { "epoch": 0.0134, "grad_norm": 1.4540060758590698, "learning_rate": 4.999854661086171e-05, "loss": 1.3184, "step": 6700 }, { "epoch": 0.0136, "grad_norm": 1.3684179782867432, "learning_rate": 4.999837049022792e-05, "loss": 1.2914, "step": 6800 }, { "epoch": 0.0138, "grad_norm": 1.474075436592102, "learning_rate": 4.999818430025349e-05, "loss": 1.2702, "step": 6900 }, { "epoch": 0.014, "grad_norm": 1.3687875270843506, "learning_rate": 4.999798804101341e-05, "loss": 1.2388, "step": 7000 }, { "epoch": 0.014, "eval_loss": 1.1258224248886108, "eval_runtime": 76.3516, "eval_samples_per_second": 202.262, "eval_steps_per_second": 3.17, "step": 7000 }, { "epoch": 0.0142, "grad_norm": 0.6668384075164795, "learning_rate": 4.999778171258675e-05, "loss": 1.2768, "step": 7100 }, { "epoch": 0.0144, "grad_norm": 1.1303478479385376, "learning_rate": 4.9997565315056596e-05, "loss": 1.2639, "step": 7200 }, { "epoch": 0.0146, "grad_norm": 1.516221046447754, "learning_rate": 4.999733884851012e-05, "loss": 1.2805, "step": 7300 }, { "epoch": 0.0148, "grad_norm": 1.3124428987503052, "learning_rate": 4.9997102313038544e-05, "loss": 1.2811, "step": 7400 }, { "epoch": 0.015, "grad_norm": 1.390687346458435, "learning_rate": 4.999685570873715e-05, "loss": 1.2481, "step": 7500 }, { "epoch": 0.0152, "grad_norm": 0.8783305883407593, "learning_rate": 4.999659903570526e-05, "loss": 1.2986, "step": 7600 }, { "epoch": 0.0154, "grad_norm": 1.0741727352142334, "learning_rate": 4.999633229404628e-05, "loss": 1.2784, "step": 7700 }, { "epoch": 0.0156, "grad_norm": 1.022088885307312, "learning_rate": 4.999605548386763e-05, "loss": 1.2869, "step": 7800 }, { "epoch": 0.0158, "grad_norm": 1.0997594594955444, "learning_rate": 4.9995768605280826e-05, "loss": 1.2736, "step": 7900 }, { "epoch": 0.016, "grad_norm": 1.191188931465149, "learning_rate": 4.9995471658401414e-05, "loss": 1.256, "step": 8000 }, { "epoch": 0.016, "eval_loss": 1.1234357357025146, "eval_runtime": 76.115, "eval_samples_per_second": 202.89, "eval_steps_per_second": 3.179, "step": 8000 }, { "epoch": 0.0162, "grad_norm": 0.7304887175559998, "learning_rate": 4.9995164643349015e-05, "loss": 1.2717, "step": 8100 }, { "epoch": 0.0164, "grad_norm": 1.2335166931152344, "learning_rate": 4.9994847560247276e-05, "loss": 1.2657, "step": 8200 }, { "epoch": 0.0166, "grad_norm": 1.424973487854004, "learning_rate": 4.999452040922393e-05, "loss": 1.3235, "step": 8300 }, { "epoch": 0.0168, "grad_norm": 1.1544169187545776, "learning_rate": 4.999418319041076e-05, "loss": 1.2455, "step": 8400 }, { "epoch": 0.017, "grad_norm": 1.1393338441848755, "learning_rate": 4.9993835903943585e-05, "loss": 1.233, "step": 8500 }, { "epoch": 0.0172, "grad_norm": 1.1183439493179321, "learning_rate": 4.99934785499623e-05, "loss": 1.2282, "step": 8600 }, { "epoch": 0.0174, "grad_norm": 1.275148868560791, "learning_rate": 4.999311112861084e-05, "loss": 1.2665, "step": 8700 }, { "epoch": 0.0176, "grad_norm": 1.4136372804641724, "learning_rate": 4.99927336400372e-05, "loss": 1.2617, "step": 8800 }, { "epoch": 0.0178, "grad_norm": 1.392327904701233, "learning_rate": 4.999234608439345e-05, "loss": 1.292, "step": 8900 }, { "epoch": 0.018, "grad_norm": 1.367475152015686, "learning_rate": 4.9991948461835685e-05, "loss": 1.2153, "step": 9000 }, { "epoch": 0.018, "eval_loss": 1.1127148866653442, "eval_runtime": 76.2524, "eval_samples_per_second": 202.525, "eval_steps_per_second": 3.174, "step": 9000 }, { "epoch": 0.0182, "grad_norm": 0.8793131709098816, "learning_rate": 4.999154077252407e-05, "loss": 1.2734, "step": 9100 }, { "epoch": 0.0184, "grad_norm": 0.6496739387512207, "learning_rate": 4.999112301662281e-05, "loss": 1.2498, "step": 9200 }, { "epoch": 0.0186, "grad_norm": 1.1462939977645874, "learning_rate": 4.99906951943002e-05, "loss": 1.2549, "step": 9300 }, { "epoch": 0.0188, "grad_norm": 1.520691156387329, "learning_rate": 4.999025730572854e-05, "loss": 1.2437, "step": 9400 }, { "epoch": 0.019, "grad_norm": 1.3555136919021606, "learning_rate": 4.998980935108424e-05, "loss": 1.2326, "step": 9500 }, { "epoch": 0.0192, "grad_norm": 1.467217206954956, "learning_rate": 4.9989351330547715e-05, "loss": 1.2768, "step": 9600 }, { "epoch": 0.0194, "grad_norm": 1.3842765092849731, "learning_rate": 4.998888324430346e-05, "loss": 1.2675, "step": 9700 }, { "epoch": 0.0196, "grad_norm": 1.344078540802002, "learning_rate": 4.998840509254003e-05, "loss": 1.2619, "step": 9800 }, { "epoch": 0.0198, "grad_norm": 0.7567517757415771, "learning_rate": 4.998791687545001e-05, "loss": 1.2794, "step": 9900 }, { "epoch": 0.02, "grad_norm": 0.9987697601318359, "learning_rate": 4.998741859323006e-05, "loss": 1.2778, "step": 10000 }, { "epoch": 0.02, "eval_loss": 1.1275579929351807, "eval_runtime": 76.2888, "eval_samples_per_second": 202.428, "eval_steps_per_second": 3.172, "step": 10000 }, { "epoch": 0.0202, "grad_norm": 1.5212323665618896, "learning_rate": 4.9986910246080894e-05, "loss": 1.2884, "step": 10100 }, { "epoch": 0.0204, "grad_norm": 1.5730245113372803, "learning_rate": 4.998639183420727e-05, "loss": 1.282, "step": 10200 }, { "epoch": 0.0206, "grad_norm": 0.8342368602752686, "learning_rate": 4.9985863357818e-05, "loss": 1.2408, "step": 10300 }, { "epoch": 0.0208, "grad_norm": 1.3672316074371338, "learning_rate": 4.998532481712596e-05, "loss": 1.2205, "step": 10400 }, { "epoch": 0.021, "grad_norm": 1.1164605617523193, "learning_rate": 4.998477621234806e-05, "loss": 1.2817, "step": 10500 }, { "epoch": 0.0212, "grad_norm": 1.2867449522018433, "learning_rate": 4.99842175437053e-05, "loss": 1.2598, "step": 10600 }, { "epoch": 0.0214, "grad_norm": 1.6646244525909424, "learning_rate": 4.99836488114227e-05, "loss": 1.2163, "step": 10700 }, { "epoch": 0.0216, "grad_norm": 1.3233399391174316, "learning_rate": 4.998307001572935e-05, "loss": 1.2744, "step": 10800 }, { "epoch": 0.0218, "grad_norm": 1.1658077239990234, "learning_rate": 4.9982481156858385e-05, "loss": 1.274, "step": 10900 }, { "epoch": 0.022, "grad_norm": 1.4505467414855957, "learning_rate": 4.9981882235046995e-05, "loss": 1.2645, "step": 11000 }, { "epoch": 0.022, "eval_loss": 1.1138958930969238, "eval_runtime": 76.7643, "eval_samples_per_second": 201.174, "eval_steps_per_second": 3.153, "step": 11000 }, { "epoch": 0.0222, "grad_norm": 0.8515588641166687, "learning_rate": 4.998127325053642e-05, "loss": 1.2359, "step": 11100 }, { "epoch": 0.0224, "grad_norm": 1.4022259712219238, "learning_rate": 4.9980654203571983e-05, "loss": 1.2515, "step": 11200 }, { "epoch": 0.0226, "grad_norm": 1.5902676582336426, "learning_rate": 4.998002509440301e-05, "loss": 1.2305, "step": 11300 }, { "epoch": 0.0228, "grad_norm": 0.763087809085846, "learning_rate": 4.997938592328292e-05, "loss": 1.2312, "step": 11400 }, { "epoch": 0.023, "grad_norm": 1.4949332475662231, "learning_rate": 4.997873669046916e-05, "loss": 1.2768, "step": 11500 }, { "epoch": 0.0232, "grad_norm": 1.0390666723251343, "learning_rate": 4.9978077396223255e-05, "loss": 1.2355, "step": 11600 }, { "epoch": 0.0234, "grad_norm": 0.6799549460411072, "learning_rate": 4.997740804081076e-05, "loss": 1.264, "step": 11700 }, { "epoch": 0.0236, "grad_norm": 1.4702496528625488, "learning_rate": 4.99767286245013e-05, "loss": 1.3092, "step": 11800 }, { "epoch": 0.0238, "grad_norm": 1.3574661016464233, "learning_rate": 4.997603914756853e-05, "loss": 1.2654, "step": 11900 }, { "epoch": 0.024, "grad_norm": 1.1170625686645508, "learning_rate": 4.9975339610290175e-05, "loss": 1.2343, "step": 12000 }, { "epoch": 0.024, "eval_loss": 1.1109821796417236, "eval_runtime": 76.4587, "eval_samples_per_second": 201.978, "eval_steps_per_second": 3.165, "step": 12000 }, { "epoch": 0.0242, "grad_norm": 1.2707583904266357, "learning_rate": 4.997463001294802e-05, "loss": 1.2525, "step": 12100 }, { "epoch": 0.0244, "grad_norm": 1.2613739967346191, "learning_rate": 4.997391035582788e-05, "loss": 1.2698, "step": 12200 }, { "epoch": 0.0246, "grad_norm": 1.1995183229446411, "learning_rate": 4.997318063921963e-05, "loss": 1.237, "step": 12300 }, { "epoch": 0.0248, "grad_norm": 0.729535698890686, "learning_rate": 4.997244086341721e-05, "loss": 1.2248, "step": 12400 }, { "epoch": 0.025, "grad_norm": 1.3250787258148193, "learning_rate": 4.9971691028718594e-05, "loss": 1.2617, "step": 12500 }, { "epoch": 0.0252, "grad_norm": 1.421278476715088, "learning_rate": 4.997093113542582e-05, "loss": 1.2321, "step": 12600 }, { "epoch": 0.0254, "grad_norm": 1.5168310403823853, "learning_rate": 4.997016118384497e-05, "loss": 1.2268, "step": 12700 }, { "epoch": 0.0256, "grad_norm": 1.045483946800232, "learning_rate": 4.996938117428618e-05, "loss": 1.2714, "step": 12800 }, { "epoch": 0.0258, "grad_norm": 0.8379656076431274, "learning_rate": 4.9968591107063647e-05, "loss": 1.2792, "step": 12900 }, { "epoch": 0.026, "grad_norm": 1.620133638381958, "learning_rate": 4.996779098249559e-05, "loss": 1.2456, "step": 13000 }, { "epoch": 0.026, "eval_loss": 1.1081608533859253, "eval_runtime": 76.4734, "eval_samples_per_second": 201.939, "eval_steps_per_second": 3.164, "step": 13000 }, { "epoch": 0.0262, "grad_norm": 1.2181329727172852, "learning_rate": 4.9966980800904315e-05, "loss": 1.2187, "step": 13100 }, { "epoch": 0.0264, "grad_norm": 1.4935636520385742, "learning_rate": 4.996616056261616e-05, "loss": 1.2405, "step": 13200 }, { "epoch": 0.0266, "grad_norm": 1.3096436262130737, "learning_rate": 4.996533026796152e-05, "loss": 1.2599, "step": 13300 }, { "epoch": 0.0268, "grad_norm": 1.5392045974731445, "learning_rate": 4.996448991727483e-05, "loss": 1.2491, "step": 13400 }, { "epoch": 0.027, "grad_norm": 1.3175737857818604, "learning_rate": 4.996363951089459e-05, "loss": 1.2383, "step": 13500 }, { "epoch": 0.0272, "grad_norm": 1.3839282989501953, "learning_rate": 4.9962779049163335e-05, "loss": 1.2739, "step": 13600 }, { "epoch": 0.0274, "grad_norm": 0.8403354287147522, "learning_rate": 4.996190853242767e-05, "loss": 1.2378, "step": 13700 }, { "epoch": 0.0276, "grad_norm": 1.2463191747665405, "learning_rate": 4.996102796103823e-05, "loss": 1.2248, "step": 13800 }, { "epoch": 0.0278, "grad_norm": 1.466070294380188, "learning_rate": 4.996013733534971e-05, "loss": 1.2567, "step": 13900 }, { "epoch": 0.028, "grad_norm": 0.8661775588989258, "learning_rate": 4.995923665572085e-05, "loss": 1.2372, "step": 14000 }, { "epoch": 0.028, "eval_loss": 1.113655686378479, "eval_runtime": 76.3727, "eval_samples_per_second": 202.206, "eval_steps_per_second": 3.169, "step": 14000 }, { "epoch": 0.0282, "grad_norm": 0.9262897968292236, "learning_rate": 4.9958325922514466e-05, "loss": 1.2082, "step": 14100 }, { "epoch": 0.0284, "grad_norm": 1.406928539276123, "learning_rate": 4.995740513609738e-05, "loss": 1.2576, "step": 14200 }, { "epoch": 0.0286, "grad_norm": 0.9858616590499878, "learning_rate": 4.9956474296840485e-05, "loss": 1.2173, "step": 14300 }, { "epoch": 0.0288, "grad_norm": 0.6425116062164307, "learning_rate": 4.9955533405118725e-05, "loss": 1.237, "step": 14400 }, { "epoch": 0.029, "grad_norm": 0.7704317569732666, "learning_rate": 4.9954582461311106e-05, "loss": 1.286, "step": 14500 }, { "epoch": 0.0292, "grad_norm": 1.2745368480682373, "learning_rate": 4.995362146580065e-05, "loss": 1.2553, "step": 14600 }, { "epoch": 0.0294, "grad_norm": 1.1889222860336304, "learning_rate": 4.995265041897444e-05, "loss": 1.2783, "step": 14700 }, { "epoch": 0.0296, "grad_norm": 1.4223252534866333, "learning_rate": 4.9951669321223645e-05, "loss": 1.27, "step": 14800 }, { "epoch": 0.0298, "grad_norm": 1.0991147756576538, "learning_rate": 4.995067817294342e-05, "loss": 1.2373, "step": 14900 }, { "epoch": 0.03, "grad_norm": 1.2834559679031372, "learning_rate": 4.994967697453301e-05, "loss": 1.2725, "step": 15000 }, { "epoch": 0.03, "eval_loss": 1.1147979497909546, "eval_runtime": 77.4863, "eval_samples_per_second": 199.3, "eval_steps_per_second": 3.123, "step": 15000 }, { "epoch": 0.0302, "grad_norm": 1.3690969944000244, "learning_rate": 4.9948665726395705e-05, "loss": 1.2631, "step": 15100 }, { "epoch": 0.0304, "grad_norm": 1.0501981973648071, "learning_rate": 4.994764442893882e-05, "loss": 1.2614, "step": 15200 }, { "epoch": 0.0306, "grad_norm": 1.2085719108581543, "learning_rate": 4.994661308257375e-05, "loss": 1.1982, "step": 15300 }, { "epoch": 0.0308, "grad_norm": 1.1436259746551514, "learning_rate": 4.994557168771591e-05, "loss": 1.2079, "step": 15400 }, { "epoch": 0.031, "grad_norm": 0.8355712890625, "learning_rate": 4.994452024478478e-05, "loss": 1.2537, "step": 15500 }, { "epoch": 0.0312, "grad_norm": 0.9547547698020935, "learning_rate": 4.9943458754203875e-05, "loss": 1.2399, "step": 15600 }, { "epoch": 0.0314, "grad_norm": 1.090165138244629, "learning_rate": 4.994238721640077e-05, "loss": 1.2324, "step": 15700 }, { "epoch": 0.0316, "grad_norm": 0.9351906180381775, "learning_rate": 4.9941305631807076e-05, "loss": 1.2431, "step": 15800 }, { "epoch": 0.0318, "grad_norm": 1.3740676641464233, "learning_rate": 4.9940214000858456e-05, "loss": 1.2487, "step": 15900 }, { "epoch": 0.032, "grad_norm": 0.656019926071167, "learning_rate": 4.993911232399462e-05, "loss": 1.2371, "step": 16000 }, { "epoch": 0.032, "eval_loss": 1.1028244495391846, "eval_runtime": 76.4629, "eval_samples_per_second": 201.967, "eval_steps_per_second": 3.165, "step": 16000 }, { "epoch": 0.0322, "grad_norm": 1.20018470287323, "learning_rate": 4.9938000601659315e-05, "loss": 1.2547, "step": 16100 }, { "epoch": 0.0324, "grad_norm": 1.2216906547546387, "learning_rate": 4.993687883430036e-05, "loss": 1.2327, "step": 16200 }, { "epoch": 0.0326, "grad_norm": 1.0969616174697876, "learning_rate": 4.99357470223696e-05, "loss": 1.2513, "step": 16300 }, { "epoch": 0.0328, "grad_norm": 1.026194453239441, "learning_rate": 4.99346051663229e-05, "loss": 1.2508, "step": 16400 }, { "epoch": 0.033, "grad_norm": 1.1246017217636108, "learning_rate": 4.993345326662023e-05, "loss": 1.2538, "step": 16500 }, { "epoch": 0.0332, "grad_norm": 1.293093204498291, "learning_rate": 4.993229132372557e-05, "loss": 1.2236, "step": 16600 }, { "epoch": 0.0334, "grad_norm": 1.208122730255127, "learning_rate": 4.993111933810695e-05, "loss": 1.2753, "step": 16700 }, { "epoch": 0.0336, "grad_norm": 1.073480248451233, "learning_rate": 4.992993731023643e-05, "loss": 1.2665, "step": 16800 }, { "epoch": 0.0338, "grad_norm": 1.4211028814315796, "learning_rate": 4.9928745240590146e-05, "loss": 1.2388, "step": 16900 }, { "epoch": 0.034, "grad_norm": 1.1787285804748535, "learning_rate": 4.992754312964827e-05, "loss": 1.2118, "step": 17000 }, { "epoch": 0.034, "eval_loss": 1.104814887046814, "eval_runtime": 76.4454, "eval_samples_per_second": 202.013, "eval_steps_per_second": 3.166, "step": 17000 }, { "epoch": 0.0342, "grad_norm": 0.9049177765846252, "learning_rate": 4.992633097789499e-05, "loss": 1.1995, "step": 17100 }, { "epoch": 0.0344, "grad_norm": 1.2447205781936646, "learning_rate": 4.992510878581858e-05, "loss": 1.2174, "step": 17200 }, { "epoch": 0.0346, "grad_norm": 1.0060733556747437, "learning_rate": 4.9923876553911334e-05, "loss": 1.2098, "step": 17300 }, { "epoch": 0.0348, "grad_norm": 1.3275829553604126, "learning_rate": 4.992263428266958e-05, "loss": 1.2256, "step": 17400 }, { "epoch": 0.035, "grad_norm": 1.3165931701660156, "learning_rate": 4.992138197259373e-05, "loss": 1.2276, "step": 17500 }, { "epoch": 0.0352, "grad_norm": 1.2749327421188354, "learning_rate": 4.9920119624188196e-05, "loss": 1.2758, "step": 17600 }, { "epoch": 0.0354, "grad_norm": 1.0836033821105957, "learning_rate": 4.991884723796146e-05, "loss": 1.2407, "step": 17700 }, { "epoch": 0.0356, "grad_norm": 1.343475103378296, "learning_rate": 4.9917564814426034e-05, "loss": 1.2466, "step": 17800 }, { "epoch": 0.0358, "grad_norm": 1.3868790864944458, "learning_rate": 4.991627235409848e-05, "loss": 1.2402, "step": 17900 }, { "epoch": 0.036, "grad_norm": 1.5200074911117554, "learning_rate": 4.99149698574994e-05, "loss": 1.2183, "step": 18000 }, { "epoch": 0.036, "eval_loss": 1.0960842370986938, "eval_runtime": 76.481, "eval_samples_per_second": 201.92, "eval_steps_per_second": 3.164, "step": 18000 }, { "epoch": 0.0362, "grad_norm": 1.4647791385650635, "learning_rate": 4.991365732515345e-05, "loss": 1.2386, "step": 18100 }, { "epoch": 0.0364, "grad_norm": 0.9076351523399353, "learning_rate": 4.991233475758931e-05, "loss": 1.2011, "step": 18200 }, { "epoch": 0.0366, "grad_norm": 0.9813222289085388, "learning_rate": 4.99110021553397e-05, "loss": 1.214, "step": 18300 }, { "epoch": 0.0368, "grad_norm": 1.5431565046310425, "learning_rate": 4.99096595189414e-05, "loss": 1.2206, "step": 18400 }, { "epoch": 0.037, "grad_norm": 0.9991932511329651, "learning_rate": 4.990830684893523e-05, "loss": 1.2334, "step": 18500 }, { "epoch": 0.0372, "grad_norm": 0.6322658658027649, "learning_rate": 4.9906944145866035e-05, "loss": 1.2354, "step": 18600 }, { "epoch": 0.0374, "grad_norm": 0.9555477499961853, "learning_rate": 4.990557141028272e-05, "loss": 1.2017, "step": 18700 }, { "epoch": 0.0376, "grad_norm": 1.171019196510315, "learning_rate": 4.990418864273822e-05, "loss": 1.286, "step": 18800 }, { "epoch": 0.0378, "grad_norm": 1.2275811433792114, "learning_rate": 4.990279584378951e-05, "loss": 1.2345, "step": 18900 }, { "epoch": 0.038, "grad_norm": 1.6589407920837402, "learning_rate": 4.9901393013997616e-05, "loss": 1.2376, "step": 19000 }, { "epoch": 0.038, "eval_loss": 1.107132077217102, "eval_runtime": 76.3932, "eval_samples_per_second": 202.152, "eval_steps_per_second": 3.168, "step": 19000 }, { "epoch": 0.0382, "grad_norm": 0.7907335758209229, "learning_rate": 4.9899980153927596e-05, "loss": 1.2554, "step": 19100 }, { "epoch": 0.0384, "grad_norm": 1.4444235563278198, "learning_rate": 4.989855726414854e-05, "loss": 1.2618, "step": 19200 }, { "epoch": 0.0386, "grad_norm": 1.1591296195983887, "learning_rate": 4.98971243452336e-05, "loss": 1.2028, "step": 19300 }, { "epoch": 0.0388, "grad_norm": 0.9183579087257385, "learning_rate": 4.989568139775995e-05, "loss": 1.2259, "step": 19400 }, { "epoch": 0.039, "grad_norm": 1.0866785049438477, "learning_rate": 4.9894228422308805e-05, "loss": 1.2307, "step": 19500 }, { "epoch": 0.0392, "grad_norm": 1.5889687538146973, "learning_rate": 4.9892765419465436e-05, "loss": 1.2346, "step": 19600 }, { "epoch": 0.0394, "grad_norm": 1.300850510597229, "learning_rate": 4.989129238981913e-05, "loss": 1.2748, "step": 19700 }, { "epoch": 0.0396, "grad_norm": 1.2363704442977905, "learning_rate": 4.988980933396323e-05, "loss": 1.2536, "step": 19800 }, { "epoch": 0.0398, "grad_norm": 0.8141745328903198, "learning_rate": 4.9888316252495106e-05, "loss": 1.2198, "step": 19900 }, { "epoch": 0.04, "grad_norm": 1.0759721994400024, "learning_rate": 4.988681314601617e-05, "loss": 1.2225, "step": 20000 }, { "epoch": 0.04, "eval_loss": 1.0962127447128296, "eval_runtime": 76.7102, "eval_samples_per_second": 201.316, "eval_steps_per_second": 3.155, "step": 20000 }, { "epoch": 0.0402, "grad_norm": 1.2748645544052124, "learning_rate": 4.988530001513187e-05, "loss": 1.2245, "step": 20100 }, { "epoch": 0.0404, "grad_norm": 1.3192243576049805, "learning_rate": 4.9883776860451704e-05, "loss": 1.2292, "step": 20200 }, { "epoch": 0.0406, "grad_norm": 1.329868197441101, "learning_rate": 4.98822436825892e-05, "loss": 1.2243, "step": 20300 }, { "epoch": 0.0408, "grad_norm": 1.3394356966018677, "learning_rate": 4.988070048216191e-05, "loss": 1.216, "step": 20400 }, { "epoch": 0.041, "grad_norm": 1.3035671710968018, "learning_rate": 4.987914725979144e-05, "loss": 1.2335, "step": 20500 }, { "epoch": 0.0412, "grad_norm": 1.2765480279922485, "learning_rate": 4.987758401610343e-05, "loss": 1.261, "step": 20600 }, { "epoch": 0.0414, "grad_norm": 1.0472270250320435, "learning_rate": 4.9876010751727553e-05, "loss": 1.2173, "step": 20700 }, { "epoch": 0.0416, "grad_norm": 1.163237452507019, "learning_rate": 4.9874427467297525e-05, "loss": 1.2316, "step": 20800 }, { "epoch": 0.0418, "grad_norm": 1.3546457290649414, "learning_rate": 4.987283416345109e-05, "loss": 1.2268, "step": 20900 }, { "epoch": 0.042, "grad_norm": 1.0812748670578003, "learning_rate": 4.9871230840830016e-05, "loss": 1.2267, "step": 21000 }, { "epoch": 0.042, "eval_loss": 1.1046785116195679, "eval_runtime": 76.3631, "eval_samples_per_second": 202.231, "eval_steps_per_second": 3.169, "step": 21000 }, { "epoch": 0.0422, "grad_norm": 0.7458230257034302, "learning_rate": 4.986961750008014e-05, "loss": 1.1918, "step": 21100 }, { "epoch": 0.0424, "grad_norm": 1.2837951183319092, "learning_rate": 4.986799414185131e-05, "loss": 1.2206, "step": 21200 }, { "epoch": 0.0426, "grad_norm": 1.4213489294052124, "learning_rate": 4.986636076679742e-05, "loss": 1.2552, "step": 21300 }, { "epoch": 0.0428, "grad_norm": 1.297608733177185, "learning_rate": 4.986471737557638e-05, "loss": 1.2234, "step": 21400 }, { "epoch": 0.043, "grad_norm": 1.3617885112762451, "learning_rate": 4.986306396885015e-05, "loss": 1.2381, "step": 21500 }, { "epoch": 0.0432, "grad_norm": 1.500025749206543, "learning_rate": 4.986140054728473e-05, "loss": 1.1957, "step": 21600 }, { "epoch": 0.0434, "grad_norm": 0.6222732663154602, "learning_rate": 4.9859727111550147e-05, "loss": 1.2579, "step": 21700 }, { "epoch": 0.0436, "grad_norm": 1.4154349565505981, "learning_rate": 4.985804366232045e-05, "loss": 1.2073, "step": 21800 }, { "epoch": 0.0438, "grad_norm": 1.334390640258789, "learning_rate": 4.9856350200273746e-05, "loss": 1.2317, "step": 21900 }, { "epoch": 0.044, "grad_norm": 0.8164774179458618, "learning_rate": 4.985464672609215e-05, "loss": 1.2248, "step": 22000 }, { "epoch": 0.044, "eval_loss": 1.1025385856628418, "eval_runtime": 76.8498, "eval_samples_per_second": 200.951, "eval_steps_per_second": 3.149, "step": 22000 }, { "epoch": 0.0442, "grad_norm": 1.1641725301742554, "learning_rate": 4.985293324046182e-05, "loss": 1.1928, "step": 22100 }, { "epoch": 0.0444, "grad_norm": 1.2185006141662598, "learning_rate": 4.9851209744072954e-05, "loss": 1.2435, "step": 22200 }, { "epoch": 0.0446, "grad_norm": 1.0973742008209229, "learning_rate": 4.9849476237619784e-05, "loss": 1.2515, "step": 22300 }, { "epoch": 0.0448, "grad_norm": 1.0242998600006104, "learning_rate": 4.984773272180056e-05, "loss": 1.2511, "step": 22400 }, { "epoch": 0.045, "grad_norm": 0.598416805267334, "learning_rate": 4.984597919731755e-05, "loss": 1.215, "step": 22500 }, { "epoch": 0.0452, "grad_norm": 0.9391146302223206, "learning_rate": 4.98442156648771e-05, "loss": 1.2303, "step": 22600 }, { "epoch": 0.0454, "grad_norm": 0.9301611185073853, "learning_rate": 4.9842442125189556e-05, "loss": 1.2621, "step": 22700 }, { "epoch": 0.0456, "grad_norm": 1.3423951864242554, "learning_rate": 4.984065857896928e-05, "loss": 1.2251, "step": 22800 }, { "epoch": 0.0458, "grad_norm": 1.3373651504516602, "learning_rate": 4.983886502693471e-05, "loss": 1.2738, "step": 22900 }, { "epoch": 0.046, "grad_norm": 1.007158637046814, "learning_rate": 4.983706146980828e-05, "loss": 1.1923, "step": 23000 }, { "epoch": 0.046, "eval_loss": 1.1094993352890015, "eval_runtime": 76.6473, "eval_samples_per_second": 201.481, "eval_steps_per_second": 3.157, "step": 23000 }, { "epoch": 0.0462, "grad_norm": 0.7804542779922485, "learning_rate": 4.9835247908316454e-05, "loss": 1.2098, "step": 23100 }, { "epoch": 0.0464, "grad_norm": 1.377008318901062, "learning_rate": 4.983342434318975e-05, "loss": 1.2202, "step": 23200 }, { "epoch": 0.0466, "grad_norm": 1.1037031412124634, "learning_rate": 4.983159077516268e-05, "loss": 1.1977, "step": 23300 }, { "epoch": 0.0468, "grad_norm": 0.7141278386116028, "learning_rate": 4.982974720497382e-05, "loss": 1.2054, "step": 23400 }, { "epoch": 0.047, "grad_norm": 0.570811927318573, "learning_rate": 4.9827893633365754e-05, "loss": 1.2163, "step": 23500 }, { "epoch": 0.0472, "grad_norm": 0.7255613803863525, "learning_rate": 4.98260300610851e-05, "loss": 1.2212, "step": 23600 }, { "epoch": 0.0474, "grad_norm": 0.8988520503044128, "learning_rate": 4.982415648888251e-05, "loss": 1.2332, "step": 23700 }, { "epoch": 0.0476, "grad_norm": 1.2191438674926758, "learning_rate": 4.9822272917512644e-05, "loss": 1.1974, "step": 23800 }, { "epoch": 0.0478, "grad_norm": 1.2043516635894775, "learning_rate": 4.982037934773423e-05, "loss": 1.2229, "step": 23900 }, { "epoch": 0.048, "grad_norm": 1.3503689765930176, "learning_rate": 4.981847578030998e-05, "loss": 1.2307, "step": 24000 }, { "epoch": 0.048, "eval_loss": 1.0969973802566528, "eval_runtime": 76.7433, "eval_samples_per_second": 201.229, "eval_steps_per_second": 3.153, "step": 24000 }, { "epoch": 0.0482, "grad_norm": 1.3795185089111328, "learning_rate": 4.9816562216006645e-05, "loss": 1.1894, "step": 24100 }, { "epoch": 0.0484, "grad_norm": 1.1966140270233154, "learning_rate": 4.9814638655595024e-05, "loss": 1.2011, "step": 24200 }, { "epoch": 0.0486, "grad_norm": 1.179077386856079, "learning_rate": 4.981270509984992e-05, "loss": 1.2596, "step": 24300 }, { "epoch": 0.0488, "grad_norm": 1.24593186378479, "learning_rate": 4.9810761549550166e-05, "loss": 1.2219, "step": 24400 }, { "epoch": 0.049, "grad_norm": 1.2809820175170898, "learning_rate": 4.9808808005478635e-05, "loss": 1.2033, "step": 24500 }, { "epoch": 0.0492, "grad_norm": 0.9016757011413574, "learning_rate": 4.9806844468422196e-05, "loss": 1.2394, "step": 24600 }, { "epoch": 0.0494, "grad_norm": 0.7064381837844849, "learning_rate": 4.9804870939171774e-05, "loss": 1.2154, "step": 24700 }, { "epoch": 0.0496, "grad_norm": 0.626646101474762, "learning_rate": 4.980288741852231e-05, "loss": 1.2021, "step": 24800 }, { "epoch": 0.0498, "grad_norm": 1.049187421798706, "learning_rate": 4.980089390727275e-05, "loss": 1.1839, "step": 24900 }, { "epoch": 0.05, "grad_norm": 1.2987581491470337, "learning_rate": 4.97988904062261e-05, "loss": 1.1969, "step": 25000 }, { "epoch": 0.05, "eval_loss": 1.090114951133728, "eval_runtime": 77.5992, "eval_samples_per_second": 199.01, "eval_steps_per_second": 3.119, "step": 25000 }, { "epoch": 0.0502, "grad_norm": 1.105361819267273, "learning_rate": 4.979687691618936e-05, "loss": 1.1784, "step": 25100 }, { "epoch": 0.0504, "grad_norm": 0.7138956189155579, "learning_rate": 4.9794853437973555e-05, "loss": 1.2016, "step": 25200 }, { "epoch": 0.0506, "grad_norm": 1.250241756439209, "learning_rate": 4.9792819972393756e-05, "loss": 1.2032, "step": 25300 }, { "epoch": 0.0508, "grad_norm": 0.5875529050827026, "learning_rate": 4.9790776520269034e-05, "loss": 1.2034, "step": 25400 }, { "epoch": 0.051, "grad_norm": 1.2880475521087646, "learning_rate": 4.9788723082422495e-05, "loss": 1.2172, "step": 25500 }, { "epoch": 0.0512, "grad_norm": 0.8775302767753601, "learning_rate": 4.978665965968127e-05, "loss": 1.2264, "step": 25600 }, { "epoch": 0.0514, "grad_norm": 0.7336851954460144, "learning_rate": 4.978458625287649e-05, "loss": 1.2248, "step": 25700 }, { "epoch": 0.0516, "grad_norm": 1.431084156036377, "learning_rate": 4.978250286284333e-05, "loss": 1.2353, "step": 25800 }, { "epoch": 0.0518, "grad_norm": 1.6342276334762573, "learning_rate": 4.978040949042099e-05, "loss": 1.1984, "step": 25900 }, { "epoch": 0.052, "grad_norm": 1.5883526802062988, "learning_rate": 4.977830613645266e-05, "loss": 1.2251, "step": 26000 }, { "epoch": 0.052, "eval_loss": 1.0901614427566528, "eval_runtime": 76.7254, "eval_samples_per_second": 201.276, "eval_steps_per_second": 3.154, "step": 26000 }, { "epoch": 0.0522, "grad_norm": 1.1527795791625977, "learning_rate": 4.977619280178558e-05, "loss": 1.2043, "step": 26100 }, { "epoch": 0.0524, "grad_norm": 1.5160431861877441, "learning_rate": 4.9774069487271014e-05, "loss": 1.1931, "step": 26200 }, { "epoch": 0.0526, "grad_norm": 1.2551748752593994, "learning_rate": 4.977193619376421e-05, "loss": 1.2397, "step": 26300 }, { "epoch": 0.0528, "grad_norm": 1.2745076417922974, "learning_rate": 4.976979292212448e-05, "loss": 1.2336, "step": 26400 }, { "epoch": 0.053, "grad_norm": 1.4893673658370972, "learning_rate": 4.976763967321511e-05, "loss": 1.1827, "step": 26500 }, { "epoch": 0.0532, "grad_norm": 0.857379138469696, "learning_rate": 4.976547644790346e-05, "loss": 1.2441, "step": 26600 }, { "epoch": 0.0534, "grad_norm": 1.167006492614746, "learning_rate": 4.976330324706084e-05, "loss": 1.2779, "step": 26700 }, { "epoch": 0.0536, "grad_norm": 0.634842574596405, "learning_rate": 4.976112007156265e-05, "loss": 1.2828, "step": 26800 }, { "epoch": 0.0538, "grad_norm": 0.9239290952682495, "learning_rate": 4.975892692228825e-05, "loss": 1.2094, "step": 26900 }, { "epoch": 0.054, "grad_norm": 1.2031028270721436, "learning_rate": 4.9756723800121044e-05, "loss": 1.222, "step": 27000 }, { "epoch": 0.054, "eval_loss": 1.0867078304290771, "eval_runtime": 76.6606, "eval_samples_per_second": 201.446, "eval_steps_per_second": 3.157, "step": 27000 }, { "epoch": 0.0542, "grad_norm": 1.3575947284698486, "learning_rate": 4.9754510705948456e-05, "loss": 1.1622, "step": 27100 }, { "epoch": 0.0544, "grad_norm": 1.142074465751648, "learning_rate": 4.975228764066191e-05, "loss": 1.2703, "step": 27200 }, { "epoch": 0.0546, "grad_norm": 0.8273721933364868, "learning_rate": 4.975005460515686e-05, "loss": 1.1921, "step": 27300 }, { "epoch": 0.0548, "grad_norm": 1.3859556913375854, "learning_rate": 4.974781160033278e-05, "loss": 1.2195, "step": 27400 }, { "epoch": 0.055, "grad_norm": 1.2232416868209839, "learning_rate": 4.974555862709315e-05, "loss": 1.1851, "step": 27500 }, { "epoch": 0.0552, "grad_norm": 0.7069573998451233, "learning_rate": 4.974329568634546e-05, "loss": 1.2098, "step": 27600 }, { "epoch": 0.0554, "grad_norm": 1.2497153282165527, "learning_rate": 4.974102277900122e-05, "loss": 1.206, "step": 27700 }, { "epoch": 0.0556, "grad_norm": 1.206449031829834, "learning_rate": 4.9738739905975976e-05, "loss": 1.2352, "step": 27800 }, { "epoch": 0.0558, "grad_norm": 1.3927749395370483, "learning_rate": 4.973644706818925e-05, "loss": 1.1952, "step": 27900 }, { "epoch": 0.056, "grad_norm": 1.3856321573257446, "learning_rate": 4.973414426656461e-05, "loss": 1.2499, "step": 28000 }, { "epoch": 0.056, "eval_loss": 1.0941141843795776, "eval_runtime": 76.7063, "eval_samples_per_second": 201.326, "eval_steps_per_second": 3.155, "step": 28000 }, { "epoch": 0.0562, "grad_norm": 0.6676329970359802, "learning_rate": 4.9731831502029606e-05, "loss": 1.2333, "step": 28100 }, { "epoch": 0.0564, "grad_norm": 1.2670732736587524, "learning_rate": 4.972950877551584e-05, "loss": 1.183, "step": 28200 }, { "epoch": 0.0566, "grad_norm": 1.2089595794677734, "learning_rate": 4.972717608795889e-05, "loss": 1.2445, "step": 28300 }, { "epoch": 0.0568, "grad_norm": 1.1897366046905518, "learning_rate": 4.972483344029838e-05, "loss": 1.2217, "step": 28400 }, { "epoch": 0.057, "grad_norm": 1.4963501691818237, "learning_rate": 4.97224808334779e-05, "loss": 1.2079, "step": 28500 }, { "epoch": 0.0572, "grad_norm": 1.594019889831543, "learning_rate": 4.972011826844511e-05, "loss": 1.1822, "step": 28600 }, { "epoch": 0.0574, "grad_norm": 1.3324779272079468, "learning_rate": 4.971774574615163e-05, "loss": 1.2562, "step": 28700 }, { "epoch": 0.0576, "grad_norm": 1.3334344625473022, "learning_rate": 4.971536326755313e-05, "loss": 1.2509, "step": 28800 }, { "epoch": 0.0578, "grad_norm": 0.9475389719009399, "learning_rate": 4.971297083360925e-05, "loss": 1.1826, "step": 28900 }, { "epoch": 0.058, "grad_norm": 0.8067657947540283, "learning_rate": 4.971056844528368e-05, "loss": 1.1895, "step": 29000 }, { "epoch": 0.058, "eval_loss": 1.0870901346206665, "eval_runtime": 76.6141, "eval_samples_per_second": 201.569, "eval_steps_per_second": 3.159, "step": 29000 }, { "epoch": 0.0582, "grad_norm": 0.7364763617515564, "learning_rate": 4.970815610354409e-05, "loss": 1.1821, "step": 29100 }, { "epoch": 0.0584, "grad_norm": 1.494878888130188, "learning_rate": 4.970573380936218e-05, "loss": 1.1592, "step": 29200 }, { "epoch": 0.0586, "grad_norm": 0.7247675061225891, "learning_rate": 4.9703301563713645e-05, "loss": 1.2347, "step": 29300 }, { "epoch": 0.0588, "grad_norm": 1.0013625621795654, "learning_rate": 4.970085936757819e-05, "loss": 1.2536, "step": 29400 }, { "epoch": 0.059, "grad_norm": 1.012537956237793, "learning_rate": 4.969840722193955e-05, "loss": 1.2461, "step": 29500 }, { "epoch": 0.0592, "grad_norm": 0.8702846169471741, "learning_rate": 4.969594512778541e-05, "loss": 1.2005, "step": 29600 }, { "epoch": 0.0594, "grad_norm": 1.1068499088287354, "learning_rate": 4.969347308610755e-05, "loss": 1.1942, "step": 29700 }, { "epoch": 0.0596, "grad_norm": 1.6333682537078857, "learning_rate": 4.969099109790167e-05, "loss": 1.2372, "step": 29800 }, { "epoch": 0.0598, "grad_norm": 1.0337685346603394, "learning_rate": 4.9688499164167536e-05, "loss": 1.2435, "step": 29900 }, { "epoch": 0.06, "grad_norm": 0.8429011702537537, "learning_rate": 4.9685997285908894e-05, "loss": 1.2023, "step": 30000 }, { "epoch": 0.06, "eval_loss": 1.086748480796814, "eval_runtime": 76.8684, "eval_samples_per_second": 200.902, "eval_steps_per_second": 3.148, "step": 30000 }, { "epoch": 0.0002, "grad_norm": 0.8381020426750183, "learning_rate": 4.9683485464133484e-05, "loss": 1.2362, "step": 30100 }, { "epoch": 0.0004, "grad_norm": 0.6860467791557312, "learning_rate": 4.968096369985309e-05, "loss": 1.2125, "step": 30200 }, { "epoch": 0.0006, "grad_norm": 0.9316505193710327, "learning_rate": 4.967843199408347e-05, "loss": 1.1904, "step": 30300 }, { "epoch": 0.0008, "grad_norm": 1.3389461040496826, "learning_rate": 4.967589034784439e-05, "loss": 1.2689, "step": 30400 }, { "epoch": 0.001, "grad_norm": 0.9387079477310181, "learning_rate": 4.967333876215963e-05, "loss": 1.2205, "step": 30500 }, { "epoch": 0.0012, "grad_norm": 0.7549923062324524, "learning_rate": 4.967077723805697e-05, "loss": 1.21, "step": 30600 }, { "epoch": 0.0014, "grad_norm": 1.1242858171463013, "learning_rate": 4.966820577656819e-05, "loss": 1.203, "step": 30700 }, { "epoch": 0.0016, "grad_norm": 1.5065937042236328, "learning_rate": 4.966562437872907e-05, "loss": 1.2233, "step": 30800 }, { "epoch": 0.0018, "grad_norm": 1.1448508501052856, "learning_rate": 4.96630330455794e-05, "loss": 1.2242, "step": 30900 }, { "epoch": 0.002, "grad_norm": 0.7356053590774536, "learning_rate": 4.966043177816296e-05, "loss": 1.2541, "step": 31000 }, { "epoch": 0.002, "eval_loss": 1.0892270803451538, "eval_runtime": 78.1396, "eval_samples_per_second": 197.633, "eval_steps_per_second": 3.097, "step": 31000 }, { "epoch": 0.0022, "grad_norm": 1.290472149848938, "learning_rate": 4.965782057752757e-05, "loss": 1.2005, "step": 31100 }, { "epoch": 0.0024, "grad_norm": 0.7970076203346252, "learning_rate": 4.965519944472498e-05, "loss": 1.2718, "step": 31200 }, { "epoch": 0.0026, "grad_norm": 1.3415039777755737, "learning_rate": 4.9652568380811016e-05, "loss": 1.2673, "step": 31300 }, { "epoch": 0.0028, "grad_norm": 1.3146836757659912, "learning_rate": 4.9649927386845444e-05, "loss": 1.2717, "step": 31400 }, { "epoch": 0.003, "grad_norm": 0.9725894927978516, "learning_rate": 4.964727646389208e-05, "loss": 1.2418, "step": 31500 }, { "epoch": 0.0032, "grad_norm": 0.9590099453926086, "learning_rate": 4.96446156130187e-05, "loss": 1.2389, "step": 31600 }, { "epoch": 0.0034, "grad_norm": 1.5478194952011108, "learning_rate": 4.964194483529709e-05, "loss": 1.2693, "step": 31700 }, { "epoch": 0.0036, "grad_norm": 0.7029865384101868, "learning_rate": 4.9639264131803056e-05, "loss": 1.25, "step": 31800 }, { "epoch": 0.0038, "grad_norm": 0.7784998416900635, "learning_rate": 4.963657350361637e-05, "loss": 1.2339, "step": 31900 }, { "epoch": 0.004, "grad_norm": 0.6479517817497253, "learning_rate": 4.963387295182083e-05, "loss": 1.2538, "step": 32000 }, { "epoch": 0.004, "eval_loss": 1.0948545932769775, "eval_runtime": 77.4713, "eval_samples_per_second": 199.338, "eval_steps_per_second": 3.124, "step": 32000 }, { "epoch": 0.0042, "grad_norm": 1.4759093523025513, "learning_rate": 4.963116247750421e-05, "loss": 1.2646, "step": 32100 }, { "epoch": 0.0044, "grad_norm": 0.7561829686164856, "learning_rate": 4.9628442081758285e-05, "loss": 1.2083, "step": 32200 }, { "epoch": 0.0046, "grad_norm": 0.6289774775505066, "learning_rate": 4.962571176567884e-05, "loss": 1.2492, "step": 32300 }, { "epoch": 0.0048, "grad_norm": 0.8146848678588867, "learning_rate": 4.962297153036564e-05, "loss": 1.2693, "step": 32400 }, { "epoch": 0.005, "grad_norm": 1.1135525703430176, "learning_rate": 4.962022137692245e-05, "loss": 1.2218, "step": 32500 }, { "epoch": 0.0052, "grad_norm": 1.1507619619369507, "learning_rate": 4.961746130645703e-05, "loss": 1.2118, "step": 32600 }, { "epoch": 0.0054, "grad_norm": 0.8586376905441284, "learning_rate": 4.961469132008114e-05, "loss": 1.2115, "step": 32700 }, { "epoch": 0.0056, "grad_norm": 1.5335224866867065, "learning_rate": 4.961191141891054e-05, "loss": 1.2239, "step": 32800 }, { "epoch": 0.0058, "grad_norm": 1.2822892665863037, "learning_rate": 4.960912160406496e-05, "loss": 1.2443, "step": 32900 }, { "epoch": 0.006, "grad_norm": 0.9584761261940002, "learning_rate": 4.960632187666814e-05, "loss": 1.243, "step": 33000 }, { "epoch": 0.006, "eval_loss": 1.0964241027832031, "eval_runtime": 76.2571, "eval_samples_per_second": 202.512, "eval_steps_per_second": 3.173, "step": 33000 }, { "epoch": 0.0062, "grad_norm": 0.7512497305870056, "learning_rate": 4.960351223784781e-05, "loss": 1.1821, "step": 33100 }, { "epoch": 0.0064, "grad_norm": 1.3305505514144897, "learning_rate": 4.960069268873568e-05, "loss": 1.2393, "step": 33200 }, { "epoch": 0.0066, "grad_norm": 1.5360506772994995, "learning_rate": 4.959786323046749e-05, "loss": 1.2475, "step": 33300 }, { "epoch": 0.0068, "grad_norm": 0.7005806565284729, "learning_rate": 4.959502386418293e-05, "loss": 1.2122, "step": 33400 }, { "epoch": 0.007, "grad_norm": 1.381052017211914, "learning_rate": 4.95921745910257e-05, "loss": 1.2336, "step": 33500 }, { "epoch": 0.0072, "grad_norm": 1.074300765991211, "learning_rate": 4.958931541214349e-05, "loss": 1.2661, "step": 33600 }, { "epoch": 0.0074, "grad_norm": 1.1441256999969482, "learning_rate": 4.9586446328687967e-05, "loss": 1.2296, "step": 33700 }, { "epoch": 0.0076, "grad_norm": 0.8737586140632629, "learning_rate": 4.958356734181481e-05, "loss": 1.2067, "step": 33800 }, { "epoch": 0.0078, "grad_norm": 1.1493791341781616, "learning_rate": 4.958067845268366e-05, "loss": 1.2643, "step": 33900 }, { "epoch": 0.008, "grad_norm": 1.3028621673583984, "learning_rate": 4.957777966245817e-05, "loss": 1.2427, "step": 34000 }, { "epoch": 0.008, "eval_loss": 1.096444845199585, "eval_runtime": 76.4253, "eval_samples_per_second": 202.067, "eval_steps_per_second": 3.166, "step": 34000 }, { "epoch": 0.0082, "grad_norm": 1.31423819065094, "learning_rate": 4.957487097230597e-05, "loss": 1.2137, "step": 34100 }, { "epoch": 0.0084, "grad_norm": 1.1846545934677124, "learning_rate": 4.957195238339868e-05, "loss": 1.2141, "step": 34200 }, { "epoch": 0.0086, "grad_norm": 0.9421952366828918, "learning_rate": 4.9569023896911914e-05, "loss": 1.219, "step": 34300 }, { "epoch": 0.0088, "grad_norm": 1.4107282161712646, "learning_rate": 4.9566085514025256e-05, "loss": 1.2141, "step": 34400 }, { "epoch": 0.009, "grad_norm": 0.7364057302474976, "learning_rate": 4.95631372359223e-05, "loss": 1.246, "step": 34500 }, { "epoch": 0.0092, "grad_norm": 0.8100732564926147, "learning_rate": 4.956017906379059e-05, "loss": 1.1891, "step": 34600 }, { "epoch": 0.0094, "grad_norm": 1.2455086708068848, "learning_rate": 4.955721099882169e-05, "loss": 1.2458, "step": 34700 }, { "epoch": 0.0096, "grad_norm": 0.676437497138977, "learning_rate": 4.9554233042211146e-05, "loss": 1.2058, "step": 34800 }, { "epoch": 0.0098, "grad_norm": 1.3339647054672241, "learning_rate": 4.955124519515847e-05, "loss": 1.2407, "step": 34900 }, { "epoch": 0.01, "grad_norm": 0.9411395192146301, "learning_rate": 4.954824745886716e-05, "loss": 1.1974, "step": 35000 }, { "epoch": 0.01, "eval_loss": 1.0945005416870117, "eval_runtime": 76.9422, "eval_samples_per_second": 200.709, "eval_steps_per_second": 3.145, "step": 35000 }, { "epoch": 0.0102, "grad_norm": 0.6638602018356323, "learning_rate": 4.95452398345447e-05, "loss": 1.2259, "step": 35100 }, { "epoch": 0.0104, "grad_norm": 0.6337453722953796, "learning_rate": 4.954222232340259e-05, "loss": 1.1686, "step": 35200 }, { "epoch": 0.0106, "grad_norm": 0.809762179851532, "learning_rate": 4.953919492665625e-05, "loss": 1.2174, "step": 35300 }, { "epoch": 0.0108, "grad_norm": 0.9431924819946289, "learning_rate": 4.953615764552513e-05, "loss": 1.2128, "step": 35400 }, { "epoch": 0.011, "grad_norm": 0.7606577277183533, "learning_rate": 4.953311048123265e-05, "loss": 1.2473, "step": 35500 }, { "epoch": 0.0112, "grad_norm": 1.1843669414520264, "learning_rate": 4.953005343500619e-05, "loss": 1.2194, "step": 35600 }, { "epoch": 0.0114, "grad_norm": 0.9086577296257019, "learning_rate": 4.952698650807715e-05, "loss": 1.2572, "step": 35700 }, { "epoch": 0.0116, "grad_norm": 1.36215078830719, "learning_rate": 4.9523909701680874e-05, "loss": 1.2263, "step": 35800 }, { "epoch": 0.0118, "grad_norm": 0.8537183403968811, "learning_rate": 4.952082301705671e-05, "loss": 1.2297, "step": 35900 }, { "epoch": 0.012, "grad_norm": 0.6182298064231873, "learning_rate": 4.9517726455447955e-05, "loss": 1.2101, "step": 36000 }, { "epoch": 0.012, "eval_loss": 1.0894391536712646, "eval_runtime": 76.3033, "eval_samples_per_second": 202.39, "eval_steps_per_second": 3.172, "step": 36000 }, { "epoch": 0.0122, "grad_norm": 1.1102640628814697, "learning_rate": 4.951462001810192e-05, "loss": 1.2086, "step": 36100 }, { "epoch": 0.0124, "grad_norm": 0.9391844868659973, "learning_rate": 4.951150370626988e-05, "loss": 1.2595, "step": 36200 }, { "epoch": 0.0126, "grad_norm": 1.3386393785476685, "learning_rate": 4.950837752120707e-05, "loss": 1.1953, "step": 36300 }, { "epoch": 0.0128, "grad_norm": 1.0943065881729126, "learning_rate": 4.950524146417273e-05, "loss": 1.2759, "step": 36400 }, { "epoch": 0.013, "grad_norm": 0.9743318557739258, "learning_rate": 4.950209553643006e-05, "loss": 1.2421, "step": 36500 }, { "epoch": 0.0132, "grad_norm": 1.2555447816848755, "learning_rate": 4.949893973924623e-05, "loss": 1.242, "step": 36600 }, { "epoch": 0.0134, "grad_norm": 1.3289902210235596, "learning_rate": 4.949577407389241e-05, "loss": 1.2337, "step": 36700 }, { "epoch": 0.0136, "grad_norm": 0.8806101679801941, "learning_rate": 4.949259854164372e-05, "loss": 1.244, "step": 36800 }, { "epoch": 0.0138, "grad_norm": 1.211584448814392, "learning_rate": 4.948941314377927e-05, "loss": 1.2344, "step": 36900 }, { "epoch": 0.014, "grad_norm": 1.6472032070159912, "learning_rate": 4.9486217881582134e-05, "loss": 1.1866, "step": 37000 }, { "epoch": 0.014, "eval_loss": 1.0940065383911133, "eval_runtime": 76.3383, "eval_samples_per_second": 202.297, "eval_steps_per_second": 3.17, "step": 37000 }, { "epoch": 0.0142, "grad_norm": 1.5010918378829956, "learning_rate": 4.948301275633936e-05, "loss": 1.2057, "step": 37100 }, { "epoch": 0.0144, "grad_norm": 0.6793572306632996, "learning_rate": 4.947979776934197e-05, "loss": 1.2104, "step": 37200 }, { "epoch": 0.0146, "grad_norm": 0.7654362916946411, "learning_rate": 4.947657292188498e-05, "loss": 1.2266, "step": 37300 }, { "epoch": 0.0148, "grad_norm": 1.0618220567703247, "learning_rate": 4.947333821526734e-05, "loss": 1.2509, "step": 37400 }, { "epoch": 0.015, "grad_norm": 1.2712790966033936, "learning_rate": 4.947009365079199e-05, "loss": 1.2179, "step": 37500 }, { "epoch": 0.0152, "grad_norm": 1.3342602252960205, "learning_rate": 4.946683922976584e-05, "loss": 1.2224, "step": 37600 }, { "epoch": 0.0154, "grad_norm": 0.8218332529067993, "learning_rate": 4.946357495349978e-05, "loss": 1.2402, "step": 37700 }, { "epoch": 0.0156, "grad_norm": 2.0291969776153564, "learning_rate": 4.946030082330865e-05, "loss": 1.1599, "step": 37800 }, { "epoch": 0.0158, "grad_norm": 1.5702838897705078, "learning_rate": 4.945701684051128e-05, "loss": 1.1784, "step": 37900 }, { "epoch": 0.016, "grad_norm": 1.268508791923523, "learning_rate": 4.9453723006430444e-05, "loss": 1.2172, "step": 38000 }, { "epoch": 0.016, "eval_loss": 1.088572382926941, "eval_runtime": 76.533, "eval_samples_per_second": 201.782, "eval_steps_per_second": 3.162, "step": 38000 }, { "epoch": 0.0162, "grad_norm": 1.3127037286758423, "learning_rate": 4.945041932239292e-05, "loss": 1.2299, "step": 38100 }, { "epoch": 0.0164, "grad_norm": 0.7277888655662537, "learning_rate": 4.9447105789729396e-05, "loss": 1.2655, "step": 38200 }, { "epoch": 0.0166, "grad_norm": 1.031909704208374, "learning_rate": 4.94437824097746e-05, "loss": 1.2179, "step": 38300 }, { "epoch": 0.0168, "grad_norm": 1.2462060451507568, "learning_rate": 4.9440449183867166e-05, "loss": 1.2311, "step": 38400 }, { "epoch": 0.017, "grad_norm": 0.5426816344261169, "learning_rate": 4.9437106113349716e-05, "loss": 1.1637, "step": 38500 }, { "epoch": 0.0172, "grad_norm": 1.2320595979690552, "learning_rate": 4.9433753199568856e-05, "loss": 1.2282, "step": 38600 }, { "epoch": 0.0174, "grad_norm": 0.928945779800415, "learning_rate": 4.943039044387513e-05, "loss": 1.1936, "step": 38700 }, { "epoch": 0.0176, "grad_norm": 1.4080160856246948, "learning_rate": 4.9427017847623044e-05, "loss": 1.251, "step": 38800 }, { "epoch": 0.0178, "grad_norm": 1.3436859846115112, "learning_rate": 4.9423635412171106e-05, "loss": 1.287, "step": 38900 }, { "epoch": 0.018, "grad_norm": 0.9334709048271179, "learning_rate": 4.9420243138881734e-05, "loss": 1.1766, "step": 39000 }, { "epoch": 0.018, "eval_loss": 1.092005968093872, "eval_runtime": 76.2687, "eval_samples_per_second": 202.482, "eval_steps_per_second": 3.173, "step": 39000 }, { "epoch": 0.0182, "grad_norm": 0.9674895405769348, "learning_rate": 4.9416841029121355e-05, "loss": 1.2388, "step": 39100 }, { "epoch": 0.0184, "grad_norm": 1.3673955202102661, "learning_rate": 4.941342908426032e-05, "loss": 1.183, "step": 39200 }, { "epoch": 0.0186, "grad_norm": 0.8423133492469788, "learning_rate": 4.941000730567297e-05, "loss": 1.1847, "step": 39300 }, { "epoch": 0.0188, "grad_norm": 0.9814749360084534, "learning_rate": 4.94065756947376e-05, "loss": 1.2022, "step": 39400 }, { "epoch": 0.019, "grad_norm": 1.125647783279419, "learning_rate": 4.9403134252836456e-05, "loss": 1.1966, "step": 39500 }, { "epoch": 0.0192, "grad_norm": 0.8501796722412109, "learning_rate": 4.9399682981355755e-05, "loss": 1.2347, "step": 39600 }, { "epoch": 0.0194, "grad_norm": 0.8226144909858704, "learning_rate": 4.9396221881685665e-05, "loss": 1.2129, "step": 39700 }, { "epoch": 0.0196, "grad_norm": 0.9265516400337219, "learning_rate": 4.939275095522032e-05, "loss": 1.1917, "step": 39800 }, { "epoch": 0.0198, "grad_norm": 0.8538194298744202, "learning_rate": 4.938927020335781e-05, "loss": 1.2548, "step": 39900 }, { "epoch": 0.02, "grad_norm": 1.2129065990447998, "learning_rate": 4.9385779627500174e-05, "loss": 1.2219, "step": 40000 }, { "epoch": 0.02, "eval_loss": 1.087021827697754, "eval_runtime": 76.3535, "eval_samples_per_second": 202.257, "eval_steps_per_second": 3.169, "step": 40000 }, { "epoch": 0.0202, "grad_norm": 1.2157970666885376, "learning_rate": 4.938227922905342e-05, "loss": 1.1623, "step": 40100 }, { "epoch": 0.0204, "grad_norm": 0.6873258948326111, "learning_rate": 4.9378769009427515e-05, "loss": 1.2088, "step": 40200 }, { "epoch": 0.0206, "grad_norm": 1.139224886894226, "learning_rate": 4.937524897003637e-05, "loss": 1.2158, "step": 40300 }, { "epoch": 0.0208, "grad_norm": 1.2190488576889038, "learning_rate": 4.9371719112297845e-05, "loss": 1.19, "step": 40400 }, { "epoch": 0.021, "grad_norm": 1.2439500093460083, "learning_rate": 4.936817943763378e-05, "loss": 1.173, "step": 40500 }, { "epoch": 0.0212, "grad_norm": 1.030110478401184, "learning_rate": 4.936462994746995e-05, "loss": 1.1995, "step": 40600 }, { "epoch": 0.0214, "grad_norm": 0.666333794593811, "learning_rate": 4.93610706432361e-05, "loss": 1.2476, "step": 40700 }, { "epoch": 0.0216, "grad_norm": 0.8477672934532166, "learning_rate": 4.93575015263659e-05, "loss": 1.2225, "step": 40800 }, { "epoch": 0.0218, "grad_norm": 1.087173342704773, "learning_rate": 4.9353922598296995e-05, "loss": 1.1758, "step": 40900 }, { "epoch": 0.022, "grad_norm": 1.2760623693466187, "learning_rate": 4.935033386047099e-05, "loss": 1.2811, "step": 41000 }, { "epoch": 0.022, "eval_loss": 1.082631230354309, "eval_runtime": 75.9811, "eval_samples_per_second": 203.248, "eval_steps_per_second": 3.185, "step": 41000 }, { "epoch": 0.0222, "grad_norm": 1.0236754417419434, "learning_rate": 4.934673531433341e-05, "loss": 1.2283, "step": 41100 }, { "epoch": 0.0224, "grad_norm": 1.509448766708374, "learning_rate": 4.934312696133376e-05, "loss": 1.1989, "step": 41200 }, { "epoch": 0.0226, "grad_norm": 1.2022035121917725, "learning_rate": 4.9339508802925475e-05, "loss": 1.2247, "step": 41300 }, { "epoch": 0.0228, "grad_norm": 1.4019054174423218, "learning_rate": 4.933588084056596e-05, "loss": 1.2201, "step": 41400 }, { "epoch": 0.023, "grad_norm": 1.06856107711792, "learning_rate": 4.933224307571655e-05, "loss": 1.1789, "step": 41500 }, { "epoch": 0.0232, "grad_norm": 1.0807596445083618, "learning_rate": 4.932859550984255e-05, "loss": 1.2361, "step": 41600 }, { "epoch": 0.0234, "grad_norm": 1.20824134349823, "learning_rate": 4.932493814441318e-05, "loss": 1.2167, "step": 41700 }, { "epoch": 0.0236, "grad_norm": 0.7066964507102966, "learning_rate": 4.9321270980901635e-05, "loss": 1.1941, "step": 41800 }, { "epoch": 0.0238, "grad_norm": 0.7342857122421265, "learning_rate": 4.9317594020785044e-05, "loss": 1.1709, "step": 41900 }, { "epoch": 0.024, "grad_norm": 1.239176630973816, "learning_rate": 4.931390726554449e-05, "loss": 1.2238, "step": 42000 }, { "epoch": 0.024, "eval_loss": 1.0859261751174927, "eval_runtime": 76.6051, "eval_samples_per_second": 201.592, "eval_steps_per_second": 3.159, "step": 42000 }, { "epoch": 0.0242, "grad_norm": 0.9031541347503662, "learning_rate": 4.9310210716665003e-05, "loss": 1.1621, "step": 42100 }, { "epoch": 0.0244, "grad_norm": 0.744767963886261, "learning_rate": 4.930650437563554e-05, "loss": 1.21, "step": 42200 }, { "epoch": 0.0246, "grad_norm": 1.2594637870788574, "learning_rate": 4.9302788243949025e-05, "loss": 1.21, "step": 42300 }, { "epoch": 0.0248, "grad_norm": 0.67472243309021, "learning_rate": 4.929906232310231e-05, "loss": 1.1785, "step": 42400 }, { "epoch": 0.025, "grad_norm": 1.3947267532348633, "learning_rate": 4.92953266145962e-05, "loss": 1.1598, "step": 42500 }, { "epoch": 0.0252, "grad_norm": 0.7739892601966858, "learning_rate": 4.929158111993543e-05, "loss": 1.1492, "step": 42600 }, { "epoch": 0.0254, "grad_norm": 0.8620167970657349, "learning_rate": 4.9287825840628695e-05, "loss": 1.1863, "step": 42700 }, { "epoch": 0.0256, "grad_norm": 0.7649038434028625, "learning_rate": 4.928406077818861e-05, "loss": 1.1782, "step": 42800 }, { "epoch": 0.0258, "grad_norm": 1.2743923664093018, "learning_rate": 4.9280285934131755e-05, "loss": 1.2254, "step": 42900 }, { "epoch": 0.026, "grad_norm": 0.6955134272575378, "learning_rate": 4.927650130997862e-05, "loss": 1.2254, "step": 43000 }, { "epoch": 0.026, "eval_loss": 1.0833112001419067, "eval_runtime": 77.475, "eval_samples_per_second": 199.329, "eval_steps_per_second": 3.124, "step": 43000 }, { "epoch": 0.0262, "grad_norm": 0.8997926115989685, "learning_rate": 4.927270690725367e-05, "loss": 1.1989, "step": 43100 }, { "epoch": 0.0264, "grad_norm": 1.3762701749801636, "learning_rate": 4.9268902727485276e-05, "loss": 1.1928, "step": 43200 }, { "epoch": 0.0266, "grad_norm": 0.7553657293319702, "learning_rate": 4.926508877220577e-05, "loss": 1.2266, "step": 43300 }, { "epoch": 0.0268, "grad_norm": 0.6331331133842468, "learning_rate": 4.92612650429514e-05, "loss": 1.2034, "step": 43400 }, { "epoch": 0.027, "grad_norm": 0.6229783892631531, "learning_rate": 4.925743154126238e-05, "loss": 1.2123, "step": 43500 }, { "epoch": 0.0272, "grad_norm": 1.2101593017578125, "learning_rate": 4.9253588268682835e-05, "loss": 1.2473, "step": 43600 }, { "epoch": 0.0274, "grad_norm": 1.2178127765655518, "learning_rate": 4.924973522676083e-05, "loss": 1.2391, "step": 43700 }, { "epoch": 0.0276, "grad_norm": 1.4870595932006836, "learning_rate": 4.924587241704838e-05, "loss": 1.2358, "step": 43800 }, { "epoch": 0.0278, "grad_norm": 1.2042150497436523, "learning_rate": 4.924199984110142e-05, "loss": 1.1996, "step": 43900 }, { "epoch": 0.028, "grad_norm": 1.3220444917678833, "learning_rate": 4.923811750047982e-05, "loss": 1.2052, "step": 44000 }, { "epoch": 0.028, "eval_loss": 1.0859400033950806, "eval_runtime": 76.6882, "eval_samples_per_second": 201.374, "eval_steps_per_second": 3.156, "step": 44000 }, { "epoch": 0.0282, "grad_norm": 1.464141607284546, "learning_rate": 4.923422539674739e-05, "loss": 1.2326, "step": 44100 }, { "epoch": 0.0284, "grad_norm": 1.2406100034713745, "learning_rate": 4.923032353147187e-05, "loss": 1.2092, "step": 44200 }, { "epoch": 0.0286, "grad_norm": 0.9459540247917175, "learning_rate": 4.9226411906224935e-05, "loss": 1.2023, "step": 44300 }, { "epoch": 0.0288, "grad_norm": 1.2143398523330688, "learning_rate": 4.922249052258217e-05, "loss": 1.2348, "step": 44400 }, { "epoch": 0.029, "grad_norm": 1.1002607345581055, "learning_rate": 4.921855938212312e-05, "loss": 1.1912, "step": 44500 }, { "epoch": 0.0292, "grad_norm": 1.169640302658081, "learning_rate": 4.921461848643126e-05, "loss": 1.1797, "step": 44600 }, { "epoch": 0.0294, "grad_norm": 1.2756543159484863, "learning_rate": 4.921066783709396e-05, "loss": 1.1691, "step": 44700 }, { "epoch": 0.0296, "grad_norm": 0.5525041222572327, "learning_rate": 4.920670743570255e-05, "loss": 1.2011, "step": 44800 }, { "epoch": 0.0298, "grad_norm": 0.7082927823066711, "learning_rate": 4.9202737283852284e-05, "loss": 1.1831, "step": 44900 }, { "epoch": 0.03, "grad_norm": 0.7773894667625427, "learning_rate": 4.919875738314233e-05, "loss": 1.1947, "step": 45000 }, { "epoch": 0.03, "eval_loss": 1.0890144109725952, "eval_runtime": 76.6594, "eval_samples_per_second": 201.45, "eval_steps_per_second": 3.157, "step": 45000 }, { "epoch": 0.0302, "grad_norm": 0.7057791352272034, "learning_rate": 4.91947677351758e-05, "loss": 1.2717, "step": 45100 }, { "epoch": 0.0304, "grad_norm": 0.9837706685066223, "learning_rate": 4.919076834155971e-05, "loss": 1.206, "step": 45200 }, { "epoch": 0.0306, "grad_norm": 0.5716899633407593, "learning_rate": 4.918675920390504e-05, "loss": 1.2071, "step": 45300 }, { "epoch": 0.0308, "grad_norm": 0.6972540020942688, "learning_rate": 4.918274032382665e-05, "loss": 1.1761, "step": 45400 }, { "epoch": 0.031, "grad_norm": 1.4802424907684326, "learning_rate": 4.917871170294334e-05, "loss": 1.2109, "step": 45500 }, { "epoch": 0.0312, "grad_norm": 0.7575565576553345, "learning_rate": 4.9174673342877854e-05, "loss": 1.2169, "step": 45600 }, { "epoch": 0.0314, "grad_norm": 2.227360963821411, "learning_rate": 4.917062524525684e-05, "loss": 1.1657, "step": 45700 }, { "epoch": 0.0316, "grad_norm": 0.8020743727684021, "learning_rate": 4.916656741171086e-05, "loss": 1.2073, "step": 45800 }, { "epoch": 0.0318, "grad_norm": 1.1863917112350464, "learning_rate": 4.916249984387443e-05, "loss": 1.211, "step": 45900 }, { "epoch": 0.032, "grad_norm": 0.5976528525352478, "learning_rate": 4.915842254338594e-05, "loss": 1.2468, "step": 46000 }, { "epoch": 0.032, "eval_loss": 1.0842978954315186, "eval_runtime": 76.5369, "eval_samples_per_second": 201.772, "eval_steps_per_second": 3.162, "step": 46000 }, { "epoch": 0.0322, "grad_norm": 1.4908519983291626, "learning_rate": 4.915433551188774e-05, "loss": 1.1695, "step": 46100 }, { "epoch": 0.0324, "grad_norm": 1.1190279722213745, "learning_rate": 4.915023875102609e-05, "loss": 1.2017, "step": 46200 }, { "epoch": 0.0326, "grad_norm": 1.1334049701690674, "learning_rate": 4.914613226245115e-05, "loss": 1.2083, "step": 46300 }, { "epoch": 0.0328, "grad_norm": 0.6902172565460205, "learning_rate": 4.914201604781703e-05, "loss": 1.233, "step": 46400 }, { "epoch": 0.033, "grad_norm": 0.7509928941726685, "learning_rate": 4.913789010878174e-05, "loss": 1.2437, "step": 46500 }, { "epoch": 0.0332, "grad_norm": 1.4217336177825928, "learning_rate": 4.9133754447007185e-05, "loss": 1.1909, "step": 46600 }, { "epoch": 0.0334, "grad_norm": 1.212930679321289, "learning_rate": 4.912960906415923e-05, "loss": 1.1828, "step": 46700 }, { "epoch": 0.0336, "grad_norm": 1.1408753395080566, "learning_rate": 4.912545396190763e-05, "loss": 1.2118, "step": 46800 }, { "epoch": 0.0338, "grad_norm": 0.649695634841919, "learning_rate": 4.9121289141926066e-05, "loss": 1.1877, "step": 46900 }, { "epoch": 0.034, "grad_norm": 1.4613287448883057, "learning_rate": 4.911711460589211e-05, "loss": 1.1977, "step": 47000 }, { "epoch": 0.034, "eval_loss": 1.0870256423950195, "eval_runtime": 76.7051, "eval_samples_per_second": 201.33, "eval_steps_per_second": 3.155, "step": 47000 }, { "epoch": 0.0342, "grad_norm": 1.1586204767227173, "learning_rate": 4.9112930355487284e-05, "loss": 1.2222, "step": 47100 }, { "epoch": 0.0344, "grad_norm": 1.220306158065796, "learning_rate": 4.910873639239699e-05, "loss": 1.1909, "step": 47200 }, { "epoch": 0.0346, "grad_norm": 0.589338481426239, "learning_rate": 4.910453271831056e-05, "loss": 1.2034, "step": 47300 }, { "epoch": 0.0348, "grad_norm": 1.4743396043777466, "learning_rate": 4.910031933492123e-05, "loss": 1.2019, "step": 47400 }, { "epoch": 0.035, "grad_norm": 0.6481319069862366, "learning_rate": 4.909609624392616e-05, "loss": 1.2107, "step": 47500 }, { "epoch": 0.0352, "grad_norm": 1.1668992042541504, "learning_rate": 4.9091863447026404e-05, "loss": 1.2498, "step": 47600 }, { "epoch": 0.0354, "grad_norm": 1.115519404411316, "learning_rate": 4.908762094592693e-05, "loss": 1.206, "step": 47700 }, { "epoch": 0.0356, "grad_norm": 1.3867928981781006, "learning_rate": 4.908336874233662e-05, "loss": 1.2082, "step": 47800 }, { "epoch": 0.0358, "grad_norm": 0.6380243301391602, "learning_rate": 4.9079106837968264e-05, "loss": 1.1693, "step": 47900 }, { "epoch": 0.036, "grad_norm": 1.8375539779663086, "learning_rate": 4.907483523453855e-05, "loss": 1.1531, "step": 48000 }, { "epoch": 0.036, "eval_loss": 1.0780328512191772, "eval_runtime": 76.5805, "eval_samples_per_second": 201.657, "eval_steps_per_second": 3.16, "step": 48000 }, { "epoch": 0.0362, "grad_norm": 1.231332778930664, "learning_rate": 4.907055393376808e-05, "loss": 1.1618, "step": 48100 }, { "epoch": 0.0364, "grad_norm": 1.2306678295135498, "learning_rate": 4.906626293738137e-05, "loss": 1.2365, "step": 48200 }, { "epoch": 0.0366, "grad_norm": 1.057521104812622, "learning_rate": 4.906196224710683e-05, "loss": 1.1775, "step": 48300 }, { "epoch": 0.0368, "grad_norm": 0.9679245352745056, "learning_rate": 4.905765186467677e-05, "loss": 1.2175, "step": 48400 }, { "epoch": 0.037, "grad_norm": 1.325900912284851, "learning_rate": 4.9053331791827404e-05, "loss": 1.1848, "step": 48500 }, { "epoch": 0.0372, "grad_norm": 1.3124104738235474, "learning_rate": 4.9049002030298887e-05, "loss": 1.1779, "step": 48600 }, { "epoch": 0.0374, "grad_norm": 1.7284040451049805, "learning_rate": 4.904466258183522e-05, "loss": 1.2144, "step": 48700 }, { "epoch": 0.0376, "grad_norm": 0.9314505457878113, "learning_rate": 4.904031344818434e-05, "loss": 1.219, "step": 48800 }, { "epoch": 0.0378, "grad_norm": 1.1688934564590454, "learning_rate": 4.903595463109808e-05, "loss": 1.2268, "step": 48900 }, { "epoch": 0.038, "grad_norm": 1.0910236835479736, "learning_rate": 4.903158613233216e-05, "loss": 1.2213, "step": 49000 }, { "epoch": 0.038, "eval_loss": 1.0866200923919678, "eval_runtime": 76.736, "eval_samples_per_second": 201.248, "eval_steps_per_second": 3.154, "step": 49000 }, { "epoch": 0.0382, "grad_norm": 1.0715341567993164, "learning_rate": 4.902720795364623e-05, "loss": 1.2007, "step": 49100 }, { "epoch": 0.0384, "grad_norm": 0.6578232645988464, "learning_rate": 4.902282009680381e-05, "loss": 1.2078, "step": 49200 }, { "epoch": 0.0386, "grad_norm": 1.34630286693573, "learning_rate": 4.9018422563572326e-05, "loss": 1.1894, "step": 49300 }, { "epoch": 0.0388, "grad_norm": 1.1832722425460815, "learning_rate": 4.9014015355723104e-05, "loss": 1.1846, "step": 49400 }, { "epoch": 0.039, "grad_norm": 0.9175591468811035, "learning_rate": 4.900959847503137e-05, "loss": 1.1984, "step": 49500 }, { "epoch": 0.0392, "grad_norm": 1.077879548072815, "learning_rate": 4.9005171923276236e-05, "loss": 1.1868, "step": 49600 }, { "epoch": 0.0394, "grad_norm": 0.5999984741210938, "learning_rate": 4.900073570224073e-05, "loss": 1.1816, "step": 49700 }, { "epoch": 0.0396, "grad_norm": 1.24228835105896, "learning_rate": 4.899628981371175e-05, "loss": 1.191, "step": 49800 }, { "epoch": 0.0398, "grad_norm": 0.7666544318199158, "learning_rate": 4.899183425948011e-05, "loss": 1.1813, "step": 49900 }, { "epoch": 0.04, "grad_norm": 1.2996748685836792, "learning_rate": 4.8987369041340486e-05, "loss": 1.184, "step": 50000 }, { "epoch": 0.04, "eval_loss": 1.0817583799362183, "eval_runtime": 77.0972, "eval_samples_per_second": 200.306, "eval_steps_per_second": 3.139, "step": 50000 }, { "epoch": 0.0402, "grad_norm": 1.1717365980148315, "learning_rate": 4.898289416109149e-05, "loss": 1.1936, "step": 50100 }, { "epoch": 0.0404, "grad_norm": 1.3680170774459839, "learning_rate": 4.8978409620535595e-05, "loss": 1.2138, "step": 50200 }, { "epoch": 0.0406, "grad_norm": 1.6390254497528076, "learning_rate": 4.897391542147916e-05, "loss": 1.1883, "step": 50300 }, { "epoch": 0.0408, "grad_norm": 1.2523001432418823, "learning_rate": 4.896941156573247e-05, "loss": 1.2157, "step": 50400 }, { "epoch": 0.041, "grad_norm": 1.4317930936813354, "learning_rate": 4.896489805510966e-05, "loss": 1.1721, "step": 50500 }, { "epoch": 0.0412, "grad_norm": 0.9794881939888, "learning_rate": 4.896037489142879e-05, "loss": 1.2073, "step": 50600 }, { "epoch": 0.0414, "grad_norm": 0.8774665594100952, "learning_rate": 4.895584207651178e-05, "loss": 1.1934, "step": 50700 }, { "epoch": 0.0416, "grad_norm": 1.421742558479309, "learning_rate": 4.895129961218444e-05, "loss": 1.2078, "step": 50800 }, { "epoch": 0.0418, "grad_norm": 1.0715827941894531, "learning_rate": 4.894674750027648e-05, "loss": 1.1713, "step": 50900 }, { "epoch": 0.042, "grad_norm": 0.7623746991157532, "learning_rate": 4.894218574262149e-05, "loss": 1.1779, "step": 51000 }, { "epoch": 0.042, "eval_loss": 1.0817545652389526, "eval_runtime": 76.5318, "eval_samples_per_second": 201.785, "eval_steps_per_second": 3.162, "step": 51000 }, { "epoch": 0.0422, "grad_norm": 0.710477888584137, "learning_rate": 4.893761434105695e-05, "loss": 1.1876, "step": 51100 }, { "epoch": 0.0424, "grad_norm": 1.244310736656189, "learning_rate": 4.893303329742421e-05, "loss": 1.2077, "step": 51200 }, { "epoch": 0.0426, "grad_norm": 1.6161651611328125, "learning_rate": 4.8928442613568535e-05, "loss": 1.1896, "step": 51300 }, { "epoch": 0.0428, "grad_norm": 1.0831233263015747, "learning_rate": 4.892384229133902e-05, "loss": 1.1904, "step": 51400 }, { "epoch": 0.043, "grad_norm": 0.8258353471755981, "learning_rate": 4.89192323325887e-05, "loss": 1.1906, "step": 51500 }, { "epoch": 0.0432, "grad_norm": 0.7877621054649353, "learning_rate": 4.8914612739174456e-05, "loss": 1.1416, "step": 51600 }, { "epoch": 0.0434, "grad_norm": 1.2102254629135132, "learning_rate": 4.890998351295706e-05, "loss": 1.1782, "step": 51700 }, { "epoch": 0.0436, "grad_norm": 1.139289140701294, "learning_rate": 4.890534465580115e-05, "loss": 1.1471, "step": 51800 }, { "epoch": 0.0438, "grad_norm": 1.2521135807037354, "learning_rate": 4.890069616957529e-05, "loss": 1.206, "step": 51900 }, { "epoch": 0.044, "grad_norm": 1.3690674304962158, "learning_rate": 4.889603805615187e-05, "loss": 1.2328, "step": 52000 }, { "epoch": 0.044, "eval_loss": 1.0797057151794434, "eval_runtime": 76.4385, "eval_samples_per_second": 202.032, "eval_steps_per_second": 3.166, "step": 52000 }, { "epoch": 0.0442, "grad_norm": 1.2689367532730103, "learning_rate": 4.889137031740717e-05, "loss": 1.2189, "step": 52100 }, { "epoch": 0.0444, "grad_norm": 1.0029367208480835, "learning_rate": 4.888669295522137e-05, "loss": 1.1754, "step": 52200 }, { "epoch": 0.0446, "grad_norm": 0.6958720684051514, "learning_rate": 4.8882005971478504e-05, "loss": 1.1601, "step": 52300 }, { "epoch": 0.0448, "grad_norm": 1.2337570190429688, "learning_rate": 4.887730936806648e-05, "loss": 1.2244, "step": 52400 }, { "epoch": 0.045, "grad_norm": 1.2311972379684448, "learning_rate": 4.8872603146877104e-05, "loss": 1.2031, "step": 52500 }, { "epoch": 0.0452, "grad_norm": 1.145331859588623, "learning_rate": 4.886788730980604e-05, "loss": 1.1947, "step": 52600 }, { "epoch": 0.0454, "grad_norm": 1.1688799858093262, "learning_rate": 4.886316185875282e-05, "loss": 1.1655, "step": 52700 }, { "epoch": 0.0456, "grad_norm": 1.2751972675323486, "learning_rate": 4.885842679562085e-05, "loss": 1.2038, "step": 52800 }, { "epoch": 0.0458, "grad_norm": 0.6860191822052002, "learning_rate": 4.8853682122317426e-05, "loss": 1.1922, "step": 52900 }, { "epoch": 0.046, "grad_norm": 1.4772953987121582, "learning_rate": 4.8848927840753695e-05, "loss": 1.1856, "step": 53000 }, { "epoch": 0.046, "eval_loss": 1.0836056470870972, "eval_runtime": 76.3679, "eval_samples_per_second": 202.218, "eval_steps_per_second": 3.169, "step": 53000 }, { "epoch": 0.0462, "grad_norm": 1.2491508722305298, "learning_rate": 4.884416395284468e-05, "loss": 1.1924, "step": 53100 }, { "epoch": 0.0464, "grad_norm": 1.1689327955245972, "learning_rate": 4.883939046050928e-05, "loss": 1.1675, "step": 53200 }, { "epoch": 0.0466, "grad_norm": 1.0528528690338135, "learning_rate": 4.883460736567025e-05, "loss": 1.1879, "step": 53300 }, { "epoch": 0.0468, "grad_norm": 1.141653060913086, "learning_rate": 4.8829814670254226e-05, "loss": 1.1637, "step": 53400 }, { "epoch": 0.047, "grad_norm": 0.8094840049743652, "learning_rate": 4.88250123761917e-05, "loss": 1.1924, "step": 53500 }, { "epoch": 0.0472, "grad_norm": 1.4988161325454712, "learning_rate": 4.8820200485417036e-05, "loss": 1.1962, "step": 53600 }, { "epoch": 0.0474, "grad_norm": 0.8497682809829712, "learning_rate": 4.881537899986847e-05, "loss": 1.1987, "step": 53700 }, { "epoch": 0.0476, "grad_norm": 1.0132189989089966, "learning_rate": 4.8810547921488083e-05, "loss": 1.1666, "step": 53800 }, { "epoch": 0.0478, "grad_norm": 1.275478720664978, "learning_rate": 4.8805707252221846e-05, "loss": 1.2072, "step": 53900 }, { "epoch": 0.048, "grad_norm": 1.1257511377334595, "learning_rate": 4.880085699401958e-05, "loss": 1.2128, "step": 54000 }, { "epoch": 0.048, "eval_loss": 1.081576943397522, "eval_runtime": 76.6431, "eval_samples_per_second": 201.492, "eval_steps_per_second": 3.157, "step": 54000 }, { "epoch": 0.0482, "grad_norm": 1.132750153541565, "learning_rate": 4.879599714883496e-05, "loss": 1.2239, "step": 54100 }, { "epoch": 0.0484, "grad_norm": 1.3854628801345825, "learning_rate": 4.8791127718625526e-05, "loss": 1.1447, "step": 54200 }, { "epoch": 0.0486, "grad_norm": 1.32233464717865, "learning_rate": 4.87862487053527e-05, "loss": 1.1765, "step": 54300 }, { "epoch": 0.0488, "grad_norm": 1.1571578979492188, "learning_rate": 4.8781360110981744e-05, "loss": 1.1844, "step": 54400 }, { "epoch": 0.049, "grad_norm": 1.552740216255188, "learning_rate": 4.877646193748177e-05, "loss": 1.1336, "step": 54500 }, { "epoch": 0.0492, "grad_norm": 1.3447420597076416, "learning_rate": 4.8771554186825774e-05, "loss": 1.2401, "step": 54600 }, { "epoch": 0.0494, "grad_norm": 1.0012767314910889, "learning_rate": 4.87666368609906e-05, "loss": 1.2236, "step": 54700 }, { "epoch": 0.0496, "grad_norm": 1.1246662139892578, "learning_rate": 4.876170996195693e-05, "loss": 1.2452, "step": 54800 }, { "epoch": 0.0498, "grad_norm": 0.7534450888633728, "learning_rate": 4.875677349170934e-05, "loss": 1.2333, "step": 54900 }, { "epoch": 0.05, "grad_norm": 1.2943884134292603, "learning_rate": 4.875182745223622e-05, "loss": 1.1986, "step": 55000 }, { "epoch": 0.05, "eval_loss": 1.0774849653244019, "eval_runtime": 76.6003, "eval_samples_per_second": 201.605, "eval_steps_per_second": 3.159, "step": 55000 }, { "epoch": 0.0502, "grad_norm": 1.0771546363830566, "learning_rate": 4.874687184552984e-05, "loss": 1.2022, "step": 55100 }, { "epoch": 0.0504, "grad_norm": 1.1722393035888672, "learning_rate": 4.8741906673586334e-05, "loss": 1.1856, "step": 55200 }, { "epoch": 0.0506, "grad_norm": 0.7547242045402527, "learning_rate": 4.873693193840565e-05, "loss": 1.153, "step": 55300 }, { "epoch": 0.0508, "grad_norm": 0.9694270491600037, "learning_rate": 4.873194764199162e-05, "loss": 1.2389, "step": 55400 }, { "epoch": 0.051, "grad_norm": 0.6288232803344727, "learning_rate": 4.872695378635192e-05, "loss": 1.195, "step": 55500 }, { "epoch": 0.0512, "grad_norm": 1.1400961875915527, "learning_rate": 4.872195037349807e-05, "loss": 1.1903, "step": 55600 }, { "epoch": 0.0514, "grad_norm": 1.0738123655319214, "learning_rate": 4.871693740544545e-05, "loss": 1.1764, "step": 55700 }, { "epoch": 0.0516, "grad_norm": 1.2298240661621094, "learning_rate": 4.871191488421327e-05, "loss": 1.1701, "step": 55800 }, { "epoch": 0.0518, "grad_norm": 1.3240865468978882, "learning_rate": 4.8706882811824624e-05, "loss": 1.1828, "step": 55900 }, { "epoch": 0.052, "grad_norm": 1.4167003631591797, "learning_rate": 4.870184119030641e-05, "loss": 1.204, "step": 56000 }, { "epoch": 0.052, "eval_loss": 1.0775164365768433, "eval_runtime": 76.8889, "eval_samples_per_second": 200.848, "eval_steps_per_second": 3.147, "step": 56000 }, { "epoch": 0.0522, "grad_norm": 0.6648851037025452, "learning_rate": 4.86967900216894e-05, "loss": 1.174, "step": 56100 }, { "epoch": 0.0524, "grad_norm": 1.29317307472229, "learning_rate": 4.8691729308008196e-05, "loss": 1.1695, "step": 56200 }, { "epoch": 0.0526, "grad_norm": 1.3121986389160156, "learning_rate": 4.868665905130127e-05, "loss": 1.1941, "step": 56300 }, { "epoch": 0.0528, "grad_norm": 0.6604340672492981, "learning_rate": 4.868157925361091e-05, "loss": 1.1875, "step": 56400 }, { "epoch": 0.053, "grad_norm": 1.0366885662078857, "learning_rate": 4.867648991698325e-05, "loss": 1.2265, "step": 56500 }, { "epoch": 0.0532, "grad_norm": 1.382543683052063, "learning_rate": 4.867139104346829e-05, "loss": 1.2122, "step": 56600 }, { "epoch": 0.0534, "grad_norm": 1.0773979425430298, "learning_rate": 4.866628263511985e-05, "loss": 1.2375, "step": 56700 }, { "epoch": 0.0536, "grad_norm": 1.178758978843689, "learning_rate": 4.8661164693995584e-05, "loss": 1.1959, "step": 56800 }, { "epoch": 0.0538, "grad_norm": 0.7048764228820801, "learning_rate": 4.865603722215702e-05, "loss": 1.1841, "step": 56900 }, { "epoch": 0.054, "grad_norm": 1.3390711545944214, "learning_rate": 4.865090022166949e-05, "loss": 1.2033, "step": 57000 }, { "epoch": 0.054, "eval_loss": 1.0746017694473267, "eval_runtime": 77.1768, "eval_samples_per_second": 200.099, "eval_steps_per_second": 3.136, "step": 57000 }, { "epoch": 0.0542, "grad_norm": 1.0639598369598389, "learning_rate": 4.864575369460218e-05, "loss": 1.1948, "step": 57100 }, { "epoch": 0.0544, "grad_norm": 1.1349152326583862, "learning_rate": 4.86405976430281e-05, "loss": 1.1666, "step": 57200 }, { "epoch": 0.0546, "grad_norm": 1.0187245607376099, "learning_rate": 4.8635432069024125e-05, "loss": 1.1614, "step": 57300 }, { "epoch": 0.0548, "grad_norm": 0.6468742489814758, "learning_rate": 4.863025697467094e-05, "loss": 1.2043, "step": 57400 }, { "epoch": 0.055, "grad_norm": 1.1298869848251343, "learning_rate": 4.862507236205307e-05, "loss": 1.1884, "step": 57500 }, { "epoch": 0.0552, "grad_norm": 0.7240111827850342, "learning_rate": 4.861987823325887e-05, "loss": 1.186, "step": 57600 }, { "epoch": 0.0554, "grad_norm": 0.8047366142272949, "learning_rate": 4.861467459038056e-05, "loss": 1.2029, "step": 57700 }, { "epoch": 0.0556, "grad_norm": 0.8840340375900269, "learning_rate": 4.860946143551413e-05, "loss": 1.19, "step": 57800 }, { "epoch": 0.0558, "grad_norm": 1.1575409173965454, "learning_rate": 4.860423877075947e-05, "loss": 1.1637, "step": 57900 }, { "epoch": 0.056, "grad_norm": 0.6591224074363708, "learning_rate": 4.859900659822025e-05, "loss": 1.2203, "step": 58000 }, { "epoch": 0.056, "eval_loss": 1.0788133144378662, "eval_runtime": 76.7654, "eval_samples_per_second": 201.171, "eval_steps_per_second": 3.152, "step": 58000 }, { "epoch": 0.0562, "grad_norm": 1.3405015468597412, "learning_rate": 4.859376492000399e-05, "loss": 1.19, "step": 58100 }, { "epoch": 0.0564, "grad_norm": 1.1912270784378052, "learning_rate": 4.858851373822205e-05, "loss": 1.1521, "step": 58200 }, { "epoch": 0.0566, "grad_norm": 1.0169751644134521, "learning_rate": 4.85832530549896e-05, "loss": 1.2054, "step": 58300 }, { "epoch": 0.0568, "grad_norm": 0.6713104248046875, "learning_rate": 4.857798287242563e-05, "loss": 1.2033, "step": 58400 }, { "epoch": 0.057, "grad_norm": 1.2116252183914185, "learning_rate": 4.857270319265298e-05, "loss": 1.1919, "step": 58500 }, { "epoch": 0.0572, "grad_norm": 0.9526674151420593, "learning_rate": 4.856741401779831e-05, "loss": 1.1724, "step": 58600 }, { "epoch": 0.0574, "grad_norm": 1.458253264427185, "learning_rate": 4.856211534999209e-05, "loss": 1.1479, "step": 58700 }, { "epoch": 0.0576, "grad_norm": 1.173437237739563, "learning_rate": 4.855680719136862e-05, "loss": 1.2005, "step": 58800 }, { "epoch": 0.0578, "grad_norm": 0.7292013168334961, "learning_rate": 4.8551489544066034e-05, "loss": 1.1292, "step": 58900 }, { "epoch": 0.058, "grad_norm": 0.6017533540725708, "learning_rate": 4.854616241022627e-05, "loss": 1.1527, "step": 59000 }, { "epoch": 0.058, "eval_loss": 1.0688042640686035, "eval_runtime": 76.596, "eval_samples_per_second": 201.616, "eval_steps_per_second": 3.159, "step": 59000 }, { "epoch": 0.0582, "grad_norm": 0.8270254731178284, "learning_rate": 4.8540825791995114e-05, "loss": 1.1517, "step": 59100 }, { "epoch": 0.0584, "grad_norm": 1.1182663440704346, "learning_rate": 4.8535479691522136e-05, "loss": 1.2282, "step": 59200 }, { "epoch": 0.0586, "grad_norm": 1.1249291896820068, "learning_rate": 4.853012411096075e-05, "loss": 1.1314, "step": 59300 }, { "epoch": 0.0588, "grad_norm": 0.6025962233543396, "learning_rate": 4.85247590524682e-05, "loss": 1.1879, "step": 59400 }, { "epoch": 0.059, "grad_norm": 1.2914466857910156, "learning_rate": 4.85193845182055e-05, "loss": 1.1926, "step": 59500 }, { "epoch": 0.0592, "grad_norm": 0.7965525388717651, "learning_rate": 4.8514000510337544e-05, "loss": 1.2344, "step": 59600 }, { "epoch": 0.0594, "grad_norm": 0.6595709323883057, "learning_rate": 4.850860703103298e-05, "loss": 1.2056, "step": 59700 }, { "epoch": 0.0596, "grad_norm": 0.783892035484314, "learning_rate": 4.850320408246433e-05, "loss": 1.1343, "step": 59800 }, { "epoch": 0.0598, "grad_norm": 0.948952853679657, "learning_rate": 4.849779166680788e-05, "loss": 1.1607, "step": 59900 }, { "epoch": 0.06, "grad_norm": 0.725027322769165, "learning_rate": 4.849236978624375e-05, "loss": 1.2125, "step": 60000 }, { "epoch": 0.06, "eval_loss": 1.0838971138000488, "eval_runtime": 76.8451, "eval_samples_per_second": 200.963, "eval_steps_per_second": 3.149, "step": 60000 }, { "epoch": 0.0002, "grad_norm": 1.1788556575775146, "learning_rate": 4.848693844295589e-05, "loss": 1.1917, "step": 60100 }, { "epoch": 0.0004, "grad_norm": 1.3381775617599487, "learning_rate": 4.848149763913202e-05, "loss": 1.2108, "step": 60200 }, { "epoch": 0.0006, "grad_norm": 0.9748820066452026, "learning_rate": 4.847604737696372e-05, "loss": 1.2054, "step": 60300 }, { "epoch": 0.0008, "grad_norm": 1.3528317213058472, "learning_rate": 4.847058765864634e-05, "loss": 1.1582, "step": 60400 }, { "epoch": 0.001, "grad_norm": 1.0475611686706543, "learning_rate": 4.8465118486379065e-05, "loss": 1.1409, "step": 60500 }, { "epoch": 0.0012, "grad_norm": 0.667515754699707, "learning_rate": 4.8459639862364866e-05, "loss": 1.1548, "step": 60600 }, { "epoch": 0.0014, "grad_norm": 1.3529212474822998, "learning_rate": 4.845415178881053e-05, "loss": 1.1741, "step": 60700 }, { "epoch": 0.0016, "grad_norm": 1.2415704727172852, "learning_rate": 4.844865426792666e-05, "loss": 1.1689, "step": 60800 }, { "epoch": 0.0018, "grad_norm": 0.9598329663276672, "learning_rate": 4.844314730192766e-05, "loss": 1.2138, "step": 60900 }, { "epoch": 0.002, "grad_norm": 0.660463273525238, "learning_rate": 4.843763089303173e-05, "loss": 1.1897, "step": 61000 }, { "epoch": 0.002, "eval_loss": 1.0804229974746704, "eval_runtime": 77.9042, "eval_samples_per_second": 198.231, "eval_steps_per_second": 3.106, "step": 61000 }, { "epoch": 0.0022, "grad_norm": 1.3137476444244385, "learning_rate": 4.843210504346088e-05, "loss": 1.2149, "step": 61100 }, { "epoch": 0.0024, "grad_norm": 2.466374158859253, "learning_rate": 4.842656975544092e-05, "loss": 1.2294, "step": 61200 }, { "epoch": 0.0026, "grad_norm": 0.9236732721328735, "learning_rate": 4.842102503120146e-05, "loss": 1.2316, "step": 61300 }, { "epoch": 0.0028, "grad_norm": 0.9453101754188538, "learning_rate": 4.841547087297592e-05, "loss": 1.1903, "step": 61400 }, { "epoch": 0.003, "grad_norm": 1.0694693326950073, "learning_rate": 4.840990728300151e-05, "loss": 1.2027, "step": 61500 }, { "epoch": 0.0032, "grad_norm": 1.0661156177520752, "learning_rate": 4.8404334263519254e-05, "loss": 1.2268, "step": 61600 }, { "epoch": 0.0034, "grad_norm": 1.3803828954696655, "learning_rate": 4.839875181677394e-05, "loss": 1.2084, "step": 61700 }, { "epoch": 0.0036, "grad_norm": 0.896979033946991, "learning_rate": 4.839315994501421e-05, "loss": 1.1818, "step": 61800 }, { "epoch": 0.0038, "grad_norm": 1.1509560346603394, "learning_rate": 4.8387558650492446e-05, "loss": 1.226, "step": 61900 }, { "epoch": 0.004, "grad_norm": 1.2490339279174805, "learning_rate": 4.8381947935464854e-05, "loss": 1.2283, "step": 62000 }, { "epoch": 0.004, "eval_loss": 1.086965560913086, "eval_runtime": 75.4991, "eval_samples_per_second": 204.545, "eval_steps_per_second": 3.205, "step": 62000 }, { "epoch": 0.0042, "grad_norm": 1.0047966241836548, "learning_rate": 4.837632780219142e-05, "loss": 1.2006, "step": 62100 }, { "epoch": 0.0044, "grad_norm": 1.3791793584823608, "learning_rate": 4.837069825293596e-05, "loss": 1.2191, "step": 62200 }, { "epoch": 0.0046, "grad_norm": 1.4083282947540283, "learning_rate": 4.836505928996603e-05, "loss": 1.2232, "step": 62300 }, { "epoch": 0.0048, "grad_norm": 1.5420063734054565, "learning_rate": 4.835941091555301e-05, "loss": 1.2281, "step": 62400 }, { "epoch": 0.005, "grad_norm": 0.7661809921264648, "learning_rate": 4.8353753131972066e-05, "loss": 1.2262, "step": 62500 }, { "epoch": 0.0052, "grad_norm": 0.5983784198760986, "learning_rate": 4.8348085941502164e-05, "loss": 1.2203, "step": 62600 }, { "epoch": 0.0054, "grad_norm": 0.8108716011047363, "learning_rate": 4.8342409346426024e-05, "loss": 1.1536, "step": 62700 }, { "epoch": 0.0056, "grad_norm": 0.9011421203613281, "learning_rate": 4.83367233490302e-05, "loss": 1.2214, "step": 62800 }, { "epoch": 0.0058, "grad_norm": 0.6667259335517883, "learning_rate": 4.8331027951604995e-05, "loss": 1.1932, "step": 62900 }, { "epoch": 0.006, "grad_norm": 1.2265853881835938, "learning_rate": 4.8325323156444525e-05, "loss": 1.235, "step": 63000 }, { "epoch": 0.006, "eval_loss": 1.0849037170410156, "eval_runtime": 76.5492, "eval_samples_per_second": 201.74, "eval_steps_per_second": 3.161, "step": 63000 }, { "epoch": 0.0062, "grad_norm": 1.468518614768982, "learning_rate": 4.831960896584667e-05, "loss": 1.1886, "step": 63100 }, { "epoch": 0.0064, "grad_norm": 1.2378790378570557, "learning_rate": 4.831388538211312e-05, "loss": 1.1983, "step": 63200 }, { "epoch": 0.0066, "grad_norm": 1.2989089488983154, "learning_rate": 4.830815240754933e-05, "loss": 1.1894, "step": 63300 }, { "epoch": 0.0068, "grad_norm": 1.3696600198745728, "learning_rate": 4.830241004446453e-05, "loss": 1.1798, "step": 63400 }, { "epoch": 0.007, "grad_norm": 1.3715136051177979, "learning_rate": 4.829665829517175e-05, "loss": 1.2323, "step": 63500 }, { "epoch": 0.0072, "grad_norm": 0.7888614535331726, "learning_rate": 4.82908971619878e-05, "loss": 1.2098, "step": 63600 }, { "epoch": 0.0074, "grad_norm": 1.0456979274749756, "learning_rate": 4.828512664723326e-05, "loss": 1.21, "step": 63700 }, { "epoch": 0.0076, "grad_norm": 1.4525970220565796, "learning_rate": 4.827934675323248e-05, "loss": 1.191, "step": 63800 }, { "epoch": 0.0078, "grad_norm": 1.6751583814620972, "learning_rate": 4.8273557482313625e-05, "loss": 1.2084, "step": 63900 }, { "epoch": 0.008, "grad_norm": 0.7282904982566833, "learning_rate": 4.826775883680859e-05, "loss": 1.2376, "step": 64000 }, { "epoch": 0.008, "eval_loss": 1.0806148052215576, "eval_runtime": 75.7629, "eval_samples_per_second": 203.833, "eval_steps_per_second": 3.194, "step": 64000 }, { "epoch": 0.0082, "grad_norm": 1.0859407186508179, "learning_rate": 4.826195081905308e-05, "loss": 1.1807, "step": 64100 }, { "epoch": 0.0084, "grad_norm": 1.3917006254196167, "learning_rate": 4.8256133431386566e-05, "loss": 1.2012, "step": 64200 }, { "epoch": 0.0086, "grad_norm": 1.4448059797286987, "learning_rate": 4.825030667615228e-05, "loss": 1.2305, "step": 64300 }, { "epoch": 0.0088, "grad_norm": 1.0721293687820435, "learning_rate": 4.824447055569725e-05, "loss": 1.2332, "step": 64400 }, { "epoch": 0.009, "grad_norm": 0.9940403699874878, "learning_rate": 4.823862507237226e-05, "loss": 1.2096, "step": 64500 }, { "epoch": 0.0092, "grad_norm": 1.5473828315734863, "learning_rate": 4.823277022853187e-05, "loss": 1.1706, "step": 64600 }, { "epoch": 0.0094, "grad_norm": 1.3127409219741821, "learning_rate": 4.822690602653441e-05, "loss": 1.2051, "step": 64700 }, { "epoch": 0.0096, "grad_norm": 1.7532451152801514, "learning_rate": 4.822103246874198e-05, "loss": 1.196, "step": 64800 }, { "epoch": 0.0098, "grad_norm": 0.8706884980201721, "learning_rate": 4.8215149557520446e-05, "loss": 1.1862, "step": 64900 }, { "epoch": 0.01, "grad_norm": 1.5764431953430176, "learning_rate": 4.8209257295239455e-05, "loss": 1.2257, "step": 65000 }, { "epoch": 0.01, "eval_loss": 1.0817573070526123, "eval_runtime": 75.771, "eval_samples_per_second": 203.811, "eval_steps_per_second": 3.194, "step": 65000 }, { "epoch": 0.0102, "grad_norm": 1.467939019203186, "learning_rate": 4.820335568427239e-05, "loss": 1.2317, "step": 65100 }, { "epoch": 0.0104, "grad_norm": 1.270477533340454, "learning_rate": 4.819744472699643e-05, "loss": 1.2308, "step": 65200 }, { "epoch": 0.0106, "grad_norm": 1.073867917060852, "learning_rate": 4.8191524425792526e-05, "loss": 1.1991, "step": 65300 }, { "epoch": 0.0108, "grad_norm": 1.0844908952713013, "learning_rate": 4.818559478304534e-05, "loss": 1.1914, "step": 65400 }, { "epoch": 0.011, "grad_norm": 1.282365322113037, "learning_rate": 4.817965580114335e-05, "loss": 1.2035, "step": 65500 }, { "epoch": 0.0112, "grad_norm": 1.3751475811004639, "learning_rate": 4.817370748247878e-05, "loss": 1.215, "step": 65600 }, { "epoch": 0.0114, "grad_norm": 1.484107255935669, "learning_rate": 4.81677498294476e-05, "loss": 1.2298, "step": 65700 }, { "epoch": 0.0116, "grad_norm": 1.326803207397461, "learning_rate": 4.8161782844449566e-05, "loss": 1.1794, "step": 65800 }, { "epoch": 0.0118, "grad_norm": 1.6823039054870605, "learning_rate": 4.815580652988817e-05, "loss": 1.1896, "step": 65900 }, { "epoch": 0.012, "grad_norm": 1.1735076904296875, "learning_rate": 4.8149820888170673e-05, "loss": 1.2089, "step": 66000 }, { "epoch": 0.012, "eval_loss": 1.081894874572754, "eval_runtime": 75.5115, "eval_samples_per_second": 204.512, "eval_steps_per_second": 3.205, "step": 66000 }, { "epoch": 0.0122, "grad_norm": 1.0032376050949097, "learning_rate": 4.814382592170808e-05, "loss": 1.2197, "step": 66100 }, { "epoch": 0.0124, "grad_norm": 1.2638306617736816, "learning_rate": 4.813782163291519e-05, "loss": 1.2009, "step": 66200 }, { "epoch": 0.0126, "grad_norm": 1.2233041524887085, "learning_rate": 4.813180802421051e-05, "loss": 1.2069, "step": 66300 }, { "epoch": 0.0128, "grad_norm": 0.857825756072998, "learning_rate": 4.812578509801632e-05, "loss": 1.1942, "step": 66400 }, { "epoch": 0.013, "grad_norm": 0.8879494667053223, "learning_rate": 4.811975285675866e-05, "loss": 1.1689, "step": 66500 }, { "epoch": 0.0132, "grad_norm": 1.3842177391052246, "learning_rate": 4.811371130286731e-05, "loss": 1.1941, "step": 66600 }, { "epoch": 0.0134, "grad_norm": 1.303063988685608, "learning_rate": 4.810766043877582e-05, "loss": 1.194, "step": 66700 }, { "epoch": 0.0136, "grad_norm": 1.3135032653808594, "learning_rate": 4.810160026692147e-05, "loss": 1.1536, "step": 66800 }, { "epoch": 0.0138, "grad_norm": 0.8059789538383484, "learning_rate": 4.809553078974528e-05, "loss": 1.2083, "step": 66900 }, { "epoch": 0.014, "grad_norm": 1.493458867073059, "learning_rate": 4.808945200969206e-05, "loss": 1.2031, "step": 67000 }, { "epoch": 0.014, "eval_loss": 1.0807029008865356, "eval_runtime": 76.4746, "eval_samples_per_second": 201.936, "eval_steps_per_second": 3.164, "step": 67000 }, { "epoch": 0.0142, "grad_norm": 0.9932582378387451, "learning_rate": 4.808336392921033e-05, "loss": 1.1932, "step": 67100 }, { "epoch": 0.0144, "grad_norm": 1.1588648557662964, "learning_rate": 4.807726655075237e-05, "loss": 1.2004, "step": 67200 }, { "epoch": 0.0146, "grad_norm": 0.713295042514801, "learning_rate": 4.80711598767742e-05, "loss": 1.1336, "step": 67300 }, { "epoch": 0.0148, "grad_norm": 1.474853277206421, "learning_rate": 4.80650439097356e-05, "loss": 1.1909, "step": 67400 }, { "epoch": 0.015, "grad_norm": 1.0433249473571777, "learning_rate": 4.805891865210006e-05, "loss": 1.1868, "step": 67500 }, { "epoch": 0.0152, "grad_norm": 0.9942545294761658, "learning_rate": 4.8052784106334854e-05, "loss": 1.1896, "step": 67600 }, { "epoch": 0.0154, "grad_norm": 0.9021309018135071, "learning_rate": 4.804664027491096e-05, "loss": 1.2265, "step": 67700 }, { "epoch": 0.0156, "grad_norm": 1.4818402528762817, "learning_rate": 4.8040487160303126e-05, "loss": 1.2149, "step": 67800 }, { "epoch": 0.0158, "grad_norm": 0.74870365858078, "learning_rate": 4.803432476498981e-05, "loss": 1.1928, "step": 67900 }, { "epoch": 0.016, "grad_norm": 0.7827754020690918, "learning_rate": 4.8028153091453246e-05, "loss": 1.2062, "step": 68000 }, { "epoch": 0.016, "eval_loss": 1.0748348236083984, "eval_runtime": 75.9274, "eval_samples_per_second": 203.392, "eval_steps_per_second": 3.187, "step": 68000 }, { "epoch": 0.0162, "grad_norm": 1.2250913381576538, "learning_rate": 4.802197214217936e-05, "loss": 1.1412, "step": 68100 }, { "epoch": 0.0164, "grad_norm": 1.4763202667236328, "learning_rate": 4.801578191965785e-05, "loss": 1.173, "step": 68200 }, { "epoch": 0.0166, "grad_norm": 0.8980317115783691, "learning_rate": 4.800958242638214e-05, "loss": 1.1801, "step": 68300 }, { "epoch": 0.0168, "grad_norm": 1.2781926393508911, "learning_rate": 4.800337366484937e-05, "loss": 1.2012, "step": 68400 }, { "epoch": 0.017, "grad_norm": 0.8269230723381042, "learning_rate": 4.799715563756045e-05, "loss": 1.2319, "step": 68500 }, { "epoch": 0.0172, "grad_norm": 0.633537232875824, "learning_rate": 4.7990928347019984e-05, "loss": 1.2058, "step": 68600 }, { "epoch": 0.0174, "grad_norm": 1.39164400100708, "learning_rate": 4.7984691795736324e-05, "loss": 1.2066, "step": 68700 }, { "epoch": 0.0176, "grad_norm": 1.5555399656295776, "learning_rate": 4.7978445986221566e-05, "loss": 1.2088, "step": 68800 }, { "epoch": 0.0178, "grad_norm": 1.2505526542663574, "learning_rate": 4.7972190920991514e-05, "loss": 1.203, "step": 68900 }, { "epoch": 0.018, "grad_norm": 1.5910965204238892, "learning_rate": 4.7965926602565705e-05, "loss": 1.1877, "step": 69000 }, { "epoch": 0.018, "eval_loss": 1.0717748403549194, "eval_runtime": 75.7519, "eval_samples_per_second": 203.863, "eval_steps_per_second": 3.195, "step": 69000 }, { "epoch": 0.0182, "grad_norm": 0.7755507826805115, "learning_rate": 4.79596530334674e-05, "loss": 1.1864, "step": 69100 }, { "epoch": 0.0184, "grad_norm": 1.2141857147216797, "learning_rate": 4.79533702162236e-05, "loss": 1.1849, "step": 69200 }, { "epoch": 0.0186, "grad_norm": 1.399149775505066, "learning_rate": 4.794707815336503e-05, "loss": 1.1702, "step": 69300 }, { "epoch": 0.0188, "grad_norm": 1.3381379842758179, "learning_rate": 4.7940776847426114e-05, "loss": 1.2052, "step": 69400 }, { "epoch": 0.019, "grad_norm": 1.347264051437378, "learning_rate": 4.793446630094503e-05, "loss": 1.1998, "step": 69500 }, { "epoch": 0.0192, "grad_norm": 1.2072675228118896, "learning_rate": 4.792814651646367e-05, "loss": 1.2127, "step": 69600 }, { "epoch": 0.0194, "grad_norm": 0.7959086894989014, "learning_rate": 4.792181749652763e-05, "loss": 1.1474, "step": 69700 }, { "epoch": 0.0196, "grad_norm": 1.0272786617279053, "learning_rate": 4.7915479243686244e-05, "loss": 1.2033, "step": 69800 }, { "epoch": 0.0198, "grad_norm": 0.8985835909843445, "learning_rate": 4.790913176049256e-05, "loss": 1.1942, "step": 69900 }, { "epoch": 0.02, "grad_norm": 0.676177442073822, "learning_rate": 4.7902775049503346e-05, "loss": 1.1883, "step": 70000 }, { "epoch": 0.02, "eval_loss": 1.0733578205108643, "eval_runtime": 75.8186, "eval_samples_per_second": 203.684, "eval_steps_per_second": 3.192, "step": 70000 }, { "epoch": 0.0202, "grad_norm": 0.7747570872306824, "learning_rate": 4.789640911327907e-05, "loss": 1.1883, "step": 70100 }, { "epoch": 0.0204, "grad_norm": 1.1808815002441406, "learning_rate": 4.789003395438395e-05, "loss": 1.1932, "step": 70200 }, { "epoch": 0.0206, "grad_norm": 1.29102623462677, "learning_rate": 4.7883649575385894e-05, "loss": 1.1654, "step": 70300 }, { "epoch": 0.0208, "grad_norm": 0.8418052792549133, "learning_rate": 4.7877255978856516e-05, "loss": 1.1702, "step": 70400 }, { "epoch": 0.021, "grad_norm": 1.1825124025344849, "learning_rate": 4.787085316737116e-05, "loss": 1.1654, "step": 70500 }, { "epoch": 0.0212, "grad_norm": 1.301255702972412, "learning_rate": 4.78644411435089e-05, "loss": 1.2505, "step": 70600 }, { "epoch": 0.0214, "grad_norm": 1.2461885213851929, "learning_rate": 4.785801990985247e-05, "loss": 1.1907, "step": 70700 }, { "epoch": 0.0216, "grad_norm": 1.2957687377929688, "learning_rate": 4.7851589468988364e-05, "loss": 1.2244, "step": 70800 }, { "epoch": 0.0218, "grad_norm": 1.9566733837127686, "learning_rate": 4.7845149823506744e-05, "loss": 1.1688, "step": 70900 }, { "epoch": 0.022, "grad_norm": 0.9749345779418945, "learning_rate": 4.783870097600151e-05, "loss": 1.2178, "step": 71000 }, { "epoch": 0.022, "eval_loss": 1.076163649559021, "eval_runtime": 75.78, "eval_samples_per_second": 203.787, "eval_steps_per_second": 3.193, "step": 71000 }, { "epoch": 0.0222, "grad_norm": 1.1278064250946045, "learning_rate": 4.783224292907025e-05, "loss": 1.1899, "step": 71100 }, { "epoch": 0.0224, "grad_norm": 1.023586392402649, "learning_rate": 4.7825775685314277e-05, "loss": 1.1967, "step": 71200 }, { "epoch": 0.0226, "grad_norm": 1.2925764322280884, "learning_rate": 4.781929924733858e-05, "loss": 1.2154, "step": 71300 }, { "epoch": 0.0228, "grad_norm": 0.8185212016105652, "learning_rate": 4.781281361775188e-05, "loss": 1.195, "step": 71400 }, { "epoch": 0.023, "grad_norm": 0.8742319941520691, "learning_rate": 4.7806318799166586e-05, "loss": 1.1746, "step": 71500 }, { "epoch": 0.0232, "grad_norm": 1.2598085403442383, "learning_rate": 4.77998147941988e-05, "loss": 1.1781, "step": 71600 }, { "epoch": 0.0234, "grad_norm": 1.2358424663543701, "learning_rate": 4.7793301605468344e-05, "loss": 1.2345, "step": 71700 }, { "epoch": 0.0236, "grad_norm": 1.2528828382492065, "learning_rate": 4.778677923559872e-05, "loss": 1.2109, "step": 71800 }, { "epoch": 0.0238, "grad_norm": 0.5741105675697327, "learning_rate": 4.778024768721716e-05, "loss": 1.2076, "step": 71900 }, { "epoch": 0.024, "grad_norm": 1.3200185298919678, "learning_rate": 4.7773706962954545e-05, "loss": 1.2124, "step": 72000 }, { "epoch": 0.024, "eval_loss": 1.0720120668411255, "eval_runtime": 76.4471, "eval_samples_per_second": 202.009, "eval_steps_per_second": 3.166, "step": 72000 }, { "epoch": 0.0242, "grad_norm": 1.4096635580062866, "learning_rate": 4.776715706544549e-05, "loss": 1.2283, "step": 72100 }, { "epoch": 0.0244, "grad_norm": 1.5862853527069092, "learning_rate": 4.7760597997328295e-05, "loss": 1.1927, "step": 72200 }, { "epoch": 0.0246, "grad_norm": 1.3406593799591064, "learning_rate": 4.7754029761244945e-05, "loss": 1.1709, "step": 72300 }, { "epoch": 0.0248, "grad_norm": 1.189676284790039, "learning_rate": 4.774745235984113e-05, "loss": 1.2176, "step": 72400 }, { "epoch": 0.025, "grad_norm": 1.4424960613250732, "learning_rate": 4.7740865795766224e-05, "loss": 1.2212, "step": 72500 }, { "epoch": 0.0252, "grad_norm": 0.7654275298118591, "learning_rate": 4.77342700716733e-05, "loss": 1.2196, "step": 72600 }, { "epoch": 0.0254, "grad_norm": 1.1389504671096802, "learning_rate": 4.772766519021911e-05, "loss": 1.1937, "step": 72700 }, { "epoch": 0.0256, "grad_norm": 1.1204986572265625, "learning_rate": 4.772105115406409e-05, "loss": 1.1623, "step": 72800 }, { "epoch": 0.0258, "grad_norm": 1.2594044208526611, "learning_rate": 4.771442796587239e-05, "loss": 1.2127, "step": 72900 }, { "epoch": 0.026, "grad_norm": 1.3245586156845093, "learning_rate": 4.770779562831181e-05, "loss": 1.1919, "step": 73000 }, { "epoch": 0.026, "eval_loss": 1.0672369003295898, "eval_runtime": 76.1554, "eval_samples_per_second": 202.783, "eval_steps_per_second": 3.178, "step": 73000 }, { "epoch": 0.0262, "grad_norm": 0.813410222530365, "learning_rate": 4.770115414405388e-05, "loss": 1.224, "step": 73100 }, { "epoch": 0.0264, "grad_norm": 1.3278921842575073, "learning_rate": 4.769450351577377e-05, "loss": 1.2304, "step": 73200 }, { "epoch": 0.0266, "grad_norm": 1.1676868200302124, "learning_rate": 4.768784374615036e-05, "loss": 1.2144, "step": 73300 }, { "epoch": 0.0268, "grad_norm": 1.2270694971084595, "learning_rate": 4.7681174837866196e-05, "loss": 1.2584, "step": 73400 }, { "epoch": 0.027, "grad_norm": 1.5095762014389038, "learning_rate": 4.7674496793607525e-05, "loss": 1.1892, "step": 73500 }, { "epoch": 0.0272, "grad_norm": 1.0437262058258057, "learning_rate": 4.766780961606426e-05, "loss": 1.2003, "step": 73600 }, { "epoch": 0.0274, "grad_norm": 0.6719204187393188, "learning_rate": 4.766111330793e-05, "loss": 1.2145, "step": 73700 }, { "epoch": 0.0276, "grad_norm": 0.7166513204574585, "learning_rate": 4.765440787190199e-05, "loss": 1.2463, "step": 73800 }, { "epoch": 0.0278, "grad_norm": 0.9765319228172302, "learning_rate": 4.7647693310681204e-05, "loss": 1.2095, "step": 73900 }, { "epoch": 0.028, "grad_norm": 1.298134446144104, "learning_rate": 4.7640969626972265e-05, "loss": 1.2089, "step": 74000 }, { "epoch": 0.028, "eval_loss": 1.0727263689041138, "eval_runtime": 76.0016, "eval_samples_per_second": 203.193, "eval_steps_per_second": 3.184, "step": 74000 }, { "epoch": 0.0282, "grad_norm": 1.1968761682510376, "learning_rate": 4.763423682348347e-05, "loss": 1.1719, "step": 74100 }, { "epoch": 0.0284, "grad_norm": 1.1887174844741821, "learning_rate": 4.762749490292678e-05, "loss": 1.1961, "step": 74200 }, { "epoch": 0.0286, "grad_norm": 1.4029371738433838, "learning_rate": 4.762074386801786e-05, "loss": 1.1609, "step": 74300 }, { "epoch": 0.0288, "grad_norm": 1.3785122632980347, "learning_rate": 4.761398372147601e-05, "loss": 1.1931, "step": 74400 }, { "epoch": 0.029, "grad_norm": 1.1329565048217773, "learning_rate": 4.760721446602422e-05, "loss": 1.2107, "step": 74500 }, { "epoch": 0.0292, "grad_norm": 1.2266113758087158, "learning_rate": 4.760043610438915e-05, "loss": 1.1708, "step": 74600 }, { "epoch": 0.0294, "grad_norm": 1.2526196241378784, "learning_rate": 4.759364863930112e-05, "loss": 1.2073, "step": 74700 }, { "epoch": 0.0296, "grad_norm": 1.3959336280822754, "learning_rate": 4.7586852073494113e-05, "loss": 1.1995, "step": 74800 }, { "epoch": 0.0298, "grad_norm": 1.2470852136611938, "learning_rate": 4.7580046409705806e-05, "loss": 1.2227, "step": 74900 }, { "epoch": 0.03, "grad_norm": 1.0915220975875854, "learning_rate": 4.7573231650677495e-05, "loss": 1.1955, "step": 75000 }, { "epoch": 0.03, "eval_loss": 1.0732471942901611, "eval_runtime": 75.8455, "eval_samples_per_second": 203.611, "eval_steps_per_second": 3.191, "step": 75000 }, { "epoch": 0.0302, "grad_norm": 1.4608689546585083, "learning_rate": 4.756640779915418e-05, "loss": 1.1588, "step": 75100 }, { "epoch": 0.0304, "grad_norm": 1.2811450958251953, "learning_rate": 4.755957485788449e-05, "loss": 1.1722, "step": 75200 }, { "epoch": 0.0306, "grad_norm": 1.3260635137557983, "learning_rate": 4.755273282962075e-05, "loss": 1.2238, "step": 75300 }, { "epoch": 0.0308, "grad_norm": 1.219567060470581, "learning_rate": 4.754588171711893e-05, "loss": 1.2718, "step": 75400 }, { "epoch": 0.031, "grad_norm": 1.368947982788086, "learning_rate": 4.753902152313865e-05, "loss": 1.1998, "step": 75500 }, { "epoch": 0.0312, "grad_norm": 1.3364487886428833, "learning_rate": 4.7532152250443194e-05, "loss": 1.2043, "step": 75600 }, { "epoch": 0.0314, "grad_norm": 1.348130702972412, "learning_rate": 4.7525273901799506e-05, "loss": 1.1834, "step": 75700 }, { "epoch": 0.0316, "grad_norm": 1.1862202882766724, "learning_rate": 4.751838647997818e-05, "loss": 1.2061, "step": 75800 }, { "epoch": 0.0318, "grad_norm": 0.7471460103988647, "learning_rate": 4.7511489987753476e-05, "loss": 1.1866, "step": 75900 }, { "epoch": 0.032, "grad_norm": 1.4090434312820435, "learning_rate": 4.75045844279033e-05, "loss": 1.1878, "step": 76000 }, { "epoch": 0.032, "eval_loss": 1.0745600461959839, "eval_runtime": 76.306, "eval_samples_per_second": 202.382, "eval_steps_per_second": 3.171, "step": 76000 }, { "epoch": 0.0322, "grad_norm": 1.591199278831482, "learning_rate": 4.7497669803209204e-05, "loss": 1.1824, "step": 76100 }, { "epoch": 0.0324, "grad_norm": 0.8325656652450562, "learning_rate": 4.749074611645641e-05, "loss": 1.1723, "step": 76200 }, { "epoch": 0.0326, "grad_norm": 0.8313425779342651, "learning_rate": 4.748381337043376e-05, "loss": 1.2033, "step": 76300 }, { "epoch": 0.0328, "grad_norm": 1.4721826314926147, "learning_rate": 4.7476871567933775e-05, "loss": 1.1988, "step": 76400 }, { "epoch": 0.033, "grad_norm": 0.9206506013870239, "learning_rate": 4.746992071175261e-05, "loss": 1.1844, "step": 76500 }, { "epoch": 0.0332, "grad_norm": 1.0820422172546387, "learning_rate": 4.746296080469007e-05, "loss": 1.1902, "step": 76600 }, { "epoch": 0.0334, "grad_norm": 0.9319769144058228, "learning_rate": 4.745599184954961e-05, "loss": 1.2031, "step": 76700 }, { "epoch": 0.0336, "grad_norm": 1.1914819478988647, "learning_rate": 4.744901384913831e-05, "loss": 1.166, "step": 76800 }, { "epoch": 0.0338, "grad_norm": 0.8440219163894653, "learning_rate": 4.7442026806266914e-05, "loss": 1.1493, "step": 76900 }, { "epoch": 0.034, "grad_norm": 1.001518726348877, "learning_rate": 4.7435030723749813e-05, "loss": 1.1835, "step": 77000 }, { "epoch": 0.034, "eval_loss": 1.0681182146072388, "eval_runtime": 76.1301, "eval_samples_per_second": 202.85, "eval_steps_per_second": 3.179, "step": 77000 }, { "epoch": 0.0342, "grad_norm": 1.347307562828064, "learning_rate": 4.742802560440501e-05, "loss": 1.2213, "step": 77100 }, { "epoch": 0.0344, "grad_norm": 1.1187894344329834, "learning_rate": 4.742101145105419e-05, "loss": 1.1949, "step": 77200 }, { "epoch": 0.0346, "grad_norm": 0.8066337704658508, "learning_rate": 4.741398826652262e-05, "loss": 1.2008, "step": 77300 }, { "epoch": 0.0348, "grad_norm": 1.0704104900360107, "learning_rate": 4.740695605363927e-05, "loss": 1.1804, "step": 77400 }, { "epoch": 0.035, "grad_norm": 1.104546070098877, "learning_rate": 4.7399914815236704e-05, "loss": 1.2232, "step": 77500 }, { "epoch": 0.0352, "grad_norm": 1.1818023920059204, "learning_rate": 4.7392864554151126e-05, "loss": 1.2062, "step": 77600 }, { "epoch": 0.0354, "grad_norm": 1.3036936521530151, "learning_rate": 4.738580527322238e-05, "loss": 1.1905, "step": 77700 }, { "epoch": 0.0356, "grad_norm": 1.1169214248657227, "learning_rate": 4.737873697529395e-05, "loss": 1.1759, "step": 77800 }, { "epoch": 0.0358, "grad_norm": 0.8993995189666748, "learning_rate": 4.7371659663212934e-05, "loss": 1.1677, "step": 77900 }, { "epoch": 0.036, "grad_norm": 1.258747935295105, "learning_rate": 4.736457333983009e-05, "loss": 1.2166, "step": 78000 }, { "epoch": 0.036, "eval_loss": 1.0701075792312622, "eval_runtime": 75.9209, "eval_samples_per_second": 203.409, "eval_steps_per_second": 3.188, "step": 78000 }, { "epoch": 0.0362, "grad_norm": 1.269551396369934, "learning_rate": 4.735747800799978e-05, "loss": 1.2185, "step": 78100 }, { "epoch": 0.0364, "grad_norm": 1.3016119003295898, "learning_rate": 4.735037367057999e-05, "loss": 1.182, "step": 78200 }, { "epoch": 0.0366, "grad_norm": 1.1407994031906128, "learning_rate": 4.734326033043238e-05, "loss": 1.2102, "step": 78300 }, { "epoch": 0.0368, "grad_norm": 1.1673243045806885, "learning_rate": 4.7336137990422164e-05, "loss": 1.1902, "step": 78400 }, { "epoch": 0.037, "grad_norm": 0.9958565831184387, "learning_rate": 4.732900665341824e-05, "loss": 1.2112, "step": 78500 }, { "epoch": 0.0372, "grad_norm": 0.6769017577171326, "learning_rate": 4.732186632229311e-05, "loss": 1.1933, "step": 78600 }, { "epoch": 0.0374, "grad_norm": 0.6430754661560059, "learning_rate": 4.7314716999922896e-05, "loss": 1.1851, "step": 78700 }, { "epoch": 0.0376, "grad_norm": 1.103901982307434, "learning_rate": 4.7307558689187334e-05, "loss": 1.2234, "step": 78800 }, { "epoch": 0.0378, "grad_norm": 1.143268346786499, "learning_rate": 4.73003913929698e-05, "loss": 1.1609, "step": 78900 }, { "epoch": 0.038, "grad_norm": 1.2543673515319824, "learning_rate": 4.7293215114157284e-05, "loss": 1.1862, "step": 79000 }, { "epoch": 0.038, "eval_loss": 1.075058937072754, "eval_runtime": 77.0151, "eval_samples_per_second": 200.519, "eval_steps_per_second": 3.142, "step": 79000 }, { "epoch": 0.0382, "grad_norm": 1.0687370300292969, "learning_rate": 4.728602985564039e-05, "loss": 1.1878, "step": 79100 }, { "epoch": 0.0384, "grad_norm": 1.230892539024353, "learning_rate": 4.727883562031333e-05, "loss": 1.1561, "step": 79200 }, { "epoch": 0.0386, "grad_norm": 1.0465742349624634, "learning_rate": 4.727163241107395e-05, "loss": 1.1677, "step": 79300 }, { "epoch": 0.0388, "grad_norm": 0.6553373336791992, "learning_rate": 4.726442023082369e-05, "loss": 1.2035, "step": 79400 }, { "epoch": 0.039, "grad_norm": 0.9347487688064575, "learning_rate": 4.725719908246763e-05, "loss": 1.2116, "step": 79500 }, { "epoch": 0.0392, "grad_norm": 1.0414602756500244, "learning_rate": 4.724996896891445e-05, "loss": 1.2237, "step": 79600 }, { "epoch": 0.0394, "grad_norm": 1.1857577562332153, "learning_rate": 4.724272989307642e-05, "loss": 1.1653, "step": 79700 }, { "epoch": 0.0396, "grad_norm": 1.3574703931808472, "learning_rate": 4.7235481857869446e-05, "loss": 1.2176, "step": 79800 }, { "epoch": 0.0398, "grad_norm": 1.3188464641571045, "learning_rate": 4.722822486621304e-05, "loss": 1.1872, "step": 79900 }, { "epoch": 0.04, "grad_norm": 1.1241661310195923, "learning_rate": 4.722095892103032e-05, "loss": 1.1926, "step": 80000 }, { "epoch": 0.04, "eval_loss": 1.0716365575790405, "eval_runtime": 76.5906, "eval_samples_per_second": 201.63, "eval_steps_per_second": 3.16, "step": 80000 }, { "epoch": 0.0402, "grad_norm": 0.9855309724807739, "learning_rate": 4.721368402524801e-05, "loss": 1.1427, "step": 80100 }, { "epoch": 0.0404, "grad_norm": 0.6458451151847839, "learning_rate": 4.720640018179642e-05, "loss": 1.2032, "step": 80200 }, { "epoch": 0.0406, "grad_norm": 1.1878600120544434, "learning_rate": 4.71991073936095e-05, "loss": 1.1879, "step": 80300 }, { "epoch": 0.0408, "grad_norm": 0.8349748253822327, "learning_rate": 4.719180566362477e-05, "loss": 1.1355, "step": 80400 }, { "epoch": 0.041, "grad_norm": 1.1937662363052368, "learning_rate": 4.7184494994783376e-05, "loss": 1.2018, "step": 80500 }, { "epoch": 0.0412, "grad_norm": 1.3011997938156128, "learning_rate": 4.7177175390030054e-05, "loss": 1.1697, "step": 80600 }, { "epoch": 0.0414, "grad_norm": 1.1909871101379395, "learning_rate": 4.7169846852313137e-05, "loss": 1.2126, "step": 80700 }, { "epoch": 0.0416, "grad_norm": 1.5078299045562744, "learning_rate": 4.7162509384584555e-05, "loss": 1.1983, "step": 80800 }, { "epoch": 0.0418, "grad_norm": 1.3141160011291504, "learning_rate": 4.715516298979984e-05, "loss": 1.2118, "step": 80900 }, { "epoch": 0.042, "grad_norm": 1.3565207719802856, "learning_rate": 4.714780767091813e-05, "loss": 1.2054, "step": 81000 }, { "epoch": 0.042, "eval_loss": 1.0669591426849365, "eval_runtime": 75.959, "eval_samples_per_second": 203.307, "eval_steps_per_second": 3.186, "step": 81000 }, { "epoch": 0.0422, "grad_norm": 1.3890715837478638, "learning_rate": 4.714044343090214e-05, "loss": 1.1917, "step": 81100 }, { "epoch": 0.0424, "grad_norm": 0.9992968440055847, "learning_rate": 4.713307027271817e-05, "loss": 1.1869, "step": 81200 }, { "epoch": 0.0426, "grad_norm": 0.8716880679130554, "learning_rate": 4.712568819933615e-05, "loss": 1.1941, "step": 81300 }, { "epoch": 0.0428, "grad_norm": 1.243594765663147, "learning_rate": 4.711829721372957e-05, "loss": 1.1667, "step": 81400 }, { "epoch": 0.043, "grad_norm": 0.6567416191101074, "learning_rate": 4.7110897318875516e-05, "loss": 1.2105, "step": 81500 }, { "epoch": 0.0432, "grad_norm": 0.5886017084121704, "learning_rate": 4.710348851775467e-05, "loss": 1.1867, "step": 81600 }, { "epoch": 0.0434, "grad_norm": 0.6296970248222351, "learning_rate": 4.709607081335129e-05, "loss": 1.1702, "step": 81700 }, { "epoch": 0.0436, "grad_norm": 0.9896938800811768, "learning_rate": 4.7088644208653226e-05, "loss": 1.1628, "step": 81800 }, { "epoch": 0.0438, "grad_norm": 0.7199723720550537, "learning_rate": 4.708120870665192e-05, "loss": 1.1792, "step": 81900 }, { "epoch": 0.044, "grad_norm": 1.3148512840270996, "learning_rate": 4.707376431034238e-05, "loss": 1.185, "step": 82000 }, { "epoch": 0.044, "eval_loss": 1.0709099769592285, "eval_runtime": 75.8635, "eval_samples_per_second": 203.563, "eval_steps_per_second": 3.19, "step": 82000 }, { "epoch": 0.0442, "grad_norm": 0.6634069681167603, "learning_rate": 4.706631102272323e-05, "loss": 1.1633, "step": 82100 }, { "epoch": 0.0444, "grad_norm": 1.3700015544891357, "learning_rate": 4.705884884679663e-05, "loss": 1.1712, "step": 82200 }, { "epoch": 0.0446, "grad_norm": 1.1697111129760742, "learning_rate": 4.705137778556835e-05, "loss": 1.1902, "step": 82300 }, { "epoch": 0.0448, "grad_norm": 1.4012552499771118, "learning_rate": 4.7043897842047735e-05, "loss": 1.216, "step": 82400 }, { "epoch": 0.045, "grad_norm": 1.2128801345825195, "learning_rate": 4.7036409019247706e-05, "loss": 1.2169, "step": 82500 }, { "epoch": 0.0452, "grad_norm": 1.435831904411316, "learning_rate": 4.7028911320184766e-05, "loss": 1.1839, "step": 82600 }, { "epoch": 0.0454, "grad_norm": 0.8126788139343262, "learning_rate": 4.702140474787898e-05, "loss": 1.1652, "step": 82700 }, { "epoch": 0.0456, "grad_norm": 1.1946730613708496, "learning_rate": 4.7013889305353985e-05, "loss": 1.2277, "step": 82800 }, { "epoch": 0.0458, "grad_norm": 0.6007882952690125, "learning_rate": 4.700636499563702e-05, "loss": 1.1586, "step": 82900 }, { "epoch": 0.046, "grad_norm": 0.6656979322433472, "learning_rate": 4.699883182175886e-05, "loss": 1.1902, "step": 83000 }, { "epoch": 0.046, "eval_loss": 1.072899580001831, "eval_runtime": 77.2342, "eval_samples_per_second": 199.95, "eval_steps_per_second": 3.133, "step": 83000 }, { "epoch": 0.0462, "grad_norm": 1.5463351011276245, "learning_rate": 4.6991289786753876e-05, "loss": 1.1988, "step": 83100 }, { "epoch": 0.0464, "grad_norm": 1.202536940574646, "learning_rate": 4.698373889366e-05, "loss": 1.1983, "step": 83200 }, { "epoch": 0.0466, "grad_norm": 0.7186087369918823, "learning_rate": 4.6976179145518724e-05, "loss": 1.15, "step": 83300 }, { "epoch": 0.0468, "grad_norm": 1.3059759140014648, "learning_rate": 4.6968610545375116e-05, "loss": 1.1896, "step": 83400 }, { "epoch": 0.047, "grad_norm": 0.8425590991973877, "learning_rate": 4.696103309627781e-05, "loss": 1.1747, "step": 83500 }, { "epoch": 0.0472, "grad_norm": 1.1745330095291138, "learning_rate": 4.695344680127899e-05, "loss": 1.1591, "step": 83600 }, { "epoch": 0.0474, "grad_norm": 0.6429449915885925, "learning_rate": 4.694585166343443e-05, "loss": 1.1893, "step": 83700 }, { "epoch": 0.0476, "grad_norm": 1.5323892831802368, "learning_rate": 4.693824768580344e-05, "loss": 1.2037, "step": 83800 }, { "epoch": 0.0478, "grad_norm": 1.2719688415527344, "learning_rate": 4.693063487144891e-05, "loss": 1.191, "step": 83900 }, { "epoch": 0.048, "grad_norm": 1.1735507249832153, "learning_rate": 4.6923013223437276e-05, "loss": 1.1904, "step": 84000 }, { "epoch": 0.048, "eval_loss": 1.0721956491470337, "eval_runtime": 76.3531, "eval_samples_per_second": 202.258, "eval_steps_per_second": 3.169, "step": 84000 }, { "epoch": 0.0482, "grad_norm": 1.1949045658111572, "learning_rate": 4.6915382744838536e-05, "loss": 1.1507, "step": 84100 }, { "epoch": 0.0484, "grad_norm": 1.074385404586792, "learning_rate": 4.690774343872625e-05, "loss": 1.1504, "step": 84200 }, { "epoch": 0.0486, "grad_norm": 1.0720473527908325, "learning_rate": 4.690009530817753e-05, "loss": 1.1758, "step": 84300 }, { "epoch": 0.0488, "grad_norm": 1.0596733093261719, "learning_rate": 4.6892438356273024e-05, "loss": 1.1778, "step": 84400 }, { "epoch": 0.049, "grad_norm": 1.2753647565841675, "learning_rate": 4.688477258609698e-05, "loss": 1.1827, "step": 84500 }, { "epoch": 0.0492, "grad_norm": 1.2803727388381958, "learning_rate": 4.687709800073715e-05, "loss": 1.164, "step": 84600 }, { "epoch": 0.0494, "grad_norm": 1.4797301292419434, "learning_rate": 4.6869414603284865e-05, "loss": 1.1748, "step": 84700 }, { "epoch": 0.0496, "grad_norm": 1.1455540657043457, "learning_rate": 4.6861722396834996e-05, "loss": 1.1918, "step": 84800 }, { "epoch": 0.0498, "grad_norm": 1.1636658906936646, "learning_rate": 4.6854021384485954e-05, "loss": 1.208, "step": 84900 }, { "epoch": 0.05, "grad_norm": 1.267817735671997, "learning_rate": 4.684631156933971e-05, "loss": 1.1679, "step": 85000 }, { "epoch": 0.05, "eval_loss": 1.0709259510040283, "eval_runtime": 76.3729, "eval_samples_per_second": 202.205, "eval_steps_per_second": 3.169, "step": 85000 }, { "epoch": 0.0502, "grad_norm": 1.5029271841049194, "learning_rate": 4.683859295450178e-05, "loss": 1.1459, "step": 85100 }, { "epoch": 0.0504, "grad_norm": 0.7328454256057739, "learning_rate": 4.683086554308123e-05, "loss": 1.1861, "step": 85200 }, { "epoch": 0.0506, "grad_norm": 1.114625334739685, "learning_rate": 4.682312933819063e-05, "loss": 1.1609, "step": 85300 }, { "epoch": 0.0508, "grad_norm": 1.4052484035491943, "learning_rate": 4.681538434294615e-05, "loss": 1.1534, "step": 85400 }, { "epoch": 0.051, "grad_norm": 0.7364799976348877, "learning_rate": 4.6807630560467475e-05, "loss": 1.1973, "step": 85500 }, { "epoch": 0.0512, "grad_norm": 0.701787531375885, "learning_rate": 4.679986799387781e-05, "loss": 1.1743, "step": 85600 }, { "epoch": 0.0514, "grad_norm": 1.331763744354248, "learning_rate": 4.679209664630393e-05, "loss": 1.1516, "step": 85700 }, { "epoch": 0.0516, "grad_norm": 0.9733197689056396, "learning_rate": 4.6784316520876124e-05, "loss": 1.1646, "step": 85800 }, { "epoch": 0.0518, "grad_norm": 0.7415294051170349, "learning_rate": 4.677652762072823e-05, "loss": 1.2005, "step": 85900 }, { "epoch": 0.052, "grad_norm": 1.1027395725250244, "learning_rate": 4.6768729948997606e-05, "loss": 1.1601, "step": 86000 }, { "epoch": 0.052, "eval_loss": 1.0681675672531128, "eval_runtime": 76.2441, "eval_samples_per_second": 202.547, "eval_steps_per_second": 3.174, "step": 86000 }, { "epoch": 0.0522, "grad_norm": 0.7156331539154053, "learning_rate": 4.676092350882517e-05, "loss": 1.1854, "step": 86100 }, { "epoch": 0.0524, "grad_norm": 1.3423713445663452, "learning_rate": 4.675310830335534e-05, "loss": 1.2135, "step": 86200 }, { "epoch": 0.0526, "grad_norm": 1.1925442218780518, "learning_rate": 4.6745284335736095e-05, "loss": 1.1414, "step": 86300 }, { "epoch": 0.0528, "grad_norm": 1.1717417240142822, "learning_rate": 4.673745160911891e-05, "loss": 1.184, "step": 86400 }, { "epoch": 0.053, "grad_norm": 1.0722715854644775, "learning_rate": 4.672961012665883e-05, "loss": 1.1685, "step": 86500 }, { "epoch": 0.0532, "grad_norm": 1.293058156967163, "learning_rate": 4.6721759891514386e-05, "loss": 1.1639, "step": 86600 }, { "epoch": 0.0534, "grad_norm": 1.1121761798858643, "learning_rate": 4.671390090684765e-05, "loss": 1.1693, "step": 86700 }, { "epoch": 0.0536, "grad_norm": 1.1979039907455444, "learning_rate": 4.6706033175824226e-05, "loss": 1.2123, "step": 86800 }, { "epoch": 0.0538, "grad_norm": 1.3003602027893066, "learning_rate": 4.669815670161324e-05, "loss": 1.1529, "step": 86900 }, { "epoch": 0.054, "grad_norm": 0.627068817615509, "learning_rate": 4.669027148738732e-05, "loss": 1.1901, "step": 87000 }, { "epoch": 0.054, "eval_loss": 1.0730499029159546, "eval_runtime": 76.271, "eval_samples_per_second": 202.475, "eval_steps_per_second": 3.173, "step": 87000 }, { "epoch": 0.0542, "grad_norm": 1.0153006315231323, "learning_rate": 4.6682377536322644e-05, "loss": 1.1295, "step": 87100 }, { "epoch": 0.0544, "grad_norm": 1.3619033098220825, "learning_rate": 4.667447485159889e-05, "loss": 1.1759, "step": 87200 }, { "epoch": 0.0546, "grad_norm": 0.8665468692779541, "learning_rate": 4.666656343639926e-05, "loss": 1.1602, "step": 87300 }, { "epoch": 0.0548, "grad_norm": 0.7338219285011292, "learning_rate": 4.665864329391046e-05, "loss": 1.1856, "step": 87400 }, { "epoch": 0.055, "grad_norm": 0.7363407611846924, "learning_rate": 4.665071442732274e-05, "loss": 1.1629, "step": 87500 }, { "epoch": 0.0552, "grad_norm": 0.9836055636405945, "learning_rate": 4.664277683982984e-05, "loss": 1.1755, "step": 87600 }, { "epoch": 0.0554, "grad_norm": 1.0638995170593262, "learning_rate": 4.663483053462901e-05, "loss": 1.186, "step": 87700 }, { "epoch": 0.0556, "grad_norm": 0.9050219058990479, "learning_rate": 4.662687551492103e-05, "loss": 1.2357, "step": 87800 }, { "epoch": 0.0558, "grad_norm": 0.917178213596344, "learning_rate": 4.661891178391018e-05, "loss": 1.1573, "step": 87900 }, { "epoch": 0.056, "grad_norm": 1.2023630142211914, "learning_rate": 4.661093934480425e-05, "loss": 1.1795, "step": 88000 }, { "epoch": 0.056, "eval_loss": 1.0689297914505005, "eval_runtime": 77.0471, "eval_samples_per_second": 200.436, "eval_steps_per_second": 3.141, "step": 88000 }, { "epoch": 0.0562, "grad_norm": 1.2633955478668213, "learning_rate": 4.660295820081453e-05, "loss": 1.1501, "step": 88100 }, { "epoch": 0.0564, "grad_norm": 0.5867215991020203, "learning_rate": 4.6594968355155835e-05, "loss": 1.2096, "step": 88200 }, { "epoch": 0.0566, "grad_norm": 1.3425019979476929, "learning_rate": 4.658696981104646e-05, "loss": 1.2016, "step": 88300 }, { "epoch": 0.0568, "grad_norm": 0.8101886510848999, "learning_rate": 4.657896257170825e-05, "loss": 1.1512, "step": 88400 }, { "epoch": 0.057, "grad_norm": 1.43784761428833, "learning_rate": 4.6570946640366474e-05, "loss": 1.1536, "step": 88500 }, { "epoch": 0.0572, "grad_norm": 0.766494870185852, "learning_rate": 4.6562922020249984e-05, "loss": 1.1521, "step": 88600 }, { "epoch": 0.0574, "grad_norm": 1.5485390424728394, "learning_rate": 4.6554888714591076e-05, "loss": 1.176, "step": 88700 }, { "epoch": 0.0576, "grad_norm": 0.8266467452049255, "learning_rate": 4.654684672662557e-05, "loss": 1.1514, "step": 88800 }, { "epoch": 0.0578, "grad_norm": 1.2086583375930786, "learning_rate": 4.6538796059592784e-05, "loss": 1.177, "step": 88900 }, { "epoch": 0.058, "grad_norm": 1.4609780311584473, "learning_rate": 4.6530736716735526e-05, "loss": 1.1447, "step": 89000 }, { "epoch": 0.058, "eval_loss": 1.0664150714874268, "eval_runtime": 76.3509, "eval_samples_per_second": 202.264, "eval_steps_per_second": 3.17, "step": 89000 }, { "epoch": 0.0582, "grad_norm": 1.0640435218811035, "learning_rate": 4.652266870130008e-05, "loss": 1.1392, "step": 89100 }, { "epoch": 0.0584, "grad_norm": 1.3286436796188354, "learning_rate": 4.651459201653626e-05, "loss": 1.222, "step": 89200 }, { "epoch": 0.0586, "grad_norm": 0.7577000260353088, "learning_rate": 4.650650666569736e-05, "loss": 1.1842, "step": 89300 }, { "epoch": 0.0588, "grad_norm": 1.0623698234558105, "learning_rate": 4.6498412652040137e-05, "loss": 1.2071, "step": 89400 }, { "epoch": 0.059, "grad_norm": 0.9597827792167664, "learning_rate": 4.6490309978824866e-05, "loss": 1.1781, "step": 89500 }, { "epoch": 0.0592, "grad_norm": 1.126639485359192, "learning_rate": 4.6482198649315306e-05, "loss": 1.1897, "step": 89600 }, { "epoch": 0.0594, "grad_norm": 1.1724388599395752, "learning_rate": 4.64740786667787e-05, "loss": 1.1567, "step": 89700 }, { "epoch": 0.0596, "grad_norm": 1.14126718044281, "learning_rate": 4.6465950034485776e-05, "loss": 1.1819, "step": 89800 }, { "epoch": 0.0598, "grad_norm": 0.8016234040260315, "learning_rate": 4.645781275571075e-05, "loss": 1.1906, "step": 89900 }, { "epoch": 0.06, "grad_norm": 1.3095015287399292, "learning_rate": 4.644966683373131e-05, "loss": 1.1976, "step": 90000 }, { "epoch": 0.06, "eval_loss": 1.0730445384979248, "eval_runtime": 76.1401, "eval_samples_per_second": 202.823, "eval_steps_per_second": 3.178, "step": 90000 }, { "epoch": 0.0602, "grad_norm": 0.5794508457183838, "learning_rate": 4.6441512271828626e-05, "loss": 1.1478, "step": 90100 }, { "epoch": 0.0604, "grad_norm": 0.9965047240257263, "learning_rate": 4.6433349073287366e-05, "loss": 1.201, "step": 90200 }, { "epoch": 0.0606, "grad_norm": 1.280166506767273, "learning_rate": 4.642517724139567e-05, "loss": 1.1542, "step": 90300 }, { "epoch": 0.0608, "grad_norm": 0.7828945517539978, "learning_rate": 4.641699677944514e-05, "loss": 1.186, "step": 90400 }, { "epoch": 0.061, "grad_norm": 1.096155047416687, "learning_rate": 4.640880769073087e-05, "loss": 1.1969, "step": 90500 }, { "epoch": 0.0612, "grad_norm": 0.7447170615196228, "learning_rate": 4.6400609978551416e-05, "loss": 1.1482, "step": 90600 }, { "epoch": 0.0614, "grad_norm": 0.8162779808044434, "learning_rate": 4.639240364620882e-05, "loss": 1.2072, "step": 90700 }, { "epoch": 0.0616, "grad_norm": 1.2612018585205078, "learning_rate": 4.638418869700861e-05, "loss": 1.1402, "step": 90800 }, { "epoch": 0.0618, "grad_norm": 0.8543398380279541, "learning_rate": 4.637596513425974e-05, "loss": 1.1718, "step": 90900 }, { "epoch": 0.062, "grad_norm": 1.2375905513763428, "learning_rate": 4.636773296127467e-05, "loss": 1.1587, "step": 91000 }, { "epoch": 0.062, "eval_loss": 1.0713858604431152, "eval_runtime": 76.3385, "eval_samples_per_second": 202.296, "eval_steps_per_second": 3.17, "step": 91000 } ], "logging_steps": 100, "max_steps": 500000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.970894657486848e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }