diff --git "a/checkpoint-9570/trainer_state.json" "b/checkpoint-9570/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-9570/trainer_state.json" @@ -0,0 +1,7109 @@ +{ + "best_global_step": 7600, + "best_metric": 0.7774137258529663, + "best_model_checkpoint": "models/MNLP_M3_rag_model_test/checkpoint-7600", + "epoch": 4.99960830395613, + "eval_steps": 200, + "global_step": 9570, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005222613918266093, + "grad_norm": 11.394719123840332, + "learning_rate": 9.000000000000001e-07, + "loss": 3.5515, + "step": 10 + }, + { + "epoch": 0.010445227836532185, + "grad_norm": 6.919948577880859, + "learning_rate": 1.9000000000000002e-06, + "loss": 3.5008, + "step": 20 + }, + { + "epoch": 0.015667841754798278, + "grad_norm": 7.034717559814453, + "learning_rate": 2.9e-06, + "loss": 3.29, + "step": 30 + }, + { + "epoch": 0.02089045567306437, + "grad_norm": 5.738417625427246, + "learning_rate": 3.900000000000001e-06, + "loss": 3.1144, + "step": 40 + }, + { + "epoch": 0.02611306959133046, + "grad_norm": 6.416561603546143, + "learning_rate": 4.9000000000000005e-06, + "loss": 3.0745, + "step": 50 + }, + { + "epoch": 0.031335683509596556, + "grad_norm": 5.485833168029785, + "learning_rate": 5.9e-06, + "loss": 2.8625, + "step": 60 + }, + { + "epoch": 0.036558297427862645, + "grad_norm": 5.88957405090332, + "learning_rate": 6.9e-06, + "loss": 2.8214, + "step": 70 + }, + { + "epoch": 0.04178091134612874, + "grad_norm": 6.4762654304504395, + "learning_rate": 7.9e-06, + "loss": 2.8076, + "step": 80 + }, + { + "epoch": 0.04700352526439483, + "grad_norm": 6.038456916809082, + "learning_rate": 8.900000000000001e-06, + "loss": 2.7299, + "step": 90 + }, + { + "epoch": 0.05222613918266092, + "grad_norm": 7.74363899230957, + "learning_rate": 9.9e-06, + "loss": 2.5875, + "step": 100 + }, + { + "epoch": 0.057448753100927015, + "grad_norm": 11.824464797973633, + "learning_rate": 9.990496304118268e-06, + "loss": 2.5328, + "step": 110 + }, + { + "epoch": 0.06267136701919311, + "grad_norm": 6.736820697784424, + "learning_rate": 9.979936642027456e-06, + "loss": 2.414, + "step": 120 + }, + { + "epoch": 0.0678939809374592, + "grad_norm": 6.303720474243164, + "learning_rate": 9.969376979936643e-06, + "loss": 2.2947, + "step": 130 + }, + { + "epoch": 0.07311659485572529, + "grad_norm": 6.727591037750244, + "learning_rate": 9.95881731784583e-06, + "loss": 2.2003, + "step": 140 + }, + { + "epoch": 0.07833920877399138, + "grad_norm": 8.046416282653809, + "learning_rate": 9.948257655755017e-06, + "loss": 2.0726, + "step": 150 + }, + { + "epoch": 0.08356182269225748, + "grad_norm": 8.67299747467041, + "learning_rate": 9.937697993664203e-06, + "loss": 2.2524, + "step": 160 + }, + { + "epoch": 0.08878443661052357, + "grad_norm": 7.629809856414795, + "learning_rate": 9.927138331573391e-06, + "loss": 2.0773, + "step": 170 + }, + { + "epoch": 0.09400705052878966, + "grad_norm": 10.00472640991211, + "learning_rate": 9.916578669482577e-06, + "loss": 1.9697, + "step": 180 + }, + { + "epoch": 0.09922966444705575, + "grad_norm": 7.71968412399292, + "learning_rate": 9.907074973600845e-06, + "loss": 2.1101, + "step": 190 + }, + { + "epoch": 0.10445227836532184, + "grad_norm": 6.909250736236572, + "learning_rate": 9.896515311510033e-06, + "loss": 1.866, + "step": 200 + }, + { + "epoch": 0.10445227836532184, + "eval_loss": 2.0226237773895264, + "eval_runtime": 46.6876, + "eval_samples_per_second": 36.455, + "eval_steps_per_second": 4.562, + "step": 200 + }, + { + "epoch": 0.10967489228358794, + "grad_norm": 6.597925186157227, + "learning_rate": 9.88595564941922e-06, + "loss": 1.9556, + "step": 210 + }, + { + "epoch": 0.11489750620185403, + "grad_norm": 9.504620552062988, + "learning_rate": 9.875395987328407e-06, + "loss": 1.9974, + "step": 220 + }, + { + "epoch": 0.12012012012012012, + "grad_norm": 6.907344818115234, + "learning_rate": 9.864836325237593e-06, + "loss": 1.8866, + "step": 230 + }, + { + "epoch": 0.12534273403838622, + "grad_norm": 9.235527038574219, + "learning_rate": 9.85427666314678e-06, + "loss": 1.8387, + "step": 240 + }, + { + "epoch": 0.1305653479566523, + "grad_norm": 7.033239841461182, + "learning_rate": 9.843717001055967e-06, + "loss": 1.7348, + "step": 250 + }, + { + "epoch": 0.1357879618749184, + "grad_norm": 10.7998628616333, + "learning_rate": 9.833157338965154e-06, + "loss": 1.7569, + "step": 260 + }, + { + "epoch": 0.1410105757931845, + "grad_norm": 7.843267917633057, + "learning_rate": 9.82259767687434e-06, + "loss": 1.7569, + "step": 270 + }, + { + "epoch": 0.14623318971145058, + "grad_norm": 6.2468953132629395, + "learning_rate": 9.812038014783528e-06, + "loss": 1.6301, + "step": 280 + }, + { + "epoch": 0.15145580362971667, + "grad_norm": 7.654909133911133, + "learning_rate": 9.801478352692714e-06, + "loss": 1.7127, + "step": 290 + }, + { + "epoch": 0.15667841754798276, + "grad_norm": 7.152418613433838, + "learning_rate": 9.790918690601902e-06, + "loss": 1.8976, + "step": 300 + }, + { + "epoch": 0.16190103146624885, + "grad_norm": 7.338048458099365, + "learning_rate": 9.780359028511088e-06, + "loss": 1.7498, + "step": 310 + }, + { + "epoch": 0.16712364538451496, + "grad_norm": 7.256304740905762, + "learning_rate": 9.769799366420275e-06, + "loss": 1.7501, + "step": 320 + }, + { + "epoch": 0.17234625930278105, + "grad_norm": 10.67475700378418, + "learning_rate": 9.759239704329462e-06, + "loss": 1.6759, + "step": 330 + }, + { + "epoch": 0.17756887322104714, + "grad_norm": 7.884083271026611, + "learning_rate": 9.74868004223865e-06, + "loss": 1.707, + "step": 340 + }, + { + "epoch": 0.18279148713931323, + "grad_norm": 8.517298698425293, + "learning_rate": 9.738120380147837e-06, + "loss": 1.5422, + "step": 350 + }, + { + "epoch": 0.18801410105757932, + "grad_norm": 6.652080059051514, + "learning_rate": 9.727560718057023e-06, + "loss": 1.6762, + "step": 360 + }, + { + "epoch": 0.1932367149758454, + "grad_norm": 6.86594820022583, + "learning_rate": 9.71700105596621e-06, + "loss": 1.5937, + "step": 370 + }, + { + "epoch": 0.1984593288941115, + "grad_norm": 7.43917989730835, + "learning_rate": 9.707497360084478e-06, + "loss": 1.4299, + "step": 380 + }, + { + "epoch": 0.2036819428123776, + "grad_norm": 10.837226867675781, + "learning_rate": 9.696937697993665e-06, + "loss": 1.5797, + "step": 390 + }, + { + "epoch": 0.20890455673064368, + "grad_norm": 10.075883865356445, + "learning_rate": 9.686378035902851e-06, + "loss": 1.5084, + "step": 400 + }, + { + "epoch": 0.20890455673064368, + "eval_loss": 1.625764012336731, + "eval_runtime": 46.2554, + "eval_samples_per_second": 36.796, + "eval_steps_per_second": 4.605, + "step": 400 + }, + { + "epoch": 0.2141271706489098, + "grad_norm": 6.131842613220215, + "learning_rate": 9.675818373812039e-06, + "loss": 1.5666, + "step": 410 + }, + { + "epoch": 0.21934978456717588, + "grad_norm": 8.409153938293457, + "learning_rate": 9.665258711721227e-06, + "loss": 1.6822, + "step": 420 + }, + { + "epoch": 0.22457239848544197, + "grad_norm": 8.761375427246094, + "learning_rate": 9.654699049630413e-06, + "loss": 1.3924, + "step": 430 + }, + { + "epoch": 0.22979501240370806, + "grad_norm": 6.627100944519043, + "learning_rate": 9.6441393875396e-06, + "loss": 1.6737, + "step": 440 + }, + { + "epoch": 0.23501762632197415, + "grad_norm": 9.165101051330566, + "learning_rate": 9.633579725448786e-06, + "loss": 1.4854, + "step": 450 + }, + { + "epoch": 0.24024024024024024, + "grad_norm": 8.033590316772461, + "learning_rate": 9.623020063357974e-06, + "loss": 1.4066, + "step": 460 + }, + { + "epoch": 0.24546285415850633, + "grad_norm": 7.320120811462402, + "learning_rate": 9.612460401267162e-06, + "loss": 1.4489, + "step": 470 + }, + { + "epoch": 0.25068546807677244, + "grad_norm": 6.342758655548096, + "learning_rate": 9.601900739176348e-06, + "loss": 1.4078, + "step": 480 + }, + { + "epoch": 0.2559080819950385, + "grad_norm": 7.489528179168701, + "learning_rate": 9.591341077085534e-06, + "loss": 1.3413, + "step": 490 + }, + { + "epoch": 0.2611306959133046, + "grad_norm": 6.255088806152344, + "learning_rate": 9.58078141499472e-06, + "loss": 1.4824, + "step": 500 + }, + { + "epoch": 0.2663533098315707, + "grad_norm": 6.387566089630127, + "learning_rate": 9.570221752903908e-06, + "loss": 1.2526, + "step": 510 + }, + { + "epoch": 0.2715759237498368, + "grad_norm": 7.210233688354492, + "learning_rate": 9.559662090813095e-06, + "loss": 1.4269, + "step": 520 + }, + { + "epoch": 0.27679853766810286, + "grad_norm": 6.717288017272949, + "learning_rate": 9.549102428722282e-06, + "loss": 1.4725, + "step": 530 + }, + { + "epoch": 0.282021151586369, + "grad_norm": 6.161440372467041, + "learning_rate": 9.538542766631469e-06, + "loss": 1.4743, + "step": 540 + }, + { + "epoch": 0.2872437655046351, + "grad_norm": 6.798609733581543, + "learning_rate": 9.527983104540655e-06, + "loss": 1.4601, + "step": 550 + }, + { + "epoch": 0.29246637942290116, + "grad_norm": 7.112600326538086, + "learning_rate": 9.517423442449843e-06, + "loss": 1.3427, + "step": 560 + }, + { + "epoch": 0.2976889933411673, + "grad_norm": 5.958874225616455, + "learning_rate": 9.50686378035903e-06, + "loss": 1.5848, + "step": 570 + }, + { + "epoch": 0.30291160725943334, + "grad_norm": 8.950939178466797, + "learning_rate": 9.496304118268215e-06, + "loss": 1.3731, + "step": 580 + }, + { + "epoch": 0.30813422117769945, + "grad_norm": 7.173642635345459, + "learning_rate": 9.485744456177403e-06, + "loss": 1.4672, + "step": 590 + }, + { + "epoch": 0.3133568350959655, + "grad_norm": 5.97654390335083, + "learning_rate": 9.47518479408659e-06, + "loss": 1.3207, + "step": 600 + }, + { + "epoch": 0.3133568350959655, + "eval_loss": 1.425838589668274, + "eval_runtime": 46.3712, + "eval_samples_per_second": 36.704, + "eval_steps_per_second": 4.593, + "step": 600 + }, + { + "epoch": 0.31857944901423163, + "grad_norm": 6.612730503082275, + "learning_rate": 9.464625131995778e-06, + "loss": 1.3925, + "step": 610 + }, + { + "epoch": 0.3238020629324977, + "grad_norm": 7.3144049644470215, + "learning_rate": 9.454065469904964e-06, + "loss": 1.494, + "step": 620 + }, + { + "epoch": 0.3290246768507638, + "grad_norm": 5.725366592407227, + "learning_rate": 9.44350580781415e-06, + "loss": 1.4134, + "step": 630 + }, + { + "epoch": 0.3342472907690299, + "grad_norm": 8.623462677001953, + "learning_rate": 9.432946145723338e-06, + "loss": 1.144, + "step": 640 + }, + { + "epoch": 0.339469904687296, + "grad_norm": 6.790163040161133, + "learning_rate": 9.422386483632524e-06, + "loss": 1.4894, + "step": 650 + }, + { + "epoch": 0.3446925186055621, + "grad_norm": 7.366871356964111, + "learning_rate": 9.411826821541712e-06, + "loss": 1.3748, + "step": 660 + }, + { + "epoch": 0.34991513252382817, + "grad_norm": 5.874607086181641, + "learning_rate": 9.401267159450899e-06, + "loss": 1.0795, + "step": 670 + }, + { + "epoch": 0.3551377464420943, + "grad_norm": 6.789367198944092, + "learning_rate": 9.390707497360085e-06, + "loss": 1.2088, + "step": 680 + }, + { + "epoch": 0.36036036036036034, + "grad_norm": 6.885139465332031, + "learning_rate": 9.380147835269273e-06, + "loss": 1.1882, + "step": 690 + }, + { + "epoch": 0.36558297427862646, + "grad_norm": 7.129133224487305, + "learning_rate": 9.369588173178459e-06, + "loss": 1.2315, + "step": 700 + }, + { + "epoch": 0.3708055881968925, + "grad_norm": 6.09841775894165, + "learning_rate": 9.359028511087645e-06, + "loss": 1.1916, + "step": 710 + }, + { + "epoch": 0.37602820211515864, + "grad_norm": 6.911228179931641, + "learning_rate": 9.348468848996833e-06, + "loss": 1.44, + "step": 720 + }, + { + "epoch": 0.38125081603342476, + "grad_norm": 8.852502822875977, + "learning_rate": 9.33790918690602e-06, + "loss": 1.2999, + "step": 730 + }, + { + "epoch": 0.3864734299516908, + "grad_norm": 7.887015342712402, + "learning_rate": 9.327349524815207e-06, + "loss": 1.3509, + "step": 740 + }, + { + "epoch": 0.39169604386995693, + "grad_norm": 5.902195930480957, + "learning_rate": 9.316789862724394e-06, + "loss": 1.2506, + "step": 750 + }, + { + "epoch": 0.396918657788223, + "grad_norm": 5.92559814453125, + "learning_rate": 9.30623020063358e-06, + "loss": 1.1291, + "step": 760 + }, + { + "epoch": 0.4021412717064891, + "grad_norm": 7.447033405303955, + "learning_rate": 9.295670538542766e-06, + "loss": 1.1785, + "step": 770 + }, + { + "epoch": 0.4073638856247552, + "grad_norm": 5.407535552978516, + "learning_rate": 9.285110876451954e-06, + "loss": 1.1236, + "step": 780 + }, + { + "epoch": 0.4125864995430213, + "grad_norm": 5.1882219314575195, + "learning_rate": 9.274551214361142e-06, + "loss": 1.2353, + "step": 790 + }, + { + "epoch": 0.41780911346128735, + "grad_norm": 7.093064785003662, + "learning_rate": 9.263991552270328e-06, + "loss": 1.459, + "step": 800 + }, + { + "epoch": 0.41780911346128735, + "eval_loss": 1.3080272674560547, + "eval_runtime": 46.2478, + "eval_samples_per_second": 36.802, + "eval_steps_per_second": 4.606, + "step": 800 + }, + { + "epoch": 0.42303172737955347, + "grad_norm": 6.416601657867432, + "learning_rate": 9.253431890179515e-06, + "loss": 1.3287, + "step": 810 + }, + { + "epoch": 0.4282543412978196, + "grad_norm": 6.671374797821045, + "learning_rate": 9.242872228088701e-06, + "loss": 1.2637, + "step": 820 + }, + { + "epoch": 0.43347695521608565, + "grad_norm": 6.5349931716918945, + "learning_rate": 9.232312565997889e-06, + "loss": 1.2804, + "step": 830 + }, + { + "epoch": 0.43869956913435176, + "grad_norm": 5.837822437286377, + "learning_rate": 9.221752903907075e-06, + "loss": 1.4397, + "step": 840 + }, + { + "epoch": 0.4439221830526178, + "grad_norm": 6.69824743270874, + "learning_rate": 9.211193241816263e-06, + "loss": 1.2325, + "step": 850 + }, + { + "epoch": 0.44914479697088394, + "grad_norm": 5.331833362579346, + "learning_rate": 9.20063357972545e-06, + "loss": 1.3133, + "step": 860 + }, + { + "epoch": 0.45436741088915, + "grad_norm": 6.8653950691223145, + "learning_rate": 9.190073917634637e-06, + "loss": 1.0374, + "step": 870 + }, + { + "epoch": 0.4595900248074161, + "grad_norm": 6.36031436920166, + "learning_rate": 9.179514255543824e-06, + "loss": 1.0597, + "step": 880 + }, + { + "epoch": 0.4648126387256822, + "grad_norm": 3.890155553817749, + "learning_rate": 9.16895459345301e-06, + "loss": 1.2453, + "step": 890 + }, + { + "epoch": 0.4700352526439483, + "grad_norm": 5.179451942443848, + "learning_rate": 9.158394931362196e-06, + "loss": 1.0745, + "step": 900 + }, + { + "epoch": 0.4752578665622144, + "grad_norm": 7.032919406890869, + "learning_rate": 9.147835269271384e-06, + "loss": 1.1987, + "step": 910 + }, + { + "epoch": 0.4804804804804805, + "grad_norm": 6.223219394683838, + "learning_rate": 9.137275607180572e-06, + "loss": 1.2441, + "step": 920 + }, + { + "epoch": 0.4857030943987466, + "grad_norm": 8.567842483520508, + "learning_rate": 9.126715945089758e-06, + "loss": 1.1818, + "step": 930 + }, + { + "epoch": 0.49092570831701265, + "grad_norm": 5.338006019592285, + "learning_rate": 9.116156282998945e-06, + "loss": 1.1527, + "step": 940 + }, + { + "epoch": 0.49614832223527877, + "grad_norm": 6.287044048309326, + "learning_rate": 9.10559662090813e-06, + "loss": 1.3234, + "step": 950 + }, + { + "epoch": 0.5013709361535449, + "grad_norm": 6.575079441070557, + "learning_rate": 9.095036958817319e-06, + "loss": 1.1989, + "step": 960 + }, + { + "epoch": 0.506593550071811, + "grad_norm": 7.368027687072754, + "learning_rate": 9.084477296726507e-06, + "loss": 1.0985, + "step": 970 + }, + { + "epoch": 0.511816163990077, + "grad_norm": 5.3375020027160645, + "learning_rate": 9.073917634635693e-06, + "loss": 1.1171, + "step": 980 + }, + { + "epoch": 0.5170387779083431, + "grad_norm": 7.050992965698242, + "learning_rate": 9.06335797254488e-06, + "loss": 1.2531, + "step": 990 + }, + { + "epoch": 0.5222613918266092, + "grad_norm": 6.947799205780029, + "learning_rate": 9.052798310454066e-06, + "loss": 1.0675, + "step": 1000 + }, + { + "epoch": 0.5222613918266092, + "eval_loss": 1.2219752073287964, + "eval_runtime": 46.2981, + "eval_samples_per_second": 36.762, + "eval_steps_per_second": 4.601, + "step": 1000 + }, + { + "epoch": 0.5274840057448753, + "grad_norm": 10.056715965270996, + "learning_rate": 9.042238648363253e-06, + "loss": 1.1444, + "step": 1010 + }, + { + "epoch": 0.5327066196631414, + "grad_norm": 6.277551651000977, + "learning_rate": 9.03167898627244e-06, + "loss": 1.118, + "step": 1020 + }, + { + "epoch": 0.5379292335814075, + "grad_norm": 5.285930633544922, + "learning_rate": 9.021119324181626e-06, + "loss": 1.0377, + "step": 1030 + }, + { + "epoch": 0.5431518474996736, + "grad_norm": 5.600802421569824, + "learning_rate": 9.010559662090814e-06, + "loss": 1.0265, + "step": 1040 + }, + { + "epoch": 0.5483744614179397, + "grad_norm": 6.1389007568359375, + "learning_rate": 9e-06, + "loss": 1.1026, + "step": 1050 + }, + { + "epoch": 0.5535970753362057, + "grad_norm": 7.223113536834717, + "learning_rate": 8.989440337909188e-06, + "loss": 1.1998, + "step": 1060 + }, + { + "epoch": 0.5588196892544719, + "grad_norm": 8.13656997680664, + "learning_rate": 8.978880675818374e-06, + "loss": 1.0726, + "step": 1070 + }, + { + "epoch": 0.564042303172738, + "grad_norm": 7.210083484649658, + "learning_rate": 8.96832101372756e-06, + "loss": 1.1418, + "step": 1080 + }, + { + "epoch": 0.569264917091004, + "grad_norm": 7.832534313201904, + "learning_rate": 8.957761351636749e-06, + "loss": 1.1464, + "step": 1090 + }, + { + "epoch": 0.5744875310092702, + "grad_norm": 5.135114669799805, + "learning_rate": 8.947201689545935e-06, + "loss": 1.0915, + "step": 1100 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 8.231823921203613, + "learning_rate": 8.936642027455123e-06, + "loss": 1.1763, + "step": 1110 + }, + { + "epoch": 0.5849327588458023, + "grad_norm": 5.530185699462891, + "learning_rate": 8.926082365364309e-06, + "loss": 1.1993, + "step": 1120 + }, + { + "epoch": 0.5901553727640684, + "grad_norm": 5.953641414642334, + "learning_rate": 8.915522703273495e-06, + "loss": 1.0549, + "step": 1130 + }, + { + "epoch": 0.5953779866823345, + "grad_norm": 5.919338226318359, + "learning_rate": 8.904963041182683e-06, + "loss": 1.0351, + "step": 1140 + }, + { + "epoch": 0.6006006006006006, + "grad_norm": 6.962036609649658, + "learning_rate": 8.89440337909187e-06, + "loss": 1.046, + "step": 1150 + }, + { + "epoch": 0.6058232145188667, + "grad_norm": 5.828774929046631, + "learning_rate": 8.883843717001058e-06, + "loss": 1.087, + "step": 1160 + }, + { + "epoch": 0.6110458284371327, + "grad_norm": 6.581724643707275, + "learning_rate": 8.873284054910244e-06, + "loss": 1.0683, + "step": 1170 + }, + { + "epoch": 0.6162684423553989, + "grad_norm": 7.396463394165039, + "learning_rate": 8.86272439281943e-06, + "loss": 1.1508, + "step": 1180 + }, + { + "epoch": 0.621491056273665, + "grad_norm": 5.524245262145996, + "learning_rate": 8.852164730728618e-06, + "loss": 1.155, + "step": 1190 + }, + { + "epoch": 0.626713670191931, + "grad_norm": 8.499662399291992, + "learning_rate": 8.841605068637804e-06, + "loss": 1.2246, + "step": 1200 + }, + { + "epoch": 0.626713670191931, + "eval_loss": 1.154821753501892, + "eval_runtime": 46.2456, + "eval_samples_per_second": 36.803, + "eval_steps_per_second": 4.606, + "step": 1200 + }, + { + "epoch": 0.6319362841101972, + "grad_norm": 4.054498195648193, + "learning_rate": 8.83104540654699e-06, + "loss": 1.1299, + "step": 1210 + }, + { + "epoch": 0.6371588980284633, + "grad_norm": 6.266629695892334, + "learning_rate": 8.820485744456179e-06, + "loss": 1.0569, + "step": 1220 + }, + { + "epoch": 0.6423815119467293, + "grad_norm": 7.285578727722168, + "learning_rate": 8.809926082365365e-06, + "loss": 1.1132, + "step": 1230 + }, + { + "epoch": 0.6476041258649954, + "grad_norm": 7.280442237854004, + "learning_rate": 8.799366420274553e-06, + "loss": 1.1091, + "step": 1240 + }, + { + "epoch": 0.6528267397832616, + "grad_norm": 6.459787368774414, + "learning_rate": 8.788806758183739e-06, + "loss": 1.1433, + "step": 1250 + }, + { + "epoch": 0.6580493537015276, + "grad_norm": 6.095096588134766, + "learning_rate": 8.778247096092925e-06, + "loss": 1.1457, + "step": 1260 + }, + { + "epoch": 0.6632719676197937, + "grad_norm": 6.624663352966309, + "learning_rate": 8.767687434002112e-06, + "loss": 1.1755, + "step": 1270 + }, + { + "epoch": 0.6684945815380599, + "grad_norm": 5.858925819396973, + "learning_rate": 8.7571277719113e-06, + "loss": 0.9852, + "step": 1280 + }, + { + "epoch": 0.6737171954563259, + "grad_norm": 6.378047943115234, + "learning_rate": 8.746568109820487e-06, + "loss": 0.989, + "step": 1290 + }, + { + "epoch": 0.678939809374592, + "grad_norm": 6.781316757202148, + "learning_rate": 8.736008447729674e-06, + "loss": 0.8984, + "step": 1300 + }, + { + "epoch": 0.684162423292858, + "grad_norm": 5.8333940505981445, + "learning_rate": 8.72544878563886e-06, + "loss": 1.0745, + "step": 1310 + }, + { + "epoch": 0.6893850372111242, + "grad_norm": 5.035146236419678, + "learning_rate": 8.714889123548046e-06, + "loss": 0.9417, + "step": 1320 + }, + { + "epoch": 0.6946076511293903, + "grad_norm": 5.607509613037109, + "learning_rate": 8.704329461457234e-06, + "loss": 1.0106, + "step": 1330 + }, + { + "epoch": 0.6998302650476563, + "grad_norm": 8.404295921325684, + "learning_rate": 8.69376979936642e-06, + "loss": 1.063, + "step": 1340 + }, + { + "epoch": 0.7050528789659224, + "grad_norm": 6.693871021270752, + "learning_rate": 8.683210137275608e-06, + "loss": 0.9467, + "step": 1350 + }, + { + "epoch": 0.7102754928841886, + "grad_norm": 8.556498527526855, + "learning_rate": 8.672650475184795e-06, + "loss": 0.9936, + "step": 1360 + }, + { + "epoch": 0.7154981068024546, + "grad_norm": 6.516254425048828, + "learning_rate": 8.662090813093983e-06, + "loss": 0.958, + "step": 1370 + }, + { + "epoch": 0.7207207207207207, + "grad_norm": 6.926424026489258, + "learning_rate": 8.651531151003169e-06, + "loss": 1.0758, + "step": 1380 + }, + { + "epoch": 0.7259433346389869, + "grad_norm": 5.722439765930176, + "learning_rate": 8.640971488912355e-06, + "loss": 0.987, + "step": 1390 + }, + { + "epoch": 0.7311659485572529, + "grad_norm": 5.813411712646484, + "learning_rate": 8.630411826821541e-06, + "loss": 1.0847, + "step": 1400 + }, + { + "epoch": 0.7311659485572529, + "eval_loss": 1.0959590673446655, + "eval_runtime": 46.2353, + "eval_samples_per_second": 36.812, + "eval_steps_per_second": 4.607, + "step": 1400 + }, + { + "epoch": 0.736388562475519, + "grad_norm": 9.68322467803955, + "learning_rate": 8.61985216473073e-06, + "loss": 1.0947, + "step": 1410 + }, + { + "epoch": 0.741611176393785, + "grad_norm": 5.5112762451171875, + "learning_rate": 8.609292502639917e-06, + "loss": 1.0718, + "step": 1420 + }, + { + "epoch": 0.7468337903120512, + "grad_norm": 6.487595558166504, + "learning_rate": 8.598732840549104e-06, + "loss": 1.008, + "step": 1430 + }, + { + "epoch": 0.7520564042303173, + "grad_norm": 6.72760534286499, + "learning_rate": 8.58817317845829e-06, + "loss": 1.0198, + "step": 1440 + }, + { + "epoch": 0.7572790181485833, + "grad_norm": 6.073751449584961, + "learning_rate": 8.577613516367476e-06, + "loss": 1.0065, + "step": 1450 + }, + { + "epoch": 0.7625016320668495, + "grad_norm": 6.201911449432373, + "learning_rate": 8.567053854276664e-06, + "loss": 1.2082, + "step": 1460 + }, + { + "epoch": 0.7677242459851156, + "grad_norm": 7.030183792114258, + "learning_rate": 8.55649419218585e-06, + "loss": 1.0877, + "step": 1470 + }, + { + "epoch": 0.7729468599033816, + "grad_norm": 6.393901348114014, + "learning_rate": 8.545934530095038e-06, + "loss": 0.9444, + "step": 1480 + }, + { + "epoch": 0.7781694738216477, + "grad_norm": 7.0087571144104, + "learning_rate": 8.535374868004225e-06, + "loss": 0.9064, + "step": 1490 + }, + { + "epoch": 0.7833920877399139, + "grad_norm": 7.8241777420043945, + "learning_rate": 8.52481520591341e-06, + "loss": 1.0552, + "step": 1500 + }, + { + "epoch": 0.7886147016581799, + "grad_norm": 6.263652801513672, + "learning_rate": 8.514255543822599e-06, + "loss": 1.1847, + "step": 1510 + }, + { + "epoch": 0.793837315576446, + "grad_norm": 7.5798797607421875, + "learning_rate": 8.503695881731785e-06, + "loss": 1.076, + "step": 1520 + }, + { + "epoch": 0.799059929494712, + "grad_norm": 5.389642238616943, + "learning_rate": 8.493136219640971e-06, + "loss": 1.0102, + "step": 1530 + }, + { + "epoch": 0.8042825434129782, + "grad_norm": 5.936399459838867, + "learning_rate": 8.48257655755016e-06, + "loss": 1.0314, + "step": 1540 + }, + { + "epoch": 0.8095051573312443, + "grad_norm": 8.437224388122559, + "learning_rate": 8.472016895459345e-06, + "loss": 1.0985, + "step": 1550 + }, + { + "epoch": 0.8147277712495103, + "grad_norm": 5.470308303833008, + "learning_rate": 8.461457233368533e-06, + "loss": 1.0003, + "step": 1560 + }, + { + "epoch": 0.8199503851677765, + "grad_norm": 10.188332557678223, + "learning_rate": 8.45089757127772e-06, + "loss": 1.0165, + "step": 1570 + }, + { + "epoch": 0.8251729990860426, + "grad_norm": 8.477367401123047, + "learning_rate": 8.440337909186906e-06, + "loss": 1.0532, + "step": 1580 + }, + { + "epoch": 0.8303956130043086, + "grad_norm": 4.078097820281982, + "learning_rate": 8.429778247096094e-06, + "loss": 1.0828, + "step": 1590 + }, + { + "epoch": 0.8356182269225747, + "grad_norm": 5.285001277923584, + "learning_rate": 8.41921858500528e-06, + "loss": 0.9931, + "step": 1600 + }, + { + "epoch": 0.8356182269225747, + "eval_loss": 1.0581225156784058, + "eval_runtime": 46.2517, + "eval_samples_per_second": 36.799, + "eval_steps_per_second": 4.605, + "step": 1600 + }, + { + "epoch": 0.8408408408408409, + "grad_norm": 4.965864181518555, + "learning_rate": 8.408658922914468e-06, + "loss": 1.1802, + "step": 1610 + }, + { + "epoch": 0.8460634547591069, + "grad_norm": 6.969324588775635, + "learning_rate": 8.398099260823654e-06, + "loss": 1.0296, + "step": 1620 + }, + { + "epoch": 0.851286068677373, + "grad_norm": 8.806953430175781, + "learning_rate": 8.38753959873284e-06, + "loss": 0.9611, + "step": 1630 + }, + { + "epoch": 0.8565086825956392, + "grad_norm": 7.901791572570801, + "learning_rate": 8.376979936642029e-06, + "loss": 0.9345, + "step": 1640 + }, + { + "epoch": 0.8617312965139052, + "grad_norm": 6.055725574493408, + "learning_rate": 8.366420274551215e-06, + "loss": 1.0747, + "step": 1650 + }, + { + "epoch": 0.8669539104321713, + "grad_norm": 7.076270580291748, + "learning_rate": 8.355860612460403e-06, + "loss": 0.9801, + "step": 1660 + }, + { + "epoch": 0.8721765243504374, + "grad_norm": 5.258996963500977, + "learning_rate": 8.345300950369589e-06, + "loss": 1.0273, + "step": 1670 + }, + { + "epoch": 0.8773991382687035, + "grad_norm": 6.378342151641846, + "learning_rate": 8.334741288278775e-06, + "loss": 0.8198, + "step": 1680 + }, + { + "epoch": 0.8826217521869696, + "grad_norm": 6.257429599761963, + "learning_rate": 8.324181626187963e-06, + "loss": 0.9199, + "step": 1690 + }, + { + "epoch": 0.8878443661052356, + "grad_norm": 8.555800437927246, + "learning_rate": 8.31362196409715e-06, + "loss": 0.9679, + "step": 1700 + }, + { + "epoch": 0.8930669800235017, + "grad_norm": 5.934536933898926, + "learning_rate": 8.303062302006336e-06, + "loss": 0.9922, + "step": 1710 + }, + { + "epoch": 0.8982895939417679, + "grad_norm": 5.284457206726074, + "learning_rate": 8.292502639915522e-06, + "loss": 1.0682, + "step": 1720 + }, + { + "epoch": 0.903512207860034, + "grad_norm": 6.454044342041016, + "learning_rate": 8.28194297782471e-06, + "loss": 1.1901, + "step": 1730 + }, + { + "epoch": 0.9087348217783, + "grad_norm": 6.973818778991699, + "learning_rate": 8.271383315733898e-06, + "loss": 1.01, + "step": 1740 + }, + { + "epoch": 0.9139574356965662, + "grad_norm": 7.165948390960693, + "learning_rate": 8.260823653643084e-06, + "loss": 0.926, + "step": 1750 + }, + { + "epoch": 0.9191800496148322, + "grad_norm": 5.891210556030273, + "learning_rate": 8.25026399155227e-06, + "loss": 1.0527, + "step": 1760 + }, + { + "epoch": 0.9244026635330983, + "grad_norm": 6.440408229827881, + "learning_rate": 8.239704329461457e-06, + "loss": 1.0271, + "step": 1770 + }, + { + "epoch": 0.9296252774513644, + "grad_norm": 6.762996673583984, + "learning_rate": 8.229144667370645e-06, + "loss": 1.1016, + "step": 1780 + }, + { + "epoch": 0.9348478913696305, + "grad_norm": 7.777276515960693, + "learning_rate": 8.218585005279833e-06, + "loss": 1.2933, + "step": 1790 + }, + { + "epoch": 0.9400705052878966, + "grad_norm": 6.9960713386535645, + "learning_rate": 8.208025343189019e-06, + "loss": 1.1484, + "step": 1800 + }, + { + "epoch": 0.9400705052878966, + "eval_loss": 1.0149155855178833, + "eval_runtime": 46.3161, + "eval_samples_per_second": 36.748, + "eval_steps_per_second": 4.599, + "step": 1800 + }, + { + "epoch": 0.9452931192061627, + "grad_norm": 5.822863578796387, + "learning_rate": 8.197465681098205e-06, + "loss": 0.952, + "step": 1810 + }, + { + "epoch": 0.9505157331244288, + "grad_norm": 5.783324718475342, + "learning_rate": 8.186906019007393e-06, + "loss": 0.9129, + "step": 1820 + }, + { + "epoch": 0.9557383470426949, + "grad_norm": 7.200591564178467, + "learning_rate": 8.17634635691658e-06, + "loss": 1.0411, + "step": 1830 + }, + { + "epoch": 0.960960960960961, + "grad_norm": 6.707890033721924, + "learning_rate": 8.165786694825766e-06, + "loss": 1.0755, + "step": 1840 + }, + { + "epoch": 0.966183574879227, + "grad_norm": 4.4951372146606445, + "learning_rate": 8.155227032734954e-06, + "loss": 1.1139, + "step": 1850 + }, + { + "epoch": 0.9714061887974932, + "grad_norm": 7.841273307800293, + "learning_rate": 8.14466737064414e-06, + "loss": 0.9171, + "step": 1860 + }, + { + "epoch": 0.9766288027157592, + "grad_norm": 8.396512985229492, + "learning_rate": 8.134107708553328e-06, + "loss": 0.8624, + "step": 1870 + }, + { + "epoch": 0.9818514166340253, + "grad_norm": 7.903951168060303, + "learning_rate": 8.123548046462514e-06, + "loss": 0.9832, + "step": 1880 + }, + { + "epoch": 0.9870740305522914, + "grad_norm": 5.722747325897217, + "learning_rate": 8.1129883843717e-06, + "loss": 0.9931, + "step": 1890 + }, + { + "epoch": 0.9922966444705575, + "grad_norm": 6.809545993804932, + "learning_rate": 8.102428722280887e-06, + "loss": 0.987, + "step": 1900 + }, + { + "epoch": 0.9975192583888236, + "grad_norm": 5.796718120574951, + "learning_rate": 8.091869060190075e-06, + "loss": 0.9521, + "step": 1910 + }, + { + "epoch": 1.0031335683509597, + "grad_norm": 4.707700252532959, + "learning_rate": 8.081309398099263e-06, + "loss": 1.0939, + "step": 1920 + }, + { + "epoch": 1.0083561822692257, + "grad_norm": 5.458223342895508, + "learning_rate": 8.070749736008449e-06, + "loss": 0.8501, + "step": 1930 + }, + { + "epoch": 1.0135787961874918, + "grad_norm": 7.022110939025879, + "learning_rate": 8.060190073917635e-06, + "loss": 0.8575, + "step": 1940 + }, + { + "epoch": 1.0188014101057579, + "grad_norm": 6.051275730133057, + "learning_rate": 8.049630411826821e-06, + "loss": 0.7803, + "step": 1950 + }, + { + "epoch": 1.024024024024024, + "grad_norm": 13.546333312988281, + "learning_rate": 8.03907074973601e-06, + "loss": 0.691, + "step": 1960 + }, + { + "epoch": 1.0292466379422902, + "grad_norm": 6.829512596130371, + "learning_rate": 8.028511087645196e-06, + "loss": 0.7195, + "step": 1970 + }, + { + "epoch": 1.0344692518605563, + "grad_norm": 6.821556091308594, + "learning_rate": 8.017951425554384e-06, + "loss": 0.7773, + "step": 1980 + }, + { + "epoch": 1.0396918657788223, + "grad_norm": 4.730713844299316, + "learning_rate": 8.00739176346357e-06, + "loss": 0.6783, + "step": 1990 + }, + { + "epoch": 1.0449144796970884, + "grad_norm": 6.354138374328613, + "learning_rate": 7.996832101372756e-06, + "loss": 0.9788, + "step": 2000 + }, + { + "epoch": 1.0449144796970884, + "eval_loss": 0.9896802306175232, + "eval_runtime": 46.2765, + "eval_samples_per_second": 36.779, + "eval_steps_per_second": 4.603, + "step": 2000 + }, + { + "epoch": 1.0501370936153545, + "grad_norm": 6.299434185028076, + "learning_rate": 7.986272439281944e-06, + "loss": 0.7521, + "step": 2010 + }, + { + "epoch": 1.0553597075336205, + "grad_norm": 8.378788948059082, + "learning_rate": 7.97571277719113e-06, + "loss": 0.7983, + "step": 2020 + }, + { + "epoch": 1.0605823214518866, + "grad_norm": 5.674183368682861, + "learning_rate": 7.965153115100317e-06, + "loss": 0.8165, + "step": 2030 + }, + { + "epoch": 1.0658049353701529, + "grad_norm": 5.855607032775879, + "learning_rate": 7.954593453009504e-06, + "loss": 1.0197, + "step": 2040 + }, + { + "epoch": 1.071027549288419, + "grad_norm": 5.06273078918457, + "learning_rate": 7.94403379091869e-06, + "loss": 0.799, + "step": 2050 + }, + { + "epoch": 1.076250163206685, + "grad_norm": 4.809935092926025, + "learning_rate": 7.933474128827879e-06, + "loss": 0.8601, + "step": 2060 + }, + { + "epoch": 1.081472777124951, + "grad_norm": 4.642035007476807, + "learning_rate": 7.922914466737065e-06, + "loss": 0.8228, + "step": 2070 + }, + { + "epoch": 1.086695391043217, + "grad_norm": 6.76859188079834, + "learning_rate": 7.912354804646251e-06, + "loss": 0.8457, + "step": 2080 + }, + { + "epoch": 1.0919180049614832, + "grad_norm": 7.555065155029297, + "learning_rate": 7.901795142555439e-06, + "loss": 0.8369, + "step": 2090 + }, + { + "epoch": 1.0971406188797492, + "grad_norm": 5.144375324249268, + "learning_rate": 7.891235480464627e-06, + "loss": 0.6828, + "step": 2100 + }, + { + "epoch": 1.1023632327980155, + "grad_norm": 6.584686756134033, + "learning_rate": 7.880675818373813e-06, + "loss": 0.7198, + "step": 2110 + }, + { + "epoch": 1.1075858467162816, + "grad_norm": 5.92726469039917, + "learning_rate": 7.870116156283e-06, + "loss": 0.8571, + "step": 2120 + }, + { + "epoch": 1.1128084606345476, + "grad_norm": 5.866957187652588, + "learning_rate": 7.859556494192186e-06, + "loss": 0.8838, + "step": 2130 + }, + { + "epoch": 1.1180310745528137, + "grad_norm": 6.889613151550293, + "learning_rate": 7.848996832101374e-06, + "loss": 0.7764, + "step": 2140 + }, + { + "epoch": 1.1232536884710798, + "grad_norm": 7.770586013793945, + "learning_rate": 7.83843717001056e-06, + "loss": 0.7697, + "step": 2150 + }, + { + "epoch": 1.1284763023893458, + "grad_norm": 6.084799766540527, + "learning_rate": 7.827877507919746e-06, + "loss": 0.8748, + "step": 2160 + }, + { + "epoch": 1.1336989163076119, + "grad_norm": 8.996906280517578, + "learning_rate": 7.817317845828934e-06, + "loss": 0.6519, + "step": 2170 + }, + { + "epoch": 1.1389215302258782, + "grad_norm": 4.936269283294678, + "learning_rate": 7.80675818373812e-06, + "loss": 0.9594, + "step": 2180 + }, + { + "epoch": 1.1441441441441442, + "grad_norm": 5.769779205322266, + "learning_rate": 7.796198521647309e-06, + "loss": 0.7806, + "step": 2190 + }, + { + "epoch": 1.1493667580624103, + "grad_norm": 7.1322808265686035, + "learning_rate": 7.785638859556495e-06, + "loss": 0.8086, + "step": 2200 + }, + { + "epoch": 1.1493667580624103, + "eval_loss": 0.969113826751709, + "eval_runtime": 46.2762, + "eval_samples_per_second": 36.779, + "eval_steps_per_second": 4.603, + "step": 2200 + }, + { + "epoch": 1.1545893719806763, + "grad_norm": 6.716241359710693, + "learning_rate": 7.775079197465681e-06, + "loss": 0.9122, + "step": 2210 + }, + { + "epoch": 1.1598119858989424, + "grad_norm": 5.767160892486572, + "learning_rate": 7.764519535374867e-06, + "loss": 0.7598, + "step": 2220 + }, + { + "epoch": 1.1650345998172085, + "grad_norm": 7.989006519317627, + "learning_rate": 7.753959873284055e-06, + "loss": 0.6704, + "step": 2230 + }, + { + "epoch": 1.1702572137354745, + "grad_norm": 5.272390365600586, + "learning_rate": 7.743400211193243e-06, + "loss": 0.8521, + "step": 2240 + }, + { + "epoch": 1.1754798276537408, + "grad_norm": 6.191717147827148, + "learning_rate": 7.73284054910243e-06, + "loss": 0.8905, + "step": 2250 + }, + { + "epoch": 1.1807024415720069, + "grad_norm": 5.682114124298096, + "learning_rate": 7.722280887011616e-06, + "loss": 0.8386, + "step": 2260 + }, + { + "epoch": 1.185925055490273, + "grad_norm": 6.549655914306641, + "learning_rate": 7.711721224920804e-06, + "loss": 0.691, + "step": 2270 + }, + { + "epoch": 1.191147669408539, + "grad_norm": 6.487022876739502, + "learning_rate": 7.70116156282999e-06, + "loss": 0.7769, + "step": 2280 + }, + { + "epoch": 1.196370283326805, + "grad_norm": 7.281522274017334, + "learning_rate": 7.690601900739178e-06, + "loss": 0.7235, + "step": 2290 + }, + { + "epoch": 1.2015928972450711, + "grad_norm": 7.294795513153076, + "learning_rate": 7.680042238648364e-06, + "loss": 0.7813, + "step": 2300 + }, + { + "epoch": 1.2068155111633372, + "grad_norm": 6.67874813079834, + "learning_rate": 7.66948257655755e-06, + "loss": 0.7375, + "step": 2310 + }, + { + "epoch": 1.2120381250816032, + "grad_norm": 3.83427357673645, + "learning_rate": 7.658922914466738e-06, + "loss": 0.7533, + "step": 2320 + }, + { + "epoch": 1.2172607389998695, + "grad_norm": 5.654359340667725, + "learning_rate": 7.648363252375925e-06, + "loss": 0.8276, + "step": 2330 + }, + { + "epoch": 1.2224833529181356, + "grad_norm": 5.315032482147217, + "learning_rate": 7.637803590285111e-06, + "loss": 0.7584, + "step": 2340 + }, + { + "epoch": 1.2277059668364017, + "grad_norm": 6.630548477172852, + "learning_rate": 7.627243928194299e-06, + "loss": 0.6388, + "step": 2350 + }, + { + "epoch": 1.2329285807546677, + "grad_norm": 5.981212615966797, + "learning_rate": 7.616684266103486e-06, + "loss": 0.8596, + "step": 2360 + }, + { + "epoch": 1.2381511946729338, + "grad_norm": 5.186179161071777, + "learning_rate": 7.606124604012672e-06, + "loss": 0.7127, + "step": 2370 + }, + { + "epoch": 1.2433738085911998, + "grad_norm": 6.0210747718811035, + "learning_rate": 7.595564941921859e-06, + "loss": 0.8277, + "step": 2380 + }, + { + "epoch": 1.248596422509466, + "grad_norm": 6.917499542236328, + "learning_rate": 7.585005279831046e-06, + "loss": 0.6321, + "step": 2390 + }, + { + "epoch": 1.253819036427732, + "grad_norm": 6.423802852630615, + "learning_rate": 7.574445617740233e-06, + "loss": 0.8517, + "step": 2400 + }, + { + "epoch": 1.253819036427732, + "eval_loss": 0.9506328701972961, + "eval_runtime": 46.3505, + "eval_samples_per_second": 36.72, + "eval_steps_per_second": 4.595, + "step": 2400 + }, + { + "epoch": 1.2590416503459982, + "grad_norm": 6.889662742614746, + "learning_rate": 7.563885955649419e-06, + "loss": 0.7603, + "step": 2410 + }, + { + "epoch": 1.2642642642642643, + "grad_norm": 4.960092544555664, + "learning_rate": 7.553326293558607e-06, + "loss": 0.7804, + "step": 2420 + }, + { + "epoch": 1.2694868781825304, + "grad_norm": 5.164410591125488, + "learning_rate": 7.542766631467794e-06, + "loss": 0.6938, + "step": 2430 + }, + { + "epoch": 1.2747094921007964, + "grad_norm": 5.916507720947266, + "learning_rate": 7.53220696937698e-06, + "loss": 0.6964, + "step": 2440 + }, + { + "epoch": 1.2799321060190625, + "grad_norm": 6.940438747406006, + "learning_rate": 7.521647307286167e-06, + "loss": 0.7605, + "step": 2450 + }, + { + "epoch": 1.2851547199373288, + "grad_norm": 6.659502983093262, + "learning_rate": 7.511087645195354e-06, + "loss": 0.8735, + "step": 2460 + }, + { + "epoch": 1.2903773338555946, + "grad_norm": 5.659145355224609, + "learning_rate": 7.500527983104541e-06, + "loss": 0.726, + "step": 2470 + }, + { + "epoch": 1.295599947773861, + "grad_norm": 3.484576463699341, + "learning_rate": 7.489968321013729e-06, + "loss": 0.6992, + "step": 2480 + }, + { + "epoch": 1.300822561692127, + "grad_norm": 6.005791664123535, + "learning_rate": 7.479408658922915e-06, + "loss": 0.8257, + "step": 2490 + }, + { + "epoch": 1.306045175610393, + "grad_norm": 4.005056381225586, + "learning_rate": 7.468848996832102e-06, + "loss": 0.8418, + "step": 2500 + }, + { + "epoch": 1.311267789528659, + "grad_norm": 6.585374355316162, + "learning_rate": 7.458289334741288e-06, + "loss": 0.7168, + "step": 2510 + }, + { + "epoch": 1.3164904034469251, + "grad_norm": 6.849618434906006, + "learning_rate": 7.4477296726504755e-06, + "loss": 0.8348, + "step": 2520 + }, + { + "epoch": 1.3217130173651912, + "grad_norm": 4.997506618499756, + "learning_rate": 7.437170010559663e-06, + "loss": 0.7155, + "step": 2530 + }, + { + "epoch": 1.3269356312834573, + "grad_norm": 6.247817516326904, + "learning_rate": 7.42661034846885e-06, + "loss": 0.7999, + "step": 2540 + }, + { + "epoch": 1.3321582452017235, + "grad_norm": 6.865342617034912, + "learning_rate": 7.416050686378037e-06, + "loss": 0.8178, + "step": 2550 + }, + { + "epoch": 1.3373808591199896, + "grad_norm": 7.5182695388793945, + "learning_rate": 7.405491024287224e-06, + "loss": 0.8535, + "step": 2560 + }, + { + "epoch": 1.3426034730382557, + "grad_norm": 5.786922454833984, + "learning_rate": 7.39493136219641e-06, + "loss": 0.8538, + "step": 2570 + }, + { + "epoch": 1.3478260869565217, + "grad_norm": 5.576653480529785, + "learning_rate": 7.384371700105597e-06, + "loss": 0.6142, + "step": 2580 + }, + { + "epoch": 1.3530487008747878, + "grad_norm": 6.3509135246276855, + "learning_rate": 7.3738120380147835e-06, + "loss": 0.872, + "step": 2590 + }, + { + "epoch": 1.3582713147930539, + "grad_norm": 6.3292131423950195, + "learning_rate": 7.3632523759239715e-06, + "loss": 0.7452, + "step": 2600 + }, + { + "epoch": 1.3582713147930539, + "eval_loss": 0.928360104560852, + "eval_runtime": 46.2783, + "eval_samples_per_second": 36.778, + "eval_steps_per_second": 4.603, + "step": 2600 + }, + { + "epoch": 1.36349392871132, + "grad_norm": 5.916106700897217, + "learning_rate": 7.352692713833159e-06, + "loss": 0.7897, + "step": 2610 + }, + { + "epoch": 1.3687165426295862, + "grad_norm": 6.225005149841309, + "learning_rate": 7.342133051742345e-06, + "loss": 0.7989, + "step": 2620 + }, + { + "epoch": 1.3739391565478523, + "grad_norm": 7.300755500793457, + "learning_rate": 7.331573389651532e-06, + "loss": 0.8093, + "step": 2630 + }, + { + "epoch": 1.3791617704661183, + "grad_norm": 6.355301380157471, + "learning_rate": 7.321013727560718e-06, + "loss": 0.6979, + "step": 2640 + }, + { + "epoch": 1.3843843843843844, + "grad_norm": 6.439295291900635, + "learning_rate": 7.310454065469905e-06, + "loss": 0.6481, + "step": 2650 + }, + { + "epoch": 1.3896069983026504, + "grad_norm": 5.19166374206543, + "learning_rate": 7.299894403379092e-06, + "loss": 0.6853, + "step": 2660 + }, + { + "epoch": 1.3948296122209165, + "grad_norm": 7.574211120605469, + "learning_rate": 7.2893347412882796e-06, + "loss": 0.6786, + "step": 2670 + }, + { + "epoch": 1.4000522261391826, + "grad_norm": 6.01971435546875, + "learning_rate": 7.278775079197467e-06, + "loss": 0.7968, + "step": 2680 + }, + { + "epoch": 1.4052748400574488, + "grad_norm": 4.888395309448242, + "learning_rate": 7.268215417106653e-06, + "loss": 0.8298, + "step": 2690 + }, + { + "epoch": 1.410497453975715, + "grad_norm": 4.738596439361572, + "learning_rate": 7.25765575501584e-06, + "loss": 0.6968, + "step": 2700 + }, + { + "epoch": 1.415720067893981, + "grad_norm": 6.128376483917236, + "learning_rate": 7.247096092925026e-06, + "loss": 0.7189, + "step": 2710 + }, + { + "epoch": 1.420942681812247, + "grad_norm": 6.777405738830566, + "learning_rate": 7.236536430834213e-06, + "loss": 0.7324, + "step": 2720 + }, + { + "epoch": 1.426165295730513, + "grad_norm": 5.1269402503967285, + "learning_rate": 7.225976768743401e-06, + "loss": 0.6326, + "step": 2730 + }, + { + "epoch": 1.4313879096487792, + "grad_norm": 7.080173492431641, + "learning_rate": 7.215417106652588e-06, + "loss": 0.7193, + "step": 2740 + }, + { + "epoch": 1.4366105235670452, + "grad_norm": 6.149571895599365, + "learning_rate": 7.204857444561775e-06, + "loss": 0.815, + "step": 2750 + }, + { + "epoch": 1.4418331374853115, + "grad_norm": 4.2188849449157715, + "learning_rate": 7.194297782470962e-06, + "loss": 0.8526, + "step": 2760 + }, + { + "epoch": 1.4470557514035776, + "grad_norm": 6.189548492431641, + "learning_rate": 7.183738120380148e-06, + "loss": 0.7041, + "step": 2770 + }, + { + "epoch": 1.4522783653218436, + "grad_norm": 8.304208755493164, + "learning_rate": 7.173178458289335e-06, + "loss": 0.8734, + "step": 2780 + }, + { + "epoch": 1.4575009792401097, + "grad_norm": 8.095356941223145, + "learning_rate": 7.162618796198522e-06, + "loss": 0.733, + "step": 2790 + }, + { + "epoch": 1.4627235931583757, + "grad_norm": 5.834177017211914, + "learning_rate": 7.1520591341077094e-06, + "loss": 0.7102, + "step": 2800 + }, + { + "epoch": 1.4627235931583757, + "eval_loss": 0.9070786237716675, + "eval_runtime": 46.2298, + "eval_samples_per_second": 36.816, + "eval_steps_per_second": 4.607, + "step": 2800 + }, + { + "epoch": 1.4679462070766418, + "grad_norm": 7.127483367919922, + "learning_rate": 7.1414994720168965e-06, + "loss": 0.8149, + "step": 2810 + }, + { + "epoch": 1.4731688209949079, + "grad_norm": 5.216626167297363, + "learning_rate": 7.131995776135164e-06, + "loss": 0.7184, + "step": 2820 + }, + { + "epoch": 1.4783914349131742, + "grad_norm": 5.421391487121582, + "learning_rate": 7.121436114044351e-06, + "loss": 0.757, + "step": 2830 + }, + { + "epoch": 1.4836140488314402, + "grad_norm": 5.557046413421631, + "learning_rate": 7.110876451953538e-06, + "loss": 0.7887, + "step": 2840 + }, + { + "epoch": 1.4888366627497063, + "grad_norm": 4.0539870262146, + "learning_rate": 7.1003167898627245e-06, + "loss": 0.7473, + "step": 2850 + }, + { + "epoch": 1.4940592766679723, + "grad_norm": 5.317719459533691, + "learning_rate": 7.0897571277719124e-06, + "loss": 0.7603, + "step": 2860 + }, + { + "epoch": 1.4992818905862384, + "grad_norm": 7.20483922958374, + "learning_rate": 7.079197465681099e-06, + "loss": 0.618, + "step": 2870 + }, + { + "epoch": 1.5045045045045045, + "grad_norm": 5.992430686950684, + "learning_rate": 7.068637803590286e-06, + "loss": 0.7101, + "step": 2880 + }, + { + "epoch": 1.5097271184227705, + "grad_norm": 5.599936008453369, + "learning_rate": 7.058078141499473e-06, + "loss": 0.7421, + "step": 2890 + }, + { + "epoch": 1.5149497323410368, + "grad_norm": 3.7422847747802734, + "learning_rate": 7.047518479408659e-06, + "loss": 0.57, + "step": 2900 + }, + { + "epoch": 1.5201723462593026, + "grad_norm": 6.516021251678467, + "learning_rate": 7.036958817317846e-06, + "loss": 0.7308, + "step": 2910 + }, + { + "epoch": 1.525394960177569, + "grad_norm": 6.263866901397705, + "learning_rate": 7.026399155227034e-06, + "loss": 0.8289, + "step": 2920 + }, + { + "epoch": 1.530617574095835, + "grad_norm": 4.577157974243164, + "learning_rate": 7.0158394931362205e-06, + "loss": 0.8547, + "step": 2930 + }, + { + "epoch": 1.535840188014101, + "grad_norm": 4.392026424407959, + "learning_rate": 7.006335797254489e-06, + "loss": 0.7407, + "step": 2940 + }, + { + "epoch": 1.541062801932367, + "grad_norm": 4.726680755615234, + "learning_rate": 6.995776135163675e-06, + "loss": 0.7082, + "step": 2950 + }, + { + "epoch": 1.5462854158506332, + "grad_norm": 6.287652492523193, + "learning_rate": 6.985216473072862e-06, + "loss": 0.7867, + "step": 2960 + }, + { + "epoch": 1.5515080297688995, + "grad_norm": 4.373517990112305, + "learning_rate": 6.974656810982049e-06, + "loss": 0.6765, + "step": 2970 + }, + { + "epoch": 1.5567306436871653, + "grad_norm": 6.31062126159668, + "learning_rate": 6.9640971488912356e-06, + "loss": 0.6577, + "step": 2980 + }, + { + "epoch": 1.5619532576054316, + "grad_norm": 4.2869415283203125, + "learning_rate": 6.9535374868004235e-06, + "loss": 0.6042, + "step": 2990 + }, + { + "epoch": 1.5671758715236976, + "grad_norm": 4.132930278778076, + "learning_rate": 6.942977824709611e-06, + "loss": 0.7663, + "step": 3000 + }, + { + "epoch": 1.5671758715236976, + "eval_loss": 0.8875888586044312, + "eval_runtime": 46.2332, + "eval_samples_per_second": 36.813, + "eval_steps_per_second": 4.607, + "step": 3000 + }, + { + "epoch": 1.5723984854419637, + "grad_norm": 4.136049270629883, + "learning_rate": 6.932418162618797e-06, + "loss": 0.6862, + "step": 3010 + }, + { + "epoch": 1.5776210993602298, + "grad_norm": 4.138570308685303, + "learning_rate": 6.921858500527984e-06, + "loss": 0.6622, + "step": 3020 + }, + { + "epoch": 1.5828437132784958, + "grad_norm": 6.920501708984375, + "learning_rate": 6.91129883843717e-06, + "loss": 0.6052, + "step": 3030 + }, + { + "epoch": 1.588066327196762, + "grad_norm": 5.639624118804932, + "learning_rate": 6.900739176346357e-06, + "loss": 0.6701, + "step": 3040 + }, + { + "epoch": 1.593288941115028, + "grad_norm": 5.700570106506348, + "learning_rate": 6.890179514255544e-06, + "loss": 0.8079, + "step": 3050 + }, + { + "epoch": 1.5985115550332942, + "grad_norm": 4.964538097381592, + "learning_rate": 6.8796198521647316e-06, + "loss": 0.6963, + "step": 3060 + }, + { + "epoch": 1.6037341689515603, + "grad_norm": 4.319785118103027, + "learning_rate": 6.869060190073919e-06, + "loss": 0.6664, + "step": 3070 + }, + { + "epoch": 1.6089567828698264, + "grad_norm": 6.524580478668213, + "learning_rate": 6.858500527983105e-06, + "loss": 0.7526, + "step": 3080 + }, + { + "epoch": 1.6141793967880924, + "grad_norm": 5.287715911865234, + "learning_rate": 6.847940865892292e-06, + "loss": 0.6971, + "step": 3090 + }, + { + "epoch": 1.6194020107063585, + "grad_norm": 6.37127161026001, + "learning_rate": 6.837381203801478e-06, + "loss": 0.6264, + "step": 3100 + }, + { + "epoch": 1.6246246246246248, + "grad_norm": 5.0084123611450195, + "learning_rate": 6.826821541710665e-06, + "loss": 0.7568, + "step": 3110 + }, + { + "epoch": 1.6298472385428906, + "grad_norm": 4.985651969909668, + "learning_rate": 6.816261879619853e-06, + "loss": 0.7554, + "step": 3120 + }, + { + "epoch": 1.6350698524611569, + "grad_norm": 6.405339241027832, + "learning_rate": 6.80570221752904e-06, + "loss": 0.6628, + "step": 3130 + }, + { + "epoch": 1.640292466379423, + "grad_norm": 5.617925643920898, + "learning_rate": 6.795142555438227e-06, + "loss": 0.6515, + "step": 3140 + }, + { + "epoch": 1.645515080297689, + "grad_norm": 2.850438117980957, + "learning_rate": 6.784582893347413e-06, + "loss": 0.6283, + "step": 3150 + }, + { + "epoch": 1.650737694215955, + "grad_norm": 7.605482578277588, + "learning_rate": 6.7740232312566e-06, + "loss": 0.6881, + "step": 3160 + }, + { + "epoch": 1.6559603081342211, + "grad_norm": 4.773893356323242, + "learning_rate": 6.763463569165787e-06, + "loss": 0.6669, + "step": 3170 + }, + { + "epoch": 1.6611829220524874, + "grad_norm": 4.882478713989258, + "learning_rate": 6.752903907074974e-06, + "loss": 0.6374, + "step": 3180 + }, + { + "epoch": 1.6664055359707532, + "grad_norm": 6.456390380859375, + "learning_rate": 6.7423442449841614e-06, + "loss": 0.7759, + "step": 3190 + }, + { + "epoch": 1.6716281498890195, + "grad_norm": 7.474002361297607, + "learning_rate": 6.7317845828933485e-06, + "loss": 0.6657, + "step": 3200 + }, + { + "epoch": 1.6716281498890195, + "eval_loss": 0.8759788870811462, + "eval_runtime": 46.2024, + "eval_samples_per_second": 36.838, + "eval_steps_per_second": 4.61, + "step": 3200 + }, + { + "epoch": 1.6768507638072856, + "grad_norm": 4.953747272491455, + "learning_rate": 6.721224920802535e-06, + "loss": 0.7443, + "step": 3210 + }, + { + "epoch": 1.6820733777255517, + "grad_norm": 5.401075839996338, + "learning_rate": 6.710665258711722e-06, + "loss": 0.7067, + "step": 3220 + }, + { + "epoch": 1.6872959916438177, + "grad_norm": 5.775487899780273, + "learning_rate": 6.700105596620908e-06, + "loss": 0.78, + "step": 3230 + }, + { + "epoch": 1.6925186055620838, + "grad_norm": 6.578312397003174, + "learning_rate": 6.689545934530095e-06, + "loss": 0.6206, + "step": 3240 + }, + { + "epoch": 1.69774121948035, + "grad_norm": 7.354413986206055, + "learning_rate": 6.678986272439283e-06, + "loss": 0.7265, + "step": 3250 + }, + { + "epoch": 1.702963833398616, + "grad_norm": 6.80817985534668, + "learning_rate": 6.6684266103484695e-06, + "loss": 0.8696, + "step": 3260 + }, + { + "epoch": 1.7081864473168822, + "grad_norm": 6.331092357635498, + "learning_rate": 6.657866948257657e-06, + "loss": 0.7226, + "step": 3270 + }, + { + "epoch": 1.7134090612351482, + "grad_norm": 6.063718795776367, + "learning_rate": 6.647307286166843e-06, + "loss": 0.6339, + "step": 3280 + }, + { + "epoch": 1.7186316751534143, + "grad_norm": 4.693406105041504, + "learning_rate": 6.63674762407603e-06, + "loss": 0.7316, + "step": 3290 + }, + { + "epoch": 1.7238542890716804, + "grad_norm": 6.732961654663086, + "learning_rate": 6.626187961985216e-06, + "loss": 0.7185, + "step": 3300 + }, + { + "epoch": 1.7290769029899464, + "grad_norm": 4.482574939727783, + "learning_rate": 6.615628299894404e-06, + "loss": 0.7814, + "step": 3310 + }, + { + "epoch": 1.7342995169082127, + "grad_norm": 7.299856662750244, + "learning_rate": 6.605068637803591e-06, + "loss": 0.7275, + "step": 3320 + }, + { + "epoch": 1.7395221308264786, + "grad_norm": 4.190903186798096, + "learning_rate": 6.5945089757127776e-06, + "loss": 0.6967, + "step": 3330 + }, + { + "epoch": 1.7447447447447448, + "grad_norm": 5.144697666168213, + "learning_rate": 6.583949313621965e-06, + "loss": 0.7279, + "step": 3340 + }, + { + "epoch": 1.7499673586630107, + "grad_norm": 8.06899642944336, + "learning_rate": 6.573389651531151e-06, + "loss": 0.7133, + "step": 3350 + }, + { + "epoch": 1.755189972581277, + "grad_norm": 5.388707637786865, + "learning_rate": 6.562829989440338e-06, + "loss": 0.7695, + "step": 3360 + }, + { + "epoch": 1.760412586499543, + "grad_norm": 5.485361576080322, + "learning_rate": 6.552270327349526e-06, + "loss": 0.6475, + "step": 3370 + }, + { + "epoch": 1.765635200417809, + "grad_norm": 5.023000717163086, + "learning_rate": 6.541710665258712e-06, + "loss": 0.6575, + "step": 3380 + }, + { + "epoch": 1.7708578143360754, + "grad_norm": 5.406675815582275, + "learning_rate": 6.531151003167899e-06, + "loss": 0.7263, + "step": 3390 + }, + { + "epoch": 1.7760804282543412, + "grad_norm": 3.564267873764038, + "learning_rate": 6.520591341077086e-06, + "loss": 0.598, + "step": 3400 + }, + { + "epoch": 1.7760804282543412, + "eval_loss": 0.8584678769111633, + "eval_runtime": 46.2248, + "eval_samples_per_second": 36.82, + "eval_steps_per_second": 4.608, + "step": 3400 + }, + { + "epoch": 1.7813030421726075, + "grad_norm": 4.055863380432129, + "learning_rate": 6.510031678986273e-06, + "loss": 0.6674, + "step": 3410 + }, + { + "epoch": 1.7865256560908733, + "grad_norm": 5.625813007354736, + "learning_rate": 6.49947201689546e-06, + "loss": 0.8424, + "step": 3420 + }, + { + "epoch": 1.7917482700091396, + "grad_norm": 6.47999906539917, + "learning_rate": 6.488912354804647e-06, + "loss": 0.6039, + "step": 3430 + }, + { + "epoch": 1.7969708839274057, + "grad_norm": 5.702643871307373, + "learning_rate": 6.478352692713834e-06, + "loss": 0.588, + "step": 3440 + }, + { + "epoch": 1.8021934978456717, + "grad_norm": 6.600216388702393, + "learning_rate": 6.467793030623021e-06, + "loss": 0.7704, + "step": 3450 + }, + { + "epoch": 1.807416111763938, + "grad_norm": 5.743258476257324, + "learning_rate": 6.4572333685322074e-06, + "loss": 0.6381, + "step": 3460 + }, + { + "epoch": 1.8126387256822039, + "grad_norm": 7.323511123657227, + "learning_rate": 6.4466737064413945e-06, + "loss": 0.8124, + "step": 3470 + }, + { + "epoch": 1.8178613396004701, + "grad_norm": 4.996503829956055, + "learning_rate": 6.436114044350581e-06, + "loss": 0.6816, + "step": 3480 + }, + { + "epoch": 1.823083953518736, + "grad_norm": 6.126676559448242, + "learning_rate": 6.425554382259768e-06, + "loss": 0.6773, + "step": 3490 + }, + { + "epoch": 1.8283065674370023, + "grad_norm": 4.893184185028076, + "learning_rate": 6.414994720168956e-06, + "loss": 0.684, + "step": 3500 + }, + { + "epoch": 1.8335291813552683, + "grad_norm": 6.5332841873168945, + "learning_rate": 6.404435058078142e-06, + "loss": 0.771, + "step": 3510 + }, + { + "epoch": 1.8387517952735344, + "grad_norm": 8.364972114562988, + "learning_rate": 6.393875395987329e-06, + "loss": 0.7224, + "step": 3520 + }, + { + "epoch": 1.8439744091918004, + "grad_norm": 5.508388042449951, + "learning_rate": 6.3833157338965155e-06, + "loss": 0.7867, + "step": 3530 + }, + { + "epoch": 1.8491970231100665, + "grad_norm": 6.582828044891357, + "learning_rate": 6.372756071805703e-06, + "loss": 0.5697, + "step": 3540 + }, + { + "epoch": 1.8544196370283328, + "grad_norm": 6.311943531036377, + "learning_rate": 6.362196409714889e-06, + "loss": 0.6919, + "step": 3550 + }, + { + "epoch": 1.8596422509465986, + "grad_norm": 8.718938827514648, + "learning_rate": 6.351636747624077e-06, + "loss": 0.685, + "step": 3560 + }, + { + "epoch": 1.864864864864865, + "grad_norm": 7.9847846031188965, + "learning_rate": 6.341077085533264e-06, + "loss": 0.5817, + "step": 3570 + }, + { + "epoch": 1.870087478783131, + "grad_norm": 5.257252216339111, + "learning_rate": 6.33051742344245e-06, + "loss": 0.7489, + "step": 3580 + }, + { + "epoch": 1.875310092701397, + "grad_norm": 6.155455589294434, + "learning_rate": 6.319957761351637e-06, + "loss": 0.6976, + "step": 3590 + }, + { + "epoch": 1.880532706619663, + "grad_norm": 5.958306789398193, + "learning_rate": 6.3093980992608236e-06, + "loss": 0.6385, + "step": 3600 + }, + { + "epoch": 1.880532706619663, + "eval_loss": 0.8434246778488159, + "eval_runtime": 46.2141, + "eval_samples_per_second": 36.829, + "eval_steps_per_second": 4.609, + "step": 3600 + }, + { + "epoch": 1.8857553205379292, + "grad_norm": 4.219689846038818, + "learning_rate": 6.298838437170011e-06, + "loss": 0.6891, + "step": 3610 + }, + { + "epoch": 1.8909779344561954, + "grad_norm": 6.840215682983398, + "learning_rate": 6.288278775079199e-06, + "loss": 0.5082, + "step": 3620 + }, + { + "epoch": 1.8962005483744613, + "grad_norm": 4.201242923736572, + "learning_rate": 6.277719112988385e-06, + "loss": 0.6311, + "step": 3630 + }, + { + "epoch": 1.9014231622927276, + "grad_norm": 4.635916709899902, + "learning_rate": 6.267159450897572e-06, + "loss": 0.5849, + "step": 3640 + }, + { + "epoch": 1.9066457762109936, + "grad_norm": 9.082616806030273, + "learning_rate": 6.256599788806758e-06, + "loss": 0.7785, + "step": 3650 + }, + { + "epoch": 1.9118683901292597, + "grad_norm": 7.005586624145508, + "learning_rate": 6.246040126715945e-06, + "loss": 0.5691, + "step": 3660 + }, + { + "epoch": 1.9170910040475257, + "grad_norm": 6.064583778381348, + "learning_rate": 6.2354804646251325e-06, + "loss": 0.7137, + "step": 3670 + }, + { + "epoch": 1.9223136179657918, + "grad_norm": 8.23308277130127, + "learning_rate": 6.2249208025343196e-06, + "loss": 0.7679, + "step": 3680 + }, + { + "epoch": 1.927536231884058, + "grad_norm": 3.684671401977539, + "learning_rate": 6.214361140443507e-06, + "loss": 0.652, + "step": 3690 + }, + { + "epoch": 1.932758845802324, + "grad_norm": 3.7538697719573975, + "learning_rate": 6.203801478352694e-06, + "loss": 0.7469, + "step": 3700 + }, + { + "epoch": 1.9379814597205902, + "grad_norm": 6.125692844390869, + "learning_rate": 6.19324181626188e-06, + "loss": 0.775, + "step": 3710 + }, + { + "epoch": 1.9432040736388563, + "grad_norm": 6.9215989112854, + "learning_rate": 6.182682154171067e-06, + "loss": 0.7417, + "step": 3720 + }, + { + "epoch": 1.9484266875571223, + "grad_norm": 5.37251615524292, + "learning_rate": 6.172122492080253e-06, + "loss": 0.6806, + "step": 3730 + }, + { + "epoch": 1.9536493014753884, + "grad_norm": 6.892059803009033, + "learning_rate": 6.1615628299894405e-06, + "loss": 0.72, + "step": 3740 + }, + { + "epoch": 1.9588719153936545, + "grad_norm": 4.601454257965088, + "learning_rate": 6.1510031678986285e-06, + "loss": 0.5358, + "step": 3750 + }, + { + "epoch": 1.9640945293119207, + "grad_norm": 6.319756984710693, + "learning_rate": 6.140443505807815e-06, + "loss": 0.6178, + "step": 3760 + }, + { + "epoch": 1.9693171432301866, + "grad_norm": 5.619534015655518, + "learning_rate": 6.129883843717002e-06, + "loss": 0.6324, + "step": 3770 + }, + { + "epoch": 1.9745397571484529, + "grad_norm": 6.824273109436035, + "learning_rate": 6.119324181626188e-06, + "loss": 0.6543, + "step": 3780 + }, + { + "epoch": 1.979762371066719, + "grad_norm": 4.03241491317749, + "learning_rate": 6.108764519535375e-06, + "loss": 0.6577, + "step": 3790 + }, + { + "epoch": 1.984984984984985, + "grad_norm": 8.365711212158203, + "learning_rate": 6.0982048574445615e-06, + "loss": 0.7096, + "step": 3800 + }, + { + "epoch": 1.984984984984985, + "eval_loss": 0.8318305611610413, + "eval_runtime": 46.2366, + "eval_samples_per_second": 36.811, + "eval_steps_per_second": 4.607, + "step": 3800 + }, + { + "epoch": 1.990207598903251, + "grad_norm": 4.457459926605225, + "learning_rate": 6.0876451953537494e-06, + "loss": 0.7665, + "step": 3810 + }, + { + "epoch": 1.9954302128215171, + "grad_norm": 5.687203407287598, + "learning_rate": 6.0770855332629365e-06, + "loss": 0.6722, + "step": 3820 + }, + { + "epoch": 2.001044522783653, + "grad_norm": 6.511070251464844, + "learning_rate": 6.066525871172123e-06, + "loss": 0.7669, + "step": 3830 + }, + { + "epoch": 2.0062671367019194, + "grad_norm": 5.137298583984375, + "learning_rate": 6.05596620908131e-06, + "loss": 0.5114, + "step": 3840 + }, + { + "epoch": 2.011489750620185, + "grad_norm": 7.063135623931885, + "learning_rate": 6.045406546990496e-06, + "loss": 0.5532, + "step": 3850 + }, + { + "epoch": 2.0167123645384515, + "grad_norm": 4.833804607391357, + "learning_rate": 6.034846884899683e-06, + "loss": 0.4304, + "step": 3860 + }, + { + "epoch": 2.0219349784567178, + "grad_norm": 6.820064544677734, + "learning_rate": 6.024287222808871e-06, + "loss": 0.5066, + "step": 3870 + }, + { + "epoch": 2.0271575923749836, + "grad_norm": 6.833749771118164, + "learning_rate": 6.0137275607180575e-06, + "loss": 0.5043, + "step": 3880 + }, + { + "epoch": 2.03238020629325, + "grad_norm": 4.371280670166016, + "learning_rate": 6.003167898627245e-06, + "loss": 0.6015, + "step": 3890 + }, + { + "epoch": 2.0376028202115157, + "grad_norm": 4.963273048400879, + "learning_rate": 5.992608236536432e-06, + "loss": 0.5881, + "step": 3900 + }, + { + "epoch": 2.042825434129782, + "grad_norm": 5.74879264831543, + "learning_rate": 5.982048574445618e-06, + "loss": 0.5776, + "step": 3910 + }, + { + "epoch": 2.048048048048048, + "grad_norm": 3.3885014057159424, + "learning_rate": 5.971488912354805e-06, + "loss": 0.6508, + "step": 3920 + }, + { + "epoch": 2.053270661966314, + "grad_norm": 3.875781774520874, + "learning_rate": 5.960929250263991e-06, + "loss": 0.5135, + "step": 3930 + }, + { + "epoch": 2.0584932758845804, + "grad_norm": 5.869786739349365, + "learning_rate": 5.950369588173179e-06, + "loss": 0.5343, + "step": 3940 + }, + { + "epoch": 2.0637158898028463, + "grad_norm": 5.666866779327393, + "learning_rate": 5.939809926082366e-06, + "loss": 0.5258, + "step": 3950 + }, + { + "epoch": 2.0689385037211125, + "grad_norm": 5.5713276863098145, + "learning_rate": 5.929250263991553e-06, + "loss": 0.6209, + "step": 3960 + }, + { + "epoch": 2.0741611176393784, + "grad_norm": 5.73265266418457, + "learning_rate": 5.91869060190074e-06, + "loss": 0.4707, + "step": 3970 + }, + { + "epoch": 2.0793837315576447, + "grad_norm": 5.312356948852539, + "learning_rate": 5.908130939809926e-06, + "loss": 0.5429, + "step": 3980 + }, + { + "epoch": 2.0846063454759105, + "grad_norm": 5.636459827423096, + "learning_rate": 5.897571277719113e-06, + "loss": 0.5524, + "step": 3990 + }, + { + "epoch": 2.089828959394177, + "grad_norm": 5.541628360748291, + "learning_rate": 5.887011615628301e-06, + "loss": 0.6338, + "step": 4000 + }, + { + "epoch": 2.089828959394177, + "eval_loss": 0.8418287038803101, + "eval_runtime": 46.2756, + "eval_samples_per_second": 36.78, + "eval_steps_per_second": 4.603, + "step": 4000 + }, + { + "epoch": 2.095051573312443, + "grad_norm": 7.190380096435547, + "learning_rate": 5.876451953537487e-06, + "loss": 0.5818, + "step": 4010 + }, + { + "epoch": 2.100274187230709, + "grad_norm": 6.190309047698975, + "learning_rate": 5.8658922914466745e-06, + "loss": 0.5268, + "step": 4020 + }, + { + "epoch": 2.105496801148975, + "grad_norm": 5.186282157897949, + "learning_rate": 5.855332629355861e-06, + "loss": 0.4396, + "step": 4030 + }, + { + "epoch": 2.110719415067241, + "grad_norm": 3.6927499771118164, + "learning_rate": 5.844772967265048e-06, + "loss": 0.6601, + "step": 4040 + }, + { + "epoch": 2.1159420289855073, + "grad_norm": 5.906070232391357, + "learning_rate": 5.834213305174234e-06, + "loss": 0.5902, + "step": 4050 + }, + { + "epoch": 2.121164642903773, + "grad_norm": 5.765960216522217, + "learning_rate": 5.823653643083422e-06, + "loss": 0.6127, + "step": 4060 + }, + { + "epoch": 2.1263872568220394, + "grad_norm": 6.663849830627441, + "learning_rate": 5.813093980992609e-06, + "loss": 0.5044, + "step": 4070 + }, + { + "epoch": 2.1316098707403057, + "grad_norm": 5.963075637817383, + "learning_rate": 5.8025343189017954e-06, + "loss": 0.4473, + "step": 4080 + }, + { + "epoch": 2.1368324846585716, + "grad_norm": 5.396392345428467, + "learning_rate": 5.7919746568109825e-06, + "loss": 0.4896, + "step": 4090 + }, + { + "epoch": 2.142055098576838, + "grad_norm": 6.087408542633057, + "learning_rate": 5.781414994720169e-06, + "loss": 0.5368, + "step": 4100 + }, + { + "epoch": 2.1472777124951037, + "grad_norm": 4.697368144989014, + "learning_rate": 5.770855332629356e-06, + "loss": 0.6034, + "step": 4110 + }, + { + "epoch": 2.15250032641337, + "grad_norm": 5.101240634918213, + "learning_rate": 5.760295670538544e-06, + "loss": 0.4699, + "step": 4120 + }, + { + "epoch": 2.157722940331636, + "grad_norm": 6.5133891105651855, + "learning_rate": 5.74973600844773e-06, + "loss": 0.4589, + "step": 4130 + }, + { + "epoch": 2.162945554249902, + "grad_norm": 7.932409763336182, + "learning_rate": 5.739176346356917e-06, + "loss": 0.5443, + "step": 4140 + }, + { + "epoch": 2.1681681681681684, + "grad_norm": 4.897655010223389, + "learning_rate": 5.728616684266104e-06, + "loss": 0.5367, + "step": 4150 + }, + { + "epoch": 2.173390782086434, + "grad_norm": 6.237987041473389, + "learning_rate": 5.718057022175291e-06, + "loss": 0.5477, + "step": 4160 + }, + { + "epoch": 2.1786133960047005, + "grad_norm": 5.690924167633057, + "learning_rate": 5.707497360084478e-06, + "loss": 0.6216, + "step": 4170 + }, + { + "epoch": 2.1838360099229663, + "grad_norm": 5.274245738983154, + "learning_rate": 5.696937697993664e-06, + "loss": 0.5441, + "step": 4180 + }, + { + "epoch": 2.1890586238412326, + "grad_norm": 6.222249984741211, + "learning_rate": 5.686378035902852e-06, + "loss": 0.5579, + "step": 4190 + }, + { + "epoch": 2.1942812377594985, + "grad_norm": 6.638361930847168, + "learning_rate": 5.675818373812039e-06, + "loss": 0.6108, + "step": 4200 + }, + { + "epoch": 2.1942812377594985, + "eval_loss": 0.8388937711715698, + "eval_runtime": 46.2559, + "eval_samples_per_second": 36.795, + "eval_steps_per_second": 4.605, + "step": 4200 + }, + { + "epoch": 2.1995038516777647, + "grad_norm": 5.303590297698975, + "learning_rate": 5.665258711721225e-06, + "loss": 0.5602, + "step": 4210 + }, + { + "epoch": 2.204726465596031, + "grad_norm": 4.8176727294921875, + "learning_rate": 5.654699049630412e-06, + "loss": 0.5329, + "step": 4220 + }, + { + "epoch": 2.209949079514297, + "grad_norm": 7.120988368988037, + "learning_rate": 5.644139387539599e-06, + "loss": 0.4963, + "step": 4230 + }, + { + "epoch": 2.215171693432563, + "grad_norm": 5.514588832855225, + "learning_rate": 5.633579725448786e-06, + "loss": 0.6124, + "step": 4240 + }, + { + "epoch": 2.220394307350829, + "grad_norm": 5.315512657165527, + "learning_rate": 5.623020063357974e-06, + "loss": 0.5925, + "step": 4250 + }, + { + "epoch": 2.2256169212690953, + "grad_norm": 5.721431732177734, + "learning_rate": 5.61246040126716e-06, + "loss": 0.6081, + "step": 4260 + }, + { + "epoch": 2.230839535187361, + "grad_norm": 7.8369832038879395, + "learning_rate": 5.601900739176347e-06, + "loss": 0.4506, + "step": 4270 + }, + { + "epoch": 2.2360621491056274, + "grad_norm": 5.328855514526367, + "learning_rate": 5.591341077085533e-06, + "loss": 0.61, + "step": 4280 + }, + { + "epoch": 2.2412847630238932, + "grad_norm": 4.6545891761779785, + "learning_rate": 5.5807814149947205e-06, + "loss": 0.5102, + "step": 4290 + }, + { + "epoch": 2.2465073769421595, + "grad_norm": 4.399157524108887, + "learning_rate": 5.570221752903907e-06, + "loss": 0.5495, + "step": 4300 + }, + { + "epoch": 2.251729990860426, + "grad_norm": 8.38592529296875, + "learning_rate": 5.559662090813095e-06, + "loss": 0.5832, + "step": 4310 + }, + { + "epoch": 2.2569526047786916, + "grad_norm": 3.9857163429260254, + "learning_rate": 5.549102428722282e-06, + "loss": 0.4725, + "step": 4320 + }, + { + "epoch": 2.262175218696958, + "grad_norm": 5.648230075836182, + "learning_rate": 5.538542766631468e-06, + "loss": 0.5457, + "step": 4330 + }, + { + "epoch": 2.2673978326152238, + "grad_norm": 3.6229002475738525, + "learning_rate": 5.527983104540655e-06, + "loss": 0.5997, + "step": 4340 + }, + { + "epoch": 2.27262044653349, + "grad_norm": 6.094500541687012, + "learning_rate": 5.517423442449842e-06, + "loss": 0.5271, + "step": 4350 + }, + { + "epoch": 2.2778430604517563, + "grad_norm": 7.447030544281006, + "learning_rate": 5.5068637803590285e-06, + "loss": 0.6592, + "step": 4360 + }, + { + "epoch": 2.283065674370022, + "grad_norm": 6.060546398162842, + "learning_rate": 5.4963041182682165e-06, + "loss": 0.5148, + "step": 4370 + }, + { + "epoch": 2.2882882882882885, + "grad_norm": 6.3843092918396, + "learning_rate": 5.485744456177403e-06, + "loss": 0.5469, + "step": 4380 + }, + { + "epoch": 2.2935109022065543, + "grad_norm": 5.431898593902588, + "learning_rate": 5.47518479408659e-06, + "loss": 0.5922, + "step": 4390 + }, + { + "epoch": 2.2987335161248206, + "grad_norm": 7.113710403442383, + "learning_rate": 5.464625131995777e-06, + "loss": 0.6052, + "step": 4400 + }, + { + "epoch": 2.2987335161248206, + "eval_loss": 0.8255796432495117, + "eval_runtime": 46.2225, + "eval_samples_per_second": 36.822, + "eval_steps_per_second": 4.608, + "step": 4400 + }, + { + "epoch": 2.3039561300430864, + "grad_norm": 5.431846618652344, + "learning_rate": 5.454065469904963e-06, + "loss": 0.5565, + "step": 4410 + }, + { + "epoch": 2.3091787439613527, + "grad_norm": 4.409207344055176, + "learning_rate": 5.44350580781415e-06, + "loss": 0.6042, + "step": 4420 + }, + { + "epoch": 2.3144013578796185, + "grad_norm": 6.498474597930908, + "learning_rate": 5.432946145723337e-06, + "loss": 0.526, + "step": 4430 + }, + { + "epoch": 2.319623971797885, + "grad_norm": 5.55870246887207, + "learning_rate": 5.4223864836325246e-06, + "loss": 0.6467, + "step": 4440 + }, + { + "epoch": 2.324846585716151, + "grad_norm": 5.727528095245361, + "learning_rate": 5.411826821541712e-06, + "loss": 0.5701, + "step": 4450 + }, + { + "epoch": 2.330069199634417, + "grad_norm": 5.781177520751953, + "learning_rate": 5.401267159450898e-06, + "loss": 0.5244, + "step": 4460 + }, + { + "epoch": 2.3352918135526832, + "grad_norm": 3.1030118465423584, + "learning_rate": 5.390707497360085e-06, + "loss": 0.4523, + "step": 4470 + }, + { + "epoch": 2.340514427470949, + "grad_norm": 3.780679941177368, + "learning_rate": 5.380147835269271e-06, + "loss": 0.5043, + "step": 4480 + }, + { + "epoch": 2.3457370413892153, + "grad_norm": 7.008651256561279, + "learning_rate": 5.369588173178458e-06, + "loss": 0.4744, + "step": 4490 + }, + { + "epoch": 2.3509596553074816, + "grad_norm": 5.0416693687438965, + "learning_rate": 5.359028511087646e-06, + "loss": 0.485, + "step": 4500 + }, + { + "epoch": 2.3561822692257475, + "grad_norm": 6.809351921081543, + "learning_rate": 5.348468848996833e-06, + "loss": 0.6127, + "step": 4510 + }, + { + "epoch": 2.3614048831440138, + "grad_norm": 5.149819374084473, + "learning_rate": 5.33790918690602e-06, + "loss": 0.5398, + "step": 4520 + }, + { + "epoch": 2.3666274970622796, + "grad_norm": 6.1692962646484375, + "learning_rate": 5.327349524815206e-06, + "loss": 0.5371, + "step": 4530 + }, + { + "epoch": 2.371850110980546, + "grad_norm": 7.224261283874512, + "learning_rate": 5.316789862724393e-06, + "loss": 0.5493, + "step": 4540 + }, + { + "epoch": 2.3770727248988117, + "grad_norm": 6.257209777832031, + "learning_rate": 5.306230200633579e-06, + "loss": 0.5177, + "step": 4550 + }, + { + "epoch": 2.382295338817078, + "grad_norm": 5.789205074310303, + "learning_rate": 5.295670538542767e-06, + "loss": 0.5016, + "step": 4560 + }, + { + "epoch": 2.387517952735344, + "grad_norm": 6.116456508636475, + "learning_rate": 5.285110876451954e-06, + "loss": 0.6088, + "step": 4570 + }, + { + "epoch": 2.39274056665361, + "grad_norm": 5.818353176116943, + "learning_rate": 5.274551214361141e-06, + "loss": 0.5105, + "step": 4580 + }, + { + "epoch": 2.3979631805718764, + "grad_norm": 5.000683784484863, + "learning_rate": 5.263991552270328e-06, + "loss": 0.5899, + "step": 4590 + }, + { + "epoch": 2.4031857944901422, + "grad_norm": 6.238855838775635, + "learning_rate": 5.253431890179515e-06, + "loss": 0.6326, + "step": 4600 + }, + { + "epoch": 2.4031857944901422, + "eval_loss": 0.8141046166419983, + "eval_runtime": 46.2243, + "eval_samples_per_second": 36.82, + "eval_steps_per_second": 4.608, + "step": 4600 + }, + { + "epoch": 2.4084084084084085, + "grad_norm": 4.040828704833984, + "learning_rate": 5.242872228088701e-06, + "loss": 0.4504, + "step": 4610 + }, + { + "epoch": 2.4136310223266744, + "grad_norm": 4.401372909545898, + "learning_rate": 5.232312565997888e-06, + "loss": 0.4497, + "step": 4620 + }, + { + "epoch": 2.4188536362449407, + "grad_norm": 5.584476947784424, + "learning_rate": 5.221752903907075e-06, + "loss": 0.6214, + "step": 4630 + }, + { + "epoch": 2.4240762501632065, + "grad_norm": 3.0335025787353516, + "learning_rate": 5.2111932418162625e-06, + "loss": 0.5114, + "step": 4640 + }, + { + "epoch": 2.4292988640814728, + "grad_norm": 6.4493727684021, + "learning_rate": 5.20063357972545e-06, + "loss": 0.6065, + "step": 4650 + }, + { + "epoch": 2.434521477999739, + "grad_norm": 4.674168109893799, + "learning_rate": 5.190073917634636e-06, + "loss": 0.4778, + "step": 4660 + }, + { + "epoch": 2.439744091918005, + "grad_norm": 6.156538963317871, + "learning_rate": 5.179514255543823e-06, + "loss": 0.489, + "step": 4670 + }, + { + "epoch": 2.444966705836271, + "grad_norm": 5.803397178649902, + "learning_rate": 5.168954593453009e-06, + "loss": 0.5948, + "step": 4680 + }, + { + "epoch": 2.450189319754537, + "grad_norm": 6.375555992126465, + "learning_rate": 5.158394931362197e-06, + "loss": 0.4768, + "step": 4690 + }, + { + "epoch": 2.4554119336728033, + "grad_norm": 6.442558288574219, + "learning_rate": 5.147835269271384e-06, + "loss": 0.4996, + "step": 4700 + }, + { + "epoch": 2.460634547591069, + "grad_norm": 8.586763381958008, + "learning_rate": 5.1372756071805705e-06, + "loss": 0.7869, + "step": 4710 + }, + { + "epoch": 2.4658571615093354, + "grad_norm": 6.680276393890381, + "learning_rate": 5.126715945089758e-06, + "loss": 0.5526, + "step": 4720 + }, + { + "epoch": 2.4710797754276017, + "grad_norm": 3.914189100265503, + "learning_rate": 5.116156282998944e-06, + "loss": 0.6078, + "step": 4730 + }, + { + "epoch": 2.4763023893458675, + "grad_norm": 7.448416233062744, + "learning_rate": 5.105596620908131e-06, + "loss": 0.4982, + "step": 4740 + }, + { + "epoch": 2.481525003264134, + "grad_norm": 7.687001705169678, + "learning_rate": 5.095036958817319e-06, + "loss": 0.6684, + "step": 4750 + }, + { + "epoch": 2.4867476171823997, + "grad_norm": 6.067854404449463, + "learning_rate": 5.084477296726505e-06, + "loss": 0.5189, + "step": 4760 + }, + { + "epoch": 2.491970231100666, + "grad_norm": 5.405977725982666, + "learning_rate": 5.073917634635692e-06, + "loss": 0.5212, + "step": 4770 + }, + { + "epoch": 2.497192845018932, + "grad_norm": 7.773893356323242, + "learning_rate": 5.063357972544879e-06, + "loss": 0.6681, + "step": 4780 + }, + { + "epoch": 2.502415458937198, + "grad_norm": 9.031746864318848, + "learning_rate": 5.052798310454066e-06, + "loss": 0.6579, + "step": 4790 + }, + { + "epoch": 2.507638072855464, + "grad_norm": 5.8720245361328125, + "learning_rate": 5.042238648363252e-06, + "loss": 0.5891, + "step": 4800 + }, + { + "epoch": 2.507638072855464, + "eval_loss": 0.8107092380523682, + "eval_runtime": 46.2852, + "eval_samples_per_second": 36.772, + "eval_steps_per_second": 4.602, + "step": 4800 + }, + { + "epoch": 2.51286068677373, + "grad_norm": 4.402786731719971, + "learning_rate": 5.03167898627244e-06, + "loss": 0.4713, + "step": 4810 + }, + { + "epoch": 2.5180833006919965, + "grad_norm": 5.443326473236084, + "learning_rate": 5.021119324181627e-06, + "loss": 0.6362, + "step": 4820 + }, + { + "epoch": 2.5233059146102623, + "grad_norm": 6.188055515289307, + "learning_rate": 5.010559662090813e-06, + "loss": 0.6042, + "step": 4830 + }, + { + "epoch": 2.5285285285285286, + "grad_norm": 5.5944600105285645, + "learning_rate": 5e-06, + "loss": 0.5093, + "step": 4840 + }, + { + "epoch": 2.5337511424467944, + "grad_norm": 6.214510917663574, + "learning_rate": 4.9894403379091875e-06, + "loss": 0.5405, + "step": 4850 + }, + { + "epoch": 2.5389737563650607, + "grad_norm": 4.829537868499756, + "learning_rate": 4.978880675818375e-06, + "loss": 0.52, + "step": 4860 + }, + { + "epoch": 2.544196370283327, + "grad_norm": 5.498637676239014, + "learning_rate": 4.968321013727561e-06, + "loss": 0.4998, + "step": 4870 + }, + { + "epoch": 2.549418984201593, + "grad_norm": 6.1551361083984375, + "learning_rate": 4.957761351636748e-06, + "loss": 0.5842, + "step": 4880 + }, + { + "epoch": 2.554641598119859, + "grad_norm": 5.512228488922119, + "learning_rate": 4.947201689545935e-06, + "loss": 0.5877, + "step": 4890 + }, + { + "epoch": 2.559864212038125, + "grad_norm": 6.113735675811768, + "learning_rate": 4.936642027455122e-06, + "loss": 0.4979, + "step": 4900 + }, + { + "epoch": 2.5650868259563913, + "grad_norm": 9.69180965423584, + "learning_rate": 4.9260823653643085e-06, + "loss": 0.5595, + "step": 4910 + }, + { + "epoch": 2.5703094398746575, + "grad_norm": 6.716381072998047, + "learning_rate": 4.915522703273496e-06, + "loss": 0.577, + "step": 4920 + }, + { + "epoch": 2.5755320537929234, + "grad_norm": 6.1616926193237305, + "learning_rate": 4.904963041182683e-06, + "loss": 0.4457, + "step": 4930 + }, + { + "epoch": 2.580754667711189, + "grad_norm": 5.395188331604004, + "learning_rate": 4.894403379091869e-06, + "loss": 0.4934, + "step": 4940 + }, + { + "epoch": 2.5859772816294555, + "grad_norm": 5.993736743927002, + "learning_rate": 4.883843717001057e-06, + "loss": 0.5133, + "step": 4950 + }, + { + "epoch": 2.591199895547722, + "grad_norm": 4.6023054122924805, + "learning_rate": 4.873284054910243e-06, + "loss": 0.5084, + "step": 4960 + }, + { + "epoch": 2.5964225094659876, + "grad_norm": 4.444733619689941, + "learning_rate": 4.86272439281943e-06, + "loss": 0.5717, + "step": 4970 + }, + { + "epoch": 2.601645123384254, + "grad_norm": 6.304750442504883, + "learning_rate": 4.8521647307286165e-06, + "loss": 0.5583, + "step": 4980 + }, + { + "epoch": 2.6068677373025197, + "grad_norm": 5.222369194030762, + "learning_rate": 4.8416050686378045e-06, + "loss": 0.5425, + "step": 4990 + }, + { + "epoch": 2.612090351220786, + "grad_norm": 5.956515789031982, + "learning_rate": 4.831045406546991e-06, + "loss": 0.5626, + "step": 5000 + }, + { + "epoch": 2.612090351220786, + "eval_loss": 0.8068883419036865, + "eval_runtime": 46.2364, + "eval_samples_per_second": 36.811, + "eval_steps_per_second": 4.607, + "step": 5000 + }, + { + "epoch": 2.6173129651390523, + "grad_norm": 6.04465913772583, + "learning_rate": 4.820485744456178e-06, + "loss": 0.6118, + "step": 5010 + }, + { + "epoch": 2.622535579057318, + "grad_norm": 4.643867015838623, + "learning_rate": 4.809926082365365e-06, + "loss": 0.6346, + "step": 5020 + }, + { + "epoch": 2.6277581929755844, + "grad_norm": 5.192962646484375, + "learning_rate": 4.799366420274551e-06, + "loss": 0.539, + "step": 5030 + }, + { + "epoch": 2.6329808068938503, + "grad_norm": 5.151241779327393, + "learning_rate": 4.788806758183738e-06, + "loss": 0.552, + "step": 5040 + }, + { + "epoch": 2.6382034208121166, + "grad_norm": 4.308994293212891, + "learning_rate": 4.7782470960929254e-06, + "loss": 0.5292, + "step": 5050 + }, + { + "epoch": 2.6434260347303824, + "grad_norm": 5.595186233520508, + "learning_rate": 4.7676874340021126e-06, + "loss": 0.4777, + "step": 5060 + }, + { + "epoch": 2.6486486486486487, + "grad_norm": 6.7816643714904785, + "learning_rate": 4.757127771911299e-06, + "loss": 0.4999, + "step": 5070 + }, + { + "epoch": 2.6538712625669145, + "grad_norm": 7.1444315910339355, + "learning_rate": 4.746568109820486e-06, + "loss": 0.5028, + "step": 5080 + }, + { + "epoch": 2.659093876485181, + "grad_norm": 5.04287576675415, + "learning_rate": 4.736008447729673e-06, + "loss": 0.5156, + "step": 5090 + }, + { + "epoch": 2.664316490403447, + "grad_norm": 4.410764694213867, + "learning_rate": 4.72544878563886e-06, + "loss": 0.5241, + "step": 5100 + }, + { + "epoch": 2.669539104321713, + "grad_norm": 4.788335800170898, + "learning_rate": 4.714889123548047e-06, + "loss": 0.5099, + "step": 5110 + }, + { + "epoch": 2.674761718239979, + "grad_norm": 5.737407207489014, + "learning_rate": 4.7043294614572335e-06, + "loss": 0.4395, + "step": 5120 + }, + { + "epoch": 2.679984332158245, + "grad_norm": 6.255344867706299, + "learning_rate": 4.693769799366421e-06, + "loss": 0.5994, + "step": 5130 + }, + { + "epoch": 2.6852069460765113, + "grad_norm": 5.028295516967773, + "learning_rate": 4.683210137275608e-06, + "loss": 0.5518, + "step": 5140 + }, + { + "epoch": 2.6904295599947776, + "grad_norm": 4.391537189483643, + "learning_rate": 4.672650475184795e-06, + "loss": 0.4991, + "step": 5150 + }, + { + "epoch": 2.6956521739130435, + "grad_norm": 5.823568820953369, + "learning_rate": 4.662090813093981e-06, + "loss": 0.5885, + "step": 5160 + }, + { + "epoch": 2.7008747878313093, + "grad_norm": 2.928971529006958, + "learning_rate": 4.651531151003168e-06, + "loss": 0.6942, + "step": 5170 + }, + { + "epoch": 2.7060974017495756, + "grad_norm": 9.696499824523926, + "learning_rate": 4.640971488912355e-06, + "loss": 0.5699, + "step": 5180 + }, + { + "epoch": 2.711320015667842, + "grad_norm": 4.2604217529296875, + "learning_rate": 4.630411826821542e-06, + "loss": 0.4676, + "step": 5190 + }, + { + "epoch": 2.7165426295861077, + "grad_norm": 5.672421932220459, + "learning_rate": 4.619852164730729e-06, + "loss": 0.4576, + "step": 5200 + }, + { + "epoch": 2.7165426295861077, + "eval_loss": 0.7933436632156372, + "eval_runtime": 46.2281, + "eval_samples_per_second": 36.817, + "eval_steps_per_second": 4.608, + "step": 5200 + }, + { + "epoch": 2.721765243504374, + "grad_norm": 4.114403247833252, + "learning_rate": 4.609292502639916e-06, + "loss": 0.4742, + "step": 5210 + }, + { + "epoch": 2.72698785742264, + "grad_norm": 5.642977237701416, + "learning_rate": 4.598732840549103e-06, + "loss": 0.5453, + "step": 5220 + }, + { + "epoch": 2.732210471340906, + "grad_norm": 5.690392017364502, + "learning_rate": 4.588173178458289e-06, + "loss": 0.5248, + "step": 5230 + }, + { + "epoch": 2.7374330852591724, + "grad_norm": 7.28243350982666, + "learning_rate": 4.577613516367477e-06, + "loss": 0.5059, + "step": 5240 + }, + { + "epoch": 2.7426556991774382, + "grad_norm": 5.291462421417236, + "learning_rate": 4.567053854276663e-06, + "loss": 0.4494, + "step": 5250 + }, + { + "epoch": 2.7478783130957045, + "grad_norm": 2.6974287033081055, + "learning_rate": 4.5564941921858505e-06, + "loss": 0.5439, + "step": 5260 + }, + { + "epoch": 2.7531009270139704, + "grad_norm": 4.0036940574646, + "learning_rate": 4.545934530095038e-06, + "loss": 0.4797, + "step": 5270 + }, + { + "epoch": 2.7583235409322366, + "grad_norm": 5.800724506378174, + "learning_rate": 4.535374868004224e-06, + "loss": 0.5122, + "step": 5280 + }, + { + "epoch": 2.763546154850503, + "grad_norm": 6.420878887176514, + "learning_rate": 4.524815205913411e-06, + "loss": 0.4956, + "step": 5290 + }, + { + "epoch": 2.7687687687687688, + "grad_norm": 6.129545211791992, + "learning_rate": 4.514255543822598e-06, + "loss": 0.5465, + "step": 5300 + }, + { + "epoch": 2.7739913826870346, + "grad_norm": 5.964089870452881, + "learning_rate": 4.503695881731785e-06, + "loss": 0.5012, + "step": 5310 + }, + { + "epoch": 2.779213996605301, + "grad_norm": 5.476171493530273, + "learning_rate": 4.4931362196409714e-06, + "loss": 0.4842, + "step": 5320 + }, + { + "epoch": 2.784436610523567, + "grad_norm": 3.6184587478637695, + "learning_rate": 4.4825765575501585e-06, + "loss": 0.464, + "step": 5330 + }, + { + "epoch": 2.789659224441833, + "grad_norm": 6.052497863769531, + "learning_rate": 4.472016895459346e-06, + "loss": 0.5625, + "step": 5340 + }, + { + "epoch": 2.7948818383600993, + "grad_norm": 8.540828704833984, + "learning_rate": 4.461457233368533e-06, + "loss": 0.581, + "step": 5350 + }, + { + "epoch": 2.800104452278365, + "grad_norm": 6.093939781188965, + "learning_rate": 4.45089757127772e-06, + "loss": 0.5907, + "step": 5360 + }, + { + "epoch": 2.8053270661966314, + "grad_norm": 4.6257405281066895, + "learning_rate": 4.440337909186906e-06, + "loss": 0.4679, + "step": 5370 + }, + { + "epoch": 2.8105496801148977, + "grad_norm": 4.155122756958008, + "learning_rate": 4.429778247096093e-06, + "loss": 0.4799, + "step": 5380 + }, + { + "epoch": 2.8157722940331635, + "grad_norm": 6.194579601287842, + "learning_rate": 4.41921858500528e-06, + "loss": 0.5603, + "step": 5390 + }, + { + "epoch": 2.82099490795143, + "grad_norm": 7.508232593536377, + "learning_rate": 4.4086589229144675e-06, + "loss": 0.6672, + "step": 5400 + }, + { + "epoch": 2.82099490795143, + "eval_loss": 0.7919074296951294, + "eval_runtime": 46.2409, + "eval_samples_per_second": 36.807, + "eval_steps_per_second": 4.606, + "step": 5400 + }, + { + "epoch": 2.8262175218696957, + "grad_norm": 6.648965358734131, + "learning_rate": 4.398099260823654e-06, + "loss": 0.6096, + "step": 5410 + }, + { + "epoch": 2.831440135787962, + "grad_norm": 6.033213138580322, + "learning_rate": 4.387539598732841e-06, + "loss": 0.5375, + "step": 5420 + }, + { + "epoch": 2.8366627497062282, + "grad_norm": 5.509268760681152, + "learning_rate": 4.376979936642028e-06, + "loss": 0.4289, + "step": 5430 + }, + { + "epoch": 2.841885363624494, + "grad_norm": 4.335843563079834, + "learning_rate": 4.366420274551215e-06, + "loss": 0.443, + "step": 5440 + }, + { + "epoch": 2.84710797754276, + "grad_norm": 3.9269707202911377, + "learning_rate": 4.355860612460401e-06, + "loss": 0.4666, + "step": 5450 + }, + { + "epoch": 2.852330591461026, + "grad_norm": 7.474977970123291, + "learning_rate": 4.345300950369588e-06, + "loss": 0.5793, + "step": 5460 + }, + { + "epoch": 2.8575532053792925, + "grad_norm": 3.6517860889434814, + "learning_rate": 4.3347412882787755e-06, + "loss": 0.4276, + "step": 5470 + }, + { + "epoch": 2.8627758192975583, + "grad_norm": 4.662909030914307, + "learning_rate": 4.324181626187962e-06, + "loss": 0.5647, + "step": 5480 + }, + { + "epoch": 2.8679984332158246, + "grad_norm": 6.30706787109375, + "learning_rate": 4.31362196409715e-06, + "loss": 0.5985, + "step": 5490 + }, + { + "epoch": 2.8732210471340904, + "grad_norm": 3.8878538608551025, + "learning_rate": 4.303062302006336e-06, + "loss": 0.4561, + "step": 5500 + }, + { + "epoch": 2.8784436610523567, + "grad_norm": 4.7928667068481445, + "learning_rate": 4.292502639915523e-06, + "loss": 0.5941, + "step": 5510 + }, + { + "epoch": 2.883666274970623, + "grad_norm": 7.024383544921875, + "learning_rate": 4.28194297782471e-06, + "loss": 0.51, + "step": 5520 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 4.680624961853027, + "learning_rate": 4.2713833157338965e-06, + "loss": 0.5751, + "step": 5530 + }, + { + "epoch": 2.894111502807155, + "grad_norm": 5.982644081115723, + "learning_rate": 4.260823653643084e-06, + "loss": 0.5139, + "step": 5540 + }, + { + "epoch": 2.899334116725421, + "grad_norm": 5.7597784996032715, + "learning_rate": 4.250263991552271e-06, + "loss": 0.5367, + "step": 5550 + }, + { + "epoch": 2.9045567306436872, + "grad_norm": 5.401406764984131, + "learning_rate": 4.239704329461458e-06, + "loss": 0.5542, + "step": 5560 + }, + { + "epoch": 2.9097793445619535, + "grad_norm": 8.459036827087402, + "learning_rate": 4.229144667370644e-06, + "loss": 0.5354, + "step": 5570 + }, + { + "epoch": 2.9150019584802194, + "grad_norm": 6.203250885009766, + "learning_rate": 4.218585005279832e-06, + "loss": 0.5376, + "step": 5580 + }, + { + "epoch": 2.920224572398485, + "grad_norm": 6.801321983337402, + "learning_rate": 4.208025343189018e-06, + "loss": 0.4566, + "step": 5590 + }, + { + "epoch": 2.9254471863167515, + "grad_norm": 2.9718496799468994, + "learning_rate": 4.197465681098205e-06, + "loss": 0.483, + "step": 5600 + }, + { + "epoch": 2.9254471863167515, + "eval_loss": 0.7785268425941467, + "eval_runtime": 46.2314, + "eval_samples_per_second": 36.815, + "eval_steps_per_second": 4.607, + "step": 5600 + }, + { + "epoch": 2.9306698002350178, + "grad_norm": 3.2528350353240967, + "learning_rate": 4.1869060190073925e-06, + "loss": 0.5526, + "step": 5610 + }, + { + "epoch": 2.9358924141532836, + "grad_norm": 3.9217379093170166, + "learning_rate": 4.176346356916579e-06, + "loss": 0.558, + "step": 5620 + }, + { + "epoch": 2.94111502807155, + "grad_norm": 4.15424919128418, + "learning_rate": 4.165786694825766e-06, + "loss": 0.4264, + "step": 5630 + }, + { + "epoch": 2.9463376419898157, + "grad_norm": 5.303351402282715, + "learning_rate": 4.155227032734953e-06, + "loss": 0.5739, + "step": 5640 + }, + { + "epoch": 2.951560255908082, + "grad_norm": 7.237427234649658, + "learning_rate": 4.14466737064414e-06, + "loss": 0.5442, + "step": 5650 + }, + { + "epoch": 2.9567828698263483, + "grad_norm": 4.967709541320801, + "learning_rate": 4.134107708553326e-06, + "loss": 0.4926, + "step": 5660 + }, + { + "epoch": 2.962005483744614, + "grad_norm": 5.920149326324463, + "learning_rate": 4.1235480464625134e-06, + "loss": 0.5655, + "step": 5670 + }, + { + "epoch": 2.9672280976628804, + "grad_norm": 5.823659896850586, + "learning_rate": 4.1129883843717006e-06, + "loss": 0.5124, + "step": 5680 + }, + { + "epoch": 2.9724507115811463, + "grad_norm": 3.851020336151123, + "learning_rate": 4.102428722280888e-06, + "loss": 0.423, + "step": 5690 + }, + { + "epoch": 2.9776733254994125, + "grad_norm": 4.632102012634277, + "learning_rate": 4.091869060190074e-06, + "loss": 0.42, + "step": 5700 + }, + { + "epoch": 2.9828959394176784, + "grad_norm": 6.207057476043701, + "learning_rate": 4.081309398099261e-06, + "loss": 0.4569, + "step": 5710 + }, + { + "epoch": 2.9881185533359447, + "grad_norm": 4.414632797241211, + "learning_rate": 4.070749736008448e-06, + "loss": 0.4628, + "step": 5720 + }, + { + "epoch": 2.9933411672542105, + "grad_norm": 5.721477508544922, + "learning_rate": 4.060190073917634e-06, + "loss": 0.4518, + "step": 5730 + }, + { + "epoch": 2.998563781172477, + "grad_norm": 5.436526298522949, + "learning_rate": 4.049630411826822e-06, + "loss": 0.5313, + "step": 5740 + }, + { + "epoch": 3.0041780911346128, + "grad_norm": 7.654147148132324, + "learning_rate": 4.039070749736009e-06, + "loss": 0.494, + "step": 5750 + }, + { + "epoch": 3.009400705052879, + "grad_norm": 5.68324089050293, + "learning_rate": 4.028511087645196e-06, + "loss": 0.3699, + "step": 5760 + }, + { + "epoch": 3.014623318971145, + "grad_norm": 5.219386577606201, + "learning_rate": 4.017951425554383e-06, + "loss": 0.4355, + "step": 5770 + }, + { + "epoch": 3.019845932889411, + "grad_norm": 6.570154190063477, + "learning_rate": 4.007391763463569e-06, + "loss": 0.4075, + "step": 5780 + }, + { + "epoch": 3.0250685468076774, + "grad_norm": 7.014920234680176, + "learning_rate": 3.996832101372756e-06, + "loss": 0.3904, + "step": 5790 + }, + { + "epoch": 3.0302911607259433, + "grad_norm": 4.148968696594238, + "learning_rate": 3.986272439281943e-06, + "loss": 0.3938, + "step": 5800 + }, + { + "epoch": 3.0302911607259433, + "eval_loss": 0.8061103224754333, + "eval_runtime": 46.2431, + "eval_samples_per_second": 36.806, + "eval_steps_per_second": 4.606, + "step": 5800 + }, + { + "epoch": 3.0355137746442096, + "grad_norm": 8.123299598693848, + "learning_rate": 3.97571277719113e-06, + "loss": 0.4201, + "step": 5810 + }, + { + "epoch": 3.0407363885624754, + "grad_norm": 8.127564430236816, + "learning_rate": 3.965153115100317e-06, + "loss": 0.416, + "step": 5820 + }, + { + "epoch": 3.0459590024807417, + "grad_norm": 3.643094539642334, + "learning_rate": 3.954593453009505e-06, + "loss": 0.36, + "step": 5830 + }, + { + "epoch": 3.0511816163990075, + "grad_norm": 3.6215567588806152, + "learning_rate": 3.944033790918691e-06, + "loss": 0.354, + "step": 5840 + }, + { + "epoch": 3.056404230317274, + "grad_norm": 6.820983409881592, + "learning_rate": 3.933474128827878e-06, + "loss": 0.4113, + "step": 5850 + }, + { + "epoch": 3.06162684423554, + "grad_norm": 5.715891361236572, + "learning_rate": 3.922914466737065e-06, + "loss": 0.3628, + "step": 5860 + }, + { + "epoch": 3.066849458153806, + "grad_norm": 7.616763114929199, + "learning_rate": 3.912354804646251e-06, + "loss": 0.4049, + "step": 5870 + }, + { + "epoch": 3.0720720720720722, + "grad_norm": 4.177463531494141, + "learning_rate": 3.9017951425554385e-06, + "loss": 0.4452, + "step": 5880 + }, + { + "epoch": 3.077294685990338, + "grad_norm": 4.512898921966553, + "learning_rate": 3.891235480464626e-06, + "loss": 0.384, + "step": 5890 + }, + { + "epoch": 3.0825172999086043, + "grad_norm": 3.9176089763641357, + "learning_rate": 3.880675818373813e-06, + "loss": 0.4524, + "step": 5900 + }, + { + "epoch": 3.08773991382687, + "grad_norm": 5.587650299072266, + "learning_rate": 3.870116156282999e-06, + "loss": 0.3638, + "step": 5910 + }, + { + "epoch": 3.0929625277451365, + "grad_norm": 5.886160373687744, + "learning_rate": 3.859556494192186e-06, + "loss": 0.3664, + "step": 5920 + }, + { + "epoch": 3.0981851416634028, + "grad_norm": 3.332893133163452, + "learning_rate": 3.848996832101373e-06, + "loss": 0.4352, + "step": 5930 + }, + { + "epoch": 3.1034077555816686, + "grad_norm": 4.1508097648620605, + "learning_rate": 3.83843717001056e-06, + "loss": 0.485, + "step": 5940 + }, + { + "epoch": 3.108630369499935, + "grad_norm": 6.934416770935059, + "learning_rate": 3.8278775079197465e-06, + "loss": 0.4333, + "step": 5950 + }, + { + "epoch": 3.1138529834182007, + "grad_norm": 5.893505573272705, + "learning_rate": 3.817317845828934e-06, + "loss": 0.4209, + "step": 5960 + }, + { + "epoch": 3.119075597336467, + "grad_norm": 6.001057147979736, + "learning_rate": 3.8067581837381208e-06, + "loss": 0.4274, + "step": 5970 + }, + { + "epoch": 3.124298211254733, + "grad_norm": 4.873240947723389, + "learning_rate": 3.7961985216473074e-06, + "loss": 0.3278, + "step": 5980 + }, + { + "epoch": 3.129520825172999, + "grad_norm": 6.454697132110596, + "learning_rate": 3.7856388595564946e-06, + "loss": 0.4673, + "step": 5990 + }, + { + "epoch": 3.134743439091265, + "grad_norm": 5.828022003173828, + "learning_rate": 3.7750791974656812e-06, + "loss": 0.4288, + "step": 6000 + }, + { + "epoch": 3.134743439091265, + "eval_loss": 0.8229681849479675, + "eval_runtime": 46.2578, + "eval_samples_per_second": 36.794, + "eval_steps_per_second": 4.605, + "step": 6000 + }, + { + "epoch": 3.1399660530095312, + "grad_norm": 6.981528282165527, + "learning_rate": 3.7645195353748684e-06, + "loss": 0.3955, + "step": 6010 + }, + { + "epoch": 3.1451886669277975, + "grad_norm": 4.903995990753174, + "learning_rate": 3.7539598732840555e-06, + "loss": 0.3907, + "step": 6020 + }, + { + "epoch": 3.1504112808460634, + "grad_norm": 4.399137496948242, + "learning_rate": 3.743400211193242e-06, + "loss": 0.4785, + "step": 6030 + }, + { + "epoch": 3.1556338947643296, + "grad_norm": 3.4194514751434326, + "learning_rate": 3.732840549102429e-06, + "loss": 0.4086, + "step": 6040 + }, + { + "epoch": 3.1608565086825955, + "grad_norm": 6.683743476867676, + "learning_rate": 3.7222808870116164e-06, + "loss": 0.4418, + "step": 6050 + }, + { + "epoch": 3.1660791226008618, + "grad_norm": 4.924780368804932, + "learning_rate": 3.711721224920803e-06, + "loss": 0.4035, + "step": 6060 + }, + { + "epoch": 3.171301736519128, + "grad_norm": 6.23117733001709, + "learning_rate": 3.7011615628299897e-06, + "loss": 0.3554, + "step": 6070 + }, + { + "epoch": 3.176524350437394, + "grad_norm": 7.044112682342529, + "learning_rate": 3.690601900739177e-06, + "loss": 0.4143, + "step": 6080 + }, + { + "epoch": 3.18174696435566, + "grad_norm": 6.9131059646606445, + "learning_rate": 3.6800422386483635e-06, + "loss": 0.4045, + "step": 6090 + }, + { + "epoch": 3.186969578273926, + "grad_norm": 5.362022876739502, + "learning_rate": 3.66948257655755e-06, + "loss": 0.4318, + "step": 6100 + }, + { + "epoch": 3.1921921921921923, + "grad_norm": 5.799452781677246, + "learning_rate": 3.6589229144667377e-06, + "loss": 0.4649, + "step": 6110 + }, + { + "epoch": 3.197414806110458, + "grad_norm": 7.595244884490967, + "learning_rate": 3.6483632523759244e-06, + "loss": 0.4466, + "step": 6120 + }, + { + "epoch": 3.2026374200287244, + "grad_norm": 2.4312336444854736, + "learning_rate": 3.637803590285111e-06, + "loss": 0.4557, + "step": 6130 + }, + { + "epoch": 3.2078600339469903, + "grad_norm": 5.092735767364502, + "learning_rate": 3.627243928194298e-06, + "loss": 0.4255, + "step": 6140 + }, + { + "epoch": 3.2130826478652565, + "grad_norm": 5.3324151039123535, + "learning_rate": 3.616684266103485e-06, + "loss": 0.432, + "step": 6150 + }, + { + "epoch": 3.218305261783523, + "grad_norm": 4.059586524963379, + "learning_rate": 3.606124604012672e-06, + "loss": 0.4766, + "step": 6160 + }, + { + "epoch": 3.2235278757017887, + "grad_norm": 6.850623607635498, + "learning_rate": 3.5955649419218587e-06, + "loss": 0.4387, + "step": 6170 + }, + { + "epoch": 3.228750489620055, + "grad_norm": 5.995065212249756, + "learning_rate": 3.585005279831046e-06, + "loss": 0.4257, + "step": 6180 + }, + { + "epoch": 3.233973103538321, + "grad_norm": 3.885401487350464, + "learning_rate": 3.5744456177402325e-06, + "loss": 0.4423, + "step": 6190 + }, + { + "epoch": 3.239195717456587, + "grad_norm": 4.709335803985596, + "learning_rate": 3.563885955649419e-06, + "loss": 0.3812, + "step": 6200 + }, + { + "epoch": 3.239195717456587, + "eval_loss": 0.810655415058136, + "eval_runtime": 46.276, + "eval_samples_per_second": 36.779, + "eval_steps_per_second": 4.603, + "step": 6200 + }, + { + "epoch": 3.244418331374853, + "grad_norm": 7.61454963684082, + "learning_rate": 3.5533262935586067e-06, + "loss": 0.4508, + "step": 6210 + }, + { + "epoch": 3.249640945293119, + "grad_norm": 7.8873066902160645, + "learning_rate": 3.5427666314677934e-06, + "loss": 0.4208, + "step": 6220 + }, + { + "epoch": 3.2548635592113855, + "grad_norm": 5.000669956207275, + "learning_rate": 3.53220696937698e-06, + "loss": 0.4103, + "step": 6230 + }, + { + "epoch": 3.2600861731296513, + "grad_norm": 4.964175701141357, + "learning_rate": 3.521647307286167e-06, + "loss": 0.4562, + "step": 6240 + }, + { + "epoch": 3.2653087870479176, + "grad_norm": 4.287696838378906, + "learning_rate": 3.511087645195354e-06, + "loss": 0.4661, + "step": 6250 + }, + { + "epoch": 3.2705314009661834, + "grad_norm": 5.519683837890625, + "learning_rate": 3.500527983104541e-06, + "loss": 0.4882, + "step": 6260 + }, + { + "epoch": 3.2757540148844497, + "grad_norm": 5.700749397277832, + "learning_rate": 3.489968321013728e-06, + "loss": 0.4463, + "step": 6270 + }, + { + "epoch": 3.2809766288027156, + "grad_norm": 5.7745466232299805, + "learning_rate": 3.4794086589229148e-06, + "loss": 0.4626, + "step": 6280 + }, + { + "epoch": 3.286199242720982, + "grad_norm": 8.0064058303833, + "learning_rate": 3.4688489968321015e-06, + "loss": 0.3982, + "step": 6290 + }, + { + "epoch": 3.291421856639248, + "grad_norm": 5.860507488250732, + "learning_rate": 3.458289334741289e-06, + "loss": 0.3835, + "step": 6300 + }, + { + "epoch": 3.296644470557514, + "grad_norm": 7.413349628448486, + "learning_rate": 3.4477296726504757e-06, + "loss": 0.4338, + "step": 6310 + }, + { + "epoch": 3.3018670844757803, + "grad_norm": 4.818141937255859, + "learning_rate": 3.4371700105596624e-06, + "loss": 0.497, + "step": 6320 + }, + { + "epoch": 3.307089698394046, + "grad_norm": 3.987377405166626, + "learning_rate": 3.4266103484688495e-06, + "loss": 0.4099, + "step": 6330 + }, + { + "epoch": 3.3123123123123124, + "grad_norm": 7.0202860832214355, + "learning_rate": 3.416050686378036e-06, + "loss": 0.4791, + "step": 6340 + }, + { + "epoch": 3.317534926230578, + "grad_norm": 6.688587665557861, + "learning_rate": 3.405491024287223e-06, + "loss": 0.427, + "step": 6350 + }, + { + "epoch": 3.3227575401488445, + "grad_norm": 6.508810997009277, + "learning_rate": 3.39493136219641e-06, + "loss": 0.431, + "step": 6360 + }, + { + "epoch": 3.3279801540671103, + "grad_norm": 7.402127265930176, + "learning_rate": 3.384371700105597e-06, + "loss": 0.4145, + "step": 6370 + }, + { + "epoch": 3.3332027679853766, + "grad_norm": 4.331240177154541, + "learning_rate": 3.3738120380147837e-06, + "loss": 0.3799, + "step": 6380 + }, + { + "epoch": 3.338425381903643, + "grad_norm": 6.24545431137085, + "learning_rate": 3.3632523759239704e-06, + "loss": 0.4339, + "step": 6390 + }, + { + "epoch": 3.3436479958219087, + "grad_norm": 6.327270030975342, + "learning_rate": 3.352692713833158e-06, + "loss": 0.4506, + "step": 6400 + }, + { + "epoch": 3.3436479958219087, + "eval_loss": 0.8060568571090698, + "eval_runtime": 46.2458, + "eval_samples_per_second": 36.803, + "eval_steps_per_second": 4.606, + "step": 6400 + }, + { + "epoch": 3.348870609740175, + "grad_norm": 2.5744376182556152, + "learning_rate": 3.3421330517423446e-06, + "loss": 0.484, + "step": 6410 + }, + { + "epoch": 3.354093223658441, + "grad_norm": 3.4706344604492188, + "learning_rate": 3.3315733896515313e-06, + "loss": 0.3897, + "step": 6420 + }, + { + "epoch": 3.359315837576707, + "grad_norm": 6.175302028656006, + "learning_rate": 3.3210137275607184e-06, + "loss": 0.4054, + "step": 6430 + }, + { + "epoch": 3.3645384514949734, + "grad_norm": 5.064645767211914, + "learning_rate": 3.310454065469905e-06, + "loss": 0.4458, + "step": 6440 + }, + { + "epoch": 3.3697610654132393, + "grad_norm": 8.018420219421387, + "learning_rate": 3.299894403379092e-06, + "loss": 0.5056, + "step": 6450 + }, + { + "epoch": 3.3749836793315056, + "grad_norm": 6.1567301750183105, + "learning_rate": 3.2893347412882793e-06, + "loss": 0.3735, + "step": 6460 + }, + { + "epoch": 3.3802062932497714, + "grad_norm": 5.155027866363525, + "learning_rate": 3.278775079197466e-06, + "loss": 0.3715, + "step": 6470 + }, + { + "epoch": 3.3854289071680377, + "grad_norm": 5.396885395050049, + "learning_rate": 3.2682154171066527e-06, + "loss": 0.3597, + "step": 6480 + }, + { + "epoch": 3.3906515210863035, + "grad_norm": 3.0646581649780273, + "learning_rate": 3.25765575501584e-06, + "loss": 0.4777, + "step": 6490 + }, + { + "epoch": 3.39587413500457, + "grad_norm": 5.380611419677734, + "learning_rate": 3.2470960929250265e-06, + "loss": 0.4137, + "step": 6500 + }, + { + "epoch": 3.4010967489228356, + "grad_norm": 6.225546360015869, + "learning_rate": 3.2365364308342136e-06, + "loss": 0.313, + "step": 6510 + }, + { + "epoch": 3.406319362841102, + "grad_norm": 4.760247707366943, + "learning_rate": 3.2259767687434007e-06, + "loss": 0.3674, + "step": 6520 + }, + { + "epoch": 3.411541976759368, + "grad_norm": 7.30977725982666, + "learning_rate": 3.2154171066525874e-06, + "loss": 0.3591, + "step": 6530 + }, + { + "epoch": 3.416764590677634, + "grad_norm": 7.618262767791748, + "learning_rate": 3.204857444561774e-06, + "loss": 0.4384, + "step": 6540 + }, + { + "epoch": 3.4219872045959003, + "grad_norm": 3.0289359092712402, + "learning_rate": 3.1942977824709616e-06, + "loss": 0.3818, + "step": 6550 + }, + { + "epoch": 3.427209818514166, + "grad_norm": 3.5998988151550293, + "learning_rate": 3.1837381203801483e-06, + "loss": 0.4318, + "step": 6560 + }, + { + "epoch": 3.4324324324324325, + "grad_norm": 4.631134033203125, + "learning_rate": 3.173178458289335e-06, + "loss": 0.3956, + "step": 6570 + }, + { + "epoch": 3.4376550463506987, + "grad_norm": 8.129390716552734, + "learning_rate": 3.1626187961985217e-06, + "loss": 0.3928, + "step": 6580 + }, + { + "epoch": 3.4428776602689646, + "grad_norm": 7.39198637008667, + "learning_rate": 3.1520591341077088e-06, + "loss": 0.4827, + "step": 6590 + }, + { + "epoch": 3.448100274187231, + "grad_norm": 4.935920238494873, + "learning_rate": 3.1414994720168955e-06, + "loss": 0.4126, + "step": 6600 + }, + { + "epoch": 3.448100274187231, + "eval_loss": 0.8010614514350891, + "eval_runtime": 46.3186, + "eval_samples_per_second": 36.746, + "eval_steps_per_second": 4.599, + "step": 6600 + }, + { + "epoch": 3.4533228881054967, + "grad_norm": 4.4114460945129395, + "learning_rate": 3.1309398099260826e-06, + "loss": 0.4439, + "step": 6610 + }, + { + "epoch": 3.458545502023763, + "grad_norm": 6.824039459228516, + "learning_rate": 3.1203801478352697e-06, + "loss": 0.3552, + "step": 6620 + }, + { + "epoch": 3.463768115942029, + "grad_norm": 4.671921253204346, + "learning_rate": 3.1098204857444564e-06, + "loss": 0.3289, + "step": 6630 + }, + { + "epoch": 3.468990729860295, + "grad_norm": 3.557352304458618, + "learning_rate": 3.099260823653643e-06, + "loss": 0.4342, + "step": 6640 + }, + { + "epoch": 3.474213343778561, + "grad_norm": 5.833057403564453, + "learning_rate": 3.0887011615628306e-06, + "loss": 0.4985, + "step": 6650 + }, + { + "epoch": 3.4794359576968272, + "grad_norm": 3.8494341373443604, + "learning_rate": 3.0781414994720173e-06, + "loss": 0.4175, + "step": 6660 + }, + { + "epoch": 3.4846585716150935, + "grad_norm": 5.017399311065674, + "learning_rate": 3.067581837381204e-06, + "loss": 0.3396, + "step": 6670 + }, + { + "epoch": 3.4898811855333594, + "grad_norm": 8.080399513244629, + "learning_rate": 3.057022175290391e-06, + "loss": 0.412, + "step": 6680 + }, + { + "epoch": 3.4951037994516256, + "grad_norm": 4.017096996307373, + "learning_rate": 3.0464625131995777e-06, + "loss": 0.4273, + "step": 6690 + }, + { + "epoch": 3.5003264133698915, + "grad_norm": 5.441287517547607, + "learning_rate": 3.0359028511087644e-06, + "loss": 0.3789, + "step": 6700 + }, + { + "epoch": 3.5055490272881578, + "grad_norm": 6.666896343231201, + "learning_rate": 3.025343189017952e-06, + "loss": 0.4841, + "step": 6710 + }, + { + "epoch": 3.510771641206424, + "grad_norm": 6.075514316558838, + "learning_rate": 3.0147835269271386e-06, + "loss": 0.4189, + "step": 6720 + }, + { + "epoch": 3.51599425512469, + "grad_norm": 4.0529303550720215, + "learning_rate": 3.0042238648363253e-06, + "loss": 0.442, + "step": 6730 + }, + { + "epoch": 3.521216869042956, + "grad_norm": 4.397035598754883, + "learning_rate": 2.9936642027455124e-06, + "loss": 0.3511, + "step": 6740 + }, + { + "epoch": 3.526439482961222, + "grad_norm": 4.687301158905029, + "learning_rate": 2.9831045406546995e-06, + "loss": 0.4178, + "step": 6750 + }, + { + "epoch": 3.5316620968794883, + "grad_norm": 7.033337593078613, + "learning_rate": 2.9725448785638862e-06, + "loss": 0.4109, + "step": 6760 + }, + { + "epoch": 3.536884710797754, + "grad_norm": 5.542331218719482, + "learning_rate": 2.9619852164730733e-06, + "loss": 0.4457, + "step": 6770 + }, + { + "epoch": 3.5421073247160204, + "grad_norm": 2.987297534942627, + "learning_rate": 2.95142555438226e-06, + "loss": 0.4106, + "step": 6780 + }, + { + "epoch": 3.5473299386342863, + "grad_norm": 4.037609100341797, + "learning_rate": 2.9408658922914467e-06, + "loss": 0.3515, + "step": 6790 + }, + { + "epoch": 3.5525525525525525, + "grad_norm": 6.087532997131348, + "learning_rate": 2.9303062302006342e-06, + "loss": 0.4127, + "step": 6800 + }, + { + "epoch": 3.5525525525525525, + "eval_loss": 0.799282431602478, + "eval_runtime": 46.2878, + "eval_samples_per_second": 36.77, + "eval_steps_per_second": 4.602, + "step": 6800 + }, + { + "epoch": 3.557775166470819, + "grad_norm": 5.302961826324463, + "learning_rate": 2.919746568109821e-06, + "loss": 0.3994, + "step": 6810 + }, + { + "epoch": 3.5629977803890847, + "grad_norm": 6.701052188873291, + "learning_rate": 2.9091869060190076e-06, + "loss": 0.3903, + "step": 6820 + }, + { + "epoch": 3.568220394307351, + "grad_norm": 5.228213310241699, + "learning_rate": 2.8986272439281943e-06, + "loss": 0.4006, + "step": 6830 + }, + { + "epoch": 3.573443008225617, + "grad_norm": 5.282093524932861, + "learning_rate": 2.8880675818373814e-06, + "loss": 0.4515, + "step": 6840 + }, + { + "epoch": 3.578665622143883, + "grad_norm": 3.786198616027832, + "learning_rate": 2.8775079197465685e-06, + "loss": 0.3696, + "step": 6850 + }, + { + "epoch": 3.5838882360621493, + "grad_norm": 5.512637138366699, + "learning_rate": 2.866948257655755e-06, + "loss": 0.4546, + "step": 6860 + }, + { + "epoch": 3.589110849980415, + "grad_norm": 7.117464542388916, + "learning_rate": 2.8563885955649423e-06, + "loss": 0.4562, + "step": 6870 + }, + { + "epoch": 3.594333463898681, + "grad_norm": 4.943199634552002, + "learning_rate": 2.845828933474129e-06, + "loss": 0.4894, + "step": 6880 + }, + { + "epoch": 3.5995560778169473, + "grad_norm": 8.774984359741211, + "learning_rate": 2.8352692713833157e-06, + "loss": 0.3925, + "step": 6890 + }, + { + "epoch": 3.6047786917352136, + "grad_norm": 7.126657009124756, + "learning_rate": 2.824709609292503e-06, + "loss": 0.4375, + "step": 6900 + }, + { + "epoch": 3.6100013056534794, + "grad_norm": 5.460080146789551, + "learning_rate": 2.81414994720169e-06, + "loss": 0.3847, + "step": 6910 + }, + { + "epoch": 3.6152239195717457, + "grad_norm": 5.7454833984375, + "learning_rate": 2.8035902851108766e-06, + "loss": 0.3669, + "step": 6920 + }, + { + "epoch": 3.6204465334900116, + "grad_norm": 7.132731914520264, + "learning_rate": 2.7930306230200637e-06, + "loss": 0.4013, + "step": 6930 + }, + { + "epoch": 3.625669147408278, + "grad_norm": 4.874327659606934, + "learning_rate": 2.7824709609292504e-06, + "loss": 0.3866, + "step": 6940 + }, + { + "epoch": 3.630891761326544, + "grad_norm": 6.133016586303711, + "learning_rate": 2.771911298838437e-06, + "loss": 0.3879, + "step": 6950 + }, + { + "epoch": 3.63611437524481, + "grad_norm": 7.170290470123291, + "learning_rate": 2.7613516367476246e-06, + "loss": 0.374, + "step": 6960 + }, + { + "epoch": 3.6413369891630762, + "grad_norm": 4.124912738800049, + "learning_rate": 2.7507919746568113e-06, + "loss": 0.5092, + "step": 6970 + }, + { + "epoch": 3.646559603081342, + "grad_norm": 6.091069221496582, + "learning_rate": 2.740232312565998e-06, + "loss": 0.4028, + "step": 6980 + }, + { + "epoch": 3.6517822169996084, + "grad_norm": 3.907172203063965, + "learning_rate": 2.729672650475185e-06, + "loss": 0.4105, + "step": 6990 + }, + { + "epoch": 3.6570048309178746, + "grad_norm": 4.004384517669678, + "learning_rate": 2.719112988384372e-06, + "loss": 0.3923, + "step": 7000 + }, + { + "epoch": 3.6570048309178746, + "eval_loss": 0.7925397157669067, + "eval_runtime": 46.2922, + "eval_samples_per_second": 36.766, + "eval_steps_per_second": 4.601, + "step": 7000 + }, + { + "epoch": 3.6622274448361405, + "grad_norm": 4.95105504989624, + "learning_rate": 2.708553326293559e-06, + "loss": 0.4407, + "step": 7010 + }, + { + "epoch": 3.6674500587544063, + "grad_norm": 5.319108963012695, + "learning_rate": 2.697993664202746e-06, + "loss": 0.4652, + "step": 7020 + }, + { + "epoch": 3.6726726726726726, + "grad_norm": 7.128709316253662, + "learning_rate": 2.6874340021119326e-06, + "loss": 0.4353, + "step": 7030 + }, + { + "epoch": 3.677895286590939, + "grad_norm": 4.808097839355469, + "learning_rate": 2.6768743400211193e-06, + "loss": 0.345, + "step": 7040 + }, + { + "epoch": 3.6831179005092047, + "grad_norm": 6.002725601196289, + "learning_rate": 2.666314677930306e-06, + "loss": 0.425, + "step": 7050 + }, + { + "epoch": 3.688340514427471, + "grad_norm": 4.452878952026367, + "learning_rate": 2.6557550158394935e-06, + "loss": 0.4487, + "step": 7060 + }, + { + "epoch": 3.693563128345737, + "grad_norm": 5.096455097198486, + "learning_rate": 2.6451953537486802e-06, + "loss": 0.4158, + "step": 7070 + }, + { + "epoch": 3.698785742264003, + "grad_norm": 6.426013946533203, + "learning_rate": 2.634635691657867e-06, + "loss": 0.3701, + "step": 7080 + }, + { + "epoch": 3.7040083561822694, + "grad_norm": 7.101649284362793, + "learning_rate": 2.624076029567054e-06, + "loss": 0.4757, + "step": 7090 + }, + { + "epoch": 3.7092309701005353, + "grad_norm": 5.96986198425293, + "learning_rate": 2.613516367476241e-06, + "loss": 0.4475, + "step": 7100 + }, + { + "epoch": 3.7144535840188015, + "grad_norm": 6.221879482269287, + "learning_rate": 2.602956705385428e-06, + "loss": 0.3911, + "step": 7110 + }, + { + "epoch": 3.7196761979370674, + "grad_norm": 6.827433109283447, + "learning_rate": 2.592397043294615e-06, + "loss": 0.4751, + "step": 7120 + }, + { + "epoch": 3.7248988118553337, + "grad_norm": 5.734457015991211, + "learning_rate": 2.5818373812038016e-06, + "loss": 0.4343, + "step": 7130 + }, + { + "epoch": 3.7301214257736, + "grad_norm": 3.825587034225464, + "learning_rate": 2.5712777191129883e-06, + "loss": 0.3712, + "step": 7140 + }, + { + "epoch": 3.735344039691866, + "grad_norm": 7.958340644836426, + "learning_rate": 2.560718057022176e-06, + "loss": 0.4076, + "step": 7150 + }, + { + "epoch": 3.7405666536101316, + "grad_norm": 6.706486701965332, + "learning_rate": 2.5501583949313625e-06, + "loss": 0.4379, + "step": 7160 + }, + { + "epoch": 3.745789267528398, + "grad_norm": 3.6787171363830566, + "learning_rate": 2.539598732840549e-06, + "loss": 0.3937, + "step": 7170 + }, + { + "epoch": 3.751011881446664, + "grad_norm": 3.7710745334625244, + "learning_rate": 2.5290390707497363e-06, + "loss": 0.4023, + "step": 7180 + }, + { + "epoch": 3.75623449536493, + "grad_norm": 9.986141204833984, + "learning_rate": 2.518479408658923e-06, + "loss": 0.4278, + "step": 7190 + }, + { + "epoch": 3.7614571092831963, + "grad_norm": 5.851425647735596, + "learning_rate": 2.50791974656811e-06, + "loss": 0.3561, + "step": 7200 + }, + { + "epoch": 3.7614571092831963, + "eval_loss": 0.7960723042488098, + "eval_runtime": 46.2236, + "eval_samples_per_second": 36.821, + "eval_steps_per_second": 4.608, + "step": 7200 + }, + { + "epoch": 3.766679723201462, + "grad_norm": 5.080770969390869, + "learning_rate": 2.4973600844772968e-06, + "loss": 0.4494, + "step": 7210 + }, + { + "epoch": 3.7719023371197284, + "grad_norm": 6.7447190284729, + "learning_rate": 2.486800422386484e-06, + "loss": 0.4839, + "step": 7220 + }, + { + "epoch": 3.7771249510379947, + "grad_norm": 5.529577255249023, + "learning_rate": 2.4762407602956706e-06, + "loss": 0.3799, + "step": 7230 + }, + { + "epoch": 3.7823475649562606, + "grad_norm": 3.1499006748199463, + "learning_rate": 2.4656810982048577e-06, + "loss": 0.3959, + "step": 7240 + }, + { + "epoch": 3.787570178874527, + "grad_norm": 7.214032173156738, + "learning_rate": 2.4551214361140448e-06, + "loss": 0.4165, + "step": 7250 + }, + { + "epoch": 3.7927927927927927, + "grad_norm": 3.6615257263183594, + "learning_rate": 2.4445617740232315e-06, + "loss": 0.3899, + "step": 7260 + }, + { + "epoch": 3.798015406711059, + "grad_norm": 8.117351531982422, + "learning_rate": 2.4340021119324186e-06, + "loss": 0.3904, + "step": 7270 + }, + { + "epoch": 3.803238020629325, + "grad_norm": 6.586986064910889, + "learning_rate": 2.4234424498416053e-06, + "loss": 0.4566, + "step": 7280 + }, + { + "epoch": 3.808460634547591, + "grad_norm": 2.746188163757324, + "learning_rate": 2.412882787750792e-06, + "loss": 0.3717, + "step": 7290 + }, + { + "epoch": 3.813683248465857, + "grad_norm": 3.323824167251587, + "learning_rate": 2.402323125659979e-06, + "loss": 0.4007, + "step": 7300 + }, + { + "epoch": 3.818905862384123, + "grad_norm": 4.206129550933838, + "learning_rate": 2.391763463569166e-06, + "loss": 0.3699, + "step": 7310 + }, + { + "epoch": 3.8241284763023895, + "grad_norm": 4.980790615081787, + "learning_rate": 2.381203801478353e-06, + "loss": 0.4213, + "step": 7320 + }, + { + "epoch": 3.8293510902206553, + "grad_norm": 4.453920841217041, + "learning_rate": 2.37064413938754e-06, + "loss": 0.3974, + "step": 7330 + }, + { + "epoch": 3.8345737041389216, + "grad_norm": 4.445418834686279, + "learning_rate": 2.3600844772967266e-06, + "loss": 0.3761, + "step": 7340 + }, + { + "epoch": 3.8397963180571875, + "grad_norm": 5.032138347625732, + "learning_rate": 2.3495248152059137e-06, + "loss": 0.3759, + "step": 7350 + }, + { + "epoch": 3.8450189319754537, + "grad_norm": 6.470358371734619, + "learning_rate": 2.3389651531151004e-06, + "loss": 0.4194, + "step": 7360 + }, + { + "epoch": 3.85024154589372, + "grad_norm": 3.676422119140625, + "learning_rate": 2.328405491024287e-06, + "loss": 0.4073, + "step": 7370 + }, + { + "epoch": 3.855464159811986, + "grad_norm": 2.7682676315307617, + "learning_rate": 2.3178458289334742e-06, + "loss": 0.4572, + "step": 7380 + }, + { + "epoch": 3.860686773730252, + "grad_norm": 2.9539079666137695, + "learning_rate": 2.3072861668426613e-06, + "loss": 0.4329, + "step": 7390 + }, + { + "epoch": 3.865909387648518, + "grad_norm": 3.254023551940918, + "learning_rate": 2.296726504751848e-06, + "loss": 0.3963, + "step": 7400 + }, + { + "epoch": 3.865909387648518, + "eval_loss": 0.7841590046882629, + "eval_runtime": 46.2618, + "eval_samples_per_second": 36.791, + "eval_steps_per_second": 4.604, + "step": 7400 + }, + { + "epoch": 3.8711320015667843, + "grad_norm": 3.7601821422576904, + "learning_rate": 2.286166842661035e-06, + "loss": 0.4022, + "step": 7410 + }, + { + "epoch": 3.87635461548505, + "grad_norm": 6.364165782928467, + "learning_rate": 2.2756071805702222e-06, + "loss": 0.5135, + "step": 7420 + }, + { + "epoch": 3.8815772294033164, + "grad_norm": 4.010589599609375, + "learning_rate": 2.265047518479409e-06, + "loss": 0.433, + "step": 7430 + }, + { + "epoch": 3.8867998433215822, + "grad_norm": 9.321678161621094, + "learning_rate": 2.2544878563885956e-06, + "loss": 0.4228, + "step": 7440 + }, + { + "epoch": 3.8920224572398485, + "grad_norm": 7.4074273109436035, + "learning_rate": 2.2439281942977827e-06, + "loss": 0.4568, + "step": 7450 + }, + { + "epoch": 3.897245071158115, + "grad_norm": 6.130796432495117, + "learning_rate": 2.2333685322069694e-06, + "loss": 0.3895, + "step": 7460 + }, + { + "epoch": 3.9024676850763806, + "grad_norm": 6.908585071563721, + "learning_rate": 2.2228088701161565e-06, + "loss": 0.35, + "step": 7470 + }, + { + "epoch": 3.907690298994647, + "grad_norm": 2.8046581745147705, + "learning_rate": 2.212249208025343e-06, + "loss": 0.4573, + "step": 7480 + }, + { + "epoch": 3.9129129129129128, + "grad_norm": 4.019318580627441, + "learning_rate": 2.2016895459345303e-06, + "loss": 0.4703, + "step": 7490 + }, + { + "epoch": 3.918135526831179, + "grad_norm": 5.776391983032227, + "learning_rate": 2.1911298838437174e-06, + "loss": 0.3865, + "step": 7500 + }, + { + "epoch": 3.9233581407494453, + "grad_norm": 5.176472187042236, + "learning_rate": 2.180570221752904e-06, + "loss": 0.3992, + "step": 7510 + }, + { + "epoch": 3.928580754667711, + "grad_norm": 5.863769054412842, + "learning_rate": 2.170010559662091e-06, + "loss": 0.4149, + "step": 7520 + }, + { + "epoch": 3.933803368585977, + "grad_norm": 4.269286632537842, + "learning_rate": 2.159450897571278e-06, + "loss": 0.3281, + "step": 7530 + }, + { + "epoch": 3.9390259825042433, + "grad_norm": 5.141351699829102, + "learning_rate": 2.1488912354804646e-06, + "loss": 0.3818, + "step": 7540 + }, + { + "epoch": 3.9442485964225096, + "grad_norm": 7.267117977142334, + "learning_rate": 2.1383315733896517e-06, + "loss": 0.4357, + "step": 7550 + }, + { + "epoch": 3.9494712103407754, + "grad_norm": 2.134504556655884, + "learning_rate": 2.1277719112988384e-06, + "loss": 0.408, + "step": 7560 + }, + { + "epoch": 3.9546938242590417, + "grad_norm": 2.806506872177124, + "learning_rate": 2.1172122492080255e-06, + "loss": 0.4074, + "step": 7570 + }, + { + "epoch": 3.9599164381773075, + "grad_norm": 7.443352699279785, + "learning_rate": 2.1066525871172126e-06, + "loss": 0.4055, + "step": 7580 + }, + { + "epoch": 3.965139052095574, + "grad_norm": 4.180816650390625, + "learning_rate": 2.0960929250263993e-06, + "loss": 0.4754, + "step": 7590 + }, + { + "epoch": 3.97036166601384, + "grad_norm": 7.595555305480957, + "learning_rate": 2.0855332629355864e-06, + "loss": 0.4441, + "step": 7600 + }, + { + "epoch": 3.97036166601384, + "eval_loss": 0.7774137258529663, + "eval_runtime": 46.2875, + "eval_samples_per_second": 36.77, + "eval_steps_per_second": 4.602, + "step": 7600 + }, + { + "epoch": 3.975584279932106, + "grad_norm": 3.3861186504364014, + "learning_rate": 2.074973600844773e-06, + "loss": 0.4336, + "step": 7610 + }, + { + "epoch": 3.9808068938503722, + "grad_norm": 3.5776193141937256, + "learning_rate": 2.06441393875396e-06, + "loss": 0.336, + "step": 7620 + }, + { + "epoch": 3.986029507768638, + "grad_norm": 6.825954437255859, + "learning_rate": 2.053854276663147e-06, + "loss": 0.4407, + "step": 7630 + }, + { + "epoch": 3.9912521216869044, + "grad_norm": 4.183834075927734, + "learning_rate": 2.043294614572334e-06, + "loss": 0.4037, + "step": 7640 + }, + { + "epoch": 3.9964747356051706, + "grad_norm": 8.502843856811523, + "learning_rate": 2.0327349524815206e-06, + "loss": 0.4222, + "step": 7650 + }, + { + "epoch": 4.002089045567306, + "grad_norm": 3.6274964809417725, + "learning_rate": 2.0221752903907077e-06, + "loss": 0.4458, + "step": 7660 + }, + { + "epoch": 4.007311659485572, + "grad_norm": 5.611544132232666, + "learning_rate": 2.0116156282998944e-06, + "loss": 0.3091, + "step": 7670 + }, + { + "epoch": 4.012534273403839, + "grad_norm": 4.243778228759766, + "learning_rate": 2.0010559662090815e-06, + "loss": 0.3231, + "step": 7680 + }, + { + "epoch": 4.017756887322105, + "grad_norm": 4.5843939781188965, + "learning_rate": 1.9904963041182686e-06, + "loss": 0.3072, + "step": 7690 + }, + { + "epoch": 4.02297950124037, + "grad_norm": 4.933235168457031, + "learning_rate": 1.9799366420274553e-06, + "loss": 0.3666, + "step": 7700 + }, + { + "epoch": 4.028202115158637, + "grad_norm": 6.930262565612793, + "learning_rate": 1.969376979936642e-06, + "loss": 0.3716, + "step": 7710 + }, + { + "epoch": 4.033424729076903, + "grad_norm": 2.2469685077667236, + "learning_rate": 1.958817317845829e-06, + "loss": 0.3, + "step": 7720 + }, + { + "epoch": 4.038647342995169, + "grad_norm": 9.199947357177734, + "learning_rate": 1.948257655755016e-06, + "loss": 0.3838, + "step": 7730 + }, + { + "epoch": 4.0438699569134355, + "grad_norm": 4.198123455047607, + "learning_rate": 1.937697993664203e-06, + "loss": 0.3468, + "step": 7740 + }, + { + "epoch": 4.049092570831701, + "grad_norm": 4.319494724273682, + "learning_rate": 1.92713833157339e-06, + "loss": 0.3096, + "step": 7750 + }, + { + "epoch": 4.054315184749967, + "grad_norm": 6.707279205322266, + "learning_rate": 1.9165786694825767e-06, + "loss": 0.3735, + "step": 7760 + }, + { + "epoch": 4.0595377986682335, + "grad_norm": 4.561187744140625, + "learning_rate": 1.9060190073917636e-06, + "loss": 0.3206, + "step": 7770 + }, + { + "epoch": 4.0647604125865, + "grad_norm": 4.949713706970215, + "learning_rate": 1.8954593453009507e-06, + "loss": 0.3828, + "step": 7780 + }, + { + "epoch": 4.069983026504766, + "grad_norm": 6.082244396209717, + "learning_rate": 1.8848996832101374e-06, + "loss": 0.3252, + "step": 7790 + }, + { + "epoch": 4.0752056404230315, + "grad_norm": 4.181164741516113, + "learning_rate": 1.8743400211193243e-06, + "loss": 0.2709, + "step": 7800 + }, + { + "epoch": 4.0752056404230315, + "eval_loss": 0.8169022798538208, + "eval_runtime": 46.3208, + "eval_samples_per_second": 36.744, + "eval_steps_per_second": 4.598, + "step": 7800 + }, + { + "epoch": 4.080428254341298, + "grad_norm": 8.991779327392578, + "learning_rate": 1.8637803590285112e-06, + "loss": 0.3389, + "step": 7810 + }, + { + "epoch": 4.085650868259564, + "grad_norm": 4.297403335571289, + "learning_rate": 1.853220696937698e-06, + "loss": 0.3446, + "step": 7820 + }, + { + "epoch": 4.09087348217783, + "grad_norm": 6.165748596191406, + "learning_rate": 1.8426610348468852e-06, + "loss": 0.3219, + "step": 7830 + }, + { + "epoch": 4.096096096096096, + "grad_norm": 5.063658237457275, + "learning_rate": 1.8321013727560719e-06, + "loss": 0.3625, + "step": 7840 + }, + { + "epoch": 4.101318710014362, + "grad_norm": 5.90833044052124, + "learning_rate": 1.8215417106652588e-06, + "loss": 0.3221, + "step": 7850 + }, + { + "epoch": 4.106541323932628, + "grad_norm": 5.1894049644470215, + "learning_rate": 1.8109820485744459e-06, + "loss": 0.3513, + "step": 7860 + }, + { + "epoch": 4.111763937850895, + "grad_norm": 7.121092796325684, + "learning_rate": 1.8004223864836326e-06, + "loss": 0.3506, + "step": 7870 + }, + { + "epoch": 4.116986551769161, + "grad_norm": 3.306450605392456, + "learning_rate": 1.7898627243928197e-06, + "loss": 0.2823, + "step": 7880 + }, + { + "epoch": 4.122209165687426, + "grad_norm": 4.995865345001221, + "learning_rate": 1.7793030623020066e-06, + "loss": 0.3005, + "step": 7890 + }, + { + "epoch": 4.1274317796056925, + "grad_norm": 6.791319370269775, + "learning_rate": 1.7687434002111933e-06, + "loss": 0.3675, + "step": 7900 + }, + { + "epoch": 4.132654393523959, + "grad_norm": 4.911620140075684, + "learning_rate": 1.7581837381203804e-06, + "loss": 0.3559, + "step": 7910 + }, + { + "epoch": 4.137877007442225, + "grad_norm": 5.022863864898682, + "learning_rate": 1.747624076029567e-06, + "loss": 0.3093, + "step": 7920 + }, + { + "epoch": 4.143099621360491, + "grad_norm": 6.013493061065674, + "learning_rate": 1.7370644139387542e-06, + "loss": 0.3768, + "step": 7930 + }, + { + "epoch": 4.148322235278757, + "grad_norm": 6.149303913116455, + "learning_rate": 1.726504751847941e-06, + "loss": 0.3327, + "step": 7940 + }, + { + "epoch": 4.153544849197023, + "grad_norm": 7.961324691772461, + "learning_rate": 1.7159450897571277e-06, + "loss": 0.3803, + "step": 7950 + }, + { + "epoch": 4.158767463115289, + "grad_norm": 8.543217658996582, + "learning_rate": 1.7053854276663148e-06, + "loss": 0.3514, + "step": 7960 + }, + { + "epoch": 4.163990077033556, + "grad_norm": 6.827886581420898, + "learning_rate": 1.6948257655755017e-06, + "loss": 0.3535, + "step": 7970 + }, + { + "epoch": 4.169212690951821, + "grad_norm": 3.4540553092956543, + "learning_rate": 1.6842661034846886e-06, + "loss": 0.3159, + "step": 7980 + }, + { + "epoch": 4.174435304870087, + "grad_norm": 5.032063961029053, + "learning_rate": 1.6737064413938755e-06, + "loss": 0.3044, + "step": 7990 + }, + { + "epoch": 4.179657918788354, + "grad_norm": 3.578277349472046, + "learning_rate": 1.6631467793030626e-06, + "loss": 0.3876, + "step": 8000 + }, + { + "epoch": 4.179657918788354, + "eval_loss": 0.8253039121627808, + "eval_runtime": 46.3214, + "eval_samples_per_second": 36.743, + "eval_steps_per_second": 4.598, + "step": 8000 + }, + { + "epoch": 4.18488053270662, + "grad_norm": 5.262123107910156, + "learning_rate": 1.6525871172122493e-06, + "loss": 0.3285, + "step": 8010 + }, + { + "epoch": 4.190103146624886, + "grad_norm": 4.203868389129639, + "learning_rate": 1.6420274551214362e-06, + "loss": 0.3721, + "step": 8020 + }, + { + "epoch": 4.1953257605431515, + "grad_norm": 4.308624744415283, + "learning_rate": 1.6314677930306231e-06, + "loss": 0.3702, + "step": 8030 + }, + { + "epoch": 4.200548374461418, + "grad_norm": 3.027498722076416, + "learning_rate": 1.62090813093981e-06, + "loss": 0.2974, + "step": 8040 + }, + { + "epoch": 4.205770988379684, + "grad_norm": 3.877850294113159, + "learning_rate": 1.6103484688489971e-06, + "loss": 0.3333, + "step": 8050 + }, + { + "epoch": 4.21099360229795, + "grad_norm": 3.6501309871673584, + "learning_rate": 1.5997888067581838e-06, + "loss": 0.3098, + "step": 8060 + }, + { + "epoch": 4.216216216216216, + "grad_norm": 3.677426815032959, + "learning_rate": 1.5892291446673707e-06, + "loss": 0.3724, + "step": 8070 + }, + { + "epoch": 4.221438830134482, + "grad_norm": 7.220156669616699, + "learning_rate": 1.5786694825765578e-06, + "loss": 0.3313, + "step": 8080 + }, + { + "epoch": 4.226661444052748, + "grad_norm": 5.49832820892334, + "learning_rate": 1.5681098204857445e-06, + "loss": 0.3453, + "step": 8090 + }, + { + "epoch": 4.231884057971015, + "grad_norm": 2.545565605163574, + "learning_rate": 1.5575501583949316e-06, + "loss": 0.3117, + "step": 8100 + }, + { + "epoch": 4.237106671889281, + "grad_norm": 7.097994804382324, + "learning_rate": 1.5469904963041185e-06, + "loss": 0.3841, + "step": 8110 + }, + { + "epoch": 4.242329285807546, + "grad_norm": 5.2458391189575195, + "learning_rate": 1.5364308342133052e-06, + "loss": 0.3832, + "step": 8120 + }, + { + "epoch": 4.247551899725813, + "grad_norm": 7.351592540740967, + "learning_rate": 1.5258711721224923e-06, + "loss": 0.3717, + "step": 8130 + }, + { + "epoch": 4.252774513644079, + "grad_norm": 6.467917442321777, + "learning_rate": 1.515311510031679e-06, + "loss": 0.3783, + "step": 8140 + }, + { + "epoch": 4.257997127562345, + "grad_norm": 3.184577703475952, + "learning_rate": 1.5047518479408659e-06, + "loss": 0.349, + "step": 8150 + }, + { + "epoch": 4.2632197414806114, + "grad_norm": 5.695311546325684, + "learning_rate": 1.494192185850053e-06, + "loss": 0.3756, + "step": 8160 + }, + { + "epoch": 4.268442355398877, + "grad_norm": 3.671365737915039, + "learning_rate": 1.4836325237592397e-06, + "loss": 0.3132, + "step": 8170 + }, + { + "epoch": 4.273664969317143, + "grad_norm": 3.2261226177215576, + "learning_rate": 1.4730728616684268e-06, + "loss": 0.3501, + "step": 8180 + }, + { + "epoch": 4.278887583235409, + "grad_norm": 5.452921390533447, + "learning_rate": 1.4625131995776137e-06, + "loss": 0.3941, + "step": 8190 + }, + { + "epoch": 4.284110197153676, + "grad_norm": 4.698659896850586, + "learning_rate": 1.4519535374868004e-06, + "loss": 0.3427, + "step": 8200 + }, + { + "epoch": 4.284110197153676, + "eval_loss": 0.8172882199287415, + "eval_runtime": 46.2511, + "eval_samples_per_second": 36.799, + "eval_steps_per_second": 4.605, + "step": 8200 + }, + { + "epoch": 4.289332811071942, + "grad_norm": 5.259459018707275, + "learning_rate": 1.4413938753959875e-06, + "loss": 0.31, + "step": 8210 + }, + { + "epoch": 4.294555424990207, + "grad_norm": 3.4094529151916504, + "learning_rate": 1.4308342133051744e-06, + "loss": 0.3506, + "step": 8220 + }, + { + "epoch": 4.299778038908474, + "grad_norm": 4.930992126464844, + "learning_rate": 1.4202745512143613e-06, + "loss": 0.3523, + "step": 8230 + }, + { + "epoch": 4.30500065282674, + "grad_norm": 5.238248348236084, + "learning_rate": 1.4097148891235482e-06, + "loss": 0.3604, + "step": 8240 + }, + { + "epoch": 4.310223266745006, + "grad_norm": 4.543814659118652, + "learning_rate": 1.3991552270327348e-06, + "loss": 0.3138, + "step": 8250 + }, + { + "epoch": 4.315445880663272, + "grad_norm": 6.896186351776123, + "learning_rate": 1.388595564941922e-06, + "loss": 0.3437, + "step": 8260 + }, + { + "epoch": 4.320668494581538, + "grad_norm": 3.8952276706695557, + "learning_rate": 1.3780359028511089e-06, + "loss": 0.3063, + "step": 8270 + }, + { + "epoch": 4.325891108499804, + "grad_norm": 5.249785900115967, + "learning_rate": 1.3674762407602957e-06, + "loss": 0.3514, + "step": 8280 + }, + { + "epoch": 4.3311137224180705, + "grad_norm": 5.353476524353027, + "learning_rate": 1.3569165786694826e-06, + "loss": 0.3733, + "step": 8290 + }, + { + "epoch": 4.336336336336337, + "grad_norm": 7.042110443115234, + "learning_rate": 1.3463569165786698e-06, + "loss": 0.3523, + "step": 8300 + }, + { + "epoch": 4.341558950254602, + "grad_norm": 3.094001531600952, + "learning_rate": 1.3357972544878564e-06, + "loss": 0.3341, + "step": 8310 + }, + { + "epoch": 4.346781564172868, + "grad_norm": 6.122366905212402, + "learning_rate": 1.3252375923970433e-06, + "loss": 0.372, + "step": 8320 + }, + { + "epoch": 4.352004178091135, + "grad_norm": 5.153871536254883, + "learning_rate": 1.3146779303062304e-06, + "loss": 0.328, + "step": 8330 + }, + { + "epoch": 4.357226792009401, + "grad_norm": 6.744436264038086, + "learning_rate": 1.3041182682154171e-06, + "loss": 0.3581, + "step": 8340 + }, + { + "epoch": 4.362449405927666, + "grad_norm": 4.01329231262207, + "learning_rate": 1.2935586061246042e-06, + "loss": 0.3457, + "step": 8350 + }, + { + "epoch": 4.367672019845933, + "grad_norm": 7.351885795593262, + "learning_rate": 1.2829989440337911e-06, + "loss": 0.3765, + "step": 8360 + }, + { + "epoch": 4.372894633764199, + "grad_norm": 5.235280990600586, + "learning_rate": 1.2724392819429778e-06, + "loss": 0.3262, + "step": 8370 + }, + { + "epoch": 4.378117247682465, + "grad_norm": 7.142249584197998, + "learning_rate": 1.261879619852165e-06, + "loss": 0.4138, + "step": 8380 + }, + { + "epoch": 4.3833398616007315, + "grad_norm": 7.363052845001221, + "learning_rate": 1.2513199577613516e-06, + "loss": 0.2946, + "step": 8390 + }, + { + "epoch": 4.388562475518997, + "grad_norm": 7.17656135559082, + "learning_rate": 1.2407602956705387e-06, + "loss": 0.3248, + "step": 8400 + }, + { + "epoch": 4.388562475518997, + "eval_loss": 0.8222038149833679, + "eval_runtime": 46.2143, + "eval_samples_per_second": 36.828, + "eval_steps_per_second": 4.609, + "step": 8400 + }, + { + "epoch": 4.393785089437263, + "grad_norm": 4.6823344230651855, + "learning_rate": 1.2302006335797254e-06, + "loss": 0.32, + "step": 8410 + }, + { + "epoch": 4.3990077033555295, + "grad_norm": 6.020813941955566, + "learning_rate": 1.2196409714889125e-06, + "loss": 0.2931, + "step": 8420 + }, + { + "epoch": 4.404230317273796, + "grad_norm": 6.5272088050842285, + "learning_rate": 1.2090813093980994e-06, + "loss": 0.3807, + "step": 8430 + }, + { + "epoch": 4.409452931192062, + "grad_norm": 6.227059364318848, + "learning_rate": 1.1985216473072863e-06, + "loss": 0.3693, + "step": 8440 + }, + { + "epoch": 4.4146755451103274, + "grad_norm": 5.219118118286133, + "learning_rate": 1.1879619852164732e-06, + "loss": 0.3124, + "step": 8450 + }, + { + "epoch": 4.419898159028594, + "grad_norm": 3.7675588130950928, + "learning_rate": 1.17740232312566e-06, + "loss": 0.3157, + "step": 8460 + }, + { + "epoch": 4.42512077294686, + "grad_norm": 3.7476587295532227, + "learning_rate": 1.166842661034847e-06, + "loss": 0.3217, + "step": 8470 + }, + { + "epoch": 4.430343386865126, + "grad_norm": 7.094352722167969, + "learning_rate": 1.1562829989440339e-06, + "loss": 0.3197, + "step": 8480 + }, + { + "epoch": 4.435566000783392, + "grad_norm": 3.2805261611938477, + "learning_rate": 1.1457233368532208e-06, + "loss": 0.2653, + "step": 8490 + }, + { + "epoch": 4.440788614701658, + "grad_norm": 4.148542404174805, + "learning_rate": 1.1351636747624077e-06, + "loss": 0.3137, + "step": 8500 + }, + { + "epoch": 4.446011228619924, + "grad_norm": 5.286221981048584, + "learning_rate": 1.1246040126715946e-06, + "loss": 0.3123, + "step": 8510 + }, + { + "epoch": 4.4512338425381905, + "grad_norm": 2.6632936000823975, + "learning_rate": 1.1140443505807815e-06, + "loss": 0.3666, + "step": 8520 + }, + { + "epoch": 4.456456456456457, + "grad_norm": 3.3885107040405273, + "learning_rate": 1.1034846884899684e-06, + "loss": 0.3369, + "step": 8530 + }, + { + "epoch": 4.461679070374722, + "grad_norm": 4.203908443450928, + "learning_rate": 1.0929250263991553e-06, + "loss": 0.3557, + "step": 8540 + }, + { + "epoch": 4.4669016842929885, + "grad_norm": 3.4202535152435303, + "learning_rate": 1.0823653643083422e-06, + "loss": 0.3843, + "step": 8550 + }, + { + "epoch": 4.472124298211255, + "grad_norm": 7.479564189910889, + "learning_rate": 1.0718057022175293e-06, + "loss": 0.3472, + "step": 8560 + }, + { + "epoch": 4.477346912129521, + "grad_norm": 8.25882339477539, + "learning_rate": 1.061246040126716e-06, + "loss": 0.3083, + "step": 8570 + }, + { + "epoch": 4.4825695260477865, + "grad_norm": 3.9930543899536133, + "learning_rate": 1.0506863780359029e-06, + "loss": 0.3187, + "step": 8580 + }, + { + "epoch": 4.487792139966053, + "grad_norm": 7.148872375488281, + "learning_rate": 1.0401267159450897e-06, + "loss": 0.325, + "step": 8590 + }, + { + "epoch": 4.493014753884319, + "grad_norm": 4.759425163269043, + "learning_rate": 1.0295670538542769e-06, + "loss": 0.348, + "step": 8600 + }, + { + "epoch": 4.493014753884319, + "eval_loss": 0.8176538944244385, + "eval_runtime": 46.2968, + "eval_samples_per_second": 36.763, + "eval_steps_per_second": 4.601, + "step": 8600 + }, + { + "epoch": 4.498237367802585, + "grad_norm": 6.194363594055176, + "learning_rate": 1.0190073917634638e-06, + "loss": 0.3826, + "step": 8610 + }, + { + "epoch": 4.503459981720852, + "grad_norm": 6.426743984222412, + "learning_rate": 1.0084477296726504e-06, + "loss": 0.3398, + "step": 8620 + }, + { + "epoch": 4.508682595639117, + "grad_norm": 5.054692268371582, + "learning_rate": 9.978880675818373e-07, + "loss": 0.3245, + "step": 8630 + }, + { + "epoch": 4.513905209557383, + "grad_norm": 6.68280029296875, + "learning_rate": 9.873284054910244e-07, + "loss": 0.3174, + "step": 8640 + }, + { + "epoch": 4.51912782347565, + "grad_norm": 5.14640998840332, + "learning_rate": 9.767687434002113e-07, + "loss": 0.3552, + "step": 8650 + }, + { + "epoch": 4.524350437393916, + "grad_norm": 6.840033531188965, + "learning_rate": 9.662090813093982e-07, + "loss": 0.3612, + "step": 8660 + }, + { + "epoch": 4.529573051312182, + "grad_norm": 5.918385028839111, + "learning_rate": 9.556494192185851e-07, + "loss": 0.3532, + "step": 8670 + }, + { + "epoch": 4.5347956652304475, + "grad_norm": 9.579689979553223, + "learning_rate": 9.45089757127772e-07, + "loss": 0.3814, + "step": 8680 + }, + { + "epoch": 4.540018279148714, + "grad_norm": 2.852560520172119, + "learning_rate": 9.345300950369589e-07, + "loss": 0.324, + "step": 8690 + }, + { + "epoch": 4.54524089306698, + "grad_norm": 3.4993200302124023, + "learning_rate": 9.239704329461457e-07, + "loss": 0.2616, + "step": 8700 + }, + { + "epoch": 4.550463506985246, + "grad_norm": 5.047815799713135, + "learning_rate": 9.134107708553327e-07, + "loss": 0.3717, + "step": 8710 + }, + { + "epoch": 4.555686120903513, + "grad_norm": 6.2444610595703125, + "learning_rate": 9.028511087645196e-07, + "loss": 0.322, + "step": 8720 + }, + { + "epoch": 4.560908734821778, + "grad_norm": 7.351525783538818, + "learning_rate": 8.922914466737065e-07, + "loss": 0.3236, + "step": 8730 + }, + { + "epoch": 4.566131348740044, + "grad_norm": 4.019290447235107, + "learning_rate": 8.817317845828933e-07, + "loss": 0.3372, + "step": 8740 + }, + { + "epoch": 4.571353962658311, + "grad_norm": 3.271005153656006, + "learning_rate": 8.711721224920804e-07, + "loss": 0.291, + "step": 8750 + }, + { + "epoch": 4.576576576576577, + "grad_norm": 9.036681175231934, + "learning_rate": 8.606124604012672e-07, + "loss": 0.3622, + "step": 8760 + }, + { + "epoch": 4.581799190494842, + "grad_norm": 4.411876678466797, + "learning_rate": 8.500527983104541e-07, + "loss": 0.4119, + "step": 8770 + }, + { + "epoch": 4.587021804413109, + "grad_norm": 3.062072992324829, + "learning_rate": 8.394931362196411e-07, + "loss": 0.3447, + "step": 8780 + }, + { + "epoch": 4.592244418331375, + "grad_norm": 7.340733528137207, + "learning_rate": 8.28933474128828e-07, + "loss": 0.3253, + "step": 8790 + }, + { + "epoch": 4.597467032249641, + "grad_norm": 7.044495582580566, + "learning_rate": 8.183738120380148e-07, + "loss": 0.3518, + "step": 8800 + }, + { + "epoch": 4.597467032249641, + "eval_loss": 0.8164646029472351, + "eval_runtime": 46.4842, + "eval_samples_per_second": 36.615, + "eval_steps_per_second": 4.582, + "step": 8800 + }, + { + "epoch": 4.6026896461679065, + "grad_norm": 2.9957234859466553, + "learning_rate": 8.078141499472017e-07, + "loss": 0.3615, + "step": 8810 + }, + { + "epoch": 4.607912260086173, + "grad_norm": 6.641829967498779, + "learning_rate": 7.972544878563887e-07, + "loss": 0.3345, + "step": 8820 + }, + { + "epoch": 4.613134874004439, + "grad_norm": 3.980393409729004, + "learning_rate": 7.866948257655756e-07, + "loss": 0.3427, + "step": 8830 + }, + { + "epoch": 4.618357487922705, + "grad_norm": 3.8211607933044434, + "learning_rate": 7.761351636747625e-07, + "loss": 0.3466, + "step": 8840 + }, + { + "epoch": 4.623580101840972, + "grad_norm": 4.5899658203125, + "learning_rate": 7.655755015839495e-07, + "loss": 0.2956, + "step": 8850 + }, + { + "epoch": 4.628802715759237, + "grad_norm": 4.0807719230651855, + "learning_rate": 7.550158394931363e-07, + "loss": 0.2967, + "step": 8860 + }, + { + "epoch": 4.634025329677503, + "grad_norm": 3.9495162963867188, + "learning_rate": 7.444561774023232e-07, + "loss": 0.2952, + "step": 8870 + }, + { + "epoch": 4.63924794359577, + "grad_norm": 3.607835054397583, + "learning_rate": 7.338965153115101e-07, + "loss": 0.3354, + "step": 8880 + }, + { + "epoch": 4.644470557514036, + "grad_norm": 5.947987079620361, + "learning_rate": 7.233368532206971e-07, + "loss": 0.3429, + "step": 8890 + }, + { + "epoch": 4.649693171432302, + "grad_norm": 3.4477195739746094, + "learning_rate": 7.12777191129884e-07, + "loss": 0.3291, + "step": 8900 + }, + { + "epoch": 4.654915785350568, + "grad_norm": 3.3445801734924316, + "learning_rate": 7.022175290390708e-07, + "loss": 0.2841, + "step": 8910 + }, + { + "epoch": 4.660138399268834, + "grad_norm": 6.880585670471191, + "learning_rate": 6.916578669482576e-07, + "loss": 0.3917, + "step": 8920 + }, + { + "epoch": 4.6653610131871, + "grad_norm": 4.318111896514893, + "learning_rate": 6.810982048574447e-07, + "loss": 0.2707, + "step": 8930 + }, + { + "epoch": 4.6705836271053665, + "grad_norm": 3.5282676219940186, + "learning_rate": 6.705385427666315e-07, + "loss": 0.2921, + "step": 8940 + }, + { + "epoch": 4.675806241023633, + "grad_norm": 4.396801948547363, + "learning_rate": 6.599788806758183e-07, + "loss": 0.3181, + "step": 8950 + }, + { + "epoch": 4.681028854941898, + "grad_norm": 4.138298034667969, + "learning_rate": 6.494192185850054e-07, + "loss": 0.3129, + "step": 8960 + }, + { + "epoch": 4.686251468860164, + "grad_norm": 3.6016428470611572, + "learning_rate": 6.388595564941922e-07, + "loss": 0.3396, + "step": 8970 + }, + { + "epoch": 4.691474082778431, + "grad_norm": 3.1963398456573486, + "learning_rate": 6.282998944033791e-07, + "loss": 0.2742, + "step": 8980 + }, + { + "epoch": 4.696696696696697, + "grad_norm": 5.4085259437561035, + "learning_rate": 6.17740232312566e-07, + "loss": 0.3437, + "step": 8990 + }, + { + "epoch": 4.701919310614963, + "grad_norm": 4.243213653564453, + "learning_rate": 6.071805702217529e-07, + "loss": 0.3385, + "step": 9000 + }, + { + "epoch": 4.701919310614963, + "eval_loss": 0.8144433498382568, + "eval_runtime": 46.2975, + "eval_samples_per_second": 36.762, + "eval_steps_per_second": 4.601, + "step": 9000 + }, + { + "epoch": 4.707141924533229, + "grad_norm": 5.212153434753418, + "learning_rate": 5.966209081309398e-07, + "loss": 0.2875, + "step": 9010 + }, + { + "epoch": 4.712364538451495, + "grad_norm": 4.833583354949951, + "learning_rate": 5.860612460401267e-07, + "loss": 0.2909, + "step": 9020 + }, + { + "epoch": 4.717587152369761, + "grad_norm": 4.737828731536865, + "learning_rate": 5.755015839493137e-07, + "loss": 0.3342, + "step": 9030 + }, + { + "epoch": 4.7228097662880275, + "grad_norm": 6.1549153327941895, + "learning_rate": 5.649419218585006e-07, + "loss": 0.3232, + "step": 9040 + }, + { + "epoch": 4.728032380206293, + "grad_norm": 9.347796440124512, + "learning_rate": 5.543822597676875e-07, + "loss": 0.3775, + "step": 9050 + }, + { + "epoch": 4.733254994124559, + "grad_norm": 4.438070774078369, + "learning_rate": 5.438225976768744e-07, + "loss": 0.3284, + "step": 9060 + }, + { + "epoch": 4.7384776080428255, + "grad_norm": 6.349786758422852, + "learning_rate": 5.332629355860613e-07, + "loss": 0.3574, + "step": 9070 + }, + { + "epoch": 4.743700221961092, + "grad_norm": 3.8678805828094482, + "learning_rate": 5.227032734952482e-07, + "loss": 0.3484, + "step": 9080 + }, + { + "epoch": 4.748922835879357, + "grad_norm": 5.201286315917969, + "learning_rate": 5.121436114044351e-07, + "loss": 0.3131, + "step": 9090 + }, + { + "epoch": 4.754145449797623, + "grad_norm": 5.971492290496826, + "learning_rate": 5.01583949313622e-07, + "loss": 0.3042, + "step": 9100 + }, + { + "epoch": 4.75936806371589, + "grad_norm": 7.516099452972412, + "learning_rate": 4.910242872228089e-07, + "loss": 0.3178, + "step": 9110 + }, + { + "epoch": 4.764590677634156, + "grad_norm": 6.501901149749756, + "learning_rate": 4.804646251319958e-07, + "loss": 0.4111, + "step": 9120 + }, + { + "epoch": 4.769813291552422, + "grad_norm": 5.679157257080078, + "learning_rate": 4.699049630411827e-07, + "loss": 0.3068, + "step": 9130 + }, + { + "epoch": 4.775035905470688, + "grad_norm": 7.733293533325195, + "learning_rate": 4.5934530095036963e-07, + "loss": 0.3639, + "step": 9140 + }, + { + "epoch": 4.780258519388954, + "grad_norm": 4.554075241088867, + "learning_rate": 4.487856388595566e-07, + "loss": 0.2758, + "step": 9150 + }, + { + "epoch": 4.78548113330722, + "grad_norm": 4.692908763885498, + "learning_rate": 4.3822597676874343e-07, + "loss": 0.3327, + "step": 9160 + }, + { + "epoch": 4.7907037472254865, + "grad_norm": 5.9607977867126465, + "learning_rate": 4.276663146779304e-07, + "loss": 0.3769, + "step": 9170 + }, + { + "epoch": 4.795926361143753, + "grad_norm": 4.254061222076416, + "learning_rate": 4.171066525871172e-07, + "loss": 0.323, + "step": 9180 + }, + { + "epoch": 4.801148975062018, + "grad_norm": 3.6072418689727783, + "learning_rate": 4.0654699049630417e-07, + "loss": 0.3145, + "step": 9190 + }, + { + "epoch": 4.8063715889802845, + "grad_norm": 6.394986152648926, + "learning_rate": 3.95987328405491e-07, + "loss": 0.3525, + "step": 9200 + }, + { + "epoch": 4.8063715889802845, + "eval_loss": 0.8159402012825012, + "eval_runtime": 46.2685, + "eval_samples_per_second": 36.785, + "eval_steps_per_second": 4.604, + "step": 9200 + }, + { + "epoch": 4.811594202898551, + "grad_norm": 6.0406036376953125, + "learning_rate": 3.8542766631467796e-07, + "loss": 0.3065, + "step": 9210 + }, + { + "epoch": 4.816816816816817, + "grad_norm": 4.4743523597717285, + "learning_rate": 3.7486800422386486e-07, + "loss": 0.2928, + "step": 9220 + }, + { + "epoch": 4.822039430735083, + "grad_norm": 3.1444690227508545, + "learning_rate": 3.6430834213305176e-07, + "loss": 0.2871, + "step": 9230 + }, + { + "epoch": 4.827262044653349, + "grad_norm": 4.292675018310547, + "learning_rate": 3.537486800422387e-07, + "loss": 0.2826, + "step": 9240 + }, + { + "epoch": 4.832484658571615, + "grad_norm": 6.358124732971191, + "learning_rate": 3.431890179514256e-07, + "loss": 0.3535, + "step": 9250 + }, + { + "epoch": 4.837707272489881, + "grad_norm": 2.386103868484497, + "learning_rate": 3.326293558606125e-07, + "loss": 0.3524, + "step": 9260 + }, + { + "epoch": 4.842929886408148, + "grad_norm": 4.566143989562988, + "learning_rate": 3.220696937697994e-07, + "loss": 0.3019, + "step": 9270 + }, + { + "epoch": 4.848152500326413, + "grad_norm": 3.1321911811828613, + "learning_rate": 3.115100316789863e-07, + "loss": 0.2784, + "step": 9280 + }, + { + "epoch": 4.853375114244679, + "grad_norm": 7.200014114379883, + "learning_rate": 3.0095036958817324e-07, + "loss": 0.3235, + "step": 9290 + }, + { + "epoch": 4.8585977281629456, + "grad_norm": 3.960508108139038, + "learning_rate": 2.9039070749736014e-07, + "loss": 0.3063, + "step": 9300 + }, + { + "epoch": 4.863820342081212, + "grad_norm": 6.61761999130249, + "learning_rate": 2.7983104540654703e-07, + "loss": 0.3467, + "step": 9310 + }, + { + "epoch": 4.869042955999478, + "grad_norm": 7.042169570922852, + "learning_rate": 2.6927138331573393e-07, + "loss": 0.305, + "step": 9320 + }, + { + "epoch": 4.8742655699177435, + "grad_norm": 6.283872604370117, + "learning_rate": 2.587117212249208e-07, + "loss": 0.3645, + "step": 9330 + }, + { + "epoch": 4.87948818383601, + "grad_norm": 4.147797584533691, + "learning_rate": 2.481520591341077e-07, + "loss": 0.3352, + "step": 9340 + }, + { + "epoch": 4.884710797754276, + "grad_norm": 7.539230823516846, + "learning_rate": 2.3759239704329462e-07, + "loss": 0.3125, + "step": 9350 + }, + { + "epoch": 4.889933411672542, + "grad_norm": 6.097413063049316, + "learning_rate": 2.2703273495248154e-07, + "loss": 0.3444, + "step": 9360 + }, + { + "epoch": 4.895156025590808, + "grad_norm": 6.565594673156738, + "learning_rate": 2.1647307286166844e-07, + "loss": 0.3436, + "step": 9370 + }, + { + "epoch": 4.900378639509074, + "grad_norm": 7.224295616149902, + "learning_rate": 2.0591341077085536e-07, + "loss": 0.3328, + "step": 9380 + }, + { + "epoch": 4.90560125342734, + "grad_norm": 3.462695598602295, + "learning_rate": 1.9535374868004226e-07, + "loss": 0.3503, + "step": 9390 + }, + { + "epoch": 4.910823867345607, + "grad_norm": 5.290524482727051, + "learning_rate": 1.8479408658922918e-07, + "loss": 0.3291, + "step": 9400 + }, + { + "epoch": 4.910823867345607, + "eval_loss": 0.813969075679779, + "eval_runtime": 46.1949, + "eval_samples_per_second": 36.844, + "eval_steps_per_second": 4.611, + "step": 9400 + }, + { + "epoch": 4.916046481263873, + "grad_norm": 8.547982215881348, + "learning_rate": 1.7423442449841608e-07, + "loss": 0.379, + "step": 9410 + }, + { + "epoch": 4.921269095182138, + "grad_norm": 5.874237060546875, + "learning_rate": 1.6367476240760297e-07, + "loss": 0.341, + "step": 9420 + }, + { + "epoch": 4.926491709100405, + "grad_norm": 4.818123817443848, + "learning_rate": 1.5311510031678987e-07, + "loss": 0.3042, + "step": 9430 + }, + { + "epoch": 4.931714323018671, + "grad_norm": 6.666902542114258, + "learning_rate": 1.4255543822597677e-07, + "loss": 0.2906, + "step": 9440 + }, + { + "epoch": 4.936936936936937, + "grad_norm": 7.183316707611084, + "learning_rate": 1.319957761351637e-07, + "loss": 0.3354, + "step": 9450 + }, + { + "epoch": 4.942159550855203, + "grad_norm": 4.611485004425049, + "learning_rate": 1.2143611404435059e-07, + "loss": 0.2954, + "step": 9460 + }, + { + "epoch": 4.947382164773469, + "grad_norm": 3.7083687782287598, + "learning_rate": 1.108764519535375e-07, + "loss": 0.289, + "step": 9470 + }, + { + "epoch": 4.952604778691735, + "grad_norm": 5.882250785827637, + "learning_rate": 1.003167898627244e-07, + "loss": 0.3278, + "step": 9480 + }, + { + "epoch": 4.957827392610001, + "grad_norm": 2.812523126602173, + "learning_rate": 8.97571277719113e-08, + "loss": 0.3284, + "step": 9490 + }, + { + "epoch": 4.963050006528268, + "grad_norm": 4.554064750671387, + "learning_rate": 7.91974656810982e-08, + "loss": 0.3259, + "step": 9500 + }, + { + "epoch": 4.968272620446534, + "grad_norm": 7.692569732666016, + "learning_rate": 6.863780359028511e-08, + "loss": 0.3239, + "step": 9510 + }, + { + "epoch": 4.973495234364799, + "grad_norm": 3.9875969886779785, + "learning_rate": 5.807814149947202e-08, + "loss": 0.2938, + "step": 9520 + }, + { + "epoch": 4.978717848283066, + "grad_norm": 6.04777193069458, + "learning_rate": 4.7518479408658926e-08, + "loss": 0.3239, + "step": 9530 + }, + { + "epoch": 4.983940462201332, + "grad_norm": 5.224954128265381, + "learning_rate": 3.695881731784583e-08, + "loss": 0.3139, + "step": 9540 + }, + { + "epoch": 4.989163076119598, + "grad_norm": 4.514588356018066, + "learning_rate": 2.6399155227032736e-08, + "loss": 0.3265, + "step": 9550 + }, + { + "epoch": 4.994385690037864, + "grad_norm": 7.125428199768066, + "learning_rate": 1.5839493136219642e-08, + "loss": 0.3744, + "step": 9560 + }, + { + "epoch": 4.99960830395613, + "grad_norm": 9.685089111328125, + "learning_rate": 5.279831045406547e-09, + "loss": 0.3747, + "step": 9570 + } + ], + "logging_steps": 10, + "max_steps": 9570, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.0361621271753523e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}