{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8920877793436682, "eval_steps": 100, "global_step": 4431, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 0.0001999999444378697, "loss": 0.7972, "step": 10 }, { "epoch": 0.0, "learning_rate": 0.0001999997777515405, "loss": 0.221, "step": 20 }, { "epoch": 0.01, "learning_rate": 0.0001999994999411976, "loss": 0.1313, "step": 30 }, { "epoch": 0.01, "learning_rate": 0.00019999911100714978, "loss": 0.1169, "step": 40 }, { "epoch": 0.01, "learning_rate": 0.00019999861094982923, "loss": 0.1353, "step": 50 }, { "epoch": 0.01, "learning_rate": 0.00019999799976979157, "loss": 0.1355, "step": 60 }, { "epoch": 0.01, "learning_rate": 0.00019999727746771603, "loss": 0.0991, "step": 70 }, { "epoch": 0.02, "learning_rate": 0.00019999644404440528, "loss": 0.0995, "step": 80 }, { "epoch": 0.02, "learning_rate": 0.00019999549950078536, "loss": 0.1175, "step": 90 }, { "epoch": 0.02, "learning_rate": 0.000199994443837906, "loss": 0.0941, "step": 100 }, { "epoch": 0.02, "eval_loss": 0.09952569007873535, "eval_runtime": 659.9109, "eval_samples_per_second": 11.15, "eval_steps_per_second": 2.788, "step": 100 }, { "epoch": 0.02, "learning_rate": 0.00019999327705694028, "loss": 0.09, "step": 110 }, { "epoch": 0.02, "learning_rate": 0.00019999199915918467, "loss": 0.1044, "step": 120 }, { "epoch": 0.03, "learning_rate": 0.00019999061014605934, "loss": 0.0978, "step": 130 }, { "epoch": 0.03, "learning_rate": 0.00019998911001910778, "loss": 0.0963, "step": 140 }, { "epoch": 0.03, "learning_rate": 0.000199987498779997, "loss": 0.0955, "step": 150 }, { "epoch": 0.03, "learning_rate": 0.00019998577643051744, "loss": 0.0994, "step": 160 }, { "epoch": 0.03, "learning_rate": 0.00019998394297258311, "loss": 0.0921, "step": 170 }, { "epoch": 0.04, "learning_rate": 0.0001999819984082314, "loss": 0.087, "step": 180 }, { "epoch": 0.04, "learning_rate": 0.00019997994273962316, "loss": 0.0826, "step": 190 }, { "epoch": 0.04, "learning_rate": 0.00019997777596904282, "loss": 0.0903, "step": 200 }, { "epoch": 0.04, "eval_loss": 0.090216726064682, "eval_runtime": 649.064, "eval_samples_per_second": 11.336, "eval_steps_per_second": 2.835, "step": 200 }, { "epoch": 0.04, "learning_rate": 0.00019997549809889811, "loss": 0.0949, "step": 210 }, { "epoch": 0.04, "learning_rate": 0.00019997310913172028, "loss": 0.0902, "step": 220 }, { "epoch": 0.05, "learning_rate": 0.00019997060907016415, "loss": 0.0792, "step": 230 }, { "epoch": 0.05, "learning_rate": 0.0001999679979170078, "loss": 0.0901, "step": 240 }, { "epoch": 0.05, "learning_rate": 0.0001999652756751529, "loss": 0.1035, "step": 250 }, { "epoch": 0.05, "learning_rate": 0.00019996244234762452, "loss": 0.0875, "step": 260 }, { "epoch": 0.05, "learning_rate": 0.00019995949793757118, "loss": 0.0718, "step": 270 }, { "epoch": 0.06, "learning_rate": 0.00019995644244826481, "loss": 0.0799, "step": 280 }, { "epoch": 0.06, "learning_rate": 0.0001999532758831008, "loss": 0.0767, "step": 290 }, { "epoch": 0.06, "learning_rate": 0.000199949998245598, "loss": 0.089, "step": 300 }, { "epoch": 0.06, "eval_loss": 0.08015166223049164, "eval_runtime": 646.37, "eval_samples_per_second": 11.384, "eval_steps_per_second": 2.847, "step": 300 }, { "epoch": 0.06, "learning_rate": 0.00019994660953939864, "loss": 0.0833, "step": 310 }, { "epoch": 0.06, "learning_rate": 0.00019994310976826837, "loss": 0.0799, "step": 320 }, { "epoch": 0.07, "learning_rate": 0.00019993949893609635, "loss": 0.0747, "step": 330 }, { "epoch": 0.07, "learning_rate": 0.00019993577704689505, "loss": 0.0888, "step": 340 }, { "epoch": 0.07, "learning_rate": 0.0001999319441048004, "loss": 0.0872, "step": 350 }, { "epoch": 0.07, "learning_rate": 0.00019992800011407172, "loss": 0.0762, "step": 360 }, { "epoch": 0.07, "learning_rate": 0.00019992394507909174, "loss": 0.0746, "step": 370 }, { "epoch": 0.08, "learning_rate": 0.00019991977900436658, "loss": 0.0713, "step": 380 }, { "epoch": 0.08, "learning_rate": 0.0001999155018945258, "loss": 0.077, "step": 390 }, { "epoch": 0.08, "learning_rate": 0.00019991111375432225, "loss": 0.0619, "step": 400 }, { "epoch": 0.08, "eval_loss": 0.07623383402824402, "eval_runtime": 654.0144, "eval_samples_per_second": 11.251, "eval_steps_per_second": 2.813, "step": 400 }, { "epoch": 0.08, "learning_rate": 0.00019990661458863226, "loss": 0.0702, "step": 410 }, { "epoch": 0.08, "learning_rate": 0.0001999020044024555, "loss": 0.0725, "step": 420 }, { "epoch": 0.09, "learning_rate": 0.00019989728320091496, "loss": 0.065, "step": 430 }, { "epoch": 0.09, "learning_rate": 0.00019989245098925708, "loss": 0.0632, "step": 440 }, { "epoch": 0.09, "learning_rate": 0.00019988750777285162, "loss": 0.0762, "step": 450 }, { "epoch": 0.09, "learning_rate": 0.00019988245355719166, "loss": 0.0736, "step": 460 }, { "epoch": 0.09, "learning_rate": 0.0001998772883478937, "loss": 0.0662, "step": 470 }, { "epoch": 0.1, "learning_rate": 0.0001998720121506975, "loss": 0.0765, "step": 480 }, { "epoch": 0.1, "learning_rate": 0.00019986662497146622, "loss": 0.0694, "step": 490 }, { "epoch": 0.1, "learning_rate": 0.00019986112681618634, "loss": 0.0722, "step": 500 }, { "epoch": 0.1, "eval_loss": 0.075162373483181, "eval_runtime": 656.2929, "eval_samples_per_second": 11.211, "eval_steps_per_second": 2.804, "step": 500 }, { "epoch": 0.1, "learning_rate": 0.0001998555176909676, "loss": 0.073, "step": 510 }, { "epoch": 0.1, "learning_rate": 0.00019984979760204313, "loss": 0.0681, "step": 520 }, { "epoch": 0.11, "learning_rate": 0.00019984396655576932, "loss": 0.057, "step": 530 }, { "epoch": 0.11, "learning_rate": 0.0001998380245586259, "loss": 0.0689, "step": 540 }, { "epoch": 0.11, "learning_rate": 0.00019983197161721583, "loss": 0.057, "step": 550 }, { "epoch": 0.11, "learning_rate": 0.00019982580773826545, "loss": 0.0693, "step": 560 }, { "epoch": 0.11, "learning_rate": 0.00019981953292862428, "loss": 0.0702, "step": 570 }, { "epoch": 0.12, "learning_rate": 0.0001998131471952652, "loss": 0.0681, "step": 580 }, { "epoch": 0.12, "learning_rate": 0.00019980665054528425, "loss": 0.0657, "step": 590 }, { "epoch": 0.12, "learning_rate": 0.00019980004298590081, "loss": 0.0724, "step": 600 }, { "epoch": 0.12, "eval_loss": 0.0736309364438057, "eval_runtime": 657.9967, "eval_samples_per_second": 11.182, "eval_steps_per_second": 2.796, "step": 600 }, { "epoch": 0.12, "learning_rate": 0.00019979332452445752, "loss": 0.0599, "step": 610 }, { "epoch": 0.12, "learning_rate": 0.00019978649516842016, "loss": 0.0697, "step": 620 }, { "epoch": 0.13, "learning_rate": 0.00019977955492537787, "loss": 0.0715, "step": 630 }, { "epoch": 0.13, "learning_rate": 0.00019977250380304287, "loss": 0.0589, "step": 640 }, { "epoch": 0.13, "learning_rate": 0.0001997653418092507, "loss": 0.0722, "step": 650 }, { "epoch": 0.13, "learning_rate": 0.00019975806895196008, "loss": 0.0611, "step": 660 }, { "epoch": 0.13, "learning_rate": 0.0001997506852392529, "loss": 0.056, "step": 670 }, { "epoch": 0.14, "learning_rate": 0.0001997431906793343, "loss": 0.0614, "step": 680 }, { "epoch": 0.14, "learning_rate": 0.00019973558528053252, "loss": 0.0537, "step": 690 }, { "epoch": 0.14, "learning_rate": 0.000199727869051299, "loss": 0.0762, "step": 700 }, { "epoch": 0.14, "eval_loss": 0.07101555913686752, "eval_runtime": 665.8897, "eval_samples_per_second": 11.05, "eval_steps_per_second": 2.763, "step": 700 }, { "epoch": 0.14, "learning_rate": 0.00019972004200020832, "loss": 0.0634, "step": 710 }, { "epoch": 0.14, "learning_rate": 0.0001997121041359583, "loss": 0.0697, "step": 720 }, { "epoch": 0.15, "learning_rate": 0.00019970405546736978, "loss": 0.0605, "step": 730 }, { "epoch": 0.15, "learning_rate": 0.00019969589600338678, "loss": 0.0598, "step": 740 }, { "epoch": 0.15, "learning_rate": 0.00019968762575307649, "loss": 0.066, "step": 750 }, { "epoch": 0.15, "learning_rate": 0.00019967924472562914, "loss": 0.0656, "step": 760 }, { "epoch": 0.16, "learning_rate": 0.00019967075293035805, "loss": 0.0614, "step": 770 }, { "epoch": 0.16, "learning_rate": 0.0001996621503766997, "loss": 0.0633, "step": 780 }, { "epoch": 0.16, "learning_rate": 0.00019965343707421362, "loss": 0.0661, "step": 790 }, { "epoch": 0.16, "learning_rate": 0.00019964461303258242, "loss": 0.0567, "step": 800 }, { "epoch": 0.16, "eval_loss": 0.07119747996330261, "eval_runtime": 657.3478, "eval_samples_per_second": 11.193, "eval_steps_per_second": 2.799, "step": 800 }, { "epoch": 0.16, "learning_rate": 0.00019963567826161168, "loss": 0.0577, "step": 810 }, { "epoch": 0.17, "learning_rate": 0.00019962663277123016, "loss": 0.0633, "step": 820 }, { "epoch": 0.17, "learning_rate": 0.0001996174765714896, "loss": 0.0701, "step": 830 }, { "epoch": 0.17, "learning_rate": 0.0001996082096725647, "loss": 0.0715, "step": 840 }, { "epoch": 0.17, "learning_rate": 0.0001995988320847533, "loss": 0.0618, "step": 850 }, { "epoch": 0.17, "learning_rate": 0.00019958934381847612, "loss": 0.0589, "step": 860 }, { "epoch": 0.18, "learning_rate": 0.00019957974488427698, "loss": 0.0658, "step": 870 }, { "epoch": 0.18, "learning_rate": 0.00019957003529282255, "loss": 0.0549, "step": 880 }, { "epoch": 0.18, "learning_rate": 0.00019956021505490262, "loss": 0.0664, "step": 890 }, { "epoch": 0.18, "learning_rate": 0.0001995502841814298, "loss": 0.0578, "step": 900 }, { "epoch": 0.18, "eval_loss": 0.06790292263031006, "eval_runtime": 663.3404, "eval_samples_per_second": 11.092, "eval_steps_per_second": 2.774, "step": 900 }, { "epoch": 0.18, "learning_rate": 0.00019954024268343975, "loss": 0.0626, "step": 910 }, { "epoch": 0.19, "learning_rate": 0.000199530090572091, "loss": 0.0621, "step": 920 }, { "epoch": 0.19, "learning_rate": 0.00019951982785866492, "loss": 0.0527, "step": 930 }, { "epoch": 0.19, "learning_rate": 0.00019950945455456597, "loss": 0.0636, "step": 940 }, { "epoch": 0.19, "learning_rate": 0.00019949897067132142, "loss": 0.0767, "step": 950 }, { "epoch": 0.19, "learning_rate": 0.00019948837622058135, "loss": 0.0785, "step": 960 }, { "epoch": 0.2, "learning_rate": 0.00019947767121411878, "loss": 0.0698, "step": 970 }, { "epoch": 0.2, "learning_rate": 0.0001994668556638296, "loss": 0.0576, "step": 980 }, { "epoch": 0.2, "learning_rate": 0.00019945592958173247, "loss": 0.0616, "step": 990 }, { "epoch": 0.2, "learning_rate": 0.00019944489297996892, "loss": 0.0597, "step": 1000 }, { "epoch": 0.2, "eval_loss": 0.06911394000053406, "eval_runtime": 654.3952, "eval_samples_per_second": 11.244, "eval_steps_per_second": 2.812, "step": 1000 }, { "epoch": 0.2, "learning_rate": 0.00019943374587080333, "loss": 0.0674, "step": 1010 }, { "epoch": 0.21, "learning_rate": 0.0001994224882666228, "loss": 0.0528, "step": 1020 }, { "epoch": 0.21, "learning_rate": 0.0001994111201799373, "loss": 0.0602, "step": 1030 }, { "epoch": 0.21, "learning_rate": 0.00019939964162337946, "loss": 0.0568, "step": 1040 }, { "epoch": 0.21, "learning_rate": 0.00019938805260970486, "loss": 0.0571, "step": 1050 }, { "epoch": 0.21, "learning_rate": 0.0001993763531517916, "loss": 0.0598, "step": 1060 }, { "epoch": 0.22, "learning_rate": 0.00019936454326264068, "loss": 0.0636, "step": 1070 }, { "epoch": 0.22, "learning_rate": 0.00019935262295537568, "loss": 0.0602, "step": 1080 }, { "epoch": 0.22, "learning_rate": 0.00019934059224324303, "loss": 0.0763, "step": 1090 }, { "epoch": 0.22, "learning_rate": 0.00019932845113961172, "loss": 0.0538, "step": 1100 }, { "epoch": 0.22, "eval_loss": 0.06597457081079483, "eval_runtime": 653.2731, "eval_samples_per_second": 11.263, "eval_steps_per_second": 2.817, "step": 1100 }, { "epoch": 0.22, "learning_rate": 0.0001993161996579735, "loss": 0.0578, "step": 1110 }, { "epoch": 0.23, "learning_rate": 0.00019930383781194272, "loss": 0.0561, "step": 1120 }, { "epoch": 0.23, "learning_rate": 0.00019929136561525637, "loss": 0.0558, "step": 1130 }, { "epoch": 0.23, "learning_rate": 0.0001992787830817741, "loss": 0.0669, "step": 1140 }, { "epoch": 0.23, "learning_rate": 0.00019926609022547818, "loss": 0.0616, "step": 1150 }, { "epoch": 0.23, "learning_rate": 0.00019925328706047344, "loss": 0.0637, "step": 1160 }, { "epoch": 0.24, "learning_rate": 0.00019924037360098727, "loss": 0.052, "step": 1170 }, { "epoch": 0.24, "learning_rate": 0.0001992273498613697, "loss": 0.0548, "step": 1180 }, { "epoch": 0.24, "learning_rate": 0.00019921421585609325, "loss": 0.0545, "step": 1190 }, { "epoch": 0.24, "learning_rate": 0.000199200971599753, "loss": 0.0498, "step": 1200 }, { "epoch": 0.24, "eval_loss": 0.06580409407615662, "eval_runtime": 655.2658, "eval_samples_per_second": 11.229, "eval_steps_per_second": 2.808, "step": 1200 }, { "epoch": 0.24, "learning_rate": 0.00019918761710706649, "loss": 0.0537, "step": 1210 }, { "epoch": 0.25, "learning_rate": 0.00019917415239287386, "loss": 0.0579, "step": 1220 }, { "epoch": 0.25, "learning_rate": 0.00019916057747213758, "loss": 0.053, "step": 1230 }, { "epoch": 0.25, "learning_rate": 0.00019914689235994278, "loss": 0.0562, "step": 1240 }, { "epoch": 0.25, "learning_rate": 0.00019913309707149694, "loss": 0.0595, "step": 1250 }, { "epoch": 0.25, "learning_rate": 0.00019911919162212988, "loss": 0.0515, "step": 1260 }, { "epoch": 0.26, "learning_rate": 0.000199105176027294, "loss": 0.053, "step": 1270 }, { "epoch": 0.26, "learning_rate": 0.00019909105030256402, "loss": 0.065, "step": 1280 }, { "epoch": 0.26, "learning_rate": 0.000199076814463637, "loss": 0.0574, "step": 1290 }, { "epoch": 0.26, "learning_rate": 0.0001990624685263325, "loss": 0.0595, "step": 1300 }, { "epoch": 0.26, "eval_loss": 0.06419895589351654, "eval_runtime": 662.3031, "eval_samples_per_second": 11.11, "eval_steps_per_second": 2.778, "step": 1300 }, { "epoch": 0.26, "learning_rate": 0.00019904801250659223, "loss": 0.0667, "step": 1310 }, { "epoch": 0.27, "learning_rate": 0.00019903344642048043, "loss": 0.0562, "step": 1320 }, { "epoch": 0.27, "learning_rate": 0.00019901877028418347, "loss": 0.0604, "step": 1330 }, { "epoch": 0.27, "learning_rate": 0.00019900398411401015, "loss": 0.0474, "step": 1340 }, { "epoch": 0.27, "learning_rate": 0.0001989890879263915, "loss": 0.0444, "step": 1350 }, { "epoch": 0.27, "learning_rate": 0.0001989740817378808, "loss": 0.0522, "step": 1360 }, { "epoch": 0.28, "learning_rate": 0.00019895896556515352, "loss": 0.0543, "step": 1370 }, { "epoch": 0.28, "learning_rate": 0.00019894373942500743, "loss": 0.0447, "step": 1380 }, { "epoch": 0.28, "learning_rate": 0.00019892840333436245, "loss": 0.0454, "step": 1390 }, { "epoch": 0.28, "learning_rate": 0.00019891295731026075, "loss": 0.0465, "step": 1400 }, { "epoch": 0.28, "eval_loss": 0.0676579549908638, "eval_runtime": 660.0804, "eval_samples_per_second": 11.147, "eval_steps_per_second": 2.788, "step": 1400 }, { "epoch": 0.28, "learning_rate": 0.0001988974013698665, "loss": 0.0533, "step": 1410 }, { "epoch": 0.29, "learning_rate": 0.00019888173553046624, "loss": 0.0522, "step": 1420 }, { "epoch": 0.29, "learning_rate": 0.00019886595980946842, "loss": 0.0592, "step": 1430 }, { "epoch": 0.29, "learning_rate": 0.00019885007422440379, "loss": 0.0584, "step": 1440 }, { "epoch": 0.29, "learning_rate": 0.00019883407879292498, "loss": 0.054, "step": 1450 }, { "epoch": 0.29, "learning_rate": 0.00019881797353280695, "loss": 0.0442, "step": 1460 }, { "epoch": 0.3, "learning_rate": 0.00019880175846194638, "loss": 0.0659, "step": 1470 }, { "epoch": 0.3, "learning_rate": 0.00019878543359836223, "loss": 0.0483, "step": 1480 }, { "epoch": 0.3, "learning_rate": 0.0001987689989601954, "loss": 0.0525, "step": 1490 }, { "epoch": 0.3, "learning_rate": 0.00019875245456570873, "loss": 0.0533, "step": 1500 }, { "epoch": 0.3, "eval_loss": 0.06512827426195145, "eval_runtime": 659.6413, "eval_samples_per_second": 11.155, "eval_steps_per_second": 2.789, "step": 1500 }, { "epoch": 0.3, "learning_rate": 0.00019873580043328705, "loss": 0.0438, "step": 1510 }, { "epoch": 0.31, "learning_rate": 0.00019871903658143719, "loss": 0.0542, "step": 1520 }, { "epoch": 0.31, "learning_rate": 0.0001987021630287878, "loss": 0.0555, "step": 1530 }, { "epoch": 0.31, "learning_rate": 0.00019868517979408954, "loss": 0.0561, "step": 1540 }, { "epoch": 0.31, "learning_rate": 0.00019866808689621482, "loss": 0.0517, "step": 1550 }, { "epoch": 0.31, "learning_rate": 0.0001986508843541581, "loss": 0.051, "step": 1560 }, { "epoch": 0.32, "learning_rate": 0.0001986335721870355, "loss": 0.0525, "step": 1570 }, { "epoch": 0.32, "learning_rate": 0.0001986161504140851, "loss": 0.0443, "step": 1580 }, { "epoch": 0.32, "learning_rate": 0.00019859861905466668, "loss": 0.0542, "step": 1590 }, { "epoch": 0.32, "learning_rate": 0.00019858097812826183, "loss": 0.0593, "step": 1600 }, { "epoch": 0.32, "eval_loss": 0.06405352801084518, "eval_runtime": 658.5179, "eval_samples_per_second": 11.174, "eval_steps_per_second": 2.794, "step": 1600 }, { "epoch": 0.32, "learning_rate": 0.0001985632276544739, "loss": 0.0527, "step": 1610 }, { "epoch": 0.33, "learning_rate": 0.000198545367653028, "loss": 0.0422, "step": 1620 }, { "epoch": 0.33, "learning_rate": 0.00019852739814377087, "loss": 0.0533, "step": 1630 }, { "epoch": 0.33, "learning_rate": 0.00019850931914667107, "loss": 0.0507, "step": 1640 }, { "epoch": 0.33, "learning_rate": 0.00019849113068181869, "loss": 0.0581, "step": 1650 }, { "epoch": 0.33, "learning_rate": 0.00019847283276942554, "loss": 0.0583, "step": 1660 }, { "epoch": 0.34, "learning_rate": 0.00019845442542982506, "loss": 0.0562, "step": 1670 }, { "epoch": 0.34, "learning_rate": 0.00019843590868347225, "loss": 0.0499, "step": 1680 }, { "epoch": 0.34, "learning_rate": 0.00019841728255094374, "loss": 0.0543, "step": 1690 }, { "epoch": 0.34, "learning_rate": 0.00019839854705293764, "loss": 0.055, "step": 1700 }, { "epoch": 0.34, "eval_loss": 0.0653354823589325, "eval_runtime": 674.0253, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.73, "step": 1700 }, { "epoch": 0.34, "learning_rate": 0.00019837970221027365, "loss": 0.0422, "step": 1710 }, { "epoch": 0.35, "learning_rate": 0.00019836074804389296, "loss": 0.0586, "step": 1720 }, { "epoch": 0.35, "learning_rate": 0.00019834168457485824, "loss": 0.0408, "step": 1730 }, { "epoch": 0.35, "learning_rate": 0.00019832251182435367, "loss": 0.0587, "step": 1740 }, { "epoch": 0.35, "learning_rate": 0.00019830322981368478, "loss": 0.0498, "step": 1750 }, { "epoch": 0.35, "learning_rate": 0.0001982838385642786, "loss": 0.0475, "step": 1760 }, { "epoch": 0.36, "learning_rate": 0.00019826433809768345, "loss": 0.0571, "step": 1770 }, { "epoch": 0.36, "learning_rate": 0.00019824472843556914, "loss": 0.0512, "step": 1780 }, { "epoch": 0.36, "learning_rate": 0.00019822500959972673, "loss": 0.0477, "step": 1790 }, { "epoch": 0.36, "learning_rate": 0.00019820518161206864, "loss": 0.0546, "step": 1800 }, { "epoch": 0.36, "eval_loss": 0.06339588016271591, "eval_runtime": 1055.2159, "eval_samples_per_second": 6.973, "eval_steps_per_second": 1.744, "step": 1800 }, { "epoch": 0.36, "learning_rate": 0.00019818524449462862, "loss": 0.0463, "step": 1810 }, { "epoch": 0.37, "learning_rate": 0.00019816519826956156, "loss": 0.0571, "step": 1820 }, { "epoch": 0.37, "learning_rate": 0.00019814504295914372, "loss": 0.0516, "step": 1830 }, { "epoch": 0.37, "learning_rate": 0.00019812477858577258, "loss": 0.0511, "step": 1840 }, { "epoch": 0.37, "learning_rate": 0.00019810440517196672, "loss": 0.0471, "step": 1850 }, { "epoch": 0.37, "learning_rate": 0.00019808392274036596, "loss": 0.0497, "step": 1860 }, { "epoch": 0.38, "learning_rate": 0.00019806333131373126, "loss": 0.0608, "step": 1870 }, { "epoch": 0.38, "learning_rate": 0.00019804263091494466, "loss": 0.0431, "step": 1880 }, { "epoch": 0.38, "learning_rate": 0.00019802182156700938, "loss": 0.0558, "step": 1890 }, { "epoch": 0.38, "learning_rate": 0.00019800090329304956, "loss": 0.0524, "step": 1900 }, { "epoch": 0.38, "eval_loss": 0.06149962544441223, "eval_runtime": 1053.8269, "eval_samples_per_second": 6.982, "eval_steps_per_second": 1.746, "step": 1900 }, { "epoch": 0.38, "learning_rate": 0.00019797987611631059, "loss": 0.0528, "step": 1910 }, { "epoch": 0.39, "learning_rate": 0.00019795874006015872, "loss": 0.0481, "step": 1920 }, { "epoch": 0.39, "learning_rate": 0.0001979374951480812, "loss": 0.0491, "step": 1930 }, { "epoch": 0.39, "learning_rate": 0.00019791614140368633, "loss": 0.049, "step": 1940 }, { "epoch": 0.39, "learning_rate": 0.00019789467885070327, "loss": 0.0444, "step": 1950 }, { "epoch": 0.39, "learning_rate": 0.00019787310751298213, "loss": 0.0466, "step": 1960 }, { "epoch": 0.4, "learning_rate": 0.00019785142741449393, "loss": 0.0498, "step": 1970 }, { "epoch": 0.4, "learning_rate": 0.00019782963857933048, "loss": 0.0475, "step": 1980 }, { "epoch": 0.4, "learning_rate": 0.00019780774103170446, "loss": 0.047, "step": 1990 }, { "epoch": 0.4, "learning_rate": 0.00019778573479594942, "loss": 0.0432, "step": 2000 }, { "epoch": 0.4, "eval_loss": 0.06317023187875748, "eval_runtime": 1054.8968, "eval_samples_per_second": 6.975, "eval_steps_per_second": 1.744, "step": 2000 }, { "epoch": 0.4, "learning_rate": 0.00019776361989651956, "loss": 0.0546, "step": 2010 }, { "epoch": 0.41, "learning_rate": 0.0001977413963579899, "loss": 0.043, "step": 2020 }, { "epoch": 0.41, "learning_rate": 0.00019771906420505624, "loss": 0.0579, "step": 2030 }, { "epoch": 0.41, "learning_rate": 0.00019769662346253493, "loss": 0.057, "step": 2040 }, { "epoch": 0.41, "learning_rate": 0.00019767407415536317, "loss": 0.0476, "step": 2050 }, { "epoch": 0.41, "learning_rate": 0.00019765141630859865, "loss": 0.0418, "step": 2060 }, { "epoch": 0.42, "learning_rate": 0.00019762864994741976, "loss": 0.0497, "step": 2070 }, { "epoch": 0.42, "learning_rate": 0.00019760577509712546, "loss": 0.0493, "step": 2080 }, { "epoch": 0.42, "learning_rate": 0.00019758279178313525, "loss": 0.0512, "step": 2090 }, { "epoch": 0.42, "learning_rate": 0.00019755970003098916, "loss": 0.0631, "step": 2100 }, { "epoch": 0.42, "eval_loss": 0.06188439950346947, "eval_runtime": 1062.072, "eval_samples_per_second": 6.928, "eval_steps_per_second": 1.732, "step": 2100 }, { "epoch": 0.42, "learning_rate": 0.00019753649986634772, "loss": 0.0543, "step": 2110 }, { "epoch": 0.43, "learning_rate": 0.00019751319131499194, "loss": 0.0405, "step": 2120 }, { "epoch": 0.43, "learning_rate": 0.00019748977440282333, "loss": 0.0564, "step": 2130 }, { "epoch": 0.43, "learning_rate": 0.0001974662491558637, "loss": 0.0396, "step": 2140 }, { "epoch": 0.43, "learning_rate": 0.00019744261560025533, "loss": 0.039, "step": 2150 }, { "epoch": 0.43, "learning_rate": 0.00019741887376226083, "loss": 0.0348, "step": 2160 }, { "epoch": 0.44, "learning_rate": 0.00019739502366826313, "loss": 0.0533, "step": 2170 }, { "epoch": 0.44, "learning_rate": 0.0001973710653447655, "loss": 0.0417, "step": 2180 }, { "epoch": 0.44, "learning_rate": 0.0001973469988183914, "loss": 0.0502, "step": 2190 }, { "epoch": 0.44, "learning_rate": 0.00019732282411588463, "loss": 0.0519, "step": 2200 }, { "epoch": 0.44, "eval_loss": 0.05978045240044594, "eval_runtime": 1059.6808, "eval_samples_per_second": 6.944, "eval_steps_per_second": 1.736, "step": 2200 }, { "epoch": 0.44, "learning_rate": 0.00019729854126410913, "loss": 0.0416, "step": 2210 }, { "epoch": 0.45, "learning_rate": 0.00019727415029004906, "loss": 0.0586, "step": 2220 }, { "epoch": 0.45, "learning_rate": 0.00019724965122080868, "loss": 0.0535, "step": 2230 }, { "epoch": 0.45, "learning_rate": 0.0001972250440836124, "loss": 0.0499, "step": 2240 }, { "epoch": 0.45, "learning_rate": 0.00019720032890580474, "loss": 0.0484, "step": 2250 }, { "epoch": 0.46, "learning_rate": 0.00019717550571485024, "loss": 0.044, "step": 2260 }, { "epoch": 0.46, "learning_rate": 0.0001971505745383335, "loss": 0.0496, "step": 2270 }, { "epoch": 0.46, "learning_rate": 0.00019712553540395908, "loss": 0.05, "step": 2280 }, { "epoch": 0.46, "learning_rate": 0.0001971003883395516, "loss": 0.0387, "step": 2290 }, { "epoch": 0.46, "learning_rate": 0.00019707513337305547, "loss": 0.0397, "step": 2300 }, { "epoch": 0.46, "eval_loss": 0.060694798827171326, "eval_runtime": 1049.503, "eval_samples_per_second": 7.011, "eval_steps_per_second": 1.753, "step": 2300 }, { "epoch": 0.47, "learning_rate": 0.0001970497705325351, "loss": 0.0475, "step": 2310 }, { "epoch": 0.47, "learning_rate": 0.00019702429984617484, "loss": 0.0405, "step": 2320 }, { "epoch": 0.47, "learning_rate": 0.00019699872134227867, "loss": 0.0447, "step": 2330 }, { "epoch": 0.47, "learning_rate": 0.00019697303504927061, "loss": 0.0608, "step": 2340 }, { "epoch": 0.47, "learning_rate": 0.00019694724099569434, "loss": 0.0478, "step": 2350 }, { "epoch": 0.48, "learning_rate": 0.00019692133921021332, "loss": 0.0432, "step": 2360 }, { "epoch": 0.48, "learning_rate": 0.00019689532972161068, "loss": 0.0492, "step": 2370 }, { "epoch": 0.48, "learning_rate": 0.00019686921255878932, "loss": 0.0441, "step": 2380 }, { "epoch": 0.48, "learning_rate": 0.0001968429877507717, "loss": 0.0603, "step": 2390 }, { "epoch": 0.48, "learning_rate": 0.00019681665532669996, "loss": 0.0467, "step": 2400 }, { "epoch": 0.48, "eval_loss": 0.06163698434829712, "eval_runtime": 1049.1311, "eval_samples_per_second": 7.013, "eval_steps_per_second": 1.754, "step": 2400 }, { "epoch": 0.49, "learning_rate": 0.00019679021531583584, "loss": 0.0406, "step": 2410 }, { "epoch": 0.49, "learning_rate": 0.00019676366774756056, "loss": 0.0484, "step": 2420 }, { "epoch": 0.49, "learning_rate": 0.00019673701265137495, "loss": 0.0402, "step": 2430 }, { "epoch": 0.49, "learning_rate": 0.00019671025005689926, "loss": 0.0541, "step": 2440 }, { "epoch": 0.49, "learning_rate": 0.00019668337999387324, "loss": 0.0438, "step": 2450 }, { "epoch": 0.5, "learning_rate": 0.00019665640249215605, "loss": 0.045, "step": 2460 }, { "epoch": 0.5, "learning_rate": 0.0001966293175817262, "loss": 0.0456, "step": 2470 }, { "epoch": 0.5, "learning_rate": 0.00019660212529268168, "loss": 0.0454, "step": 2480 }, { "epoch": 0.5, "learning_rate": 0.00019657482565523963, "loss": 0.0464, "step": 2490 }, { "epoch": 0.5, "learning_rate": 0.00019654741869973663, "loss": 0.049, "step": 2500 }, { "epoch": 0.5, "eval_loss": 0.06358644366264343, "eval_runtime": 1050.5846, "eval_samples_per_second": 7.004, "eval_steps_per_second": 1.751, "step": 2500 }, { "epoch": 0.51, "learning_rate": 0.00019651990445662841, "loss": 0.053, "step": 2510 }, { "epoch": 0.51, "learning_rate": 0.00019649228295649004, "loss": 0.05, "step": 2520 }, { "epoch": 0.51, "learning_rate": 0.00019646455423001565, "loss": 0.0423, "step": 2530 }, { "epoch": 0.51, "learning_rate": 0.0001964367183080186, "loss": 0.0556, "step": 2540 }, { "epoch": 0.51, "learning_rate": 0.00019640877522143134, "loss": 0.0431, "step": 2550 }, { "epoch": 0.52, "learning_rate": 0.0001963807250013054, "loss": 0.0377, "step": 2560 }, { "epoch": 0.52, "learning_rate": 0.00019635256767881144, "loss": 0.0432, "step": 2570 }, { "epoch": 0.52, "learning_rate": 0.00019632430328523902, "loss": 0.0562, "step": 2580 }, { "epoch": 0.52, "learning_rate": 0.0001962959318519968, "loss": 0.04, "step": 2590 }, { "epoch": 0.52, "learning_rate": 0.00019626745341061225, "loss": 0.0488, "step": 2600 }, { "epoch": 0.52, "eval_loss": 0.06040577217936516, "eval_runtime": 1049.7742, "eval_samples_per_second": 7.009, "eval_steps_per_second": 1.753, "step": 2600 }, { "epoch": 0.53, "learning_rate": 0.0001962388679927319, "loss": 0.0371, "step": 2610 }, { "epoch": 0.53, "learning_rate": 0.000196210175630121, "loss": 0.0412, "step": 2620 }, { "epoch": 0.53, "learning_rate": 0.00019618137635466382, "loss": 0.0432, "step": 2630 }, { "epoch": 0.53, "learning_rate": 0.00019615247019836327, "loss": 0.0501, "step": 2640 }, { "epoch": 0.53, "learning_rate": 0.00019612345719334116, "loss": 0.0422, "step": 2650 }, { "epoch": 0.54, "learning_rate": 0.00019609433737183791, "loss": 0.0511, "step": 2660 }, { "epoch": 0.54, "learning_rate": 0.00019606511076621276, "loss": 0.0372, "step": 2670 }, { "epoch": 0.54, "learning_rate": 0.00019603577740894354, "loss": 0.0541, "step": 2680 }, { "epoch": 0.54, "learning_rate": 0.0001960063373326267, "loss": 0.0472, "step": 2690 }, { "epoch": 0.54, "learning_rate": 0.00019597679056997737, "loss": 0.0449, "step": 2700 }, { "epoch": 0.54, "eval_loss": 0.05977201834321022, "eval_runtime": 1052.2024, "eval_samples_per_second": 6.993, "eval_steps_per_second": 1.749, "step": 2700 }, { "epoch": 0.55, "learning_rate": 0.00019594713715382915, "loss": 0.0517, "step": 2710 }, { "epoch": 0.55, "learning_rate": 0.00019591737711713414, "loss": 0.0476, "step": 2720 }, { "epoch": 0.55, "learning_rate": 0.00019588751049296298, "loss": 0.048, "step": 2730 }, { "epoch": 0.55, "learning_rate": 0.00019585753731450478, "loss": 0.0342, "step": 2740 }, { "epoch": 0.55, "learning_rate": 0.00019582745761506697, "loss": 0.0508, "step": 2750 }, { "epoch": 0.56, "learning_rate": 0.00019579727142807535, "loss": 0.0407, "step": 2760 }, { "epoch": 0.56, "learning_rate": 0.0001957669787870742, "loss": 0.0358, "step": 2770 }, { "epoch": 0.56, "learning_rate": 0.00019573657972572593, "loss": 0.0506, "step": 2780 }, { "epoch": 0.56, "learning_rate": 0.00019570607427781128, "loss": 0.0476, "step": 2790 }, { "epoch": 0.56, "learning_rate": 0.0001956754624772292, "loss": 0.0438, "step": 2800 }, { "epoch": 0.56, "eval_loss": 0.061029911041259766, "eval_runtime": 1062.1739, "eval_samples_per_second": 6.927, "eval_steps_per_second": 1.732, "step": 2800 }, { "epoch": 0.57, "learning_rate": 0.0001956447443579968, "loss": 0.0481, "step": 2810 }, { "epoch": 0.57, "learning_rate": 0.00019561391995424941, "loss": 0.0515, "step": 2820 }, { "epoch": 0.57, "learning_rate": 0.00019558298930024044, "loss": 0.0491, "step": 2830 }, { "epoch": 0.57, "learning_rate": 0.0001955519524303413, "loss": 0.0367, "step": 2840 }, { "epoch": 0.57, "learning_rate": 0.0001955208093790415, "loss": 0.0431, "step": 2850 }, { "epoch": 0.58, "learning_rate": 0.0001954895601809485, "loss": 0.0436, "step": 2860 }, { "epoch": 0.58, "learning_rate": 0.0001954582048707878, "loss": 0.0442, "step": 2870 }, { "epoch": 0.58, "learning_rate": 0.00019542674348340267, "loss": 0.0532, "step": 2880 }, { "epoch": 0.58, "learning_rate": 0.00019539517605375446, "loss": 0.0441, "step": 2890 }, { "epoch": 0.58, "learning_rate": 0.00019536350261692214, "loss": 0.036, "step": 2900 }, { "epoch": 0.58, "eval_loss": 0.06326338648796082, "eval_runtime": 1061.8457, "eval_samples_per_second": 6.929, "eval_steps_per_second": 1.733, "step": 2900 }, { "epoch": 0.59, "learning_rate": 0.00019533172320810265, "loss": 0.0486, "step": 2910 }, { "epoch": 0.59, "learning_rate": 0.00019529983786261058, "loss": 0.0568, "step": 2920 }, { "epoch": 0.59, "learning_rate": 0.00019526784661587829, "loss": 0.0434, "step": 2930 }, { "epoch": 0.59, "learning_rate": 0.0001952357495034558, "loss": 0.0541, "step": 2940 }, { "epoch": 0.59, "learning_rate": 0.00019520354656101085, "loss": 0.0395, "step": 2950 }, { "epoch": 0.6, "learning_rate": 0.00019517123782432868, "loss": 0.0458, "step": 2960 }, { "epoch": 0.6, "learning_rate": 0.00019513882332931212, "loss": 0.0395, "step": 2970 }, { "epoch": 0.6, "learning_rate": 0.00019510630311198157, "loss": 0.0442, "step": 2980 }, { "epoch": 0.6, "learning_rate": 0.00019507367720847488, "loss": 0.041, "step": 2990 }, { "epoch": 0.6, "learning_rate": 0.00019504094565504733, "loss": 0.0464, "step": 3000 }, { "epoch": 0.6, "eval_loss": 0.06028781458735466, "eval_runtime": 1035.9718, "eval_samples_per_second": 7.103, "eval_steps_per_second": 1.776, "step": 3000 }, { "epoch": 0.61, "learning_rate": 0.00019500810848807162, "loss": 0.0396, "step": 3010 }, { "epoch": 0.61, "learning_rate": 0.00019497516574403778, "loss": 0.0491, "step": 3020 }, { "epoch": 0.61, "learning_rate": 0.00019494211745955324, "loss": 0.0585, "step": 3030 }, { "epoch": 0.61, "learning_rate": 0.00019490896367134266, "loss": 0.0484, "step": 3040 }, { "epoch": 0.61, "learning_rate": 0.00019487570441624791, "loss": 0.0507, "step": 3050 }, { "epoch": 0.62, "learning_rate": 0.0001948423397312281, "loss": 0.0467, "step": 3060 }, { "epoch": 0.62, "learning_rate": 0.0001948088696533595, "loss": 0.0444, "step": 3070 }, { "epoch": 0.62, "learning_rate": 0.00019477529421983546, "loss": 0.0498, "step": 3080 }, { "epoch": 0.62, "learning_rate": 0.0001947416134679665, "loss": 0.0373, "step": 3090 }, { "epoch": 0.62, "learning_rate": 0.00019470782743518002, "loss": 0.0437, "step": 3100 }, { "epoch": 0.62, "eval_loss": 0.06118550896644592, "eval_runtime": 1055.3909, "eval_samples_per_second": 6.972, "eval_steps_per_second": 1.743, "step": 3100 }, { "epoch": 0.63, "learning_rate": 0.00019467393615902055, "loss": 0.0391, "step": 3110 }, { "epoch": 0.63, "learning_rate": 0.0001946399396771495, "loss": 0.0456, "step": 3120 }, { "epoch": 0.63, "learning_rate": 0.00019460583802734523, "loss": 0.0368, "step": 3130 }, { "epoch": 0.63, "learning_rate": 0.0001945716312475029, "loss": 0.0451, "step": 3140 }, { "epoch": 0.63, "learning_rate": 0.0001945373193756346, "loss": 0.046, "step": 3150 }, { "epoch": 0.64, "learning_rate": 0.00019450290244986914, "loss": 0.0364, "step": 3160 }, { "epoch": 0.64, "learning_rate": 0.00019446838050845205, "loss": 0.0576, "step": 3170 }, { "epoch": 0.64, "learning_rate": 0.00019443375358974555, "loss": 0.0421, "step": 3180 }, { "epoch": 0.64, "learning_rate": 0.00019439902173222859, "loss": 0.0528, "step": 3190 }, { "epoch": 0.64, "learning_rate": 0.0001943641849744967, "loss": 0.0389, "step": 3200 }, { "epoch": 0.64, "eval_loss": 0.06049993634223938, "eval_runtime": 1052.016, "eval_samples_per_second": 6.994, "eval_steps_per_second": 1.749, "step": 3200 }, { "epoch": 0.65, "learning_rate": 0.00019432924335526194, "loss": 0.0432, "step": 3210 }, { "epoch": 0.65, "learning_rate": 0.00019429419691335297, "loss": 0.0339, "step": 3220 }, { "epoch": 0.65, "learning_rate": 0.00019425904568771483, "loss": 0.0434, "step": 3230 }, { "epoch": 0.65, "learning_rate": 0.00019422378971740907, "loss": 0.0472, "step": 3240 }, { "epoch": 0.65, "learning_rate": 0.00019418842904161368, "loss": 0.0394, "step": 3250 }, { "epoch": 0.66, "learning_rate": 0.00019415296369962288, "loss": 0.0404, "step": 3260 }, { "epoch": 0.66, "learning_rate": 0.00019411739373084732, "loss": 0.0352, "step": 3270 }, { "epoch": 0.66, "learning_rate": 0.00019408171917481386, "loss": 0.0364, "step": 3280 }, { "epoch": 0.66, "learning_rate": 0.00019404594007116555, "loss": 0.0433, "step": 3290 }, { "epoch": 0.66, "learning_rate": 0.00019401005645966167, "loss": 0.0377, "step": 3300 }, { "epoch": 0.66, "eval_loss": 0.06322025507688522, "eval_runtime": 1050.1226, "eval_samples_per_second": 7.007, "eval_steps_per_second": 1.752, "step": 3300 }, { "epoch": 0.67, "learning_rate": 0.00019397406838017766, "loss": 0.042, "step": 3310 }, { "epoch": 0.67, "learning_rate": 0.00019393797587270497, "loss": 0.0326, "step": 3320 }, { "epoch": 0.67, "learning_rate": 0.00019390177897735114, "loss": 0.0443, "step": 3330 }, { "epoch": 0.67, "learning_rate": 0.00019386547773433965, "loss": 0.0545, "step": 3340 }, { "epoch": 0.67, "learning_rate": 0.00019382907218401006, "loss": 0.0422, "step": 3350 }, { "epoch": 0.68, "learning_rate": 0.00019379256236681775, "loss": 0.0393, "step": 3360 }, { "epoch": 0.68, "learning_rate": 0.000193755948323334, "loss": 0.0516, "step": 3370 }, { "epoch": 0.68, "learning_rate": 0.00019371923009424587, "loss": 0.0499, "step": 3380 }, { "epoch": 0.68, "learning_rate": 0.0001936824077203562, "loss": 0.0491, "step": 3390 }, { "epoch": 0.68, "learning_rate": 0.0001936454812425836, "loss": 0.0382, "step": 3400 }, { "epoch": 0.68, "eval_loss": 0.06261178106069565, "eval_runtime": 1054.0723, "eval_samples_per_second": 6.981, "eval_steps_per_second": 1.746, "step": 3400 }, { "epoch": 0.69, "learning_rate": 0.00019360845070196236, "loss": 0.0431, "step": 3410 }, { "epoch": 0.69, "learning_rate": 0.0001935713161396424, "loss": 0.0498, "step": 3420 }, { "epoch": 0.69, "learning_rate": 0.0001935340775968892, "loss": 0.0457, "step": 3430 }, { "epoch": 0.69, "learning_rate": 0.00019349673511508383, "loss": 0.0362, "step": 3440 }, { "epoch": 0.69, "learning_rate": 0.00019345928873572282, "loss": 0.0432, "step": 3450 }, { "epoch": 0.7, "learning_rate": 0.00019342173850041822, "loss": 0.0365, "step": 3460 }, { "epoch": 0.7, "learning_rate": 0.00019338408445089745, "loss": 0.0358, "step": 3470 }, { "epoch": 0.7, "learning_rate": 0.0001933463266290033, "loss": 0.039, "step": 3480 }, { "epoch": 0.7, "learning_rate": 0.00019330846507669382, "loss": 0.0412, "step": 3490 }, { "epoch": 0.7, "learning_rate": 0.00019327049983604245, "loss": 0.051, "step": 3500 }, { "epoch": 0.7, "eval_loss": 0.061244383454322815, "eval_runtime": 1045.5529, "eval_samples_per_second": 7.037, "eval_steps_per_second": 1.76, "step": 3500 }, { "epoch": 0.71, "learning_rate": 0.00019323243094923772, "loss": 0.0387, "step": 3510 }, { "epoch": 0.71, "learning_rate": 0.00019319425845858341, "loss": 0.0473, "step": 3520 }, { "epoch": 0.71, "learning_rate": 0.00019315598240649847, "loss": 0.037, "step": 3530 }, { "epoch": 0.71, "learning_rate": 0.0001931176028355168, "loss": 0.0435, "step": 3540 }, { "epoch": 0.71, "learning_rate": 0.00019307911978828747, "loss": 0.0364, "step": 3550 }, { "epoch": 0.72, "learning_rate": 0.0001930405333075745, "loss": 0.0424, "step": 3560 }, { "epoch": 0.72, "learning_rate": 0.00019300184343625678, "loss": 0.0348, "step": 3570 }, { "epoch": 0.72, "learning_rate": 0.00019296305021732817, "loss": 0.0395, "step": 3580 }, { "epoch": 0.72, "learning_rate": 0.00019292415369389734, "loss": 0.0486, "step": 3590 }, { "epoch": 0.72, "learning_rate": 0.00019288515390918776, "loss": 0.047, "step": 3600 }, { "epoch": 0.72, "eval_loss": 0.06326541304588318, "eval_runtime": 1046.8433, "eval_samples_per_second": 7.029, "eval_steps_per_second": 1.758, "step": 3600 }, { "epoch": 0.73, "learning_rate": 0.00019284605090653766, "loss": 0.0403, "step": 3610 }, { "epoch": 0.73, "learning_rate": 0.00019280684472939994, "loss": 0.0381, "step": 3620 }, { "epoch": 0.73, "learning_rate": 0.00019276753542134224, "loss": 0.0332, "step": 3630 }, { "epoch": 0.73, "learning_rate": 0.00019272812302604665, "loss": 0.0435, "step": 3640 }, { "epoch": 0.73, "learning_rate": 0.00019268860758730997, "loss": 0.0512, "step": 3650 }, { "epoch": 0.74, "learning_rate": 0.00019264898914904342, "loss": 0.0403, "step": 3660 }, { "epoch": 0.74, "learning_rate": 0.00019260926775527265, "loss": 0.0336, "step": 3670 }, { "epoch": 0.74, "learning_rate": 0.00019256944345013785, "loss": 0.0425, "step": 3680 }, { "epoch": 0.74, "learning_rate": 0.00019252951627789344, "loss": 0.0431, "step": 3690 }, { "epoch": 0.74, "learning_rate": 0.00019248948628290818, "loss": 0.0451, "step": 3700 }, { "epoch": 0.74, "eval_loss": 0.06083804368972778, "eval_runtime": 1049.5583, "eval_samples_per_second": 7.011, "eval_steps_per_second": 1.753, "step": 3700 }, { "epoch": 0.75, "learning_rate": 0.00019244935350966514, "loss": 0.032, "step": 3710 }, { "epoch": 0.75, "learning_rate": 0.00019240911800276153, "loss": 0.0378, "step": 3720 }, { "epoch": 0.75, "learning_rate": 0.0001923687798069088, "loss": 0.044, "step": 3730 }, { "epoch": 0.75, "learning_rate": 0.00019232833896693242, "loss": 0.0363, "step": 3740 }, { "epoch": 0.75, "learning_rate": 0.00019228779552777202, "loss": 0.0412, "step": 3750 }, { "epoch": 0.76, "learning_rate": 0.0001922471495344812, "loss": 0.0376, "step": 3760 }, { "epoch": 0.76, "learning_rate": 0.0001922064010322275, "loss": 0.0451, "step": 3770 }, { "epoch": 0.76, "learning_rate": 0.00019216555006629237, "loss": 0.0443, "step": 3780 }, { "epoch": 0.76, "learning_rate": 0.0001921245966820712, "loss": 0.0394, "step": 3790 }, { "epoch": 0.77, "learning_rate": 0.00019208354092507305, "loss": 0.0472, "step": 3800 }, { "epoch": 0.77, "eval_loss": 0.06302132457494736, "eval_runtime": 1043.9496, "eval_samples_per_second": 7.048, "eval_steps_per_second": 1.763, "step": 3800 }, { "epoch": 0.77, "learning_rate": 0.00019204238284092093, "loss": 0.0418, "step": 3810 }, { "epoch": 0.77, "learning_rate": 0.00019200112247535141, "loss": 0.0416, "step": 3820 }, { "epoch": 0.77, "learning_rate": 0.00019195975987421472, "loss": 0.0418, "step": 3830 }, { "epoch": 0.77, "learning_rate": 0.00019191829508347481, "loss": 0.0394, "step": 3840 }, { "epoch": 0.78, "learning_rate": 0.00019187672814920912, "loss": 0.0405, "step": 3850 }, { "epoch": 0.78, "learning_rate": 0.00019183505911760855, "loss": 0.0306, "step": 3860 }, { "epoch": 0.78, "learning_rate": 0.00019179328803497754, "loss": 0.0443, "step": 3870 }, { "epoch": 0.78, "learning_rate": 0.0001917514149477339, "loss": 0.035, "step": 3880 }, { "epoch": 0.78, "learning_rate": 0.00019170943990240877, "loss": 0.0371, "step": 3890 }, { "epoch": 0.79, "learning_rate": 0.0001916673629456466, "loss": 0.0394, "step": 3900 }, { "epoch": 0.79, "eval_loss": 0.06118810176849365, "eval_runtime": 1046.2783, "eval_samples_per_second": 7.033, "eval_steps_per_second": 1.759, "step": 3900 }, { "epoch": 0.79, "learning_rate": 0.00019162518412420512, "loss": 0.0337, "step": 3910 }, { "epoch": 0.79, "learning_rate": 0.00019158290348495524, "loss": 0.0433, "step": 3920 }, { "epoch": 0.79, "learning_rate": 0.000191540521074881, "loss": 0.0402, "step": 3930 }, { "epoch": 0.79, "learning_rate": 0.0001914980369410795, "loss": 0.0326, "step": 3940 }, { "epoch": 0.8, "learning_rate": 0.00019145545113076096, "loss": 0.0501, "step": 3950 }, { "epoch": 0.8, "learning_rate": 0.00019141276369124855, "loss": 0.0448, "step": 3960 }, { "epoch": 0.8, "learning_rate": 0.00019136997466997837, "loss": 0.0403, "step": 3970 }, { "epoch": 0.8, "learning_rate": 0.00019132708411449936, "loss": 0.0427, "step": 3980 }, { "epoch": 0.8, "learning_rate": 0.0001912840920724734, "loss": 0.0322, "step": 3990 }, { "epoch": 0.81, "learning_rate": 0.00019124099859167503, "loss": 0.0471, "step": 4000 }, { "epoch": 0.81, "eval_loss": 0.06139204651117325, "eval_runtime": 1057.3652, "eval_samples_per_second": 6.959, "eval_steps_per_second": 1.74, "step": 4000 }, { "epoch": 0.81, "learning_rate": 0.00019119780371999162, "loss": 0.0306, "step": 4010 }, { "epoch": 0.81, "learning_rate": 0.00019115450750542304, "loss": 0.0352, "step": 4020 }, { "epoch": 0.81, "learning_rate": 0.000191111109996082, "loss": 0.0351, "step": 4030 }, { "epoch": 0.81, "learning_rate": 0.00019106761124019364, "loss": 0.0295, "step": 4040 }, { "epoch": 0.82, "learning_rate": 0.00019102401128609557, "loss": 0.0379, "step": 4050 }, { "epoch": 0.82, "learning_rate": 0.00019098031018223796, "loss": 0.048, "step": 4060 }, { "epoch": 0.82, "learning_rate": 0.00019093650797718338, "loss": 0.0454, "step": 4070 }, { "epoch": 0.82, "learning_rate": 0.00019089260471960663, "loss": 0.0463, "step": 4080 }, { "epoch": 0.82, "learning_rate": 0.0001908486004582949, "loss": 0.0447, "step": 4090 }, { "epoch": 0.83, "learning_rate": 0.00019080449524214762, "loss": 0.031, "step": 4100 }, { "epoch": 0.83, "eval_loss": 0.06018984317779541, "eval_runtime": 1049.9509, "eval_samples_per_second": 7.008, "eval_steps_per_second": 1.752, "step": 4100 }, { "epoch": 0.83, "learning_rate": 0.00019076028912017642, "loss": 0.0292, "step": 4110 }, { "epoch": 0.83, "learning_rate": 0.00019071598214150494, "loss": 0.0252, "step": 4120 }, { "epoch": 0.83, "learning_rate": 0.00019067157435536904, "loss": 0.0382, "step": 4130 }, { "epoch": 0.83, "learning_rate": 0.00019062706581111653, "loss": 0.0303, "step": 4140 }, { "epoch": 0.84, "learning_rate": 0.0001905824565582072, "loss": 0.0419, "step": 4150 }, { "epoch": 0.84, "learning_rate": 0.00019053774664621272, "loss": 0.0358, "step": 4160 }, { "epoch": 0.84, "learning_rate": 0.00019049293612481675, "loss": 0.037, "step": 4170 }, { "epoch": 0.84, "learning_rate": 0.00019044802504381453, "loss": 0.0306, "step": 4180 }, { "epoch": 0.84, "learning_rate": 0.00019040301345311326, "loss": 0.0431, "step": 4190 }, { "epoch": 0.85, "learning_rate": 0.0001903579014027317, "loss": 0.0405, "step": 4200 }, { "epoch": 0.85, "eval_loss": 0.05838855355978012, "eval_runtime": 1051.4215, "eval_samples_per_second": 6.998, "eval_steps_per_second": 1.75, "step": 4200 }, { "epoch": 0.85, "learning_rate": 0.00019031268894280023, "loss": 0.0334, "step": 4210 }, { "epoch": 0.85, "learning_rate": 0.00019026737612356094, "loss": 0.0382, "step": 4220 }, { "epoch": 0.85, "learning_rate": 0.00019022196299536733, "loss": 0.0371, "step": 4230 }, { "epoch": 0.85, "learning_rate": 0.00019017644960868445, "loss": 0.0353, "step": 4240 }, { "epoch": 0.86, "learning_rate": 0.00019013083601408863, "loss": 0.0368, "step": 4250 }, { "epoch": 0.86, "learning_rate": 0.0001900851222622677, "loss": 0.0427, "step": 4260 }, { "epoch": 0.86, "learning_rate": 0.00019003930840402072, "loss": 0.031, "step": 4270 }, { "epoch": 0.86, "learning_rate": 0.00018999339449025796, "loss": 0.0377, "step": 4280 }, { "epoch": 0.86, "learning_rate": 0.00018994738057200099, "loss": 0.0373, "step": 4290 }, { "epoch": 0.87, "learning_rate": 0.0001899012667003824, "loss": 0.0421, "step": 4300 }, { "epoch": 0.87, "eval_loss": 0.05914044752717018, "eval_runtime": 1056.4599, "eval_samples_per_second": 6.965, "eval_steps_per_second": 1.742, "step": 4300 }, { "epoch": 0.87, "learning_rate": 0.00018985505292664587, "loss": 0.0344, "step": 4310 }, { "epoch": 0.87, "learning_rate": 0.00018980873930214614, "loss": 0.0355, "step": 4320 }, { "epoch": 0.87, "learning_rate": 0.00018976232587834886, "loss": 0.0416, "step": 4330 }, { "epoch": 0.87, "learning_rate": 0.00018971581270683062, "loss": 0.0395, "step": 4340 }, { "epoch": 0.88, "learning_rate": 0.00018966919983927886, "loss": 0.0366, "step": 4350 }, { "epoch": 0.88, "learning_rate": 0.00018962248732749175, "loss": 0.0368, "step": 4360 }, { "epoch": 0.88, "learning_rate": 0.0001895756752233782, "loss": 0.0362, "step": 4370 }, { "epoch": 0.88, "learning_rate": 0.0001895287635789579, "loss": 0.0395, "step": 4380 }, { "epoch": 0.88, "learning_rate": 0.00018948175244636097, "loss": 0.0462, "step": 4390 }, { "epoch": 0.89, "learning_rate": 0.00018943464187782828, "loss": 0.0454, "step": 4400 }, { "epoch": 0.89, "eval_loss": 0.05928103253245354, "eval_runtime": 1029.6162, "eval_samples_per_second": 7.146, "eval_steps_per_second": 1.787, "step": 4400 }, { "epoch": 0.89, "learning_rate": 0.000189387431925711, "loss": 0.0326, "step": 4410 }, { "epoch": 0.89, "learning_rate": 0.0001893401226424709, "loss": 0.031, "step": 4420 }, { "epoch": 0.89, "learning_rate": 0.00018929271408068011, "loss": 0.0408, "step": 4430 }, { "epoch": 0.89, "step": 4431, "total_flos": 1.532304306097619e+18, "train_loss": 9.867495303766314e-06, "train_runtime": 4.4906, "train_samples_per_second": 15784.097, "train_steps_per_second": 986.506 } ], "logging_steps": 10, "max_steps": 4430, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "total_flos": 1.532304306097619e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }