{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.888888888888889, "eval_steps": 30, "global_step": 2400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012044564890093345, "grad_norm": 11.71904468536377, "learning_rate": 7.2e-06, "loss": 2.9594, "step": 10 }, { "epoch": 0.02408912978018669, "grad_norm": 2.4383621215820312, "learning_rate": 1.52e-05, "loss": 1.5501, "step": 20 }, { "epoch": 0.036133694670280034, "grad_norm": 0.7187284231185913, "learning_rate": 2.32e-05, "loss": 0.7478, "step": 30 }, { "epoch": 0.036133694670280034, "eval_loss": 0.6954202651977539, "eval_runtime": 81.243, "eval_samples_per_second": 4.308, "eval_steps_per_second": 2.154, "step": 30 }, { "epoch": 0.04817825956037338, "grad_norm": 0.9893588423728943, "learning_rate": 3.12e-05, "loss": 0.6087, "step": 40 }, { "epoch": 0.06022282445046673, "grad_norm": 0.6181250214576721, "learning_rate": 3.9200000000000004e-05, "loss": 0.4514, "step": 50 }, { "epoch": 0.07226738934056007, "grad_norm": 0.5894028544425964, "learning_rate": 4.72e-05, "loss": 0.3809, "step": 60 }, { "epoch": 0.07226738934056007, "eval_loss": 0.35219648480415344, "eval_runtime": 80.7477, "eval_samples_per_second": 4.334, "eval_steps_per_second": 2.167, "step": 60 }, { "epoch": 0.08431195423065342, "grad_norm": 3.429518222808838, "learning_rate": 5.520000000000001e-05, "loss": 0.2992, "step": 70 }, { "epoch": 0.09635651912074676, "grad_norm": 0.6878916025161743, "learning_rate": 6.32e-05, "loss": 0.2921, "step": 80 }, { "epoch": 0.10840108401084012, "grad_norm": 0.5862188339233398, "learning_rate": 7.12e-05, "loss": 0.2483, "step": 90 }, { "epoch": 0.10840108401084012, "eval_loss": 0.22866909205913544, "eval_runtime": 81.2164, "eval_samples_per_second": 4.309, "eval_steps_per_second": 2.155, "step": 90 }, { "epoch": 0.12044564890093346, "grad_norm": 0.7386724948883057, "learning_rate": 7.920000000000001e-05, "loss": 0.2107, "step": 100 }, { "epoch": 0.1324902137910268, "grad_norm": 0.6922011375427246, "learning_rate": 8.72e-05, "loss": 0.1962, "step": 110 }, { "epoch": 0.14453477868112014, "grad_norm": 0.4779876470565796, "learning_rate": 9.52e-05, "loss": 0.1741, "step": 120 }, { "epoch": 0.14453477868112014, "eval_loss": 0.18707558512687683, "eval_runtime": 81.1016, "eval_samples_per_second": 4.316, "eval_steps_per_second": 2.158, "step": 120 }, { "epoch": 0.1565793435712135, "grad_norm": 0.3711669445037842, "learning_rate": 0.0001032, "loss": 0.1775, "step": 130 }, { "epoch": 0.16862390846130684, "grad_norm": 0.5507099032402039, "learning_rate": 0.00011120000000000002, "loss": 0.1663, "step": 140 }, { "epoch": 0.18066847335140018, "grad_norm": 0.6897018551826477, "learning_rate": 0.0001192, "loss": 0.162, "step": 150 }, { "epoch": 0.18066847335140018, "eval_loss": 0.1689070761203766, "eval_runtime": 81.1763, "eval_samples_per_second": 4.312, "eval_steps_per_second": 2.156, "step": 150 }, { "epoch": 0.19271303824149352, "grad_norm": 0.30447226762771606, "learning_rate": 0.0001272, "loss": 0.1654, "step": 160 }, { "epoch": 0.20475760313158686, "grad_norm": 0.36739301681518555, "learning_rate": 0.0001352, "loss": 0.1692, "step": 170 }, { "epoch": 0.21680216802168023, "grad_norm": 0.42459923028945923, "learning_rate": 0.0001432, "loss": 0.1479, "step": 180 }, { "epoch": 0.21680216802168023, "eval_loss": 0.16287875175476074, "eval_runtime": 81.3447, "eval_samples_per_second": 4.303, "eval_steps_per_second": 2.151, "step": 180 }, { "epoch": 0.22884673291177357, "grad_norm": 0.24459399282932281, "learning_rate": 0.00015120000000000002, "loss": 0.1567, "step": 190 }, { "epoch": 0.2408912978018669, "grad_norm": 0.29077818989753723, "learning_rate": 0.00015920000000000002, "loss": 0.1491, "step": 200 }, { "epoch": 0.2529358626919603, "grad_norm": 0.3934674561023712, "learning_rate": 0.0001672, "loss": 0.1587, "step": 210 }, { "epoch": 0.2529358626919603, "eval_loss": 0.1568612903356552, "eval_runtime": 80.9345, "eval_samples_per_second": 4.324, "eval_steps_per_second": 2.162, "step": 210 }, { "epoch": 0.2649804275820536, "grad_norm": 0.31920939683914185, "learning_rate": 0.0001752, "loss": 0.1622, "step": 220 }, { "epoch": 0.27702499247214696, "grad_norm": 0.5031167268753052, "learning_rate": 0.0001832, "loss": 0.147, "step": 230 }, { "epoch": 0.28906955736224027, "grad_norm": 0.2890608310699463, "learning_rate": 0.0001912, "loss": 0.1433, "step": 240 }, { "epoch": 0.28906955736224027, "eval_loss": 0.15362557768821716, "eval_runtime": 80.9234, "eval_samples_per_second": 4.325, "eval_steps_per_second": 2.163, "step": 240 }, { "epoch": 0.30111412225233364, "grad_norm": 0.2849373519420624, "learning_rate": 0.00019920000000000002, "loss": 0.1471, "step": 250 }, { "epoch": 0.313158687142427, "grad_norm": 0.263950914144516, "learning_rate": 0.0001999920550627653, "loss": 0.155, "step": 260 }, { "epoch": 0.3252032520325203, "grad_norm": 0.31182682514190674, "learning_rate": 0.00019996459270297992, "loss": 0.138, "step": 270 }, { "epoch": 0.3252032520325203, "eval_loss": 0.15008553862571716, "eval_runtime": 80.9483, "eval_samples_per_second": 4.324, "eval_steps_per_second": 2.162, "step": 270 }, { "epoch": 0.3372478169226137, "grad_norm": 0.41790467500686646, "learning_rate": 0.00019991752022102606, "loss": 0.1472, "step": 280 }, { "epoch": 0.349292381812707, "grad_norm": 0.29337936639785767, "learning_rate": 0.0001998508468511445, "loss": 0.152, "step": 290 }, { "epoch": 0.36133694670280037, "grad_norm": 0.1533428430557251, "learning_rate": 0.0001997645856726956, "loss": 0.1427, "step": 300 }, { "epoch": 0.36133694670280037, "eval_loss": 0.14941252768039703, "eval_runtime": 81.0588, "eval_samples_per_second": 4.318, "eval_steps_per_second": 2.159, "step": 300 }, { "epoch": 0.37338151159289373, "grad_norm": 0.3626411557197571, "learning_rate": 0.0001996587536075934, "loss": 0.1539, "step": 310 }, { "epoch": 0.38542607648298705, "grad_norm": 0.3157321810722351, "learning_rate": 0.00019953337141698617, "loss": 0.1457, "step": 320 }, { "epoch": 0.3974706413730804, "grad_norm": 0.18949578702449799, "learning_rate": 0.00019938846369718348, "loss": 0.1471, "step": 330 }, { "epoch": 0.3974706413730804, "eval_loss": 0.14507745206356049, "eval_runtime": 81.0028, "eval_samples_per_second": 4.321, "eval_steps_per_second": 2.16, "step": 330 }, { "epoch": 0.4095152062631737, "grad_norm": 0.2189997136592865, "learning_rate": 0.0001992240588748314, "loss": 0.1423, "step": 340 }, { "epoch": 0.4215597711532671, "grad_norm": 0.2674465775489807, "learning_rate": 0.00019904018920133574, "loss": 0.1397, "step": 350 }, { "epoch": 0.43360433604336046, "grad_norm": 0.3467840552330017, "learning_rate": 0.00019883689074653548, "loss": 0.1486, "step": 360 }, { "epoch": 0.43360433604336046, "eval_loss": 0.1380164623260498, "eval_runtime": 81.2554, "eval_samples_per_second": 4.307, "eval_steps_per_second": 2.154, "step": 360 }, { "epoch": 0.4456489009334538, "grad_norm": 0.22817489504814148, "learning_rate": 0.00019861420339162682, "loss": 0.1411, "step": 370 }, { "epoch": 0.45769346582354714, "grad_norm": 0.1939237415790558, "learning_rate": 0.0001983721708213397, "loss": 0.1337, "step": 380 }, { "epoch": 0.46973803071364045, "grad_norm": 0.20223841071128845, "learning_rate": 0.00019811084051536812, "loss": 0.147, "step": 390 }, { "epoch": 0.46973803071364045, "eval_loss": 0.13835138082504272, "eval_runtime": 81.2083, "eval_samples_per_second": 4.31, "eval_steps_per_second": 2.155, "step": 390 }, { "epoch": 0.4817825956037338, "grad_norm": 0.16610513627529144, "learning_rate": 0.00019783026373905603, "loss": 0.1261, "step": 400 }, { "epoch": 0.49382716049382713, "grad_norm": 0.13892178237438202, "learning_rate": 0.0001975304955333405, "loss": 0.1303, "step": 410 }, { "epoch": 0.5058717253839206, "grad_norm": 0.17607590556144714, "learning_rate": 0.00019721159470395446, "loss": 0.1408, "step": 420 }, { "epoch": 0.5058717253839206, "eval_loss": 0.13557562232017517, "eval_runtime": 81.2501, "eval_samples_per_second": 4.308, "eval_steps_per_second": 2.154, "step": 420 }, { "epoch": 0.5179162902740139, "grad_norm": 0.2578093409538269, "learning_rate": 0.0001968736238098906, "loss": 0.1328, "step": 430 }, { "epoch": 0.5299608551641072, "grad_norm": 0.5826597213745117, "learning_rate": 0.00019651664915112913, "loss": 0.1379, "step": 440 }, { "epoch": 0.5420054200542005, "grad_norm": 0.2640504240989685, "learning_rate": 0.00019614074075563178, "loss": 0.1328, "step": 450 }, { "epoch": 0.5420054200542005, "eval_loss": 0.13704629242420197, "eval_runtime": 81.2461, "eval_samples_per_second": 4.308, "eval_steps_per_second": 2.154, "step": 450 }, { "epoch": 0.5540499849442939, "grad_norm": 0.25430986285209656, "learning_rate": 0.00019574597236560428, "loss": 0.1295, "step": 460 }, { "epoch": 0.5660945498343872, "grad_norm": 0.2172861099243164, "learning_rate": 0.00019533242142303028, "loss": 0.1274, "step": 470 }, { "epoch": 0.5781391147244805, "grad_norm": 0.18936924636363983, "learning_rate": 0.00019490016905447958, "loss": 0.131, "step": 480 }, { "epoch": 0.5781391147244805, "eval_loss": 0.13753947615623474, "eval_runtime": 81.0949, "eval_samples_per_second": 4.316, "eval_steps_per_second": 2.158, "step": 480 }, { "epoch": 0.590183679614574, "grad_norm": 0.26435989141464233, "learning_rate": 0.00019444930005519347, "loss": 0.132, "step": 490 }, { "epoch": 0.6022282445046673, "grad_norm": 0.23327338695526123, "learning_rate": 0.00019397990287245027, "loss": 0.1417, "step": 500 }, { "epoch": 0.6142728093947606, "grad_norm": 0.14256203174591064, "learning_rate": 0.00019349206958821474, "loss": 0.1392, "step": 510 }, { "epoch": 0.6142728093947606, "eval_loss": 0.13641956448554993, "eval_runtime": 81.0841, "eval_samples_per_second": 4.317, "eval_steps_per_second": 2.158, "step": 510 }, { "epoch": 0.626317374284854, "grad_norm": 0.16011199355125427, "learning_rate": 0.00019298589590107415, "loss": 0.1406, "step": 520 }, { "epoch": 0.6383619391749473, "grad_norm": 0.1269129067659378, "learning_rate": 0.00019246148110746515, "loss": 0.1346, "step": 530 }, { "epoch": 0.6504065040650406, "grad_norm": 0.13770046830177307, "learning_rate": 0.0001919189280821946, "loss": 0.1315, "step": 540 }, { "epoch": 0.6504065040650406, "eval_loss": 0.1363556832075119, "eval_runtime": 81.2186, "eval_samples_per_second": 4.309, "eval_steps_per_second": 2.155, "step": 540 }, { "epoch": 0.662451068955134, "grad_norm": 0.18796966969966888, "learning_rate": 0.00019135834325825868, "loss": 0.13, "step": 550 }, { "epoch": 0.6744956338452274, "grad_norm": 0.12326055020093918, "learning_rate": 0.00019077983660596365, "loss": 0.1339, "step": 560 }, { "epoch": 0.6865401987353207, "grad_norm": 0.1480596363544464, "learning_rate": 0.00019018352161135317, "loss": 0.1296, "step": 570 }, { "epoch": 0.6865401987353207, "eval_loss": 0.13280269503593445, "eval_runtime": 81.3546, "eval_samples_per_second": 4.302, "eval_steps_per_second": 2.151, "step": 570 }, { "epoch": 0.698584763625414, "grad_norm": 0.15606318414211273, "learning_rate": 0.0001895695152539455, "loss": 0.1296, "step": 580 }, { "epoch": 0.7106293285155074, "grad_norm": 0.09551403671503067, "learning_rate": 0.00018893793798378553, "loss": 0.1323, "step": 590 }, { "epoch": 0.7226738934056007, "grad_norm": 0.11749322712421417, "learning_rate": 0.00018828891369781605, "loss": 0.1179, "step": 600 }, { "epoch": 0.7226738934056007, "eval_loss": 0.12917345762252808, "eval_runtime": 81.4843, "eval_samples_per_second": 4.295, "eval_steps_per_second": 2.148, "step": 600 }, { "epoch": 0.734718458295694, "grad_norm": 0.11540284752845764, "learning_rate": 0.0001876225697155729, "loss": 0.1227, "step": 610 }, { "epoch": 0.7467630231857875, "grad_norm": 0.3190910518169403, "learning_rate": 0.00018693903675420846, "loss": 0.1281, "step": 620 }, { "epoch": 0.7588075880758808, "grad_norm": 0.11964758485555649, "learning_rate": 0.00018623844890284884, "loss": 0.1356, "step": 630 }, { "epoch": 0.7588075880758808, "eval_loss": 0.12936915457248688, "eval_runtime": 81.4409, "eval_samples_per_second": 4.298, "eval_steps_per_second": 2.149, "step": 630 }, { "epoch": 0.7708521529659741, "grad_norm": 0.10809649527072906, "learning_rate": 0.00018552094359628956, "loss": 0.1337, "step": 640 }, { "epoch": 0.7828967178560674, "grad_norm": 0.13179966807365417, "learning_rate": 0.00018478666158803475, "loss": 0.1213, "step": 650 }, { "epoch": 0.7949412827461608, "grad_norm": 0.13050246238708496, "learning_rate": 0.00018403574692268566, "loss": 0.1274, "step": 660 }, { "epoch": 0.7949412827461608, "eval_loss": 0.129548579454422, "eval_runtime": 81.3854, "eval_samples_per_second": 4.301, "eval_steps_per_second": 2.15, "step": 660 }, { "epoch": 0.8069858476362541, "grad_norm": 0.10548313707113266, "learning_rate": 0.00018326834690768308, "loss": 0.1266, "step": 670 }, { "epoch": 0.8190304125263475, "grad_norm": 0.11163028329610825, "learning_rate": 0.00018248461208441016, "loss": 0.1275, "step": 680 }, { "epoch": 0.8310749774164409, "grad_norm": 0.10246960818767548, "learning_rate": 0.00018168469619866037, "loss": 0.1262, "step": 690 }, { "epoch": 0.8310749774164409, "eval_loss": 0.12998254597187042, "eval_runtime": 81.2607, "eval_samples_per_second": 4.307, "eval_steps_per_second": 2.154, "step": 690 }, { "epoch": 0.8431195423065342, "grad_norm": 0.16775400936603546, "learning_rate": 0.00018086875617047738, "loss": 0.128, "step": 700 }, { "epoch": 0.8551641071966275, "grad_norm": 0.09823903441429138, "learning_rate": 0.00018003695206337164, "loss": 0.1186, "step": 710 }, { "epoch": 0.8672086720867209, "grad_norm": 0.08154378831386566, "learning_rate": 0.0001791894470529209, "loss": 0.1304, "step": 720 }, { "epoch": 0.8672086720867209, "eval_loss": 0.12851889431476593, "eval_runtime": 81.3179, "eval_samples_per_second": 4.304, "eval_steps_per_second": 2.152, "step": 720 }, { "epoch": 0.8792532369768142, "grad_norm": 0.13566914200782776, "learning_rate": 0.00017832640739475964, "loss": 0.1256, "step": 730 }, { "epoch": 0.8912978018669075, "grad_norm": 0.21967343986034393, "learning_rate": 0.00017744800239196485, "loss": 0.1049, "step": 740 }, { "epoch": 0.9033423667570009, "grad_norm": 0.12291447818279266, "learning_rate": 0.00017655440436184361, "loss": 0.1352, "step": 750 }, { "epoch": 0.9033423667570009, "eval_loss": 0.12859711050987244, "eval_runtime": 81.5112, "eval_samples_per_second": 4.294, "eval_steps_per_second": 2.147, "step": 750 }, { "epoch": 0.9153869316470943, "grad_norm": 0.23631049692630768, "learning_rate": 0.00017564578860212952, "loss": 0.1236, "step": 760 }, { "epoch": 0.9274314965371876, "grad_norm": 0.13145385682582855, "learning_rate": 0.00017472233335659443, "loss": 0.1235, "step": 770 }, { "epoch": 0.9394760614272809, "grad_norm": 0.0923519879579544, "learning_rate": 0.00017378421978008212, "loss": 0.1224, "step": 780 }, { "epoch": 0.9394760614272809, "eval_loss": 0.12813878059387207, "eval_runtime": 81.2514, "eval_samples_per_second": 4.308, "eval_steps_per_second": 2.154, "step": 780 }, { "epoch": 0.9515206263173743, "grad_norm": 0.2052728235721588, "learning_rate": 0.0001728316319029713, "loss": 0.1189, "step": 790 }, { "epoch": 0.9635651912074676, "grad_norm": 0.19939783215522766, "learning_rate": 0.0001718647565950739, "loss": 0.128, "step": 800 }, { "epoch": 0.975609756097561, "grad_norm": 0.11448610574007034, "learning_rate": 0.00017088378352897703, "loss": 0.1103, "step": 810 }, { "epoch": 0.975609756097561, "eval_loss": 0.12625984847545624, "eval_runtime": 81.2673, "eval_samples_per_second": 4.307, "eval_steps_per_second": 2.153, "step": 810 }, { "epoch": 0.9876543209876543, "grad_norm": 0.11135770380496979, "learning_rate": 0.00016988890514283447, "loss": 0.1281, "step": 820 }, { "epoch": 0.9996988858777477, "grad_norm": 0.11028297245502472, "learning_rate": 0.00016888031660261622, "loss": 0.1225, "step": 830 }, { "epoch": 1.010840108401084, "grad_norm": 0.11298307776451111, "learning_rate": 0.00016785821576382245, "loss": 0.1188, "step": 840 }, { "epoch": 1.010840108401084, "eval_loss": 0.1276378333568573, "eval_runtime": 81.1811, "eval_samples_per_second": 4.311, "eval_steps_per_second": 2.156, "step": 840 }, { "epoch": 1.0228846732911774, "grad_norm": 0.09120020270347595, "learning_rate": 0.0001668228031326702, "loss": 0.1169, "step": 850 }, { "epoch": 1.0349292381812707, "grad_norm": 0.11060494184494019, "learning_rate": 0.00016577428182675973, "loss": 0.1243, "step": 860 }, { "epoch": 1.046973803071364, "grad_norm": 0.09636874496936798, "learning_rate": 0.0001647128575352292, "loss": 0.1187, "step": 870 }, { "epoch": 1.046973803071364, "eval_loss": 0.12680906057357788, "eval_runtime": 81.1164, "eval_samples_per_second": 4.315, "eval_steps_per_second": 2.157, "step": 870 }, { "epoch": 1.0590183679614573, "grad_norm": 0.10281568765640259, "learning_rate": 0.000163638738478404, "loss": 0.133, "step": 880 }, { "epoch": 1.0710629328515506, "grad_norm": 0.08682233840227127, "learning_rate": 0.0001625521353669504, "loss": 0.1171, "step": 890 }, { "epoch": 1.0831074977416442, "grad_norm": 0.1339954435825348, "learning_rate": 0.00016145326136054008, "loss": 0.1257, "step": 900 }, { "epoch": 1.0831074977416442, "eval_loss": 0.12813013792037964, "eval_runtime": 81.4418, "eval_samples_per_second": 4.298, "eval_steps_per_second": 2.149, "step": 900 }, { "epoch": 1.0951520626317375, "grad_norm": 0.09956265985965729, "learning_rate": 0.00016034233202603463, "loss": 0.1287, "step": 910 }, { "epoch": 1.1071966275218308, "grad_norm": 0.07889163494110107, "learning_rate": 0.00015921956529519747, "loss": 0.1135, "step": 920 }, { "epoch": 1.119241192411924, "grad_norm": 0.07798007130622864, "learning_rate": 0.00015808518142194214, "loss": 0.1128, "step": 930 }, { "epoch": 1.119241192411924, "eval_loss": 0.12662966549396515, "eval_runtime": 81.2769, "eval_samples_per_second": 4.306, "eval_steps_per_second": 2.153, "step": 930 }, { "epoch": 1.1312857573020174, "grad_norm": 0.09110717475414276, "learning_rate": 0.00015693940293912492, "loss": 0.1161, "step": 940 }, { "epoch": 1.1433303221921107, "grad_norm": 0.10704551637172699, "learning_rate": 0.00015578245461489042, "loss": 0.1237, "step": 950 }, { "epoch": 1.1553748870822043, "grad_norm": 0.12007839977741241, "learning_rate": 0.00015461456340857857, "loss": 0.1246, "step": 960 }, { "epoch": 1.1553748870822043, "eval_loss": 0.12597930431365967, "eval_runtime": 81.2379, "eval_samples_per_second": 4.308, "eval_steps_per_second": 2.154, "step": 960 }, { "epoch": 1.1674194519722976, "grad_norm": 0.09195715934038162, "learning_rate": 0.00015343595842620198, "loss": 0.1179, "step": 970 }, { "epoch": 1.1794640168623909, "grad_norm": 0.11327285319566727, "learning_rate": 0.000152246870875502, "loss": 0.1219, "step": 980 }, { "epoch": 1.1915085817524842, "grad_norm": 0.12147443741559982, "learning_rate": 0.00015104753402059252, "loss": 0.1198, "step": 990 }, { "epoch": 1.1915085817524842, "eval_loss": 0.12678198516368866, "eval_runtime": 81.2272, "eval_samples_per_second": 4.309, "eval_steps_per_second": 2.154, "step": 990 }, { "epoch": 1.2035531466425775, "grad_norm": 0.12530316412448883, "learning_rate": 0.00014983818313620047, "loss": 0.1215, "step": 1000 }, { "epoch": 1.2155977115326708, "grad_norm": 0.21503940224647522, "learning_rate": 0.00014861905546151164, "loss": 0.132, "step": 1010 }, { "epoch": 1.2276422764227641, "grad_norm": 0.09851568937301636, "learning_rate": 0.00014739039015363155, "loss": 0.1255, "step": 1020 }, { "epoch": 1.2276422764227641, "eval_loss": 0.1265084594488144, "eval_runtime": 81.2978, "eval_samples_per_second": 4.305, "eval_steps_per_second": 2.153, "step": 1020 }, { "epoch": 1.2396868413128577, "grad_norm": 0.09794170409440994, "learning_rate": 0.0001461524282406696, "loss": 0.1203, "step": 1030 }, { "epoch": 1.251731406202951, "grad_norm": 0.09283249080181122, "learning_rate": 0.00014490541257445664, "loss": 0.1241, "step": 1040 }, { "epoch": 1.2637759710930443, "grad_norm": 0.10757778584957123, "learning_rate": 0.00014364958778290436, "loss": 0.1263, "step": 1050 }, { "epoch": 1.2637759710930443, "eval_loss": 0.1275068074464798, "eval_runtime": 81.3037, "eval_samples_per_second": 4.305, "eval_steps_per_second": 2.152, "step": 1050 }, { "epoch": 1.2758205359831376, "grad_norm": 0.08741755038499832, "learning_rate": 0.00014238520022201665, "loss": 0.1279, "step": 1060 }, { "epoch": 1.287865100873231, "grad_norm": 0.09674480557441711, "learning_rate": 0.00014111249792756164, "loss": 0.1203, "step": 1070 }, { "epoch": 1.2999096657633242, "grad_norm": 0.18980656564235687, "learning_rate": 0.00013983173056641437, "loss": 0.1273, "step": 1080 }, { "epoch": 1.2999096657633242, "eval_loss": 0.12540876865386963, "eval_runtime": 81.3684, "eval_samples_per_second": 4.301, "eval_steps_per_second": 2.151, "step": 1080 }, { "epoch": 1.3119542306534178, "grad_norm": 0.11260770261287689, "learning_rate": 0.00013854314938757954, "loss": 0.1209, "step": 1090 }, { "epoch": 1.323998795543511, "grad_norm": 0.08883814513683319, "learning_rate": 0.00013724700717290385, "loss": 0.1307, "step": 1100 }, { "epoch": 1.3360433604336044, "grad_norm": 0.08555177599191666, "learning_rate": 0.0001359435581874874, "loss": 0.1187, "step": 1110 }, { "epoch": 1.3360433604336044, "eval_loss": 0.1246650293469429, "eval_runtime": 81.3179, "eval_samples_per_second": 4.304, "eval_steps_per_second": 2.152, "step": 1110 }, { "epoch": 1.3480879253236977, "grad_norm": 0.09265279769897461, "learning_rate": 0.0001346330581298046, "loss": 0.1237, "step": 1120 }, { "epoch": 1.360132490213791, "grad_norm": 0.07397205382585526, "learning_rate": 0.0001333157640815434, "loss": 0.1194, "step": 1130 }, { "epoch": 1.3721770551038843, "grad_norm": 0.10523436218500137, "learning_rate": 0.00013199193445717362, "loss": 0.1215, "step": 1140 }, { "epoch": 1.3721770551038843, "eval_loss": 0.12532441318035126, "eval_runtime": 81.5003, "eval_samples_per_second": 4.294, "eval_steps_per_second": 2.147, "step": 1140 }, { "epoch": 1.3842216199939776, "grad_norm": 0.08104018867015839, "learning_rate": 0.00013066182895325339, "loss": 0.1276, "step": 1150 }, { "epoch": 1.396266184884071, "grad_norm": 0.1049809381365776, "learning_rate": 0.00012932570849748446, "loss": 0.128, "step": 1160 }, { "epoch": 1.4083107497741645, "grad_norm": 0.10500895977020264, "learning_rate": 0.00012798383519752577, "loss": 0.1179, "step": 1170 }, { "epoch": 1.4083107497741645, "eval_loss": 0.12398175895214081, "eval_runtime": 81.4012, "eval_samples_per_second": 4.3, "eval_steps_per_second": 2.15, "step": 1170 }, { "epoch": 1.4203553146642578, "grad_norm": 0.08427383005619049, "learning_rate": 0.00012663647228957562, "loss": 0.1145, "step": 1180 }, { "epoch": 1.432399879554351, "grad_norm": 0.08236031979322433, "learning_rate": 0.0001252838840867324, "loss": 0.1216, "step": 1190 }, { "epoch": 1.4444444444444444, "grad_norm": 0.08572593331336975, "learning_rate": 0.00012392633592714423, "loss": 0.128, "step": 1200 }, { "epoch": 1.4444444444444444, "eval_loss": 0.1240333542227745, "eval_runtime": 81.3808, "eval_samples_per_second": 4.301, "eval_steps_per_second": 2.15, "step": 1200 }, { "epoch": 1.4564890093345377, "grad_norm": 0.08900213986635208, "learning_rate": 0.00012256409412195727, "loss": 0.1171, "step": 1210 }, { "epoch": 1.4685335742246313, "grad_norm": 0.09740438312292099, "learning_rate": 0.0001211974259030733, "loss": 0.1316, "step": 1220 }, { "epoch": 1.4805781391147246, "grad_norm": 0.07978689670562744, "learning_rate": 0.00011982659937072677, "loss": 0.1219, "step": 1230 }, { "epoch": 1.4805781391147246, "eval_loss": 0.12412716448307037, "eval_runtime": 81.2608, "eval_samples_per_second": 4.307, "eval_steps_per_second": 2.154, "step": 1230 }, { "epoch": 1.492622704004818, "grad_norm": 0.08000902086496353, "learning_rate": 0.00011845188344089126, "loss": 0.1135, "step": 1240 }, { "epoch": 1.5046672688949112, "grad_norm": 0.07367110252380371, "learning_rate": 0.00011707354779252612, "loss": 0.1206, "step": 1250 }, { "epoch": 1.5167118337850045, "grad_norm": 0.07485098391771317, "learning_rate": 0.00011569186281467335, "loss": 0.1138, "step": 1260 }, { "epoch": 1.5167118337850045, "eval_loss": 0.1249684989452362, "eval_runtime": 81.6234, "eval_samples_per_second": 4.288, "eval_steps_per_second": 2.144, "step": 1260 }, { "epoch": 1.5287563986750978, "grad_norm": 0.11146491765975952, "learning_rate": 0.00011430709955341514, "loss": 0.1272, "step": 1270 }, { "epoch": 1.5408009635651911, "grad_norm": 0.08664289861917496, "learning_rate": 0.00011291952965870269, "loss": 0.1147, "step": 1280 }, { "epoch": 1.5528455284552845, "grad_norm": 0.09051596373319626, "learning_rate": 0.00011152942533106638, "loss": 0.1161, "step": 1290 }, { "epoch": 1.5528455284552845, "eval_loss": 0.12454230338335037, "eval_runtime": 81.4318, "eval_samples_per_second": 4.298, "eval_steps_per_second": 2.149, "step": 1290 }, { "epoch": 1.5648900933453778, "grad_norm": 0.06956392526626587, "learning_rate": 0.000110137059268218, "loss": 0.1077, "step": 1300 }, { "epoch": 1.5769346582354713, "grad_norm": 0.07337108254432678, "learning_rate": 0.00010874270461155554, "loss": 0.124, "step": 1310 }, { "epoch": 1.5889792231255646, "grad_norm": 0.08082354068756104, "learning_rate": 0.0001073466348925807, "loss": 0.1108, "step": 1320 }, { "epoch": 1.5889792231255646, "eval_loss": 0.12420380860567093, "eval_runtime": 81.6866, "eval_samples_per_second": 4.285, "eval_steps_per_second": 2.142, "step": 1320 }, { "epoch": 1.601023788015658, "grad_norm": 0.07141824066638947, "learning_rate": 0.00010594912397924018, "loss": 0.1112, "step": 1330 }, { "epoch": 1.6130683529057512, "grad_norm": 0.09476039558649063, "learning_rate": 0.00010455044602220076, "loss": 0.1257, "step": 1340 }, { "epoch": 1.6251129177958448, "grad_norm": 0.06789754331111908, "learning_rate": 0.00010315087540106894, "loss": 0.1167, "step": 1350 }, { "epoch": 1.6251129177958448, "eval_loss": 0.12411510944366455, "eval_runtime": 81.5821, "eval_samples_per_second": 4.29, "eval_steps_per_second": 2.145, "step": 1350 }, { "epoch": 1.637157482685938, "grad_norm": 0.08458850532770157, "learning_rate": 0.00010175068667056578, "loss": 0.1215, "step": 1360 }, { "epoch": 1.6492020475760314, "grad_norm": 0.0818653479218483, "learning_rate": 0.00010035015450666723, "loss": 0.1193, "step": 1370 }, { "epoch": 1.6612466124661247, "grad_norm": 0.060270510613918304, "learning_rate": 9.894955365272087e-05, "loss": 0.1094, "step": 1380 }, { "epoch": 1.6612466124661247, "eval_loss": 0.12342710047960281, "eval_runtime": 81.3723, "eval_samples_per_second": 4.301, "eval_steps_per_second": 2.151, "step": 1380 }, { "epoch": 1.673291177356218, "grad_norm": 0.08454828709363937, "learning_rate": 9.75491588655492e-05, "loss": 0.1205, "step": 1390 }, { "epoch": 1.6853357422463113, "grad_norm": 0.06156294047832489, "learning_rate": 9.614924486155047e-05, "loss": 0.1111, "step": 1400 }, { "epoch": 1.6973803071364046, "grad_norm": 0.08005426079034805, "learning_rate": 9.475008626280739e-05, "loss": 0.1077, "step": 1410 }, { "epoch": 1.6973803071364046, "eval_loss": 0.12282679229974747, "eval_runtime": 81.2841, "eval_samples_per_second": 4.306, "eval_steps_per_second": 2.153, "step": 1410 }, { "epoch": 1.709424872026498, "grad_norm": 0.08515966683626175, "learning_rate": 9.335195754321427e-05, "loss": 0.1226, "step": 1420 }, { "epoch": 1.7214694369165913, "grad_norm": 0.08117437362670898, "learning_rate": 9.195513297463339e-05, "loss": 0.1158, "step": 1430 }, { "epoch": 1.7335140018066846, "grad_norm": 0.0748409777879715, "learning_rate": 9.055988657309075e-05, "loss": 0.1152, "step": 1440 }, { "epoch": 1.7335140018066846, "eval_loss": 0.12351047992706299, "eval_runtime": 81.3284, "eval_samples_per_second": 4.304, "eval_steps_per_second": 2.152, "step": 1440 }, { "epoch": 1.7455585666967781, "grad_norm": 0.08821168541908264, "learning_rate": 8.916649204502231e-05, "loss": 0.1231, "step": 1450 }, { "epoch": 1.7576031315868714, "grad_norm": 0.06368843466043472, "learning_rate": 8.777522273358076e-05, "loss": 0.1144, "step": 1460 }, { "epoch": 1.7696476964769647, "grad_norm": 0.08906359225511551, "learning_rate": 8.638635156501353e-05, "loss": 0.1278, "step": 1470 }, { "epoch": 1.7696476964769647, "eval_loss": 0.12412309646606445, "eval_runtime": 81.4647, "eval_samples_per_second": 4.296, "eval_steps_per_second": 2.148, "step": 1470 }, { "epoch": 1.7816922613670583, "grad_norm": 0.13860240578651428, "learning_rate": 8.500015099512282e-05, "loss": 0.1135, "step": 1480 }, { "epoch": 1.7937368262571516, "grad_norm": 0.08925935626029968, "learning_rate": 8.361689295581759e-05, "loss": 0.1274, "step": 1490 }, { "epoch": 1.805781391147245, "grad_norm": 0.2061154991388321, "learning_rate": 8.223684880176861e-05, "loss": 0.1245, "step": 1500 }, { "epoch": 1.805781391147245, "eval_loss": 0.12294992804527283, "eval_runtime": 81.5386, "eval_samples_per_second": 4.292, "eval_steps_per_second": 2.146, "step": 1500 }, { "epoch": 1.8178259560373382, "grad_norm": 0.07016266882419586, "learning_rate": 8.086028925717661e-05, "loss": 0.1215, "step": 1510 }, { "epoch": 1.8298705209274315, "grad_norm": 0.19503405690193176, "learning_rate": 7.948748436266409e-05, "loss": 0.1169, "step": 1520 }, { "epoch": 1.8419150858175248, "grad_norm": 0.07560829073190689, "learning_rate": 7.811870342230127e-05, "loss": 0.1219, "step": 1530 }, { "epoch": 1.8419150858175248, "eval_loss": 0.12353645265102386, "eval_runtime": 81.5233, "eval_samples_per_second": 4.293, "eval_steps_per_second": 2.147, "step": 1530 }, { "epoch": 1.8539596507076181, "grad_norm": 0.06638047844171524, "learning_rate": 7.675421495077657e-05, "loss": 0.1174, "step": 1540 }, { "epoch": 1.8660042155977115, "grad_norm": 0.07245375961065292, "learning_rate": 7.539428662072188e-05, "loss": 0.1263, "step": 1550 }, { "epoch": 1.8780487804878048, "grad_norm": 0.08080323040485382, "learning_rate": 7.403918521020305e-05, "loss": 0.1261, "step": 1560 }, { "epoch": 1.8780487804878048, "eval_loss": 0.12283791601657867, "eval_runtime": 81.5734, "eval_samples_per_second": 4.291, "eval_steps_per_second": 2.145, "step": 1560 }, { "epoch": 1.890093345377898, "grad_norm": 0.06938530504703522, "learning_rate": 7.268917655038581e-05, "loss": 0.1167, "step": 1570 }, { "epoch": 1.9021379102679916, "grad_norm": 0.07764075696468353, "learning_rate": 7.134452547338753e-05, "loss": 0.1191, "step": 1580 }, { "epoch": 1.914182475158085, "grad_norm": 0.10729371011257172, "learning_rate": 7.000549576032489e-05, "loss": 0.1175, "step": 1590 }, { "epoch": 1.914182475158085, "eval_loss": 0.12257985025644302, "eval_runtime": 81.3844, "eval_samples_per_second": 4.301, "eval_steps_per_second": 2.15, "step": 1590 }, { "epoch": 1.9262270400481782, "grad_norm": 0.06268001347780228, "learning_rate": 6.867235008956783e-05, "loss": 0.1203, "step": 1600 }, { "epoch": 1.9382716049382716, "grad_norm": 0.059695664793252945, "learning_rate": 6.734534998520969e-05, "loss": 0.1147, "step": 1610 }, { "epoch": 1.950316169828365, "grad_norm": 0.07781612873077393, "learning_rate": 6.602475576576383e-05, "loss": 0.1191, "step": 1620 }, { "epoch": 1.950316169828365, "eval_loss": 0.1225815936923027, "eval_runtime": 81.7725, "eval_samples_per_second": 4.28, "eval_steps_per_second": 2.14, "step": 1620 }, { "epoch": 1.9623607347184584, "grad_norm": 0.06548433750867844, "learning_rate": 6.471082649309686e-05, "loss": 0.1181, "step": 1630 }, { "epoch": 1.9744052996085517, "grad_norm": 0.0688992366194725, "learning_rate": 6.34038199216082e-05, "loss": 0.1139, "step": 1640 }, { "epoch": 1.986449864498645, "grad_norm": 0.07544533908367157, "learning_rate": 6.210399244766632e-05, "loss": 0.1194, "step": 1650 }, { "epoch": 1.986449864498645, "eval_loss": 0.12259992212057114, "eval_runtime": 81.5949, "eval_samples_per_second": 4.289, "eval_steps_per_second": 2.145, "step": 1650 }, { "epoch": 1.9984944293887383, "grad_norm": 0.13208015263080597, "learning_rate": 6.0811599059311195e-05, "loss": 0.1185, "step": 1660 }, { "epoch": 2.009635651912075, "grad_norm": 0.08133638650178909, "learning_rate": 5.952689328623321e-05, "loss": 0.1264, "step": 1670 }, { "epoch": 2.021680216802168, "grad_norm": 0.08000742644071579, "learning_rate": 5.8250127150038016e-05, "loss": 0.1108, "step": 1680 }, { "epoch": 2.021680216802168, "eval_loss": 0.12281496077775955, "eval_runtime": 81.5447, "eval_samples_per_second": 4.292, "eval_steps_per_second": 2.146, "step": 1680 }, { "epoch": 2.0337247816922615, "grad_norm": 0.06525713950395584, "learning_rate": 5.698155111480722e-05, "loss": 0.1141, "step": 1690 }, { "epoch": 2.0457693465823548, "grad_norm": 0.06997241824865341, "learning_rate": 5.57214140379649e-05, "loss": 0.1146, "step": 1700 }, { "epoch": 2.057813911472448, "grad_norm": 0.07603967189788818, "learning_rate": 5.4469963121458776e-05, "loss": 0.1169, "step": 1710 }, { "epoch": 2.057813911472448, "eval_loss": 0.12317664176225662, "eval_runtime": 81.5005, "eval_samples_per_second": 4.294, "eval_steps_per_second": 2.147, "step": 1710 }, { "epoch": 2.0698584763625414, "grad_norm": 0.09060684591531754, "learning_rate": 5.322744386326675e-05, "loss": 0.1139, "step": 1720 }, { "epoch": 2.0819030412526347, "grad_norm": 0.09253629297018051, "learning_rate": 5.1994100009237e-05, "loss": 0.1174, "step": 1730 }, { "epoch": 2.093947606142728, "grad_norm": 0.0707191526889801, "learning_rate": 5.077017350527269e-05, "loss": 0.1138, "step": 1740 }, { "epoch": 2.093947606142728, "eval_loss": 0.1225883737206459, "eval_runtime": 81.4669, "eval_samples_per_second": 4.296, "eval_steps_per_second": 2.148, "step": 1740 }, { "epoch": 2.1059921710328213, "grad_norm": 0.06803560256958008, "learning_rate": 4.9555904449868795e-05, "loss": 0.1196, "step": 1750 }, { "epoch": 2.1180367359229146, "grad_norm": 0.08914126455783844, "learning_rate": 4.835153104701221e-05, "loss": 0.1129, "step": 1760 }, { "epoch": 2.130081300813008, "grad_norm": 0.08918345719575882, "learning_rate": 4.71572895594528e-05, "loss": 0.1183, "step": 1770 }, { "epoch": 2.130081300813008, "eval_loss": 0.12287717312574387, "eval_runtime": 81.4917, "eval_samples_per_second": 4.295, "eval_steps_per_second": 2.147, "step": 1770 }, { "epoch": 2.1421258657031013, "grad_norm": 0.07838484644889832, "learning_rate": 4.5973414262355785e-05, "loss": 0.1141, "step": 1780 }, { "epoch": 2.154170430593195, "grad_norm": 0.11005687713623047, "learning_rate": 4.480013739734368e-05, "loss": 0.1223, "step": 1790 }, { "epoch": 2.1662149954832883, "grad_norm": 0.08749410510063171, "learning_rate": 4.363768912693749e-05, "loss": 0.1117, "step": 1800 }, { "epoch": 2.1662149954832883, "eval_loss": 0.12259072810411453, "eval_runtime": 81.4558, "eval_samples_per_second": 4.297, "eval_steps_per_second": 2.148, "step": 1800 }, { "epoch": 2.1782595603733816, "grad_norm": 0.08549398928880692, "learning_rate": 4.24862974894053e-05, "loss": 0.1088, "step": 1810 }, { "epoch": 2.190304125263475, "grad_norm": 0.05755528435111046, "learning_rate": 4.134618835402816e-05, "loss": 0.1063, "step": 1820 }, { "epoch": 2.2023486901535683, "grad_norm": 0.07486403733491898, "learning_rate": 4.0217585376790834e-05, "loss": 0.1183, "step": 1830 }, { "epoch": 2.2023486901535683, "eval_loss": 0.12217788398265839, "eval_runtime": 81.8011, "eval_samples_per_second": 4.279, "eval_steps_per_second": 2.139, "step": 1830 }, { "epoch": 2.2143932550436616, "grad_norm": 0.08609265834093094, "learning_rate": 3.9100709956507356e-05, "loss": 0.1254, "step": 1840 }, { "epoch": 2.226437819933755, "grad_norm": 0.0692862719297409, "learning_rate": 3.79957811913888e-05, "loss": 0.1121, "step": 1850 }, { "epoch": 2.238482384823848, "grad_norm": 0.08547110855579376, "learning_rate": 3.6903015836062905e-05, "loss": 0.1097, "step": 1860 }, { "epoch": 2.238482384823848, "eval_loss": 0.12183844298124313, "eval_runtime": 81.5286, "eval_samples_per_second": 4.293, "eval_steps_per_second": 2.146, "step": 1860 }, { "epoch": 2.2505269497139415, "grad_norm": 0.08573822677135468, "learning_rate": 3.5822628259052906e-05, "loss": 0.1174, "step": 1870 }, { "epoch": 2.262571514604035, "grad_norm": 0.08069294691085815, "learning_rate": 3.475483040072495e-05, "loss": 0.1198, "step": 1880 }, { "epoch": 2.274616079494128, "grad_norm": 0.08202630281448364, "learning_rate": 3.369983173171141e-05, "loss": 0.1132, "step": 1890 }, { "epoch": 2.274616079494128, "eval_loss": 0.12189455330371857, "eval_runtime": 81.6511, "eval_samples_per_second": 4.287, "eval_steps_per_second": 2.143, "step": 1890 }, { "epoch": 2.2866606443842215, "grad_norm": 0.08111971616744995, "learning_rate": 3.2657839211819085e-05, "loss": 0.1086, "step": 1900 }, { "epoch": 2.298705209274315, "grad_norm": 0.07577092200517654, "learning_rate": 3.1629057249429527e-05, "loss": 0.1205, "step": 1910 }, { "epoch": 2.3107497741644085, "grad_norm": 0.0689835473895073, "learning_rate": 3.0613687661400384e-05, "loss": 0.1133, "step": 1920 }, { "epoch": 2.3107497741644085, "eval_loss": 0.12182266265153885, "eval_runtime": 81.5792, "eval_samples_per_second": 4.29, "eval_steps_per_second": 2.145, "step": 1920 }, { "epoch": 2.322794339054502, "grad_norm": 0.09697998315095901, "learning_rate": 2.9611929633474555e-05, "loss": 0.1214, "step": 1930 }, { "epoch": 2.334838903944595, "grad_norm": 0.07920137792825699, "learning_rate": 2.8623979681206002e-05, "loss": 0.1108, "step": 1940 }, { "epoch": 2.3468834688346885, "grad_norm": 0.07563956826925278, "learning_rate": 2.765003161140911e-05, "loss": 0.1213, "step": 1950 }, { "epoch": 2.3468834688346885, "eval_loss": 0.12195436656475067, "eval_runtime": 81.5329, "eval_samples_per_second": 4.293, "eval_steps_per_second": 2.146, "step": 1950 }, { "epoch": 2.3589280337247818, "grad_norm": 0.08294857293367386, "learning_rate": 2.66902764841394e-05, "loss": 0.1186, "step": 1960 }, { "epoch": 2.370972598614875, "grad_norm": 0.0806855708360672, "learning_rate": 2.5744902575213248e-05, "loss": 0.1127, "step": 1970 }, { "epoch": 2.3830171635049684, "grad_norm": 0.1039934828877449, "learning_rate": 2.481409533927358e-05, "loss": 0.1188, "step": 1980 }, { "epoch": 2.3830171635049684, "eval_loss": 0.12177734076976776, "eval_runtime": 81.5272, "eval_samples_per_second": 4.293, "eval_steps_per_second": 2.147, "step": 1980 }, { "epoch": 2.3950617283950617, "grad_norm": 0.09341799467802048, "learning_rate": 2.3898037373409276e-05, "loss": 0.1173, "step": 1990 }, { "epoch": 2.407106293285155, "grad_norm": 0.08933733403682709, "learning_rate": 2.2996908381334736e-05, "loss": 0.1148, "step": 2000 }, { "epoch": 2.4191508581752483, "grad_norm": 0.08493391424417496, "learning_rate": 2.211088513813754e-05, "loss": 0.1121, "step": 2010 }, { "epoch": 2.4191508581752483, "eval_loss": 0.12171091139316559, "eval_runtime": 81.5846, "eval_samples_per_second": 4.29, "eval_steps_per_second": 2.145, "step": 2010 }, { "epoch": 2.4311954230653416, "grad_norm": 0.1086319163441658, "learning_rate": 2.1240141455600116e-05, "loss": 0.1145, "step": 2020 }, { "epoch": 2.443239987955435, "grad_norm": 0.09176024794578552, "learning_rate": 2.0384848148103196e-05, "loss": 0.1092, "step": 2030 }, { "epoch": 2.4552845528455283, "grad_norm": 0.1033325344324112, "learning_rate": 1.9545172999116812e-05, "loss": 0.1055, "step": 2040 }, { "epoch": 2.4552845528455283, "eval_loss": 0.12144716829061508, "eval_runtime": 81.758, "eval_samples_per_second": 4.281, "eval_steps_per_second": 2.14, "step": 2040 }, { "epoch": 2.4673291177356216, "grad_norm": 0.07238776981830597, "learning_rate": 1.872128072828634e-05, "loss": 0.1105, "step": 2050 }, { "epoch": 2.4793736826257153, "grad_norm": 0.06941673159599304, "learning_rate": 1.791333295911909e-05, "loss": 0.1118, "step": 2060 }, { "epoch": 2.4914182475158086, "grad_norm": 0.08462639153003693, "learning_rate": 1.7121488187278713e-05, "loss": 0.1082, "step": 2070 }, { "epoch": 2.4914182475158086, "eval_loss": 0.12135373055934906, "eval_runtime": 81.6101, "eval_samples_per_second": 4.289, "eval_steps_per_second": 2.144, "step": 2070 }, { "epoch": 2.503462812405902, "grad_norm": 0.09350676834583282, "learning_rate": 1.6345901749492887e-05, "loss": 0.1121, "step": 2080 }, { "epoch": 2.5155073772959953, "grad_norm": 0.08438068628311157, "learning_rate": 1.5586725793080814e-05, "loss": 0.1146, "step": 2090 }, { "epoch": 2.5275519421860886, "grad_norm": 0.10219820588827133, "learning_rate": 1.484410924610642e-05, "loss": 0.1179, "step": 2100 }, { "epoch": 2.5275519421860886, "eval_loss": 0.12126310169696808, "eval_runtime": 81.4438, "eval_samples_per_second": 4.297, "eval_steps_per_second": 2.149, "step": 2100 }, { "epoch": 2.539596507076182, "grad_norm": 0.0966809093952179, "learning_rate": 1.4118197788163056e-05, "loss": 0.1186, "step": 2110 }, { "epoch": 2.551641071966275, "grad_norm": 0.1106065958738327, "learning_rate": 1.3409133821795306e-05, "loss": 0.1112, "step": 2120 }, { "epoch": 2.5636856368563685, "grad_norm": 0.09320900589227676, "learning_rate": 1.2717056444563957e-05, "loss": 0.1148, "step": 2130 }, { "epoch": 2.5636856368563685, "eval_loss": 0.12110390514135361, "eval_runtime": 81.8244, "eval_samples_per_second": 4.277, "eval_steps_per_second": 2.139, "step": 2130 }, { "epoch": 2.575730201746462, "grad_norm": 0.09060157835483551, "learning_rate": 1.2042101421758955e-05, "loss": 0.1219, "step": 2140 }, { "epoch": 2.587774766636555, "grad_norm": 0.09531334787607193, "learning_rate": 1.1384401159766433e-05, "loss": 0.1136, "step": 2150 }, { "epoch": 2.5998193315266485, "grad_norm": 0.07259315997362137, "learning_rate": 1.0744084680094246e-05, "loss": 0.1062, "step": 2160 }, { "epoch": 2.5998193315266485, "eval_loss": 0.12105338275432587, "eval_runtime": 81.5762, "eval_samples_per_second": 4.29, "eval_steps_per_second": 2.145, "step": 2160 }, { "epoch": 2.611863896416742, "grad_norm": 0.07185359299182892, "learning_rate": 1.0121277594061939e-05, "loss": 0.1184, "step": 2170 }, { "epoch": 2.6239084613068355, "grad_norm": 0.0885002538561821, "learning_rate": 9.516102078159317e-06, "loss": 0.1243, "step": 2180 }, { "epoch": 2.635953026196929, "grad_norm": 0.10129860043525696, "learning_rate": 8.928676850079133e-06, "loss": 0.1249, "step": 2190 }, { "epoch": 2.635953026196929, "eval_loss": 0.1209418997168541, "eval_runtime": 81.4691, "eval_samples_per_second": 4.296, "eval_steps_per_second": 2.148, "step": 2190 }, { "epoch": 2.647997591087022, "grad_norm": 0.08937793970108032, "learning_rate": 8.359117145428053e-06, "loss": 0.1188, "step": 2200 }, { "epoch": 2.6600421559771155, "grad_norm": 0.08618754148483276, "learning_rate": 7.807534695120911e-06, "loss": 0.1193, "step": 2210 }, { "epoch": 2.6720867208672088, "grad_norm": 0.0760350152850151, "learning_rate": 7.274037703462244e-06, "loss": 0.107, "step": 2220 }, { "epoch": 2.6720867208672088, "eval_loss": 0.12113867700099945, "eval_runtime": 81.609, "eval_samples_per_second": 4.289, "eval_steps_per_second": 2.144, "step": 2220 }, { "epoch": 2.684131285757302, "grad_norm": 0.08701752126216888, "learning_rate": 6.7587308269199786e-06, "loss": 0.1042, "step": 2230 }, { "epoch": 2.6961758506473954, "grad_norm": 0.08244354277849197, "learning_rate": 6.261715153594627e-06, "loss": 0.1156, "step": 2240 }, { "epoch": 2.7082204155374887, "grad_norm": 0.14877063035964966, "learning_rate": 5.783088183389062e-06, "loss": 0.1076, "step": 2250 }, { "epoch": 2.7082204155374887, "eval_loss": 0.12096786499023438, "eval_runtime": 81.6046, "eval_samples_per_second": 4.289, "eval_steps_per_second": 2.144, "step": 2250 }, { "epoch": 2.720264980427582, "grad_norm": 0.09160082787275314, "learning_rate": 5.322943808881675e-06, "loss": 0.1188, "step": 2260 }, { "epoch": 2.7323095453176753, "grad_norm": 0.08697830885648727, "learning_rate": 4.881372296907516e-06, "loss": 0.1082, "step": 2270 }, { "epoch": 2.7443541102077686, "grad_norm": 0.09075489640235901, "learning_rate": 4.4584602708505285e-06, "loss": 0.118, "step": 2280 }, { "epoch": 2.7443541102077686, "eval_loss": 0.12092573195695877, "eval_runtime": 81.6503, "eval_samples_per_second": 4.287, "eval_steps_per_second": 2.143, "step": 2280 }, { "epoch": 2.756398675097862, "grad_norm": 0.0934319868683815, "learning_rate": 4.054290693650642e-06, "loss": 0.1091, "step": 2290 }, { "epoch": 2.7684432399879553, "grad_norm": 0.08008915930986404, "learning_rate": 3.6689428515288004e-06, "loss": 0.1103, "step": 2300 }, { "epoch": 2.7804878048780486, "grad_norm": 0.07896912097930908, "learning_rate": 3.3024923384334163e-06, "loss": 0.1087, "step": 2310 }, { "epoch": 2.7804878048780486, "eval_loss": 0.12093985080718994, "eval_runtime": 81.4605, "eval_samples_per_second": 4.297, "eval_steps_per_second": 2.148, "step": 2310 }, { "epoch": 2.792532369768142, "grad_norm": 0.08136588335037231, "learning_rate": 2.9550110412109534e-06, "loss": 0.1106, "step": 2320 }, { "epoch": 2.804576934658235, "grad_norm": 0.07197284698486328, "learning_rate": 2.6265671255039537e-06, "loss": 0.1265, "step": 2330 }, { "epoch": 2.816621499548329, "grad_norm": 0.09639979153871536, "learning_rate": 2.3172250223787994e-06, "loss": 0.1168, "step": 2340 }, { "epoch": 2.816621499548329, "eval_loss": 0.12088128179311752, "eval_runtime": 81.8851, "eval_samples_per_second": 4.274, "eval_steps_per_second": 2.137, "step": 2340 }, { "epoch": 2.8286660644384223, "grad_norm": 0.09752853214740753, "learning_rate": 2.0270454156863905e-06, "loss": 0.1209, "step": 2350 }, { "epoch": 2.8407106293285156, "grad_norm": 0.08983393758535385, "learning_rate": 1.7560852301575892e-06, "loss": 0.1185, "step": 2360 }, { "epoch": 2.852755194218609, "grad_norm": 0.09476975351572037, "learning_rate": 1.5043976202363641e-06, "loss": 0.1172, "step": 2370 }, { "epoch": 2.852755194218609, "eval_loss": 0.1208338588476181, "eval_runtime": 81.5048, "eval_samples_per_second": 4.294, "eval_steps_per_second": 2.147, "step": 2370 }, { "epoch": 2.864799759108702, "grad_norm": 0.06642317026853561, "learning_rate": 1.2720319596523977e-06, "loss": 0.116, "step": 2380 }, { "epoch": 2.8768443239987955, "grad_norm": 0.10626640915870667, "learning_rate": 1.0590338317354454e-06, "loss": 0.112, "step": 2390 }, { "epoch": 2.888888888888889, "grad_norm": 0.07917091995477676, "learning_rate": 8.654450204731768e-07, "loss": 0.1092, "step": 2400 }, { "epoch": 2.888888888888889, "eval_loss": 0.12085919827222824, "eval_runtime": 81.5821, "eval_samples_per_second": 4.29, "eval_steps_per_second": 2.145, "step": 2400 } ], "logging_steps": 10, "max_steps": 2493, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.360791333022802e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }