| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.888888888888889, | |
| "eval_steps": 30, | |
| "global_step": 2400, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.012044564890093345, | |
| "grad_norm": 11.71904468536377, | |
| "learning_rate": 7.2e-06, | |
| "loss": 2.9594, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.02408912978018669, | |
| "grad_norm": 2.4383621215820312, | |
| "learning_rate": 1.52e-05, | |
| "loss": 1.5501, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.036133694670280034, | |
| "grad_norm": 0.7187284231185913, | |
| "learning_rate": 2.32e-05, | |
| "loss": 0.7478, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.036133694670280034, | |
| "eval_loss": 0.6954202651977539, | |
| "eval_runtime": 81.243, | |
| "eval_samples_per_second": 4.308, | |
| "eval_steps_per_second": 2.154, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.04817825956037338, | |
| "grad_norm": 0.9893588423728943, | |
| "learning_rate": 3.12e-05, | |
| "loss": 0.6087, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.06022282445046673, | |
| "grad_norm": 0.6181250214576721, | |
| "learning_rate": 3.9200000000000004e-05, | |
| "loss": 0.4514, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.07226738934056007, | |
| "grad_norm": 0.5894028544425964, | |
| "learning_rate": 4.72e-05, | |
| "loss": 0.3809, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.07226738934056007, | |
| "eval_loss": 0.35219648480415344, | |
| "eval_runtime": 80.7477, | |
| "eval_samples_per_second": 4.334, | |
| "eval_steps_per_second": 2.167, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.08431195423065342, | |
| "grad_norm": 3.429518222808838, | |
| "learning_rate": 5.520000000000001e-05, | |
| "loss": 0.2992, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.09635651912074676, | |
| "grad_norm": 0.6878916025161743, | |
| "learning_rate": 6.32e-05, | |
| "loss": 0.2921, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.10840108401084012, | |
| "grad_norm": 0.5862188339233398, | |
| "learning_rate": 7.12e-05, | |
| "loss": 0.2483, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.10840108401084012, | |
| "eval_loss": 0.22866909205913544, | |
| "eval_runtime": 81.2164, | |
| "eval_samples_per_second": 4.309, | |
| "eval_steps_per_second": 2.155, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.12044564890093346, | |
| "grad_norm": 0.7386724948883057, | |
| "learning_rate": 7.920000000000001e-05, | |
| "loss": 0.2107, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.1324902137910268, | |
| "grad_norm": 0.6922011375427246, | |
| "learning_rate": 8.72e-05, | |
| "loss": 0.1962, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.14453477868112014, | |
| "grad_norm": 0.4779876470565796, | |
| "learning_rate": 9.52e-05, | |
| "loss": 0.1741, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.14453477868112014, | |
| "eval_loss": 0.18707558512687683, | |
| "eval_runtime": 81.1016, | |
| "eval_samples_per_second": 4.316, | |
| "eval_steps_per_second": 2.158, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.1565793435712135, | |
| "grad_norm": 0.3711669445037842, | |
| "learning_rate": 0.0001032, | |
| "loss": 0.1775, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.16862390846130684, | |
| "grad_norm": 0.5507099032402039, | |
| "learning_rate": 0.00011120000000000002, | |
| "loss": 0.1663, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.18066847335140018, | |
| "grad_norm": 0.6897018551826477, | |
| "learning_rate": 0.0001192, | |
| "loss": 0.162, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.18066847335140018, | |
| "eval_loss": 0.1689070761203766, | |
| "eval_runtime": 81.1763, | |
| "eval_samples_per_second": 4.312, | |
| "eval_steps_per_second": 2.156, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.19271303824149352, | |
| "grad_norm": 0.30447226762771606, | |
| "learning_rate": 0.0001272, | |
| "loss": 0.1654, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.20475760313158686, | |
| "grad_norm": 0.36739301681518555, | |
| "learning_rate": 0.0001352, | |
| "loss": 0.1692, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.21680216802168023, | |
| "grad_norm": 0.42459923028945923, | |
| "learning_rate": 0.0001432, | |
| "loss": 0.1479, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.21680216802168023, | |
| "eval_loss": 0.16287875175476074, | |
| "eval_runtime": 81.3447, | |
| "eval_samples_per_second": 4.303, | |
| "eval_steps_per_second": 2.151, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.22884673291177357, | |
| "grad_norm": 0.24459399282932281, | |
| "learning_rate": 0.00015120000000000002, | |
| "loss": 0.1567, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.2408912978018669, | |
| "grad_norm": 0.29077818989753723, | |
| "learning_rate": 0.00015920000000000002, | |
| "loss": 0.1491, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.2529358626919603, | |
| "grad_norm": 0.3934674561023712, | |
| "learning_rate": 0.0001672, | |
| "loss": 0.1587, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.2529358626919603, | |
| "eval_loss": 0.1568612903356552, | |
| "eval_runtime": 80.9345, | |
| "eval_samples_per_second": 4.324, | |
| "eval_steps_per_second": 2.162, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.2649804275820536, | |
| "grad_norm": 0.31920939683914185, | |
| "learning_rate": 0.0001752, | |
| "loss": 0.1622, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.27702499247214696, | |
| "grad_norm": 0.5031167268753052, | |
| "learning_rate": 0.0001832, | |
| "loss": 0.147, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.28906955736224027, | |
| "grad_norm": 0.2890608310699463, | |
| "learning_rate": 0.0001912, | |
| "loss": 0.1433, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.28906955736224027, | |
| "eval_loss": 0.15362557768821716, | |
| "eval_runtime": 80.9234, | |
| "eval_samples_per_second": 4.325, | |
| "eval_steps_per_second": 2.163, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.30111412225233364, | |
| "grad_norm": 0.2849373519420624, | |
| "learning_rate": 0.00019920000000000002, | |
| "loss": 0.1471, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.313158687142427, | |
| "grad_norm": 0.263950914144516, | |
| "learning_rate": 0.0001999920550627653, | |
| "loss": 0.155, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.3252032520325203, | |
| "grad_norm": 0.31182682514190674, | |
| "learning_rate": 0.00019996459270297992, | |
| "loss": 0.138, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.3252032520325203, | |
| "eval_loss": 0.15008553862571716, | |
| "eval_runtime": 80.9483, | |
| "eval_samples_per_second": 4.324, | |
| "eval_steps_per_second": 2.162, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.3372478169226137, | |
| "grad_norm": 0.41790467500686646, | |
| "learning_rate": 0.00019991752022102606, | |
| "loss": 0.1472, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.349292381812707, | |
| "grad_norm": 0.29337936639785767, | |
| "learning_rate": 0.0001998508468511445, | |
| "loss": 0.152, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.36133694670280037, | |
| "grad_norm": 0.1533428430557251, | |
| "learning_rate": 0.0001997645856726956, | |
| "loss": 0.1427, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.36133694670280037, | |
| "eval_loss": 0.14941252768039703, | |
| "eval_runtime": 81.0588, | |
| "eval_samples_per_second": 4.318, | |
| "eval_steps_per_second": 2.159, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.37338151159289373, | |
| "grad_norm": 0.3626411557197571, | |
| "learning_rate": 0.0001996587536075934, | |
| "loss": 0.1539, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.38542607648298705, | |
| "grad_norm": 0.3157321810722351, | |
| "learning_rate": 0.00019953337141698617, | |
| "loss": 0.1457, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.3974706413730804, | |
| "grad_norm": 0.18949578702449799, | |
| "learning_rate": 0.00019938846369718348, | |
| "loss": 0.1471, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.3974706413730804, | |
| "eval_loss": 0.14507745206356049, | |
| "eval_runtime": 81.0028, | |
| "eval_samples_per_second": 4.321, | |
| "eval_steps_per_second": 2.16, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.4095152062631737, | |
| "grad_norm": 0.2189997136592865, | |
| "learning_rate": 0.0001992240588748314, | |
| "loss": 0.1423, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.4215597711532671, | |
| "grad_norm": 0.2674465775489807, | |
| "learning_rate": 0.00019904018920133574, | |
| "loss": 0.1397, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.43360433604336046, | |
| "grad_norm": 0.3467840552330017, | |
| "learning_rate": 0.00019883689074653548, | |
| "loss": 0.1486, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.43360433604336046, | |
| "eval_loss": 0.1380164623260498, | |
| "eval_runtime": 81.2554, | |
| "eval_samples_per_second": 4.307, | |
| "eval_steps_per_second": 2.154, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.4456489009334538, | |
| "grad_norm": 0.22817489504814148, | |
| "learning_rate": 0.00019861420339162682, | |
| "loss": 0.1411, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.45769346582354714, | |
| "grad_norm": 0.1939237415790558, | |
| "learning_rate": 0.0001983721708213397, | |
| "loss": 0.1337, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.46973803071364045, | |
| "grad_norm": 0.20223841071128845, | |
| "learning_rate": 0.00019811084051536812, | |
| "loss": 0.147, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.46973803071364045, | |
| "eval_loss": 0.13835138082504272, | |
| "eval_runtime": 81.2083, | |
| "eval_samples_per_second": 4.31, | |
| "eval_steps_per_second": 2.155, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.4817825956037338, | |
| "grad_norm": 0.16610513627529144, | |
| "learning_rate": 0.00019783026373905603, | |
| "loss": 0.1261, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.49382716049382713, | |
| "grad_norm": 0.13892178237438202, | |
| "learning_rate": 0.0001975304955333405, | |
| "loss": 0.1303, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.5058717253839206, | |
| "grad_norm": 0.17607590556144714, | |
| "learning_rate": 0.00019721159470395446, | |
| "loss": 0.1408, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.5058717253839206, | |
| "eval_loss": 0.13557562232017517, | |
| "eval_runtime": 81.2501, | |
| "eval_samples_per_second": 4.308, | |
| "eval_steps_per_second": 2.154, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.5179162902740139, | |
| "grad_norm": 0.2578093409538269, | |
| "learning_rate": 0.0001968736238098906, | |
| "loss": 0.1328, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.5299608551641072, | |
| "grad_norm": 0.5826597213745117, | |
| "learning_rate": 0.00019651664915112913, | |
| "loss": 0.1379, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.5420054200542005, | |
| "grad_norm": 0.2640504240989685, | |
| "learning_rate": 0.00019614074075563178, | |
| "loss": 0.1328, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.5420054200542005, | |
| "eval_loss": 0.13704629242420197, | |
| "eval_runtime": 81.2461, | |
| "eval_samples_per_second": 4.308, | |
| "eval_steps_per_second": 2.154, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.5540499849442939, | |
| "grad_norm": 0.25430986285209656, | |
| "learning_rate": 0.00019574597236560428, | |
| "loss": 0.1295, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.5660945498343872, | |
| "grad_norm": 0.2172861099243164, | |
| "learning_rate": 0.00019533242142303028, | |
| "loss": 0.1274, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.5781391147244805, | |
| "grad_norm": 0.18936924636363983, | |
| "learning_rate": 0.00019490016905447958, | |
| "loss": 0.131, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.5781391147244805, | |
| "eval_loss": 0.13753947615623474, | |
| "eval_runtime": 81.0949, | |
| "eval_samples_per_second": 4.316, | |
| "eval_steps_per_second": 2.158, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.590183679614574, | |
| "grad_norm": 0.26435989141464233, | |
| "learning_rate": 0.00019444930005519347, | |
| "loss": 0.132, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.6022282445046673, | |
| "grad_norm": 0.23327338695526123, | |
| "learning_rate": 0.00019397990287245027, | |
| "loss": 0.1417, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.6142728093947606, | |
| "grad_norm": 0.14256203174591064, | |
| "learning_rate": 0.00019349206958821474, | |
| "loss": 0.1392, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.6142728093947606, | |
| "eval_loss": 0.13641956448554993, | |
| "eval_runtime": 81.0841, | |
| "eval_samples_per_second": 4.317, | |
| "eval_steps_per_second": 2.158, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.626317374284854, | |
| "grad_norm": 0.16011199355125427, | |
| "learning_rate": 0.00019298589590107415, | |
| "loss": 0.1406, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.6383619391749473, | |
| "grad_norm": 0.1269129067659378, | |
| "learning_rate": 0.00019246148110746515, | |
| "loss": 0.1346, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.6504065040650406, | |
| "grad_norm": 0.13770046830177307, | |
| "learning_rate": 0.0001919189280821946, | |
| "loss": 0.1315, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.6504065040650406, | |
| "eval_loss": 0.1363556832075119, | |
| "eval_runtime": 81.2186, | |
| "eval_samples_per_second": 4.309, | |
| "eval_steps_per_second": 2.155, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.662451068955134, | |
| "grad_norm": 0.18796966969966888, | |
| "learning_rate": 0.00019135834325825868, | |
| "loss": 0.13, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.6744956338452274, | |
| "grad_norm": 0.12326055020093918, | |
| "learning_rate": 0.00019077983660596365, | |
| "loss": 0.1339, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.6865401987353207, | |
| "grad_norm": 0.1480596363544464, | |
| "learning_rate": 0.00019018352161135317, | |
| "loss": 0.1296, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.6865401987353207, | |
| "eval_loss": 0.13280269503593445, | |
| "eval_runtime": 81.3546, | |
| "eval_samples_per_second": 4.302, | |
| "eval_steps_per_second": 2.151, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.698584763625414, | |
| "grad_norm": 0.15606318414211273, | |
| "learning_rate": 0.0001895695152539455, | |
| "loss": 0.1296, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.7106293285155074, | |
| "grad_norm": 0.09551403671503067, | |
| "learning_rate": 0.00018893793798378553, | |
| "loss": 0.1323, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.7226738934056007, | |
| "grad_norm": 0.11749322712421417, | |
| "learning_rate": 0.00018828891369781605, | |
| "loss": 0.1179, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.7226738934056007, | |
| "eval_loss": 0.12917345762252808, | |
| "eval_runtime": 81.4843, | |
| "eval_samples_per_second": 4.295, | |
| "eval_steps_per_second": 2.148, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.734718458295694, | |
| "grad_norm": 0.11540284752845764, | |
| "learning_rate": 0.0001876225697155729, | |
| "loss": 0.1227, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.7467630231857875, | |
| "grad_norm": 0.3190910518169403, | |
| "learning_rate": 0.00018693903675420846, | |
| "loss": 0.1281, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.7588075880758808, | |
| "grad_norm": 0.11964758485555649, | |
| "learning_rate": 0.00018623844890284884, | |
| "loss": 0.1356, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.7588075880758808, | |
| "eval_loss": 0.12936915457248688, | |
| "eval_runtime": 81.4409, | |
| "eval_samples_per_second": 4.298, | |
| "eval_steps_per_second": 2.149, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.7708521529659741, | |
| "grad_norm": 0.10809649527072906, | |
| "learning_rate": 0.00018552094359628956, | |
| "loss": 0.1337, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.7828967178560674, | |
| "grad_norm": 0.13179966807365417, | |
| "learning_rate": 0.00018478666158803475, | |
| "loss": 0.1213, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.7949412827461608, | |
| "grad_norm": 0.13050246238708496, | |
| "learning_rate": 0.00018403574692268566, | |
| "loss": 0.1274, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.7949412827461608, | |
| "eval_loss": 0.129548579454422, | |
| "eval_runtime": 81.3854, | |
| "eval_samples_per_second": 4.301, | |
| "eval_steps_per_second": 2.15, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.8069858476362541, | |
| "grad_norm": 0.10548313707113266, | |
| "learning_rate": 0.00018326834690768308, | |
| "loss": 0.1266, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.8190304125263475, | |
| "grad_norm": 0.11163028329610825, | |
| "learning_rate": 0.00018248461208441016, | |
| "loss": 0.1275, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.8310749774164409, | |
| "grad_norm": 0.10246960818767548, | |
| "learning_rate": 0.00018168469619866037, | |
| "loss": 0.1262, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.8310749774164409, | |
| "eval_loss": 0.12998254597187042, | |
| "eval_runtime": 81.2607, | |
| "eval_samples_per_second": 4.307, | |
| "eval_steps_per_second": 2.154, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.8431195423065342, | |
| "grad_norm": 0.16775400936603546, | |
| "learning_rate": 0.00018086875617047738, | |
| "loss": 0.128, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.8551641071966275, | |
| "grad_norm": 0.09823903441429138, | |
| "learning_rate": 0.00018003695206337164, | |
| "loss": 0.1186, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.8672086720867209, | |
| "grad_norm": 0.08154378831386566, | |
| "learning_rate": 0.0001791894470529209, | |
| "loss": 0.1304, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.8672086720867209, | |
| "eval_loss": 0.12851889431476593, | |
| "eval_runtime": 81.3179, | |
| "eval_samples_per_second": 4.304, | |
| "eval_steps_per_second": 2.152, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.8792532369768142, | |
| "grad_norm": 0.13566914200782776, | |
| "learning_rate": 0.00017832640739475964, | |
| "loss": 0.1256, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.8912978018669075, | |
| "grad_norm": 0.21967343986034393, | |
| "learning_rate": 0.00017744800239196485, | |
| "loss": 0.1049, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.9033423667570009, | |
| "grad_norm": 0.12291447818279266, | |
| "learning_rate": 0.00017655440436184361, | |
| "loss": 0.1352, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.9033423667570009, | |
| "eval_loss": 0.12859711050987244, | |
| "eval_runtime": 81.5112, | |
| "eval_samples_per_second": 4.294, | |
| "eval_steps_per_second": 2.147, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.9153869316470943, | |
| "grad_norm": 0.23631049692630768, | |
| "learning_rate": 0.00017564578860212952, | |
| "loss": 0.1236, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.9274314965371876, | |
| "grad_norm": 0.13145385682582855, | |
| "learning_rate": 0.00017472233335659443, | |
| "loss": 0.1235, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.9394760614272809, | |
| "grad_norm": 0.0923519879579544, | |
| "learning_rate": 0.00017378421978008212, | |
| "loss": 0.1224, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.9394760614272809, | |
| "eval_loss": 0.12813878059387207, | |
| "eval_runtime": 81.2514, | |
| "eval_samples_per_second": 4.308, | |
| "eval_steps_per_second": 2.154, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.9515206263173743, | |
| "grad_norm": 0.2052728235721588, | |
| "learning_rate": 0.0001728316319029713, | |
| "loss": 0.1189, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.9635651912074676, | |
| "grad_norm": 0.19939783215522766, | |
| "learning_rate": 0.0001718647565950739, | |
| "loss": 0.128, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.975609756097561, | |
| "grad_norm": 0.11448610574007034, | |
| "learning_rate": 0.00017088378352897703, | |
| "loss": 0.1103, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.975609756097561, | |
| "eval_loss": 0.12625984847545624, | |
| "eval_runtime": 81.2673, | |
| "eval_samples_per_second": 4.307, | |
| "eval_steps_per_second": 2.153, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.9876543209876543, | |
| "grad_norm": 0.11135770380496979, | |
| "learning_rate": 0.00016988890514283447, | |
| "loss": 0.1281, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.9996988858777477, | |
| "grad_norm": 0.11028297245502472, | |
| "learning_rate": 0.00016888031660261622, | |
| "loss": 0.1225, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.010840108401084, | |
| "grad_norm": 0.11298307776451111, | |
| "learning_rate": 0.00016785821576382245, | |
| "loss": 0.1188, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.010840108401084, | |
| "eval_loss": 0.1276378333568573, | |
| "eval_runtime": 81.1811, | |
| "eval_samples_per_second": 4.311, | |
| "eval_steps_per_second": 2.156, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.0228846732911774, | |
| "grad_norm": 0.09120020270347595, | |
| "learning_rate": 0.0001668228031326702, | |
| "loss": 0.1169, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.0349292381812707, | |
| "grad_norm": 0.11060494184494019, | |
| "learning_rate": 0.00016577428182675973, | |
| "loss": 0.1243, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.046973803071364, | |
| "grad_norm": 0.09636874496936798, | |
| "learning_rate": 0.0001647128575352292, | |
| "loss": 0.1187, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.046973803071364, | |
| "eval_loss": 0.12680906057357788, | |
| "eval_runtime": 81.1164, | |
| "eval_samples_per_second": 4.315, | |
| "eval_steps_per_second": 2.157, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.0590183679614573, | |
| "grad_norm": 0.10281568765640259, | |
| "learning_rate": 0.000163638738478404, | |
| "loss": 0.133, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.0710629328515506, | |
| "grad_norm": 0.08682233840227127, | |
| "learning_rate": 0.0001625521353669504, | |
| "loss": 0.1171, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.0831074977416442, | |
| "grad_norm": 0.1339954435825348, | |
| "learning_rate": 0.00016145326136054008, | |
| "loss": 0.1257, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.0831074977416442, | |
| "eval_loss": 0.12813013792037964, | |
| "eval_runtime": 81.4418, | |
| "eval_samples_per_second": 4.298, | |
| "eval_steps_per_second": 2.149, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.0951520626317375, | |
| "grad_norm": 0.09956265985965729, | |
| "learning_rate": 0.00016034233202603463, | |
| "loss": 0.1287, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.1071966275218308, | |
| "grad_norm": 0.07889163494110107, | |
| "learning_rate": 0.00015921956529519747, | |
| "loss": 0.1135, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.119241192411924, | |
| "grad_norm": 0.07798007130622864, | |
| "learning_rate": 0.00015808518142194214, | |
| "loss": 0.1128, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.119241192411924, | |
| "eval_loss": 0.12662966549396515, | |
| "eval_runtime": 81.2769, | |
| "eval_samples_per_second": 4.306, | |
| "eval_steps_per_second": 2.153, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.1312857573020174, | |
| "grad_norm": 0.09110717475414276, | |
| "learning_rate": 0.00015693940293912492, | |
| "loss": 0.1161, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.1433303221921107, | |
| "grad_norm": 0.10704551637172699, | |
| "learning_rate": 0.00015578245461489042, | |
| "loss": 0.1237, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.1553748870822043, | |
| "grad_norm": 0.12007839977741241, | |
| "learning_rate": 0.00015461456340857857, | |
| "loss": 0.1246, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.1553748870822043, | |
| "eval_loss": 0.12597930431365967, | |
| "eval_runtime": 81.2379, | |
| "eval_samples_per_second": 4.308, | |
| "eval_steps_per_second": 2.154, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.1674194519722976, | |
| "grad_norm": 0.09195715934038162, | |
| "learning_rate": 0.00015343595842620198, | |
| "loss": 0.1179, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.1794640168623909, | |
| "grad_norm": 0.11327285319566727, | |
| "learning_rate": 0.000152246870875502, | |
| "loss": 0.1219, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.1915085817524842, | |
| "grad_norm": 0.12147443741559982, | |
| "learning_rate": 0.00015104753402059252, | |
| "loss": 0.1198, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.1915085817524842, | |
| "eval_loss": 0.12678198516368866, | |
| "eval_runtime": 81.2272, | |
| "eval_samples_per_second": 4.309, | |
| "eval_steps_per_second": 2.154, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.2035531466425775, | |
| "grad_norm": 0.12530316412448883, | |
| "learning_rate": 0.00014983818313620047, | |
| "loss": 0.1215, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.2155977115326708, | |
| "grad_norm": 0.21503940224647522, | |
| "learning_rate": 0.00014861905546151164, | |
| "loss": 0.132, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.2276422764227641, | |
| "grad_norm": 0.09851568937301636, | |
| "learning_rate": 0.00014739039015363155, | |
| "loss": 0.1255, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.2276422764227641, | |
| "eval_loss": 0.1265084594488144, | |
| "eval_runtime": 81.2978, | |
| "eval_samples_per_second": 4.305, | |
| "eval_steps_per_second": 2.153, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.2396868413128577, | |
| "grad_norm": 0.09794170409440994, | |
| "learning_rate": 0.0001461524282406696, | |
| "loss": 0.1203, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.251731406202951, | |
| "grad_norm": 0.09283249080181122, | |
| "learning_rate": 0.00014490541257445664, | |
| "loss": 0.1241, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.2637759710930443, | |
| "grad_norm": 0.10757778584957123, | |
| "learning_rate": 0.00014364958778290436, | |
| "loss": 0.1263, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.2637759710930443, | |
| "eval_loss": 0.1275068074464798, | |
| "eval_runtime": 81.3037, | |
| "eval_samples_per_second": 4.305, | |
| "eval_steps_per_second": 2.152, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.2758205359831376, | |
| "grad_norm": 0.08741755038499832, | |
| "learning_rate": 0.00014238520022201665, | |
| "loss": 0.1279, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.287865100873231, | |
| "grad_norm": 0.09674480557441711, | |
| "learning_rate": 0.00014111249792756164, | |
| "loss": 0.1203, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.2999096657633242, | |
| "grad_norm": 0.18980656564235687, | |
| "learning_rate": 0.00013983173056641437, | |
| "loss": 0.1273, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.2999096657633242, | |
| "eval_loss": 0.12540876865386963, | |
| "eval_runtime": 81.3684, | |
| "eval_samples_per_second": 4.301, | |
| "eval_steps_per_second": 2.151, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.3119542306534178, | |
| "grad_norm": 0.11260770261287689, | |
| "learning_rate": 0.00013854314938757954, | |
| "loss": 0.1209, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.323998795543511, | |
| "grad_norm": 0.08883814513683319, | |
| "learning_rate": 0.00013724700717290385, | |
| "loss": 0.1307, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.3360433604336044, | |
| "grad_norm": 0.08555177599191666, | |
| "learning_rate": 0.0001359435581874874, | |
| "loss": 0.1187, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.3360433604336044, | |
| "eval_loss": 0.1246650293469429, | |
| "eval_runtime": 81.3179, | |
| "eval_samples_per_second": 4.304, | |
| "eval_steps_per_second": 2.152, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.3480879253236977, | |
| "grad_norm": 0.09265279769897461, | |
| "learning_rate": 0.0001346330581298046, | |
| "loss": 0.1237, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.360132490213791, | |
| "grad_norm": 0.07397205382585526, | |
| "learning_rate": 0.0001333157640815434, | |
| "loss": 0.1194, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.3721770551038843, | |
| "grad_norm": 0.10523436218500137, | |
| "learning_rate": 0.00013199193445717362, | |
| "loss": 0.1215, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.3721770551038843, | |
| "eval_loss": 0.12532441318035126, | |
| "eval_runtime": 81.5003, | |
| "eval_samples_per_second": 4.294, | |
| "eval_steps_per_second": 2.147, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.3842216199939776, | |
| "grad_norm": 0.08104018867015839, | |
| "learning_rate": 0.00013066182895325339, | |
| "loss": 0.1276, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.396266184884071, | |
| "grad_norm": 0.1049809381365776, | |
| "learning_rate": 0.00012932570849748446, | |
| "loss": 0.128, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.4083107497741645, | |
| "grad_norm": 0.10500895977020264, | |
| "learning_rate": 0.00012798383519752577, | |
| "loss": 0.1179, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.4083107497741645, | |
| "eval_loss": 0.12398175895214081, | |
| "eval_runtime": 81.4012, | |
| "eval_samples_per_second": 4.3, | |
| "eval_steps_per_second": 2.15, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.4203553146642578, | |
| "grad_norm": 0.08427383005619049, | |
| "learning_rate": 0.00012663647228957562, | |
| "loss": 0.1145, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.432399879554351, | |
| "grad_norm": 0.08236031979322433, | |
| "learning_rate": 0.0001252838840867324, | |
| "loss": 0.1216, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.4444444444444444, | |
| "grad_norm": 0.08572593331336975, | |
| "learning_rate": 0.00012392633592714423, | |
| "loss": 0.128, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.4444444444444444, | |
| "eval_loss": 0.1240333542227745, | |
| "eval_runtime": 81.3808, | |
| "eval_samples_per_second": 4.301, | |
| "eval_steps_per_second": 2.15, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.4564890093345377, | |
| "grad_norm": 0.08900213986635208, | |
| "learning_rate": 0.00012256409412195727, | |
| "loss": 0.1171, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.4685335742246313, | |
| "grad_norm": 0.09740438312292099, | |
| "learning_rate": 0.0001211974259030733, | |
| "loss": 0.1316, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.4805781391147246, | |
| "grad_norm": 0.07978689670562744, | |
| "learning_rate": 0.00011982659937072677, | |
| "loss": 0.1219, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.4805781391147246, | |
| "eval_loss": 0.12412716448307037, | |
| "eval_runtime": 81.2608, | |
| "eval_samples_per_second": 4.307, | |
| "eval_steps_per_second": 2.154, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.492622704004818, | |
| "grad_norm": 0.08000902086496353, | |
| "learning_rate": 0.00011845188344089126, | |
| "loss": 0.1135, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.5046672688949112, | |
| "grad_norm": 0.07367110252380371, | |
| "learning_rate": 0.00011707354779252612, | |
| "loss": 0.1206, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.5167118337850045, | |
| "grad_norm": 0.07485098391771317, | |
| "learning_rate": 0.00011569186281467335, | |
| "loss": 0.1138, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.5167118337850045, | |
| "eval_loss": 0.1249684989452362, | |
| "eval_runtime": 81.6234, | |
| "eval_samples_per_second": 4.288, | |
| "eval_steps_per_second": 2.144, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.5287563986750978, | |
| "grad_norm": 0.11146491765975952, | |
| "learning_rate": 0.00011430709955341514, | |
| "loss": 0.1272, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.5408009635651911, | |
| "grad_norm": 0.08664289861917496, | |
| "learning_rate": 0.00011291952965870269, | |
| "loss": 0.1147, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.5528455284552845, | |
| "grad_norm": 0.09051596373319626, | |
| "learning_rate": 0.00011152942533106638, | |
| "loss": 0.1161, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.5528455284552845, | |
| "eval_loss": 0.12454230338335037, | |
| "eval_runtime": 81.4318, | |
| "eval_samples_per_second": 4.298, | |
| "eval_steps_per_second": 2.149, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.5648900933453778, | |
| "grad_norm": 0.06956392526626587, | |
| "learning_rate": 0.000110137059268218, | |
| "loss": 0.1077, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.5769346582354713, | |
| "grad_norm": 0.07337108254432678, | |
| "learning_rate": 0.00010874270461155554, | |
| "loss": 0.124, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.5889792231255646, | |
| "grad_norm": 0.08082354068756104, | |
| "learning_rate": 0.0001073466348925807, | |
| "loss": 0.1108, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.5889792231255646, | |
| "eval_loss": 0.12420380860567093, | |
| "eval_runtime": 81.6866, | |
| "eval_samples_per_second": 4.285, | |
| "eval_steps_per_second": 2.142, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.601023788015658, | |
| "grad_norm": 0.07141824066638947, | |
| "learning_rate": 0.00010594912397924018, | |
| "loss": 0.1112, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.6130683529057512, | |
| "grad_norm": 0.09476039558649063, | |
| "learning_rate": 0.00010455044602220076, | |
| "loss": 0.1257, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.6251129177958448, | |
| "grad_norm": 0.06789754331111908, | |
| "learning_rate": 0.00010315087540106894, | |
| "loss": 0.1167, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.6251129177958448, | |
| "eval_loss": 0.12411510944366455, | |
| "eval_runtime": 81.5821, | |
| "eval_samples_per_second": 4.29, | |
| "eval_steps_per_second": 2.145, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.637157482685938, | |
| "grad_norm": 0.08458850532770157, | |
| "learning_rate": 0.00010175068667056578, | |
| "loss": 0.1215, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.6492020475760314, | |
| "grad_norm": 0.0818653479218483, | |
| "learning_rate": 0.00010035015450666723, | |
| "loss": 0.1193, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.6612466124661247, | |
| "grad_norm": 0.060270510613918304, | |
| "learning_rate": 9.894955365272087e-05, | |
| "loss": 0.1094, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.6612466124661247, | |
| "eval_loss": 0.12342710047960281, | |
| "eval_runtime": 81.3723, | |
| "eval_samples_per_second": 4.301, | |
| "eval_steps_per_second": 2.151, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.673291177356218, | |
| "grad_norm": 0.08454828709363937, | |
| "learning_rate": 9.75491588655492e-05, | |
| "loss": 0.1205, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.6853357422463113, | |
| "grad_norm": 0.06156294047832489, | |
| "learning_rate": 9.614924486155047e-05, | |
| "loss": 0.1111, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.6973803071364046, | |
| "grad_norm": 0.08005426079034805, | |
| "learning_rate": 9.475008626280739e-05, | |
| "loss": 0.1077, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.6973803071364046, | |
| "eval_loss": 0.12282679229974747, | |
| "eval_runtime": 81.2841, | |
| "eval_samples_per_second": 4.306, | |
| "eval_steps_per_second": 2.153, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.709424872026498, | |
| "grad_norm": 0.08515966683626175, | |
| "learning_rate": 9.335195754321427e-05, | |
| "loss": 0.1226, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.7214694369165913, | |
| "grad_norm": 0.08117437362670898, | |
| "learning_rate": 9.195513297463339e-05, | |
| "loss": 0.1158, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.7335140018066846, | |
| "grad_norm": 0.0748409777879715, | |
| "learning_rate": 9.055988657309075e-05, | |
| "loss": 0.1152, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.7335140018066846, | |
| "eval_loss": 0.12351047992706299, | |
| "eval_runtime": 81.3284, | |
| "eval_samples_per_second": 4.304, | |
| "eval_steps_per_second": 2.152, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.7455585666967781, | |
| "grad_norm": 0.08821168541908264, | |
| "learning_rate": 8.916649204502231e-05, | |
| "loss": 0.1231, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.7576031315868714, | |
| "grad_norm": 0.06368843466043472, | |
| "learning_rate": 8.777522273358076e-05, | |
| "loss": 0.1144, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.7696476964769647, | |
| "grad_norm": 0.08906359225511551, | |
| "learning_rate": 8.638635156501353e-05, | |
| "loss": 0.1278, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.7696476964769647, | |
| "eval_loss": 0.12412309646606445, | |
| "eval_runtime": 81.4647, | |
| "eval_samples_per_second": 4.296, | |
| "eval_steps_per_second": 2.148, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.7816922613670583, | |
| "grad_norm": 0.13860240578651428, | |
| "learning_rate": 8.500015099512282e-05, | |
| "loss": 0.1135, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.7937368262571516, | |
| "grad_norm": 0.08925935626029968, | |
| "learning_rate": 8.361689295581759e-05, | |
| "loss": 0.1274, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.805781391147245, | |
| "grad_norm": 0.2061154991388321, | |
| "learning_rate": 8.223684880176861e-05, | |
| "loss": 0.1245, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.805781391147245, | |
| "eval_loss": 0.12294992804527283, | |
| "eval_runtime": 81.5386, | |
| "eval_samples_per_second": 4.292, | |
| "eval_steps_per_second": 2.146, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.8178259560373382, | |
| "grad_norm": 0.07016266882419586, | |
| "learning_rate": 8.086028925717661e-05, | |
| "loss": 0.1215, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.8298705209274315, | |
| "grad_norm": 0.19503405690193176, | |
| "learning_rate": 7.948748436266409e-05, | |
| "loss": 0.1169, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.8419150858175248, | |
| "grad_norm": 0.07560829073190689, | |
| "learning_rate": 7.811870342230127e-05, | |
| "loss": 0.1219, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.8419150858175248, | |
| "eval_loss": 0.12353645265102386, | |
| "eval_runtime": 81.5233, | |
| "eval_samples_per_second": 4.293, | |
| "eval_steps_per_second": 2.147, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.8539596507076181, | |
| "grad_norm": 0.06638047844171524, | |
| "learning_rate": 7.675421495077657e-05, | |
| "loss": 0.1174, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.8660042155977115, | |
| "grad_norm": 0.07245375961065292, | |
| "learning_rate": 7.539428662072188e-05, | |
| "loss": 0.1263, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.8780487804878048, | |
| "grad_norm": 0.08080323040485382, | |
| "learning_rate": 7.403918521020305e-05, | |
| "loss": 0.1261, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.8780487804878048, | |
| "eval_loss": 0.12283791601657867, | |
| "eval_runtime": 81.5734, | |
| "eval_samples_per_second": 4.291, | |
| "eval_steps_per_second": 2.145, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.890093345377898, | |
| "grad_norm": 0.06938530504703522, | |
| "learning_rate": 7.268917655038581e-05, | |
| "loss": 0.1167, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.9021379102679916, | |
| "grad_norm": 0.07764075696468353, | |
| "learning_rate": 7.134452547338753e-05, | |
| "loss": 0.1191, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.914182475158085, | |
| "grad_norm": 0.10729371011257172, | |
| "learning_rate": 7.000549576032489e-05, | |
| "loss": 0.1175, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.914182475158085, | |
| "eval_loss": 0.12257985025644302, | |
| "eval_runtime": 81.3844, | |
| "eval_samples_per_second": 4.301, | |
| "eval_steps_per_second": 2.15, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.9262270400481782, | |
| "grad_norm": 0.06268001347780228, | |
| "learning_rate": 6.867235008956783e-05, | |
| "loss": 0.1203, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.9382716049382716, | |
| "grad_norm": 0.059695664793252945, | |
| "learning_rate": 6.734534998520969e-05, | |
| "loss": 0.1147, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.950316169828365, | |
| "grad_norm": 0.07781612873077393, | |
| "learning_rate": 6.602475576576383e-05, | |
| "loss": 0.1191, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.950316169828365, | |
| "eval_loss": 0.1225815936923027, | |
| "eval_runtime": 81.7725, | |
| "eval_samples_per_second": 4.28, | |
| "eval_steps_per_second": 2.14, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.9623607347184584, | |
| "grad_norm": 0.06548433750867844, | |
| "learning_rate": 6.471082649309686e-05, | |
| "loss": 0.1181, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.9744052996085517, | |
| "grad_norm": 0.0688992366194725, | |
| "learning_rate": 6.34038199216082e-05, | |
| "loss": 0.1139, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.986449864498645, | |
| "grad_norm": 0.07544533908367157, | |
| "learning_rate": 6.210399244766632e-05, | |
| "loss": 0.1194, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.986449864498645, | |
| "eval_loss": 0.12259992212057114, | |
| "eval_runtime": 81.5949, | |
| "eval_samples_per_second": 4.289, | |
| "eval_steps_per_second": 2.145, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.9984944293887383, | |
| "grad_norm": 0.13208015263080597, | |
| "learning_rate": 6.0811599059311195e-05, | |
| "loss": 0.1185, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 2.009635651912075, | |
| "grad_norm": 0.08133638650178909, | |
| "learning_rate": 5.952689328623321e-05, | |
| "loss": 0.1264, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.021680216802168, | |
| "grad_norm": 0.08000742644071579, | |
| "learning_rate": 5.8250127150038016e-05, | |
| "loss": 0.1108, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.021680216802168, | |
| "eval_loss": 0.12281496077775955, | |
| "eval_runtime": 81.5447, | |
| "eval_samples_per_second": 4.292, | |
| "eval_steps_per_second": 2.146, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.0337247816922615, | |
| "grad_norm": 0.06525713950395584, | |
| "learning_rate": 5.698155111480722e-05, | |
| "loss": 0.1141, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 2.0457693465823548, | |
| "grad_norm": 0.06997241824865341, | |
| "learning_rate": 5.57214140379649e-05, | |
| "loss": 0.1146, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.057813911472448, | |
| "grad_norm": 0.07603967189788818, | |
| "learning_rate": 5.4469963121458776e-05, | |
| "loss": 0.1169, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.057813911472448, | |
| "eval_loss": 0.12317664176225662, | |
| "eval_runtime": 81.5005, | |
| "eval_samples_per_second": 4.294, | |
| "eval_steps_per_second": 2.147, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.0698584763625414, | |
| "grad_norm": 0.09060684591531754, | |
| "learning_rate": 5.322744386326675e-05, | |
| "loss": 0.1139, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 2.0819030412526347, | |
| "grad_norm": 0.09253629297018051, | |
| "learning_rate": 5.1994100009237e-05, | |
| "loss": 0.1174, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 2.093947606142728, | |
| "grad_norm": 0.0707191526889801, | |
| "learning_rate": 5.077017350527269e-05, | |
| "loss": 0.1138, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 2.093947606142728, | |
| "eval_loss": 0.1225883737206459, | |
| "eval_runtime": 81.4669, | |
| "eval_samples_per_second": 4.296, | |
| "eval_steps_per_second": 2.148, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 2.1059921710328213, | |
| "grad_norm": 0.06803560256958008, | |
| "learning_rate": 4.9555904449868795e-05, | |
| "loss": 0.1196, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.1180367359229146, | |
| "grad_norm": 0.08914126455783844, | |
| "learning_rate": 4.835153104701221e-05, | |
| "loss": 0.1129, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 2.130081300813008, | |
| "grad_norm": 0.08918345719575882, | |
| "learning_rate": 4.71572895594528e-05, | |
| "loss": 0.1183, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.130081300813008, | |
| "eval_loss": 0.12287717312574387, | |
| "eval_runtime": 81.4917, | |
| "eval_samples_per_second": 4.295, | |
| "eval_steps_per_second": 2.147, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.1421258657031013, | |
| "grad_norm": 0.07838484644889832, | |
| "learning_rate": 4.5973414262355785e-05, | |
| "loss": 0.1141, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 2.154170430593195, | |
| "grad_norm": 0.11005687713623047, | |
| "learning_rate": 4.480013739734368e-05, | |
| "loss": 0.1223, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 2.1662149954832883, | |
| "grad_norm": 0.08749410510063171, | |
| "learning_rate": 4.363768912693749e-05, | |
| "loss": 0.1117, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.1662149954832883, | |
| "eval_loss": 0.12259072810411453, | |
| "eval_runtime": 81.4558, | |
| "eval_samples_per_second": 4.297, | |
| "eval_steps_per_second": 2.148, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.1782595603733816, | |
| "grad_norm": 0.08549398928880692, | |
| "learning_rate": 4.24862974894053e-05, | |
| "loss": 0.1088, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 2.190304125263475, | |
| "grad_norm": 0.05755528435111046, | |
| "learning_rate": 4.134618835402816e-05, | |
| "loss": 0.1063, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 2.2023486901535683, | |
| "grad_norm": 0.07486403733491898, | |
| "learning_rate": 4.0217585376790834e-05, | |
| "loss": 0.1183, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.2023486901535683, | |
| "eval_loss": 0.12217788398265839, | |
| "eval_runtime": 81.8011, | |
| "eval_samples_per_second": 4.279, | |
| "eval_steps_per_second": 2.139, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.2143932550436616, | |
| "grad_norm": 0.08609265834093094, | |
| "learning_rate": 3.9100709956507356e-05, | |
| "loss": 0.1254, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 2.226437819933755, | |
| "grad_norm": 0.0692862719297409, | |
| "learning_rate": 3.79957811913888e-05, | |
| "loss": 0.1121, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 2.238482384823848, | |
| "grad_norm": 0.08547110855579376, | |
| "learning_rate": 3.6903015836062905e-05, | |
| "loss": 0.1097, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.238482384823848, | |
| "eval_loss": 0.12183844298124313, | |
| "eval_runtime": 81.5286, | |
| "eval_samples_per_second": 4.293, | |
| "eval_steps_per_second": 2.146, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.2505269497139415, | |
| "grad_norm": 0.08573822677135468, | |
| "learning_rate": 3.5822628259052906e-05, | |
| "loss": 0.1174, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 2.262571514604035, | |
| "grad_norm": 0.08069294691085815, | |
| "learning_rate": 3.475483040072495e-05, | |
| "loss": 0.1198, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 2.274616079494128, | |
| "grad_norm": 0.08202630281448364, | |
| "learning_rate": 3.369983173171141e-05, | |
| "loss": 0.1132, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 2.274616079494128, | |
| "eval_loss": 0.12189455330371857, | |
| "eval_runtime": 81.6511, | |
| "eval_samples_per_second": 4.287, | |
| "eval_steps_per_second": 2.143, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 2.2866606443842215, | |
| "grad_norm": 0.08111971616744995, | |
| "learning_rate": 3.2657839211819085e-05, | |
| "loss": 0.1086, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.298705209274315, | |
| "grad_norm": 0.07577092200517654, | |
| "learning_rate": 3.1629057249429527e-05, | |
| "loss": 0.1205, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 2.3107497741644085, | |
| "grad_norm": 0.0689835473895073, | |
| "learning_rate": 3.0613687661400384e-05, | |
| "loss": 0.1133, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.3107497741644085, | |
| "eval_loss": 0.12182266265153885, | |
| "eval_runtime": 81.5792, | |
| "eval_samples_per_second": 4.29, | |
| "eval_steps_per_second": 2.145, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.322794339054502, | |
| "grad_norm": 0.09697998315095901, | |
| "learning_rate": 2.9611929633474555e-05, | |
| "loss": 0.1214, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 2.334838903944595, | |
| "grad_norm": 0.07920137792825699, | |
| "learning_rate": 2.8623979681206002e-05, | |
| "loss": 0.1108, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 2.3468834688346885, | |
| "grad_norm": 0.07563956826925278, | |
| "learning_rate": 2.765003161140911e-05, | |
| "loss": 0.1213, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.3468834688346885, | |
| "eval_loss": 0.12195436656475067, | |
| "eval_runtime": 81.5329, | |
| "eval_samples_per_second": 4.293, | |
| "eval_steps_per_second": 2.146, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.3589280337247818, | |
| "grad_norm": 0.08294857293367386, | |
| "learning_rate": 2.66902764841394e-05, | |
| "loss": 0.1186, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 2.370972598614875, | |
| "grad_norm": 0.0806855708360672, | |
| "learning_rate": 2.5744902575213248e-05, | |
| "loss": 0.1127, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 2.3830171635049684, | |
| "grad_norm": 0.1039934828877449, | |
| "learning_rate": 2.481409533927358e-05, | |
| "loss": 0.1188, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 2.3830171635049684, | |
| "eval_loss": 0.12177734076976776, | |
| "eval_runtime": 81.5272, | |
| "eval_samples_per_second": 4.293, | |
| "eval_steps_per_second": 2.147, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 2.3950617283950617, | |
| "grad_norm": 0.09341799467802048, | |
| "learning_rate": 2.3898037373409276e-05, | |
| "loss": 0.1173, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 2.407106293285155, | |
| "grad_norm": 0.08933733403682709, | |
| "learning_rate": 2.2996908381334736e-05, | |
| "loss": 0.1148, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.4191508581752483, | |
| "grad_norm": 0.08493391424417496, | |
| "learning_rate": 2.211088513813754e-05, | |
| "loss": 0.1121, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 2.4191508581752483, | |
| "eval_loss": 0.12171091139316559, | |
| "eval_runtime": 81.5846, | |
| "eval_samples_per_second": 4.29, | |
| "eval_steps_per_second": 2.145, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 2.4311954230653416, | |
| "grad_norm": 0.1086319163441658, | |
| "learning_rate": 2.1240141455600116e-05, | |
| "loss": 0.1145, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 2.443239987955435, | |
| "grad_norm": 0.09176024794578552, | |
| "learning_rate": 2.0384848148103196e-05, | |
| "loss": 0.1092, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 2.4552845528455283, | |
| "grad_norm": 0.1033325344324112, | |
| "learning_rate": 1.9545172999116812e-05, | |
| "loss": 0.1055, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 2.4552845528455283, | |
| "eval_loss": 0.12144716829061508, | |
| "eval_runtime": 81.758, | |
| "eval_samples_per_second": 4.281, | |
| "eval_steps_per_second": 2.14, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 2.4673291177356216, | |
| "grad_norm": 0.07238776981830597, | |
| "learning_rate": 1.872128072828634e-05, | |
| "loss": 0.1105, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 2.4793736826257153, | |
| "grad_norm": 0.06941673159599304, | |
| "learning_rate": 1.791333295911909e-05, | |
| "loss": 0.1118, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 2.4914182475158086, | |
| "grad_norm": 0.08462639153003693, | |
| "learning_rate": 1.7121488187278713e-05, | |
| "loss": 0.1082, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 2.4914182475158086, | |
| "eval_loss": 0.12135373055934906, | |
| "eval_runtime": 81.6101, | |
| "eval_samples_per_second": 4.289, | |
| "eval_steps_per_second": 2.144, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 2.503462812405902, | |
| "grad_norm": 0.09350676834583282, | |
| "learning_rate": 1.6345901749492887e-05, | |
| "loss": 0.1121, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 2.5155073772959953, | |
| "grad_norm": 0.08438068628311157, | |
| "learning_rate": 1.5586725793080814e-05, | |
| "loss": 0.1146, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 2.5275519421860886, | |
| "grad_norm": 0.10219820588827133, | |
| "learning_rate": 1.484410924610642e-05, | |
| "loss": 0.1179, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.5275519421860886, | |
| "eval_loss": 0.12126310169696808, | |
| "eval_runtime": 81.4438, | |
| "eval_samples_per_second": 4.297, | |
| "eval_steps_per_second": 2.149, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.539596507076182, | |
| "grad_norm": 0.0966809093952179, | |
| "learning_rate": 1.4118197788163056e-05, | |
| "loss": 0.1186, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 2.551641071966275, | |
| "grad_norm": 0.1106065958738327, | |
| "learning_rate": 1.3409133821795306e-05, | |
| "loss": 0.1112, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 2.5636856368563685, | |
| "grad_norm": 0.09320900589227676, | |
| "learning_rate": 1.2717056444563957e-05, | |
| "loss": 0.1148, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 2.5636856368563685, | |
| "eval_loss": 0.12110390514135361, | |
| "eval_runtime": 81.8244, | |
| "eval_samples_per_second": 4.277, | |
| "eval_steps_per_second": 2.139, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 2.575730201746462, | |
| "grad_norm": 0.09060157835483551, | |
| "learning_rate": 1.2042101421758955e-05, | |
| "loss": 0.1219, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 2.587774766636555, | |
| "grad_norm": 0.09531334787607193, | |
| "learning_rate": 1.1384401159766433e-05, | |
| "loss": 0.1136, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 2.5998193315266485, | |
| "grad_norm": 0.07259315997362137, | |
| "learning_rate": 1.0744084680094246e-05, | |
| "loss": 0.1062, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 2.5998193315266485, | |
| "eval_loss": 0.12105338275432587, | |
| "eval_runtime": 81.5762, | |
| "eval_samples_per_second": 4.29, | |
| "eval_steps_per_second": 2.145, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 2.611863896416742, | |
| "grad_norm": 0.07185359299182892, | |
| "learning_rate": 1.0121277594061939e-05, | |
| "loss": 0.1184, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 2.6239084613068355, | |
| "grad_norm": 0.0885002538561821, | |
| "learning_rate": 9.516102078159317e-06, | |
| "loss": 0.1243, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 2.635953026196929, | |
| "grad_norm": 0.10129860043525696, | |
| "learning_rate": 8.928676850079133e-06, | |
| "loss": 0.1249, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 2.635953026196929, | |
| "eval_loss": 0.1209418997168541, | |
| "eval_runtime": 81.4691, | |
| "eval_samples_per_second": 4.296, | |
| "eval_steps_per_second": 2.148, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 2.647997591087022, | |
| "grad_norm": 0.08937793970108032, | |
| "learning_rate": 8.359117145428053e-06, | |
| "loss": 0.1188, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.6600421559771155, | |
| "grad_norm": 0.08618754148483276, | |
| "learning_rate": 7.807534695120911e-06, | |
| "loss": 0.1193, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 2.6720867208672088, | |
| "grad_norm": 0.0760350152850151, | |
| "learning_rate": 7.274037703462244e-06, | |
| "loss": 0.107, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 2.6720867208672088, | |
| "eval_loss": 0.12113867700099945, | |
| "eval_runtime": 81.609, | |
| "eval_samples_per_second": 4.289, | |
| "eval_steps_per_second": 2.144, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 2.684131285757302, | |
| "grad_norm": 0.08701752126216888, | |
| "learning_rate": 6.7587308269199786e-06, | |
| "loss": 0.1042, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 2.6961758506473954, | |
| "grad_norm": 0.08244354277849197, | |
| "learning_rate": 6.261715153594627e-06, | |
| "loss": 0.1156, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 2.7082204155374887, | |
| "grad_norm": 0.14877063035964966, | |
| "learning_rate": 5.783088183389062e-06, | |
| "loss": 0.1076, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.7082204155374887, | |
| "eval_loss": 0.12096786499023438, | |
| "eval_runtime": 81.6046, | |
| "eval_samples_per_second": 4.289, | |
| "eval_steps_per_second": 2.144, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.720264980427582, | |
| "grad_norm": 0.09160082787275314, | |
| "learning_rate": 5.322943808881675e-06, | |
| "loss": 0.1188, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 2.7323095453176753, | |
| "grad_norm": 0.08697830885648727, | |
| "learning_rate": 4.881372296907516e-06, | |
| "loss": 0.1082, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 2.7443541102077686, | |
| "grad_norm": 0.09075489640235901, | |
| "learning_rate": 4.4584602708505285e-06, | |
| "loss": 0.118, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.7443541102077686, | |
| "eval_loss": 0.12092573195695877, | |
| "eval_runtime": 81.6503, | |
| "eval_samples_per_second": 4.287, | |
| "eval_steps_per_second": 2.143, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.756398675097862, | |
| "grad_norm": 0.0934319868683815, | |
| "learning_rate": 4.054290693650642e-06, | |
| "loss": 0.1091, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 2.7684432399879553, | |
| "grad_norm": 0.08008915930986404, | |
| "learning_rate": 3.6689428515288004e-06, | |
| "loss": 0.1103, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.7804878048780486, | |
| "grad_norm": 0.07896912097930908, | |
| "learning_rate": 3.3024923384334163e-06, | |
| "loss": 0.1087, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.7804878048780486, | |
| "eval_loss": 0.12093985080718994, | |
| "eval_runtime": 81.4605, | |
| "eval_samples_per_second": 4.297, | |
| "eval_steps_per_second": 2.148, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.792532369768142, | |
| "grad_norm": 0.08136588335037231, | |
| "learning_rate": 2.9550110412109534e-06, | |
| "loss": 0.1106, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 2.804576934658235, | |
| "grad_norm": 0.07197284698486328, | |
| "learning_rate": 2.6265671255039537e-06, | |
| "loss": 0.1265, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 2.816621499548329, | |
| "grad_norm": 0.09639979153871536, | |
| "learning_rate": 2.3172250223787994e-06, | |
| "loss": 0.1168, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.816621499548329, | |
| "eval_loss": 0.12088128179311752, | |
| "eval_runtime": 81.8851, | |
| "eval_samples_per_second": 4.274, | |
| "eval_steps_per_second": 2.137, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.8286660644384223, | |
| "grad_norm": 0.09752853214740753, | |
| "learning_rate": 2.0270454156863905e-06, | |
| "loss": 0.1209, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.8407106293285156, | |
| "grad_norm": 0.08983393758535385, | |
| "learning_rate": 1.7560852301575892e-06, | |
| "loss": 0.1185, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 2.852755194218609, | |
| "grad_norm": 0.09476975351572037, | |
| "learning_rate": 1.5043976202363641e-06, | |
| "loss": 0.1172, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.852755194218609, | |
| "eval_loss": 0.1208338588476181, | |
| "eval_runtime": 81.5048, | |
| "eval_samples_per_second": 4.294, | |
| "eval_steps_per_second": 2.147, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.864799759108702, | |
| "grad_norm": 0.06642317026853561, | |
| "learning_rate": 1.2720319596523977e-06, | |
| "loss": 0.116, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 2.8768443239987955, | |
| "grad_norm": 0.10626640915870667, | |
| "learning_rate": 1.0590338317354454e-06, | |
| "loss": 0.112, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 2.888888888888889, | |
| "grad_norm": 0.07917091995477676, | |
| "learning_rate": 8.654450204731768e-07, | |
| "loss": 0.1092, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.888888888888889, | |
| "eval_loss": 0.12085919827222824, | |
| "eval_runtime": 81.5821, | |
| "eval_samples_per_second": 4.29, | |
| "eval_steps_per_second": 2.145, | |
| "step": 2400 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2493, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.360791333022802e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |