{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9857231533209188, "eval_steps": 30, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024829298572315334, "grad_norm": 7.621621131896973, "learning_rate": 2.222222222222222e-06, "loss": 2.9278, "step": 10 }, { "epoch": 0.04965859714463067, "grad_norm": 6.982970237731934, "learning_rate": 4.691358024691358e-06, "loss": 2.7139, "step": 20 }, { "epoch": 0.074487895716946, "grad_norm": 6.744388580322266, "learning_rate": 7.160493827160494e-06, "loss": 2.5972, "step": 30 }, { "epoch": 0.074487895716946, "eval_loss": 2.417867660522461, "eval_runtime": 41.6844, "eval_samples_per_second": 4.078, "eval_steps_per_second": 2.039, "step": 30 }, { "epoch": 0.09931719428926133, "grad_norm": 5.335218906402588, "learning_rate": 9.62962962962963e-06, "loss": 2.0383, "step": 40 }, { "epoch": 0.12414649286157665, "grad_norm": 2.143411874771118, "learning_rate": 1.2098765432098767e-05, "loss": 1.3351, "step": 50 }, { "epoch": 0.148975791433892, "grad_norm": 0.8463016748428345, "learning_rate": 1.4567901234567903e-05, "loss": 0.9604, "step": 60 }, { "epoch": 0.148975791433892, "eval_loss": 0.9155183434486389, "eval_runtime": 41.0238, "eval_samples_per_second": 4.144, "eval_steps_per_second": 2.072, "step": 60 }, { "epoch": 0.17380509000620734, "grad_norm": 0.595171332359314, "learning_rate": 1.7037037037037038e-05, "loss": 0.7447, "step": 70 }, { "epoch": 0.19863438857852267, "grad_norm": 0.5561698079109192, "learning_rate": 1.9506172839506175e-05, "loss": 0.6919, "step": 80 }, { "epoch": 0.22346368715083798, "grad_norm": 0.5283234715461731, "learning_rate": 1.999399199592735e-05, "loss": 0.6338, "step": 90 }, { "epoch": 0.22346368715083798, "eval_loss": 0.6309370994567871, "eval_runtime": 41.0557, "eval_samples_per_second": 4.141, "eval_steps_per_second": 2.07, "step": 90 }, { "epoch": 0.2482929857231533, "grad_norm": 0.5148088335990906, "learning_rate": 1.996959685164433e-05, "loss": 0.5407, "step": 100 }, { "epoch": 0.27312228429546864, "grad_norm": 0.7455502152442932, "learning_rate": 1.9926484830975116e-05, "loss": 0.5309, "step": 110 }, { "epoch": 0.297951582867784, "grad_norm": 0.5646942853927612, "learning_rate": 1.986473687223383e-05, "loss": 0.474, "step": 120 }, { "epoch": 0.297951582867784, "eval_loss": 0.4833507835865021, "eval_runtime": 41.0325, "eval_samples_per_second": 4.143, "eval_steps_per_second": 2.072, "step": 120 }, { "epoch": 0.3227808814400993, "grad_norm": 0.5670679211616516, "learning_rate": 1.9784468900761097e-05, "loss": 0.4259, "step": 130 }, { "epoch": 0.34761018001241467, "grad_norm": 0.7423049211502075, "learning_rate": 1.9685831611286312e-05, "loss": 0.412, "step": 140 }, { "epoch": 0.37243947858473, "grad_norm": 0.6956289410591125, "learning_rate": 1.9569010185014062e-05, "loss": 0.377, "step": 150 }, { "epoch": 0.37243947858473, "eval_loss": 0.38657110929489136, "eval_runtime": 40.924, "eval_samples_per_second": 4.154, "eval_steps_per_second": 2.077, "step": 150 }, { "epoch": 0.39726877715704534, "grad_norm": 0.8065354824066162, "learning_rate": 1.9434223941965738e-05, "loss": 0.3361, "step": 160 }, { "epoch": 0.42209807572936064, "grad_norm": 0.7633559703826904, "learning_rate": 1.9281725929229127e-05, "loss": 0.3348, "step": 170 }, { "epoch": 0.44692737430167595, "grad_norm": 0.7941247224807739, "learning_rate": 1.9111802445888936e-05, "loss": 0.2987, "step": 180 }, { "epoch": 0.44692737430167595, "eval_loss": 0.3138997256755829, "eval_runtime": 41.0146, "eval_samples_per_second": 4.145, "eval_steps_per_second": 2.072, "step": 180 }, { "epoch": 0.4717566728739913, "grad_norm": 1.1606348752975464, "learning_rate": 1.8924772505530177e-05, "loss": 0.2776, "step": 190 }, { "epoch": 0.4965859714463066, "grad_norm": 1.1247198581695557, "learning_rate": 1.8720987237323497e-05, "loss": 0.2788, "step": 200 }, { "epoch": 0.521415270018622, "grad_norm": 0.8864783644676208, "learning_rate": 1.8500829226816853e-05, "loss": 0.2588, "step": 210 }, { "epoch": 0.521415270018622, "eval_loss": 0.26379144191741943, "eval_runtime": 41.1029, "eval_samples_per_second": 4.136, "eval_steps_per_second": 2.068, "step": 210 }, { "epoch": 0.5462445685909373, "grad_norm": 0.9481696486473083, "learning_rate": 1.826471179767111e-05, "loss": 0.2359, "step": 220 }, { "epoch": 0.5710738671632526, "grad_norm": 1.218240737915039, "learning_rate": 1.801307823568806e-05, "loss": 0.2406, "step": 230 }, { "epoch": 0.595903165735568, "grad_norm": 0.9764422178268433, "learning_rate": 1.7746400956587653e-05, "loss": 0.2241, "step": 240 }, { "epoch": 0.595903165735568, "eval_loss": 0.2281169593334198, "eval_runtime": 41.0529, "eval_samples_per_second": 4.141, "eval_steps_per_second": 2.071, "step": 240 }, { "epoch": 0.6207324643078833, "grad_norm": 1.1048036813735962, "learning_rate": 1.7465180619096834e-05, "loss": 0.2199, "step": 250 }, { "epoch": 0.6455617628801986, "grad_norm": 0.973822832107544, "learning_rate": 1.7169945185015106e-05, "loss": 0.2025, "step": 260 }, { "epoch": 0.6703910614525139, "grad_norm": 0.758234977722168, "learning_rate": 1.686124892802141e-05, "loss": 0.1932, "step": 270 }, { "epoch": 0.6703910614525139, "eval_loss": 0.20496493577957153, "eval_runtime": 41.1068, "eval_samples_per_second": 4.136, "eval_steps_per_second": 2.068, "step": 270 }, { "epoch": 0.6952203600248293, "grad_norm": 1.2279834747314453, "learning_rate": 1.6539671393083218e-05, "loss": 0.1943, "step": 280 }, { "epoch": 0.7200496585971446, "grad_norm": 0.7134841084480286, "learning_rate": 1.6205816308421386e-05, "loss": 0.1796, "step": 290 }, { "epoch": 0.74487895716946, "grad_norm": 0.9929455518722534, "learning_rate": 1.586031045207354e-05, "loss": 0.1874, "step": 300 }, { "epoch": 0.74487895716946, "eval_loss": 0.19150954484939575, "eval_runtime": 41.0713, "eval_samples_per_second": 4.139, "eval_steps_per_second": 2.07, "step": 300 }, { "epoch": 0.7697082557417753, "grad_norm": 0.8576317429542542, "learning_rate": 1.5503802475183773e-05, "loss": 0.1852, "step": 310 }, { "epoch": 0.7945375543140907, "grad_norm": 1.0153837203979492, "learning_rate": 1.5136961684227905e-05, "loss": 0.1814, "step": 320 }, { "epoch": 0.819366852886406, "grad_norm": 0.8694589734077454, "learning_rate": 1.4760476784460514e-05, "loss": 0.1841, "step": 330 }, { "epoch": 0.819366852886406, "eval_loss": 0.17898762226104736, "eval_runtime": 41.0055, "eval_samples_per_second": 4.146, "eval_steps_per_second": 2.073, "step": 330 }, { "epoch": 0.8441961514587213, "grad_norm": 1.1345281600952148, "learning_rate": 1.4375054586942771e-05, "loss": 0.1725, "step": 340 }, { "epoch": 0.8690254500310366, "grad_norm": 1.0460193157196045, "learning_rate": 1.3981418681578546e-05, "loss": 0.1605, "step": 350 }, { "epoch": 0.8938547486033519, "grad_norm": 1.0834463834762573, "learning_rate": 1.3580308078649948e-05, "loss": 0.1652, "step": 360 }, { "epoch": 0.8938547486033519, "eval_loss": 0.1723683923482895, "eval_runtime": 41.0124, "eval_samples_per_second": 4.145, "eval_steps_per_second": 2.073, "step": 360 }, { "epoch": 0.9186840471756673, "grad_norm": 0.7139394283294678, "learning_rate": 1.3172475821402748e-05, "loss": 0.1742, "step": 370 }, { "epoch": 0.9435133457479826, "grad_norm": 0.8907492756843567, "learning_rate": 1.2758687572286367e-05, "loss": 0.154, "step": 380 }, { "epoch": 0.9683426443202979, "grad_norm": 0.7731947302818298, "learning_rate": 1.2339720175502643e-05, "loss": 0.1627, "step": 390 }, { "epoch": 0.9683426443202979, "eval_loss": 0.16476133465766907, "eval_runtime": 41.0851, "eval_samples_per_second": 4.138, "eval_steps_per_second": 2.069, "step": 390 }, { "epoch": 0.9931719428926132, "grad_norm": 0.8102223873138428, "learning_rate": 1.191636019856198e-05, "loss": 0.1546, "step": 400 }, { "epoch": 1.0173805090006207, "grad_norm": 0.9590178728103638, "learning_rate": 1.1489402455585078e-05, "loss": 0.1635, "step": 410 }, { "epoch": 1.042209807572936, "grad_norm": 1.270085334777832, "learning_rate": 1.1059648515122426e-05, "loss": 0.1578, "step": 420 }, { "epoch": 1.042209807572936, "eval_loss": 0.16066311299800873, "eval_runtime": 41.0613, "eval_samples_per_second": 4.14, "eval_steps_per_second": 2.07, "step": 420 }, { "epoch": 1.0670391061452513, "grad_norm": 0.7239245176315308, "learning_rate": 1.0627905195293135e-05, "loss": 0.1509, "step": 430 }, { "epoch": 1.0918684047175666, "grad_norm": 1.2316311597824097, "learning_rate": 1.0194983049068212e-05, "loss": 0.1493, "step": 440 }, { "epoch": 1.1166977032898822, "grad_norm": 1.034386157989502, "learning_rate": 9.761694842542042e-06, "loss": 0.1427, "step": 450 }, { "epoch": 1.1166977032898822, "eval_loss": 0.15721318125724792, "eval_runtime": 40.9543, "eval_samples_per_second": 4.151, "eval_steps_per_second": 2.075, "step": 450 }, { "epoch": 1.1415270018621975, "grad_norm": 1.0646947622299194, "learning_rate": 9.328854029048985e-06, "loss": 0.1528, "step": 460 }, { "epoch": 1.1663563004345128, "grad_norm": 0.8461800813674927, "learning_rate": 8.897273221989715e-06, "loss": 0.1505, "step": 470 }, { "epoch": 1.191185599006828, "grad_norm": 0.6844385862350464, "learning_rate": 8.467762669234496e-06, "loss": 0.1472, "step": 480 }, { "epoch": 1.191185599006828, "eval_loss": 0.15322649478912354, "eval_runtime": 41.0165, "eval_samples_per_second": 4.145, "eval_steps_per_second": 2.072, "step": 480 }, { "epoch": 1.2160148975791434, "grad_norm": 0.8525738716125488, "learning_rate": 8.041128731967445e-06, "loss": 0.1519, "step": 490 }, { "epoch": 1.2408441961514587, "grad_norm": 0.7886703014373779, "learning_rate": 7.61817237082768e-06, "loss": 0.1519, "step": 500 }, { "epoch": 1.265673494723774, "grad_norm": 0.7268862128257751, "learning_rate": 7.199687642189388e-06, "loss": 0.142, "step": 510 }, { "epoch": 1.265673494723774, "eval_loss": 0.1511753350496292, "eval_runtime": 41.1138, "eval_samples_per_second": 4.135, "eval_steps_per_second": 2.067, "step": 510 }, { "epoch": 1.2905027932960893, "grad_norm": 0.7751151323318481, "learning_rate": 6.7864602074039775e-06, "loss": 0.1471, "step": 520 }, { "epoch": 1.3153320918684046, "grad_norm": 0.8264702558517456, "learning_rate": 6.37926585780297e-06, "loss": 0.1438, "step": 530 }, { "epoch": 1.34016139044072, "grad_norm": 0.5469579100608826, "learning_rate": 5.978869058230841e-06, "loss": 0.1493, "step": 540 }, { "epoch": 1.34016139044072, "eval_loss": 0.1491098254919052, "eval_runtime": 41.1584, "eval_samples_per_second": 4.13, "eval_steps_per_second": 2.065, "step": 540 }, { "epoch": 1.3649906890130354, "grad_norm": 0.9022724032402039, "learning_rate": 5.586021511842136e-06, "loss": 0.1371, "step": 550 }, { "epoch": 1.3898199875853507, "grad_norm": 0.8355094790458679, "learning_rate": 5.201460748857369e-06, "loss": 0.1409, "step": 560 }, { "epoch": 1.414649286157666, "grad_norm": 0.7820518016815186, "learning_rate": 4.825908741927076e-06, "loss": 0.1417, "step": 570 }, { "epoch": 1.414649286157666, "eval_loss": 0.14838995039463043, "eval_runtime": 41.137, "eval_samples_per_second": 4.133, "eval_steps_per_second": 2.066, "step": 570 }, { "epoch": 1.4394785847299814, "grad_norm": 0.7794449925422668, "learning_rate": 4.4600705507036125e-06, "loss": 0.1433, "step": 580 }, { "epoch": 1.4643078833022967, "grad_norm": 0.7465994358062744, "learning_rate": 4.104632998165309e-06, "loss": 0.1445, "step": 590 }, { "epoch": 1.489137181874612, "grad_norm": 0.8334828615188599, "learning_rate": 3.7602633811781165e-06, "loss": 0.1458, "step": 600 }, { "epoch": 1.489137181874612, "eval_loss": 0.1462916135787964, "eval_runtime": 41.086, "eval_samples_per_second": 4.138, "eval_steps_per_second": 2.069, "step": 600 }, { "epoch": 1.5139664804469275, "grad_norm": 0.8605223894119263, "learning_rate": 3.4276082177154536e-06, "loss": 0.1447, "step": 610 }, { "epoch": 1.5387957790192428, "grad_norm": 0.5568763017654419, "learning_rate": 3.107292033088265e-06, "loss": 0.1384, "step": 620 }, { "epoch": 1.563625077591558, "grad_norm": 0.7917467951774597, "learning_rate": 2.7999161874640026e-06, "loss": 0.1481, "step": 630 }, { "epoch": 1.563625077591558, "eval_loss": 0.14508940279483795, "eval_runtime": 41.1236, "eval_samples_per_second": 4.134, "eval_steps_per_second": 2.067, "step": 630 }, { "epoch": 1.5884543761638734, "grad_norm": 0.7165929675102234, "learning_rate": 2.506057746875753e-06, "loss": 0.1422, "step": 640 }, { "epoch": 1.6132836747361887, "grad_norm": 0.8443304896354675, "learning_rate": 2.226268399841055e-06, "loss": 0.1406, "step": 650 }, { "epoch": 1.638112973308504, "grad_norm": 1.0195279121398926, "learning_rate": 1.961073421624352e-06, "loss": 0.1403, "step": 660 }, { "epoch": 1.638112973308504, "eval_loss": 0.1443227380514145, "eval_runtime": 41.1177, "eval_samples_per_second": 4.134, "eval_steps_per_second": 2.067, "step": 660 }, { "epoch": 1.6629422718808193, "grad_norm": 0.7534502148628235, "learning_rate": 1.710970688087561e-06, "loss": 0.1398, "step": 670 }, { "epoch": 1.6877715704531346, "grad_norm": 0.8134092092514038, "learning_rate": 1.4764297409801764e-06, "loss": 0.1377, "step": 680 }, { "epoch": 1.71260086902545, "grad_norm": 0.7144195437431335, "learning_rate": 1.2578909064236887e-06, "loss": 0.1457, "step": 690 }, { "epoch": 1.71260086902545, "eval_loss": 0.1440444439649582, "eval_runtime": 41.1467, "eval_samples_per_second": 4.132, "eval_steps_per_second": 2.066, "step": 690 }, { "epoch": 1.7374301675977653, "grad_norm": 0.5405588150024414, "learning_rate": 1.055764468245304e-06, "loss": 0.1406, "step": 700 }, { "epoch": 1.7622594661700806, "grad_norm": 0.6921040415763855, "learning_rate": 8.70429897712921e-07, "loss": 0.1366, "step": 710 }, { "epoch": 1.7870887647423959, "grad_norm": 0.7384780645370483, "learning_rate": 7.022351411174866e-07, "loss": 0.1473, "step": 720 }, { "epoch": 1.7870887647423959, "eval_loss": 0.1436140090227127, "eval_runtime": 41.1175, "eval_samples_per_second": 4.134, "eval_steps_per_second": 2.067, "step": 720 }, { "epoch": 1.8119180633147114, "grad_norm": 0.7989315986633301, "learning_rate": 5.51495966540182e-07, "loss": 0.1368, "step": 730 }, { "epoch": 1.8367473618870267, "grad_norm": 0.8284072875976562, "learning_rate": 4.1849537103084924e-07, "loss": 0.1445, "step": 740 }, { "epoch": 1.861576660459342, "grad_norm": 0.8498286604881287, "learning_rate": 3.0348304931059556e-07, "loss": 0.1347, "step": 750 }, { "epoch": 1.861576660459342, "eval_loss": 0.14337773621082306, "eval_runtime": 41.0893, "eval_samples_per_second": 4.137, "eval_steps_per_second": 2.069, "step": 750 }, { "epoch": 1.8864059590316573, "grad_norm": 0.7535120844841003, "learning_rate": 2.066749249960498e-07, "loss": 0.1422, "step": 760 }, { "epoch": 1.9112352576039728, "grad_norm": 0.8711650371551514, "learning_rate": 1.2825274522532795e-07, "loss": 0.129, "step": 770 }, { "epoch": 1.9360645561762881, "grad_norm": 0.751017689704895, "learning_rate": 6.836373944677954e-08, "loss": 0.1423, "step": 780 }, { "epoch": 1.9360645561762881, "eval_loss": 0.1432679146528244, "eval_runtime": 41.1344, "eval_samples_per_second": 4.133, "eval_steps_per_second": 2.066, "step": 780 }, { "epoch": 1.9608938547486034, "grad_norm": 0.7860766053199768, "learning_rate": 2.7120343011071138e-08, "loss": 0.1492, "step": 790 }, { "epoch": 1.9857231533209188, "grad_norm": 0.7165619134902954, "learning_rate": 4.599986085573882e-09, "loss": 0.138, "step": 800 } ], "logging_steps": 10, "max_steps": 806, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4861828204940288e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }