| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.9857231533209188, | |
| "eval_steps": 30, | |
| "global_step": 800, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.024829298572315334, | |
| "grad_norm": 7.621621131896973, | |
| "learning_rate": 2.222222222222222e-06, | |
| "loss": 2.9278, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.04965859714463067, | |
| "grad_norm": 6.982970237731934, | |
| "learning_rate": 4.691358024691358e-06, | |
| "loss": 2.7139, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.074487895716946, | |
| "grad_norm": 6.744388580322266, | |
| "learning_rate": 7.160493827160494e-06, | |
| "loss": 2.5972, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.074487895716946, | |
| "eval_loss": 2.417867660522461, | |
| "eval_runtime": 41.6844, | |
| "eval_samples_per_second": 4.078, | |
| "eval_steps_per_second": 2.039, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.09931719428926133, | |
| "grad_norm": 5.335218906402588, | |
| "learning_rate": 9.62962962962963e-06, | |
| "loss": 2.0383, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.12414649286157665, | |
| "grad_norm": 2.143411874771118, | |
| "learning_rate": 1.2098765432098767e-05, | |
| "loss": 1.3351, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.148975791433892, | |
| "grad_norm": 0.8463016748428345, | |
| "learning_rate": 1.4567901234567903e-05, | |
| "loss": 0.9604, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.148975791433892, | |
| "eval_loss": 0.9155183434486389, | |
| "eval_runtime": 41.0238, | |
| "eval_samples_per_second": 4.144, | |
| "eval_steps_per_second": 2.072, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.17380509000620734, | |
| "grad_norm": 0.595171332359314, | |
| "learning_rate": 1.7037037037037038e-05, | |
| "loss": 0.7447, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.19863438857852267, | |
| "grad_norm": 0.5561698079109192, | |
| "learning_rate": 1.9506172839506175e-05, | |
| "loss": 0.6919, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.22346368715083798, | |
| "grad_norm": 0.5283234715461731, | |
| "learning_rate": 1.999399199592735e-05, | |
| "loss": 0.6338, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.22346368715083798, | |
| "eval_loss": 0.6309370994567871, | |
| "eval_runtime": 41.0557, | |
| "eval_samples_per_second": 4.141, | |
| "eval_steps_per_second": 2.07, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.2482929857231533, | |
| "grad_norm": 0.5148088335990906, | |
| "learning_rate": 1.996959685164433e-05, | |
| "loss": 0.5407, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.27312228429546864, | |
| "grad_norm": 0.7455502152442932, | |
| "learning_rate": 1.9926484830975116e-05, | |
| "loss": 0.5309, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.297951582867784, | |
| "grad_norm": 0.5646942853927612, | |
| "learning_rate": 1.986473687223383e-05, | |
| "loss": 0.474, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.297951582867784, | |
| "eval_loss": 0.4833507835865021, | |
| "eval_runtime": 41.0325, | |
| "eval_samples_per_second": 4.143, | |
| "eval_steps_per_second": 2.072, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.3227808814400993, | |
| "grad_norm": 0.5670679211616516, | |
| "learning_rate": 1.9784468900761097e-05, | |
| "loss": 0.4259, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.34761018001241467, | |
| "grad_norm": 0.7423049211502075, | |
| "learning_rate": 1.9685831611286312e-05, | |
| "loss": 0.412, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.37243947858473, | |
| "grad_norm": 0.6956289410591125, | |
| "learning_rate": 1.9569010185014062e-05, | |
| "loss": 0.377, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.37243947858473, | |
| "eval_loss": 0.38657110929489136, | |
| "eval_runtime": 40.924, | |
| "eval_samples_per_second": 4.154, | |
| "eval_steps_per_second": 2.077, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.39726877715704534, | |
| "grad_norm": 0.8065354824066162, | |
| "learning_rate": 1.9434223941965738e-05, | |
| "loss": 0.3361, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.42209807572936064, | |
| "grad_norm": 0.7633559703826904, | |
| "learning_rate": 1.9281725929229127e-05, | |
| "loss": 0.3348, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.44692737430167595, | |
| "grad_norm": 0.7941247224807739, | |
| "learning_rate": 1.9111802445888936e-05, | |
| "loss": 0.2987, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.44692737430167595, | |
| "eval_loss": 0.3138997256755829, | |
| "eval_runtime": 41.0146, | |
| "eval_samples_per_second": 4.145, | |
| "eval_steps_per_second": 2.072, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.4717566728739913, | |
| "grad_norm": 1.1606348752975464, | |
| "learning_rate": 1.8924772505530177e-05, | |
| "loss": 0.2776, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.4965859714463066, | |
| "grad_norm": 1.1247198581695557, | |
| "learning_rate": 1.8720987237323497e-05, | |
| "loss": 0.2788, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.521415270018622, | |
| "grad_norm": 0.8864783644676208, | |
| "learning_rate": 1.8500829226816853e-05, | |
| "loss": 0.2588, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.521415270018622, | |
| "eval_loss": 0.26379144191741943, | |
| "eval_runtime": 41.1029, | |
| "eval_samples_per_second": 4.136, | |
| "eval_steps_per_second": 2.068, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.5462445685909373, | |
| "grad_norm": 0.9481696486473083, | |
| "learning_rate": 1.826471179767111e-05, | |
| "loss": 0.2359, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.5710738671632526, | |
| "grad_norm": 1.218240737915039, | |
| "learning_rate": 1.801307823568806e-05, | |
| "loss": 0.2406, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.595903165735568, | |
| "grad_norm": 0.9764422178268433, | |
| "learning_rate": 1.7746400956587653e-05, | |
| "loss": 0.2241, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.595903165735568, | |
| "eval_loss": 0.2281169593334198, | |
| "eval_runtime": 41.0529, | |
| "eval_samples_per_second": 4.141, | |
| "eval_steps_per_second": 2.071, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.6207324643078833, | |
| "grad_norm": 1.1048036813735962, | |
| "learning_rate": 1.7465180619096834e-05, | |
| "loss": 0.2199, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.6455617628801986, | |
| "grad_norm": 0.973822832107544, | |
| "learning_rate": 1.7169945185015106e-05, | |
| "loss": 0.2025, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.6703910614525139, | |
| "grad_norm": 0.758234977722168, | |
| "learning_rate": 1.686124892802141e-05, | |
| "loss": 0.1932, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.6703910614525139, | |
| "eval_loss": 0.20496493577957153, | |
| "eval_runtime": 41.1068, | |
| "eval_samples_per_second": 4.136, | |
| "eval_steps_per_second": 2.068, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.6952203600248293, | |
| "grad_norm": 1.2279834747314453, | |
| "learning_rate": 1.6539671393083218e-05, | |
| "loss": 0.1943, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.7200496585971446, | |
| "grad_norm": 0.7134841084480286, | |
| "learning_rate": 1.6205816308421386e-05, | |
| "loss": 0.1796, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.74487895716946, | |
| "grad_norm": 0.9929455518722534, | |
| "learning_rate": 1.586031045207354e-05, | |
| "loss": 0.1874, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.74487895716946, | |
| "eval_loss": 0.19150954484939575, | |
| "eval_runtime": 41.0713, | |
| "eval_samples_per_second": 4.139, | |
| "eval_steps_per_second": 2.07, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.7697082557417753, | |
| "grad_norm": 0.8576317429542542, | |
| "learning_rate": 1.5503802475183773e-05, | |
| "loss": 0.1852, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.7945375543140907, | |
| "grad_norm": 1.0153837203979492, | |
| "learning_rate": 1.5136961684227905e-05, | |
| "loss": 0.1814, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.819366852886406, | |
| "grad_norm": 0.8694589734077454, | |
| "learning_rate": 1.4760476784460514e-05, | |
| "loss": 0.1841, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.819366852886406, | |
| "eval_loss": 0.17898762226104736, | |
| "eval_runtime": 41.0055, | |
| "eval_samples_per_second": 4.146, | |
| "eval_steps_per_second": 2.073, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.8441961514587213, | |
| "grad_norm": 1.1345281600952148, | |
| "learning_rate": 1.4375054586942771e-05, | |
| "loss": 0.1725, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.8690254500310366, | |
| "grad_norm": 1.0460193157196045, | |
| "learning_rate": 1.3981418681578546e-05, | |
| "loss": 0.1605, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.8938547486033519, | |
| "grad_norm": 1.0834463834762573, | |
| "learning_rate": 1.3580308078649948e-05, | |
| "loss": 0.1652, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.8938547486033519, | |
| "eval_loss": 0.1723683923482895, | |
| "eval_runtime": 41.0124, | |
| "eval_samples_per_second": 4.145, | |
| "eval_steps_per_second": 2.073, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.9186840471756673, | |
| "grad_norm": 0.7139394283294678, | |
| "learning_rate": 1.3172475821402748e-05, | |
| "loss": 0.1742, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.9435133457479826, | |
| "grad_norm": 0.8907492756843567, | |
| "learning_rate": 1.2758687572286367e-05, | |
| "loss": 0.154, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.9683426443202979, | |
| "grad_norm": 0.7731947302818298, | |
| "learning_rate": 1.2339720175502643e-05, | |
| "loss": 0.1627, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.9683426443202979, | |
| "eval_loss": 0.16476133465766907, | |
| "eval_runtime": 41.0851, | |
| "eval_samples_per_second": 4.138, | |
| "eval_steps_per_second": 2.069, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.9931719428926132, | |
| "grad_norm": 0.8102223873138428, | |
| "learning_rate": 1.191636019856198e-05, | |
| "loss": 0.1546, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.0173805090006207, | |
| "grad_norm": 0.9590178728103638, | |
| "learning_rate": 1.1489402455585078e-05, | |
| "loss": 0.1635, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.042209807572936, | |
| "grad_norm": 1.270085334777832, | |
| "learning_rate": 1.1059648515122426e-05, | |
| "loss": 0.1578, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.042209807572936, | |
| "eval_loss": 0.16066311299800873, | |
| "eval_runtime": 41.0613, | |
| "eval_samples_per_second": 4.14, | |
| "eval_steps_per_second": 2.07, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.0670391061452513, | |
| "grad_norm": 0.7239245176315308, | |
| "learning_rate": 1.0627905195293135e-05, | |
| "loss": 0.1509, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.0918684047175666, | |
| "grad_norm": 1.2316311597824097, | |
| "learning_rate": 1.0194983049068212e-05, | |
| "loss": 0.1493, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.1166977032898822, | |
| "grad_norm": 1.034386157989502, | |
| "learning_rate": 9.761694842542042e-06, | |
| "loss": 0.1427, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.1166977032898822, | |
| "eval_loss": 0.15721318125724792, | |
| "eval_runtime": 40.9543, | |
| "eval_samples_per_second": 4.151, | |
| "eval_steps_per_second": 2.075, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.1415270018621975, | |
| "grad_norm": 1.0646947622299194, | |
| "learning_rate": 9.328854029048985e-06, | |
| "loss": 0.1528, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.1663563004345128, | |
| "grad_norm": 0.8461800813674927, | |
| "learning_rate": 8.897273221989715e-06, | |
| "loss": 0.1505, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.191185599006828, | |
| "grad_norm": 0.6844385862350464, | |
| "learning_rate": 8.467762669234496e-06, | |
| "loss": 0.1472, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.191185599006828, | |
| "eval_loss": 0.15322649478912354, | |
| "eval_runtime": 41.0165, | |
| "eval_samples_per_second": 4.145, | |
| "eval_steps_per_second": 2.072, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.2160148975791434, | |
| "grad_norm": 0.8525738716125488, | |
| "learning_rate": 8.041128731967445e-06, | |
| "loss": 0.1519, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.2408441961514587, | |
| "grad_norm": 0.7886703014373779, | |
| "learning_rate": 7.61817237082768e-06, | |
| "loss": 0.1519, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.265673494723774, | |
| "grad_norm": 0.7268862128257751, | |
| "learning_rate": 7.199687642189388e-06, | |
| "loss": 0.142, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.265673494723774, | |
| "eval_loss": 0.1511753350496292, | |
| "eval_runtime": 41.1138, | |
| "eval_samples_per_second": 4.135, | |
| "eval_steps_per_second": 2.067, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.2905027932960893, | |
| "grad_norm": 0.7751151323318481, | |
| "learning_rate": 6.7864602074039775e-06, | |
| "loss": 0.1471, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.3153320918684046, | |
| "grad_norm": 0.8264702558517456, | |
| "learning_rate": 6.37926585780297e-06, | |
| "loss": 0.1438, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.34016139044072, | |
| "grad_norm": 0.5469579100608826, | |
| "learning_rate": 5.978869058230841e-06, | |
| "loss": 0.1493, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.34016139044072, | |
| "eval_loss": 0.1491098254919052, | |
| "eval_runtime": 41.1584, | |
| "eval_samples_per_second": 4.13, | |
| "eval_steps_per_second": 2.065, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.3649906890130354, | |
| "grad_norm": 0.9022724032402039, | |
| "learning_rate": 5.586021511842136e-06, | |
| "loss": 0.1371, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.3898199875853507, | |
| "grad_norm": 0.8355094790458679, | |
| "learning_rate": 5.201460748857369e-06, | |
| "loss": 0.1409, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.414649286157666, | |
| "grad_norm": 0.7820518016815186, | |
| "learning_rate": 4.825908741927076e-06, | |
| "loss": 0.1417, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.414649286157666, | |
| "eval_loss": 0.14838995039463043, | |
| "eval_runtime": 41.137, | |
| "eval_samples_per_second": 4.133, | |
| "eval_steps_per_second": 2.066, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.4394785847299814, | |
| "grad_norm": 0.7794449925422668, | |
| "learning_rate": 4.4600705507036125e-06, | |
| "loss": 0.1433, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.4643078833022967, | |
| "grad_norm": 0.7465994358062744, | |
| "learning_rate": 4.104632998165309e-06, | |
| "loss": 0.1445, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.489137181874612, | |
| "grad_norm": 0.8334828615188599, | |
| "learning_rate": 3.7602633811781165e-06, | |
| "loss": 0.1458, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.489137181874612, | |
| "eval_loss": 0.1462916135787964, | |
| "eval_runtime": 41.086, | |
| "eval_samples_per_second": 4.138, | |
| "eval_steps_per_second": 2.069, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.5139664804469275, | |
| "grad_norm": 0.8605223894119263, | |
| "learning_rate": 3.4276082177154536e-06, | |
| "loss": 0.1447, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.5387957790192428, | |
| "grad_norm": 0.5568763017654419, | |
| "learning_rate": 3.107292033088265e-06, | |
| "loss": 0.1384, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.563625077591558, | |
| "grad_norm": 0.7917467951774597, | |
| "learning_rate": 2.7999161874640026e-06, | |
| "loss": 0.1481, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.563625077591558, | |
| "eval_loss": 0.14508940279483795, | |
| "eval_runtime": 41.1236, | |
| "eval_samples_per_second": 4.134, | |
| "eval_steps_per_second": 2.067, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.5884543761638734, | |
| "grad_norm": 0.7165929675102234, | |
| "learning_rate": 2.506057746875753e-06, | |
| "loss": 0.1422, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.6132836747361887, | |
| "grad_norm": 0.8443304896354675, | |
| "learning_rate": 2.226268399841055e-06, | |
| "loss": 0.1406, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.638112973308504, | |
| "grad_norm": 1.0195279121398926, | |
| "learning_rate": 1.961073421624352e-06, | |
| "loss": 0.1403, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.638112973308504, | |
| "eval_loss": 0.1443227380514145, | |
| "eval_runtime": 41.1177, | |
| "eval_samples_per_second": 4.134, | |
| "eval_steps_per_second": 2.067, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.6629422718808193, | |
| "grad_norm": 0.7534502148628235, | |
| "learning_rate": 1.710970688087561e-06, | |
| "loss": 0.1398, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.6877715704531346, | |
| "grad_norm": 0.8134092092514038, | |
| "learning_rate": 1.4764297409801764e-06, | |
| "loss": 0.1377, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.71260086902545, | |
| "grad_norm": 0.7144195437431335, | |
| "learning_rate": 1.2578909064236887e-06, | |
| "loss": 0.1457, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.71260086902545, | |
| "eval_loss": 0.1440444439649582, | |
| "eval_runtime": 41.1467, | |
| "eval_samples_per_second": 4.132, | |
| "eval_steps_per_second": 2.066, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.7374301675977653, | |
| "grad_norm": 0.5405588150024414, | |
| "learning_rate": 1.055764468245304e-06, | |
| "loss": 0.1406, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.7622594661700806, | |
| "grad_norm": 0.6921040415763855, | |
| "learning_rate": 8.70429897712921e-07, | |
| "loss": 0.1366, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.7870887647423959, | |
| "grad_norm": 0.7384780645370483, | |
| "learning_rate": 7.022351411174866e-07, | |
| "loss": 0.1473, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.7870887647423959, | |
| "eval_loss": 0.1436140090227127, | |
| "eval_runtime": 41.1175, | |
| "eval_samples_per_second": 4.134, | |
| "eval_steps_per_second": 2.067, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.8119180633147114, | |
| "grad_norm": 0.7989315986633301, | |
| "learning_rate": 5.51495966540182e-07, | |
| "loss": 0.1368, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.8367473618870267, | |
| "grad_norm": 0.8284072875976562, | |
| "learning_rate": 4.1849537103084924e-07, | |
| "loss": 0.1445, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.861576660459342, | |
| "grad_norm": 0.8498286604881287, | |
| "learning_rate": 3.0348304931059556e-07, | |
| "loss": 0.1347, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.861576660459342, | |
| "eval_loss": 0.14337773621082306, | |
| "eval_runtime": 41.0893, | |
| "eval_samples_per_second": 4.137, | |
| "eval_steps_per_second": 2.069, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.8864059590316573, | |
| "grad_norm": 0.7535120844841003, | |
| "learning_rate": 2.066749249960498e-07, | |
| "loss": 0.1422, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.9112352576039728, | |
| "grad_norm": 0.8711650371551514, | |
| "learning_rate": 1.2825274522532795e-07, | |
| "loss": 0.129, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.9360645561762881, | |
| "grad_norm": 0.751017689704895, | |
| "learning_rate": 6.836373944677954e-08, | |
| "loss": 0.1423, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.9360645561762881, | |
| "eval_loss": 0.1432679146528244, | |
| "eval_runtime": 41.1344, | |
| "eval_samples_per_second": 4.133, | |
| "eval_steps_per_second": 2.066, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.9608938547486034, | |
| "grad_norm": 0.7860766053199768, | |
| "learning_rate": 2.7120343011071138e-08, | |
| "loss": 0.1492, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.9857231533209188, | |
| "grad_norm": 0.7165619134902954, | |
| "learning_rate": 4.599986085573882e-09, | |
| "loss": 0.138, | |
| "step": 800 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 806, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.4861828204940288e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |