{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 808,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.024752475247524754,
      "grad_norm": 1.4194453954696655,
      "learning_rate": 2.1951219512195125e-06,
      "loss": 0.4754,
      "step": 10
    },
    {
      "epoch": 0.04950495049504951,
      "grad_norm": 1.0042498111724854,
      "learning_rate": 4.634146341463416e-06,
      "loss": 0.3587,
      "step": 20
    },
    {
      "epoch": 0.07425742574257425,
      "grad_norm": 0.6140494346618652,
      "learning_rate": 7.0731707317073175e-06,
      "loss": 0.2705,
      "step": 30
    },
    {
      "epoch": 0.09900990099009901,
      "grad_norm": 0.4005405604839325,
      "learning_rate": 9.51219512195122e-06,
      "loss": 0.2262,
      "step": 40
    },
    {
      "epoch": 0.12376237623762376,
      "grad_norm": 0.5535389184951782,
      "learning_rate": 9.99731595284969e-06,
      "loss": 0.2024,
      "step": 50
    },
    {
      "epoch": 0.1485148514851485,
      "grad_norm": 0.45378637313842773,
      "learning_rate": 9.986416949868223e-06,
      "loss": 0.2047,
      "step": 60
    },
    {
      "epoch": 0.17326732673267325,
      "grad_norm": 0.46589696407318115,
      "learning_rate": 9.967153506514677e-06,
      "loss": 0.1903,
      "step": 70
    },
    {
      "epoch": 0.19801980198019803,
      "grad_norm": 0.4678415358066559,
      "learning_rate": 9.939557936156527e-06,
      "loss": 0.1865,
      "step": 80
    },
    {
      "epoch": 0.22277227722772278,
      "grad_norm": 0.5036286115646362,
      "learning_rate": 9.903676528846353e-06,
      "loss": 0.1973,
      "step": 90
    },
    {
      "epoch": 0.24752475247524752,
      "grad_norm": 0.7928675413131714,
      "learning_rate": 9.859569473672816e-06,
      "loss": 0.1838,
      "step": 100
    },
    {
      "epoch": 0.2722772277227723,
      "grad_norm": 0.4357012212276459,
      "learning_rate": 9.807310757796782e-06,
      "loss": 0.1902,
      "step": 110
    },
    {
      "epoch": 0.297029702970297,
      "grad_norm": 0.5225412249565125,
      "learning_rate": 9.746988042341907e-06,
      "loss": 0.1715,
      "step": 120
    },
    {
      "epoch": 0.3217821782178218,
      "grad_norm": 0.46918612718582153,
      "learning_rate": 9.678702515347937e-06,
      "loss": 0.1843,
      "step": 130
    },
    {
      "epoch": 0.3465346534653465,
      "grad_norm": 0.5186185836791992,
      "learning_rate": 9.602568722033325e-06,
      "loss": 0.1847,
      "step": 140
    },
    {
      "epoch": 0.3712871287128713,
      "grad_norm": 0.525906503200531,
      "learning_rate": 9.518714372651922e-06,
      "loss": 0.1836,
      "step": 150
    },
    {
      "epoch": 0.39603960396039606,
      "grad_norm": 0.47885432839393616,
      "learning_rate": 9.427280128266049e-06,
      "loss": 0.1721,
      "step": 160
    },
    {
      "epoch": 0.4207920792079208,
      "grad_norm": 0.45440953969955444,
      "learning_rate": 9.328419364795295e-06,
      "loss": 0.1805,
      "step": 170
    },
    {
      "epoch": 0.44554455445544555,
      "grad_norm": 0.47786083817481995,
      "learning_rate": 9.222297915736835e-06,
      "loss": 0.1802,
      "step": 180
    },
    {
      "epoch": 0.47029702970297027,
      "grad_norm": 0.40427038073539734,
      "learning_rate": 9.109093793988866e-06,
      "loss": 0.1747,
      "step": 190
    },
    {
      "epoch": 0.49504950495049505,
      "grad_norm": 0.5424014329910278,
      "learning_rate": 8.988996893243742e-06,
      "loss": 0.1734,
      "step": 200
    },
    {
      "epoch": 0.5198019801980198,
      "grad_norm": 0.636466383934021,
      "learning_rate": 8.862208669451748e-06,
      "loss": 0.1726,
      "step": 210
    },
    {
      "epoch": 0.5445544554455446,
      "grad_norm": 0.45117199420928955,
      "learning_rate": 8.728941802889816e-06,
      "loss": 0.1742,
      "step": 220
    },
    {
      "epoch": 0.5693069306930693,
      "grad_norm": 0.459605872631073,
      "learning_rate": 8.589419841402046e-06,
      "loss": 0.1812,
      "step": 230
    },
    {
      "epoch": 0.594059405940594,
      "grad_norm": 0.4333467483520508,
      "learning_rate": 8.443876825410488e-06,
      "loss": 0.1745,
      "step": 240
    },
    {
      "epoch": 0.6188118811881188,
      "grad_norm": 0.39373937249183655,
      "learning_rate": 8.292556895325195e-06,
      "loss": 0.165,
      "step": 250
    },
    {
      "epoch": 0.6435643564356436,
      "grad_norm": 0.4536997377872467,
      "learning_rate": 8.135713882012102e-06,
      "loss": 0.1669,
      "step": 260
    },
    {
      "epoch": 0.6683168316831684,
      "grad_norm": 0.4184967577457428,
      "learning_rate": 7.973610881005702e-06,
      "loss": 0.1628,
      "step": 270
    },
    {
      "epoch": 0.693069306930693,
      "grad_norm": 0.38848263025283813,
      "learning_rate": 7.80651981118075e-06,
      "loss": 0.1596,
      "step": 280
    },
    {
      "epoch": 0.7178217821782178,
      "grad_norm": 0.46365565061569214,
      "learning_rate": 7.634720958623287e-06,
      "loss": 0.1656,
      "step": 290
    },
    {
      "epoch": 0.7425742574257426,
      "grad_norm": 0.41299328207969666,
      "learning_rate": 7.458502506466146e-06,
      "loss": 0.1765,
      "step": 300
    },
    {
      "epoch": 0.7673267326732673,
      "grad_norm": 0.41087886691093445,
      "learning_rate": 7.278160051477574e-06,
      "loss": 0.1585,
      "step": 310
    },
    {
      "epoch": 0.7920792079207921,
      "grad_norm": 0.3603922128677368,
      "learning_rate": 7.09399610821391e-06,
      "loss": 0.1674,
      "step": 320
    },
    {
      "epoch": 0.8168316831683168,
      "grad_norm": 0.44449886679649353,
      "learning_rate": 6.906319601568039e-06,
      "loss": 0.1753,
      "step": 330
    },
    {
      "epoch": 0.8415841584158416,
      "grad_norm": 0.4169905483722687,
      "learning_rate": 6.715445348564863e-06,
      "loss": 0.171,
      "step": 340
    },
    {
      "epoch": 0.8663366336633663,
      "grad_norm": 0.4144372344017029,
      "learning_rate": 6.521693530273046e-06,
      "loss": 0.1678,
      "step": 350
    },
    {
      "epoch": 0.8910891089108911,
      "grad_norm": 0.35955941677093506,
      "learning_rate": 6.325389154718865e-06,
      "loss": 0.1608,
      "step": 360
    },
    {
      "epoch": 0.9158415841584159,
      "grad_norm": 0.49849727749824524,
      "learning_rate": 6.126861511703119e-06,
      "loss": 0.1648,
      "step": 370
    },
    {
      "epoch": 0.9405940594059405,
      "grad_norm": 0.40356525778770447,
      "learning_rate": 5.926443620435572e-06,
      "loss": 0.1603,
      "step": 380
    },
    {
      "epoch": 0.9653465346534653,
      "grad_norm": 0.3695359230041504,
      "learning_rate": 5.724471670913545e-06,
      "loss": 0.1553,
      "step": 390
    },
    {
      "epoch": 0.9900990099009901,
      "grad_norm": 0.4594876766204834,
      "learning_rate": 5.521284459981662e-06,
      "loss": 0.1623,
      "step": 400
    },
    {
      "epoch": 1.0148514851485149,
      "grad_norm": 0.39757996797561646,
      "learning_rate": 5.317222823018775e-06,
      "loss": 0.1223,
      "step": 410
    },
    {
      "epoch": 1.0396039603960396,
      "grad_norm": 0.4017312228679657,
      "learning_rate": 5.112629062205341e-06,
      "loss": 0.0924,
      "step": 420
    },
    {
      "epoch": 1.0643564356435644,
      "grad_norm": 0.4011599123477936,
      "learning_rate": 4.907846372330326e-06,
      "loss": 0.0949,
      "step": 430
    },
    {
      "epoch": 1.0891089108910892,
      "grad_norm": 0.397612601518631,
      "learning_rate": 4.7032182651008204e-06,
      "loss": 0.0875,
      "step": 440
    },
    {
      "epoch": 1.113861386138614,
      "grad_norm": 0.5288234949111938,
      "learning_rate": 4.4990879929200145e-06,
      "loss": 0.0958,
      "step": 450
    },
    {
      "epoch": 1.1386138613861387,
      "grad_norm": 0.36887308955192566,
      "learning_rate": 4.295797973100174e-06,
      "loss": 0.0906,
      "step": 460
    },
    {
      "epoch": 1.1633663366336633,
      "grad_norm": 0.42459458112716675,
      "learning_rate": 4.093689213476408e-06,
      "loss": 0.0912,
      "step": 470
    },
    {
      "epoch": 1.188118811881188,
      "grad_norm": 0.43703174591064453,
      "learning_rate": 3.893100740384766e-06,
      "loss": 0.0945,
      "step": 480
    },
    {
      "epoch": 1.2128712871287128,
      "grad_norm": 0.4258781969547272,
      "learning_rate": 3.6943690299642055e-06,
      "loss": 0.0965,
      "step": 490
    },
    {
      "epoch": 1.2376237623762376,
      "grad_norm": 0.48606401681900024,
      "learning_rate": 3.4978274437363447e-06,
      "loss": 0.092,
      "step": 500
    },
    {
      "epoch": 1.2623762376237624,
      "grad_norm": 0.4267478287220001,
      "learning_rate": 3.3038056694098485e-06,
      "loss": 0.0949,
      "step": 510
    },
    {
      "epoch": 1.2871287128712872,
      "grad_norm": 0.5156121850013733,
      "learning_rate": 3.112629167847409e-06,
      "loss": 0.0965,
      "step": 520
    },
    {
      "epoch": 1.311881188118812,
      "grad_norm": 0.4543267488479614,
      "learning_rate": 2.9246186271230335e-06,
      "loss": 0.094,
      "step": 530
    },
    {
      "epoch": 1.3366336633663367,
      "grad_norm": 0.3953835070133209,
      "learning_rate": 2.7400894245854327e-06,
      "loss": 0.0958,
      "step": 540
    },
    {
      "epoch": 1.3613861386138613,
      "grad_norm": 0.40146708488464355,
      "learning_rate": 2.5593510978298487e-06,
      "loss": 0.0941,
      "step": 550
    },
    {
      "epoch": 1.386138613861386,
      "grad_norm": 0.39063599705696106,
      "learning_rate": 2.3827068254657493e-06,
      "loss": 0.0899,
      "step": 560
    },
    {
      "epoch": 1.4108910891089108,
      "grad_norm": 0.38410985469818115,
      "learning_rate": 2.2104529185513807e-06,
      "loss": 0.0972,
      "step": 570
    },
    {
      "epoch": 1.4356435643564356,
      "grad_norm": 0.4306909143924713,
      "learning_rate": 2.0428783235482423e-06,
      "loss": 0.0968,
      "step": 580
    },
    {
      "epoch": 1.4603960396039604,
      "grad_norm": 0.43735161423683167,
      "learning_rate": 1.8802641376292913e-06,
      "loss": 0.1037,
      "step": 590
    },
    {
      "epoch": 1.4851485148514851,
      "grad_norm": 0.38794925808906555,
      "learning_rate": 1.722883137153874e-06,
      "loss": 0.0936,
      "step": 600
    },
    {
      "epoch": 1.50990099009901,
      "grad_norm": 0.48267433047294617,
      "learning_rate": 1.5709993201003827e-06,
      "loss": 0.097,
      "step": 610
    },
    {
      "epoch": 1.5346534653465347,
      "grad_norm": 0.3811907470226288,
      "learning_rate": 1.424867463224147e-06,
      "loss": 0.0915,
      "step": 620
    },
    {
      "epoch": 1.5594059405940595,
      "grad_norm": 0.4866475760936737,
      "learning_rate": 1.2847326946834427e-06,
      "loss": 0.1023,
      "step": 630
    },
    {
      "epoch": 1.5841584158415842,
      "grad_norm": 0.4222099483013153,
      "learning_rate": 1.1508300828504682e-06,
      "loss": 0.0991,
      "step": 640
    },
    {
      "epoch": 1.608910891089109,
      "grad_norm": 0.37634769082069397,
      "learning_rate": 1.0233842419970773e-06,
      "loss": 0.0899,
      "step": 650
    },
    {
      "epoch": 1.6336633663366338,
      "grad_norm": 0.4965452551841736,
      "learning_rate": 9.026089555166745e-07,
      "loss": 0.1001,
      "step": 660
    },
    {
      "epoch": 1.6584158415841586,
      "grad_norm": 0.3864215016365051,
      "learning_rate": 7.887068173143325e-07,
      "loss": 0.0994,
      "step": 670
    },
    {
      "epoch": 1.6831683168316833,
      "grad_norm": 0.4128170311450958,
      "learning_rate": 6.818688919666461e-07,
      "loss": 0.0989,
      "step": 680
    },
    {
      "epoch": 1.7079207920792079,
      "grad_norm": 0.3718600869178772,
      "learning_rate": 5.822743942214026e-07,
      "loss": 0.0944,
      "step": 690
    },
    {
      "epoch": 1.7326732673267327,
      "grad_norm": 0.41322892904281616,
      "learning_rate": 4.900903883747021e-07,
      "loss": 0.0929,
      "step": 700
    },
    {
      "epoch": 1.7574257425742574,
      "grad_norm": 0.3393838107585907,
      "learning_rate": 4.054715080297722e-07,
      "loss": 0.097,
      "step": 710
    },
    {
      "epoch": 1.7821782178217822,
      "grad_norm": 0.3705613911151886,
      "learning_rate": 3.285596967076055e-07,
      "loss": 0.0956,
      "step": 720
    },
    {
      "epoch": 1.806930693069307,
      "grad_norm": 0.47960299253463745,
      "learning_rate": 2.594839697445017e-07,
      "loss": 0.1008,
      "step": 730
    },
    {
      "epoch": 1.8316831683168315,
      "grad_norm": 0.4418397545814514,
      "learning_rate": 1.983601978759292e-07,
      "loss": 0.1042,
      "step": 740
    },
    {
      "epoch": 1.8564356435643563,
      "grad_norm": 0.3197237551212311,
      "learning_rate": 1.4529091286973994e-07,
      "loss": 0.0925,
      "step": 750
    },
    {
      "epoch": 1.881188118811881,
      "grad_norm": 0.44026193022727966,
      "learning_rate": 1.0036513553476013e-07,
      "loss": 0.1043,
      "step": 760
    },
    {
      "epoch": 1.9059405940594059,
      "grad_norm": 0.5169320702552795,
      "learning_rate": 6.365822639327724e-08,
      "loss": 0.1028,
      "step": 770
    },
    {
      "epoch": 1.9306930693069306,
      "grad_norm": 0.5262919068336487,
      "learning_rate": 3.523175926790745e-08,
      "loss": 0.0945,
      "step": 780
    },
    {
      "epoch": 1.9554455445544554,
      "grad_norm": 0.374805748462677,
      "learning_rate": 1.513341799488921e-08,
      "loss": 0.1013,
      "step": 790
    },
    {
      "epoch": 1.9801980198019802,
      "grad_norm": 0.3962545096874237,
      "learning_rate": 3.3969164370722953e-09,
      "loss": 0.0913,
      "step": 800
    },
    {
      "epoch": 2.0,
      "step": 808,
      "total_flos": 576827298414592.0,
      "train_loss": 0.0689804450710221,
      "train_runtime": 41658.2625,
      "train_samples_per_second": 0.155,
      "train_steps_per_second": 0.019
    }
  ],
  "logging_steps": 10,
  "max_steps": 808,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 300,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 576827298414592.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}
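
The file above is the `trainer_state.json` that the Hugging Face `Trainer` writes next to each checkpoint; the `log_history` array holds one record per logging step (every 10 steps here), plus a final summary record keyed by `train_loss` rather than `loss`. A minimal sketch for inspecting it, assuming the file sits in the working directory (the path and the plotting choices are illustrative, not part of the state file itself):

```python
# Minimal sketch: load a trainer_state.json and plot the logged training
# loss and learning-rate schedule. "trainer_state.json" is an assumed
# path -- point it at whichever checkpoint directory you trained into.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only per-step records; the final summary entry has no "loss" key.
logs = [entry for entry in state["log_history"] if "loss" in entry]
steps = [entry["step"] for entry in logs]
losses = [entry["loss"] for entry in logs]
lrs = [entry["learning_rate"] for entry in logs]

fig, (ax_loss, ax_lr) = plt.subplots(1, 2, figsize=(10, 4))
ax_loss.plot(steps, losses)
ax_loss.set(xlabel="step", ylabel="training loss")
ax_lr.plot(steps, lrs)
ax_lr.set(xlabel="step", ylabel="learning rate")
fig.tight_layout()
plt.show()
```

Plotted this way, the run shows the expected shape for this configuration: a short linear warmup to the 1e-5 peak around step 50, a cosine-style decay toward zero by step 808, and a drop in loss from roughly 0.16 to 0.09 at the epoch boundary near step 405, consistent with the second pass over the same training data.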