{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 7.18562874251497,
  "eval_steps": 500,
  "global_step": 600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.11976047904191617,
      "grad_norm": 2.8749659061431885,
      "learning_rate": 0.0002,
      "loss": 1.4673,
      "step": 10
    },
    {
      "epoch": 0.23952095808383234,
      "grad_norm": 3.4398715496063232,
      "learning_rate": 0.0002,
      "loss": 1.0019,
      "step": 20
    },
    {
      "epoch": 0.3592814371257485,
      "grad_norm": 1.9059951305389404,
      "learning_rate": 0.0002,
      "loss": 1.0575,
      "step": 30
    },
    {
      "epoch": 0.47904191616766467,
      "grad_norm": 4.149394989013672,
      "learning_rate": 0.0002,
      "loss": 0.8164,
      "step": 40
    },
    {
      "epoch": 0.5988023952095808,
      "grad_norm": 1.4866076707839966,
      "learning_rate": 0.0002,
      "loss": 0.8684,
      "step": 50
    },
    {
      "epoch": 0.718562874251497,
      "grad_norm": 3.1927452087402344,
      "learning_rate": 0.0002,
      "loss": 0.8016,
      "step": 60
    },
    {
      "epoch": 0.8383233532934131,
      "grad_norm": 1.1162314414978027,
      "learning_rate": 0.0002,
      "loss": 0.6809,
      "step": 70
    },
    {
      "epoch": 0.9580838323353293,
      "grad_norm": 2.829102039337158,
      "learning_rate": 0.0002,
      "loss": 0.6962,
      "step": 80
    },
    {
      "epoch": 1.0778443113772456,
      "grad_norm": 1.2642532587051392,
      "learning_rate": 0.0002,
      "loss": 0.5769,
      "step": 90
    },
    {
      "epoch": 1.1976047904191618,
      "grad_norm": 1.3799452781677246,
      "learning_rate": 0.0002,
      "loss": 0.4128,
      "step": 100
    },
    {
      "epoch": 1.3173652694610778,
      "grad_norm": 2.3143367767333984,
      "learning_rate": 0.0002,
      "loss": 0.6441,
      "step": 110
    },
    {
      "epoch": 1.437125748502994,
      "grad_norm": 1.085919976234436,
      "learning_rate": 0.0002,
      "loss": 0.393,
      "step": 120
    },
    {
      "epoch": 1.55688622754491,
      "grad_norm": 1.2423957586288452,
      "learning_rate": 0.0002,
      "loss": 0.5582,
      "step": 130
    },
    {
      "epoch": 1.6766467065868262,
      "grad_norm": 1.2964059114456177,
      "learning_rate": 0.0002,
      "loss": 0.4276,
      "step": 140
    },
    {
      "epoch": 1.7964071856287425,
      "grad_norm": 1.8397400379180908,
      "learning_rate": 0.0002,
      "loss": 0.6162,
      "step": 150
    },
    {
      "epoch": 1.9161676646706587,
      "grad_norm": 1.0209627151489258,
      "learning_rate": 0.0002,
      "loss": 0.4565,
      "step": 160
    },
    {
      "epoch": 2.035928143712575,
      "grad_norm": 0.8725757598876953,
      "learning_rate": 0.0002,
      "loss": 0.4807,
      "step": 170
    },
    {
      "epoch": 2.155688622754491,
      "grad_norm": 1.1269447803497314,
      "learning_rate": 0.0002,
      "loss": 0.3895,
      "step": 180
    },
    {
      "epoch": 2.2754491017964074,
      "grad_norm": 1.528011679649353,
      "learning_rate": 0.0002,
      "loss": 0.3553,
      "step": 190
    },
    {
      "epoch": 2.3952095808383236,
      "grad_norm": 0.8296527862548828,
      "learning_rate": 0.0002,
      "loss": 0.3516,
      "step": 200
    },
    {
      "epoch": 2.5149700598802394,
      "grad_norm": 1.301917552947998,
      "learning_rate": 0.0002,
      "loss": 0.3918,
      "step": 210
    },
    {
      "epoch": 2.6347305389221556,
      "grad_norm": 0.8420801758766174,
      "learning_rate": 0.0002,
      "loss": 0.3497,
      "step": 220
    },
    {
      "epoch": 2.754491017964072,
      "grad_norm": 1.1430580615997314,
      "learning_rate": 0.0002,
      "loss": 0.4311,
      "step": 230
    },
    {
      "epoch": 2.874251497005988,
      "grad_norm": 0.9065356850624084,
      "learning_rate": 0.0002,
      "loss": 0.3551,
      "step": 240
    },
    {
      "epoch": 2.9940119760479043,
      "grad_norm": 1.1302285194396973,
      "learning_rate": 0.0002,
      "loss": 0.3513,
      "step": 250
    },
    {
      "epoch": 3.1137724550898205,
      "grad_norm": 0.9960314631462097,
      "learning_rate": 0.0002,
      "loss": 0.3124,
      "step": 260
    },
    {
      "epoch": 3.2335329341317367,
      "grad_norm": 1.680296778678894,
      "learning_rate": 0.0002,
      "loss": 0.3065,
      "step": 270
    },
    {
      "epoch": 3.3532934131736525,
      "grad_norm": 1.1697853803634644,
      "learning_rate": 0.0002,
      "loss": 0.3009,
      "step": 280
    },
    {
      "epoch": 3.4730538922155687,
      "grad_norm": 1.9219907522201538,
      "learning_rate": 0.0002,
      "loss": 0.2802,
      "step": 290
    },
    {
      "epoch": 3.592814371257485,
      "grad_norm": 1.384773850440979,
      "learning_rate": 0.0002,
      "loss": 0.3419,
      "step": 300
    },
    {
      "epoch": 3.712574850299401,
      "grad_norm": 1.3956997394561768,
      "learning_rate": 0.0002,
      "loss": 0.3172,
      "step": 310
    },
    {
      "epoch": 3.8323353293413174,
      "grad_norm": 1.058669924736023,
      "learning_rate": 0.0002,
      "loss": 0.3723,
      "step": 320
    },
    {
      "epoch": 3.9520958083832336,
      "grad_norm": 1.5626955032348633,
      "learning_rate": 0.0002,
      "loss": 0.325,
      "step": 330
    },
    {
      "epoch": 4.07185628742515,
      "grad_norm": 1.2782564163208008,
      "learning_rate": 0.0002,
      "loss": 0.2912,
      "step": 340
    },
    {
      "epoch": 4.191616766467066,
      "grad_norm": 1.0916423797607422,
      "learning_rate": 0.0002,
      "loss": 0.233,
      "step": 350
    },
    {
      "epoch": 4.311377245508982,
      "grad_norm": 0.8613762855529785,
      "learning_rate": 0.0002,
      "loss": 0.3058,
      "step": 360
    },
    {
      "epoch": 4.431137724550898,
      "grad_norm": 0.6293674111366272,
      "learning_rate": 0.0002,
      "loss": 0.2334,
      "step": 370
    },
    {
      "epoch": 4.550898203592815,
      "grad_norm": 1.6042566299438477,
      "learning_rate": 0.0002,
      "loss": 0.3287,
      "step": 380
    },
    {
      "epoch": 4.6706586826347305,
      "grad_norm": 0.8140411376953125,
      "learning_rate": 0.0002,
      "loss": 0.2372,
      "step": 390
    },
    {
      "epoch": 4.790419161676647,
      "grad_norm": 1.5365833044052124,
      "learning_rate": 0.0002,
      "loss": 0.3266,
      "step": 400
    },
    {
      "epoch": 4.910179640718563,
      "grad_norm": 0.9418448805809021,
      "learning_rate": 0.0002,
      "loss": 0.2513,
      "step": 410
    },
    {
      "epoch": 5.029940119760479,
      "grad_norm": 0.6695829033851624,
      "learning_rate": 0.0002,
      "loss": 0.2688,
      "step": 420
    },
    {
      "epoch": 5.149700598802395,
      "grad_norm": 0.628887414932251,
      "learning_rate": 0.0002,
      "loss": 0.2149,
      "step": 430
    },
    {
      "epoch": 5.269461077844311,
      "grad_norm": 0.964766263961792,
      "learning_rate": 0.0002,
      "loss": 0.2606,
      "step": 440
    },
    {
      "epoch": 5.389221556886228,
      "grad_norm": 0.5990360975265503,
      "learning_rate": 0.0002,
      "loss": 0.2364,
      "step": 450
    },
    {
      "epoch": 5.508982035928144,
      "grad_norm": 0.8189520835876465,
      "learning_rate": 0.0002,
      "loss": 0.2857,
      "step": 460
    },
    {
      "epoch": 5.62874251497006,
      "grad_norm": 0.5583224296569824,
      "learning_rate": 0.0002,
      "loss": 0.2414,
      "step": 470
    },
    {
      "epoch": 5.748502994011976,
      "grad_norm": 0.7695009708404541,
      "learning_rate": 0.0002,
      "loss": 0.2434,
      "step": 480
    },
    {
      "epoch": 5.868263473053892,
      "grad_norm": 0.3456665575504303,
      "learning_rate": 0.0002,
      "loss": 0.2597,
      "step": 490
    },
    {
      "epoch": 5.9880239520958085,
      "grad_norm": 0.7596808671951294,
      "learning_rate": 0.0002,
      "loss": 0.2983,
      "step": 500
    },
    {
      "epoch": 6.107784431137724,
      "grad_norm": 0.9513673782348633,
      "learning_rate": 0.0002,
      "loss": 0.2139,
      "step": 510
    },
    {
      "epoch": 6.227544910179641,
      "grad_norm": 1.0958881378173828,
      "learning_rate": 0.0002,
      "loss": 0.2211,
      "step": 520
    },
    {
      "epoch": 6.347305389221557,
      "grad_norm": 0.6882690787315369,
      "learning_rate": 0.0002,
      "loss": 0.2347,
      "step": 530
    },
    {
      "epoch": 6.467065868263473,
      "grad_norm": 1.0562934875488281,
      "learning_rate": 0.0002,
      "loss": 0.2276,
      "step": 540
    },
    {
      "epoch": 6.586826347305389,
      "grad_norm": 1.1535356044769287,
      "learning_rate": 0.0002,
      "loss": 0.2469,
      "step": 550
    },
    {
      "epoch": 6.706586826347305,
      "grad_norm": 0.9436424970626831,
      "learning_rate": 0.0002,
      "loss": 0.2713,
      "step": 560
    },
    {
      "epoch": 6.826347305389222,
      "grad_norm": 1.0283164978027344,
      "learning_rate": 0.0002,
      "loss": 0.2449,
      "step": 570
    },
    {
      "epoch": 6.946107784431137,
      "grad_norm": 1.3945902585983276,
      "learning_rate": 0.0002,
      "loss": 0.2193,
      "step": 580
    },
    {
      "epoch": 7.065868263473054,
      "grad_norm": 0.5662649869918823,
      "learning_rate": 0.0002,
      "loss": 0.2415,
      "step": 590
    },
    {
      "epoch": 7.18562874251497,
      "grad_norm": 0.4687662720680237,
      "learning_rate": 0.0002,
      "loss": 0.1792,
      "step": 600
    }
  ],
  "logging_steps": 10,
  "max_steps": 600,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 8,
  "save_steps": 100,
  "total_flos": 1753733775298560.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}