| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 750, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.04, |
| "grad_norm": 15.310038127586646, |
| "learning_rate": 2.8768699654775607e-08, |
| "loss": 0.5928, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 14.307371238653309, |
| "learning_rate": 5.7537399309551214e-08, |
| "loss": 0.5886, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 11.030093905097976, |
| "learning_rate": 8.630609896432682e-08, |
| "loss": 0.5803, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 5.993337842833648, |
| "learning_rate": 1.1507479861910243e-07, |
| "loss": 0.548, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 3.710083568844066, |
| "learning_rate": 1.4384349827387802e-07, |
| "loss": 0.5181, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 1.9705824009872897, |
| "learning_rate": 1.7261219792865363e-07, |
| "loss": 0.4924, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 1.2663369930235067, |
| "learning_rate": 2.0138089758342927e-07, |
| "loss": 0.4774, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 1.3138855186585094, |
| "learning_rate": 2.3014959723820486e-07, |
| "loss": 0.4606, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 1.1467390567333238, |
| "learning_rate": 2.5891829689298047e-07, |
| "loss": 0.4512, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 1.0129633422067545, |
| "learning_rate": 2.8768699654775605e-07, |
| "loss": 0.438, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 1.3367425686963006, |
| "learning_rate": 3.164556962025317e-07, |
| "loss": 0.4272, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.9911516620221612, |
| "learning_rate": 3.4522439585730727e-07, |
| "loss": 0.4239, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 1.1381388271727326, |
| "learning_rate": 3.739930955120829e-07, |
| "loss": 0.4188, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 1.494070752193929, |
| "learning_rate": 4.0276179516685854e-07, |
| "loss": 0.4155, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 1.6657782892459603, |
| "learning_rate": 4.315304948216341e-07, |
| "loss": 0.4192, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 1.3914607726604022, |
| "learning_rate": 4.602991944764097e-07, |
| "loss": 0.4118, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 1.0670270762955518, |
| "learning_rate": 4.890678941311853e-07, |
| "loss": 0.4118, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 1.1607821737201103, |
| "learning_rate": 5.178365937859609e-07, |
| "loss": 0.406, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 1.0877519040830297, |
| "learning_rate": 5.466052934407366e-07, |
| "loss": 0.4037, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 1.5122893151477952, |
| "learning_rate": 5.753739930955121e-07, |
| "loss": 0.3994, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 1.0149081002178446, |
| "learning_rate": 6.041426927502877e-07, |
| "loss": 0.3965, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 1.054174710975304, |
| "learning_rate": 6.329113924050634e-07, |
| "loss": 0.4021, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 1.3289995110739046, |
| "learning_rate": 6.61680092059839e-07, |
| "loss": 0.3966, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 1.2045960771734698, |
| "learning_rate": 6.904487917146145e-07, |
| "loss": 0.3912, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 1.4483752652421882, |
| "learning_rate": 7.192174913693902e-07, |
| "loss": 0.3957, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.3906799554824829, |
| "eval_runtime": 23.4022, |
| "eval_samples_per_second": 286.853, |
| "eval_steps_per_second": 1.154, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.04, |
| "grad_norm": 1.421137606048492, |
| "learning_rate": 7.479861910241658e-07, |
| "loss": 0.3822, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.08, |
| "grad_norm": 1.1569390486775748, |
| "learning_rate": 7.767548906789415e-07, |
| "loss": 0.382, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.12, |
| "grad_norm": 1.2966147842235234, |
| "learning_rate": 8.055235903337171e-07, |
| "loss": 0.3821, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.16, |
| "grad_norm": 1.4954076930003743, |
| "learning_rate": 8.342922899884925e-07, |
| "loss": 0.3828, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 1.175833777681018, |
| "learning_rate": 8.630609896432681e-07, |
| "loss": 0.3776, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.24, |
| "grad_norm": 1.4737420203396732, |
| "learning_rate": 8.918296892980438e-07, |
| "loss": 0.381, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.28, |
| "grad_norm": 1.0411173755000598, |
| "learning_rate": 9.205983889528194e-07, |
| "loss": 0.373, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.32, |
| "grad_norm": 1.4170827896859917, |
| "learning_rate": 9.493670886075951e-07, |
| "loss": 0.3744, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.3599999999999999, |
| "grad_norm": 1.14952148588396, |
| "learning_rate": 9.781357882623706e-07, |
| "loss": 0.3788, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 1.0100864705742274, |
| "learning_rate": 1.0069044879171462e-06, |
| "loss": 0.3751, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.44, |
| "grad_norm": 1.4380885723117884, |
| "learning_rate": 1.0356731875719219e-06, |
| "loss": 0.3789, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.48, |
| "grad_norm": 1.313223935969843, |
| "learning_rate": 1.0644418872266975e-06, |
| "loss": 0.3762, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.52, |
| "grad_norm": 1.6074162427330996, |
| "learning_rate": 1.0932105868814731e-06, |
| "loss": 0.3719, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.56, |
| "grad_norm": 1.0515537415251632, |
| "learning_rate": 1.1219792865362486e-06, |
| "loss": 0.3704, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 1.2284758699860527, |
| "learning_rate": 1.1507479861910242e-06, |
| "loss": 0.3706, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.6400000000000001, |
| "grad_norm": 1.3749774043001943, |
| "learning_rate": 1.1795166858457998e-06, |
| "loss": 0.3692, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.6800000000000002, |
| "grad_norm": 1.2659695321385707, |
| "learning_rate": 1.2082853855005755e-06, |
| "loss": 0.3692, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.72, |
| "grad_norm": 1.1440516628585093, |
| "learning_rate": 1.2370540851553511e-06, |
| "loss": 0.369, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.76, |
| "grad_norm": 1.225026971871428, |
| "learning_rate": 1.2658227848101267e-06, |
| "loss": 0.3699, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.8, |
| "grad_norm": 1.403959379498929, |
| "learning_rate": 1.2945914844649024e-06, |
| "loss": 0.3638, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.8399999999999999, |
| "grad_norm": 1.188372333305488, |
| "learning_rate": 1.323360184119678e-06, |
| "loss": 0.3684, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.88, |
| "grad_norm": 1.1033565733049102, |
| "learning_rate": 1.3521288837744534e-06, |
| "loss": 0.3688, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.92, |
| "grad_norm": 1.2385035527511736, |
| "learning_rate": 1.380897583429229e-06, |
| "loss": 0.3637, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.96, |
| "grad_norm": 1.2610897099257767, |
| "learning_rate": 1.4096662830840047e-06, |
| "loss": 0.3665, |
| "step": 490 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 1.31063382710149, |
| "learning_rate": 1.4384349827387804e-06, |
| "loss": 0.3655, |
| "step": 500 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.3691079914569855, |
| "eval_runtime": 22.9129, |
| "eval_samples_per_second": 292.979, |
| "eval_steps_per_second": 1.178, |
| "step": 500 |
| }, |
| { |
| "epoch": 2.04, |
| "grad_norm": 1.3941491496870813, |
| "learning_rate": 1.467203682393556e-06, |
| "loss": 0.3444, |
| "step": 510 |
| }, |
| { |
| "epoch": 2.08, |
| "grad_norm": 1.1940415585124986, |
| "learning_rate": 1.4959723820483316e-06, |
| "loss": 0.3396, |
| "step": 520 |
| }, |
| { |
| "epoch": 2.12, |
| "grad_norm": 1.6568570915918388, |
| "learning_rate": 1.5247410817031073e-06, |
| "loss": 0.3464, |
| "step": 530 |
| }, |
| { |
| "epoch": 2.16, |
| "grad_norm": 1.213779432504231, |
| "learning_rate": 1.553509781357883e-06, |
| "loss": 0.3446, |
| "step": 540 |
| }, |
| { |
| "epoch": 2.2, |
| "grad_norm": 1.044059581354644, |
| "learning_rate": 1.5822784810126585e-06, |
| "loss": 0.3435, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.24, |
| "grad_norm": 1.2395011495921882, |
| "learning_rate": 1.6110471806674342e-06, |
| "loss": 0.3477, |
| "step": 560 |
| }, |
| { |
| "epoch": 2.2800000000000002, |
| "grad_norm": 1.5404260685292859, |
| "learning_rate": 1.6398158803222094e-06, |
| "loss": 0.3438, |
| "step": 570 |
| }, |
| { |
| "epoch": 2.32, |
| "grad_norm": 1.1140407297709995, |
| "learning_rate": 1.668584579976985e-06, |
| "loss": 0.3443, |
| "step": 580 |
| }, |
| { |
| "epoch": 2.36, |
| "grad_norm": 1.4618040726027104, |
| "learning_rate": 1.6973532796317607e-06, |
| "loss": 0.3374, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.4, |
| "grad_norm": 1.5541271391197082, |
| "learning_rate": 1.7261219792865363e-06, |
| "loss": 0.3408, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.44, |
| "grad_norm": 1.7994149245724862, |
| "learning_rate": 1.754890678941312e-06, |
| "loss": 0.3425, |
| "step": 610 |
| }, |
| { |
| "epoch": 2.48, |
| "grad_norm": 1.1608293525012197, |
| "learning_rate": 1.7836593785960876e-06, |
| "loss": 0.3452, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.52, |
| "grad_norm": 1.4076591063804629, |
| "learning_rate": 1.8124280782508632e-06, |
| "loss": 0.3421, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.56, |
| "grad_norm": 1.2923646133856594, |
| "learning_rate": 1.8411967779056388e-06, |
| "loss": 0.338, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.6, |
| "grad_norm": 1.3157432345374322, |
| "learning_rate": 1.8699654775604145e-06, |
| "loss": 0.3407, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.64, |
| "grad_norm": 1.5572730770590508, |
| "learning_rate": 1.8987341772151901e-06, |
| "loss": 0.3466, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.68, |
| "grad_norm": 1.2136917873100426, |
| "learning_rate": 1.9275028768699655e-06, |
| "loss": 0.3462, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.7199999999999998, |
| "grad_norm": 1.140250439915797, |
| "learning_rate": 1.956271576524741e-06, |
| "loss": 0.3466, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.76, |
| "grad_norm": 1.164019763628723, |
| "learning_rate": 1.985040276179517e-06, |
| "loss": 0.3389, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.8, |
| "grad_norm": 1.1464479853572567, |
| "learning_rate": 2.0138089758342925e-06, |
| "loss": 0.3395, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.84, |
| "grad_norm": 1.2122963375611697, |
| "learning_rate": 2.042577675489068e-06, |
| "loss": 0.3459, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.88, |
| "grad_norm": 1.1460209246944402, |
| "learning_rate": 2.0713463751438437e-06, |
| "loss": 0.3397, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.92, |
| "grad_norm": 1.1775969440835221, |
| "learning_rate": 2.1001150747986194e-06, |
| "loss": 0.3403, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.96, |
| "grad_norm": 1.394530831390625, |
| "learning_rate": 2.128883774453395e-06, |
| "loss": 0.3418, |
| "step": 740 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 1.3333287115682235, |
| "learning_rate": 2.1576524741081706e-06, |
| "loss": 0.3351, |
| "step": 750 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.36160025000572205, |
| "eval_runtime": 22.6747, |
| "eval_samples_per_second": 296.057, |
| "eval_steps_per_second": 1.191, |
| "step": 750 |
| }, |
| { |
| "epoch": 3.0, |
| "step": 750, |
| "total_flos": 1256277934080000.0, |
| "train_loss": 0.38879382991790773, |
| "train_runtime": 4686.8691, |
| "train_samples_per_second": 81.64, |
| "train_steps_per_second": 0.16 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 750, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1256277934080000.0, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|