{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9992277992277993,
  "eval_steps": 100,
  "global_step": 647,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.015444015444015444,
      "grad_norm": 54.027117924566284,
      "learning_rate": 3.0769230769230774e-06,
      "loss": 8.2594,
      "mean_token_accuracy": 0.10601478479802609,
      "step": 10
    },
    {
      "epoch": 0.03088803088803089,
      "grad_norm": 36.2771924758843,
      "learning_rate": 6.153846153846155e-06,
      "loss": 8.0141,
      "mean_token_accuracy": 0.10835166163742542,
      "step": 20
    },
    {
      "epoch": 0.04633204633204633,
      "grad_norm": 32.30506084518261,
      "learning_rate": 9.230769230769232e-06,
      "loss": 7.1727,
      "mean_token_accuracy": 0.11615957953035831,
      "step": 30
    },
    {
      "epoch": 0.06177606177606178,
      "grad_norm": 12.644482204441966,
      "learning_rate": 1.230769230769231e-05,
      "loss": 6.1906,
      "mean_token_accuracy": 0.1327559869736433,
      "step": 40
    },
    {
      "epoch": 0.07722007722007722,
      "grad_norm": 10.568360790591178,
      "learning_rate": 1.5384615384615387e-05,
      "loss": 5.4813,
      "mean_token_accuracy": 0.17196358889341354,
      "step": 50
    },
    {
      "epoch": 0.09266409266409266,
      "grad_norm": 4.068292936765287,
      "learning_rate": 1.8461538461538465e-05,
      "loss": 4.7438,
      "mean_token_accuracy": 0.2288092628121376,
      "step": 60
    },
    {
      "epoch": 0.10810810810810811,
      "grad_norm": 3.5423142348559904,
      "learning_rate": 1.9996358021096174e-05,
      "loss": 4.2523,
      "mean_token_accuracy": 0.2767298325896263,
      "step": 70
    },
    {
      "epoch": 0.12355212355212356,
      "grad_norm": 2.6809187857623313,
      "learning_rate": 1.9967238104745695e-05,
      "loss": 3.9688,
      "mean_token_accuracy": 0.3063569128513336,
      "step": 80
    },
    {
      "epoch": 0.138996138996139,
      "grad_norm": 2.1859867880714967,
      "learning_rate": 1.9909083099891682e-05,
      "loss": 3.6148,
      "mean_token_accuracy": 0.3451215773820877,
      "step": 90
    },
    {
      "epoch": 0.15444015444015444,
      "grad_norm": 1.2979500528779593,
      "learning_rate": 1.9822062415120053e-05,
      "loss": 3.4617,
      "mean_token_accuracy": 0.36571358889341354,
      "step": 100
    },
    {
      "epoch": 0.15444015444015444,
      "eval_runtime": 0.3678,
      "eval_samples_per_second": 252.838,
      "eval_steps_per_second": 16.312,
      "step": 100
    },
    {
      "epoch": 0.16988416988416988,
      "grad_norm": 1.2047363502334179,
      "learning_rate": 1.9706429546259592e-05,
      "loss": 3.4285,
      "mean_token_accuracy": 0.3689419463276863,
      "step": 110
    },
    {
      "epoch": 0.18532818532818532,
      "grad_norm": 1.1540457712296708,
      "learning_rate": 1.9562521337935255e-05,
      "loss": 3.3438,
      "mean_token_accuracy": 0.37895588874816893,
      "step": 120
    },
    {
      "epoch": 0.20077220077220076,
      "grad_norm": 1.1919170437393742,
      "learning_rate": 1.939075700232209e-05,
      "loss": 3.3227,
      "mean_token_accuracy": 0.38107282519340513,
      "step": 130
    },
    {
      "epoch": 0.21621621621621623,
      "grad_norm": 1.0322594052165261,
      "learning_rate": 1.9191636897958123e-05,
      "loss": 3.3289,
      "mean_token_accuracy": 0.38092619478702544,
      "step": 140
    },
    {
      "epoch": 0.23166023166023167,
      "grad_norm": 1.072622203579115,
      "learning_rate": 1.8965741072173647e-05,
      "loss": 3.3309,
      "mean_token_accuracy": 0.3811278060078621,
      "step": 150
    },
    {
      "epoch": 0.2471042471042471,
      "grad_norm": 1.1060003860280698,
      "learning_rate": 1.8713727571382857e-05,
      "loss": 3.3234,
      "mean_token_accuracy": 0.38025770634412764,
      "step": 160
    },
    {
      "epoch": 0.2625482625482625,
      "grad_norm": 1.086808022765859,
      "learning_rate": 1.8436330524160048e-05,
      "loss": 3.318,
      "mean_token_accuracy": 0.38055351972579954,
      "step": 170
    },
    {
      "epoch": 0.277992277992278,
      "grad_norm": 1.1385524957672613,
      "learning_rate": 1.8134358002684504e-05,
      "loss": 3.2988,
      "mean_token_accuracy": 0.3846017554402351,
      "step": 180
    },
    {
      "epoch": 0.29343629343629346,
      "grad_norm": 1.0815980244836383,
      "learning_rate": 1.7808689668783762e-05,
      "loss": 3.2711,
      "mean_token_accuracy": 0.3869165450334549,
      "step": 190
    },
    {
      "epoch": 0.3088803088803089,
      "grad_norm": 1.1075978174473033,
      "learning_rate": 1.7460274211432463e-05,
      "loss": 3.3227,
      "mean_token_accuracy": 0.38340970128774643,
      "step": 200
    },
    {
      "epoch": 0.3088803088803089,
      "eval_runtime": 0.3689,
      "eval_samples_per_second": 252.111,
      "eval_steps_per_second": 16.265,
      "step": 200
    },
    {
      "epoch": 0.32432432432432434,
      "grad_norm": 1.2619430199962072,
      "learning_rate": 1.7090126583171503e-05,
      "loss": 3.3055,
      "mean_token_accuracy": 0.3856549397110939,
      "step": 210
    },
    {
      "epoch": 0.33976833976833976,
      "grad_norm": 1.087491804935316,
      "learning_rate": 1.6699325043497957e-05,
      "loss": 3.277,
      "mean_token_accuracy": 0.3866904929280281,
      "step": 220
    },
    {
      "epoch": 0.3552123552123552,
      "grad_norm": 1.1152659184947136,
      "learning_rate": 1.6289008017838447e-05,
      "loss": 3.2496,
      "mean_token_accuracy": 0.3880590170621872,
      "step": 230
    },
    {
      "epoch": 0.37065637065637064,
      "grad_norm": 1.078360383901834,
      "learning_rate": 1.586037078125607e-05,
      "loss": 3.2484,
      "mean_token_accuracy": 0.3903868407011032,
      "step": 240
    },
    {
      "epoch": 0.3861003861003861,
      "grad_norm": 1.049519316739431,
      "learning_rate": 1.54146619765513e-05,
      "loss": 3.252,
      "mean_token_accuracy": 0.3888410285115242,
      "step": 250
    },
    {
      "epoch": 0.4015444015444015,
      "grad_norm": 1.0716141072297978,
      "learning_rate": 1.4953179976899878e-05,
      "loss": 3.2891,
      "mean_token_accuracy": 0.3861253634095192,
      "step": 260
    },
    {
      "epoch": 0.416988416988417,
      "grad_norm": 1.1242108487806568,
      "learning_rate": 1.4477269103623496e-05,
      "loss": 3.2488,
      "mean_token_accuracy": 0.38970552384853363,
      "step": 270
    },
    {
      "epoch": 0.43243243243243246,
      "grad_norm": 1.056564744386044,
      "learning_rate": 1.3988315710111151e-05,
      "loss": 3.232,
      "mean_token_accuracy": 0.39249450266361235,
      "step": 280
    },
    {
      "epoch": 0.44787644787644787,
      "grad_norm": 1.070870946248766,
      "learning_rate": 1.3487744143298822e-05,
      "loss": 3.2512,
      "mean_token_accuracy": 0.3900837257504463,
      "step": 290
    },
    {
      "epoch": 0.46332046332046334,
      "grad_norm": 1.0749246835150736,
      "learning_rate": 1.2977012594472008e-05,
      "loss": 3.2504,
      "mean_token_accuracy": 0.38782380521297455,
      "step": 300
    },
    {
      "epoch": 0.46332046332046334,
      "eval_runtime": 0.3696,
      "eval_samples_per_second": 251.594,
      "eval_steps_per_second": 16.232,
      "step": 300
    },
    {
      "epoch": 0.47876447876447875,
      "grad_norm": 0.9656072799953122,
      "learning_rate": 1.2457608851477833e-05,
      "loss": 3.2687,
      "mean_token_accuracy": 0.3866996571421623,
      "step": 310
    },
    {
      "epoch": 0.4942084942084942,
      "grad_norm": 1.0178005185953467,
      "learning_rate": 1.1931045964720882e-05,
      "loss": 3.198,
      "mean_token_accuracy": 0.3944434255361557,
      "step": 320
    },
    {
      "epoch": 0.5096525096525096,
      "grad_norm": 1.074522942556136,
      "learning_rate": 1.1398857839567811e-05,
      "loss": 3.2355,
      "mean_token_accuracy": 0.39279997497797015,
      "step": 330
    },
    {
      "epoch": 0.525096525096525,
      "grad_norm": 1.0410557083388294,
      "learning_rate": 1.086259476800041e-05,
      "loss": 3.2195,
      "mean_token_accuracy": 0.39092436134815217,
      "step": 340
    },
    {
      "epoch": 0.5405405405405406,
      "grad_norm": 0.9762243696283865,
      "learning_rate": 1.0323818912533561e-05,
      "loss": 3.2445,
      "mean_token_accuracy": 0.38936176896095276,
      "step": 350
    },
    {
      "epoch": 0.555984555984556,
      "grad_norm": 1.0744349569161593,
      "learning_rate": 9.784099755553723e-06,
      "loss": 3.2625,
      "mean_token_accuracy": 0.39045931249856947,
      "step": 360
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.9548919869257485,
      "learning_rate": 9.245009527334243e-06,
      "loss": 3.2527,
      "mean_token_accuracy": 0.38955584168434143,
      "step": 370
    },
    {
      "epoch": 0.5868725868725869,
      "grad_norm": 1.0268956745848117,
      "learning_rate": 8.708118626045939e-06,
      "loss": 3.2535,
      "mean_token_accuracy": 0.3885325014591217,
      "step": 380
    },
    {
      "epoch": 0.6023166023166023,
      "grad_norm": 1.0496300516856243,
      "learning_rate": 8.174991043104662e-06,
      "loss": 3.2566,
      "mean_token_accuracy": 0.38984403312206267,
      "step": 390
    },
    {
      "epoch": 0.6177606177606177,
      "grad_norm": 1.043332590039097,
      "learning_rate": 7.647179807182125e-06,
      "loss": 3.2281,
      "mean_token_accuracy": 0.3923295482993126,
      "step": 400
    },
    {
      "epoch": 0.6177606177606177,
      "eval_runtime": 0.3673,
      "eval_samples_per_second": 253.192,
      "eval_steps_per_second": 16.335,
      "step": 400
    },
    {
      "epoch": 0.6332046332046332,
      "grad_norm": 1.061649815314641,
      "learning_rate": 7.126222460151719e-06,
      "loss": 3.2043,
      "mean_token_accuracy": 0.39413081407546996,
      "step": 410
    },
    {
      "epoch": 0.6486486486486487,
      "grad_norm": 1.0147970765923096,
      "learning_rate": 6.613636578148242e-06,
      "loss": 3.2316,
      "mean_token_accuracy": 0.3912878751754761,
      "step": 420
    },
    {
      "epoch": 0.6640926640926641,
      "grad_norm": 1.0058231917151492,
      "learning_rate": 6.110915350788846e-06,
      "loss": 3.2207,
      "mean_token_accuracy": 0.3918399602174759,
      "step": 430
    },
    {
      "epoch": 0.6795366795366795,
      "grad_norm": 1.0703873139225168,
      "learning_rate": 5.619523231433177e-06,
      "loss": 3.2566,
      "mean_token_accuracy": 0.38752417266368866,
      "step": 440
    },
    {
      "epoch": 0.694980694980695,
      "grad_norm": 1.123689944709063,
      "learning_rate": 5.140891671153797e-06,
      "loss": 3.2848,
      "mean_token_accuracy": 0.3864888772368431,
      "step": 450
    },
    {
      "epoch": 0.7104247104247104,
      "grad_norm": 1.0150276974982517,
      "learning_rate": 4.676414948843934e-06,
      "loss": 3.2078,
      "mean_token_accuracy": 0.3944342628121376,
      "step": 460
    },
    {
      "epoch": 0.7258687258687259,
      "grad_norm": 1.131883688390389,
      "learning_rate": 4.2274461096098085e-06,
      "loss": 3.2121,
      "mean_token_accuracy": 0.3935947135090828,
      "step": 470
    },
    {
      "epoch": 0.7413127413127413,
      "grad_norm": 1.0848803772523403,
      "learning_rate": 3.795293023279093e-06,
      "loss": 3.2309,
      "mean_token_accuracy": 0.3939241200685501,
      "step": 480
    },
    {
      "epoch": 0.7567567567567568,
      "grad_norm": 0.9911464983867319,
      "learning_rate": 3.3812145745073834e-06,
      "loss": 3.2645,
      "mean_token_accuracy": 0.3887524425983429,
      "step": 490
    },
    {
      "epoch": 0.7722007722007722,
      "grad_norm": 0.9468182557450763,
      "learning_rate": 2.9864169955810085e-06,
      "loss": 3.2348,
      "mean_token_accuracy": 0.3921034947037697,
      "step": 500
    },
    {
      "epoch": 0.7722007722007722,
      "eval_runtime": 0.3674,
      "eval_samples_per_second": 253.144,
      "eval_steps_per_second": 16.332,
      "step": 500
    },
    {
      "epoch": 0.7876447876447876,
      "grad_norm": 1.153751906362788,
      "learning_rate": 2.6120503525989894e-06,
      "loss": 3.2051,
      "mean_token_accuracy": 0.3940493628382683,
      "step": 510
    },
    {
      "epoch": 0.803088803088803,
      "grad_norm": 1.0023254669711654,
      "learning_rate": 2.25920519527003e-06,
      "loss": 3.2387,
      "mean_token_accuracy": 0.3898582592606544,
      "step": 520
    },
    {
      "epoch": 0.8185328185328186,
      "grad_norm": 1.018252078051325,
      "learning_rate": 1.9289093800839067e-06,
      "loss": 3.2488,
      "mean_token_accuracy": 0.39030425548553466,
      "step": 530
    },
    {
      "epoch": 0.833976833976834,
      "grad_norm": 1.0191281048265344,
      "learning_rate": 1.6221250761114803e-06,
      "loss": 3.2156,
      "mean_token_accuracy": 0.39363697469234465,
      "step": 540
    },
    {
      "epoch": 0.8494208494208494,
      "grad_norm": 1.0580017660782297,
      "learning_rate": 1.339745962155613e-06,
      "loss": 3.2449,
      "mean_token_accuracy": 0.3889385357499123,
      "step": 550
    },
    {
      "epoch": 0.8648648648648649,
      "grad_norm": 1.0638282009844648,
      "learning_rate": 1.0825946234178575e-06,
      "loss": 3.2687,
      "mean_token_accuracy": 0.38850476443767545,
      "step": 560
    },
    {
      "epoch": 0.8803088803088803,
      "grad_norm": 0.9647796959764461,
      "learning_rate": 8.514201552645052e-07,
      "loss": 3.2523,
      "mean_token_accuracy": 0.3878818407654762,
      "step": 570
    },
    {
      "epoch": 0.8957528957528957,
      "grad_norm": 1.0003940081508194,
      "learning_rate": 6.468959810724329e-07,
      "loss": 3.2141,
      "mean_token_accuracy": 0.3934506356716156,
      "step": 580
    },
    {
      "epoch": 0.9111969111969112,
      "grad_norm": 0.9373675947022841,
      "learning_rate": 4.696178905113913e-07,
      "loss": 3.2305,
      "mean_token_accuracy": 0.39248495548963547,
      "step": 590
    },
    {
      "epoch": 0.9266409266409267,
      "grad_norm": 0.9968667494256308,
      "learning_rate": 3.2010230397739206e-07,
      "loss": 3.2254,
      "mean_token_accuracy": 0.39279315173625945,
      "step": 600
    },
    {
      "epoch": 0.9266409266409267,
      "eval_runtime": 0.3665,
      "eval_samples_per_second": 253.77,
      "eval_steps_per_second": 16.372,
      "step": 600
    },
    {
      "epoch": 0.9420849420849421,
      "grad_norm": 1.0683699164488198,
      "learning_rate": 1.9878476823294467e-07,
      "loss": 3.2227,
      "mean_token_accuracy": 0.3929983913898468,
      "step": 610
    },
    {
      "epoch": 0.9575289575289575,
      "grad_norm": 0.9681641634376521,
      "learning_rate": 1.0601868763643997e-07,
      "loss": 3.2156,
      "mean_token_accuracy": 0.3948619216680527,
      "step": 620
    },
    {
      "epoch": 0.972972972972973,
      "grad_norm": 1.0525452552878858,
      "learning_rate": 4.207429465668877e-08,
      "loss": 3.2148,
      "mean_token_accuracy": 0.39301991611719134,
      "step": 630
    },
    {
      "epoch": 0.9884169884169884,
      "grad_norm": 1.007791209844153,
      "learning_rate": 7.1378626715268295e-09,
      "loss": 3.252,
      "mean_token_accuracy": 0.3903378531336784,
      "step": 640
    },
    {
      "epoch": 0.9992277992277993,
      "mean_token_accuracy": 0.3896016627550125,
      "step": 647,
      "total_flos": 5418484972388352.0,
      "train_loss": 3.606253622488408,
      "train_runtime": 424.9732,
      "train_samples_per_second": 48.742,
      "train_steps_per_second": 1.522
    }
  ],
  "logging_steps": 10,
  "max_steps": 647,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5418484972388352.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}