{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9992277992277993, "eval_steps": 100, "global_step": 647, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015444015444015444, "grad_norm": 54.027117924566284, "learning_rate": 3.0769230769230774e-06, "loss": 8.2594, "mean_token_accuracy": 0.10601478479802609, "step": 10 }, { "epoch": 0.03088803088803089, "grad_norm": 36.2771924758843, "learning_rate": 6.153846153846155e-06, "loss": 8.0141, "mean_token_accuracy": 0.10835166163742542, "step": 20 }, { "epoch": 0.04633204633204633, "grad_norm": 32.30506084518261, "learning_rate": 9.230769230769232e-06, "loss": 7.1727, "mean_token_accuracy": 0.11615957953035831, "step": 30 }, { "epoch": 0.06177606177606178, "grad_norm": 12.644482204441966, "learning_rate": 1.230769230769231e-05, "loss": 6.1906, "mean_token_accuracy": 0.1327559869736433, "step": 40 }, { "epoch": 0.07722007722007722, "grad_norm": 10.568360790591178, "learning_rate": 1.5384615384615387e-05, "loss": 5.4813, "mean_token_accuracy": 0.17196358889341354, "step": 50 }, { "epoch": 0.09266409266409266, "grad_norm": 4.068292936765287, "learning_rate": 1.8461538461538465e-05, "loss": 4.7438, "mean_token_accuracy": 0.2288092628121376, "step": 60 }, { "epoch": 0.10810810810810811, "grad_norm": 3.5423142348559904, "learning_rate": 1.9996358021096174e-05, "loss": 4.2523, "mean_token_accuracy": 0.2767298325896263, "step": 70 }, { "epoch": 0.12355212355212356, "grad_norm": 2.6809187857623313, "learning_rate": 1.9967238104745695e-05, "loss": 3.9688, "mean_token_accuracy": 0.3063569128513336, "step": 80 }, { "epoch": 0.138996138996139, "grad_norm": 2.1859867880714967, "learning_rate": 1.9909083099891682e-05, "loss": 3.6148, "mean_token_accuracy": 0.3451215773820877, "step": 90 }, { "epoch": 0.15444015444015444, "grad_norm": 1.2979500528779593, "learning_rate": 1.9822062415120053e-05, "loss": 3.4617, "mean_token_accuracy": 0.36571358889341354, "step": 100 }, { "epoch": 0.15444015444015444, "eval_runtime": 0.3678, "eval_samples_per_second": 252.838, "eval_steps_per_second": 16.312, "step": 100 }, { "epoch": 0.16988416988416988, "grad_norm": 1.2047363502334179, "learning_rate": 1.9706429546259592e-05, "loss": 3.4285, "mean_token_accuracy": 0.3689419463276863, "step": 110 }, { "epoch": 0.18532818532818532, "grad_norm": 1.1540457712296708, "learning_rate": 1.9562521337935255e-05, "loss": 3.3438, "mean_token_accuracy": 0.37895588874816893, "step": 120 }, { "epoch": 0.20077220077220076, "grad_norm": 1.1919170437393742, "learning_rate": 1.939075700232209e-05, "loss": 3.3227, "mean_token_accuracy": 0.38107282519340513, "step": 130 }, { "epoch": 0.21621621621621623, "grad_norm": 1.0322594052165261, "learning_rate": 1.9191636897958123e-05, "loss": 3.3289, "mean_token_accuracy": 0.38092619478702544, "step": 140 }, { "epoch": 0.23166023166023167, "grad_norm": 1.072622203579115, "learning_rate": 1.8965741072173647e-05, "loss": 3.3309, "mean_token_accuracy": 0.3811278060078621, "step": 150 }, { "epoch": 0.2471042471042471, "grad_norm": 1.1060003860280698, "learning_rate": 1.8713727571382857e-05, "loss": 3.3234, "mean_token_accuracy": 0.38025770634412764, "step": 160 }, { "epoch": 0.2625482625482625, "grad_norm": 1.086808022765859, "learning_rate": 1.8436330524160048e-05, "loss": 3.318, "mean_token_accuracy": 0.38055351972579954, "step": 170 }, { "epoch": 0.277992277992278, "grad_norm": 1.1385524957672613, "learning_rate": 1.8134358002684504e-05, "loss": 3.2988, "mean_token_accuracy": 0.3846017554402351, "step": 180 }, { "epoch": 0.29343629343629346, "grad_norm": 1.0815980244836383, "learning_rate": 1.7808689668783762e-05, "loss": 3.2711, "mean_token_accuracy": 0.3869165450334549, "step": 190 }, { "epoch": 0.3088803088803089, "grad_norm": 1.1075978174473033, "learning_rate": 1.7460274211432463e-05, "loss": 3.3227, "mean_token_accuracy": 0.38340970128774643, "step": 200 }, { "epoch": 0.3088803088803089, "eval_runtime": 0.3689, "eval_samples_per_second": 252.111, "eval_steps_per_second": 16.265, "step": 200 }, { "epoch": 0.32432432432432434, "grad_norm": 1.2619430199962072, "learning_rate": 1.7090126583171503e-05, "loss": 3.3055, "mean_token_accuracy": 0.3856549397110939, "step": 210 }, { "epoch": 0.33976833976833976, "grad_norm": 1.087491804935316, "learning_rate": 1.6699325043497957e-05, "loss": 3.277, "mean_token_accuracy": 0.3866904929280281, "step": 220 }, { "epoch": 0.3552123552123552, "grad_norm": 1.1152659184947136, "learning_rate": 1.6289008017838447e-05, "loss": 3.2496, "mean_token_accuracy": 0.3880590170621872, "step": 230 }, { "epoch": 0.37065637065637064, "grad_norm": 1.078360383901834, "learning_rate": 1.586037078125607e-05, "loss": 3.2484, "mean_token_accuracy": 0.3903868407011032, "step": 240 }, { "epoch": 0.3861003861003861, "grad_norm": 1.049519316739431, "learning_rate": 1.54146619765513e-05, "loss": 3.252, "mean_token_accuracy": 0.3888410285115242, "step": 250 }, { "epoch": 0.4015444015444015, "grad_norm": 1.0716141072297978, "learning_rate": 1.4953179976899878e-05, "loss": 3.2891, "mean_token_accuracy": 0.3861253634095192, "step": 260 }, { "epoch": 0.416988416988417, "grad_norm": 1.1242108487806568, "learning_rate": 1.4477269103623496e-05, "loss": 3.2488, "mean_token_accuracy": 0.38970552384853363, "step": 270 }, { "epoch": 0.43243243243243246, "grad_norm": 1.056564744386044, "learning_rate": 1.3988315710111151e-05, "loss": 3.232, "mean_token_accuracy": 0.39249450266361235, "step": 280 }, { "epoch": 0.44787644787644787, "grad_norm": 1.070870946248766, "learning_rate": 1.3487744143298822e-05, "loss": 3.2512, "mean_token_accuracy": 0.3900837257504463, "step": 290 }, { "epoch": 0.46332046332046334, "grad_norm": 1.0749246835150736, "learning_rate": 1.2977012594472008e-05, "loss": 3.2504, "mean_token_accuracy": 0.38782380521297455, "step": 300 }, { "epoch": 0.46332046332046334, "eval_runtime": 0.3696, "eval_samples_per_second": 251.594, "eval_steps_per_second": 16.232, "step": 300 }, { "epoch": 0.47876447876447875, "grad_norm": 0.9656072799953122, "learning_rate": 1.2457608851477833e-05, "loss": 3.2687, "mean_token_accuracy": 0.3866996571421623, "step": 310 }, { "epoch": 0.4942084942084942, "grad_norm": 1.0178005185953467, "learning_rate": 1.1931045964720882e-05, "loss": 3.198, "mean_token_accuracy": 0.3944434255361557, "step": 320 }, { "epoch": 0.5096525096525096, "grad_norm": 1.074522942556136, "learning_rate": 1.1398857839567811e-05, "loss": 3.2355, "mean_token_accuracy": 0.39279997497797015, "step": 330 }, { "epoch": 0.525096525096525, "grad_norm": 1.0410557083388294, "learning_rate": 1.086259476800041e-05, "loss": 3.2195, "mean_token_accuracy": 0.39092436134815217, "step": 340 }, { "epoch": 0.5405405405405406, "grad_norm": 0.9762243696283865, "learning_rate": 1.0323818912533561e-05, "loss": 3.2445, "mean_token_accuracy": 0.38936176896095276, "step": 350 }, { "epoch": 0.555984555984556, "grad_norm": 1.0744349569161593, "learning_rate": 9.784099755553723e-06, "loss": 3.2625, "mean_token_accuracy": 0.39045931249856947, "step": 360 }, { "epoch": 0.5714285714285714, "grad_norm": 0.9548919869257485, "learning_rate": 9.245009527334243e-06, "loss": 3.2527, "mean_token_accuracy": 0.38955584168434143, "step": 370 }, { "epoch": 0.5868725868725869, "grad_norm": 1.0268956745848117, "learning_rate": 8.708118626045939e-06, "loss": 3.2535, "mean_token_accuracy": 0.3885325014591217, "step": 380 }, { "epoch": 0.6023166023166023, "grad_norm": 1.0496300516856243, "learning_rate": 8.174991043104662e-06, "loss": 3.2566, "mean_token_accuracy": 0.38984403312206267, "step": 390 }, { "epoch": 0.6177606177606177, "grad_norm": 1.043332590039097, "learning_rate": 7.647179807182125e-06, "loss": 3.2281, "mean_token_accuracy": 0.3923295482993126, "step": 400 }, { "epoch": 0.6177606177606177, "eval_runtime": 0.3673, "eval_samples_per_second": 253.192, "eval_steps_per_second": 16.335, "step": 400 }, { "epoch": 0.6332046332046332, "grad_norm": 1.061649815314641, "learning_rate": 7.126222460151719e-06, "loss": 3.2043, "mean_token_accuracy": 0.39413081407546996, "step": 410 }, { "epoch": 0.6486486486486487, "grad_norm": 1.0147970765923096, "learning_rate": 6.613636578148242e-06, "loss": 3.2316, "mean_token_accuracy": 0.3912878751754761, "step": 420 }, { "epoch": 0.6640926640926641, "grad_norm": 1.0058231917151492, "learning_rate": 6.110915350788846e-06, "loss": 3.2207, "mean_token_accuracy": 0.3918399602174759, "step": 430 }, { "epoch": 0.6795366795366795, "grad_norm": 1.0703873139225168, "learning_rate": 5.619523231433177e-06, "loss": 3.2566, "mean_token_accuracy": 0.38752417266368866, "step": 440 }, { "epoch": 0.694980694980695, "grad_norm": 1.123689944709063, "learning_rate": 5.140891671153797e-06, "loss": 3.2848, "mean_token_accuracy": 0.3864888772368431, "step": 450 }, { "epoch": 0.7104247104247104, "grad_norm": 1.0150276974982517, "learning_rate": 4.676414948843934e-06, "loss": 3.2078, "mean_token_accuracy": 0.3944342628121376, "step": 460 }, { "epoch": 0.7258687258687259, "grad_norm": 1.131883688390389, "learning_rate": 4.2274461096098085e-06, "loss": 3.2121, "mean_token_accuracy": 0.3935947135090828, "step": 470 }, { "epoch": 0.7413127413127413, "grad_norm": 1.0848803772523403, "learning_rate": 3.795293023279093e-06, "loss": 3.2309, "mean_token_accuracy": 0.3939241200685501, "step": 480 }, { "epoch": 0.7567567567567568, "grad_norm": 0.9911464983867319, "learning_rate": 3.3812145745073834e-06, "loss": 3.2645, "mean_token_accuracy": 0.3887524425983429, "step": 490 }, { "epoch": 0.7722007722007722, "grad_norm": 0.9468182557450763, "learning_rate": 2.9864169955810085e-06, "loss": 3.2348, "mean_token_accuracy": 0.3921034947037697, "step": 500 }, { "epoch": 0.7722007722007722, "eval_runtime": 0.3674, "eval_samples_per_second": 253.144, "eval_steps_per_second": 16.332, "step": 500 }, { "epoch": 0.7876447876447876, "grad_norm": 1.153751906362788, "learning_rate": 2.6120503525989894e-06, "loss": 3.2051, "mean_token_accuracy": 0.3940493628382683, "step": 510 }, { "epoch": 0.803088803088803, "grad_norm": 1.0023254669711654, "learning_rate": 2.25920519527003e-06, "loss": 3.2387, "mean_token_accuracy": 0.3898582592606544, "step": 520 }, { "epoch": 0.8185328185328186, "grad_norm": 1.018252078051325, "learning_rate": 1.9289093800839067e-06, "loss": 3.2488, "mean_token_accuracy": 0.39030425548553466, "step": 530 }, { "epoch": 0.833976833976834, "grad_norm": 1.0191281048265344, "learning_rate": 1.6221250761114803e-06, "loss": 3.2156, "mean_token_accuracy": 0.39363697469234465, "step": 540 }, { "epoch": 0.8494208494208494, "grad_norm": 1.0580017660782297, "learning_rate": 1.339745962155613e-06, "loss": 3.2449, "mean_token_accuracy": 0.3889385357499123, "step": 550 }, { "epoch": 0.8648648648648649, "grad_norm": 1.0638282009844648, "learning_rate": 1.0825946234178575e-06, "loss": 3.2687, "mean_token_accuracy": 0.38850476443767545, "step": 560 }, { "epoch": 0.8803088803088803, "grad_norm": 0.9647796959764461, "learning_rate": 8.514201552645052e-07, "loss": 3.2523, "mean_token_accuracy": 0.3878818407654762, "step": 570 }, { "epoch": 0.8957528957528957, "grad_norm": 1.0003940081508194, "learning_rate": 6.468959810724329e-07, "loss": 3.2141, "mean_token_accuracy": 0.3934506356716156, "step": 580 }, { "epoch": 0.9111969111969112, "grad_norm": 0.9373675947022841, "learning_rate": 4.696178905113913e-07, "loss": 3.2305, "mean_token_accuracy": 0.39248495548963547, "step": 590 }, { "epoch": 0.9266409266409267, "grad_norm": 0.9968667494256308, "learning_rate": 3.2010230397739206e-07, "loss": 3.2254, "mean_token_accuracy": 0.39279315173625945, "step": 600 }, { "epoch": 0.9266409266409267, "eval_runtime": 0.3665, "eval_samples_per_second": 253.77, "eval_steps_per_second": 16.372, "step": 600 }, { "epoch": 0.9420849420849421, "grad_norm": 1.0683699164488198, "learning_rate": 1.9878476823294467e-07, "loss": 3.2227, "mean_token_accuracy": 0.3929983913898468, "step": 610 }, { "epoch": 0.9575289575289575, "grad_norm": 0.9681641634376521, "learning_rate": 1.0601868763643997e-07, "loss": 3.2156, "mean_token_accuracy": 0.3948619216680527, "step": 620 }, { "epoch": 0.972972972972973, "grad_norm": 1.0525452552878858, "learning_rate": 4.207429465668877e-08, "loss": 3.2148, "mean_token_accuracy": 0.39301991611719134, "step": 630 }, { "epoch": 0.9884169884169884, "grad_norm": 1.007791209844153, "learning_rate": 7.1378626715268295e-09, "loss": 3.252, "mean_token_accuracy": 0.3903378531336784, "step": 640 }, { "epoch": 0.9992277992277993, "mean_token_accuracy": 0.3896016627550125, "step": 647, "total_flos": 5418484972388352.0, "train_loss": 3.606253622488408, "train_runtime": 424.9732, "train_samples_per_second": 48.742, "train_steps_per_second": 1.522 } ], "logging_steps": 10, "max_steps": 647, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5418484972388352.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }