{ "best_global_step": 747, "best_metric": 0.9911504424778761, "best_model_checkpoint": "./logs/checkpoint-747", "epoch": 10.0, "eval_steps": 500, "global_step": 830, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.12048192771084337, "grad_norm": 6.236629486083984, "learning_rate": 1.0843373493975904e-05, "loss": 1.7779657363891601, "step": 10 }, { "epoch": 0.24096385542168675, "grad_norm": 6.614838123321533, "learning_rate": 2.289156626506024e-05, "loss": 1.6157239913940429, "step": 20 }, { "epoch": 0.3614457831325301, "grad_norm": 8.87565803527832, "learning_rate": 3.4939759036144585e-05, "loss": 1.3881938934326172, "step": 30 }, { "epoch": 0.4819277108433735, "grad_norm": 8.774048805236816, "learning_rate": 4.698795180722892e-05, "loss": 1.0548779487609863, "step": 40 }, { "epoch": 0.6024096385542169, "grad_norm": 10.919977188110352, "learning_rate": 5.903614457831326e-05, "loss": 0.860891056060791, "step": 50 }, { "epoch": 0.7228915662650602, "grad_norm": 17.067983627319336, "learning_rate": 7.108433734939759e-05, "loss": 0.7981919765472412, "step": 60 }, { "epoch": 0.8433734939759037, "grad_norm": 9.766481399536133, "learning_rate": 8.313253012048194e-05, "loss": 0.7360480785369873, "step": 70 }, { "epoch": 0.963855421686747, "grad_norm": 11.55764102935791, "learning_rate": 9.518072289156626e-05, "loss": 0.636728572845459, "step": 80 }, { "epoch": 1.0, "eval_accuracy": 0.8247787610619469, "eval_loss": 0.46115025877952576, "eval_runtime": 5.444, "eval_samples_per_second": 103.785, "eval_steps_per_second": 1.653, "step": 83 }, { "epoch": 1.0843373493975903, "grad_norm": 13.059717178344727, "learning_rate": 9.998408238461338e-05, "loss": 0.5606242656707764, "step": 90 }, { "epoch": 1.2048192771084336, "grad_norm": 7.8438615798950195, "learning_rate": 9.988684476816419e-05, "loss": 0.4747779369354248, "step": 100 }, { "epoch": 1.3253012048192772, "grad_norm": 17.873554229736328, "learning_rate": 9.970138440692705e-05, "loss": 0.513523006439209, "step": 110 }, { "epoch": 1.4457831325301205, "grad_norm": 11.972646713256836, "learning_rate": 9.942802927959443e-05, "loss": 0.5974394321441651, "step": 120 }, { "epoch": 1.5662650602409638, "grad_norm": 16.943544387817383, "learning_rate": 9.906726280298186e-05, "loss": 0.452280855178833, "step": 130 }, { "epoch": 1.6867469879518073, "grad_norm": 9.1749906539917, "learning_rate": 9.861972297712605e-05, "loss": 0.4720293045043945, "step": 140 }, { "epoch": 1.8072289156626506, "grad_norm": 11.010215759277344, "learning_rate": 9.808620125700925e-05, "loss": 0.4886914253234863, "step": 150 }, { "epoch": 1.927710843373494, "grad_norm": 8.292040824890137, "learning_rate": 9.746764115290496e-05, "loss": 0.46558895111083987, "step": 160 }, { "epoch": 2.0, "eval_accuracy": 0.8495575221238938, "eval_loss": 0.3608015775680542, "eval_runtime": 5.678, "eval_samples_per_second": 99.507, "eval_steps_per_second": 1.585, "step": 166 }, { "epoch": 2.0481927710843375, "grad_norm": 17.205005645751953, "learning_rate": 9.676513656182058e-05, "loss": 0.36324758529663087, "step": 170 }, { "epoch": 2.1686746987951806, "grad_norm": 11.862001419067383, "learning_rate": 9.597992983298747e-05, "loss": 0.4520224094390869, "step": 180 }, { "epoch": 2.289156626506024, "grad_norm": 15.979503631591797, "learning_rate": 9.511340957081958e-05, "loss": 0.40279397964477537, "step": 190 }, { "epoch": 2.4096385542168672, "grad_norm": 14.119962692260742, "learning_rate": 9.416710817922615e-05, "loss": 0.41336545944213865, "step": 200 }, { "epoch": 2.5301204819277108, "grad_norm": 17.406816482543945, "learning_rate": 9.314269915162114e-05, "loss": 0.3019423961639404, "step": 210 }, { "epoch": 2.6506024096385543, "grad_norm": 14.716191291809082, "learning_rate": 9.204199411142196e-05, "loss": 0.3748778820037842, "step": 220 }, { "epoch": 2.7710843373493974, "grad_norm": 21.437185287475586, "learning_rate": 9.086693960827105e-05, "loss": 0.34201803207397463, "step": 230 }, { "epoch": 2.891566265060241, "grad_norm": 10.061134338378906, "learning_rate": 8.961961367564651e-05, "loss": 0.49113874435424804, "step": 240 }, { "epoch": 3.0, "eval_accuracy": 0.9646017699115044, "eval_loss": 0.1344006061553955, "eval_runtime": 6.1864, "eval_samples_per_second": 91.329, "eval_steps_per_second": 1.455, "step": 249 }, { "epoch": 3.0120481927710845, "grad_norm": 12.825634956359863, "learning_rate": 8.83022221559489e-05, "loss": 0.27299160957336427, "step": 250 }, { "epoch": 3.1325301204819276, "grad_norm": 10.527647972106934, "learning_rate": 8.691709479956373e-05, "loss": 0.24414093494415284, "step": 260 }, { "epoch": 3.253012048192771, "grad_norm": 4.087771892547607, "learning_rate": 8.546668114479768e-05, "loss": 0.18104053735733033, "step": 270 }, { "epoch": 3.3734939759036147, "grad_norm": 9.269667625427246, "learning_rate": 8.395354618597533e-05, "loss": 0.3339837551116943, "step": 280 }, { "epoch": 3.4939759036144578, "grad_norm": 5.284663200378418, "learning_rate": 8.238036583735673e-05, "loss": 0.2400984764099121, "step": 290 }, { "epoch": 3.6144578313253013, "grad_norm": 8.479917526245117, "learning_rate": 8.074992220089769e-05, "loss": 0.22019331455230712, "step": 300 }, { "epoch": 3.734939759036145, "grad_norm": 9.128419876098633, "learning_rate": 7.906509864622203e-05, "loss": 0.25870823860168457, "step": 310 }, { "epoch": 3.855421686746988, "grad_norm": 12.799089431762695, "learning_rate": 7.73288747115059e-05, "loss": 0.20678648948669434, "step": 320 }, { "epoch": 3.9759036144578315, "grad_norm": 11.251450538635254, "learning_rate": 7.554432083429253e-05, "loss": 0.1629856824874878, "step": 330 }, { "epoch": 4.0, "eval_accuracy": 0.9575221238938053, "eval_loss": 0.13474561274051666, "eval_runtime": 5.6362, "eval_samples_per_second": 100.245, "eval_steps_per_second": 1.597, "step": 332 }, { "epoch": 4.096385542168675, "grad_norm": 19.876901626586914, "learning_rate": 7.3714592921555e-05, "loss": 0.24734578132629395, "step": 340 }, { "epoch": 4.216867469879518, "grad_norm": 13.15965461730957, "learning_rate": 7.184292676861024e-05, "loss": 0.22765071392059327, "step": 350 }, { "epoch": 4.337349397590361, "grad_norm": 12.12977123260498, "learning_rate": 6.99326323367538e-05, "loss": 0.17916421890258788, "step": 360 }, { "epoch": 4.457831325301205, "grad_norm": 8.979646682739258, "learning_rate": 6.798708789973527e-05, "loss": 0.1901506304740906, "step": 370 }, { "epoch": 4.578313253012048, "grad_norm": 5.592668056488037, "learning_rate": 6.600973406942616e-05, "loss": 0.22261853218078614, "step": 380 }, { "epoch": 4.698795180722891, "grad_norm": 12.222548484802246, "learning_rate": 6.400406771124536e-05, "loss": 0.16046804189682007, "step": 390 }, { "epoch": 4.8192771084337345, "grad_norm": 9.516422271728516, "learning_rate": 6.197363576010264e-05, "loss": 0.3090466022491455, "step": 400 }, { "epoch": 4.9397590361445785, "grad_norm": 8.311286926269531, "learning_rate": 5.992202894779649e-05, "loss": 0.18722275495529175, "step": 410 }, { "epoch": 5.0, "eval_accuracy": 0.9628318584070796, "eval_loss": 0.11059214919805527, "eval_runtime": 5.668, "eval_samples_per_second": 99.682, "eval_steps_per_second": 1.588, "step": 415 }, { "epoch": 5.0602409638554215, "grad_norm": 6.544140338897705, "learning_rate": 5.7852875452958954e-05, "loss": 0.1725080966949463, "step": 420 }, { "epoch": 5.180722891566265, "grad_norm": 7.241940975189209, "learning_rate": 5.576983448477734e-05, "loss": 0.2657145023345947, "step": 430 }, { "epoch": 5.301204819277109, "grad_norm": 2.805722713470459, "learning_rate": 5.3676589811839796e-05, "loss": 0.16265145540237427, "step": 440 }, { "epoch": 5.421686746987952, "grad_norm": 7.153483867645264, "learning_rate": 5.157684324754858e-05, "loss": 0.1511433720588684, "step": 450 }, { "epoch": 5.542168674698795, "grad_norm": 3.1414175033569336, "learning_rate": 4.9474308103621874e-05, "loss": 0.15450478792190553, "step": 460 }, { "epoch": 5.662650602409639, "grad_norm": 3.8960776329040527, "learning_rate": 4.737270262326134e-05, "loss": 0.13111191987991333, "step": 470 }, { "epoch": 5.783132530120482, "grad_norm": 7.418442726135254, "learning_rate": 4.527574340559844e-05, "loss": 0.1539200186729431, "step": 480 }, { "epoch": 5.903614457831325, "grad_norm": 8.860248565673828, "learning_rate": 4.3187138833048456e-05, "loss": 0.1801429271697998, "step": 490 }, { "epoch": 6.0, "eval_accuracy": 0.9823008849557522, "eval_loss": 0.09679495543241501, "eval_runtime": 6.1076, "eval_samples_per_second": 92.508, "eval_steps_per_second": 1.474, "step": 498 }, { "epoch": 6.024096385542169, "grad_norm": 11.467449188232422, "learning_rate": 4.111058251319516e-05, "loss": 0.11156998872756958, "step": 500 }, { "epoch": 6.144578313253012, "grad_norm": 13.2984037399292, "learning_rate": 3.904974674680436e-05, "loss": 0.11730811595916749, "step": 510 }, { "epoch": 6.265060240963855, "grad_norm": 3.543509006500244, "learning_rate": 3.7008276033517396e-05, "loss": 0.19998840093612671, "step": 520 }, { "epoch": 6.385542168674699, "grad_norm": 4.334460735321045, "learning_rate": 3.49897806267101e-05, "loss": 0.09577634930610657, "step": 530 }, { "epoch": 6.506024096385542, "grad_norm": 1.5698552131652832, "learning_rate": 3.2997830148914314e-05, "loss": 0.11064940690994263, "step": 540 }, { "epoch": 6.626506024096385, "grad_norm": 1.4867029190063477, "learning_rate": 3.103594727909385e-05, "loss": 0.0978583574295044, "step": 550 }, { "epoch": 6.746987951807229, "grad_norm": 8.762438774108887, "learning_rate": 2.910760152293764e-05, "loss": 0.08853105902671814, "step": 560 }, { "epoch": 6.867469879518072, "grad_norm": 5.0525360107421875, "learning_rate": 2.721620307718793e-05, "loss": 0.13467444181442262, "step": 570 }, { "epoch": 6.9879518072289155, "grad_norm": 13.927603721618652, "learning_rate": 2.536509679885355e-05, "loss": 0.14531443119049073, "step": 580 }, { "epoch": 7.0, "eval_accuracy": 0.9716814159292035, "eval_loss": 0.1196078509092331, "eval_runtime": 5.7351, "eval_samples_per_second": 98.517, "eval_steps_per_second": 1.569, "step": 581 }, { "epoch": 7.108433734939759, "grad_norm": 1.3813672065734863, "learning_rate": 2.3557556289973838e-05, "loss": 0.07141577005386353, "step": 590 }, { "epoch": 7.228915662650603, "grad_norm": 11.30803108215332, "learning_rate": 2.179677810839382e-05, "loss": 0.08913902044296265, "step": 600 }, { "epoch": 7.349397590361446, "grad_norm": 15.15943717956543, "learning_rate": 2.0085876114788937e-05, "loss": 0.1786208987236023, "step": 610 }, { "epoch": 7.469879518072289, "grad_norm": 6.659374237060547, "learning_rate": 1.8427875965935758e-05, "loss": 0.05375434160232544, "step": 620 }, { "epoch": 7.590361445783133, "grad_norm": 3.859622001647949, "learning_rate": 1.682570976396811e-05, "loss": 0.13732693195343018, "step": 630 }, { "epoch": 7.710843373493976, "grad_norm": 4.593474388122559, "learning_rate": 1.5282210871079926e-05, "loss": 0.09100980162620545, "step": 640 }, { "epoch": 7.831325301204819, "grad_norm": 13.193694114685059, "learning_rate": 1.3800108898846021e-05, "loss": 0.09656141400337219, "step": 650 }, { "epoch": 7.951807228915663, "grad_norm": 13.730281829833984, "learning_rate": 1.2382024881020937e-05, "loss": 0.0786526083946228, "step": 660 }, { "epoch": 8.0, "eval_accuracy": 0.9893805309734514, "eval_loss": 0.08379530161619186, "eval_runtime": 5.6656, "eval_samples_per_second": 99.724, "eval_steps_per_second": 1.589, "step": 664 }, { "epoch": 8.072289156626505, "grad_norm": 1.4331773519515991, "learning_rate": 1.1030466638353293e-05, "loss": 0.0922305703163147, "step": 670 }, { "epoch": 8.19277108433735, "grad_norm": 11.172012329101562, "learning_rate": 9.747824343612338e-06, "loss": 0.051563167572021486, "step": 680 }, { "epoch": 8.313253012048193, "grad_norm": 9.389365196228027, "learning_rate": 8.536366294669978e-06, "loss": 0.0976746916770935, "step": 690 }, { "epoch": 8.433734939759036, "grad_norm": 0.6641272902488708, "learning_rate": 7.398234903113266e-06, "loss": 0.07286246418952942, "step": 700 }, { "epoch": 8.55421686746988, "grad_norm": 10.819772720336914, "learning_rate": 6.335442905481442e-06, "loss": 0.07259726524353027, "step": 710 }, { "epoch": 8.674698795180722, "grad_norm": 0.8964389562606812, "learning_rate": 5.349869803827717e-06, "loss": 0.043448707461357115, "step": 720 }, { "epoch": 8.795180722891565, "grad_norm": 7.601110935211182, "learning_rate": 4.4432585419005076e-06, "loss": 0.10902594327926636, "step": 730 }, { "epoch": 8.91566265060241, "grad_norm": 0.9344149827957153, "learning_rate": 3.6172124228221914e-06, "loss": 0.03534201383590698, "step": 740 }, { "epoch": 9.0, "eval_accuracy": 0.9911504424778761, "eval_loss": 0.0801326259970665, "eval_runtime": 6.1806, "eval_samples_per_second": 91.415, "eval_steps_per_second": 1.456, "step": 747 }, { "epoch": 9.036144578313253, "grad_norm": 2.648090124130249, "learning_rate": 2.8731922737163685e-06, "loss": 0.024153660237789153, "step": 750 }, { "epoch": 9.156626506024097, "grad_norm": 7.166019439697266, "learning_rate": 2.212513862297649e-06, "loss": 0.12096415758132935, "step": 760 }, { "epoch": 9.27710843373494, "grad_norm": 9.207470893859863, "learning_rate": 1.6363455699930419e-06, "loss": 0.10932642221450806, "step": 770 }, { "epoch": 9.397590361445783, "grad_norm": 2.5481743812561035, "learning_rate": 1.145706325709389e-06, "loss": 0.06989773511886596, "step": 780 }, { "epoch": 9.518072289156626, "grad_norm": 12.317899703979492, "learning_rate": 7.414638039014265e-07, "loss": 0.08738085627555847, "step": 790 }, { "epoch": 9.638554216867469, "grad_norm": 2.812633752822876, "learning_rate": 4.2433289012662194e-07, "loss": 0.04424922168254852, "step": 800 }, { "epoch": 9.759036144578314, "grad_norm": 1.378448486328125, "learning_rate": 1.9487441680084983e-07, "loss": 0.07828723788261413, "step": 810 }, { "epoch": 9.879518072289157, "grad_norm": 9.539847373962402, "learning_rate": 5.3494171390228166e-08, "loss": 0.06453937888145447, "step": 820 }, { "epoch": 10.0, "grad_norm": 0.7179245948791504, "learning_rate": 4.4217879344166103e-10, "loss": 0.08780375719070435, "step": 830 }, { "epoch": 10.0, "eval_accuracy": 0.9911504424778761, "eval_loss": 0.08178059756755829, "eval_runtime": 5.6512, "eval_samples_per_second": 99.979, "eval_steps_per_second": 1.593, "step": 830 }, { "epoch": 10.0, "step": 830, "total_flos": 8.593274471605862e+17, "train_loss": 0.29410572172288435, "train_runtime": 896.4382, "train_samples_per_second": 29.461, "train_steps_per_second": 0.926 } ], "logging_steps": 10, "max_steps": 830, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.593274471605862e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }