| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.759571209800919, |
| "eval_steps": 100, |
| "global_step": 900, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.015313935681470138, |
| "grad_norm": 3.8326407564846536, |
| "learning_rate": 6.134969325153375e-07, |
| "loss": 1.3339, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.030627871362940276, |
| "grad_norm": 3.5496788557416803, |
| "learning_rate": 1.226993865030675e-06, |
| "loss": 1.3517, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.045941807044410414, |
| "grad_norm": 2.292172802820395, |
| "learning_rate": 1.8404907975460124e-06, |
| "loss": 1.3142, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.06125574272588055, |
| "grad_norm": 1.7775340318261912, |
| "learning_rate": 2.45398773006135e-06, |
| "loss": 1.2477, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.07656967840735068, |
| "grad_norm": 1.4368794370978828, |
| "learning_rate": 3.0674846625766875e-06, |
| "loss": 1.2037, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.09188361408882083, |
| "grad_norm": 1.061354555879628, |
| "learning_rate": 3.680981595092025e-06, |
| "loss": 1.1505, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.10719754977029096, |
| "grad_norm": 0.8377616484610274, |
| "learning_rate": 4.294478527607362e-06, |
| "loss": 1.1179, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.1225114854517611, |
| "grad_norm": 0.7884987790196414, |
| "learning_rate": 4.9079754601227e-06, |
| "loss": 1.0735, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.13782542113323124, |
| "grad_norm": 0.7279957151761491, |
| "learning_rate": 5.521472392638038e-06, |
| "loss": 1.0556, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.15313935681470137, |
| "grad_norm": 0.6692858930956382, |
| "learning_rate": 6.134969325153375e-06, |
| "loss": 1.0294, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.16845329249617153, |
| "grad_norm": 0.6401840707040195, |
| "learning_rate": 6.748466257668712e-06, |
| "loss": 1.0175, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.18376722817764166, |
| "grad_norm": 0.6584204446837412, |
| "learning_rate": 7.36196319018405e-06, |
| "loss": 1.0283, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.1990811638591118, |
| "grad_norm": 0.6647811794468166, |
| "learning_rate": 7.975460122699386e-06, |
| "loss": 0.9799, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.21439509954058192, |
| "grad_norm": 0.6774675226534475, |
| "learning_rate": 8.588957055214725e-06, |
| "loss": 0.969, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.22970903522205208, |
| "grad_norm": 0.6918496726012217, |
| "learning_rate": 9.202453987730062e-06, |
| "loss": 0.9787, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.2450229709035222, |
| "grad_norm": 0.6740082248654927, |
| "learning_rate": 9.8159509202454e-06, |
| "loss": 0.9596, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.26033690658499237, |
| "grad_norm": 0.6791518891702232, |
| "learning_rate": 1.0429447852760737e-05, |
| "loss": 0.9813, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.27565084226646247, |
| "grad_norm": 0.7029695620068616, |
| "learning_rate": 1.1042944785276076e-05, |
| "loss": 0.9703, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.29096477794793263, |
| "grad_norm": 0.6355433423695775, |
| "learning_rate": 1.1656441717791411e-05, |
| "loss": 0.9375, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.30627871362940273, |
| "grad_norm": 0.6551855479751819, |
| "learning_rate": 1.226993865030675e-05, |
| "loss": 0.952, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.30627871362940273, |
| "eval_loss": 0.964697003364563, |
| "eval_runtime": 3.4019, |
| "eval_samples_per_second": 36.45, |
| "eval_steps_per_second": 2.352, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.3215926493108729, |
| "grad_norm": 0.6655305965463972, |
| "learning_rate": 1.2883435582822085e-05, |
| "loss": 0.9237, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.33690658499234305, |
| "grad_norm": 0.7257201177401903, |
| "learning_rate": 1.3496932515337424e-05, |
| "loss": 0.9511, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.35222052067381315, |
| "grad_norm": 0.7424667090609683, |
| "learning_rate": 1.4110429447852763e-05, |
| "loss": 0.9323, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.3675344563552833, |
| "grad_norm": 0.7336582339806711, |
| "learning_rate": 1.47239263803681e-05, |
| "loss": 0.9449, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.38284839203675347, |
| "grad_norm": 0.6851538651091853, |
| "learning_rate": 1.5337423312883436e-05, |
| "loss": 0.9133, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.3981623277182236, |
| "grad_norm": 0.7424313941044212, |
| "learning_rate": 1.5950920245398772e-05, |
| "loss": 0.9173, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.41347626339969373, |
| "grad_norm": 0.7937170928985942, |
| "learning_rate": 1.656441717791411e-05, |
| "loss": 0.9348, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.42879019908116384, |
| "grad_norm": 0.7668730085289572, |
| "learning_rate": 1.717791411042945e-05, |
| "loss": 0.9091, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.444104134762634, |
| "grad_norm": 0.713212902268727, |
| "learning_rate": 1.7791411042944788e-05, |
| "loss": 0.9054, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.45941807044410415, |
| "grad_norm": 0.8266274957992115, |
| "learning_rate": 1.8404907975460123e-05, |
| "loss": 0.9165, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.47473200612557426, |
| "grad_norm": 0.7773849012810377, |
| "learning_rate": 1.9018404907975462e-05, |
| "loss": 0.9261, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.4900459418070444, |
| "grad_norm": 0.7698543962501324, |
| "learning_rate": 1.96319018404908e-05, |
| "loss": 0.8848, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.5053598774885145, |
| "grad_norm": 0.7326492641075325, |
| "learning_rate": 1.9999908278985548e-05, |
| "loss": 0.9002, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.5206738131699847, |
| "grad_norm": 0.725915155149828, |
| "learning_rate": 1.9998876436895888e-05, |
| "loss": 0.9064, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.5359877488514548, |
| "grad_norm": 0.7137681931522117, |
| "learning_rate": 1.99966982201439e-05, |
| "loss": 0.9083, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.5513016845329249, |
| "grad_norm": 0.8244990854189328, |
| "learning_rate": 1.9993373878462894e-05, |
| "loss": 0.8988, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.5666156202143952, |
| "grad_norm": 0.7883005110592801, |
| "learning_rate": 1.99889037929898e-05, |
| "loss": 0.9021, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.5819295558958653, |
| "grad_norm": 0.7224060026030372, |
| "learning_rate": 1.9983288476221482e-05, |
| "loss": 0.8881, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.5972434915773354, |
| "grad_norm": 0.7010662098507998, |
| "learning_rate": 1.9976528571955946e-05, |
| "loss": 0.8889, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.6125574272588055, |
| "grad_norm": 0.7154815490781321, |
| "learning_rate": 1.9968624855218578e-05, |
| "loss": 0.8884, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6125574272588055, |
| "eval_loss": 0.9119902849197388, |
| "eval_runtime": 3.4604, |
| "eval_samples_per_second": 35.834, |
| "eval_steps_per_second": 2.312, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6278713629402757, |
| "grad_norm": 0.6984184509316927, |
| "learning_rate": 1.995957823217325e-05, |
| "loss": 0.9033, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.6431852986217458, |
| "grad_norm": 0.7120998555383481, |
| "learning_rate": 1.9949389740018438e-05, |
| "loss": 0.8988, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.6584992343032159, |
| "grad_norm": 0.66696186884436, |
| "learning_rate": 1.993806054686832e-05, |
| "loss": 0.8735, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.6738131699846861, |
| "grad_norm": 0.7395425655626215, |
| "learning_rate": 1.9925591951618822e-05, |
| "loss": 0.9127, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.6891271056661562, |
| "grad_norm": 0.6960401538019414, |
| "learning_rate": 1.9911985383798737e-05, |
| "loss": 0.875, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.7044410413476263, |
| "grad_norm": 0.7195874036136765, |
| "learning_rate": 1.9897242403405792e-05, |
| "loss": 0.8782, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.7197549770290965, |
| "grad_norm": 0.7129931764921758, |
| "learning_rate": 1.9881364700727827e-05, |
| "loss": 0.8943, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.7350689127105666, |
| "grad_norm": 0.690645634714852, |
| "learning_rate": 1.9864354096148966e-05, |
| "loss": 0.8782, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.7503828483920367, |
| "grad_norm": 0.7055958514936376, |
| "learning_rate": 1.9846212539940955e-05, |
| "loss": 0.8867, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.7656967840735069, |
| "grad_norm": 0.7517885221069497, |
| "learning_rate": 1.982694211203952e-05, |
| "loss": 0.8843, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.781010719754977, |
| "grad_norm": 0.7270711746255435, |
| "learning_rate": 1.9806545021805922e-05, |
| "loss": 0.8757, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.7963246554364471, |
| "grad_norm": 0.707252502344583, |
| "learning_rate": 1.9785023607773655e-05, |
| "loss": 0.8842, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.8116385911179173, |
| "grad_norm": 0.8083132789219781, |
| "learning_rate": 1.976238033738033e-05, |
| "loss": 0.8578, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.8269525267993875, |
| "grad_norm": 0.7287731737231247, |
| "learning_rate": 1.9738617806684767e-05, |
| "loss": 0.911, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.8422664624808576, |
| "grad_norm": 0.7683997901360534, |
| "learning_rate": 1.9713738740069384e-05, |
| "loss": 0.8607, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.8575803981623277, |
| "grad_norm": 0.6883196631913017, |
| "learning_rate": 1.9687745989927823e-05, |
| "loss": 0.8781, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.8728943338437979, |
| "grad_norm": 0.7137416791751114, |
| "learning_rate": 1.966064253633793e-05, |
| "loss": 0.8675, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.888208269525268, |
| "grad_norm": 0.800711538711323, |
| "learning_rate": 1.963243148672009e-05, |
| "loss": 0.8696, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.9035222052067381, |
| "grad_norm": 0.7017619169785259, |
| "learning_rate": 1.960311607548096e-05, |
| "loss": 0.8724, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.9188361408882083, |
| "grad_norm": 0.7347091076360852, |
| "learning_rate": 1.957269966364263e-05, |
| "loss": 0.8753, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.9188361408882083, |
| "eval_loss": 0.8846515417098999, |
| "eval_runtime": 3.4431, |
| "eval_samples_per_second": 36.014, |
| "eval_steps_per_second": 2.323, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.9341500765696784, |
| "grad_norm": 0.6840257399223311, |
| "learning_rate": 1.9541185738457304e-05, |
| "loss": 0.8661, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.9494640122511485, |
| "grad_norm": 0.6350789314835292, |
| "learning_rate": 1.9508577913007475e-05, |
| "loss": 0.866, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.9647779479326187, |
| "grad_norm": 0.742666392443459, |
| "learning_rate": 1.9474879925791665e-05, |
| "loss": 0.8712, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.9800918836140888, |
| "grad_norm": 0.7707700113231902, |
| "learning_rate": 1.944009564029584e-05, |
| "loss": 0.8656, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.9954058192955589, |
| "grad_norm": 0.7026681496550549, |
| "learning_rate": 1.9404229044550432e-05, |
| "loss": 0.8707, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.0122511485451762, |
| "grad_norm": 0.8698658929763603, |
| "learning_rate": 1.9367284250673126e-05, |
| "loss": 1.0103, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.0275650842266462, |
| "grad_norm": 0.7696705674515646, |
| "learning_rate": 1.9329265494397386e-05, |
| "loss": 0.8254, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.0428790199081164, |
| "grad_norm": 0.703085270940529, |
| "learning_rate": 1.929017713458685e-05, |
| "loss": 0.7994, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.0581929555895866, |
| "grad_norm": 0.7281489106195598, |
| "learning_rate": 1.9250023652735573e-05, |
| "loss": 0.7997, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.0735068912710566, |
| "grad_norm": 0.6721859356177987, |
| "learning_rate": 1.920880965245422e-05, |
| "loss": 0.8028, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.0888208269525268, |
| "grad_norm": 0.6999456629939552, |
| "learning_rate": 1.9166539858942258e-05, |
| "loss": 0.7923, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.104134762633997, |
| "grad_norm": 0.761057382772568, |
| "learning_rate": 1.912321911844622e-05, |
| "loss": 0.8023, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.119448698315467, |
| "grad_norm": 0.7613985157569584, |
| "learning_rate": 1.907885239770408e-05, |
| "loss": 0.8171, |
| "step": 365 |
| }, |
| { |
| "epoch": 1.1347626339969372, |
| "grad_norm": 0.7130602486133167, |
| "learning_rate": 1.9033444783375806e-05, |
| "loss": 0.8136, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.1500765696784074, |
| "grad_norm": 0.7290477717213912, |
| "learning_rate": 1.8987001481460177e-05, |
| "loss": 0.7956, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.1653905053598774, |
| "grad_norm": 0.6942113670274667, |
| "learning_rate": 1.8939527816697917e-05, |
| "loss": 0.795, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.1807044410413476, |
| "grad_norm": 0.7278550945084655, |
| "learning_rate": 1.8891029231961208e-05, |
| "loss": 0.8038, |
| "step": 385 |
| }, |
| { |
| "epoch": 1.1960183767228179, |
| "grad_norm": 0.7637922924754829, |
| "learning_rate": 1.8841511287629667e-05, |
| "loss": 0.7941, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.2113323124042878, |
| "grad_norm": 0.7227667767094055, |
| "learning_rate": 1.8790979660952832e-05, |
| "loss": 0.7899, |
| "step": 395 |
| }, |
| { |
| "epoch": 1.226646248085758, |
| "grad_norm": 0.6856723635729023, |
| "learning_rate": 1.8739440145399295e-05, |
| "loss": 0.8037, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.226646248085758, |
| "eval_loss": 0.8724085092544556, |
| "eval_runtime": 3.6883, |
| "eval_samples_per_second": 33.619, |
| "eval_steps_per_second": 2.169, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.2419601837672283, |
| "grad_norm": 0.6585093868236329, |
| "learning_rate": 1.8686898649992437e-05, |
| "loss": 0.7969, |
| "step": 405 |
| }, |
| { |
| "epoch": 1.2572741194486983, |
| "grad_norm": 0.6787675213944868, |
| "learning_rate": 1.8633361198632987e-05, |
| "loss": 0.8077, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.2725880551301685, |
| "grad_norm": 0.6808724401749536, |
| "learning_rate": 1.857883392940837e-05, |
| "loss": 0.8077, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.2879019908116387, |
| "grad_norm": 0.6824672080638723, |
| "learning_rate": 1.8523323093888983e-05, |
| "loss": 0.8025, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.3032159264931087, |
| "grad_norm": 0.6611632916751219, |
| "learning_rate": 1.8466835056411422e-05, |
| "loss": 0.7875, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.318529862174579, |
| "grad_norm": 0.701746007873801, |
| "learning_rate": 1.8409376293348836e-05, |
| "loss": 0.794, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.3338437978560491, |
| "grad_norm": 0.6668057827943388, |
| "learning_rate": 1.8350953392368408e-05, |
| "loss": 0.797, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.349157733537519, |
| "grad_norm": 0.7077024335619939, |
| "learning_rate": 1.8291573051676063e-05, |
| "loss": 0.8257, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.3644716692189893, |
| "grad_norm": 0.7168213984013295, |
| "learning_rate": 1.8231242079248512e-05, |
| "loss": 0.7934, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.3797856049004595, |
| "grad_norm": 0.7308258264034365, |
| "learning_rate": 1.816996739205274e-05, |
| "loss": 0.8078, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.3950995405819295, |
| "grad_norm": 0.7935757091141241, |
| "learning_rate": 1.810775601525296e-05, |
| "loss": 0.8172, |
| "step": 455 |
| }, |
| { |
| "epoch": 1.4104134762633997, |
| "grad_norm": 0.7960723044086886, |
| "learning_rate": 1.8044615081405153e-05, |
| "loss": 0.7973, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.42572741194487, |
| "grad_norm": 0.7013219126306894, |
| "learning_rate": 1.7980551829639357e-05, |
| "loss": 0.7928, |
| "step": 465 |
| }, |
| { |
| "epoch": 1.44104134762634, |
| "grad_norm": 0.7030096980783208, |
| "learning_rate": 1.7915573604829684e-05, |
| "loss": 0.7891, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.4563552833078102, |
| "grad_norm": 0.7129952798275541, |
| "learning_rate": 1.784968785675221e-05, |
| "loss": 0.7964, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.4716692189892804, |
| "grad_norm": 0.7042351661303197, |
| "learning_rate": 1.7782902139230876e-05, |
| "loss": 0.7821, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.4869831546707504, |
| "grad_norm": 0.7412846210916867, |
| "learning_rate": 1.7715224109271426e-05, |
| "loss": 0.7983, |
| "step": 485 |
| }, |
| { |
| "epoch": 1.5022970903522204, |
| "grad_norm": 0.7586380261564012, |
| "learning_rate": 1.764666152618355e-05, |
| "loss": 0.8167, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.5176110260336908, |
| "grad_norm": 0.784243612804958, |
| "learning_rate": 1.7577222250691254e-05, |
| "loss": 0.8086, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.5329249617151608, |
| "grad_norm": 0.6901397300587065, |
| "learning_rate": 1.7506914244031627e-05, |
| "loss": 0.7966, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.5329249617151608, |
| "eval_loss": 0.8608717918395996, |
| "eval_runtime": 4.0604, |
| "eval_samples_per_second": 30.539, |
| "eval_steps_per_second": 1.97, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.5482388973966308, |
| "grad_norm": 0.6826627810241662, |
| "learning_rate": 1.7435745567042096e-05, |
| "loss": 0.7871, |
| "step": 505 |
| }, |
| { |
| "epoch": 1.5635528330781012, |
| "grad_norm": 0.7329202799692207, |
| "learning_rate": 1.7363724379236237e-05, |
| "loss": 0.806, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.5788667687595712, |
| "grad_norm": 0.8605855369920612, |
| "learning_rate": 1.7290858937868296e-05, |
| "loss": 0.8134, |
| "step": 515 |
| }, |
| { |
| "epoch": 1.5941807044410412, |
| "grad_norm": 0.6963610579257828, |
| "learning_rate": 1.7217157596986474e-05, |
| "loss": 0.8057, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.6094946401225116, |
| "grad_norm": 0.715201778907277, |
| "learning_rate": 1.7142628806475144e-05, |
| "loss": 0.806, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.6248085758039816, |
| "grad_norm": 0.7191782284781755, |
| "learning_rate": 1.706728111108607e-05, |
| "loss": 0.8027, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.6401225114854516, |
| "grad_norm": 0.6872215034145709, |
| "learning_rate": 1.699112314945874e-05, |
| "loss": 0.7953, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.655436447166922, |
| "grad_norm": 0.7008995671254957, |
| "learning_rate": 1.691416365312996e-05, |
| "loss": 0.8096, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.670750382848392, |
| "grad_norm": 0.6521306963395784, |
| "learning_rate": 1.683641144553275e-05, |
| "loss": 0.7921, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.686064318529862, |
| "grad_norm": 0.6632532080286483, |
| "learning_rate": 1.675787544098477e-05, |
| "loss": 0.8056, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.7013782542113323, |
| "grad_norm": 0.6547969582532698, |
| "learning_rate": 1.667856464366626e-05, |
| "loss": 0.7819, |
| "step": 555 |
| }, |
| { |
| "epoch": 1.7166921898928025, |
| "grad_norm": 0.7046901763937415, |
| "learning_rate": 1.6598488146587733e-05, |
| "loss": 0.8061, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.7320061255742725, |
| "grad_norm": 0.6788820639858653, |
| "learning_rate": 1.6517655130547435e-05, |
| "loss": 0.7997, |
| "step": 565 |
| }, |
| { |
| "epoch": 1.7473200612557427, |
| "grad_norm": 0.6894021837769623, |
| "learning_rate": 1.6436074863078783e-05, |
| "loss": 0.803, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.762633996937213, |
| "grad_norm": 0.7360294460315284, |
| "learning_rate": 1.635375669738782e-05, |
| "loss": 0.7975, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.777947932618683, |
| "grad_norm": 0.6850098881910617, |
| "learning_rate": 1.627071007128089e-05, |
| "loss": 0.7825, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.793261868300153, |
| "grad_norm": 0.7278655150611072, |
| "learning_rate": 1.618694450608256e-05, |
| "loss": 0.7955, |
| "step": 585 |
| }, |
| { |
| "epoch": 1.8085758039816233, |
| "grad_norm": 0.7122173514963583, |
| "learning_rate": 1.6102469605544036e-05, |
| "loss": 0.808, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.8238897396630933, |
| "grad_norm": 0.6925118679066572, |
| "learning_rate": 1.6017295054742045e-05, |
| "loss": 0.7875, |
| "step": 595 |
| }, |
| { |
| "epoch": 1.8392036753445635, |
| "grad_norm": 0.6857769438421079, |
| "learning_rate": 1.5931430618968476e-05, |
| "loss": 0.7884, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.8392036753445635, |
| "eval_loss": 0.8516013622283936, |
| "eval_runtime": 4.2451, |
| "eval_samples_per_second": 29.21, |
| "eval_steps_per_second": 1.885, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.8545176110260337, |
| "grad_norm": 0.6326947636704211, |
| "learning_rate": 1.5844886142610763e-05, |
| "loss": 0.7819, |
| "step": 605 |
| }, |
| { |
| "epoch": 1.8698315467075037, |
| "grad_norm": 0.666694114425699, |
| "learning_rate": 1.575767154802323e-05, |
| "loss": 0.7777, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.885145482388974, |
| "grad_norm": 0.6182711467351976, |
| "learning_rate": 1.5669796834389496e-05, |
| "loss": 0.7889, |
| "step": 615 |
| }, |
| { |
| "epoch": 1.9004594180704442, |
| "grad_norm": 0.6402744830423595, |
| "learning_rate": 1.5581272076576047e-05, |
| "loss": 0.7839, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.9157733537519142, |
| "grad_norm": 0.6612456968952057, |
| "learning_rate": 1.5492107423977167e-05, |
| "loss": 0.7884, |
| "step": 625 |
| }, |
| { |
| "epoch": 1.9310872894333844, |
| "grad_norm": 0.6757984187665057, |
| "learning_rate": 1.5402313099351302e-05, |
| "loss": 0.7784, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.9464012251148546, |
| "grad_norm": 0.6707442527908725, |
| "learning_rate": 1.5311899397649e-05, |
| "loss": 0.7935, |
| "step": 635 |
| }, |
| { |
| "epoch": 1.9617151607963246, |
| "grad_norm": 0.787792189502549, |
| "learning_rate": 1.522087668483264e-05, |
| "loss": 0.779, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.9770290964777948, |
| "grad_norm": 0.6925473058613637, |
| "learning_rate": 1.5129255396687899e-05, |
| "loss": 0.7891, |
| "step": 645 |
| }, |
| { |
| "epoch": 1.992343032159265, |
| "grad_norm": 0.6890667970589922, |
| "learning_rate": 1.503704603762734e-05, |
| "loss": 0.7693, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.009188361408882, |
| "grad_norm": 0.8106643045129424, |
| "learning_rate": 1.4944259179486068e-05, |
| "loss": 0.9039, |
| "step": 655 |
| }, |
| { |
| "epoch": 2.0245022970903523, |
| "grad_norm": 0.7728005094053498, |
| "learning_rate": 1.4850905460309648e-05, |
| "loss": 0.7116, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.0398162327718223, |
| "grad_norm": 0.7301328106464207, |
| "learning_rate": 1.4756995583134463e-05, |
| "loss": 0.7313, |
| "step": 665 |
| }, |
| { |
| "epoch": 2.0551301684532923, |
| "grad_norm": 0.702048383564715, |
| "learning_rate": 1.4662540314760608e-05, |
| "loss": 0.7222, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.0704441041347628, |
| "grad_norm": 0.7081084536553046, |
| "learning_rate": 1.4567550484517456e-05, |
| "loss": 0.7214, |
| "step": 675 |
| }, |
| { |
| "epoch": 2.0857580398162328, |
| "grad_norm": 0.7492389283409402, |
| "learning_rate": 1.4472036983022106e-05, |
| "loss": 0.7251, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.1010719754977027, |
| "grad_norm": 0.7183839129962212, |
| "learning_rate": 1.437601076093073e-05, |
| "loss": 0.71, |
| "step": 685 |
| }, |
| { |
| "epoch": 2.116385911179173, |
| "grad_norm": 0.6904647110507196, |
| "learning_rate": 1.4279482827683095e-05, |
| "loss": 0.724, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.131699846860643, |
| "grad_norm": 0.6745352346306862, |
| "learning_rate": 1.4182464250240341e-05, |
| "loss": 0.7127, |
| "step": 695 |
| }, |
| { |
| "epoch": 2.147013782542113, |
| "grad_norm": 0.7254396412798532, |
| "learning_rate": 1.4084966151816124e-05, |
| "loss": 0.7071, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.147013782542113, |
| "eval_loss": 0.8550077676773071, |
| "eval_runtime": 3.8848, |
| "eval_samples_per_second": 31.919, |
| "eval_steps_per_second": 2.059, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.1623277182235836, |
| "grad_norm": 0.73349139216254, |
| "learning_rate": 1.3986999710601348e-05, |
| "loss": 0.7298, |
| "step": 705 |
| }, |
| { |
| "epoch": 2.1776416539050536, |
| "grad_norm": 0.6655753573647281, |
| "learning_rate": 1.3888576158482586e-05, |
| "loss": 0.7124, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.1929555895865236, |
| "grad_norm": 0.6987464871327375, |
| "learning_rate": 1.3789706779754326e-05, |
| "loss": 0.712, |
| "step": 715 |
| }, |
| { |
| "epoch": 2.208269525267994, |
| "grad_norm": 0.7104463485359258, |
| "learning_rate": 1.3690402909825245e-05, |
| "loss": 0.7256, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.223583460949464, |
| "grad_norm": 0.786597658435966, |
| "learning_rate": 1.3590675933918578e-05, |
| "loss": 0.7265, |
| "step": 725 |
| }, |
| { |
| "epoch": 2.238897396630934, |
| "grad_norm": 0.7359585689347169, |
| "learning_rate": 1.3490537285766809e-05, |
| "loss": 0.7167, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.2542113323124044, |
| "grad_norm": 0.6905264840508457, |
| "learning_rate": 1.3389998446300791e-05, |
| "loss": 0.7046, |
| "step": 735 |
| }, |
| { |
| "epoch": 2.2695252679938744, |
| "grad_norm": 0.7110824744193723, |
| "learning_rate": 1.3289070942333448e-05, |
| "loss": 0.7113, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.2848392036753444, |
| "grad_norm": 0.774570053362907, |
| "learning_rate": 1.3187766345238222e-05, |
| "loss": 0.7097, |
| "step": 745 |
| }, |
| { |
| "epoch": 2.300153139356815, |
| "grad_norm": 0.7178949231829396, |
| "learning_rate": 1.308609626962242e-05, |
| "loss": 0.6951, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.315467075038285, |
| "grad_norm": 0.6778307950742112, |
| "learning_rate": 1.2984072371995581e-05, |
| "loss": 0.7119, |
| "step": 755 |
| }, |
| { |
| "epoch": 2.330781010719755, |
| "grad_norm": 0.7381317901698428, |
| "learning_rate": 1.288170634943307e-05, |
| "loss": 0.7179, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.3460949464012253, |
| "grad_norm": 0.688339196574423, |
| "learning_rate": 1.2779009938234986e-05, |
| "loss": 0.7372, |
| "step": 765 |
| }, |
| { |
| "epoch": 2.3614088820826953, |
| "grad_norm": 0.7309911889901869, |
| "learning_rate": 1.2675994912580601e-05, |
| "loss": 0.7217, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.3767228177641653, |
| "grad_norm": 2.627453466768546, |
| "learning_rate": 1.2572673083178448e-05, |
| "loss": 0.7208, |
| "step": 775 |
| }, |
| { |
| "epoch": 2.3920367534456357, |
| "grad_norm": 0.6825595176377366, |
| "learning_rate": 1.2469056295912216e-05, |
| "loss": 0.7173, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.4073506891271057, |
| "grad_norm": 0.681764187496552, |
| "learning_rate": 1.2365156430482621e-05, |
| "loss": 0.7172, |
| "step": 785 |
| }, |
| { |
| "epoch": 2.4226646248085757, |
| "grad_norm": 0.6957949104606218, |
| "learning_rate": 1.2260985399045379e-05, |
| "loss": 0.7144, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.437978560490046, |
| "grad_norm": 0.6810069800053375, |
| "learning_rate": 1.2156555144845489e-05, |
| "loss": 0.7153, |
| "step": 795 |
| }, |
| { |
| "epoch": 2.453292496171516, |
| "grad_norm": 0.6879227501131829, |
| "learning_rate": 1.2051877640847929e-05, |
| "loss": 0.7208, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.453292496171516, |
| "eval_loss": 0.8501381278038025, |
| "eval_runtime": 3.7489, |
| "eval_samples_per_second": 33.076, |
| "eval_steps_per_second": 2.134, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.468606431852986, |
| "grad_norm": 0.7074177591745171, |
| "learning_rate": 1.1946964888364949e-05, |
| "loss": 0.7293, |
| "step": 805 |
| }, |
| { |
| "epoch": 2.4839203675344566, |
| "grad_norm": 0.7100957198483567, |
| "learning_rate": 1.1841828915680127e-05, |
| "loss": 0.7088, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.4992343032159265, |
| "grad_norm": 0.7065376294005609, |
| "learning_rate": 1.1736481776669307e-05, |
| "loss": 0.7083, |
| "step": 815 |
| }, |
| { |
| "epoch": 2.5145482388973965, |
| "grad_norm": 0.699699294689246, |
| "learning_rate": 1.1630935549418627e-05, |
| "loss": 0.7239, |
| "step": 820 |
| }, |
| { |
| "epoch": 2.5298621745788665, |
| "grad_norm": 0.6931742613638052, |
| "learning_rate": 1.1525202334839771e-05, |
| "loss": 0.7257, |
| "step": 825 |
| }, |
| { |
| "epoch": 2.545176110260337, |
| "grad_norm": 0.6626821835687099, |
| "learning_rate": 1.1419294255282574e-05, |
| "loss": 0.7158, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.560490045941807, |
| "grad_norm": 0.7476424454423357, |
| "learning_rate": 1.1313223453145202e-05, |
| "loss": 0.7231, |
| "step": 835 |
| }, |
| { |
| "epoch": 2.5758039816232774, |
| "grad_norm": 0.7121987313639168, |
| "learning_rate": 1.1207002089482026e-05, |
| "loss": 0.7272, |
| "step": 840 |
| }, |
| { |
| "epoch": 2.5911179173047474, |
| "grad_norm": 0.6842806839970864, |
| "learning_rate": 1.1100642342609352e-05, |
| "loss": 0.7215, |
| "step": 845 |
| }, |
| { |
| "epoch": 2.6064318529862174, |
| "grad_norm": 0.671282856198737, |
| "learning_rate": 1.0994156406709155e-05, |
| "loss": 0.7086, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.6217457886676874, |
| "grad_norm": 0.7127796423271462, |
| "learning_rate": 1.088755649043104e-05, |
| "loss": 0.7113, |
| "step": 855 |
| }, |
| { |
| "epoch": 2.637059724349158, |
| "grad_norm": 0.6642491628332302, |
| "learning_rate": 1.0780854815492496e-05, |
| "loss": 0.7247, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.652373660030628, |
| "grad_norm": 0.6698942933482407, |
| "learning_rate": 1.0674063615277681e-05, |
| "loss": 0.7301, |
| "step": 865 |
| }, |
| { |
| "epoch": 2.6676875957120982, |
| "grad_norm": 0.6690575090698347, |
| "learning_rate": 1.0567195133434851e-05, |
| "loss": 0.7318, |
| "step": 870 |
| }, |
| { |
| "epoch": 2.6830015313935682, |
| "grad_norm": 0.6647266330654462, |
| "learning_rate": 1.0460261622472631e-05, |
| "loss": 0.7143, |
| "step": 875 |
| }, |
| { |
| "epoch": 2.698315467075038, |
| "grad_norm": 0.6879144621587469, |
| "learning_rate": 1.0353275342355262e-05, |
| "loss": 0.7227, |
| "step": 880 |
| }, |
| { |
| "epoch": 2.713629402756508, |
| "grad_norm": 0.6775051673571305, |
| "learning_rate": 1.024624855909698e-05, |
| "loss": 0.715, |
| "step": 885 |
| }, |
| { |
| "epoch": 2.7289433384379786, |
| "grad_norm": 0.6785052740842338, |
| "learning_rate": 1.013919354335572e-05, |
| "loss": 0.7224, |
| "step": 890 |
| }, |
| { |
| "epoch": 2.7442572741194486, |
| "grad_norm": 0.6852717568024729, |
| "learning_rate": 1.0032122569026284e-05, |
| "loss": 0.7181, |
| "step": 895 |
| }, |
| { |
| "epoch": 2.759571209800919, |
| "grad_norm": 0.6966758449578118, |
| "learning_rate": 9.925047911833137e-06, |
| "loss": 0.726, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.759571209800919, |
| "eval_loss": 0.8449206352233887, |
| "eval_runtime": 3.7457, |
| "eval_samples_per_second": 33.104, |
| "eval_steps_per_second": 2.136, |
| "step": 900 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1630, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 95696971628544.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |