videomae-finetuned_41 / trainer_state.json
kharato's picture
Training in progress, epoch 0
bb463ff verified
{
"best_metric": 0.33170731707317075,
"best_model_checkpoint": "kharato/videomae-finetuned_41\\checkpoint-55494",
"epoch": 19.049691758598314,
"eval_steps": 500,
"global_step": 61640,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 10.622191429138184,
"learning_rate": 8.111615833874108e-07,
"loss": 2.4377,
"step": 200
},
{
"epoch": 0.01,
"grad_norm": 16.160688400268555,
"learning_rate": 1.6223231667748216e-06,
"loss": 1.981,
"step": 400
},
{
"epoch": 0.01,
"grad_norm": 20.07671546936035,
"learning_rate": 2.4334847501622322e-06,
"loss": 1.5439,
"step": 600
},
{
"epoch": 0.01,
"grad_norm": 14.07224178314209,
"learning_rate": 3.244646333549643e-06,
"loss": 1.2921,
"step": 800
},
{
"epoch": 0.02,
"grad_norm": 10.083611488342285,
"learning_rate": 4.055807916937054e-06,
"loss": 1.0956,
"step": 1000
},
{
"epoch": 0.02,
"grad_norm": 34.907562255859375,
"learning_rate": 4.8669695003244645e-06,
"loss": 0.9225,
"step": 1200
},
{
"epoch": 0.02,
"grad_norm": 15.239115715026855,
"learning_rate": 5.678131083711875e-06,
"loss": 0.9422,
"step": 1400
},
{
"epoch": 0.03,
"grad_norm": 38.96265411376953,
"learning_rate": 6.489292667099286e-06,
"loss": 0.7609,
"step": 1600
},
{
"epoch": 0.03,
"grad_norm": 0.6985247135162354,
"learning_rate": 7.300454250486698e-06,
"loss": 0.733,
"step": 1800
},
{
"epoch": 0.03,
"grad_norm": 0.35616084933280945,
"learning_rate": 8.111615833874107e-06,
"loss": 0.6481,
"step": 2000
},
{
"epoch": 0.04,
"grad_norm": 40.93389892578125,
"learning_rate": 8.922777417261519e-06,
"loss": 0.735,
"step": 2200
},
{
"epoch": 0.04,
"grad_norm": 0.05193087458610535,
"learning_rate": 9.733939000648929e-06,
"loss": 0.6549,
"step": 2400
},
{
"epoch": 0.04,
"grad_norm": 31.53761100769043,
"learning_rate": 1.054510058403634e-05,
"loss": 0.6235,
"step": 2600
},
{
"epoch": 0.05,
"grad_norm": 0.07755979150533676,
"learning_rate": 1.135626216742375e-05,
"loss": 0.6016,
"step": 2800
},
{
"epoch": 0.05,
"grad_norm": 7.711596965789795,
"learning_rate": 1.2167423750811163e-05,
"loss": 0.5175,
"step": 3000
},
{
"epoch": 0.05,
"eval_accuracy": 0.3008130081300813,
"eval_loss": 4.697707653045654,
"eval_runtime": 416.069,
"eval_samples_per_second": 5.912,
"eval_steps_per_second": 2.956,
"step": 3083
},
{
"epoch": 1.0,
"grad_norm": 49.3885612487793,
"learning_rate": 1.2978585334198573e-05,
"loss": 0.6634,
"step": 3200
},
{
"epoch": 1.01,
"grad_norm": 0.030238119885325432,
"learning_rate": 1.3789746917585983e-05,
"loss": 0.6283,
"step": 3400
},
{
"epoch": 1.01,
"grad_norm": 0.3487834334373474,
"learning_rate": 1.4600908500973396e-05,
"loss": 0.4957,
"step": 3600
},
{
"epoch": 1.01,
"grad_norm": 0.025577368214726448,
"learning_rate": 1.5412070084360804e-05,
"loss": 0.4649,
"step": 3800
},
{
"epoch": 1.01,
"grad_norm": 0.12513820827007294,
"learning_rate": 1.6223231667748214e-05,
"loss": 0.4836,
"step": 4000
},
{
"epoch": 1.02,
"grad_norm": 0.9465208053588867,
"learning_rate": 1.7034393251135628e-05,
"loss": 0.5041,
"step": 4200
},
{
"epoch": 1.02,
"grad_norm": 72.26908874511719,
"learning_rate": 1.7845554834523038e-05,
"loss": 0.4779,
"step": 4400
},
{
"epoch": 1.02,
"grad_norm": 0.20234385132789612,
"learning_rate": 1.865671641791045e-05,
"loss": 0.563,
"step": 4600
},
{
"epoch": 1.03,
"grad_norm": 0.036579377949237823,
"learning_rate": 1.9467878001297858e-05,
"loss": 0.4415,
"step": 4800
},
{
"epoch": 1.03,
"grad_norm": 1.33750319480896,
"learning_rate": 2.0279039584685268e-05,
"loss": 0.4937,
"step": 5000
},
{
"epoch": 1.03,
"grad_norm": 0.014029554091393948,
"learning_rate": 2.109020116807268e-05,
"loss": 0.5405,
"step": 5200
},
{
"epoch": 1.04,
"grad_norm": 52.25617218017578,
"learning_rate": 2.190136275146009e-05,
"loss": 0.5149,
"step": 5400
},
{
"epoch": 1.04,
"grad_norm": 94.20246124267578,
"learning_rate": 2.27125243348475e-05,
"loss": 0.5254,
"step": 5600
},
{
"epoch": 1.04,
"grad_norm": 1.6156303882598877,
"learning_rate": 2.3523685918234915e-05,
"loss": 0.3886,
"step": 5800
},
{
"epoch": 1.05,
"grad_norm": 0.011294134892523289,
"learning_rate": 2.4334847501622325e-05,
"loss": 0.5311,
"step": 6000
},
{
"epoch": 1.05,
"eval_accuracy": 0.308130081300813,
"eval_loss": 5.601172924041748,
"eval_runtime": 435.0935,
"eval_samples_per_second": 5.654,
"eval_steps_per_second": 2.827,
"step": 6166
},
{
"epoch": 2.0,
"grad_norm": 29.597869873046875,
"learning_rate": 2.5146009085009735e-05,
"loss": 0.5912,
"step": 6200
},
{
"epoch": 2.0,
"grad_norm": 0.028715774416923523,
"learning_rate": 2.5957170668397145e-05,
"loss": 0.3763,
"step": 6400
},
{
"epoch": 2.01,
"grad_norm": 0.07937850058078766,
"learning_rate": 2.6768332251784555e-05,
"loss": 0.4722,
"step": 6600
},
{
"epoch": 2.01,
"grad_norm": 0.005830916576087475,
"learning_rate": 2.7579493835171965e-05,
"loss": 0.5422,
"step": 6800
},
{
"epoch": 2.01,
"grad_norm": 0.1301320195198059,
"learning_rate": 2.8390655418559382e-05,
"loss": 0.624,
"step": 7000
},
{
"epoch": 2.02,
"grad_norm": 0.011467114090919495,
"learning_rate": 2.9201817001946792e-05,
"loss": 0.3777,
"step": 7200
},
{
"epoch": 2.02,
"grad_norm": 0.0031489874236285686,
"learning_rate": 3.0012978585334202e-05,
"loss": 0.5277,
"step": 7400
},
{
"epoch": 2.02,
"grad_norm": 16.498056411743164,
"learning_rate": 3.082414016872161e-05,
"loss": 0.6094,
"step": 7600
},
{
"epoch": 2.03,
"grad_norm": 4.466196060180664,
"learning_rate": 3.163530175210902e-05,
"loss": 0.5631,
"step": 7800
},
{
"epoch": 2.03,
"grad_norm": 0.01907368004322052,
"learning_rate": 3.244646333549643e-05,
"loss": 0.4888,
"step": 8000
},
{
"epoch": 2.03,
"grad_norm": 0.035668205469846725,
"learning_rate": 3.325762491888384e-05,
"loss": 0.6714,
"step": 8200
},
{
"epoch": 2.04,
"grad_norm": 0.12581172585487366,
"learning_rate": 3.4068786502271256e-05,
"loss": 0.542,
"step": 8400
},
{
"epoch": 2.04,
"grad_norm": 97.48197174072266,
"learning_rate": 3.487994808565866e-05,
"loss": 0.4788,
"step": 8600
},
{
"epoch": 2.04,
"grad_norm": 3.1435787677764893,
"learning_rate": 3.5691109669046076e-05,
"loss": 0.5118,
"step": 8800
},
{
"epoch": 2.05,
"grad_norm": 0.08236628770828247,
"learning_rate": 3.650227125243348e-05,
"loss": 0.5886,
"step": 9000
},
{
"epoch": 2.05,
"grad_norm": 0.015331330709159374,
"learning_rate": 3.73134328358209e-05,
"loss": 0.5884,
"step": 9200
},
{
"epoch": 2.05,
"eval_accuracy": 0.317479674796748,
"eval_loss": 6.225230693817139,
"eval_runtime": 413.95,
"eval_samples_per_second": 5.943,
"eval_steps_per_second": 2.971,
"step": 9249
},
{
"epoch": 3.0,
"grad_norm": 1.9976195096969604,
"learning_rate": 3.812459441920831e-05,
"loss": 0.4934,
"step": 9400
},
{
"epoch": 3.01,
"grad_norm": 122.0944595336914,
"learning_rate": 3.8935756002595716e-05,
"loss": 0.6045,
"step": 9600
},
{
"epoch": 3.01,
"grad_norm": 0.06623541563749313,
"learning_rate": 3.974691758598313e-05,
"loss": 0.4791,
"step": 9800
},
{
"epoch": 3.01,
"grad_norm": 66.04131317138672,
"learning_rate": 4.0558079169370536e-05,
"loss": 0.6775,
"step": 10000
},
{
"epoch": 3.02,
"grad_norm": 55.43952178955078,
"learning_rate": 4.1369240752757956e-05,
"loss": 0.5957,
"step": 10200
},
{
"epoch": 3.02,
"grad_norm": 0.3894006311893463,
"learning_rate": 4.218040233614536e-05,
"loss": 0.4787,
"step": 10400
},
{
"epoch": 3.02,
"grad_norm": 0.6404015421867371,
"learning_rate": 4.2991563919532776e-05,
"loss": 0.5884,
"step": 10600
},
{
"epoch": 3.03,
"grad_norm": 0.4849078357219696,
"learning_rate": 4.380272550292018e-05,
"loss": 0.5479,
"step": 10800
},
{
"epoch": 3.03,
"grad_norm": 47.85725402832031,
"learning_rate": 4.461388708630759e-05,
"loss": 0.6813,
"step": 11000
},
{
"epoch": 3.03,
"grad_norm": 34.00189971923828,
"learning_rate": 4.5425048669695e-05,
"loss": 0.6548,
"step": 11200
},
{
"epoch": 3.03,
"grad_norm": 0.06875142455101013,
"learning_rate": 4.6236210253082417e-05,
"loss": 0.549,
"step": 11400
},
{
"epoch": 3.04,
"grad_norm": 30.920330047607422,
"learning_rate": 4.704737183646983e-05,
"loss": 0.4287,
"step": 11600
},
{
"epoch": 3.04,
"grad_norm": 0.029797730967402458,
"learning_rate": 4.785853341985724e-05,
"loss": 0.3832,
"step": 11800
},
{
"epoch": 3.04,
"grad_norm": 143.6451873779297,
"learning_rate": 4.866969500324465e-05,
"loss": 0.6604,
"step": 12000
},
{
"epoch": 3.05,
"grad_norm": 16.26555824279785,
"learning_rate": 4.948085658663206e-05,
"loss": 0.5206,
"step": 12200
},
{
"epoch": 3.05,
"eval_accuracy": 0.32479674796747965,
"eval_loss": 6.791728496551514,
"eval_runtime": 418.2496,
"eval_samples_per_second": 5.882,
"eval_steps_per_second": 2.941,
"step": 12332
},
{
"epoch": 4.0,
"grad_norm": 95.94632720947266,
"learning_rate": 4.992699545749514e-05,
"loss": 0.5541,
"step": 12400
},
{
"epoch": 4.0,
"grad_norm": 0.03315176069736481,
"learning_rate": 4.9724205061648285e-05,
"loss": 0.6088,
"step": 12600
},
{
"epoch": 4.01,
"grad_norm": 0.04930103197693825,
"learning_rate": 4.952141466580143e-05,
"loss": 0.5268,
"step": 12800
},
{
"epoch": 4.01,
"grad_norm": 0.12131338566541672,
"learning_rate": 4.9318624269954575e-05,
"loss": 0.4941,
"step": 13000
},
{
"epoch": 4.01,
"grad_norm": 18.080978393554688,
"learning_rate": 4.9115833874107724e-05,
"loss": 0.663,
"step": 13200
},
{
"epoch": 4.02,
"grad_norm": 26.085647583007812,
"learning_rate": 4.891304347826087e-05,
"loss": 0.6625,
"step": 13400
},
{
"epoch": 4.02,
"grad_norm": 2.3316972255706787,
"learning_rate": 4.871025308241402e-05,
"loss": 0.6131,
"step": 13600
},
{
"epoch": 4.02,
"grad_norm": 8.68952751159668,
"learning_rate": 4.850746268656717e-05,
"loss": 0.6193,
"step": 13800
},
{
"epoch": 4.03,
"grad_norm": 0.022995395585894585,
"learning_rate": 4.830467229072032e-05,
"loss": 0.6274,
"step": 14000
},
{
"epoch": 4.03,
"grad_norm": 0.037255994975566864,
"learning_rate": 4.8101881894873465e-05,
"loss": 0.5041,
"step": 14200
},
{
"epoch": 4.03,
"grad_norm": 0.034200407564640045,
"learning_rate": 4.7899091499026614e-05,
"loss": 0.6563,
"step": 14400
},
{
"epoch": 4.04,
"grad_norm": 0.019007038325071335,
"learning_rate": 4.7696301103179755e-05,
"loss": 0.6542,
"step": 14600
},
{
"epoch": 4.04,
"grad_norm": 0.019861804321408272,
"learning_rate": 4.7493510707332904e-05,
"loss": 0.4505,
"step": 14800
},
{
"epoch": 4.04,
"grad_norm": 7.428478240966797,
"learning_rate": 4.7290720311486045e-05,
"loss": 0.5076,
"step": 15000
},
{
"epoch": 4.05,
"grad_norm": 0.04943707585334778,
"learning_rate": 4.7087929915639194e-05,
"loss": 0.5715,
"step": 15200
},
{
"epoch": 4.05,
"grad_norm": 0.3862577974796295,
"learning_rate": 4.688513951979234e-05,
"loss": 0.4449,
"step": 15400
},
{
"epoch": 4.05,
"eval_accuracy": 0.3008130081300813,
"eval_loss": 6.194313049316406,
"eval_runtime": 417.8428,
"eval_samples_per_second": 5.887,
"eval_steps_per_second": 2.944,
"step": 15415
},
{
"epoch": 5.0,
"grad_norm": 0.017423830926418304,
"learning_rate": 4.668234912394549e-05,
"loss": 0.5838,
"step": 15600
},
{
"epoch": 5.01,
"grad_norm": 0.0035349351819604635,
"learning_rate": 4.647955872809864e-05,
"loss": 0.3539,
"step": 15800
},
{
"epoch": 5.01,
"grad_norm": 0.009847632609307766,
"learning_rate": 4.627676833225179e-05,
"loss": 0.5626,
"step": 16000
},
{
"epoch": 5.01,
"grad_norm": 4.297732353210449,
"learning_rate": 4.6073977936404935e-05,
"loss": 0.4804,
"step": 16200
},
{
"epoch": 5.02,
"grad_norm": 1.121057152748108,
"learning_rate": 4.5871187540558084e-05,
"loss": 0.4478,
"step": 16400
},
{
"epoch": 5.02,
"grad_norm": 4.427486896514893,
"learning_rate": 4.5668397144711225e-05,
"loss": 0.5708,
"step": 16600
},
{
"epoch": 5.02,
"grad_norm": 9.073264122009277,
"learning_rate": 4.5465606748864373e-05,
"loss": 0.5403,
"step": 16800
},
{
"epoch": 5.03,
"grad_norm": 0.024741439148783684,
"learning_rate": 4.526281635301752e-05,
"loss": 0.4263,
"step": 17000
},
{
"epoch": 5.03,
"grad_norm": 0.03147607669234276,
"learning_rate": 4.506002595717067e-05,
"loss": 0.3275,
"step": 17200
},
{
"epoch": 5.03,
"grad_norm": 0.013691963627934456,
"learning_rate": 4.485723556132382e-05,
"loss": 0.5028,
"step": 17400
},
{
"epoch": 5.04,
"grad_norm": 0.08244924992322922,
"learning_rate": 4.465444516547697e-05,
"loss": 0.5051,
"step": 17600
},
{
"epoch": 5.04,
"grad_norm": 0.012058720923960209,
"learning_rate": 4.4451654769630115e-05,
"loss": 0.4097,
"step": 17800
},
{
"epoch": 5.04,
"grad_norm": 0.0613565556704998,
"learning_rate": 4.4248864373783263e-05,
"loss": 0.4333,
"step": 18000
},
{
"epoch": 5.05,
"grad_norm": 39.56247329711914,
"learning_rate": 4.404607397793641e-05,
"loss": 0.598,
"step": 18200
},
{
"epoch": 5.05,
"grad_norm": 15.885381698608398,
"learning_rate": 4.384328358208955e-05,
"loss": 0.3783,
"step": 18400
},
{
"epoch": 5.05,
"eval_accuracy": 0.3150406504065041,
"eval_loss": 6.833878993988037,
"eval_runtime": 418.6571,
"eval_samples_per_second": 5.876,
"eval_steps_per_second": 2.938,
"step": 18498
},
{
"epoch": 6.0,
"grad_norm": 0.02771819196641445,
"learning_rate": 4.36404931862427e-05,
"loss": 0.4326,
"step": 18600
},
{
"epoch": 6.0,
"grad_norm": 0.018607838079333305,
"learning_rate": 4.343770279039585e-05,
"loss": 0.4859,
"step": 18800
},
{
"epoch": 6.01,
"grad_norm": 0.5698729157447815,
"learning_rate": 4.3234912394549e-05,
"loss": 0.4803,
"step": 19000
},
{
"epoch": 6.01,
"grad_norm": 0.041256897151470184,
"learning_rate": 4.303212199870215e-05,
"loss": 0.4549,
"step": 19200
},
{
"epoch": 6.01,
"grad_norm": 27.683008193969727,
"learning_rate": 4.282933160285529e-05,
"loss": 0.5179,
"step": 19400
},
{
"epoch": 6.02,
"grad_norm": 0.08311375975608826,
"learning_rate": 4.2626541207008437e-05,
"loss": 0.3603,
"step": 19600
},
{
"epoch": 6.02,
"grad_norm": 0.19924193620681763,
"learning_rate": 4.2423750811161585e-05,
"loss": 0.4939,
"step": 19800
},
{
"epoch": 6.02,
"grad_norm": 0.035903461277484894,
"learning_rate": 4.222096041531473e-05,
"loss": 0.4792,
"step": 20000
},
{
"epoch": 6.03,
"grad_norm": 0.004007269628345966,
"learning_rate": 4.201817001946788e-05,
"loss": 0.453,
"step": 20200
},
{
"epoch": 6.03,
"grad_norm": 0.02423214167356491,
"learning_rate": 4.181537962362102e-05,
"loss": 0.3846,
"step": 20400
},
{
"epoch": 6.03,
"grad_norm": 0.023962557315826416,
"learning_rate": 4.161258922777417e-05,
"loss": 0.4691,
"step": 20600
},
{
"epoch": 6.04,
"grad_norm": 0.08679631352424622,
"learning_rate": 4.140979883192732e-05,
"loss": 0.484,
"step": 20800
},
{
"epoch": 6.04,
"grad_norm": 55.87190246582031,
"learning_rate": 4.120700843608047e-05,
"loss": 0.3933,
"step": 21000
},
{
"epoch": 6.04,
"grad_norm": 0.0048807836137712,
"learning_rate": 4.1004218040233617e-05,
"loss": 0.3291,
"step": 21200
},
{
"epoch": 6.05,
"grad_norm": 0.015589645132422447,
"learning_rate": 4.0801427644386765e-05,
"loss": 0.5032,
"step": 21400
},
{
"epoch": 6.05,
"eval_accuracy": 0.30772357723577237,
"eval_loss": 6.656611919403076,
"eval_runtime": 419.2486,
"eval_samples_per_second": 5.868,
"eval_steps_per_second": 2.934,
"step": 21581
},
{
"epoch": 7.0,
"grad_norm": 0.02622975967824459,
"learning_rate": 4.059863724853991e-05,
"loss": 0.4267,
"step": 21600
},
{
"epoch": 7.0,
"grad_norm": 0.039106499403715134,
"learning_rate": 4.039584685269306e-05,
"loss": 0.3826,
"step": 21800
},
{
"epoch": 7.01,
"grad_norm": 0.012655826285481453,
"learning_rate": 4.019305645684621e-05,
"loss": 0.3964,
"step": 22000
},
{
"epoch": 7.01,
"grad_norm": 0.0034593914169818163,
"learning_rate": 3.999026606099935e-05,
"loss": 0.4858,
"step": 22200
},
{
"epoch": 7.01,
"grad_norm": 0.3785853087902069,
"learning_rate": 3.97874756651525e-05,
"loss": 0.4492,
"step": 22400
},
{
"epoch": 7.02,
"grad_norm": 0.09961965680122375,
"learning_rate": 3.958468526930565e-05,
"loss": 0.3808,
"step": 22600
},
{
"epoch": 7.02,
"grad_norm": 31.060867309570312,
"learning_rate": 3.9381894873458796e-05,
"loss": 0.3712,
"step": 22800
},
{
"epoch": 7.02,
"grad_norm": 0.023167919367551804,
"learning_rate": 3.9179104477611945e-05,
"loss": 0.4784,
"step": 23000
},
{
"epoch": 7.03,
"grad_norm": 0.030744561925530434,
"learning_rate": 3.897631408176509e-05,
"loss": 0.3833,
"step": 23200
},
{
"epoch": 7.03,
"grad_norm": 49.66160202026367,
"learning_rate": 3.877352368591824e-05,
"loss": 0.3776,
"step": 23400
},
{
"epoch": 7.03,
"grad_norm": 0.0036757669877260923,
"learning_rate": 3.857073329007138e-05,
"loss": 0.4413,
"step": 23600
},
{
"epoch": 7.04,
"grad_norm": 0.24145947396755219,
"learning_rate": 3.836794289422453e-05,
"loss": 0.3737,
"step": 23800
},
{
"epoch": 7.04,
"grad_norm": 0.36743244528770447,
"learning_rate": 3.816515249837768e-05,
"loss": 0.3371,
"step": 24000
},
{
"epoch": 7.04,
"grad_norm": 0.5987332463264465,
"learning_rate": 3.796236210253082e-05,
"loss": 0.4583,
"step": 24200
},
{
"epoch": 7.05,
"grad_norm": 0.009368489496409893,
"learning_rate": 3.775957170668397e-05,
"loss": 0.2901,
"step": 24400
},
{
"epoch": 7.05,
"grad_norm": 0.033730778843164444,
"learning_rate": 3.755678131083712e-05,
"loss": 0.4091,
"step": 24600
},
{
"epoch": 7.05,
"eval_accuracy": 0.29715447154471547,
"eval_loss": 6.801322937011719,
"eval_runtime": 420.5633,
"eval_samples_per_second": 5.849,
"eval_steps_per_second": 2.925,
"step": 24664
},
{
"epoch": 8.0,
"grad_norm": 0.5106008648872375,
"learning_rate": 3.7353990914990266e-05,
"loss": 0.4906,
"step": 24800
},
{
"epoch": 8.01,
"grad_norm": 0.027176540344953537,
"learning_rate": 3.7151200519143415e-05,
"loss": 0.3089,
"step": 25000
},
{
"epoch": 8.01,
"grad_norm": 0.005631732754409313,
"learning_rate": 3.694841012329656e-05,
"loss": 0.3547,
"step": 25200
},
{
"epoch": 8.01,
"grad_norm": 0.004955723416060209,
"learning_rate": 3.674561972744971e-05,
"loss": 0.3477,
"step": 25400
},
{
"epoch": 8.02,
"grad_norm": 0.23866023123264313,
"learning_rate": 3.654282933160286e-05,
"loss": 0.3229,
"step": 25600
},
{
"epoch": 8.02,
"grad_norm": 0.025327768176794052,
"learning_rate": 3.6340038935756e-05,
"loss": 0.3664,
"step": 25800
},
{
"epoch": 8.02,
"grad_norm": 0.004922116175293922,
"learning_rate": 3.613724853990915e-05,
"loss": 0.3222,
"step": 26000
},
{
"epoch": 8.02,
"grad_norm": 0.49947389960289,
"learning_rate": 3.59344581440623e-05,
"loss": 0.3044,
"step": 26200
},
{
"epoch": 8.03,
"grad_norm": 2.2153525352478027,
"learning_rate": 3.5731667748215446e-05,
"loss": 0.3326,
"step": 26400
},
{
"epoch": 8.03,
"grad_norm": 62.326072692871094,
"learning_rate": 3.5528877352368594e-05,
"loss": 0.4252,
"step": 26600
},
{
"epoch": 8.03,
"grad_norm": 0.028839513659477234,
"learning_rate": 3.532608695652174e-05,
"loss": 0.3653,
"step": 26800
},
{
"epoch": 8.04,
"grad_norm": 110.81560516357422,
"learning_rate": 3.512329656067489e-05,
"loss": 0.3337,
"step": 27000
},
{
"epoch": 8.04,
"grad_norm": 0.008337341248989105,
"learning_rate": 3.492050616482804e-05,
"loss": 0.2245,
"step": 27200
},
{
"epoch": 8.04,
"grad_norm": 0.019596580415964127,
"learning_rate": 3.471771576898119e-05,
"loss": 0.2529,
"step": 27400
},
{
"epoch": 8.05,
"grad_norm": 0.005616022273898125,
"learning_rate": 3.451492537313433e-05,
"loss": 0.4436,
"step": 27600
},
{
"epoch": 8.05,
"eval_accuracy": 0.3,
"eval_loss": 6.854862213134766,
"eval_runtime": 419.8758,
"eval_samples_per_second": 5.859,
"eval_steps_per_second": 2.929,
"step": 27747
},
{
"epoch": 9.0,
"grad_norm": 0.022624719887971878,
"learning_rate": 3.431213497728748e-05,
"loss": 0.3726,
"step": 27800
},
{
"epoch": 9.0,
"grad_norm": 0.001994416816160083,
"learning_rate": 3.410934458144062e-05,
"loss": 0.1703,
"step": 28000
},
{
"epoch": 9.01,
"grad_norm": 48.61260986328125,
"learning_rate": 3.390655418559377e-05,
"loss": 0.2406,
"step": 28200
},
{
"epoch": 9.01,
"grad_norm": 0.03608255833387375,
"learning_rate": 3.3703763789746916e-05,
"loss": 0.3916,
"step": 28400
},
{
"epoch": 9.01,
"grad_norm": 0.005167305935174227,
"learning_rate": 3.3500973393900064e-05,
"loss": 0.3041,
"step": 28600
},
{
"epoch": 9.02,
"grad_norm": 0.09229105710983276,
"learning_rate": 3.329818299805321e-05,
"loss": 0.2965,
"step": 28800
},
{
"epoch": 9.02,
"grad_norm": 0.0017744365613907576,
"learning_rate": 3.309539260220636e-05,
"loss": 0.1857,
"step": 29000
},
{
"epoch": 9.02,
"grad_norm": 0.0021416887175291777,
"learning_rate": 3.289260220635951e-05,
"loss": 0.3291,
"step": 29200
},
{
"epoch": 9.03,
"grad_norm": 0.02957375906407833,
"learning_rate": 3.268981181051266e-05,
"loss": 0.3716,
"step": 29400
},
{
"epoch": 9.03,
"grad_norm": 0.04252824932336807,
"learning_rate": 3.24870214146658e-05,
"loss": 0.4442,
"step": 29600
},
{
"epoch": 9.03,
"grad_norm": 0.00921566691249609,
"learning_rate": 3.228423101881895e-05,
"loss": 0.3244,
"step": 29800
},
{
"epoch": 9.04,
"grad_norm": 0.02388688549399376,
"learning_rate": 3.2081440622972096e-05,
"loss": 0.4729,
"step": 30000
},
{
"epoch": 9.04,
"grad_norm": 0.02142206020653248,
"learning_rate": 3.1878650227125244e-05,
"loss": 0.3775,
"step": 30200
},
{
"epoch": 9.04,
"grad_norm": 110.55142211914062,
"learning_rate": 3.167585983127839e-05,
"loss": 0.1999,
"step": 30400
},
{
"epoch": 9.05,
"grad_norm": 0.7394188642501831,
"learning_rate": 3.147306943543154e-05,
"loss": 0.3141,
"step": 30600
},
{
"epoch": 9.05,
"grad_norm": 0.06363216042518616,
"learning_rate": 3.127027903958469e-05,
"loss": 0.3474,
"step": 30800
},
{
"epoch": 9.05,
"eval_accuracy": 0.32682926829268294,
"eval_loss": 7.001511573791504,
"eval_runtime": 421.6251,
"eval_samples_per_second": 5.835,
"eval_steps_per_second": 2.917,
"step": 30830
},
{
"epoch": 10.0,
"grad_norm": 0.17996995151042938,
"learning_rate": 3.106748864373784e-05,
"loss": 0.2746,
"step": 31000
},
{
"epoch": 10.01,
"grad_norm": 0.007576479576528072,
"learning_rate": 3.0864698247890986e-05,
"loss": 0.2044,
"step": 31200
},
{
"epoch": 10.01,
"grad_norm": 0.013998846523463726,
"learning_rate": 3.066190785204413e-05,
"loss": 0.2771,
"step": 31400
},
{
"epoch": 10.01,
"grad_norm": 0.0031363102607429028,
"learning_rate": 3.045911745619728e-05,
"loss": 0.3009,
"step": 31600
},
{
"epoch": 10.02,
"grad_norm": 0.006260915659368038,
"learning_rate": 3.0256327060350424e-05,
"loss": 0.3288,
"step": 31800
},
{
"epoch": 10.02,
"grad_norm": 0.08372914791107178,
"learning_rate": 3.0053536664503572e-05,
"loss": 0.3404,
"step": 32000
},
{
"epoch": 10.02,
"grad_norm": 91.87712860107422,
"learning_rate": 2.9850746268656714e-05,
"loss": 0.429,
"step": 32200
},
{
"epoch": 10.03,
"grad_norm": 0.0794682428240776,
"learning_rate": 2.9647955872809862e-05,
"loss": 0.3603,
"step": 32400
},
{
"epoch": 10.03,
"grad_norm": 0.054609477519989014,
"learning_rate": 2.944516547696301e-05,
"loss": 0.2766,
"step": 32600
},
{
"epoch": 10.03,
"grad_norm": 0.04762391373515129,
"learning_rate": 2.924237508111616e-05,
"loss": 0.229,
"step": 32800
},
{
"epoch": 10.04,
"grad_norm": 0.004877444822341204,
"learning_rate": 2.9039584685269304e-05,
"loss": 0.3206,
"step": 33000
},
{
"epoch": 10.04,
"grad_norm": 0.00354503421112895,
"learning_rate": 2.8836794289422452e-05,
"loss": 0.2281,
"step": 33200
},
{
"epoch": 10.04,
"grad_norm": 0.0026297084987163544,
"learning_rate": 2.86340038935756e-05,
"loss": 0.2265,
"step": 33400
},
{
"epoch": 10.04,
"grad_norm": 59.78186798095703,
"learning_rate": 2.843121349772875e-05,
"loss": 0.3863,
"step": 33600
},
{
"epoch": 10.05,
"grad_norm": 0.036124564707279205,
"learning_rate": 2.8228423101881897e-05,
"loss": 0.2151,
"step": 33800
},
{
"epoch": 10.05,
"eval_accuracy": 0.3040650406504065,
"eval_loss": 7.767071723937988,
"eval_runtime": 419.266,
"eval_samples_per_second": 5.867,
"eval_steps_per_second": 2.934,
"step": 33913
},
{
"epoch": 11.0,
"grad_norm": 0.011912085115909576,
"learning_rate": 2.8025632706035042e-05,
"loss": 0.1451,
"step": 34000
},
{
"epoch": 11.0,
"grad_norm": 0.016937492415308952,
"learning_rate": 2.782284231018819e-05,
"loss": 0.3187,
"step": 34200
},
{
"epoch": 11.01,
"grad_norm": 1.2451800107955933,
"learning_rate": 2.762005191434134e-05,
"loss": 0.2229,
"step": 34400
},
{
"epoch": 11.01,
"grad_norm": 0.004575447645038366,
"learning_rate": 2.7417261518494487e-05,
"loss": 0.2205,
"step": 34600
},
{
"epoch": 11.01,
"grad_norm": 0.004918406717479229,
"learning_rate": 2.7214471122647632e-05,
"loss": 0.1633,
"step": 34800
},
{
"epoch": 11.02,
"grad_norm": 0.14480261504650116,
"learning_rate": 2.701168072680078e-05,
"loss": 0.2251,
"step": 35000
},
{
"epoch": 11.02,
"grad_norm": 0.004981683101505041,
"learning_rate": 2.680889033095393e-05,
"loss": 0.1801,
"step": 35200
},
{
"epoch": 11.02,
"grad_norm": 0.008614394813776016,
"learning_rate": 2.6606099935107077e-05,
"loss": 0.3143,
"step": 35400
},
{
"epoch": 11.03,
"grad_norm": 0.40735259652137756,
"learning_rate": 2.6403309539260222e-05,
"loss": 0.3673,
"step": 35600
},
{
"epoch": 11.03,
"grad_norm": 0.014218580909073353,
"learning_rate": 2.620051914341337e-05,
"loss": 0.2534,
"step": 35800
},
{
"epoch": 11.03,
"grad_norm": 0.021330924704670906,
"learning_rate": 2.599772874756652e-05,
"loss": 0.263,
"step": 36000
},
{
"epoch": 11.04,
"grad_norm": 0.013331553898751736,
"learning_rate": 2.5794938351719667e-05,
"loss": 0.3093,
"step": 36200
},
{
"epoch": 11.04,
"grad_norm": 0.43031027913093567,
"learning_rate": 2.559214795587281e-05,
"loss": 0.3005,
"step": 36400
},
{
"epoch": 11.04,
"grad_norm": 47.17981719970703,
"learning_rate": 2.5389357560025957e-05,
"loss": 0.1316,
"step": 36600
},
{
"epoch": 11.05,
"grad_norm": 96.86531829833984,
"learning_rate": 2.5186567164179102e-05,
"loss": 0.3597,
"step": 36800
},
{
"epoch": 11.05,
"eval_accuracy": 0.32926829268292684,
"eval_loss": 7.072375297546387,
"eval_runtime": 419.8993,
"eval_samples_per_second": 5.859,
"eval_steps_per_second": 2.929,
"step": 36996
},
{
"epoch": 12.0,
"grad_norm": 0.014351383782923222,
"learning_rate": 2.4983776768332254e-05,
"loss": 0.1933,
"step": 37000
},
{
"epoch": 12.0,
"grad_norm": 0.07165095955133438,
"learning_rate": 2.4780986372485402e-05,
"loss": 0.1762,
"step": 37200
},
{
"epoch": 12.01,
"grad_norm": 0.004130370914936066,
"learning_rate": 2.457819597663855e-05,
"loss": 0.2113,
"step": 37400
},
{
"epoch": 12.01,
"grad_norm": 0.0024869125336408615,
"learning_rate": 2.4375405580791695e-05,
"loss": 0.274,
"step": 37600
},
{
"epoch": 12.01,
"grad_norm": 0.00669575622305274,
"learning_rate": 2.417261518494484e-05,
"loss": 0.1877,
"step": 37800
},
{
"epoch": 12.02,
"grad_norm": 0.2720281183719635,
"learning_rate": 2.396982478909799e-05,
"loss": 0.2101,
"step": 38000
},
{
"epoch": 12.02,
"grad_norm": 0.00792625080794096,
"learning_rate": 2.3767034393251137e-05,
"loss": 0.2125,
"step": 38200
},
{
"epoch": 12.02,
"grad_norm": 0.0033484152518212795,
"learning_rate": 2.3564243997404285e-05,
"loss": 0.115,
"step": 38400
},
{
"epoch": 12.03,
"grad_norm": 0.0022839007433503866,
"learning_rate": 2.336145360155743e-05,
"loss": 0.1773,
"step": 38600
},
{
"epoch": 12.03,
"grad_norm": 0.0013085936661809683,
"learning_rate": 2.315866320571058e-05,
"loss": 0.1654,
"step": 38800
},
{
"epoch": 12.03,
"grad_norm": 0.023824598640203476,
"learning_rate": 2.2955872809863727e-05,
"loss": 0.2326,
"step": 39000
},
{
"epoch": 12.04,
"grad_norm": 0.002132503315806389,
"learning_rate": 2.2753082414016875e-05,
"loss": 0.2223,
"step": 39200
},
{
"epoch": 12.04,
"grad_norm": 0.0020501285325735807,
"learning_rate": 2.255029201817002e-05,
"loss": 0.2558,
"step": 39400
},
{
"epoch": 12.04,
"grad_norm": 0.03084419295191765,
"learning_rate": 2.2347501622323165e-05,
"loss": 0.2664,
"step": 39600
},
{
"epoch": 12.05,
"grad_norm": 0.026181140914559364,
"learning_rate": 2.2144711226476314e-05,
"loss": 0.3349,
"step": 39800
},
{
"epoch": 12.05,
"grad_norm": 62.58173751831055,
"learning_rate": 2.1941920830629462e-05,
"loss": 0.1673,
"step": 40000
},
{
"epoch": 12.05,
"eval_accuracy": 0.32479674796747965,
"eval_loss": 7.580522537231445,
"eval_runtime": 419.9065,
"eval_samples_per_second": 5.858,
"eval_steps_per_second": 2.929,
"step": 40079
},
{
"epoch": 13.0,
"grad_norm": 0.0021006192546337843,
"learning_rate": 2.173913043478261e-05,
"loss": 0.1841,
"step": 40200
},
{
"epoch": 13.01,
"grad_norm": 0.0030403181444853544,
"learning_rate": 2.1536340038935755e-05,
"loss": 0.2535,
"step": 40400
},
{
"epoch": 13.01,
"grad_norm": 0.008125518448650837,
"learning_rate": 2.1333549643088903e-05,
"loss": 0.2208,
"step": 40600
},
{
"epoch": 13.01,
"grad_norm": 0.007459101267158985,
"learning_rate": 2.1130759247242052e-05,
"loss": 0.2123,
"step": 40800
},
{
"epoch": 13.01,
"grad_norm": 0.0015231677098199725,
"learning_rate": 2.09279688513952e-05,
"loss": 0.2145,
"step": 41000
},
{
"epoch": 13.02,
"grad_norm": 0.017687492072582245,
"learning_rate": 2.072517845554835e-05,
"loss": 0.1135,
"step": 41200
},
{
"epoch": 13.02,
"grad_norm": 0.16305746138095856,
"learning_rate": 2.0522388059701493e-05,
"loss": 0.1654,
"step": 41400
},
{
"epoch": 13.02,
"grad_norm": 0.0016574672190472484,
"learning_rate": 2.0319597663854642e-05,
"loss": 0.2207,
"step": 41600
},
{
"epoch": 13.03,
"grad_norm": 0.007612856104969978,
"learning_rate": 2.0116807268007787e-05,
"loss": 0.1663,
"step": 41800
},
{
"epoch": 13.03,
"grad_norm": 0.0031971693970263004,
"learning_rate": 1.9914016872160935e-05,
"loss": 0.1238,
"step": 42000
},
{
"epoch": 13.03,
"grad_norm": 1.114168643951416,
"learning_rate": 1.9711226476314083e-05,
"loss": 0.1508,
"step": 42200
},
{
"epoch": 13.04,
"grad_norm": 0.00322701339609921,
"learning_rate": 1.950843608046723e-05,
"loss": 0.0757,
"step": 42400
},
{
"epoch": 13.04,
"grad_norm": 0.00376499374397099,
"learning_rate": 1.9305645684620377e-05,
"loss": 0.1904,
"step": 42600
},
{
"epoch": 13.04,
"grad_norm": 0.006022193934768438,
"learning_rate": 1.9102855288773525e-05,
"loss": 0.2528,
"step": 42800
},
{
"epoch": 13.05,
"grad_norm": 0.003072307910770178,
"learning_rate": 1.8900064892926673e-05,
"loss": 0.114,
"step": 43000
},
{
"epoch": 13.05,
"eval_accuracy": 0.317479674796748,
"eval_loss": 7.819610595703125,
"eval_runtime": 419.7353,
"eval_samples_per_second": 5.861,
"eval_steps_per_second": 2.93,
"step": 43162
},
{
"epoch": 14.0,
"grad_norm": 85.4753646850586,
"learning_rate": 1.8697274497079818e-05,
"loss": 0.155,
"step": 43200
},
{
"epoch": 14.0,
"grad_norm": 0.034654486924409866,
"learning_rate": 1.8494484101232967e-05,
"loss": 0.1435,
"step": 43400
},
{
"epoch": 14.01,
"grad_norm": 0.004176551941782236,
"learning_rate": 1.8291693705386115e-05,
"loss": 0.1181,
"step": 43600
},
{
"epoch": 14.01,
"grad_norm": 0.0013031965354457498,
"learning_rate": 1.8088903309539263e-05,
"loss": 0.2033,
"step": 43800
},
{
"epoch": 14.01,
"grad_norm": 0.0004607917508110404,
"learning_rate": 1.7886112913692408e-05,
"loss": 0.2331,
"step": 44000
},
{
"epoch": 14.02,
"grad_norm": 2.1360397338867188,
"learning_rate": 1.7683322517845553e-05,
"loss": 0.1684,
"step": 44200
},
{
"epoch": 14.02,
"grad_norm": 0.0016966286348178983,
"learning_rate": 1.74805321219987e-05,
"loss": 0.1155,
"step": 44400
},
{
"epoch": 14.02,
"grad_norm": 0.005142655223608017,
"learning_rate": 1.727774172615185e-05,
"loss": 0.1469,
"step": 44600
},
{
"epoch": 14.03,
"grad_norm": 1.043182134628296,
"learning_rate": 1.7074951330304998e-05,
"loss": 0.0513,
"step": 44800
},
{
"epoch": 14.03,
"grad_norm": 0.003510431619361043,
"learning_rate": 1.6872160934458147e-05,
"loss": 0.2092,
"step": 45000
},
{
"epoch": 14.03,
"grad_norm": 0.004158710595220327,
"learning_rate": 1.666937053861129e-05,
"loss": 0.166,
"step": 45200
},
{
"epoch": 14.04,
"grad_norm": 0.006816697306931019,
"learning_rate": 1.646658014276444e-05,
"loss": 0.1756,
"step": 45400
},
{
"epoch": 14.04,
"grad_norm": 0.0019508616533130407,
"learning_rate": 1.6263789746917588e-05,
"loss": 0.0895,
"step": 45600
},
{
"epoch": 14.04,
"grad_norm": 0.011971150524914265,
"learning_rate": 1.6060999351070736e-05,
"loss": 0.2169,
"step": 45800
},
{
"epoch": 14.05,
"grad_norm": 0.0015203008661046624,
"learning_rate": 1.585820895522388e-05,
"loss": 0.1652,
"step": 46000
},
{
"epoch": 14.05,
"grad_norm": 0.011360148899257183,
"learning_rate": 1.5655418559377026e-05,
"loss": 0.2088,
"step": 46200
},
{
"epoch": 14.05,
"eval_accuracy": 0.32723577235772355,
"eval_loss": 7.71033239364624,
"eval_runtime": 424.2708,
"eval_samples_per_second": 5.798,
"eval_steps_per_second": 2.899,
"step": 46245
},
{
"epoch": 15.0,
"grad_norm": 0.0022605557460337877,
"learning_rate": 1.5452628163530175e-05,
"loss": 0.0726,
"step": 46400
},
{
"epoch": 15.01,
"grad_norm": 0.024754885584115982,
"learning_rate": 1.5249837767683323e-05,
"loss": 0.1156,
"step": 46600
},
{
"epoch": 15.01,
"grad_norm": 0.10212986171245575,
"learning_rate": 1.504704737183647e-05,
"loss": 0.0741,
"step": 46800
},
{
"epoch": 15.01,
"grad_norm": 0.18410304188728333,
"learning_rate": 1.4844256975989618e-05,
"loss": 0.1824,
"step": 47000
},
{
"epoch": 15.02,
"grad_norm": 0.00476705189794302,
"learning_rate": 1.4641466580142765e-05,
"loss": 0.1592,
"step": 47200
},
{
"epoch": 15.02,
"grad_norm": 0.361130028963089,
"learning_rate": 1.4438676184295913e-05,
"loss": 0.053,
"step": 47400
},
{
"epoch": 15.02,
"grad_norm": 0.0020235551055520773,
"learning_rate": 1.423588578844906e-05,
"loss": 0.1294,
"step": 47600
},
{
"epoch": 15.03,
"grad_norm": 0.0014645768096670508,
"learning_rate": 1.4033095392602208e-05,
"loss": 0.119,
"step": 47800
},
{
"epoch": 15.03,
"grad_norm": 0.3808715045452118,
"learning_rate": 1.3830304996755356e-05,
"loss": 0.1903,
"step": 48000
},
{
"epoch": 15.03,
"grad_norm": 0.0016527449479326606,
"learning_rate": 1.36275146009085e-05,
"loss": 0.1918,
"step": 48200
},
{
"epoch": 15.03,
"grad_norm": 50.08372497558594,
"learning_rate": 1.3424724205061648e-05,
"loss": 0.157,
"step": 48400
},
{
"epoch": 15.04,
"grad_norm": 259.1443786621094,
"learning_rate": 1.3221933809214796e-05,
"loss": 0.1894,
"step": 48600
},
{
"epoch": 15.04,
"grad_norm": 0.002545048715546727,
"learning_rate": 1.3019143413367943e-05,
"loss": 0.1257,
"step": 48800
},
{
"epoch": 15.04,
"grad_norm": 0.2304425984621048,
"learning_rate": 1.2816353017521091e-05,
"loss": 0.1608,
"step": 49000
},
{
"epoch": 15.05,
"grad_norm": 28.786087036132812,
"learning_rate": 1.2613562621674238e-05,
"loss": 0.1662,
"step": 49200
},
{
"epoch": 15.05,
"eval_accuracy": 0.32479674796747965,
"eval_loss": 7.761257648468018,
"eval_runtime": 419.2032,
"eval_samples_per_second": 5.868,
"eval_steps_per_second": 2.934,
"step": 49328
},
{
"epoch": 16.0,
"grad_norm": 0.0010336956474930048,
"learning_rate": 1.2410772225827386e-05,
"loss": 0.1146,
"step": 49400
},
{
"epoch": 16.0,
"grad_norm": 0.005545318126678467,
"learning_rate": 1.2207981829980533e-05,
"loss": 0.1001,
"step": 49600
},
{
"epoch": 16.01,
"grad_norm": 0.0031858233269304037,
"learning_rate": 1.200519143413368e-05,
"loss": 0.1257,
"step": 49800
},
{
"epoch": 16.01,
"grad_norm": 0.004460224881768227,
"learning_rate": 1.1802401038286826e-05,
"loss": 0.2025,
"step": 50000
},
{
"epoch": 16.01,
"grad_norm": 93.79480743408203,
"learning_rate": 1.1599610642439974e-05,
"loss": 0.0833,
"step": 50200
},
{
"epoch": 16.02,
"grad_norm": 0.003030031453818083,
"learning_rate": 1.1396820246593123e-05,
"loss": 0.1456,
"step": 50400
},
{
"epoch": 16.02,
"grad_norm": 0.0017526369774714112,
"learning_rate": 1.119402985074627e-05,
"loss": 0.1136,
"step": 50600
},
{
"epoch": 16.02,
"grad_norm": 0.002222576644271612,
"learning_rate": 1.0991239454899418e-05,
"loss": 0.1357,
"step": 50800
},
{
"epoch": 16.03,
"grad_norm": 0.005358373746275902,
"learning_rate": 1.0788449059052563e-05,
"loss": 0.144,
"step": 51000
},
{
"epoch": 16.03,
"grad_norm": 0.037487562745809555,
"learning_rate": 1.0585658663205711e-05,
"loss": 0.2054,
"step": 51200
},
{
"epoch": 16.03,
"grad_norm": 0.052653077989816666,
"learning_rate": 1.0382868267358858e-05,
"loss": 0.2247,
"step": 51400
},
{
"epoch": 16.04,
"grad_norm": 0.001781440805643797,
"learning_rate": 1.0180077871512006e-05,
"loss": 0.1049,
"step": 51600
},
{
"epoch": 16.04,
"grad_norm": 0.001019317307509482,
"learning_rate": 9.977287475665154e-06,
"loss": 0.0469,
"step": 51800
},
{
"epoch": 16.04,
"grad_norm": 0.6995309591293335,
"learning_rate": 9.7744970798183e-06,
"loss": 0.1745,
"step": 52000
},
{
"epoch": 16.05,
"grad_norm": 0.19804002344608307,
"learning_rate": 9.571706683971448e-06,
"loss": 0.1293,
"step": 52200
},
{
"epoch": 16.05,
"grad_norm": 0.005632157437503338,
"learning_rate": 9.368916288124594e-06,
"loss": 0.1961,
"step": 52400
},
{
"epoch": 16.05,
"eval_accuracy": 0.32967479674796746,
"eval_loss": 7.7729597091674805,
"eval_runtime": 420.2271,
"eval_samples_per_second": 5.854,
"eval_steps_per_second": 2.927,
"step": 52411
},
{
"epoch": 17.0,
"grad_norm": 0.04664480686187744,
"learning_rate": 9.166125892277743e-06,
"loss": 0.1644,
"step": 52600
},
{
"epoch": 17.01,
"grad_norm": 0.0033755158074200153,
"learning_rate": 8.96333549643089e-06,
"loss": 0.112,
"step": 52800
},
{
"epoch": 17.01,
"grad_norm": 0.0010055933380499482,
"learning_rate": 8.760545100584036e-06,
"loss": 0.0775,
"step": 53000
},
{
"epoch": 17.01,
"grad_norm": 0.002419169992208481,
"learning_rate": 8.557754704737184e-06,
"loss": 0.0887,
"step": 53200
},
{
"epoch": 17.02,
"grad_norm": 114.45608520507812,
"learning_rate": 8.354964308890331e-06,
"loss": 0.1805,
"step": 53400
},
{
"epoch": 17.02,
"grad_norm": 0.009727099910378456,
"learning_rate": 8.15217391304348e-06,
"loss": 0.1292,
"step": 53600
},
{
"epoch": 17.02,
"grad_norm": 0.01231451891362667,
"learning_rate": 7.949383517196626e-06,
"loss": 0.0268,
"step": 53800
},
{
"epoch": 17.03,
"grad_norm": 0.20992113649845123,
"learning_rate": 7.746593121349774e-06,
"loss": 0.0512,
"step": 54000
},
{
"epoch": 17.03,
"grad_norm": 0.001962395152077079,
"learning_rate": 7.54380272550292e-06,
"loss": 0.0564,
"step": 54200
},
{
"epoch": 17.03,
"grad_norm": 0.007957936264574528,
"learning_rate": 7.3410123296560675e-06,
"loss": 0.062,
"step": 54400
},
{
"epoch": 17.04,
"grad_norm": 19.138671875,
"learning_rate": 7.138221933809215e-06,
"loss": 0.1305,
"step": 54600
},
{
"epoch": 17.04,
"grad_norm": 0.00127317919395864,
"learning_rate": 6.9354315379623625e-06,
"loss": 0.1225,
"step": 54800
},
{
"epoch": 17.04,
"grad_norm": 0.001261857571080327,
"learning_rate": 6.73264114211551e-06,
"loss": 0.1043,
"step": 55000
},
{
"epoch": 17.05,
"grad_norm": 0.0031231152825057507,
"learning_rate": 6.529850746268657e-06,
"loss": 0.0846,
"step": 55200
},
{
"epoch": 17.05,
"grad_norm": 0.0025645680725574493,
"learning_rate": 6.327060350421804e-06,
"loss": 0.1436,
"step": 55400
},
{
"epoch": 17.05,
"eval_accuracy": 0.33170731707317075,
"eval_loss": 7.929662227630615,
"eval_runtime": 418.991,
"eval_samples_per_second": 5.871,
"eval_steps_per_second": 2.936,
"step": 55494
},
{
"epoch": 18.0,
"grad_norm": 0.0005848600412718952,
"learning_rate": 6.124269954574952e-06,
"loss": 0.0734,
"step": 55600
},
{
"epoch": 18.0,
"grad_norm": 0.0016508701955899596,
"learning_rate": 5.921479558728099e-06,
"loss": 0.0234,
"step": 55800
},
{
"epoch": 18.01,
"grad_norm": 0.0011639483273029327,
"learning_rate": 5.718689162881246e-06,
"loss": 0.0497,
"step": 56000
},
{
"epoch": 18.01,
"grad_norm": 64.46589660644531,
"learning_rate": 5.515898767034393e-06,
"loss": 0.1146,
"step": 56200
},
{
"epoch": 18.01,
"grad_norm": 0.003238030942156911,
"learning_rate": 5.313108371187541e-06,
"loss": 0.0722,
"step": 56400
},
{
"epoch": 18.02,
"grad_norm": 0.34975093603134155,
"learning_rate": 5.110317975340688e-06,
"loss": 0.0685,
"step": 56600
},
{
"epoch": 18.02,
"grad_norm": 0.0005624560872092843,
"learning_rate": 4.907527579493836e-06,
"loss": 0.0781,
"step": 56800
},
{
"epoch": 18.02,
"grad_norm": 0.20106494426727295,
"learning_rate": 4.704737183646982e-06,
"loss": 0.084,
"step": 57000
},
{
"epoch": 18.03,
"grad_norm": 0.0013241646811366081,
"learning_rate": 4.50194678780013e-06,
"loss": 0.0977,
"step": 57200
},
{
"epoch": 18.03,
"grad_norm": 0.019781263545155525,
"learning_rate": 4.299156391953277e-06,
"loss": 0.1543,
"step": 57400
},
{
"epoch": 18.03,
"grad_norm": 0.001491773989982903,
"learning_rate": 4.096365996106424e-06,
"loss": 0.0821,
"step": 57600
},
{
"epoch": 18.04,
"grad_norm": 0.00963684543967247,
"learning_rate": 3.893575600259572e-06,
"loss": 0.1385,
"step": 57800
},
{
"epoch": 18.04,
"grad_norm": 0.0037320530973374844,
"learning_rate": 3.6907852044127193e-06,
"loss": 0.0359,
"step": 58000
},
{
"epoch": 18.04,
"grad_norm": 0.0036789393052458763,
"learning_rate": 3.4879948085658664e-06,
"loss": 0.0686,
"step": 58200
},
{
"epoch": 18.05,
"grad_norm": 0.0006985150394029915,
"learning_rate": 3.285204412719014e-06,
"loss": 0.1134,
"step": 58400
},
{
"epoch": 18.05,
"eval_accuracy": 0.32682926829268294,
"eval_loss": 8.044651985168457,
"eval_runtime": 441.4014,
"eval_samples_per_second": 5.573,
"eval_steps_per_second": 2.787,
"step": 58577
},
{
"epoch": 19.0,
"grad_norm": 0.19236122071743011,
"learning_rate": 3.082414016872161e-06,
"loss": 0.065,
"step": 58600
},
{
"epoch": 19.0,
"grad_norm": 0.002881132299080491,
"learning_rate": 2.8796236210253085e-06,
"loss": 0.0571,
"step": 58800
},
{
"epoch": 19.01,
"grad_norm": 3.4622700214385986,
"learning_rate": 2.6768332251784555e-06,
"loss": 0.0339,
"step": 59000
},
{
"epoch": 19.01,
"grad_norm": 0.0006050022784620523,
"learning_rate": 2.474042829331603e-06,
"loss": 0.1419,
"step": 59200
},
{
"epoch": 19.01,
"grad_norm": 43.79890441894531,
"learning_rate": 2.27125243348475e-06,
"loss": 0.1329,
"step": 59400
},
{
"epoch": 19.02,
"grad_norm": 0.0007294774986803532,
"learning_rate": 2.0684620376378976e-06,
"loss": 0.0427,
"step": 59600
},
{
"epoch": 19.02,
"grad_norm": 5.0048394203186035,
"learning_rate": 1.8656716417910446e-06,
"loss": 0.103,
"step": 59800
},
{
"epoch": 19.02,
"grad_norm": 0.002903040498495102,
"learning_rate": 1.6628812459441923e-06,
"loss": 0.1119,
"step": 60000
},
{
"epoch": 19.03,
"grad_norm": 7.973681926727295,
"learning_rate": 1.4600908500973394e-06,
"loss": 0.1148,
"step": 60200
},
{
"epoch": 19.03,
"grad_norm": 0.0013534132158383727,
"learning_rate": 1.2573004542504867e-06,
"loss": 0.1022,
"step": 60400
},
{
"epoch": 19.03,
"grad_norm": 0.0024612874258309603,
"learning_rate": 1.0545100584036342e-06,
"loss": 0.1123,
"step": 60600
},
{
"epoch": 19.04,
"grad_norm": 0.0006685277330689132,
"learning_rate": 8.517196625567812e-07,
"loss": 0.0382,
"step": 60800
},
{
"epoch": 19.04,
"grad_norm": 111.42835235595703,
"learning_rate": 6.489292667099286e-07,
"loss": 0.0578,
"step": 61000
},
{
"epoch": 19.04,
"grad_norm": 0.0009195853490382433,
"learning_rate": 4.4613887086307594e-07,
"loss": 0.0779,
"step": 61200
},
{
"epoch": 19.05,
"grad_norm": 0.0009977706940844655,
"learning_rate": 2.4334847501622327e-07,
"loss": 0.0684,
"step": 61400
},
{
"epoch": 19.05,
"grad_norm": 0.025145627558231354,
"learning_rate": 4.055807916937054e-08,
"loss": 0.0634,
"step": 61600
},
{
"epoch": 19.05,
"eval_accuracy": 0.3304878048780488,
"eval_loss": 7.971652507781982,
"eval_runtime": 406.4924,
"eval_samples_per_second": 6.052,
"eval_steps_per_second": 3.026,
"step": 61640
},
{
"epoch": 19.05,
"step": 61640,
"total_flos": 1.5360349509135758e+20,
"train_loss": 0.34115883751707676,
"train_runtime": 65653.4691,
"train_samples_per_second": 1.878,
"train_steps_per_second": 0.939
},
{
"epoch": 19.05,
"eval_accuracy": 0.33170731707317075,
"eval_loss": 7.929662227630615,
"eval_runtime": 414.4754,
"eval_samples_per_second": 5.935,
"eval_steps_per_second": 2.968,
"step": 61640
},
{
"epoch": 19.05,
"eval_accuracy": 0.33170731707317075,
"eval_loss": 7.929662227630615,
"eval_runtime": 300.9414,
"eval_samples_per_second": 8.174,
"eval_steps_per_second": 4.087,
"step": 61640
}
],
"logging_steps": 200,
"max_steps": 61640,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 500,
"total_flos": 1.5360349509135758e+20,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}