supachaisupanya's picture
Upload 8 files
9b9e13e verified
{
"best_metric": 0.74,
"best_model_checkpoint": "swinv2-base-patch4-window16-256-finetuned-eurosat/checkpoint-1446",
"epoch": 29.657794676806084,
"eval_steps": 500,
"global_step": 1950,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.15,
"grad_norm": 10.112361907958984,
"learning_rate": 2.564102564102564e-06,
"loss": 6.2407,
"step": 10
},
{
"epoch": 0.3,
"grad_norm": 8.693501472473145,
"learning_rate": 5.128205128205128e-06,
"loss": 6.2346,
"step": 20
},
{
"epoch": 0.46,
"grad_norm": 31.615943908691406,
"learning_rate": 7.692307692307694e-06,
"loss": 6.2382,
"step": 30
},
{
"epoch": 0.61,
"grad_norm": 4.278875827789307,
"learning_rate": 1.0256410256410256e-05,
"loss": 6.2402,
"step": 40
},
{
"epoch": 0.76,
"grad_norm": 5.3635783195495605,
"learning_rate": 1.282051282051282e-05,
"loss": 6.2228,
"step": 50
},
{
"epoch": 0.91,
"grad_norm": 3.80220890045166,
"learning_rate": 1.5384615384615387e-05,
"loss": 6.2125,
"step": 60
},
{
"epoch": 0.99,
"eval_accuracy": 0.0,
"eval_loss": 6.20700216293335,
"eval_runtime": 9.9729,
"eval_samples_per_second": 35.095,
"eval_steps_per_second": 3.008,
"step": 65
},
{
"epoch": 1.06,
"grad_norm": 4.892258167266846,
"learning_rate": 1.794871794871795e-05,
"loss": 6.197,
"step": 70
},
{
"epoch": 1.22,
"grad_norm": 5.695966720581055,
"learning_rate": 2.0512820512820512e-05,
"loss": 6.161,
"step": 80
},
{
"epoch": 1.37,
"grad_norm": 5.43502950668335,
"learning_rate": 2.307692307692308e-05,
"loss": 6.1137,
"step": 90
},
{
"epoch": 1.52,
"grad_norm": 10.649894714355469,
"learning_rate": 2.564102564102564e-05,
"loss": 6.1187,
"step": 100
},
{
"epoch": 1.67,
"grad_norm": 6.482412815093994,
"learning_rate": 2.8205128205128207e-05,
"loss": 6.0495,
"step": 110
},
{
"epoch": 1.83,
"grad_norm": 20.60057830810547,
"learning_rate": 3.0769230769230774e-05,
"loss": 6.0249,
"step": 120
},
{
"epoch": 1.98,
"grad_norm": 10.727930068969727,
"learning_rate": 3.3333333333333335e-05,
"loss": 5.9584,
"step": 130
},
{
"epoch": 1.99,
"eval_accuracy": 0.054285714285714284,
"eval_loss": 5.894497871398926,
"eval_runtime": 8.019,
"eval_samples_per_second": 43.646,
"eval_steps_per_second": 3.741,
"step": 131
},
{
"epoch": 2.13,
"grad_norm": 9.321084976196289,
"learning_rate": 3.58974358974359e-05,
"loss": 5.6353,
"step": 140
},
{
"epoch": 2.28,
"grad_norm": 15.699618339538574,
"learning_rate": 3.846153846153846e-05,
"loss": 5.4549,
"step": 150
},
{
"epoch": 2.43,
"grad_norm": 13.360058784484863,
"learning_rate": 4.1025641025641023e-05,
"loss": 5.1768,
"step": 160
},
{
"epoch": 2.59,
"grad_norm": 14.356024742126465,
"learning_rate": 4.358974358974359e-05,
"loss": 5.08,
"step": 170
},
{
"epoch": 2.74,
"grad_norm": 12.42605209350586,
"learning_rate": 4.615384615384616e-05,
"loss": 4.8681,
"step": 180
},
{
"epoch": 2.89,
"grad_norm": 13.120949745178223,
"learning_rate": 4.871794871794872e-05,
"loss": 4.7047,
"step": 190
},
{
"epoch": 3.0,
"eval_accuracy": 0.3028571428571429,
"eval_loss": 4.368250370025635,
"eval_runtime": 8.0905,
"eval_samples_per_second": 43.26,
"eval_steps_per_second": 3.708,
"step": 197
},
{
"epoch": 3.04,
"grad_norm": 14.149765014648438,
"learning_rate": 4.985754985754986e-05,
"loss": 4.2241,
"step": 200
},
{
"epoch": 3.19,
"grad_norm": 12.356833457946777,
"learning_rate": 4.9572649572649575e-05,
"loss": 3.6145,
"step": 210
},
{
"epoch": 3.35,
"grad_norm": 13.141170501708984,
"learning_rate": 4.928774928774929e-05,
"loss": 3.3265,
"step": 220
},
{
"epoch": 3.5,
"grad_norm": 11.219949722290039,
"learning_rate": 4.9002849002849004e-05,
"loss": 3.1353,
"step": 230
},
{
"epoch": 3.65,
"grad_norm": 12.410125732421875,
"learning_rate": 4.871794871794872e-05,
"loss": 2.9184,
"step": 240
},
{
"epoch": 3.8,
"grad_norm": 12.346807479858398,
"learning_rate": 4.8433048433048433e-05,
"loss": 2.6721,
"step": 250
},
{
"epoch": 3.95,
"grad_norm": 11.234230995178223,
"learning_rate": 4.814814814814815e-05,
"loss": 2.7217,
"step": 260
},
{
"epoch": 4.0,
"eval_accuracy": 0.5457142857142857,
"eval_loss": 2.71696400642395,
"eval_runtime": 8.017,
"eval_samples_per_second": 43.657,
"eval_steps_per_second": 3.742,
"step": 263
},
{
"epoch": 4.11,
"grad_norm": 9.922853469848633,
"learning_rate": 4.786324786324787e-05,
"loss": 1.9846,
"step": 270
},
{
"epoch": 4.26,
"grad_norm": 10.338376998901367,
"learning_rate": 4.7578347578347584e-05,
"loss": 1.7283,
"step": 280
},
{
"epoch": 4.41,
"grad_norm": 10.503661155700684,
"learning_rate": 4.72934472934473e-05,
"loss": 1.7337,
"step": 290
},
{
"epoch": 4.56,
"grad_norm": 9.319334983825684,
"learning_rate": 4.700854700854701e-05,
"loss": 1.6741,
"step": 300
},
{
"epoch": 4.71,
"grad_norm": 9.088035583496094,
"learning_rate": 4.672364672364672e-05,
"loss": 1.5406,
"step": 310
},
{
"epoch": 4.87,
"grad_norm": 8.811790466308594,
"learning_rate": 4.643874643874644e-05,
"loss": 1.6097,
"step": 320
},
{
"epoch": 4.99,
"eval_accuracy": 0.6314285714285715,
"eval_loss": 2.015495538711548,
"eval_runtime": 7.9582,
"eval_samples_per_second": 43.98,
"eval_steps_per_second": 3.77,
"step": 328
},
{
"epoch": 5.02,
"grad_norm": 9.573624610900879,
"learning_rate": 4.615384615384616e-05,
"loss": 1.4472,
"step": 330
},
{
"epoch": 5.17,
"grad_norm": 9.028331756591797,
"learning_rate": 4.586894586894587e-05,
"loss": 0.9959,
"step": 340
},
{
"epoch": 5.32,
"grad_norm": 8.725614547729492,
"learning_rate": 4.558404558404559e-05,
"loss": 0.945,
"step": 350
},
{
"epoch": 5.48,
"grad_norm": 10.008910179138184,
"learning_rate": 4.52991452991453e-05,
"loss": 0.9564,
"step": 360
},
{
"epoch": 5.63,
"grad_norm": 9.664880752563477,
"learning_rate": 4.501424501424502e-05,
"loss": 0.9423,
"step": 370
},
{
"epoch": 5.78,
"grad_norm": 7.615637302398682,
"learning_rate": 4.472934472934473e-05,
"loss": 0.9333,
"step": 380
},
{
"epoch": 5.93,
"grad_norm": 9.067399978637695,
"learning_rate": 4.4444444444444447e-05,
"loss": 0.8932,
"step": 390
},
{
"epoch": 5.99,
"eval_accuracy": 0.6742857142857143,
"eval_loss": 1.70182204246521,
"eval_runtime": 8.0389,
"eval_samples_per_second": 43.538,
"eval_steps_per_second": 3.732,
"step": 394
},
{
"epoch": 6.08,
"grad_norm": 5.935275554656982,
"learning_rate": 4.415954415954416e-05,
"loss": 0.7379,
"step": 400
},
{
"epoch": 6.24,
"grad_norm": 7.257266521453857,
"learning_rate": 4.3874643874643876e-05,
"loss": 0.5602,
"step": 410
},
{
"epoch": 6.39,
"grad_norm": 9.825379371643066,
"learning_rate": 4.358974358974359e-05,
"loss": 0.5144,
"step": 420
},
{
"epoch": 6.54,
"grad_norm": 6.920632362365723,
"learning_rate": 4.3304843304843306e-05,
"loss": 0.5718,
"step": 430
},
{
"epoch": 6.69,
"grad_norm": 7.798554420471191,
"learning_rate": 4.301994301994302e-05,
"loss": 0.515,
"step": 440
},
{
"epoch": 6.84,
"grad_norm": 6.575021266937256,
"learning_rate": 4.2735042735042735e-05,
"loss": 0.5472,
"step": 450
},
{
"epoch": 7.0,
"grad_norm": 5.641183853149414,
"learning_rate": 4.2450142450142457e-05,
"loss": 0.5734,
"step": 460
},
{
"epoch": 7.0,
"eval_accuracy": 0.7057142857142857,
"eval_loss": 1.5249170064926147,
"eval_runtime": 8.0507,
"eval_samples_per_second": 43.474,
"eval_steps_per_second": 3.726,
"step": 460
},
{
"epoch": 7.15,
"grad_norm": 3.5870327949523926,
"learning_rate": 4.216524216524217e-05,
"loss": 0.3342,
"step": 470
},
{
"epoch": 7.3,
"grad_norm": 4.1048479080200195,
"learning_rate": 4.1880341880341886e-05,
"loss": 0.3382,
"step": 480
},
{
"epoch": 7.45,
"grad_norm": 6.017439842224121,
"learning_rate": 4.15954415954416e-05,
"loss": 0.3804,
"step": 490
},
{
"epoch": 7.6,
"grad_norm": 5.106074333190918,
"learning_rate": 4.131054131054131e-05,
"loss": 0.3606,
"step": 500
},
{
"epoch": 7.76,
"grad_norm": 5.5891900062561035,
"learning_rate": 4.1025641025641023e-05,
"loss": 0.3295,
"step": 510
},
{
"epoch": 7.91,
"grad_norm": 4.079031944274902,
"learning_rate": 4.074074074074074e-05,
"loss": 0.324,
"step": 520
},
{
"epoch": 8.0,
"eval_accuracy": 0.7085714285714285,
"eval_loss": 1.4846410751342773,
"eval_runtime": 7.9798,
"eval_samples_per_second": 43.861,
"eval_steps_per_second": 3.76,
"step": 526
},
{
"epoch": 8.06,
"grad_norm": 3.212510824203491,
"learning_rate": 4.045584045584046e-05,
"loss": 0.2964,
"step": 530
},
{
"epoch": 8.21,
"grad_norm": 5.004084587097168,
"learning_rate": 4.0170940170940174e-05,
"loss": 0.2145,
"step": 540
},
{
"epoch": 8.37,
"grad_norm": 4.74351167678833,
"learning_rate": 3.988603988603989e-05,
"loss": 0.2206,
"step": 550
},
{
"epoch": 8.52,
"grad_norm": 5.272638320922852,
"learning_rate": 3.9601139601139604e-05,
"loss": 0.2131,
"step": 560
},
{
"epoch": 8.67,
"grad_norm": 3.062843084335327,
"learning_rate": 3.931623931623932e-05,
"loss": 0.2447,
"step": 570
},
{
"epoch": 8.82,
"grad_norm": 3.7355995178222656,
"learning_rate": 3.903133903133903e-05,
"loss": 0.213,
"step": 580
},
{
"epoch": 8.97,
"grad_norm": 3.62921404838562,
"learning_rate": 3.874643874643875e-05,
"loss": 0.2195,
"step": 590
},
{
"epoch": 8.99,
"eval_accuracy": 0.7114285714285714,
"eval_loss": 1.4269201755523682,
"eval_runtime": 8.004,
"eval_samples_per_second": 43.728,
"eval_steps_per_second": 3.748,
"step": 591
},
{
"epoch": 9.13,
"grad_norm": 2.647521734237671,
"learning_rate": 3.846153846153846e-05,
"loss": 0.1677,
"step": 600
},
{
"epoch": 9.28,
"grad_norm": 4.363504409790039,
"learning_rate": 3.817663817663818e-05,
"loss": 0.1513,
"step": 610
},
{
"epoch": 9.43,
"grad_norm": 2.5766873359680176,
"learning_rate": 3.789173789173789e-05,
"loss": 0.1684,
"step": 620
},
{
"epoch": 9.58,
"grad_norm": 3.8854830265045166,
"learning_rate": 3.760683760683761e-05,
"loss": 0.1552,
"step": 630
},
{
"epoch": 9.73,
"grad_norm": 6.697465896606445,
"learning_rate": 3.732193732193732e-05,
"loss": 0.188,
"step": 640
},
{
"epoch": 9.89,
"grad_norm": 3.860522985458374,
"learning_rate": 3.7037037037037037e-05,
"loss": 0.1679,
"step": 650
},
{
"epoch": 9.99,
"eval_accuracy": 0.7171428571428572,
"eval_loss": 1.4169081449508667,
"eval_runtime": 8.0283,
"eval_samples_per_second": 43.596,
"eval_steps_per_second": 3.737,
"step": 657
},
{
"epoch": 10.04,
"grad_norm": 4.154173374176025,
"learning_rate": 3.675213675213676e-05,
"loss": 0.1645,
"step": 660
},
{
"epoch": 10.19,
"grad_norm": 1.8003276586532593,
"learning_rate": 3.646723646723647e-05,
"loss": 0.105,
"step": 670
},
{
"epoch": 10.34,
"grad_norm": 4.1917619705200195,
"learning_rate": 3.618233618233619e-05,
"loss": 0.149,
"step": 680
},
{
"epoch": 10.49,
"grad_norm": 3.338636636734009,
"learning_rate": 3.58974358974359e-05,
"loss": 0.1287,
"step": 690
},
{
"epoch": 10.65,
"grad_norm": 1.6283141374588013,
"learning_rate": 3.561253561253561e-05,
"loss": 0.1458,
"step": 700
},
{
"epoch": 10.8,
"grad_norm": 2.769218921661377,
"learning_rate": 3.5327635327635325e-05,
"loss": 0.1394,
"step": 710
},
{
"epoch": 10.95,
"grad_norm": 3.2028868198394775,
"learning_rate": 3.504273504273504e-05,
"loss": 0.1277,
"step": 720
},
{
"epoch": 11.0,
"eval_accuracy": 0.7057142857142857,
"eval_loss": 1.404009222984314,
"eval_runtime": 8.031,
"eval_samples_per_second": 43.581,
"eval_steps_per_second": 3.736,
"step": 723
},
{
"epoch": 11.1,
"grad_norm": 1.2642875909805298,
"learning_rate": 3.475783475783476e-05,
"loss": 0.1187,
"step": 730
},
{
"epoch": 11.25,
"grad_norm": 1.5215080976486206,
"learning_rate": 3.4472934472934476e-05,
"loss": 0.0854,
"step": 740
},
{
"epoch": 11.41,
"grad_norm": 2.877058982849121,
"learning_rate": 3.418803418803419e-05,
"loss": 0.1105,
"step": 750
},
{
"epoch": 11.56,
"grad_norm": 5.0010552406311035,
"learning_rate": 3.3903133903133905e-05,
"loss": 0.0912,
"step": 760
},
{
"epoch": 11.71,
"grad_norm": 5.7503981590271,
"learning_rate": 3.361823361823362e-05,
"loss": 0.1264,
"step": 770
},
{
"epoch": 11.86,
"grad_norm": 3.2310426235198975,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.1238,
"step": 780
},
{
"epoch": 12.0,
"eval_accuracy": 0.7285714285714285,
"eval_loss": 1.4007512331008911,
"eval_runtime": 8.0356,
"eval_samples_per_second": 43.556,
"eval_steps_per_second": 3.733,
"step": 789
},
{
"epoch": 12.02,
"grad_norm": 1.719030737876892,
"learning_rate": 3.304843304843305e-05,
"loss": 0.0817,
"step": 790
},
{
"epoch": 12.17,
"grad_norm": 3.475520610809326,
"learning_rate": 3.2763532763532764e-05,
"loss": 0.0765,
"step": 800
},
{
"epoch": 12.32,
"grad_norm": 3.978292226791382,
"learning_rate": 3.247863247863248e-05,
"loss": 0.0874,
"step": 810
},
{
"epoch": 12.47,
"grad_norm": 1.6397371292114258,
"learning_rate": 3.2193732193732194e-05,
"loss": 0.1348,
"step": 820
},
{
"epoch": 12.62,
"grad_norm": 0.9705621600151062,
"learning_rate": 3.190883190883191e-05,
"loss": 0.057,
"step": 830
},
{
"epoch": 12.78,
"grad_norm": 3.8919146060943604,
"learning_rate": 3.162393162393162e-05,
"loss": 0.085,
"step": 840
},
{
"epoch": 12.93,
"grad_norm": 1.4797801971435547,
"learning_rate": 3.133903133903134e-05,
"loss": 0.088,
"step": 850
},
{
"epoch": 12.99,
"eval_accuracy": 0.7114285714285714,
"eval_loss": 1.3840457201004028,
"eval_runtime": 7.9781,
"eval_samples_per_second": 43.87,
"eval_steps_per_second": 3.76,
"step": 854
},
{
"epoch": 13.08,
"grad_norm": 2.244473695755005,
"learning_rate": 3.105413105413106e-05,
"loss": 0.0673,
"step": 860
},
{
"epoch": 13.23,
"grad_norm": 1.467897653579712,
"learning_rate": 3.0769230769230774e-05,
"loss": 0.0523,
"step": 870
},
{
"epoch": 13.38,
"grad_norm": 2.4079532623291016,
"learning_rate": 3.0484330484330486e-05,
"loss": 0.0752,
"step": 880
},
{
"epoch": 13.54,
"grad_norm": 3.189384698867798,
"learning_rate": 3.01994301994302e-05,
"loss": 0.0559,
"step": 890
},
{
"epoch": 13.69,
"grad_norm": 2.8496036529541016,
"learning_rate": 2.9914529914529915e-05,
"loss": 0.0688,
"step": 900
},
{
"epoch": 13.84,
"grad_norm": 0.6937215328216553,
"learning_rate": 2.962962962962963e-05,
"loss": 0.073,
"step": 910
},
{
"epoch": 13.99,
"grad_norm": 1.4593366384506226,
"learning_rate": 2.9344729344729345e-05,
"loss": 0.0834,
"step": 920
},
{
"epoch": 13.99,
"eval_accuracy": 0.72,
"eval_loss": 1.3873815536499023,
"eval_runtime": 8.1063,
"eval_samples_per_second": 43.176,
"eval_steps_per_second": 3.701,
"step": 920
},
{
"epoch": 14.14,
"grad_norm": 0.6792957186698914,
"learning_rate": 2.9059829059829063e-05,
"loss": 0.0434,
"step": 930
},
{
"epoch": 14.3,
"grad_norm": 1.9660212993621826,
"learning_rate": 2.8774928774928778e-05,
"loss": 0.0457,
"step": 940
},
{
"epoch": 14.45,
"grad_norm": 1.9186339378356934,
"learning_rate": 2.8490028490028492e-05,
"loss": 0.0485,
"step": 950
},
{
"epoch": 14.6,
"grad_norm": 1.0086941719055176,
"learning_rate": 2.8205128205128207e-05,
"loss": 0.0472,
"step": 960
},
{
"epoch": 14.75,
"grad_norm": 2.760943651199341,
"learning_rate": 2.7920227920227922e-05,
"loss": 0.0733,
"step": 970
},
{
"epoch": 14.9,
"grad_norm": 0.8688881993293762,
"learning_rate": 2.7635327635327633e-05,
"loss": 0.0813,
"step": 980
},
{
"epoch": 15.0,
"eval_accuracy": 0.7257142857142858,
"eval_loss": 1.3705737590789795,
"eval_runtime": 8.1151,
"eval_samples_per_second": 43.13,
"eval_steps_per_second": 3.697,
"step": 986
},
{
"epoch": 15.06,
"grad_norm": 0.6380533576011658,
"learning_rate": 2.7350427350427355e-05,
"loss": 0.0466,
"step": 990
},
{
"epoch": 15.21,
"grad_norm": 6.788400650024414,
"learning_rate": 2.706552706552707e-05,
"loss": 0.044,
"step": 1000
},
{
"epoch": 15.36,
"grad_norm": 2.104766607284546,
"learning_rate": 2.6780626780626784e-05,
"loss": 0.0723,
"step": 1010
},
{
"epoch": 15.51,
"grad_norm": 1.0589812994003296,
"learning_rate": 2.64957264957265e-05,
"loss": 0.0628,
"step": 1020
},
{
"epoch": 15.67,
"grad_norm": 1.543593168258667,
"learning_rate": 2.621082621082621e-05,
"loss": 0.0485,
"step": 1030
},
{
"epoch": 15.82,
"grad_norm": 2.2463526725769043,
"learning_rate": 2.5925925925925925e-05,
"loss": 0.0442,
"step": 1040
},
{
"epoch": 15.97,
"grad_norm": 5.468172550201416,
"learning_rate": 2.564102564102564e-05,
"loss": 0.0423,
"step": 1050
},
{
"epoch": 16.0,
"eval_accuracy": 0.7228571428571429,
"eval_loss": 1.3519986867904663,
"eval_runtime": 8.094,
"eval_samples_per_second": 43.242,
"eval_steps_per_second": 3.706,
"step": 1052
},
{
"epoch": 16.12,
"grad_norm": 2.093841791152954,
"learning_rate": 2.535612535612536e-05,
"loss": 0.0532,
"step": 1060
},
{
"epoch": 16.27,
"grad_norm": 0.7975372672080994,
"learning_rate": 2.5071225071225073e-05,
"loss": 0.0273,
"step": 1070
},
{
"epoch": 16.43,
"grad_norm": 6.552361965179443,
"learning_rate": 2.4786324786324787e-05,
"loss": 0.0643,
"step": 1080
},
{
"epoch": 16.58,
"grad_norm": 1.8863351345062256,
"learning_rate": 2.4501424501424502e-05,
"loss": 0.0345,
"step": 1090
},
{
"epoch": 16.73,
"grad_norm": 0.8653244376182556,
"learning_rate": 2.4216524216524217e-05,
"loss": 0.0502,
"step": 1100
},
{
"epoch": 16.88,
"grad_norm": 0.7265773415565491,
"learning_rate": 2.3931623931623935e-05,
"loss": 0.067,
"step": 1110
},
{
"epoch": 16.99,
"eval_accuracy": 0.7228571428571429,
"eval_loss": 1.3108690977096558,
"eval_runtime": 8.0282,
"eval_samples_per_second": 43.597,
"eval_steps_per_second": 3.737,
"step": 1117
},
{
"epoch": 17.03,
"grad_norm": 0.5706465244293213,
"learning_rate": 2.364672364672365e-05,
"loss": 0.0456,
"step": 1120
},
{
"epoch": 17.19,
"grad_norm": 0.4868156313896179,
"learning_rate": 2.336182336182336e-05,
"loss": 0.0239,
"step": 1130
},
{
"epoch": 17.34,
"grad_norm": 0.2969132661819458,
"learning_rate": 2.307692307692308e-05,
"loss": 0.0258,
"step": 1140
},
{
"epoch": 17.49,
"grad_norm": 0.7196402549743652,
"learning_rate": 2.2792022792022794e-05,
"loss": 0.0307,
"step": 1150
},
{
"epoch": 17.64,
"grad_norm": 0.6792505383491516,
"learning_rate": 2.250712250712251e-05,
"loss": 0.0357,
"step": 1160
},
{
"epoch": 17.79,
"grad_norm": 1.3564707040786743,
"learning_rate": 2.2222222222222223e-05,
"loss": 0.0447,
"step": 1170
},
{
"epoch": 17.95,
"grad_norm": 0.7506925463676453,
"learning_rate": 2.1937321937321938e-05,
"loss": 0.0438,
"step": 1180
},
{
"epoch": 17.99,
"eval_accuracy": 0.7171428571428572,
"eval_loss": 1.3395991325378418,
"eval_runtime": 7.9804,
"eval_samples_per_second": 43.857,
"eval_steps_per_second": 3.759,
"step": 1183
},
{
"epoch": 18.1,
"grad_norm": 0.2639639377593994,
"learning_rate": 2.1652421652421653e-05,
"loss": 0.0364,
"step": 1190
},
{
"epoch": 18.25,
"grad_norm": 0.6512497067451477,
"learning_rate": 2.1367521367521368e-05,
"loss": 0.035,
"step": 1200
},
{
"epoch": 18.4,
"grad_norm": 0.36454707384109497,
"learning_rate": 2.1082621082621086e-05,
"loss": 0.031,
"step": 1210
},
{
"epoch": 18.56,
"grad_norm": 1.9671510457992554,
"learning_rate": 2.07977207977208e-05,
"loss": 0.0365,
"step": 1220
},
{
"epoch": 18.71,
"grad_norm": 2.5179057121276855,
"learning_rate": 2.0512820512820512e-05,
"loss": 0.0343,
"step": 1230
},
{
"epoch": 18.86,
"grad_norm": 0.5848199725151062,
"learning_rate": 2.022792022792023e-05,
"loss": 0.0399,
"step": 1240
},
{
"epoch": 19.0,
"eval_accuracy": 0.7257142857142858,
"eval_loss": 1.3867747783660889,
"eval_runtime": 7.995,
"eval_samples_per_second": 43.778,
"eval_steps_per_second": 3.752,
"step": 1249
},
{
"epoch": 19.01,
"grad_norm": 1.6354899406433105,
"learning_rate": 1.9943019943019945e-05,
"loss": 0.0488,
"step": 1250
},
{
"epoch": 19.16,
"grad_norm": 4.593708038330078,
"learning_rate": 1.965811965811966e-05,
"loss": 0.0326,
"step": 1260
},
{
"epoch": 19.32,
"grad_norm": 0.5004624128341675,
"learning_rate": 1.9373219373219374e-05,
"loss": 0.0312,
"step": 1270
},
{
"epoch": 19.47,
"grad_norm": 3.982077121734619,
"learning_rate": 1.908831908831909e-05,
"loss": 0.0367,
"step": 1280
},
{
"epoch": 19.62,
"grad_norm": 1.31514573097229,
"learning_rate": 1.8803418803418804e-05,
"loss": 0.0288,
"step": 1290
},
{
"epoch": 19.77,
"grad_norm": 2.477193593978882,
"learning_rate": 1.8518518518518518e-05,
"loss": 0.0188,
"step": 1300
},
{
"epoch": 19.92,
"grad_norm": 1.13873291015625,
"learning_rate": 1.8233618233618236e-05,
"loss": 0.022,
"step": 1310
},
{
"epoch": 20.0,
"eval_accuracy": 0.7257142857142858,
"eval_loss": 1.3571245670318604,
"eval_runtime": 7.9825,
"eval_samples_per_second": 43.846,
"eval_steps_per_second": 3.758,
"step": 1315
},
{
"epoch": 20.08,
"grad_norm": 0.1975400298833847,
"learning_rate": 1.794871794871795e-05,
"loss": 0.016,
"step": 1320
},
{
"epoch": 20.23,
"grad_norm": 2.610684871673584,
"learning_rate": 1.7663817663817662e-05,
"loss": 0.0364,
"step": 1330
},
{
"epoch": 20.38,
"grad_norm": 2.5552616119384766,
"learning_rate": 1.737891737891738e-05,
"loss": 0.0209,
"step": 1340
},
{
"epoch": 20.53,
"grad_norm": 1.8163336515426636,
"learning_rate": 1.7094017094017095e-05,
"loss": 0.014,
"step": 1350
},
{
"epoch": 20.68,
"grad_norm": 2.3455891609191895,
"learning_rate": 1.680911680911681e-05,
"loss": 0.015,
"step": 1360
},
{
"epoch": 20.84,
"grad_norm": 1.0087167024612427,
"learning_rate": 1.6524216524216525e-05,
"loss": 0.021,
"step": 1370
},
{
"epoch": 20.99,
"grad_norm": 4.435824394226074,
"learning_rate": 1.623931623931624e-05,
"loss": 0.0326,
"step": 1380
},
{
"epoch": 20.99,
"eval_accuracy": 0.7342857142857143,
"eval_loss": 1.316083550453186,
"eval_runtime": 8.0694,
"eval_samples_per_second": 43.374,
"eval_steps_per_second": 3.718,
"step": 1380
},
{
"epoch": 21.14,
"grad_norm": 2.11207914352417,
"learning_rate": 1.5954415954415954e-05,
"loss": 0.0249,
"step": 1390
},
{
"epoch": 21.29,
"grad_norm": 0.3664344251155853,
"learning_rate": 1.566951566951567e-05,
"loss": 0.0168,
"step": 1400
},
{
"epoch": 21.44,
"grad_norm": 2.1651501655578613,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.0269,
"step": 1410
},
{
"epoch": 21.6,
"grad_norm": 6.236063480377197,
"learning_rate": 1.50997150997151e-05,
"loss": 0.0266,
"step": 1420
},
{
"epoch": 21.75,
"grad_norm": 0.4216400980949402,
"learning_rate": 1.4814814814814815e-05,
"loss": 0.0276,
"step": 1430
},
{
"epoch": 21.9,
"grad_norm": 0.34464436769485474,
"learning_rate": 1.4529914529914531e-05,
"loss": 0.0217,
"step": 1440
},
{
"epoch": 21.99,
"eval_accuracy": 0.74,
"eval_loss": 1.3431659936904907,
"eval_runtime": 8.2814,
"eval_samples_per_second": 42.263,
"eval_steps_per_second": 3.623,
"step": 1446
},
{
"epoch": 22.05,
"grad_norm": 0.550115168094635,
"learning_rate": 1.4245014245014246e-05,
"loss": 0.0116,
"step": 1450
},
{
"epoch": 22.21,
"grad_norm": 0.7523086071014404,
"learning_rate": 1.3960113960113961e-05,
"loss": 0.0167,
"step": 1460
},
{
"epoch": 22.36,
"grad_norm": 0.4303203821182251,
"learning_rate": 1.3675213675213677e-05,
"loss": 0.0152,
"step": 1470
},
{
"epoch": 22.51,
"grad_norm": 0.9599018096923828,
"learning_rate": 1.3390313390313392e-05,
"loss": 0.0129,
"step": 1480
},
{
"epoch": 22.66,
"grad_norm": 0.6038946509361267,
"learning_rate": 1.3105413105413105e-05,
"loss": 0.0153,
"step": 1490
},
{
"epoch": 22.81,
"grad_norm": 2.5680289268493652,
"learning_rate": 1.282051282051282e-05,
"loss": 0.0302,
"step": 1500
},
{
"epoch": 22.97,
"grad_norm": 0.7856467366218567,
"learning_rate": 1.2535612535612536e-05,
"loss": 0.0185,
"step": 1510
},
{
"epoch": 23.0,
"eval_accuracy": 0.7342857142857143,
"eval_loss": 1.3489614725112915,
"eval_runtime": 8.0906,
"eval_samples_per_second": 43.26,
"eval_steps_per_second": 3.708,
"step": 1512
},
{
"epoch": 23.12,
"grad_norm": 0.6607487201690674,
"learning_rate": 1.2250712250712251e-05,
"loss": 0.014,
"step": 1520
},
{
"epoch": 23.27,
"grad_norm": 0.14532317221164703,
"learning_rate": 1.1965811965811967e-05,
"loss": 0.02,
"step": 1530
},
{
"epoch": 23.42,
"grad_norm": 0.3423649072647095,
"learning_rate": 1.168091168091168e-05,
"loss": 0.0156,
"step": 1540
},
{
"epoch": 23.57,
"grad_norm": 0.15258215367794037,
"learning_rate": 1.1396011396011397e-05,
"loss": 0.0087,
"step": 1550
},
{
"epoch": 23.73,
"grad_norm": 0.20266969501972198,
"learning_rate": 1.1111111111111112e-05,
"loss": 0.0257,
"step": 1560
},
{
"epoch": 23.88,
"grad_norm": 0.46567222476005554,
"learning_rate": 1.0826210826210826e-05,
"loss": 0.0247,
"step": 1570
},
{
"epoch": 24.0,
"eval_accuracy": 0.7285714285714285,
"eval_loss": 1.3712286949157715,
"eval_runtime": 8.0686,
"eval_samples_per_second": 43.378,
"eval_steps_per_second": 3.718,
"step": 1578
},
{
"epoch": 24.03,
"grad_norm": 0.43167567253112793,
"learning_rate": 1.0541310541310543e-05,
"loss": 0.0151,
"step": 1580
},
{
"epoch": 24.18,
"grad_norm": 0.3076987862586975,
"learning_rate": 1.0256410256410256e-05,
"loss": 0.0145,
"step": 1590
},
{
"epoch": 24.33,
"grad_norm": 0.28051629662513733,
"learning_rate": 9.971509971509972e-06,
"loss": 0.0068,
"step": 1600
},
{
"epoch": 24.49,
"grad_norm": 0.17808012664318085,
"learning_rate": 9.686609686609687e-06,
"loss": 0.015,
"step": 1610
},
{
"epoch": 24.64,
"grad_norm": 0.46903499960899353,
"learning_rate": 9.401709401709402e-06,
"loss": 0.0111,
"step": 1620
},
{
"epoch": 24.79,
"grad_norm": 3.1560771465301514,
"learning_rate": 9.116809116809118e-06,
"loss": 0.0198,
"step": 1630
},
{
"epoch": 24.94,
"grad_norm": 1.1795072555541992,
"learning_rate": 8.831908831908831e-06,
"loss": 0.0147,
"step": 1640
},
{
"epoch": 24.99,
"eval_accuracy": 0.7285714285714285,
"eval_loss": 1.3384881019592285,
"eval_runtime": 8.045,
"eval_samples_per_second": 43.505,
"eval_steps_per_second": 3.729,
"step": 1643
},
{
"epoch": 25.1,
"grad_norm": 2.324568748474121,
"learning_rate": 8.547008547008548e-06,
"loss": 0.0147,
"step": 1650
},
{
"epoch": 25.25,
"grad_norm": 0.6252849102020264,
"learning_rate": 8.262108262108262e-06,
"loss": 0.0141,
"step": 1660
},
{
"epoch": 25.4,
"grad_norm": 2.523175001144409,
"learning_rate": 7.977207977207977e-06,
"loss": 0.0288,
"step": 1670
},
{
"epoch": 25.55,
"grad_norm": 0.6321514844894409,
"learning_rate": 7.692307692307694e-06,
"loss": 0.0151,
"step": 1680
},
{
"epoch": 25.7,
"grad_norm": 0.1425185650587082,
"learning_rate": 7.4074074074074075e-06,
"loss": 0.0093,
"step": 1690
},
{
"epoch": 25.86,
"grad_norm": 0.6362813115119934,
"learning_rate": 7.122507122507123e-06,
"loss": 0.0164,
"step": 1700
},
{
"epoch": 25.99,
"eval_accuracy": 0.7228571428571429,
"eval_loss": 1.352995753288269,
"eval_runtime": 7.9452,
"eval_samples_per_second": 44.052,
"eval_steps_per_second": 3.776,
"step": 1709
},
{
"epoch": 26.01,
"grad_norm": 0.11444679647684097,
"learning_rate": 6.837606837606839e-06,
"loss": 0.0198,
"step": 1710
},
{
"epoch": 26.16,
"grad_norm": 0.34033504128456116,
"learning_rate": 6.5527065527065525e-06,
"loss": 0.013,
"step": 1720
},
{
"epoch": 26.31,
"grad_norm": 1.7793394327163696,
"learning_rate": 6.267806267806268e-06,
"loss": 0.0122,
"step": 1730
},
{
"epoch": 26.46,
"grad_norm": 0.11746495217084885,
"learning_rate": 5.982905982905984e-06,
"loss": 0.0153,
"step": 1740
},
{
"epoch": 26.62,
"grad_norm": 4.355152606964111,
"learning_rate": 5.6980056980056985e-06,
"loss": 0.0153,
"step": 1750
},
{
"epoch": 26.77,
"grad_norm": 0.5570241808891296,
"learning_rate": 5.413105413105413e-06,
"loss": 0.013,
"step": 1760
},
{
"epoch": 26.92,
"grad_norm": 0.22895778715610504,
"learning_rate": 5.128205128205128e-06,
"loss": 0.0148,
"step": 1770
},
{
"epoch": 27.0,
"eval_accuracy": 0.7257142857142858,
"eval_loss": 1.3564364910125732,
"eval_runtime": 8.0323,
"eval_samples_per_second": 43.574,
"eval_steps_per_second": 3.735,
"step": 1775
},
{
"epoch": 27.07,
"grad_norm": 1.6692248582839966,
"learning_rate": 4.8433048433048435e-06,
"loss": 0.0217,
"step": 1780
},
{
"epoch": 27.22,
"grad_norm": 0.4036758542060852,
"learning_rate": 4.558404558404559e-06,
"loss": 0.0068,
"step": 1790
},
{
"epoch": 27.38,
"grad_norm": 0.1422310322523117,
"learning_rate": 4.273504273504274e-06,
"loss": 0.0086,
"step": 1800
},
{
"epoch": 27.53,
"grad_norm": 0.36455395817756653,
"learning_rate": 3.988603988603989e-06,
"loss": 0.0097,
"step": 1810
},
{
"epoch": 27.68,
"grad_norm": 2.0207414627075195,
"learning_rate": 3.7037037037037037e-06,
"loss": 0.009,
"step": 1820
},
{
"epoch": 27.83,
"grad_norm": 0.2137887328863144,
"learning_rate": 3.4188034188034193e-06,
"loss": 0.0073,
"step": 1830
},
{
"epoch": 27.98,
"grad_norm": 1.0078092813491821,
"learning_rate": 3.133903133903134e-06,
"loss": 0.0095,
"step": 1840
},
{
"epoch": 28.0,
"eval_accuracy": 0.7228571428571429,
"eval_loss": 1.3562867641448975,
"eval_runtime": 8.0528,
"eval_samples_per_second": 43.463,
"eval_steps_per_second": 3.725,
"step": 1841
},
{
"epoch": 28.14,
"grad_norm": 0.11777978390455246,
"learning_rate": 2.8490028490028492e-06,
"loss": 0.0076,
"step": 1850
},
{
"epoch": 28.29,
"grad_norm": 0.4021410644054413,
"learning_rate": 2.564102564102564e-06,
"loss": 0.0091,
"step": 1860
},
{
"epoch": 28.44,
"grad_norm": 0.19985055923461914,
"learning_rate": 2.2792022792022796e-06,
"loss": 0.0091,
"step": 1870
},
{
"epoch": 28.59,
"grad_norm": 0.30899757146835327,
"learning_rate": 1.9943019943019943e-06,
"loss": 0.0096,
"step": 1880
},
{
"epoch": 28.75,
"grad_norm": 0.1285697966814041,
"learning_rate": 1.7094017094017097e-06,
"loss": 0.0108,
"step": 1890
},
{
"epoch": 28.9,
"grad_norm": 1.3066548109054565,
"learning_rate": 1.4245014245014246e-06,
"loss": 0.0105,
"step": 1900
},
{
"epoch": 28.99,
"eval_accuracy": 0.7171428571428572,
"eval_loss": 1.3570489883422852,
"eval_runtime": 8.0496,
"eval_samples_per_second": 43.481,
"eval_steps_per_second": 3.727,
"step": 1906
},
{
"epoch": 29.05,
"grad_norm": 0.1782771348953247,
"learning_rate": 1.1396011396011398e-06,
"loss": 0.0109,
"step": 1910
},
{
"epoch": 29.2,
"grad_norm": 0.0780392736196518,
"learning_rate": 8.547008547008548e-07,
"loss": 0.0058,
"step": 1920
},
{
"epoch": 29.35,
"grad_norm": 1.1122561693191528,
"learning_rate": 5.698005698005699e-07,
"loss": 0.012,
"step": 1930
},
{
"epoch": 29.51,
"grad_norm": 0.21714162826538086,
"learning_rate": 2.8490028490028494e-07,
"loss": 0.0088,
"step": 1940
},
{
"epoch": 29.66,
"grad_norm": 0.504612922668457,
"learning_rate": 0.0,
"loss": 0.0105,
"step": 1950
},
{
"epoch": 29.66,
"eval_accuracy": 0.7171428571428572,
"eval_loss": 1.3564331531524658,
"eval_runtime": 8.0797,
"eval_samples_per_second": 43.319,
"eval_steps_per_second": 3.713,
"step": 1950
}
],
"logging_steps": 10,
"max_steps": 1950,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 500,
"total_flos": 5.87953618460352e+18,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}