SALAMA_NEW9 / checkpoint-5880 /trainer_state.json
EYEDOL's picture
Upload folder using huggingface_hub
02000a4 verified
{
"best_global_step": 4000,
"best_metric": 0.3019483689209356,
"best_model_checkpoint": "./SALAMA_NEW9/checkpoint-4000",
"epoch": 3.0,
"eval_steps": 2000,
"global_step": 5880,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0051033426894615975,
"grad_norm": 0.5211380124092102,
"learning_rate": 1.8e-07,
"loss": 0.0083,
"step": 10
},
{
"epoch": 0.010206685378923195,
"grad_norm": 1.0282593965530396,
"learning_rate": 3.8e-07,
"loss": 0.0078,
"step": 20
},
{
"epoch": 0.015310028068384792,
"grad_norm": 0.4742085933685303,
"learning_rate": 5.800000000000001e-07,
"loss": 0.0082,
"step": 30
},
{
"epoch": 0.02041337075784639,
"grad_norm": 0.7376335263252258,
"learning_rate": 7.8e-07,
"loss": 0.0056,
"step": 40
},
{
"epoch": 0.025516713447307986,
"grad_norm": 1.5874927043914795,
"learning_rate": 9.800000000000001e-07,
"loss": 0.012,
"step": 50
},
{
"epoch": 0.030620056136769585,
"grad_norm": 0.6289522051811218,
"learning_rate": 1.1800000000000001e-06,
"loss": 0.0081,
"step": 60
},
{
"epoch": 0.035723398826231184,
"grad_norm": 1.9904325008392334,
"learning_rate": 1.3800000000000001e-06,
"loss": 0.018,
"step": 70
},
{
"epoch": 0.04082674151569278,
"grad_norm": 2.7978858947753906,
"learning_rate": 1.5800000000000001e-06,
"loss": 0.0083,
"step": 80
},
{
"epoch": 0.045930084205154376,
"grad_norm": 1.7935121059417725,
"learning_rate": 1.7800000000000001e-06,
"loss": 0.0107,
"step": 90
},
{
"epoch": 0.05103342689461597,
"grad_norm": 0.3225713074207306,
"learning_rate": 1.98e-06,
"loss": 0.01,
"step": 100
},
{
"epoch": 0.056136769584077574,
"grad_norm": 0.9831269979476929,
"learning_rate": 2.1800000000000003e-06,
"loss": 0.0101,
"step": 110
},
{
"epoch": 0.06124011227353917,
"grad_norm": 1.6235119104385376,
"learning_rate": 2.38e-06,
"loss": 0.0067,
"step": 120
},
{
"epoch": 0.06634345496300076,
"grad_norm": 2.157205820083618,
"learning_rate": 2.5800000000000003e-06,
"loss": 0.008,
"step": 130
},
{
"epoch": 0.07144679765246237,
"grad_norm": 0.38939204812049866,
"learning_rate": 2.7800000000000005e-06,
"loss": 0.0054,
"step": 140
},
{
"epoch": 0.07655014034192396,
"grad_norm": 0.7885264754295349,
"learning_rate": 2.9800000000000003e-06,
"loss": 0.0081,
"step": 150
},
{
"epoch": 0.08165348303138556,
"grad_norm": 1.518344759941101,
"learning_rate": 3.1800000000000005e-06,
"loss": 0.0088,
"step": 160
},
{
"epoch": 0.08675682572084716,
"grad_norm": 0.5009278059005737,
"learning_rate": 3.3800000000000007e-06,
"loss": 0.005,
"step": 170
},
{
"epoch": 0.09186016841030875,
"grad_norm": 2.364478588104248,
"learning_rate": 3.58e-06,
"loss": 0.0085,
"step": 180
},
{
"epoch": 0.09696351109977035,
"grad_norm": 0.31605347990989685,
"learning_rate": 3.7800000000000002e-06,
"loss": 0.0073,
"step": 190
},
{
"epoch": 0.10206685378923194,
"grad_norm": 1.516256332397461,
"learning_rate": 3.980000000000001e-06,
"loss": 0.0081,
"step": 200
},
{
"epoch": 0.10717019647869354,
"grad_norm": 1.2425719499588013,
"learning_rate": 4.18e-06,
"loss": 0.0072,
"step": 210
},
{
"epoch": 0.11227353916815515,
"grad_norm": 2.350731611251831,
"learning_rate": 4.38e-06,
"loss": 0.0147,
"step": 220
},
{
"epoch": 0.11737688185761674,
"grad_norm": 0.7598681449890137,
"learning_rate": 4.58e-06,
"loss": 0.0117,
"step": 230
},
{
"epoch": 0.12248022454707834,
"grad_norm": 0.8952233791351318,
"learning_rate": 4.78e-06,
"loss": 0.0089,
"step": 240
},
{
"epoch": 0.12758356723653994,
"grad_norm": 0.6309432983398438,
"learning_rate": 4.980000000000001e-06,
"loss": 0.0047,
"step": 250
},
{
"epoch": 0.13268690992600152,
"grad_norm": 1.7359912395477295,
"learning_rate": 5.18e-06,
"loss": 0.0096,
"step": 260
},
{
"epoch": 0.13779025261546313,
"grad_norm": 2.6553421020507812,
"learning_rate": 5.380000000000001e-06,
"loss": 0.0078,
"step": 270
},
{
"epoch": 0.14289359530492474,
"grad_norm": 1.1739858388900757,
"learning_rate": 5.580000000000001e-06,
"loss": 0.0087,
"step": 280
},
{
"epoch": 0.14799693799438632,
"grad_norm": 0.708070695400238,
"learning_rate": 5.78e-06,
"loss": 0.0075,
"step": 290
},
{
"epoch": 0.15310028068384793,
"grad_norm": 0.8523297309875488,
"learning_rate": 5.98e-06,
"loss": 0.0085,
"step": 300
},
{
"epoch": 0.1582036233733095,
"grad_norm": 2.7822892665863037,
"learning_rate": 6.18e-06,
"loss": 0.0092,
"step": 310
},
{
"epoch": 0.16330696606277112,
"grad_norm": 3.9056875705718994,
"learning_rate": 6.380000000000001e-06,
"loss": 0.0172,
"step": 320
},
{
"epoch": 0.1684103087522327,
"grad_norm": 1.9748897552490234,
"learning_rate": 6.5800000000000005e-06,
"loss": 0.0086,
"step": 330
},
{
"epoch": 0.1735136514416943,
"grad_norm": 1.313830852508545,
"learning_rate": 6.780000000000001e-06,
"loss": 0.0124,
"step": 340
},
{
"epoch": 0.17861699413115592,
"grad_norm": 1.782209038734436,
"learning_rate": 6.98e-06,
"loss": 0.0166,
"step": 350
},
{
"epoch": 0.1837203368206175,
"grad_norm": 2.647857427597046,
"learning_rate": 7.180000000000001e-06,
"loss": 0.0151,
"step": 360
},
{
"epoch": 0.1888236795100791,
"grad_norm": 2.1850852966308594,
"learning_rate": 7.3800000000000005e-06,
"loss": 0.0116,
"step": 370
},
{
"epoch": 0.1939270221995407,
"grad_norm": 2.69811749458313,
"learning_rate": 7.58e-06,
"loss": 0.0118,
"step": 380
},
{
"epoch": 0.1990303648890023,
"grad_norm": 3.1227176189422607,
"learning_rate": 7.78e-06,
"loss": 0.0137,
"step": 390
},
{
"epoch": 0.20413370757846389,
"grad_norm": 3.615041494369507,
"learning_rate": 7.980000000000002e-06,
"loss": 0.0113,
"step": 400
},
{
"epoch": 0.2092370502679255,
"grad_norm": 2.067406177520752,
"learning_rate": 8.18e-06,
"loss": 0.0154,
"step": 410
},
{
"epoch": 0.21434039295738708,
"grad_norm": 1.5332070589065552,
"learning_rate": 8.380000000000001e-06,
"loss": 0.0159,
"step": 420
},
{
"epoch": 0.2194437356468487,
"grad_norm": 1.3139411211013794,
"learning_rate": 8.580000000000001e-06,
"loss": 0.0096,
"step": 430
},
{
"epoch": 0.2245470783363103,
"grad_norm": 2.8063700199127197,
"learning_rate": 8.78e-06,
"loss": 0.0096,
"step": 440
},
{
"epoch": 0.22965042102577188,
"grad_norm": 1.2194337844848633,
"learning_rate": 8.98e-06,
"loss": 0.0113,
"step": 450
},
{
"epoch": 0.2347537637152335,
"grad_norm": 1.4826334714889526,
"learning_rate": 9.180000000000002e-06,
"loss": 0.01,
"step": 460
},
{
"epoch": 0.23985710640469507,
"grad_norm": 1.4572652578353882,
"learning_rate": 9.38e-06,
"loss": 0.012,
"step": 470
},
{
"epoch": 0.24496044909415668,
"grad_norm": 2.323155641555786,
"learning_rate": 9.58e-06,
"loss": 0.016,
"step": 480
},
{
"epoch": 0.25006379178361826,
"grad_norm": 2.1754894256591797,
"learning_rate": 9.780000000000001e-06,
"loss": 0.0134,
"step": 490
},
{
"epoch": 0.25516713447307987,
"grad_norm": 2.55068039894104,
"learning_rate": 9.980000000000001e-06,
"loss": 0.0133,
"step": 500
},
{
"epoch": 0.2602704771625415,
"grad_norm": 2.78916072845459,
"learning_rate": 9.983271375464685e-06,
"loss": 0.0102,
"step": 510
},
{
"epoch": 0.26537381985200303,
"grad_norm": 3.6304802894592285,
"learning_rate": 9.96468401486989e-06,
"loss": 0.0142,
"step": 520
},
{
"epoch": 0.27047716254146464,
"grad_norm": 1.1248530149459839,
"learning_rate": 9.946096654275093e-06,
"loss": 0.0096,
"step": 530
},
{
"epoch": 0.27558050523092625,
"grad_norm": 2.088334798812866,
"learning_rate": 9.927509293680298e-06,
"loss": 0.0189,
"step": 540
},
{
"epoch": 0.28068384792038786,
"grad_norm": 2.035660982131958,
"learning_rate": 9.908921933085503e-06,
"loss": 0.0125,
"step": 550
},
{
"epoch": 0.2857871906098495,
"grad_norm": 1.8379945755004883,
"learning_rate": 9.890334572490708e-06,
"loss": 0.01,
"step": 560
},
{
"epoch": 0.290890533299311,
"grad_norm": 2.2616829872131348,
"learning_rate": 9.871747211895911e-06,
"loss": 0.0119,
"step": 570
},
{
"epoch": 0.29599387598877264,
"grad_norm": 2.0087382793426514,
"learning_rate": 9.853159851301116e-06,
"loss": 0.0104,
"step": 580
},
{
"epoch": 0.30109721867823425,
"grad_norm": 1.7350345849990845,
"learning_rate": 9.83457249070632e-06,
"loss": 0.0144,
"step": 590
},
{
"epoch": 0.30620056136769586,
"grad_norm": 1.9522229433059692,
"learning_rate": 9.815985130111524e-06,
"loss": 0.0152,
"step": 600
},
{
"epoch": 0.31130390405715747,
"grad_norm": 2.231642961502075,
"learning_rate": 9.79739776951673e-06,
"loss": 0.0138,
"step": 610
},
{
"epoch": 0.316407246746619,
"grad_norm": 1.9675791263580322,
"learning_rate": 9.778810408921934e-06,
"loss": 0.0148,
"step": 620
},
{
"epoch": 0.32151058943608063,
"grad_norm": 1.9099314212799072,
"learning_rate": 9.76022304832714e-06,
"loss": 0.0145,
"step": 630
},
{
"epoch": 0.32661393212554224,
"grad_norm": 1.777403473854065,
"learning_rate": 9.741635687732343e-06,
"loss": 0.0164,
"step": 640
},
{
"epoch": 0.33171727481500385,
"grad_norm": 3.160843849182129,
"learning_rate": 9.723048327137548e-06,
"loss": 0.0178,
"step": 650
},
{
"epoch": 0.3368206175044654,
"grad_norm": 0.950518786907196,
"learning_rate": 9.70446096654275e-06,
"loss": 0.0154,
"step": 660
},
{
"epoch": 0.341923960193927,
"grad_norm": 3.335329294204712,
"learning_rate": 9.685873605947956e-06,
"loss": 0.0149,
"step": 670
},
{
"epoch": 0.3470273028833886,
"grad_norm": 2.4955413341522217,
"learning_rate": 9.66728624535316e-06,
"loss": 0.0095,
"step": 680
},
{
"epoch": 0.35213064557285023,
"grad_norm": 1.4317821264266968,
"learning_rate": 9.648698884758366e-06,
"loss": 0.0143,
"step": 690
},
{
"epoch": 0.35723398826231184,
"grad_norm": 1.6408915519714355,
"learning_rate": 9.63011152416357e-06,
"loss": 0.0262,
"step": 700
},
{
"epoch": 0.3623373309517734,
"grad_norm": 5.888397693634033,
"learning_rate": 9.611524163568774e-06,
"loss": 0.0297,
"step": 710
},
{
"epoch": 0.367440673641235,
"grad_norm": 2.526792526245117,
"learning_rate": 9.592936802973979e-06,
"loss": 0.0149,
"step": 720
},
{
"epoch": 0.3725440163306966,
"grad_norm": 1.5245797634124756,
"learning_rate": 9.574349442379182e-06,
"loss": 0.0122,
"step": 730
},
{
"epoch": 0.3776473590201582,
"grad_norm": 2.3768417835235596,
"learning_rate": 9.555762081784387e-06,
"loss": 0.0146,
"step": 740
},
{
"epoch": 0.3827507017096198,
"grad_norm": 1.9545379877090454,
"learning_rate": 9.537174721189592e-06,
"loss": 0.0116,
"step": 750
},
{
"epoch": 0.3878540443990814,
"grad_norm": 2.5888895988464355,
"learning_rate": 9.518587360594797e-06,
"loss": 0.0125,
"step": 760
},
{
"epoch": 0.392957387088543,
"grad_norm": 2.554670810699463,
"learning_rate": 9.5e-06,
"loss": 0.0121,
"step": 770
},
{
"epoch": 0.3980607297780046,
"grad_norm": 2.3274645805358887,
"learning_rate": 9.481412639405206e-06,
"loss": 0.0152,
"step": 780
},
{
"epoch": 0.4031640724674662,
"grad_norm": 1.916551113128662,
"learning_rate": 9.46282527881041e-06,
"loss": 0.0181,
"step": 790
},
{
"epoch": 0.40826741515692777,
"grad_norm": 2.7110981941223145,
"learning_rate": 9.444237918215614e-06,
"loss": 0.0201,
"step": 800
},
{
"epoch": 0.4133707578463894,
"grad_norm": 2.66487193107605,
"learning_rate": 9.425650557620819e-06,
"loss": 0.0163,
"step": 810
},
{
"epoch": 0.418474100535851,
"grad_norm": 4.3903303146362305,
"learning_rate": 9.407063197026024e-06,
"loss": 0.0203,
"step": 820
},
{
"epoch": 0.4235774432253126,
"grad_norm": 0.8613393902778625,
"learning_rate": 9.388475836431227e-06,
"loss": 0.0101,
"step": 830
},
{
"epoch": 0.42868078591477415,
"grad_norm": 14.285655975341797,
"learning_rate": 9.369888475836432e-06,
"loss": 0.0201,
"step": 840
},
{
"epoch": 0.43378412860423576,
"grad_norm": 2.281245708465576,
"learning_rate": 9.351301115241637e-06,
"loss": 0.017,
"step": 850
},
{
"epoch": 0.4388874712936974,
"grad_norm": 2.5612051486968994,
"learning_rate": 9.33271375464684e-06,
"loss": 0.0164,
"step": 860
},
{
"epoch": 0.443990813983159,
"grad_norm": 3.728468894958496,
"learning_rate": 9.314126394052045e-06,
"loss": 0.0184,
"step": 870
},
{
"epoch": 0.4490941566726206,
"grad_norm": 2.954237699508667,
"learning_rate": 9.295539033457249e-06,
"loss": 0.0216,
"step": 880
},
{
"epoch": 0.45419749936208215,
"grad_norm": 2.5756335258483887,
"learning_rate": 9.276951672862453e-06,
"loss": 0.021,
"step": 890
},
{
"epoch": 0.45930084205154376,
"grad_norm": 4.490197658538818,
"learning_rate": 9.258364312267658e-06,
"loss": 0.0138,
"step": 900
},
{
"epoch": 0.46440418474100537,
"grad_norm": 1.9928340911865234,
"learning_rate": 9.239776951672863e-06,
"loss": 0.0158,
"step": 910
},
{
"epoch": 0.469507527430467,
"grad_norm": 3.2016446590423584,
"learning_rate": 9.221189591078068e-06,
"loss": 0.0188,
"step": 920
},
{
"epoch": 0.47461087011992853,
"grad_norm": 2.1624643802642822,
"learning_rate": 9.202602230483272e-06,
"loss": 0.0139,
"step": 930
},
{
"epoch": 0.47971421280939014,
"grad_norm": 2.0108089447021484,
"learning_rate": 9.184014869888477e-06,
"loss": 0.0173,
"step": 940
},
{
"epoch": 0.48481755549885175,
"grad_norm": 2.6266250610351562,
"learning_rate": 9.16542750929368e-06,
"loss": 0.0181,
"step": 950
},
{
"epoch": 0.48992089818831336,
"grad_norm": 1.7041484117507935,
"learning_rate": 9.146840148698885e-06,
"loss": 0.0167,
"step": 960
},
{
"epoch": 0.49502424087777497,
"grad_norm": 2.4042234420776367,
"learning_rate": 9.12825278810409e-06,
"loss": 0.017,
"step": 970
},
{
"epoch": 0.5001275835672365,
"grad_norm": 1.770944595336914,
"learning_rate": 9.109665427509295e-06,
"loss": 0.01,
"step": 980
},
{
"epoch": 0.5052309262566982,
"grad_norm": 2.101804256439209,
"learning_rate": 9.0910780669145e-06,
"loss": 0.0152,
"step": 990
},
{
"epoch": 0.5103342689461597,
"grad_norm": 3.545254945755005,
"learning_rate": 9.072490706319703e-06,
"loss": 0.014,
"step": 1000
},
{
"epoch": 0.5154376116356213,
"grad_norm": 2.445159912109375,
"learning_rate": 9.053903345724908e-06,
"loss": 0.0207,
"step": 1010
},
{
"epoch": 0.520540954325083,
"grad_norm": 3.302297830581665,
"learning_rate": 9.035315985130111e-06,
"loss": 0.0212,
"step": 1020
},
{
"epoch": 0.5256442970145445,
"grad_norm": 4.689877510070801,
"learning_rate": 9.016728624535316e-06,
"loss": 0.025,
"step": 1030
},
{
"epoch": 0.5307476397040061,
"grad_norm": 4.139590740203857,
"learning_rate": 8.998141263940521e-06,
"loss": 0.0158,
"step": 1040
},
{
"epoch": 0.5358509823934677,
"grad_norm": 1.6236610412597656,
"learning_rate": 8.979553903345726e-06,
"loss": 0.0112,
"step": 1050
},
{
"epoch": 0.5409543250829293,
"grad_norm": 2.6642770767211914,
"learning_rate": 8.96096654275093e-06,
"loss": 0.0226,
"step": 1060
},
{
"epoch": 0.546057667772391,
"grad_norm": 2.012868642807007,
"learning_rate": 8.942379182156135e-06,
"loss": 0.0172,
"step": 1070
},
{
"epoch": 0.5511610104618525,
"grad_norm": 1.9676612615585327,
"learning_rate": 8.92379182156134e-06,
"loss": 0.0131,
"step": 1080
},
{
"epoch": 0.5562643531513141,
"grad_norm": 3.358045816421509,
"learning_rate": 8.905204460966543e-06,
"loss": 0.0168,
"step": 1090
},
{
"epoch": 0.5613676958407757,
"grad_norm": 1.9890451431274414,
"learning_rate": 8.886617100371748e-06,
"loss": 0.0158,
"step": 1100
},
{
"epoch": 0.5664710385302373,
"grad_norm": 2.1915857791900635,
"learning_rate": 8.868029739776953e-06,
"loss": 0.015,
"step": 1110
},
{
"epoch": 0.571574381219699,
"grad_norm": 2.0204272270202637,
"learning_rate": 8.849442379182158e-06,
"loss": 0.0217,
"step": 1120
},
{
"epoch": 0.5766777239091605,
"grad_norm": 1.8702834844589233,
"learning_rate": 8.830855018587361e-06,
"loss": 0.014,
"step": 1130
},
{
"epoch": 0.581781066598622,
"grad_norm": 0.8649874925613403,
"learning_rate": 8.812267657992566e-06,
"loss": 0.0168,
"step": 1140
},
{
"epoch": 0.5868844092880837,
"grad_norm": 2.020085334777832,
"learning_rate": 8.79368029739777e-06,
"loss": 0.0166,
"step": 1150
},
{
"epoch": 0.5919877519775453,
"grad_norm": 0.6940491199493408,
"learning_rate": 8.775092936802974e-06,
"loss": 0.014,
"step": 1160
},
{
"epoch": 0.5970910946670069,
"grad_norm": 1.6421513557434082,
"learning_rate": 8.75650557620818e-06,
"loss": 0.0149,
"step": 1170
},
{
"epoch": 0.6021944373564685,
"grad_norm": 1.7957764863967896,
"learning_rate": 8.737918215613384e-06,
"loss": 0.0194,
"step": 1180
},
{
"epoch": 0.60729778004593,
"grad_norm": 1.6488491296768188,
"learning_rate": 8.719330855018588e-06,
"loss": 0.0119,
"step": 1190
},
{
"epoch": 0.6124011227353917,
"grad_norm": 1.9999263286590576,
"learning_rate": 8.700743494423793e-06,
"loss": 0.0165,
"step": 1200
},
{
"epoch": 0.6175044654248533,
"grad_norm": 1.749192237854004,
"learning_rate": 8.682156133828998e-06,
"loss": 0.0193,
"step": 1210
},
{
"epoch": 0.6226078081143149,
"grad_norm": 2.414264440536499,
"learning_rate": 8.663568773234201e-06,
"loss": 0.0199,
"step": 1220
},
{
"epoch": 0.6277111508037765,
"grad_norm": 2.670834541320801,
"learning_rate": 8.644981412639406e-06,
"loss": 0.0178,
"step": 1230
},
{
"epoch": 0.632814493493238,
"grad_norm": 3.2673842906951904,
"learning_rate": 8.626394052044609e-06,
"loss": 0.0161,
"step": 1240
},
{
"epoch": 0.6379178361826997,
"grad_norm": 2.5664849281311035,
"learning_rate": 8.607806691449814e-06,
"loss": 0.0213,
"step": 1250
},
{
"epoch": 0.6430211788721613,
"grad_norm": 2.350846290588379,
"learning_rate": 8.589219330855019e-06,
"loss": 0.0181,
"step": 1260
},
{
"epoch": 0.6481245215616228,
"grad_norm": 2.494407892227173,
"learning_rate": 8.570631970260224e-06,
"loss": 0.0128,
"step": 1270
},
{
"epoch": 0.6532278642510845,
"grad_norm": 2.3424453735351562,
"learning_rate": 8.552044609665429e-06,
"loss": 0.0127,
"step": 1280
},
{
"epoch": 0.658331206940546,
"grad_norm": 2.1651947498321533,
"learning_rate": 8.533457249070632e-06,
"loss": 0.0229,
"step": 1290
},
{
"epoch": 0.6634345496300077,
"grad_norm": 0.5863803029060364,
"learning_rate": 8.514869888475837e-06,
"loss": 0.0145,
"step": 1300
},
{
"epoch": 0.6685378923194693,
"grad_norm": 1.3225018978118896,
"learning_rate": 8.49628252788104e-06,
"loss": 0.0149,
"step": 1310
},
{
"epoch": 0.6736412350089308,
"grad_norm": 3.3000130653381348,
"learning_rate": 8.477695167286246e-06,
"loss": 0.0211,
"step": 1320
},
{
"epoch": 0.6787445776983925,
"grad_norm": 2.677570104598999,
"learning_rate": 8.45910780669145e-06,
"loss": 0.0113,
"step": 1330
},
{
"epoch": 0.683847920387854,
"grad_norm": 1.235533595085144,
"learning_rate": 8.440520446096656e-06,
"loss": 0.0132,
"step": 1340
},
{
"epoch": 0.6889512630773157,
"grad_norm": 1.7336188554763794,
"learning_rate": 8.42193308550186e-06,
"loss": 0.0147,
"step": 1350
},
{
"epoch": 0.6940546057667772,
"grad_norm": 3.8093788623809814,
"learning_rate": 8.403345724907064e-06,
"loss": 0.0168,
"step": 1360
},
{
"epoch": 0.6991579484562388,
"grad_norm": 1.9721407890319824,
"learning_rate": 8.384758364312269e-06,
"loss": 0.0148,
"step": 1370
},
{
"epoch": 0.7042612911457005,
"grad_norm": 4.275414943695068,
"learning_rate": 8.366171003717472e-06,
"loss": 0.0171,
"step": 1380
},
{
"epoch": 0.709364633835162,
"grad_norm": 1.36530339717865,
"learning_rate": 8.347583643122677e-06,
"loss": 0.0157,
"step": 1390
},
{
"epoch": 0.7144679765246237,
"grad_norm": 2.0768120288848877,
"learning_rate": 8.328996282527882e-06,
"loss": 0.0197,
"step": 1400
},
{
"epoch": 0.7195713192140852,
"grad_norm": 3.6376969814300537,
"learning_rate": 8.310408921933087e-06,
"loss": 0.02,
"step": 1410
},
{
"epoch": 0.7246746619035468,
"grad_norm": 4.029935836791992,
"learning_rate": 8.29182156133829e-06,
"loss": 0.0132,
"step": 1420
},
{
"epoch": 0.7297780045930085,
"grad_norm": 3.0603153705596924,
"learning_rate": 8.273234200743495e-06,
"loss": 0.0124,
"step": 1430
},
{
"epoch": 0.73488134728247,
"grad_norm": 0.8475554585456848,
"learning_rate": 8.2546468401487e-06,
"loss": 0.0124,
"step": 1440
},
{
"epoch": 0.7399846899719316,
"grad_norm": 1.9978291988372803,
"learning_rate": 8.236059479553904e-06,
"loss": 0.0117,
"step": 1450
},
{
"epoch": 0.7450880326613932,
"grad_norm": 1.5020562410354614,
"learning_rate": 8.217472118959108e-06,
"loss": 0.0167,
"step": 1460
},
{
"epoch": 0.7501913753508548,
"grad_norm": 1.614305853843689,
"learning_rate": 8.198884758364313e-06,
"loss": 0.0149,
"step": 1470
},
{
"epoch": 0.7552947180403164,
"grad_norm": 2.371570110321045,
"learning_rate": 8.180297397769518e-06,
"loss": 0.0142,
"step": 1480
},
{
"epoch": 0.760398060729778,
"grad_norm": 1.5552469491958618,
"learning_rate": 8.161710037174722e-06,
"loss": 0.0134,
"step": 1490
},
{
"epoch": 0.7655014034192396,
"grad_norm": 1.9674372673034668,
"learning_rate": 8.143122676579927e-06,
"loss": 0.0225,
"step": 1500
},
{
"epoch": 0.7706047461087012,
"grad_norm": 1.94131600856781,
"learning_rate": 8.12453531598513e-06,
"loss": 0.0132,
"step": 1510
},
{
"epoch": 0.7757080887981628,
"grad_norm": 2.533285140991211,
"learning_rate": 8.105947955390335e-06,
"loss": 0.0164,
"step": 1520
},
{
"epoch": 0.7808114314876244,
"grad_norm": 1.7931355237960815,
"learning_rate": 8.08736059479554e-06,
"loss": 0.0145,
"step": 1530
},
{
"epoch": 0.785914774177086,
"grad_norm": 1.5637154579162598,
"learning_rate": 8.068773234200745e-06,
"loss": 0.0131,
"step": 1540
},
{
"epoch": 0.7910181168665475,
"grad_norm": 1.0649983882904053,
"learning_rate": 8.050185873605948e-06,
"loss": 0.0317,
"step": 1550
},
{
"epoch": 0.7961214595560092,
"grad_norm": 1.9837394952774048,
"learning_rate": 8.031598513011153e-06,
"loss": 0.0168,
"step": 1560
},
{
"epoch": 0.8012248022454708,
"grad_norm": 3.6585099697113037,
"learning_rate": 8.013011152416358e-06,
"loss": 0.0131,
"step": 1570
},
{
"epoch": 0.8063281449349324,
"grad_norm": 2.7953765392303467,
"learning_rate": 7.994423791821561e-06,
"loss": 0.0162,
"step": 1580
},
{
"epoch": 0.811431487624394,
"grad_norm": 2.3890202045440674,
"learning_rate": 7.975836431226766e-06,
"loss": 0.0158,
"step": 1590
},
{
"epoch": 0.8165348303138555,
"grad_norm": 2.073019504547119,
"learning_rate": 7.95724907063197e-06,
"loss": 0.0159,
"step": 1600
},
{
"epoch": 0.8216381730033172,
"grad_norm": 2.4629039764404297,
"learning_rate": 7.938661710037175e-06,
"loss": 0.0117,
"step": 1610
},
{
"epoch": 0.8267415156927788,
"grad_norm": 1.4736220836639404,
"learning_rate": 7.92007434944238e-06,
"loss": 0.0145,
"step": 1620
},
{
"epoch": 0.8318448583822403,
"grad_norm": 3.2814719676971436,
"learning_rate": 7.901486988847585e-06,
"loss": 0.0143,
"step": 1630
},
{
"epoch": 0.836948201071702,
"grad_norm": 2.1625795364379883,
"learning_rate": 7.88289962825279e-06,
"loss": 0.0118,
"step": 1640
},
{
"epoch": 0.8420515437611635,
"grad_norm": 1.660874605178833,
"learning_rate": 7.864312267657993e-06,
"loss": 0.0127,
"step": 1650
},
{
"epoch": 0.8471548864506252,
"grad_norm": 1.7518630027770996,
"learning_rate": 7.845724907063198e-06,
"loss": 0.0085,
"step": 1660
},
{
"epoch": 0.8522582291400868,
"grad_norm": 1.452298879623413,
"learning_rate": 7.827137546468401e-06,
"loss": 0.0151,
"step": 1670
},
{
"epoch": 0.8573615718295483,
"grad_norm": 1.8911986351013184,
"learning_rate": 7.808550185873606e-06,
"loss": 0.0166,
"step": 1680
},
{
"epoch": 0.86246491451901,
"grad_norm": 3.8515708446502686,
"learning_rate": 7.789962825278811e-06,
"loss": 0.0221,
"step": 1690
},
{
"epoch": 0.8675682572084715,
"grad_norm": 2.210042953491211,
"learning_rate": 7.771375464684016e-06,
"loss": 0.0208,
"step": 1700
},
{
"epoch": 0.8726715998979332,
"grad_norm": 2.0735044479370117,
"learning_rate": 7.75278810408922e-06,
"loss": 0.0142,
"step": 1710
},
{
"epoch": 0.8777749425873947,
"grad_norm": 2.415004253387451,
"learning_rate": 7.734200743494424e-06,
"loss": 0.0179,
"step": 1720
},
{
"epoch": 0.8828782852768563,
"grad_norm": 2.272406816482544,
"learning_rate": 7.71561338289963e-06,
"loss": 0.0142,
"step": 1730
},
{
"epoch": 0.887981627966318,
"grad_norm": 1.2048219442367554,
"learning_rate": 7.697026022304833e-06,
"loss": 0.0102,
"step": 1740
},
{
"epoch": 0.8930849706557795,
"grad_norm": 1.414962887763977,
"learning_rate": 7.678438661710038e-06,
"loss": 0.0112,
"step": 1750
},
{
"epoch": 0.8981883133452412,
"grad_norm": 0.9970257878303528,
"learning_rate": 7.659851301115243e-06,
"loss": 0.0116,
"step": 1760
},
{
"epoch": 0.9032916560347027,
"grad_norm": 1.7614041566848755,
"learning_rate": 7.641263940520448e-06,
"loss": 0.0198,
"step": 1770
},
{
"epoch": 0.9083949987241643,
"grad_norm": 2.285222291946411,
"learning_rate": 7.622676579925651e-06,
"loss": 0.0146,
"step": 1780
},
{
"epoch": 0.913498341413626,
"grad_norm": 2.238495111465454,
"learning_rate": 7.604089219330856e-06,
"loss": 0.0102,
"step": 1790
},
{
"epoch": 0.9186016841030875,
"grad_norm": 0.7516927123069763,
"learning_rate": 7.58550185873606e-06,
"loss": 0.0134,
"step": 1800
},
{
"epoch": 0.9237050267925491,
"grad_norm": 1.6228662729263306,
"learning_rate": 7.566914498141265e-06,
"loss": 0.0182,
"step": 1810
},
{
"epoch": 0.9288083694820107,
"grad_norm": 1.0676440000534058,
"learning_rate": 7.548327137546469e-06,
"loss": 0.0117,
"step": 1820
},
{
"epoch": 0.9339117121714723,
"grad_norm": 2.345280170440674,
"learning_rate": 7.529739776951673e-06,
"loss": 0.0116,
"step": 1830
},
{
"epoch": 0.939015054860934,
"grad_norm": 2.056405782699585,
"learning_rate": 7.511152416356878e-06,
"loss": 0.0181,
"step": 1840
},
{
"epoch": 0.9441183975503955,
"grad_norm": 1.5895274877548218,
"learning_rate": 7.492565055762082e-06,
"loss": 0.0143,
"step": 1850
},
{
"epoch": 0.9492217402398571,
"grad_norm": 3.693983554840088,
"learning_rate": 7.473977695167287e-06,
"loss": 0.0139,
"step": 1860
},
{
"epoch": 0.9543250829293187,
"grad_norm": 1.7493189573287964,
"learning_rate": 7.455390334572491e-06,
"loss": 0.012,
"step": 1870
},
{
"epoch": 0.9594284256187803,
"grad_norm": 6.353549957275391,
"learning_rate": 7.436802973977696e-06,
"loss": 0.0182,
"step": 1880
},
{
"epoch": 0.9645317683082419,
"grad_norm": 3.067734956741333,
"learning_rate": 7.4182156133829e-06,
"loss": 0.0116,
"step": 1890
},
{
"epoch": 0.9696351109977035,
"grad_norm": 2.4685025215148926,
"learning_rate": 7.399628252788105e-06,
"loss": 0.0153,
"step": 1900
},
{
"epoch": 0.974738453687165,
"grad_norm": 2.9748520851135254,
"learning_rate": 7.38104089219331e-06,
"loss": 0.0196,
"step": 1910
},
{
"epoch": 0.9798417963766267,
"grad_norm": 1.787302017211914,
"learning_rate": 7.362453531598514e-06,
"loss": 0.0139,
"step": 1920
},
{
"epoch": 0.9849451390660883,
"grad_norm": 2.998495101928711,
"learning_rate": 7.343866171003719e-06,
"loss": 0.0117,
"step": 1930
},
{
"epoch": 0.9900484817555499,
"grad_norm": 2.461190938949585,
"learning_rate": 7.325278810408922e-06,
"loss": 0.0188,
"step": 1940
},
{
"epoch": 0.9951518244450115,
"grad_norm": 2.0859811305999756,
"learning_rate": 7.306691449814127e-06,
"loss": 0.0144,
"step": 1950
},
{
"epoch": 1.0,
"grad_norm": 9.421244621276855,
"learning_rate": 7.288104089219331e-06,
"loss": 0.0214,
"step": 1960
},
{
"epoch": 1.0051033426894616,
"grad_norm": 1.6903966665267944,
"learning_rate": 7.269516728624536e-06,
"loss": 0.0052,
"step": 1970
},
{
"epoch": 1.010206685378923,
"grad_norm": 1.4454400539398193,
"learning_rate": 7.25092936802974e-06,
"loss": 0.0054,
"step": 1980
},
{
"epoch": 1.0153100280683849,
"grad_norm": 1.46286141872406,
"learning_rate": 7.2323420074349444e-06,
"loss": 0.0036,
"step": 1990
},
{
"epoch": 1.0204133707578464,
"grad_norm": 1.4223207235336304,
"learning_rate": 7.213754646840149e-06,
"loss": 0.0041,
"step": 2000
},
{
"epoch": 1.0204133707578464,
"eval_loss": 0.007851608097553253,
"eval_runtime": 5932.2159,
"eval_samples_per_second": 2.642,
"eval_steps_per_second": 0.33,
"eval_wer": 0.6722360915468404,
"step": 2000
},
{
"epoch": 1.025516713447308,
"grad_norm": 1.5885974168777466,
"learning_rate": 7.1951672862453535e-06,
"loss": 0.0058,
"step": 2010
},
{
"epoch": 1.0306200561367695,
"grad_norm": 0.7237359881401062,
"learning_rate": 7.1765799256505585e-06,
"loss": 0.0056,
"step": 2020
},
{
"epoch": 1.035723398826231,
"grad_norm": 2.5091042518615723,
"learning_rate": 7.157992565055763e-06,
"loss": 0.0055,
"step": 2030
},
{
"epoch": 1.0408267415156929,
"grad_norm": 1.831894874572754,
"learning_rate": 7.139405204460968e-06,
"loss": 0.0054,
"step": 2040
},
{
"epoch": 1.0459300842051544,
"grad_norm": 0.5377639532089233,
"learning_rate": 7.120817843866171e-06,
"loss": 0.0055,
"step": 2050
},
{
"epoch": 1.051033426894616,
"grad_norm": 0.48973751068115234,
"learning_rate": 7.102230483271376e-06,
"loss": 0.0055,
"step": 2060
},
{
"epoch": 1.0561367695840775,
"grad_norm": 1.8925316333770752,
"learning_rate": 7.08364312267658e-06,
"loss": 0.0097,
"step": 2070
},
{
"epoch": 1.061240112273539,
"grad_norm": 1.005006194114685,
"learning_rate": 7.065055762081785e-06,
"loss": 0.0055,
"step": 2080
},
{
"epoch": 1.0663434549630009,
"grad_norm": 1.7371063232421875,
"learning_rate": 7.04646840148699e-06,
"loss": 0.0067,
"step": 2090
},
{
"epoch": 1.0714467976524624,
"grad_norm": 1.484964370727539,
"learning_rate": 7.027881040892194e-06,
"loss": 0.0041,
"step": 2100
},
{
"epoch": 1.076550140341924,
"grad_norm": 1.0253629684448242,
"learning_rate": 7.009293680297399e-06,
"loss": 0.0065,
"step": 2110
},
{
"epoch": 1.0816534830313855,
"grad_norm": 0.5347009897232056,
"learning_rate": 6.990706319702602e-06,
"loss": 0.0069,
"step": 2120
},
{
"epoch": 1.086756825720847,
"grad_norm": 1.1465612649917603,
"learning_rate": 6.972118959107807e-06,
"loss": 0.0057,
"step": 2130
},
{
"epoch": 1.0918601684103089,
"grad_norm": 0.8260084986686707,
"learning_rate": 6.9535315985130115e-06,
"loss": 0.0036,
"step": 2140
},
{
"epoch": 1.0969635110997704,
"grad_norm": 0.7711835503578186,
"learning_rate": 6.9349442379182165e-06,
"loss": 0.0035,
"step": 2150
},
{
"epoch": 1.102066853789232,
"grad_norm": 1.6993855237960815,
"learning_rate": 6.916356877323421e-06,
"loss": 0.0045,
"step": 2160
},
{
"epoch": 1.1071701964786935,
"grad_norm": 1.6055148839950562,
"learning_rate": 6.897769516728625e-06,
"loss": 0.0037,
"step": 2170
},
{
"epoch": 1.112273539168155,
"grad_norm": 1.5848637819290161,
"learning_rate": 6.87918215613383e-06,
"loss": 0.0057,
"step": 2180
},
{
"epoch": 1.1173768818576169,
"grad_norm": 0.6338240504264832,
"learning_rate": 6.860594795539034e-06,
"loss": 0.0041,
"step": 2190
},
{
"epoch": 1.1224802245470784,
"grad_norm": 0.5418840646743774,
"learning_rate": 6.842007434944239e-06,
"loss": 0.0056,
"step": 2200
},
{
"epoch": 1.12758356723654,
"grad_norm": 1.729345679283142,
"learning_rate": 6.823420074349443e-06,
"loss": 0.0045,
"step": 2210
},
{
"epoch": 1.1326869099260015,
"grad_norm": 2.351128578186035,
"learning_rate": 6.804832713754648e-06,
"loss": 0.0047,
"step": 2220
},
{
"epoch": 1.137790252615463,
"grad_norm": 0.46814098954200745,
"learning_rate": 6.786245353159851e-06,
"loss": 0.0053,
"step": 2230
},
{
"epoch": 1.1428935953049248,
"grad_norm": 1.0121688842773438,
"learning_rate": 6.767657992565056e-06,
"loss": 0.0056,
"step": 2240
},
{
"epoch": 1.1479969379943864,
"grad_norm": 0.32681307196617126,
"learning_rate": 6.74907063197026e-06,
"loss": 0.0033,
"step": 2250
},
{
"epoch": 1.153100280683848,
"grad_norm": 0.3472459614276886,
"learning_rate": 6.730483271375465e-06,
"loss": 0.005,
"step": 2260
},
{
"epoch": 1.1582036233733095,
"grad_norm": 0.3251103460788727,
"learning_rate": 6.7118959107806694e-06,
"loss": 0.0067,
"step": 2270
},
{
"epoch": 1.163306966062771,
"grad_norm": 1.372989535331726,
"learning_rate": 6.6933085501858744e-06,
"loss": 0.0066,
"step": 2280
},
{
"epoch": 1.1684103087522326,
"grad_norm": 3.0950405597686768,
"learning_rate": 6.674721189591079e-06,
"loss": 0.0059,
"step": 2290
},
{
"epoch": 1.1735136514416944,
"grad_norm": 0.6446343064308167,
"learning_rate": 6.656133828996283e-06,
"loss": 0.0048,
"step": 2300
},
{
"epoch": 1.178616994131156,
"grad_norm": 2.7999908924102783,
"learning_rate": 6.637546468401488e-06,
"loss": 0.0075,
"step": 2310
},
{
"epoch": 1.1837203368206175,
"grad_norm": 1.556735634803772,
"learning_rate": 6.618959107806692e-06,
"loss": 0.0078,
"step": 2320
},
{
"epoch": 1.188823679510079,
"grad_norm": 0.6871877908706665,
"learning_rate": 6.600371747211897e-06,
"loss": 0.0039,
"step": 2330
},
{
"epoch": 1.1939270221995406,
"grad_norm": 0.6974169611930847,
"learning_rate": 6.581784386617101e-06,
"loss": 0.0044,
"step": 2340
},
{
"epoch": 1.1990303648890024,
"grad_norm": 0.34097474813461304,
"learning_rate": 6.563197026022305e-06,
"loss": 0.0038,
"step": 2350
},
{
"epoch": 1.204133707578464,
"grad_norm": 1.1647700071334839,
"learning_rate": 6.544609665427509e-06,
"loss": 0.0042,
"step": 2360
},
{
"epoch": 1.2092370502679255,
"grad_norm": 0.5699931383132935,
"learning_rate": 6.526022304832714e-06,
"loss": 0.0044,
"step": 2370
},
{
"epoch": 1.214340392957387,
"grad_norm": 0.9477786421775818,
"learning_rate": 6.507434944237919e-06,
"loss": 0.0062,
"step": 2380
},
{
"epoch": 1.2194437356468486,
"grad_norm": 1.1258920431137085,
"learning_rate": 6.488847583643123e-06,
"loss": 0.005,
"step": 2390
},
{
"epoch": 1.2245470783363104,
"grad_norm": 1.580121636390686,
"learning_rate": 6.470260223048328e-06,
"loss": 0.0037,
"step": 2400
},
{
"epoch": 1.229650421025772,
"grad_norm": 1.993891716003418,
"learning_rate": 6.4516728624535315e-06,
"loss": 0.0062,
"step": 2410
},
{
"epoch": 1.2347537637152335,
"grad_norm": 1.034355878829956,
"learning_rate": 6.4330855018587365e-06,
"loss": 0.0053,
"step": 2420
},
{
"epoch": 1.239857106404695,
"grad_norm": 1.6849045753479004,
"learning_rate": 6.414498141263941e-06,
"loss": 0.0056,
"step": 2430
},
{
"epoch": 1.2449604490941566,
"grad_norm": 1.9418292045593262,
"learning_rate": 6.395910780669146e-06,
"loss": 0.006,
"step": 2440
},
{
"epoch": 1.2500637917836182,
"grad_norm": 1.7483155727386475,
"learning_rate": 6.37732342007435e-06,
"loss": 0.0055,
"step": 2450
},
{
"epoch": 1.25516713447308,
"grad_norm": 0.9368677139282227,
"learning_rate": 6.358736059479555e-06,
"loss": 0.006,
"step": 2460
},
{
"epoch": 1.2602704771625415,
"grad_norm": 1.3387763500213623,
"learning_rate": 6.34014869888476e-06,
"loss": 0.0066,
"step": 2470
},
{
"epoch": 1.265373819852003,
"grad_norm": 0.7016597986221313,
"learning_rate": 6.321561338289963e-06,
"loss": 0.0035,
"step": 2480
},
{
"epoch": 1.2704771625414646,
"grad_norm": 2.289067268371582,
"learning_rate": 6.302973977695168e-06,
"loss": 0.0041,
"step": 2490
},
{
"epoch": 1.2755805052309261,
"grad_norm": 2.0604097843170166,
"learning_rate": 6.284386617100372e-06,
"loss": 0.0029,
"step": 2500
},
{
"epoch": 1.280683847920388,
"grad_norm": 0.09299144893884659,
"learning_rate": 6.265799256505577e-06,
"loss": 0.0065,
"step": 2510
},
{
"epoch": 1.2857871906098495,
"grad_norm": 2.164297342300415,
"learning_rate": 6.247211895910781e-06,
"loss": 0.0041,
"step": 2520
},
{
"epoch": 1.290890533299311,
"grad_norm": 1.1168850660324097,
"learning_rate": 6.228624535315985e-06,
"loss": 0.0064,
"step": 2530
},
{
"epoch": 1.2959938759887726,
"grad_norm": 1.1941462755203247,
"learning_rate": 6.2100371747211895e-06,
"loss": 0.0048,
"step": 2540
},
{
"epoch": 1.3010972186782341,
"grad_norm": 0.29545173048973083,
"learning_rate": 6.1914498141263945e-06,
"loss": 0.005,
"step": 2550
},
{
"epoch": 1.306200561367696,
"grad_norm": 1.6539217233657837,
"learning_rate": 6.1728624535315994e-06,
"loss": 0.0035,
"step": 2560
},
{
"epoch": 1.3113039040571575,
"grad_norm": 0.5509535670280457,
"learning_rate": 6.1542750929368036e-06,
"loss": 0.0081,
"step": 2570
},
{
"epoch": 1.316407246746619,
"grad_norm": 1.1881476640701294,
"learning_rate": 6.1356877323420085e-06,
"loss": 0.0074,
"step": 2580
},
{
"epoch": 1.3215105894360806,
"grad_norm": 1.1224453449249268,
"learning_rate": 6.117100371747212e-06,
"loss": 0.0058,
"step": 2590
},
{
"epoch": 1.3266139321255421,
"grad_norm": 0.6658273935317993,
"learning_rate": 6.098513011152417e-06,
"loss": 0.0046,
"step": 2600
},
{
"epoch": 1.331717274815004,
"grad_norm": 2.696826696395874,
"learning_rate": 6.079925650557621e-06,
"loss": 0.0046,
"step": 2610
},
{
"epoch": 1.3368206175044655,
"grad_norm": 0.6089099645614624,
"learning_rate": 6.061338289962826e-06,
"loss": 0.0054,
"step": 2620
},
{
"epoch": 1.341923960193927,
"grad_norm": 0.5594236850738525,
"learning_rate": 6.04275092936803e-06,
"loss": 0.0037,
"step": 2630
},
{
"epoch": 1.3470273028833886,
"grad_norm": 2.5467419624328613,
"learning_rate": 6.024163568773235e-06,
"loss": 0.0062,
"step": 2640
},
{
"epoch": 1.3521306455728501,
"grad_norm": 1.825701117515564,
"learning_rate": 6.00557620817844e-06,
"loss": 0.0074,
"step": 2650
},
{
"epoch": 1.357233988262312,
"grad_norm": 1.2724944353103638,
"learning_rate": 5.986988847583643e-06,
"loss": 0.0055,
"step": 2660
},
{
"epoch": 1.3623373309517735,
"grad_norm": 0.20556636154651642,
"learning_rate": 5.968401486988848e-06,
"loss": 0.0049,
"step": 2670
},
{
"epoch": 1.367440673641235,
"grad_norm": 0.982221782207489,
"learning_rate": 5.949814126394052e-06,
"loss": 0.005,
"step": 2680
},
{
"epoch": 1.3725440163306966,
"grad_norm": 0.5019739866256714,
"learning_rate": 5.931226765799257e-06,
"loss": 0.005,
"step": 2690
},
{
"epoch": 1.3776473590201581,
"grad_norm": 0.9710202217102051,
"learning_rate": 5.9126394052044615e-06,
"loss": 0.0046,
"step": 2700
},
{
"epoch": 1.38275070170962,
"grad_norm": 1.481512427330017,
"learning_rate": 5.894052044609666e-06,
"loss": 0.0039,
"step": 2710
},
{
"epoch": 1.3878540443990814,
"grad_norm": 0.9244014024734497,
"learning_rate": 5.87546468401487e-06,
"loss": 0.0035,
"step": 2720
},
{
"epoch": 1.392957387088543,
"grad_norm": 0.28111106157302856,
"learning_rate": 5.856877323420075e-06,
"loss": 0.0054,
"step": 2730
},
{
"epoch": 1.3980607297780046,
"grad_norm": 1.0643965005874634,
"learning_rate": 5.83828996282528e-06,
"loss": 0.0048,
"step": 2740
},
{
"epoch": 1.403164072467466,
"grad_norm": 0.3674823045730591,
"learning_rate": 5.819702602230484e-06,
"loss": 0.0049,
"step": 2750
},
{
"epoch": 1.4082674151569279,
"grad_norm": 1.2270021438598633,
"learning_rate": 5.801115241635689e-06,
"loss": 0.0037,
"step": 2760
},
{
"epoch": 1.4133707578463894,
"grad_norm": 4.543473243713379,
"learning_rate": 5.782527881040892e-06,
"loss": 0.0067,
"step": 2770
},
{
"epoch": 1.418474100535851,
"grad_norm": 1.119815468788147,
"learning_rate": 5.763940520446097e-06,
"loss": 0.0036,
"step": 2780
},
{
"epoch": 1.4235774432253125,
"grad_norm": 1.7222695350646973,
"learning_rate": 5.745353159851301e-06,
"loss": 0.0065,
"step": 2790
},
{
"epoch": 1.428680785914774,
"grad_norm": 0.778711199760437,
"learning_rate": 5.726765799256506e-06,
"loss": 0.0034,
"step": 2800
},
{
"epoch": 1.4337841286042359,
"grad_norm": 0.5175672173500061,
"learning_rate": 5.70817843866171e-06,
"loss": 0.0055,
"step": 2810
},
{
"epoch": 1.4388874712936974,
"grad_norm": 1.3372684717178345,
"learning_rate": 5.689591078066915e-06,
"loss": 0.005,
"step": 2820
},
{
"epoch": 1.443990813983159,
"grad_norm": 0.7624754309654236,
"learning_rate": 5.67100371747212e-06,
"loss": 0.0032,
"step": 2830
},
{
"epoch": 1.4490941566726205,
"grad_norm": 0.597372829914093,
"learning_rate": 5.652416356877324e-06,
"loss": 0.0034,
"step": 2840
},
{
"epoch": 1.454197499362082,
"grad_norm": 0.6024683713912964,
"learning_rate": 5.633828996282529e-06,
"loss": 0.0047,
"step": 2850
},
{
"epoch": 1.4593008420515439,
"grad_norm": 3.4740748405456543,
"learning_rate": 5.615241635687733e-06,
"loss": 0.007,
"step": 2860
},
{
"epoch": 1.4644041847410054,
"grad_norm": 1.7954155206680298,
"learning_rate": 5.596654275092938e-06,
"loss": 0.006,
"step": 2870
},
{
"epoch": 1.469507527430467,
"grad_norm": 0.7482948899269104,
"learning_rate": 5.578066914498142e-06,
"loss": 0.0053,
"step": 2880
},
{
"epoch": 1.4746108701199285,
"grad_norm": 2.095458507537842,
"learning_rate": 5.559479553903346e-06,
"loss": 0.0062,
"step": 2890
},
{
"epoch": 1.47971421280939,
"grad_norm": 1.7963470220565796,
"learning_rate": 5.54089219330855e-06,
"loss": 0.0068,
"step": 2900
},
{
"epoch": 1.4848175554988519,
"grad_norm": 2.6437880992889404,
"learning_rate": 5.522304832713755e-06,
"loss": 0.0092,
"step": 2910
},
{
"epoch": 1.4899208981883134,
"grad_norm": 1.520580768585205,
"learning_rate": 5.503717472118959e-06,
"loss": 0.0042,
"step": 2920
},
{
"epoch": 1.495024240877775,
"grad_norm": 4.081545352935791,
"learning_rate": 5.485130111524164e-06,
"loss": 0.0044,
"step": 2930
},
{
"epoch": 1.5001275835672365,
"grad_norm": 1.9808855056762695,
"learning_rate": 5.466542750929369e-06,
"loss": 0.0043,
"step": 2940
},
{
"epoch": 1.505230926256698,
"grad_norm": 0.6452007293701172,
"learning_rate": 5.4479553903345724e-06,
"loss": 0.0043,
"step": 2950
},
{
"epoch": 1.5103342689461599,
"grad_norm": 0.26754477620124817,
"learning_rate": 5.429368029739777e-06,
"loss": 0.0036,
"step": 2960
},
{
"epoch": 1.5154376116356212,
"grad_norm": 1.183559536933899,
"learning_rate": 5.4107806691449816e-06,
"loss": 0.0069,
"step": 2970
},
{
"epoch": 1.520540954325083,
"grad_norm": 0.8674173355102539,
"learning_rate": 5.3921933085501865e-06,
"loss": 0.0041,
"step": 2980
},
{
"epoch": 1.5256442970145445,
"grad_norm": 0.28192785382270813,
"learning_rate": 5.373605947955391e-06,
"loss": 0.0039,
"step": 2990
},
{
"epoch": 1.530747639704006,
"grad_norm": 1.6907070875167847,
"learning_rate": 5.355018587360596e-06,
"loss": 0.0065,
"step": 3000
},
{
"epoch": 1.5358509823934678,
"grad_norm": 1.0499199628829956,
"learning_rate": 5.336431226765799e-06,
"loss": 0.0029,
"step": 3010
},
{
"epoch": 1.5409543250829292,
"grad_norm": 0.5462940335273743,
"learning_rate": 5.317843866171004e-06,
"loss": 0.0047,
"step": 3020
},
{
"epoch": 1.546057667772391,
"grad_norm": 0.8141253590583801,
"learning_rate": 5.299256505576209e-06,
"loss": 0.0048,
"step": 3030
},
{
"epoch": 1.5511610104618525,
"grad_norm": 0.5449689030647278,
"learning_rate": 5.280669144981413e-06,
"loss": 0.0041,
"step": 3040
},
{
"epoch": 1.556264353151314,
"grad_norm": 1.7593544721603394,
"learning_rate": 5.262081784386618e-06,
"loss": 0.0051,
"step": 3050
},
{
"epoch": 1.5613676958407758,
"grad_norm": 0.6444630026817322,
"learning_rate": 5.243494423791822e-06,
"loss": 0.0038,
"step": 3060
},
{
"epoch": 1.5664710385302372,
"grad_norm": 0.7694640755653381,
"learning_rate": 5.224907063197026e-06,
"loss": 0.0076,
"step": 3070
},
{
"epoch": 1.571574381219699,
"grad_norm": 0.9293046593666077,
"learning_rate": 5.20631970260223e-06,
"loss": 0.0057,
"step": 3080
},
{
"epoch": 1.5766777239091605,
"grad_norm": 1.5675665140151978,
"learning_rate": 5.187732342007435e-06,
"loss": 0.0034,
"step": 3090
},
{
"epoch": 1.581781066598622,
"grad_norm": 0.7534652352333069,
"learning_rate": 5.1691449814126395e-06,
"loss": 0.0056,
"step": 3100
},
{
"epoch": 1.5868844092880838,
"grad_norm": 0.5952958464622498,
"learning_rate": 5.1505576208178445e-06,
"loss": 0.007,
"step": 3110
},
{
"epoch": 1.5919877519775452,
"grad_norm": 2.0124547481536865,
"learning_rate": 5.1319702602230495e-06,
"loss": 0.0085,
"step": 3120
},
{
"epoch": 1.597091094667007,
"grad_norm": 2.005147695541382,
"learning_rate": 5.113382899628253e-06,
"loss": 0.0054,
"step": 3130
},
{
"epoch": 1.6021944373564685,
"grad_norm": 2.4814913272857666,
"learning_rate": 5.094795539033458e-06,
"loss": 0.0078,
"step": 3140
},
{
"epoch": 1.60729778004593,
"grad_norm": 1.2369087934494019,
"learning_rate": 5.076208178438662e-06,
"loss": 0.0045,
"step": 3150
},
{
"epoch": 1.6124011227353918,
"grad_norm": 1.4700592756271362,
"learning_rate": 5.057620817843867e-06,
"loss": 0.005,
"step": 3160
},
{
"epoch": 1.6175044654248532,
"grad_norm": 0.8340181112289429,
"learning_rate": 5.039033457249071e-06,
"loss": 0.0039,
"step": 3170
},
{
"epoch": 1.622607808114315,
"grad_norm": 1.1849232912063599,
"learning_rate": 5.020446096654276e-06,
"loss": 0.0057,
"step": 3180
},
{
"epoch": 1.6277111508037765,
"grad_norm": 0.2937636077404022,
"learning_rate": 5.001858736059479e-06,
"loss": 0.0035,
"step": 3190
},
{
"epoch": 1.632814493493238,
"grad_norm": 2.127737045288086,
"learning_rate": 4.983271375464684e-06,
"loss": 0.006,
"step": 3200
},
{
"epoch": 1.6379178361826998,
"grad_norm": 0.5009581446647644,
"learning_rate": 4.964684014869889e-06,
"loss": 0.0049,
"step": 3210
},
{
"epoch": 1.6430211788721611,
"grad_norm": 0.9251111745834351,
"learning_rate": 4.946096654275093e-06,
"loss": 0.0033,
"step": 3220
},
{
"epoch": 1.648124521561623,
"grad_norm": 0.9365226626396179,
"learning_rate": 4.9275092936802975e-06,
"loss": 0.0057,
"step": 3230
},
{
"epoch": 1.6532278642510845,
"grad_norm": 1.4188483953475952,
"learning_rate": 4.9089219330855024e-06,
"loss": 0.006,
"step": 3240
},
{
"epoch": 1.658331206940546,
"grad_norm": 2.330155372619629,
"learning_rate": 4.8903345724907066e-06,
"loss": 0.0087,
"step": 3250
},
{
"epoch": 1.6634345496300078,
"grad_norm": 0.6663316488265991,
"learning_rate": 4.8717472118959115e-06,
"loss": 0.0077,
"step": 3260
},
{
"epoch": 1.6685378923194691,
"grad_norm": 0.3848799169063568,
"learning_rate": 4.853159851301116e-06,
"loss": 0.0037,
"step": 3270
},
{
"epoch": 1.673641235008931,
"grad_norm": 1.8248586654663086,
"learning_rate": 4.83457249070632e-06,
"loss": 0.0048,
"step": 3280
},
{
"epoch": 1.6787445776983925,
"grad_norm": 0.4323923885822296,
"learning_rate": 4.815985130111525e-06,
"loss": 0.0039,
"step": 3290
},
{
"epoch": 1.683847920387854,
"grad_norm": 0.8399850726127625,
"learning_rate": 4.797397769516729e-06,
"loss": 0.0048,
"step": 3300
},
{
"epoch": 1.6889512630773158,
"grad_norm": 1.9225555658340454,
"learning_rate": 4.778810408921933e-06,
"loss": 0.0049,
"step": 3310
},
{
"epoch": 1.6940546057667771,
"grad_norm": 3.109381675720215,
"learning_rate": 4.760223048327138e-06,
"loss": 0.0032,
"step": 3320
},
{
"epoch": 1.699157948456239,
"grad_norm": 0.18898829817771912,
"learning_rate": 4.741635687732342e-06,
"loss": 0.0044,
"step": 3330
},
{
"epoch": 1.7042612911457005,
"grad_norm": 1.3611193895339966,
"learning_rate": 4.723048327137547e-06,
"loss": 0.0043,
"step": 3340
},
{
"epoch": 1.709364633835162,
"grad_norm": 2.0131754875183105,
"learning_rate": 4.704460966542751e-06,
"loss": 0.0055,
"step": 3350
},
{
"epoch": 1.7144679765246238,
"grad_norm": 0.367348313331604,
"learning_rate": 4.685873605947956e-06,
"loss": 0.0035,
"step": 3360
},
{
"epoch": 1.7195713192140851,
"grad_norm": 0.44550269842147827,
"learning_rate": 4.66728624535316e-06,
"loss": 0.0048,
"step": 3370
},
{
"epoch": 1.724674661903547,
"grad_norm": 0.8190656900405884,
"learning_rate": 4.6486988847583645e-06,
"loss": 0.0034,
"step": 3380
},
{
"epoch": 1.7297780045930085,
"grad_norm": 1.4577871561050415,
"learning_rate": 4.6301115241635695e-06,
"loss": 0.0036,
"step": 3390
},
{
"epoch": 1.73488134728247,
"grad_norm": 1.2969541549682617,
"learning_rate": 4.611524163568774e-06,
"loss": 0.0056,
"step": 3400
},
{
"epoch": 1.7399846899719316,
"grad_norm": 1.2122235298156738,
"learning_rate": 4.592936802973978e-06,
"loss": 0.005,
"step": 3410
},
{
"epoch": 1.7450880326613931,
"grad_norm": 1.4638077020645142,
"learning_rate": 4.574349442379183e-06,
"loss": 0.0067,
"step": 3420
},
{
"epoch": 1.750191375350855,
"grad_norm": 0.8848273158073425,
"learning_rate": 4.555762081784387e-06,
"loss": 0.0043,
"step": 3430
},
{
"epoch": 1.7552947180403164,
"grad_norm": 1.0738859176635742,
"learning_rate": 4.537174721189592e-06,
"loss": 0.003,
"step": 3440
},
{
"epoch": 1.760398060729778,
"grad_norm": 0.7346417307853699,
"learning_rate": 4.518587360594796e-06,
"loss": 0.0064,
"step": 3450
},
{
"epoch": 1.7655014034192396,
"grad_norm": 0.6269612908363342,
"learning_rate": 4.5e-06,
"loss": 0.0029,
"step": 3460
},
{
"epoch": 1.770604746108701,
"grad_norm": 0.3140880763530731,
"learning_rate": 4.481412639405205e-06,
"loss": 0.0031,
"step": 3470
},
{
"epoch": 1.7757080887981629,
"grad_norm": 1.5578272342681885,
"learning_rate": 4.462825278810409e-06,
"loss": 0.0049,
"step": 3480
},
{
"epoch": 1.7808114314876244,
"grad_norm": 1.5797828435897827,
"learning_rate": 4.444237918215613e-06,
"loss": 0.0038,
"step": 3490
},
{
"epoch": 1.785914774177086,
"grad_norm": 0.6058505773544312,
"learning_rate": 4.425650557620818e-06,
"loss": 0.0041,
"step": 3500
},
{
"epoch": 1.7910181168665475,
"grad_norm": 1.7516237497329712,
"learning_rate": 4.4070631970260225e-06,
"loss": 0.0034,
"step": 3510
},
{
"epoch": 1.796121459556009,
"grad_norm": 1.806767463684082,
"learning_rate": 4.388475836431227e-06,
"loss": 0.0055,
"step": 3520
},
{
"epoch": 1.8012248022454709,
"grad_norm": 1.1925740242004395,
"learning_rate": 4.369888475836432e-06,
"loss": 0.009,
"step": 3530
},
{
"epoch": 1.8063281449349324,
"grad_norm": 0.4238371253013611,
"learning_rate": 4.3513011152416366e-06,
"loss": 0.0049,
"step": 3540
},
{
"epoch": 1.811431487624394,
"grad_norm": 0.4037840664386749,
"learning_rate": 4.332713754646841e-06,
"loss": 0.0036,
"step": 3550
},
{
"epoch": 1.8165348303138555,
"grad_norm": 0.35048696398735046,
"learning_rate": 4.314126394052045e-06,
"loss": 0.0041,
"step": 3560
},
{
"epoch": 1.821638173003317,
"grad_norm": 0.916644811630249,
"learning_rate": 4.29553903345725e-06,
"loss": 0.0052,
"step": 3570
},
{
"epoch": 1.8267415156927789,
"grad_norm": 1.2729437351226807,
"learning_rate": 4.276951672862454e-06,
"loss": 0.0039,
"step": 3580
},
{
"epoch": 1.8318448583822402,
"grad_norm": 0.14079026877880096,
"learning_rate": 4.258364312267658e-06,
"loss": 0.0052,
"step": 3590
},
{
"epoch": 1.836948201071702,
"grad_norm": 0.7596153616905212,
"learning_rate": 4.239776951672863e-06,
"loss": 0.0036,
"step": 3600
},
{
"epoch": 1.8420515437611635,
"grad_norm": 0.5967218279838562,
"learning_rate": 4.221189591078067e-06,
"loss": 0.0034,
"step": 3610
},
{
"epoch": 1.847154886450625,
"grad_norm": 3.4013657569885254,
"learning_rate": 4.202602230483272e-06,
"loss": 0.0037,
"step": 3620
},
{
"epoch": 1.8522582291400869,
"grad_norm": 0.5800639986991882,
"learning_rate": 4.184014869888476e-06,
"loss": 0.0054,
"step": 3630
},
{
"epoch": 1.8573615718295482,
"grad_norm": 1.089106798171997,
"learning_rate": 4.16542750929368e-06,
"loss": 0.0034,
"step": 3640
},
{
"epoch": 1.86246491451901,
"grad_norm": 1.3403905630111694,
"learning_rate": 4.146840148698885e-06,
"loss": 0.0081,
"step": 3650
},
{
"epoch": 1.8675682572084715,
"grad_norm": 1.1740740537643433,
"learning_rate": 4.1282527881040895e-06,
"loss": 0.006,
"step": 3660
},
{
"epoch": 1.872671599897933,
"grad_norm": 1.466586709022522,
"learning_rate": 4.109665427509294e-06,
"loss": 0.0057,
"step": 3670
},
{
"epoch": 1.8777749425873949,
"grad_norm": 0.5066618919372559,
"learning_rate": 4.091078066914499e-06,
"loss": 0.0052,
"step": 3680
},
{
"epoch": 1.8828782852768562,
"grad_norm": 1.9082309007644653,
"learning_rate": 4.072490706319703e-06,
"loss": 0.0107,
"step": 3690
},
{
"epoch": 1.887981627966318,
"grad_norm": 1.2575207948684692,
"learning_rate": 4.053903345724907e-06,
"loss": 0.0036,
"step": 3700
},
{
"epoch": 1.8930849706557795,
"grad_norm": 2.694517135620117,
"learning_rate": 4.035315985130112e-06,
"loss": 0.0037,
"step": 3710
},
{
"epoch": 1.898188313345241,
"grad_norm": 1.3405297994613647,
"learning_rate": 4.016728624535317e-06,
"loss": 0.0035,
"step": 3720
},
{
"epoch": 1.9032916560347029,
"grad_norm": 0.7254632115364075,
"learning_rate": 3.998141263940521e-06,
"loss": 0.0058,
"step": 3730
},
{
"epoch": 1.9083949987241642,
"grad_norm": 0.9581726789474487,
"learning_rate": 3.979553903345725e-06,
"loss": 0.0035,
"step": 3740
},
{
"epoch": 1.913498341413626,
"grad_norm": 1.521457314491272,
"learning_rate": 3.96096654275093e-06,
"loss": 0.0028,
"step": 3750
},
{
"epoch": 1.9186016841030875,
"grad_norm": 1.7886348962783813,
"learning_rate": 3.942379182156134e-06,
"loss": 0.0036,
"step": 3760
},
{
"epoch": 1.923705026792549,
"grad_norm": 1.3068257570266724,
"learning_rate": 3.923791821561338e-06,
"loss": 0.0035,
"step": 3770
},
{
"epoch": 1.9288083694820108,
"grad_norm": 0.31176239252090454,
"learning_rate": 3.905204460966543e-06,
"loss": 0.0044,
"step": 3780
},
{
"epoch": 1.9339117121714722,
"grad_norm": 1.6306222677230835,
"learning_rate": 3.8866171003717475e-06,
"loss": 0.0056,
"step": 3790
},
{
"epoch": 1.939015054860934,
"grad_norm": 0.576551616191864,
"learning_rate": 3.868029739776952e-06,
"loss": 0.0043,
"step": 3800
},
{
"epoch": 1.9441183975503955,
"grad_norm": 0.834531843662262,
"learning_rate": 3.849442379182157e-06,
"loss": 0.0042,
"step": 3810
},
{
"epoch": 1.949221740239857,
"grad_norm": 0.5537549257278442,
"learning_rate": 3.830855018587361e-06,
"loss": 0.0032,
"step": 3820
},
{
"epoch": 1.9543250829293188,
"grad_norm": 1.457414150238037,
"learning_rate": 3.8122676579925653e-06,
"loss": 0.0046,
"step": 3830
},
{
"epoch": 1.9594284256187802,
"grad_norm": 1.4577444791793823,
"learning_rate": 3.79368029739777e-06,
"loss": 0.0042,
"step": 3840
},
{
"epoch": 1.964531768308242,
"grad_norm": 1.1170302629470825,
"learning_rate": 3.7750929368029744e-06,
"loss": 0.0024,
"step": 3850
},
{
"epoch": 1.9696351109977035,
"grad_norm": 1.491133213043213,
"learning_rate": 3.7565055762081785e-06,
"loss": 0.0034,
"step": 3860
},
{
"epoch": 1.974738453687165,
"grad_norm": 0.6399077773094177,
"learning_rate": 3.737918215613383e-06,
"loss": 0.0037,
"step": 3870
},
{
"epoch": 1.9798417963766268,
"grad_norm": 0.1969028264284134,
"learning_rate": 3.7193308550185876e-06,
"loss": 0.0032,
"step": 3880
},
{
"epoch": 1.9849451390660882,
"grad_norm": 0.8877514600753784,
"learning_rate": 3.7007434944237918e-06,
"loss": 0.0048,
"step": 3890
},
{
"epoch": 1.99004848175555,
"grad_norm": 0.6252923607826233,
"learning_rate": 3.6821561338289967e-06,
"loss": 0.0023,
"step": 3900
},
{
"epoch": 1.9951518244450115,
"grad_norm": 0.5143133997917175,
"learning_rate": 3.6635687732342013e-06,
"loss": 0.0032,
"step": 3910
},
{
"epoch": 2.0,
"grad_norm": 0.16264012455940247,
"learning_rate": 3.6449814126394054e-06,
"loss": 0.0032,
"step": 3920
},
{
"epoch": 2.0051033426894618,
"grad_norm": 1.059360146522522,
"learning_rate": 3.62639405204461e-06,
"loss": 0.0019,
"step": 3930
},
{
"epoch": 2.010206685378923,
"grad_norm": 0.20279403030872345,
"learning_rate": 3.6078066914498145e-06,
"loss": 0.0026,
"step": 3940
},
{
"epoch": 2.015310028068385,
"grad_norm": 3.2196450233459473,
"learning_rate": 3.5892193308550187e-06,
"loss": 0.0014,
"step": 3950
},
{
"epoch": 2.020413370757846,
"grad_norm": 0.14018088579177856,
"learning_rate": 3.5706319702602232e-06,
"loss": 0.0025,
"step": 3960
},
{
"epoch": 2.025516713447308,
"grad_norm": 0.1343514323234558,
"learning_rate": 3.5520446096654278e-06,
"loss": 0.0014,
"step": 3970
},
{
"epoch": 2.0306200561367698,
"grad_norm": 0.08496394008398056,
"learning_rate": 3.533457249070632e-06,
"loss": 0.0008,
"step": 3980
},
{
"epoch": 2.035723398826231,
"grad_norm": 0.08516672253608704,
"learning_rate": 3.514869888475837e-06,
"loss": 0.0009,
"step": 3990
},
{
"epoch": 2.040826741515693,
"grad_norm": 0.12272001802921295,
"learning_rate": 3.4962825278810415e-06,
"loss": 0.0047,
"step": 4000
},
{
"epoch": 2.040826741515693,
"eval_loss": 0.0037899946328252554,
"eval_runtime": 5936.4458,
"eval_samples_per_second": 2.64,
"eval_steps_per_second": 0.33,
"eval_wer": 0.3019483689209356,
"step": 4000
},
{
"epoch": 2.045930084205154,
"grad_norm": 2.3276867866516113,
"learning_rate": 3.4776951672862456e-06,
"loss": 0.0031,
"step": 4010
},
{
"epoch": 2.051033426894616,
"grad_norm": 0.3778473436832428,
"learning_rate": 3.45910780669145e-06,
"loss": 0.001,
"step": 4020
},
{
"epoch": 2.0561367695840778,
"grad_norm": 0.19050586223602295,
"learning_rate": 3.4405204460966547e-06,
"loss": 0.0016,
"step": 4030
},
{
"epoch": 2.061240112273539,
"grad_norm": 0.73752361536026,
"learning_rate": 3.421933085501859e-06,
"loss": 0.0015,
"step": 4040
},
{
"epoch": 2.066343454963001,
"grad_norm": 0.04638410732150078,
"learning_rate": 3.4033457249070634e-06,
"loss": 0.0008,
"step": 4050
},
{
"epoch": 2.071446797652462,
"grad_norm": 0.1353781372308731,
"learning_rate": 3.384758364312268e-06,
"loss": 0.0015,
"step": 4060
},
{
"epoch": 2.076550140341924,
"grad_norm": 0.2437770962715149,
"learning_rate": 3.366171003717472e-06,
"loss": 0.0027,
"step": 4070
},
{
"epoch": 2.0816534830313858,
"grad_norm": 0.1823304444551468,
"learning_rate": 3.3475836431226766e-06,
"loss": 0.0008,
"step": 4080
},
{
"epoch": 2.086756825720847,
"grad_norm": 0.14914552867412567,
"learning_rate": 3.3289962825278816e-06,
"loss": 0.0022,
"step": 4090
},
{
"epoch": 2.091860168410309,
"grad_norm": 0.09792085736989975,
"learning_rate": 3.3104089219330857e-06,
"loss": 0.0008,
"step": 4100
},
{
"epoch": 2.09696351109977,
"grad_norm": 0.1684180498123169,
"learning_rate": 3.2918215613382903e-06,
"loss": 0.002,
"step": 4110
},
{
"epoch": 2.102066853789232,
"grad_norm": 0.11210177093744278,
"learning_rate": 3.273234200743495e-06,
"loss": 0.0015,
"step": 4120
},
{
"epoch": 2.1071701964786937,
"grad_norm": 0.08923725038766861,
"learning_rate": 3.254646840148699e-06,
"loss": 0.0017,
"step": 4130
},
{
"epoch": 2.112273539168155,
"grad_norm": 0.1244659274816513,
"learning_rate": 3.2360594795539035e-06,
"loss": 0.0014,
"step": 4140
},
{
"epoch": 2.117376881857617,
"grad_norm": 0.1290469914674759,
"learning_rate": 3.217472118959108e-06,
"loss": 0.001,
"step": 4150
},
{
"epoch": 2.122480224547078,
"grad_norm": 0.07927042990922928,
"learning_rate": 3.1988847583643122e-06,
"loss": 0.0011,
"step": 4160
},
{
"epoch": 2.12758356723654,
"grad_norm": 0.05773673579096794,
"learning_rate": 3.1802973977695168e-06,
"loss": 0.002,
"step": 4170
},
{
"epoch": 2.1326869099260017,
"grad_norm": 0.0966513454914093,
"learning_rate": 3.1617100371747218e-06,
"loss": 0.0017,
"step": 4180
},
{
"epoch": 2.137790252615463,
"grad_norm": 0.4139426052570343,
"learning_rate": 3.143122676579926e-06,
"loss": 0.001,
"step": 4190
},
{
"epoch": 2.142893595304925,
"grad_norm": 0.13697542250156403,
"learning_rate": 3.1245353159851304e-06,
"loss": 0.0015,
"step": 4200
},
{
"epoch": 2.147996937994386,
"grad_norm": 0.25114238262176514,
"learning_rate": 3.105947955390335e-06,
"loss": 0.0008,
"step": 4210
},
{
"epoch": 2.153100280683848,
"grad_norm": 2.0331125259399414,
"learning_rate": 3.087360594795539e-06,
"loss": 0.0008,
"step": 4220
},
{
"epoch": 2.1582036233733097,
"grad_norm": 0.24415123462677002,
"learning_rate": 3.0687732342007437e-06,
"loss": 0.0007,
"step": 4230
},
{
"epoch": 2.163306966062771,
"grad_norm": 0.12605494260787964,
"learning_rate": 3.0501858736059482e-06,
"loss": 0.001,
"step": 4240
},
{
"epoch": 2.168410308752233,
"grad_norm": 0.1145569309592247,
"learning_rate": 3.0315985130111524e-06,
"loss": 0.0015,
"step": 4250
},
{
"epoch": 2.173513651441694,
"grad_norm": 0.10327371209859848,
"learning_rate": 3.013011152416357e-06,
"loss": 0.0019,
"step": 4260
},
{
"epoch": 2.178616994131156,
"grad_norm": 0.28306740522384644,
"learning_rate": 2.994423791821562e-06,
"loss": 0.0011,
"step": 4270
},
{
"epoch": 2.1837203368206177,
"grad_norm": 0.07228295505046844,
"learning_rate": 2.975836431226766e-06,
"loss": 0.0011,
"step": 4280
},
{
"epoch": 2.188823679510079,
"grad_norm": 0.34507399797439575,
"learning_rate": 2.9572490706319706e-06,
"loss": 0.0025,
"step": 4290
},
{
"epoch": 2.193927022199541,
"grad_norm": 0.18134921789169312,
"learning_rate": 2.938661710037175e-06,
"loss": 0.0008,
"step": 4300
},
{
"epoch": 2.199030364889002,
"grad_norm": 0.45632845163345337,
"learning_rate": 2.9200743494423793e-06,
"loss": 0.0013,
"step": 4310
},
{
"epoch": 2.204133707578464,
"grad_norm": 0.3898405134677887,
"learning_rate": 2.901486988847584e-06,
"loss": 0.0026,
"step": 4320
},
{
"epoch": 2.2092370502679257,
"grad_norm": 0.2680865228176117,
"learning_rate": 2.8828996282527884e-06,
"loss": 0.003,
"step": 4330
},
{
"epoch": 2.214340392957387,
"grad_norm": 0.15372110903263092,
"learning_rate": 2.8643122676579925e-06,
"loss": 0.0007,
"step": 4340
},
{
"epoch": 2.219443735646849,
"grad_norm": 0.05915176123380661,
"learning_rate": 2.845724907063197e-06,
"loss": 0.0009,
"step": 4350
},
{
"epoch": 2.22454707833631,
"grad_norm": 0.07108801603317261,
"learning_rate": 2.827137546468402e-06,
"loss": 0.0018,
"step": 4360
},
{
"epoch": 2.229650421025772,
"grad_norm": 0.10522626340389252,
"learning_rate": 2.808550185873606e-06,
"loss": 0.0009,
"step": 4370
},
{
"epoch": 2.2347537637152337,
"grad_norm": 0.49109524488449097,
"learning_rate": 2.7899628252788107e-06,
"loss": 0.0023,
"step": 4380
},
{
"epoch": 2.239857106404695,
"grad_norm": 0.09302613884210587,
"learning_rate": 2.7713754646840153e-06,
"loss": 0.0012,
"step": 4390
},
{
"epoch": 2.244960449094157,
"grad_norm": 0.09560300409793854,
"learning_rate": 2.7527881040892194e-06,
"loss": 0.0015,
"step": 4400
},
{
"epoch": 2.250063791783618,
"grad_norm": 0.04867486655712128,
"learning_rate": 2.734200743494424e-06,
"loss": 0.0008,
"step": 4410
},
{
"epoch": 2.25516713447308,
"grad_norm": 0.40990233421325684,
"learning_rate": 2.7156133828996285e-06,
"loss": 0.0015,
"step": 4420
},
{
"epoch": 2.2602704771625417,
"grad_norm": 0.17485907673835754,
"learning_rate": 2.6970260223048327e-06,
"loss": 0.0017,
"step": 4430
},
{
"epoch": 2.265373819852003,
"grad_norm": 0.55886310338974,
"learning_rate": 2.6784386617100372e-06,
"loss": 0.0012,
"step": 4440
},
{
"epoch": 2.270477162541465,
"grad_norm": 0.1222391277551651,
"learning_rate": 2.659851301115242e-06,
"loss": 0.0007,
"step": 4450
},
{
"epoch": 2.275580505230926,
"grad_norm": 0.08456246554851532,
"learning_rate": 2.6412639405204463e-06,
"loss": 0.0011,
"step": 4460
},
{
"epoch": 2.280683847920388,
"grad_norm": 0.10595785826444626,
"learning_rate": 2.622676579925651e-06,
"loss": 0.0008,
"step": 4470
},
{
"epoch": 2.2857871906098497,
"grad_norm": 0.20427174866199493,
"learning_rate": 2.6040892193308555e-06,
"loss": 0.0017,
"step": 4480
},
{
"epoch": 2.290890533299311,
"grad_norm": 0.12449493259191513,
"learning_rate": 2.5855018587360596e-06,
"loss": 0.0007,
"step": 4490
},
{
"epoch": 2.295993875988773,
"grad_norm": 0.14265963435173035,
"learning_rate": 2.566914498141264e-06,
"loss": 0.0017,
"step": 4500
},
{
"epoch": 2.301097218678234,
"grad_norm": 0.11204581707715988,
"learning_rate": 2.5483271375464687e-06,
"loss": 0.0009,
"step": 4510
},
{
"epoch": 2.306200561367696,
"grad_norm": 0.05533703789114952,
"learning_rate": 2.529739776951673e-06,
"loss": 0.0009,
"step": 4520
},
{
"epoch": 2.3113039040571577,
"grad_norm": 0.18573331832885742,
"learning_rate": 2.5111524163568774e-06,
"loss": 0.0009,
"step": 4530
},
{
"epoch": 2.316407246746619,
"grad_norm": 0.26909375190734863,
"learning_rate": 2.492565055762082e-06,
"loss": 0.0006,
"step": 4540
},
{
"epoch": 2.321510589436081,
"grad_norm": 0.1236054077744484,
"learning_rate": 2.4739776951672865e-06,
"loss": 0.0006,
"step": 4550
},
{
"epoch": 2.326613932125542,
"grad_norm": 0.12512840330600739,
"learning_rate": 2.4553903345724906e-06,
"loss": 0.0009,
"step": 4560
},
{
"epoch": 2.331717274815004,
"grad_norm": 0.06971368938684464,
"learning_rate": 2.4368029739776956e-06,
"loss": 0.0013,
"step": 4570
},
{
"epoch": 2.3368206175044652,
"grad_norm": 0.06864210218191147,
"learning_rate": 2.4182156133828997e-06,
"loss": 0.0007,
"step": 4580
},
{
"epoch": 2.341923960193927,
"grad_norm": 0.07469363510608673,
"learning_rate": 2.3996282527881043e-06,
"loss": 0.0008,
"step": 4590
},
{
"epoch": 2.347027302883389,
"grad_norm": 0.06451886147260666,
"learning_rate": 2.381040892193309e-06,
"loss": 0.0007,
"step": 4600
},
{
"epoch": 2.35213064557285,
"grad_norm": 2.1539828777313232,
"learning_rate": 2.362453531598513e-06,
"loss": 0.0021,
"step": 4610
},
{
"epoch": 2.357233988262312,
"grad_norm": 0.22783559560775757,
"learning_rate": 2.343866171003718e-06,
"loss": 0.0011,
"step": 4620
},
{
"epoch": 2.3623373309517732,
"grad_norm": 0.2584841549396515,
"learning_rate": 2.325278810408922e-06,
"loss": 0.0011,
"step": 4630
},
{
"epoch": 2.367440673641235,
"grad_norm": 0.03816497325897217,
"learning_rate": 2.3066914498141266e-06,
"loss": 0.001,
"step": 4640
},
{
"epoch": 2.372544016330697,
"grad_norm": 0.15890613198280334,
"learning_rate": 2.2881040892193308e-06,
"loss": 0.0007,
"step": 4650
},
{
"epoch": 2.377647359020158,
"grad_norm": 0.09676986187696457,
"learning_rate": 2.2695167286245353e-06,
"loss": 0.0011,
"step": 4660
},
{
"epoch": 2.38275070170962,
"grad_norm": 0.1698722392320633,
"learning_rate": 2.25092936802974e-06,
"loss": 0.0008,
"step": 4670
},
{
"epoch": 2.387854044399081,
"grad_norm": 0.11855011433362961,
"learning_rate": 2.2323420074349444e-06,
"loss": 0.0008,
"step": 4680
},
{
"epoch": 2.392957387088543,
"grad_norm": 0.06932226568460464,
"learning_rate": 2.213754646840149e-06,
"loss": 0.0006,
"step": 4690
},
{
"epoch": 2.3980607297780048,
"grad_norm": 0.11563575267791748,
"learning_rate": 2.195167286245353e-06,
"loss": 0.0007,
"step": 4700
},
{
"epoch": 2.403164072467466,
"grad_norm": 0.15386591851711273,
"learning_rate": 2.176579925650558e-06,
"loss": 0.0012,
"step": 4710
},
{
"epoch": 2.408267415156928,
"grad_norm": 1.269977331161499,
"learning_rate": 2.1579925650557622e-06,
"loss": 0.0009,
"step": 4720
},
{
"epoch": 2.413370757846389,
"grad_norm": 0.16647151112556458,
"learning_rate": 2.139405204460967e-06,
"loss": 0.0019,
"step": 4730
},
{
"epoch": 2.418474100535851,
"grad_norm": 0.02462880127131939,
"learning_rate": 2.120817843866171e-06,
"loss": 0.0008,
"step": 4740
},
{
"epoch": 2.4235774432253128,
"grad_norm": 0.1744278371334076,
"learning_rate": 2.1022304832713755e-06,
"loss": 0.0017,
"step": 4750
},
{
"epoch": 2.428680785914774,
"grad_norm": 0.10652749240398407,
"learning_rate": 2.08364312267658e-06,
"loss": 0.0008,
"step": 4760
},
{
"epoch": 2.433784128604236,
"grad_norm": 0.07770746201276779,
"learning_rate": 2.0650557620817846e-06,
"loss": 0.0019,
"step": 4770
},
{
"epoch": 2.438887471293697,
"grad_norm": 0.18003053963184357,
"learning_rate": 2.046468401486989e-06,
"loss": 0.0019,
"step": 4780
},
{
"epoch": 2.443990813983159,
"grad_norm": 1.0288631916046143,
"learning_rate": 2.0278810408921933e-06,
"loss": 0.001,
"step": 4790
},
{
"epoch": 2.4490941566726208,
"grad_norm": 0.15346410870552063,
"learning_rate": 2.009293680297398e-06,
"loss": 0.0019,
"step": 4800
},
{
"epoch": 2.454197499362082,
"grad_norm": 0.036040738224983215,
"learning_rate": 1.9907063197026024e-06,
"loss": 0.0006,
"step": 4810
},
{
"epoch": 2.459300842051544,
"grad_norm": 0.23148085176944733,
"learning_rate": 1.972118959107807e-06,
"loss": 0.0021,
"step": 4820
},
{
"epoch": 2.464404184741005,
"grad_norm": 0.2947562038898468,
"learning_rate": 1.953531598513011e-06,
"loss": 0.0009,
"step": 4830
},
{
"epoch": 2.469507527430467,
"grad_norm": 0.0932520180940628,
"learning_rate": 1.9349442379182156e-06,
"loss": 0.0007,
"step": 4840
},
{
"epoch": 2.4746108701199283,
"grad_norm": 0.17898057401180267,
"learning_rate": 1.91635687732342e-06,
"loss": 0.0008,
"step": 4850
},
{
"epoch": 2.47971421280939,
"grad_norm": 0.3708474934101105,
"learning_rate": 1.8977695167286248e-06,
"loss": 0.0007,
"step": 4860
},
{
"epoch": 2.484817555498852,
"grad_norm": 1.4122263193130493,
"learning_rate": 1.879182156133829e-06,
"loss": 0.002,
"step": 4870
},
{
"epoch": 2.489920898188313,
"grad_norm": 0.08020167052745819,
"learning_rate": 1.8605947955390337e-06,
"loss": 0.0009,
"step": 4880
},
{
"epoch": 2.495024240877775,
"grad_norm": 0.058009982109069824,
"learning_rate": 1.842007434944238e-06,
"loss": 0.0009,
"step": 4890
},
{
"epoch": 2.5001275835672363,
"grad_norm": 0.13584981858730316,
"learning_rate": 1.8234200743494426e-06,
"loss": 0.0007,
"step": 4900
},
{
"epoch": 2.505230926256698,
"grad_norm": 0.12781758606433868,
"learning_rate": 1.8048327137546471e-06,
"loss": 0.0012,
"step": 4910
},
{
"epoch": 2.51033426894616,
"grad_norm": 0.07662224024534225,
"learning_rate": 1.7862453531598515e-06,
"loss": 0.0014,
"step": 4920
},
{
"epoch": 2.515437611635621,
"grad_norm": 0.3236522674560547,
"learning_rate": 1.7676579925650558e-06,
"loss": 0.0016,
"step": 4930
},
{
"epoch": 2.520540954325083,
"grad_norm": 0.37531137466430664,
"learning_rate": 1.7490706319702606e-06,
"loss": 0.0009,
"step": 4940
},
{
"epoch": 2.5256442970145443,
"grad_norm": 0.0633283481001854,
"learning_rate": 1.730483271375465e-06,
"loss": 0.0017,
"step": 4950
},
{
"epoch": 2.530747639704006,
"grad_norm": 0.21088258922100067,
"learning_rate": 1.7118959107806692e-06,
"loss": 0.0016,
"step": 4960
},
{
"epoch": 2.535850982393468,
"grad_norm": 0.21896515786647797,
"learning_rate": 1.6933085501858738e-06,
"loss": 0.0015,
"step": 4970
},
{
"epoch": 2.540954325082929,
"grad_norm": 0.16385294497013092,
"learning_rate": 1.6747211895910781e-06,
"loss": 0.0009,
"step": 4980
},
{
"epoch": 2.546057667772391,
"grad_norm": 0.42555078864097595,
"learning_rate": 1.6561338289962827e-06,
"loss": 0.0015,
"step": 4990
},
{
"epoch": 2.5511610104618523,
"grad_norm": 0.09824415296316147,
"learning_rate": 1.6375464684014873e-06,
"loss": 0.0013,
"step": 5000
},
{
"epoch": 2.556264353151314,
"grad_norm": 0.1624874621629715,
"learning_rate": 1.6189591078066916e-06,
"loss": 0.0013,
"step": 5010
},
{
"epoch": 2.561367695840776,
"grad_norm": 0.09969186037778854,
"learning_rate": 1.600371747211896e-06,
"loss": 0.0008,
"step": 5020
},
{
"epoch": 2.566471038530237,
"grad_norm": 0.2469654530286789,
"learning_rate": 1.5817843866171003e-06,
"loss": 0.0008,
"step": 5030
},
{
"epoch": 2.571574381219699,
"grad_norm": 0.19352488219738007,
"learning_rate": 1.563197026022305e-06,
"loss": 0.0009,
"step": 5040
},
{
"epoch": 2.5766777239091603,
"grad_norm": 0.10725276172161102,
"learning_rate": 1.5446096654275094e-06,
"loss": 0.0016,
"step": 5050
},
{
"epoch": 2.581781066598622,
"grad_norm": 0.08375217020511627,
"learning_rate": 1.526022304832714e-06,
"loss": 0.0013,
"step": 5060
},
{
"epoch": 2.586884409288084,
"grad_norm": 0.04441326484084129,
"learning_rate": 1.5074349442379183e-06,
"loss": 0.0018,
"step": 5070
},
{
"epoch": 2.591987751977545,
"grad_norm": 0.13323254883289337,
"learning_rate": 1.4888475836431229e-06,
"loss": 0.0007,
"step": 5080
},
{
"epoch": 2.597091094667007,
"grad_norm": 0.0811479389667511,
"learning_rate": 1.4702602230483274e-06,
"loss": 0.0012,
"step": 5090
},
{
"epoch": 2.6021944373564683,
"grad_norm": 0.12009306252002716,
"learning_rate": 1.4516728624535318e-06,
"loss": 0.0007,
"step": 5100
},
{
"epoch": 2.60729778004593,
"grad_norm": 0.14158442616462708,
"learning_rate": 1.433085501858736e-06,
"loss": 0.0008,
"step": 5110
},
{
"epoch": 2.612401122735392,
"grad_norm": 0.08860210329294205,
"learning_rate": 1.4144981412639404e-06,
"loss": 0.0009,
"step": 5120
},
{
"epoch": 2.617504465424853,
"grad_norm": 0.1343207210302353,
"learning_rate": 1.3959107806691452e-06,
"loss": 0.0008,
"step": 5130
},
{
"epoch": 2.622607808114315,
"grad_norm": 0.2582390308380127,
"learning_rate": 1.3773234200743496e-06,
"loss": 0.0009,
"step": 5140
},
{
"epoch": 2.6277111508037763,
"grad_norm": 0.46794310212135315,
"learning_rate": 1.3587360594795541e-06,
"loss": 0.0026,
"step": 5150
},
{
"epoch": 2.632814493493238,
"grad_norm": 0.33035382628440857,
"learning_rate": 1.3401486988847585e-06,
"loss": 0.0028,
"step": 5160
},
{
"epoch": 2.6379178361827,
"grad_norm": 0.08842908591032028,
"learning_rate": 1.3215613382899628e-06,
"loss": 0.0007,
"step": 5170
},
{
"epoch": 2.643021178872161,
"grad_norm": 0.05442144721746445,
"learning_rate": 1.3029739776951676e-06,
"loss": 0.0007,
"step": 5180
},
{
"epoch": 2.648124521561623,
"grad_norm": 0.11757698655128479,
"learning_rate": 1.284386617100372e-06,
"loss": 0.001,
"step": 5190
},
{
"epoch": 2.6532278642510843,
"grad_norm": 0.02783489041030407,
"learning_rate": 1.2657992565055763e-06,
"loss": 0.0007,
"step": 5200
},
{
"epoch": 2.658331206940546,
"grad_norm": 0.5406972169876099,
"learning_rate": 1.2472118959107808e-06,
"loss": 0.0006,
"step": 5210
},
{
"epoch": 2.663434549630008,
"grad_norm": 0.26091036200523376,
"learning_rate": 1.2286245353159852e-06,
"loss": 0.0021,
"step": 5220
},
{
"epoch": 2.668537892319469,
"grad_norm": 0.06804846972227097,
"learning_rate": 1.2100371747211897e-06,
"loss": 0.0009,
"step": 5230
},
{
"epoch": 2.673641235008931,
"grad_norm": 0.26903030276298523,
"learning_rate": 1.191449814126394e-06,
"loss": 0.0006,
"step": 5240
},
{
"epoch": 2.6787445776983922,
"grad_norm": 0.11368360370397568,
"learning_rate": 1.1728624535315986e-06,
"loss": 0.0022,
"step": 5250
},
{
"epoch": 2.683847920387854,
"grad_norm": 0.25202804803848267,
"learning_rate": 1.1542750929368032e-06,
"loss": 0.0016,
"step": 5260
},
{
"epoch": 2.688951263077316,
"grad_norm": 0.1004834696650505,
"learning_rate": 1.1356877323420075e-06,
"loss": 0.0011,
"step": 5270
},
{
"epoch": 2.694054605766777,
"grad_norm": 0.505276083946228,
"learning_rate": 1.117100371747212e-06,
"loss": 0.0006,
"step": 5280
},
{
"epoch": 2.699157948456239,
"grad_norm": 0.20426543056964874,
"learning_rate": 1.0985130111524164e-06,
"loss": 0.0007,
"step": 5290
},
{
"epoch": 2.7042612911457002,
"grad_norm": 0.25349369645118713,
"learning_rate": 1.079925650557621e-06,
"loss": 0.0009,
"step": 5300
},
{
"epoch": 2.709364633835162,
"grad_norm": 0.24185939133167267,
"learning_rate": 1.0613382899628253e-06,
"loss": 0.0008,
"step": 5310
},
{
"epoch": 2.714467976524624,
"grad_norm": 0.09236308932304382,
"learning_rate": 1.0427509293680299e-06,
"loss": 0.0018,
"step": 5320
},
{
"epoch": 2.719571319214085,
"grad_norm": 1.3882845640182495,
"learning_rate": 1.0241635687732342e-06,
"loss": 0.0016,
"step": 5330
},
{
"epoch": 2.724674661903547,
"grad_norm": 0.09514420479536057,
"learning_rate": 1.0055762081784388e-06,
"loss": 0.001,
"step": 5340
},
{
"epoch": 2.7297780045930082,
"grad_norm": 0.053467828780412674,
"learning_rate": 9.869888475836433e-07,
"loss": 0.0016,
"step": 5350
},
{
"epoch": 2.73488134728247,
"grad_norm": 0.05835232511162758,
"learning_rate": 9.684014869888477e-07,
"loss": 0.0008,
"step": 5360
},
{
"epoch": 2.739984689971932,
"grad_norm": 0.268450528383255,
"learning_rate": 9.498141263940522e-07,
"loss": 0.0011,
"step": 5370
},
{
"epoch": 2.745088032661393,
"grad_norm": 0.26086875796318054,
"learning_rate": 9.312267657992566e-07,
"loss": 0.0009,
"step": 5380
},
{
"epoch": 2.750191375350855,
"grad_norm": 0.33822524547576904,
"learning_rate": 9.12639405204461e-07,
"loss": 0.0024,
"step": 5390
},
{
"epoch": 2.7552947180403162,
"grad_norm": 0.18432816863059998,
"learning_rate": 8.940520446096656e-07,
"loss": 0.0014,
"step": 5400
},
{
"epoch": 2.760398060729778,
"grad_norm": 0.1385423243045807,
"learning_rate": 8.754646840148699e-07,
"loss": 0.0022,
"step": 5410
},
{
"epoch": 2.76550140341924,
"grad_norm": 0.2909170091152191,
"learning_rate": 8.568773234200745e-07,
"loss": 0.0017,
"step": 5420
},
{
"epoch": 2.770604746108701,
"grad_norm": 0.14743858575820923,
"learning_rate": 8.382899628252789e-07,
"loss": 0.0005,
"step": 5430
},
{
"epoch": 2.775708088798163,
"grad_norm": 0.0644611343741417,
"learning_rate": 8.197026022304834e-07,
"loss": 0.0007,
"step": 5440
},
{
"epoch": 2.780811431487624,
"grad_norm": 0.07210192829370499,
"learning_rate": 8.011152416356878e-07,
"loss": 0.0006,
"step": 5450
},
{
"epoch": 2.785914774177086,
"grad_norm": 0.09240356832742691,
"learning_rate": 7.825278810408922e-07,
"loss": 0.0008,
"step": 5460
},
{
"epoch": 2.7910181168665478,
"grad_norm": 0.060933616012334824,
"learning_rate": 7.639405204460967e-07,
"loss": 0.0008,
"step": 5470
},
{
"epoch": 2.796121459556009,
"grad_norm": 0.21750350296497345,
"learning_rate": 7.453531598513012e-07,
"loss": 0.0006,
"step": 5480
},
{
"epoch": 2.801224802245471,
"grad_norm": 0.5640401840209961,
"learning_rate": 7.267657992565057e-07,
"loss": 0.0011,
"step": 5490
},
{
"epoch": 2.806328144934932,
"grad_norm": 0.11260447651147842,
"learning_rate": 7.081784386617101e-07,
"loss": 0.002,
"step": 5500
},
{
"epoch": 2.811431487624394,
"grad_norm": 0.2466878592967987,
"learning_rate": 6.895910780669146e-07,
"loss": 0.0014,
"step": 5510
},
{
"epoch": 2.8165348303138558,
"grad_norm": 0.28468677401542664,
"learning_rate": 6.710037174721191e-07,
"loss": 0.0012,
"step": 5520
},
{
"epoch": 2.821638173003317,
"grad_norm": 0.31752482056617737,
"learning_rate": 6.524163568773234e-07,
"loss": 0.0008,
"step": 5530
},
{
"epoch": 2.826741515692779,
"grad_norm": 0.06439520418643951,
"learning_rate": 6.33828996282528e-07,
"loss": 0.0008,
"step": 5540
},
{
"epoch": 2.83184485838224,
"grad_norm": 1.2561355829238892,
"learning_rate": 6.152416356877324e-07,
"loss": 0.0015,
"step": 5550
},
{
"epoch": 2.836948201071702,
"grad_norm": 0.03623468056321144,
"learning_rate": 5.966542750929369e-07,
"loss": 0.0011,
"step": 5560
},
{
"epoch": 2.8420515437611638,
"grad_norm": 0.0849744901061058,
"learning_rate": 5.780669144981413e-07,
"loss": 0.0009,
"step": 5570
},
{
"epoch": 2.847154886450625,
"grad_norm": 0.06939615309238434,
"learning_rate": 5.594795539033458e-07,
"loss": 0.0007,
"step": 5580
},
{
"epoch": 2.852258229140087,
"grad_norm": 0.1424991339445114,
"learning_rate": 5.408921933085502e-07,
"loss": 0.0005,
"step": 5590
},
{
"epoch": 2.857361571829548,
"grad_norm": 0.12026016414165497,
"learning_rate": 5.223048327137547e-07,
"loss": 0.0007,
"step": 5600
},
{
"epoch": 2.86246491451901,
"grad_norm": 0.06239992007613182,
"learning_rate": 5.037174721189591e-07,
"loss": 0.0006,
"step": 5610
},
{
"epoch": 2.8675682572084717,
"grad_norm": 0.3679276704788208,
"learning_rate": 4.851301115241637e-07,
"loss": 0.0018,
"step": 5620
},
{
"epoch": 2.872671599897933,
"grad_norm": 0.35109734535217285,
"learning_rate": 4.665427509293681e-07,
"loss": 0.0006,
"step": 5630
},
{
"epoch": 2.877774942587395,
"grad_norm": 0.05598178505897522,
"learning_rate": 4.479553903345725e-07,
"loss": 0.0018,
"step": 5640
},
{
"epoch": 2.882878285276856,
"grad_norm": 0.2161133885383606,
"learning_rate": 4.2936802973977696e-07,
"loss": 0.0029,
"step": 5650
},
{
"epoch": 2.887981627966318,
"grad_norm": 0.09623316675424576,
"learning_rate": 4.1078066914498146e-07,
"loss": 0.0013,
"step": 5660
},
{
"epoch": 2.8930849706557797,
"grad_norm": 0.12743206322193146,
"learning_rate": 3.921933085501859e-07,
"loss": 0.0008,
"step": 5670
},
{
"epoch": 2.898188313345241,
"grad_norm": 0.11315581947565079,
"learning_rate": 3.7360594795539036e-07,
"loss": 0.0008,
"step": 5680
},
{
"epoch": 2.903291656034703,
"grad_norm": 0.3154519498348236,
"learning_rate": 3.5501858736059486e-07,
"loss": 0.0011,
"step": 5690
},
{
"epoch": 2.908394998724164,
"grad_norm": 0.21190138161182404,
"learning_rate": 3.364312267657993e-07,
"loss": 0.0008,
"step": 5700
},
{
"epoch": 2.913498341413626,
"grad_norm": 0.1613079160451889,
"learning_rate": 3.178438661710037e-07,
"loss": 0.0017,
"step": 5710
},
{
"epoch": 2.9186016841030877,
"grad_norm": 0.07112964987754822,
"learning_rate": 2.992565055762082e-07,
"loss": 0.0007,
"step": 5720
},
{
"epoch": 2.923705026792549,
"grad_norm": 0.07665146887302399,
"learning_rate": 2.8066914498141266e-07,
"loss": 0.001,
"step": 5730
},
{
"epoch": 2.928808369482011,
"grad_norm": 2.0098037719726562,
"learning_rate": 2.620817843866171e-07,
"loss": 0.0014,
"step": 5740
},
{
"epoch": 2.933911712171472,
"grad_norm": 0.26732105016708374,
"learning_rate": 2.4349442379182156e-07,
"loss": 0.0008,
"step": 5750
},
{
"epoch": 2.939015054860934,
"grad_norm": 0.1273537576198578,
"learning_rate": 2.2490706319702606e-07,
"loss": 0.0015,
"step": 5760
},
{
"epoch": 2.9441183975503957,
"grad_norm": 0.3333347737789154,
"learning_rate": 2.063197026022305e-07,
"loss": 0.0012,
"step": 5770
},
{
"epoch": 2.949221740239857,
"grad_norm": 0.18425709009170532,
"learning_rate": 1.8773234200743496e-07,
"loss": 0.0019,
"step": 5780
},
{
"epoch": 2.954325082929319,
"grad_norm": 0.1758401244878769,
"learning_rate": 1.6914498141263944e-07,
"loss": 0.003,
"step": 5790
},
{
"epoch": 2.95942842561878,
"grad_norm": 0.13455261290073395,
"learning_rate": 1.505576208178439e-07,
"loss": 0.0008,
"step": 5800
},
{
"epoch": 2.964531768308242,
"grad_norm": 0.07335013896226883,
"learning_rate": 1.3197026022304834e-07,
"loss": 0.001,
"step": 5810
},
{
"epoch": 2.9696351109977037,
"grad_norm": 0.07254175841808319,
"learning_rate": 1.133828996282528e-07,
"loss": 0.0015,
"step": 5820
},
{
"epoch": 2.974738453687165,
"grad_norm": 0.09682650864124298,
"learning_rate": 9.479553903345725e-08,
"loss": 0.0012,
"step": 5830
},
{
"epoch": 2.979841796376627,
"grad_norm": 0.38653382658958435,
"learning_rate": 7.620817843866171e-08,
"loss": 0.0006,
"step": 5840
},
{
"epoch": 2.984945139066088,
"grad_norm": 1.195354700088501,
"learning_rate": 5.762081784386618e-08,
"loss": 0.0008,
"step": 5850
},
{
"epoch": 2.99004848175555,
"grad_norm": 0.05911433324217796,
"learning_rate": 3.9033457249070633e-08,
"loss": 0.0005,
"step": 5860
},
{
"epoch": 2.9951518244450117,
"grad_norm": 0.0798773467540741,
"learning_rate": 2.0446096654275096e-08,
"loss": 0.0017,
"step": 5870
},
{
"epoch": 3.0,
"grad_norm": 0.04673103615641594,
"learning_rate": 1.858736059479554e-09,
"loss": 0.0018,
"step": 5880
}
],
"logging_steps": 10,
"max_steps": 5880,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.428118228779008e+19,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}