{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 2658,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007524454477050414,
"grad_norm": 63.752017974853516,
"learning_rate": 1e-05,
"loss": 3.3312,
"step": 10
},
{
"epoch": 0.015048908954100828,
"grad_norm": 50.676944732666016,
"learning_rate": 1e-05,
"loss": 1.525,
"step": 20
},
{
"epoch": 0.022573363431151242,
"grad_norm": 38.975914001464844,
"learning_rate": 1e-05,
"loss": 1.2213,
"step": 30
},
{
"epoch": 0.030097817908201655,
"grad_norm": 30.093769073486328,
"learning_rate": 1e-05,
"loss": 0.8578,
"step": 40
},
{
"epoch": 0.03762227238525207,
"grad_norm": 27.47739028930664,
"learning_rate": 1e-05,
"loss": 0.8364,
"step": 50
},
{
"epoch": 0.045146726862302484,
"grad_norm": 26.15501594543457,
"learning_rate": 1e-05,
"loss": 0.8731,
"step": 60
},
{
"epoch": 0.0526711813393529,
"grad_norm": 30.096651077270508,
"learning_rate": 1e-05,
"loss": 0.5887,
"step": 70
},
{
"epoch": 0.06019563581640331,
"grad_norm": 21.479469299316406,
"learning_rate": 1e-05,
"loss": 0.6146,
"step": 80
},
{
"epoch": 0.06772009029345373,
"grad_norm": 25.00172996520996,
"learning_rate": 1e-05,
"loss": 0.7224,
"step": 90
},
{
"epoch": 0.07524454477050414,
"grad_norm": 19.167516708374023,
"learning_rate": 1e-05,
"loss": 0.6965,
"step": 100
},
{
"epoch": 0.08276899924755456,
"grad_norm": 25.692691802978516,
"learning_rate": 1e-05,
"loss": 0.5539,
"step": 110
},
{
"epoch": 0.09029345372460497,
"grad_norm": 19.874868392944336,
"learning_rate": 1e-05,
"loss": 0.5842,
"step": 120
},
{
"epoch": 0.09781790820165538,
"grad_norm": 22.329219818115234,
"learning_rate": 1e-05,
"loss": 0.6003,
"step": 130
},
{
"epoch": 0.1053423626787058,
"grad_norm": 14.971156120300293,
"learning_rate": 1e-05,
"loss": 0.5671,
"step": 140
},
{
"epoch": 0.11286681715575621,
"grad_norm": 14.223251342773438,
"learning_rate": 1e-05,
"loss": 0.5096,
"step": 150
},
{
"epoch": 0.12039127163280662,
"grad_norm": 19.30224609375,
"learning_rate": 1e-05,
"loss": 0.5067,
"step": 160
},
{
"epoch": 0.12791572610985705,
"grad_norm": 14.27910327911377,
"learning_rate": 1e-05,
"loss": 0.5281,
"step": 170
},
{
"epoch": 0.13544018058690746,
"grad_norm": 19.874217987060547,
"learning_rate": 1e-05,
"loss": 0.4569,
"step": 180
},
{
"epoch": 0.14296463506395787,
"grad_norm": 16.16669273376465,
"learning_rate": 1e-05,
"loss": 0.5304,
"step": 190
},
{
"epoch": 0.1504890895410083,
"grad_norm": 13.952829360961914,
"learning_rate": 1e-05,
"loss": 0.4415,
"step": 200
},
{
"epoch": 0.1580135440180587,
"grad_norm": 16.3791446685791,
"learning_rate": 1e-05,
"loss": 0.5214,
"step": 210
},
{
"epoch": 0.1655379984951091,
"grad_norm": 11.849374771118164,
"learning_rate": 1e-05,
"loss": 0.4079,
"step": 220
},
{
"epoch": 0.17306245297215953,
"grad_norm": 17.892818450927734,
"learning_rate": 1e-05,
"loss": 0.562,
"step": 230
},
{
"epoch": 0.18058690744920994,
"grad_norm": 11.97033977508545,
"learning_rate": 1e-05,
"loss": 0.3959,
"step": 240
},
{
"epoch": 0.18811136192626035,
"grad_norm": 10.604959487915039,
"learning_rate": 1e-05,
"loss": 0.4519,
"step": 250
},
{
"epoch": 0.19563581640331076,
"grad_norm": 10.842782974243164,
"learning_rate": 1e-05,
"loss": 0.4617,
"step": 260
},
{
"epoch": 0.20316027088036118,
"grad_norm": 10.904434204101562,
"learning_rate": 1e-05,
"loss": 0.4858,
"step": 270
},
{
"epoch": 0.2106847253574116,
"grad_norm": 9.698153495788574,
"learning_rate": 1e-05,
"loss": 0.4126,
"step": 280
},
{
"epoch": 0.218209179834462,
"grad_norm": 12.699883460998535,
"learning_rate": 1e-05,
"loss": 0.4756,
"step": 290
},
{
"epoch": 0.22573363431151242,
"grad_norm": 14.62389850616455,
"learning_rate": 1e-05,
"loss": 0.3576,
"step": 300
},
{
"epoch": 0.23325808878856283,
"grad_norm": 12.436488151550293,
"learning_rate": 1e-05,
"loss": 0.3493,
"step": 310
},
{
"epoch": 0.24078254326561324,
"grad_norm": 17.488454818725586,
"learning_rate": 1e-05,
"loss": 0.4812,
"step": 320
},
{
"epoch": 0.24830699774266365,
"grad_norm": 15.149370193481445,
"learning_rate": 1e-05,
"loss": 0.4218,
"step": 330
},
{
"epoch": 0.2558314522197141,
"grad_norm": 11.76059341430664,
"learning_rate": 1e-05,
"loss": 0.3729,
"step": 340
},
{
"epoch": 0.2633559066967645,
"grad_norm": 15.72620964050293,
"learning_rate": 1e-05,
"loss": 0.2586,
"step": 350
},
{
"epoch": 0.2708803611738149,
"grad_norm": 16.726228713989258,
"learning_rate": 1e-05,
"loss": 0.4295,
"step": 360
},
{
"epoch": 0.27840481565086533,
"grad_norm": 12.156024932861328,
"learning_rate": 1e-05,
"loss": 0.3179,
"step": 370
},
{
"epoch": 0.28592927012791575,
"grad_norm": 12.32470417022705,
"learning_rate": 1e-05,
"loss": 0.3523,
"step": 380
},
{
"epoch": 0.29345372460496616,
"grad_norm": 17.34354591369629,
"learning_rate": 1e-05,
"loss": 0.3724,
"step": 390
},
{
"epoch": 0.3009781790820166,
"grad_norm": 9.95320987701416,
"learning_rate": 1e-05,
"loss": 0.4028,
"step": 400
},
{
"epoch": 0.308502633559067,
"grad_norm": 10.40683650970459,
"learning_rate": 1e-05,
"loss": 0.5469,
"step": 410
},
{
"epoch": 0.3160270880361174,
"grad_norm": 12.613582611083984,
"learning_rate": 1e-05,
"loss": 0.3672,
"step": 420
},
{
"epoch": 0.3235515425131678,
"grad_norm": 13.326891899108887,
"learning_rate": 1e-05,
"loss": 0.3514,
"step": 430
},
{
"epoch": 0.3310759969902182,
"grad_norm": 8.356232643127441,
"learning_rate": 1e-05,
"loss": 0.292,
"step": 440
},
{
"epoch": 0.33860045146726864,
"grad_norm": 14.835829734802246,
"learning_rate": 1e-05,
"loss": 0.3765,
"step": 450
},
{
"epoch": 0.34612490594431905,
"grad_norm": 7.975886821746826,
"learning_rate": 1e-05,
"loss": 0.3667,
"step": 460
},
{
"epoch": 0.35364936042136946,
"grad_norm": 10.029479026794434,
"learning_rate": 1e-05,
"loss": 0.3788,
"step": 470
},
{
"epoch": 0.3611738148984199,
"grad_norm": 11.4894437789917,
"learning_rate": 1e-05,
"loss": 0.309,
"step": 480
},
{
"epoch": 0.3686982693754703,
"grad_norm": 12.190320014953613,
"learning_rate": 1e-05,
"loss": 0.3398,
"step": 490
},
{
"epoch": 0.3762227238525207,
"grad_norm": 12.104024887084961,
"learning_rate": 1e-05,
"loss": 0.3907,
"step": 500
},
{
"epoch": 0.3837471783295711,
"grad_norm": 11.915987014770508,
"learning_rate": 1e-05,
"loss": 0.3278,
"step": 510
},
{
"epoch": 0.3912716328066215,
"grad_norm": 16.552160263061523,
"learning_rate": 1e-05,
"loss": 0.3745,
"step": 520
},
{
"epoch": 0.39879608728367194,
"grad_norm": 11.056331634521484,
"learning_rate": 1e-05,
"loss": 0.4694,
"step": 530
},
{
"epoch": 0.40632054176072235,
"grad_norm": 10.76766586303711,
"learning_rate": 1e-05,
"loss": 0.3631,
"step": 540
},
{
"epoch": 0.41384499623777277,
"grad_norm": 10.77774715423584,
"learning_rate": 1e-05,
"loss": 0.3448,
"step": 550
},
{
"epoch": 0.4213694507148232,
"grad_norm": 11.11598014831543,
"learning_rate": 1e-05,
"loss": 0.339,
"step": 560
},
{
"epoch": 0.4288939051918736,
"grad_norm": 8.696084976196289,
"learning_rate": 1e-05,
"loss": 0.4023,
"step": 570
},
{
"epoch": 0.436418359668924,
"grad_norm": 15.626012802124023,
"learning_rate": 1e-05,
"loss": 0.4638,
"step": 580
},
{
"epoch": 0.4439428141459744,
"grad_norm": 14.812833786010742,
"learning_rate": 1e-05,
"loss": 0.468,
"step": 590
},
{
"epoch": 0.45146726862302483,
"grad_norm": 11.22861385345459,
"learning_rate": 1e-05,
"loss": 0.3772,
"step": 600
},
{
"epoch": 0.45899172310007524,
"grad_norm": 14.62263011932373,
"learning_rate": 1e-05,
"loss": 0.3682,
"step": 610
},
{
"epoch": 0.46651617757712566,
"grad_norm": 10.826017379760742,
"learning_rate": 1e-05,
"loss": 0.3671,
"step": 620
},
{
"epoch": 0.47404063205417607,
"grad_norm": 9.838117599487305,
"learning_rate": 1e-05,
"loss": 0.3459,
"step": 630
},
{
"epoch": 0.4815650865312265,
"grad_norm": 7.919167518615723,
"learning_rate": 1e-05,
"loss": 0.2914,
"step": 640
},
{
"epoch": 0.4890895410082769,
"grad_norm": 4.093368053436279,
"learning_rate": 1e-05,
"loss": 0.3329,
"step": 650
},
{
"epoch": 0.4966139954853273,
"grad_norm": 12.66010856628418,
"learning_rate": 1e-05,
"loss": 0.4115,
"step": 660
},
{
"epoch": 0.5041384499623778,
"grad_norm": 11.424004554748535,
"learning_rate": 1e-05,
"loss": 0.4033,
"step": 670
},
{
"epoch": 0.5116629044394282,
"grad_norm": 9.730168342590332,
"learning_rate": 1e-05,
"loss": 0.3893,
"step": 680
},
{
"epoch": 0.5191873589164786,
"grad_norm": 9.054938316345215,
"learning_rate": 1e-05,
"loss": 0.3922,
"step": 690
},
{
"epoch": 0.526711813393529,
"grad_norm": 10.94675350189209,
"learning_rate": 1e-05,
"loss": 0.3291,
"step": 700
},
{
"epoch": 0.5342362678705794,
"grad_norm": 12.961570739746094,
"learning_rate": 1e-05,
"loss": 0.3225,
"step": 710
},
{
"epoch": 0.5417607223476298,
"grad_norm": 8.719619750976562,
"learning_rate": 1e-05,
"loss": 0.3267,
"step": 720
},
{
"epoch": 0.5492851768246803,
"grad_norm": 10.847646713256836,
"learning_rate": 1e-05,
"loss": 0.4268,
"step": 730
},
{
"epoch": 0.5568096313017307,
"grad_norm": 11.188985824584961,
"learning_rate": 1e-05,
"loss": 0.4236,
"step": 740
},
{
"epoch": 0.5643340857787811,
"grad_norm": 13.59192943572998,
"learning_rate": 1e-05,
"loss": 0.4197,
"step": 750
},
{
"epoch": 0.5718585402558315,
"grad_norm": 10.489006042480469,
"learning_rate": 1e-05,
"loss": 0.3314,
"step": 760
},
{
"epoch": 0.5793829947328819,
"grad_norm": 11.065324783325195,
"learning_rate": 1e-05,
"loss": 0.3667,
"step": 770
},
{
"epoch": 0.5869074492099323,
"grad_norm": 12.28297233581543,
"learning_rate": 1e-05,
"loss": 0.3189,
"step": 780
},
{
"epoch": 0.5944319036869827,
"grad_norm": 9.553642272949219,
"learning_rate": 1e-05,
"loss": 0.3991,
"step": 790
},
{
"epoch": 0.6019563581640331,
"grad_norm": 11.755203247070312,
"learning_rate": 1e-05,
"loss": 0.3528,
"step": 800
},
{
"epoch": 0.6094808126410836,
"grad_norm": 7.8607306480407715,
"learning_rate": 1e-05,
"loss": 0.3292,
"step": 810
},
{
"epoch": 0.617005267118134,
"grad_norm": 10.472386360168457,
"learning_rate": 1e-05,
"loss": 0.3549,
"step": 820
},
{
"epoch": 0.6245297215951844,
"grad_norm": 9.280732154846191,
"learning_rate": 1e-05,
"loss": 0.3183,
"step": 830
},
{
"epoch": 0.6320541760722348,
"grad_norm": 9.160599708557129,
"learning_rate": 1e-05,
"loss": 0.3686,
"step": 840
},
{
"epoch": 0.6395786305492852,
"grad_norm": 10.545658111572266,
"learning_rate": 1e-05,
"loss": 0.3517,
"step": 850
},
{
"epoch": 0.6471030850263356,
"grad_norm": 11.327434539794922,
"learning_rate": 1e-05,
"loss": 0.3465,
"step": 860
},
{
"epoch": 0.654627539503386,
"grad_norm": 12.003908157348633,
"learning_rate": 1e-05,
"loss": 0.2845,
"step": 870
},
{
"epoch": 0.6621519939804364,
"grad_norm": 9.960043907165527,
"learning_rate": 1e-05,
"loss": 0.3255,
"step": 880
},
{
"epoch": 0.6696764484574869,
"grad_norm": 11.36705207824707,
"learning_rate": 1e-05,
"loss": 0.3724,
"step": 890
},
{
"epoch": 0.6772009029345373,
"grad_norm": 9.673847198486328,
"learning_rate": 1e-05,
"loss": 0.3524,
"step": 900
},
{
"epoch": 0.6847253574115877,
"grad_norm": 10.644118309020996,
"learning_rate": 1e-05,
"loss": 0.3988,
"step": 910
},
{
"epoch": 0.6922498118886381,
"grad_norm": 11.484865188598633,
"learning_rate": 1e-05,
"loss": 0.3601,
"step": 920
},
{
"epoch": 0.6997742663656885,
"grad_norm": 7.940932750701904,
"learning_rate": 1e-05,
"loss": 0.2722,
"step": 930
},
{
"epoch": 0.7072987208427389,
"grad_norm": 9.51900863647461,
"learning_rate": 1e-05,
"loss": 0.2814,
"step": 940
},
{
"epoch": 0.7148231753197893,
"grad_norm": 14.423086166381836,
"learning_rate": 1e-05,
"loss": 0.4054,
"step": 950
},
{
"epoch": 0.7223476297968398,
"grad_norm": 12.655383110046387,
"learning_rate": 1e-05,
"loss": 0.3125,
"step": 960
},
{
"epoch": 0.7298720842738902,
"grad_norm": 13.050726890563965,
"learning_rate": 1e-05,
"loss": 0.3368,
"step": 970
},
{
"epoch": 0.7373965387509406,
"grad_norm": 8.44699764251709,
"learning_rate": 1e-05,
"loss": 0.3162,
"step": 980
},
{
"epoch": 0.744920993227991,
"grad_norm": 9.112492561340332,
"learning_rate": 1e-05,
"loss": 0.3428,
"step": 990
},
{
"epoch": 0.7524454477050414,
"grad_norm": 7.576210975646973,
"learning_rate": 1e-05,
"loss": 0.2674,
"step": 1000
},
{
"epoch": 0.7599699021820918,
"grad_norm": 10.723271369934082,
"learning_rate": 1e-05,
"loss": 0.313,
"step": 1010
},
{
"epoch": 0.7674943566591422,
"grad_norm": 11.943977355957031,
"learning_rate": 1e-05,
"loss": 0.321,
"step": 1020
},
{
"epoch": 0.7750188111361926,
"grad_norm": 9.523961067199707,
"learning_rate": 1e-05,
"loss": 0.3475,
"step": 1030
},
{
"epoch": 0.782543265613243,
"grad_norm": 10.895538330078125,
"learning_rate": 1e-05,
"loss": 0.3763,
"step": 1040
},
{
"epoch": 0.7900677200902935,
"grad_norm": 6.124391078948975,
"learning_rate": 1e-05,
"loss": 0.251,
"step": 1050
},
{
"epoch": 0.7975921745673439,
"grad_norm": 9.036330223083496,
"learning_rate": 1e-05,
"loss": 0.3976,
"step": 1060
},
{
"epoch": 0.8051166290443943,
"grad_norm": 9.179152488708496,
"learning_rate": 1e-05,
"loss": 0.3052,
"step": 1070
},
{
"epoch": 0.8126410835214447,
"grad_norm": 6.6380157470703125,
"learning_rate": 1e-05,
"loss": 0.3397,
"step": 1080
},
{
"epoch": 0.8201655379984951,
"grad_norm": 9.615362167358398,
"learning_rate": 1e-05,
"loss": 0.3091,
"step": 1090
},
{
"epoch": 0.8276899924755455,
"grad_norm": 11.812026977539062,
"learning_rate": 1e-05,
"loss": 0.3156,
"step": 1100
},
{
"epoch": 0.835214446952596,
"grad_norm": 12.034916877746582,
"learning_rate": 1e-05,
"loss": 0.2479,
"step": 1110
},
{
"epoch": 0.8427389014296464,
"grad_norm": 13.908053398132324,
"learning_rate": 1e-05,
"loss": 0.256,
"step": 1120
},
{
"epoch": 0.8502633559066968,
"grad_norm": 11.581817626953125,
"learning_rate": 1e-05,
"loss": 0.3175,
"step": 1130
},
{
"epoch": 0.8577878103837472,
"grad_norm": 11.642997741699219,
"learning_rate": 1e-05,
"loss": 0.25,
"step": 1140
},
{
"epoch": 0.8653122648607976,
"grad_norm": 10.813202857971191,
"learning_rate": 1e-05,
"loss": 0.3413,
"step": 1150
},
{
"epoch": 0.872836719337848,
"grad_norm": 12.767478942871094,
"learning_rate": 1e-05,
"loss": 0.3094,
"step": 1160
},
{
"epoch": 0.8803611738148984,
"grad_norm": 9.270513534545898,
"learning_rate": 1e-05,
"loss": 0.2807,
"step": 1170
},
{
"epoch": 0.8878856282919488,
"grad_norm": 7.739561557769775,
"learning_rate": 1e-05,
"loss": 0.2789,
"step": 1180
},
{
"epoch": 0.8954100827689992,
"grad_norm": 7.118448734283447,
"learning_rate": 1e-05,
"loss": 0.2752,
"step": 1190
},
{
"epoch": 0.9029345372460497,
"grad_norm": 11.153009414672852,
"learning_rate": 1e-05,
"loss": 0.32,
"step": 1200
},
{
"epoch": 0.9104589917231001,
"grad_norm": 10.667895317077637,
"learning_rate": 1e-05,
"loss": 0.2285,
"step": 1210
},
{
"epoch": 0.9179834462001505,
"grad_norm": 12.081469535827637,
"learning_rate": 1e-05,
"loss": 0.3278,
"step": 1220
},
{
"epoch": 0.9255079006772009,
"grad_norm": 8.599586486816406,
"learning_rate": 1e-05,
"loss": 0.3049,
"step": 1230
},
{
"epoch": 0.9330323551542513,
"grad_norm": 10.062015533447266,
"learning_rate": 1e-05,
"loss": 0.3583,
"step": 1240
},
{
"epoch": 0.9405568096313017,
"grad_norm": 8.24731159210205,
"learning_rate": 1e-05,
"loss": 0.3162,
"step": 1250
},
{
"epoch": 0.9480812641083521,
"grad_norm": 8.026958465576172,
"learning_rate": 1e-05,
"loss": 0.3082,
"step": 1260
},
{
"epoch": 0.9556057185854026,
"grad_norm": 8.996962547302246,
"learning_rate": 1e-05,
"loss": 0.3969,
"step": 1270
},
{
"epoch": 0.963130173062453,
"grad_norm": 7.346575736999512,
"learning_rate": 1e-05,
"loss": 0.2905,
"step": 1280
},
{
"epoch": 0.9706546275395034,
"grad_norm": 7.704085826873779,
"learning_rate": 1e-05,
"loss": 0.2655,
"step": 1290
},
{
"epoch": 0.9781790820165538,
"grad_norm": 9.016671180725098,
"learning_rate": 1e-05,
"loss": 0.3071,
"step": 1300
},
{
"epoch": 0.9857035364936042,
"grad_norm": 10.028645515441895,
"learning_rate": 1e-05,
"loss": 0.3077,
"step": 1310
},
{
"epoch": 0.9932279909706546,
"grad_norm": 9.28890609741211,
"learning_rate": 1e-05,
"loss": 0.3317,
"step": 1320
},
{
"epoch": 1.000752445447705,
"grad_norm": 9.905320167541504,
"learning_rate": 1e-05,
"loss": 0.31,
"step": 1330
},
{
"epoch": 1.0082768999247556,
"grad_norm": 8.012602806091309,
"learning_rate": 1e-05,
"loss": 0.2343,
"step": 1340
},
{
"epoch": 1.0158013544018059,
"grad_norm": 6.548900127410889,
"learning_rate": 1e-05,
"loss": 0.2686,
"step": 1350
},
{
"epoch": 1.0233258088788564,
"grad_norm": 9.645492553710938,
"learning_rate": 1e-05,
"loss": 0.3046,
"step": 1360
},
{
"epoch": 1.0308502633559067,
"grad_norm": 10.740938186645508,
"learning_rate": 1e-05,
"loss": 0.3873,
"step": 1370
},
{
"epoch": 1.0383747178329572,
"grad_norm": 11.067444801330566,
"learning_rate": 1e-05,
"loss": 0.3229,
"step": 1380
},
{
"epoch": 1.0458991723100075,
"grad_norm": 7.437419891357422,
"learning_rate": 1e-05,
"loss": 0.2907,
"step": 1390
},
{
"epoch": 1.053423626787058,
"grad_norm": 5.831209659576416,
"learning_rate": 1e-05,
"loss": 0.3107,
"step": 1400
},
{
"epoch": 1.0609480812641083,
"grad_norm": 8.783834457397461,
"learning_rate": 1e-05,
"loss": 0.3269,
"step": 1410
},
{
"epoch": 1.0684725357411589,
"grad_norm": 12.042133331298828,
"learning_rate": 1e-05,
"loss": 0.2861,
"step": 1420
},
{
"epoch": 1.0759969902182092,
"grad_norm": 10.743906021118164,
"learning_rate": 1e-05,
"loss": 0.3,
"step": 1430
},
{
"epoch": 1.0835214446952597,
"grad_norm": 10.540002822875977,
"learning_rate": 1e-05,
"loss": 0.2708,
"step": 1440
},
{
"epoch": 1.09104589917231,
"grad_norm": 7.265504837036133,
"learning_rate": 1e-05,
"loss": 0.2723,
"step": 1450
},
{
"epoch": 1.0985703536493605,
"grad_norm": 5.650593280792236,
"learning_rate": 1e-05,
"loss": 0.3101,
"step": 1460
},
{
"epoch": 1.1060948081264108,
"grad_norm": 10.168730735778809,
"learning_rate": 1e-05,
"loss": 0.291,
"step": 1470
},
{
"epoch": 1.1136192626034613,
"grad_norm": 6.533019542694092,
"learning_rate": 1e-05,
"loss": 0.2925,
"step": 1480
},
{
"epoch": 1.1211437170805116,
"grad_norm": 9.97232437133789,
"learning_rate": 1e-05,
"loss": 0.2983,
"step": 1490
},
{
"epoch": 1.1286681715575622,
"grad_norm": 8.263399124145508,
"learning_rate": 1e-05,
"loss": 0.2818,
"step": 1500
},
{
"epoch": 1.1361926260346125,
"grad_norm": 8.396636962890625,
"learning_rate": 1e-05,
"loss": 0.3436,
"step": 1510
},
{
"epoch": 1.143717080511663,
"grad_norm": 13.860685348510742,
"learning_rate": 1e-05,
"loss": 0.4241,
"step": 1520
},
{
"epoch": 1.1512415349887133,
"grad_norm": 8.995695114135742,
"learning_rate": 1e-05,
"loss": 0.2742,
"step": 1530
},
{
"epoch": 1.1587659894657638,
"grad_norm": 12.496316909790039,
"learning_rate": 1e-05,
"loss": 0.3052,
"step": 1540
},
{
"epoch": 1.1662904439428141,
"grad_norm": 7.071567535400391,
"learning_rate": 1e-05,
"loss": 0.3227,
"step": 1550
},
{
"epoch": 1.1738148984198646,
"grad_norm": 9.216208457946777,
"learning_rate": 1e-05,
"loss": 0.2476,
"step": 1560
},
{
"epoch": 1.181339352896915,
"grad_norm": 7.963762283325195,
"learning_rate": 1e-05,
"loss": 0.2883,
"step": 1570
},
{
"epoch": 1.1888638073739655,
"grad_norm": 11.962204933166504,
"learning_rate": 1e-05,
"loss": 0.2792,
"step": 1580
},
{
"epoch": 1.1963882618510158,
"grad_norm": 11.451403617858887,
"learning_rate": 1e-05,
"loss": 0.3362,
"step": 1590
},
{
"epoch": 1.2039127163280663,
"grad_norm": 11.551766395568848,
"learning_rate": 1e-05,
"loss": 0.2431,
"step": 1600
},
{
"epoch": 1.2114371708051166,
"grad_norm": 5.913654327392578,
"learning_rate": 1e-05,
"loss": 0.277,
"step": 1610
},
{
"epoch": 1.2189616252821671,
"grad_norm": 13.468070030212402,
"learning_rate": 1e-05,
"loss": 0.3292,
"step": 1620
},
{
"epoch": 1.2264860797592174,
"grad_norm": 8.323406219482422,
"learning_rate": 1e-05,
"loss": 0.3182,
"step": 1630
},
{
"epoch": 1.234010534236268,
"grad_norm": 9.116568565368652,
"learning_rate": 1e-05,
"loss": 0.2328,
"step": 1640
},
{
"epoch": 1.2415349887133182,
"grad_norm": 8.88713264465332,
"learning_rate": 1e-05,
"loss": 0.2538,
"step": 1650
},
{
"epoch": 1.2490594431903688,
"grad_norm": 10.781469345092773,
"learning_rate": 1e-05,
"loss": 0.2389,
"step": 1660
},
{
"epoch": 1.256583897667419,
"grad_norm": 12.144160270690918,
"learning_rate": 1e-05,
"loss": 0.3006,
"step": 1670
},
{
"epoch": 1.2641083521444696,
"grad_norm": 7.866734027862549,
"learning_rate": 1e-05,
"loss": 0.345,
"step": 1680
},
{
"epoch": 1.27163280662152,
"grad_norm": 7.459820747375488,
"learning_rate": 1e-05,
"loss": 0.2482,
"step": 1690
},
{
"epoch": 1.2791572610985704,
"grad_norm": 7.7605109214782715,
"learning_rate": 1e-05,
"loss": 0.214,
"step": 1700
},
{
"epoch": 1.2866817155756207,
"grad_norm": 9.145365715026855,
"learning_rate": 1e-05,
"loss": 0.3057,
"step": 1710
},
{
"epoch": 1.2942061700526712,
"grad_norm": 10.521879196166992,
"learning_rate": 1e-05,
"loss": 0.3376,
"step": 1720
},
{
"epoch": 1.3017306245297215,
"grad_norm": 5.132536888122559,
"learning_rate": 1e-05,
"loss": 0.2723,
"step": 1730
},
{
"epoch": 1.309255079006772,
"grad_norm": 10.422348022460938,
"learning_rate": 1e-05,
"loss": 0.2769,
"step": 1740
},
{
"epoch": 1.3167795334838224,
"grad_norm": 9.99517822265625,
"learning_rate": 1e-05,
"loss": 0.3455,
"step": 1750
},
{
"epoch": 1.324303987960873,
"grad_norm": 6.903396129608154,
"learning_rate": 1e-05,
"loss": 0.2813,
"step": 1760
},
{
"epoch": 1.3318284424379232,
"grad_norm": 5.721127986907959,
"learning_rate": 1e-05,
"loss": 0.2824,
"step": 1770
},
{
"epoch": 1.3393528969149737,
"grad_norm": 9.914773941040039,
"learning_rate": 1e-05,
"loss": 0.3353,
"step": 1780
},
{
"epoch": 1.346877351392024,
"grad_norm": 7.985681056976318,
"learning_rate": 1e-05,
"loss": 0.3295,
"step": 1790
},
{
"epoch": 1.3544018058690745,
"grad_norm": 10.242146492004395,
"learning_rate": 1e-05,
"loss": 0.3519,
"step": 1800
},
{
"epoch": 1.3619262603461249,
"grad_norm": 8.590888977050781,
"learning_rate": 1e-05,
"loss": 0.3171,
"step": 1810
},
{
"epoch": 1.3694507148231754,
"grad_norm": 9.34271240234375,
"learning_rate": 1e-05,
"loss": 0.3385,
"step": 1820
},
{
"epoch": 1.3769751693002257,
"grad_norm": 8.391048431396484,
"learning_rate": 1e-05,
"loss": 0.2975,
"step": 1830
},
{
"epoch": 1.3844996237772762,
"grad_norm": 11.190972328186035,
"learning_rate": 1e-05,
"loss": 0.3412,
"step": 1840
},
{
"epoch": 1.3920240782543265,
"grad_norm": 8.990412712097168,
"learning_rate": 1e-05,
"loss": 0.2985,
"step": 1850
},
{
"epoch": 1.399548532731377,
"grad_norm": 6.625011920928955,
"learning_rate": 1e-05,
"loss": 0.2485,
"step": 1860
},
{
"epoch": 1.4070729872084273,
"grad_norm": 8.91481876373291,
"learning_rate": 1e-05,
"loss": 0.3527,
"step": 1870
},
{
"epoch": 1.4145974416854779,
"grad_norm": 7.358391761779785,
"learning_rate": 1e-05,
"loss": 0.3243,
"step": 1880
},
{
"epoch": 1.4221218961625282,
"grad_norm": 6.641557216644287,
"learning_rate": 1e-05,
"loss": 0.242,
"step": 1890
},
{
"epoch": 1.4296463506395787,
"grad_norm": 8.88590145111084,
"learning_rate": 1e-05,
"loss": 0.2933,
"step": 1900
},
{
"epoch": 1.437170805116629,
"grad_norm": 9.170287132263184,
"learning_rate": 1e-05,
"loss": 0.2567,
"step": 1910
},
{
"epoch": 1.4446952595936795,
"grad_norm": 9.2968168258667,
"learning_rate": 1e-05,
"loss": 0.2649,
"step": 1920
},
{
"epoch": 1.4522197140707298,
"grad_norm": 8.246125221252441,
"learning_rate": 1e-05,
"loss": 0.2668,
"step": 1930
},
{
"epoch": 1.4597441685477803,
"grad_norm": 10.747627258300781,
"learning_rate": 1e-05,
"loss": 0.2905,
"step": 1940
},
{
"epoch": 1.4672686230248306,
"grad_norm": 7.501006603240967,
"learning_rate": 1e-05,
"loss": 0.3359,
"step": 1950
},
{
"epoch": 1.4747930775018812,
"grad_norm": 9.075075149536133,
"learning_rate": 1e-05,
"loss": 0.3039,
"step": 1960
},
{
"epoch": 1.4823175319789315,
"grad_norm": 9.139381408691406,
"learning_rate": 1e-05,
"loss": 0.3024,
"step": 1970
},
{
"epoch": 1.489841986455982,
"grad_norm": 10.483285903930664,
"learning_rate": 1e-05,
"loss": 0.2997,
"step": 1980
},
{
"epoch": 1.4973664409330323,
"grad_norm": 10.016528129577637,
"learning_rate": 1e-05,
"loss": 0.3105,
"step": 1990
},
{
"epoch": 1.5048908954100828,
"grad_norm": 7.966375827789307,
"learning_rate": 1e-05,
"loss": 0.3121,
"step": 2000
},
{
"epoch": 1.5124153498871333,
"grad_norm": 6.316532135009766,
"learning_rate": 1e-05,
"loss": 0.3148,
"step": 2010
},
{
"epoch": 1.5199398043641836,
"grad_norm": 9.16601276397705,
"learning_rate": 1e-05,
"loss": 0.2695,
"step": 2020
},
{
"epoch": 1.527464258841234,
"grad_norm": 4.997910499572754,
"learning_rate": 1e-05,
"loss": 0.2746,
"step": 2030
},
{
"epoch": 1.5349887133182845,
"grad_norm": 9.328558921813965,
"learning_rate": 1e-05,
"loss": 0.2873,
"step": 2040
},
{
"epoch": 1.542513167795335,
"grad_norm": 7.824413299560547,
"learning_rate": 1e-05,
"loss": 0.2925,
"step": 2050
},
{
"epoch": 1.5500376222723853,
"grad_norm": 8.306281089782715,
"learning_rate": 1e-05,
"loss": 0.3059,
"step": 2060
},
{
"epoch": 1.5575620767494356,
"grad_norm": 11.394743919372559,
"learning_rate": 1e-05,
"loss": 0.2995,
"step": 2070
},
{
"epoch": 1.565086531226486,
"grad_norm": 8.414088249206543,
"learning_rate": 1e-05,
"loss": 0.2914,
"step": 2080
},
{
"epoch": 1.5726109857035366,
"grad_norm": 10.023848533630371,
"learning_rate": 1e-05,
"loss": 0.239,
"step": 2090
},
{
"epoch": 1.580135440180587,
"grad_norm": 8.992568016052246,
"learning_rate": 1e-05,
"loss": 0.2613,
"step": 2100
},
{
"epoch": 1.5876598946576372,
"grad_norm": 9.712190628051758,
"learning_rate": 1e-05,
"loss": 0.2558,
"step": 2110
},
{
"epoch": 1.5951843491346878,
"grad_norm": 7.813495635986328,
"learning_rate": 1e-05,
"loss": 0.218,
"step": 2120
},
{
"epoch": 1.6027088036117383,
"grad_norm": 6.5997443199157715,
"learning_rate": 1e-05,
"loss": 0.285,
"step": 2130
},
{
"epoch": 1.6102332580887886,
"grad_norm": 11.630515098571777,
"learning_rate": 1e-05,
"loss": 0.2677,
"step": 2140
},
{
"epoch": 1.617757712565839,
"grad_norm": 7.1673359870910645,
"learning_rate": 1e-05,
"loss": 0.2845,
"step": 2150
},
{
"epoch": 1.6252821670428894,
"grad_norm": 10.230573654174805,
"learning_rate": 1e-05,
"loss": 0.2695,
"step": 2160
},
{
"epoch": 1.63280662151994,
"grad_norm": 7.908997535705566,
"learning_rate": 1e-05,
"loss": 0.2732,
"step": 2170
},
{
"epoch": 1.6403310759969902,
"grad_norm": 6.134283542633057,
"learning_rate": 1e-05,
"loss": 0.2613,
"step": 2180
},
{
"epoch": 1.6478555304740405,
"grad_norm": 9.178680419921875,
"learning_rate": 1e-05,
"loss": 0.3006,
"step": 2190
},
{
"epoch": 1.655379984951091,
"grad_norm": 8.385205268859863,
"learning_rate": 1e-05,
"loss": 0.2391,
"step": 2200
},
{
"epoch": 1.6629044394281416,
"grad_norm": 8.024893760681152,
"learning_rate": 1e-05,
"loss": 0.3149,
"step": 2210
},
{
"epoch": 1.670428893905192,
"grad_norm": 7.160342216491699,
"learning_rate": 1e-05,
"loss": 0.2321,
"step": 2220
},
{
"epoch": 1.6779533483822422,
"grad_norm": 9.467365264892578,
"learning_rate": 1e-05,
"loss": 0.3183,
"step": 2230
},
{
"epoch": 1.6854778028592927,
"grad_norm": 8.656280517578125,
"learning_rate": 1e-05,
"loss": 0.3128,
"step": 2240
},
{
"epoch": 1.6930022573363432,
"grad_norm": 5.692852973937988,
"learning_rate": 1e-05,
"loss": 0.3139,
"step": 2250
},
{
"epoch": 1.7005267118133935,
"grad_norm": 7.600724220275879,
"learning_rate": 1e-05,
"loss": 0.249,
"step": 2260
},
{
"epoch": 1.7080511662904438,
"grad_norm": 7.849809646606445,
"learning_rate": 1e-05,
"loss": 0.2587,
"step": 2270
},
{
"epoch": 1.7155756207674944,
"grad_norm": 8.293899536132812,
"learning_rate": 1e-05,
"loss": 0.2805,
"step": 2280
},
{
"epoch": 1.723100075244545,
"grad_norm": 5.557303428649902,
"learning_rate": 1e-05,
"loss": 0.2789,
"step": 2290
},
{
"epoch": 1.7306245297215952,
"grad_norm": 10.24751091003418,
"learning_rate": 1e-05,
"loss": 0.3244,
"step": 2300
},
{
"epoch": 1.7381489841986455,
"grad_norm": 7.778582572937012,
"learning_rate": 1e-05,
"loss": 0.3005,
"step": 2310
},
{
"epoch": 1.745673438675696,
"grad_norm": 7.266477584838867,
"learning_rate": 1e-05,
"loss": 0.2207,
"step": 2320
},
{
"epoch": 1.7531978931527465,
"grad_norm": 10.221525192260742,
"learning_rate": 1e-05,
"loss": 0.2742,
"step": 2330
},
{
"epoch": 1.7607223476297968,
"grad_norm": 8.85750961303711,
"learning_rate": 1e-05,
"loss": 0.265,
"step": 2340
},
{
"epoch": 1.7682468021068471,
"grad_norm": 8.104692459106445,
"learning_rate": 1e-05,
"loss": 0.277,
"step": 2350
},
{
"epoch": 1.7757712565838977,
"grad_norm": 8.834745407104492,
"learning_rate": 1e-05,
"loss": 0.2315,
"step": 2360
},
{
"epoch": 1.7832957110609482,
"grad_norm": 10.258501052856445,
"learning_rate": 1e-05,
"loss": 0.2921,
"step": 2370
},
{
"epoch": 1.7908201655379985,
"grad_norm": 8.523322105407715,
"learning_rate": 1e-05,
"loss": 0.2043,
"step": 2380
},
{
"epoch": 1.7983446200150488,
"grad_norm": 9.75324821472168,
"learning_rate": 1e-05,
"loss": 0.2187,
"step": 2390
},
{
"epoch": 1.8058690744920993,
"grad_norm": 6.369287490844727,
"learning_rate": 1e-05,
"loss": 0.2365,
"step": 2400
},
{
"epoch": 1.8133935289691498,
"grad_norm": 6.650455951690674,
"learning_rate": 1e-05,
"loss": 0.3027,
"step": 2410
},
{
"epoch": 1.8209179834462002,
"grad_norm": 7.7705397605896,
"learning_rate": 1e-05,
"loss": 0.2942,
"step": 2420
},
{
"epoch": 1.8284424379232505,
"grad_norm": 8.137877464294434,
"learning_rate": 1e-05,
"loss": 0.3317,
"step": 2430
},
{
"epoch": 1.835966892400301,
"grad_norm": 8.068604469299316,
"learning_rate": 1e-05,
"loss": 0.2657,
"step": 2440
},
{
"epoch": 1.8434913468773515,
"grad_norm": 5.773308753967285,
"learning_rate": 1e-05,
"loss": 0.2706,
"step": 2450
},
{
"epoch": 1.8510158013544018,
"grad_norm": 6.239875793457031,
"learning_rate": 1e-05,
"loss": 0.2696,
"step": 2460
},
{
"epoch": 1.858540255831452,
"grad_norm": 11.373642921447754,
"learning_rate": 1e-05,
"loss": 0.2964,
"step": 2470
},
{
"epoch": 1.8660647103085026,
"grad_norm": 10.659271240234375,
"learning_rate": 1e-05,
"loss": 0.2839,
"step": 2480
},
{
"epoch": 1.8735891647855532,
"grad_norm": 7.298862934112549,
"learning_rate": 1e-05,
"loss": 0.2748,
"step": 2490
},
{
"epoch": 1.8811136192626035,
"grad_norm": 12.347573280334473,
"learning_rate": 1e-05,
"loss": 0.2654,
"step": 2500
},
{
"epoch": 1.8886380737396538,
"grad_norm": 6.894641399383545,
"learning_rate": 1e-05,
"loss": 0.2522,
"step": 2510
},
{
"epoch": 1.8961625282167043,
"grad_norm": 7.589210510253906,
"learning_rate": 1e-05,
"loss": 0.3053,
"step": 2520
},
{
"epoch": 1.9036869826937548,
"grad_norm": 6.738051891326904,
"learning_rate": 1e-05,
"loss": 0.224,
"step": 2530
},
{
"epoch": 1.911211437170805,
"grad_norm": 7.8747239112854,
"learning_rate": 1e-05,
"loss": 0.2751,
"step": 2540
},
{
"epoch": 1.9187358916478554,
"grad_norm": 6.456340789794922,
"learning_rate": 1e-05,
"loss": 0.1926,
"step": 2550
},
{
"epoch": 1.926260346124906,
"grad_norm": 9.186247825622559,
"learning_rate": 1e-05,
"loss": 0.3004,
"step": 2560
},
{
"epoch": 1.9337848006019565,
"grad_norm": 11.75734806060791,
"learning_rate": 1e-05,
"loss": 0.2749,
"step": 2570
},
{
"epoch": 1.9413092550790068,
"grad_norm": 10.06619644165039,
"learning_rate": 1e-05,
"loss": 0.3011,
"step": 2580
},
{
"epoch": 1.948833709556057,
"grad_norm": 11.47065544128418,
"learning_rate": 1e-05,
"loss": 0.3124,
"step": 2590
},
{
"epoch": 1.9563581640331076,
"grad_norm": 9.72806167602539,
"learning_rate": 1e-05,
"loss": 0.298,
"step": 2600
},
{
"epoch": 1.963882618510158,
"grad_norm": 9.43126106262207,
"learning_rate": 1e-05,
"loss": 0.2617,
"step": 2610
},
{
"epoch": 1.9714070729872084,
"grad_norm": 12.323174476623535,
"learning_rate": 1e-05,
"loss": 0.3397,
"step": 2620
},
{
"epoch": 1.9789315274642587,
"grad_norm": 7.464193820953369,
"learning_rate": 1e-05,
"loss": 0.2187,
"step": 2630
},
{
"epoch": 1.9864559819413092,
"grad_norm": 6.475297927856445,
"learning_rate": 1e-05,
"loss": 0.2879,
"step": 2640
},
{
"epoch": 1.9939804364183598,
"grad_norm": 8.84854793548584,
"learning_rate": 1e-05,
"loss": 0.2828,
"step": 2650
}
],
"logging_steps": 10,
"max_steps": 39870,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}