{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.30020422055820284,
"eval_steps": 500,
"global_step": 882,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0034036759700476512,
"grad_norm": 0.3817383944988251,
"learning_rate": 1.1235955056179776e-05,
"loss": 1.5188,
"step": 10
},
{
"epoch": 0.0068073519400953025,
"grad_norm": 0.4666265845298767,
"learning_rate": 2.2471910112359552e-05,
"loss": 1.5003,
"step": 20
},
{
"epoch": 0.010211027910142955,
"grad_norm": 0.41875213384628296,
"learning_rate": 3.370786516853933e-05,
"loss": 1.4805,
"step": 30
},
{
"epoch": 0.013614703880190605,
"grad_norm": 0.37079161405563354,
"learning_rate": 4.4943820224719104e-05,
"loss": 1.3878,
"step": 40
},
{
"epoch": 0.01701837985023826,
"grad_norm": 0.5143834352493286,
"learning_rate": 5.6179775280898885e-05,
"loss": 1.4275,
"step": 50
},
{
"epoch": 0.02042205582028591,
"grad_norm": 0.36570975184440613,
"learning_rate": 6.741573033707866e-05,
"loss": 1.3778,
"step": 60
},
{
"epoch": 0.023825731790333562,
"grad_norm": 1.1076630353927612,
"learning_rate": 7.865168539325843e-05,
"loss": 1.3833,
"step": 70
},
{
"epoch": 0.02722940776038121,
"grad_norm": 0.43584316968917847,
"learning_rate": 8.988764044943821e-05,
"loss": 1.3653,
"step": 80
},
{
"epoch": 0.03063308373042886,
"grad_norm": 0.6449490189552307,
"learning_rate": 9.999960763269511e-05,
"loss": 1.3612,
"step": 90
},
{
"epoch": 0.03403675970047652,
"grad_norm": 0.408315509557724,
"learning_rate": 9.9952531006933e-05,
"loss": 1.363,
"step": 100
},
{
"epoch": 0.037440435670524165,
"grad_norm": 0.4980515241622925,
"learning_rate": 9.982706557199723e-05,
"loss": 1.3148,
"step": 110
},
{
"epoch": 0.04084411164057182,
"grad_norm": 0.620286762714386,
"learning_rate": 9.962340821653064e-05,
"loss": 1.3419,
"step": 120
},
{
"epoch": 0.04424778761061947,
"grad_norm": 0.5976988077163696,
"learning_rate": 9.934187853309946e-05,
"loss": 1.3251,
"step": 130
},
{
"epoch": 0.047651463580667124,
"grad_norm": 0.5392516851425171,
"learning_rate": 9.898291831666755e-05,
"loss": 1.2998,
"step": 140
},
{
"epoch": 0.05105513955071477,
"grad_norm": 0.7126803398132324,
"learning_rate": 9.85470908713026e-05,
"loss": 1.3652,
"step": 150
},
{
"epoch": 0.05445881552076242,
"grad_norm": 0.591543436050415,
"learning_rate": 9.803508012620217e-05,
"loss": 1.3153,
"step": 160
},
{
"epoch": 0.057862491490810075,
"grad_norm": 0.4493406116962433,
"learning_rate": 9.744768956242683e-05,
"loss": 1.3256,
"step": 170
},
{
"epoch": 0.06126616746085772,
"grad_norm": 0.814771294593811,
"learning_rate": 9.678584095202468e-05,
"loss": 1.3489,
"step": 180
},
{
"epoch": 0.06466984343090537,
"grad_norm": 0.5826136469841003,
"learning_rate": 9.605057291152581e-05,
"loss": 1.3149,
"step": 190
},
{
"epoch": 0.06807351940095303,
"grad_norm": 0.5115235447883606,
"learning_rate": 9.524303927207663e-05,
"loss": 1.3108,
"step": 200
},
{
"epoch": 0.07147719537100068,
"grad_norm": 0.5263652205467224,
"learning_rate": 9.43645072687719e-05,
"loss": 1.3354,
"step": 210
},
{
"epoch": 0.07488087134104833,
"grad_norm": 0.6202068328857422,
"learning_rate": 9.341635555202577e-05,
"loss": 1.3317,
"step": 220
},
{
"epoch": 0.07828454731109598,
"grad_norm": 0.4816318154335022,
"learning_rate": 9.240007202410249e-05,
"loss": 1.3592,
"step": 230
},
{
"epoch": 0.08168822328114364,
"grad_norm": 0.6449723243713379,
"learning_rate": 9.131725150420205e-05,
"loss": 1.3266,
"step": 240
},
{
"epoch": 0.08509189925119129,
"grad_norm": 0.5435076355934143,
"learning_rate": 9.016959322576453e-05,
"loss": 1.3268,
"step": 250
},
{
"epoch": 0.08849557522123894,
"grad_norm": 0.5929358005523682,
"learning_rate": 8.895889816992084e-05,
"loss": 1.3539,
"step": 260
},
{
"epoch": 0.09189925119128659,
"grad_norm": 0.5826624631881714,
"learning_rate": 8.768706623927427e-05,
"loss": 1.262,
"step": 270
},
{
"epoch": 0.09530292716133425,
"grad_norm": 0.8038996458053589,
"learning_rate": 8.635609327644783e-05,
"loss": 1.3711,
"step": 280
},
{
"epoch": 0.0987066031313819,
"grad_norm": 0.852136492729187,
"learning_rate": 8.496806793207635e-05,
"loss": 1.296,
"step": 290
},
{
"epoch": 0.10211027910142954,
"grad_norm": 0.4777398705482483,
"learning_rate": 8.352516838715799e-05,
"loss": 1.3339,
"step": 300
},
{
"epoch": 0.10551395507147719,
"grad_norm": 0.6921893954277039,
"learning_rate": 8.202965893490878e-05,
"loss": 1.3019,
"step": 310
},
{
"epoch": 0.10891763104152484,
"grad_norm": 0.5298048853874207,
"learning_rate": 8.048388642748425e-05,
"loss": 1.278,
"step": 320
},
{
"epoch": 0.1123213070115725,
"grad_norm": 0.6031792163848877,
"learning_rate": 7.889027659314423e-05,
"loss": 1.2982,
"step": 330
},
{
"epoch": 0.11572498298162015,
"grad_norm": 0.767808198928833,
"learning_rate": 7.725133022963974e-05,
"loss": 1.3111,
"step": 340
},
{
"epoch": 0.1191286589516678,
"grad_norm": 0.7626794576644897,
"learning_rate": 7.556961927979622e-05,
"loss": 1.2223,
"step": 350
},
{
"epoch": 0.12253233492171545,
"grad_norm": 0.5305209159851074,
"learning_rate": 7.384778279545096e-05,
"loss": 1.3317,
"step": 360
},
{
"epoch": 0.1259360108917631,
"grad_norm": 0.9789229035377502,
"learning_rate": 7.208852279607883e-05,
"loss": 1.352,
"step": 370
},
{
"epoch": 0.12933968686181074,
"grad_norm": 0.5160291790962219,
"learning_rate": 7.029460002860492e-05,
"loss": 1.2979,
"step": 380
},
{
"epoch": 0.13274336283185842,
"grad_norm": 0.47278067469596863,
"learning_rate": 6.846882963505821e-05,
"loss": 1.2898,
"step": 390
},
{
"epoch": 0.13614703880190607,
"grad_norm": 0.5707348585128784,
"learning_rate": 6.661407673486489e-05,
"loss": 1.3688,
"step": 400
},
{
"epoch": 0.13955071477195372,
"grad_norm": 0.623561441898346,
"learning_rate": 6.473325192871382e-05,
"loss": 1.3058,
"step": 410
},
{
"epoch": 0.14295439074200136,
"grad_norm": 0.7172777056694031,
"learning_rate": 6.282930673104965e-05,
"loss": 1.2926,
"step": 420
},
{
"epoch": 0.146358066712049,
"grad_norm": 0.5847612619400024,
"learning_rate": 6.090522893836183e-05,
"loss": 1.3026,
"step": 430
},
{
"epoch": 0.14976174268209666,
"grad_norm": 0.6956146955490112,
"learning_rate": 5.896403794053679e-05,
"loss": 1.3242,
"step": 440
},
{
"epoch": 0.1531654186521443,
"grad_norm": 0.9769287109375,
"learning_rate": 5.700877998263221e-05,
"loss": 1.2992,
"step": 450
},
{
"epoch": 0.15656909462219196,
"grad_norm": 1.0140455961227417,
"learning_rate": 5.5042523384508136e-05,
"loss": 1.2911,
"step": 460
},
{
"epoch": 0.15997277059223963,
"grad_norm": 0.6622276902198792,
"learning_rate": 5.306835372581687e-05,
"loss": 1.2924,
"step": 470
},
{
"epoch": 0.16337644656228728,
"grad_norm": 1.0274490118026733,
"learning_rate": 5.108936900390775e-05,
"loss": 1.3372,
"step": 480
},
{
"epoch": 0.16678012253233493,
"grad_norm": 0.4924236536026001,
"learning_rate": 4.9108674772245144e-05,
"loss": 1.2659,
"step": 490
},
{
"epoch": 0.17018379850238258,
"grad_norm": 0.5681174397468567,
"learning_rate": 4.712937926696903e-05,
"loss": 1.2672,
"step": 500
},
{
"epoch": 0.17018379850238258,
"eval_loss": 1.492391586303711,
"eval_runtime": 136.4709,
"eval_samples_per_second": 76.522,
"eval_steps_per_second": 19.132,
"step": 500
},
{
"epoch": 0.17358747447243023,
"grad_norm": 0.6138017177581787,
"learning_rate": 4.515458852924553e-05,
"loss": 1.3551,
"step": 510
},
{
"epoch": 0.17699115044247787,
"grad_norm": 0.6779869198799133,
"learning_rate": 4.318740153106218e-05,
"loss": 1.3149,
"step": 520
},
{
"epoch": 0.18039482641252552,
"grad_norm": 0.5932974815368652,
"learning_rate": 4.123090531211653e-05,
"loss": 1.3229,
"step": 530
},
{
"epoch": 0.18379850238257317,
"grad_norm": 0.49759408831596375,
"learning_rate": 3.928817013542954e-05,
"loss": 1.3248,
"step": 540
},
{
"epoch": 0.18720217835262082,
"grad_norm": 0.6779336333274841,
"learning_rate": 3.736224466928634e-05,
"loss": 1.2666,
"step": 550
},
{
"epoch": 0.1906058543226685,
"grad_norm": 0.7689851522445679,
"learning_rate": 3.5456151203064515e-05,
"loss": 1.2989,
"step": 560
},
{
"epoch": 0.19400953029271614,
"grad_norm": 0.7002193331718445,
"learning_rate": 3.357288090445827e-05,
"loss": 1.3551,
"step": 570
},
{
"epoch": 0.1974132062627638,
"grad_norm": 0.7646905779838562,
"learning_rate": 3.171538912554054e-05,
"loss": 1.2812,
"step": 580
},
{
"epoch": 0.20081688223281144,
"grad_norm": 1.0780543088912964,
"learning_rate": 2.988659076502946e-05,
"loss": 1.2798,
"step": 590
},
{
"epoch": 0.2042205582028591,
"grad_norm": 0.7833569049835205,
"learning_rate": 2.808935569403688e-05,
"loss": 1.3266,
"step": 600
},
{
"epoch": 0.20762423417290674,
"grad_norm": 0.5977985262870789,
"learning_rate": 2.6326504252477046e-05,
"loss": 1.2375,
"step": 610
},
{
"epoch": 0.21102791014295438,
"grad_norm": 0.6137337684631348,
"learning_rate": 2.4600802823203273e-05,
"loss": 1.2488,
"step": 620
},
{
"epoch": 0.21443158611300203,
"grad_norm": 0.7681064605712891,
"learning_rate": 2.2914959490817122e-05,
"loss": 1.3371,
"step": 630
},
{
"epoch": 0.21783526208304968,
"grad_norm": 1.2434996366500854,
"learning_rate": 2.12716197919634e-05,
"loss": 1.3557,
"step": 640
},
{
"epoch": 0.22123893805309736,
"grad_norm": 0.7922418713569641,
"learning_rate": 1.9673362563779356e-05,
"loss": 1.26,
"step": 650
},
{
"epoch": 0.224642614023145,
"grad_norm": 0.6704869866371155,
"learning_rate": 1.812269589701326e-05,
"loss": 1.2262,
"step": 660
},
{
"epoch": 0.22804628999319265,
"grad_norm": 0.6106792092323303,
"learning_rate": 1.662205320016279e-05,
"loss": 1.2958,
"step": 670
},
{
"epoch": 0.2314499659632403,
"grad_norm": 0.7386729717254639,
"learning_rate": 1.517378938080979e-05,
"loss": 1.279,
"step": 680
},
{
"epoch": 0.23485364193328795,
"grad_norm": 0.9197444319725037,
"learning_rate": 1.3780177150143908e-05,
"loss": 1.3203,
"step": 690
},
{
"epoch": 0.2382573179033356,
"grad_norm": 0.5879797339439392,
"learning_rate": 1.2443403456474017e-05,
"loss": 1.2765,
"step": 700
},
{
"epoch": 0.24166099387338325,
"grad_norm": 0.6365487575531006,
"learning_rate": 1.1165566053324699e-05,
"loss": 1.2499,
"step": 710
},
{
"epoch": 0.2450646698434309,
"grad_norm": 0.5641332864761353,
"learning_rate": 9.948670207502907e-06,
"loss": 1.2574,
"step": 720
},
{
"epoch": 0.24846834581347857,
"grad_norm": 1.0512096881866455,
"learning_rate": 8.794625552300878e-06,
"loss": 1.2396,
"step": 730
},
{
"epoch": 0.2518720217835262,
"grad_norm": 0.8620232939720154,
"learning_rate": 7.705243090773522e-06,
"loss": 1.3172,
"step": 740
},
{
"epoch": 0.25527569775357384,
"grad_norm": 0.8536260724067688,
"learning_rate": 6.682232353792894e-06,
"loss": 1.3053,
"step": 750
},
{
"epoch": 0.2586793737236215,
"grad_norm": 0.7353459596633911,
"learning_rate": 5.727198717339511e-06,
"loss": 1.3267,
"step": 760
},
{
"epoch": 0.2620830496936692,
"grad_norm": 0.7075720429420471,
"learning_rate": 4.8416408832403334e-06,
"loss": 1.2496,
"step": 770
},
{
"epoch": 0.26548672566371684,
"grad_norm": 0.5608527660369873,
"learning_rate": 4.026948527306989e-06,
"loss": 1.2856,
"step": 780
},
{
"epoch": 0.2688904016337645,
"grad_norm": 0.7906400561332703,
"learning_rate": 3.2844001185647288e-06,
"loss": 1.2576,
"step": 790
},
{
"epoch": 0.27229407760381213,
"grad_norm": 0.9198495745658875,
"learning_rate": 2.6151609129943964e-06,
"loss": 1.3065,
"step": 800
},
{
"epoch": 0.2756977535738598,
"grad_norm": 0.5755515098571777,
"learning_rate": 2.02028112493588e-06,
"loss": 1.3112,
"step": 810
},
{
"epoch": 0.27910142954390743,
"grad_norm": 0.5036645531654358,
"learning_rate": 1.5006942790224133e-06,
"loss": 1.2878,
"step": 820
},
{
"epoch": 0.2825051055139551,
"grad_norm": 0.5611245036125183,
"learning_rate": 1.0572157452321097e-06,
"loss": 1.2717,
"step": 830
},
{
"epoch": 0.2859087814840027,
"grad_norm": 0.7261125445365906,
"learning_rate": 6.905414593555482e-07,
"loss": 1.2907,
"step": 840
},
{
"epoch": 0.2893124574540504,
"grad_norm": 0.5549591183662415,
"learning_rate": 4.0124683088740287e-07,
"loss": 1.3056,
"step": 850
},
{
"epoch": 0.292716133424098,
"grad_norm": 1.1202582120895386,
"learning_rate": 1.897858400558783e-07,
"loss": 1.2864,
"step": 860
},
{
"epoch": 0.29611980939414567,
"grad_norm": 0.5217841863632202,
"learning_rate": 5.6490325406971524e-08,
"loss": 1.2405,
"step": 870
},
{
"epoch": 0.2995234853641933,
"grad_norm": 1.1227178573608398,
"learning_rate": 1.5694630615070704e-09,
"loss": 1.3007,
"step": 880
},
{
"epoch": 0.30020422055820284,
"step": 882,
"total_flos": 2.4828724791109222e+17,
"train_loss": 1.3161652834237028,
"train_runtime": 1203.9175,
"train_samples_per_second": 23.42,
"train_steps_per_second": 0.733
}
],
"logging_steps": 10,
"max_steps": 882,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.4828724791109222e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}