MPropositioneur-V1 / trainer_state.json
Zual's picture
Upload folder using huggingface_hub
8cd6723 verified
{
"best_global_step": 750,
"best_metric": 0.26988574862480164,
"best_model_checkpoint": "models/qwen-0.5b-distilled/checkpoint-750",
"epoch": 0.9829619921363041,
"eval_steps": 25,
"global_step": 750,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001310615989515072,
"grad_norm": 9.375,
"learning_rate": 0.0,
"loss": 0.548,
"step": 1
},
{
"epoch": 0.01310615989515072,
"grad_norm": 6.375,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.43,
"step": 10
},
{
"epoch": 0.02621231979030144,
"grad_norm": 9.25,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.4675,
"step": 20
},
{
"epoch": 0.0327653997378768,
"eval_loss": 0.4063957631587982,
"eval_runtime": 4.2148,
"eval_samples_per_second": 7.355,
"eval_steps_per_second": 0.949,
"step": 25
},
{
"epoch": 0.039318479685452164,
"grad_norm": 4.84375,
"learning_rate": 5.8e-06,
"loss": 0.3877,
"step": 30
},
{
"epoch": 0.05242463958060288,
"grad_norm": 5.125,
"learning_rate": 7.800000000000002e-06,
"loss": 0.3403,
"step": 40
},
{
"epoch": 0.0655307994757536,
"grad_norm": 6.375,
"learning_rate": 9.800000000000001e-06,
"loss": 0.3379,
"step": 50
},
{
"epoch": 0.0655307994757536,
"eval_loss": 0.3531549572944641,
"eval_runtime": 3.6193,
"eval_samples_per_second": 8.565,
"eval_steps_per_second": 1.105,
"step": 50
},
{
"epoch": 0.07863695937090433,
"grad_norm": 4.75,
"learning_rate": 1.18e-05,
"loss": 0.3865,
"step": 60
},
{
"epoch": 0.09174311926605505,
"grad_norm": 5.34375,
"learning_rate": 1.38e-05,
"loss": 0.3487,
"step": 70
},
{
"epoch": 0.0982961992136304,
"eval_loss": 0.3395631015300751,
"eval_runtime": 3.6271,
"eval_samples_per_second": 8.547,
"eval_steps_per_second": 1.103,
"step": 75
},
{
"epoch": 0.10484927916120576,
"grad_norm": 4.1875,
"learning_rate": 1.58e-05,
"loss": 0.3037,
"step": 80
},
{
"epoch": 0.11795543905635648,
"grad_norm": 7.46875,
"learning_rate": 1.7800000000000002e-05,
"loss": 0.3588,
"step": 90
},
{
"epoch": 0.1310615989515072,
"grad_norm": 5.9375,
"learning_rate": 1.98e-05,
"loss": 0.3423,
"step": 100
},
{
"epoch": 0.1310615989515072,
"eval_loss": 0.3489256501197815,
"eval_runtime": 3.6228,
"eval_samples_per_second": 8.557,
"eval_steps_per_second": 1.104,
"step": 100
},
{
"epoch": 0.14416775884665792,
"grad_norm": 5.65625,
"learning_rate": 1.9873772791023845e-05,
"loss": 0.344,
"step": 110
},
{
"epoch": 0.15727391874180865,
"grad_norm": 6.71875,
"learning_rate": 1.973352033660589e-05,
"loss": 0.3335,
"step": 120
},
{
"epoch": 0.16382699868938402,
"eval_loss": 0.32981202006340027,
"eval_runtime": 3.631,
"eval_samples_per_second": 8.538,
"eval_steps_per_second": 1.102,
"step": 125
},
{
"epoch": 0.17038007863695936,
"grad_norm": 4.25,
"learning_rate": 1.959326788218794e-05,
"loss": 0.3455,
"step": 130
},
{
"epoch": 0.1834862385321101,
"grad_norm": 4.40625,
"learning_rate": 1.9453015427769988e-05,
"loss": 0.3583,
"step": 140
},
{
"epoch": 0.1965923984272608,
"grad_norm": 8.5625,
"learning_rate": 1.9312762973352036e-05,
"loss": 0.3583,
"step": 150
},
{
"epoch": 0.1965923984272608,
"eval_loss": 0.32496821880340576,
"eval_runtime": 3.6332,
"eval_samples_per_second": 8.532,
"eval_steps_per_second": 1.101,
"step": 150
},
{
"epoch": 0.20969855832241152,
"grad_norm": 3.921875,
"learning_rate": 1.9172510518934082e-05,
"loss": 0.3653,
"step": 160
},
{
"epoch": 0.22280471821756226,
"grad_norm": 5.96875,
"learning_rate": 1.903225806451613e-05,
"loss": 0.3176,
"step": 170
},
{
"epoch": 0.22935779816513763,
"eval_loss": 0.31902241706848145,
"eval_runtime": 3.6232,
"eval_samples_per_second": 8.556,
"eval_steps_per_second": 1.104,
"step": 175
},
{
"epoch": 0.23591087811271297,
"grad_norm": 5.0625,
"learning_rate": 1.889200561009818e-05,
"loss": 0.3316,
"step": 180
},
{
"epoch": 0.2490170380078637,
"grad_norm": 5.03125,
"learning_rate": 1.8751753155680224e-05,
"loss": 0.3265,
"step": 190
},
{
"epoch": 0.2621231979030144,
"grad_norm": 4.1875,
"learning_rate": 1.8611500701262273e-05,
"loss": 0.3195,
"step": 200
},
{
"epoch": 0.2621231979030144,
"eval_loss": 0.314563125371933,
"eval_runtime": 3.6254,
"eval_samples_per_second": 8.551,
"eval_steps_per_second": 1.103,
"step": 200
},
{
"epoch": 0.27522935779816515,
"grad_norm": 4.6875,
"learning_rate": 1.8471248246844322e-05,
"loss": 0.3281,
"step": 210
},
{
"epoch": 0.28833551769331583,
"grad_norm": 3.421875,
"learning_rate": 1.833099579242637e-05,
"loss": 0.2814,
"step": 220
},
{
"epoch": 0.2948885976408912,
"eval_loss": 0.3087914288043976,
"eval_runtime": 3.6237,
"eval_samples_per_second": 8.555,
"eval_steps_per_second": 1.104,
"step": 225
},
{
"epoch": 0.30144167758846657,
"grad_norm": 3.734375,
"learning_rate": 1.8190743338008416e-05,
"loss": 0.3419,
"step": 230
},
{
"epoch": 0.3145478374836173,
"grad_norm": 6.28125,
"learning_rate": 1.8050490883590464e-05,
"loss": 0.3428,
"step": 240
},
{
"epoch": 0.32765399737876805,
"grad_norm": 6.28125,
"learning_rate": 1.7910238429172513e-05,
"loss": 0.3723,
"step": 250
},
{
"epoch": 0.32765399737876805,
"eval_loss": 0.31134819984436035,
"eval_runtime": 3.6258,
"eval_samples_per_second": 8.55,
"eval_steps_per_second": 1.103,
"step": 250
},
{
"epoch": 0.34076015727391873,
"grad_norm": 6.96875,
"learning_rate": 1.776998597475456e-05,
"loss": 0.3545,
"step": 260
},
{
"epoch": 0.35386631716906947,
"grad_norm": 5.375,
"learning_rate": 1.7629733520336607e-05,
"loss": 0.3441,
"step": 270
},
{
"epoch": 0.36041939711664484,
"eval_loss": 0.30370065569877625,
"eval_runtime": 3.6249,
"eval_samples_per_second": 8.552,
"eval_steps_per_second": 1.103,
"step": 275
},
{
"epoch": 0.3669724770642202,
"grad_norm": 4.5625,
"learning_rate": 1.7489481065918656e-05,
"loss": 0.3107,
"step": 280
},
{
"epoch": 0.3800786369593709,
"grad_norm": 3.203125,
"learning_rate": 1.7349228611500704e-05,
"loss": 0.2933,
"step": 290
},
{
"epoch": 0.3931847968545216,
"grad_norm": 3.75,
"learning_rate": 1.720897615708275e-05,
"loss": 0.291,
"step": 300
},
{
"epoch": 0.3931847968545216,
"eval_loss": 0.30050498247146606,
"eval_runtime": 3.6252,
"eval_samples_per_second": 8.551,
"eval_steps_per_second": 1.103,
"step": 300
},
{
"epoch": 0.40629095674967236,
"grad_norm": 4.125,
"learning_rate": 1.70687237026648e-05,
"loss": 0.3167,
"step": 310
},
{
"epoch": 0.41939711664482304,
"grad_norm": 4.90625,
"learning_rate": 1.6928471248246844e-05,
"loss": 0.3104,
"step": 320
},
{
"epoch": 0.4259501965923984,
"eval_loss": 0.2980906069278717,
"eval_runtime": 3.6274,
"eval_samples_per_second": 8.546,
"eval_steps_per_second": 1.103,
"step": 325
},
{
"epoch": 0.4325032765399738,
"grad_norm": 4.03125,
"learning_rate": 1.6788218793828896e-05,
"loss": 0.3122,
"step": 330
},
{
"epoch": 0.4456094364351245,
"grad_norm": 5.21875,
"learning_rate": 1.664796633941094e-05,
"loss": 0.3053,
"step": 340
},
{
"epoch": 0.45871559633027525,
"grad_norm": 3.09375,
"learning_rate": 1.650771388499299e-05,
"loss": 0.2632,
"step": 350
},
{
"epoch": 0.45871559633027525,
"eval_loss": 0.29447969794273376,
"eval_runtime": 3.6275,
"eval_samples_per_second": 8.546,
"eval_steps_per_second": 1.103,
"step": 350
},
{
"epoch": 0.47182175622542594,
"grad_norm": 4.40625,
"learning_rate": 1.6367461430575035e-05,
"loss": 0.2968,
"step": 360
},
{
"epoch": 0.4849279161205767,
"grad_norm": 3.78125,
"learning_rate": 1.6227208976157084e-05,
"loss": 0.3154,
"step": 370
},
{
"epoch": 0.49148099606815204,
"eval_loss": 0.29292187094688416,
"eval_runtime": 3.6253,
"eval_samples_per_second": 8.551,
"eval_steps_per_second": 1.103,
"step": 375
},
{
"epoch": 0.4980340760157274,
"grad_norm": 3.984375,
"learning_rate": 1.6086956521739132e-05,
"loss": 0.2635,
"step": 380
},
{
"epoch": 0.5111402359108781,
"grad_norm": 3.5,
"learning_rate": 1.5946704067321178e-05,
"loss": 0.3126,
"step": 390
},
{
"epoch": 0.5242463958060288,
"grad_norm": 5.40625,
"learning_rate": 1.5806451612903226e-05,
"loss": 0.3126,
"step": 400
},
{
"epoch": 0.5242463958060288,
"eval_loss": 0.2929743826389313,
"eval_runtime": 3.6233,
"eval_samples_per_second": 8.556,
"eval_steps_per_second": 1.104,
"step": 400
},
{
"epoch": 0.5373525557011796,
"grad_norm": 3.984375,
"learning_rate": 1.5666199158485275e-05,
"loss": 0.279,
"step": 410
},
{
"epoch": 0.5504587155963303,
"grad_norm": 4.5,
"learning_rate": 1.5525946704067324e-05,
"loss": 0.2817,
"step": 420
},
{
"epoch": 0.5570117955439057,
"eval_loss": 0.2881828844547272,
"eval_runtime": 3.6309,
"eval_samples_per_second": 8.538,
"eval_steps_per_second": 1.102,
"step": 425
},
{
"epoch": 0.563564875491481,
"grad_norm": 3.671875,
"learning_rate": 1.538569424964937e-05,
"loss": 0.2967,
"step": 430
},
{
"epoch": 0.5766710353866317,
"grad_norm": 4.3125,
"learning_rate": 1.5245441795231418e-05,
"loss": 0.3231,
"step": 440
},
{
"epoch": 0.5897771952817824,
"grad_norm": 3.65625,
"learning_rate": 1.5105189340813466e-05,
"loss": 0.3371,
"step": 450
},
{
"epoch": 0.5897771952817824,
"eval_loss": 0.285811185836792,
"eval_runtime": 3.6291,
"eval_samples_per_second": 8.542,
"eval_steps_per_second": 1.102,
"step": 450
},
{
"epoch": 0.6028833551769331,
"grad_norm": 3.328125,
"learning_rate": 1.4964936886395513e-05,
"loss": 0.3081,
"step": 460
},
{
"epoch": 0.6159895150720839,
"grad_norm": 3.984375,
"learning_rate": 1.482468443197756e-05,
"loss": 0.2821,
"step": 470
},
{
"epoch": 0.6225425950196593,
"eval_loss": 0.28545138239860535,
"eval_runtime": 3.6247,
"eval_samples_per_second": 8.553,
"eval_steps_per_second": 1.104,
"step": 475
},
{
"epoch": 0.6290956749672346,
"grad_norm": 3.65625,
"learning_rate": 1.4684431977559607e-05,
"loss": 0.3088,
"step": 480
},
{
"epoch": 0.6422018348623854,
"grad_norm": 3.4375,
"learning_rate": 1.4544179523141658e-05,
"loss": 0.3111,
"step": 490
},
{
"epoch": 0.6553079947575361,
"grad_norm": 3.65625,
"learning_rate": 1.4403927068723705e-05,
"loss": 0.2942,
"step": 500
},
{
"epoch": 0.6553079947575361,
"eval_loss": 0.28335702419281006,
"eval_runtime": 3.6367,
"eval_samples_per_second": 8.524,
"eval_steps_per_second": 1.1,
"step": 500
},
{
"epoch": 0.6684141546526867,
"grad_norm": 3.671875,
"learning_rate": 1.4263674614305752e-05,
"loss": 0.2872,
"step": 510
},
{
"epoch": 0.6815203145478375,
"grad_norm": 3.03125,
"learning_rate": 1.4123422159887799e-05,
"loss": 0.3081,
"step": 520
},
{
"epoch": 0.6880733944954128,
"eval_loss": 0.2832925617694855,
"eval_runtime": 3.6251,
"eval_samples_per_second": 8.551,
"eval_steps_per_second": 1.103,
"step": 525
},
{
"epoch": 0.6946264744429882,
"grad_norm": 5.03125,
"learning_rate": 1.3983169705469847e-05,
"loss": 0.3289,
"step": 530
},
{
"epoch": 0.7077326343381389,
"grad_norm": 3.75,
"learning_rate": 1.3842917251051894e-05,
"loss": 0.2875,
"step": 540
},
{
"epoch": 0.7208387942332897,
"grad_norm": 2.78125,
"learning_rate": 1.3702664796633941e-05,
"loss": 0.2218,
"step": 550
},
{
"epoch": 0.7208387942332897,
"eval_loss": 0.28089821338653564,
"eval_runtime": 3.6305,
"eval_samples_per_second": 8.539,
"eval_steps_per_second": 1.102,
"step": 550
},
{
"epoch": 0.7339449541284404,
"grad_norm": 4.4375,
"learning_rate": 1.356241234221599e-05,
"loss": 0.264,
"step": 560
},
{
"epoch": 0.747051114023591,
"grad_norm": 3.140625,
"learning_rate": 1.3422159887798039e-05,
"loss": 0.3062,
"step": 570
},
{
"epoch": 0.7536041939711664,
"eval_loss": 0.2809857130050659,
"eval_runtime": 3.6291,
"eval_samples_per_second": 8.542,
"eval_steps_per_second": 1.102,
"step": 575
},
{
"epoch": 0.7601572739187418,
"grad_norm": 4.125,
"learning_rate": 1.3281907433380086e-05,
"loss": 0.2615,
"step": 580
},
{
"epoch": 0.7732634338138925,
"grad_norm": 3.53125,
"learning_rate": 1.3141654978962133e-05,
"loss": 0.2747,
"step": 590
},
{
"epoch": 0.7863695937090432,
"grad_norm": 3.34375,
"learning_rate": 1.300140252454418e-05,
"loss": 0.2837,
"step": 600
},
{
"epoch": 0.7863695937090432,
"eval_loss": 0.276317834854126,
"eval_runtime": 3.6322,
"eval_samples_per_second": 8.535,
"eval_steps_per_second": 1.101,
"step": 600
},
{
"epoch": 0.799475753604194,
"grad_norm": 5.0,
"learning_rate": 1.286115007012623e-05,
"loss": 0.3009,
"step": 610
},
{
"epoch": 0.8125819134993447,
"grad_norm": 3.46875,
"learning_rate": 1.2720897615708277e-05,
"loss": 0.3129,
"step": 620
},
{
"epoch": 0.8191349934469201,
"eval_loss": 0.27676212787628174,
"eval_runtime": 3.6276,
"eval_samples_per_second": 8.546,
"eval_steps_per_second": 1.103,
"step": 625
},
{
"epoch": 0.8256880733944955,
"grad_norm": 2.90625,
"learning_rate": 1.2580645161290324e-05,
"loss": 0.2404,
"step": 630
},
{
"epoch": 0.8387942332896461,
"grad_norm": 3.09375,
"learning_rate": 1.2440392706872371e-05,
"loss": 0.2858,
"step": 640
},
{
"epoch": 0.8519003931847968,
"grad_norm": 3.90625,
"learning_rate": 1.230014025245442e-05,
"loss": 0.2947,
"step": 650
},
{
"epoch": 0.8519003931847968,
"eval_loss": 0.2739439606666565,
"eval_runtime": 3.6305,
"eval_samples_per_second": 8.539,
"eval_steps_per_second": 1.102,
"step": 650
},
{
"epoch": 0.8650065530799476,
"grad_norm": 3.15625,
"learning_rate": 1.2159887798036467e-05,
"loss": 0.291,
"step": 660
},
{
"epoch": 0.8781127129750983,
"grad_norm": 3.828125,
"learning_rate": 1.2019635343618514e-05,
"loss": 0.3276,
"step": 670
},
{
"epoch": 0.8846657929226737,
"eval_loss": 0.2710360586643219,
"eval_runtime": 3.6251,
"eval_samples_per_second": 8.551,
"eval_steps_per_second": 1.103,
"step": 675
},
{
"epoch": 0.891218872870249,
"grad_norm": 4.75,
"learning_rate": 1.187938288920056e-05,
"loss": 0.289,
"step": 680
},
{
"epoch": 0.9043250327653998,
"grad_norm": 3.53125,
"learning_rate": 1.1739130434782611e-05,
"loss": 0.2442,
"step": 690
},
{
"epoch": 0.9174311926605505,
"grad_norm": 3.0625,
"learning_rate": 1.1598877980364658e-05,
"loss": 0.3024,
"step": 700
},
{
"epoch": 0.9174311926605505,
"eval_loss": 0.2719181180000305,
"eval_runtime": 3.625,
"eval_samples_per_second": 8.552,
"eval_steps_per_second": 1.103,
"step": 700
},
{
"epoch": 0.9305373525557011,
"grad_norm": 3.15625,
"learning_rate": 1.1458625525946705e-05,
"loss": 0.2883,
"step": 710
},
{
"epoch": 0.9436435124508519,
"grad_norm": 4.0625,
"learning_rate": 1.1318373071528752e-05,
"loss": 0.2915,
"step": 720
},
{
"epoch": 0.9501965923984272,
"eval_loss": 0.27032172679901123,
"eval_runtime": 3.6246,
"eval_samples_per_second": 8.553,
"eval_steps_per_second": 1.104,
"step": 725
},
{
"epoch": 0.9567496723460026,
"grad_norm": 3.09375,
"learning_rate": 1.11781206171108e-05,
"loss": 0.2846,
"step": 730
},
{
"epoch": 0.9698558322411533,
"grad_norm": 3.859375,
"learning_rate": 1.103786816269285e-05,
"loss": 0.2954,
"step": 740
},
{
"epoch": 0.9829619921363041,
"grad_norm": 4.34375,
"learning_rate": 1.0897615708274896e-05,
"loss": 0.2825,
"step": 750
},
{
"epoch": 0.9829619921363041,
"eval_loss": 0.26988574862480164,
"eval_runtime": 3.6297,
"eval_samples_per_second": 8.541,
"eval_steps_per_second": 1.102,
"step": 750
}
],
"logging_steps": 10,
"max_steps": 1526,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9744231345598464.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}