ppt-agent-model / trainer_state.json
RohitSur's picture
Upload folder using huggingface_hub
643b8aa verified
Raw
History Blame Contribute Delete
27.2 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 950,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.79375,
"epoch": 0.021052631578947368,
"grad_norm": 6.71875,
"learning_rate": 4.9526315789473685e-05,
"loss": 1.7312183380126953,
"mean_token_accuracy": 0.6867853045463562,
"num_tokens": 840.0,
"step": 10
},
{
"entropy": 1.52421875,
"epoch": 0.042105263157894736,
"grad_norm": 3.90625,
"learning_rate": 4.9e-05,
"loss": 1.3260906219482422,
"mean_token_accuracy": 0.7451180815696716,
"num_tokens": 1573.0,
"step": 20
},
{
"entropy": 1.69921875,
"epoch": 0.06315789473684211,
"grad_norm": 6.5625,
"learning_rate": 4.847368421052632e-05,
"loss": 1.6661336898803711,
"mean_token_accuracy": 0.7098163902759552,
"num_tokens": 2338.0,
"step": 30
},
{
"entropy": 1.5484375,
"epoch": 0.08421052631578947,
"grad_norm": 7.4375,
"learning_rate": 4.794736842105264e-05,
"loss": 1.5569435119628907,
"mean_token_accuracy": 0.7198988318443298,
"num_tokens": 3038.0,
"step": 40
},
{
"entropy": 1.59296875,
"epoch": 0.10526315789473684,
"grad_norm": 8.1875,
"learning_rate": 4.742105263157895e-05,
"loss": 1.5420040130615233,
"mean_token_accuracy": 0.7149245262145996,
"num_tokens": 3682.0,
"step": 50
},
{
"entropy": 1.80390625,
"epoch": 0.12631578947368421,
"grad_norm": 9.9375,
"learning_rate": 4.6894736842105264e-05,
"loss": 1.6832901000976563,
"mean_token_accuracy": 0.7065998375415802,
"num_tokens": 4617.0,
"step": 60
},
{
"entropy": 1.66796875,
"epoch": 0.14736842105263157,
"grad_norm": 8.375,
"learning_rate": 4.6368421052631584e-05,
"loss": 1.6378042221069335,
"mean_token_accuracy": 0.7060423612594604,
"num_tokens": 5349.0,
"step": 70
},
{
"entropy": 1.5109375,
"epoch": 0.16842105263157894,
"grad_norm": 6.03125,
"learning_rate": 4.58421052631579e-05,
"loss": 1.5179196357727052,
"mean_token_accuracy": 0.7291716754436492,
"num_tokens": 6122.0,
"step": 80
},
{
"entropy": 1.721875,
"epoch": 0.18947368421052632,
"grad_norm": 4.0625,
"learning_rate": 4.531578947368421e-05,
"loss": 1.8176845550537108,
"mean_token_accuracy": 0.6842875778675079,
"num_tokens": 7197.0,
"step": 90
},
{
"entropy": 1.52265625,
"epoch": 0.21052631578947367,
"grad_norm": 6.46875,
"learning_rate": 4.478947368421053e-05,
"loss": 1.5252375602722168,
"mean_token_accuracy": 0.711300152540207,
"num_tokens": 8046.0,
"step": 100
},
{
"entropy": 1.6515625,
"epoch": 0.23157894736842105,
"grad_norm": 7.0,
"learning_rate": 4.426315789473684e-05,
"loss": 1.611851119995117,
"mean_token_accuracy": 0.7179319798946381,
"num_tokens": 8698.0,
"step": 110
},
{
"entropy": 1.54140625,
"epoch": 0.25263157894736843,
"grad_norm": 6.90625,
"learning_rate": 4.373684210526316e-05,
"loss": 1.5053813934326172,
"mean_token_accuracy": 0.7282716870307923,
"num_tokens": 9455.0,
"step": 120
},
{
"entropy": 1.5375,
"epoch": 0.2736842105263158,
"grad_norm": 7.25,
"learning_rate": 4.3210526315789475e-05,
"loss": 1.5904606819152831,
"mean_token_accuracy": 0.7234093546867371,
"num_tokens": 10191.0,
"step": 130
},
{
"entropy": 1.58984375,
"epoch": 0.29473684210526313,
"grad_norm": 9.375,
"learning_rate": 4.2684210526315795e-05,
"loss": 1.6070585250854492,
"mean_token_accuracy": 0.7228378415107727,
"num_tokens": 10859.0,
"step": 140
},
{
"entropy": 1.80234375,
"epoch": 0.3157894736842105,
"grad_norm": 8.3125,
"learning_rate": 4.215789473684211e-05,
"loss": 1.7423076629638672,
"mean_token_accuracy": 0.6761326909065246,
"num_tokens": 11846.0,
"step": 150
},
{
"entropy": 1.592578125,
"epoch": 0.3368421052631579,
"grad_norm": 5.1875,
"learning_rate": 4.163157894736842e-05,
"loss": 1.6300687789916992,
"mean_token_accuracy": 0.7145272672176362,
"num_tokens": 12722.0,
"step": 160
},
{
"entropy": 1.5953125,
"epoch": 0.35789473684210527,
"grad_norm": 6.125,
"learning_rate": 4.110526315789474e-05,
"loss": 1.44827880859375,
"mean_token_accuracy": 0.725699108839035,
"num_tokens": 13458.0,
"step": 170
},
{
"entropy": 1.8171875,
"epoch": 0.37894736842105264,
"grad_norm": 7.125,
"learning_rate": 4.0578947368421054e-05,
"loss": 1.7392475128173828,
"mean_token_accuracy": 0.6949776589870453,
"num_tokens": 14197.0,
"step": 180
},
{
"entropy": 1.78671875,
"epoch": 0.4,
"grad_norm": 7.8125,
"learning_rate": 4.0052631578947367e-05,
"loss": 1.8162834167480468,
"mean_token_accuracy": 0.6562099277973175,
"num_tokens": 15056.0,
"step": 190
},
{
"entropy": 1.77421875,
"epoch": 0.42105263157894735,
"grad_norm": 7.53125,
"learning_rate": 3.9526315789473686e-05,
"loss": 1.7153417587280273,
"mean_token_accuracy": 0.6948422849178314,
"num_tokens": 15807.0,
"step": 200
},
{
"entropy": 1.6875,
"epoch": 0.4421052631578947,
"grad_norm": 9.125,
"learning_rate": 3.9000000000000006e-05,
"loss": 1.6782726287841796,
"mean_token_accuracy": 0.7238920211791993,
"num_tokens": 16432.0,
"step": 210
},
{
"entropy": 1.446484375,
"epoch": 0.4631578947368421,
"grad_norm": 5.84375,
"learning_rate": 3.847368421052632e-05,
"loss": 1.4408910751342774,
"mean_token_accuracy": 0.7403214454650879,
"num_tokens": 17103.0,
"step": 220
},
{
"entropy": 1.76171875,
"epoch": 0.4842105263157895,
"grad_norm": 6.375,
"learning_rate": 3.794736842105263e-05,
"loss": 1.7578521728515626,
"mean_token_accuracy": 0.6848492741584777,
"num_tokens": 18038.0,
"step": 230
},
{
"entropy": 1.53515625,
"epoch": 0.5052631578947369,
"grad_norm": 6.71875,
"learning_rate": 3.742105263157895e-05,
"loss": 1.48400821685791,
"mean_token_accuracy": 0.7131125509738923,
"num_tokens": 18720.0,
"step": 240
},
{
"entropy": 1.49140625,
"epoch": 0.5263157894736842,
"grad_norm": 6.375,
"learning_rate": 3.6894736842105265e-05,
"loss": 1.5496106147766113,
"mean_token_accuracy": 0.7365909218788147,
"num_tokens": 19308.0,
"step": 250
},
{
"entropy": 1.46796875,
"epoch": 0.5473684210526316,
"grad_norm": 8.125,
"learning_rate": 3.636842105263158e-05,
"loss": 1.5240853309631348,
"mean_token_accuracy": 0.736497437953949,
"num_tokens": 19967.0,
"step": 260
},
{
"entropy": 1.578125,
"epoch": 0.5684210526315789,
"grad_norm": 8.3125,
"learning_rate": 3.58421052631579e-05,
"loss": 1.5680004119873048,
"mean_token_accuracy": 0.7332142323255539,
"num_tokens": 20769.0,
"step": 270
},
{
"entropy": 1.501953125,
"epoch": 0.5894736842105263,
"grad_norm": 8.375,
"learning_rate": 3.531578947368421e-05,
"loss": 1.5029385566711426,
"mean_token_accuracy": 0.7254173457622528,
"num_tokens": 22503.0,
"step": 280
},
{
"entropy": 1.5921875,
"epoch": 0.6105263157894737,
"grad_norm": 7.0,
"learning_rate": 3.478947368421053e-05,
"loss": 1.6090343475341797,
"mean_token_accuracy": 0.7082146763801574,
"num_tokens": 23439.0,
"step": 290
},
{
"entropy": 1.75078125,
"epoch": 0.631578947368421,
"grad_norm": 8.25,
"learning_rate": 3.426315789473684e-05,
"loss": 1.6250024795532227,
"mean_token_accuracy": 0.6883616149425507,
"num_tokens": 24161.0,
"step": 300
},
{
"entropy": 1.269140625,
"epoch": 0.6526315789473685,
"grad_norm": 7.46875,
"learning_rate": 3.373684210526316e-05,
"loss": 1.3148769378662108,
"mean_token_accuracy": 0.7618813216686249,
"num_tokens": 24716.0,
"step": 310
},
{
"entropy": 1.809375,
"epoch": 0.6736842105263158,
"grad_norm": 7.40625,
"learning_rate": 3.3210526315789476e-05,
"loss": 1.9381757736206056,
"mean_token_accuracy": 0.667089307308197,
"num_tokens": 25481.0,
"step": 320
},
{
"entropy": 1.73515625,
"epoch": 0.6947368421052632,
"grad_norm": 11.5625,
"learning_rate": 3.268421052631579e-05,
"loss": 1.76402587890625,
"mean_token_accuracy": 0.6992617845535278,
"num_tokens": 26159.0,
"step": 330
},
{
"entropy": 1.644140625,
"epoch": 0.7157894736842105,
"grad_norm": 7.625,
"learning_rate": 3.215789473684211e-05,
"loss": 1.6910707473754882,
"mean_token_accuracy": 0.6974358975887298,
"num_tokens": 26983.0,
"step": 340
},
{
"entropy": 1.44140625,
"epoch": 0.7368421052631579,
"grad_norm": 7.84375,
"learning_rate": 3.163157894736842e-05,
"loss": 1.3823105812072753,
"mean_token_accuracy": 0.7533604800701141,
"num_tokens": 27564.0,
"step": 350
},
{
"entropy": 1.41953125,
"epoch": 0.7578947368421053,
"grad_norm": 9.5,
"learning_rate": 3.1105263157894735e-05,
"loss": 1.488726806640625,
"mean_token_accuracy": 0.733670562505722,
"num_tokens": 28200.0,
"step": 360
},
{
"entropy": 1.753125,
"epoch": 0.7789473684210526,
"grad_norm": 5.9375,
"learning_rate": 3.0578947368421054e-05,
"loss": 1.8948373794555664,
"mean_token_accuracy": 0.675409197807312,
"num_tokens": 28919.0,
"step": 370
},
{
"entropy": 1.98828125,
"epoch": 0.8,
"grad_norm": 8.5,
"learning_rate": 3.005263157894737e-05,
"loss": 2.146737289428711,
"mean_token_accuracy": 0.6334406793117523,
"num_tokens": 29894.0,
"step": 380
},
{
"entropy": 1.85625,
"epoch": 0.8210526315789474,
"grad_norm": 6.3125,
"learning_rate": 2.9526315789473684e-05,
"loss": 1.858269500732422,
"mean_token_accuracy": 0.6439402669668197,
"num_tokens": 30652.0,
"step": 390
},
{
"entropy": 1.727734375,
"epoch": 0.8421052631578947,
"grad_norm": 7.96875,
"learning_rate": 2.9e-05,
"loss": 1.7343599319458007,
"mean_token_accuracy": 0.700012594461441,
"num_tokens": 31372.0,
"step": 400
},
{
"entropy": 1.65,
"epoch": 0.8631578947368421,
"grad_norm": 8.0625,
"learning_rate": 2.847368421052632e-05,
"loss": 1.6553001403808594,
"mean_token_accuracy": 0.7165175020694733,
"num_tokens": 32149.0,
"step": 410
},
{
"entropy": 1.596875,
"epoch": 0.8842105263157894,
"grad_norm": 6.59375,
"learning_rate": 2.7947368421052633e-05,
"loss": 1.5661128997802733,
"mean_token_accuracy": 0.7166097521781921,
"num_tokens": 33203.0,
"step": 420
},
{
"entropy": 1.6015625,
"epoch": 0.9052631578947369,
"grad_norm": 7.1875,
"learning_rate": 2.7421052631578946e-05,
"loss": 1.6179162979125976,
"mean_token_accuracy": 0.7093785464763641,
"num_tokens": 33889.0,
"step": 430
},
{
"entropy": 1.75546875,
"epoch": 0.9263157894736842,
"grad_norm": 7.21875,
"learning_rate": 2.6894736842105266e-05,
"loss": 1.7675729751586915,
"mean_token_accuracy": 0.6769322335720063,
"num_tokens": 34862.0,
"step": 440
},
{
"entropy": 1.92109375,
"epoch": 0.9473684210526315,
"grad_norm": 11.0625,
"learning_rate": 2.6368421052631582e-05,
"loss": 1.9548963546752929,
"mean_token_accuracy": 0.648491358757019,
"num_tokens": 35710.0,
"step": 450
},
{
"entropy": 1.699609375,
"epoch": 0.968421052631579,
"grad_norm": 6.03125,
"learning_rate": 2.5842105263157895e-05,
"loss": 1.6432403564453124,
"mean_token_accuracy": 0.6918732106685639,
"num_tokens": 36374.0,
"step": 460
},
{
"entropy": 1.84453125,
"epoch": 0.9894736842105263,
"grad_norm": 7.03125,
"learning_rate": 2.5315789473684208e-05,
"loss": 1.7591934204101562,
"mean_token_accuracy": 0.6581568241119384,
"num_tokens": 37139.0,
"step": 470
},
{
"entropy": 1.75859375,
"epoch": 1.0105263157894737,
"grad_norm": 7.15625,
"learning_rate": 2.4789473684210528e-05,
"loss": 1.7067642211914062,
"mean_token_accuracy": 0.7008034646511078,
"num_tokens": 38037.0,
"step": 480
},
{
"entropy": 1.4203125,
"epoch": 1.0315789473684212,
"grad_norm": 6.4375,
"learning_rate": 2.4263157894736844e-05,
"loss": 1.378176498413086,
"mean_token_accuracy": 0.7486959993839264,
"num_tokens": 38668.0,
"step": 490
},
{
"entropy": 1.56328125,
"epoch": 1.0526315789473684,
"grad_norm": 7.15625,
"learning_rate": 2.373684210526316e-05,
"loss": 1.4762983322143555,
"mean_token_accuracy": 0.7175065577030182,
"num_tokens": 39424.0,
"step": 500
},
{
"entropy": 1.630078125,
"epoch": 1.0736842105263158,
"grad_norm": 4.84375,
"learning_rate": 2.3210526315789473e-05,
"loss": 1.5906378746032714,
"mean_token_accuracy": 0.6876044690608978,
"num_tokens": 40317.0,
"step": 510
},
{
"entropy": 1.556640625,
"epoch": 1.0947368421052632,
"grad_norm": 6.59375,
"learning_rate": 2.268421052631579e-05,
"loss": 1.6246864318847656,
"mean_token_accuracy": 0.7021546125411987,
"num_tokens": 41101.0,
"step": 520
},
{
"entropy": 1.86015625,
"epoch": 1.1157894736842104,
"grad_norm": 6.3125,
"learning_rate": 2.2157894736842106e-05,
"loss": 1.832110595703125,
"mean_token_accuracy": 0.6659113824367523,
"num_tokens": 42080.0,
"step": 530
},
{
"entropy": 1.50859375,
"epoch": 1.1368421052631579,
"grad_norm": 4.75,
"learning_rate": 2.1631578947368423e-05,
"loss": 1.495261001586914,
"mean_token_accuracy": 0.7114485323429107,
"num_tokens": 43064.0,
"step": 540
},
{
"entropy": 1.379296875,
"epoch": 1.1578947368421053,
"grad_norm": 7.90625,
"learning_rate": 2.110526315789474e-05,
"loss": 1.3109845161437987,
"mean_token_accuracy": 0.7768619418144226,
"num_tokens": 43638.0,
"step": 550
},
{
"entropy": 1.671875,
"epoch": 1.1789473684210527,
"grad_norm": 8.1875,
"learning_rate": 2.0578947368421052e-05,
"loss": 1.7390716552734375,
"mean_token_accuracy": 0.6966245353221894,
"num_tokens": 44538.0,
"step": 560
},
{
"entropy": 1.45390625,
"epoch": 1.2,
"grad_norm": 8.5,
"learning_rate": 2.005263157894737e-05,
"loss": 1.3690235137939453,
"mean_token_accuracy": 0.742480456829071,
"num_tokens": 45159.0,
"step": 570
},
{
"entropy": 1.8234375,
"epoch": 1.2210526315789474,
"grad_norm": 6.65625,
"learning_rate": 1.9526315789473685e-05,
"loss": 1.890069580078125,
"mean_token_accuracy": 0.6900433540344239,
"num_tokens": 45918.0,
"step": 580
},
{
"entropy": 1.8640625,
"epoch": 1.2421052631578948,
"grad_norm": 8.125,
"learning_rate": 1.9e-05,
"loss": 1.7881799697875977,
"mean_token_accuracy": 0.6537608683109284,
"num_tokens": 46795.0,
"step": 590
},
{
"entropy": 1.59453125,
"epoch": 1.263157894736842,
"grad_norm": 6.5,
"learning_rate": 1.8473684210526317e-05,
"loss": 1.5872214317321778,
"mean_token_accuracy": 0.722487497329712,
"num_tokens": 47423.0,
"step": 600
},
{
"entropy": 1.78828125,
"epoch": 1.2842105263157895,
"grad_norm": 7.3125,
"learning_rate": 1.7947368421052634e-05,
"loss": 1.6725593566894532,
"mean_token_accuracy": 0.6980259001255036,
"num_tokens": 48158.0,
"step": 610
},
{
"entropy": 1.61015625,
"epoch": 1.305263157894737,
"grad_norm": 6.5,
"learning_rate": 1.742105263157895e-05,
"loss": 1.5390226364135742,
"mean_token_accuracy": 0.725999391078949,
"num_tokens": 48875.0,
"step": 620
},
{
"entropy": 1.71953125,
"epoch": 1.3263157894736843,
"grad_norm": 8.25,
"learning_rate": 1.6894736842105263e-05,
"loss": 1.6228282928466797,
"mean_token_accuracy": 0.7085169553756714,
"num_tokens": 49697.0,
"step": 630
},
{
"entropy": 1.437109375,
"epoch": 1.3473684210526315,
"grad_norm": 7.5625,
"learning_rate": 1.636842105263158e-05,
"loss": 1.4906560897827148,
"mean_token_accuracy": 0.7471937596797943,
"num_tokens": 50300.0,
"step": 640
},
{
"entropy": 1.73359375,
"epoch": 1.368421052631579,
"grad_norm": 4.375,
"learning_rate": 1.5842105263157896e-05,
"loss": 1.5986823081970214,
"mean_token_accuracy": 0.699000483751297,
"num_tokens": 51332.0,
"step": 650
},
{
"entropy": 1.2671875,
"epoch": 1.3894736842105262,
"grad_norm": 11.8125,
"learning_rate": 1.5315789473684212e-05,
"loss": 1.234378719329834,
"mean_token_accuracy": 0.7714344441890717,
"num_tokens": 51900.0,
"step": 660
},
{
"entropy": 1.90859375,
"epoch": 1.4105263157894736,
"grad_norm": 6.875,
"learning_rate": 1.4789473684210529e-05,
"loss": 1.815606689453125,
"mean_token_accuracy": 0.6656625211238861,
"num_tokens": 52883.0,
"step": 670
},
{
"entropy": 1.558203125,
"epoch": 1.431578947368421,
"grad_norm": 9.375,
"learning_rate": 1.4263157894736842e-05,
"loss": 1.4858011245727538,
"mean_token_accuracy": 0.7443289816379547,
"num_tokens": 53576.0,
"step": 680
},
{
"entropy": 1.64765625,
"epoch": 1.4526315789473685,
"grad_norm": 8.9375,
"learning_rate": 1.373684210526316e-05,
"loss": 1.534531307220459,
"mean_token_accuracy": 0.7204049170017243,
"num_tokens": 54450.0,
"step": 690
},
{
"entropy": 1.58359375,
"epoch": 1.4736842105263157,
"grad_norm": 6.46875,
"learning_rate": 1.3210526315789473e-05,
"loss": 1.5365165710449218,
"mean_token_accuracy": 0.7069519102573395,
"num_tokens": 55160.0,
"step": 700
},
{
"entropy": 1.4171875,
"epoch": 1.4947368421052631,
"grad_norm": 7.1875,
"learning_rate": 1.268421052631579e-05,
"loss": 1.307802391052246,
"mean_token_accuracy": 0.7627157270908356,
"num_tokens": 55929.0,
"step": 710
},
{
"entropy": 1.678125,
"epoch": 1.5157894736842106,
"grad_norm": 6.34375,
"learning_rate": 1.2157894736842105e-05,
"loss": 1.5420659065246582,
"mean_token_accuracy": 0.7097006201744079,
"num_tokens": 56725.0,
"step": 720
},
{
"entropy": 1.58125,
"epoch": 1.5368421052631578,
"grad_norm": 7.0625,
"learning_rate": 1.1631578947368422e-05,
"loss": 1.4044910430908204,
"mean_token_accuracy": 0.7465328335762024,
"num_tokens": 57398.0,
"step": 730
},
{
"entropy": 1.576171875,
"epoch": 1.5578947368421052,
"grad_norm": 6.03125,
"learning_rate": 1.1105263157894738e-05,
"loss": 1.605686569213867,
"mean_token_accuracy": 0.7333506286144257,
"num_tokens": 58072.0,
"step": 740
},
{
"entropy": 1.59140625,
"epoch": 1.5789473684210527,
"grad_norm": 6.03125,
"learning_rate": 1.0578947368421053e-05,
"loss": 1.4928099632263183,
"mean_token_accuracy": 0.7072650909423828,
"num_tokens": 58748.0,
"step": 750
},
{
"entropy": 1.63046875,
"epoch": 1.6,
"grad_norm": 7.1875,
"learning_rate": 1.005263157894737e-05,
"loss": 1.624325942993164,
"mean_token_accuracy": 0.690889635682106,
"num_tokens": 59658.0,
"step": 760
},
{
"entropy": 1.803125,
"epoch": 1.6210526315789475,
"grad_norm": 6.03125,
"learning_rate": 9.526315789473684e-06,
"loss": 1.7750425338745117,
"mean_token_accuracy": 0.6651369571685791,
"num_tokens": 60859.0,
"step": 770
},
{
"entropy": 1.4703125,
"epoch": 1.6421052631578947,
"grad_norm": 7.03125,
"learning_rate": 9e-06,
"loss": 1.4258437156677246,
"mean_token_accuracy": 0.7347829401493072,
"num_tokens": 62439.0,
"step": 780
},
{
"entropy": 1.671875,
"epoch": 1.663157894736842,
"grad_norm": 7.71875,
"learning_rate": 8.473684210526315e-06,
"loss": 1.721219825744629,
"mean_token_accuracy": 0.7035934925079346,
"num_tokens": 63290.0,
"step": 790
},
{
"entropy": 1.554296875,
"epoch": 1.6842105263157894,
"grad_norm": 9.0625,
"learning_rate": 7.947368421052633e-06,
"loss": 1.5237810134887695,
"mean_token_accuracy": 0.7325670003890992,
"num_tokens": 64010.0,
"step": 800
},
{
"entropy": 1.7859375,
"epoch": 1.7052631578947368,
"grad_norm": 8.5,
"learning_rate": 7.421052631578948e-06,
"loss": 1.7944671630859375,
"mean_token_accuracy": 0.683906614780426,
"num_tokens": 64966.0,
"step": 810
},
{
"entropy": 1.840625,
"epoch": 1.7263157894736842,
"grad_norm": 7.53125,
"learning_rate": 6.894736842105263e-06,
"loss": 1.787227249145508,
"mean_token_accuracy": 0.6670031368732452,
"num_tokens": 65703.0,
"step": 820
},
{
"entropy": 1.29765625,
"epoch": 1.7473684210526317,
"grad_norm": 9.0625,
"learning_rate": 6.368421052631579e-06,
"loss": 1.2813177108764648,
"mean_token_accuracy": 0.7610228896141052,
"num_tokens": 66421.0,
"step": 830
},
{
"entropy": 1.504296875,
"epoch": 1.768421052631579,
"grad_norm": 6.5,
"learning_rate": 5.842105263157895e-06,
"loss": 1.477138137817383,
"mean_token_accuracy": 0.7455608665943145,
"num_tokens": 67005.0,
"step": 840
},
{
"entropy": 1.60859375,
"epoch": 1.7894736842105263,
"grad_norm": 5.03125,
"learning_rate": 5.315789473684211e-06,
"loss": 1.6519662857055664,
"mean_token_accuracy": 0.7005816400051117,
"num_tokens": 67975.0,
"step": 850
},
{
"entropy": 1.6765625,
"epoch": 1.8105263157894735,
"grad_norm": 6.53125,
"learning_rate": 4.789473684210526e-06,
"loss": 1.6701608657836915,
"mean_token_accuracy": 0.7136963486671448,
"num_tokens": 68650.0,
"step": 860
},
{
"entropy": 1.44609375,
"epoch": 1.831578947368421,
"grad_norm": 7.1875,
"learning_rate": 4.2631578947368425e-06,
"loss": 1.3475713729858398,
"mean_token_accuracy": 0.7566476047039032,
"num_tokens": 69317.0,
"step": 870
},
{
"entropy": 1.71328125,
"epoch": 1.8526315789473684,
"grad_norm": 6.90625,
"learning_rate": 3.736842105263158e-06,
"loss": 1.7077770233154297,
"mean_token_accuracy": 0.6919207274913788,
"num_tokens": 70169.0,
"step": 880
},
{
"entropy": 1.646875,
"epoch": 1.8736842105263158,
"grad_norm": 7.8125,
"learning_rate": 3.2105263157894735e-06,
"loss": 1.5933343887329101,
"mean_token_accuracy": 0.708821702003479,
"num_tokens": 71138.0,
"step": 890
},
{
"entropy": 1.437109375,
"epoch": 1.8947368421052633,
"grad_norm": 8.625,
"learning_rate": 2.68421052631579e-06,
"loss": 1.4426955223083495,
"mean_token_accuracy": 0.7344056785106658,
"num_tokens": 71776.0,
"step": 900
},
{
"entropy": 1.591015625,
"epoch": 1.9157894736842105,
"grad_norm": 8.125,
"learning_rate": 2.1578947368421054e-06,
"loss": 1.4523811340332031,
"mean_token_accuracy": 0.7294364452362061,
"num_tokens": 72390.0,
"step": 910
},
{
"entropy": 1.48984375,
"epoch": 1.936842105263158,
"grad_norm": 8.375,
"learning_rate": 1.6315789473684212e-06,
"loss": 1.4202921867370606,
"mean_token_accuracy": 0.7283547043800354,
"num_tokens": 73109.0,
"step": 920
},
{
"entropy": 1.37578125,
"epoch": 1.9578947368421051,
"grad_norm": 7.34375,
"learning_rate": 1.1052631578947369e-06,
"loss": 1.3212156295776367,
"mean_token_accuracy": 0.765373581647873,
"num_tokens": 73665.0,
"step": 930
},
{
"entropy": 1.69140625,
"epoch": 1.9789473684210526,
"grad_norm": 7.5625,
"learning_rate": 5.789473684210527e-07,
"loss": 1.5747876167297363,
"mean_token_accuracy": 0.7353746354579925,
"num_tokens": 74320.0,
"step": 940
},
{
"entropy": 1.75390625,
"epoch": 2.0,
"grad_norm": 6.46875,
"learning_rate": 5.263157894736842e-08,
"loss": 1.7055414199829102,
"mean_token_accuracy": 0.6862038552761078,
"num_tokens": 75304.0,
"step": 950
}
],
"logging_steps": 10,
"max_steps": 950,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 207562104419328.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}