loris3's picture
Upload folder using huggingface_hub
7f7ce2f verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.753886629992825,
"eval_steps": 500,
"global_step": 2858,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.09567089213106912,
"grad_norm": 19.851242065429688,
"learning_rate": 0.00011666666666666667,
"loss": 121.1525,
"step": 50
},
{
"epoch": 0.19134178426213824,
"grad_norm": 13.892197608947754,
"learning_rate": 0.00023333333333333333,
"loss": 77.1477,
"step": 100
},
{
"epoch": 0.2870126763932074,
"grad_norm": 12.60777759552002,
"learning_rate": 0.00035,
"loss": 64.8839,
"step": 150
},
{
"epoch": 0.3826835685242765,
"grad_norm": 8.890196800231934,
"learning_rate": 0.00046666666666666666,
"loss": 59.8849,
"step": 200
},
{
"epoch": 0.4783544606553456,
"grad_norm": 8.705379486083984,
"learning_rate": 0.0005833333333333334,
"loss": 57.5299,
"step": 250
},
{
"epoch": 0.5740253527864148,
"grad_norm": 7.457199573516846,
"learning_rate": 0.0007,
"loss": 56.2806,
"step": 300
},
{
"epoch": 0.6696962449174838,
"grad_norm": 7.943981647491455,
"learning_rate": 0.000699821634561209,
"loss": 55.3251,
"step": 350
},
{
"epoch": 0.765367137048553,
"grad_norm": 7.716305732727051,
"learning_rate": 0.0006992867200404345,
"loss": 52.4636,
"step": 400
},
{
"epoch": 0.8610380291796221,
"grad_norm": 7.123641014099121,
"learning_rate": 0.0006983958016391807,
"loss": 52.215,
"step": 450
},
{
"epoch": 0.9567089213106912,
"grad_norm": 7.223691463470459,
"learning_rate": 0.0006971497874091708,
"loss": 52.1157,
"step": 500
},
{
"epoch": 0.9988041138483617,
"eval_accuracy": 0.0,
"eval_loss": 8.116789817810059,
"eval_normalizer": 685885.0,
"eval_runtime": 103.9352,
"eval_samples_per_second": 514.33,
"eval_steps_per_second": 1.01,
"step": 522
},
{
"epoch": 1.0535756995933987,
"grad_norm": 7.065810203552246,
"learning_rate": 0.0006955499473268326,
"loss": 50.0351,
"step": 550
},
{
"epoch": 1.1492465917244679,
"grad_norm": 7.027311325073242,
"learning_rate": 0.0006935979119988993,
"loss": 48.6995,
"step": 600
},
{
"epoch": 1.244917483855537,
"grad_norm": 6.899056911468506,
"learning_rate": 0.0006912956710004438,
"loss": 48.9226,
"step": 650
},
{
"epoch": 1.3405883759866062,
"grad_norm": 7.2765116691589355,
"learning_rate": 0.0006886455708470427,
"loss": 48.9019,
"step": 700
},
{
"epoch": 1.4362592681176751,
"grad_norm": 8.293883323669434,
"learning_rate": 0.0006856503126031346,
"loss": 44.5473,
"step": 750
},
{
"epoch": 1.5319301602487443,
"grad_norm": 7.498640537261963,
"learning_rate": 0.0006823129491290102,
"loss": 45.238,
"step": 800
},
{
"epoch": 1.6276010523798135,
"grad_norm": 7.582442283630371,
"learning_rate": 0.0006786368819692442,
"loss": 45.6905,
"step": 850
},
{
"epoch": 1.7232719445108826,
"grad_norm": 8.016205787658691,
"learning_rate": 0.0006746258578857331,
"loss": 42.9074,
"step": 900
},
{
"epoch": 1.8189428366419516,
"grad_norm": 8.324728012084961,
"learning_rate": 0.000670283965038881,
"loss": 40.5995,
"step": 950
},
{
"epoch": 1.9146137287730207,
"grad_norm": 8.61531925201416,
"learning_rate": 0.0006656156288208179,
"loss": 41.5048,
"step": 1000
},
{
"epoch": 1.9988041138483617,
"eval_accuracy": 0.0,
"eval_loss": 8.638134956359863,
"eval_normalizer": 685885.0,
"eval_runtime": 104.5141,
"eval_samples_per_second": 511.481,
"eval_steps_per_second": 1.005,
"step": 1044
},
{
"epoch": 2.0114805070557282,
"grad_norm": 119.22761535644531,
"learning_rate": 0.000660625607344904,
"loss": 52.7863,
"step": 1050
},
{
"epoch": 2.1071513991867974,
"grad_norm": 7.394373893737793,
"learning_rate": 0.0006553189865961112,
"loss": 87.7065,
"step": 1100
},
{
"epoch": 2.2028222913178666,
"grad_norm": 5.8150200843811035,
"learning_rate": 0.0006497011752472301,
"loss": 68.8677,
"step": 1150
},
{
"epoch": 2.2984931834489357,
"grad_norm": 8.597038269042969,
"learning_rate": 0.0006437778991461825,
"loss": 59.723,
"step": 1200
},
{
"epoch": 2.3597225544128198,
"eval_accuracy": 0.0,
"eval_loss": 5.760004997253418,
"eval_normalizer": 685885.0,
"eval_runtime": 102.8572,
"eval_samples_per_second": 519.721,
"eval_steps_per_second": 1.021,
"step": 1232
},
{
"epoch": 3.0344415211671847,
"grad_norm": 9.546870231628418,
"learning_rate": 0.0006375551954800587,
"loss": 49.7983,
"step": 1250
},
{
"epoch": 3.130112413298254,
"grad_norm": 11.537946701049805,
"learning_rate": 0.0006310394066218296,
"loss": 38.324,
"step": 1300
},
{
"epoch": 3.225783305429323,
"grad_norm": 11.162705421447754,
"learning_rate": 0.0006242371736660025,
"loss": 27.47,
"step": 1350
},
{
"epoch": 3.321454197560392,
"grad_norm": 12.32886791229248,
"learning_rate": 0.000617155429659811,
"loss": 18.7372,
"step": 1400
},
{
"epoch": 3.3597225544128198,
"eval_accuracy": 0.0,
"eval_loss": 7.427099227905273,
"eval_normalizer": 685885.0,
"eval_runtime": 110.0904,
"eval_samples_per_second": 485.573,
"eval_steps_per_second": 0.954,
"step": 1420
},
{
"epoch": 4.057402535278642,
"grad_norm": 13.549798011779785,
"learning_rate": 0.0006098013925368385,
"loss": 57.3773,
"step": 1450
},
{
"epoch": 4.15307342740971,
"grad_norm": 10.063131332397461,
"learning_rate": 0.0006021825577602754,
"loss": 71.4705,
"step": 1500
},
{
"epoch": 4.24874431954078,
"grad_norm": 9.314085006713867,
"learning_rate": 0.0005943066906833104,
"loss": 69.1017,
"step": 1550
},
{
"epoch": 4.344415211671849,
"grad_norm": 8.48709774017334,
"learning_rate": 0.0005861818186344407,
"loss": 66.7627,
"step": 1600
},
{
"epoch": 4.440086103802918,
"grad_norm": 9.457844734191895,
"learning_rate": 0.00057781622273577,
"loss": 62.5878,
"step": 1650
},
{
"epoch": 4.535756995933987,
"grad_norm": 8.216409683227539,
"learning_rate": 0.0005692184294626307,
"loss": 63.0275,
"step": 1700
},
{
"epoch": 4.631427888065057,
"grad_norm": 8.80875301361084,
"learning_rate": 0.0005603972019531362,
"loss": 62.8742,
"step": 1750
},
{
"epoch": 4.727098780196125,
"grad_norm": 9.469932556152344,
"learning_rate": 0.0005513615310765172,
"loss": 56.6002,
"step": 1800
},
{
"epoch": 4.822769672327194,
"grad_norm": 9.208184242248535,
"learning_rate": 0.0005421206262693491,
"loss": 55.5842,
"step": 1850
},
{
"epoch": 4.918440564458264,
"grad_norm": 9.542802810668945,
"learning_rate": 0.0005326839061490078,
"loss": 56.4509,
"step": 1900
},
{
"epoch": 4.998804113848362,
"eval_accuracy": 0.0,
"eval_loss": 5.560527801513672,
"eval_normalizer": 685885.0,
"eval_runtime": 102.9429,
"eval_samples_per_second": 519.288,
"eval_steps_per_second": 1.02,
"step": 1942
},
{
"epoch": 5.015307342740971,
"grad_norm": 10.22252082824707,
"learning_rate": 0.0005230609889139216,
"loss": 52.8616,
"step": 1950
},
{
"epoch": 5.11097823487204,
"grad_norm": 11.31312370300293,
"learning_rate": 0.0005132616825404055,
"loss": 45.5469,
"step": 2000
},
{
"epoch": 5.206649127003109,
"grad_norm": 11.319524765014648,
"learning_rate": 0.0005032959747860662,
"loss": 47.203,
"step": 2050
},
{
"epoch": 5.302320019134179,
"grad_norm": 11.748647689819336,
"learning_rate": 0.000493174023009969,
"loss": 48.3289,
"step": 2100
},
{
"epoch": 5.397990911265247,
"grad_norm": 12.092212677001953,
"learning_rate": 0.00048290614381994235,
"loss": 39.7512,
"step": 2150
},
{
"epoch": 5.493661803396317,
"grad_norm": 12.723077774047852,
"learning_rate": 0.00047250280255757023,
"loss": 38.0036,
"step": 2200
},
{
"epoch": 5.589332695527386,
"grad_norm": 13.034537315368652,
"learning_rate": 0.0004619746026315906,
"loss": 39.4596,
"step": 2250
},
{
"epoch": 5.685003587658455,
"grad_norm": 11.695505142211914,
"learning_rate": 0.00045133227471057203,
"loss": 36.7652,
"step": 2300
},
{
"epoch": 5.780674479789524,
"grad_norm": 12.36651611328125,
"learning_rate": 0.00044058666578588224,
"loss": 30.5917,
"step": 2350
},
{
"epoch": 5.876345371920593,
"grad_norm": 12.983572006225586,
"learning_rate": 0.0004297487281160982,
"loss": 32.1428,
"step": 2400
},
{
"epoch": 5.972016264051662,
"grad_norm": 13.80376148223877,
"learning_rate": 0.00041882950806412285,
"loss": 33.0843,
"step": 2450
},
{
"epoch": 5.998804113848362,
"eval_accuracy": 0.0,
"eval_loss": 6.923346042633057,
"eval_normalizer": 685885.0,
"eval_runtime": 103.7481,
"eval_samples_per_second": 515.258,
"eval_steps_per_second": 1.012,
"step": 2464
},
{
"epoch": 6.068883042334369,
"grad_norm": 9.631818771362305,
"learning_rate": 0.0004078401348383897,
"loss": 65.9342,
"step": 2500
},
{
"epoch": 6.164553934465439,
"grad_norm": 7.13682746887207,
"learning_rate": 0.00039679180914962693,
"loss": 70.9535,
"step": 2550
},
{
"epoch": 6.260224826596508,
"grad_norm": 7.001780033111572,
"learning_rate": 0.00038569579179474536,
"loss": 65.2834,
"step": 2600
},
{
"epoch": 6.355895718727577,
"grad_norm": 8.191680908203125,
"learning_rate": 0.00037456339217948394,
"loss": 60.6176,
"step": 2650
},
{
"epoch": 6.451566610858646,
"grad_norm": 8.527460098266602,
"learning_rate": 0.0003634059567915124,
"loss": 57.9774,
"step": 2700
},
{
"epoch": 6.547237502989716,
"grad_norm": 10.255255699157715,
"learning_rate": 0.00035223485763573775,
"loss": 48.2276,
"step": 2750
},
{
"epoch": 6.642908395120784,
"grad_norm": 11.824259757995605,
"learning_rate": 0.00034106148064360405,
"loss": 44.7545,
"step": 2800
},
{
"epoch": 6.738579287251854,
"grad_norm": 12.938841819763184,
"learning_rate": 0.0003298972140681969,
"loss": 39.2877,
"step": 2850
},
{
"epoch": 6.753886629992825,
"eval_accuracy": 0.0,
"eval_loss": 5.779562950134277,
"eval_normalizer": 685885.0,
"eval_runtime": 104.2115,
"eval_samples_per_second": 512.967,
"eval_steps_per_second": 1.008,
"step": 2858
}
],
"logging_steps": 50,
"max_steps": 5220,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.38189330093718e+17,
"train_batch_size": 128,
"trial_name": null,
"trial_params": null
}