Attila1011's picture
Upload folder using huggingface_hub
90bfee7 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.07450407704020913,
"eval_steps": 1024,
"global_step": 7168,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002660859894293183,
"grad_norm": 1.2474960088729858,
"learning_rate": 4.416349151368202e-06,
"loss": 10.7959,
"step": 256
},
{
"epoch": 0.005321719788586366,
"grad_norm": 0.9909601807594299,
"learning_rate": 8.85001731901628e-06,
"loss": 10.0541,
"step": 512
},
{
"epoch": 0.00798257968287955,
"grad_norm": 0.8707857728004456,
"learning_rate": 1.3283685486664357e-05,
"loss": 9.0128,
"step": 768
},
{
"epoch": 0.010643439577172733,
"grad_norm": 0.8646675944328308,
"learning_rate": 1.7717353654312436e-05,
"loss": 8.0258,
"step": 1024
},
{
"epoch": 0.010643439577172733,
"eval_acr_loss": 0.010622835965477861,
"eval_across_var": 0.012448028806829825,
"eval_bleu": 0.09796718659972453,
"eval_ce_loss": 7.011146575212479,
"eval_cos_loss": 0.9561333861202002,
"eval_cov": 0.06968498229980469,
"eval_cov_loss": 0.007897357034380548,
"eval_glb_loss": 0.4064090773463249,
"eval_global_kurtosis": 3.1536246612668037,
"eval_global_mean": -0.0043766796588897705,
"eval_global_var": 0.26236724853515625,
"eval_krt_loss": 0.02414830235647969,
"eval_loss": 7.440777659416199,
"eval_mean_loss": 2.432924324580199e-05,
"eval_mse_loss": 1.9190465211868286,
"eval_per_loss": 0.3546135723590851,
"eval_per_var": 0.25460052490234375,
"eval_within_var": 0.2497538859024644,
"eval_wth_loss": 0.42283743154257536,
"step": 1024
},
{
"epoch": 0.010643439577172733,
"eval_acr_loss": 0.010622835965477861,
"eval_across_var": 0.012448028806829825,
"eval_bleu": 0.09796718659972453,
"eval_ce_loss": 7.011146575212479,
"eval_cos_loss": 0.9561333861202002,
"eval_cov": 0.06968498229980469,
"eval_cov_loss": 0.007897357034380548,
"eval_glb_loss": 0.4064090773463249,
"eval_global_kurtosis": 3.1536246612668037,
"eval_global_mean": -0.0043766796588897705,
"eval_global_var": 0.26236724853515625,
"eval_krt_loss": 0.02414830235647969,
"eval_loss": 7.440777659416199,
"eval_mean_loss": 2.432924324580199e-05,
"eval_mse_loss": 1.9190465211868286,
"eval_per_loss": 0.3546135723590851,
"eval_per_var": 0.25460052490234375,
"eval_runtime": 10.2924,
"eval_samples_per_second": 194.318,
"eval_steps_per_second": 3.109,
"eval_within_var": 0.2497538859024644,
"eval_wth_loss": 0.42283743154257536,
"step": 1024
},
{
"epoch": 0.013304299471465915,
"grad_norm": 0.8640124797821045,
"learning_rate": 2.2151021821960514e-05,
"loss": 7.0717,
"step": 1280
},
{
"epoch": 0.0159651593657591,
"grad_norm": 0.7730758190155029,
"learning_rate": 2.6584689989608592e-05,
"loss": 6.1186,
"step": 1536
},
{
"epoch": 0.018626019260052282,
"grad_norm": 0.6963288187980652,
"learning_rate": 3.1018358157256674e-05,
"loss": 5.2034,
"step": 1792
},
{
"epoch": 0.021286879154345465,
"grad_norm": 0.5616676211357117,
"learning_rate": 3.5452026324904745e-05,
"loss": 4.3736,
"step": 2048
},
{
"epoch": 0.021286879154345465,
"eval_acr_loss": 0.012247161677805707,
"eval_across_var": 0.012088194896932691,
"eval_bleu": 0.3414767015551533,
"eval_ce_loss": 3.459592819213867,
"eval_cos_loss": 0.9247305598109961,
"eval_cov": 0.0738067626953125,
"eval_cov_loss": 0.008805307501461357,
"eval_glb_loss": 0.354451559484005,
"eval_global_kurtosis": 3.0689163729548454,
"eval_global_mean": -0.004139065742492676,
"eval_global_var": 0.30448150634765625,
"eval_krt_loss": 0.005096593113194103,
"eval_loss": 3.872511200606823,
"eval_mean_loss": 2.0987964205687604e-05,
"eval_mse_loss": 1.9079551436007023,
"eval_per_loss": 0.3076172471046448,
"eval_per_var": 0.29547882080078125,
"eval_within_var": 0.29124921560287476,
"eval_wth_loss": 0.3706007469445467,
"step": 2048
},
{
"epoch": 0.021286879154345465,
"eval_acr_loss": 0.012247161677805707,
"eval_across_var": 0.012088194896932691,
"eval_bleu": 0.3414767015551533,
"eval_ce_loss": 3.459592819213867,
"eval_cos_loss": 0.9247305598109961,
"eval_cov": 0.0738067626953125,
"eval_cov_loss": 0.008805307501461357,
"eval_glb_loss": 0.354451559484005,
"eval_global_kurtosis": 3.0689163729548454,
"eval_global_mean": -0.004139065742492676,
"eval_global_var": 0.30448150634765625,
"eval_krt_loss": 0.005096593113194103,
"eval_loss": 3.872511200606823,
"eval_mean_loss": 2.0987964205687604e-05,
"eval_mse_loss": 1.9079551436007023,
"eval_per_loss": 0.3076172471046448,
"eval_per_var": 0.29547882080078125,
"eval_runtime": 9.9029,
"eval_samples_per_second": 201.962,
"eval_steps_per_second": 3.231,
"eval_within_var": 0.29124921560287476,
"eval_wth_loss": 0.3706007469445467,
"step": 2048
},
{
"epoch": 0.023947739048638648,
"grad_norm": 0.44369781017303467,
"learning_rate": 3.988569449255283e-05,
"loss": 3.6681,
"step": 2304
},
{
"epoch": 0.02660859894293183,
"grad_norm": 0.3687000274658203,
"learning_rate": 4.43193626602009e-05,
"loss": 3.0886,
"step": 2560
},
{
"epoch": 0.029269458837225013,
"grad_norm": 0.3568866550922394,
"learning_rate": 4.875303082784898e-05,
"loss": 2.6049,
"step": 2816
},
{
"epoch": 0.0319303187315182,
"grad_norm": 0.2986361086368561,
"learning_rate": 4.9999520413849384e-05,
"loss": 2.2063,
"step": 3072
},
{
"epoch": 0.0319303187315182,
"eval_acr_loss": 0.011944463331019506,
"eval_across_var": 0.025048962590517476,
"eval_bleu": 0.5783938497071468,
"eval_ce_loss": 1.5567151941359043,
"eval_cos_loss": 0.8373732026666403,
"eval_cov": 0.10790634155273438,
"eval_cov_loss": 0.01786720016389154,
"eval_glb_loss": 0.08421005308628082,
"eval_global_kurtosis": 3.042153775691986,
"eval_global_mean": -0.0013459473848342896,
"eval_global_var": 0.60980224609375,
"eval_krt_loss": 0.0020023469523948734,
"eval_loss": 1.9125033244490623,
"eval_mean_loss": 9.189085614202952e-06,
"eval_mse_loss": 1.798950683325529,
"eval_per_loss": 0.06572123290970922,
"eval_per_var": 0.59381103515625,
"eval_within_var": 0.5743193719536066,
"eval_wth_loss": 0.10620259935967624,
"step": 3072
},
{
"epoch": 0.0319303187315182,
"eval_acr_loss": 0.011944463331019506,
"eval_across_var": 0.025048962590517476,
"eval_bleu": 0.5783938497071468,
"eval_ce_loss": 1.5567151941359043,
"eval_cos_loss": 0.8373732026666403,
"eval_cov": 0.10790634155273438,
"eval_cov_loss": 0.01786720016389154,
"eval_glb_loss": 0.08421005308628082,
"eval_global_kurtosis": 3.042153775691986,
"eval_global_mean": -0.0013459473848342896,
"eval_global_var": 0.60980224609375,
"eval_krt_loss": 0.0020023469523948734,
"eval_loss": 1.9125033244490623,
"eval_mean_loss": 9.189085614202952e-06,
"eval_mse_loss": 1.798950683325529,
"eval_per_loss": 0.06572123290970922,
"eval_per_var": 0.59381103515625,
"eval_runtime": 10.4973,
"eval_samples_per_second": 190.525,
"eval_steps_per_second": 3.048,
"eval_within_var": 0.5743193719536066,
"eval_wth_loss": 0.10620259935967624,
"step": 3072
},
{
"epoch": 0.03459117862581138,
"grad_norm": 0.270656943321228,
"learning_rate": 4.9997257606389056e-05,
"loss": 1.8881,
"step": 3328
},
{
"epoch": 0.037252038520104565,
"grad_norm": 0.24188651144504547,
"learning_rate": 4.999313831167736e-05,
"loss": 1.6388,
"step": 3584
},
{
"epoch": 0.03991289841439775,
"grad_norm": 0.2294900268316269,
"learning_rate": 4.998716283564454e-05,
"loss": 1.4382,
"step": 3840
},
{
"epoch": 0.04257375830869093,
"grad_norm": 0.20773501694202423,
"learning_rate": 4.99793316220751e-05,
"loss": 1.2713,
"step": 4096
},
{
"epoch": 0.04257375830869093,
"eval_acr_loss": 0.011318061951897107,
"eval_across_var": 0.037467821151949465,
"eval_bleu": 0.7398917090811331,
"eval_ce_loss": 0.8147697541862726,
"eval_cos_loss": 0.739902313798666,
"eval_cov": 0.09920120239257812,
"eval_cov_loss": 0.015144521807087585,
"eval_glb_loss": 0.0026292089896742254,
"eval_global_kurtosis": 3.0431209057569504,
"eval_global_mean": 0.0002828165888786316,
"eval_global_var": 0.849639892578125,
"eval_krt_loss": 0.002429395680081825,
"eval_loss": 1.1247494276612997,
"eval_mean_loss": 1.0841616662204956e-05,
"eval_mse_loss": 1.6548683494329453,
"eval_per_loss": 0.0006620931362704141,
"eval_per_var": 0.826202392578125,
"eval_within_var": 0.8049256391823292,
"eval_wth_loss": 0.009282346058171242,
"step": 4096
},
{
"epoch": 0.04257375830869093,
"eval_acr_loss": 0.011318061951897107,
"eval_across_var": 0.037467821151949465,
"eval_bleu": 0.7398917090811331,
"eval_ce_loss": 0.8147697541862726,
"eval_cos_loss": 0.739902313798666,
"eval_cov": 0.09920120239257812,
"eval_cov_loss": 0.015144521807087585,
"eval_glb_loss": 0.0026292089896742254,
"eval_global_kurtosis": 3.0431209057569504,
"eval_global_mean": 0.0002828165888786316,
"eval_global_var": 0.849639892578125,
"eval_krt_loss": 0.002429395680081825,
"eval_loss": 1.1247494276612997,
"eval_mean_loss": 1.0841616662204956e-05,
"eval_mse_loss": 1.6548683494329453,
"eval_per_loss": 0.0006620931362704141,
"eval_per_var": 0.826202392578125,
"eval_runtime": 10.0939,
"eval_samples_per_second": 198.14,
"eval_steps_per_second": 3.17,
"eval_within_var": 0.8049256391823292,
"eval_wth_loss": 0.009282346058171242,
"step": 4096
},
{
"epoch": 0.04523461820298411,
"grad_norm": 0.1941945105791092,
"learning_rate": 4.996964525257477e-05,
"loss": 1.1364,
"step": 4352
},
{
"epoch": 0.047895478097277296,
"grad_norm": 0.17706365883350372,
"learning_rate": 4.995810444652731e-05,
"loss": 1.0202,
"step": 4608
},
{
"epoch": 0.05055633799157048,
"grad_norm": 0.17764592170715332,
"learning_rate": 4.994471006104112e-05,
"loss": 0.9256,
"step": 4864
},
{
"epoch": 0.05321719788586366,
"grad_norm": 0.1597519963979721,
"learning_rate": 4.992946309088557e-05,
"loss": 0.8433,
"step": 5120
},
{
"epoch": 0.05321719788586366,
"eval_acr_loss": 0.010796478512929752,
"eval_across_var": 0.0437286015949212,
"eval_bleu": 0.8359253777154618,
"eval_ce_loss": 0.4896330190822482,
"eval_cos_loss": 0.6560099385678768,
"eval_cov": 0.08585166931152344,
"eval_cov_loss": 0.011597162316320464,
"eval_glb_loss": 0.0,
"eval_global_kurtosis": 3.051339641213417,
"eval_global_mean": 0.0004043206572532654,
"eval_global_var": 0.9364166259765625,
"eval_krt_loss": 0.0034133701161636054,
"eval_loss": 0.7659010197967291,
"eval_mean_loss": 1.1450480416286268e-05,
"eval_mse_loss": 1.5238465368747711,
"eval_per_loss": 0.0,
"eval_per_var": 0.9102935791015625,
"eval_within_var": 0.8955719340592623,
"eval_wth_loss": 0.00021194279955238926,
"step": 5120
},
{
"epoch": 0.05321719788586366,
"eval_acr_loss": 0.010796478512929752,
"eval_across_var": 0.0437286015949212,
"eval_bleu": 0.8359253777154618,
"eval_ce_loss": 0.4896330190822482,
"eval_cos_loss": 0.6560099385678768,
"eval_cov": 0.08585166931152344,
"eval_cov_loss": 0.011597162316320464,
"eval_glb_loss": 0.0,
"eval_global_kurtosis": 3.051339641213417,
"eval_global_mean": 0.0004043206572532654,
"eval_global_var": 0.9364166259765625,
"eval_krt_loss": 0.0034133701161636054,
"eval_loss": 0.7659010197967291,
"eval_mean_loss": 1.1450480416286268e-05,
"eval_mse_loss": 1.5238465368747711,
"eval_per_loss": 0.0,
"eval_per_var": 0.9102935791015625,
"eval_runtime": 10.0333,
"eval_samples_per_second": 199.336,
"eval_steps_per_second": 3.189,
"eval_within_var": 0.8955719340592623,
"eval_wth_loss": 0.00021194279955238926,
"step": 5120
},
{
"epoch": 0.055878057780156844,
"grad_norm": 0.15128253400325775,
"learning_rate": 4.991236466841708e-05,
"loss": 0.7748,
"step": 5376
},
{
"epoch": 0.058538917674450026,
"grad_norm": 0.15075387060642242,
"learning_rate": 4.989341606349509e-05,
"loss": 0.7149,
"step": 5632
},
{
"epoch": 0.06119977756874321,
"grad_norm": 0.13722559809684753,
"learning_rate": 4.987261868338772e-05,
"loss": 0.6633,
"step": 5888
},
{
"epoch": 0.0638606374630364,
"grad_norm": 0.14299507439136505,
"learning_rate": 4.9849974072667235e-05,
"loss": 0.6168,
"step": 6144
},
{
"epoch": 0.0638606374630364,
"eval_acr_loss": 0.010568196172243915,
"eval_across_var": 0.050391704426147044,
"eval_bleu": 0.8864417334039504,
"eval_ce_loss": 0.3192982799373567,
"eval_cos_loss": 0.5848112031817436,
"eval_cov": 0.08610343933105469,
"eval_cov_loss": 0.011645367194432765,
"eval_glb_loss": 0.0,
"eval_global_kurtosis": 3.057781808078289,
"eval_global_mean": 0.00010520219802856445,
"eval_global_var": 1.05322265625,
"eval_krt_loss": 0.00413643000592856,
"eval_loss": 0.5672316299751401,
"eval_mean_loss": 1.1898590268621945e-05,
"eval_mse_loss": 1.4093649201095104,
"eval_per_loss": 0.0,
"eval_per_var": 1.0248565673828125,
"eval_within_var": 1.0096650514751673,
"eval_wth_loss": 0.0,
"step": 6144
},
{
"epoch": 0.0638606374630364,
"eval_acr_loss": 0.010568196172243915,
"eval_across_var": 0.050391704426147044,
"eval_bleu": 0.8864417334039504,
"eval_ce_loss": 0.3192982799373567,
"eval_cos_loss": 0.5848112031817436,
"eval_cov": 0.08610343933105469,
"eval_cov_loss": 0.011645367194432765,
"eval_glb_loss": 0.0,
"eval_global_kurtosis": 3.057781808078289,
"eval_global_mean": 0.00010520219802856445,
"eval_global_var": 1.05322265625,
"eval_krt_loss": 0.00413643000592856,
"eval_loss": 0.5672316299751401,
"eval_mean_loss": 1.1898590268621945e-05,
"eval_mse_loss": 1.4093649201095104,
"eval_per_loss": 0.0,
"eval_per_var": 1.0248565673828125,
"eval_runtime": 10.495,
"eval_samples_per_second": 190.567,
"eval_steps_per_second": 3.049,
"eval_within_var": 1.0096650514751673,
"eval_wth_loss": 0.0,
"step": 6144
},
{
"epoch": 0.06652149735732958,
"grad_norm": 0.13175231218338013,
"learning_rate": 4.9825483913095364e-05,
"loss": 0.5727,
"step": 6400
},
{
"epoch": 0.06918235725162276,
"grad_norm": 0.130602166056633,
"learning_rate": 4.979915002349838e-05,
"loss": 0.5411,
"step": 6656
},
{
"epoch": 0.07184321714591595,
"grad_norm": 0.12843571603298187,
"learning_rate": 4.977097435963204e-05,
"loss": 0.5082,
"step": 6912
},
{
"epoch": 0.07450407704020913,
"grad_norm": 0.1221570074558258,
"learning_rate": 4.974095901403632e-05,
"loss": 0.4775,
"step": 7168
},
{
"epoch": 0.07450407704020913,
"eval_acr_loss": 0.01032613120332826,
"eval_across_var": 0.055548187578096986,
"eval_bleu": 0.917825685067053,
"eval_ce_loss": 0.22222592495381832,
"eval_cos_loss": 0.5258319452404976,
"eval_cov": 0.0838165283203125,
"eval_cov_loss": 0.01109178303158842,
"eval_glb_loss": 0.0011626811420910599,
"eval_global_kurtosis": 3.0560965314507484,
"eval_global_mean": -0.00032412633299827576,
"eval_global_var": 1.1317138671875,
"eval_krt_loss": 0.003975647037577801,
"eval_loss": 0.4466686090454459,
"eval_mean_loss": 1.2274138283974168e-05,
"eval_mse_loss": 1.3142655715346336,
"eval_per_loss": 0.0,
"eval_per_var": 1.1024169921875,
"eval_within_var": 1.088273286819458,
"eval_wth_loss": 4.5452433568016204e-05,
"step": 7168
},
{
"epoch": 0.07450407704020913,
"eval_acr_loss": 0.01032613120332826,
"eval_across_var": 0.055548187578096986,
"eval_bleu": 0.917825685067053,
"eval_ce_loss": 0.22222592495381832,
"eval_cos_loss": 0.5258319452404976,
"eval_cov": 0.0838165283203125,
"eval_cov_loss": 0.01109178303158842,
"eval_glb_loss": 0.0011626811420910599,
"eval_global_kurtosis": 3.0560965314507484,
"eval_global_mean": -0.00032412633299827576,
"eval_global_var": 1.1317138671875,
"eval_krt_loss": 0.003975647037577801,
"eval_loss": 0.4466686090454459,
"eval_mean_loss": 1.2274138283974168e-05,
"eval_mse_loss": 1.3142655715346336,
"eval_per_loss": 0.0,
"eval_per_var": 1.1024169921875,
"eval_runtime": 10.2975,
"eval_samples_per_second": 194.222,
"eval_steps_per_second": 3.108,
"eval_within_var": 1.088273286819458,
"eval_wth_loss": 4.5452433568016204e-05,
"step": 7168
}
],
"logging_steps": 256,
"max_steps": 96210,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1024,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}