| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 50, |
| "global_step": 663, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.04524886877828054, |
| "grad_norm": 5.21430778503418, |
| "learning_rate": 4.9321266968325794e-05, |
| "loss": 2.927, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.09049773755656108, |
| "grad_norm": 3.602487087249756, |
| "learning_rate": 4.856711915535445e-05, |
| "loss": 0.2562, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.13574660633484162, |
| "grad_norm": 3.16632080078125, |
| "learning_rate": 4.781297134238311e-05, |
| "loss": 0.1797, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.18099547511312217, |
| "grad_norm": 1.254840612411499, |
| "learning_rate": 4.705882352941177e-05, |
| "loss": 0.1615, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.22624434389140272, |
| "grad_norm": 1.1034722328186035, |
| "learning_rate": 4.6304675716440425e-05, |
| "loss": 0.1336, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.22624434389140272, |
| "eval_loss": 0.1453334242105484, |
| "eval_runtime": 19.792, |
| "eval_samples_per_second": 44.563, |
| "eval_steps_per_second": 1.415, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.27149321266968324, |
| "grad_norm": 1.1389213800430298, |
| "learning_rate": 4.555052790346908e-05, |
| "loss": 0.1375, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.3167420814479638, |
| "grad_norm": 1.7688931226730347, |
| "learning_rate": 4.479638009049774e-05, |
| "loss": 0.1198, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.36199095022624433, |
| "grad_norm": 0.8274220824241638, |
| "learning_rate": 4.40422322775264e-05, |
| "loss": 0.097, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.4072398190045249, |
| "grad_norm": 1.46064293384552, |
| "learning_rate": 4.328808446455506e-05, |
| "loss": 0.1155, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.45248868778280543, |
| "grad_norm": 1.8560250997543335, |
| "learning_rate": 4.2533936651583714e-05, |
| "loss": 0.1168, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.45248868778280543, |
| "eval_loss": 0.13205984234809875, |
| "eval_runtime": 19.7942, |
| "eval_samples_per_second": 44.559, |
| "eval_steps_per_second": 1.415, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.497737556561086, |
| "grad_norm": 0.6780909895896912, |
| "learning_rate": 4.177978883861237e-05, |
| "loss": 0.1156, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.5429864253393665, |
| "grad_norm": 1.2922134399414062, |
| "learning_rate": 4.1025641025641023e-05, |
| "loss": 0.1099, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.5882352941176471, |
| "grad_norm": 0.5779380798339844, |
| "learning_rate": 4.027149321266969e-05, |
| "loss": 0.1054, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.6334841628959276, |
| "grad_norm": 1.124670386314392, |
| "learning_rate": 3.951734539969834e-05, |
| "loss": 0.1158, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.6787330316742082, |
| "grad_norm": 0.8543263077735901, |
| "learning_rate": 3.8763197586727004e-05, |
| "loss": 0.1013, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.6787330316742082, |
| "eval_loss": 0.11915399879217148, |
| "eval_runtime": 19.7455, |
| "eval_samples_per_second": 44.668, |
| "eval_steps_per_second": 1.418, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.7239819004524887, |
| "grad_norm": 1.6712355613708496, |
| "learning_rate": 3.8009049773755655e-05, |
| "loss": 0.1095, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.7692307692307693, |
| "grad_norm": 1.8462319374084473, |
| "learning_rate": 3.725490196078432e-05, |
| "loss": 0.1032, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.8144796380090498, |
| "grad_norm": 1.3506959676742554, |
| "learning_rate": 3.650075414781297e-05, |
| "loss": 0.0801, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.8597285067873304, |
| "grad_norm": 1.9755982160568237, |
| "learning_rate": 3.574660633484163e-05, |
| "loss": 0.1174, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.9049773755656109, |
| "grad_norm": 0.8696920275688171, |
| "learning_rate": 3.4992458521870286e-05, |
| "loss": 0.0995, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.9049773755656109, |
| "eval_loss": 0.09578042477369308, |
| "eval_runtime": 19.7545, |
| "eval_samples_per_second": 44.648, |
| "eval_steps_per_second": 1.417, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.9502262443438914, |
| "grad_norm": 0.7832978963851929, |
| "learning_rate": 3.4238310708898944e-05, |
| "loss": 0.0921, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.995475113122172, |
| "grad_norm": 2.3884148597717285, |
| "learning_rate": 3.34841628959276e-05, |
| "loss": 0.0944, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.0407239819004526, |
| "grad_norm": 2.8667216300964355, |
| "learning_rate": 3.273001508295626e-05, |
| "loss": 0.0677, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.085972850678733, |
| "grad_norm": 1.0837510824203491, |
| "learning_rate": 3.197586726998492e-05, |
| "loss": 0.0705, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.1312217194570136, |
| "grad_norm": 1.6083077192306519, |
| "learning_rate": 3.1221719457013576e-05, |
| "loss": 0.0599, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.1312217194570136, |
| "eval_loss": 0.09875330328941345, |
| "eval_runtime": 19.7671, |
| "eval_samples_per_second": 44.62, |
| "eval_steps_per_second": 1.416, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.1764705882352942, |
| "grad_norm": 0.8606009483337402, |
| "learning_rate": 3.046757164404223e-05, |
| "loss": 0.0563, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.2217194570135748, |
| "grad_norm": 1.5137906074523926, |
| "learning_rate": 2.971342383107089e-05, |
| "loss": 0.0774, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.2669683257918551, |
| "grad_norm": 3.468083381652832, |
| "learning_rate": 2.8959276018099553e-05, |
| "loss": 0.0636, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.3122171945701357, |
| "grad_norm": 2.216883659362793, |
| "learning_rate": 2.8205128205128207e-05, |
| "loss": 0.0578, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.3574660633484164, |
| "grad_norm": 0.9391270875930786, |
| "learning_rate": 2.7450980392156865e-05, |
| "loss": 0.0688, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.3574660633484164, |
| "eval_loss": 0.10202794522047043, |
| "eval_runtime": 19.7869, |
| "eval_samples_per_second": 44.575, |
| "eval_steps_per_second": 1.415, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.4027149321266967, |
| "grad_norm": 1.7586933374404907, |
| "learning_rate": 2.6696832579185523e-05, |
| "loss": 0.0762, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.4479638009049773, |
| "grad_norm": 1.774359941482544, |
| "learning_rate": 2.594268476621418e-05, |
| "loss": 0.0513, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.493212669683258, |
| "grad_norm": 1.3426671028137207, |
| "learning_rate": 2.5188536953242835e-05, |
| "loss": 0.0694, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.5384615384615383, |
| "grad_norm": 2.611431121826172, |
| "learning_rate": 2.4434389140271493e-05, |
| "loss": 0.0553, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.5837104072398192, |
| "grad_norm": 1.2032498121261597, |
| "learning_rate": 2.368024132730015e-05, |
| "loss": 0.0649, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.5837104072398192, |
| "eval_loss": 0.08488748222589493, |
| "eval_runtime": 19.7102, |
| "eval_samples_per_second": 44.748, |
| "eval_steps_per_second": 1.421, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.6289592760180995, |
| "grad_norm": 1.0457937717437744, |
| "learning_rate": 2.292609351432881e-05, |
| "loss": 0.0656, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.6742081447963801, |
| "grad_norm": 1.1490514278411865, |
| "learning_rate": 2.2171945701357466e-05, |
| "loss": 0.0459, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.7194570135746607, |
| "grad_norm": 2.64288592338562, |
| "learning_rate": 2.1417797888386124e-05, |
| "loss": 0.0634, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.7647058823529411, |
| "grad_norm": 1.5465795993804932, |
| "learning_rate": 2.0663650075414782e-05, |
| "loss": 0.0457, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.8099547511312217, |
| "grad_norm": 2.7211270332336426, |
| "learning_rate": 1.990950226244344e-05, |
| "loss": 0.0541, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.8099547511312217, |
| "eval_loss": 0.08079428225755692, |
| "eval_runtime": 19.7977, |
| "eval_samples_per_second": 44.551, |
| "eval_steps_per_second": 1.414, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.8552036199095023, |
| "grad_norm": 1.789421558380127, |
| "learning_rate": 1.9155354449472098e-05, |
| "loss": 0.0573, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.9004524886877827, |
| "grad_norm": 2.3791332244873047, |
| "learning_rate": 1.8401206636500756e-05, |
| "loss": 0.0654, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.9457013574660633, |
| "grad_norm": 1.8045274019241333, |
| "learning_rate": 1.7647058823529414e-05, |
| "loss": 0.0567, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.990950226244344, |
| "grad_norm": 1.0020647048950195, |
| "learning_rate": 1.689291101055807e-05, |
| "loss": 0.056, |
| "step": 440 |
| }, |
| { |
| "epoch": 2.0361990950226243, |
| "grad_norm": 1.4606750011444092, |
| "learning_rate": 1.613876319758673e-05, |
| "loss": 0.0357, |
| "step": 450 |
| }, |
| { |
| "epoch": 2.0361990950226243, |
| "eval_loss": 0.09197434037923813, |
| "eval_runtime": 19.7791, |
| "eval_samples_per_second": 44.592, |
| "eval_steps_per_second": 1.416, |
| "step": 450 |
| }, |
| { |
| "epoch": 2.081447963800905, |
| "grad_norm": 1.4176522493362427, |
| "learning_rate": 1.5384615384615387e-05, |
| "loss": 0.0487, |
| "step": 460 |
| }, |
| { |
| "epoch": 2.1266968325791855, |
| "grad_norm": 0.9721936583518982, |
| "learning_rate": 1.4630467571644043e-05, |
| "loss": 0.0264, |
| "step": 470 |
| }, |
| { |
| "epoch": 2.171945701357466, |
| "grad_norm": 1.8826217651367188, |
| "learning_rate": 1.3876319758672701e-05, |
| "loss": 0.0337, |
| "step": 480 |
| }, |
| { |
| "epoch": 2.2171945701357467, |
| "grad_norm": 2.2242259979248047, |
| "learning_rate": 1.3122171945701359e-05, |
| "loss": 0.0486, |
| "step": 490 |
| }, |
| { |
| "epoch": 2.262443438914027, |
| "grad_norm": 3.4401419162750244, |
| "learning_rate": 1.2368024132730017e-05, |
| "loss": 0.0371, |
| "step": 500 |
| }, |
| { |
| "epoch": 2.262443438914027, |
| "eval_loss": 0.10297037661075592, |
| "eval_runtime": 19.804, |
| "eval_samples_per_second": 44.536, |
| "eval_steps_per_second": 1.414, |
| "step": 500 |
| }, |
| { |
| "epoch": 2.3076923076923075, |
| "grad_norm": 2.3715732097625732, |
| "learning_rate": 1.1613876319758673e-05, |
| "loss": 0.0303, |
| "step": 510 |
| }, |
| { |
| "epoch": 2.3529411764705883, |
| "grad_norm": 1.1995147466659546, |
| "learning_rate": 1.0859728506787331e-05, |
| "loss": 0.0375, |
| "step": 520 |
| }, |
| { |
| "epoch": 2.3981900452488687, |
| "grad_norm": 2.4895598888397217, |
| "learning_rate": 1.0105580693815989e-05, |
| "loss": 0.0336, |
| "step": 530 |
| }, |
| { |
| "epoch": 2.4434389140271495, |
| "grad_norm": 1.5218836069107056, |
| "learning_rate": 9.351432880844647e-06, |
| "loss": 0.0412, |
| "step": 540 |
| }, |
| { |
| "epoch": 2.48868778280543, |
| "grad_norm": 1.484147071838379, |
| "learning_rate": 8.597285067873303e-06, |
| "loss": 0.0437, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.48868778280543, |
| "eval_loss": 0.09078551828861237, |
| "eval_runtime": 19.7816, |
| "eval_samples_per_second": 44.587, |
| "eval_steps_per_second": 1.415, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.5339366515837103, |
| "grad_norm": 1.726880669593811, |
| "learning_rate": 7.84313725490196e-06, |
| "loss": 0.0282, |
| "step": 560 |
| }, |
| { |
| "epoch": 2.579185520361991, |
| "grad_norm": 1.269982933998108, |
| "learning_rate": 7.0889894419306185e-06, |
| "loss": 0.0353, |
| "step": 570 |
| }, |
| { |
| "epoch": 2.6244343891402715, |
| "grad_norm": 0.8152230381965637, |
| "learning_rate": 6.334841628959276e-06, |
| "loss": 0.0251, |
| "step": 580 |
| }, |
| { |
| "epoch": 2.669683257918552, |
| "grad_norm": 1.2272216081619263, |
| "learning_rate": 5.580693815987934e-06, |
| "loss": 0.0251, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.7149321266968327, |
| "grad_norm": 1.86264967918396, |
| "learning_rate": 4.826546003016592e-06, |
| "loss": 0.0213, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.7149321266968327, |
| "eval_loss": 0.10036029666662216, |
| "eval_runtime": 19.7588, |
| "eval_samples_per_second": 44.638, |
| "eval_steps_per_second": 1.417, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.760180995475113, |
| "grad_norm": 2.0232512950897217, |
| "learning_rate": 4.072398190045249e-06, |
| "loss": 0.0317, |
| "step": 610 |
| }, |
| { |
| "epoch": 2.8054298642533935, |
| "grad_norm": 1.125870943069458, |
| "learning_rate": 3.3182503770739065e-06, |
| "loss": 0.0328, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.8506787330316743, |
| "grad_norm": 2.3638086318969727, |
| "learning_rate": 2.564102564102564e-06, |
| "loss": 0.0318, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.8959276018099547, |
| "grad_norm": 1.220841646194458, |
| "learning_rate": 1.809954751131222e-06, |
| "loss": 0.0265, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.9411764705882355, |
| "grad_norm": 0.9988038539886475, |
| "learning_rate": 1.0558069381598795e-06, |
| "loss": 0.0266, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.9411764705882355, |
| "eval_loss": 0.0963086187839508, |
| "eval_runtime": 19.7526, |
| "eval_samples_per_second": 44.652, |
| "eval_steps_per_second": 1.418, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.986425339366516, |
| "grad_norm": 2.0754244327545166, |
| "learning_rate": 3.01659125188537e-07, |
| "loss": 0.0218, |
| "step": 660 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 663, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.2915495153893376e+16, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|