| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 5.0, | |
| "eval_steps": 500, | |
| "global_step": 31135, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.08029548739360848, | |
| "grad_norm": 1.0575518608093262, | |
| "learning_rate": 4.919704512606392e-05, | |
| "loss": 1.6978, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.16059097478721696, | |
| "grad_norm": 1.1808357238769531, | |
| "learning_rate": 4.839409025212783e-05, | |
| "loss": 1.5655, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.24088646218082543, | |
| "grad_norm": 1.2457187175750732, | |
| "learning_rate": 4.759113537819175e-05, | |
| "loss": 1.5116, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.3211819495744339, | |
| "grad_norm": 1.1846860647201538, | |
| "learning_rate": 4.678818050425566e-05, | |
| "loss": 1.4617, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.4014774369680424, | |
| "grad_norm": 1.1566858291625977, | |
| "learning_rate": 4.598522563031958e-05, | |
| "loss": 1.4351, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.48177292436165087, | |
| "grad_norm": 1.2010133266448975, | |
| "learning_rate": 4.5182270756383495e-05, | |
| "loss": 1.4215, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.5620684117552593, | |
| "grad_norm": 1.0966484546661377, | |
| "learning_rate": 4.4379315882447406e-05, | |
| "loss": 1.3943, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.6423638991488678, | |
| "grad_norm": 1.1054482460021973, | |
| "learning_rate": 4.3576361008511324e-05, | |
| "loss": 1.392, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.7226593865424763, | |
| "grad_norm": 1.166495680809021, | |
| "learning_rate": 4.277340613457524e-05, | |
| "loss": 1.3607, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.8029548739360848, | |
| "grad_norm": 1.6440229415893555, | |
| "learning_rate": 4.197045126063915e-05, | |
| "loss": 1.3453, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.8832503613296933, | |
| "grad_norm": 1.1146718263626099, | |
| "learning_rate": 4.116749638670307e-05, | |
| "loss": 1.328, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.9635458487233017, | |
| "grad_norm": 1.2365636825561523, | |
| "learning_rate": 4.036454151276698e-05, | |
| "loss": 1.3213, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.0438413361169103, | |
| "grad_norm": 1.0937212705612183, | |
| "learning_rate": 3.95615866388309e-05, | |
| "loss": 1.2654, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.1241368235105187, | |
| "grad_norm": 0.8828343749046326, | |
| "learning_rate": 3.875863176489482e-05, | |
| "loss": 1.2134, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.2044323109041273, | |
| "grad_norm": 1.099165916442871, | |
| "learning_rate": 3.795567689095873e-05, | |
| "loss": 1.2104, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.2847277982977356, | |
| "grad_norm": 1.2219111919403076, | |
| "learning_rate": 3.7152722017022646e-05, | |
| "loss": 1.1973, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.3650232856913442, | |
| "grad_norm": 1.0750117301940918, | |
| "learning_rate": 3.6349767143086564e-05, | |
| "loss": 1.1923, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.4453187730849526, | |
| "grad_norm": 1.098244547843933, | |
| "learning_rate": 3.5546812269150475e-05, | |
| "loss": 1.1925, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.525614260478561, | |
| "grad_norm": 1.1637680530548096, | |
| "learning_rate": 3.474385739521439e-05, | |
| "loss": 1.182, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.6059097478721696, | |
| "grad_norm": 1.1562321186065674, | |
| "learning_rate": 3.3940902521278304e-05, | |
| "loss": 1.1634, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.6862052352657781, | |
| "grad_norm": 1.4565141201019287, | |
| "learning_rate": 3.313794764734222e-05, | |
| "loss": 1.1542, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.7665007226593865, | |
| "grad_norm": 1.434606671333313, | |
| "learning_rate": 3.233499277340614e-05, | |
| "loss": 1.1533, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.8467962100529949, | |
| "grad_norm": 1.1290115118026733, | |
| "learning_rate": 3.153203789947005e-05, | |
| "loss": 1.1496, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.9270916974466035, | |
| "grad_norm": 1.1467580795288086, | |
| "learning_rate": 3.072908302553397e-05, | |
| "loss": 1.1444, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 2.007387184840212, | |
| "grad_norm": 1.1580528020858765, | |
| "learning_rate": 2.9926128151597882e-05, | |
| "loss": 1.1478, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 2.0876826722338206, | |
| "grad_norm": 1.040642261505127, | |
| "learning_rate": 2.9123173277661797e-05, | |
| "loss": 1.0662, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 2.167978159627429, | |
| "grad_norm": 1.1460875272750854, | |
| "learning_rate": 2.832021840372571e-05, | |
| "loss": 1.0788, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 2.2482736470210374, | |
| "grad_norm": 1.0731582641601562, | |
| "learning_rate": 2.751726352978963e-05, | |
| "loss": 1.0635, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.328569134414646, | |
| "grad_norm": 1.1237194538116455, | |
| "learning_rate": 2.6714308655853543e-05, | |
| "loss": 1.065, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 2.4088646218082546, | |
| "grad_norm": 1.0012214183807373, | |
| "learning_rate": 2.5911353781917458e-05, | |
| "loss": 1.0509, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 2.4891601092018627, | |
| "grad_norm": 1.1109308004379272, | |
| "learning_rate": 2.5108398907981372e-05, | |
| "loss": 1.0574, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 2.5694555965954713, | |
| "grad_norm": 1.1631648540496826, | |
| "learning_rate": 2.430544403404529e-05, | |
| "loss": 1.0345, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.64975108398908, | |
| "grad_norm": 1.0513032674789429, | |
| "learning_rate": 2.3502489160109204e-05, | |
| "loss": 1.0616, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 2.7300465713826885, | |
| "grad_norm": 1.189889669418335, | |
| "learning_rate": 2.269953428617312e-05, | |
| "loss": 1.0533, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 2.8103420587762966, | |
| "grad_norm": 1.0951628684997559, | |
| "learning_rate": 2.1896579412237033e-05, | |
| "loss": 1.0388, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 2.890637546169905, | |
| "grad_norm": 1.0122724771499634, | |
| "learning_rate": 2.109362453830095e-05, | |
| "loss": 1.0374, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 2.970933033563514, | |
| "grad_norm": 1.1020405292510986, | |
| "learning_rate": 2.0290669664364865e-05, | |
| "loss": 1.0325, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 3.0512285209571224, | |
| "grad_norm": 1.0594305992126465, | |
| "learning_rate": 1.948771479042878e-05, | |
| "loss": 1.0047, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 3.1315240083507305, | |
| "grad_norm": 1.070056438446045, | |
| "learning_rate": 1.8684759916492694e-05, | |
| "loss": 0.9794, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 3.211819495744339, | |
| "grad_norm": 1.106451392173767, | |
| "learning_rate": 1.7881805042556608e-05, | |
| "loss": 0.971, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 3.2921149831379477, | |
| "grad_norm": 1.0232676267623901, | |
| "learning_rate": 1.7078850168620526e-05, | |
| "loss": 0.9819, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 3.3724104705315563, | |
| "grad_norm": 1.1868596076965332, | |
| "learning_rate": 1.627589529468444e-05, | |
| "loss": 0.9763, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 3.4527059579251644, | |
| "grad_norm": 1.0707334280014038, | |
| "learning_rate": 1.5472940420748355e-05, | |
| "loss": 0.9741, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 3.533001445318773, | |
| "grad_norm": 1.0286450386047363, | |
| "learning_rate": 1.466998554681227e-05, | |
| "loss": 0.9821, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 3.6132969327123816, | |
| "grad_norm": 1.1337109804153442, | |
| "learning_rate": 1.3867030672876185e-05, | |
| "loss": 0.9754, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 3.69359242010599, | |
| "grad_norm": 1.1301957368850708, | |
| "learning_rate": 1.3064075798940101e-05, | |
| "loss": 0.9757, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 3.7738879074995983, | |
| "grad_norm": 0.8995300531387329, | |
| "learning_rate": 1.2261120925004016e-05, | |
| "loss": 0.9728, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 3.854183394893207, | |
| "grad_norm": 1.099932074546814, | |
| "learning_rate": 1.1458166051067932e-05, | |
| "loss": 0.9549, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 3.9344788822868155, | |
| "grad_norm": 1.0159733295440674, | |
| "learning_rate": 1.0655211177131846e-05, | |
| "loss": 0.976, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 4.014774369680424, | |
| "grad_norm": 1.0208700895309448, | |
| "learning_rate": 9.852256303195762e-06, | |
| "loss": 0.9571, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 4.095069857074033, | |
| "grad_norm": 1.040358304977417, | |
| "learning_rate": 9.049301429259676e-06, | |
| "loss": 0.9293, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 4.175365344467641, | |
| "grad_norm": 1.1360992193222046, | |
| "learning_rate": 8.246346555323591e-06, | |
| "loss": 0.9351, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 4.255660831861249, | |
| "grad_norm": 1.0629996061325073, | |
| "learning_rate": 7.443391681387506e-06, | |
| "loss": 0.9308, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 4.335956319254858, | |
| "grad_norm": 1.1828113794326782, | |
| "learning_rate": 6.6404368074514205e-06, | |
| "loss": 0.9356, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 4.416251806648466, | |
| "grad_norm": 1.156646966934204, | |
| "learning_rate": 5.8374819335153366e-06, | |
| "loss": 0.9396, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 4.496547294042075, | |
| "grad_norm": 1.0000945329666138, | |
| "learning_rate": 5.034527059579252e-06, | |
| "loss": 0.9266, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 4.576842781435683, | |
| "grad_norm": 1.0536987781524658, | |
| "learning_rate": 4.231572185643167e-06, | |
| "loss": 0.9269, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 4.657138268829292, | |
| "grad_norm": 1.1100162267684937, | |
| "learning_rate": 3.4286173117070822e-06, | |
| "loss": 0.9256, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 4.7374337562229005, | |
| "grad_norm": 1.1744736433029175, | |
| "learning_rate": 2.6256624377709975e-06, | |
| "loss": 0.9176, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 4.817729243616509, | |
| "grad_norm": 1.049423098564148, | |
| "learning_rate": 1.8227075638349127e-06, | |
| "loss": 0.9355, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 4.898024731010118, | |
| "grad_norm": 1.227993369102478, | |
| "learning_rate": 1.0197526898988277e-06, | |
| "loss": 0.9221, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 4.978320218403725, | |
| "grad_norm": 1.1226952075958252, | |
| "learning_rate": 2.167978159627429e-07, | |
| "loss": 0.9336, | |
| "step": 31000 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 31135, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 10000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.6463151666049843e+17, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |