{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 2984,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01675603217158177,
"grad_norm": 3.53125,
"learning_rate": 4.9178954423592495e-05,
"loss": 4.2472,
"step": 50
},
{
"epoch": 0.03351206434316354,
"grad_norm": 6.53125,
"learning_rate": 4.834115281501341e-05,
"loss": 4.0273,
"step": 100
},
{
"epoch": 0.05026809651474531,
"grad_norm": 3.4375,
"learning_rate": 4.750335120643432e-05,
"loss": 4.039,
"step": 150
},
{
"epoch": 0.06702412868632708,
"grad_norm": 2.25,
"learning_rate": 4.666554959785523e-05,
"loss": 3.9948,
"step": 200
},
{
"epoch": 0.08378016085790885,
"grad_norm": 3.34375,
"learning_rate": 4.582774798927614e-05,
"loss": 3.9269,
"step": 250
},
{
"epoch": 0.10053619302949061,
"grad_norm": 3.546875,
"learning_rate": 4.4989946380697054e-05,
"loss": 3.9419,
"step": 300
},
{
"epoch": 0.11729222520107238,
"grad_norm": 2.875,
"learning_rate": 4.4152144772117966e-05,
"loss": 3.915,
"step": 350
},
{
"epoch": 0.13404825737265416,
"grad_norm": 2.734375,
"learning_rate": 4.331434316353888e-05,
"loss": 3.9188,
"step": 400
},
{
"epoch": 0.15080428954423591,
"grad_norm": 6.9375,
"learning_rate": 4.247654155495979e-05,
"loss": 3.8512,
"step": 450
},
{
"epoch": 0.1675603217158177,
"grad_norm": 5.65625,
"learning_rate": 4.16387399463807e-05,
"loss": 3.9142,
"step": 500
},
{
"epoch": 0.18431635388739948,
"grad_norm": 2.75,
"learning_rate": 4.0800938337801606e-05,
"loss": 3.8583,
"step": 550
},
{
"epoch": 0.20107238605898123,
"grad_norm": 3.03125,
"learning_rate": 3.9963136729222525e-05,
"loss": 3.8955,
"step": 600
},
{
"epoch": 0.217828418230563,
"grad_norm": 7.21875,
"learning_rate": 3.912533512064344e-05,
"loss": 3.8571,
"step": 650
},
{
"epoch": 0.23458445040214476,
"grad_norm": 2.59375,
"learning_rate": 3.828753351206434e-05,
"loss": 3.9075,
"step": 700
},
{
"epoch": 0.25134048257372654,
"grad_norm": 3.28125,
"learning_rate": 3.744973190348526e-05,
"loss": 3.8933,
"step": 750
},
{
"epoch": 0.2680965147453083,
"grad_norm": 2.046875,
"learning_rate": 3.6611930294906165e-05,
"loss": 3.8752,
"step": 800
},
{
"epoch": 0.2848525469168901,
"grad_norm": 2.359375,
"learning_rate": 3.577412868632708e-05,
"loss": 3.8662,
"step": 850
},
{
"epoch": 0.30160857908847183,
"grad_norm": 2.8125,
"learning_rate": 3.4936327077747996e-05,
"loss": 3.912,
"step": 900
},
{
"epoch": 0.3183646112600536,
"grad_norm": 1.953125,
"learning_rate": 3.40985254691689e-05,
"loss": 3.9365,
"step": 950
},
{
"epoch": 0.3351206434316354,
"grad_norm": 3.4375,
"learning_rate": 3.326072386058981e-05,
"loss": 3.8791,
"step": 1000
},
{
"epoch": 0.35187667560321717,
"grad_norm": 3.390625,
"learning_rate": 3.2422922252010724e-05,
"loss": 3.8536,
"step": 1050
},
{
"epoch": 0.36863270777479895,
"grad_norm": 3.1875,
"learning_rate": 3.1585120643431636e-05,
"loss": 3.8585,
"step": 1100
},
{
"epoch": 0.3853887399463807,
"grad_norm": 3.40625,
"learning_rate": 3.074731903485255e-05,
"loss": 3.9163,
"step": 1150
},
{
"epoch": 0.40214477211796246,
"grad_norm": 2.34375,
"learning_rate": 2.990951742627346e-05,
"loss": 3.8529,
"step": 1200
},
{
"epoch": 0.41890080428954424,
"grad_norm": 3.140625,
"learning_rate": 2.907171581769437e-05,
"loss": 3.86,
"step": 1250
},
{
"epoch": 0.435656836461126,
"grad_norm": 2.40625,
"learning_rate": 2.823391420911528e-05,
"loss": 3.8541,
"step": 1300
},
{
"epoch": 0.4524128686327078,
"grad_norm": 2.03125,
"learning_rate": 2.7396112600536195e-05,
"loss": 3.8549,
"step": 1350
},
{
"epoch": 0.4691689008042895,
"grad_norm": 2.671875,
"learning_rate": 2.6558310991957107e-05,
"loss": 3.884,
"step": 1400
},
{
"epoch": 0.4859249329758713,
"grad_norm": 3.34375,
"learning_rate": 2.5720509383378015e-05,
"loss": 3.8677,
"step": 1450
},
{
"epoch": 0.5026809651474531,
"grad_norm": 2.984375,
"learning_rate": 2.488270777479893e-05,
"loss": 3.7921,
"step": 1500
},
{
"epoch": 0.5194369973190348,
"grad_norm": 7.0,
"learning_rate": 2.4044906166219842e-05,
"loss": 3.8765,
"step": 1550
},
{
"epoch": 0.5361930294906166,
"grad_norm": 1.7421875,
"learning_rate": 2.320710455764075e-05,
"loss": 3.7936,
"step": 1600
},
{
"epoch": 0.5529490616621984,
"grad_norm": 1.8984375,
"learning_rate": 2.2369302949061662e-05,
"loss": 3.8304,
"step": 1650
},
{
"epoch": 0.5697050938337802,
"grad_norm": 2.421875,
"learning_rate": 2.1531501340482574e-05,
"loss": 3.7994,
"step": 1700
},
{
"epoch": 0.5864611260053619,
"grad_norm": 3.234375,
"learning_rate": 2.069369973190349e-05,
"loss": 3.8363,
"step": 1750
},
{
"epoch": 0.6032171581769437,
"grad_norm": 2.46875,
"learning_rate": 1.9855898123324398e-05,
"loss": 3.8377,
"step": 1800
},
{
"epoch": 0.6199731903485255,
"grad_norm": 2.296875,
"learning_rate": 1.901809651474531e-05,
"loss": 3.86,
"step": 1850
},
{
"epoch": 0.6367292225201072,
"grad_norm": 3.546875,
"learning_rate": 1.818029490616622e-05,
"loss": 3.8127,
"step": 1900
},
{
"epoch": 0.653485254691689,
"grad_norm": 2.578125,
"learning_rate": 1.7342493297587133e-05,
"loss": 3.8481,
"step": 1950
},
{
"epoch": 0.6702412868632708,
"grad_norm": 4.40625,
"learning_rate": 1.6504691689008045e-05,
"loss": 3.8308,
"step": 2000
},
{
"epoch": 0.6869973190348525,
"grad_norm": 1.8984375,
"learning_rate": 1.5666890080428956e-05,
"loss": 3.7686,
"step": 2050
},
{
"epoch": 0.7037533512064343,
"grad_norm": 2.109375,
"learning_rate": 1.4829088471849867e-05,
"loss": 3.8635,
"step": 2100
},
{
"epoch": 0.7205093833780161,
"grad_norm": 3.8125,
"learning_rate": 1.3991286863270778e-05,
"loss": 3.8338,
"step": 2150
},
{
"epoch": 0.7372654155495979,
"grad_norm": 2.640625,
"learning_rate": 1.3153485254691688e-05,
"loss": 3.8528,
"step": 2200
},
{
"epoch": 0.7540214477211796,
"grad_norm": 3.03125,
"learning_rate": 1.23156836461126e-05,
"loss": 3.879,
"step": 2250
},
{
"epoch": 0.7707774798927614,
"grad_norm": 2.453125,
"learning_rate": 1.1477882037533512e-05,
"loss": 3.8653,
"step": 2300
},
{
"epoch": 0.7875335120643432,
"grad_norm": 3.359375,
"learning_rate": 1.0640080428954424e-05,
"loss": 3.7811,
"step": 2350
},
{
"epoch": 0.8042895442359249,
"grad_norm": 1.8125,
"learning_rate": 9.802278820375336e-06,
"loss": 3.864,
"step": 2400
},
{
"epoch": 0.8210455764075067,
"grad_norm": 3.015625,
"learning_rate": 8.964477211796247e-06,
"loss": 3.8341,
"step": 2450
},
{
"epoch": 0.8378016085790885,
"grad_norm": 2.109375,
"learning_rate": 8.126675603217159e-06,
"loss": 3.8547,
"step": 2500
},
{
"epoch": 0.8545576407506702,
"grad_norm": 3.765625,
"learning_rate": 7.288873994638071e-06,
"loss": 3.8611,
"step": 2550
},
{
"epoch": 0.871313672922252,
"grad_norm": 2.390625,
"learning_rate": 6.451072386058982e-06,
"loss": 3.7803,
"step": 2600
},
{
"epoch": 0.8880697050938338,
"grad_norm": 2.609375,
"learning_rate": 5.613270777479894e-06,
"loss": 3.8047,
"step": 2650
},
{
"epoch": 0.9048257372654156,
"grad_norm": 2.09375,
"learning_rate": 4.7754691689008045e-06,
"loss": 3.8275,
"step": 2700
},
{
"epoch": 0.9215817694369973,
"grad_norm": 3.59375,
"learning_rate": 3.9376675603217155e-06,
"loss": 3.8354,
"step": 2750
},
{
"epoch": 0.938337801608579,
"grad_norm": 2.90625,
"learning_rate": 3.0998659517426277e-06,
"loss": 3.8245,
"step": 2800
},
{
"epoch": 0.9550938337801609,
"grad_norm": 2.375,
"learning_rate": 2.262064343163539e-06,
"loss": 3.8646,
"step": 2850
},
{
"epoch": 0.9718498659517426,
"grad_norm": 2.15625,
"learning_rate": 1.4242627345844506e-06,
"loss": 3.8213,
"step": 2900
},
{
"epoch": 0.9886058981233244,
"grad_norm": 7.625,
"learning_rate": 5.86461126005362e-07,
"loss": 3.7671,
"step": 2950
}
],
"logging_steps": 50,
"max_steps": 2984,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.509068212041933e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}