{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.8,
  "eval_steps": 500,
  "global_step": 50000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.016,
      "grad_norm": 35.146240234375,
      "learning_rate": 4.9004900490049e-05,
      "loss": 10.1867,
      "mean_token_accuracy": 0.544253178048879,
      "step": 1000
    },
    {
      "epoch": 0.032,
      "grad_norm": 32.855838775634766,
      "learning_rate": 4.8004800480048006e-05,
      "loss": 10.2308,
      "mean_token_accuracy": 0.5428702699318528,
      "step": 2000
    },
    {
      "epoch": 0.048,
      "grad_norm": 36.6450309753418,
      "learning_rate": 4.700470047004701e-05,
      "loss": 10.2215,
      "mean_token_accuracy": 0.5424384255111218,
      "step": 3000
    },
    {
      "epoch": 0.064,
      "grad_norm": 33.15324783325195,
      "learning_rate": 4.6004600460046006e-05,
      "loss": 10.2813,
      "mean_token_accuracy": 0.5411355452165008,
      "step": 4000
    },
    {
      "epoch": 0.08,
      "grad_norm": 31.203840255737305,
      "learning_rate": 4.500450045004501e-05,
      "loss": 10.2447,
      "mean_token_accuracy": 0.5425998609103262,
      "step": 5000
    },
    {
      "epoch": 0.096,
      "grad_norm": 29.603290557861328,
      "learning_rate": 4.4004400440044006e-05,
      "loss": 10.131,
      "mean_token_accuracy": 0.5463594534434378,
      "step": 6000
    },
    {
      "epoch": 0.112,
      "grad_norm": 31.61845588684082,
      "learning_rate": 4.3004300430043e-05,
      "loss": 10.0479,
      "mean_token_accuracy": 0.5483995637446641,
      "step": 7000
    },
    {
      "epoch": 0.128,
      "grad_norm": 31.28079605102539,
      "learning_rate": 4.2004200420042006e-05,
      "loss": 9.9802,
      "mean_token_accuracy": 0.5492958616241813,
      "step": 8000
    },
    {
      "epoch": 0.144,
      "grad_norm": 28.147220611572266,
      "learning_rate": 4.100410041004101e-05,
      "loss": 9.8445,
      "mean_token_accuracy": 0.5530450477898121,
      "step": 9000
    },
    {
      "epoch": 0.16,
      "grad_norm": 26.830713272094727,
      "learning_rate": 4.0004000400040005e-05,
      "loss": 9.8066,
      "mean_token_accuracy": 0.5538738285191357,
      "step": 10000
    },
    {
      "epoch": 0.176,
      "grad_norm": 28.744468688964844,
      "learning_rate": 3.900390039003901e-05,
      "loss": 9.7206,
      "mean_token_accuracy": 0.5572981600053608,
      "step": 11000
    },
    {
      "epoch": 0.192,
      "grad_norm": 25.372802734375,
      "learning_rate": 3.8003800380038005e-05,
      "loss": 9.6634,
      "mean_token_accuracy": 0.5570068260915577,
      "step": 12000
    },
    {
      "epoch": 0.208,
      "grad_norm": 31.25323486328125,
      "learning_rate": 3.7003700370037e-05,
      "loss": 9.5426,
      "mean_token_accuracy": 0.5603208757042885,
      "step": 13000
    },
    {
      "epoch": 0.224,
      "grad_norm": 33.54015350341797,
      "learning_rate": 3.6003600360036005e-05,
      "loss": 9.532,
      "mean_token_accuracy": 0.561353420805186,
      "step": 14000
    },
    {
      "epoch": 0.24,
      "grad_norm": 30.744821548461914,
      "learning_rate": 3.500350035003501e-05,
      "loss": 9.4778,
      "mean_token_accuracy": 0.5629392731450498,
      "step": 15000
    },
    {
      "epoch": 0.256,
      "grad_norm": 24.013673782348633,
      "learning_rate": 3.4003400340034005e-05,
      "loss": 9.477,
      "mean_token_accuracy": 0.5625265723504126,
      "step": 16000
    },
    {
      "epoch": 0.272,
      "grad_norm": 27.767776489257812,
      "learning_rate": 3.300330033003301e-05,
      "loss": 9.3889,
      "mean_token_accuracy": 0.5659195666387677,
      "step": 17000
    },
    {
      "epoch": 0.288,
      "grad_norm": 29.1698055267334,
      "learning_rate": 3.2003200320032004e-05,
      "loss": 9.3476,
      "mean_token_accuracy": 0.5666751223653554,
      "step": 18000
    },
    {
      "epoch": 0.304,
      "grad_norm": 29.41615867614746,
      "learning_rate": 3.1003100310031e-05,
      "loss": 9.3244,
      "mean_token_accuracy": 0.5674368364065886,
      "step": 19000
    },
    {
      "epoch": 0.32,
      "grad_norm": 23.839937210083008,
      "learning_rate": 3.0003000300030004e-05,
      "loss": 9.2717,
      "mean_token_accuracy": 0.5696731022559106,
      "step": 20000
    },
    {
      "epoch": 0.336,
      "grad_norm": 28.645061492919922,
      "learning_rate": 2.9002900290029007e-05,
      "loss": 9.2327,
      "mean_token_accuracy": 0.5706070831567049,
      "step": 21000
    },
    {
      "epoch": 0.352,
      "grad_norm": 26.104412078857422,
      "learning_rate": 2.8002800280028004e-05,
      "loss": 9.223,
      "mean_token_accuracy": 0.5696454518660903,
      "step": 22000
    },
    {
      "epoch": 0.368,
      "grad_norm": 28.753164291381836,
      "learning_rate": 2.7002700270027004e-05,
      "loss": 9.145,
      "mean_token_accuracy": 0.5734736853465437,
      "step": 23000
    },
    {
      "epoch": 0.384,
      "grad_norm": 21.9370174407959,
      "learning_rate": 2.6002600260026007e-05,
      "loss": 9.1538,
      "mean_token_accuracy": 0.5731835125163197,
      "step": 24000
    },
    {
      "epoch": 0.4,
      "grad_norm": 31.202007293701172,
      "learning_rate": 2.5002500250025003e-05,
      "loss": 9.1016,
      "mean_token_accuracy": 0.5749420530423522,
      "step": 25000
    },
    {
      "epoch": 0.416,
      "grad_norm": 24.899829864501953,
      "learning_rate": 2.4002400240024003e-05,
      "loss": 9.1086,
      "mean_token_accuracy": 0.5731981860995292,
      "step": 26000
    },
    {
      "epoch": 0.432,
      "grad_norm": 26.59105682373047,
      "learning_rate": 2.3002300230023003e-05,
      "loss": 9.0585,
      "mean_token_accuracy": 0.575087952144444,
      "step": 27000
    },
    {
      "epoch": 0.448,
      "grad_norm": 27.35274314880371,
      "learning_rate": 2.2002200220022003e-05,
      "loss": 9.0378,
      "mean_token_accuracy": 0.5757604394182563,
      "step": 28000
    },
    {
      "epoch": 0.464,
      "grad_norm": 23.581249237060547,
      "learning_rate": 2.1002100210021003e-05,
      "loss": 9.087,
      "mean_token_accuracy": 0.5731492869332433,
      "step": 29000
    },
    {
      "epoch": 0.48,
      "grad_norm": 26.905712127685547,
      "learning_rate": 2.0002000200020003e-05,
      "loss": 9.0834,
      "mean_token_accuracy": 0.5751373803690076,
      "step": 30000
    },
    {
      "epoch": 0.496,
      "grad_norm": 24.928512573242188,
      "learning_rate": 1.9001900190019003e-05,
      "loss": 9.0077,
      "mean_token_accuracy": 0.5775581553503871,
      "step": 31000
    },
    {
      "epoch": 0.512,
      "grad_norm": 28.373720169067383,
      "learning_rate": 1.8001800180018002e-05,
      "loss": 9.0328,
      "mean_token_accuracy": 0.575654436133802,
      "step": 32000
    },
    {
      "epoch": 0.528,
      "grad_norm": 26.213802337646484,
      "learning_rate": 1.7001700170017002e-05,
      "loss": 8.9223,
      "mean_token_accuracy": 0.5791815776266158,
      "step": 33000
    },
    {
      "epoch": 0.544,
      "grad_norm": 27.070953369140625,
      "learning_rate": 1.6001600160016002e-05,
      "loss": 8.9483,
      "mean_token_accuracy": 0.5792850709185005,
      "step": 34000
    },
    {
      "epoch": 0.56,
      "grad_norm": 22.90890884399414,
      "learning_rate": 1.5001500150015002e-05,
      "loss": 9.0419,
      "mean_token_accuracy": 0.5748572928607464,
      "step": 35000
    },
    {
      "epoch": 0.576,
      "grad_norm": 28.693235397338867,
      "learning_rate": 1.4001400140014002e-05,
      "loss": 8.951,
      "mean_token_accuracy": 0.5788668767511844,
      "step": 36000
    },
    {
      "epoch": 0.592,
      "grad_norm": 27.749176025390625,
      "learning_rate": 1.3001300130013003e-05,
      "loss": 8.9335,
      "mean_token_accuracy": 0.5779373695105314,
      "step": 37000
    },
    {
      "epoch": 0.608,
      "grad_norm": 25.057411193847656,
      "learning_rate": 1.2001200120012002e-05,
      "loss": 8.8612,
      "mean_token_accuracy": 0.5811370112374425,
      "step": 38000
    },
    {
      "epoch": 0.624,
      "grad_norm": 26.132497787475586,
      "learning_rate": 1.1001100110011001e-05,
      "loss": 8.8688,
      "mean_token_accuracy": 0.5816743801310659,
      "step": 39000
    },
    {
      "epoch": 0.64,
      "grad_norm": 26.350906372070312,
      "learning_rate": 1.0001000100010001e-05,
      "loss": 8.8358,
      "mean_token_accuracy": 0.5821756240203977,
      "step": 40000
    },
    {
      "epoch": 0.656,
      "grad_norm": 24.874052047729492,
      "learning_rate": 9.000900090009001e-06,
      "loss": 8.8207,
      "mean_token_accuracy": 0.5826298766359687,
      "step": 41000
    },
    {
      "epoch": 0.672,
      "grad_norm": 26.102046966552734,
      "learning_rate": 8.000800080008001e-06,
      "loss": 8.8275,
      "mean_token_accuracy": 0.5821652906313538,
      "step": 42000
    },
    {
      "epoch": 0.688,
      "grad_norm": 29.679323196411133,
      "learning_rate": 7.000700070007001e-06,
      "loss": 8.85,
      "mean_token_accuracy": 0.5809146241471171,
      "step": 43000
    },
    {
      "epoch": 0.704,
      "grad_norm": 26.106046676635742,
      "learning_rate": 6.000600060006001e-06,
      "loss": 8.8531,
      "mean_token_accuracy": 0.5821815392710269,
      "step": 44000
    },
    {
      "epoch": 0.72,
      "grad_norm": 22.304044723510742,
      "learning_rate": 5.000500050005001e-06,
      "loss": 8.8142,
      "mean_token_accuracy": 0.5825774453170598,
      "step": 45000
    },
    {
      "epoch": 0.736,
      "grad_norm": 28.982166290283203,
      "learning_rate": 4.0004000400040005e-06,
      "loss": 8.8434,
      "mean_token_accuracy": 0.5813510757684708,
      "step": 46000
    },
    {
      "epoch": 0.752,
      "grad_norm": 27.076814651489258,
      "learning_rate": 3.0003000300030004e-06,
      "loss": 8.7994,
      "mean_token_accuracy": 0.5838636282868683,
      "step": 47000
    },
    {
      "epoch": 0.768,
      "grad_norm": 26.112808227539062,
      "learning_rate": 2.0002000200020003e-06,
      "loss": 8.8055,
      "mean_token_accuracy": 0.5828906665407121,
      "step": 48000
    },
    {
      "epoch": 0.784,
      "grad_norm": 24.94652557373047,
      "learning_rate": 1.0001000100010001e-06,
      "loss": 8.8295,
      "mean_token_accuracy": 0.5821196795813739,
      "step": 49000
    },
    {
      "epoch": 0.8,
      "grad_norm": 28.15529441833496,
      "learning_rate": 0.0,
      "loss": 8.7714,
      "mean_token_accuracy": 0.5843258857652545,
      "step": 50000
    }
  ],
  "logging_steps": 1000,
  "max_steps": 50000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.68231960576e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}