{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 5239,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01908761213972132,
      "grad_norm": 24.85356330871582,
      "learning_rate": 2.9484634472227524e-05,
      "loss": 2.8924,
      "step": 100
    },
    {
      "epoch": 0.03817522427944264,
      "grad_norm": 11.6854829788208,
      "learning_rate": 2.8912006108035885e-05,
      "loss": 0.7288,
      "step": 200
    },
    {
      "epoch": 0.057262836419163965,
      "grad_norm": 14.07279109954834,
      "learning_rate": 2.8339377743844245e-05,
      "loss": 0.6985,
      "step": 300
    },
    {
      "epoch": 0.07635044855888529,
      "grad_norm": 12.763427734375,
      "learning_rate": 2.7766749379652605e-05,
      "loss": 0.6016,
      "step": 400
    },
    {
      "epoch": 0.09543806069860661,
      "grad_norm": 10.65344524383545,
      "learning_rate": 2.7194121015460965e-05,
      "loss": 0.5952,
      "step": 500
    },
    {
      "epoch": 0.09543806069860661,
      "eval_exact_match": 74.24583520936515,
      "eval_f1": 74.39195128749384,
      "eval_runtime": 27.1246,
      "eval_samples_per_second": 163.763,
      "eval_steps_per_second": 20.498,
      "step": 500
    },
    {
      "epoch": 0.11452567283832793,
      "grad_norm": 24.665790557861328,
      "learning_rate": 2.6621492651269326e-05,
      "loss": 0.5502,
      "step": 600
    },
    {
      "epoch": 0.13361328497804925,
      "grad_norm": 5.523789405822754,
      "learning_rate": 2.6048864287077686e-05,
      "loss": 0.6028,
      "step": 700
    },
    {
      "epoch": 0.15270089711777057,
      "grad_norm": 21.62483787536621,
      "learning_rate": 2.5476235922886046e-05,
      "loss": 0.5487,
      "step": 800
    },
    {
      "epoch": 0.1717885092574919,
      "grad_norm": 3.459503412246704,
      "learning_rate": 2.4903607558694406e-05,
      "loss": 0.5442,
      "step": 900
    },
    {
      "epoch": 0.19087612139721322,
      "grad_norm": 6.397250652313232,
      "learning_rate": 2.4330979194502767e-05,
      "loss": 0.4708,
      "step": 1000
    },
    {
      "epoch": 0.19087612139721322,
      "eval_exact_match": 75.73165240882486,
      "eval_f1": 75.78096524517056,
      "eval_runtime": 27.0858,
      "eval_samples_per_second": 163.997,
      "eval_steps_per_second": 20.527,
      "step": 1000
    },
    {
      "epoch": 0.20996373353693454,
      "grad_norm": 5.82578182220459,
      "learning_rate": 2.3758350830311127e-05,
      "loss": 0.4403,
      "step": 1100
    },
    {
      "epoch": 0.22905134567665586,
      "grad_norm": 55.54654312133789,
      "learning_rate": 2.3185722466119487e-05,
      "loss": 0.5175,
      "step": 1200
    },
    {
      "epoch": 0.24813895781637718,
      "grad_norm": 3.4721755981445312,
      "learning_rate": 2.2613094101927848e-05,
      "loss": 0.4986,
      "step": 1300
    },
    {
      "epoch": 0.2672265699560985,
      "grad_norm": 10.456282615661621,
      "learning_rate": 2.2040465737736208e-05,
      "loss": 0.5227,
      "step": 1400
    },
    {
      "epoch": 0.2863141820958198,
      "grad_norm": 26.73484992980957,
      "learning_rate": 2.1467837373544568e-05,
      "loss": 0.4404,
      "step": 1500
    },
    {
      "epoch": 0.2863141820958198,
      "eval_exact_match": 75.73165240882486,
      "eval_f1": 75.79168542698483,
      "eval_runtime": 27.0984,
      "eval_samples_per_second": 163.921,
      "eval_steps_per_second": 20.518,
      "step": 1500
    },
    {
      "epoch": 0.30540179423554115,
      "grad_norm": 13.284788131713867,
      "learning_rate": 2.089520900935293e-05,
      "loss": 0.4939,
      "step": 1600
    },
    {
      "epoch": 0.32448940637526247,
      "grad_norm": 1.5096017122268677,
      "learning_rate": 2.032258064516129e-05,
      "loss": 0.4657,
      "step": 1700
    },
    {
      "epoch": 0.3435770185149838,
      "grad_norm": 10.362142562866211,
      "learning_rate": 1.974995228096965e-05,
      "loss": 0.4559,
      "step": 1800
    },
    {
      "epoch": 0.3626646306547051,
      "grad_norm": 12.559228897094727,
      "learning_rate": 1.917732391677801e-05,
      "loss": 0.4998,
      "step": 1900
    },
    {
      "epoch": 0.38175224279442643,
      "grad_norm": 8.283560752868652,
      "learning_rate": 1.860469555258637e-05,
      "loss": 0.4603,
      "step": 2000
    },
    {
      "epoch": 0.38175224279442643,
      "eval_exact_match": 77.73525438991446,
      "eval_f1": 77.81083167170515,
      "eval_runtime": 27.123,
      "eval_samples_per_second": 163.772,
      "eval_steps_per_second": 20.499,
      "step": 2000
    },
    {
      "epoch": 0.40083985493414775,
      "grad_norm": 5.818004131317139,
      "learning_rate": 1.803206718839473e-05,
      "loss": 0.4361,
      "step": 2100
    },
    {
      "epoch": 0.4199274670738691,
      "grad_norm": 14.656002044677734,
      "learning_rate": 1.745943882420309e-05,
      "loss": 0.467,
      "step": 2200
    },
    {
      "epoch": 0.4390150792135904,
      "grad_norm": 8.953429222106934,
      "learning_rate": 1.688681046001145e-05,
      "loss": 0.3691,
      "step": 2300
    },
    {
      "epoch": 0.4581026913533117,
      "grad_norm": 9.16756820678711,
      "learning_rate": 1.631418209581981e-05,
      "loss": 0.4318,
      "step": 2400
    },
    {
      "epoch": 0.47719030349303304,
      "grad_norm": 12.255992889404297,
      "learning_rate": 1.574155373162817e-05,
      "loss": 0.4323,
      "step": 2500
    },
    {
      "epoch": 0.47719030349303304,
      "eval_exact_match": 78.09545249887438,
      "eval_f1": 78.14723097703737,
      "eval_runtime": 27.2895,
      "eval_samples_per_second": 162.773,
      "eval_steps_per_second": 20.374,
      "step": 2500
    },
    {
      "epoch": 0.49627791563275436,
      "grad_norm": 15.877165794372559,
      "learning_rate": 1.5168925367436533e-05,
      "loss": 0.392,
      "step": 2600
    },
    {
      "epoch": 0.5153655277724757,
      "grad_norm": 18.496259689331055,
      "learning_rate": 1.4596297003244895e-05,
      "loss": 0.4044,
      "step": 2700
    },
    {
      "epoch": 0.534453139912197,
      "grad_norm": 12.297314643859863,
      "learning_rate": 1.4023668639053255e-05,
      "loss": 0.4233,
      "step": 2800
    },
    {
      "epoch": 0.5535407520519183,
      "grad_norm": 4.0507049560546875,
      "learning_rate": 1.3451040274861615e-05,
      "loss": 0.4213,
      "step": 2900
    },
    {
      "epoch": 0.5726283641916396,
      "grad_norm": 6.028878211975098,
      "learning_rate": 1.2878411910669976e-05,
      "loss": 0.4129,
      "step": 3000
    },
    {
      "epoch": 0.5726283641916396,
      "eval_exact_match": 78.77082395317424,
      "eval_f1": 78.77832808044424,
      "eval_runtime": 27.3136,
      "eval_samples_per_second": 162.629,
      "eval_steps_per_second": 20.356,
      "step": 3000
    },
    {
      "epoch": 0.591715976331361,
      "grad_norm": 2.101097583770752,
      "learning_rate": 1.2305783546478336e-05,
      "loss": 0.386,
      "step": 3100
    },
    {
      "epoch": 0.6108035884710823,
      "grad_norm": 26.336244583129883,
      "learning_rate": 1.1733155182286696e-05,
      "loss": 0.4582,
      "step": 3200
    },
    {
      "epoch": 0.6298912006108036,
      "grad_norm": 1.0698107481002808,
      "learning_rate": 1.1160526818095056e-05,
      "loss": 0.3729,
      "step": 3300
    },
    {
      "epoch": 0.6489788127505249,
      "grad_norm": 24.801355361938477,
      "learning_rate": 1.0587898453903417e-05,
      "loss": 0.3804,
      "step": 3400
    },
    {
      "epoch": 0.6680664248902463,
      "grad_norm": 27.99701690673828,
      "learning_rate": 1.0015270089711777e-05,
      "loss": 0.4083,
      "step": 3500
    },
    {
      "epoch": 0.6680664248902463,
      "eval_exact_match": 78.18550202611436,
      "eval_f1": 78.18325078793336,
      "eval_runtime": 26.9164,
      "eval_samples_per_second": 165.03,
      "eval_steps_per_second": 20.657,
      "step": 3500
    },
    {
      "epoch": 0.6871540370299676,
      "grad_norm": 1.1666632890701294,
      "learning_rate": 9.442641725520137e-06,
      "loss": 0.3877,
      "step": 3600
    },
    {
      "epoch": 0.7062416491696889,
      "grad_norm": 15.128180503845215,
      "learning_rate": 8.870013361328497e-06,
      "loss": 0.4168,
      "step": 3700
    },
    {
      "epoch": 0.7253292613094102,
      "grad_norm": 22.817325592041016,
      "learning_rate": 8.297384997136858e-06,
      "loss": 0.4292,
      "step": 3800
    },
    {
      "epoch": 0.7444168734491315,
      "grad_norm": 11.064777374267578,
      "learning_rate": 7.724756632945218e-06,
      "loss": 0.4225,
      "step": 3900
    },
    {
      "epoch": 0.7635044855888529,
      "grad_norm": 6.2987060546875,
      "learning_rate": 7.152128268753579e-06,
      "loss": 0.39,
      "step": 4000
    },
    {
      "epoch": 0.7635044855888529,
      "eval_exact_match": 78.95092300765421,
      "eval_f1": 78.91715443493922,
      "eval_runtime": 26.9864,
      "eval_samples_per_second": 164.602,
      "eval_steps_per_second": 20.603,
      "step": 4000
    },
    {
      "epoch": 0.7825920977285742,
      "grad_norm": 13.171396255493164,
      "learning_rate": 6.579499904561939e-06,
      "loss": 0.4579,
      "step": 4100
    },
    {
      "epoch": 0.8016797098682955,
      "grad_norm": 5.967886924743652,
      "learning_rate": 6.0068715403703e-06,
      "loss": 0.3918,
      "step": 4200
    },
    {
      "epoch": 0.8207673220080168,
      "grad_norm": 15.37869930267334,
      "learning_rate": 5.43424317617866e-06,
      "loss": 0.3938,
      "step": 4300
    },
    {
      "epoch": 0.8398549341477382,
      "grad_norm": 14.545265197753906,
      "learning_rate": 4.86161481198702e-06,
      "loss": 0.3823,
      "step": 4400
    },
    {
      "epoch": 0.8589425462874595,
      "grad_norm": 22.79754638671875,
      "learning_rate": 4.2889864477953805e-06,
      "loss": 0.3429,
      "step": 4500
    },
    {
      "epoch": 0.8589425462874595,
      "eval_exact_match": 79.19855920756416,
      "eval_f1": 79.16479063484917,
      "eval_runtime": 26.9798,
      "eval_samples_per_second": 164.642,
      "eval_steps_per_second": 20.608,
      "step": 4500
    },
    {
      "epoch": 0.8780301584271808,
      "grad_norm": 19.819913864135742,
      "learning_rate": 3.716358083603741e-06,
      "loss": 0.3482,
      "step": 4600
    },
    {
      "epoch": 0.8971177705669021,
      "grad_norm": 14.867344856262207,
      "learning_rate": 3.1437297194121014e-06,
      "loss": 0.3886,
      "step": 4700
    },
    {
      "epoch": 0.9162053827066234,
      "grad_norm": 5.279645919799805,
      "learning_rate": 2.571101355220462e-06,
      "loss": 0.3932,
      "step": 4800
    },
    {
      "epoch": 0.9352929948463448,
      "grad_norm": 17.6607666015625,
      "learning_rate": 1.9984729910288224e-06,
      "loss": 0.369,
      "step": 4900
    },
    {
      "epoch": 0.9543806069860661,
      "grad_norm": 29.288660049438477,
      "learning_rate": 1.4258446268371827e-06,
      "loss": 0.3289,
      "step": 5000
    },
    {
      "epoch": 0.9543806069860661,
      "eval_exact_match": 79.73885637100405,
      "eval_f1": 79.70508779828906,
      "eval_runtime": 26.9148,
      "eval_samples_per_second": 165.039,
      "eval_steps_per_second": 20.658,
      "step": 5000
    },
    {
      "epoch": 0.9734682191257874,
      "grad_norm": 18.447587966918945,
      "learning_rate": 8.53216262645543e-07,
      "loss": 0.4057,
      "step": 5100
    },
    {
      "epoch": 0.9925558312655087,
      "grad_norm": 4.9967122077941895,
      "learning_rate": 2.8058789845390346e-07,
      "loss": 0.3995,
      "step": 5200
    },
    {
      "epoch": 1.0,
      "step": 5239,
      "total_flos": 4.56805831292375e+16,
      "train_loss": 0.49690309092388235,
      "train_runtime": 1658.9592,
      "train_samples_per_second": 37.891,
      "train_steps_per_second": 3.158
    }
  ],
  "logging_steps": 100,
  "max_steps": 5239,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.56805831292375e+16,
  "train_batch_size": 12,
  "trial_name": null,
  "trial_params": null
}