{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.90625,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0390625,
      "grad_norm": 1.6853455305099487,
      "learning_rate": 8e-05,
      "loss": 2.2667,
      "step": 5
    },
    {
      "epoch": 0.078125,
      "grad_norm": 1.127389907836914,
      "learning_rate": 0.00018,
      "loss": 1.2747,
      "step": 10
    },
    {
      "epoch": 0.1171875,
      "grad_norm": 0.6251115798950195,
      "learning_rate": 0.00019947575360419398,
      "loss": 1.2874,
      "step": 15
    },
    {
      "epoch": 0.15625,
      "grad_norm": 0.8021469116210938,
      "learning_rate": 0.00019882044560943645,
      "loss": 1.1816,
      "step": 20
    },
    {
      "epoch": 0.1953125,
      "grad_norm": 0.7008321285247803,
      "learning_rate": 0.0001981651376146789,
      "loss": 0.9091,
      "step": 25
    },
    {
      "epoch": 0.234375,
      "grad_norm": 0.8806556463241577,
      "learning_rate": 0.00019750982961992138,
      "loss": 1.18,
      "step": 30
    },
    {
      "epoch": 0.2734375,
      "grad_norm": 0.4898707866668701,
      "learning_rate": 0.00019685452162516385,
      "loss": 0.8613,
      "step": 35
    },
    {
      "epoch": 0.3125,
      "grad_norm": 0.818252444267273,
      "learning_rate": 0.0001961992136304063,
      "loss": 0.9613,
      "step": 40
    },
    {
      "epoch": 0.3515625,
      "grad_norm": 0.7560004591941833,
      "learning_rate": 0.00019554390563564878,
      "loss": 0.9433,
      "step": 45
    },
    {
      "epoch": 0.390625,
      "grad_norm": 0.5985464453697205,
      "learning_rate": 0.00019488859764089122,
      "loss": 0.8993,
      "step": 50
    },
    {
      "epoch": 0.4296875,
      "grad_norm": 0.7984416484832764,
      "learning_rate": 0.0001942332896461337,
      "loss": 0.9395,
      "step": 55
    },
    {
      "epoch": 0.46875,
      "grad_norm": 0.5905727744102478,
      "learning_rate": 0.00019357798165137616,
      "loss": 0.9598,
      "step": 60
    },
    {
      "epoch": 0.5078125,
      "grad_norm": 0.47333383560180664,
      "learning_rate": 0.00019292267365661863,
      "loss": 0.9031,
      "step": 65
    },
    {
      "epoch": 0.546875,
      "grad_norm": 0.5469959378242493,
      "learning_rate": 0.00019226736566186107,
      "loss": 1.1451,
      "step": 70
    },
    {
      "epoch": 0.5859375,
      "grad_norm": 0.5932920575141907,
      "learning_rate": 0.00019161205766710356,
      "loss": 0.8857,
      "step": 75
    },
    {
      "epoch": 0.625,
      "grad_norm": 0.5339898467063904,
      "learning_rate": 0.000190956749672346,
      "loss": 0.7919,
      "step": 80
    },
    {
      "epoch": 0.6640625,
      "grad_norm": 0.48470577597618103,
      "learning_rate": 0.00019030144167758847,
      "loss": 0.9792,
      "step": 85
    },
    {
      "epoch": 0.703125,
      "grad_norm": 0.4082311689853668,
      "learning_rate": 0.00018964613368283094,
      "loss": 0.9229,
      "step": 90
    },
    {
      "epoch": 0.7421875,
      "grad_norm": 0.6408493518829346,
      "learning_rate": 0.0001889908256880734,
      "loss": 0.7129,
      "step": 95
    },
    {
      "epoch": 0.78125,
      "grad_norm": 0.45103052258491516,
      "learning_rate": 0.00018833551769331587,
      "loss": 0.9419,
      "step": 100
    },
    {
      "epoch": 0.8203125,
      "grad_norm": 0.6506906747817993,
      "learning_rate": 0.00018768020969855834,
      "loss": 0.836,
      "step": 105
    },
    {
      "epoch": 0.859375,
      "grad_norm": 0.6588282585144043,
      "learning_rate": 0.0001870249017038008,
      "loss": 0.885,
      "step": 110
    },
    {
      "epoch": 0.8984375,
      "grad_norm": 0.5844029188156128,
      "learning_rate": 0.00018636959370904325,
      "loss": 0.8624,
      "step": 115
    },
    {
      "epoch": 0.9375,
      "grad_norm": 0.538287878036499,
      "learning_rate": 0.00018571428571428572,
      "loss": 0.9129,
      "step": 120
    },
    {
      "epoch": 0.9765625,
      "grad_norm": 0.3959498107433319,
      "learning_rate": 0.00018505897771952819,
      "loss": 1.0968,
      "step": 125
    },
    {
      "epoch": 1.015625,
      "grad_norm": 0.46326589584350586,
      "learning_rate": 0.00018440366972477065,
      "loss": 0.7563,
      "step": 130
    },
    {
      "epoch": 1.0546875,
      "grad_norm": 0.6401046514511108,
      "learning_rate": 0.00018374836173001312,
      "loss": 0.7854,
      "step": 135
    },
    {
      "epoch": 1.09375,
      "grad_norm": 0.6093747615814209,
      "learning_rate": 0.0001830930537352556,
      "loss": 0.7031,
      "step": 140
    },
    {
      "epoch": 1.1328125,
      "grad_norm": 0.48366278409957886,
      "learning_rate": 0.00018243774574049803,
      "loss": 0.4555,
      "step": 145
    },
    {
      "epoch": 1.171875,
      "grad_norm": 0.5257757902145386,
      "learning_rate": 0.0001817824377457405,
      "loss": 0.6499,
      "step": 150
    },
    {
      "epoch": 1.2109375,
      "grad_norm": 0.7223408818244934,
      "learning_rate": 0.00018112712975098296,
      "loss": 0.5893,
      "step": 155
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.4492509663105011,
      "learning_rate": 0.00018047182175622543,
      "loss": 0.8623,
      "step": 160
    },
    {
      "epoch": 1.2890625,
      "grad_norm": 0.6466461420059204,
      "learning_rate": 0.0001798165137614679,
      "loss": 0.7997,
      "step": 165
    },
    {
      "epoch": 1.328125,
      "grad_norm": 0.6021189093589783,
      "learning_rate": 0.00017916120576671037,
      "loss": 0.7151,
      "step": 170
    },
    {
      "epoch": 1.3671875,
      "grad_norm": 0.43464839458465576,
      "learning_rate": 0.00017850589777195283,
      "loss": 0.626,
      "step": 175
    },
    {
      "epoch": 1.40625,
      "grad_norm": 0.49049654603004456,
      "learning_rate": 0.00017785058977719527,
      "loss": 0.7601,
      "step": 180
    },
    {
      "epoch": 1.4453125,
      "grad_norm": 0.6579009294509888,
      "learning_rate": 0.00017719528178243777,
      "loss": 0.6411,
      "step": 185
    },
    {
      "epoch": 1.484375,
      "grad_norm": 0.7494032382965088,
      "learning_rate": 0.0001765399737876802,
      "loss": 0.589,
      "step": 190
    },
    {
      "epoch": 1.5234375,
      "grad_norm": 0.5080376267433167,
      "learning_rate": 0.00017588466579292268,
      "loss": 0.6848,
      "step": 195
    },
    {
      "epoch": 1.5625,
      "grad_norm": 0.49630534648895264,
      "learning_rate": 0.00017522935779816515,
      "loss": 0.6273,
      "step": 200
    },
    {
      "epoch": 1.6015625,
      "grad_norm": 0.6087814569473267,
      "learning_rate": 0.0001745740498034076,
      "loss": 0.6083,
      "step": 205
    },
    {
      "epoch": 1.640625,
      "grad_norm": 0.607954740524292,
      "learning_rate": 0.00017391874180865005,
      "loss": 0.8664,
      "step": 210
    },
    {
      "epoch": 1.6796875,
      "grad_norm": 0.44959601759910583,
      "learning_rate": 0.00017326343381389255,
      "loss": 0.5538,
      "step": 215
    },
    {
      "epoch": 1.71875,
      "grad_norm": 0.5550365447998047,
      "learning_rate": 0.000172608125819135,
      "loss": 0.4869,
      "step": 220
    },
    {
      "epoch": 1.7578125,
      "grad_norm": 0.6531190872192383,
      "learning_rate": 0.00017195281782437746,
      "loss": 0.7142,
      "step": 225
    },
    {
      "epoch": 1.796875,
      "grad_norm": 0.6506574153900146,
      "learning_rate": 0.00017129750982961995,
      "loss": 0.6573,
      "step": 230
    },
    {
      "epoch": 1.8359375,
      "grad_norm": 0.5597310662269592,
      "learning_rate": 0.0001706422018348624,
      "loss": 0.6261,
      "step": 235
    },
    {
      "epoch": 1.875,
      "grad_norm": 0.5404195189476013,
      "learning_rate": 0.00016998689384010486,
      "loss": 0.5054,
      "step": 240
    },
    {
      "epoch": 1.9140625,
      "grad_norm": 0.611003041267395,
      "learning_rate": 0.00016933158584534733,
      "loss": 0.6949,
      "step": 245
    },
    {
      "epoch": 1.953125,
      "grad_norm": 0.4925813674926758,
      "learning_rate": 0.0001686762778505898,
      "loss": 0.6684,
      "step": 250
    },
    {
      "epoch": 1.9921875,
      "grad_norm": 0.5423117876052856,
      "learning_rate": 0.00016802096985583224,
      "loss": 0.7782,
      "step": 255
    },
    {
      "epoch": 2.03125,
      "grad_norm": 0.4928165078163147,
      "learning_rate": 0.00016736566186107473,
      "loss": 0.4515,
      "step": 260
    },
    {
      "epoch": 2.0703125,
      "grad_norm": 0.6966648101806641,
      "learning_rate": 0.00016671035386631717,
      "loss": 0.4123,
      "step": 265
    },
    {
      "epoch": 2.109375,
      "grad_norm": 0.7156907916069031,
      "learning_rate": 0.00016605504587155964,
      "loss": 0.591,
      "step": 270
    },
    {
      "epoch": 2.1484375,
      "grad_norm": 0.5283113718032837,
      "learning_rate": 0.0001653997378768021,
      "loss": 0.4631,
      "step": 275
    },
    {
      "epoch": 2.1875,
      "grad_norm": 0.7045680284500122,
      "learning_rate": 0.00016474442988204457,
      "loss": 0.4343,
      "step": 280
    },
    {
      "epoch": 2.2265625,
      "grad_norm": 0.7731931805610657,
      "learning_rate": 0.00016408912188728701,
      "loss": 0.4591,
      "step": 285
    },
    {
      "epoch": 2.265625,
      "grad_norm": 0.7124219536781311,
      "learning_rate": 0.0001634338138925295,
      "loss": 0.4534,
      "step": 290
    },
    {
      "epoch": 2.3046875,
      "grad_norm": 0.66915363073349,
      "learning_rate": 0.00016277850589777198,
      "loss": 0.5908,
      "step": 295
    },
    {
      "epoch": 2.34375,
      "grad_norm": 0.6559345722198486,
      "learning_rate": 0.00016212319790301442,
      "loss": 0.5065,
      "step": 300
    },
    {
      "epoch": 2.3828125,
      "grad_norm": 0.776062548160553,
      "learning_rate": 0.00016146788990825688,
      "loss": 0.4548,
      "step": 305
    },
    {
      "epoch": 2.421875,
      "grad_norm": 0.5407435297966003,
      "learning_rate": 0.00016081258191349935,
      "loss": 0.4586,
      "step": 310
    },
    {
      "epoch": 2.4609375,
      "grad_norm": 0.7619644403457642,
      "learning_rate": 0.00016015727391874182,
      "loss": 0.6065,
      "step": 315
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.6659480333328247,
      "learning_rate": 0.0001595019659239843,
      "loss": 0.4892,
      "step": 320
    },
    {
      "epoch": 2.5390625,
      "grad_norm": 0.586632490158081,
      "learning_rate": 0.00015884665792922676,
      "loss": 0.5094,
      "step": 325
    },
    {
      "epoch": 2.578125,
      "grad_norm": 0.6501973867416382,
      "learning_rate": 0.0001581913499344692,
      "loss": 0.5017,
      "step": 330
    },
    {
      "epoch": 2.6171875,
      "grad_norm": 0.5939526557922363,
      "learning_rate": 0.00015753604193971166,
      "loss": 0.357,
      "step": 335
    },
    {
      "epoch": 2.65625,
      "grad_norm": 0.6541431546211243,
      "learning_rate": 0.00015688073394495413,
      "loss": 0.5687,
      "step": 340
    },
    {
      "epoch": 2.6953125,
      "grad_norm": 0.7392444014549255,
      "learning_rate": 0.0001562254259501966,
      "loss": 0.5922,
      "step": 345
    },
    {
      "epoch": 2.734375,
      "grad_norm": 0.7246791124343872,
      "learning_rate": 0.00015557011795543907,
      "loss": 0.388,
      "step": 350
    },
    {
      "epoch": 2.7734375,
      "grad_norm": 1.0469605922698975,
      "learning_rate": 0.00015491480996068153,
      "loss": 0.4028,
      "step": 355
    },
    {
      "epoch": 2.8125,
      "grad_norm": 0.7362831830978394,
      "learning_rate": 0.000154259501965924,
      "loss": 0.6255,
      "step": 360
    },
    {
      "epoch": 2.8515625,
      "grad_norm": 0.6058784127235413,
      "learning_rate": 0.00015360419397116644,
      "loss": 0.5189,
      "step": 365
    },
    {
      "epoch": 2.890625,
      "grad_norm": 0.6939958333969116,
      "learning_rate": 0.00015294888597640894,
      "loss": 0.501,
      "step": 370
    },
    {
      "epoch": 2.9296875,
      "grad_norm": 0.8468016982078552,
      "learning_rate": 0.00015229357798165138,
      "loss": 0.5747,
      "step": 375
    },
    {
      "epoch": 2.96875,
      "grad_norm": 0.6065675616264343,
      "learning_rate": 0.00015163826998689384,
      "loss": 0.3813,
      "step": 380
    },
    {
      "epoch": 3.0078125,
      "grad_norm": 0.5093637108802795,
      "learning_rate": 0.0001509829619921363,
      "loss": 0.5125,
      "step": 385
    },
    {
      "epoch": 3.046875,
      "grad_norm": 0.7048936486244202,
      "learning_rate": 0.00015032765399737878,
      "loss": 0.3766,
      "step": 390
    },
    {
      "epoch": 3.0859375,
      "grad_norm": 1.191715955734253,
      "learning_rate": 0.00014967234600262122,
      "loss": 0.4224,
      "step": 395
    },
    {
      "epoch": 3.125,
      "grad_norm": 0.6624323129653931,
      "learning_rate": 0.00014901703800786372,
      "loss": 0.4212,
      "step": 400
    },
    {
      "epoch": 3.1640625,
      "grad_norm": 1.3422083854675293,
      "learning_rate": 0.00014836173001310616,
      "loss": 0.3319,
      "step": 405
    },
    {
      "epoch": 3.203125,
      "grad_norm": 0.5813243985176086,
      "learning_rate": 0.00014770642201834862,
      "loss": 0.4468,
      "step": 410
    },
    {
      "epoch": 3.2421875,
      "grad_norm": 0.7296664118766785,
      "learning_rate": 0.0001470511140235911,
      "loss": 0.3234,
      "step": 415
    },
    {
      "epoch": 3.28125,
      "grad_norm": 0.7492959499359131,
      "learning_rate": 0.00014639580602883356,
      "loss": 0.3102,
      "step": 420
    },
    {
      "epoch": 3.3203125,
      "grad_norm": 0.994613528251648,
      "learning_rate": 0.000145740498034076,
      "loss": 0.4286,
      "step": 425
    },
    {
      "epoch": 3.359375,
      "grad_norm": 0.9514994025230408,
      "learning_rate": 0.0001450851900393185,
      "loss": 0.3267,
      "step": 430
    },
    {
      "epoch": 3.3984375,
      "grad_norm": 0.7083520293235779,
      "learning_rate": 0.00014442988204456096,
      "loss": 0.4743,
      "step": 435
    },
    {
      "epoch": 3.4375,
      "grad_norm": 0.6460224390029907,
      "learning_rate": 0.0001437745740498034,
      "loss": 0.3128,
      "step": 440
    },
    {
      "epoch": 3.4765625,
      "grad_norm": 0.7406665086746216,
      "learning_rate": 0.0001431192660550459,
      "loss": 0.3762,
      "step": 445
    },
    {
      "epoch": 3.515625,
      "grad_norm": 0.7346643805503845,
      "learning_rate": 0.00014246395806028834,
      "loss": 0.3573,
      "step": 450
    },
    {
      "epoch": 3.5546875,
      "grad_norm": 0.3775249123573303,
      "learning_rate": 0.0001418086500655308,
      "loss": 0.3488,
      "step": 455
    },
    {
      "epoch": 3.59375,
      "grad_norm": 0.9807206988334656,
      "learning_rate": 0.00014115334207077327,
      "loss": 0.2672,
      "step": 460
    },
    {
      "epoch": 3.6328125,
      "grad_norm": 0.5825705528259277,
      "learning_rate": 0.00014049803407601574,
      "loss": 0.2388,
      "step": 465
    },
    {
      "epoch": 3.671875,
      "grad_norm": 1.1724300384521484,
      "learning_rate": 0.00013984272608125818,
      "loss": 0.2998,
      "step": 470
    },
    {
      "epoch": 3.7109375,
      "grad_norm": 0.6543852090835571,
      "learning_rate": 0.00013918741808650068,
      "loss": 0.3704,
      "step": 475
    },
    {
      "epoch": 3.75,
      "grad_norm": 0.6687126755714417,
      "learning_rate": 0.00013853211009174312,
      "loss": 0.3478,
      "step": 480
    },
    {
      "epoch": 3.7890625,
      "grad_norm": 0.8228131532669067,
      "learning_rate": 0.00013787680209698558,
      "loss": 0.3139,
      "step": 485
    },
    {
      "epoch": 3.828125,
      "grad_norm": 0.65690678358078,
      "learning_rate": 0.00013722149410222805,
      "loss": 0.4469,
      "step": 490
    },
    {
      "epoch": 3.8671875,
      "grad_norm": 0.7769365906715393,
      "learning_rate": 0.00013656618610747052,
      "loss": 0.4247,
      "step": 495
    },
    {
      "epoch": 3.90625,
      "grad_norm": 0.7008833289146423,
      "learning_rate": 0.000135910878112713,
      "loss": 0.4327,
      "step": 500
    }
  ],
  "logging_steps": 5,
  "max_steps": 1536,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 12,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.829508424091648e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}