| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 12.0, |
| "eval_steps": 500, |
| "global_step": 1536, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0390625, |
| "grad_norm": 1.6853455305099487, |
| "learning_rate": 8e-05, |
| "loss": 2.2667, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.078125, |
| "grad_norm": 1.127389907836914, |
| "learning_rate": 0.00018, |
| "loss": 1.2747, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.1171875, |
| "grad_norm": 0.6251115798950195, |
| "learning_rate": 0.00019947575360419398, |
| "loss": 1.2874, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.15625, |
| "grad_norm": 0.8021469116210938, |
| "learning_rate": 0.00019882044560943645, |
| "loss": 1.1816, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.1953125, |
| "grad_norm": 0.7008321285247803, |
| "learning_rate": 0.0001981651376146789, |
| "loss": 0.9091, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.234375, |
| "grad_norm": 0.8806556463241577, |
| "learning_rate": 0.00019750982961992138, |
| "loss": 1.18, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.2734375, |
| "grad_norm": 0.4898707866668701, |
| "learning_rate": 0.00019685452162516385, |
| "loss": 0.8613, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.3125, |
| "grad_norm": 0.818252444267273, |
| "learning_rate": 0.0001961992136304063, |
| "loss": 0.9613, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.3515625, |
| "grad_norm": 0.7560004591941833, |
| "learning_rate": 0.00019554390563564878, |
| "loss": 0.9433, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.390625, |
| "grad_norm": 0.5985464453697205, |
| "learning_rate": 0.00019488859764089122, |
| "loss": 0.8993, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.4296875, |
| "grad_norm": 0.7984416484832764, |
| "learning_rate": 0.0001942332896461337, |
| "loss": 0.9395, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.46875, |
| "grad_norm": 0.5905727744102478, |
| "learning_rate": 0.00019357798165137616, |
| "loss": 0.9598, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.5078125, |
| "grad_norm": 0.47333383560180664, |
| "learning_rate": 0.00019292267365661863, |
| "loss": 0.9031, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.546875, |
| "grad_norm": 0.5469959378242493, |
| "learning_rate": 0.00019226736566186107, |
| "loss": 1.1451, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.5859375, |
| "grad_norm": 0.5932920575141907, |
| "learning_rate": 0.00019161205766710356, |
| "loss": 0.8857, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.625, |
| "grad_norm": 0.5339898467063904, |
| "learning_rate": 0.000190956749672346, |
| "loss": 0.7919, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.6640625, |
| "grad_norm": 0.48470577597618103, |
| "learning_rate": 0.00019030144167758847, |
| "loss": 0.9792, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.703125, |
| "grad_norm": 0.4082311689853668, |
| "learning_rate": 0.00018964613368283094, |
| "loss": 0.9229, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.7421875, |
| "grad_norm": 0.6408493518829346, |
| "learning_rate": 0.0001889908256880734, |
| "loss": 0.7129, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.78125, |
| "grad_norm": 0.45103052258491516, |
| "learning_rate": 0.00018833551769331587, |
| "loss": 0.9419, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.8203125, |
| "grad_norm": 0.6506906747817993, |
| "learning_rate": 0.00018768020969855834, |
| "loss": 0.836, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.859375, |
| "grad_norm": 0.6588282585144043, |
| "learning_rate": 0.0001870249017038008, |
| "loss": 0.885, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.8984375, |
| "grad_norm": 0.5844029188156128, |
| "learning_rate": 0.00018636959370904325, |
| "loss": 0.8624, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.9375, |
| "grad_norm": 0.538287878036499, |
| "learning_rate": 0.00018571428571428572, |
| "loss": 0.9129, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.9765625, |
| "grad_norm": 0.3959498107433319, |
| "learning_rate": 0.00018505897771952819, |
| "loss": 1.0968, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.015625, |
| "grad_norm": 0.46326589584350586, |
| "learning_rate": 0.00018440366972477065, |
| "loss": 0.7563, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.0546875, |
| "grad_norm": 0.6401046514511108, |
| "learning_rate": 0.00018374836173001312, |
| "loss": 0.7854, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.09375, |
| "grad_norm": 0.6093747615814209, |
| "learning_rate": 0.0001830930537352556, |
| "loss": 0.7031, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.1328125, |
| "grad_norm": 0.48366278409957886, |
| "learning_rate": 0.00018243774574049803, |
| "loss": 0.4555, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.171875, |
| "grad_norm": 0.5257757902145386, |
| "learning_rate": 0.0001817824377457405, |
| "loss": 0.6499, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.2109375, |
| "grad_norm": 0.7223408818244934, |
| "learning_rate": 0.00018112712975098296, |
| "loss": 0.5893, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 0.4492509663105011, |
| "learning_rate": 0.00018047182175622543, |
| "loss": 0.8623, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.2890625, |
| "grad_norm": 0.6466461420059204, |
| "learning_rate": 0.0001798165137614679, |
| "loss": 0.7997, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.328125, |
| "grad_norm": 0.6021189093589783, |
| "learning_rate": 0.00017916120576671037, |
| "loss": 0.7151, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.3671875, |
| "grad_norm": 0.43464839458465576, |
| "learning_rate": 0.00017850589777195283, |
| "loss": 0.626, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.40625, |
| "grad_norm": 0.49049654603004456, |
| "learning_rate": 0.00017785058977719527, |
| "loss": 0.7601, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.4453125, |
| "grad_norm": 0.6579009294509888, |
| "learning_rate": 0.00017719528178243777, |
| "loss": 0.6411, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.484375, |
| "grad_norm": 0.7494032382965088, |
| "learning_rate": 0.0001765399737876802, |
| "loss": 0.589, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.5234375, |
| "grad_norm": 0.5080376267433167, |
| "learning_rate": 0.00017588466579292268, |
| "loss": 0.6848, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.5625, |
| "grad_norm": 0.49630534648895264, |
| "learning_rate": 0.00017522935779816515, |
| "loss": 0.6273, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.6015625, |
| "grad_norm": 0.6087814569473267, |
| "learning_rate": 0.0001745740498034076, |
| "loss": 0.6083, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.640625, |
| "grad_norm": 0.607954740524292, |
| "learning_rate": 0.00017391874180865005, |
| "loss": 0.8664, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.6796875, |
| "grad_norm": 0.44959601759910583, |
| "learning_rate": 0.00017326343381389255, |
| "loss": 0.5538, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.71875, |
| "grad_norm": 0.5550365447998047, |
| "learning_rate": 0.000172608125819135, |
| "loss": 0.4869, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.7578125, |
| "grad_norm": 0.6531190872192383, |
| "learning_rate": 0.00017195281782437746, |
| "loss": 0.7142, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.796875, |
| "grad_norm": 0.6506574153900146, |
| "learning_rate": 0.00017129750982961995, |
| "loss": 0.6573, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.8359375, |
| "grad_norm": 0.5597310662269592, |
| "learning_rate": 0.0001706422018348624, |
| "loss": 0.6261, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.875, |
| "grad_norm": 0.5404195189476013, |
| "learning_rate": 0.00016998689384010486, |
| "loss": 0.5054, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.9140625, |
| "grad_norm": 0.611003041267395, |
| "learning_rate": 0.00016933158584534733, |
| "loss": 0.6949, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.953125, |
| "grad_norm": 0.4925813674926758, |
| "learning_rate": 0.0001686762778505898, |
| "loss": 0.6684, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.9921875, |
| "grad_norm": 0.5423117876052856, |
| "learning_rate": 0.00016802096985583224, |
| "loss": 0.7782, |
| "step": 255 |
| }, |
| { |
| "epoch": 2.03125, |
| "grad_norm": 0.4928165078163147, |
| "learning_rate": 0.00016736566186107473, |
| "loss": 0.4515, |
| "step": 260 |
| }, |
| { |
| "epoch": 2.0703125, |
| "grad_norm": 0.6966648101806641, |
| "learning_rate": 0.00016671035386631717, |
| "loss": 0.4123, |
| "step": 265 |
| }, |
| { |
| "epoch": 2.109375, |
| "grad_norm": 0.7156907916069031, |
| "learning_rate": 0.00016605504587155964, |
| "loss": 0.591, |
| "step": 270 |
| }, |
| { |
| "epoch": 2.1484375, |
| "grad_norm": 0.5283113718032837, |
| "learning_rate": 0.0001653997378768021, |
| "loss": 0.4631, |
| "step": 275 |
| }, |
| { |
| "epoch": 2.1875, |
| "grad_norm": 0.7045680284500122, |
| "learning_rate": 0.00016474442988204457, |
| "loss": 0.4343, |
| "step": 280 |
| }, |
| { |
| "epoch": 2.2265625, |
| "grad_norm": 0.7731931805610657, |
| "learning_rate": 0.00016408912188728701, |
| "loss": 0.4591, |
| "step": 285 |
| }, |
| { |
| "epoch": 2.265625, |
| "grad_norm": 0.7124219536781311, |
| "learning_rate": 0.0001634338138925295, |
| "loss": 0.4534, |
| "step": 290 |
| }, |
| { |
| "epoch": 2.3046875, |
| "grad_norm": 0.66915363073349, |
| "learning_rate": 0.00016277850589777198, |
| "loss": 0.5908, |
| "step": 295 |
| }, |
| { |
| "epoch": 2.34375, |
| "grad_norm": 0.6559345722198486, |
| "learning_rate": 0.00016212319790301442, |
| "loss": 0.5065, |
| "step": 300 |
| }, |
| { |
| "epoch": 2.3828125, |
| "grad_norm": 0.776062548160553, |
| "learning_rate": 0.00016146788990825688, |
| "loss": 0.4548, |
| "step": 305 |
| }, |
| { |
| "epoch": 2.421875, |
| "grad_norm": 0.5407435297966003, |
| "learning_rate": 0.00016081258191349935, |
| "loss": 0.4586, |
| "step": 310 |
| }, |
| { |
| "epoch": 2.4609375, |
| "grad_norm": 0.7619644403457642, |
| "learning_rate": 0.00016015727391874182, |
| "loss": 0.6065, |
| "step": 315 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 0.6659480333328247, |
| "learning_rate": 0.0001595019659239843, |
| "loss": 0.4892, |
| "step": 320 |
| }, |
| { |
| "epoch": 2.5390625, |
| "grad_norm": 0.586632490158081, |
| "learning_rate": 0.00015884665792922676, |
| "loss": 0.5094, |
| "step": 325 |
| }, |
| { |
| "epoch": 2.578125, |
| "grad_norm": 0.6501973867416382, |
| "learning_rate": 0.0001581913499344692, |
| "loss": 0.5017, |
| "step": 330 |
| }, |
| { |
| "epoch": 2.6171875, |
| "grad_norm": 0.5939526557922363, |
| "learning_rate": 0.00015753604193971166, |
| "loss": 0.357, |
| "step": 335 |
| }, |
| { |
| "epoch": 2.65625, |
| "grad_norm": 0.6541431546211243, |
| "learning_rate": 0.00015688073394495413, |
| "loss": 0.5687, |
| "step": 340 |
| }, |
| { |
| "epoch": 2.6953125, |
| "grad_norm": 0.7392444014549255, |
| "learning_rate": 0.0001562254259501966, |
| "loss": 0.5922, |
| "step": 345 |
| }, |
| { |
| "epoch": 2.734375, |
| "grad_norm": 0.7246791124343872, |
| "learning_rate": 0.00015557011795543907, |
| "loss": 0.388, |
| "step": 350 |
| }, |
| { |
| "epoch": 2.7734375, |
| "grad_norm": 1.0469605922698975, |
| "learning_rate": 0.00015491480996068153, |
| "loss": 0.4028, |
| "step": 355 |
| }, |
| { |
| "epoch": 2.8125, |
| "grad_norm": 0.7362831830978394, |
| "learning_rate": 0.000154259501965924, |
| "loss": 0.6255, |
| "step": 360 |
| }, |
| { |
| "epoch": 2.8515625, |
| "grad_norm": 0.6058784127235413, |
| "learning_rate": 0.00015360419397116644, |
| "loss": 0.5189, |
| "step": 365 |
| }, |
| { |
| "epoch": 2.890625, |
| "grad_norm": 0.6939958333969116, |
| "learning_rate": 0.00015294888597640894, |
| "loss": 0.501, |
| "step": 370 |
| }, |
| { |
| "epoch": 2.9296875, |
| "grad_norm": 0.8468016982078552, |
| "learning_rate": 0.00015229357798165138, |
| "loss": 0.5747, |
| "step": 375 |
| }, |
| { |
| "epoch": 2.96875, |
| "grad_norm": 0.6065675616264343, |
| "learning_rate": 0.00015163826998689384, |
| "loss": 0.3813, |
| "step": 380 |
| }, |
| { |
| "epoch": 3.0078125, |
| "grad_norm": 0.5093637108802795, |
| "learning_rate": 0.0001509829619921363, |
| "loss": 0.5125, |
| "step": 385 |
| }, |
| { |
| "epoch": 3.046875, |
| "grad_norm": 0.7048936486244202, |
| "learning_rate": 0.00015032765399737878, |
| "loss": 0.3766, |
| "step": 390 |
| }, |
| { |
| "epoch": 3.0859375, |
| "grad_norm": 1.191715955734253, |
| "learning_rate": 0.00014967234600262122, |
| "loss": 0.4224, |
| "step": 395 |
| }, |
| { |
| "epoch": 3.125, |
| "grad_norm": 0.6624323129653931, |
| "learning_rate": 0.00014901703800786372, |
| "loss": 0.4212, |
| "step": 400 |
| }, |
| { |
| "epoch": 3.1640625, |
| "grad_norm": 1.3422083854675293, |
| "learning_rate": 0.00014836173001310616, |
| "loss": 0.3319, |
| "step": 405 |
| }, |
| { |
| "epoch": 3.203125, |
| "grad_norm": 0.5813243985176086, |
| "learning_rate": 0.00014770642201834862, |
| "loss": 0.4468, |
| "step": 410 |
| }, |
| { |
| "epoch": 3.2421875, |
| "grad_norm": 0.7296664118766785, |
| "learning_rate": 0.0001470511140235911, |
| "loss": 0.3234, |
| "step": 415 |
| }, |
| { |
| "epoch": 3.28125, |
| "grad_norm": 0.7492959499359131, |
| "learning_rate": 0.00014639580602883356, |
| "loss": 0.3102, |
| "step": 420 |
| }, |
| { |
| "epoch": 3.3203125, |
| "grad_norm": 0.994613528251648, |
| "learning_rate": 0.000145740498034076, |
| "loss": 0.4286, |
| "step": 425 |
| }, |
| { |
| "epoch": 3.359375, |
| "grad_norm": 0.9514994025230408, |
| "learning_rate": 0.0001450851900393185, |
| "loss": 0.3267, |
| "step": 430 |
| }, |
| { |
| "epoch": 3.3984375, |
| "grad_norm": 0.7083520293235779, |
| "learning_rate": 0.00014442988204456096, |
| "loss": 0.4743, |
| "step": 435 |
| }, |
| { |
| "epoch": 3.4375, |
| "grad_norm": 0.6460224390029907, |
| "learning_rate": 0.0001437745740498034, |
| "loss": 0.3128, |
| "step": 440 |
| }, |
| { |
| "epoch": 3.4765625, |
| "grad_norm": 0.7406665086746216, |
| "learning_rate": 0.0001431192660550459, |
| "loss": 0.3762, |
| "step": 445 |
| }, |
| { |
| "epoch": 3.515625, |
| "grad_norm": 0.7346643805503845, |
| "learning_rate": 0.00014246395806028834, |
| "loss": 0.3573, |
| "step": 450 |
| }, |
| { |
| "epoch": 3.5546875, |
| "grad_norm": 0.3775249123573303, |
| "learning_rate": 0.0001418086500655308, |
| "loss": 0.3488, |
| "step": 455 |
| }, |
| { |
| "epoch": 3.59375, |
| "grad_norm": 0.9807206988334656, |
| "learning_rate": 0.00014115334207077327, |
| "loss": 0.2672, |
| "step": 460 |
| }, |
| { |
| "epoch": 3.6328125, |
| "grad_norm": 0.5825705528259277, |
| "learning_rate": 0.00014049803407601574, |
| "loss": 0.2388, |
| "step": 465 |
| }, |
| { |
| "epoch": 3.671875, |
| "grad_norm": 1.1724300384521484, |
| "learning_rate": 0.00013984272608125818, |
| "loss": 0.2998, |
| "step": 470 |
| }, |
| { |
| "epoch": 3.7109375, |
| "grad_norm": 0.6543852090835571, |
| "learning_rate": 0.00013918741808650068, |
| "loss": 0.3704, |
| "step": 475 |
| }, |
| { |
| "epoch": 3.75, |
| "grad_norm": 0.6687126755714417, |
| "learning_rate": 0.00013853211009174312, |
| "loss": 0.3478, |
| "step": 480 |
| }, |
| { |
| "epoch": 3.7890625, |
| "grad_norm": 0.8228131532669067, |
| "learning_rate": 0.00013787680209698558, |
| "loss": 0.3139, |
| "step": 485 |
| }, |
| { |
| "epoch": 3.828125, |
| "grad_norm": 0.65690678358078, |
| "learning_rate": 0.00013722149410222805, |
| "loss": 0.4469, |
| "step": 490 |
| }, |
| { |
| "epoch": 3.8671875, |
| "grad_norm": 0.7769365906715393, |
| "learning_rate": 0.00013656618610747052, |
| "loss": 0.4247, |
| "step": 495 |
| }, |
| { |
| "epoch": 3.90625, |
| "grad_norm": 0.7008833289146423, |
| "learning_rate": 0.000135910878112713, |
| "loss": 0.4327, |
| "step": 500 |
| }, |
| { |
| "epoch": 3.9453125, |
| "grad_norm": 0.6874434947967529, |
| "learning_rate": 0.00013525557011795545, |
| "loss": 0.5225, |
| "step": 505 |
| }, |
| { |
| "epoch": 3.984375, |
| "grad_norm": 0.4368758499622345, |
| "learning_rate": 0.00013460026212319792, |
| "loss": 0.3697, |
| "step": 510 |
| }, |
| { |
| "epoch": 4.0234375, |
| "grad_norm": 1.0020313262939453, |
| "learning_rate": 0.00013394495412844036, |
| "loss": 0.2611, |
| "step": 515 |
| }, |
| { |
| "epoch": 4.0625, |
| "grad_norm": 0.8647730946540833, |
| "learning_rate": 0.00013328964613368286, |
| "loss": 0.2297, |
| "step": 520 |
| }, |
| { |
| "epoch": 4.1015625, |
| "grad_norm": 1.0684905052185059, |
| "learning_rate": 0.0001326343381389253, |
| "loss": 0.2594, |
| "step": 525 |
| }, |
| { |
| "epoch": 4.140625, |
| "grad_norm": 0.6783558130264282, |
| "learning_rate": 0.00013197903014416777, |
| "loss": 0.2302, |
| "step": 530 |
| }, |
| { |
| "epoch": 4.1796875, |
| "grad_norm": 0.7600467205047607, |
| "learning_rate": 0.00013132372214941023, |
| "loss": 0.1469, |
| "step": 535 |
| }, |
| { |
| "epoch": 4.21875, |
| "grad_norm": 0.9370886087417603, |
| "learning_rate": 0.0001306684141546527, |
| "loss": 0.3683, |
| "step": 540 |
| }, |
| { |
| "epoch": 4.2578125, |
| "grad_norm": 0.6307783722877502, |
| "learning_rate": 0.00013001310615989514, |
| "loss": 0.1998, |
| "step": 545 |
| }, |
| { |
| "epoch": 4.296875, |
| "grad_norm": 0.9554206728935242, |
| "learning_rate": 0.0001293577981651376, |
| "loss": 0.3081, |
| "step": 550 |
| }, |
| { |
| "epoch": 4.3359375, |
| "grad_norm": 0.8178610801696777, |
| "learning_rate": 0.00012870249017038008, |
| "loss": 0.2767, |
| "step": 555 |
| }, |
| { |
| "epoch": 4.375, |
| "grad_norm": 0.6448714137077332, |
| "learning_rate": 0.00012804718217562254, |
| "loss": 0.2248, |
| "step": 560 |
| }, |
| { |
| "epoch": 4.4140625, |
| "grad_norm": 0.9795539379119873, |
| "learning_rate": 0.000127391874180865, |
| "loss": 0.3152, |
| "step": 565 |
| }, |
| { |
| "epoch": 4.453125, |
| "grad_norm": 0.7778314352035522, |
| "learning_rate": 0.00012673656618610748, |
| "loss": 0.3028, |
| "step": 570 |
| }, |
| { |
| "epoch": 4.4921875, |
| "grad_norm": 0.9457613825798035, |
| "learning_rate": 0.00012608125819134995, |
| "loss": 0.309, |
| "step": 575 |
| }, |
| { |
| "epoch": 4.53125, |
| "grad_norm": 0.7530558705329895, |
| "learning_rate": 0.0001254259501965924, |
| "loss": 0.3125, |
| "step": 580 |
| }, |
| { |
| "epoch": 4.5703125, |
| "grad_norm": 0.7017265558242798, |
| "learning_rate": 0.00012477064220183488, |
| "loss": 0.273, |
| "step": 585 |
| }, |
| { |
| "epoch": 4.609375, |
| "grad_norm": 0.8178383708000183, |
| "learning_rate": 0.00012411533420707732, |
| "loss": 0.3205, |
| "step": 590 |
| }, |
| { |
| "epoch": 4.6484375, |
| "grad_norm": 1.5198026895523071, |
| "learning_rate": 0.0001234600262123198, |
| "loss": 0.3607, |
| "step": 595 |
| }, |
| { |
| "epoch": 4.6875, |
| "grad_norm": 0.8270261883735657, |
| "learning_rate": 0.00012280471821756226, |
| "loss": 0.3297, |
| "step": 600 |
| }, |
| { |
| "epoch": 4.7265625, |
| "grad_norm": 0.8817920088768005, |
| "learning_rate": 0.00012214941022280473, |
| "loss": 0.1601, |
| "step": 605 |
| }, |
| { |
| "epoch": 4.765625, |
| "grad_norm": 0.9366243481636047, |
| "learning_rate": 0.00012149410222804718, |
| "loss": 0.1668, |
| "step": 610 |
| }, |
| { |
| "epoch": 4.8046875, |
| "grad_norm": 0.706917405128479, |
| "learning_rate": 0.00012083879423328965, |
| "loss": 0.2827, |
| "step": 615 |
| }, |
| { |
| "epoch": 4.84375, |
| "grad_norm": 0.8291599154472351, |
| "learning_rate": 0.0001201834862385321, |
| "loss": 0.2726, |
| "step": 620 |
| }, |
| { |
| "epoch": 4.8828125, |
| "grad_norm": 0.6848894357681274, |
| "learning_rate": 0.00011952817824377458, |
| "loss": 0.1808, |
| "step": 625 |
| }, |
| { |
| "epoch": 4.921875, |
| "grad_norm": 0.9057679176330566, |
| "learning_rate": 0.00011887287024901705, |
| "loss": 0.3169, |
| "step": 630 |
| }, |
| { |
| "epoch": 4.9609375, |
| "grad_norm": 0.570704460144043, |
| "learning_rate": 0.0001182175622542595, |
| "loss": 0.3689, |
| "step": 635 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.8146092295646667, |
| "learning_rate": 0.00011756225425950199, |
| "loss": 0.3264, |
| "step": 640 |
| }, |
| { |
| "epoch": 5.0390625, |
| "grad_norm": 0.5888718366622925, |
| "learning_rate": 0.00011690694626474443, |
| "loss": 0.167, |
| "step": 645 |
| }, |
| { |
| "epoch": 5.078125, |
| "grad_norm": 0.9465442299842834, |
| "learning_rate": 0.00011625163826998691, |
| "loss": 0.1873, |
| "step": 650 |
| }, |
| { |
| "epoch": 5.1171875, |
| "grad_norm": 0.5365155339241028, |
| "learning_rate": 0.00011559633027522936, |
| "loss": 0.1689, |
| "step": 655 |
| }, |
| { |
| "epoch": 5.15625, |
| "grad_norm": 0.9071202278137207, |
| "learning_rate": 0.00011494102228047183, |
| "loss": 0.1666, |
| "step": 660 |
| }, |
| { |
| "epoch": 5.1953125, |
| "grad_norm": 0.7092397212982178, |
| "learning_rate": 0.00011428571428571428, |
| "loss": 0.1557, |
| "step": 665 |
| }, |
| { |
| "epoch": 5.234375, |
| "grad_norm": 0.7074161767959595, |
| "learning_rate": 0.00011363040629095676, |
| "loss": 0.2316, |
| "step": 670 |
| }, |
| { |
| "epoch": 5.2734375, |
| "grad_norm": 0.9464021325111389, |
| "learning_rate": 0.0001129750982961992, |
| "loss": 0.2005, |
| "step": 675 |
| }, |
| { |
| "epoch": 5.3125, |
| "grad_norm": 0.6366726160049438, |
| "learning_rate": 0.00011231979030144169, |
| "loss": 0.1569, |
| "step": 680 |
| }, |
| { |
| "epoch": 5.3515625, |
| "grad_norm": 0.6061714291572571, |
| "learning_rate": 0.00011166448230668414, |
| "loss": 0.2708, |
| "step": 685 |
| }, |
| { |
| "epoch": 5.390625, |
| "grad_norm": 0.8460837602615356, |
| "learning_rate": 0.00011100917431192661, |
| "loss": 0.1406, |
| "step": 690 |
| }, |
| { |
| "epoch": 5.4296875, |
| "grad_norm": 0.7344151139259338, |
| "learning_rate": 0.00011035386631716909, |
| "loss": 0.1954, |
| "step": 695 |
| }, |
| { |
| "epoch": 5.46875, |
| "grad_norm": 1.1099109649658203, |
| "learning_rate": 0.00010969855832241154, |
| "loss": 0.1321, |
| "step": 700 |
| }, |
| { |
| "epoch": 5.5078125, |
| "grad_norm": 0.8708857297897339, |
| "learning_rate": 0.00010904325032765401, |
| "loss": 0.2918, |
| "step": 705 |
| }, |
| { |
| "epoch": 5.546875, |
| "grad_norm": 0.8755677938461304, |
| "learning_rate": 0.00010838794233289647, |
| "loss": 0.2321, |
| "step": 710 |
| }, |
| { |
| "epoch": 5.5859375, |
| "grad_norm": 0.7879914045333862, |
| "learning_rate": 0.00010773263433813893, |
| "loss": 0.2525, |
| "step": 715 |
| }, |
| { |
| "epoch": 5.625, |
| "grad_norm": 0.8013678193092346, |
| "learning_rate": 0.00010707732634338139, |
| "loss": 0.1955, |
| "step": 720 |
| }, |
| { |
| "epoch": 5.6640625, |
| "grad_norm": 0.6184900999069214, |
| "learning_rate": 0.00010642201834862387, |
| "loss": 0.1356, |
| "step": 725 |
| }, |
| { |
| "epoch": 5.703125, |
| "grad_norm": 1.0018306970596313, |
| "learning_rate": 0.00010576671035386632, |
| "loss": 0.2501, |
| "step": 730 |
| }, |
| { |
| "epoch": 5.7421875, |
| "grad_norm": 0.9539072513580322, |
| "learning_rate": 0.00010511140235910879, |
| "loss": 0.3376, |
| "step": 735 |
| }, |
| { |
| "epoch": 5.78125, |
| "grad_norm": 0.7880743741989136, |
| "learning_rate": 0.00010445609436435124, |
| "loss": 0.176, |
| "step": 740 |
| }, |
| { |
| "epoch": 5.8203125, |
| "grad_norm": 0.7900522351264954, |
| "learning_rate": 0.00010380078636959371, |
| "loss": 0.2542, |
| "step": 745 |
| }, |
| { |
| "epoch": 5.859375, |
| "grad_norm": 0.7261202931404114, |
| "learning_rate": 0.00010314547837483617, |
| "loss": 0.1128, |
| "step": 750 |
| }, |
| { |
| "epoch": 5.8984375, |
| "grad_norm": 0.8230142593383789, |
| "learning_rate": 0.00010249017038007865, |
| "loss": 0.2231, |
| "step": 755 |
| }, |
| { |
| "epoch": 5.9375, |
| "grad_norm": 0.3808448612689972, |
| "learning_rate": 0.0001018348623853211, |
| "loss": 0.151, |
| "step": 760 |
| }, |
| { |
| "epoch": 5.9765625, |
| "grad_norm": 0.4841325581073761, |
| "learning_rate": 0.00010117955439056357, |
| "loss": 0.1336, |
| "step": 765 |
| }, |
| { |
| "epoch": 6.015625, |
| "grad_norm": 0.29059118032455444, |
| "learning_rate": 0.00010052424639580605, |
| "loss": 0.2318, |
| "step": 770 |
| }, |
| { |
| "epoch": 6.0546875, |
| "grad_norm": 0.6378641724586487, |
| "learning_rate": 9.986893840104849e-05, |
| "loss": 0.1794, |
| "step": 775 |
| }, |
| { |
| "epoch": 6.09375, |
| "grad_norm": 1.158392310142517, |
| "learning_rate": 9.921363040629096e-05, |
| "loss": 0.1575, |
| "step": 780 |
| }, |
| { |
| "epoch": 6.1328125, |
| "grad_norm": 0.4778974652290344, |
| "learning_rate": 9.855832241153343e-05, |
| "loss": 0.1201, |
| "step": 785 |
| }, |
| { |
| "epoch": 6.171875, |
| "grad_norm": 0.4302467107772827, |
| "learning_rate": 9.790301441677588e-05, |
| "loss": 0.0934, |
| "step": 790 |
| }, |
| { |
| "epoch": 6.2109375, |
| "grad_norm": 0.5038356781005859, |
| "learning_rate": 9.724770642201836e-05, |
| "loss": 0.1497, |
| "step": 795 |
| }, |
| { |
| "epoch": 6.25, |
| "grad_norm": 0.6529866456985474, |
| "learning_rate": 9.659239842726083e-05, |
| "loss": 0.1439, |
| "step": 800 |
| }, |
| { |
| "epoch": 6.2890625, |
| "grad_norm": 0.8134426474571228, |
| "learning_rate": 9.593709043250328e-05, |
| "loss": 0.1469, |
| "step": 805 |
| }, |
| { |
| "epoch": 6.328125, |
| "grad_norm": 1.0741759538650513, |
| "learning_rate": 9.528178243774575e-05, |
| "loss": 0.0793, |
| "step": 810 |
| }, |
| { |
| "epoch": 6.3671875, |
| "grad_norm": 0.37064865231513977, |
| "learning_rate": 9.462647444298822e-05, |
| "loss": 0.0891, |
| "step": 815 |
| }, |
| { |
| "epoch": 6.40625, |
| "grad_norm": 0.6766513586044312, |
| "learning_rate": 9.397116644823067e-05, |
| "loss": 0.2092, |
| "step": 820 |
| }, |
| { |
| "epoch": 6.4453125, |
| "grad_norm": 0.45673248171806335, |
| "learning_rate": 9.331585845347314e-05, |
| "loss": 0.1063, |
| "step": 825 |
| }, |
| { |
| "epoch": 6.484375, |
| "grad_norm": 0.6083511710166931, |
| "learning_rate": 9.266055045871561e-05, |
| "loss": 0.0958, |
| "step": 830 |
| }, |
| { |
| "epoch": 6.5234375, |
| "grad_norm": 0.954582691192627, |
| "learning_rate": 9.200524246395806e-05, |
| "loss": 0.1522, |
| "step": 835 |
| }, |
| { |
| "epoch": 6.5625, |
| "grad_norm": 0.6275842785835266, |
| "learning_rate": 9.134993446920053e-05, |
| "loss": 0.1387, |
| "step": 840 |
| }, |
| { |
| "epoch": 6.6015625, |
| "grad_norm": 0.826816976070404, |
| "learning_rate": 9.069462647444298e-05, |
| "loss": 0.1997, |
| "step": 845 |
| }, |
| { |
| "epoch": 6.640625, |
| "grad_norm": 0.5855023264884949, |
| "learning_rate": 9.003931847968545e-05, |
| "loss": 0.1423, |
| "step": 850 |
| }, |
| { |
| "epoch": 6.6796875, |
| "grad_norm": 0.37608104944229126, |
| "learning_rate": 8.938401048492792e-05, |
| "loss": 0.1563, |
| "step": 855 |
| }, |
| { |
| "epoch": 6.71875, |
| "grad_norm": 1.1068248748779297, |
| "learning_rate": 8.872870249017037e-05, |
| "loss": 0.1066, |
| "step": 860 |
| }, |
| { |
| "epoch": 6.7578125, |
| "grad_norm": 0.8714601397514343, |
| "learning_rate": 8.807339449541285e-05, |
| "loss": 0.1076, |
| "step": 865 |
| }, |
| { |
| "epoch": 6.796875, |
| "grad_norm": 0.6995155215263367, |
| "learning_rate": 8.741808650065532e-05, |
| "loss": 0.0935, |
| "step": 870 |
| }, |
| { |
| "epoch": 6.8359375, |
| "grad_norm": 0.895413875579834, |
| "learning_rate": 8.676277850589778e-05, |
| "loss": 0.1802, |
| "step": 875 |
| }, |
| { |
| "epoch": 6.875, |
| "grad_norm": 0.8599961400032043, |
| "learning_rate": 8.610747051114024e-05, |
| "loss": 0.2149, |
| "step": 880 |
| }, |
| { |
| "epoch": 6.9140625, |
| "grad_norm": 0.6649323105812073, |
| "learning_rate": 8.545216251638271e-05, |
| "loss": 0.1272, |
| "step": 885 |
| }, |
| { |
| "epoch": 6.953125, |
| "grad_norm": 0.6272252798080444, |
| "learning_rate": 8.479685452162516e-05, |
| "loss": 0.1269, |
| "step": 890 |
| }, |
| { |
| "epoch": 6.9921875, |
| "grad_norm": 0.837714672088623, |
| "learning_rate": 8.414154652686763e-05, |
| "loss": 0.1932, |
| "step": 895 |
| }, |
| { |
| "epoch": 7.03125, |
| "grad_norm": 0.7026847004890442, |
| "learning_rate": 8.34862385321101e-05, |
| "loss": 0.1475, |
| "step": 900 |
| }, |
| { |
| "epoch": 7.0703125, |
| "grad_norm": 0.24409687519073486, |
| "learning_rate": 8.283093053735255e-05, |
| "loss": 0.0635, |
| "step": 905 |
| }, |
| { |
| "epoch": 7.109375, |
| "grad_norm": 0.26595592498779297, |
| "learning_rate": 8.217562254259502e-05, |
| "loss": 0.0448, |
| "step": 910 |
| }, |
| { |
| "epoch": 7.1484375, |
| "grad_norm": 0.6503292322158813, |
| "learning_rate": 8.152031454783749e-05, |
| "loss": 0.0652, |
| "step": 915 |
| }, |
| { |
| "epoch": 7.1875, |
| "grad_norm": 1.0240068435668945, |
| "learning_rate": 8.086500655307994e-05, |
| "loss": 0.077, |
| "step": 920 |
| }, |
| { |
| "epoch": 7.2265625, |
| "grad_norm": 0.36204642057418823, |
| "learning_rate": 8.020969855832241e-05, |
| "loss": 0.0827, |
| "step": 925 |
| }, |
| { |
| "epoch": 7.265625, |
| "grad_norm": 0.8305175304412842, |
| "learning_rate": 7.955439056356488e-05, |
| "loss": 0.1117, |
| "step": 930 |
| }, |
| { |
| "epoch": 7.3046875, |
| "grad_norm": 0.31086069345474243, |
| "learning_rate": 7.889908256880735e-05, |
| "loss": 0.0912, |
| "step": 935 |
| }, |
| { |
| "epoch": 7.34375, |
| "grad_norm": 0.5125362873077393, |
| "learning_rate": 7.824377457404981e-05, |
| "loss": 0.0819, |
| "step": 940 |
| }, |
| { |
| "epoch": 7.3828125, |
| "grad_norm": 0.6713749766349792, |
| "learning_rate": 7.758846657929227e-05, |
| "loss": 0.1059, |
| "step": 945 |
| }, |
| { |
| "epoch": 7.421875, |
| "grad_norm": 0.6156826615333557, |
| "learning_rate": 7.693315858453474e-05, |
| "loss": 0.1326, |
| "step": 950 |
| }, |
| { |
| "epoch": 7.4609375, |
| "grad_norm": 0.7549245953559875, |
| "learning_rate": 7.62778505897772e-05, |
| "loss": 0.0854, |
| "step": 955 |
| }, |
| { |
| "epoch": 7.5, |
| "grad_norm": 0.8916281461715698, |
| "learning_rate": 7.562254259501966e-05, |
| "loss": 0.0898, |
| "step": 960 |
| }, |
| { |
| "epoch": 7.5390625, |
| "grad_norm": 0.7383102178573608, |
| "learning_rate": 7.496723460026212e-05, |
| "loss": 0.0699, |
| "step": 965 |
| }, |
| { |
| "epoch": 7.578125, |
| "grad_norm": 0.7137540578842163, |
| "learning_rate": 7.431192660550459e-05, |
| "loss": 0.0991, |
| "step": 970 |
| }, |
| { |
| "epoch": 7.6171875, |
| "grad_norm": 0.3973597586154938, |
| "learning_rate": 7.365661861074705e-05, |
| "loss": 0.089, |
| "step": 975 |
| }, |
| { |
| "epoch": 7.65625, |
| "grad_norm": 0.8355888724327087, |
| "learning_rate": 7.300131061598951e-05, |
| "loss": 0.0913, |
| "step": 980 |
| }, |
| { |
| "epoch": 7.6953125, |
| "grad_norm": 0.5336706042289734, |
| "learning_rate": 7.234600262123198e-05, |
| "loss": 0.1516, |
| "step": 985 |
| }, |
| { |
| "epoch": 7.734375, |
| "grad_norm": 0.8123258352279663, |
| "learning_rate": 7.169069462647444e-05, |
| "loss": 0.1144, |
| "step": 990 |
| }, |
| { |
| "epoch": 7.7734375, |
| "grad_norm": 0.4773609936237335, |
| "learning_rate": 7.103538663171692e-05, |
| "loss": 0.0785, |
| "step": 995 |
| }, |
| { |
| "epoch": 7.8125, |
| "grad_norm": 0.8169093132019043, |
| "learning_rate": 7.038007863695938e-05, |
| "loss": 0.1227, |
| "step": 1000 |
| }, |
| { |
| "epoch": 7.8515625, |
| "grad_norm": 0.27247804403305054, |
| "learning_rate": 6.972477064220184e-05, |
| "loss": 0.086, |
| "step": 1005 |
| }, |
| { |
| "epoch": 7.890625, |
| "grad_norm": 0.5810950398445129, |
| "learning_rate": 6.90694626474443e-05, |
| "loss": 0.0948, |
| "step": 1010 |
| }, |
| { |
| "epoch": 7.9296875, |
| "grad_norm": 0.9459575414657593, |
| "learning_rate": 6.841415465268677e-05, |
| "loss": 0.0889, |
| "step": 1015 |
| }, |
| { |
| "epoch": 7.96875, |
| "grad_norm": 0.127482607960701, |
| "learning_rate": 6.775884665792923e-05, |
| "loss": 0.0981, |
| "step": 1020 |
| }, |
| { |
| "epoch": 8.0078125, |
| "grad_norm": 0.3091827630996704, |
| "learning_rate": 6.71035386631717e-05, |
| "loss": 0.1035, |
| "step": 1025 |
| }, |
| { |
| "epoch": 8.046875, |
| "grad_norm": 0.4560360610485077, |
| "learning_rate": 6.644823066841416e-05, |
| "loss": 0.0834, |
| "step": 1030 |
| }, |
| { |
| "epoch": 8.0859375, |
| "grad_norm": 0.6077558398246765, |
| "learning_rate": 6.579292267365662e-05, |
| "loss": 0.0711, |
| "step": 1035 |
| }, |
| { |
| "epoch": 8.125, |
| "grad_norm": 0.7591610550880432, |
| "learning_rate": 6.513761467889909e-05, |
| "loss": 0.0877, |
| "step": 1040 |
| }, |
| { |
| "epoch": 8.1640625, |
| "grad_norm": 0.38261088728904724, |
| "learning_rate": 6.448230668414155e-05, |
| "loss": 0.0422, |
| "step": 1045 |
| }, |
| { |
| "epoch": 8.203125, |
| "grad_norm": 0.8100435137748718, |
| "learning_rate": 6.382699868938401e-05, |
| "loss": 0.0883, |
| "step": 1050 |
| }, |
| { |
| "epoch": 8.2421875, |
| "grad_norm": 0.25259244441986084, |
| "learning_rate": 6.317169069462647e-05, |
| "loss": 0.0696, |
| "step": 1055 |
| }, |
| { |
| "epoch": 8.28125, |
| "grad_norm": 0.7604616284370422, |
| "learning_rate": 6.251638269986894e-05, |
| "loss": 0.0761, |
| "step": 1060 |
| }, |
| { |
| "epoch": 8.3203125, |
| "grad_norm": 0.395271897315979, |
| "learning_rate": 6.186107470511141e-05, |
| "loss": 0.0769, |
| "step": 1065 |
| }, |
| { |
| "epoch": 8.359375, |
| "grad_norm": 0.6258074045181274, |
| "learning_rate": 6.120576671035388e-05, |
| "loss": 0.0543, |
| "step": 1070 |
| }, |
| { |
| "epoch": 8.3984375, |
| "grad_norm": 0.13846486806869507, |
| "learning_rate": 6.055045871559634e-05, |
| "loss": 0.0526, |
| "step": 1075 |
| }, |
| { |
| "epoch": 8.4375, |
| "grad_norm": 0.7763333916664124, |
| "learning_rate": 5.98951507208388e-05, |
| "loss": 0.0649, |
| "step": 1080 |
| }, |
| { |
| "epoch": 8.4765625, |
| "grad_norm": 0.6369220614433289, |
| "learning_rate": 5.923984272608126e-05, |
| "loss": 0.043, |
| "step": 1085 |
| }, |
| { |
| "epoch": 8.515625, |
| "grad_norm": 0.6248875856399536, |
| "learning_rate": 5.858453473132373e-05, |
| "loss": 0.0648, |
| "step": 1090 |
| }, |
| { |
| "epoch": 8.5546875, |
| "grad_norm": 0.1796266883611679, |
| "learning_rate": 5.792922673656619e-05, |
| "loss": 0.0487, |
| "step": 1095 |
| }, |
| { |
| "epoch": 8.59375, |
| "grad_norm": 0.7085462212562561, |
| "learning_rate": 5.727391874180865e-05, |
| "loss": 0.0745, |
| "step": 1100 |
| }, |
| { |
| "epoch": 8.6328125, |
| "grad_norm": 0.27906715869903564, |
| "learning_rate": 5.661861074705112e-05, |
| "loss": 0.0382, |
| "step": 1105 |
| }, |
| { |
| "epoch": 8.671875, |
| "grad_norm": 0.4836632013320923, |
| "learning_rate": 5.596330275229358e-05, |
| "loss": 0.0714, |
| "step": 1110 |
| }, |
| { |
| "epoch": 8.7109375, |
| "grad_norm": 0.5871438384056091, |
| "learning_rate": 5.530799475753604e-05, |
| "loss": 0.0427, |
| "step": 1115 |
| }, |
| { |
| "epoch": 8.75, |
| "grad_norm": 0.3616584241390228, |
| "learning_rate": 5.4652686762778507e-05, |
| "loss": 0.0458, |
| "step": 1120 |
| }, |
| { |
| "epoch": 8.7890625, |
| "grad_norm": 0.9580535292625427, |
| "learning_rate": 5.399737876802097e-05, |
| "loss": 0.0759, |
| "step": 1125 |
| }, |
| { |
| "epoch": 8.828125, |
| "grad_norm": 0.6458576321601868, |
| "learning_rate": 5.334207077326344e-05, |
| "loss": 0.069, |
| "step": 1130 |
| }, |
| { |
| "epoch": 8.8671875, |
| "grad_norm": 0.21817967295646667, |
| "learning_rate": 5.26867627785059e-05, |
| "loss": 0.0511, |
| "step": 1135 |
| }, |
| { |
| "epoch": 8.90625, |
| "grad_norm": 0.6552639603614807, |
| "learning_rate": 5.203145478374837e-05, |
| "loss": 0.0707, |
| "step": 1140 |
| }, |
| { |
| "epoch": 8.9453125, |
| "grad_norm": 0.5542663931846619, |
| "learning_rate": 5.137614678899083e-05, |
| "loss": 0.0811, |
| "step": 1145 |
| }, |
| { |
| "epoch": 8.984375, |
| "grad_norm": 0.2486066222190857, |
| "learning_rate": 5.072083879423329e-05, |
| "loss": 0.0491, |
| "step": 1150 |
| }, |
| { |
| "epoch": 9.0234375, |
| "grad_norm": 0.10281497240066528, |
| "learning_rate": 5.006553079947576e-05, |
| "loss": 0.0513, |
| "step": 1155 |
| }, |
| { |
| "epoch": 9.0625, |
| "grad_norm": 0.5462967753410339, |
| "learning_rate": 4.941022280471822e-05, |
| "loss": 0.0444, |
| "step": 1160 |
| }, |
| { |
| "epoch": 9.1015625, |
| "grad_norm": 0.32824379205703735, |
| "learning_rate": 4.875491480996068e-05, |
| "loss": 0.0419, |
| "step": 1165 |
| }, |
| { |
| "epoch": 9.140625, |
| "grad_norm": 0.15365761518478394, |
| "learning_rate": 4.809960681520315e-05, |
| "loss": 0.0532, |
| "step": 1170 |
| }, |
| { |
| "epoch": 9.1796875, |
| "grad_norm": 0.4261007308959961, |
| "learning_rate": 4.744429882044561e-05, |
| "loss": 0.0476, |
| "step": 1175 |
| }, |
| { |
| "epoch": 9.21875, |
| "grad_norm": 0.5910694599151611, |
| "learning_rate": 4.678899082568808e-05, |
| "loss": 0.0437, |
| "step": 1180 |
| }, |
| { |
| "epoch": 9.2578125, |
| "grad_norm": 0.30444568395614624, |
| "learning_rate": 4.613368283093054e-05, |
| "loss": 0.0485, |
| "step": 1185 |
| }, |
| { |
| "epoch": 9.296875, |
| "grad_norm": 0.21978724002838135, |
| "learning_rate": 4.5478374836173006e-05, |
| "loss": 0.0422, |
| "step": 1190 |
| }, |
| { |
| "epoch": 9.3359375, |
| "grad_norm": 0.754964292049408, |
| "learning_rate": 4.482306684141547e-05, |
| "loss": 0.0474, |
| "step": 1195 |
| }, |
| { |
| "epoch": 9.375, |
| "grad_norm": 0.08515404164791107, |
| "learning_rate": 4.416775884665793e-05, |
| "loss": 0.0333, |
| "step": 1200 |
| }, |
| { |
| "epoch": 9.4140625, |
| "grad_norm": 0.22733353078365326, |
| "learning_rate": 4.3512450851900395e-05, |
| "loss": 0.0408, |
| "step": 1205 |
| }, |
| { |
| "epoch": 9.453125, |
| "grad_norm": 0.2974601089954376, |
| "learning_rate": 4.2857142857142856e-05, |
| "loss": 0.0493, |
| "step": 1210 |
| }, |
| { |
| "epoch": 9.4921875, |
| "grad_norm": 0.7275934815406799, |
| "learning_rate": 4.2201834862385324e-05, |
| "loss": 0.0538, |
| "step": 1215 |
| }, |
| { |
| "epoch": 9.53125, |
| "grad_norm": 0.4996713101863861, |
| "learning_rate": 4.154652686762779e-05, |
| "loss": 0.0383, |
| "step": 1220 |
| }, |
| { |
| "epoch": 9.5703125, |
| "grad_norm": 0.4535525441169739, |
| "learning_rate": 4.089121887287025e-05, |
| "loss": 0.0654, |
| "step": 1225 |
| }, |
| { |
| "epoch": 9.609375, |
| "grad_norm": 0.44223669171333313, |
| "learning_rate": 4.023591087811271e-05, |
| "loss": 0.0369, |
| "step": 1230 |
| }, |
| { |
| "epoch": 9.6484375, |
| "grad_norm": 0.08399149775505066, |
| "learning_rate": 3.958060288335518e-05, |
| "loss": 0.0374, |
| "step": 1235 |
| }, |
| { |
| "epoch": 9.6875, |
| "grad_norm": 0.4127291142940521, |
| "learning_rate": 3.892529488859764e-05, |
| "loss": 0.0379, |
| "step": 1240 |
| }, |
| { |
| "epoch": 9.7265625, |
| "grad_norm": 0.21898190677165985, |
| "learning_rate": 3.82699868938401e-05, |
| "loss": 0.0323, |
| "step": 1245 |
| }, |
| { |
| "epoch": 9.765625, |
| "grad_norm": 0.1628551185131073, |
| "learning_rate": 3.761467889908257e-05, |
| "loss": 0.0438, |
| "step": 1250 |
| }, |
| { |
| "epoch": 9.8046875, |
| "grad_norm": 0.42952653765678406, |
| "learning_rate": 3.695937090432504e-05, |
| "loss": 0.0358, |
| "step": 1255 |
| }, |
| { |
| "epoch": 9.84375, |
| "grad_norm": 0.23393145203590393, |
| "learning_rate": 3.63040629095675e-05, |
| "loss": 0.0424, |
| "step": 1260 |
| }, |
| { |
| "epoch": 9.8828125, |
| "grad_norm": 0.3994542956352234, |
| "learning_rate": 3.564875491480996e-05, |
| "loss": 0.0416, |
| "step": 1265 |
| }, |
| { |
| "epoch": 9.921875, |
| "grad_norm": 0.32643911242485046, |
| "learning_rate": 3.499344692005243e-05, |
| "loss": 0.0378, |
| "step": 1270 |
| }, |
| { |
| "epoch": 9.9609375, |
| "grad_norm": 0.13228672742843628, |
| "learning_rate": 3.433813892529489e-05, |
| "loss": 0.0302, |
| "step": 1275 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.5824608206748962, |
| "learning_rate": 3.3682830930537356e-05, |
| "loss": 0.0534, |
| "step": 1280 |
| }, |
| { |
| "epoch": 10.0390625, |
| "grad_norm": 0.36108893156051636, |
| "learning_rate": 3.302752293577982e-05, |
| "loss": 0.0316, |
| "step": 1285 |
| }, |
| { |
| "epoch": 10.078125, |
| "grad_norm": 0.21287214756011963, |
| "learning_rate": 3.2372214941022284e-05, |
| "loss": 0.0325, |
| "step": 1290 |
| }, |
| { |
| "epoch": 10.1171875, |
| "grad_norm": 0.3202109932899475, |
| "learning_rate": 3.1716906946264745e-05, |
| "loss": 0.0299, |
| "step": 1295 |
| }, |
| { |
| "epoch": 10.15625, |
| "grad_norm": 0.21489982306957245, |
| "learning_rate": 3.1061598951507206e-05, |
| "loss": 0.0272, |
| "step": 1300 |
| }, |
| { |
| "epoch": 10.1953125, |
| "grad_norm": 0.1839279681444168, |
| "learning_rate": 3.0406290956749674e-05, |
| "loss": 0.0284, |
| "step": 1305 |
| }, |
| { |
| "epoch": 10.234375, |
| "grad_norm": 0.36773788928985596, |
| "learning_rate": 2.9750982961992135e-05, |
| "loss": 0.0304, |
| "step": 1310 |
| }, |
| { |
| "epoch": 10.2734375, |
| "grad_norm": 0.13723714649677277, |
| "learning_rate": 2.9095674967234606e-05, |
| "loss": 0.0367, |
| "step": 1315 |
| }, |
| { |
| "epoch": 10.3125, |
| "grad_norm": 0.40129488706588745, |
| "learning_rate": 2.8440366972477066e-05, |
| "loss": 0.04, |
| "step": 1320 |
| }, |
| { |
| "epoch": 10.3515625, |
| "grad_norm": 0.08511374145746231, |
| "learning_rate": 2.778505897771953e-05, |
| "loss": 0.0246, |
| "step": 1325 |
| }, |
| { |
| "epoch": 10.390625, |
| "grad_norm": 0.43909040093421936, |
| "learning_rate": 2.7129750982961995e-05, |
| "loss": 0.0318, |
| "step": 1330 |
| }, |
| { |
| "epoch": 10.4296875, |
| "grad_norm": 0.11963684856891632, |
| "learning_rate": 2.6474442988204456e-05, |
| "loss": 0.0313, |
| "step": 1335 |
| }, |
| { |
| "epoch": 10.46875, |
| "grad_norm": 0.07002709805965424, |
| "learning_rate": 2.581913499344692e-05, |
| "loss": 0.0234, |
| "step": 1340 |
| }, |
| { |
| "epoch": 10.5078125, |
| "grad_norm": 0.5138005614280701, |
| "learning_rate": 2.5163826998689384e-05, |
| "loss": 0.0333, |
| "step": 1345 |
| }, |
| { |
| "epoch": 10.546875, |
| "grad_norm": 0.5884416699409485, |
| "learning_rate": 2.450851900393185e-05, |
| "loss": 0.0334, |
| "step": 1350 |
| }, |
| { |
| "epoch": 10.5859375, |
| "grad_norm": 0.2413049191236496, |
| "learning_rate": 2.3853211009174313e-05, |
| "loss": 0.029, |
| "step": 1355 |
| }, |
| { |
| "epoch": 10.625, |
| "grad_norm": 0.07464331388473511, |
| "learning_rate": 2.3197903014416777e-05, |
| "loss": 0.0294, |
| "step": 1360 |
| }, |
| { |
| "epoch": 10.6640625, |
| "grad_norm": 0.38783198595046997, |
| "learning_rate": 2.254259501965924e-05, |
| "loss": 0.0304, |
| "step": 1365 |
| }, |
| { |
| "epoch": 10.703125, |
| "grad_norm": 0.46144524216651917, |
| "learning_rate": 2.1887287024901702e-05, |
| "loss": 0.0321, |
| "step": 1370 |
| }, |
| { |
| "epoch": 10.7421875, |
| "grad_norm": 0.10308784991502762, |
| "learning_rate": 2.123197903014417e-05, |
| "loss": 0.0329, |
| "step": 1375 |
| }, |
| { |
| "epoch": 10.78125, |
| "grad_norm": 0.33099478483200073, |
| "learning_rate": 2.0576671035386634e-05, |
| "loss": 0.0323, |
| "step": 1380 |
| }, |
| { |
| "epoch": 10.8203125, |
| "grad_norm": 0.381788045167923, |
| "learning_rate": 1.9921363040629095e-05, |
| "loss": 0.0268, |
| "step": 1385 |
| }, |
| { |
| "epoch": 10.859375, |
| "grad_norm": 0.08423493057489395, |
| "learning_rate": 1.9266055045871563e-05, |
| "loss": 0.0304, |
| "step": 1390 |
| }, |
| { |
| "epoch": 10.8984375, |
| "grad_norm": 0.5857261419296265, |
| "learning_rate": 1.8610747051114023e-05, |
| "loss": 0.0327, |
| "step": 1395 |
| }, |
| { |
| "epoch": 10.9375, |
| "grad_norm": 0.12234900146722794, |
| "learning_rate": 1.7955439056356488e-05, |
| "loss": 0.0298, |
| "step": 1400 |
| }, |
| { |
| "epoch": 10.9765625, |
| "grad_norm": 0.4933612048625946, |
| "learning_rate": 1.7300131061598955e-05, |
| "loss": 0.0329, |
| "step": 1405 |
| }, |
| { |
| "epoch": 11.015625, |
| "grad_norm": 0.21089200675487518, |
| "learning_rate": 1.6644823066841416e-05, |
| "loss": 0.0274, |
| "step": 1410 |
| }, |
| { |
| "epoch": 11.0546875, |
| "grad_norm": 0.10511160641908646, |
| "learning_rate": 1.598951507208388e-05, |
| "loss": 0.0254, |
| "step": 1415 |
| }, |
| { |
| "epoch": 11.09375, |
| "grad_norm": 0.10039519518613815, |
| "learning_rate": 1.5334207077326345e-05, |
| "loss": 0.0211, |
| "step": 1420 |
| }, |
| { |
| "epoch": 11.1328125, |
| "grad_norm": 0.17804010212421417, |
| "learning_rate": 1.4678899082568809e-05, |
| "loss": 0.0239, |
| "step": 1425 |
| }, |
| { |
| "epoch": 11.171875, |
| "grad_norm": 0.21066808700561523, |
| "learning_rate": 1.4023591087811271e-05, |
| "loss": 0.0302, |
| "step": 1430 |
| }, |
| { |
| "epoch": 11.2109375, |
| "grad_norm": 0.14561991393566132, |
| "learning_rate": 1.3368283093053736e-05, |
| "loss": 0.0204, |
| "step": 1435 |
| }, |
| { |
| "epoch": 11.25, |
| "grad_norm": 0.0932135209441185, |
| "learning_rate": 1.2712975098296202e-05, |
| "loss": 0.0284, |
| "step": 1440 |
| }, |
| { |
| "epoch": 11.2890625, |
| "grad_norm": 0.2561885416507721, |
| "learning_rate": 1.2057667103538664e-05, |
| "loss": 0.0246, |
| "step": 1445 |
| }, |
| { |
| "epoch": 11.328125, |
| "grad_norm": 0.20448650419712067, |
| "learning_rate": 1.1402359108781127e-05, |
| "loss": 0.0282, |
| "step": 1450 |
| }, |
| { |
| "epoch": 11.3671875, |
| "grad_norm": 0.11967150866985321, |
| "learning_rate": 1.0747051114023591e-05, |
| "loss": 0.0271, |
| "step": 1455 |
| }, |
| { |
| "epoch": 11.40625, |
| "grad_norm": 0.12555184960365295, |
| "learning_rate": 1.0091743119266055e-05, |
| "loss": 0.0248, |
| "step": 1460 |
| }, |
| { |
| "epoch": 11.4453125, |
| "grad_norm": 0.2812488079071045, |
| "learning_rate": 9.43643512450852e-06, |
| "loss": 0.0312, |
| "step": 1465 |
| }, |
| { |
| "epoch": 11.484375, |
| "grad_norm": 0.06791621446609497, |
| "learning_rate": 8.781127129750984e-06, |
| "loss": 0.0246, |
| "step": 1470 |
| }, |
| { |
| "epoch": 11.5234375, |
| "grad_norm": 0.08099279552698135, |
| "learning_rate": 8.125819134993446e-06, |
| "loss": 0.0243, |
| "step": 1475 |
| }, |
| { |
| "epoch": 11.5625, |
| "grad_norm": 0.1286236196756363, |
| "learning_rate": 7.4705111402359114e-06, |
| "loss": 0.0215, |
| "step": 1480 |
| }, |
| { |
| "epoch": 11.6015625, |
| "grad_norm": 0.2546003460884094, |
| "learning_rate": 6.815203145478376e-06, |
| "loss": 0.0248, |
| "step": 1485 |
| }, |
| { |
| "epoch": 11.640625, |
| "grad_norm": 0.18944767117500305, |
| "learning_rate": 6.159895150720839e-06, |
| "loss": 0.024, |
| "step": 1490 |
| }, |
| { |
| "epoch": 11.6796875, |
| "grad_norm": 0.20657788217067719, |
| "learning_rate": 5.504587155963303e-06, |
| "loss": 0.0206, |
| "step": 1495 |
| }, |
| { |
| "epoch": 11.71875, |
| "grad_norm": 0.2377331256866455, |
| "learning_rate": 4.849279161205767e-06, |
| "loss": 0.0254, |
| "step": 1500 |
| }, |
| { |
| "epoch": 11.7578125, |
| "grad_norm": 0.10646895319223404, |
| "learning_rate": 4.193971166448231e-06, |
| "loss": 0.0214, |
| "step": 1505 |
| }, |
| { |
| "epoch": 11.796875, |
| "grad_norm": 0.3463532328605652, |
| "learning_rate": 3.538663171690695e-06, |
| "loss": 0.0252, |
| "step": 1510 |
| }, |
| { |
| "epoch": 11.8359375, |
| "grad_norm": 0.17272751033306122, |
| "learning_rate": 2.8833551769331587e-06, |
| "loss": 0.0316, |
| "step": 1515 |
| }, |
| { |
| "epoch": 11.875, |
| "grad_norm": 0.3141430914402008, |
| "learning_rate": 2.2280471821756225e-06, |
| "loss": 0.0251, |
| "step": 1520 |
| }, |
| { |
| "epoch": 11.9140625, |
| "grad_norm": 0.28511035442352295, |
| "learning_rate": 1.5727391874180865e-06, |
| "loss": 0.0267, |
| "step": 1525 |
| }, |
| { |
| "epoch": 11.953125, |
| "grad_norm": 0.10313425958156586, |
| "learning_rate": 9.174311926605506e-07, |
| "loss": 0.0196, |
| "step": 1530 |
| }, |
| { |
| "epoch": 11.9921875, |
| "grad_norm": 0.28020963072776794, |
| "learning_rate": 2.6212319790301444e-07, |
| "loss": 0.024, |
| "step": 1535 |
| }, |
| { |
| "epoch": 12.0, |
| "step": 1536, |
| "total_flos": 8.765889328981094e+16, |
| "train_loss": 0.2845887634175597, |
| "train_runtime": 2956.9183, |
| "train_samples_per_second": 4.152, |
| "train_steps_per_second": 0.519 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1536, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 12, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 8.765889328981094e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |