| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 15.0, |
| "eval_steps": 500, |
| "global_step": 3405, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.04405286343612335, |
| "grad_norm": 32.503074645996094, |
| "learning_rate": 0.0002992070484581498, |
| "loss": 18.6527, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0881057268722467, |
| "grad_norm": 14.659343719482422, |
| "learning_rate": 0.0002983259911894273, |
| "loss": 6.941, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.13215859030837004, |
| "grad_norm": 13.076040267944336, |
| "learning_rate": 0.00029744493392070483, |
| "loss": 4.3562, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.1762114537444934, |
| "grad_norm": 21.742950439453125, |
| "learning_rate": 0.00029656387665198236, |
| "loss": 3.5182, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.22026431718061673, |
| "grad_norm": 19.481538772583008, |
| "learning_rate": 0.0002956828193832599, |
| "loss": 2.6543, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.2643171806167401, |
| "grad_norm": 8.22918701171875, |
| "learning_rate": 0.0002948017621145374, |
| "loss": 1.7678, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.30837004405286345, |
| "grad_norm": 5.820700168609619, |
| "learning_rate": 0.00029392070484581494, |
| "loss": 1.2462, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.3524229074889868, |
| "grad_norm": 2.173941135406494, |
| "learning_rate": 0.00029303964757709247, |
| "loss": 1.3665, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.3964757709251101, |
| "grad_norm": 1.8419830799102783, |
| "learning_rate": 0.00029215859030837, |
| "loss": 1.1257, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.44052863436123346, |
| "grad_norm": 1.7700544595718384, |
| "learning_rate": 0.0002912775330396475, |
| "loss": 1.0317, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.4845814977973568, |
| "grad_norm": 1.2313016653060913, |
| "learning_rate": 0.0002903964757709251, |
| "loss": 0.8453, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.5286343612334802, |
| "grad_norm": 1.3880990743637085, |
| "learning_rate": 0.00028951541850220263, |
| "loss": 0.8675, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.5726872246696035, |
| "grad_norm": 2.3586199283599854, |
| "learning_rate": 0.00028863436123348016, |
| "loss": 0.8509, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.6167400881057269, |
| "grad_norm": 1.3902242183685303, |
| "learning_rate": 0.0002877533039647577, |
| "loss": 0.8528, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.6607929515418502, |
| "grad_norm": 1.0430246591567993, |
| "learning_rate": 0.0002868722466960352, |
| "loss": 0.8726, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.7048458149779736, |
| "grad_norm": 1.3441425561904907, |
| "learning_rate": 0.00028599118942731274, |
| "loss": 1.1029, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.748898678414097, |
| "grad_norm": 1.18771231174469, |
| "learning_rate": 0.00028511013215859026, |
| "loss": 0.9229, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.7929515418502202, |
| "grad_norm": 0.8050010800361633, |
| "learning_rate": 0.0002842290748898678, |
| "loss": 0.8363, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.8370044052863436, |
| "grad_norm": 1.1800554990768433, |
| "learning_rate": 0.0002833480176211453, |
| "loss": 0.8128, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.8810572687224669, |
| "grad_norm": 1.3873122930526733, |
| "learning_rate": 0.00028246696035242285, |
| "loss": 0.8275, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.9251101321585903, |
| "grad_norm": 1.1561434268951416, |
| "learning_rate": 0.0002815859030837004, |
| "loss": 0.8961, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.9691629955947136, |
| "grad_norm": 0.6641005277633667, |
| "learning_rate": 0.00028070484581497795, |
| "loss": 0.6599, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.6834425330162048, |
| "eval_runtime": 5.6075, |
| "eval_samples_per_second": 36.023, |
| "eval_steps_per_second": 4.637, |
| "step": 227 |
| }, |
| { |
| "epoch": 1.013215859030837, |
| "grad_norm": 1.0736805200576782, |
| "learning_rate": 0.0002798237885462555, |
| "loss": 0.7283, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.0572687224669604, |
| "grad_norm": 1.069932460784912, |
| "learning_rate": 0.000278942731277533, |
| "loss": 0.825, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.1013215859030836, |
| "grad_norm": 1.3098456859588623, |
| "learning_rate": 0.0002780616740088106, |
| "loss": 0.7152, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.145374449339207, |
| "grad_norm": 1.0547797679901123, |
| "learning_rate": 0.0002771806167400881, |
| "loss": 0.746, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.1894273127753303, |
| "grad_norm": 1.58526611328125, |
| "learning_rate": 0.00027629955947136564, |
| "loss": 0.6655, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.2334801762114538, |
| "grad_norm": 1.4090569019317627, |
| "learning_rate": 0.0002754185022026431, |
| "loss": 0.7397, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.277533039647577, |
| "grad_norm": 1.3417810201644897, |
| "learning_rate": 0.00027453744493392064, |
| "loss": 0.6534, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.3215859030837005, |
| "grad_norm": 0.7320300936698914, |
| "learning_rate": 0.0002736563876651982, |
| "loss": 0.719, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.3656387665198237, |
| "grad_norm": 1.7867811918258667, |
| "learning_rate": 0.00027277533039647575, |
| "loss": 0.8878, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.4096916299559472, |
| "grad_norm": 1.0332417488098145, |
| "learning_rate": 0.0002718942731277533, |
| "loss": 0.7058, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.4537444933920705, |
| "grad_norm": 1.1942096948623657, |
| "learning_rate": 0.0002710132158590308, |
| "loss": 0.6599, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.497797356828194, |
| "grad_norm": 1.0352708101272583, |
| "learning_rate": 0.00027013215859030833, |
| "loss": 0.6487, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.5418502202643172, |
| "grad_norm": 1.2984694242477417, |
| "learning_rate": 0.0002692511013215859, |
| "loss": 0.7225, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.5859030837004404, |
| "grad_norm": 1.1419997215270996, |
| "learning_rate": 0.00026837004405286344, |
| "loss": 0.7233, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.6299559471365639, |
| "grad_norm": 1.5873011350631714, |
| "learning_rate": 0.00026748898678414097, |
| "loss": 0.5429, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.6740088105726874, |
| "grad_norm": 1.058026671409607, |
| "learning_rate": 0.0002666079295154185, |
| "loss": 0.7621, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.7180616740088106, |
| "grad_norm": 1.424886703491211, |
| "learning_rate": 0.000265726872246696, |
| "loss": 0.7103, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.7621145374449338, |
| "grad_norm": 0.9987337589263916, |
| "learning_rate": 0.00026484581497797355, |
| "loss": 0.6882, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.8061674008810573, |
| "grad_norm": 1.0241808891296387, |
| "learning_rate": 0.0002639647577092511, |
| "loss": 0.6754, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.8502202643171806, |
| "grad_norm": 0.7069824934005737, |
| "learning_rate": 0.0002630837004405286, |
| "loss": 0.6426, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.894273127753304, |
| "grad_norm": 1.18909752368927, |
| "learning_rate": 0.00026220264317180613, |
| "loss": 0.7879, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.9383259911894273, |
| "grad_norm": 0.8950007557868958, |
| "learning_rate": 0.00026132158590308366, |
| "loss": 0.7197, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.9823788546255505, |
| "grad_norm": 1.3497512340545654, |
| "learning_rate": 0.00026044052863436124, |
| "loss": 0.6892, |
| "step": 450 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.6154947876930237, |
| "eval_runtime": 5.3228, |
| "eval_samples_per_second": 37.95, |
| "eval_steps_per_second": 4.885, |
| "step": 454 |
| }, |
| { |
| "epoch": 2.026431718061674, |
| "grad_norm": 1.0325676202774048, |
| "learning_rate": 0.00025955947136563877, |
| "loss": 0.7104, |
| "step": 460 |
| }, |
| { |
| "epoch": 2.0704845814977975, |
| "grad_norm": 1.4701021909713745, |
| "learning_rate": 0.0002586784140969163, |
| "loss": 0.6249, |
| "step": 470 |
| }, |
| { |
| "epoch": 2.1145374449339207, |
| "grad_norm": 1.2472504377365112, |
| "learning_rate": 0.0002577973568281938, |
| "loss": 0.7115, |
| "step": 480 |
| }, |
| { |
| "epoch": 2.158590308370044, |
| "grad_norm": 1.01516592502594, |
| "learning_rate": 0.00025691629955947135, |
| "loss": 0.6039, |
| "step": 490 |
| }, |
| { |
| "epoch": 2.202643171806167, |
| "grad_norm": 1.3985668420791626, |
| "learning_rate": 0.0002560352422907489, |
| "loss": 0.5976, |
| "step": 500 |
| }, |
| { |
| "epoch": 2.246696035242291, |
| "grad_norm": 0.6047684550285339, |
| "learning_rate": 0.0002551541850220264, |
| "loss": 0.5158, |
| "step": 510 |
| }, |
| { |
| "epoch": 2.290748898678414, |
| "grad_norm": 0.8428493142127991, |
| "learning_rate": 0.00025427312775330393, |
| "loss": 0.6338, |
| "step": 520 |
| }, |
| { |
| "epoch": 2.3348017621145374, |
| "grad_norm": 1.0199517011642456, |
| "learning_rate": 0.00025339207048458146, |
| "loss": 0.469, |
| "step": 530 |
| }, |
| { |
| "epoch": 2.3788546255506606, |
| "grad_norm": 0.8641414642333984, |
| "learning_rate": 0.000252511013215859, |
| "loss": 0.583, |
| "step": 540 |
| }, |
| { |
| "epoch": 2.4229074889867843, |
| "grad_norm": 0.8442863821983337, |
| "learning_rate": 0.00025162995594713657, |
| "loss": 0.6108, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.4669603524229076, |
| "grad_norm": 0.8864941000938416, |
| "learning_rate": 0.0002507488986784141, |
| "loss": 0.5572, |
| "step": 560 |
| }, |
| { |
| "epoch": 2.511013215859031, |
| "grad_norm": 0.9025411605834961, |
| "learning_rate": 0.0002498678414096916, |
| "loss": 0.6174, |
| "step": 570 |
| }, |
| { |
| "epoch": 2.555066079295154, |
| "grad_norm": 0.8481220602989197, |
| "learning_rate": 0.00024898678414096915, |
| "loss": 0.6118, |
| "step": 580 |
| }, |
| { |
| "epoch": 2.5991189427312777, |
| "grad_norm": 0.9391738772392273, |
| "learning_rate": 0.0002481057268722467, |
| "loss": 0.569, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.643171806167401, |
| "grad_norm": 1.0381453037261963, |
| "learning_rate": 0.0002472246696035242, |
| "loss": 0.4904, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.6872246696035242, |
| "grad_norm": 1.023573398590088, |
| "learning_rate": 0.00024634361233480173, |
| "loss": 0.5969, |
| "step": 610 |
| }, |
| { |
| "epoch": 2.7312775330396475, |
| "grad_norm": 1.2105042934417725, |
| "learning_rate": 0.00024546255506607926, |
| "loss": 0.6589, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.7753303964757707, |
| "grad_norm": 1.1160320043563843, |
| "learning_rate": 0.0002445814977973568, |
| "loss": 0.5242, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.8193832599118944, |
| "grad_norm": 1.1934391260147095, |
| "learning_rate": 0.00024370044052863436, |
| "loss": 0.6548, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.8634361233480177, |
| "grad_norm": 1.1788102388381958, |
| "learning_rate": 0.0002428193832599119, |
| "loss": 0.5828, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.907488986784141, |
| "grad_norm": 1.2889748811721802, |
| "learning_rate": 0.00024193832599118942, |
| "loss": 0.6468, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.951541850220264, |
| "grad_norm": 0.8837119340896606, |
| "learning_rate": 0.00024105726872246695, |
| "loss": 0.6694, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.995594713656388, |
| "grad_norm": 1.0216575860977173, |
| "learning_rate": 0.00024017621145374447, |
| "loss": 0.633, |
| "step": 680 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.592506468296051, |
| "eval_runtime": 5.3852, |
| "eval_samples_per_second": 37.51, |
| "eval_steps_per_second": 4.828, |
| "step": 681 |
| }, |
| { |
| "epoch": 3.039647577092511, |
| "grad_norm": 1.58785080909729, |
| "learning_rate": 0.00023929515418502203, |
| "loss": 0.4512, |
| "step": 690 |
| }, |
| { |
| "epoch": 3.0837004405286343, |
| "grad_norm": 1.0036600828170776, |
| "learning_rate": 0.00023841409691629955, |
| "loss": 0.9613, |
| "step": 700 |
| }, |
| { |
| "epoch": 3.1277533039647576, |
| "grad_norm": 0.9956134557723999, |
| "learning_rate": 0.00023753303964757708, |
| "loss": 0.479, |
| "step": 710 |
| }, |
| { |
| "epoch": 3.171806167400881, |
| "grad_norm": 1.1154946088790894, |
| "learning_rate": 0.0002366519823788546, |
| "loss": 0.5444, |
| "step": 720 |
| }, |
| { |
| "epoch": 3.2158590308370045, |
| "grad_norm": 1.3544610738754272, |
| "learning_rate": 0.00023577092511013214, |
| "loss": 0.5163, |
| "step": 730 |
| }, |
| { |
| "epoch": 3.2599118942731278, |
| "grad_norm": 0.7720727920532227, |
| "learning_rate": 0.0002348898678414097, |
| "loss": 0.5317, |
| "step": 740 |
| }, |
| { |
| "epoch": 3.303964757709251, |
| "grad_norm": 0.9804306030273438, |
| "learning_rate": 0.00023400881057268722, |
| "loss": 0.5179, |
| "step": 750 |
| }, |
| { |
| "epoch": 3.3480176211453743, |
| "grad_norm": 1.0230934619903564, |
| "learning_rate": 0.00023312775330396474, |
| "loss": 0.5261, |
| "step": 760 |
| }, |
| { |
| "epoch": 3.392070484581498, |
| "grad_norm": 0.8620821237564087, |
| "learning_rate": 0.00023224669603524227, |
| "loss": 0.4998, |
| "step": 770 |
| }, |
| { |
| "epoch": 3.436123348017621, |
| "grad_norm": 0.8884461522102356, |
| "learning_rate": 0.0002313656387665198, |
| "loss": 0.5527, |
| "step": 780 |
| }, |
| { |
| "epoch": 3.4801762114537445, |
| "grad_norm": 0.7721192836761475, |
| "learning_rate": 0.00023048458149779735, |
| "loss": 0.5279, |
| "step": 790 |
| }, |
| { |
| "epoch": 3.5242290748898677, |
| "grad_norm": 1.0769802331924438, |
| "learning_rate": 0.00022960352422907488, |
| "loss": 0.5851, |
| "step": 800 |
| }, |
| { |
| "epoch": 3.568281938325991, |
| "grad_norm": 1.3999199867248535, |
| "learning_rate": 0.0002287224669603524, |
| "loss": 0.44, |
| "step": 810 |
| }, |
| { |
| "epoch": 3.6123348017621146, |
| "grad_norm": 0.9963156580924988, |
| "learning_rate": 0.00022784140969162993, |
| "loss": 0.6028, |
| "step": 820 |
| }, |
| { |
| "epoch": 3.656387665198238, |
| "grad_norm": 0.9077759981155396, |
| "learning_rate": 0.00022696035242290746, |
| "loss": 0.6824, |
| "step": 830 |
| }, |
| { |
| "epoch": 3.700440528634361, |
| "grad_norm": 0.9758647680282593, |
| "learning_rate": 0.00022607929515418502, |
| "loss": 0.5424, |
| "step": 840 |
| }, |
| { |
| "epoch": 3.744493392070485, |
| "grad_norm": 0.9838646054267883, |
| "learning_rate": 0.00022519823788546254, |
| "loss": 0.5588, |
| "step": 850 |
| }, |
| { |
| "epoch": 3.788546255506608, |
| "grad_norm": 1.1924773454666138, |
| "learning_rate": 0.00022431718061674007, |
| "loss": 0.6215, |
| "step": 860 |
| }, |
| { |
| "epoch": 3.8325991189427313, |
| "grad_norm": 1.27988600730896, |
| "learning_rate": 0.0002234361233480176, |
| "loss": 0.5336, |
| "step": 870 |
| }, |
| { |
| "epoch": 3.8766519823788546, |
| "grad_norm": 1.0098719596862793, |
| "learning_rate": 0.00022255506607929512, |
| "loss": 0.6623, |
| "step": 880 |
| }, |
| { |
| "epoch": 3.920704845814978, |
| "grad_norm": 1.301437497138977, |
| "learning_rate": 0.00022167400881057268, |
| "loss": 0.4837, |
| "step": 890 |
| }, |
| { |
| "epoch": 3.964757709251101, |
| "grad_norm": 1.3062794208526611, |
| "learning_rate": 0.0002207929515418502, |
| "loss": 0.4399, |
| "step": 900 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.5791140198707581, |
| "eval_runtime": 5.2768, |
| "eval_samples_per_second": 38.281, |
| "eval_steps_per_second": 4.927, |
| "step": 908 |
| }, |
| { |
| "epoch": 4.008810572687224, |
| "grad_norm": 1.2243441343307495, |
| "learning_rate": 0.00021991189427312773, |
| "loss": 0.5225, |
| "step": 910 |
| }, |
| { |
| "epoch": 4.052863436123348, |
| "grad_norm": 1.0874862670898438, |
| "learning_rate": 0.00021903083700440526, |
| "loss": 0.5878, |
| "step": 920 |
| }, |
| { |
| "epoch": 4.096916299559472, |
| "grad_norm": 1.1561787128448486, |
| "learning_rate": 0.0002181497797356828, |
| "loss": 0.4172, |
| "step": 930 |
| }, |
| { |
| "epoch": 4.140969162995595, |
| "grad_norm": 0.9504215121269226, |
| "learning_rate": 0.00021726872246696034, |
| "loss": 0.454, |
| "step": 940 |
| }, |
| { |
| "epoch": 4.185022026431718, |
| "grad_norm": 1.0901755094528198, |
| "learning_rate": 0.00021638766519823787, |
| "loss": 0.5222, |
| "step": 950 |
| }, |
| { |
| "epoch": 4.229074889867841, |
| "grad_norm": 0.7518570423126221, |
| "learning_rate": 0.0002155066079295154, |
| "loss": 0.4048, |
| "step": 960 |
| }, |
| { |
| "epoch": 4.273127753303965, |
| "grad_norm": 0.9933887720108032, |
| "learning_rate": 0.00021462555066079292, |
| "loss": 0.4545, |
| "step": 970 |
| }, |
| { |
| "epoch": 4.317180616740088, |
| "grad_norm": 0.8956694006919861, |
| "learning_rate": 0.00021374449339207048, |
| "loss": 0.5703, |
| "step": 980 |
| }, |
| { |
| "epoch": 4.361233480176211, |
| "grad_norm": 1.0768828392028809, |
| "learning_rate": 0.000212863436123348, |
| "loss": 0.411, |
| "step": 990 |
| }, |
| { |
| "epoch": 4.405286343612334, |
| "grad_norm": 1.3219349384307861, |
| "learning_rate": 0.00021198237885462553, |
| "loss": 0.5096, |
| "step": 1000 |
| }, |
| { |
| "epoch": 4.4493392070484585, |
| "grad_norm": 0.6028145551681519, |
| "learning_rate": 0.00021110132158590306, |
| "loss": 0.5427, |
| "step": 1010 |
| }, |
| { |
| "epoch": 4.493392070484582, |
| "grad_norm": 0.6015641689300537, |
| "learning_rate": 0.00021022026431718059, |
| "loss": 0.4855, |
| "step": 1020 |
| }, |
| { |
| "epoch": 4.537444933920705, |
| "grad_norm": 0.7184689044952393, |
| "learning_rate": 0.00020933920704845814, |
| "loss": 0.4893, |
| "step": 1030 |
| }, |
| { |
| "epoch": 4.581497797356828, |
| "grad_norm": 1.445830225944519, |
| "learning_rate": 0.00020845814977973567, |
| "loss": 0.4412, |
| "step": 1040 |
| }, |
| { |
| "epoch": 4.6255506607929515, |
| "grad_norm": 0.9506711959838867, |
| "learning_rate": 0.0002075770925110132, |
| "loss": 0.5596, |
| "step": 1050 |
| }, |
| { |
| "epoch": 4.669603524229075, |
| "grad_norm": 0.9642265439033508, |
| "learning_rate": 0.00020669603524229072, |
| "loss": 0.3944, |
| "step": 1060 |
| }, |
| { |
| "epoch": 4.713656387665198, |
| "grad_norm": 0.9548330307006836, |
| "learning_rate": 0.00020581497797356825, |
| "loss": 0.4925, |
| "step": 1070 |
| }, |
| { |
| "epoch": 4.757709251101321, |
| "grad_norm": 1.5850030183792114, |
| "learning_rate": 0.0002049339207048458, |
| "loss": 0.5114, |
| "step": 1080 |
| }, |
| { |
| "epoch": 4.8017621145374445, |
| "grad_norm": 0.7429970502853394, |
| "learning_rate": 0.00020405286343612333, |
| "loss": 0.5556, |
| "step": 1090 |
| }, |
| { |
| "epoch": 4.845814977973569, |
| "grad_norm": 0.9865929484367371, |
| "learning_rate": 0.00020317180616740086, |
| "loss": 0.4612, |
| "step": 1100 |
| }, |
| { |
| "epoch": 4.889867841409692, |
| "grad_norm": 0.8113177418708801, |
| "learning_rate": 0.00020229074889867838, |
| "loss": 0.5196, |
| "step": 1110 |
| }, |
| { |
| "epoch": 4.933920704845815, |
| "grad_norm": 1.1767125129699707, |
| "learning_rate": 0.0002014096916299559, |
| "loss": 0.5321, |
| "step": 1120 |
| }, |
| { |
| "epoch": 4.977973568281938, |
| "grad_norm": 0.8367587327957153, |
| "learning_rate": 0.00020052863436123347, |
| "loss": 0.506, |
| "step": 1130 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 0.5730367302894592, |
| "eval_runtime": 6.1337, |
| "eval_samples_per_second": 32.933, |
| "eval_steps_per_second": 4.239, |
| "step": 1135 |
| }, |
| { |
| "epoch": 5.022026431718062, |
| "grad_norm": 1.1242823600769043, |
| "learning_rate": 0.000199647577092511, |
| "loss": 0.4678, |
| "step": 1140 |
| }, |
| { |
| "epoch": 5.066079295154185, |
| "grad_norm": 1.0385881662368774, |
| "learning_rate": 0.00019876651982378852, |
| "loss": 0.3968, |
| "step": 1150 |
| }, |
| { |
| "epoch": 5.110132158590308, |
| "grad_norm": 0.9282165765762329, |
| "learning_rate": 0.00019788546255506605, |
| "loss": 0.5089, |
| "step": 1160 |
| }, |
| { |
| "epoch": 5.154185022026431, |
| "grad_norm": 1.401548147201538, |
| "learning_rate": 0.00019700440528634357, |
| "loss": 0.4457, |
| "step": 1170 |
| }, |
| { |
| "epoch": 5.1982378854625555, |
| "grad_norm": 0.6676862835884094, |
| "learning_rate": 0.00019612334801762113, |
| "loss": 0.3175, |
| "step": 1180 |
| }, |
| { |
| "epoch": 5.242290748898679, |
| "grad_norm": 1.1318411827087402, |
| "learning_rate": 0.00019524229074889866, |
| "loss": 0.4468, |
| "step": 1190 |
| }, |
| { |
| "epoch": 5.286343612334802, |
| "grad_norm": 0.706200361251831, |
| "learning_rate": 0.00019436123348017618, |
| "loss": 0.3954, |
| "step": 1200 |
| }, |
| { |
| "epoch": 5.330396475770925, |
| "grad_norm": 0.6558952927589417, |
| "learning_rate": 0.0001934801762114537, |
| "loss": 0.4318, |
| "step": 1210 |
| }, |
| { |
| "epoch": 5.3744493392070485, |
| "grad_norm": 0.59174644947052, |
| "learning_rate": 0.00019259911894273124, |
| "loss": 0.3962, |
| "step": 1220 |
| }, |
| { |
| "epoch": 5.418502202643172, |
| "grad_norm": 0.9306423664093018, |
| "learning_rate": 0.0001917180616740088, |
| "loss": 0.4161, |
| "step": 1230 |
| }, |
| { |
| "epoch": 5.462555066079295, |
| "grad_norm": 1.2412904500961304, |
| "learning_rate": 0.00019083700440528632, |
| "loss": 0.4259, |
| "step": 1240 |
| }, |
| { |
| "epoch": 5.506607929515418, |
| "grad_norm": 0.8949795961380005, |
| "learning_rate": 0.00018995594713656385, |
| "loss": 0.5512, |
| "step": 1250 |
| }, |
| { |
| "epoch": 5.5506607929515415, |
| "grad_norm": 0.9977787733078003, |
| "learning_rate": 0.00018907488986784137, |
| "loss": 0.4497, |
| "step": 1260 |
| }, |
| { |
| "epoch": 5.594713656387665, |
| "grad_norm": 1.0676085948944092, |
| "learning_rate": 0.0001881938325991189, |
| "loss": 0.4344, |
| "step": 1270 |
| }, |
| { |
| "epoch": 5.638766519823789, |
| "grad_norm": 0.6446275115013123, |
| "learning_rate": 0.00018731277533039648, |
| "loss": 0.4184, |
| "step": 1280 |
| }, |
| { |
| "epoch": 5.682819383259912, |
| "grad_norm": 1.3255438804626465, |
| "learning_rate": 0.000186431718061674, |
| "loss": 0.5441, |
| "step": 1290 |
| }, |
| { |
| "epoch": 5.726872246696035, |
| "grad_norm": 0.823581337928772, |
| "learning_rate": 0.0001855506607929515, |
| "loss": 0.5028, |
| "step": 1300 |
| }, |
| { |
| "epoch": 5.770925110132159, |
| "grad_norm": 1.0471981763839722, |
| "learning_rate": 0.00018466960352422904, |
| "loss": 0.4407, |
| "step": 1310 |
| }, |
| { |
| "epoch": 5.814977973568282, |
| "grad_norm": 1.0394315719604492, |
| "learning_rate": 0.00018378854625550662, |
| "loss": 0.4788, |
| "step": 1320 |
| }, |
| { |
| "epoch": 5.859030837004405, |
| "grad_norm": 1.4738258123397827, |
| "learning_rate": 0.00018290748898678414, |
| "loss": 0.5495, |
| "step": 1330 |
| }, |
| { |
| "epoch": 5.903083700440528, |
| "grad_norm": 1.2812182903289795, |
| "learning_rate": 0.00018202643171806167, |
| "loss": 0.4335, |
| "step": 1340 |
| }, |
| { |
| "epoch": 5.9471365638766525, |
| "grad_norm": 1.4929533004760742, |
| "learning_rate": 0.0001811453744493392, |
| "loss": 0.5097, |
| "step": 1350 |
| }, |
| { |
| "epoch": 5.991189427312776, |
| "grad_norm": 1.2788587808609009, |
| "learning_rate": 0.00018026431718061673, |
| "loss": 0.4702, |
| "step": 1360 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 0.5740869045257568, |
| "eval_runtime": 4.9653, |
| "eval_samples_per_second": 40.682, |
| "eval_steps_per_second": 5.236, |
| "step": 1362 |
| }, |
| { |
| "epoch": 6.035242290748899, |
| "grad_norm": 0.9543855786323547, |
| "learning_rate": 0.00017938325991189428, |
| "loss": 0.4232, |
| "step": 1370 |
| }, |
| { |
| "epoch": 6.079295154185022, |
| "grad_norm": 1.0528812408447266, |
| "learning_rate": 0.0001785022026431718, |
| "loss": 0.4025, |
| "step": 1380 |
| }, |
| { |
| "epoch": 6.1233480176211454, |
| "grad_norm": 0.9573265910148621, |
| "learning_rate": 0.00017762114537444933, |
| "loss": 0.4127, |
| "step": 1390 |
| }, |
| { |
| "epoch": 6.167400881057269, |
| "grad_norm": 1.7806532382965088, |
| "learning_rate": 0.00017674008810572686, |
| "loss": 0.4646, |
| "step": 1400 |
| }, |
| { |
| "epoch": 6.211453744493392, |
| "grad_norm": 1.0559179782867432, |
| "learning_rate": 0.0001758590308370044, |
| "loss": 0.3222, |
| "step": 1410 |
| }, |
| { |
| "epoch": 6.255506607929515, |
| "grad_norm": 0.9502829313278198, |
| "learning_rate": 0.00017497797356828194, |
| "loss": 0.4697, |
| "step": 1420 |
| }, |
| { |
| "epoch": 6.299559471365638, |
| "grad_norm": 0.6869007349014282, |
| "learning_rate": 0.00017409691629955947, |
| "loss": 0.4155, |
| "step": 1430 |
| }, |
| { |
| "epoch": 6.343612334801762, |
| "grad_norm": 0.6793345808982849, |
| "learning_rate": 0.000173215859030837, |
| "loss": 0.4236, |
| "step": 1440 |
| }, |
| { |
| "epoch": 6.387665198237886, |
| "grad_norm": 1.067975640296936, |
| "learning_rate": 0.00017233480176211452, |
| "loss": 0.3558, |
| "step": 1450 |
| }, |
| { |
| "epoch": 6.431718061674009, |
| "grad_norm": 1.0968421697616577, |
| "learning_rate": 0.00017145374449339205, |
| "loss": 0.4453, |
| "step": 1460 |
| }, |
| { |
| "epoch": 6.475770925110132, |
| "grad_norm": 1.1832313537597656, |
| "learning_rate": 0.0001705726872246696, |
| "loss": 0.5115, |
| "step": 1470 |
| }, |
| { |
| "epoch": 6.5198237885462555, |
| "grad_norm": 0.9857836365699768, |
| "learning_rate": 0.00016969162995594713, |
| "loss": 0.4274, |
| "step": 1480 |
| }, |
| { |
| "epoch": 6.563876651982379, |
| "grad_norm": 0.9006336331367493, |
| "learning_rate": 0.00016881057268722466, |
| "loss": 0.3865, |
| "step": 1490 |
| }, |
| { |
| "epoch": 6.607929515418502, |
| "grad_norm": 1.1091986894607544, |
| "learning_rate": 0.0001679295154185022, |
| "loss": 0.3988, |
| "step": 1500 |
| }, |
| { |
| "epoch": 6.651982378854625, |
| "grad_norm": 1.423886775970459, |
| "learning_rate": 0.00016704845814977971, |
| "loss": 0.5057, |
| "step": 1510 |
| }, |
| { |
| "epoch": 6.6960352422907485, |
| "grad_norm": 0.9245197176933289, |
| "learning_rate": 0.00016616740088105727, |
| "loss": 0.3966, |
| "step": 1520 |
| }, |
| { |
| "epoch": 6.740088105726873, |
| "grad_norm": 0.944870114326477, |
| "learning_rate": 0.0001652863436123348, |
| "loss": 0.4521, |
| "step": 1530 |
| }, |
| { |
| "epoch": 6.784140969162996, |
| "grad_norm": 0.8870773315429688, |
| "learning_rate": 0.00016440528634361232, |
| "loss": 0.4425, |
| "step": 1540 |
| }, |
| { |
| "epoch": 6.828193832599119, |
| "grad_norm": 0.7404115200042725, |
| "learning_rate": 0.00016352422907488985, |
| "loss": 0.3207, |
| "step": 1550 |
| }, |
| { |
| "epoch": 6.872246696035242, |
| "grad_norm": 0.9958137273788452, |
| "learning_rate": 0.00016264317180616738, |
| "loss": 0.4244, |
| "step": 1560 |
| }, |
| { |
| "epoch": 6.916299559471366, |
| "grad_norm": 1.0651079416275024, |
| "learning_rate": 0.00016176211453744493, |
| "loss": 0.4075, |
| "step": 1570 |
| }, |
| { |
| "epoch": 6.960352422907489, |
| "grad_norm": 0.9528789520263672, |
| "learning_rate": 0.00016088105726872246, |
| "loss": 0.493, |
| "step": 1580 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_loss": 0.5702072381973267, |
| "eval_runtime": 5.1787, |
| "eval_samples_per_second": 39.006, |
| "eval_steps_per_second": 5.021, |
| "step": 1589 |
| }, |
| { |
| "epoch": 7.004405286343612, |
| "grad_norm": 1.0486853122711182, |
| "learning_rate": 0.00015999999999999999, |
| "loss": 0.4131, |
| "step": 1590 |
| }, |
| { |
| "epoch": 7.048458149779735, |
| "grad_norm": 1.2176262140274048, |
| "learning_rate": 0.0001591189427312775, |
| "loss": 0.4417, |
| "step": 1600 |
| }, |
| { |
| "epoch": 7.092511013215859, |
| "grad_norm": 1.187107801437378, |
| "learning_rate": 0.00015823788546255504, |
| "loss": 0.4372, |
| "step": 1610 |
| }, |
| { |
| "epoch": 7.136563876651983, |
| "grad_norm": 0.9459372758865356, |
| "learning_rate": 0.0001573568281938326, |
| "loss": 0.356, |
| "step": 1620 |
| }, |
| { |
| "epoch": 7.180616740088106, |
| "grad_norm": 0.8114103078842163, |
| "learning_rate": 0.00015647577092511012, |
| "loss": 0.308, |
| "step": 1630 |
| }, |
| { |
| "epoch": 7.224669603524229, |
| "grad_norm": 1.035370945930481, |
| "learning_rate": 0.00015559471365638765, |
| "loss": 0.3738, |
| "step": 1640 |
| }, |
| { |
| "epoch": 7.2687224669603525, |
| "grad_norm": 1.0260848999023438, |
| "learning_rate": 0.00015471365638766518, |
| "loss": 0.342, |
| "step": 1650 |
| }, |
| { |
| "epoch": 7.312775330396476, |
| "grad_norm": 0.8079932928085327, |
| "learning_rate": 0.00015383259911894273, |
| "loss": 0.4381, |
| "step": 1660 |
| }, |
| { |
| "epoch": 7.356828193832599, |
| "grad_norm": 1.318695068359375, |
| "learning_rate": 0.00015295154185022026, |
| "loss": 0.3685, |
| "step": 1670 |
| }, |
| { |
| "epoch": 7.400881057268722, |
| "grad_norm": 1.3181859254837036, |
| "learning_rate": 0.00015207048458149778, |
| "loss": 0.3465, |
| "step": 1680 |
| }, |
| { |
| "epoch": 7.4449339207048455, |
| "grad_norm": 1.0277948379516602, |
| "learning_rate": 0.0001511894273127753, |
| "loss": 0.3659, |
| "step": 1690 |
| }, |
| { |
| "epoch": 7.48898678414097, |
| "grad_norm": 1.1619762182235718, |
| "learning_rate": 0.00015030837004405284, |
| "loss": 0.4304, |
| "step": 1700 |
| }, |
| { |
| "epoch": 7.533039647577093, |
| "grad_norm": 1.2854048013687134, |
| "learning_rate": 0.0001494273127753304, |
| "loss": 0.4372, |
| "step": 1710 |
| }, |
| { |
| "epoch": 7.577092511013216, |
| "grad_norm": 1.032459020614624, |
| "learning_rate": 0.00014854625550660792, |
| "loss": 0.3687, |
| "step": 1720 |
| }, |
| { |
| "epoch": 7.621145374449339, |
| "grad_norm": 0.9430228471755981, |
| "learning_rate": 0.00014766519823788545, |
| "loss": 0.3967, |
| "step": 1730 |
| }, |
| { |
| "epoch": 7.665198237885463, |
| "grad_norm": 1.2012503147125244, |
| "learning_rate": 0.00014678414096916297, |
| "loss": 0.4028, |
| "step": 1740 |
| }, |
| { |
| "epoch": 7.709251101321586, |
| "grad_norm": 0.9703013896942139, |
| "learning_rate": 0.00014590308370044053, |
| "loss": 0.4037, |
| "step": 1750 |
| }, |
| { |
| "epoch": 7.753303964757709, |
| "grad_norm": 1.2811229228973389, |
| "learning_rate": 0.00014502202643171806, |
| "loss": 0.3725, |
| "step": 1760 |
| }, |
| { |
| "epoch": 7.797356828193832, |
| "grad_norm": 0.9879553914070129, |
| "learning_rate": 0.00014414096916299558, |
| "loss": 0.4385, |
| "step": 1770 |
| }, |
| { |
| "epoch": 7.841409691629956, |
| "grad_norm": 1.4015151262283325, |
| "learning_rate": 0.0001432599118942731, |
| "loss": 0.4046, |
| "step": 1780 |
| }, |
| { |
| "epoch": 7.885462555066079, |
| "grad_norm": 0.9369928240776062, |
| "learning_rate": 0.00014237885462555064, |
| "loss": 0.4232, |
| "step": 1790 |
| }, |
| { |
| "epoch": 7.929515418502203, |
| "grad_norm": 0.7787442803382874, |
| "learning_rate": 0.0001414977973568282, |
| "loss": 0.3679, |
| "step": 1800 |
| }, |
| { |
| "epoch": 7.973568281938326, |
| "grad_norm": 0.7212619781494141, |
| "learning_rate": 0.00014061674008810572, |
| "loss": 0.4299, |
| "step": 1810 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 0.57987380027771, |
| "eval_runtime": 4.8534, |
| "eval_samples_per_second": 41.62, |
| "eval_steps_per_second": 5.357, |
| "step": 1816 |
| }, |
| { |
| "epoch": 8.017621145374449, |
| "grad_norm": 1.1815301179885864, |
| "learning_rate": 0.00013973568281938325, |
| "loss": 0.4058, |
| "step": 1820 |
| }, |
| { |
| "epoch": 8.061674008810572, |
| "grad_norm": 0.7913572192192078, |
| "learning_rate": 0.00013885462555066077, |
| "loss": 0.2876, |
| "step": 1830 |
| }, |
| { |
| "epoch": 8.105726872246697, |
| "grad_norm": 0.9591747522354126, |
| "learning_rate": 0.0001379735682819383, |
| "loss": 0.2801, |
| "step": 1840 |
| }, |
| { |
| "epoch": 8.14977973568282, |
| "grad_norm": 1.2883862257003784, |
| "learning_rate": 0.00013709251101321585, |
| "loss": 0.3435, |
| "step": 1850 |
| }, |
| { |
| "epoch": 8.193832599118943, |
| "grad_norm": 1.2138097286224365, |
| "learning_rate": 0.00013621145374449338, |
| "loss": 0.4603, |
| "step": 1860 |
| }, |
| { |
| "epoch": 8.237885462555067, |
| "grad_norm": 0.9017927050590515, |
| "learning_rate": 0.0001353303964757709, |
| "loss": 0.328, |
| "step": 1870 |
| }, |
| { |
| "epoch": 8.28193832599119, |
| "grad_norm": 1.0213032960891724, |
| "learning_rate": 0.00013444933920704844, |
| "loss": 0.4241, |
| "step": 1880 |
| }, |
| { |
| "epoch": 8.325991189427313, |
| "grad_norm": 0.782507598400116, |
| "learning_rate": 0.00013356828193832596, |
| "loss": 0.287, |
| "step": 1890 |
| }, |
| { |
| "epoch": 8.370044052863436, |
| "grad_norm": 0.8239027261734009, |
| "learning_rate": 0.00013268722466960352, |
| "loss": 0.3471, |
| "step": 1900 |
| }, |
| { |
| "epoch": 8.41409691629956, |
| "grad_norm": 0.9952473044395447, |
| "learning_rate": 0.00013180616740088104, |
| "loss": 0.325, |
| "step": 1910 |
| }, |
| { |
| "epoch": 8.458149779735683, |
| "grad_norm": 0.7988440990447998, |
| "learning_rate": 0.00013092511013215857, |
| "loss": 0.3397, |
| "step": 1920 |
| }, |
| { |
| "epoch": 8.502202643171806, |
| "grad_norm": 1.2881464958190918, |
| "learning_rate": 0.0001300440528634361, |
| "loss": 0.4655, |
| "step": 1930 |
| }, |
| { |
| "epoch": 8.54625550660793, |
| "grad_norm": 0.9545268416404724, |
| "learning_rate": 0.00012916299559471365, |
| "loss": 0.4031, |
| "step": 1940 |
| }, |
| { |
| "epoch": 8.590308370044053, |
| "grad_norm": 1.550424337387085, |
| "learning_rate": 0.00012828193832599118, |
| "loss": 0.3697, |
| "step": 1950 |
| }, |
| { |
| "epoch": 8.634361233480176, |
| "grad_norm": 1.2041224241256714, |
| "learning_rate": 0.0001274008810572687, |
| "loss": 0.43, |
| "step": 1960 |
| }, |
| { |
| "epoch": 8.678414096916299, |
| "grad_norm": 0.8280724287033081, |
| "learning_rate": 0.00012651982378854626, |
| "loss": 0.4045, |
| "step": 1970 |
| }, |
| { |
| "epoch": 8.722466960352422, |
| "grad_norm": 0.8164283037185669, |
| "learning_rate": 0.00012563876651982376, |
| "loss": 0.4001, |
| "step": 1980 |
| }, |
| { |
| "epoch": 8.766519823788546, |
| "grad_norm": 0.9470929503440857, |
| "learning_rate": 0.00012475770925110132, |
| "loss": 0.3767, |
| "step": 1990 |
| }, |
| { |
| "epoch": 8.810572687224669, |
| "grad_norm": 0.7390472888946533, |
| "learning_rate": 0.00012387665198237884, |
| "loss": 0.4206, |
| "step": 2000 |
| }, |
| { |
| "epoch": 8.854625550660792, |
| "grad_norm": 0.8382723927497864, |
| "learning_rate": 0.00012299559471365637, |
| "loss": 0.3061, |
| "step": 2010 |
| }, |
| { |
| "epoch": 8.898678414096917, |
| "grad_norm": 1.060539722442627, |
| "learning_rate": 0.00012211453744493392, |
| "loss": 0.4921, |
| "step": 2020 |
| }, |
| { |
| "epoch": 8.94273127753304, |
| "grad_norm": 0.6955994367599487, |
| "learning_rate": 0.00012123348017621144, |
| "loss": 0.4077, |
| "step": 2030 |
| }, |
| { |
| "epoch": 8.986784140969164, |
| "grad_norm": 0.8158656358718872, |
| "learning_rate": 0.00012035242290748898, |
| "loss": 0.3759, |
| "step": 2040 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_loss": 0.582844614982605, |
| "eval_runtime": 4.8405, |
| "eval_samples_per_second": 41.732, |
| "eval_steps_per_second": 5.371, |
| "step": 2043 |
| }, |
| { |
| "epoch": 9.030837004405287, |
| "grad_norm": 0.9192315936088562, |
| "learning_rate": 0.0001194713656387665, |
| "loss": 0.3809, |
| "step": 2050 |
| }, |
| { |
| "epoch": 9.07488986784141, |
| "grad_norm": 1.0536017417907715, |
| "learning_rate": 0.00011859030837004403, |
| "loss": 0.3321, |
| "step": 2060 |
| }, |
| { |
| "epoch": 9.118942731277533, |
| "grad_norm": 1.1080108880996704, |
| "learning_rate": 0.00011770925110132157, |
| "loss": 0.407, |
| "step": 2070 |
| }, |
| { |
| "epoch": 9.162995594713657, |
| "grad_norm": 0.9956775903701782, |
| "learning_rate": 0.0001168281938325991, |
| "loss": 0.3423, |
| "step": 2080 |
| }, |
| { |
| "epoch": 9.20704845814978, |
| "grad_norm": 0.746013343334198, |
| "learning_rate": 0.00011594713656387664, |
| "loss": 0.3794, |
| "step": 2090 |
| }, |
| { |
| "epoch": 9.251101321585903, |
| "grad_norm": 1.126372218132019, |
| "learning_rate": 0.00011506607929515417, |
| "loss": 0.4121, |
| "step": 2100 |
| }, |
| { |
| "epoch": 9.295154185022026, |
| "grad_norm": 1.4978642463684082, |
| "learning_rate": 0.00011418502202643172, |
| "loss": 0.3358, |
| "step": 2110 |
| }, |
| { |
| "epoch": 9.33920704845815, |
| "grad_norm": 0.7826859951019287, |
| "learning_rate": 0.00011330396475770924, |
| "loss": 0.2931, |
| "step": 2120 |
| }, |
| { |
| "epoch": 9.383259911894273, |
| "grad_norm": 1.1644082069396973, |
| "learning_rate": 0.00011242290748898676, |
| "loss": 0.377, |
| "step": 2130 |
| }, |
| { |
| "epoch": 9.427312775330396, |
| "grad_norm": 0.8106231093406677, |
| "learning_rate": 0.00011154185022026432, |
| "loss": 0.3562, |
| "step": 2140 |
| }, |
| { |
| "epoch": 9.47136563876652, |
| "grad_norm": 1.162919282913208, |
| "learning_rate": 0.00011066079295154183, |
| "loss": 0.3441, |
| "step": 2150 |
| }, |
| { |
| "epoch": 9.515418502202643, |
| "grad_norm": 0.7184136509895325, |
| "learning_rate": 0.00010977973568281939, |
| "loss": 0.3254, |
| "step": 2160 |
| }, |
| { |
| "epoch": 9.559471365638766, |
| "grad_norm": 0.9587578177452087, |
| "learning_rate": 0.00010889867841409691, |
| "loss": 0.3533, |
| "step": 2170 |
| }, |
| { |
| "epoch": 9.603524229074889, |
| "grad_norm": 0.8703950643539429, |
| "learning_rate": 0.00010801762114537444, |
| "loss": 0.3366, |
| "step": 2180 |
| }, |
| { |
| "epoch": 9.647577092511014, |
| "grad_norm": 0.7304671406745911, |
| "learning_rate": 0.00010713656387665198, |
| "loss": 0.3608, |
| "step": 2190 |
| }, |
| { |
| "epoch": 9.691629955947137, |
| "grad_norm": 1.1611542701721191, |
| "learning_rate": 0.00010625550660792951, |
| "loss": 0.3353, |
| "step": 2200 |
| }, |
| { |
| "epoch": 9.73568281938326, |
| "grad_norm": 0.7281723022460938, |
| "learning_rate": 0.00010537444933920705, |
| "loss": 0.3082, |
| "step": 2210 |
| }, |
| { |
| "epoch": 9.779735682819384, |
| "grad_norm": 1.1435456275939941, |
| "learning_rate": 0.00010449339207048458, |
| "loss": 0.4317, |
| "step": 2220 |
| }, |
| { |
| "epoch": 9.823788546255507, |
| "grad_norm": 0.9928381443023682, |
| "learning_rate": 0.0001036123348017621, |
| "loss": 0.3564, |
| "step": 2230 |
| }, |
| { |
| "epoch": 9.86784140969163, |
| "grad_norm": 0.8395977020263672, |
| "learning_rate": 0.00010273127753303964, |
| "loss": 0.3531, |
| "step": 2240 |
| }, |
| { |
| "epoch": 9.911894273127754, |
| "grad_norm": 1.0142395496368408, |
| "learning_rate": 0.00010185022026431717, |
| "loss": 0.3896, |
| "step": 2250 |
| }, |
| { |
| "epoch": 9.955947136563877, |
| "grad_norm": 0.6916971802711487, |
| "learning_rate": 0.00010096916299559471, |
| "loss": 0.3667, |
| "step": 2260 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.7665943503379822, |
| "learning_rate": 0.00010008810572687224, |
| "loss": 0.3075, |
| "step": 2270 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 0.579430878162384, |
| "eval_runtime": 4.7891, |
| "eval_samples_per_second": 42.179, |
| "eval_steps_per_second": 5.429, |
| "step": 2270 |
| }, |
| { |
| "epoch": 10.044052863436123, |
| "grad_norm": 1.4675018787384033, |
| "learning_rate": 9.920704845814978e-05, |
| "loss": 0.3251, |
| "step": 2280 |
| }, |
| { |
| "epoch": 10.088105726872246, |
| "grad_norm": 0.6954736709594727, |
| "learning_rate": 9.83259911894273e-05, |
| "loss": 0.3656, |
| "step": 2290 |
| }, |
| { |
| "epoch": 10.13215859030837, |
| "grad_norm": 1.4188182353973389, |
| "learning_rate": 9.744493392070483e-05, |
| "loss": 0.335, |
| "step": 2300 |
| }, |
| { |
| "epoch": 10.176211453744493, |
| "grad_norm": 0.9333553910255432, |
| "learning_rate": 9.656387665198237e-05, |
| "loss": 0.2888, |
| "step": 2310 |
| }, |
| { |
| "epoch": 10.220264317180616, |
| "grad_norm": 0.886482834815979, |
| "learning_rate": 9.56828193832599e-05, |
| "loss": 0.3122, |
| "step": 2320 |
| }, |
| { |
| "epoch": 10.26431718061674, |
| "grad_norm": 0.6795399188995361, |
| "learning_rate": 9.480176211453744e-05, |
| "loss": 0.3765, |
| "step": 2330 |
| }, |
| { |
| "epoch": 10.308370044052863, |
| "grad_norm": 1.3046603202819824, |
| "learning_rate": 9.392070484581497e-05, |
| "loss": 0.3316, |
| "step": 2340 |
| }, |
| { |
| "epoch": 10.352422907488986, |
| "grad_norm": 1.0006519556045532, |
| "learning_rate": 9.30396475770925e-05, |
| "loss": 0.3659, |
| "step": 2350 |
| }, |
| { |
| "epoch": 10.396475770925111, |
| "grad_norm": 1.1640467643737793, |
| "learning_rate": 9.215859030837004e-05, |
| "loss": 0.346, |
| "step": 2360 |
| }, |
| { |
| "epoch": 10.440528634361234, |
| "grad_norm": 0.9744365811347961, |
| "learning_rate": 9.127753303964756e-05, |
| "loss": 0.3317, |
| "step": 2370 |
| }, |
| { |
| "epoch": 10.484581497797357, |
| "grad_norm": 1.039802074432373, |
| "learning_rate": 9.03964757709251e-05, |
| "loss": 0.3162, |
| "step": 2380 |
| }, |
| { |
| "epoch": 10.52863436123348, |
| "grad_norm": 0.9926576614379883, |
| "learning_rate": 8.951541850220263e-05, |
| "loss": 0.3559, |
| "step": 2390 |
| }, |
| { |
| "epoch": 10.572687224669604, |
| "grad_norm": 1.0141366720199585, |
| "learning_rate": 8.863436123348016e-05, |
| "loss": 0.3196, |
| "step": 2400 |
| }, |
| { |
| "epoch": 10.616740088105727, |
| "grad_norm": 0.5856879353523254, |
| "learning_rate": 8.77533039647577e-05, |
| "loss": 0.2919, |
| "step": 2410 |
| }, |
| { |
| "epoch": 10.66079295154185, |
| "grad_norm": 0.9484356045722961, |
| "learning_rate": 8.687224669603523e-05, |
| "loss": 0.339, |
| "step": 2420 |
| }, |
| { |
| "epoch": 10.704845814977974, |
| "grad_norm": 0.9014990925788879, |
| "learning_rate": 8.599118942731277e-05, |
| "loss": 0.3089, |
| "step": 2430 |
| }, |
| { |
| "epoch": 10.748898678414097, |
| "grad_norm": 0.9830072522163391, |
| "learning_rate": 8.51101321585903e-05, |
| "loss": 0.3461, |
| "step": 2440 |
| }, |
| { |
| "epoch": 10.79295154185022, |
| "grad_norm": 1.051647424697876, |
| "learning_rate": 8.422907488986782e-05, |
| "loss": 0.292, |
| "step": 2450 |
| }, |
| { |
| "epoch": 10.837004405286343, |
| "grad_norm": 1.0580625534057617, |
| "learning_rate": 8.334801762114536e-05, |
| "loss": 0.4052, |
| "step": 2460 |
| }, |
| { |
| "epoch": 10.881057268722467, |
| "grad_norm": 1.01996648311615, |
| "learning_rate": 8.246696035242289e-05, |
| "loss": 0.3927, |
| "step": 2470 |
| }, |
| { |
| "epoch": 10.92511013215859, |
| "grad_norm": 0.6538860201835632, |
| "learning_rate": 8.158590308370044e-05, |
| "loss": 0.3451, |
| "step": 2480 |
| }, |
| { |
| "epoch": 10.969162995594713, |
| "grad_norm": 0.9368380308151245, |
| "learning_rate": 8.070484581497796e-05, |
| "loss": 0.3932, |
| "step": 2490 |
| }, |
| { |
| "epoch": 11.0, |
| "eval_loss": 0.5825287103652954, |
| "eval_runtime": 4.811, |
| "eval_samples_per_second": 41.987, |
| "eval_steps_per_second": 5.404, |
| "step": 2497 |
| }, |
| { |
| "epoch": 11.013215859030836, |
| "grad_norm": 0.9590967893600464, |
| "learning_rate": 7.982378854625551e-05, |
| "loss": 0.32, |
| "step": 2500 |
| }, |
| { |
| "epoch": 11.05726872246696, |
| "grad_norm": 0.9905742406845093, |
| "learning_rate": 7.894273127753304e-05, |
| "loss": 0.3029, |
| "step": 2510 |
| }, |
| { |
| "epoch": 11.101321585903083, |
| "grad_norm": 1.2009577751159668, |
| "learning_rate": 7.806167400881057e-05, |
| "loss": 0.3626, |
| "step": 2520 |
| }, |
| { |
| "epoch": 11.145374449339206, |
| "grad_norm": 1.0607908964157104, |
| "learning_rate": 7.718061674008811e-05, |
| "loss": 0.314, |
| "step": 2530 |
| }, |
| { |
| "epoch": 11.189427312775331, |
| "grad_norm": 1.1098504066467285, |
| "learning_rate": 7.629955947136563e-05, |
| "loss": 0.3062, |
| "step": 2540 |
| }, |
| { |
| "epoch": 11.233480176211454, |
| "grad_norm": 0.6961995959281921, |
| "learning_rate": 7.541850220264317e-05, |
| "loss": 0.3499, |
| "step": 2550 |
| }, |
| { |
| "epoch": 11.277533039647578, |
| "grad_norm": 1.0727498531341553, |
| "learning_rate": 7.45374449339207e-05, |
| "loss": 0.2559, |
| "step": 2560 |
| }, |
| { |
| "epoch": 11.321585903083701, |
| "grad_norm": 1.064344048500061, |
| "learning_rate": 7.365638766519823e-05, |
| "loss": 0.3011, |
| "step": 2570 |
| }, |
| { |
| "epoch": 11.365638766519824, |
| "grad_norm": 1.1059036254882812, |
| "learning_rate": 7.277533039647577e-05, |
| "loss": 0.3415, |
| "step": 2580 |
| }, |
| { |
| "epoch": 11.409691629955947, |
| "grad_norm": 0.8815020322799683, |
| "learning_rate": 7.18942731277533e-05, |
| "loss": 0.3164, |
| "step": 2590 |
| }, |
| { |
| "epoch": 11.45374449339207, |
| "grad_norm": 0.9667496085166931, |
| "learning_rate": 7.101321585903082e-05, |
| "loss": 0.3642, |
| "step": 2600 |
| }, |
| { |
| "epoch": 11.497797356828194, |
| "grad_norm": 0.942876935005188, |
| "learning_rate": 7.013215859030836e-05, |
| "loss": 0.3624, |
| "step": 2610 |
| }, |
| { |
| "epoch": 11.541850220264317, |
| "grad_norm": 1.022675633430481, |
| "learning_rate": 6.925110132158589e-05, |
| "loss": 0.3351, |
| "step": 2620 |
| }, |
| { |
| "epoch": 11.58590308370044, |
| "grad_norm": 0.9919267892837524, |
| "learning_rate": 6.837004405286343e-05, |
| "loss": 0.3335, |
| "step": 2630 |
| }, |
| { |
| "epoch": 11.629955947136564, |
| "grad_norm": 0.9724282026290894, |
| "learning_rate": 6.748898678414096e-05, |
| "loss": 0.3154, |
| "step": 2640 |
| }, |
| { |
| "epoch": 11.674008810572687, |
| "grad_norm": 1.3246617317199707, |
| "learning_rate": 6.660792951541849e-05, |
| "loss": 0.4366, |
| "step": 2650 |
| }, |
| { |
| "epoch": 11.71806167400881, |
| "grad_norm": 1.0111949443817139, |
| "learning_rate": 6.572687224669603e-05, |
| "loss": 0.3324, |
| "step": 2660 |
| }, |
| { |
| "epoch": 11.762114537444933, |
| "grad_norm": 0.8399791717529297, |
| "learning_rate": 6.484581497797357e-05, |
| "loss": 0.2669, |
| "step": 2670 |
| }, |
| { |
| "epoch": 11.806167400881057, |
| "grad_norm": 0.917736828327179, |
| "learning_rate": 6.39647577092511e-05, |
| "loss": 0.324, |
| "step": 2680 |
| }, |
| { |
| "epoch": 11.85022026431718, |
| "grad_norm": 0.9939138293266296, |
| "learning_rate": 6.308370044052864e-05, |
| "loss": 0.2888, |
| "step": 2690 |
| }, |
| { |
| "epoch": 11.894273127753303, |
| "grad_norm": 0.9510142803192139, |
| "learning_rate": 6.220264317180616e-05, |
| "loss": 0.3428, |
| "step": 2700 |
| }, |
| { |
| "epoch": 11.938325991189426, |
| "grad_norm": 1.3216148614883423, |
| "learning_rate": 6.132158590308369e-05, |
| "loss": 0.3254, |
| "step": 2710 |
| }, |
| { |
| "epoch": 11.982378854625551, |
| "grad_norm": 1.2755056619644165, |
| "learning_rate": 6.0440528634361224e-05, |
| "loss": 0.3188, |
| "step": 2720 |
| }, |
| { |
| "epoch": 12.0, |
| "eval_loss": 0.5862967371940613, |
| "eval_runtime": 5.4591, |
| "eval_samples_per_second": 37.002, |
| "eval_steps_per_second": 4.763, |
| "step": 2724 |
| }, |
| { |
| "epoch": 12.026431718061675, |
| "grad_norm": 0.8617092967033386, |
| "learning_rate": 5.955947136563876e-05, |
| "loss": 0.2953, |
| "step": 2730 |
| }, |
| { |
| "epoch": 12.070484581497798, |
| "grad_norm": 0.7434670329093933, |
| "learning_rate": 5.86784140969163e-05, |
| "loss": 0.312, |
| "step": 2740 |
| }, |
| { |
| "epoch": 12.114537444933921, |
| "grad_norm": 0.9274753332138062, |
| "learning_rate": 5.779735682819383e-05, |
| "loss": 0.2664, |
| "step": 2750 |
| }, |
| { |
| "epoch": 12.158590308370044, |
| "grad_norm": 1.058923363685608, |
| "learning_rate": 5.691629955947135e-05, |
| "loss": 0.3236, |
| "step": 2760 |
| }, |
| { |
| "epoch": 12.202643171806168, |
| "grad_norm": 0.7601414918899536, |
| "learning_rate": 5.6035242290748894e-05, |
| "loss": 0.3076, |
| "step": 2770 |
| }, |
| { |
| "epoch": 12.246696035242291, |
| "grad_norm": 0.7787047624588013, |
| "learning_rate": 5.515418502202643e-05, |
| "loss": 0.2618, |
| "step": 2780 |
| }, |
| { |
| "epoch": 12.290748898678414, |
| "grad_norm": 0.9064326882362366, |
| "learning_rate": 5.427312775330396e-05, |
| "loss": 0.3305, |
| "step": 2790 |
| }, |
| { |
| "epoch": 12.334801762114537, |
| "grad_norm": 1.0712478160858154, |
| "learning_rate": 5.3392070484581496e-05, |
| "loss": 0.341, |
| "step": 2800 |
| }, |
| { |
| "epoch": 12.37885462555066, |
| "grad_norm": 0.6585920453071594, |
| "learning_rate": 5.251101321585903e-05, |
| "loss": 0.3476, |
| "step": 2810 |
| }, |
| { |
| "epoch": 12.422907488986784, |
| "grad_norm": 1.1152169704437256, |
| "learning_rate": 5.162995594713656e-05, |
| "loss": 0.3418, |
| "step": 2820 |
| }, |
| { |
| "epoch": 12.466960352422907, |
| "grad_norm": 0.926008403301239, |
| "learning_rate": 5.074889867841409e-05, |
| "loss": 0.2543, |
| "step": 2830 |
| }, |
| { |
| "epoch": 12.51101321585903, |
| "grad_norm": 1.1506083011627197, |
| "learning_rate": 4.9867841409691625e-05, |
| "loss": 0.2895, |
| "step": 2840 |
| }, |
| { |
| "epoch": 12.555066079295154, |
| "grad_norm": 0.8726121783256531, |
| "learning_rate": 4.898678414096916e-05, |
| "loss": 0.2917, |
| "step": 2850 |
| }, |
| { |
| "epoch": 12.599118942731277, |
| "grad_norm": 1.1620839834213257, |
| "learning_rate": 4.810572687224669e-05, |
| "loss": 0.3585, |
| "step": 2860 |
| }, |
| { |
| "epoch": 12.6431718061674, |
| "grad_norm": 1.1911215782165527, |
| "learning_rate": 4.7224669603524226e-05, |
| "loss": 0.3177, |
| "step": 2870 |
| }, |
| { |
| "epoch": 12.687224669603523, |
| "grad_norm": 0.9236161708831787, |
| "learning_rate": 4.6343612334801754e-05, |
| "loss": 0.3203, |
| "step": 2880 |
| }, |
| { |
| "epoch": 12.731277533039648, |
| "grad_norm": 1.0384935140609741, |
| "learning_rate": 4.546255506607929e-05, |
| "loss": 0.3264, |
| "step": 2890 |
| }, |
| { |
| "epoch": 12.775330396475772, |
| "grad_norm": 1.3048256635665894, |
| "learning_rate": 4.458149779735682e-05, |
| "loss": 0.3544, |
| "step": 2900 |
| }, |
| { |
| "epoch": 12.819383259911895, |
| "grad_norm": 1.127678394317627, |
| "learning_rate": 4.370044052863436e-05, |
| "loss": 0.3768, |
| "step": 2910 |
| }, |
| { |
| "epoch": 12.863436123348018, |
| "grad_norm": 0.9425409436225891, |
| "learning_rate": 4.2819383259911896e-05, |
| "loss": 0.2778, |
| "step": 2920 |
| }, |
| { |
| "epoch": 12.907488986784141, |
| "grad_norm": 1.2469598054885864, |
| "learning_rate": 4.1938325991189416e-05, |
| "loss": 0.3532, |
| "step": 2930 |
| }, |
| { |
| "epoch": 12.951541850220265, |
| "grad_norm": 0.7975876927375793, |
| "learning_rate": 4.105726872246696e-05, |
| "loss": 0.3189, |
| "step": 2940 |
| }, |
| { |
| "epoch": 12.995594713656388, |
| "grad_norm": 0.8869457840919495, |
| "learning_rate": 4.017621145374449e-05, |
| "loss": 0.3282, |
| "step": 2950 |
| }, |
| { |
| "epoch": 13.0, |
| "eval_loss": 0.5989018678665161, |
| "eval_runtime": 4.9822, |
| "eval_samples_per_second": 40.544, |
| "eval_steps_per_second": 5.219, |
| "step": 2951 |
| }, |
| { |
| "epoch": 13.039647577092511, |
| "grad_norm": 1.1934689283370972, |
| "learning_rate": 3.9295154185022025e-05, |
| "loss": 0.2919, |
| "step": 2960 |
| }, |
| { |
| "epoch": 13.083700440528634, |
| "grad_norm": 1.1812618970870972, |
| "learning_rate": 3.841409691629956e-05, |
| "loss": 0.3219, |
| "step": 2970 |
| }, |
| { |
| "epoch": 13.127753303964758, |
| "grad_norm": 1.2065187692642212, |
| "learning_rate": 3.753303964757709e-05, |
| "loss": 0.29, |
| "step": 2980 |
| }, |
| { |
| "epoch": 13.17180616740088, |
| "grad_norm": 0.8890476822853088, |
| "learning_rate": 3.665198237885462e-05, |
| "loss": 0.2462, |
| "step": 2990 |
| }, |
| { |
| "epoch": 13.215859030837004, |
| "grad_norm": 1.2433491945266724, |
| "learning_rate": 3.5770925110132154e-05, |
| "loss": 0.2614, |
| "step": 3000 |
| }, |
| { |
| "epoch": 13.259911894273127, |
| "grad_norm": 0.7791047692298889, |
| "learning_rate": 3.488986784140969e-05, |
| "loss": 0.2824, |
| "step": 3010 |
| }, |
| { |
| "epoch": 13.30396475770925, |
| "grad_norm": 1.1994709968566895, |
| "learning_rate": 3.400881057268722e-05, |
| "loss": 0.2873, |
| "step": 3020 |
| }, |
| { |
| "epoch": 13.348017621145374, |
| "grad_norm": 1.082380771636963, |
| "learning_rate": 3.3127753303964756e-05, |
| "loss": 0.2571, |
| "step": 3030 |
| }, |
| { |
| "epoch": 13.392070484581497, |
| "grad_norm": 0.9533681273460388, |
| "learning_rate": 3.224669603524229e-05, |
| "loss": 0.347, |
| "step": 3040 |
| }, |
| { |
| "epoch": 13.43612334801762, |
| "grad_norm": 0.9235092997550964, |
| "learning_rate": 3.1365638766519824e-05, |
| "loss": 0.3015, |
| "step": 3050 |
| }, |
| { |
| "epoch": 13.480176211453745, |
| "grad_norm": 0.7734026908874512, |
| "learning_rate": 3.0484581497797354e-05, |
| "loss": 0.3521, |
| "step": 3060 |
| }, |
| { |
| "epoch": 13.524229074889869, |
| "grad_norm": 1.0365204811096191, |
| "learning_rate": 2.9603524229074888e-05, |
| "loss": 0.3765, |
| "step": 3070 |
| }, |
| { |
| "epoch": 13.568281938325992, |
| "grad_norm": 0.7667551040649414, |
| "learning_rate": 2.8722466960352422e-05, |
| "loss": 0.2976, |
| "step": 3080 |
| }, |
| { |
| "epoch": 13.612334801762115, |
| "grad_norm": 1.3401521444320679, |
| "learning_rate": 2.7841409691629952e-05, |
| "loss": 0.3571, |
| "step": 3090 |
| }, |
| { |
| "epoch": 13.656387665198238, |
| "grad_norm": 1.1603132486343384, |
| "learning_rate": 2.6960352422907486e-05, |
| "loss": 0.3513, |
| "step": 3100 |
| }, |
| { |
| "epoch": 13.700440528634362, |
| "grad_norm": 0.9537465572357178, |
| "learning_rate": 2.607929515418502e-05, |
| "loss": 0.3365, |
| "step": 3110 |
| }, |
| { |
| "epoch": 13.744493392070485, |
| "grad_norm": 1.5475343465805054, |
| "learning_rate": 2.519823788546255e-05, |
| "loss": 0.2835, |
| "step": 3120 |
| }, |
| { |
| "epoch": 13.788546255506608, |
| "grad_norm": 1.308053970336914, |
| "learning_rate": 2.4317180616740088e-05, |
| "loss": 0.367, |
| "step": 3130 |
| }, |
| { |
| "epoch": 13.832599118942731, |
| "grad_norm": 0.9297582507133484, |
| "learning_rate": 2.343612334801762e-05, |
| "loss": 0.2539, |
| "step": 3140 |
| }, |
| { |
| "epoch": 13.876651982378855, |
| "grad_norm": 0.8061990737915039, |
| "learning_rate": 2.2555066079295153e-05, |
| "loss": 0.3235, |
| "step": 3150 |
| }, |
| { |
| "epoch": 13.920704845814978, |
| "grad_norm": 0.9529415369033813, |
| "learning_rate": 2.1674008810572687e-05, |
| "loss": 0.2427, |
| "step": 3160 |
| }, |
| { |
| "epoch": 13.964757709251101, |
| "grad_norm": 1.0907659530639648, |
| "learning_rate": 2.0792951541850217e-05, |
| "loss": 0.3928, |
| "step": 3170 |
| }, |
| { |
| "epoch": 14.0, |
| "eval_loss": 0.5977996587753296, |
| "eval_runtime": 4.957, |
| "eval_samples_per_second": 40.75, |
| "eval_steps_per_second": 5.245, |
| "step": 3178 |
| }, |
| { |
| "epoch": 14.008810572687224, |
| "grad_norm": 0.9147399663925171, |
| "learning_rate": 1.991189427312775e-05, |
| "loss": 0.3046, |
| "step": 3180 |
| }, |
| { |
| "epoch": 14.052863436123348, |
| "grad_norm": 0.9036657214164734, |
| "learning_rate": 1.9030837004405285e-05, |
| "loss": 0.3805, |
| "step": 3190 |
| }, |
| { |
| "epoch": 14.09691629955947, |
| "grad_norm": 0.8942423462867737, |
| "learning_rate": 1.814977973568282e-05, |
| "loss": 0.3361, |
| "step": 3200 |
| }, |
| { |
| "epoch": 14.140969162995594, |
| "grad_norm": 0.8469783067703247, |
| "learning_rate": 1.7268722466960353e-05, |
| "loss": 0.2552, |
| "step": 3210 |
| }, |
| { |
| "epoch": 14.185022026431717, |
| "grad_norm": 0.9594734311103821, |
| "learning_rate": 1.6387665198237883e-05, |
| "loss": 0.3417, |
| "step": 3220 |
| }, |
| { |
| "epoch": 14.229074889867842, |
| "grad_norm": 0.7858956456184387, |
| "learning_rate": 1.5506607929515417e-05, |
| "loss": 0.3343, |
| "step": 3230 |
| }, |
| { |
| "epoch": 14.273127753303966, |
| "grad_norm": 1.0662245750427246, |
| "learning_rate": 1.4625550660792951e-05, |
| "loss": 0.311, |
| "step": 3240 |
| }, |
| { |
| "epoch": 14.317180616740089, |
| "grad_norm": 0.7350011467933655, |
| "learning_rate": 1.3744493392070483e-05, |
| "loss": 0.2985, |
| "step": 3250 |
| }, |
| { |
| "epoch": 14.361233480176212, |
| "grad_norm": 1.1536651849746704, |
| "learning_rate": 1.2863436123348016e-05, |
| "loss": 0.2556, |
| "step": 3260 |
| }, |
| { |
| "epoch": 14.405286343612335, |
| "grad_norm": 1.1384259462356567, |
| "learning_rate": 1.198237885462555e-05, |
| "loss": 0.3369, |
| "step": 3270 |
| }, |
| { |
| "epoch": 14.449339207048459, |
| "grad_norm": 0.9650891423225403, |
| "learning_rate": 1.1101321585903083e-05, |
| "loss": 0.366, |
| "step": 3280 |
| }, |
| { |
| "epoch": 14.493392070484582, |
| "grad_norm": 0.6376497149467468, |
| "learning_rate": 1.0220264317180616e-05, |
| "loss": 0.2398, |
| "step": 3290 |
| }, |
| { |
| "epoch": 14.537444933920705, |
| "grad_norm": 0.9311153888702393, |
| "learning_rate": 9.33920704845815e-06, |
| "loss": 0.2665, |
| "step": 3300 |
| }, |
| { |
| "epoch": 14.581497797356828, |
| "grad_norm": 0.8357282876968384, |
| "learning_rate": 8.458149779735682e-06, |
| "loss": 0.2894, |
| "step": 3310 |
| }, |
| { |
| "epoch": 14.625550660792952, |
| "grad_norm": 0.8187097311019897, |
| "learning_rate": 7.577092511013215e-06, |
| "loss": 0.2763, |
| "step": 3320 |
| }, |
| { |
| "epoch": 14.669603524229075, |
| "grad_norm": 0.7701286673545837, |
| "learning_rate": 6.696035242290749e-06, |
| "loss": 0.3006, |
| "step": 3330 |
| }, |
| { |
| "epoch": 14.713656387665198, |
| "grad_norm": 0.8439558744430542, |
| "learning_rate": 5.814977973568281e-06, |
| "loss": 0.2882, |
| "step": 3340 |
| }, |
| { |
| "epoch": 14.757709251101321, |
| "grad_norm": 1.196057677268982, |
| "learning_rate": 4.933920704845815e-06, |
| "loss": 0.2544, |
| "step": 3350 |
| }, |
| { |
| "epoch": 14.801762114537445, |
| "grad_norm": 0.9785457253456116, |
| "learning_rate": 4.052863436123348e-06, |
| "loss": 0.3665, |
| "step": 3360 |
| }, |
| { |
| "epoch": 14.845814977973568, |
| "grad_norm": 0.7792004346847534, |
| "learning_rate": 3.1718061674008807e-06, |
| "loss": 0.3404, |
| "step": 3370 |
| }, |
| { |
| "epoch": 14.889867841409691, |
| "grad_norm": 0.6765570044517517, |
| "learning_rate": 2.2907488986784137e-06, |
| "loss": 0.2711, |
| "step": 3380 |
| }, |
| { |
| "epoch": 14.933920704845814, |
| "grad_norm": 1.0028444528579712, |
| "learning_rate": 1.409691629955947e-06, |
| "loss": 0.2378, |
| "step": 3390 |
| }, |
| { |
| "epoch": 14.97797356828194, |
| "grad_norm": 0.9199230074882507, |
| "learning_rate": 5.286343612334801e-07, |
| "loss": 0.3425, |
| "step": 3400 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 3405, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 15, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5052503917854720.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|