| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 97, | |
| "global_step": 772, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0025906735751295338, | |
| "grad_norm": 758.2562349755826, | |
| "learning_rate": 0.0, | |
| "loss": 1.3719, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0025906735751295338, | |
| "eval_loss": 1.3159157037734985, | |
| "eval_runtime": 36.907, | |
| "eval_samples_per_second": 20.159, | |
| "eval_steps_per_second": 1.273, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0051813471502590676, | |
| "grad_norm": 666.308184823038, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": 1.36, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.007772020725388601, | |
| "grad_norm": 211.0771195353068, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 1.3746, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.010362694300518135, | |
| "grad_norm": 431.5114709683218, | |
| "learning_rate": 3e-06, | |
| "loss": 1.3412, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.012953367875647668, | |
| "grad_norm": 230.87468433791625, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 1.3837, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.015544041450777202, | |
| "grad_norm": 635.1636587738542, | |
| "learning_rate": 5e-06, | |
| "loss": 1.3761, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.018134715025906734, | |
| "grad_norm": 791.5536958334704, | |
| "learning_rate": 6e-06, | |
| "loss": 1.2855, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.02072538860103627, | |
| "grad_norm": 667.7197994216477, | |
| "learning_rate": 7e-06, | |
| "loss": 1.3267, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.023316062176165803, | |
| "grad_norm": 254.3855973692125, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 1.2977, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.025906735751295335, | |
| "grad_norm": 162.29347257682093, | |
| "learning_rate": 9e-06, | |
| "loss": 1.3522, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.02849740932642487, | |
| "grad_norm": 352.6352930651456, | |
| "learning_rate": 1e-05, | |
| "loss": 1.2688, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.031088082901554404, | |
| "grad_norm": 148.2629265526552, | |
| "learning_rate": 1.1000000000000001e-05, | |
| "loss": 1.3342, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.03367875647668394, | |
| "grad_norm": 249.88753789723657, | |
| "learning_rate": 1.2e-05, | |
| "loss": 1.2983, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.03626943005181347, | |
| "grad_norm": 184.03358422636597, | |
| "learning_rate": 1.3000000000000001e-05, | |
| "loss": 1.3291, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.038860103626943004, | |
| "grad_norm": 198.4491469860763, | |
| "learning_rate": 1.4e-05, | |
| "loss": 1.4014, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.04145077720207254, | |
| "grad_norm": 680.9537058769038, | |
| "learning_rate": 1.5000000000000002e-05, | |
| "loss": 1.3775, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.04404145077720207, | |
| "grad_norm": 563.0247638614801, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 1.3228, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.046632124352331605, | |
| "grad_norm": 271.985463813746, | |
| "learning_rate": 1.7e-05, | |
| "loss": 1.3695, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.04922279792746114, | |
| "grad_norm": 399.51218452223316, | |
| "learning_rate": 1.8e-05, | |
| "loss": 1.2556, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.05181347150259067, | |
| "grad_norm": 160.70697055826656, | |
| "learning_rate": 1.9e-05, | |
| "loss": 1.2982, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.054404145077720206, | |
| "grad_norm": 227.8927504687491, | |
| "learning_rate": 2e-05, | |
| "loss": 1.3532, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.05699481865284974, | |
| "grad_norm": 550.1538868076032, | |
| "learning_rate": 2.1000000000000002e-05, | |
| "loss": 1.2603, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.05958549222797927, | |
| "grad_norm": 291.8994359919024, | |
| "learning_rate": 2.2000000000000003e-05, | |
| "loss": 1.3663, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.06217616580310881, | |
| "grad_norm": 120.60677833129643, | |
| "learning_rate": 2.3e-05, | |
| "loss": 1.3129, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.06476683937823834, | |
| "grad_norm": 414.4006662101242, | |
| "learning_rate": 2.4e-05, | |
| "loss": 1.3037, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.06735751295336788, | |
| "grad_norm": 141.48324465317884, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.3095, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.06994818652849741, | |
| "grad_norm": 147.86066819937994, | |
| "learning_rate": 2.6000000000000002e-05, | |
| "loss": 1.2372, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.07253886010362694, | |
| "grad_norm": 214.47337614964576, | |
| "learning_rate": 2.7000000000000002e-05, | |
| "loss": 1.3384, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.07512953367875648, | |
| "grad_norm": 898.4324889241673, | |
| "learning_rate": 2.8e-05, | |
| "loss": 1.2003, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.07772020725388601, | |
| "grad_norm": 128.83026557596128, | |
| "learning_rate": 2.9e-05, | |
| "loss": 1.2172, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.08031088082901554, | |
| "grad_norm": 183.0777862405529, | |
| "learning_rate": 3.0000000000000004e-05, | |
| "loss": 1.2674, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.08290155440414508, | |
| "grad_norm": 119.01841833358732, | |
| "learning_rate": 3.1e-05, | |
| "loss": 1.2554, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.08549222797927461, | |
| "grad_norm": 117.65980267542858, | |
| "learning_rate": 3.2000000000000005e-05, | |
| "loss": 1.2716, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.08808290155440414, | |
| "grad_norm": 82.40151099433953, | |
| "learning_rate": 3.3e-05, | |
| "loss": 1.2019, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.09067357512953368, | |
| "grad_norm": 82.61816783653785, | |
| "learning_rate": 3.4e-05, | |
| "loss": 1.2424, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.09326424870466321, | |
| "grad_norm": 136.42743433868276, | |
| "learning_rate": 3.5000000000000004e-05, | |
| "loss": 1.2066, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.09585492227979274, | |
| "grad_norm": 36.775911657584444, | |
| "learning_rate": 3.6e-05, | |
| "loss": 1.2485, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.09844559585492228, | |
| "grad_norm": 56.55022603284064, | |
| "learning_rate": 3.7000000000000005e-05, | |
| "loss": 1.2112, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.10103626943005181, | |
| "grad_norm": 50.09896932886107, | |
| "learning_rate": 3.8e-05, | |
| "loss": 1.2027, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.10362694300518134, | |
| "grad_norm": 54.2661481198025, | |
| "learning_rate": 3.9e-05, | |
| "loss": 1.2673, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.10621761658031088, | |
| "grad_norm": 60.04145981731815, | |
| "learning_rate": 4e-05, | |
| "loss": 1.1648, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.10880829015544041, | |
| "grad_norm": 169.47741055545822, | |
| "learning_rate": 3.999981580539036e-05, | |
| "loss": 1.2393, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.11139896373056994, | |
| "grad_norm": 43.64716987307323, | |
| "learning_rate": 3.9999263224954204e-05, | |
| "loss": 1.2906, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.11398963730569948, | |
| "grad_norm": 51.3206609767585, | |
| "learning_rate": 3.999834226886976e-05, | |
| "loss": 1.1807, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.11658031088082901, | |
| "grad_norm": 38.95055887413869, | |
| "learning_rate": 3.999705295410054e-05, | |
| "loss": 1.1825, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.11917098445595854, | |
| "grad_norm": 40.59968974426338, | |
| "learning_rate": 3.999539530439504e-05, | |
| "loss": 1.193, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.12176165803108809, | |
| "grad_norm": 34.5796571445333, | |
| "learning_rate": 3.9993369350286265e-05, | |
| "loss": 1.2127, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.12435233160621761, | |
| "grad_norm": 37.97693356149241, | |
| "learning_rate": 3.99909751290912e-05, | |
| "loss": 1.1543, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.12694300518134716, | |
| "grad_norm": 82.9217015858092, | |
| "learning_rate": 3.9988212684910107e-05, | |
| "loss": 1.2329, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.12953367875647667, | |
| "grad_norm": 49.256542144400214, | |
| "learning_rate": 3.9985082068625724e-05, | |
| "loss": 1.212, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.13212435233160622, | |
| "grad_norm": 45.025980435259484, | |
| "learning_rate": 3.998158333790231e-05, | |
| "loss": 1.2129, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.13471502590673576, | |
| "grad_norm": 45.98465689592428, | |
| "learning_rate": 3.99777165571846e-05, | |
| "loss": 1.1709, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.13730569948186527, | |
| "grad_norm": 43.481241408477906, | |
| "learning_rate": 3.997348179769661e-05, | |
| "loss": 1.1614, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.13989637305699482, | |
| "grad_norm": 82.17633750834132, | |
| "learning_rate": 3.996887913744033e-05, | |
| "loss": 1.2205, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.14248704663212436, | |
| "grad_norm": 53.0176514970764, | |
| "learning_rate": 3.9963908661194285e-05, | |
| "loss": 1.1204, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.14507772020725387, | |
| "grad_norm": 67.86382426995611, | |
| "learning_rate": 3.995857046051196e-05, | |
| "loss": 1.1839, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.14766839378238342, | |
| "grad_norm": 31.282407703790597, | |
| "learning_rate": 3.995286463372013e-05, | |
| "loss": 1.2126, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.15025906735751296, | |
| "grad_norm": 52.200764429265604, | |
| "learning_rate": 3.994679128591706e-05, | |
| "loss": 1.2036, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.15284974093264247, | |
| "grad_norm": 60.706608653531895, | |
| "learning_rate": 3.9940350528970535e-05, | |
| "loss": 1.1848, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.15544041450777202, | |
| "grad_norm": 47.31754062899529, | |
| "learning_rate": 3.993354248151583e-05, | |
| "loss": 1.0869, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.15803108808290156, | |
| "grad_norm": 49.42450836392811, | |
| "learning_rate": 3.9926367268953514e-05, | |
| "loss": 1.2651, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.16062176165803108, | |
| "grad_norm": 38.791167030088886, | |
| "learning_rate": 3.991882502344712e-05, | |
| "loss": 1.1881, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.16321243523316062, | |
| "grad_norm": 56.16339499737216, | |
| "learning_rate": 3.991091588392077e-05, | |
| "loss": 1.1518, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.16580310880829016, | |
| "grad_norm": 861.8559063020828, | |
| "learning_rate": 3.990263999605652e-05, | |
| "loss": 1.1614, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.16839378238341968, | |
| "grad_norm": 50.92822786500888, | |
| "learning_rate": 3.989399751229179e-05, | |
| "loss": 1.1998, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.17098445595854922, | |
| "grad_norm": 31.04121324055666, | |
| "learning_rate": 3.988498859181645e-05, | |
| "loss": 1.1795, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.17357512953367876, | |
| "grad_norm": 50.33061983380845, | |
| "learning_rate": 3.9875613400569975e-05, | |
| "loss": 1.1742, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.17616580310880828, | |
| "grad_norm": 75.20462514003519, | |
| "learning_rate": 3.986587211123833e-05, | |
| "loss": 1.1856, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.17875647668393782, | |
| "grad_norm": 38.82139317052205, | |
| "learning_rate": 3.98557649032508e-05, | |
| "loss": 1.1529, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.18134715025906736, | |
| "grad_norm": 36.55988806615175, | |
| "learning_rate": 3.984529196277674e-05, | |
| "loss": 1.1884, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.18393782383419688, | |
| "grad_norm": 104.8931793971097, | |
| "learning_rate": 3.983445348272203e-05, | |
| "loss": 1.2182, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.18652849740932642, | |
| "grad_norm": 36.50395409234617, | |
| "learning_rate": 3.982324966272566e-05, | |
| "loss": 1.1609, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.18911917098445596, | |
| "grad_norm": 35.019191693448626, | |
| "learning_rate": 3.981168070915594e-05, | |
| "loss": 1.173, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.19170984455958548, | |
| "grad_norm": 33.378390048053596, | |
| "learning_rate": 3.979974683510677e-05, | |
| "loss": 1.173, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.19430051813471502, | |
| "grad_norm": 43.356840136984154, | |
| "learning_rate": 3.978744826039366e-05, | |
| "loss": 1.2032, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.19689119170984457, | |
| "grad_norm": 31.285725922510768, | |
| "learning_rate": 3.977478521154974e-05, | |
| "loss": 1.1569, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.19948186528497408, | |
| "grad_norm": 35.19264482867074, | |
| "learning_rate": 3.9761757921821544e-05, | |
| "loss": 1.1365, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.20207253886010362, | |
| "grad_norm": 44.66037256551279, | |
| "learning_rate": 3.974836663116472e-05, | |
| "loss": 1.164, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.20466321243523317, | |
| "grad_norm": 68.91101457952654, | |
| "learning_rate": 3.973461158623963e-05, | |
| "loss": 1.2256, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.20725388601036268, | |
| "grad_norm": 45.866521854583, | |
| "learning_rate": 3.9720493040406786e-05, | |
| "loss": 1.1697, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.20984455958549222, | |
| "grad_norm": 59.63095169617338, | |
| "learning_rate": 3.970601125372218e-05, | |
| "loss": 1.2094, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.21243523316062177, | |
| "grad_norm": 39.085597271064216, | |
| "learning_rate": 3.9691166492932535e-05, | |
| "loss": 1.1048, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.21502590673575128, | |
| "grad_norm": 36.40256073477861, | |
| "learning_rate": 3.9675959031470336e-05, | |
| "loss": 1.248, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.21761658031088082, | |
| "grad_norm": 29.846921716586085, | |
| "learning_rate": 3.966038914944881e-05, | |
| "loss": 1.1718, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.22020725388601037, | |
| "grad_norm": 50.87052190327881, | |
| "learning_rate": 3.964445713365682e-05, | |
| "loss": 1.1529, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.22279792746113988, | |
| "grad_norm": 35.32915760431302, | |
| "learning_rate": 3.9628163277553486e-05, | |
| "loss": 1.1767, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.22538860103626943, | |
| "grad_norm": 157.5587514654703, | |
| "learning_rate": 3.961150788126286e-05, | |
| "loss": 1.2194, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.22797927461139897, | |
| "grad_norm": 25.03485489120971, | |
| "learning_rate": 3.9594491251568376e-05, | |
| "loss": 1.1392, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.23056994818652848, | |
| "grad_norm": 80.55933867045263, | |
| "learning_rate": 3.957711370190716e-05, | |
| "loss": 1.1819, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.23316062176165803, | |
| "grad_norm": 272.22874004071406, | |
| "learning_rate": 3.9559375552364325e-05, | |
| "loss": 1.0998, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.23575129533678757, | |
| "grad_norm": 91.94671663482514, | |
| "learning_rate": 3.954127712966702e-05, | |
| "loss": 1.2494, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.23834196891191708, | |
| "grad_norm": 54.31533598131098, | |
| "learning_rate": 3.952281876717843e-05, | |
| "loss": 1.1385, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.24093264248704663, | |
| "grad_norm": 103.20789745908105, | |
| "learning_rate": 3.950400080489165e-05, | |
| "loss": 1.1398, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.24352331606217617, | |
| "grad_norm": 45.14746362545893, | |
| "learning_rate": 3.94848235894234e-05, | |
| "loss": 1.2697, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.24611398963730569, | |
| "grad_norm": 21.271923336142002, | |
| "learning_rate": 3.9465287474007654e-05, | |
| "loss": 1.1397, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.24870466321243523, | |
| "grad_norm": 93.89786795431422, | |
| "learning_rate": 3.944539281848912e-05, | |
| "loss": 1.1542, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.25129533678756477, | |
| "grad_norm": 32.38768349342839, | |
| "learning_rate": 3.942513998931663e-05, | |
| "loss": 1.1693, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.25129533678756477, | |
| "eval_loss": 1.1344976425170898, | |
| "eval_runtime": 37.8807, | |
| "eval_samples_per_second": 19.641, | |
| "eval_steps_per_second": 1.241, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.2538860103626943, | |
| "grad_norm": 91.41293468177638, | |
| "learning_rate": 3.940452935953639e-05, | |
| "loss": 1.1724, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.25647668393782386, | |
| "grad_norm": 39.20645478419229, | |
| "learning_rate": 3.9383561308785075e-05, | |
| "loss": 1.1583, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.25906735751295334, | |
| "grad_norm": 35.32804513153546, | |
| "learning_rate": 3.9362236223282885e-05, | |
| "loss": 1.158, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2616580310880829, | |
| "grad_norm": 35.24783762804842, | |
| "learning_rate": 3.934055449582641e-05, | |
| "loss": 1.1552, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.26424870466321243, | |
| "grad_norm": 33.743808031979775, | |
| "learning_rate": 3.931851652578137e-05, | |
| "loss": 1.264, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.266839378238342, | |
| "grad_norm": 113.49798793226394, | |
| "learning_rate": 3.92961227190753e-05, | |
| "loss": 1.2361, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.2694300518134715, | |
| "grad_norm": 31.813807349410364, | |
| "learning_rate": 3.9273373488190036e-05, | |
| "loss": 1.1246, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.27202072538860106, | |
| "grad_norm": 29.391695486306187, | |
| "learning_rate": 3.925026925215417e-05, | |
| "loss": 1.1142, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.27461139896373055, | |
| "grad_norm": 33.79933331839905, | |
| "learning_rate": 3.922681043653526e-05, | |
| "loss": 1.1401, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.2772020725388601, | |
| "grad_norm": 39.09509012730907, | |
| "learning_rate": 3.920299747343204e-05, | |
| "loss": 1.1822, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.27979274611398963, | |
| "grad_norm": 37.81471938433609, | |
| "learning_rate": 3.9178830801466465e-05, | |
| "loss": 1.1592, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.2823834196891192, | |
| "grad_norm": 69.07753778460207, | |
| "learning_rate": 3.915431086577561e-05, | |
| "loss": 1.1683, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.2849740932642487, | |
| "grad_norm": 28.864787246081605, | |
| "learning_rate": 3.912943811800347e-05, | |
| "loss": 1.1179, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.28756476683937826, | |
| "grad_norm": 28.842042951717836, | |
| "learning_rate": 3.910421301629264e-05, | |
| "loss": 1.1317, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.29015544041450775, | |
| "grad_norm": 51.475482074695506, | |
| "learning_rate": 3.9078636025275904e-05, | |
| "loss": 1.1451, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.2927461139896373, | |
| "grad_norm": 33.48279556713943, | |
| "learning_rate": 3.9052707616067654e-05, | |
| "loss": 1.1554, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.29533678756476683, | |
| "grad_norm": 21.279603575929844, | |
| "learning_rate": 3.9026428266255205e-05, | |
| "loss": 1.1636, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.2979274611398964, | |
| "grad_norm": 36.226178034876675, | |
| "learning_rate": 3.899979845989003e-05, | |
| "loss": 1.1966, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.3005181347150259, | |
| "grad_norm": 29.90506353145981, | |
| "learning_rate": 3.897281868747878e-05, | |
| "loss": 1.1888, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.30310880829015546, | |
| "grad_norm": 36.04602777809767, | |
| "learning_rate": 3.894548944597434e-05, | |
| "loss": 1.2066, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.30569948186528495, | |
| "grad_norm": 36.42793844948301, | |
| "learning_rate": 3.8917811238766606e-05, | |
| "loss": 1.1712, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.3082901554404145, | |
| "grad_norm": 58.788967662325696, | |
| "learning_rate": 3.888978457567323e-05, | |
| "loss": 1.1225, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.31088082901554404, | |
| "grad_norm": 29.357299816022326, | |
| "learning_rate": 3.886140997293024e-05, | |
| "loss": 1.1315, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.3134715025906736, | |
| "grad_norm": 95.08345317107502, | |
| "learning_rate": 3.883268795318252e-05, | |
| "loss": 1.1852, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.3160621761658031, | |
| "grad_norm": 33.6623824593179, | |
| "learning_rate": 3.88036190454742e-05, | |
| "loss": 1.16, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.31865284974093266, | |
| "grad_norm": 42.587546987131105, | |
| "learning_rate": 3.8774203785238886e-05, | |
| "loss": 1.1374, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.32124352331606215, | |
| "grad_norm": 33.360649853064245, | |
| "learning_rate": 3.8744442714289816e-05, | |
| "loss": 1.1757, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.3238341968911917, | |
| "grad_norm": 49.09256643961471, | |
| "learning_rate": 3.8714336380809874e-05, | |
| "loss": 1.1782, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.32642487046632124, | |
| "grad_norm": 31.505007051172793, | |
| "learning_rate": 3.86838853393415e-05, | |
| "loss": 1.195, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.3290155440414508, | |
| "grad_norm": 34.36735417254799, | |
| "learning_rate": 3.865309015077645e-05, | |
| "loss": 1.1078, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.3316062176165803, | |
| "grad_norm": 36.63220606142181, | |
| "learning_rate": 3.862195138234551e-05, | |
| "loss": 1.1319, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.33419689119170987, | |
| "grad_norm": 53.324986862513676, | |
| "learning_rate": 3.859046960760801e-05, | |
| "loss": 1.2301, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.33678756476683935, | |
| "grad_norm": 47.41445409144979, | |
| "learning_rate": 3.855864540644126e-05, | |
| "loss": 1.2366, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.3393782383419689, | |
| "grad_norm": 32.57355122427366, | |
| "learning_rate": 3.8526479365029906e-05, | |
| "loss": 1.142, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.34196891191709844, | |
| "grad_norm": 28.445824333644715, | |
| "learning_rate": 3.849397207585508e-05, | |
| "loss": 1.0847, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.344559585492228, | |
| "grad_norm": 49.23062726715889, | |
| "learning_rate": 3.846112413768353e-05, | |
| "loss": 1.2241, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.3471502590673575, | |
| "grad_norm": 53.424206543788074, | |
| "learning_rate": 3.842793615555657e-05, | |
| "loss": 1.2392, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.34974093264248707, | |
| "grad_norm": 38.19316140175426, | |
| "learning_rate": 3.8394408740778934e-05, | |
| "loss": 1.1208, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.35233160621761656, | |
| "grad_norm": 32.35931252369273, | |
| "learning_rate": 3.836054251090755e-05, | |
| "loss": 1.1604, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.3549222797927461, | |
| "grad_norm": 37.90085344799495, | |
| "learning_rate": 3.83263380897401e-05, | |
| "loss": 1.1134, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.35751295336787564, | |
| "grad_norm": 44.49191588319939, | |
| "learning_rate": 3.829179610730359e-05, | |
| "loss": 1.1281, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.3601036269430052, | |
| "grad_norm": 141.98524430756757, | |
| "learning_rate": 3.8256917199842715e-05, | |
| "loss": 1.0928, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.3626943005181347, | |
| "grad_norm": 30.887093976524472, | |
| "learning_rate": 3.822170200980815e-05, | |
| "loss": 1.0936, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.36528497409326427, | |
| "grad_norm": 21.980521878837745, | |
| "learning_rate": 3.818615118584472e-05, | |
| "loss": 1.1368, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.36787564766839376, | |
| "grad_norm": 538.6650762618656, | |
| "learning_rate": 3.815026538277943e-05, | |
| "loss": 1.0918, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.3704663212435233, | |
| "grad_norm": 40.842881572203, | |
| "learning_rate": 3.811404526160943e-05, | |
| "loss": 1.1705, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.37305699481865284, | |
| "grad_norm": 26.891553492377298, | |
| "learning_rate": 3.8077491489489835e-05, | |
| "loss": 1.1468, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.3756476683937824, | |
| "grad_norm": 45.138483181178074, | |
| "learning_rate": 3.8040604739721415e-05, | |
| "loss": 1.1679, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.37823834196891193, | |
| "grad_norm": 35.133763086168244, | |
| "learning_rate": 3.8003385691738227e-05, | |
| "loss": 1.1029, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.38082901554404147, | |
| "grad_norm": 36.941250802707344, | |
| "learning_rate": 3.7965835031095065e-05, | |
| "loss": 1.1491, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.38341968911917096, | |
| "grad_norm": 90.1080256703095, | |
| "learning_rate": 3.792795344945485e-05, | |
| "loss": 1.1212, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.3860103626943005, | |
| "grad_norm": 39.70360899750413, | |
| "learning_rate": 3.7889741644575914e-05, | |
| "loss": 1.15, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.38860103626943004, | |
| "grad_norm": 28.229369877304094, | |
| "learning_rate": 3.78512003202991e-05, | |
| "loss": 1.1111, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.3911917098445596, | |
| "grad_norm": 31.611752191925987, | |
| "learning_rate": 3.7812330186534815e-05, | |
| "loss": 1.1366, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.39378238341968913, | |
| "grad_norm": 38.196015586772425, | |
| "learning_rate": 3.777313195924998e-05, | |
| "loss": 1.1433, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.3963730569948187, | |
| "grad_norm": 22.732638044547453, | |
| "learning_rate": 3.773360636045481e-05, | |
| "loss": 1.1125, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.39896373056994816, | |
| "grad_norm": 90.19158665385014, | |
| "learning_rate": 3.7693754118189525e-05, | |
| "loss": 1.1242, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.4015544041450777, | |
| "grad_norm": 42.43479974993017, | |
| "learning_rate": 3.765357596651095e-05, | |
| "loss": 1.1191, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.40414507772020725, | |
| "grad_norm": 88.0076735720364, | |
| "learning_rate": 3.761307264547899e-05, | |
| "loss": 1.1718, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.4067357512953368, | |
| "grad_norm": 30.782507703935767, | |
| "learning_rate": 3.757224490114297e-05, | |
| "loss": 1.109, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.40932642487046633, | |
| "grad_norm": 69.89871106113397, | |
| "learning_rate": 3.7531093485527943e-05, | |
| "loss": 1.1018, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.4119170984455959, | |
| "grad_norm": 37.339006645717305, | |
| "learning_rate": 3.7489619156620796e-05, | |
| "loss": 1.1358, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.41450777202072536, | |
| "grad_norm": 28.06388054378899, | |
| "learning_rate": 3.744782267835632e-05, | |
| "loss": 1.0847, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.4170984455958549, | |
| "grad_norm": 54.05874281297702, | |
| "learning_rate": 3.740570482060311e-05, | |
| "loss": 1.1682, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.41968911917098445, | |
| "grad_norm": 32.299093265328835, | |
| "learning_rate": 3.73632663591494e-05, | |
| "loss": 1.1413, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.422279792746114, | |
| "grad_norm": 31.213652090157694, | |
| "learning_rate": 3.732050807568878e-05, | |
| "loss": 1.1313, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.42487046632124353, | |
| "grad_norm": 40.01090035937505, | |
| "learning_rate": 3.727743075780578e-05, | |
| "loss": 1.1513, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.4274611398963731, | |
| "grad_norm": 47.11352577964853, | |
| "learning_rate": 3.723403519896136e-05, | |
| "loss": 1.2192, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.43005181347150256, | |
| "grad_norm": 28.645086506093037, | |
| "learning_rate": 3.7190322198478355e-05, | |
| "loss": 1.1097, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.4326424870466321, | |
| "grad_norm": 35.28541113925116, | |
| "learning_rate": 3.7146292561526654e-05, | |
| "loss": 1.1557, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.43523316062176165, | |
| "grad_norm": 58.30281063037669, | |
| "learning_rate": 3.7101947099108425e-05, | |
| "loss": 1.1829, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.4378238341968912, | |
| "grad_norm": 26.33563548968379, | |
| "learning_rate": 3.70572866280432e-05, | |
| "loss": 1.147, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.44041450777202074, | |
| "grad_norm": 57.00052875402651, | |
| "learning_rate": 3.701231197095277e-05, | |
| "loss": 1.1212, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.4430051813471503, | |
| "grad_norm": 23.672828037237174, | |
| "learning_rate": 3.696702395624608e-05, | |
| "loss": 1.1152, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.44559585492227977, | |
| "grad_norm": 41.1264174112964, | |
| "learning_rate": 3.692142341810395e-05, | |
| "loss": 1.1154, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.4481865284974093, | |
| "grad_norm": 26.72177706144361, | |
| "learning_rate": 3.6875511196463715e-05, | |
| "loss": 1.1725, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.45077720207253885, | |
| "grad_norm": 95.4088800585977, | |
| "learning_rate": 3.682928813700375e-05, | |
| "loss": 1.1339, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.4533678756476684, | |
| "grad_norm": 34.33666578349465, | |
| "learning_rate": 3.678275509112788e-05, | |
| "loss": 1.1867, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.45595854922279794, | |
| "grad_norm": 31.032304531003014, | |
| "learning_rate": 3.6735912915949745e-05, | |
| "loss": 1.1386, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.4585492227979275, | |
| "grad_norm": 55.22043313188224, | |
| "learning_rate": 3.6688762474276945e-05, | |
| "loss": 1.1102, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.46113989637305697, | |
| "grad_norm": 29.82713377876857, | |
| "learning_rate": 3.6641304634595216e-05, | |
| "loss": 1.1564, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.4637305699481865, | |
| "grad_norm": 35.71025459541737, | |
| "learning_rate": 3.659354027105238e-05, | |
| "loss": 1.0939, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.46632124352331605, | |
| "grad_norm": 52.41175655642653, | |
| "learning_rate": 3.6545470263442265e-05, | |
| "loss": 1.1578, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.4689119170984456, | |
| "grad_norm": 27.682485766528306, | |
| "learning_rate": 3.649709549718849e-05, | |
| "loss": 1.1875, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.47150259067357514, | |
| "grad_norm": 36.53293663303487, | |
| "learning_rate": 3.6448416863328186e-05, | |
| "loss": 1.1111, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.4740932642487047, | |
| "grad_norm": 31.45177998538027, | |
| "learning_rate": 3.639943525849555e-05, | |
| "loss": 1.113, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.47668393782383417, | |
| "grad_norm": 28.323097072885673, | |
| "learning_rate": 3.635015158490533e-05, | |
| "loss": 1.1159, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.4792746113989637, | |
| "grad_norm": 47.75573754341213, | |
| "learning_rate": 3.6300566750336225e-05, | |
| "loss": 1.1305, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.48186528497409326, | |
| "grad_norm": 21.384095061494357, | |
| "learning_rate": 3.625068166811418e-05, | |
| "loss": 1.1369, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.4844559585492228, | |
| "grad_norm": 30.714645036809546, | |
| "learning_rate": 3.6200497257095504e-05, | |
| "loss": 1.1858, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.48704663212435234, | |
| "grad_norm": 35.12161426399798, | |
| "learning_rate": 3.615001444165001e-05, | |
| "loss": 1.1293, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.4896373056994819, | |
| "grad_norm": 116.83443661381396, | |
| "learning_rate": 3.6099234151643924e-05, | |
| "loss": 1.1515, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.49222797927461137, | |
| "grad_norm": 55.47885243409044, | |
| "learning_rate": 3.604815732242283e-05, | |
| "loss": 1.112, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.4948186528497409, | |
| "grad_norm": 32.332747429034285, | |
| "learning_rate": 3.5996784894794394e-05, | |
| "loss": 1.1661, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.49740932642487046, | |
| "grad_norm": 33.039210183180046, | |
| "learning_rate": 3.594511781501103e-05, | |
| "loss": 1.1244, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 21.325687337182504, | |
| "learning_rate": 3.58931570347525e-05, | |
| "loss": 1.1634, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.5025906735751295, | |
| "grad_norm": 51.37599478469561, | |
| "learning_rate": 3.584090351110838e-05, | |
| "loss": 1.2106, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.5025906735751295, | |
| "eval_loss": 1.1119717359542847, | |
| "eval_runtime": 49.6027, | |
| "eval_samples_per_second": 14.999, | |
| "eval_steps_per_second": 0.948, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.5051813471502591, | |
| "grad_norm": 42.105169991612456, | |
| "learning_rate": 3.57883582065604e-05, | |
| "loss": 1.1303, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.5077720207253886, | |
| "grad_norm": 37.14457014578168, | |
| "learning_rate": 3.573552208896474e-05, | |
| "loss": 1.1483, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.5103626943005182, | |
| "grad_norm": 28.56241612018119, | |
| "learning_rate": 3.568239613153421e-05, | |
| "loss": 1.0843, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.5129533678756477, | |
| "grad_norm": 35.399304035761865, | |
| "learning_rate": 3.5628981312820315e-05, | |
| "loss": 1.1177, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.5155440414507773, | |
| "grad_norm": 25.91156850470446, | |
| "learning_rate": 3.557527861669522e-05, | |
| "loss": 1.1215, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.5181347150259067, | |
| "grad_norm": 43.509516777992324, | |
| "learning_rate": 3.552128903233363e-05, | |
| "loss": 1.1532, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.5207253886010362, | |
| "grad_norm": 38.18164449834795, | |
| "learning_rate": 3.54670135541946e-05, | |
| "loss": 1.1142, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.5233160621761658, | |
| "grad_norm": 48.576743289054534, | |
| "learning_rate": 3.541245318200318e-05, | |
| "loss": 1.1152, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.5259067357512953, | |
| "grad_norm": 38.65411737007163, | |
| "learning_rate": 3.5357608920732e-05, | |
| "loss": 1.1607, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.5284974093264249, | |
| "grad_norm": 35.663493907396834, | |
| "learning_rate": 3.530248178058282e-05, | |
| "loss": 1.1273, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.5310880829015544, | |
| "grad_norm": 26.829817821665976, | |
| "learning_rate": 3.5247072776967805e-05, | |
| "loss": 1.1174, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.533678756476684, | |
| "grad_norm": 39.79604912152638, | |
| "learning_rate": 3.519138293049097e-05, | |
| "loss": 1.1811, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.5362694300518135, | |
| "grad_norm": 32.26179097390416, | |
| "learning_rate": 3.513541326692925e-05, | |
| "loss": 1.1346, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.538860103626943, | |
| "grad_norm": 24.35769329902787, | |
| "learning_rate": 3.5079164817213684e-05, | |
| "loss": 1.1061, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.5414507772020726, | |
| "grad_norm": 26.645546258363844, | |
| "learning_rate": 3.5022638617410396e-05, | |
| "loss": 1.0514, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.5440414507772021, | |
| "grad_norm": 105.19676603444857, | |
| "learning_rate": 3.496583570870152e-05, | |
| "loss": 1.1474, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.5466321243523317, | |
| "grad_norm": 61.600623030405885, | |
| "learning_rate": 3.4908757137366006e-05, | |
| "loss": 1.104, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.5492227979274611, | |
| "grad_norm": 31.65460129853052, | |
| "learning_rate": 3.485140395476038e-05, | |
| "loss": 1.0737, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.5518134715025906, | |
| "grad_norm": 26.860379117211497, | |
| "learning_rate": 3.4793777217299346e-05, | |
| "loss": 1.1119, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.5544041450777202, | |
| "grad_norm": 39.89324262309783, | |
| "learning_rate": 3.473587798643633e-05, | |
| "loss": 1.1626, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.5569948186528497, | |
| "grad_norm": 39.77638257731599, | |
| "learning_rate": 3.467770732864399e-05, | |
| "loss": 1.1545, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.5595854922279793, | |
| "grad_norm": 30.994657564291458, | |
| "learning_rate": 3.461926631539445e-05, | |
| "loss": 1.1646, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.5621761658031088, | |
| "grad_norm": 51.99674092516571, | |
| "learning_rate": 3.4560556023139695e-05, | |
| "loss": 1.1638, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.5647668393782384, | |
| "grad_norm": 58.5132713002146, | |
| "learning_rate": 3.450157753329166e-05, | |
| "loss": 1.1461, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.5673575129533679, | |
| "grad_norm": 30.712469030418482, | |
| "learning_rate": 3.4442331932202326e-05, | |
| "loss": 1.1583, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.5699481865284974, | |
| "grad_norm": 47.00217426642832, | |
| "learning_rate": 3.438282031114374e-05, | |
| "loss": 1.1154, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.572538860103627, | |
| "grad_norm": 37.33927961163222, | |
| "learning_rate": 3.432304376628787e-05, | |
| "loss": 1.1372, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.5751295336787565, | |
| "grad_norm": 28.858636933974392, | |
| "learning_rate": 3.4263003398686464e-05, | |
| "loss": 1.0488, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.5777202072538861, | |
| "grad_norm": 37.842230890171486, | |
| "learning_rate": 3.420270031425072e-05, | |
| "loss": 1.1892, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.5803108808290155, | |
| "grad_norm": 32.65394945357516, | |
| "learning_rate": 3.4142135623730954e-05, | |
| "loss": 1.1218, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.582901554404145, | |
| "grad_norm": 115.22040829465772, | |
| "learning_rate": 3.4081310442696114e-05, | |
| "loss": 1.1546, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.5854922279792746, | |
| "grad_norm": 31.20514468446119, | |
| "learning_rate": 3.402022589151325e-05, | |
| "loss": 1.0969, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.5880829015544041, | |
| "grad_norm": 52.8397361926395, | |
| "learning_rate": 3.395888309532687e-05, | |
| "loss": 1.1218, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.5906735751295337, | |
| "grad_norm": 51.7991692917308, | |
| "learning_rate": 3.3897283184038215e-05, | |
| "loss": 1.1395, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.5932642487046632, | |
| "grad_norm": 33.56775233970504, | |
| "learning_rate": 3.3835427292284445e-05, | |
| "loss": 1.1107, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.5958549222797928, | |
| "grad_norm": 46.081120788214314, | |
| "learning_rate": 3.3773316559417734e-05, | |
| "loss": 1.1472, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.5984455958549223, | |
| "grad_norm": 41.72558170492288, | |
| "learning_rate": 3.371095212948431e-05, | |
| "loss": 1.1871, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.6010362694300518, | |
| "grad_norm": 34.27957927587091, | |
| "learning_rate": 3.364833515120336e-05, | |
| "loss": 1.1376, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.6036269430051814, | |
| "grad_norm": 36.58452602010953, | |
| "learning_rate": 3.358546677794586e-05, | |
| "loss": 1.1885, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.6062176165803109, | |
| "grad_norm": 28.010809914189192, | |
| "learning_rate": 3.352234816771337e-05, | |
| "loss": 1.102, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.6088082901554405, | |
| "grad_norm": 24.78419558611963, | |
| "learning_rate": 3.3458980483116664e-05, | |
| "loss": 1.0818, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.6113989637305699, | |
| "grad_norm": 28.12830040081226, | |
| "learning_rate": 3.3395364891354316e-05, | |
| "loss": 1.1862, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.6139896373056994, | |
| "grad_norm": 37.94181651161551, | |
| "learning_rate": 3.333150256419127e-05, | |
| "loss": 1.147, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.616580310880829, | |
| "grad_norm": 21.809518482701854, | |
| "learning_rate": 3.3267394677937134e-05, | |
| "loss": 1.0994, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.6191709844559585, | |
| "grad_norm": 32.12135773753589, | |
| "learning_rate": 3.320304241342464e-05, | |
| "loss": 1.1531, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.6217616580310881, | |
| "grad_norm": 51.959731073524054, | |
| "learning_rate": 3.31384469559878e-05, | |
| "loss": 1.1717, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.6243523316062176, | |
| "grad_norm": 28.045815836372345, | |
| "learning_rate": 3.307360949544012e-05, | |
| "loss": 1.1814, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.6269430051813472, | |
| "grad_norm": 39.55208384578746, | |
| "learning_rate": 3.300853122605268e-05, | |
| "loss": 1.1483, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.6295336787564767, | |
| "grad_norm": 29.799974205160808, | |
| "learning_rate": 3.294321334653213e-05, | |
| "loss": 1.1838, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.6321243523316062, | |
| "grad_norm": 124.31035254102245, | |
| "learning_rate": 3.2877657059998584e-05, | |
| "loss": 1.0698, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.6347150259067358, | |
| "grad_norm": 37.989925180187655, | |
| "learning_rate": 3.281186357396351e-05, | |
| "loss": 1.0984, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.6373056994818653, | |
| "grad_norm": 55.72599333657572, | |
| "learning_rate": 3.274583410030745e-05, | |
| "loss": 1.2333, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.6398963730569949, | |
| "grad_norm": 46.77079456439719, | |
| "learning_rate": 3.267956985525774e-05, | |
| "loss": 1.2157, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.6424870466321243, | |
| "grad_norm": 33.62329915252562, | |
| "learning_rate": 3.261307205936603e-05, | |
| "loss": 1.1752, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.6450777202072538, | |
| "grad_norm": 34.11794183225494, | |
| "learning_rate": 3.2546341937485884e-05, | |
| "loss": 1.1265, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.6476683937823834, | |
| "grad_norm": 36.027636323913896, | |
| "learning_rate": 3.247938071875017e-05, | |
| "loss": 1.103, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.6502590673575129, | |
| "grad_norm": 35.393219337329946, | |
| "learning_rate": 3.2412189636548456e-05, | |
| "loss": 1.1148, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.6528497409326425, | |
| "grad_norm": 31.578919022569924, | |
| "learning_rate": 3.234476992850425e-05, | |
| "loss": 1.1149, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.655440414507772, | |
| "grad_norm": 28.93717647736964, | |
| "learning_rate": 3.227712283645224e-05, | |
| "loss": 1.1425, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.6580310880829016, | |
| "grad_norm": 34.170026750703684, | |
| "learning_rate": 3.2209249606415394e-05, | |
| "loss": 1.1591, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.6606217616580311, | |
| "grad_norm": 27.52194954061608, | |
| "learning_rate": 3.214115148858201e-05, | |
| "loss": 1.1704, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.6632124352331606, | |
| "grad_norm": 81.65404753769732, | |
| "learning_rate": 3.207282973728273e-05, | |
| "loss": 1.161, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.6658031088082902, | |
| "grad_norm": 57.45351536522683, | |
| "learning_rate": 3.200428561096737e-05, | |
| "loss": 1.116, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.6683937823834197, | |
| "grad_norm": 30.968529074463714, | |
| "learning_rate": 3.193552037218179e-05, | |
| "loss": 1.1265, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.6709844559585493, | |
| "grad_norm": 37.8817748068655, | |
| "learning_rate": 3.186653528754464e-05, | |
| "loss": 1.1287, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.6735751295336787, | |
| "grad_norm": 29.197031189172545, | |
| "learning_rate": 3.179733162772398e-05, | |
| "loss": 1.1045, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.6761658031088082, | |
| "grad_norm": 36.56253841299107, | |
| "learning_rate": 3.172791066741392e-05, | |
| "loss": 1.1539, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.6787564766839378, | |
| "grad_norm": 25.799921116950998, | |
| "learning_rate": 3.165827368531113e-05, | |
| "loss": 1.0796, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.6813471502590673, | |
| "grad_norm": 82.81825216532526, | |
| "learning_rate": 3.1588421964091276e-05, | |
| "loss": 1.142, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.6839378238341969, | |
| "grad_norm": 31.100074747569124, | |
| "learning_rate": 3.151835679038542e-05, | |
| "loss": 1.0908, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.6865284974093264, | |
| "grad_norm": 25.57297200703221, | |
| "learning_rate": 3.14480794547563e-05, | |
| "loss": 1.1436, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.689119170984456, | |
| "grad_norm": 23.92492773149328, | |
| "learning_rate": 3.137759125167455e-05, | |
| "loss": 1.1202, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.6917098445595855, | |
| "grad_norm": 22.14274360766396, | |
| "learning_rate": 3.130689347949486e-05, | |
| "loss": 1.1113, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.694300518134715, | |
| "grad_norm": 26.68725288649902, | |
| "learning_rate": 3.123598744043211e-05, | |
| "loss": 1.1517, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.6968911917098446, | |
| "grad_norm": 25.559817524659362, | |
| "learning_rate": 3.1164874440537295e-05, | |
| "loss": 1.0976, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.6994818652849741, | |
| "grad_norm": 28.89996834100355, | |
| "learning_rate": 3.109355578967356e-05, | |
| "loss": 1.1932, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.7020725388601037, | |
| "grad_norm": 32.09658045195569, | |
| "learning_rate": 3.1022032801492e-05, | |
| "loss": 1.1161, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.7046632124352331, | |
| "grad_norm": 30.623705646213768, | |
| "learning_rate": 3.095030679340751e-05, | |
| "loss": 1.1993, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.7072538860103627, | |
| "grad_norm": 41.71263710932429, | |
| "learning_rate": 3.0878379086574494e-05, | |
| "loss": 1.1624, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.7098445595854922, | |
| "grad_norm": 34.68352639470226, | |
| "learning_rate": 3.0806251005862535e-05, | |
| "loss": 1.1156, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.7124352331606217, | |
| "grad_norm": 23.52580702428812, | |
| "learning_rate": 3.073392387983202e-05, | |
| "loss": 1.0963, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.7150259067357513, | |
| "grad_norm": 28.10687988214902, | |
| "learning_rate": 3.0661399040709584e-05, | |
| "loss": 1.1095, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.7176165803108808, | |
| "grad_norm": 66.72288729975841, | |
| "learning_rate": 3.05886778243637e-05, | |
| "loss": 1.0865, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.7202072538860104, | |
| "grad_norm": 25.775217430321934, | |
| "learning_rate": 3.051576157027998e-05, | |
| "loss": 1.1058, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.7227979274611399, | |
| "grad_norm": 36.82942099016794, | |
| "learning_rate": 3.0442651621536502e-05, | |
| "loss": 1.1211, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.7253886010362695, | |
| "grad_norm": 27.878820856521013, | |
| "learning_rate": 3.0369349324779115e-05, | |
| "loss": 1.1471, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.727979274611399, | |
| "grad_norm": 31.293156717285573, | |
| "learning_rate": 3.0295856030196618e-05, | |
| "loss": 1.0748, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.7305699481865285, | |
| "grad_norm": 39.315952115194435, | |
| "learning_rate": 3.022217309149588e-05, | |
| "loss": 1.0993, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.7331606217616581, | |
| "grad_norm": 36.79954071435495, | |
| "learning_rate": 3.0148301865876913e-05, | |
| "loss": 1.1045, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.7357512953367875, | |
| "grad_norm": 26.127389502147167, | |
| "learning_rate": 3.0074243714007875e-05, | |
| "loss": 1.1424, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.7383419689119171, | |
| "grad_norm": 25.608778060317068, | |
| "learning_rate": 3.0000000000000004e-05, | |
| "loss": 1.1055, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.7409326424870466, | |
| "grad_norm": 36.22629669671894, | |
| "learning_rate": 2.992557209138249e-05, | |
| "loss": 1.0845, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.7435233160621761, | |
| "grad_norm": 35.30642111132886, | |
| "learning_rate": 2.9850961359077293e-05, | |
| "loss": 1.204, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.7461139896373057, | |
| "grad_norm": 29.765894622087952, | |
| "learning_rate": 2.977616917737388e-05, | |
| "loss": 1.168, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.7487046632124352, | |
| "grad_norm": 27.194683587397567, | |
| "learning_rate": 2.9701196923903927e-05, | |
| "loss": 1.1236, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.7512953367875648, | |
| "grad_norm": 63.09779240191165, | |
| "learning_rate": 2.9626045979615928e-05, | |
| "loss": 1.1395, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.7538860103626943, | |
| "grad_norm": 25.014233377763066, | |
| "learning_rate": 2.9550717728749768e-05, | |
| "loss": 1.1054, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.7538860103626943, | |
| "eval_loss": 1.0996382236480713, | |
| "eval_runtime": 37.9545, | |
| "eval_samples_per_second": 19.602, | |
| "eval_steps_per_second": 1.238, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.7564766839378239, | |
| "grad_norm": 27.481891737318097, | |
| "learning_rate": 2.947521355881122e-05, | |
| "loss": 1.1252, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.7590673575129534, | |
| "grad_norm": 67.57807413949878, | |
| "learning_rate": 2.9399534860546404e-05, | |
| "loss": 1.1761, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.7616580310880829, | |
| "grad_norm": 65.66834495909988, | |
| "learning_rate": 2.932368302791614e-05, | |
| "loss": 1.0551, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.7642487046632125, | |
| "grad_norm": 30.051210942517116, | |
| "learning_rate": 2.92476594580703e-05, | |
| "loss": 1.138, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.7668393782383419, | |
| "grad_norm": 22.693089678510507, | |
| "learning_rate": 2.917146555132206e-05, | |
| "loss": 1.1495, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.7694300518134715, | |
| "grad_norm": 53.84166280540606, | |
| "learning_rate": 2.909510271112212e-05, | |
| "loss": 1.1409, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.772020725388601, | |
| "grad_norm": 32.69106061524578, | |
| "learning_rate": 2.9018572344032823e-05, | |
| "loss": 1.1709, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.7746113989637305, | |
| "grad_norm": 39.44484991312582, | |
| "learning_rate": 2.8941875859702283e-05, | |
| "loss": 1.1138, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.7772020725388601, | |
| "grad_norm": 31.51857596969122, | |
| "learning_rate": 2.88650146708384e-05, | |
| "loss": 1.1931, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.7797927461139896, | |
| "grad_norm": 70.51218412614058, | |
| "learning_rate": 2.878799019318283e-05, | |
| "loss": 1.155, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.7823834196891192, | |
| "grad_norm": 80.27969224752457, | |
| "learning_rate": 2.8710803845484955e-05, | |
| "loss": 1.1425, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.7849740932642487, | |
| "grad_norm": 28.16560857981767, | |
| "learning_rate": 2.8633457049475678e-05, | |
| "loss": 1.1072, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.7875647668393783, | |
| "grad_norm": 41.15138307552231, | |
| "learning_rate": 2.855595122984129e-05, | |
| "loss": 1.1492, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.7901554404145078, | |
| "grad_norm": 23.894217282116276, | |
| "learning_rate": 2.847828781419722e-05, | |
| "loss": 1.1136, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.7927461139896373, | |
| "grad_norm": 25.005501120810248, | |
| "learning_rate": 2.8400468233061708e-05, | |
| "loss": 1.0921, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.7953367875647669, | |
| "grad_norm": 30.91791938195468, | |
| "learning_rate": 2.832249391982949e-05, | |
| "loss": 1.1098, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.7979274611398963, | |
| "grad_norm": 44.776563922922726, | |
| "learning_rate": 2.8244366310745398e-05, | |
| "loss": 1.1845, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.8005181347150259, | |
| "grad_norm": 19.059329544784376, | |
| "learning_rate": 2.816608684487787e-05, | |
| "loss": 1.169, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.8031088082901554, | |
| "grad_norm": 63.97334641962602, | |
| "learning_rate": 2.8087656964092472e-05, | |
| "loss": 1.124, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.805699481865285, | |
| "grad_norm": 30.878848859015882, | |
| "learning_rate": 2.8009078113025335e-05, | |
| "loss": 1.2087, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.8082901554404145, | |
| "grad_norm": 34.63835471543836, | |
| "learning_rate": 2.7930351739056533e-05, | |
| "loss": 1.1338, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.810880829015544, | |
| "grad_norm": 30.03178182445718, | |
| "learning_rate": 2.7851479292283442e-05, | |
| "loss": 1.1321, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.8134715025906736, | |
| "grad_norm": 38.42236523356876, | |
| "learning_rate": 2.7772462225494013e-05, | |
| "loss": 1.1557, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.8160621761658031, | |
| "grad_norm": 39.179683790956744, | |
| "learning_rate": 2.7693301994140026e-05, | |
| "loss": 1.1201, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.8186528497409327, | |
| "grad_norm": 38.32243159447327, | |
| "learning_rate": 2.761400005631028e-05, | |
| "loss": 1.1105, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.8212435233160622, | |
| "grad_norm": 39.913808227411835, | |
| "learning_rate": 2.7534557872703705e-05, | |
| "loss": 1.1598, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.8238341968911918, | |
| "grad_norm": 69.73521867812421, | |
| "learning_rate": 2.7454976906602513e-05, | |
| "loss": 1.1145, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.8264248704663213, | |
| "grad_norm": 65.55887588207746, | |
| "learning_rate": 2.7375258623845207e-05, | |
| "loss": 1.1255, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.8290155440414507, | |
| "grad_norm": 30.980111545641563, | |
| "learning_rate": 2.7295404492799575e-05, | |
| "loss": 1.122, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.8316062176165803, | |
| "grad_norm": 30.12179911444832, | |
| "learning_rate": 2.721541598433567e-05, | |
| "loss": 1.113, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.8341968911917098, | |
| "grad_norm": 28.329434659508582, | |
| "learning_rate": 2.7135294571798706e-05, | |
| "loss": 1.0498, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.8367875647668394, | |
| "grad_norm": 25.114787597049578, | |
| "learning_rate": 2.70550417309819e-05, | |
| "loss": 1.0633, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.8393782383419689, | |
| "grad_norm": 27.754037709590385, | |
| "learning_rate": 2.6974658940099337e-05, | |
| "loss": 1.1585, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.8419689119170984, | |
| "grad_norm": 29.489888159179444, | |
| "learning_rate": 2.6894147679758678e-05, | |
| "loss": 1.1259, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.844559585492228, | |
| "grad_norm": 24.426102194202898, | |
| "learning_rate": 2.6813509432933957e-05, | |
| "loss": 1.1515, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.8471502590673575, | |
| "grad_norm": 24.75197483331429, | |
| "learning_rate": 2.673274568493821e-05, | |
| "loss": 1.15, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.8497409326424871, | |
| "grad_norm": 40.604864626683366, | |
| "learning_rate": 2.6651857923396132e-05, | |
| "loss": 1.1219, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.8523316062176166, | |
| "grad_norm": 34.694568404196026, | |
| "learning_rate": 2.6570847638216698e-05, | |
| "loss": 1.103, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.8549222797927462, | |
| "grad_norm": 48.715136403425035, | |
| "learning_rate": 2.648971632156569e-05, | |
| "loss": 1.1675, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.8575129533678757, | |
| "grad_norm": 97.77526410121799, | |
| "learning_rate": 2.6408465467838225e-05, | |
| "loss": 1.1502, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.8601036269430051, | |
| "grad_norm": 54.697215318949276, | |
| "learning_rate": 2.632709657363124e-05, | |
| "loss": 1.1446, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.8626943005181347, | |
| "grad_norm": 38.09192002041798, | |
| "learning_rate": 2.6245611137715897e-05, | |
| "loss": 1.1333, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.8652849740932642, | |
| "grad_norm": 46.713623556984956, | |
| "learning_rate": 2.6164010661010007e-05, | |
| "loss": 1.1252, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.8678756476683938, | |
| "grad_norm": 46.40552686286593, | |
| "learning_rate": 2.6082296646550364e-05, | |
| "loss": 1.121, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.8704663212435233, | |
| "grad_norm": 37.57424454065957, | |
| "learning_rate": 2.6000470599465065e-05, | |
| "loss": 1.1671, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.8730569948186528, | |
| "grad_norm": 38.580777053099204, | |
| "learning_rate": 2.5918534026945787e-05, | |
| "loss": 1.0849, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.8756476683937824, | |
| "grad_norm": 154.3106712010981, | |
| "learning_rate": 2.5836488438220044e-05, | |
| "loss": 1.0663, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.8782383419689119, | |
| "grad_norm": 34.21394067951015, | |
| "learning_rate": 2.575433534452334e-05, | |
| "loss": 1.0895, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.8808290155440415, | |
| "grad_norm": 36.291611242733886, | |
| "learning_rate": 2.5672076259071385e-05, | |
| "loss": 1.1242, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.883419689119171, | |
| "grad_norm": 29.411623389655112, | |
| "learning_rate": 2.558971269703219e-05, | |
| "loss": 1.1005, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.8860103626943006, | |
| "grad_norm": 30.24903086761753, | |
| "learning_rate": 2.5507246175498174e-05, | |
| "loss": 1.1134, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.8886010362694301, | |
| "grad_norm": 22.032293114161938, | |
| "learning_rate": 2.5424678213458202e-05, | |
| "loss": 1.1121, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.8911917098445595, | |
| "grad_norm": 34.997361528376956, | |
| "learning_rate": 2.5342010331769635e-05, | |
| "loss": 1.1341, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.8937823834196891, | |
| "grad_norm": 28.212824875732352, | |
| "learning_rate": 2.5259244053130295e-05, | |
| "loss": 1.0748, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.8963730569948186, | |
| "grad_norm": 23.870011592985897, | |
| "learning_rate": 2.5176380902050418e-05, | |
| "loss": 1.0643, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.8989637305699482, | |
| "grad_norm": 26.10018699309748, | |
| "learning_rate": 2.5093422404824574e-05, | |
| "loss": 1.1662, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.9015544041450777, | |
| "grad_norm": 30.191468778559166, | |
| "learning_rate": 2.5010370089503578e-05, | |
| "loss": 1.1023, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.9041450777202072, | |
| "grad_norm": 55.799581973427415, | |
| "learning_rate": 2.4927225485866297e-05, | |
| "loss": 1.1538, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.9067357512953368, | |
| "grad_norm": 35.7030284720465, | |
| "learning_rate": 2.4843990125391516e-05, | |
| "loss": 1.1, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.9093264248704663, | |
| "grad_norm": 28.61763302791738, | |
| "learning_rate": 2.4760665541229712e-05, | |
| "loss": 1.0914, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.9119170984455959, | |
| "grad_norm": 33.34233685155311, | |
| "learning_rate": 2.467725326817481e-05, | |
| "loss": 1.0862, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.9145077720207254, | |
| "grad_norm": 25.441052078480084, | |
| "learning_rate": 2.4593754842635917e-05, | |
| "loss": 1.1422, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.917098445595855, | |
| "grad_norm": 24.217974454985058, | |
| "learning_rate": 2.451017180260902e-05, | |
| "loss": 1.132, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.9196891191709845, | |
| "grad_norm": 57.986011465793155, | |
| "learning_rate": 2.4426505687648653e-05, | |
| "loss": 1.2082, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.9222797927461139, | |
| "grad_norm": 34.058264716876195, | |
| "learning_rate": 2.4342758038839573e-05, | |
| "loss": 1.1679, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.9248704663212435, | |
| "grad_norm": 28.621514922275253, | |
| "learning_rate": 2.4258930398768317e-05, | |
| "loss": 1.1319, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.927461139896373, | |
| "grad_norm": 35.33355417283227, | |
| "learning_rate": 2.4175024311494835e-05, | |
| "loss": 1.0705, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.9300518134715026, | |
| "grad_norm": 46.579572933583265, | |
| "learning_rate": 2.4091041322524023e-05, | |
| "loss": 1.0842, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.9326424870466321, | |
| "grad_norm": 35.494740787672974, | |
| "learning_rate": 2.4006982978777263e-05, | |
| "loss": 1.1072, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.9352331606217616, | |
| "grad_norm": 44.56606839509262, | |
| "learning_rate": 2.392285082856394e-05, | |
| "loss": 1.1125, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.9378238341968912, | |
| "grad_norm": 46.26363869084929, | |
| "learning_rate": 2.3838646421552917e-05, | |
| "loss": 1.1268, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.9404145077720207, | |
| "grad_norm": 89.17676267680146, | |
| "learning_rate": 2.3754371308743975e-05, | |
| "loss": 1.0893, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.9430051813471503, | |
| "grad_norm": 34.87700187494181, | |
| "learning_rate": 2.367002704243927e-05, | |
| "loss": 1.1203, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.9455958549222798, | |
| "grad_norm": 32.92806939217504, | |
| "learning_rate": 2.3585615176214716e-05, | |
| "loss": 1.1488, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.9481865284974094, | |
| "grad_norm": 27.27458755248548, | |
| "learning_rate": 2.3501137264891396e-05, | |
| "loss": 1.0874, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.9507772020725389, | |
| "grad_norm": 24.959123789739834, | |
| "learning_rate": 2.3416594864506887e-05, | |
| "loss": 1.1783, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.9533678756476683, | |
| "grad_norm": 31.838670988369724, | |
| "learning_rate": 2.333198953228664e-05, | |
| "loss": 1.0759, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.9559585492227979, | |
| "grad_norm": 28.112870222863155, | |
| "learning_rate": 2.3247322826615276e-05, | |
| "loss": 1.1481, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.9585492227979274, | |
| "grad_norm": 35.08461098450067, | |
| "learning_rate": 2.316259630700787e-05, | |
| "loss": 1.0953, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.961139896373057, | |
| "grad_norm": 37.80899503618479, | |
| "learning_rate": 2.307781153408124e-05, | |
| "loss": 1.1224, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.9637305699481865, | |
| "grad_norm": 31.644978122007387, | |
| "learning_rate": 2.2992970069525202e-05, | |
| "loss": 1.1608, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.966321243523316, | |
| "grad_norm": 23.51029318210938, | |
| "learning_rate": 2.29080734760738e-05, | |
| "loss": 1.0914, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.9689119170984456, | |
| "grad_norm": 28.97240481418573, | |
| "learning_rate": 2.2823123317476522e-05, | |
| "loss": 1.1117, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.9715025906735751, | |
| "grad_norm": 36.613893678320395, | |
| "learning_rate": 2.273812115846951e-05, | |
| "loss": 1.1118, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.9740932642487047, | |
| "grad_norm": 26.402979304578093, | |
| "learning_rate": 2.2653068564746692e-05, | |
| "loss": 1.13, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.9766839378238342, | |
| "grad_norm": 114.3000444613392, | |
| "learning_rate": 2.2567967102931025e-05, | |
| "loss": 1.1539, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.9792746113989638, | |
| "grad_norm": 26.861359932396834, | |
| "learning_rate": 2.2482818340545534e-05, | |
| "loss": 1.0566, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.9818652849740933, | |
| "grad_norm": 32.75509374223994, | |
| "learning_rate": 2.2397623845984548e-05, | |
| "loss": 1.1746, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.9844559585492227, | |
| "grad_norm": 34.11964206838379, | |
| "learning_rate": 2.2312385188484718e-05, | |
| "loss": 1.0834, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.9870466321243523, | |
| "grad_norm": 38.019564122226434, | |
| "learning_rate": 2.2227103938096176e-05, | |
| "loss": 1.1074, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.9896373056994818, | |
| "grad_norm": 39.5073811375391, | |
| "learning_rate": 2.2141781665653584e-05, | |
| "loss": 1.1082, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.9922279792746114, | |
| "grad_norm": 298.4258332795163, | |
| "learning_rate": 2.205641994274721e-05, | |
| "loss": 1.125, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.9948186528497409, | |
| "grad_norm": 36.444415670935506, | |
| "learning_rate": 2.1971020341693973e-05, | |
| "loss": 1.0935, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.9974093264248705, | |
| "grad_norm": 28.96533429210575, | |
| "learning_rate": 2.188558443550849e-05, | |
| "loss": 1.0957, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 66.41241684127401, | |
| "learning_rate": 2.180011379787411e-05, | |
| "loss": 1.1335, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 1.0025906735751295, | |
| "grad_norm": 28.75549619538953, | |
| "learning_rate": 2.1714610003113887e-05, | |
| "loss": 1.1316, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 1.005181347150259, | |
| "grad_norm": 26.911837500852275, | |
| "learning_rate": 2.1629074626161647e-05, | |
| "loss": 1.1026, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 1.005181347150259, | |
| "eval_loss": 1.0908173322677612, | |
| "eval_runtime": 37.7642, | |
| "eval_samples_per_second": 19.701, | |
| "eval_steps_per_second": 1.245, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 1.0077720207253886, | |
| "grad_norm": 34.28722746775385, | |
| "learning_rate": 2.1543509242532932e-05, | |
| "loss": 1.1104, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 1.0103626943005182, | |
| "grad_norm": 37.97709310694863, | |
| "learning_rate": 2.145791542829597e-05, | |
| "loss": 1.0663, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.0129533678756477, | |
| "grad_norm": 39.379668162327384, | |
| "learning_rate": 2.1372294760042686e-05, | |
| "loss": 1.1405, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 1.0155440414507773, | |
| "grad_norm": 27.136201219298698, | |
| "learning_rate": 2.1286648814859636e-05, | |
| "loss": 1.0963, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 1.0181347150259068, | |
| "grad_norm": 39.34261641469313, | |
| "learning_rate": 2.120097917029897e-05, | |
| "loss": 1.1276, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 1.0207253886010363, | |
| "grad_norm": 46.77583801285328, | |
| "learning_rate": 2.1115287404349357e-05, | |
| "loss": 1.1171, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 1.0233160621761659, | |
| "grad_norm": 55.10335066695868, | |
| "learning_rate": 2.1029575095406933e-05, | |
| "loss": 1.0831, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.0259067357512954, | |
| "grad_norm": 76.88533851789373, | |
| "learning_rate": 2.0943843822246234e-05, | |
| "loss": 1.0925, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 1.028497409326425, | |
| "grad_norm": 29.604569209708462, | |
| "learning_rate": 2.0858095163991094e-05, | |
| "loss": 1.1259, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 1.0310880829015545, | |
| "grad_norm": 37.71348366628868, | |
| "learning_rate": 2.077233070008557e-05, | |
| "loss": 1.0792, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 1.0336787564766838, | |
| "grad_norm": 26.866133194031644, | |
| "learning_rate": 2.0686552010264872e-05, | |
| "loss": 1.1649, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 1.0362694300518134, | |
| "grad_norm": 35.739274800620635, | |
| "learning_rate": 2.060076067452622e-05, | |
| "loss": 1.0837, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.038860103626943, | |
| "grad_norm": 24.479129391259896, | |
| "learning_rate": 2.0514958273099778e-05, | |
| "loss": 1.073, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 1.0414507772020725, | |
| "grad_norm": 50.49963650108008, | |
| "learning_rate": 2.042914638641952e-05, | |
| "loss": 1.0912, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 1.044041450777202, | |
| "grad_norm": 35.6875451072032, | |
| "learning_rate": 2.0343326595094154e-05, | |
| "loss": 1.0936, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 1.0466321243523315, | |
| "grad_norm": 30.212298193414487, | |
| "learning_rate": 2.0257500479877965e-05, | |
| "loss": 1.089, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 1.049222797927461, | |
| "grad_norm": 28.65828720015124, | |
| "learning_rate": 2.0171669621641743e-05, | |
| "loss": 1.1727, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.0518134715025906, | |
| "grad_norm": 39.2199058392425, | |
| "learning_rate": 2.0085835601343627e-05, | |
| "loss": 1.1493, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 1.0544041450777202, | |
| "grad_norm": 110.01204177059546, | |
| "learning_rate": 2e-05, | |
| "loss": 1.1245, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 1.0569948186528497, | |
| "grad_norm": 43.427381349600374, | |
| "learning_rate": 1.9914164398656383e-05, | |
| "loss": 1.1183, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 1.0595854922279793, | |
| "grad_norm": 64.78768909817894, | |
| "learning_rate": 1.9828330378358264e-05, | |
| "loss": 1.1528, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 1.0621761658031088, | |
| "grad_norm": 26.50257915912425, | |
| "learning_rate": 1.974249952012204e-05, | |
| "loss": 1.1568, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.0647668393782384, | |
| "grad_norm": 27.63159204178893, | |
| "learning_rate": 1.9656673404905852e-05, | |
| "loss": 1.1071, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 1.067357512953368, | |
| "grad_norm": 27.0795355533723, | |
| "learning_rate": 1.957085361358049e-05, | |
| "loss": 1.0809, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 1.0699481865284974, | |
| "grad_norm": 41.84795332660821, | |
| "learning_rate": 1.9485041726900232e-05, | |
| "loss": 1.0744, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 1.072538860103627, | |
| "grad_norm": 143.2109134427192, | |
| "learning_rate": 1.939923932547379e-05, | |
| "loss": 1.0905, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 1.0751295336787565, | |
| "grad_norm": 89.55384065946154, | |
| "learning_rate": 1.931344798973513e-05, | |
| "loss": 1.1012, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.077720207253886, | |
| "grad_norm": 31.072074793068015, | |
| "learning_rate": 1.922766929991443e-05, | |
| "loss": 1.1141, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 1.0803108808290156, | |
| "grad_norm": 29.82683189045969, | |
| "learning_rate": 1.914190483600891e-05, | |
| "loss": 1.0842, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 1.0829015544041452, | |
| "grad_norm": 30.09708662586305, | |
| "learning_rate": 1.9056156177753776e-05, | |
| "loss": 1.1088, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 1.0854922279792747, | |
| "grad_norm": 27.637437518920503, | |
| "learning_rate": 1.897042490459307e-05, | |
| "loss": 1.058, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 1.0880829015544042, | |
| "grad_norm": 69.34285700381683, | |
| "learning_rate": 1.8884712595650653e-05, | |
| "loss": 1.0314, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.0906735751295338, | |
| "grad_norm": 25.644927284592956, | |
| "learning_rate": 1.8799020829701036e-05, | |
| "loss": 1.0916, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 1.093264248704663, | |
| "grad_norm": 30.3898986852319, | |
| "learning_rate": 1.871335118514037e-05, | |
| "loss": 1.0797, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 1.0958549222797926, | |
| "grad_norm": 22.271334693423444, | |
| "learning_rate": 1.862770523995732e-05, | |
| "loss": 1.1134, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 1.0984455958549222, | |
| "grad_norm": 35.85874616678876, | |
| "learning_rate": 1.854208457170404e-05, | |
| "loss": 1.0927, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 1.1010362694300517, | |
| "grad_norm": 43.06832041948097, | |
| "learning_rate": 1.8456490757467075e-05, | |
| "loss": 1.093, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.1036269430051813, | |
| "grad_norm": 37.83777637993467, | |
| "learning_rate": 1.8370925373838356e-05, | |
| "loss": 1.1268, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 1.1062176165803108, | |
| "grad_norm": 23.798059023605177, | |
| "learning_rate": 1.8285389996886113e-05, | |
| "loss": 1.0989, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 1.1088082901554404, | |
| "grad_norm": 25.443104465500795, | |
| "learning_rate": 1.8199886202125897e-05, | |
| "loss": 1.0581, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 1.11139896373057, | |
| "grad_norm": 23.76241444847441, | |
| "learning_rate": 1.8114415564491513e-05, | |
| "loss": 1.0908, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 1.1139896373056994, | |
| "grad_norm": 26.5600693044426, | |
| "learning_rate": 1.8028979658306033e-05, | |
| "loss": 1.1321, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.116580310880829, | |
| "grad_norm": 44.854375199828986, | |
| "learning_rate": 1.794358005725279e-05, | |
| "loss": 1.0762, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 1.1191709844559585, | |
| "grad_norm": 28.05797777410846, | |
| "learning_rate": 1.785821833434642e-05, | |
| "loss": 1.0698, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 1.121761658031088, | |
| "grad_norm": 26.488479630212364, | |
| "learning_rate": 1.7772896061903824e-05, | |
| "loss": 1.1223, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 1.1243523316062176, | |
| "grad_norm": 32.77084542157883, | |
| "learning_rate": 1.768761481151529e-05, | |
| "loss": 1.0984, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 1.1269430051813472, | |
| "grad_norm": 39.13198413130026, | |
| "learning_rate": 1.7602376154015456e-05, | |
| "loss": 1.1551, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.1295336787564767, | |
| "grad_norm": 23.878966995283953, | |
| "learning_rate": 1.751718165945447e-05, | |
| "loss": 1.1133, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 1.1321243523316062, | |
| "grad_norm": 33.90472985566232, | |
| "learning_rate": 1.743203289706898e-05, | |
| "loss": 1.1219, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 1.1347150259067358, | |
| "grad_norm": 23.340369938533712, | |
| "learning_rate": 1.734693143525331e-05, | |
| "loss": 1.1244, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 1.1373056994818653, | |
| "grad_norm": 105.6885206147852, | |
| "learning_rate": 1.7261878841530494e-05, | |
| "loss": 1.0788, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 1.1398963730569949, | |
| "grad_norm": 28.453526076458317, | |
| "learning_rate": 1.717687668252348e-05, | |
| "loss": 1.1576, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.1424870466321244, | |
| "grad_norm": 36.1473991485961, | |
| "learning_rate": 1.7091926523926205e-05, | |
| "loss": 1.0859, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 1.145077720207254, | |
| "grad_norm": 27.043461146902448, | |
| "learning_rate": 1.7007029930474804e-05, | |
| "loss": 1.1072, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 1.1476683937823835, | |
| "grad_norm": 28.066170619981435, | |
| "learning_rate": 1.6922188465918763e-05, | |
| "loss": 1.1279, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 1.150259067357513, | |
| "grad_norm": 38.62445822837212, | |
| "learning_rate": 1.6837403692992136e-05, | |
| "loss": 1.1275, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 1.1528497409326426, | |
| "grad_norm": 28.077258963587767, | |
| "learning_rate": 1.6752677173384734e-05, | |
| "loss": 1.1004, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 1.1554404145077721, | |
| "grad_norm": 42.1405744301338, | |
| "learning_rate": 1.6668010467713363e-05, | |
| "loss": 1.1141, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 1.1580310880829017, | |
| "grad_norm": 26.827291684301034, | |
| "learning_rate": 1.658340513549312e-05, | |
| "loss": 1.1216, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 1.160621761658031, | |
| "grad_norm": 30.863489441619983, | |
| "learning_rate": 1.649886273510861e-05, | |
| "loss": 1.1898, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 1.1632124352331605, | |
| "grad_norm": 27.73579733476068, | |
| "learning_rate": 1.641438482378529e-05, | |
| "loss": 1.0971, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 1.16580310880829, | |
| "grad_norm": 32.84347174567353, | |
| "learning_rate": 1.6329972957560736e-05, | |
| "loss": 1.0579, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.1683937823834196, | |
| "grad_norm": 30.06456192962641, | |
| "learning_rate": 1.6245628691256032e-05, | |
| "loss": 1.1057, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 1.1709844559585492, | |
| "grad_norm": 36.554506394377846, | |
| "learning_rate": 1.616135357844709e-05, | |
| "loss": 1.1008, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 1.1735751295336787, | |
| "grad_norm": 27.358643056184114, | |
| "learning_rate": 1.6077149171436063e-05, | |
| "loss": 1.101, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 1.1761658031088082, | |
| "grad_norm": 111.13373813893604, | |
| "learning_rate": 1.599301702122274e-05, | |
| "loss": 1.0688, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 1.1787564766839378, | |
| "grad_norm": 33.94168250727336, | |
| "learning_rate": 1.590895867747599e-05, | |
| "loss": 1.0721, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 1.1813471502590673, | |
| "grad_norm": 53.93978395349692, | |
| "learning_rate": 1.582497568850517e-05, | |
| "loss": 1.0584, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 1.1839378238341969, | |
| "grad_norm": 29.19245794937285, | |
| "learning_rate": 1.574106960123169e-05, | |
| "loss": 1.067, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 1.1865284974093264, | |
| "grad_norm": 28.06897801999048, | |
| "learning_rate": 1.5657241961160434e-05, | |
| "loss": 1.0899, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 1.189119170984456, | |
| "grad_norm": 52.31256652964293, | |
| "learning_rate": 1.557349431235135e-05, | |
| "loss": 1.0925, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 1.1917098445595855, | |
| "grad_norm": 65.39771110845307, | |
| "learning_rate": 1.5489828197390988e-05, | |
| "loss": 1.1448, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.194300518134715, | |
| "grad_norm": 27.062780348557254, | |
| "learning_rate": 1.5406245157364093e-05, | |
| "loss": 1.0871, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 1.1968911917098446, | |
| "grad_norm": 41.667025056250424, | |
| "learning_rate": 1.5322746731825195e-05, | |
| "loss": 1.048, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 1.1994818652849741, | |
| "grad_norm": 24.936669803360665, | |
| "learning_rate": 1.5239334458770291e-05, | |
| "loss": 1.1243, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 1.2020725388601037, | |
| "grad_norm": 26.65392149600558, | |
| "learning_rate": 1.5156009874608484e-05, | |
| "loss": 1.0919, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 1.2046632124352332, | |
| "grad_norm": 48.57730651937978, | |
| "learning_rate": 1.5072774514133708e-05, | |
| "loss": 1.1259, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 1.2072538860103628, | |
| "grad_norm": 31.34891257114439, | |
| "learning_rate": 1.4989629910496424e-05, | |
| "loss": 1.0733, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 1.2098445595854923, | |
| "grad_norm": 24.541559850584985, | |
| "learning_rate": 1.4906577595175428e-05, | |
| "loss": 1.1166, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 1.2124352331606219, | |
| "grad_norm": 20.4345832961354, | |
| "learning_rate": 1.4823619097949584e-05, | |
| "loss": 1.0916, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 1.2150259067357512, | |
| "grad_norm": 28.860712194727487, | |
| "learning_rate": 1.4740755946869708e-05, | |
| "loss": 1.1043, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 1.2176165803108807, | |
| "grad_norm": 25.71820242946282, | |
| "learning_rate": 1.4657989668230363e-05, | |
| "loss": 1.0949, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.2202072538860103, | |
| "grad_norm": 51.16994773097077, | |
| "learning_rate": 1.4575321786541801e-05, | |
| "loss": 1.141, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 1.2227979274611398, | |
| "grad_norm": 32.70442309640389, | |
| "learning_rate": 1.4492753824501833e-05, | |
| "loss": 1.1127, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 1.2253886010362693, | |
| "grad_norm": 21.913285172411495, | |
| "learning_rate": 1.4410287302967813e-05, | |
| "loss": 1.084, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 1.2279792746113989, | |
| "grad_norm": 34.45727214001296, | |
| "learning_rate": 1.4327923740928613e-05, | |
| "loss": 1.0836, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 1.2305699481865284, | |
| "grad_norm": 26.768013926034776, | |
| "learning_rate": 1.4245664655476663e-05, | |
| "loss": 1.1264, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 1.233160621761658, | |
| "grad_norm": 28.401965255935572, | |
| "learning_rate": 1.4163511561779956e-05, | |
| "loss": 1.0805, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 1.2357512953367875, | |
| "grad_norm": 29.19935757288793, | |
| "learning_rate": 1.4081465973054216e-05, | |
| "loss": 1.0825, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 1.238341968911917, | |
| "grad_norm": 24.55918541541201, | |
| "learning_rate": 1.3999529400534941e-05, | |
| "loss": 1.1164, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 1.2409326424870466, | |
| "grad_norm": 25.35635406268312, | |
| "learning_rate": 1.3917703353449646e-05, | |
| "loss": 1.1334, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 1.2435233160621761, | |
| "grad_norm": 45.453901005004184, | |
| "learning_rate": 1.3835989338989996e-05, | |
| "loss": 1.1387, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.2461139896373057, | |
| "grad_norm": 21.67852694202104, | |
| "learning_rate": 1.375438886228411e-05, | |
| "loss": 1.0846, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 1.2487046632124352, | |
| "grad_norm": 171.2474074894732, | |
| "learning_rate": 1.3672903426368773e-05, | |
| "loss": 1.1388, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 1.2512953367875648, | |
| "grad_norm": 43.18223835070906, | |
| "learning_rate": 1.3591534532161781e-05, | |
| "loss": 1.1483, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 1.2538860103626943, | |
| "grad_norm": 29.447332565856644, | |
| "learning_rate": 1.3510283678434317e-05, | |
| "loss": 1.07, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 1.2564766839378239, | |
| "grad_norm": 28.600251051615228, | |
| "learning_rate": 1.3429152361783307e-05, | |
| "loss": 1.0798, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 1.2564766839378239, | |
| "eval_loss": 1.085669755935669, | |
| "eval_runtime": 38.1134, | |
| "eval_samples_per_second": 19.521, | |
| "eval_steps_per_second": 1.233, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 1.2590673575129534, | |
| "grad_norm": 47.124643074410464, | |
| "learning_rate": 1.3348142076603876e-05, | |
| "loss": 1.0875, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 1.261658031088083, | |
| "grad_norm": 42.06019726307143, | |
| "learning_rate": 1.3267254315061797e-05, | |
| "loss": 1.1429, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 1.2642487046632125, | |
| "grad_norm": 18.950734630756962, | |
| "learning_rate": 1.318649056706605e-05, | |
| "loss": 1.0747, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 1.266839378238342, | |
| "grad_norm": 31.903949502516806, | |
| "learning_rate": 1.3105852320241326e-05, | |
| "loss": 1.1041, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 1.2694300518134716, | |
| "grad_norm": 22.957473008085927, | |
| "learning_rate": 1.3025341059900675e-05, | |
| "loss": 1.1046, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.2720207253886011, | |
| "grad_norm": 22.325983256563678, | |
| "learning_rate": 1.2944958269018103e-05, | |
| "loss": 1.0643, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 1.2746113989637307, | |
| "grad_norm": 29.689383331974955, | |
| "learning_rate": 1.2864705428201307e-05, | |
| "loss": 1.0949, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 1.2772020725388602, | |
| "grad_norm": 25.338298442945575, | |
| "learning_rate": 1.2784584015664337e-05, | |
| "loss": 1.0725, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 1.2797927461139897, | |
| "grad_norm": 31.591732488078588, | |
| "learning_rate": 1.2704595507200435e-05, | |
| "loss": 1.0347, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 1.2823834196891193, | |
| "grad_norm": 42.96243570696118, | |
| "learning_rate": 1.26247413761548e-05, | |
| "loss": 1.1196, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 1.2849740932642488, | |
| "grad_norm": 26.559546676266024, | |
| "learning_rate": 1.254502309339749e-05, | |
| "loss": 1.0187, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 1.2875647668393784, | |
| "grad_norm": 27.58444017584016, | |
| "learning_rate": 1.2465442127296297e-05, | |
| "loss": 1.0985, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 1.2901554404145077, | |
| "grad_norm": 36.53028730423797, | |
| "learning_rate": 1.2385999943689732e-05, | |
| "loss": 1.068, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 1.2927461139896372, | |
| "grad_norm": 38.94837307599113, | |
| "learning_rate": 1.2306698005859975e-05, | |
| "loss": 1.0736, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 1.2953367875647668, | |
| "grad_norm": 36.67208266195125, | |
| "learning_rate": 1.2227537774505996e-05, | |
| "loss": 1.119, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.2979274611398963, | |
| "grad_norm": 31.086410648635283, | |
| "learning_rate": 1.2148520707716567e-05, | |
| "loss": 1.1094, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 1.3005181347150259, | |
| "grad_norm": 27.96977481605826, | |
| "learning_rate": 1.2069648260943473e-05, | |
| "loss": 1.1345, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 1.3031088082901554, | |
| "grad_norm": 22.89450502840197, | |
| "learning_rate": 1.1990921886974669e-05, | |
| "loss": 1.12, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 1.305699481865285, | |
| "grad_norm": 18.54206032224653, | |
| "learning_rate": 1.1912343035907535e-05, | |
| "loss": 1.0929, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 1.3082901554404145, | |
| "grad_norm": 38.9386007237313, | |
| "learning_rate": 1.1833913155122132e-05, | |
| "loss": 1.1381, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 1.310880829015544, | |
| "grad_norm": 37.05899458809635, | |
| "learning_rate": 1.1755633689254609e-05, | |
| "loss": 1.0535, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 1.3134715025906736, | |
| "grad_norm": 27.716372794195156, | |
| "learning_rate": 1.1677506080170512e-05, | |
| "loss": 1.1342, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 1.3160621761658031, | |
| "grad_norm": 40.42306246079416, | |
| "learning_rate": 1.1599531766938306e-05, | |
| "loss": 1.0887, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 1.3186528497409327, | |
| "grad_norm": 98.56681767405578, | |
| "learning_rate": 1.1521712185802789e-05, | |
| "loss": 1.0954, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 1.3212435233160622, | |
| "grad_norm": 34.42816933350743, | |
| "learning_rate": 1.1444048770158718e-05, | |
| "loss": 1.0512, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.3238341968911918, | |
| "grad_norm": 52.457523653614096, | |
| "learning_rate": 1.136654295052433e-05, | |
| "loss": 1.1599, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 1.3264248704663213, | |
| "grad_norm": 26.832339531661276, | |
| "learning_rate": 1.1289196154515048e-05, | |
| "loss": 1.0602, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 1.3290155440414508, | |
| "grad_norm": 32.746047673769816, | |
| "learning_rate": 1.1212009806817163e-05, | |
| "loss": 1.1544, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 1.3316062176165804, | |
| "grad_norm": 37.44483451702055, | |
| "learning_rate": 1.1134985329161608e-05, | |
| "loss": 1.1421, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 1.33419689119171, | |
| "grad_norm": 28.625976525737606, | |
| "learning_rate": 1.1058124140297718e-05, | |
| "loss": 1.0858, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 1.3367875647668392, | |
| "grad_norm": 38.64141195246213, | |
| "learning_rate": 1.0981427655967183e-05, | |
| "loss": 1.0983, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 1.3393782383419688, | |
| "grad_norm": 29.989753893533425, | |
| "learning_rate": 1.0904897288877891e-05, | |
| "loss": 1.1269, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 1.3419689119170983, | |
| "grad_norm": 48.63990665515511, | |
| "learning_rate": 1.0828534448677942e-05, | |
| "loss": 1.0844, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 1.3445595854922279, | |
| "grad_norm": 25.477227318250847, | |
| "learning_rate": 1.0752340541929711e-05, | |
| "loss": 1.0742, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 1.3471502590673574, | |
| "grad_norm": 26.363588814537763, | |
| "learning_rate": 1.0676316972083867e-05, | |
| "loss": 1.0533, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.349740932642487, | |
| "grad_norm": 34.59968737708606, | |
| "learning_rate": 1.060046513945361e-05, | |
| "loss": 1.0983, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 1.3523316062176165, | |
| "grad_norm": 52.51652561846762, | |
| "learning_rate": 1.0524786441188786e-05, | |
| "loss": 1.1319, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 1.354922279792746, | |
| "grad_norm": 21.360221214301127, | |
| "learning_rate": 1.0449282271250239e-05, | |
| "loss": 1.0627, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 1.3575129533678756, | |
| "grad_norm": 37.00053933682603, | |
| "learning_rate": 1.0373954020384073e-05, | |
| "loss": 1.096, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 1.3601036269430051, | |
| "grad_norm": 39.212240822687484, | |
| "learning_rate": 1.029880307609608e-05, | |
| "loss": 1.0512, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 1.3626943005181347, | |
| "grad_norm": 24.89842378385804, | |
| "learning_rate": 1.0223830822626124e-05, | |
| "loss": 1.0538, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 1.3652849740932642, | |
| "grad_norm": 29.14416894424653, | |
| "learning_rate": 1.0149038640922715e-05, | |
| "loss": 1.1538, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 1.3678756476683938, | |
| "grad_norm": 31.688722122648855, | |
| "learning_rate": 1.0074427908617515e-05, | |
| "loss": 1.171, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 1.3704663212435233, | |
| "grad_norm": 41.918909004413734, | |
| "learning_rate": 1.0000000000000006e-05, | |
| "loss": 1.1203, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 1.3730569948186528, | |
| "grad_norm": 26.70963454516576, | |
| "learning_rate": 9.92575628599213e-06, | |
| "loss": 1.0855, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.3756476683937824, | |
| "grad_norm": 24.819351173466824, | |
| "learning_rate": 9.851698134123095e-06, | |
| "loss": 1.0972, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 1.378238341968912, | |
| "grad_norm": 22.100465399566815, | |
| "learning_rate": 9.777826908504126e-06, | |
| "loss": 1.08, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 1.3808290155440415, | |
| "grad_norm": 29.31574709406259, | |
| "learning_rate": 9.704143969803392e-06, | |
| "loss": 1.0835, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 1.383419689119171, | |
| "grad_norm": 25.551326748473052, | |
| "learning_rate": 9.630650675220892e-06, | |
| "loss": 1.0396, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 1.3860103626943006, | |
| "grad_norm": 59.07595627892596, | |
| "learning_rate": 9.557348378463503e-06, | |
| "loss": 1.0814, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 1.38860103626943, | |
| "grad_norm": 24.96501978981908, | |
| "learning_rate": 9.484238429720018e-06, | |
| "loss": 1.0187, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 1.3911917098445596, | |
| "grad_norm": 42.530604702279234, | |
| "learning_rate": 9.411322175636298e-06, | |
| "loss": 1.074, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 1.3937823834196892, | |
| "grad_norm": 34.91129065632851, | |
| "learning_rate": 9.338600959290414e-06, | |
| "loss": 1.0878, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 1.3963730569948187, | |
| "grad_norm": 32.07525956876426, | |
| "learning_rate": 9.266076120167992e-06, | |
| "loss": 1.0962, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 1.3989637305699483, | |
| "grad_norm": 40.18387743296675, | |
| "learning_rate": 9.193748994137462e-06, | |
| "loss": 1.1033, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.4015544041450778, | |
| "grad_norm": 66.68031460980451, | |
| "learning_rate": 9.121620913425508e-06, | |
| "loss": 1.1466, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 1.4041450777202074, | |
| "grad_norm": 34.07506059584738, | |
| "learning_rate": 9.04969320659249e-06, | |
| "loss": 1.1184, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 1.406735751295337, | |
| "grad_norm": 17.130845779169075, | |
| "learning_rate": 8.977967198508001e-06, | |
| "loss": 1.0803, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 1.4093264248704664, | |
| "grad_norm": 22.4457025132615, | |
| "learning_rate": 8.906444210326441e-06, | |
| "loss": 1.0745, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 1.411917098445596, | |
| "grad_norm": 73.43971735356851, | |
| "learning_rate": 8.83512555946271e-06, | |
| "loss": 1.0717, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 1.4145077720207253, | |
| "grad_norm": 38.16321297719761, | |
| "learning_rate": 8.764012559567899e-06, | |
| "loss": 1.1371, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 1.4170984455958548, | |
| "grad_norm": 56.14718024907725, | |
| "learning_rate": 8.693106520505147e-06, | |
| "loss": 1.0185, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 1.4196891191709844, | |
| "grad_norm": 53.3812598790062, | |
| "learning_rate": 8.622408748325461e-06, | |
| "loss": 1.0859, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 1.422279792746114, | |
| "grad_norm": 39.69041631433326, | |
| "learning_rate": 8.551920545243704e-06, | |
| "loss": 1.1146, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 1.4248704663212435, | |
| "grad_norm": 24.099260758984773, | |
| "learning_rate": 8.481643209614576e-06, | |
| "loss": 1.0968, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.427461139896373, | |
| "grad_norm": 22.623850373369237, | |
| "learning_rate": 8.411578035908728e-06, | |
| "loss": 1.0642, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 1.4300518134715026, | |
| "grad_norm": 25.343746374404027, | |
| "learning_rate": 8.341726314688875e-06, | |
| "loss": 1.0815, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 1.432642487046632, | |
| "grad_norm": 35.82641011588973, | |
| "learning_rate": 8.272089332586089e-06, | |
| "loss": 1.1012, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 1.4352331606217616, | |
| "grad_norm": 24.81161215784662, | |
| "learning_rate": 8.20266837227603e-06, | |
| "loss": 1.1086, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 1.4378238341968912, | |
| "grad_norm": 54.18243481591251, | |
| "learning_rate": 8.133464712455364e-06, | |
| "loss": 1.0704, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 1.4404145077720207, | |
| "grad_norm": 23.602598217141395, | |
| "learning_rate": 8.064479627818213e-06, | |
| "loss": 1.1519, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 1.4430051813471503, | |
| "grad_norm": 31.124404868409982, | |
| "learning_rate": 7.995714389032638e-06, | |
| "loss": 1.0705, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 1.4455958549222798, | |
| "grad_norm": 24.14171016995626, | |
| "learning_rate": 7.927170262717284e-06, | |
| "loss": 1.1083, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 1.4481865284974094, | |
| "grad_norm": 47.987203109917175, | |
| "learning_rate": 7.858848511417998e-06, | |
| "loss": 1.0836, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 1.450777202072539, | |
| "grad_norm": 25.871447098066056, | |
| "learning_rate": 7.790750393584616e-06, | |
| "loss": 1.0787, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.4533678756476685, | |
| "grad_norm": 23.820249113937482, | |
| "learning_rate": 7.72287716354776e-06, | |
| "loss": 1.1165, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 1.455958549222798, | |
| "grad_norm": 48.04131308947624, | |
| "learning_rate": 7.65523007149575e-06, | |
| "loss": 1.0819, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 1.4585492227979275, | |
| "grad_norm": 29.273494083692352, | |
| "learning_rate": 7.587810363451544e-06, | |
| "loss": 1.0302, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 1.4611398963730569, | |
| "grad_norm": 120.01571222366722, | |
| "learning_rate": 7.5206192812498345e-06, | |
| "loss": 1.1291, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 1.4637305699481864, | |
| "grad_norm": 33.16947662083338, | |
| "learning_rate": 7.4536580625141244e-06, | |
| "loss": 1.0842, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 1.466321243523316, | |
| "grad_norm": 29.979556378166713, | |
| "learning_rate": 7.386927940633981e-06, | |
| "loss": 1.1116, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 1.4689119170984455, | |
| "grad_norm": 27.172344859281896, | |
| "learning_rate": 7.32043014474227e-06, | |
| "loss": 1.0676, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 1.471502590673575, | |
| "grad_norm": 30.208548637757318, | |
| "learning_rate": 7.254165899692554e-06, | |
| "loss": 1.1104, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 1.4740932642487046, | |
| "grad_norm": 19.385421184583773, | |
| "learning_rate": 7.188136426036498e-06, | |
| "loss": 1.0085, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 1.4766839378238341, | |
| "grad_norm": 30.350787749309685, | |
| "learning_rate": 7.12234294000143e-06, | |
| "loss": 1.0584, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.4792746113989637, | |
| "grad_norm": 31.520305600900198, | |
| "learning_rate": 7.056786653467882e-06, | |
| "loss": 1.0831, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 1.4818652849740932, | |
| "grad_norm": 46.13006972574487, | |
| "learning_rate": 6.991468773947321e-06, | |
| "loss": 1.1761, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 1.4844559585492227, | |
| "grad_norm": 26.72340868362835, | |
| "learning_rate": 6.926390504559879e-06, | |
| "loss": 1.0605, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 1.4870466321243523, | |
| "grad_norm": 25.992965411102556, | |
| "learning_rate": 6.861553044012206e-06, | |
| "loss": 1.1015, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 1.4896373056994818, | |
| "grad_norm": 38.60187420279626, | |
| "learning_rate": 6.796957586575364e-06, | |
| "loss": 1.1232, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 1.4922279792746114, | |
| "grad_norm": 21.7618591565717, | |
| "learning_rate": 6.732605322062869e-06, | |
| "loss": 1.1196, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 1.494818652849741, | |
| "grad_norm": 28.233093007170996, | |
| "learning_rate": 6.668497435808736e-06, | |
| "loss": 1.1451, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 1.4974093264248705, | |
| "grad_norm": 28.061514297823816, | |
| "learning_rate": 6.604635108645683e-06, | |
| "loss": 1.0832, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 35.34503147975386, | |
| "learning_rate": 6.5410195168833425e-06, | |
| "loss": 1.118, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 1.5025906735751295, | |
| "grad_norm": 31.940516004139344, | |
| "learning_rate": 6.477651832286633e-06, | |
| "loss": 1.1052, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.505181347150259, | |
| "grad_norm": 25.647504733675635, | |
| "learning_rate": 6.414533222054138e-06, | |
| "loss": 1.1055, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 1.5077720207253886, | |
| "grad_norm": 68.16422579698298, | |
| "learning_rate": 6.3516648487966456e-06, | |
| "loss": 1.0784, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 1.5077720207253886, | |
| "eval_loss": 1.0824710130691528, | |
| "eval_runtime": 37.4923, | |
| "eval_samples_per_second": 19.844, | |
| "eval_steps_per_second": 1.254, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 1.5103626943005182, | |
| "grad_norm": 46.95363643283118, | |
| "learning_rate": 6.289047870515692e-06, | |
| "loss": 1.1271, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 1.5129533678756477, | |
| "grad_norm": 37.80701104174098, | |
| "learning_rate": 6.226683440582268e-06, | |
| "loss": 1.126, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 1.5155440414507773, | |
| "grad_norm": 32.03225059321182, | |
| "learning_rate": 6.164572707715564e-06, | |
| "loss": 1.0152, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 1.5181347150259068, | |
| "grad_norm": 31.21438627768379, | |
| "learning_rate": 6.102716815961787e-06, | |
| "loss": 1.1595, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 1.5207253886010363, | |
| "grad_norm": 23.55515793723355, | |
| "learning_rate": 6.041116904673125e-06, | |
| "loss": 1.0943, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 1.5233160621761659, | |
| "grad_norm": 26.92022994571063, | |
| "learning_rate": 5.979774108486751e-06, | |
| "loss": 1.0554, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 1.5259067357512954, | |
| "grad_norm": 24.957086694295352, | |
| "learning_rate": 5.918689557303885e-06, | |
| "loss": 1.0711, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 1.528497409326425, | |
| "grad_norm": 87.48440577770464, | |
| "learning_rate": 5.857864376269051e-06, | |
| "loss": 1.1679, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.5310880829015545, | |
| "grad_norm": 21.756969247026838, | |
| "learning_rate": 5.7972996857492896e-06, | |
| "loss": 1.0716, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 1.533678756476684, | |
| "grad_norm": 33.92695136944769, | |
| "learning_rate": 5.736996601313545e-06, | |
| "loss": 1.0376, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 1.5362694300518136, | |
| "grad_norm": 32.738888590276794, | |
| "learning_rate": 5.676956233712139e-06, | |
| "loss": 1.0245, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 1.5388601036269431, | |
| "grad_norm": 22.38597679049821, | |
| "learning_rate": 5.617179688856271e-06, | |
| "loss": 1.1103, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 1.5414507772020727, | |
| "grad_norm": 30.168619654124416, | |
| "learning_rate": 5.557668067797677e-06, | |
| "loss": 1.2007, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 1.5440414507772022, | |
| "grad_norm": 24.460334668593116, | |
| "learning_rate": 5.498422466708349e-06, | |
| "loss": 1.0842, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 1.5466321243523318, | |
| "grad_norm": 25.877463433966412, | |
| "learning_rate": 5.439443976860306e-06, | |
| "loss": 1.0537, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 1.549222797927461, | |
| "grad_norm": 27.67111694532404, | |
| "learning_rate": 5.38073368460555e-06, | |
| "loss": 1.0863, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 1.5518134715025906, | |
| "grad_norm": 43.112045139256026, | |
| "learning_rate": 5.32229267135602e-06, | |
| "loss": 1.1168, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 1.5544041450777202, | |
| "grad_norm": 31.60344278763487, | |
| "learning_rate": 5.2641220135636685e-06, | |
| "loss": 1.0939, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.5569948186528497, | |
| "grad_norm": 37.795536334167195, | |
| "learning_rate": 5.206222782700667e-06, | |
| "loss": 1.1084, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 1.5595854922279793, | |
| "grad_norm": 27.529824319458413, | |
| "learning_rate": 5.1485960452396266e-06, | |
| "loss": 1.0755, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 1.5621761658031088, | |
| "grad_norm": 29.172376961452496, | |
| "learning_rate": 5.091242862634e-06, | |
| "loss": 1.0231, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 1.5647668393782384, | |
| "grad_norm": 24.94560254083931, | |
| "learning_rate": 5.0341642912984844e-06, | |
| "loss": 1.0782, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 1.567357512953368, | |
| "grad_norm": 31.79546143794924, | |
| "learning_rate": 4.977361382589607e-06, | |
| "loss": 1.1202, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 1.5699481865284974, | |
| "grad_norm": 39.3795372477718, | |
| "learning_rate": 4.920835182786316e-06, | |
| "loss": 1.0349, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 1.572538860103627, | |
| "grad_norm": 31.308429467189708, | |
| "learning_rate": 4.864586733070755e-06, | |
| "loss": 1.0582, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 1.5751295336787565, | |
| "grad_norm": 32.82748366949945, | |
| "learning_rate": 4.808617069509034e-06, | |
| "loss": 1.1246, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 1.577720207253886, | |
| "grad_norm": 24.281936328515055, | |
| "learning_rate": 4.752927223032196e-06, | |
| "loss": 1.0679, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 1.5803108808290154, | |
| "grad_norm": 111.23884469313498, | |
| "learning_rate": 4.697518219417188e-06, | |
| "loss": 1.1319, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.582901554404145, | |
| "grad_norm": 35.484299416160596, | |
| "learning_rate": 4.6423910792680005e-06, | |
| "loss": 1.1348, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 1.5854922279792745, | |
| "grad_norm": 27.135342529418295, | |
| "learning_rate": 4.587546817996826e-06, | |
| "loss": 1.0948, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 1.588082901554404, | |
| "grad_norm": 81.98158494527004, | |
| "learning_rate": 4.532986445805405e-06, | |
| "loss": 1.0864, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 1.5906735751295336, | |
| "grad_norm": 61.490418707157346, | |
| "learning_rate": 4.478710967666371e-06, | |
| "loss": 1.0693, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 1.593264248704663, | |
| "grad_norm": 25.633018846282962, | |
| "learning_rate": 4.424721383304791e-06, | |
| "loss": 1.1084, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 1.5958549222797926, | |
| "grad_norm": 28.194280804517373, | |
| "learning_rate": 4.371018687179689e-06, | |
| "loss": 1.1722, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 1.5984455958549222, | |
| "grad_norm": 27.8080566828581, | |
| "learning_rate": 4.317603868465794e-06, | |
| "loss": 1.1171, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 1.6010362694300517, | |
| "grad_norm": 42.959036729178806, | |
| "learning_rate": 4.264477911035265e-06, | |
| "loss": 1.074, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 1.6036269430051813, | |
| "grad_norm": 23.937218136554392, | |
| "learning_rate": 4.211641793439609e-06, | |
| "loss": 1.13, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 1.6062176165803108, | |
| "grad_norm": 43.913677975121566, | |
| "learning_rate": 4.159096488891623e-06, | |
| "loss": 1.1671, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.6088082901554404, | |
| "grad_norm": 48.107566289352114, | |
| "learning_rate": 4.106842965247497e-06, | |
| "loss": 1.1071, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 1.61139896373057, | |
| "grad_norm": 28.25790913819402, | |
| "learning_rate": 4.054882184988971e-06, | |
| "loss": 1.0716, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 1.6139896373056994, | |
| "grad_norm": 26.59960827233381, | |
| "learning_rate": 4.003215105205613e-06, | |
| "loss": 1.146, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 1.616580310880829, | |
| "grad_norm": 22.79614250574067, | |
| "learning_rate": 3.951842677577171e-06, | |
| "loss": 1.0761, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 1.6191709844559585, | |
| "grad_norm": 24.24036779343114, | |
| "learning_rate": 3.900765848356083e-06, | |
| "loss": 1.1037, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 1.621761658031088, | |
| "grad_norm": 27.295669679621373, | |
| "learning_rate": 3.849985558349998e-06, | |
| "loss": 1.1015, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 1.6243523316062176, | |
| "grad_norm": 54.413225233914176, | |
| "learning_rate": 3.799502742904497e-06, | |
| "loss": 1.0318, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 1.6269430051813472, | |
| "grad_norm": 38.84848713400369, | |
| "learning_rate": 3.749318331885825e-06, | |
| "loss": 1.1147, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 1.6295336787564767, | |
| "grad_norm": 23.912199342429506, | |
| "learning_rate": 3.699433249663775e-06, | |
| "loss": 1.1439, | |
| "step": 629 | |
| }, | |
| { | |
| "epoch": 1.6321243523316062, | |
| "grad_norm": 48.95526983090661, | |
| "learning_rate": 3.649848415094681e-06, | |
| "loss": 1.0229, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.6347150259067358, | |
| "grad_norm": 32.099897123524585, | |
| "learning_rate": 3.60056474150446e-06, | |
| "loss": 1.0589, | |
| "step": 631 | |
| }, | |
| { | |
| "epoch": 1.6373056994818653, | |
| "grad_norm": 31.802660850585973, | |
| "learning_rate": 3.551583136671817e-06, | |
| "loss": 1.1137, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 1.6398963730569949, | |
| "grad_norm": 34.2655686599537, | |
| "learning_rate": 3.5029045028115105e-06, | |
| "loss": 1.1318, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 1.6424870466321244, | |
| "grad_norm": 191.48847051006786, | |
| "learning_rate": 3.4545297365577437e-06, | |
| "loss": 1.0921, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 1.645077720207254, | |
| "grad_norm": 24.236450154622357, | |
| "learning_rate": 3.406459728947622e-06, | |
| "loss": 1.0851, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 1.6476683937823835, | |
| "grad_norm": 38.819342476228876, | |
| "learning_rate": 3.358695365404785e-06, | |
| "loss": 1.0962, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 1.650259067357513, | |
| "grad_norm": 31.53545103406636, | |
| "learning_rate": 3.3112375257230547e-06, | |
| "loss": 1.0994, | |
| "step": 637 | |
| }, | |
| { | |
| "epoch": 1.6528497409326426, | |
| "grad_norm": 71.55299438562814, | |
| "learning_rate": 3.2640870840502646e-06, | |
| "loss": 1.08, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 1.6554404145077721, | |
| "grad_norm": 57.94234006640972, | |
| "learning_rate": 3.2172449088721235e-06, | |
| "loss": 1.0921, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 1.6580310880829017, | |
| "grad_norm": 58.15229256885828, | |
| "learning_rate": 3.1707118629962607e-06, | |
| "loss": 1.0981, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.6606217616580312, | |
| "grad_norm": 25.105795165561457, | |
| "learning_rate": 3.1244888035362875e-06, | |
| "loss": 1.101, | |
| "step": 641 | |
| }, | |
| { | |
| "epoch": 1.6632124352331608, | |
| "grad_norm": 33.15366058006866, | |
| "learning_rate": 3.0785765818960534e-06, | |
| "loss": 1.0517, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 1.6658031088082903, | |
| "grad_norm": 35.79893709161297, | |
| "learning_rate": 3.0329760437539233e-06, | |
| "loss": 1.0886, | |
| "step": 643 | |
| }, | |
| { | |
| "epoch": 1.6683937823834198, | |
| "grad_norm": 49.59918009099835, | |
| "learning_rate": 2.9876880290472376e-06, | |
| "loss": 1.0756, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 1.6709844559585494, | |
| "grad_norm": 21.485142494367135, | |
| "learning_rate": 2.942713371956809e-06, | |
| "loss": 1.1017, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 1.6735751295336787, | |
| "grad_norm": 29.23169287520316, | |
| "learning_rate": 2.8980529008915793e-06, | |
| "loss": 1.1241, | |
| "step": 646 | |
| }, | |
| { | |
| "epoch": 1.6761658031088082, | |
| "grad_norm": 27.913868608886553, | |
| "learning_rate": 2.853707438473352e-06, | |
| "loss": 1.0841, | |
| "step": 647 | |
| }, | |
| { | |
| "epoch": 1.6787564766839378, | |
| "grad_norm": 18.438597602055644, | |
| "learning_rate": 2.8096778015216484e-06, | |
| "loss": 1.0891, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 1.6813471502590673, | |
| "grad_norm": 54.0556941620233, | |
| "learning_rate": 2.7659648010386365e-06, | |
| "loss": 1.0589, | |
| "step": 649 | |
| }, | |
| { | |
| "epoch": 1.6839378238341969, | |
| "grad_norm": 108.10101848740734, | |
| "learning_rate": 2.7225692421942306e-06, | |
| "loss": 1.0766, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.6865284974093264, | |
| "grad_norm": 106.58835736628185, | |
| "learning_rate": 2.679491924311226e-06, | |
| "loss": 1.1144, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 1.689119170984456, | |
| "grad_norm": 31.53371570516213, | |
| "learning_rate": 2.6367336408506063e-06, | |
| "loss": 1.02, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 1.6917098445595855, | |
| "grad_norm": 36.263088086669775, | |
| "learning_rate": 2.594295179396895e-06, | |
| "loss": 1.0679, | |
| "step": 653 | |
| }, | |
| { | |
| "epoch": 1.694300518134715, | |
| "grad_norm": 24.47507184337666, | |
| "learning_rate": 2.5521773216436875e-06, | |
| "loss": 1.1092, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 1.6968911917098446, | |
| "grad_norm": 33.05899532106974, | |
| "learning_rate": 2.5103808433792075e-06, | |
| "loss": 1.053, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 1.6994818652849741, | |
| "grad_norm": 29.132344102799873, | |
| "learning_rate": 2.468906514472065e-06, | |
| "loss": 1.0518, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 1.7020725388601037, | |
| "grad_norm": 43.48960854254409, | |
| "learning_rate": 2.4277550988570362e-06, | |
| "loss": 1.0537, | |
| "step": 657 | |
| }, | |
| { | |
| "epoch": 1.704663212435233, | |
| "grad_norm": 28.13627467897817, | |
| "learning_rate": 2.3869273545210158e-06, | |
| "loss": 1.0558, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 1.7072538860103625, | |
| "grad_norm": 33.18164212520423, | |
| "learning_rate": 2.3464240334890496e-06, | |
| "loss": 1.054, | |
| "step": 659 | |
| }, | |
| { | |
| "epoch": 1.709844559585492, | |
| "grad_norm": 41.884394437273144, | |
| "learning_rate": 2.3062458818104804e-06, | |
| "loss": 1.0871, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.7124352331606216, | |
| "grad_norm": 27.119840736470916, | |
| "learning_rate": 2.266393639545197e-06, | |
| "loss": 1.0743, | |
| "step": 661 | |
| }, | |
| { | |
| "epoch": 1.7150259067357512, | |
| "grad_norm": 20.70474999023591, | |
| "learning_rate": 2.22686804075003e-06, | |
| "loss": 1.0718, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 1.7176165803108807, | |
| "grad_norm": 21.469651089617198, | |
| "learning_rate": 2.187669813465192e-06, | |
| "loss": 1.0584, | |
| "step": 663 | |
| }, | |
| { | |
| "epoch": 1.7202072538860103, | |
| "grad_norm": 29.901704269591495, | |
| "learning_rate": 2.1487996797009103e-06, | |
| "loss": 1.1175, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 1.7227979274611398, | |
| "grad_norm": 75.06310533674302, | |
| "learning_rate": 2.110258355424093e-06, | |
| "loss": 1.124, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 1.7253886010362693, | |
| "grad_norm": 34.13349153293387, | |
| "learning_rate": 2.0720465505451524e-06, | |
| "loss": 1.1395, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 1.7279792746113989, | |
| "grad_norm": 26.83922350447555, | |
| "learning_rate": 2.0341649689049458e-06, | |
| "loss": 1.0449, | |
| "step": 667 | |
| }, | |
| { | |
| "epoch": 1.7305699481865284, | |
| "grad_norm": 37.284339589086024, | |
| "learning_rate": 1.9966143082617797e-06, | |
| "loss": 1.0332, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 1.733160621761658, | |
| "grad_norm": 46.453238969399074, | |
| "learning_rate": 1.959395260278587e-06, | |
| "loss": 1.1303, | |
| "step": 669 | |
| }, | |
| { | |
| "epoch": 1.7357512953367875, | |
| "grad_norm": 22.743791018223284, | |
| "learning_rate": 1.922508510510166e-06, | |
| "loss": 1.0993, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.738341968911917, | |
| "grad_norm": 27.788137087891727, | |
| "learning_rate": 1.885954738390572e-06, | |
| "loss": 1.1234, | |
| "step": 671 | |
| }, | |
| { | |
| "epoch": 1.7409326424870466, | |
| "grad_norm": 34.03637743502625, | |
| "learning_rate": 1.8497346172205733e-06, | |
| "loss": 1.085, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 1.7435233160621761, | |
| "grad_norm": 30.308363072599853, | |
| "learning_rate": 1.8138488141552856e-06, | |
| "loss": 1.0348, | |
| "step": 673 | |
| }, | |
| { | |
| "epoch": 1.7461139896373057, | |
| "grad_norm": 26.81612464278571, | |
| "learning_rate": 1.7782979901918507e-06, | |
| "loss": 1.0672, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 1.7487046632124352, | |
| "grad_norm": 46.96340147563577, | |
| "learning_rate": 1.7430828001572897e-06, | |
| "loss": 1.0807, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 1.7512953367875648, | |
| "grad_norm": 30.87064631308438, | |
| "learning_rate": 1.7082038926964162e-06, | |
| "loss": 1.1411, | |
| "step": 676 | |
| }, | |
| { | |
| "epoch": 1.7538860103626943, | |
| "grad_norm": 79.59411718865987, | |
| "learning_rate": 1.6736619102599073e-06, | |
| "loss": 1.0234, | |
| "step": 677 | |
| }, | |
| { | |
| "epoch": 1.7564766839378239, | |
| "grad_norm": 30.875792565440594, | |
| "learning_rate": 1.6394574890924574e-06, | |
| "loss": 1.1506, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 1.7590673575129534, | |
| "grad_norm": 34.227935587917464, | |
| "learning_rate": 1.605591259221071e-06, | |
| "loss": 1.0981, | |
| "step": 679 | |
| }, | |
| { | |
| "epoch": 1.7590673575129534, | |
| "eval_loss": 1.0809757709503174, | |
| "eval_runtime": 37.9729, | |
| "eval_samples_per_second": 19.593, | |
| "eval_steps_per_second": 1.238, | |
| "step": 679 | |
| }, | |
| { | |
| "epoch": 1.761658031088083, | |
| "grad_norm": 31.849171622198522, | |
| "learning_rate": 1.572063844443441e-06, | |
| "loss": 1.1227, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.7642487046632125, | |
| "grad_norm": 32.75765881856165, | |
| "learning_rate": 1.5388758623164802e-06, | |
| "loss": 1.0842, | |
| "step": 681 | |
| }, | |
| { | |
| "epoch": 1.766839378238342, | |
| "grad_norm": 27.83779558188967, | |
| "learning_rate": 1.5060279241449304e-06, | |
| "loss": 1.0419, | |
| "step": 682 | |
| }, | |
| { | |
| "epoch": 1.7694300518134716, | |
| "grad_norm": 30.646833576522408, | |
| "learning_rate": 1.4735206349701003e-06, | |
| "loss": 1.0983, | |
| "step": 683 | |
| }, | |
| { | |
| "epoch": 1.7720207253886011, | |
| "grad_norm": 29.748071428344947, | |
| "learning_rate": 1.4413545935587415e-06, | |
| "loss": 1.1276, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 1.7746113989637307, | |
| "grad_norm": 32.57104117085742, | |
| "learning_rate": 1.4095303923919956e-06, | |
| "loss": 1.0728, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 1.7772020725388602, | |
| "grad_norm": 32.02209671450587, | |
| "learning_rate": 1.3780486176544905e-06, | |
| "loss": 1.1148, | |
| "step": 686 | |
| }, | |
| { | |
| "epoch": 1.7797927461139897, | |
| "grad_norm": 31.902388050458736, | |
| "learning_rate": 1.3469098492235521e-06, | |
| "loss": 1.0873, | |
| "step": 687 | |
| }, | |
| { | |
| "epoch": 1.7823834196891193, | |
| "grad_norm": 33.159581668201604, | |
| "learning_rate": 1.316114660658505e-06, | |
| "loss": 1.0308, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 1.7849740932642488, | |
| "grad_norm": 25.531240947030152, | |
| "learning_rate": 1.2856636191901296e-06, | |
| "loss": 1.0893, | |
| "step": 689 | |
| }, | |
| { | |
| "epoch": 1.7875647668393784, | |
| "grad_norm": 25.382870674663973, | |
| "learning_rate": 1.255557285710185e-06, | |
| "loss": 1.1089, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.790155440414508, | |
| "grad_norm": 26.184606368046406, | |
| "learning_rate": 1.225796214761117e-06, | |
| "loss": 1.1515, | |
| "step": 691 | |
| }, | |
| { | |
| "epoch": 1.7927461139896375, | |
| "grad_norm": 27.78595815725415, | |
| "learning_rate": 1.196380954525802e-06, | |
| "loss": 1.0871, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 1.795336787564767, | |
| "grad_norm": 32.137607036645285, | |
| "learning_rate": 1.1673120468174837e-06, | |
| "loss": 1.1396, | |
| "step": 693 | |
| }, | |
| { | |
| "epoch": 1.7979274611398963, | |
| "grad_norm": 31.931928767500203, | |
| "learning_rate": 1.1385900270697658e-06, | |
| "loss": 1.1175, | |
| "step": 694 | |
| }, | |
| { | |
| "epoch": 1.8005181347150259, | |
| "grad_norm": 36.61199052966244, | |
| "learning_rate": 1.110215424326775e-06, | |
| "loss": 1.1867, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 1.8031088082901554, | |
| "grad_norm": 49.9081839820131, | |
| "learning_rate": 1.0821887612333959e-06, | |
| "loss": 1.1266, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 1.805699481865285, | |
| "grad_norm": 25.346034138603734, | |
| "learning_rate": 1.0545105540256628e-06, | |
| "loss": 1.0614, | |
| "step": 697 | |
| }, | |
| { | |
| "epoch": 1.8082901554404145, | |
| "grad_norm": 47.53838459679947, | |
| "learning_rate": 1.0271813125212237e-06, | |
| "loss": 1.1314, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 1.810880829015544, | |
| "grad_norm": 30.496460286583815, | |
| "learning_rate": 1.0002015401099797e-06, | |
| "loss": 1.1067, | |
| "step": 699 | |
| }, | |
| { | |
| "epoch": 1.8134715025906736, | |
| "grad_norm": 29.929097539381686, | |
| "learning_rate": 9.735717337447981e-07, | |
| "loss": 1.0424, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.8160621761658031, | |
| "grad_norm": 30.887132457194266, | |
| "learning_rate": 9.4729238393235e-07, | |
| "loss": 1.1248, | |
| "step": 701 | |
| }, | |
| { | |
| "epoch": 1.8186528497409327, | |
| "grad_norm": 24.26916275448189, | |
| "learning_rate": 9.21363974724101e-07, | |
| "loss": 1.0577, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 1.8212435233160622, | |
| "grad_norm": 40.34641617989283, | |
| "learning_rate": 8.957869837073673e-07, | |
| "loss": 1.1639, | |
| "step": 703 | |
| }, | |
| { | |
| "epoch": 1.8238341968911918, | |
| "grad_norm": 34.3133374466777, | |
| "learning_rate": 8.705618819965411e-07, | |
| "loss": 1.0866, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 1.8264248704663213, | |
| "grad_norm": 25.164299615685284, | |
| "learning_rate": 8.456891342243945e-07, | |
| "loss": 1.1232, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 1.8290155440414506, | |
| "grad_norm": 129.91297199628124, | |
| "learning_rate": 8.211691985335357e-07, | |
| "loss": 1.1542, | |
| "step": 706 | |
| }, | |
| { | |
| "epoch": 1.8316062176165802, | |
| "grad_norm": 23.928927141144797, | |
| "learning_rate": 7.970025265679648e-07, | |
| "loss": 1.0813, | |
| "step": 707 | |
| }, | |
| { | |
| "epoch": 1.8341968911917097, | |
| "grad_norm": 22.631504479886225, | |
| "learning_rate": 7.731895634647513e-07, | |
| "loss": 1.1164, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 1.8367875647668392, | |
| "grad_norm": 84.2359250723018, | |
| "learning_rate": 7.497307478458382e-07, | |
| "loss": 1.1081, | |
| "step": 709 | |
| }, | |
| { | |
| "epoch": 1.8393782383419688, | |
| "grad_norm": 51.39142883893451, | |
| "learning_rate": 7.266265118099669e-07, | |
| "loss": 1.105, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.8419689119170983, | |
| "grad_norm": 41.18280727079993, | |
| "learning_rate": 7.038772809247075e-07, | |
| "loss": 1.1211, | |
| "step": 711 | |
| }, | |
| { | |
| "epoch": 1.8445595854922279, | |
| "grad_norm": 34.330855277813534, | |
| "learning_rate": 6.814834742186361e-07, | |
| "loss": 1.0783, | |
| "step": 712 | |
| }, | |
| { | |
| "epoch": 1.8471502590673574, | |
| "grad_norm": 46.858780552576334, | |
| "learning_rate": 6.594455041735925e-07, | |
| "loss": 1.0214, | |
| "step": 713 | |
| }, | |
| { | |
| "epoch": 1.849740932642487, | |
| "grad_norm": 94.2712798319484, | |
| "learning_rate": 6.377637767171152e-07, | |
| "loss": 1.098, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 1.8523316062176165, | |
| "grad_norm": 33.00073975184253, | |
| "learning_rate": 6.164386912149289e-07, | |
| "loss": 1.0906, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 1.854922279792746, | |
| "grad_norm": 30.030119862133272, | |
| "learning_rate": 5.954706404636179e-07, | |
| "loss": 1.1073, | |
| "step": 716 | |
| }, | |
| { | |
| "epoch": 1.8575129533678756, | |
| "grad_norm": 46.42282973245658, | |
| "learning_rate": 5.748600106833735e-07, | |
| "loss": 1.1553, | |
| "step": 717 | |
| }, | |
| { | |
| "epoch": 1.8601036269430051, | |
| "grad_norm": 26.48910946182044, | |
| "learning_rate": 5.546071815108845e-07, | |
| "loss": 1.0704, | |
| "step": 718 | |
| }, | |
| { | |
| "epoch": 1.8626943005181347, | |
| "grad_norm": 29.34093197155635, | |
| "learning_rate": 5.347125259923491e-07, | |
| "loss": 1.1, | |
| "step": 719 | |
| }, | |
| { | |
| "epoch": 1.8652849740932642, | |
| "grad_norm": 24.689130499541356, | |
| "learning_rate": 5.151764105766011e-07, | |
| "loss": 1.067, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.8678756476683938, | |
| "grad_norm": 21.25619644617847, | |
| "learning_rate": 4.959991951083498e-07, | |
| "loss": 1.1125, | |
| "step": 721 | |
| }, | |
| { | |
| "epoch": 1.8704663212435233, | |
| "grad_norm": 23.946272802272112, | |
| "learning_rate": 4.771812328215708e-07, | |
| "loss": 1.0798, | |
| "step": 722 | |
| }, | |
| { | |
| "epoch": 1.8730569948186528, | |
| "grad_norm": 33.286030816378954, | |
| "learning_rate": 4.587228703329838e-07, | |
| "loss": 1.0756, | |
| "step": 723 | |
| }, | |
| { | |
| "epoch": 1.8756476683937824, | |
| "grad_norm": 109.02542545414109, | |
| "learning_rate": 4.40624447635678e-07, | |
| "loss": 1.073, | |
| "step": 724 | |
| }, | |
| { | |
| "epoch": 1.878238341968912, | |
| "grad_norm": 133.80505789447585, | |
| "learning_rate": 4.228862980928439e-07, | |
| "loss": 1.1218, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 1.8808290155440415, | |
| "grad_norm": 28.671374209715793, | |
| "learning_rate": 4.0550874843163337e-07, | |
| "loss": 1.1546, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 1.883419689119171, | |
| "grad_norm": 20.092775273550536, | |
| "learning_rate": 3.8849211873714266e-07, | |
| "loss": 1.0608, | |
| "step": 727 | |
| }, | |
| { | |
| "epoch": 1.8860103626943006, | |
| "grad_norm": 18.87195408427635, | |
| "learning_rate": 3.7183672244652135e-07, | |
| "loss": 1.0437, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 1.88860103626943, | |
| "grad_norm": 24.985644120932864, | |
| "learning_rate": 3.5554286634318814e-07, | |
| "loss": 1.0989, | |
| "step": 729 | |
| }, | |
| { | |
| "epoch": 1.8911917098445596, | |
| "grad_norm": 24.09887960702925, | |
| "learning_rate": 3.3961085055119083e-07, | |
| "loss": 1.0347, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.8937823834196892, | |
| "grad_norm": 98.50926523613283, | |
| "learning_rate": 3.2404096852967305e-07, | |
| "loss": 1.1163, | |
| "step": 731 | |
| }, | |
| { | |
| "epoch": 1.8963730569948187, | |
| "grad_norm": 42.45357973111845, | |
| "learning_rate": 3.0883350706746973e-07, | |
| "loss": 1.1497, | |
| "step": 732 | |
| }, | |
| { | |
| "epoch": 1.8989637305699483, | |
| "grad_norm": 25.430184794482617, | |
| "learning_rate": 2.9398874627782014e-07, | |
| "loss": 1.0154, | |
| "step": 733 | |
| }, | |
| { | |
| "epoch": 1.9015544041450778, | |
| "grad_norm": 32.56552224066898, | |
| "learning_rate": 2.7950695959322093e-07, | |
| "loss": 1.0976, | |
| "step": 734 | |
| }, | |
| { | |
| "epoch": 1.9041450777202074, | |
| "grad_norm": 25.518391980867197, | |
| "learning_rate": 2.653884137603702e-07, | |
| "loss": 1.1122, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 1.906735751295337, | |
| "grad_norm": 20.537146853099735, | |
| "learning_rate": 2.516333688352801e-07, | |
| "loss": 1.0592, | |
| "step": 736 | |
| }, | |
| { | |
| "epoch": 1.9093264248704664, | |
| "grad_norm": 25.28898033119641, | |
| "learning_rate": 2.382420781784589e-07, | |
| "loss": 1.0706, | |
| "step": 737 | |
| }, | |
| { | |
| "epoch": 1.911917098445596, | |
| "grad_norm": 55.74230904177274, | |
| "learning_rate": 2.2521478845025867e-07, | |
| "loss": 1.1706, | |
| "step": 738 | |
| }, | |
| { | |
| "epoch": 1.9145077720207255, | |
| "grad_norm": 42.768439146141375, | |
| "learning_rate": 2.1255173960634146e-07, | |
| "loss": 1.0917, | |
| "step": 739 | |
| }, | |
| { | |
| "epoch": 1.917098445595855, | |
| "grad_norm": 31.627146067352545, | |
| "learning_rate": 2.0025316489323597e-07, | |
| "loss": 1.0842, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.9196891191709846, | |
| "grad_norm": 67.01614151937272, | |
| "learning_rate": 1.8831929084406119e-07, | |
| "loss": 1.1287, | |
| "step": 741 | |
| }, | |
| { | |
| "epoch": 1.922279792746114, | |
| "grad_norm": 56.931018082229045, | |
| "learning_rate": 1.7675033727434288e-07, | |
| "loss": 1.148, | |
| "step": 742 | |
| }, | |
| { | |
| "epoch": 1.9248704663212435, | |
| "grad_norm": 35.24107640275113, | |
| "learning_rate": 1.655465172779702e-07, | |
| "loss": 1.0814, | |
| "step": 743 | |
| }, | |
| { | |
| "epoch": 1.927461139896373, | |
| "grad_norm": 28.45308969334642, | |
| "learning_rate": 1.547080372232679e-07, | |
| "loss": 1.1092, | |
| "step": 744 | |
| }, | |
| { | |
| "epoch": 1.9300518134715026, | |
| "grad_norm": 67.36918357149847, | |
| "learning_rate": 1.44235096749199e-07, | |
| "loss": 1.1332, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 1.932642487046632, | |
| "grad_norm": 33.50866269131509, | |
| "learning_rate": 1.3412788876167925e-07, | |
| "loss": 1.0884, | |
| "step": 746 | |
| }, | |
| { | |
| "epoch": 1.9352331606217616, | |
| "grad_norm": 34.359505767271465, | |
| "learning_rate": 1.2438659943003306e-07, | |
| "loss": 0.9982, | |
| "step": 747 | |
| }, | |
| { | |
| "epoch": 1.9378238341968912, | |
| "grad_norm": 44.805290236152125, | |
| "learning_rate": 1.1501140818355627e-07, | |
| "loss": 1.065, | |
| "step": 748 | |
| }, | |
| { | |
| "epoch": 1.9404145077720207, | |
| "grad_norm": 35.70322964853727, | |
| "learning_rate": 1.0600248770821886e-07, | |
| "loss": 1.1435, | |
| "step": 749 | |
| }, | |
| { | |
| "epoch": 1.9430051813471503, | |
| "grad_norm": 37.7037381444634, | |
| "learning_rate": 9.736000394348299e-08, | |
| "loss": 1.1085, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.9455958549222798, | |
| "grad_norm": 19.88028370873119, | |
| "learning_rate": 8.908411607923884e-08, | |
| "loss": 1.0903, | |
| "step": 751 | |
| }, | |
| { | |
| "epoch": 1.9481865284974094, | |
| "grad_norm": 22.037441897095253, | |
| "learning_rate": 8.117497655287798e-08, | |
| "loss": 1.0621, | |
| "step": 752 | |
| }, | |
| { | |
| "epoch": 1.950777202072539, | |
| "grad_norm": 36.597366625713235, | |
| "learning_rate": 7.363273104648904e-08, | |
| "loss": 1.134, | |
| "step": 753 | |
| }, | |
| { | |
| "epoch": 1.9533678756476682, | |
| "grad_norm": 36.91544331752125, | |
| "learning_rate": 6.645751848417093e-08, | |
| "loss": 1.0894, | |
| "step": 754 | |
| }, | |
| { | |
| "epoch": 1.9559585492227978, | |
| "grad_norm": 30.791496804716704, | |
| "learning_rate": 5.964947102946594e-08, | |
| "loss": 1.0774, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 1.9585492227979273, | |
| "grad_norm": 24.76204564200231, | |
| "learning_rate": 5.320871408294403e-08, | |
| "loss": 1.1167, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 1.9611398963730569, | |
| "grad_norm": 31.78111531944549, | |
| "learning_rate": 4.713536627987347e-08, | |
| "loss": 1.0709, | |
| "step": 757 | |
| }, | |
| { | |
| "epoch": 1.9637305699481864, | |
| "grad_norm": 36.388018093644106, | |
| "learning_rate": 4.1429539488047066e-08, | |
| "loss": 1.0492, | |
| "step": 758 | |
| }, | |
| { | |
| "epoch": 1.966321243523316, | |
| "grad_norm": 27.235358627643226, | |
| "learning_rate": 3.6091338805719356e-08, | |
| "loss": 1.1128, | |
| "step": 759 | |
| }, | |
| { | |
| "epoch": 1.9689119170984455, | |
| "grad_norm": 26.526882273916378, | |
| "learning_rate": 3.1120862559670396e-08, | |
| "loss": 1.1129, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.971502590673575, | |
| "grad_norm": 28.962449597773997, | |
| "learning_rate": 2.651820230338942e-08, | |
| "loss": 1.1286, | |
| "step": 761 | |
| }, | |
| { | |
| "epoch": 1.9740932642487046, | |
| "grad_norm": 104.33848533313731, | |
| "learning_rate": 2.2283442815402845e-08, | |
| "loss": 1.117, | |
| "step": 762 | |
| }, | |
| { | |
| "epoch": 1.9766839378238341, | |
| "grad_norm": 179.66099272542536, | |
| "learning_rate": 1.8416662097693326e-08, | |
| "loss": 1.0788, | |
| "step": 763 | |
| }, | |
| { | |
| "epoch": 1.9792746113989637, | |
| "grad_norm": 28.438877123785307, | |
| "learning_rate": 1.491793137427866e-08, | |
| "loss": 1.1436, | |
| "step": 764 | |
| }, | |
| { | |
| "epoch": 1.9818652849740932, | |
| "grad_norm": 44.454308819411644, | |
| "learning_rate": 1.1787315089895057e-08, | |
| "loss": 1.1108, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 1.9844559585492227, | |
| "grad_norm": 53.23249975862293, | |
| "learning_rate": 9.024870908802552e-09, | |
| "loss": 0.9971, | |
| "step": 766 | |
| }, | |
| { | |
| "epoch": 1.9870466321243523, | |
| "grad_norm": 35.2043549019015, | |
| "learning_rate": 6.630649713739168e-09, | |
| "loss": 1.1205, | |
| "step": 767 | |
| }, | |
| { | |
| "epoch": 1.9896373056994818, | |
| "grad_norm": 22.286284343829376, | |
| "learning_rate": 4.6046956049639045e-09, | |
| "loss": 1.0848, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 1.9922279792746114, | |
| "grad_norm": 24.94719200433733, | |
| "learning_rate": 2.94704589946182e-09, | |
| "loss": 1.1308, | |
| "step": 769 | |
| }, | |
| { | |
| "epoch": 1.994818652849741, | |
| "grad_norm": 41.684623957583106, | |
| "learning_rate": 1.657731130246809e-09, | |
| "loss": 1.1555, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.9974093264248705, | |
| "grad_norm": 55.480495348949425, | |
| "learning_rate": 7.367750458020518e-10, | |
| "loss": 1.129, | |
| "step": 771 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 43.2148652279276, | |
| "learning_rate": 1.8419460964258505e-10, | |
| "loss": 1.0835, | |
| "step": 772 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 772, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 193, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.3363988309999616e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |