{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9978118161925602,
  "eval_steps": 500,
  "global_step": 171,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005835156819839533,
      "grad_norm": 3.0967645370477803,
      "learning_rate": 0.0,
      "loss": 1.0698,
      "step": 1
    },
    {
      "epoch": 0.011670313639679067,
      "grad_norm": 3.240964653147952,
      "learning_rate": 5.555555555555555e-07,
      "loss": 1.0714,
      "step": 2
    },
    {
      "epoch": 0.0175054704595186,
      "grad_norm": 3.286415256381705,
      "learning_rate": 1.111111111111111e-06,
      "loss": 1.1575,
      "step": 3
    },
    {
      "epoch": 0.023340627279358133,
      "grad_norm": 4.198907257722359,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 1.359,
      "step": 4
    },
    {
      "epoch": 0.029175784099197667,
      "grad_norm": 3.4693235279579637,
      "learning_rate": 2.222222222222222e-06,
      "loss": 0.9649,
      "step": 5
    },
    {
      "epoch": 0.0350109409190372,
      "grad_norm": 3.0568945268705883,
      "learning_rate": 2.7777777777777783e-06,
      "loss": 1.0989,
      "step": 6
    },
    {
      "epoch": 0.040846097738876735,
      "grad_norm": 2.2641958909637157,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.9482,
      "step": 7
    },
    {
      "epoch": 0.046681254558716266,
      "grad_norm": 2.5090273909864367,
      "learning_rate": 3.88888888888889e-06,
      "loss": 1.1167,
      "step": 8
    },
    {
      "epoch": 0.0525164113785558,
      "grad_norm": 1.976815998901389,
      "learning_rate": 4.444444444444444e-06,
      "loss": 1.0178,
      "step": 9
    },
    {
      "epoch": 0.058351568198395334,
      "grad_norm": 2.1819711414316108,
      "learning_rate": 5e-06,
      "loss": 1.2776,
      "step": 10
    },
    {
      "epoch": 0.06418672501823487,
      "grad_norm": 1.8201699012478207,
      "learning_rate": 5.555555555555557e-06,
      "loss": 1.1955,
      "step": 11
    },
    {
      "epoch": 0.0700218818380744,
      "grad_norm": 1.5462205611641233,
      "learning_rate": 6.111111111111112e-06,
      "loss": 0.9814,
      "step": 12
    },
    {
      "epoch": 0.07585703865791393,
      "grad_norm": 1.8781521337402638,
      "learning_rate": 6.666666666666667e-06,
      "loss": 1.0493,
      "step": 13
    },
    {
      "epoch": 0.08169219547775347,
      "grad_norm": 1.7561324099023312,
      "learning_rate": 7.222222222222223e-06,
      "loss": 0.9224,
      "step": 14
    },
    {
      "epoch": 0.087527352297593,
      "grad_norm": 1.4418880184516636,
      "learning_rate": 7.77777777777778e-06,
      "loss": 0.8082,
      "step": 15
    },
    {
      "epoch": 0.09336250911743253,
      "grad_norm": 1.8346753334483505,
      "learning_rate": 8.333333333333334e-06,
      "loss": 1.0255,
      "step": 16
    },
    {
      "epoch": 0.09919766593727207,
      "grad_norm": 1.5851713574171153,
      "learning_rate": 8.888888888888888e-06,
      "loss": 0.9778,
      "step": 17
    },
    {
      "epoch": 0.1050328227571116,
      "grad_norm": 1.670718527610444,
      "learning_rate": 9.444444444444445e-06,
      "loss": 0.9377,
      "step": 18
    },
    {
      "epoch": 0.11086797957695113,
      "grad_norm": 1.6015249574772341,
      "learning_rate": 1e-05,
      "loss": 0.9287,
      "step": 19
    },
    {
      "epoch": 0.11670313639679067,
      "grad_norm": 1.6989482004469072,
      "learning_rate": 9.998945997517957e-06,
      "loss": 1.152,
      "step": 20
    },
    {
      "epoch": 0.12253829321663019,
      "grad_norm": 1.3452249233397808,
      "learning_rate": 9.99578443444032e-06,
      "loss": 0.8807,
      "step": 21
    },
    {
      "epoch": 0.12837345003646974,
      "grad_norm": 2.1487234818996686,
      "learning_rate": 9.990516643685222e-06,
      "loss": 0.7627,
      "step": 22
    },
    {
      "epoch": 0.13420860685630925,
      "grad_norm": 1.923473946779632,
      "learning_rate": 9.983144846158472e-06,
      "loss": 1.2667,
      "step": 23
    },
    {
      "epoch": 0.1400437636761488,
      "grad_norm": 1.1277306360256565,
      "learning_rate": 9.973672149817232e-06,
      "loss": 0.7689,
      "step": 24
    },
    {
      "epoch": 0.14587892049598833,
      "grad_norm": 1.1475575701592111,
      "learning_rate": 9.96210254835968e-06,
      "loss": 0.7796,
      "step": 25
    },
    {
      "epoch": 0.15171407731582787,
      "grad_norm": 1.423352108857882,
      "learning_rate": 9.948440919541277e-06,
      "loss": 1.0162,
      "step": 26
    },
    {
      "epoch": 0.1575492341356674,
      "grad_norm": 1.3087161085162522,
      "learning_rate": 9.932693023118299e-06,
      "loss": 0.9945,
      "step": 27
    },
    {
      "epoch": 0.16338439095550694,
      "grad_norm": 1.3363607553089747,
      "learning_rate": 9.91486549841951e-06,
      "loss": 0.8672,
      "step": 28
    },
    {
      "epoch": 0.16921954777534645,
      "grad_norm": 1.1887359433001867,
      "learning_rate": 9.894965861547023e-06,
      "loss": 0.9,
      "step": 29
    },
    {
      "epoch": 0.175054704595186,
      "grad_norm": 1.3150047218140335,
      "learning_rate": 9.873002502207502e-06,
      "loss": 0.786,
      "step": 30
    },
    {
      "epoch": 0.18088986141502553,
      "grad_norm": 1.3895338060465705,
      "learning_rate": 9.848984680175049e-06,
      "loss": 0.935,
      "step": 31
    },
    {
      "epoch": 0.18672501823486506,
      "grad_norm": 1.3275940096513228,
      "learning_rate": 9.822922521387277e-06,
      "loss": 0.804,
      "step": 32
    },
    {
      "epoch": 0.1925601750547046,
      "grad_norm": 1.0560366942912123,
      "learning_rate": 9.794827013676206e-06,
      "loss": 0.7541,
      "step": 33
    },
    {
      "epoch": 0.19839533187454414,
      "grad_norm": 0.9888394869609451,
      "learning_rate": 9.764710002135784e-06,
      "loss": 0.6956,
      "step": 34
    },
    {
      "epoch": 0.20423048869438365,
      "grad_norm": 1.5276601674154802,
      "learning_rate": 9.732584184127973e-06,
      "loss": 1.1283,
      "step": 35
    },
    {
      "epoch": 0.2100656455142232,
      "grad_norm": 1.0255635415282915,
      "learning_rate": 9.698463103929542e-06,
      "loss": 0.7307,
      "step": 36
    },
    {
      "epoch": 0.21590080233406272,
      "grad_norm": 1.3172712637289348,
      "learning_rate": 9.66236114702178e-06,
      "loss": 0.912,
      "step": 37
    },
    {
      "epoch": 0.22173595915390226,
      "grad_norm": 1.2265556745233968,
      "learning_rate": 9.62429353402556e-06,
      "loss": 0.8612,
      "step": 38
    },
    {
      "epoch": 0.2275711159737418,
      "grad_norm": 1.382293666822052,
      "learning_rate": 9.584276314284316e-06,
      "loss": 0.9029,
      "step": 39
    },
    {
      "epoch": 0.23340627279358134,
      "grad_norm": 1.4564772263747658,
      "learning_rate": 9.542326359097619e-06,
      "loss": 0.9388,
      "step": 40
    },
    {
      "epoch": 0.23924142961342085,
      "grad_norm": 1.0351934922881194,
      "learning_rate": 9.498461354608228e-06,
      "loss": 0.7048,
      "step": 41
    },
    {
      "epoch": 0.24507658643326038,
      "grad_norm": 1.1855152897517165,
      "learning_rate": 9.452699794345583e-06,
      "loss": 0.8375,
      "step": 42
    },
    {
      "epoch": 0.25091174325309995,
      "grad_norm": 0.9858455620030542,
      "learning_rate": 9.405060971428924e-06,
      "loss": 0.6869,
      "step": 43
    },
    {
      "epoch": 0.2567469000729395,
      "grad_norm": 0.9902232882820138,
      "learning_rate": 9.355564970433288e-06,
      "loss": 0.7302,
      "step": 44
    },
    {
      "epoch": 0.26258205689277897,
      "grad_norm": 1.0675035604533485,
      "learning_rate": 9.30423265892184e-06,
      "loss": 0.8118,
      "step": 45
    },
    {
      "epoch": 0.2684172137126185,
      "grad_norm": 1.3867123886210095,
      "learning_rate": 9.251085678648072e-06,
      "loss": 1.0246,
      "step": 46
    },
    {
      "epoch": 0.27425237053245805,
      "grad_norm": 1.5259388197713282,
      "learning_rate": 9.196146436431635e-06,
      "loss": 1.1016,
      "step": 47
    },
    {
      "epoch": 0.2800875273522976,
      "grad_norm": 1.2982224709981671,
      "learning_rate": 9.13943809471159e-06,
      "loss": 0.9131,
      "step": 48
    },
    {
      "epoch": 0.2859226841721371,
      "grad_norm": 1.0192488730773452,
      "learning_rate": 9.08098456178111e-06,
      "loss": 0.6878,
      "step": 49
    },
    {
      "epoch": 0.29175784099197666,
      "grad_norm": 1.3595848189757955,
      "learning_rate": 9.020810481707709e-06,
      "loss": 0.9331,
      "step": 50
    },
    {
      "epoch": 0.2975929978118162,
      "grad_norm": 0.8812342597637117,
      "learning_rate": 8.958941223943292e-06,
      "loss": 0.6287,
      "step": 51
    },
    {
      "epoch": 0.30342815463165573,
      "grad_norm": 1.2990640418383663,
      "learning_rate": 8.895402872628352e-06,
      "loss": 0.9154,
      "step": 52
    },
    {
      "epoch": 0.30926331145149527,
      "grad_norm": 1.2083829619551534,
      "learning_rate": 8.83022221559489e-06,
      "loss": 0.8133,
      "step": 53
    },
    {
      "epoch": 0.3150984682713348,
      "grad_norm": 1.1431625625586879,
      "learning_rate": 8.763426733072624e-06,
      "loss": 0.7709,
      "step": 54
    },
    {
      "epoch": 0.32093362509117435,
      "grad_norm": 1.0434131741083061,
      "learning_rate": 8.695044586103297e-06,
      "loss": 0.8007,
      "step": 55
    },
    {
      "epoch": 0.3267687819110139,
      "grad_norm": 1.0904984432710232,
      "learning_rate": 8.625104604667965e-06,
      "loss": 0.8123,
      "step": 56
    },
    {
      "epoch": 0.33260393873085337,
      "grad_norm": 0.920394581392736,
      "learning_rate": 8.553636275532236e-06,
      "loss": 0.6425,
      "step": 57
    },
    {
      "epoch": 0.3384390955506929,
      "grad_norm": 1.1503386459221627,
      "learning_rate": 8.480669729814635e-06,
      "loss": 0.7206,
      "step": 58
    },
    {
      "epoch": 0.34427425237053244,
      "grad_norm": 1.090627383298351,
      "learning_rate": 8.40623573028327e-06,
      "loss": 0.7557,
      "step": 59
    },
    {
      "epoch": 0.350109409190372,
      "grad_norm": 1.2240349403722315,
      "learning_rate": 8.330365658386252e-06,
      "loss": 0.896,
      "step": 60
    },
    {
      "epoch": 0.3559445660102115,
      "grad_norm": 1.0765178282667567,
      "learning_rate": 8.25309150102121e-06,
      "loss": 0.7681,
      "step": 61
    },
    {
      "epoch": 0.36177972283005105,
      "grad_norm": 0.9793957317021386,
      "learning_rate": 8.174445837049614e-06,
      "loss": 0.7504,
      "step": 62
    },
    {
      "epoch": 0.3676148796498906,
      "grad_norm": 1.2232761418608094,
      "learning_rate": 8.094461823561473e-06,
      "loss": 0.8488,
      "step": 63
    },
    {
      "epoch": 0.37345003646973013,
      "grad_norm": 1.2088998059050347,
      "learning_rate": 8.013173181896283e-06,
      "loss": 0.8523,
      "step": 64
    },
    {
      "epoch": 0.37928519328956967,
      "grad_norm": 1.2900325123864722,
      "learning_rate": 7.930614183426074e-06,
      "loss": 0.8181,
      "step": 65
    },
    {
      "epoch": 0.3851203501094092,
      "grad_norm": 0.9454791838776316,
      "learning_rate": 7.846819635106569e-06,
      "loss": 0.7437,
      "step": 66
    },
    {
      "epoch": 0.39095550692924874,
      "grad_norm": 0.9144587470068872,
      "learning_rate": 7.76182486480253e-06,
      "loss": 0.628,
      "step": 67
    },
    {
      "epoch": 0.3967906637490883,
      "grad_norm": 1.15724937547755,
      "learning_rate": 7.675665706393502e-06,
      "loss": 0.871,
      "step": 68
    },
    {
      "epoch": 0.4026258205689278,
      "grad_norm": 1.5015191452686714,
      "learning_rate": 7.588378484666214e-06,
      "loss": 1.0221,
      "step": 69
    },
    {
      "epoch": 0.4084609773887673,
      "grad_norm": 1.39998905729372,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.9229,
      "step": 70
    },
    {
      "epoch": 0.41429613420860684,
      "grad_norm": 0.948492711117696,
      "learning_rate": 7.4105675128517456e-06,
      "loss": 0.6874,
      "step": 71
    },
    {
      "epoch": 0.4201312910284464,
      "grad_norm": 1.258374093991861,
      "learning_rate": 7.320118728046818e-06,
      "loss": 0.9105,
      "step": 72
    },
    {
      "epoch": 0.4259664478482859,
      "grad_norm": 0.9884107530136707,
      "learning_rate": 7.2286917788826926e-06,
      "loss": 0.6701,
      "step": 73
    },
    {
      "epoch": 0.43180160466812545,
      "grad_norm": 1.4165000586156353,
      "learning_rate": 7.136325211051905e-06,
      "loss": 1.0041,
      "step": 74
    },
    {
      "epoch": 0.437636761487965,
      "grad_norm": 1.1602616170268807,
      "learning_rate": 7.043057966391158e-06,
      "loss": 0.8137,
      "step": 75
    },
    {
      "epoch": 0.4434719183078045,
      "grad_norm": 0.8628006157003906,
      "learning_rate": 6.948929366463397e-06,
      "loss": 0.6294,
      "step": 76
    },
    {
      "epoch": 0.44930707512764406,
      "grad_norm": 1.0039942958404424,
      "learning_rate": 6.8539790959798045e-06,
      "loss": 0.7109,
      "step": 77
    },
    {
      "epoch": 0.4551422319474836,
      "grad_norm": 1.3931149797330953,
      "learning_rate": 6.758247186068684e-06,
      "loss": 0.9198,
      "step": 78
    },
    {
      "epoch": 0.46097738876732314,
      "grad_norm": 1.0513701294639524,
      "learning_rate": 6.6617739973982985e-06,
      "loss": 0.7459,
      "step": 79
    },
    {
      "epoch": 0.4668125455871627,
      "grad_norm": 1.028695671279348,
      "learning_rate": 6.5646002031607726e-06,
      "loss": 0.746,
      "step": 80
    },
    {
      "epoch": 0.4726477024070022,
      "grad_norm": 0.9959898755507389,
      "learning_rate": 6.466766771924231e-06,
      "loss": 0.8316,
      "step": 81
    },
    {
      "epoch": 0.4784828592268417,
      "grad_norm": 1.0084464738139896,
      "learning_rate": 6.368314950360416e-06,
      "loss": 0.6801,
      "step": 82
    },
    {
      "epoch": 0.48431801604668123,
      "grad_norm": 1.2115089332593143,
      "learning_rate": 6.269286245855039e-06,
      "loss": 0.9926,
      "step": 83
    },
    {
      "epoch": 0.49015317286652077,
      "grad_norm": 1.0623534367007055,
      "learning_rate": 6.169722409008244e-06,
      "loss": 0.7846,
      "step": 84
    },
    {
      "epoch": 0.4959883296863603,
      "grad_norm": 0.9618691552575331,
      "learning_rate": 6.0696654160324875e-06,
      "loss": 0.7579,
      "step": 85
    },
    {
      "epoch": 0.5018234865061999,
      "grad_norm": 1.4236190190616003,
      "learning_rate": 5.9691574510553505e-06,
      "loss": 0.9145,
      "step": 86
    },
    {
      "epoch": 0.5076586433260394,
      "grad_norm": 0.9781934808099418,
      "learning_rate": 5.8682408883346535e-06,
      "loss": 0.7268,
      "step": 87
    },
    {
      "epoch": 0.513493800145879,
      "grad_norm": 1.4459804636005047,
      "learning_rate": 5.766958274393428e-06,
      "loss": 0.9873,
      "step": 88
    },
    {
      "epoch": 0.5193289569657185,
      "grad_norm": 1.402656043092622,
      "learning_rate": 5.66535231008227e-06,
      "loss": 0.9982,
      "step": 89
    },
    {
      "epoch": 0.5251641137855579,
      "grad_norm": 1.0200801826456665,
      "learning_rate": 5.5634658325766066e-06,
      "loss": 0.678,
      "step": 90
    },
    {
      "epoch": 0.5309992706053975,
      "grad_norm": 0.9733189988828815,
      "learning_rate": 5.46134179731651e-06,
      "loss": 0.7651,
      "step": 91
    },
    {
      "epoch": 0.536834427425237,
      "grad_norm": 1.4664206961259343,
      "learning_rate": 5.359023259896638e-06,
      "loss": 1.1306,
      "step": 92
    },
    {
      "epoch": 0.5426695842450766,
      "grad_norm": 0.8504168428338762,
      "learning_rate": 5.2565533579139484e-06,
      "loss": 0.63,
      "step": 93
    },
    {
      "epoch": 0.5485047410649161,
      "grad_norm": 0.9453356240059136,
      "learning_rate": 5.153975292780852e-06,
      "loss": 0.7414,
      "step": 94
    },
    {
      "epoch": 0.5543398978847557,
      "grad_norm": 1.2873508014079351,
      "learning_rate": 5.05133231151145e-06,
      "loss": 0.9491,
      "step": 95
    },
    {
      "epoch": 0.5601750547045952,
      "grad_norm": 1.120596468774601,
      "learning_rate": 4.948667688488552e-06,
      "loss": 0.8434,
      "step": 96
    },
    {
      "epoch": 0.5660102115244348,
      "grad_norm": 0.933260578220278,
      "learning_rate": 4.846024707219149e-06,
      "loss": 0.6954,
      "step": 97
    },
    {
      "epoch": 0.5718453683442742,
      "grad_norm": 1.1505604485932606,
      "learning_rate": 4.7434466420860515e-06,
      "loss": 0.8766,
      "step": 98
    },
    {
      "epoch": 0.5776805251641138,
      "grad_norm": 1.3190064743230776,
      "learning_rate": 4.640976740103363e-06,
      "loss": 0.9601,
      "step": 99
    },
    {
      "epoch": 0.5835156819839533,
      "grad_norm": 1.1031574587195832,
      "learning_rate": 4.53865820268349e-06,
      "loss": 0.8262,
      "step": 100
    },
    {
      "epoch": 0.5893508388037928,
      "grad_norm": 1.2140720848727085,
      "learning_rate": 4.436534167423395e-06,
      "loss": 0.8474,
      "step": 101
    },
    {
      "epoch": 0.5951859956236324,
      "grad_norm": 1.0868094387995848,
      "learning_rate": 4.334647689917734e-06,
      "loss": 0.7998,
      "step": 102
    },
    {
      "epoch": 0.6010211524434719,
      "grad_norm": 1.0979874833235823,
      "learning_rate": 4.233041725606573e-06,
      "loss": 0.7538,
      "step": 103
    },
    {
      "epoch": 0.6068563092633115,
      "grad_norm": 0.8239976607859979,
      "learning_rate": 4.131759111665349e-06,
      "loss": 0.6354,
      "step": 104
    },
    {
      "epoch": 0.612691466083151,
      "grad_norm": 1.2986211049877099,
      "learning_rate": 4.03084254894465e-06,
      "loss": 0.8842,
      "step": 105
    },
    {
      "epoch": 0.6185266229029905,
      "grad_norm": 1.4668023858002492,
      "learning_rate": 3.930334583967514e-06,
      "loss": 1.1808,
      "step": 106
    },
    {
      "epoch": 0.62436177972283,
      "grad_norm": 0.9088094469191689,
      "learning_rate": 3.8302775909917585e-06,
      "loss": 0.7396,
      "step": 107
    },
    {
      "epoch": 0.6301969365426696,
      "grad_norm": 1.1598630921383477,
      "learning_rate": 3.730713754144961e-06,
      "loss": 0.8171,
      "step": 108
    },
    {
      "epoch": 0.6360320933625091,
      "grad_norm": 0.9110392040510746,
      "learning_rate": 3.6316850496395863e-06,
      "loss": 0.7386,
      "step": 109
    },
    {
      "epoch": 0.6418672501823487,
      "grad_norm": 1.4073871245462084,
      "learning_rate": 3.5332332280757706e-06,
      "loss": 0.8708,
      "step": 110
    },
    {
      "epoch": 0.6477024070021882,
      "grad_norm": 1.0974400375996476,
      "learning_rate": 3.4353997968392295e-06,
      "loss": 0.7883,
      "step": 111
    },
    {
      "epoch": 0.6535375638220278,
      "grad_norm": 0.9061825638441261,
      "learning_rate": 3.3382260026017027e-06,
      "loss": 0.6915,
      "step": 112
    },
    {
      "epoch": 0.6593727206418672,
      "grad_norm": 1.0771803335915262,
      "learning_rate": 3.241752813931316e-06,
      "loss": 0.8647,
      "step": 113
    },
    {
      "epoch": 0.6652078774617067,
      "grad_norm": 0.9568570846689433,
      "learning_rate": 3.1460209040201967e-06,
      "loss": 0.743,
      "step": 114
    },
    {
      "epoch": 0.6710430342815463,
      "grad_norm": 0.8038754456744998,
      "learning_rate": 3.0510706335366034e-06,
      "loss": 0.6186,
      "step": 115
    },
    {
      "epoch": 0.6768781911013858,
      "grad_norm": 1.0490487646998612,
      "learning_rate": 2.956942033608843e-06,
      "loss": 0.7842,
      "step": 116
    },
    {
      "epoch": 0.6827133479212254,
      "grad_norm": 1.1528124191592586,
      "learning_rate": 2.863674788948097e-06,
      "loss": 0.8293,
      "step": 117
    },
    {
      "epoch": 0.6885485047410649,
      "grad_norm": 1.1151481901518225,
      "learning_rate": 2.771308221117309e-06,
      "loss": 0.8212,
      "step": 118
    },
    {
      "epoch": 0.6943836615609045,
      "grad_norm": 0.9417128722043704,
      "learning_rate": 2.6798812719531843e-06,
      "loss": 0.6844,
      "step": 119
    },
    {
      "epoch": 0.700218818380744,
      "grad_norm": 1.032191371370444,
      "learning_rate": 2.5894324871482557e-06,
      "loss": 0.8145,
      "step": 120
    },
    {
      "epoch": 0.7060539752005836,
      "grad_norm": 0.8894619973494138,
      "learning_rate": 2.5000000000000015e-06,
      "loss": 0.7,
      "step": 121
    },
    {
      "epoch": 0.711889132020423,
      "grad_norm": 1.0056726773650362,
      "learning_rate": 2.411621515333788e-06,
      "loss": 0.7701,
      "step": 122
    },
    {
      "epoch": 0.7177242888402626,
      "grad_norm": 1.0869409622185178,
      "learning_rate": 2.324334293606499e-06,
      "loss": 0.8878,
      "step": 123
    },
    {
      "epoch": 0.7235594456601021,
      "grad_norm": 0.9975686046452821,
      "learning_rate": 2.238175135197471e-06,
      "loss": 0.7396,
      "step": 124
    },
    {
      "epoch": 0.7293946024799417,
      "grad_norm": 0.9651532423367729,
      "learning_rate": 2.1531803648934333e-06,
      "loss": 0.7326,
      "step": 125
    },
    {
      "epoch": 0.7352297592997812,
      "grad_norm": 1.2022500141898183,
      "learning_rate": 2.069385816573928e-06,
      "loss": 0.8903,
      "step": 126
    },
    {
      "epoch": 0.7410649161196207,
      "grad_norm": 1.1194585298700934,
      "learning_rate": 1.9868268181037186e-06,
      "loss": 0.7492,
      "step": 127
    },
    {
      "epoch": 0.7469000729394603,
      "grad_norm": 0.9563972874909548,
      "learning_rate": 1.9055381764385272e-06,
      "loss": 0.7019,
      "step": 128
    },
    {
      "epoch": 0.7527352297592997,
      "grad_norm": 1.1116002934004348,
      "learning_rate": 1.8255541629503865e-06,
      "loss": 0.8464,
      "step": 129
    },
    {
      "epoch": 0.7585703865791393,
      "grad_norm": 0.9834353364914193,
      "learning_rate": 1.746908498978791e-06,
      "loss": 0.7098,
      "step": 130
    },
    {
      "epoch": 0.7644055433989788,
      "grad_norm": 0.9133138576459355,
      "learning_rate": 1.6696343416137495e-06,
      "loss": 0.7116,
      "step": 131
    },
    {
      "epoch": 0.7702407002188184,
      "grad_norm": 1.1533333715544398,
      "learning_rate": 1.5937642697167288e-06,
      "loss": 0.8582,
      "step": 132
    },
    {
      "epoch": 0.7760758570386579,
      "grad_norm": 1.1643056283458053,
      "learning_rate": 1.5193302701853674e-06,
      "loss": 0.8634,
      "step": 133
    },
    {
      "epoch": 0.7819110138584975,
      "grad_norm": 1.2470889663194495,
      "learning_rate": 1.4463637244677648e-06,
      "loss": 1.0618,
      "step": 134
    },
    {
      "epoch": 0.787746170678337,
      "grad_norm": 1.137971799681056,
      "learning_rate": 1.374895395332037e-06,
      "loss": 0.7659,
      "step": 135
    },
    {
      "epoch": 0.7935813274981766,
      "grad_norm": 0.8296991248159793,
      "learning_rate": 1.3049554138967052e-06,
      "loss": 0.6202,
      "step": 136
    },
    {
      "epoch": 0.799416484318016,
      "grad_norm": 1.267967690991332,
      "learning_rate": 1.2365732669273778e-06,
      "loss": 0.7476,
      "step": 137
    },
    {
      "epoch": 0.8052516411378556,
      "grad_norm": 1.0879072186919871,
      "learning_rate": 1.1697777844051105e-06,
      "loss": 0.7832,
      "step": 138
    },
    {
      "epoch": 0.8110867979576951,
      "grad_norm": 0.9784037046492985,
      "learning_rate": 1.1045971273716476e-06,
      "loss": 0.75,
      "step": 139
    },
    {
      "epoch": 0.8169219547775346,
      "grad_norm": 0.969625700866293,
      "learning_rate": 1.0410587760567104e-06,
      "loss": 0.8109,
      "step": 140
    },
    {
      "epoch": 0.8227571115973742,
      "grad_norm": 1.0370004026477495,
      "learning_rate": 9.791895182922911e-07,
      "loss": 0.7486,
      "step": 141
    },
    {
      "epoch": 0.8285922684172137,
      "grad_norm": 1.313976521343513,
      "learning_rate": 9.190154382188921e-07,
      "loss": 0.9392,
      "step": 142
    },
    {
      "epoch": 0.8344274252370533,
      "grad_norm": 1.3968351116673179,
      "learning_rate": 8.605619052884106e-07,
      "loss": 1.0106,
      "step": 143
    },
    {
      "epoch": 0.8402625820568927,
      "grad_norm": 0.9927938469465771,
      "learning_rate": 8.03853563568367e-07,
      "loss": 0.7784,
      "step": 144
    },
    {
      "epoch": 0.8460977388767323,
      "grad_norm": 0.9788007150864164,
      "learning_rate": 7.489143213519301e-07,
      "loss": 0.8396,
      "step": 145
    },
    {
      "epoch": 0.8519328956965718,
      "grad_norm": 1.0639499300007744,
      "learning_rate": 6.957673410781617e-07,
      "loss": 0.7964,
      "step": 146
    },
    {
      "epoch": 0.8577680525164114,
      "grad_norm": 1.2358144380534066,
      "learning_rate": 6.444350295667112e-07,
      "loss": 0.8458,
      "step": 147
    },
    {
      "epoch": 0.8636032093362509,
      "grad_norm": 1.1284354446219842,
      "learning_rate": 5.949390285710777e-07,
      "loss": 0.8905,
      "step": 148
    },
    {
      "epoch": 0.8694383661560905,
      "grad_norm": 0.9040214380343765,
      "learning_rate": 5.473002056544191e-07,
      "loss": 0.7118,
      "step": 149
    },
    {
      "epoch": 0.87527352297593,
      "grad_norm": 0.9693967414133844,
      "learning_rate": 5.015386453917742e-07,
      "loss": 0.7273,
      "step": 150
    },
    {
      "epoch": 0.8811086797957695,
      "grad_norm": 1.08806034257909,
      "learning_rate": 4.576736409023813e-07,
      "loss": 0.762,
      "step": 151
    },
    {
      "epoch": 0.886943836615609,
      "grad_norm": 1.0602639261817013,
      "learning_rate": 4.15723685715686e-07,
      "loss": 0.8823,
      "step": 152
    },
    {
      "epoch": 0.8927789934354485,
      "grad_norm": 1.015659074679919,
      "learning_rate": 3.7570646597444196e-07,
      "loss": 0.8069,
      "step": 153
    },
    {
      "epoch": 0.8986141502552881,
      "grad_norm": 1.1556479384409875,
      "learning_rate": 3.3763885297822153e-07,
      "loss": 0.9008,
      "step": 154
    },
    {
      "epoch": 0.9044493070751276,
      "grad_norm": 1.1155559090709997,
      "learning_rate": 3.015368960704584e-07,
      "loss": 0.7752,
      "step": 155
    },
    {
      "epoch": 0.9102844638949672,
      "grad_norm": 1.1402114085019883,
      "learning_rate": 2.6741581587202747e-07,
      "loss": 0.849,
      "step": 156
    },
    {
      "epoch": 0.9161196207148067,
      "grad_norm": 1.0419920776286093,
      "learning_rate": 2.3528999786421758e-07,
      "loss": 0.8142,
      "step": 157
    },
    {
      "epoch": 0.9219547775346463,
      "grad_norm": 1.1306470417466037,
      "learning_rate": 2.0517298632379445e-07,
      "loss": 0.8408,
      "step": 158
    },
    {
      "epoch": 0.9277899343544858,
      "grad_norm": 1.125391381754142,
      "learning_rate": 1.770774786127244e-07,
      "loss": 0.7793,
      "step": 159
    },
    {
      "epoch": 0.9336250911743253,
      "grad_norm": 0.8803202289480606,
      "learning_rate": 1.510153198249531e-07,
      "loss": 0.7106,
      "step": 160
    },
    {
      "epoch": 0.9394602479941648,
      "grad_norm": 0.9210847874254954,
      "learning_rate": 1.2699749779249926e-07,
      "loss": 0.6821,
      "step": 161
    },
    {
      "epoch": 0.9452954048140044,
      "grad_norm": 1.0075752224421093,
      "learning_rate": 1.0503413845297739e-07,
      "loss": 0.8513,
      "step": 162
    },
    {
      "epoch": 0.9511305616338439,
      "grad_norm": 0.8750161290165441,
      "learning_rate": 8.513450158049109e-08,
      "loss": 0.7585,
      "step": 163
    },
    {
      "epoch": 0.9569657184536834,
      "grad_norm": 1.1096291136415941,
      "learning_rate": 6.730697688170251e-08,
      "loss": 0.8467,
      "step": 164
    },
    {
      "epoch": 0.962800875273523,
      "grad_norm": 0.9247194424342008,
      "learning_rate": 5.155908045872349e-08,
      "loss": 0.7033,
      "step": 165
    },
    {
      "epoch": 0.9686360320933625,
      "grad_norm": 0.9831793236532468,
      "learning_rate": 3.7897451640321326e-08,
      "loss": 0.7581,
      "step": 166
    },
    {
      "epoch": 0.9744711889132021,
      "grad_norm": 0.9263535840213669,
      "learning_rate": 2.6327850182769065e-08,
      "loss": 0.697,
      "step": 167
    },
    {
      "epoch": 0.9803063457330415,
      "grad_norm": 0.9615311264525579,
      "learning_rate": 1.6855153841527915e-08,
      "loss": 0.6953,
      "step": 168
    },
    {
      "epoch": 0.9861415025528811,
      "grad_norm": 0.9734970339085126,
      "learning_rate": 9.48335631477948e-09,
      "loss": 0.8321,
      "step": 169
    },
    {
      "epoch": 0.9919766593727206,
      "grad_norm": 0.8205128623545712,
      "learning_rate": 4.2155655596809455e-09,
      "loss": 0.6526,
      "step": 170
    },
    {
      "epoch": 0.9978118161925602,
      "grad_norm": 0.8811350978732005,
      "learning_rate": 1.054002482043237e-09,
      "loss": 0.7098,
      "step": 171
    }
  ],
  "logging_steps": 1,
  "max_steps": 171,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 4050,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 18593527627776.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}