{ "best_global_step": 1800, "best_metric": 0.19401330376940132, "best_model_checkpoint": "out_qwen_0.6b_sft_augmented/checkpoint-1800", "epoch": 2.839613335963701, "eval_steps": 50, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015782205563227462, "grad_norm": 167.66599198435412, "learning_rate": 2.3560209424083772e-07, "loss": 34.3388, "step": 10, "true_loss": 4.2431 }, { "epoch": 0.031564411126454923, "grad_norm": 140.88294654003442, "learning_rate": 4.973821989528796e-07, "loss": 35.0522, "step": 20, "true_loss": 4.3078 }, { "epoch": 0.04734661668968238, "grad_norm": 136.79181472954014, "learning_rate": 7.591623036649215e-07, "loss": 34.637, "step": 30, "true_loss": 4.3079 }, { "epoch": 0.06312882225290985, "grad_norm": 131.76245344240186, "learning_rate": 1.0209424083769635e-06, "loss": 34.6678, "step": 40, "true_loss": 4.398 }, { "epoch": 0.0789110278161373, "grad_norm": 121.23785317676568, "learning_rate": 1.2827225130890052e-06, "loss": 34.7919, "step": 50, "true_loss": 4.3715 }, { "epoch": 0.0789110278161373, "eval_accuracy": 0.02328159645232816, "eval_loss": 4.273420333862305, "eval_runtime": 15.4906, "eval_samples_per_second": 58.229, "eval_steps_per_second": 7.295, "step": 50 }, { "epoch": 0.09469323337936476, "grad_norm": 115.65585623224781, "learning_rate": 1.5445026178010472e-06, "loss": 33.6145, "step": 60, "true_loss": 4.0062 }, { "epoch": 0.11047543894259222, "grad_norm": 130.54325341180044, "learning_rate": 1.8062827225130891e-06, "loss": 33.379, "step": 70, "true_loss": 3.8901 }, { "epoch": 0.1262576445058197, "grad_norm": 112.3856329975497, "learning_rate": 2.068062827225131e-06, "loss": 33.5192, "step": 80, "true_loss": 4.307 }, { "epoch": 0.14203985006904715, "grad_norm": 108.9608630618755, "learning_rate": 2.329842931937173e-06, "loss": 33.1023, "step": 90, "true_loss": 4.0033 }, { "epoch": 0.1578220556322746, "grad_norm": 106.77628746014003, "learning_rate": 2.591623036649215e-06, "loss": 33.1149, "step": 100, "true_loss": 4.2262 }, { "epoch": 0.1578220556322746, "eval_accuracy": 0.018847006651884702, "eval_loss": 4.114043235778809, "eval_runtime": 14.8964, "eval_samples_per_second": 60.552, "eval_steps_per_second": 7.586, "step": 100 }, { "epoch": 0.17360426119550207, "grad_norm": 121.11526025215916, "learning_rate": 2.853403141361257e-06, "loss": 32.6546, "step": 110, "true_loss": 4.1197 }, { "epoch": 0.18938646675872953, "grad_norm": 92.42690926252463, "learning_rate": 3.115183246073299e-06, "loss": 32.8328, "step": 120, "true_loss": 4.095 }, { "epoch": 0.20516867232195699, "grad_norm": 101.38444957790254, "learning_rate": 3.3769633507853404e-06, "loss": 32.8388, "step": 130, "true_loss": 4.177 }, { "epoch": 0.22095087788518444, "grad_norm": 81.64774590325571, "learning_rate": 3.6387434554973826e-06, "loss": 33.3548, "step": 140, "true_loss": 4.0975 }, { "epoch": 0.2367330834484119, "grad_norm": 96.23586804909877, "learning_rate": 3.900523560209425e-06, "loss": 32.7975, "step": 150, "true_loss": 4.2231 }, { "epoch": 0.2367330834484119, "eval_accuracy": 0.037694013303769404, "eval_loss": 4.033259391784668, "eval_runtime": 14.9076, "eval_samples_per_second": 60.506, "eval_steps_per_second": 7.58, "step": 150 }, { "epoch": 0.2525152890116394, "grad_norm": 98.59498624074838, "learning_rate": 4.1623036649214665e-06, "loss": 32.6746, "step": 160, "true_loss": 4.0838 }, { "epoch": 0.26829749457486685, "grad_norm": 91.56971668470555, "learning_rate": 4.424083769633508e-06, "loss": 32.4462, "step": 170, "true_loss": 4.0187 }, { "epoch": 0.2840797001380943, "grad_norm": 88.75837984752525, "learning_rate": 4.68586387434555e-06, "loss": 32.573, "step": 180, "true_loss": 4.117 }, { "epoch": 0.29986190570132176, "grad_norm": 91.58443998502435, "learning_rate": 4.947643979057592e-06, "loss": 32.665, "step": 190, "true_loss": 3.9757 }, { "epoch": 0.3156441112645492, "grad_norm": 92.49414152028442, "learning_rate": 4.976621858562245e-06, "loss": 32.7294, "step": 200, "true_loss": 4.1063 }, { "epoch": 0.3156441112645492, "eval_accuracy": 0.04656319290465632, "eval_loss": 3.9896063804626465, "eval_runtime": 14.875, "eval_samples_per_second": 60.639, "eval_steps_per_second": 7.597, "step": 200 }, { "epoch": 0.3314263168277767, "grad_norm": 69.24966334193799, "learning_rate": 4.94739918176505e-06, "loss": 32.8352, "step": 210, "true_loss": 3.9432 }, { "epoch": 0.34720852239100414, "grad_norm": 84.67835268794279, "learning_rate": 4.9181765049678555e-06, "loss": 32.6025, "step": 220, "true_loss": 4.1811 }, { "epoch": 0.3629907279542316, "grad_norm": 62.265769584137104, "learning_rate": 4.888953828170661e-06, "loss": 33.0571, "step": 230, "true_loss": 4.1072 }, { "epoch": 0.37877293351745905, "grad_norm": 54.79839495974889, "learning_rate": 4.859731151373466e-06, "loss": 32.5185, "step": 240, "true_loss": 4.0185 }, { "epoch": 0.3945551390806865, "grad_norm": 60.84068430822686, "learning_rate": 4.830508474576272e-06, "loss": 32.4647, "step": 250, "true_loss": 4.1141 }, { "epoch": 0.3945551390806865, "eval_accuracy": 0.057649667405764965, "eval_loss": 3.965649366378784, "eval_runtime": 14.894, "eval_samples_per_second": 60.561, "eval_steps_per_second": 7.587, "step": 250 }, { "epoch": 0.41033734464391397, "grad_norm": 69.10630072542976, "learning_rate": 4.801285797779077e-06, "loss": 32.716, "step": 260, "true_loss": 4.0852 }, { "epoch": 0.42611955020714143, "grad_norm": 69.30562697285451, "learning_rate": 4.772063120981883e-06, "loss": 32.3945, "step": 270, "true_loss": 4.04 }, { "epoch": 0.4419017557703689, "grad_norm": 61.70708790424729, "learning_rate": 4.742840444184687e-06, "loss": 31.8389, "step": 280, "true_loss": 3.9424 }, { "epoch": 0.45768396133359635, "grad_norm": 63.553230156261236, "learning_rate": 4.713617767387494e-06, "loss": 32.2705, "step": 290, "true_loss": 4.0162 }, { "epoch": 0.4734661668968238, "grad_norm": 66.18745807155665, "learning_rate": 4.684395090590298e-06, "loss": 32.2542, "step": 300, "true_loss": 4.1658 }, { "epoch": 0.4734661668968238, "eval_accuracy": 0.05432372505543237, "eval_loss": 3.950526714324951, "eval_runtime": 14.9818, "eval_samples_per_second": 60.206, "eval_steps_per_second": 7.542, "step": 300 }, { "epoch": 0.4892483724600513, "grad_norm": 58.57848936978391, "learning_rate": 4.655172413793104e-06, "loss": 32.678, "step": 310, "true_loss": 3.9291 }, { "epoch": 0.5050305780232788, "grad_norm": 61.33442223889434, "learning_rate": 4.625949736995909e-06, "loss": 32.1995, "step": 320, "true_loss": 4.1149 }, { "epoch": 0.5208127835865062, "grad_norm": 64.64890658388506, "learning_rate": 4.596727060198715e-06, "loss": 32.6549, "step": 330, "true_loss": 4.2883 }, { "epoch": 0.5365949891497337, "grad_norm": 67.05902120899448, "learning_rate": 4.56750438340152e-06, "loss": 32.4256, "step": 340, "true_loss": 4.1349 }, { "epoch": 0.5523771947129611, "grad_norm": 56.47408736066254, "learning_rate": 4.5382817066043256e-06, "loss": 32.9031, "step": 350, "true_loss": 4.0859 }, { "epoch": 0.5523771947129611, "eval_accuracy": 0.04878048780487805, "eval_loss": 3.9386215209960938, "eval_runtime": 14.7498, "eval_samples_per_second": 61.153, "eval_steps_per_second": 7.661, "step": 350 }, { "epoch": 0.5681594002761886, "grad_norm": 68.21711305531616, "learning_rate": 4.509059029807131e-06, "loss": 31.9884, "step": 360, "true_loss": 4.0053 }, { "epoch": 0.583941605839416, "grad_norm": 67.03237493940061, "learning_rate": 4.479836353009936e-06, "loss": 32.0414, "step": 370, "true_loss": 4.0486 }, { "epoch": 0.5997238114026435, "grad_norm": 62.81448024910035, "learning_rate": 4.450613676212742e-06, "loss": 31.6654, "step": 380, "true_loss": 3.9884 }, { "epoch": 0.6155060169658709, "grad_norm": 58.734278541487996, "learning_rate": 4.4213909994155465e-06, "loss": 32.1588, "step": 390, "true_loss": 4.1576 }, { "epoch": 0.6312882225290984, "grad_norm": 56.72282043276077, "learning_rate": 4.392168322618352e-06, "loss": 32.4645, "step": 400, "true_loss": 3.9613 }, { "epoch": 0.6312882225290984, "eval_accuracy": 0.08425720620842572, "eval_loss": 3.8948559761047363, "eval_runtime": 14.9154, "eval_samples_per_second": 60.474, "eval_steps_per_second": 7.576, "step": 400 }, { "epoch": 0.647070428092326, "grad_norm": 67.89360370302569, "learning_rate": 4.3629456458211574e-06, "loss": 32.2354, "step": 410, "true_loss": 3.9924 }, { "epoch": 0.6628526336555534, "grad_norm": 58.51107977573013, "learning_rate": 4.333722969023963e-06, "loss": 32.3006, "step": 420, "true_loss": 3.9875 }, { "epoch": 0.6786348392187809, "grad_norm": 66.3808988459889, "learning_rate": 4.304500292226768e-06, "loss": 32.2764, "step": 430, "true_loss": 3.9726 }, { "epoch": 0.6944170447820083, "grad_norm": 70.6833559958922, "learning_rate": 4.275277615429574e-06, "loss": 32.2592, "step": 440, "true_loss": 4.1439 }, { "epoch": 0.7101992503452358, "grad_norm": 63.3033292392276, "learning_rate": 4.246054938632379e-06, "loss": 32.362, "step": 450, "true_loss": 4.0236 }, { "epoch": 0.7101992503452358, "eval_accuracy": 0.07206208425720621, "eval_loss": 3.8847439289093018, "eval_runtime": 14.9499, "eval_samples_per_second": 60.335, "eval_steps_per_second": 7.559, "step": 450 }, { "epoch": 0.7259814559084632, "grad_norm": 72.67280180250647, "learning_rate": 4.216832261835184e-06, "loss": 32.2076, "step": 460, "true_loss": 4.1088 }, { "epoch": 0.7417636614716907, "grad_norm": 61.8016941961879, "learning_rate": 4.18760958503799e-06, "loss": 31.8356, "step": 470, "true_loss": 3.9524 }, { "epoch": 0.7575458670349181, "grad_norm": 71.60310974326192, "learning_rate": 4.158386908240795e-06, "loss": 32.4698, "step": 480, "true_loss": 3.9285 }, { "epoch": 0.7733280725981456, "grad_norm": 63.81105183121505, "learning_rate": 4.1291642314436e-06, "loss": 32.4375, "step": 490, "true_loss": 4.0445 }, { "epoch": 0.789110278161373, "grad_norm": 66.81078306732113, "learning_rate": 4.0999415546464065e-06, "loss": 31.6606, "step": 500, "true_loss": 3.912 }, { "epoch": 0.789110278161373, "eval_accuracy": 0.08869179600886919, "eval_loss": 3.842041492462158, "eval_runtime": 14.9881, "eval_samples_per_second": 60.181, "eval_steps_per_second": 7.539, "step": 500 }, { "epoch": 0.8048924837246005, "grad_norm": 66.51737872080436, "learning_rate": 4.070718877849211e-06, "loss": 32.6594, "step": 510, "true_loss": 4.0515 }, { "epoch": 0.8206746892878279, "grad_norm": 102.93145951917813, "learning_rate": 4.0414962010520166e-06, "loss": 32.0275, "step": 520, "true_loss": 4.1003 }, { "epoch": 0.8364568948510555, "grad_norm": 75.53198463360987, "learning_rate": 4.012273524254822e-06, "loss": 32.2893, "step": 530, "true_loss": 4.0965 }, { "epoch": 0.8522391004142829, "grad_norm": 76.07281655644753, "learning_rate": 3.9830508474576275e-06, "loss": 32.2373, "step": 540, "true_loss": 3.9613 }, { "epoch": 0.8680213059775104, "grad_norm": 72.30674725635646, "learning_rate": 3.953828170660433e-06, "loss": 32.1916, "step": 550, "true_loss": 3.9895 }, { "epoch": 0.8680213059775104, "eval_accuracy": 0.10975609756097561, "eval_loss": 3.7870140075683594, "eval_runtime": 14.8068, "eval_samples_per_second": 60.918, "eval_steps_per_second": 7.632, "step": 550 }, { "epoch": 0.8838035115407378, "grad_norm": 77.32202458254106, "learning_rate": 3.924605493863238e-06, "loss": 31.7419, "step": 560, "true_loss": 3.9115 }, { "epoch": 0.8995857171039653, "grad_norm": 78.63719989484625, "learning_rate": 3.895382817066044e-06, "loss": 32.5222, "step": 570, "true_loss": 4.0513 }, { "epoch": 0.9153679226671927, "grad_norm": 79.29136261002967, "learning_rate": 3.8661601402688484e-06, "loss": 31.8769, "step": 580, "true_loss": 4.0132 }, { "epoch": 0.9311501282304202, "grad_norm": 77.29435282237034, "learning_rate": 3.836937463471655e-06, "loss": 31.7648, "step": 590, "true_loss": 3.9623 }, { "epoch": 0.9469323337936476, "grad_norm": 68.35155053663401, "learning_rate": 3.8077147866744598e-06, "loss": 31.884, "step": 600, "true_loss": 3.9455 }, { "epoch": 0.9469323337936476, "eval_accuracy": 0.11751662971175167, "eval_loss": 3.7514853477478027, "eval_runtime": 14.8368, "eval_samples_per_second": 60.795, "eval_steps_per_second": 7.616, "step": 600 }, { "epoch": 0.9627145393568751, "grad_norm": 73.94418735157363, "learning_rate": 3.7784921098772652e-06, "loss": 31.669, "step": 610, "true_loss": 3.9077 }, { "epoch": 0.9784967449201026, "grad_norm": 78.72012954254731, "learning_rate": 3.7492694330800707e-06, "loss": 32.1414, "step": 620, "true_loss": 3.8983 }, { "epoch": 0.99427895048333, "grad_norm": 83.38578257414015, "learning_rate": 3.7200467562828757e-06, "loss": 32.0129, "step": 630, "true_loss": 3.9648 }, { "epoch": 1.0094693233379364, "grad_norm": 79.30562934714216, "learning_rate": 3.6908240794856816e-06, "loss": 30.8218, "step": 640, "true_loss": 4.0426 }, { "epoch": 1.0252515289011639, "grad_norm": 86.51493702246239, "learning_rate": 3.6616014026884866e-06, "loss": 31.7275, "step": 650, "true_loss": 4.0234 }, { "epoch": 1.0252515289011639, "eval_accuracy": 0.11862527716186252, "eval_loss": 3.7260189056396484, "eval_runtime": 14.9748, "eval_samples_per_second": 60.234, "eval_steps_per_second": 7.546, "step": 650 }, { "epoch": 1.0410337344643914, "grad_norm": 95.28223069807079, "learning_rate": 3.6323787258912916e-06, "loss": 31.1501, "step": 660, "true_loss": 4.1224 }, { "epoch": 1.056815940027619, "grad_norm": 95.66233260675281, "learning_rate": 3.6031560490940975e-06, "loss": 31.3991, "step": 670, "true_loss": 4.1524 }, { "epoch": 1.0725981455908462, "grad_norm": 105.1106515406413, "learning_rate": 3.5739333722969025e-06, "loss": 31.4503, "step": 680, "true_loss": 4.02 }, { "epoch": 1.0883803511540737, "grad_norm": 107.36932762546802, "learning_rate": 3.544710695499708e-06, "loss": 31.0981, "step": 690, "true_loss": 3.7185 }, { "epoch": 1.1041625567173012, "grad_norm": 102.49971541043875, "learning_rate": 3.5154880187025135e-06, "loss": 31.0649, "step": 700, "true_loss": 3.9306 }, { "epoch": 1.1041625567173012, "eval_accuracy": 0.13082039911308205, "eval_loss": 3.679271697998047, "eval_runtime": 14.9433, "eval_samples_per_second": 60.362, "eval_steps_per_second": 7.562, "step": 700 }, { "epoch": 1.1199447622805287, "grad_norm": 101.73957570633324, "learning_rate": 3.486265341905319e-06, "loss": 30.7209, "step": 710, "true_loss": 3.7747 }, { "epoch": 1.1357269678437563, "grad_norm": 94.73479469843072, "learning_rate": 3.457042665108124e-06, "loss": 31.5505, "step": 720, "true_loss": 3.7559 }, { "epoch": 1.1515091734069836, "grad_norm": 95.84840897528187, "learning_rate": 3.42781998831093e-06, "loss": 31.1915, "step": 730, "true_loss": 3.9457 }, { "epoch": 1.167291378970211, "grad_norm": 91.92412320762536, "learning_rate": 3.398597311513735e-06, "loss": 30.6652, "step": 740, "true_loss": 4.0984 }, { "epoch": 1.1830735845334386, "grad_norm": 97.13718067393098, "learning_rate": 3.3693746347165403e-06, "loss": 31.3627, "step": 750, "true_loss": 3.9014 }, { "epoch": 1.1830735845334386, "eval_accuracy": 0.14523281596452328, "eval_loss": 3.621267080307007, "eval_runtime": 15.0066, "eval_samples_per_second": 60.107, "eval_steps_per_second": 7.53, "step": 750 }, { "epoch": 1.198855790096666, "grad_norm": 102.94703309653879, "learning_rate": 3.3401519579193458e-06, "loss": 30.9646, "step": 760, "true_loss": 3.888 }, { "epoch": 1.2146379956598934, "grad_norm": 92.09108955881567, "learning_rate": 3.310929281122151e-06, "loss": 31.3938, "step": 770, "true_loss": 3.9565 }, { "epoch": 1.230420201223121, "grad_norm": 102.8940703497217, "learning_rate": 3.2817066043249562e-06, "loss": 30.6722, "step": 780, "true_loss": 3.7934 }, { "epoch": 1.2462024067863484, "grad_norm": 109.37649388601628, "learning_rate": 3.252483927527762e-06, "loss": 30.8783, "step": 790, "true_loss": 3.9625 }, { "epoch": 1.261984612349576, "grad_norm": 97.09349910497292, "learning_rate": 3.223261250730567e-06, "loss": 31.1443, "step": 800, "true_loss": 3.8638 }, { "epoch": 1.261984612349576, "eval_accuracy": 0.15077605321507762, "eval_loss": 3.628795862197876, "eval_runtime": 14.9076, "eval_samples_per_second": 60.506, "eval_steps_per_second": 7.58, "step": 800 }, { "epoch": 1.2777668179128034, "grad_norm": 93.73265189126965, "learning_rate": 3.194038573933372e-06, "loss": 31.4319, "step": 810, "true_loss": 4.0132 }, { "epoch": 1.2935490234760307, "grad_norm": 95.06539169984532, "learning_rate": 3.164815897136178e-06, "loss": 30.1564, "step": 820, "true_loss": 3.7849 }, { "epoch": 1.3093312290392582, "grad_norm": 104.78400272530634, "learning_rate": 3.135593220338983e-06, "loss": 31.4396, "step": 830, "true_loss": 3.9531 }, { "epoch": 1.3251134346024858, "grad_norm": 100.60674522449192, "learning_rate": 3.1063705435417885e-06, "loss": 31.0772, "step": 840, "true_loss": 3.9851 }, { "epoch": 1.340895640165713, "grad_norm": 92.96501685915857, "learning_rate": 3.0771478667445944e-06, "loss": 31.258, "step": 850, "true_loss": 4.0154 }, { "epoch": 1.340895640165713, "eval_accuracy": 0.15299334811529933, "eval_loss": 3.602060556411743, "eval_runtime": 14.8867, "eval_samples_per_second": 60.591, "eval_steps_per_second": 7.591, "step": 850 }, { "epoch": 1.3566778457289406, "grad_norm": 108.12958936489977, "learning_rate": 3.0479251899473994e-06, "loss": 30.7919, "step": 860, "true_loss": 4.071 }, { "epoch": 1.372460051292168, "grad_norm": 106.38226656410994, "learning_rate": 3.0187025131502045e-06, "loss": 31.106, "step": 870, "true_loss": 3.8771 }, { "epoch": 1.3882422568553956, "grad_norm": 108.70900740908084, "learning_rate": 2.9894798363530103e-06, "loss": 31.1107, "step": 880, "true_loss": 4.0616 }, { "epoch": 1.404024462418623, "grad_norm": 96.84422602253281, "learning_rate": 2.9602571595558154e-06, "loss": 31.1303, "step": 890, "true_loss": 3.9508 }, { "epoch": 1.4198066679818504, "grad_norm": 107.15877482205376, "learning_rate": 2.931034482758621e-06, "loss": 31.3023, "step": 900, "true_loss": 3.6901 }, { "epoch": 1.4198066679818504, "eval_accuracy": 0.14412416851441243, "eval_loss": 3.599184989929199, "eval_runtime": 15.0273, "eval_samples_per_second": 60.024, "eval_steps_per_second": 7.52, "step": 900 }, { "epoch": 1.435588873545078, "grad_norm": 111.56871878652656, "learning_rate": 2.9018118059614263e-06, "loss": 31.9703, "step": 910, "true_loss": 3.8989 }, { "epoch": 1.4513710791083054, "grad_norm": 95.57031718308956, "learning_rate": 2.8725891291642317e-06, "loss": 30.6978, "step": 920, "true_loss": 3.9814 }, { "epoch": 1.4671532846715327, "grad_norm": 104.66292767084798, "learning_rate": 2.8433664523670368e-06, "loss": 30.8414, "step": 930, "true_loss": 3.6811 }, { "epoch": 1.4829354902347602, "grad_norm": 109.58289564213575, "learning_rate": 2.8141437755698426e-06, "loss": 30.8233, "step": 940, "true_loss": 3.8887 }, { "epoch": 1.4987176957979877, "grad_norm": 109.2690047474217, "learning_rate": 2.7849210987726477e-06, "loss": 30.8807, "step": 950, "true_loss": 3.8297 }, { "epoch": 1.4987176957979877, "eval_accuracy": 0.14523281596452328, "eval_loss": 3.5658605098724365, "eval_runtime": 14.9659, "eval_samples_per_second": 60.27, "eval_steps_per_second": 7.55, "step": 950 }, { "epoch": 1.5144999013612153, "grad_norm": 111.94119193959217, "learning_rate": 2.7556984219754535e-06, "loss": 30.3332, "step": 960, "true_loss": 3.709 }, { "epoch": 1.5302821069244428, "grad_norm": 123.39562082481231, "learning_rate": 2.7264757451782586e-06, "loss": 31.1628, "step": 970, "true_loss": 4.1074 }, { "epoch": 1.5460643124876703, "grad_norm": 105.28551916442693, "learning_rate": 2.697253068381064e-06, "loss": 30.8197, "step": 980, "true_loss": 3.6786 }, { "epoch": 1.5618465180508976, "grad_norm": 111.08402051996337, "learning_rate": 2.6680303915838695e-06, "loss": 30.9562, "step": 990, "true_loss": 3.9186 }, { "epoch": 1.577628723614125, "grad_norm": 103.10179117331714, "learning_rate": 2.638807714786675e-06, "loss": 30.8469, "step": 1000, "true_loss": 3.6842 }, { "epoch": 1.577628723614125, "eval_accuracy": 0.1629711751662971, "eval_loss": 3.5389108657836914, "eval_runtime": 14.9677, "eval_samples_per_second": 60.263, "eval_steps_per_second": 7.55, "step": 1000 }, { "epoch": 1.5934109291773524, "grad_norm": 104.7664363387938, "learning_rate": 2.60958503798948e-06, "loss": 30.8959, "step": 1010, "true_loss": 3.8303 }, { "epoch": 1.60919313474058, "grad_norm": 108.11532700366512, "learning_rate": 2.580362361192286e-06, "loss": 30.822, "step": 1020, "true_loss": 4.0791 }, { "epoch": 1.6249753403038074, "grad_norm": 122.96775922789799, "learning_rate": 2.551139684395091e-06, "loss": 30.4368, "step": 1030, "true_loss": 3.8672 }, { "epoch": 1.640757545867035, "grad_norm": 111.32549428947904, "learning_rate": 2.521917007597896e-06, "loss": 30.7161, "step": 1040, "true_loss": 3.9227 }, { "epoch": 1.6565397514302624, "grad_norm": 105.3879178888327, "learning_rate": 2.4926943308007014e-06, "loss": 30.6919, "step": 1050, "true_loss": 3.8327 }, { "epoch": 1.6565397514302624, "eval_accuracy": 0.15964523281596452, "eval_loss": 3.5158631801605225, "eval_runtime": 15.0083, "eval_samples_per_second": 60.1, "eval_steps_per_second": 7.529, "step": 1050 }, { "epoch": 1.67232195699349, "grad_norm": 118.06824634956254, "learning_rate": 2.463471654003507e-06, "loss": 30.4308, "step": 1060, "true_loss": 3.718 }, { "epoch": 1.6881041625567172, "grad_norm": 116.61755089060757, "learning_rate": 2.4342489772063123e-06, "loss": 30.5914, "step": 1070, "true_loss": 3.9526 }, { "epoch": 1.7038863681199448, "grad_norm": 115.56434553036443, "learning_rate": 2.4050263004091177e-06, "loss": 30.5987, "step": 1080, "true_loss": 3.9159 }, { "epoch": 1.7196685736831723, "grad_norm": 112.33303898638984, "learning_rate": 2.375803623611923e-06, "loss": 31.0096, "step": 1090, "true_loss": 3.8114 }, { "epoch": 1.7354507792463996, "grad_norm": 112.21346249165515, "learning_rate": 2.3465809468147286e-06, "loss": 30.7385, "step": 1100, "true_loss": 3.8687 }, { "epoch": 1.7354507792463996, "eval_accuracy": 0.1696230598669623, "eval_loss": 3.5062079429626465, "eval_runtime": 14.8938, "eval_samples_per_second": 60.562, "eval_steps_per_second": 7.587, "step": 1100 }, { "epoch": 1.751232984809627, "grad_norm": 122.73525997559868, "learning_rate": 2.3173582700175337e-06, "loss": 30.5289, "step": 1110, "true_loss": 3.8205 }, { "epoch": 1.7670151903728546, "grad_norm": 119.27018553105627, "learning_rate": 2.288135593220339e-06, "loss": 30.1239, "step": 1120, "true_loss": 3.9501 }, { "epoch": 1.782797395936082, "grad_norm": 119.29842149378848, "learning_rate": 2.2589129164231446e-06, "loss": 30.4032, "step": 1130, "true_loss": 3.8006 }, { "epoch": 1.7985796014993096, "grad_norm": 126.27340829379088, "learning_rate": 2.22969023962595e-06, "loss": 30.8506, "step": 1140, "true_loss": 3.8388 }, { "epoch": 1.8143618070625371, "grad_norm": 118.22873644516751, "learning_rate": 2.2004675628287555e-06, "loss": 30.742, "step": 1150, "true_loss": 3.9507 }, { "epoch": 1.8143618070625371, "eval_accuracy": 0.1607538802660754, "eval_loss": 3.498107433319092, "eval_runtime": 15.0542, "eval_samples_per_second": 59.917, "eval_steps_per_second": 7.506, "step": 1150 }, { "epoch": 1.8301440126257644, "grad_norm": 115.24942636463535, "learning_rate": 2.171244886031561e-06, "loss": 30.2917, "step": 1160, "true_loss": 3.854 }, { "epoch": 1.845926218188992, "grad_norm": 120.31076182133407, "learning_rate": 2.142022209234366e-06, "loss": 30.98, "step": 1170, "true_loss": 4.0372 }, { "epoch": 1.8617084237522192, "grad_norm": 114.03103310407008, "learning_rate": 2.1127995324371714e-06, "loss": 31.2044, "step": 1180, "true_loss": 3.7539 }, { "epoch": 1.8774906293154467, "grad_norm": 112.15849385918017, "learning_rate": 2.083576855639977e-06, "loss": 30.6597, "step": 1190, "true_loss": 3.6715 }, { "epoch": 1.8932728348786743, "grad_norm": 139.58809345884583, "learning_rate": 2.054354178842782e-06, "loss": 30.6171, "step": 1200, "true_loss": 3.7479 }, { "epoch": 1.8932728348786743, "eval_accuracy": 0.16186252771618626, "eval_loss": 3.4824934005737305, "eval_runtime": 14.975, "eval_samples_per_second": 60.234, "eval_steps_per_second": 7.546, "step": 1200 }, { "epoch": 1.9090550404419018, "grad_norm": 126.47013176461778, "learning_rate": 2.0251315020455873e-06, "loss": 30.4756, "step": 1210, "true_loss": 3.8482 }, { "epoch": 1.9248372460051293, "grad_norm": 129.40438104945616, "learning_rate": 1.995908825248393e-06, "loss": 30.6922, "step": 1220, "true_loss": 3.7344 }, { "epoch": 1.9406194515683568, "grad_norm": 117.04258986010086, "learning_rate": 1.9666861484511982e-06, "loss": 30.2474, "step": 1230, "true_loss": 3.746 }, { "epoch": 1.956401657131584, "grad_norm": 115.62691180569766, "learning_rate": 1.9374634716540037e-06, "loss": 30.1561, "step": 1240, "true_loss": 3.9135 }, { "epoch": 1.9721838626948116, "grad_norm": 125.90746499326437, "learning_rate": 1.908240794856809e-06, "loss": 30.4126, "step": 1250, "true_loss": 3.8586 }, { "epoch": 1.9721838626948116, "eval_accuracy": 0.164079822616408, "eval_loss": 3.4665002822875977, "eval_runtime": 14.9177, "eval_samples_per_second": 60.465, "eval_steps_per_second": 7.575, "step": 1250 }, { "epoch": 1.987966068258039, "grad_norm": 125.7071274298596, "learning_rate": 1.8790181180596146e-06, "loss": 30.3615, "step": 1260, "true_loss": 3.9108 }, { "epoch": 2.0031564411126457, "grad_norm": 119.68428766484261, "learning_rate": 1.8497954412624196e-06, "loss": 28.5842, "step": 1270, "true_loss": 3.5793 }, { "epoch": 2.0189386466758728, "grad_norm": 142.95432081434146, "learning_rate": 1.820572764465225e-06, "loss": 29.1367, "step": 1280, "true_loss": 3.7706 }, { "epoch": 2.0347208522391003, "grad_norm": 160.24172016677548, "learning_rate": 1.7913500876680305e-06, "loss": 29.5544, "step": 1290, "true_loss": 3.548 }, { "epoch": 2.0505030578023278, "grad_norm": 140.3505056595712, "learning_rate": 1.7621274108708358e-06, "loss": 29.5142, "step": 1300, "true_loss": 3.7087 }, { "epoch": 2.0505030578023278, "eval_accuracy": 0.16518847006651885, "eval_loss": 3.4521355628967285, "eval_runtime": 14.8455, "eval_samples_per_second": 60.759, "eval_steps_per_second": 7.612, "step": 1300 }, { "epoch": 2.0662852633655553, "grad_norm": 135.81575482285095, "learning_rate": 1.7329047340736412e-06, "loss": 29.5029, "step": 1310, "true_loss": 3.6682 }, { "epoch": 2.082067468928783, "grad_norm": 143.74657310067929, "learning_rate": 1.7036820572764467e-06, "loss": 29.6016, "step": 1320, "true_loss": 3.5355 }, { "epoch": 2.0978496744920103, "grad_norm": 137.74730157928755, "learning_rate": 1.674459380479252e-06, "loss": 29.1209, "step": 1330, "true_loss": 3.4128 }, { "epoch": 2.113631880055238, "grad_norm": 147.5729345169822, "learning_rate": 1.6452367036820574e-06, "loss": 29.7667, "step": 1340, "true_loss": 3.5826 }, { "epoch": 2.1294140856184653, "grad_norm": 149.77371005154865, "learning_rate": 1.6160140268848628e-06, "loss": 29.3551, "step": 1350, "true_loss": 3.7582 }, { "epoch": 2.1294140856184653, "eval_accuracy": 0.18292682926829268, "eval_loss": 3.432112693786621, "eval_runtime": 14.9865, "eval_samples_per_second": 60.187, "eval_steps_per_second": 7.54, "step": 1350 }, { "epoch": 2.1451962911816924, "grad_norm": 147.80741608555775, "learning_rate": 1.586791350087668e-06, "loss": 29.2982, "step": 1360, "true_loss": 3.4947 }, { "epoch": 2.16097849674492, "grad_norm": 145.1261208124329, "learning_rate": 1.5575686732904735e-06, "loss": 28.7471, "step": 1370, "true_loss": 3.4964 }, { "epoch": 2.1767607023081474, "grad_norm": 152.29650206925518, "learning_rate": 1.528345996493279e-06, "loss": 29.8086, "step": 1380, "true_loss": 3.724 }, { "epoch": 2.192542907871375, "grad_norm": 161.67922175813698, "learning_rate": 1.4991233196960842e-06, "loss": 29.5229, "step": 1390, "true_loss": 3.8039 }, { "epoch": 2.2083251134346025, "grad_norm": 155.22142131612623, "learning_rate": 1.4699006428988897e-06, "loss": 29.5038, "step": 1400, "true_loss": 3.5815 }, { "epoch": 2.2083251134346025, "eval_accuracy": 0.17960088691796008, "eval_loss": 3.440061569213867, "eval_runtime": 14.9796, "eval_samples_per_second": 60.215, "eval_steps_per_second": 7.544, "step": 1400 }, { "epoch": 2.22410731899783, "grad_norm": 156.53973755104687, "learning_rate": 1.4406779661016951e-06, "loss": 29.2715, "step": 1410, "true_loss": 3.6531 }, { "epoch": 2.2398895245610575, "grad_norm": 149.15210915040223, "learning_rate": 1.4114552893045006e-06, "loss": 29.6691, "step": 1420, "true_loss": 3.9184 }, { "epoch": 2.255671730124285, "grad_norm": 157.87160672579, "learning_rate": 1.3822326125073058e-06, "loss": 29.8417, "step": 1430, "true_loss": 3.6847 }, { "epoch": 2.2714539356875125, "grad_norm": 151.1446502317595, "learning_rate": 1.3530099357101113e-06, "loss": 30.3948, "step": 1440, "true_loss": 3.7877 }, { "epoch": 2.28723614125074, "grad_norm": 177.46389269846443, "learning_rate": 1.3237872589129167e-06, "loss": 29.5136, "step": 1450, "true_loss": 3.7729 }, { "epoch": 2.28723614125074, "eval_accuracy": 0.1762749445676275, "eval_loss": 3.4319870471954346, "eval_runtime": 14.8171, "eval_samples_per_second": 60.876, "eval_steps_per_second": 7.626, "step": 1450 }, { "epoch": 2.303018346813967, "grad_norm": 177.0142136955459, "learning_rate": 1.2945645821157218e-06, "loss": 29.6728, "step": 1460, "true_loss": 3.9035 }, { "epoch": 2.3188005523771946, "grad_norm": 170.26022348801405, "learning_rate": 1.2653419053185272e-06, "loss": 29.6204, "step": 1470, "true_loss": 3.7586 }, { "epoch": 2.334582757940422, "grad_norm": 169.623587107286, "learning_rate": 1.2361192285213327e-06, "loss": 29.4771, "step": 1480, "true_loss": 3.8 }, { "epoch": 2.3503649635036497, "grad_norm": 159.42232132005338, "learning_rate": 1.2068965517241381e-06, "loss": 29.0484, "step": 1490, "true_loss": 3.6296 }, { "epoch": 2.366147169066877, "grad_norm": 168.47474555405154, "learning_rate": 1.1776738749269434e-06, "loss": 29.6276, "step": 1500, "true_loss": 3.6125 }, { "epoch": 2.366147169066877, "eval_accuracy": 0.18403547671840353, "eval_loss": 3.420041084289551, "eval_runtime": 14.9555, "eval_samples_per_second": 60.312, "eval_steps_per_second": 7.556, "step": 1500 }, { "epoch": 2.3819293746301047, "grad_norm": 162.07404618888037, "learning_rate": 1.1484511981297488e-06, "loss": 29.0954, "step": 1510, "true_loss": 3.5425 }, { "epoch": 2.397711580193332, "grad_norm": 181.11213446797987, "learning_rate": 1.1192285213325543e-06, "loss": 29.3659, "step": 1520, "true_loss": 3.5447 }, { "epoch": 2.4134937857565593, "grad_norm": 172.6165441680545, "learning_rate": 1.0900058445353595e-06, "loss": 28.6622, "step": 1530, "true_loss": 3.5335 }, { "epoch": 2.4292759913197868, "grad_norm": 178.73759767816594, "learning_rate": 1.0607831677381648e-06, "loss": 29.8517, "step": 1540, "true_loss": 3.6041 }, { "epoch": 2.4450581968830143, "grad_norm": 192.3719970092387, "learning_rate": 1.0315604909409702e-06, "loss": 28.9193, "step": 1550, "true_loss": 3.7661 }, { "epoch": 2.4450581968830143, "eval_accuracy": 0.188470066518847, "eval_loss": 3.4035346508026123, "eval_runtime": 14.961, "eval_samples_per_second": 60.29, "eval_steps_per_second": 7.553, "step": 1550 }, { "epoch": 2.460840402446242, "grad_norm": 177.67877867152768, "learning_rate": 1.0023378141437757e-06, "loss": 29.2984, "step": 1560, "true_loss": 3.6286 }, { "epoch": 2.4766226080094693, "grad_norm": 188.74386540748054, "learning_rate": 9.731151373465811e-07, "loss": 29.2941, "step": 1570, "true_loss": 3.6082 }, { "epoch": 2.492404813572697, "grad_norm": 188.53177150362492, "learning_rate": 9.438924605493864e-07, "loss": 29.2244, "step": 1580, "true_loss": 3.4747 }, { "epoch": 2.5081870191359243, "grad_norm": 177.0506666498904, "learning_rate": 9.146697837521917e-07, "loss": 28.9166, "step": 1590, "true_loss": 3.4376 }, { "epoch": 2.523969224699152, "grad_norm": 183.5215902444712, "learning_rate": 8.854471069549972e-07, "loss": 28.6856, "step": 1600, "true_loss": 3.6844 }, { "epoch": 2.523969224699152, "eval_accuracy": 0.18181818181818182, "eval_loss": 3.3976125717163086, "eval_runtime": 14.9189, "eval_samples_per_second": 60.46, "eval_steps_per_second": 7.574, "step": 1600 }, { "epoch": 2.5397514302623794, "grad_norm": 175.18878295898904, "learning_rate": 8.562244301578025e-07, "loss": 29.0664, "step": 1610, "true_loss": 3.738 }, { "epoch": 2.555533635825607, "grad_norm": 170.9586548154297, "learning_rate": 8.270017533606079e-07, "loss": 29.4792, "step": 1620, "true_loss": 3.5714 }, { "epoch": 2.571315841388834, "grad_norm": 184.16360272576958, "learning_rate": 7.977790765634133e-07, "loss": 29.5751, "step": 1630, "true_loss": 3.6899 }, { "epoch": 2.5870980469520615, "grad_norm": 177.57568678111596, "learning_rate": 7.685563997662187e-07, "loss": 28.9365, "step": 1640, "true_loss": 3.7683 }, { "epoch": 2.602880252515289, "grad_norm": 174.71239418729613, "learning_rate": 7.393337229690241e-07, "loss": 29.2765, "step": 1650, "true_loss": 3.5981 }, { "epoch": 2.602880252515289, "eval_accuracy": 0.18403547671840353, "eval_loss": 3.399111032485962, "eval_runtime": 14.8984, "eval_samples_per_second": 60.544, "eval_steps_per_second": 7.585, "step": 1650 }, { "epoch": 2.6186624580785165, "grad_norm": 191.0002268007269, "learning_rate": 7.101110461718295e-07, "loss": 29.0547, "step": 1660, "true_loss": 3.8121 }, { "epoch": 2.634444663641744, "grad_norm": 177.3682901257333, "learning_rate": 6.808883693746347e-07, "loss": 29.0442, "step": 1670, "true_loss": 3.5864 }, { "epoch": 2.6502268692049715, "grad_norm": 174.71007183431408, "learning_rate": 6.516656925774401e-07, "loss": 28.9077, "step": 1680, "true_loss": 3.3604 }, { "epoch": 2.6660090747681986, "grad_norm": 189.2465250317481, "learning_rate": 6.224430157802455e-07, "loss": 29.2019, "step": 1690, "true_loss": 3.5039 }, { "epoch": 2.681791280331426, "grad_norm": 177.62073060179011, "learning_rate": 5.93220338983051e-07, "loss": 29.229, "step": 1700, "true_loss": 3.5725 }, { "epoch": 2.681791280331426, "eval_accuracy": 0.18514412416851442, "eval_loss": 3.3881542682647705, "eval_runtime": 15.0332, "eval_samples_per_second": 60.001, "eval_steps_per_second": 7.517, "step": 1700 }, { "epoch": 2.6975734858946536, "grad_norm": 188.85721776025207, "learning_rate": 5.639976621858563e-07, "loss": 28.9533, "step": 1710, "true_loss": 3.7415 }, { "epoch": 2.713355691457881, "grad_norm": 177.46012380599524, "learning_rate": 5.347749853886616e-07, "loss": 29.1529, "step": 1720, "true_loss": 3.5611 }, { "epoch": 2.7291378970211086, "grad_norm": 188.11753895032865, "learning_rate": 5.05552308591467e-07, "loss": 29.3255, "step": 1730, "true_loss": 3.5425 }, { "epoch": 2.744920102584336, "grad_norm": 170.87638552649375, "learning_rate": 4.763296317942724e-07, "loss": 29.4073, "step": 1740, "true_loss": 3.6556 }, { "epoch": 2.7607023081475637, "grad_norm": 180.7284650297668, "learning_rate": 4.4710695499707774e-07, "loss": 29.8538, "step": 1750, "true_loss": 3.8687 }, { "epoch": 2.7607023081475637, "eval_accuracy": 0.19068736141906872, "eval_loss": 3.377656936645508, "eval_runtime": 14.9062, "eval_samples_per_second": 60.512, "eval_steps_per_second": 7.581, "step": 1750 }, { "epoch": 2.776484513710791, "grad_norm": 174.471909092309, "learning_rate": 4.1788427819988314e-07, "loss": 28.8045, "step": 1760, "true_loss": 3.6287 }, { "epoch": 2.7922667192740187, "grad_norm": 195.44928225469693, "learning_rate": 3.8866160140268854e-07, "loss": 28.9067, "step": 1770, "true_loss": 3.4542 }, { "epoch": 2.808048924837246, "grad_norm": 178.92804289511642, "learning_rate": 3.594389246054939e-07, "loss": 28.6787, "step": 1780, "true_loss": 3.6228 }, { "epoch": 2.8238311304004737, "grad_norm": 194.93339966871807, "learning_rate": 3.3021624780829924e-07, "loss": 28.9603, "step": 1790, "true_loss": 3.6706 }, { "epoch": 2.839613335963701, "grad_norm": 201.13665707365328, "learning_rate": 3.0099357101110464e-07, "loss": 28.8935, "step": 1800, "true_loss": 3.5731 }, { "epoch": 2.839613335963701, "eval_accuracy": 0.19401330376940132, "eval_loss": 3.3746535778045654, "eval_runtime": 14.9077, "eval_samples_per_second": 60.506, "eval_steps_per_second": 7.58, "step": 1800 } ], "logging_steps": 10, "max_steps": 1902, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }