{
  "best_global_step": 1800,
  "best_metric": 0.19401330376940132,
  "best_model_checkpoint": "out_qwen_0.6b_sft_augmented/checkpoint-1800",
  "epoch": 2.839613335963701,
  "eval_steps": 50,
  "global_step": 1800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.015782205563227462,
      "grad_norm": 167.66599198435412,
      "learning_rate": 2.3560209424083772e-07,
      "loss": 34.3388,
      "step": 10,
      "true_loss": 4.2431
    },
    {
      "epoch": 0.031564411126454923,
      "grad_norm": 140.88294654003442,
      "learning_rate": 4.973821989528796e-07,
      "loss": 35.0522,
      "step": 20,
      "true_loss": 4.3078
    },
    {
      "epoch": 0.04734661668968238,
      "grad_norm": 136.79181472954014,
      "learning_rate": 7.591623036649215e-07,
      "loss": 34.637,
      "step": 30,
      "true_loss": 4.3079
    },
    {
      "epoch": 0.06312882225290985,
      "grad_norm": 131.76245344240186,
      "learning_rate": 1.0209424083769635e-06,
      "loss": 34.6678,
      "step": 40,
      "true_loss": 4.398
    },
    {
      "epoch": 0.0789110278161373,
      "grad_norm": 121.23785317676568,
      "learning_rate": 1.2827225130890052e-06,
      "loss": 34.7919,
      "step": 50,
      "true_loss": 4.3715
    },
    {
      "epoch": 0.0789110278161373,
      "eval_accuracy": 0.02328159645232816,
      "eval_loss": 4.273420333862305,
      "eval_runtime": 15.4906,
      "eval_samples_per_second": 58.229,
      "eval_steps_per_second": 7.295,
      "step": 50
    },
    {
      "epoch": 0.09469323337936476,
      "grad_norm": 115.65585623224781,
      "learning_rate": 1.5445026178010472e-06,
      "loss": 33.6145,
      "step": 60,
      "true_loss": 4.0062
    },
    {
      "epoch": 0.11047543894259222,
      "grad_norm": 130.54325341180044,
      "learning_rate": 1.8062827225130891e-06,
      "loss": 33.379,
      "step": 70,
      "true_loss": 3.8901
    },
    {
      "epoch": 0.1262576445058197,
      "grad_norm": 112.3856329975497,
      "learning_rate": 2.068062827225131e-06,
      "loss": 33.5192,
      "step": 80,
      "true_loss": 4.307
    },
    {
      "epoch": 0.14203985006904715,
      "grad_norm": 108.9608630618755,
      "learning_rate": 2.329842931937173e-06,
      "loss": 33.1023,
      "step": 90,
      "true_loss": 4.0033
    },
    {
      "epoch": 0.1578220556322746,
      "grad_norm": 106.77628746014003,
      "learning_rate": 2.591623036649215e-06,
      "loss": 33.1149,
      "step": 100,
      "true_loss": 4.2262
    },
    {
      "epoch": 0.1578220556322746,
      "eval_accuracy": 0.018847006651884702,
      "eval_loss": 4.114043235778809,
      "eval_runtime": 14.8964,
      "eval_samples_per_second": 60.552,
      "eval_steps_per_second": 7.586,
      "step": 100
    },
    {
      "epoch": 0.17360426119550207,
      "grad_norm": 121.11526025215916,
      "learning_rate": 2.853403141361257e-06,
      "loss": 32.6546,
      "step": 110,
      "true_loss": 4.1197
    },
    {
      "epoch": 0.18938646675872953,
      "grad_norm": 92.42690926252463,
      "learning_rate": 3.115183246073299e-06,
      "loss": 32.8328,
      "step": 120,
      "true_loss": 4.095
    },
    {
      "epoch": 0.20516867232195699,
      "grad_norm": 101.38444957790254,
      "learning_rate": 3.3769633507853404e-06,
      "loss": 32.8388,
      "step": 130,
      "true_loss": 4.177
    },
    {
      "epoch": 0.22095087788518444,
      "grad_norm": 81.64774590325571,
      "learning_rate": 3.6387434554973826e-06,
      "loss": 33.3548,
      "step": 140,
      "true_loss": 4.0975
    },
    {
      "epoch": 0.2367330834484119,
      "grad_norm": 96.23586804909877,
      "learning_rate": 3.900523560209425e-06,
      "loss": 32.7975,
      "step": 150,
      "true_loss": 4.2231
    },
    {
      "epoch": 0.2367330834484119,
      "eval_accuracy": 0.037694013303769404,
      "eval_loss": 4.033259391784668,
      "eval_runtime": 14.9076,
      "eval_samples_per_second": 60.506,
      "eval_steps_per_second": 7.58,
      "step": 150
    },
    {
      "epoch": 0.2525152890116394,
      "grad_norm": 98.59498624074838,
      "learning_rate": 4.1623036649214665e-06,
      "loss": 32.6746,
      "step": 160,
      "true_loss": 4.0838
    },
    {
      "epoch": 0.26829749457486685,
      "grad_norm": 91.56971668470555,
      "learning_rate": 4.424083769633508e-06,
      "loss": 32.4462,
      "step": 170,
      "true_loss": 4.0187
    },
    {
      "epoch": 0.2840797001380943,
      "grad_norm": 88.75837984752525,
      "learning_rate": 4.68586387434555e-06,
      "loss": 32.573,
      "step": 180,
      "true_loss": 4.117
    },
    {
      "epoch": 0.29986190570132176,
      "grad_norm": 91.58443998502435,
      "learning_rate": 4.947643979057592e-06,
      "loss": 32.665,
      "step": 190,
      "true_loss": 3.9757
    },
    {
      "epoch": 0.3156441112645492,
      "grad_norm": 92.49414152028442,
      "learning_rate": 4.976621858562245e-06,
      "loss": 32.7294,
      "step": 200,
      "true_loss": 4.1063
    },
    {
      "epoch": 0.3156441112645492,
      "eval_accuracy": 0.04656319290465632,
      "eval_loss": 3.9896063804626465,
      "eval_runtime": 14.875,
      "eval_samples_per_second": 60.639,
      "eval_steps_per_second": 7.597,
      "step": 200
    },
    {
      "epoch": 0.3314263168277767,
      "grad_norm": 69.24966334193799,
      "learning_rate": 4.94739918176505e-06,
      "loss": 32.8352,
      "step": 210,
      "true_loss": 3.9432
    },
    {
      "epoch": 0.34720852239100414,
      "grad_norm": 84.67835268794279,
      "learning_rate": 4.9181765049678555e-06,
      "loss": 32.6025,
      "step": 220,
      "true_loss": 4.1811
    },
    {
      "epoch": 0.3629907279542316,
      "grad_norm": 62.265769584137104,
      "learning_rate": 4.888953828170661e-06,
      "loss": 33.0571,
      "step": 230,
      "true_loss": 4.1072
    },
    {
      "epoch": 0.37877293351745905,
      "grad_norm": 54.79839495974889,
      "learning_rate": 4.859731151373466e-06,
      "loss": 32.5185,
      "step": 240,
      "true_loss": 4.0185
    },
    {
      "epoch": 0.3945551390806865,
      "grad_norm": 60.84068430822686,
      "learning_rate": 4.830508474576272e-06,
      "loss": 32.4647,
      "step": 250,
      "true_loss": 4.1141
    },
    {
      "epoch": 0.3945551390806865,
      "eval_accuracy": 0.057649667405764965,
      "eval_loss": 3.965649366378784,
      "eval_runtime": 14.894,
      "eval_samples_per_second": 60.561,
      "eval_steps_per_second": 7.587,
      "step": 250
    },
    {
      "epoch": 0.41033734464391397,
      "grad_norm": 69.10630072542976,
      "learning_rate": 4.801285797779077e-06,
      "loss": 32.716,
      "step": 260,
      "true_loss": 4.0852
    },
    {
      "epoch": 0.42611955020714143,
      "grad_norm": 69.30562697285451,
      "learning_rate": 4.772063120981883e-06,
      "loss": 32.3945,
      "step": 270,
      "true_loss": 4.04
    },
    {
      "epoch": 0.4419017557703689,
      "grad_norm": 61.70708790424729,
      "learning_rate": 4.742840444184687e-06,
      "loss": 31.8389,
      "step": 280,
      "true_loss": 3.9424
    },
    {
      "epoch": 0.45768396133359635,
      "grad_norm": 63.553230156261236,
      "learning_rate": 4.713617767387494e-06,
      "loss": 32.2705,
      "step": 290,
      "true_loss": 4.0162
    },
    {
      "epoch": 0.4734661668968238,
      "grad_norm": 66.18745807155665,
      "learning_rate": 4.684395090590298e-06,
      "loss": 32.2542,
      "step": 300,
      "true_loss": 4.1658
    },
    {
      "epoch": 0.4734661668968238,
      "eval_accuracy": 0.05432372505543237,
      "eval_loss": 3.950526714324951,
      "eval_runtime": 14.9818,
      "eval_samples_per_second": 60.206,
      "eval_steps_per_second": 7.542,
      "step": 300
    },
    {
      "epoch": 0.4892483724600513,
      "grad_norm": 58.57848936978391,
      "learning_rate": 4.655172413793104e-06,
      "loss": 32.678,
      "step": 310,
      "true_loss": 3.9291
    },
    {
      "epoch": 0.5050305780232788,
      "grad_norm": 61.33442223889434,
      "learning_rate": 4.625949736995909e-06,
      "loss": 32.1995,
      "step": 320,
      "true_loss": 4.1149
    },
    {
      "epoch": 0.5208127835865062,
      "grad_norm": 64.64890658388506,
      "learning_rate": 4.596727060198715e-06,
      "loss": 32.6549,
      "step": 330,
      "true_loss": 4.2883
    },
    {
      "epoch": 0.5365949891497337,
      "grad_norm": 67.05902120899448,
      "learning_rate": 4.56750438340152e-06,
      "loss": 32.4256,
      "step": 340,
      "true_loss": 4.1349
    },
    {
      "epoch": 0.5523771947129611,
      "grad_norm": 56.47408736066254,
      "learning_rate": 4.5382817066043256e-06,
      "loss": 32.9031,
      "step": 350,
      "true_loss": 4.0859
    },
    {
      "epoch": 0.5523771947129611,
      "eval_accuracy": 0.04878048780487805,
      "eval_loss": 3.9386215209960938,
      "eval_runtime": 14.7498,
      "eval_samples_per_second": 61.153,
      "eval_steps_per_second": 7.661,
      "step": 350
    },
    {
      "epoch": 0.5681594002761886,
      "grad_norm": 68.21711305531616,
      "learning_rate": 4.509059029807131e-06,
      "loss": 31.9884,
      "step": 360,
      "true_loss": 4.0053
    },
    {
      "epoch": 0.583941605839416,
      "grad_norm": 67.03237493940061,
      "learning_rate": 4.479836353009936e-06,
      "loss": 32.0414,
      "step": 370,
      "true_loss": 4.0486
    },
    {
      "epoch": 0.5997238114026435,
      "grad_norm": 62.81448024910035,
      "learning_rate": 4.450613676212742e-06,
      "loss": 31.6654,
      "step": 380,
      "true_loss": 3.9884
    },
    {
      "epoch": 0.6155060169658709,
      "grad_norm": 58.734278541487996,
      "learning_rate": 4.4213909994155465e-06,
      "loss": 32.1588,
      "step": 390,
      "true_loss": 4.1576
    },
    {
      "epoch": 0.6312882225290984,
      "grad_norm": 56.72282043276077,
      "learning_rate": 4.392168322618352e-06,
      "loss": 32.4645,
      "step": 400,
      "true_loss": 3.9613
    },
    {
      "epoch": 0.6312882225290984,
      "eval_accuracy": 0.08425720620842572,
      "eval_loss": 3.8948559761047363,
      "eval_runtime": 14.9154,
      "eval_samples_per_second": 60.474,
      "eval_steps_per_second": 7.576,
      "step": 400
    },
    {
      "epoch": 0.647070428092326,
      "grad_norm": 67.89360370302569,
      "learning_rate": 4.3629456458211574e-06,
      "loss": 32.2354,
      "step": 410,
      "true_loss": 3.9924
    },
    {
      "epoch": 0.6628526336555534,
      "grad_norm": 58.51107977573013,
      "learning_rate": 4.333722969023963e-06,
      "loss": 32.3006,
      "step": 420,
      "true_loss": 3.9875
    },
    {
      "epoch": 0.6786348392187809,
      "grad_norm": 66.3808988459889,
      "learning_rate": 4.304500292226768e-06,
      "loss": 32.2764,
      "step": 430,
      "true_loss": 3.9726
    },
    {
      "epoch": 0.6944170447820083,
      "grad_norm": 70.6833559958922,
      "learning_rate": 4.275277615429574e-06,
      "loss": 32.2592,
      "step": 440,
      "true_loss": 4.1439
    },
    {
      "epoch": 0.7101992503452358,
      "grad_norm": 63.3033292392276,
      "learning_rate": 4.246054938632379e-06,
      "loss": 32.362,
      "step": 450,
      "true_loss": 4.0236
    },
    {
      "epoch": 0.7101992503452358,
      "eval_accuracy": 0.07206208425720621,
      "eval_loss": 3.8847439289093018,
      "eval_runtime": 14.9499,
      "eval_samples_per_second": 60.335,
      "eval_steps_per_second": 7.559,
      "step": 450
    },
    {
      "epoch": 0.7259814559084632,
      "grad_norm": 72.67280180250647,
      "learning_rate": 4.216832261835184e-06,
      "loss": 32.2076,
      "step": 460,
      "true_loss": 4.1088
    },
    {
      "epoch": 0.7417636614716907,
      "grad_norm": 61.8016941961879,
      "learning_rate": 4.18760958503799e-06,
      "loss": 31.8356,
      "step": 470,
      "true_loss": 3.9524
    },
    {
      "epoch": 0.7575458670349181,
      "grad_norm": 71.60310974326192,
      "learning_rate": 4.158386908240795e-06,
      "loss": 32.4698,
      "step": 480,
      "true_loss": 3.9285
    },
    {
      "epoch": 0.7733280725981456,
      "grad_norm": 63.81105183121505,
      "learning_rate": 4.1291642314436e-06,
      "loss": 32.4375,
      "step": 490,
      "true_loss": 4.0445
    },
    {
      "epoch": 0.789110278161373,
      "grad_norm": 66.81078306732113,
      "learning_rate": 4.0999415546464065e-06,
      "loss": 31.6606,
      "step": 500,
      "true_loss": 3.912
    },
    {
      "epoch": 0.789110278161373,
      "eval_accuracy": 0.08869179600886919,
      "eval_loss": 3.842041492462158,
      "eval_runtime": 14.9881,
      "eval_samples_per_second": 60.181,
      "eval_steps_per_second": 7.539,
      "step": 500
    },
    {
      "epoch": 0.8048924837246005,
      "grad_norm": 66.51737872080436,
      "learning_rate": 4.070718877849211e-06,
      "loss": 32.6594,
      "step": 510,
      "true_loss": 4.0515
    },
    {
      "epoch": 0.8206746892878279,
      "grad_norm": 102.93145951917813,
      "learning_rate": 4.0414962010520166e-06,
      "loss": 32.0275,
      "step": 520,
      "true_loss": 4.1003
    },
    {
      "epoch": 0.8364568948510555,
      "grad_norm": 75.53198463360987,
      "learning_rate": 4.012273524254822e-06,
      "loss": 32.2893,
      "step": 530,
      "true_loss": 4.0965
    },
    {
      "epoch": 0.8522391004142829,
      "grad_norm": 76.07281655644753,
      "learning_rate": 3.9830508474576275e-06,
      "loss": 32.2373,
      "step": 540,
      "true_loss": 3.9613
    },
    {
      "epoch": 0.8680213059775104,
      "grad_norm": 72.30674725635646,
      "learning_rate": 3.953828170660433e-06,
      "loss": 32.1916,
      "step": 550,
      "true_loss": 3.9895
    },
    {
      "epoch": 0.8680213059775104,
      "eval_accuracy": 0.10975609756097561,
      "eval_loss": 3.7870140075683594,
      "eval_runtime": 14.8068,
      "eval_samples_per_second": 60.918,
      "eval_steps_per_second": 7.632,
      "step": 550
    },
    {
      "epoch": 0.8838035115407378,
      "grad_norm": 77.32202458254106,
      "learning_rate": 3.924605493863238e-06,
      "loss": 31.7419,
      "step": 560,
      "true_loss": 3.9115
    },
    {
      "epoch": 0.8995857171039653,
      "grad_norm": 78.63719989484625,
      "learning_rate": 3.895382817066044e-06,
      "loss": 32.5222,
      "step": 570,
      "true_loss": 4.0513
    },
    {
      "epoch": 0.9153679226671927,
      "grad_norm": 79.29136261002967,
      "learning_rate": 3.8661601402688484e-06,
      "loss": 31.8769,
      "step": 580,
      "true_loss": 4.0132
    },
    {
      "epoch": 0.9311501282304202,
      "grad_norm": 77.29435282237034,
      "learning_rate": 3.836937463471655e-06,
      "loss": 31.7648,
      "step": 590,
      "true_loss": 3.9623
    },
    {
      "epoch": 0.9469323337936476,
      "grad_norm": 68.35155053663401,
      "learning_rate": 3.8077147866744598e-06,
      "loss": 31.884,
      "step": 600,
      "true_loss": 3.9455
    },
    {
      "epoch": 0.9469323337936476,
      "eval_accuracy": 0.11751662971175167,
      "eval_loss": 3.7514853477478027,
      "eval_runtime": 14.8368,
      "eval_samples_per_second": 60.795,
      "eval_steps_per_second": 7.616,
      "step": 600
    },
    {
      "epoch": 0.9627145393568751,
      "grad_norm": 73.94418735157363,
      "learning_rate": 3.7784921098772652e-06,
      "loss": 31.669,
      "step": 610,
      "true_loss": 3.9077
    },
    {
      "epoch": 0.9784967449201026,
      "grad_norm": 78.72012954254731,
      "learning_rate": 3.7492694330800707e-06,
      "loss": 32.1414,
      "step": 620,
      "true_loss": 3.8983
    },
    {
      "epoch": 0.99427895048333,
      "grad_norm": 83.38578257414015,
      "learning_rate": 3.7200467562828757e-06,
      "loss": 32.0129,
      "step": 630,
      "true_loss": 3.9648
    },
    {
      "epoch": 1.0094693233379364,
      "grad_norm": 79.30562934714216,
      "learning_rate": 3.6908240794856816e-06,
      "loss": 30.8218,
      "step": 640,
      "true_loss": 4.0426
    },
    {
      "epoch": 1.0252515289011639,
      "grad_norm": 86.51493702246239,
      "learning_rate": 3.6616014026884866e-06,
      "loss": 31.7275,
      "step": 650,
      "true_loss": 4.0234
    },
    {
      "epoch": 1.0252515289011639,
      "eval_accuracy": 0.11862527716186252,
      "eval_loss": 3.7260189056396484,
      "eval_runtime": 14.9748,
      "eval_samples_per_second": 60.234,
      "eval_steps_per_second": 7.546,
      "step": 650
    },
    {
      "epoch": 1.0410337344643914,
      "grad_norm": 95.28223069807079,
      "learning_rate": 3.6323787258912916e-06,
      "loss": 31.1501,
      "step": 660,
      "true_loss": 4.1224
    },
    {
      "epoch": 1.056815940027619,
      "grad_norm": 95.66233260675281,
      "learning_rate": 3.6031560490940975e-06,
      "loss": 31.3991,
      "step": 670,
      "true_loss": 4.1524
    },
    {
      "epoch": 1.0725981455908462,
      "grad_norm": 105.1106515406413,
      "learning_rate": 3.5739333722969025e-06,
      "loss": 31.4503,
      "step": 680,
      "true_loss": 4.02
    },
    {
      "epoch": 1.0883803511540737,
      "grad_norm": 107.36932762546802,
      "learning_rate": 3.544710695499708e-06,
      "loss": 31.0981,
      "step": 690,
      "true_loss": 3.7185
    },
    {
      "epoch": 1.1041625567173012,
      "grad_norm": 102.49971541043875,
      "learning_rate": 3.5154880187025135e-06,
      "loss": 31.0649,
      "step": 700,
      "true_loss": 3.9306
    },
    {
      "epoch": 1.1041625567173012,
      "eval_accuracy": 0.13082039911308205,
      "eval_loss": 3.679271697998047,
      "eval_runtime": 14.9433,
      "eval_samples_per_second": 60.362,
      "eval_steps_per_second": 7.562,
      "step": 700
    },
    {
      "epoch": 1.1199447622805287,
      "grad_norm": 101.73957570633324,
      "learning_rate": 3.486265341905319e-06,
      "loss": 30.7209,
      "step": 710,
      "true_loss": 3.7747
    },
    {
      "epoch": 1.1357269678437563,
      "grad_norm": 94.73479469843072,
      "learning_rate": 3.457042665108124e-06,
      "loss": 31.5505,
      "step": 720,
      "true_loss": 3.7559
    },
    {
      "epoch": 1.1515091734069836,
      "grad_norm": 95.84840897528187,
      "learning_rate": 3.42781998831093e-06,
      "loss": 31.1915,
      "step": 730,
      "true_loss": 3.9457
    },
    {
      "epoch": 1.167291378970211,
      "grad_norm": 91.92412320762536,
      "learning_rate": 3.398597311513735e-06,
      "loss": 30.6652,
      "step": 740,
      "true_loss": 4.0984
    },
    {
      "epoch": 1.1830735845334386,
      "grad_norm": 97.13718067393098,
      "learning_rate": 3.3693746347165403e-06,
      "loss": 31.3627,
      "step": 750,
      "true_loss": 3.9014
    },
    {
      "epoch": 1.1830735845334386,
      "eval_accuracy": 0.14523281596452328,
      "eval_loss": 3.621267080307007,
      "eval_runtime": 15.0066,
      "eval_samples_per_second": 60.107,
      "eval_steps_per_second": 7.53,
      "step": 750
    },
    {
      "epoch": 1.198855790096666,
      "grad_norm": 102.94703309653879,
      "learning_rate": 3.3401519579193458e-06,
      "loss": 30.9646,
      "step": 760,
      "true_loss": 3.888
    },
    {
      "epoch": 1.2146379956598934,
      "grad_norm": 92.09108955881567,
      "learning_rate": 3.310929281122151e-06,
      "loss": 31.3938,
      "step": 770,
      "true_loss": 3.9565
    },
    {
      "epoch": 1.230420201223121,
      "grad_norm": 102.8940703497217,
      "learning_rate": 3.2817066043249562e-06,
      "loss": 30.6722,
      "step": 780,
      "true_loss": 3.7934
    },
    {
      "epoch": 1.2462024067863484,
      "grad_norm": 109.37649388601628,
      "learning_rate": 3.252483927527762e-06,
      "loss": 30.8783,
      "step": 790,
      "true_loss": 3.9625
    },
    {
      "epoch": 1.261984612349576,
      "grad_norm": 97.09349910497292,
      "learning_rate": 3.223261250730567e-06,
      "loss": 31.1443,
      "step": 800,
      "true_loss": 3.8638
    },
    {
      "epoch": 1.261984612349576,
      "eval_accuracy": 0.15077605321507762,
      "eval_loss": 3.628795862197876,
      "eval_runtime": 14.9076,
      "eval_samples_per_second": 60.506,
      "eval_steps_per_second": 7.58,
      "step": 800
    },
    {
      "epoch": 1.2777668179128034,
      "grad_norm": 93.73265189126965,
      "learning_rate": 3.194038573933372e-06,
      "loss": 31.4319,
      "step": 810,
      "true_loss": 4.0132
    },
    {
      "epoch": 1.2935490234760307,
      "grad_norm": 95.06539169984532,
      "learning_rate": 3.164815897136178e-06,
      "loss": 30.1564,
      "step": 820,
      "true_loss": 3.7849
    },
    {
      "epoch": 1.3093312290392582,
      "grad_norm": 104.78400272530634,
      "learning_rate": 3.135593220338983e-06,
      "loss": 31.4396,
      "step": 830,
      "true_loss": 3.9531
    },
    {
      "epoch": 1.3251134346024858,
      "grad_norm": 100.60674522449192,
      "learning_rate": 3.1063705435417885e-06,
      "loss": 31.0772,
      "step": 840,
      "true_loss": 3.9851
    },
    {
      "epoch": 1.340895640165713,
      "grad_norm": 92.96501685915857,
      "learning_rate": 3.0771478667445944e-06,
      "loss": 31.258,
      "step": 850,
      "true_loss": 4.0154
    },
    {
      "epoch": 1.340895640165713,
      "eval_accuracy": 0.15299334811529933,
      "eval_loss": 3.602060556411743,
      "eval_runtime": 14.8867,
      "eval_samples_per_second": 60.591,
      "eval_steps_per_second": 7.591,
      "step": 850
    },
    {
      "epoch": 1.3566778457289406,
      "grad_norm": 108.12958936489977,
      "learning_rate": 3.0479251899473994e-06,
      "loss": 30.7919,
      "step": 860,
      "true_loss": 4.071
    },
    {
      "epoch": 1.372460051292168,
      "grad_norm": 106.38226656410994,
      "learning_rate": 3.0187025131502045e-06,
      "loss": 31.106,
      "step": 870,
      "true_loss": 3.8771
    },
    {
      "epoch": 1.3882422568553956,
      "grad_norm": 108.70900740908084,
      "learning_rate": 2.9894798363530103e-06,
      "loss": 31.1107,
      "step": 880,
      "true_loss": 4.0616
    },
    {
      "epoch": 1.404024462418623,
      "grad_norm": 96.84422602253281,
      "learning_rate": 2.9602571595558154e-06,
      "loss": 31.1303,
      "step": 890,
      "true_loss": 3.9508
    },
    {
      "epoch": 1.4198066679818504,
      "grad_norm": 107.15877482205376,
      "learning_rate": 2.931034482758621e-06,
      "loss": 31.3023,
      "step": 900,
      "true_loss": 3.6901
    },
    {
      "epoch": 1.4198066679818504,
      "eval_accuracy": 0.14412416851441243,
      "eval_loss": 3.599184989929199,
      "eval_runtime": 15.0273,
      "eval_samples_per_second": 60.024,
      "eval_steps_per_second": 7.52,
      "step": 900
    },
    {
      "epoch": 1.435588873545078,
      "grad_norm": 111.56871878652656,
      "learning_rate": 2.9018118059614263e-06,
      "loss": 31.9703,
      "step": 910,
      "true_loss": 3.8989
    },
    {
      "epoch": 1.4513710791083054,
      "grad_norm": 95.57031718308956,
      "learning_rate": 2.8725891291642317e-06,
      "loss": 30.6978,
      "step": 920,
      "true_loss": 3.9814
    },
    {
      "epoch": 1.4671532846715327,
      "grad_norm": 104.66292767084798,
      "learning_rate": 2.8433664523670368e-06,
      "loss": 30.8414,
      "step": 930,
      "true_loss": 3.6811
    },
    {
      "epoch": 1.4829354902347602,
      "grad_norm": 109.58289564213575,
      "learning_rate": 2.8141437755698426e-06,
      "loss": 30.8233,
      "step": 940,
      "true_loss": 3.8887
    },
    {
      "epoch": 1.4987176957979877,
      "grad_norm": 109.2690047474217,
      "learning_rate": 2.7849210987726477e-06,
      "loss": 30.8807,
      "step": 950,
      "true_loss": 3.8297
    },
    {
      "epoch": 1.4987176957979877,
      "eval_accuracy": 0.14523281596452328,
      "eval_loss": 3.5658605098724365,
      "eval_runtime": 14.9659,
      "eval_samples_per_second": 60.27,
      "eval_steps_per_second": 7.55,
      "step": 950
    },
    {
      "epoch": 1.5144999013612153,
      "grad_norm": 111.94119193959217,
      "learning_rate": 2.7556984219754535e-06,
      "loss": 30.3332,
      "step": 960,
      "true_loss": 3.709
    },
    {
      "epoch": 1.5302821069244428,
      "grad_norm": 123.39562082481231,
      "learning_rate": 2.7264757451782586e-06,
      "loss": 31.1628,
      "step": 970,
      "true_loss": 4.1074
    },
    {
      "epoch": 1.5460643124876703,
      "grad_norm": 105.28551916442693,
      "learning_rate": 2.697253068381064e-06,
      "loss": 30.8197,
      "step": 980,
      "true_loss": 3.6786
    },
    {
      "epoch": 1.5618465180508976,
      "grad_norm": 111.08402051996337,
      "learning_rate": 2.6680303915838695e-06,
      "loss": 30.9562,
      "step": 990,
      "true_loss": 3.9186
    },
    {
      "epoch": 1.577628723614125,
      "grad_norm": 103.10179117331714,
      "learning_rate": 2.638807714786675e-06,
      "loss": 30.8469,
      "step": 1000,
      "true_loss": 3.6842
    },
    {
      "epoch": 1.577628723614125,
      "eval_accuracy": 0.1629711751662971,
      "eval_loss": 3.5389108657836914,
      "eval_runtime": 14.9677,
      "eval_samples_per_second": 60.263,
      "eval_steps_per_second": 7.55,
      "step": 1000
    },
    {
      "epoch": 1.5934109291773524,
      "grad_norm": 104.7664363387938,
      "learning_rate": 2.60958503798948e-06,
      "loss": 30.8959,
      "step": 1010,
      "true_loss": 3.8303
    },
    {
      "epoch": 1.60919313474058,
      "grad_norm": 108.11532700366512,
      "learning_rate": 2.580362361192286e-06,
      "loss": 30.822,
      "step": 1020,
      "true_loss": 4.0791
    },
    {
      "epoch": 1.6249753403038074,
      "grad_norm": 122.96775922789799,
      "learning_rate": 2.551139684395091e-06,
      "loss": 30.4368,
      "step": 1030,
      "true_loss": 3.8672
    },
    {
      "epoch": 1.640757545867035,
      "grad_norm": 111.32549428947904,
      "learning_rate": 2.521917007597896e-06,
      "loss": 30.7161,
      "step": 1040,
      "true_loss": 3.9227
    },
    {
      "epoch": 1.6565397514302624,
      "grad_norm": 105.3879178888327,
      "learning_rate": 2.4926943308007014e-06,
      "loss": 30.6919,
      "step": 1050,
      "true_loss": 3.8327
    },
    {
      "epoch": 1.6565397514302624,
      "eval_accuracy": 0.15964523281596452,
      "eval_loss": 3.5158631801605225,
      "eval_runtime": 15.0083,
      "eval_samples_per_second": 60.1,
      "eval_steps_per_second": 7.529,
      "step": 1050
    },
    {
      "epoch": 1.67232195699349,
      "grad_norm": 118.06824634956254,
      "learning_rate": 2.463471654003507e-06,
      "loss": 30.4308,
      "step": 1060,
      "true_loss": 3.718
    },
    {
      "epoch": 1.6881041625567172,
      "grad_norm": 116.61755089060757,
      "learning_rate": 2.4342489772063123e-06,
      "loss": 30.5914,
      "step": 1070,
      "true_loss": 3.9526
    },
    {
      "epoch": 1.7038863681199448,
      "grad_norm": 115.56434553036443,
      "learning_rate": 2.4050263004091177e-06,
      "loss": 30.5987,
      "step": 1080,
      "true_loss": 3.9159
    },
    {
      "epoch": 1.7196685736831723,
      "grad_norm": 112.33303898638984,
      "learning_rate": 2.375803623611923e-06,
      "loss": 31.0096,
      "step": 1090,
      "true_loss": 3.8114
    },
    {
      "epoch": 1.7354507792463996,
      "grad_norm": 112.21346249165515,
      "learning_rate": 2.3465809468147286e-06,
      "loss": 30.7385,
      "step": 1100,
      "true_loss": 3.8687
    },
    {
      "epoch": 1.7354507792463996,
      "eval_accuracy": 0.1696230598669623,
      "eval_loss": 3.5062079429626465,
      "eval_runtime": 14.8938,
      "eval_samples_per_second": 60.562,
      "eval_steps_per_second": 7.587,
      "step": 1100
    },
    {
      "epoch": 1.751232984809627,
      "grad_norm": 122.73525997559868,
      "learning_rate": 2.3173582700175337e-06,
      "loss": 30.5289,
      "step": 1110,
      "true_loss": 3.8205
    },
    {
      "epoch": 1.7670151903728546,
      "grad_norm": 119.27018553105627,
      "learning_rate": 2.288135593220339e-06,
      "loss": 30.1239,
      "step": 1120,
      "true_loss": 3.9501
    },
    {
      "epoch": 1.782797395936082,
      "grad_norm": 119.29842149378848,
      "learning_rate": 2.2589129164231446e-06,
      "loss": 30.4032,
      "step": 1130,
      "true_loss": 3.8006
    },
    {
      "epoch": 1.7985796014993096,
      "grad_norm": 126.27340829379088,
      "learning_rate": 2.22969023962595e-06,
      "loss": 30.8506,
      "step": 1140,
      "true_loss": 3.8388
    },
    {
      "epoch": 1.8143618070625371,
      "grad_norm": 118.22873644516751,
      "learning_rate": 2.2004675628287555e-06,
      "loss": 30.742,
      "step": 1150,
      "true_loss": 3.9507
    },
    {
      "epoch": 1.8143618070625371,
      "eval_accuracy": 0.1607538802660754,
      "eval_loss": 3.498107433319092,
      "eval_runtime": 15.0542,
      "eval_samples_per_second": 59.917,
      "eval_steps_per_second": 7.506,
      "step": 1150
    },
    {
      "epoch": 1.8301440126257644,
      "grad_norm": 115.24942636463535,
      "learning_rate": 2.171244886031561e-06,
      "loss": 30.2917,
      "step": 1160,
      "true_loss": 3.854
    },
    {
      "epoch": 1.845926218188992,
      "grad_norm": 120.31076182133407,
      "learning_rate": 2.142022209234366e-06,
      "loss": 30.98,
      "step": 1170,
      "true_loss": 4.0372
    },
    {
      "epoch": 1.8617084237522192,
      "grad_norm": 114.03103310407008,
      "learning_rate": 2.1127995324371714e-06,
      "loss": 31.2044,
      "step": 1180,
      "true_loss": 3.7539
    },
    {
      "epoch": 1.8774906293154467,
      "grad_norm": 112.15849385918017,
      "learning_rate": 2.083576855639977e-06,
      "loss": 30.6597,
      "step": 1190,
      "true_loss": 3.6715
    },
    {
      "epoch": 1.8932728348786743,
      "grad_norm": 139.58809345884583,
      "learning_rate": 2.054354178842782e-06,
      "loss": 30.6171,
      "step": 1200,
      "true_loss": 3.7479
    },
    {
      "epoch": 1.8932728348786743,
      "eval_accuracy": 0.16186252771618626,
      "eval_loss": 3.4824934005737305,
      "eval_runtime": 14.975,
      "eval_samples_per_second": 60.234,
      "eval_steps_per_second": 7.546,
      "step": 1200
    },
    {
      "epoch": 1.9090550404419018,
      "grad_norm": 126.47013176461778,
      "learning_rate": 2.0251315020455873e-06,
      "loss": 30.4756,
      "step": 1210,
      "true_loss": 3.8482
    },
    {
      "epoch": 1.9248372460051293,
      "grad_norm": 129.40438104945616,
      "learning_rate": 1.995908825248393e-06,
      "loss": 30.6922,
      "step": 1220,
      "true_loss": 3.7344
    },
    {
      "epoch": 1.9406194515683568,
      "grad_norm": 117.04258986010086,
      "learning_rate": 1.9666861484511982e-06,
      "loss": 30.2474,
      "step": 1230,
      "true_loss": 3.746
    },
    {
      "epoch": 1.956401657131584,
      "grad_norm": 115.62691180569766,
      "learning_rate": 1.9374634716540037e-06,
      "loss": 30.1561,
      "step": 1240,
      "true_loss": 3.9135
    },
    {
      "epoch": 1.9721838626948116,
      "grad_norm": 125.90746499326437,
      "learning_rate": 1.908240794856809e-06,
      "loss": 30.4126,
      "step": 1250,
      "true_loss": 3.8586
    },
    {
      "epoch": 1.9721838626948116,
      "eval_accuracy": 0.164079822616408,
      "eval_loss": 3.4665002822875977,
      "eval_runtime": 14.9177,
      "eval_samples_per_second": 60.465,
      "eval_steps_per_second": 7.575,
      "step": 1250
    },
    {
      "epoch": 1.987966068258039,
      "grad_norm": 125.7071274298596,
      "learning_rate": 1.8790181180596146e-06,
      "loss": 30.3615,
      "step": 1260,
      "true_loss": 3.9108
    },
    {
      "epoch": 2.0031564411126457,
      "grad_norm": 119.68428766484261,
      "learning_rate": 1.8497954412624196e-06,
      "loss": 28.5842,
      "step": 1270,
      "true_loss": 3.5793
    },
    {
      "epoch": 2.0189386466758728,
      "grad_norm": 142.95432081434146,
      "learning_rate": 1.820572764465225e-06,
      "loss": 29.1367,
      "step": 1280,
      "true_loss": 3.7706
    },
    {
      "epoch": 2.0347208522391003,
      "grad_norm": 160.24172016677548,
      "learning_rate": 1.7913500876680305e-06,
      "loss": 29.5544,
      "step": 1290,
      "true_loss": 3.548
    },
    {
      "epoch": 2.0505030578023278,
      "grad_norm": 140.3505056595712,
      "learning_rate": 1.7621274108708358e-06,
      "loss": 29.5142,
      "step": 1300,
      "true_loss": 3.7087
    },
    {
      "epoch": 2.0505030578023278,
      "eval_accuracy": 0.16518847006651885,
      "eval_loss": 3.4521355628967285,
      "eval_runtime": 14.8455,
      "eval_samples_per_second": 60.759,
      "eval_steps_per_second": 7.612,
      "step": 1300
    },
    {
      "epoch": 2.0662852633655553,
      "grad_norm": 135.81575482285095,
      "learning_rate": 1.7329047340736412e-06,
      "loss": 29.5029,
      "step": 1310,
      "true_loss": 3.6682
    },
| { | |
| "epoch": 2.082067468928783, | |
| "grad_norm": 143.74657310067929, | |
| "learning_rate": 1.7036820572764467e-06, | |
| "loss": 29.6016, | |
| "step": 1320, | |
| "true_loss": 3.5355 | |
| }, | |
| { | |
| "epoch": 2.0978496744920103, | |
| "grad_norm": 137.74730157928755, | |
| "learning_rate": 1.674459380479252e-06, | |
| "loss": 29.1209, | |
| "step": 1330, | |
| "true_loss": 3.4128 | |
| }, | |
| { | |
| "epoch": 2.113631880055238, | |
| "grad_norm": 147.5729345169822, | |
| "learning_rate": 1.6452367036820574e-06, | |
| "loss": 29.7667, | |
| "step": 1340, | |
| "true_loss": 3.5826 | |
| }, | |
| { | |
| "epoch": 2.1294140856184653, | |
| "grad_norm": 149.77371005154865, | |
| "learning_rate": 1.6160140268848628e-06, | |
| "loss": 29.3551, | |
| "step": 1350, | |
| "true_loss": 3.7582 | |
| }, | |
| { | |
| "epoch": 2.1294140856184653, | |
| "eval_accuracy": 0.18292682926829268, | |
| "eval_loss": 3.432112693786621, | |
| "eval_runtime": 14.9865, | |
| "eval_samples_per_second": 60.187, | |
| "eval_steps_per_second": 7.54, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.1451962911816924, | |
| "grad_norm": 147.80741608555775, | |
| "learning_rate": 1.586791350087668e-06, | |
| "loss": 29.2982, | |
| "step": 1360, | |
| "true_loss": 3.4947 | |
| }, | |
| { | |
| "epoch": 2.16097849674492, | |
| "grad_norm": 145.1261208124329, | |
| "learning_rate": 1.5575686732904735e-06, | |
| "loss": 28.7471, | |
| "step": 1370, | |
| "true_loss": 3.4964 | |
| }, | |
| { | |
| "epoch": 2.1767607023081474, | |
| "grad_norm": 152.29650206925518, | |
| "learning_rate": 1.528345996493279e-06, | |
| "loss": 29.8086, | |
| "step": 1380, | |
| "true_loss": 3.724 | |
| }, | |
| { | |
| "epoch": 2.192542907871375, | |
| "grad_norm": 161.67922175813698, | |
| "learning_rate": 1.4991233196960842e-06, | |
| "loss": 29.5229, | |
| "step": 1390, | |
| "true_loss": 3.8039 | |
| }, | |
| { | |
| "epoch": 2.2083251134346025, | |
| "grad_norm": 155.22142131612623, | |
| "learning_rate": 1.4699006428988897e-06, | |
| "loss": 29.5038, | |
| "step": 1400, | |
| "true_loss": 3.5815 | |
| }, | |
| { | |
| "epoch": 2.2083251134346025, | |
| "eval_accuracy": 0.17960088691796008, | |
| "eval_loss": 3.440061569213867, | |
| "eval_runtime": 14.9796, | |
| "eval_samples_per_second": 60.215, | |
| "eval_steps_per_second": 7.544, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.22410731899783, | |
| "grad_norm": 156.53973755104687, | |
| "learning_rate": 1.4406779661016951e-06, | |
| "loss": 29.2715, | |
| "step": 1410, | |
| "true_loss": 3.6531 | |
| }, | |
| { | |
| "epoch": 2.2398895245610575, | |
| "grad_norm": 149.15210915040223, | |
| "learning_rate": 1.4114552893045006e-06, | |
| "loss": 29.6691, | |
| "step": 1420, | |
| "true_loss": 3.9184 | |
| }, | |
| { | |
| "epoch": 2.255671730124285, | |
| "grad_norm": 157.87160672579, | |
| "learning_rate": 1.3822326125073058e-06, | |
| "loss": 29.8417, | |
| "step": 1430, | |
| "true_loss": 3.6847 | |
| }, | |
| { | |
| "epoch": 2.2714539356875125, | |
| "grad_norm": 151.1446502317595, | |
| "learning_rate": 1.3530099357101113e-06, | |
| "loss": 30.3948, | |
| "step": 1440, | |
| "true_loss": 3.7877 | |
| }, | |
| { | |
| "epoch": 2.28723614125074, | |
| "grad_norm": 177.46389269846443, | |
| "learning_rate": 1.3237872589129167e-06, | |
| "loss": 29.5136, | |
| "step": 1450, | |
| "true_loss": 3.7729 | |
| }, | |
| { | |
| "epoch": 2.28723614125074, | |
| "eval_accuracy": 0.1762749445676275, | |
| "eval_loss": 3.4319870471954346, | |
| "eval_runtime": 14.8171, | |
| "eval_samples_per_second": 60.876, | |
| "eval_steps_per_second": 7.626, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.303018346813967, | |
| "grad_norm": 177.0142136955459, | |
| "learning_rate": 1.2945645821157218e-06, | |
| "loss": 29.6728, | |
| "step": 1460, | |
| "true_loss": 3.9035 | |
| }, | |
| { | |
| "epoch": 2.3188005523771946, | |
| "grad_norm": 170.26022348801405, | |
| "learning_rate": 1.2653419053185272e-06, | |
| "loss": 29.6204, | |
| "step": 1470, | |
| "true_loss": 3.7586 | |
| }, | |
| { | |
| "epoch": 2.334582757940422, | |
| "grad_norm": 169.623587107286, | |
| "learning_rate": 1.2361192285213327e-06, | |
| "loss": 29.4771, | |
| "step": 1480, | |
| "true_loss": 3.8 | |
| }, | |
| { | |
| "epoch": 2.3503649635036497, | |
| "grad_norm": 159.42232132005338, | |
| "learning_rate": 1.2068965517241381e-06, | |
| "loss": 29.0484, | |
| "step": 1490, | |
| "true_loss": 3.6296 | |
| }, | |
| { | |
| "epoch": 2.366147169066877, | |
| "grad_norm": 168.47474555405154, | |
| "learning_rate": 1.1776738749269434e-06, | |
| "loss": 29.6276, | |
| "step": 1500, | |
| "true_loss": 3.6125 | |
| }, | |
| { | |
| "epoch": 2.366147169066877, | |
| "eval_accuracy": 0.18403547671840353, | |
| "eval_loss": 3.420041084289551, | |
| "eval_runtime": 14.9555, | |
| "eval_samples_per_second": 60.312, | |
| "eval_steps_per_second": 7.556, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.3819293746301047, | |
| "grad_norm": 162.07404618888037, | |
| "learning_rate": 1.1484511981297488e-06, | |
| "loss": 29.0954, | |
| "step": 1510, | |
| "true_loss": 3.5425 | |
| }, | |
| { | |
| "epoch": 2.397711580193332, | |
| "grad_norm": 181.11213446797987, | |
| "learning_rate": 1.1192285213325543e-06, | |
| "loss": 29.3659, | |
| "step": 1520, | |
| "true_loss": 3.5447 | |
| }, | |
| { | |
| "epoch": 2.4134937857565593, | |
| "grad_norm": 172.6165441680545, | |
| "learning_rate": 1.0900058445353595e-06, | |
| "loss": 28.6622, | |
| "step": 1530, | |
| "true_loss": 3.5335 | |
| }, | |
| { | |
| "epoch": 2.4292759913197868, | |
| "grad_norm": 178.73759767816594, | |
| "learning_rate": 1.0607831677381648e-06, | |
| "loss": 29.8517, | |
| "step": 1540, | |
| "true_loss": 3.6041 | |
| }, | |
| { | |
| "epoch": 2.4450581968830143, | |
| "grad_norm": 192.3719970092387, | |
| "learning_rate": 1.0315604909409702e-06, | |
| "loss": 28.9193, | |
| "step": 1550, | |
| "true_loss": 3.7661 | |
| }, | |
| { | |
| "epoch": 2.4450581968830143, | |
| "eval_accuracy": 0.188470066518847, | |
| "eval_loss": 3.4035346508026123, | |
| "eval_runtime": 14.961, | |
| "eval_samples_per_second": 60.29, | |
| "eval_steps_per_second": 7.553, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.460840402446242, | |
| "grad_norm": 177.67877867152768, | |
| "learning_rate": 1.0023378141437757e-06, | |
| "loss": 29.2984, | |
| "step": 1560, | |
| "true_loss": 3.6286 | |
| }, | |
| { | |
| "epoch": 2.4766226080094693, | |
| "grad_norm": 188.74386540748054, | |
| "learning_rate": 9.731151373465811e-07, | |
| "loss": 29.2941, | |
| "step": 1570, | |
| "true_loss": 3.6082 | |
| }, | |
| { | |
| "epoch": 2.492404813572697, | |
| "grad_norm": 188.53177150362492, | |
| "learning_rate": 9.438924605493864e-07, | |
| "loss": 29.2244, | |
| "step": 1580, | |
| "true_loss": 3.4747 | |
| }, | |
| { | |
| "epoch": 2.5081870191359243, | |
| "grad_norm": 177.0506666498904, | |
| "learning_rate": 9.146697837521917e-07, | |
| "loss": 28.9166, | |
| "step": 1590, | |
| "true_loss": 3.4376 | |
| }, | |
| { | |
| "epoch": 2.523969224699152, | |
| "grad_norm": 183.5215902444712, | |
| "learning_rate": 8.854471069549972e-07, | |
| "loss": 28.6856, | |
| "step": 1600, | |
| "true_loss": 3.6844 | |
| }, | |
| { | |
| "epoch": 2.523969224699152, | |
| "eval_accuracy": 0.18181818181818182, | |
| "eval_loss": 3.3976125717163086, | |
| "eval_runtime": 14.9189, | |
| "eval_samples_per_second": 60.46, | |
| "eval_steps_per_second": 7.574, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.5397514302623794, | |
| "grad_norm": 175.18878295898904, | |
| "learning_rate": 8.562244301578025e-07, | |
| "loss": 29.0664, | |
| "step": 1610, | |
| "true_loss": 3.738 | |
| }, | |
| { | |
| "epoch": 2.555533635825607, | |
| "grad_norm": 170.9586548154297, | |
| "learning_rate": 8.270017533606079e-07, | |
| "loss": 29.4792, | |
| "step": 1620, | |
| "true_loss": 3.5714 | |
| }, | |
| { | |
| "epoch": 2.571315841388834, | |
| "grad_norm": 184.16360272576958, | |
| "learning_rate": 7.977790765634133e-07, | |
| "loss": 29.5751, | |
| "step": 1630, | |
| "true_loss": 3.6899 | |
| }, | |
| { | |
| "epoch": 2.5870980469520615, | |
| "grad_norm": 177.57568678111596, | |
| "learning_rate": 7.685563997662187e-07, | |
| "loss": 28.9365, | |
| "step": 1640, | |
| "true_loss": 3.7683 | |
| }, | |
| { | |
| "epoch": 2.602880252515289, | |
| "grad_norm": 174.71239418729613, | |
| "learning_rate": 7.393337229690241e-07, | |
| "loss": 29.2765, | |
| "step": 1650, | |
| "true_loss": 3.5981 | |
| }, | |
| { | |
| "epoch": 2.602880252515289, | |
| "eval_accuracy": 0.18403547671840353, | |
| "eval_loss": 3.399111032485962, | |
| "eval_runtime": 14.8984, | |
| "eval_samples_per_second": 60.544, | |
| "eval_steps_per_second": 7.585, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.6186624580785165, | |
| "grad_norm": 191.0002268007269, | |
| "learning_rate": 7.101110461718295e-07, | |
| "loss": 29.0547, | |
| "step": 1660, | |
| "true_loss": 3.8121 | |
| }, | |
| { | |
| "epoch": 2.634444663641744, | |
| "grad_norm": 177.3682901257333, | |
| "learning_rate": 6.808883693746347e-07, | |
| "loss": 29.0442, | |
| "step": 1670, | |
| "true_loss": 3.5864 | |
| }, | |
| { | |
| "epoch": 2.6502268692049715, | |
| "grad_norm": 174.71007183431408, | |
| "learning_rate": 6.516656925774401e-07, | |
| "loss": 28.9077, | |
| "step": 1680, | |
| "true_loss": 3.3604 | |
| }, | |
| { | |
| "epoch": 2.6660090747681986, | |
| "grad_norm": 189.2465250317481, | |
| "learning_rate": 6.224430157802455e-07, | |
| "loss": 29.2019, | |
| "step": 1690, | |
| "true_loss": 3.5039 | |
| }, | |
| { | |
| "epoch": 2.681791280331426, | |
| "grad_norm": 177.62073060179011, | |
| "learning_rate": 5.93220338983051e-07, | |
| "loss": 29.229, | |
| "step": 1700, | |
| "true_loss": 3.5725 | |
| }, | |
| { | |
| "epoch": 2.681791280331426, | |
| "eval_accuracy": 0.18514412416851442, | |
| "eval_loss": 3.3881542682647705, | |
| "eval_runtime": 15.0332, | |
| "eval_samples_per_second": 60.001, | |
| "eval_steps_per_second": 7.517, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.6975734858946536, | |
| "grad_norm": 188.85721776025207, | |
| "learning_rate": 5.639976621858563e-07, | |
| "loss": 28.9533, | |
| "step": 1710, | |
| "true_loss": 3.7415 | |
| }, | |
| { | |
| "epoch": 2.713355691457881, | |
| "grad_norm": 177.46012380599524, | |
| "learning_rate": 5.347749853886616e-07, | |
| "loss": 29.1529, | |
| "step": 1720, | |
| "true_loss": 3.5611 | |
| }, | |
| { | |
| "epoch": 2.7291378970211086, | |
| "grad_norm": 188.11753895032865, | |
| "learning_rate": 5.05552308591467e-07, | |
| "loss": 29.3255, | |
| "step": 1730, | |
| "true_loss": 3.5425 | |
| }, | |
| { | |
| "epoch": 2.744920102584336, | |
| "grad_norm": 170.87638552649375, | |
| "learning_rate": 4.763296317942724e-07, | |
| "loss": 29.4073, | |
| "step": 1740, | |
| "true_loss": 3.6556 | |
| }, | |
| { | |
| "epoch": 2.7607023081475637, | |
| "grad_norm": 180.7284650297668, | |
| "learning_rate": 4.4710695499707774e-07, | |
| "loss": 29.8538, | |
| "step": 1750, | |
| "true_loss": 3.8687 | |
| }, | |
| { | |
| "epoch": 2.7607023081475637, | |
| "eval_accuracy": 0.19068736141906872, | |
| "eval_loss": 3.377656936645508, | |
| "eval_runtime": 14.9062, | |
| "eval_samples_per_second": 60.512, | |
| "eval_steps_per_second": 7.581, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.776484513710791, | |
| "grad_norm": 174.471909092309, | |
| "learning_rate": 4.1788427819988314e-07, | |
| "loss": 28.8045, | |
| "step": 1760, | |
| "true_loss": 3.6287 | |
| }, | |
| { | |
| "epoch": 2.7922667192740187, | |
| "grad_norm": 195.44928225469693, | |
| "learning_rate": 3.8866160140268854e-07, | |
| "loss": 28.9067, | |
| "step": 1770, | |
| "true_loss": 3.4542 | |
| }, | |
| { | |
| "epoch": 2.808048924837246, | |
| "grad_norm": 178.92804289511642, | |
| "learning_rate": 3.594389246054939e-07, | |
| "loss": 28.6787, | |
| "step": 1780, | |
| "true_loss": 3.6228 | |
| }, | |
| { | |
| "epoch": 2.8238311304004737, | |
| "grad_norm": 194.93339966871807, | |
| "learning_rate": 3.3021624780829924e-07, | |
| "loss": 28.9603, | |
| "step": 1790, | |
| "true_loss": 3.6706 | |
| }, | |
| { | |
| "epoch": 2.839613335963701, | |
| "grad_norm": 201.13665707365328, | |
| "learning_rate": 3.0099357101110464e-07, | |
| "loss": 28.8935, | |
| "step": 1800, | |
| "true_loss": 3.5731 | |
| }, | |
| { | |
| "epoch": 2.839613335963701, | |
| "eval_accuracy": 0.19401330376940132, | |
| "eval_loss": 3.3746535778045654, | |
| "eval_runtime": 14.9077, | |
| "eval_samples_per_second": 60.506, | |
| "eval_steps_per_second": 7.58, | |
| "step": 1800 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1902, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |