{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0294991090873093, "eval_steps": 500, "global_step": 5200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00019798059790140566, "grad_norm": 79.13713836669922, "learning_rate": 0.0, "llm_loss": 2.109175831079483, "loss": 12.4474, "loss_aux_layer_0": 1.01171875, "loss_aux_layer_1": 1.001953125, "loss_aux_layer_10": 0.986328125, "loss_aux_layer_11": 1.0, "loss_aux_layer_12": 1.03515625, "loss_aux_layer_13": 0.9990234375, "loss_aux_layer_14": 0.9755859375, "loss_aux_layer_15": 1.005859375, "loss_aux_layer_16": 1.0078125, "loss_aux_layer_17": 0.9853515625, "loss_aux_layer_18": 1.0078125, "loss_aux_layer_19": 1.015625, "loss_aux_layer_2": 0.9736328125, "loss_aux_layer_20": 0.9951171875, "loss_aux_layer_21": 0.9970703125, "loss_aux_layer_22": 1.015625, "loss_aux_layer_23": 1.015625, "loss_aux_layer_3": 1.009765625, "loss_aux_layer_4": 1.015625, "loss_aux_layer_5": 1.015625, "loss_aux_layer_6": 0.99609375, "loss_aux_layer_7": 0.9892578125, "loss_aux_layer_8": 1.009765625, "loss_aux_layer_9": 0.9990234375, "step": 1, "total_loss": 3.1118614077568054 }, { "epoch": 0.0003959611958028113, "grad_norm": 70.88375091552734, "learning_rate": 5.000000000000001e-07, "llm_loss": 2.102919638156891, "loss": 12.422, "loss_aux_layer_0": 1.01171875, "loss_aux_layer_1": 1.0, "loss_aux_layer_10": 0.986328125, "loss_aux_layer_11": 1.001953125, "loss_aux_layer_12": 1.03125, "loss_aux_layer_13": 0.998046875, "loss_aux_layer_14": 0.974609375, "loss_aux_layer_15": 1.00390625, "loss_aux_layer_16": 1.0078125, "loss_aux_layer_17": 0.986328125, "loss_aux_layer_18": 1.0078125, "loss_aux_layer_19": 1.015625, "loss_aux_layer_2": 0.97265625, "loss_aux_layer_20": 0.9970703125, "loss_aux_layer_21": 0.9970703125, "loss_aux_layer_22": 1.013671875, "loss_aux_layer_23": 1.015625, "loss_aux_layer_3": 1.009765625, "loss_aux_layer_4": 1.015625, "loss_aux_layer_5": 1.015625, "loss_aux_layer_6": 0.99609375, "loss_aux_layer_7": 0.990234375, "loss_aux_layer_8": 1.009765625, "loss_aux_layer_9": 0.9990234375, "step": 2, "total_loss": 3.1054983735084534 }, { "epoch": 0.000593941793704217, "grad_norm": 77.23517608642578, "learning_rate": 1.0000000000000002e-06, "llm_loss": 2.0600505471229553, "loss": 12.2499, "loss_aux_layer_0": 1.009765625, "loss_aux_layer_1": 0.9990234375, "loss_aux_layer_10": 0.9853515625, "loss_aux_layer_11": 1.0, "loss_aux_layer_12": 1.037109375, "loss_aux_layer_13": 1.0, "loss_aux_layer_14": 0.9755859375, "loss_aux_layer_15": 1.001953125, "loss_aux_layer_16": 1.0078125, "loss_aux_layer_17": 0.986328125, "loss_aux_layer_18": 1.0078125, "loss_aux_layer_19": 1.013671875, "loss_aux_layer_2": 0.974609375, "loss_aux_layer_20": 0.994140625, "loss_aux_layer_21": 0.99609375, "loss_aux_layer_22": 1.015625, "loss_aux_layer_23": 1.015625, "loss_aux_layer_3": 1.009765625, "loss_aux_layer_4": 1.015625, "loss_aux_layer_5": 1.015625, "loss_aux_layer_6": 0.99609375, "loss_aux_layer_7": 0.9892578125, "loss_aux_layer_8": 1.009765625, "loss_aux_layer_9": 0.998046875, "step": 3, "total_loss": 3.062476873397827 }, { "epoch": 0.0007919223916056226, "grad_norm": 96.46268463134766, "learning_rate": 1.5e-06, "llm_loss": 2.3467044830322266, "loss": 13.3971, "loss_aux_layer_0": 1.009765625, "loss_aux_layer_1": 0.9990234375, "loss_aux_layer_10": 0.986328125, "loss_aux_layer_11": 1.0, "loss_aux_layer_12": 1.03515625, "loss_aux_layer_13": 1.0, "loss_aux_layer_14": 0.9755859375, "loss_aux_layer_15": 1.005859375, "loss_aux_layer_16": 1.005859375, "loss_aux_layer_17": 0.986328125, "loss_aux_layer_18": 1.0078125, "loss_aux_layer_19": 1.015625, "loss_aux_layer_2": 0.974609375, "loss_aux_layer_20": 0.99609375, "loss_aux_layer_21": 0.9970703125, "loss_aux_layer_22": 1.015625, "loss_aux_layer_23": 1.015625, "loss_aux_layer_3": 1.0078125, "loss_aux_layer_4": 1.015625, "loss_aux_layer_5": 1.015625, "loss_aux_layer_6": 0.9951171875, "loss_aux_layer_7": 0.990234375, "loss_aux_layer_8": 1.01171875, "loss_aux_layer_9": 0.998046875, "step": 4, "total_loss": 3.349273145198822 }, { "epoch": 0.0009899029895070284, "grad_norm": 97.37342834472656, "learning_rate": 2.0000000000000003e-06, "llm_loss": 2.184664785861969, "loss": 12.7478, "loss_aux_layer_0": 1.009765625, "loss_aux_layer_1": 1.001953125, "loss_aux_layer_10": 0.984375, "loss_aux_layer_11": 1.001953125, "loss_aux_layer_12": 1.03125, "loss_aux_layer_13": 0.9990234375, "loss_aux_layer_14": 0.9755859375, "loss_aux_layer_15": 1.009765625, "loss_aux_layer_16": 1.0078125, "loss_aux_layer_17": 0.9853515625, "loss_aux_layer_18": 1.0078125, "loss_aux_layer_19": 1.013671875, "loss_aux_layer_2": 0.9716796875, "loss_aux_layer_20": 0.9951171875, "loss_aux_layer_21": 0.998046875, "loss_aux_layer_22": 1.015625, "loss_aux_layer_23": 1.015625, "loss_aux_layer_3": 1.009765625, "loss_aux_layer_4": 1.015625, "loss_aux_layer_5": 1.017578125, "loss_aux_layer_6": 0.9970703125, "loss_aux_layer_7": 0.98828125, "loss_aux_layer_8": 1.009765625, "loss_aux_layer_9": 0.998046875, "step": 5, "total_loss": 3.1869434118270874 }, { "epoch": 0.001187883587408434, "grad_norm": 109.4437484741211, "learning_rate": 2.5e-06, "llm_loss": 1.8981378078460693, "loss": 11.5998, "loss_aux_layer_0": 1.0078125, "loss_aux_layer_1": 1.0009765625, "loss_aux_layer_10": 0.9853515625, "loss_aux_layer_11": 1.0, "loss_aux_layer_12": 1.033203125, "loss_aux_layer_13": 0.9990234375, "loss_aux_layer_14": 0.974609375, "loss_aux_layer_15": 1.005859375, "loss_aux_layer_16": 1.0078125, "loss_aux_layer_17": 0.9853515625, "loss_aux_layer_18": 1.0078125, "loss_aux_layer_19": 1.015625, "loss_aux_layer_2": 0.97265625, "loss_aux_layer_20": 0.994140625, "loss_aux_layer_21": 0.998046875, "loss_aux_layer_22": 1.015625, "loss_aux_layer_23": 1.015625, "loss_aux_layer_3": 1.005859375, "loss_aux_layer_4": 1.015625, "loss_aux_layer_5": 1.017578125, "loss_aux_layer_6": 0.994140625, "loss_aux_layer_7": 0.98828125, "loss_aux_layer_8": 1.0078125, "loss_aux_layer_9": 0.99609375, "step": 6, "total_loss": 2.8999382853507996 }, { "epoch": 0.0013858641853098396, "grad_norm": 130.7167205810547, "learning_rate": 3e-06, "llm_loss": 1.8300187289714813, "loss": 11.3265, "loss_aux_layer_0": 1.009765625, "loss_aux_layer_1": 0.998046875, "loss_aux_layer_10": 0.984375, "loss_aux_layer_11": 1.001953125, "loss_aux_layer_12": 1.03125, "loss_aux_layer_13": 0.9990234375, "loss_aux_layer_14": 0.974609375, "loss_aux_layer_15": 1.005859375, "loss_aux_layer_16": 1.005859375, "loss_aux_layer_17": 0.984375, "loss_aux_layer_18": 1.0078125, "loss_aux_layer_19": 1.015625, "loss_aux_layer_2": 0.97265625, "loss_aux_layer_20": 0.9951171875, "loss_aux_layer_21": 0.99609375, "loss_aux_layer_22": 1.015625, "loss_aux_layer_23": 1.015625, "loss_aux_layer_3": 1.005859375, "loss_aux_layer_4": 1.015625, "loss_aux_layer_5": 1.015625, "loss_aux_layer_6": 0.994140625, "loss_aux_layer_7": 0.9873046875, "loss_aux_layer_8": 1.0078125, "loss_aux_layer_9": 0.9970703125, "step": 7, "total_loss": 2.8316158056259155 }, { "epoch": 0.0015838447832112453, "grad_norm": 83.78585815429688, "learning_rate": 3.5000000000000004e-06, "llm_loss": 1.459256649017334, "loss": 9.8334, "loss_aux_layer_0": 1.0078125, "loss_aux_layer_1": 0.994140625, "loss_aux_layer_10": 0.9833984375, "loss_aux_layer_11": 0.9970703125, "loss_aux_layer_12": 1.03125, "loss_aux_layer_13": 0.9951171875, "loss_aux_layer_14": 0.97265625, "loss_aux_layer_15": 1.001953125, "loss_aux_layer_16": 1.0, "loss_aux_layer_17": 0.982421875, "loss_aux_layer_18": 1.001953125, "loss_aux_layer_19": 1.015625, "loss_aux_layer_2": 0.9697265625, "loss_aux_layer_20": 0.990234375, "loss_aux_layer_21": 0.998046875, "loss_aux_layer_22": 1.013671875, "loss_aux_layer_23": 1.01953125, "loss_aux_layer_3": 1.0009765625, "loss_aux_layer_4": 1.0078125, "loss_aux_layer_5": 1.01171875, "loss_aux_layer_6": 0.9892578125, "loss_aux_layer_7": 0.98828125, "loss_aux_layer_8": 1.005859375, "loss_aux_layer_9": 0.9931640625, "step": 8, "total_loss": 2.458346366882324 }, { "epoch": 0.0017818253811126509, "grad_norm": 36.000709533691406, "learning_rate": 4.000000000000001e-06, "llm_loss": 1.4329922795295715, "loss": 9.7256, "loss_aux_layer_0": 1.00390625, "loss_aux_layer_1": 0.99609375, "loss_aux_layer_10": 0.9833984375, "loss_aux_layer_11": 0.998046875, "loss_aux_layer_12": 1.03125, "loss_aux_layer_13": 0.9931640625, "loss_aux_layer_14": 0.9716796875, "loss_aux_layer_15": 1.00390625, "loss_aux_layer_16": 1.0, "loss_aux_layer_17": 0.9775390625, "loss_aux_layer_18": 1.00390625, "loss_aux_layer_19": 1.0234375, "loss_aux_layer_2": 0.9677734375, "loss_aux_layer_20": 0.98828125, "loss_aux_layer_21": 0.9990234375, "loss_aux_layer_22": 1.009765625, "loss_aux_layer_23": 1.01953125, "loss_aux_layer_3": 0.9990234375, "loss_aux_layer_4": 1.0078125, "loss_aux_layer_5": 1.01171875, "loss_aux_layer_6": 0.98828125, "loss_aux_layer_7": 0.9853515625, "loss_aux_layer_8": 1.00390625, "loss_aux_layer_9": 0.9921875, "step": 9, "total_loss": 2.431390166282654 }, { "epoch": 0.0019798059790140567, "grad_norm": 44.851715087890625, "learning_rate": 4.5e-06, "llm_loss": 1.4961821138858795, "loss": 9.9461, "loss_aux_layer_0": 0.982421875, "loss_aux_layer_1": 0.986328125, "loss_aux_layer_10": 0.9765625, "loss_aux_layer_11": 0.994140625, "loss_aux_layer_12": 1.0234375, "loss_aux_layer_13": 0.9873046875, "loss_aux_layer_14": 0.96875, "loss_aux_layer_15": 1.00390625, "loss_aux_layer_16": 0.994140625, "loss_aux_layer_17": 0.9716796875, "loss_aux_layer_18": 1.0, "loss_aux_layer_19": 1.01953125, "loss_aux_layer_2": 0.9521484375, "loss_aux_layer_20": 0.9833984375, "loss_aux_layer_21": 0.994140625, "loss_aux_layer_22": 0.9990234375, "loss_aux_layer_23": 1.015625, "loss_aux_layer_3": 0.986328125, "loss_aux_layer_4": 0.99609375, "loss_aux_layer_5": 1.0, "loss_aux_layer_6": 0.974609375, "loss_aux_layer_7": 0.9755859375, "loss_aux_layer_8": 0.9951171875, "loss_aux_layer_9": 0.990234375, "step": 10, "total_loss": 2.4865183234214783 }, { "epoch": 0.002177786576915462, "grad_norm": 41.13898468017578, "learning_rate": 5e-06, "llm_loss": 1.3998808562755585, "loss": 9.556, "loss_aux_layer_0": 0.986328125, "loss_aux_layer_1": 0.98046875, "loss_aux_layer_10": 0.9736328125, "loss_aux_layer_11": 0.9921875, "loss_aux_layer_12": 1.0234375, "loss_aux_layer_13": 0.9853515625, "loss_aux_layer_14": 0.966796875, "loss_aux_layer_15": 1.001953125, "loss_aux_layer_16": 0.9921875, "loss_aux_layer_17": 0.97265625, "loss_aux_layer_18": 0.9970703125, "loss_aux_layer_19": 1.017578125, "loss_aux_layer_2": 0.9521484375, "loss_aux_layer_20": 0.98046875, "loss_aux_layer_21": 0.9921875, "loss_aux_layer_22": 1.0, "loss_aux_layer_23": 1.01171875, "loss_aux_layer_3": 0.9833984375, "loss_aux_layer_4": 0.9970703125, "loss_aux_layer_5": 0.998046875, "loss_aux_layer_6": 0.9716796875, "loss_aux_layer_7": 0.9775390625, "loss_aux_layer_8": 0.9951171875, "loss_aux_layer_9": 0.9853515625, "step": 11, "total_loss": 2.3889962434768677 }, { "epoch": 0.002375767174816868, "grad_norm": 21.822309494018555, "learning_rate": 5.500000000000001e-06, "llm_loss": 1.229336678981781, "loss": 8.8624, "loss_aux_layer_0": 0.9794921875, "loss_aux_layer_1": 0.9775390625, "loss_aux_layer_10": 0.97265625, "loss_aux_layer_11": 0.990234375, "loss_aux_layer_12": 1.0234375, "loss_aux_layer_13": 0.9833984375, "loss_aux_layer_14": 0.9638671875, "loss_aux_layer_15": 1.0, "loss_aux_layer_16": 0.990234375, "loss_aux_layer_17": 0.96875, "loss_aux_layer_18": 0.99609375, "loss_aux_layer_19": 1.017578125, "loss_aux_layer_2": 0.9482421875, "loss_aux_layer_20": 0.978515625, "loss_aux_layer_21": 0.990234375, "loss_aux_layer_22": 1.001953125, "loss_aux_layer_23": 1.0078125, "loss_aux_layer_3": 0.9765625, "loss_aux_layer_4": 0.9921875, "loss_aux_layer_5": 0.9931640625, "loss_aux_layer_6": 0.96875, "loss_aux_layer_7": 0.9736328125, "loss_aux_layer_8": 0.9921875, "loss_aux_layer_9": 0.9833984375, "step": 12, "total_loss": 2.2155935168266296 }, { "epoch": 0.0025737477727182734, "grad_norm": 52.72803497314453, "learning_rate": 6e-06, "llm_loss": 1.1271854639053345, "loss": 8.3144, "loss_aux_layer_0": 0.9189453125, "loss_aux_layer_1": 0.9296875, "loss_aux_layer_10": 0.9453125, "loss_aux_layer_11": 0.9541015625, "loss_aux_layer_12": 1.0, "loss_aux_layer_13": 0.953125, "loss_aux_layer_14": 0.9404296875, "loss_aux_layer_15": 0.9677734375, "loss_aux_layer_16": 0.9599609375, "loss_aux_layer_17": 0.9482421875, "loss_aux_layer_18": 0.970703125, "loss_aux_layer_19": 0.9931640625, "loss_aux_layer_2": 0.90234375, "loss_aux_layer_20": 0.9521484375, "loss_aux_layer_21": 0.9619140625, "loss_aux_layer_22": 0.9775390625, "loss_aux_layer_23": 0.986328125, "loss_aux_layer_3": 0.921875, "loss_aux_layer_4": 0.939453125, "loss_aux_layer_5": 0.94140625, "loss_aux_layer_6": 0.921875, "loss_aux_layer_7": 0.94921875, "loss_aux_layer_8": 0.955078125, "loss_aux_layer_9": 0.9443359375, "step": 13, "total_loss": 2.0786065459251404 }, { "epoch": 0.0027717283706196793, "grad_norm": 250.35055541992188, "learning_rate": 6.5000000000000004e-06, "llm_loss": 1.313174068927765, "loss": 9.1231, "loss_aux_layer_0": 0.9111328125, "loss_aux_layer_1": 0.927734375, "loss_aux_layer_10": 0.96484375, "loss_aux_layer_11": 0.9892578125, "loss_aux_layer_12": 1.001953125, "loss_aux_layer_13": 0.96875, "loss_aux_layer_14": 0.9599609375, "loss_aux_layer_15": 0.9970703125, "loss_aux_layer_16": 0.9951171875, "loss_aux_layer_17": 0.9599609375, "loss_aux_layer_18": 0.9814453125, "loss_aux_layer_19": 0.9873046875, "loss_aux_layer_2": 0.90625, "loss_aux_layer_20": 0.978515625, "loss_aux_layer_21": 0.98828125, "loss_aux_layer_22": 0.9873046875, "loss_aux_layer_23": 0.9931640625, "loss_aux_layer_3": 0.9345703125, "loss_aux_layer_4": 0.9541015625, "loss_aux_layer_5": 0.9609375, "loss_aux_layer_6": 0.9404296875, "loss_aux_layer_7": 0.9697265625, "loss_aux_layer_8": 0.9775390625, "loss_aux_layer_9": 0.98046875, "step": 14, "total_loss": 2.280764400959015 }, { "epoch": 0.002969708968521085, "grad_norm": 81.46060180664062, "learning_rate": 7.000000000000001e-06, "llm_loss": 1.2202293574810028, "loss": 8.7193, "loss_aux_layer_0": 0.908203125, "loss_aux_layer_1": 0.9248046875, "loss_aux_layer_10": 0.955078125, "loss_aux_layer_11": 0.9775390625, "loss_aux_layer_12": 1.00390625, "loss_aux_layer_13": 0.9619140625, "loss_aux_layer_14": 0.951171875, "loss_aux_layer_15": 0.9892578125, "loss_aux_layer_16": 0.98046875, "loss_aux_layer_17": 0.9541015625, "loss_aux_layer_18": 0.9765625, "loss_aux_layer_19": 0.9833984375, "loss_aux_layer_2": 0.896484375, "loss_aux_layer_20": 0.9677734375, "loss_aux_layer_21": 0.9765625, "loss_aux_layer_22": 0.9814453125, "loss_aux_layer_23": 0.9853515625, "loss_aux_layer_3": 0.921875, "loss_aux_layer_4": 0.94921875, "loss_aux_layer_5": 0.953125, "loss_aux_layer_6": 0.9296875, "loss_aux_layer_7": 0.9609375, "loss_aux_layer_8": 0.9697265625, "loss_aux_layer_9": 0.9697265625, "step": 15, "total_loss": 2.1798292994499207 }, { "epoch": 0.0031676895664224905, "grad_norm": 31.070125579833984, "learning_rate": 7.5e-06, "llm_loss": 1.170150801539421, "loss": 8.4562, "loss_aux_layer_0": 0.8916015625, "loss_aux_layer_1": 0.9140625, "loss_aux_layer_10": 0.9423828125, "loss_aux_layer_11": 0.9521484375, "loss_aux_layer_12": 0.9990234375, "loss_aux_layer_13": 0.94921875, "loss_aux_layer_14": 0.93359375, "loss_aux_layer_15": 0.9619140625, "loss_aux_layer_16": 0.953125, "loss_aux_layer_17": 0.9462890625, "loss_aux_layer_18": 0.9658203125, "loss_aux_layer_19": 0.978515625, "loss_aux_layer_2": 0.8837890625, "loss_aux_layer_20": 0.94921875, "loss_aux_layer_21": 0.9580078125, "loss_aux_layer_22": 0.9755859375, "loss_aux_layer_23": 0.978515625, "loss_aux_layer_3": 0.9033203125, "loss_aux_layer_4": 0.9365234375, "loss_aux_layer_5": 0.9345703125, "loss_aux_layer_6": 0.91796875, "loss_aux_layer_7": 0.9453125, "loss_aux_layer_8": 0.9453125, "loss_aux_layer_9": 0.9404296875, "step": 16, "total_loss": 2.1140544712543488 }, { "epoch": 0.0033656701643238964, "grad_norm": 19.652389526367188, "learning_rate": 8.000000000000001e-06, "llm_loss": 0.9870735257863998, "loss": 7.695, "loss_aux_layer_0": 0.8916015625, "loss_aux_layer_1": 0.9091796875, "loss_aux_layer_10": 0.93359375, "loss_aux_layer_11": 0.94140625, "loss_aux_layer_12": 0.98828125, "loss_aux_layer_13": 0.9423828125, "loss_aux_layer_14": 0.927734375, "loss_aux_layer_15": 0.9541015625, "loss_aux_layer_16": 0.94921875, "loss_aux_layer_17": 0.9423828125, "loss_aux_layer_18": 0.95703125, "loss_aux_layer_19": 0.9794921875, "loss_aux_layer_2": 0.880859375, "loss_aux_layer_20": 0.9384765625, "loss_aux_layer_21": 0.947265625, "loss_aux_layer_22": 0.97265625, "loss_aux_layer_23": 0.9716796875, "loss_aux_layer_3": 0.896484375, "loss_aux_layer_4": 0.927734375, "loss_aux_layer_5": 0.92578125, "loss_aux_layer_6": 0.90625, "loss_aux_layer_7": 0.9326171875, "loss_aux_layer_8": 0.939453125, "loss_aux_layer_9": 0.9248046875, "step": 17, "total_loss": 1.9237495362758636 }, { "epoch": 0.0035636507622253018, "grad_norm": 9.6647310256958, "learning_rate": 8.500000000000002e-06, "llm_loss": 0.9279625117778778, "loss": 7.443, "loss_aux_layer_0": 0.888671875, "loss_aux_layer_1": 0.9072265625, "loss_aux_layer_10": 0.9287109375, "loss_aux_layer_11": 0.9345703125, "loss_aux_layer_12": 0.984375, "loss_aux_layer_13": 0.939453125, "loss_aux_layer_14": 0.9248046875, "loss_aux_layer_15": 0.953125, "loss_aux_layer_16": 0.9462890625, "loss_aux_layer_17": 0.9404296875, "loss_aux_layer_18": 0.9560546875, "loss_aux_layer_19": 0.98046875, "loss_aux_layer_2": 0.8759765625, "loss_aux_layer_20": 0.93359375, "loss_aux_layer_21": 0.9453125, "loss_aux_layer_22": 0.96875, "loss_aux_layer_23": 0.970703125, "loss_aux_layer_3": 0.8935546875, "loss_aux_layer_4": 0.919921875, "loss_aux_layer_5": 0.9189453125, "loss_aux_layer_6": 0.8984375, "loss_aux_layer_7": 0.92578125, "loss_aux_layer_8": 0.9375, "loss_aux_layer_9": 0.9169921875, "step": 18, "total_loss": 1.8607425093650818 }, { "epoch": 0.0037616313601267076, "grad_norm": 12.69746208190918, "learning_rate": 9e-06, "llm_loss": 1.0008058547973633, "loss": 7.7051, "loss_aux_layer_0": 0.87109375, "loss_aux_layer_1": 0.8984375, "loss_aux_layer_10": 0.9296875, "loss_aux_layer_11": 0.9306640625, "loss_aux_layer_12": 0.978515625, "loss_aux_layer_13": 0.935546875, "loss_aux_layer_14": 0.9228515625, "loss_aux_layer_15": 0.951171875, "loss_aux_layer_16": 0.94140625, "loss_aux_layer_17": 0.935546875, "loss_aux_layer_18": 0.9521484375, "loss_aux_layer_19": 0.9736328125, "loss_aux_layer_2": 0.859375, "loss_aux_layer_20": 0.927734375, "loss_aux_layer_21": 0.9375, "loss_aux_layer_22": 0.9599609375, "loss_aux_layer_23": 0.962890625, "loss_aux_layer_3": 0.880859375, "loss_aux_layer_4": 0.9033203125, "loss_aux_layer_5": 0.9072265625, "loss_aux_layer_6": 0.8828125, "loss_aux_layer_7": 0.91796875, "loss_aux_layer_8": 0.9345703125, "loss_aux_layer_9": 0.9150390625, "step": 19, "total_loss": 1.9262667000293732 }, { "epoch": 0.0039596119580281135, "grad_norm": 10.637856483459473, "learning_rate": 9.5e-06, "llm_loss": 0.974603146314621, "loss": 7.5663, "loss_aux_layer_0": 0.859375, "loss_aux_layer_1": 0.888671875, "loss_aux_layer_10": 0.923828125, "loss_aux_layer_11": 0.9228515625, "loss_aux_layer_12": 0.9697265625, "loss_aux_layer_13": 0.927734375, "loss_aux_layer_14": 0.9140625, "loss_aux_layer_15": 0.9443359375, "loss_aux_layer_16": 0.93359375, "loss_aux_layer_17": 0.9306640625, "loss_aux_layer_18": 0.9462890625, "loss_aux_layer_19": 0.96484375, "loss_aux_layer_2": 0.849609375, "loss_aux_layer_20": 0.921875, "loss_aux_layer_21": 0.931640625, "loss_aux_layer_22": 0.95703125, "loss_aux_layer_23": 0.9580078125, "loss_aux_layer_3": 0.869140625, "loss_aux_layer_4": 0.890625, "loss_aux_layer_5": 0.8955078125, "loss_aux_layer_6": 0.8681640625, "loss_aux_layer_7": 0.904296875, "loss_aux_layer_8": 0.92578125, "loss_aux_layer_9": 0.904296875, "step": 20, "total_loss": 1.8915647864341736 }, { "epoch": 0.004157592555929519, "grad_norm": 15.363680839538574, "learning_rate": 1e-05, "llm_loss": 0.9186030328273773, "loss": 7.2398, "loss_aux_layer_0": 0.8134765625, "loss_aux_layer_1": 0.8515625, "loss_aux_layer_10": 0.8994140625, "loss_aux_layer_11": 0.904296875, "loss_aux_layer_12": 0.947265625, "loss_aux_layer_13": 0.90625, "loss_aux_layer_14": 0.892578125, "loss_aux_layer_15": 0.9306640625, "loss_aux_layer_16": 0.916015625, "loss_aux_layer_17": 0.91796875, "loss_aux_layer_18": 0.9296875, "loss_aux_layer_19": 0.943359375, "loss_aux_layer_2": 0.8203125, "loss_aux_layer_20": 0.9033203125, "loss_aux_layer_21": 0.9091796875, "loss_aux_layer_22": 0.9384765625, "loss_aux_layer_23": 0.9345703125, "loss_aux_layer_3": 0.8359375, "loss_aux_layer_4": 0.8564453125, "loss_aux_layer_5": 0.861328125, "loss_aux_layer_6": 0.830078125, "loss_aux_layer_7": 0.8720703125, "loss_aux_layer_8": 0.8984375, "loss_aux_layer_9": 0.876953125, "step": 21, "total_loss": 1.8099503815174103 }, { "epoch": 0.004355573153830924, "grad_norm": 15.253813743591309, "learning_rate": 1.05e-05, "llm_loss": 0.9357155412435532, "loss": 7.2546, "loss_aux_layer_0": 0.80078125, "loss_aux_layer_1": 0.8427734375, "loss_aux_layer_10": 0.88671875, "loss_aux_layer_11": 0.890625, "loss_aux_layer_12": 0.93359375, "loss_aux_layer_13": 0.89453125, "loss_aux_layer_14": 0.87890625, "loss_aux_layer_15": 0.9208984375, "loss_aux_layer_16": 0.90625, "loss_aux_layer_17": 0.9130859375, "loss_aux_layer_18": 0.923828125, "loss_aux_layer_19": 0.931640625, "loss_aux_layer_2": 0.8037109375, "loss_aux_layer_20": 0.8994140625, "loss_aux_layer_21": 0.8984375, "loss_aux_layer_22": 0.9306640625, "loss_aux_layer_23": 0.9248046875, "loss_aux_layer_3": 0.8154296875, "loss_aux_layer_4": 0.8349609375, "loss_aux_layer_5": 0.8408203125, "loss_aux_layer_6": 0.8095703125, "loss_aux_layer_7": 0.8515625, "loss_aux_layer_8": 0.8818359375, "loss_aux_layer_9": 0.857421875, "step": 22, "total_loss": 1.8136554658412933 }, { "epoch": 0.00455355375173233, "grad_norm": 19.182390213012695, "learning_rate": 1.1000000000000001e-05, "llm_loss": 0.9475100338459015, "loss": 7.2698, "loss_aux_layer_0": 0.79296875, "loss_aux_layer_1": 0.8330078125, "loss_aux_layer_10": 0.876953125, "loss_aux_layer_11": 0.8837890625, "loss_aux_layer_12": 0.9228515625, "loss_aux_layer_13": 0.8896484375, "loss_aux_layer_14": 0.8681640625, "loss_aux_layer_15": 0.9111328125, "loss_aux_layer_16": 0.9013671875, "loss_aux_layer_17": 0.9150390625, "loss_aux_layer_18": 0.9208984375, "loss_aux_layer_19": 0.9287109375, "loss_aux_layer_2": 0.794921875, "loss_aux_layer_20": 0.9013671875, "loss_aux_layer_21": 0.89453125, "loss_aux_layer_22": 0.931640625, "loss_aux_layer_23": 0.9208984375, "loss_aux_layer_3": 0.8046875, "loss_aux_layer_4": 0.82421875, "loss_aux_layer_5": 0.828125, "loss_aux_layer_6": 0.794921875, "loss_aux_layer_7": 0.833984375, "loss_aux_layer_8": 0.8642578125, "loss_aux_layer_9": 0.83984375, "step": 23, "total_loss": 1.8174492418766022 }, { "epoch": 0.004751534349633736, "grad_norm": 18.021224975585938, "learning_rate": 1.1500000000000002e-05, "llm_loss": 0.9162383079528809, "loss": 7.1161, "loss_aux_layer_0": 0.7802734375, "loss_aux_layer_1": 0.8251953125, "loss_aux_layer_10": 0.875, "loss_aux_layer_11": 0.87890625, "loss_aux_layer_12": 0.9150390625, "loss_aux_layer_13": 0.8857421875, "loss_aux_layer_14": 0.8603515625, "loss_aux_layer_15": 0.8994140625, "loss_aux_layer_16": 0.896484375, "loss_aux_layer_17": 0.91015625, "loss_aux_layer_18": 0.916015625, "loss_aux_layer_19": 0.9267578125, "loss_aux_layer_2": 0.78125, "loss_aux_layer_20": 0.9033203125, "loss_aux_layer_21": 0.88671875, "loss_aux_layer_22": 0.927734375, "loss_aux_layer_23": 0.9189453125, "loss_aux_layer_3": 0.7919921875, "loss_aux_layer_4": 0.8095703125, "loss_aux_layer_5": 0.81640625, "loss_aux_layer_6": 0.7841796875, "loss_aux_layer_7": 0.828125, "loss_aux_layer_8": 0.8564453125, "loss_aux_layer_9": 0.8330078125, "step": 24, "total_loss": 1.7790364027023315 }, { "epoch": 0.004949514947535142, "grad_norm": 8.892189979553223, "learning_rate": 1.2e-05, "llm_loss": 0.8285180926322937, "loss": 6.7247, "loss_aux_layer_0": 0.7705078125, "loss_aux_layer_1": 0.818359375, "loss_aux_layer_10": 0.861328125, "loss_aux_layer_11": 0.8671875, "loss_aux_layer_12": 0.90625, "loss_aux_layer_13": 0.8759765625, "loss_aux_layer_14": 0.8505859375, "loss_aux_layer_15": 0.890625, "loss_aux_layer_16": 0.8857421875, "loss_aux_layer_17": 0.900390625, "loss_aux_layer_18": 0.904296875, "loss_aux_layer_19": 0.9169921875, "loss_aux_layer_2": 0.7744140625, "loss_aux_layer_20": 0.892578125, "loss_aux_layer_21": 0.876953125, "loss_aux_layer_22": 0.9150390625, "loss_aux_layer_23": 0.9111328125, "loss_aux_layer_3": 0.783203125, "loss_aux_layer_4": 0.7998046875, "loss_aux_layer_5": 0.80859375, "loss_aux_layer_6": 0.7744140625, "loss_aux_layer_7": 0.81640625, "loss_aux_layer_8": 0.8466796875, "loss_aux_layer_9": 0.8212890625, "step": 25, "total_loss": 1.6811843514442444 }, { "epoch": 0.005147495545436547, "grad_norm": 6.764518737792969, "learning_rate": 1.25e-05, "llm_loss": 0.8215954303741455, "loss": 6.6188, "loss_aux_layer_0": 0.75390625, "loss_aux_layer_1": 0.8017578125, "loss_aux_layer_10": 0.8369140625, "loss_aux_layer_11": 0.84375, "loss_aux_layer_12": 0.8828125, "loss_aux_layer_13": 0.857421875, "loss_aux_layer_14": 0.830078125, "loss_aux_layer_15": 0.875, "loss_aux_layer_16": 0.8681640625, "loss_aux_layer_17": 0.8828125, "loss_aux_layer_18": 0.890625, "loss_aux_layer_19": 0.8984375, "loss_aux_layer_2": 0.7587890625, "loss_aux_layer_20": 0.8720703125, "loss_aux_layer_21": 0.861328125, "loss_aux_layer_22": 0.8984375, "loss_aux_layer_23": 0.892578125, "loss_aux_layer_3": 0.763671875, "loss_aux_layer_4": 0.78125, "loss_aux_layer_5": 0.787109375, "loss_aux_layer_6": 0.7529296875, "loss_aux_layer_7": 0.7861328125, "loss_aux_layer_8": 0.8212890625, "loss_aux_layer_9": 0.798828125, "step": 26, "total_loss": 1.6546999216079712 }, { "epoch": 0.005345476143337953, "grad_norm": 9.07960033416748, "learning_rate": 1.3000000000000001e-05, "llm_loss": 0.8807760626077652, "loss": 6.7635, "loss_aux_layer_0": 0.71875, "loss_aux_layer_1": 0.775390625, "loss_aux_layer_10": 0.8173828125, "loss_aux_layer_11": 0.8212890625, "loss_aux_layer_12": 0.8623046875, "loss_aux_layer_13": 0.837890625, "loss_aux_layer_14": 0.810546875, "loss_aux_layer_15": 0.8583984375, "loss_aux_layer_16": 0.849609375, "loss_aux_layer_17": 0.8642578125, "loss_aux_layer_18": 0.87109375, "loss_aux_layer_19": 0.8759765625, "loss_aux_layer_2": 0.7255859375, "loss_aux_layer_20": 0.8505859375, "loss_aux_layer_21": 0.8427734375, "loss_aux_layer_22": 0.87890625, "loss_aux_layer_23": 0.875, "loss_aux_layer_3": 0.734375, "loss_aux_layer_4": 0.7470703125, "loss_aux_layer_5": 0.7578125, "loss_aux_layer_6": 0.72265625, "loss_aux_layer_7": 0.763671875, "loss_aux_layer_8": 0.8017578125, "loss_aux_layer_9": 0.779296875, "step": 27, "total_loss": 1.6908804774284363 }, { "epoch": 0.0055434567412393585, "grad_norm": 8.733184814453125, "learning_rate": 1.3500000000000001e-05, "llm_loss": 0.9284534305334091, "loss": 6.8992, "loss_aux_layer_0": 0.7080078125, "loss_aux_layer_1": 0.763671875, "loss_aux_layer_10": 0.8017578125, "loss_aux_layer_11": 0.806640625, "loss_aux_layer_12": 0.849609375, "loss_aux_layer_13": 0.826171875, "loss_aux_layer_14": 0.796875, "loss_aux_layer_15": 0.8447265625, "loss_aux_layer_16": 0.8359375, "loss_aux_layer_17": 0.8525390625, "loss_aux_layer_18": 0.859375, "loss_aux_layer_19": 0.8642578125, "loss_aux_layer_2": 0.712890625, "loss_aux_layer_20": 0.837890625, "loss_aux_layer_21": 0.830078125, "loss_aux_layer_22": 0.8642578125, "loss_aux_layer_23": 0.8642578125, "loss_aux_layer_3": 0.716796875, "loss_aux_layer_4": 0.7314453125, "loss_aux_layer_5": 0.7421875, "loss_aux_layer_6": 0.7041015625, "loss_aux_layer_7": 0.748046875, "loss_aux_layer_8": 0.7880859375, "loss_aux_layer_9": 0.767578125, "step": 28, "total_loss": 1.7247944176197052 }, { "epoch": 0.005741437339140764, "grad_norm": 7.007369518280029, "learning_rate": 1.4000000000000001e-05, "llm_loss": 0.9245864897966385, "loss": 6.8272, "loss_aux_layer_0": 0.7001953125, "loss_aux_layer_1": 0.7568359375, "loss_aux_layer_10": 0.7861328125, "loss_aux_layer_11": 0.7890625, "loss_aux_layer_12": 0.8310546875, "loss_aux_layer_13": 0.8115234375, "loss_aux_layer_14": 0.779296875, "loss_aux_layer_15": 0.8291015625, "loss_aux_layer_16": 0.822265625, "loss_aux_layer_17": 0.8369140625, "loss_aux_layer_18": 0.8466796875, "loss_aux_layer_19": 0.849609375, "loss_aux_layer_2": 0.7021484375, "loss_aux_layer_20": 0.8232421875, "loss_aux_layer_21": 0.8115234375, "loss_aux_layer_22": 0.8505859375, "loss_aux_layer_23": 0.8544921875, "loss_aux_layer_3": 0.703125, "loss_aux_layer_4": 0.71484375, "loss_aux_layer_5": 0.728515625, "loss_aux_layer_6": 0.6904296875, "loss_aux_layer_7": 0.7314453125, "loss_aux_layer_8": 0.7705078125, "loss_aux_layer_9": 0.751953125, "step": 29, "total_loss": 1.7068029046058655 }, { "epoch": 0.00593941793704217, "grad_norm": 7.148609638214111, "learning_rate": 1.45e-05, "llm_loss": 0.9695344120264053, "loss": 6.9266, "loss_aux_layer_0": 0.6884765625, "loss_aux_layer_1": 0.7470703125, "loss_aux_layer_10": 0.76171875, "loss_aux_layer_11": 0.7646484375, "loss_aux_layer_12": 0.8037109375, "loss_aux_layer_13": 0.7900390625, "loss_aux_layer_14": 0.759765625, "loss_aux_layer_15": 0.8076171875, "loss_aux_layer_16": 0.8017578125, "loss_aux_layer_17": 0.818359375, "loss_aux_layer_18": 0.828125, "loss_aux_layer_19": 0.8310546875, "loss_aux_layer_2": 0.6904296875, "loss_aux_layer_20": 0.806640625, "loss_aux_layer_21": 0.79296875, "loss_aux_layer_22": 0.833984375, "loss_aux_layer_23": 0.837890625, "loss_aux_layer_3": 0.6875, "loss_aux_layer_4": 0.69921875, "loss_aux_layer_5": 0.7099609375, "loss_aux_layer_6": 0.6708984375, "loss_aux_layer_7": 0.7021484375, "loss_aux_layer_8": 0.7412109375, "loss_aux_layer_9": 0.7236328125, "step": 30, "total_loss": 1.7316550016403198 }, { "epoch": 0.006137398534943575, "grad_norm": 5.833500862121582, "learning_rate": 1.5e-05, "llm_loss": 0.8583651781082153, "loss": 6.3865, "loss_aux_layer_0": 0.6650390625, "loss_aux_layer_1": 0.7255859375, "loss_aux_layer_10": 0.736328125, "loss_aux_layer_11": 0.7353515625, "loss_aux_layer_12": 0.77734375, "loss_aux_layer_13": 0.7685546875, "loss_aux_layer_14": 0.734375, "loss_aux_layer_15": 0.783203125, "loss_aux_layer_16": 0.783203125, "loss_aux_layer_17": 0.7998046875, "loss_aux_layer_18": 0.8056640625, "loss_aux_layer_19": 0.80859375, "loss_aux_layer_2": 0.66015625, "loss_aux_layer_20": 0.7890625, "loss_aux_layer_21": 0.7724609375, "loss_aux_layer_22": 0.8134765625, "loss_aux_layer_23": 0.822265625, "loss_aux_layer_3": 0.6572265625, "loss_aux_layer_4": 0.6669921875, "loss_aux_layer_5": 0.6826171875, "loss_aux_layer_6": 0.6416015625, "loss_aux_layer_7": 0.6767578125, "loss_aux_layer_8": 0.7138671875, "loss_aux_layer_9": 0.697265625, "step": 31, "total_loss": 1.596626102924347 }, { "epoch": 0.006335379132844981, "grad_norm": 3.915045976638794, "learning_rate": 1.55e-05, "llm_loss": 0.8130850195884705, "loss": 6.1477, "loss_aux_layer_0": 0.6533203125, "loss_aux_layer_1": 0.7138671875, "loss_aux_layer_10": 0.720703125, "loss_aux_layer_11": 0.7197265625, "loss_aux_layer_12": 0.759765625, "loss_aux_layer_13": 0.7548828125, "loss_aux_layer_14": 0.71875, "loss_aux_layer_15": 0.767578125, "loss_aux_layer_16": 0.771484375, "loss_aux_layer_17": 0.7880859375, "loss_aux_layer_18": 0.794921875, "loss_aux_layer_19": 0.7998046875, "loss_aux_layer_2": 0.646484375, "loss_aux_layer_20": 0.7763671875, "loss_aux_layer_21": 0.7587890625, "loss_aux_layer_22": 0.80078125, "loss_aux_layer_23": 0.8095703125, "loss_aux_layer_3": 0.6416015625, "loss_aux_layer_4": 0.6513671875, "loss_aux_layer_5": 0.666015625, "loss_aux_layer_6": 0.6240234375, "loss_aux_layer_7": 0.658203125, "loss_aux_layer_8": 0.6962890625, "loss_aux_layer_9": 0.681640625, "step": 32, "total_loss": 1.5369213223457336 }, { "epoch": 0.006533359730746387, "grad_norm": 3.476928472518921, "learning_rate": 1.6000000000000003e-05, "llm_loss": 0.8619766235351562, "loss": 6.2656, "loss_aux_layer_0": 0.640625, "loss_aux_layer_1": 0.70703125, "loss_aux_layer_10": 0.6953125, "loss_aux_layer_11": 0.6962890625, "loss_aux_layer_12": 0.7373046875, "loss_aux_layer_13": 0.734375, "loss_aux_layer_14": 0.69921875, "loss_aux_layer_15": 0.75, "loss_aux_layer_16": 0.75, "loss_aux_layer_17": 0.767578125, "loss_aux_layer_18": 0.771484375, "loss_aux_layer_19": 0.7783203125, "loss_aux_layer_2": 0.634765625, "loss_aux_layer_20": 0.7548828125, "loss_aux_layer_21": 0.7373046875, "loss_aux_layer_22": 0.78125, "loss_aux_layer_23": 0.791015625, "loss_aux_layer_3": 0.625, "loss_aux_layer_4": 0.634765625, "loss_aux_layer_5": 0.6484375, "loss_aux_layer_6": 0.60546875, "loss_aux_layer_7": 0.6318359375, "loss_aux_layer_8": 0.6748046875, "loss_aux_layer_9": 0.6572265625, "step": 33, "total_loss": 1.5664088129997253 }, { "epoch": 0.006731340328647793, "grad_norm": 3.2363815307617188, "learning_rate": 1.65e-05, "llm_loss": 0.7354985177516937, "loss": 5.6798, "loss_aux_layer_0": 0.6328125, "loss_aux_layer_1": 0.6953125, "loss_aux_layer_10": 0.67578125, "loss_aux_layer_11": 0.673828125, "loss_aux_layer_12": 0.7138671875, "loss_aux_layer_13": 0.7119140625, "loss_aux_layer_14": 0.677734375, "loss_aux_layer_15": 0.7275390625, "loss_aux_layer_16": 0.73046875, "loss_aux_layer_17": 0.74609375, "loss_aux_layer_18": 0.75, "loss_aux_layer_19": 0.75390625, "loss_aux_layer_2": 0.619140625, "loss_aux_layer_20": 0.736328125, "loss_aux_layer_21": 0.7177734375, "loss_aux_layer_22": 0.7578125, "loss_aux_layer_23": 0.7734375, "loss_aux_layer_3": 0.6064453125, "loss_aux_layer_4": 0.6142578125, "loss_aux_layer_5": 0.6298828125, "loss_aux_layer_6": 0.583984375, "loss_aux_layer_7": 0.6103515625, "loss_aux_layer_8": 0.65234375, "loss_aux_layer_9": 0.63671875, "step": 34, "total_loss": 1.4199569821357727 }, { "epoch": 0.006929320926549199, "grad_norm": 3.6640515327453613, "learning_rate": 1.7000000000000003e-05, "llm_loss": 0.8067744076251984, "loss": 5.8912, "loss_aux_layer_0": 0.6171875, "loss_aux_layer_1": 0.68359375, "loss_aux_layer_10": 0.658203125, "loss_aux_layer_11": 0.6552734375, "loss_aux_layer_12": 0.6953125, "loss_aux_layer_13": 0.693359375, "loss_aux_layer_14": 0.65625, "loss_aux_layer_15": 0.7080078125, "loss_aux_layer_16": 0.7138671875, "loss_aux_layer_17": 0.7275390625, "loss_aux_layer_18": 0.7314453125, "loss_aux_layer_19": 0.7353515625, "loss_aux_layer_2": 0.59765625, "loss_aux_layer_20": 0.716796875, "loss_aux_layer_21": 0.701171875, "loss_aux_layer_22": 0.7392578125, "loss_aux_layer_23": 0.755859375, "loss_aux_layer_3": 0.5869140625, "loss_aux_layer_4": 0.59375, "loss_aux_layer_5": 0.6083984375, "loss_aux_layer_6": 0.5654296875, "loss_aux_layer_7": 0.591796875, "loss_aux_layer_8": 0.6328125, "loss_aux_layer_9": 0.619140625, "step": 35, "total_loss": 1.472795158624649 }, { "epoch": 0.0071273015244506036, "grad_norm": 3.782331943511963, "learning_rate": 1.75e-05, "llm_loss": 0.7437592297792435, "loss": 5.5603, "loss_aux_layer_0": 0.60546875, "loss_aux_layer_1": 0.6728515625, "loss_aux_layer_10": 0.6357421875, "loss_aux_layer_11": 0.6337890625, "loss_aux_layer_12": 0.6708984375, "loss_aux_layer_13": 0.6708984375, "loss_aux_layer_14": 0.6337890625, "loss_aux_layer_15": 0.6865234375, "loss_aux_layer_16": 0.6923828125, "loss_aux_layer_17": 0.705078125, "loss_aux_layer_18": 0.7099609375, "loss_aux_layer_19": 0.712890625, "loss_aux_layer_2": 0.583984375, "loss_aux_layer_20": 0.6943359375, "loss_aux_layer_21": 0.6796875, "loss_aux_layer_22": 0.71875, "loss_aux_layer_23": 0.73828125, "loss_aux_layer_3": 0.5703125, "loss_aux_layer_4": 0.578125, "loss_aux_layer_5": 0.5908203125, "loss_aux_layer_6": 0.546875, "loss_aux_layer_7": 0.5703125, "loss_aux_layer_8": 0.6103515625, "loss_aux_layer_9": 0.5966796875, "step": 36, "total_loss": 1.3900707066059113 }, { "epoch": 0.007325282122352009, "grad_norm": 2.9281554222106934, "learning_rate": 1.8e-05, "llm_loss": 0.7367133349180222, "loss": 5.4357, "loss_aux_layer_0": 0.591796875, "loss_aux_layer_1": 0.6611328125, "loss_aux_layer_10": 0.611328125, "loss_aux_layer_11": 0.6083984375, "loss_aux_layer_12": 0.6435546875, "loss_aux_layer_13": 0.6455078125, "loss_aux_layer_14": 0.607421875, "loss_aux_layer_15": 0.6611328125, "loss_aux_layer_16": 0.6640625, "loss_aux_layer_17": 0.677734375, "loss_aux_layer_18": 0.681640625, "loss_aux_layer_19": 0.68359375, "loss_aux_layer_2": 0.5625, "loss_aux_layer_20": 0.6689453125, "loss_aux_layer_21": 0.654296875, "loss_aux_layer_22": 0.6923828125, "loss_aux_layer_23": 0.716796875, "loss_aux_layer_3": 0.5478515625, "loss_aux_layer_4": 0.5517578125, "loss_aux_layer_5": 0.568359375, "loss_aux_layer_6": 0.5244140625, "loss_aux_layer_7": 0.5478515625, "loss_aux_layer_8": 0.587890625, "loss_aux_layer_9": 0.5732421875, "step": 37, "total_loss": 1.3589362800121307 }, { "epoch": 0.007523262720253415, "grad_norm": 3.445024013519287, "learning_rate": 1.85e-05, "llm_loss": 0.7713892608880997, "loss": 5.4911, "loss_aux_layer_0": 0.5791015625, "loss_aux_layer_1": 0.64453125, "loss_aux_layer_10": 0.5927734375, "loss_aux_layer_11": 0.587890625, "loss_aux_layer_12": 0.6220703125, "loss_aux_layer_13": 0.625, "loss_aux_layer_14": 0.583984375, "loss_aux_layer_15": 0.6376953125, "loss_aux_layer_16": 0.646484375, "loss_aux_layer_17": 0.65625, "loss_aux_layer_18": 0.6572265625, "loss_aux_layer_19": 0.658203125, "loss_aux_layer_2": 0.5400390625, "loss_aux_layer_20": 0.6494140625, "loss_aux_layer_21": 0.6337890625, "loss_aux_layer_22": 0.669921875, "loss_aux_layer_23": 0.7001953125, "loss_aux_layer_3": 0.5244140625, "loss_aux_layer_4": 0.5283203125, "loss_aux_layer_5": 0.544921875, "loss_aux_layer_6": 0.5009765625, "loss_aux_layer_7": 0.5283203125, "loss_aux_layer_8": 0.56640625, "loss_aux_layer_9": 0.5556640625, "step": 38, "total_loss": 1.3727635741233826 }, { "epoch": 0.007721243318154821, "grad_norm": 5.197643756866455, "learning_rate": 1.9e-05, "llm_loss": 0.7378920316696167, "loss": 5.31, "loss_aux_layer_0": 0.5810546875, "loss_aux_layer_1": 0.6474609375, "loss_aux_layer_10": 0.576171875, "loss_aux_layer_11": 0.5732421875, "loss_aux_layer_12": 0.607421875, "loss_aux_layer_13": 0.6083984375, "loss_aux_layer_14": 0.5693359375, "loss_aux_layer_15": 0.62109375, "loss_aux_layer_16": 0.625, "loss_aux_layer_17": 0.63671875, "loss_aux_layer_18": 0.640625, "loss_aux_layer_19": 0.642578125, "loss_aux_layer_2": 0.5400390625, "loss_aux_layer_20": 0.6328125, "loss_aux_layer_21": 0.62109375, "loss_aux_layer_22": 0.654296875, "loss_aux_layer_23": 0.6845703125, "loss_aux_layer_3": 0.5224609375, "loss_aux_layer_4": 0.5244140625, "loss_aux_layer_5": 0.5390625, "loss_aux_layer_6": 0.494140625, "loss_aux_layer_7": 0.513671875, "loss_aux_layer_8": 0.5546875, "loss_aux_layer_9": 0.5400390625, "step": 39, "total_loss": 1.327509492635727 }, { "epoch": 0.007919223916056227, "grad_norm": 4.563601016998291, "learning_rate": 1.9500000000000003e-05, "llm_loss": 0.8155398070812225, "loss": 5.5285, "loss_aux_layer_0": 0.5615234375, "loss_aux_layer_1": 0.626953125, "loss_aux_layer_10": 0.556640625, "loss_aux_layer_11": 0.5498046875, "loss_aux_layer_12": 0.5810546875, "loss_aux_layer_13": 0.5859375, "loss_aux_layer_14": 0.546875, "loss_aux_layer_15": 0.59765625, "loss_aux_layer_16": 0.6044921875, "loss_aux_layer_17": 0.61328125, "loss_aux_layer_18": 0.6162109375, "loss_aux_layer_19": 0.6181640625, "loss_aux_layer_2": 0.51171875, "loss_aux_layer_20": 0.6142578125, "loss_aux_layer_21": 0.6005859375, "loss_aux_layer_22": 0.634765625, "loss_aux_layer_23": 0.6640625, "loss_aux_layer_3": 0.494140625, "loss_aux_layer_4": 0.49560546875, "loss_aux_layer_5": 0.5126953125, "loss_aux_layer_6": 0.46875, "loss_aux_layer_7": 0.49169921875, "loss_aux_layer_8": 0.5302734375, "loss_aux_layer_9": 0.517578125, "step": 40, "total_loss": 1.3821266293525696 }, { "epoch": 0.008117204513957633, "grad_norm": 5.055999755859375, "learning_rate": 2e-05, "llm_loss": 0.8029741495847702, "loss": 5.4248, "loss_aux_layer_0": 0.5595703125, "loss_aux_layer_1": 0.6298828125, "loss_aux_layer_10": 0.537109375, "loss_aux_layer_11": 0.5341796875, "loss_aux_layer_12": 0.5634765625, "loss_aux_layer_13": 0.5673828125, "loss_aux_layer_14": 0.529296875, "loss_aux_layer_15": 0.5791015625, "loss_aux_layer_16": 0.5869140625, "loss_aux_layer_17": 0.5947265625, "loss_aux_layer_18": 0.5966796875, "loss_aux_layer_19": 0.6015625, "loss_aux_layer_2": 0.509765625, "loss_aux_layer_20": 0.6005859375, "loss_aux_layer_21": 0.5869140625, "loss_aux_layer_22": 0.6171875, "loss_aux_layer_23": 0.646484375, "loss_aux_layer_3": 0.4892578125, "loss_aux_layer_4": 0.48828125, "loss_aux_layer_5": 0.50341796875, "loss_aux_layer_6": 0.458984375, "loss_aux_layer_7": 0.47802734375, "loss_aux_layer_8": 0.5166015625, "loss_aux_layer_9": 0.50048828125, "step": 41, "total_loss": 1.356204479932785 }, { "epoch": 0.008315185111859039, "grad_norm": 6.819973468780518, "learning_rate": 2.05e-05, "llm_loss": 0.7478239685297012, "loss": 5.1234, "loss_aux_layer_0": 0.548828125, "loss_aux_layer_1": 0.609375, "loss_aux_layer_10": 0.51953125, "loss_aux_layer_11": 0.5146484375, "loss_aux_layer_12": 0.541015625, "loss_aux_layer_13": 0.5458984375, "loss_aux_layer_14": 0.50830078125, "loss_aux_layer_15": 0.556640625, "loss_aux_layer_16": 0.5654296875, "loss_aux_layer_17": 0.572265625, "loss_aux_layer_18": 0.57421875, "loss_aux_layer_19": 0.580078125, "loss_aux_layer_2": 0.48486328125, "loss_aux_layer_20": 0.583984375, "loss_aux_layer_21": 0.5712890625, "loss_aux_layer_22": 0.599609375, "loss_aux_layer_23": 0.6328125, "loss_aux_layer_3": 0.46484375, "loss_aux_layer_4": 0.46484375, "loss_aux_layer_5": 0.48095703125, "loss_aux_layer_6": 0.43798828125, "loss_aux_layer_7": 0.458984375, "loss_aux_layer_8": 0.49462890625, "loss_aux_layer_9": 0.48193359375, "step": 42, "total_loss": 1.280856728553772 }, { "epoch": 0.008513165709760443, "grad_norm": 3.242927074432373, "learning_rate": 2.1e-05, "llm_loss": 0.7367804795503616, "loss": 5.0417, "loss_aux_layer_0": 0.5439453125, "loss_aux_layer_1": 0.615234375, "loss_aux_layer_10": 0.5048828125, "loss_aux_layer_11": 0.50146484375, "loss_aux_layer_12": 0.5263671875, "loss_aux_layer_13": 0.53125, "loss_aux_layer_14": 0.49853515625, "loss_aux_layer_15": 0.5439453125, "loss_aux_layer_16": 0.552734375, "loss_aux_layer_17": 0.5576171875, "loss_aux_layer_18": 0.5634765625, "loss_aux_layer_19": 0.5693359375, "loss_aux_layer_2": 0.48291015625, "loss_aux_layer_20": 0.5732421875, "loss_aux_layer_21": 0.5615234375, "loss_aux_layer_22": 0.58984375, "loss_aux_layer_23": 0.6220703125, "loss_aux_layer_3": 0.46142578125, "loss_aux_layer_4": 0.45947265625, "loss_aux_layer_5": 0.4736328125, "loss_aux_layer_6": 0.43115234375, "loss_aux_layer_7": 0.44873046875, "loss_aux_layer_8": 0.486328125, "loss_aux_layer_9": 0.4697265625, "step": 43, "total_loss": 1.2604341804981232 }, { "epoch": 0.008711146307661849, "grad_norm": 3.2774946689605713, "learning_rate": 2.15e-05, "llm_loss": 0.7603846192359924, "loss": 5.0605, "loss_aux_layer_0": 0.533203125, "loss_aux_layer_1": 0.5947265625, "loss_aux_layer_10": 0.48681640625, "loss_aux_layer_11": 0.482421875, "loss_aux_layer_12": 0.50732421875, "loss_aux_layer_13": 0.51171875, "loss_aux_layer_14": 0.47998046875, "loss_aux_layer_15": 0.5224609375, "loss_aux_layer_16": 0.5341796875, "loss_aux_layer_17": 0.5390625, "loss_aux_layer_18": 0.5458984375, "loss_aux_layer_19": 0.55078125, "loss_aux_layer_2": 0.4580078125, "loss_aux_layer_20": 0.55859375, "loss_aux_layer_21": 0.5478515625, "loss_aux_layer_22": 0.572265625, "loss_aux_layer_23": 0.6083984375, "loss_aux_layer_3": 0.43603515625, "loss_aux_layer_4": 0.43505859375, "loss_aux_layer_5": 0.45068359375, "loss_aux_layer_6": 0.41015625, "loss_aux_layer_7": 0.43115234375, "loss_aux_layer_8": 0.46533203125, "loss_aux_layer_9": 0.45068359375, "step": 44, "total_loss": 1.2651174068450928 }, { "epoch": 0.008909126905563254, "grad_norm": 3.548311233520508, "learning_rate": 2.2000000000000003e-05, "llm_loss": 0.8232103884220123, "loss": 5.2382, "loss_aux_layer_0": 0.5205078125, "loss_aux_layer_1": 0.5791015625, "loss_aux_layer_10": 0.46826171875, "loss_aux_layer_11": 0.4638671875, "loss_aux_layer_12": 0.48486328125, "loss_aux_layer_13": 0.4931640625, "loss_aux_layer_14": 0.46337890625, "loss_aux_layer_15": 0.50390625, "loss_aux_layer_16": 0.517578125, "loss_aux_layer_17": 0.521484375, "loss_aux_layer_18": 0.5263671875, "loss_aux_layer_19": 0.533203125, "loss_aux_layer_2": 0.435546875, "loss_aux_layer_20": 0.5419921875, "loss_aux_layer_21": 0.5302734375, "loss_aux_layer_22": 0.5537109375, "loss_aux_layer_23": 0.5908203125, "loss_aux_layer_3": 0.41455078125, "loss_aux_layer_4": 0.4140625, "loss_aux_layer_5": 0.4306640625, "loss_aux_layer_6": 0.39208984375, "loss_aux_layer_7": 0.4140625, "loss_aux_layer_8": 0.4462890625, "loss_aux_layer_9": 0.4326171875, "step": 45, "total_loss": 1.3095588386058807 }, { "epoch": 0.00910710750346466, "grad_norm": 3.5796098709106445, "learning_rate": 2.25e-05, "llm_loss": 0.8687659949064255, "loss": 5.3976, "loss_aux_layer_0": 0.5185546875, "loss_aux_layer_1": 0.5830078125, "loss_aux_layer_10": 0.45947265625, "loss_aux_layer_11": 0.45751953125, "loss_aux_layer_12": 0.4775390625, "loss_aux_layer_13": 0.4853515625, "loss_aux_layer_14": 0.45947265625, "loss_aux_layer_15": 0.49658203125, "loss_aux_layer_16": 0.5087890625, "loss_aux_layer_17": 0.51171875, "loss_aux_layer_18": 0.517578125, "loss_aux_layer_19": 0.525390625, "loss_aux_layer_2": 0.435546875, "loss_aux_layer_20": 0.5341796875, "loss_aux_layer_21": 0.5234375, "loss_aux_layer_22": 0.5458984375, "loss_aux_layer_23": 0.583984375, "loss_aux_layer_3": 0.4130859375, "loss_aux_layer_4": 0.41162109375, "loss_aux_layer_5": 0.42578125, "loss_aux_layer_6": 0.3896484375, "loss_aux_layer_7": 0.4091796875, "loss_aux_layer_8": 0.43994140625, "loss_aux_layer_9": 0.4248046875, "step": 46, "total_loss": 1.3493949174880981 }, { "epoch": 0.009305088101366066, "grad_norm": 2.8194215297698975, "learning_rate": 2.3000000000000003e-05, "llm_loss": 0.7705581933259964, "loss": 4.9847, "loss_aux_layer_0": 0.5244140625, "loss_aux_layer_1": 0.5810546875, "loss_aux_layer_10": 0.4501953125, "loss_aux_layer_11": 0.4501953125, "loss_aux_layer_12": 0.46923828125, "loss_aux_layer_13": 0.4765625, "loss_aux_layer_14": 0.455078125, "loss_aux_layer_15": 0.4892578125, "loss_aux_layer_16": 0.501953125, "loss_aux_layer_17": 0.50390625, "loss_aux_layer_18": 0.509765625, "loss_aux_layer_19": 0.5185546875, "loss_aux_layer_2": 0.435546875, "loss_aux_layer_20": 0.5283203125, "loss_aux_layer_21": 0.5185546875, "loss_aux_layer_22": 0.5400390625, "loss_aux_layer_23": 0.5830078125, "loss_aux_layer_3": 0.41015625, "loss_aux_layer_4": 0.40771484375, "loss_aux_layer_5": 0.419921875, "loss_aux_layer_6": 0.384765625, "loss_aux_layer_7": 0.40185546875, "loss_aux_layer_8": 0.43505859375, "loss_aux_layer_9": 0.4189453125, "step": 47, "total_loss": 1.2461670339107513 }, { "epoch": 0.009503068699267472, "grad_norm": 2.9192562103271484, "learning_rate": 2.35e-05, "llm_loss": 0.7994656562805176, "loss": 5.0428, "loss_aux_layer_0": 0.51171875, "loss_aux_layer_1": 0.5654296875, "loss_aux_layer_10": 0.43798828125, "loss_aux_layer_11": 0.4365234375, "loss_aux_layer_12": 0.455078125, "loss_aux_layer_13": 0.462890625, "loss_aux_layer_14": 0.44140625, "loss_aux_layer_15": 0.47509765625, "loss_aux_layer_16": 0.48681640625, "loss_aux_layer_17": 0.49072265625, "loss_aux_layer_18": 0.49853515625, "loss_aux_layer_19": 0.5068359375, "loss_aux_layer_2": 0.41357421875, "loss_aux_layer_20": 0.515625, "loss_aux_layer_21": 0.5029296875, "loss_aux_layer_22": 0.5263671875, "loss_aux_layer_23": 0.5673828125, "loss_aux_layer_3": 0.39111328125, "loss_aux_layer_4": 0.38916015625, "loss_aux_layer_5": 0.40380859375, "loss_aux_layer_6": 0.37158203125, "loss_aux_layer_7": 0.39208984375, "loss_aux_layer_8": 0.42041015625, "loss_aux_layer_9": 0.40576171875, "step": 48, "total_loss": 1.260700672864914 }, { "epoch": 0.009701049297168878, "grad_norm": 2.7628190517425537, "learning_rate": 2.4e-05, "llm_loss": 0.7309016734361649, "loss": 4.6938, "loss_aux_layer_0": 0.50244140625, "loss_aux_layer_1": 0.5517578125, "loss_aux_layer_10": 0.41748046875, "loss_aux_layer_11": 0.416015625, "loss_aux_layer_12": 0.43359375, "loss_aux_layer_13": 0.4443359375, "loss_aux_layer_14": 0.42431640625, "loss_aux_layer_15": 0.45654296875, "loss_aux_layer_16": 0.4697265625, "loss_aux_layer_17": 0.47021484375, "loss_aux_layer_18": 0.47998046875, "loss_aux_layer_19": 0.48583984375, "loss_aux_layer_2": 0.39404296875, "loss_aux_layer_20": 0.49560546875, "loss_aux_layer_21": 0.4853515625, "loss_aux_layer_22": 0.5048828125, "loss_aux_layer_23": 0.5478515625, "loss_aux_layer_3": 0.3720703125, "loss_aux_layer_4": 0.37060546875, "loss_aux_layer_5": 0.384765625, "loss_aux_layer_6": 0.35302734375, "loss_aux_layer_7": 0.3740234375, "loss_aux_layer_8": 0.4013671875, "loss_aux_layer_9": 0.38525390625, "step": 49, "total_loss": 1.173454910516739 }, { "epoch": 0.009899029895070284, "grad_norm": 2.490652322769165, "learning_rate": 2.45e-05, "llm_loss": 0.7892317622900009, "loss": 4.9089, "loss_aux_layer_0": 0.50048828125, "loss_aux_layer_1": 0.5478515625, "loss_aux_layer_10": 0.41455078125, "loss_aux_layer_11": 0.4130859375, "loss_aux_layer_12": 0.4287109375, "loss_aux_layer_13": 0.439453125, "loss_aux_layer_14": 0.4208984375, "loss_aux_layer_15": 0.44970703125, "loss_aux_layer_16": 0.462890625, "loss_aux_layer_17": 0.46484375, "loss_aux_layer_18": 0.4736328125, "loss_aux_layer_19": 0.48193359375, "loss_aux_layer_2": 0.388671875, "loss_aux_layer_20": 0.49072265625, "loss_aux_layer_21": 0.48193359375, "loss_aux_layer_22": 0.50048828125, "loss_aux_layer_23": 0.54296875, "loss_aux_layer_3": 0.365234375, "loss_aux_layer_4": 0.36376953125, "loss_aux_layer_5": 0.37890625, "loss_aux_layer_6": 0.349609375, "loss_aux_layer_7": 0.3701171875, "loss_aux_layer_8": 0.39697265625, "loss_aux_layer_9": 0.38232421875, "step": 50, "total_loss": 1.2272302210330963 }, { "epoch": 0.01009701049297169, "grad_norm": 2.900254726409912, "learning_rate": 2.5e-05, "llm_loss": 0.6401623338460922, "loss": 4.3025, "loss_aux_layer_0": 0.5009765625, "loss_aux_layer_1": 0.5517578125, "loss_aux_layer_10": 0.40869140625, "loss_aux_layer_11": 0.41015625, "loss_aux_layer_12": 0.42578125, "loss_aux_layer_13": 0.43603515625, "loss_aux_layer_14": 0.419921875, "loss_aux_layer_15": 0.44580078125, "loss_aux_layer_16": 0.45703125, "loss_aux_layer_17": 0.45947265625, "loss_aux_layer_18": 0.46923828125, "loss_aux_layer_19": 0.4765625, "loss_aux_layer_2": 0.3916015625, "loss_aux_layer_20": 0.4853515625, "loss_aux_layer_21": 0.47705078125, "loss_aux_layer_22": 0.49365234375, "loss_aux_layer_23": 0.537109375, "loss_aux_layer_3": 0.3662109375, "loss_aux_layer_4": 0.365234375, "loss_aux_layer_5": 0.3779296875, "loss_aux_layer_6": 0.34912109375, "loss_aux_layer_7": 0.36767578125, "loss_aux_layer_8": 0.39599609375, "loss_aux_layer_9": 0.3798828125, "step": 51, "total_loss": 1.0756202191114426 }, { "epoch": 0.010294991090873094, "grad_norm": 2.158536434173584, "learning_rate": 2.5500000000000003e-05, "llm_loss": 0.7302769720554352, "loss": 4.6094, "loss_aux_layer_0": 0.49560546875, "loss_aux_layer_1": 0.5361328125, "loss_aux_layer_10": 0.39453125, "loss_aux_layer_11": 0.39501953125, "loss_aux_layer_12": 0.408203125, "loss_aux_layer_13": 0.42138671875, "loss_aux_layer_14": 0.4072265625, "loss_aux_layer_15": 0.4326171875, "loss_aux_layer_16": 0.44580078125, "loss_aux_layer_17": 0.44775390625, "loss_aux_layer_18": 0.45751953125, "loss_aux_layer_19": 0.46630859375, "loss_aux_layer_2": 0.373046875, "loss_aux_layer_20": 0.47412109375, "loss_aux_layer_21": 0.4658203125, "loss_aux_layer_22": 0.4853515625, "loss_aux_layer_23": 0.529296875, "loss_aux_layer_3": 0.349609375, "loss_aux_layer_4": 0.34814453125, "loss_aux_layer_5": 0.36181640625, "loss_aux_layer_6": 0.3349609375, "loss_aux_layer_7": 0.3544921875, "loss_aux_layer_8": 0.38037109375, "loss_aux_layer_9": 0.36474609375, "step": 52, "total_loss": 1.1523401737213135 }, { "epoch": 0.0104929716887745, "grad_norm": 2.2384846210479736, "learning_rate": 2.6000000000000002e-05, "llm_loss": 0.8019936382770538, "loss": 4.8787, "loss_aux_layer_0": 0.4912109375, "loss_aux_layer_1": 0.5263671875, "loss_aux_layer_10": 0.38916015625, "loss_aux_layer_11": 0.3896484375, "loss_aux_layer_12": 0.40185546875, "loss_aux_layer_13": 0.41552734375, "loss_aux_layer_14": 0.40283203125, "loss_aux_layer_15": 0.427734375, "loss_aux_layer_16": 0.44091796875, "loss_aux_layer_17": 0.4443359375, "loss_aux_layer_18": 0.4560546875, "loss_aux_layer_19": 0.462890625, "loss_aux_layer_2": 0.3671875, "loss_aux_layer_20": 0.46923828125, "loss_aux_layer_21": 0.4619140625, "loss_aux_layer_22": 0.4794921875, "loss_aux_layer_23": 0.5244140625, "loss_aux_layer_3": 0.3447265625, "loss_aux_layer_4": 0.34326171875, "loss_aux_layer_5": 0.35791015625, "loss_aux_layer_6": 0.333984375, "loss_aux_layer_7": 0.35302734375, "loss_aux_layer_8": 0.376953125, "loss_aux_layer_9": 0.361328125, "step": 53, "total_loss": 1.2196750044822693 }, { "epoch": 0.010690952286675905, "grad_norm": 1.5443962812423706, "learning_rate": 2.6500000000000004e-05, "llm_loss": 0.6668048650026321, "loss": 4.3098, "loss_aux_layer_0": 0.4873046875, "loss_aux_layer_1": 0.525390625, "loss_aux_layer_10": 0.380859375, "loss_aux_layer_11": 0.38232421875, "loss_aux_layer_12": 0.39453125, "loss_aux_layer_13": 0.40869140625, "loss_aux_layer_14": 0.396484375, "loss_aux_layer_15": 0.41943359375, "loss_aux_layer_16": 0.43115234375, "loss_aux_layer_17": 0.43408203125, "loss_aux_layer_18": 0.44482421875, "loss_aux_layer_19": 0.4521484375, "loss_aux_layer_2": 0.36328125, "loss_aux_layer_20": 0.4599609375, "loss_aux_layer_21": 0.453125, "loss_aux_layer_22": 0.47119140625, "loss_aux_layer_23": 0.51953125, "loss_aux_layer_3": 0.33935546875, "loss_aux_layer_4": 0.33740234375, "loss_aux_layer_5": 0.3505859375, "loss_aux_layer_6": 0.32861328125, "loss_aux_layer_7": 0.3466796875, "loss_aux_layer_8": 0.37158203125, "loss_aux_layer_9": 0.3544921875, "step": 54, "total_loss": 1.0774544775485992 }, { "epoch": 0.010888932884577311, "grad_norm": 1.523149013519287, "learning_rate": 2.7000000000000002e-05, "llm_loss": 0.7385991960763931, "loss": 4.5776, "loss_aux_layer_0": 0.48193359375, "loss_aux_layer_1": 0.5224609375, "loss_aux_layer_10": 0.37451171875, "loss_aux_layer_11": 0.37646484375, "loss_aux_layer_12": 0.3876953125, "loss_aux_layer_13": 0.40380859375, "loss_aux_layer_14": 0.392578125, "loss_aux_layer_15": 0.41650390625, "loss_aux_layer_16": 0.42724609375, "loss_aux_layer_17": 0.43115234375, "loss_aux_layer_18": 0.4423828125, "loss_aux_layer_19": 0.44921875, "loss_aux_layer_2": 0.357421875, "loss_aux_layer_20": 0.455078125, "loss_aux_layer_21": 0.44873046875, "loss_aux_layer_22": 0.4658203125, "loss_aux_layer_23": 0.5166015625, "loss_aux_layer_3": 0.33349609375, "loss_aux_layer_4": 0.3330078125, "loss_aux_layer_5": 0.345703125, "loss_aux_layer_6": 0.32470703125, "loss_aux_layer_7": 0.341796875, "loss_aux_layer_8": 0.3662109375, "loss_aux_layer_9": 0.34765625, "step": 55, "total_loss": 1.1444041728973389 }, { "epoch": 0.011086913482478717, "grad_norm": 1.8930385112762451, "learning_rate": 2.7500000000000004e-05, "llm_loss": 0.7007550299167633, "loss": 4.4009, "loss_aux_layer_0": 0.48388671875, "loss_aux_layer_1": 0.5146484375, "loss_aux_layer_10": 0.3681640625, "loss_aux_layer_11": 0.3701171875, "loss_aux_layer_12": 0.380859375, "loss_aux_layer_13": 0.39697265625, "loss_aux_layer_14": 0.38720703125, "loss_aux_layer_15": 0.40771484375, "loss_aux_layer_16": 0.4189453125, "loss_aux_layer_17": 0.42138671875, "loss_aux_layer_18": 0.4345703125, "loss_aux_layer_19": 0.43994140625, "loss_aux_layer_2": 0.3525390625, "loss_aux_layer_20": 0.447265625, "loss_aux_layer_21": 0.4404296875, "loss_aux_layer_22": 0.45556640625, "loss_aux_layer_23": 0.5068359375, "loss_aux_layer_3": 0.32958984375, "loss_aux_layer_4": 0.328125, "loss_aux_layer_5": 0.34130859375, "loss_aux_layer_6": 0.31982421875, "loss_aux_layer_7": 0.33740234375, "loss_aux_layer_8": 0.361328125, "loss_aux_layer_9": 0.34326171875, "step": 56, "total_loss": 1.1002250462770462 }, { "epoch": 0.011284894080380123, "grad_norm": 1.7978429794311523, "learning_rate": 2.8000000000000003e-05, "llm_loss": 0.7924078404903412, "loss": 4.7027, "loss_aux_layer_0": 0.4677734375, "loss_aux_layer_1": 0.48974609375, "loss_aux_layer_10": 0.35302734375, "loss_aux_layer_11": 0.35302734375, "loss_aux_layer_12": 0.3642578125, "loss_aux_layer_13": 0.3818359375, "loss_aux_layer_14": 0.3720703125, "loss_aux_layer_15": 0.39501953125, "loss_aux_layer_16": 0.40576171875, "loss_aux_layer_17": 0.40966796875, "loss_aux_layer_18": 0.42333984375, "loss_aux_layer_19": 0.42724609375, "loss_aux_layer_2": 0.32763671875, "loss_aux_layer_20": 0.43505859375, "loss_aux_layer_21": 0.42919921875, "loss_aux_layer_22": 0.44287109375, "loss_aux_layer_23": 0.494140625, "loss_aux_layer_3": 0.30712890625, "loss_aux_layer_4": 0.30517578125, "loss_aux_layer_5": 0.3212890625, "loss_aux_layer_6": 0.30126953125, "loss_aux_layer_7": 0.3212890625, "loss_aux_layer_8": 0.3427734375, "loss_aux_layer_9": 0.32666015625, "step": 57, "total_loss": 1.1756692230701447 }, { "epoch": 0.011482874678281529, "grad_norm": 1.5522390604019165, "learning_rate": 2.8499999999999998e-05, "llm_loss": 0.8941142708063126, "loss": 5.0986, "loss_aux_layer_0": 0.46533203125, "loss_aux_layer_1": 0.48681640625, "loss_aux_layer_10": 0.34912109375, "loss_aux_layer_11": 0.35009765625, "loss_aux_layer_12": 0.361328125, "loss_aux_layer_13": 0.38037109375, "loss_aux_layer_14": 0.37060546875, "loss_aux_layer_15": 0.392578125, "loss_aux_layer_16": 0.40234375, "loss_aux_layer_17": 0.40625, "loss_aux_layer_18": 0.4189453125, "loss_aux_layer_19": 0.42333984375, "loss_aux_layer_2": 0.32568359375, "loss_aux_layer_20": 0.4296875, "loss_aux_layer_21": 0.42333984375, "loss_aux_layer_22": 0.4384765625, "loss_aux_layer_23": 0.49072265625, "loss_aux_layer_3": 0.3046875, "loss_aux_layer_4": 0.3046875, "loss_aux_layer_5": 0.31982421875, "loss_aux_layer_6": 0.3017578125, "loss_aux_layer_7": 0.31982421875, "loss_aux_layer_8": 0.34130859375, "loss_aux_layer_9": 0.32373046875, "step": 58, "total_loss": 1.2746431231498718 }, { "epoch": 0.011680855276182935, "grad_norm": 1.1714683771133423, "learning_rate": 2.9e-05, "llm_loss": 0.6277713030576706, "loss": 4.0205, "loss_aux_layer_0": 0.46484375, "loss_aux_layer_1": 0.4814453125, "loss_aux_layer_10": 0.34619140625, "loss_aux_layer_11": 0.34765625, "loss_aux_layer_12": 0.3583984375, "loss_aux_layer_13": 0.37646484375, "loss_aux_layer_14": 0.3681640625, "loss_aux_layer_15": 0.388671875, "loss_aux_layer_16": 0.39794921875, "loss_aux_layer_17": 0.40283203125, "loss_aux_layer_18": 0.41455078125, "loss_aux_layer_19": 0.41845703125, "loss_aux_layer_2": 0.32275390625, "loss_aux_layer_20": 0.42626953125, "loss_aux_layer_21": 0.41845703125, "loss_aux_layer_22": 0.43310546875, "loss_aux_layer_23": 0.48681640625, "loss_aux_layer_3": 0.302734375, "loss_aux_layer_4": 0.302734375, "loss_aux_layer_5": 0.31884765625, "loss_aux_layer_6": 0.2998046875, "loss_aux_layer_7": 0.31787109375, "loss_aux_layer_8": 0.3388671875, "loss_aux_layer_9": 0.3232421875, "step": 59, "total_loss": 1.0051161348819733 }, { "epoch": 0.01187883587408434, "grad_norm": 1.5100634098052979, "learning_rate": 2.95e-05, "llm_loss": 0.7935314923524857, "loss": 4.6356, "loss_aux_layer_0": 0.45263671875, "loss_aux_layer_1": 0.46484375, "loss_aux_layer_10": 0.33544921875, "loss_aux_layer_11": 0.3359375, "loss_aux_layer_12": 0.34619140625, "loss_aux_layer_13": 0.36572265625, "loss_aux_layer_14": 0.35693359375, "loss_aux_layer_15": 0.3779296875, "loss_aux_layer_16": 0.388671875, "loss_aux_layer_17": 0.39404296875, "loss_aux_layer_18": 0.40625, "loss_aux_layer_19": 0.41015625, "loss_aux_layer_2": 0.306640625, "loss_aux_layer_20": 0.416015625, "loss_aux_layer_21": 0.40771484375, "loss_aux_layer_22": 0.42041015625, "loss_aux_layer_23": 0.4765625, "loss_aux_layer_3": 0.287109375, "loss_aux_layer_4": 0.2880859375, "loss_aux_layer_5": 0.30419921875, "loss_aux_layer_6": 0.28759765625, "loss_aux_layer_7": 0.3056640625, "loss_aux_layer_8": 0.32568359375, "loss_aux_layer_9": 0.31103515625, "step": 60, "total_loss": 1.1589007079601288 }, { "epoch": 0.012076816471985746, "grad_norm": 1.5110162496566772, "learning_rate": 3e-05, "llm_loss": 0.758757933974266, "loss": 4.5184, "loss_aux_layer_0": 0.46142578125, "loss_aux_layer_1": 0.478515625, "loss_aux_layer_10": 0.33740234375, "loss_aux_layer_11": 0.33984375, "loss_aux_layer_12": 0.349609375, "loss_aux_layer_13": 0.3681640625, "loss_aux_layer_14": 0.35986328125, "loss_aux_layer_15": 0.38037109375, "loss_aux_layer_16": 0.390625, "loss_aux_layer_17": 0.39599609375, "loss_aux_layer_18": 0.408203125, "loss_aux_layer_19": 0.412109375, "loss_aux_layer_2": 0.32373046875, "loss_aux_layer_20": 0.4150390625, "loss_aux_layer_21": 0.40478515625, "loss_aux_layer_22": 0.41796875, "loss_aux_layer_23": 0.474609375, "loss_aux_layer_3": 0.30126953125, "loss_aux_layer_4": 0.30224609375, "loss_aux_layer_5": 0.3154296875, "loss_aux_layer_6": 0.29833984375, "loss_aux_layer_7": 0.3134765625, "loss_aux_layer_8": 0.3349609375, "loss_aux_layer_9": 0.31689453125, "step": 61, "total_loss": 1.1296063363552094 }, { "epoch": 0.01227479706988715, "grad_norm": 1.2599910497665405, "learning_rate": 3.05e-05, "llm_loss": 0.8010696768760681, "loss": 4.6452, "loss_aux_layer_0": 0.44970703125, "loss_aux_layer_1": 0.45703125, "loss_aux_layer_10": 0.32763671875, "loss_aux_layer_11": 0.32958984375, "loss_aux_layer_12": 0.33984375, "loss_aux_layer_13": 0.35986328125, "loss_aux_layer_14": 0.353515625, "loss_aux_layer_15": 0.3740234375, "loss_aux_layer_16": 0.3837890625, "loss_aux_layer_17": 0.388671875, "loss_aux_layer_18": 0.39990234375, "loss_aux_layer_19": 0.40380859375, "loss_aux_layer_2": 0.3056640625, "loss_aux_layer_20": 0.40771484375, "loss_aux_layer_21": 0.39599609375, "loss_aux_layer_22": 0.4111328125, "loss_aux_layer_23": 0.4697265625, "loss_aux_layer_3": 0.28466796875, "loss_aux_layer_4": 0.28564453125, "loss_aux_layer_5": 0.30126953125, "loss_aux_layer_6": 0.28564453125, "loss_aux_layer_7": 0.30224609375, "loss_aux_layer_8": 0.32275390625, "loss_aux_layer_9": 0.30615234375, "step": 62, "total_loss": 1.1612877398729324 }, { "epoch": 0.012472777667788556, "grad_norm": 1.3920843601226807, "learning_rate": 3.1e-05, "llm_loss": 0.7783526331186295, "loss": 4.5253, "loss_aux_layer_0": 0.4453125, "loss_aux_layer_1": 0.44921875, "loss_aux_layer_10": 0.32275390625, "loss_aux_layer_11": 0.32373046875, "loss_aux_layer_12": 0.33349609375, "loss_aux_layer_13": 0.3525390625, "loss_aux_layer_14": 0.34375, "loss_aux_layer_15": 0.3642578125, "loss_aux_layer_16": 0.37451171875, "loss_aux_layer_17": 0.38037109375, "loss_aux_layer_18": 0.3916015625, "loss_aux_layer_19": 0.39501953125, "loss_aux_layer_2": 0.30029296875, "loss_aux_layer_20": 0.39697265625, "loss_aux_layer_21": 0.38427734375, "loss_aux_layer_22": 0.3994140625, "loss_aux_layer_23": 0.458984375, "loss_aux_layer_3": 0.27880859375, "loss_aux_layer_4": 0.28076171875, "loss_aux_layer_5": 0.29638671875, "loss_aux_layer_6": 0.2822265625, "loss_aux_layer_7": 0.2978515625, "loss_aux_layer_8": 0.31787109375, "loss_aux_layer_9": 0.30126953125, "step": 63, "total_loss": 1.131312608718872 }, { "epoch": 0.012670758265689962, "grad_norm": 1.2720237970352173, "learning_rate": 3.15e-05, "llm_loss": 0.7456763088703156, "loss": 4.3992, "loss_aux_layer_0": 0.44921875, "loss_aux_layer_1": 0.4541015625, "loss_aux_layer_10": 0.31982421875, "loss_aux_layer_11": 0.3212890625, "loss_aux_layer_12": 0.33203125, "loss_aux_layer_13": 0.3525390625, "loss_aux_layer_14": 0.3466796875, "loss_aux_layer_15": 0.36767578125, "loss_aux_layer_16": 0.3779296875, "loss_aux_layer_17": 0.3818359375, "loss_aux_layer_18": 0.39306640625, "loss_aux_layer_19": 0.396484375, "loss_aux_layer_2": 0.3076171875, "loss_aux_layer_20": 0.39453125, "loss_aux_layer_21": 0.38037109375, "loss_aux_layer_22": 0.39599609375, "loss_aux_layer_23": 0.4560546875, "loss_aux_layer_3": 0.28466796875, "loss_aux_layer_4": 0.28564453125, "loss_aux_layer_5": 0.2998046875, "loss_aux_layer_6": 0.28515625, "loss_aux_layer_7": 0.298828125, "loss_aux_layer_8": 0.318359375, "loss_aux_layer_9": 0.2998046875, "step": 64, "total_loss": 1.0998035371303558 }, { "epoch": 0.012868738863591368, "grad_norm": 1.651153802871704, "learning_rate": 3.2000000000000005e-05, "llm_loss": 0.6957023739814758, "loss": 4.1479, "loss_aux_layer_0": 0.43798828125, "loss_aux_layer_1": 0.43408203125, "loss_aux_layer_10": 0.30859375, "loss_aux_layer_11": 0.3095703125, "loss_aux_layer_12": 0.318359375, "loss_aux_layer_13": 0.33837890625, "loss_aux_layer_14": 0.333984375, "loss_aux_layer_15": 0.35498046875, "loss_aux_layer_16": 0.3662109375, "loss_aux_layer_17": 0.37109375, "loss_aux_layer_18": 0.38134765625, "loss_aux_layer_19": 0.3837890625, "loss_aux_layer_2": 0.291015625, "loss_aux_layer_20": 0.3818359375, "loss_aux_layer_21": 0.3662109375, "loss_aux_layer_22": 0.38330078125, "loss_aux_layer_23": 0.4453125, "loss_aux_layer_3": 0.2705078125, "loss_aux_layer_4": 0.27197265625, "loss_aux_layer_5": 0.28759765625, "loss_aux_layer_6": 0.2734375, "loss_aux_layer_7": 0.28759765625, "loss_aux_layer_8": 0.3056640625, "loss_aux_layer_9": 0.2890625, "step": 65, "total_loss": 1.0369626581668854 }, { "epoch": 0.013066719461492774, "grad_norm": 1.0568724870681763, "learning_rate": 3.2500000000000004e-05, "llm_loss": 0.6719686686992645, "loss": 4.0424, "loss_aux_layer_0": 0.44140625, "loss_aux_layer_1": 0.42529296875, "loss_aux_layer_10": 0.30419921875, "loss_aux_layer_11": 0.30517578125, "loss_aux_layer_12": 0.31494140625, "loss_aux_layer_13": 0.3349609375, "loss_aux_layer_14": 0.32958984375, "loss_aux_layer_15": 0.3505859375, "loss_aux_layer_16": 0.36083984375, "loss_aux_layer_17": 0.3681640625, "loss_aux_layer_18": 0.37841796875, "loss_aux_layer_19": 0.3818359375, "loss_aux_layer_2": 0.29150390625, "loss_aux_layer_20": 0.37841796875, "loss_aux_layer_21": 0.3623046875, "loss_aux_layer_22": 0.37939453125, "loss_aux_layer_23": 0.44140625, "loss_aux_layer_3": 0.2705078125, "loss_aux_layer_4": 0.27099609375, "loss_aux_layer_5": 0.28759765625, "loss_aux_layer_6": 0.27294921875, "loss_aux_layer_7": 0.28662109375, "loss_aux_layer_8": 0.30322265625, "loss_aux_layer_9": 0.28662109375, "step": 66, "total_loss": 1.0105955451726913 }, { "epoch": 0.01326470005939418, "grad_norm": 1.1538602113723755, "learning_rate": 3.3e-05, "llm_loss": 0.8007726818323135, "loss": 4.5062, "loss_aux_layer_0": 0.42578125, "loss_aux_layer_1": 0.40625, "loss_aux_layer_10": 0.29833984375, "loss_aux_layer_11": 0.296875, "loss_aux_layer_12": 0.3046875, "loss_aux_layer_13": 0.32421875, "loss_aux_layer_14": 0.318359375, "loss_aux_layer_15": 0.34033203125, "loss_aux_layer_16": 0.3505859375, "loss_aux_layer_17": 0.35693359375, "loss_aux_layer_18": 0.36572265625, "loss_aux_layer_19": 0.36865234375, "loss_aux_layer_2": 0.27392578125, "loss_aux_layer_20": 0.36279296875, "loss_aux_layer_21": 0.34423828125, "loss_aux_layer_22": 0.36376953125, "loss_aux_layer_23": 0.42626953125, "loss_aux_layer_3": 0.253662109375, "loss_aux_layer_4": 0.2548828125, "loss_aux_layer_5": 0.27392578125, "loss_aux_layer_6": 0.2607421875, "loss_aux_layer_7": 0.2763671875, "loss_aux_layer_8": 0.29150390625, "loss_aux_layer_9": 0.2783203125, "step": 67, "total_loss": 1.1265491247177124 }, { "epoch": 0.013462680657295585, "grad_norm": 1.145885705947876, "learning_rate": 3.35e-05, "llm_loss": 0.7198586016893387, "loss": 4.1864, "loss_aux_layer_0": 0.42578125, "loss_aux_layer_1": 0.40966796875, "loss_aux_layer_10": 0.2958984375, "loss_aux_layer_11": 0.294921875, "loss_aux_layer_12": 0.3037109375, "loss_aux_layer_13": 0.32470703125, "loss_aux_layer_14": 0.3193359375, "loss_aux_layer_15": 0.34228515625, "loss_aux_layer_16": 0.3515625, "loss_aux_layer_17": 0.35791015625, "loss_aux_layer_18": 0.36572265625, "loss_aux_layer_19": 0.3671875, "loss_aux_layer_2": 0.28125, "loss_aux_layer_20": 0.361328125, "loss_aux_layer_21": 0.3427734375, "loss_aux_layer_22": 0.361328125, "loss_aux_layer_23": 0.42578125, "loss_aux_layer_3": 0.26025390625, "loss_aux_layer_4": 0.2607421875, "loss_aux_layer_5": 0.2783203125, "loss_aux_layer_6": 0.2646484375, "loss_aux_layer_7": 0.27734375, "loss_aux_layer_8": 0.29296875, "loss_aux_layer_9": 0.2783203125, "step": 68, "total_loss": 1.0465900003910065 }, { "epoch": 0.013660661255196991, "grad_norm": 1.0547683238983154, "learning_rate": 3.4000000000000007e-05, "llm_loss": 0.7332123219966888, "loss": 4.2051, "loss_aux_layer_0": 0.419921875, "loss_aux_layer_1": 0.40087890625, "loss_aux_layer_10": 0.287109375, "loss_aux_layer_11": 0.2861328125, "loss_aux_layer_12": 0.29345703125, "loss_aux_layer_13": 0.31494140625, "loss_aux_layer_14": 0.310546875, "loss_aux_layer_15": 0.3330078125, "loss_aux_layer_16": 0.3427734375, "loss_aux_layer_17": 0.34912109375, "loss_aux_layer_18": 0.35791015625, "loss_aux_layer_19": 0.35791015625, "loss_aux_layer_2": 0.275390625, "loss_aux_layer_20": 0.35107421875, "loss_aux_layer_21": 0.3310546875, "loss_aux_layer_22": 0.35009765625, "loss_aux_layer_23": 0.41455078125, "loss_aux_layer_3": 0.253173828125, "loss_aux_layer_4": 0.253662109375, "loss_aux_layer_5": 0.2705078125, "loss_aux_layer_6": 0.25732421875, "loss_aux_layer_7": 0.27001953125, "loss_aux_layer_8": 0.28466796875, "loss_aux_layer_9": 0.2685546875, "step": 69, "total_loss": 1.0512639731168747 }, { "epoch": 0.013858641853098397, "grad_norm": 1.1113286018371582, "learning_rate": 3.45e-05, "llm_loss": 0.6781223937869072, "loss": 3.9537, "loss_aux_layer_0": 0.41845703125, "loss_aux_layer_1": 0.3837890625, "loss_aux_layer_10": 0.28076171875, "loss_aux_layer_11": 0.28076171875, "loss_aux_layer_12": 0.28857421875, "loss_aux_layer_13": 0.3095703125, "loss_aux_layer_14": 0.3056640625, "loss_aux_layer_15": 0.328125, "loss_aux_layer_16": 0.337890625, "loss_aux_layer_17": 0.34326171875, "loss_aux_layer_18": 0.35009765625, "loss_aux_layer_19": 0.3486328125, "loss_aux_layer_2": 0.2646484375, "loss_aux_layer_20": 0.3408203125, "loss_aux_layer_21": 0.31884765625, "loss_aux_layer_22": 0.33740234375, "loss_aux_layer_23": 0.40283203125, "loss_aux_layer_3": 0.244384765625, "loss_aux_layer_4": 0.245361328125, "loss_aux_layer_5": 0.26416015625, "loss_aux_layer_6": 0.250732421875, "loss_aux_layer_7": 0.26416015625, "loss_aux_layer_8": 0.2763671875, "loss_aux_layer_9": 0.26220703125, "step": 70, "total_loss": 0.9884175062179565 }, { "epoch": 0.014056622450999801, "grad_norm": 1.0996885299682617, "learning_rate": 3.5e-05, "llm_loss": 0.6915289610624313, "loss": 3.998, "loss_aux_layer_0": 0.4111328125, "loss_aux_layer_1": 0.3798828125, "loss_aux_layer_10": 0.27880859375, "loss_aux_layer_11": 0.27734375, "loss_aux_layer_12": 0.28466796875, "loss_aux_layer_13": 0.30322265625, "loss_aux_layer_14": 0.30078125, "loss_aux_layer_15": 0.32373046875, "loss_aux_layer_16": 0.333984375, "loss_aux_layer_17": 0.33837890625, "loss_aux_layer_18": 0.34619140625, "loss_aux_layer_19": 0.34521484375, "loss_aux_layer_2": 0.26806640625, "loss_aux_layer_20": 0.33642578125, "loss_aux_layer_21": 0.31396484375, "loss_aux_layer_22": 0.3349609375, "loss_aux_layer_23": 0.40087890625, "loss_aux_layer_3": 0.24609375, "loss_aux_layer_4": 0.24658203125, "loss_aux_layer_5": 0.26513671875, "loss_aux_layer_6": 0.253173828125, "loss_aux_layer_7": 0.263671875, "loss_aux_layer_8": 0.27685546875, "loss_aux_layer_9": 0.26171875, "step": 71, "total_loss": 0.9995009452104568 }, { "epoch": 0.014254603048901207, "grad_norm": 0.9989221692085266, "learning_rate": 3.55e-05, "llm_loss": 0.7158781290054321, "loss": 4.0578, "loss_aux_layer_0": 0.39892578125, "loss_aux_layer_1": 0.36669921875, "loss_aux_layer_10": 0.26806640625, "loss_aux_layer_11": 0.26708984375, "loss_aux_layer_12": 0.2744140625, "loss_aux_layer_13": 0.2939453125, "loss_aux_layer_14": 0.29296875, "loss_aux_layer_15": 0.31689453125, "loss_aux_layer_16": 0.3271484375, "loss_aux_layer_17": 0.33203125, "loss_aux_layer_18": 0.337890625, "loss_aux_layer_19": 0.3359375, "loss_aux_layer_2": 0.25830078125, "loss_aux_layer_20": 0.326171875, "loss_aux_layer_21": 0.30322265625, "loss_aux_layer_22": 0.3251953125, "loss_aux_layer_23": 0.3896484375, "loss_aux_layer_3": 0.237548828125, "loss_aux_layer_4": 0.238525390625, "loss_aux_layer_5": 0.256591796875, "loss_aux_layer_6": 0.244873046875, "loss_aux_layer_7": 0.256103515625, "loss_aux_layer_8": 0.266845703125, "loss_aux_layer_9": 0.2529296875, "step": 72, "total_loss": 1.0144595801830292 }, { "epoch": 0.014452583646802613, "grad_norm": 0.9640619158744812, "learning_rate": 3.6e-05, "llm_loss": 0.7050826847553253, "loss": 3.989, "loss_aux_layer_0": 0.3974609375, "loss_aux_layer_1": 0.353515625, "loss_aux_layer_10": 0.26318359375, "loss_aux_layer_11": 0.259765625, "loss_aux_layer_12": 0.26708984375, "loss_aux_layer_13": 0.28515625, "loss_aux_layer_14": 0.28564453125, "loss_aux_layer_15": 0.3095703125, "loss_aux_layer_16": 0.3212890625, "loss_aux_layer_17": 0.3271484375, "loss_aux_layer_18": 0.33154296875, "loss_aux_layer_19": 0.33056640625, "loss_aux_layer_2": 0.25146484375, "loss_aux_layer_20": 0.3193359375, "loss_aux_layer_21": 0.29833984375, "loss_aux_layer_22": 0.3193359375, "loss_aux_layer_23": 0.3818359375, "loss_aux_layer_3": 0.23095703125, "loss_aux_layer_4": 0.232666015625, "loss_aux_layer_5": 0.250732421875, "loss_aux_layer_6": 0.239501953125, "loss_aux_layer_7": 0.25, "loss_aux_layer_8": 0.259765625, "loss_aux_layer_9": 0.247314453125, "step": 73, "total_loss": 0.9972402155399323 }, { "epoch": 0.014650564244704019, "grad_norm": 0.9142438173294067, "learning_rate": 3.65e-05, "llm_loss": 0.6747009009122849, "loss": 3.8528, "loss_aux_layer_0": 0.3916015625, "loss_aux_layer_1": 0.3515625, "loss_aux_layer_10": 0.26025390625, "loss_aux_layer_11": 0.25732421875, "loss_aux_layer_12": 0.26416015625, "loss_aux_layer_13": 0.28173828125, "loss_aux_layer_14": 0.28271484375, "loss_aux_layer_15": 0.30615234375, "loss_aux_layer_16": 0.31591796875, "loss_aux_layer_17": 0.32177734375, "loss_aux_layer_18": 0.32568359375, "loss_aux_layer_19": 0.32275390625, "loss_aux_layer_2": 0.25244140625, "loss_aux_layer_20": 0.3095703125, "loss_aux_layer_21": 0.28955078125, "loss_aux_layer_22": 0.310546875, "loss_aux_layer_23": 0.37158203125, "loss_aux_layer_3": 0.232666015625, "loss_aux_layer_4": 0.2333984375, "loss_aux_layer_5": 0.250732421875, "loss_aux_layer_6": 0.239013671875, "loss_aux_layer_7": 0.24853515625, "loss_aux_layer_8": 0.25830078125, "loss_aux_layer_9": 0.245361328125, "step": 74, "total_loss": 0.9632090032100677 }, { "epoch": 0.014848544842605425, "grad_norm": 0.9678594470024109, "learning_rate": 3.7e-05, "llm_loss": 0.6968090087175369, "loss": 3.9144, "loss_aux_layer_0": 0.3837890625, "loss_aux_layer_1": 0.34912109375, "loss_aux_layer_10": 0.251708984375, "loss_aux_layer_11": 0.24853515625, "loss_aux_layer_12": 0.25439453125, "loss_aux_layer_13": 0.27294921875, "loss_aux_layer_14": 0.275390625, "loss_aux_layer_15": 0.298828125, "loss_aux_layer_16": 0.30859375, "loss_aux_layer_17": 0.31298828125, "loss_aux_layer_18": 0.3173828125, "loss_aux_layer_19": 0.31201171875, "loss_aux_layer_2": 0.251708984375, "loss_aux_layer_20": 0.30029296875, "loss_aux_layer_21": 0.283203125, "loss_aux_layer_22": 0.30419921875, "loss_aux_layer_23": 0.36181640625, "loss_aux_layer_3": 0.230224609375, "loss_aux_layer_4": 0.230712890625, "loss_aux_layer_5": 0.247314453125, "loss_aux_layer_6": 0.2353515625, "loss_aux_layer_7": 0.242919921875, "loss_aux_layer_8": 0.2509765625, "loss_aux_layer_9": 0.236328125, "step": 75, "total_loss": 0.9786095917224884 }, { "epoch": 0.01504652544050683, "grad_norm": 0.9761227965354919, "learning_rate": 3.7500000000000003e-05, "llm_loss": 0.8269226849079132, "loss": 4.4134, "loss_aux_layer_0": 0.3828125, "loss_aux_layer_1": 0.333984375, "loss_aux_layer_10": 0.24853515625, "loss_aux_layer_11": 0.244140625, "loss_aux_layer_12": 0.250244140625, "loss_aux_layer_13": 0.265625, "loss_aux_layer_14": 0.26953125, "loss_aux_layer_15": 0.2939453125, "loss_aux_layer_16": 0.3046875, "loss_aux_layer_17": 0.30810546875, "loss_aux_layer_18": 0.31298828125, "loss_aux_layer_19": 0.30908203125, "loss_aux_layer_2": 0.24365234375, "loss_aux_layer_20": 0.2958984375, "loss_aux_layer_21": 0.2822265625, "loss_aux_layer_22": 0.30029296875, "loss_aux_layer_23": 0.35498046875, "loss_aux_layer_3": 0.2236328125, "loss_aux_layer_4": 0.22509765625, "loss_aux_layer_5": 0.2412109375, "loss_aux_layer_6": 0.22998046875, "loss_aux_layer_7": 0.23779296875, "loss_aux_layer_8": 0.2451171875, "loss_aux_layer_9": 0.2333984375, "step": 76, "total_loss": 1.103357270359993 }, { "epoch": 0.015244506038408236, "grad_norm": 0.9041405320167542, "learning_rate": 3.8e-05, "llm_loss": 0.6252636611461639, "loss": 3.5941, "loss_aux_layer_0": 0.3759765625, "loss_aux_layer_1": 0.3310546875, "loss_aux_layer_10": 0.2431640625, "loss_aux_layer_11": 0.239013671875, "loss_aux_layer_12": 0.2451171875, "loss_aux_layer_13": 0.260009765625, "loss_aux_layer_14": 0.26611328125, "loss_aux_layer_15": 0.28857421875, "loss_aux_layer_16": 0.29833984375, "loss_aux_layer_17": 0.302734375, "loss_aux_layer_18": 0.30615234375, "loss_aux_layer_19": 0.30029296875, "loss_aux_layer_2": 0.2451171875, "loss_aux_layer_20": 0.28857421875, "loss_aux_layer_21": 0.2822265625, "loss_aux_layer_22": 0.30224609375, "loss_aux_layer_23": 0.3564453125, "loss_aux_layer_3": 0.22509765625, "loss_aux_layer_4": 0.22509765625, "loss_aux_layer_5": 0.240966796875, "loss_aux_layer_6": 0.22900390625, "loss_aux_layer_7": 0.235595703125, "loss_aux_layer_8": 0.24267578125, "loss_aux_layer_9": 0.22900390625, "step": 77, "total_loss": 0.898519366979599 }, { "epoch": 0.015442486636309642, "grad_norm": 0.9084277153015137, "learning_rate": 3.85e-05, "llm_loss": 0.7125933319330215, "loss": 3.9185, "loss_aux_layer_0": 0.36865234375, "loss_aux_layer_1": 0.318359375, "loss_aux_layer_10": 0.234619140625, "loss_aux_layer_11": 0.23046875, "loss_aux_layer_12": 0.23876953125, "loss_aux_layer_13": 0.253173828125, "loss_aux_layer_14": 0.26123046875, "loss_aux_layer_15": 0.2841796875, "loss_aux_layer_16": 0.2958984375, "loss_aux_layer_17": 0.2978515625, "loss_aux_layer_18": 0.302734375, "loss_aux_layer_19": 0.2958984375, "loss_aux_layer_2": 0.238525390625, "loss_aux_layer_20": 0.28515625, "loss_aux_layer_21": 0.27978515625, "loss_aux_layer_22": 0.2978515625, "loss_aux_layer_23": 0.34765625, "loss_aux_layer_3": 0.21923828125, "loss_aux_layer_4": 0.219482421875, "loss_aux_layer_5": 0.234375, "loss_aux_layer_6": 0.22216796875, "loss_aux_layer_7": 0.2275390625, "loss_aux_layer_8": 0.234130859375, "loss_aux_layer_9": 0.220947265625, "step": 78, "total_loss": 0.9796221554279327 }, { "epoch": 0.015640467234211048, "grad_norm": 0.9047238826751709, "learning_rate": 3.9000000000000006e-05, "llm_loss": 0.6478418558835983, "loss": 3.6562, "loss_aux_layer_0": 0.36279296875, "loss_aux_layer_1": 0.31689453125, "loss_aux_layer_10": 0.232421875, "loss_aux_layer_11": 0.22802734375, "loss_aux_layer_12": 0.235595703125, "loss_aux_layer_13": 0.2490234375, "loss_aux_layer_14": 0.2607421875, "loss_aux_layer_15": 0.28173828125, "loss_aux_layer_16": 0.29345703125, "loss_aux_layer_17": 0.29541015625, "loss_aux_layer_18": 0.30029296875, "loss_aux_layer_19": 0.29296875, "loss_aux_layer_2": 0.2412109375, "loss_aux_layer_20": 0.28369140625, "loss_aux_layer_21": 0.28271484375, "loss_aux_layer_22": 0.302734375, "loss_aux_layer_23": 0.3515625, "loss_aux_layer_3": 0.22119140625, "loss_aux_layer_4": 0.22119140625, "loss_aux_layer_5": 0.234130859375, "loss_aux_layer_6": 0.221923828125, "loss_aux_layer_7": 0.22607421875, "loss_aux_layer_8": 0.232666015625, "loss_aux_layer_9": 0.218017578125, "step": 79, "total_loss": 0.9140619784593582 }, { "epoch": 0.015838447832112454, "grad_norm": 0.8588495254516602, "learning_rate": 3.9500000000000005e-05, "llm_loss": 0.6534857004880905, "loss": 3.6493, "loss_aux_layer_0": 0.35400390625, "loss_aux_layer_1": 0.30810546875, "loss_aux_layer_10": 0.22265625, "loss_aux_layer_11": 0.21826171875, "loss_aux_layer_12": 0.227783203125, "loss_aux_layer_13": 0.240966796875, "loss_aux_layer_14": 0.25341796875, "loss_aux_layer_15": 0.2744140625, "loss_aux_layer_16": 0.28662109375, "loss_aux_layer_17": 0.2880859375, "loss_aux_layer_18": 0.29345703125, "loss_aux_layer_19": 0.28564453125, "loss_aux_layer_2": 0.236083984375, "loss_aux_layer_20": 0.27734375, "loss_aux_layer_21": 0.2763671875, "loss_aux_layer_22": 0.294921875, "loss_aux_layer_23": 0.3447265625, "loss_aux_layer_3": 0.21630859375, "loss_aux_layer_4": 0.216552734375, "loss_aux_layer_5": 0.228515625, "loss_aux_layer_6": 0.21630859375, "loss_aux_layer_7": 0.21875, "loss_aux_layer_8": 0.223876953125, "loss_aux_layer_9": 0.209716796875, "step": 80, "total_loss": 0.9123155027627945 }, { "epoch": 0.01603642843001386, "grad_norm": 1.0829777717590332, "learning_rate": 4e-05, "llm_loss": 0.7547879368066788, "loss": 4.0475, "loss_aux_layer_0": 0.3505859375, "loss_aux_layer_1": 0.30224609375, "loss_aux_layer_10": 0.218994140625, "loss_aux_layer_11": 0.214111328125, "loss_aux_layer_12": 0.22412109375, "loss_aux_layer_13": 0.2373046875, "loss_aux_layer_14": 0.25244140625, "loss_aux_layer_15": 0.27099609375, "loss_aux_layer_16": 0.28515625, "loss_aux_layer_17": 0.28759765625, "loss_aux_layer_18": 0.29345703125, "loss_aux_layer_19": 0.2880859375, "loss_aux_layer_2": 0.23486328125, "loss_aux_layer_20": 0.27978515625, "loss_aux_layer_21": 0.27783203125, "loss_aux_layer_22": 0.29443359375, "loss_aux_layer_23": 0.34326171875, "loss_aux_layer_3": 0.2158203125, "loss_aux_layer_4": 0.21533203125, "loss_aux_layer_5": 0.226806640625, "loss_aux_layer_6": 0.21435546875, "loss_aux_layer_7": 0.215576171875, "loss_aux_layer_8": 0.2197265625, "loss_aux_layer_9": 0.20654296875, "step": 81, "total_loss": 1.0118629187345505 }, { "epoch": 0.016234409027915266, "grad_norm": 0.8157814741134644, "learning_rate": 4.05e-05, "llm_loss": 0.6595746278762817, "loss": 3.6344, "loss_aux_layer_0": 0.33447265625, "loss_aux_layer_1": 0.28759765625, "loss_aux_layer_10": 0.21044921875, "loss_aux_layer_11": 0.205322265625, "loss_aux_layer_12": 0.215576171875, "loss_aux_layer_13": 0.2275390625, "loss_aux_layer_14": 0.24462890625, "loss_aux_layer_15": 0.263671875, "loss_aux_layer_16": 0.27685546875, "loss_aux_layer_17": 0.27783203125, "loss_aux_layer_18": 0.28466796875, "loss_aux_layer_19": 0.27880859375, "loss_aux_layer_2": 0.22607421875, "loss_aux_layer_20": 0.2724609375, "loss_aux_layer_21": 0.27392578125, "loss_aux_layer_22": 0.29296875, "loss_aux_layer_23": 0.34521484375, "loss_aux_layer_3": 0.208251953125, "loss_aux_layer_4": 0.207763671875, "loss_aux_layer_5": 0.218994140625, "loss_aux_layer_6": 0.2060546875, "loss_aux_layer_7": 0.206787109375, "loss_aux_layer_8": 0.211181640625, "loss_aux_layer_9": 0.198486328125, "step": 82, "total_loss": 0.9086000025272369 }, { "epoch": 0.01643238962581667, "grad_norm": 0.9022448062896729, "learning_rate": 4.1e-05, "llm_loss": 0.8384801596403122, "loss": 4.3562, "loss_aux_layer_0": 0.330078125, "loss_aux_layer_1": 0.2880859375, "loss_aux_layer_10": 0.208984375, "loss_aux_layer_11": 0.205322265625, "loss_aux_layer_12": 0.217529296875, "loss_aux_layer_13": 0.22998046875, "loss_aux_layer_14": 0.248779296875, "loss_aux_layer_15": 0.265625, "loss_aux_layer_16": 0.28076171875, "loss_aux_layer_17": 0.28076171875, "loss_aux_layer_18": 0.28857421875, "loss_aux_layer_19": 0.28076171875, "loss_aux_layer_2": 0.22998046875, "loss_aux_layer_20": 0.27490234375, "loss_aux_layer_21": 0.27392578125, "loss_aux_layer_22": 0.29345703125, "loss_aux_layer_23": 0.3447265625, "loss_aux_layer_3": 0.212158203125, "loss_aux_layer_4": 0.211669921875, "loss_aux_layer_5": 0.2216796875, "loss_aux_layer_6": 0.20947265625, "loss_aux_layer_7": 0.20751953125, "loss_aux_layer_8": 0.2119140625, "loss_aux_layer_9": 0.19775390625, "step": 83, "total_loss": 1.089057445526123 }, { "epoch": 0.016630370223718077, "grad_norm": 0.819305956363678, "learning_rate": 4.15e-05, "llm_loss": 0.7241410315036774, "loss": 3.8874, "loss_aux_layer_0": 0.32568359375, "loss_aux_layer_1": 0.27880859375, "loss_aux_layer_10": 0.205322265625, "loss_aux_layer_11": 0.20166015625, "loss_aux_layer_12": 0.21435546875, "loss_aux_layer_13": 0.226806640625, "loss_aux_layer_14": 0.24755859375, "loss_aux_layer_15": 0.265625, "loss_aux_layer_16": 0.279296875, "loss_aux_layer_17": 0.27978515625, "loss_aux_layer_18": 0.28662109375, "loss_aux_layer_19": 0.28076171875, "loss_aux_layer_2": 0.224365234375, "loss_aux_layer_20": 0.2744140625, "loss_aux_layer_21": 0.27294921875, "loss_aux_layer_22": 0.29296875, "loss_aux_layer_23": 0.34521484375, "loss_aux_layer_3": 0.208251953125, "loss_aux_layer_4": 0.207763671875, "loss_aux_layer_5": 0.21728515625, "loss_aux_layer_6": 0.205078125, "loss_aux_layer_7": 0.203369140625, "loss_aux_layer_8": 0.206787109375, "loss_aux_layer_9": 0.194580078125, "step": 84, "total_loss": 0.9718446433544159 }, { "epoch": 0.01682835082161948, "grad_norm": 0.7815269231796265, "learning_rate": 4.2e-05, "llm_loss": 0.7168776392936707, "loss": 3.838, "loss_aux_layer_0": 0.31884765625, "loss_aux_layer_1": 0.271484375, "loss_aux_layer_10": 0.197509765625, "loss_aux_layer_11": 0.1953125, "loss_aux_layer_12": 0.209228515625, "loss_aux_layer_13": 0.224609375, "loss_aux_layer_14": 0.24462890625, "loss_aux_layer_15": 0.26220703125, "loss_aux_layer_16": 0.27685546875, "loss_aux_layer_17": 0.27685546875, "loss_aux_layer_18": 0.2841796875, "loss_aux_layer_19": 0.27734375, "loss_aux_layer_2": 0.220458984375, "loss_aux_layer_20": 0.271484375, "loss_aux_layer_21": 0.2685546875, "loss_aux_layer_22": 0.28515625, "loss_aux_layer_23": 0.33154296875, "loss_aux_layer_3": 0.2041015625, "loss_aux_layer_4": 0.205078125, "loss_aux_layer_5": 0.21337890625, "loss_aux_layer_6": 0.200439453125, "loss_aux_layer_7": 0.197265625, "loss_aux_layer_8": 0.19970703125, "loss_aux_layer_9": 0.187255859375, "step": 85, "total_loss": 0.9595025926828384 }, { "epoch": 0.017026331419520885, "grad_norm": 0.8114915490150452, "learning_rate": 4.25e-05, "llm_loss": 0.7145734429359436, "loss": 3.8337, "loss_aux_layer_0": 0.31103515625, "loss_aux_layer_1": 0.26953125, "loss_aux_layer_10": 0.19677734375, "loss_aux_layer_11": 0.19677734375, "loss_aux_layer_12": 0.211669921875, "loss_aux_layer_13": 0.225341796875, "loss_aux_layer_14": 0.246337890625, "loss_aux_layer_15": 0.2646484375, "loss_aux_layer_16": 0.279296875, "loss_aux_layer_17": 0.2802734375, "loss_aux_layer_18": 0.2861328125, "loss_aux_layer_19": 0.27978515625, "loss_aux_layer_2": 0.22216796875, "loss_aux_layer_20": 0.2744140625, "loss_aux_layer_21": 0.2705078125, "loss_aux_layer_22": 0.2880859375, "loss_aux_layer_23": 0.3388671875, "loss_aux_layer_3": 0.206298828125, "loss_aux_layer_4": 0.205810546875, "loss_aux_layer_5": 0.2138671875, "loss_aux_layer_6": 0.200439453125, "loss_aux_layer_7": 0.196044921875, "loss_aux_layer_8": 0.198974609375, "loss_aux_layer_9": 0.187255859375, "step": 86, "total_loss": 0.958415225148201 }, { "epoch": 0.01722431201742229, "grad_norm": 0.8516019582748413, "learning_rate": 4.3e-05, "llm_loss": 0.749930128455162, "loss": 3.9603, "loss_aux_layer_0": 0.30419921875, "loss_aux_layer_1": 0.25927734375, "loss_aux_layer_10": 0.191162109375, "loss_aux_layer_11": 0.192138671875, "loss_aux_layer_12": 0.2080078125, "loss_aux_layer_13": 0.22314453125, "loss_aux_layer_14": 0.243896484375, "loss_aux_layer_15": 0.26171875, "loss_aux_layer_16": 0.27685546875, "loss_aux_layer_17": 0.27783203125, "loss_aux_layer_18": 0.2841796875, "loss_aux_layer_19": 0.27880859375, "loss_aux_layer_2": 0.216552734375, "loss_aux_layer_20": 0.27392578125, "loss_aux_layer_21": 0.2705078125, "loss_aux_layer_22": 0.2890625, "loss_aux_layer_23": 0.33935546875, "loss_aux_layer_3": 0.201416015625, "loss_aux_layer_4": 0.20068359375, "loss_aux_layer_5": 0.208251953125, "loss_aux_layer_6": 0.1953125, "loss_aux_layer_7": 0.1904296875, "loss_aux_layer_8": 0.192626953125, "loss_aux_layer_9": 0.18212890625, "step": 87, "total_loss": 0.990078017115593 }, { "epoch": 0.017422292615323697, "grad_norm": 0.8042641878128052, "learning_rate": 4.35e-05, "llm_loss": 0.7020066976547241, "loss": 3.7598, "loss_aux_layer_0": 0.29248046875, "loss_aux_layer_1": 0.25146484375, "loss_aux_layer_10": 0.18798828125, "loss_aux_layer_11": 0.1904296875, "loss_aux_layer_12": 0.206298828125, "loss_aux_layer_13": 0.220947265625, "loss_aux_layer_14": 0.2421875, "loss_aux_layer_15": 0.26171875, "loss_aux_layer_16": 0.275390625, "loss_aux_layer_17": 0.27734375, "loss_aux_layer_18": 0.2841796875, "loss_aux_layer_19": 0.27880859375, "loss_aux_layer_2": 0.213623046875, "loss_aux_layer_20": 0.2734375, "loss_aux_layer_21": 0.271484375, "loss_aux_layer_22": 0.28955078125, "loss_aux_layer_23": 0.34130859375, "loss_aux_layer_3": 0.198974609375, "loss_aux_layer_4": 0.198974609375, "loss_aux_layer_5": 0.2060546875, "loss_aux_layer_6": 0.19287109375, "loss_aux_layer_7": 0.187255859375, "loss_aux_layer_8": 0.189453125, "loss_aux_layer_9": 0.18017578125, "step": 88, "total_loss": 0.9399459213018417 }, { "epoch": 0.017620273213225103, "grad_norm": 1.1307638883590698, "learning_rate": 4.4000000000000006e-05, "llm_loss": 0.7118002325296402, "loss": 3.7837, "loss_aux_layer_0": 0.28759765625, "loss_aux_layer_1": 0.247314453125, "loss_aux_layer_10": 0.18408203125, "loss_aux_layer_11": 0.18798828125, "loss_aux_layer_12": 0.203857421875, "loss_aux_layer_13": 0.21875, "loss_aux_layer_14": 0.2392578125, "loss_aux_layer_15": 0.2568359375, "loss_aux_layer_16": 0.27001953125, "loss_aux_layer_17": 0.2705078125, "loss_aux_layer_18": 0.27978515625, "loss_aux_layer_19": 0.271484375, "loss_aux_layer_2": 0.2119140625, "loss_aux_layer_20": 0.26806640625, "loss_aux_layer_21": 0.26611328125, "loss_aux_layer_22": 0.2841796875, "loss_aux_layer_23": 0.33447265625, "loss_aux_layer_3": 0.197509765625, "loss_aux_layer_4": 0.197021484375, "loss_aux_layer_5": 0.2041015625, "loss_aux_layer_6": 0.190185546875, "loss_aux_layer_7": 0.184326171875, "loss_aux_layer_8": 0.186279296875, "loss_aux_layer_9": 0.1767578125, "step": 89, "total_loss": 0.9459202736616135 }, { "epoch": 0.01781825381112651, "grad_norm": 0.8389466404914856, "learning_rate": 4.4500000000000004e-05, "llm_loss": 0.7829442173242569, "loss": 4.0579, "loss_aux_layer_0": 0.2783203125, "loss_aux_layer_1": 0.23681640625, "loss_aux_layer_10": 0.179931640625, "loss_aux_layer_11": 0.1845703125, "loss_aux_layer_12": 0.20166015625, "loss_aux_layer_13": 0.216796875, "loss_aux_layer_14": 0.238037109375, "loss_aux_layer_15": 0.2568359375, "loss_aux_layer_16": 0.27197265625, "loss_aux_layer_17": 0.27294921875, "loss_aux_layer_18": 0.28125, "loss_aux_layer_19": 0.27490234375, "loss_aux_layer_2": 0.205322265625, "loss_aux_layer_20": 0.27001953125, "loss_aux_layer_21": 0.26904296875, "loss_aux_layer_22": 0.28759765625, "loss_aux_layer_23": 0.33935546875, "loss_aux_layer_3": 0.191162109375, "loss_aux_layer_4": 0.19091796875, "loss_aux_layer_5": 0.197021484375, "loss_aux_layer_6": 0.18359375, "loss_aux_layer_7": 0.177978515625, "loss_aux_layer_8": 0.1796875, "loss_aux_layer_9": 0.171875, "step": 90, "total_loss": 1.014483630657196 }, { "epoch": 0.018016234409027915, "grad_norm": 0.7855562567710876, "learning_rate": 4.5e-05, "llm_loss": 0.7303827255964279, "loss": 3.8729, "loss_aux_layer_0": 0.283203125, "loss_aux_layer_1": 0.249267578125, "loss_aux_layer_10": 0.185302734375, "loss_aux_layer_11": 0.1923828125, "loss_aux_layer_12": 0.208251953125, "loss_aux_layer_13": 0.223388671875, "loss_aux_layer_14": 0.2431640625, "loss_aux_layer_15": 0.2607421875, "loss_aux_layer_16": 0.27490234375, "loss_aux_layer_17": 0.27587890625, "loss_aux_layer_18": 0.28369140625, "loss_aux_layer_19": 0.27685546875, "loss_aux_layer_2": 0.21875, "loss_aux_layer_20": 0.27197265625, "loss_aux_layer_21": 0.2705078125, "loss_aux_layer_22": 0.2890625, "loss_aux_layer_23": 0.337890625, "loss_aux_layer_3": 0.204345703125, "loss_aux_layer_4": 0.202880859375, "loss_aux_layer_5": 0.208251953125, "loss_aux_layer_6": 0.193603515625, "loss_aux_layer_7": 0.18603515625, "loss_aux_layer_8": 0.187255859375, "loss_aux_layer_9": 0.17919921875, "step": 91, "total_loss": 0.968223363161087 }, { "epoch": 0.01821421500692932, "grad_norm": 0.7036027908325195, "learning_rate": 4.55e-05, "llm_loss": 0.6653390228748322, "loss": 3.6128, "loss_aux_layer_0": 0.2802734375, "loss_aux_layer_1": 0.2451171875, "loss_aux_layer_10": 0.18701171875, "loss_aux_layer_11": 0.195068359375, "loss_aux_layer_12": 0.2109375, "loss_aux_layer_13": 0.2255859375, "loss_aux_layer_14": 0.24560546875, "loss_aux_layer_15": 0.261474609375, "loss_aux_layer_16": 0.27490234375, "loss_aux_layer_17": 0.275390625, "loss_aux_layer_18": 0.2822265625, "loss_aux_layer_19": 0.2744140625, "loss_aux_layer_2": 0.21728515625, "loss_aux_layer_20": 0.26904296875, "loss_aux_layer_21": 0.26953125, "loss_aux_layer_22": 0.2880859375, "loss_aux_layer_23": 0.33935546875, "loss_aux_layer_3": 0.203857421875, "loss_aux_layer_4": 0.203125, "loss_aux_layer_5": 0.208740234375, "loss_aux_layer_6": 0.194091796875, "loss_aux_layer_7": 0.187255859375, "loss_aux_layer_8": 0.189208984375, "loss_aux_layer_9": 0.181640625, "step": 92, "total_loss": 0.9032057523727417 }, { "epoch": 0.018412195604830726, "grad_norm": 0.6875094771385193, "learning_rate": 4.600000000000001e-05, "llm_loss": 0.6963517814874649, "loss": 3.7038, "loss_aux_layer_0": 0.26953125, "loss_aux_layer_1": 0.22998046875, "loss_aux_layer_10": 0.177734375, "loss_aux_layer_11": 0.186279296875, "loss_aux_layer_12": 0.2021484375, "loss_aux_layer_13": 0.216796875, "loss_aux_layer_14": 0.2373046875, "loss_aux_layer_15": 0.25341796875, "loss_aux_layer_16": 0.26904296875, "loss_aux_layer_17": 0.26953125, "loss_aux_layer_18": 0.27685546875, "loss_aux_layer_19": 0.27099609375, "loss_aux_layer_2": 0.205322265625, "loss_aux_layer_20": 0.26611328125, "loss_aux_layer_21": 0.265625, "loss_aux_layer_22": 0.2841796875, "loss_aux_layer_23": 0.33642578125, "loss_aux_layer_3": 0.19287109375, "loss_aux_layer_4": 0.19189453125, "loss_aux_layer_5": 0.197509765625, "loss_aux_layer_6": 0.183349609375, "loss_aux_layer_7": 0.17626953125, "loss_aux_layer_8": 0.177734375, "loss_aux_layer_9": 0.172607421875, "step": 93, "total_loss": 0.925958439707756 }, { "epoch": 0.018610176202732132, "grad_norm": 0.7022325396537781, "learning_rate": 4.6500000000000005e-05, "llm_loss": 0.6836456507444382, "loss": 3.6766, "loss_aux_layer_0": 0.2666015625, "loss_aux_layer_1": 0.235595703125, "loss_aux_layer_10": 0.18359375, "loss_aux_layer_11": 0.194091796875, "loss_aux_layer_12": 0.20947265625, "loss_aux_layer_13": 0.224609375, "loss_aux_layer_14": 0.24560546875, "loss_aux_layer_15": 0.261962890625, "loss_aux_layer_16": 0.2763671875, "loss_aux_layer_17": 0.27587890625, "loss_aux_layer_18": 0.28271484375, "loss_aux_layer_19": 0.27587890625, "loss_aux_layer_2": 0.2138671875, "loss_aux_layer_20": 0.2705078125, "loss_aux_layer_21": 0.2685546875, "loss_aux_layer_22": 0.287109375, "loss_aux_layer_23": 0.33642578125, "loss_aux_layer_3": 0.20166015625, "loss_aux_layer_4": 0.199462890625, "loss_aux_layer_5": 0.205078125, "loss_aux_layer_6": 0.1904296875, "loss_aux_layer_7": 0.1826171875, "loss_aux_layer_8": 0.184814453125, "loss_aux_layer_9": 0.17919921875, "step": 94, "total_loss": 0.9191510677337646 }, { "epoch": 0.018808156800633538, "grad_norm": 0.7507929801940918, "learning_rate": 4.7e-05, "llm_loss": 0.681625097990036, "loss": 3.6512, "loss_aux_layer_0": 0.26318359375, "loss_aux_layer_1": 0.230224609375, "loss_aux_layer_10": 0.1806640625, "loss_aux_layer_11": 0.1904296875, "loss_aux_layer_12": 0.206298828125, "loss_aux_layer_13": 0.22021484375, "loss_aux_layer_14": 0.2412109375, "loss_aux_layer_15": 0.256591796875, "loss_aux_layer_16": 0.271484375, "loss_aux_layer_17": 0.27001953125, "loss_aux_layer_18": 0.27685546875, "loss_aux_layer_19": 0.2705078125, "loss_aux_layer_2": 0.21044921875, "loss_aux_layer_20": 0.26513671875, "loss_aux_layer_21": 0.2626953125, "loss_aux_layer_22": 0.28125, "loss_aux_layer_23": 0.33251953125, "loss_aux_layer_3": 0.1982421875, "loss_aux_layer_4": 0.1962890625, "loss_aux_layer_5": 0.201416015625, "loss_aux_layer_6": 0.186279296875, "loss_aux_layer_7": 0.17919921875, "loss_aux_layer_8": 0.180419921875, "loss_aux_layer_9": 0.1767578125, "step": 95, "total_loss": 0.9127983003854752 }, { "epoch": 0.019006137398534944, "grad_norm": 0.7960907816886902, "learning_rate": 4.75e-05, "llm_loss": 0.6880190074443817, "loss": 3.6641, "loss_aux_layer_0": 0.25390625, "loss_aux_layer_1": 0.220458984375, "loss_aux_layer_10": 0.17626953125, "loss_aux_layer_11": 0.1865234375, "loss_aux_layer_12": 0.202392578125, "loss_aux_layer_13": 0.21728515625, "loss_aux_layer_14": 0.238037109375, "loss_aux_layer_15": 0.25537109375, "loss_aux_layer_16": 0.27001953125, "loss_aux_layer_17": 0.26904296875, "loss_aux_layer_18": 0.27685546875, "loss_aux_layer_19": 0.27099609375, "loss_aux_layer_2": 0.203125, "loss_aux_layer_20": 0.26708984375, "loss_aux_layer_21": 0.26611328125, "loss_aux_layer_22": 0.28466796875, "loss_aux_layer_23": 0.333984375, "loss_aux_layer_3": 0.19189453125, "loss_aux_layer_4": 0.19091796875, "loss_aux_layer_5": 0.19580078125, "loss_aux_layer_6": 0.181396484375, "loss_aux_layer_7": 0.17431640625, "loss_aux_layer_8": 0.17529296875, "loss_aux_layer_9": 0.17236328125, "step": 96, "total_loss": 0.9160227924585342 }, { "epoch": 0.01920411799643635, "grad_norm": 0.6751354932785034, "learning_rate": 4.8e-05, "llm_loss": 0.6756804287433624, "loss": 3.6188, "loss_aux_layer_0": 0.2509765625, "loss_aux_layer_1": 0.22021484375, "loss_aux_layer_10": 0.178955078125, "loss_aux_layer_11": 0.18896484375, "loss_aux_layer_12": 0.204345703125, "loss_aux_layer_13": 0.21875, "loss_aux_layer_14": 0.240234375, "loss_aux_layer_15": 0.2568359375, "loss_aux_layer_16": 0.2724609375, "loss_aux_layer_17": 0.27197265625, "loss_aux_layer_18": 0.279296875, "loss_aux_layer_19": 0.27294921875, "loss_aux_layer_2": 0.205810546875, "loss_aux_layer_20": 0.267578125, "loss_aux_layer_21": 0.263671875, "loss_aux_layer_22": 0.28125, "loss_aux_layer_23": 0.33203125, "loss_aux_layer_3": 0.193115234375, "loss_aux_layer_4": 0.191650390625, "loss_aux_layer_5": 0.1962890625, "loss_aux_layer_6": 0.182373046875, "loss_aux_layer_7": 0.17529296875, "loss_aux_layer_8": 0.1767578125, "loss_aux_layer_9": 0.174560546875, "step": 97, "total_loss": 0.9047103673219681 }, { "epoch": 0.019402098594337756, "grad_norm": 0.6887392401695251, "learning_rate": 4.85e-05, "llm_loss": 0.7005340456962585, "loss": 3.7298, "loss_aux_layer_0": 0.248291015625, "loss_aux_layer_1": 0.22314453125, "loss_aux_layer_10": 0.180908203125, "loss_aux_layer_11": 0.191650390625, "loss_aux_layer_12": 0.20751953125, "loss_aux_layer_13": 0.22216796875, "loss_aux_layer_14": 0.2431640625, "loss_aux_layer_15": 0.2587890625, "loss_aux_layer_16": 0.27490234375, "loss_aux_layer_17": 0.2744140625, "loss_aux_layer_18": 0.28125, "loss_aux_layer_19": 0.2744140625, "loss_aux_layer_2": 0.20947265625, "loss_aux_layer_20": 0.26904296875, "loss_aux_layer_21": 0.267578125, "loss_aux_layer_22": 0.28662109375, "loss_aux_layer_23": 0.33837890625, "loss_aux_layer_3": 0.1982421875, "loss_aux_layer_4": 0.196044921875, "loss_aux_layer_5": 0.200439453125, "loss_aux_layer_6": 0.186279296875, "loss_aux_layer_7": 0.1787109375, "loss_aux_layer_8": 0.1796875, "loss_aux_layer_9": 0.177001953125, "step": 98, "total_loss": 0.9324555695056915 }, { "epoch": 0.01960007919223916, "grad_norm": 0.6540769338607788, "learning_rate": 4.9e-05, "llm_loss": 0.6901282668113708, "loss": 3.6641, "loss_aux_layer_0": 0.240234375, "loss_aux_layer_1": 0.213623046875, "loss_aux_layer_10": 0.17333984375, "loss_aux_layer_11": 0.18408203125, "loss_aux_layer_12": 0.2001953125, "loss_aux_layer_13": 0.21533203125, "loss_aux_layer_14": 0.2373046875, "loss_aux_layer_15": 0.25439453125, "loss_aux_layer_16": 0.26953125, "loss_aux_layer_17": 0.26953125, "loss_aux_layer_18": 0.27587890625, "loss_aux_layer_19": 0.26953125, "loss_aux_layer_2": 0.2021484375, "loss_aux_layer_20": 0.26611328125, "loss_aux_layer_21": 0.26513671875, "loss_aux_layer_22": 0.28564453125, "loss_aux_layer_23": 0.33740234375, "loss_aux_layer_3": 0.19091796875, "loss_aux_layer_4": 0.187744140625, "loss_aux_layer_5": 0.192138671875, "loss_aux_layer_6": 0.17822265625, "loss_aux_layer_7": 0.171142578125, "loss_aux_layer_8": 0.171630859375, "loss_aux_layer_9": 0.17041015625, "step": 99, "total_loss": 0.9160333424806595 }, { "epoch": 0.019798059790140567, "grad_norm": 0.7875279188156128, "learning_rate": 4.9500000000000004e-05, "llm_loss": 0.6372517496347427, "loss": 3.4321, "loss_aux_layer_0": 0.226806640625, "loss_aux_layer_1": 0.201171875, "loss_aux_layer_10": 0.170166015625, "loss_aux_layer_11": 0.179931640625, "loss_aux_layer_12": 0.196044921875, "loss_aux_layer_13": 0.211181640625, "loss_aux_layer_14": 0.23291015625, "loss_aux_layer_15": 0.25048828125, "loss_aux_layer_16": 0.26611328125, "loss_aux_layer_17": 0.2666015625, "loss_aux_layer_18": 0.2734375, "loss_aux_layer_19": 0.26708984375, "loss_aux_layer_2": 0.192626953125, "loss_aux_layer_20": 0.2626953125, "loss_aux_layer_21": 0.2626953125, "loss_aux_layer_22": 0.28125, "loss_aux_layer_23": 0.33349609375, "loss_aux_layer_3": 0.18310546875, "loss_aux_layer_4": 0.18115234375, "loss_aux_layer_5": 0.185791015625, "loss_aux_layer_6": 0.17333984375, "loss_aux_layer_7": 0.16650390625, "loss_aux_layer_8": 0.16748046875, "loss_aux_layer_9": 0.166748046875, "step": 100, "total_loss": 0.8580133765935898 }, { "epoch": 0.019996040388041973, "grad_norm": 0.6987673044204712, "learning_rate": 5e-05, "llm_loss": 0.7283210307359695, "loss": 3.8051, "loss_aux_layer_0": 0.223388671875, "loss_aux_layer_1": 0.205322265625, "loss_aux_layer_10": 0.173095703125, "loss_aux_layer_11": 0.1826171875, "loss_aux_layer_12": 0.198974609375, "loss_aux_layer_13": 0.214111328125, "loss_aux_layer_14": 0.236083984375, "loss_aux_layer_15": 0.252685546875, "loss_aux_layer_16": 0.267578125, "loss_aux_layer_17": 0.2685546875, "loss_aux_layer_18": 0.275390625, "loss_aux_layer_19": 0.2705078125, "loss_aux_layer_2": 0.197509765625, "loss_aux_layer_20": 0.26513671875, "loss_aux_layer_21": 0.2626953125, "loss_aux_layer_22": 0.28173828125, "loss_aux_layer_23": 0.3330078125, "loss_aux_layer_3": 0.1875, "loss_aux_layer_4": 0.184326171875, "loss_aux_layer_5": 0.188232421875, "loss_aux_layer_6": 0.1748046875, "loss_aux_layer_7": 0.16845703125, "loss_aux_layer_8": 0.169677734375, "loss_aux_layer_9": 0.16943359375, "step": 101, "total_loss": 0.9512735307216644 }, { "epoch": 0.02019402098594338, "grad_norm": 0.6552653908729553, "learning_rate": 5e-05, "llm_loss": 0.6141287982463837, "loss": 3.3815, "loss_aux_layer_0": 0.2314453125, "loss_aux_layer_1": 0.2177734375, "loss_aux_layer_10": 0.182373046875, "loss_aux_layer_11": 0.19384765625, "loss_aux_layer_12": 0.210693359375, "loss_aux_layer_13": 0.2255859375, "loss_aux_layer_14": 0.245361328125, "loss_aux_layer_15": 0.26025390625, "loss_aux_layer_16": 0.27392578125, "loss_aux_layer_17": 0.2734375, "loss_aux_layer_18": 0.27978515625, "loss_aux_layer_19": 0.27294921875, "loss_aux_layer_2": 0.20947265625, "loss_aux_layer_20": 0.26806640625, "loss_aux_layer_21": 0.26513671875, "loss_aux_layer_22": 0.2861328125, "loss_aux_layer_23": 0.3349609375, "loss_aux_layer_3": 0.19970703125, "loss_aux_layer_4": 0.1962890625, "loss_aux_layer_5": 0.19970703125, "loss_aux_layer_6": 0.186279296875, "loss_aux_layer_7": 0.178466796875, "loss_aux_layer_8": 0.179931640625, "loss_aux_layer_9": 0.17919921875, "step": 102, "total_loss": 0.8453681170940399 }, { "epoch": 0.020392001583844785, "grad_norm": 0.7991744875907898, "learning_rate": 5e-05, "llm_loss": 0.8209573328495026, "loss": 4.19, "loss_aux_layer_0": 0.2177734375, "loss_aux_layer_1": 0.209228515625, "loss_aux_layer_10": 0.17919921875, "loss_aux_layer_11": 0.18994140625, "loss_aux_layer_12": 0.20654296875, "loss_aux_layer_13": 0.22119140625, "loss_aux_layer_14": 0.241455078125, "loss_aux_layer_15": 0.25634765625, "loss_aux_layer_16": 0.271484375, "loss_aux_layer_17": 0.2705078125, "loss_aux_layer_18": 0.27783203125, "loss_aux_layer_19": 0.27001953125, "loss_aux_layer_2": 0.203369140625, "loss_aux_layer_20": 0.2646484375, "loss_aux_layer_21": 0.2626953125, "loss_aux_layer_22": 0.2802734375, "loss_aux_layer_23": 0.33154296875, "loss_aux_layer_3": 0.193115234375, "loss_aux_layer_4": 0.18994140625, "loss_aux_layer_5": 0.193359375, "loss_aux_layer_6": 0.180908203125, "loss_aux_layer_7": 0.173828125, "loss_aux_layer_8": 0.17578125, "loss_aux_layer_9": 0.175537109375, "step": 103, "total_loss": 1.0475096851587296 }, { "epoch": 0.020589982181746187, "grad_norm": 1.0398026704788208, "learning_rate": 5e-05, "llm_loss": 0.6167902201414108, "loss": 3.3427, "loss_aux_layer_0": 0.20703125, "loss_aux_layer_1": 0.19775390625, "loss_aux_layer_10": 0.1708984375, "loss_aux_layer_11": 0.181396484375, "loss_aux_layer_12": 0.1982421875, "loss_aux_layer_13": 0.21240234375, "loss_aux_layer_14": 0.233154296875, "loss_aux_layer_15": 0.249755859375, "loss_aux_layer_16": 0.265625, "loss_aux_layer_17": 0.26611328125, "loss_aux_layer_18": 0.27294921875, "loss_aux_layer_19": 0.26708984375, "loss_aux_layer_2": 0.19140625, "loss_aux_layer_20": 0.26123046875, "loss_aux_layer_21": 0.2587890625, "loss_aux_layer_22": 0.2763671875, "loss_aux_layer_23": 0.32861328125, "loss_aux_layer_3": 0.182861328125, "loss_aux_layer_4": 0.17919921875, "loss_aux_layer_5": 0.18310546875, "loss_aux_layer_6": 0.17138671875, "loss_aux_layer_7": 0.164794921875, "loss_aux_layer_8": 0.16650390625, "loss_aux_layer_9": 0.166748046875, "step": 104, "total_loss": 0.8356680572032928 }, { "epoch": 0.020787962779647593, "grad_norm": 0.7654334902763367, "learning_rate": 5e-05, "llm_loss": 0.6765755712985992, "loss": 3.5818, "loss_aux_layer_0": 0.2021484375, "loss_aux_layer_1": 0.197998046875, "loss_aux_layer_10": 0.16796875, "loss_aux_layer_11": 0.17822265625, "loss_aux_layer_12": 0.195068359375, "loss_aux_layer_13": 0.21044921875, "loss_aux_layer_14": 0.231689453125, "loss_aux_layer_15": 0.249267578125, "loss_aux_layer_16": 0.26416015625, "loss_aux_layer_17": 0.265625, "loss_aux_layer_18": 0.2744140625, "loss_aux_layer_19": 0.26904296875, "loss_aux_layer_2": 0.19287109375, "loss_aux_layer_20": 0.26513671875, "loss_aux_layer_21": 0.26318359375, "loss_aux_layer_22": 0.28076171875, "loss_aux_layer_23": 0.3330078125, "loss_aux_layer_3": 0.183837890625, "loss_aux_layer_4": 0.17919921875, "loss_aux_layer_5": 0.182861328125, "loss_aux_layer_6": 0.170654296875, "loss_aux_layer_7": 0.163818359375, "loss_aux_layer_8": 0.1650390625, "loss_aux_layer_9": 0.1650390625, "step": 105, "total_loss": 0.8954457491636276 }, { "epoch": 0.020985943377549, "grad_norm": 0.6665189862251282, "learning_rate": 5e-05, "llm_loss": 0.6527702659368515, "loss": 3.4695, "loss_aux_layer_0": 0.1923828125, "loss_aux_layer_1": 0.18505859375, "loss_aux_layer_10": 0.16455078125, "loss_aux_layer_11": 0.175537109375, "loss_aux_layer_12": 0.1923828125, "loss_aux_layer_13": 0.20849609375, "loss_aux_layer_14": 0.22998046875, "loss_aux_layer_15": 0.24755859375, "loss_aux_layer_16": 0.2626953125, "loss_aux_layer_17": 0.265625, "loss_aux_layer_18": 0.27392578125, "loss_aux_layer_19": 0.2685546875, "loss_aux_layer_2": 0.17919921875, "loss_aux_layer_20": 0.26416015625, "loss_aux_layer_21": 0.263671875, "loss_aux_layer_22": 0.28369140625, "loss_aux_layer_23": 0.33544921875, "loss_aux_layer_3": 0.172607421875, "loss_aux_layer_4": 0.169189453125, "loss_aux_layer_5": 0.173095703125, "loss_aux_layer_6": 0.16357421875, "loss_aux_layer_7": 0.158203125, "loss_aux_layer_8": 0.159912109375, "loss_aux_layer_9": 0.16064453125, "step": 106, "total_loss": 0.8673788011074066 }, { "epoch": 0.021183923975450405, "grad_norm": 0.7152649760246277, "learning_rate": 5e-05, "llm_loss": 0.6956348121166229, "loss": 3.6571, "loss_aux_layer_0": 0.19189453125, "loss_aux_layer_1": 0.19384765625, "loss_aux_layer_10": 0.170654296875, "loss_aux_layer_11": 0.181640625, "loss_aux_layer_12": 0.198486328125, "loss_aux_layer_13": 0.21337890625, "loss_aux_layer_14": 0.234130859375, "loss_aux_layer_15": 0.24951171875, "loss_aux_layer_16": 0.26611328125, "loss_aux_layer_17": 0.265625, "loss_aux_layer_18": 0.27392578125, "loss_aux_layer_19": 0.26806640625, "loss_aux_layer_2": 0.188232421875, "loss_aux_layer_20": 0.263671875, "loss_aux_layer_21": 0.26171875, "loss_aux_layer_22": 0.28125, "loss_aux_layer_23": 0.33349609375, "loss_aux_layer_3": 0.18212890625, "loss_aux_layer_4": 0.178466796875, "loss_aux_layer_5": 0.181884765625, "loss_aux_layer_6": 0.170654296875, "loss_aux_layer_7": 0.1650390625, "loss_aux_layer_8": 0.1669921875, "loss_aux_layer_9": 0.1669921875, "step": 107, "total_loss": 0.9142849743366241 }, { "epoch": 0.02138190457335181, "grad_norm": 0.7264949679374695, "learning_rate": 5e-05, "llm_loss": 0.7040694504976273, "loss": 3.7026, "loss_aux_layer_0": 0.189208984375, "loss_aux_layer_1": 0.197509765625, "loss_aux_layer_10": 0.174072265625, "loss_aux_layer_11": 0.18505859375, "loss_aux_layer_12": 0.200927734375, "loss_aux_layer_13": 0.215576171875, "loss_aux_layer_14": 0.237060546875, "loss_aux_layer_15": 0.25244140625, "loss_aux_layer_16": 0.26806640625, "loss_aux_layer_17": 0.26806640625, "loss_aux_layer_18": 0.2763671875, "loss_aux_layer_19": 0.27001953125, "loss_aux_layer_2": 0.19482421875, "loss_aux_layer_20": 0.26611328125, "loss_aux_layer_21": 0.2646484375, "loss_aux_layer_22": 0.28466796875, "loss_aux_layer_23": 0.3369140625, "loss_aux_layer_3": 0.18701171875, "loss_aux_layer_4": 0.182373046875, "loss_aux_layer_5": 0.18505859375, "loss_aux_layer_6": 0.17431640625, "loss_aux_layer_7": 0.16845703125, "loss_aux_layer_8": 0.17041015625, "loss_aux_layer_9": 0.170166015625, "step": 108, "total_loss": 0.925662025809288 }, { "epoch": 0.021579885171253217, "grad_norm": 0.698540985584259, "learning_rate": 5e-05, "llm_loss": 0.6702700555324554, "loss": 3.5518, "loss_aux_layer_0": 0.18017578125, "loss_aux_layer_1": 0.1904296875, "loss_aux_layer_10": 0.17236328125, "loss_aux_layer_11": 0.183837890625, "loss_aux_layer_12": 0.19921875, "loss_aux_layer_13": 0.213134765625, "loss_aux_layer_14": 0.234130859375, "loss_aux_layer_15": 0.25, "loss_aux_layer_16": 0.265625, "loss_aux_layer_17": 0.2646484375, "loss_aux_layer_18": 0.27294921875, "loss_aux_layer_19": 0.26611328125, "loss_aux_layer_2": 0.1884765625, "loss_aux_layer_20": 0.26123046875, "loss_aux_layer_21": 0.2578125, "loss_aux_layer_22": 0.27783203125, "loss_aux_layer_23": 0.330078125, "loss_aux_layer_3": 0.18212890625, "loss_aux_layer_4": 0.17822265625, "loss_aux_layer_5": 0.181884765625, "loss_aux_layer_6": 0.17138671875, "loss_aux_layer_7": 0.165771484375, "loss_aux_layer_8": 0.167724609375, "loss_aux_layer_9": 0.16845703125, "step": 109, "total_loss": 0.8879589289426804 }, { "epoch": 0.021777865769154622, "grad_norm": 0.6574384570121765, "learning_rate": 5e-05, "llm_loss": 0.6518405899405479, "loss": 3.4724, "loss_aux_layer_0": 0.178955078125, "loss_aux_layer_1": 0.18603515625, "loss_aux_layer_10": 0.168212890625, "loss_aux_layer_11": 0.178955078125, "loss_aux_layer_12": 0.19482421875, "loss_aux_layer_13": 0.21044921875, "loss_aux_layer_14": 0.23193359375, "loss_aux_layer_15": 0.2490234375, "loss_aux_layer_16": 0.2646484375, "loss_aux_layer_17": 0.26708984375, "loss_aux_layer_18": 0.2744140625, "loss_aux_layer_19": 0.2685546875, "loss_aux_layer_2": 0.18310546875, "loss_aux_layer_20": 0.26513671875, "loss_aux_layer_21": 0.263671875, "loss_aux_layer_22": 0.28369140625, "loss_aux_layer_23": 0.33349609375, "loss_aux_layer_3": 0.17822265625, "loss_aux_layer_4": 0.173828125, "loss_aux_layer_5": 0.177001953125, "loss_aux_layer_6": 0.16796875, "loss_aux_layer_7": 0.16259765625, "loss_aux_layer_8": 0.16455078125, "loss_aux_layer_9": 0.16455078125, "step": 110, "total_loss": 0.8681059330701828 }, { "epoch": 0.021975846367056028, "grad_norm": 0.6699132323265076, "learning_rate": 5e-05, "llm_loss": 0.7086319327354431, "loss": 3.6875, "loss_aux_layer_0": 0.174072265625, "loss_aux_layer_1": 0.185302734375, "loss_aux_layer_10": 0.165283203125, "loss_aux_layer_11": 0.176025390625, "loss_aux_layer_12": 0.192138671875, "loss_aux_layer_13": 0.2080078125, "loss_aux_layer_14": 0.229248046875, "loss_aux_layer_15": 0.245361328125, "loss_aux_layer_16": 0.26025390625, "loss_aux_layer_17": 0.26123046875, "loss_aux_layer_18": 0.26953125, "loss_aux_layer_19": 0.263671875, "loss_aux_layer_2": 0.18359375, "loss_aux_layer_20": 0.2607421875, "loss_aux_layer_21": 0.2587890625, "loss_aux_layer_22": 0.27978515625, "loss_aux_layer_23": 0.33056640625, "loss_aux_layer_3": 0.177001953125, "loss_aux_layer_4": 0.171875, "loss_aux_layer_5": 0.175048828125, "loss_aux_layer_6": 0.1650390625, "loss_aux_layer_7": 0.159423828125, "loss_aux_layer_8": 0.161376953125, "loss_aux_layer_9": 0.1611328125, "step": 111, "total_loss": 0.9218798875808716 }, { "epoch": 0.022173826964957434, "grad_norm": 0.6176862716674805, "learning_rate": 5e-05, "llm_loss": 0.6790181174874306, "loss": 3.5995, "loss_aux_layer_0": 0.171630859375, "loss_aux_layer_1": 0.193603515625, "loss_aux_layer_10": 0.174560546875, "loss_aux_layer_11": 0.186279296875, "loss_aux_layer_12": 0.202880859375, "loss_aux_layer_13": 0.217529296875, "loss_aux_layer_14": 0.23876953125, "loss_aux_layer_15": 0.254150390625, "loss_aux_layer_16": 0.27001953125, "loss_aux_layer_17": 0.2705078125, "loss_aux_layer_18": 0.2763671875, "loss_aux_layer_19": 0.26953125, "loss_aux_layer_2": 0.195556640625, "loss_aux_layer_20": 0.26513671875, "loss_aux_layer_21": 0.2607421875, "loss_aux_layer_22": 0.2802734375, "loss_aux_layer_23": 0.3310546875, "loss_aux_layer_3": 0.188232421875, "loss_aux_layer_4": 0.182373046875, "loss_aux_layer_5": 0.185302734375, "loss_aux_layer_6": 0.1748046875, "loss_aux_layer_7": 0.16845703125, "loss_aux_layer_8": 0.171142578125, "loss_aux_layer_9": 0.1708984375, "step": 112, "total_loss": 0.8998725563287735 }, { "epoch": 0.02237180756285884, "grad_norm": 0.7617302536964417, "learning_rate": 5e-05, "llm_loss": 0.6994436830282211, "loss": 3.6363, "loss_aux_layer_0": 0.1591796875, "loss_aux_layer_1": 0.173583984375, "loss_aux_layer_10": 0.162841796875, "loss_aux_layer_11": 0.173583984375, "loss_aux_layer_12": 0.189453125, "loss_aux_layer_13": 0.205078125, "loss_aux_layer_14": 0.2265625, "loss_aux_layer_15": 0.2431640625, "loss_aux_layer_16": 0.25927734375, "loss_aux_layer_17": 0.259765625, "loss_aux_layer_18": 0.26904296875, "loss_aux_layer_19": 0.26416015625, "loss_aux_layer_2": 0.17529296875, "loss_aux_layer_20": 0.2607421875, "loss_aux_layer_21": 0.259765625, "loss_aux_layer_22": 0.2802734375, "loss_aux_layer_23": 0.3310546875, "loss_aux_layer_3": 0.17041015625, "loss_aux_layer_4": 0.166015625, "loss_aux_layer_5": 0.16943359375, "loss_aux_layer_6": 0.1611328125, "loss_aux_layer_7": 0.156005859375, "loss_aux_layer_8": 0.158203125, "loss_aux_layer_9": 0.159423828125, "step": 113, "total_loss": 0.9090714305639267 }, { "epoch": 0.022569788160760246, "grad_norm": 0.9152628183364868, "learning_rate": 5e-05, "llm_loss": 0.662375420331955, "loss": 3.5201, "loss_aux_layer_0": 0.16259765625, "loss_aux_layer_1": 0.184326171875, "loss_aux_layer_10": 0.171630859375, "loss_aux_layer_11": 0.1826171875, "loss_aux_layer_12": 0.198974609375, "loss_aux_layer_13": 0.21484375, "loss_aux_layer_14": 0.236083984375, "loss_aux_layer_15": 0.251708984375, "loss_aux_layer_16": 0.267578125, "loss_aux_layer_17": 0.26708984375, "loss_aux_layer_18": 0.2744140625, "loss_aux_layer_19": 0.267578125, "loss_aux_layer_2": 0.187744140625, "loss_aux_layer_20": 0.26416015625, "loss_aux_layer_21": 0.2626953125, "loss_aux_layer_22": 0.28271484375, "loss_aux_layer_23": 0.33203125, "loss_aux_layer_3": 0.18310546875, "loss_aux_layer_4": 0.177734375, "loss_aux_layer_5": 0.180908203125, "loss_aux_layer_6": 0.171630859375, "loss_aux_layer_7": 0.16552734375, "loss_aux_layer_8": 0.16796875, "loss_aux_layer_9": 0.16796875, "step": 114, "total_loss": 0.8800350427627563 }, { "epoch": 0.02276776875866165, "grad_norm": 0.5783010125160217, "learning_rate": 5e-05, "llm_loss": 0.6378855109214783, "loss": 3.4217, "loss_aux_layer_0": 0.158203125, "loss_aux_layer_1": 0.18408203125, "loss_aux_layer_10": 0.173828125, "loss_aux_layer_11": 0.184326171875, "loss_aux_layer_12": 0.200439453125, "loss_aux_layer_13": 0.215087890625, "loss_aux_layer_14": 0.236083984375, "loss_aux_layer_15": 0.25146484375, "loss_aux_layer_16": 0.26611328125, "loss_aux_layer_17": 0.26708984375, "loss_aux_layer_18": 0.2744140625, "loss_aux_layer_19": 0.267578125, "loss_aux_layer_2": 0.18701171875, "loss_aux_layer_20": 0.2626953125, "loss_aux_layer_21": 0.26171875, "loss_aux_layer_22": 0.2802734375, "loss_aux_layer_23": 0.3310546875, "loss_aux_layer_3": 0.1826171875, "loss_aux_layer_4": 0.177978515625, "loss_aux_layer_5": 0.180908203125, "loss_aux_layer_6": 0.172119140625, "loss_aux_layer_7": 0.166748046875, "loss_aux_layer_8": 0.16943359375, "loss_aux_layer_9": 0.169921875, "step": 115, "total_loss": 0.8554161190986633 }, { "epoch": 0.022965749356563057, "grad_norm": 0.6156476736068726, "learning_rate": 5e-05, "llm_loss": 0.7031547576189041, "loss": 3.6764, "loss_aux_layer_0": 0.155029296875, "loss_aux_layer_1": 0.1826171875, "loss_aux_layer_10": 0.171630859375, "loss_aux_layer_11": 0.18212890625, "loss_aux_layer_12": 0.1982421875, "loss_aux_layer_13": 0.212890625, "loss_aux_layer_14": 0.233154296875, "loss_aux_layer_15": 0.24853515625, "loss_aux_layer_16": 0.26416015625, "loss_aux_layer_17": 0.26513671875, "loss_aux_layer_18": 0.2724609375, "loss_aux_layer_19": 0.265625, "loss_aux_layer_2": 0.1875, "loss_aux_layer_20": 0.26318359375, "loss_aux_layer_21": 0.2607421875, "loss_aux_layer_22": 0.2802734375, "loss_aux_layer_23": 0.33154296875, "loss_aux_layer_3": 0.181884765625, "loss_aux_layer_4": 0.17626953125, "loss_aux_layer_5": 0.178955078125, "loss_aux_layer_6": 0.170654296875, "loss_aux_layer_7": 0.164794921875, "loss_aux_layer_8": 0.16748046875, "loss_aux_layer_9": 0.168212890625, "step": 116, "total_loss": 0.9190990775823593 }, { "epoch": 0.023163729954464463, "grad_norm": 0.6593515276908875, "learning_rate": 5e-05, "llm_loss": 0.714188739657402, "loss": 3.7145, "loss_aux_layer_0": 0.14794921875, "loss_aux_layer_1": 0.177490234375, "loss_aux_layer_10": 0.170166015625, "loss_aux_layer_11": 0.181640625, "loss_aux_layer_12": 0.197998046875, "loss_aux_layer_13": 0.21240234375, "loss_aux_layer_14": 0.23291015625, "loss_aux_layer_15": 0.248779296875, "loss_aux_layer_16": 0.263671875, "loss_aux_layer_17": 0.26513671875, "loss_aux_layer_18": 0.27294921875, "loss_aux_layer_19": 0.26611328125, "loss_aux_layer_2": 0.182861328125, "loss_aux_layer_20": 0.26171875, "loss_aux_layer_21": 0.259765625, "loss_aux_layer_22": 0.2783203125, "loss_aux_layer_23": 0.33056640625, "loss_aux_layer_3": 0.1787109375, "loss_aux_layer_4": 0.173828125, "loss_aux_layer_5": 0.17724609375, "loss_aux_layer_6": 0.169189453125, "loss_aux_layer_7": 0.16357421875, "loss_aux_layer_8": 0.166259765625, "loss_aux_layer_9": 0.166748046875, "step": 117, "total_loss": 0.9286281168460846 }, { "epoch": 0.02336171055236587, "grad_norm": 0.6784913539886475, "learning_rate": 5e-05, "llm_loss": 0.6786830872297287, "loss": 3.5746, "loss_aux_layer_0": 0.150634765625, "loss_aux_layer_1": 0.18017578125, "loss_aux_layer_10": 0.169921875, "loss_aux_layer_11": 0.1806640625, "loss_aux_layer_12": 0.19677734375, "loss_aux_layer_13": 0.212158203125, "loss_aux_layer_14": 0.232666015625, "loss_aux_layer_15": 0.24853515625, "loss_aux_layer_16": 0.2646484375, "loss_aux_layer_17": 0.26611328125, "loss_aux_layer_18": 0.2734375, "loss_aux_layer_19": 0.26611328125, "loss_aux_layer_2": 0.184326171875, "loss_aux_layer_20": 0.2626953125, "loss_aux_layer_21": 0.26123046875, "loss_aux_layer_22": 0.28173828125, "loss_aux_layer_23": 0.32958984375, "loss_aux_layer_3": 0.18017578125, "loss_aux_layer_4": 0.174560546875, "loss_aux_layer_5": 0.177734375, "loss_aux_layer_6": 0.169677734375, "loss_aux_layer_7": 0.164306640625, "loss_aux_layer_8": 0.166015625, "loss_aux_layer_9": 0.166015625, "step": 118, "total_loss": 0.8936508446931839 }, { "epoch": 0.023559691150267275, "grad_norm": 0.5941954851150513, "learning_rate": 5e-05, "llm_loss": 0.6320896148681641, "loss": 3.371, "loss_aux_layer_0": 0.1416015625, "loss_aux_layer_1": 0.17138671875, "loss_aux_layer_10": 0.167724609375, "loss_aux_layer_11": 0.1787109375, "loss_aux_layer_12": 0.195068359375, "loss_aux_layer_13": 0.2099609375, "loss_aux_layer_14": 0.229736328125, "loss_aux_layer_15": 0.245361328125, "loss_aux_layer_16": 0.26025390625, "loss_aux_layer_17": 0.26025390625, "loss_aux_layer_18": 0.26806640625, "loss_aux_layer_19": 0.26171875, "loss_aux_layer_2": 0.1787109375, "loss_aux_layer_20": 0.25830078125, "loss_aux_layer_21": 0.25732421875, "loss_aux_layer_22": 0.275390625, "loss_aux_layer_23": 0.32568359375, "loss_aux_layer_3": 0.1748046875, "loss_aux_layer_4": 0.169921875, "loss_aux_layer_5": 0.173095703125, "loss_aux_layer_6": 0.165283203125, "loss_aux_layer_7": 0.159912109375, "loss_aux_layer_8": 0.162841796875, "loss_aux_layer_9": 0.16357421875, "step": 119, "total_loss": 0.8427391201257706 }, { "epoch": 0.02375767174816868, "grad_norm": 0.6784100532531738, "learning_rate": 5e-05, "llm_loss": 0.7397333383560181, "loss": 3.8119, "loss_aux_layer_0": 0.14306640625, "loss_aux_layer_1": 0.172607421875, "loss_aux_layer_10": 0.169189453125, "loss_aux_layer_11": 0.179443359375, "loss_aux_layer_12": 0.195068359375, "loss_aux_layer_13": 0.21044921875, "loss_aux_layer_14": 0.231201171875, "loss_aux_layer_15": 0.247314453125, "loss_aux_layer_16": 0.26416015625, "loss_aux_layer_17": 0.265625, "loss_aux_layer_18": 0.2734375, "loss_aux_layer_19": 0.267578125, "loss_aux_layer_2": 0.179443359375, "loss_aux_layer_20": 0.2646484375, "loss_aux_layer_21": 0.2607421875, "loss_aux_layer_22": 0.2783203125, "loss_aux_layer_23": 0.328125, "loss_aux_layer_3": 0.1767578125, "loss_aux_layer_4": 0.171875, "loss_aux_layer_5": 0.175537109375, "loss_aux_layer_6": 0.16845703125, "loss_aux_layer_7": 0.163818359375, "loss_aux_layer_8": 0.16552734375, "loss_aux_layer_9": 0.165771484375, "step": 120, "total_loss": 0.9529673010110855 }, { "epoch": 0.023955652346070087, "grad_norm": 0.6060028076171875, "learning_rate": 5e-05, "llm_loss": 0.7377205193042755, "loss": 3.8019, "loss_aux_layer_0": 0.13623046875, "loss_aux_layer_1": 0.1728515625, "loss_aux_layer_10": 0.1689453125, "loss_aux_layer_11": 0.180419921875, "loss_aux_layer_12": 0.19580078125, "loss_aux_layer_13": 0.21142578125, "loss_aux_layer_14": 0.232421875, "loss_aux_layer_15": 0.2490234375, "loss_aux_layer_16": 0.263671875, "loss_aux_layer_17": 0.2646484375, "loss_aux_layer_18": 0.271484375, "loss_aux_layer_19": 0.265625, "loss_aux_layer_2": 0.18115234375, "loss_aux_layer_20": 0.26171875, "loss_aux_layer_21": 0.25927734375, "loss_aux_layer_22": 0.2802734375, "loss_aux_layer_23": 0.33154296875, "loss_aux_layer_3": 0.17724609375, "loss_aux_layer_4": 0.171630859375, "loss_aux_layer_5": 0.17431640625, "loss_aux_layer_6": 0.166748046875, "loss_aux_layer_7": 0.161376953125, "loss_aux_layer_8": 0.164306640625, "loss_aux_layer_9": 0.1650390625, "step": 121, "total_loss": 0.9504655599594116 }, { "epoch": 0.024153632943971493, "grad_norm": 0.6527838706970215, "learning_rate": 5e-05, "llm_loss": 0.7057591080665588, "loss": 3.6616, "loss_aux_layer_0": 0.13720703125, "loss_aux_layer_1": 0.168212890625, "loss_aux_layer_10": 0.1650390625, "loss_aux_layer_11": 0.176025390625, "loss_aux_layer_12": 0.192138671875, "loss_aux_layer_13": 0.206787109375, "loss_aux_layer_14": 0.227783203125, "loss_aux_layer_15": 0.24462890625, "loss_aux_layer_16": 0.2607421875, "loss_aux_layer_17": 0.26220703125, "loss_aux_layer_18": 0.26904296875, "loss_aux_layer_19": 0.26318359375, "loss_aux_layer_2": 0.17578125, "loss_aux_layer_20": 0.259765625, "loss_aux_layer_21": 0.2587890625, "loss_aux_layer_22": 0.2802734375, "loss_aux_layer_23": 0.3310546875, "loss_aux_layer_3": 0.17236328125, "loss_aux_layer_4": 0.166748046875, "loss_aux_layer_5": 0.169921875, "loss_aux_layer_6": 0.163330078125, "loss_aux_layer_7": 0.15771484375, "loss_aux_layer_8": 0.16064453125, "loss_aux_layer_9": 0.16162109375, "step": 122, "total_loss": 0.9153970927000046 }, { "epoch": 0.024351613541872895, "grad_norm": 0.6667352318763733, "learning_rate": 5e-05, "llm_loss": 0.6555888652801514, "loss": 3.4911, "loss_aux_layer_0": 0.137451171875, "loss_aux_layer_1": 0.174072265625, "loss_aux_layer_10": 0.17626953125, "loss_aux_layer_11": 0.18798828125, "loss_aux_layer_12": 0.203857421875, "loss_aux_layer_13": 0.218017578125, "loss_aux_layer_14": 0.2373046875, "loss_aux_layer_15": 0.252685546875, "loss_aux_layer_16": 0.26708984375, "loss_aux_layer_17": 0.2666015625, "loss_aux_layer_18": 0.2744140625, "loss_aux_layer_19": 0.2685546875, "loss_aux_layer_2": 0.184326171875, "loss_aux_layer_20": 0.26416015625, "loss_aux_layer_21": 0.26123046875, "loss_aux_layer_22": 0.27978515625, "loss_aux_layer_23": 0.32958984375, "loss_aux_layer_3": 0.1826171875, "loss_aux_layer_4": 0.177734375, "loss_aux_layer_5": 0.1806640625, "loss_aux_layer_6": 0.17431640625, "loss_aux_layer_7": 0.16943359375, "loss_aux_layer_8": 0.172119140625, "loss_aux_layer_9": 0.173095703125, "step": 123, "total_loss": 0.8727843910455704 }, { "epoch": 0.0245495941397743, "grad_norm": 0.6979156732559204, "learning_rate": 5e-05, "llm_loss": 0.7442515045404434, "loss": 3.822, "loss_aux_layer_0": 0.13330078125, "loss_aux_layer_1": 0.171875, "loss_aux_layer_10": 0.16845703125, "loss_aux_layer_11": 0.178955078125, "loss_aux_layer_12": 0.194091796875, "loss_aux_layer_13": 0.208251953125, "loss_aux_layer_14": 0.228271484375, "loss_aux_layer_15": 0.244140625, "loss_aux_layer_16": 0.25927734375, "loss_aux_layer_17": 0.2607421875, "loss_aux_layer_18": 0.26904296875, "loss_aux_layer_19": 0.26318359375, "loss_aux_layer_2": 0.180908203125, "loss_aux_layer_20": 0.259765625, "loss_aux_layer_21": 0.25732421875, "loss_aux_layer_22": 0.27734375, "loss_aux_layer_23": 0.32568359375, "loss_aux_layer_3": 0.178466796875, "loss_aux_layer_4": 0.172607421875, "loss_aux_layer_5": 0.175537109375, "loss_aux_layer_6": 0.1689453125, "loss_aux_layer_7": 0.16357421875, "loss_aux_layer_8": 0.165283203125, "loss_aux_layer_9": 0.165283203125, "step": 124, "total_loss": 0.9555043280124664 }, { "epoch": 0.024747574737675707, "grad_norm": 0.6601620316505432, "learning_rate": 5e-05, "llm_loss": 0.7165067791938782, "loss": 3.7301, "loss_aux_layer_0": 0.1318359375, "loss_aux_layer_1": 0.1728515625, "loss_aux_layer_10": 0.173583984375, "loss_aux_layer_11": 0.185302734375, "loss_aux_layer_12": 0.200927734375, "loss_aux_layer_13": 0.21484375, "loss_aux_layer_14": 0.235595703125, "loss_aux_layer_15": 0.251220703125, "loss_aux_layer_16": 0.265625, "loss_aux_layer_17": 0.2666015625, "loss_aux_layer_18": 0.2744140625, "loss_aux_layer_19": 0.26806640625, "loss_aux_layer_2": 0.183837890625, "loss_aux_layer_20": 0.2646484375, "loss_aux_layer_21": 0.26318359375, "loss_aux_layer_22": 0.28271484375, "loss_aux_layer_23": 0.33349609375, "loss_aux_layer_3": 0.181884765625, "loss_aux_layer_4": 0.17626953125, "loss_aux_layer_5": 0.17919921875, "loss_aux_layer_6": 0.173095703125, "loss_aux_layer_7": 0.167236328125, "loss_aux_layer_8": 0.16943359375, "loss_aux_layer_9": 0.170166015625, "step": 125, "total_loss": 0.932528018951416 }, { "epoch": 0.024945555335577112, "grad_norm": 0.6571803689002991, "learning_rate": 5e-05, "llm_loss": 0.7313955277204514, "loss": 3.7717, "loss_aux_layer_0": 0.13037109375, "loss_aux_layer_1": 0.170166015625, "loss_aux_layer_10": 0.16796875, "loss_aux_layer_11": 0.1787109375, "loss_aux_layer_12": 0.19384765625, "loss_aux_layer_13": 0.208984375, "loss_aux_layer_14": 0.229248046875, "loss_aux_layer_15": 0.244873046875, "loss_aux_layer_16": 0.26025390625, "loss_aux_layer_17": 0.26171875, "loss_aux_layer_18": 0.26904296875, "loss_aux_layer_19": 0.263671875, "loss_aux_layer_2": 0.18017578125, "loss_aux_layer_20": 0.26123046875, "loss_aux_layer_21": 0.26025390625, "loss_aux_layer_22": 0.28173828125, "loss_aux_layer_23": 0.33203125, "loss_aux_layer_3": 0.177734375, "loss_aux_layer_4": 0.17138671875, "loss_aux_layer_5": 0.17431640625, "loss_aux_layer_6": 0.167724609375, "loss_aux_layer_7": 0.162109375, "loss_aux_layer_8": 0.16455078125, "loss_aux_layer_9": 0.1650390625, "step": 126, "total_loss": 0.9429370313882828 }, { "epoch": 0.02514353593347852, "grad_norm": 0.632175087928772, "learning_rate": 5e-05, "llm_loss": 0.670169785618782, "loss": 3.5098, "loss_aux_layer_0": 0.1234130859375, "loss_aux_layer_1": 0.161865234375, "loss_aux_layer_10": 0.16357421875, "loss_aux_layer_11": 0.1748046875, "loss_aux_layer_12": 0.18994140625, "loss_aux_layer_13": 0.204345703125, "loss_aux_layer_14": 0.225830078125, "loss_aux_layer_15": 0.241455078125, "loss_aux_layer_16": 0.25732421875, "loss_aux_layer_17": 0.25830078125, "loss_aux_layer_18": 0.265625, "loss_aux_layer_19": 0.2607421875, "loss_aux_layer_2": 0.173095703125, "loss_aux_layer_20": 0.25830078125, "loss_aux_layer_21": 0.25927734375, "loss_aux_layer_22": 0.28076171875, "loss_aux_layer_23": 0.33154296875, "loss_aux_layer_3": 0.171142578125, "loss_aux_layer_4": 0.16552734375, "loss_aux_layer_5": 0.16796875, "loss_aux_layer_6": 0.16259765625, "loss_aux_layer_7": 0.1572265625, "loss_aux_layer_8": 0.15966796875, "loss_aux_layer_9": 0.16064453125, "step": 127, "total_loss": 0.8774483650922775 }, { "epoch": 0.025341516531379924, "grad_norm": 0.6554943323135376, "learning_rate": 5e-05, "llm_loss": 0.7237553298473358, "loss": 3.7293, "loss_aux_layer_0": 0.12255859375, "loss_aux_layer_1": 0.161376953125, "loss_aux_layer_10": 0.16455078125, "loss_aux_layer_11": 0.17529296875, "loss_aux_layer_12": 0.192138671875, "loss_aux_layer_13": 0.20703125, "loss_aux_layer_14": 0.22900390625, "loss_aux_layer_15": 0.245361328125, "loss_aux_layer_16": 0.26171875, "loss_aux_layer_17": 0.2626953125, "loss_aux_layer_18": 0.26904296875, "loss_aux_layer_19": 0.263671875, "loss_aux_layer_2": 0.173583984375, "loss_aux_layer_20": 0.26171875, "loss_aux_layer_21": 0.26025390625, "loss_aux_layer_22": 0.279296875, "loss_aux_layer_23": 0.32958984375, "loss_aux_layer_3": 0.171875, "loss_aux_layer_4": 0.165771484375, "loss_aux_layer_5": 0.168212890625, "loss_aux_layer_6": 0.16259765625, "loss_aux_layer_7": 0.156982421875, "loss_aux_layer_8": 0.15966796875, "loss_aux_layer_9": 0.161376953125, "step": 128, "total_loss": 0.9323169142007828 }, { "epoch": 0.02553949712928133, "grad_norm": 0.5699229836463928, "learning_rate": 5e-05, "llm_loss": 0.6191682517528534, "loss": 3.3115, "loss_aux_layer_0": 0.122802734375, "loss_aux_layer_1": 0.162841796875, "loss_aux_layer_10": 0.165771484375, "loss_aux_layer_11": 0.1767578125, "loss_aux_layer_12": 0.1923828125, "loss_aux_layer_13": 0.207763671875, "loss_aux_layer_14": 0.229248046875, "loss_aux_layer_15": 0.24462890625, "loss_aux_layer_16": 0.259765625, "loss_aux_layer_17": 0.2607421875, "loss_aux_layer_18": 0.26806640625, "loss_aux_layer_19": 0.26123046875, "loss_aux_layer_2": 0.175537109375, "loss_aux_layer_20": 0.25830078125, "loss_aux_layer_21": 0.2568359375, "loss_aux_layer_22": 0.27685546875, "loss_aux_layer_23": 0.3271484375, "loss_aux_layer_3": 0.173828125, "loss_aux_layer_4": 0.16796875, "loss_aux_layer_5": 0.17041015625, "loss_aux_layer_6": 0.164794921875, "loss_aux_layer_7": 0.159912109375, "loss_aux_layer_8": 0.162109375, "loss_aux_layer_9": 0.162841796875, "step": 129, "total_loss": 0.8278646469116211 }, { "epoch": 0.025737477727182736, "grad_norm": 0.6281185150146484, "learning_rate": 5e-05, "llm_loss": 0.7104697078466415, "loss": 3.6915, "loss_aux_layer_0": 0.123291015625, "loss_aux_layer_1": 0.166748046875, "loss_aux_layer_10": 0.170654296875, "loss_aux_layer_11": 0.181396484375, "loss_aux_layer_12": 0.19677734375, "loss_aux_layer_13": 0.21044921875, "loss_aux_layer_14": 0.2314453125, "loss_aux_layer_15": 0.24658203125, "loss_aux_layer_16": 0.2626953125, "loss_aux_layer_17": 0.26171875, "loss_aux_layer_18": 0.27001953125, "loss_aux_layer_19": 0.26318359375, "loss_aux_layer_2": 0.181396484375, "loss_aux_layer_20": 0.26025390625, "loss_aux_layer_21": 0.259765625, "loss_aux_layer_22": 0.27880859375, "loss_aux_layer_23": 0.33056640625, "loss_aux_layer_3": 0.18017578125, "loss_aux_layer_4": 0.174560546875, "loss_aux_layer_5": 0.177001953125, "loss_aux_layer_6": 0.170654296875, "loss_aux_layer_7": 0.1650390625, "loss_aux_layer_8": 0.167724609375, "loss_aux_layer_9": 0.16796875, "step": 130, "total_loss": 0.9228860139846802 }, { "epoch": 0.02593545832508414, "grad_norm": 0.6294304132461548, "learning_rate": 5e-05, "llm_loss": 0.6419166401028633, "loss": 3.3895, "loss_aux_layer_0": 0.1199951171875, "loss_aux_layer_1": 0.156494140625, "loss_aux_layer_10": 0.16357421875, "loss_aux_layer_11": 0.173583984375, "loss_aux_layer_12": 0.18896484375, "loss_aux_layer_13": 0.202880859375, "loss_aux_layer_14": 0.22314453125, "loss_aux_layer_15": 0.239990234375, "loss_aux_layer_16": 0.25537109375, "loss_aux_layer_17": 0.25634765625, "loss_aux_layer_18": 0.26513671875, "loss_aux_layer_19": 0.25927734375, "loss_aux_layer_2": 0.168701171875, "loss_aux_layer_20": 0.257568359375, "loss_aux_layer_21": 0.25732421875, "loss_aux_layer_22": 0.27587890625, "loss_aux_layer_23": 0.32666015625, "loss_aux_layer_3": 0.16845703125, "loss_aux_layer_4": 0.163818359375, "loss_aux_layer_5": 0.166748046875, "loss_aux_layer_6": 0.162109375, "loss_aux_layer_7": 0.157470703125, "loss_aux_layer_8": 0.16015625, "loss_aux_layer_9": 0.160400390625, "step": 131, "total_loss": 0.8473749905824661 }, { "epoch": 0.026133438922985548, "grad_norm": 0.6334285736083984, "learning_rate": 5e-05, "llm_loss": 0.6880079954862595, "loss": 3.5877, "loss_aux_layer_0": 0.11669921875, "loss_aux_layer_1": 0.162353515625, "loss_aux_layer_10": 0.166015625, "loss_aux_layer_11": 0.177001953125, "loss_aux_layer_12": 0.193115234375, "loss_aux_layer_13": 0.208740234375, "loss_aux_layer_14": 0.228759765625, "loss_aux_layer_15": 0.244873046875, "loss_aux_layer_16": 0.2607421875, "loss_aux_layer_17": 0.26220703125, "loss_aux_layer_18": 0.27001953125, "loss_aux_layer_19": 0.263671875, "loss_aux_layer_2": 0.174560546875, "loss_aux_layer_20": 0.26025390625, "loss_aux_layer_21": 0.2578125, "loss_aux_layer_22": 0.27880859375, "loss_aux_layer_23": 0.32958984375, "loss_aux_layer_3": 0.173583984375, "loss_aux_layer_4": 0.167724609375, "loss_aux_layer_5": 0.170166015625, "loss_aux_layer_6": 0.164794921875, "loss_aux_layer_7": 0.159423828125, "loss_aux_layer_8": 0.161865234375, "loss_aux_layer_9": 0.16259765625, "step": 132, "total_loss": 0.896927535533905 }, { "epoch": 0.026331419520886953, "grad_norm": 0.6255744099617004, "learning_rate": 5e-05, "llm_loss": 0.7395902723073959, "loss": 3.7863, "loss_aux_layer_0": 0.1138916015625, "loss_aux_layer_1": 0.158447265625, "loss_aux_layer_10": 0.1650390625, "loss_aux_layer_11": 0.17626953125, "loss_aux_layer_12": 0.191162109375, "loss_aux_layer_13": 0.2060546875, "loss_aux_layer_14": 0.2265625, "loss_aux_layer_15": 0.242431640625, "loss_aux_layer_16": 0.2568359375, "loss_aux_layer_17": 0.257568359375, "loss_aux_layer_18": 0.2666015625, "loss_aux_layer_19": 0.2607421875, "loss_aux_layer_2": 0.171630859375, "loss_aux_layer_20": 0.25830078125, "loss_aux_layer_21": 0.25927734375, "loss_aux_layer_22": 0.27783203125, "loss_aux_layer_23": 0.3291015625, "loss_aux_layer_3": 0.171630859375, "loss_aux_layer_4": 0.165771484375, "loss_aux_layer_5": 0.16845703125, "loss_aux_layer_6": 0.163330078125, "loss_aux_layer_7": 0.15869140625, "loss_aux_layer_8": 0.1611328125, "loss_aux_layer_9": 0.161865234375, "step": 133, "total_loss": 0.9465662091970444 }, { "epoch": 0.02652940011878836, "grad_norm": 0.6063054800033569, "learning_rate": 5e-05, "llm_loss": 0.7526323944330215, "loss": 3.8394, "loss_aux_layer_0": 0.11279296875, "loss_aux_layer_1": 0.158447265625, "loss_aux_layer_10": 0.16552734375, "loss_aux_layer_11": 0.1767578125, "loss_aux_layer_12": 0.1923828125, "loss_aux_layer_13": 0.206298828125, "loss_aux_layer_14": 0.226318359375, "loss_aux_layer_15": 0.2421875, "loss_aux_layer_16": 0.2568359375, "loss_aux_layer_17": 0.25830078125, "loss_aux_layer_18": 0.2666015625, "loss_aux_layer_19": 0.259765625, "loss_aux_layer_2": 0.173095703125, "loss_aux_layer_20": 0.2578125, "loss_aux_layer_21": 0.2568359375, "loss_aux_layer_22": 0.27783203125, "loss_aux_layer_23": 0.3291015625, "loss_aux_layer_3": 0.172607421875, "loss_aux_layer_4": 0.1669921875, "loss_aux_layer_5": 0.1689453125, "loss_aux_layer_6": 0.16455078125, "loss_aux_layer_7": 0.159423828125, "loss_aux_layer_8": 0.162109375, "loss_aux_layer_9": 0.162353515625, "step": 134, "total_loss": 0.9598601311445236 }, { "epoch": 0.026727380716689765, "grad_norm": 0.7599971294403076, "learning_rate": 5e-05, "llm_loss": 0.7163548320531845, "loss": 3.7221, "loss_aux_layer_0": 0.1175537109375, "loss_aux_layer_1": 0.16650390625, "loss_aux_layer_10": 0.172607421875, "loss_aux_layer_11": 0.183837890625, "loss_aux_layer_12": 0.19921875, "loss_aux_layer_13": 0.214599609375, "loss_aux_layer_14": 0.235107421875, "loss_aux_layer_15": 0.25, "loss_aux_layer_16": 0.263671875, "loss_aux_layer_17": 0.26318359375, "loss_aux_layer_18": 0.271484375, "loss_aux_layer_19": 0.26416015625, "loss_aux_layer_2": 0.1826171875, "loss_aux_layer_20": 0.26171875, "loss_aux_layer_21": 0.26123046875, "loss_aux_layer_22": 0.2822265625, "loss_aux_layer_23": 0.3330078125, "loss_aux_layer_3": 0.182373046875, "loss_aux_layer_4": 0.176513671875, "loss_aux_layer_5": 0.178466796875, "loss_aux_layer_6": 0.173095703125, "loss_aux_layer_7": 0.16748046875, "loss_aux_layer_8": 0.169677734375, "loss_aux_layer_9": 0.169677734375, "step": 135, "total_loss": 0.930514469742775 }, { "epoch": 0.02692536131459117, "grad_norm": 0.6031292080879211, "learning_rate": 5e-05, "llm_loss": 0.6326802670955658, "loss": 3.3488, "loss_aux_layer_0": 0.1083984375, "loss_aux_layer_1": 0.154541015625, "loss_aux_layer_10": 0.162841796875, "loss_aux_layer_11": 0.173583984375, "loss_aux_layer_12": 0.18896484375, "loss_aux_layer_13": 0.203369140625, "loss_aux_layer_14": 0.2236328125, "loss_aux_layer_15": 0.240234375, "loss_aux_layer_16": 0.254638671875, "loss_aux_layer_17": 0.25634765625, "loss_aux_layer_18": 0.26416015625, "loss_aux_layer_19": 0.2578125, "loss_aux_layer_2": 0.169189453125, "loss_aux_layer_20": 0.256103515625, "loss_aux_layer_21": 0.255859375, "loss_aux_layer_22": 0.27490234375, "loss_aux_layer_23": 0.3251953125, "loss_aux_layer_3": 0.16943359375, "loss_aux_layer_4": 0.16455078125, "loss_aux_layer_5": 0.16650390625, "loss_aux_layer_6": 0.161865234375, "loss_aux_layer_7": 0.156982421875, "loss_aux_layer_8": 0.15966796875, "loss_aux_layer_9": 0.16015625, "step": 136, "total_loss": 0.8372020721435547 }, { "epoch": 0.027123341912492577, "grad_norm": 0.5879358053207397, "learning_rate": 5e-05, "llm_loss": 0.6019897609949112, "loss": 3.2224, "loss_aux_layer_0": 0.10498046875, "loss_aux_layer_1": 0.15283203125, "loss_aux_layer_10": 0.16162109375, "loss_aux_layer_11": 0.17236328125, "loss_aux_layer_12": 0.18798828125, "loss_aux_layer_13": 0.201904296875, "loss_aux_layer_14": 0.22216796875, "loss_aux_layer_15": 0.238037109375, "loss_aux_layer_16": 0.253662109375, "loss_aux_layer_17": 0.254150390625, "loss_aux_layer_18": 0.26318359375, "loss_aux_layer_19": 0.25830078125, "loss_aux_layer_2": 0.16748046875, "loss_aux_layer_20": 0.25732421875, "loss_aux_layer_21": 0.25732421875, "loss_aux_layer_22": 0.27734375, "loss_aux_layer_23": 0.330078125, "loss_aux_layer_3": 0.167724609375, "loss_aux_layer_4": 0.161865234375, "loss_aux_layer_5": 0.1640625, "loss_aux_layer_6": 0.16015625, "loss_aux_layer_7": 0.1552734375, "loss_aux_layer_8": 0.157958984375, "loss_aux_layer_9": 0.158447265625, "step": 137, "total_loss": 0.805605560541153 }, { "epoch": 0.027321322510393983, "grad_norm": 0.6218862533569336, "learning_rate": 5e-05, "llm_loss": 0.6016297936439514, "loss": 3.2217, "loss_aux_layer_0": 0.104248046875, "loss_aux_layer_1": 0.152587890625, "loss_aux_layer_10": 0.162109375, "loss_aux_layer_11": 0.17236328125, "loss_aux_layer_12": 0.188232421875, "loss_aux_layer_13": 0.203125, "loss_aux_layer_14": 0.224365234375, "loss_aux_layer_15": 0.240234375, "loss_aux_layer_16": 0.25537109375, "loss_aux_layer_17": 0.256591796875, "loss_aux_layer_18": 0.26513671875, "loss_aux_layer_19": 0.2587890625, "loss_aux_layer_2": 0.16748046875, "loss_aux_layer_20": 0.25732421875, "loss_aux_layer_21": 0.2568359375, "loss_aux_layer_22": 0.275390625, "loss_aux_layer_23": 0.3251953125, "loss_aux_layer_3": 0.16748046875, "loss_aux_layer_4": 0.162109375, "loss_aux_layer_5": 0.16455078125, "loss_aux_layer_6": 0.16015625, "loss_aux_layer_7": 0.1552734375, "loss_aux_layer_8": 0.158203125, "loss_aux_layer_9": 0.1591796875, "step": 138, "total_loss": 0.8054261952638626 }, { "epoch": 0.02751930310829539, "grad_norm": 0.5570290088653564, "learning_rate": 5e-05, "llm_loss": 0.6264763996005058, "loss": 3.343, "loss_aux_layer_0": 0.1094970703125, "loss_aux_layer_1": 0.158935546875, "loss_aux_layer_10": 0.16943359375, "loss_aux_layer_11": 0.1796875, "loss_aux_layer_12": 0.194580078125, "loss_aux_layer_13": 0.207763671875, "loss_aux_layer_14": 0.2275390625, "loss_aux_layer_15": 0.242919921875, "loss_aux_layer_16": 0.257568359375, "loss_aux_layer_17": 0.2587890625, "loss_aux_layer_18": 0.26611328125, "loss_aux_layer_19": 0.26025390625, "loss_aux_layer_2": 0.17578125, "loss_aux_layer_20": 0.258056640625, "loss_aux_layer_21": 0.25830078125, "loss_aux_layer_22": 0.279296875, "loss_aux_layer_23": 0.33056640625, "loss_aux_layer_3": 0.1767578125, "loss_aux_layer_4": 0.17138671875, "loss_aux_layer_5": 0.17333984375, "loss_aux_layer_6": 0.1689453125, "loss_aux_layer_7": 0.163818359375, "loss_aux_layer_8": 0.166015625, "loss_aux_layer_9": 0.1669921875, "step": 139, "total_loss": 0.835746243596077 }, { "epoch": 0.027717283706196794, "grad_norm": 0.6001929044723511, "learning_rate": 5e-05, "llm_loss": 0.6881073862314224, "loss": 3.5896, "loss_aux_layer_0": 0.10791015625, "loss_aux_layer_1": 0.162353515625, "loss_aux_layer_10": 0.169921875, "loss_aux_layer_11": 0.18115234375, "loss_aux_layer_12": 0.1962890625, "loss_aux_layer_13": 0.210205078125, "loss_aux_layer_14": 0.2294921875, "loss_aux_layer_15": 0.243896484375, "loss_aux_layer_16": 0.258056640625, "loss_aux_layer_17": 0.256591796875, "loss_aux_layer_18": 0.26513671875, "loss_aux_layer_19": 0.25927734375, "loss_aux_layer_2": 0.17919921875, "loss_aux_layer_20": 0.255859375, "loss_aux_layer_21": 0.2548828125, "loss_aux_layer_22": 0.27392578125, "loss_aux_layer_23": 0.32421875, "loss_aux_layer_3": 0.179443359375, "loss_aux_layer_4": 0.1728515625, "loss_aux_layer_5": 0.174560546875, "loss_aux_layer_6": 0.169677734375, "loss_aux_layer_7": 0.164306640625, "loss_aux_layer_8": 0.166748046875, "loss_aux_layer_9": 0.166748046875, "step": 140, "total_loss": 0.8973925113677979 }, { "epoch": 0.027915264304098197, "grad_norm": 0.6777665615081787, "learning_rate": 5e-05, "llm_loss": 0.6959707587957382, "loss": 3.5833, "loss_aux_layer_0": 0.0992431640625, "loss_aux_layer_1": 0.146240234375, "loss_aux_layer_10": 0.15966796875, "loss_aux_layer_11": 0.169677734375, "loss_aux_layer_12": 0.18408203125, "loss_aux_layer_13": 0.197509765625, "loss_aux_layer_14": 0.2177734375, "loss_aux_layer_15": 0.233642578125, "loss_aux_layer_16": 0.24853515625, "loss_aux_layer_17": 0.251220703125, "loss_aux_layer_18": 0.259765625, "loss_aux_layer_19": 0.2548828125, "loss_aux_layer_2": 0.163330078125, "loss_aux_layer_20": 0.25341796875, "loss_aux_layer_21": 0.2529296875, "loss_aux_layer_22": 0.27294921875, "loss_aux_layer_23": 0.32470703125, "loss_aux_layer_3": 0.163818359375, "loss_aux_layer_4": 0.158935546875, "loss_aux_layer_5": 0.161376953125, "loss_aux_layer_6": 0.157470703125, "loss_aux_layer_7": 0.152587890625, "loss_aux_layer_8": 0.155517578125, "loss_aux_layer_9": 0.156494140625, "step": 141, "total_loss": 0.8958145081996918 }, { "epoch": 0.028113244901999603, "grad_norm": 0.6301952004432678, "learning_rate": 5e-05, "llm_loss": 0.6958879232406616, "loss": 3.6196, "loss_aux_layer_0": 0.104736328125, "loss_aux_layer_1": 0.1591796875, "loss_aux_layer_10": 0.170654296875, "loss_aux_layer_11": 0.181640625, "loss_aux_layer_12": 0.196044921875, "loss_aux_layer_13": 0.209228515625, "loss_aux_layer_14": 0.228515625, "loss_aux_layer_15": 0.242919921875, "loss_aux_layer_16": 0.25830078125, "loss_aux_layer_17": 0.2578125, "loss_aux_layer_18": 0.2646484375, "loss_aux_layer_19": 0.2587890625, "loss_aux_layer_2": 0.1767578125, "loss_aux_layer_20": 0.255859375, "loss_aux_layer_21": 0.25390625, "loss_aux_layer_22": 0.27490234375, "loss_aux_layer_23": 0.32373046875, "loss_aux_layer_3": 0.17822265625, "loss_aux_layer_4": 0.173095703125, "loss_aux_layer_5": 0.175048828125, "loss_aux_layer_6": 0.170654296875, "loss_aux_layer_7": 0.16552734375, "loss_aux_layer_8": 0.16796875, "loss_aux_layer_9": 0.167724609375, "step": 142, "total_loss": 0.9048951864242554 }, { "epoch": 0.02831122549990101, "grad_norm": 0.6112794280052185, "learning_rate": 5e-05, "llm_loss": 0.6611264944076538, "loss": 3.4783, "loss_aux_layer_0": 0.1026611328125, "loss_aux_layer_1": 0.156982421875, "loss_aux_layer_10": 0.16845703125, "loss_aux_layer_11": 0.179443359375, "loss_aux_layer_12": 0.1943359375, "loss_aux_layer_13": 0.207763671875, "loss_aux_layer_14": 0.227783203125, "loss_aux_layer_15": 0.242431640625, "loss_aux_layer_16": 0.2578125, "loss_aux_layer_17": 0.2578125, "loss_aux_layer_18": 0.26611328125, "loss_aux_layer_19": 0.25927734375, "loss_aux_layer_2": 0.1748046875, "loss_aux_layer_20": 0.2578125, "loss_aux_layer_21": 0.2587890625, "loss_aux_layer_22": 0.27978515625, "loss_aux_layer_23": 0.330078125, "loss_aux_layer_3": 0.175537109375, "loss_aux_layer_4": 0.170654296875, "loss_aux_layer_5": 0.171875, "loss_aux_layer_6": 0.167724609375, "loss_aux_layer_7": 0.16259765625, "loss_aux_layer_8": 0.165283203125, "loss_aux_layer_9": 0.16552734375, "step": 143, "total_loss": 0.8695730566978455 }, { "epoch": 0.028509206097802414, "grad_norm": 0.6290109157562256, "learning_rate": 5e-05, "llm_loss": 0.7316445261240005, "loss": 3.7447, "loss_aux_layer_0": 0.0992431640625, "loss_aux_layer_1": 0.15380859375, "loss_aux_layer_10": 0.165283203125, "loss_aux_layer_11": 0.17626953125, "loss_aux_layer_12": 0.191650390625, "loss_aux_layer_13": 0.205322265625, "loss_aux_layer_14": 0.22412109375, "loss_aux_layer_15": 0.239013671875, "loss_aux_layer_16": 0.25439453125, "loss_aux_layer_17": 0.255126953125, "loss_aux_layer_18": 0.26318359375, "loss_aux_layer_19": 0.25732421875, "loss_aux_layer_2": 0.172119140625, "loss_aux_layer_20": 0.25439453125, "loss_aux_layer_21": 0.251953125, "loss_aux_layer_22": 0.27001953125, "loss_aux_layer_23": 0.31884765625, "loss_aux_layer_3": 0.1728515625, "loss_aux_layer_4": 0.1669921875, "loss_aux_layer_5": 0.16845703125, "loss_aux_layer_6": 0.16455078125, "loss_aux_layer_7": 0.158935546875, "loss_aux_layer_8": 0.16162109375, "loss_aux_layer_9": 0.162353515625, "step": 144, "total_loss": 0.936166986823082 }, { "epoch": 0.02870718669570382, "grad_norm": 0.7400573492050171, "learning_rate": 5e-05, "llm_loss": 0.6703679114580154, "loss": 3.4804, "loss_aux_layer_0": 0.095458984375, "loss_aux_layer_1": 0.1455078125, "loss_aux_layer_10": 0.15869140625, "loss_aux_layer_11": 0.168701171875, "loss_aux_layer_12": 0.18310546875, "loss_aux_layer_13": 0.19580078125, "loss_aux_layer_14": 0.2158203125, "loss_aux_layer_15": 0.233154296875, "loss_aux_layer_16": 0.248779296875, "loss_aux_layer_17": 0.25048828125, "loss_aux_layer_18": 0.26025390625, "loss_aux_layer_19": 0.25634765625, "loss_aux_layer_2": 0.16259765625, "loss_aux_layer_20": 0.255859375, "loss_aux_layer_21": 0.25634765625, "loss_aux_layer_22": 0.275390625, "loss_aux_layer_23": 0.3271484375, "loss_aux_layer_3": 0.164306640625, "loss_aux_layer_4": 0.15869140625, "loss_aux_layer_5": 0.16064453125, "loss_aux_layer_6": 0.1572265625, "loss_aux_layer_7": 0.152099609375, "loss_aux_layer_8": 0.155517578125, "loss_aux_layer_9": 0.15625, "step": 145, "total_loss": 0.8700946718454361 }, { "epoch": 0.028905167293605226, "grad_norm": 0.7588018774986267, "learning_rate": 5e-05, "llm_loss": 0.6310099214315414, "loss": 3.3863, "loss_aux_layer_0": 0.1015625, "loss_aux_layer_1": 0.1640625, "loss_aux_layer_10": 0.17724609375, "loss_aux_layer_11": 0.18896484375, "loss_aux_layer_12": 0.203857421875, "loss_aux_layer_13": 0.21630859375, "loss_aux_layer_14": 0.236083984375, "loss_aux_layer_15": 0.25, "loss_aux_layer_16": 0.2646484375, "loss_aux_layer_17": 0.26513671875, "loss_aux_layer_18": 0.271484375, "loss_aux_layer_19": 0.26318359375, "loss_aux_layer_2": 0.184814453125, "loss_aux_layer_20": 0.26123046875, "loss_aux_layer_21": 0.25927734375, "loss_aux_layer_22": 0.28076171875, "loss_aux_layer_23": 0.3291015625, "loss_aux_layer_3": 0.18701171875, "loss_aux_layer_4": 0.181640625, "loss_aux_layer_5": 0.18359375, "loss_aux_layer_6": 0.17919921875, "loss_aux_layer_7": 0.17333984375, "loss_aux_layer_8": 0.17529296875, "loss_aux_layer_9": 0.175048828125, "step": 146, "total_loss": 0.8465638607740402 }, { "epoch": 0.029103147891506632, "grad_norm": 0.6005535125732422, "learning_rate": 5e-05, "llm_loss": 0.7065153419971466, "loss": 3.6515, "loss_aux_layer_0": 0.0970458984375, "loss_aux_layer_1": 0.15478515625, "loss_aux_layer_10": 0.16796875, "loss_aux_layer_11": 0.1787109375, "loss_aux_layer_12": 0.19287109375, "loss_aux_layer_13": 0.20654296875, "loss_aux_layer_14": 0.225830078125, "loss_aux_layer_15": 0.239990234375, "loss_aux_layer_16": 0.25439453125, "loss_aux_layer_17": 0.25439453125, "loss_aux_layer_18": 0.26220703125, "loss_aux_layer_19": 0.25634765625, "loss_aux_layer_2": 0.174072265625, "loss_aux_layer_20": 0.25390625, "loss_aux_layer_21": 0.254150390625, "loss_aux_layer_22": 0.275390625, "loss_aux_layer_23": 0.32666015625, "loss_aux_layer_3": 0.175537109375, "loss_aux_layer_4": 0.170166015625, "loss_aux_layer_5": 0.171630859375, "loss_aux_layer_6": 0.167724609375, "loss_aux_layer_7": 0.162109375, "loss_aux_layer_8": 0.164794921875, "loss_aux_layer_9": 0.1650390625, "step": 147, "total_loss": 0.9128681719303131 }, { "epoch": 0.029301128489408038, "grad_norm": 0.7066617608070374, "learning_rate": 5e-05, "llm_loss": 0.6505563259124756, "loss": 3.4289, "loss_aux_layer_0": 0.0985107421875, "loss_aux_layer_1": 0.15380859375, "loss_aux_layer_10": 0.16748046875, "loss_aux_layer_11": 0.1787109375, "loss_aux_layer_12": 0.193603515625, "loss_aux_layer_13": 0.2080078125, "loss_aux_layer_14": 0.2275390625, "loss_aux_layer_15": 0.241943359375, "loss_aux_layer_16": 0.25732421875, "loss_aux_layer_17": 0.25830078125, "loss_aux_layer_18": 0.26611328125, "loss_aux_layer_19": 0.2578125, "loss_aux_layer_2": 0.17333984375, "loss_aux_layer_20": 0.255859375, "loss_aux_layer_21": 0.25439453125, "loss_aux_layer_22": 0.27294921875, "loss_aux_layer_23": 0.3212890625, "loss_aux_layer_3": 0.1748046875, "loss_aux_layer_4": 0.169677734375, "loss_aux_layer_5": 0.171142578125, "loss_aux_layer_6": 0.16748046875, "loss_aux_layer_7": 0.16162109375, "loss_aux_layer_8": 0.1640625, "loss_aux_layer_9": 0.164306640625, "step": 148, "total_loss": 0.8572207391262054 }, { "epoch": 0.029499109087309443, "grad_norm": 0.6843802332878113, "learning_rate": 5e-05, "llm_loss": 0.6814787685871124, "loss": 3.5675, "loss_aux_layer_0": 0.0958251953125, "loss_aux_layer_1": 0.1591796875, "loss_aux_layer_10": 0.172607421875, "loss_aux_layer_11": 0.184326171875, "loss_aux_layer_12": 0.19970703125, "loss_aux_layer_13": 0.212646484375, "loss_aux_layer_14": 0.23095703125, "loss_aux_layer_15": 0.2451171875, "loss_aux_layer_16": 0.258056640625, "loss_aux_layer_17": 0.25830078125, "loss_aux_layer_18": 0.26513671875, "loss_aux_layer_19": 0.2578125, "loss_aux_layer_2": 0.178955078125, "loss_aux_layer_20": 0.25537109375, "loss_aux_layer_21": 0.25634765625, "loss_aux_layer_22": 0.27783203125, "loss_aux_layer_23": 0.3271484375, "loss_aux_layer_3": 0.181396484375, "loss_aux_layer_4": 0.175537109375, "loss_aux_layer_5": 0.17724609375, "loss_aux_layer_6": 0.17333984375, "loss_aux_layer_7": 0.167724609375, "loss_aux_layer_8": 0.170166015625, "loss_aux_layer_9": 0.169677734375, "step": 149, "total_loss": 0.8918631672859192 }, { "epoch": 0.02969708968521085, "grad_norm": 3.6558706760406494, "learning_rate": 5e-05, "llm_loss": 0.6322033405303955, "loss": 3.297, "loss_aux_layer_0": 0.0875244140625, "loss_aux_layer_1": 0.138671875, "loss_aux_layer_10": 0.1513671875, "loss_aux_layer_11": 0.161865234375, "loss_aux_layer_12": 0.176513671875, "loss_aux_layer_13": 0.18994140625, "loss_aux_layer_14": 0.208740234375, "loss_aux_layer_15": 0.22509765625, "loss_aux_layer_16": 0.241455078125, "loss_aux_layer_17": 0.24365234375, "loss_aux_layer_18": 0.2529296875, "loss_aux_layer_19": 0.247802734375, "loss_aux_layer_2": 0.1552734375, "loss_aux_layer_20": 0.246826171875, "loss_aux_layer_21": 0.24658203125, "loss_aux_layer_22": 0.2666015625, "loss_aux_layer_23": 0.31640625, "loss_aux_layer_3": 0.156494140625, "loss_aux_layer_4": 0.151123046875, "loss_aux_layer_5": 0.15283203125, "loss_aux_layer_6": 0.14990234375, "loss_aux_layer_7": 0.14453125, "loss_aux_layer_8": 0.147705078125, "loss_aux_layer_9": 0.148193359375, "step": 150, "total_loss": 0.8242422491312027 }, { "epoch": 0.029895070283112255, "grad_norm": 1.7335046529769897, "learning_rate": 5e-05, "llm_loss": 0.806490421295166, "loss": 4.0602, "loss_aux_layer_0": 0.0997314453125, "loss_aux_layer_1": 0.155517578125, "loss_aux_layer_10": 0.168701171875, "loss_aux_layer_11": 0.1796875, "loss_aux_layer_12": 0.194580078125, "loss_aux_layer_13": 0.208740234375, "loss_aux_layer_14": 0.22900390625, "loss_aux_layer_15": 0.243896484375, "loss_aux_layer_16": 0.25830078125, "loss_aux_layer_17": 0.259765625, "loss_aux_layer_18": 0.267578125, "loss_aux_layer_19": 0.26123046875, "loss_aux_layer_2": 0.17333984375, "loss_aux_layer_20": 0.25927734375, "loss_aux_layer_21": 0.2568359375, "loss_aux_layer_22": 0.2783203125, "loss_aux_layer_23": 0.32763671875, "loss_aux_layer_3": 0.17626953125, "loss_aux_layer_4": 0.1708984375, "loss_aux_layer_5": 0.17236328125, "loss_aux_layer_6": 0.1689453125, "loss_aux_layer_7": 0.16357421875, "loss_aux_layer_8": 0.166015625, "loss_aux_layer_9": 0.166015625, "step": 151, "total_loss": 1.0150431096553802 }, { "epoch": 0.03009305088101366, "grad_norm": 2.434321641921997, "learning_rate": 5e-05, "llm_loss": 0.7200019359588623, "loss": 3.7137, "loss_aux_layer_0": 0.09423828125, "loss_aux_layer_1": 0.149658203125, "loss_aux_layer_10": 0.1650390625, "loss_aux_layer_11": 0.1767578125, "loss_aux_layer_12": 0.1923828125, "loss_aux_layer_13": 0.2080078125, "loss_aux_layer_14": 0.229736328125, "loss_aux_layer_15": 0.24658203125, "loss_aux_layer_16": 0.26220703125, "loss_aux_layer_17": 0.2666015625, "loss_aux_layer_18": 0.2744140625, "loss_aux_layer_19": 0.26806640625, "loss_aux_layer_2": 0.168212890625, "loss_aux_layer_20": 0.26513671875, "loss_aux_layer_21": 0.263671875, "loss_aux_layer_22": 0.28515625, "loss_aux_layer_23": 0.33349609375, "loss_aux_layer_3": 0.171142578125, "loss_aux_layer_4": 0.166259765625, "loss_aux_layer_5": 0.16748046875, "loss_aux_layer_6": 0.164306640625, "loss_aux_layer_7": 0.159423828125, "loss_aux_layer_8": 0.161865234375, "loss_aux_layer_9": 0.16259765625, "step": 152, "total_loss": 0.9284287244081497 }, { "epoch": 0.030291031478915067, "grad_norm": 2.0811774730682373, "learning_rate": 5e-05, "llm_loss": 0.7102298438549042, "loss": 3.6621, "loss_aux_layer_0": 0.09228515625, "loss_aux_layer_1": 0.150390625, "loss_aux_layer_10": 0.164306640625, "loss_aux_layer_11": 0.1748046875, "loss_aux_layer_12": 0.19140625, "loss_aux_layer_13": 0.20556640625, "loss_aux_layer_14": 0.225830078125, "loss_aux_layer_15": 0.243408203125, "loss_aux_layer_16": 0.256103515625, "loss_aux_layer_17": 0.26025390625, "loss_aux_layer_18": 0.26806640625, "loss_aux_layer_19": 0.26171875, "loss_aux_layer_2": 0.16845703125, "loss_aux_layer_20": 0.2587890625, "loss_aux_layer_21": 0.2587890625, "loss_aux_layer_22": 0.27783203125, "loss_aux_layer_23": 0.326171875, "loss_aux_layer_3": 0.170654296875, "loss_aux_layer_4": 0.165283203125, "loss_aux_layer_5": 0.166015625, "loss_aux_layer_6": 0.162841796875, "loss_aux_layer_7": 0.1572265625, "loss_aux_layer_8": 0.160400390625, "loss_aux_layer_9": 0.1611328125, "step": 153, "total_loss": 0.9155222773551941 }, { "epoch": 0.030489012076816473, "grad_norm": 0.7605463862419128, "learning_rate": 5e-05, "llm_loss": 0.6680365204811096, "loss": 3.4792, "loss_aux_layer_0": 0.0875244140625, "loss_aux_layer_1": 0.147705078125, "loss_aux_layer_10": 0.16259765625, "loss_aux_layer_11": 0.173095703125, "loss_aux_layer_12": 0.188232421875, "loss_aux_layer_13": 0.202880859375, "loss_aux_layer_14": 0.221923828125, "loss_aux_layer_15": 0.238037109375, "loss_aux_layer_16": 0.252197265625, "loss_aux_layer_17": 0.253662109375, "loss_aux_layer_18": 0.2607421875, "loss_aux_layer_19": 0.25439453125, "loss_aux_layer_2": 0.1669921875, "loss_aux_layer_20": 0.2529296875, "loss_aux_layer_21": 0.250732421875, "loss_aux_layer_22": 0.271484375, "loss_aux_layer_23": 0.31884765625, "loss_aux_layer_3": 0.16943359375, "loss_aux_layer_4": 0.164306640625, "loss_aux_layer_5": 0.165283203125, "loss_aux_layer_6": 0.162109375, "loss_aux_layer_7": 0.15673828125, "loss_aux_layer_8": 0.15966796875, "loss_aux_layer_9": 0.159423828125, "step": 154, "total_loss": 0.8697968125343323 }, { "epoch": 0.03068699267471788, "grad_norm": 1.4689347743988037, "learning_rate": 5e-05, "llm_loss": 0.659589022397995, "loss": 3.4439, "loss_aux_layer_0": 0.0855712890625, "loss_aux_layer_1": 0.14453125, "loss_aux_layer_10": 0.159423828125, "loss_aux_layer_11": 0.169921875, "loss_aux_layer_12": 0.184814453125, "loss_aux_layer_13": 0.199951171875, "loss_aux_layer_14": 0.22021484375, "loss_aux_layer_15": 0.237548828125, "loss_aux_layer_16": 0.253173828125, "loss_aux_layer_17": 0.2548828125, "loss_aux_layer_18": 0.26318359375, "loss_aux_layer_19": 0.25732421875, "loss_aux_layer_2": 0.16455078125, "loss_aux_layer_20": 0.255859375, "loss_aux_layer_21": 0.25537109375, "loss_aux_layer_22": 0.27734375, "loss_aux_layer_23": 0.32666015625, "loss_aux_layer_3": 0.16748046875, "loss_aux_layer_4": 0.162109375, "loss_aux_layer_5": 0.163330078125, "loss_aux_layer_6": 0.16064453125, "loss_aux_layer_7": 0.154296875, "loss_aux_layer_8": 0.156982421875, "loss_aux_layer_9": 0.1572265625, "step": 155, "total_loss": 0.8609783500432968 }, { "epoch": 0.030884973272619284, "grad_norm": 1.0221978425979614, "learning_rate": 5e-05, "llm_loss": 0.6331070512533188, "loss": 3.359, "loss_aux_layer_0": 0.0875244140625, "loss_aux_layer_1": 0.1533203125, "loss_aux_layer_10": 0.1689453125, "loss_aux_layer_11": 0.178955078125, "loss_aux_layer_12": 0.194580078125, "loss_aux_layer_13": 0.20751953125, "loss_aux_layer_14": 0.22705078125, "loss_aux_layer_15": 0.24267578125, "loss_aux_layer_16": 0.25634765625, "loss_aux_layer_17": 0.25634765625, "loss_aux_layer_18": 0.26318359375, "loss_aux_layer_19": 0.255859375, "loss_aux_layer_2": 0.17431640625, "loss_aux_layer_20": 0.25439453125, "loss_aux_layer_21": 0.2529296875, "loss_aux_layer_22": 0.2734375, "loss_aux_layer_23": 0.32275390625, "loss_aux_layer_3": 0.17724609375, "loss_aux_layer_4": 0.171630859375, "loss_aux_layer_5": 0.17333984375, "loss_aux_layer_6": 0.17041015625, "loss_aux_layer_7": 0.164306640625, "loss_aux_layer_8": 0.16650390625, "loss_aux_layer_9": 0.166259765625, "step": 156, "total_loss": 0.8397415578365326 }, { "epoch": 0.03108295387052069, "grad_norm": 1.0999221801757812, "learning_rate": 5e-05, "llm_loss": 0.6800033450126648, "loss": 3.5377, "loss_aux_layer_0": 0.088134765625, "loss_aux_layer_1": 0.15234375, "loss_aux_layer_10": 0.163818359375, "loss_aux_layer_11": 0.175048828125, "loss_aux_layer_12": 0.1904296875, "loss_aux_layer_13": 0.20458984375, "loss_aux_layer_14": 0.223876953125, "loss_aux_layer_15": 0.23974609375, "loss_aux_layer_16": 0.256103515625, "loss_aux_layer_17": 0.25732421875, "loss_aux_layer_18": 0.264404296875, "loss_aux_layer_19": 0.257080078125, "loss_aux_layer_2": 0.171630859375, "loss_aux_layer_20": 0.25537109375, "loss_aux_layer_21": 0.25244140625, "loss_aux_layer_22": 0.27294921875, "loss_aux_layer_23": 0.32080078125, "loss_aux_layer_3": 0.174072265625, "loss_aux_layer_4": 0.168212890625, "loss_aux_layer_5": 0.1689453125, "loss_aux_layer_6": 0.16552734375, "loss_aux_layer_7": 0.1591796875, "loss_aux_layer_8": 0.161865234375, "loss_aux_layer_9": 0.161376953125, "step": 157, "total_loss": 0.8844263106584549 }, { "epoch": 0.031280934468422096, "grad_norm": 0.857225775718689, "learning_rate": 5e-05, "llm_loss": 0.6299090087413788, "loss": 3.3402, "loss_aux_layer_0": 0.0849609375, "loss_aux_layer_1": 0.1474609375, "loss_aux_layer_10": 0.16748046875, "loss_aux_layer_11": 0.1787109375, "loss_aux_layer_12": 0.193359375, "loss_aux_layer_13": 0.2060546875, "loss_aux_layer_14": 0.225830078125, "loss_aux_layer_15": 0.24072265625, "loss_aux_layer_16": 0.254150390625, "loss_aux_layer_17": 0.255126953125, "loss_aux_layer_18": 0.26318359375, "loss_aux_layer_19": 0.257080078125, "loss_aux_layer_2": 0.16748046875, "loss_aux_layer_20": 0.255859375, "loss_aux_layer_21": 0.256591796875, "loss_aux_layer_22": 0.2763671875, "loss_aux_layer_23": 0.32763671875, "loss_aux_layer_3": 0.172119140625, "loss_aux_layer_4": 0.1669921875, "loss_aux_layer_5": 0.168701171875, "loss_aux_layer_6": 0.166748046875, "loss_aux_layer_7": 0.161376953125, "loss_aux_layer_8": 0.164306640625, "loss_aux_layer_9": 0.16455078125, "step": 158, "total_loss": 0.8350405395030975 }, { "epoch": 0.0314789150663235, "grad_norm": 1.1804927587509155, "learning_rate": 5e-05, "llm_loss": 0.6420990899205208, "loss": 3.387, "loss_aux_layer_0": 0.0831298828125, "loss_aux_layer_1": 0.147705078125, "loss_aux_layer_10": 0.16552734375, "loss_aux_layer_11": 0.17626953125, "loss_aux_layer_12": 0.191162109375, "loss_aux_layer_13": 0.205078125, "loss_aux_layer_14": 0.22509765625, "loss_aux_layer_15": 0.2412109375, "loss_aux_layer_16": 0.25537109375, "loss_aux_layer_17": 0.2568359375, "loss_aux_layer_18": 0.265625, "loss_aux_layer_19": 0.2587890625, "loss_aux_layer_2": 0.16943359375, "loss_aux_layer_20": 0.25634765625, "loss_aux_layer_21": 0.255859375, "loss_aux_layer_22": 0.2744140625, "loss_aux_layer_23": 0.3232421875, "loss_aux_layer_3": 0.172607421875, "loss_aux_layer_4": 0.16748046875, "loss_aux_layer_5": 0.168701171875, "loss_aux_layer_6": 0.166015625, "loss_aux_layer_7": 0.159912109375, "loss_aux_layer_8": 0.16259765625, "loss_aux_layer_9": 0.16259765625, "step": 159, "total_loss": 0.8467496633529663 }, { "epoch": 0.03167689566422491, "grad_norm": 1.1501104831695557, "learning_rate": 5e-05, "llm_loss": 0.6979541033506393, "loss": 3.6021, "loss_aux_layer_0": 0.0810546875, "loss_aux_layer_1": 0.146484375, "loss_aux_layer_10": 0.164306640625, "loss_aux_layer_11": 0.174560546875, "loss_aux_layer_12": 0.189208984375, "loss_aux_layer_13": 0.20263671875, "loss_aux_layer_14": 0.22216796875, "loss_aux_layer_15": 0.23681640625, "loss_aux_layer_16": 0.25048828125, "loss_aux_layer_17": 0.25146484375, "loss_aux_layer_18": 0.25927734375, "loss_aux_layer_19": 0.252685546875, "loss_aux_layer_2": 0.167724609375, "loss_aux_layer_20": 0.251708984375, "loss_aux_layer_21": 0.25244140625, "loss_aux_layer_22": 0.27490234375, "loss_aux_layer_23": 0.326171875, "loss_aux_layer_3": 0.171630859375, "loss_aux_layer_4": 0.1669921875, "loss_aux_layer_5": 0.168701171875, "loss_aux_layer_6": 0.16552734375, "loss_aux_layer_7": 0.159912109375, "loss_aux_layer_8": 0.162353515625, "loss_aux_layer_9": 0.162353515625, "step": 160, "total_loss": 0.9005170315504074 }, { "epoch": 0.03187487626212631, "grad_norm": 1.1132287979125977, "learning_rate": 5e-05, "llm_loss": 0.6236226707696915, "loss": 3.3439, "loss_aux_layer_0": 0.087158203125, "loss_aux_layer_1": 0.159423828125, "loss_aux_layer_10": 0.175048828125, "loss_aux_layer_11": 0.185791015625, "loss_aux_layer_12": 0.201416015625, "loss_aux_layer_13": 0.214599609375, "loss_aux_layer_14": 0.233154296875, "loss_aux_layer_15": 0.2470703125, "loss_aux_layer_16": 0.26123046875, "loss_aux_layer_17": 0.2607421875, "loss_aux_layer_18": 0.2685546875, "loss_aux_layer_19": 0.26123046875, "loss_aux_layer_2": 0.18310546875, "loss_aux_layer_20": 0.2587890625, "loss_aux_layer_21": 0.2568359375, "loss_aux_layer_22": 0.2763671875, "loss_aux_layer_23": 0.32421875, "loss_aux_layer_3": 0.186767578125, "loss_aux_layer_4": 0.1806640625, "loss_aux_layer_5": 0.181396484375, "loss_aux_layer_6": 0.17822265625, "loss_aux_layer_7": 0.171630859375, "loss_aux_layer_8": 0.173583984375, "loss_aux_layer_9": 0.17236328125, "step": 161, "total_loss": 0.8359792828559875 }, { "epoch": 0.03207285686002772, "grad_norm": 1.16935396194458, "learning_rate": 5e-05, "llm_loss": 0.6013026237487793, "loss": 3.2147, "loss_aux_layer_0": 0.0849609375, "loss_aux_layer_1": 0.1484375, "loss_aux_layer_10": 0.163818359375, "loss_aux_layer_11": 0.173828125, "loss_aux_layer_12": 0.188232421875, "loss_aux_layer_13": 0.2001953125, "loss_aux_layer_14": 0.218994140625, "loss_aux_layer_15": 0.234619140625, "loss_aux_layer_16": 0.248779296875, "loss_aux_layer_17": 0.25, "loss_aux_layer_18": 0.259765625, "loss_aux_layer_19": 0.25341796875, "loss_aux_layer_2": 0.16943359375, "loss_aux_layer_20": 0.25341796875, "loss_aux_layer_21": 0.25390625, "loss_aux_layer_22": 0.27392578125, "loss_aux_layer_23": 0.32421875, "loss_aux_layer_3": 0.173095703125, "loss_aux_layer_4": 0.167724609375, "loss_aux_layer_5": 0.168701171875, "loss_aux_layer_6": 0.16552734375, "loss_aux_layer_7": 0.15966796875, "loss_aux_layer_8": 0.1611328125, "loss_aux_layer_9": 0.161376953125, "step": 162, "total_loss": 0.8036793172359467 }, { "epoch": 0.03227083745792912, "grad_norm": 0.9756426215171814, "learning_rate": 5e-05, "llm_loss": 0.6435291916131973, "loss": 3.3688, "loss_aux_layer_0": 0.0765380859375, "loss_aux_layer_1": 0.140869140625, "loss_aux_layer_10": 0.160400390625, "loss_aux_layer_11": 0.1708984375, "loss_aux_layer_12": 0.18505859375, "loss_aux_layer_13": 0.1982421875, "loss_aux_layer_14": 0.217529296875, "loss_aux_layer_15": 0.232421875, "loss_aux_layer_16": 0.24755859375, "loss_aux_layer_17": 0.24853515625, "loss_aux_layer_18": 0.25732421875, "loss_aux_layer_19": 0.251708984375, "loss_aux_layer_2": 0.162109375, "loss_aux_layer_20": 0.251220703125, "loss_aux_layer_21": 0.251220703125, "loss_aux_layer_22": 0.26953125, "loss_aux_layer_23": 0.31982421875, "loss_aux_layer_3": 0.166748046875, "loss_aux_layer_4": 0.162353515625, "loss_aux_layer_5": 0.164306640625, "loss_aux_layer_6": 0.16162109375, "loss_aux_layer_7": 0.156005859375, "loss_aux_layer_8": 0.158203125, "loss_aux_layer_9": 0.157958984375, "step": 163, "total_loss": 0.8421887755393982 }, { "epoch": 0.03246881805583053, "grad_norm": 0.8010618090629578, "learning_rate": 5e-05, "llm_loss": 0.7283288240432739, "loss": 3.7119, "loss_aux_layer_0": 0.08056640625, "loss_aux_layer_1": 0.14306640625, "loss_aux_layer_10": 0.16015625, "loss_aux_layer_11": 0.17138671875, "loss_aux_layer_12": 0.18603515625, "loss_aux_layer_13": 0.19921875, "loss_aux_layer_14": 0.217529296875, "loss_aux_layer_15": 0.233154296875, "loss_aux_layer_16": 0.248291015625, "loss_aux_layer_17": 0.249755859375, "loss_aux_layer_18": 0.25830078125, "loss_aux_layer_19": 0.2529296875, "loss_aux_layer_2": 0.16357421875, "loss_aux_layer_20": 0.25244140625, "loss_aux_layer_21": 0.252685546875, "loss_aux_layer_22": 0.2734375, "loss_aux_layer_23": 0.322265625, "loss_aux_layer_3": 0.167236328125, "loss_aux_layer_4": 0.162353515625, "loss_aux_layer_5": 0.163818359375, "loss_aux_layer_6": 0.161376953125, "loss_aux_layer_7": 0.155517578125, "loss_aux_layer_8": 0.157958984375, "loss_aux_layer_9": 0.15771484375, "step": 164, "total_loss": 0.9279745221138 }, { "epoch": 0.032666798653731934, "grad_norm": 1.059099555015564, "learning_rate": 5e-05, "llm_loss": 0.748511478304863, "loss": 3.7751, "loss_aux_layer_0": 0.0751953125, "loss_aux_layer_1": 0.136962890625, "loss_aux_layer_10": 0.15625, "loss_aux_layer_11": 0.16650390625, "loss_aux_layer_12": 0.180419921875, "loss_aux_layer_13": 0.19384765625, "loss_aux_layer_14": 0.21337890625, "loss_aux_layer_15": 0.229248046875, "loss_aux_layer_16": 0.243408203125, "loss_aux_layer_17": 0.24658203125, "loss_aux_layer_18": 0.2548828125, "loss_aux_layer_19": 0.25, "loss_aux_layer_2": 0.157470703125, "loss_aux_layer_20": 0.250732421875, "loss_aux_layer_21": 0.25048828125, "loss_aux_layer_22": 0.26904296875, "loss_aux_layer_23": 0.31689453125, "loss_aux_layer_3": 0.161865234375, "loss_aux_layer_4": 0.157470703125, "loss_aux_layer_5": 0.158935546875, "loss_aux_layer_6": 0.1572265625, "loss_aux_layer_7": 0.151123046875, "loss_aux_layer_8": 0.154052734375, "loss_aux_layer_9": 0.15380859375, "step": 165, "total_loss": 0.9437740594148636 }, { "epoch": 0.03286477925163334, "grad_norm": 1.2065684795379639, "learning_rate": 5e-05, "llm_loss": 0.6356232166290283, "loss": 3.3501, "loss_aux_layer_0": 0.0810546875, "loss_aux_layer_1": 0.149658203125, "loss_aux_layer_10": 0.164794921875, "loss_aux_layer_11": 0.17529296875, "loss_aux_layer_12": 0.18994140625, "loss_aux_layer_13": 0.201904296875, "loss_aux_layer_14": 0.2197265625, "loss_aux_layer_15": 0.234130859375, "loss_aux_layer_16": 0.2470703125, "loss_aux_layer_17": 0.248291015625, "loss_aux_layer_18": 0.25634765625, "loss_aux_layer_19": 0.249755859375, "loss_aux_layer_2": 0.17138671875, "loss_aux_layer_20": 0.24853515625, "loss_aux_layer_21": 0.24853515625, "loss_aux_layer_22": 0.26953125, "loss_aux_layer_23": 0.31884765625, "loss_aux_layer_3": 0.175048828125, "loss_aux_layer_4": 0.16943359375, "loss_aux_layer_5": 0.1708984375, "loss_aux_layer_6": 0.167236328125, "loss_aux_layer_7": 0.16162109375, "loss_aux_layer_8": 0.163818359375, "loss_aux_layer_9": 0.162841796875, "step": 166, "total_loss": 0.8375268429517746 }, { "epoch": 0.033062759849534745, "grad_norm": 0.6823099851608276, "learning_rate": 5e-05, "llm_loss": 0.7483128756284714, "loss": 3.7843, "loss_aux_layer_0": 0.077392578125, "loss_aux_layer_1": 0.142333984375, "loss_aux_layer_10": 0.160400390625, "loss_aux_layer_11": 0.17041015625, "loss_aux_layer_12": 0.184326171875, "loss_aux_layer_13": 0.196044921875, "loss_aux_layer_14": 0.215087890625, "loss_aux_layer_15": 0.230224609375, "loss_aux_layer_16": 0.243896484375, "loss_aux_layer_17": 0.245849609375, "loss_aux_layer_18": 0.253173828125, "loss_aux_layer_19": 0.247802734375, "loss_aux_layer_2": 0.164306640625, "loss_aux_layer_20": 0.24755859375, "loss_aux_layer_21": 0.24853515625, "loss_aux_layer_22": 0.27001953125, "loss_aux_layer_23": 0.3203125, "loss_aux_layer_3": 0.16796875, "loss_aux_layer_4": 0.162841796875, "loss_aux_layer_5": 0.163818359375, "loss_aux_layer_6": 0.161865234375, "loss_aux_layer_7": 0.15576171875, "loss_aux_layer_8": 0.157958984375, "loss_aux_layer_9": 0.15771484375, "step": 167, "total_loss": 0.9460703134536743 }, { "epoch": 0.033260740447436155, "grad_norm": 1.159000277519226, "learning_rate": 5e-05, "llm_loss": 0.768456757068634, "loss": 3.8689, "loss_aux_layer_0": 0.0794677734375, "loss_aux_layer_1": 0.1439208984375, "loss_aux_layer_10": 0.159423828125, "loss_aux_layer_11": 0.169677734375, "loss_aux_layer_12": 0.18359375, "loss_aux_layer_13": 0.1962890625, "loss_aux_layer_14": 0.21533203125, "loss_aux_layer_15": 0.229736328125, "loss_aux_layer_16": 0.244384765625, "loss_aux_layer_17": 0.248046875, "loss_aux_layer_18": 0.25732421875, "loss_aux_layer_19": 0.251220703125, "loss_aux_layer_2": 0.1650390625, "loss_aux_layer_20": 0.251953125, "loss_aux_layer_21": 0.251708984375, "loss_aux_layer_22": 0.27197265625, "loss_aux_layer_23": 0.31982421875, "loss_aux_layer_3": 0.1689453125, "loss_aux_layer_4": 0.16357421875, "loss_aux_layer_5": 0.1650390625, "loss_aux_layer_6": 0.162109375, "loss_aux_layer_7": 0.15576171875, "loss_aux_layer_8": 0.157958984375, "loss_aux_layer_9": 0.157470703125, "step": 168, "total_loss": 0.9672136604785919 }, { "epoch": 0.03345872104533756, "grad_norm": 1.0387593507766724, "learning_rate": 5e-05, "llm_loss": 0.7365806996822357, "loss": 3.784, "loss_aux_layer_0": 0.0828857421875, "loss_aux_layer_1": 0.155029296875, "loss_aux_layer_10": 0.173095703125, "loss_aux_layer_11": 0.183837890625, "loss_aux_layer_12": 0.1982421875, "loss_aux_layer_13": 0.209228515625, "loss_aux_layer_14": 0.228271484375, "loss_aux_layer_15": 0.24072265625, "loss_aux_layer_16": 0.255126953125, "loss_aux_layer_17": 0.2548828125, "loss_aux_layer_18": 0.26220703125, "loss_aux_layer_19": 0.255859375, "loss_aux_layer_2": 0.1796875, "loss_aux_layer_20": 0.2548828125, "loss_aux_layer_21": 0.25439453125, "loss_aux_layer_22": 0.2763671875, "loss_aux_layer_23": 0.326171875, "loss_aux_layer_3": 0.1845703125, "loss_aux_layer_4": 0.179443359375, "loss_aux_layer_5": 0.180419921875, "loss_aux_layer_6": 0.177978515625, "loss_aux_layer_7": 0.170654296875, "loss_aux_layer_8": 0.172607421875, "loss_aux_layer_9": 0.170654296875, "step": 169, "total_loss": 0.9459951967000961 }, { "epoch": 0.03365670164323896, "grad_norm": 1.071868896484375, "learning_rate": 5e-05, "llm_loss": 0.6407819539308548, "loss": 3.3602, "loss_aux_layer_0": 0.075927734375, "loss_aux_layer_1": 0.142333984375, "loss_aux_layer_10": 0.162109375, "loss_aux_layer_11": 0.17236328125, "loss_aux_layer_12": 0.187255859375, "loss_aux_layer_13": 0.199462890625, "loss_aux_layer_14": 0.218017578125, "loss_aux_layer_15": 0.232421875, "loss_aux_layer_16": 0.247802734375, "loss_aux_layer_17": 0.24951171875, "loss_aux_layer_18": 0.25732421875, "loss_aux_layer_19": 0.251953125, "loss_aux_layer_2": 0.164794921875, "loss_aux_layer_20": 0.2509765625, "loss_aux_layer_21": 0.2490234375, "loss_aux_layer_22": 0.2685546875, "loss_aux_layer_23": 0.31787109375, "loss_aux_layer_3": 0.169189453125, "loss_aux_layer_4": 0.164306640625, "loss_aux_layer_5": 0.1650390625, "loss_aux_layer_6": 0.16259765625, "loss_aux_layer_7": 0.15673828125, "loss_aux_layer_8": 0.158935546875, "loss_aux_layer_9": 0.1591796875, "step": 170, "total_loss": 0.8400455713272095 }, { "epoch": 0.03385468224114037, "grad_norm": 1.255284070968628, "learning_rate": 5e-05, "llm_loss": 0.6304368078708649, "loss": 3.3438, "loss_aux_layer_0": 0.078369140625, "loss_aux_layer_1": 0.154052734375, "loss_aux_layer_10": 0.1689453125, "loss_aux_layer_11": 0.179443359375, "loss_aux_layer_12": 0.1943359375, "loss_aux_layer_13": 0.205810546875, "loss_aux_layer_14": 0.22412109375, "loss_aux_layer_15": 0.237548828125, "loss_aux_layer_16": 0.251708984375, "loss_aux_layer_17": 0.250732421875, "loss_aux_layer_18": 0.2578125, "loss_aux_layer_19": 0.251953125, "loss_aux_layer_2": 0.178466796875, "loss_aux_layer_20": 0.251953125, "loss_aux_layer_21": 0.250244140625, "loss_aux_layer_22": 0.2705078125, "loss_aux_layer_23": 0.31884765625, "loss_aux_layer_3": 0.182373046875, "loss_aux_layer_4": 0.176025390625, "loss_aux_layer_5": 0.17626953125, "loss_aux_layer_6": 0.173095703125, "loss_aux_layer_7": 0.165771484375, "loss_aux_layer_8": 0.167724609375, "loss_aux_layer_9": 0.166748046875, "step": 171, "total_loss": 0.8359517455101013 }, { "epoch": 0.03405266283904177, "grad_norm": 0.6608083844184875, "learning_rate": 5e-05, "llm_loss": 0.7587469667196274, "loss": 3.8413, "loss_aux_layer_0": 0.0784912109375, "loss_aux_layer_1": 0.146728515625, "loss_aux_layer_10": 0.162353515625, "loss_aux_layer_11": 0.172607421875, "loss_aux_layer_12": 0.187744140625, "loss_aux_layer_13": 0.199951171875, "loss_aux_layer_14": 0.2197265625, "loss_aux_layer_15": 0.23486328125, "loss_aux_layer_16": 0.250244140625, "loss_aux_layer_17": 0.251220703125, "loss_aux_layer_18": 0.25830078125, "loss_aux_layer_19": 0.2529296875, "loss_aux_layer_2": 0.168701171875, "loss_aux_layer_20": 0.2529296875, "loss_aux_layer_21": 0.25244140625, "loss_aux_layer_22": 0.27294921875, "loss_aux_layer_23": 0.3212890625, "loss_aux_layer_3": 0.173095703125, "loss_aux_layer_4": 0.167724609375, "loss_aux_layer_5": 0.168701171875, "loss_aux_layer_6": 0.165771484375, "loss_aux_layer_7": 0.1591796875, "loss_aux_layer_8": 0.1611328125, "loss_aux_layer_9": 0.160400390625, "step": 172, "total_loss": 0.9603333622217178 }, { "epoch": 0.03425064343694318, "grad_norm": 0.9804598093032837, "learning_rate": 5e-05, "llm_loss": 0.6852699518203735, "loss": 3.5495, "loss_aux_layer_0": 0.0736083984375, "loss_aux_layer_1": 0.146484375, "loss_aux_layer_10": 0.16552734375, "loss_aux_layer_11": 0.17529296875, "loss_aux_layer_12": 0.18896484375, "loss_aux_layer_13": 0.200439453125, "loss_aux_layer_14": 0.218505859375, "loss_aux_layer_15": 0.23388671875, "loss_aux_layer_16": 0.248046875, "loss_aux_layer_17": 0.24755859375, "loss_aux_layer_18": 0.25537109375, "loss_aux_layer_19": 0.249267578125, "loss_aux_layer_2": 0.17041015625, "loss_aux_layer_20": 0.249267578125, "loss_aux_layer_21": 0.251220703125, "loss_aux_layer_22": 0.2724609375, "loss_aux_layer_23": 0.322265625, "loss_aux_layer_3": 0.175537109375, "loss_aux_layer_4": 0.171142578125, "loss_aux_layer_5": 0.17333984375, "loss_aux_layer_6": 0.17041015625, "loss_aux_layer_7": 0.163330078125, "loss_aux_layer_8": 0.164794921875, "loss_aux_layer_9": 0.16357421875, "step": 173, "total_loss": 0.8873837292194366 }, { "epoch": 0.03444862403484458, "grad_norm": 0.9414743185043335, "learning_rate": 5e-05, "llm_loss": 0.6521150916814804, "loss": 3.3627, "loss_aux_layer_0": 0.07098388671875, "loss_aux_layer_1": 0.1319580078125, "loss_aux_layer_10": 0.15087890625, "loss_aux_layer_11": 0.159423828125, "loss_aux_layer_12": 0.173095703125, "loss_aux_layer_13": 0.185302734375, "loss_aux_layer_14": 0.2041015625, "loss_aux_layer_15": 0.2197265625, "loss_aux_layer_16": 0.23486328125, "loss_aux_layer_17": 0.238037109375, "loss_aux_layer_18": 0.2470703125, "loss_aux_layer_19": 0.241455078125, "loss_aux_layer_2": 0.152099609375, "loss_aux_layer_20": 0.24169921875, "loss_aux_layer_21": 0.24169921875, "loss_aux_layer_22": 0.26220703125, "loss_aux_layer_23": 0.31201171875, "loss_aux_layer_3": 0.156982421875, "loss_aux_layer_4": 0.15283203125, "loss_aux_layer_5": 0.154052734375, "loss_aux_layer_6": 0.15234375, "loss_aux_layer_7": 0.146484375, "loss_aux_layer_8": 0.148681640625, "loss_aux_layer_9": 0.1484375, "step": 174, "total_loss": 0.8406796902418137 }, { "epoch": 0.03464660463274599, "grad_norm": 2.6428635120391846, "learning_rate": 5e-05, "llm_loss": 0.5631384700536728, "loss": 3.0725, "loss_aux_layer_0": 0.075927734375, "loss_aux_layer_1": 0.149658203125, "loss_aux_layer_10": 0.169921875, "loss_aux_layer_11": 0.1796875, "loss_aux_layer_12": 0.193603515625, "loss_aux_layer_13": 0.204345703125, "loss_aux_layer_14": 0.22265625, "loss_aux_layer_15": 0.236083984375, "loss_aux_layer_16": 0.24951171875, "loss_aux_layer_17": 0.2490234375, "loss_aux_layer_18": 0.2568359375, "loss_aux_layer_19": 0.250732421875, "loss_aux_layer_2": 0.175537109375, "loss_aux_layer_20": 0.25, "loss_aux_layer_21": 0.25146484375, "loss_aux_layer_22": 0.2734375, "loss_aux_layer_23": 0.3212890625, "loss_aux_layer_3": 0.1806640625, "loss_aux_layer_4": 0.175048828125, "loss_aux_layer_5": 0.176513671875, "loss_aux_layer_6": 0.174072265625, "loss_aux_layer_7": 0.167236328125, "loss_aux_layer_8": 0.16943359375, "loss_aux_layer_9": 0.167724609375, "step": 175, "total_loss": 0.7681291699409485 }, { "epoch": 0.034844585230647394, "grad_norm": 3.1431491374969482, "learning_rate": 5e-05, "llm_loss": 0.7347351610660553, "loss": 3.7364, "loss_aux_layer_0": 0.072021484375, "loss_aux_layer_1": 0.14111328125, "loss_aux_layer_10": 0.15966796875, "loss_aux_layer_11": 0.169921875, "loss_aux_layer_12": 0.18505859375, "loss_aux_layer_13": 0.197998046875, "loss_aux_layer_14": 0.218505859375, "loss_aux_layer_15": 0.233154296875, "loss_aux_layer_16": 0.248291015625, "loss_aux_layer_17": 0.25048828125, "loss_aux_layer_18": 0.25830078125, "loss_aux_layer_19": 0.25341796875, "loss_aux_layer_2": 0.164306640625, "loss_aux_layer_20": 0.25244140625, "loss_aux_layer_21": 0.25244140625, "loss_aux_layer_22": 0.2724609375, "loss_aux_layer_23": 0.32470703125, "loss_aux_layer_3": 0.16845703125, "loss_aux_layer_4": 0.163330078125, "loss_aux_layer_5": 0.16455078125, "loss_aux_layer_6": 0.162353515625, "loss_aux_layer_7": 0.155517578125, "loss_aux_layer_8": 0.157958984375, "loss_aux_layer_9": 0.157958984375, "step": 176, "total_loss": 0.9341090619564056 }, { "epoch": 0.035042565828548804, "grad_norm": 2.38759183883667, "learning_rate": 5e-05, "llm_loss": 0.7050700634717941, "loss": 3.6268, "loss_aux_layer_0": 0.0728759765625, "loss_aux_layer_1": 0.1416015625, "loss_aux_layer_10": 0.16259765625, "loss_aux_layer_11": 0.172607421875, "loss_aux_layer_12": 0.186767578125, "loss_aux_layer_13": 0.19970703125, "loss_aux_layer_14": 0.22021484375, "loss_aux_layer_15": 0.2353515625, "loss_aux_layer_16": 0.251220703125, "loss_aux_layer_17": 0.25390625, "loss_aux_layer_18": 0.2626953125, "loss_aux_layer_19": 0.256591796875, "loss_aux_layer_2": 0.165771484375, "loss_aux_layer_20": 0.255615234375, "loss_aux_layer_21": 0.25439453125, "loss_aux_layer_22": 0.27587890625, "loss_aux_layer_23": 0.32421875, "loss_aux_layer_3": 0.17041015625, "loss_aux_layer_4": 0.165771484375, "loss_aux_layer_5": 0.16748046875, "loss_aux_layer_6": 0.165283203125, "loss_aux_layer_7": 0.158203125, "loss_aux_layer_8": 0.160400390625, "loss_aux_layer_9": 0.16015625, "step": 177, "total_loss": 0.9066946506500244 }, { "epoch": 0.035240546426450206, "grad_norm": 1.722018837928772, "learning_rate": 5e-05, "llm_loss": 0.6553711444139481, "loss": 3.4201, "loss_aux_layer_0": 0.0721435546875, "loss_aux_layer_1": 0.146240234375, "loss_aux_layer_10": 0.1630859375, "loss_aux_layer_11": 0.172607421875, "loss_aux_layer_12": 0.1865234375, "loss_aux_layer_13": 0.1982421875, "loss_aux_layer_14": 0.21630859375, "loss_aux_layer_15": 0.230712890625, "loss_aux_layer_16": 0.24560546875, "loss_aux_layer_17": 0.245849609375, "loss_aux_layer_18": 0.253662109375, "loss_aux_layer_19": 0.2470703125, "loss_aux_layer_2": 0.170166015625, "loss_aux_layer_20": 0.247314453125, "loss_aux_layer_21": 0.2470703125, "loss_aux_layer_22": 0.26708984375, "loss_aux_layer_23": 0.3154296875, "loss_aux_layer_3": 0.1748046875, "loss_aux_layer_4": 0.169921875, "loss_aux_layer_5": 0.170654296875, "loss_aux_layer_6": 0.168212890625, "loss_aux_layer_7": 0.160888671875, "loss_aux_layer_8": 0.161865234375, "loss_aux_layer_9": 0.160888671875, "step": 178, "total_loss": 0.8550349473953247 }, { "epoch": 0.035438527024351615, "grad_norm": 1.8709371089935303, "learning_rate": 5e-05, "llm_loss": 0.6769080609083176, "loss": 3.5143, "loss_aux_layer_0": 0.070556640625, "loss_aux_layer_1": 0.146484375, "loss_aux_layer_10": 0.165283203125, "loss_aux_layer_11": 0.17578125, "loss_aux_layer_12": 0.189453125, "loss_aux_layer_13": 0.20068359375, "loss_aux_layer_14": 0.218505859375, "loss_aux_layer_15": 0.232177734375, "loss_aux_layer_16": 0.245849609375, "loss_aux_layer_17": 0.246826171875, "loss_aux_layer_18": 0.25537109375, "loss_aux_layer_19": 0.24951171875, "loss_aux_layer_2": 0.1708984375, "loss_aux_layer_20": 0.24951171875, "loss_aux_layer_21": 0.250244140625, "loss_aux_layer_22": 0.2705078125, "loss_aux_layer_23": 0.31982421875, "loss_aux_layer_3": 0.176025390625, "loss_aux_layer_4": 0.17138671875, "loss_aux_layer_5": 0.173095703125, "loss_aux_layer_6": 0.17041015625, "loss_aux_layer_7": 0.163330078125, "loss_aux_layer_8": 0.164794921875, "loss_aux_layer_9": 0.163330078125, "step": 179, "total_loss": 0.8785628229379654 }, { "epoch": 0.03563650762225302, "grad_norm": 4.055916786193848, "learning_rate": 5e-05, "llm_loss": 0.7661997079849243, "loss": 3.8472, "loss_aux_layer_0": 0.0703125, "loss_aux_layer_1": 0.140625, "loss_aux_layer_10": 0.16015625, "loss_aux_layer_11": 0.169677734375, "loss_aux_layer_12": 0.18310546875, "loss_aux_layer_13": 0.1943359375, "loss_aux_layer_14": 0.21337890625, "loss_aux_layer_15": 0.22607421875, "loss_aux_layer_16": 0.240478515625, "loss_aux_layer_17": 0.24169921875, "loss_aux_layer_18": 0.2490234375, "loss_aux_layer_19": 0.24462890625, "loss_aux_layer_2": 0.1640625, "loss_aux_layer_20": 0.244873046875, "loss_aux_layer_21": 0.244384765625, "loss_aux_layer_22": 0.26416015625, "loss_aux_layer_23": 0.31103515625, "loss_aux_layer_3": 0.16845703125, "loss_aux_layer_4": 0.16357421875, "loss_aux_layer_5": 0.1650390625, "loss_aux_layer_6": 0.1630859375, "loss_aux_layer_7": 0.156005859375, "loss_aux_layer_8": 0.15869140625, "loss_aux_layer_9": 0.158203125, "step": 180, "total_loss": 0.9618052840232849 }, { "epoch": 0.03583448822015443, "grad_norm": 1.1034502983093262, "learning_rate": 5e-05, "llm_loss": 0.7256975471973419, "loss": 3.6987, "loss_aux_layer_0": 0.0726318359375, "loss_aux_layer_1": 0.14453125, "loss_aux_layer_10": 0.163818359375, "loss_aux_layer_11": 0.172607421875, "loss_aux_layer_12": 0.186279296875, "loss_aux_layer_13": 0.196044921875, "loss_aux_layer_14": 0.214111328125, "loss_aux_layer_15": 0.227783203125, "loss_aux_layer_16": 0.2412109375, "loss_aux_layer_17": 0.2421875, "loss_aux_layer_18": 0.2509765625, "loss_aux_layer_19": 0.247314453125, "loss_aux_layer_2": 0.168212890625, "loss_aux_layer_20": 0.247314453125, "loss_aux_layer_21": 0.24853515625, "loss_aux_layer_22": 0.26953125, "loss_aux_layer_23": 0.31787109375, "loss_aux_layer_3": 0.17333984375, "loss_aux_layer_4": 0.16845703125, "loss_aux_layer_5": 0.169677734375, "loss_aux_layer_6": 0.16796875, "loss_aux_layer_7": 0.160888671875, "loss_aux_layer_8": 0.16259765625, "loss_aux_layer_9": 0.161865234375, "step": 181, "total_loss": 0.9246721714735031 }, { "epoch": 0.03603246881805583, "grad_norm": 2.298239231109619, "learning_rate": 5e-05, "llm_loss": 0.7786948531866074, "loss": 3.8989, "loss_aux_layer_0": 0.0675048828125, "loss_aux_layer_1": 0.139404296875, "loss_aux_layer_10": 0.157958984375, "loss_aux_layer_11": 0.166748046875, "loss_aux_layer_12": 0.1806640625, "loss_aux_layer_13": 0.19189453125, "loss_aux_layer_14": 0.21142578125, "loss_aux_layer_15": 0.22705078125, "loss_aux_layer_16": 0.24365234375, "loss_aux_layer_17": 0.244873046875, "loss_aux_layer_18": 0.25390625, "loss_aux_layer_19": 0.250244140625, "loss_aux_layer_2": 0.1630859375, "loss_aux_layer_20": 0.249267578125, "loss_aux_layer_21": 0.249267578125, "loss_aux_layer_22": 0.26953125, "loss_aux_layer_23": 0.31787109375, "loss_aux_layer_3": 0.167236328125, "loss_aux_layer_4": 0.162109375, "loss_aux_layer_5": 0.163330078125, "loss_aux_layer_6": 0.1611328125, "loss_aux_layer_7": 0.15380859375, "loss_aux_layer_8": 0.156005859375, "loss_aux_layer_9": 0.15576171875, "step": 182, "total_loss": 0.9747219681739807 }, { "epoch": 0.03623044941595724, "grad_norm": 1.523383378982544, "learning_rate": 5e-05, "llm_loss": 0.6955223828554153, "loss": 3.5504, "loss_aux_layer_0": 0.0660400390625, "loss_aux_layer_1": 0.136474609375, "loss_aux_layer_10": 0.15478515625, "loss_aux_layer_11": 0.164794921875, "loss_aux_layer_12": 0.17822265625, "loss_aux_layer_13": 0.190185546875, "loss_aux_layer_14": 0.208251953125, "loss_aux_layer_15": 0.221923828125, "loss_aux_layer_16": 0.236572265625, "loss_aux_layer_17": 0.23876953125, "loss_aux_layer_18": 0.24755859375, "loss_aux_layer_19": 0.2421875, "loss_aux_layer_2": 0.158935546875, "loss_aux_layer_20": 0.24267578125, "loss_aux_layer_21": 0.244140625, "loss_aux_layer_22": 0.26513671875, "loss_aux_layer_23": 0.3134765625, "loss_aux_layer_3": 0.163818359375, "loss_aux_layer_4": 0.1591796875, "loss_aux_layer_5": 0.16064453125, "loss_aux_layer_6": 0.158203125, "loss_aux_layer_7": 0.151611328125, "loss_aux_layer_8": 0.15380859375, "loss_aux_layer_9": 0.152587890625, "step": 183, "total_loss": 0.8876117616891861 }, { "epoch": 0.03642843001385864, "grad_norm": 1.1797798871994019, "learning_rate": 5e-05, "llm_loss": 0.6984829306602478, "loss": 3.5529, "loss_aux_layer_0": 0.068359375, "loss_aux_layer_1": 0.1376953125, "loss_aux_layer_10": 0.15234375, "loss_aux_layer_11": 0.160888671875, "loss_aux_layer_12": 0.17431640625, "loss_aux_layer_13": 0.185791015625, "loss_aux_layer_14": 0.203857421875, "loss_aux_layer_15": 0.21826171875, "loss_aux_layer_16": 0.232666015625, "loss_aux_layer_17": 0.234619140625, "loss_aux_layer_18": 0.24267578125, "loss_aux_layer_19": 0.239013671875, "loss_aux_layer_2": 0.159912109375, "loss_aux_layer_20": 0.23974609375, "loss_aux_layer_21": 0.240234375, "loss_aux_layer_22": 0.26123046875, "loss_aux_layer_23": 0.3095703125, "loss_aux_layer_3": 0.1640625, "loss_aux_layer_4": 0.158935546875, "loss_aux_layer_5": 0.15966796875, "loss_aux_layer_6": 0.15771484375, "loss_aux_layer_7": 0.14990234375, "loss_aux_layer_8": 0.151611328125, "loss_aux_layer_9": 0.150634765625, "step": 184, "total_loss": 0.8882351815700531 }, { "epoch": 0.03662641061176005, "grad_norm": 1.5774563550949097, "learning_rate": 5e-05, "llm_loss": 0.5986344814300537, "loss": 3.1809, "loss_aux_layer_0": 0.0655517578125, "loss_aux_layer_1": 0.14111328125, "loss_aux_layer_10": 0.16064453125, "loss_aux_layer_11": 0.169921875, "loss_aux_layer_12": 0.18408203125, "loss_aux_layer_13": 0.195556640625, "loss_aux_layer_14": 0.214111328125, "loss_aux_layer_15": 0.2275390625, "loss_aux_layer_16": 0.24169921875, "loss_aux_layer_17": 0.242919921875, "loss_aux_layer_18": 0.2509765625, "loss_aux_layer_19": 0.245849609375, "loss_aux_layer_2": 0.164306640625, "loss_aux_layer_20": 0.245361328125, "loss_aux_layer_21": 0.24658203125, "loss_aux_layer_22": 0.265625, "loss_aux_layer_23": 0.314453125, "loss_aux_layer_3": 0.169921875, "loss_aux_layer_4": 0.16552734375, "loss_aux_layer_5": 0.16650390625, "loss_aux_layer_6": 0.164306640625, "loss_aux_layer_7": 0.157470703125, "loss_aux_layer_8": 0.1591796875, "loss_aux_layer_9": 0.158447265625, "step": 185, "total_loss": 0.7952169924974442 }, { "epoch": 0.03682439120966145, "grad_norm": 3.6339423656463623, "learning_rate": 5e-05, "llm_loss": 0.6258084699511528, "loss": 3.2653, "loss_aux_layer_0": 0.064208984375, "loss_aux_layer_1": 0.134765625, "loss_aux_layer_10": 0.153076171875, "loss_aux_layer_11": 0.16162109375, "loss_aux_layer_12": 0.174560546875, "loss_aux_layer_13": 0.185791015625, "loss_aux_layer_14": 0.204833984375, "loss_aux_layer_15": 0.21923828125, "loss_aux_layer_16": 0.23388671875, "loss_aux_layer_17": 0.236328125, "loss_aux_layer_18": 0.245361328125, "loss_aux_layer_19": 0.2412109375, "loss_aux_layer_2": 0.158203125, "loss_aux_layer_20": 0.242431640625, "loss_aux_layer_21": 0.244873046875, "loss_aux_layer_22": 0.263671875, "loss_aux_layer_23": 0.31396484375, "loss_aux_layer_3": 0.162353515625, "loss_aux_layer_4": 0.158447265625, "loss_aux_layer_5": 0.160400390625, "loss_aux_layer_6": 0.158447265625, "loss_aux_layer_7": 0.1513671875, "loss_aux_layer_8": 0.15283203125, "loss_aux_layer_9": 0.1513671875, "step": 186, "total_loss": 0.8163354992866516 }, { "epoch": 0.03702237180756286, "grad_norm": 1.2139722108840942, "learning_rate": 5e-05, "llm_loss": 0.6070375591516495, "loss": 3.1904, "loss_aux_layer_0": 0.068603515625, "loss_aux_layer_1": 0.136474609375, "loss_aux_layer_10": 0.153076171875, "loss_aux_layer_11": 0.1611328125, "loss_aux_layer_12": 0.174560546875, "loss_aux_layer_13": 0.184814453125, "loss_aux_layer_14": 0.203369140625, "loss_aux_layer_15": 0.21923828125, "loss_aux_layer_16": 0.23388671875, "loss_aux_layer_17": 0.236083984375, "loss_aux_layer_18": 0.245361328125, "loss_aux_layer_19": 0.24169921875, "loss_aux_layer_2": 0.159423828125, "loss_aux_layer_20": 0.241455078125, "loss_aux_layer_21": 0.242431640625, "loss_aux_layer_22": 0.26171875, "loss_aux_layer_23": 0.31201171875, "loss_aux_layer_3": 0.164306640625, "loss_aux_layer_4": 0.159423828125, "loss_aux_layer_5": 0.160888671875, "loss_aux_layer_6": 0.158935546875, "loss_aux_layer_7": 0.15087890625, "loss_aux_layer_8": 0.152587890625, "loss_aux_layer_9": 0.151611328125, "step": 187, "total_loss": 0.7976010739803314 }, { "epoch": 0.037220352405464265, "grad_norm": 1.771709680557251, "learning_rate": 5e-05, "llm_loss": 0.7013482302427292, "loss": 3.5318, "loss_aux_layer_0": 0.060546875, "loss_aux_layer_1": 0.125732421875, "loss_aux_layer_10": 0.142822265625, "loss_aux_layer_11": 0.15087890625, "loss_aux_layer_12": 0.164794921875, "loss_aux_layer_13": 0.177001953125, "loss_aux_layer_14": 0.1943359375, "loss_aux_layer_15": 0.210205078125, "loss_aux_layer_16": 0.2255859375, "loss_aux_layer_17": 0.229736328125, "loss_aux_layer_18": 0.23974609375, "loss_aux_layer_19": 0.236572265625, "loss_aux_layer_2": 0.146240234375, "loss_aux_layer_20": 0.23828125, "loss_aux_layer_21": 0.23828125, "loss_aux_layer_22": 0.25732421875, "loss_aux_layer_23": 0.306640625, "loss_aux_layer_3": 0.15087890625, "loss_aux_layer_4": 0.146728515625, "loss_aux_layer_5": 0.14794921875, "loss_aux_layer_6": 0.146484375, "loss_aux_layer_7": 0.139404296875, "loss_aux_layer_8": 0.1416015625, "loss_aux_layer_9": 0.14111328125, "step": 188, "total_loss": 0.8829488158226013 }, { "epoch": 0.03741833300336567, "grad_norm": 1.9450358152389526, "learning_rate": 5e-05, "llm_loss": 0.699817031621933, "loss": 3.5795, "loss_aux_layer_0": 0.065673828125, "loss_aux_layer_1": 0.141357421875, "loss_aux_layer_10": 0.158447265625, "loss_aux_layer_11": 0.1669921875, "loss_aux_layer_12": 0.180419921875, "loss_aux_layer_13": 0.1904296875, "loss_aux_layer_14": 0.208984375, "loss_aux_layer_15": 0.22314453125, "loss_aux_layer_16": 0.237548828125, "loss_aux_layer_17": 0.23974609375, "loss_aux_layer_18": 0.2490234375, "loss_aux_layer_19": 0.2451171875, "loss_aux_layer_2": 0.1650390625, "loss_aux_layer_20": 0.24560546875, "loss_aux_layer_21": 0.244873046875, "loss_aux_layer_22": 0.265625, "loss_aux_layer_23": 0.3125, "loss_aux_layer_3": 0.17041015625, "loss_aux_layer_4": 0.166015625, "loss_aux_layer_5": 0.16748046875, "loss_aux_layer_6": 0.1650390625, "loss_aux_layer_7": 0.156982421875, "loss_aux_layer_8": 0.157958984375, "loss_aux_layer_9": 0.15673828125, "step": 189, "total_loss": 0.8948767930269241 }, { "epoch": 0.037616313601267076, "grad_norm": 2.0055344104766846, "learning_rate": 5e-05, "llm_loss": 0.7594177573919296, "loss": 3.8116, "loss_aux_layer_0": 0.0650634765625, "loss_aux_layer_1": 0.138916015625, "loss_aux_layer_10": 0.15625, "loss_aux_layer_11": 0.1650390625, "loss_aux_layer_12": 0.177978515625, "loss_aux_layer_13": 0.188720703125, "loss_aux_layer_14": 0.20703125, "loss_aux_layer_15": 0.221435546875, "loss_aux_layer_16": 0.236328125, "loss_aux_layer_17": 0.238037109375, "loss_aux_layer_18": 0.24658203125, "loss_aux_layer_19": 0.2431640625, "loss_aux_layer_2": 0.163818359375, "loss_aux_layer_20": 0.243896484375, "loss_aux_layer_21": 0.24462890625, "loss_aux_layer_22": 0.26513671875, "loss_aux_layer_23": 0.31298828125, "loss_aux_layer_3": 0.16845703125, "loss_aux_layer_4": 0.1640625, "loss_aux_layer_5": 0.16552734375, "loss_aux_layer_6": 0.16357421875, "loss_aux_layer_7": 0.1552734375, "loss_aux_layer_8": 0.156494140625, "loss_aux_layer_9": 0.155029296875, "step": 190, "total_loss": 0.9528973251581192 }, { "epoch": 0.03781429419916848, "grad_norm": 1.7712050676345825, "learning_rate": 5e-05, "llm_loss": 0.7324788421392441, "loss": 3.7017, "loss_aux_layer_0": 0.06414794921875, "loss_aux_layer_1": 0.13916015625, "loss_aux_layer_10": 0.155029296875, "loss_aux_layer_11": 0.163818359375, "loss_aux_layer_12": 0.177490234375, "loss_aux_layer_13": 0.189208984375, "loss_aux_layer_14": 0.20751953125, "loss_aux_layer_15": 0.2216796875, "loss_aux_layer_16": 0.23681640625, "loss_aux_layer_17": 0.239990234375, "loss_aux_layer_18": 0.248779296875, "loss_aux_layer_19": 0.24462890625, "loss_aux_layer_2": 0.1630859375, "loss_aux_layer_20": 0.244384765625, "loss_aux_layer_21": 0.244140625, "loss_aux_layer_22": 0.263671875, "loss_aux_layer_23": 0.3115234375, "loss_aux_layer_3": 0.16748046875, "loss_aux_layer_4": 0.16259765625, "loss_aux_layer_5": 0.163330078125, "loss_aux_layer_6": 0.161376953125, "loss_aux_layer_7": 0.153076171875, "loss_aux_layer_8": 0.154296875, "loss_aux_layer_9": 0.15283203125, "step": 191, "total_loss": 0.9254125654697418 }, { "epoch": 0.03801227479706989, "grad_norm": 3.618515729904175, "learning_rate": 5e-05, "llm_loss": 0.5861613750457764, "loss": 3.1239, "loss_aux_layer_0": 0.0667724609375, "loss_aux_layer_1": 0.14208984375, "loss_aux_layer_10": 0.159912109375, "loss_aux_layer_11": 0.168701171875, "loss_aux_layer_12": 0.181884765625, "loss_aux_layer_13": 0.19189453125, "loss_aux_layer_14": 0.208740234375, "loss_aux_layer_15": 0.221435546875, "loss_aux_layer_16": 0.23486328125, "loss_aux_layer_17": 0.236572265625, "loss_aux_layer_18": 0.2451171875, "loss_aux_layer_19": 0.240966796875, "loss_aux_layer_2": 0.166015625, "loss_aux_layer_20": 0.241455078125, "loss_aux_layer_21": 0.241943359375, "loss_aux_layer_22": 0.2626953125, "loss_aux_layer_23": 0.310546875, "loss_aux_layer_3": 0.171630859375, "loss_aux_layer_4": 0.167724609375, "loss_aux_layer_5": 0.16943359375, "loss_aux_layer_6": 0.1669921875, "loss_aux_layer_7": 0.159423828125, "loss_aux_layer_8": 0.160400390625, "loss_aux_layer_9": 0.158447265625, "step": 192, "total_loss": 0.7809827476739883 }, { "epoch": 0.03821025539497129, "grad_norm": 1.0014046430587769, "learning_rate": 5e-05, "llm_loss": 0.6860674917697906, "loss": 3.4651, "loss_aux_layer_0": 0.06005859375, "loss_aux_layer_1": 0.1239013671875, "loss_aux_layer_10": 0.140625, "loss_aux_layer_11": 0.148681640625, "loss_aux_layer_12": 0.161865234375, "loss_aux_layer_13": 0.1728515625, "loss_aux_layer_14": 0.19189453125, "loss_aux_layer_15": 0.20703125, "loss_aux_layer_16": 0.22265625, "loss_aux_layer_17": 0.226318359375, "loss_aux_layer_18": 0.23681640625, "loss_aux_layer_19": 0.234375, "loss_aux_layer_2": 0.1455078125, "loss_aux_layer_20": 0.237060546875, "loss_aux_layer_21": 0.23876953125, "loss_aux_layer_22": 0.25927734375, "loss_aux_layer_23": 0.30908203125, "loss_aux_layer_3": 0.150634765625, "loss_aux_layer_4": 0.146484375, "loss_aux_layer_5": 0.147705078125, "loss_aux_layer_6": 0.14599609375, "loss_aux_layer_7": 0.138427734375, "loss_aux_layer_8": 0.140380859375, "loss_aux_layer_9": 0.139404296875, "step": 193, "total_loss": 0.8662674278020859 }, { "epoch": 0.0384082359928727, "grad_norm": 3.330857992172241, "learning_rate": 5e-05, "llm_loss": 0.6581374406814575, "loss": 3.4096, "loss_aux_layer_0": 0.06304931640625, "loss_aux_layer_1": 0.142333984375, "loss_aux_layer_10": 0.159912109375, "loss_aux_layer_11": 0.16845703125, "loss_aux_layer_12": 0.180908203125, "loss_aux_layer_13": 0.190185546875, "loss_aux_layer_14": 0.20654296875, "loss_aux_layer_15": 0.219482421875, "loss_aux_layer_16": 0.233642578125, "loss_aux_layer_17": 0.23583984375, "loss_aux_layer_18": 0.244873046875, "loss_aux_layer_19": 0.240966796875, "loss_aux_layer_2": 0.1669921875, "loss_aux_layer_20": 0.2412109375, "loss_aux_layer_21": 0.241455078125, "loss_aux_layer_22": 0.26171875, "loss_aux_layer_23": 0.3076171875, "loss_aux_layer_3": 0.172607421875, "loss_aux_layer_4": 0.16845703125, "loss_aux_layer_5": 0.170166015625, "loss_aux_layer_6": 0.167724609375, "loss_aux_layer_7": 0.159912109375, "loss_aux_layer_8": 0.16015625, "loss_aux_layer_9": 0.158447265625, "step": 194, "total_loss": 0.8524070084095001 }, { "epoch": 0.0386062165907741, "grad_norm": 1.1188404560089111, "learning_rate": 5e-05, "llm_loss": 0.7014737278223038, "loss": 3.5804, "loss_aux_layer_0": 0.0653076171875, "loss_aux_layer_1": 0.142333984375, "loss_aux_layer_10": 0.156982421875, "loss_aux_layer_11": 0.16552734375, "loss_aux_layer_12": 0.17822265625, "loss_aux_layer_13": 0.1875, "loss_aux_layer_14": 0.20556640625, "loss_aux_layer_15": 0.21923828125, "loss_aux_layer_16": 0.234375, "loss_aux_layer_17": 0.236328125, "loss_aux_layer_18": 0.24609375, "loss_aux_layer_19": 0.2431640625, "loss_aux_layer_2": 0.166015625, "loss_aux_layer_20": 0.243408203125, "loss_aux_layer_21": 0.243408203125, "loss_aux_layer_22": 0.263671875, "loss_aux_layer_23": 0.31103515625, "loss_aux_layer_3": 0.171142578125, "loss_aux_layer_4": 0.166259765625, "loss_aux_layer_5": 0.167236328125, "loss_aux_layer_6": 0.1650390625, "loss_aux_layer_7": 0.156494140625, "loss_aux_layer_8": 0.157470703125, "loss_aux_layer_9": 0.15576171875, "step": 195, "total_loss": 0.8951096832752228 }, { "epoch": 0.03880419718867551, "grad_norm": 3.1946539878845215, "learning_rate": 5e-05, "llm_loss": 0.6790178716182709, "loss": 3.4594, "loss_aux_layer_0": 0.06060791015625, "loss_aux_layer_1": 0.1319580078125, "loss_aux_layer_10": 0.14794921875, "loss_aux_layer_11": 0.156005859375, "loss_aux_layer_12": 0.168701171875, "loss_aux_layer_13": 0.178955078125, "loss_aux_layer_14": 0.197998046875, "loss_aux_layer_15": 0.212646484375, "loss_aux_layer_16": 0.227783203125, "loss_aux_layer_17": 0.230712890625, "loss_aux_layer_18": 0.24072265625, "loss_aux_layer_19": 0.23828125, "loss_aux_layer_2": 0.153564453125, "loss_aux_layer_20": 0.2392578125, "loss_aux_layer_21": 0.240966796875, "loss_aux_layer_22": 0.26025390625, "loss_aux_layer_23": 0.310546875, "loss_aux_layer_3": 0.15869140625, "loss_aux_layer_4": 0.154052734375, "loss_aux_layer_5": 0.15576171875, "loss_aux_layer_6": 0.154296875, "loss_aux_layer_7": 0.146240234375, "loss_aux_layer_8": 0.14794921875, "loss_aux_layer_9": 0.146728515625, "step": 196, "total_loss": 0.8648460656404495 }, { "epoch": 0.039002177786576914, "grad_norm": 1.9524129629135132, "learning_rate": 5e-05, "llm_loss": 0.7146974503993988, "loss": 3.618, "loss_aux_layer_0": 0.066650390625, "loss_aux_layer_1": 0.140869140625, "loss_aux_layer_10": 0.151611328125, "loss_aux_layer_11": 0.159912109375, "loss_aux_layer_12": 0.173095703125, "loss_aux_layer_13": 0.18408203125, "loss_aux_layer_14": 0.201904296875, "loss_aux_layer_15": 0.2158203125, "loss_aux_layer_16": 0.23046875, "loss_aux_layer_17": 0.2333984375, "loss_aux_layer_18": 0.242919921875, "loss_aux_layer_19": 0.238525390625, "loss_aux_layer_2": 0.162841796875, "loss_aux_layer_20": 0.2392578125, "loss_aux_layer_21": 0.239501953125, "loss_aux_layer_22": 0.25927734375, "loss_aux_layer_23": 0.30615234375, "loss_aux_layer_3": 0.167724609375, "loss_aux_layer_4": 0.162353515625, "loss_aux_layer_5": 0.163330078125, "loss_aux_layer_6": 0.160888671875, "loss_aux_layer_7": 0.152099609375, "loss_aux_layer_8": 0.15234375, "loss_aux_layer_9": 0.150390625, "step": 197, "total_loss": 0.9045037627220154 }, { "epoch": 0.03920015838447832, "grad_norm": 1.8962790966033936, "learning_rate": 5e-05, "llm_loss": 0.6281272023916245, "loss": 3.2679, "loss_aux_layer_0": 0.06170654296875, "loss_aux_layer_1": 0.13720703125, "loss_aux_layer_10": 0.15283203125, "loss_aux_layer_11": 0.160400390625, "loss_aux_layer_12": 0.173583984375, "loss_aux_layer_13": 0.183349609375, "loss_aux_layer_14": 0.20068359375, "loss_aux_layer_15": 0.214111328125, "loss_aux_layer_16": 0.2275390625, "loss_aux_layer_17": 0.229736328125, "loss_aux_layer_18": 0.238525390625, "loss_aux_layer_19": 0.234619140625, "loss_aux_layer_2": 0.1611328125, "loss_aux_layer_20": 0.236328125, "loss_aux_layer_21": 0.238525390625, "loss_aux_layer_22": 0.26171875, "loss_aux_layer_23": 0.31103515625, "loss_aux_layer_3": 0.16650390625, "loss_aux_layer_4": 0.161865234375, "loss_aux_layer_5": 0.163330078125, "loss_aux_layer_6": 0.161376953125, "loss_aux_layer_7": 0.15234375, "loss_aux_layer_8": 0.1533203125, "loss_aux_layer_9": 0.15185546875, "step": 198, "total_loss": 0.8169830441474915 }, { "epoch": 0.039398138982379725, "grad_norm": 1.1150091886520386, "learning_rate": 5e-05, "llm_loss": 0.6395651698112488, "loss": 3.3264, "loss_aux_layer_0": 0.0650634765625, "loss_aux_layer_1": 0.140869140625, "loss_aux_layer_10": 0.153564453125, "loss_aux_layer_11": 0.162353515625, "loss_aux_layer_12": 0.17578125, "loss_aux_layer_13": 0.187255859375, "loss_aux_layer_14": 0.203857421875, "loss_aux_layer_15": 0.218505859375, "loss_aux_layer_16": 0.23291015625, "loss_aux_layer_17": 0.23681640625, "loss_aux_layer_18": 0.245361328125, "loss_aux_layer_19": 0.24267578125, "loss_aux_layer_2": 0.16259765625, "loss_aux_layer_20": 0.243896484375, "loss_aux_layer_21": 0.2431640625, "loss_aux_layer_22": 0.26416015625, "loss_aux_layer_23": 0.3134765625, "loss_aux_layer_3": 0.167724609375, "loss_aux_layer_4": 0.163330078125, "loss_aux_layer_5": 0.164306640625, "loss_aux_layer_6": 0.161865234375, "loss_aux_layer_7": 0.153076171875, "loss_aux_layer_8": 0.153564453125, "loss_aux_layer_9": 0.15185546875, "step": 199, "total_loss": 0.8315948098897934 }, { "epoch": 0.039596119580281135, "grad_norm": 2.438791275024414, "learning_rate": 5e-05, "llm_loss": 0.6360547989606857, "loss": 3.3305, "loss_aux_layer_0": 0.06695556640625, "loss_aux_layer_1": 0.142822265625, "loss_aux_layer_10": 0.159912109375, "loss_aux_layer_11": 0.168701171875, "loss_aux_layer_12": 0.18115234375, "loss_aux_layer_13": 0.1904296875, "loss_aux_layer_14": 0.208251953125, "loss_aux_layer_15": 0.22216796875, "loss_aux_layer_16": 0.236083984375, "loss_aux_layer_17": 0.23779296875, "loss_aux_layer_18": 0.246826171875, "loss_aux_layer_19": 0.243896484375, "loss_aux_layer_2": 0.167724609375, "loss_aux_layer_20": 0.244873046875, "loss_aux_layer_21": 0.248046875, "loss_aux_layer_22": 0.2705078125, "loss_aux_layer_23": 0.318359375, "loss_aux_layer_3": 0.173583984375, "loss_aux_layer_4": 0.16943359375, "loss_aux_layer_5": 0.17138671875, "loss_aux_layer_6": 0.1689453125, "loss_aux_layer_7": 0.160400390625, "loss_aux_layer_8": 0.160888671875, "loss_aux_layer_9": 0.159423828125, "step": 200, "total_loss": 0.8326249122619629 }, { "epoch": 0.03979410017818254, "grad_norm": 1.0194536447525024, "learning_rate": 5e-05, "llm_loss": 0.7419736087322235, "loss": 3.7398, "loss_aux_layer_0": 0.06756591796875, "loss_aux_layer_1": 0.142822265625, "loss_aux_layer_10": 0.156005859375, "loss_aux_layer_11": 0.163818359375, "loss_aux_layer_12": 0.176025390625, "loss_aux_layer_13": 0.18603515625, "loss_aux_layer_14": 0.203369140625, "loss_aux_layer_15": 0.21728515625, "loss_aux_layer_16": 0.2314453125, "loss_aux_layer_17": 0.235107421875, "loss_aux_layer_18": 0.244873046875, "loss_aux_layer_19": 0.241455078125, "loss_aux_layer_2": 0.166015625, "loss_aux_layer_20": 0.24267578125, "loss_aux_layer_21": 0.242919921875, "loss_aux_layer_22": 0.26318359375, "loss_aux_layer_23": 0.3095703125, "loss_aux_layer_3": 0.171630859375, "loss_aux_layer_4": 0.1669921875, "loss_aux_layer_5": 0.16845703125, "loss_aux_layer_6": 0.166015625, "loss_aux_layer_7": 0.156494140625, "loss_aux_layer_8": 0.156494140625, "loss_aux_layer_9": 0.154541015625, "step": 201, "total_loss": 0.9349565804004669 }, { "epoch": 0.039992080776083946, "grad_norm": 3.317420721054077, "learning_rate": 5e-05, "llm_loss": 0.7554933577775955, "loss": 3.7732, "loss_aux_layer_0": 0.06298828125, "loss_aux_layer_1": 0.13720703125, "loss_aux_layer_10": 0.152099609375, "loss_aux_layer_11": 0.16015625, "loss_aux_layer_12": 0.17236328125, "loss_aux_layer_13": 0.181884765625, "loss_aux_layer_14": 0.199462890625, "loss_aux_layer_15": 0.21337890625, "loss_aux_layer_16": 0.2275390625, "loss_aux_layer_17": 0.23046875, "loss_aux_layer_18": 0.238525390625, "loss_aux_layer_19": 0.234375, "loss_aux_layer_2": 0.160400390625, "loss_aux_layer_20": 0.235595703125, "loss_aux_layer_21": 0.235107421875, "loss_aux_layer_22": 0.25537109375, "loss_aux_layer_23": 0.3017578125, "loss_aux_layer_3": 0.166015625, "loss_aux_layer_4": 0.16162109375, "loss_aux_layer_5": 0.1630859375, "loss_aux_layer_6": 0.16162109375, "loss_aux_layer_7": 0.152099609375, "loss_aux_layer_8": 0.15283203125, "loss_aux_layer_9": 0.1513671875, "step": 202, "total_loss": 0.9433004409074783 }, { "epoch": 0.04019006137398535, "grad_norm": 2.49920392036438, "learning_rate": 5e-05, "llm_loss": 0.6173365116119385, "loss": 3.2446, "loss_aux_layer_0": 0.06402587890625, "loss_aux_layer_1": 0.14013671875, "loss_aux_layer_10": 0.15576171875, "loss_aux_layer_11": 0.164794921875, "loss_aux_layer_12": 0.177490234375, "loss_aux_layer_13": 0.18896484375, "loss_aux_layer_14": 0.20654296875, "loss_aux_layer_15": 0.220947265625, "loss_aux_layer_16": 0.236083984375, "loss_aux_layer_17": 0.23974609375, "loss_aux_layer_18": 0.248291015625, "loss_aux_layer_19": 0.2451171875, "loss_aux_layer_2": 0.16357421875, "loss_aux_layer_20": 0.244140625, "loss_aux_layer_21": 0.24462890625, "loss_aux_layer_22": 0.265625, "loss_aux_layer_23": 0.31396484375, "loss_aux_layer_3": 0.16943359375, "loss_aux_layer_4": 0.165283203125, "loss_aux_layer_5": 0.166259765625, "loss_aux_layer_6": 0.164306640625, "loss_aux_layer_7": 0.1552734375, "loss_aux_layer_8": 0.155517578125, "loss_aux_layer_9": 0.154052734375, "step": 203, "total_loss": 0.8111453801393509 }, { "epoch": 0.04038804197188676, "grad_norm": 0.9375646114349365, "learning_rate": 5e-05, "llm_loss": 0.5901808142662048, "loss": 3.1049, "loss_aux_layer_0": 0.05816650390625, "loss_aux_layer_1": 0.1282958984375, "loss_aux_layer_10": 0.146240234375, "loss_aux_layer_11": 0.154296875, "loss_aux_layer_12": 0.167236328125, "loss_aux_layer_13": 0.17724609375, "loss_aux_layer_14": 0.196044921875, "loss_aux_layer_15": 0.212158203125, "loss_aux_layer_16": 0.228271484375, "loss_aux_layer_17": 0.231689453125, "loss_aux_layer_18": 0.242431640625, "loss_aux_layer_19": 0.241943359375, "loss_aux_layer_2": 0.151123046875, "loss_aux_layer_20": 0.24462890625, "loss_aux_layer_21": 0.24609375, "loss_aux_layer_22": 0.26708984375, "loss_aux_layer_23": 0.31640625, "loss_aux_layer_3": 0.157470703125, "loss_aux_layer_4": 0.153564453125, "loss_aux_layer_5": 0.155029296875, "loss_aux_layer_6": 0.1533203125, "loss_aux_layer_7": 0.14501953125, "loss_aux_layer_8": 0.14599609375, "loss_aux_layer_9": 0.14453125, "step": 204, "total_loss": 0.7762178927659988 }, { "epoch": 0.04058602256978816, "grad_norm": 1.6444929838180542, "learning_rate": 5e-05, "llm_loss": 0.6617871075868607, "loss": 3.4059, "loss_aux_layer_0": 0.0640869140625, "loss_aux_layer_1": 0.1405029296875, "loss_aux_layer_10": 0.153076171875, "loss_aux_layer_11": 0.1611328125, "loss_aux_layer_12": 0.17333984375, "loss_aux_layer_13": 0.183349609375, "loss_aux_layer_14": 0.20068359375, "loss_aux_layer_15": 0.215576171875, "loss_aux_layer_16": 0.2294921875, "loss_aux_layer_17": 0.230712890625, "loss_aux_layer_18": 0.23974609375, "loss_aux_layer_19": 0.236328125, "loss_aux_layer_2": 0.1640625, "loss_aux_layer_20": 0.237548828125, "loss_aux_layer_21": 0.239501953125, "loss_aux_layer_22": 0.26025390625, "loss_aux_layer_23": 0.30810546875, "loss_aux_layer_3": 0.1689453125, "loss_aux_layer_4": 0.16357421875, "loss_aux_layer_5": 0.1650390625, "loss_aux_layer_6": 0.162353515625, "loss_aux_layer_7": 0.15283203125, "loss_aux_layer_8": 0.1533203125, "loss_aux_layer_9": 0.15185546875, "step": 205, "total_loss": 0.8514786511659622 }, { "epoch": 0.04078400316768957, "grad_norm": 1.534348487854004, "learning_rate": 5e-05, "llm_loss": 0.6617314666509628, "loss": 3.4176, "loss_aux_layer_0": 0.05877685546875, "loss_aux_layer_1": 0.141357421875, "loss_aux_layer_10": 0.154541015625, "loss_aux_layer_11": 0.16357421875, "loss_aux_layer_12": 0.176513671875, "loss_aux_layer_13": 0.18798828125, "loss_aux_layer_14": 0.206787109375, "loss_aux_layer_15": 0.220703125, "loss_aux_layer_16": 0.234619140625, "loss_aux_layer_17": 0.238037109375, "loss_aux_layer_18": 0.246826171875, "loss_aux_layer_19": 0.241943359375, "loss_aux_layer_2": 0.1650390625, "loss_aux_layer_20": 0.242919921875, "loss_aux_layer_21": 0.242919921875, "loss_aux_layer_22": 0.26220703125, "loss_aux_layer_23": 0.30810546875, "loss_aux_layer_3": 0.170654296875, "loss_aux_layer_4": 0.16552734375, "loss_aux_layer_5": 0.167236328125, "loss_aux_layer_6": 0.164794921875, "loss_aux_layer_7": 0.15478515625, "loss_aux_layer_8": 0.15478515625, "loss_aux_layer_9": 0.1533203125, "step": 206, "total_loss": 0.8544004261493683 }, { "epoch": 0.04098198376559097, "grad_norm": 1.4993700981140137, "learning_rate": 5e-05, "llm_loss": 0.7132295668125153, "loss": 3.6054, "loss_aux_layer_0": 0.05792236328125, "loss_aux_layer_1": 0.1356201171875, "loss_aux_layer_10": 0.14990234375, "loss_aux_layer_11": 0.15771484375, "loss_aux_layer_12": 0.170166015625, "loss_aux_layer_13": 0.180419921875, "loss_aux_layer_14": 0.19873046875, "loss_aux_layer_15": 0.213623046875, "loss_aux_layer_16": 0.228271484375, "loss_aux_layer_17": 0.232421875, "loss_aux_layer_18": 0.24267578125, "loss_aux_layer_19": 0.23876953125, "loss_aux_layer_2": 0.159912109375, "loss_aux_layer_20": 0.240234375, "loss_aux_layer_21": 0.24072265625, "loss_aux_layer_22": 0.26171875, "loss_aux_layer_23": 0.30810546875, "loss_aux_layer_3": 0.16552734375, "loss_aux_layer_4": 0.16064453125, "loss_aux_layer_5": 0.162353515625, "loss_aux_layer_6": 0.159912109375, "loss_aux_layer_7": 0.150634765625, "loss_aux_layer_8": 0.150634765625, "loss_aux_layer_9": 0.14892578125, "step": 207, "total_loss": 0.9013456553220749 }, { "epoch": 0.041179964363492375, "grad_norm": 3.991447687149048, "learning_rate": 5e-05, "llm_loss": 0.6078555881977081, "loss": 3.1894, "loss_aux_layer_0": 0.05950927734375, "loss_aux_layer_1": 0.138916015625, "loss_aux_layer_10": 0.15234375, "loss_aux_layer_11": 0.1611328125, "loss_aux_layer_12": 0.173095703125, "loss_aux_layer_13": 0.18359375, "loss_aux_layer_14": 0.201171875, "loss_aux_layer_15": 0.21533203125, "loss_aux_layer_16": 0.22998046875, "loss_aux_layer_17": 0.232666015625, "loss_aux_layer_18": 0.24267578125, "loss_aux_layer_19": 0.237548828125, "loss_aux_layer_2": 0.16259765625, "loss_aux_layer_20": 0.238525390625, "loss_aux_layer_21": 0.2392578125, "loss_aux_layer_22": 0.25927734375, "loss_aux_layer_23": 0.3046875, "loss_aux_layer_3": 0.16845703125, "loss_aux_layer_4": 0.163330078125, "loss_aux_layer_5": 0.16455078125, "loss_aux_layer_6": 0.162353515625, "loss_aux_layer_7": 0.152587890625, "loss_aux_layer_8": 0.1533203125, "loss_aux_layer_9": 0.15087890625, "step": 208, "total_loss": 0.7973532229661942 }, { "epoch": 0.041377944961393784, "grad_norm": 2.8139166831970215, "learning_rate": 5e-05, "llm_loss": 0.6419569849967957, "loss": 3.3197, "loss_aux_layer_0": 0.057373046875, "loss_aux_layer_1": 0.13427734375, "loss_aux_layer_10": 0.14990234375, "loss_aux_layer_11": 0.157470703125, "loss_aux_layer_12": 0.170166015625, "loss_aux_layer_13": 0.181396484375, "loss_aux_layer_14": 0.199951171875, "loss_aux_layer_15": 0.214111328125, "loss_aux_layer_16": 0.229248046875, "loss_aux_layer_17": 0.23291015625, "loss_aux_layer_18": 0.24267578125, "loss_aux_layer_19": 0.23974609375, "loss_aux_layer_2": 0.157470703125, "loss_aux_layer_20": 0.240478515625, "loss_aux_layer_21": 0.240966796875, "loss_aux_layer_22": 0.26025390625, "loss_aux_layer_23": 0.3076171875, "loss_aux_layer_3": 0.163818359375, "loss_aux_layer_4": 0.15966796875, "loss_aux_layer_5": 0.161376953125, "loss_aux_layer_6": 0.159912109375, "loss_aux_layer_7": 0.150634765625, "loss_aux_layer_8": 0.150390625, "loss_aux_layer_9": 0.1484375, "step": 209, "total_loss": 0.8299325406551361 }, { "epoch": 0.041575925559295186, "grad_norm": 3.1385490894317627, "learning_rate": 5e-05, "llm_loss": 0.7055174708366394, "loss": 3.5523, "loss_aux_layer_0": 0.05474853515625, "loss_aux_layer_1": 0.1278076171875, "loss_aux_layer_10": 0.144775390625, "loss_aux_layer_11": 0.15185546875, "loss_aux_layer_12": 0.164306640625, "loss_aux_layer_13": 0.175048828125, "loss_aux_layer_14": 0.193359375, "loss_aux_layer_15": 0.208984375, "loss_aux_layer_16": 0.224609375, "loss_aux_layer_17": 0.228759765625, "loss_aux_layer_18": 0.23876953125, "loss_aux_layer_19": 0.2353515625, "loss_aux_layer_2": 0.150146484375, "loss_aux_layer_20": 0.237060546875, "loss_aux_layer_21": 0.23681640625, "loss_aux_layer_22": 0.2548828125, "loss_aux_layer_23": 0.3037109375, "loss_aux_layer_3": 0.15673828125, "loss_aux_layer_4": 0.153076171875, "loss_aux_layer_5": 0.15478515625, "loss_aux_layer_6": 0.1533203125, "loss_aux_layer_7": 0.145263671875, "loss_aux_layer_8": 0.144775390625, "loss_aux_layer_9": 0.14306640625, "step": 210, "total_loss": 0.8880870044231415 }, { "epoch": 0.041773906157196596, "grad_norm": 2.019057035446167, "learning_rate": 5e-05, "llm_loss": 0.6483956277370453, "loss": 3.3564, "loss_aux_layer_0": 0.0621337890625, "loss_aux_layer_1": 0.14208984375, "loss_aux_layer_10": 0.1533203125, "loss_aux_layer_11": 0.161376953125, "loss_aux_layer_12": 0.173583984375, "loss_aux_layer_13": 0.18310546875, "loss_aux_layer_14": 0.200439453125, "loss_aux_layer_15": 0.21337890625, "loss_aux_layer_16": 0.227294921875, "loss_aux_layer_17": 0.229736328125, "loss_aux_layer_18": 0.239013671875, "loss_aux_layer_19": 0.236572265625, "loss_aux_layer_2": 0.166259765625, "loss_aux_layer_20": 0.238037109375, "loss_aux_layer_21": 0.240478515625, "loss_aux_layer_22": 0.263671875, "loss_aux_layer_23": 0.3125, "loss_aux_layer_3": 0.171875, "loss_aux_layer_4": 0.16650390625, "loss_aux_layer_5": 0.167724609375, "loss_aux_layer_6": 0.165771484375, "loss_aux_layer_7": 0.155029296875, "loss_aux_layer_8": 0.154541015625, "loss_aux_layer_9": 0.152099609375, "step": 211, "total_loss": 0.8390974551439285 }, { "epoch": 0.041971886755098, "grad_norm": 2.0421178340911865, "learning_rate": 5e-05, "llm_loss": 0.6707650646567345, "loss": 3.4395, "loss_aux_layer_0": 0.06109619140625, "loss_aux_layer_1": 0.138427734375, "loss_aux_layer_10": 0.150146484375, "loss_aux_layer_11": 0.158447265625, "loss_aux_layer_12": 0.17138671875, "loss_aux_layer_13": 0.180908203125, "loss_aux_layer_14": 0.198974609375, "loss_aux_layer_15": 0.2138671875, "loss_aux_layer_16": 0.22900390625, "loss_aux_layer_17": 0.232666015625, "loss_aux_layer_18": 0.242431640625, "loss_aux_layer_19": 0.239013671875, "loss_aux_layer_2": 0.161865234375, "loss_aux_layer_20": 0.240234375, "loss_aux_layer_21": 0.240478515625, "loss_aux_layer_22": 0.26123046875, "loss_aux_layer_23": 0.31005859375, "loss_aux_layer_3": 0.16748046875, "loss_aux_layer_4": 0.1630859375, "loss_aux_layer_5": 0.1640625, "loss_aux_layer_6": 0.162109375, "loss_aux_layer_7": 0.15185546875, "loss_aux_layer_8": 0.151123046875, "loss_aux_layer_9": 0.1494140625, "step": 212, "total_loss": 0.8598815500736237 }, { "epoch": 0.04216986735299941, "grad_norm": 1.5049365758895874, "learning_rate": 5e-05, "llm_loss": 0.6821173578500748, "loss": 3.4691, "loss_aux_layer_0": 0.05609130859375, "loss_aux_layer_1": 0.132080078125, "loss_aux_layer_10": 0.14599609375, "loss_aux_layer_11": 0.154296875, "loss_aux_layer_12": 0.1669921875, "loss_aux_layer_13": 0.17822265625, "loss_aux_layer_14": 0.19677734375, "loss_aux_layer_15": 0.211669921875, "loss_aux_layer_16": 0.22607421875, "loss_aux_layer_17": 0.23046875, "loss_aux_layer_18": 0.240478515625, "loss_aux_layer_19": 0.2373046875, "loss_aux_layer_2": 0.154296875, "loss_aux_layer_20": 0.23828125, "loss_aux_layer_21": 0.2392578125, "loss_aux_layer_22": 0.259765625, "loss_aux_layer_23": 0.30810546875, "loss_aux_layer_3": 0.16064453125, "loss_aux_layer_4": 0.15625, "loss_aux_layer_5": 0.157958984375, "loss_aux_layer_6": 0.156005859375, "loss_aux_layer_7": 0.145751953125, "loss_aux_layer_8": 0.145751953125, "loss_aux_layer_9": 0.144287109375, "step": 213, "total_loss": 0.8672722727060318 }, { "epoch": 0.04236784795090081, "grad_norm": 1.6932449340820312, "learning_rate": 5e-05, "llm_loss": 0.6562913656234741, "loss": 3.3641, "loss_aux_layer_0": 0.0582275390625, "loss_aux_layer_1": 0.1314697265625, "loss_aux_layer_10": 0.144287109375, "loss_aux_layer_11": 0.15234375, "loss_aux_layer_12": 0.16455078125, "loss_aux_layer_13": 0.17529296875, "loss_aux_layer_14": 0.19384765625, "loss_aux_layer_15": 0.209228515625, "loss_aux_layer_16": 0.224365234375, "loss_aux_layer_17": 0.229736328125, "loss_aux_layer_18": 0.2392578125, "loss_aux_layer_19": 0.237060546875, "loss_aux_layer_2": 0.15380859375, "loss_aux_layer_20": 0.2392578125, "loss_aux_layer_21": 0.242431640625, "loss_aux_layer_22": 0.2646484375, "loss_aux_layer_23": 0.31201171875, "loss_aux_layer_3": 0.16015625, "loss_aux_layer_4": 0.15625, "loss_aux_layer_5": 0.15771484375, "loss_aux_layer_6": 0.155517578125, "loss_aux_layer_7": 0.14501953125, "loss_aux_layer_8": 0.14501953125, "loss_aux_layer_9": 0.142822265625, "step": 214, "total_loss": 0.8410269618034363 }, { "epoch": 0.04256582854880222, "grad_norm": 1.1040458679199219, "learning_rate": 5e-05, "llm_loss": 0.6447097063064575, "loss": 3.3253, "loss_aux_layer_0": 0.056396484375, "loss_aux_layer_1": 0.135498046875, "loss_aux_layer_10": 0.1474609375, "loss_aux_layer_11": 0.15576171875, "loss_aux_layer_12": 0.167236328125, "loss_aux_layer_13": 0.1767578125, "loss_aux_layer_14": 0.195068359375, "loss_aux_layer_15": 0.21044921875, "loss_aux_layer_16": 0.22607421875, "loss_aux_layer_17": 0.22998046875, "loss_aux_layer_18": 0.240234375, "loss_aux_layer_19": 0.237060546875, "loss_aux_layer_2": 0.158447265625, "loss_aux_layer_20": 0.23876953125, "loss_aux_layer_21": 0.2412109375, "loss_aux_layer_22": 0.2626953125, "loss_aux_layer_23": 0.3095703125, "loss_aux_layer_3": 0.164794921875, "loss_aux_layer_4": 0.16015625, "loss_aux_layer_5": 0.161376953125, "loss_aux_layer_6": 0.15966796875, "loss_aux_layer_7": 0.14892578125, "loss_aux_layer_8": 0.148193359375, "loss_aux_layer_9": 0.146240234375, "step": 215, "total_loss": 0.8313243836164474 }, { "epoch": 0.04276380914670362, "grad_norm": 1.5449782609939575, "learning_rate": 5e-05, "llm_loss": 0.7456542104482651, "loss": 3.7272, "loss_aux_layer_0": 0.056640625, "loss_aux_layer_1": 0.135009765625, "loss_aux_layer_10": 0.14794921875, "loss_aux_layer_11": 0.155517578125, "loss_aux_layer_12": 0.167236328125, "loss_aux_layer_13": 0.17724609375, "loss_aux_layer_14": 0.19482421875, "loss_aux_layer_15": 0.2099609375, "loss_aux_layer_16": 0.22412109375, "loss_aux_layer_17": 0.22900390625, "loss_aux_layer_18": 0.237548828125, "loss_aux_layer_19": 0.235595703125, "loss_aux_layer_2": 0.157958984375, "loss_aux_layer_20": 0.23681640625, "loss_aux_layer_21": 0.239990234375, "loss_aux_layer_22": 0.2626953125, "loss_aux_layer_23": 0.3095703125, "loss_aux_layer_3": 0.1640625, "loss_aux_layer_4": 0.159912109375, "loss_aux_layer_5": 0.16162109375, "loss_aux_layer_6": 0.159912109375, "loss_aux_layer_7": 0.14892578125, "loss_aux_layer_8": 0.148681640625, "loss_aux_layer_9": 0.146240234375, "step": 216, "total_loss": 0.9318082630634308 }, { "epoch": 0.04296178974460503, "grad_norm": 1.78507399559021, "learning_rate": 5e-05, "llm_loss": 0.6635310798883438, "loss": 3.3911, "loss_aux_layer_0": 0.057373046875, "loss_aux_layer_1": 0.1322021484375, "loss_aux_layer_10": 0.145751953125, "loss_aux_layer_11": 0.153076171875, "loss_aux_layer_12": 0.165283203125, "loss_aux_layer_13": 0.175048828125, "loss_aux_layer_14": 0.1923828125, "loss_aux_layer_15": 0.207763671875, "loss_aux_layer_16": 0.223388671875, "loss_aux_layer_17": 0.227294921875, "loss_aux_layer_18": 0.237060546875, "loss_aux_layer_19": 0.2353515625, "loss_aux_layer_2": 0.155517578125, "loss_aux_layer_20": 0.238525390625, "loss_aux_layer_21": 0.2392578125, "loss_aux_layer_22": 0.26025390625, "loss_aux_layer_23": 0.306640625, "loss_aux_layer_3": 0.16162109375, "loss_aux_layer_4": 0.156982421875, "loss_aux_layer_5": 0.158447265625, "loss_aux_layer_6": 0.156494140625, "loss_aux_layer_7": 0.146240234375, "loss_aux_layer_8": 0.146240234375, "loss_aux_layer_9": 0.14404296875, "step": 217, "total_loss": 0.847786009311676 }, { "epoch": 0.04315977034250643, "grad_norm": 1.0634400844573975, "learning_rate": 5e-05, "llm_loss": 0.6157641261816025, "loss": 3.1931, "loss_aux_layer_0": 0.058349609375, "loss_aux_layer_1": 0.130859375, "loss_aux_layer_10": 0.143310546875, "loss_aux_layer_11": 0.150634765625, "loss_aux_layer_12": 0.162353515625, "loss_aux_layer_13": 0.17333984375, "loss_aux_layer_14": 0.19091796875, "loss_aux_layer_15": 0.206298828125, "loss_aux_layer_16": 0.2216796875, "loss_aux_layer_17": 0.225830078125, "loss_aux_layer_18": 0.236083984375, "loss_aux_layer_19": 0.234375, "loss_aux_layer_2": 0.15234375, "loss_aux_layer_20": 0.236328125, "loss_aux_layer_21": 0.23876953125, "loss_aux_layer_22": 0.25927734375, "loss_aux_layer_23": 0.30908203125, "loss_aux_layer_3": 0.15869140625, "loss_aux_layer_4": 0.15478515625, "loss_aux_layer_5": 0.156005859375, "loss_aux_layer_6": 0.1533203125, "loss_aux_layer_7": 0.1435546875, "loss_aux_layer_8": 0.14306640625, "loss_aux_layer_9": 0.1416015625, "step": 218, "total_loss": 0.798276737332344 }, { "epoch": 0.04335775094040784, "grad_norm": 1.458869218826294, "learning_rate": 5e-05, "llm_loss": 0.7366319596767426, "loss": 3.6752, "loss_aux_layer_0": 0.05517578125, "loss_aux_layer_1": 0.13134765625, "loss_aux_layer_10": 0.1435546875, "loss_aux_layer_11": 0.150634765625, "loss_aux_layer_12": 0.16259765625, "loss_aux_layer_13": 0.173828125, "loss_aux_layer_14": 0.19140625, "loss_aux_layer_15": 0.206787109375, "loss_aux_layer_16": 0.2216796875, "loss_aux_layer_17": 0.225341796875, "loss_aux_layer_18": 0.235107421875, "loss_aux_layer_19": 0.23291015625, "loss_aux_layer_2": 0.15478515625, "loss_aux_layer_20": 0.234375, "loss_aux_layer_21": 0.235107421875, "loss_aux_layer_22": 0.25439453125, "loss_aux_layer_23": 0.30322265625, "loss_aux_layer_3": 0.16064453125, "loss_aux_layer_4": 0.156005859375, "loss_aux_layer_5": 0.1572265625, "loss_aux_layer_6": 0.1552734375, "loss_aux_layer_7": 0.144287109375, "loss_aux_layer_8": 0.1435546875, "loss_aux_layer_9": 0.141845703125, "step": 219, "total_loss": 0.9188012331724167 }, { "epoch": 0.043555731538309245, "grad_norm": 1.8609862327575684, "learning_rate": 5e-05, "llm_loss": 0.7427665889263153, "loss": 3.7204, "loss_aux_layer_0": 0.056396484375, "loss_aux_layer_1": 0.134521484375, "loss_aux_layer_10": 0.1484375, "loss_aux_layer_11": 0.156494140625, "loss_aux_layer_12": 0.16845703125, "loss_aux_layer_13": 0.17919921875, "loss_aux_layer_14": 0.197021484375, "loss_aux_layer_15": 0.211669921875, "loss_aux_layer_16": 0.22607421875, "loss_aux_layer_17": 0.2314453125, "loss_aux_layer_18": 0.24072265625, "loss_aux_layer_19": 0.23828125, "loss_aux_layer_2": 0.158203125, "loss_aux_layer_20": 0.2392578125, "loss_aux_layer_21": 0.240966796875, "loss_aux_layer_22": 0.26220703125, "loss_aux_layer_23": 0.30908203125, "loss_aux_layer_3": 0.165283203125, "loss_aux_layer_4": 0.161376953125, "loss_aux_layer_5": 0.1630859375, "loss_aux_layer_6": 0.160888671875, "loss_aux_layer_7": 0.150146484375, "loss_aux_layer_8": 0.14892578125, "loss_aux_layer_9": 0.146728515625, "step": 220, "total_loss": 0.9301000535488129 }, { "epoch": 0.043753712136210654, "grad_norm": 3.0885612964630127, "learning_rate": 5e-05, "llm_loss": 0.6142750829458237, "loss": 3.1812, "loss_aux_layer_0": 0.053466796875, "loss_aux_layer_1": 0.1314697265625, "loss_aux_layer_10": 0.142578125, "loss_aux_layer_11": 0.150634765625, "loss_aux_layer_12": 0.161865234375, "loss_aux_layer_13": 0.17236328125, "loss_aux_layer_14": 0.189453125, "loss_aux_layer_15": 0.204345703125, "loss_aux_layer_16": 0.219482421875, "loss_aux_layer_17": 0.22412109375, "loss_aux_layer_18": 0.233642578125, "loss_aux_layer_19": 0.230712890625, "loss_aux_layer_2": 0.153564453125, "loss_aux_layer_20": 0.23291015625, "loss_aux_layer_21": 0.233154296875, "loss_aux_layer_22": 0.25390625, "loss_aux_layer_23": 0.2998046875, "loss_aux_layer_3": 0.159912109375, "loss_aux_layer_4": 0.15576171875, "loss_aux_layer_5": 0.15771484375, "loss_aux_layer_6": 0.1552734375, "loss_aux_layer_7": 0.14453125, "loss_aux_layer_8": 0.1435546875, "loss_aux_layer_9": 0.141357421875, "step": 221, "total_loss": 0.7952926456928253 }, { "epoch": 0.043951692734112056, "grad_norm": 1.2289211750030518, "learning_rate": 5e-05, "llm_loss": 0.6557567715644836, "loss": 3.3593, "loss_aux_layer_0": 0.054443359375, "loss_aux_layer_1": 0.1324462890625, "loss_aux_layer_10": 0.143798828125, "loss_aux_layer_11": 0.15185546875, "loss_aux_layer_12": 0.163818359375, "loss_aux_layer_13": 0.17529296875, "loss_aux_layer_14": 0.194091796875, "loss_aux_layer_15": 0.20947265625, "loss_aux_layer_16": 0.224853515625, "loss_aux_layer_17": 0.229248046875, "loss_aux_layer_18": 0.239501953125, "loss_aux_layer_19": 0.236572265625, "loss_aux_layer_2": 0.154541015625, "loss_aux_layer_20": 0.237548828125, "loss_aux_layer_21": 0.23974609375, "loss_aux_layer_22": 0.259765625, "loss_aux_layer_23": 0.306640625, "loss_aux_layer_3": 0.160888671875, "loss_aux_layer_4": 0.156494140625, "loss_aux_layer_5": 0.15771484375, "loss_aux_layer_6": 0.15576171875, "loss_aux_layer_7": 0.14501953125, "loss_aux_layer_8": 0.14404296875, "loss_aux_layer_9": 0.142333984375, "step": 222, "total_loss": 0.8398229032754898 }, { "epoch": 0.044149673332013466, "grad_norm": 1.0785290002822876, "learning_rate": 5e-05, "llm_loss": 0.6647623032331467, "loss": 3.3975, "loss_aux_layer_0": 0.0545654296875, "loss_aux_layer_1": 0.1337890625, "loss_aux_layer_10": 0.144775390625, "loss_aux_layer_11": 0.15234375, "loss_aux_layer_12": 0.163818359375, "loss_aux_layer_13": 0.174560546875, "loss_aux_layer_14": 0.192138671875, "loss_aux_layer_15": 0.20751953125, "loss_aux_layer_16": 0.222900390625, "loss_aux_layer_17": 0.226806640625, "loss_aux_layer_18": 0.236572265625, "loss_aux_layer_19": 0.234619140625, "loss_aux_layer_2": 0.1572265625, "loss_aux_layer_20": 0.236572265625, "loss_aux_layer_21": 0.239501953125, "loss_aux_layer_22": 0.26123046875, "loss_aux_layer_23": 0.31103515625, "loss_aux_layer_3": 0.16357421875, "loss_aux_layer_4": 0.159423828125, "loss_aux_layer_5": 0.160888671875, "loss_aux_layer_6": 0.15869140625, "loss_aux_layer_7": 0.147705078125, "loss_aux_layer_8": 0.146240234375, "loss_aux_layer_9": 0.14404296875, "step": 223, "total_loss": 0.8493628352880478 }, { "epoch": 0.04434765392991487, "grad_norm": 3.0467417240142822, "learning_rate": 5e-05, "llm_loss": 0.6460565254092216, "loss": 3.31, "loss_aux_layer_0": 0.05303955078125, "loss_aux_layer_1": 0.1302490234375, "loss_aux_layer_10": 0.142822265625, "loss_aux_layer_11": 0.150390625, "loss_aux_layer_12": 0.162109375, "loss_aux_layer_13": 0.173095703125, "loss_aux_layer_14": 0.190673828125, "loss_aux_layer_15": 0.205322265625, "loss_aux_layer_16": 0.219970703125, "loss_aux_layer_17": 0.223388671875, "loss_aux_layer_18": 0.233642578125, "loss_aux_layer_19": 0.231201171875, "loss_aux_layer_2": 0.15283203125, "loss_aux_layer_20": 0.23291015625, "loss_aux_layer_21": 0.23486328125, "loss_aux_layer_22": 0.25634765625, "loss_aux_layer_23": 0.30419921875, "loss_aux_layer_3": 0.15966796875, "loss_aux_layer_4": 0.155517578125, "loss_aux_layer_5": 0.157470703125, "loss_aux_layer_6": 0.155517578125, "loss_aux_layer_7": 0.144775390625, "loss_aux_layer_8": 0.143310546875, "loss_aux_layer_9": 0.1416015625, "step": 224, "total_loss": 0.8275060653686523 }, { "epoch": 0.04454563452781628, "grad_norm": 1.766924262046814, "learning_rate": 5e-05, "llm_loss": 0.6826951503753662, "loss": 3.455, "loss_aux_layer_0": 0.0562744140625, "loss_aux_layer_1": 0.13232421875, "loss_aux_layer_10": 0.141845703125, "loss_aux_layer_11": 0.1494140625, "loss_aux_layer_12": 0.160400390625, "loss_aux_layer_13": 0.170654296875, "loss_aux_layer_14": 0.188232421875, "loss_aux_layer_15": 0.203125, "loss_aux_layer_16": 0.21826171875, "loss_aux_layer_17": 0.22265625, "loss_aux_layer_18": 0.23291015625, "loss_aux_layer_19": 0.231201171875, "loss_aux_layer_2": 0.154052734375, "loss_aux_layer_20": 0.234130859375, "loss_aux_layer_21": 0.235107421875, "loss_aux_layer_22": 0.255615234375, "loss_aux_layer_23": 0.3046875, "loss_aux_layer_3": 0.16015625, "loss_aux_layer_4": 0.15576171875, "loss_aux_layer_5": 0.1572265625, "loss_aux_layer_6": 0.155029296875, "loss_aux_layer_7": 0.1435546875, "loss_aux_layer_8": 0.142578125, "loss_aux_layer_9": 0.140380859375, "step": 225, "total_loss": 0.8637467622756958 }, { "epoch": 0.04474361512571768, "grad_norm": 1.94161057472229, "learning_rate": 5e-05, "llm_loss": 0.6690951287746429, "loss": 3.3913, "loss_aux_layer_0": 0.0517578125, "loss_aux_layer_1": 0.1268310546875, "loss_aux_layer_10": 0.1376953125, "loss_aux_layer_11": 0.1455078125, "loss_aux_layer_12": 0.1572265625, "loss_aux_layer_13": 0.168701171875, "loss_aux_layer_14": 0.186767578125, "loss_aux_layer_15": 0.20263671875, "loss_aux_layer_16": 0.21875, "loss_aux_layer_17": 0.224853515625, "loss_aux_layer_18": 0.235107421875, "loss_aux_layer_19": 0.23388671875, "loss_aux_layer_2": 0.147216796875, "loss_aux_layer_20": 0.235595703125, "loss_aux_layer_21": 0.237060546875, "loss_aux_layer_22": 0.25830078125, "loss_aux_layer_23": 0.3046875, "loss_aux_layer_3": 0.153076171875, "loss_aux_layer_4": 0.14990234375, "loss_aux_layer_5": 0.151611328125, "loss_aux_layer_6": 0.149658203125, "loss_aux_layer_7": 0.138671875, "loss_aux_layer_8": 0.137939453125, "loss_aux_layer_9": 0.1357421875, "step": 226, "total_loss": 0.8478286564350128 }, { "epoch": 0.04494159572361908, "grad_norm": 2.016544818878174, "learning_rate": 5e-05, "llm_loss": 0.6925721168518066, "loss": 3.5065, "loss_aux_layer_0": 0.0518798828125, "loss_aux_layer_1": 0.1300048828125, "loss_aux_layer_10": 0.1455078125, "loss_aux_layer_11": 0.1533203125, "loss_aux_layer_12": 0.165283203125, "loss_aux_layer_13": 0.17626953125, "loss_aux_layer_14": 0.194091796875, "loss_aux_layer_15": 0.209228515625, "loss_aux_layer_16": 0.22509765625, "loss_aux_layer_17": 0.2294921875, "loss_aux_layer_18": 0.2392578125, "loss_aux_layer_19": 0.2373046875, "loss_aux_layer_2": 0.153076171875, "loss_aux_layer_20": 0.23876953125, "loss_aux_layer_21": 0.23876953125, "loss_aux_layer_22": 0.258544921875, "loss_aux_layer_23": 0.30712890625, "loss_aux_layer_3": 0.159912109375, "loss_aux_layer_4": 0.15673828125, "loss_aux_layer_5": 0.158203125, "loss_aux_layer_6": 0.15625, "loss_aux_layer_7": 0.1455078125, "loss_aux_layer_8": 0.144775390625, "loss_aux_layer_9": 0.143310546875, "step": 227, "total_loss": 0.87662173807621 }, { "epoch": 0.04513957632152049, "grad_norm": 2.0764477252960205, "learning_rate": 5e-05, "llm_loss": 0.6078571230173111, "loss": 3.1484, "loss_aux_layer_0": 0.05145263671875, "loss_aux_layer_1": 0.12890625, "loss_aux_layer_10": 0.140869140625, "loss_aux_layer_11": 0.148681640625, "loss_aux_layer_12": 0.16015625, "loss_aux_layer_13": 0.170166015625, "loss_aux_layer_14": 0.1875, "loss_aux_layer_15": 0.203125, "loss_aux_layer_16": 0.21728515625, "loss_aux_layer_17": 0.222412109375, "loss_aux_layer_18": 0.232177734375, "loss_aux_layer_19": 0.229736328125, "loss_aux_layer_2": 0.15185546875, "loss_aux_layer_20": 0.23046875, "loss_aux_layer_21": 0.232177734375, "loss_aux_layer_22": 0.25048828125, "loss_aux_layer_23": 0.29736328125, "loss_aux_layer_3": 0.15869140625, "loss_aux_layer_4": 0.155029296875, "loss_aux_layer_5": 0.15625, "loss_aux_layer_6": 0.154052734375, "loss_aux_layer_7": 0.142578125, "loss_aux_layer_8": 0.14111328125, "loss_aux_layer_9": 0.139404296875, "step": 228, "total_loss": 0.7870958000421524 }, { "epoch": 0.045337556919421894, "grad_norm": 1.035398006439209, "learning_rate": 5e-05, "llm_loss": 0.6491740047931671, "loss": 3.3363, "loss_aux_layer_0": 0.053466796875, "loss_aux_layer_1": 0.134521484375, "loss_aux_layer_10": 0.143798828125, "loss_aux_layer_11": 0.151611328125, "loss_aux_layer_12": 0.1640625, "loss_aux_layer_13": 0.1748046875, "loss_aux_layer_14": 0.19287109375, "loss_aux_layer_15": 0.2080078125, "loss_aux_layer_16": 0.223876953125, "loss_aux_layer_17": 0.229248046875, "loss_aux_layer_18": 0.2392578125, "loss_aux_layer_19": 0.235595703125, "loss_aux_layer_2": 0.157470703125, "loss_aux_layer_20": 0.23779296875, "loss_aux_layer_21": 0.240234375, "loss_aux_layer_22": 0.26220703125, "loss_aux_layer_23": 0.30908203125, "loss_aux_layer_3": 0.16455078125, "loss_aux_layer_4": 0.16015625, "loss_aux_layer_5": 0.162109375, "loss_aux_layer_6": 0.15869140625, "loss_aux_layer_7": 0.146728515625, "loss_aux_layer_8": 0.14453125, "loss_aux_layer_9": 0.142333984375, "step": 229, "total_loss": 0.8340834975242615 }, { "epoch": 0.0455355375173233, "grad_norm": 2.1547317504882812, "learning_rate": 5e-05, "llm_loss": 0.6851986199617386, "loss": 3.4427, "loss_aux_layer_0": 0.04998779296875, "loss_aux_layer_1": 0.126708984375, "loss_aux_layer_10": 0.136962890625, "loss_aux_layer_11": 0.14404296875, "loss_aux_layer_12": 0.155029296875, "loss_aux_layer_13": 0.16552734375, "loss_aux_layer_14": 0.18359375, "loss_aux_layer_15": 0.198486328125, "loss_aux_layer_16": 0.21337890625, "loss_aux_layer_17": 0.21826171875, "loss_aux_layer_18": 0.228515625, "loss_aux_layer_19": 0.22607421875, "loss_aux_layer_2": 0.148681640625, "loss_aux_layer_20": 0.228271484375, "loss_aux_layer_21": 0.228759765625, "loss_aux_layer_22": 0.248046875, "loss_aux_layer_23": 0.2939453125, "loss_aux_layer_3": 0.154541015625, "loss_aux_layer_4": 0.150146484375, "loss_aux_layer_5": 0.15185546875, "loss_aux_layer_6": 0.14990234375, "loss_aux_layer_7": 0.138427734375, "loss_aux_layer_8": 0.13671875, "loss_aux_layer_9": 0.13525390625, "step": 230, "total_loss": 0.8606871068477631 }, { "epoch": 0.045733518115224706, "grad_norm": 1.275467872619629, "learning_rate": 5e-05, "llm_loss": 0.659816175699234, "loss": 3.3487, "loss_aux_layer_0": 0.0531005859375, "loss_aux_layer_1": 0.127197265625, "loss_aux_layer_10": 0.136962890625, "loss_aux_layer_11": 0.144775390625, "loss_aux_layer_12": 0.156494140625, "loss_aux_layer_13": 0.167236328125, "loss_aux_layer_14": 0.185302734375, "loss_aux_layer_15": 0.20068359375, "loss_aux_layer_16": 0.215576171875, "loss_aux_layer_17": 0.22119140625, "loss_aux_layer_18": 0.232177734375, "loss_aux_layer_19": 0.230712890625, "loss_aux_layer_2": 0.1494140625, "loss_aux_layer_20": 0.232177734375, "loss_aux_layer_21": 0.23291015625, "loss_aux_layer_22": 0.2529296875, "loss_aux_layer_23": 0.30126953125, "loss_aux_layer_3": 0.155029296875, "loss_aux_layer_4": 0.151123046875, "loss_aux_layer_5": 0.152099609375, "loss_aux_layer_6": 0.149169921875, "loss_aux_layer_7": 0.1376953125, "loss_aux_layer_8": 0.13671875, "loss_aux_layer_9": 0.13525390625, "step": 231, "total_loss": 0.8371846377849579 }, { "epoch": 0.045931498713126115, "grad_norm": 2.1609506607055664, "learning_rate": 5e-05, "llm_loss": 0.6319799423217773, "loss": 3.2583, "loss_aux_layer_0": 0.052001953125, "loss_aux_layer_1": 0.13427734375, "loss_aux_layer_10": 0.144775390625, "loss_aux_layer_11": 0.15185546875, "loss_aux_layer_12": 0.16357421875, "loss_aux_layer_13": 0.173828125, "loss_aux_layer_14": 0.190673828125, "loss_aux_layer_15": 0.205078125, "loss_aux_layer_16": 0.2197265625, "loss_aux_layer_17": 0.224609375, "loss_aux_layer_18": 0.23486328125, "loss_aux_layer_19": 0.232177734375, "loss_aux_layer_2": 0.157470703125, "loss_aux_layer_20": 0.23291015625, "loss_aux_layer_21": 0.232177734375, "loss_aux_layer_22": 0.251708984375, "loss_aux_layer_23": 0.2978515625, "loss_aux_layer_3": 0.164794921875, "loss_aux_layer_4": 0.160400390625, "loss_aux_layer_5": 0.162109375, "loss_aux_layer_6": 0.15966796875, "loss_aux_layer_7": 0.147216796875, "loss_aux_layer_8": 0.145263671875, "loss_aux_layer_9": 0.14306640625, "step": 232, "total_loss": 0.8145748972892761 }, { "epoch": 0.04612947931102752, "grad_norm": 1.8913195133209229, "learning_rate": 5e-05, "llm_loss": 0.5059083327651024, "loss": 2.7712, "loss_aux_layer_0": 0.0506591796875, "loss_aux_layer_1": 0.136474609375, "loss_aux_layer_10": 0.150146484375, "loss_aux_layer_11": 0.15771484375, "loss_aux_layer_12": 0.169189453125, "loss_aux_layer_13": 0.1787109375, "loss_aux_layer_14": 0.195556640625, "loss_aux_layer_15": 0.208984375, "loss_aux_layer_16": 0.223876953125, "loss_aux_layer_17": 0.227294921875, "loss_aux_layer_18": 0.23681640625, "loss_aux_layer_19": 0.23388671875, "loss_aux_layer_2": 0.161376953125, "loss_aux_layer_20": 0.23486328125, "loss_aux_layer_21": 0.236572265625, "loss_aux_layer_22": 0.2578125, "loss_aux_layer_23": 0.30712890625, "loss_aux_layer_3": 0.1689453125, "loss_aux_layer_4": 0.165283203125, "loss_aux_layer_5": 0.1669921875, "loss_aux_layer_6": 0.164794921875, "loss_aux_layer_7": 0.15234375, "loss_aux_layer_8": 0.15087890625, "loss_aux_layer_9": 0.149169921875, "step": 233, "total_loss": 0.6928002089262009 }, { "epoch": 0.04632745990892893, "grad_norm": 1.3596328496932983, "learning_rate": 5e-05, "llm_loss": 0.6876414865255356, "loss": 3.4819, "loss_aux_layer_0": 0.05426025390625, "loss_aux_layer_1": 0.132080078125, "loss_aux_layer_10": 0.14208984375, "loss_aux_layer_11": 0.150390625, "loss_aux_layer_12": 0.16162109375, "loss_aux_layer_13": 0.17236328125, "loss_aux_layer_14": 0.18994140625, "loss_aux_layer_15": 0.205810546875, "loss_aux_layer_16": 0.221435546875, "loss_aux_layer_17": 0.22607421875, "loss_aux_layer_18": 0.236083984375, "loss_aux_layer_19": 0.23486328125, "loss_aux_layer_2": 0.155517578125, "loss_aux_layer_20": 0.236328125, "loss_aux_layer_21": 0.23828125, "loss_aux_layer_22": 0.26025390625, "loss_aux_layer_23": 0.3095703125, "loss_aux_layer_3": 0.161865234375, "loss_aux_layer_4": 0.15771484375, "loss_aux_layer_5": 0.158935546875, "loss_aux_layer_6": 0.156494140625, "loss_aux_layer_7": 0.143798828125, "loss_aux_layer_8": 0.142333984375, "loss_aux_layer_9": 0.140625, "step": 234, "total_loss": 0.8704837560653687 }, { "epoch": 0.04652544050683033, "grad_norm": 1.5356429815292358, "learning_rate": 5e-05, "llm_loss": 0.7333939522504807, "loss": 3.6503, "loss_aux_layer_0": 0.05169677734375, "loss_aux_layer_1": 0.1309814453125, "loss_aux_layer_10": 0.137939453125, "loss_aux_layer_11": 0.145751953125, "loss_aux_layer_12": 0.1572265625, "loss_aux_layer_13": 0.169189453125, "loss_aux_layer_14": 0.18701171875, "loss_aux_layer_15": 0.2021484375, "loss_aux_layer_16": 0.21728515625, "loss_aux_layer_17": 0.22216796875, "loss_aux_layer_18": 0.232666015625, "loss_aux_layer_19": 0.230224609375, "loss_aux_layer_2": 0.151611328125, "loss_aux_layer_20": 0.232177734375, "loss_aux_layer_21": 0.23583984375, "loss_aux_layer_22": 0.2578125, "loss_aux_layer_23": 0.3056640625, "loss_aux_layer_3": 0.157958984375, "loss_aux_layer_4": 0.154052734375, "loss_aux_layer_5": 0.1552734375, "loss_aux_layer_6": 0.15234375, "loss_aux_layer_7": 0.14013671875, "loss_aux_layer_8": 0.138671875, "loss_aux_layer_9": 0.136474609375, "step": 235, "total_loss": 0.9125836491584778 }, { "epoch": 0.04672342110473174, "grad_norm": 1.5876836776733398, "learning_rate": 5e-05, "llm_loss": 0.6588570475578308, "loss": 3.3728, "loss_aux_layer_0": 0.05438232421875, "loss_aux_layer_1": 0.135498046875, "loss_aux_layer_10": 0.145263671875, "loss_aux_layer_11": 0.1533203125, "loss_aux_layer_12": 0.164794921875, "loss_aux_layer_13": 0.17578125, "loss_aux_layer_14": 0.193359375, "loss_aux_layer_15": 0.207275390625, "loss_aux_layer_16": 0.22216796875, "loss_aux_layer_17": 0.22705078125, "loss_aux_layer_18": 0.237060546875, "loss_aux_layer_19": 0.234130859375, "loss_aux_layer_2": 0.158203125, "loss_aux_layer_20": 0.2353515625, "loss_aux_layer_21": 0.236083984375, "loss_aux_layer_22": 0.25537109375, "loss_aux_layer_23": 0.30224609375, "loss_aux_layer_3": 0.16552734375, "loss_aux_layer_4": 0.16162109375, "loss_aux_layer_5": 0.16259765625, "loss_aux_layer_6": 0.160400390625, "loss_aux_layer_7": 0.147216796875, "loss_aux_layer_8": 0.145263671875, "loss_aux_layer_9": 0.143798828125, "step": 236, "total_loss": 0.8431883007287979 }, { "epoch": 0.04692140170263314, "grad_norm": 0.9836574792861938, "learning_rate": 5e-05, "llm_loss": 0.6289204061031342, "loss": 3.2093, "loss_aux_layer_0": 0.04815673828125, "loss_aux_layer_1": 0.1241455078125, "loss_aux_layer_10": 0.13232421875, "loss_aux_layer_11": 0.1396484375, "loss_aux_layer_12": 0.151123046875, "loss_aux_layer_13": 0.162109375, "loss_aux_layer_14": 0.1796875, "loss_aux_layer_15": 0.195556640625, "loss_aux_layer_16": 0.21142578125, "loss_aux_layer_17": 0.217529296875, "loss_aux_layer_18": 0.2275390625, "loss_aux_layer_19": 0.22705078125, "loss_aux_layer_2": 0.14453125, "loss_aux_layer_20": 0.22998046875, "loss_aux_layer_21": 0.231689453125, "loss_aux_layer_22": 0.250732421875, "loss_aux_layer_23": 0.29736328125, "loss_aux_layer_3": 0.15087890625, "loss_aux_layer_4": 0.147216796875, "loss_aux_layer_5": 0.148681640625, "loss_aux_layer_6": 0.146240234375, "loss_aux_layer_7": 0.134033203125, "loss_aux_layer_8": 0.1328125, "loss_aux_layer_9": 0.130859375, "step": 237, "total_loss": 0.8023286163806915 }, { "epoch": 0.04711938230053455, "grad_norm": 1.2545729875564575, "learning_rate": 5e-05, "llm_loss": 0.70827417075634, "loss": 3.5499, "loss_aux_layer_0": 0.0511474609375, "loss_aux_layer_1": 0.1285400390625, "loss_aux_layer_10": 0.13720703125, "loss_aux_layer_11": 0.145263671875, "loss_aux_layer_12": 0.156494140625, "loss_aux_layer_13": 0.16796875, "loss_aux_layer_14": 0.186279296875, "loss_aux_layer_15": 0.202880859375, "loss_aux_layer_16": 0.2177734375, "loss_aux_layer_17": 0.22412109375, "loss_aux_layer_18": 0.234619140625, "loss_aux_layer_19": 0.2333984375, "loss_aux_layer_2": 0.150634765625, "loss_aux_layer_20": 0.235107421875, "loss_aux_layer_21": 0.236572265625, "loss_aux_layer_22": 0.25830078125, "loss_aux_layer_23": 0.30615234375, "loss_aux_layer_3": 0.1572265625, "loss_aux_layer_4": 0.153076171875, "loss_aux_layer_5": 0.154052734375, "loss_aux_layer_6": 0.151123046875, "loss_aux_layer_7": 0.138916015625, "loss_aux_layer_8": 0.13720703125, "loss_aux_layer_9": 0.1358642578125, "step": 238, "total_loss": 0.8874737024307251 }, { "epoch": 0.04731736289843595, "grad_norm": 1.1251708269119263, "learning_rate": 5e-05, "llm_loss": 0.6753120124340057, "loss": 3.4249, "loss_aux_layer_0": 0.0504150390625, "loss_aux_layer_1": 0.1317138671875, "loss_aux_layer_10": 0.140625, "loss_aux_layer_11": 0.1484375, "loss_aux_layer_12": 0.1591796875, "loss_aux_layer_13": 0.169677734375, "loss_aux_layer_14": 0.18701171875, "loss_aux_layer_15": 0.2021484375, "loss_aux_layer_16": 0.217529296875, "loss_aux_layer_17": 0.223388671875, "loss_aux_layer_18": 0.234130859375, "loss_aux_layer_19": 0.231689453125, "loss_aux_layer_2": 0.155517578125, "loss_aux_layer_20": 0.234619140625, "loss_aux_layer_21": 0.235595703125, "loss_aux_layer_22": 0.25634765625, "loss_aux_layer_23": 0.302734375, "loss_aux_layer_3": 0.162109375, "loss_aux_layer_4": 0.157958984375, "loss_aux_layer_5": 0.159912109375, "loss_aux_layer_6": 0.15673828125, "loss_aux_layer_7": 0.1435546875, "loss_aux_layer_8": 0.141357421875, "loss_aux_layer_9": 0.138916015625, "step": 239, "total_loss": 0.8562154471874237 }, { "epoch": 0.04751534349633736, "grad_norm": 1.544764518737793, "learning_rate": 5e-05, "llm_loss": 0.73508220911026, "loss": 3.6716, "loss_aux_layer_0": 0.05401611328125, "loss_aux_layer_1": 0.135009765625, "loss_aux_layer_10": 0.1416015625, "loss_aux_layer_11": 0.149169921875, "loss_aux_layer_12": 0.16015625, "loss_aux_layer_13": 0.17138671875, "loss_aux_layer_14": 0.18896484375, "loss_aux_layer_15": 0.205078125, "loss_aux_layer_16": 0.220703125, "loss_aux_layer_17": 0.2255859375, "loss_aux_layer_18": 0.236083984375, "loss_aux_layer_19": 0.234130859375, "loss_aux_layer_2": 0.157470703125, "loss_aux_layer_20": 0.23583984375, "loss_aux_layer_21": 0.23779296875, "loss_aux_layer_22": 0.2587890625, "loss_aux_layer_23": 0.3056640625, "loss_aux_layer_3": 0.163818359375, "loss_aux_layer_4": 0.16015625, "loss_aux_layer_5": 0.161376953125, "loss_aux_layer_6": 0.158447265625, "loss_aux_layer_7": 0.14453125, "loss_aux_layer_8": 0.142333984375, "loss_aux_layer_9": 0.14013671875, "step": 240, "total_loss": 0.9178987145423889 }, { "epoch": 0.047713324094238764, "grad_norm": 1.3120898008346558, "learning_rate": 5e-05, "llm_loss": 0.792045384645462, "loss": 3.8649, "loss_aux_layer_0": 0.05279541015625, "loss_aux_layer_1": 0.1246337890625, "loss_aux_layer_10": 0.1318359375, "loss_aux_layer_11": 0.139404296875, "loss_aux_layer_12": 0.1513671875, "loss_aux_layer_13": 0.1630859375, "loss_aux_layer_14": 0.181396484375, "loss_aux_layer_15": 0.197265625, "loss_aux_layer_16": 0.2138671875, "loss_aux_layer_17": 0.218994140625, "loss_aux_layer_18": 0.22998046875, "loss_aux_layer_19": 0.22802734375, "loss_aux_layer_2": 0.145751953125, "loss_aux_layer_20": 0.22998046875, "loss_aux_layer_21": 0.2314453125, "loss_aux_layer_22": 0.25146484375, "loss_aux_layer_23": 0.298828125, "loss_aux_layer_3": 0.15234375, "loss_aux_layer_4": 0.147216796875, "loss_aux_layer_5": 0.148681640625, "loss_aux_layer_6": 0.14599609375, "loss_aux_layer_7": 0.13330078125, "loss_aux_layer_8": 0.132080078125, "loss_aux_layer_9": 0.13037109375, "step": 241, "total_loss": 0.9662222266197205 }, { "epoch": 0.04791130469214017, "grad_norm": 1.083624005317688, "learning_rate": 5e-05, "llm_loss": 0.640606477856636, "loss": 3.2929, "loss_aux_layer_0": 0.05157470703125, "loss_aux_layer_1": 0.1357421875, "loss_aux_layer_10": 0.143798828125, "loss_aux_layer_11": 0.151611328125, "loss_aux_layer_12": 0.161865234375, "loss_aux_layer_13": 0.172119140625, "loss_aux_layer_14": 0.1884765625, "loss_aux_layer_15": 0.20361328125, "loss_aux_layer_16": 0.218017578125, "loss_aux_layer_17": 0.2216796875, "loss_aux_layer_18": 0.2314453125, "loss_aux_layer_19": 0.230224609375, "loss_aux_layer_2": 0.158935546875, "loss_aux_layer_20": 0.23193359375, "loss_aux_layer_21": 0.23388671875, "loss_aux_layer_22": 0.255859375, "loss_aux_layer_23": 0.30322265625, "loss_aux_layer_3": 0.166748046875, "loss_aux_layer_4": 0.16259765625, "loss_aux_layer_5": 0.164306640625, "loss_aux_layer_6": 0.160400390625, "loss_aux_layer_7": 0.14697265625, "loss_aux_layer_8": 0.14501953125, "loss_aux_layer_9": 0.142578125, "step": 242, "total_loss": 0.823212519288063 }, { "epoch": 0.048109285290041576, "grad_norm": 1.1369752883911133, "learning_rate": 5e-05, "llm_loss": 0.6852954030036926, "loss": 3.4659, "loss_aux_layer_0": 0.05242919921875, "loss_aux_layer_1": 0.13330078125, "loss_aux_layer_10": 0.140380859375, "loss_aux_layer_11": 0.1474609375, "loss_aux_layer_12": 0.15869140625, "loss_aux_layer_13": 0.169189453125, "loss_aux_layer_14": 0.186279296875, "loss_aux_layer_15": 0.20166015625, "loss_aux_layer_16": 0.217041015625, "loss_aux_layer_17": 0.2236328125, "loss_aux_layer_18": 0.23388671875, "loss_aux_layer_19": 0.23193359375, "loss_aux_layer_2": 0.155029296875, "loss_aux_layer_20": 0.234375, "loss_aux_layer_21": 0.236328125, "loss_aux_layer_22": 0.2578125, "loss_aux_layer_23": 0.3056640625, "loss_aux_layer_3": 0.162353515625, "loss_aux_layer_4": 0.1591796875, "loss_aux_layer_5": 0.16015625, "loss_aux_layer_6": 0.1572265625, "loss_aux_layer_7": 0.14306640625, "loss_aux_layer_8": 0.141357421875, "loss_aux_layer_9": 0.139404296875, "step": 243, "total_loss": 0.8664833605289459 }, { "epoch": 0.048307265887942985, "grad_norm": 1.0597831010818481, "learning_rate": 5e-05, "llm_loss": 0.7152489125728607, "loss": 3.5859, "loss_aux_layer_0": 0.05120849609375, "loss_aux_layer_1": 0.128662109375, "loss_aux_layer_10": 0.14111328125, "loss_aux_layer_11": 0.148681640625, "loss_aux_layer_12": 0.15966796875, "loss_aux_layer_13": 0.17041015625, "loss_aux_layer_14": 0.188720703125, "loss_aux_layer_15": 0.20458984375, "loss_aux_layer_16": 0.2197265625, "loss_aux_layer_17": 0.22607421875, "loss_aux_layer_18": 0.23583984375, "loss_aux_layer_19": 0.234619140625, "loss_aux_layer_2": 0.150390625, "loss_aux_layer_20": 0.23681640625, "loss_aux_layer_21": 0.238037109375, "loss_aux_layer_22": 0.25830078125, "loss_aux_layer_23": 0.306640625, "loss_aux_layer_3": 0.158935546875, "loss_aux_layer_4": 0.15576171875, "loss_aux_layer_5": 0.157958984375, "loss_aux_layer_6": 0.155029296875, "loss_aux_layer_7": 0.142333984375, "loss_aux_layer_8": 0.140869140625, "loss_aux_layer_9": 0.139892578125, "step": 244, "total_loss": 0.8964861482381821 }, { "epoch": 0.04850524648584439, "grad_norm": 1.1821101903915405, "learning_rate": 5e-05, "llm_loss": 0.6459055691957474, "loss": 3.3038, "loss_aux_layer_0": 0.051513671875, "loss_aux_layer_1": 0.13232421875, "loss_aux_layer_10": 0.14013671875, "loss_aux_layer_11": 0.14794921875, "loss_aux_layer_12": 0.158935546875, "loss_aux_layer_13": 0.170166015625, "loss_aux_layer_14": 0.187744140625, "loss_aux_layer_15": 0.202392578125, "loss_aux_layer_16": 0.21728515625, "loss_aux_layer_17": 0.2236328125, "loss_aux_layer_18": 0.233154296875, "loss_aux_layer_19": 0.23046875, "loss_aux_layer_2": 0.154052734375, "loss_aux_layer_20": 0.232177734375, "loss_aux_layer_21": 0.23291015625, "loss_aux_layer_22": 0.25244140625, "loss_aux_layer_23": 0.29931640625, "loss_aux_layer_3": 0.161376953125, "loss_aux_layer_4": 0.15771484375, "loss_aux_layer_5": 0.1591796875, "loss_aux_layer_6": 0.156005859375, "loss_aux_layer_7": 0.142578125, "loss_aux_layer_8": 0.140625, "loss_aux_layer_9": 0.138427734375, "step": 245, "total_loss": 0.8259526193141937 }, { "epoch": 0.04870322708374579, "grad_norm": 0.9039604067802429, "learning_rate": 5e-05, "llm_loss": 0.6932439059019089, "loss": 3.4764, "loss_aux_layer_0": 0.05242919921875, "loss_aux_layer_1": 0.1273193359375, "loss_aux_layer_10": 0.133544921875, "loss_aux_layer_11": 0.140869140625, "loss_aux_layer_12": 0.151611328125, "loss_aux_layer_13": 0.163330078125, "loss_aux_layer_14": 0.18115234375, "loss_aux_layer_15": 0.197998046875, "loss_aux_layer_16": 0.21435546875, "loss_aux_layer_17": 0.2197265625, "loss_aux_layer_18": 0.230224609375, "loss_aux_layer_19": 0.229248046875, "loss_aux_layer_2": 0.14794921875, "loss_aux_layer_20": 0.231201171875, "loss_aux_layer_21": 0.233642578125, "loss_aux_layer_22": 0.25537109375, "loss_aux_layer_23": 0.302734375, "loss_aux_layer_3": 0.155029296875, "loss_aux_layer_4": 0.150634765625, "loss_aux_layer_5": 0.15185546875, "loss_aux_layer_6": 0.149169921875, "loss_aux_layer_7": 0.1357421875, "loss_aux_layer_8": 0.1336669921875, "loss_aux_layer_9": 0.1319580078125, "step": 246, "total_loss": 0.869102731347084 }, { "epoch": 0.0489012076816472, "grad_norm": 1.0872647762298584, "learning_rate": 5e-05, "llm_loss": 0.6397125571966171, "loss": 3.2693, "loss_aux_layer_0": 0.0511474609375, "loss_aux_layer_1": 0.1290283203125, "loss_aux_layer_10": 0.135986328125, "loss_aux_layer_11": 0.143310546875, "loss_aux_layer_12": 0.154541015625, "loss_aux_layer_13": 0.16552734375, "loss_aux_layer_14": 0.183349609375, "loss_aux_layer_15": 0.199462890625, "loss_aux_layer_16": 0.215087890625, "loss_aux_layer_17": 0.2216796875, "loss_aux_layer_18": 0.231201171875, "loss_aux_layer_19": 0.22998046875, "loss_aux_layer_2": 0.1494140625, "loss_aux_layer_20": 0.232177734375, "loss_aux_layer_21": 0.23486328125, "loss_aux_layer_22": 0.257568359375, "loss_aux_layer_23": 0.30615234375, "loss_aux_layer_3": 0.15673828125, "loss_aux_layer_4": 0.152587890625, "loss_aux_layer_5": 0.154296875, "loss_aux_layer_6": 0.1513671875, "loss_aux_layer_7": 0.13818359375, "loss_aux_layer_8": 0.13623046875, "loss_aux_layer_9": 0.13427734375, "step": 247, "total_loss": 0.817323625087738 }, { "epoch": 0.0490991882795486, "grad_norm": 0.6908246874809265, "learning_rate": 5e-05, "llm_loss": 0.5984766855835915, "loss": 3.1069, "loss_aux_layer_0": 0.0460205078125, "loss_aux_layer_1": 0.128173828125, "loss_aux_layer_10": 0.137451171875, "loss_aux_layer_11": 0.1455078125, "loss_aux_layer_12": 0.156982421875, "loss_aux_layer_13": 0.16748046875, "loss_aux_layer_14": 0.185302734375, "loss_aux_layer_15": 0.20068359375, "loss_aux_layer_16": 0.216064453125, "loss_aux_layer_17": 0.22021484375, "loss_aux_layer_18": 0.23095703125, "loss_aux_layer_19": 0.230712890625, "loss_aux_layer_2": 0.151611328125, "loss_aux_layer_20": 0.233154296875, "loss_aux_layer_21": 0.235107421875, "loss_aux_layer_22": 0.255859375, "loss_aux_layer_23": 0.3037109375, "loss_aux_layer_3": 0.15869140625, "loss_aux_layer_4": 0.154052734375, "loss_aux_layer_5": 0.1552734375, "loss_aux_layer_6": 0.152099609375, "loss_aux_layer_7": 0.138671875, "loss_aux_layer_8": 0.13720703125, "loss_aux_layer_9": 0.135986328125, "step": 248, "total_loss": 0.7767320722341537 }, { "epoch": 0.04929716887745001, "grad_norm": 1.6513618230819702, "learning_rate": 5e-05, "llm_loss": 0.6478532999753952, "loss": 3.317, "loss_aux_layer_0": 0.05181884765625, "loss_aux_layer_1": 0.13232421875, "loss_aux_layer_10": 0.1396484375, "loss_aux_layer_11": 0.147705078125, "loss_aux_layer_12": 0.15869140625, "loss_aux_layer_13": 0.17041015625, "loss_aux_layer_14": 0.1884765625, "loss_aux_layer_15": 0.20361328125, "loss_aux_layer_16": 0.21826171875, "loss_aux_layer_17": 0.224365234375, "loss_aux_layer_18": 0.234130859375, "loss_aux_layer_19": 0.23193359375, "loss_aux_layer_2": 0.154052734375, "loss_aux_layer_20": 0.234130859375, "loss_aux_layer_21": 0.237060546875, "loss_aux_layer_22": 0.2607421875, "loss_aux_layer_23": 0.30810546875, "loss_aux_layer_3": 0.162109375, "loss_aux_layer_4": 0.158447265625, "loss_aux_layer_5": 0.15966796875, "loss_aux_layer_6": 0.15673828125, "loss_aux_layer_7": 0.142822265625, "loss_aux_layer_8": 0.140380859375, "loss_aux_layer_9": 0.13818359375, "step": 249, "total_loss": 0.8292561322450638 }, { "epoch": 0.04949514947535141, "grad_norm": 7.434368133544922, "learning_rate": 5e-05, "llm_loss": 0.7603515237569809, "loss": 3.751, "loss_aux_layer_0": 0.0469970703125, "loss_aux_layer_1": 0.125244140625, "loss_aux_layer_10": 0.140380859375, "loss_aux_layer_11": 0.148193359375, "loss_aux_layer_12": 0.159423828125, "loss_aux_layer_13": 0.169921875, "loss_aux_layer_14": 0.18603515625, "loss_aux_layer_15": 0.2001953125, "loss_aux_layer_16": 0.21484375, "loss_aux_layer_17": 0.219970703125, "loss_aux_layer_18": 0.22900390625, "loss_aux_layer_19": 0.227783203125, "loss_aux_layer_2": 0.14794921875, "loss_aux_layer_20": 0.230224609375, "loss_aux_layer_21": 0.2314453125, "loss_aux_layer_22": 0.25244140625, "loss_aux_layer_23": 0.30029296875, "loss_aux_layer_3": 0.15478515625, "loss_aux_layer_4": 0.152099609375, "loss_aux_layer_5": 0.15380859375, "loss_aux_layer_6": 0.151123046875, "loss_aux_layer_7": 0.138916015625, "loss_aux_layer_8": 0.138671875, "loss_aux_layer_9": 0.137451171875, "step": 250, "total_loss": 0.9377562254667282 }, { "epoch": 0.04969313007325282, "grad_norm": 6.821317672729492, "learning_rate": 5e-05, "llm_loss": 0.7085544764995575, "loss": 3.5408, "loss_aux_layer_0": 0.0469970703125, "loss_aux_layer_1": 0.1253662109375, "loss_aux_layer_10": 0.1368408203125, "loss_aux_layer_11": 0.144287109375, "loss_aux_layer_12": 0.156005859375, "loss_aux_layer_13": 0.167236328125, "loss_aux_layer_14": 0.184814453125, "loss_aux_layer_15": 0.2001953125, "loss_aux_layer_16": 0.21533203125, "loss_aux_layer_17": 0.221435546875, "loss_aux_layer_18": 0.232421875, "loss_aux_layer_19": 0.22998046875, "loss_aux_layer_2": 0.146728515625, "loss_aux_layer_20": 0.23193359375, "loss_aux_layer_21": 0.233154296875, "loss_aux_layer_22": 0.251953125, "loss_aux_layer_23": 0.29931640625, "loss_aux_layer_3": 0.153564453125, "loss_aux_layer_4": 0.14990234375, "loss_aux_layer_5": 0.151611328125, "loss_aux_layer_6": 0.149658203125, "loss_aux_layer_7": 0.1380615234375, "loss_aux_layer_8": 0.1368408203125, "loss_aux_layer_9": 0.1356201171875, "step": 251, "total_loss": 0.8852067440748215 }, { "epoch": 0.049891110671154225, "grad_norm": 3.6372973918914795, "learning_rate": 5e-05, "llm_loss": 0.6950059831142426, "loss": 3.5148, "loss_aux_layer_0": 0.0491943359375, "loss_aux_layer_1": 0.1287841796875, "loss_aux_layer_10": 0.1474609375, "loss_aux_layer_11": 0.156494140625, "loss_aux_layer_12": 0.167236328125, "loss_aux_layer_13": 0.178955078125, "loss_aux_layer_14": 0.196044921875, "loss_aux_layer_15": 0.210205078125, "loss_aux_layer_16": 0.223876953125, "loss_aux_layer_17": 0.227783203125, "loss_aux_layer_18": 0.23681640625, "loss_aux_layer_19": 0.2333984375, "loss_aux_layer_2": 0.151611328125, "loss_aux_layer_20": 0.235107421875, "loss_aux_layer_21": 0.2373046875, "loss_aux_layer_22": 0.2578125, "loss_aux_layer_23": 0.3056640625, "loss_aux_layer_3": 0.158935546875, "loss_aux_layer_4": 0.155517578125, "loss_aux_layer_5": 0.1572265625, "loss_aux_layer_6": 0.15478515625, "loss_aux_layer_7": 0.145751953125, "loss_aux_layer_8": 0.146728515625, "loss_aux_layer_9": 0.145751953125, "step": 252, "total_loss": 0.8786951303482056 }, { "epoch": 0.050089091269055634, "grad_norm": 8.460978507995605, "learning_rate": 5e-05, "llm_loss": 0.6056825369596481, "loss": 3.1531, "loss_aux_layer_0": 0.04766845703125, "loss_aux_layer_1": 0.1259765625, "loss_aux_layer_10": 0.1416015625, "loss_aux_layer_11": 0.15087890625, "loss_aux_layer_12": 0.162109375, "loss_aux_layer_13": 0.17431640625, "loss_aux_layer_14": 0.193359375, "loss_aux_layer_15": 0.208984375, "loss_aux_layer_16": 0.224365234375, "loss_aux_layer_17": 0.23095703125, "loss_aux_layer_18": 0.241455078125, "loss_aux_layer_19": 0.23828125, "loss_aux_layer_2": 0.14794921875, "loss_aux_layer_20": 0.23974609375, "loss_aux_layer_21": 0.242919921875, "loss_aux_layer_22": 0.263671875, "loss_aux_layer_23": 0.31201171875, "loss_aux_layer_3": 0.1552734375, "loss_aux_layer_4": 0.151611328125, "loss_aux_layer_5": 0.154052734375, "loss_aux_layer_6": 0.152099609375, "loss_aux_layer_7": 0.1416015625, "loss_aux_layer_8": 0.140625, "loss_aux_layer_9": 0.139892578125, "step": 253, "total_loss": 0.7882857471704483 }, { "epoch": 0.05028707186695704, "grad_norm": 1.8669391870498657, "learning_rate": 5e-05, "llm_loss": 0.7247804403305054, "loss": 3.6235, "loss_aux_layer_0": 0.04815673828125, "loss_aux_layer_1": 0.131591796875, "loss_aux_layer_10": 0.14208984375, "loss_aux_layer_11": 0.149169921875, "loss_aux_layer_12": 0.16064453125, "loss_aux_layer_13": 0.171142578125, "loss_aux_layer_14": 0.188720703125, "loss_aux_layer_15": 0.2041015625, "loss_aux_layer_16": 0.218505859375, "loss_aux_layer_17": 0.223876953125, "loss_aux_layer_18": 0.234619140625, "loss_aux_layer_19": 0.232421875, "loss_aux_layer_2": 0.154541015625, "loss_aux_layer_20": 0.23388671875, "loss_aux_layer_21": 0.23486328125, "loss_aux_layer_22": 0.25537109375, "loss_aux_layer_23": 0.3017578125, "loss_aux_layer_3": 0.161865234375, "loss_aux_layer_4": 0.157958984375, "loss_aux_layer_5": 0.158935546875, "loss_aux_layer_6": 0.155517578125, "loss_aux_layer_7": 0.14306640625, "loss_aux_layer_8": 0.142822265625, "loss_aux_layer_9": 0.140625, "step": 254, "total_loss": 0.9058819264173508 }, { "epoch": 0.050485052464858446, "grad_norm": 22.597795486450195, "learning_rate": 5e-05, "llm_loss": 0.6983882635831833, "loss": 3.559, "loss_aux_layer_0": 0.0462646484375, "loss_aux_layer_1": 0.1263427734375, "loss_aux_layer_10": 0.163330078125, "loss_aux_layer_11": 0.172607421875, "loss_aux_layer_12": 0.18310546875, "loss_aux_layer_13": 0.18994140625, "loss_aux_layer_14": 0.206298828125, "loss_aux_layer_15": 0.2197265625, "loss_aux_layer_16": 0.232421875, "loss_aux_layer_17": 0.234619140625, "loss_aux_layer_18": 0.2421875, "loss_aux_layer_19": 0.23828125, "loss_aux_layer_2": 0.15283203125, "loss_aux_layer_20": 0.239013671875, "loss_aux_layer_21": 0.24072265625, "loss_aux_layer_22": 0.26220703125, "loss_aux_layer_23": 0.31103515625, "loss_aux_layer_3": 0.16162109375, "loss_aux_layer_4": 0.158935546875, "loss_aux_layer_5": 0.161376953125, "loss_aux_layer_6": 0.158935546875, "loss_aux_layer_7": 0.1650390625, "loss_aux_layer_8": 0.16259765625, "loss_aux_layer_9": 0.16259765625, "step": 255, "total_loss": 0.8897401541471481 }, { "epoch": 0.05068303306275985, "grad_norm": 4.720426082611084, "learning_rate": 5e-05, "llm_loss": 0.6730273813009262, "loss": 3.4236, "loss_aux_layer_0": 0.0499267578125, "loss_aux_layer_1": 0.1265869140625, "loss_aux_layer_10": 0.140869140625, "loss_aux_layer_11": 0.14892578125, "loss_aux_layer_12": 0.161865234375, "loss_aux_layer_13": 0.174560546875, "loss_aux_layer_14": 0.193603515625, "loss_aux_layer_15": 0.209228515625, "loss_aux_layer_16": 0.225341796875, "loss_aux_layer_17": 0.232177734375, "loss_aux_layer_18": 0.2431640625, "loss_aux_layer_19": 0.240478515625, "loss_aux_layer_2": 0.148681640625, "loss_aux_layer_20": 0.242431640625, "loss_aux_layer_21": 0.24462890625, "loss_aux_layer_22": 0.26611328125, "loss_aux_layer_23": 0.3134765625, "loss_aux_layer_3": 0.154541015625, "loss_aux_layer_4": 0.151123046875, "loss_aux_layer_5": 0.15234375, "loss_aux_layer_6": 0.150634765625, "loss_aux_layer_7": 0.140380859375, "loss_aux_layer_8": 0.139892578125, "loss_aux_layer_9": 0.13916015625, "step": 256, "total_loss": 0.8559017479419708 }, { "epoch": 0.05088101366066126, "grad_norm": 8.309176445007324, "learning_rate": 5e-05, "llm_loss": 0.6756604760885239, "loss": 3.4629, "loss_aux_layer_0": 0.048583984375, "loss_aux_layer_1": 0.1307373046875, "loss_aux_layer_10": 0.15380859375, "loss_aux_layer_11": 0.161865234375, "loss_aux_layer_12": 0.173828125, "loss_aux_layer_13": 0.184326171875, "loss_aux_layer_14": 0.201416015625, "loss_aux_layer_15": 0.216064453125, "loss_aux_layer_16": 0.22998046875, "loss_aux_layer_17": 0.234130859375, "loss_aux_layer_18": 0.244140625, "loss_aux_layer_19": 0.239990234375, "loss_aux_layer_2": 0.16064453125, "loss_aux_layer_20": 0.240234375, "loss_aux_layer_21": 0.2421875, "loss_aux_layer_22": 0.263671875, "loss_aux_layer_23": 0.310546875, "loss_aux_layer_3": 0.166259765625, "loss_aux_layer_4": 0.162109375, "loss_aux_layer_5": 0.164306640625, "loss_aux_layer_6": 0.164306640625, "loss_aux_layer_7": 0.159423828125, "loss_aux_layer_8": 0.156005859375, "loss_aux_layer_9": 0.153564453125, "step": 257, "total_loss": 0.8657353818416595 }, { "epoch": 0.05107899425856266, "grad_norm": 3.006206512451172, "learning_rate": 5e-05, "llm_loss": 0.6578124910593033, "loss": 3.3705, "loss_aux_layer_0": 0.04937744140625, "loss_aux_layer_1": 0.12548828125, "loss_aux_layer_10": 0.14697265625, "loss_aux_layer_11": 0.154296875, "loss_aux_layer_12": 0.166259765625, "loss_aux_layer_13": 0.17626953125, "loss_aux_layer_14": 0.195068359375, "loss_aux_layer_15": 0.210205078125, "loss_aux_layer_16": 0.225830078125, "loss_aux_layer_17": 0.231689453125, "loss_aux_layer_18": 0.242431640625, "loss_aux_layer_19": 0.24072265625, "loss_aux_layer_2": 0.152099609375, "loss_aux_layer_20": 0.241455078125, "loss_aux_layer_21": 0.240966796875, "loss_aux_layer_22": 0.2607421875, "loss_aux_layer_23": 0.30810546875, "loss_aux_layer_3": 0.15771484375, "loss_aux_layer_4": 0.154296875, "loss_aux_layer_5": 0.15673828125, "loss_aux_layer_6": 0.15673828125, "loss_aux_layer_7": 0.149169921875, "loss_aux_layer_8": 0.146728515625, "loss_aux_layer_9": 0.145751953125, "step": 258, "total_loss": 0.8426272571086884 }, { "epoch": 0.05127697485646407, "grad_norm": 1.5673385858535767, "learning_rate": 5e-05, "llm_loss": 0.6557074338197708, "loss": 3.3875, "loss_aux_layer_0": 0.0487060546875, "loss_aux_layer_1": 0.134033203125, "loss_aux_layer_10": 0.155029296875, "loss_aux_layer_11": 0.162841796875, "loss_aux_layer_12": 0.174560546875, "loss_aux_layer_13": 0.18505859375, "loss_aux_layer_14": 0.203125, "loss_aux_layer_15": 0.21728515625, "loss_aux_layer_16": 0.23095703125, "loss_aux_layer_17": 0.236083984375, "loss_aux_layer_18": 0.24560546875, "loss_aux_layer_19": 0.2412109375, "loss_aux_layer_2": 0.163330078125, "loss_aux_layer_20": 0.242431640625, "loss_aux_layer_21": 0.244140625, "loss_aux_layer_22": 0.26416015625, "loss_aux_layer_23": 0.3125, "loss_aux_layer_3": 0.168212890625, "loss_aux_layer_4": 0.163818359375, "loss_aux_layer_5": 0.16650390625, "loss_aux_layer_6": 0.164794921875, "loss_aux_layer_7": 0.155517578125, "loss_aux_layer_8": 0.15478515625, "loss_aux_layer_9": 0.15380859375, "step": 259, "total_loss": 0.8468654155731201 }, { "epoch": 0.05147495545436547, "grad_norm": 2.085340976715088, "learning_rate": 5e-05, "llm_loss": 0.6693829894065857, "loss": 3.4162, "loss_aux_layer_0": 0.04766845703125, "loss_aux_layer_1": 0.12451171875, "loss_aux_layer_10": 0.146728515625, "loss_aux_layer_11": 0.1552734375, "loss_aux_layer_12": 0.167236328125, "loss_aux_layer_13": 0.17919921875, "loss_aux_layer_14": 0.197509765625, "loss_aux_layer_15": 0.212890625, "loss_aux_layer_16": 0.22802734375, "loss_aux_layer_17": 0.232177734375, "loss_aux_layer_18": 0.24267578125, "loss_aux_layer_19": 0.2392578125, "loss_aux_layer_2": 0.150390625, "loss_aux_layer_20": 0.23974609375, "loss_aux_layer_21": 0.2412109375, "loss_aux_layer_22": 0.26416015625, "loss_aux_layer_23": 0.3134765625, "loss_aux_layer_3": 0.155517578125, "loss_aux_layer_4": 0.15185546875, "loss_aux_layer_5": 0.15380859375, "loss_aux_layer_6": 0.152587890625, "loss_aux_layer_7": 0.1455078125, "loss_aux_layer_8": 0.14599609375, "loss_aux_layer_9": 0.14501953125, "step": 260, "total_loss": 0.8540575802326202 }, { "epoch": 0.05167293605226688, "grad_norm": 1.3235752582550049, "learning_rate": 5e-05, "llm_loss": 0.6207326948642731, "loss": 3.2266, "loss_aux_layer_0": 0.0491943359375, "loss_aux_layer_1": 0.1297607421875, "loss_aux_layer_10": 0.14794921875, "loss_aux_layer_11": 0.156982421875, "loss_aux_layer_12": 0.168701171875, "loss_aux_layer_13": 0.18115234375, "loss_aux_layer_14": 0.19921875, "loss_aux_layer_15": 0.213134765625, "loss_aux_layer_16": 0.22705078125, "loss_aux_layer_17": 0.231201171875, "loss_aux_layer_18": 0.24072265625, "loss_aux_layer_19": 0.23779296875, "loss_aux_layer_2": 0.154052734375, "loss_aux_layer_20": 0.23876953125, "loss_aux_layer_21": 0.240234375, "loss_aux_layer_22": 0.26123046875, "loss_aux_layer_23": 0.3076171875, "loss_aux_layer_3": 0.160400390625, "loss_aux_layer_4": 0.1572265625, "loss_aux_layer_5": 0.1591796875, "loss_aux_layer_6": 0.1572265625, "loss_aux_layer_7": 0.149169921875, "loss_aux_layer_8": 0.147705078125, "loss_aux_layer_9": 0.146728515625, "step": 261, "total_loss": 0.8066410422325134 }, { "epoch": 0.05187091665016828, "grad_norm": 1.7408952713012695, "learning_rate": 5e-05, "llm_loss": 0.7186462134122849, "loss": 3.6181, "loss_aux_layer_0": 0.0487060546875, "loss_aux_layer_1": 0.1265869140625, "loss_aux_layer_10": 0.148193359375, "loss_aux_layer_11": 0.1572265625, "loss_aux_layer_12": 0.16845703125, "loss_aux_layer_13": 0.181884765625, "loss_aux_layer_14": 0.20068359375, "loss_aux_layer_15": 0.214599609375, "loss_aux_layer_16": 0.22900390625, "loss_aux_layer_17": 0.2333984375, "loss_aux_layer_18": 0.24267578125, "loss_aux_layer_19": 0.238525390625, "loss_aux_layer_2": 0.151611328125, "loss_aux_layer_20": 0.239501953125, "loss_aux_layer_21": 0.24072265625, "loss_aux_layer_22": 0.25927734375, "loss_aux_layer_23": 0.30615234375, "loss_aux_layer_3": 0.15869140625, "loss_aux_layer_4": 0.15576171875, "loss_aux_layer_5": 0.15869140625, "loss_aux_layer_6": 0.156982421875, "loss_aux_layer_7": 0.148681640625, "loss_aux_layer_8": 0.14794921875, "loss_aux_layer_9": 0.14697265625, "step": 262, "total_loss": 0.9045128971338272 }, { "epoch": 0.052068897248069686, "grad_norm": 1.9242911338806152, "learning_rate": 5e-05, "llm_loss": 0.6827191114425659, "loss": 3.4443, "loss_aux_layer_0": 0.046875, "loss_aux_layer_1": 0.119140625, "loss_aux_layer_10": 0.13818359375, "loss_aux_layer_11": 0.146728515625, "loss_aux_layer_12": 0.158935546875, "loss_aux_layer_13": 0.172119140625, "loss_aux_layer_14": 0.19091796875, "loss_aux_layer_15": 0.20654296875, "loss_aux_layer_16": 0.221923828125, "loss_aux_layer_17": 0.227783203125, "loss_aux_layer_18": 0.238037109375, "loss_aux_layer_19": 0.235595703125, "loss_aux_layer_2": 0.141357421875, "loss_aux_layer_20": 0.23779296875, "loss_aux_layer_21": 0.239013671875, "loss_aux_layer_22": 0.25927734375, "loss_aux_layer_23": 0.306640625, "loss_aux_layer_3": 0.14697265625, "loss_aux_layer_4": 0.143798828125, "loss_aux_layer_5": 0.14599609375, "loss_aux_layer_6": 0.14501953125, "loss_aux_layer_7": 0.1378173828125, "loss_aux_layer_8": 0.1373291015625, "loss_aux_layer_9": 0.136474609375, "step": 263, "total_loss": 0.8610676527023315 }, { "epoch": 0.052266877845971095, "grad_norm": 1.1894279718399048, "learning_rate": 5e-05, "llm_loss": 0.5961412489414215, "loss": 3.0945, "loss_aux_layer_0": 0.0430908203125, "loss_aux_layer_1": 0.119384765625, "loss_aux_layer_10": 0.138671875, "loss_aux_layer_11": 0.14697265625, "loss_aux_layer_12": 0.158935546875, "loss_aux_layer_13": 0.17041015625, "loss_aux_layer_14": 0.188232421875, "loss_aux_layer_15": 0.20361328125, "loss_aux_layer_16": 0.218994140625, "loss_aux_layer_17": 0.2236328125, "loss_aux_layer_18": 0.235107421875, "loss_aux_layer_19": 0.232666015625, "loss_aux_layer_2": 0.144287109375, "loss_aux_layer_20": 0.23388671875, "loss_aux_layer_21": 0.235595703125, "loss_aux_layer_22": 0.254638671875, "loss_aux_layer_23": 0.3017578125, "loss_aux_layer_3": 0.14990234375, "loss_aux_layer_4": 0.14697265625, "loss_aux_layer_5": 0.14892578125, "loss_aux_layer_6": 0.146728515625, "loss_aux_layer_7": 0.13916015625, "loss_aux_layer_8": 0.138671875, "loss_aux_layer_9": 0.137451171875, "step": 264, "total_loss": 0.7736175060272217 }, { "epoch": 0.0524648584438725, "grad_norm": 0.8440191745758057, "learning_rate": 5e-05, "llm_loss": 0.662931278347969, "loss": 3.3923, "loss_aux_layer_0": 0.04986572265625, "loss_aux_layer_1": 0.12744140625, "loss_aux_layer_10": 0.14599609375, "loss_aux_layer_11": 0.155029296875, "loss_aux_layer_12": 0.166259765625, "loss_aux_layer_13": 0.178955078125, "loss_aux_layer_14": 0.197509765625, "loss_aux_layer_15": 0.211669921875, "loss_aux_layer_16": 0.226318359375, "loss_aux_layer_17": 0.230224609375, "loss_aux_layer_18": 0.240234375, "loss_aux_layer_19": 0.23779296875, "loss_aux_layer_2": 0.153564453125, "loss_aux_layer_20": 0.239013671875, "loss_aux_layer_21": 0.2412109375, "loss_aux_layer_22": 0.26318359375, "loss_aux_layer_23": 0.3095703125, "loss_aux_layer_3": 0.159423828125, "loss_aux_layer_4": 0.15625, "loss_aux_layer_5": 0.15869140625, "loss_aux_layer_6": 0.156982421875, "loss_aux_layer_7": 0.148193359375, "loss_aux_layer_8": 0.146240234375, "loss_aux_layer_9": 0.145263671875, "step": 265, "total_loss": 0.8480766415596008 }, { "epoch": 0.05266283904177391, "grad_norm": 1.4174977540969849, "learning_rate": 5e-05, "llm_loss": 0.6144343018531799, "loss": 3.1727, "loss_aux_layer_0": 0.04681396484375, "loss_aux_layer_1": 0.1265869140625, "loss_aux_layer_10": 0.140625, "loss_aux_layer_11": 0.148193359375, "loss_aux_layer_12": 0.159423828125, "loss_aux_layer_13": 0.170654296875, "loss_aux_layer_14": 0.1884765625, "loss_aux_layer_15": 0.2021484375, "loss_aux_layer_16": 0.216552734375, "loss_aux_layer_17": 0.2216796875, "loss_aux_layer_18": 0.231689453125, "loss_aux_layer_19": 0.229248046875, "loss_aux_layer_2": 0.151123046875, "loss_aux_layer_20": 0.231689453125, "loss_aux_layer_21": 0.23291015625, "loss_aux_layer_22": 0.251953125, "loss_aux_layer_23": 0.29931640625, "loss_aux_layer_3": 0.156494140625, "loss_aux_layer_4": 0.153076171875, "loss_aux_layer_5": 0.15478515625, "loss_aux_layer_6": 0.152099609375, "loss_aux_layer_7": 0.143310546875, "loss_aux_layer_8": 0.1412353515625, "loss_aux_layer_9": 0.1396484375, "step": 266, "total_loss": 0.7931655645370483 }, { "epoch": 0.05286081963967531, "grad_norm": 1.1668975353240967, "learning_rate": 5e-05, "llm_loss": 0.7089467346668243, "loss": 3.5758, "loss_aux_layer_0": 0.04901123046875, "loss_aux_layer_1": 0.1324462890625, "loss_aux_layer_10": 0.14697265625, "loss_aux_layer_11": 0.1552734375, "loss_aux_layer_12": 0.166748046875, "loss_aux_layer_13": 0.1767578125, "loss_aux_layer_14": 0.194580078125, "loss_aux_layer_15": 0.209228515625, "loss_aux_layer_16": 0.223388671875, "loss_aux_layer_17": 0.226806640625, "loss_aux_layer_18": 0.23583984375, "loss_aux_layer_19": 0.233154296875, "loss_aux_layer_2": 0.15869140625, "loss_aux_layer_20": 0.235107421875, "loss_aux_layer_21": 0.23681640625, "loss_aux_layer_22": 0.2587890625, "loss_aux_layer_23": 0.3056640625, "loss_aux_layer_3": 0.164794921875, "loss_aux_layer_4": 0.1611328125, "loss_aux_layer_5": 0.16259765625, "loss_aux_layer_6": 0.160400390625, "loss_aux_layer_7": 0.150634765625, "loss_aux_layer_8": 0.1484375, "loss_aux_layer_9": 0.146240234375, "step": 267, "total_loss": 0.893952265381813 }, { "epoch": 0.05305880023757672, "grad_norm": 1.0703717470169067, "learning_rate": 5e-05, "llm_loss": 0.6781246215105057, "loss": 3.4227, "loss_aux_layer_0": 0.046630859375, "loss_aux_layer_1": 0.1241455078125, "loss_aux_layer_10": 0.13818359375, "loss_aux_layer_11": 0.1455078125, "loss_aux_layer_12": 0.15673828125, "loss_aux_layer_13": 0.16748046875, "loss_aux_layer_14": 0.18505859375, "loss_aux_layer_15": 0.2001953125, "loss_aux_layer_16": 0.2158203125, "loss_aux_layer_17": 0.220947265625, "loss_aux_layer_18": 0.2314453125, "loss_aux_layer_19": 0.229736328125, "loss_aux_layer_2": 0.148193359375, "loss_aux_layer_20": 0.232177734375, "loss_aux_layer_21": 0.234130859375, "loss_aux_layer_22": 0.25537109375, "loss_aux_layer_23": 0.3037109375, "loss_aux_layer_3": 0.154296875, "loss_aux_layer_4": 0.151123046875, "loss_aux_layer_5": 0.1533203125, "loss_aux_layer_6": 0.150634765625, "loss_aux_layer_7": 0.140869140625, "loss_aux_layer_8": 0.138427734375, "loss_aux_layer_9": 0.136962890625, "step": 268, "total_loss": 0.8556775003671646 }, { "epoch": 0.05325678083547812, "grad_norm": 1.7417446374893188, "learning_rate": 5e-05, "llm_loss": 0.5786152929067612, "loss": 3.0436, "loss_aux_layer_0": 0.0458984375, "loss_aux_layer_1": 0.126708984375, "loss_aux_layer_10": 0.144775390625, "loss_aux_layer_11": 0.15234375, "loss_aux_layer_12": 0.163330078125, "loss_aux_layer_13": 0.1748046875, "loss_aux_layer_14": 0.192138671875, "loss_aux_layer_15": 0.206298828125, "loss_aux_layer_16": 0.219970703125, "loss_aux_layer_17": 0.224853515625, "loss_aux_layer_18": 0.235107421875, "loss_aux_layer_19": 0.231689453125, "loss_aux_layer_2": 0.15234375, "loss_aux_layer_20": 0.233642578125, "loss_aux_layer_21": 0.2373046875, "loss_aux_layer_22": 0.2578125, "loss_aux_layer_23": 0.306640625, "loss_aux_layer_3": 0.15966796875, "loss_aux_layer_4": 0.1572265625, "loss_aux_layer_5": 0.16015625, "loss_aux_layer_6": 0.1572265625, "loss_aux_layer_7": 0.147216796875, "loss_aux_layer_8": 0.145263671875, "loss_aux_layer_9": 0.143310546875, "step": 269, "total_loss": 0.7608990222215652 }, { "epoch": 0.05345476143337953, "grad_norm": 1.1200922727584839, "learning_rate": 5e-05, "llm_loss": 0.718038022518158, "loss": 3.5998, "loss_aux_layer_0": 0.046142578125, "loss_aux_layer_1": 0.130126953125, "loss_aux_layer_10": 0.1435546875, "loss_aux_layer_11": 0.150634765625, "loss_aux_layer_12": 0.161376953125, "loss_aux_layer_13": 0.173095703125, "loss_aux_layer_14": 0.18994140625, "loss_aux_layer_15": 0.20458984375, "loss_aux_layer_16": 0.218994140625, "loss_aux_layer_17": 0.223388671875, "loss_aux_layer_18": 0.23486328125, "loss_aux_layer_19": 0.231689453125, "loss_aux_layer_2": 0.1552734375, "loss_aux_layer_20": 0.234130859375, "loss_aux_layer_21": 0.235595703125, "loss_aux_layer_22": 0.255615234375, "loss_aux_layer_23": 0.3037109375, "loss_aux_layer_3": 0.16259765625, "loss_aux_layer_4": 0.159423828125, "loss_aux_layer_5": 0.16064453125, "loss_aux_layer_6": 0.157958984375, "loss_aux_layer_7": 0.146484375, "loss_aux_layer_8": 0.14404296875, "loss_aux_layer_9": 0.141845703125, "step": 270, "total_loss": 0.8999526649713516 }, { "epoch": 0.05365274203128093, "grad_norm": 1.0107742547988892, "learning_rate": 5e-05, "llm_loss": 0.6660884022712708, "loss": 3.3815, "loss_aux_layer_0": 0.04583740234375, "loss_aux_layer_1": 0.1287841796875, "loss_aux_layer_10": 0.141357421875, "loss_aux_layer_11": 0.14892578125, "loss_aux_layer_12": 0.160400390625, "loss_aux_layer_13": 0.17041015625, "loss_aux_layer_14": 0.1875, "loss_aux_layer_15": 0.201904296875, "loss_aux_layer_16": 0.216064453125, "loss_aux_layer_17": 0.22119140625, "loss_aux_layer_18": 0.23095703125, "loss_aux_layer_19": 0.2294921875, "loss_aux_layer_2": 0.15234375, "loss_aux_layer_20": 0.2314453125, "loss_aux_layer_21": 0.232421875, "loss_aux_layer_22": 0.251953125, "loss_aux_layer_23": 0.29931640625, "loss_aux_layer_3": 0.1591796875, "loss_aux_layer_4": 0.15625, "loss_aux_layer_5": 0.15771484375, "loss_aux_layer_6": 0.154541015625, "loss_aux_layer_7": 0.1435546875, "loss_aux_layer_8": 0.14111328125, "loss_aux_layer_9": 0.139892578125, "step": 271, "total_loss": 0.8453826606273651 }, { "epoch": 0.05385072262918234, "grad_norm": 1.4330086708068848, "learning_rate": 5e-05, "llm_loss": 0.5652510300278664, "loss": 2.9879, "loss_aux_layer_0": 0.0489501953125, "loss_aux_layer_1": 0.12890625, "loss_aux_layer_10": 0.141845703125, "loss_aux_layer_11": 0.149658203125, "loss_aux_layer_12": 0.161376953125, "loss_aux_layer_13": 0.173095703125, "loss_aux_layer_14": 0.191162109375, "loss_aux_layer_15": 0.2060546875, "loss_aux_layer_16": 0.22119140625, "loss_aux_layer_17": 0.2265625, "loss_aux_layer_18": 0.236328125, "loss_aux_layer_19": 0.23388671875, "loss_aux_layer_2": 0.15185546875, "loss_aux_layer_20": 0.23583984375, "loss_aux_layer_21": 0.2373046875, "loss_aux_layer_22": 0.2578125, "loss_aux_layer_23": 0.30419921875, "loss_aux_layer_3": 0.1591796875, "loss_aux_layer_4": 0.156005859375, "loss_aux_layer_5": 0.15771484375, "loss_aux_layer_6": 0.155029296875, "loss_aux_layer_7": 0.144287109375, "loss_aux_layer_8": 0.14208984375, "loss_aux_layer_9": 0.140869140625, "step": 272, "total_loss": 0.746979683637619 }, { "epoch": 0.054048703227083744, "grad_norm": 1.0732486248016357, "learning_rate": 5e-05, "llm_loss": 0.6779581904411316, "loss": 3.4462, "loss_aux_layer_0": 0.0460205078125, "loss_aux_layer_1": 0.133056640625, "loss_aux_layer_10": 0.14404296875, "loss_aux_layer_11": 0.152099609375, "loss_aux_layer_12": 0.163330078125, "loss_aux_layer_13": 0.173828125, "loss_aux_layer_14": 0.19140625, "loss_aux_layer_15": 0.206787109375, "loss_aux_layer_16": 0.221435546875, "loss_aux_layer_17": 0.225830078125, "loss_aux_layer_18": 0.2353515625, "loss_aux_layer_19": 0.23291015625, "loss_aux_layer_2": 0.158203125, "loss_aux_layer_20": 0.234130859375, "loss_aux_layer_21": 0.236328125, "loss_aux_layer_22": 0.2578125, "loss_aux_layer_23": 0.3056640625, "loss_aux_layer_3": 0.166259765625, "loss_aux_layer_4": 0.162353515625, "loss_aux_layer_5": 0.1630859375, "loss_aux_layer_6": 0.16015625, "loss_aux_layer_7": 0.14794921875, "loss_aux_layer_8": 0.145263671875, "loss_aux_layer_9": 0.14306640625, "step": 273, "total_loss": 0.861539214849472 }, { "epoch": 0.054246683824985154, "grad_norm": 0.6990344524383545, "learning_rate": 5e-05, "llm_loss": 0.6222339272499084, "loss": 3.2005, "loss_aux_layer_0": 0.04425048828125, "loss_aux_layer_1": 0.12890625, "loss_aux_layer_10": 0.137939453125, "loss_aux_layer_11": 0.145751953125, "loss_aux_layer_12": 0.156982421875, "loss_aux_layer_13": 0.16796875, "loss_aux_layer_14": 0.185546875, "loss_aux_layer_15": 0.200439453125, "loss_aux_layer_16": 0.21435546875, "loss_aux_layer_17": 0.219482421875, "loss_aux_layer_18": 0.22998046875, "loss_aux_layer_19": 0.228271484375, "loss_aux_layer_2": 0.152099609375, "loss_aux_layer_20": 0.229736328125, "loss_aux_layer_21": 0.232177734375, "loss_aux_layer_22": 0.25341796875, "loss_aux_layer_23": 0.30224609375, "loss_aux_layer_3": 0.15869140625, "loss_aux_layer_4": 0.1552734375, "loss_aux_layer_5": 0.15625, "loss_aux_layer_6": 0.1533203125, "loss_aux_layer_7": 0.1412353515625, "loss_aux_layer_8": 0.1385498046875, "loss_aux_layer_9": 0.136962890625, "step": 274, "total_loss": 0.8001244068145752 }, { "epoch": 0.054444664422886556, "grad_norm": 0.8077584505081177, "learning_rate": 5e-05, "llm_loss": 0.6231066286563873, "loss": 3.2074, "loss_aux_layer_0": 0.04656982421875, "loss_aux_layer_1": 0.127685546875, "loss_aux_layer_10": 0.1396484375, "loss_aux_layer_11": 0.1474609375, "loss_aux_layer_12": 0.15869140625, "loss_aux_layer_13": 0.16943359375, "loss_aux_layer_14": 0.1865234375, "loss_aux_layer_15": 0.20068359375, "loss_aux_layer_16": 0.215087890625, "loss_aux_layer_17": 0.21923828125, "loss_aux_layer_18": 0.2294921875, "loss_aux_layer_19": 0.228271484375, "loss_aux_layer_2": 0.151123046875, "loss_aux_layer_20": 0.231201171875, "loss_aux_layer_21": 0.234130859375, "loss_aux_layer_22": 0.254638671875, "loss_aux_layer_23": 0.30322265625, "loss_aux_layer_3": 0.158447265625, "loss_aux_layer_4": 0.1552734375, "loss_aux_layer_5": 0.1572265625, "loss_aux_layer_6": 0.154052734375, "loss_aux_layer_7": 0.142578125, "loss_aux_layer_8": 0.140380859375, "loss_aux_layer_9": 0.138671875, "step": 275, "total_loss": 0.8018500059843063 }, { "epoch": 0.054642645020787965, "grad_norm": 1.148311734199524, "learning_rate": 5e-05, "llm_loss": 0.604918360710144, "loss": 3.161, "loss_aux_layer_0": 0.0528564453125, "loss_aux_layer_1": 0.13671875, "loss_aux_layer_10": 0.146240234375, "loss_aux_layer_11": 0.154052734375, "loss_aux_layer_12": 0.1650390625, "loss_aux_layer_13": 0.1748046875, "loss_aux_layer_14": 0.191650390625, "loss_aux_layer_15": 0.2060546875, "loss_aux_layer_16": 0.220703125, "loss_aux_layer_17": 0.22509765625, "loss_aux_layer_18": 0.235595703125, "loss_aux_layer_19": 0.233642578125, "loss_aux_layer_2": 0.16015625, "loss_aux_layer_20": 0.2353515625, "loss_aux_layer_21": 0.23828125, "loss_aux_layer_22": 0.2607421875, "loss_aux_layer_23": 0.30810546875, "loss_aux_layer_3": 0.16748046875, "loss_aux_layer_4": 0.16455078125, "loss_aux_layer_5": 0.166015625, "loss_aux_layer_6": 0.16259765625, "loss_aux_layer_7": 0.150146484375, "loss_aux_layer_8": 0.1474609375, "loss_aux_layer_9": 0.145263671875, "step": 276, "total_loss": 0.7902561128139496 }, { "epoch": 0.05484062561868937, "grad_norm": 1.0241755247116089, "learning_rate": 5e-05, "llm_loss": 0.6740403175354004, "loss": 3.4021, "loss_aux_layer_0": 0.04644775390625, "loss_aux_layer_1": 0.1279296875, "loss_aux_layer_10": 0.136962890625, "loss_aux_layer_11": 0.14501953125, "loss_aux_layer_12": 0.15625, "loss_aux_layer_13": 0.167236328125, "loss_aux_layer_14": 0.1845703125, "loss_aux_layer_15": 0.198974609375, "loss_aux_layer_16": 0.21337890625, "loss_aux_layer_17": 0.218505859375, "loss_aux_layer_18": 0.228759765625, "loss_aux_layer_19": 0.2255859375, "loss_aux_layer_2": 0.150146484375, "loss_aux_layer_20": 0.22802734375, "loss_aux_layer_21": 0.229736328125, "loss_aux_layer_22": 0.2509765625, "loss_aux_layer_23": 0.296875, "loss_aux_layer_3": 0.1572265625, "loss_aux_layer_4": 0.15380859375, "loss_aux_layer_5": 0.1552734375, "loss_aux_layer_6": 0.151611328125, "loss_aux_layer_7": 0.139404296875, "loss_aux_layer_8": 0.136962890625, "loss_aux_layer_9": 0.135986328125, "step": 277, "total_loss": 0.8505240827798843 }, { "epoch": 0.05503860621659078, "grad_norm": 1.1894105672836304, "learning_rate": 5e-05, "llm_loss": 0.5981575474143028, "loss": 3.1106, "loss_aux_layer_0": 0.0462646484375, "loss_aux_layer_1": 0.1287841796875, "loss_aux_layer_10": 0.139892578125, "loss_aux_layer_11": 0.1474609375, "loss_aux_layer_12": 0.158203125, "loss_aux_layer_13": 0.16943359375, "loss_aux_layer_14": 0.1865234375, "loss_aux_layer_15": 0.201416015625, "loss_aux_layer_16": 0.21533203125, "loss_aux_layer_17": 0.219970703125, "loss_aux_layer_18": 0.229248046875, "loss_aux_layer_19": 0.227783203125, "loss_aux_layer_2": 0.152587890625, "loss_aux_layer_20": 0.230712890625, "loss_aux_layer_21": 0.2353515625, "loss_aux_layer_22": 0.25732421875, "loss_aux_layer_23": 0.3046875, "loss_aux_layer_3": 0.160888671875, "loss_aux_layer_4": 0.157470703125, "loss_aux_layer_5": 0.159423828125, "loss_aux_layer_6": 0.155517578125, "loss_aux_layer_7": 0.143310546875, "loss_aux_layer_8": 0.140869140625, "loss_aux_layer_9": 0.138671875, "step": 278, "total_loss": 0.7776438295841217 }, { "epoch": 0.05523658681449218, "grad_norm": 1.983101487159729, "learning_rate": 5e-05, "llm_loss": 0.6020008623600006, "loss": 3.1253, "loss_aux_layer_0": 0.04791259765625, "loss_aux_layer_1": 0.1309814453125, "loss_aux_layer_10": 0.140380859375, "loss_aux_layer_11": 0.14794921875, "loss_aux_layer_12": 0.15869140625, "loss_aux_layer_13": 0.16943359375, "loss_aux_layer_14": 0.1865234375, "loss_aux_layer_15": 0.201171875, "loss_aux_layer_16": 0.215576171875, "loss_aux_layer_17": 0.22119140625, "loss_aux_layer_18": 0.2314453125, "loss_aux_layer_19": 0.228271484375, "loss_aux_layer_2": 0.15380859375, "loss_aux_layer_20": 0.23095703125, "loss_aux_layer_21": 0.2314453125, "loss_aux_layer_22": 0.251220703125, "loss_aux_layer_23": 0.294921875, "loss_aux_layer_3": 0.162109375, "loss_aux_layer_4": 0.15869140625, "loss_aux_layer_5": 0.16015625, "loss_aux_layer_6": 0.156982421875, "loss_aux_layer_7": 0.14404296875, "loss_aux_layer_8": 0.1416015625, "loss_aux_layer_9": 0.13916015625, "step": 279, "total_loss": 0.781318336725235 }, { "epoch": 0.05543456741239359, "grad_norm": 1.3594274520874023, "learning_rate": 5e-05, "llm_loss": 0.636176273226738, "loss": 3.2485, "loss_aux_layer_0": 0.04608154296875, "loss_aux_layer_1": 0.12646484375, "loss_aux_layer_10": 0.135498046875, "loss_aux_layer_11": 0.142822265625, "loss_aux_layer_12": 0.153076171875, "loss_aux_layer_13": 0.163818359375, "loss_aux_layer_14": 0.181640625, "loss_aux_layer_15": 0.197509765625, "loss_aux_layer_16": 0.21240234375, "loss_aux_layer_17": 0.21826171875, "loss_aux_layer_18": 0.22802734375, "loss_aux_layer_19": 0.2275390625, "loss_aux_layer_2": 0.147705078125, "loss_aux_layer_20": 0.23046875, "loss_aux_layer_21": 0.23388671875, "loss_aux_layer_22": 0.25537109375, "loss_aux_layer_23": 0.3037109375, "loss_aux_layer_3": 0.154541015625, "loss_aux_layer_4": 0.152099609375, "loss_aux_layer_5": 0.15380859375, "loss_aux_layer_6": 0.150146484375, "loss_aux_layer_7": 0.137939453125, "loss_aux_layer_8": 0.135986328125, "loss_aux_layer_9": 0.13427734375, "step": 280, "total_loss": 0.8121126741170883 }, { "epoch": 0.05563254801029499, "grad_norm": 1.1381033658981323, "learning_rate": 5e-05, "llm_loss": 0.7300158739089966, "loss": 3.6146, "loss_aux_layer_0": 0.04473876953125, "loss_aux_layer_1": 0.123291015625, "loss_aux_layer_10": 0.1322021484375, "loss_aux_layer_11": 0.1400146484375, "loss_aux_layer_12": 0.150634765625, "loss_aux_layer_13": 0.16162109375, "loss_aux_layer_14": 0.1796875, "loss_aux_layer_15": 0.19482421875, "loss_aux_layer_16": 0.2099609375, "loss_aux_layer_17": 0.2158203125, "loss_aux_layer_18": 0.2265625, "loss_aux_layer_19": 0.2255859375, "loss_aux_layer_2": 0.14599609375, "loss_aux_layer_20": 0.228515625, "loss_aux_layer_21": 0.231689453125, "loss_aux_layer_22": 0.25341796875, "loss_aux_layer_23": 0.30078125, "loss_aux_layer_3": 0.15283203125, "loss_aux_layer_4": 0.14990234375, "loss_aux_layer_5": 0.151123046875, "loss_aux_layer_6": 0.147705078125, "loss_aux_layer_7": 0.1351318359375, "loss_aux_layer_8": 0.1328125, "loss_aux_layer_9": 0.13134765625, "step": 281, "total_loss": 0.903639629483223 }, { "epoch": 0.05583052860819639, "grad_norm": 1.3889416456222534, "learning_rate": 5e-05, "llm_loss": 0.6493559628725052, "loss": 3.293, "loss_aux_layer_0": 0.04351806640625, "loss_aux_layer_1": 0.12158203125, "loss_aux_layer_10": 0.13427734375, "loss_aux_layer_11": 0.142578125, "loss_aux_layer_12": 0.153564453125, "loss_aux_layer_13": 0.1650390625, "loss_aux_layer_14": 0.18212890625, "loss_aux_layer_15": 0.197509765625, "loss_aux_layer_16": 0.212158203125, "loss_aux_layer_17": 0.21826171875, "loss_aux_layer_18": 0.22802734375, "loss_aux_layer_19": 0.22607421875, "loss_aux_layer_2": 0.142578125, "loss_aux_layer_20": 0.227783203125, "loss_aux_layer_21": 0.230224609375, "loss_aux_layer_22": 0.2509765625, "loss_aux_layer_23": 0.2978515625, "loss_aux_layer_3": 0.150634765625, "loss_aux_layer_4": 0.148193359375, "loss_aux_layer_5": 0.150390625, "loss_aux_layer_6": 0.147705078125, "loss_aux_layer_7": 0.1356201171875, "loss_aux_layer_8": 0.1337890625, "loss_aux_layer_9": 0.133056640625, "step": 282, "total_loss": 0.8232518136501312 }, { "epoch": 0.0560285092060978, "grad_norm": 0.8182996511459351, "learning_rate": 5e-05, "llm_loss": 0.6726816445589066, "loss": 3.4062, "loss_aux_layer_0": 0.04486083984375, "loss_aux_layer_1": 0.12744140625, "loss_aux_layer_10": 0.1376953125, "loss_aux_layer_11": 0.14599609375, "loss_aux_layer_12": 0.157470703125, "loss_aux_layer_13": 0.170166015625, "loss_aux_layer_14": 0.187744140625, "loss_aux_layer_15": 0.20361328125, "loss_aux_layer_16": 0.21923828125, "loss_aux_layer_17": 0.223876953125, "loss_aux_layer_18": 0.234375, "loss_aux_layer_19": 0.23193359375, "loss_aux_layer_2": 0.149658203125, "loss_aux_layer_20": 0.234130859375, "loss_aux_layer_21": 0.236083984375, "loss_aux_layer_22": 0.2568359375, "loss_aux_layer_23": 0.302734375, "loss_aux_layer_3": 0.156982421875, "loss_aux_layer_4": 0.153564453125, "loss_aux_layer_5": 0.15478515625, "loss_aux_layer_6": 0.151611328125, "loss_aux_layer_7": 0.13916015625, "loss_aux_layer_8": 0.13720703125, "loss_aux_layer_9": 0.1357421875, "step": 283, "total_loss": 0.8515610992908478 }, { "epoch": 0.056226489803999205, "grad_norm": 1.0004156827926636, "learning_rate": 5e-05, "llm_loss": 0.6867843717336655, "loss": 3.4533, "loss_aux_layer_0": 0.0452880859375, "loss_aux_layer_1": 0.12646484375, "loss_aux_layer_10": 0.135498046875, "loss_aux_layer_11": 0.1435546875, "loss_aux_layer_12": 0.154541015625, "loss_aux_layer_13": 0.166015625, "loss_aux_layer_14": 0.183837890625, "loss_aux_layer_15": 0.198974609375, "loss_aux_layer_16": 0.21435546875, "loss_aux_layer_17": 0.2197265625, "loss_aux_layer_18": 0.22998046875, "loss_aux_layer_19": 0.228759765625, "loss_aux_layer_2": 0.14794921875, "loss_aux_layer_20": 0.230712890625, "loss_aux_layer_21": 0.23388671875, "loss_aux_layer_22": 0.254150390625, "loss_aux_layer_23": 0.30126953125, "loss_aux_layer_3": 0.1552734375, "loss_aux_layer_4": 0.15283203125, "loss_aux_layer_5": 0.154296875, "loss_aux_layer_6": 0.151123046875, "loss_aux_layer_7": 0.137939453125, "loss_aux_layer_8": 0.1357421875, "loss_aux_layer_9": 0.1337890625, "step": 284, "total_loss": 0.8633126467466354 }, { "epoch": 0.056424470401900614, "grad_norm": 1.7931220531463623, "learning_rate": 5e-05, "llm_loss": 0.703838661313057, "loss": 3.5132, "loss_aux_layer_0": 0.0440673828125, "loss_aux_layer_1": 0.1231689453125, "loss_aux_layer_10": 0.134521484375, "loss_aux_layer_11": 0.141845703125, "loss_aux_layer_12": 0.15283203125, "loss_aux_layer_13": 0.1640625, "loss_aux_layer_14": 0.180908203125, "loss_aux_layer_15": 0.196044921875, "loss_aux_layer_16": 0.211669921875, "loss_aux_layer_17": 0.21826171875, "loss_aux_layer_18": 0.228515625, "loss_aux_layer_19": 0.226806640625, "loss_aux_layer_2": 0.1435546875, "loss_aux_layer_20": 0.2294921875, "loss_aux_layer_21": 0.232666015625, "loss_aux_layer_22": 0.253662109375, "loss_aux_layer_23": 0.3017578125, "loss_aux_layer_3": 0.1513671875, "loss_aux_layer_4": 0.1494140625, "loss_aux_layer_5": 0.15087890625, "loss_aux_layer_6": 0.147705078125, "loss_aux_layer_7": 0.13623046875, "loss_aux_layer_8": 0.13427734375, "loss_aux_layer_9": 0.1328125, "step": 285, "total_loss": 0.8783035278320312 }, { "epoch": 0.05662245099980202, "grad_norm": 2.0119054317474365, "learning_rate": 5e-05, "llm_loss": 0.8026529848575592, "loss": 3.9153, "loss_aux_layer_0": 0.04400634765625, "loss_aux_layer_1": 0.125, "loss_aux_layer_10": 0.13623046875, "loss_aux_layer_11": 0.143798828125, "loss_aux_layer_12": 0.154296875, "loss_aux_layer_13": 0.165771484375, "loss_aux_layer_14": 0.18359375, "loss_aux_layer_15": 0.198486328125, "loss_aux_layer_16": 0.214111328125, "loss_aux_layer_17": 0.219482421875, "loss_aux_layer_18": 0.22998046875, "loss_aux_layer_19": 0.22900390625, "loss_aux_layer_2": 0.146240234375, "loss_aux_layer_20": 0.23095703125, "loss_aux_layer_21": 0.2333984375, "loss_aux_layer_22": 0.2529296875, "loss_aux_layer_23": 0.30029296875, "loss_aux_layer_3": 0.154052734375, "loss_aux_layer_4": 0.15234375, "loss_aux_layer_5": 0.154052734375, "loss_aux_layer_6": 0.150634765625, "loss_aux_layer_7": 0.138671875, "loss_aux_layer_8": 0.136474609375, "loss_aux_layer_9": 0.134765625, "step": 286, "total_loss": 0.9788288474082947 }, { "epoch": 0.056820431597703426, "grad_norm": 1.0921299457550049, "learning_rate": 5e-05, "llm_loss": 0.7081029415130615, "loss": 3.5289, "loss_aux_layer_0": 0.04541015625, "loss_aux_layer_1": 0.1243896484375, "loss_aux_layer_10": 0.1318359375, "loss_aux_layer_11": 0.13916015625, "loss_aux_layer_12": 0.15087890625, "loss_aux_layer_13": 0.16162109375, "loss_aux_layer_14": 0.180908203125, "loss_aux_layer_15": 0.196533203125, "loss_aux_layer_16": 0.212646484375, "loss_aux_layer_17": 0.2197265625, "loss_aux_layer_18": 0.22998046875, "loss_aux_layer_19": 0.229736328125, "loss_aux_layer_2": 0.14306640625, "loss_aux_layer_20": 0.232177734375, "loss_aux_layer_21": 0.234375, "loss_aux_layer_22": 0.255859375, "loss_aux_layer_23": 0.30322265625, "loss_aux_layer_3": 0.15087890625, "loss_aux_layer_4": 0.147705078125, "loss_aux_layer_5": 0.14892578125, "loss_aux_layer_6": 0.1455078125, "loss_aux_layer_7": 0.13330078125, "loss_aux_layer_8": 0.131103515625, "loss_aux_layer_9": 0.13037109375, "step": 287, "total_loss": 0.8822207003831863 }, { "epoch": 0.05701841219560483, "grad_norm": 1.286982536315918, "learning_rate": 5e-05, "llm_loss": 0.6318127810955048, "loss": 3.2255, "loss_aux_layer_0": 0.04296875, "loss_aux_layer_1": 0.12548828125, "loss_aux_layer_10": 0.135498046875, "loss_aux_layer_11": 0.143310546875, "loss_aux_layer_12": 0.15380859375, "loss_aux_layer_13": 0.16455078125, "loss_aux_layer_14": 0.181884765625, "loss_aux_layer_15": 0.197021484375, "loss_aux_layer_16": 0.211669921875, "loss_aux_layer_17": 0.217041015625, "loss_aux_layer_18": 0.22705078125, "loss_aux_layer_19": 0.224853515625, "loss_aux_layer_2": 0.146484375, "loss_aux_layer_20": 0.227783203125, "loss_aux_layer_21": 0.230224609375, "loss_aux_layer_22": 0.24951171875, "loss_aux_layer_23": 0.294921875, "loss_aux_layer_3": 0.154296875, "loss_aux_layer_4": 0.151611328125, "loss_aux_layer_5": 0.153076171875, "loss_aux_layer_6": 0.1494140625, "loss_aux_layer_7": 0.137451171875, "loss_aux_layer_8": 0.1357421875, "loss_aux_layer_9": 0.1339111328125, "step": 288, "total_loss": 0.8063857555389404 }, { "epoch": 0.05721639279350624, "grad_norm": 1.665808081626892, "learning_rate": 5e-05, "llm_loss": 0.6563509553670883, "loss": 3.3415, "loss_aux_layer_0": 0.0440673828125, "loss_aux_layer_1": 0.130859375, "loss_aux_layer_10": 0.13916015625, "loss_aux_layer_11": 0.147216796875, "loss_aux_layer_12": 0.158203125, "loss_aux_layer_13": 0.1689453125, "loss_aux_layer_14": 0.1865234375, "loss_aux_layer_15": 0.200927734375, "loss_aux_layer_16": 0.21484375, "loss_aux_layer_17": 0.2197265625, "loss_aux_layer_18": 0.230224609375, "loss_aux_layer_19": 0.227783203125, "loss_aux_layer_2": 0.15234375, "loss_aux_layer_20": 0.23046875, "loss_aux_layer_21": 0.234619140625, "loss_aux_layer_22": 0.25732421875, "loss_aux_layer_23": 0.30419921875, "loss_aux_layer_3": 0.16015625, "loss_aux_layer_4": 0.156982421875, "loss_aux_layer_5": 0.158203125, "loss_aux_layer_6": 0.154296875, "loss_aux_layer_7": 0.141845703125, "loss_aux_layer_8": 0.14013671875, "loss_aux_layer_9": 0.1376953125, "step": 289, "total_loss": 0.8353699594736099 }, { "epoch": 0.05741437339140764, "grad_norm": 1.2343106269836426, "learning_rate": 5e-05, "llm_loss": 0.563551776111126, "loss": 2.996, "loss_aux_layer_0": 0.04803466796875, "loss_aux_layer_1": 0.134521484375, "loss_aux_layer_10": 0.146484375, "loss_aux_layer_11": 0.154052734375, "loss_aux_layer_12": 0.164794921875, "loss_aux_layer_13": 0.175537109375, "loss_aux_layer_14": 0.193115234375, "loss_aux_layer_15": 0.20751953125, "loss_aux_layer_16": 0.2216796875, "loss_aux_layer_17": 0.225830078125, "loss_aux_layer_18": 0.236572265625, "loss_aux_layer_19": 0.234130859375, "loss_aux_layer_2": 0.15771484375, "loss_aux_layer_20": 0.235595703125, "loss_aux_layer_21": 0.23876953125, "loss_aux_layer_22": 0.2607421875, "loss_aux_layer_23": 0.306640625, "loss_aux_layer_3": 0.167236328125, "loss_aux_layer_4": 0.16552734375, "loss_aux_layer_5": 0.168212890625, "loss_aux_layer_6": 0.1640625, "loss_aux_layer_7": 0.150390625, "loss_aux_layer_8": 0.1474609375, "loss_aux_layer_9": 0.14599609375, "step": 290, "total_loss": 0.7489886581897736 }, { "epoch": 0.05761235398930905, "grad_norm": 1.5130608081817627, "learning_rate": 5e-05, "llm_loss": 0.6616673097014427, "loss": 3.3247, "loss_aux_layer_0": 0.04315185546875, "loss_aux_layer_1": 0.1195068359375, "loss_aux_layer_10": 0.12841796875, "loss_aux_layer_11": 0.135986328125, "loss_aux_layer_12": 0.14697265625, "loss_aux_layer_13": 0.158203125, "loss_aux_layer_14": 0.17626953125, "loss_aux_layer_15": 0.192138671875, "loss_aux_layer_16": 0.207763671875, "loss_aux_layer_17": 0.214111328125, "loss_aux_layer_18": 0.224609375, "loss_aux_layer_19": 0.222900390625, "loss_aux_layer_2": 0.13818359375, "loss_aux_layer_20": 0.226318359375, "loss_aux_layer_21": 0.22900390625, "loss_aux_layer_22": 0.2490234375, "loss_aux_layer_23": 0.2958984375, "loss_aux_layer_3": 0.146484375, "loss_aux_layer_4": 0.1435546875, "loss_aux_layer_5": 0.144775390625, "loss_aux_layer_6": 0.1416015625, "loss_aux_layer_7": 0.1290283203125, "loss_aux_layer_8": 0.128173828125, "loss_aux_layer_9": 0.1268310546875, "step": 291, "total_loss": 0.8311797529459 }, { "epoch": 0.05781033458721045, "grad_norm": 2.238393783569336, "learning_rate": 5e-05, "llm_loss": 0.6797046363353729, "loss": 3.4005, "loss_aux_layer_0": 0.0433349609375, "loss_aux_layer_1": 0.12109375, "loss_aux_layer_10": 0.1300048828125, "loss_aux_layer_11": 0.13720703125, "loss_aux_layer_12": 0.14794921875, "loss_aux_layer_13": 0.15966796875, "loss_aux_layer_14": 0.177490234375, "loss_aux_layer_15": 0.1923828125, "loss_aux_layer_16": 0.207275390625, "loss_aux_layer_17": 0.2138671875, "loss_aux_layer_18": 0.224609375, "loss_aux_layer_19": 0.223388671875, "loss_aux_layer_2": 0.14013671875, "loss_aux_layer_20": 0.22705078125, "loss_aux_layer_21": 0.228759765625, "loss_aux_layer_22": 0.24853515625, "loss_aux_layer_23": 0.29443359375, "loss_aux_layer_3": 0.147705078125, "loss_aux_layer_4": 0.14501953125, "loss_aux_layer_5": 0.146240234375, "loss_aux_layer_6": 0.142822265625, "loss_aux_layer_7": 0.1318359375, "loss_aux_layer_8": 0.1304931640625, "loss_aux_layer_9": 0.128662109375, "step": 292, "total_loss": 0.8501342236995697 }, { "epoch": 0.05800831518511186, "grad_norm": 0.7841664552688599, "learning_rate": 5e-05, "llm_loss": 0.6948685199022293, "loss": 3.4942, "loss_aux_layer_0": 0.0411376953125, "loss_aux_layer_1": 0.12890625, "loss_aux_layer_10": 0.138671875, "loss_aux_layer_11": 0.146240234375, "loss_aux_layer_12": 0.1572265625, "loss_aux_layer_13": 0.1689453125, "loss_aux_layer_14": 0.18701171875, "loss_aux_layer_15": 0.201904296875, "loss_aux_layer_16": 0.216552734375, "loss_aux_layer_17": 0.22216796875, "loss_aux_layer_18": 0.23291015625, "loss_aux_layer_19": 0.2294921875, "loss_aux_layer_2": 0.1513671875, "loss_aux_layer_20": 0.231689453125, "loss_aux_layer_21": 0.233642578125, "loss_aux_layer_22": 0.25439453125, "loss_aux_layer_23": 0.30029296875, "loss_aux_layer_3": 0.159423828125, "loss_aux_layer_4": 0.1572265625, "loss_aux_layer_5": 0.158203125, "loss_aux_layer_6": 0.154296875, "loss_aux_layer_7": 0.140869140625, "loss_aux_layer_8": 0.138671875, "loss_aux_layer_9": 0.13720703125, "step": 293, "total_loss": 0.8735416084527969 }, { "epoch": 0.058206295783013264, "grad_norm": 3.2641170024871826, "learning_rate": 5e-05, "llm_loss": 0.6603719592094421, "loss": 3.3436, "loss_aux_layer_0": 0.043701171875, "loss_aux_layer_1": 0.1260986328125, "loss_aux_layer_10": 0.1357421875, "loss_aux_layer_11": 0.143798828125, "loss_aux_layer_12": 0.154296875, "loss_aux_layer_13": 0.165771484375, "loss_aux_layer_14": 0.183349609375, "loss_aux_layer_15": 0.198486328125, "loss_aux_layer_16": 0.214111328125, "loss_aux_layer_17": 0.21923828125, "loss_aux_layer_18": 0.2294921875, "loss_aux_layer_19": 0.226806640625, "loss_aux_layer_2": 0.144287109375, "loss_aux_layer_20": 0.230224609375, "loss_aux_layer_21": 0.23291015625, "loss_aux_layer_22": 0.25390625, "loss_aux_layer_23": 0.30029296875, "loss_aux_layer_3": 0.15234375, "loss_aux_layer_4": 0.150146484375, "loss_aux_layer_5": 0.151123046875, "loss_aux_layer_6": 0.148193359375, "loss_aux_layer_7": 0.136474609375, "loss_aux_layer_8": 0.135986328125, "loss_aux_layer_9": 0.134765625, "step": 294, "total_loss": 0.83589668571949 }, { "epoch": 0.05840427638091467, "grad_norm": 3.122018575668335, "learning_rate": 5e-05, "llm_loss": 0.6102011948823929, "loss": 3.1521, "loss_aux_layer_0": 0.04400634765625, "loss_aux_layer_1": 0.126708984375, "loss_aux_layer_10": 0.138671875, "loss_aux_layer_11": 0.146484375, "loss_aux_layer_12": 0.157958984375, "loss_aux_layer_13": 0.169677734375, "loss_aux_layer_14": 0.18701171875, "loss_aux_layer_15": 0.201904296875, "loss_aux_layer_16": 0.21630859375, "loss_aux_layer_17": 0.221435546875, "loss_aux_layer_18": 0.2314453125, "loss_aux_layer_19": 0.228515625, "loss_aux_layer_2": 0.1473388671875, "loss_aux_layer_20": 0.230712890625, "loss_aux_layer_21": 0.23291015625, "loss_aux_layer_22": 0.254638671875, "loss_aux_layer_23": 0.30126953125, "loss_aux_layer_3": 0.155517578125, "loss_aux_layer_4": 0.1534423828125, "loss_aux_layer_5": 0.155029296875, "loss_aux_layer_6": 0.1513671875, "loss_aux_layer_7": 0.139404296875, "loss_aux_layer_8": 0.138427734375, "loss_aux_layer_9": 0.1368408203125, "step": 295, "total_loss": 0.7880251854658127 }, { "epoch": 0.058602256978816075, "grad_norm": 0.9368335008621216, "learning_rate": 5e-05, "llm_loss": 0.6787471026182175, "loss": 3.4017, "loss_aux_layer_0": 0.0419921875, "loss_aux_layer_1": 0.123779296875, "loss_aux_layer_10": 0.131103515625, "loss_aux_layer_11": 0.138671875, "loss_aux_layer_12": 0.149169921875, "loss_aux_layer_13": 0.16015625, "loss_aux_layer_14": 0.177734375, "loss_aux_layer_15": 0.193115234375, "loss_aux_layer_16": 0.207763671875, "loss_aux_layer_17": 0.213623046875, "loss_aux_layer_18": 0.22412109375, "loss_aux_layer_19": 0.222900390625, "loss_aux_layer_2": 0.143798828125, "loss_aux_layer_20": 0.225830078125, "loss_aux_layer_21": 0.228759765625, "loss_aux_layer_22": 0.250244140625, "loss_aux_layer_23": 0.29638671875, "loss_aux_layer_3": 0.151611328125, "loss_aux_layer_4": 0.1484375, "loss_aux_layer_5": 0.150146484375, "loss_aux_layer_6": 0.146240234375, "loss_aux_layer_7": 0.13330078125, "loss_aux_layer_8": 0.13134765625, "loss_aux_layer_9": 0.129638671875, "step": 296, "total_loss": 0.8504183441400528 }, { "epoch": 0.058800237576717485, "grad_norm": 2.9150993824005127, "learning_rate": 5e-05, "llm_loss": 0.6699685826897621, "loss": 3.3954, "loss_aux_layer_0": 0.04296875, "loss_aux_layer_1": 0.1292724609375, "loss_aux_layer_10": 0.1396484375, "loss_aux_layer_11": 0.14794921875, "loss_aux_layer_12": 0.158447265625, "loss_aux_layer_13": 0.169677734375, "loss_aux_layer_14": 0.187255859375, "loss_aux_layer_15": 0.2021484375, "loss_aux_layer_16": 0.216552734375, "loss_aux_layer_17": 0.2216796875, "loss_aux_layer_18": 0.23291015625, "loss_aux_layer_19": 0.230712890625, "loss_aux_layer_2": 0.149169921875, "loss_aux_layer_20": 0.232177734375, "loss_aux_layer_21": 0.234130859375, "loss_aux_layer_22": 0.25634765625, "loss_aux_layer_23": 0.30419921875, "loss_aux_layer_3": 0.15771484375, "loss_aux_layer_4": 0.15478515625, "loss_aux_layer_5": 0.155517578125, "loss_aux_layer_6": 0.152099609375, "loss_aux_layer_7": 0.139892578125, "loss_aux_layer_8": 0.13916015625, "loss_aux_layer_9": 0.137939453125, "step": 297, "total_loss": 0.8488457798957825 }, { "epoch": 0.05899821817461889, "grad_norm": 2.569523572921753, "learning_rate": 5e-05, "llm_loss": 0.6250202357769012, "loss": 3.1885, "loss_aux_layer_0": 0.04583740234375, "loss_aux_layer_1": 0.1209716796875, "loss_aux_layer_10": 0.1314697265625, "loss_aux_layer_11": 0.13818359375, "loss_aux_layer_12": 0.14892578125, "loss_aux_layer_13": 0.160400390625, "loss_aux_layer_14": 0.1787109375, "loss_aux_layer_15": 0.195068359375, "loss_aux_layer_16": 0.21044921875, "loss_aux_layer_17": 0.217041015625, "loss_aux_layer_18": 0.22802734375, "loss_aux_layer_19": 0.227294921875, "loss_aux_layer_2": 0.139404296875, "loss_aux_layer_20": 0.23046875, "loss_aux_layer_21": 0.23291015625, "loss_aux_layer_22": 0.25146484375, "loss_aux_layer_23": 0.29833984375, "loss_aux_layer_3": 0.14794921875, "loss_aux_layer_4": 0.14501953125, "loss_aux_layer_5": 0.14697265625, "loss_aux_layer_6": 0.142822265625, "loss_aux_layer_7": 0.1317138671875, "loss_aux_layer_8": 0.131591796875, "loss_aux_layer_9": 0.129638671875, "step": 298, "total_loss": 0.7971174865961075 }, { "epoch": 0.059196198772520296, "grad_norm": 2.063990831375122, "learning_rate": 5e-05, "llm_loss": 0.5722816437482834, "loss": 2.9732, "loss_aux_layer_0": 0.0438232421875, "loss_aux_layer_1": 0.119873046875, "loss_aux_layer_10": 0.1307373046875, "loss_aux_layer_11": 0.13818359375, "loss_aux_layer_12": 0.14892578125, "loss_aux_layer_13": 0.160400390625, "loss_aux_layer_14": 0.17822265625, "loss_aux_layer_15": 0.193603515625, "loss_aux_layer_16": 0.208251953125, "loss_aux_layer_17": 0.214111328125, "loss_aux_layer_18": 0.22509765625, "loss_aux_layer_19": 0.225830078125, "loss_aux_layer_2": 0.138427734375, "loss_aux_layer_20": 0.22802734375, "loss_aux_layer_21": 0.230712890625, "loss_aux_layer_22": 0.251708984375, "loss_aux_layer_23": 0.30029296875, "loss_aux_layer_3": 0.14697265625, "loss_aux_layer_4": 0.144287109375, "loss_aux_layer_5": 0.14599609375, "loss_aux_layer_6": 0.142578125, "loss_aux_layer_7": 0.13037109375, "loss_aux_layer_8": 0.129638671875, "loss_aux_layer_9": 0.1285400390625, "step": 299, "total_loss": 0.7432993054389954 }, { "epoch": 0.0593941793704217, "grad_norm": 3.662116527557373, "learning_rate": 5e-05, "llm_loss": 0.6464252769947052, "loss": 3.2999, "loss_aux_layer_0": 0.04193115234375, "loss_aux_layer_1": 0.1290283203125, "loss_aux_layer_10": 0.139404296875, "loss_aux_layer_11": 0.1474609375, "loss_aux_layer_12": 0.158203125, "loss_aux_layer_13": 0.168701171875, "loss_aux_layer_14": 0.186279296875, "loss_aux_layer_15": 0.20068359375, "loss_aux_layer_16": 0.215576171875, "loss_aux_layer_17": 0.22021484375, "loss_aux_layer_18": 0.230224609375, "loss_aux_layer_19": 0.228759765625, "loss_aux_layer_2": 0.14990234375, "loss_aux_layer_20": 0.2314453125, "loss_aux_layer_21": 0.234375, "loss_aux_layer_22": 0.256591796875, "loss_aux_layer_23": 0.30419921875, "loss_aux_layer_3": 0.15771484375, "loss_aux_layer_4": 0.15478515625, "loss_aux_layer_5": 0.15576171875, "loss_aux_layer_6": 0.152587890625, "loss_aux_layer_7": 0.142578125, "loss_aux_layer_8": 0.140869140625, "loss_aux_layer_9": 0.138427734375, "step": 300, "total_loss": 0.8249744176864624 }, { "epoch": 0.0595921599683231, "grad_norm": 1.478460431098938, "learning_rate": 5e-05, "llm_loss": 0.7023463845252991, "loss": 3.4923, "loss_aux_layer_0": 0.04248046875, "loss_aux_layer_1": 0.1241455078125, "loss_aux_layer_10": 0.1307373046875, "loss_aux_layer_11": 0.137451171875, "loss_aux_layer_12": 0.14892578125, "loss_aux_layer_13": 0.159423828125, "loss_aux_layer_14": 0.177001953125, "loss_aux_layer_15": 0.192626953125, "loss_aux_layer_16": 0.2080078125, "loss_aux_layer_17": 0.21337890625, "loss_aux_layer_18": 0.223876953125, "loss_aux_layer_19": 0.222412109375, "loss_aux_layer_2": 0.143798828125, "loss_aux_layer_20": 0.224609375, "loss_aux_layer_21": 0.225830078125, "loss_aux_layer_22": 0.2451171875, "loss_aux_layer_23": 0.29052734375, "loss_aux_layer_3": 0.15185546875, "loss_aux_layer_4": 0.14892578125, "loss_aux_layer_5": 0.1494140625, "loss_aux_layer_6": 0.145751953125, "loss_aux_layer_7": 0.1319580078125, "loss_aux_layer_8": 0.1307373046875, "loss_aux_layer_9": 0.1290283203125, "step": 301, "total_loss": 0.8730718940496445 }, { "epoch": 0.05979014056622451, "grad_norm": 2.8852009773254395, "learning_rate": 5e-05, "llm_loss": 0.670396238565445, "loss": 3.3848, "loss_aux_layer_0": 0.043212890625, "loss_aux_layer_1": 0.1256103515625, "loss_aux_layer_10": 0.13671875, "loss_aux_layer_11": 0.144287109375, "loss_aux_layer_12": 0.1552734375, "loss_aux_layer_13": 0.166748046875, "loss_aux_layer_14": 0.18408203125, "loss_aux_layer_15": 0.198486328125, "loss_aux_layer_16": 0.212890625, "loss_aux_layer_17": 0.21826171875, "loss_aux_layer_18": 0.2294921875, "loss_aux_layer_19": 0.226806640625, "loss_aux_layer_2": 0.14501953125, "loss_aux_layer_20": 0.229248046875, "loss_aux_layer_21": 0.230712890625, "loss_aux_layer_22": 0.253173828125, "loss_aux_layer_23": 0.30029296875, "loss_aux_layer_3": 0.154296875, "loss_aux_layer_4": 0.152587890625, "loss_aux_layer_5": 0.154052734375, "loss_aux_layer_6": 0.150634765625, "loss_aux_layer_7": 0.1376953125, "loss_aux_layer_8": 0.135986328125, "loss_aux_layer_9": 0.134765625, "step": 302, "total_loss": 0.8462035655975342 }, { "epoch": 0.05998812116412591, "grad_norm": 0.9391802549362183, "learning_rate": 5e-05, "llm_loss": 0.6840949505567551, "loss": 3.4366, "loss_aux_layer_0": 0.04449462890625, "loss_aux_layer_1": 0.1280517578125, "loss_aux_layer_10": 0.13525390625, "loss_aux_layer_11": 0.1435546875, "loss_aux_layer_12": 0.154296875, "loss_aux_layer_13": 0.165283203125, "loss_aux_layer_14": 0.18212890625, "loss_aux_layer_15": 0.19677734375, "loss_aux_layer_16": 0.211669921875, "loss_aux_layer_17": 0.21630859375, "loss_aux_layer_18": 0.2265625, "loss_aux_layer_19": 0.225341796875, "loss_aux_layer_2": 0.147705078125, "loss_aux_layer_20": 0.22705078125, "loss_aux_layer_21": 0.229736328125, "loss_aux_layer_22": 0.250732421875, "loss_aux_layer_23": 0.29638671875, "loss_aux_layer_3": 0.156005859375, "loss_aux_layer_4": 0.15380859375, "loss_aux_layer_5": 0.154052734375, "loss_aux_layer_6": 0.150634765625, "loss_aux_layer_7": 0.13671875, "loss_aux_layer_8": 0.135009765625, "loss_aux_layer_9": 0.1337890625, "step": 303, "total_loss": 0.8591418862342834 }, { "epoch": 0.06018610176202732, "grad_norm": 2.8030354976654053, "learning_rate": 5e-05, "llm_loss": 0.6273876130580902, "loss": 3.2, "loss_aux_layer_0": 0.041259765625, "loss_aux_layer_1": 0.1221923828125, "loss_aux_layer_10": 0.1334228515625, "loss_aux_layer_11": 0.140869140625, "loss_aux_layer_12": 0.152587890625, "loss_aux_layer_13": 0.163818359375, "loss_aux_layer_14": 0.18212890625, "loss_aux_layer_15": 0.197021484375, "loss_aux_layer_16": 0.21142578125, "loss_aux_layer_17": 0.21630859375, "loss_aux_layer_18": 0.2265625, "loss_aux_layer_19": 0.2236328125, "loss_aux_layer_2": 0.14111328125, "loss_aux_layer_20": 0.227294921875, "loss_aux_layer_21": 0.22900390625, "loss_aux_layer_22": 0.248779296875, "loss_aux_layer_23": 0.2939453125, "loss_aux_layer_3": 0.149658203125, "loss_aux_layer_4": 0.1474609375, "loss_aux_layer_5": 0.148681640625, "loss_aux_layer_6": 0.1455078125, "loss_aux_layer_7": 0.134521484375, "loss_aux_layer_8": 0.1336669921875, "loss_aux_layer_9": 0.1324462890625, "step": 304, "total_loss": 0.7999968081712723 }, { "epoch": 0.060384082359928724, "grad_norm": 1.3640562295913696, "learning_rate": 5e-05, "llm_loss": 0.6317683458328247, "loss": 3.2143, "loss_aux_layer_0": 0.0439453125, "loss_aux_layer_1": 0.1258544921875, "loss_aux_layer_10": 0.13330078125, "loss_aux_layer_11": 0.140380859375, "loss_aux_layer_12": 0.15087890625, "loss_aux_layer_13": 0.16162109375, "loss_aux_layer_14": 0.17822265625, "loss_aux_layer_15": 0.193603515625, "loss_aux_layer_16": 0.208740234375, "loss_aux_layer_17": 0.2138671875, "loss_aux_layer_18": 0.22412109375, "loss_aux_layer_19": 0.222412109375, "loss_aux_layer_2": 0.143798828125, "loss_aux_layer_20": 0.22412109375, "loss_aux_layer_21": 0.225341796875, "loss_aux_layer_22": 0.243896484375, "loss_aux_layer_23": 0.2880859375, "loss_aux_layer_3": 0.152099609375, "loss_aux_layer_4": 0.149658203125, "loss_aux_layer_5": 0.150634765625, "loss_aux_layer_6": 0.1474609375, "loss_aux_layer_7": 0.135009765625, "loss_aux_layer_8": 0.133544921875, "loss_aux_layer_9": 0.1318359375, "step": 305, "total_loss": 0.8035731166601181 }, { "epoch": 0.060582062957830134, "grad_norm": 2.1720964908599854, "learning_rate": 5e-05, "llm_loss": 0.6100987941026688, "loss": 3.1413, "loss_aux_layer_0": 0.04150390625, "loss_aux_layer_1": 0.1268310546875, "loss_aux_layer_10": 0.135009765625, "loss_aux_layer_11": 0.142822265625, "loss_aux_layer_12": 0.15380859375, "loss_aux_layer_13": 0.1650390625, "loss_aux_layer_14": 0.182861328125, "loss_aux_layer_15": 0.197998046875, "loss_aux_layer_16": 0.21337890625, "loss_aux_layer_17": 0.219482421875, "loss_aux_layer_18": 0.2294921875, "loss_aux_layer_19": 0.226806640625, "loss_aux_layer_2": 0.145263671875, "loss_aux_layer_20": 0.228759765625, "loss_aux_layer_21": 0.23095703125, "loss_aux_layer_22": 0.25341796875, "loss_aux_layer_23": 0.29931640625, "loss_aux_layer_3": 0.154052734375, "loss_aux_layer_4": 0.151611328125, "loss_aux_layer_5": 0.15234375, "loss_aux_layer_6": 0.14892578125, "loss_aux_layer_7": 0.1365966796875, "loss_aux_layer_8": 0.13525390625, "loss_aux_layer_9": 0.1331787109375, "step": 306, "total_loss": 0.7853351980447769 }, { "epoch": 0.060780043555731536, "grad_norm": 2.31976580619812, "learning_rate": 5e-05, "llm_loss": 0.6030752509832382, "loss": 3.1133, "loss_aux_layer_0": 0.0447998046875, "loss_aux_layer_1": 0.1256103515625, "loss_aux_layer_10": 0.13623046875, "loss_aux_layer_11": 0.144287109375, "loss_aux_layer_12": 0.154541015625, "loss_aux_layer_13": 0.1650390625, "loss_aux_layer_14": 0.1826171875, "loss_aux_layer_15": 0.197509765625, "loss_aux_layer_16": 0.21240234375, "loss_aux_layer_17": 0.21728515625, "loss_aux_layer_18": 0.22900390625, "loss_aux_layer_19": 0.227783203125, "loss_aux_layer_2": 0.145263671875, "loss_aux_layer_20": 0.229248046875, "loss_aux_layer_21": 0.2314453125, "loss_aux_layer_22": 0.25146484375, "loss_aux_layer_23": 0.2978515625, "loss_aux_layer_3": 0.15380859375, "loss_aux_layer_4": 0.1513671875, "loss_aux_layer_5": 0.15234375, "loss_aux_layer_6": 0.1484375, "loss_aux_layer_7": 0.13623046875, "loss_aux_layer_8": 0.13623046875, "loss_aux_layer_9": 0.135009765625, "step": 307, "total_loss": 0.778324693441391 }, { "epoch": 0.060978024153632945, "grad_norm": 1.1990002393722534, "learning_rate": 5e-05, "llm_loss": 0.6925680190324783, "loss": 3.4618, "loss_aux_layer_0": 0.04058837890625, "loss_aux_layer_1": 0.126708984375, "loss_aux_layer_10": 0.1326904296875, "loss_aux_layer_11": 0.140625, "loss_aux_layer_12": 0.15087890625, "loss_aux_layer_13": 0.161865234375, "loss_aux_layer_14": 0.17919921875, "loss_aux_layer_15": 0.19482421875, "loss_aux_layer_16": 0.209716796875, "loss_aux_layer_17": 0.2158203125, "loss_aux_layer_18": 0.2275390625, "loss_aux_layer_19": 0.22509765625, "loss_aux_layer_2": 0.14453125, "loss_aux_layer_20": 0.22802734375, "loss_aux_layer_21": 0.22900390625, "loss_aux_layer_22": 0.250732421875, "loss_aux_layer_23": 0.294921875, "loss_aux_layer_3": 0.152587890625, "loss_aux_layer_4": 0.149658203125, "loss_aux_layer_5": 0.150390625, "loss_aux_layer_6": 0.14697265625, "loss_aux_layer_7": 0.1334228515625, "loss_aux_layer_8": 0.1324462890625, "loss_aux_layer_9": 0.131103515625, "step": 308, "total_loss": 0.8654520213603973 }, { "epoch": 0.06117600475153435, "grad_norm": 1.716041088104248, "learning_rate": 5e-05, "llm_loss": 0.6746088862419128, "loss": 3.3708, "loss_aux_layer_0": 0.04144287109375, "loss_aux_layer_1": 0.1219482421875, "loss_aux_layer_10": 0.1282958984375, "loss_aux_layer_11": 0.1357421875, "loss_aux_layer_12": 0.146240234375, "loss_aux_layer_13": 0.15771484375, "loss_aux_layer_14": 0.174072265625, "loss_aux_layer_15": 0.189208984375, "loss_aux_layer_16": 0.204345703125, "loss_aux_layer_17": 0.2109375, "loss_aux_layer_18": 0.221923828125, "loss_aux_layer_19": 0.220458984375, "loss_aux_layer_2": 0.1376953125, "loss_aux_layer_20": 0.22314453125, "loss_aux_layer_21": 0.22412109375, "loss_aux_layer_22": 0.243896484375, "loss_aux_layer_23": 0.2890625, "loss_aux_layer_3": 0.146484375, "loss_aux_layer_4": 0.14453125, "loss_aux_layer_5": 0.1455078125, "loss_aux_layer_6": 0.1416015625, "loss_aux_layer_7": 0.1302490234375, "loss_aux_layer_8": 0.1285400390625, "loss_aux_layer_9": 0.1268310546875, "step": 309, "total_loss": 0.8426990807056427 }, { "epoch": 0.06137398534943576, "grad_norm": 1.0950367450714111, "learning_rate": 5e-05, "llm_loss": 0.6956032663583755, "loss": 3.4666, "loss_aux_layer_0": 0.03985595703125, "loss_aux_layer_1": 0.12451171875, "loss_aux_layer_10": 0.131591796875, "loss_aux_layer_11": 0.138916015625, "loss_aux_layer_12": 0.150146484375, "loss_aux_layer_13": 0.160400390625, "loss_aux_layer_14": 0.177001953125, "loss_aux_layer_15": 0.19189453125, "loss_aux_layer_16": 0.206787109375, "loss_aux_layer_17": 0.212890625, "loss_aux_layer_18": 0.22412109375, "loss_aux_layer_19": 0.223388671875, "loss_aux_layer_2": 0.142333984375, "loss_aux_layer_20": 0.2255859375, "loss_aux_layer_21": 0.22607421875, "loss_aux_layer_22": 0.246826171875, "loss_aux_layer_23": 0.2919921875, "loss_aux_layer_3": 0.15087890625, "loss_aux_layer_4": 0.14892578125, "loss_aux_layer_5": 0.149658203125, "loss_aux_layer_6": 0.145751953125, "loss_aux_layer_7": 0.133056640625, "loss_aux_layer_8": 0.13134765625, "loss_aux_layer_9": 0.129638671875, "step": 310, "total_loss": 0.8666377812623978 }, { "epoch": 0.06157196594733716, "grad_norm": 2.2392120361328125, "learning_rate": 5e-05, "llm_loss": 0.6707318127155304, "loss": 3.3757, "loss_aux_layer_0": 0.0413818359375, "loss_aux_layer_1": 0.1246337890625, "loss_aux_layer_10": 0.1326904296875, "loss_aux_layer_11": 0.14013671875, "loss_aux_layer_12": 0.151123046875, "loss_aux_layer_13": 0.16259765625, "loss_aux_layer_14": 0.1806640625, "loss_aux_layer_15": 0.1962890625, "loss_aux_layer_16": 0.211669921875, "loss_aux_layer_17": 0.217529296875, "loss_aux_layer_18": 0.228515625, "loss_aux_layer_19": 0.2265625, "loss_aux_layer_2": 0.142578125, "loss_aux_layer_20": 0.2294921875, "loss_aux_layer_21": 0.23046875, "loss_aux_layer_22": 0.251220703125, "loss_aux_layer_23": 0.298828125, "loss_aux_layer_3": 0.150634765625, "loss_aux_layer_4": 0.148193359375, "loss_aux_layer_5": 0.1484375, "loss_aux_layer_6": 0.14501953125, "loss_aux_layer_7": 0.133544921875, "loss_aux_layer_8": 0.13232421875, "loss_aux_layer_9": 0.1309814453125, "step": 311, "total_loss": 0.8439320772886276 }, { "epoch": 0.06176994654523857, "grad_norm": 0.8595796823501587, "learning_rate": 5e-05, "llm_loss": 0.7341238707304001, "loss": 3.6333, "loss_aux_layer_0": 0.0418701171875, "loss_aux_layer_1": 0.1273193359375, "loss_aux_layer_10": 0.13427734375, "loss_aux_layer_11": 0.141845703125, "loss_aux_layer_12": 0.15283203125, "loss_aux_layer_13": 0.16455078125, "loss_aux_layer_14": 0.181396484375, "loss_aux_layer_15": 0.1962890625, "loss_aux_layer_16": 0.211181640625, "loss_aux_layer_17": 0.21728515625, "loss_aux_layer_18": 0.226806640625, "loss_aux_layer_19": 0.224609375, "loss_aux_layer_2": 0.14599609375, "loss_aux_layer_20": 0.22705078125, "loss_aux_layer_21": 0.22802734375, "loss_aux_layer_22": 0.250244140625, "loss_aux_layer_23": 0.29541015625, "loss_aux_layer_3": 0.155029296875, "loss_aux_layer_4": 0.15283203125, "loss_aux_layer_5": 0.15380859375, "loss_aux_layer_6": 0.14990234375, "loss_aux_layer_7": 0.135986328125, "loss_aux_layer_8": 0.134521484375, "loss_aux_layer_9": 0.132568359375, "step": 312, "total_loss": 0.9083255380392075 }, { "epoch": 0.06196792714313997, "grad_norm": 1.5285441875457764, "learning_rate": 5e-05, "llm_loss": 0.6696313172578812, "loss": 3.3532, "loss_aux_layer_0": 0.04156494140625, "loss_aux_layer_1": 0.123291015625, "loss_aux_layer_10": 0.12939453125, "loss_aux_layer_11": 0.136474609375, "loss_aux_layer_12": 0.14697265625, "loss_aux_layer_13": 0.158447265625, "loss_aux_layer_14": 0.175048828125, "loss_aux_layer_15": 0.18994140625, "loss_aux_layer_16": 0.204345703125, "loss_aux_layer_17": 0.2099609375, "loss_aux_layer_18": 0.22119140625, "loss_aux_layer_19": 0.218994140625, "loss_aux_layer_2": 0.139892578125, "loss_aux_layer_20": 0.22216796875, "loss_aux_layer_21": 0.22509765625, "loss_aux_layer_22": 0.244384765625, "loss_aux_layer_23": 0.28857421875, "loss_aux_layer_3": 0.148193359375, "loss_aux_layer_4": 0.145751953125, "loss_aux_layer_5": 0.14697265625, "loss_aux_layer_6": 0.1435546875, "loss_aux_layer_7": 0.130859375, "loss_aux_layer_8": 0.1298828125, "loss_aux_layer_9": 0.128173828125, "step": 313, "total_loss": 0.8382943272590637 }, { "epoch": 0.06216590774104138, "grad_norm": 1.54703688621521, "learning_rate": 5e-05, "llm_loss": 0.6547525823116302, "loss": 3.3064, "loss_aux_layer_0": 0.04107666015625, "loss_aux_layer_1": 0.1240234375, "loss_aux_layer_10": 0.130615234375, "loss_aux_layer_11": 0.138671875, "loss_aux_layer_12": 0.149658203125, "loss_aux_layer_13": 0.160400390625, "loss_aux_layer_14": 0.17822265625, "loss_aux_layer_15": 0.1943359375, "loss_aux_layer_16": 0.210205078125, "loss_aux_layer_17": 0.216552734375, "loss_aux_layer_18": 0.226318359375, "loss_aux_layer_19": 0.224365234375, "loss_aux_layer_2": 0.1416015625, "loss_aux_layer_20": 0.2265625, "loss_aux_layer_21": 0.228271484375, "loss_aux_layer_22": 0.250732421875, "loss_aux_layer_23": 0.296875, "loss_aux_layer_3": 0.150390625, "loss_aux_layer_4": 0.148193359375, "loss_aux_layer_5": 0.14892578125, "loss_aux_layer_6": 0.145263671875, "loss_aux_layer_7": 0.1328125, "loss_aux_layer_8": 0.131103515625, "loss_aux_layer_9": 0.129638671875, "step": 314, "total_loss": 0.826609805226326 }, { "epoch": 0.06236388833894278, "grad_norm": 1.2200266122817993, "learning_rate": 5e-05, "llm_loss": 0.6353557258844376, "loss": 3.2331, "loss_aux_layer_0": 0.04046630859375, "loss_aux_layer_1": 0.124755859375, "loss_aux_layer_10": 0.1339111328125, "loss_aux_layer_11": 0.14208984375, "loss_aux_layer_12": 0.152587890625, "loss_aux_layer_13": 0.1640625, "loss_aux_layer_14": 0.180908203125, "loss_aux_layer_15": 0.196044921875, "loss_aux_layer_16": 0.210693359375, "loss_aux_layer_17": 0.216552734375, "loss_aux_layer_18": 0.226318359375, "loss_aux_layer_19": 0.22314453125, "loss_aux_layer_2": 0.142578125, "loss_aux_layer_20": 0.22509765625, "loss_aux_layer_21": 0.2275390625, "loss_aux_layer_22": 0.2490234375, "loss_aux_layer_23": 0.29443359375, "loss_aux_layer_3": 0.1513671875, "loss_aux_layer_4": 0.14990234375, "loss_aux_layer_5": 0.15087890625, "loss_aux_layer_6": 0.147216796875, "loss_aux_layer_7": 0.135009765625, "loss_aux_layer_8": 0.1337890625, "loss_aux_layer_9": 0.1322021484375, "step": 315, "total_loss": 0.8082632422447205 }, { "epoch": 0.06256186893684419, "grad_norm": 1.4861053228378296, "learning_rate": 5e-05, "llm_loss": 0.7023740261793137, "loss": 3.5032, "loss_aux_layer_0": 0.04107666015625, "loss_aux_layer_1": 0.125244140625, "loss_aux_layer_10": 0.13232421875, "loss_aux_layer_11": 0.14013671875, "loss_aux_layer_12": 0.151611328125, "loss_aux_layer_13": 0.162841796875, "loss_aux_layer_14": 0.181884765625, "loss_aux_layer_15": 0.197265625, "loss_aux_layer_16": 0.212646484375, "loss_aux_layer_17": 0.21826171875, "loss_aux_layer_18": 0.228759765625, "loss_aux_layer_19": 0.226806640625, "loss_aux_layer_2": 0.142333984375, "loss_aux_layer_20": 0.2294921875, "loss_aux_layer_21": 0.230712890625, "loss_aux_layer_22": 0.252197265625, "loss_aux_layer_23": 0.296875, "loss_aux_layer_3": 0.1513671875, "loss_aux_layer_4": 0.149169921875, "loss_aux_layer_5": 0.14990234375, "loss_aux_layer_6": 0.14599609375, "loss_aux_layer_7": 0.13330078125, "loss_aux_layer_8": 0.1318359375, "loss_aux_layer_9": 0.130615234375, "step": 316, "total_loss": 0.8758038431406021 }, { "epoch": 0.0627598495347456, "grad_norm": 1.8156648874282837, "learning_rate": 5e-05, "llm_loss": 0.6693378686904907, "loss": 3.3743, "loss_aux_layer_0": 0.04107666015625, "loss_aux_layer_1": 0.12548828125, "loss_aux_layer_10": 0.13525390625, "loss_aux_layer_11": 0.143310546875, "loss_aux_layer_12": 0.15380859375, "loss_aux_layer_13": 0.164794921875, "loss_aux_layer_14": 0.18212890625, "loss_aux_layer_15": 0.197509765625, "loss_aux_layer_16": 0.21240234375, "loss_aux_layer_17": 0.217529296875, "loss_aux_layer_18": 0.22705078125, "loss_aux_layer_19": 0.225341796875, "loss_aux_layer_2": 0.143310546875, "loss_aux_layer_20": 0.22705078125, "loss_aux_layer_21": 0.22998046875, "loss_aux_layer_22": 0.252197265625, "loss_aux_layer_23": 0.298828125, "loss_aux_layer_3": 0.15234375, "loss_aux_layer_4": 0.150634765625, "loss_aux_layer_5": 0.151611328125, "loss_aux_layer_6": 0.14794921875, "loss_aux_layer_7": 0.13525390625, "loss_aux_layer_8": 0.13427734375, "loss_aux_layer_9": 0.133544921875, "step": 317, "total_loss": 0.8435792624950409 }, { "epoch": 0.062957830132647, "grad_norm": 0.9157980680465698, "learning_rate": 5e-05, "llm_loss": 0.6763131022453308, "loss": 3.3896, "loss_aux_layer_0": 0.04132080078125, "loss_aux_layer_1": 0.124267578125, "loss_aux_layer_10": 0.1318359375, "loss_aux_layer_11": 0.1396484375, "loss_aux_layer_12": 0.150146484375, "loss_aux_layer_13": 0.160888671875, "loss_aux_layer_14": 0.177490234375, "loss_aux_layer_15": 0.191650390625, "loss_aux_layer_16": 0.20654296875, "loss_aux_layer_17": 0.211669921875, "loss_aux_layer_18": 0.2216796875, "loss_aux_layer_19": 0.219970703125, "loss_aux_layer_2": 0.142578125, "loss_aux_layer_20": 0.22314453125, "loss_aux_layer_21": 0.2255859375, "loss_aux_layer_22": 0.246826171875, "loss_aux_layer_23": 0.29248046875, "loss_aux_layer_3": 0.1513671875, "loss_aux_layer_4": 0.150146484375, "loss_aux_layer_5": 0.151123046875, "loss_aux_layer_6": 0.1474609375, "loss_aux_layer_7": 0.13427734375, "loss_aux_layer_8": 0.132568359375, "loss_aux_layer_9": 0.130615234375, "step": 318, "total_loss": 0.8474057912826538 }, { "epoch": 0.0631558107305484, "grad_norm": 1.5821962356567383, "learning_rate": 5e-05, "llm_loss": 0.6491387486457825, "loss": 3.2721, "loss_aux_layer_0": 0.04351806640625, "loss_aux_layer_1": 0.1201171875, "loss_aux_layer_10": 0.12744140625, "loss_aux_layer_11": 0.134521484375, "loss_aux_layer_12": 0.14501953125, "loss_aux_layer_13": 0.1572265625, "loss_aux_layer_14": 0.1748046875, "loss_aux_layer_15": 0.190673828125, "loss_aux_layer_16": 0.206298828125, "loss_aux_layer_17": 0.21240234375, "loss_aux_layer_18": 0.22265625, "loss_aux_layer_19": 0.222900390625, "loss_aux_layer_2": 0.1348876953125, "loss_aux_layer_20": 0.226806640625, "loss_aux_layer_21": 0.23046875, "loss_aux_layer_22": 0.254150390625, "loss_aux_layer_23": 0.2998046875, "loss_aux_layer_3": 0.143798828125, "loss_aux_layer_4": 0.142333984375, "loss_aux_layer_5": 0.14306640625, "loss_aux_layer_6": 0.1396484375, "loss_aux_layer_7": 0.1279296875, "loss_aux_layer_8": 0.1268310546875, "loss_aux_layer_9": 0.1258544921875, "step": 319, "total_loss": 0.8180214613676071 }, { "epoch": 0.06335379132844982, "grad_norm": 1.2705398797988892, "learning_rate": 5e-05, "llm_loss": 0.568808451294899, "loss": 2.9442, "loss_aux_layer_0": 0.041748046875, "loss_aux_layer_1": 0.117431640625, "loss_aux_layer_10": 0.1253662109375, "loss_aux_layer_11": 0.1326904296875, "loss_aux_layer_12": 0.1435546875, "loss_aux_layer_13": 0.156005859375, "loss_aux_layer_14": 0.1748046875, "loss_aux_layer_15": 0.191162109375, "loss_aux_layer_16": 0.2060546875, "loss_aux_layer_17": 0.2138671875, "loss_aux_layer_18": 0.224609375, "loss_aux_layer_19": 0.2236328125, "loss_aux_layer_2": 0.13232421875, "loss_aux_layer_20": 0.226806640625, "loss_aux_layer_21": 0.22900390625, "loss_aux_layer_22": 0.24951171875, "loss_aux_layer_23": 0.29638671875, "loss_aux_layer_3": 0.140380859375, "loss_aux_layer_4": 0.138427734375, "loss_aux_layer_5": 0.139892578125, "loss_aux_layer_6": 0.13671875, "loss_aux_layer_7": 0.125244140625, "loss_aux_layer_8": 0.1246337890625, "loss_aux_layer_9": 0.1236572265625, "step": 320, "total_loss": 0.7360409498214722 }, { "epoch": 0.06355177192635121, "grad_norm": 0.8210428357124329, "learning_rate": 5e-05, "llm_loss": 0.5663638412952423, "loss": 2.9337, "loss_aux_layer_0": 0.0399169921875, "loss_aux_layer_1": 0.1170654296875, "loss_aux_layer_10": 0.1270751953125, "loss_aux_layer_11": 0.13427734375, "loss_aux_layer_12": 0.144287109375, "loss_aux_layer_13": 0.15478515625, "loss_aux_layer_14": 0.172119140625, "loss_aux_layer_15": 0.187744140625, "loss_aux_layer_16": 0.20263671875, "loss_aux_layer_17": 0.2080078125, "loss_aux_layer_18": 0.219482421875, "loss_aux_layer_19": 0.219482421875, "loss_aux_layer_2": 0.135009765625, "loss_aux_layer_20": 0.223876953125, "loss_aux_layer_21": 0.228515625, "loss_aux_layer_22": 0.2490234375, "loss_aux_layer_23": 0.29541015625, "loss_aux_layer_3": 0.143798828125, "loss_aux_layer_4": 0.142333984375, "loss_aux_layer_5": 0.143310546875, "loss_aux_layer_6": 0.139892578125, "loss_aux_layer_7": 0.12841796875, "loss_aux_layer_8": 0.1270751953125, "loss_aux_layer_9": 0.1259765625, "step": 321, "total_loss": 0.7334157973527908 }, { "epoch": 0.06374975252425262, "grad_norm": 1.0508836507797241, "learning_rate": 5e-05, "llm_loss": 0.684890404343605, "loss": 3.4143, "loss_aux_layer_0": 0.04071044921875, "loss_aux_layer_1": 0.1197509765625, "loss_aux_layer_10": 0.128173828125, "loss_aux_layer_11": 0.135498046875, "loss_aux_layer_12": 0.14599609375, "loss_aux_layer_13": 0.157470703125, "loss_aux_layer_14": 0.1748046875, "loss_aux_layer_15": 0.19140625, "loss_aux_layer_16": 0.20654296875, "loss_aux_layer_17": 0.212890625, "loss_aux_layer_18": 0.2236328125, "loss_aux_layer_19": 0.22314453125, "loss_aux_layer_2": 0.135498046875, "loss_aux_layer_20": 0.2265625, "loss_aux_layer_21": 0.228271484375, "loss_aux_layer_22": 0.25, "loss_aux_layer_23": 0.29638671875, "loss_aux_layer_3": 0.14404296875, "loss_aux_layer_4": 0.142333984375, "loss_aux_layer_5": 0.1435546875, "loss_aux_layer_6": 0.14013671875, "loss_aux_layer_7": 0.12841796875, "loss_aux_layer_8": 0.1273193359375, "loss_aux_layer_9": 0.1260986328125, "step": 322, "total_loss": 0.8535769581794739 }, { "epoch": 0.06394773312215403, "grad_norm": 1.4825226068496704, "learning_rate": 5e-05, "llm_loss": 0.6803658083081245, "loss": 3.4266, "loss_aux_layer_0": 0.0439453125, "loss_aux_layer_1": 0.129150390625, "loss_aux_layer_10": 0.136474609375, "loss_aux_layer_11": 0.14453125, "loss_aux_layer_12": 0.1552734375, "loss_aux_layer_13": 0.166015625, "loss_aux_layer_14": 0.18359375, "loss_aux_layer_15": 0.1982421875, "loss_aux_layer_16": 0.21240234375, "loss_aux_layer_17": 0.218017578125, "loss_aux_layer_18": 0.228515625, "loss_aux_layer_19": 0.2265625, "loss_aux_layer_2": 0.14794921875, "loss_aux_layer_20": 0.22900390625, "loss_aux_layer_21": 0.230224609375, "loss_aux_layer_22": 0.25244140625, "loss_aux_layer_23": 0.29833984375, "loss_aux_layer_3": 0.156982421875, "loss_aux_layer_4": 0.15478515625, "loss_aux_layer_5": 0.155517578125, "loss_aux_layer_6": 0.152099609375, "loss_aux_layer_7": 0.138671875, "loss_aux_layer_8": 0.136962890625, "loss_aux_layer_9": 0.13525390625, "step": 323, "total_loss": 0.8566534668207169 }, { "epoch": 0.06414571372005544, "grad_norm": 1.204494595527649, "learning_rate": 5e-05, "llm_loss": 0.6465491652488708, "loss": 3.2752, "loss_aux_layer_0": 0.0408935546875, "loss_aux_layer_1": 0.12646484375, "loss_aux_layer_10": 0.132080078125, "loss_aux_layer_11": 0.13916015625, "loss_aux_layer_12": 0.14990234375, "loss_aux_layer_13": 0.1611328125, "loss_aux_layer_14": 0.178466796875, "loss_aux_layer_15": 0.1943359375, "loss_aux_layer_16": 0.20947265625, "loss_aux_layer_17": 0.215576171875, "loss_aux_layer_18": 0.224853515625, "loss_aux_layer_19": 0.222900390625, "loss_aux_layer_2": 0.142333984375, "loss_aux_layer_20": 0.226318359375, "loss_aux_layer_21": 0.229248046875, "loss_aux_layer_22": 0.251953125, "loss_aux_layer_23": 0.2978515625, "loss_aux_layer_3": 0.151123046875, "loss_aux_layer_4": 0.149169921875, "loss_aux_layer_5": 0.14990234375, "loss_aux_layer_6": 0.14599609375, "loss_aux_layer_7": 0.1328125, "loss_aux_layer_8": 0.132080078125, "loss_aux_layer_9": 0.130126953125, "step": 324, "total_loss": 0.8187989741563797 }, { "epoch": 0.06434369431795683, "grad_norm": 1.2797398567199707, "learning_rate": 5e-05, "llm_loss": 0.6618870347738266, "loss": 3.334, "loss_aux_layer_0": 0.039794921875, "loss_aux_layer_1": 0.1239013671875, "loss_aux_layer_10": 0.1309814453125, "loss_aux_layer_11": 0.138427734375, "loss_aux_layer_12": 0.149169921875, "loss_aux_layer_13": 0.161376953125, "loss_aux_layer_14": 0.178955078125, "loss_aux_layer_15": 0.19482421875, "loss_aux_layer_16": 0.20947265625, "loss_aux_layer_17": 0.215576171875, "loss_aux_layer_18": 0.226318359375, "loss_aux_layer_19": 0.224853515625, "loss_aux_layer_2": 0.14013671875, "loss_aux_layer_20": 0.227294921875, "loss_aux_layer_21": 0.2294921875, "loss_aux_layer_22": 0.250732421875, "loss_aux_layer_23": 0.29541015625, "loss_aux_layer_3": 0.149169921875, "loss_aux_layer_4": 0.14697265625, "loss_aux_layer_5": 0.148193359375, "loss_aux_layer_6": 0.143798828125, "loss_aux_layer_7": 0.1324462890625, "loss_aux_layer_8": 0.1309814453125, "loss_aux_layer_9": 0.12939453125, "step": 325, "total_loss": 0.8334926515817642 }, { "epoch": 0.06454167491585824, "grad_norm": 1.277974247932434, "learning_rate": 5e-05, "llm_loss": 0.6621056199073792, "loss": 3.3331, "loss_aux_layer_0": 0.04315185546875, "loss_aux_layer_1": 0.125244140625, "loss_aux_layer_10": 0.131103515625, "loss_aux_layer_11": 0.138916015625, "loss_aux_layer_12": 0.149169921875, "loss_aux_layer_13": 0.16015625, "loss_aux_layer_14": 0.177978515625, "loss_aux_layer_15": 0.193115234375, "loss_aux_layer_16": 0.207763671875, "loss_aux_layer_17": 0.21337890625, "loss_aux_layer_18": 0.223876953125, "loss_aux_layer_19": 0.22216796875, "loss_aux_layer_2": 0.141357421875, "loss_aux_layer_20": 0.22509765625, "loss_aux_layer_21": 0.227783203125, "loss_aux_layer_22": 0.24951171875, "loss_aux_layer_23": 0.2958984375, "loss_aux_layer_3": 0.150146484375, "loss_aux_layer_4": 0.1484375, "loss_aux_layer_5": 0.1484375, "loss_aux_layer_6": 0.144287109375, "loss_aux_layer_7": 0.1318359375, "loss_aux_layer_8": 0.130615234375, "loss_aux_layer_9": 0.129638671875, "step": 326, "total_loss": 0.8332707583904266 }, { "epoch": 0.06473965551375965, "grad_norm": 1.7755167484283447, "learning_rate": 5e-05, "llm_loss": 0.683961495757103, "loss": 3.4327, "loss_aux_layer_0": 0.0423583984375, "loss_aux_layer_1": 0.126708984375, "loss_aux_layer_10": 0.135498046875, "loss_aux_layer_11": 0.142578125, "loss_aux_layer_12": 0.15283203125, "loss_aux_layer_13": 0.163818359375, "loss_aux_layer_14": 0.18115234375, "loss_aux_layer_15": 0.195556640625, "loss_aux_layer_16": 0.20947265625, "loss_aux_layer_17": 0.215576171875, "loss_aux_layer_18": 0.22607421875, "loss_aux_layer_19": 0.224609375, "loss_aux_layer_2": 0.14404296875, "loss_aux_layer_20": 0.2275390625, "loss_aux_layer_21": 0.22900390625, "loss_aux_layer_22": 0.2509765625, "loss_aux_layer_23": 0.29736328125, "loss_aux_layer_3": 0.1533203125, "loss_aux_layer_4": 0.15234375, "loss_aux_layer_5": 0.15283203125, "loss_aux_layer_6": 0.149658203125, "loss_aux_layer_7": 0.13720703125, "loss_aux_layer_8": 0.13623046875, "loss_aux_layer_9": 0.13427734375, "step": 327, "total_loss": 0.8581628203392029 }, { "epoch": 0.06493763611166106, "grad_norm": 1.287034034729004, "learning_rate": 5e-05, "llm_loss": 0.6551598459482193, "loss": 3.2925, "loss_aux_layer_0": 0.03955078125, "loss_aux_layer_1": 0.123291015625, "loss_aux_layer_10": 0.12841796875, "loss_aux_layer_11": 0.13623046875, "loss_aux_layer_12": 0.146484375, "loss_aux_layer_13": 0.1572265625, "loss_aux_layer_14": 0.173828125, "loss_aux_layer_15": 0.1884765625, "loss_aux_layer_16": 0.202880859375, "loss_aux_layer_17": 0.208984375, "loss_aux_layer_18": 0.22021484375, "loss_aux_layer_19": 0.2177734375, "loss_aux_layer_2": 0.13818359375, "loss_aux_layer_20": 0.22119140625, "loss_aux_layer_21": 0.22412109375, "loss_aux_layer_22": 0.24560546875, "loss_aux_layer_23": 0.29150390625, "loss_aux_layer_3": 0.147216796875, "loss_aux_layer_4": 0.14599609375, "loss_aux_layer_5": 0.146240234375, "loss_aux_layer_6": 0.142333984375, "loss_aux_layer_7": 0.1297607421875, "loss_aux_layer_8": 0.1282958984375, "loss_aux_layer_9": 0.1270751953125, "step": 328, "total_loss": 0.8231174796819687 }, { "epoch": 0.06513561670956246, "grad_norm": 2.2743306159973145, "learning_rate": 5e-05, "llm_loss": 0.5889516696333885, "loss": 3.0448, "loss_aux_layer_0": 0.0394287109375, "loss_aux_layer_1": 0.123046875, "loss_aux_layer_10": 0.1324462890625, "loss_aux_layer_11": 0.14013671875, "loss_aux_layer_12": 0.15087890625, "loss_aux_layer_13": 0.16357421875, "loss_aux_layer_14": 0.180908203125, "loss_aux_layer_15": 0.196533203125, "loss_aux_layer_16": 0.211669921875, "loss_aux_layer_17": 0.216552734375, "loss_aux_layer_18": 0.2265625, "loss_aux_layer_19": 0.2255859375, "loss_aux_layer_2": 0.138671875, "loss_aux_layer_20": 0.227294921875, "loss_aux_layer_21": 0.23046875, "loss_aux_layer_22": 0.2529296875, "loss_aux_layer_23": 0.2998046875, "loss_aux_layer_3": 0.147705078125, "loss_aux_layer_4": 0.146240234375, "loss_aux_layer_5": 0.14697265625, "loss_aux_layer_6": 0.143310546875, "loss_aux_layer_7": 0.132568359375, "loss_aux_layer_8": 0.1314697265625, "loss_aux_layer_9": 0.1300048828125, "step": 329, "total_loss": 0.7611963748931885 }, { "epoch": 0.06533359730746387, "grad_norm": 1.5269118547439575, "learning_rate": 5e-05, "llm_loss": 0.5765106528997421, "loss": 2.9764, "loss_aux_layer_0": 0.041015625, "loss_aux_layer_1": 0.1199951171875, "loss_aux_layer_10": 0.126953125, "loss_aux_layer_11": 0.13427734375, "loss_aux_layer_12": 0.14453125, "loss_aux_layer_13": 0.1552734375, "loss_aux_layer_14": 0.17333984375, "loss_aux_layer_15": 0.188720703125, "loss_aux_layer_16": 0.204345703125, "loss_aux_layer_17": 0.210205078125, "loss_aux_layer_18": 0.220703125, "loss_aux_layer_19": 0.220703125, "loss_aux_layer_2": 0.135986328125, "loss_aux_layer_20": 0.2236328125, "loss_aux_layer_21": 0.2275390625, "loss_aux_layer_22": 0.248779296875, "loss_aux_layer_23": 0.294921875, "loss_aux_layer_3": 0.144287109375, "loss_aux_layer_4": 0.142578125, "loss_aux_layer_5": 0.1435546875, "loss_aux_layer_6": 0.139892578125, "loss_aux_layer_7": 0.128662109375, "loss_aux_layer_8": 0.12744140625, "loss_aux_layer_9": 0.1258544921875, "step": 330, "total_loss": 0.7440906167030334 }, { "epoch": 0.06553157790536528, "grad_norm": 1.5595195293426514, "learning_rate": 5e-05, "llm_loss": 0.7042233943939209, "loss": 3.4783, "loss_aux_layer_0": 0.03857421875, "loss_aux_layer_1": 0.1156005859375, "loss_aux_layer_10": 0.1246337890625, "loss_aux_layer_11": 0.1314697265625, "loss_aux_layer_12": 0.142578125, "loss_aux_layer_13": 0.154296875, "loss_aux_layer_14": 0.172119140625, "loss_aux_layer_15": 0.1875, "loss_aux_layer_16": 0.2021484375, "loss_aux_layer_17": 0.208740234375, "loss_aux_layer_18": 0.21923828125, "loss_aux_layer_19": 0.218994140625, "loss_aux_layer_2": 0.131103515625, "loss_aux_layer_20": 0.22216796875, "loss_aux_layer_21": 0.226318359375, "loss_aux_layer_22": 0.2490234375, "loss_aux_layer_23": 0.298828125, "loss_aux_layer_3": 0.14013671875, "loss_aux_layer_4": 0.138427734375, "loss_aux_layer_5": 0.139404296875, "loss_aux_layer_6": 0.135498046875, "loss_aux_layer_7": 0.124755859375, "loss_aux_layer_8": 0.1241455078125, "loss_aux_layer_9": 0.1226806640625, "step": 331, "total_loss": 0.8695826977491379 }, { "epoch": 0.06572955850326669, "grad_norm": 1.0438753366470337, "learning_rate": 5e-05, "llm_loss": 0.6329388618469238, "loss": 3.2103, "loss_aux_layer_0": 0.041259765625, "loss_aux_layer_1": 0.1231689453125, "loss_aux_layer_10": 0.1292724609375, "loss_aux_layer_11": 0.13671875, "loss_aux_layer_12": 0.14697265625, "loss_aux_layer_13": 0.158447265625, "loss_aux_layer_14": 0.17578125, "loss_aux_layer_15": 0.191162109375, "loss_aux_layer_16": 0.206298828125, "loss_aux_layer_17": 0.212646484375, "loss_aux_layer_18": 0.223388671875, "loss_aux_layer_19": 0.222412109375, "loss_aux_layer_2": 0.138671875, "loss_aux_layer_20": 0.2255859375, "loss_aux_layer_21": 0.227294921875, "loss_aux_layer_22": 0.247802734375, "loss_aux_layer_23": 0.2939453125, "loss_aux_layer_3": 0.147216796875, "loss_aux_layer_4": 0.14599609375, "loss_aux_layer_5": 0.14697265625, "loss_aux_layer_6": 0.142822265625, "loss_aux_layer_7": 0.1304931640625, "loss_aux_layer_8": 0.12890625, "loss_aux_layer_9": 0.1275634765625, "step": 332, "total_loss": 0.8025635182857513 }, { "epoch": 0.06592753910116808, "grad_norm": 0.8295272588729858, "learning_rate": 5e-05, "llm_loss": 0.5770601630210876, "loss": 2.97, "loss_aux_layer_0": 0.0396728515625, "loss_aux_layer_1": 0.119384765625, "loss_aux_layer_10": 0.124755859375, "loss_aux_layer_11": 0.132080078125, "loss_aux_layer_12": 0.142578125, "loss_aux_layer_13": 0.154052734375, "loss_aux_layer_14": 0.1708984375, "loss_aux_layer_15": 0.186279296875, "loss_aux_layer_16": 0.201416015625, "loss_aux_layer_17": 0.207275390625, "loss_aux_layer_18": 0.2177734375, "loss_aux_layer_19": 0.21728515625, "loss_aux_layer_2": 0.134521484375, "loss_aux_layer_20": 0.220947265625, "loss_aux_layer_21": 0.224365234375, "loss_aux_layer_22": 0.245361328125, "loss_aux_layer_23": 0.2919921875, "loss_aux_layer_3": 0.14306640625, "loss_aux_layer_4": 0.141845703125, "loss_aux_layer_5": 0.14208984375, "loss_aux_layer_6": 0.138427734375, "loss_aux_layer_7": 0.1260986328125, "loss_aux_layer_8": 0.125, "loss_aux_layer_9": 0.1236572265625, "step": 333, "total_loss": 0.7424964159727097 }, { "epoch": 0.06612551969906949, "grad_norm": 0.9452935457229614, "learning_rate": 5e-05, "llm_loss": 0.7265636175870895, "loss": 3.5603, "loss_aux_layer_0": 0.0399169921875, "loss_aux_layer_1": 0.1163330078125, "loss_aux_layer_10": 0.12158203125, "loss_aux_layer_11": 0.128662109375, "loss_aux_layer_12": 0.1396484375, "loss_aux_layer_13": 0.151123046875, "loss_aux_layer_14": 0.16943359375, "loss_aux_layer_15": 0.18603515625, "loss_aux_layer_16": 0.201416015625, "loss_aux_layer_17": 0.208740234375, "loss_aux_layer_18": 0.2197265625, "loss_aux_layer_19": 0.21875, "loss_aux_layer_2": 0.1297607421875, "loss_aux_layer_20": 0.22314453125, "loss_aux_layer_21": 0.224609375, "loss_aux_layer_22": 0.2451171875, "loss_aux_layer_23": 0.29248046875, "loss_aux_layer_3": 0.1376953125, "loss_aux_layer_4": 0.13671875, "loss_aux_layer_5": 0.13720703125, "loss_aux_layer_6": 0.133544921875, "loss_aux_layer_7": 0.12158203125, "loss_aux_layer_8": 0.120849609375, "loss_aux_layer_9": 0.1199951171875, "step": 334, "total_loss": 0.8900823593139648 }, { "epoch": 0.0663235002969709, "grad_norm": 0.9775875210762024, "learning_rate": 5e-05, "llm_loss": 0.7076838463544846, "loss": 3.5004, "loss_aux_layer_0": 0.04071044921875, "loss_aux_layer_1": 0.1241455078125, "loss_aux_layer_10": 0.126953125, "loss_aux_layer_11": 0.1341552734375, "loss_aux_layer_12": 0.14453125, "loss_aux_layer_13": 0.156005859375, "loss_aux_layer_14": 0.173095703125, "loss_aux_layer_15": 0.1884765625, "loss_aux_layer_16": 0.20361328125, "loss_aux_layer_17": 0.20947265625, "loss_aux_layer_18": 0.220458984375, "loss_aux_layer_19": 0.218994140625, "loss_aux_layer_2": 0.138671875, "loss_aux_layer_20": 0.22216796875, "loss_aux_layer_21": 0.2236328125, "loss_aux_layer_22": 0.24462890625, "loss_aux_layer_23": 0.28955078125, "loss_aux_layer_3": 0.14697265625, "loss_aux_layer_4": 0.14501953125, "loss_aux_layer_5": 0.14501953125, "loss_aux_layer_6": 0.140869140625, "loss_aux_layer_7": 0.1285400390625, "loss_aux_layer_8": 0.1273193359375, "loss_aux_layer_9": 0.12548828125, "step": 335, "total_loss": 0.8751106113195419 }, { "epoch": 0.06652148089487231, "grad_norm": 1.3529177904129028, "learning_rate": 5e-05, "llm_loss": 0.6806557923555374, "loss": 3.3916, "loss_aux_layer_0": 0.03790283203125, "loss_aux_layer_1": 0.1201171875, "loss_aux_layer_10": 0.1265869140625, "loss_aux_layer_11": 0.134521484375, "loss_aux_layer_12": 0.145263671875, "loss_aux_layer_13": 0.15673828125, "loss_aux_layer_14": 0.173583984375, "loss_aux_layer_15": 0.1884765625, "loss_aux_layer_16": 0.203369140625, "loss_aux_layer_17": 0.210205078125, "loss_aux_layer_18": 0.220947265625, "loss_aux_layer_19": 0.219970703125, "loss_aux_layer_2": 0.1337890625, "loss_aux_layer_20": 0.2236328125, "loss_aux_layer_21": 0.22705078125, "loss_aux_layer_22": 0.250732421875, "loss_aux_layer_23": 0.29736328125, "loss_aux_layer_3": 0.142822265625, "loss_aux_layer_4": 0.1416015625, "loss_aux_layer_5": 0.14208984375, "loss_aux_layer_6": 0.138671875, "loss_aux_layer_7": 0.1270751953125, "loss_aux_layer_8": 0.1260986328125, "loss_aux_layer_9": 0.1248779296875, "step": 336, "total_loss": 0.8478911966085434 }, { "epoch": 0.0667194614927737, "grad_norm": 0.8834431767463684, "learning_rate": 5e-05, "llm_loss": 0.6149764508008957, "loss": 3.141, "loss_aux_layer_0": 0.0400390625, "loss_aux_layer_1": 0.12646484375, "loss_aux_layer_10": 0.13037109375, "loss_aux_layer_11": 0.137451171875, "loss_aux_layer_12": 0.147705078125, "loss_aux_layer_13": 0.15869140625, "loss_aux_layer_14": 0.176025390625, "loss_aux_layer_15": 0.19140625, "loss_aux_layer_16": 0.2060546875, "loss_aux_layer_17": 0.211669921875, "loss_aux_layer_18": 0.22119140625, "loss_aux_layer_19": 0.2197265625, "loss_aux_layer_2": 0.14208984375, "loss_aux_layer_20": 0.22314453125, "loss_aux_layer_21": 0.224853515625, "loss_aux_layer_22": 0.246337890625, "loss_aux_layer_23": 0.2919921875, "loss_aux_layer_3": 0.1513671875, "loss_aux_layer_4": 0.149658203125, "loss_aux_layer_5": 0.14990234375, "loss_aux_layer_6": 0.1455078125, "loss_aux_layer_7": 0.13330078125, "loss_aux_layer_8": 0.13134765625, "loss_aux_layer_9": 0.129150390625, "step": 337, "total_loss": 0.7852468937635422 }, { "epoch": 0.06691744209067511, "grad_norm": 0.8895295858383179, "learning_rate": 5e-05, "llm_loss": 0.7038462907075882, "loss": 3.474, "loss_aux_layer_0": 0.03729248046875, "loss_aux_layer_1": 0.1190185546875, "loss_aux_layer_10": 0.12451171875, "loss_aux_layer_11": 0.1317138671875, "loss_aux_layer_12": 0.14208984375, "loss_aux_layer_13": 0.1533203125, "loss_aux_layer_14": 0.169921875, "loss_aux_layer_15": 0.186279296875, "loss_aux_layer_16": 0.20068359375, "loss_aux_layer_17": 0.20751953125, "loss_aux_layer_18": 0.2177734375, "loss_aux_layer_19": 0.216796875, "loss_aux_layer_2": 0.133544921875, "loss_aux_layer_20": 0.220703125, "loss_aux_layer_21": 0.222412109375, "loss_aux_layer_22": 0.241943359375, "loss_aux_layer_23": 0.28759765625, "loss_aux_layer_3": 0.142578125, "loss_aux_layer_4": 0.14111328125, "loss_aux_layer_5": 0.1416015625, "loss_aux_layer_6": 0.13818359375, "loss_aux_layer_7": 0.125732421875, "loss_aux_layer_8": 0.1248779296875, "loss_aux_layer_9": 0.1231689453125, "step": 338, "total_loss": 0.8684934079647064 }, { "epoch": 0.06711542268857652, "grad_norm": 0.6964613795280457, "learning_rate": 5e-05, "llm_loss": 0.6210818737745285, "loss": 3.1645, "loss_aux_layer_0": 0.0389404296875, "loss_aux_layer_1": 0.12353515625, "loss_aux_layer_10": 0.12890625, "loss_aux_layer_11": 0.13623046875, "loss_aux_layer_12": 0.146484375, "loss_aux_layer_13": 0.15771484375, "loss_aux_layer_14": 0.175537109375, "loss_aux_layer_15": 0.19140625, "loss_aux_layer_16": 0.206787109375, "loss_aux_layer_17": 0.21337890625, "loss_aux_layer_18": 0.224365234375, "loss_aux_layer_19": 0.224365234375, "loss_aux_layer_2": 0.137939453125, "loss_aux_layer_20": 0.227783203125, "loss_aux_layer_21": 0.229736328125, "loss_aux_layer_22": 0.251708984375, "loss_aux_layer_23": 0.29833984375, "loss_aux_layer_3": 0.14697265625, "loss_aux_layer_4": 0.1455078125, "loss_aux_layer_5": 0.145751953125, "loss_aux_layer_6": 0.142578125, "loss_aux_layer_7": 0.1304931640625, "loss_aux_layer_8": 0.1290283203125, "loss_aux_layer_9": 0.1275634765625, "step": 339, "total_loss": 0.79111547768116 }, { "epoch": 0.06731340328647792, "grad_norm": 1.3350846767425537, "learning_rate": 5e-05, "llm_loss": 0.610927626490593, "loss": 3.1278, "loss_aux_layer_0": 0.0421142578125, "loss_aux_layer_1": 0.127685546875, "loss_aux_layer_10": 0.1307373046875, "loss_aux_layer_11": 0.13818359375, "loss_aux_layer_12": 0.149169921875, "loss_aux_layer_13": 0.16064453125, "loss_aux_layer_14": 0.177734375, "loss_aux_layer_15": 0.193115234375, "loss_aux_layer_16": 0.207275390625, "loss_aux_layer_17": 0.212646484375, "loss_aux_layer_18": 0.22314453125, "loss_aux_layer_19": 0.220458984375, "loss_aux_layer_2": 0.14208984375, "loss_aux_layer_20": 0.22265625, "loss_aux_layer_21": 0.22607421875, "loss_aux_layer_22": 0.24853515625, "loss_aux_layer_23": 0.2939453125, "loss_aux_layer_3": 0.150634765625, "loss_aux_layer_4": 0.149169921875, "loss_aux_layer_5": 0.14892578125, "loss_aux_layer_6": 0.145263671875, "loss_aux_layer_7": 0.133056640625, "loss_aux_layer_8": 0.131103515625, "loss_aux_layer_9": 0.1295166015625, "step": 340, "total_loss": 0.7819430381059647 }, { "epoch": 0.06751138388437933, "grad_norm": 1.6567860841751099, "learning_rate": 5e-05, "llm_loss": 0.670208677649498, "loss": 3.339, "loss_aux_layer_0": 0.03814697265625, "loss_aux_layer_1": 0.1181640625, "loss_aux_layer_10": 0.1248779296875, "loss_aux_layer_11": 0.1324462890625, "loss_aux_layer_12": 0.142578125, "loss_aux_layer_13": 0.154541015625, "loss_aux_layer_14": 0.171630859375, "loss_aux_layer_15": 0.18701171875, "loss_aux_layer_16": 0.200927734375, "loss_aux_layer_17": 0.20751953125, "loss_aux_layer_18": 0.218505859375, "loss_aux_layer_19": 0.216552734375, "loss_aux_layer_2": 0.1312255859375, "loss_aux_layer_20": 0.219970703125, "loss_aux_layer_21": 0.222900390625, "loss_aux_layer_22": 0.2451171875, "loss_aux_layer_23": 0.28955078125, "loss_aux_layer_3": 0.1396484375, "loss_aux_layer_4": 0.138427734375, "loss_aux_layer_5": 0.13916015625, "loss_aux_layer_6": 0.13623046875, "loss_aux_layer_7": 0.125244140625, "loss_aux_layer_8": 0.1243896484375, "loss_aux_layer_9": 0.1234130859375, "step": 341, "total_loss": 0.8347591310739517 }, { "epoch": 0.06770936448228074, "grad_norm": 1.474394679069519, "learning_rate": 5e-05, "llm_loss": 0.7101104706525803, "loss": 3.4724, "loss_aux_layer_0": 0.03924560546875, "loss_aux_layer_1": 0.1131591796875, "loss_aux_layer_10": 0.1165771484375, "loss_aux_layer_11": 0.1234130859375, "loss_aux_layer_12": 0.13427734375, "loss_aux_layer_13": 0.1455078125, "loss_aux_layer_14": 0.16357421875, "loss_aux_layer_15": 0.180419921875, "loss_aux_layer_16": 0.196044921875, "loss_aux_layer_17": 0.20263671875, "loss_aux_layer_18": 0.212890625, "loss_aux_layer_19": 0.213134765625, "loss_aux_layer_2": 0.1243896484375, "loss_aux_layer_20": 0.21728515625, "loss_aux_layer_21": 0.218994140625, "loss_aux_layer_22": 0.239501953125, "loss_aux_layer_23": 0.2841796875, "loss_aux_layer_3": 0.1318359375, "loss_aux_layer_4": 0.1298828125, "loss_aux_layer_5": 0.130126953125, "loss_aux_layer_6": 0.126708984375, "loss_aux_layer_7": 0.1162109375, "loss_aux_layer_8": 0.115966796875, "loss_aux_layer_9": 0.114990234375, "step": 342, "total_loss": 0.8680875599384308 }, { "epoch": 0.06790734508018215, "grad_norm": 1.3876599073410034, "learning_rate": 5e-05, "llm_loss": 0.7343430519104004, "loss": 3.6217, "loss_aux_layer_0": 0.04339599609375, "loss_aux_layer_1": 0.1304931640625, "loss_aux_layer_10": 0.13134765625, "loss_aux_layer_11": 0.138671875, "loss_aux_layer_12": 0.14892578125, "loss_aux_layer_13": 0.159423828125, "loss_aux_layer_14": 0.17626953125, "loss_aux_layer_15": 0.191162109375, "loss_aux_layer_16": 0.2060546875, "loss_aux_layer_17": 0.211669921875, "loss_aux_layer_18": 0.22216796875, "loss_aux_layer_19": 0.220947265625, "loss_aux_layer_2": 0.14404296875, "loss_aux_layer_20": 0.223876953125, "loss_aux_layer_21": 0.224853515625, "loss_aux_layer_22": 0.245361328125, "loss_aux_layer_23": 0.28955078125, "loss_aux_layer_3": 0.153076171875, "loss_aux_layer_4": 0.151123046875, "loss_aux_layer_5": 0.15087890625, "loss_aux_layer_6": 0.146728515625, "loss_aux_layer_7": 0.133544921875, "loss_aux_layer_8": 0.1322021484375, "loss_aux_layer_9": 0.13037109375, "step": 343, "total_loss": 0.905431479215622 }, { "epoch": 0.06810532567808354, "grad_norm": 1.2295597791671753, "learning_rate": 5e-05, "llm_loss": 0.6195266842842102, "loss": 3.1225, "loss_aux_layer_0": 0.037841796875, "loss_aux_layer_1": 0.1138916015625, "loss_aux_layer_10": 0.12060546875, "loss_aux_layer_11": 0.127685546875, "loss_aux_layer_12": 0.138427734375, "loss_aux_layer_13": 0.149169921875, "loss_aux_layer_14": 0.166748046875, "loss_aux_layer_15": 0.182373046875, "loss_aux_layer_16": 0.197998046875, "loss_aux_layer_17": 0.204833984375, "loss_aux_layer_18": 0.214599609375, "loss_aux_layer_19": 0.215087890625, "loss_aux_layer_2": 0.1268310546875, "loss_aux_layer_20": 0.21875, "loss_aux_layer_21": 0.220947265625, "loss_aux_layer_22": 0.243896484375, "loss_aux_layer_23": 0.2900390625, "loss_aux_layer_3": 0.13525390625, "loss_aux_layer_4": 0.1337890625, "loss_aux_layer_5": 0.134765625, "loss_aux_layer_6": 0.131591796875, "loss_aux_layer_7": 0.120849609375, "loss_aux_layer_8": 0.11962890625, "loss_aux_layer_9": 0.119140625, "step": 344, "total_loss": 0.7806175202131271 }, { "epoch": 0.06830330627598495, "grad_norm": 1.457924485206604, "learning_rate": 5e-05, "llm_loss": 0.6960186809301376, "loss": 3.4563, "loss_aux_layer_0": 0.0372314453125, "loss_aux_layer_1": 0.1199951171875, "loss_aux_layer_10": 0.1270751953125, "loss_aux_layer_11": 0.134765625, "loss_aux_layer_12": 0.14599609375, "loss_aux_layer_13": 0.157958984375, "loss_aux_layer_14": 0.17626953125, "loss_aux_layer_15": 0.192626953125, "loss_aux_layer_16": 0.2080078125, "loss_aux_layer_17": 0.21484375, "loss_aux_layer_18": 0.22509765625, "loss_aux_layer_19": 0.223388671875, "loss_aux_layer_2": 0.1328125, "loss_aux_layer_20": 0.226318359375, "loss_aux_layer_21": 0.227294921875, "loss_aux_layer_22": 0.24755859375, "loss_aux_layer_23": 0.29345703125, "loss_aux_layer_3": 0.14208984375, "loss_aux_layer_4": 0.140869140625, "loss_aux_layer_5": 0.141845703125, "loss_aux_layer_6": 0.138427734375, "loss_aux_layer_7": 0.12744140625, "loss_aux_layer_8": 0.1263427734375, "loss_aux_layer_9": 0.1253662109375, "step": 345, "total_loss": 0.8640627861022949 }, { "epoch": 0.06850128687388636, "grad_norm": 1.0955595970153809, "learning_rate": 5e-05, "llm_loss": 0.6795306205749512, "loss": 3.3644, "loss_aux_layer_0": 0.03759765625, "loss_aux_layer_1": 0.1163330078125, "loss_aux_layer_10": 0.1209716796875, "loss_aux_layer_11": 0.12841796875, "loss_aux_layer_12": 0.137939453125, "loss_aux_layer_13": 0.149658203125, "loss_aux_layer_14": 0.1669921875, "loss_aux_layer_15": 0.182861328125, "loss_aux_layer_16": 0.197998046875, "loss_aux_layer_17": 0.2041015625, "loss_aux_layer_18": 0.214599609375, "loss_aux_layer_19": 0.212646484375, "loss_aux_layer_2": 0.1298828125, "loss_aux_layer_20": 0.216796875, "loss_aux_layer_21": 0.219970703125, "loss_aux_layer_22": 0.241943359375, "loss_aux_layer_23": 0.28662109375, "loss_aux_layer_3": 0.13818359375, "loss_aux_layer_4": 0.13720703125, "loss_aux_layer_5": 0.137451171875, "loss_aux_layer_6": 0.134521484375, "loss_aux_layer_7": 0.1229248046875, "loss_aux_layer_8": 0.121826171875, "loss_aux_layer_9": 0.1202392578125, "step": 346, "total_loss": 0.841098964214325 }, { "epoch": 0.06869926747178777, "grad_norm": 1.0930875539779663, "learning_rate": 5e-05, "llm_loss": 0.5981017202138901, "loss": 3.037, "loss_aux_layer_0": 0.03863525390625, "loss_aux_layer_1": 0.115966796875, "loss_aux_layer_10": 0.12060546875, "loss_aux_layer_11": 0.127685546875, "loss_aux_layer_12": 0.137939453125, "loss_aux_layer_13": 0.149169921875, "loss_aux_layer_14": 0.166259765625, "loss_aux_layer_15": 0.181396484375, "loss_aux_layer_16": 0.196533203125, "loss_aux_layer_17": 0.203125, "loss_aux_layer_18": 0.21484375, "loss_aux_layer_19": 0.2138671875, "loss_aux_layer_2": 0.1295166015625, "loss_aux_layer_20": 0.218017578125, "loss_aux_layer_21": 0.22021484375, "loss_aux_layer_22": 0.2412109375, "loss_aux_layer_23": 0.28759765625, "loss_aux_layer_3": 0.1376953125, "loss_aux_layer_4": 0.13623046875, "loss_aux_layer_5": 0.136474609375, "loss_aux_layer_6": 0.133056640625, "loss_aux_layer_7": 0.1221923828125, "loss_aux_layer_8": 0.1209716796875, "loss_aux_layer_9": 0.1192626953125, "step": 347, "total_loss": 0.7592507749795914 }, { "epoch": 0.06889724806968917, "grad_norm": 0.8549122214317322, "learning_rate": 5e-05, "llm_loss": 0.6271318942308426, "loss": 3.1696, "loss_aux_layer_0": 0.03924560546875, "loss_aux_layer_1": 0.1214599609375, "loss_aux_layer_10": 0.125732421875, "loss_aux_layer_11": 0.1328125, "loss_aux_layer_12": 0.1435546875, "loss_aux_layer_13": 0.154052734375, "loss_aux_layer_14": 0.1708984375, "loss_aux_layer_15": 0.186279296875, "loss_aux_layer_16": 0.2021484375, "loss_aux_layer_17": 0.20849609375, "loss_aux_layer_18": 0.21923828125, "loss_aux_layer_19": 0.218505859375, "loss_aux_layer_2": 0.133544921875, "loss_aux_layer_20": 0.220947265625, "loss_aux_layer_21": 0.22216796875, "loss_aux_layer_22": 0.241943359375, "loss_aux_layer_23": 0.2861328125, "loss_aux_layer_3": 0.1416015625, "loss_aux_layer_4": 0.141357421875, "loss_aux_layer_5": 0.14111328125, "loss_aux_layer_6": 0.13818359375, "loss_aux_layer_7": 0.12646484375, "loss_aux_layer_8": 0.1258544921875, "loss_aux_layer_9": 0.1246337890625, "step": 348, "total_loss": 0.79241082072258 }, { "epoch": 0.06909522866759057, "grad_norm": 1.3157166242599487, "learning_rate": 5e-05, "llm_loss": 0.6331999748945236, "loss": 3.1816, "loss_aux_layer_0": 0.0367431640625, "loss_aux_layer_1": 0.114990234375, "loss_aux_layer_10": 0.120361328125, "loss_aux_layer_11": 0.1279296875, "loss_aux_layer_12": 0.138427734375, "loss_aux_layer_13": 0.150146484375, "loss_aux_layer_14": 0.16748046875, "loss_aux_layer_15": 0.184326171875, "loss_aux_layer_16": 0.19970703125, "loss_aux_layer_17": 0.20654296875, "loss_aux_layer_18": 0.21875, "loss_aux_layer_19": 0.2177734375, "loss_aux_layer_2": 0.12744140625, "loss_aux_layer_20": 0.22119140625, "loss_aux_layer_21": 0.224853515625, "loss_aux_layer_22": 0.246826171875, "loss_aux_layer_23": 0.29248046875, "loss_aux_layer_3": 0.135009765625, "loss_aux_layer_4": 0.134033203125, "loss_aux_layer_5": 0.135009765625, "loss_aux_layer_6": 0.1318359375, "loss_aux_layer_7": 0.121337890625, "loss_aux_layer_8": 0.120361328125, "loss_aux_layer_9": 0.119140625, "step": 349, "total_loss": 0.7954082190990448 }, { "epoch": 0.06929320926549198, "grad_norm": 1.2996299266815186, "learning_rate": 5e-05, "llm_loss": 0.6298497319221497, "loss": 3.1824, "loss_aux_layer_0": 0.0390625, "loss_aux_layer_1": 0.1234130859375, "loss_aux_layer_10": 0.12646484375, "loss_aux_layer_11": 0.13427734375, "loss_aux_layer_12": 0.144287109375, "loss_aux_layer_13": 0.155029296875, "loss_aux_layer_14": 0.172119140625, "loss_aux_layer_15": 0.18701171875, "loss_aux_layer_16": 0.201171875, "loss_aux_layer_17": 0.20654296875, "loss_aux_layer_18": 0.21630859375, "loss_aux_layer_19": 0.215087890625, "loss_aux_layer_2": 0.1357421875, "loss_aux_layer_20": 0.218505859375, "loss_aux_layer_21": 0.221435546875, "loss_aux_layer_22": 0.242919921875, "loss_aux_layer_23": 0.2880859375, "loss_aux_layer_3": 0.143798828125, "loss_aux_layer_4": 0.14306640625, "loss_aux_layer_5": 0.143310546875, "loss_aux_layer_6": 0.1396484375, "loss_aux_layer_7": 0.1280517578125, "loss_aux_layer_8": 0.126708984375, "loss_aux_layer_9": 0.12548828125, "step": 350, "total_loss": 0.7956091463565826 }, { "epoch": 0.0694911898633934, "grad_norm": 1.2307184934616089, "learning_rate": 5e-05, "llm_loss": 0.6434687227010727, "loss": 3.2207, "loss_aux_layer_0": 0.03594970703125, "loss_aux_layer_1": 0.114013671875, "loss_aux_layer_10": 0.1219482421875, "loss_aux_layer_11": 0.1287841796875, "loss_aux_layer_12": 0.1396484375, "loss_aux_layer_13": 0.15087890625, "loss_aux_layer_14": 0.16845703125, "loss_aux_layer_15": 0.184326171875, "loss_aux_layer_16": 0.199951171875, "loss_aux_layer_17": 0.2060546875, "loss_aux_layer_18": 0.21630859375, "loss_aux_layer_19": 0.21484375, "loss_aux_layer_2": 0.1270751953125, "loss_aux_layer_20": 0.218505859375, "loss_aux_layer_21": 0.220703125, "loss_aux_layer_22": 0.24072265625, "loss_aux_layer_23": 0.28759765625, "loss_aux_layer_3": 0.135986328125, "loss_aux_layer_4": 0.1351318359375, "loss_aux_layer_5": 0.135986328125, "loss_aux_layer_6": 0.13330078125, "loss_aux_layer_7": 0.122802734375, "loss_aux_layer_8": 0.1217041015625, "loss_aux_layer_9": 0.120849609375, "step": 351, "total_loss": 0.8051770627498627 }, { "epoch": 0.06968917046129479, "grad_norm": 1.7917604446411133, "learning_rate": 5e-05, "llm_loss": 0.6969247460365295, "loss": 3.4598, "loss_aux_layer_0": 0.04461669921875, "loss_aux_layer_1": 0.122802734375, "loss_aux_layer_10": 0.126708984375, "loss_aux_layer_11": 0.134521484375, "loss_aux_layer_12": 0.145263671875, "loss_aux_layer_13": 0.156982421875, "loss_aux_layer_14": 0.1748046875, "loss_aux_layer_15": 0.1904296875, "loss_aux_layer_16": 0.2060546875, "loss_aux_layer_17": 0.2119140625, "loss_aux_layer_18": 0.221923828125, "loss_aux_layer_19": 0.220947265625, "loss_aux_layer_2": 0.13525390625, "loss_aux_layer_20": 0.223388671875, "loss_aux_layer_21": 0.22607421875, "loss_aux_layer_22": 0.24951171875, "loss_aux_layer_23": 0.294921875, "loss_aux_layer_3": 0.1435546875, "loss_aux_layer_4": 0.141845703125, "loss_aux_layer_5": 0.142578125, "loss_aux_layer_6": 0.138671875, "loss_aux_layer_7": 0.1279296875, "loss_aux_layer_8": 0.126220703125, "loss_aux_layer_9": 0.1251220703125, "step": 352, "total_loss": 0.8649516850709915 }, { "epoch": 0.0698871510591962, "grad_norm": 1.0914454460144043, "learning_rate": 5e-05, "llm_loss": 0.6832523494958878, "loss": 3.4045, "loss_aux_layer_0": 0.03826904296875, "loss_aux_layer_1": 0.12158203125, "loss_aux_layer_10": 0.1273193359375, "loss_aux_layer_11": 0.135009765625, "loss_aux_layer_12": 0.14599609375, "loss_aux_layer_13": 0.157470703125, "loss_aux_layer_14": 0.1748046875, "loss_aux_layer_15": 0.190673828125, "loss_aux_layer_16": 0.205810546875, "loss_aux_layer_17": 0.212646484375, "loss_aux_layer_18": 0.222900390625, "loss_aux_layer_19": 0.220947265625, "loss_aux_layer_2": 0.134765625, "loss_aux_layer_20": 0.223876953125, "loss_aux_layer_21": 0.22607421875, "loss_aux_layer_22": 0.24658203125, "loss_aux_layer_23": 0.2919921875, "loss_aux_layer_3": 0.143310546875, "loss_aux_layer_4": 0.142822265625, "loss_aux_layer_5": 0.143310546875, "loss_aux_layer_6": 0.140625, "loss_aux_layer_7": 0.128662109375, "loss_aux_layer_8": 0.1270751953125, "loss_aux_layer_9": 0.1259765625, "step": 353, "total_loss": 0.8511152416467667 }, { "epoch": 0.07008513165709761, "grad_norm": 2.200404405593872, "learning_rate": 5e-05, "llm_loss": 0.7126787155866623, "loss": 3.5046, "loss_aux_layer_0": 0.03759765625, "loss_aux_layer_1": 0.1185302734375, "loss_aux_layer_10": 0.1234130859375, "loss_aux_layer_11": 0.1307373046875, "loss_aux_layer_12": 0.1416015625, "loss_aux_layer_13": 0.15283203125, "loss_aux_layer_14": 0.17041015625, "loss_aux_layer_15": 0.185791015625, "loss_aux_layer_16": 0.200439453125, "loss_aux_layer_17": 0.207275390625, "loss_aux_layer_18": 0.216796875, "loss_aux_layer_19": 0.21533203125, "loss_aux_layer_2": 0.131103515625, "loss_aux_layer_20": 0.21875, "loss_aux_layer_21": 0.220703125, "loss_aux_layer_22": 0.2421875, "loss_aux_layer_23": 0.28759765625, "loss_aux_layer_3": 0.139404296875, "loss_aux_layer_4": 0.138427734375, "loss_aux_layer_5": 0.138427734375, "loss_aux_layer_6": 0.1358642578125, "loss_aux_layer_7": 0.1253662109375, "loss_aux_layer_8": 0.12353515625, "loss_aux_layer_9": 0.1219482421875, "step": 354, "total_loss": 0.8761408776044846 }, { "epoch": 0.07028311225499902, "grad_norm": 1.470106840133667, "learning_rate": 5e-05, "llm_loss": 0.6495156735181808, "loss": 3.2274, "loss_aux_layer_0": 0.03515625, "loss_aux_layer_1": 0.112548828125, "loss_aux_layer_10": 0.1168212890625, "loss_aux_layer_11": 0.1231689453125, "loss_aux_layer_12": 0.13427734375, "loss_aux_layer_13": 0.14599609375, "loss_aux_layer_14": 0.1640625, "loss_aux_layer_15": 0.1796875, "loss_aux_layer_16": 0.195556640625, "loss_aux_layer_17": 0.202392578125, "loss_aux_layer_18": 0.213134765625, "loss_aux_layer_19": 0.211181640625, "loss_aux_layer_2": 0.1234130859375, "loss_aux_layer_20": 0.21533203125, "loss_aux_layer_21": 0.217041015625, "loss_aux_layer_22": 0.236083984375, "loss_aux_layer_23": 0.2822265625, "loss_aux_layer_3": 0.131591796875, "loss_aux_layer_4": 0.13037109375, "loss_aux_layer_5": 0.1302490234375, "loss_aux_layer_6": 0.1279296875, "loss_aux_layer_7": 0.11767578125, "loss_aux_layer_8": 0.1170654296875, "loss_aux_layer_9": 0.1153564453125, "step": 355, "total_loss": 0.8068539649248123 }, { "epoch": 0.07048109285290041, "grad_norm": 2.3775343894958496, "learning_rate": 5e-05, "llm_loss": 0.7192055433988571, "loss": 3.5222, "loss_aux_layer_0": 0.03741455078125, "loss_aux_layer_1": 0.1146240234375, "loss_aux_layer_10": 0.1204833984375, "loss_aux_layer_11": 0.1280517578125, "loss_aux_layer_12": 0.138427734375, "loss_aux_layer_13": 0.14990234375, "loss_aux_layer_14": 0.167724609375, "loss_aux_layer_15": 0.183837890625, "loss_aux_layer_16": 0.19921875, "loss_aux_layer_17": 0.205078125, "loss_aux_layer_18": 0.216064453125, "loss_aux_layer_19": 0.2158203125, "loss_aux_layer_2": 0.1273193359375, "loss_aux_layer_20": 0.219482421875, "loss_aux_layer_21": 0.2216796875, "loss_aux_layer_22": 0.242431640625, "loss_aux_layer_23": 0.2880859375, "loss_aux_layer_3": 0.13525390625, "loss_aux_layer_4": 0.13427734375, "loss_aux_layer_5": 0.134521484375, "loss_aux_layer_6": 0.13232421875, "loss_aux_layer_7": 0.121337890625, "loss_aux_layer_8": 0.1199951171875, "loss_aux_layer_9": 0.1190185546875, "step": 356, "total_loss": 0.880558043718338 }, { "epoch": 0.07067907345080182, "grad_norm": 2.657639980316162, "learning_rate": 5e-05, "llm_loss": 0.6188103333115578, "loss": 3.1361, "loss_aux_layer_0": 0.04248046875, "loss_aux_layer_1": 0.12060546875, "loss_aux_layer_10": 0.1243896484375, "loss_aux_layer_11": 0.1324462890625, "loss_aux_layer_12": 0.142333984375, "loss_aux_layer_13": 0.154052734375, "loss_aux_layer_14": 0.171875, "loss_aux_layer_15": 0.1875, "loss_aux_layer_16": 0.202392578125, "loss_aux_layer_17": 0.2080078125, "loss_aux_layer_18": 0.21826171875, "loss_aux_layer_19": 0.217041015625, "loss_aux_layer_2": 0.132080078125, "loss_aux_layer_20": 0.22021484375, "loss_aux_layer_21": 0.223388671875, "loss_aux_layer_22": 0.24560546875, "loss_aux_layer_23": 0.29296875, "loss_aux_layer_3": 0.140625, "loss_aux_layer_4": 0.1396484375, "loss_aux_layer_5": 0.14013671875, "loss_aux_layer_6": 0.13623046875, "loss_aux_layer_7": 0.1263427734375, "loss_aux_layer_8": 0.125, "loss_aux_layer_9": 0.1231689453125, "step": 357, "total_loss": 0.7840289920568466 }, { "epoch": 0.07087705404870323, "grad_norm": 2.1371116638183594, "learning_rate": 5e-05, "llm_loss": 0.5800872892141342, "loss": 2.9789, "loss_aux_layer_0": 0.03717041015625, "loss_aux_layer_1": 0.1201171875, "loss_aux_layer_10": 0.12646484375, "loss_aux_layer_11": 0.1337890625, "loss_aux_layer_12": 0.144287109375, "loss_aux_layer_13": 0.1552734375, "loss_aux_layer_14": 0.172119140625, "loss_aux_layer_15": 0.186767578125, "loss_aux_layer_16": 0.199951171875, "loss_aux_layer_17": 0.205810546875, "loss_aux_layer_18": 0.21435546875, "loss_aux_layer_19": 0.21240234375, "loss_aux_layer_2": 0.133544921875, "loss_aux_layer_20": 0.215576171875, "loss_aux_layer_21": 0.21923828125, "loss_aux_layer_22": 0.24169921875, "loss_aux_layer_23": 0.28857421875, "loss_aux_layer_3": 0.14208984375, "loss_aux_layer_4": 0.141357421875, "loss_aux_layer_5": 0.141845703125, "loss_aux_layer_6": 0.13916015625, "loss_aux_layer_7": 0.1285400390625, "loss_aux_layer_8": 0.1268310546875, "loss_aux_layer_9": 0.1248779296875, "step": 358, "total_loss": 0.7447140663862228 }, { "epoch": 0.07107503464660463, "grad_norm": 12.011703491210938, "learning_rate": 5e-05, "llm_loss": 0.6083015650510788, "loss": 3.1179, "loss_aux_layer_0": 0.04241943359375, "loss_aux_layer_1": 0.125, "loss_aux_layer_10": 0.134765625, "loss_aux_layer_11": 0.14208984375, "loss_aux_layer_12": 0.152587890625, "loss_aux_layer_13": 0.16259765625, "loss_aux_layer_14": 0.17919921875, "loss_aux_layer_15": 0.19384765625, "loss_aux_layer_16": 0.207763671875, "loss_aux_layer_17": 0.213134765625, "loss_aux_layer_18": 0.2236328125, "loss_aux_layer_19": 0.220947265625, "loss_aux_layer_2": 0.13720703125, "loss_aux_layer_20": 0.22412109375, "loss_aux_layer_21": 0.22705078125, "loss_aux_layer_22": 0.24755859375, "loss_aux_layer_23": 0.2919921875, "loss_aux_layer_3": 0.146484375, "loss_aux_layer_4": 0.144775390625, "loss_aux_layer_5": 0.145263671875, "loss_aux_layer_6": 0.14306640625, "loss_aux_layer_7": 0.134033203125, "loss_aux_layer_8": 0.134521484375, "loss_aux_layer_9": 0.133056640625, "step": 359, "total_loss": 0.7794635444879532 }, { "epoch": 0.07127301524450604, "grad_norm": 13.939728736877441, "learning_rate": 5e-05, "llm_loss": 0.6585283130407333, "loss": 3.3455, "loss_aux_layer_0": 0.03692626953125, "loss_aux_layer_1": 0.1141357421875, "loss_aux_layer_10": 0.144775390625, "loss_aux_layer_11": 0.153076171875, "loss_aux_layer_12": 0.1640625, "loss_aux_layer_13": 0.174560546875, "loss_aux_layer_14": 0.19140625, "loss_aux_layer_15": 0.2060546875, "loss_aux_layer_16": 0.21923828125, "loss_aux_layer_17": 0.223388671875, "loss_aux_layer_18": 0.232666015625, "loss_aux_layer_19": 0.22998046875, "loss_aux_layer_2": 0.1295166015625, "loss_aux_layer_20": 0.2314453125, "loss_aux_layer_21": 0.232177734375, "loss_aux_layer_22": 0.25244140625, "loss_aux_layer_23": 0.29833984375, "loss_aux_layer_3": 0.1435546875, "loss_aux_layer_4": 0.143310546875, "loss_aux_layer_5": 0.14697265625, "loss_aux_layer_6": 0.1474609375, "loss_aux_layer_7": 0.156982421875, "loss_aux_layer_8": 0.1513671875, "loss_aux_layer_9": 0.145263671875, "step": 360, "total_loss": 0.8363844603300095 }, { "epoch": 0.07147099584240744, "grad_norm": 8.287993431091309, "learning_rate": 5e-05, "llm_loss": 0.5935888886451721, "loss": 3.2096, "loss_aux_layer_0": 0.03851318359375, "loss_aux_layer_1": 0.1185302734375, "loss_aux_layer_10": 0.197998046875, "loss_aux_layer_11": 0.2099609375, "loss_aux_layer_12": 0.224609375, "loss_aux_layer_13": 0.235107421875, "loss_aux_layer_14": 0.246337890625, "loss_aux_layer_15": 0.256103515625, "loss_aux_layer_16": 0.26416015625, "loss_aux_layer_17": 0.26513671875, "loss_aux_layer_18": 0.2705078125, "loss_aux_layer_19": 0.2607421875, "loss_aux_layer_2": 0.1328125, "loss_aux_layer_20": 0.25537109375, "loss_aux_layer_21": 0.2529296875, "loss_aux_layer_22": 0.27197265625, "loss_aux_layer_23": 0.31787109375, "loss_aux_layer_3": 0.1474609375, "loss_aux_layer_4": 0.145263671875, "loss_aux_layer_5": 0.1474609375, "loss_aux_layer_6": 0.156982421875, "loss_aux_layer_7": 0.19140625, "loss_aux_layer_8": 0.20654296875, "loss_aux_layer_9": 0.197509765625, "step": 361, "total_loss": 0.8023981004953384 }, { "epoch": 0.07166897644030885, "grad_norm": 4.81840705871582, "learning_rate": 5e-05, "llm_loss": 0.6756044626235962, "loss": 3.4667, "loss_aux_layer_0": 0.03887939453125, "loss_aux_layer_1": 0.1224365234375, "loss_aux_layer_10": 0.16455078125, "loss_aux_layer_11": 0.17236328125, "loss_aux_layer_12": 0.18408203125, "loss_aux_layer_13": 0.191650390625, "loss_aux_layer_14": 0.20751953125, "loss_aux_layer_15": 0.219482421875, "loss_aux_layer_16": 0.232421875, "loss_aux_layer_17": 0.2353515625, "loss_aux_layer_18": 0.24560546875, "loss_aux_layer_19": 0.2412109375, "loss_aux_layer_2": 0.14208984375, "loss_aux_layer_20": 0.241455078125, "loss_aux_layer_21": 0.24658203125, "loss_aux_layer_22": 0.27001953125, "loss_aux_layer_23": 0.31884765625, "loss_aux_layer_3": 0.153564453125, "loss_aux_layer_4": 0.15185546875, "loss_aux_layer_5": 0.152587890625, "loss_aux_layer_6": 0.1552734375, "loss_aux_layer_7": 0.16162109375, "loss_aux_layer_8": 0.1708984375, "loss_aux_layer_9": 0.164794921875, "step": 362, "total_loss": 0.8666683435440063 }, { "epoch": 0.07186695703821025, "grad_norm": 6.07126522064209, "learning_rate": 5e-05, "llm_loss": 0.6848760396242142, "loss": 3.504, "loss_aux_layer_0": 0.0384521484375, "loss_aux_layer_1": 0.1160888671875, "loss_aux_layer_10": 0.147216796875, "loss_aux_layer_11": 0.155029296875, "loss_aux_layer_12": 0.168701171875, "loss_aux_layer_13": 0.18798828125, "loss_aux_layer_14": 0.2060546875, "loss_aux_layer_15": 0.222412109375, "loss_aux_layer_16": 0.23876953125, "loss_aux_layer_17": 0.248291015625, "loss_aux_layer_18": 0.26806640625, "loss_aux_layer_19": 0.26171875, "loss_aux_layer_2": 0.147216796875, "loss_aux_layer_20": 0.26220703125, "loss_aux_layer_21": 0.2626953125, "loss_aux_layer_22": 0.28271484375, "loss_aux_layer_23": 0.33056640625, "loss_aux_layer_3": 0.1513671875, "loss_aux_layer_4": 0.14990234375, "loss_aux_layer_5": 0.1513671875, "loss_aux_layer_6": 0.149658203125, "loss_aux_layer_7": 0.150146484375, "loss_aux_layer_8": 0.145751953125, "loss_aux_layer_9": 0.14501953125, "step": 363, "total_loss": 0.8759970217943192 }, { "epoch": 0.07206493763611166, "grad_norm": 2.6123552322387695, "learning_rate": 5e-05, "llm_loss": 0.653085008263588, "loss": 3.3791, "loss_aux_layer_0": 0.0379638671875, "loss_aux_layer_1": 0.122802734375, "loss_aux_layer_10": 0.15576171875, "loss_aux_layer_11": 0.16455078125, "loss_aux_layer_12": 0.175048828125, "loss_aux_layer_13": 0.185546875, "loss_aux_layer_14": 0.20361328125, "loss_aux_layer_15": 0.21630859375, "loss_aux_layer_16": 0.22900390625, "loss_aux_layer_17": 0.2353515625, "loss_aux_layer_18": 0.24853515625, "loss_aux_layer_19": 0.24560546875, "loss_aux_layer_2": 0.160888671875, "loss_aux_layer_20": 0.24560546875, "loss_aux_layer_21": 0.2490234375, "loss_aux_layer_22": 0.26904296875, "loss_aux_layer_23": 0.3193359375, "loss_aux_layer_3": 0.16455078125, "loss_aux_layer_4": 0.161865234375, "loss_aux_layer_5": 0.1640625, "loss_aux_layer_6": 0.16357421875, "loss_aux_layer_7": 0.16748046875, "loss_aux_layer_8": 0.1591796875, "loss_aux_layer_9": 0.1552734375, "step": 364, "total_loss": 0.8447737097740173 }, { "epoch": 0.07226291823401307, "grad_norm": 1.8521955013275146, "learning_rate": 5e-05, "llm_loss": 0.708721473813057, "loss": 3.5872, "loss_aux_layer_0": 0.03985595703125, "loss_aux_layer_1": 0.119140625, "loss_aux_layer_10": 0.1513671875, "loss_aux_layer_11": 0.15869140625, "loss_aux_layer_12": 0.16943359375, "loss_aux_layer_13": 0.180419921875, "loss_aux_layer_14": 0.200439453125, "loss_aux_layer_15": 0.215576171875, "loss_aux_layer_16": 0.231201171875, "loss_aux_layer_17": 0.23681640625, "loss_aux_layer_18": 0.24951171875, "loss_aux_layer_19": 0.244140625, "loss_aux_layer_2": 0.149658203125, "loss_aux_layer_20": 0.246337890625, "loss_aux_layer_21": 0.248779296875, "loss_aux_layer_22": 0.271484375, "loss_aux_layer_23": 0.31982421875, "loss_aux_layer_3": 0.156494140625, "loss_aux_layer_4": 0.15380859375, "loss_aux_layer_5": 0.15576171875, "loss_aux_layer_6": 0.155029296875, "loss_aux_layer_7": 0.158447265625, "loss_aux_layer_8": 0.15185546875, "loss_aux_layer_9": 0.1494140625, "step": 365, "total_loss": 0.8967920690774918 }, { "epoch": 0.07246089883191448, "grad_norm": 1.4253480434417725, "learning_rate": 5e-05, "llm_loss": 0.6437167078256607, "loss": 3.2939, "loss_aux_layer_0": 0.038330078125, "loss_aux_layer_1": 0.1104736328125, "loss_aux_layer_10": 0.140869140625, "loss_aux_layer_11": 0.148681640625, "loss_aux_layer_12": 0.162109375, "loss_aux_layer_13": 0.17626953125, "loss_aux_layer_14": 0.1953125, "loss_aux_layer_15": 0.211669921875, "loss_aux_layer_16": 0.228515625, "loss_aux_layer_17": 0.234130859375, "loss_aux_layer_18": 0.247802734375, "loss_aux_layer_19": 0.243896484375, "loss_aux_layer_2": 0.1336669921875, "loss_aux_layer_20": 0.243896484375, "loss_aux_layer_21": 0.243896484375, "loss_aux_layer_22": 0.26416015625, "loss_aux_layer_23": 0.310546875, "loss_aux_layer_3": 0.141845703125, "loss_aux_layer_4": 0.1400146484375, "loss_aux_layer_5": 0.141357421875, "loss_aux_layer_6": 0.139892578125, "loss_aux_layer_7": 0.141845703125, "loss_aux_layer_8": 0.1383056640625, "loss_aux_layer_9": 0.137939453125, "step": 366, "total_loss": 0.8234855830669403 }, { "epoch": 0.07265887942981587, "grad_norm": 1.246006727218628, "learning_rate": 5e-05, "llm_loss": 0.6885715872049332, "loss": 3.482, "loss_aux_layer_0": 0.038330078125, "loss_aux_layer_1": 0.115966796875, "loss_aux_layer_10": 0.142333984375, "loss_aux_layer_11": 0.1513671875, "loss_aux_layer_12": 0.1640625, "loss_aux_layer_13": 0.176025390625, "loss_aux_layer_14": 0.195068359375, "loss_aux_layer_15": 0.211181640625, "loss_aux_layer_16": 0.225830078125, "loss_aux_layer_17": 0.23291015625, "loss_aux_layer_18": 0.246337890625, "loss_aux_layer_19": 0.244873046875, "loss_aux_layer_2": 0.138671875, "loss_aux_layer_20": 0.24462890625, "loss_aux_layer_21": 0.248046875, "loss_aux_layer_22": 0.2685546875, "loss_aux_layer_23": 0.314453125, "loss_aux_layer_3": 0.1474609375, "loss_aux_layer_4": 0.145263671875, "loss_aux_layer_5": 0.146240234375, "loss_aux_layer_6": 0.14306640625, "loss_aux_layer_7": 0.1435546875, "loss_aux_layer_8": 0.140869140625, "loss_aux_layer_9": 0.140380859375, "step": 367, "total_loss": 0.8704998940229416 }, { "epoch": 0.07285686002771728, "grad_norm": 1.3693320751190186, "learning_rate": 5e-05, "llm_loss": 0.6009893268346786, "loss": 3.1462, "loss_aux_layer_0": 0.04095458984375, "loss_aux_layer_1": 0.124267578125, "loss_aux_layer_10": 0.14892578125, "loss_aux_layer_11": 0.1572265625, "loss_aux_layer_12": 0.168701171875, "loss_aux_layer_13": 0.178466796875, "loss_aux_layer_14": 0.19775390625, "loss_aux_layer_15": 0.212158203125, "loss_aux_layer_16": 0.225341796875, "loss_aux_layer_17": 0.23193359375, "loss_aux_layer_18": 0.2431640625, "loss_aux_layer_19": 0.240966796875, "loss_aux_layer_2": 0.150634765625, "loss_aux_layer_20": 0.2421875, "loss_aux_layer_21": 0.243896484375, "loss_aux_layer_22": 0.2646484375, "loss_aux_layer_23": 0.3115234375, "loss_aux_layer_3": 0.157958984375, "loss_aux_layer_4": 0.15576171875, "loss_aux_layer_5": 0.15625, "loss_aux_layer_6": 0.1533203125, "loss_aux_layer_7": 0.151611328125, "loss_aux_layer_8": 0.1494140625, "loss_aux_layer_9": 0.1474609375, "step": 368, "total_loss": 0.7865544855594635 }, { "epoch": 0.07305484062561869, "grad_norm": 1.2220590114593506, "learning_rate": 5e-05, "llm_loss": 0.6154468432068825, "loss": 3.2084, "loss_aux_layer_0": 0.037841796875, "loss_aux_layer_1": 0.12451171875, "loss_aux_layer_10": 0.151611328125, "loss_aux_layer_11": 0.1611328125, "loss_aux_layer_12": 0.172119140625, "loss_aux_layer_13": 0.18359375, "loss_aux_layer_14": 0.200927734375, "loss_aux_layer_15": 0.215087890625, "loss_aux_layer_16": 0.228759765625, "loss_aux_layer_17": 0.23193359375, "loss_aux_layer_18": 0.2412109375, "loss_aux_layer_19": 0.2373046875, "loss_aux_layer_2": 0.155029296875, "loss_aux_layer_20": 0.238525390625, "loss_aux_layer_21": 0.240966796875, "loss_aux_layer_22": 0.26220703125, "loss_aux_layer_23": 0.30908203125, "loss_aux_layer_3": 0.15966796875, "loss_aux_layer_4": 0.157958984375, "loss_aux_layer_5": 0.159423828125, "loss_aux_layer_6": 0.157470703125, "loss_aux_layer_7": 0.15283203125, "loss_aux_layer_8": 0.151123046875, "loss_aux_layer_9": 0.14990234375, "step": 369, "total_loss": 0.8021088689565659 }, { "epoch": 0.0732528212235201, "grad_norm": 1.4612103700637817, "learning_rate": 5e-05, "llm_loss": 0.7178976088762283, "loss": 3.6121, "loss_aux_layer_0": 0.03997802734375, "loss_aux_layer_1": 0.1231689453125, "loss_aux_layer_10": 0.14892578125, "loss_aux_layer_11": 0.158447265625, "loss_aux_layer_12": 0.169921875, "loss_aux_layer_13": 0.1826171875, "loss_aux_layer_14": 0.19921875, "loss_aux_layer_15": 0.21337890625, "loss_aux_layer_16": 0.227783203125, "loss_aux_layer_17": 0.232666015625, "loss_aux_layer_18": 0.241943359375, "loss_aux_layer_19": 0.23828125, "loss_aux_layer_2": 0.1513671875, "loss_aux_layer_20": 0.23974609375, "loss_aux_layer_21": 0.241943359375, "loss_aux_layer_22": 0.263671875, "loss_aux_layer_23": 0.31005859375, "loss_aux_layer_3": 0.154541015625, "loss_aux_layer_4": 0.153076171875, "loss_aux_layer_5": 0.155029296875, "loss_aux_layer_6": 0.15380859375, "loss_aux_layer_7": 0.148681640625, "loss_aux_layer_8": 0.147216796875, "loss_aux_layer_9": 0.146240234375, "step": 370, "total_loss": 0.903014674782753 }, { "epoch": 0.0734508018214215, "grad_norm": 0.926396369934082, "learning_rate": 5e-05, "llm_loss": 0.6821043789386749, "loss": 3.4358, "loss_aux_layer_0": 0.03668212890625, "loss_aux_layer_1": 0.1148681640625, "loss_aux_layer_10": 0.140625, "loss_aux_layer_11": 0.1484375, "loss_aux_layer_12": 0.16015625, "loss_aux_layer_13": 0.17236328125, "loss_aux_layer_14": 0.18896484375, "loss_aux_layer_15": 0.2041015625, "loss_aux_layer_16": 0.21875, "loss_aux_layer_17": 0.2236328125, "loss_aux_layer_18": 0.234375, "loss_aux_layer_19": 0.23291015625, "loss_aux_layer_2": 0.14111328125, "loss_aux_layer_20": 0.23388671875, "loss_aux_layer_21": 0.23583984375, "loss_aux_layer_22": 0.2548828125, "loss_aux_layer_23": 0.30224609375, "loss_aux_layer_3": 0.14599609375, "loss_aux_layer_4": 0.144775390625, "loss_aux_layer_5": 0.146484375, "loss_aux_layer_6": 0.144287109375, "loss_aux_layer_7": 0.140869140625, "loss_aux_layer_8": 0.139404296875, "loss_aux_layer_9": 0.138671875, "step": 371, "total_loss": 0.8589613884687424 }, { "epoch": 0.0736487824193229, "grad_norm": 1.197955846786499, "learning_rate": 5e-05, "llm_loss": 0.6265980899333954, "loss": 3.204, "loss_aux_layer_0": 0.03619384765625, "loss_aux_layer_1": 0.11669921875, "loss_aux_layer_10": 0.13818359375, "loss_aux_layer_11": 0.145263671875, "loss_aux_layer_12": 0.156005859375, "loss_aux_layer_13": 0.167724609375, "loss_aux_layer_14": 0.184814453125, "loss_aux_layer_15": 0.19921875, "loss_aux_layer_16": 0.213623046875, "loss_aux_layer_17": 0.22021484375, "loss_aux_layer_18": 0.23046875, "loss_aux_layer_19": 0.228515625, "loss_aux_layer_2": 0.13916015625, "loss_aux_layer_20": 0.22998046875, "loss_aux_layer_21": 0.233154296875, "loss_aux_layer_22": 0.25439453125, "loss_aux_layer_23": 0.29931640625, "loss_aux_layer_3": 0.14697265625, "loss_aux_layer_4": 0.145263671875, "loss_aux_layer_5": 0.145751953125, "loss_aux_layer_6": 0.14208984375, "loss_aux_layer_7": 0.138916015625, "loss_aux_layer_8": 0.1376953125, "loss_aux_layer_9": 0.1357421875, "step": 372, "total_loss": 0.801002562046051 }, { "epoch": 0.07384676301722432, "grad_norm": 1.516467809677124, "learning_rate": 5e-05, "llm_loss": 0.6913848966360092, "loss": 3.4733, "loss_aux_layer_0": 0.0401611328125, "loss_aux_layer_1": 0.1226806640625, "loss_aux_layer_10": 0.142578125, "loss_aux_layer_11": 0.14990234375, "loss_aux_layer_12": 0.16064453125, "loss_aux_layer_13": 0.171630859375, "loss_aux_layer_14": 0.18798828125, "loss_aux_layer_15": 0.200927734375, "loss_aux_layer_16": 0.21484375, "loss_aux_layer_17": 0.21875, "loss_aux_layer_18": 0.227783203125, "loss_aux_layer_19": 0.2255859375, "loss_aux_layer_2": 0.143310546875, "loss_aux_layer_20": 0.22900390625, "loss_aux_layer_21": 0.231201171875, "loss_aux_layer_22": 0.251220703125, "loss_aux_layer_23": 0.29638671875, "loss_aux_layer_3": 0.152099609375, "loss_aux_layer_4": 0.15087890625, "loss_aux_layer_5": 0.15185546875, "loss_aux_layer_6": 0.1484375, "loss_aux_layer_7": 0.14453125, "loss_aux_layer_8": 0.143310546875, "loss_aux_layer_9": 0.140869140625, "step": 373, "total_loss": 0.868316113948822 }, { "epoch": 0.07404474361512572, "grad_norm": 1.1411479711532593, "learning_rate": 5e-05, "llm_loss": 0.6857805997133255, "loss": 3.4535, "loss_aux_layer_0": 0.038330078125, "loss_aux_layer_1": 0.1243896484375, "loss_aux_layer_10": 0.142333984375, "loss_aux_layer_11": 0.150390625, "loss_aux_layer_12": 0.161865234375, "loss_aux_layer_13": 0.17333984375, "loss_aux_layer_14": 0.18994140625, "loss_aux_layer_15": 0.203369140625, "loss_aux_layer_16": 0.21630859375, "loss_aux_layer_17": 0.22021484375, "loss_aux_layer_18": 0.229736328125, "loss_aux_layer_19": 0.2265625, "loss_aux_layer_2": 0.145751953125, "loss_aux_layer_20": 0.228271484375, "loss_aux_layer_21": 0.229736328125, "loss_aux_layer_22": 0.25, "loss_aux_layer_23": 0.294921875, "loss_aux_layer_3": 0.15380859375, "loss_aux_layer_4": 0.15234375, "loss_aux_layer_5": 0.15283203125, "loss_aux_layer_6": 0.1494140625, "loss_aux_layer_7": 0.144775390625, "loss_aux_layer_8": 0.142822265625, "loss_aux_layer_9": 0.140869140625, "step": 374, "total_loss": 0.8633766919374466 }, { "epoch": 0.07424272421302712, "grad_norm": 1.529384732246399, "learning_rate": 5e-05, "llm_loss": 0.7641306221485138, "loss": 3.7556, "loss_aux_layer_0": 0.03668212890625, "loss_aux_layer_1": 0.1204833984375, "loss_aux_layer_10": 0.137939453125, "loss_aux_layer_11": 0.145751953125, "loss_aux_layer_12": 0.15771484375, "loss_aux_layer_13": 0.169189453125, "loss_aux_layer_14": 0.18603515625, "loss_aux_layer_15": 0.2001953125, "loss_aux_layer_16": 0.214599609375, "loss_aux_layer_17": 0.2197265625, "loss_aux_layer_18": 0.230224609375, "loss_aux_layer_19": 0.228515625, "loss_aux_layer_2": 0.140625, "loss_aux_layer_20": 0.23095703125, "loss_aux_layer_21": 0.231689453125, "loss_aux_layer_22": 0.2509765625, "loss_aux_layer_23": 0.29443359375, "loss_aux_layer_3": 0.14794921875, "loss_aux_layer_4": 0.147216796875, "loss_aux_layer_5": 0.147216796875, "loss_aux_layer_6": 0.14404296875, "loss_aux_layer_7": 0.13916015625, "loss_aux_layer_8": 0.137451171875, "loss_aux_layer_9": 0.13623046875, "step": 375, "total_loss": 0.9388989359140396 }, { "epoch": 0.07444070481092853, "grad_norm": 0.9374645352363586, "learning_rate": 5e-05, "llm_loss": 0.6557023674249649, "loss": 3.3368, "loss_aux_layer_0": 0.041748046875, "loss_aux_layer_1": 0.1229248046875, "loss_aux_layer_10": 0.141845703125, "loss_aux_layer_11": 0.150146484375, "loss_aux_layer_12": 0.1611328125, "loss_aux_layer_13": 0.172119140625, "loss_aux_layer_14": 0.18896484375, "loss_aux_layer_15": 0.202880859375, "loss_aux_layer_16": 0.21728515625, "loss_aux_layer_17": 0.221923828125, "loss_aux_layer_18": 0.232421875, "loss_aux_layer_19": 0.23046875, "loss_aux_layer_2": 0.143798828125, "loss_aux_layer_20": 0.232666015625, "loss_aux_layer_21": 0.235107421875, "loss_aux_layer_22": 0.25732421875, "loss_aux_layer_23": 0.30224609375, "loss_aux_layer_3": 0.151123046875, "loss_aux_layer_4": 0.151123046875, "loss_aux_layer_5": 0.151611328125, "loss_aux_layer_6": 0.149169921875, "loss_aux_layer_7": 0.143798828125, "loss_aux_layer_8": 0.14208984375, "loss_aux_layer_9": 0.14013671875, "step": 376, "total_loss": 0.8341880589723587 }, { "epoch": 0.07463868540882994, "grad_norm": 1.4000892639160156, "learning_rate": 5e-05, "llm_loss": 0.6451012790203094, "loss": 3.2743, "loss_aux_layer_0": 0.03472900390625, "loss_aux_layer_1": 0.1180419921875, "loss_aux_layer_10": 0.1376953125, "loss_aux_layer_11": 0.145263671875, "loss_aux_layer_12": 0.155517578125, "loss_aux_layer_13": 0.166259765625, "loss_aux_layer_14": 0.18310546875, "loss_aux_layer_15": 0.19775390625, "loss_aux_layer_16": 0.212646484375, "loss_aux_layer_17": 0.21826171875, "loss_aux_layer_18": 0.229248046875, "loss_aux_layer_19": 0.22705078125, "loss_aux_layer_2": 0.1376953125, "loss_aux_layer_20": 0.22998046875, "loss_aux_layer_21": 0.232177734375, "loss_aux_layer_22": 0.251953125, "loss_aux_layer_23": 0.29638671875, "loss_aux_layer_3": 0.144775390625, "loss_aux_layer_4": 0.144775390625, "loss_aux_layer_5": 0.1455078125, "loss_aux_layer_6": 0.143310546875, "loss_aux_layer_7": 0.137939453125, "loss_aux_layer_8": 0.13720703125, "loss_aux_layer_9": 0.1357421875, "step": 377, "total_loss": 0.8185835480690002 }, { "epoch": 0.07483666600673133, "grad_norm": 1.2198799848556519, "learning_rate": 5e-05, "llm_loss": 0.7411276996135712, "loss": 3.6568, "loss_aux_layer_0": 0.03802490234375, "loss_aux_layer_1": 0.1162109375, "loss_aux_layer_10": 0.13525390625, "loss_aux_layer_11": 0.143798828125, "loss_aux_layer_12": 0.155029296875, "loss_aux_layer_13": 0.16650390625, "loss_aux_layer_14": 0.184326171875, "loss_aux_layer_15": 0.199462890625, "loss_aux_layer_16": 0.21435546875, "loss_aux_layer_17": 0.220703125, "loss_aux_layer_18": 0.22998046875, "loss_aux_layer_19": 0.22900390625, "loss_aux_layer_2": 0.13427734375, "loss_aux_layer_20": 0.23095703125, "loss_aux_layer_21": 0.23291015625, "loss_aux_layer_22": 0.255126953125, "loss_aux_layer_23": 0.302734375, "loss_aux_layer_3": 0.1416015625, "loss_aux_layer_4": 0.140625, "loss_aux_layer_5": 0.140869140625, "loss_aux_layer_6": 0.13916015625, "loss_aux_layer_7": 0.13427734375, "loss_aux_layer_8": 0.13427734375, "loss_aux_layer_9": 0.13330078125, "step": 378, "total_loss": 0.9141981303691864 }, { "epoch": 0.07503464660463274, "grad_norm": 1.485983967781067, "learning_rate": 5e-05, "llm_loss": 0.6199405342340469, "loss": 3.2055, "loss_aux_layer_0": 0.0394287109375, "loss_aux_layer_1": 0.126708984375, "loss_aux_layer_10": 0.145263671875, "loss_aux_layer_11": 0.1533203125, "loss_aux_layer_12": 0.16455078125, "loss_aux_layer_13": 0.17431640625, "loss_aux_layer_14": 0.18994140625, "loss_aux_layer_15": 0.203857421875, "loss_aux_layer_16": 0.218994140625, "loss_aux_layer_17": 0.22412109375, "loss_aux_layer_18": 0.233642578125, "loss_aux_layer_19": 0.23193359375, "loss_aux_layer_2": 0.1474609375, "loss_aux_layer_20": 0.233642578125, "loss_aux_layer_21": 0.23828125, "loss_aux_layer_22": 0.25927734375, "loss_aux_layer_23": 0.306640625, "loss_aux_layer_3": 0.156005859375, "loss_aux_layer_4": 0.156005859375, "loss_aux_layer_5": 0.157470703125, "loss_aux_layer_6": 0.15478515625, "loss_aux_layer_7": 0.1484375, "loss_aux_layer_8": 0.146240234375, "loss_aux_layer_9": 0.14404296875, "step": 379, "total_loss": 0.8013789653778076 }, { "epoch": 0.07523262720253415, "grad_norm": 1.1823585033416748, "learning_rate": 5e-05, "llm_loss": 0.6136908084154129, "loss": 3.1476, "loss_aux_layer_0": 0.03680419921875, "loss_aux_layer_1": 0.1212158203125, "loss_aux_layer_10": 0.13623046875, "loss_aux_layer_11": 0.144287109375, "loss_aux_layer_12": 0.1552734375, "loss_aux_layer_13": 0.166259765625, "loss_aux_layer_14": 0.18310546875, "loss_aux_layer_15": 0.196533203125, "loss_aux_layer_16": 0.211181640625, "loss_aux_layer_17": 0.21728515625, "loss_aux_layer_18": 0.2275390625, "loss_aux_layer_19": 0.224853515625, "loss_aux_layer_2": 0.138916015625, "loss_aux_layer_20": 0.226806640625, "loss_aux_layer_21": 0.229248046875, "loss_aux_layer_22": 0.25, "loss_aux_layer_23": 0.294921875, "loss_aux_layer_3": 0.14697265625, "loss_aux_layer_4": 0.14697265625, "loss_aux_layer_5": 0.1474609375, "loss_aux_layer_6": 0.145263671875, "loss_aux_layer_7": 0.138427734375, "loss_aux_layer_8": 0.1368408203125, "loss_aux_layer_9": 0.1348876953125, "step": 380, "total_loss": 0.7868993580341339 }, { "epoch": 0.07543060780043556, "grad_norm": 1.49400794506073, "learning_rate": 5e-05, "llm_loss": 0.6875873059034348, "loss": 3.4486, "loss_aux_layer_0": 0.04010009765625, "loss_aux_layer_1": 0.1256103515625, "loss_aux_layer_10": 0.13720703125, "loss_aux_layer_11": 0.14501953125, "loss_aux_layer_12": 0.156005859375, "loss_aux_layer_13": 0.166259765625, "loss_aux_layer_14": 0.18359375, "loss_aux_layer_15": 0.196533203125, "loss_aux_layer_16": 0.211181640625, "loss_aux_layer_17": 0.216552734375, "loss_aux_layer_18": 0.226806640625, "loss_aux_layer_19": 0.225341796875, "loss_aux_layer_2": 0.1416015625, "loss_aux_layer_20": 0.227783203125, "loss_aux_layer_21": 0.23046875, "loss_aux_layer_22": 0.253173828125, "loss_aux_layer_23": 0.298828125, "loss_aux_layer_3": 0.1494140625, "loss_aux_layer_4": 0.14892578125, "loss_aux_layer_5": 0.14892578125, "loss_aux_layer_6": 0.147216796875, "loss_aux_layer_7": 0.14013671875, "loss_aux_layer_8": 0.13818359375, "loss_aux_layer_9": 0.135986328125, "step": 381, "total_loss": 0.8621496111154556 }, { "epoch": 0.07562858839833696, "grad_norm": 0.8629288077354431, "learning_rate": 5e-05, "llm_loss": 0.6785987764596939, "loss": 3.3975, "loss_aux_layer_0": 0.03570556640625, "loss_aux_layer_1": 0.1168212890625, "loss_aux_layer_10": 0.13427734375, "loss_aux_layer_11": 0.142333984375, "loss_aux_layer_12": 0.1533203125, "loss_aux_layer_13": 0.163818359375, "loss_aux_layer_14": 0.181396484375, "loss_aux_layer_15": 0.1962890625, "loss_aux_layer_16": 0.210205078125, "loss_aux_layer_17": 0.2158203125, "loss_aux_layer_18": 0.22607421875, "loss_aux_layer_19": 0.22509765625, "loss_aux_layer_2": 0.1326904296875, "loss_aux_layer_20": 0.2265625, "loss_aux_layer_21": 0.22802734375, "loss_aux_layer_22": 0.248779296875, "loss_aux_layer_23": 0.29443359375, "loss_aux_layer_3": 0.140869140625, "loss_aux_layer_4": 0.14111328125, "loss_aux_layer_5": 0.142333984375, "loss_aux_layer_6": 0.140869140625, "loss_aux_layer_7": 0.134521484375, "loss_aux_layer_8": 0.1337890625, "loss_aux_layer_9": 0.1326904296875, "step": 382, "total_loss": 0.8493810743093491 }, { "epoch": 0.07582656899623837, "grad_norm": 0.6859288215637207, "learning_rate": 5e-05, "llm_loss": 0.6241410672664642, "loss": 3.1634, "loss_aux_layer_0": 0.0347900390625, "loss_aux_layer_1": 0.1162109375, "loss_aux_layer_10": 0.1285400390625, "loss_aux_layer_11": 0.135986328125, "loss_aux_layer_12": 0.146728515625, "loss_aux_layer_13": 0.15771484375, "loss_aux_layer_14": 0.175048828125, "loss_aux_layer_15": 0.1904296875, "loss_aux_layer_16": 0.205322265625, "loss_aux_layer_17": 0.211181640625, "loss_aux_layer_18": 0.221923828125, "loss_aux_layer_19": 0.219970703125, "loss_aux_layer_2": 0.1317138671875, "loss_aux_layer_20": 0.22265625, "loss_aux_layer_21": 0.224853515625, "loss_aux_layer_22": 0.245849609375, "loss_aux_layer_23": 0.29248046875, "loss_aux_layer_3": 0.1395263671875, "loss_aux_layer_4": 0.138427734375, "loss_aux_layer_5": 0.138427734375, "loss_aux_layer_6": 0.1365966796875, "loss_aux_layer_7": 0.129638671875, "loss_aux_layer_8": 0.1290283203125, "loss_aux_layer_9": 0.12744140625, "step": 383, "total_loss": 0.7908522188663483 }, { "epoch": 0.07602454959413978, "grad_norm": 1.1137163639068604, "learning_rate": 5e-05, "llm_loss": 0.6569514125585556, "loss": 3.2588, "loss_aux_layer_0": 0.035888671875, "loss_aux_layer_1": 0.1046142578125, "loss_aux_layer_10": 0.1192626953125, "loss_aux_layer_11": 0.12548828125, "loss_aux_layer_12": 0.136962890625, "loss_aux_layer_13": 0.148193359375, "loss_aux_layer_14": 0.166748046875, "loss_aux_layer_15": 0.182373046875, "loss_aux_layer_16": 0.197509765625, "loss_aux_layer_17": 0.20458984375, "loss_aux_layer_18": 0.2158203125, "loss_aux_layer_19": 0.21484375, "loss_aux_layer_2": 0.116943359375, "loss_aux_layer_20": 0.21875, "loss_aux_layer_21": 0.220947265625, "loss_aux_layer_22": 0.239013671875, "loss_aux_layer_23": 0.28515625, "loss_aux_layer_3": 0.12451171875, "loss_aux_layer_4": 0.124267578125, "loss_aux_layer_5": 0.1251220703125, "loss_aux_layer_6": 0.124267578125, "loss_aux_layer_7": 0.1185302734375, "loss_aux_layer_8": 0.118408203125, "loss_aux_layer_9": 0.1175537109375, "step": 384, "total_loss": 0.8146958500146866 }, { "epoch": 0.07622253019204119, "grad_norm": 1.249002456665039, "learning_rate": 5e-05, "llm_loss": 0.6948186010122299, "loss": 3.4545, "loss_aux_layer_0": 0.03668212890625, "loss_aux_layer_1": 0.11865234375, "loss_aux_layer_10": 0.1322021484375, "loss_aux_layer_11": 0.139404296875, "loss_aux_layer_12": 0.150390625, "loss_aux_layer_13": 0.161376953125, "loss_aux_layer_14": 0.1787109375, "loss_aux_layer_15": 0.193115234375, "loss_aux_layer_16": 0.207275390625, "loss_aux_layer_17": 0.213134765625, "loss_aux_layer_18": 0.22314453125, "loss_aux_layer_19": 0.2216796875, "loss_aux_layer_2": 0.131591796875, "loss_aux_layer_20": 0.224853515625, "loss_aux_layer_21": 0.226806640625, "loss_aux_layer_22": 0.24658203125, "loss_aux_layer_23": 0.2919921875, "loss_aux_layer_3": 0.139892578125, "loss_aux_layer_4": 0.139404296875, "loss_aux_layer_5": 0.139892578125, "loss_aux_layer_6": 0.138916015625, "loss_aux_layer_7": 0.132080078125, "loss_aux_layer_8": 0.1314697265625, "loss_aux_layer_9": 0.1307373046875, "step": 385, "total_loss": 0.8636137098073959 }, { "epoch": 0.07642051078994258, "grad_norm": 1.395763874053955, "learning_rate": 5e-05, "llm_loss": 0.6135871112346649, "loss": 3.1346, "loss_aux_layer_0": 0.03631591796875, "loss_aux_layer_1": 0.119140625, "loss_aux_layer_10": 0.133056640625, "loss_aux_layer_11": 0.140869140625, "loss_aux_layer_12": 0.151611328125, "loss_aux_layer_13": 0.16259765625, "loss_aux_layer_14": 0.17919921875, "loss_aux_layer_15": 0.19384765625, "loss_aux_layer_16": 0.208251953125, "loss_aux_layer_17": 0.2138671875, "loss_aux_layer_18": 0.224609375, "loss_aux_layer_19": 0.222900390625, "loss_aux_layer_2": 0.13330078125, "loss_aux_layer_20": 0.22509765625, "loss_aux_layer_21": 0.227294921875, "loss_aux_layer_22": 0.248779296875, "loss_aux_layer_23": 0.29443359375, "loss_aux_layer_3": 0.141845703125, "loss_aux_layer_4": 0.14208984375, "loss_aux_layer_5": 0.142333984375, "loss_aux_layer_6": 0.14111328125, "loss_aux_layer_7": 0.13427734375, "loss_aux_layer_8": 0.13330078125, "loss_aux_layer_9": 0.131591796875, "step": 386, "total_loss": 0.7836450338363647 }, { "epoch": 0.07661849138784399, "grad_norm": 0.7372751235961914, "learning_rate": 5e-05, "llm_loss": 0.6772238165140152, "loss": 3.3734, "loss_aux_layer_0": 0.0374755859375, "loss_aux_layer_1": 0.1148681640625, "loss_aux_layer_10": 0.12890625, "loss_aux_layer_11": 0.13623046875, "loss_aux_layer_12": 0.146484375, "loss_aux_layer_13": 0.15673828125, "loss_aux_layer_14": 0.174072265625, "loss_aux_layer_15": 0.18994140625, "loss_aux_layer_16": 0.205322265625, "loss_aux_layer_17": 0.21142578125, "loss_aux_layer_18": 0.221923828125, "loss_aux_layer_19": 0.220458984375, "loss_aux_layer_2": 0.127197265625, "loss_aux_layer_20": 0.222900390625, "loss_aux_layer_21": 0.224609375, "loss_aux_layer_22": 0.244873046875, "loss_aux_layer_23": 0.291015625, "loss_aux_layer_3": 0.1357421875, "loss_aux_layer_4": 0.135986328125, "loss_aux_layer_5": 0.136962890625, "loss_aux_layer_6": 0.13623046875, "loss_aux_layer_7": 0.129638671875, "loss_aux_layer_8": 0.1287841796875, "loss_aux_layer_9": 0.1278076171875, "step": 387, "total_loss": 0.8433406949043274 }, { "epoch": 0.0768164719857454, "grad_norm": 1.1586558818817139, "learning_rate": 5e-05, "llm_loss": 0.6485741585493088, "loss": 3.2611, "loss_aux_layer_0": 0.0355224609375, "loss_aux_layer_1": 0.116455078125, "loss_aux_layer_10": 0.1287841796875, "loss_aux_layer_11": 0.135986328125, "loss_aux_layer_12": 0.14697265625, "loss_aux_layer_13": 0.158203125, "loss_aux_layer_14": 0.17626953125, "loss_aux_layer_15": 0.19091796875, "loss_aux_layer_16": 0.204833984375, "loss_aux_layer_17": 0.21142578125, "loss_aux_layer_18": 0.2216796875, "loss_aux_layer_19": 0.220703125, "loss_aux_layer_2": 0.1279296875, "loss_aux_layer_20": 0.224609375, "loss_aux_layer_21": 0.226318359375, "loss_aux_layer_22": 0.24658203125, "loss_aux_layer_23": 0.291015625, "loss_aux_layer_3": 0.13671875, "loss_aux_layer_4": 0.136962890625, "loss_aux_layer_5": 0.1376953125, "loss_aux_layer_6": 0.13720703125, "loss_aux_layer_7": 0.1304931640625, "loss_aux_layer_8": 0.12939453125, "loss_aux_layer_9": 0.1273193359375, "step": 388, "total_loss": 0.815268024802208 }, { "epoch": 0.07701445258364681, "grad_norm": 1.1057524681091309, "learning_rate": 5e-05, "llm_loss": 0.7120161205530167, "loss": 3.4971, "loss_aux_layer_0": 0.033966064453125, "loss_aux_layer_1": 0.1090087890625, "loss_aux_layer_10": 0.124267578125, "loss_aux_layer_11": 0.1314697265625, "loss_aux_layer_12": 0.1427001953125, "loss_aux_layer_13": 0.15380859375, "loss_aux_layer_14": 0.171875, "loss_aux_layer_15": 0.1875, "loss_aux_layer_16": 0.20361328125, "loss_aux_layer_17": 0.21044921875, "loss_aux_layer_18": 0.220458984375, "loss_aux_layer_19": 0.22021484375, "loss_aux_layer_2": 0.1195068359375, "loss_aux_layer_20": 0.222900390625, "loss_aux_layer_21": 0.224365234375, "loss_aux_layer_22": 0.242919921875, "loss_aux_layer_23": 0.28857421875, "loss_aux_layer_3": 0.1280517578125, "loss_aux_layer_4": 0.1290283203125, "loss_aux_layer_5": 0.130126953125, "loss_aux_layer_6": 0.130126953125, "loss_aux_layer_7": 0.123779296875, "loss_aux_layer_8": 0.1231689453125, "loss_aux_layer_9": 0.122802734375, "step": 389, "total_loss": 0.8742745965719223 }, { "epoch": 0.0772124331815482, "grad_norm": 1.3262966871261597, "learning_rate": 5e-05, "llm_loss": 0.6163360327482224, "loss": 3.1604, "loss_aux_layer_0": 0.03607177734375, "loss_aux_layer_1": 0.1239013671875, "loss_aux_layer_10": 0.13720703125, "loss_aux_layer_11": 0.145263671875, "loss_aux_layer_12": 0.156005859375, "loss_aux_layer_13": 0.166259765625, "loss_aux_layer_14": 0.183837890625, "loss_aux_layer_15": 0.197998046875, "loss_aux_layer_16": 0.21337890625, "loss_aux_layer_17": 0.218505859375, "loss_aux_layer_18": 0.22900390625, "loss_aux_layer_19": 0.226806640625, "loss_aux_layer_2": 0.13720703125, "loss_aux_layer_20": 0.228271484375, "loss_aux_layer_21": 0.22900390625, "loss_aux_layer_22": 0.25048828125, "loss_aux_layer_23": 0.2958984375, "loss_aux_layer_3": 0.146240234375, "loss_aux_layer_4": 0.146484375, "loss_aux_layer_5": 0.146728515625, "loss_aux_layer_6": 0.14599609375, "loss_aux_layer_7": 0.138427734375, "loss_aux_layer_8": 0.13720703125, "loss_aux_layer_9": 0.1357421875, "step": 390, "total_loss": 0.7901116907596588 }, { "epoch": 0.07741041377944961, "grad_norm": 1.2576546669006348, "learning_rate": 5e-05, "llm_loss": 0.6171857714653015, "loss": 3.1244, "loss_aux_layer_0": 0.037353515625, "loss_aux_layer_1": 0.1126708984375, "loss_aux_layer_10": 0.12548828125, "loss_aux_layer_11": 0.1328125, "loss_aux_layer_12": 0.143798828125, "loss_aux_layer_13": 0.155029296875, "loss_aux_layer_14": 0.172607421875, "loss_aux_layer_15": 0.18798828125, "loss_aux_layer_16": 0.20263671875, "loss_aux_layer_17": 0.208984375, "loss_aux_layer_18": 0.21923828125, "loss_aux_layer_19": 0.218994140625, "loss_aux_layer_2": 0.1246337890625, "loss_aux_layer_20": 0.222412109375, "loss_aux_layer_21": 0.225341796875, "loss_aux_layer_22": 0.247314453125, "loss_aux_layer_23": 0.2939453125, "loss_aux_layer_3": 0.13232421875, "loss_aux_layer_4": 0.132080078125, "loss_aux_layer_5": 0.1328125, "loss_aux_layer_6": 0.1318359375, "loss_aux_layer_7": 0.124755859375, "loss_aux_layer_8": 0.124755859375, "loss_aux_layer_9": 0.1240234375, "step": 391, "total_loss": 0.7811073213815689 }, { "epoch": 0.07760839437735102, "grad_norm": 0.7563461065292358, "learning_rate": 5e-05, "llm_loss": 0.6060348898172379, "loss": 3.0986, "loss_aux_layer_0": 0.03668212890625, "loss_aux_layer_1": 0.1221923828125, "loss_aux_layer_10": 0.1322021484375, "loss_aux_layer_11": 0.139404296875, "loss_aux_layer_12": 0.150146484375, "loss_aux_layer_13": 0.161865234375, "loss_aux_layer_14": 0.178955078125, "loss_aux_layer_15": 0.193115234375, "loss_aux_layer_16": 0.2080078125, "loss_aux_layer_17": 0.212646484375, "loss_aux_layer_18": 0.22265625, "loss_aux_layer_19": 0.22021484375, "loss_aux_layer_2": 0.1322021484375, "loss_aux_layer_20": 0.22216796875, "loss_aux_layer_21": 0.223388671875, "loss_aux_layer_22": 0.24462890625, "loss_aux_layer_23": 0.2880859375, "loss_aux_layer_3": 0.140869140625, "loss_aux_layer_4": 0.140869140625, "loss_aux_layer_5": 0.1416015625, "loss_aux_layer_6": 0.14013671875, "loss_aux_layer_7": 0.1329345703125, "loss_aux_layer_8": 0.1319580078125, "loss_aux_layer_9": 0.1302490234375, "step": 392, "total_loss": 0.7746570706367493 }, { "epoch": 0.07780637497525243, "grad_norm": 1.724058747291565, "learning_rate": 5e-05, "llm_loss": 0.6310302913188934, "loss": 3.1863, "loss_aux_layer_0": 0.03662109375, "loss_aux_layer_1": 0.1199951171875, "loss_aux_layer_10": 0.1279296875, "loss_aux_layer_11": 0.13525390625, "loss_aux_layer_12": 0.1455078125, "loss_aux_layer_13": 0.156494140625, "loss_aux_layer_14": 0.172607421875, "loss_aux_layer_15": 0.187255859375, "loss_aux_layer_16": 0.201904296875, "loss_aux_layer_17": 0.209228515625, "loss_aux_layer_18": 0.218994140625, "loss_aux_layer_19": 0.2177734375, "loss_aux_layer_2": 0.1292724609375, "loss_aux_layer_20": 0.22119140625, "loss_aux_layer_21": 0.222900390625, "loss_aux_layer_22": 0.243408203125, "loss_aux_layer_23": 0.28857421875, "loss_aux_layer_3": 0.1376953125, "loss_aux_layer_4": 0.138671875, "loss_aux_layer_5": 0.13916015625, "loss_aux_layer_6": 0.13818359375, "loss_aux_layer_7": 0.1300048828125, "loss_aux_layer_8": 0.12841796875, "loss_aux_layer_9": 0.1265869140625, "step": 393, "total_loss": 0.7965722680091858 }, { "epoch": 0.07800435557315383, "grad_norm": 0.7032169103622437, "learning_rate": 5e-05, "llm_loss": 0.7577840834856033, "loss": 3.6905, "loss_aux_layer_0": 0.037353515625, "loss_aux_layer_1": 0.1187744140625, "loss_aux_layer_10": 0.128173828125, "loss_aux_layer_11": 0.135009765625, "loss_aux_layer_12": 0.1455078125, "loss_aux_layer_13": 0.15673828125, "loss_aux_layer_14": 0.174072265625, "loss_aux_layer_15": 0.18896484375, "loss_aux_layer_16": 0.20263671875, "loss_aux_layer_17": 0.208984375, "loss_aux_layer_18": 0.218505859375, "loss_aux_layer_19": 0.216064453125, "loss_aux_layer_2": 0.128662109375, "loss_aux_layer_20": 0.21923828125, "loss_aux_layer_21": 0.219482421875, "loss_aux_layer_22": 0.23974609375, "loss_aux_layer_23": 0.28369140625, "loss_aux_layer_3": 0.136962890625, "loss_aux_layer_4": 0.1376953125, "loss_aux_layer_5": 0.13818359375, "loss_aux_layer_6": 0.1376953125, "loss_aux_layer_7": 0.1298828125, "loss_aux_layer_8": 0.12841796875, "loss_aux_layer_9": 0.126708984375, "step": 394, "total_loss": 0.922614574432373 }, { "epoch": 0.07820233617105524, "grad_norm": 1.7668589353561401, "learning_rate": 5e-05, "llm_loss": 0.6528612524271011, "loss": 3.2529, "loss_aux_layer_0": 0.03472900390625, "loss_aux_layer_1": 0.1104736328125, "loss_aux_layer_10": 0.122314453125, "loss_aux_layer_11": 0.1290283203125, "loss_aux_layer_12": 0.139404296875, "loss_aux_layer_13": 0.14990234375, "loss_aux_layer_14": 0.1669921875, "loss_aux_layer_15": 0.182373046875, "loss_aux_layer_16": 0.19775390625, "loss_aux_layer_17": 0.2041015625, "loss_aux_layer_18": 0.21484375, "loss_aux_layer_19": 0.214111328125, "loss_aux_layer_2": 0.122314453125, "loss_aux_layer_20": 0.218017578125, "loss_aux_layer_21": 0.22021484375, "loss_aux_layer_22": 0.24267578125, "loss_aux_layer_23": 0.28955078125, "loss_aux_layer_3": 0.13037109375, "loss_aux_layer_4": 0.1304931640625, "loss_aux_layer_5": 0.1314697265625, "loss_aux_layer_6": 0.13037109375, "loss_aux_layer_7": 0.123046875, "loss_aux_layer_8": 0.12255859375, "loss_aux_layer_9": 0.1209716796875, "step": 395, "total_loss": 0.813219353556633 }, { "epoch": 0.07840031676895665, "grad_norm": 1.6041619777679443, "learning_rate": 5e-05, "llm_loss": 0.8138559907674789, "loss": 3.928, "loss_aux_layer_0": 0.03570556640625, "loss_aux_layer_1": 0.11962890625, "loss_aux_layer_10": 0.1314697265625, "loss_aux_layer_11": 0.13916015625, "loss_aux_layer_12": 0.150146484375, "loss_aux_layer_13": 0.1611328125, "loss_aux_layer_14": 0.17822265625, "loss_aux_layer_15": 0.19287109375, "loss_aux_layer_16": 0.20654296875, "loss_aux_layer_17": 0.212646484375, "loss_aux_layer_18": 0.222412109375, "loss_aux_layer_19": 0.22021484375, "loss_aux_layer_2": 0.1300048828125, "loss_aux_layer_20": 0.222900390625, "loss_aux_layer_21": 0.223388671875, "loss_aux_layer_22": 0.243896484375, "loss_aux_layer_23": 0.28759765625, "loss_aux_layer_3": 0.139404296875, "loss_aux_layer_4": 0.140380859375, "loss_aux_layer_5": 0.141357421875, "loss_aux_layer_6": 0.1412353515625, "loss_aux_layer_7": 0.1331787109375, "loss_aux_layer_8": 0.1318359375, "loss_aux_layer_9": 0.1300048828125, "step": 396, "total_loss": 0.9820013493299484 }, { "epoch": 0.07859829736685804, "grad_norm": 0.7296205163002014, "learning_rate": 5e-05, "llm_loss": 0.6701316237449646, "loss": 3.325, "loss_aux_layer_0": 0.0361328125, "loss_aux_layer_1": 0.113525390625, "loss_aux_layer_10": 0.1220703125, "loss_aux_layer_11": 0.129150390625, "loss_aux_layer_12": 0.140380859375, "loss_aux_layer_13": 0.151611328125, "loss_aux_layer_14": 0.169677734375, "loss_aux_layer_15": 0.18505859375, "loss_aux_layer_16": 0.20068359375, "loss_aux_layer_17": 0.207275390625, "loss_aux_layer_18": 0.218017578125, "loss_aux_layer_19": 0.2177734375, "loss_aux_layer_2": 0.12060546875, "loss_aux_layer_20": 0.220947265625, "loss_aux_layer_21": 0.222412109375, "loss_aux_layer_22": 0.2431640625, "loss_aux_layer_23": 0.28759765625, "loss_aux_layer_3": 0.12841796875, "loss_aux_layer_4": 0.129150390625, "loss_aux_layer_5": 0.1292724609375, "loss_aux_layer_6": 0.1287841796875, "loss_aux_layer_7": 0.121826171875, "loss_aux_layer_8": 0.1212158203125, "loss_aux_layer_9": 0.1201171875, "step": 397, "total_loss": 0.8312529772520065 }, { "epoch": 0.07879627796475945, "grad_norm": 1.0381594896316528, "learning_rate": 5e-05, "llm_loss": 0.6561175286769867, "loss": 3.2738, "loss_aux_layer_0": 0.03509521484375, "loss_aux_layer_1": 0.1156005859375, "loss_aux_layer_10": 0.123779296875, "loss_aux_layer_11": 0.130859375, "loss_aux_layer_12": 0.141357421875, "loss_aux_layer_13": 0.15234375, "loss_aux_layer_14": 0.170166015625, "loss_aux_layer_15": 0.185302734375, "loss_aux_layer_16": 0.2001953125, "loss_aux_layer_17": 0.20751953125, "loss_aux_layer_18": 0.21875, "loss_aux_layer_19": 0.217041015625, "loss_aux_layer_2": 0.123291015625, "loss_aux_layer_20": 0.219482421875, "loss_aux_layer_21": 0.2216796875, "loss_aux_layer_22": 0.242431640625, "loss_aux_layer_23": 0.2890625, "loss_aux_layer_3": 0.1314697265625, "loss_aux_layer_4": 0.1324462890625, "loss_aux_layer_5": 0.13330078125, "loss_aux_layer_6": 0.13232421875, "loss_aux_layer_7": 0.12548828125, "loss_aux_layer_8": 0.124267578125, "loss_aux_layer_9": 0.1226806640625, "step": 398, "total_loss": 0.8184401839971542 }, { "epoch": 0.07899425856266086, "grad_norm": 0.8248782157897949, "learning_rate": 5e-05, "llm_loss": 0.6443310976028442, "loss": 3.2139, "loss_aux_layer_0": 0.0357666015625, "loss_aux_layer_1": 0.1107177734375, "loss_aux_layer_10": 0.1201171875, "loss_aux_layer_11": 0.1270751953125, "loss_aux_layer_12": 0.137451171875, "loss_aux_layer_13": 0.14892578125, "loss_aux_layer_14": 0.1669921875, "loss_aux_layer_15": 0.1826171875, "loss_aux_layer_16": 0.19775390625, "loss_aux_layer_17": 0.204833984375, "loss_aux_layer_18": 0.214111328125, "loss_aux_layer_19": 0.2138671875, "loss_aux_layer_2": 0.11962890625, "loss_aux_layer_20": 0.217041015625, "loss_aux_layer_21": 0.2197265625, "loss_aux_layer_22": 0.241943359375, "loss_aux_layer_23": 0.2900390625, "loss_aux_layer_3": 0.127685546875, "loss_aux_layer_4": 0.1278076171875, "loss_aux_layer_5": 0.128662109375, "loss_aux_layer_6": 0.1278076171875, "loss_aux_layer_7": 0.1209716796875, "loss_aux_layer_8": 0.1199951171875, "loss_aux_layer_9": 0.1187744140625, "step": 399, "total_loss": 0.8034748435020447 }, { "epoch": 0.07919223916056227, "grad_norm": 1.497991919517517, "learning_rate": 5e-05, "llm_loss": 0.6995693147182465, "loss": 3.4526, "loss_aux_layer_0": 0.03857421875, "loss_aux_layer_1": 0.1182861328125, "loss_aux_layer_10": 0.1259765625, "loss_aux_layer_11": 0.1328125, "loss_aux_layer_12": 0.142333984375, "loss_aux_layer_13": 0.153564453125, "loss_aux_layer_14": 0.1708984375, "loss_aux_layer_15": 0.1865234375, "loss_aux_layer_16": 0.20166015625, "loss_aux_layer_17": 0.20703125, "loss_aux_layer_18": 0.217529296875, "loss_aux_layer_19": 0.216064453125, "loss_aux_layer_2": 0.1256103515625, "loss_aux_layer_20": 0.219482421875, "loss_aux_layer_21": 0.221923828125, "loss_aux_layer_22": 0.2421875, "loss_aux_layer_23": 0.28662109375, "loss_aux_layer_3": 0.13427734375, "loss_aux_layer_4": 0.13525390625, "loss_aux_layer_5": 0.1361083984375, "loss_aux_layer_6": 0.1353759765625, "loss_aux_layer_7": 0.12744140625, "loss_aux_layer_8": 0.1260986328125, "loss_aux_layer_9": 0.12451171875, "step": 400, "total_loss": 0.863155260682106 }, { "epoch": 0.07939021975846366, "grad_norm": 1.1952673196792603, "learning_rate": 5e-05, "llm_loss": 0.59507255256176, "loss": 3.0383, "loss_aux_layer_0": 0.03533935546875, "loss_aux_layer_1": 0.1197509765625, "loss_aux_layer_10": 0.1285400390625, "loss_aux_layer_11": 0.1357421875, "loss_aux_layer_12": 0.146240234375, "loss_aux_layer_13": 0.156494140625, "loss_aux_layer_14": 0.173583984375, "loss_aux_layer_15": 0.18798828125, "loss_aux_layer_16": 0.202392578125, "loss_aux_layer_17": 0.207275390625, "loss_aux_layer_18": 0.2177734375, "loss_aux_layer_19": 0.2158203125, "loss_aux_layer_2": 0.127197265625, "loss_aux_layer_20": 0.21826171875, "loss_aux_layer_21": 0.2197265625, "loss_aux_layer_22": 0.24072265625, "loss_aux_layer_23": 0.28515625, "loss_aux_layer_3": 0.1357421875, "loss_aux_layer_4": 0.136962890625, "loss_aux_layer_5": 0.13720703125, "loss_aux_layer_6": 0.136474609375, "loss_aux_layer_7": 0.1290283203125, "loss_aux_layer_8": 0.1279296875, "loss_aux_layer_9": 0.1275634765625, "step": 401, "total_loss": 0.7595804184675217 }, { "epoch": 0.07958820035636507, "grad_norm": 0.8490651249885559, "learning_rate": 5e-05, "llm_loss": 0.6381635367870331, "loss": 3.1955, "loss_aux_layer_0": 0.03460693359375, "loss_aux_layer_1": 0.1138916015625, "loss_aux_layer_10": 0.1217041015625, "loss_aux_layer_11": 0.1287841796875, "loss_aux_layer_12": 0.138916015625, "loss_aux_layer_13": 0.150146484375, "loss_aux_layer_14": 0.167724609375, "loss_aux_layer_15": 0.183837890625, "loss_aux_layer_16": 0.198486328125, "loss_aux_layer_17": 0.20556640625, "loss_aux_layer_18": 0.216064453125, "loss_aux_layer_19": 0.21484375, "loss_aux_layer_2": 0.12255859375, "loss_aux_layer_20": 0.21923828125, "loss_aux_layer_21": 0.221923828125, "loss_aux_layer_22": 0.2421875, "loss_aux_layer_23": 0.2861328125, "loss_aux_layer_3": 0.13037109375, "loss_aux_layer_4": 0.130859375, "loss_aux_layer_5": 0.1318359375, "loss_aux_layer_6": 0.131103515625, "loss_aux_layer_7": 0.1236572265625, "loss_aux_layer_8": 0.1224365234375, "loss_aux_layer_9": 0.1207275390625, "step": 402, "total_loss": 0.7988856732845306 }, { "epoch": 0.07978618095426648, "grad_norm": 1.063453197479248, "learning_rate": 5e-05, "llm_loss": 0.658512532711029, "loss": 3.3007, "loss_aux_layer_0": 0.0357666015625, "loss_aux_layer_1": 0.121337890625, "loss_aux_layer_10": 0.130615234375, "loss_aux_layer_11": 0.137939453125, "loss_aux_layer_12": 0.14794921875, "loss_aux_layer_13": 0.158203125, "loss_aux_layer_14": 0.17529296875, "loss_aux_layer_15": 0.189453125, "loss_aux_layer_16": 0.203125, "loss_aux_layer_17": 0.208251953125, "loss_aux_layer_18": 0.218017578125, "loss_aux_layer_19": 0.2158203125, "loss_aux_layer_2": 0.130615234375, "loss_aux_layer_20": 0.218994140625, "loss_aux_layer_21": 0.22119140625, "loss_aux_layer_22": 0.24267578125, "loss_aux_layer_23": 0.28759765625, "loss_aux_layer_3": 0.139892578125, "loss_aux_layer_4": 0.140869140625, "loss_aux_layer_5": 0.14208984375, "loss_aux_layer_6": 0.140869140625, "loss_aux_layer_7": 0.133056640625, "loss_aux_layer_8": 0.13134765625, "loss_aux_layer_9": 0.129638671875, "step": 403, "total_loss": 0.8251816034317017 }, { "epoch": 0.07998416155216789, "grad_norm": 1.120084524154663, "learning_rate": 5e-05, "llm_loss": 0.6579890102148056, "loss": 3.2871, "loss_aux_layer_0": 0.03594970703125, "loss_aux_layer_1": 0.120361328125, "loss_aux_layer_10": 0.1298828125, "loss_aux_layer_11": 0.136962890625, "loss_aux_layer_12": 0.147216796875, "loss_aux_layer_13": 0.156494140625, "loss_aux_layer_14": 0.173095703125, "loss_aux_layer_15": 0.1865234375, "loss_aux_layer_16": 0.200927734375, "loss_aux_layer_17": 0.20654296875, "loss_aux_layer_18": 0.21484375, "loss_aux_layer_19": 0.2119140625, "loss_aux_layer_2": 0.1282958984375, "loss_aux_layer_20": 0.214599609375, "loss_aux_layer_21": 0.21484375, "loss_aux_layer_22": 0.2333984375, "loss_aux_layer_23": 0.2763671875, "loss_aux_layer_3": 0.137451171875, "loss_aux_layer_4": 0.138671875, "loss_aux_layer_5": 0.139404296875, "loss_aux_layer_6": 0.138427734375, "loss_aux_layer_7": 0.1307373046875, "loss_aux_layer_8": 0.129638671875, "loss_aux_layer_9": 0.12890625, "step": 404, "total_loss": 0.8217720687389374 }, { "epoch": 0.08018214215006929, "grad_norm": 1.4397244453430176, "learning_rate": 5e-05, "llm_loss": 0.5760118365287781, "loss": 2.9421, "loss_aux_layer_0": 0.03680419921875, "loss_aux_layer_1": 0.11328125, "loss_aux_layer_10": 0.123046875, "loss_aux_layer_11": 0.1297607421875, "loss_aux_layer_12": 0.139892578125, "loss_aux_layer_13": 0.149169921875, "loss_aux_layer_14": 0.165771484375, "loss_aux_layer_15": 0.1806640625, "loss_aux_layer_16": 0.195068359375, "loss_aux_layer_17": 0.200927734375, "loss_aux_layer_18": 0.210205078125, "loss_aux_layer_19": 0.210205078125, "loss_aux_layer_2": 0.1212158203125, "loss_aux_layer_20": 0.214599609375, "loss_aux_layer_21": 0.218505859375, "loss_aux_layer_22": 0.240234375, "loss_aux_layer_23": 0.28564453125, "loss_aux_layer_3": 0.129638671875, "loss_aux_layer_4": 0.130126953125, "loss_aux_layer_5": 0.131591796875, "loss_aux_layer_6": 0.131103515625, "loss_aux_layer_7": 0.1240234375, "loss_aux_layer_8": 0.1234130859375, "loss_aux_layer_9": 0.1221923828125, "step": 405, "total_loss": 0.7355211824178696 }, { "epoch": 0.0803801227479707, "grad_norm": 1.0634675025939941, "learning_rate": 5e-05, "llm_loss": 0.5743830651044846, "loss": 2.9289, "loss_aux_layer_0": 0.03497314453125, "loss_aux_layer_1": 0.1103515625, "loss_aux_layer_10": 0.117919921875, "loss_aux_layer_11": 0.124755859375, "loss_aux_layer_12": 0.13525390625, "loss_aux_layer_13": 0.146728515625, "loss_aux_layer_14": 0.1650390625, "loss_aux_layer_15": 0.181884765625, "loss_aux_layer_16": 0.19775390625, "loss_aux_layer_17": 0.20458984375, "loss_aux_layer_18": 0.21484375, "loss_aux_layer_19": 0.21435546875, "loss_aux_layer_2": 0.1175537109375, "loss_aux_layer_20": 0.21826171875, "loss_aux_layer_21": 0.220947265625, "loss_aux_layer_22": 0.2412109375, "loss_aux_layer_23": 0.2861328125, "loss_aux_layer_3": 0.1248779296875, "loss_aux_layer_4": 0.1253662109375, "loss_aux_layer_5": 0.1258544921875, "loss_aux_layer_6": 0.1256103515625, "loss_aux_layer_7": 0.1187744140625, "loss_aux_layer_8": 0.117919921875, "loss_aux_layer_9": 0.116455078125, "step": 406, "total_loss": 0.7322252243757248 }, { "epoch": 0.0805781033458721, "grad_norm": 1.5095040798187256, "learning_rate": 5e-05, "llm_loss": 0.5777203589677811, "loss": 2.9614, "loss_aux_layer_0": 0.03411865234375, "loss_aux_layer_1": 0.1156005859375, "loss_aux_layer_10": 0.125244140625, "loss_aux_layer_11": 0.1326904296875, "loss_aux_layer_12": 0.143310546875, "loss_aux_layer_13": 0.153564453125, "loss_aux_layer_14": 0.1708984375, "loss_aux_layer_15": 0.185302734375, "loss_aux_layer_16": 0.19970703125, "loss_aux_layer_17": 0.20458984375, "loss_aux_layer_18": 0.215087890625, "loss_aux_layer_19": 0.214599609375, "loss_aux_layer_2": 0.1248779296875, "loss_aux_layer_20": 0.218017578125, "loss_aux_layer_21": 0.221435546875, "loss_aux_layer_22": 0.2431640625, "loss_aux_layer_23": 0.29150390625, "loss_aux_layer_3": 0.1334228515625, "loss_aux_layer_4": 0.13330078125, "loss_aux_layer_5": 0.134521484375, "loss_aux_layer_6": 0.1334228515625, "loss_aux_layer_7": 0.126220703125, "loss_aux_layer_8": 0.1253662109375, "loss_aux_layer_9": 0.1240234375, "step": 407, "total_loss": 0.7403469234704971 }, { "epoch": 0.08077608394377352, "grad_norm": 1.5480613708496094, "learning_rate": 5e-05, "llm_loss": 0.6881625056266785, "loss": 3.3815, "loss_aux_layer_0": 0.035888671875, "loss_aux_layer_1": 0.1090087890625, "loss_aux_layer_10": 0.1202392578125, "loss_aux_layer_11": 0.126708984375, "loss_aux_layer_12": 0.13720703125, "loss_aux_layer_13": 0.1474609375, "loss_aux_layer_14": 0.1650390625, "loss_aux_layer_15": 0.18017578125, "loss_aux_layer_16": 0.19580078125, "loss_aux_layer_17": 0.2021484375, "loss_aux_layer_18": 0.2119140625, "loss_aux_layer_19": 0.212158203125, "loss_aux_layer_2": 0.1160888671875, "loss_aux_layer_20": 0.216064453125, "loss_aux_layer_21": 0.21728515625, "loss_aux_layer_22": 0.236572265625, "loss_aux_layer_23": 0.28125, "loss_aux_layer_3": 0.124267578125, "loss_aux_layer_4": 0.1256103515625, "loss_aux_layer_5": 0.1265869140625, "loss_aux_layer_6": 0.1265869140625, "loss_aux_layer_7": 0.119873046875, "loss_aux_layer_8": 0.1201171875, "loss_aux_layer_9": 0.1192626953125, "step": 408, "total_loss": 0.8453661501407623 }, { "epoch": 0.08097406454167491, "grad_norm": 1.732758641242981, "learning_rate": 5e-05, "llm_loss": 0.6524156630039215, "loss": 3.2736, "loss_aux_layer_0": 0.03472900390625, "loss_aux_layer_1": 0.1181640625, "loss_aux_layer_10": 0.1298828125, "loss_aux_layer_11": 0.137451171875, "loss_aux_layer_12": 0.14794921875, "loss_aux_layer_13": 0.15869140625, "loss_aux_layer_14": 0.17578125, "loss_aux_layer_15": 0.190185546875, "loss_aux_layer_16": 0.204345703125, "loss_aux_layer_17": 0.2099609375, "loss_aux_layer_18": 0.219482421875, "loss_aux_layer_19": 0.21728515625, "loss_aux_layer_2": 0.1258544921875, "loss_aux_layer_20": 0.22021484375, "loss_aux_layer_21": 0.22216796875, "loss_aux_layer_22": 0.243408203125, "loss_aux_layer_23": 0.28759765625, "loss_aux_layer_3": 0.1357421875, "loss_aux_layer_4": 0.13720703125, "loss_aux_layer_5": 0.138916015625, "loss_aux_layer_6": 0.13818359375, "loss_aux_layer_7": 0.130859375, "loss_aux_layer_8": 0.130126953125, "loss_aux_layer_9": 0.129150390625, "step": 409, "total_loss": 0.8183979094028473 }, { "epoch": 0.08117204513957632, "grad_norm": 1.007692575454712, "learning_rate": 5e-05, "llm_loss": 0.6289557814598083, "loss": 3.1704, "loss_aux_layer_0": 0.03631591796875, "loss_aux_layer_1": 0.1185302734375, "loss_aux_layer_10": 0.1265869140625, "loss_aux_layer_11": 0.133544921875, "loss_aux_layer_12": 0.1435546875, "loss_aux_layer_13": 0.154296875, "loss_aux_layer_14": 0.171875, "loss_aux_layer_15": 0.18701171875, "loss_aux_layer_16": 0.201171875, "loss_aux_layer_17": 0.206787109375, "loss_aux_layer_18": 0.216064453125, "loss_aux_layer_19": 0.214599609375, "loss_aux_layer_2": 0.1260986328125, "loss_aux_layer_20": 0.217529296875, "loss_aux_layer_21": 0.219482421875, "loss_aux_layer_22": 0.240478515625, "loss_aux_layer_23": 0.2861328125, "loss_aux_layer_3": 0.135009765625, "loss_aux_layer_4": 0.136474609375, "loss_aux_layer_5": 0.137451171875, "loss_aux_layer_6": 0.13671875, "loss_aux_layer_7": 0.128662109375, "loss_aux_layer_8": 0.127685546875, "loss_aux_layer_9": 0.1259765625, "step": 410, "total_loss": 0.7926008999347687 }, { "epoch": 0.08137002573747773, "grad_norm": 1.3808619976043701, "learning_rate": 5e-05, "llm_loss": 0.6641124337911606, "loss": 3.3064, "loss_aux_layer_0": 0.03466796875, "loss_aux_layer_1": 0.1153564453125, "loss_aux_layer_10": 0.126220703125, "loss_aux_layer_11": 0.133056640625, "loss_aux_layer_12": 0.143798828125, "loss_aux_layer_13": 0.15478515625, "loss_aux_layer_14": 0.17138671875, "loss_aux_layer_15": 0.1865234375, "loss_aux_layer_16": 0.201171875, "loss_aux_layer_17": 0.20751953125, "loss_aux_layer_18": 0.216796875, "loss_aux_layer_19": 0.215576171875, "loss_aux_layer_2": 0.122802734375, "loss_aux_layer_20": 0.217041015625, "loss_aux_layer_21": 0.21923828125, "loss_aux_layer_22": 0.239013671875, "loss_aux_layer_23": 0.2841796875, "loss_aux_layer_3": 0.1318359375, "loss_aux_layer_4": 0.1331787109375, "loss_aux_layer_5": 0.134033203125, "loss_aux_layer_6": 0.13427734375, "loss_aux_layer_7": 0.126953125, "loss_aux_layer_8": 0.126220703125, "loss_aux_layer_9": 0.1248779296875, "step": 411, "total_loss": 0.8266013562679291 }, { "epoch": 0.08156800633537914, "grad_norm": 1.0045262575149536, "learning_rate": 5e-05, "llm_loss": 0.7009026259183884, "loss": 3.4339, "loss_aux_layer_0": 0.03570556640625, "loss_aux_layer_1": 0.1119384765625, "loss_aux_layer_10": 0.11962890625, "loss_aux_layer_11": 0.1265869140625, "loss_aux_layer_12": 0.137451171875, "loss_aux_layer_13": 0.1494140625, "loss_aux_layer_14": 0.1669921875, "loss_aux_layer_15": 0.181884765625, "loss_aux_layer_16": 0.1962890625, "loss_aux_layer_17": 0.203369140625, "loss_aux_layer_18": 0.212158203125, "loss_aux_layer_19": 0.210693359375, "loss_aux_layer_2": 0.1182861328125, "loss_aux_layer_20": 0.2138671875, "loss_aux_layer_21": 0.215087890625, "loss_aux_layer_22": 0.236328125, "loss_aux_layer_23": 0.27978515625, "loss_aux_layer_3": 0.1256103515625, "loss_aux_layer_4": 0.12646484375, "loss_aux_layer_5": 0.12744140625, "loss_aux_layer_6": 0.12744140625, "loss_aux_layer_7": 0.1202392578125, "loss_aux_layer_8": 0.119384765625, "loss_aux_layer_9": 0.1182861328125, "step": 412, "total_loss": 0.8584717363119125 }, { "epoch": 0.08176598693328054, "grad_norm": 1.8494693040847778, "learning_rate": 5e-05, "llm_loss": 0.6536068916320801, "loss": 3.273, "loss_aux_layer_0": 0.03546142578125, "loss_aux_layer_1": 0.119140625, "loss_aux_layer_10": 0.1287841796875, "loss_aux_layer_11": 0.13623046875, "loss_aux_layer_12": 0.1474609375, "loss_aux_layer_13": 0.158203125, "loss_aux_layer_14": 0.17529296875, "loss_aux_layer_15": 0.18994140625, "loss_aux_layer_16": 0.20458984375, "loss_aux_layer_17": 0.210205078125, "loss_aux_layer_18": 0.218994140625, "loss_aux_layer_19": 0.2158203125, "loss_aux_layer_2": 0.126220703125, "loss_aux_layer_20": 0.217529296875, "loss_aux_layer_21": 0.21826171875, "loss_aux_layer_22": 0.237548828125, "loss_aux_layer_23": 0.28125, "loss_aux_layer_3": 0.135009765625, "loss_aux_layer_4": 0.1357421875, "loss_aux_layer_5": 0.136962890625, "loss_aux_layer_6": 0.136962890625, "loss_aux_layer_7": 0.129150390625, "loss_aux_layer_8": 0.128662109375, "loss_aux_layer_9": 0.1275634765625, "step": 413, "total_loss": 0.8182595819234848 }, { "epoch": 0.08196396753118194, "grad_norm": 2.5539276599884033, "learning_rate": 5e-05, "llm_loss": 0.6165378540754318, "loss": 3.1171, "loss_aux_layer_0": 0.035400390625, "loss_aux_layer_1": 0.1180419921875, "loss_aux_layer_10": 0.1260986328125, "loss_aux_layer_11": 0.13330078125, "loss_aux_layer_12": 0.143310546875, "loss_aux_layer_13": 0.15380859375, "loss_aux_layer_14": 0.171630859375, "loss_aux_layer_15": 0.18603515625, "loss_aux_layer_16": 0.20068359375, "loss_aux_layer_17": 0.20654296875, "loss_aux_layer_18": 0.2158203125, "loss_aux_layer_19": 0.2138671875, "loss_aux_layer_2": 0.1253662109375, "loss_aux_layer_20": 0.216552734375, "loss_aux_layer_21": 0.218505859375, "loss_aux_layer_22": 0.2392578125, "loss_aux_layer_23": 0.28271484375, "loss_aux_layer_3": 0.134033203125, "loss_aux_layer_4": 0.13525390625, "loss_aux_layer_5": 0.135986328125, "loss_aux_layer_6": 0.13525390625, "loss_aux_layer_7": 0.1273193359375, "loss_aux_layer_8": 0.1265869140625, "loss_aux_layer_9": 0.1253662109375, "step": 414, "total_loss": 0.7792675942182541 }, { "epoch": 0.08216194812908335, "grad_norm": 2.23964786529541, "learning_rate": 5e-05, "llm_loss": 0.6298218816518784, "loss": 3.1499, "loss_aux_layer_0": 0.03387451171875, "loss_aux_layer_1": 0.1124267578125, "loss_aux_layer_10": 0.12158203125, "loss_aux_layer_11": 0.128173828125, "loss_aux_layer_12": 0.13916015625, "loss_aux_layer_13": 0.149658203125, "loss_aux_layer_14": 0.166748046875, "loss_aux_layer_15": 0.181884765625, "loss_aux_layer_16": 0.1962890625, "loss_aux_layer_17": 0.201904296875, "loss_aux_layer_18": 0.2099609375, "loss_aux_layer_19": 0.20751953125, "loss_aux_layer_2": 0.118896484375, "loss_aux_layer_20": 0.2109375, "loss_aux_layer_21": 0.214111328125, "loss_aux_layer_22": 0.235107421875, "loss_aux_layer_23": 0.28076171875, "loss_aux_layer_3": 0.126953125, "loss_aux_layer_4": 0.127685546875, "loss_aux_layer_5": 0.128662109375, "loss_aux_layer_6": 0.1280517578125, "loss_aux_layer_7": 0.1217041015625, "loss_aux_layer_8": 0.1214599609375, "loss_aux_layer_9": 0.1204833984375, "step": 415, "total_loss": 0.7874855548143387 }, { "epoch": 0.08235992872698475, "grad_norm": 1.2042288780212402, "learning_rate": 5e-05, "llm_loss": 0.5911287069320679, "loss": 3.0138, "loss_aux_layer_0": 0.03460693359375, "loss_aux_layer_1": 0.1158447265625, "loss_aux_layer_10": 0.1258544921875, "loss_aux_layer_11": 0.1324462890625, "loss_aux_layer_12": 0.142822265625, "loss_aux_layer_13": 0.1533203125, "loss_aux_layer_14": 0.171142578125, "loss_aux_layer_15": 0.18603515625, "loss_aux_layer_16": 0.200927734375, "loss_aux_layer_17": 0.206298828125, "loss_aux_layer_18": 0.21533203125, "loss_aux_layer_19": 0.214599609375, "loss_aux_layer_2": 0.122802734375, "loss_aux_layer_20": 0.2177734375, "loss_aux_layer_21": 0.220703125, "loss_aux_layer_22": 0.24267578125, "loss_aux_layer_23": 0.28759765625, "loss_aux_layer_3": 0.13134765625, "loss_aux_layer_4": 0.1322021484375, "loss_aux_layer_5": 0.133056640625, "loss_aux_layer_6": 0.1326904296875, "loss_aux_layer_7": 0.1256103515625, "loss_aux_layer_8": 0.1253662109375, "loss_aux_layer_9": 0.12451171875, "step": 416, "total_loss": 0.753445491194725 }, { "epoch": 0.08255790932488616, "grad_norm": 1.3887492418289185, "learning_rate": 5e-05, "llm_loss": 0.6261311173439026, "loss": 3.1459, "loss_aux_layer_0": 0.03436279296875, "loss_aux_layer_1": 0.11279296875, "loss_aux_layer_10": 0.121826171875, "loss_aux_layer_11": 0.1290283203125, "loss_aux_layer_12": 0.138916015625, "loss_aux_layer_13": 0.150634765625, "loss_aux_layer_14": 0.169189453125, "loss_aux_layer_15": 0.1845703125, "loss_aux_layer_16": 0.19921875, "loss_aux_layer_17": 0.20556640625, "loss_aux_layer_18": 0.21630859375, "loss_aux_layer_19": 0.214599609375, "loss_aux_layer_2": 0.1201171875, "loss_aux_layer_20": 0.21875, "loss_aux_layer_21": 0.22119140625, "loss_aux_layer_22": 0.242431640625, "loss_aux_layer_23": 0.287109375, "loss_aux_layer_3": 0.127685546875, "loss_aux_layer_4": 0.1287841796875, "loss_aux_layer_5": 0.1302490234375, "loss_aux_layer_6": 0.1300048828125, "loss_aux_layer_7": 0.1229248046875, "loss_aux_layer_8": 0.1220703125, "loss_aux_layer_9": 0.120361328125, "step": 417, "total_loss": 0.7864850908517838 }, { "epoch": 0.08275588992278757, "grad_norm": 1.6944862604141235, "learning_rate": 5e-05, "llm_loss": 0.6265493631362915, "loss": 3.1647, "loss_aux_layer_0": 0.03533935546875, "loss_aux_layer_1": 0.1175537109375, "loss_aux_layer_10": 0.1287841796875, "loss_aux_layer_11": 0.135986328125, "loss_aux_layer_12": 0.146240234375, "loss_aux_layer_13": 0.157470703125, "loss_aux_layer_14": 0.174072265625, "loss_aux_layer_15": 0.188720703125, "loss_aux_layer_16": 0.203369140625, "loss_aux_layer_17": 0.208984375, "loss_aux_layer_18": 0.2177734375, "loss_aux_layer_19": 0.21630859375, "loss_aux_layer_2": 0.1258544921875, "loss_aux_layer_20": 0.218017578125, "loss_aux_layer_21": 0.220458984375, "loss_aux_layer_22": 0.24072265625, "loss_aux_layer_23": 0.28466796875, "loss_aux_layer_3": 0.1348876953125, "loss_aux_layer_4": 0.1361083984375, "loss_aux_layer_5": 0.137451171875, "loss_aux_layer_6": 0.1370849609375, "loss_aux_layer_7": 0.12939453125, "loss_aux_layer_8": 0.12890625, "loss_aux_layer_9": 0.127685546875, "step": 418, "total_loss": 0.7911705821752548 }, { "epoch": 0.08295387052068898, "grad_norm": 0.7718940377235413, "learning_rate": 5e-05, "llm_loss": 0.6425760090351105, "loss": 3.1962, "loss_aux_layer_0": 0.03460693359375, "loss_aux_layer_1": 0.11181640625, "loss_aux_layer_10": 0.11962890625, "loss_aux_layer_11": 0.1260986328125, "loss_aux_layer_12": 0.13671875, "loss_aux_layer_13": 0.14697265625, "loss_aux_layer_14": 0.163818359375, "loss_aux_layer_15": 0.1787109375, "loss_aux_layer_16": 0.194091796875, "loss_aux_layer_17": 0.201171875, "loss_aux_layer_18": 0.210693359375, "loss_aux_layer_19": 0.20947265625, "loss_aux_layer_2": 0.1173095703125, "loss_aux_layer_20": 0.213134765625, "loss_aux_layer_21": 0.214599609375, "loss_aux_layer_22": 0.23486328125, "loss_aux_layer_23": 0.279296875, "loss_aux_layer_3": 0.1248779296875, "loss_aux_layer_4": 0.1259765625, "loss_aux_layer_5": 0.127197265625, "loss_aux_layer_6": 0.1268310546875, "loss_aux_layer_7": 0.1201171875, "loss_aux_layer_8": 0.119140625, "loss_aux_layer_9": 0.1181640625, "step": 419, "total_loss": 0.7990480959415436 }, { "epoch": 0.08315185111859037, "grad_norm": 1.6934505701065063, "learning_rate": 5e-05, "llm_loss": 0.685999408364296, "loss": 3.3824, "loss_aux_layer_0": 0.032958984375, "loss_aux_layer_1": 0.113525390625, "loss_aux_layer_10": 0.1219482421875, "loss_aux_layer_11": 0.1287841796875, "loss_aux_layer_12": 0.139404296875, "loss_aux_layer_13": 0.150390625, "loss_aux_layer_14": 0.168212890625, "loss_aux_layer_15": 0.18310546875, "loss_aux_layer_16": 0.197509765625, "loss_aux_layer_17": 0.203369140625, "loss_aux_layer_18": 0.213134765625, "loss_aux_layer_19": 0.21142578125, "loss_aux_layer_2": 0.1201171875, "loss_aux_layer_20": 0.215087890625, "loss_aux_layer_21": 0.2197265625, "loss_aux_layer_22": 0.241455078125, "loss_aux_layer_23": 0.28857421875, "loss_aux_layer_3": 0.1280517578125, "loss_aux_layer_4": 0.128662109375, "loss_aux_layer_5": 0.1302490234375, "loss_aux_layer_6": 0.129638671875, "loss_aux_layer_7": 0.1229248046875, "loss_aux_layer_8": 0.1217041015625, "loss_aux_layer_9": 0.12060546875, "step": 420, "total_loss": 0.8455982506275177 }, { "epoch": 0.08334983171649178, "grad_norm": 1.927269697189331, "learning_rate": 5e-05, "llm_loss": 0.641675129532814, "loss": 3.1932, "loss_aux_layer_0": 0.033935546875, "loss_aux_layer_1": 0.1099853515625, "loss_aux_layer_10": 0.11962890625, "loss_aux_layer_11": 0.12646484375, "loss_aux_layer_12": 0.135986328125, "loss_aux_layer_13": 0.146484375, "loss_aux_layer_14": 0.16357421875, "loss_aux_layer_15": 0.17919921875, "loss_aux_layer_16": 0.19482421875, "loss_aux_layer_17": 0.201171875, "loss_aux_layer_18": 0.211181640625, "loss_aux_layer_19": 0.21044921875, "loss_aux_layer_2": 0.1151123046875, "loss_aux_layer_20": 0.21484375, "loss_aux_layer_21": 0.217041015625, "loss_aux_layer_22": 0.238525390625, "loss_aux_layer_23": 0.28369140625, "loss_aux_layer_3": 0.123291015625, "loss_aux_layer_4": 0.124755859375, "loss_aux_layer_5": 0.126220703125, "loss_aux_layer_6": 0.1263427734375, "loss_aux_layer_7": 0.1195068359375, "loss_aux_layer_8": 0.119140625, "loss_aux_layer_9": 0.1181640625, "step": 421, "total_loss": 0.79829341173172 }, { "epoch": 0.08354781231439319, "grad_norm": 1.4428406953811646, "learning_rate": 5e-05, "llm_loss": 0.6209100261330605, "loss": 3.1116, "loss_aux_layer_0": 0.03369140625, "loss_aux_layer_1": 0.1134033203125, "loss_aux_layer_10": 0.1192626953125, "loss_aux_layer_11": 0.1259765625, "loss_aux_layer_12": 0.13623046875, "loss_aux_layer_13": 0.14697265625, "loss_aux_layer_14": 0.16455078125, "loss_aux_layer_15": 0.179931640625, "loss_aux_layer_16": 0.194580078125, "loss_aux_layer_17": 0.20166015625, "loss_aux_layer_18": 0.211669921875, "loss_aux_layer_19": 0.21044921875, "loss_aux_layer_2": 0.1187744140625, "loss_aux_layer_20": 0.213623046875, "loss_aux_layer_21": 0.216064453125, "loss_aux_layer_22": 0.235107421875, "loss_aux_layer_23": 0.27880859375, "loss_aux_layer_3": 0.12646484375, "loss_aux_layer_4": 0.127197265625, "loss_aux_layer_5": 0.1280517578125, "loss_aux_layer_6": 0.1278076171875, "loss_aux_layer_7": 0.120361328125, "loss_aux_layer_8": 0.119384765625, "loss_aux_layer_9": 0.1181640625, "step": 422, "total_loss": 0.7778883278369904 }, { "epoch": 0.0837457929122946, "grad_norm": 0.7274448871612549, "learning_rate": 5e-05, "llm_loss": 0.6396273076534271, "loss": 3.2092, "loss_aux_layer_0": 0.03338623046875, "loss_aux_layer_1": 0.1177978515625, "loss_aux_layer_10": 0.1263427734375, "loss_aux_layer_11": 0.1337890625, "loss_aux_layer_12": 0.14404296875, "loss_aux_layer_13": 0.154541015625, "loss_aux_layer_14": 0.171142578125, "loss_aux_layer_15": 0.185546875, "loss_aux_layer_16": 0.200439453125, "loss_aux_layer_17": 0.205322265625, "loss_aux_layer_18": 0.2138671875, "loss_aux_layer_19": 0.21142578125, "loss_aux_layer_2": 0.1259765625, "loss_aux_layer_20": 0.21484375, "loss_aux_layer_21": 0.2177734375, "loss_aux_layer_22": 0.239013671875, "loss_aux_layer_23": 0.2841796875, "loss_aux_layer_3": 0.13427734375, "loss_aux_layer_4": 0.1357421875, "loss_aux_layer_5": 0.136474609375, "loss_aux_layer_6": 0.135986328125, "loss_aux_layer_7": 0.1282958984375, "loss_aux_layer_8": 0.126953125, "loss_aux_layer_9": 0.1253662109375, "step": 423, "total_loss": 0.8022888451814651 }, { "epoch": 0.083943773510196, "grad_norm": 1.3125258684158325, "learning_rate": 5e-05, "llm_loss": 0.6626056730747223, "loss": 3.2769, "loss_aux_layer_0": 0.034088134765625, "loss_aux_layer_1": 0.11279296875, "loss_aux_layer_10": 0.1192626953125, "loss_aux_layer_11": 0.12646484375, "loss_aux_layer_12": 0.13671875, "loss_aux_layer_13": 0.14794921875, "loss_aux_layer_14": 0.1650390625, "loss_aux_layer_15": 0.17919921875, "loss_aux_layer_16": 0.193115234375, "loss_aux_layer_17": 0.2001953125, "loss_aux_layer_18": 0.208984375, "loss_aux_layer_19": 0.2080078125, "loss_aux_layer_2": 0.1175537109375, "loss_aux_layer_20": 0.212158203125, "loss_aux_layer_21": 0.215576171875, "loss_aux_layer_22": 0.236083984375, "loss_aux_layer_23": 0.2822265625, "loss_aux_layer_3": 0.1248779296875, "loss_aux_layer_4": 0.126220703125, "loss_aux_layer_5": 0.12744140625, "loss_aux_layer_6": 0.127197265625, "loss_aux_layer_7": 0.1202392578125, "loss_aux_layer_8": 0.1197509765625, "loss_aux_layer_9": 0.1182861328125, "step": 424, "total_loss": 0.8192171156406403 }, { "epoch": 0.0841417541080974, "grad_norm": 1.6333597898483276, "learning_rate": 5e-05, "llm_loss": 0.5914794430136681, "loss": 3.0008, "loss_aux_layer_0": 0.0352783203125, "loss_aux_layer_1": 0.1141357421875, "loss_aux_layer_10": 0.1217041015625, "loss_aux_layer_11": 0.12841796875, "loss_aux_layer_12": 0.138671875, "loss_aux_layer_13": 0.1494140625, "loss_aux_layer_14": 0.166748046875, "loss_aux_layer_15": 0.181396484375, "loss_aux_layer_16": 0.196044921875, "loss_aux_layer_17": 0.202392578125, "loss_aux_layer_18": 0.2119140625, "loss_aux_layer_19": 0.2099609375, "loss_aux_layer_2": 0.1199951171875, "loss_aux_layer_20": 0.213623046875, "loss_aux_layer_21": 0.21630859375, "loss_aux_layer_22": 0.238037109375, "loss_aux_layer_23": 0.28369140625, "loss_aux_layer_3": 0.1279296875, "loss_aux_layer_4": 0.1287841796875, "loss_aux_layer_5": 0.1297607421875, "loss_aux_layer_6": 0.1297607421875, "loss_aux_layer_7": 0.1231689453125, "loss_aux_layer_8": 0.1220703125, "loss_aux_layer_9": 0.120849609375, "step": 425, "total_loss": 0.7502120286226273 }, { "epoch": 0.08433973470599881, "grad_norm": 0.9638034105300903, "learning_rate": 5e-05, "llm_loss": 0.6658787727355957, "loss": 3.2826, "loss_aux_layer_0": 0.03436279296875, "loss_aux_layer_1": 0.1085205078125, "loss_aux_layer_10": 0.1163330078125, "loss_aux_layer_11": 0.123291015625, "loss_aux_layer_12": 0.1337890625, "loss_aux_layer_13": 0.14453125, "loss_aux_layer_14": 0.162353515625, "loss_aux_layer_15": 0.178466796875, "loss_aux_layer_16": 0.193115234375, "loss_aux_layer_17": 0.19970703125, "loss_aux_layer_18": 0.209228515625, "loss_aux_layer_19": 0.208984375, "loss_aux_layer_2": 0.1146240234375, "loss_aux_layer_20": 0.2119140625, "loss_aux_layer_21": 0.215087890625, "loss_aux_layer_22": 0.2353515625, "loss_aux_layer_23": 0.28076171875, "loss_aux_layer_3": 0.12255859375, "loss_aux_layer_4": 0.1231689453125, "loss_aux_layer_5": 0.1240234375, "loss_aux_layer_6": 0.12451171875, "loss_aux_layer_7": 0.1173095703125, "loss_aux_layer_8": 0.116455078125, "loss_aux_layer_9": 0.11572265625, "step": 426, "total_loss": 0.8206548541784286 }, { "epoch": 0.08453771530390022, "grad_norm": 1.8251433372497559, "learning_rate": 5e-05, "llm_loss": 0.6919232755899429, "loss": 3.3985, "loss_aux_layer_0": 0.03369140625, "loss_aux_layer_1": 0.1141357421875, "loss_aux_layer_10": 0.1212158203125, "loss_aux_layer_11": 0.128662109375, "loss_aux_layer_12": 0.138916015625, "loss_aux_layer_13": 0.14990234375, "loss_aux_layer_14": 0.166259765625, "loss_aux_layer_15": 0.181640625, "loss_aux_layer_16": 0.196044921875, "loss_aux_layer_17": 0.20166015625, "loss_aux_layer_18": 0.210693359375, "loss_aux_layer_19": 0.208984375, "loss_aux_layer_2": 0.119140625, "loss_aux_layer_20": 0.211181640625, "loss_aux_layer_21": 0.213134765625, "loss_aux_layer_22": 0.233154296875, "loss_aux_layer_23": 0.27734375, "loss_aux_layer_3": 0.127197265625, "loss_aux_layer_4": 0.1280517578125, "loss_aux_layer_5": 0.1295166015625, "loss_aux_layer_6": 0.12939453125, "loss_aux_layer_7": 0.123046875, "loss_aux_layer_8": 0.12158203125, "loss_aux_layer_9": 0.119873046875, "step": 427, "total_loss": 0.8496130257844925 }, { "epoch": 0.08473569590180162, "grad_norm": 1.5817433595657349, "learning_rate": 5e-05, "llm_loss": 0.6035008877515793, "loss": 3.0477, "loss_aux_layer_0": 0.03515625, "loss_aux_layer_1": 0.1142578125, "loss_aux_layer_10": 0.123291015625, "loss_aux_layer_11": 0.1298828125, "loss_aux_layer_12": 0.14013671875, "loss_aux_layer_13": 0.14990234375, "loss_aux_layer_14": 0.1669921875, "loss_aux_layer_15": 0.1806640625, "loss_aux_layer_16": 0.195556640625, "loss_aux_layer_17": 0.200439453125, "loss_aux_layer_18": 0.208251953125, "loss_aux_layer_19": 0.20751953125, "loss_aux_layer_2": 0.11962890625, "loss_aux_layer_20": 0.21044921875, "loss_aux_layer_21": 0.213623046875, "loss_aux_layer_22": 0.234375, "loss_aux_layer_23": 0.27978515625, "loss_aux_layer_3": 0.1282958984375, "loss_aux_layer_4": 0.1295166015625, "loss_aux_layer_5": 0.13134765625, "loss_aux_layer_6": 0.131591796875, "loss_aux_layer_7": 0.1248779296875, "loss_aux_layer_8": 0.123779296875, "loss_aux_layer_9": 0.1226806640625, "step": 428, "total_loss": 0.7619244605302811 }, { "epoch": 0.08493367649970303, "grad_norm": 1.8217722177505493, "learning_rate": 5e-05, "llm_loss": 0.6151654422283173, "loss": 3.1041, "loss_aux_layer_0": 0.03607177734375, "loss_aux_layer_1": 0.1187744140625, "loss_aux_layer_10": 0.1219482421875, "loss_aux_layer_11": 0.12890625, "loss_aux_layer_12": 0.1396484375, "loss_aux_layer_13": 0.151123046875, "loss_aux_layer_14": 0.168701171875, "loss_aux_layer_15": 0.1845703125, "loss_aux_layer_16": 0.19921875, "loss_aux_layer_17": 0.205078125, "loss_aux_layer_18": 0.214111328125, "loss_aux_layer_19": 0.212646484375, "loss_aux_layer_2": 0.12353515625, "loss_aux_layer_20": 0.21630859375, "loss_aux_layer_21": 0.218994140625, "loss_aux_layer_22": 0.241455078125, "loss_aux_layer_23": 0.2861328125, "loss_aux_layer_3": 0.130615234375, "loss_aux_layer_4": 0.13134765625, "loss_aux_layer_5": 0.13232421875, "loss_aux_layer_6": 0.1319580078125, "loss_aux_layer_7": 0.1246337890625, "loss_aux_layer_8": 0.122802734375, "loss_aux_layer_9": 0.120849609375, "step": 429, "total_loss": 0.7760289907455444 }, { "epoch": 0.08513165709760444, "grad_norm": 1.9816794395446777, "learning_rate": 5e-05, "llm_loss": 0.6957042068243027, "loss": 3.4004, "loss_aux_layer_0": 0.03436279296875, "loss_aux_layer_1": 0.1114501953125, "loss_aux_layer_10": 0.117431640625, "loss_aux_layer_11": 0.1243896484375, "loss_aux_layer_12": 0.13427734375, "loss_aux_layer_13": 0.14501953125, "loss_aux_layer_14": 0.162109375, "loss_aux_layer_15": 0.177490234375, "loss_aux_layer_16": 0.192138671875, "loss_aux_layer_17": 0.197998046875, "loss_aux_layer_18": 0.206787109375, "loss_aux_layer_19": 0.205810546875, "loss_aux_layer_2": 0.114990234375, "loss_aux_layer_20": 0.20947265625, "loss_aux_layer_21": 0.212646484375, "loss_aux_layer_22": 0.232177734375, "loss_aux_layer_23": 0.2783203125, "loss_aux_layer_3": 0.122314453125, "loss_aux_layer_4": 0.1234130859375, "loss_aux_layer_5": 0.1246337890625, "loss_aux_layer_6": 0.125, "loss_aux_layer_7": 0.1185302734375, "loss_aux_layer_8": 0.11767578125, "loss_aux_layer_9": 0.1162109375, "step": 430, "total_loss": 0.8501105904579163 }, { "epoch": 0.08532963769550585, "grad_norm": 1.5959769487380981, "learning_rate": 5e-05, "llm_loss": 0.6946717798709869, "loss": 3.4397, "loss_aux_layer_0": 0.0394287109375, "loss_aux_layer_1": 0.1259765625, "loss_aux_layer_10": 0.129638671875, "loss_aux_layer_11": 0.136962890625, "loss_aux_layer_12": 0.1474609375, "loss_aux_layer_13": 0.158447265625, "loss_aux_layer_14": 0.175048828125, "loss_aux_layer_15": 0.18896484375, "loss_aux_layer_16": 0.202880859375, "loss_aux_layer_17": 0.2080078125, "loss_aux_layer_18": 0.21484375, "loss_aux_layer_19": 0.21142578125, "loss_aux_layer_2": 0.1302490234375, "loss_aux_layer_20": 0.213134765625, "loss_aux_layer_21": 0.2138671875, "loss_aux_layer_22": 0.235595703125, "loss_aux_layer_23": 0.279296875, "loss_aux_layer_3": 0.138671875, "loss_aux_layer_4": 0.140380859375, "loss_aux_layer_5": 0.141357421875, "loss_aux_layer_6": 0.141357421875, "loss_aux_layer_7": 0.13330078125, "loss_aux_layer_8": 0.131103515625, "loss_aux_layer_9": 0.12890625, "step": 431, "total_loss": 0.8599162399768829 }, { "epoch": 0.08552761829340724, "grad_norm": 3.152057647705078, "learning_rate": 5e-05, "llm_loss": 0.6969381272792816, "loss": 3.4307, "loss_aux_layer_0": 0.0352783203125, "loss_aux_layer_1": 0.1162109375, "loss_aux_layer_10": 0.12451171875, "loss_aux_layer_11": 0.131591796875, "loss_aux_layer_12": 0.141845703125, "loss_aux_layer_13": 0.15234375, "loss_aux_layer_14": 0.170166015625, "loss_aux_layer_15": 0.18505859375, "loss_aux_layer_16": 0.19921875, "loss_aux_layer_17": 0.205078125, "loss_aux_layer_18": 0.213623046875, "loss_aux_layer_19": 0.2109375, "loss_aux_layer_2": 0.1220703125, "loss_aux_layer_20": 0.21337890625, "loss_aux_layer_21": 0.2158203125, "loss_aux_layer_22": 0.23583984375, "loss_aux_layer_23": 0.279296875, "loss_aux_layer_3": 0.131103515625, "loss_aux_layer_4": 0.1318359375, "loss_aux_layer_5": 0.133544921875, "loss_aux_layer_6": 0.133544921875, "loss_aux_layer_7": 0.1268310546875, "loss_aux_layer_8": 0.12548828125, "loss_aux_layer_9": 0.12353515625, "step": 432, "total_loss": 0.8576851785182953 }, { "epoch": 0.08572559889130865, "grad_norm": 4.266529083251953, "learning_rate": 5e-05, "llm_loss": 0.7334447056055069, "loss": 3.5643, "loss_aux_layer_0": 0.03546142578125, "loss_aux_layer_1": 0.11376953125, "loss_aux_layer_10": 0.12158203125, "loss_aux_layer_11": 0.1285400390625, "loss_aux_layer_12": 0.138671875, "loss_aux_layer_13": 0.149169921875, "loss_aux_layer_14": 0.166015625, "loss_aux_layer_15": 0.1796875, "loss_aux_layer_16": 0.193603515625, "loss_aux_layer_17": 0.19921875, "loss_aux_layer_18": 0.20751953125, "loss_aux_layer_19": 0.20654296875, "loss_aux_layer_2": 0.11962890625, "loss_aux_layer_20": 0.21044921875, "loss_aux_layer_21": 0.21337890625, "loss_aux_layer_22": 0.234375, "loss_aux_layer_23": 0.2783203125, "loss_aux_layer_3": 0.1290283203125, "loss_aux_layer_4": 0.1300048828125, "loss_aux_layer_5": 0.131591796875, "loss_aux_layer_6": 0.1307373046875, "loss_aux_layer_7": 0.1239013671875, "loss_aux_layer_8": 0.1221923828125, "loss_aux_layer_9": 0.120361328125, "step": 433, "total_loss": 0.8910831063985825 }, { "epoch": 0.08592357948921006, "grad_norm": 3.4628732204437256, "learning_rate": 5e-05, "llm_loss": 0.6209619641304016, "loss": 3.1274, "loss_aux_layer_0": 0.0361328125, "loss_aux_layer_1": 0.1170654296875, "loss_aux_layer_10": 0.123779296875, "loss_aux_layer_11": 0.1309814453125, "loss_aux_layer_12": 0.14111328125, "loss_aux_layer_13": 0.152099609375, "loss_aux_layer_14": 0.1689453125, "loss_aux_layer_15": 0.18408203125, "loss_aux_layer_16": 0.198486328125, "loss_aux_layer_17": 0.204345703125, "loss_aux_layer_18": 0.21435546875, "loss_aux_layer_19": 0.212158203125, "loss_aux_layer_2": 0.12109375, "loss_aux_layer_20": 0.215087890625, "loss_aux_layer_21": 0.218994140625, "loss_aux_layer_22": 0.24072265625, "loss_aux_layer_23": 0.2861328125, "loss_aux_layer_3": 0.1290283203125, "loss_aux_layer_4": 0.13037109375, "loss_aux_layer_5": 0.131591796875, "loss_aux_layer_6": 0.132080078125, "loss_aux_layer_7": 0.12548828125, "loss_aux_layer_8": 0.124267578125, "loss_aux_layer_9": 0.1224365234375, "step": 434, "total_loss": 0.7818624079227448 }, { "epoch": 0.08612156008711146, "grad_norm": 1.1257601976394653, "learning_rate": 5e-05, "llm_loss": 0.6287346556782722, "loss": 3.1374, "loss_aux_layer_0": 0.035400390625, "loss_aux_layer_1": 0.114013671875, "loss_aux_layer_10": 0.117919921875, "loss_aux_layer_11": 0.1248779296875, "loss_aux_layer_12": 0.135498046875, "loss_aux_layer_13": 0.146240234375, "loss_aux_layer_14": 0.16357421875, "loss_aux_layer_15": 0.17822265625, "loss_aux_layer_16": 0.19287109375, "loss_aux_layer_17": 0.19873046875, "loss_aux_layer_18": 0.20751953125, "loss_aux_layer_19": 0.20556640625, "loss_aux_layer_2": 0.117919921875, "loss_aux_layer_20": 0.209228515625, "loss_aux_layer_21": 0.212158203125, "loss_aux_layer_22": 0.233154296875, "loss_aux_layer_23": 0.27783203125, "loss_aux_layer_3": 0.1256103515625, "loss_aux_layer_4": 0.1265869140625, "loss_aux_layer_5": 0.1279296875, "loss_aux_layer_6": 0.1278076171875, "loss_aux_layer_7": 0.1204833984375, "loss_aux_layer_8": 0.1187744140625, "loss_aux_layer_9": 0.1171875, "step": 435, "total_loss": 0.7843394577503204 }, { "epoch": 0.08631954068501287, "grad_norm": 2.0999512672424316, "learning_rate": 5e-05, "llm_loss": 0.7164789438247681, "loss": 3.4898, "loss_aux_layer_0": 0.03509521484375, "loss_aux_layer_1": 0.1116943359375, "loss_aux_layer_10": 0.1180419921875, "loss_aux_layer_11": 0.125, "loss_aux_layer_12": 0.135986328125, "loss_aux_layer_13": 0.14697265625, "loss_aux_layer_14": 0.164794921875, "loss_aux_layer_15": 0.1806640625, "loss_aux_layer_16": 0.195556640625, "loss_aux_layer_17": 0.20166015625, "loss_aux_layer_18": 0.210205078125, "loss_aux_layer_19": 0.208740234375, "loss_aux_layer_2": 0.1158447265625, "loss_aux_layer_20": 0.212646484375, "loss_aux_layer_21": 0.21533203125, "loss_aux_layer_22": 0.234375, "loss_aux_layer_23": 0.2783203125, "loss_aux_layer_3": 0.1229248046875, "loss_aux_layer_4": 0.124267578125, "loss_aux_layer_5": 0.1256103515625, "loss_aux_layer_6": 0.1259765625, "loss_aux_layer_7": 0.1187744140625, "loss_aux_layer_8": 0.11767578125, "loss_aux_layer_9": 0.11669921875, "step": 436, "total_loss": 0.8724600523710251 }, { "epoch": 0.08651752128291428, "grad_norm": 3.314854145050049, "learning_rate": 5e-05, "llm_loss": 0.5652688667178154, "loss": 2.9038, "loss_aux_layer_0": 0.03369140625, "loss_aux_layer_1": 0.114990234375, "loss_aux_layer_10": 0.1234130859375, "loss_aux_layer_11": 0.13037109375, "loss_aux_layer_12": 0.140625, "loss_aux_layer_13": 0.15185546875, "loss_aux_layer_14": 0.169921875, "loss_aux_layer_15": 0.18505859375, "loss_aux_layer_16": 0.2001953125, "loss_aux_layer_17": 0.205322265625, "loss_aux_layer_18": 0.214111328125, "loss_aux_layer_19": 0.212158203125, "loss_aux_layer_2": 0.12158203125, "loss_aux_layer_20": 0.215087890625, "loss_aux_layer_21": 0.218017578125, "loss_aux_layer_22": 0.2392578125, "loss_aux_layer_23": 0.2841796875, "loss_aux_layer_3": 0.1300048828125, "loss_aux_layer_4": 0.1304931640625, "loss_aux_layer_5": 0.1324462890625, "loss_aux_layer_6": 0.1319580078125, "loss_aux_layer_7": 0.12548828125, "loss_aux_layer_8": 0.12451171875, "loss_aux_layer_9": 0.1221923828125, "step": 437, "total_loss": 0.7259432971477509 }, { "epoch": 0.08671550188081568, "grad_norm": 3.476839065551758, "learning_rate": 5e-05, "llm_loss": 0.5772870630025864, "loss": 2.9505, "loss_aux_layer_0": 0.03582763671875, "loss_aux_layer_1": 0.1153564453125, "loss_aux_layer_10": 0.1226806640625, "loss_aux_layer_11": 0.1297607421875, "loss_aux_layer_12": 0.14013671875, "loss_aux_layer_13": 0.15087890625, "loss_aux_layer_14": 0.168212890625, "loss_aux_layer_15": 0.1826171875, "loss_aux_layer_16": 0.197998046875, "loss_aux_layer_17": 0.20361328125, "loss_aux_layer_18": 0.21240234375, "loss_aux_layer_19": 0.21142578125, "loss_aux_layer_2": 0.1212158203125, "loss_aux_layer_20": 0.214599609375, "loss_aux_layer_21": 0.2177734375, "loss_aux_layer_22": 0.24072265625, "loss_aux_layer_23": 0.28564453125, "loss_aux_layer_3": 0.130859375, "loss_aux_layer_4": 0.1312255859375, "loss_aux_layer_5": 0.133056640625, "loss_aux_layer_6": 0.1324462890625, "loss_aux_layer_7": 0.1253662109375, "loss_aux_layer_8": 0.123291015625, "loss_aux_layer_9": 0.1212158203125, "step": 438, "total_loss": 0.737629622220993 }, { "epoch": 0.08691348247871708, "grad_norm": 2.5896127223968506, "learning_rate": 5e-05, "llm_loss": 0.6369152516126633, "loss": 3.1856, "loss_aux_layer_0": 0.0341796875, "loss_aux_layer_1": 0.1151123046875, "loss_aux_layer_10": 0.1234130859375, "loss_aux_layer_11": 0.130615234375, "loss_aux_layer_12": 0.140869140625, "loss_aux_layer_13": 0.151611328125, "loss_aux_layer_14": 0.168701171875, "loss_aux_layer_15": 0.183837890625, "loss_aux_layer_16": 0.197998046875, "loss_aux_layer_17": 0.203369140625, "loss_aux_layer_18": 0.21240234375, "loss_aux_layer_19": 0.211181640625, "loss_aux_layer_2": 0.119873046875, "loss_aux_layer_20": 0.2138671875, "loss_aux_layer_21": 0.21533203125, "loss_aux_layer_22": 0.2353515625, "loss_aux_layer_23": 0.2802734375, "loss_aux_layer_3": 0.1280517578125, "loss_aux_layer_4": 0.1295166015625, "loss_aux_layer_5": 0.1309814453125, "loss_aux_layer_6": 0.13134765625, "loss_aux_layer_7": 0.1240234375, "loss_aux_layer_8": 0.1234130859375, "loss_aux_layer_9": 0.1220703125, "step": 439, "total_loss": 0.7964037954807281 }, { "epoch": 0.08711146307661849, "grad_norm": 0.9370372891426086, "learning_rate": 5e-05, "llm_loss": 0.6169769018888474, "loss": 3.0876, "loss_aux_layer_0": 0.03460693359375, "loss_aux_layer_1": 0.1123046875, "loss_aux_layer_10": 0.118896484375, "loss_aux_layer_11": 0.1253662109375, "loss_aux_layer_12": 0.1353759765625, "loss_aux_layer_13": 0.146484375, "loss_aux_layer_14": 0.163330078125, "loss_aux_layer_15": 0.17822265625, "loss_aux_layer_16": 0.192138671875, "loss_aux_layer_17": 0.197021484375, "loss_aux_layer_18": 0.205078125, "loss_aux_layer_19": 0.20361328125, "loss_aux_layer_2": 0.1160888671875, "loss_aux_layer_20": 0.20751953125, "loss_aux_layer_21": 0.21142578125, "loss_aux_layer_22": 0.233154296875, "loss_aux_layer_23": 0.27734375, "loss_aux_layer_3": 0.1241455078125, "loss_aux_layer_4": 0.1253662109375, "loss_aux_layer_5": 0.1265869140625, "loss_aux_layer_6": 0.1270751953125, "loss_aux_layer_7": 0.1202392578125, "loss_aux_layer_8": 0.1187744140625, "loss_aux_layer_9": 0.1175537109375, "step": 440, "total_loss": 0.7718989104032516 }, { "epoch": 0.0873094436745199, "grad_norm": 2.3871963024139404, "learning_rate": 5e-05, "llm_loss": 0.6592602282762527, "loss": 3.2581, "loss_aux_layer_0": 0.03338623046875, "loss_aux_layer_1": 0.1109619140625, "loss_aux_layer_10": 0.1190185546875, "loss_aux_layer_11": 0.12548828125, "loss_aux_layer_12": 0.13623046875, "loss_aux_layer_13": 0.14697265625, "loss_aux_layer_14": 0.163818359375, "loss_aux_layer_15": 0.17822265625, "loss_aux_layer_16": 0.1923828125, "loss_aux_layer_17": 0.1982421875, "loss_aux_layer_18": 0.206787109375, "loss_aux_layer_19": 0.205810546875, "loss_aux_layer_2": 0.115478515625, "loss_aux_layer_20": 0.20947265625, "loss_aux_layer_21": 0.212646484375, "loss_aux_layer_22": 0.233642578125, "loss_aux_layer_23": 0.2783203125, "loss_aux_layer_3": 0.1236572265625, "loss_aux_layer_4": 0.1246337890625, "loss_aux_layer_5": 0.1265869140625, "loss_aux_layer_6": 0.1265869140625, "loss_aux_layer_7": 0.1201171875, "loss_aux_layer_8": 0.118896484375, "loss_aux_layer_9": 0.1177978515625, "step": 441, "total_loss": 0.8145185858011246 }, { "epoch": 0.08750742427242131, "grad_norm": 3.4871091842651367, "learning_rate": 5e-05, "llm_loss": 0.6248611509799957, "loss": 3.1204, "loss_aux_layer_0": 0.03448486328125, "loss_aux_layer_1": 0.112060546875, "loss_aux_layer_10": 0.1177978515625, "loss_aux_layer_11": 0.1246337890625, "loss_aux_layer_12": 0.13525390625, "loss_aux_layer_13": 0.146484375, "loss_aux_layer_14": 0.163330078125, "loss_aux_layer_15": 0.177734375, "loss_aux_layer_16": 0.19287109375, "loss_aux_layer_17": 0.198974609375, "loss_aux_layer_18": 0.20751953125, "loss_aux_layer_19": 0.20654296875, "loss_aux_layer_2": 0.1163330078125, "loss_aux_layer_20": 0.2099609375, "loss_aux_layer_21": 0.213134765625, "loss_aux_layer_22": 0.23486328125, "loss_aux_layer_23": 0.279296875, "loss_aux_layer_3": 0.123779296875, "loss_aux_layer_4": 0.1246337890625, "loss_aux_layer_5": 0.1258544921875, "loss_aux_layer_6": 0.1253662109375, "loss_aux_layer_7": 0.119384765625, "loss_aux_layer_8": 0.118408203125, "loss_aux_layer_9": 0.1165771484375, "step": 442, "total_loss": 0.7801007330417633 }, { "epoch": 0.0877054048703227, "grad_norm": 2.3318898677825928, "learning_rate": 5e-05, "llm_loss": 0.7229459136724472, "loss": 3.5355, "loss_aux_layer_0": 0.0384521484375, "loss_aux_layer_1": 0.1212158203125, "loss_aux_layer_10": 0.1258544921875, "loss_aux_layer_11": 0.13330078125, "loss_aux_layer_12": 0.143798828125, "loss_aux_layer_13": 0.153564453125, "loss_aux_layer_14": 0.169921875, "loss_aux_layer_15": 0.183837890625, "loss_aux_layer_16": 0.197265625, "loss_aux_layer_17": 0.201904296875, "loss_aux_layer_18": 0.209716796875, "loss_aux_layer_19": 0.20654296875, "loss_aux_layer_2": 0.12451171875, "loss_aux_layer_20": 0.209228515625, "loss_aux_layer_21": 0.212158203125, "loss_aux_layer_22": 0.234619140625, "loss_aux_layer_23": 0.27978515625, "loss_aux_layer_3": 0.1326904296875, "loss_aux_layer_4": 0.1336669921875, "loss_aux_layer_5": 0.134765625, "loss_aux_layer_6": 0.1353759765625, "loss_aux_layer_7": 0.128173828125, "loss_aux_layer_8": 0.1268310546875, "loss_aux_layer_9": 0.124755859375, "step": 443, "total_loss": 0.8838863968849182 }, { "epoch": 0.08790338546822411, "grad_norm": 1.489128828048706, "learning_rate": 5e-05, "llm_loss": 0.6514758914709091, "loss": 3.2233, "loss_aux_layer_0": 0.03741455078125, "loss_aux_layer_1": 0.10986328125, "loss_aux_layer_10": 0.115966796875, "loss_aux_layer_11": 0.12255859375, "loss_aux_layer_12": 0.132568359375, "loss_aux_layer_13": 0.143798828125, "loss_aux_layer_14": 0.162109375, "loss_aux_layer_15": 0.177490234375, "loss_aux_layer_16": 0.193115234375, "loss_aux_layer_17": 0.19873046875, "loss_aux_layer_18": 0.2080078125, "loss_aux_layer_19": 0.2080078125, "loss_aux_layer_2": 0.113037109375, "loss_aux_layer_20": 0.211181640625, "loss_aux_layer_21": 0.21484375, "loss_aux_layer_22": 0.236328125, "loss_aux_layer_23": 0.28125, "loss_aux_layer_3": 0.1204833984375, "loss_aux_layer_4": 0.1214599609375, "loss_aux_layer_5": 0.123046875, "loss_aux_layer_6": 0.1239013671875, "loss_aux_layer_7": 0.1175537109375, "loss_aux_layer_8": 0.1162109375, "loss_aux_layer_9": 0.11474609375, "step": 444, "total_loss": 0.8058179020881653 }, { "epoch": 0.08810136606612552, "grad_norm": 1.143316626548767, "learning_rate": 5e-05, "llm_loss": 0.6523849219083786, "loss": 3.2088, "loss_aux_layer_0": 0.032867431640625, "loss_aux_layer_1": 0.103515625, "loss_aux_layer_10": 0.1116943359375, "loss_aux_layer_11": 0.1181640625, "loss_aux_layer_12": 0.12890625, "loss_aux_layer_13": 0.140625, "loss_aux_layer_14": 0.158447265625, "loss_aux_layer_15": 0.17431640625, "loss_aux_layer_16": 0.189697265625, "loss_aux_layer_17": 0.19677734375, "loss_aux_layer_18": 0.205810546875, "loss_aux_layer_19": 0.205078125, "loss_aux_layer_2": 0.107177734375, "loss_aux_layer_20": 0.20947265625, "loss_aux_layer_21": 0.212158203125, "loss_aux_layer_22": 0.23291015625, "loss_aux_layer_23": 0.275390625, "loss_aux_layer_3": 0.11376953125, "loss_aux_layer_4": 0.11474609375, "loss_aux_layer_5": 0.1162109375, "loss_aux_layer_6": 0.116455078125, "loss_aux_layer_7": 0.1109619140625, "loss_aux_layer_8": 0.1099853515625, "loss_aux_layer_9": 0.1097412109375, "step": 445, "total_loss": 0.8021899908781052 }, { "epoch": 0.08829934666402693, "grad_norm": 2.6397721767425537, "learning_rate": 5e-05, "llm_loss": 0.6227444112300873, "loss": 3.1036, "loss_aux_layer_0": 0.03192138671875, "loss_aux_layer_1": 0.1102294921875, "loss_aux_layer_10": 0.1175537109375, "loss_aux_layer_11": 0.1239013671875, "loss_aux_layer_12": 0.1337890625, "loss_aux_layer_13": 0.144287109375, "loss_aux_layer_14": 0.1611328125, "loss_aux_layer_15": 0.17578125, "loss_aux_layer_16": 0.189697265625, "loss_aux_layer_17": 0.195556640625, "loss_aux_layer_18": 0.20458984375, "loss_aux_layer_19": 0.2021484375, "loss_aux_layer_2": 0.11474609375, "loss_aux_layer_20": 0.205322265625, "loss_aux_layer_21": 0.20947265625, "loss_aux_layer_22": 0.22998046875, "loss_aux_layer_23": 0.27392578125, "loss_aux_layer_3": 0.1231689453125, "loss_aux_layer_4": 0.1236572265625, "loss_aux_layer_5": 0.1251220703125, "loss_aux_layer_6": 0.1253662109375, "loss_aux_layer_7": 0.119140625, "loss_aux_layer_8": 0.117919921875, "loss_aux_layer_9": 0.116455078125, "step": 446, "total_loss": 0.7759000509977341 }, { "epoch": 0.08849732726192833, "grad_norm": 2.5944039821624756, "learning_rate": 5e-05, "llm_loss": 0.7233416885137558, "loss": 3.5348, "loss_aux_layer_0": 0.03643798828125, "loss_aux_layer_1": 0.1165771484375, "loss_aux_layer_10": 0.1231689453125, "loss_aux_layer_11": 0.1302490234375, "loss_aux_layer_12": 0.140380859375, "loss_aux_layer_13": 0.15185546875, "loss_aux_layer_14": 0.169189453125, "loss_aux_layer_15": 0.18359375, "loss_aux_layer_16": 0.197509765625, "loss_aux_layer_17": 0.203369140625, "loss_aux_layer_18": 0.2119140625, "loss_aux_layer_19": 0.209228515625, "loss_aux_layer_2": 0.12109375, "loss_aux_layer_20": 0.212646484375, "loss_aux_layer_21": 0.216796875, "loss_aux_layer_22": 0.23974609375, "loss_aux_layer_23": 0.28515625, "loss_aux_layer_3": 0.1295166015625, "loss_aux_layer_4": 0.13134765625, "loss_aux_layer_5": 0.1328125, "loss_aux_layer_6": 0.133544921875, "loss_aux_layer_7": 0.1265869140625, "loss_aux_layer_8": 0.124755859375, "loss_aux_layer_9": 0.1221923828125, "step": 447, "total_loss": 0.8837026655673981 }, { "epoch": 0.08869530785982974, "grad_norm": 1.193625807762146, "learning_rate": 5e-05, "llm_loss": 0.6663041263818741, "loss": 3.285, "loss_aux_layer_0": 0.03460693359375, "loss_aux_layer_1": 0.112060546875, "loss_aux_layer_10": 0.11767578125, "loss_aux_layer_11": 0.1243896484375, "loss_aux_layer_12": 0.134521484375, "loss_aux_layer_13": 0.146240234375, "loss_aux_layer_14": 0.163818359375, "loss_aux_layer_15": 0.179443359375, "loss_aux_layer_16": 0.194091796875, "loss_aux_layer_17": 0.199951171875, "loss_aux_layer_18": 0.2080078125, "loss_aux_layer_19": 0.207275390625, "loss_aux_layer_2": 0.1138916015625, "loss_aux_layer_20": 0.211181640625, "loss_aux_layer_21": 0.213623046875, "loss_aux_layer_22": 0.234375, "loss_aux_layer_23": 0.2783203125, "loss_aux_layer_3": 0.1212158203125, "loss_aux_layer_4": 0.12255859375, "loss_aux_layer_5": 0.1241455078125, "loss_aux_layer_6": 0.125, "loss_aux_layer_7": 0.1187744140625, "loss_aux_layer_8": 0.11767578125, "loss_aux_layer_9": 0.116455078125, "step": 448, "total_loss": 0.8212495148181915 }, { "epoch": 0.08889328845773115, "grad_norm": 2.4263813495635986, "learning_rate": 5e-05, "llm_loss": 0.6556603759527206, "loss": 3.2332, "loss_aux_layer_0": 0.032684326171875, "loss_aux_layer_1": 0.105712890625, "loss_aux_layer_10": 0.1148681640625, "loss_aux_layer_11": 0.1217041015625, "loss_aux_layer_12": 0.132080078125, "loss_aux_layer_13": 0.142333984375, "loss_aux_layer_14": 0.159912109375, "loss_aux_layer_15": 0.17578125, "loss_aux_layer_16": 0.19091796875, "loss_aux_layer_17": 0.197509765625, "loss_aux_layer_18": 0.20751953125, "loss_aux_layer_19": 0.206787109375, "loss_aux_layer_2": 0.1103515625, "loss_aux_layer_20": 0.21044921875, "loss_aux_layer_21": 0.21337890625, "loss_aux_layer_22": 0.236083984375, "loss_aux_layer_23": 0.27978515625, "loss_aux_layer_3": 0.118408203125, "loss_aux_layer_4": 0.1195068359375, "loss_aux_layer_5": 0.1214599609375, "loss_aux_layer_6": 0.1220703125, "loss_aux_layer_7": 0.1156005859375, "loss_aux_layer_8": 0.1146240234375, "loss_aux_layer_9": 0.113525390625, "step": 449, "total_loss": 0.80830217897892 }, { "epoch": 0.08909126905563255, "grad_norm": 2.4791922569274902, "learning_rate": 5e-05, "llm_loss": 0.6487747877836227, "loss": 3.2354, "loss_aux_layer_0": 0.03472900390625, "loss_aux_layer_1": 0.1192626953125, "loss_aux_layer_10": 0.125, "loss_aux_layer_11": 0.132568359375, "loss_aux_layer_12": 0.142578125, "loss_aux_layer_13": 0.153076171875, "loss_aux_layer_14": 0.169677734375, "loss_aux_layer_15": 0.183349609375, "loss_aux_layer_16": 0.197509765625, "loss_aux_layer_17": 0.2021484375, "loss_aux_layer_18": 0.209228515625, "loss_aux_layer_19": 0.206787109375, "loss_aux_layer_2": 0.122802734375, "loss_aux_layer_20": 0.20947265625, "loss_aux_layer_21": 0.2119140625, "loss_aux_layer_22": 0.235107421875, "loss_aux_layer_23": 0.27880859375, "loss_aux_layer_3": 0.130859375, "loss_aux_layer_4": 0.1324462890625, "loss_aux_layer_5": 0.133544921875, "loss_aux_layer_6": 0.134033203125, "loss_aux_layer_7": 0.127197265625, "loss_aux_layer_8": 0.12548828125, "loss_aux_layer_9": 0.1239013671875, "step": 450, "total_loss": 0.8088530004024506 }, { "epoch": 0.08928924965353395, "grad_norm": 2.1419918537139893, "learning_rate": 5e-05, "llm_loss": 0.7268965542316437, "loss": 3.5214, "loss_aux_layer_0": 0.03375244140625, "loss_aux_layer_1": 0.111572265625, "loss_aux_layer_10": 0.116455078125, "loss_aux_layer_11": 0.123779296875, "loss_aux_layer_12": 0.134521484375, "loss_aux_layer_13": 0.145751953125, "loss_aux_layer_14": 0.162109375, "loss_aux_layer_15": 0.176513671875, "loss_aux_layer_16": 0.19091796875, "loss_aux_layer_17": 0.197021484375, "loss_aux_layer_18": 0.206298828125, "loss_aux_layer_19": 0.20458984375, "loss_aux_layer_2": 0.1131591796875, "loss_aux_layer_20": 0.20849609375, "loss_aux_layer_21": 0.2119140625, "loss_aux_layer_22": 0.232666015625, "loss_aux_layer_23": 0.27685546875, "loss_aux_layer_3": 0.12060546875, "loss_aux_layer_4": 0.121826171875, "loss_aux_layer_5": 0.1229248046875, "loss_aux_layer_6": 0.12353515625, "loss_aux_layer_7": 0.116943359375, "loss_aux_layer_8": 0.1156005859375, "loss_aux_layer_9": 0.1148681640625, "step": 451, "total_loss": 0.8803502321243286 }, { "epoch": 0.08948723025143536, "grad_norm": 2.407090663909912, "learning_rate": 5e-05, "llm_loss": 0.6502677798271179, "loss": 3.2317, "loss_aux_layer_0": 0.0377197265625, "loss_aux_layer_1": 0.11669921875, "loss_aux_layer_10": 0.1199951171875, "loss_aux_layer_11": 0.127197265625, "loss_aux_layer_12": 0.138427734375, "loss_aux_layer_13": 0.1494140625, "loss_aux_layer_14": 0.166748046875, "loss_aux_layer_15": 0.181640625, "loss_aux_layer_16": 0.195556640625, "loss_aux_layer_17": 0.20166015625, "loss_aux_layer_18": 0.21044921875, "loss_aux_layer_19": 0.208984375, "loss_aux_layer_2": 0.1177978515625, "loss_aux_layer_20": 0.212890625, "loss_aux_layer_21": 0.215576171875, "loss_aux_layer_22": 0.238525390625, "loss_aux_layer_23": 0.28369140625, "loss_aux_layer_3": 0.1243896484375, "loss_aux_layer_4": 0.12548828125, "loss_aux_layer_5": 0.12646484375, "loss_aux_layer_6": 0.1273193359375, "loss_aux_layer_7": 0.1204833984375, "loss_aux_layer_8": 0.1197509765625, "loss_aux_layer_9": 0.11865234375, "step": 452, "total_loss": 0.8079336285591125 }, { "epoch": 0.08968521084933677, "grad_norm": 0.9666104912757874, "learning_rate": 5e-05, "llm_loss": 0.6804491281509399, "loss": 3.3379, "loss_aux_layer_0": 0.03216552734375, "loss_aux_layer_1": 0.1103515625, "loss_aux_layer_10": 0.1165771484375, "loss_aux_layer_11": 0.12353515625, "loss_aux_layer_12": 0.134033203125, "loss_aux_layer_13": 0.14599609375, "loss_aux_layer_14": 0.163330078125, "loss_aux_layer_15": 0.1796875, "loss_aux_layer_16": 0.1943359375, "loss_aux_layer_17": 0.2001953125, "loss_aux_layer_18": 0.20947265625, "loss_aux_layer_19": 0.208251953125, "loss_aux_layer_2": 0.1114501953125, "loss_aux_layer_20": 0.21142578125, "loss_aux_layer_21": 0.21240234375, "loss_aux_layer_22": 0.231689453125, "loss_aux_layer_23": 0.27490234375, "loss_aux_layer_3": 0.119384765625, "loss_aux_layer_4": 0.1209716796875, "loss_aux_layer_5": 0.122802734375, "loss_aux_layer_6": 0.1240234375, "loss_aux_layer_7": 0.11767578125, "loss_aux_layer_8": 0.1163330078125, "loss_aux_layer_9": 0.11572265625, "step": 453, "total_loss": 0.8344805687665939 }, { "epoch": 0.08988319144723816, "grad_norm": 1.5351712703704834, "learning_rate": 5e-05, "llm_loss": 0.6439899504184723, "loss": 3.1814, "loss_aux_layer_0": 0.031402587890625, "loss_aux_layer_1": 0.1068115234375, "loss_aux_layer_10": 0.1142578125, "loss_aux_layer_11": 0.12109375, "loss_aux_layer_12": 0.13134765625, "loss_aux_layer_13": 0.141845703125, "loss_aux_layer_14": 0.159912109375, "loss_aux_layer_15": 0.175048828125, "loss_aux_layer_16": 0.190185546875, "loss_aux_layer_17": 0.1953125, "loss_aux_layer_18": 0.20361328125, "loss_aux_layer_19": 0.203369140625, "loss_aux_layer_2": 0.112060546875, "loss_aux_layer_20": 0.20703125, "loss_aux_layer_21": 0.21142578125, "loss_aux_layer_22": 0.232177734375, "loss_aux_layer_23": 0.2763671875, "loss_aux_layer_3": 0.1180419921875, "loss_aux_layer_4": 0.11865234375, "loss_aux_layer_5": 0.1202392578125, "loss_aux_layer_6": 0.1209716796875, "loss_aux_layer_7": 0.1148681640625, "loss_aux_layer_8": 0.114013671875, "loss_aux_layer_9": 0.1129150390625, "step": 454, "total_loss": 0.7953522056341171 }, { "epoch": 0.09008117204513957, "grad_norm": 2.262946367263794, "learning_rate": 5e-05, "llm_loss": 0.5920728296041489, "loss": 3.0141, "loss_aux_layer_0": 0.03460693359375, "loss_aux_layer_1": 0.116943359375, "loss_aux_layer_10": 0.1258544921875, "loss_aux_layer_11": 0.13330078125, "loss_aux_layer_12": 0.143310546875, "loss_aux_layer_13": 0.154052734375, "loss_aux_layer_14": 0.170166015625, "loss_aux_layer_15": 0.18505859375, "loss_aux_layer_16": 0.19970703125, "loss_aux_layer_17": 0.20458984375, "loss_aux_layer_18": 0.21240234375, "loss_aux_layer_19": 0.2109375, "loss_aux_layer_2": 0.1209716796875, "loss_aux_layer_20": 0.213623046875, "loss_aux_layer_21": 0.216552734375, "loss_aux_layer_22": 0.238525390625, "loss_aux_layer_23": 0.2822265625, "loss_aux_layer_3": 0.1304931640625, "loss_aux_layer_4": 0.1324462890625, "loss_aux_layer_5": 0.134521484375, "loss_aux_layer_6": 0.134765625, "loss_aux_layer_7": 0.1285400390625, "loss_aux_layer_8": 0.1265869140625, "loss_aux_layer_9": 0.12451171875, "step": 455, "total_loss": 0.7535276859998703 }, { "epoch": 0.09027915264304098, "grad_norm": 3.1998255252838135, "learning_rate": 5e-05, "llm_loss": 0.6722275018692017, "loss": 3.3153, "loss_aux_layer_0": 0.0340576171875, "loss_aux_layer_1": 0.1116943359375, "loss_aux_layer_10": 0.1204833984375, "loss_aux_layer_11": 0.12744140625, "loss_aux_layer_12": 0.138671875, "loss_aux_layer_13": 0.1494140625, "loss_aux_layer_14": 0.166259765625, "loss_aux_layer_15": 0.1806640625, "loss_aux_layer_16": 0.19482421875, "loss_aux_layer_17": 0.200439453125, "loss_aux_layer_18": 0.20849609375, "loss_aux_layer_19": 0.206298828125, "loss_aux_layer_2": 0.114990234375, "loss_aux_layer_20": 0.2099609375, "loss_aux_layer_21": 0.212646484375, "loss_aux_layer_22": 0.234130859375, "loss_aux_layer_23": 0.279296875, "loss_aux_layer_3": 0.125, "loss_aux_layer_4": 0.1259765625, "loss_aux_layer_5": 0.128173828125, "loss_aux_layer_6": 0.128173828125, "loss_aux_layer_7": 0.1220703125, "loss_aux_layer_8": 0.1201171875, "loss_aux_layer_9": 0.119140625, "step": 456, "total_loss": 0.8288216292858124 }, { "epoch": 0.09047713324094239, "grad_norm": 2.8930013179779053, "learning_rate": 5e-05, "llm_loss": 0.6168587803840637, "loss": 3.0801, "loss_aux_layer_0": 0.03497314453125, "loss_aux_layer_1": 0.1077880859375, "loss_aux_layer_10": 0.1163330078125, "loss_aux_layer_11": 0.12255859375, "loss_aux_layer_12": 0.1326904296875, "loss_aux_layer_13": 0.14306640625, "loss_aux_layer_14": 0.1611328125, "loss_aux_layer_15": 0.177001953125, "loss_aux_layer_16": 0.191650390625, "loss_aux_layer_17": 0.197509765625, "loss_aux_layer_18": 0.206787109375, "loss_aux_layer_19": 0.206787109375, "loss_aux_layer_2": 0.110595703125, "loss_aux_layer_20": 0.210693359375, "loss_aux_layer_21": 0.21435546875, "loss_aux_layer_22": 0.234130859375, "loss_aux_layer_23": 0.2783203125, "loss_aux_layer_3": 0.1180419921875, "loss_aux_layer_4": 0.1195068359375, "loss_aux_layer_5": 0.121826171875, "loss_aux_layer_6": 0.1229248046875, "loss_aux_layer_7": 0.1168212890625, "loss_aux_layer_8": 0.1160888671875, "loss_aux_layer_9": 0.114990234375, "step": 457, "total_loss": 0.770015075802803 }, { "epoch": 0.09067511383884379, "grad_norm": 2.0693681240081787, "learning_rate": 5e-05, "llm_loss": 0.6416610330343246, "loss": 3.1823, "loss_aux_layer_0": 0.03314208984375, "loss_aux_layer_1": 0.1094970703125, "loss_aux_layer_10": 0.116455078125, "loss_aux_layer_11": 0.123291015625, "loss_aux_layer_12": 0.1337890625, "loss_aux_layer_13": 0.14453125, "loss_aux_layer_14": 0.161865234375, "loss_aux_layer_15": 0.177001953125, "loss_aux_layer_16": 0.190673828125, "loss_aux_layer_17": 0.1962890625, "loss_aux_layer_18": 0.20556640625, "loss_aux_layer_19": 0.205078125, "loss_aux_layer_2": 0.11279296875, "loss_aux_layer_20": 0.209228515625, "loss_aux_layer_21": 0.214599609375, "loss_aux_layer_22": 0.23681640625, "loss_aux_layer_23": 0.28125, "loss_aux_layer_3": 0.1204833984375, "loss_aux_layer_4": 0.1219482421875, "loss_aux_layer_5": 0.1239013671875, "loss_aux_layer_6": 0.124755859375, "loss_aux_layer_7": 0.1181640625, "loss_aux_layer_8": 0.11669921875, "loss_aux_layer_9": 0.115478515625, "step": 458, "total_loss": 0.7955638915300369 }, { "epoch": 0.0908730944367452, "grad_norm": 1.818406343460083, "learning_rate": 5e-05, "llm_loss": 0.6599357426166534, "loss": 3.2443, "loss_aux_layer_0": 0.03289794921875, "loss_aux_layer_1": 0.11083984375, "loss_aux_layer_10": 0.1148681640625, "loss_aux_layer_11": 0.1217041015625, "loss_aux_layer_12": 0.131591796875, "loss_aux_layer_13": 0.141845703125, "loss_aux_layer_14": 0.158447265625, "loss_aux_layer_15": 0.1728515625, "loss_aux_layer_16": 0.187255859375, "loss_aux_layer_17": 0.19287109375, "loss_aux_layer_18": 0.201904296875, "loss_aux_layer_19": 0.20166015625, "loss_aux_layer_2": 0.11328125, "loss_aux_layer_20": 0.205322265625, "loss_aux_layer_21": 0.207275390625, "loss_aux_layer_22": 0.2265625, "loss_aux_layer_23": 0.27001953125, "loss_aux_layer_3": 0.1204833984375, "loss_aux_layer_4": 0.1219482421875, "loss_aux_layer_5": 0.123046875, "loss_aux_layer_6": 0.1236572265625, "loss_aux_layer_7": 0.1168212890625, "loss_aux_layer_8": 0.1153564453125, "loss_aux_layer_9": 0.1141357421875, "step": 459, "total_loss": 0.8110730946063995 }, { "epoch": 0.0910710750346466, "grad_norm": 2.1370952129364014, "learning_rate": 5e-05, "llm_loss": 0.6784115880727768, "loss": 3.3352, "loss_aux_layer_0": 0.03192138671875, "loss_aux_layer_1": 0.1124267578125, "loss_aux_layer_10": 0.121337890625, "loss_aux_layer_11": 0.128662109375, "loss_aux_layer_12": 0.138671875, "loss_aux_layer_13": 0.149169921875, "loss_aux_layer_14": 0.16552734375, "loss_aux_layer_15": 0.1796875, "loss_aux_layer_16": 0.193359375, "loss_aux_layer_17": 0.197509765625, "loss_aux_layer_18": 0.205078125, "loss_aux_layer_19": 0.203369140625, "loss_aux_layer_2": 0.1160888671875, "loss_aux_layer_20": 0.2060546875, "loss_aux_layer_21": 0.2080078125, "loss_aux_layer_22": 0.228271484375, "loss_aux_layer_23": 0.2724609375, "loss_aux_layer_3": 0.1240234375, "loss_aux_layer_4": 0.12646484375, "loss_aux_layer_5": 0.1280517578125, "loss_aux_layer_6": 0.12890625, "loss_aux_layer_7": 0.122802734375, "loss_aux_layer_8": 0.12158203125, "loss_aux_layer_9": 0.120361328125, "step": 460, "total_loss": 0.8338093161582947 }, { "epoch": 0.09126905563254802, "grad_norm": 3.0939085483551025, "learning_rate": 5e-05, "llm_loss": 0.6203707158565521, "loss": 3.1064, "loss_aux_layer_0": 0.03497314453125, "loss_aux_layer_1": 0.1104736328125, "loss_aux_layer_10": 0.1187744140625, "loss_aux_layer_11": 0.125244140625, "loss_aux_layer_12": 0.1351318359375, "loss_aux_layer_13": 0.146728515625, "loss_aux_layer_14": 0.165283203125, "loss_aux_layer_15": 0.180908203125, "loss_aux_layer_16": 0.19580078125, "loss_aux_layer_17": 0.201171875, "loss_aux_layer_18": 0.2099609375, "loss_aux_layer_19": 0.208740234375, "loss_aux_layer_2": 0.1142578125, "loss_aux_layer_20": 0.212646484375, "loss_aux_layer_21": 0.21533203125, "loss_aux_layer_22": 0.237060546875, "loss_aux_layer_23": 0.2822265625, "loss_aux_layer_3": 0.1229248046875, "loss_aux_layer_4": 0.1239013671875, "loss_aux_layer_5": 0.12548828125, "loss_aux_layer_6": 0.12646484375, "loss_aux_layer_7": 0.1204833984375, "loss_aux_layer_8": 0.1187744140625, "loss_aux_layer_9": 0.1175537109375, "step": 461, "total_loss": 0.7766113132238388 }, { "epoch": 0.09146703623044941, "grad_norm": 3.4645988941192627, "learning_rate": 5e-05, "llm_loss": 0.7885955572128296, "loss": 3.7859, "loss_aux_layer_0": 0.0330810546875, "loss_aux_layer_1": 0.112548828125, "loss_aux_layer_10": 0.1214599609375, "loss_aux_layer_11": 0.128173828125, "loss_aux_layer_12": 0.137939453125, "loss_aux_layer_13": 0.14892578125, "loss_aux_layer_14": 0.16650390625, "loss_aux_layer_15": 0.180908203125, "loss_aux_layer_16": 0.194580078125, "loss_aux_layer_17": 0.200439453125, "loss_aux_layer_18": 0.208984375, "loss_aux_layer_19": 0.20849609375, "loss_aux_layer_2": 0.11669921875, "loss_aux_layer_20": 0.212158203125, "loss_aux_layer_21": 0.216552734375, "loss_aux_layer_22": 0.2392578125, "loss_aux_layer_23": 0.28466796875, "loss_aux_layer_3": 0.1258544921875, "loss_aux_layer_4": 0.127197265625, "loss_aux_layer_5": 0.12939453125, "loss_aux_layer_6": 0.1300048828125, "loss_aux_layer_7": 0.123291015625, "loss_aux_layer_8": 0.1221923828125, "loss_aux_layer_9": 0.120361328125, "step": 462, "total_loss": 0.946478396654129 }, { "epoch": 0.09166501682835082, "grad_norm": 4.020228385925293, "learning_rate": 5e-05, "llm_loss": 0.7382624596357346, "loss": 3.5608, "loss_aux_layer_0": 0.03289794921875, "loss_aux_layer_1": 0.10888671875, "loss_aux_layer_10": 0.115478515625, "loss_aux_layer_11": 0.1219482421875, "loss_aux_layer_12": 0.1317138671875, "loss_aux_layer_13": 0.142333984375, "loss_aux_layer_14": 0.158935546875, "loss_aux_layer_15": 0.173583984375, "loss_aux_layer_16": 0.18798828125, "loss_aux_layer_17": 0.195068359375, "loss_aux_layer_18": 0.205078125, "loss_aux_layer_19": 0.2041015625, "loss_aux_layer_2": 0.1107177734375, "loss_aux_layer_20": 0.20849609375, "loss_aux_layer_21": 0.211181640625, "loss_aux_layer_22": 0.2314453125, "loss_aux_layer_23": 0.27490234375, "loss_aux_layer_3": 0.119140625, "loss_aux_layer_4": 0.1204833984375, "loss_aux_layer_5": 0.122314453125, "loss_aux_layer_6": 0.122802734375, "loss_aux_layer_7": 0.116455078125, "loss_aux_layer_8": 0.115478515625, "loss_aux_layer_9": 0.1146240234375, "step": 463, "total_loss": 0.8902034610509872 }, { "epoch": 0.09186299742625223, "grad_norm": 1.8198139667510986, "learning_rate": 5e-05, "llm_loss": 0.6416546553373337, "loss": 3.1867, "loss_aux_layer_0": 0.03265380859375, "loss_aux_layer_1": 0.111083984375, "loss_aux_layer_10": 0.1180419921875, "loss_aux_layer_11": 0.125732421875, "loss_aux_layer_12": 0.13525390625, "loss_aux_layer_13": 0.146240234375, "loss_aux_layer_14": 0.1630859375, "loss_aux_layer_15": 0.177978515625, "loss_aux_layer_16": 0.192138671875, "loss_aux_layer_17": 0.197509765625, "loss_aux_layer_18": 0.20849609375, "loss_aux_layer_19": 0.20751953125, "loss_aux_layer_2": 0.11474609375, "loss_aux_layer_20": 0.211181640625, "loss_aux_layer_21": 0.21435546875, "loss_aux_layer_22": 0.235107421875, "loss_aux_layer_23": 0.27978515625, "loss_aux_layer_3": 0.121826171875, "loss_aux_layer_4": 0.123046875, "loss_aux_layer_5": 0.125, "loss_aux_layer_6": 0.1256103515625, "loss_aux_layer_7": 0.119384765625, "loss_aux_layer_8": 0.117919921875, "loss_aux_layer_9": 0.116943359375, "step": 464, "total_loss": 0.7966853827238083 }, { "epoch": 0.09206097802415364, "grad_norm": 1.153658151626587, "learning_rate": 5e-05, "llm_loss": 0.7108826637268066, "loss": 3.4691, "loss_aux_layer_0": 0.033782958984375, "loss_aux_layer_1": 0.1148681640625, "loss_aux_layer_10": 0.11962890625, "loss_aux_layer_11": 0.1265869140625, "loss_aux_layer_12": 0.1365966796875, "loss_aux_layer_13": 0.148193359375, "loss_aux_layer_14": 0.16552734375, "loss_aux_layer_15": 0.18017578125, "loss_aux_layer_16": 0.1943359375, "loss_aux_layer_17": 0.199951171875, "loss_aux_layer_18": 0.2080078125, "loss_aux_layer_19": 0.20556640625, "loss_aux_layer_2": 0.1170654296875, "loss_aux_layer_20": 0.20947265625, "loss_aux_layer_21": 0.213134765625, "loss_aux_layer_22": 0.23486328125, "loss_aux_layer_23": 0.27880859375, "loss_aux_layer_3": 0.1240234375, "loss_aux_layer_4": 0.125732421875, "loss_aux_layer_5": 0.12744140625, "loss_aux_layer_6": 0.1285400390625, "loss_aux_layer_7": 0.122314453125, "loss_aux_layer_8": 0.1202392578125, "loss_aux_layer_9": 0.11865234375, "step": 465, "total_loss": 0.8672734946012497 }, { "epoch": 0.09225895862205503, "grad_norm": 1.5939640998840332, "learning_rate": 5e-05, "llm_loss": 0.5796119347214699, "loss": 2.9495, "loss_aux_layer_0": 0.032562255859375, "loss_aux_layer_1": 0.1107177734375, "loss_aux_layer_10": 0.1190185546875, "loss_aux_layer_11": 0.1263427734375, "loss_aux_layer_12": 0.136962890625, "loss_aux_layer_13": 0.1484375, "loss_aux_layer_14": 0.166259765625, "loss_aux_layer_15": 0.18115234375, "loss_aux_layer_16": 0.19482421875, "loss_aux_layer_17": 0.201416015625, "loss_aux_layer_18": 0.211669921875, "loss_aux_layer_19": 0.212158203125, "loss_aux_layer_2": 0.1151123046875, "loss_aux_layer_20": 0.216552734375, "loss_aux_layer_21": 0.2216796875, "loss_aux_layer_22": 0.24560546875, "loss_aux_layer_23": 0.2919921875, "loss_aux_layer_3": 0.1226806640625, "loss_aux_layer_4": 0.1236572265625, "loss_aux_layer_5": 0.1253662109375, "loss_aux_layer_6": 0.1260986328125, "loss_aux_layer_7": 0.119873046875, "loss_aux_layer_8": 0.1187744140625, "loss_aux_layer_9": 0.1177978515625, "step": 466, "total_loss": 0.7373654097318649 }, { "epoch": 0.09245693921995644, "grad_norm": 1.2654800415039062, "learning_rate": 5e-05, "llm_loss": 0.6268637776374817, "loss": 3.1108, "loss_aux_layer_0": 0.033111572265625, "loss_aux_layer_1": 0.1064453125, "loss_aux_layer_10": 0.1131591796875, "loss_aux_layer_11": 0.11962890625, "loss_aux_layer_12": 0.1300048828125, "loss_aux_layer_13": 0.140380859375, "loss_aux_layer_14": 0.158447265625, "loss_aux_layer_15": 0.17333984375, "loss_aux_layer_16": 0.18798828125, "loss_aux_layer_17": 0.19384765625, "loss_aux_layer_18": 0.202880859375, "loss_aux_layer_19": 0.203369140625, "loss_aux_layer_2": 0.1092529296875, "loss_aux_layer_20": 0.208740234375, "loss_aux_layer_21": 0.2119140625, "loss_aux_layer_22": 0.2333984375, "loss_aux_layer_23": 0.27880859375, "loss_aux_layer_3": 0.1163330078125, "loss_aux_layer_4": 0.1177978515625, "loss_aux_layer_5": 0.11962890625, "loss_aux_layer_6": 0.1209716796875, "loss_aux_layer_7": 0.114990234375, "loss_aux_layer_8": 0.113525390625, "loss_aux_layer_9": 0.1124267578125, "step": 467, "total_loss": 0.7776916772127151 }, { "epoch": 0.09265491981785785, "grad_norm": 1.951516032218933, "learning_rate": 5e-05, "llm_loss": 0.6300665736198425, "loss": 3.115, "loss_aux_layer_0": 0.0318603515625, "loss_aux_layer_1": 0.1053466796875, "loss_aux_layer_10": 0.1107177734375, "loss_aux_layer_11": 0.1175537109375, "loss_aux_layer_12": 0.127685546875, "loss_aux_layer_13": 0.138916015625, "loss_aux_layer_14": 0.15576171875, "loss_aux_layer_15": 0.171142578125, "loss_aux_layer_16": 0.18505859375, "loss_aux_layer_17": 0.191650390625, "loss_aux_layer_18": 0.20166015625, "loss_aux_layer_19": 0.201904296875, "loss_aux_layer_2": 0.1077880859375, "loss_aux_layer_20": 0.206787109375, "loss_aux_layer_21": 0.210693359375, "loss_aux_layer_22": 0.2314453125, "loss_aux_layer_23": 0.27685546875, "loss_aux_layer_3": 0.1142578125, "loss_aux_layer_4": 0.1153564453125, "loss_aux_layer_5": 0.1163330078125, "loss_aux_layer_6": 0.1173095703125, "loss_aux_layer_7": 0.111328125, "loss_aux_layer_8": 0.1104736328125, "loss_aux_layer_9": 0.109375, "step": 468, "total_loss": 0.7787531316280365 }, { "epoch": 0.09285290041575926, "grad_norm": 0.936705470085144, "learning_rate": 5e-05, "llm_loss": 0.6233813762664795, "loss": 3.0993, "loss_aux_layer_0": 0.032257080078125, "loss_aux_layer_1": 0.104736328125, "loss_aux_layer_10": 0.1119384765625, "loss_aux_layer_11": 0.119140625, "loss_aux_layer_12": 0.1300048828125, "loss_aux_layer_13": 0.141845703125, "loss_aux_layer_14": 0.159423828125, "loss_aux_layer_15": 0.17529296875, "loss_aux_layer_16": 0.1904296875, "loss_aux_layer_17": 0.1962890625, "loss_aux_layer_18": 0.20556640625, "loss_aux_layer_19": 0.20654296875, "loss_aux_layer_2": 0.107666015625, "loss_aux_layer_20": 0.21142578125, "loss_aux_layer_21": 0.216796875, "loss_aux_layer_22": 0.238037109375, "loss_aux_layer_23": 0.28466796875, "loss_aux_layer_3": 0.11474609375, "loss_aux_layer_4": 0.115966796875, "loss_aux_layer_5": 0.11767578125, "loss_aux_layer_6": 0.118896484375, "loss_aux_layer_7": 0.11328125, "loss_aux_layer_8": 0.1119384765625, "loss_aux_layer_9": 0.1107177734375, "step": 469, "total_loss": 0.774834394454956 }, { "epoch": 0.09305088101366066, "grad_norm": 2.3807532787323, "learning_rate": 5e-05, "llm_loss": 0.7560977786779404, "loss": 3.6215, "loss_aux_layer_0": 0.03472900390625, "loss_aux_layer_1": 0.1077880859375, "loss_aux_layer_10": 0.111083984375, "loss_aux_layer_11": 0.1180419921875, "loss_aux_layer_12": 0.128173828125, "loss_aux_layer_13": 0.1390380859375, "loss_aux_layer_14": 0.156982421875, "loss_aux_layer_15": 0.171875, "loss_aux_layer_16": 0.187255859375, "loss_aux_layer_17": 0.193359375, "loss_aux_layer_18": 0.202880859375, "loss_aux_layer_19": 0.202880859375, "loss_aux_layer_2": 0.1094970703125, "loss_aux_layer_20": 0.20654296875, "loss_aux_layer_21": 0.209716796875, "loss_aux_layer_22": 0.228515625, "loss_aux_layer_23": 0.27197265625, "loss_aux_layer_3": 0.115966796875, "loss_aux_layer_4": 0.1168212890625, "loss_aux_layer_5": 0.1181640625, "loss_aux_layer_6": 0.1182861328125, "loss_aux_layer_7": 0.1126708984375, "loss_aux_layer_8": 0.11083984375, "loss_aux_layer_9": 0.1097412109375, "step": 470, "total_loss": 0.9053739011287689 }, { "epoch": 0.09324886161156207, "grad_norm": 3.6455581188201904, "learning_rate": 5e-05, "llm_loss": 0.6157196313142776, "loss": 3.0774, "loss_aux_layer_0": 0.032073974609375, "loss_aux_layer_1": 0.1090087890625, "loss_aux_layer_10": 0.1173095703125, "loss_aux_layer_11": 0.1240234375, "loss_aux_layer_12": 0.133544921875, "loss_aux_layer_13": 0.144775390625, "loss_aux_layer_14": 0.161865234375, "loss_aux_layer_15": 0.176513671875, "loss_aux_layer_16": 0.191162109375, "loss_aux_layer_17": 0.197509765625, "loss_aux_layer_18": 0.206298828125, "loss_aux_layer_19": 0.205078125, "loss_aux_layer_2": 0.1121826171875, "loss_aux_layer_20": 0.208740234375, "loss_aux_layer_21": 0.2109375, "loss_aux_layer_22": 0.231201171875, "loss_aux_layer_23": 0.275390625, "loss_aux_layer_3": 0.121826171875, "loss_aux_layer_4": 0.1231689453125, "loss_aux_layer_5": 0.1248779296875, "loss_aux_layer_6": 0.1251220703125, "loss_aux_layer_7": 0.11962890625, "loss_aux_layer_8": 0.11767578125, "loss_aux_layer_9": 0.1160888671875, "step": 471, "total_loss": 0.769357368350029 }, { "epoch": 0.09344684220946348, "grad_norm": 4.142650604248047, "learning_rate": 5e-05, "llm_loss": 0.6510204970836639, "loss": 3.2283, "loss_aux_layer_0": 0.0328369140625, "loss_aux_layer_1": 0.1116943359375, "loss_aux_layer_10": 0.1202392578125, "loss_aux_layer_11": 0.1275634765625, "loss_aux_layer_12": 0.13720703125, "loss_aux_layer_13": 0.1484375, "loss_aux_layer_14": 0.165283203125, "loss_aux_layer_15": 0.18017578125, "loss_aux_layer_16": 0.194091796875, "loss_aux_layer_17": 0.198974609375, "loss_aux_layer_18": 0.20654296875, "loss_aux_layer_19": 0.204833984375, "loss_aux_layer_2": 0.11572265625, "loss_aux_layer_20": 0.207275390625, "loss_aux_layer_21": 0.21142578125, "loss_aux_layer_22": 0.233642578125, "loss_aux_layer_23": 0.27783203125, "loss_aux_layer_3": 0.124267578125, "loss_aux_layer_4": 0.1258544921875, "loss_aux_layer_5": 0.1275634765625, "loss_aux_layer_6": 0.129150390625, "loss_aux_layer_7": 0.12353515625, "loss_aux_layer_8": 0.1217041015625, "loss_aux_layer_9": 0.1190185546875, "step": 472, "total_loss": 0.8070699125528336 }, { "epoch": 0.09364482280736487, "grad_norm": 4.506112098693848, "learning_rate": 5e-05, "llm_loss": 0.6124822720885277, "loss": 3.0674, "loss_aux_layer_0": 0.03277587890625, "loss_aux_layer_1": 0.11181640625, "loss_aux_layer_10": 0.1180419921875, "loss_aux_layer_11": 0.124755859375, "loss_aux_layer_12": 0.134033203125, "loss_aux_layer_13": 0.14453125, "loss_aux_layer_14": 0.1611328125, "loss_aux_layer_15": 0.175537109375, "loss_aux_layer_16": 0.189208984375, "loss_aux_layer_17": 0.194091796875, "loss_aux_layer_18": 0.203125, "loss_aux_layer_19": 0.202392578125, "loss_aux_layer_2": 0.1141357421875, "loss_aux_layer_20": 0.206298828125, "loss_aux_layer_21": 0.21142578125, "loss_aux_layer_22": 0.23486328125, "loss_aux_layer_23": 0.2802734375, "loss_aux_layer_3": 0.1265869140625, "loss_aux_layer_4": 0.1268310546875, "loss_aux_layer_5": 0.129150390625, "loss_aux_layer_6": 0.1275634765625, "loss_aux_layer_7": 0.1212158203125, "loss_aux_layer_8": 0.118896484375, "loss_aux_layer_9": 0.1168212890625, "step": 473, "total_loss": 0.7668470144271851 }, { "epoch": 0.09384280340526628, "grad_norm": 3.0677287578582764, "learning_rate": 5e-05, "llm_loss": 0.6209544092416763, "loss": 3.1035, "loss_aux_layer_0": 0.0321044921875, "loss_aux_layer_1": 0.1109619140625, "loss_aux_layer_10": 0.1181640625, "loss_aux_layer_11": 0.125244140625, "loss_aux_layer_12": 0.13525390625, "loss_aux_layer_13": 0.146240234375, "loss_aux_layer_14": 0.16357421875, "loss_aux_layer_15": 0.1787109375, "loss_aux_layer_16": 0.192626953125, "loss_aux_layer_17": 0.19775390625, "loss_aux_layer_18": 0.2060546875, "loss_aux_layer_19": 0.204833984375, "loss_aux_layer_2": 0.115478515625, "loss_aux_layer_20": 0.208984375, "loss_aux_layer_21": 0.213134765625, "loss_aux_layer_22": 0.237060546875, "loss_aux_layer_23": 0.2822265625, "loss_aux_layer_3": 0.120849609375, "loss_aux_layer_4": 0.1226806640625, "loss_aux_layer_5": 0.1240234375, "loss_aux_layer_6": 0.1253662109375, "loss_aux_layer_7": 0.1201171875, "loss_aux_layer_8": 0.1187744140625, "loss_aux_layer_9": 0.1170654296875, "step": 474, "total_loss": 0.7758764028549194 }, { "epoch": 0.09404078400316769, "grad_norm": 1.2184339761734009, "learning_rate": 5e-05, "llm_loss": 0.6087933331727982, "loss": 3.0417, "loss_aux_layer_0": 0.03302001953125, "loss_aux_layer_1": 0.1075439453125, "loss_aux_layer_10": 0.1146240234375, "loss_aux_layer_11": 0.1220703125, "loss_aux_layer_12": 0.13232421875, "loss_aux_layer_13": 0.143310546875, "loss_aux_layer_14": 0.161376953125, "loss_aux_layer_15": 0.17578125, "loss_aux_layer_16": 0.18994140625, "loss_aux_layer_17": 0.19482421875, "loss_aux_layer_18": 0.203125, "loss_aux_layer_19": 0.2021484375, "loss_aux_layer_2": 0.111083984375, "loss_aux_layer_20": 0.20556640625, "loss_aux_layer_21": 0.208740234375, "loss_aux_layer_22": 0.23046875, "loss_aux_layer_23": 0.275390625, "loss_aux_layer_3": 0.1181640625, "loss_aux_layer_4": 0.119873046875, "loss_aux_layer_5": 0.1217041015625, "loss_aux_layer_6": 0.1229248046875, "loss_aux_layer_7": 0.1165771484375, "loss_aux_layer_8": 0.1146240234375, "loss_aux_layer_9": 0.113525390625, "step": 475, "total_loss": 0.7604207098484039 }, { "epoch": 0.0942387646010691, "grad_norm": 2.948462724685669, "learning_rate": 5e-05, "llm_loss": 0.6854136735200882, "loss": 3.3696, "loss_aux_layer_0": 0.0325927734375, "loss_aux_layer_1": 0.1134033203125, "loss_aux_layer_10": 0.1224365234375, "loss_aux_layer_11": 0.12939453125, "loss_aux_layer_12": 0.139404296875, "loss_aux_layer_13": 0.1494140625, "loss_aux_layer_14": 0.167236328125, "loss_aux_layer_15": 0.181396484375, "loss_aux_layer_16": 0.195068359375, "loss_aux_layer_17": 0.198974609375, "loss_aux_layer_18": 0.20751953125, "loss_aux_layer_19": 0.2060546875, "loss_aux_layer_2": 0.1175537109375, "loss_aux_layer_20": 0.208740234375, "loss_aux_layer_21": 0.2119140625, "loss_aux_layer_22": 0.232177734375, "loss_aux_layer_23": 0.27685546875, "loss_aux_layer_3": 0.12548828125, "loss_aux_layer_4": 0.127197265625, "loss_aux_layer_5": 0.1290283203125, "loss_aux_layer_6": 0.1295166015625, "loss_aux_layer_7": 0.1236572265625, "loss_aux_layer_8": 0.1224365234375, "loss_aux_layer_9": 0.120849609375, "step": 476, "total_loss": 0.8424078822135925 }, { "epoch": 0.0944367451989705, "grad_norm": 2.643921136856079, "learning_rate": 5e-05, "llm_loss": 0.6347029954195023, "loss": 3.1537, "loss_aux_layer_0": 0.0321044921875, "loss_aux_layer_1": 0.1090087890625, "loss_aux_layer_10": 0.1177978515625, "loss_aux_layer_11": 0.124755859375, "loss_aux_layer_12": 0.134765625, "loss_aux_layer_13": 0.14501953125, "loss_aux_layer_14": 0.162353515625, "loss_aux_layer_15": 0.1767578125, "loss_aux_layer_16": 0.1904296875, "loss_aux_layer_17": 0.1962890625, "loss_aux_layer_18": 0.2041015625, "loss_aux_layer_19": 0.20361328125, "loss_aux_layer_2": 0.112060546875, "loss_aux_layer_20": 0.207763671875, "loss_aux_layer_21": 0.21240234375, "loss_aux_layer_22": 0.234375, "loss_aux_layer_23": 0.2783203125, "loss_aux_layer_3": 0.1212158203125, "loss_aux_layer_4": 0.12255859375, "loss_aux_layer_5": 0.124267578125, "loss_aux_layer_6": 0.1251220703125, "loss_aux_layer_7": 0.11962890625, "loss_aux_layer_8": 0.1181640625, "loss_aux_layer_9": 0.116943359375, "step": 477, "total_loss": 0.7884292602539062 }, { "epoch": 0.0946347257968719, "grad_norm": 1.4276291131973267, "learning_rate": 5e-05, "llm_loss": 0.5700121074914932, "loss": 2.8863, "loss_aux_layer_0": 0.03460693359375, "loss_aux_layer_1": 0.108154296875, "loss_aux_layer_10": 0.11474609375, "loss_aux_layer_11": 0.12158203125, "loss_aux_layer_12": 0.1318359375, "loss_aux_layer_13": 0.142822265625, "loss_aux_layer_14": 0.159912109375, "loss_aux_layer_15": 0.175048828125, "loss_aux_layer_16": 0.189208984375, "loss_aux_layer_17": 0.1943359375, "loss_aux_layer_18": 0.203125, "loss_aux_layer_19": 0.202880859375, "loss_aux_layer_2": 0.110595703125, "loss_aux_layer_20": 0.20703125, "loss_aux_layer_21": 0.2109375, "loss_aux_layer_22": 0.231689453125, "loss_aux_layer_23": 0.27734375, "loss_aux_layer_3": 0.1175537109375, "loss_aux_layer_4": 0.119140625, "loss_aux_layer_5": 0.120361328125, "loss_aux_layer_6": 0.1217041015625, "loss_aux_layer_7": 0.1158447265625, "loss_aux_layer_8": 0.1142578125, "loss_aux_layer_9": 0.1136474609375, "step": 478, "total_loss": 0.7215816676616669 }, { "epoch": 0.09483270639477331, "grad_norm": 1.4245291948318481, "learning_rate": 5e-05, "llm_loss": 0.6146702319383621, "loss": 3.0739, "loss_aux_layer_0": 0.0330810546875, "loss_aux_layer_1": 0.1138916015625, "loss_aux_layer_10": 0.1182861328125, "loss_aux_layer_11": 0.12548828125, "loss_aux_layer_12": 0.1348876953125, "loss_aux_layer_13": 0.14501953125, "loss_aux_layer_14": 0.162109375, "loss_aux_layer_15": 0.17626953125, "loss_aux_layer_16": 0.189697265625, "loss_aux_layer_17": 0.1943359375, "loss_aux_layer_18": 0.201904296875, "loss_aux_layer_19": 0.200927734375, "loss_aux_layer_2": 0.1158447265625, "loss_aux_layer_20": 0.20556640625, "loss_aux_layer_21": 0.20849609375, "loss_aux_layer_22": 0.229736328125, "loss_aux_layer_23": 0.27392578125, "loss_aux_layer_3": 0.1229248046875, "loss_aux_layer_4": 0.1248779296875, "loss_aux_layer_5": 0.126220703125, "loss_aux_layer_6": 0.1280517578125, "loss_aux_layer_7": 0.12158203125, "loss_aux_layer_8": 0.119384765625, "loss_aux_layer_9": 0.1175537109375, "step": 479, "total_loss": 0.768470898270607 }, { "epoch": 0.09503068699267472, "grad_norm": 2.7254507541656494, "learning_rate": 5e-05, "llm_loss": 0.5895393639802933, "loss": 2.9672, "loss_aux_layer_0": 0.03240966796875, "loss_aux_layer_1": 0.10498046875, "loss_aux_layer_10": 0.1148681640625, "loss_aux_layer_11": 0.1220703125, "loss_aux_layer_12": 0.13232421875, "loss_aux_layer_13": 0.1435546875, "loss_aux_layer_14": 0.1611328125, "loss_aux_layer_15": 0.17626953125, "loss_aux_layer_16": 0.191162109375, "loss_aux_layer_17": 0.197509765625, "loss_aux_layer_18": 0.20556640625, "loss_aux_layer_19": 0.20556640625, "loss_aux_layer_2": 0.1082763671875, "loss_aux_layer_20": 0.20947265625, "loss_aux_layer_21": 0.212646484375, "loss_aux_layer_22": 0.234619140625, "loss_aux_layer_23": 0.2802734375, "loss_aux_layer_3": 0.117431640625, "loss_aux_layer_4": 0.11865234375, "loss_aux_layer_5": 0.120849609375, "loss_aux_layer_6": 0.12109375, "loss_aux_layer_7": 0.115478515625, "loss_aux_layer_8": 0.1143798828125, "loss_aux_layer_9": 0.11328125, "step": 480, "total_loss": 0.7417901754379272 }, { "epoch": 0.09522866759057612, "grad_norm": 3.094967842102051, "learning_rate": 5e-05, "llm_loss": 0.6087892055511475, "loss": 3.0291, "loss_aux_layer_0": 0.03240966796875, "loss_aux_layer_1": 0.104736328125, "loss_aux_layer_10": 0.1126708984375, "loss_aux_layer_11": 0.1187744140625, "loss_aux_layer_12": 0.12890625, "loss_aux_layer_13": 0.13916015625, "loss_aux_layer_14": 0.15673828125, "loss_aux_layer_15": 0.17138671875, "loss_aux_layer_16": 0.185302734375, "loss_aux_layer_17": 0.189697265625, "loss_aux_layer_18": 0.198486328125, "loss_aux_layer_19": 0.19775390625, "loss_aux_layer_2": 0.1082763671875, "loss_aux_layer_20": 0.202392578125, "loss_aux_layer_21": 0.206787109375, "loss_aux_layer_22": 0.2265625, "loss_aux_layer_23": 0.271484375, "loss_aux_layer_3": 0.1163330078125, "loss_aux_layer_4": 0.1171875, "loss_aux_layer_5": 0.11865234375, "loss_aux_layer_6": 0.119873046875, "loss_aux_layer_7": 0.1141357421875, "loss_aux_layer_8": 0.11279296875, "loss_aux_layer_9": 0.11181640625, "step": 481, "total_loss": 0.7572759985923767 }, { "epoch": 0.09542664818847753, "grad_norm": 1.2578308582305908, "learning_rate": 5e-05, "llm_loss": 0.7036239057779312, "loss": 3.4246, "loss_aux_layer_0": 0.03387451171875, "loss_aux_layer_1": 0.1094970703125, "loss_aux_layer_10": 0.1153564453125, "loss_aux_layer_11": 0.1221923828125, "loss_aux_layer_12": 0.13232421875, "loss_aux_layer_13": 0.1435546875, "loss_aux_layer_14": 0.161376953125, "loss_aux_layer_15": 0.1767578125, "loss_aux_layer_16": 0.19189453125, "loss_aux_layer_17": 0.197265625, "loss_aux_layer_18": 0.205810546875, "loss_aux_layer_19": 0.203857421875, "loss_aux_layer_2": 0.109619140625, "loss_aux_layer_20": 0.207275390625, "loss_aux_layer_21": 0.21142578125, "loss_aux_layer_22": 0.232666015625, "loss_aux_layer_23": 0.27783203125, "loss_aux_layer_3": 0.1177978515625, "loss_aux_layer_4": 0.1195068359375, "loss_aux_layer_5": 0.121337890625, "loss_aux_layer_6": 0.1226806640625, "loss_aux_layer_7": 0.1165771484375, "loss_aux_layer_8": 0.1151123046875, "loss_aux_layer_9": 0.114501953125, "step": 482, "total_loss": 0.8561395108699799 }, { "epoch": 0.09562462878637894, "grad_norm": 2.5400474071502686, "learning_rate": 5e-05, "llm_loss": 0.661270022392273, "loss": 3.2608, "loss_aux_layer_0": 0.03253173828125, "loss_aux_layer_1": 0.11083984375, "loss_aux_layer_10": 0.1182861328125, "loss_aux_layer_11": 0.12548828125, "loss_aux_layer_12": 0.1351318359375, "loss_aux_layer_13": 0.146240234375, "loss_aux_layer_14": 0.162353515625, "loss_aux_layer_15": 0.1767578125, "loss_aux_layer_16": 0.1904296875, "loss_aux_layer_17": 0.19482421875, "loss_aux_layer_18": 0.203369140625, "loss_aux_layer_19": 0.201904296875, "loss_aux_layer_2": 0.1126708984375, "loss_aux_layer_20": 0.2060546875, "loss_aux_layer_21": 0.21142578125, "loss_aux_layer_22": 0.234619140625, "loss_aux_layer_23": 0.27978515625, "loss_aux_layer_3": 0.12109375, "loss_aux_layer_4": 0.1229248046875, "loss_aux_layer_5": 0.125244140625, "loss_aux_layer_6": 0.126220703125, "loss_aux_layer_7": 0.120361328125, "loss_aux_layer_8": 0.118896484375, "loss_aux_layer_9": 0.116943359375, "step": 483, "total_loss": 0.815192773938179 }, { "epoch": 0.09582260938428035, "grad_norm": 2.281080961227417, "learning_rate": 5e-05, "llm_loss": 0.5705170631408691, "loss": 2.9013, "loss_aux_layer_0": 0.03485107421875, "loss_aux_layer_1": 0.1124267578125, "loss_aux_layer_10": 0.1185302734375, "loss_aux_layer_11": 0.1253662109375, "loss_aux_layer_12": 0.1348876953125, "loss_aux_layer_13": 0.145263671875, "loss_aux_layer_14": 0.16162109375, "loss_aux_layer_15": 0.176513671875, "loss_aux_layer_16": 0.1904296875, "loss_aux_layer_17": 0.19580078125, "loss_aux_layer_18": 0.2041015625, "loss_aux_layer_19": 0.202880859375, "loss_aux_layer_2": 0.1160888671875, "loss_aux_layer_20": 0.20703125, "loss_aux_layer_21": 0.212646484375, "loss_aux_layer_22": 0.23681640625, "loss_aux_layer_23": 0.28271484375, "loss_aux_layer_3": 0.1236572265625, "loss_aux_layer_4": 0.12451171875, "loss_aux_layer_5": 0.1259765625, "loss_aux_layer_6": 0.1265869140625, "loss_aux_layer_7": 0.120849609375, "loss_aux_layer_8": 0.119384765625, "loss_aux_layer_9": 0.1175537109375, "step": 484, "total_loss": 0.7253222465515137 }, { "epoch": 0.09602058998218174, "grad_norm": 1.6844040155410767, "learning_rate": 5e-05, "llm_loss": 0.6513121128082275, "loss": 3.2277, "loss_aux_layer_0": 0.03448486328125, "loss_aux_layer_1": 0.1136474609375, "loss_aux_layer_10": 0.1180419921875, "loss_aux_layer_11": 0.125244140625, "loss_aux_layer_12": 0.1357421875, "loss_aux_layer_13": 0.14599609375, "loss_aux_layer_14": 0.16357421875, "loss_aux_layer_15": 0.17822265625, "loss_aux_layer_16": 0.1923828125, "loss_aux_layer_17": 0.197998046875, "loss_aux_layer_18": 0.20654296875, "loss_aux_layer_19": 0.20556640625, "loss_aux_layer_2": 0.1151123046875, "loss_aux_layer_20": 0.210205078125, "loss_aux_layer_21": 0.21484375, "loss_aux_layer_22": 0.23828125, "loss_aux_layer_23": 0.283203125, "loss_aux_layer_3": 0.1226806640625, "loss_aux_layer_4": 0.12451171875, "loss_aux_layer_5": 0.1256103515625, "loss_aux_layer_6": 0.126708984375, "loss_aux_layer_7": 0.120849609375, "loss_aux_layer_8": 0.11865234375, "loss_aux_layer_9": 0.1170654296875, "step": 485, "total_loss": 0.8069271147251129 }, { "epoch": 0.09621857058008315, "grad_norm": 1.4824862480163574, "learning_rate": 5e-05, "llm_loss": 0.6466542780399323, "loss": 3.2049, "loss_aux_layer_0": 0.03765869140625, "loss_aux_layer_1": 0.1119384765625, "loss_aux_layer_10": 0.1168212890625, "loss_aux_layer_11": 0.123779296875, "loss_aux_layer_12": 0.134033203125, "loss_aux_layer_13": 0.14501953125, "loss_aux_layer_14": 0.1630859375, "loss_aux_layer_15": 0.177490234375, "loss_aux_layer_16": 0.19189453125, "loss_aux_layer_17": 0.197021484375, "loss_aux_layer_18": 0.205810546875, "loss_aux_layer_19": 0.205078125, "loss_aux_layer_2": 0.1124267578125, "loss_aux_layer_20": 0.20947265625, "loss_aux_layer_21": 0.21484375, "loss_aux_layer_22": 0.239013671875, "loss_aux_layer_23": 0.2841796875, "loss_aux_layer_3": 0.119873046875, "loss_aux_layer_4": 0.121337890625, "loss_aux_layer_5": 0.1234130859375, "loss_aux_layer_6": 0.124755859375, "loss_aux_layer_7": 0.1190185546875, "loss_aux_layer_8": 0.1173095703125, "loss_aux_layer_9": 0.1158447265625, "step": 486, "total_loss": 0.8012312054634094 }, { "epoch": 0.09641655117798456, "grad_norm": 2.609874725341797, "learning_rate": 5e-05, "llm_loss": 0.671049952507019, "loss": 3.2937, "loss_aux_layer_0": 0.033538818359375, "loss_aux_layer_1": 0.1136474609375, "loss_aux_layer_10": 0.116943359375, "loss_aux_layer_11": 0.1234130859375, "loss_aux_layer_12": 0.13330078125, "loss_aux_layer_13": 0.143798828125, "loss_aux_layer_14": 0.16064453125, "loss_aux_layer_15": 0.174560546875, "loss_aux_layer_16": 0.1884765625, "loss_aux_layer_17": 0.193359375, "loss_aux_layer_18": 0.20068359375, "loss_aux_layer_19": 0.198974609375, "loss_aux_layer_2": 0.1141357421875, "loss_aux_layer_20": 0.203369140625, "loss_aux_layer_21": 0.207275390625, "loss_aux_layer_22": 0.229248046875, "loss_aux_layer_23": 0.27294921875, "loss_aux_layer_3": 0.121337890625, "loss_aux_layer_4": 0.123046875, "loss_aux_layer_5": 0.1246337890625, "loss_aux_layer_6": 0.1258544921875, "loss_aux_layer_7": 0.1195068359375, "loss_aux_layer_8": 0.1177978515625, "loss_aux_layer_9": 0.115966796875, "step": 487, "total_loss": 0.8234275728464127 }, { "epoch": 0.09661453177588597, "grad_norm": 4.317695617675781, "learning_rate": 5e-05, "llm_loss": 0.6649497747421265, "loss": 3.2354, "loss_aux_layer_0": 0.033294677734375, "loss_aux_layer_1": 0.094970703125, "loss_aux_layer_10": 0.108154296875, "loss_aux_layer_11": 0.113525390625, "loss_aux_layer_12": 0.1226806640625, "loss_aux_layer_13": 0.133544921875, "loss_aux_layer_14": 0.15185546875, "loss_aux_layer_15": 0.16845703125, "loss_aux_layer_16": 0.18359375, "loss_aux_layer_17": 0.189453125, "loss_aux_layer_18": 0.198974609375, "loss_aux_layer_19": 0.199951171875, "loss_aux_layer_2": 0.097412109375, "loss_aux_layer_20": 0.20556640625, "loss_aux_layer_21": 0.208984375, "loss_aux_layer_22": 0.228271484375, "loss_aux_layer_23": 0.27392578125, "loss_aux_layer_3": 0.1041259765625, "loss_aux_layer_4": 0.1051025390625, "loss_aux_layer_5": 0.1072998046875, "loss_aux_layer_6": 0.1087646484375, "loss_aux_layer_7": 0.104248046875, "loss_aux_layer_8": 0.1051025390625, "loss_aux_layer_9": 0.105712890625, "step": 488, "total_loss": 0.8088584244251251 }, { "epoch": 0.09681251237378737, "grad_norm": 4.491673469543457, "learning_rate": 5e-05, "llm_loss": 0.6959913074970245, "loss": 3.3998, "loss_aux_layer_0": 0.0341796875, "loss_aux_layer_1": 0.109375, "loss_aux_layer_10": 0.123779296875, "loss_aux_layer_11": 0.127197265625, "loss_aux_layer_12": 0.13720703125, "loss_aux_layer_13": 0.147705078125, "loss_aux_layer_14": 0.1650390625, "loss_aux_layer_15": 0.1796875, "loss_aux_layer_16": 0.1943359375, "loss_aux_layer_17": 0.197509765625, "loss_aux_layer_18": 0.20556640625, "loss_aux_layer_19": 0.203857421875, "loss_aux_layer_2": 0.11083984375, "loss_aux_layer_20": 0.207275390625, "loss_aux_layer_21": 0.2099609375, "loss_aux_layer_22": 0.230224609375, "loss_aux_layer_23": 0.27490234375, "loss_aux_layer_3": 0.1182861328125, "loss_aux_layer_4": 0.119384765625, "loss_aux_layer_5": 0.1202392578125, "loss_aux_layer_6": 0.1221923828125, "loss_aux_layer_7": 0.1170654296875, "loss_aux_layer_8": 0.119140625, "loss_aux_layer_9": 0.1207275390625, "step": 489, "total_loss": 0.8499582409858704 }, { "epoch": 0.09701049297168877, "grad_norm": 3.10933518409729, "learning_rate": 5e-05, "llm_loss": 0.6011511087417603, "loss": 3.0081, "loss_aux_layer_0": 0.03277587890625, "loss_aux_layer_1": 0.101806640625, "loss_aux_layer_10": 0.1119384765625, "loss_aux_layer_11": 0.1185302734375, "loss_aux_layer_12": 0.1297607421875, "loss_aux_layer_13": 0.14111328125, "loss_aux_layer_14": 0.160400390625, "loss_aux_layer_15": 0.176513671875, "loss_aux_layer_16": 0.19189453125, "loss_aux_layer_17": 0.198486328125, "loss_aux_layer_18": 0.208251953125, "loss_aux_layer_19": 0.20849609375, "loss_aux_layer_2": 0.1064453125, "loss_aux_layer_20": 0.212158203125, "loss_aux_layer_21": 0.21484375, "loss_aux_layer_22": 0.234619140625, "loss_aux_layer_23": 0.2802734375, "loss_aux_layer_3": 0.11279296875, "loss_aux_layer_4": 0.1138916015625, "loss_aux_layer_5": 0.1153564453125, "loss_aux_layer_6": 0.1173095703125, "loss_aux_layer_7": 0.1121826171875, "loss_aux_layer_8": 0.110595703125, "loss_aux_layer_9": 0.1107177734375, "step": 490, "total_loss": 0.75202476978302 }, { "epoch": 0.09720847356959018, "grad_norm": 2.034078359603882, "learning_rate": 5e-05, "llm_loss": 0.6383686810731888, "loss": 3.1404, "loss_aux_layer_0": 0.029815673828125, "loss_aux_layer_1": 0.10205078125, "loss_aux_layer_10": 0.112060546875, "loss_aux_layer_11": 0.117919921875, "loss_aux_layer_12": 0.1275634765625, "loss_aux_layer_13": 0.138671875, "loss_aux_layer_14": 0.155029296875, "loss_aux_layer_15": 0.17041015625, "loss_aux_layer_16": 0.185302734375, "loss_aux_layer_17": 0.19140625, "loss_aux_layer_18": 0.199462890625, "loss_aux_layer_19": 0.198974609375, "loss_aux_layer_2": 0.103515625, "loss_aux_layer_20": 0.20263671875, "loss_aux_layer_21": 0.206787109375, "loss_aux_layer_22": 0.22705078125, "loss_aux_layer_23": 0.271484375, "loss_aux_layer_3": 0.1103515625, "loss_aux_layer_4": 0.11181640625, "loss_aux_layer_5": 0.1136474609375, "loss_aux_layer_6": 0.1153564453125, "loss_aux_layer_7": 0.1104736328125, "loss_aux_layer_8": 0.1097412109375, "loss_aux_layer_9": 0.10986328125, "step": 491, "total_loss": 0.7851078808307648 }, { "epoch": 0.09740645416749158, "grad_norm": 3.3413002490997314, "learning_rate": 5e-05, "llm_loss": 0.5961006283760071, "loss": 2.9945, "loss_aux_layer_0": 0.031524658203125, "loss_aux_layer_1": 0.1077880859375, "loss_aux_layer_10": 0.1171875, "loss_aux_layer_11": 0.12353515625, "loss_aux_layer_12": 0.133056640625, "loss_aux_layer_13": 0.144287109375, "loss_aux_layer_14": 0.161865234375, "loss_aux_layer_15": 0.176513671875, "loss_aux_layer_16": 0.19140625, "loss_aux_layer_17": 0.1962890625, "loss_aux_layer_18": 0.204345703125, "loss_aux_layer_19": 0.203369140625, "loss_aux_layer_2": 0.1109619140625, "loss_aux_layer_20": 0.20654296875, "loss_aux_layer_21": 0.2099609375, "loss_aux_layer_22": 0.231689453125, "loss_aux_layer_23": 0.27685546875, "loss_aux_layer_3": 0.11865234375, "loss_aux_layer_4": 0.1204833984375, "loss_aux_layer_5": 0.12158203125, "loss_aux_layer_6": 0.12353515625, "loss_aux_layer_7": 0.11767578125, "loss_aux_layer_8": 0.11669921875, "loss_aux_layer_9": 0.1156005859375, "step": 492, "total_loss": 0.74863001704216 }, { "epoch": 0.09760443476539299, "grad_norm": 3.1314685344696045, "learning_rate": 5e-05, "llm_loss": 0.671058788895607, "loss": 3.2957, "loss_aux_layer_0": 0.03192138671875, "loss_aux_layer_1": 0.1082763671875, "loss_aux_layer_10": 0.11767578125, "loss_aux_layer_11": 0.1243896484375, "loss_aux_layer_12": 0.134521484375, "loss_aux_layer_13": 0.145751953125, "loss_aux_layer_14": 0.163818359375, "loss_aux_layer_15": 0.1787109375, "loss_aux_layer_16": 0.19287109375, "loss_aux_layer_17": 0.197998046875, "loss_aux_layer_18": 0.2060546875, "loss_aux_layer_19": 0.20361328125, "loss_aux_layer_2": 0.109130859375, "loss_aux_layer_20": 0.20751953125, "loss_aux_layer_21": 0.2099609375, "loss_aux_layer_22": 0.229736328125, "loss_aux_layer_23": 0.27197265625, "loss_aux_layer_3": 0.1187744140625, "loss_aux_layer_4": 0.1204833984375, "loss_aux_layer_5": 0.1220703125, "loss_aux_layer_6": 0.12353515625, "loss_aux_layer_7": 0.117919921875, "loss_aux_layer_8": 0.11669921875, "loss_aux_layer_9": 0.1156005859375, "step": 493, "total_loss": 0.823933869600296 }, { "epoch": 0.0978024153632944, "grad_norm": 3.9547126293182373, "learning_rate": 5e-05, "llm_loss": 0.5636855289340019, "loss": 2.8691, "loss_aux_layer_0": 0.031494140625, "loss_aux_layer_1": 0.10302734375, "loss_aux_layer_10": 0.1165771484375, "loss_aux_layer_11": 0.1224365234375, "loss_aux_layer_12": 0.133056640625, "loss_aux_layer_13": 0.14501953125, "loss_aux_layer_14": 0.16259765625, "loss_aux_layer_15": 0.1787109375, "loss_aux_layer_16": 0.193115234375, "loss_aux_layer_17": 0.19921875, "loss_aux_layer_18": 0.207275390625, "loss_aux_layer_19": 0.20703125, "loss_aux_layer_2": 0.1083984375, "loss_aux_layer_20": 0.21142578125, "loss_aux_layer_21": 0.216552734375, "loss_aux_layer_22": 0.241943359375, "loss_aux_layer_23": 0.28955078125, "loss_aux_layer_3": 0.1165771484375, "loss_aux_layer_4": 0.1175537109375, "loss_aux_layer_5": 0.119384765625, "loss_aux_layer_6": 0.119873046875, "loss_aux_layer_7": 0.115966796875, "loss_aux_layer_8": 0.115234375, "loss_aux_layer_9": 0.1151123046875, "step": 494, "total_loss": 0.7172846645116806 }, { "epoch": 0.09800039596119581, "grad_norm": 3.628011703491211, "learning_rate": 5e-05, "llm_loss": 0.6148634031414986, "loss": 3.0781, "loss_aux_layer_0": 0.0350341796875, "loss_aux_layer_1": 0.1075439453125, "loss_aux_layer_10": 0.118896484375, "loss_aux_layer_11": 0.1251220703125, "loss_aux_layer_12": 0.135986328125, "loss_aux_layer_13": 0.147216796875, "loss_aux_layer_14": 0.165771484375, "loss_aux_layer_15": 0.180908203125, "loss_aux_layer_16": 0.19482421875, "loss_aux_layer_17": 0.199462890625, "loss_aux_layer_18": 0.20751953125, "loss_aux_layer_19": 0.206298828125, "loss_aux_layer_2": 0.1104736328125, "loss_aux_layer_20": 0.209716796875, "loss_aux_layer_21": 0.212646484375, "loss_aux_layer_22": 0.233642578125, "loss_aux_layer_23": 0.27783203125, "loss_aux_layer_3": 0.1201171875, "loss_aux_layer_4": 0.121337890625, "loss_aux_layer_5": 0.1234130859375, "loss_aux_layer_6": 0.123779296875, "loss_aux_layer_7": 0.118408203125, "loss_aux_layer_8": 0.118408203125, "loss_aux_layer_9": 0.1171875, "step": 495, "total_loss": 0.7695152312517166 }, { "epoch": 0.0981983765590972, "grad_norm": 1.1956984996795654, "learning_rate": 5e-05, "llm_loss": 0.5950875654816628, "loss": 2.977, "loss_aux_layer_0": 0.0318603515625, "loss_aux_layer_1": 0.1055908203125, "loss_aux_layer_10": 0.1124267578125, "loss_aux_layer_11": 0.1201171875, "loss_aux_layer_12": 0.130615234375, "loss_aux_layer_13": 0.141845703125, "loss_aux_layer_14": 0.1591796875, "loss_aux_layer_15": 0.175048828125, "loss_aux_layer_16": 0.18994140625, "loss_aux_layer_17": 0.19482421875, "loss_aux_layer_18": 0.203369140625, "loss_aux_layer_19": 0.201904296875, "loss_aux_layer_2": 0.105712890625, "loss_aux_layer_20": 0.20458984375, "loss_aux_layer_21": 0.206787109375, "loss_aux_layer_22": 0.226806640625, "loss_aux_layer_23": 0.2705078125, "loss_aux_layer_3": 0.113525390625, "loss_aux_layer_4": 0.1151123046875, "loss_aux_layer_5": 0.116455078125, "loss_aux_layer_6": 0.1182861328125, "loss_aux_layer_7": 0.113037109375, "loss_aux_layer_8": 0.1112060546875, "loss_aux_layer_9": 0.111083984375, "step": 496, "total_loss": 0.7442404627799988 }, { "epoch": 0.09839635715699861, "grad_norm": 2.7979631423950195, "learning_rate": 5e-05, "llm_loss": 0.7057991027832031, "loss": 3.4415, "loss_aux_layer_0": 0.0322265625, "loss_aux_layer_1": 0.1124267578125, "loss_aux_layer_10": 0.1202392578125, "loss_aux_layer_11": 0.1263427734375, "loss_aux_layer_12": 0.13671875, "loss_aux_layer_13": 0.1474609375, "loss_aux_layer_14": 0.1640625, "loss_aux_layer_15": 0.177490234375, "loss_aux_layer_16": 0.191650390625, "loss_aux_layer_17": 0.196533203125, "loss_aux_layer_18": 0.2041015625, "loss_aux_layer_19": 0.203369140625, "loss_aux_layer_2": 0.1138916015625, "loss_aux_layer_20": 0.20703125, "loss_aux_layer_21": 0.211181640625, "loss_aux_layer_22": 0.234130859375, "loss_aux_layer_23": 0.27783203125, "loss_aux_layer_3": 0.1212158203125, "loss_aux_layer_4": 0.1231689453125, "loss_aux_layer_5": 0.1243896484375, "loss_aux_layer_6": 0.1260986328125, "loss_aux_layer_7": 0.120849609375, "loss_aux_layer_8": 0.119873046875, "loss_aux_layer_9": 0.11865234375, "step": 497, "total_loss": 0.8603779524564743 }, { "epoch": 0.09859433775490002, "grad_norm": 3.6415727138519287, "learning_rate": 5e-05, "llm_loss": 0.6019372642040253, "loss": 3.0291, "loss_aux_layer_0": 0.034698486328125, "loss_aux_layer_1": 0.110107421875, "loss_aux_layer_10": 0.1209716796875, "loss_aux_layer_11": 0.127685546875, "loss_aux_layer_12": 0.13720703125, "loss_aux_layer_13": 0.14892578125, "loss_aux_layer_14": 0.165771484375, "loss_aux_layer_15": 0.180419921875, "loss_aux_layer_16": 0.1943359375, "loss_aux_layer_17": 0.1982421875, "loss_aux_layer_18": 0.205810546875, "loss_aux_layer_19": 0.204345703125, "loss_aux_layer_2": 0.112060546875, "loss_aux_layer_20": 0.2080078125, "loss_aux_layer_21": 0.2119140625, "loss_aux_layer_22": 0.233642578125, "loss_aux_layer_23": 0.27880859375, "loss_aux_layer_3": 0.1220703125, "loss_aux_layer_4": 0.123291015625, "loss_aux_layer_5": 0.125, "loss_aux_layer_6": 0.1259765625, "loss_aux_layer_7": 0.1199951171875, "loss_aux_layer_8": 0.1195068359375, "loss_aux_layer_9": 0.1187744140625, "step": 498, "total_loss": 0.757264107465744 }, { "epoch": 0.09879231835280143, "grad_norm": 4.275915622711182, "learning_rate": 5e-05, "llm_loss": 0.6395896822214127, "loss": 3.1465, "loss_aux_layer_0": 0.03033447265625, "loss_aux_layer_1": 0.1043701171875, "loss_aux_layer_10": 0.11083984375, "loss_aux_layer_11": 0.1171875, "loss_aux_layer_12": 0.126953125, "loss_aux_layer_13": 0.1376953125, "loss_aux_layer_14": 0.15478515625, "loss_aux_layer_15": 0.16943359375, "loss_aux_layer_16": 0.184814453125, "loss_aux_layer_17": 0.190185546875, "loss_aux_layer_18": 0.19873046875, "loss_aux_layer_19": 0.198486328125, "loss_aux_layer_2": 0.105224609375, "loss_aux_layer_20": 0.202392578125, "loss_aux_layer_21": 0.205078125, "loss_aux_layer_22": 0.223876953125, "loss_aux_layer_23": 0.26708984375, "loss_aux_layer_3": 0.114501953125, "loss_aux_layer_4": 0.1156005859375, "loss_aux_layer_5": 0.1177978515625, "loss_aux_layer_6": 0.11865234375, "loss_aux_layer_7": 0.1134033203125, "loss_aux_layer_8": 0.111328125, "loss_aux_layer_9": 0.1099853515625, "step": 499, "total_loss": 0.7866263836622238 }, { "epoch": 0.09899029895070283, "grad_norm": 3.762977123260498, "learning_rate": 5e-05, "llm_loss": 0.6354382485151291, "loss": 3.1565, "loss_aux_layer_0": 0.032196044921875, "loss_aux_layer_1": 0.1087646484375, "loss_aux_layer_10": 0.1182861328125, "loss_aux_layer_11": 0.12548828125, "loss_aux_layer_12": 0.135009765625, "loss_aux_layer_13": 0.14599609375, "loss_aux_layer_14": 0.16357421875, "loss_aux_layer_15": 0.1787109375, "loss_aux_layer_16": 0.19287109375, "loss_aux_layer_17": 0.19677734375, "loss_aux_layer_18": 0.204833984375, "loss_aux_layer_19": 0.203125, "loss_aux_layer_2": 0.1114501953125, "loss_aux_layer_20": 0.2060546875, "loss_aux_layer_21": 0.20947265625, "loss_aux_layer_22": 0.22998046875, "loss_aux_layer_23": 0.2724609375, "loss_aux_layer_3": 0.1231689453125, "loss_aux_layer_4": 0.1241455078125, "loss_aux_layer_5": 0.1260986328125, "loss_aux_layer_6": 0.1259765625, "loss_aux_layer_7": 0.1195068359375, "loss_aux_layer_8": 0.1181640625, "loss_aux_layer_9": 0.1168212890625, "step": 500, "total_loss": 0.789135605096817 }, { "epoch": 0.09918827954860424, "grad_norm": 1.2539563179016113, "learning_rate": 5e-05, "llm_loss": 0.6431127339601517, "loss": 3.1663, "loss_aux_layer_0": 0.03460693359375, "loss_aux_layer_1": 0.10009765625, "loss_aux_layer_10": 0.1099853515625, "loss_aux_layer_11": 0.1165771484375, "loss_aux_layer_12": 0.1265869140625, "loss_aux_layer_13": 0.137939453125, "loss_aux_layer_14": 0.155517578125, "loss_aux_layer_15": 0.171875, "loss_aux_layer_16": 0.185791015625, "loss_aux_layer_17": 0.19189453125, "loss_aux_layer_18": 0.203125, "loss_aux_layer_19": 0.203125, "loss_aux_layer_2": 0.1031494140625, "loss_aux_layer_20": 0.208251953125, "loss_aux_layer_21": 0.214599609375, "loss_aux_layer_22": 0.23828125, "loss_aux_layer_23": 0.28564453125, "loss_aux_layer_3": 0.10986328125, "loss_aux_layer_4": 0.1109619140625, "loss_aux_layer_5": 0.1129150390625, "loss_aux_layer_6": 0.1146240234375, "loss_aux_layer_7": 0.110107421875, "loss_aux_layer_8": 0.109130859375, "loss_aux_layer_9": 0.10888671875, "step": 501, "total_loss": 0.7915715277194977 }, { "epoch": 0.09938626014650564, "grad_norm": 3.3266916275024414, "learning_rate": 5e-05, "llm_loss": 0.6390442401170731, "loss": 3.1468, "loss_aux_layer_0": 0.032562255859375, "loss_aux_layer_1": 0.1043701171875, "loss_aux_layer_10": 0.11181640625, "loss_aux_layer_11": 0.1182861328125, "loss_aux_layer_12": 0.12744140625, "loss_aux_layer_13": 0.138671875, "loss_aux_layer_14": 0.155029296875, "loss_aux_layer_15": 0.169921875, "loss_aux_layer_16": 0.183349609375, "loss_aux_layer_17": 0.189208984375, "loss_aux_layer_18": 0.197998046875, "loss_aux_layer_19": 0.197265625, "loss_aux_layer_2": 0.1063232421875, "loss_aux_layer_20": 0.201904296875, "loss_aux_layer_21": 0.20703125, "loss_aux_layer_22": 0.228515625, "loss_aux_layer_23": 0.27294921875, "loss_aux_layer_3": 0.1143798828125, "loss_aux_layer_4": 0.115478515625, "loss_aux_layer_5": 0.1171875, "loss_aux_layer_6": 0.11865234375, "loss_aux_layer_7": 0.1136474609375, "loss_aux_layer_8": 0.11181640625, "loss_aux_layer_9": 0.1104736328125, "step": 502, "total_loss": 0.7866914570331573 }, { "epoch": 0.09958424074440705, "grad_norm": 4.404371738433838, "learning_rate": 5e-05, "llm_loss": 0.6908309757709503, "loss": 3.3946, "loss_aux_layer_0": 0.03314208984375, "loss_aux_layer_1": 0.11376953125, "loss_aux_layer_10": 0.1224365234375, "loss_aux_layer_11": 0.12939453125, "loss_aux_layer_12": 0.139404296875, "loss_aux_layer_13": 0.14892578125, "loss_aux_layer_14": 0.165771484375, "loss_aux_layer_15": 0.179931640625, "loss_aux_layer_16": 0.1943359375, "loss_aux_layer_17": 0.19970703125, "loss_aux_layer_18": 0.209228515625, "loss_aux_layer_19": 0.207763671875, "loss_aux_layer_2": 0.11669921875, "loss_aux_layer_20": 0.2109375, "loss_aux_layer_21": 0.213623046875, "loss_aux_layer_22": 0.234375, "loss_aux_layer_23": 0.2783203125, "loss_aux_layer_3": 0.1285400390625, "loss_aux_layer_4": 0.1298828125, "loss_aux_layer_5": 0.1309814453125, "loss_aux_layer_6": 0.13134765625, "loss_aux_layer_7": 0.1246337890625, "loss_aux_layer_8": 0.1226806640625, "loss_aux_layer_9": 0.1212158203125, "step": 503, "total_loss": 0.8486585319042206 }, { "epoch": 0.09978222134230845, "grad_norm": 2.506413459777832, "learning_rate": 5e-05, "llm_loss": 0.7140385657548904, "loss": 3.4608, "loss_aux_layer_0": 0.03546142578125, "loss_aux_layer_1": 0.1092529296875, "loss_aux_layer_10": 0.1165771484375, "loss_aux_layer_11": 0.1234130859375, "loss_aux_layer_12": 0.1336669921875, "loss_aux_layer_13": 0.143798828125, "loss_aux_layer_14": 0.16064453125, "loss_aux_layer_15": 0.175537109375, "loss_aux_layer_16": 0.18896484375, "loss_aux_layer_17": 0.192626953125, "loss_aux_layer_18": 0.20068359375, "loss_aux_layer_19": 0.198974609375, "loss_aux_layer_2": 0.1099853515625, "loss_aux_layer_20": 0.20263671875, "loss_aux_layer_21": 0.2060546875, "loss_aux_layer_22": 0.22705078125, "loss_aux_layer_23": 0.271484375, "loss_aux_layer_3": 0.117919921875, "loss_aux_layer_4": 0.1195068359375, "loss_aux_layer_5": 0.12109375, "loss_aux_layer_6": 0.123291015625, "loss_aux_layer_7": 0.1177978515625, "loss_aux_layer_8": 0.1163330078125, "loss_aux_layer_9": 0.11572265625, "step": 504, "total_loss": 0.8651942014694214 }, { "epoch": 0.09998020194020986, "grad_norm": 6.936850070953369, "learning_rate": 5e-05, "llm_loss": 0.6193081438541412, "loss": 3.1207, "loss_aux_layer_0": 0.031890869140625, "loss_aux_layer_1": 0.11474609375, "loss_aux_layer_10": 0.1240234375, "loss_aux_layer_11": 0.130615234375, "loss_aux_layer_12": 0.1396484375, "loss_aux_layer_13": 0.150146484375, "loss_aux_layer_14": 0.16650390625, "loss_aux_layer_15": 0.180908203125, "loss_aux_layer_16": 0.194091796875, "loss_aux_layer_17": 0.19970703125, "loss_aux_layer_18": 0.209228515625, "loss_aux_layer_19": 0.20849609375, "loss_aux_layer_2": 0.1209716796875, "loss_aux_layer_20": 0.211669921875, "loss_aux_layer_21": 0.217041015625, "loss_aux_layer_22": 0.24169921875, "loss_aux_layer_23": 0.28759765625, "loss_aux_layer_3": 0.136474609375, "loss_aux_layer_4": 0.13671875, "loss_aux_layer_5": 0.138671875, "loss_aux_layer_6": 0.137939453125, "loss_aux_layer_7": 0.131103515625, "loss_aux_layer_8": 0.1275634765625, "loss_aux_layer_9": 0.12353515625, "step": 505, "total_loss": 0.7801660001277924 }, { "epoch": 0.10017818253811127, "grad_norm": 5.742719650268555, "learning_rate": 5e-05, "llm_loss": 0.7241515666246414, "loss": 3.5375, "loss_aux_layer_0": 0.0325927734375, "loss_aux_layer_1": 0.111083984375, "loss_aux_layer_10": 0.12451171875, "loss_aux_layer_11": 0.130615234375, "loss_aux_layer_12": 0.1397705078125, "loss_aux_layer_13": 0.1494140625, "loss_aux_layer_14": 0.166015625, "loss_aux_layer_15": 0.180908203125, "loss_aux_layer_16": 0.1953125, "loss_aux_layer_17": 0.200439453125, "loss_aux_layer_18": 0.20947265625, "loss_aux_layer_19": 0.208984375, "loss_aux_layer_2": 0.1212158203125, "loss_aux_layer_20": 0.213134765625, "loss_aux_layer_21": 0.216796875, "loss_aux_layer_22": 0.238037109375, "loss_aux_layer_23": 0.28173828125, "loss_aux_layer_3": 0.1376953125, "loss_aux_layer_4": 0.136474609375, "loss_aux_layer_5": 0.138671875, "loss_aux_layer_6": 0.1358642578125, "loss_aux_layer_7": 0.12744140625, "loss_aux_layer_8": 0.125732421875, "loss_aux_layer_9": 0.123046875, "step": 506, "total_loss": 0.8843710720539093 }, { "epoch": 0.10037616313601266, "grad_norm": 3.5423057079315186, "learning_rate": 5e-05, "llm_loss": 0.7114307731389999, "loss": 3.4595, "loss_aux_layer_0": 0.031524658203125, "loss_aux_layer_1": 0.1085205078125, "loss_aux_layer_10": 0.116943359375, "loss_aux_layer_11": 0.1234130859375, "loss_aux_layer_12": 0.1328125, "loss_aux_layer_13": 0.143798828125, "loss_aux_layer_14": 0.160400390625, "loss_aux_layer_15": 0.175537109375, "loss_aux_layer_16": 0.189453125, "loss_aux_layer_17": 0.196044921875, "loss_aux_layer_18": 0.204345703125, "loss_aux_layer_19": 0.202392578125, "loss_aux_layer_2": 0.110107421875, "loss_aux_layer_20": 0.20654296875, "loss_aux_layer_21": 0.20849609375, "loss_aux_layer_22": 0.23046875, "loss_aux_layer_23": 0.2734375, "loss_aux_layer_3": 0.1278076171875, "loss_aux_layer_4": 0.128662109375, "loss_aux_layer_5": 0.1307373046875, "loss_aux_layer_6": 0.128662109375, "loss_aux_layer_7": 0.1199951171875, "loss_aux_layer_8": 0.1181640625, "loss_aux_layer_9": 0.11572265625, "step": 507, "total_loss": 0.8648790568113327 }, { "epoch": 0.10057414373391407, "grad_norm": 2.4504082202911377, "learning_rate": 5e-05, "llm_loss": 0.5670596957206726, "loss": 2.8684, "loss_aux_layer_0": 0.032470703125, "loss_aux_layer_1": 0.103759765625, "loss_aux_layer_10": 0.1142578125, "loss_aux_layer_11": 0.12060546875, "loss_aux_layer_12": 0.130126953125, "loss_aux_layer_13": 0.140625, "loss_aux_layer_14": 0.156982421875, "loss_aux_layer_15": 0.172607421875, "loss_aux_layer_16": 0.18701171875, "loss_aux_layer_17": 0.19287109375, "loss_aux_layer_18": 0.2021484375, "loss_aux_layer_19": 0.20166015625, "loss_aux_layer_2": 0.1136474609375, "loss_aux_layer_20": 0.205810546875, "loss_aux_layer_21": 0.2099609375, "loss_aux_layer_22": 0.22998046875, "loss_aux_layer_23": 0.2734375, "loss_aux_layer_3": 0.115234375, "loss_aux_layer_4": 0.1162109375, "loss_aux_layer_5": 0.1177978515625, "loss_aux_layer_6": 0.118408203125, "loss_aux_layer_7": 0.116455078125, "loss_aux_layer_8": 0.11572265625, "loss_aux_layer_9": 0.11328125, "step": 508, "total_loss": 0.7171060591936111 }, { "epoch": 0.10077212433181548, "grad_norm": 3.949594736099243, "learning_rate": 5e-05, "llm_loss": 0.628577396273613, "loss": 3.1427, "loss_aux_layer_0": 0.03167724609375, "loss_aux_layer_1": 0.10986328125, "loss_aux_layer_10": 0.121826171875, "loss_aux_layer_11": 0.1279296875, "loss_aux_layer_12": 0.136962890625, "loss_aux_layer_13": 0.147705078125, "loss_aux_layer_14": 0.163330078125, "loss_aux_layer_15": 0.177490234375, "loss_aux_layer_16": 0.1923828125, "loss_aux_layer_17": 0.197265625, "loss_aux_layer_18": 0.205810546875, "loss_aux_layer_19": 0.20458984375, "loss_aux_layer_2": 0.12646484375, "loss_aux_layer_20": 0.208740234375, "loss_aux_layer_21": 0.212890625, "loss_aux_layer_22": 0.235595703125, "loss_aux_layer_23": 0.2802734375, "loss_aux_layer_3": 0.127685546875, "loss_aux_layer_4": 0.12841796875, "loss_aux_layer_5": 0.130615234375, "loss_aux_layer_6": 0.1292724609375, "loss_aux_layer_7": 0.1279296875, "loss_aux_layer_8": 0.124755859375, "loss_aux_layer_9": 0.121337890625, "step": 509, "total_loss": 0.7856809049844742 }, { "epoch": 0.10097010492971689, "grad_norm": 3.167658805847168, "learning_rate": 5e-05, "llm_loss": 0.6436900347471237, "loss": 3.1801, "loss_aux_layer_0": 0.03167724609375, "loss_aux_layer_1": 0.1029052734375, "loss_aux_layer_10": 0.115478515625, "loss_aux_layer_11": 0.1221923828125, "loss_aux_layer_12": 0.131591796875, "loss_aux_layer_13": 0.14306640625, "loss_aux_layer_14": 0.159912109375, "loss_aux_layer_15": 0.17529296875, "loss_aux_layer_16": 0.189208984375, "loss_aux_layer_17": 0.19384765625, "loss_aux_layer_18": 0.2041015625, "loss_aux_layer_19": 0.203857421875, "loss_aux_layer_2": 0.109375, "loss_aux_layer_20": 0.208251953125, "loss_aux_layer_21": 0.2119140625, "loss_aux_layer_22": 0.23291015625, "loss_aux_layer_23": 0.27685546875, "loss_aux_layer_3": 0.1160888671875, "loss_aux_layer_4": 0.1171875, "loss_aux_layer_5": 0.119140625, "loss_aux_layer_6": 0.120361328125, "loss_aux_layer_7": 0.1160888671875, "loss_aux_layer_8": 0.11572265625, "loss_aux_layer_9": 0.1146240234375, "step": 510, "total_loss": 0.795025110244751 }, { "epoch": 0.10116808552761829, "grad_norm": 2.9709362983703613, "learning_rate": 5e-05, "llm_loss": 0.6135103702545166, "loss": 3.0781, "loss_aux_layer_0": 0.03424072265625, "loss_aux_layer_1": 0.1107177734375, "loss_aux_layer_10": 0.121826171875, "loss_aux_layer_11": 0.1282958984375, "loss_aux_layer_12": 0.137939453125, "loss_aux_layer_13": 0.148193359375, "loss_aux_layer_14": 0.164306640625, "loss_aux_layer_15": 0.178466796875, "loss_aux_layer_16": 0.1923828125, "loss_aux_layer_17": 0.196533203125, "loss_aux_layer_18": 0.205078125, "loss_aux_layer_19": 0.203369140625, "loss_aux_layer_2": 0.1185302734375, "loss_aux_layer_20": 0.20703125, "loss_aux_layer_21": 0.211181640625, "loss_aux_layer_22": 0.233154296875, "loss_aux_layer_23": 0.2783203125, "loss_aux_layer_3": 0.1259765625, "loss_aux_layer_4": 0.1263427734375, "loss_aux_layer_5": 0.128173828125, "loss_aux_layer_6": 0.1280517578125, "loss_aux_layer_7": 0.12353515625, "loss_aux_layer_8": 0.121826171875, "loss_aux_layer_9": 0.12109375, "step": 511, "total_loss": 0.769537016749382 }, { "epoch": 0.1013660661255197, "grad_norm": 4.372951030731201, "learning_rate": 5e-05, "llm_loss": 0.643200471997261, "loss": 3.2066, "loss_aux_layer_0": 0.03656005859375, "loss_aux_layer_1": 0.115966796875, "loss_aux_layer_10": 0.1251220703125, "loss_aux_layer_11": 0.131591796875, "loss_aux_layer_12": 0.140869140625, "loss_aux_layer_13": 0.150146484375, "loss_aux_layer_14": 0.1669921875, "loss_aux_layer_15": 0.18115234375, "loss_aux_layer_16": 0.194091796875, "loss_aux_layer_17": 0.199462890625, "loss_aux_layer_18": 0.208251953125, "loss_aux_layer_19": 0.2060546875, "loss_aux_layer_2": 0.123291015625, "loss_aux_layer_20": 0.208251953125, "loss_aux_layer_21": 0.21044921875, "loss_aux_layer_22": 0.232421875, "loss_aux_layer_23": 0.27587890625, "loss_aux_layer_3": 0.1280517578125, "loss_aux_layer_4": 0.128662109375, "loss_aux_layer_5": 0.1295166015625, "loss_aux_layer_6": 0.1312255859375, "loss_aux_layer_7": 0.12744140625, "loss_aux_layer_8": 0.125732421875, "loss_aux_layer_9": 0.1246337890625, "step": 512, "total_loss": 0.8016377240419388 }, { "epoch": 0.1015640467234211, "grad_norm": 1.9744454622268677, "learning_rate": 5e-05, "llm_loss": 0.6685879528522491, "loss": 3.2732, "loss_aux_layer_0": 0.0306396484375, "loss_aux_layer_1": 0.10302734375, "loss_aux_layer_10": 0.1141357421875, "loss_aux_layer_11": 0.1204833984375, "loss_aux_layer_12": 0.130615234375, "loss_aux_layer_13": 0.140869140625, "loss_aux_layer_14": 0.158203125, "loss_aux_layer_15": 0.1728515625, "loss_aux_layer_16": 0.1875, "loss_aux_layer_17": 0.19287109375, "loss_aux_layer_18": 0.201171875, "loss_aux_layer_19": 0.200927734375, "loss_aux_layer_2": 0.1075439453125, "loss_aux_layer_20": 0.2041015625, "loss_aux_layer_21": 0.2080078125, "loss_aux_layer_22": 0.22802734375, "loss_aux_layer_23": 0.271484375, "loss_aux_layer_3": 0.11767578125, "loss_aux_layer_4": 0.118896484375, "loss_aux_layer_5": 0.1214599609375, "loss_aux_layer_6": 0.12109375, "loss_aux_layer_7": 0.11474609375, "loss_aux_layer_8": 0.11376953125, "loss_aux_layer_9": 0.1129150390625, "step": 513, "total_loss": 0.8183102011680603 }, { "epoch": 0.10176202732132252, "grad_norm": 2.597687005996704, "learning_rate": 5e-05, "llm_loss": 0.6014354228973389, "loss": 3.0215, "loss_aux_layer_0": 0.038818359375, "loss_aux_layer_1": 0.1112060546875, "loss_aux_layer_10": 0.1195068359375, "loss_aux_layer_11": 0.1260986328125, "loss_aux_layer_12": 0.13525390625, "loss_aux_layer_13": 0.145263671875, "loss_aux_layer_14": 0.161376953125, "loss_aux_layer_15": 0.176025390625, "loss_aux_layer_16": 0.18994140625, "loss_aux_layer_17": 0.1943359375, "loss_aux_layer_18": 0.202392578125, "loss_aux_layer_19": 0.200927734375, "loss_aux_layer_2": 0.1168212890625, "loss_aux_layer_20": 0.205322265625, "loss_aux_layer_21": 0.208984375, "loss_aux_layer_22": 0.2294921875, "loss_aux_layer_23": 0.2724609375, "loss_aux_layer_3": 0.12451171875, "loss_aux_layer_4": 0.125244140625, "loss_aux_layer_5": 0.1263427734375, "loss_aux_layer_6": 0.125732421875, "loss_aux_layer_7": 0.119873046875, "loss_aux_layer_8": 0.11962890625, "loss_aux_layer_9": 0.1192626953125, "step": 514, "total_loss": 0.7553848475217819 }, { "epoch": 0.10196000791922391, "grad_norm": 1.5494763851165771, "learning_rate": 5e-05, "llm_loss": 0.6971534043550491, "loss": 3.3791, "loss_aux_layer_0": 0.03448486328125, "loss_aux_layer_1": 0.1009521484375, "loss_aux_layer_10": 0.1109619140625, "loss_aux_layer_11": 0.1170654296875, "loss_aux_layer_12": 0.126708984375, "loss_aux_layer_13": 0.13818359375, "loss_aux_layer_14": 0.155029296875, "loss_aux_layer_15": 0.170654296875, "loss_aux_layer_16": 0.185302734375, "loss_aux_layer_17": 0.191650390625, "loss_aux_layer_18": 0.20068359375, "loss_aux_layer_19": 0.201171875, "loss_aux_layer_2": 0.104248046875, "loss_aux_layer_20": 0.205810546875, "loss_aux_layer_21": 0.210205078125, "loss_aux_layer_22": 0.2314453125, "loss_aux_layer_23": 0.275390625, "loss_aux_layer_3": 0.111083984375, "loss_aux_layer_4": 0.1121826171875, "loss_aux_layer_5": 0.1142578125, "loss_aux_layer_6": 0.115478515625, "loss_aux_layer_7": 0.111083984375, "loss_aux_layer_8": 0.1107177734375, "loss_aux_layer_9": 0.1094970703125, "step": 515, "total_loss": 0.8447851985692978 }, { "epoch": 0.10215798851712532, "grad_norm": 1.208250641822815, "learning_rate": 5e-05, "llm_loss": 0.6446433514356613, "loss": 3.1745, "loss_aux_layer_0": 0.03118896484375, "loss_aux_layer_1": 0.10791015625, "loss_aux_layer_10": 0.1153564453125, "loss_aux_layer_11": 0.1217041015625, "loss_aux_layer_12": 0.1302490234375, "loss_aux_layer_13": 0.140625, "loss_aux_layer_14": 0.156005859375, "loss_aux_layer_15": 0.17041015625, "loss_aux_layer_16": 0.183837890625, "loss_aux_layer_17": 0.1884765625, "loss_aux_layer_18": 0.197265625, "loss_aux_layer_19": 0.1962890625, "loss_aux_layer_2": 0.109130859375, "loss_aux_layer_20": 0.200439453125, "loss_aux_layer_21": 0.205322265625, "loss_aux_layer_22": 0.224609375, "loss_aux_layer_23": 0.267578125, "loss_aux_layer_3": 0.1181640625, "loss_aux_layer_4": 0.1199951171875, "loss_aux_layer_5": 0.121826171875, "loss_aux_layer_6": 0.1224365234375, "loss_aux_layer_7": 0.1165771484375, "loss_aux_layer_8": 0.115478515625, "loss_aux_layer_9": 0.1142578125, "step": 516, "total_loss": 0.7936363369226456 }, { "epoch": 0.10235596911502673, "grad_norm": 1.5866445302963257, "learning_rate": 5e-05, "llm_loss": 0.6620239317417145, "loss": 3.2589, "loss_aux_layer_0": 0.034759521484375, "loss_aux_layer_1": 0.109130859375, "loss_aux_layer_10": 0.1168212890625, "loss_aux_layer_11": 0.1239013671875, "loss_aux_layer_12": 0.1337890625, "loss_aux_layer_13": 0.144775390625, "loss_aux_layer_14": 0.161376953125, "loss_aux_layer_15": 0.17578125, "loss_aux_layer_16": 0.18994140625, "loss_aux_layer_17": 0.19580078125, "loss_aux_layer_18": 0.2041015625, "loss_aux_layer_19": 0.203369140625, "loss_aux_layer_2": 0.1102294921875, "loss_aux_layer_20": 0.207763671875, "loss_aux_layer_21": 0.2109375, "loss_aux_layer_22": 0.232177734375, "loss_aux_layer_23": 0.27587890625, "loss_aux_layer_3": 0.11865234375, "loss_aux_layer_4": 0.119873046875, "loss_aux_layer_5": 0.1220703125, "loss_aux_layer_6": 0.1229248046875, "loss_aux_layer_7": 0.1177978515625, "loss_aux_layer_8": 0.1171875, "loss_aux_layer_9": 0.11572265625, "step": 517, "total_loss": 0.8147251605987549 }, { "epoch": 0.10255394971292814, "grad_norm": 1.334559440612793, "learning_rate": 5e-05, "llm_loss": 0.5935389250516891, "loss": 2.9767, "loss_aux_layer_0": 0.0333251953125, "loss_aux_layer_1": 0.1099853515625, "loss_aux_layer_10": 0.1168212890625, "loss_aux_layer_11": 0.123291015625, "loss_aux_layer_12": 0.13232421875, "loss_aux_layer_13": 0.142822265625, "loss_aux_layer_14": 0.158447265625, "loss_aux_layer_15": 0.172119140625, "loss_aux_layer_16": 0.184814453125, "loss_aux_layer_17": 0.188720703125, "loss_aux_layer_18": 0.1962890625, "loss_aux_layer_19": 0.196044921875, "loss_aux_layer_2": 0.1131591796875, "loss_aux_layer_20": 0.199462890625, "loss_aux_layer_21": 0.205322265625, "loss_aux_layer_22": 0.22705078125, "loss_aux_layer_23": 0.27197265625, "loss_aux_layer_3": 0.1207275390625, "loss_aux_layer_4": 0.1221923828125, "loss_aux_layer_5": 0.1236572265625, "loss_aux_layer_6": 0.124267578125, "loss_aux_layer_7": 0.1185302734375, "loss_aux_layer_8": 0.1171875, "loss_aux_layer_9": 0.1158447265625, "step": 518, "total_loss": 0.7441808581352234 }, { "epoch": 0.10275193031082953, "grad_norm": 1.472617506980896, "learning_rate": 5e-05, "llm_loss": 0.6866963654756546, "loss": 3.3507, "loss_aux_layer_0": 0.031158447265625, "loss_aux_layer_1": 0.1080322265625, "loss_aux_layer_10": 0.1158447265625, "loss_aux_layer_11": 0.122314453125, "loss_aux_layer_12": 0.1312255859375, "loss_aux_layer_13": 0.14208984375, "loss_aux_layer_14": 0.158447265625, "loss_aux_layer_15": 0.1728515625, "loss_aux_layer_16": 0.1865234375, "loss_aux_layer_17": 0.1923828125, "loss_aux_layer_18": 0.200439453125, "loss_aux_layer_19": 0.2001953125, "loss_aux_layer_2": 0.1099853515625, "loss_aux_layer_20": 0.2041015625, "loss_aux_layer_21": 0.208251953125, "loss_aux_layer_22": 0.23095703125, "loss_aux_layer_23": 0.27392578125, "loss_aux_layer_3": 0.1187744140625, "loss_aux_layer_4": 0.120849609375, "loss_aux_layer_5": 0.1226806640625, "loss_aux_layer_6": 0.1236572265625, "loss_aux_layer_7": 0.1177978515625, "loss_aux_layer_8": 0.1162109375, "loss_aux_layer_9": 0.11474609375, "step": 519, "total_loss": 0.8376666903495789 }, { "epoch": 0.10294991090873094, "grad_norm": 1.65735924243927, "learning_rate": 5e-05, "llm_loss": 0.6784844100475311, "loss": 3.3122, "loss_aux_layer_0": 0.03485107421875, "loss_aux_layer_1": 0.1058349609375, "loss_aux_layer_10": 0.11376953125, "loss_aux_layer_11": 0.120361328125, "loss_aux_layer_12": 0.1295166015625, "loss_aux_layer_13": 0.140380859375, "loss_aux_layer_14": 0.156982421875, "loss_aux_layer_15": 0.1708984375, "loss_aux_layer_16": 0.184814453125, "loss_aux_layer_17": 0.19091796875, "loss_aux_layer_18": 0.19921875, "loss_aux_layer_19": 0.198974609375, "loss_aux_layer_2": 0.107666015625, "loss_aux_layer_20": 0.202880859375, "loss_aux_layer_21": 0.207275390625, "loss_aux_layer_22": 0.2294921875, "loss_aux_layer_23": 0.2734375, "loss_aux_layer_3": 0.1175537109375, "loss_aux_layer_4": 0.119140625, "loss_aux_layer_5": 0.1209716796875, "loss_aux_layer_6": 0.1217041015625, "loss_aux_layer_7": 0.115966796875, "loss_aux_layer_8": 0.114501953125, "loss_aux_layer_9": 0.1126708984375, "step": 520, "total_loss": 0.8280542343854904 }, { "epoch": 0.10314789150663235, "grad_norm": 1.0933574438095093, "learning_rate": 5e-05, "llm_loss": 0.6160976141691208, "loss": 3.0818, "loss_aux_layer_0": 0.03485107421875, "loss_aux_layer_1": 0.11279296875, "loss_aux_layer_10": 0.1190185546875, "loss_aux_layer_11": 0.1259765625, "loss_aux_layer_12": 0.135498046875, "loss_aux_layer_13": 0.146484375, "loss_aux_layer_14": 0.1630859375, "loss_aux_layer_15": 0.1767578125, "loss_aux_layer_16": 0.190185546875, "loss_aux_layer_17": 0.195068359375, "loss_aux_layer_18": 0.20361328125, "loss_aux_layer_19": 0.201171875, "loss_aux_layer_2": 0.1153564453125, "loss_aux_layer_20": 0.20458984375, "loss_aux_layer_21": 0.20947265625, "loss_aux_layer_22": 0.232177734375, "loss_aux_layer_23": 0.27685546875, "loss_aux_layer_3": 0.123046875, "loss_aux_layer_4": 0.1246337890625, "loss_aux_layer_5": 0.126220703125, "loss_aux_layer_6": 0.127197265625, "loss_aux_layer_7": 0.121826171875, "loss_aux_layer_8": 0.1201171875, "loss_aux_layer_9": 0.1181640625, "step": 521, "total_loss": 0.7704547345638275 }, { "epoch": 0.10334587210453376, "grad_norm": 1.6257182359695435, "learning_rate": 5e-05, "llm_loss": 0.5899024903774261, "loss": 2.9575, "loss_aux_layer_0": 0.034942626953125, "loss_aux_layer_1": 0.1064453125, "loss_aux_layer_10": 0.1136474609375, "loss_aux_layer_11": 0.1201171875, "loss_aux_layer_12": 0.1292724609375, "loss_aux_layer_13": 0.139892578125, "loss_aux_layer_14": 0.156005859375, "loss_aux_layer_15": 0.1708984375, "loss_aux_layer_16": 0.18505859375, "loss_aux_layer_17": 0.190185546875, "loss_aux_layer_18": 0.19873046875, "loss_aux_layer_19": 0.19873046875, "loss_aux_layer_2": 0.1082763671875, "loss_aux_layer_20": 0.202880859375, "loss_aux_layer_21": 0.2080078125, "loss_aux_layer_22": 0.230712890625, "loss_aux_layer_23": 0.2763671875, "loss_aux_layer_3": 0.1168212890625, "loss_aux_layer_4": 0.117919921875, "loss_aux_layer_5": 0.1197509765625, "loss_aux_layer_6": 0.1212158203125, "loss_aux_layer_7": 0.1153564453125, "loss_aux_layer_8": 0.11376953125, "loss_aux_layer_9": 0.1126708984375, "step": 522, "total_loss": 0.7393818795681 }, { "epoch": 0.10354385270243516, "grad_norm": 0.9342986345291138, "learning_rate": 5e-05, "llm_loss": 0.5680011808872223, "loss": 2.8606, "loss_aux_layer_0": 0.0303955078125, "loss_aux_layer_1": 0.1014404296875, "loss_aux_layer_10": 0.1109619140625, "loss_aux_layer_11": 0.117431640625, "loss_aux_layer_12": 0.126953125, "loss_aux_layer_13": 0.13818359375, "loss_aux_layer_14": 0.15576171875, "loss_aux_layer_15": 0.171142578125, "loss_aux_layer_16": 0.18603515625, "loss_aux_layer_17": 0.191162109375, "loss_aux_layer_18": 0.19970703125, "loss_aux_layer_19": 0.198974609375, "loss_aux_layer_2": 0.1046142578125, "loss_aux_layer_20": 0.203125, "loss_aux_layer_21": 0.2080078125, "loss_aux_layer_22": 0.22802734375, "loss_aux_layer_23": 0.27197265625, "loss_aux_layer_3": 0.1114501953125, "loss_aux_layer_4": 0.1129150390625, "loss_aux_layer_5": 0.11474609375, "loss_aux_layer_6": 0.1163330078125, "loss_aux_layer_7": 0.111328125, "loss_aux_layer_8": 0.11083984375, "loss_aux_layer_9": 0.1097412109375, "step": 523, "total_loss": 0.7151384800672531 }, { "epoch": 0.10374183330033657, "grad_norm": 1.1879297494888306, "learning_rate": 5e-05, "llm_loss": 0.5774187296628952, "loss": 2.9148, "loss_aux_layer_0": 0.031585693359375, "loss_aux_layer_1": 0.10595703125, "loss_aux_layer_10": 0.11572265625, "loss_aux_layer_11": 0.1221923828125, "loss_aux_layer_12": 0.1318359375, "loss_aux_layer_13": 0.14208984375, "loss_aux_layer_14": 0.158447265625, "loss_aux_layer_15": 0.172607421875, "loss_aux_layer_16": 0.18701171875, "loss_aux_layer_17": 0.1923828125, "loss_aux_layer_18": 0.20068359375, "loss_aux_layer_19": 0.199951171875, "loss_aux_layer_2": 0.110107421875, "loss_aux_layer_20": 0.205078125, "loss_aux_layer_21": 0.210205078125, "loss_aux_layer_22": 0.232177734375, "loss_aux_layer_23": 0.2763671875, "loss_aux_layer_3": 0.11865234375, "loss_aux_layer_4": 0.120361328125, "loss_aux_layer_5": 0.12255859375, "loss_aux_layer_6": 0.1240234375, "loss_aux_layer_7": 0.118408203125, "loss_aux_layer_8": 0.1168212890625, "loss_aux_layer_9": 0.115478515625, "step": 524, "total_loss": 0.7286965996026993 }, { "epoch": 0.10393981389823798, "grad_norm": 1.2554248571395874, "learning_rate": 5e-05, "llm_loss": 0.5514357388019562, "loss": 2.8104, "loss_aux_layer_0": 0.03118896484375, "loss_aux_layer_1": 0.1077880859375, "loss_aux_layer_10": 0.1163330078125, "loss_aux_layer_11": 0.12255859375, "loss_aux_layer_12": 0.1312255859375, "loss_aux_layer_13": 0.141357421875, "loss_aux_layer_14": 0.1572265625, "loss_aux_layer_15": 0.172119140625, "loss_aux_layer_16": 0.185302734375, "loss_aux_layer_17": 0.189208984375, "loss_aux_layer_18": 0.198974609375, "loss_aux_layer_19": 0.1982421875, "loss_aux_layer_2": 0.11181640625, "loss_aux_layer_20": 0.202392578125, "loss_aux_layer_21": 0.208984375, "loss_aux_layer_22": 0.231201171875, "loss_aux_layer_23": 0.27734375, "loss_aux_layer_3": 0.1204833984375, "loss_aux_layer_4": 0.1217041015625, "loss_aux_layer_5": 0.1239013671875, "loss_aux_layer_6": 0.1248779296875, "loss_aux_layer_7": 0.1197509765625, "loss_aux_layer_8": 0.1175537109375, "loss_aux_layer_9": 0.1158447265625, "step": 525, "total_loss": 0.7026002705097198 }, { "epoch": 0.10413779449613937, "grad_norm": 1.7690606117248535, "learning_rate": 5e-05, "llm_loss": 0.5897373408079147, "loss": 2.9521, "loss_aux_layer_0": 0.0316162109375, "loss_aux_layer_1": 0.102294921875, "loss_aux_layer_10": 0.11328125, "loss_aux_layer_11": 0.11962890625, "loss_aux_layer_12": 0.1292724609375, "loss_aux_layer_13": 0.140380859375, "loss_aux_layer_14": 0.156982421875, "loss_aux_layer_15": 0.171875, "loss_aux_layer_16": 0.185546875, "loss_aux_layer_17": 0.1904296875, "loss_aux_layer_18": 0.1982421875, "loss_aux_layer_19": 0.197265625, "loss_aux_layer_2": 0.10595703125, "loss_aux_layer_20": 0.201904296875, "loss_aux_layer_21": 0.206298828125, "loss_aux_layer_22": 0.228759765625, "loss_aux_layer_23": 0.2734375, "loss_aux_layer_3": 0.1138916015625, "loss_aux_layer_4": 0.11572265625, "loss_aux_layer_5": 0.117919921875, "loss_aux_layer_6": 0.1192626953125, "loss_aux_layer_7": 0.11376953125, "loss_aux_layer_8": 0.113525390625, "loss_aux_layer_9": 0.112060546875, "step": 526, "total_loss": 0.7380254566669464 }, { "epoch": 0.10433577509404078, "grad_norm": 1.3177528381347656, "learning_rate": 5e-05, "llm_loss": 0.6721942126750946, "loss": 3.2772, "loss_aux_layer_0": 0.031768798828125, "loss_aux_layer_1": 0.1016845703125, "loss_aux_layer_10": 0.1109619140625, "loss_aux_layer_11": 0.1171875, "loss_aux_layer_12": 0.1265869140625, "loss_aux_layer_13": 0.1376953125, "loss_aux_layer_14": 0.155029296875, "loss_aux_layer_15": 0.170654296875, "loss_aux_layer_16": 0.185546875, "loss_aux_layer_17": 0.191162109375, "loss_aux_layer_18": 0.2001953125, "loss_aux_layer_19": 0.199951171875, "loss_aux_layer_2": 0.1044921875, "loss_aux_layer_20": 0.2041015625, "loss_aux_layer_21": 0.2080078125, "loss_aux_layer_22": 0.227783203125, "loss_aux_layer_23": 0.27099609375, "loss_aux_layer_3": 0.111572265625, "loss_aux_layer_4": 0.11279296875, "loss_aux_layer_5": 0.1148681640625, "loss_aux_layer_6": 0.1162109375, "loss_aux_layer_7": 0.111083984375, "loss_aux_layer_8": 0.1103515625, "loss_aux_layer_9": 0.1094970703125, "step": 527, "total_loss": 0.8193067759275436 }, { "epoch": 0.10453375569194219, "grad_norm": 1.701084852218628, "learning_rate": 5e-05, "llm_loss": 0.6853243559598923, "loss": 3.3233, "loss_aux_layer_0": 0.032257080078125, "loss_aux_layer_1": 0.1024169921875, "loss_aux_layer_10": 0.1087646484375, "loss_aux_layer_11": 0.1151123046875, "loss_aux_layer_12": 0.1251220703125, "loss_aux_layer_13": 0.1357421875, "loss_aux_layer_14": 0.15234375, "loss_aux_layer_15": 0.16748046875, "loss_aux_layer_16": 0.181396484375, "loss_aux_layer_17": 0.188232421875, "loss_aux_layer_18": 0.197998046875, "loss_aux_layer_19": 0.197998046875, "loss_aux_layer_2": 0.1024169921875, "loss_aux_layer_20": 0.203125, "loss_aux_layer_21": 0.206787109375, "loss_aux_layer_22": 0.227783203125, "loss_aux_layer_23": 0.27099609375, "loss_aux_layer_3": 0.1099853515625, "loss_aux_layer_4": 0.1114501953125, "loss_aux_layer_5": 0.113037109375, "loss_aux_layer_6": 0.1153564453125, "loss_aux_layer_7": 0.1099853515625, "loss_aux_layer_8": 0.1087646484375, "loss_aux_layer_9": 0.107666015625, "step": 528, "total_loss": 0.8308134377002716 }, { "epoch": 0.1047317362898436, "grad_norm": 0.9093453288078308, "learning_rate": 5e-05, "llm_loss": 0.6810428947210312, "loss": 3.3012, "loss_aux_layer_0": 0.032562255859375, "loss_aux_layer_1": 0.1019287109375, "loss_aux_layer_10": 0.1080322265625, "loss_aux_layer_11": 0.114501953125, "loss_aux_layer_12": 0.123291015625, "loss_aux_layer_13": 0.1328125, "loss_aux_layer_14": 0.149169921875, "loss_aux_layer_15": 0.16455078125, "loss_aux_layer_16": 0.179443359375, "loss_aux_layer_17": 0.185302734375, "loss_aux_layer_18": 0.194580078125, "loss_aux_layer_19": 0.1953125, "loss_aux_layer_2": 0.1029052734375, "loss_aux_layer_20": 0.2001953125, "loss_aux_layer_21": 0.204345703125, "loss_aux_layer_22": 0.22509765625, "loss_aux_layer_23": 0.26953125, "loss_aux_layer_3": 0.1103515625, "loss_aux_layer_4": 0.1119384765625, "loss_aux_layer_5": 0.1138916015625, "loss_aux_layer_6": 0.1156005859375, "loss_aux_layer_7": 0.1104736328125, "loss_aux_layer_8": 0.1090087890625, "loss_aux_layer_9": 0.107666015625, "step": 529, "total_loss": 0.8253064900636673 }, { "epoch": 0.104929716887745, "grad_norm": 2.0964865684509277, "learning_rate": 5e-05, "llm_loss": 0.641674816608429, "loss": 3.1804, "loss_aux_layer_0": 0.033111572265625, "loss_aux_layer_1": 0.107421875, "loss_aux_layer_10": 0.1181640625, "loss_aux_layer_11": 0.1246337890625, "loss_aux_layer_12": 0.134765625, "loss_aux_layer_13": 0.1455078125, "loss_aux_layer_14": 0.16259765625, "loss_aux_layer_15": 0.177001953125, "loss_aux_layer_16": 0.190673828125, "loss_aux_layer_17": 0.196044921875, "loss_aux_layer_18": 0.205078125, "loss_aux_layer_19": 0.20458984375, "loss_aux_layer_2": 0.1090087890625, "loss_aux_layer_20": 0.20849609375, "loss_aux_layer_21": 0.21240234375, "loss_aux_layer_22": 0.23388671875, "loss_aux_layer_23": 0.2802734375, "loss_aux_layer_3": 0.11767578125, "loss_aux_layer_4": 0.1201171875, "loss_aux_layer_5": 0.122314453125, "loss_aux_layer_6": 0.1243896484375, "loss_aux_layer_7": 0.119140625, "loss_aux_layer_8": 0.118408203125, "loss_aux_layer_9": 0.1171875, "step": 530, "total_loss": 0.7951011657714844 }, { "epoch": 0.1051276974856464, "grad_norm": 1.883349061012268, "learning_rate": 5e-05, "llm_loss": 0.7260063737630844, "loss": 3.494, "loss_aux_layer_0": 0.031768798828125, "loss_aux_layer_1": 0.1068115234375, "loss_aux_layer_10": 0.11376953125, "loss_aux_layer_11": 0.119873046875, "loss_aux_layer_12": 0.129150390625, "loss_aux_layer_13": 0.13916015625, "loss_aux_layer_14": 0.1552734375, "loss_aux_layer_15": 0.16943359375, "loss_aux_layer_16": 0.18310546875, "loss_aux_layer_17": 0.18798828125, "loss_aux_layer_18": 0.19580078125, "loss_aux_layer_19": 0.1943359375, "loss_aux_layer_2": 0.106689453125, "loss_aux_layer_20": 0.197998046875, "loss_aux_layer_21": 0.20166015625, "loss_aux_layer_22": 0.222412109375, "loss_aux_layer_23": 0.26416015625, "loss_aux_layer_3": 0.1168212890625, "loss_aux_layer_4": 0.11865234375, "loss_aux_layer_5": 0.1202392578125, "loss_aux_layer_6": 0.1219482421875, "loss_aux_layer_7": 0.116455078125, "loss_aux_layer_8": 0.1142578125, "loss_aux_layer_9": 0.1126708984375, "step": 531, "total_loss": 0.8734992444515228 }, { "epoch": 0.10532567808354781, "grad_norm": 1.544776439666748, "learning_rate": 5e-05, "llm_loss": 0.6767553091049194, "loss": 3.3189, "loss_aux_layer_0": 0.0325927734375, "loss_aux_layer_1": 0.1126708984375, "loss_aux_layer_10": 0.1181640625, "loss_aux_layer_11": 0.1246337890625, "loss_aux_layer_12": 0.13427734375, "loss_aux_layer_13": 0.144287109375, "loss_aux_layer_14": 0.16015625, "loss_aux_layer_15": 0.174560546875, "loss_aux_layer_16": 0.187744140625, "loss_aux_layer_17": 0.193603515625, "loss_aux_layer_18": 0.20263671875, "loss_aux_layer_19": 0.201416015625, "loss_aux_layer_2": 0.1129150390625, "loss_aux_layer_20": 0.205810546875, "loss_aux_layer_21": 0.2099609375, "loss_aux_layer_22": 0.231689453125, "loss_aux_layer_23": 0.2734375, "loss_aux_layer_3": 0.120849609375, "loss_aux_layer_4": 0.1224365234375, "loss_aux_layer_5": 0.1240234375, "loss_aux_layer_6": 0.126220703125, "loss_aux_layer_7": 0.12060546875, "loss_aux_layer_8": 0.119140625, "loss_aux_layer_9": 0.117431640625, "step": 532, "total_loss": 0.8297310173511505 }, { "epoch": 0.10552365868144922, "grad_norm": 1.7467619180679321, "learning_rate": 5e-05, "llm_loss": 0.6135238707065582, "loss": 3.0475, "loss_aux_layer_0": 0.030548095703125, "loss_aux_layer_1": 0.107421875, "loss_aux_layer_10": 0.11279296875, "loss_aux_layer_11": 0.1190185546875, "loss_aux_layer_12": 0.128662109375, "loss_aux_layer_13": 0.13916015625, "loss_aux_layer_14": 0.1552734375, "loss_aux_layer_15": 0.169677734375, "loss_aux_layer_16": 0.183837890625, "loss_aux_layer_17": 0.189208984375, "loss_aux_layer_18": 0.19775390625, "loss_aux_layer_19": 0.197265625, "loss_aux_layer_2": 0.1083984375, "loss_aux_layer_20": 0.201171875, "loss_aux_layer_21": 0.20556640625, "loss_aux_layer_22": 0.2275390625, "loss_aux_layer_23": 0.271484375, "loss_aux_layer_3": 0.1168212890625, "loss_aux_layer_4": 0.117919921875, "loss_aux_layer_5": 0.119384765625, "loss_aux_layer_6": 0.1212158203125, "loss_aux_layer_7": 0.1151123046875, "loss_aux_layer_8": 0.1136474609375, "loss_aux_layer_9": 0.111572265625, "step": 533, "total_loss": 0.7618769854307175 }, { "epoch": 0.10572163927935062, "grad_norm": 3.006037950515747, "learning_rate": 5e-05, "llm_loss": 0.6128381788730621, "loss": 3.0449, "loss_aux_layer_0": 0.031463623046875, "loss_aux_layer_1": 0.1019287109375, "loss_aux_layer_10": 0.113525390625, "loss_aux_layer_11": 0.11865234375, "loss_aux_layer_12": 0.1282958984375, "loss_aux_layer_13": 0.1396484375, "loss_aux_layer_14": 0.156982421875, "loss_aux_layer_15": 0.172119140625, "loss_aux_layer_16": 0.186279296875, "loss_aux_layer_17": 0.19189453125, "loss_aux_layer_18": 0.19970703125, "loss_aux_layer_19": 0.199951171875, "loss_aux_layer_2": 0.103515625, "loss_aux_layer_20": 0.2041015625, "loss_aux_layer_21": 0.209716796875, "loss_aux_layer_22": 0.23095703125, "loss_aux_layer_23": 0.2763671875, "loss_aux_layer_3": 0.11181640625, "loss_aux_layer_4": 0.113525390625, "loss_aux_layer_5": 0.1153564453125, "loss_aux_layer_6": 0.1177978515625, "loss_aux_layer_7": 0.1131591796875, "loss_aux_layer_8": 0.11279296875, "loss_aux_layer_9": 0.112060546875, "step": 534, "total_loss": 0.761227086186409 }, { "epoch": 0.10591961987725203, "grad_norm": 1.536629557609558, "learning_rate": 5e-05, "llm_loss": 0.6668670028448105, "loss": 3.2576, "loss_aux_layer_0": 0.034271240234375, "loss_aux_layer_1": 0.1051025390625, "loss_aux_layer_10": 0.112060546875, "loss_aux_layer_11": 0.1185302734375, "loss_aux_layer_12": 0.1275634765625, "loss_aux_layer_13": 0.1376953125, "loss_aux_layer_14": 0.154052734375, "loss_aux_layer_15": 0.168701171875, "loss_aux_layer_16": 0.182861328125, "loss_aux_layer_17": 0.18798828125, "loss_aux_layer_18": 0.1953125, "loss_aux_layer_19": 0.196044921875, "loss_aux_layer_2": 0.1063232421875, "loss_aux_layer_20": 0.201171875, "loss_aux_layer_21": 0.2060546875, "loss_aux_layer_22": 0.227783203125, "loss_aux_layer_23": 0.27197265625, "loss_aux_layer_3": 0.1146240234375, "loss_aux_layer_4": 0.1162109375, "loss_aux_layer_5": 0.117919921875, "loss_aux_layer_6": 0.1201171875, "loss_aux_layer_7": 0.11474609375, "loss_aux_layer_8": 0.113037109375, "loss_aux_layer_9": 0.1112060546875, "step": 535, "total_loss": 0.8144075572490692 }, { "epoch": 0.10611760047515344, "grad_norm": 1.8912705183029175, "learning_rate": 5e-05, "llm_loss": 0.6843994110822678, "loss": 3.343, "loss_aux_layer_0": 0.034423828125, "loss_aux_layer_1": 0.108154296875, "loss_aux_layer_10": 0.115966796875, "loss_aux_layer_11": 0.1221923828125, "loss_aux_layer_12": 0.13232421875, "loss_aux_layer_13": 0.14208984375, "loss_aux_layer_14": 0.158935546875, "loss_aux_layer_15": 0.174072265625, "loss_aux_layer_16": 0.1875, "loss_aux_layer_17": 0.193359375, "loss_aux_layer_18": 0.201416015625, "loss_aux_layer_19": 0.201171875, "loss_aux_layer_2": 0.10986328125, "loss_aux_layer_20": 0.204833984375, "loss_aux_layer_21": 0.208984375, "loss_aux_layer_22": 0.230712890625, "loss_aux_layer_23": 0.27490234375, "loss_aux_layer_3": 0.1182861328125, "loss_aux_layer_4": 0.1199951171875, "loss_aux_layer_5": 0.1212158203125, "loss_aux_layer_6": 0.123291015625, "loss_aux_layer_7": 0.11767578125, "loss_aux_layer_8": 0.116455078125, "loss_aux_layer_9": 0.1148681640625, "step": 536, "total_loss": 0.8357421606779099 }, { "epoch": 0.10631558107305485, "grad_norm": 1.1736823320388794, "learning_rate": 5e-05, "llm_loss": 0.6644794344902039, "loss": 3.2493, "loss_aux_layer_0": 0.029937744140625, "loss_aux_layer_1": 0.1058349609375, "loss_aux_layer_10": 0.11279296875, "loss_aux_layer_11": 0.1190185546875, "loss_aux_layer_12": 0.128173828125, "loss_aux_layer_13": 0.138427734375, "loss_aux_layer_14": 0.154052734375, "loss_aux_layer_15": 0.1689453125, "loss_aux_layer_16": 0.182861328125, "loss_aux_layer_17": 0.188720703125, "loss_aux_layer_18": 0.197509765625, "loss_aux_layer_19": 0.19677734375, "loss_aux_layer_2": 0.107177734375, "loss_aux_layer_20": 0.201171875, "loss_aux_layer_21": 0.2060546875, "loss_aux_layer_22": 0.226806640625, "loss_aux_layer_23": 0.27099609375, "loss_aux_layer_3": 0.1148681640625, "loss_aux_layer_4": 0.116943359375, "loss_aux_layer_5": 0.118896484375, "loss_aux_layer_6": 0.1209716796875, "loss_aux_layer_7": 0.115478515625, "loss_aux_layer_8": 0.114013671875, "loss_aux_layer_9": 0.112060546875, "step": 537, "total_loss": 0.8123356699943542 }, { "epoch": 0.10651356167095624, "grad_norm": 1.7240508794784546, "learning_rate": 5e-05, "llm_loss": 0.6997847855091095, "loss": 3.3962, "loss_aux_layer_0": 0.03143310546875, "loss_aux_layer_1": 0.106201171875, "loss_aux_layer_10": 0.1129150390625, "loss_aux_layer_11": 0.1195068359375, "loss_aux_layer_12": 0.12939453125, "loss_aux_layer_13": 0.140625, "loss_aux_layer_14": 0.157470703125, "loss_aux_layer_15": 0.172119140625, "loss_aux_layer_16": 0.186279296875, "loss_aux_layer_17": 0.191650390625, "loss_aux_layer_18": 0.2001953125, "loss_aux_layer_19": 0.199462890625, "loss_aux_layer_2": 0.10791015625, "loss_aux_layer_20": 0.20361328125, "loss_aux_layer_21": 0.20751953125, "loss_aux_layer_22": 0.22900390625, "loss_aux_layer_23": 0.2724609375, "loss_aux_layer_3": 0.1156005859375, "loss_aux_layer_4": 0.1175537109375, "loss_aux_layer_5": 0.119140625, "loss_aux_layer_6": 0.121337890625, "loss_aux_layer_7": 0.1156005859375, "loss_aux_layer_8": 0.1138916015625, "loss_aux_layer_9": 0.112060546875, "step": 538, "total_loss": 0.8490443676710129 }, { "epoch": 0.10671154226885765, "grad_norm": 1.5273727178573608, "learning_rate": 5e-05, "llm_loss": 0.6166246086359024, "loss": 3.0573, "loss_aux_layer_0": 0.03118896484375, "loss_aux_layer_1": 0.1070556640625, "loss_aux_layer_10": 0.112060546875, "loss_aux_layer_11": 0.11865234375, "loss_aux_layer_12": 0.12744140625, "loss_aux_layer_13": 0.137939453125, "loss_aux_layer_14": 0.154541015625, "loss_aux_layer_15": 0.1689453125, "loss_aux_layer_16": 0.182861328125, "loss_aux_layer_17": 0.187744140625, "loss_aux_layer_18": 0.19677734375, "loss_aux_layer_19": 0.19677734375, "loss_aux_layer_2": 0.1075439453125, "loss_aux_layer_20": 0.201416015625, "loss_aux_layer_21": 0.205810546875, "loss_aux_layer_22": 0.2265625, "loss_aux_layer_23": 0.26953125, "loss_aux_layer_3": 0.115966796875, "loss_aux_layer_4": 0.1173095703125, "loss_aux_layer_5": 0.1187744140625, "loss_aux_layer_6": 0.1202392578125, "loss_aux_layer_7": 0.1151123046875, "loss_aux_layer_8": 0.1131591796875, "loss_aux_layer_9": 0.111328125, "step": 539, "total_loss": 0.764314740896225 }, { "epoch": 0.10690952286675906, "grad_norm": 0.8849876523017883, "learning_rate": 5e-05, "llm_loss": 0.6816595941781998, "loss": 3.324, "loss_aux_layer_0": 0.031951904296875, "loss_aux_layer_1": 0.110107421875, "loss_aux_layer_10": 0.1148681640625, "loss_aux_layer_11": 0.1214599609375, "loss_aux_layer_12": 0.13037109375, "loss_aux_layer_13": 0.140625, "loss_aux_layer_14": 0.156494140625, "loss_aux_layer_15": 0.1708984375, "loss_aux_layer_16": 0.18408203125, "loss_aux_layer_17": 0.188720703125, "loss_aux_layer_18": 0.197021484375, "loss_aux_layer_19": 0.196533203125, "loss_aux_layer_2": 0.1107177734375, "loss_aux_layer_20": 0.199951171875, "loss_aux_layer_21": 0.202880859375, "loss_aux_layer_22": 0.224609375, "loss_aux_layer_23": 0.2685546875, "loss_aux_layer_3": 0.118896484375, "loss_aux_layer_4": 0.1204833984375, "loss_aux_layer_5": 0.12158203125, "loss_aux_layer_6": 0.12353515625, "loss_aux_layer_7": 0.117919921875, "loss_aux_layer_8": 0.1162109375, "loss_aux_layer_9": 0.1143798828125, "step": 540, "total_loss": 0.8309949785470963 }, { "epoch": 0.10710750346466047, "grad_norm": 1.4031307697296143, "learning_rate": 5e-05, "llm_loss": 0.6625590473413467, "loss": 3.2124, "loss_aux_layer_0": 0.031005859375, "loss_aux_layer_1": 0.0970458984375, "loss_aux_layer_10": 0.103271484375, "loss_aux_layer_11": 0.1092529296875, "loss_aux_layer_12": 0.11865234375, "loss_aux_layer_13": 0.1292724609375, "loss_aux_layer_14": 0.146240234375, "loss_aux_layer_15": 0.162109375, "loss_aux_layer_16": 0.1767578125, "loss_aux_layer_17": 0.1826171875, "loss_aux_layer_18": 0.191650390625, "loss_aux_layer_19": 0.192138671875, "loss_aux_layer_2": 0.0985107421875, "loss_aux_layer_20": 0.197509765625, "loss_aux_layer_21": 0.20263671875, "loss_aux_layer_22": 0.22265625, "loss_aux_layer_23": 0.26611328125, "loss_aux_layer_3": 0.10595703125, "loss_aux_layer_4": 0.1072998046875, "loss_aux_layer_5": 0.1094970703125, "loss_aux_layer_6": 0.1107177734375, "loss_aux_layer_7": 0.10546875, "loss_aux_layer_8": 0.1038818359375, "loss_aux_layer_9": 0.1026611328125, "step": 541, "total_loss": 0.8031104803085327 }, { "epoch": 0.10730548406256186, "grad_norm": 0.9151356220245361, "learning_rate": 5e-05, "llm_loss": 0.6315647214651108, "loss": 3.1155, "loss_aux_layer_0": 0.03173828125, "loss_aux_layer_1": 0.1060791015625, "loss_aux_layer_10": 0.111328125, "loss_aux_layer_11": 0.1177978515625, "loss_aux_layer_12": 0.1275634765625, "loss_aux_layer_13": 0.137939453125, "loss_aux_layer_14": 0.15478515625, "loss_aux_layer_15": 0.169677734375, "loss_aux_layer_16": 0.183349609375, "loss_aux_layer_17": 0.188720703125, "loss_aux_layer_18": 0.197021484375, "loss_aux_layer_19": 0.196044921875, "loss_aux_layer_2": 0.1070556640625, "loss_aux_layer_20": 0.200439453125, "loss_aux_layer_21": 0.205078125, "loss_aux_layer_22": 0.227294921875, "loss_aux_layer_23": 0.27197265625, "loss_aux_layer_3": 0.1136474609375, "loss_aux_layer_4": 0.1151123046875, "loss_aux_layer_5": 0.1168212890625, "loss_aux_layer_6": 0.118896484375, "loss_aux_layer_7": 0.113525390625, "loss_aux_layer_8": 0.1123046875, "loss_aux_layer_9": 0.1109619140625, "step": 542, "total_loss": 0.7788837254047394 }, { "epoch": 0.10750346466046327, "grad_norm": 1.785211443901062, "learning_rate": 5e-05, "llm_loss": 0.6968407183885574, "loss": 3.3483, "loss_aux_layer_0": 0.032867431640625, "loss_aux_layer_1": 0.097412109375, "loss_aux_layer_10": 0.104736328125, "loss_aux_layer_11": 0.1107177734375, "loss_aux_layer_12": 0.1202392578125, "loss_aux_layer_13": 0.1307373046875, "loss_aux_layer_14": 0.147705078125, "loss_aux_layer_15": 0.16259765625, "loss_aux_layer_16": 0.17724609375, "loss_aux_layer_17": 0.182373046875, "loss_aux_layer_18": 0.190673828125, "loss_aux_layer_19": 0.190673828125, "loss_aux_layer_2": 0.097412109375, "loss_aux_layer_20": 0.19482421875, "loss_aux_layer_21": 0.1982421875, "loss_aux_layer_22": 0.218994140625, "loss_aux_layer_23": 0.26318359375, "loss_aux_layer_3": 0.1053466796875, "loss_aux_layer_4": 0.1068115234375, "loss_aux_layer_5": 0.108642578125, "loss_aux_layer_6": 0.1104736328125, "loss_aux_layer_7": 0.10595703125, "loss_aux_layer_8": 0.1046142578125, "loss_aux_layer_9": 0.103515625, "step": 543, "total_loss": 0.8370753675699234 }, { "epoch": 0.10770144525836468, "grad_norm": 1.0959182977676392, "learning_rate": 5e-05, "llm_loss": 0.6177248805761337, "loss": 3.062, "loss_aux_layer_0": 0.03179931640625, "loss_aux_layer_1": 0.1053466796875, "loss_aux_layer_10": 0.1131591796875, "loss_aux_layer_11": 0.1199951171875, "loss_aux_layer_12": 0.1295166015625, "loss_aux_layer_13": 0.1396484375, "loss_aux_layer_14": 0.155517578125, "loss_aux_layer_15": 0.170166015625, "loss_aux_layer_16": 0.18310546875, "loss_aux_layer_17": 0.18798828125, "loss_aux_layer_18": 0.19580078125, "loss_aux_layer_19": 0.1953125, "loss_aux_layer_2": 0.107177734375, "loss_aux_layer_20": 0.198974609375, "loss_aux_layer_21": 0.203369140625, "loss_aux_layer_22": 0.224365234375, "loss_aux_layer_23": 0.26904296875, "loss_aux_layer_3": 0.1156005859375, "loss_aux_layer_4": 0.1173095703125, "loss_aux_layer_5": 0.11962890625, "loss_aux_layer_6": 0.12109375, "loss_aux_layer_7": 0.115966796875, "loss_aux_layer_8": 0.1141357421875, "loss_aux_layer_9": 0.1126708984375, "step": 544, "total_loss": 0.7655018121004105 }, { "epoch": 0.10789942585626608, "grad_norm": 1.977233648300171, "learning_rate": 5e-05, "llm_loss": 0.6184408068656921, "loss": 3.0553, "loss_aux_layer_0": 0.03131103515625, "loss_aux_layer_1": 0.1031494140625, "loss_aux_layer_10": 0.10986328125, "loss_aux_layer_11": 0.11572265625, "loss_aux_layer_12": 0.1246337890625, "loss_aux_layer_13": 0.1357421875, "loss_aux_layer_14": 0.15185546875, "loss_aux_layer_15": 0.166748046875, "loss_aux_layer_16": 0.180908203125, "loss_aux_layer_17": 0.1865234375, "loss_aux_layer_18": 0.19580078125, "loss_aux_layer_19": 0.195556640625, "loss_aux_layer_2": 0.10400390625, "loss_aux_layer_20": 0.2001953125, "loss_aux_layer_21": 0.204345703125, "loss_aux_layer_22": 0.2255859375, "loss_aux_layer_23": 0.2685546875, "loss_aux_layer_3": 0.112060546875, "loss_aux_layer_4": 0.1134033203125, "loss_aux_layer_5": 0.114990234375, "loss_aux_layer_6": 0.116943359375, "loss_aux_layer_7": 0.1116943359375, "loss_aux_layer_8": 0.1103515625, "loss_aux_layer_9": 0.10888671875, "step": 545, "total_loss": 0.763823077082634 }, { "epoch": 0.10809740645416749, "grad_norm": 2.541248083114624, "learning_rate": 5e-05, "llm_loss": 0.6745554059743881, "loss": 3.2972, "loss_aux_layer_0": 0.03466796875, "loss_aux_layer_1": 0.10888671875, "loss_aux_layer_10": 0.1141357421875, "loss_aux_layer_11": 0.1212158203125, "loss_aux_layer_12": 0.131103515625, "loss_aux_layer_13": 0.14208984375, "loss_aux_layer_14": 0.157958984375, "loss_aux_layer_15": 0.17236328125, "loss_aux_layer_16": 0.186279296875, "loss_aux_layer_17": 0.190185546875, "loss_aux_layer_18": 0.198486328125, "loss_aux_layer_19": 0.197021484375, "loss_aux_layer_2": 0.108642578125, "loss_aux_layer_20": 0.201171875, "loss_aux_layer_21": 0.205078125, "loss_aux_layer_22": 0.2265625, "loss_aux_layer_23": 0.27099609375, "loss_aux_layer_3": 0.1177978515625, "loss_aux_layer_4": 0.1192626953125, "loss_aux_layer_5": 0.120849609375, "loss_aux_layer_6": 0.122802734375, "loss_aux_layer_7": 0.117431640625, "loss_aux_layer_8": 0.1151123046875, "loss_aux_layer_9": 0.1136474609375, "step": 546, "total_loss": 0.8243113458156586 }, { "epoch": 0.1082953870520689, "grad_norm": 2.5007903575897217, "learning_rate": 5e-05, "llm_loss": 0.6030402481555939, "loss": 3.0098, "loss_aux_layer_0": 0.0321044921875, "loss_aux_layer_1": 0.109375, "loss_aux_layer_10": 0.1142578125, "loss_aux_layer_11": 0.1207275390625, "loss_aux_layer_12": 0.13037109375, "loss_aux_layer_13": 0.140380859375, "loss_aux_layer_14": 0.15673828125, "loss_aux_layer_15": 0.170654296875, "loss_aux_layer_16": 0.183349609375, "loss_aux_layer_17": 0.189208984375, "loss_aux_layer_18": 0.197998046875, "loss_aux_layer_19": 0.1962890625, "loss_aux_layer_2": 0.1097412109375, "loss_aux_layer_20": 0.20068359375, "loss_aux_layer_21": 0.2041015625, "loss_aux_layer_22": 0.2255859375, "loss_aux_layer_23": 0.26806640625, "loss_aux_layer_3": 0.1195068359375, "loss_aux_layer_4": 0.1212158203125, "loss_aux_layer_5": 0.12255859375, "loss_aux_layer_6": 0.1248779296875, "loss_aux_layer_7": 0.1185302734375, "loss_aux_layer_8": 0.116455078125, "loss_aux_layer_9": 0.1141357421875, "step": 547, "total_loss": 0.7524552941322327 }, { "epoch": 0.10849336764997031, "grad_norm": 1.2673394680023193, "learning_rate": 5e-05, "llm_loss": 0.5715154707431793, "loss": 2.8609, "loss_aux_layer_0": 0.031097412109375, "loss_aux_layer_1": 0.1002197265625, "loss_aux_layer_10": 0.1087646484375, "loss_aux_layer_11": 0.115234375, "loss_aux_layer_12": 0.1246337890625, "loss_aux_layer_13": 0.134521484375, "loss_aux_layer_14": 0.150390625, "loss_aux_layer_15": 0.165283203125, "loss_aux_layer_16": 0.178955078125, "loss_aux_layer_17": 0.183349609375, "loss_aux_layer_18": 0.1923828125, "loss_aux_layer_19": 0.192138671875, "loss_aux_layer_2": 0.10400390625, "loss_aux_layer_20": 0.19677734375, "loss_aux_layer_21": 0.201904296875, "loss_aux_layer_22": 0.22216796875, "loss_aux_layer_23": 0.26708984375, "loss_aux_layer_3": 0.1114501953125, "loss_aux_layer_4": 0.112060546875, "loss_aux_layer_5": 0.1136474609375, "loss_aux_layer_6": 0.115234375, "loss_aux_layer_7": 0.1102294921875, "loss_aux_layer_8": 0.109130859375, "loss_aux_layer_9": 0.1080322265625, "step": 548, "total_loss": 0.7152149826288223 }, { "epoch": 0.1086913482478717, "grad_norm": 2.346693754196167, "learning_rate": 5e-05, "llm_loss": 0.7107753604650497, "loss": 3.416, "loss_aux_layer_0": 0.0311279296875, "loss_aux_layer_1": 0.10107421875, "loss_aux_layer_10": 0.1072998046875, "loss_aux_layer_11": 0.11328125, "loss_aux_layer_12": 0.122314453125, "loss_aux_layer_13": 0.132568359375, "loss_aux_layer_14": 0.14892578125, "loss_aux_layer_15": 0.16357421875, "loss_aux_layer_16": 0.177734375, "loss_aux_layer_17": 0.183349609375, "loss_aux_layer_18": 0.19140625, "loss_aux_layer_19": 0.19189453125, "loss_aux_layer_2": 0.10205078125, "loss_aux_layer_20": 0.19677734375, "loss_aux_layer_21": 0.2021484375, "loss_aux_layer_22": 0.224365234375, "loss_aux_layer_23": 0.26953125, "loss_aux_layer_3": 0.1107177734375, "loss_aux_layer_4": 0.1121826171875, "loss_aux_layer_5": 0.1138916015625, "loss_aux_layer_6": 0.116455078125, "loss_aux_layer_7": 0.1103515625, "loss_aux_layer_8": 0.1082763671875, "loss_aux_layer_9": 0.10693359375, "step": 549, "total_loss": 0.8539964258670807 }, { "epoch": 0.10888932884577311, "grad_norm": 2.7298645973205566, "learning_rate": 5e-05, "llm_loss": 0.7014435529708862, "loss": 3.3732, "loss_aux_layer_0": 0.029083251953125, "loss_aux_layer_1": 0.09814453125, "loss_aux_layer_10": 0.1051025390625, "loss_aux_layer_11": 0.1114501953125, "loss_aux_layer_12": 0.1209716796875, "loss_aux_layer_13": 0.132080078125, "loss_aux_layer_14": 0.1484375, "loss_aux_layer_15": 0.1630859375, "loss_aux_layer_16": 0.17724609375, "loss_aux_layer_17": 0.183837890625, "loss_aux_layer_18": 0.19287109375, "loss_aux_layer_19": 0.1923828125, "loss_aux_layer_2": 0.100341796875, "loss_aux_layer_20": 0.197509765625, "loss_aux_layer_21": 0.202392578125, "loss_aux_layer_22": 0.22216796875, "loss_aux_layer_23": 0.2646484375, "loss_aux_layer_3": 0.109619140625, "loss_aux_layer_4": 0.110595703125, "loss_aux_layer_5": 0.1124267578125, "loss_aux_layer_6": 0.1131591796875, "loss_aux_layer_7": 0.1075439453125, "loss_aux_layer_8": 0.1055908203125, "loss_aux_layer_9": 0.10400390625, "step": 550, "total_loss": 0.8433107137680054 }, { "epoch": 0.10908730944367452, "grad_norm": 1.6818692684173584, "learning_rate": 5e-05, "llm_loss": 0.6534723788499832, "loss": 3.1969, "loss_aux_layer_0": 0.028900146484375, "loss_aux_layer_1": 0.102783203125, "loss_aux_layer_10": 0.1087646484375, "loss_aux_layer_11": 0.115234375, "loss_aux_layer_12": 0.124755859375, "loss_aux_layer_13": 0.1357421875, "loss_aux_layer_14": 0.152587890625, "loss_aux_layer_15": 0.16748046875, "loss_aux_layer_16": 0.181640625, "loss_aux_layer_17": 0.187744140625, "loss_aux_layer_18": 0.197265625, "loss_aux_layer_19": 0.197021484375, "loss_aux_layer_2": 0.1048583984375, "loss_aux_layer_20": 0.201904296875, "loss_aux_layer_21": 0.205810546875, "loss_aux_layer_22": 0.22705078125, "loss_aux_layer_23": 0.2705078125, "loss_aux_layer_3": 0.11181640625, "loss_aux_layer_4": 0.1136474609375, "loss_aux_layer_5": 0.1151123046875, "loss_aux_layer_6": 0.116943359375, "loss_aux_layer_7": 0.1119384765625, "loss_aux_layer_8": 0.110107421875, "loss_aux_layer_9": 0.1085205078125, "step": 551, "total_loss": 0.7992371767759323 }, { "epoch": 0.10928529004157593, "grad_norm": 3.384679079055786, "learning_rate": 5e-05, "llm_loss": 0.6359049528837204, "loss": 3.1397, "loss_aux_layer_0": 0.031768798828125, "loss_aux_layer_1": 0.1082763671875, "loss_aux_layer_10": 0.1146240234375, "loss_aux_layer_11": 0.12109375, "loss_aux_layer_12": 0.130126953125, "loss_aux_layer_13": 0.1396484375, "loss_aux_layer_14": 0.155517578125, "loss_aux_layer_15": 0.16943359375, "loss_aux_layer_16": 0.182861328125, "loss_aux_layer_17": 0.18701171875, "loss_aux_layer_18": 0.195068359375, "loss_aux_layer_19": 0.194091796875, "loss_aux_layer_2": 0.1099853515625, "loss_aux_layer_20": 0.19873046875, "loss_aux_layer_21": 0.204345703125, "loss_aux_layer_22": 0.22705078125, "loss_aux_layer_23": 0.27099609375, "loss_aux_layer_3": 0.11962890625, "loss_aux_layer_4": 0.12109375, "loss_aux_layer_5": 0.1231689453125, "loss_aux_layer_6": 0.12451171875, "loss_aux_layer_7": 0.118408203125, "loss_aux_layer_8": 0.11572265625, "loss_aux_layer_9": 0.11376953125, "step": 552, "total_loss": 0.7849156111478806 }, { "epoch": 0.10948327063947733, "grad_norm": 3.554562568664551, "learning_rate": 5e-05, "llm_loss": 0.6999677419662476, "loss": 3.3736, "loss_aux_layer_0": 0.031036376953125, "loss_aux_layer_1": 0.1004638671875, "loss_aux_layer_10": 0.1063232421875, "loss_aux_layer_11": 0.1121826171875, "loss_aux_layer_12": 0.121337890625, "loss_aux_layer_13": 0.131591796875, "loss_aux_layer_14": 0.147705078125, "loss_aux_layer_15": 0.163330078125, "loss_aux_layer_16": 0.177978515625, "loss_aux_layer_17": 0.184814453125, "loss_aux_layer_18": 0.1943359375, "loss_aux_layer_19": 0.1943359375, "loss_aux_layer_2": 0.1025390625, "loss_aux_layer_20": 0.199462890625, "loss_aux_layer_21": 0.204345703125, "loss_aux_layer_22": 0.2255859375, "loss_aux_layer_23": 0.2685546875, "loss_aux_layer_3": 0.1119384765625, "loss_aux_layer_4": 0.11279296875, "loss_aux_layer_5": 0.114501953125, "loss_aux_layer_6": 0.1141357421875, "loss_aux_layer_7": 0.1092529296875, "loss_aux_layer_8": 0.107421875, "loss_aux_layer_9": 0.1058349609375, "step": 553, "total_loss": 0.8433935195207596 }, { "epoch": 0.10968125123737874, "grad_norm": 2.884490966796875, "learning_rate": 5e-05, "llm_loss": 0.6748989522457123, "loss": 3.2754, "loss_aux_layer_0": 0.030303955078125, "loss_aux_layer_1": 0.099609375, "loss_aux_layer_10": 0.107666015625, "loss_aux_layer_11": 0.11376953125, "loss_aux_layer_12": 0.1234130859375, "loss_aux_layer_13": 0.13427734375, "loss_aux_layer_14": 0.151123046875, "loss_aux_layer_15": 0.166015625, "loss_aux_layer_16": 0.180419921875, "loss_aux_layer_17": 0.186767578125, "loss_aux_layer_18": 0.195556640625, "loss_aux_layer_19": 0.194580078125, "loss_aux_layer_2": 0.1007080078125, "loss_aux_layer_20": 0.199462890625, "loss_aux_layer_21": 0.203857421875, "loss_aux_layer_22": 0.226318359375, "loss_aux_layer_23": 0.27099609375, "loss_aux_layer_3": 0.108642578125, "loss_aux_layer_4": 0.110107421875, "loss_aux_layer_5": 0.1121826171875, "loss_aux_layer_6": 0.1141357421875, "loss_aux_layer_7": 0.109375, "loss_aux_layer_8": 0.1085205078125, "loss_aux_layer_9": 0.10693359375, "step": 554, "total_loss": 0.8188464045524597 }, { "epoch": 0.10987923183528014, "grad_norm": 3.9160046577453613, "learning_rate": 5e-05, "llm_loss": 0.6575887501239777, "loss": 3.2161, "loss_aux_layer_0": 0.030609130859375, "loss_aux_layer_1": 0.103759765625, "loss_aux_layer_10": 0.1124267578125, "loss_aux_layer_11": 0.1185302734375, "loss_aux_layer_12": 0.1273193359375, "loss_aux_layer_13": 0.137451171875, "loss_aux_layer_14": 0.1533203125, "loss_aux_layer_15": 0.1669921875, "loss_aux_layer_16": 0.1806640625, "loss_aux_layer_17": 0.18505859375, "loss_aux_layer_18": 0.194091796875, "loss_aux_layer_19": 0.193115234375, "loss_aux_layer_2": 0.1072998046875, "loss_aux_layer_20": 0.197265625, "loss_aux_layer_21": 0.20166015625, "loss_aux_layer_22": 0.221435546875, "loss_aux_layer_23": 0.26513671875, "loss_aux_layer_3": 0.11669921875, "loss_aux_layer_4": 0.117919921875, "loss_aux_layer_5": 0.12060546875, "loss_aux_layer_6": 0.1219482421875, "loss_aux_layer_7": 0.11669921875, "loss_aux_layer_8": 0.114013671875, "loss_aux_layer_9": 0.1119384765625, "step": 555, "total_loss": 0.8040369004011154 }, { "epoch": 0.11007721243318155, "grad_norm": 2.189624309539795, "learning_rate": 5e-05, "llm_loss": 0.7327115088701248, "loss": 3.494, "loss_aux_layer_0": 0.029541015625, "loss_aux_layer_1": 0.095947265625, "loss_aux_layer_10": 0.1053466796875, "loss_aux_layer_11": 0.111328125, "loss_aux_layer_12": 0.1209716796875, "loss_aux_layer_13": 0.131591796875, "loss_aux_layer_14": 0.148681640625, "loss_aux_layer_15": 0.16357421875, "loss_aux_layer_16": 0.177978515625, "loss_aux_layer_17": 0.18359375, "loss_aux_layer_18": 0.192138671875, "loss_aux_layer_19": 0.192138671875, "loss_aux_layer_2": 0.0966796875, "loss_aux_layer_20": 0.197509765625, "loss_aux_layer_21": 0.200439453125, "loss_aux_layer_22": 0.22119140625, "loss_aux_layer_23": 0.2646484375, "loss_aux_layer_3": 0.105224609375, "loss_aux_layer_4": 0.106689453125, "loss_aux_layer_5": 0.1090087890625, "loss_aux_layer_6": 0.1114501953125, "loss_aux_layer_7": 0.105712890625, "loss_aux_layer_8": 0.1048583984375, "loss_aux_layer_9": 0.10400390625, "step": 556, "total_loss": 0.8735077381134033 }, { "epoch": 0.11027519303108295, "grad_norm": 3.0080392360687256, "learning_rate": 5e-05, "llm_loss": 0.6024009883403778, "loss": 2.9976, "loss_aux_layer_0": 0.0303955078125, "loss_aux_layer_1": 0.1041259765625, "loss_aux_layer_10": 0.113037109375, "loss_aux_layer_11": 0.1187744140625, "loss_aux_layer_12": 0.1279296875, "loss_aux_layer_13": 0.1376953125, "loss_aux_layer_14": 0.154541015625, "loss_aux_layer_15": 0.16845703125, "loss_aux_layer_16": 0.182373046875, "loss_aux_layer_17": 0.1875, "loss_aux_layer_18": 0.1953125, "loss_aux_layer_19": 0.1943359375, "loss_aux_layer_2": 0.10546875, "loss_aux_layer_20": 0.198486328125, "loss_aux_layer_21": 0.203369140625, "loss_aux_layer_22": 0.224609375, "loss_aux_layer_23": 0.26806640625, "loss_aux_layer_3": 0.115966796875, "loss_aux_layer_4": 0.1175537109375, "loss_aux_layer_5": 0.119140625, "loss_aux_layer_6": 0.12060546875, "loss_aux_layer_7": 0.11474609375, "loss_aux_layer_8": 0.113037109375, "loss_aux_layer_9": 0.112060546875, "step": 557, "total_loss": 0.7493885904550552 }, { "epoch": 0.11047317362898436, "grad_norm": 4.1393656730651855, "learning_rate": 5e-05, "llm_loss": 0.6472262740135193, "loss": 3.1632, "loss_aux_layer_0": 0.03070068359375, "loss_aux_layer_1": 0.098876953125, "loss_aux_layer_10": 0.1083984375, "loss_aux_layer_11": 0.11376953125, "loss_aux_layer_12": 0.123046875, "loss_aux_layer_13": 0.1337890625, "loss_aux_layer_14": 0.1494140625, "loss_aux_layer_15": 0.164794921875, "loss_aux_layer_16": 0.178466796875, "loss_aux_layer_17": 0.184326171875, "loss_aux_layer_18": 0.19287109375, "loss_aux_layer_19": 0.19287109375, "loss_aux_layer_2": 0.1007080078125, "loss_aux_layer_20": 0.197265625, "loss_aux_layer_21": 0.2021484375, "loss_aux_layer_22": 0.222900390625, "loss_aux_layer_23": 0.267578125, "loss_aux_layer_3": 0.1116943359375, "loss_aux_layer_4": 0.11279296875, "loss_aux_layer_5": 0.1148681640625, "loss_aux_layer_6": 0.1168212890625, "loss_aux_layer_7": 0.1114501953125, "loss_aux_layer_8": 0.1094970703125, "loss_aux_layer_9": 0.10791015625, "step": 558, "total_loss": 0.7908005118370056 }, { "epoch": 0.11067115422688577, "grad_norm": 3.392435312271118, "learning_rate": 5e-05, "llm_loss": 0.7098150253295898, "loss": 3.4421, "loss_aux_layer_0": 0.03424072265625, "loss_aux_layer_1": 0.10595703125, "loss_aux_layer_10": 0.115966796875, "loss_aux_layer_11": 0.121337890625, "loss_aux_layer_12": 0.130615234375, "loss_aux_layer_13": 0.141357421875, "loss_aux_layer_14": 0.158447265625, "loss_aux_layer_15": 0.17333984375, "loss_aux_layer_16": 0.1875, "loss_aux_layer_17": 0.192626953125, "loss_aux_layer_18": 0.200927734375, "loss_aux_layer_19": 0.20068359375, "loss_aux_layer_2": 0.1065673828125, "loss_aux_layer_20": 0.20556640625, "loss_aux_layer_21": 0.21142578125, "loss_aux_layer_22": 0.235107421875, "loss_aux_layer_23": 0.279296875, "loss_aux_layer_3": 0.1151123046875, "loss_aux_layer_4": 0.11669921875, "loss_aux_layer_5": 0.11865234375, "loss_aux_layer_6": 0.12109375, "loss_aux_layer_7": 0.11572265625, "loss_aux_layer_8": 0.1153564453125, "loss_aux_layer_9": 0.1143798828125, "step": 559, "total_loss": 0.8605308681726456 }, { "epoch": 0.11086913482478718, "grad_norm": 1.7302435636520386, "learning_rate": 5e-05, "llm_loss": 0.7266100347042084, "loss": 3.4786, "loss_aux_layer_0": 0.030120849609375, "loss_aux_layer_1": 0.1005859375, "loss_aux_layer_10": 0.1080322265625, "loss_aux_layer_11": 0.1141357421875, "loss_aux_layer_12": 0.1231689453125, "loss_aux_layer_13": 0.133544921875, "loss_aux_layer_14": 0.150146484375, "loss_aux_layer_15": 0.164794921875, "loss_aux_layer_16": 0.17822265625, "loss_aux_layer_17": 0.183349609375, "loss_aux_layer_18": 0.191162109375, "loss_aux_layer_19": 0.189697265625, "loss_aux_layer_2": 0.10205078125, "loss_aux_layer_20": 0.1953125, "loss_aux_layer_21": 0.200927734375, "loss_aux_layer_22": 0.222900390625, "loss_aux_layer_23": 0.26513671875, "loss_aux_layer_3": 0.1103515625, "loss_aux_layer_4": 0.112060546875, "loss_aux_layer_5": 0.113525390625, "loss_aux_layer_6": 0.1158447265625, "loss_aux_layer_7": 0.1107177734375, "loss_aux_layer_8": 0.109130859375, "loss_aux_layer_9": 0.1070556640625, "step": 560, "total_loss": 0.8696543425321579 }, { "epoch": 0.11106711542268857, "grad_norm": 2.90816068649292, "learning_rate": 5e-05, "llm_loss": 0.6432251930236816, "loss": 3.1636, "loss_aux_layer_0": 0.0294189453125, "loss_aux_layer_1": 0.1004638671875, "loss_aux_layer_10": 0.11181640625, "loss_aux_layer_11": 0.1173095703125, "loss_aux_layer_12": 0.1265869140625, "loss_aux_layer_13": 0.1376953125, "loss_aux_layer_14": 0.155029296875, "loss_aux_layer_15": 0.169677734375, "loss_aux_layer_16": 0.183837890625, "loss_aux_layer_17": 0.18896484375, "loss_aux_layer_18": 0.199462890625, "loss_aux_layer_19": 0.199462890625, "loss_aux_layer_2": 0.1026611328125, "loss_aux_layer_20": 0.204345703125, "loss_aux_layer_21": 0.209716796875, "loss_aux_layer_22": 0.231201171875, "loss_aux_layer_23": 0.275390625, "loss_aux_layer_3": 0.1131591796875, "loss_aux_layer_4": 0.115234375, "loss_aux_layer_5": 0.1173095703125, "loss_aux_layer_6": 0.1192626953125, "loss_aux_layer_7": 0.113525390625, "loss_aux_layer_8": 0.112060546875, "loss_aux_layer_9": 0.111083984375, "step": 561, "total_loss": 0.7909099608659744 }, { "epoch": 0.11126509602058998, "grad_norm": 1.6946637630462646, "learning_rate": 5e-05, "llm_loss": 0.7026113718748093, "loss": 3.3787, "loss_aux_layer_0": 0.030914306640625, "loss_aux_layer_1": 0.0999755859375, "loss_aux_layer_10": 0.1053466796875, "loss_aux_layer_11": 0.1121826171875, "loss_aux_layer_12": 0.1220703125, "loss_aux_layer_13": 0.1326904296875, "loss_aux_layer_14": 0.149658203125, "loss_aux_layer_15": 0.16455078125, "loss_aux_layer_16": 0.17822265625, "loss_aux_layer_17": 0.184326171875, "loss_aux_layer_18": 0.193359375, "loss_aux_layer_19": 0.192626953125, "loss_aux_layer_2": 0.1002197265625, "loss_aux_layer_20": 0.197265625, "loss_aux_layer_21": 0.201904296875, "loss_aux_layer_22": 0.22216796875, "loss_aux_layer_23": 0.26611328125, "loss_aux_layer_3": 0.1075439453125, "loss_aux_layer_4": 0.1087646484375, "loss_aux_layer_5": 0.1103515625, "loss_aux_layer_6": 0.112060546875, "loss_aux_layer_7": 0.1070556640625, "loss_aux_layer_8": 0.1053466796875, "loss_aux_layer_9": 0.104248046875, "step": 562, "total_loss": 0.8446665853261948 }, { "epoch": 0.11146307661849139, "grad_norm": 2.326917886734009, "learning_rate": 5e-05, "llm_loss": 0.5796890258789062, "loss": 2.9109, "loss_aux_layer_0": 0.03094482421875, "loss_aux_layer_1": 0.1021728515625, "loss_aux_layer_10": 0.1141357421875, "loss_aux_layer_11": 0.1202392578125, "loss_aux_layer_12": 0.129150390625, "loss_aux_layer_13": 0.139404296875, "loss_aux_layer_14": 0.15576171875, "loss_aux_layer_15": 0.169921875, "loss_aux_layer_16": 0.182861328125, "loss_aux_layer_17": 0.188232421875, "loss_aux_layer_18": 0.19580078125, "loss_aux_layer_19": 0.1953125, "loss_aux_layer_2": 0.1043701171875, "loss_aux_layer_20": 0.199951171875, "loss_aux_layer_21": 0.20556640625, "loss_aux_layer_22": 0.228515625, "loss_aux_layer_23": 0.2734375, "loss_aux_layer_3": 0.115234375, "loss_aux_layer_4": 0.1171875, "loss_aux_layer_5": 0.1195068359375, "loss_aux_layer_6": 0.1217041015625, "loss_aux_layer_7": 0.11572265625, "loss_aux_layer_8": 0.1141357421875, "loss_aux_layer_9": 0.1131591796875, "step": 563, "total_loss": 0.7277326732873917 }, { "epoch": 0.11166105721639279, "grad_norm": 2.5963358879089355, "learning_rate": 5e-05, "llm_loss": 0.7035895138978958, "loss": 3.3924, "loss_aux_layer_0": 0.034149169921875, "loss_aux_layer_1": 0.1038818359375, "loss_aux_layer_10": 0.1087646484375, "loss_aux_layer_11": 0.115478515625, "loss_aux_layer_12": 0.1248779296875, "loss_aux_layer_13": 0.1353759765625, "loss_aux_layer_14": 0.15185546875, "loss_aux_layer_15": 0.16650390625, "loss_aux_layer_16": 0.179931640625, "loss_aux_layer_17": 0.1845703125, "loss_aux_layer_18": 0.19287109375, "loss_aux_layer_19": 0.1923828125, "loss_aux_layer_2": 0.10498046875, "loss_aux_layer_20": 0.19677734375, "loss_aux_layer_21": 0.201171875, "loss_aux_layer_22": 0.22216796875, "loss_aux_layer_23": 0.265625, "loss_aux_layer_3": 0.1119384765625, "loss_aux_layer_4": 0.1131591796875, "loss_aux_layer_5": 0.1148681640625, "loss_aux_layer_6": 0.116943359375, "loss_aux_layer_7": 0.1116943359375, "loss_aux_layer_8": 0.109619140625, "loss_aux_layer_9": 0.1082763671875, "step": 564, "total_loss": 0.8481061011552811 }, { "epoch": 0.1118590378142942, "grad_norm": 1.0581672191619873, "learning_rate": 5e-05, "llm_loss": 0.6070528775453568, "loss": 3.0002, "loss_aux_layer_0": 0.02911376953125, "loss_aux_layer_1": 0.10107421875, "loss_aux_layer_10": 0.10791015625, "loss_aux_layer_11": 0.1143798828125, "loss_aux_layer_12": 0.12353515625, "loss_aux_layer_13": 0.134033203125, "loss_aux_layer_14": 0.1494140625, "loss_aux_layer_15": 0.1630859375, "loss_aux_layer_16": 0.176513671875, "loss_aux_layer_17": 0.181884765625, "loss_aux_layer_18": 0.190185546875, "loss_aux_layer_19": 0.190673828125, "loss_aux_layer_2": 0.1024169921875, "loss_aux_layer_20": 0.1953125, "loss_aux_layer_21": 0.201416015625, "loss_aux_layer_22": 0.2236328125, "loss_aux_layer_23": 0.26806640625, "loss_aux_layer_3": 0.1107177734375, "loss_aux_layer_4": 0.1123046875, "loss_aux_layer_5": 0.1138916015625, "loss_aux_layer_6": 0.1160888671875, "loss_aux_layer_7": 0.11083984375, "loss_aux_layer_8": 0.108642578125, "loss_aux_layer_9": 0.107177734375, "step": 565, "total_loss": 0.7500410825014114 }, { "epoch": 0.1120570184121956, "grad_norm": 2.304898500442505, "learning_rate": 5e-05, "llm_loss": 0.6174862384796143, "loss": 3.0663, "loss_aux_layer_0": 0.0322265625, "loss_aux_layer_1": 0.1046142578125, "loss_aux_layer_10": 0.113037109375, "loss_aux_layer_11": 0.1197509765625, "loss_aux_layer_12": 0.1297607421875, "loss_aux_layer_13": 0.140380859375, "loss_aux_layer_14": 0.15771484375, "loss_aux_layer_15": 0.1728515625, "loss_aux_layer_16": 0.185791015625, "loss_aux_layer_17": 0.19091796875, "loss_aux_layer_18": 0.19921875, "loss_aux_layer_19": 0.1982421875, "loss_aux_layer_2": 0.1082763671875, "loss_aux_layer_20": 0.202392578125, "loss_aux_layer_21": 0.20703125, "loss_aux_layer_22": 0.22802734375, "loss_aux_layer_23": 0.27294921875, "loss_aux_layer_3": 0.115966796875, "loss_aux_layer_4": 0.1173095703125, "loss_aux_layer_5": 0.1195068359375, "loss_aux_layer_6": 0.121337890625, "loss_aux_layer_7": 0.11572265625, "loss_aux_layer_8": 0.1134033203125, "loss_aux_layer_9": 0.112060546875, "step": 566, "total_loss": 0.7665832042694092 }, { "epoch": 0.11225499901009701, "grad_norm": 1.105660319328308, "learning_rate": 5e-05, "llm_loss": 0.647150531411171, "loss": 3.1562, "loss_aux_layer_0": 0.02880859375, "loss_aux_layer_1": 0.09716796875, "loss_aux_layer_10": 0.10595703125, "loss_aux_layer_11": 0.1123046875, "loss_aux_layer_12": 0.1217041015625, "loss_aux_layer_13": 0.132080078125, "loss_aux_layer_14": 0.14892578125, "loss_aux_layer_15": 0.164306640625, "loss_aux_layer_16": 0.17822265625, "loss_aux_layer_17": 0.18408203125, "loss_aux_layer_18": 0.193359375, "loss_aux_layer_19": 0.193603515625, "loss_aux_layer_2": 0.0986328125, "loss_aux_layer_20": 0.198486328125, "loss_aux_layer_21": 0.203857421875, "loss_aux_layer_22": 0.22412109375, "loss_aux_layer_23": 0.26806640625, "loss_aux_layer_3": 0.1058349609375, "loss_aux_layer_4": 0.1077880859375, "loss_aux_layer_5": 0.109375, "loss_aux_layer_6": 0.111572265625, "loss_aux_layer_7": 0.1068115234375, "loss_aux_layer_8": 0.10546875, "loss_aux_layer_9": 0.10498046875, "step": 567, "total_loss": 0.7890439182519913 }, { "epoch": 0.11245297960799841, "grad_norm": 2.462085008621216, "learning_rate": 5e-05, "llm_loss": 0.691306009888649, "loss": 3.3467, "loss_aux_layer_0": 0.03289794921875, "loss_aux_layer_1": 0.103515625, "loss_aux_layer_10": 0.110107421875, "loss_aux_layer_11": 0.1165771484375, "loss_aux_layer_12": 0.12548828125, "loss_aux_layer_13": 0.135986328125, "loss_aux_layer_14": 0.151611328125, "loss_aux_layer_15": 0.16552734375, "loss_aux_layer_16": 0.1787109375, "loss_aux_layer_17": 0.1845703125, "loss_aux_layer_18": 0.193359375, "loss_aux_layer_19": 0.193115234375, "loss_aux_layer_2": 0.105712890625, "loss_aux_layer_20": 0.19775390625, "loss_aux_layer_21": 0.201904296875, "loss_aux_layer_22": 0.223876953125, "loss_aux_layer_23": 0.267578125, "loss_aux_layer_3": 0.1143798828125, "loss_aux_layer_4": 0.1160888671875, "loss_aux_layer_5": 0.117431640625, "loss_aux_layer_6": 0.1192626953125, "loss_aux_layer_7": 0.113037109375, "loss_aux_layer_8": 0.111083984375, "loss_aux_layer_9": 0.109375, "step": 568, "total_loss": 0.8366685807704926 }, { "epoch": 0.11265096020589982, "grad_norm": 2.301151990890503, "learning_rate": 5e-05, "llm_loss": 0.6721729040145874, "loss": 3.2736, "loss_aux_layer_0": 0.030548095703125, "loss_aux_layer_1": 0.1002197265625, "loss_aux_layer_10": 0.10986328125, "loss_aux_layer_11": 0.1158447265625, "loss_aux_layer_12": 0.125244140625, "loss_aux_layer_13": 0.135986328125, "loss_aux_layer_14": 0.152587890625, "loss_aux_layer_15": 0.16748046875, "loss_aux_layer_16": 0.18212890625, "loss_aux_layer_17": 0.1884765625, "loss_aux_layer_18": 0.1982421875, "loss_aux_layer_19": 0.19873046875, "loss_aux_layer_2": 0.1024169921875, "loss_aux_layer_20": 0.2041015625, "loss_aux_layer_21": 0.208740234375, "loss_aux_layer_22": 0.23095703125, "loss_aux_layer_23": 0.2763671875, "loss_aux_layer_3": 0.11083984375, "loss_aux_layer_4": 0.1123046875, "loss_aux_layer_5": 0.1138916015625, "loss_aux_layer_6": 0.1162109375, "loss_aux_layer_7": 0.1104736328125, "loss_aux_layer_8": 0.1094970703125, "loss_aux_layer_9": 0.1085205078125, "step": 569, "total_loss": 0.818403884768486 }, { "epoch": 0.11284894080380123, "grad_norm": 2.222886085510254, "learning_rate": 5e-05, "llm_loss": 0.6418316215276718, "loss": 3.1275, "loss_aux_layer_0": 0.02947998046875, "loss_aux_layer_1": 0.093017578125, "loss_aux_layer_10": 0.1025390625, "loss_aux_layer_11": 0.1083984375, "loss_aux_layer_12": 0.1180419921875, "loss_aux_layer_13": 0.12890625, "loss_aux_layer_14": 0.146728515625, "loss_aux_layer_15": 0.1630859375, "loss_aux_layer_16": 0.178466796875, "loss_aux_layer_17": 0.18505859375, "loss_aux_layer_18": 0.193359375, "loss_aux_layer_19": 0.195068359375, "loss_aux_layer_2": 0.0948486328125, "loss_aux_layer_20": 0.2001953125, "loss_aux_layer_21": 0.204345703125, "loss_aux_layer_22": 0.223876953125, "loss_aux_layer_23": 0.26904296875, "loss_aux_layer_3": 0.1016845703125, "loss_aux_layer_4": 0.103515625, "loss_aux_layer_5": 0.105712890625, "loss_aux_layer_6": 0.1083984375, "loss_aux_layer_7": 0.1033935546875, "loss_aux_layer_8": 0.1024169921875, "loss_aux_layer_9": 0.101318359375, "step": 570, "total_loss": 0.7818667739629745 }, { "epoch": 0.11304692140170264, "grad_norm": 2.941840887069702, "learning_rate": 5e-05, "llm_loss": 0.6014759838581085, "loss": 2.9888, "loss_aux_layer_0": 0.0369873046875, "loss_aux_layer_1": 0.10107421875, "loss_aux_layer_10": 0.1094970703125, "loss_aux_layer_11": 0.114501953125, "loss_aux_layer_12": 0.12451171875, "loss_aux_layer_13": 0.1348876953125, "loss_aux_layer_14": 0.152587890625, "loss_aux_layer_15": 0.167724609375, "loss_aux_layer_16": 0.181640625, "loss_aux_layer_17": 0.1875, "loss_aux_layer_18": 0.19677734375, "loss_aux_layer_19": 0.197509765625, "loss_aux_layer_2": 0.1016845703125, "loss_aux_layer_20": 0.203369140625, "loss_aux_layer_21": 0.207763671875, "loss_aux_layer_22": 0.228271484375, "loss_aux_layer_23": 0.27197265625, "loss_aux_layer_3": 0.1092529296875, "loss_aux_layer_4": 0.111083984375, "loss_aux_layer_5": 0.1131591796875, "loss_aux_layer_6": 0.1160888671875, "loss_aux_layer_7": 0.111083984375, "loss_aux_layer_8": 0.1092529296875, "loss_aux_layer_9": 0.1082763671875, "step": 571, "total_loss": 0.7471902519464493 }, { "epoch": 0.11324490199960403, "grad_norm": 1.6976401805877686, "learning_rate": 5e-05, "llm_loss": 0.6424536556005478, "loss": 3.1617, "loss_aux_layer_0": 0.030609130859375, "loss_aux_layer_1": 0.105712890625, "loss_aux_layer_10": 0.114013671875, "loss_aux_layer_11": 0.1204833984375, "loss_aux_layer_12": 0.1297607421875, "loss_aux_layer_13": 0.140380859375, "loss_aux_layer_14": 0.156494140625, "loss_aux_layer_15": 0.16943359375, "loss_aux_layer_16": 0.1826171875, "loss_aux_layer_17": 0.187744140625, "loss_aux_layer_18": 0.19482421875, "loss_aux_layer_19": 0.194091796875, "loss_aux_layer_2": 0.1075439453125, "loss_aux_layer_20": 0.198486328125, "loss_aux_layer_21": 0.203369140625, "loss_aux_layer_22": 0.22607421875, "loss_aux_layer_23": 0.2705078125, "loss_aux_layer_3": 0.1156005859375, "loss_aux_layer_4": 0.11767578125, "loss_aux_layer_5": 0.1195068359375, "loss_aux_layer_6": 0.1221923828125, "loss_aux_layer_7": 0.1165771484375, "loss_aux_layer_8": 0.1148681640625, "loss_aux_layer_9": 0.11279296875, "step": 572, "total_loss": 0.7904283255338669 }, { "epoch": 0.11344288259750544, "grad_norm": 2.1870877742767334, "learning_rate": 5e-05, "llm_loss": 0.6944336593151093, "loss": 3.3495, "loss_aux_layer_0": 0.029388427734375, "loss_aux_layer_1": 0.097900390625, "loss_aux_layer_10": 0.107421875, "loss_aux_layer_11": 0.1129150390625, "loss_aux_layer_12": 0.1226806640625, "loss_aux_layer_13": 0.133056640625, "loss_aux_layer_14": 0.150146484375, "loss_aux_layer_15": 0.1650390625, "loss_aux_layer_16": 0.17919921875, "loss_aux_layer_17": 0.185302734375, "loss_aux_layer_18": 0.194091796875, "loss_aux_layer_19": 0.194580078125, "loss_aux_layer_2": 0.099365234375, "loss_aux_layer_20": 0.19921875, "loss_aux_layer_21": 0.203369140625, "loss_aux_layer_22": 0.224365234375, "loss_aux_layer_23": 0.26904296875, "loss_aux_layer_3": 0.10693359375, "loss_aux_layer_4": 0.109130859375, "loss_aux_layer_5": 0.111083984375, "loss_aux_layer_6": 0.114013671875, "loss_aux_layer_7": 0.108642578125, "loss_aux_layer_8": 0.1075439453125, "loss_aux_layer_9": 0.1064453125, "step": 573, "total_loss": 0.8373678177595139 }, { "epoch": 0.11364086319540685, "grad_norm": 1.6000745296478271, "learning_rate": 5e-05, "llm_loss": 0.6959877163171768, "loss": 3.3611, "loss_aux_layer_0": 0.033172607421875, "loss_aux_layer_1": 0.104736328125, "loss_aux_layer_10": 0.1107177734375, "loss_aux_layer_11": 0.1170654296875, "loss_aux_layer_12": 0.1260986328125, "loss_aux_layer_13": 0.13623046875, "loss_aux_layer_14": 0.151611328125, "loss_aux_layer_15": 0.165283203125, "loss_aux_layer_16": 0.177734375, "loss_aux_layer_17": 0.18310546875, "loss_aux_layer_18": 0.191650390625, "loss_aux_layer_19": 0.19091796875, "loss_aux_layer_2": 0.1048583984375, "loss_aux_layer_20": 0.1953125, "loss_aux_layer_21": 0.19873046875, "loss_aux_layer_22": 0.2177734375, "loss_aux_layer_23": 0.2607421875, "loss_aux_layer_3": 0.1126708984375, "loss_aux_layer_4": 0.1146240234375, "loss_aux_layer_5": 0.1165771484375, "loss_aux_layer_6": 0.11865234375, "loss_aux_layer_7": 0.113037109375, "loss_aux_layer_8": 0.1114501953125, "loss_aux_layer_9": 0.1099853515625, "step": 574, "total_loss": 0.840274840593338 }, { "epoch": 0.11383884379330826, "grad_norm": 1.6610991954803467, "learning_rate": 5e-05, "llm_loss": 0.6014770716428757, "loss": 2.9881, "loss_aux_layer_0": 0.03106689453125, "loss_aux_layer_1": 0.10595703125, "loss_aux_layer_10": 0.112060546875, "loss_aux_layer_11": 0.1181640625, "loss_aux_layer_12": 0.126953125, "loss_aux_layer_13": 0.136474609375, "loss_aux_layer_14": 0.152099609375, "loss_aux_layer_15": 0.16552734375, "loss_aux_layer_16": 0.1787109375, "loss_aux_layer_17": 0.183837890625, "loss_aux_layer_18": 0.1923828125, "loss_aux_layer_19": 0.19091796875, "loss_aux_layer_2": 0.1064453125, "loss_aux_layer_20": 0.195556640625, "loss_aux_layer_21": 0.199951171875, "loss_aux_layer_22": 0.221435546875, "loss_aux_layer_23": 0.26513671875, "loss_aux_layer_3": 0.1148681640625, "loss_aux_layer_4": 0.1168212890625, "loss_aux_layer_5": 0.1185302734375, "loss_aux_layer_6": 0.1207275390625, "loss_aux_layer_7": 0.1146240234375, "loss_aux_layer_8": 0.113037109375, "loss_aux_layer_9": 0.11083984375, "step": 575, "total_loss": 0.7470252811908722 }, { "epoch": 0.11403682439120966, "grad_norm": 1.3554432392120361, "learning_rate": 5e-05, "llm_loss": 0.5503480434417725, "loss": 2.7703, "loss_aux_layer_0": 0.0289306640625, "loss_aux_layer_1": 0.098876953125, "loss_aux_layer_10": 0.1060791015625, "loss_aux_layer_11": 0.112548828125, "loss_aux_layer_12": 0.1214599609375, "loss_aux_layer_13": 0.131591796875, "loss_aux_layer_14": 0.147705078125, "loss_aux_layer_15": 0.162109375, "loss_aux_layer_16": 0.17578125, "loss_aux_layer_17": 0.181396484375, "loss_aux_layer_18": 0.1904296875, "loss_aux_layer_19": 0.191162109375, "loss_aux_layer_2": 0.101318359375, "loss_aux_layer_20": 0.1962890625, "loss_aux_layer_21": 0.202880859375, "loss_aux_layer_22": 0.226318359375, "loss_aux_layer_23": 0.27099609375, "loss_aux_layer_3": 0.108642578125, "loss_aux_layer_4": 0.1104736328125, "loss_aux_layer_5": 0.1123046875, "loss_aux_layer_6": 0.1143798828125, "loss_aux_layer_7": 0.1085205078125, "loss_aux_layer_8": 0.106689453125, "loss_aux_layer_9": 0.105224609375, "step": 576, "total_loss": 0.6925776153802872 }, { "epoch": 0.11423480498911107, "grad_norm": 2.1012279987335205, "learning_rate": 5e-05, "llm_loss": 0.5291481465101242, "loss": 2.7087, "loss_aux_layer_0": 0.0291748046875, "loss_aux_layer_1": 0.1044921875, "loss_aux_layer_10": 0.11328125, "loss_aux_layer_11": 0.1197509765625, "loss_aux_layer_12": 0.12890625, "loss_aux_layer_13": 0.13916015625, "loss_aux_layer_14": 0.155029296875, "loss_aux_layer_15": 0.168701171875, "loss_aux_layer_16": 0.181884765625, "loss_aux_layer_17": 0.1875, "loss_aux_layer_18": 0.195556640625, "loss_aux_layer_19": 0.195556640625, "loss_aux_layer_2": 0.1090087890625, "loss_aux_layer_20": 0.199951171875, "loss_aux_layer_21": 0.204833984375, "loss_aux_layer_22": 0.227294921875, "loss_aux_layer_23": 0.2734375, "loss_aux_layer_3": 0.11669921875, "loss_aux_layer_4": 0.117919921875, "loss_aux_layer_5": 0.119384765625, "loss_aux_layer_6": 0.1209716796875, "loss_aux_layer_7": 0.1158447265625, "loss_aux_layer_8": 0.1143798828125, "loss_aux_layer_9": 0.113037109375, "step": 577, "total_loss": 0.6771802008152008 }, { "epoch": 0.11443278558701248, "grad_norm": 1.1952522993087769, "learning_rate": 5e-05, "llm_loss": 0.6863976120948792, "loss": 3.3335, "loss_aux_layer_0": 0.03204345703125, "loss_aux_layer_1": 0.107177734375, "loss_aux_layer_10": 0.112548828125, "loss_aux_layer_11": 0.1195068359375, "loss_aux_layer_12": 0.128662109375, "loss_aux_layer_13": 0.13916015625, "loss_aux_layer_14": 0.155029296875, "loss_aux_layer_15": 0.16943359375, "loss_aux_layer_16": 0.182373046875, "loss_aux_layer_17": 0.187255859375, "loss_aux_layer_18": 0.1943359375, "loss_aux_layer_19": 0.193115234375, "loss_aux_layer_2": 0.1064453125, "loss_aux_layer_20": 0.197265625, "loss_aux_layer_21": 0.200439453125, "loss_aux_layer_22": 0.221923828125, "loss_aux_layer_23": 0.265625, "loss_aux_layer_3": 0.115966796875, "loss_aux_layer_4": 0.117431640625, "loss_aux_layer_5": 0.119140625, "loss_aux_layer_6": 0.121826171875, "loss_aux_layer_7": 0.1156005859375, "loss_aux_layer_8": 0.1136474609375, "loss_aux_layer_9": 0.1116943359375, "step": 578, "total_loss": 0.8333754986524582 }, { "epoch": 0.11463076618491388, "grad_norm": 1.688474416732788, "learning_rate": 5e-05, "llm_loss": 0.6231563836336136, "loss": 3.0599, "loss_aux_layer_0": 0.032989501953125, "loss_aux_layer_1": 0.1009521484375, "loss_aux_layer_10": 0.1063232421875, "loss_aux_layer_11": 0.11279296875, "loss_aux_layer_12": 0.1219482421875, "loss_aux_layer_13": 0.132080078125, "loss_aux_layer_14": 0.1484375, "loss_aux_layer_15": 0.162841796875, "loss_aux_layer_16": 0.17626953125, "loss_aux_layer_17": 0.18212890625, "loss_aux_layer_18": 0.190673828125, "loss_aux_layer_19": 0.190673828125, "loss_aux_layer_2": 0.099609375, "loss_aux_layer_20": 0.195556640625, "loss_aux_layer_21": 0.20166015625, "loss_aux_layer_22": 0.222412109375, "loss_aux_layer_23": 0.26708984375, "loss_aux_layer_3": 0.107421875, "loss_aux_layer_4": 0.1092529296875, "loss_aux_layer_5": 0.1109619140625, "loss_aux_layer_6": 0.113037109375, "loss_aux_layer_7": 0.1077880859375, "loss_aux_layer_8": 0.1063232421875, "loss_aux_layer_9": 0.10546875, "step": 579, "total_loss": 0.7649737745523453 }, { "epoch": 0.11482874678281528, "grad_norm": 1.163669466972351, "learning_rate": 5e-05, "llm_loss": 0.7392165511846542, "loss": 3.5123, "loss_aux_layer_0": 0.029022216796875, "loss_aux_layer_1": 0.0953369140625, "loss_aux_layer_10": 0.1024169921875, "loss_aux_layer_11": 0.10888671875, "loss_aux_layer_12": 0.11767578125, "loss_aux_layer_13": 0.12841796875, "loss_aux_layer_14": 0.14404296875, "loss_aux_layer_15": 0.158203125, "loss_aux_layer_16": 0.172607421875, "loss_aux_layer_17": 0.178955078125, "loss_aux_layer_18": 0.189697265625, "loss_aux_layer_19": 0.190673828125, "loss_aux_layer_2": 0.09619140625, "loss_aux_layer_20": 0.196533203125, "loss_aux_layer_21": 0.200439453125, "loss_aux_layer_22": 0.221435546875, "loss_aux_layer_23": 0.2646484375, "loss_aux_layer_3": 0.1036376953125, "loss_aux_layer_4": 0.1058349609375, "loss_aux_layer_5": 0.10791015625, "loss_aux_layer_6": 0.1102294921875, "loss_aux_layer_7": 0.10498046875, "loss_aux_layer_8": 0.10302734375, "loss_aux_layer_9": 0.1015625, "step": 580, "total_loss": 0.8780851811170578 }, { "epoch": 0.11502672738071669, "grad_norm": 1.9917793273925781, "learning_rate": 5e-05, "llm_loss": 0.5966746285557747, "loss": 2.9304, "loss_aux_layer_0": 0.0281982421875, "loss_aux_layer_1": 0.0888671875, "loss_aux_layer_10": 0.099853515625, "loss_aux_layer_11": 0.105712890625, "loss_aux_layer_12": 0.1151123046875, "loss_aux_layer_13": 0.125244140625, "loss_aux_layer_14": 0.14208984375, "loss_aux_layer_15": 0.15771484375, "loss_aux_layer_16": 0.17138671875, "loss_aux_layer_17": 0.177978515625, "loss_aux_layer_18": 0.187744140625, "loss_aux_layer_19": 0.18896484375, "loss_aux_layer_2": 0.09130859375, "loss_aux_layer_20": 0.19482421875, "loss_aux_layer_21": 0.20068359375, "loss_aux_layer_22": 0.220703125, "loss_aux_layer_23": 0.265625, "loss_aux_layer_3": 0.0987548828125, "loss_aux_layer_4": 0.099853515625, "loss_aux_layer_5": 0.101318359375, "loss_aux_layer_6": 0.1038818359375, "loss_aux_layer_7": 0.0992431640625, "loss_aux_layer_8": 0.098876953125, "loss_aux_layer_9": 0.0987548828125, "step": 581, "total_loss": 0.7326101213693619 }, { "epoch": 0.1152247079786181, "grad_norm": 1.5394859313964844, "learning_rate": 5e-05, "llm_loss": 0.6409666389226913, "loss": 3.1268, "loss_aux_layer_0": 0.030914306640625, "loss_aux_layer_1": 0.0980224609375, "loss_aux_layer_10": 0.1053466796875, "loss_aux_layer_11": 0.111572265625, "loss_aux_layer_12": 0.1207275390625, "loss_aux_layer_13": 0.130859375, "loss_aux_layer_14": 0.146728515625, "loss_aux_layer_15": 0.161376953125, "loss_aux_layer_16": 0.17529296875, "loss_aux_layer_17": 0.181396484375, "loss_aux_layer_18": 0.189697265625, "loss_aux_layer_19": 0.190185546875, "loss_aux_layer_2": 0.0985107421875, "loss_aux_layer_20": 0.195556640625, "loss_aux_layer_21": 0.19970703125, "loss_aux_layer_22": 0.2216796875, "loss_aux_layer_23": 0.2646484375, "loss_aux_layer_3": 0.1068115234375, "loss_aux_layer_4": 0.1083984375, "loss_aux_layer_5": 0.110107421875, "loss_aux_layer_6": 0.1121826171875, "loss_aux_layer_7": 0.1070556640625, "loss_aux_layer_8": 0.10546875, "loss_aux_layer_9": 0.1043701171875, "step": 582, "total_loss": 0.781707376241684 }, { "epoch": 0.1154226885765195, "grad_norm": 1.2583523988723755, "learning_rate": 5e-05, "llm_loss": 0.7125091254711151, "loss": 3.4414, "loss_aux_layer_0": 0.033050537109375, "loss_aux_layer_1": 0.1068115234375, "loss_aux_layer_10": 0.1124267578125, "loss_aux_layer_11": 0.11962890625, "loss_aux_layer_12": 0.12890625, "loss_aux_layer_13": 0.13916015625, "loss_aux_layer_14": 0.155029296875, "loss_aux_layer_15": 0.169189453125, "loss_aux_layer_16": 0.1826171875, "loss_aux_layer_17": 0.187255859375, "loss_aux_layer_18": 0.19580078125, "loss_aux_layer_19": 0.1953125, "loss_aux_layer_2": 0.1077880859375, "loss_aux_layer_20": 0.20068359375, "loss_aux_layer_21": 0.205322265625, "loss_aux_layer_22": 0.225830078125, "loss_aux_layer_23": 0.26904296875, "loss_aux_layer_3": 0.115478515625, "loss_aux_layer_4": 0.117431640625, "loss_aux_layer_5": 0.11865234375, "loss_aux_layer_6": 0.1212158203125, "loss_aux_layer_7": 0.11572265625, "loss_aux_layer_8": 0.1136474609375, "loss_aux_layer_9": 0.112060546875, "step": 583, "total_loss": 0.8603525459766388 }, { "epoch": 0.1156206691744209, "grad_norm": 1.343222975730896, "learning_rate": 5e-05, "llm_loss": 0.6491066813468933, "loss": 3.1622, "loss_aux_layer_0": 0.0311279296875, "loss_aux_layer_1": 0.0955810546875, "loss_aux_layer_10": 0.1046142578125, "loss_aux_layer_11": 0.1104736328125, "loss_aux_layer_12": 0.119873046875, "loss_aux_layer_13": 0.130859375, "loss_aux_layer_14": 0.148193359375, "loss_aux_layer_15": 0.163330078125, "loss_aux_layer_16": 0.177734375, "loss_aux_layer_17": 0.183837890625, "loss_aux_layer_18": 0.193359375, "loss_aux_layer_19": 0.19482421875, "loss_aux_layer_2": 0.0970458984375, "loss_aux_layer_20": 0.2001953125, "loss_aux_layer_21": 0.20458984375, "loss_aux_layer_22": 0.225830078125, "loss_aux_layer_23": 0.27099609375, "loss_aux_layer_3": 0.1044921875, "loss_aux_layer_4": 0.1060791015625, "loss_aux_layer_5": 0.1080322265625, "loss_aux_layer_6": 0.1102294921875, "loss_aux_layer_7": 0.1053466796875, "loss_aux_layer_8": 0.1044921875, "loss_aux_layer_9": 0.1036376953125, "step": 584, "total_loss": 0.79053995013237 }, { "epoch": 0.11581864977232231, "grad_norm": 1.187902569770813, "learning_rate": 5e-05, "llm_loss": 0.6483486592769623, "loss": 3.1651, "loss_aux_layer_0": 0.030914306640625, "loss_aux_layer_1": 0.10400390625, "loss_aux_layer_10": 0.1082763671875, "loss_aux_layer_11": 0.1146240234375, "loss_aux_layer_12": 0.1236572265625, "loss_aux_layer_13": 0.133544921875, "loss_aux_layer_14": 0.14892578125, "loss_aux_layer_15": 0.162109375, "loss_aux_layer_16": 0.1748046875, "loss_aux_layer_17": 0.179931640625, "loss_aux_layer_18": 0.18896484375, "loss_aux_layer_19": 0.1884765625, "loss_aux_layer_2": 0.103759765625, "loss_aux_layer_20": 0.193115234375, "loss_aux_layer_21": 0.199462890625, "loss_aux_layer_22": 0.22119140625, "loss_aux_layer_23": 0.26513671875, "loss_aux_layer_3": 0.112060546875, "loss_aux_layer_4": 0.11376953125, "loss_aux_layer_5": 0.1153564453125, "loss_aux_layer_6": 0.1177978515625, "loss_aux_layer_7": 0.1119384765625, "loss_aux_layer_8": 0.109619140625, "loss_aux_layer_9": 0.10791015625, "step": 585, "total_loss": 0.7912683486938477 }, { "epoch": 0.11601663037022372, "grad_norm": 1.5131927728652954, "learning_rate": 5e-05, "llm_loss": 0.6284972429275513, "loss": 3.0805, "loss_aux_layer_0": 0.030029296875, "loss_aux_layer_1": 0.0958251953125, "loss_aux_layer_10": 0.1046142578125, "loss_aux_layer_11": 0.1109619140625, "loss_aux_layer_12": 0.120361328125, "loss_aux_layer_13": 0.132080078125, "loss_aux_layer_14": 0.148681640625, "loss_aux_layer_15": 0.16357421875, "loss_aux_layer_16": 0.17822265625, "loss_aux_layer_17": 0.183837890625, "loss_aux_layer_18": 0.192626953125, "loss_aux_layer_19": 0.193603515625, "loss_aux_layer_2": 0.098388671875, "loss_aux_layer_20": 0.198974609375, "loss_aux_layer_21": 0.20458984375, "loss_aux_layer_22": 0.225341796875, "loss_aux_layer_23": 0.27001953125, "loss_aux_layer_3": 0.1055908203125, "loss_aux_layer_4": 0.10693359375, "loss_aux_layer_5": 0.10888671875, "loss_aux_layer_6": 0.111083984375, "loss_aux_layer_7": 0.10595703125, "loss_aux_layer_8": 0.104736328125, "loss_aux_layer_9": 0.103515625, "step": 586, "total_loss": 0.7701172530651093 }, { "epoch": 0.11621461096812512, "grad_norm": 1.0979546308517456, "learning_rate": 5e-05, "llm_loss": 0.6861996278166771, "loss": 3.3152, "loss_aux_layer_0": 0.0291748046875, "loss_aux_layer_1": 0.1011962890625, "loss_aux_layer_10": 0.1085205078125, "loss_aux_layer_11": 0.11474609375, "loss_aux_layer_12": 0.1236572265625, "loss_aux_layer_13": 0.13427734375, "loss_aux_layer_14": 0.1494140625, "loss_aux_layer_15": 0.162841796875, "loss_aux_layer_16": 0.175537109375, "loss_aux_layer_17": 0.1806640625, "loss_aux_layer_18": 0.188232421875, "loss_aux_layer_19": 0.1875, "loss_aux_layer_2": 0.10205078125, "loss_aux_layer_20": 0.19287109375, "loss_aux_layer_21": 0.19921875, "loss_aux_layer_22": 0.223388671875, "loss_aux_layer_23": 0.267578125, "loss_aux_layer_3": 0.110107421875, "loss_aux_layer_4": 0.1124267578125, "loss_aux_layer_5": 0.1141357421875, "loss_aux_layer_6": 0.11669921875, "loss_aux_layer_7": 0.111083984375, "loss_aux_layer_8": 0.109375, "loss_aux_layer_9": 0.10791015625, "step": 587, "total_loss": 0.8288041055202484 }, { "epoch": 0.11641259156602653, "grad_norm": 1.2808223962783813, "learning_rate": 5e-05, "llm_loss": 0.6327465772628784, "loss": 3.1056, "loss_aux_layer_0": 0.030059814453125, "loss_aux_layer_1": 0.1002197265625, "loss_aux_layer_10": 0.1087646484375, "loss_aux_layer_11": 0.1153564453125, "loss_aux_layer_12": 0.124267578125, "loss_aux_layer_13": 0.135009765625, "loss_aux_layer_14": 0.150390625, "loss_aux_layer_15": 0.164306640625, "loss_aux_layer_16": 0.177490234375, "loss_aux_layer_17": 0.1826171875, "loss_aux_layer_18": 0.190673828125, "loss_aux_layer_19": 0.190673828125, "loss_aux_layer_2": 0.1031494140625, "loss_aux_layer_20": 0.19580078125, "loss_aux_layer_21": 0.200927734375, "loss_aux_layer_22": 0.22412109375, "loss_aux_layer_23": 0.2685546875, "loss_aux_layer_3": 0.1109619140625, "loss_aux_layer_4": 0.1126708984375, "loss_aux_layer_5": 0.1146240234375, "loss_aux_layer_6": 0.117431640625, "loss_aux_layer_7": 0.1116943359375, "loss_aux_layer_8": 0.10986328125, "loss_aux_layer_9": 0.1082763671875, "step": 588, "total_loss": 0.7763991802930832 }, { "epoch": 0.11661057216392794, "grad_norm": 1.508224368095398, "learning_rate": 5e-05, "llm_loss": 0.6072740256786346, "loss": 2.9952, "loss_aux_layer_0": 0.0301513671875, "loss_aux_layer_1": 0.095947265625, "loss_aux_layer_10": 0.105224609375, "loss_aux_layer_11": 0.1114501953125, "loss_aux_layer_12": 0.1204833984375, "loss_aux_layer_13": 0.1304931640625, "loss_aux_layer_14": 0.147705078125, "loss_aux_layer_15": 0.162109375, "loss_aux_layer_16": 0.17626953125, "loss_aux_layer_17": 0.1826171875, "loss_aux_layer_18": 0.19189453125, "loss_aux_layer_19": 0.192626953125, "loss_aux_layer_2": 0.0982666015625, "loss_aux_layer_20": 0.197998046875, "loss_aux_layer_21": 0.203125, "loss_aux_layer_22": 0.224609375, "loss_aux_layer_23": 0.27001953125, "loss_aux_layer_3": 0.106689453125, "loss_aux_layer_4": 0.108154296875, "loss_aux_layer_5": 0.110107421875, "loss_aux_layer_6": 0.1124267578125, "loss_aux_layer_7": 0.1072998046875, "loss_aux_layer_8": 0.1060791015625, "loss_aux_layer_9": 0.1048583984375, "step": 589, "total_loss": 0.7488024830818176 }, { "epoch": 0.11680855276182935, "grad_norm": 1.8982124328613281, "learning_rate": 5e-05, "llm_loss": 0.5296313837170601, "loss": 2.6792, "loss_aux_layer_0": 0.028778076171875, "loss_aux_layer_1": 0.093994140625, "loss_aux_layer_10": 0.1048583984375, "loss_aux_layer_11": 0.1107177734375, "loss_aux_layer_12": 0.1199951171875, "loss_aux_layer_13": 0.1307373046875, "loss_aux_layer_14": 0.14697265625, "loss_aux_layer_15": 0.1611328125, "loss_aux_layer_16": 0.174560546875, "loss_aux_layer_17": 0.179931640625, "loss_aux_layer_18": 0.188720703125, "loss_aux_layer_19": 0.190673828125, "loss_aux_layer_2": 0.097412109375, "loss_aux_layer_20": 0.195556640625, "loss_aux_layer_21": 0.20068359375, "loss_aux_layer_22": 0.2216796875, "loss_aux_layer_23": 0.26806640625, "loss_aux_layer_3": 0.10546875, "loss_aux_layer_4": 0.107421875, "loss_aux_layer_5": 0.1094970703125, "loss_aux_layer_6": 0.1119384765625, "loss_aux_layer_7": 0.106689453125, "loss_aux_layer_8": 0.1053466796875, "loss_aux_layer_9": 0.103759765625, "step": 590, "total_loss": 0.6698061525821686 }, { "epoch": 0.11700653335973074, "grad_norm": 1.8981924057006836, "learning_rate": 5e-05, "llm_loss": 0.6960936039686203, "loss": 3.3425, "loss_aux_layer_0": 0.0291748046875, "loss_aux_layer_1": 0.09814453125, "loss_aux_layer_10": 0.1041259765625, "loss_aux_layer_11": 0.1102294921875, "loss_aux_layer_12": 0.119384765625, "loss_aux_layer_13": 0.129638671875, "loss_aux_layer_14": 0.145751953125, "loss_aux_layer_15": 0.159912109375, "loss_aux_layer_16": 0.173828125, "loss_aux_layer_17": 0.1796875, "loss_aux_layer_18": 0.18798828125, "loss_aux_layer_19": 0.188720703125, "loss_aux_layer_2": 0.098876953125, "loss_aux_layer_20": 0.193359375, "loss_aux_layer_21": 0.197265625, "loss_aux_layer_22": 0.21875, "loss_aux_layer_23": 0.26171875, "loss_aux_layer_3": 0.10693359375, "loss_aux_layer_4": 0.1083984375, "loss_aux_layer_5": 0.1099853515625, "loss_aux_layer_6": 0.112060546875, "loss_aux_layer_7": 0.1068115234375, "loss_aux_layer_8": 0.104736328125, "loss_aux_layer_9": 0.103271484375, "step": 591, "total_loss": 0.8356223702430725 }, { "epoch": 0.11720451395763215, "grad_norm": 2.028632402420044, "learning_rate": 5e-05, "llm_loss": 0.6172656267881393, "loss": 3.0403, "loss_aux_layer_0": 0.029571533203125, "loss_aux_layer_1": 0.0985107421875, "loss_aux_layer_10": 0.1065673828125, "loss_aux_layer_11": 0.11328125, "loss_aux_layer_12": 0.123046875, "loss_aux_layer_13": 0.13330078125, "loss_aux_layer_14": 0.149658203125, "loss_aux_layer_15": 0.16455078125, "loss_aux_layer_16": 0.177734375, "loss_aux_layer_17": 0.18408203125, "loss_aux_layer_18": 0.193115234375, "loss_aux_layer_19": 0.193359375, "loss_aux_layer_2": 0.1005859375, "loss_aux_layer_20": 0.198486328125, "loss_aux_layer_21": 0.202392578125, "loss_aux_layer_22": 0.2236328125, "loss_aux_layer_23": 0.26806640625, "loss_aux_layer_3": 0.108642578125, "loss_aux_layer_4": 0.1102294921875, "loss_aux_layer_5": 0.1119384765625, "loss_aux_layer_6": 0.1142578125, "loss_aux_layer_7": 0.1087646484375, "loss_aux_layer_8": 0.10693359375, "loss_aux_layer_9": 0.1058349609375, "step": 592, "total_loss": 0.7600734382867813 }, { "epoch": 0.11740249455553356, "grad_norm": 1.9610068798065186, "learning_rate": 5e-05, "llm_loss": 0.7098450660705566, "loss": 3.4153, "loss_aux_layer_0": 0.0308837890625, "loss_aux_layer_1": 0.102294921875, "loss_aux_layer_10": 0.1097412109375, "loss_aux_layer_11": 0.116943359375, "loss_aux_layer_12": 0.1265869140625, "loss_aux_layer_13": 0.136474609375, "loss_aux_layer_14": 0.15185546875, "loss_aux_layer_15": 0.1650390625, "loss_aux_layer_16": 0.177734375, "loss_aux_layer_17": 0.182373046875, "loss_aux_layer_18": 0.190185546875, "loss_aux_layer_19": 0.189697265625, "loss_aux_layer_2": 0.1031494140625, "loss_aux_layer_20": 0.1943359375, "loss_aux_layer_21": 0.199462890625, "loss_aux_layer_22": 0.2236328125, "loss_aux_layer_23": 0.267578125, "loss_aux_layer_3": 0.111572265625, "loss_aux_layer_4": 0.11328125, "loss_aux_layer_5": 0.1151123046875, "loss_aux_layer_6": 0.1175537109375, "loss_aux_layer_7": 0.1114501953125, "loss_aux_layer_8": 0.110107421875, "loss_aux_layer_9": 0.1083984375, "step": 593, "total_loss": 0.8538308441638947 }, { "epoch": 0.11760047515343497, "grad_norm": 1.8953590393066406, "learning_rate": 5e-05, "llm_loss": 0.6642375141382217, "loss": 3.2244, "loss_aux_layer_0": 0.030487060546875, "loss_aux_layer_1": 0.1002197265625, "loss_aux_layer_10": 0.10693359375, "loss_aux_layer_11": 0.113037109375, "loss_aux_layer_12": 0.1221923828125, "loss_aux_layer_13": 0.1318359375, "loss_aux_layer_14": 0.147705078125, "loss_aux_layer_15": 0.16162109375, "loss_aux_layer_16": 0.17529296875, "loss_aux_layer_17": 0.181396484375, "loss_aux_layer_18": 0.1904296875, "loss_aux_layer_19": 0.19140625, "loss_aux_layer_2": 0.100341796875, "loss_aux_layer_20": 0.196533203125, "loss_aux_layer_21": 0.200439453125, "loss_aux_layer_22": 0.220458984375, "loss_aux_layer_23": 0.263671875, "loss_aux_layer_3": 0.109375, "loss_aux_layer_4": 0.1109619140625, "loss_aux_layer_5": 0.1126708984375, "loss_aux_layer_6": 0.1151123046875, "loss_aux_layer_7": 0.1092529296875, "loss_aux_layer_8": 0.1075439453125, "loss_aux_layer_9": 0.10595703125, "step": 594, "total_loss": 0.8061107397079468 }, { "epoch": 0.11779845575133636, "grad_norm": 2.2967727184295654, "learning_rate": 5e-05, "llm_loss": 0.6299433708190918, "loss": 3.0968, "loss_aux_layer_0": 0.029632568359375, "loss_aux_layer_1": 0.100341796875, "loss_aux_layer_10": 0.1094970703125, "loss_aux_layer_11": 0.1156005859375, "loss_aux_layer_12": 0.12451171875, "loss_aux_layer_13": 0.13427734375, "loss_aux_layer_14": 0.150146484375, "loss_aux_layer_15": 0.163818359375, "loss_aux_layer_16": 0.1767578125, "loss_aux_layer_17": 0.183349609375, "loss_aux_layer_18": 0.19189453125, "loss_aux_layer_19": 0.192138671875, "loss_aux_layer_2": 0.103515625, "loss_aux_layer_20": 0.197265625, "loss_aux_layer_21": 0.2021484375, "loss_aux_layer_22": 0.225341796875, "loss_aux_layer_23": 0.26953125, "loss_aux_layer_3": 0.1126708984375, "loss_aux_layer_4": 0.1142578125, "loss_aux_layer_5": 0.1162109375, "loss_aux_layer_6": 0.1182861328125, "loss_aux_layer_7": 0.112548828125, "loss_aux_layer_8": 0.1103515625, "loss_aux_layer_9": 0.1090087890625, "step": 595, "total_loss": 0.774209201335907 }, { "epoch": 0.11799643634923777, "grad_norm": 2.6507441997528076, "learning_rate": 5e-05, "llm_loss": 0.6301014870405197, "loss": 3.0904, "loss_aux_layer_0": 0.031585693359375, "loss_aux_layer_1": 0.1004638671875, "loss_aux_layer_10": 0.109130859375, "loss_aux_layer_11": 0.1153564453125, "loss_aux_layer_12": 0.1241455078125, "loss_aux_layer_13": 0.1336669921875, "loss_aux_layer_14": 0.14892578125, "loss_aux_layer_15": 0.162841796875, "loss_aux_layer_16": 0.176025390625, "loss_aux_layer_17": 0.18115234375, "loss_aux_layer_18": 0.18994140625, "loss_aux_layer_19": 0.1904296875, "loss_aux_layer_2": 0.1026611328125, "loss_aux_layer_20": 0.195068359375, "loss_aux_layer_21": 0.19873046875, "loss_aux_layer_22": 0.21875, "loss_aux_layer_23": 0.2626953125, "loss_aux_layer_3": 0.1097412109375, "loss_aux_layer_4": 0.1116943359375, "loss_aux_layer_5": 0.1136474609375, "loss_aux_layer_6": 0.1158447265625, "loss_aux_layer_7": 0.11083984375, "loss_aux_layer_8": 0.1094970703125, "loss_aux_layer_9": 0.108154296875, "step": 596, "total_loss": 0.7725894749164581 }, { "epoch": 0.11819441694713918, "grad_norm": 1.3312114477157593, "learning_rate": 5e-05, "llm_loss": 0.6924190074205399, "loss": 3.323, "loss_aux_layer_0": 0.03155517578125, "loss_aux_layer_1": 0.0958251953125, "loss_aux_layer_10": 0.10205078125, "loss_aux_layer_11": 0.108154296875, "loss_aux_layer_12": 0.1170654296875, "loss_aux_layer_13": 0.1268310546875, "loss_aux_layer_14": 0.14306640625, "loss_aux_layer_15": 0.157470703125, "loss_aux_layer_16": 0.171630859375, "loss_aux_layer_17": 0.1787109375, "loss_aux_layer_18": 0.188232421875, "loss_aux_layer_19": 0.190185546875, "loss_aux_layer_2": 0.0958251953125, "loss_aux_layer_20": 0.195068359375, "loss_aux_layer_21": 0.2001953125, "loss_aux_layer_22": 0.22021484375, "loss_aux_layer_23": 0.265625, "loss_aux_layer_3": 0.103515625, "loss_aux_layer_4": 0.1053466796875, "loss_aux_layer_5": 0.1072998046875, "loss_aux_layer_6": 0.109619140625, "loss_aux_layer_7": 0.10400390625, "loss_aux_layer_8": 0.1025390625, "loss_aux_layer_9": 0.1014404296875, "step": 597, "total_loss": 0.8307467699050903 }, { "epoch": 0.11839239754504059, "grad_norm": 1.8587298393249512, "learning_rate": 5e-05, "llm_loss": 0.656616285443306, "loss": 3.2104, "loss_aux_layer_0": 0.02960205078125, "loss_aux_layer_1": 0.1026611328125, "loss_aux_layer_10": 0.111328125, "loss_aux_layer_11": 0.118408203125, "loss_aux_layer_12": 0.128173828125, "loss_aux_layer_13": 0.1376953125, "loss_aux_layer_14": 0.154052734375, "loss_aux_layer_15": 0.168212890625, "loss_aux_layer_16": 0.181640625, "loss_aux_layer_17": 0.186767578125, "loss_aux_layer_18": 0.194091796875, "loss_aux_layer_19": 0.193359375, "loss_aux_layer_2": 0.104248046875, "loss_aux_layer_20": 0.1982421875, "loss_aux_layer_21": 0.20263671875, "loss_aux_layer_22": 0.22509765625, "loss_aux_layer_23": 0.2685546875, "loss_aux_layer_3": 0.113037109375, "loss_aux_layer_4": 0.1151123046875, "loss_aux_layer_5": 0.11669921875, "loss_aux_layer_6": 0.11962890625, "loss_aux_layer_7": 0.1136474609375, "loss_aux_layer_8": 0.1116943359375, "loss_aux_layer_9": 0.1099853515625, "step": 598, "total_loss": 0.8026055693626404 }, { "epoch": 0.11859037814294199, "grad_norm": 1.3649351596832275, "learning_rate": 5e-05, "llm_loss": 0.5846141278743744, "loss": 2.9035, "loss_aux_layer_0": 0.03106689453125, "loss_aux_layer_1": 0.09814453125, "loss_aux_layer_10": 0.1065673828125, "loss_aux_layer_11": 0.112548828125, "loss_aux_layer_12": 0.1212158203125, "loss_aux_layer_13": 0.1307373046875, "loss_aux_layer_14": 0.146484375, "loss_aux_layer_15": 0.16064453125, "loss_aux_layer_16": 0.174072265625, "loss_aux_layer_17": 0.17919921875, "loss_aux_layer_18": 0.187744140625, "loss_aux_layer_19": 0.188720703125, "loss_aux_layer_2": 0.1007080078125, "loss_aux_layer_20": 0.193359375, "loss_aux_layer_21": 0.19970703125, "loss_aux_layer_22": 0.222900390625, "loss_aux_layer_23": 0.26708984375, "loss_aux_layer_3": 0.109130859375, "loss_aux_layer_4": 0.110595703125, "loss_aux_layer_5": 0.1119384765625, "loss_aux_layer_6": 0.11474609375, "loss_aux_layer_7": 0.1090087890625, "loss_aux_layer_8": 0.107421875, "loss_aux_layer_9": 0.10595703125, "step": 599, "total_loss": 0.7258830219507217 }, { "epoch": 0.1187883587408434, "grad_norm": 0.8989679217338562, "learning_rate": 5e-05, "llm_loss": 0.5830351263284683, "loss": 2.8885, "loss_aux_layer_0": 0.030364990234375, "loss_aux_layer_1": 0.0980224609375, "loss_aux_layer_10": 0.10498046875, "loss_aux_layer_11": 0.111328125, "loss_aux_layer_12": 0.1207275390625, "loss_aux_layer_13": 0.1302490234375, "loss_aux_layer_14": 0.145751953125, "loss_aux_layer_15": 0.15966796875, "loss_aux_layer_16": 0.17236328125, "loss_aux_layer_17": 0.17822265625, "loss_aux_layer_18": 0.1865234375, "loss_aux_layer_19": 0.185791015625, "loss_aux_layer_2": 0.0986328125, "loss_aux_layer_20": 0.19140625, "loss_aux_layer_21": 0.196044921875, "loss_aux_layer_22": 0.21630859375, "loss_aux_layer_23": 0.25927734375, "loss_aux_layer_3": 0.1064453125, "loss_aux_layer_4": 0.1080322265625, "loss_aux_layer_5": 0.109375, "loss_aux_layer_6": 0.1119384765625, "loss_aux_layer_7": 0.1064453125, "loss_aux_layer_8": 0.1051025390625, "loss_aux_layer_9": 0.1038818359375, "step": 600, "total_loss": 0.7221217006444931 }, { "epoch": 0.1189863393387448, "grad_norm": 1.5471792221069336, "learning_rate": 5e-05, "llm_loss": 0.5467498004436493, "loss": 2.7701, "loss_aux_layer_0": 0.029937744140625, "loss_aux_layer_1": 0.1016845703125, "loss_aux_layer_10": 0.109619140625, "loss_aux_layer_11": 0.116455078125, "loss_aux_layer_12": 0.125732421875, "loss_aux_layer_13": 0.1357421875, "loss_aux_layer_14": 0.15185546875, "loss_aux_layer_15": 0.16650390625, "loss_aux_layer_16": 0.18017578125, "loss_aux_layer_17": 0.185302734375, "loss_aux_layer_18": 0.1943359375, "loss_aux_layer_19": 0.19482421875, "loss_aux_layer_2": 0.1048583984375, "loss_aux_layer_20": 0.2001953125, "loss_aux_layer_21": 0.20458984375, "loss_aux_layer_22": 0.22705078125, "loss_aux_layer_23": 0.27197265625, "loss_aux_layer_3": 0.113525390625, "loss_aux_layer_4": 0.1148681640625, "loss_aux_layer_5": 0.1163330078125, "loss_aux_layer_6": 0.11865234375, "loss_aux_layer_7": 0.1129150390625, "loss_aux_layer_8": 0.1109619140625, "loss_aux_layer_9": 0.109130859375, "step": 601, "total_loss": 0.6925130486488342 }, { "epoch": 0.1191843199366462, "grad_norm": 2.0245161056518555, "learning_rate": 5e-05, "llm_loss": 0.6981157809495926, "loss": 3.3714, "loss_aux_layer_0": 0.029571533203125, "loss_aux_layer_1": 0.1015625, "loss_aux_layer_10": 0.1102294921875, "loss_aux_layer_11": 0.1165771484375, "loss_aux_layer_12": 0.1258544921875, "loss_aux_layer_13": 0.1357421875, "loss_aux_layer_14": 0.152099609375, "loss_aux_layer_15": 0.165771484375, "loss_aux_layer_16": 0.178466796875, "loss_aux_layer_17": 0.18310546875, "loss_aux_layer_18": 0.19140625, "loss_aux_layer_19": 0.19091796875, "loss_aux_layer_2": 0.1029052734375, "loss_aux_layer_20": 0.19482421875, "loss_aux_layer_21": 0.20068359375, "loss_aux_layer_22": 0.22412109375, "loss_aux_layer_23": 0.2685546875, "loss_aux_layer_3": 0.1129150390625, "loss_aux_layer_4": 0.1148681640625, "loss_aux_layer_5": 0.1171875, "loss_aux_layer_6": 0.1201171875, "loss_aux_layer_7": 0.11376953125, "loss_aux_layer_8": 0.112060546875, "loss_aux_layer_9": 0.10986328125, "step": 602, "total_loss": 0.8428419381380081 }, { "epoch": 0.11938230053454761, "grad_norm": 1.562103271484375, "learning_rate": 5e-05, "llm_loss": 0.6588421165943146, "loss": 3.1886, "loss_aux_layer_0": 0.0318603515625, "loss_aux_layer_1": 0.0950927734375, "loss_aux_layer_10": 0.102294921875, "loss_aux_layer_11": 0.1087646484375, "loss_aux_layer_12": 0.1180419921875, "loss_aux_layer_13": 0.12890625, "loss_aux_layer_14": 0.14453125, "loss_aux_layer_15": 0.159423828125, "loss_aux_layer_16": 0.173095703125, "loss_aux_layer_17": 0.178955078125, "loss_aux_layer_18": 0.187255859375, "loss_aux_layer_19": 0.18798828125, "loss_aux_layer_2": 0.0955810546875, "loss_aux_layer_20": 0.192626953125, "loss_aux_layer_21": 0.197998046875, "loss_aux_layer_22": 0.2216796875, "loss_aux_layer_23": 0.2666015625, "loss_aux_layer_3": 0.10302734375, "loss_aux_layer_4": 0.1046142578125, "loss_aux_layer_5": 0.106201171875, "loss_aux_layer_6": 0.1087646484375, "loss_aux_layer_7": 0.103759765625, "loss_aux_layer_8": 0.1025390625, "loss_aux_layer_9": 0.10107421875, "step": 603, "total_loss": 0.7971496880054474 }, { "epoch": 0.11958028113244902, "grad_norm": 1.2408298254013062, "learning_rate": 5e-05, "llm_loss": 0.6927974969148636, "loss": 3.3478, "loss_aux_layer_0": 0.02911376953125, "loss_aux_layer_1": 0.1031494140625, "loss_aux_layer_10": 0.110595703125, "loss_aux_layer_11": 0.1173095703125, "loss_aux_layer_12": 0.126220703125, "loss_aux_layer_13": 0.1357421875, "loss_aux_layer_14": 0.150390625, "loss_aux_layer_15": 0.1640625, "loss_aux_layer_16": 0.17626953125, "loss_aux_layer_17": 0.181884765625, "loss_aux_layer_18": 0.18994140625, "loss_aux_layer_19": 0.189453125, "loss_aux_layer_2": 0.104736328125, "loss_aux_layer_20": 0.194091796875, "loss_aux_layer_21": 0.198486328125, "loss_aux_layer_22": 0.219970703125, "loss_aux_layer_23": 0.2626953125, "loss_aux_layer_3": 0.1141357421875, "loss_aux_layer_4": 0.1162109375, "loss_aux_layer_5": 0.1180419921875, "loss_aux_layer_6": 0.1202392578125, "loss_aux_layer_7": 0.1142578125, "loss_aux_layer_8": 0.1121826171875, "loss_aux_layer_9": 0.110107421875, "step": 604, "total_loss": 0.8369612842798233 }, { "epoch": 0.11977826173035043, "grad_norm": 1.624366044998169, "learning_rate": 5e-05, "llm_loss": 0.6506352722644806, "loss": 3.1635, "loss_aux_layer_0": 0.0291748046875, "loss_aux_layer_1": 0.09619140625, "loss_aux_layer_10": 0.1058349609375, "loss_aux_layer_11": 0.112060546875, "loss_aux_layer_12": 0.1214599609375, "loss_aux_layer_13": 0.131591796875, "loss_aux_layer_14": 0.147216796875, "loss_aux_layer_15": 0.16064453125, "loss_aux_layer_16": 0.174072265625, "loss_aux_layer_17": 0.1806640625, "loss_aux_layer_18": 0.1884765625, "loss_aux_layer_19": 0.1884765625, "loss_aux_layer_2": 0.097412109375, "loss_aux_layer_20": 0.193115234375, "loss_aux_layer_21": 0.197998046875, "loss_aux_layer_22": 0.220458984375, "loss_aux_layer_23": 0.2646484375, "loss_aux_layer_3": 0.1064453125, "loss_aux_layer_4": 0.1087646484375, "loss_aux_layer_5": 0.1104736328125, "loss_aux_layer_6": 0.113037109375, "loss_aux_layer_7": 0.107421875, "loss_aux_layer_8": 0.1058349609375, "loss_aux_layer_9": 0.10498046875, "step": 605, "total_loss": 0.7908816784620285 }, { "epoch": 0.11997624232825183, "grad_norm": 1.4733434915542603, "learning_rate": 5e-05, "llm_loss": 0.730141893029213, "loss": 3.4788, "loss_aux_layer_0": 0.030517578125, "loss_aux_layer_1": 0.0985107421875, "loss_aux_layer_10": 0.1041259765625, "loss_aux_layer_11": 0.110107421875, "loss_aux_layer_12": 0.1197509765625, "loss_aux_layer_13": 0.130126953125, "loss_aux_layer_14": 0.146240234375, "loss_aux_layer_15": 0.16015625, "loss_aux_layer_16": 0.174072265625, "loss_aux_layer_17": 0.180419921875, "loss_aux_layer_18": 0.189697265625, "loss_aux_layer_19": 0.189453125, "loss_aux_layer_2": 0.09912109375, "loss_aux_layer_20": 0.193359375, "loss_aux_layer_21": 0.196533203125, "loss_aux_layer_22": 0.2158203125, "loss_aux_layer_23": 0.26025390625, "loss_aux_layer_3": 0.1070556640625, "loss_aux_layer_4": 0.1083984375, "loss_aux_layer_5": 0.1097412109375, "loss_aux_layer_6": 0.112060546875, "loss_aux_layer_7": 0.106201171875, "loss_aux_layer_8": 0.104736328125, "loss_aux_layer_9": 0.103515625, "step": 606, "total_loss": 0.8697118163108826 }, { "epoch": 0.12017422292615323, "grad_norm": 4.126917362213135, "learning_rate": 5e-05, "llm_loss": 0.6031298264861107, "loss": 2.9817, "loss_aux_layer_0": 0.029632568359375, "loss_aux_layer_1": 0.0985107421875, "loss_aux_layer_10": 0.109130859375, "loss_aux_layer_11": 0.1146240234375, "loss_aux_layer_12": 0.1236572265625, "loss_aux_layer_13": 0.1326904296875, "loss_aux_layer_14": 0.14794921875, "loss_aux_layer_15": 0.161865234375, "loss_aux_layer_16": 0.175048828125, "loss_aux_layer_17": 0.179443359375, "loss_aux_layer_18": 0.1884765625, "loss_aux_layer_19": 0.18798828125, "loss_aux_layer_2": 0.1031494140625, "loss_aux_layer_20": 0.192626953125, "loss_aux_layer_21": 0.197998046875, "loss_aux_layer_22": 0.220703125, "loss_aux_layer_23": 0.265625, "loss_aux_layer_3": 0.1109619140625, "loss_aux_layer_4": 0.1126708984375, "loss_aux_layer_5": 0.11474609375, "loss_aux_layer_6": 0.1162109375, "loss_aux_layer_7": 0.112060546875, "loss_aux_layer_8": 0.1104736328125, "loss_aux_layer_9": 0.1090087890625, "step": 607, "total_loss": 0.7454226464033127 }, { "epoch": 0.12037220352405464, "grad_norm": 6.289308071136475, "learning_rate": 5e-05, "llm_loss": 0.6688730716705322, "loss": 3.2627, "loss_aux_layer_0": 0.03253173828125, "loss_aux_layer_1": 0.1046142578125, "loss_aux_layer_10": 0.1143798828125, "loss_aux_layer_11": 0.1201171875, "loss_aux_layer_12": 0.128662109375, "loss_aux_layer_13": 0.1376953125, "loss_aux_layer_14": 0.15234375, "loss_aux_layer_15": 0.165283203125, "loss_aux_layer_16": 0.17724609375, "loss_aux_layer_17": 0.18115234375, "loss_aux_layer_18": 0.188232421875, "loss_aux_layer_19": 0.18701171875, "loss_aux_layer_2": 0.1099853515625, "loss_aux_layer_20": 0.192138671875, "loss_aux_layer_21": 0.197265625, "loss_aux_layer_22": 0.218994140625, "loss_aux_layer_23": 0.26220703125, "loss_aux_layer_3": 0.1234130859375, "loss_aux_layer_4": 0.1243896484375, "loss_aux_layer_5": 0.1265869140625, "loss_aux_layer_6": 0.1256103515625, "loss_aux_layer_7": 0.1217041015625, "loss_aux_layer_8": 0.1171875, "loss_aux_layer_9": 0.1148681640625, "step": 608, "total_loss": 0.8156765997409821 }, { "epoch": 0.12057018412195605, "grad_norm": 5.025254249572754, "learning_rate": 5e-05, "llm_loss": 0.5856132358312607, "loss": 2.9346, "loss_aux_layer_0": 0.030975341796875, "loss_aux_layer_1": 0.0986328125, "loss_aux_layer_10": 0.1129150390625, "loss_aux_layer_11": 0.1195068359375, "loss_aux_layer_12": 0.1287841796875, "loss_aux_layer_13": 0.138916015625, "loss_aux_layer_14": 0.15380859375, "loss_aux_layer_15": 0.167236328125, "loss_aux_layer_16": 0.180419921875, "loss_aux_layer_17": 0.185302734375, "loss_aux_layer_18": 0.193603515625, "loss_aux_layer_19": 0.1923828125, "loss_aux_layer_2": 0.1063232421875, "loss_aux_layer_20": 0.19580078125, "loss_aux_layer_21": 0.199462890625, "loss_aux_layer_22": 0.219970703125, "loss_aux_layer_23": 0.26416015625, "loss_aux_layer_3": 0.128173828125, "loss_aux_layer_4": 0.1285400390625, "loss_aux_layer_5": 0.1312255859375, "loss_aux_layer_6": 0.1298828125, "loss_aux_layer_7": 0.1185302734375, "loss_aux_layer_8": 0.1153564453125, "loss_aux_layer_9": 0.1124267578125, "step": 609, "total_loss": 0.7336412966251373 }, { "epoch": 0.12076816471985745, "grad_norm": 2.58703351020813, "learning_rate": 5e-05, "llm_loss": 0.7012058943510056, "loss": 3.3595, "loss_aux_layer_0": 0.028228759765625, "loss_aux_layer_1": 0.0938720703125, "loss_aux_layer_10": 0.10498046875, "loss_aux_layer_11": 0.1102294921875, "loss_aux_layer_12": 0.1197509765625, "loss_aux_layer_13": 0.1297607421875, "loss_aux_layer_14": 0.14501953125, "loss_aux_layer_15": 0.159423828125, "loss_aux_layer_16": 0.173583984375, "loss_aux_layer_17": 0.179931640625, "loss_aux_layer_18": 0.187744140625, "loss_aux_layer_19": 0.187255859375, "loss_aux_layer_2": 0.0958251953125, "loss_aux_layer_20": 0.19189453125, "loss_aux_layer_21": 0.1943359375, "loss_aux_layer_22": 0.21337890625, "loss_aux_layer_23": 0.2568359375, "loss_aux_layer_3": 0.108642578125, "loss_aux_layer_4": 0.10986328125, "loss_aux_layer_5": 0.1114501953125, "loss_aux_layer_6": 0.1119384765625, "loss_aux_layer_7": 0.105712890625, "loss_aux_layer_8": 0.1046142578125, "loss_aux_layer_9": 0.103759765625, "step": 610, "total_loss": 0.8398672640323639 }, { "epoch": 0.12096614531775886, "grad_norm": 9.601920127868652, "learning_rate": 5e-05, "llm_loss": 0.6630088835954666, "loss": 3.2591, "loss_aux_layer_0": 0.03338623046875, "loss_aux_layer_1": 0.1036376953125, "loss_aux_layer_10": 0.120849609375, "loss_aux_layer_11": 0.126953125, "loss_aux_layer_12": 0.13623046875, "loss_aux_layer_13": 0.14599609375, "loss_aux_layer_14": 0.160888671875, "loss_aux_layer_15": 0.173095703125, "loss_aux_layer_16": 0.185302734375, "loss_aux_layer_17": 0.189697265625, "loss_aux_layer_18": 0.1962890625, "loss_aux_layer_19": 0.194580078125, "loss_aux_layer_2": 0.1121826171875, "loss_aux_layer_20": 0.198486328125, "loss_aux_layer_21": 0.20166015625, "loss_aux_layer_22": 0.222412109375, "loss_aux_layer_23": 0.26513671875, "loss_aux_layer_3": 0.1224365234375, "loss_aux_layer_4": 0.123779296875, "loss_aux_layer_5": 0.1275634765625, "loss_aux_layer_6": 0.131591796875, "loss_aux_layer_7": 0.127197265625, "loss_aux_layer_8": 0.123291015625, "loss_aux_layer_9": 0.119384765625, "step": 611, "total_loss": 0.8147790282964706 }, { "epoch": 0.12116412591566027, "grad_norm": 6.500195026397705, "learning_rate": 5e-05, "llm_loss": 0.6317166835069656, "loss": 3.1696, "loss_aux_layer_0": 0.031463623046875, "loss_aux_layer_1": 0.1046142578125, "loss_aux_layer_10": 0.1265869140625, "loss_aux_layer_11": 0.132568359375, "loss_aux_layer_12": 0.142333984375, "loss_aux_layer_13": 0.1513671875, "loss_aux_layer_14": 0.16650390625, "loss_aux_layer_15": 0.18017578125, "loss_aux_layer_16": 0.193603515625, "loss_aux_layer_17": 0.197998046875, "loss_aux_layer_18": 0.20703125, "loss_aux_layer_19": 0.204833984375, "loss_aux_layer_2": 0.1165771484375, "loss_aux_layer_20": 0.2080078125, "loss_aux_layer_21": 0.21044921875, "loss_aux_layer_22": 0.2314453125, "loss_aux_layer_23": 0.2744140625, "loss_aux_layer_3": 0.147705078125, "loss_aux_layer_4": 0.147705078125, "loss_aux_layer_5": 0.15087890625, "loss_aux_layer_6": 0.146240234375, "loss_aux_layer_7": 0.130615234375, "loss_aux_layer_8": 0.1273193359375, "loss_aux_layer_9": 0.125244140625, "step": 612, "total_loss": 0.7923890352249146 }, { "epoch": 0.12136210651356168, "grad_norm": 4.382494926452637, "learning_rate": 5e-05, "llm_loss": 0.6612049043178558, "loss": 3.2624, "loss_aux_layer_0": 0.02886962890625, "loss_aux_layer_1": 0.1005859375, "loss_aux_layer_10": 0.1214599609375, "loss_aux_layer_11": 0.125244140625, "loss_aux_layer_12": 0.13427734375, "loss_aux_layer_13": 0.143310546875, "loss_aux_layer_14": 0.15771484375, "loss_aux_layer_15": 0.171630859375, "loss_aux_layer_16": 0.1845703125, "loss_aux_layer_17": 0.1904296875, "loss_aux_layer_18": 0.1982421875, "loss_aux_layer_19": 0.19873046875, "loss_aux_layer_2": 0.10546875, "loss_aux_layer_20": 0.202880859375, "loss_aux_layer_21": 0.20556640625, "loss_aux_layer_22": 0.22607421875, "loss_aux_layer_23": 0.26904296875, "loss_aux_layer_3": 0.138916015625, "loss_aux_layer_4": 0.139892578125, "loss_aux_layer_5": 0.14404296875, "loss_aux_layer_6": 0.1416015625, "loss_aux_layer_7": 0.130126953125, "loss_aux_layer_8": 0.124755859375, "loss_aux_layer_9": 0.1217041015625, "step": 613, "total_loss": 0.8156024068593979 }, { "epoch": 0.12156008711146307, "grad_norm": 2.980473756790161, "learning_rate": 5e-05, "llm_loss": 0.7283374071121216, "loss": 3.5057, "loss_aux_layer_0": 0.0321044921875, "loss_aux_layer_1": 0.0980224609375, "loss_aux_layer_10": 0.1148681640625, "loss_aux_layer_11": 0.119873046875, "loss_aux_layer_12": 0.1297607421875, "loss_aux_layer_13": 0.14013671875, "loss_aux_layer_14": 0.15576171875, "loss_aux_layer_15": 0.169189453125, "loss_aux_layer_16": 0.181884765625, "loss_aux_layer_17": 0.18798828125, "loss_aux_layer_18": 0.195556640625, "loss_aux_layer_19": 0.19482421875, "loss_aux_layer_2": 0.111328125, "loss_aux_layer_20": 0.198974609375, "loss_aux_layer_21": 0.20361328125, "loss_aux_layer_22": 0.22607421875, "loss_aux_layer_23": 0.2705078125, "loss_aux_layer_3": 0.1180419921875, "loss_aux_layer_4": 0.1197509765625, "loss_aux_layer_5": 0.1214599609375, "loss_aux_layer_6": 0.120849609375, "loss_aux_layer_7": 0.1148681640625, "loss_aux_layer_8": 0.1143798828125, "loss_aux_layer_9": 0.11376953125, "step": 614, "total_loss": 0.8764131963253021 }, { "epoch": 0.12175806770936448, "grad_norm": 3.5327084064483643, "learning_rate": 5e-05, "llm_loss": 0.6732756346464157, "loss": 3.2958, "loss_aux_layer_0": 0.032196044921875, "loss_aux_layer_1": 0.1007080078125, "loss_aux_layer_10": 0.1180419921875, "loss_aux_layer_11": 0.12353515625, "loss_aux_layer_12": 0.1328125, "loss_aux_layer_13": 0.14208984375, "loss_aux_layer_14": 0.157470703125, "loss_aux_layer_15": 0.171142578125, "loss_aux_layer_16": 0.18408203125, "loss_aux_layer_17": 0.188720703125, "loss_aux_layer_18": 0.197021484375, "loss_aux_layer_19": 0.1943359375, "loss_aux_layer_2": 0.1246337890625, "loss_aux_layer_20": 0.198974609375, "loss_aux_layer_21": 0.20361328125, "loss_aux_layer_22": 0.2265625, "loss_aux_layer_23": 0.27099609375, "loss_aux_layer_3": 0.1185302734375, "loss_aux_layer_4": 0.12060546875, "loss_aux_layer_5": 0.122314453125, "loss_aux_layer_6": 0.127685546875, "loss_aux_layer_7": 0.121337890625, "loss_aux_layer_8": 0.1212158203125, "loss_aux_layer_9": 0.1181640625, "step": 615, "total_loss": 0.8239472657442093 }, { "epoch": 0.12195604830726589, "grad_norm": 3.604969024658203, "learning_rate": 5e-05, "llm_loss": 0.7140362411737442, "loss": 3.4492, "loss_aux_layer_0": 0.029693603515625, "loss_aux_layer_1": 0.0980224609375, "loss_aux_layer_10": 0.1148681640625, "loss_aux_layer_11": 0.11865234375, "loss_aux_layer_12": 0.12744140625, "loss_aux_layer_13": 0.136962890625, "loss_aux_layer_14": 0.15234375, "loss_aux_layer_15": 0.166748046875, "loss_aux_layer_16": 0.1806640625, "loss_aux_layer_17": 0.18701171875, "loss_aux_layer_18": 0.194580078125, "loss_aux_layer_19": 0.1953125, "loss_aux_layer_2": 0.11669921875, "loss_aux_layer_20": 0.199462890625, "loss_aux_layer_21": 0.202880859375, "loss_aux_layer_22": 0.223876953125, "loss_aux_layer_23": 0.26806640625, "loss_aux_layer_3": 0.118408203125, "loss_aux_layer_4": 0.1197509765625, "loss_aux_layer_5": 0.1224365234375, "loss_aux_layer_6": 0.13037109375, "loss_aux_layer_7": 0.120849609375, "loss_aux_layer_8": 0.118896484375, "loss_aux_layer_9": 0.1146240234375, "step": 616, "total_loss": 0.8623070120811462 }, { "epoch": 0.1221540289051673, "grad_norm": 2.375696897506714, "learning_rate": 5e-05, "llm_loss": 0.6710423678159714, "loss": 3.2737, "loss_aux_layer_0": 0.0301513671875, "loss_aux_layer_1": 0.099609375, "loss_aux_layer_10": 0.1126708984375, "loss_aux_layer_11": 0.1185302734375, "loss_aux_layer_12": 0.127685546875, "loss_aux_layer_13": 0.137451171875, "loss_aux_layer_14": 0.152587890625, "loss_aux_layer_15": 0.166015625, "loss_aux_layer_16": 0.1787109375, "loss_aux_layer_17": 0.18505859375, "loss_aux_layer_18": 0.1943359375, "loss_aux_layer_19": 0.1943359375, "loss_aux_layer_2": 0.10791015625, "loss_aux_layer_20": 0.198974609375, "loss_aux_layer_21": 0.204345703125, "loss_aux_layer_22": 0.22705078125, "loss_aux_layer_23": 0.27197265625, "loss_aux_layer_3": 0.11767578125, "loss_aux_layer_4": 0.118408203125, "loss_aux_layer_5": 0.1207275390625, "loss_aux_layer_6": 0.1243896484375, "loss_aux_layer_7": 0.1177978515625, "loss_aux_layer_8": 0.11669921875, "loss_aux_layer_9": 0.1141357421875, "step": 617, "total_loss": 0.8184368908405304 }, { "epoch": 0.1223520095030687, "grad_norm": 2.7598016262054443, "learning_rate": 5e-05, "llm_loss": 0.6341149061918259, "loss": 3.1379, "loss_aux_layer_0": 0.0316162109375, "loss_aux_layer_1": 0.1011962890625, "loss_aux_layer_10": 0.1168212890625, "loss_aux_layer_11": 0.1236572265625, "loss_aux_layer_12": 0.132568359375, "loss_aux_layer_13": 0.142333984375, "loss_aux_layer_14": 0.157958984375, "loss_aux_layer_15": 0.171875, "loss_aux_layer_16": 0.18505859375, "loss_aux_layer_17": 0.18896484375, "loss_aux_layer_18": 0.19775390625, "loss_aux_layer_19": 0.197509765625, "loss_aux_layer_2": 0.1126708984375, "loss_aux_layer_20": 0.201171875, "loss_aux_layer_21": 0.205078125, "loss_aux_layer_22": 0.228271484375, "loss_aux_layer_23": 0.2724609375, "loss_aux_layer_3": 0.1201171875, "loss_aux_layer_4": 0.1212158203125, "loss_aux_layer_5": 0.123291015625, "loss_aux_layer_6": 0.125, "loss_aux_layer_7": 0.119384765625, "loss_aux_layer_8": 0.116943359375, "loss_aux_layer_9": 0.115478515625, "step": 618, "total_loss": 0.7844756692647934 }, { "epoch": 0.1225499901009701, "grad_norm": 2.469472646713257, "learning_rate": 5e-05, "llm_loss": 0.6054423898458481, "loss": 3.0006, "loss_aux_layer_0": 0.033233642578125, "loss_aux_layer_1": 0.099609375, "loss_aux_layer_10": 0.1107177734375, "loss_aux_layer_11": 0.117431640625, "loss_aux_layer_12": 0.1270751953125, "loss_aux_layer_13": 0.136962890625, "loss_aux_layer_14": 0.15234375, "loss_aux_layer_15": 0.16552734375, "loss_aux_layer_16": 0.178955078125, "loss_aux_layer_17": 0.1845703125, "loss_aux_layer_18": 0.1923828125, "loss_aux_layer_19": 0.19189453125, "loss_aux_layer_2": 0.1083984375, "loss_aux_layer_20": 0.1962890625, "loss_aux_layer_21": 0.19970703125, "loss_aux_layer_22": 0.220703125, "loss_aux_layer_23": 0.26416015625, "loss_aux_layer_3": 0.1126708984375, "loss_aux_layer_4": 0.1134033203125, "loss_aux_layer_5": 0.1151123046875, "loss_aux_layer_6": 0.11767578125, "loss_aux_layer_7": 0.112060546875, "loss_aux_layer_8": 0.11181640625, "loss_aux_layer_9": 0.1099853515625, "step": 619, "total_loss": 0.7501608729362488 }, { "epoch": 0.12274797069887151, "grad_norm": 2.1987619400024414, "learning_rate": 5e-05, "llm_loss": 0.5978272706270218, "loss": 2.9683, "loss_aux_layer_0": 0.029510498046875, "loss_aux_layer_1": 0.0950927734375, "loss_aux_layer_10": 0.1094970703125, "loss_aux_layer_11": 0.115478515625, "loss_aux_layer_12": 0.12451171875, "loss_aux_layer_13": 0.13427734375, "loss_aux_layer_14": 0.15087890625, "loss_aux_layer_15": 0.16455078125, "loss_aux_layer_16": 0.177734375, "loss_aux_layer_17": 0.183837890625, "loss_aux_layer_18": 0.192138671875, "loss_aux_layer_19": 0.193115234375, "loss_aux_layer_2": 0.1048583984375, "loss_aux_layer_20": 0.19921875, "loss_aux_layer_21": 0.20458984375, "loss_aux_layer_22": 0.22607421875, "loss_aux_layer_23": 0.27099609375, "loss_aux_layer_3": 0.11083984375, "loss_aux_layer_4": 0.112060546875, "loss_aux_layer_5": 0.1138916015625, "loss_aux_layer_6": 0.1175537109375, "loss_aux_layer_7": 0.1109619140625, "loss_aux_layer_8": 0.111083984375, "loss_aux_layer_9": 0.1092529296875, "step": 620, "total_loss": 0.7420744895935059 }, { "epoch": 0.12294595129677291, "grad_norm": 2.9881362915039062, "learning_rate": 5e-05, "llm_loss": 0.6434617191553116, "loss": 3.1712, "loss_aux_layer_0": 0.031280517578125, "loss_aux_layer_1": 0.1053466796875, "loss_aux_layer_10": 0.11376953125, "loss_aux_layer_11": 0.12060546875, "loss_aux_layer_12": 0.1297607421875, "loss_aux_layer_13": 0.139892578125, "loss_aux_layer_14": 0.15576171875, "loss_aux_layer_15": 0.169921875, "loss_aux_layer_16": 0.18310546875, "loss_aux_layer_17": 0.189208984375, "loss_aux_layer_18": 0.19677734375, "loss_aux_layer_19": 0.1962890625, "loss_aux_layer_2": 0.112548828125, "loss_aux_layer_20": 0.20068359375, "loss_aux_layer_21": 0.20458984375, "loss_aux_layer_22": 0.22705078125, "loss_aux_layer_23": 0.271484375, "loss_aux_layer_3": 0.120849609375, "loss_aux_layer_4": 0.12158203125, "loss_aux_layer_5": 0.122802734375, "loss_aux_layer_6": 0.125, "loss_aux_layer_7": 0.1170654296875, "loss_aux_layer_8": 0.1156005859375, "loss_aux_layer_9": 0.113525390625, "step": 621, "total_loss": 0.7928052097558975 }, { "epoch": 0.12314393189467432, "grad_norm": 1.409026861190796, "learning_rate": 5e-05, "llm_loss": 0.5969485938549042, "loss": 2.9758, "loss_aux_layer_0": 0.0302734375, "loss_aux_layer_1": 0.1005859375, "loss_aux_layer_10": 0.1126708984375, "loss_aux_layer_11": 0.119873046875, "loss_aux_layer_12": 0.128662109375, "loss_aux_layer_13": 0.138671875, "loss_aux_layer_14": 0.15380859375, "loss_aux_layer_15": 0.16748046875, "loss_aux_layer_16": 0.179931640625, "loss_aux_layer_17": 0.185302734375, "loss_aux_layer_18": 0.19287109375, "loss_aux_layer_19": 0.192626953125, "loss_aux_layer_2": 0.1099853515625, "loss_aux_layer_20": 0.1962890625, "loss_aux_layer_21": 0.201904296875, "loss_aux_layer_22": 0.223876953125, "loss_aux_layer_23": 0.26806640625, "loss_aux_layer_3": 0.119384765625, "loss_aux_layer_4": 0.1204833984375, "loss_aux_layer_5": 0.1214599609375, "loss_aux_layer_6": 0.1224365234375, "loss_aux_layer_7": 0.1162109375, "loss_aux_layer_8": 0.1142578125, "loss_aux_layer_9": 0.1119384765625, "step": 622, "total_loss": 0.7439549416303635 }, { "epoch": 0.12334191249257573, "grad_norm": 1.66435706615448, "learning_rate": 5e-05, "llm_loss": 0.6069232523441315, "loss": 3.006, "loss_aux_layer_0": 0.031005859375, "loss_aux_layer_1": 0.0980224609375, "loss_aux_layer_10": 0.1083984375, "loss_aux_layer_11": 0.11572265625, "loss_aux_layer_12": 0.1240234375, "loss_aux_layer_13": 0.133544921875, "loss_aux_layer_14": 0.149658203125, "loss_aux_layer_15": 0.1640625, "loss_aux_layer_16": 0.17724609375, "loss_aux_layer_17": 0.18310546875, "loss_aux_layer_18": 0.1904296875, "loss_aux_layer_19": 0.19091796875, "loss_aux_layer_2": 0.1070556640625, "loss_aux_layer_20": 0.1962890625, "loss_aux_layer_21": 0.202880859375, "loss_aux_layer_22": 0.226318359375, "loss_aux_layer_23": 0.2734375, "loss_aux_layer_3": 0.1148681640625, "loss_aux_layer_4": 0.1156005859375, "loss_aux_layer_5": 0.1171875, "loss_aux_layer_6": 0.11865234375, "loss_aux_layer_7": 0.1123046875, "loss_aux_layer_8": 0.1103515625, "loss_aux_layer_9": 0.10791015625, "step": 623, "total_loss": 0.751493290066719 }, { "epoch": 0.12353989309047714, "grad_norm": 1.6471275091171265, "learning_rate": 5e-05, "llm_loss": 0.6195299327373505, "loss": 3.0787, "loss_aux_layer_0": 0.032470703125, "loss_aux_layer_1": 0.1080322265625, "loss_aux_layer_10": 0.117431640625, "loss_aux_layer_11": 0.1246337890625, "loss_aux_layer_12": 0.13330078125, "loss_aux_layer_13": 0.141845703125, "loss_aux_layer_14": 0.156494140625, "loss_aux_layer_15": 0.169189453125, "loss_aux_layer_16": 0.180908203125, "loss_aux_layer_17": 0.18603515625, "loss_aux_layer_18": 0.193603515625, "loss_aux_layer_19": 0.193115234375, "loss_aux_layer_2": 0.1148681640625, "loss_aux_layer_20": 0.197021484375, "loss_aux_layer_21": 0.201416015625, "loss_aux_layer_22": 0.2236328125, "loss_aux_layer_23": 0.2666015625, "loss_aux_layer_3": 0.1239013671875, "loss_aux_layer_4": 0.12548828125, "loss_aux_layer_5": 0.1270751953125, "loss_aux_layer_6": 0.128662109375, "loss_aux_layer_7": 0.12158203125, "loss_aux_layer_8": 0.119384765625, "loss_aux_layer_9": 0.116943359375, "step": 624, "total_loss": 0.7696748226881027 }, { "epoch": 0.12373787368837853, "grad_norm": 1.7103878259658813, "learning_rate": 5e-05, "llm_loss": 0.6742608845233917, "loss": 3.2835, "loss_aux_layer_0": 0.0333251953125, "loss_aux_layer_1": 0.1015625, "loss_aux_layer_10": 0.11328125, "loss_aux_layer_11": 0.1197509765625, "loss_aux_layer_12": 0.1292724609375, "loss_aux_layer_13": 0.1392822265625, "loss_aux_layer_14": 0.155517578125, "loss_aux_layer_15": 0.16845703125, "loss_aux_layer_16": 0.180908203125, "loss_aux_layer_17": 0.18603515625, "loss_aux_layer_18": 0.193115234375, "loss_aux_layer_19": 0.192626953125, "loss_aux_layer_2": 0.107177734375, "loss_aux_layer_20": 0.19677734375, "loss_aux_layer_21": 0.19970703125, "loss_aux_layer_22": 0.220703125, "loss_aux_layer_23": 0.26513671875, "loss_aux_layer_3": 0.1153564453125, "loss_aux_layer_4": 0.1171875, "loss_aux_layer_5": 0.1190185546875, "loss_aux_layer_6": 0.12158203125, "loss_aux_layer_7": 0.1156005859375, "loss_aux_layer_8": 0.114501953125, "loss_aux_layer_9": 0.1131591796875, "step": 625, "total_loss": 0.8208671659231186 }, { "epoch": 0.12393585428627994, "grad_norm": 1.4686717987060547, "learning_rate": 5e-05, "llm_loss": 0.6129747033119202, "loss": 3.0431, "loss_aux_layer_0": 0.030181884765625, "loss_aux_layer_1": 0.1029052734375, "loss_aux_layer_10": 0.114501953125, "loss_aux_layer_11": 0.1214599609375, "loss_aux_layer_12": 0.130126953125, "loss_aux_layer_13": 0.140380859375, "loss_aux_layer_14": 0.1552734375, "loss_aux_layer_15": 0.16748046875, "loss_aux_layer_16": 0.179931640625, "loss_aux_layer_17": 0.185791015625, "loss_aux_layer_18": 0.19384765625, "loss_aux_layer_19": 0.193603515625, "loss_aux_layer_2": 0.111328125, "loss_aux_layer_20": 0.197509765625, "loss_aux_layer_21": 0.202392578125, "loss_aux_layer_22": 0.22412109375, "loss_aux_layer_23": 0.26904296875, "loss_aux_layer_3": 0.118408203125, "loss_aux_layer_4": 0.119873046875, "loss_aux_layer_5": 0.1214599609375, "loss_aux_layer_6": 0.1226806640625, "loss_aux_layer_7": 0.1163330078125, "loss_aux_layer_8": 0.114990234375, "loss_aux_layer_9": 0.11376953125, "step": 626, "total_loss": 0.7607870995998383 }, { "epoch": 0.12413383488418135, "grad_norm": 1.9378429651260376, "learning_rate": 5e-05, "llm_loss": 0.6608912348747253, "loss": 3.2118, "loss_aux_layer_0": 0.02911376953125, "loss_aux_layer_1": 0.0977783203125, "loss_aux_layer_10": 0.107666015625, "loss_aux_layer_11": 0.11376953125, "loss_aux_layer_12": 0.122802734375, "loss_aux_layer_13": 0.132568359375, "loss_aux_layer_14": 0.14794921875, "loss_aux_layer_15": 0.161376953125, "loss_aux_layer_16": 0.173828125, "loss_aux_layer_17": 0.179443359375, "loss_aux_layer_18": 0.188232421875, "loss_aux_layer_19": 0.189208984375, "loss_aux_layer_2": 0.1038818359375, "loss_aux_layer_20": 0.194091796875, "loss_aux_layer_21": 0.198974609375, "loss_aux_layer_22": 0.220703125, "loss_aux_layer_23": 0.26416015625, "loss_aux_layer_3": 0.1114501953125, "loss_aux_layer_4": 0.11279296875, "loss_aux_layer_5": 0.11474609375, "loss_aux_layer_6": 0.1165771484375, "loss_aux_layer_7": 0.111083984375, "loss_aux_layer_8": 0.1097412109375, "loss_aux_layer_9": 0.10791015625, "step": 627, "total_loss": 0.8029589802026749 }, { "epoch": 0.12433181548208276, "grad_norm": 0.918697714805603, "learning_rate": 5e-05, "llm_loss": 0.6978199034929276, "loss": 3.3516, "loss_aux_layer_0": 0.02935791015625, "loss_aux_layer_1": 0.0953369140625, "loss_aux_layer_10": 0.10595703125, "loss_aux_layer_11": 0.1123046875, "loss_aux_layer_12": 0.1207275390625, "loss_aux_layer_13": 0.13037109375, "loss_aux_layer_14": 0.146240234375, "loss_aux_layer_15": 0.16015625, "loss_aux_layer_16": 0.173095703125, "loss_aux_layer_17": 0.17822265625, "loss_aux_layer_18": 0.18603515625, "loss_aux_layer_19": 0.18603515625, "loss_aux_layer_2": 0.10205078125, "loss_aux_layer_20": 0.191162109375, "loss_aux_layer_21": 0.19580078125, "loss_aux_layer_22": 0.217041015625, "loss_aux_layer_23": 0.2607421875, "loss_aux_layer_3": 0.109619140625, "loss_aux_layer_4": 0.1114501953125, "loss_aux_layer_5": 0.11328125, "loss_aux_layer_6": 0.115234375, "loss_aux_layer_7": 0.1092529296875, "loss_aux_layer_8": 0.1075439453125, "loss_aux_layer_9": 0.10546875, "step": 628, "total_loss": 0.8379003703594208 }, { "epoch": 0.12452979607998416, "grad_norm": 1.6942869424819946, "learning_rate": 5e-05, "llm_loss": 0.6690966486930847, "loss": 3.2414, "loss_aux_layer_0": 0.031463623046875, "loss_aux_layer_1": 0.09619140625, "loss_aux_layer_10": 0.107421875, "loss_aux_layer_11": 0.114501953125, "loss_aux_layer_12": 0.1236572265625, "loss_aux_layer_13": 0.13330078125, "loss_aux_layer_14": 0.148681640625, "loss_aux_layer_15": 0.16162109375, "loss_aux_layer_16": 0.17431640625, "loss_aux_layer_17": 0.1796875, "loss_aux_layer_18": 0.1865234375, "loss_aux_layer_19": 0.18701171875, "loss_aux_layer_2": 0.1011962890625, "loss_aux_layer_20": 0.192138671875, "loss_aux_layer_21": 0.197509765625, "loss_aux_layer_22": 0.220703125, "loss_aux_layer_23": 0.267578125, "loss_aux_layer_3": 0.1085205078125, "loss_aux_layer_4": 0.1099853515625, "loss_aux_layer_5": 0.11181640625, "loss_aux_layer_6": 0.1138916015625, "loss_aux_layer_7": 0.1083984375, "loss_aux_layer_8": 0.1072998046875, "loss_aux_layer_9": 0.1063232421875, "step": 629, "total_loss": 0.8103458434343338 }, { "epoch": 0.12472777667788557, "grad_norm": 1.4090698957443237, "learning_rate": 5e-05, "llm_loss": 0.6092220097780228, "loss": 2.9952, "loss_aux_layer_0": 0.030548095703125, "loss_aux_layer_1": 0.0948486328125, "loss_aux_layer_10": 0.1055908203125, "loss_aux_layer_11": 0.1121826171875, "loss_aux_layer_12": 0.120361328125, "loss_aux_layer_13": 0.12939453125, "loss_aux_layer_14": 0.14404296875, "loss_aux_layer_15": 0.157958984375, "loss_aux_layer_16": 0.171142578125, "loss_aux_layer_17": 0.176513671875, "loss_aux_layer_18": 0.1845703125, "loss_aux_layer_19": 0.18603515625, "loss_aux_layer_2": 0.101318359375, "loss_aux_layer_20": 0.191162109375, "loss_aux_layer_21": 0.1962890625, "loss_aux_layer_22": 0.2177734375, "loss_aux_layer_23": 0.26318359375, "loss_aux_layer_3": 0.1087646484375, "loss_aux_layer_4": 0.1104736328125, "loss_aux_layer_5": 0.1123046875, "loss_aux_layer_6": 0.114013671875, "loss_aux_layer_7": 0.1085205078125, "loss_aux_layer_8": 0.10693359375, "loss_aux_layer_9": 0.1053466796875, "step": 630, "total_loss": 0.7487905025482178 }, { "epoch": 0.12492575727578697, "grad_norm": 1.5146909952163696, "learning_rate": 5e-05, "llm_loss": 0.585487425327301, "loss": 2.9081, "loss_aux_layer_0": 0.029296875, "loss_aux_layer_1": 0.0977783203125, "loss_aux_layer_10": 0.10693359375, "loss_aux_layer_11": 0.11279296875, "loss_aux_layer_12": 0.12109375, "loss_aux_layer_13": 0.1309814453125, "loss_aux_layer_14": 0.146484375, "loss_aux_layer_15": 0.159912109375, "loss_aux_layer_16": 0.173095703125, "loss_aux_layer_17": 0.1787109375, "loss_aux_layer_18": 0.187255859375, "loss_aux_layer_19": 0.188232421875, "loss_aux_layer_2": 0.10400390625, "loss_aux_layer_20": 0.193603515625, "loss_aux_layer_21": 0.198486328125, "loss_aux_layer_22": 0.220458984375, "loss_aux_layer_23": 0.265625, "loss_aux_layer_3": 0.11181640625, "loss_aux_layer_4": 0.11328125, "loss_aux_layer_5": 0.114501953125, "loss_aux_layer_6": 0.1163330078125, "loss_aux_layer_7": 0.1102294921875, "loss_aux_layer_8": 0.1090087890625, "loss_aux_layer_9": 0.106689453125, "step": 631, "total_loss": 0.7270257771015167 }, { "epoch": 0.12512373787368838, "grad_norm": 1.6919296979904175, "learning_rate": 5e-05, "llm_loss": 0.8380150198936462, "loss": 3.8807, "loss_aux_layer_0": 0.02764892578125, "loss_aux_layer_1": 0.08984375, "loss_aux_layer_10": 0.0980224609375, "loss_aux_layer_11": 0.103759765625, "loss_aux_layer_12": 0.1123046875, "loss_aux_layer_13": 0.1219482421875, "loss_aux_layer_14": 0.13720703125, "loss_aux_layer_15": 0.150390625, "loss_aux_layer_16": 0.16357421875, "loss_aux_layer_17": 0.169921875, "loss_aux_layer_18": 0.17822265625, "loss_aux_layer_19": 0.178955078125, "loss_aux_layer_2": 0.09521484375, "loss_aux_layer_20": 0.184326171875, "loss_aux_layer_21": 0.189697265625, "loss_aux_layer_22": 0.208984375, "loss_aux_layer_23": 0.252685546875, "loss_aux_layer_3": 0.1014404296875, "loss_aux_layer_4": 0.1026611328125, "loss_aux_layer_5": 0.103759765625, "loss_aux_layer_6": 0.1058349609375, "loss_aux_layer_7": 0.0999755859375, "loss_aux_layer_8": 0.098876953125, "loss_aux_layer_9": 0.097412109375, "step": 632, "total_loss": 0.9701752364635468 }, { "epoch": 0.1253217184715898, "grad_norm": 2.528531551361084, "learning_rate": 5e-05, "llm_loss": 0.6229770332574844, "loss": 3.0626, "loss_aux_layer_0": 0.02923583984375, "loss_aux_layer_1": 0.0980224609375, "loss_aux_layer_10": 0.1090087890625, "loss_aux_layer_11": 0.115478515625, "loss_aux_layer_12": 0.1240234375, "loss_aux_layer_13": 0.1328125, "loss_aux_layer_14": 0.148193359375, "loss_aux_layer_15": 0.1611328125, "loss_aux_layer_16": 0.173828125, "loss_aux_layer_17": 0.1796875, "loss_aux_layer_18": 0.187255859375, "loss_aux_layer_19": 0.187744140625, "loss_aux_layer_2": 0.104736328125, "loss_aux_layer_20": 0.193115234375, "loss_aux_layer_21": 0.1982421875, "loss_aux_layer_22": 0.221435546875, "loss_aux_layer_23": 0.266845703125, "loss_aux_layer_3": 0.1121826171875, "loss_aux_layer_4": 0.1143798828125, "loss_aux_layer_5": 0.1162109375, "loss_aux_layer_6": 0.1187744140625, "loss_aux_layer_7": 0.11279296875, "loss_aux_layer_8": 0.110595703125, "loss_aux_layer_9": 0.108642578125, "step": 633, "total_loss": 0.7656435370445251 }, { "epoch": 0.1255196990694912, "grad_norm": 2.983520030975342, "learning_rate": 5e-05, "llm_loss": 0.6447524130344391, "loss": 3.1405, "loss_aux_layer_0": 0.028411865234375, "loss_aux_layer_1": 0.094970703125, "loss_aux_layer_10": 0.106689453125, "loss_aux_layer_11": 0.113037109375, "loss_aux_layer_12": 0.122314453125, "loss_aux_layer_13": 0.1318359375, "loss_aux_layer_14": 0.147216796875, "loss_aux_layer_15": 0.16064453125, "loss_aux_layer_16": 0.173583984375, "loss_aux_layer_17": 0.179931640625, "loss_aux_layer_18": 0.1875, "loss_aux_layer_19": 0.1875, "loss_aux_layer_2": 0.1009521484375, "loss_aux_layer_20": 0.1923828125, "loss_aux_layer_21": 0.1962890625, "loss_aux_layer_22": 0.217041015625, "loss_aux_layer_23": 0.2607421875, "loss_aux_layer_3": 0.1090087890625, "loss_aux_layer_4": 0.1107177734375, "loss_aux_layer_5": 0.112060546875, "loss_aux_layer_6": 0.114013671875, "loss_aux_layer_7": 0.1087646484375, "loss_aux_layer_8": 0.10693359375, "loss_aux_layer_9": 0.1055908203125, "step": 634, "total_loss": 0.785126268863678 }, { "epoch": 0.12571767966739258, "grad_norm": 1.2502232789993286, "learning_rate": 5e-05, "llm_loss": 0.6702560037374496, "loss": 3.2771, "loss_aux_layer_0": 0.03131103515625, "loss_aux_layer_1": 0.109375, "loss_aux_layer_10": 0.1180419921875, "loss_aux_layer_11": 0.124755859375, "loss_aux_layer_12": 0.13330078125, "loss_aux_layer_13": 0.142578125, "loss_aux_layer_14": 0.156494140625, "loss_aux_layer_15": 0.168212890625, "loss_aux_layer_16": 0.1796875, "loss_aux_layer_17": 0.184326171875, "loss_aux_layer_18": 0.1923828125, "loss_aux_layer_19": 0.191162109375, "loss_aux_layer_2": 0.1129150390625, "loss_aux_layer_20": 0.19482421875, "loss_aux_layer_21": 0.197998046875, "loss_aux_layer_22": 0.219482421875, "loss_aux_layer_23": 0.26171875, "loss_aux_layer_3": 0.121826171875, "loss_aux_layer_4": 0.124267578125, "loss_aux_layer_5": 0.1256103515625, "loss_aux_layer_6": 0.1275634765625, "loss_aux_layer_7": 0.121826171875, "loss_aux_layer_8": 0.11962890625, "loss_aux_layer_9": 0.1173095703125, "step": 635, "total_loss": 0.8192669749259949 }, { "epoch": 0.125915660265294, "grad_norm": 2.3047118186950684, "learning_rate": 5e-05, "llm_loss": 0.6112712174654007, "loss": 3.0038, "loss_aux_layer_0": 0.029266357421875, "loss_aux_layer_1": 0.09765625, "loss_aux_layer_10": 0.1060791015625, "loss_aux_layer_11": 0.1124267578125, "loss_aux_layer_12": 0.1212158203125, "loss_aux_layer_13": 0.130615234375, "loss_aux_layer_14": 0.1455078125, "loss_aux_layer_15": 0.158935546875, "loss_aux_layer_16": 0.171630859375, "loss_aux_layer_17": 0.176513671875, "loss_aux_layer_18": 0.18505859375, "loss_aux_layer_19": 0.185302734375, "loss_aux_layer_2": 0.10205078125, "loss_aux_layer_20": 0.189208984375, "loss_aux_layer_21": 0.19482421875, "loss_aux_layer_22": 0.2158203125, "loss_aux_layer_23": 0.25927734375, "loss_aux_layer_3": 0.109619140625, "loss_aux_layer_4": 0.1109619140625, "loss_aux_layer_5": 0.112548828125, "loss_aux_layer_6": 0.1146240234375, "loss_aux_layer_7": 0.1092529296875, "loss_aux_layer_8": 0.1077880859375, "loss_aux_layer_9": 0.10546875, "step": 636, "total_loss": 0.7509495615959167 }, { "epoch": 0.1261136408631954, "grad_norm": 1.5926047563552856, "learning_rate": 5e-05, "llm_loss": 0.6445681601762772, "loss": 3.148, "loss_aux_layer_0": 0.030029296875, "loss_aux_layer_1": 0.09765625, "loss_aux_layer_10": 0.1092529296875, "loss_aux_layer_11": 0.1156005859375, "loss_aux_layer_12": 0.12451171875, "loss_aux_layer_13": 0.134521484375, "loss_aux_layer_14": 0.1494140625, "loss_aux_layer_15": 0.162841796875, "loss_aux_layer_16": 0.176025390625, "loss_aux_layer_17": 0.181640625, "loss_aux_layer_18": 0.189453125, "loss_aux_layer_19": 0.1884765625, "loss_aux_layer_2": 0.102294921875, "loss_aux_layer_20": 0.192626953125, "loss_aux_layer_21": 0.196533203125, "loss_aux_layer_22": 0.2177734375, "loss_aux_layer_23": 0.26220703125, "loss_aux_layer_3": 0.1109619140625, "loss_aux_layer_4": 0.11279296875, "loss_aux_layer_5": 0.114501953125, "loss_aux_layer_6": 0.1171875, "loss_aux_layer_7": 0.1119384765625, "loss_aux_layer_8": 0.110107421875, "loss_aux_layer_9": 0.1090087890625, "step": 637, "total_loss": 0.7869922816753387 }, { "epoch": 0.1263116214610968, "grad_norm": 1.420386791229248, "learning_rate": 5e-05, "llm_loss": 0.7394712567329407, "loss": 3.5178, "loss_aux_layer_0": 0.03106689453125, "loss_aux_layer_1": 0.0966796875, "loss_aux_layer_10": 0.105712890625, "loss_aux_layer_11": 0.1121826171875, "loss_aux_layer_12": 0.12109375, "loss_aux_layer_13": 0.131103515625, "loss_aux_layer_14": 0.14697265625, "loss_aux_layer_15": 0.16064453125, "loss_aux_layer_16": 0.173583984375, "loss_aux_layer_17": 0.179931640625, "loss_aux_layer_18": 0.1875, "loss_aux_layer_19": 0.1875, "loss_aux_layer_2": 0.1004638671875, "loss_aux_layer_20": 0.191162109375, "loss_aux_layer_21": 0.194580078125, "loss_aux_layer_22": 0.21484375, "loss_aux_layer_23": 0.25732421875, "loss_aux_layer_3": 0.1082763671875, "loss_aux_layer_4": 0.1099853515625, "loss_aux_layer_5": 0.111572265625, "loss_aux_layer_6": 0.1148681640625, "loss_aux_layer_7": 0.10888671875, "loss_aux_layer_8": 0.107177734375, "loss_aux_layer_9": 0.105224609375, "step": 638, "total_loss": 0.8794422149658203 }, { "epoch": 0.12650960205899822, "grad_norm": 2.387662887573242, "learning_rate": 5e-05, "llm_loss": 0.6945140659809113, "loss": 3.3281, "loss_aux_layer_0": 0.0283203125, "loss_aux_layer_1": 0.09716796875, "loss_aux_layer_10": 0.1064453125, "loss_aux_layer_11": 0.1124267578125, "loss_aux_layer_12": 0.1212158203125, "loss_aux_layer_13": 0.131103515625, "loss_aux_layer_14": 0.144775390625, "loss_aux_layer_15": 0.15673828125, "loss_aux_layer_16": 0.168701171875, "loss_aux_layer_17": 0.17431640625, "loss_aux_layer_18": 0.181396484375, "loss_aux_layer_19": 0.180419921875, "loss_aux_layer_2": 0.099609375, "loss_aux_layer_20": 0.184814453125, "loss_aux_layer_21": 0.188720703125, "loss_aux_layer_22": 0.2099609375, "loss_aux_layer_23": 0.25146484375, "loss_aux_layer_3": 0.1075439453125, "loss_aux_layer_4": 0.1092529296875, "loss_aux_layer_5": 0.110595703125, "loss_aux_layer_6": 0.1131591796875, "loss_aux_layer_7": 0.1085205078125, "loss_aux_layer_8": 0.1070556640625, "loss_aux_layer_9": 0.10546875, "step": 639, "total_loss": 0.8320124596357346 }, { "epoch": 0.12670758265689963, "grad_norm": 2.1993048191070557, "learning_rate": 5e-05, "llm_loss": 0.6094708144664764, "loss": 3.0106, "loss_aux_layer_0": 0.030242919921875, "loss_aux_layer_1": 0.0970458984375, "loss_aux_layer_10": 0.1083984375, "loss_aux_layer_11": 0.1153564453125, "loss_aux_layer_12": 0.124755859375, "loss_aux_layer_13": 0.1353759765625, "loss_aux_layer_14": 0.150634765625, "loss_aux_layer_15": 0.16455078125, "loss_aux_layer_16": 0.177978515625, "loss_aux_layer_17": 0.18359375, "loss_aux_layer_18": 0.19140625, "loss_aux_layer_19": 0.19091796875, "loss_aux_layer_2": 0.101318359375, "loss_aux_layer_20": 0.195068359375, "loss_aux_layer_21": 0.19873046875, "loss_aux_layer_22": 0.220458984375, "loss_aux_layer_23": 0.265625, "loss_aux_layer_3": 0.1103515625, "loss_aux_layer_4": 0.112548828125, "loss_aux_layer_5": 0.1146240234375, "loss_aux_layer_6": 0.1171875, "loss_aux_layer_7": 0.111328125, "loss_aux_layer_8": 0.109619140625, "loss_aux_layer_9": 0.108154296875, "step": 640, "total_loss": 0.7526414841413498 }, { "epoch": 0.12690556325480104, "grad_norm": 0.7839084267616272, "learning_rate": 5e-05, "llm_loss": 0.6962132006883621, "loss": 3.3287, "loss_aux_layer_0": 0.02911376953125, "loss_aux_layer_1": 0.091796875, "loss_aux_layer_10": 0.1021728515625, "loss_aux_layer_11": 0.107666015625, "loss_aux_layer_12": 0.1165771484375, "loss_aux_layer_13": 0.126708984375, "loss_aux_layer_14": 0.14208984375, "loss_aux_layer_15": 0.15576171875, "loss_aux_layer_16": 0.1689453125, "loss_aux_layer_17": 0.175537109375, "loss_aux_layer_18": 0.183349609375, "loss_aux_layer_19": 0.18310546875, "loss_aux_layer_2": 0.0955810546875, "loss_aux_layer_20": 0.187744140625, "loss_aux_layer_21": 0.192138671875, "loss_aux_layer_22": 0.21240234375, "loss_aux_layer_23": 0.25634765625, "loss_aux_layer_3": 0.1038818359375, "loss_aux_layer_4": 0.1055908203125, "loss_aux_layer_5": 0.107421875, "loss_aux_layer_6": 0.1097412109375, "loss_aux_layer_7": 0.1044921875, "loss_aux_layer_8": 0.1029052734375, "loss_aux_layer_9": 0.1015625, "step": 641, "total_loss": 0.8321801424026489 }, { "epoch": 0.12710354385270242, "grad_norm": 1.7120590209960938, "learning_rate": 5e-05, "llm_loss": 0.6432892382144928, "loss": 3.1219, "loss_aux_layer_0": 0.028289794921875, "loss_aux_layer_1": 0.0911865234375, "loss_aux_layer_10": 0.1033935546875, "loss_aux_layer_11": 0.109375, "loss_aux_layer_12": 0.1180419921875, "loss_aux_layer_13": 0.1273193359375, "loss_aux_layer_14": 0.142333984375, "loss_aux_layer_15": 0.15625, "loss_aux_layer_16": 0.168701171875, "loss_aux_layer_17": 0.175048828125, "loss_aux_layer_18": 0.1845703125, "loss_aux_layer_19": 0.18505859375, "loss_aux_layer_2": 0.0960693359375, "loss_aux_layer_20": 0.1904296875, "loss_aux_layer_21": 0.19580078125, "loss_aux_layer_22": 0.217041015625, "loss_aux_layer_23": 0.26171875, "loss_aux_layer_3": 0.1043701171875, "loss_aux_layer_4": 0.1063232421875, "loss_aux_layer_5": 0.108642578125, "loss_aux_layer_6": 0.111083984375, "loss_aux_layer_7": 0.105712890625, "loss_aux_layer_8": 0.1043701171875, "loss_aux_layer_9": 0.10302734375, "step": 642, "total_loss": 0.7804818451404572 }, { "epoch": 0.12730152445060383, "grad_norm": 1.480812668800354, "learning_rate": 5e-05, "llm_loss": 0.637304812669754, "loss": 3.1023, "loss_aux_layer_0": 0.029510498046875, "loss_aux_layer_1": 0.094482421875, "loss_aux_layer_10": 0.10498046875, "loss_aux_layer_11": 0.1107177734375, "loss_aux_layer_12": 0.1199951171875, "loss_aux_layer_13": 0.130126953125, "loss_aux_layer_14": 0.14501953125, "loss_aux_layer_15": 0.158447265625, "loss_aux_layer_16": 0.171142578125, "loss_aux_layer_17": 0.177001953125, "loss_aux_layer_18": 0.1845703125, "loss_aux_layer_19": 0.185546875, "loss_aux_layer_2": 0.0965576171875, "loss_aux_layer_20": 0.18994140625, "loss_aux_layer_21": 0.195556640625, "loss_aux_layer_22": 0.21630859375, "loss_aux_layer_23": 0.26123046875, "loss_aux_layer_3": 0.1051025390625, "loss_aux_layer_4": 0.1070556640625, "loss_aux_layer_5": 0.108642578125, "loss_aux_layer_6": 0.1114501953125, "loss_aux_layer_7": 0.1064453125, "loss_aux_layer_8": 0.1053466796875, "loss_aux_layer_9": 0.1038818359375, "step": 643, "total_loss": 0.7755859792232513 }, { "epoch": 0.12749950504850524, "grad_norm": 0.8893394470214844, "learning_rate": 5e-05, "llm_loss": 0.6196967214345932, "loss": 3.046, "loss_aux_layer_0": 0.02972412109375, "loss_aux_layer_1": 0.1009521484375, "loss_aux_layer_10": 0.108642578125, "loss_aux_layer_11": 0.1151123046875, "loss_aux_layer_12": 0.1240234375, "loss_aux_layer_13": 0.133544921875, "loss_aux_layer_14": 0.148193359375, "loss_aux_layer_15": 0.1611328125, "loss_aux_layer_16": 0.173583984375, "loss_aux_layer_17": 0.178955078125, "loss_aux_layer_18": 0.1875, "loss_aux_layer_19": 0.186279296875, "loss_aux_layer_2": 0.1031494140625, "loss_aux_layer_20": 0.19091796875, "loss_aux_layer_21": 0.195556640625, "loss_aux_layer_22": 0.218017578125, "loss_aux_layer_23": 0.26123046875, "loss_aux_layer_3": 0.1119384765625, "loss_aux_layer_4": 0.11376953125, "loss_aux_layer_5": 0.1146240234375, "loss_aux_layer_6": 0.1170654296875, "loss_aux_layer_7": 0.1114501953125, "loss_aux_layer_8": 0.109619140625, "loss_aux_layer_9": 0.10791015625, "step": 644, "total_loss": 0.7614990472793579 }, { "epoch": 0.12769748564640665, "grad_norm": 1.4321317672729492, "learning_rate": 5e-05, "llm_loss": 0.6317346394062042, "loss": 3.0819, "loss_aux_layer_0": 0.031768798828125, "loss_aux_layer_1": 0.09716796875, "loss_aux_layer_10": 0.1048583984375, "loss_aux_layer_11": 0.112060546875, "loss_aux_layer_12": 0.1209716796875, "loss_aux_layer_13": 0.131591796875, "loss_aux_layer_14": 0.146728515625, "loss_aux_layer_15": 0.16015625, "loss_aux_layer_16": 0.172119140625, "loss_aux_layer_17": 0.178466796875, "loss_aux_layer_18": 0.185546875, "loss_aux_layer_19": 0.184814453125, "loss_aux_layer_2": 0.0986328125, "loss_aux_layer_20": 0.18896484375, "loss_aux_layer_21": 0.192626953125, "loss_aux_layer_22": 0.21337890625, "loss_aux_layer_23": 0.255615234375, "loss_aux_layer_3": 0.1072998046875, "loss_aux_layer_4": 0.1087646484375, "loss_aux_layer_5": 0.110107421875, "loss_aux_layer_6": 0.1121826171875, "loss_aux_layer_7": 0.1068115234375, "loss_aux_layer_8": 0.10546875, "loss_aux_layer_9": 0.10400390625, "step": 645, "total_loss": 0.7704801261425018 }, { "epoch": 0.12789546624430806, "grad_norm": 1.3169043064117432, "learning_rate": 5e-05, "llm_loss": 0.618874654173851, "loss": 3.0308, "loss_aux_layer_0": 0.028289794921875, "loss_aux_layer_1": 0.0950927734375, "loss_aux_layer_10": 0.1051025390625, "loss_aux_layer_11": 0.1114501953125, "loss_aux_layer_12": 0.1204833984375, "loss_aux_layer_13": 0.130126953125, "loss_aux_layer_14": 0.14501953125, "loss_aux_layer_15": 0.15869140625, "loss_aux_layer_16": 0.171875, "loss_aux_layer_17": 0.1787109375, "loss_aux_layer_18": 0.186767578125, "loss_aux_layer_19": 0.186767578125, "loss_aux_layer_2": 0.0980224609375, "loss_aux_layer_20": 0.190185546875, "loss_aux_layer_21": 0.193359375, "loss_aux_layer_22": 0.215576171875, "loss_aux_layer_23": 0.258056640625, "loss_aux_layer_3": 0.106689453125, "loss_aux_layer_4": 0.1085205078125, "loss_aux_layer_5": 0.1099853515625, "loss_aux_layer_6": 0.11328125, "loss_aux_layer_7": 0.1080322265625, "loss_aux_layer_8": 0.1060791015625, "loss_aux_layer_9": 0.1043701171875, "step": 646, "total_loss": 0.7576926350593567 }, { "epoch": 0.12809344684220947, "grad_norm": 1.0314035415649414, "learning_rate": 5e-05, "llm_loss": 0.5487866550683975, "loss": 2.7527, "loss_aux_layer_0": 0.028778076171875, "loss_aux_layer_1": 0.0953369140625, "loss_aux_layer_10": 0.10498046875, "loss_aux_layer_11": 0.11181640625, "loss_aux_layer_12": 0.120849609375, "loss_aux_layer_13": 0.1309814453125, "loss_aux_layer_14": 0.146484375, "loss_aux_layer_15": 0.159423828125, "loss_aux_layer_16": 0.171630859375, "loss_aux_layer_17": 0.177734375, "loss_aux_layer_18": 0.18505859375, "loss_aux_layer_19": 0.1845703125, "loss_aux_layer_2": 0.099365234375, "loss_aux_layer_20": 0.18994140625, "loss_aux_layer_21": 0.19677734375, "loss_aux_layer_22": 0.21923828125, "loss_aux_layer_23": 0.263671875, "loss_aux_layer_3": 0.107421875, "loss_aux_layer_4": 0.1094970703125, "loss_aux_layer_5": 0.111083984375, "loss_aux_layer_6": 0.1136474609375, "loss_aux_layer_7": 0.1075439453125, "loss_aux_layer_8": 0.1055908203125, "loss_aux_layer_9": 0.10400390625, "step": 647, "total_loss": 0.6881637871265411 }, { "epoch": 0.12829142744011088, "grad_norm": 1.5043970346450806, "learning_rate": 5e-05, "llm_loss": 0.6336675882339478, "loss": 3.0798, "loss_aux_layer_0": 0.027801513671875, "loss_aux_layer_1": 0.091552734375, "loss_aux_layer_10": 0.10302734375, "loss_aux_layer_11": 0.109130859375, "loss_aux_layer_12": 0.11767578125, "loss_aux_layer_13": 0.127197265625, "loss_aux_layer_14": 0.141845703125, "loss_aux_layer_15": 0.155029296875, "loss_aux_layer_16": 0.167724609375, "loss_aux_layer_17": 0.173583984375, "loss_aux_layer_18": 0.181396484375, "loss_aux_layer_19": 0.18212890625, "loss_aux_layer_2": 0.0968017578125, "loss_aux_layer_20": 0.186767578125, "loss_aux_layer_21": 0.192138671875, "loss_aux_layer_22": 0.21337890625, "loss_aux_layer_23": 0.257568359375, "loss_aux_layer_3": 0.1053466796875, "loss_aux_layer_4": 0.107177734375, "loss_aux_layer_5": 0.1092529296875, "loss_aux_layer_6": 0.1114501953125, "loss_aux_layer_7": 0.10595703125, "loss_aux_layer_8": 0.104248046875, "loss_aux_layer_9": 0.1026611328125, "step": 648, "total_loss": 0.7699427306652069 }, { "epoch": 0.1284894080380123, "grad_norm": 1.6482347249984741, "learning_rate": 5e-05, "llm_loss": 0.6448605805635452, "loss": 3.1556, "loss_aux_layer_0": 0.028594970703125, "loss_aux_layer_1": 0.09814453125, "loss_aux_layer_10": 0.1103515625, "loss_aux_layer_11": 0.1171875, "loss_aux_layer_12": 0.1258544921875, "loss_aux_layer_13": 0.135986328125, "loss_aux_layer_14": 0.150634765625, "loss_aux_layer_15": 0.163330078125, "loss_aux_layer_16": 0.175537109375, "loss_aux_layer_17": 0.181640625, "loss_aux_layer_18": 0.19091796875, "loss_aux_layer_19": 0.1904296875, "loss_aux_layer_2": 0.1033935546875, "loss_aux_layer_20": 0.195556640625, "loss_aux_layer_21": 0.200927734375, "loss_aux_layer_22": 0.223876953125, "loss_aux_layer_23": 0.2685546875, "loss_aux_layer_3": 0.1126708984375, "loss_aux_layer_4": 0.1146240234375, "loss_aux_layer_5": 0.11669921875, "loss_aux_layer_6": 0.118408203125, "loss_aux_layer_7": 0.1129150390625, "loss_aux_layer_8": 0.1112060546875, "loss_aux_layer_9": 0.109375, "step": 649, "total_loss": 0.7889081835746765 }, { "epoch": 0.12868738863591367, "grad_norm": 0.698460578918457, "learning_rate": 5e-05, "llm_loss": 0.5625706017017365, "loss": 2.7981, "loss_aux_layer_0": 0.030975341796875, "loss_aux_layer_1": 0.095458984375, "loss_aux_layer_10": 0.10302734375, "loss_aux_layer_11": 0.1092529296875, "loss_aux_layer_12": 0.11865234375, "loss_aux_layer_13": 0.12744140625, "loss_aux_layer_14": 0.14208984375, "loss_aux_layer_15": 0.155517578125, "loss_aux_layer_16": 0.169189453125, "loss_aux_layer_17": 0.176025390625, "loss_aux_layer_18": 0.18359375, "loss_aux_layer_19": 0.18359375, "loss_aux_layer_2": 0.0953369140625, "loss_aux_layer_20": 0.18896484375, "loss_aux_layer_21": 0.193603515625, "loss_aux_layer_22": 0.215087890625, "loss_aux_layer_23": 0.2587890625, "loss_aux_layer_3": 0.1044921875, "loss_aux_layer_4": 0.1064453125, "loss_aux_layer_5": 0.1082763671875, "loss_aux_layer_6": 0.11083984375, "loss_aux_layer_7": 0.1053466796875, "loss_aux_layer_8": 0.103515625, "loss_aux_layer_9": 0.1019287109375, "step": 650, "total_loss": 0.6995128095149994 }, { "epoch": 0.12888536923381508, "grad_norm": 2.251720666885376, "learning_rate": 5e-05, "llm_loss": 0.5917703956365585, "loss": 2.9211, "loss_aux_layer_0": 0.0291748046875, "loss_aux_layer_1": 0.096435546875, "loss_aux_layer_10": 0.1048583984375, "loss_aux_layer_11": 0.111083984375, "loss_aux_layer_12": 0.119873046875, "loss_aux_layer_13": 0.1295166015625, "loss_aux_layer_14": 0.1435546875, "loss_aux_layer_15": 0.157470703125, "loss_aux_layer_16": 0.170166015625, "loss_aux_layer_17": 0.176513671875, "loss_aux_layer_18": 0.1845703125, "loss_aux_layer_19": 0.183837890625, "loss_aux_layer_2": 0.0999755859375, "loss_aux_layer_20": 0.18896484375, "loss_aux_layer_21": 0.194580078125, "loss_aux_layer_22": 0.216552734375, "loss_aux_layer_23": 0.259765625, "loss_aux_layer_3": 0.1077880859375, "loss_aux_layer_4": 0.1092529296875, "loss_aux_layer_5": 0.1104736328125, "loss_aux_layer_6": 0.1126708984375, "loss_aux_layer_7": 0.107421875, "loss_aux_layer_8": 0.10595703125, "loss_aux_layer_9": 0.1038818359375, "step": 651, "total_loss": 0.7302865087985992 }, { "epoch": 0.1290833498317165, "grad_norm": 2.511012315750122, "learning_rate": 5e-05, "llm_loss": 0.6581210494041443, "loss": 3.1749, "loss_aux_layer_0": 0.029327392578125, "loss_aux_layer_1": 0.09033203125, "loss_aux_layer_10": 0.1011962890625, "loss_aux_layer_11": 0.107177734375, "loss_aux_layer_12": 0.116943359375, "loss_aux_layer_13": 0.126708984375, "loss_aux_layer_14": 0.14208984375, "loss_aux_layer_15": 0.15576171875, "loss_aux_layer_16": 0.1689453125, "loss_aux_layer_17": 0.17578125, "loss_aux_layer_18": 0.18408203125, "loss_aux_layer_19": 0.184814453125, "loss_aux_layer_2": 0.0927734375, "loss_aux_layer_20": 0.18896484375, "loss_aux_layer_21": 0.192626953125, "loss_aux_layer_22": 0.21484375, "loss_aux_layer_23": 0.2587890625, "loss_aux_layer_3": 0.10107421875, "loss_aux_layer_4": 0.103271484375, "loss_aux_layer_5": 0.1051025390625, "loss_aux_layer_6": 0.1080322265625, "loss_aux_layer_7": 0.1029052734375, "loss_aux_layer_8": 0.1016845703125, "loss_aux_layer_9": 0.1004638671875, "step": 652, "total_loss": 0.793714165687561 }, { "epoch": 0.1292813304296179, "grad_norm": 0.9289444088935852, "learning_rate": 5e-05, "llm_loss": 0.6189029291272163, "loss": 3.0373, "loss_aux_layer_0": 0.02850341796875, "loss_aux_layer_1": 0.09814453125, "loss_aux_layer_10": 0.107421875, "loss_aux_layer_11": 0.1138916015625, "loss_aux_layer_12": 0.122802734375, "loss_aux_layer_13": 0.1322021484375, "loss_aux_layer_14": 0.146728515625, "loss_aux_layer_15": 0.159912109375, "loss_aux_layer_16": 0.17138671875, "loss_aux_layer_17": 0.17822265625, "loss_aux_layer_18": 0.18603515625, "loss_aux_layer_19": 0.185791015625, "loss_aux_layer_2": 0.1014404296875, "loss_aux_layer_20": 0.190185546875, "loss_aux_layer_21": 0.194580078125, "loss_aux_layer_22": 0.215576171875, "loss_aux_layer_23": 0.258544921875, "loss_aux_layer_3": 0.1102294921875, "loss_aux_layer_4": 0.1121826171875, "loss_aux_layer_5": 0.1138916015625, "loss_aux_layer_6": 0.1163330078125, "loss_aux_layer_7": 0.1107177734375, "loss_aux_layer_8": 0.1090087890625, "loss_aux_layer_9": 0.10693359375, "step": 653, "total_loss": 0.7593310326337814 }, { "epoch": 0.1294793110275193, "grad_norm": 1.6257743835449219, "learning_rate": 5e-05, "llm_loss": 0.6395470052957535, "loss": 3.0999, "loss_aux_layer_0": 0.0291748046875, "loss_aux_layer_1": 0.0938720703125, "loss_aux_layer_10": 0.1019287109375, "loss_aux_layer_11": 0.1077880859375, "loss_aux_layer_12": 0.116455078125, "loss_aux_layer_13": 0.1259765625, "loss_aux_layer_14": 0.139892578125, "loss_aux_layer_15": 0.1533203125, "loss_aux_layer_16": 0.166015625, "loss_aux_layer_17": 0.171875, "loss_aux_layer_18": 0.181640625, "loss_aux_layer_19": 0.181884765625, "loss_aux_layer_2": 0.0965576171875, "loss_aux_layer_20": 0.18603515625, "loss_aux_layer_21": 0.190185546875, "loss_aux_layer_22": 0.2119140625, "loss_aux_layer_23": 0.25537109375, "loss_aux_layer_3": 0.10546875, "loss_aux_layer_4": 0.1068115234375, "loss_aux_layer_5": 0.1083984375, "loss_aux_layer_6": 0.1104736328125, "loss_aux_layer_7": 0.104736328125, "loss_aux_layer_8": 0.1029052734375, "loss_aux_layer_9": 0.1009521484375, "step": 654, "total_loss": 0.774982139468193 }, { "epoch": 0.12967729162542072, "grad_norm": 0.9829981327056885, "learning_rate": 5e-05, "llm_loss": 0.6561220735311508, "loss": 3.1994, "loss_aux_layer_0": 0.03009033203125, "loss_aux_layer_1": 0.1019287109375, "loss_aux_layer_10": 0.1107177734375, "loss_aux_layer_11": 0.1173095703125, "loss_aux_layer_12": 0.1260986328125, "loss_aux_layer_13": 0.1358642578125, "loss_aux_layer_14": 0.150146484375, "loss_aux_layer_15": 0.16259765625, "loss_aux_layer_16": 0.17529296875, "loss_aux_layer_17": 0.180908203125, "loss_aux_layer_18": 0.190185546875, "loss_aux_layer_19": 0.18896484375, "loss_aux_layer_2": 0.1046142578125, "loss_aux_layer_20": 0.193359375, "loss_aux_layer_21": 0.196533203125, "loss_aux_layer_22": 0.218505859375, "loss_aux_layer_23": 0.26171875, "loss_aux_layer_3": 0.1138916015625, "loss_aux_layer_4": 0.115966796875, "loss_aux_layer_5": 0.1175537109375, "loss_aux_layer_6": 0.1204833984375, "loss_aux_layer_7": 0.114501953125, "loss_aux_layer_8": 0.1121826171875, "loss_aux_layer_9": 0.10986328125, "step": 655, "total_loss": 0.7998588979244232 }, { "epoch": 0.12987527222332212, "grad_norm": 1.4074748754501343, "learning_rate": 5e-05, "llm_loss": 0.5791927725076675, "loss": 2.8724, "loss_aux_layer_0": 0.029632568359375, "loss_aux_layer_1": 0.0943603515625, "loss_aux_layer_10": 0.1038818359375, "loss_aux_layer_11": 0.1102294921875, "loss_aux_layer_12": 0.119140625, "loss_aux_layer_13": 0.1290283203125, "loss_aux_layer_14": 0.144775390625, "loss_aux_layer_15": 0.158203125, "loss_aux_layer_16": 0.17138671875, "loss_aux_layer_17": 0.177490234375, "loss_aux_layer_18": 0.18603515625, "loss_aux_layer_19": 0.187744140625, "loss_aux_layer_2": 0.0982666015625, "loss_aux_layer_20": 0.192626953125, "loss_aux_layer_21": 0.198486328125, "loss_aux_layer_22": 0.220458984375, "loss_aux_layer_23": 0.264892578125, "loss_aux_layer_3": 0.1065673828125, "loss_aux_layer_4": 0.1077880859375, "loss_aux_layer_5": 0.1092529296875, "loss_aux_layer_6": 0.1114501953125, "loss_aux_layer_7": 0.1060791015625, "loss_aux_layer_8": 0.104248046875, "loss_aux_layer_9": 0.102783203125, "step": 656, "total_loss": 0.7181001752614975 }, { "epoch": 0.13007325282122353, "grad_norm": 1.196388602256775, "learning_rate": 5e-05, "llm_loss": 0.6497766673564911, "loss": 3.1401, "loss_aux_layer_0": 0.028961181640625, "loss_aux_layer_1": 0.0908203125, "loss_aux_layer_10": 0.100830078125, "loss_aux_layer_11": 0.107177734375, "loss_aux_layer_12": 0.116455078125, "loss_aux_layer_13": 0.1260986328125, "loss_aux_layer_14": 0.141357421875, "loss_aux_layer_15": 0.154296875, "loss_aux_layer_16": 0.16748046875, "loss_aux_layer_17": 0.17431640625, "loss_aux_layer_18": 0.181884765625, "loss_aux_layer_19": 0.1826171875, "loss_aux_layer_2": 0.094482421875, "loss_aux_layer_20": 0.187255859375, "loss_aux_layer_21": 0.1923828125, "loss_aux_layer_22": 0.214599609375, "loss_aux_layer_23": 0.2578125, "loss_aux_layer_3": 0.1031494140625, "loss_aux_layer_4": 0.1046142578125, "loss_aux_layer_5": 0.10595703125, "loss_aux_layer_6": 0.1083984375, "loss_aux_layer_7": 0.1031494140625, "loss_aux_layer_8": 0.1015625, "loss_aux_layer_9": 0.0999755859375, "step": 657, "total_loss": 0.7850183099508286 }, { "epoch": 0.13027123341912492, "grad_norm": 1.250588059425354, "learning_rate": 5e-05, "llm_loss": 0.667094886302948, "loss": 3.2218, "loss_aux_layer_0": 0.0274658203125, "loss_aux_layer_1": 0.090087890625, "loss_aux_layer_10": 0.1016845703125, "loss_aux_layer_11": 0.1085205078125, "loss_aux_layer_12": 0.1185302734375, "loss_aux_layer_13": 0.128662109375, "loss_aux_layer_14": 0.1455078125, "loss_aux_layer_15": 0.159912109375, "loss_aux_layer_16": 0.173828125, "loss_aux_layer_17": 0.180908203125, "loss_aux_layer_18": 0.18994140625, "loss_aux_layer_19": 0.190673828125, "loss_aux_layer_2": 0.09423828125, "loss_aux_layer_20": 0.195556640625, "loss_aux_layer_21": 0.2001953125, "loss_aux_layer_22": 0.220703125, "loss_aux_layer_23": 0.2646484375, "loss_aux_layer_3": 0.102783203125, "loss_aux_layer_4": 0.1044921875, "loss_aux_layer_5": 0.106689453125, "loss_aux_layer_6": 0.109375, "loss_aux_layer_7": 0.1038818359375, "loss_aux_layer_8": 0.1021728515625, "loss_aux_layer_9": 0.1007080078125, "step": 658, "total_loss": 0.8054614365100861 }, { "epoch": 0.13046921401702632, "grad_norm": 1.8549327850341797, "learning_rate": 5e-05, "llm_loss": 0.5996354967355728, "loss": 2.9353, "loss_aux_layer_0": 0.028778076171875, "loss_aux_layer_1": 0.0880126953125, "loss_aux_layer_10": 0.0985107421875, "loss_aux_layer_11": 0.1046142578125, "loss_aux_layer_12": 0.11376953125, "loss_aux_layer_13": 0.1234130859375, "loss_aux_layer_14": 0.139404296875, "loss_aux_layer_15": 0.15380859375, "loss_aux_layer_16": 0.16748046875, "loss_aux_layer_17": 0.174560546875, "loss_aux_layer_18": 0.18359375, "loss_aux_layer_19": 0.1845703125, "loss_aux_layer_2": 0.09130859375, "loss_aux_layer_20": 0.189697265625, "loss_aux_layer_21": 0.195556640625, "loss_aux_layer_22": 0.2177734375, "loss_aux_layer_23": 0.2626953125, "loss_aux_layer_3": 0.099365234375, "loss_aux_layer_4": 0.1007080078125, "loss_aux_layer_5": 0.1021728515625, "loss_aux_layer_6": 0.1044921875, "loss_aux_layer_7": 0.0999755859375, "loss_aux_layer_8": 0.0987548828125, "loss_aux_layer_9": 0.09765625, "step": 659, "total_loss": 0.7338247746229172 }, { "epoch": 0.13066719461492773, "grad_norm": 1.1430164575576782, "learning_rate": 5e-05, "llm_loss": 0.5914132297039032, "loss": 2.9234, "loss_aux_layer_0": 0.030517578125, "loss_aux_layer_1": 0.095458984375, "loss_aux_layer_10": 0.105224609375, "loss_aux_layer_11": 0.1114501953125, "loss_aux_layer_12": 0.12060546875, "loss_aux_layer_13": 0.13037109375, "loss_aux_layer_14": 0.145751953125, "loss_aux_layer_15": 0.15869140625, "loss_aux_layer_16": 0.1708984375, "loss_aux_layer_17": 0.177001953125, "loss_aux_layer_18": 0.18505859375, "loss_aux_layer_19": 0.185546875, "loss_aux_layer_2": 0.0985107421875, "loss_aux_layer_20": 0.190673828125, "loss_aux_layer_21": 0.196044921875, "loss_aux_layer_22": 0.219482421875, "loss_aux_layer_23": 0.26416015625, "loss_aux_layer_3": 0.1070556640625, "loss_aux_layer_4": 0.10888671875, "loss_aux_layer_5": 0.11083984375, "loss_aux_layer_6": 0.114013671875, "loss_aux_layer_7": 0.1083984375, "loss_aux_layer_8": 0.1068115234375, "loss_aux_layer_9": 0.1048583984375, "step": 660, "total_loss": 0.7308578789234161 }, { "epoch": 0.13086517521282914, "grad_norm": 2.5778889656066895, "learning_rate": 5e-05, "llm_loss": 0.6766000837087631, "loss": 3.2516, "loss_aux_layer_0": 0.028411865234375, "loss_aux_layer_1": 0.0953369140625, "loss_aux_layer_10": 0.1033935546875, "loss_aux_layer_11": 0.109375, "loss_aux_layer_12": 0.1180419921875, "loss_aux_layer_13": 0.127197265625, "loss_aux_layer_14": 0.141357421875, "loss_aux_layer_15": 0.154296875, "loss_aux_layer_16": 0.1669921875, "loss_aux_layer_17": 0.173095703125, "loss_aux_layer_18": 0.181884765625, "loss_aux_layer_19": 0.180908203125, "loss_aux_layer_2": 0.09765625, "loss_aux_layer_20": 0.186279296875, "loss_aux_layer_21": 0.190673828125, "loss_aux_layer_22": 0.21142578125, "loss_aux_layer_23": 0.255126953125, "loss_aux_layer_3": 0.1065673828125, "loss_aux_layer_4": 0.1080322265625, "loss_aux_layer_5": 0.1092529296875, "loss_aux_layer_6": 0.11181640625, "loss_aux_layer_7": 0.1064453125, "loss_aux_layer_8": 0.10498046875, "loss_aux_layer_9": 0.1029052734375, "step": 661, "total_loss": 0.8128971457481384 }, { "epoch": 0.13106315581073055, "grad_norm": 4.614316940307617, "learning_rate": 5e-05, "llm_loss": 0.6351605355739594, "loss": 3.1056, "loss_aux_layer_0": 0.02642822265625, "loss_aux_layer_1": 0.0950927734375, "loss_aux_layer_10": 0.1083984375, "loss_aux_layer_11": 0.1151123046875, "loss_aux_layer_12": 0.12451171875, "loss_aux_layer_13": 0.134033203125, "loss_aux_layer_14": 0.1484375, "loss_aux_layer_15": 0.16162109375, "loss_aux_layer_16": 0.17431640625, "loss_aux_layer_17": 0.179931640625, "loss_aux_layer_18": 0.18896484375, "loss_aux_layer_19": 0.18701171875, "loss_aux_layer_2": 0.1015625, "loss_aux_layer_20": 0.191162109375, "loss_aux_layer_21": 0.1962890625, "loss_aux_layer_22": 0.217529296875, "loss_aux_layer_23": 0.26123046875, "loss_aux_layer_3": 0.109375, "loss_aux_layer_4": 0.111083984375, "loss_aux_layer_5": 0.1126708984375, "loss_aux_layer_6": 0.1153564453125, "loss_aux_layer_7": 0.111328125, "loss_aux_layer_8": 0.1104736328125, "loss_aux_layer_9": 0.1083984375, "step": 662, "total_loss": 0.7763965129852295 }, { "epoch": 0.13126113640863196, "grad_norm": 2.3559625148773193, "learning_rate": 5e-05, "llm_loss": 0.6747047752141953, "loss": 3.2554, "loss_aux_layer_0": 0.0289306640625, "loss_aux_layer_1": 0.0943603515625, "loss_aux_layer_10": 0.104248046875, "loss_aux_layer_11": 0.10986328125, "loss_aux_layer_12": 0.118408203125, "loss_aux_layer_13": 0.127685546875, "loss_aux_layer_14": 0.143310546875, "loss_aux_layer_15": 0.1572265625, "loss_aux_layer_16": 0.171630859375, "loss_aux_layer_17": 0.17822265625, "loss_aux_layer_18": 0.18701171875, "loss_aux_layer_19": 0.188720703125, "loss_aux_layer_2": 0.0987548828125, "loss_aux_layer_20": 0.194091796875, "loss_aux_layer_21": 0.199462890625, "loss_aux_layer_22": 0.22119140625, "loss_aux_layer_23": 0.265625, "loss_aux_layer_3": 0.1063232421875, "loss_aux_layer_4": 0.1077880859375, "loss_aux_layer_5": 0.1092529296875, "loss_aux_layer_6": 0.11181640625, "loss_aux_layer_7": 0.106689453125, "loss_aux_layer_8": 0.1053466796875, "loss_aux_layer_9": 0.1036376953125, "step": 663, "total_loss": 0.8138379007577896 }, { "epoch": 0.13145911700653337, "grad_norm": 3.0903425216674805, "learning_rate": 5e-05, "llm_loss": 0.7836168855428696, "loss": 3.6772, "loss_aux_layer_0": 0.02801513671875, "loss_aux_layer_1": 0.0914306640625, "loss_aux_layer_10": 0.10302734375, "loss_aux_layer_11": 0.1092529296875, "loss_aux_layer_12": 0.1182861328125, "loss_aux_layer_13": 0.1275634765625, "loss_aux_layer_14": 0.142578125, "loss_aux_layer_15": 0.156005859375, "loss_aux_layer_16": 0.16845703125, "loss_aux_layer_17": 0.17431640625, "loss_aux_layer_18": 0.182373046875, "loss_aux_layer_19": 0.18115234375, "loss_aux_layer_2": 0.0950927734375, "loss_aux_layer_20": 0.1865234375, "loss_aux_layer_21": 0.1904296875, "loss_aux_layer_22": 0.2099609375, "loss_aux_layer_23": 0.251708984375, "loss_aux_layer_3": 0.103515625, "loss_aux_layer_4": 0.1055908203125, "loss_aux_layer_5": 0.1072998046875, "loss_aux_layer_6": 0.1107177734375, "loss_aux_layer_7": 0.105712890625, "loss_aux_layer_8": 0.1043701171875, "loss_aux_layer_9": 0.1029052734375, "step": 664, "total_loss": 0.9192977100610733 }, { "epoch": 0.13165709760443475, "grad_norm": 3.78826642036438, "learning_rate": 5e-05, "llm_loss": 0.5947081297636032, "loss": 2.9441, "loss_aux_layer_0": 0.0283203125, "loss_aux_layer_1": 0.096923828125, "loss_aux_layer_10": 0.110595703125, "loss_aux_layer_11": 0.1162109375, "loss_aux_layer_12": 0.1241455078125, "loss_aux_layer_13": 0.13232421875, "loss_aux_layer_14": 0.1455078125, "loss_aux_layer_15": 0.15771484375, "loss_aux_layer_16": 0.169677734375, "loss_aux_layer_17": 0.175537109375, "loss_aux_layer_18": 0.18408203125, "loss_aux_layer_19": 0.18310546875, "loss_aux_layer_2": 0.10595703125, "loss_aux_layer_20": 0.187744140625, "loss_aux_layer_21": 0.19287109375, "loss_aux_layer_22": 0.2138671875, "loss_aux_layer_23": 0.2587890625, "loss_aux_layer_3": 0.114501953125, "loss_aux_layer_4": 0.115966796875, "loss_aux_layer_5": 0.11767578125, "loss_aux_layer_6": 0.1199951171875, "loss_aux_layer_7": 0.1156005859375, "loss_aux_layer_8": 0.1134033203125, "loss_aux_layer_9": 0.11083984375, "step": 665, "total_loss": 0.7360132783651352 }, { "epoch": 0.13185507820233616, "grad_norm": 2.217606544494629, "learning_rate": 5e-05, "llm_loss": 0.6233906149864197, "loss": 3.0425, "loss_aux_layer_0": 0.027862548828125, "loss_aux_layer_1": 0.0921630859375, "loss_aux_layer_10": 0.10400390625, "loss_aux_layer_11": 0.1104736328125, "loss_aux_layer_12": 0.1195068359375, "loss_aux_layer_13": 0.1295166015625, "loss_aux_layer_14": 0.1435546875, "loss_aux_layer_15": 0.156494140625, "loss_aux_layer_16": 0.169921875, "loss_aux_layer_17": 0.175537109375, "loss_aux_layer_18": 0.18359375, "loss_aux_layer_19": 0.18359375, "loss_aux_layer_2": 0.095947265625, "loss_aux_layer_20": 0.188232421875, "loss_aux_layer_21": 0.192138671875, "loss_aux_layer_22": 0.21337890625, "loss_aux_layer_23": 0.25634765625, "loss_aux_layer_3": 0.1063232421875, "loss_aux_layer_4": 0.108642578125, "loss_aux_layer_5": 0.1103515625, "loss_aux_layer_6": 0.1121826171875, "loss_aux_layer_7": 0.1064453125, "loss_aux_layer_8": 0.1048583984375, "loss_aux_layer_9": 0.1029052734375, "step": 666, "total_loss": 0.7606332749128342 }, { "epoch": 0.13205305880023757, "grad_norm": 1.531315565109253, "learning_rate": 5e-05, "llm_loss": 0.613084465265274, "loss": 3.0127, "loss_aux_layer_0": 0.02813720703125, "loss_aux_layer_1": 0.096923828125, "loss_aux_layer_10": 0.1072998046875, "loss_aux_layer_11": 0.1136474609375, "loss_aux_layer_12": 0.1214599609375, "loss_aux_layer_13": 0.130859375, "loss_aux_layer_14": 0.14453125, "loss_aux_layer_15": 0.156982421875, "loss_aux_layer_16": 0.169189453125, "loss_aux_layer_17": 0.17529296875, "loss_aux_layer_18": 0.183837890625, "loss_aux_layer_19": 0.184326171875, "loss_aux_layer_2": 0.1029052734375, "loss_aux_layer_20": 0.188720703125, "loss_aux_layer_21": 0.1953125, "loss_aux_layer_22": 0.218505859375, "loss_aux_layer_23": 0.26318359375, "loss_aux_layer_3": 0.1112060546875, "loss_aux_layer_4": 0.1129150390625, "loss_aux_layer_5": 0.1141357421875, "loss_aux_layer_6": 0.1165771484375, "loss_aux_layer_7": 0.1112060546875, "loss_aux_layer_8": 0.109130859375, "loss_aux_layer_9": 0.1070556640625, "step": 667, "total_loss": 0.753186360001564 }, { "epoch": 0.13225103939813898, "grad_norm": 1.4747388362884521, "learning_rate": 5e-05, "llm_loss": 0.634234756231308, "loss": 3.091, "loss_aux_layer_0": 0.031219482421875, "loss_aux_layer_1": 0.0958251953125, "loss_aux_layer_10": 0.1055908203125, "loss_aux_layer_11": 0.1121826171875, "loss_aux_layer_12": 0.120849609375, "loss_aux_layer_13": 0.1300048828125, "loss_aux_layer_14": 0.143798828125, "loss_aux_layer_15": 0.156982421875, "loss_aux_layer_16": 0.16845703125, "loss_aux_layer_17": 0.17431640625, "loss_aux_layer_18": 0.18212890625, "loss_aux_layer_19": 0.182373046875, "loss_aux_layer_2": 0.099365234375, "loss_aux_layer_20": 0.186767578125, "loss_aux_layer_21": 0.193603515625, "loss_aux_layer_22": 0.21630859375, "loss_aux_layer_23": 0.26123046875, "loss_aux_layer_3": 0.10888671875, "loss_aux_layer_4": 0.1099853515625, "loss_aux_layer_5": 0.111083984375, "loss_aux_layer_6": 0.113525390625, "loss_aux_layer_7": 0.108154296875, "loss_aux_layer_8": 0.1064453125, "loss_aux_layer_9": 0.104736328125, "step": 668, "total_loss": 0.7727518230676651 }, { "epoch": 0.1324490199960404, "grad_norm": 1.0326229333877563, "learning_rate": 5e-05, "llm_loss": 0.6434657126665115, "loss": 3.1268, "loss_aux_layer_0": 0.028106689453125, "loss_aux_layer_1": 0.094482421875, "loss_aux_layer_10": 0.1051025390625, "loss_aux_layer_11": 0.111572265625, "loss_aux_layer_12": 0.12060546875, "loss_aux_layer_13": 0.1298828125, "loss_aux_layer_14": 0.144287109375, "loss_aux_layer_15": 0.157958984375, "loss_aux_layer_16": 0.17041015625, "loss_aux_layer_17": 0.176513671875, "loss_aux_layer_18": 0.185546875, "loss_aux_layer_19": 0.185302734375, "loss_aux_layer_2": 0.0972900390625, "loss_aux_layer_20": 0.189453125, "loss_aux_layer_21": 0.193115234375, "loss_aux_layer_22": 0.213134765625, "loss_aux_layer_23": 0.25732421875, "loss_aux_layer_3": 0.10693359375, "loss_aux_layer_4": 0.1090087890625, "loss_aux_layer_5": 0.1103515625, "loss_aux_layer_6": 0.113037109375, "loss_aux_layer_7": 0.107666015625, "loss_aux_layer_8": 0.1058349609375, "loss_aux_layer_9": 0.1041259765625, "step": 669, "total_loss": 0.7816984355449677 }, { "epoch": 0.1326470005939418, "grad_norm": 1.003474473953247, "learning_rate": 5e-05, "llm_loss": 0.657340332865715, "loss": 3.1557, "loss_aux_layer_0": 0.0281982421875, "loss_aux_layer_1": 0.0859375, "loss_aux_layer_10": 0.0977783203125, "loss_aux_layer_11": 0.1036376953125, "loss_aux_layer_12": 0.1123046875, "loss_aux_layer_13": 0.1220703125, "loss_aux_layer_14": 0.136962890625, "loss_aux_layer_15": 0.150146484375, "loss_aux_layer_16": 0.163330078125, "loss_aux_layer_17": 0.169921875, "loss_aux_layer_18": 0.1787109375, "loss_aux_layer_19": 0.17919921875, "loss_aux_layer_2": 0.090087890625, "loss_aux_layer_20": 0.184814453125, "loss_aux_layer_21": 0.189208984375, "loss_aux_layer_22": 0.209228515625, "loss_aux_layer_23": 0.25390625, "loss_aux_layer_3": 0.0986328125, "loss_aux_layer_4": 0.1005859375, "loss_aux_layer_5": 0.102783203125, "loss_aux_layer_6": 0.1055908203125, "loss_aux_layer_7": 0.10009765625, "loss_aux_layer_8": 0.0985107421875, "loss_aux_layer_9": 0.096923828125, "step": 670, "total_loss": 0.7889322489500046 }, { "epoch": 0.1328449811918432, "grad_norm": 1.6299875974655151, "learning_rate": 5e-05, "llm_loss": 0.5946532189846039, "loss": 2.9167, "loss_aux_layer_0": 0.027191162109375, "loss_aux_layer_1": 0.0889892578125, "loss_aux_layer_10": 0.1014404296875, "loss_aux_layer_11": 0.1075439453125, "loss_aux_layer_12": 0.115966796875, "loss_aux_layer_13": 0.1248779296875, "loss_aux_layer_14": 0.138671875, "loss_aux_layer_15": 0.15185546875, "loss_aux_layer_16": 0.164306640625, "loss_aux_layer_17": 0.17041015625, "loss_aux_layer_18": 0.17919921875, "loss_aux_layer_19": 0.179443359375, "loss_aux_layer_2": 0.095703125, "loss_aux_layer_20": 0.184814453125, "loss_aux_layer_21": 0.19091796875, "loss_aux_layer_22": 0.213623046875, "loss_aux_layer_23": 0.2578125, "loss_aux_layer_3": 0.1038818359375, "loss_aux_layer_4": 0.1053466796875, "loss_aux_layer_5": 0.107421875, "loss_aux_layer_6": 0.109619140625, "loss_aux_layer_7": 0.1048583984375, "loss_aux_layer_8": 0.1031494140625, "loss_aux_layer_9": 0.1014404296875, "step": 671, "total_loss": 0.7291763126850128 }, { "epoch": 0.13304296178974462, "grad_norm": 1.3910431861877441, "learning_rate": 5e-05, "llm_loss": 0.6491942405700684, "loss": 3.1346, "loss_aux_layer_0": 0.028839111328125, "loss_aux_layer_1": 0.0908203125, "loss_aux_layer_10": 0.1004638671875, "loss_aux_layer_11": 0.1065673828125, "loss_aux_layer_12": 0.115478515625, "loss_aux_layer_13": 0.125, "loss_aux_layer_14": 0.139404296875, "loss_aux_layer_15": 0.1533203125, "loss_aux_layer_16": 0.165771484375, "loss_aux_layer_17": 0.17236328125, "loss_aux_layer_18": 0.181396484375, "loss_aux_layer_19": 0.181396484375, "loss_aux_layer_2": 0.093994140625, "loss_aux_layer_20": 0.185302734375, "loss_aux_layer_21": 0.19091796875, "loss_aux_layer_22": 0.212646484375, "loss_aux_layer_23": 0.256591796875, "loss_aux_layer_3": 0.1026611328125, "loss_aux_layer_4": 0.1041259765625, "loss_aux_layer_5": 0.1058349609375, "loss_aux_layer_6": 0.1082763671875, "loss_aux_layer_7": 0.1031494140625, "loss_aux_layer_8": 0.1015625, "loss_aux_layer_9": 0.0999755859375, "step": 672, "total_loss": 0.783640667796135 }, { "epoch": 0.133240942387646, "grad_norm": 1.3138889074325562, "learning_rate": 5e-05, "llm_loss": 0.6728244721889496, "loss": 3.254, "loss_aux_layer_0": 0.03155517578125, "loss_aux_layer_1": 0.09765625, "loss_aux_layer_10": 0.1077880859375, "loss_aux_layer_11": 0.1143798828125, "loss_aux_layer_12": 0.1234130859375, "loss_aux_layer_13": 0.132080078125, "loss_aux_layer_14": 0.146484375, "loss_aux_layer_15": 0.159423828125, "loss_aux_layer_16": 0.171142578125, "loss_aux_layer_17": 0.177490234375, "loss_aux_layer_18": 0.185546875, "loss_aux_layer_19": 0.184326171875, "loss_aux_layer_2": 0.10107421875, "loss_aux_layer_20": 0.189453125, "loss_aux_layer_21": 0.194580078125, "loss_aux_layer_22": 0.216552734375, "loss_aux_layer_23": 0.259765625, "loss_aux_layer_3": 0.1107177734375, "loss_aux_layer_4": 0.11279296875, "loss_aux_layer_5": 0.1143798828125, "loss_aux_layer_6": 0.1168212890625, "loss_aux_layer_7": 0.1114501953125, "loss_aux_layer_8": 0.1094970703125, "loss_aux_layer_9": 0.107666015625, "step": 673, "total_loss": 0.8134957104921341 }, { "epoch": 0.1334389229855474, "grad_norm": 1.535970687866211, "learning_rate": 5e-05, "llm_loss": 0.6449108421802521, "loss": 3.1382, "loss_aux_layer_0": 0.028717041015625, "loss_aux_layer_1": 0.0955810546875, "loss_aux_layer_10": 0.107421875, "loss_aux_layer_11": 0.1142578125, "loss_aux_layer_12": 0.1234130859375, "loss_aux_layer_13": 0.133056640625, "loss_aux_layer_14": 0.1474609375, "loss_aux_layer_15": 0.15966796875, "loss_aux_layer_16": 0.171630859375, "loss_aux_layer_17": 0.176513671875, "loss_aux_layer_18": 0.184326171875, "loss_aux_layer_19": 0.183349609375, "loss_aux_layer_2": 0.0997314453125, "loss_aux_layer_20": 0.18701171875, "loss_aux_layer_21": 0.191650390625, "loss_aux_layer_22": 0.213134765625, "loss_aux_layer_23": 0.25634765625, "loss_aux_layer_3": 0.109375, "loss_aux_layer_4": 0.11181640625, "loss_aux_layer_5": 0.1138916015625, "loss_aux_layer_6": 0.1163330078125, "loss_aux_layer_7": 0.1104736328125, "loss_aux_layer_8": 0.1087646484375, "loss_aux_layer_9": 0.1068115234375, "step": 674, "total_loss": 0.7845478504896164 }, { "epoch": 0.13363690358344882, "grad_norm": 1.532602310180664, "learning_rate": 5e-05, "llm_loss": 0.6310196965932846, "loss": 3.0785, "loss_aux_layer_0": 0.0284423828125, "loss_aux_layer_1": 0.096435546875, "loss_aux_layer_10": 0.1072998046875, "loss_aux_layer_11": 0.1136474609375, "loss_aux_layer_12": 0.12255859375, "loss_aux_layer_13": 0.132080078125, "loss_aux_layer_14": 0.145751953125, "loss_aux_layer_15": 0.15771484375, "loss_aux_layer_16": 0.16943359375, "loss_aux_layer_17": 0.1748046875, "loss_aux_layer_18": 0.181640625, "loss_aux_layer_19": 0.18017578125, "loss_aux_layer_2": 0.099853515625, "loss_aux_layer_20": 0.185302734375, "loss_aux_layer_21": 0.189697265625, "loss_aux_layer_22": 0.211181640625, "loss_aux_layer_23": 0.25341796875, "loss_aux_layer_3": 0.110595703125, "loss_aux_layer_4": 0.1126708984375, "loss_aux_layer_5": 0.114013671875, "loss_aux_layer_6": 0.1163330078125, "loss_aux_layer_7": 0.1102294921875, "loss_aux_layer_8": 0.108154296875, "loss_aux_layer_9": 0.1063232421875, "step": 675, "total_loss": 0.7696271985769272 }, { "epoch": 0.13383488418135023, "grad_norm": 1.449202537536621, "learning_rate": 5e-05, "llm_loss": 0.5728788524866104, "loss": 2.8347, "loss_aux_layer_0": 0.027130126953125, "loss_aux_layer_1": 0.093017578125, "loss_aux_layer_10": 0.102783203125, "loss_aux_layer_11": 0.109130859375, "loss_aux_layer_12": 0.1173095703125, "loss_aux_layer_13": 0.12646484375, "loss_aux_layer_14": 0.140380859375, "loss_aux_layer_15": 0.153076171875, "loss_aux_layer_16": 0.165771484375, "loss_aux_layer_17": 0.171875, "loss_aux_layer_18": 0.180419921875, "loss_aux_layer_19": 0.180908203125, "loss_aux_layer_2": 0.096923828125, "loss_aux_layer_20": 0.1865234375, "loss_aux_layer_21": 0.191650390625, "loss_aux_layer_22": 0.213623046875, "loss_aux_layer_23": 0.25732421875, "loss_aux_layer_3": 0.10595703125, "loss_aux_layer_4": 0.1075439453125, "loss_aux_layer_5": 0.108642578125, "loss_aux_layer_6": 0.11083984375, "loss_aux_layer_7": 0.10546875, "loss_aux_layer_8": 0.1036376953125, "loss_aux_layer_9": 0.1019287109375, "step": 676, "total_loss": 0.7086841464042664 }, { "epoch": 0.13403286477925164, "grad_norm": 1.5217872858047485, "learning_rate": 5e-05, "llm_loss": 0.7003486901521683, "loss": 3.3451, "loss_aux_layer_0": 0.03021240234375, "loss_aux_layer_1": 0.0924072265625, "loss_aux_layer_10": 0.102783203125, "loss_aux_layer_11": 0.1090087890625, "loss_aux_layer_12": 0.1177978515625, "loss_aux_layer_13": 0.126708984375, "loss_aux_layer_14": 0.141357421875, "loss_aux_layer_15": 0.154052734375, "loss_aux_layer_16": 0.16650390625, "loss_aux_layer_17": 0.172607421875, "loss_aux_layer_18": 0.180908203125, "loss_aux_layer_19": 0.181640625, "loss_aux_layer_2": 0.0958251953125, "loss_aux_layer_20": 0.186279296875, "loss_aux_layer_21": 0.19189453125, "loss_aux_layer_22": 0.213134765625, "loss_aux_layer_23": 0.2578125, "loss_aux_layer_3": 0.10498046875, "loss_aux_layer_4": 0.1065673828125, "loss_aux_layer_5": 0.1083984375, "loss_aux_layer_6": 0.1109619140625, "loss_aux_layer_7": 0.1051025390625, "loss_aux_layer_8": 0.1036376953125, "loss_aux_layer_9": 0.101806640625, "step": 677, "total_loss": 0.8362824320793152 }, { "epoch": 0.13423084537715305, "grad_norm": 1.1040492057800293, "learning_rate": 5e-05, "llm_loss": 0.6607287377119064, "loss": 3.1644, "loss_aux_layer_0": 0.02764892578125, "loss_aux_layer_1": 0.086181640625, "loss_aux_layer_10": 0.097900390625, "loss_aux_layer_11": 0.103515625, "loss_aux_layer_12": 0.1123046875, "loss_aux_layer_13": 0.1217041015625, "loss_aux_layer_14": 0.13623046875, "loss_aux_layer_15": 0.1494140625, "loss_aux_layer_16": 0.161865234375, "loss_aux_layer_17": 0.16845703125, "loss_aux_layer_18": 0.17626953125, "loss_aux_layer_19": 0.17724609375, "loss_aux_layer_2": 0.0888671875, "loss_aux_layer_20": 0.18310546875, "loss_aux_layer_21": 0.1875, "loss_aux_layer_22": 0.20703125, "loss_aux_layer_23": 0.24951171875, "loss_aux_layer_3": 0.0972900390625, "loss_aux_layer_4": 0.0992431640625, "loss_aux_layer_5": 0.1005859375, "loss_aux_layer_6": 0.1033935546875, "loss_aux_layer_7": 0.0989990234375, "loss_aux_layer_8": 0.0977783203125, "loss_aux_layer_9": 0.096923828125, "step": 678, "total_loss": 0.7911066859960556 }, { "epoch": 0.13442882597505446, "grad_norm": 1.517738699913025, "learning_rate": 5e-05, "llm_loss": 0.6169129759073257, "loss": 3.0198, "loss_aux_layer_0": 0.027923583984375, "loss_aux_layer_1": 0.0958251953125, "loss_aux_layer_10": 0.10693359375, "loss_aux_layer_11": 0.11328125, "loss_aux_layer_12": 0.1214599609375, "loss_aux_layer_13": 0.130126953125, "loss_aux_layer_14": 0.143310546875, "loss_aux_layer_15": 0.155517578125, "loss_aux_layer_16": 0.16748046875, "loss_aux_layer_17": 0.1728515625, "loss_aux_layer_18": 0.180908203125, "loss_aux_layer_19": 0.180419921875, "loss_aux_layer_2": 0.1002197265625, "loss_aux_layer_20": 0.185302734375, "loss_aux_layer_21": 0.18994140625, "loss_aux_layer_22": 0.2109375, "loss_aux_layer_23": 0.2548828125, "loss_aux_layer_3": 0.1094970703125, "loss_aux_layer_4": 0.1116943359375, "loss_aux_layer_5": 0.113037109375, "loss_aux_layer_6": 0.115966796875, "loss_aux_layer_7": 0.110595703125, "loss_aux_layer_8": 0.1083984375, "loss_aux_layer_9": 0.1063232421875, "step": 679, "total_loss": 0.7549536675214767 }, { "epoch": 0.13462680657295584, "grad_norm": 1.4599467515945435, "learning_rate": 5e-05, "llm_loss": 0.5713458359241486, "loss": 2.8317, "loss_aux_layer_0": 0.02813720703125, "loss_aux_layer_1": 0.090087890625, "loss_aux_layer_10": 0.1033935546875, "loss_aux_layer_11": 0.1097412109375, "loss_aux_layer_12": 0.1182861328125, "loss_aux_layer_13": 0.12744140625, "loss_aux_layer_14": 0.1414794921875, "loss_aux_layer_15": 0.15478515625, "loss_aux_layer_16": 0.167724609375, "loss_aux_layer_17": 0.173095703125, "loss_aux_layer_18": 0.181396484375, "loss_aux_layer_19": 0.183349609375, "loss_aux_layer_2": 0.0948486328125, "loss_aux_layer_20": 0.189208984375, "loss_aux_layer_21": 0.1953125, "loss_aux_layer_22": 0.217529296875, "loss_aux_layer_23": 0.26025390625, "loss_aux_layer_3": 0.10400390625, "loss_aux_layer_4": 0.10595703125, "loss_aux_layer_5": 0.1077880859375, "loss_aux_layer_6": 0.110107421875, "loss_aux_layer_7": 0.1051025390625, "loss_aux_layer_8": 0.104248046875, "loss_aux_layer_9": 0.102783203125, "step": 680, "total_loss": 0.7079205811023712 }, { "epoch": 0.13482478717085725, "grad_norm": 1.7769699096679688, "learning_rate": 5e-05, "llm_loss": 0.6344391703605652, "loss": 3.065, "loss_aux_layer_0": 0.028839111328125, "loss_aux_layer_1": 0.088623046875, "loss_aux_layer_10": 0.0980224609375, "loss_aux_layer_11": 0.1038818359375, "loss_aux_layer_12": 0.1124267578125, "loss_aux_layer_13": 0.1214599609375, "loss_aux_layer_14": 0.135986328125, "loss_aux_layer_15": 0.1484375, "loss_aux_layer_16": 0.160888671875, "loss_aux_layer_17": 0.16748046875, "loss_aux_layer_18": 0.176513671875, "loss_aux_layer_19": 0.177490234375, "loss_aux_layer_2": 0.0928955078125, "loss_aux_layer_20": 0.183837890625, "loss_aux_layer_21": 0.1884765625, "loss_aux_layer_22": 0.2099609375, "loss_aux_layer_23": 0.25537109375, "loss_aux_layer_3": 0.100830078125, "loss_aux_layer_4": 0.1026611328125, "loss_aux_layer_5": 0.104248046875, "loss_aux_layer_6": 0.1068115234375, "loss_aux_layer_7": 0.1014404296875, "loss_aux_layer_8": 0.099365234375, "loss_aux_layer_9": 0.09765625, "step": 681, "total_loss": 0.766250804066658 }, { "epoch": 0.13502276776875866, "grad_norm": 1.0420770645141602, "learning_rate": 5e-05, "llm_loss": 0.6703674793243408, "loss": 3.2223, "loss_aux_layer_0": 0.026824951171875, "loss_aux_layer_1": 0.090576171875, "loss_aux_layer_10": 0.1025390625, "loss_aux_layer_11": 0.108642578125, "loss_aux_layer_12": 0.117431640625, "loss_aux_layer_13": 0.1270751953125, "loss_aux_layer_14": 0.140625, "loss_aux_layer_15": 0.1533203125, "loss_aux_layer_16": 0.166259765625, "loss_aux_layer_17": 0.172119140625, "loss_aux_layer_18": 0.180908203125, "loss_aux_layer_19": 0.181640625, "loss_aux_layer_2": 0.09521484375, "loss_aux_layer_20": 0.186279296875, "loss_aux_layer_21": 0.191162109375, "loss_aux_layer_22": 0.211181640625, "loss_aux_layer_23": 0.25341796875, "loss_aux_layer_3": 0.104248046875, "loss_aux_layer_4": 0.1063232421875, "loss_aux_layer_5": 0.1080322265625, "loss_aux_layer_6": 0.1104736328125, "loss_aux_layer_7": 0.10498046875, "loss_aux_layer_8": 0.1033935546875, "loss_aux_layer_9": 0.10205078125, "step": 682, "total_loss": 0.8055738508701324 }, { "epoch": 0.13522074836666007, "grad_norm": 1.2355573177337646, "learning_rate": 5e-05, "llm_loss": 0.5793436169624329, "loss": 2.8846, "loss_aux_layer_0": 0.0281982421875, "loss_aux_layer_1": 0.0970458984375, "loss_aux_layer_10": 0.109619140625, "loss_aux_layer_11": 0.1162109375, "loss_aux_layer_12": 0.125, "loss_aux_layer_13": 0.1339111328125, "loss_aux_layer_14": 0.148193359375, "loss_aux_layer_15": 0.16015625, "loss_aux_layer_16": 0.172119140625, "loss_aux_layer_17": 0.177734375, "loss_aux_layer_18": 0.1865234375, "loss_aux_layer_19": 0.18701171875, "loss_aux_layer_2": 0.102294921875, "loss_aux_layer_20": 0.19091796875, "loss_aux_layer_21": 0.196533203125, "loss_aux_layer_22": 0.2177734375, "loss_aux_layer_23": 0.26171875, "loss_aux_layer_3": 0.1121826171875, "loss_aux_layer_4": 0.1142578125, "loss_aux_layer_5": 0.11572265625, "loss_aux_layer_6": 0.117919921875, "loss_aux_layer_7": 0.1126708984375, "loss_aux_layer_8": 0.1109619140625, "loss_aux_layer_9": 0.1090087890625, "step": 683, "total_loss": 0.7211441993713379 }, { "epoch": 0.13541872896456147, "grad_norm": 1.4996932744979858, "learning_rate": 5e-05, "llm_loss": 0.6431179940700531, "loss": 3.131, "loss_aux_layer_0": 0.029754638671875, "loss_aux_layer_1": 0.093994140625, "loss_aux_layer_10": 0.1060791015625, "loss_aux_layer_11": 0.1123046875, "loss_aux_layer_12": 0.1201171875, "loss_aux_layer_13": 0.12939453125, "loss_aux_layer_14": 0.1435546875, "loss_aux_layer_15": 0.15673828125, "loss_aux_layer_16": 0.170166015625, "loss_aux_layer_17": 0.1767578125, "loss_aux_layer_18": 0.18505859375, "loss_aux_layer_19": 0.185302734375, "loss_aux_layer_2": 0.0999755859375, "loss_aux_layer_20": 0.19091796875, "loss_aux_layer_21": 0.19580078125, "loss_aux_layer_22": 0.21826171875, "loss_aux_layer_23": 0.2626953125, "loss_aux_layer_3": 0.10986328125, "loss_aux_layer_4": 0.111572265625, "loss_aux_layer_5": 0.113037109375, "loss_aux_layer_6": 0.11572265625, "loss_aux_layer_7": 0.10986328125, "loss_aux_layer_8": 0.10791015625, "loss_aux_layer_9": 0.10595703125, "step": 684, "total_loss": 0.7827376574277878 }, { "epoch": 0.13561670956246288, "grad_norm": 1.6621029376983643, "learning_rate": 5e-05, "llm_loss": 0.6013440117239952, "loss": 2.9602, "loss_aux_layer_0": 0.02886962890625, "loss_aux_layer_1": 0.093017578125, "loss_aux_layer_10": 0.10693359375, "loss_aux_layer_11": 0.113037109375, "loss_aux_layer_12": 0.1214599609375, "loss_aux_layer_13": 0.1307373046875, "loss_aux_layer_14": 0.14501953125, "loss_aux_layer_15": 0.158203125, "loss_aux_layer_16": 0.17041015625, "loss_aux_layer_17": 0.17724609375, "loss_aux_layer_18": 0.18505859375, "loss_aux_layer_19": 0.184326171875, "loss_aux_layer_2": 0.096923828125, "loss_aux_layer_20": 0.188720703125, "loss_aux_layer_21": 0.19384765625, "loss_aux_layer_22": 0.21533203125, "loss_aux_layer_23": 0.25830078125, "loss_aux_layer_3": 0.106201171875, "loss_aux_layer_4": 0.1083984375, "loss_aux_layer_5": 0.1104736328125, "loss_aux_layer_6": 0.113525390625, "loss_aux_layer_7": 0.108642578125, "loss_aux_layer_8": 0.1075439453125, "loss_aux_layer_9": 0.10595703125, "step": 685, "total_loss": 0.7400532364845276 }, { "epoch": 0.1358146901603643, "grad_norm": 1.074019432067871, "learning_rate": 5e-05, "llm_loss": 0.6731006056070328, "loss": 3.2321, "loss_aux_layer_0": 0.02752685546875, "loss_aux_layer_1": 0.0924072265625, "loss_aux_layer_10": 0.1019287109375, "loss_aux_layer_11": 0.1085205078125, "loss_aux_layer_12": 0.1171875, "loss_aux_layer_13": 0.12646484375, "loss_aux_layer_14": 0.14111328125, "loss_aux_layer_15": 0.153564453125, "loss_aux_layer_16": 0.166015625, "loss_aux_layer_17": 0.173095703125, "loss_aux_layer_18": 0.181884765625, "loss_aux_layer_19": 0.1806640625, "loss_aux_layer_2": 0.095703125, "loss_aux_layer_20": 0.185546875, "loss_aux_layer_21": 0.189697265625, "loss_aux_layer_22": 0.210693359375, "loss_aux_layer_23": 0.251953125, "loss_aux_layer_3": 0.1041259765625, "loss_aux_layer_4": 0.1058349609375, "loss_aux_layer_5": 0.1070556640625, "loss_aux_layer_6": 0.109619140625, "loss_aux_layer_7": 0.104248046875, "loss_aux_layer_8": 0.1025390625, "loss_aux_layer_9": 0.1011962890625, "step": 686, "total_loss": 0.8080331236124039 }, { "epoch": 0.1360126707582657, "grad_norm": 1.8307198286056519, "learning_rate": 5e-05, "llm_loss": 0.6639046221971512, "loss": 3.1802, "loss_aux_layer_0": 0.02862548828125, "loss_aux_layer_1": 0.083740234375, "loss_aux_layer_10": 0.09716796875, "loss_aux_layer_11": 0.102783203125, "loss_aux_layer_12": 0.11181640625, "loss_aux_layer_13": 0.121337890625, "loss_aux_layer_14": 0.1370849609375, "loss_aux_layer_15": 0.15087890625, "loss_aux_layer_16": 0.164306640625, "loss_aux_layer_17": 0.171142578125, "loss_aux_layer_18": 0.179443359375, "loss_aux_layer_19": 0.1796875, "loss_aux_layer_2": 0.0882568359375, "loss_aux_layer_20": 0.185302734375, "loss_aux_layer_21": 0.19091796875, "loss_aux_layer_22": 0.210205078125, "loss_aux_layer_23": 0.252197265625, "loss_aux_layer_3": 0.0970458984375, "loss_aux_layer_4": 0.0987548828125, "loss_aux_layer_5": 0.1005859375, "loss_aux_layer_6": 0.103271484375, "loss_aux_layer_7": 0.0986328125, "loss_aux_layer_8": 0.0975341796875, "loss_aux_layer_9": 0.09619140625, "step": 687, "total_loss": 0.795045480132103 }, { "epoch": 0.13621065135616708, "grad_norm": 1.5089659690856934, "learning_rate": 5e-05, "llm_loss": 0.6419925093650818, "loss": 3.1075, "loss_aux_layer_0": 0.027008056640625, "loss_aux_layer_1": 0.08935546875, "loss_aux_layer_10": 0.1007080078125, "loss_aux_layer_11": 0.1068115234375, "loss_aux_layer_12": 0.1153564453125, "loss_aux_layer_13": 0.125, "loss_aux_layer_14": 0.139892578125, "loss_aux_layer_15": 0.15234375, "loss_aux_layer_16": 0.16552734375, "loss_aux_layer_17": 0.17138671875, "loss_aux_layer_18": 0.18115234375, "loss_aux_layer_19": 0.181884765625, "loss_aux_layer_2": 0.0947265625, "loss_aux_layer_20": 0.187744140625, "loss_aux_layer_21": 0.193115234375, "loss_aux_layer_22": 0.215576171875, "loss_aux_layer_23": 0.2607421875, "loss_aux_layer_3": 0.1036376953125, "loss_aux_layer_4": 0.10546875, "loss_aux_layer_5": 0.10693359375, "loss_aux_layer_6": 0.109130859375, "loss_aux_layer_7": 0.103271484375, "loss_aux_layer_8": 0.1015625, "loss_aux_layer_9": 0.099853515625, "step": 688, "total_loss": 0.7768821269273758 }, { "epoch": 0.1364086319540685, "grad_norm": 0.8300684094429016, "learning_rate": 5e-05, "llm_loss": 0.6291175484657288, "loss": 3.0669, "loss_aux_layer_0": 0.028839111328125, "loss_aux_layer_1": 0.0904541015625, "loss_aux_layer_10": 0.1033935546875, "loss_aux_layer_11": 0.1099853515625, "loss_aux_layer_12": 0.1187744140625, "loss_aux_layer_13": 0.1279296875, "loss_aux_layer_14": 0.14306640625, "loss_aux_layer_15": 0.15576171875, "loss_aux_layer_16": 0.168701171875, "loss_aux_layer_17": 0.175537109375, "loss_aux_layer_18": 0.184326171875, "loss_aux_layer_19": 0.183837890625, "loss_aux_layer_2": 0.0965576171875, "loss_aux_layer_20": 0.189453125, "loss_aux_layer_21": 0.195556640625, "loss_aux_layer_22": 0.218017578125, "loss_aux_layer_23": 0.26220703125, "loss_aux_layer_3": 0.1060791015625, "loss_aux_layer_4": 0.1080322265625, "loss_aux_layer_5": 0.1094970703125, "loss_aux_layer_6": 0.112060546875, "loss_aux_layer_7": 0.106689453125, "loss_aux_layer_8": 0.10498046875, "loss_aux_layer_9": 0.10302734375, "step": 689, "total_loss": 0.7667176872491837 }, { "epoch": 0.1366066125519699, "grad_norm": 2.3195407390594482, "learning_rate": 5e-05, "llm_loss": 0.5940734446048737, "loss": 2.9073, "loss_aux_layer_0": 0.028411865234375, "loss_aux_layer_1": 0.0875244140625, "loss_aux_layer_10": 0.0986328125, "loss_aux_layer_11": 0.1046142578125, "loss_aux_layer_12": 0.1141357421875, "loss_aux_layer_13": 0.12353515625, "loss_aux_layer_14": 0.137939453125, "loss_aux_layer_15": 0.1513671875, "loss_aux_layer_16": 0.163818359375, "loss_aux_layer_17": 0.1708984375, "loss_aux_layer_18": 0.17919921875, "loss_aux_layer_19": 0.1796875, "loss_aux_layer_2": 0.092041015625, "loss_aux_layer_20": 0.1845703125, "loss_aux_layer_21": 0.189697265625, "loss_aux_layer_22": 0.210693359375, "loss_aux_layer_23": 0.2548828125, "loss_aux_layer_3": 0.1011962890625, "loss_aux_layer_4": 0.1031494140625, "loss_aux_layer_5": 0.1046142578125, "loss_aux_layer_6": 0.1068115234375, "loss_aux_layer_7": 0.1011962890625, "loss_aux_layer_8": 0.099609375, "loss_aux_layer_9": 0.09814453125, "step": 690, "total_loss": 0.7268298268318176 }, { "epoch": 0.1368045931498713, "grad_norm": 2.823559045791626, "learning_rate": 5e-05, "llm_loss": 0.6153095215559006, "loss": 2.9799, "loss_aux_layer_0": 0.02752685546875, "loss_aux_layer_1": 0.0848388671875, "loss_aux_layer_10": 0.0955810546875, "loss_aux_layer_11": 0.1014404296875, "loss_aux_layer_12": 0.1102294921875, "loss_aux_layer_13": 0.11962890625, "loss_aux_layer_14": 0.133544921875, "loss_aux_layer_15": 0.146240234375, "loss_aux_layer_16": 0.1591796875, "loss_aux_layer_17": 0.166015625, "loss_aux_layer_18": 0.1748046875, "loss_aux_layer_19": 0.1767578125, "loss_aux_layer_2": 0.0887451171875, "loss_aux_layer_20": 0.18310546875, "loss_aux_layer_21": 0.189697265625, "loss_aux_layer_22": 0.211181640625, "loss_aux_layer_23": 0.2548828125, "loss_aux_layer_3": 0.097412109375, "loss_aux_layer_4": 0.0986328125, "loss_aux_layer_5": 0.1004638671875, "loss_aux_layer_6": 0.1021728515625, "loss_aux_layer_7": 0.0980224609375, "loss_aux_layer_8": 0.09619140625, "loss_aux_layer_9": 0.09521484375, "step": 691, "total_loss": 0.7449725270271301 }, { "epoch": 0.13700257374777272, "grad_norm": 1.2640265226364136, "learning_rate": 5e-05, "llm_loss": 0.6805850118398666, "loss": 3.2572, "loss_aux_layer_0": 0.027984619140625, "loss_aux_layer_1": 0.089111328125, "loss_aux_layer_10": 0.100341796875, "loss_aux_layer_11": 0.1075439453125, "loss_aux_layer_12": 0.116455078125, "loss_aux_layer_13": 0.125732421875, "loss_aux_layer_14": 0.139892578125, "loss_aux_layer_15": 0.15283203125, "loss_aux_layer_16": 0.164794921875, "loss_aux_layer_17": 0.17138671875, "loss_aux_layer_18": 0.18017578125, "loss_aux_layer_19": 0.180419921875, "loss_aux_layer_2": 0.092529296875, "loss_aux_layer_20": 0.185546875, "loss_aux_layer_21": 0.18994140625, "loss_aux_layer_22": 0.2109375, "loss_aux_layer_23": 0.25439453125, "loss_aux_layer_3": 0.1014404296875, "loss_aux_layer_4": 0.103271484375, "loss_aux_layer_5": 0.1046142578125, "loss_aux_layer_6": 0.107666015625, "loss_aux_layer_7": 0.1021728515625, "loss_aux_layer_8": 0.1007080078125, "loss_aux_layer_9": 0.099365234375, "step": 692, "total_loss": 0.814303457736969 }, { "epoch": 0.13720055434567413, "grad_norm": 3.0315468311309814, "learning_rate": 5e-05, "llm_loss": 0.7198304384946823, "loss": 3.4297, "loss_aux_layer_0": 0.029510498046875, "loss_aux_layer_1": 0.0921630859375, "loss_aux_layer_10": 0.1043701171875, "loss_aux_layer_11": 0.110595703125, "loss_aux_layer_12": 0.119384765625, "loss_aux_layer_13": 0.1282958984375, "loss_aux_layer_14": 0.14306640625, "loss_aux_layer_15": 0.155517578125, "loss_aux_layer_16": 0.167724609375, "loss_aux_layer_17": 0.174072265625, "loss_aux_layer_18": 0.18310546875, "loss_aux_layer_19": 0.18408203125, "loss_aux_layer_2": 0.097412109375, "loss_aux_layer_20": 0.18896484375, "loss_aux_layer_21": 0.19482421875, "loss_aux_layer_22": 0.21533203125, "loss_aux_layer_23": 0.259521484375, "loss_aux_layer_3": 0.1068115234375, "loss_aux_layer_4": 0.1087646484375, "loss_aux_layer_5": 0.1103515625, "loss_aux_layer_6": 0.112548828125, "loss_aux_layer_7": 0.1072998046875, "loss_aux_layer_8": 0.105224609375, "loss_aux_layer_9": 0.1038818359375, "step": 693, "total_loss": 0.8574308604001999 }, { "epoch": 0.13739853494357554, "grad_norm": 3.5560314655303955, "learning_rate": 5e-05, "llm_loss": 0.6542726755142212, "loss": 3.1636, "loss_aux_layer_0": 0.027252197265625, "loss_aux_layer_1": 0.091064453125, "loss_aux_layer_10": 0.1025390625, "loss_aux_layer_11": 0.10888671875, "loss_aux_layer_12": 0.1181640625, "loss_aux_layer_13": 0.1282958984375, "loss_aux_layer_14": 0.142333984375, "loss_aux_layer_15": 0.1552734375, "loss_aux_layer_16": 0.16748046875, "loss_aux_layer_17": 0.173828125, "loss_aux_layer_18": 0.18212890625, "loss_aux_layer_19": 0.181640625, "loss_aux_layer_2": 0.0994873046875, "loss_aux_layer_20": 0.186767578125, "loss_aux_layer_21": 0.191162109375, "loss_aux_layer_22": 0.21142578125, "loss_aux_layer_23": 0.255126953125, "loss_aux_layer_3": 0.1092529296875, "loss_aux_layer_4": 0.1103515625, "loss_aux_layer_5": 0.11181640625, "loss_aux_layer_6": 0.1124267578125, "loss_aux_layer_7": 0.106201171875, "loss_aux_layer_8": 0.10400390625, "loss_aux_layer_9": 0.1021728515625, "step": 694, "total_loss": 0.7908935099840164 }, { "epoch": 0.13759651554147692, "grad_norm": 1.5417898893356323, "learning_rate": 5e-05, "llm_loss": 0.6866757571697235, "loss": 3.2836, "loss_aux_layer_0": 0.02783203125, "loss_aux_layer_1": 0.091552734375, "loss_aux_layer_10": 0.102783203125, "loss_aux_layer_11": 0.109375, "loss_aux_layer_12": 0.1177978515625, "loss_aux_layer_13": 0.1256103515625, "loss_aux_layer_14": 0.13916015625, "loss_aux_layer_15": 0.1513671875, "loss_aux_layer_16": 0.162841796875, "loss_aux_layer_17": 0.1689453125, "loss_aux_layer_18": 0.17626953125, "loss_aux_layer_19": 0.176025390625, "loss_aux_layer_2": 0.0968017578125, "loss_aux_layer_20": 0.181396484375, "loss_aux_layer_21": 0.18701171875, "loss_aux_layer_22": 0.207763671875, "loss_aux_layer_23": 0.25048828125, "loss_aux_layer_3": 0.1063232421875, "loss_aux_layer_4": 0.1082763671875, "loss_aux_layer_5": 0.109619140625, "loss_aux_layer_6": 0.111572265625, "loss_aux_layer_7": 0.10595703125, "loss_aux_layer_8": 0.1041259765625, "loss_aux_layer_9": 0.10205078125, "step": 695, "total_loss": 0.8208998590707779 }, { "epoch": 0.13779449613937833, "grad_norm": 3.312436103820801, "learning_rate": 5e-05, "llm_loss": 0.6379862576723099, "loss": 3.0924, "loss_aux_layer_0": 0.02740478515625, "loss_aux_layer_1": 0.092041015625, "loss_aux_layer_10": 0.1019287109375, "loss_aux_layer_11": 0.108154296875, "loss_aux_layer_12": 0.1168212890625, "loss_aux_layer_13": 0.126220703125, "loss_aux_layer_14": 0.13916015625, "loss_aux_layer_15": 0.15185546875, "loss_aux_layer_16": 0.164306640625, "loss_aux_layer_17": 0.1708984375, "loss_aux_layer_18": 0.178955078125, "loss_aux_layer_19": 0.178955078125, "loss_aux_layer_2": 0.0982666015625, "loss_aux_layer_20": 0.183349609375, "loss_aux_layer_21": 0.188232421875, "loss_aux_layer_22": 0.21044921875, "loss_aux_layer_23": 0.2529296875, "loss_aux_layer_3": 0.1083984375, "loss_aux_layer_4": 0.1097412109375, "loss_aux_layer_5": 0.1107177734375, "loss_aux_layer_6": 0.112060546875, "loss_aux_layer_7": 0.106201171875, "loss_aux_layer_8": 0.103515625, "loss_aux_layer_9": 0.101318359375, "step": 696, "total_loss": 0.7731017470359802 }, { "epoch": 0.13799247673727974, "grad_norm": 4.0765533447265625, "learning_rate": 5e-05, "llm_loss": 0.6081463843584061, "loss": 2.9709, "loss_aux_layer_0": 0.027587890625, "loss_aux_layer_1": 0.0860595703125, "loss_aux_layer_10": 0.1004638671875, "loss_aux_layer_11": 0.1064453125, "loss_aux_layer_12": 0.1148681640625, "loss_aux_layer_13": 0.1241455078125, "loss_aux_layer_14": 0.138671875, "loss_aux_layer_15": 0.152099609375, "loss_aux_layer_16": 0.165283203125, "loss_aux_layer_17": 0.17236328125, "loss_aux_layer_18": 0.181640625, "loss_aux_layer_19": 0.181884765625, "loss_aux_layer_2": 0.09521484375, "loss_aux_layer_20": 0.187255859375, "loss_aux_layer_21": 0.192626953125, "loss_aux_layer_22": 0.214111328125, "loss_aux_layer_23": 0.2587890625, "loss_aux_layer_3": 0.10400390625, "loss_aux_layer_4": 0.1051025390625, "loss_aux_layer_5": 0.106689453125, "loss_aux_layer_6": 0.1083984375, "loss_aux_layer_7": 0.1041259765625, "loss_aux_layer_8": 0.102294921875, "loss_aux_layer_9": 0.1007080078125, "step": 697, "total_loss": 0.7427168637514114 }, { "epoch": 0.13819045733518115, "grad_norm": 2.084163188934326, "learning_rate": 5e-05, "llm_loss": 0.6832101047039032, "loss": 3.2918, "loss_aux_layer_0": 0.0302734375, "loss_aux_layer_1": 0.09521484375, "loss_aux_layer_10": 0.10693359375, "loss_aux_layer_11": 0.113525390625, "loss_aux_layer_12": 0.122314453125, "loss_aux_layer_13": 0.1318359375, "loss_aux_layer_14": 0.14599609375, "loss_aux_layer_15": 0.158447265625, "loss_aux_layer_16": 0.169921875, "loss_aux_layer_17": 0.176025390625, "loss_aux_layer_18": 0.1845703125, "loss_aux_layer_19": 0.183837890625, "loss_aux_layer_2": 0.0989990234375, "loss_aux_layer_20": 0.188720703125, "loss_aux_layer_21": 0.193603515625, "loss_aux_layer_22": 0.215087890625, "loss_aux_layer_23": 0.2587890625, "loss_aux_layer_3": 0.1092529296875, "loss_aux_layer_4": 0.1116943359375, "loss_aux_layer_5": 0.1136474609375, "loss_aux_layer_6": 0.11669921875, "loss_aux_layer_7": 0.111328125, "loss_aux_layer_8": 0.109130859375, "loss_aux_layer_9": 0.106689453125, "step": 698, "total_loss": 0.8229391276836395 }, { "epoch": 0.13838843793308256, "grad_norm": 2.9603044986724854, "learning_rate": 5e-05, "llm_loss": 0.6694342792034149, "loss": 3.2423, "loss_aux_layer_0": 0.031005859375, "loss_aux_layer_1": 0.0977783203125, "loss_aux_layer_10": 0.1075439453125, "loss_aux_layer_11": 0.1143798828125, "loss_aux_layer_12": 0.123046875, "loss_aux_layer_13": 0.13232421875, "loss_aux_layer_14": 0.146240234375, "loss_aux_layer_15": 0.15869140625, "loss_aux_layer_16": 0.1708984375, "loss_aux_layer_17": 0.177978515625, "loss_aux_layer_18": 0.186767578125, "loss_aux_layer_19": 0.18603515625, "loss_aux_layer_2": 0.10400390625, "loss_aux_layer_20": 0.190673828125, "loss_aux_layer_21": 0.1953125, "loss_aux_layer_22": 0.21728515625, "loss_aux_layer_23": 0.2607421875, "loss_aux_layer_3": 0.112060546875, "loss_aux_layer_4": 0.114013671875, "loss_aux_layer_5": 0.11474609375, "loss_aux_layer_6": 0.1171875, "loss_aux_layer_7": 0.111572265625, "loss_aux_layer_8": 0.1094970703125, "loss_aux_layer_9": 0.10693359375, "step": 699, "total_loss": 0.8105761706829071 }, { "epoch": 0.13858641853098397, "grad_norm": 2.6405045986175537, "learning_rate": 5e-05, "llm_loss": 0.6001238077878952, "loss": 2.9408, "loss_aux_layer_0": 0.028045654296875, "loss_aux_layer_1": 0.0914306640625, "loss_aux_layer_10": 0.1026611328125, "loss_aux_layer_11": 0.108154296875, "loss_aux_layer_12": 0.1163330078125, "loss_aux_layer_13": 0.124755859375, "loss_aux_layer_14": 0.138427734375, "loss_aux_layer_15": 0.151123046875, "loss_aux_layer_16": 0.162841796875, "loss_aux_layer_17": 0.169189453125, "loss_aux_layer_18": 0.1787109375, "loss_aux_layer_19": 0.1796875, "loss_aux_layer_2": 0.096923828125, "loss_aux_layer_20": 0.185546875, "loss_aux_layer_21": 0.191162109375, "loss_aux_layer_22": 0.212890625, "loss_aux_layer_23": 0.25732421875, "loss_aux_layer_3": 0.1053466796875, "loss_aux_layer_4": 0.10693359375, "loss_aux_layer_5": 0.108642578125, "loss_aux_layer_6": 0.111328125, "loss_aux_layer_7": 0.1068115234375, "loss_aux_layer_8": 0.1048583984375, "loss_aux_layer_9": 0.1029052734375, "step": 700, "total_loss": 0.735211580991745 }, { "epoch": 0.13878439912888538, "grad_norm": 1.825014352798462, "learning_rate": 5e-05, "llm_loss": 0.6242355704307556, "loss": 3.0106, "loss_aux_layer_0": 0.027099609375, "loss_aux_layer_1": 0.083740234375, "loss_aux_layer_10": 0.094482421875, "loss_aux_layer_11": 0.1002197265625, "loss_aux_layer_12": 0.1085205078125, "loss_aux_layer_13": 0.116943359375, "loss_aux_layer_14": 0.1312255859375, "loss_aux_layer_15": 0.14501953125, "loss_aux_layer_16": 0.15771484375, "loss_aux_layer_17": 0.1650390625, "loss_aux_layer_18": 0.174560546875, "loss_aux_layer_19": 0.17578125, "loss_aux_layer_2": 0.0882568359375, "loss_aux_layer_20": 0.18115234375, "loss_aux_layer_21": 0.1875, "loss_aux_layer_22": 0.208740234375, "loss_aux_layer_23": 0.2529296875, "loss_aux_layer_3": 0.0966796875, "loss_aux_layer_4": 0.0982666015625, "loss_aux_layer_5": 0.099853515625, "loss_aux_layer_6": 0.1025390625, "loss_aux_layer_7": 0.096923828125, "loss_aux_layer_8": 0.0953369140625, "loss_aux_layer_9": 0.093994140625, "step": 701, "total_loss": 0.752656564116478 }, { "epoch": 0.1389823797267868, "grad_norm": 1.4806269407272339, "learning_rate": 5e-05, "llm_loss": 0.6286488920450211, "loss": 3.0521, "loss_aux_layer_0": 0.029144287109375, "loss_aux_layer_1": 0.0902099609375, "loss_aux_layer_10": 0.10205078125, "loss_aux_layer_11": 0.108154296875, "loss_aux_layer_12": 0.1168212890625, "loss_aux_layer_13": 0.125732421875, "loss_aux_layer_14": 0.139404296875, "loss_aux_layer_15": 0.15234375, "loss_aux_layer_16": 0.164794921875, "loss_aux_layer_17": 0.170654296875, "loss_aux_layer_18": 0.1787109375, "loss_aux_layer_19": 0.17822265625, "loss_aux_layer_2": 0.0953369140625, "loss_aux_layer_20": 0.18359375, "loss_aux_layer_21": 0.188232421875, "loss_aux_layer_22": 0.208984375, "loss_aux_layer_23": 0.25341796875, "loss_aux_layer_3": 0.10400390625, "loss_aux_layer_4": 0.1060791015625, "loss_aux_layer_5": 0.108154296875, "loss_aux_layer_6": 0.110595703125, "loss_aux_layer_7": 0.10546875, "loss_aux_layer_8": 0.10400390625, "loss_aux_layer_9": 0.1016845703125, "step": 702, "total_loss": 0.7630160301923752 }, { "epoch": 0.13918036032468817, "grad_norm": 1.9748610258102417, "learning_rate": 5e-05, "llm_loss": 0.608514666557312, "loss": 2.9659, "loss_aux_layer_0": 0.030731201171875, "loss_aux_layer_1": 0.08984375, "loss_aux_layer_10": 0.0992431640625, "loss_aux_layer_11": 0.105224609375, "loss_aux_layer_12": 0.1138916015625, "loss_aux_layer_13": 0.123291015625, "loss_aux_layer_14": 0.13720703125, "loss_aux_layer_15": 0.15087890625, "loss_aux_layer_16": 0.16259765625, "loss_aux_layer_17": 0.169189453125, "loss_aux_layer_18": 0.177734375, "loss_aux_layer_19": 0.17822265625, "loss_aux_layer_2": 0.0933837890625, "loss_aux_layer_20": 0.183837890625, "loss_aux_layer_21": 0.189453125, "loss_aux_layer_22": 0.211181640625, "loss_aux_layer_23": 0.2548828125, "loss_aux_layer_3": 0.1015625, "loss_aux_layer_4": 0.1031494140625, "loss_aux_layer_5": 0.1046142578125, "loss_aux_layer_6": 0.107421875, "loss_aux_layer_7": 0.1026611328125, "loss_aux_layer_8": 0.1009521484375, "loss_aux_layer_9": 0.099365234375, "step": 703, "total_loss": 0.741471990942955 }, { "epoch": 0.13937834092258958, "grad_norm": 1.6273707151412964, "learning_rate": 5e-05, "llm_loss": 0.7544195652008057, "loss": 3.5392, "loss_aux_layer_0": 0.0267333984375, "loss_aux_layer_1": 0.086181640625, "loss_aux_layer_10": 0.09912109375, "loss_aux_layer_11": 0.10498046875, "loss_aux_layer_12": 0.1131591796875, "loss_aux_layer_13": 0.1221923828125, "loss_aux_layer_14": 0.135986328125, "loss_aux_layer_15": 0.148193359375, "loss_aux_layer_16": 0.15966796875, "loss_aux_layer_17": 0.166259765625, "loss_aux_layer_18": 0.17431640625, "loss_aux_layer_19": 0.174560546875, "loss_aux_layer_2": 0.0914306640625, "loss_aux_layer_20": 0.1796875, "loss_aux_layer_21": 0.18359375, "loss_aux_layer_22": 0.2041015625, "loss_aux_layer_23": 0.246337890625, "loss_aux_layer_3": 0.10009765625, "loss_aux_layer_4": 0.102294921875, "loss_aux_layer_5": 0.10400390625, "loss_aux_layer_6": 0.1065673828125, "loss_aux_layer_7": 0.1014404296875, "loss_aux_layer_8": 0.10009765625, "loss_aux_layer_9": 0.0982666015625, "step": 704, "total_loss": 0.8847958743572235 }, { "epoch": 0.139576321520491, "grad_norm": 1.3343549966812134, "learning_rate": 5e-05, "llm_loss": 0.7221210300922394, "loss": 3.4162, "loss_aux_layer_0": 0.032318115234375, "loss_aux_layer_1": 0.0904541015625, "loss_aux_layer_10": 0.0987548828125, "loss_aux_layer_11": 0.1046142578125, "loss_aux_layer_12": 0.11328125, "loss_aux_layer_13": 0.1219482421875, "loss_aux_layer_14": 0.13623046875, "loss_aux_layer_15": 0.149169921875, "loss_aux_layer_16": 0.1611328125, "loss_aux_layer_17": 0.16796875, "loss_aux_layer_18": 0.176513671875, "loss_aux_layer_19": 0.176513671875, "loss_aux_layer_2": 0.092529296875, "loss_aux_layer_20": 0.18212890625, "loss_aux_layer_21": 0.187255859375, "loss_aux_layer_22": 0.20947265625, "loss_aux_layer_23": 0.251708984375, "loss_aux_layer_3": 0.1011962890625, "loss_aux_layer_4": 0.102783203125, "loss_aux_layer_5": 0.10400390625, "loss_aux_layer_6": 0.106689453125, "loss_aux_layer_7": 0.1016845703125, "loss_aux_layer_8": 0.0999755859375, "loss_aux_layer_9": 0.09814453125, "step": 705, "total_loss": 0.8540408462285995 }, { "epoch": 0.1397743021183924, "grad_norm": 1.4931825399398804, "learning_rate": 5e-05, "llm_loss": 0.6630760282278061, "loss": 3.2049, "loss_aux_layer_0": 0.027801513671875, "loss_aux_layer_1": 0.0936279296875, "loss_aux_layer_10": 0.1064453125, "loss_aux_layer_11": 0.112548828125, "loss_aux_layer_12": 0.12060546875, "loss_aux_layer_13": 0.12890625, "loss_aux_layer_14": 0.142333984375, "loss_aux_layer_15": 0.154052734375, "loss_aux_layer_16": 0.165283203125, "loss_aux_layer_17": 0.171630859375, "loss_aux_layer_18": 0.180419921875, "loss_aux_layer_19": 0.180908203125, "loss_aux_layer_2": 0.1007080078125, "loss_aux_layer_20": 0.185546875, "loss_aux_layer_21": 0.19140625, "loss_aux_layer_22": 0.214111328125, "loss_aux_layer_23": 0.257080078125, "loss_aux_layer_3": 0.11083984375, "loss_aux_layer_4": 0.113037109375, "loss_aux_layer_5": 0.114501953125, "loss_aux_layer_6": 0.1168212890625, "loss_aux_layer_7": 0.1107177734375, "loss_aux_layer_8": 0.1082763671875, "loss_aux_layer_9": 0.1063232421875, "step": 706, "total_loss": 0.8012165576219559 }, { "epoch": 0.1399722827162938, "grad_norm": 1.2747743129730225, "learning_rate": 5e-05, "llm_loss": 0.6786017119884491, "loss": 3.2713, "loss_aux_layer_0": 0.031097412109375, "loss_aux_layer_1": 0.0921630859375, "loss_aux_layer_10": 0.1053466796875, "loss_aux_layer_11": 0.1119384765625, "loss_aux_layer_12": 0.12158203125, "loss_aux_layer_13": 0.130615234375, "loss_aux_layer_14": 0.146240234375, "loss_aux_layer_15": 0.159423828125, "loss_aux_layer_16": 0.171875, "loss_aux_layer_17": 0.178466796875, "loss_aux_layer_18": 0.186767578125, "loss_aux_layer_19": 0.187744140625, "loss_aux_layer_2": 0.095947265625, "loss_aux_layer_20": 0.192138671875, "loss_aux_layer_21": 0.19580078125, "loss_aux_layer_22": 0.217041015625, "loss_aux_layer_23": 0.2607421875, "loss_aux_layer_3": 0.1063232421875, "loss_aux_layer_4": 0.1085205078125, "loss_aux_layer_5": 0.1103515625, "loss_aux_layer_6": 0.1129150390625, "loss_aux_layer_7": 0.1075439453125, "loss_aux_layer_8": 0.10595703125, "loss_aux_layer_9": 0.1048583984375, "step": 707, "total_loss": 0.8178270608186722 }, { "epoch": 0.14017026331419521, "grad_norm": 1.0848472118377686, "learning_rate": 5e-05, "llm_loss": 0.702012374997139, "loss": 3.3439, "loss_aux_layer_0": 0.02679443359375, "loss_aux_layer_1": 0.0885009765625, "loss_aux_layer_10": 0.1016845703125, "loss_aux_layer_11": 0.1080322265625, "loss_aux_layer_12": 0.1162109375, "loss_aux_layer_13": 0.125, "loss_aux_layer_14": 0.137939453125, "loss_aux_layer_15": 0.151123046875, "loss_aux_layer_16": 0.162841796875, "loss_aux_layer_17": 0.169921875, "loss_aux_layer_18": 0.178466796875, "loss_aux_layer_19": 0.178466796875, "loss_aux_layer_2": 0.0953369140625, "loss_aux_layer_20": 0.183837890625, "loss_aux_layer_21": 0.18896484375, "loss_aux_layer_22": 0.209228515625, "loss_aux_layer_23": 0.251708984375, "loss_aux_layer_3": 0.1044921875, "loss_aux_layer_4": 0.1070556640625, "loss_aux_layer_5": 0.1087646484375, "loss_aux_layer_6": 0.1112060546875, "loss_aux_layer_7": 0.1053466796875, "loss_aux_layer_8": 0.1033935546875, "loss_aux_layer_9": 0.1014404296875, "step": 708, "total_loss": 0.8359864503145218 }, { "epoch": 0.14036824391209662, "grad_norm": 1.5679978132247925, "learning_rate": 5e-05, "llm_loss": 0.5931710749864578, "loss": 2.9269, "loss_aux_layer_0": 0.029388427734375, "loss_aux_layer_1": 0.094970703125, "loss_aux_layer_10": 0.1064453125, "loss_aux_layer_11": 0.1131591796875, "loss_aux_layer_12": 0.1220703125, "loss_aux_layer_13": 0.13134765625, "loss_aux_layer_14": 0.14501953125, "loss_aux_layer_15": 0.1572265625, "loss_aux_layer_16": 0.16943359375, "loss_aux_layer_17": 0.17431640625, "loss_aux_layer_18": 0.18310546875, "loss_aux_layer_19": 0.181884765625, "loss_aux_layer_2": 0.1005859375, "loss_aux_layer_20": 0.186279296875, "loss_aux_layer_21": 0.19091796875, "loss_aux_layer_22": 0.2119140625, "loss_aux_layer_23": 0.25390625, "loss_aux_layer_3": 0.1102294921875, "loss_aux_layer_4": 0.11181640625, "loss_aux_layer_5": 0.113037109375, "loss_aux_layer_6": 0.1151123046875, "loss_aux_layer_7": 0.1092529296875, "loss_aux_layer_8": 0.107666015625, "loss_aux_layer_9": 0.1058349609375, "step": 709, "total_loss": 0.7317240834236145 }, { "epoch": 0.14056622450999803, "grad_norm": 1.277828574180603, "learning_rate": 5e-05, "llm_loss": 0.6386448293924332, "loss": 3.0969, "loss_aux_layer_0": 0.027557373046875, "loss_aux_layer_1": 0.0880126953125, "loss_aux_layer_10": 0.10205078125, "loss_aux_layer_11": 0.1083984375, "loss_aux_layer_12": 0.1171875, "loss_aux_layer_13": 0.126953125, "loss_aux_layer_14": 0.14111328125, "loss_aux_layer_15": 0.154296875, "loss_aux_layer_16": 0.1669921875, "loss_aux_layer_17": 0.173095703125, "loss_aux_layer_18": 0.18212890625, "loss_aux_layer_19": 0.18310546875, "loss_aux_layer_2": 0.0938720703125, "loss_aux_layer_20": 0.188720703125, "loss_aux_layer_21": 0.19384765625, "loss_aux_layer_22": 0.2158203125, "loss_aux_layer_23": 0.259765625, "loss_aux_layer_3": 0.1029052734375, "loss_aux_layer_4": 0.1048583984375, "loss_aux_layer_5": 0.1065673828125, "loss_aux_layer_6": 0.1094970703125, "loss_aux_layer_7": 0.104248046875, "loss_aux_layer_8": 0.102783203125, "loss_aux_layer_9": 0.10107421875, "step": 710, "total_loss": 0.7742280215024948 }, { "epoch": 0.14076420510789941, "grad_norm": 1.3916852474212646, "learning_rate": 5e-05, "llm_loss": 0.6097458004951477, "loss": 2.9716, "loss_aux_layer_0": 0.02899169921875, "loss_aux_layer_1": 0.090087890625, "loss_aux_layer_10": 0.10009765625, "loss_aux_layer_11": 0.106201171875, "loss_aux_layer_12": 0.1148681640625, "loss_aux_layer_13": 0.12353515625, "loss_aux_layer_14": 0.137451171875, "loss_aux_layer_15": 0.14990234375, "loss_aux_layer_16": 0.161865234375, "loss_aux_layer_17": 0.16845703125, "loss_aux_layer_18": 0.1767578125, "loss_aux_layer_19": 0.177490234375, "loss_aux_layer_2": 0.0948486328125, "loss_aux_layer_20": 0.182861328125, "loss_aux_layer_21": 0.18896484375, "loss_aux_layer_22": 0.210205078125, "loss_aux_layer_23": 0.2529296875, "loss_aux_layer_3": 0.1036376953125, "loss_aux_layer_4": 0.1055908203125, "loss_aux_layer_5": 0.1070556640625, "loss_aux_layer_6": 0.1094970703125, "loss_aux_layer_7": 0.1038818359375, "loss_aux_layer_8": 0.10205078125, "loss_aux_layer_9": 0.099609375, "step": 711, "total_loss": 0.7428998947143555 }, { "epoch": 0.14096218570580082, "grad_norm": 1.8731344938278198, "learning_rate": 5e-05, "llm_loss": 0.6738575398921967, "loss": 3.2272, "loss_aux_layer_0": 0.029266357421875, "loss_aux_layer_1": 0.088134765625, "loss_aux_layer_10": 0.1009521484375, "loss_aux_layer_11": 0.10693359375, "loss_aux_layer_12": 0.1158447265625, "loss_aux_layer_13": 0.124755859375, "loss_aux_layer_14": 0.137939453125, "loss_aux_layer_15": 0.150146484375, "loss_aux_layer_16": 0.162109375, "loss_aux_layer_17": 0.16943359375, "loss_aux_layer_18": 0.1767578125, "loss_aux_layer_19": 0.176513671875, "loss_aux_layer_2": 0.0938720703125, "loss_aux_layer_20": 0.18212890625, "loss_aux_layer_21": 0.186767578125, "loss_aux_layer_22": 0.209228515625, "loss_aux_layer_23": 0.251953125, "loss_aux_layer_3": 0.1026611328125, "loss_aux_layer_4": 0.104736328125, "loss_aux_layer_5": 0.1060791015625, "loss_aux_layer_6": 0.109130859375, "loss_aux_layer_7": 0.1036376953125, "loss_aux_layer_8": 0.1016845703125, "loss_aux_layer_9": 0.099853515625, "step": 712, "total_loss": 0.8067968785762787 }, { "epoch": 0.14116016630370223, "grad_norm": 0.8475925326347351, "learning_rate": 5e-05, "llm_loss": 0.6235338449478149, "loss": 3.0387, "loss_aux_layer_0": 0.027374267578125, "loss_aux_layer_1": 0.089111328125, "loss_aux_layer_10": 0.10302734375, "loss_aux_layer_11": 0.109375, "loss_aux_layer_12": 0.118408203125, "loss_aux_layer_13": 0.1275634765625, "loss_aux_layer_14": 0.14111328125, "loss_aux_layer_15": 0.1533203125, "loss_aux_layer_16": 0.165771484375, "loss_aux_layer_17": 0.172119140625, "loss_aux_layer_18": 0.18115234375, "loss_aux_layer_19": 0.182861328125, "loss_aux_layer_2": 0.095703125, "loss_aux_layer_20": 0.18798828125, "loss_aux_layer_21": 0.193603515625, "loss_aux_layer_22": 0.21484375, "loss_aux_layer_23": 0.258544921875, "loss_aux_layer_3": 0.105224609375, "loss_aux_layer_4": 0.1072998046875, "loss_aux_layer_5": 0.1087646484375, "loss_aux_layer_6": 0.1114501953125, "loss_aux_layer_7": 0.10595703125, "loss_aux_layer_8": 0.104248046875, "loss_aux_layer_9": 0.1024169921875, "step": 713, "total_loss": 0.7596784830093384 }, { "epoch": 0.14135814690160364, "grad_norm": 2.0965240001678467, "learning_rate": 5e-05, "llm_loss": 0.6866181939840317, "loss": 3.2917, "loss_aux_layer_0": 0.02874755859375, "loss_aux_layer_1": 0.09033203125, "loss_aux_layer_10": 0.101806640625, "loss_aux_layer_11": 0.1083984375, "loss_aux_layer_12": 0.1175537109375, "loss_aux_layer_13": 0.126708984375, "loss_aux_layer_14": 0.141357421875, "loss_aux_layer_15": 0.15478515625, "loss_aux_layer_16": 0.167724609375, "loss_aux_layer_17": 0.17529296875, "loss_aux_layer_18": 0.184326171875, "loss_aux_layer_19": 0.184814453125, "loss_aux_layer_2": 0.0960693359375, "loss_aux_layer_20": 0.190185546875, "loss_aux_layer_21": 0.1943359375, "loss_aux_layer_22": 0.214599609375, "loss_aux_layer_23": 0.2578125, "loss_aux_layer_3": 0.1041259765625, "loss_aux_layer_4": 0.1060791015625, "loss_aux_layer_5": 0.107421875, "loss_aux_layer_6": 0.1097412109375, "loss_aux_layer_7": 0.1043701171875, "loss_aux_layer_8": 0.1029052734375, "loss_aux_layer_9": 0.101318359375, "step": 714, "total_loss": 0.8229356110095978 }, { "epoch": 0.14155612749950505, "grad_norm": 1.1787402629852295, "learning_rate": 5e-05, "llm_loss": 0.608831375837326, "loss": 2.9624, "loss_aux_layer_0": 0.0296630859375, "loss_aux_layer_1": 0.0882568359375, "loss_aux_layer_10": 0.098876953125, "loss_aux_layer_11": 0.1053466796875, "loss_aux_layer_12": 0.1142578125, "loss_aux_layer_13": 0.12353515625, "loss_aux_layer_14": 0.1376953125, "loss_aux_layer_15": 0.14990234375, "loss_aux_layer_16": 0.162109375, "loss_aux_layer_17": 0.16845703125, "loss_aux_layer_18": 0.177001953125, "loss_aux_layer_19": 0.176513671875, "loss_aux_layer_2": 0.091064453125, "loss_aux_layer_20": 0.18212890625, "loss_aux_layer_21": 0.1875, "loss_aux_layer_22": 0.207763671875, "loss_aux_layer_23": 0.250244140625, "loss_aux_layer_3": 0.0994873046875, "loss_aux_layer_4": 0.10205078125, "loss_aux_layer_5": 0.1036376953125, "loss_aux_layer_6": 0.1065673828125, "loss_aux_layer_7": 0.101318359375, "loss_aux_layer_8": 0.0997314453125, "loss_aux_layer_9": 0.09814453125, "step": 715, "total_loss": 0.7405913472175598 }, { "epoch": 0.14175410809740646, "grad_norm": 1.7459710836410522, "learning_rate": 5e-05, "llm_loss": 0.6554980725049973, "loss": 3.1673, "loss_aux_layer_0": 0.02740478515625, "loss_aux_layer_1": 0.0909423828125, "loss_aux_layer_10": 0.1038818359375, "loss_aux_layer_11": 0.1103515625, "loss_aux_layer_12": 0.11865234375, "loss_aux_layer_13": 0.1280517578125, "loss_aux_layer_14": 0.141357421875, "loss_aux_layer_15": 0.153564453125, "loss_aux_layer_16": 0.165771484375, "loss_aux_layer_17": 0.172119140625, "loss_aux_layer_18": 0.1806640625, "loss_aux_layer_19": 0.1806640625, "loss_aux_layer_2": 0.0966796875, "loss_aux_layer_20": 0.185546875, "loss_aux_layer_21": 0.191162109375, "loss_aux_layer_22": 0.21337890625, "loss_aux_layer_23": 0.25537109375, "loss_aux_layer_3": 0.1068115234375, "loss_aux_layer_4": 0.1090087890625, "loss_aux_layer_5": 0.1107177734375, "loss_aux_layer_6": 0.113037109375, "loss_aux_layer_7": 0.10791015625, "loss_aux_layer_8": 0.105712890625, "loss_aux_layer_9": 0.1036376953125, "step": 716, "total_loss": 0.7918323278427124 }, { "epoch": 0.14195208869530787, "grad_norm": 0.7894032001495361, "learning_rate": 5e-05, "llm_loss": 0.5713948905467987, "loss": 2.8226, "loss_aux_layer_0": 0.027862548828125, "loss_aux_layer_1": 0.0889892578125, "loss_aux_layer_10": 0.10205078125, "loss_aux_layer_11": 0.1085205078125, "loss_aux_layer_12": 0.1170654296875, "loss_aux_layer_13": 0.1256103515625, "loss_aux_layer_14": 0.13916015625, "loss_aux_layer_15": 0.151611328125, "loss_aux_layer_16": 0.163818359375, "loss_aux_layer_17": 0.171142578125, "loss_aux_layer_18": 0.17919921875, "loss_aux_layer_19": 0.180419921875, "loss_aux_layer_2": 0.0938720703125, "loss_aux_layer_20": 0.18505859375, "loss_aux_layer_21": 0.18994140625, "loss_aux_layer_22": 0.211181640625, "loss_aux_layer_23": 0.25390625, "loss_aux_layer_3": 0.1031494140625, "loss_aux_layer_4": 0.105224609375, "loss_aux_layer_5": 0.106689453125, "loss_aux_layer_6": 0.109130859375, "loss_aux_layer_7": 0.1038818359375, "loss_aux_layer_8": 0.10302734375, "loss_aux_layer_9": 0.1011962890625, "step": 717, "total_loss": 0.705658882856369 }, { "epoch": 0.14215006929320925, "grad_norm": 1.8235808610916138, "learning_rate": 5e-05, "llm_loss": 0.6042502671480179, "loss": 2.9364, "loss_aux_layer_0": 0.027862548828125, "loss_aux_layer_1": 0.0848388671875, "loss_aux_layer_10": 0.0977783203125, "loss_aux_layer_11": 0.103515625, "loss_aux_layer_12": 0.112060546875, "loss_aux_layer_13": 0.120849609375, "loss_aux_layer_14": 0.134033203125, "loss_aux_layer_15": 0.146728515625, "loss_aux_layer_16": 0.158447265625, "loss_aux_layer_17": 0.165771484375, "loss_aux_layer_18": 0.17431640625, "loss_aux_layer_19": 0.1748046875, "loss_aux_layer_2": 0.0894775390625, "loss_aux_layer_20": 0.180908203125, "loss_aux_layer_21": 0.18603515625, "loss_aux_layer_22": 0.20703125, "loss_aux_layer_23": 0.25, "loss_aux_layer_3": 0.0985107421875, "loss_aux_layer_4": 0.1005859375, "loss_aux_layer_5": 0.1019287109375, "loss_aux_layer_6": 0.1044921875, "loss_aux_layer_7": 0.10009765625, "loss_aux_layer_8": 0.0986328125, "loss_aux_layer_9": 0.0972900390625, "step": 718, "total_loss": 0.7341054230928421 }, { "epoch": 0.14234804989111066, "grad_norm": 0.9397575259208679, "learning_rate": 5e-05, "llm_loss": 0.5911777168512344, "loss": 2.8884, "loss_aux_layer_0": 0.027618408203125, "loss_aux_layer_1": 0.0882568359375, "loss_aux_layer_10": 0.09912109375, "loss_aux_layer_11": 0.105224609375, "loss_aux_layer_12": 0.1136474609375, "loss_aux_layer_13": 0.1219482421875, "loss_aux_layer_14": 0.13525390625, "loss_aux_layer_15": 0.147216796875, "loss_aux_layer_16": 0.159423828125, "loss_aux_layer_17": 0.166259765625, "loss_aux_layer_18": 0.174072265625, "loss_aux_layer_19": 0.174560546875, "loss_aux_layer_2": 0.0938720703125, "loss_aux_layer_20": 0.178955078125, "loss_aux_layer_21": 0.183837890625, "loss_aux_layer_22": 0.204345703125, "loss_aux_layer_23": 0.248291015625, "loss_aux_layer_3": 0.10205078125, "loss_aux_layer_4": 0.1038818359375, "loss_aux_layer_5": 0.1051025390625, "loss_aux_layer_6": 0.107666015625, "loss_aux_layer_7": 0.101806640625, "loss_aux_layer_8": 0.1004638671875, "loss_aux_layer_9": 0.0985107421875, "step": 719, "total_loss": 0.7221063822507858 }, { "epoch": 0.14254603048901207, "grad_norm": 1.4748536348342896, "learning_rate": 5e-05, "llm_loss": 0.6549773067235947, "loss": 3.1664, "loss_aux_layer_0": 0.027984619140625, "loss_aux_layer_1": 0.09326171875, "loss_aux_layer_10": 0.105224609375, "loss_aux_layer_11": 0.1114501953125, "loss_aux_layer_12": 0.1201171875, "loss_aux_layer_13": 0.12841796875, "loss_aux_layer_14": 0.141845703125, "loss_aux_layer_15": 0.153076171875, "loss_aux_layer_16": 0.1650390625, "loss_aux_layer_17": 0.17138671875, "loss_aux_layer_18": 0.179931640625, "loss_aux_layer_19": 0.17919921875, "loss_aux_layer_2": 0.09814453125, "loss_aux_layer_20": 0.184326171875, "loss_aux_layer_21": 0.189453125, "loss_aux_layer_22": 0.2109375, "loss_aux_layer_23": 0.25439453125, "loss_aux_layer_3": 0.1080322265625, "loss_aux_layer_4": 0.1102294921875, "loss_aux_layer_5": 0.11181640625, "loss_aux_layer_6": 0.1146240234375, "loss_aux_layer_7": 0.1090087890625, "loss_aux_layer_8": 0.106689453125, "loss_aux_layer_9": 0.1043701171875, "step": 720, "total_loss": 0.7915997207164764 }, { "epoch": 0.14274401108691348, "grad_norm": 1.528478741645813, "learning_rate": 5e-05, "llm_loss": 0.5637249797582626, "loss": 2.7908, "loss_aux_layer_0": 0.02691650390625, "loss_aux_layer_1": 0.08935546875, "loss_aux_layer_10": 0.1007080078125, "loss_aux_layer_11": 0.1068115234375, "loss_aux_layer_12": 0.1151123046875, "loss_aux_layer_13": 0.123779296875, "loss_aux_layer_14": 0.1373291015625, "loss_aux_layer_15": 0.150390625, "loss_aux_layer_16": 0.162353515625, "loss_aux_layer_17": 0.16943359375, "loss_aux_layer_18": 0.178955078125, "loss_aux_layer_19": 0.1796875, "loss_aux_layer_2": 0.0943603515625, "loss_aux_layer_20": 0.185546875, "loss_aux_layer_21": 0.191650390625, "loss_aux_layer_22": 0.212890625, "loss_aux_layer_23": 0.257568359375, "loss_aux_layer_3": 0.103759765625, "loss_aux_layer_4": 0.1053466796875, "loss_aux_layer_5": 0.106689453125, "loss_aux_layer_6": 0.1092529296875, "loss_aux_layer_7": 0.103759765625, "loss_aux_layer_8": 0.1021728515625, "loss_aux_layer_9": 0.1002197265625, "step": 721, "total_loss": 0.6976893693208694 }, { "epoch": 0.1429419916848149, "grad_norm": 2.3733534812927246, "learning_rate": 5e-05, "llm_loss": 0.6736367642879486, "loss": 3.2285, "loss_aux_layer_0": 0.02862548828125, "loss_aux_layer_1": 0.09033203125, "loss_aux_layer_10": 0.0997314453125, "loss_aux_layer_11": 0.1060791015625, "loss_aux_layer_12": 0.11474609375, "loss_aux_layer_13": 0.1226806640625, "loss_aux_layer_14": 0.136474609375, "loss_aux_layer_15": 0.149169921875, "loss_aux_layer_16": 0.1611328125, "loss_aux_layer_17": 0.16845703125, "loss_aux_layer_18": 0.177490234375, "loss_aux_layer_19": 0.179443359375, "loss_aux_layer_2": 0.09326171875, "loss_aux_layer_20": 0.185546875, "loss_aux_layer_21": 0.19189453125, "loss_aux_layer_22": 0.2138671875, "loss_aux_layer_23": 0.2578125, "loss_aux_layer_3": 0.1036376953125, "loss_aux_layer_4": 0.1053466796875, "loss_aux_layer_5": 0.106689453125, "loss_aux_layer_6": 0.10888671875, "loss_aux_layer_7": 0.1029052734375, "loss_aux_layer_8": 0.1009521484375, "loss_aux_layer_9": 0.09912109375, "step": 722, "total_loss": 0.8071313202381134 }, { "epoch": 0.1431399722827163, "grad_norm": 2.223968744277954, "learning_rate": 5e-05, "llm_loss": 0.49149230122566223, "loss": 2.4892, "loss_aux_layer_0": 0.029998779296875, "loss_aux_layer_1": 0.0848388671875, "loss_aux_layer_10": 0.096923828125, "loss_aux_layer_11": 0.1031494140625, "loss_aux_layer_12": 0.1116943359375, "loss_aux_layer_13": 0.1201171875, "loss_aux_layer_14": 0.134033203125, "loss_aux_layer_15": 0.14697265625, "loss_aux_layer_16": 0.159912109375, "loss_aux_layer_17": 0.166748046875, "loss_aux_layer_18": 0.175537109375, "loss_aux_layer_19": 0.17578125, "loss_aux_layer_2": 0.0916748046875, "loss_aux_layer_20": 0.181396484375, "loss_aux_layer_21": 0.187744140625, "loss_aux_layer_22": 0.20849609375, "loss_aux_layer_23": 0.252197265625, "loss_aux_layer_3": 0.101806640625, "loss_aux_layer_4": 0.10302734375, "loss_aux_layer_5": 0.1051025390625, "loss_aux_layer_6": 0.1064453125, "loss_aux_layer_7": 0.100341796875, "loss_aux_layer_8": 0.098388671875, "loss_aux_layer_9": 0.096435546875, "step": 723, "total_loss": 0.6222879439592361 }, { "epoch": 0.1433379528806177, "grad_norm": 1.5033479928970337, "learning_rate": 5e-05, "llm_loss": 0.6900645941495895, "loss": 3.2993, "loss_aux_layer_0": 0.02764892578125, "loss_aux_layer_1": 0.08837890625, "loss_aux_layer_10": 0.1011962890625, "loss_aux_layer_11": 0.1075439453125, "loss_aux_layer_12": 0.116455078125, "loss_aux_layer_13": 0.1256103515625, "loss_aux_layer_14": 0.139404296875, "loss_aux_layer_15": 0.15185546875, "loss_aux_layer_16": 0.1650390625, "loss_aux_layer_17": 0.17236328125, "loss_aux_layer_18": 0.18115234375, "loss_aux_layer_19": 0.181884765625, "loss_aux_layer_2": 0.093994140625, "loss_aux_layer_20": 0.187255859375, "loss_aux_layer_21": 0.191162109375, "loss_aux_layer_22": 0.211669921875, "loss_aux_layer_23": 0.2548828125, "loss_aux_layer_3": 0.104248046875, "loss_aux_layer_4": 0.106201171875, "loss_aux_layer_5": 0.10791015625, "loss_aux_layer_6": 0.1102294921875, "loss_aux_layer_7": 0.104736328125, "loss_aux_layer_8": 0.10302734375, "loss_aux_layer_9": 0.100830078125, "step": 724, "total_loss": 0.8248146325349808 }, { "epoch": 0.14353593347851912, "grad_norm": 3.569382429122925, "learning_rate": 5e-05, "llm_loss": 0.6362005770206451, "loss": 3.1009, "loss_aux_layer_0": 0.03021240234375, "loss_aux_layer_1": 0.095703125, "loss_aux_layer_10": 0.1064453125, "loss_aux_layer_11": 0.11279296875, "loss_aux_layer_12": 0.1212158203125, "loss_aux_layer_13": 0.1302490234375, "loss_aux_layer_14": 0.1435546875, "loss_aux_layer_15": 0.155517578125, "loss_aux_layer_16": 0.1669921875, "loss_aux_layer_17": 0.173095703125, "loss_aux_layer_18": 0.18212890625, "loss_aux_layer_19": 0.182373046875, "loss_aux_layer_2": 0.1014404296875, "loss_aux_layer_20": 0.18701171875, "loss_aux_layer_21": 0.192138671875, "loss_aux_layer_22": 0.212890625, "loss_aux_layer_23": 0.257080078125, "loss_aux_layer_3": 0.11328125, "loss_aux_layer_4": 0.1148681640625, "loss_aux_layer_5": 0.1158447265625, "loss_aux_layer_6": 0.1170654296875, "loss_aux_layer_7": 0.1107177734375, "loss_aux_layer_8": 0.1082763671875, "loss_aux_layer_9": 0.1060791015625, "step": 725, "total_loss": 0.7752229422330856 }, { "epoch": 0.1437339140764205, "grad_norm": 3.945218324661255, "learning_rate": 5e-05, "llm_loss": 0.5803640335798264, "loss": 2.868, "loss_aux_layer_0": 0.028656005859375, "loss_aux_layer_1": 0.089599609375, "loss_aux_layer_10": 0.1041259765625, "loss_aux_layer_11": 0.10986328125, "loss_aux_layer_12": 0.1180419921875, "loss_aux_layer_13": 0.1263427734375, "loss_aux_layer_14": 0.1396484375, "loss_aux_layer_15": 0.15185546875, "loss_aux_layer_16": 0.163330078125, "loss_aux_layer_17": 0.169677734375, "loss_aux_layer_18": 0.177490234375, "loss_aux_layer_19": 0.177490234375, "loss_aux_layer_2": 0.1004638671875, "loss_aux_layer_20": 0.183349609375, "loss_aux_layer_21": 0.190673828125, "loss_aux_layer_22": 0.21533203125, "loss_aux_layer_23": 0.259765625, "loss_aux_layer_3": 0.11083984375, "loss_aux_layer_4": 0.112548828125, "loss_aux_layer_5": 0.1143798828125, "loss_aux_layer_6": 0.1148681640625, "loss_aux_layer_7": 0.1094970703125, "loss_aux_layer_8": 0.10693359375, "loss_aux_layer_9": 0.1046142578125, "step": 726, "total_loss": 0.7169967740774155 }, { "epoch": 0.1439318946743219, "grad_norm": 1.5733519792556763, "learning_rate": 5e-05, "llm_loss": 0.5976404845714569, "loss": 2.9492, "loss_aux_layer_0": 0.028350830078125, "loss_aux_layer_1": 0.0936279296875, "loss_aux_layer_10": 0.1064453125, "loss_aux_layer_11": 0.1134033203125, "loss_aux_layer_12": 0.121826171875, "loss_aux_layer_13": 0.130615234375, "loss_aux_layer_14": 0.143798828125, "loss_aux_layer_15": 0.15625, "loss_aux_layer_16": 0.168701171875, "loss_aux_layer_17": 0.175537109375, "loss_aux_layer_18": 0.1845703125, "loss_aux_layer_19": 0.1845703125, "loss_aux_layer_2": 0.0994873046875, "loss_aux_layer_20": 0.1904296875, "loss_aux_layer_21": 0.1953125, "loss_aux_layer_22": 0.21728515625, "loss_aux_layer_23": 0.2587890625, "loss_aux_layer_3": 0.111328125, "loss_aux_layer_4": 0.11376953125, "loss_aux_layer_5": 0.115478515625, "loss_aux_layer_6": 0.1171875, "loss_aux_layer_7": 0.1107177734375, "loss_aux_layer_8": 0.1082763671875, "loss_aux_layer_9": 0.10595703125, "step": 727, "total_loss": 0.7372951358556747 }, { "epoch": 0.14412987527222332, "grad_norm": 3.252253293991089, "learning_rate": 5e-05, "llm_loss": 0.5841148346662521, "loss": 2.8806, "loss_aux_layer_0": 0.029022216796875, "loss_aux_layer_1": 0.08935546875, "loss_aux_layer_10": 0.1014404296875, "loss_aux_layer_11": 0.1075439453125, "loss_aux_layer_12": 0.1165771484375, "loss_aux_layer_13": 0.125732421875, "loss_aux_layer_14": 0.14013671875, "loss_aux_layer_15": 0.153076171875, "loss_aux_layer_16": 0.166015625, "loss_aux_layer_17": 0.17236328125, "loss_aux_layer_18": 0.180908203125, "loss_aux_layer_19": 0.181396484375, "loss_aux_layer_2": 0.1002197265625, "loss_aux_layer_20": 0.1875, "loss_aux_layer_21": 0.193359375, "loss_aux_layer_22": 0.214599609375, "loss_aux_layer_23": 0.25830078125, "loss_aux_layer_3": 0.1063232421875, "loss_aux_layer_4": 0.1080322265625, "loss_aux_layer_5": 0.1097412109375, "loss_aux_layer_6": 0.1109619140625, "loss_aux_layer_7": 0.106201171875, "loss_aux_layer_8": 0.103759765625, "loss_aux_layer_9": 0.101806640625, "step": 728, "total_loss": 0.7201477140188217 }, { "epoch": 0.14432785587012473, "grad_norm": 0.7013124823570251, "learning_rate": 5e-05, "llm_loss": 0.5453256145119667, "loss": 2.7187, "loss_aux_layer_0": 0.028900146484375, "loss_aux_layer_1": 0.0872802734375, "loss_aux_layer_10": 0.10009765625, "loss_aux_layer_11": 0.106689453125, "loss_aux_layer_12": 0.1156005859375, "loss_aux_layer_13": 0.12451171875, "loss_aux_layer_14": 0.138671875, "loss_aux_layer_15": 0.151611328125, "loss_aux_layer_16": 0.164794921875, "loss_aux_layer_17": 0.172119140625, "loss_aux_layer_18": 0.181884765625, "loss_aux_layer_19": 0.182861328125, "loss_aux_layer_2": 0.092041015625, "loss_aux_layer_20": 0.188720703125, "loss_aux_layer_21": 0.1943359375, "loss_aux_layer_22": 0.215087890625, "loss_aux_layer_23": 0.25927734375, "loss_aux_layer_3": 0.1009521484375, "loss_aux_layer_4": 0.103271484375, "loss_aux_layer_5": 0.105224609375, "loss_aux_layer_6": 0.1077880859375, "loss_aux_layer_7": 0.1025390625, "loss_aux_layer_8": 0.10107421875, "loss_aux_layer_9": 0.0992431640625, "step": 729, "total_loss": 0.6796824187040329 }, { "epoch": 0.14452583646802614, "grad_norm": 3.112239360809326, "learning_rate": 5e-05, "llm_loss": 0.5544895827770233, "loss": 2.7769, "loss_aux_layer_0": 0.029541015625, "loss_aux_layer_1": 0.0936279296875, "loss_aux_layer_10": 0.1075439453125, "loss_aux_layer_11": 0.1142578125, "loss_aux_layer_12": 0.1220703125, "loss_aux_layer_13": 0.1309814453125, "loss_aux_layer_14": 0.144287109375, "loss_aux_layer_15": 0.156005859375, "loss_aux_layer_16": 0.16796875, "loss_aux_layer_17": 0.17431640625, "loss_aux_layer_18": 0.18310546875, "loss_aux_layer_19": 0.182861328125, "loss_aux_layer_2": 0.10302734375, "loss_aux_layer_20": 0.18798828125, "loss_aux_layer_21": 0.194091796875, "loss_aux_layer_22": 0.2158203125, "loss_aux_layer_23": 0.26025390625, "loss_aux_layer_3": 0.11181640625, "loss_aux_layer_4": 0.1134033203125, "loss_aux_layer_5": 0.11474609375, "loss_aux_layer_6": 0.116455078125, "loss_aux_layer_7": 0.112060546875, "loss_aux_layer_8": 0.109619140625, "loss_aux_layer_9": 0.107421875, "step": 730, "total_loss": 0.6942362189292908 }, { "epoch": 0.14472381706592755, "grad_norm": 1.4457827806472778, "learning_rate": 5e-05, "llm_loss": 0.6539246737957001, "loss": 3.1495, "loss_aux_layer_0": 0.02764892578125, "loss_aux_layer_1": 0.088134765625, "loss_aux_layer_10": 0.1015625, "loss_aux_layer_11": 0.107666015625, "loss_aux_layer_12": 0.1163330078125, "loss_aux_layer_13": 0.1253662109375, "loss_aux_layer_14": 0.138671875, "loss_aux_layer_15": 0.15087890625, "loss_aux_layer_16": 0.162353515625, "loss_aux_layer_17": 0.1689453125, "loss_aux_layer_18": 0.177490234375, "loss_aux_layer_19": 0.177001953125, "loss_aux_layer_2": 0.0943603515625, "loss_aux_layer_20": 0.181884765625, "loss_aux_layer_21": 0.18701171875, "loss_aux_layer_22": 0.208251953125, "loss_aux_layer_23": 0.249755859375, "loss_aux_layer_3": 0.1046142578125, "loss_aux_layer_4": 0.106689453125, "loss_aux_layer_5": 0.1083984375, "loss_aux_layer_6": 0.1104736328125, "loss_aux_layer_7": 0.1048583984375, "loss_aux_layer_8": 0.1031494140625, "loss_aux_layer_9": 0.10107421875, "step": 731, "total_loss": 0.7873812168836594 }, { "epoch": 0.14492179766382896, "grad_norm": 2.3777260780334473, "learning_rate": 5e-05, "llm_loss": 0.7357082664966583, "loss": 3.4842, "loss_aux_layer_0": 0.02777099609375, "loss_aux_layer_1": 0.087158203125, "loss_aux_layer_10": 0.1026611328125, "loss_aux_layer_11": 0.1083984375, "loss_aux_layer_12": 0.1175537109375, "loss_aux_layer_13": 0.1268310546875, "loss_aux_layer_14": 0.14111328125, "loss_aux_layer_15": 0.15380859375, "loss_aux_layer_16": 0.166748046875, "loss_aux_layer_17": 0.17333984375, "loss_aux_layer_18": 0.182373046875, "loss_aux_layer_19": 0.181884765625, "loss_aux_layer_2": 0.0950927734375, "loss_aux_layer_20": 0.18701171875, "loss_aux_layer_21": 0.19140625, "loss_aux_layer_22": 0.210693359375, "loss_aux_layer_23": 0.252685546875, "loss_aux_layer_3": 0.1041259765625, "loss_aux_layer_4": 0.1063232421875, "loss_aux_layer_5": 0.1077880859375, "loss_aux_layer_6": 0.110595703125, "loss_aux_layer_7": 0.1058349609375, "loss_aux_layer_8": 0.104248046875, "loss_aux_layer_9": 0.102294921875, "step": 732, "total_loss": 0.8710554838180542 }, { "epoch": 0.14511977826173034, "grad_norm": 2.900364398956299, "learning_rate": 5e-05, "llm_loss": 0.6655688807368279, "loss": 3.2011, "loss_aux_layer_0": 0.03265380859375, "loss_aux_layer_1": 0.0911865234375, "loss_aux_layer_10": 0.102783203125, "loss_aux_layer_11": 0.1080322265625, "loss_aux_layer_12": 0.11669921875, "loss_aux_layer_13": 0.125732421875, "loss_aux_layer_14": 0.13916015625, "loss_aux_layer_15": 0.1513671875, "loss_aux_layer_16": 0.163818359375, "loss_aux_layer_17": 0.170654296875, "loss_aux_layer_18": 0.179931640625, "loss_aux_layer_19": 0.180908203125, "loss_aux_layer_2": 0.093017578125, "loss_aux_layer_20": 0.1865234375, "loss_aux_layer_21": 0.191162109375, "loss_aux_layer_22": 0.21240234375, "loss_aux_layer_23": 0.25634765625, "loss_aux_layer_3": 0.1025390625, "loss_aux_layer_4": 0.1046142578125, "loss_aux_layer_5": 0.1063232421875, "loss_aux_layer_6": 0.1087646484375, "loss_aux_layer_7": 0.1036376953125, "loss_aux_layer_8": 0.102783203125, "loss_aux_layer_9": 0.1015625, "step": 733, "total_loss": 0.8002743870019913 }, { "epoch": 0.14531775885963175, "grad_norm": 3.0258615016937256, "learning_rate": 5e-05, "llm_loss": 0.5808500424027443, "loss": 2.871, "loss_aux_layer_0": 0.029327392578125, "loss_aux_layer_1": 0.0904541015625, "loss_aux_layer_10": 0.1048583984375, "loss_aux_layer_11": 0.111083984375, "loss_aux_layer_12": 0.1187744140625, "loss_aux_layer_13": 0.1270751953125, "loss_aux_layer_14": 0.140869140625, "loss_aux_layer_15": 0.15283203125, "loss_aux_layer_16": 0.164306640625, "loss_aux_layer_17": 0.1708984375, "loss_aux_layer_18": 0.1796875, "loss_aux_layer_19": 0.17919921875, "loss_aux_layer_2": 0.10009765625, "loss_aux_layer_20": 0.184326171875, "loss_aux_layer_21": 0.19091796875, "loss_aux_layer_22": 0.212890625, "loss_aux_layer_23": 0.257568359375, "loss_aux_layer_3": 0.109130859375, "loss_aux_layer_4": 0.111083984375, "loss_aux_layer_5": 0.11279296875, "loss_aux_layer_6": 0.1146240234375, "loss_aux_layer_7": 0.10986328125, "loss_aux_layer_8": 0.1072998046875, "loss_aux_layer_9": 0.1051025390625, "step": 734, "total_loss": 0.7177495211362839 }, { "epoch": 0.14551573945753316, "grad_norm": 2.25754976272583, "learning_rate": 5e-05, "llm_loss": 0.6021641492843628, "loss": 2.9319, "loss_aux_layer_0": 0.0283203125, "loss_aux_layer_1": 0.07958984375, "loss_aux_layer_10": 0.095947265625, "loss_aux_layer_11": 0.1015625, "loss_aux_layer_12": 0.1107177734375, "loss_aux_layer_13": 0.1202392578125, "loss_aux_layer_14": 0.135498046875, "loss_aux_layer_15": 0.1494140625, "loss_aux_layer_16": 0.162353515625, "loss_aux_layer_17": 0.170166015625, "loss_aux_layer_18": 0.179443359375, "loss_aux_layer_19": 0.181396484375, "loss_aux_layer_2": 0.08642578125, "loss_aux_layer_20": 0.187255859375, "loss_aux_layer_21": 0.193115234375, "loss_aux_layer_22": 0.21533203125, "loss_aux_layer_23": 0.26025390625, "loss_aux_layer_3": 0.09619140625, "loss_aux_layer_4": 0.0977783203125, "loss_aux_layer_5": 0.0997314453125, "loss_aux_layer_6": 0.1021728515625, "loss_aux_layer_7": 0.096435546875, "loss_aux_layer_8": 0.095947265625, "loss_aux_layer_9": 0.0953369140625, "step": 735, "total_loss": 0.7329659759998322 }, { "epoch": 0.14571372005543456, "grad_norm": 2.5168068408966064, "learning_rate": 5e-05, "llm_loss": 0.5812501311302185, "loss": 2.8564, "loss_aux_layer_0": 0.02716064453125, "loss_aux_layer_1": 0.0830078125, "loss_aux_layer_10": 0.10009765625, "loss_aux_layer_11": 0.1065673828125, "loss_aux_layer_12": 0.1158447265625, "loss_aux_layer_13": 0.125244140625, "loss_aux_layer_14": 0.13818359375, "loss_aux_layer_15": 0.151123046875, "loss_aux_layer_16": 0.16357421875, "loss_aux_layer_17": 0.169677734375, "loss_aux_layer_18": 0.177734375, "loss_aux_layer_19": 0.17822265625, "loss_aux_layer_2": 0.09130859375, "loss_aux_layer_20": 0.18408203125, "loss_aux_layer_21": 0.1904296875, "loss_aux_layer_22": 0.212646484375, "loss_aux_layer_23": 0.25634765625, "loss_aux_layer_3": 0.1004638671875, "loss_aux_layer_4": 0.1025390625, "loss_aux_layer_5": 0.104736328125, "loss_aux_layer_6": 0.10693359375, "loss_aux_layer_7": 0.1021728515625, "loss_aux_layer_8": 0.1005859375, "loss_aux_layer_9": 0.09912109375, "step": 736, "total_loss": 0.7140970975160599 }, { "epoch": 0.14591170065333597, "grad_norm": 1.9193941354751587, "learning_rate": 5e-05, "llm_loss": 0.6432742774486542, "loss": 3.1103, "loss_aux_layer_0": 0.02947998046875, "loss_aux_layer_1": 0.08740234375, "loss_aux_layer_10": 0.10107421875, "loss_aux_layer_11": 0.107421875, "loss_aux_layer_12": 0.115966796875, "loss_aux_layer_13": 0.12451171875, "loss_aux_layer_14": 0.1396484375, "loss_aux_layer_15": 0.15234375, "loss_aux_layer_16": 0.164306640625, "loss_aux_layer_17": 0.171142578125, "loss_aux_layer_18": 0.1806640625, "loss_aux_layer_19": 0.180908203125, "loss_aux_layer_2": 0.093017578125, "loss_aux_layer_20": 0.186279296875, "loss_aux_layer_21": 0.191650390625, "loss_aux_layer_22": 0.21142578125, "loss_aux_layer_23": 0.254150390625, "loss_aux_layer_3": 0.103271484375, "loss_aux_layer_4": 0.10546875, "loss_aux_layer_5": 0.107177734375, "loss_aux_layer_6": 0.109375, "loss_aux_layer_7": 0.104248046875, "loss_aux_layer_8": 0.1021728515625, "loss_aux_layer_9": 0.1007080078125, "step": 737, "total_loss": 0.7775866091251373 }, { "epoch": 0.14610968125123738, "grad_norm": 1.7369104623794556, "learning_rate": 5e-05, "llm_loss": 0.6590722352266312, "loss": 3.1753, "loss_aux_layer_0": 0.027557373046875, "loss_aux_layer_1": 0.087646484375, "loss_aux_layer_10": 0.10107421875, "loss_aux_layer_11": 0.1075439453125, "loss_aux_layer_12": 0.11669921875, "loss_aux_layer_13": 0.1258544921875, "loss_aux_layer_14": 0.1396484375, "loss_aux_layer_15": 0.153564453125, "loss_aux_layer_16": 0.165771484375, "loss_aux_layer_17": 0.173828125, "loss_aux_layer_18": 0.181640625, "loss_aux_layer_19": 0.182373046875, "loss_aux_layer_2": 0.0928955078125, "loss_aux_layer_20": 0.1875, "loss_aux_layer_21": 0.192626953125, "loss_aux_layer_22": 0.21337890625, "loss_aux_layer_23": 0.25927734375, "loss_aux_layer_3": 0.1026611328125, "loss_aux_layer_4": 0.1044921875, "loss_aux_layer_5": 0.1055908203125, "loss_aux_layer_6": 0.108154296875, "loss_aux_layer_7": 0.10302734375, "loss_aux_layer_8": 0.1014404296875, "loss_aux_layer_9": 0.10009765625, "step": 738, "total_loss": 0.7938246726989746 }, { "epoch": 0.1463076618491388, "grad_norm": 1.9203860759735107, "learning_rate": 5e-05, "llm_loss": 0.6204503923654556, "loss": 2.9975, "loss_aux_layer_0": 0.02850341796875, "loss_aux_layer_1": 0.0841064453125, "loss_aux_layer_10": 0.0963134765625, "loss_aux_layer_11": 0.1024169921875, "loss_aux_layer_12": 0.1109619140625, "loss_aux_layer_13": 0.11962890625, "loss_aux_layer_14": 0.13330078125, "loss_aux_layer_15": 0.145751953125, "loss_aux_layer_16": 0.156982421875, "loss_aux_layer_17": 0.163330078125, "loss_aux_layer_18": 0.171630859375, "loss_aux_layer_19": 0.17236328125, "loss_aux_layer_2": 0.08935546875, "loss_aux_layer_20": 0.177978515625, "loss_aux_layer_21": 0.185302734375, "loss_aux_layer_22": 0.206787109375, "loss_aux_layer_23": 0.250732421875, "loss_aux_layer_3": 0.098876953125, "loss_aux_layer_4": 0.1009521484375, "loss_aux_layer_5": 0.102294921875, "loss_aux_layer_6": 0.104736328125, "loss_aux_layer_7": 0.0989990234375, "loss_aux_layer_8": 0.0970458984375, "loss_aux_layer_9": 0.095703125, "step": 739, "total_loss": 0.7493803203105927 }, { "epoch": 0.1465056424470402, "grad_norm": 0.9549353122711182, "learning_rate": 5e-05, "llm_loss": 0.6450005620718002, "loss": 3.1113, "loss_aux_layer_0": 0.02813720703125, "loss_aux_layer_1": 0.0869140625, "loss_aux_layer_10": 0.10107421875, "loss_aux_layer_11": 0.107421875, "loss_aux_layer_12": 0.1158447265625, "loss_aux_layer_13": 0.1241455078125, "loss_aux_layer_14": 0.13720703125, "loss_aux_layer_15": 0.149169921875, "loss_aux_layer_16": 0.1611328125, "loss_aux_layer_17": 0.167236328125, "loss_aux_layer_18": 0.176025390625, "loss_aux_layer_19": 0.1767578125, "loss_aux_layer_2": 0.0936279296875, "loss_aux_layer_20": 0.18212890625, "loss_aux_layer_21": 0.18798828125, "loss_aux_layer_22": 0.209228515625, "loss_aux_layer_23": 0.251953125, "loss_aux_layer_3": 0.1031494140625, "loss_aux_layer_4": 0.105224609375, "loss_aux_layer_5": 0.106689453125, "loss_aux_layer_6": 0.1094970703125, "loss_aux_layer_7": 0.1041259765625, "loss_aux_layer_8": 0.102294921875, "loss_aux_layer_9": 0.1005859375, "step": 740, "total_loss": 0.7778165638446808 }, { "epoch": 0.14670362304494158, "grad_norm": 1.6439851522445679, "learning_rate": 5e-05, "llm_loss": 0.5461563766002655, "loss": 2.7257, "loss_aux_layer_0": 0.02838134765625, "loss_aux_layer_1": 0.0863037109375, "loss_aux_layer_10": 0.1011962890625, "loss_aux_layer_11": 0.107421875, "loss_aux_layer_12": 0.1160888671875, "loss_aux_layer_13": 0.1256103515625, "loss_aux_layer_14": 0.1396484375, "loss_aux_layer_15": 0.151611328125, "loss_aux_layer_16": 0.16455078125, "loss_aux_layer_17": 0.171875, "loss_aux_layer_18": 0.181884765625, "loss_aux_layer_19": 0.1826171875, "loss_aux_layer_2": 0.0927734375, "loss_aux_layer_20": 0.188232421875, "loss_aux_layer_21": 0.1953125, "loss_aux_layer_22": 0.21923828125, "loss_aux_layer_23": 0.26416015625, "loss_aux_layer_3": 0.1021728515625, "loss_aux_layer_4": 0.1046142578125, "loss_aux_layer_5": 0.106689453125, "loss_aux_layer_6": 0.1092529296875, "loss_aux_layer_7": 0.1038818359375, "loss_aux_layer_8": 0.1021728515625, "loss_aux_layer_9": 0.100341796875, "step": 741, "total_loss": 0.6814245879650116 }, { "epoch": 0.146901603642843, "grad_norm": 1.4012000560760498, "learning_rate": 5e-05, "llm_loss": 0.7118347585201263, "loss": 3.3711, "loss_aux_layer_0": 0.0302734375, "loss_aux_layer_1": 0.0858154296875, "loss_aux_layer_10": 0.0989990234375, "loss_aux_layer_11": 0.1051025390625, "loss_aux_layer_12": 0.114013671875, "loss_aux_layer_13": 0.1221923828125, "loss_aux_layer_14": 0.13623046875, "loss_aux_layer_15": 0.148681640625, "loss_aux_layer_16": 0.16064453125, "loss_aux_layer_17": 0.167236328125, "loss_aux_layer_18": 0.17529296875, "loss_aux_layer_19": 0.175537109375, "loss_aux_layer_2": 0.08984375, "loss_aux_layer_20": 0.1806640625, "loss_aux_layer_21": 0.185546875, "loss_aux_layer_22": 0.205810546875, "loss_aux_layer_23": 0.24853515625, "loss_aux_layer_3": 0.099853515625, "loss_aux_layer_4": 0.1021728515625, "loss_aux_layer_5": 0.103759765625, "loss_aux_layer_6": 0.106201171875, "loss_aux_layer_7": 0.1016845703125, "loss_aux_layer_8": 0.0997314453125, "loss_aux_layer_9": 0.098388671875, "step": 742, "total_loss": 0.8427726775407791 }, { "epoch": 0.1470995842407444, "grad_norm": 1.743939995765686, "learning_rate": 5e-05, "llm_loss": 0.6009442582726479, "loss": 2.9367, "loss_aux_layer_0": 0.03021240234375, "loss_aux_layer_1": 0.0872802734375, "loss_aux_layer_10": 0.1002197265625, "loss_aux_layer_11": 0.1063232421875, "loss_aux_layer_12": 0.1153564453125, "loss_aux_layer_13": 0.1241455078125, "loss_aux_layer_14": 0.138671875, "loss_aux_layer_15": 0.1513671875, "loss_aux_layer_16": 0.163330078125, "loss_aux_layer_17": 0.170166015625, "loss_aux_layer_18": 0.178955078125, "loss_aux_layer_19": 0.17919921875, "loss_aux_layer_2": 0.092529296875, "loss_aux_layer_20": 0.18359375, "loss_aux_layer_21": 0.1884765625, "loss_aux_layer_22": 0.2099609375, "loss_aux_layer_23": 0.2529296875, "loss_aux_layer_3": 0.1026611328125, "loss_aux_layer_4": 0.1046142578125, "loss_aux_layer_5": 0.10595703125, "loss_aux_layer_6": 0.10791015625, "loss_aux_layer_7": 0.1026611328125, "loss_aux_layer_8": 0.101318359375, "loss_aux_layer_9": 0.099365234375, "step": 743, "total_loss": 0.7341654002666473 }, { "epoch": 0.1472975648386458, "grad_norm": 1.8495482206344604, "learning_rate": 5e-05, "llm_loss": 0.5848967432975769, "loss": 2.8666, "loss_aux_layer_0": 0.02911376953125, "loss_aux_layer_1": 0.0872802734375, "loss_aux_layer_10": 0.0989990234375, "loss_aux_layer_11": 0.1048583984375, "loss_aux_layer_12": 0.1129150390625, "loss_aux_layer_13": 0.12109375, "loss_aux_layer_14": 0.135009765625, "loss_aux_layer_15": 0.147705078125, "loss_aux_layer_16": 0.15966796875, "loss_aux_layer_17": 0.166748046875, "loss_aux_layer_18": 0.17626953125, "loss_aux_layer_19": 0.177734375, "loss_aux_layer_2": 0.093505859375, "loss_aux_layer_20": 0.18310546875, "loss_aux_layer_21": 0.188232421875, "loss_aux_layer_22": 0.208740234375, "loss_aux_layer_23": 0.251220703125, "loss_aux_layer_3": 0.1014404296875, "loss_aux_layer_4": 0.103271484375, "loss_aux_layer_5": 0.1053466796875, "loss_aux_layer_6": 0.1077880859375, "loss_aux_layer_7": 0.1025390625, "loss_aux_layer_8": 0.1007080078125, "loss_aux_layer_9": 0.0987548828125, "step": 744, "total_loss": 0.7166580110788345 }, { "epoch": 0.14749554543654722, "grad_norm": 1.267040491104126, "learning_rate": 5e-05, "llm_loss": 0.724781408905983, "loss": 3.4371, "loss_aux_layer_0": 0.02947998046875, "loss_aux_layer_1": 0.09033203125, "loss_aux_layer_10": 0.1029052734375, "loss_aux_layer_11": 0.109130859375, "loss_aux_layer_12": 0.117431640625, "loss_aux_layer_13": 0.125732421875, "loss_aux_layer_14": 0.138671875, "loss_aux_layer_15": 0.150390625, "loss_aux_layer_16": 0.161865234375, "loss_aux_layer_17": 0.16845703125, "loss_aux_layer_18": 0.17724609375, "loss_aux_layer_19": 0.17724609375, "loss_aux_layer_2": 0.0953369140625, "loss_aux_layer_20": 0.18310546875, "loss_aux_layer_21": 0.189208984375, "loss_aux_layer_22": 0.212158203125, "loss_aux_layer_23": 0.255859375, "loss_aux_layer_3": 0.10498046875, "loss_aux_layer_4": 0.1068115234375, "loss_aux_layer_5": 0.10791015625, "loss_aux_layer_6": 0.1104736328125, "loss_aux_layer_7": 0.1058349609375, "loss_aux_layer_8": 0.104248046875, "loss_aux_layer_9": 0.102294921875, "step": 745, "total_loss": 0.8592626750469208 }, { "epoch": 0.14769352603444863, "grad_norm": 2.1891896724700928, "learning_rate": 5e-05, "llm_loss": 0.6702503263950348, "loss": 3.2039, "loss_aux_layer_0": 0.027801513671875, "loss_aux_layer_1": 0.08447265625, "loss_aux_layer_10": 0.09716796875, "loss_aux_layer_11": 0.1038818359375, "loss_aux_layer_12": 0.1124267578125, "loss_aux_layer_13": 0.121337890625, "loss_aux_layer_14": 0.134765625, "loss_aux_layer_15": 0.1474609375, "loss_aux_layer_16": 0.15966796875, "loss_aux_layer_17": 0.1669921875, "loss_aux_layer_18": 0.176025390625, "loss_aux_layer_19": 0.176025390625, "loss_aux_layer_2": 0.09228515625, "loss_aux_layer_20": 0.181884765625, "loss_aux_layer_21": 0.18701171875, "loss_aux_layer_22": 0.2080078125, "loss_aux_layer_23": 0.25048828125, "loss_aux_layer_3": 0.1011962890625, "loss_aux_layer_4": 0.10302734375, "loss_aux_layer_5": 0.1038818359375, "loss_aux_layer_6": 0.106201171875, "loss_aux_layer_7": 0.1002197265625, "loss_aux_layer_8": 0.098388671875, "loss_aux_layer_9": 0.0968017578125, "step": 746, "total_loss": 0.8009776473045349 }, { "epoch": 0.14789150663235004, "grad_norm": 0.7939654588699341, "learning_rate": 5e-05, "llm_loss": 0.6428718119859695, "loss": 3.098, "loss_aux_layer_0": 0.028839111328125, "loss_aux_layer_1": 0.086181640625, "loss_aux_layer_10": 0.098876953125, "loss_aux_layer_11": 0.1046142578125, "loss_aux_layer_12": 0.113037109375, "loss_aux_layer_13": 0.121337890625, "loss_aux_layer_14": 0.135498046875, "loss_aux_layer_15": 0.148193359375, "loss_aux_layer_16": 0.16015625, "loss_aux_layer_17": 0.166748046875, "loss_aux_layer_18": 0.175537109375, "loss_aux_layer_19": 0.176513671875, "loss_aux_layer_2": 0.091064453125, "loss_aux_layer_20": 0.18212890625, "loss_aux_layer_21": 0.1884765625, "loss_aux_layer_22": 0.210205078125, "loss_aux_layer_23": 0.253662109375, "loss_aux_layer_3": 0.10107421875, "loss_aux_layer_4": 0.10302734375, "loss_aux_layer_5": 0.104736328125, "loss_aux_layer_6": 0.107177734375, "loss_aux_layer_7": 0.101806640625, "loss_aux_layer_8": 0.100341796875, "loss_aux_layer_9": 0.0982666015625, "step": 747, "total_loss": 0.7744893282651901 }, { "epoch": 0.14808948723025145, "grad_norm": 1.3304303884506226, "learning_rate": 5e-05, "llm_loss": 0.6940000653266907, "loss": 3.2917, "loss_aux_layer_0": 0.026947021484375, "loss_aux_layer_1": 0.0850830078125, "loss_aux_layer_10": 0.0975341796875, "loss_aux_layer_11": 0.1031494140625, "loss_aux_layer_12": 0.1114501953125, "loss_aux_layer_13": 0.11962890625, "loss_aux_layer_14": 0.132568359375, "loss_aux_layer_15": 0.14501953125, "loss_aux_layer_16": 0.1572265625, "loss_aux_layer_17": 0.164306640625, "loss_aux_layer_18": 0.172607421875, "loss_aux_layer_19": 0.17333984375, "loss_aux_layer_2": 0.089599609375, "loss_aux_layer_20": 0.178955078125, "loss_aux_layer_21": 0.182373046875, "loss_aux_layer_22": 0.202392578125, "loss_aux_layer_23": 0.24462890625, "loss_aux_layer_3": 0.0997314453125, "loss_aux_layer_4": 0.1025390625, "loss_aux_layer_5": 0.1036376953125, "loss_aux_layer_6": 0.10595703125, "loss_aux_layer_7": 0.1004638671875, "loss_aux_layer_8": 0.098388671875, "loss_aux_layer_9": 0.096923828125, "step": 748, "total_loss": 0.8229328393936157 }, { "epoch": 0.14828746782815283, "grad_norm": 1.2957186698913574, "learning_rate": 5e-05, "llm_loss": 0.6705436855554581, "loss": 3.1987, "loss_aux_layer_0": 0.0267333984375, "loss_aux_layer_1": 0.084228515625, "loss_aux_layer_10": 0.09619140625, "loss_aux_layer_11": 0.1024169921875, "loss_aux_layer_12": 0.111083984375, "loss_aux_layer_13": 0.1202392578125, "loss_aux_layer_14": 0.134033203125, "loss_aux_layer_15": 0.14599609375, "loss_aux_layer_16": 0.158447265625, "loss_aux_layer_17": 0.165771484375, "loss_aux_layer_18": 0.175048828125, "loss_aux_layer_19": 0.175537109375, "loss_aux_layer_2": 0.08935546875, "loss_aux_layer_20": 0.18017578125, "loss_aux_layer_21": 0.18505859375, "loss_aux_layer_22": 0.20458984375, "loss_aux_layer_23": 0.247802734375, "loss_aux_layer_3": 0.0985107421875, "loss_aux_layer_4": 0.1004638671875, "loss_aux_layer_5": 0.1019287109375, "loss_aux_layer_6": 0.1044921875, "loss_aux_layer_7": 0.098876953125, "loss_aux_layer_8": 0.0972900390625, "loss_aux_layer_9": 0.095458984375, "step": 749, "total_loss": 0.7996646910905838 }, { "epoch": 0.14848544842605424, "grad_norm": 1.6472710371017456, "learning_rate": 5e-05, "llm_loss": 0.6150357723236084, "loss": 2.9775, "loss_aux_layer_0": 0.0283203125, "loss_aux_layer_1": 0.0833740234375, "loss_aux_layer_10": 0.0965576171875, "loss_aux_layer_11": 0.1024169921875, "loss_aux_layer_12": 0.110595703125, "loss_aux_layer_13": 0.119384765625, "loss_aux_layer_14": 0.133056640625, "loss_aux_layer_15": 0.145263671875, "loss_aux_layer_16": 0.157958984375, "loss_aux_layer_17": 0.165283203125, "loss_aux_layer_18": 0.17431640625, "loss_aux_layer_19": 0.17626953125, "loss_aux_layer_2": 0.0880126953125, "loss_aux_layer_20": 0.182373046875, "loss_aux_layer_21": 0.1875, "loss_aux_layer_22": 0.207763671875, "loss_aux_layer_23": 0.25146484375, "loss_aux_layer_3": 0.0977783203125, "loss_aux_layer_4": 0.099609375, "loss_aux_layer_5": 0.1011962890625, "loss_aux_layer_6": 0.10400390625, "loss_aux_layer_7": 0.0989990234375, "loss_aux_layer_8": 0.0975341796875, "loss_aux_layer_9": 0.0958251953125, "step": 750, "total_loss": 0.7443859726190567 }, { "epoch": 0.14868342902395565, "grad_norm": 0.7596572637557983, "learning_rate": 5e-05, "llm_loss": 0.6635193824768066, "loss": 3.1802, "loss_aux_layer_0": 0.031005859375, "loss_aux_layer_1": 0.0865478515625, "loss_aux_layer_10": 0.098388671875, "loss_aux_layer_11": 0.104248046875, "loss_aux_layer_12": 0.1129150390625, "loss_aux_layer_13": 0.1217041015625, "loss_aux_layer_14": 0.1357421875, "loss_aux_layer_15": 0.1484375, "loss_aux_layer_16": 0.1611328125, "loss_aux_layer_17": 0.16845703125, "loss_aux_layer_18": 0.177734375, "loss_aux_layer_19": 0.177490234375, "loss_aux_layer_2": 0.0899658203125, "loss_aux_layer_20": 0.182861328125, "loss_aux_layer_21": 0.188232421875, "loss_aux_layer_22": 0.2109375, "loss_aux_layer_23": 0.25390625, "loss_aux_layer_3": 0.0992431640625, "loss_aux_layer_4": 0.101318359375, "loss_aux_layer_5": 0.1029052734375, "loss_aux_layer_6": 0.105712890625, "loss_aux_layer_7": 0.100830078125, "loss_aux_layer_8": 0.0992431640625, "loss_aux_layer_9": 0.0975341796875, "step": 751, "total_loss": 0.7950409650802612 }, { "epoch": 0.14888140962185706, "grad_norm": 1.4103683233261108, "learning_rate": 5e-05, "llm_loss": 0.6256635934114456, "loss": 3.0389, "loss_aux_layer_0": 0.029998779296875, "loss_aux_layer_1": 0.0888671875, "loss_aux_layer_10": 0.1014404296875, "loss_aux_layer_11": 0.1077880859375, "loss_aux_layer_12": 0.1162109375, "loss_aux_layer_13": 0.1251220703125, "loss_aux_layer_14": 0.139404296875, "loss_aux_layer_15": 0.152099609375, "loss_aux_layer_16": 0.163818359375, "loss_aux_layer_17": 0.170654296875, "loss_aux_layer_18": 0.178955078125, "loss_aux_layer_19": 0.1787109375, "loss_aux_layer_2": 0.093505859375, "loss_aux_layer_20": 0.18359375, "loss_aux_layer_21": 0.18896484375, "loss_aux_layer_22": 0.210205078125, "loss_aux_layer_23": 0.254638671875, "loss_aux_layer_3": 0.1033935546875, "loss_aux_layer_4": 0.1053466796875, "loss_aux_layer_5": 0.1070556640625, "loss_aux_layer_6": 0.1094970703125, "loss_aux_layer_7": 0.1043701171875, "loss_aux_layer_8": 0.1029052734375, "loss_aux_layer_9": 0.100830078125, "step": 752, "total_loss": 0.759728416800499 }, { "epoch": 0.14907939021975847, "grad_norm": 1.0772656202316284, "learning_rate": 5e-05, "llm_loss": 0.7363102287054062, "loss": 3.4819, "loss_aux_layer_0": 0.03070068359375, "loss_aux_layer_1": 0.092529296875, "loss_aux_layer_10": 0.10302734375, "loss_aux_layer_11": 0.109619140625, "loss_aux_layer_12": 0.1180419921875, "loss_aux_layer_13": 0.1268310546875, "loss_aux_layer_14": 0.139404296875, "loss_aux_layer_15": 0.15087890625, "loss_aux_layer_16": 0.16162109375, "loss_aux_layer_17": 0.16748046875, "loss_aux_layer_18": 0.17529296875, "loss_aux_layer_19": 0.1748046875, "loss_aux_layer_2": 0.0958251953125, "loss_aux_layer_20": 0.180419921875, "loss_aux_layer_21": 0.186279296875, "loss_aux_layer_22": 0.208740234375, "loss_aux_layer_23": 0.2509765625, "loss_aux_layer_3": 0.105712890625, "loss_aux_layer_4": 0.1080322265625, "loss_aux_layer_5": 0.10888671875, "loss_aux_layer_6": 0.1114501953125, "loss_aux_layer_7": 0.1060791015625, "loss_aux_layer_8": 0.1044921875, "loss_aux_layer_9": 0.102294921875, "step": 753, "total_loss": 0.8704759925603867 }, { "epoch": 0.14927737081765988, "grad_norm": 1.0412362813949585, "learning_rate": 5e-05, "llm_loss": 0.6836530491709709, "loss": 3.2652, "loss_aux_layer_0": 0.029052734375, "loss_aux_layer_1": 0.0877685546875, "loss_aux_layer_10": 0.1005859375, "loss_aux_layer_11": 0.1070556640625, "loss_aux_layer_12": 0.1158447265625, "loss_aux_layer_13": 0.125, "loss_aux_layer_14": 0.13818359375, "loss_aux_layer_15": 0.14990234375, "loss_aux_layer_16": 0.1611328125, "loss_aux_layer_17": 0.16748046875, "loss_aux_layer_18": 0.17578125, "loss_aux_layer_19": 0.17578125, "loss_aux_layer_2": 0.09326171875, "loss_aux_layer_20": 0.181396484375, "loss_aux_layer_21": 0.187744140625, "loss_aux_layer_22": 0.208740234375, "loss_aux_layer_23": 0.24951171875, "loss_aux_layer_3": 0.103271484375, "loss_aux_layer_4": 0.10498046875, "loss_aux_layer_5": 0.1060791015625, "loss_aux_layer_6": 0.1083984375, "loss_aux_layer_7": 0.1036376953125, "loss_aux_layer_8": 0.10205078125, "loss_aux_layer_9": 0.10009765625, "step": 754, "total_loss": 0.8162970840930939 }, { "epoch": 0.1494753514155613, "grad_norm": 0.8151003122329712, "learning_rate": 5e-05, "llm_loss": 0.569393590092659, "loss": 2.8272, "loss_aux_layer_0": 0.02923583984375, "loss_aux_layer_1": 0.09326171875, "loss_aux_layer_10": 0.104736328125, "loss_aux_layer_11": 0.1116943359375, "loss_aux_layer_12": 0.1199951171875, "loss_aux_layer_13": 0.129150390625, "loss_aux_layer_14": 0.141845703125, "loss_aux_layer_15": 0.15380859375, "loss_aux_layer_16": 0.16552734375, "loss_aux_layer_17": 0.172119140625, "loss_aux_layer_18": 0.179931640625, "loss_aux_layer_19": 0.179931640625, "loss_aux_layer_2": 0.0982666015625, "loss_aux_layer_20": 0.185546875, "loss_aux_layer_21": 0.192138671875, "loss_aux_layer_22": 0.21728515625, "loss_aux_layer_23": 0.2607421875, "loss_aux_layer_3": 0.1083984375, "loss_aux_layer_4": 0.1102294921875, "loss_aux_layer_5": 0.1114501953125, "loss_aux_layer_6": 0.1136474609375, "loss_aux_layer_7": 0.108154296875, "loss_aux_layer_8": 0.1063232421875, "loss_aux_layer_9": 0.10400390625, "step": 755, "total_loss": 0.7067975699901581 }, { "epoch": 0.14967333201346267, "grad_norm": 1.587014079093933, "learning_rate": 5e-05, "llm_loss": 0.6227233856916428, "loss": 3.0241, "loss_aux_layer_0": 0.029022216796875, "loss_aux_layer_1": 0.0867919921875, "loss_aux_layer_10": 0.1004638671875, "loss_aux_layer_11": 0.10693359375, "loss_aux_layer_12": 0.115478515625, "loss_aux_layer_13": 0.12451171875, "loss_aux_layer_14": 0.1376953125, "loss_aux_layer_15": 0.149658203125, "loss_aux_layer_16": 0.161865234375, "loss_aux_layer_17": 0.169189453125, "loss_aux_layer_18": 0.17822265625, "loss_aux_layer_19": 0.17919921875, "loss_aux_layer_2": 0.0927734375, "loss_aux_layer_20": 0.184814453125, "loss_aux_layer_21": 0.18994140625, "loss_aux_layer_22": 0.2119140625, "loss_aux_layer_23": 0.255126953125, "loss_aux_layer_3": 0.1024169921875, "loss_aux_layer_4": 0.1043701171875, "loss_aux_layer_5": 0.1058349609375, "loss_aux_layer_6": 0.1082763671875, "loss_aux_layer_7": 0.1033935546875, "loss_aux_layer_8": 0.1015625, "loss_aux_layer_9": 0.099853515625, "step": 756, "total_loss": 0.7560235261917114 }, { "epoch": 0.14987131261136408, "grad_norm": 1.6339848041534424, "learning_rate": 5e-05, "llm_loss": 0.5676173865795135, "loss": 2.8086, "loss_aux_layer_0": 0.02703857421875, "loss_aux_layer_1": 0.087890625, "loss_aux_layer_10": 0.1036376953125, "loss_aux_layer_11": 0.1097412109375, "loss_aux_layer_12": 0.117919921875, "loss_aux_layer_13": 0.1265869140625, "loss_aux_layer_14": 0.138671875, "loss_aux_layer_15": 0.14990234375, "loss_aux_layer_16": 0.162109375, "loss_aux_layer_17": 0.168212890625, "loss_aux_layer_18": 0.177490234375, "loss_aux_layer_19": 0.178466796875, "loss_aux_layer_2": 0.0950927734375, "loss_aux_layer_20": 0.1826171875, "loss_aux_layer_21": 0.188720703125, "loss_aux_layer_22": 0.211181640625, "loss_aux_layer_23": 0.255126953125, "loss_aux_layer_3": 0.1058349609375, "loss_aux_layer_4": 0.1075439453125, "loss_aux_layer_5": 0.10888671875, "loss_aux_layer_6": 0.1114501953125, "loss_aux_layer_7": 0.1065673828125, "loss_aux_layer_8": 0.1048583984375, "loss_aux_layer_9": 0.10302734375, "step": 757, "total_loss": 0.7021411061286926 }, { "epoch": 0.1500692932092655, "grad_norm": 1.65322744846344, "learning_rate": 5e-05, "llm_loss": 0.6499436944723129, "loss": 3.1183, "loss_aux_layer_0": 0.02850341796875, "loss_aux_layer_1": 0.0811767578125, "loss_aux_layer_10": 0.09619140625, "loss_aux_layer_11": 0.1024169921875, "loss_aux_layer_12": 0.111328125, "loss_aux_layer_13": 0.1201171875, "loss_aux_layer_14": 0.134765625, "loss_aux_layer_15": 0.1474609375, "loss_aux_layer_16": 0.159423828125, "loss_aux_layer_17": 0.167724609375, "loss_aux_layer_18": 0.1767578125, "loss_aux_layer_19": 0.17822265625, "loss_aux_layer_2": 0.08740234375, "loss_aux_layer_20": 0.18310546875, "loss_aux_layer_21": 0.1884765625, "loss_aux_layer_22": 0.20947265625, "loss_aux_layer_23": 0.25390625, "loss_aux_layer_3": 0.09619140625, "loss_aux_layer_4": 0.09814453125, "loss_aux_layer_5": 0.0994873046875, "loss_aux_layer_6": 0.1021728515625, "loss_aux_layer_7": 0.0975341796875, "loss_aux_layer_8": 0.096435546875, "loss_aux_layer_9": 0.094970703125, "step": 758, "total_loss": 0.7795656770467758 }, { "epoch": 0.1502672738071669, "grad_norm": 3.374354124069214, "learning_rate": 5e-05, "llm_loss": 0.6452139914035797, "loss": 3.1204, "loss_aux_layer_0": 0.0260009765625, "loss_aux_layer_1": 0.0882568359375, "loss_aux_layer_10": 0.103271484375, "loss_aux_layer_11": 0.109619140625, "loss_aux_layer_12": 0.1185302734375, "loss_aux_layer_13": 0.1268310546875, "loss_aux_layer_14": 0.139892578125, "loss_aux_layer_15": 0.15185546875, "loss_aux_layer_16": 0.1630859375, "loss_aux_layer_17": 0.168701171875, "loss_aux_layer_18": 0.177490234375, "loss_aux_layer_19": 0.17822265625, "loss_aux_layer_2": 0.09619140625, "loss_aux_layer_20": 0.183837890625, "loss_aux_layer_21": 0.18896484375, "loss_aux_layer_22": 0.2109375, "loss_aux_layer_23": 0.25390625, "loss_aux_layer_3": 0.1063232421875, "loss_aux_layer_4": 0.1083984375, "loss_aux_layer_5": 0.1097412109375, "loss_aux_layer_6": 0.11279296875, "loss_aux_layer_7": 0.107421875, "loss_aux_layer_8": 0.1051025390625, "loss_aux_layer_9": 0.102783203125, "step": 759, "total_loss": 0.7801061421632767 }, { "epoch": 0.1504652544050683, "grad_norm": 2.7628064155578613, "learning_rate": 5e-05, "llm_loss": 0.5961845219135284, "loss": 2.9113, "loss_aux_layer_0": 0.026824951171875, "loss_aux_layer_1": 0.0836181640625, "loss_aux_layer_10": 0.1005859375, "loss_aux_layer_11": 0.1063232421875, "loss_aux_layer_12": 0.11474609375, "loss_aux_layer_13": 0.123046875, "loss_aux_layer_14": 0.135986328125, "loss_aux_layer_15": 0.14892578125, "loss_aux_layer_16": 0.160888671875, "loss_aux_layer_17": 0.167236328125, "loss_aux_layer_18": 0.17529296875, "loss_aux_layer_19": 0.176513671875, "loss_aux_layer_2": 0.0908203125, "loss_aux_layer_20": 0.18212890625, "loss_aux_layer_21": 0.18701171875, "loss_aux_layer_22": 0.207275390625, "loss_aux_layer_23": 0.24951171875, "loss_aux_layer_3": 0.1007080078125, "loss_aux_layer_4": 0.102783203125, "loss_aux_layer_5": 0.1043701171875, "loss_aux_layer_6": 0.1087646484375, "loss_aux_layer_7": 0.10400390625, "loss_aux_layer_8": 0.1016845703125, "loss_aux_layer_9": 0.099853515625, "step": 760, "total_loss": 0.7278209924697876 }, { "epoch": 0.15066323500296971, "grad_norm": 1.5441524982452393, "learning_rate": 5e-05, "llm_loss": 0.6188507378101349, "loss": 2.993, "loss_aux_layer_0": 0.026611328125, "loss_aux_layer_1": 0.0792236328125, "loss_aux_layer_10": 0.0970458984375, "loss_aux_layer_11": 0.102783203125, "loss_aux_layer_12": 0.111572265625, "loss_aux_layer_13": 0.1202392578125, "loss_aux_layer_14": 0.13427734375, "loss_aux_layer_15": 0.14697265625, "loss_aux_layer_16": 0.159423828125, "loss_aux_layer_17": 0.16650390625, "loss_aux_layer_18": 0.175048828125, "loss_aux_layer_19": 0.176513671875, "loss_aux_layer_2": 0.0867919921875, "loss_aux_layer_20": 0.182373046875, "loss_aux_layer_21": 0.187255859375, "loss_aux_layer_22": 0.208740234375, "loss_aux_layer_23": 0.251953125, "loss_aux_layer_3": 0.096435546875, "loss_aux_layer_4": 0.0986328125, "loss_aux_layer_5": 0.100341796875, "loss_aux_layer_6": 0.1038818359375, "loss_aux_layer_7": 0.099365234375, "loss_aux_layer_8": 0.097900390625, "loss_aux_layer_9": 0.096435546875, "step": 761, "total_loss": 0.7482471764087677 }, { "epoch": 0.15086121560087112, "grad_norm": 3.300027847290039, "learning_rate": 5e-05, "llm_loss": 0.6999901533126831, "loss": 3.324, "loss_aux_layer_0": 0.026824951171875, "loss_aux_layer_1": 0.0810546875, "loss_aux_layer_10": 0.096435546875, "loss_aux_layer_11": 0.102783203125, "loss_aux_layer_12": 0.11181640625, "loss_aux_layer_13": 0.1214599609375, "loss_aux_layer_14": 0.13623046875, "loss_aux_layer_15": 0.150146484375, "loss_aux_layer_16": 0.16259765625, "loss_aux_layer_17": 0.169677734375, "loss_aux_layer_18": 0.17919921875, "loss_aux_layer_19": 0.1806640625, "loss_aux_layer_2": 0.0892333984375, "loss_aux_layer_20": 0.186279296875, "loss_aux_layer_21": 0.191650390625, "loss_aux_layer_22": 0.213623046875, "loss_aux_layer_23": 0.2568359375, "loss_aux_layer_3": 0.096923828125, "loss_aux_layer_4": 0.098388671875, "loss_aux_layer_5": 0.0997314453125, "loss_aux_layer_6": 0.1029052734375, "loss_aux_layer_7": 0.0982666015625, "loss_aux_layer_8": 0.0966796875, "loss_aux_layer_9": 0.0950927734375, "step": 762, "total_loss": 0.830999568104744 }, { "epoch": 0.15105919619877253, "grad_norm": 1.2624567747116089, "learning_rate": 5e-05, "llm_loss": 0.6087707504630089, "loss": 2.9709, "loss_aux_layer_0": 0.028717041015625, "loss_aux_layer_1": 0.0885009765625, "loss_aux_layer_10": 0.102783203125, "loss_aux_layer_11": 0.10888671875, "loss_aux_layer_12": 0.116943359375, "loss_aux_layer_13": 0.1256103515625, "loss_aux_layer_14": 0.137939453125, "loss_aux_layer_15": 0.14990234375, "loss_aux_layer_16": 0.161865234375, "loss_aux_layer_17": 0.1689453125, "loss_aux_layer_18": 0.17724609375, "loss_aux_layer_19": 0.17822265625, "loss_aux_layer_2": 0.09423828125, "loss_aux_layer_20": 0.18359375, "loss_aux_layer_21": 0.1875, "loss_aux_layer_22": 0.208740234375, "loss_aux_layer_23": 0.25, "loss_aux_layer_3": 0.10498046875, "loss_aux_layer_4": 0.107177734375, "loss_aux_layer_5": 0.108642578125, "loss_aux_layer_6": 0.1119384765625, "loss_aux_layer_7": 0.1060791015625, "loss_aux_layer_8": 0.103759765625, "loss_aux_layer_9": 0.101806640625, "step": 763, "total_loss": 0.7427233755588531 }, { "epoch": 0.15125717679667391, "grad_norm": 2.204732656478882, "learning_rate": 5e-05, "llm_loss": 0.625864177942276, "loss": 3.0236, "loss_aux_layer_0": 0.027984619140625, "loss_aux_layer_1": 0.08154296875, "loss_aux_layer_10": 0.09765625, "loss_aux_layer_11": 0.1033935546875, "loss_aux_layer_12": 0.11181640625, "loss_aux_layer_13": 0.12060546875, "loss_aux_layer_14": 0.134765625, "loss_aux_layer_15": 0.146728515625, "loss_aux_layer_16": 0.15869140625, "loss_aux_layer_17": 0.165771484375, "loss_aux_layer_18": 0.175048828125, "loss_aux_layer_19": 0.176513671875, "loss_aux_layer_2": 0.0894775390625, "loss_aux_layer_20": 0.18212890625, "loss_aux_layer_21": 0.186767578125, "loss_aux_layer_22": 0.2080078125, "loss_aux_layer_23": 0.251220703125, "loss_aux_layer_3": 0.0982666015625, "loss_aux_layer_4": 0.100341796875, "loss_aux_layer_5": 0.1021728515625, "loss_aux_layer_6": 0.105224609375, "loss_aux_layer_7": 0.1004638671875, "loss_aux_layer_8": 0.0986328125, "loss_aux_layer_9": 0.09716796875, "step": 764, "total_loss": 0.7558930665254593 }, { "epoch": 0.15145515739457532, "grad_norm": 2.7150051593780518, "learning_rate": 5e-05, "llm_loss": 0.5986123085021973, "loss": 2.9129, "loss_aux_layer_0": 0.027740478515625, "loss_aux_layer_1": 0.081298828125, "loss_aux_layer_10": 0.0965576171875, "loss_aux_layer_11": 0.1024169921875, "loss_aux_layer_12": 0.1107177734375, "loss_aux_layer_13": 0.11962890625, "loss_aux_layer_14": 0.133056640625, "loss_aux_layer_15": 0.145751953125, "loss_aux_layer_16": 0.15771484375, "loss_aux_layer_17": 0.1650390625, "loss_aux_layer_18": 0.17333984375, "loss_aux_layer_19": 0.173828125, "loss_aux_layer_2": 0.0889892578125, "loss_aux_layer_20": 0.179931640625, "loss_aux_layer_21": 0.18701171875, "loss_aux_layer_22": 0.210205078125, "loss_aux_layer_23": 0.255126953125, "loss_aux_layer_3": 0.0997314453125, "loss_aux_layer_4": 0.10107421875, "loss_aux_layer_5": 0.10302734375, "loss_aux_layer_6": 0.10546875, "loss_aux_layer_7": 0.10009765625, "loss_aux_layer_8": 0.09765625, "loss_aux_layer_9": 0.0958251953125, "step": 765, "total_loss": 0.728225588798523 }, { "epoch": 0.15165313799247673, "grad_norm": 1.3390352725982666, "learning_rate": 5e-05, "llm_loss": 0.5740356296300888, "loss": 2.8106, "loss_aux_layer_0": 0.02789306640625, "loss_aux_layer_1": 0.0787353515625, "loss_aux_layer_10": 0.09521484375, "loss_aux_layer_11": 0.101318359375, "loss_aux_layer_12": 0.110107421875, "loss_aux_layer_13": 0.119140625, "loss_aux_layer_14": 0.1328125, "loss_aux_layer_15": 0.14599609375, "loss_aux_layer_16": 0.158447265625, "loss_aux_layer_17": 0.165283203125, "loss_aux_layer_18": 0.174560546875, "loss_aux_layer_19": 0.17578125, "loss_aux_layer_2": 0.0869140625, "loss_aux_layer_20": 0.181640625, "loss_aux_layer_21": 0.187744140625, "loss_aux_layer_22": 0.209716796875, "loss_aux_layer_23": 0.25341796875, "loss_aux_layer_3": 0.0958251953125, "loss_aux_layer_4": 0.09765625, "loss_aux_layer_5": 0.0992431640625, "loss_aux_layer_6": 0.10205078125, "loss_aux_layer_7": 0.096923828125, "loss_aux_layer_8": 0.0955810546875, "loss_aux_layer_9": 0.09423828125, "step": 766, "total_loss": 0.7026575356721878 }, { "epoch": 0.15185111859037814, "grad_norm": 2.8060109615325928, "learning_rate": 5e-05, "llm_loss": 0.7223058342933655, "loss": 3.4043, "loss_aux_layer_0": 0.027008056640625, "loss_aux_layer_1": 0.080078125, "loss_aux_layer_10": 0.096923828125, "loss_aux_layer_11": 0.1021728515625, "loss_aux_layer_12": 0.1104736328125, "loss_aux_layer_13": 0.118896484375, "loss_aux_layer_14": 0.1328125, "loss_aux_layer_15": 0.145263671875, "loss_aux_layer_16": 0.156982421875, "loss_aux_layer_17": 0.16357421875, "loss_aux_layer_18": 0.173095703125, "loss_aux_layer_19": 0.174560546875, "loss_aux_layer_2": 0.088134765625, "loss_aux_layer_20": 0.1796875, "loss_aux_layer_21": 0.185791015625, "loss_aux_layer_22": 0.20654296875, "loss_aux_layer_23": 0.250244140625, "loss_aux_layer_3": 0.09814453125, "loss_aux_layer_4": 0.10009765625, "loss_aux_layer_5": 0.1016845703125, "loss_aux_layer_6": 0.1051025390625, "loss_aux_layer_7": 0.099609375, "loss_aux_layer_8": 0.097900390625, "loss_aux_layer_9": 0.0963134765625, "step": 767, "total_loss": 0.8510789126157761 }, { "epoch": 0.15204909918827955, "grad_norm": 1.354508638381958, "learning_rate": 5e-05, "llm_loss": 0.6089600771665573, "loss": 2.9697, "loss_aux_layer_0": 0.027984619140625, "loss_aux_layer_1": 0.0867919921875, "loss_aux_layer_10": 0.10107421875, "loss_aux_layer_11": 0.1075439453125, "loss_aux_layer_12": 0.1160888671875, "loss_aux_layer_13": 0.1243896484375, "loss_aux_layer_14": 0.13720703125, "loss_aux_layer_15": 0.14892578125, "loss_aux_layer_16": 0.16015625, "loss_aux_layer_17": 0.16796875, "loss_aux_layer_18": 0.17578125, "loss_aux_layer_19": 0.176513671875, "loss_aux_layer_2": 0.09619140625, "loss_aux_layer_20": 0.18212890625, "loss_aux_layer_21": 0.18798828125, "loss_aux_layer_22": 0.211669921875, "loss_aux_layer_23": 0.256103515625, "loss_aux_layer_3": 0.1046142578125, "loss_aux_layer_4": 0.1064453125, "loss_aux_layer_5": 0.1077880859375, "loss_aux_layer_6": 0.110595703125, "loss_aux_layer_7": 0.1053466796875, "loss_aux_layer_8": 0.102783203125, "loss_aux_layer_9": 0.1004638671875, "step": 768, "total_loss": 0.7424145638942719 }, { "epoch": 0.15224707978618096, "grad_norm": 1.9834247827529907, "learning_rate": 5e-05, "llm_loss": 0.6596610844135284, "loss": 3.1615, "loss_aux_layer_0": 0.02764892578125, "loss_aux_layer_1": 0.083984375, "loss_aux_layer_10": 0.099853515625, "loss_aux_layer_11": 0.105224609375, "loss_aux_layer_12": 0.1131591796875, "loss_aux_layer_13": 0.121826171875, "loss_aux_layer_14": 0.1357421875, "loss_aux_layer_15": 0.14794921875, "loss_aux_layer_16": 0.1591796875, "loss_aux_layer_17": 0.16552734375, "loss_aux_layer_18": 0.1748046875, "loss_aux_layer_19": 0.174560546875, "loss_aux_layer_2": 0.091552734375, "loss_aux_layer_20": 0.17919921875, "loss_aux_layer_21": 0.184814453125, "loss_aux_layer_22": 0.205078125, "loss_aux_layer_23": 0.24951171875, "loss_aux_layer_3": 0.1009521484375, "loss_aux_layer_4": 0.1029052734375, "loss_aux_layer_5": 0.1038818359375, "loss_aux_layer_6": 0.1070556640625, "loss_aux_layer_7": 0.1021728515625, "loss_aux_layer_8": 0.100830078125, "loss_aux_layer_9": 0.0994873046875, "step": 769, "total_loss": 0.7903799712657928 }, { "epoch": 0.15244506038408237, "grad_norm": 1.974450945854187, "learning_rate": 5e-05, "llm_loss": 0.5614774823188782, "loss": 2.7753, "loss_aux_layer_0": 0.026458740234375, "loss_aux_layer_1": 0.085205078125, "loss_aux_layer_10": 0.100341796875, "loss_aux_layer_11": 0.1063232421875, "loss_aux_layer_12": 0.1146240234375, "loss_aux_layer_13": 0.123779296875, "loss_aux_layer_14": 0.13720703125, "loss_aux_layer_15": 0.1494140625, "loss_aux_layer_16": 0.1611328125, "loss_aux_layer_17": 0.16796875, "loss_aux_layer_18": 0.177001953125, "loss_aux_layer_19": 0.17724609375, "loss_aux_layer_2": 0.0926513671875, "loss_aux_layer_20": 0.18212890625, "loss_aux_layer_21": 0.18701171875, "loss_aux_layer_22": 0.20751953125, "loss_aux_layer_23": 0.250244140625, "loss_aux_layer_3": 0.103515625, "loss_aux_layer_4": 0.1051025390625, "loss_aux_layer_5": 0.1063232421875, "loss_aux_layer_6": 0.10888671875, "loss_aux_layer_7": 0.1038818359375, "loss_aux_layer_8": 0.1019287109375, "loss_aux_layer_9": 0.099609375, "step": 770, "total_loss": 0.6938188523054123 }, { "epoch": 0.15264304098198375, "grad_norm": 1.3250224590301514, "learning_rate": 5e-05, "llm_loss": 0.5876612961292267, "loss": 2.8777, "loss_aux_layer_0": 0.028228759765625, "loss_aux_layer_1": 0.0858154296875, "loss_aux_layer_10": 0.09912109375, "loss_aux_layer_11": 0.1053466796875, "loss_aux_layer_12": 0.113525390625, "loss_aux_layer_13": 0.1226806640625, "loss_aux_layer_14": 0.135986328125, "loss_aux_layer_15": 0.148193359375, "loss_aux_layer_16": 0.160400390625, "loss_aux_layer_17": 0.1669921875, "loss_aux_layer_18": 0.176513671875, "loss_aux_layer_19": 0.17724609375, "loss_aux_layer_2": 0.0911865234375, "loss_aux_layer_20": 0.182373046875, "loss_aux_layer_21": 0.188720703125, "loss_aux_layer_22": 0.2099609375, "loss_aux_layer_23": 0.254150390625, "loss_aux_layer_3": 0.101318359375, "loss_aux_layer_4": 0.1031494140625, "loss_aux_layer_5": 0.104248046875, "loss_aux_layer_6": 0.107177734375, "loss_aux_layer_7": 0.1019287109375, "loss_aux_layer_8": 0.1002197265625, "loss_aux_layer_9": 0.09814453125, "step": 771, "total_loss": 0.7194223403930664 }, { "epoch": 0.15284102157988516, "grad_norm": 1.7997113466262817, "learning_rate": 5e-05, "llm_loss": 0.6213819086551666, "loss": 3.0188, "loss_aux_layer_0": 0.02838134765625, "loss_aux_layer_1": 0.086181640625, "loss_aux_layer_10": 0.1007080078125, "loss_aux_layer_11": 0.1064453125, "loss_aux_layer_12": 0.1148681640625, "loss_aux_layer_13": 0.1239013671875, "loss_aux_layer_14": 0.137451171875, "loss_aux_layer_15": 0.151123046875, "loss_aux_layer_16": 0.1640625, "loss_aux_layer_17": 0.170166015625, "loss_aux_layer_18": 0.17919921875, "loss_aux_layer_19": 0.179443359375, "loss_aux_layer_2": 0.0927734375, "loss_aux_layer_20": 0.184814453125, "loss_aux_layer_21": 0.18994140625, "loss_aux_layer_22": 0.20947265625, "loss_aux_layer_23": 0.251953125, "loss_aux_layer_3": 0.1029052734375, "loss_aux_layer_4": 0.1046142578125, "loss_aux_layer_5": 0.10595703125, "loss_aux_layer_6": 0.1090087890625, "loss_aux_layer_7": 0.1038818359375, "loss_aux_layer_8": 0.101806640625, "loss_aux_layer_9": 0.0999755859375, "step": 772, "total_loss": 0.7547025829553604 }, { "epoch": 0.15303900217778657, "grad_norm": 1.447867751121521, "learning_rate": 5e-05, "llm_loss": 0.5952834784984589, "loss": 2.9117, "loss_aux_layer_0": 0.029541015625, "loss_aux_layer_1": 0.0853271484375, "loss_aux_layer_10": 0.0987548828125, "loss_aux_layer_11": 0.104736328125, "loss_aux_layer_12": 0.1141357421875, "loss_aux_layer_13": 0.1231689453125, "loss_aux_layer_14": 0.13720703125, "loss_aux_layer_15": 0.150390625, "loss_aux_layer_16": 0.16259765625, "loss_aux_layer_17": 0.169921875, "loss_aux_layer_18": 0.17919921875, "loss_aux_layer_19": 0.180419921875, "loss_aux_layer_2": 0.09033203125, "loss_aux_layer_20": 0.18603515625, "loss_aux_layer_21": 0.191650390625, "loss_aux_layer_22": 0.212158203125, "loss_aux_layer_23": 0.25634765625, "loss_aux_layer_3": 0.100341796875, "loss_aux_layer_4": 0.1024169921875, "loss_aux_layer_5": 0.1038818359375, "loss_aux_layer_6": 0.106201171875, "loss_aux_layer_7": 0.101318359375, "loss_aux_layer_8": 0.0994873046875, "loss_aux_layer_9": 0.0980224609375, "step": 773, "total_loss": 0.7279265075922012 }, { "epoch": 0.15323698277568798, "grad_norm": 1.2550777196884155, "learning_rate": 5e-05, "llm_loss": 0.6900696158409119, "loss": 3.2712, "loss_aux_layer_0": 0.030242919921875, "loss_aux_layer_1": 0.0826416015625, "loss_aux_layer_10": 0.094482421875, "loss_aux_layer_11": 0.1004638671875, "loss_aux_layer_12": 0.1087646484375, "loss_aux_layer_13": 0.1175537109375, "loss_aux_layer_14": 0.1318359375, "loss_aux_layer_15": 0.14404296875, "loss_aux_layer_16": 0.156005859375, "loss_aux_layer_17": 0.163818359375, "loss_aux_layer_18": 0.172607421875, "loss_aux_layer_19": 0.173828125, "loss_aux_layer_2": 0.087158203125, "loss_aux_layer_20": 0.179931640625, "loss_aux_layer_21": 0.185791015625, "loss_aux_layer_22": 0.207275390625, "loss_aux_layer_23": 0.25, "loss_aux_layer_3": 0.0955810546875, "loss_aux_layer_4": 0.097412109375, "loss_aux_layer_5": 0.0985107421875, "loss_aux_layer_6": 0.101318359375, "loss_aux_layer_7": 0.096435546875, "loss_aux_layer_8": 0.0953369140625, "loss_aux_layer_9": 0.093994140625, "step": 774, "total_loss": 0.817790299654007 }, { "epoch": 0.1534349633735894, "grad_norm": 1.4535448551177979, "learning_rate": 5e-05, "llm_loss": 0.6082641780376434, "loss": 2.981, "loss_aux_layer_0": 0.03021240234375, "loss_aux_layer_1": 0.0921630859375, "loss_aux_layer_10": 0.10498046875, "loss_aux_layer_11": 0.111328125, "loss_aux_layer_12": 0.1202392578125, "loss_aux_layer_13": 0.1287841796875, "loss_aux_layer_14": 0.141845703125, "loss_aux_layer_15": 0.154052734375, "loss_aux_layer_16": 0.166015625, "loss_aux_layer_17": 0.17138671875, "loss_aux_layer_18": 0.179931640625, "loss_aux_layer_19": 0.1796875, "loss_aux_layer_2": 0.098388671875, "loss_aux_layer_20": 0.18408203125, "loss_aux_layer_21": 0.190185546875, "loss_aux_layer_22": 0.211669921875, "loss_aux_layer_23": 0.255126953125, "loss_aux_layer_3": 0.1090087890625, "loss_aux_layer_4": 0.1107177734375, "loss_aux_layer_5": 0.1119384765625, "loss_aux_layer_6": 0.11474609375, "loss_aux_layer_7": 0.109130859375, "loss_aux_layer_8": 0.107177734375, "loss_aux_layer_9": 0.104736328125, "step": 775, "total_loss": 0.7452514171600342 }, { "epoch": 0.1536329439714908, "grad_norm": 1.4555498361587524, "learning_rate": 5e-05, "llm_loss": 0.6594593524932861, "loss": 3.1601, "loss_aux_layer_0": 0.026153564453125, "loss_aux_layer_1": 0.0819091796875, "loss_aux_layer_10": 0.1002197265625, "loss_aux_layer_11": 0.106201171875, "loss_aux_layer_12": 0.1148681640625, "loss_aux_layer_13": 0.12353515625, "loss_aux_layer_14": 0.13671875, "loss_aux_layer_15": 0.148681640625, "loss_aux_layer_16": 0.160400390625, "loss_aux_layer_17": 0.167236328125, "loss_aux_layer_18": 0.175048828125, "loss_aux_layer_19": 0.1748046875, "loss_aux_layer_2": 0.089599609375, "loss_aux_layer_20": 0.17919921875, "loss_aux_layer_21": 0.184326171875, "loss_aux_layer_22": 0.204345703125, "loss_aux_layer_23": 0.246826171875, "loss_aux_layer_3": 0.099609375, "loss_aux_layer_4": 0.1021728515625, "loss_aux_layer_5": 0.1038818359375, "loss_aux_layer_6": 0.1068115234375, "loss_aux_layer_7": 0.102294921875, "loss_aux_layer_8": 0.1005859375, "loss_aux_layer_9": 0.0989990234375, "step": 776, "total_loss": 0.7900310307741165 }, { "epoch": 0.1538309245693922, "grad_norm": 1.806321382522583, "learning_rate": 5e-05, "llm_loss": 0.58584164083004, "loss": 2.8555, "loss_aux_layer_0": 0.026763916015625, "loss_aux_layer_1": 0.078857421875, "loss_aux_layer_10": 0.0943603515625, "loss_aux_layer_11": 0.1004638671875, "loss_aux_layer_12": 0.109619140625, "loss_aux_layer_13": 0.1187744140625, "loss_aux_layer_14": 0.13232421875, "loss_aux_layer_15": 0.14501953125, "loss_aux_layer_16": 0.157470703125, "loss_aux_layer_17": 0.1650390625, "loss_aux_layer_18": 0.174560546875, "loss_aux_layer_19": 0.176025390625, "loss_aux_layer_2": 0.0858154296875, "loss_aux_layer_20": 0.181884765625, "loss_aux_layer_21": 0.187255859375, "loss_aux_layer_22": 0.207763671875, "loss_aux_layer_23": 0.251953125, "loss_aux_layer_3": 0.0955810546875, "loss_aux_layer_4": 0.0977783203125, "loss_aux_layer_5": 0.099365234375, "loss_aux_layer_6": 0.1014404296875, "loss_aux_layer_7": 0.0968017578125, "loss_aux_layer_8": 0.0950927734375, "loss_aux_layer_9": 0.09375, "step": 777, "total_loss": 0.7138766944408417 }, { "epoch": 0.15402890516729362, "grad_norm": 1.9580681324005127, "learning_rate": 5e-05, "llm_loss": 0.5595379024744034, "loss": 2.7861, "loss_aux_layer_0": 0.0299072265625, "loss_aux_layer_1": 0.0897216796875, "loss_aux_layer_10": 0.104248046875, "loss_aux_layer_11": 0.111083984375, "loss_aux_layer_12": 0.1195068359375, "loss_aux_layer_13": 0.128662109375, "loss_aux_layer_14": 0.142578125, "loss_aux_layer_15": 0.1552734375, "loss_aux_layer_16": 0.1669921875, "loss_aux_layer_17": 0.173828125, "loss_aux_layer_18": 0.18310546875, "loss_aux_layer_19": 0.1826171875, "loss_aux_layer_2": 0.0963134765625, "loss_aux_layer_20": 0.18701171875, "loss_aux_layer_21": 0.192138671875, "loss_aux_layer_22": 0.213134765625, "loss_aux_layer_23": 0.255859375, "loss_aux_layer_3": 0.106689453125, "loss_aux_layer_4": 0.1087646484375, "loss_aux_layer_5": 0.1107177734375, "loss_aux_layer_6": 0.113525390625, "loss_aux_layer_7": 0.1075439453125, "loss_aux_layer_8": 0.105712890625, "loss_aux_layer_9": 0.103515625, "step": 778, "total_loss": 0.6965279877185822 }, { "epoch": 0.154226885765195, "grad_norm": 0.6833025813102722, "learning_rate": 5e-05, "llm_loss": 0.7244826257228851, "loss": 3.4325, "loss_aux_layer_0": 0.02825927734375, "loss_aux_layer_1": 0.087890625, "loss_aux_layer_10": 0.1016845703125, "loss_aux_layer_11": 0.1080322265625, "loss_aux_layer_12": 0.116943359375, "loss_aux_layer_13": 0.125732421875, "loss_aux_layer_14": 0.13916015625, "loss_aux_layer_15": 0.151123046875, "loss_aux_layer_16": 0.16357421875, "loss_aux_layer_17": 0.169921875, "loss_aux_layer_18": 0.177978515625, "loss_aux_layer_19": 0.17822265625, "loss_aux_layer_2": 0.0931396484375, "loss_aux_layer_20": 0.183349609375, "loss_aux_layer_21": 0.187744140625, "loss_aux_layer_22": 0.209228515625, "loss_aux_layer_23": 0.25146484375, "loss_aux_layer_3": 0.103515625, "loss_aux_layer_4": 0.105712890625, "loss_aux_layer_5": 0.106689453125, "loss_aux_layer_6": 0.1094970703125, "loss_aux_layer_7": 0.1043701171875, "loss_aux_layer_8": 0.1026611328125, "loss_aux_layer_9": 0.1007080078125, "step": 779, "total_loss": 0.8581141829490662 }, { "epoch": 0.1544248663630964, "grad_norm": 1.9751856327056885, "learning_rate": 5e-05, "llm_loss": 0.621729664504528, "loss": 3.0104, "loss_aux_layer_0": 0.02642822265625, "loss_aux_layer_1": 0.0794677734375, "loss_aux_layer_10": 0.0980224609375, "loss_aux_layer_11": 0.1048583984375, "loss_aux_layer_12": 0.1142578125, "loss_aux_layer_13": 0.1229248046875, "loss_aux_layer_14": 0.135986328125, "loss_aux_layer_15": 0.1494140625, "loss_aux_layer_16": 0.16162109375, "loss_aux_layer_17": 0.168701171875, "loss_aux_layer_18": 0.177490234375, "loss_aux_layer_19": 0.177978515625, "loss_aux_layer_2": 0.087646484375, "loss_aux_layer_20": 0.18310546875, "loss_aux_layer_21": 0.18896484375, "loss_aux_layer_22": 0.2099609375, "loss_aux_layer_23": 0.253173828125, "loss_aux_layer_3": 0.097900390625, "loss_aux_layer_4": 0.100341796875, "loss_aux_layer_5": 0.1019287109375, "loss_aux_layer_6": 0.104736328125, "loss_aux_layer_7": 0.0999755859375, "loss_aux_layer_8": 0.098388671875, "loss_aux_layer_9": 0.0972900390625, "step": 780, "total_loss": 0.7526084333658218 }, { "epoch": 0.15462284696099782, "grad_norm": 1.4528608322143555, "learning_rate": 5e-05, "llm_loss": 0.6119478344917297, "loss": 2.9672, "loss_aux_layer_0": 0.027008056640625, "loss_aux_layer_1": 0.08349609375, "loss_aux_layer_10": 0.0975341796875, "loss_aux_layer_11": 0.1038818359375, "loss_aux_layer_12": 0.1126708984375, "loss_aux_layer_13": 0.1217041015625, "loss_aux_layer_14": 0.135498046875, "loss_aux_layer_15": 0.147705078125, "loss_aux_layer_16": 0.15966796875, "loss_aux_layer_17": 0.16552734375, "loss_aux_layer_18": 0.174560546875, "loss_aux_layer_19": 0.174560546875, "loss_aux_layer_2": 0.09033203125, "loss_aux_layer_20": 0.18017578125, "loss_aux_layer_21": 0.18505859375, "loss_aux_layer_22": 0.205078125, "loss_aux_layer_23": 0.2470703125, "loss_aux_layer_3": 0.099853515625, "loss_aux_layer_4": 0.1016845703125, "loss_aux_layer_5": 0.1026611328125, "loss_aux_layer_6": 0.105224609375, "loss_aux_layer_7": 0.1005859375, "loss_aux_layer_8": 0.0987548828125, "loss_aux_layer_9": 0.0968017578125, "step": 781, "total_loss": 0.7418018877506256 }, { "epoch": 0.15482082755889923, "grad_norm": 1.4393001794815063, "learning_rate": 5e-05, "llm_loss": 0.654211163520813, "loss": 3.1448, "loss_aux_layer_0": 0.02825927734375, "loss_aux_layer_1": 0.0845947265625, "loss_aux_layer_10": 0.0994873046875, "loss_aux_layer_11": 0.1055908203125, "loss_aux_layer_12": 0.1143798828125, "loss_aux_layer_13": 0.1231689453125, "loss_aux_layer_14": 0.13671875, "loss_aux_layer_15": 0.1494140625, "loss_aux_layer_16": 0.16162109375, "loss_aux_layer_17": 0.168701171875, "loss_aux_layer_18": 0.177001953125, "loss_aux_layer_19": 0.17724609375, "loss_aux_layer_2": 0.091552734375, "loss_aux_layer_20": 0.182373046875, "loss_aux_layer_21": 0.187744140625, "loss_aux_layer_22": 0.209716796875, "loss_aux_layer_23": 0.25244140625, "loss_aux_layer_3": 0.101318359375, "loss_aux_layer_4": 0.103271484375, "loss_aux_layer_5": 0.104248046875, "loss_aux_layer_6": 0.1072998046875, "loss_aux_layer_7": 0.1025390625, "loss_aux_layer_8": 0.1009521484375, "loss_aux_layer_9": 0.0989990234375, "step": 782, "total_loss": 0.786206990480423 }, { "epoch": 0.15501880815680064, "grad_norm": 2.789619207382202, "learning_rate": 5e-05, "llm_loss": 0.6179426610469818, "loss": 3.0016, "loss_aux_layer_0": 0.0277099609375, "loss_aux_layer_1": 0.08203125, "loss_aux_layer_10": 0.0997314453125, "loss_aux_layer_11": 0.106201171875, "loss_aux_layer_12": 0.114990234375, "loss_aux_layer_13": 0.124755859375, "loss_aux_layer_14": 0.138671875, "loss_aux_layer_15": 0.15185546875, "loss_aux_layer_16": 0.16357421875, "loss_aux_layer_17": 0.169921875, "loss_aux_layer_18": 0.1787109375, "loss_aux_layer_19": 0.178466796875, "loss_aux_layer_2": 0.089599609375, "loss_aux_layer_20": 0.1845703125, "loss_aux_layer_21": 0.189697265625, "loss_aux_layer_22": 0.2109375, "loss_aux_layer_23": 0.253173828125, "loss_aux_layer_3": 0.10009765625, "loss_aux_layer_4": 0.1026611328125, "loss_aux_layer_5": 0.1043701171875, "loss_aux_layer_6": 0.10693359375, "loss_aux_layer_7": 0.101806640625, "loss_aux_layer_8": 0.100341796875, "loss_aux_layer_9": 0.0987548828125, "step": 783, "total_loss": 0.7504022717475891 }, { "epoch": 0.15521678875470205, "grad_norm": 2.452059268951416, "learning_rate": 5e-05, "llm_loss": 0.6484792530536652, "loss": 3.1336, "loss_aux_layer_0": 0.027557373046875, "loss_aux_layer_1": 0.0859375, "loss_aux_layer_10": 0.1038818359375, "loss_aux_layer_11": 0.109619140625, "loss_aux_layer_12": 0.118408203125, "loss_aux_layer_13": 0.1273193359375, "loss_aux_layer_14": 0.140869140625, "loss_aux_layer_15": 0.15283203125, "loss_aux_layer_16": 0.1640625, "loss_aux_layer_17": 0.171142578125, "loss_aux_layer_18": 0.178955078125, "loss_aux_layer_19": 0.177978515625, "loss_aux_layer_2": 0.0936279296875, "loss_aux_layer_20": 0.183349609375, "loss_aux_layer_21": 0.18896484375, "loss_aux_layer_22": 0.210205078125, "loss_aux_layer_23": 0.251220703125, "loss_aux_layer_3": 0.105224609375, "loss_aux_layer_4": 0.1082763671875, "loss_aux_layer_5": 0.110595703125, "loss_aux_layer_6": 0.112548828125, "loss_aux_layer_7": 0.107421875, "loss_aux_layer_8": 0.105224609375, "loss_aux_layer_9": 0.10302734375, "step": 784, "total_loss": 0.7833935171365738 }, { "epoch": 0.15541476935260345, "grad_norm": 1.9116382598876953, "learning_rate": 5e-05, "llm_loss": 0.703113004565239, "loss": 3.3394, "loss_aux_layer_0": 0.02703857421875, "loss_aux_layer_1": 0.082275390625, "loss_aux_layer_10": 0.09814453125, "loss_aux_layer_11": 0.1038818359375, "loss_aux_layer_12": 0.1126708984375, "loss_aux_layer_13": 0.1219482421875, "loss_aux_layer_14": 0.136474609375, "loss_aux_layer_15": 0.150146484375, "loss_aux_layer_16": 0.1630859375, "loss_aux_layer_17": 0.1708984375, "loss_aux_layer_18": 0.1796875, "loss_aux_layer_19": 0.1796875, "loss_aux_layer_2": 0.0887451171875, "loss_aux_layer_20": 0.185302734375, "loss_aux_layer_21": 0.190673828125, "loss_aux_layer_22": 0.212158203125, "loss_aux_layer_23": 0.25634765625, "loss_aux_layer_3": 0.0985107421875, "loss_aux_layer_4": 0.100341796875, "loss_aux_layer_5": 0.10205078125, "loss_aux_layer_6": 0.104736328125, "loss_aux_layer_7": 0.1004638671875, "loss_aux_layer_8": 0.0989990234375, "loss_aux_layer_9": 0.0977783203125, "step": 785, "total_loss": 0.8348483294248581 }, { "epoch": 0.15561274995050486, "grad_norm": 1.3913757801055908, "learning_rate": 5e-05, "llm_loss": 0.5406358316540718, "loss": 2.6932, "loss_aux_layer_0": 0.027862548828125, "loss_aux_layer_1": 0.085693359375, "loss_aux_layer_10": 0.100830078125, "loss_aux_layer_11": 0.1070556640625, "loss_aux_layer_12": 0.115234375, "loss_aux_layer_13": 0.1241455078125, "loss_aux_layer_14": 0.137451171875, "loss_aux_layer_15": 0.1494140625, "loss_aux_layer_16": 0.16162109375, "loss_aux_layer_17": 0.168212890625, "loss_aux_layer_18": 0.176513671875, "loss_aux_layer_19": 0.176513671875, "loss_aux_layer_2": 0.09228515625, "loss_aux_layer_20": 0.18212890625, "loss_aux_layer_21": 0.188720703125, "loss_aux_layer_22": 0.2099609375, "loss_aux_layer_23": 0.252197265625, "loss_aux_layer_3": 0.102783203125, "loss_aux_layer_4": 0.104736328125, "loss_aux_layer_5": 0.1060791015625, "loss_aux_layer_6": 0.108642578125, "loss_aux_layer_7": 0.103515625, "loss_aux_layer_8": 0.1019287109375, "loss_aux_layer_9": 0.0999755859375, "step": 786, "total_loss": 0.673288568854332 }, { "epoch": 0.15581073054840625, "grad_norm": 2.3571603298187256, "learning_rate": 5e-05, "llm_loss": 0.638621985912323, "loss": 3.0856, "loss_aux_layer_0": 0.026458740234375, "loss_aux_layer_1": 0.0853271484375, "loss_aux_layer_10": 0.1031494140625, "loss_aux_layer_11": 0.1085205078125, "loss_aux_layer_12": 0.116455078125, "loss_aux_layer_13": 0.124755859375, "loss_aux_layer_14": 0.13671875, "loss_aux_layer_15": 0.148681640625, "loss_aux_layer_16": 0.159912109375, "loss_aux_layer_17": 0.166015625, "loss_aux_layer_18": 0.174072265625, "loss_aux_layer_19": 0.174560546875, "loss_aux_layer_2": 0.0943603515625, "loss_aux_layer_20": 0.17919921875, "loss_aux_layer_21": 0.185302734375, "loss_aux_layer_22": 0.207763671875, "loss_aux_layer_23": 0.25048828125, "loss_aux_layer_3": 0.1046142578125, "loss_aux_layer_4": 0.1068115234375, "loss_aux_layer_5": 0.1087646484375, "loss_aux_layer_6": 0.1112060546875, "loss_aux_layer_7": 0.1060791015625, "loss_aux_layer_8": 0.1043701171875, "loss_aux_layer_9": 0.1024169921875, "step": 787, "total_loss": 0.7714103311300278 }, { "epoch": 0.15600871114630765, "grad_norm": 2.0317294597625732, "learning_rate": 5e-05, "llm_loss": 0.5678390860557556, "loss": 2.8131, "loss_aux_layer_0": 0.027801513671875, "loss_aux_layer_1": 0.08984375, "loss_aux_layer_10": 0.1043701171875, "loss_aux_layer_11": 0.1104736328125, "loss_aux_layer_12": 0.118408203125, "loss_aux_layer_13": 0.126953125, "loss_aux_layer_14": 0.14013671875, "loss_aux_layer_15": 0.15234375, "loss_aux_layer_16": 0.1630859375, "loss_aux_layer_17": 0.1689453125, "loss_aux_layer_18": 0.177490234375, "loss_aux_layer_19": 0.17724609375, "loss_aux_layer_2": 0.0975341796875, "loss_aux_layer_20": 0.1826171875, "loss_aux_layer_21": 0.188720703125, "loss_aux_layer_22": 0.210205078125, "loss_aux_layer_23": 0.251953125, "loss_aux_layer_3": 0.1087646484375, "loss_aux_layer_4": 0.110595703125, "loss_aux_layer_5": 0.111328125, "loss_aux_layer_6": 0.11376953125, "loss_aux_layer_7": 0.1083984375, "loss_aux_layer_8": 0.1063232421875, "loss_aux_layer_9": 0.1038818359375, "step": 788, "total_loss": 0.7032866179943085 }, { "epoch": 0.15620669174420906, "grad_norm": 1.8482353687286377, "learning_rate": 5e-05, "llm_loss": 0.6390708088874817, "loss": 3.0807, "loss_aux_layer_0": 0.027099609375, "loss_aux_layer_1": 0.08154296875, "loss_aux_layer_10": 0.098876953125, "loss_aux_layer_11": 0.104736328125, "loss_aux_layer_12": 0.1134033203125, "loss_aux_layer_13": 0.1220703125, "loss_aux_layer_14": 0.13525390625, "loss_aux_layer_15": 0.14794921875, "loss_aux_layer_16": 0.16064453125, "loss_aux_layer_17": 0.167724609375, "loss_aux_layer_18": 0.17724609375, "loss_aux_layer_19": 0.17724609375, "loss_aux_layer_2": 0.08935546875, "loss_aux_layer_20": 0.182373046875, "loss_aux_layer_21": 0.187744140625, "loss_aux_layer_22": 0.20751953125, "loss_aux_layer_23": 0.25048828125, "loss_aux_layer_3": 0.100341796875, "loss_aux_layer_4": 0.1026611328125, "loss_aux_layer_5": 0.1048583984375, "loss_aux_layer_6": 0.1070556640625, "loss_aux_layer_7": 0.1016845703125, "loss_aux_layer_8": 0.0999755859375, "loss_aux_layer_9": 0.098388671875, "step": 789, "total_loss": 0.7701744586229324 }, { "epoch": 0.15640467234211047, "grad_norm": 1.2020022869110107, "learning_rate": 5e-05, "llm_loss": 0.5837663561105728, "loss": 2.8479, "loss_aux_layer_0": 0.0286865234375, "loss_aux_layer_1": 0.0831298828125, "loss_aux_layer_10": 0.0970458984375, "loss_aux_layer_11": 0.102783203125, "loss_aux_layer_12": 0.11083984375, "loss_aux_layer_13": 0.11865234375, "loss_aux_layer_14": 0.13134765625, "loss_aux_layer_15": 0.14306640625, "loss_aux_layer_16": 0.154052734375, "loss_aux_layer_17": 0.160888671875, "loss_aux_layer_18": 0.169189453125, "loss_aux_layer_19": 0.170166015625, "loss_aux_layer_2": 0.0899658203125, "loss_aux_layer_20": 0.176513671875, "loss_aux_layer_21": 0.1826171875, "loss_aux_layer_22": 0.20556640625, "loss_aux_layer_23": 0.2470703125, "loss_aux_layer_3": 0.10009765625, "loss_aux_layer_4": 0.1019287109375, "loss_aux_layer_5": 0.1031494140625, "loss_aux_layer_6": 0.1053466796875, "loss_aux_layer_7": 0.1002197265625, "loss_aux_layer_8": 0.098388671875, "loss_aux_layer_9": 0.0963134765625, "step": 790, "total_loss": 0.7119680047035217 }, { "epoch": 0.15660265294001188, "grad_norm": 2.7222232818603516, "learning_rate": 5e-05, "llm_loss": 0.5870154201984406, "loss": 2.8692, "loss_aux_layer_0": 0.0308837890625, "loss_aux_layer_1": 0.0830078125, "loss_aux_layer_10": 0.0965576171875, "loss_aux_layer_11": 0.1029052734375, "loss_aux_layer_12": 0.1112060546875, "loss_aux_layer_13": 0.1199951171875, "loss_aux_layer_14": 0.13427734375, "loss_aux_layer_15": 0.14697265625, "loss_aux_layer_16": 0.158935546875, "loss_aux_layer_17": 0.165771484375, "loss_aux_layer_18": 0.174072265625, "loss_aux_layer_19": 0.176025390625, "loss_aux_layer_2": 0.089111328125, "loss_aux_layer_20": 0.182373046875, "loss_aux_layer_21": 0.188720703125, "loss_aux_layer_22": 0.210693359375, "loss_aux_layer_23": 0.255126953125, "loss_aux_layer_3": 0.099365234375, "loss_aux_layer_4": 0.10107421875, "loss_aux_layer_5": 0.103271484375, "loss_aux_layer_6": 0.1048583984375, "loss_aux_layer_7": 0.0994873046875, "loss_aux_layer_8": 0.09716796875, "loss_aux_layer_9": 0.0955810546875, "step": 791, "total_loss": 0.7173103988170624 }, { "epoch": 0.1568006335379133, "grad_norm": 2.5360007286071777, "learning_rate": 5e-05, "llm_loss": 0.6495748162269592, "loss": 3.1228, "loss_aux_layer_0": 0.0274658203125, "loss_aux_layer_1": 0.08056640625, "loss_aux_layer_10": 0.0982666015625, "loss_aux_layer_11": 0.1041259765625, "loss_aux_layer_12": 0.113037109375, "loss_aux_layer_13": 0.121337890625, "loss_aux_layer_14": 0.13525390625, "loss_aux_layer_15": 0.14794921875, "loss_aux_layer_16": 0.160888671875, "loss_aux_layer_17": 0.167724609375, "loss_aux_layer_18": 0.17626953125, "loss_aux_layer_19": 0.177978515625, "loss_aux_layer_2": 0.08935546875, "loss_aux_layer_20": 0.183837890625, "loss_aux_layer_21": 0.18896484375, "loss_aux_layer_22": 0.211181640625, "loss_aux_layer_23": 0.254150390625, "loss_aux_layer_3": 0.0997314453125, "loss_aux_layer_4": 0.101318359375, "loss_aux_layer_5": 0.1033935546875, "loss_aux_layer_6": 0.1055908203125, "loss_aux_layer_7": 0.1011962890625, "loss_aux_layer_8": 0.0994873046875, "loss_aux_layer_9": 0.09765625, "step": 792, "total_loss": 0.7806974351406097 }, { "epoch": 0.1569986141358147, "grad_norm": 1.7427444458007812, "learning_rate": 5e-05, "llm_loss": 0.5756417661905289, "loss": 2.8278, "loss_aux_layer_0": 0.02703857421875, "loss_aux_layer_1": 0.08056640625, "loss_aux_layer_10": 0.0977783203125, "loss_aux_layer_11": 0.1038818359375, "loss_aux_layer_12": 0.113037109375, "loss_aux_layer_13": 0.1221923828125, "loss_aux_layer_14": 0.1361083984375, "loss_aux_layer_15": 0.149169921875, "loss_aux_layer_16": 0.162109375, "loss_aux_layer_17": 0.169189453125, "loss_aux_layer_18": 0.178466796875, "loss_aux_layer_19": 0.179931640625, "loss_aux_layer_2": 0.0865478515625, "loss_aux_layer_20": 0.185791015625, "loss_aux_layer_21": 0.19189453125, "loss_aux_layer_22": 0.21240234375, "loss_aux_layer_23": 0.25732421875, "loss_aux_layer_3": 0.0972900390625, "loss_aux_layer_4": 0.099853515625, "loss_aux_layer_5": 0.1016845703125, "loss_aux_layer_6": 0.1043701171875, "loss_aux_layer_7": 0.0994873046875, "loss_aux_layer_8": 0.0982666015625, "loss_aux_layer_9": 0.0970458984375, "step": 793, "total_loss": 0.7069452553987503 }, { "epoch": 0.15719659473371608, "grad_norm": 3.0351815223693848, "learning_rate": 5e-05, "llm_loss": 0.6561151742935181, "loss": 3.1449, "loss_aux_layer_0": 0.028076171875, "loss_aux_layer_1": 0.0814208984375, "loss_aux_layer_10": 0.096923828125, "loss_aux_layer_11": 0.102783203125, "loss_aux_layer_12": 0.1119384765625, "loss_aux_layer_13": 0.120849609375, "loss_aux_layer_14": 0.13427734375, "loss_aux_layer_15": 0.146728515625, "loss_aux_layer_16": 0.15966796875, "loss_aux_layer_17": 0.167236328125, "loss_aux_layer_18": 0.176025390625, "loss_aux_layer_19": 0.177001953125, "loss_aux_layer_2": 0.0914306640625, "loss_aux_layer_20": 0.181640625, "loss_aux_layer_21": 0.18701171875, "loss_aux_layer_22": 0.20751953125, "loss_aux_layer_23": 0.25146484375, "loss_aux_layer_3": 0.0997314453125, "loss_aux_layer_4": 0.1014404296875, "loss_aux_layer_5": 0.102783203125, "loss_aux_layer_6": 0.104248046875, "loss_aux_layer_7": 0.0997314453125, "loss_aux_layer_8": 0.097412109375, "loss_aux_layer_9": 0.095947265625, "step": 794, "total_loss": 0.7862319648265839 }, { "epoch": 0.1573945753316175, "grad_norm": 2.4784884452819824, "learning_rate": 5e-05, "llm_loss": 0.6065066307783127, "loss": 2.9582, "loss_aux_layer_0": 0.02789306640625, "loss_aux_layer_1": 0.08447265625, "loss_aux_layer_10": 0.1014404296875, "loss_aux_layer_11": 0.107177734375, "loss_aux_layer_12": 0.1160888671875, "loss_aux_layer_13": 0.125, "loss_aux_layer_14": 0.138427734375, "loss_aux_layer_15": 0.150390625, "loss_aux_layer_16": 0.161865234375, "loss_aux_layer_17": 0.1689453125, "loss_aux_layer_18": 0.17822265625, "loss_aux_layer_19": 0.177734375, "loss_aux_layer_2": 0.0933837890625, "loss_aux_layer_20": 0.18310546875, "loss_aux_layer_21": 0.188232421875, "loss_aux_layer_22": 0.20947265625, "loss_aux_layer_23": 0.2529296875, "loss_aux_layer_3": 0.103271484375, "loss_aux_layer_4": 0.1051025390625, "loss_aux_layer_5": 0.1060791015625, "loss_aux_layer_6": 0.1083984375, "loss_aux_layer_7": 0.1036376953125, "loss_aux_layer_8": 0.1021728515625, "loss_aux_layer_9": 0.100341796875, "step": 795, "total_loss": 0.7395530939102173 }, { "epoch": 0.1575925559295189, "grad_norm": 1.635115623474121, "learning_rate": 5e-05, "llm_loss": 0.6347010433673859, "loss": 3.0607, "loss_aux_layer_0": 0.026824951171875, "loss_aux_layer_1": 0.081787109375, "loss_aux_layer_10": 0.0997314453125, "loss_aux_layer_11": 0.105712890625, "loss_aux_layer_12": 0.1142578125, "loss_aux_layer_13": 0.123046875, "loss_aux_layer_14": 0.1357421875, "loss_aux_layer_15": 0.14794921875, "loss_aux_layer_16": 0.16015625, "loss_aux_layer_17": 0.166748046875, "loss_aux_layer_18": 0.175537109375, "loss_aux_layer_19": 0.175048828125, "loss_aux_layer_2": 0.091064453125, "loss_aux_layer_20": 0.180419921875, "loss_aux_layer_21": 0.18408203125, "loss_aux_layer_22": 0.2041015625, "loss_aux_layer_23": 0.245361328125, "loss_aux_layer_3": 0.100830078125, "loss_aux_layer_4": 0.1025390625, "loss_aux_layer_5": 0.10400390625, "loss_aux_layer_6": 0.1065673828125, "loss_aux_layer_7": 0.1015625, "loss_aux_layer_8": 0.099853515625, "loss_aux_layer_9": 0.0982666015625, "step": 796, "total_loss": 0.7651644796133041 }, { "epoch": 0.1577905365274203, "grad_norm": 2.719115734100342, "learning_rate": 5e-05, "llm_loss": 0.6892879456281662, "loss": 3.2803, "loss_aux_layer_0": 0.0277099609375, "loss_aux_layer_1": 0.0830078125, "loss_aux_layer_10": 0.09765625, "loss_aux_layer_11": 0.1036376953125, "loss_aux_layer_12": 0.1126708984375, "loss_aux_layer_13": 0.1214599609375, "loss_aux_layer_14": 0.134765625, "loss_aux_layer_15": 0.147705078125, "loss_aux_layer_16": 0.16015625, "loss_aux_layer_17": 0.167236328125, "loss_aux_layer_18": 0.176513671875, "loss_aux_layer_19": 0.17626953125, "loss_aux_layer_2": 0.0899658203125, "loss_aux_layer_20": 0.182373046875, "loss_aux_layer_21": 0.18896484375, "loss_aux_layer_22": 0.21142578125, "loss_aux_layer_23": 0.25439453125, "loss_aux_layer_3": 0.099609375, "loss_aux_layer_4": 0.100830078125, "loss_aux_layer_5": 0.1026611328125, "loss_aux_layer_6": 0.1048583984375, "loss_aux_layer_7": 0.1004638671875, "loss_aux_layer_8": 0.0982666015625, "loss_aux_layer_9": 0.0968017578125, "step": 797, "total_loss": 0.820068821310997 }, { "epoch": 0.15798851712532172, "grad_norm": 1.7270402908325195, "learning_rate": 5e-05, "llm_loss": 0.640234962105751, "loss": 3.0797, "loss_aux_layer_0": 0.026519775390625, "loss_aux_layer_1": 0.08203125, "loss_aux_layer_10": 0.098876953125, "loss_aux_layer_11": 0.10498046875, "loss_aux_layer_12": 0.1131591796875, "loss_aux_layer_13": 0.1219482421875, "loss_aux_layer_14": 0.134521484375, "loss_aux_layer_15": 0.14599609375, "loss_aux_layer_16": 0.1572265625, "loss_aux_layer_17": 0.1640625, "loss_aux_layer_18": 0.17333984375, "loss_aux_layer_19": 0.17333984375, "loss_aux_layer_2": 0.0888671875, "loss_aux_layer_20": 0.178955078125, "loss_aux_layer_21": 0.18408203125, "loss_aux_layer_22": 0.205810546875, "loss_aux_layer_23": 0.247802734375, "loss_aux_layer_3": 0.099853515625, "loss_aux_layer_4": 0.1021728515625, "loss_aux_layer_5": 0.1041259765625, "loss_aux_layer_6": 0.1064453125, "loss_aux_layer_7": 0.1014404296875, "loss_aux_layer_8": 0.0994873046875, "loss_aux_layer_9": 0.0980224609375, "step": 798, "total_loss": 0.7699287980794907 }, { "epoch": 0.15818649772322313, "grad_norm": 1.772067904472351, "learning_rate": 5e-05, "llm_loss": 0.6895172148942947, "loss": 3.2725, "loss_aux_layer_0": 0.026641845703125, "loss_aux_layer_1": 0.0794677734375, "loss_aux_layer_10": 0.0975341796875, "loss_aux_layer_11": 0.103515625, "loss_aux_layer_12": 0.11181640625, "loss_aux_layer_13": 0.1207275390625, "loss_aux_layer_14": 0.134033203125, "loss_aux_layer_15": 0.14599609375, "loss_aux_layer_16": 0.157958984375, "loss_aux_layer_17": 0.1650390625, "loss_aux_layer_18": 0.173583984375, "loss_aux_layer_19": 0.173583984375, "loss_aux_layer_2": 0.0872802734375, "loss_aux_layer_20": 0.178955078125, "loss_aux_layer_21": 0.18359375, "loss_aux_layer_22": 0.20361328125, "loss_aux_layer_23": 0.24462890625, "loss_aux_layer_3": 0.097412109375, "loss_aux_layer_4": 0.0999755859375, "loss_aux_layer_5": 0.1016845703125, "loss_aux_layer_6": 0.1043701171875, "loss_aux_layer_7": 0.0999755859375, "loss_aux_layer_8": 0.0982666015625, "loss_aux_layer_9": 0.096435546875, "step": 799, "total_loss": 0.8181251287460327 }, { "epoch": 0.15838447832112454, "grad_norm": 2.5321176052093506, "learning_rate": 5e-05, "llm_loss": 0.5813673436641693, "loss": 2.8333, "loss_aux_layer_0": 0.027557373046875, "loss_aux_layer_1": 0.0799560546875, "loss_aux_layer_10": 0.094970703125, "loss_aux_layer_11": 0.1009521484375, "loss_aux_layer_12": 0.109130859375, "loss_aux_layer_13": 0.1175537109375, "loss_aux_layer_14": 0.130126953125, "loss_aux_layer_15": 0.14208984375, "loss_aux_layer_16": 0.15380859375, "loss_aux_layer_17": 0.160400390625, "loss_aux_layer_18": 0.16943359375, "loss_aux_layer_19": 0.170166015625, "loss_aux_layer_2": 0.0880126953125, "loss_aux_layer_20": 0.17578125, "loss_aux_layer_21": 0.182373046875, "loss_aux_layer_22": 0.205322265625, "loss_aux_layer_23": 0.247802734375, "loss_aux_layer_3": 0.0989990234375, "loss_aux_layer_4": 0.1007080078125, "loss_aux_layer_5": 0.1019287109375, "loss_aux_layer_6": 0.10302734375, "loss_aux_layer_7": 0.0975341796875, "loss_aux_layer_8": 0.095703125, "loss_aux_layer_9": 0.0943603515625, "step": 800, "total_loss": 0.7083233296871185 }, { "epoch": 0.15858245891902595, "grad_norm": 1.246084451675415, "learning_rate": 5e-05, "llm_loss": 0.6777424216270447, "loss": 3.2254, "loss_aux_layer_0": 0.027618408203125, "loss_aux_layer_1": 0.0811767578125, "loss_aux_layer_10": 0.0968017578125, "loss_aux_layer_11": 0.102783203125, "loss_aux_layer_12": 0.111083984375, "loss_aux_layer_13": 0.119384765625, "loss_aux_layer_14": 0.1322021484375, "loss_aux_layer_15": 0.14453125, "loss_aux_layer_16": 0.156494140625, "loss_aux_layer_17": 0.1640625, "loss_aux_layer_18": 0.1728515625, "loss_aux_layer_19": 0.173828125, "loss_aux_layer_2": 0.087646484375, "loss_aux_layer_20": 0.179443359375, "loss_aux_layer_21": 0.18408203125, "loss_aux_layer_22": 0.204833984375, "loss_aux_layer_23": 0.248046875, "loss_aux_layer_3": 0.09814453125, "loss_aux_layer_4": 0.100341796875, "loss_aux_layer_5": 0.101806640625, "loss_aux_layer_6": 0.1048583984375, "loss_aux_layer_7": 0.1005859375, "loss_aux_layer_8": 0.0982666015625, "loss_aux_layer_9": 0.0960693359375, "step": 801, "total_loss": 0.8063609451055527 }, { "epoch": 0.15878043951692733, "grad_norm": 1.6444200277328491, "learning_rate": 5e-05, "llm_loss": 0.6383186280727386, "loss": 3.0743, "loss_aux_layer_0": 0.02667236328125, "loss_aux_layer_1": 0.082763671875, "loss_aux_layer_10": 0.100341796875, "loss_aux_layer_11": 0.106201171875, "loss_aux_layer_12": 0.1143798828125, "loss_aux_layer_13": 0.122314453125, "loss_aux_layer_14": 0.134765625, "loss_aux_layer_15": 0.14599609375, "loss_aux_layer_16": 0.156982421875, "loss_aux_layer_17": 0.163330078125, "loss_aux_layer_18": 0.172119140625, "loss_aux_layer_19": 0.171630859375, "loss_aux_layer_2": 0.0911865234375, "loss_aux_layer_20": 0.177001953125, "loss_aux_layer_21": 0.1826171875, "loss_aux_layer_22": 0.20361328125, "loss_aux_layer_23": 0.24609375, "loss_aux_layer_3": 0.102783203125, "loss_aux_layer_4": 0.1051025390625, "loss_aux_layer_5": 0.1064453125, "loss_aux_layer_6": 0.1090087890625, "loss_aux_layer_7": 0.1041259765625, "loss_aux_layer_8": 0.101806640625, "loss_aux_layer_9": 0.0994873046875, "step": 802, "total_loss": 0.7685805112123489 }, { "epoch": 0.15897842011482874, "grad_norm": 1.159741759300232, "learning_rate": 5e-05, "llm_loss": 0.6203760802745819, "loss": 2.9933, "loss_aux_layer_0": 0.03125, "loss_aux_layer_1": 0.0850830078125, "loss_aux_layer_10": 0.0955810546875, "loss_aux_layer_11": 0.1014404296875, "loss_aux_layer_12": 0.1103515625, "loss_aux_layer_13": 0.1190185546875, "loss_aux_layer_14": 0.132080078125, "loss_aux_layer_15": 0.14404296875, "loss_aux_layer_16": 0.15576171875, "loss_aux_layer_17": 0.162841796875, "loss_aux_layer_18": 0.171875, "loss_aux_layer_19": 0.1728515625, "loss_aux_layer_2": 0.0863037109375, "loss_aux_layer_20": 0.1787109375, "loss_aux_layer_21": 0.183837890625, "loss_aux_layer_22": 0.205078125, "loss_aux_layer_23": 0.249267578125, "loss_aux_layer_3": 0.0963134765625, "loss_aux_layer_4": 0.0982666015625, "loss_aux_layer_5": 0.10009765625, "loss_aux_layer_6": 0.1026611328125, "loss_aux_layer_7": 0.097900390625, "loss_aux_layer_8": 0.09619140625, "loss_aux_layer_9": 0.0947265625, "step": 803, "total_loss": 0.7483192384243011 }, { "epoch": 0.15917640071273015, "grad_norm": 1.9365005493164062, "learning_rate": 5e-05, "llm_loss": 0.6979319155216217, "loss": 3.3051, "loss_aux_layer_0": 0.0279541015625, "loss_aux_layer_1": 0.081298828125, "loss_aux_layer_10": 0.0965576171875, "loss_aux_layer_11": 0.102294921875, "loss_aux_layer_12": 0.110107421875, "loss_aux_layer_13": 0.1182861328125, "loss_aux_layer_14": 0.13037109375, "loss_aux_layer_15": 0.14208984375, "loss_aux_layer_16": 0.154296875, "loss_aux_layer_17": 0.161865234375, "loss_aux_layer_18": 0.17138671875, "loss_aux_layer_19": 0.1728515625, "loss_aux_layer_2": 0.0880126953125, "loss_aux_layer_20": 0.1796875, "loss_aux_layer_21": 0.184814453125, "loss_aux_layer_22": 0.20556640625, "loss_aux_layer_23": 0.247802734375, "loss_aux_layer_3": 0.0994873046875, "loss_aux_layer_4": 0.1015625, "loss_aux_layer_5": 0.1029052734375, "loss_aux_layer_6": 0.1055908203125, "loss_aux_layer_7": 0.099853515625, "loss_aux_layer_8": 0.0980224609375, "loss_aux_layer_9": 0.0958251953125, "step": 804, "total_loss": 0.8262742012739182 }, { "epoch": 0.15937438131063156, "grad_norm": 1.4023478031158447, "learning_rate": 5e-05, "llm_loss": 0.6540022492408752, "loss": 3.1311, "loss_aux_layer_0": 0.02593994140625, "loss_aux_layer_1": 0.0814208984375, "loss_aux_layer_10": 0.09814453125, "loss_aux_layer_11": 0.1041259765625, "loss_aux_layer_12": 0.1121826171875, "loss_aux_layer_13": 0.1202392578125, "loss_aux_layer_14": 0.1328125, "loss_aux_layer_15": 0.14453125, "loss_aux_layer_16": 0.15576171875, "loss_aux_layer_17": 0.163330078125, "loss_aux_layer_18": 0.171630859375, "loss_aux_layer_19": 0.172119140625, "loss_aux_layer_2": 0.0889892578125, "loss_aux_layer_20": 0.177978515625, "loss_aux_layer_21": 0.182861328125, "loss_aux_layer_22": 0.202880859375, "loss_aux_layer_23": 0.243896484375, "loss_aux_layer_3": 0.0999755859375, "loss_aux_layer_4": 0.102294921875, "loss_aux_layer_5": 0.103759765625, "loss_aux_layer_6": 0.1063232421875, "loss_aux_layer_7": 0.1015625, "loss_aux_layer_8": 0.099853515625, "loss_aux_layer_9": 0.09765625, "step": 805, "total_loss": 0.7827689349651337 }, { "epoch": 0.15957236190853297, "grad_norm": 1.2919694185256958, "learning_rate": 5e-05, "llm_loss": 0.6455284655094147, "loss": 3.0877, "loss_aux_layer_0": 0.028350830078125, "loss_aux_layer_1": 0.080322265625, "loss_aux_layer_10": 0.0946044921875, "loss_aux_layer_11": 0.1004638671875, "loss_aux_layer_12": 0.1087646484375, "loss_aux_layer_13": 0.11669921875, "loss_aux_layer_14": 0.1300048828125, "loss_aux_layer_15": 0.14208984375, "loss_aux_layer_16": 0.153564453125, "loss_aux_layer_17": 0.160888671875, "loss_aux_layer_18": 0.169189453125, "loss_aux_layer_19": 0.17041015625, "loss_aux_layer_2": 0.0867919921875, "loss_aux_layer_20": 0.17626953125, "loss_aux_layer_21": 0.1826171875, "loss_aux_layer_22": 0.203125, "loss_aux_layer_23": 0.245849609375, "loss_aux_layer_3": 0.0965576171875, "loss_aux_layer_4": 0.0986328125, "loss_aux_layer_5": 0.1002197265625, "loss_aux_layer_6": 0.1024169921875, "loss_aux_layer_7": 0.0972900390625, "loss_aux_layer_8": 0.0953369140625, "loss_aux_layer_9": 0.09375, "step": 806, "total_loss": 0.7719362527132034 }, { "epoch": 0.15977034250643438, "grad_norm": 1.3547102212905884, "learning_rate": 5e-05, "llm_loss": 0.6039589643478394, "loss": 2.9679, "loss_aux_layer_0": 0.029510498046875, "loss_aux_layer_1": 0.09033203125, "loss_aux_layer_10": 0.106201171875, "loss_aux_layer_11": 0.112548828125, "loss_aux_layer_12": 0.121337890625, "loss_aux_layer_13": 0.1300048828125, "loss_aux_layer_14": 0.14306640625, "loss_aux_layer_15": 0.154296875, "loss_aux_layer_16": 0.16552734375, "loss_aux_layer_17": 0.171875, "loss_aux_layer_18": 0.18115234375, "loss_aux_layer_19": 0.18115234375, "loss_aux_layer_2": 0.097900390625, "loss_aux_layer_20": 0.18701171875, "loss_aux_layer_21": 0.193359375, "loss_aux_layer_22": 0.217041015625, "loss_aux_layer_23": 0.259765625, "loss_aux_layer_3": 0.1087646484375, "loss_aux_layer_4": 0.1109619140625, "loss_aux_layer_5": 0.1126708984375, "loss_aux_layer_6": 0.1151123046875, "loss_aux_layer_7": 0.110107421875, "loss_aux_layer_8": 0.107666015625, "loss_aux_layer_9": 0.10546875, "step": 807, "total_loss": 0.7419867813587189 }, { "epoch": 0.15996832310433579, "grad_norm": 1.122497797012329, "learning_rate": 5e-05, "llm_loss": 0.5451409220695496, "loss": 2.6948, "loss_aux_layer_0": 0.027801513671875, "loss_aux_layer_1": 0.0787353515625, "loss_aux_layer_10": 0.0950927734375, "loss_aux_layer_11": 0.101318359375, "loss_aux_layer_12": 0.1099853515625, "loss_aux_layer_13": 0.118408203125, "loss_aux_layer_14": 0.132080078125, "loss_aux_layer_15": 0.144775390625, "loss_aux_layer_16": 0.156982421875, "loss_aux_layer_17": 0.163818359375, "loss_aux_layer_18": 0.173583984375, "loss_aux_layer_19": 0.17529296875, "loss_aux_layer_2": 0.08642578125, "loss_aux_layer_20": 0.18115234375, "loss_aux_layer_21": 0.18896484375, "loss_aux_layer_22": 0.211669921875, "loss_aux_layer_23": 0.25634765625, "loss_aux_layer_3": 0.0963134765625, "loss_aux_layer_4": 0.0977783203125, "loss_aux_layer_5": 0.0997314453125, "loss_aux_layer_6": 0.1025390625, "loss_aux_layer_7": 0.0972900390625, "loss_aux_layer_8": 0.095703125, "loss_aux_layer_9": 0.0941162109375, "step": 808, "total_loss": 0.6737028807401657 }, { "epoch": 0.16016630370223717, "grad_norm": 1.5051058530807495, "learning_rate": 5e-05, "llm_loss": 0.5924899727106094, "loss": 2.9025, "loss_aux_layer_0": 0.027740478515625, "loss_aux_layer_1": 0.0850830078125, "loss_aux_layer_10": 0.1019287109375, "loss_aux_layer_11": 0.108154296875, "loss_aux_layer_12": 0.1162109375, "loss_aux_layer_13": 0.124755859375, "loss_aux_layer_14": 0.137451171875, "loss_aux_layer_15": 0.149169921875, "loss_aux_layer_16": 0.160888671875, "loss_aux_layer_17": 0.16796875, "loss_aux_layer_18": 0.1767578125, "loss_aux_layer_19": 0.177001953125, "loss_aux_layer_2": 0.0927734375, "loss_aux_layer_20": 0.182373046875, "loss_aux_layer_21": 0.188232421875, "loss_aux_layer_22": 0.21044921875, "loss_aux_layer_23": 0.25341796875, "loss_aux_layer_3": 0.10302734375, "loss_aux_layer_4": 0.1055908203125, "loss_aux_layer_5": 0.10693359375, "loss_aux_layer_6": 0.1097412109375, "loss_aux_layer_7": 0.1051025390625, "loss_aux_layer_8": 0.1031494140625, "loss_aux_layer_9": 0.1014404296875, "step": 809, "total_loss": 0.7256197333335876 }, { "epoch": 0.16036428430013858, "grad_norm": 0.9984066486358643, "learning_rate": 5e-05, "llm_loss": 0.5867696702480316, "loss": 2.8674, "loss_aux_layer_0": 0.027191162109375, "loss_aux_layer_1": 0.08154296875, "loss_aux_layer_10": 0.0986328125, "loss_aux_layer_11": 0.104736328125, "loss_aux_layer_12": 0.113037109375, "loss_aux_layer_13": 0.1212158203125, "loss_aux_layer_14": 0.134765625, "loss_aux_layer_15": 0.14697265625, "loss_aux_layer_16": 0.158935546875, "loss_aux_layer_17": 0.165771484375, "loss_aux_layer_18": 0.17431640625, "loss_aux_layer_19": 0.175048828125, "loss_aux_layer_2": 0.08837890625, "loss_aux_layer_20": 0.1806640625, "loss_aux_layer_21": 0.18603515625, "loss_aux_layer_22": 0.2080078125, "loss_aux_layer_23": 0.25048828125, "loss_aux_layer_3": 0.0987548828125, "loss_aux_layer_4": 0.1011962890625, "loss_aux_layer_5": 0.1029052734375, "loss_aux_layer_6": 0.1055908203125, "loss_aux_layer_7": 0.1007080078125, "loss_aux_layer_8": 0.0994873046875, "loss_aux_layer_9": 0.097900390625, "step": 810, "total_loss": 0.716857835650444 }, { "epoch": 0.16056226489803999, "grad_norm": 1.6658834218978882, "learning_rate": 5e-05, "llm_loss": 0.6410427093505859, "loss": 3.0645, "loss_aux_layer_0": 0.027374267578125, "loss_aux_layer_1": 0.078369140625, "loss_aux_layer_10": 0.092529296875, "loss_aux_layer_11": 0.0982666015625, "loss_aux_layer_12": 0.106689453125, "loss_aux_layer_13": 0.1148681640625, "loss_aux_layer_14": 0.1280517578125, "loss_aux_layer_15": 0.140380859375, "loss_aux_layer_16": 0.15234375, "loss_aux_layer_17": 0.160400390625, "loss_aux_layer_18": 0.17041015625, "loss_aux_layer_19": 0.171142578125, "loss_aux_layer_2": 0.0850830078125, "loss_aux_layer_20": 0.177001953125, "loss_aux_layer_21": 0.1826171875, "loss_aux_layer_22": 0.203125, "loss_aux_layer_23": 0.2451171875, "loss_aux_layer_3": 0.09423828125, "loss_aux_layer_4": 0.095947265625, "loss_aux_layer_5": 0.096923828125, "loss_aux_layer_6": 0.099853515625, "loss_aux_layer_7": 0.0953369140625, "loss_aux_layer_8": 0.093994140625, "loss_aux_layer_9": 0.09228515625, "step": 811, "total_loss": 0.7661183625459671 }, { "epoch": 0.1607602454959414, "grad_norm": 2.62361216545105, "learning_rate": 5e-05, "llm_loss": 0.599756732583046, "loss": 2.9157, "loss_aux_layer_0": 0.02764892578125, "loss_aux_layer_1": 0.0792236328125, "loss_aux_layer_10": 0.09765625, "loss_aux_layer_11": 0.1031494140625, "loss_aux_layer_12": 0.1123046875, "loss_aux_layer_13": 0.12109375, "loss_aux_layer_14": 0.134521484375, "loss_aux_layer_15": 0.14697265625, "loss_aux_layer_16": 0.158203125, "loss_aux_layer_17": 0.16552734375, "loss_aux_layer_18": 0.173828125, "loss_aux_layer_19": 0.174072265625, "loss_aux_layer_2": 0.0875244140625, "loss_aux_layer_20": 0.17919921875, "loss_aux_layer_21": 0.185302734375, "loss_aux_layer_22": 0.2060546875, "loss_aux_layer_23": 0.249755859375, "loss_aux_layer_3": 0.0972900390625, "loss_aux_layer_4": 0.0997314453125, "loss_aux_layer_5": 0.1015625, "loss_aux_layer_6": 0.1044921875, "loss_aux_layer_7": 0.099365234375, "loss_aux_layer_8": 0.0977783203125, "loss_aux_layer_9": 0.0968017578125, "step": 812, "total_loss": 0.7289254069328308 }, { "epoch": 0.1609582260938428, "grad_norm": 2.057979106903076, "learning_rate": 5e-05, "llm_loss": 0.6861648559570312, "loss": 3.2526, "loss_aux_layer_0": 0.02862548828125, "loss_aux_layer_1": 0.077392578125, "loss_aux_layer_10": 0.09326171875, "loss_aux_layer_11": 0.099365234375, "loss_aux_layer_12": 0.108154296875, "loss_aux_layer_13": 0.1170654296875, "loss_aux_layer_14": 0.131103515625, "loss_aux_layer_15": 0.14453125, "loss_aux_layer_16": 0.15673828125, "loss_aux_layer_17": 0.16552734375, "loss_aux_layer_18": 0.1748046875, "loss_aux_layer_19": 0.175537109375, "loss_aux_layer_2": 0.083984375, "loss_aux_layer_20": 0.18115234375, "loss_aux_layer_21": 0.18603515625, "loss_aux_layer_22": 0.20654296875, "loss_aux_layer_23": 0.248046875, "loss_aux_layer_3": 0.0938720703125, "loss_aux_layer_4": 0.095947265625, "loss_aux_layer_5": 0.0975341796875, "loss_aux_layer_6": 0.100341796875, "loss_aux_layer_7": 0.095703125, "loss_aux_layer_8": 0.093994140625, "loss_aux_layer_9": 0.092529296875, "step": 813, "total_loss": 0.8131618350744247 }, { "epoch": 0.1611562066917442, "grad_norm": 1.0051857233047485, "learning_rate": 5e-05, "llm_loss": 0.624068409204483, "loss": 3.0107, "loss_aux_layer_0": 0.02777099609375, "loss_aux_layer_1": 0.0816650390625, "loss_aux_layer_10": 0.09716796875, "loss_aux_layer_11": 0.1033935546875, "loss_aux_layer_12": 0.11181640625, "loss_aux_layer_13": 0.1201171875, "loss_aux_layer_14": 0.1328125, "loss_aux_layer_15": 0.14404296875, "loss_aux_layer_16": 0.155517578125, "loss_aux_layer_17": 0.162109375, "loss_aux_layer_18": 0.1708984375, "loss_aux_layer_19": 0.171142578125, "loss_aux_layer_2": 0.0887451171875, "loss_aux_layer_20": 0.1767578125, "loss_aux_layer_21": 0.183349609375, "loss_aux_layer_22": 0.205322265625, "loss_aux_layer_23": 0.2490234375, "loss_aux_layer_3": 0.0994873046875, "loss_aux_layer_4": 0.1014404296875, "loss_aux_layer_5": 0.1029052734375, "loss_aux_layer_6": 0.10546875, "loss_aux_layer_7": 0.1007080078125, "loss_aux_layer_8": 0.0987548828125, "loss_aux_layer_9": 0.0965576171875, "step": 814, "total_loss": 0.7526771128177643 }, { "epoch": 0.16135418728964562, "grad_norm": 1.5474861860275269, "learning_rate": 5e-05, "llm_loss": 0.5877938866615295, "loss": 2.8779, "loss_aux_layer_0": 0.026947021484375, "loss_aux_layer_1": 0.080810546875, "loss_aux_layer_10": 0.099853515625, "loss_aux_layer_11": 0.106201171875, "loss_aux_layer_12": 0.11474609375, "loss_aux_layer_13": 0.123779296875, "loss_aux_layer_14": 0.13720703125, "loss_aux_layer_15": 0.149658203125, "loss_aux_layer_16": 0.16162109375, "loss_aux_layer_17": 0.168212890625, "loss_aux_layer_18": 0.177978515625, "loss_aux_layer_19": 0.177490234375, "loss_aux_layer_2": 0.0897216796875, "loss_aux_layer_20": 0.18310546875, "loss_aux_layer_21": 0.18798828125, "loss_aux_layer_22": 0.208251953125, "loss_aux_layer_23": 0.2509765625, "loss_aux_layer_3": 0.099853515625, "loss_aux_layer_4": 0.102294921875, "loss_aux_layer_5": 0.1038818359375, "loss_aux_layer_6": 0.1070556640625, "loss_aux_layer_7": 0.102294921875, "loss_aux_layer_8": 0.1005859375, "loss_aux_layer_9": 0.0989990234375, "step": 815, "total_loss": 0.7194751650094986 }, { "epoch": 0.16155216788754703, "grad_norm": 1.565775990486145, "learning_rate": 5e-05, "llm_loss": 0.6239980310201645, "loss": 3.0308, "loss_aux_layer_0": 0.0272216796875, "loss_aux_layer_1": 0.083251953125, "loss_aux_layer_10": 0.1029052734375, "loss_aux_layer_11": 0.1094970703125, "loss_aux_layer_12": 0.11767578125, "loss_aux_layer_13": 0.1263427734375, "loss_aux_layer_14": 0.138916015625, "loss_aux_layer_15": 0.150634765625, "loss_aux_layer_16": 0.16162109375, "loss_aux_layer_17": 0.168212890625, "loss_aux_layer_18": 0.1767578125, "loss_aux_layer_19": 0.1767578125, "loss_aux_layer_2": 0.093017578125, "loss_aux_layer_20": 0.181396484375, "loss_aux_layer_21": 0.1875, "loss_aux_layer_22": 0.210205078125, "loss_aux_layer_23": 0.25341796875, "loss_aux_layer_3": 0.104248046875, "loss_aux_layer_4": 0.1068115234375, "loss_aux_layer_5": 0.1085205078125, "loss_aux_layer_6": 0.1116943359375, "loss_aux_layer_7": 0.1064453125, "loss_aux_layer_8": 0.1043701171875, "loss_aux_layer_9": 0.1021728515625, "step": 816, "total_loss": 0.7576998621225357 }, { "epoch": 0.16175014848544841, "grad_norm": 1.2321912050247192, "learning_rate": 5e-05, "llm_loss": 0.602302610874176, "loss": 2.9371, "loss_aux_layer_0": 0.027496337890625, "loss_aux_layer_1": 0.0819091796875, "loss_aux_layer_10": 0.0994873046875, "loss_aux_layer_11": 0.10595703125, "loss_aux_layer_12": 0.114501953125, "loss_aux_layer_13": 0.1234130859375, "loss_aux_layer_14": 0.136474609375, "loss_aux_layer_15": 0.14892578125, "loss_aux_layer_16": 0.16162109375, "loss_aux_layer_17": 0.16943359375, "loss_aux_layer_18": 0.1787109375, "loss_aux_layer_19": 0.1796875, "loss_aux_layer_2": 0.089111328125, "loss_aux_layer_20": 0.1845703125, "loss_aux_layer_21": 0.1904296875, "loss_aux_layer_22": 0.21142578125, "loss_aux_layer_23": 0.254150390625, "loss_aux_layer_3": 0.099609375, "loss_aux_layer_4": 0.1019287109375, "loss_aux_layer_5": 0.103271484375, "loss_aux_layer_6": 0.1063232421875, "loss_aux_layer_7": 0.101318359375, "loss_aux_layer_8": 0.0997314453125, "loss_aux_layer_9": 0.0982666015625, "step": 817, "total_loss": 0.734271913766861 }, { "epoch": 0.16194812908334982, "grad_norm": 1.1989227533340454, "learning_rate": 5e-05, "llm_loss": 0.6468959748744965, "loss": 3.0778, "loss_aux_layer_0": 0.02569580078125, "loss_aux_layer_1": 0.0726318359375, "loss_aux_layer_10": 0.0902099609375, "loss_aux_layer_11": 0.095947265625, "loss_aux_layer_12": 0.1044921875, "loss_aux_layer_13": 0.11279296875, "loss_aux_layer_14": 0.125732421875, "loss_aux_layer_15": 0.13818359375, "loss_aux_layer_16": 0.15087890625, "loss_aux_layer_17": 0.1591796875, "loss_aux_layer_18": 0.1689453125, "loss_aux_layer_19": 0.170654296875, "loss_aux_layer_2": 0.0792236328125, "loss_aux_layer_20": 0.177001953125, "loss_aux_layer_21": 0.181640625, "loss_aux_layer_22": 0.202392578125, "loss_aux_layer_23": 0.245361328125, "loss_aux_layer_3": 0.0888671875, "loss_aux_layer_4": 0.091064453125, "loss_aux_layer_5": 0.0928955078125, "loss_aux_layer_6": 0.095947265625, "loss_aux_layer_7": 0.091796875, "loss_aux_layer_8": 0.090576171875, "loss_aux_layer_9": 0.0894775390625, "step": 818, "total_loss": 0.7694440633058548 }, { "epoch": 0.16214610968125123, "grad_norm": 1.769134283065796, "learning_rate": 5e-05, "llm_loss": 0.5769176930189133, "loss": 2.8356, "loss_aux_layer_0": 0.02801513671875, "loss_aux_layer_1": 0.0849609375, "loss_aux_layer_10": 0.09912109375, "loss_aux_layer_11": 0.1055908203125, "loss_aux_layer_12": 0.1138916015625, "loss_aux_layer_13": 0.1224365234375, "loss_aux_layer_14": 0.1357421875, "loss_aux_layer_15": 0.148193359375, "loss_aux_layer_16": 0.16015625, "loss_aux_layer_17": 0.1669921875, "loss_aux_layer_18": 0.176513671875, "loss_aux_layer_19": 0.177490234375, "loss_aux_layer_2": 0.0908203125, "loss_aux_layer_20": 0.18408203125, "loss_aux_layer_21": 0.18994140625, "loss_aux_layer_22": 0.211181640625, "loss_aux_layer_23": 0.25341796875, "loss_aux_layer_3": 0.101318359375, "loss_aux_layer_4": 0.1036376953125, "loss_aux_layer_5": 0.105224609375, "loss_aux_layer_6": 0.1077880859375, "loss_aux_layer_7": 0.1025390625, "loss_aux_layer_8": 0.1004638671875, "loss_aux_layer_9": 0.0985107421875, "step": 819, "total_loss": 0.7089058756828308 }, { "epoch": 0.16234409027915264, "grad_norm": 1.4907256364822388, "learning_rate": 5e-05, "llm_loss": 0.6051801294088364, "loss": 2.9455, "loss_aux_layer_0": 0.026519775390625, "loss_aux_layer_1": 0.0821533203125, "loss_aux_layer_10": 0.0982666015625, "loss_aux_layer_11": 0.104248046875, "loss_aux_layer_12": 0.1126708984375, "loss_aux_layer_13": 0.1214599609375, "loss_aux_layer_14": 0.13427734375, "loss_aux_layer_15": 0.146484375, "loss_aux_layer_16": 0.1591796875, "loss_aux_layer_17": 0.166015625, "loss_aux_layer_18": 0.175537109375, "loss_aux_layer_19": 0.17822265625, "loss_aux_layer_2": 0.089111328125, "loss_aux_layer_20": 0.18359375, "loss_aux_layer_21": 0.18994140625, "loss_aux_layer_22": 0.211181640625, "loss_aux_layer_23": 0.256103515625, "loss_aux_layer_3": 0.0999755859375, "loss_aux_layer_4": 0.10205078125, "loss_aux_layer_5": 0.103759765625, "loss_aux_layer_6": 0.1064453125, "loss_aux_layer_7": 0.1019287109375, "loss_aux_layer_8": 0.0999755859375, "loss_aux_layer_9": 0.097900390625, "step": 820, "total_loss": 0.7363795191049576 }, { "epoch": 0.16254207087705405, "grad_norm": 1.8042466640472412, "learning_rate": 5e-05, "llm_loss": 0.6924883872270584, "loss": 3.3033, "loss_aux_layer_0": 0.027618408203125, "loss_aux_layer_1": 0.085205078125, "loss_aux_layer_10": 0.10107421875, "loss_aux_layer_11": 0.1072998046875, "loss_aux_layer_12": 0.1162109375, "loss_aux_layer_13": 0.125, "loss_aux_layer_14": 0.138671875, "loss_aux_layer_15": 0.150634765625, "loss_aux_layer_16": 0.16259765625, "loss_aux_layer_17": 0.169677734375, "loss_aux_layer_18": 0.178466796875, "loss_aux_layer_19": 0.17919921875, "loss_aux_layer_2": 0.0916748046875, "loss_aux_layer_20": 0.1845703125, "loss_aux_layer_21": 0.190185546875, "loss_aux_layer_22": 0.210693359375, "loss_aux_layer_23": 0.25341796875, "loss_aux_layer_3": 0.1024169921875, "loss_aux_layer_4": 0.1046142578125, "loss_aux_layer_5": 0.105712890625, "loss_aux_layer_6": 0.1085205078125, "loss_aux_layer_7": 0.103759765625, "loss_aux_layer_8": 0.10205078125, "loss_aux_layer_9": 0.100341796875, "step": 821, "total_loss": 0.8258333504199982 }, { "epoch": 0.16274005147495546, "grad_norm": 1.6323645114898682, "learning_rate": 5e-05, "llm_loss": 0.6828128844499588, "loss": 3.2354, "loss_aux_layer_0": 0.026031494140625, "loss_aux_layer_1": 0.078125, "loss_aux_layer_10": 0.0947265625, "loss_aux_layer_11": 0.1007080078125, "loss_aux_layer_12": 0.1090087890625, "loss_aux_layer_13": 0.1175537109375, "loss_aux_layer_14": 0.1298828125, "loss_aux_layer_15": 0.1416015625, "loss_aux_layer_16": 0.153564453125, "loss_aux_layer_17": 0.1611328125, "loss_aux_layer_18": 0.17041015625, "loss_aux_layer_19": 0.171142578125, "loss_aux_layer_2": 0.0853271484375, "loss_aux_layer_20": 0.17626953125, "loss_aux_layer_21": 0.18212890625, "loss_aux_layer_22": 0.2021484375, "loss_aux_layer_23": 0.2451171875, "loss_aux_layer_3": 0.095703125, "loss_aux_layer_4": 0.0975341796875, "loss_aux_layer_5": 0.098876953125, "loss_aux_layer_6": 0.101318359375, "loss_aux_layer_7": 0.097412109375, "loss_aux_layer_8": 0.095458984375, "loss_aux_layer_9": 0.09375, "step": 822, "total_loss": 0.8088577836751938 }, { "epoch": 0.16293803207285687, "grad_norm": 2.030951738357544, "learning_rate": 5e-05, "llm_loss": 0.626424714922905, "loss": 3.0492, "loss_aux_layer_0": 0.026275634765625, "loss_aux_layer_1": 0.0848388671875, "loss_aux_layer_10": 0.1053466796875, "loss_aux_layer_11": 0.1114501953125, "loss_aux_layer_12": 0.120361328125, "loss_aux_layer_13": 0.1287841796875, "loss_aux_layer_14": 0.14208984375, "loss_aux_layer_15": 0.154052734375, "loss_aux_layer_16": 0.165283203125, "loss_aux_layer_17": 0.171630859375, "loss_aux_layer_18": 0.18017578125, "loss_aux_layer_19": 0.1796875, "loss_aux_layer_2": 0.093505859375, "loss_aux_layer_20": 0.1845703125, "loss_aux_layer_21": 0.18994140625, "loss_aux_layer_22": 0.211669921875, "loss_aux_layer_23": 0.25390625, "loss_aux_layer_3": 0.10595703125, "loss_aux_layer_4": 0.108642578125, "loss_aux_layer_5": 0.1102294921875, "loss_aux_layer_6": 0.1131591796875, "loss_aux_layer_7": 0.107666015625, "loss_aux_layer_8": 0.10595703125, "loss_aux_layer_9": 0.1043701171875, "step": 823, "total_loss": 0.7622934132814407 }, { "epoch": 0.16313601267075828, "grad_norm": 2.9427525997161865, "learning_rate": 5e-05, "llm_loss": 0.7644881159067154, "loss": 3.5696, "loss_aux_layer_0": 0.027191162109375, "loss_aux_layer_1": 0.0799560546875, "loss_aux_layer_10": 0.09619140625, "loss_aux_layer_11": 0.1025390625, "loss_aux_layer_12": 0.1112060546875, "loss_aux_layer_13": 0.11962890625, "loss_aux_layer_14": 0.1328125, "loss_aux_layer_15": 0.14453125, "loss_aux_layer_16": 0.15576171875, "loss_aux_layer_17": 0.162841796875, "loss_aux_layer_18": 0.171142578125, "loss_aux_layer_19": 0.171142578125, "loss_aux_layer_2": 0.0877685546875, "loss_aux_layer_20": 0.17626953125, "loss_aux_layer_21": 0.181640625, "loss_aux_layer_22": 0.203125, "loss_aux_layer_23": 0.246337890625, "loss_aux_layer_3": 0.0989990234375, "loss_aux_layer_4": 0.100830078125, "loss_aux_layer_5": 0.1024169921875, "loss_aux_layer_6": 0.1046142578125, "loss_aux_layer_7": 0.0994873046875, "loss_aux_layer_8": 0.097412109375, "loss_aux_layer_9": 0.0955810546875, "step": 824, "total_loss": 0.8924124091863632 }, { "epoch": 0.16333399326865966, "grad_norm": 1.8514162302017212, "learning_rate": 5e-05, "llm_loss": 0.6767818629741669, "loss": 3.2102, "loss_aux_layer_0": 0.02679443359375, "loss_aux_layer_1": 0.0770263671875, "loss_aux_layer_10": 0.094482421875, "loss_aux_layer_11": 0.10009765625, "loss_aux_layer_12": 0.1082763671875, "loss_aux_layer_13": 0.1160888671875, "loss_aux_layer_14": 0.128173828125, "loss_aux_layer_15": 0.1396484375, "loss_aux_layer_16": 0.151123046875, "loss_aux_layer_17": 0.158203125, "loss_aux_layer_18": 0.166748046875, "loss_aux_layer_19": 0.168212890625, "loss_aux_layer_2": 0.087158203125, "loss_aux_layer_20": 0.17431640625, "loss_aux_layer_21": 0.180419921875, "loss_aux_layer_22": 0.2021484375, "loss_aux_layer_23": 0.244873046875, "loss_aux_layer_3": 0.0989990234375, "loss_aux_layer_4": 0.100830078125, "loss_aux_layer_5": 0.1025390625, "loss_aux_layer_6": 0.104248046875, "loss_aux_layer_7": 0.0982666015625, "loss_aux_layer_8": 0.0960693359375, "loss_aux_layer_9": 0.093994140625, "step": 825, "total_loss": 0.802559420466423 }, { "epoch": 0.16353197386656107, "grad_norm": 2.191162347793579, "learning_rate": 5e-05, "llm_loss": 0.6591533571481705, "loss": 3.1487, "loss_aux_layer_0": 0.02764892578125, "loss_aux_layer_1": 0.0802001953125, "loss_aux_layer_10": 0.0965576171875, "loss_aux_layer_11": 0.102783203125, "loss_aux_layer_12": 0.1112060546875, "loss_aux_layer_13": 0.119140625, "loss_aux_layer_14": 0.132080078125, "loss_aux_layer_15": 0.144287109375, "loss_aux_layer_16": 0.15673828125, "loss_aux_layer_17": 0.163818359375, "loss_aux_layer_18": 0.17236328125, "loss_aux_layer_19": 0.172607421875, "loss_aux_layer_2": 0.087890625, "loss_aux_layer_20": 0.177978515625, "loss_aux_layer_21": 0.18212890625, "loss_aux_layer_22": 0.2021484375, "loss_aux_layer_23": 0.24365234375, "loss_aux_layer_3": 0.0989990234375, "loss_aux_layer_4": 0.1009521484375, "loss_aux_layer_5": 0.1026611328125, "loss_aux_layer_6": 0.1043701171875, "loss_aux_layer_7": 0.0989990234375, "loss_aux_layer_8": 0.0970458984375, "loss_aux_layer_9": 0.095458984375, "step": 826, "total_loss": 0.7871640920639038 }, { "epoch": 0.16372995446446248, "grad_norm": 2.7250263690948486, "learning_rate": 5e-05, "llm_loss": 0.5925242304801941, "loss": 2.9019, "loss_aux_layer_0": 0.0281982421875, "loss_aux_layer_1": 0.0841064453125, "loss_aux_layer_10": 0.1014404296875, "loss_aux_layer_11": 0.107421875, "loss_aux_layer_12": 0.115478515625, "loss_aux_layer_13": 0.12353515625, "loss_aux_layer_14": 0.136962890625, "loss_aux_layer_15": 0.14892578125, "loss_aux_layer_16": 0.159912109375, "loss_aux_layer_17": 0.16650390625, "loss_aux_layer_18": 0.176025390625, "loss_aux_layer_19": 0.1767578125, "loss_aux_layer_2": 0.09423828125, "loss_aux_layer_20": 0.18212890625, "loss_aux_layer_21": 0.187744140625, "loss_aux_layer_22": 0.209716796875, "loss_aux_layer_23": 0.253173828125, "loss_aux_layer_3": 0.1044921875, "loss_aux_layer_4": 0.106689453125, "loss_aux_layer_5": 0.1082763671875, "loss_aux_layer_6": 0.1103515625, "loss_aux_layer_7": 0.1053466796875, "loss_aux_layer_8": 0.102783203125, "loss_aux_layer_9": 0.1007080078125, "step": 827, "total_loss": 0.7254738956689835 }, { "epoch": 0.1639279350623639, "grad_norm": 1.5443401336669922, "learning_rate": 5e-05, "llm_loss": 0.6192120164632797, "loss": 3.0254, "loss_aux_layer_0": 0.02783203125, "loss_aux_layer_1": 0.0870361328125, "loss_aux_layer_10": 0.1041259765625, "loss_aux_layer_11": 0.11083984375, "loss_aux_layer_12": 0.1201171875, "loss_aux_layer_13": 0.129638671875, "loss_aux_layer_14": 0.143310546875, "loss_aux_layer_15": 0.15576171875, "loss_aux_layer_16": 0.16748046875, "loss_aux_layer_17": 0.174560546875, "loss_aux_layer_18": 0.183349609375, "loss_aux_layer_19": 0.18310546875, "loss_aux_layer_2": 0.0928955078125, "loss_aux_layer_20": 0.188720703125, "loss_aux_layer_21": 0.1953125, "loss_aux_layer_22": 0.21826171875, "loss_aux_layer_23": 0.26220703125, "loss_aux_layer_3": 0.1041259765625, "loss_aux_layer_4": 0.106689453125, "loss_aux_layer_5": 0.1085205078125, "loss_aux_layer_6": 0.1116943359375, "loss_aux_layer_7": 0.10693359375, "loss_aux_layer_8": 0.1048583984375, "loss_aux_layer_9": 0.10302734375, "step": 828, "total_loss": 0.7563498914241791 }, { "epoch": 0.1641259156602653, "grad_norm": 1.2045087814331055, "learning_rate": 5e-05, "llm_loss": 0.6722060292959213, "loss": 3.1963, "loss_aux_layer_0": 0.026580810546875, "loss_aux_layer_1": 0.07861328125, "loss_aux_layer_10": 0.0960693359375, "loss_aux_layer_11": 0.101806640625, "loss_aux_layer_12": 0.1102294921875, "loss_aux_layer_13": 0.118408203125, "loss_aux_layer_14": 0.130859375, "loss_aux_layer_15": 0.142333984375, "loss_aux_layer_16": 0.15380859375, "loss_aux_layer_17": 0.161376953125, "loss_aux_layer_18": 0.16943359375, "loss_aux_layer_19": 0.170654296875, "loss_aux_layer_2": 0.0855712890625, "loss_aux_layer_20": 0.176513671875, "loss_aux_layer_21": 0.182861328125, "loss_aux_layer_22": 0.204345703125, "loss_aux_layer_23": 0.24755859375, "loss_aux_layer_3": 0.09619140625, "loss_aux_layer_4": 0.0986328125, "loss_aux_layer_5": 0.0999755859375, "loss_aux_layer_6": 0.10302734375, "loss_aux_layer_7": 0.098388671875, "loss_aux_layer_8": 0.0966796875, "loss_aux_layer_9": 0.0950927734375, "step": 829, "total_loss": 0.7990836650133133 }, { "epoch": 0.1643238962581667, "grad_norm": 1.7107511758804321, "learning_rate": 5e-05, "llm_loss": 0.6466550976037979, "loss": 3.1005, "loss_aux_layer_0": 0.026275634765625, "loss_aux_layer_1": 0.0791015625, "loss_aux_layer_10": 0.0960693359375, "loss_aux_layer_11": 0.1021728515625, "loss_aux_layer_12": 0.1109619140625, "loss_aux_layer_13": 0.1195068359375, "loss_aux_layer_14": 0.132080078125, "loss_aux_layer_15": 0.14453125, "loss_aux_layer_16": 0.156005859375, "loss_aux_layer_17": 0.1630859375, "loss_aux_layer_18": 0.173583984375, "loss_aux_layer_19": 0.173828125, "loss_aux_layer_2": 0.08740234375, "loss_aux_layer_20": 0.180419921875, "loss_aux_layer_21": 0.185546875, "loss_aux_layer_22": 0.206298828125, "loss_aux_layer_23": 0.2470703125, "loss_aux_layer_3": 0.0980224609375, "loss_aux_layer_4": 0.1004638671875, "loss_aux_layer_5": 0.1021728515625, "loss_aux_layer_6": 0.104736328125, "loss_aux_layer_7": 0.0999755859375, "loss_aux_layer_8": 0.097900390625, "loss_aux_layer_9": 0.0958251953125, "step": 830, "total_loss": 0.7751196622848511 }, { "epoch": 0.16452187685606812, "grad_norm": 1.0958693027496338, "learning_rate": 5e-05, "llm_loss": 0.6034899950027466, "loss": 2.9353, "loss_aux_layer_0": 0.027008056640625, "loss_aux_layer_1": 0.0814208984375, "loss_aux_layer_10": 0.0970458984375, "loss_aux_layer_11": 0.1031494140625, "loss_aux_layer_12": 0.1121826171875, "loss_aux_layer_13": 0.12060546875, "loss_aux_layer_14": 0.134765625, "loss_aux_layer_15": 0.14697265625, "loss_aux_layer_16": 0.159423828125, "loss_aux_layer_17": 0.16650390625, "loss_aux_layer_18": 0.176025390625, "loss_aux_layer_19": 0.177001953125, "loss_aux_layer_2": 0.087890625, "loss_aux_layer_20": 0.182861328125, "loss_aux_layer_21": 0.18896484375, "loss_aux_layer_22": 0.2099609375, "loss_aux_layer_23": 0.25390625, "loss_aux_layer_3": 0.0989990234375, "loss_aux_layer_4": 0.1009521484375, "loss_aux_layer_5": 0.1024169921875, "loss_aux_layer_6": 0.105224609375, "loss_aux_layer_7": 0.1007080078125, "loss_aux_layer_8": 0.0980224609375, "loss_aux_layer_9": 0.0963134765625, "step": 831, "total_loss": 0.7338310480117798 }, { "epoch": 0.1647198574539695, "grad_norm": 2.010037899017334, "learning_rate": 5e-05, "llm_loss": 0.6045649945735931, "loss": 2.9291, "loss_aux_layer_0": 0.027130126953125, "loss_aux_layer_1": 0.075927734375, "loss_aux_layer_10": 0.0947265625, "loss_aux_layer_11": 0.1004638671875, "loss_aux_layer_12": 0.108642578125, "loss_aux_layer_13": 0.1173095703125, "loss_aux_layer_14": 0.13037109375, "loss_aux_layer_15": 0.1435546875, "loss_aux_layer_16": 0.156494140625, "loss_aux_layer_17": 0.164306640625, "loss_aux_layer_18": 0.174072265625, "loss_aux_layer_19": 0.17529296875, "loss_aux_layer_2": 0.0855712890625, "loss_aux_layer_20": 0.181396484375, "loss_aux_layer_21": 0.188232421875, "loss_aux_layer_22": 0.2099609375, "loss_aux_layer_23": 0.25244140625, "loss_aux_layer_3": 0.095703125, "loss_aux_layer_4": 0.0972900390625, "loss_aux_layer_5": 0.098876953125, "loss_aux_layer_6": 0.101318359375, "loss_aux_layer_7": 0.0970458984375, "loss_aux_layer_8": 0.095458984375, "loss_aux_layer_9": 0.0938720703125, "step": 832, "total_loss": 0.7322680205106735 }, { "epoch": 0.1649178380518709, "grad_norm": 0.7930591106414795, "learning_rate": 5e-05, "llm_loss": 0.5706991106271744, "loss": 2.8113, "loss_aux_layer_0": 0.027984619140625, "loss_aux_layer_1": 0.0838623046875, "loss_aux_layer_10": 0.1007080078125, "loss_aux_layer_11": 0.1065673828125, "loss_aux_layer_12": 0.1146240234375, "loss_aux_layer_13": 0.122314453125, "loss_aux_layer_14": 0.134765625, "loss_aux_layer_15": 0.147216796875, "loss_aux_layer_16": 0.158447265625, "loss_aux_layer_17": 0.165771484375, "loss_aux_layer_18": 0.17529296875, "loss_aux_layer_19": 0.1767578125, "loss_aux_layer_2": 0.09130859375, "loss_aux_layer_20": 0.1826171875, "loss_aux_layer_21": 0.18994140625, "loss_aux_layer_22": 0.21142578125, "loss_aux_layer_23": 0.25537109375, "loss_aux_layer_3": 0.102294921875, "loss_aux_layer_4": 0.1043701171875, "loss_aux_layer_5": 0.105712890625, "loss_aux_layer_6": 0.1083984375, "loss_aux_layer_7": 0.1033935546875, "loss_aux_layer_8": 0.1015625, "loss_aux_layer_9": 0.099853515625, "step": 833, "total_loss": 0.7028157711029053 }, { "epoch": 0.16511581864977232, "grad_norm": 1.9336010217666626, "learning_rate": 5e-05, "llm_loss": 0.5967276096343994, "loss": 2.8745, "loss_aux_layer_0": 0.026336669921875, "loss_aux_layer_1": 0.07177734375, "loss_aux_layer_10": 0.087890625, "loss_aux_layer_11": 0.0936279296875, "loss_aux_layer_12": 0.102294921875, "loss_aux_layer_13": 0.1109619140625, "loss_aux_layer_14": 0.123779296875, "loss_aux_layer_15": 0.13720703125, "loss_aux_layer_16": 0.149169921875, "loss_aux_layer_17": 0.158203125, "loss_aux_layer_18": 0.168212890625, "loss_aux_layer_19": 0.17041015625, "loss_aux_layer_2": 0.08056640625, "loss_aux_layer_20": 0.176513671875, "loss_aux_layer_21": 0.182861328125, "loss_aux_layer_22": 0.203369140625, "loss_aux_layer_23": 0.2470703125, "loss_aux_layer_3": 0.0894775390625, "loss_aux_layer_4": 0.0911865234375, "loss_aux_layer_5": 0.0928955078125, "loss_aux_layer_6": 0.0948486328125, "loss_aux_layer_7": 0.0908203125, "loss_aux_layer_8": 0.0888671875, "loss_aux_layer_9": 0.0872802734375, "step": 834, "total_loss": 0.7186267524957657 }, { "epoch": 0.16531379924767373, "grad_norm": 1.0794358253479004, "learning_rate": 5e-05, "llm_loss": 0.6371616274118423, "loss": 3.0466, "loss_aux_layer_0": 0.026947021484375, "loss_aux_layer_1": 0.074951171875, "loss_aux_layer_10": 0.0924072265625, "loss_aux_layer_11": 0.0985107421875, "loss_aux_layer_12": 0.1070556640625, "loss_aux_layer_13": 0.11572265625, "loss_aux_layer_14": 0.12890625, "loss_aux_layer_15": 0.141357421875, "loss_aux_layer_16": 0.152587890625, "loss_aux_layer_17": 0.160400390625, "loss_aux_layer_18": 0.16943359375, "loss_aux_layer_19": 0.170166015625, "loss_aux_layer_2": 0.08251953125, "loss_aux_layer_20": 0.176025390625, "loss_aux_layer_21": 0.181640625, "loss_aux_layer_22": 0.202880859375, "loss_aux_layer_23": 0.24560546875, "loss_aux_layer_3": 0.092529296875, "loss_aux_layer_4": 0.094482421875, "loss_aux_layer_5": 0.0963134765625, "loss_aux_layer_6": 0.0992431640625, "loss_aux_layer_7": 0.0946044921875, "loss_aux_layer_8": 0.0927734375, "loss_aux_layer_9": 0.0911865234375, "step": 835, "total_loss": 0.7616579234600067 }, { "epoch": 0.16551177984557514, "grad_norm": 1.692134976387024, "learning_rate": 5e-05, "llm_loss": 0.690062627196312, "loss": 3.2596, "loss_aux_layer_0": 0.026336669921875, "loss_aux_layer_1": 0.07666015625, "loss_aux_layer_10": 0.093994140625, "loss_aux_layer_11": 0.0994873046875, "loss_aux_layer_12": 0.1075439453125, "loss_aux_layer_13": 0.11572265625, "loss_aux_layer_14": 0.128173828125, "loss_aux_layer_15": 0.140380859375, "loss_aux_layer_16": 0.152099609375, "loss_aux_layer_17": 0.159423828125, "loss_aux_layer_18": 0.167724609375, "loss_aux_layer_19": 0.1689453125, "loss_aux_layer_2": 0.0850830078125, "loss_aux_layer_20": 0.17431640625, "loss_aux_layer_21": 0.18017578125, "loss_aux_layer_22": 0.198974609375, "loss_aux_layer_23": 0.240234375, "loss_aux_layer_3": 0.095703125, "loss_aux_layer_4": 0.0980224609375, "loss_aux_layer_5": 0.0992431640625, "loss_aux_layer_6": 0.1019287109375, "loss_aux_layer_7": 0.09716796875, "loss_aux_layer_8": 0.095458984375, "loss_aux_layer_9": 0.0931396484375, "step": 836, "total_loss": 0.8148973137140274 }, { "epoch": 0.16570976044347654, "grad_norm": 1.1715604066848755, "learning_rate": 5e-05, "llm_loss": 0.61700239777565, "loss": 2.9945, "loss_aux_layer_0": 0.026336669921875, "loss_aux_layer_1": 0.0809326171875, "loss_aux_layer_10": 0.09814453125, "loss_aux_layer_11": 0.10498046875, "loss_aux_layer_12": 0.113525390625, "loss_aux_layer_13": 0.1226806640625, "loss_aux_layer_14": 0.13671875, "loss_aux_layer_15": 0.14990234375, "loss_aux_layer_16": 0.16162109375, "loss_aux_layer_17": 0.1689453125, "loss_aux_layer_18": 0.177978515625, "loss_aux_layer_19": 0.179443359375, "loss_aux_layer_2": 0.0889892578125, "loss_aux_layer_20": 0.18359375, "loss_aux_layer_21": 0.189453125, "loss_aux_layer_22": 0.211181640625, "loss_aux_layer_23": 0.2548828125, "loss_aux_layer_3": 0.099853515625, "loss_aux_layer_4": 0.10205078125, "loss_aux_layer_5": 0.103515625, "loss_aux_layer_6": 0.1060791015625, "loss_aux_layer_7": 0.1011962890625, "loss_aux_layer_8": 0.0992431640625, "loss_aux_layer_9": 0.0972900390625, "step": 837, "total_loss": 0.7486288845539093 }, { "epoch": 0.16590774104137795, "grad_norm": 1.890760064125061, "learning_rate": 5e-05, "llm_loss": 0.6713170856237411, "loss": 3.1757, "loss_aux_layer_0": 0.027557373046875, "loss_aux_layer_1": 0.072998046875, "loss_aux_layer_10": 0.090087890625, "loss_aux_layer_11": 0.095947265625, "loss_aux_layer_12": 0.104248046875, "loss_aux_layer_13": 0.1131591796875, "loss_aux_layer_14": 0.126708984375, "loss_aux_layer_15": 0.138916015625, "loss_aux_layer_16": 0.150390625, "loss_aux_layer_17": 0.157958984375, "loss_aux_layer_18": 0.16650390625, "loss_aux_layer_19": 0.16796875, "loss_aux_layer_2": 0.0810546875, "loss_aux_layer_20": 0.174072265625, "loss_aux_layer_21": 0.180419921875, "loss_aux_layer_22": 0.201171875, "loss_aux_layer_23": 0.244873046875, "loss_aux_layer_3": 0.09130859375, "loss_aux_layer_4": 0.0931396484375, "loss_aux_layer_5": 0.0947265625, "loss_aux_layer_6": 0.0970458984375, "loss_aux_layer_7": 0.0924072265625, "loss_aux_layer_8": 0.0909423828125, "loss_aux_layer_9": 0.0892333984375, "step": 838, "total_loss": 0.7939340174198151 }, { "epoch": 0.16610572163927936, "grad_norm": 1.2844754457473755, "learning_rate": 5e-05, "llm_loss": 0.6930404603481293, "loss": 3.2929, "loss_aux_layer_0": 0.025726318359375, "loss_aux_layer_1": 0.0802001953125, "loss_aux_layer_10": 0.0985107421875, "loss_aux_layer_11": 0.104736328125, "loss_aux_layer_12": 0.11328125, "loss_aux_layer_13": 0.121826171875, "loss_aux_layer_14": 0.134521484375, "loss_aux_layer_15": 0.146240234375, "loss_aux_layer_16": 0.15771484375, "loss_aux_layer_17": 0.165771484375, "loss_aux_layer_18": 0.17529296875, "loss_aux_layer_19": 0.17578125, "loss_aux_layer_2": 0.0887451171875, "loss_aux_layer_20": 0.181396484375, "loss_aux_layer_21": 0.1865234375, "loss_aux_layer_22": 0.206787109375, "loss_aux_layer_23": 0.24951171875, "loss_aux_layer_3": 0.099609375, "loss_aux_layer_4": 0.10205078125, "loss_aux_layer_5": 0.1036376953125, "loss_aux_layer_6": 0.1064453125, "loss_aux_layer_7": 0.10205078125, "loss_aux_layer_8": 0.10009765625, "loss_aux_layer_9": 0.0977783203125, "step": 839, "total_loss": 0.823218435049057 }, { "epoch": 0.16630370223718074, "grad_norm": 1.815635323524475, "learning_rate": 5e-05, "llm_loss": 0.6636857986450195, "loss": 3.1666, "loss_aux_layer_0": 0.025604248046875, "loss_aux_layer_1": 0.078369140625, "loss_aux_layer_10": 0.09619140625, "loss_aux_layer_11": 0.102294921875, "loss_aux_layer_12": 0.1112060546875, "loss_aux_layer_13": 0.1197509765625, "loss_aux_layer_14": 0.1324462890625, "loss_aux_layer_15": 0.14453125, "loss_aux_layer_16": 0.156005859375, "loss_aux_layer_17": 0.163818359375, "loss_aux_layer_18": 0.17236328125, "loss_aux_layer_19": 0.173583984375, "loss_aux_layer_2": 0.086669921875, "loss_aux_layer_20": 0.179443359375, "loss_aux_layer_21": 0.185302734375, "loss_aux_layer_22": 0.206298828125, "loss_aux_layer_23": 0.2470703125, "loss_aux_layer_3": 0.096923828125, "loss_aux_layer_4": 0.0994873046875, "loss_aux_layer_5": 0.10107421875, "loss_aux_layer_6": 0.1029052734375, "loss_aux_layer_7": 0.0985107421875, "loss_aux_layer_8": 0.096923828125, "loss_aux_layer_9": 0.0950927734375, "step": 840, "total_loss": 0.7916460037231445 }, { "epoch": 0.16650168283508215, "grad_norm": 1.3808858394622803, "learning_rate": 5e-05, "llm_loss": 0.6164125502109528, "loss": 2.9694, "loss_aux_layer_0": 0.0267333984375, "loss_aux_layer_1": 0.076171875, "loss_aux_layer_10": 0.09326171875, "loss_aux_layer_11": 0.0992431640625, "loss_aux_layer_12": 0.107666015625, "loss_aux_layer_13": 0.116455078125, "loss_aux_layer_14": 0.129150390625, "loss_aux_layer_15": 0.141845703125, "loss_aux_layer_16": 0.15380859375, "loss_aux_layer_17": 0.16259765625, "loss_aux_layer_18": 0.17236328125, "loss_aux_layer_19": 0.173095703125, "loss_aux_layer_2": 0.083251953125, "loss_aux_layer_20": 0.1796875, "loss_aux_layer_21": 0.18408203125, "loss_aux_layer_22": 0.204833984375, "loss_aux_layer_23": 0.247802734375, "loss_aux_layer_3": 0.09375, "loss_aux_layer_4": 0.0963134765625, "loss_aux_layer_5": 0.097900390625, "loss_aux_layer_6": 0.1007080078125, "loss_aux_layer_7": 0.095947265625, "loss_aux_layer_8": 0.09375, "loss_aux_layer_9": 0.092041015625, "step": 841, "total_loss": 0.7423417121171951 }, { "epoch": 0.16669966343298356, "grad_norm": 1.4691468477249146, "learning_rate": 5e-05, "llm_loss": 0.582385465502739, "loss": 2.8615, "loss_aux_layer_0": 0.031158447265625, "loss_aux_layer_1": 0.086181640625, "loss_aux_layer_10": 0.1021728515625, "loss_aux_layer_11": 0.1082763671875, "loss_aux_layer_12": 0.116455078125, "loss_aux_layer_13": 0.1241455078125, "loss_aux_layer_14": 0.136474609375, "loss_aux_layer_15": 0.148681640625, "loss_aux_layer_16": 0.159912109375, "loss_aux_layer_17": 0.166259765625, "loss_aux_layer_18": 0.1748046875, "loss_aux_layer_19": 0.175048828125, "loss_aux_layer_2": 0.0924072265625, "loss_aux_layer_20": 0.180908203125, "loss_aux_layer_21": 0.187744140625, "loss_aux_layer_22": 0.208740234375, "loss_aux_layer_23": 0.251220703125, "loss_aux_layer_3": 0.1044921875, "loss_aux_layer_4": 0.1068115234375, "loss_aux_layer_5": 0.1085205078125, "loss_aux_layer_6": 0.110595703125, "loss_aux_layer_7": 0.105224609375, "loss_aux_layer_8": 0.103271484375, "loss_aux_layer_9": 0.101806640625, "step": 842, "total_loss": 0.7153742611408234 }, { "epoch": 0.16689764403088497, "grad_norm": 1.299237847328186, "learning_rate": 5e-05, "llm_loss": 0.5573515594005585, "loss": 2.7405, "loss_aux_layer_0": 0.026336669921875, "loss_aux_layer_1": 0.0765380859375, "loss_aux_layer_10": 0.094482421875, "loss_aux_layer_11": 0.1002197265625, "loss_aux_layer_12": 0.10888671875, "loss_aux_layer_13": 0.117919921875, "loss_aux_layer_14": 0.1307373046875, "loss_aux_layer_15": 0.143310546875, "loss_aux_layer_16": 0.1552734375, "loss_aux_layer_17": 0.163330078125, "loss_aux_layer_18": 0.17333984375, "loss_aux_layer_19": 0.175048828125, "loss_aux_layer_2": 0.08447265625, "loss_aux_layer_20": 0.18115234375, "loss_aux_layer_21": 0.187744140625, "loss_aux_layer_22": 0.210205078125, "loss_aux_layer_23": 0.254638671875, "loss_aux_layer_3": 0.09521484375, "loss_aux_layer_4": 0.0977783203125, "loss_aux_layer_5": 0.0992431640625, "loss_aux_layer_6": 0.101806640625, "loss_aux_layer_7": 0.097412109375, "loss_aux_layer_8": 0.095703125, "loss_aux_layer_9": 0.0938720703125, "step": 843, "total_loss": 0.6851191222667694 }, { "epoch": 0.16709562462878638, "grad_norm": 1.6478594541549683, "learning_rate": 5e-05, "llm_loss": 0.62703737616539, "loss": 3.0284, "loss_aux_layer_0": 0.02618408203125, "loss_aux_layer_1": 0.0810546875, "loss_aux_layer_10": 0.0992431640625, "loss_aux_layer_11": 0.10546875, "loss_aux_layer_12": 0.1141357421875, "loss_aux_layer_13": 0.12255859375, "loss_aux_layer_14": 0.1346435546875, "loss_aux_layer_15": 0.146240234375, "loss_aux_layer_16": 0.15771484375, "loss_aux_layer_17": 0.16455078125, "loss_aux_layer_18": 0.1728515625, "loss_aux_layer_19": 0.173095703125, "loss_aux_layer_2": 0.0894775390625, "loss_aux_layer_20": 0.1787109375, "loss_aux_layer_21": 0.184814453125, "loss_aux_layer_22": 0.20654296875, "loss_aux_layer_23": 0.2490234375, "loss_aux_layer_3": 0.100341796875, "loss_aux_layer_4": 0.1029052734375, "loss_aux_layer_5": 0.1041259765625, "loss_aux_layer_6": 0.10693359375, "loss_aux_layer_7": 0.102294921875, "loss_aux_layer_8": 0.1004638671875, "loss_aux_layer_9": 0.0985107421875, "step": 844, "total_loss": 0.7571083754301071 }, { "epoch": 0.1672936052266878, "grad_norm": 0.885923445224762, "learning_rate": 5e-05, "llm_loss": 0.6010705754160881, "loss": 2.9263, "loss_aux_layer_0": 0.0283203125, "loss_aux_layer_1": 0.08203125, "loss_aux_layer_10": 0.099609375, "loss_aux_layer_11": 0.10595703125, "loss_aux_layer_12": 0.1142578125, "loss_aux_layer_13": 0.123046875, "loss_aux_layer_14": 0.1356201171875, "loss_aux_layer_15": 0.147705078125, "loss_aux_layer_16": 0.15869140625, "loss_aux_layer_17": 0.165283203125, "loss_aux_layer_18": 0.174072265625, "loss_aux_layer_19": 0.17431640625, "loss_aux_layer_2": 0.08837890625, "loss_aux_layer_20": 0.18017578125, "loss_aux_layer_21": 0.185302734375, "loss_aux_layer_22": 0.20654296875, "loss_aux_layer_23": 0.2490234375, "loss_aux_layer_3": 0.0999755859375, "loss_aux_layer_4": 0.102294921875, "loss_aux_layer_5": 0.10400390625, "loss_aux_layer_6": 0.1065673828125, "loss_aux_layer_7": 0.1019287109375, "loss_aux_layer_8": 0.100341796875, "loss_aux_layer_9": 0.0985107421875, "step": 845, "total_loss": 0.7315629720687866 }, { "epoch": 0.1674915858245892, "grad_norm": 1.6814626455307007, "learning_rate": 5e-05, "llm_loss": 0.6033976972103119, "loss": 2.9195, "loss_aux_layer_0": 0.02825927734375, "loss_aux_layer_1": 0.0784912109375, "loss_aux_layer_10": 0.0947265625, "loss_aux_layer_11": 0.1007080078125, "loss_aux_layer_12": 0.1092529296875, "loss_aux_layer_13": 0.1177978515625, "loss_aux_layer_14": 0.1304931640625, "loss_aux_layer_15": 0.142822265625, "loss_aux_layer_16": 0.15478515625, "loss_aux_layer_17": 0.162109375, "loss_aux_layer_18": 0.1708984375, "loss_aux_layer_19": 0.1708984375, "loss_aux_layer_2": 0.0863037109375, "loss_aux_layer_20": 0.17578125, "loss_aux_layer_21": 0.181396484375, "loss_aux_layer_22": 0.2021484375, "loss_aux_layer_23": 0.244384765625, "loss_aux_layer_3": 0.09716796875, "loss_aux_layer_4": 0.0985107421875, "loss_aux_layer_5": 0.0994873046875, "loss_aux_layer_6": 0.1021728515625, "loss_aux_layer_7": 0.0977783203125, "loss_aux_layer_8": 0.095703125, "loss_aux_layer_9": 0.09375, "step": 846, "total_loss": 0.7298752665519714 }, { "epoch": 0.16768956642249058, "grad_norm": 1.5943782329559326, "learning_rate": 5e-05, "llm_loss": 0.5949656963348389, "loss": 2.9072, "loss_aux_layer_0": 0.028594970703125, "loss_aux_layer_1": 0.0831298828125, "loss_aux_layer_10": 0.100341796875, "loss_aux_layer_11": 0.1063232421875, "loss_aux_layer_12": 0.114990234375, "loss_aux_layer_13": 0.123779296875, "loss_aux_layer_14": 0.136962890625, "loss_aux_layer_15": 0.149658203125, "loss_aux_layer_16": 0.161376953125, "loss_aux_layer_17": 0.168212890625, "loss_aux_layer_18": 0.17724609375, "loss_aux_layer_19": 0.176513671875, "loss_aux_layer_2": 0.0899658203125, "loss_aux_layer_20": 0.181640625, "loss_aux_layer_21": 0.1865234375, "loss_aux_layer_22": 0.20849609375, "loss_aux_layer_23": 0.24951171875, "loss_aux_layer_3": 0.101318359375, "loss_aux_layer_4": 0.1036376953125, "loss_aux_layer_5": 0.1051025390625, "loss_aux_layer_6": 0.1077880859375, "loss_aux_layer_7": 0.10302734375, "loss_aux_layer_8": 0.1009521484375, "loss_aux_layer_9": 0.09912109375, "step": 847, "total_loss": 0.7267884165048599 }, { "epoch": 0.167887547020392, "grad_norm": 1.2556135654449463, "learning_rate": 5e-05, "llm_loss": 0.6831391155719757, "loss": 3.2388, "loss_aux_layer_0": 0.02764892578125, "loss_aux_layer_1": 0.0784912109375, "loss_aux_layer_10": 0.09521484375, "loss_aux_layer_11": 0.1009521484375, "loss_aux_layer_12": 0.109375, "loss_aux_layer_13": 0.11767578125, "loss_aux_layer_14": 0.13037109375, "loss_aux_layer_15": 0.14208984375, "loss_aux_layer_16": 0.1533203125, "loss_aux_layer_17": 0.16162109375, "loss_aux_layer_18": 0.17041015625, "loss_aux_layer_19": 0.171142578125, "loss_aux_layer_2": 0.0860595703125, "loss_aux_layer_20": 0.1767578125, "loss_aux_layer_21": 0.1826171875, "loss_aux_layer_22": 0.202880859375, "loss_aux_layer_23": 0.24462890625, "loss_aux_layer_3": 0.0965576171875, "loss_aux_layer_4": 0.0987548828125, "loss_aux_layer_5": 0.0999755859375, "loss_aux_layer_6": 0.1026611328125, "loss_aux_layer_7": 0.09814453125, "loss_aux_layer_8": 0.095947265625, "loss_aux_layer_9": 0.0943603515625, "step": 848, "total_loss": 0.8097031712532043 }, { "epoch": 0.1680855276182934, "grad_norm": 1.4799704551696777, "learning_rate": 5e-05, "llm_loss": 0.6117185801267624, "loss": 2.9477, "loss_aux_layer_0": 0.026214599609375, "loss_aux_layer_1": 0.07666015625, "loss_aux_layer_10": 0.093505859375, "loss_aux_layer_11": 0.0992431640625, "loss_aux_layer_12": 0.1077880859375, "loss_aux_layer_13": 0.1163330078125, "loss_aux_layer_14": 0.129638671875, "loss_aux_layer_15": 0.141357421875, "loss_aux_layer_16": 0.15283203125, "loss_aux_layer_17": 0.159912109375, "loss_aux_layer_18": 0.169677734375, "loss_aux_layer_19": 0.170654296875, "loss_aux_layer_2": 0.083740234375, "loss_aux_layer_20": 0.176025390625, "loss_aux_layer_21": 0.181884765625, "loss_aux_layer_22": 0.203369140625, "loss_aux_layer_23": 0.24658203125, "loss_aux_layer_3": 0.093505859375, "loss_aux_layer_4": 0.0955810546875, "loss_aux_layer_5": 0.09716796875, "loss_aux_layer_6": 0.1005859375, "loss_aux_layer_7": 0.09619140625, "loss_aux_layer_8": 0.0941162109375, "loss_aux_layer_9": 0.092529296875, "step": 849, "total_loss": 0.7369246035814285 }, { "epoch": 0.1682835082161948, "grad_norm": 1.6604924201965332, "learning_rate": 5e-05, "llm_loss": 0.6839440315961838, "loss": 3.2435, "loss_aux_layer_0": 0.02587890625, "loss_aux_layer_1": 0.078125, "loss_aux_layer_10": 0.0965576171875, "loss_aux_layer_11": 0.1025390625, "loss_aux_layer_12": 0.11083984375, "loss_aux_layer_13": 0.119140625, "loss_aux_layer_14": 0.13134765625, "loss_aux_layer_15": 0.142822265625, "loss_aux_layer_16": 0.154296875, "loss_aux_layer_17": 0.161865234375, "loss_aux_layer_18": 0.170166015625, "loss_aux_layer_19": 0.171142578125, "loss_aux_layer_2": 0.08544921875, "loss_aux_layer_20": 0.176513671875, "loss_aux_layer_21": 0.180908203125, "loss_aux_layer_22": 0.201416015625, "loss_aux_layer_23": 0.242431640625, "loss_aux_layer_3": 0.0972900390625, "loss_aux_layer_4": 0.0999755859375, "loss_aux_layer_5": 0.1015625, "loss_aux_layer_6": 0.1043701171875, "loss_aux_layer_7": 0.0994873046875, "loss_aux_layer_8": 0.09716796875, "loss_aux_layer_9": 0.09521484375, "step": 850, "total_loss": 0.8108645677566528 }, { "epoch": 0.16848148881409622, "grad_norm": 1.499876856803894, "learning_rate": 5e-05, "llm_loss": 0.5775725692510605, "loss": 2.8326, "loss_aux_layer_0": 0.0252685546875, "loss_aux_layer_1": 0.081298828125, "loss_aux_layer_10": 0.0985107421875, "loss_aux_layer_11": 0.10498046875, "loss_aux_layer_12": 0.1129150390625, "loss_aux_layer_13": 0.1209716796875, "loss_aux_layer_14": 0.13330078125, "loss_aux_layer_15": 0.145263671875, "loss_aux_layer_16": 0.156005859375, "loss_aux_layer_17": 0.1630859375, "loss_aux_layer_18": 0.173828125, "loss_aux_layer_19": 0.1748046875, "loss_aux_layer_2": 0.0899658203125, "loss_aux_layer_20": 0.1806640625, "loss_aux_layer_21": 0.1875, "loss_aux_layer_22": 0.210205078125, "loss_aux_layer_23": 0.25341796875, "loss_aux_layer_3": 0.1025390625, "loss_aux_layer_4": 0.1048583984375, "loss_aux_layer_5": 0.106201171875, "loss_aux_layer_6": 0.1082763671875, "loss_aux_layer_7": 0.1026611328125, "loss_aux_layer_8": 0.10009765625, "loss_aux_layer_9": 0.0977783203125, "step": 851, "total_loss": 0.7081508040428162 }, { "epoch": 0.16867946941199763, "grad_norm": 1.2687554359436035, "learning_rate": 5e-05, "llm_loss": 0.6422356814146042, "loss": 3.0988, "loss_aux_layer_0": 0.030609130859375, "loss_aux_layer_1": 0.088134765625, "loss_aux_layer_10": 0.1021728515625, "loss_aux_layer_11": 0.1083984375, "loss_aux_layer_12": 0.11669921875, "loss_aux_layer_13": 0.12451171875, "loss_aux_layer_14": 0.137451171875, "loss_aux_layer_15": 0.148193359375, "loss_aux_layer_16": 0.15966796875, "loss_aux_layer_17": 0.16552734375, "loss_aux_layer_18": 0.173583984375, "loss_aux_layer_19": 0.17236328125, "loss_aux_layer_2": 0.0931396484375, "loss_aux_layer_20": 0.177978515625, "loss_aux_layer_21": 0.18310546875, "loss_aux_layer_22": 0.2060546875, "loss_aux_layer_23": 0.24755859375, "loss_aux_layer_3": 0.104736328125, "loss_aux_layer_4": 0.1072998046875, "loss_aux_layer_5": 0.1083984375, "loss_aux_layer_6": 0.111572265625, "loss_aux_layer_7": 0.1063232421875, "loss_aux_layer_8": 0.103759765625, "loss_aux_layer_9": 0.101318359375, "step": 852, "total_loss": 0.7746923267841339 }, { "epoch": 0.16887745000989904, "grad_norm": 1.1449720859527588, "learning_rate": 5e-05, "llm_loss": 0.5603909641504288, "loss": 2.7506, "loss_aux_layer_0": 0.03094482421875, "loss_aux_layer_1": 0.080322265625, "loss_aux_layer_10": 0.0950927734375, "loss_aux_layer_11": 0.1009521484375, "loss_aux_layer_12": 0.1092529296875, "loss_aux_layer_13": 0.1173095703125, "loss_aux_layer_14": 0.1302490234375, "loss_aux_layer_15": 0.142822265625, "loss_aux_layer_16": 0.15478515625, "loss_aux_layer_17": 0.161376953125, "loss_aux_layer_18": 0.169677734375, "loss_aux_layer_19": 0.1708984375, "loss_aux_layer_2": 0.086181640625, "loss_aux_layer_20": 0.177490234375, "loss_aux_layer_21": 0.1845703125, "loss_aux_layer_22": 0.207275390625, "loss_aux_layer_23": 0.251708984375, "loss_aux_layer_3": 0.09716796875, "loss_aux_layer_4": 0.0985107421875, "loss_aux_layer_5": 0.0999755859375, "loss_aux_layer_6": 0.101806640625, "loss_aux_layer_7": 0.0970458984375, "loss_aux_layer_8": 0.095458984375, "loss_aux_layer_9": 0.09375, "step": 853, "total_loss": 0.687651202082634 }, { "epoch": 0.16907543060780045, "grad_norm": 1.4769641160964966, "learning_rate": 5e-05, "llm_loss": 0.5708518475294113, "loss": 2.7801, "loss_aux_layer_0": 0.02813720703125, "loss_aux_layer_1": 0.076171875, "loss_aux_layer_10": 0.092041015625, "loss_aux_layer_11": 0.09765625, "loss_aux_layer_12": 0.1063232421875, "loss_aux_layer_13": 0.114501953125, "loss_aux_layer_14": 0.127197265625, "loss_aux_layer_15": 0.139404296875, "loss_aux_layer_16": 0.151123046875, "loss_aux_layer_17": 0.1591796875, "loss_aux_layer_18": 0.16748046875, "loss_aux_layer_19": 0.16943359375, "loss_aux_layer_2": 0.0830078125, "loss_aux_layer_20": 0.175048828125, "loss_aux_layer_21": 0.181640625, "loss_aux_layer_22": 0.20263671875, "loss_aux_layer_23": 0.245361328125, "loss_aux_layer_3": 0.0938720703125, "loss_aux_layer_4": 0.0955810546875, "loss_aux_layer_5": 0.0970458984375, "loss_aux_layer_6": 0.0992431640625, "loss_aux_layer_7": 0.094482421875, "loss_aux_layer_8": 0.092529296875, "loss_aux_layer_9": 0.0909423828125, "step": 854, "total_loss": 0.6950339078903198 }, { "epoch": 0.16927341120570183, "grad_norm": 1.7691975831985474, "learning_rate": 5e-05, "llm_loss": 0.6256273686885834, "loss": 3.0105, "loss_aux_layer_0": 0.02764892578125, "loss_aux_layer_1": 0.0789794921875, "loss_aux_layer_10": 0.095458984375, "loss_aux_layer_11": 0.1011962890625, "loss_aux_layer_12": 0.109375, "loss_aux_layer_13": 0.11767578125, "loss_aux_layer_14": 0.130859375, "loss_aux_layer_15": 0.142822265625, "loss_aux_layer_16": 0.154296875, "loss_aux_layer_17": 0.161376953125, "loss_aux_layer_18": 0.170166015625, "loss_aux_layer_19": 0.171630859375, "loss_aux_layer_2": 0.0860595703125, "loss_aux_layer_20": 0.177490234375, "loss_aux_layer_21": 0.18310546875, "loss_aux_layer_22": 0.204833984375, "loss_aux_layer_23": 0.247314453125, "loss_aux_layer_3": 0.0966796875, "loss_aux_layer_4": 0.098876953125, "loss_aux_layer_5": 0.1007080078125, "loss_aux_layer_6": 0.1024169921875, "loss_aux_layer_7": 0.0977783203125, "loss_aux_layer_8": 0.0960693359375, "loss_aux_layer_9": 0.0946044921875, "step": 855, "total_loss": 0.7526131123304367 }, { "epoch": 0.16947139180360324, "grad_norm": 1.353330135345459, "learning_rate": 5e-05, "llm_loss": 0.6281715482473373, "loss": 3.0112, "loss_aux_layer_0": 0.026947021484375, "loss_aux_layer_1": 0.0758056640625, "loss_aux_layer_10": 0.0926513671875, "loss_aux_layer_11": 0.0986328125, "loss_aux_layer_12": 0.1070556640625, "loss_aux_layer_13": 0.1156005859375, "loss_aux_layer_14": 0.1287841796875, "loss_aux_layer_15": 0.14111328125, "loss_aux_layer_16": 0.153076171875, "loss_aux_layer_17": 0.16015625, "loss_aux_layer_18": 0.169921875, "loss_aux_layer_19": 0.171142578125, "loss_aux_layer_2": 0.081787109375, "loss_aux_layer_20": 0.177001953125, "loss_aux_layer_21": 0.182373046875, "loss_aux_layer_22": 0.2021484375, "loss_aux_layer_23": 0.244873046875, "loss_aux_layer_3": 0.092529296875, "loss_aux_layer_4": 0.094482421875, "loss_aux_layer_5": 0.0963134765625, "loss_aux_layer_6": 0.09912109375, "loss_aux_layer_7": 0.0947265625, "loss_aux_layer_8": 0.093017578125, "loss_aux_layer_9": 0.091552734375, "step": 856, "total_loss": 0.7527915239334106 }, { "epoch": 0.16966937240150465, "grad_norm": 3.622838258743286, "learning_rate": 5e-05, "llm_loss": 0.5661724358797073, "loss": 2.7921, "loss_aux_layer_0": 0.02777099609375, "loss_aux_layer_1": 0.082763671875, "loss_aux_layer_10": 0.1004638671875, "loss_aux_layer_11": 0.107177734375, "loss_aux_layer_12": 0.1151123046875, "loss_aux_layer_13": 0.123291015625, "loss_aux_layer_14": 0.1361083984375, "loss_aux_layer_15": 0.14794921875, "loss_aux_layer_16": 0.158935546875, "loss_aux_layer_17": 0.164306640625, "loss_aux_layer_18": 0.1728515625, "loss_aux_layer_19": 0.173583984375, "loss_aux_layer_2": 0.0927734375, "loss_aux_layer_20": 0.179443359375, "loss_aux_layer_21": 0.18701171875, "loss_aux_layer_22": 0.20947265625, "loss_aux_layer_23": 0.25341796875, "loss_aux_layer_3": 0.10400390625, "loss_aux_layer_4": 0.1060791015625, "loss_aux_layer_5": 0.1085205078125, "loss_aux_layer_6": 0.1102294921875, "loss_aux_layer_7": 0.103515625, "loss_aux_layer_8": 0.101318359375, "loss_aux_layer_9": 0.0992431640625, "step": 857, "total_loss": 0.6980363577604294 }, { "epoch": 0.16986735299940606, "grad_norm": 3.8708958625793457, "learning_rate": 5e-05, "llm_loss": 0.5892640054225922, "loss": 2.884, "loss_aux_layer_0": 0.027374267578125, "loss_aux_layer_1": 0.07958984375, "loss_aux_layer_10": 0.0986328125, "loss_aux_layer_11": 0.1046142578125, "loss_aux_layer_12": 0.1129150390625, "loss_aux_layer_13": 0.121337890625, "loss_aux_layer_14": 0.13525390625, "loss_aux_layer_15": 0.1474609375, "loss_aux_layer_16": 0.159423828125, "loss_aux_layer_17": 0.166748046875, "loss_aux_layer_18": 0.176025390625, "loss_aux_layer_19": 0.177734375, "loss_aux_layer_2": 0.0938720703125, "loss_aux_layer_20": 0.183349609375, "loss_aux_layer_21": 0.189697265625, "loss_aux_layer_22": 0.211669921875, "loss_aux_layer_23": 0.25439453125, "loss_aux_layer_3": 0.103271484375, "loss_aux_layer_4": 0.104736328125, "loss_aux_layer_5": 0.1058349609375, "loss_aux_layer_6": 0.10693359375, "loss_aux_layer_7": 0.10302734375, "loss_aux_layer_8": 0.099853515625, "loss_aux_layer_9": 0.09814453125, "step": 858, "total_loss": 0.7210016995668411 }, { "epoch": 0.17006533359730747, "grad_norm": 2.221391201019287, "learning_rate": 5e-05, "llm_loss": 0.6409479230642319, "loss": 3.0787, "loss_aux_layer_0": 0.027099609375, "loss_aux_layer_1": 0.079345703125, "loss_aux_layer_10": 0.0972900390625, "loss_aux_layer_11": 0.103515625, "loss_aux_layer_12": 0.11181640625, "loss_aux_layer_13": 0.1201171875, "loss_aux_layer_14": 0.133056640625, "loss_aux_layer_15": 0.144775390625, "loss_aux_layer_16": 0.156005859375, "loss_aux_layer_17": 0.16357421875, "loss_aux_layer_18": 0.171875, "loss_aux_layer_19": 0.171875, "loss_aux_layer_2": 0.087646484375, "loss_aux_layer_20": 0.1767578125, "loss_aux_layer_21": 0.182861328125, "loss_aux_layer_22": 0.204833984375, "loss_aux_layer_23": 0.247802734375, "loss_aux_layer_3": 0.100341796875, "loss_aux_layer_4": 0.1024169921875, "loss_aux_layer_5": 0.1041259765625, "loss_aux_layer_6": 0.1063232421875, "loss_aux_layer_7": 0.10107421875, "loss_aux_layer_8": 0.0985107421875, "loss_aux_layer_9": 0.0966796875, "step": 859, "total_loss": 0.7696634382009506 }, { "epoch": 0.17026331419520888, "grad_norm": 2.3613228797912598, "learning_rate": 5e-05, "llm_loss": 0.646856814622879, "loss": 3.0737, "loss_aux_layer_0": 0.026214599609375, "loss_aux_layer_1": 0.0743408203125, "loss_aux_layer_10": 0.092041015625, "loss_aux_layer_11": 0.09716796875, "loss_aux_layer_12": 0.1051025390625, "loss_aux_layer_13": 0.1124267578125, "loss_aux_layer_14": 0.123779296875, "loss_aux_layer_15": 0.1351318359375, "loss_aux_layer_16": 0.146484375, "loss_aux_layer_17": 0.154052734375, "loss_aux_layer_18": 0.162841796875, "loss_aux_layer_19": 0.163330078125, "loss_aux_layer_2": 0.0853271484375, "loss_aux_layer_20": 0.169921875, "loss_aux_layer_21": 0.175537109375, "loss_aux_layer_22": 0.1953125, "loss_aux_layer_23": 0.236083984375, "loss_aux_layer_3": 0.0936279296875, "loss_aux_layer_4": 0.095458984375, "loss_aux_layer_5": 0.0966796875, "loss_aux_layer_6": 0.0989990234375, "loss_aux_layer_7": 0.094482421875, "loss_aux_layer_8": 0.092529296875, "loss_aux_layer_9": 0.0909423828125, "step": 860, "total_loss": 0.7684266269207001 }, { "epoch": 0.17046129479311029, "grad_norm": 2.0130062103271484, "learning_rate": 5e-05, "llm_loss": 0.6179661005735397, "loss": 2.978, "loss_aux_layer_0": 0.027069091796875, "loss_aux_layer_1": 0.07470703125, "loss_aux_layer_10": 0.094482421875, "loss_aux_layer_11": 0.1002197265625, "loss_aux_layer_12": 0.1087646484375, "loss_aux_layer_13": 0.116943359375, "loss_aux_layer_14": 0.130126953125, "loss_aux_layer_15": 0.142333984375, "loss_aux_layer_16": 0.154052734375, "loss_aux_layer_17": 0.161865234375, "loss_aux_layer_18": 0.171630859375, "loss_aux_layer_19": 0.173583984375, "loss_aux_layer_2": 0.0830078125, "loss_aux_layer_20": 0.179931640625, "loss_aux_layer_21": 0.185546875, "loss_aux_layer_22": 0.20654296875, "loss_aux_layer_23": 0.24951171875, "loss_aux_layer_3": 0.0943603515625, "loss_aux_layer_4": 0.0966796875, "loss_aux_layer_5": 0.098388671875, "loss_aux_layer_6": 0.101806640625, "loss_aux_layer_7": 0.096923828125, "loss_aux_layer_8": 0.09521484375, "loss_aux_layer_9": 0.093505859375, "step": 861, "total_loss": 0.7444988042116165 }, { "epoch": 0.1706592753910117, "grad_norm": 2.0078482627868652, "learning_rate": 5e-05, "llm_loss": 0.6325401365756989, "loss": 3.0418, "loss_aux_layer_0": 0.028045654296875, "loss_aux_layer_1": 0.07666015625, "loss_aux_layer_10": 0.0950927734375, "loss_aux_layer_11": 0.10107421875, "loss_aux_layer_12": 0.1094970703125, "loss_aux_layer_13": 0.1181640625, "loss_aux_layer_14": 0.1314697265625, "loss_aux_layer_15": 0.143798828125, "loss_aux_layer_16": 0.156005859375, "loss_aux_layer_17": 0.163330078125, "loss_aux_layer_18": 0.173095703125, "loss_aux_layer_19": 0.175048828125, "loss_aux_layer_2": 0.0836181640625, "loss_aux_layer_20": 0.181396484375, "loss_aux_layer_21": 0.188720703125, "loss_aux_layer_22": 0.210693359375, "loss_aux_layer_23": 0.2548828125, "loss_aux_layer_3": 0.0947265625, "loss_aux_layer_4": 0.0968017578125, "loss_aux_layer_5": 0.0985107421875, "loss_aux_layer_6": 0.1015625, "loss_aux_layer_7": 0.0970458984375, "loss_aux_layer_8": 0.09521484375, "loss_aux_layer_9": 0.0936279296875, "step": 862, "total_loss": 0.7604479193687439 }, { "epoch": 0.17085725598891308, "grad_norm": 1.422369360923767, "learning_rate": 5e-05, "llm_loss": 0.6287079453468323, "loss": 3.0127, "loss_aux_layer_0": 0.0260009765625, "loss_aux_layer_1": 0.0762939453125, "loss_aux_layer_10": 0.093017578125, "loss_aux_layer_11": 0.0989990234375, "loss_aux_layer_12": 0.10693359375, "loss_aux_layer_13": 0.1148681640625, "loss_aux_layer_14": 0.1275634765625, "loss_aux_layer_15": 0.139404296875, "loss_aux_layer_16": 0.150390625, "loss_aux_layer_17": 0.157470703125, "loss_aux_layer_18": 0.1669921875, "loss_aux_layer_19": 0.16845703125, "loss_aux_layer_2": 0.084228515625, "loss_aux_layer_20": 0.174560546875, "loss_aux_layer_21": 0.180908203125, "loss_aux_layer_22": 0.202880859375, "loss_aux_layer_23": 0.24658203125, "loss_aux_layer_3": 0.094482421875, "loss_aux_layer_4": 0.096435546875, "loss_aux_layer_5": 0.0980224609375, "loss_aux_layer_6": 0.1007080078125, "loss_aux_layer_7": 0.095947265625, "loss_aux_layer_8": 0.093994140625, "loss_aux_layer_9": 0.0924072265625, "step": 863, "total_loss": 0.7531815022230148 }, { "epoch": 0.17105523658681449, "grad_norm": 3.0599679946899414, "learning_rate": 5e-05, "llm_loss": 0.6311982721090317, "loss": 3.0393, "loss_aux_layer_0": 0.0277099609375, "loss_aux_layer_1": 0.0794677734375, "loss_aux_layer_10": 0.0955810546875, "loss_aux_layer_11": 0.1019287109375, "loss_aux_layer_12": 0.10986328125, "loss_aux_layer_13": 0.118408203125, "loss_aux_layer_14": 0.13232421875, "loss_aux_layer_15": 0.144287109375, "loss_aux_layer_16": 0.156005859375, "loss_aux_layer_17": 0.1640625, "loss_aux_layer_18": 0.173828125, "loss_aux_layer_19": 0.175048828125, "loss_aux_layer_2": 0.087158203125, "loss_aux_layer_20": 0.18115234375, "loss_aux_layer_21": 0.18701171875, "loss_aux_layer_22": 0.20849609375, "loss_aux_layer_23": 0.251953125, "loss_aux_layer_3": 0.0972900390625, "loss_aux_layer_4": 0.099609375, "loss_aux_layer_5": 0.101318359375, "loss_aux_layer_6": 0.1038818359375, "loss_aux_layer_7": 0.09912109375, "loss_aux_layer_8": 0.0968017578125, "loss_aux_layer_9": 0.0948486328125, "step": 864, "total_loss": 0.7598249763250351 }, { "epoch": 0.1712532171847159, "grad_norm": 0.7681155800819397, "learning_rate": 5e-05, "llm_loss": 0.6326848417520523, "loss": 3.0376, "loss_aux_layer_0": 0.02593994140625, "loss_aux_layer_1": 0.0782470703125, "loss_aux_layer_10": 0.095703125, "loss_aux_layer_11": 0.1014404296875, "loss_aux_layer_12": 0.109619140625, "loss_aux_layer_13": 0.1181640625, "loss_aux_layer_14": 0.1307373046875, "loss_aux_layer_15": 0.142822265625, "loss_aux_layer_16": 0.15380859375, "loss_aux_layer_17": 0.1611328125, "loss_aux_layer_18": 0.1708984375, "loss_aux_layer_19": 0.171875, "loss_aux_layer_2": 0.0865478515625, "loss_aux_layer_20": 0.17626953125, "loss_aux_layer_21": 0.18212890625, "loss_aux_layer_22": 0.201904296875, "loss_aux_layer_23": 0.244140625, "loss_aux_layer_3": 0.0968017578125, "loss_aux_layer_4": 0.0987548828125, "loss_aux_layer_5": 0.1002197265625, "loss_aux_layer_6": 0.1029052734375, "loss_aux_layer_7": 0.0985107421875, "loss_aux_layer_8": 0.0965576171875, "loss_aux_layer_9": 0.094970703125, "step": 865, "total_loss": 0.7594051212072372 }, { "epoch": 0.1714511977826173, "grad_norm": 1.4980778694152832, "learning_rate": 5e-05, "llm_loss": 0.6488036513328552, "loss": 3.0919, "loss_aux_layer_0": 0.027130126953125, "loss_aux_layer_1": 0.075439453125, "loss_aux_layer_10": 0.0921630859375, "loss_aux_layer_11": 0.0982666015625, "loss_aux_layer_12": 0.107177734375, "loss_aux_layer_13": 0.115234375, "loss_aux_layer_14": 0.1280517578125, "loss_aux_layer_15": 0.14013671875, "loss_aux_layer_16": 0.151611328125, "loss_aux_layer_17": 0.15966796875, "loss_aux_layer_18": 0.169189453125, "loss_aux_layer_19": 0.170166015625, "loss_aux_layer_2": 0.0821533203125, "loss_aux_layer_20": 0.176025390625, "loss_aux_layer_21": 0.18115234375, "loss_aux_layer_22": 0.201171875, "loss_aux_layer_23": 0.24462890625, "loss_aux_layer_3": 0.093017578125, "loss_aux_layer_4": 0.094970703125, "loss_aux_layer_5": 0.0966796875, "loss_aux_layer_6": 0.098876953125, "loss_aux_layer_7": 0.0941162109375, "loss_aux_layer_8": 0.092529296875, "loss_aux_layer_9": 0.091064453125, "step": 866, "total_loss": 0.7729755342006683 }, { "epoch": 0.1716491783805187, "grad_norm": 1.112766146659851, "learning_rate": 5e-05, "llm_loss": 0.6306016594171524, "loss": 3.0257, "loss_aux_layer_0": 0.027069091796875, "loss_aux_layer_1": 0.075927734375, "loss_aux_layer_10": 0.0938720703125, "loss_aux_layer_11": 0.0997314453125, "loss_aux_layer_12": 0.10791015625, "loss_aux_layer_13": 0.1165771484375, "loss_aux_layer_14": 0.129150390625, "loss_aux_layer_15": 0.1416015625, "loss_aux_layer_16": 0.1533203125, "loss_aux_layer_17": 0.16162109375, "loss_aux_layer_18": 0.170166015625, "loss_aux_layer_19": 0.170654296875, "loss_aux_layer_2": 0.083740234375, "loss_aux_layer_20": 0.1767578125, "loss_aux_layer_21": 0.18359375, "loss_aux_layer_22": 0.20556640625, "loss_aux_layer_23": 0.249267578125, "loss_aux_layer_3": 0.09375, "loss_aux_layer_4": 0.096435546875, "loss_aux_layer_5": 0.09814453125, "loss_aux_layer_6": 0.1007080078125, "loss_aux_layer_7": 0.096435546875, "loss_aux_layer_8": 0.0947265625, "loss_aux_layer_9": 0.093017578125, "step": 867, "total_loss": 0.7564278244972229 }, { "epoch": 0.17184715897842012, "grad_norm": 1.0902529954910278, "learning_rate": 5e-05, "llm_loss": 0.6393438428640366, "loss": 3.0596, "loss_aux_layer_0": 0.0267333984375, "loss_aux_layer_1": 0.076416015625, "loss_aux_layer_10": 0.0927734375, "loss_aux_layer_11": 0.0987548828125, "loss_aux_layer_12": 0.107177734375, "loss_aux_layer_13": 0.115966796875, "loss_aux_layer_14": 0.128173828125, "loss_aux_layer_15": 0.141357421875, "loss_aux_layer_16": 0.1533203125, "loss_aux_layer_17": 0.161376953125, "loss_aux_layer_18": 0.170654296875, "loss_aux_layer_19": 0.172119140625, "loss_aux_layer_2": 0.0838623046875, "loss_aux_layer_20": 0.178466796875, "loss_aux_layer_21": 0.184326171875, "loss_aux_layer_22": 0.204833984375, "loss_aux_layer_23": 0.247314453125, "loss_aux_layer_3": 0.0938720703125, "loss_aux_layer_4": 0.09619140625, "loss_aux_layer_5": 0.09765625, "loss_aux_layer_6": 0.1002197265625, "loss_aux_layer_7": 0.0958251953125, "loss_aux_layer_8": 0.09375, "loss_aux_layer_9": 0.0921630859375, "step": 868, "total_loss": 0.7648893743753433 }, { "epoch": 0.17204513957632153, "grad_norm": 1.3282287120819092, "learning_rate": 5e-05, "llm_loss": 0.6287603229284286, "loss": 3.015, "loss_aux_layer_0": 0.02496337890625, "loss_aux_layer_1": 0.074951171875, "loss_aux_layer_10": 0.0941162109375, "loss_aux_layer_11": 0.099609375, "loss_aux_layer_12": 0.1077880859375, "loss_aux_layer_13": 0.115966796875, "loss_aux_layer_14": 0.1278076171875, "loss_aux_layer_15": 0.139892578125, "loss_aux_layer_16": 0.151611328125, "loss_aux_layer_17": 0.159912109375, "loss_aux_layer_18": 0.169921875, "loss_aux_layer_19": 0.171142578125, "loss_aux_layer_2": 0.0821533203125, "loss_aux_layer_20": 0.176513671875, "loss_aux_layer_21": 0.18310546875, "loss_aux_layer_22": 0.205078125, "loss_aux_layer_23": 0.248046875, "loss_aux_layer_3": 0.0928955078125, "loss_aux_layer_4": 0.094970703125, "loss_aux_layer_5": 0.0970458984375, "loss_aux_layer_6": 0.10009765625, "loss_aux_layer_7": 0.095947265625, "loss_aux_layer_8": 0.09423828125, "loss_aux_layer_9": 0.0927734375, "step": 869, "total_loss": 0.7537565231323242 }, { "epoch": 0.1722431201742229, "grad_norm": 1.3186269998550415, "learning_rate": 5e-05, "llm_loss": 0.6305919736623764, "loss": 3.0493, "loss_aux_layer_0": 0.027679443359375, "loss_aux_layer_1": 0.082763671875, "loss_aux_layer_10": 0.1009521484375, "loss_aux_layer_11": 0.10693359375, "loss_aux_layer_12": 0.1148681640625, "loss_aux_layer_13": 0.123291015625, "loss_aux_layer_14": 0.136474609375, "loss_aux_layer_15": 0.147705078125, "loss_aux_layer_16": 0.15869140625, "loss_aux_layer_17": 0.16552734375, "loss_aux_layer_18": 0.173828125, "loss_aux_layer_19": 0.173828125, "loss_aux_layer_2": 0.092041015625, "loss_aux_layer_20": 0.17919921875, "loss_aux_layer_21": 0.185546875, "loss_aux_layer_22": 0.20751953125, "loss_aux_layer_23": 0.2490234375, "loss_aux_layer_3": 0.1036376953125, "loss_aux_layer_4": 0.1060791015625, "loss_aux_layer_5": 0.107421875, "loss_aux_layer_6": 0.1103515625, "loss_aux_layer_7": 0.1048583984375, "loss_aux_layer_8": 0.1025390625, "loss_aux_layer_9": 0.100341796875, "step": 870, "total_loss": 0.7623182386159897 }, { "epoch": 0.17244110077212432, "grad_norm": 0.8785389065742493, "learning_rate": 5e-05, "llm_loss": 0.6361637264490128, "loss": 3.0485, "loss_aux_layer_0": 0.026885986328125, "loss_aux_layer_1": 0.0743408203125, "loss_aux_layer_10": 0.0933837890625, "loss_aux_layer_11": 0.09912109375, "loss_aux_layer_12": 0.10791015625, "loss_aux_layer_13": 0.1170654296875, "loss_aux_layer_14": 0.130615234375, "loss_aux_layer_15": 0.14306640625, "loss_aux_layer_16": 0.1552734375, "loss_aux_layer_17": 0.16259765625, "loss_aux_layer_18": 0.171875, "loss_aux_layer_19": 0.173095703125, "loss_aux_layer_2": 0.0819091796875, "loss_aux_layer_20": 0.1796875, "loss_aux_layer_21": 0.18603515625, "loss_aux_layer_22": 0.207763671875, "loss_aux_layer_23": 0.25, "loss_aux_layer_3": 0.0924072265625, "loss_aux_layer_4": 0.0946044921875, "loss_aux_layer_5": 0.0963134765625, "loss_aux_layer_6": 0.09912109375, "loss_aux_layer_7": 0.0948486328125, "loss_aux_layer_8": 0.0931396484375, "loss_aux_layer_9": 0.0919189453125, "step": 871, "total_loss": 0.7621269077062607 }, { "epoch": 0.17263908137002573, "grad_norm": 1.0227845907211304, "learning_rate": 5e-05, "llm_loss": 0.5596252605319023, "loss": 2.764, "loss_aux_layer_0": 0.0289306640625, "loss_aux_layer_1": 0.0809326171875, "loss_aux_layer_10": 0.0992431640625, "loss_aux_layer_11": 0.105224609375, "loss_aux_layer_12": 0.11328125, "loss_aux_layer_13": 0.1220703125, "loss_aux_layer_14": 0.1356201171875, "loss_aux_layer_15": 0.147705078125, "loss_aux_layer_16": 0.159912109375, "loss_aux_layer_17": 0.166748046875, "loss_aux_layer_18": 0.17724609375, "loss_aux_layer_19": 0.177734375, "loss_aux_layer_2": 0.0882568359375, "loss_aux_layer_20": 0.18310546875, "loss_aux_layer_21": 0.189208984375, "loss_aux_layer_22": 0.209716796875, "loss_aux_layer_23": 0.253662109375, "loss_aux_layer_3": 0.1002197265625, "loss_aux_layer_4": 0.1024169921875, "loss_aux_layer_5": 0.10400390625, "loss_aux_layer_6": 0.10693359375, "loss_aux_layer_7": 0.102294921875, "loss_aux_layer_8": 0.1005859375, "loss_aux_layer_9": 0.0985107421875, "step": 872, "total_loss": 0.6909955143928528 }, { "epoch": 0.17283706196792714, "grad_norm": 1.2235678434371948, "learning_rate": 5e-05, "llm_loss": 0.5855392813682556, "loss": 2.8436, "loss_aux_layer_0": 0.02581787109375, "loss_aux_layer_1": 0.076904296875, "loss_aux_layer_10": 0.0948486328125, "loss_aux_layer_11": 0.1004638671875, "loss_aux_layer_12": 0.1082763671875, "loss_aux_layer_13": 0.116455078125, "loss_aux_layer_14": 0.128662109375, "loss_aux_layer_15": 0.14013671875, "loss_aux_layer_16": 0.150634765625, "loss_aux_layer_17": 0.158447265625, "loss_aux_layer_18": 0.167236328125, "loss_aux_layer_19": 0.168212890625, "loss_aux_layer_2": 0.0845947265625, "loss_aux_layer_20": 0.17431640625, "loss_aux_layer_21": 0.180908203125, "loss_aux_layer_22": 0.202392578125, "loss_aux_layer_23": 0.244873046875, "loss_aux_layer_3": 0.0960693359375, "loss_aux_layer_4": 0.098388671875, "loss_aux_layer_5": 0.1002197265625, "loss_aux_layer_6": 0.102783203125, "loss_aux_layer_7": 0.098388671875, "loss_aux_layer_8": 0.09619140625, "loss_aux_layer_9": 0.093994140625, "step": 873, "total_loss": 0.710911825299263 }, { "epoch": 0.17303504256582855, "grad_norm": 0.8279033899307251, "learning_rate": 5e-05, "llm_loss": 0.5615645796060562, "loss": 2.7374, "loss_aux_layer_0": 0.026885986328125, "loss_aux_layer_1": 0.0732421875, "loss_aux_layer_10": 0.090087890625, "loss_aux_layer_11": 0.0958251953125, "loss_aux_layer_12": 0.103759765625, "loss_aux_layer_13": 0.11181640625, "loss_aux_layer_14": 0.1251220703125, "loss_aux_layer_15": 0.1376953125, "loss_aux_layer_16": 0.149658203125, "loss_aux_layer_17": 0.158447265625, "loss_aux_layer_18": 0.16796875, "loss_aux_layer_19": 0.17041015625, "loss_aux_layer_2": 0.079833984375, "loss_aux_layer_20": 0.176025390625, "loss_aux_layer_21": 0.181884765625, "loss_aux_layer_22": 0.202392578125, "loss_aux_layer_23": 0.24658203125, "loss_aux_layer_3": 0.09033203125, "loss_aux_layer_4": 0.0928955078125, "loss_aux_layer_5": 0.0946044921875, "loss_aux_layer_6": 0.097412109375, "loss_aux_layer_7": 0.093017578125, "loss_aux_layer_8": 0.0911865234375, "loss_aux_layer_9": 0.0892333984375, "step": 874, "total_loss": 0.6843377649784088 }, { "epoch": 0.17323302316372996, "grad_norm": 1.059554934501648, "learning_rate": 5e-05, "llm_loss": 0.6881130039691925, "loss": 3.257, "loss_aux_layer_0": 0.02630615234375, "loss_aux_layer_1": 0.076416015625, "loss_aux_layer_10": 0.094970703125, "loss_aux_layer_11": 0.1005859375, "loss_aux_layer_12": 0.109375, "loss_aux_layer_13": 0.117919921875, "loss_aux_layer_14": 0.130126953125, "loss_aux_layer_15": 0.141845703125, "loss_aux_layer_16": 0.153076171875, "loss_aux_layer_17": 0.160400390625, "loss_aux_layer_18": 0.169921875, "loss_aux_layer_19": 0.170654296875, "loss_aux_layer_2": 0.083984375, "loss_aux_layer_20": 0.1767578125, "loss_aux_layer_21": 0.18359375, "loss_aux_layer_22": 0.205810546875, "loss_aux_layer_23": 0.2490234375, "loss_aux_layer_3": 0.0948486328125, "loss_aux_layer_4": 0.096923828125, "loss_aux_layer_5": 0.0986328125, "loss_aux_layer_6": 0.1014404296875, "loss_aux_layer_7": 0.096923828125, "loss_aux_layer_8": 0.09521484375, "loss_aux_layer_9": 0.093505859375, "step": 875, "total_loss": 0.814259484410286 }, { "epoch": 0.17343100376163137, "grad_norm": 1.54487943649292, "learning_rate": 5e-05, "llm_loss": 0.6269938200712204, "loss": 3.016, "loss_aux_layer_0": 0.030914306640625, "loss_aux_layer_1": 0.080078125, "loss_aux_layer_10": 0.094970703125, "loss_aux_layer_11": 0.1007080078125, "loss_aux_layer_12": 0.109130859375, "loss_aux_layer_13": 0.117431640625, "loss_aux_layer_14": 0.1300048828125, "loss_aux_layer_15": 0.14208984375, "loss_aux_layer_16": 0.1533203125, "loss_aux_layer_17": 0.160400390625, "loss_aux_layer_18": 0.16943359375, "loss_aux_layer_19": 0.170654296875, "loss_aux_layer_2": 0.0863037109375, "loss_aux_layer_20": 0.17626953125, "loss_aux_layer_21": 0.18359375, "loss_aux_layer_22": 0.20556640625, "loss_aux_layer_23": 0.24853515625, "loss_aux_layer_3": 0.0968017578125, "loss_aux_layer_4": 0.0989990234375, "loss_aux_layer_5": 0.1005859375, "loss_aux_layer_6": 0.103271484375, "loss_aux_layer_7": 0.0985107421875, "loss_aux_layer_8": 0.096435546875, "loss_aux_layer_9": 0.0941162109375, "step": 876, "total_loss": 0.7539981305599213 }, { "epoch": 0.17362898435953278, "grad_norm": 2.495358467102051, "learning_rate": 5e-05, "llm_loss": 0.668493390083313, "loss": 3.1673, "loss_aux_layer_0": 0.026947021484375, "loss_aux_layer_1": 0.0745849609375, "loss_aux_layer_10": 0.091796875, "loss_aux_layer_11": 0.0972900390625, "loss_aux_layer_12": 0.1053466796875, "loss_aux_layer_13": 0.1138916015625, "loss_aux_layer_14": 0.1263427734375, "loss_aux_layer_15": 0.1376953125, "loss_aux_layer_16": 0.149169921875, "loss_aux_layer_17": 0.15673828125, "loss_aux_layer_18": 0.165771484375, "loss_aux_layer_19": 0.16650390625, "loss_aux_layer_2": 0.0838623046875, "loss_aux_layer_20": 0.17236328125, "loss_aux_layer_21": 0.17822265625, "loss_aux_layer_22": 0.199951171875, "loss_aux_layer_23": 0.242919921875, "loss_aux_layer_3": 0.0948486328125, "loss_aux_layer_4": 0.096923828125, "loss_aux_layer_5": 0.0987548828125, "loss_aux_layer_6": 0.100341796875, "loss_aux_layer_7": 0.09521484375, "loss_aux_layer_8": 0.093017578125, "loss_aux_layer_9": 0.09130859375, "step": 877, "total_loss": 0.7918273210525513 }, { "epoch": 0.17382696495743416, "grad_norm": 1.7526323795318604, "learning_rate": 5e-05, "llm_loss": 0.6247402131557465, "loss": 2.9943, "loss_aux_layer_0": 0.027557373046875, "loss_aux_layer_1": 0.0740966796875, "loss_aux_layer_10": 0.09228515625, "loss_aux_layer_11": 0.097412109375, "loss_aux_layer_12": 0.105224609375, "loss_aux_layer_13": 0.11328125, "loss_aux_layer_14": 0.126220703125, "loss_aux_layer_15": 0.137939453125, "loss_aux_layer_16": 0.149169921875, "loss_aux_layer_17": 0.157470703125, "loss_aux_layer_18": 0.16650390625, "loss_aux_layer_19": 0.167724609375, "loss_aux_layer_2": 0.083251953125, "loss_aux_layer_20": 0.174072265625, "loss_aux_layer_21": 0.180419921875, "loss_aux_layer_22": 0.201904296875, "loss_aux_layer_23": 0.244384765625, "loss_aux_layer_3": 0.0943603515625, "loss_aux_layer_4": 0.0966796875, "loss_aux_layer_5": 0.098388671875, "loss_aux_layer_6": 0.100341796875, "loss_aux_layer_7": 0.095703125, "loss_aux_layer_8": 0.093994140625, "loss_aux_layer_9": 0.091796875, "step": 878, "total_loss": 0.7485738545656204 }, { "epoch": 0.17402494555533557, "grad_norm": 1.441172480583191, "learning_rate": 5e-05, "llm_loss": 0.6350670680403709, "loss": 3.0464, "loss_aux_layer_0": 0.027130126953125, "loss_aux_layer_1": 0.075439453125, "loss_aux_layer_10": 0.093505859375, "loss_aux_layer_11": 0.0994873046875, "loss_aux_layer_12": 0.107421875, "loss_aux_layer_13": 0.1158447265625, "loss_aux_layer_14": 0.128662109375, "loss_aux_layer_15": 0.140869140625, "loss_aux_layer_16": 0.15283203125, "loss_aux_layer_17": 0.1611328125, "loss_aux_layer_18": 0.17138671875, "loss_aux_layer_19": 0.17431640625, "loss_aux_layer_2": 0.0838623046875, "loss_aux_layer_20": 0.18115234375, "loss_aux_layer_21": 0.187255859375, "loss_aux_layer_22": 0.208740234375, "loss_aux_layer_23": 0.2509765625, "loss_aux_layer_3": 0.095458984375, "loss_aux_layer_4": 0.0975341796875, "loss_aux_layer_5": 0.099365234375, "loss_aux_layer_6": 0.1011962890625, "loss_aux_layer_7": 0.09619140625, "loss_aux_layer_8": 0.09423828125, "loss_aux_layer_9": 0.0928955078125, "step": 879, "total_loss": 0.7616077214479446 }, { "epoch": 0.17422292615323698, "grad_norm": 1.4079583883285522, "learning_rate": 5e-05, "llm_loss": 0.699240043759346, "loss": 3.2964, "loss_aux_layer_0": 0.027923583984375, "loss_aux_layer_1": 0.0762939453125, "loss_aux_layer_10": 0.0921630859375, "loss_aux_layer_11": 0.097900390625, "loss_aux_layer_12": 0.105712890625, "loss_aux_layer_13": 0.1138916015625, "loss_aux_layer_14": 0.127197265625, "loss_aux_layer_15": 0.139892578125, "loss_aux_layer_16": 0.15283203125, "loss_aux_layer_17": 0.16064453125, "loss_aux_layer_18": 0.169921875, "loss_aux_layer_19": 0.171630859375, "loss_aux_layer_2": 0.0828857421875, "loss_aux_layer_20": 0.17822265625, "loss_aux_layer_21": 0.18408203125, "loss_aux_layer_22": 0.204833984375, "loss_aux_layer_23": 0.2470703125, "loss_aux_layer_3": 0.0927734375, "loss_aux_layer_4": 0.0947265625, "loss_aux_layer_5": 0.096435546875, "loss_aux_layer_6": 0.099365234375, "loss_aux_layer_7": 0.0948486328125, "loss_aux_layer_8": 0.0931396484375, "loss_aux_layer_9": 0.09130859375, "step": 880, "total_loss": 0.8240931630134583 }, { "epoch": 0.1744209067511384, "grad_norm": 1.4989787340164185, "learning_rate": 5e-05, "llm_loss": 0.5323819518089294, "loss": 2.629, "loss_aux_layer_0": 0.026092529296875, "loss_aux_layer_1": 0.075927734375, "loss_aux_layer_10": 0.0933837890625, "loss_aux_layer_11": 0.099365234375, "loss_aux_layer_12": 0.1080322265625, "loss_aux_layer_13": 0.1160888671875, "loss_aux_layer_14": 0.1285400390625, "loss_aux_layer_15": 0.140380859375, "loss_aux_layer_16": 0.152099609375, "loss_aux_layer_17": 0.158935546875, "loss_aux_layer_18": 0.167724609375, "loss_aux_layer_19": 0.168212890625, "loss_aux_layer_2": 0.085205078125, "loss_aux_layer_20": 0.174560546875, "loss_aux_layer_21": 0.181640625, "loss_aux_layer_22": 0.201904296875, "loss_aux_layer_23": 0.244384765625, "loss_aux_layer_3": 0.0950927734375, "loss_aux_layer_4": 0.0970458984375, "loss_aux_layer_5": 0.098388671875, "loss_aux_layer_6": 0.1009521484375, "loss_aux_layer_7": 0.09619140625, "loss_aux_layer_8": 0.09423828125, "loss_aux_layer_9": 0.0924072265625, "step": 881, "total_loss": 0.6572374701499939 }, { "epoch": 0.1746188873490398, "grad_norm": 1.8961210250854492, "learning_rate": 5e-05, "llm_loss": 0.5192453041672707, "loss": 2.5782, "loss_aux_layer_0": 0.026214599609375, "loss_aux_layer_1": 0.07470703125, "loss_aux_layer_10": 0.09375, "loss_aux_layer_11": 0.09912109375, "loss_aux_layer_12": 0.1072998046875, "loss_aux_layer_13": 0.1158447265625, "loss_aux_layer_14": 0.1282958984375, "loss_aux_layer_15": 0.140869140625, "loss_aux_layer_16": 0.15283203125, "loss_aux_layer_17": 0.1611328125, "loss_aux_layer_18": 0.170166015625, "loss_aux_layer_19": 0.171630859375, "loss_aux_layer_2": 0.08251953125, "loss_aux_layer_20": 0.177978515625, "loss_aux_layer_21": 0.183837890625, "loss_aux_layer_22": 0.20556640625, "loss_aux_layer_23": 0.247802734375, "loss_aux_layer_3": 0.0936279296875, "loss_aux_layer_4": 0.095703125, "loss_aux_layer_5": 0.0975341796875, "loss_aux_layer_6": 0.099853515625, "loss_aux_layer_7": 0.0955810546875, "loss_aux_layer_8": 0.0938720703125, "loss_aux_layer_9": 0.0924072265625, "step": 882, "total_loss": 0.6445579528808594 }, { "epoch": 0.1748168679469412, "grad_norm": 1.9206122159957886, "learning_rate": 5e-05, "llm_loss": 0.6237062811851501, "loss": 3.002, "loss_aux_layer_0": 0.0257568359375, "loss_aux_layer_1": 0.0780029296875, "loss_aux_layer_10": 0.09619140625, "loss_aux_layer_11": 0.10205078125, "loss_aux_layer_12": 0.1103515625, "loss_aux_layer_13": 0.1187744140625, "loss_aux_layer_14": 0.1312255859375, "loss_aux_layer_15": 0.142333984375, "loss_aux_layer_16": 0.153076171875, "loss_aux_layer_17": 0.160888671875, "loss_aux_layer_18": 0.169677734375, "loss_aux_layer_19": 0.17041015625, "loss_aux_layer_2": 0.0853271484375, "loss_aux_layer_20": 0.175537109375, "loss_aux_layer_21": 0.181396484375, "loss_aux_layer_22": 0.202392578125, "loss_aux_layer_23": 0.244140625, "loss_aux_layer_3": 0.0972900390625, "loss_aux_layer_4": 0.099853515625, "loss_aux_layer_5": 0.1019287109375, "loss_aux_layer_6": 0.1043701171875, "loss_aux_layer_7": 0.099365234375, "loss_aux_layer_8": 0.096923828125, "loss_aux_layer_9": 0.0950927734375, "step": 883, "total_loss": 0.7505063712596893 }, { "epoch": 0.17501484854484262, "grad_norm": 1.2187509536743164, "learning_rate": 5e-05, "llm_loss": 0.6469869464635849, "loss": 3.094, "loss_aux_layer_0": 0.026031494140625, "loss_aux_layer_1": 0.076416015625, "loss_aux_layer_10": 0.094482421875, "loss_aux_layer_11": 0.10009765625, "loss_aux_layer_12": 0.1085205078125, "loss_aux_layer_13": 0.1171875, "loss_aux_layer_14": 0.13037109375, "loss_aux_layer_15": 0.142578125, "loss_aux_layer_16": 0.154541015625, "loss_aux_layer_17": 0.161865234375, "loss_aux_layer_18": 0.171875, "loss_aux_layer_19": 0.17333984375, "loss_aux_layer_2": 0.0838623046875, "loss_aux_layer_20": 0.178466796875, "loss_aux_layer_21": 0.184814453125, "loss_aux_layer_22": 0.20556640625, "loss_aux_layer_23": 0.24755859375, "loss_aux_layer_3": 0.0950927734375, "loss_aux_layer_4": 0.0972900390625, "loss_aux_layer_5": 0.0987548828125, "loss_aux_layer_6": 0.1015625, "loss_aux_layer_7": 0.0972900390625, "loss_aux_layer_8": 0.09521484375, "loss_aux_layer_9": 0.0936279296875, "step": 884, "total_loss": 0.7735104411840439 }, { "epoch": 0.175212829142744, "grad_norm": 2.3517065048217773, "learning_rate": 5e-05, "llm_loss": 0.6088406145572662, "loss": 2.933, "loss_aux_layer_0": 0.0255126953125, "loss_aux_layer_1": 0.0758056640625, "loss_aux_layer_10": 0.0948486328125, "loss_aux_layer_11": 0.1004638671875, "loss_aux_layer_12": 0.1083984375, "loss_aux_layer_13": 0.1162109375, "loss_aux_layer_14": 0.1279296875, "loss_aux_layer_15": 0.13916015625, "loss_aux_layer_16": 0.14990234375, "loss_aux_layer_17": 0.1572265625, "loss_aux_layer_18": 0.16650390625, "loss_aux_layer_19": 0.166748046875, "loss_aux_layer_2": 0.0838623046875, "loss_aux_layer_20": 0.17236328125, "loss_aux_layer_21": 0.178466796875, "loss_aux_layer_22": 0.19921875, "loss_aux_layer_23": 0.241455078125, "loss_aux_layer_3": 0.0958251953125, "loss_aux_layer_4": 0.097900390625, "loss_aux_layer_5": 0.0992431640625, "loss_aux_layer_6": 0.102294921875, "loss_aux_layer_7": 0.09765625, "loss_aux_layer_8": 0.0958251953125, "loss_aux_layer_9": 0.0938720703125, "step": 885, "total_loss": 0.733247309923172 }, { "epoch": 0.1754108097406454, "grad_norm": 1.23514986038208, "learning_rate": 5e-05, "llm_loss": 0.6234436482191086, "loss": 2.9953, "loss_aux_layer_0": 0.02593994140625, "loss_aux_layer_1": 0.0755615234375, "loss_aux_layer_10": 0.0938720703125, "loss_aux_layer_11": 0.10009765625, "loss_aux_layer_12": 0.1083984375, "loss_aux_layer_13": 0.1168212890625, "loss_aux_layer_14": 0.12939453125, "loss_aux_layer_15": 0.141357421875, "loss_aux_layer_16": 0.15283203125, "loss_aux_layer_17": 0.160400390625, "loss_aux_layer_18": 0.169677734375, "loss_aux_layer_19": 0.171142578125, "loss_aux_layer_2": 0.082763671875, "loss_aux_layer_20": 0.17724609375, "loss_aux_layer_21": 0.18310546875, "loss_aux_layer_22": 0.204345703125, "loss_aux_layer_23": 0.24658203125, "loss_aux_layer_3": 0.093505859375, "loss_aux_layer_4": 0.095947265625, "loss_aux_layer_5": 0.0972900390625, "loss_aux_layer_6": 0.10009765625, "loss_aux_layer_7": 0.0958251953125, "loss_aux_layer_8": 0.09423828125, "loss_aux_layer_9": 0.092529296875, "step": 886, "total_loss": 0.7488141655921936 }, { "epoch": 0.17560879033854682, "grad_norm": 1.5026401281356812, "learning_rate": 5e-05, "llm_loss": 0.6735029518604279, "loss": 3.208, "loss_aux_layer_0": 0.026611328125, "loss_aux_layer_1": 0.0784912109375, "loss_aux_layer_10": 0.0980224609375, "loss_aux_layer_11": 0.1038818359375, "loss_aux_layer_12": 0.112060546875, "loss_aux_layer_13": 0.120849609375, "loss_aux_layer_14": 0.13330078125, "loss_aux_layer_15": 0.144775390625, "loss_aux_layer_16": 0.156005859375, "loss_aux_layer_17": 0.1630859375, "loss_aux_layer_18": 0.172119140625, "loss_aux_layer_19": 0.17236328125, "loss_aux_layer_2": 0.0870361328125, "loss_aux_layer_20": 0.177978515625, "loss_aux_layer_21": 0.183349609375, "loss_aux_layer_22": 0.202880859375, "loss_aux_layer_23": 0.24462890625, "loss_aux_layer_3": 0.098876953125, "loss_aux_layer_4": 0.1015625, "loss_aux_layer_5": 0.1033935546875, "loss_aux_layer_6": 0.1063232421875, "loss_aux_layer_7": 0.101318359375, "loss_aux_layer_8": 0.0987548828125, "loss_aux_layer_9": 0.09716796875, "step": 887, "total_loss": 0.802012026309967 }, { "epoch": 0.17580677093644823, "grad_norm": 1.2150650024414062, "learning_rate": 5e-05, "llm_loss": 0.6458766013383865, "loss": 3.0801, "loss_aux_layer_0": 0.028289794921875, "loss_aux_layer_1": 0.0784912109375, "loss_aux_layer_10": 0.0926513671875, "loss_aux_layer_11": 0.098388671875, "loss_aux_layer_12": 0.1063232421875, "loss_aux_layer_13": 0.1148681640625, "loss_aux_layer_14": 0.126708984375, "loss_aux_layer_15": 0.137939453125, "loss_aux_layer_16": 0.1494140625, "loss_aux_layer_17": 0.157470703125, "loss_aux_layer_18": 0.1669921875, "loss_aux_layer_19": 0.168701171875, "loss_aux_layer_2": 0.0826416015625, "loss_aux_layer_20": 0.1748046875, "loss_aux_layer_21": 0.180908203125, "loss_aux_layer_22": 0.201904296875, "loss_aux_layer_23": 0.244140625, "loss_aux_layer_3": 0.093505859375, "loss_aux_layer_4": 0.095947265625, "loss_aux_layer_5": 0.09765625, "loss_aux_layer_6": 0.1002197265625, "loss_aux_layer_7": 0.0960693359375, "loss_aux_layer_8": 0.0936279296875, "loss_aux_layer_9": 0.0916748046875, "step": 888, "total_loss": 0.770018920302391 }, { "epoch": 0.17600475153434963, "grad_norm": 1.1915061473846436, "learning_rate": 5e-05, "llm_loss": 0.6192182153463364, "loss": 2.9863, "loss_aux_layer_0": 0.026580810546875, "loss_aux_layer_1": 0.078857421875, "loss_aux_layer_10": 0.096923828125, "loss_aux_layer_11": 0.1029052734375, "loss_aux_layer_12": 0.1112060546875, "loss_aux_layer_13": 0.1195068359375, "loss_aux_layer_14": 0.1328125, "loss_aux_layer_15": 0.144287109375, "loss_aux_layer_16": 0.155029296875, "loss_aux_layer_17": 0.162841796875, "loss_aux_layer_18": 0.1708984375, "loss_aux_layer_19": 0.1708984375, "loss_aux_layer_2": 0.0859375, "loss_aux_layer_20": 0.17626953125, "loss_aux_layer_21": 0.181396484375, "loss_aux_layer_22": 0.200927734375, "loss_aux_layer_23": 0.240478515625, "loss_aux_layer_3": 0.0980224609375, "loss_aux_layer_4": 0.100830078125, "loss_aux_layer_5": 0.1024169921875, "loss_aux_layer_6": 0.1044921875, "loss_aux_layer_7": 0.099609375, "loss_aux_layer_8": 0.09765625, "loss_aux_layer_9": 0.095703125, "step": 889, "total_loss": 0.7465705871582031 }, { "epoch": 0.17620273213225104, "grad_norm": 1.3092890977859497, "learning_rate": 5e-05, "llm_loss": 0.6279847323894501, "loss": 3.0165, "loss_aux_layer_0": 0.02734375, "loss_aux_layer_1": 0.073974609375, "loss_aux_layer_10": 0.09326171875, "loss_aux_layer_11": 0.0987548828125, "loss_aux_layer_12": 0.1070556640625, "loss_aux_layer_13": 0.11572265625, "loss_aux_layer_14": 0.129638671875, "loss_aux_layer_15": 0.14208984375, "loss_aux_layer_16": 0.154052734375, "loss_aux_layer_17": 0.161865234375, "loss_aux_layer_18": 0.171142578125, "loss_aux_layer_19": 0.17431640625, "loss_aux_layer_2": 0.0811767578125, "loss_aux_layer_20": 0.180908203125, "loss_aux_layer_21": 0.1884765625, "loss_aux_layer_22": 0.21044921875, "loss_aux_layer_23": 0.253173828125, "loss_aux_layer_3": 0.09228515625, "loss_aux_layer_4": 0.0947265625, "loss_aux_layer_5": 0.0968017578125, "loss_aux_layer_6": 0.099609375, "loss_aux_layer_7": 0.09521484375, "loss_aux_layer_8": 0.093505859375, "loss_aux_layer_9": 0.092041015625, "step": 890, "total_loss": 0.7541339248418808 }, { "epoch": 0.17640071273015245, "grad_norm": 1.799271821975708, "learning_rate": 5e-05, "llm_loss": 0.6017882972955704, "loss": 2.9085, "loss_aux_layer_0": 0.027099609375, "loss_aux_layer_1": 0.0736083984375, "loss_aux_layer_10": 0.0936279296875, "loss_aux_layer_11": 0.0997314453125, "loss_aux_layer_12": 0.10791015625, "loss_aux_layer_13": 0.116455078125, "loss_aux_layer_14": 0.1292724609375, "loss_aux_layer_15": 0.140869140625, "loss_aux_layer_16": 0.152587890625, "loss_aux_layer_17": 0.16015625, "loss_aux_layer_18": 0.169921875, "loss_aux_layer_19": 0.171142578125, "loss_aux_layer_2": 0.0821533203125, "loss_aux_layer_20": 0.177490234375, "loss_aux_layer_21": 0.182861328125, "loss_aux_layer_22": 0.2060546875, "loss_aux_layer_23": 0.248291015625, "loss_aux_layer_3": 0.0933837890625, "loss_aux_layer_4": 0.0958251953125, "loss_aux_layer_5": 0.097412109375, "loss_aux_layer_6": 0.099853515625, "loss_aux_layer_7": 0.095458984375, "loss_aux_layer_8": 0.0943603515625, "loss_aux_layer_9": 0.092529296875, "step": 891, "total_loss": 0.727125272154808 }, { "epoch": 0.17659869332805386, "grad_norm": 1.6754717826843262, "learning_rate": 5e-05, "llm_loss": 0.6183764338493347, "loss": 2.9793, "loss_aux_layer_0": 0.0255126953125, "loss_aux_layer_1": 0.075439453125, "loss_aux_layer_10": 0.09521484375, "loss_aux_layer_11": 0.100341796875, "loss_aux_layer_12": 0.1085205078125, "loss_aux_layer_13": 0.11669921875, "loss_aux_layer_14": 0.1292724609375, "loss_aux_layer_15": 0.140869140625, "loss_aux_layer_16": 0.15283203125, "loss_aux_layer_17": 0.16015625, "loss_aux_layer_18": 0.16943359375, "loss_aux_layer_19": 0.1708984375, "loss_aux_layer_2": 0.0845947265625, "loss_aux_layer_20": 0.1767578125, "loss_aux_layer_21": 0.18359375, "loss_aux_layer_22": 0.20556640625, "loss_aux_layer_23": 0.248046875, "loss_aux_layer_3": 0.0968017578125, "loss_aux_layer_4": 0.099609375, "loss_aux_layer_5": 0.1014404296875, "loss_aux_layer_6": 0.1036376953125, "loss_aux_layer_7": 0.0987548828125, "loss_aux_layer_8": 0.0966796875, "loss_aux_layer_9": 0.0946044921875, "step": 892, "total_loss": 0.7448298037052155 }, { "epoch": 0.17679667392595524, "grad_norm": 3.118359088897705, "learning_rate": 5e-05, "llm_loss": 0.6170264333486557, "loss": 2.9715, "loss_aux_layer_0": 0.02496337890625, "loss_aux_layer_1": 0.0750732421875, "loss_aux_layer_10": 0.0938720703125, "loss_aux_layer_11": 0.09912109375, "loss_aux_layer_12": 0.1068115234375, "loss_aux_layer_13": 0.11474609375, "loss_aux_layer_14": 0.1273193359375, "loss_aux_layer_15": 0.138916015625, "loss_aux_layer_16": 0.151123046875, "loss_aux_layer_17": 0.158935546875, "loss_aux_layer_18": 0.16796875, "loss_aux_layer_19": 0.16943359375, "loss_aux_layer_2": 0.086669921875, "loss_aux_layer_20": 0.176513671875, "loss_aux_layer_21": 0.1826171875, "loss_aux_layer_22": 0.20458984375, "loss_aux_layer_23": 0.24853515625, "loss_aux_layer_3": 0.0985107421875, "loss_aux_layer_4": 0.1005859375, "loss_aux_layer_5": 0.1024169921875, "loss_aux_layer_6": 0.1043701171875, "loss_aux_layer_7": 0.0982666015625, "loss_aux_layer_8": 0.0958251953125, "loss_aux_layer_9": 0.09326171875, "step": 893, "total_loss": 0.742886483669281 }, { "epoch": 0.17699465452385665, "grad_norm": 1.6425960063934326, "learning_rate": 5e-05, "llm_loss": 0.6274716258049011, "loss": 3.0111, "loss_aux_layer_0": 0.025054931640625, "loss_aux_layer_1": 0.075927734375, "loss_aux_layer_10": 0.0947265625, "loss_aux_layer_11": 0.1007080078125, "loss_aux_layer_12": 0.1087646484375, "loss_aux_layer_13": 0.1168212890625, "loss_aux_layer_14": 0.129150390625, "loss_aux_layer_15": 0.14111328125, "loss_aux_layer_16": 0.152587890625, "loss_aux_layer_17": 0.15966796875, "loss_aux_layer_18": 0.168212890625, "loss_aux_layer_19": 0.168212890625, "loss_aux_layer_2": 0.0855712890625, "loss_aux_layer_20": 0.174072265625, "loss_aux_layer_21": 0.17919921875, "loss_aux_layer_22": 0.199951171875, "loss_aux_layer_23": 0.24267578125, "loss_aux_layer_3": 0.096435546875, "loss_aux_layer_4": 0.0986328125, "loss_aux_layer_5": 0.10009765625, "loss_aux_layer_6": 0.1024169921875, "loss_aux_layer_7": 0.0980224609375, "loss_aux_layer_8": 0.0960693359375, "loss_aux_layer_9": 0.0941162109375, "step": 894, "total_loss": 0.7527836263179779 }, { "epoch": 0.17719263512175806, "grad_norm": 1.9941178560256958, "learning_rate": 5e-05, "llm_loss": 0.7124056667089462, "loss": 3.3526, "loss_aux_layer_0": 0.026336669921875, "loss_aux_layer_1": 0.0733642578125, "loss_aux_layer_10": 0.0947265625, "loss_aux_layer_11": 0.0999755859375, "loss_aux_layer_12": 0.1085205078125, "loss_aux_layer_13": 0.1175537109375, "loss_aux_layer_14": 0.13037109375, "loss_aux_layer_15": 0.142578125, "loss_aux_layer_16": 0.1552734375, "loss_aux_layer_17": 0.1630859375, "loss_aux_layer_18": 0.171630859375, "loss_aux_layer_19": 0.173095703125, "loss_aux_layer_2": 0.08154296875, "loss_aux_layer_20": 0.178955078125, "loss_aux_layer_21": 0.182861328125, "loss_aux_layer_22": 0.20263671875, "loss_aux_layer_23": 0.243896484375, "loss_aux_layer_3": 0.0931396484375, "loss_aux_layer_4": 0.095703125, "loss_aux_layer_5": 0.0975341796875, "loss_aux_layer_6": 0.1002197265625, "loss_aux_layer_7": 0.095947265625, "loss_aux_layer_8": 0.094970703125, "loss_aux_layer_9": 0.093505859375, "step": 895, "total_loss": 0.8381444960832596 }, { "epoch": 0.17739061571965947, "grad_norm": 0.9643200039863586, "learning_rate": 5e-05, "llm_loss": 0.6188590973615646, "loss": 2.9795, "loss_aux_layer_0": 0.02679443359375, "loss_aux_layer_1": 0.0765380859375, "loss_aux_layer_10": 0.0948486328125, "loss_aux_layer_11": 0.100341796875, "loss_aux_layer_12": 0.1082763671875, "loss_aux_layer_13": 0.11669921875, "loss_aux_layer_14": 0.1297607421875, "loss_aux_layer_15": 0.1416015625, "loss_aux_layer_16": 0.152587890625, "loss_aux_layer_17": 0.16064453125, "loss_aux_layer_18": 0.17041015625, "loss_aux_layer_19": 0.17138671875, "loss_aux_layer_2": 0.0830078125, "loss_aux_layer_20": 0.17724609375, "loss_aux_layer_21": 0.18359375, "loss_aux_layer_22": 0.20458984375, "loss_aux_layer_23": 0.2470703125, "loss_aux_layer_3": 0.09423828125, "loss_aux_layer_4": 0.0970458984375, "loss_aux_layer_5": 0.09912109375, "loss_aux_layer_6": 0.1019287109375, "loss_aux_layer_7": 0.09716796875, "loss_aux_layer_8": 0.0958251953125, "loss_aux_layer_9": 0.0941162109375, "step": 896, "total_loss": 0.744872510433197 }, { "epoch": 0.17758859631756088, "grad_norm": 1.2975713014602661, "learning_rate": 5e-05, "llm_loss": 0.6553588509559631, "loss": 3.1342, "loss_aux_layer_0": 0.025665283203125, "loss_aux_layer_1": 0.077880859375, "loss_aux_layer_10": 0.0980224609375, "loss_aux_layer_11": 0.10400390625, "loss_aux_layer_12": 0.1123046875, "loss_aux_layer_13": 0.1201171875, "loss_aux_layer_14": 0.132568359375, "loss_aux_layer_15": 0.144287109375, "loss_aux_layer_16": 0.155517578125, "loss_aux_layer_17": 0.16259765625, "loss_aux_layer_18": 0.1708984375, "loss_aux_layer_19": 0.171142578125, "loss_aux_layer_2": 0.086669921875, "loss_aux_layer_20": 0.1767578125, "loss_aux_layer_21": 0.182861328125, "loss_aux_layer_22": 0.203857421875, "loss_aux_layer_23": 0.24560546875, "loss_aux_layer_3": 0.0987548828125, "loss_aux_layer_4": 0.1014404296875, "loss_aux_layer_5": 0.10302734375, "loss_aux_layer_6": 0.1058349609375, "loss_aux_layer_7": 0.1009521484375, "loss_aux_layer_8": 0.0987548828125, "loss_aux_layer_9": 0.0968017578125, "step": 897, "total_loss": 0.7835495173931122 }, { "epoch": 0.1777865769154623, "grad_norm": 1.1772890090942383, "learning_rate": 5e-05, "llm_loss": 0.6977859139442444, "loss": 3.2834, "loss_aux_layer_0": 0.025726318359375, "loss_aux_layer_1": 0.0728759765625, "loss_aux_layer_10": 0.0916748046875, "loss_aux_layer_11": 0.097412109375, "loss_aux_layer_12": 0.10546875, "loss_aux_layer_13": 0.1138916015625, "loss_aux_layer_14": 0.1260986328125, "loss_aux_layer_15": 0.137451171875, "loss_aux_layer_16": 0.14892578125, "loss_aux_layer_17": 0.156494140625, "loss_aux_layer_18": 0.165771484375, "loss_aux_layer_19": 0.167236328125, "loss_aux_layer_2": 0.0814208984375, "loss_aux_layer_20": 0.173828125, "loss_aux_layer_21": 0.1806640625, "loss_aux_layer_22": 0.203125, "loss_aux_layer_23": 0.244873046875, "loss_aux_layer_3": 0.09228515625, "loss_aux_layer_4": 0.094970703125, "loss_aux_layer_5": 0.096435546875, "loss_aux_layer_6": 0.098876953125, "loss_aux_layer_7": 0.0943603515625, "loss_aux_layer_8": 0.09228515625, "loss_aux_layer_9": 0.0906982421875, "step": 898, "total_loss": 0.8208552300930023 }, { "epoch": 0.1779845575133637, "grad_norm": 1.2880862951278687, "learning_rate": 5e-05, "llm_loss": 0.5977775305509567, "loss": 2.8977, "loss_aux_layer_0": 0.027130126953125, "loss_aux_layer_1": 0.07763671875, "loss_aux_layer_10": 0.095947265625, "loss_aux_layer_11": 0.1015625, "loss_aux_layer_12": 0.109619140625, "loss_aux_layer_13": 0.1177978515625, "loss_aux_layer_14": 0.1300048828125, "loss_aux_layer_15": 0.14208984375, "loss_aux_layer_16": 0.153564453125, "loss_aux_layer_17": 0.161376953125, "loss_aux_layer_18": 0.170654296875, "loss_aux_layer_19": 0.171142578125, "loss_aux_layer_2": 0.0859375, "loss_aux_layer_20": 0.17724609375, "loss_aux_layer_21": 0.181640625, "loss_aux_layer_22": 0.201416015625, "loss_aux_layer_23": 0.241943359375, "loss_aux_layer_3": 0.0977783203125, "loss_aux_layer_4": 0.099853515625, "loss_aux_layer_5": 0.1011962890625, "loss_aux_layer_6": 0.1038818359375, "loss_aux_layer_7": 0.09912109375, "loss_aux_layer_8": 0.096923828125, "loss_aux_layer_9": 0.094970703125, "step": 899, "total_loss": 0.7244363129138947 }, { "epoch": 0.1781825381112651, "grad_norm": 1.2713162899017334, "learning_rate": 5e-05, "llm_loss": 0.707492858171463, "loss": 3.3406, "loss_aux_layer_0": 0.025604248046875, "loss_aux_layer_1": 0.075927734375, "loss_aux_layer_10": 0.0953369140625, "loss_aux_layer_11": 0.1014404296875, "loss_aux_layer_12": 0.1099853515625, "loss_aux_layer_13": 0.11865234375, "loss_aux_layer_14": 0.1322021484375, "loss_aux_layer_15": 0.14453125, "loss_aux_layer_16": 0.156494140625, "loss_aux_layer_17": 0.164794921875, "loss_aux_layer_18": 0.174560546875, "loss_aux_layer_19": 0.17529296875, "loss_aux_layer_2": 0.0836181640625, "loss_aux_layer_20": 0.180908203125, "loss_aux_layer_21": 0.18603515625, "loss_aux_layer_22": 0.20751953125, "loss_aux_layer_23": 0.249755859375, "loss_aux_layer_3": 0.09521484375, "loss_aux_layer_4": 0.0977783203125, "loss_aux_layer_5": 0.0989990234375, "loss_aux_layer_6": 0.1021728515625, "loss_aux_layer_7": 0.097900390625, "loss_aux_layer_8": 0.095703125, "loss_aux_layer_9": 0.093994140625, "step": 900, "total_loss": 0.835148498415947 }, { "epoch": 0.1783805187091665, "grad_norm": 1.6120399236679077, "learning_rate": 5e-05, "llm_loss": 0.7000473290681839, "loss": 3.3017, "loss_aux_layer_0": 0.02618408203125, "loss_aux_layer_1": 0.0771484375, "loss_aux_layer_10": 0.0948486328125, "loss_aux_layer_11": 0.1004638671875, "loss_aux_layer_12": 0.1085205078125, "loss_aux_layer_13": 0.116455078125, "loss_aux_layer_14": 0.12890625, "loss_aux_layer_15": 0.140380859375, "loss_aux_layer_16": 0.1513671875, "loss_aux_layer_17": 0.15869140625, "loss_aux_layer_18": 0.167724609375, "loss_aux_layer_19": 0.167724609375, "loss_aux_layer_2": 0.0859375, "loss_aux_layer_20": 0.173583984375, "loss_aux_layer_21": 0.17919921875, "loss_aux_layer_22": 0.200439453125, "loss_aux_layer_23": 0.241455078125, "loss_aux_layer_3": 0.097412109375, "loss_aux_layer_4": 0.0997314453125, "loss_aux_layer_5": 0.1009521484375, "loss_aux_layer_6": 0.103759765625, "loss_aux_layer_7": 0.0986328125, "loss_aux_layer_8": 0.0963134765625, "loss_aux_layer_9": 0.09423828125, "step": 901, "total_loss": 0.8254210203886032 }, { "epoch": 0.1785784993070679, "grad_norm": 2.3718693256378174, "learning_rate": 5e-05, "llm_loss": 0.6773792505264282, "loss": 3.2061, "loss_aux_layer_0": 0.02728271484375, "loss_aux_layer_1": 0.0738525390625, "loss_aux_layer_10": 0.0933837890625, "loss_aux_layer_11": 0.0992431640625, "loss_aux_layer_12": 0.107421875, "loss_aux_layer_13": 0.11572265625, "loss_aux_layer_14": 0.128173828125, "loss_aux_layer_15": 0.139892578125, "loss_aux_layer_16": 0.151611328125, "loss_aux_layer_17": 0.159423828125, "loss_aux_layer_18": 0.168212890625, "loss_aux_layer_19": 0.1689453125, "loss_aux_layer_2": 0.081298828125, "loss_aux_layer_20": 0.1748046875, "loss_aux_layer_21": 0.180419921875, "loss_aux_layer_22": 0.20068359375, "loss_aux_layer_23": 0.241455078125, "loss_aux_layer_3": 0.0938720703125, "loss_aux_layer_4": 0.095947265625, "loss_aux_layer_5": 0.0975341796875, "loss_aux_layer_6": 0.100341796875, "loss_aux_layer_7": 0.095947265625, "loss_aux_layer_8": 0.093505859375, "loss_aux_layer_9": 0.0919189453125, "step": 902, "total_loss": 0.8015360534191132 }, { "epoch": 0.1787764799049693, "grad_norm": 2.615588903427124, "learning_rate": 5e-05, "llm_loss": 0.5930003821849823, "loss": 2.8876, "loss_aux_layer_0": 0.027984619140625, "loss_aux_layer_1": 0.0787353515625, "loss_aux_layer_10": 0.0970458984375, "loss_aux_layer_11": 0.1031494140625, "loss_aux_layer_12": 0.1121826171875, "loss_aux_layer_13": 0.1209716796875, "loss_aux_layer_14": 0.1337890625, "loss_aux_layer_15": 0.145751953125, "loss_aux_layer_16": 0.156982421875, "loss_aux_layer_17": 0.163818359375, "loss_aux_layer_18": 0.172119140625, "loss_aux_layer_19": 0.172607421875, "loss_aux_layer_2": 0.08740234375, "loss_aux_layer_20": 0.177734375, "loss_aux_layer_21": 0.183837890625, "loss_aux_layer_22": 0.20556640625, "loss_aux_layer_23": 0.249267578125, "loss_aux_layer_3": 0.098876953125, "loss_aux_layer_4": 0.101318359375, "loss_aux_layer_5": 0.1036376953125, "loss_aux_layer_6": 0.105712890625, "loss_aux_layer_7": 0.100341796875, "loss_aux_layer_8": 0.0980224609375, "loss_aux_layer_9": 0.09619140625, "step": 903, "total_loss": 0.721888855099678 }, { "epoch": 0.17897446050287072, "grad_norm": 1.7466931343078613, "learning_rate": 5e-05, "llm_loss": 0.6296731233596802, "loss": 3.0181, "loss_aux_layer_0": 0.026275634765625, "loss_aux_layer_1": 0.07568359375, "loss_aux_layer_10": 0.0938720703125, "loss_aux_layer_11": 0.1002197265625, "loss_aux_layer_12": 0.1082763671875, "loss_aux_layer_13": 0.1162109375, "loss_aux_layer_14": 0.1287841796875, "loss_aux_layer_15": 0.140625, "loss_aux_layer_16": 0.15185546875, "loss_aux_layer_17": 0.159423828125, "loss_aux_layer_18": 0.16796875, "loss_aux_layer_19": 0.169189453125, "loss_aux_layer_2": 0.0836181640625, "loss_aux_layer_20": 0.17529296875, "loss_aux_layer_21": 0.180419921875, "loss_aux_layer_22": 0.20166015625, "loss_aux_layer_23": 0.243896484375, "loss_aux_layer_3": 0.0946044921875, "loss_aux_layer_4": 0.0968017578125, "loss_aux_layer_5": 0.09814453125, "loss_aux_layer_6": 0.1005859375, "loss_aux_layer_7": 0.0963134765625, "loss_aux_layer_8": 0.094482421875, "loss_aux_layer_9": 0.0928955078125, "step": 904, "total_loss": 0.7545256018638611 }, { "epoch": 0.17917244110077213, "grad_norm": 1.6023249626159668, "learning_rate": 5e-05, "llm_loss": 0.5938170850276947, "loss": 2.8864, "loss_aux_layer_0": 0.029510498046875, "loss_aux_layer_1": 0.0797119140625, "loss_aux_layer_10": 0.095703125, "loss_aux_layer_11": 0.1021728515625, "loss_aux_layer_12": 0.1109619140625, "loss_aux_layer_13": 0.1192626953125, "loss_aux_layer_14": 0.131591796875, "loss_aux_layer_15": 0.143310546875, "loss_aux_layer_16": 0.154296875, "loss_aux_layer_17": 0.160888671875, "loss_aux_layer_18": 0.170166015625, "loss_aux_layer_19": 0.170654296875, "loss_aux_layer_2": 0.0869140625, "loss_aux_layer_20": 0.1767578125, "loss_aux_layer_21": 0.183837890625, "loss_aux_layer_22": 0.20703125, "loss_aux_layer_23": 0.251708984375, "loss_aux_layer_3": 0.09765625, "loss_aux_layer_4": 0.0994873046875, "loss_aux_layer_5": 0.1009521484375, "loss_aux_layer_6": 0.103515625, "loss_aux_layer_7": 0.09912109375, "loss_aux_layer_8": 0.0966796875, "loss_aux_layer_9": 0.0950927734375, "step": 905, "total_loss": 0.72161003947258 }, { "epoch": 0.17937042169867354, "grad_norm": 2.546182632446289, "learning_rate": 5e-05, "llm_loss": 0.5760027766227722, "loss": 2.7956, "loss_aux_layer_0": 0.02630615234375, "loss_aux_layer_1": 0.0689697265625, "loss_aux_layer_10": 0.0899658203125, "loss_aux_layer_11": 0.095947265625, "loss_aux_layer_12": 0.1044921875, "loss_aux_layer_13": 0.113037109375, "loss_aux_layer_14": 0.1259765625, "loss_aux_layer_15": 0.138671875, "loss_aux_layer_16": 0.150390625, "loss_aux_layer_17": 0.158447265625, "loss_aux_layer_18": 0.16845703125, "loss_aux_layer_19": 0.1708984375, "loss_aux_layer_2": 0.079345703125, "loss_aux_layer_20": 0.177490234375, "loss_aux_layer_21": 0.1845703125, "loss_aux_layer_22": 0.206298828125, "loss_aux_layer_23": 0.250732421875, "loss_aux_layer_3": 0.08935546875, "loss_aux_layer_4": 0.0911865234375, "loss_aux_layer_5": 0.0928955078125, "loss_aux_layer_6": 0.0953369140625, "loss_aux_layer_7": 0.0914306640625, "loss_aux_layer_8": 0.0897216796875, "loss_aux_layer_9": 0.0887451171875, "step": 906, "total_loss": 0.6988988220691681 }, { "epoch": 0.17956840229657495, "grad_norm": 2.721071481704712, "learning_rate": 5e-05, "llm_loss": 0.6076016947627068, "loss": 2.9227, "loss_aux_layer_0": 0.026641845703125, "loss_aux_layer_1": 0.0753173828125, "loss_aux_layer_10": 0.092529296875, "loss_aux_layer_11": 0.0982666015625, "loss_aux_layer_12": 0.1063232421875, "loss_aux_layer_13": 0.1138916015625, "loss_aux_layer_14": 0.125732421875, "loss_aux_layer_15": 0.13671875, "loss_aux_layer_16": 0.147705078125, "loss_aux_layer_17": 0.155029296875, "loss_aux_layer_18": 0.1640625, "loss_aux_layer_19": 0.164794921875, "loss_aux_layer_2": 0.083984375, "loss_aux_layer_20": 0.171142578125, "loss_aux_layer_21": 0.177734375, "loss_aux_layer_22": 0.198974609375, "loss_aux_layer_23": 0.241943359375, "loss_aux_layer_3": 0.0950927734375, "loss_aux_layer_4": 0.0970458984375, "loss_aux_layer_5": 0.098388671875, "loss_aux_layer_6": 0.100341796875, "loss_aux_layer_7": 0.0955810546875, "loss_aux_layer_8": 0.0936279296875, "loss_aux_layer_9": 0.0916748046875, "step": 907, "total_loss": 0.7306724041700363 }, { "epoch": 0.17976638289447633, "grad_norm": 2.3341314792633057, "learning_rate": 5e-05, "llm_loss": 0.6102591753005981, "loss": 2.9624, "loss_aux_layer_0": 0.025543212890625, "loss_aux_layer_1": 0.0775146484375, "loss_aux_layer_10": 0.100341796875, "loss_aux_layer_11": 0.106689453125, "loss_aux_layer_12": 0.1148681640625, "loss_aux_layer_13": 0.123046875, "loss_aux_layer_14": 0.134765625, "loss_aux_layer_15": 0.14599609375, "loss_aux_layer_16": 0.1572265625, "loss_aux_layer_17": 0.16357421875, "loss_aux_layer_18": 0.172607421875, "loss_aux_layer_19": 0.172119140625, "loss_aux_layer_2": 0.0892333984375, "loss_aux_layer_20": 0.176513671875, "loss_aux_layer_21": 0.18310546875, "loss_aux_layer_22": 0.205078125, "loss_aux_layer_23": 0.248046875, "loss_aux_layer_3": 0.1025390625, "loss_aux_layer_4": 0.1058349609375, "loss_aux_layer_5": 0.1083984375, "loss_aux_layer_6": 0.10986328125, "loss_aux_layer_7": 0.1044921875, "loss_aux_layer_8": 0.1016845703125, "loss_aux_layer_9": 0.099365234375, "step": 908, "total_loss": 0.740595281124115 }, { "epoch": 0.17996436349237774, "grad_norm": 1.9776428937911987, "learning_rate": 5e-05, "llm_loss": 0.5819093585014343, "loss": 2.8464, "loss_aux_layer_0": 0.02764892578125, "loss_aux_layer_1": 0.0780029296875, "loss_aux_layer_10": 0.09765625, "loss_aux_layer_11": 0.103515625, "loss_aux_layer_12": 0.1114501953125, "loss_aux_layer_13": 0.1197509765625, "loss_aux_layer_14": 0.132568359375, "loss_aux_layer_15": 0.145263671875, "loss_aux_layer_16": 0.15673828125, "loss_aux_layer_17": 0.163818359375, "loss_aux_layer_18": 0.17333984375, "loss_aux_layer_19": 0.174072265625, "loss_aux_layer_2": 0.087646484375, "loss_aux_layer_20": 0.1806640625, "loss_aux_layer_21": 0.18701171875, "loss_aux_layer_22": 0.2080078125, "loss_aux_layer_23": 0.2509765625, "loss_aux_layer_3": 0.1005859375, "loss_aux_layer_4": 0.1029052734375, "loss_aux_layer_5": 0.1046142578125, "loss_aux_layer_6": 0.107421875, "loss_aux_layer_7": 0.102294921875, "loss_aux_layer_8": 0.0994873046875, "loss_aux_layer_9": 0.09716796875, "step": 909, "total_loss": 0.7116069942712784 }, { "epoch": 0.18016234409027915, "grad_norm": 1.671722173690796, "learning_rate": 5e-05, "llm_loss": 0.6147506088018417, "loss": 2.9617, "loss_aux_layer_0": 0.0262451171875, "loss_aux_layer_1": 0.0733642578125, "loss_aux_layer_10": 0.0933837890625, "loss_aux_layer_11": 0.0994873046875, "loss_aux_layer_12": 0.10791015625, "loss_aux_layer_13": 0.116455078125, "loss_aux_layer_14": 0.1298828125, "loss_aux_layer_15": 0.141357421875, "loss_aux_layer_16": 0.15380859375, "loss_aux_layer_17": 0.16162109375, "loss_aux_layer_18": 0.171142578125, "loss_aux_layer_19": 0.173095703125, "loss_aux_layer_2": 0.083251953125, "loss_aux_layer_20": 0.178466796875, "loss_aux_layer_21": 0.18359375, "loss_aux_layer_22": 0.204833984375, "loss_aux_layer_23": 0.248046875, "loss_aux_layer_3": 0.0943603515625, "loss_aux_layer_4": 0.09619140625, "loss_aux_layer_5": 0.0982666015625, "loss_aux_layer_6": 0.1002197265625, "loss_aux_layer_7": 0.0953369140625, "loss_aux_layer_8": 0.0938720703125, "loss_aux_layer_9": 0.092041015625, "step": 910, "total_loss": 0.740421399474144 }, { "epoch": 0.18036032468818056, "grad_norm": 2.466310501098633, "learning_rate": 5e-05, "llm_loss": 0.6011406034231186, "loss": 2.9073, "loss_aux_layer_0": 0.0274658203125, "loss_aux_layer_1": 0.0758056640625, "loss_aux_layer_10": 0.0950927734375, "loss_aux_layer_11": 0.100830078125, "loss_aux_layer_12": 0.10888671875, "loss_aux_layer_13": 0.1165771484375, "loss_aux_layer_14": 0.129150390625, "loss_aux_layer_15": 0.140869140625, "loss_aux_layer_16": 0.152587890625, "loss_aux_layer_17": 0.159912109375, "loss_aux_layer_18": 0.16943359375, "loss_aux_layer_19": 0.169921875, "loss_aux_layer_2": 0.08447265625, "loss_aux_layer_20": 0.175537109375, "loss_aux_layer_21": 0.180908203125, "loss_aux_layer_22": 0.200927734375, "loss_aux_layer_23": 0.243408203125, "loss_aux_layer_3": 0.0960693359375, "loss_aux_layer_4": 0.098388671875, "loss_aux_layer_5": 0.10009765625, "loss_aux_layer_6": 0.1029052734375, "loss_aux_layer_7": 0.098388671875, "loss_aux_layer_8": 0.0958251953125, "loss_aux_layer_9": 0.0938720703125, "step": 911, "total_loss": 0.7268340885639191 }, { "epoch": 0.18055830528608197, "grad_norm": 0.950183093547821, "learning_rate": 5e-05, "llm_loss": 0.6987102180719376, "loss": 3.3032, "loss_aux_layer_0": 0.026123046875, "loss_aux_layer_1": 0.0762939453125, "loss_aux_layer_10": 0.096435546875, "loss_aux_layer_11": 0.102294921875, "loss_aux_layer_12": 0.1109619140625, "loss_aux_layer_13": 0.119384765625, "loss_aux_layer_14": 0.1318359375, "loss_aux_layer_15": 0.1435546875, "loss_aux_layer_16": 0.1552734375, "loss_aux_layer_17": 0.162841796875, "loss_aux_layer_18": 0.171875, "loss_aux_layer_19": 0.17236328125, "loss_aux_layer_2": 0.083984375, "loss_aux_layer_20": 0.177978515625, "loss_aux_layer_21": 0.18310546875, "loss_aux_layer_22": 0.202880859375, "loss_aux_layer_23": 0.24462890625, "loss_aux_layer_3": 0.095947265625, "loss_aux_layer_4": 0.0987548828125, "loss_aux_layer_5": 0.1004638671875, "loss_aux_layer_6": 0.1031494140625, "loss_aux_layer_7": 0.0986328125, "loss_aux_layer_8": 0.0965576171875, "loss_aux_layer_9": 0.09521484375, "step": 912, "total_loss": 0.8257930278778076 }, { "epoch": 0.18075628588398338, "grad_norm": 2.1713991165161133, "learning_rate": 5e-05, "llm_loss": 0.7219216972589493, "loss": 3.3866, "loss_aux_layer_0": 0.0250244140625, "loss_aux_layer_1": 0.07373046875, "loss_aux_layer_10": 0.0941162109375, "loss_aux_layer_11": 0.0997314453125, "loss_aux_layer_12": 0.107666015625, "loss_aux_layer_13": 0.115234375, "loss_aux_layer_14": 0.12744140625, "loss_aux_layer_15": 0.139404296875, "loss_aux_layer_16": 0.150634765625, "loss_aux_layer_17": 0.1591796875, "loss_aux_layer_18": 0.16796875, "loss_aux_layer_19": 0.16943359375, "loss_aux_layer_2": 0.0823974609375, "loss_aux_layer_20": 0.175537109375, "loss_aux_layer_21": 0.181640625, "loss_aux_layer_22": 0.203857421875, "loss_aux_layer_23": 0.24658203125, "loss_aux_layer_3": 0.0938720703125, "loss_aux_layer_4": 0.09619140625, "loss_aux_layer_5": 0.097900390625, "loss_aux_layer_6": 0.100830078125, "loss_aux_layer_7": 0.0965576171875, "loss_aux_layer_8": 0.0948486328125, "loss_aux_layer_9": 0.0928955078125, "step": 913, "total_loss": 0.8466379791498184 }, { "epoch": 0.18095426648188478, "grad_norm": 0.9247862100601196, "learning_rate": 5e-05, "llm_loss": 0.6102811247110367, "loss": 2.9436, "loss_aux_layer_0": 0.027801513671875, "loss_aux_layer_1": 0.079833984375, "loss_aux_layer_10": 0.0931396484375, "loss_aux_layer_11": 0.098876953125, "loss_aux_layer_12": 0.1072998046875, "loss_aux_layer_13": 0.1153564453125, "loss_aux_layer_14": 0.1282958984375, "loss_aux_layer_15": 0.14013671875, "loss_aux_layer_16": 0.151611328125, "loss_aux_layer_17": 0.15966796875, "loss_aux_layer_18": 0.169189453125, "loss_aux_layer_19": 0.17041015625, "loss_aux_layer_2": 0.0855712890625, "loss_aux_layer_20": 0.17626953125, "loss_aux_layer_21": 0.18212890625, "loss_aux_layer_22": 0.205078125, "loss_aux_layer_23": 0.24755859375, "loss_aux_layer_3": 0.0960693359375, "loss_aux_layer_4": 0.097900390625, "loss_aux_layer_5": 0.0987548828125, "loss_aux_layer_6": 0.1009521484375, "loss_aux_layer_7": 0.0960693359375, "loss_aux_layer_8": 0.0941162109375, "loss_aux_layer_9": 0.0919189453125, "step": 914, "total_loss": 0.7358917742967606 }, { "epoch": 0.1811522470797862, "grad_norm": 2.4066643714904785, "learning_rate": 5e-05, "llm_loss": 0.6295308917760849, "loss": 3.0193, "loss_aux_layer_0": 0.0267333984375, "loss_aux_layer_1": 0.0751953125, "loss_aux_layer_10": 0.09423828125, "loss_aux_layer_11": 0.10009765625, "loss_aux_layer_12": 0.108154296875, "loss_aux_layer_13": 0.1168212890625, "loss_aux_layer_14": 0.1295166015625, "loss_aux_layer_15": 0.141845703125, "loss_aux_layer_16": 0.153564453125, "loss_aux_layer_17": 0.160888671875, "loss_aux_layer_18": 0.169677734375, "loss_aux_layer_19": 0.1708984375, "loss_aux_layer_2": 0.082275390625, "loss_aux_layer_20": 0.17626953125, "loss_aux_layer_21": 0.181396484375, "loss_aux_layer_22": 0.201416015625, "loss_aux_layer_23": 0.2431640625, "loss_aux_layer_3": 0.0941162109375, "loss_aux_layer_4": 0.0965576171875, "loss_aux_layer_5": 0.0982666015625, "loss_aux_layer_6": 0.1011962890625, "loss_aux_layer_7": 0.0968017578125, "loss_aux_layer_8": 0.0948486328125, "loss_aux_layer_9": 0.093017578125, "step": 915, "total_loss": 0.7548282593488693 }, { "epoch": 0.18135022767768758, "grad_norm": 1.748380422592163, "learning_rate": 5e-05, "llm_loss": 0.625917062163353, "loss": 3.0087, "loss_aux_layer_0": 0.027557373046875, "loss_aux_layer_1": 0.0787353515625, "loss_aux_layer_10": 0.094970703125, "loss_aux_layer_11": 0.1011962890625, "loss_aux_layer_12": 0.1090087890625, "loss_aux_layer_13": 0.116943359375, "loss_aux_layer_14": 0.129150390625, "loss_aux_layer_15": 0.140869140625, "loss_aux_layer_16": 0.15283203125, "loss_aux_layer_17": 0.16015625, "loss_aux_layer_18": 0.1689453125, "loss_aux_layer_19": 0.1708984375, "loss_aux_layer_2": 0.086181640625, "loss_aux_layer_20": 0.17626953125, "loss_aux_layer_21": 0.181396484375, "loss_aux_layer_22": 0.201416015625, "loss_aux_layer_23": 0.242919921875, "loss_aux_layer_3": 0.0977783203125, "loss_aux_layer_4": 0.0999755859375, "loss_aux_layer_5": 0.101318359375, "loss_aux_layer_6": 0.1033935546875, "loss_aux_layer_7": 0.098388671875, "loss_aux_layer_8": 0.09619140625, "loss_aux_layer_9": 0.0938720703125, "step": 916, "total_loss": 0.7521627098321915 }, { "epoch": 0.18154820827558898, "grad_norm": 1.2821149826049805, "learning_rate": 5e-05, "llm_loss": 0.64908567070961, "loss": 3.1041, "loss_aux_layer_0": 0.026458740234375, "loss_aux_layer_1": 0.077392578125, "loss_aux_layer_10": 0.0960693359375, "loss_aux_layer_11": 0.1024169921875, "loss_aux_layer_12": 0.1107177734375, "loss_aux_layer_13": 0.1187744140625, "loss_aux_layer_14": 0.131103515625, "loss_aux_layer_15": 0.1435546875, "loss_aux_layer_16": 0.155517578125, "loss_aux_layer_17": 0.162841796875, "loss_aux_layer_18": 0.17138671875, "loss_aux_layer_19": 0.171875, "loss_aux_layer_2": 0.0838623046875, "loss_aux_layer_20": 0.177734375, "loss_aux_layer_21": 0.1826171875, "loss_aux_layer_22": 0.203369140625, "loss_aux_layer_23": 0.24462890625, "loss_aux_layer_3": 0.0955810546875, "loss_aux_layer_4": 0.0985107421875, "loss_aux_layer_5": 0.100341796875, "loss_aux_layer_6": 0.1029052734375, "loss_aux_layer_7": 0.098388671875, "loss_aux_layer_8": 0.09619140625, "loss_aux_layer_9": 0.0943603515625, "step": 917, "total_loss": 0.7760240137577057 }, { "epoch": 0.1817461888734904, "grad_norm": 1.3146651983261108, "learning_rate": 5e-05, "llm_loss": 0.7175363749265671, "loss": 3.3763, "loss_aux_layer_0": 0.028045654296875, "loss_aux_layer_1": 0.0780029296875, "loss_aux_layer_10": 0.09619140625, "loss_aux_layer_11": 0.1024169921875, "loss_aux_layer_12": 0.1103515625, "loss_aux_layer_13": 0.118408203125, "loss_aux_layer_14": 0.130126953125, "loss_aux_layer_15": 0.141357421875, "loss_aux_layer_16": 0.152587890625, "loss_aux_layer_17": 0.159912109375, "loss_aux_layer_18": 0.169189453125, "loss_aux_layer_19": 0.169921875, "loss_aux_layer_2": 0.0858154296875, "loss_aux_layer_20": 0.176025390625, "loss_aux_layer_21": 0.18017578125, "loss_aux_layer_22": 0.199951171875, "loss_aux_layer_23": 0.240478515625, "loss_aux_layer_3": 0.0972900390625, "loss_aux_layer_4": 0.100341796875, "loss_aux_layer_5": 0.10205078125, "loss_aux_layer_6": 0.1048583984375, "loss_aux_layer_7": 0.10009765625, "loss_aux_layer_8": 0.09765625, "loss_aux_layer_9": 0.09521484375, "step": 918, "total_loss": 0.8440730571746826 }, { "epoch": 0.1819441694713918, "grad_norm": 1.8207504749298096, "learning_rate": 5e-05, "llm_loss": 0.6472627371549606, "loss": 3.0793, "loss_aux_layer_0": 0.026092529296875, "loss_aux_layer_1": 0.071533203125, "loss_aux_layer_10": 0.0904541015625, "loss_aux_layer_11": 0.0963134765625, "loss_aux_layer_12": 0.104736328125, "loss_aux_layer_13": 0.1131591796875, "loss_aux_layer_14": 0.1259765625, "loss_aux_layer_15": 0.13818359375, "loss_aux_layer_16": 0.14990234375, "loss_aux_layer_17": 0.15771484375, "loss_aux_layer_18": 0.16650390625, "loss_aux_layer_19": 0.16796875, "loss_aux_layer_2": 0.080322265625, "loss_aux_layer_20": 0.174072265625, "loss_aux_layer_21": 0.18017578125, "loss_aux_layer_22": 0.201171875, "loss_aux_layer_23": 0.244384765625, "loss_aux_layer_3": 0.091552734375, "loss_aux_layer_4": 0.093505859375, "loss_aux_layer_5": 0.095458984375, "loss_aux_layer_6": 0.097900390625, "loss_aux_layer_7": 0.093017578125, "loss_aux_layer_8": 0.0911865234375, "loss_aux_layer_9": 0.0894775390625, "step": 919, "total_loss": 0.7698238492012024 }, { "epoch": 0.1821421500692932, "grad_norm": 0.9157879948616028, "learning_rate": 5e-05, "llm_loss": 0.6423311084508896, "loss": 3.0668, "loss_aux_layer_0": 0.025726318359375, "loss_aux_layer_1": 0.0753173828125, "loss_aux_layer_10": 0.093505859375, "loss_aux_layer_11": 0.0997314453125, "loss_aux_layer_12": 0.1077880859375, "loss_aux_layer_13": 0.1160888671875, "loss_aux_layer_14": 0.128173828125, "loss_aux_layer_15": 0.139892578125, "loss_aux_layer_16": 0.150634765625, "loss_aux_layer_17": 0.158203125, "loss_aux_layer_18": 0.167236328125, "loss_aux_layer_19": 0.167236328125, "loss_aux_layer_2": 0.082763671875, "loss_aux_layer_20": 0.173095703125, "loss_aux_layer_21": 0.1806640625, "loss_aux_layer_22": 0.202392578125, "loss_aux_layer_23": 0.243896484375, "loss_aux_layer_3": 0.09423828125, "loss_aux_layer_4": 0.0966796875, "loss_aux_layer_5": 0.0980224609375, "loss_aux_layer_6": 0.100830078125, "loss_aux_layer_7": 0.0963134765625, "loss_aux_layer_8": 0.0941162109375, "loss_aux_layer_9": 0.092529296875, "step": 920, "total_loss": 0.7666918337345123 }, { "epoch": 0.18234013066719462, "grad_norm": 1.3434025049209595, "learning_rate": 5e-05, "llm_loss": 0.6163619384169579, "loss": 2.9684, "loss_aux_layer_0": 0.02899169921875, "loss_aux_layer_1": 0.0751953125, "loss_aux_layer_10": 0.093505859375, "loss_aux_layer_11": 0.0997314453125, "loss_aux_layer_12": 0.1082763671875, "loss_aux_layer_13": 0.11669921875, "loss_aux_layer_14": 0.1298828125, "loss_aux_layer_15": 0.141845703125, "loss_aux_layer_16": 0.154052734375, "loss_aux_layer_17": 0.16162109375, "loss_aux_layer_18": 0.17041015625, "loss_aux_layer_19": 0.172119140625, "loss_aux_layer_2": 0.0819091796875, "loss_aux_layer_20": 0.178466796875, "loss_aux_layer_21": 0.18408203125, "loss_aux_layer_22": 0.20556640625, "loss_aux_layer_23": 0.247802734375, "loss_aux_layer_3": 0.093017578125, "loss_aux_layer_4": 0.09521484375, "loss_aux_layer_5": 0.097412109375, "loss_aux_layer_6": 0.100341796875, "loss_aux_layer_7": 0.095458984375, "loss_aux_layer_8": 0.09375, "loss_aux_layer_9": 0.0919189453125, "step": 921, "total_loss": 0.7420995980501175 }, { "epoch": 0.18253811126509603, "grad_norm": 1.0028105974197388, "learning_rate": 5e-05, "llm_loss": 0.5811598896980286, "loss": 2.8408, "loss_aux_layer_0": 0.024871826171875, "loss_aux_layer_1": 0.0780029296875, "loss_aux_layer_10": 0.0982666015625, "loss_aux_layer_11": 0.1044921875, "loss_aux_layer_12": 0.1129150390625, "loss_aux_layer_13": 0.12109375, "loss_aux_layer_14": 0.133056640625, "loss_aux_layer_15": 0.14453125, "loss_aux_layer_16": 0.1552734375, "loss_aux_layer_17": 0.162841796875, "loss_aux_layer_18": 0.172607421875, "loss_aux_layer_19": 0.17236328125, "loss_aux_layer_2": 0.0865478515625, "loss_aux_layer_20": 0.178466796875, "loss_aux_layer_21": 0.185302734375, "loss_aux_layer_22": 0.208740234375, "loss_aux_layer_23": 0.250244140625, "loss_aux_layer_3": 0.0989990234375, "loss_aux_layer_4": 0.1016845703125, "loss_aux_layer_5": 0.103271484375, "loss_aux_layer_6": 0.105712890625, "loss_aux_layer_7": 0.1014404296875, "loss_aux_layer_8": 0.099609375, "loss_aux_layer_9": 0.09716796875, "step": 922, "total_loss": 0.7102061957120895 }, { "epoch": 0.1827360918629974, "grad_norm": 1.2370121479034424, "learning_rate": 5e-05, "llm_loss": 0.5983878374099731, "loss": 2.8871, "loss_aux_layer_0": 0.0264892578125, "loss_aux_layer_1": 0.0751953125, "loss_aux_layer_10": 0.0926513671875, "loss_aux_layer_11": 0.098388671875, "loss_aux_layer_12": 0.106201171875, "loss_aux_layer_13": 0.114013671875, "loss_aux_layer_14": 0.126220703125, "loss_aux_layer_15": 0.13818359375, "loss_aux_layer_16": 0.1494140625, "loss_aux_layer_17": 0.156982421875, "loss_aux_layer_18": 0.16552734375, "loss_aux_layer_19": 0.16650390625, "loss_aux_layer_2": 0.082275390625, "loss_aux_layer_20": 0.172607421875, "loss_aux_layer_21": 0.17919921875, "loss_aux_layer_22": 0.200927734375, "loss_aux_layer_23": 0.243408203125, "loss_aux_layer_3": 0.0941162109375, "loss_aux_layer_4": 0.095947265625, "loss_aux_layer_5": 0.09765625, "loss_aux_layer_6": 0.099853515625, "loss_aux_layer_7": 0.0953369140625, "loss_aux_layer_8": 0.0933837890625, "loss_aux_layer_9": 0.091064453125, "step": 923, "total_loss": 0.7217734307050705 }, { "epoch": 0.18293407246089882, "grad_norm": 1.3356561660766602, "learning_rate": 5e-05, "llm_loss": 0.679144948720932, "loss": 3.2169, "loss_aux_layer_0": 0.027099609375, "loss_aux_layer_1": 0.075439453125, "loss_aux_layer_10": 0.0938720703125, "loss_aux_layer_11": 0.099853515625, "loss_aux_layer_12": 0.1075439453125, "loss_aux_layer_13": 0.115478515625, "loss_aux_layer_14": 0.128662109375, "loss_aux_layer_15": 0.140869140625, "loss_aux_layer_16": 0.152099609375, "loss_aux_layer_17": 0.15966796875, "loss_aux_layer_18": 0.168701171875, "loss_aux_layer_19": 0.1708984375, "loss_aux_layer_2": 0.083251953125, "loss_aux_layer_20": 0.176513671875, "loss_aux_layer_21": 0.181884765625, "loss_aux_layer_22": 0.20263671875, "loss_aux_layer_23": 0.243896484375, "loss_aux_layer_3": 0.0946044921875, "loss_aux_layer_4": 0.0966796875, "loss_aux_layer_5": 0.098388671875, "loss_aux_layer_6": 0.1009521484375, "loss_aux_layer_7": 0.09619140625, "loss_aux_layer_8": 0.09423828125, "loss_aux_layer_9": 0.0927734375, "step": 924, "total_loss": 0.8042345941066742 }, { "epoch": 0.18313205305880023, "grad_norm": 1.0751692056655884, "learning_rate": 5e-05, "llm_loss": 0.6031570062041283, "loss": 2.8977, "loss_aux_layer_0": 0.02630615234375, "loss_aux_layer_1": 0.0714111328125, "loss_aux_layer_10": 0.0899658203125, "loss_aux_layer_11": 0.095458984375, "loss_aux_layer_12": 0.1036376953125, "loss_aux_layer_13": 0.1116943359375, "loss_aux_layer_14": 0.1239013671875, "loss_aux_layer_15": 0.135498046875, "loss_aux_layer_16": 0.146240234375, "loss_aux_layer_17": 0.154052734375, "loss_aux_layer_18": 0.163330078125, "loss_aux_layer_19": 0.164794921875, "loss_aux_layer_2": 0.0791015625, "loss_aux_layer_20": 0.171630859375, "loss_aux_layer_21": 0.1787109375, "loss_aux_layer_22": 0.201904296875, "loss_aux_layer_23": 0.244873046875, "loss_aux_layer_3": 0.0908203125, "loss_aux_layer_4": 0.0927734375, "loss_aux_layer_5": 0.0947265625, "loss_aux_layer_6": 0.0970458984375, "loss_aux_layer_7": 0.0927734375, "loss_aux_layer_8": 0.0909423828125, "loss_aux_layer_9": 0.0887451171875, "step": 925, "total_loss": 0.7244142889976501 }, { "epoch": 0.18333003365670164, "grad_norm": 1.4666845798492432, "learning_rate": 5e-05, "llm_loss": 0.7054042965173721, "loss": 3.328, "loss_aux_layer_0": 0.026611328125, "loss_aux_layer_1": 0.076904296875, "loss_aux_layer_10": 0.0950927734375, "loss_aux_layer_11": 0.101318359375, "loss_aux_layer_12": 0.1097412109375, "loss_aux_layer_13": 0.117919921875, "loss_aux_layer_14": 0.130615234375, "loss_aux_layer_15": 0.142822265625, "loss_aux_layer_16": 0.154296875, "loss_aux_layer_17": 0.16162109375, "loss_aux_layer_18": 0.17138671875, "loss_aux_layer_19": 0.173583984375, "loss_aux_layer_2": 0.08349609375, "loss_aux_layer_20": 0.1787109375, "loss_aux_layer_21": 0.183837890625, "loss_aux_layer_22": 0.203369140625, "loss_aux_layer_23": 0.244140625, "loss_aux_layer_3": 0.095458984375, "loss_aux_layer_4": 0.09814453125, "loss_aux_layer_5": 0.0994873046875, "loss_aux_layer_6": 0.1024169921875, "loss_aux_layer_7": 0.0977783203125, "loss_aux_layer_8": 0.095703125, "loss_aux_layer_9": 0.0938720703125, "step": 926, "total_loss": 0.8319899588823318 }, { "epoch": 0.18352801425460305, "grad_norm": 1.214800477027893, "learning_rate": 5e-05, "llm_loss": 0.584957629442215, "loss": 2.8488, "loss_aux_layer_0": 0.02783203125, "loss_aux_layer_1": 0.0753173828125, "loss_aux_layer_10": 0.0950927734375, "loss_aux_layer_11": 0.101318359375, "loss_aux_layer_12": 0.1097412109375, "loss_aux_layer_13": 0.1177978515625, "loss_aux_layer_14": 0.130615234375, "loss_aux_layer_15": 0.142578125, "loss_aux_layer_16": 0.154052734375, "loss_aux_layer_17": 0.16162109375, "loss_aux_layer_18": 0.171142578125, "loss_aux_layer_19": 0.1728515625, "loss_aux_layer_2": 0.083740234375, "loss_aux_layer_20": 0.17919921875, "loss_aux_layer_21": 0.185302734375, "loss_aux_layer_22": 0.208740234375, "loss_aux_layer_23": 0.251953125, "loss_aux_layer_3": 0.0958251953125, "loss_aux_layer_4": 0.098388671875, "loss_aux_layer_5": 0.0997314453125, "loss_aux_layer_6": 0.1021728515625, "loss_aux_layer_7": 0.097900390625, "loss_aux_layer_8": 0.0960693359375, "loss_aux_layer_9": 0.0941162109375, "step": 927, "total_loss": 0.7121904939413071 }, { "epoch": 0.18372599485250446, "grad_norm": 1.0307918787002563, "learning_rate": 5e-05, "llm_loss": 0.5306302383542061, "loss": 2.6166, "loss_aux_layer_0": 0.02557373046875, "loss_aux_layer_1": 0.0731201171875, "loss_aux_layer_10": 0.0931396484375, "loss_aux_layer_11": 0.0986328125, "loss_aux_layer_12": 0.106201171875, "loss_aux_layer_13": 0.1138916015625, "loss_aux_layer_14": 0.1256103515625, "loss_aux_layer_15": 0.136962890625, "loss_aux_layer_16": 0.148193359375, "loss_aux_layer_17": 0.15576171875, "loss_aux_layer_18": 0.165283203125, "loss_aux_layer_19": 0.167236328125, "loss_aux_layer_2": 0.081787109375, "loss_aux_layer_20": 0.173583984375, "loss_aux_layer_21": 0.180419921875, "loss_aux_layer_22": 0.202880859375, "loss_aux_layer_23": 0.24609375, "loss_aux_layer_3": 0.0936279296875, "loss_aux_layer_4": 0.095947265625, "loss_aux_layer_5": 0.0977783203125, "loss_aux_layer_6": 0.1002197265625, "loss_aux_layer_7": 0.0958251953125, "loss_aux_layer_8": 0.09375, "loss_aux_layer_9": 0.0916748046875, "step": 928, "total_loss": 0.6541409343481064 }, { "epoch": 0.18392397545040587, "grad_norm": 1.415459394454956, "learning_rate": 5e-05, "llm_loss": 0.6233289241790771, "loss": 2.9797, "loss_aux_layer_0": 0.027008056640625, "loss_aux_layer_1": 0.0711669921875, "loss_aux_layer_10": 0.090576171875, "loss_aux_layer_11": 0.0958251953125, "loss_aux_layer_12": 0.103515625, "loss_aux_layer_13": 0.111083984375, "loss_aux_layer_14": 0.123779296875, "loss_aux_layer_15": 0.1357421875, "loss_aux_layer_16": 0.1474609375, "loss_aux_layer_17": 0.1552734375, "loss_aux_layer_18": 0.1650390625, "loss_aux_layer_19": 0.166259765625, "loss_aux_layer_2": 0.078125, "loss_aux_layer_20": 0.172607421875, "loss_aux_layer_21": 0.179443359375, "loss_aux_layer_22": 0.2021484375, "loss_aux_layer_23": 0.245361328125, "loss_aux_layer_3": 0.08984375, "loss_aux_layer_4": 0.0924072265625, "loss_aux_layer_5": 0.094482421875, "loss_aux_layer_6": 0.0975341796875, "loss_aux_layer_7": 0.0933837890625, "loss_aux_layer_8": 0.09130859375, "loss_aux_layer_9": 0.08935546875, "step": 929, "total_loss": 0.7449301779270172 }, { "epoch": 0.18412195604830728, "grad_norm": 0.9010565876960754, "learning_rate": 5e-05, "llm_loss": 0.6834714859724045, "loss": 3.2332, "loss_aux_layer_0": 0.026092529296875, "loss_aux_layer_1": 0.0750732421875, "loss_aux_layer_10": 0.094482421875, "loss_aux_layer_11": 0.100341796875, "loss_aux_layer_12": 0.1082763671875, "loss_aux_layer_13": 0.115966796875, "loss_aux_layer_14": 0.12841796875, "loss_aux_layer_15": 0.139892578125, "loss_aux_layer_16": 0.15087890625, "loss_aux_layer_17": 0.158935546875, "loss_aux_layer_18": 0.167724609375, "loss_aux_layer_19": 0.168212890625, "loss_aux_layer_2": 0.083740234375, "loss_aux_layer_20": 0.174072265625, "loss_aux_layer_21": 0.17919921875, "loss_aux_layer_22": 0.2001953125, "loss_aux_layer_23": 0.2412109375, "loss_aux_layer_3": 0.095703125, "loss_aux_layer_4": 0.0985107421875, "loss_aux_layer_5": 0.0999755859375, "loss_aux_layer_6": 0.1026611328125, "loss_aux_layer_7": 0.09765625, "loss_aux_layer_8": 0.09521484375, "loss_aux_layer_9": 0.093505859375, "step": 930, "total_loss": 0.808303028345108 }, { "epoch": 0.18431993664620866, "grad_norm": 1.8961787223815918, "learning_rate": 5e-05, "llm_loss": 0.6055013090372086, "loss": 2.9213, "loss_aux_layer_0": 0.027801513671875, "loss_aux_layer_1": 0.0745849609375, "loss_aux_layer_10": 0.0941162109375, "loss_aux_layer_11": 0.099853515625, "loss_aux_layer_12": 0.10791015625, "loss_aux_layer_13": 0.1160888671875, "loss_aux_layer_14": 0.128662109375, "loss_aux_layer_15": 0.14013671875, "loss_aux_layer_16": 0.15087890625, "loss_aux_layer_17": 0.158447265625, "loss_aux_layer_18": 0.167236328125, "loss_aux_layer_19": 0.168212890625, "loss_aux_layer_2": 0.0816650390625, "loss_aux_layer_20": 0.17529296875, "loss_aux_layer_21": 0.1826171875, "loss_aux_layer_22": 0.20458984375, "loss_aux_layer_23": 0.247802734375, "loss_aux_layer_3": 0.0933837890625, "loss_aux_layer_4": 0.095458984375, "loss_aux_layer_5": 0.09716796875, "loss_aux_layer_6": 0.1004638671875, "loss_aux_layer_7": 0.0958251953125, "loss_aux_layer_8": 0.0943603515625, "loss_aux_layer_9": 0.0927734375, "step": 931, "total_loss": 0.7303294837474823 }, { "epoch": 0.18451791724411007, "grad_norm": 1.8392279148101807, "learning_rate": 5e-05, "llm_loss": 0.6078513115644455, "loss": 2.9123, "loss_aux_layer_0": 0.024139404296875, "loss_aux_layer_1": 0.0692138671875, "loss_aux_layer_10": 0.0892333984375, "loss_aux_layer_11": 0.094970703125, "loss_aux_layer_12": 0.102783203125, "loss_aux_layer_13": 0.1107177734375, "loss_aux_layer_14": 0.1231689453125, "loss_aux_layer_15": 0.134521484375, "loss_aux_layer_16": 0.146240234375, "loss_aux_layer_17": 0.15380859375, "loss_aux_layer_18": 0.1630859375, "loss_aux_layer_19": 0.165283203125, "loss_aux_layer_2": 0.077880859375, "loss_aux_layer_20": 0.172119140625, "loss_aux_layer_21": 0.17822265625, "loss_aux_layer_22": 0.19775390625, "loss_aux_layer_23": 0.240478515625, "loss_aux_layer_3": 0.08935546875, "loss_aux_layer_4": 0.091796875, "loss_aux_layer_5": 0.093994140625, "loss_aux_layer_6": 0.0963134765625, "loss_aux_layer_7": 0.091796875, "loss_aux_layer_8": 0.0897216796875, "loss_aux_layer_9": 0.0882568359375, "step": 932, "total_loss": 0.7280797809362411 }, { "epoch": 0.18471589784201148, "grad_norm": 1.432772159576416, "learning_rate": 5e-05, "llm_loss": 0.6251791566610336, "loss": 2.9747, "loss_aux_layer_0": 0.0242919921875, "loss_aux_layer_1": 0.0704345703125, "loss_aux_layer_10": 0.0885009765625, "loss_aux_layer_11": 0.09423828125, "loss_aux_layer_12": 0.1019287109375, "loss_aux_layer_13": 0.10986328125, "loss_aux_layer_14": 0.1220703125, "loss_aux_layer_15": 0.1328125, "loss_aux_layer_16": 0.144287109375, "loss_aux_layer_17": 0.151123046875, "loss_aux_layer_18": 0.16064453125, "loss_aux_layer_19": 0.1611328125, "loss_aux_layer_2": 0.0775146484375, "loss_aux_layer_20": 0.167724609375, "loss_aux_layer_21": 0.17333984375, "loss_aux_layer_22": 0.19287109375, "loss_aux_layer_23": 0.234619140625, "loss_aux_layer_3": 0.0885009765625, "loss_aux_layer_4": 0.091064453125, "loss_aux_layer_5": 0.0926513671875, "loss_aux_layer_6": 0.095458984375, "loss_aux_layer_7": 0.091064453125, "loss_aux_layer_8": 0.08935546875, "loss_aux_layer_9": 0.087646484375, "step": 933, "total_loss": 0.7436870187520981 }, { "epoch": 0.1849138784399129, "grad_norm": 1.956445574760437, "learning_rate": 5e-05, "llm_loss": 0.7012444883584976, "loss": 3.3046, "loss_aux_layer_0": 0.026641845703125, "loss_aux_layer_1": 0.0731201171875, "loss_aux_layer_10": 0.09326171875, "loss_aux_layer_11": 0.0992431640625, "loss_aux_layer_12": 0.1072998046875, "loss_aux_layer_13": 0.1156005859375, "loss_aux_layer_14": 0.1285400390625, "loss_aux_layer_15": 0.140625, "loss_aux_layer_16": 0.152587890625, "loss_aux_layer_17": 0.160888671875, "loss_aux_layer_18": 0.170654296875, "loss_aux_layer_19": 0.17236328125, "loss_aux_layer_2": 0.0802001953125, "loss_aux_layer_20": 0.177978515625, "loss_aux_layer_21": 0.183349609375, "loss_aux_layer_22": 0.204833984375, "loss_aux_layer_23": 0.24658203125, "loss_aux_layer_3": 0.0921630859375, "loss_aux_layer_4": 0.0946044921875, "loss_aux_layer_5": 0.0970458984375, "loss_aux_layer_6": 0.099853515625, "loss_aux_layer_7": 0.0950927734375, "loss_aux_layer_8": 0.0936279296875, "loss_aux_layer_9": 0.091796875, "step": 934, "total_loss": 0.8261566013097763 }, { "epoch": 0.1851118590378143, "grad_norm": 1.6730973720550537, "learning_rate": 5e-05, "llm_loss": 0.6886879503726959, "loss": 3.2452, "loss_aux_layer_0": 0.024444580078125, "loss_aux_layer_1": 0.073486328125, "loss_aux_layer_10": 0.0914306640625, "loss_aux_layer_11": 0.0970458984375, "loss_aux_layer_12": 0.1048583984375, "loss_aux_layer_13": 0.113037109375, "loss_aux_layer_14": 0.1263427734375, "loss_aux_layer_15": 0.13818359375, "loss_aux_layer_16": 0.150146484375, "loss_aux_layer_17": 0.157958984375, "loss_aux_layer_18": 0.1669921875, "loss_aux_layer_19": 0.16796875, "loss_aux_layer_2": 0.080810546875, "loss_aux_layer_20": 0.17431640625, "loss_aux_layer_21": 0.179443359375, "loss_aux_layer_22": 0.19970703125, "loss_aux_layer_23": 0.240478515625, "loss_aux_layer_3": 0.091796875, "loss_aux_layer_4": 0.09423828125, "loss_aux_layer_5": 0.095703125, "loss_aux_layer_6": 0.0982666015625, "loss_aux_layer_7": 0.0938720703125, "loss_aux_layer_8": 0.092041015625, "loss_aux_layer_9": 0.09033203125, "step": 935, "total_loss": 0.8112955242395401 }, { "epoch": 0.1853098396357157, "grad_norm": 1.4144418239593506, "learning_rate": 5e-05, "llm_loss": 0.7778163701295853, "loss": 3.5989, "loss_aux_layer_0": 0.02490234375, "loss_aux_layer_1": 0.071533203125, "loss_aux_layer_10": 0.09130859375, "loss_aux_layer_11": 0.0970458984375, "loss_aux_layer_12": 0.10498046875, "loss_aux_layer_13": 0.1131591796875, "loss_aux_layer_14": 0.1258544921875, "loss_aux_layer_15": 0.1378173828125, "loss_aux_layer_16": 0.148681640625, "loss_aux_layer_17": 0.156494140625, "loss_aux_layer_18": 0.166259765625, "loss_aux_layer_19": 0.167236328125, "loss_aux_layer_2": 0.0791015625, "loss_aux_layer_20": 0.17333984375, "loss_aux_layer_21": 0.17822265625, "loss_aux_layer_22": 0.197998046875, "loss_aux_layer_23": 0.239501953125, "loss_aux_layer_3": 0.0906982421875, "loss_aux_layer_4": 0.0933837890625, "loss_aux_layer_5": 0.094970703125, "loss_aux_layer_6": 0.097900390625, "loss_aux_layer_7": 0.093505859375, "loss_aux_layer_8": 0.0916748046875, "loss_aux_layer_9": 0.08984375, "step": 936, "total_loss": 0.8997127562761307 }, { "epoch": 0.18550782023361712, "grad_norm": 1.7572187185287476, "learning_rate": 5e-05, "llm_loss": 0.5654308199882507, "loss": 2.7767, "loss_aux_layer_0": 0.025909423828125, "loss_aux_layer_1": 0.0794677734375, "loss_aux_layer_10": 0.0982666015625, "loss_aux_layer_11": 0.104248046875, "loss_aux_layer_12": 0.1123046875, "loss_aux_layer_13": 0.1202392578125, "loss_aux_layer_14": 0.1326904296875, "loss_aux_layer_15": 0.14404296875, "loss_aux_layer_16": 0.15478515625, "loss_aux_layer_17": 0.160888671875, "loss_aux_layer_18": 0.17041015625, "loss_aux_layer_19": 0.171142578125, "loss_aux_layer_2": 0.0882568359375, "loss_aux_layer_20": 0.1767578125, "loss_aux_layer_21": 0.183349609375, "loss_aux_layer_22": 0.205810546875, "loss_aux_layer_23": 0.248779296875, "loss_aux_layer_3": 0.10009765625, "loss_aux_layer_4": 0.1026611328125, "loss_aux_layer_5": 0.1038818359375, "loss_aux_layer_6": 0.106689453125, "loss_aux_layer_7": 0.101806640625, "loss_aux_layer_8": 0.0997314453125, "loss_aux_layer_9": 0.09765625, "step": 937, "total_loss": 0.6941855847835541 }, { "epoch": 0.18570580083151852, "grad_norm": 2.0930967330932617, "learning_rate": 5e-05, "llm_loss": 0.637876957654953, "loss": 3.0417, "loss_aux_layer_0": 0.025390625, "loss_aux_layer_1": 0.07110595703125, "loss_aux_layer_10": 0.090576171875, "loss_aux_layer_11": 0.09619140625, "loss_aux_layer_12": 0.1044921875, "loss_aux_layer_13": 0.112548828125, "loss_aux_layer_14": 0.1248779296875, "loss_aux_layer_15": 0.13720703125, "loss_aux_layer_16": 0.14892578125, "loss_aux_layer_17": 0.156494140625, "loss_aux_layer_18": 0.166748046875, "loss_aux_layer_19": 0.168701171875, "loss_aux_layer_2": 0.0810546875, "loss_aux_layer_20": 0.1748046875, "loss_aux_layer_21": 0.181396484375, "loss_aux_layer_22": 0.203369140625, "loss_aux_layer_23": 0.24755859375, "loss_aux_layer_3": 0.091552734375, "loss_aux_layer_4": 0.093505859375, "loss_aux_layer_5": 0.0948486328125, "loss_aux_layer_6": 0.0972900390625, "loss_aux_layer_7": 0.0924072265625, "loss_aux_layer_8": 0.090576171875, "loss_aux_layer_9": 0.0892333984375, "step": 938, "total_loss": 0.7604209333658218 }, { "epoch": 0.1859037814294199, "grad_norm": 1.160187005996704, "learning_rate": 5e-05, "llm_loss": 0.6663451790809631, "loss": 3.1694, "loss_aux_layer_0": 0.02655029296875, "loss_aux_layer_1": 0.076416015625, "loss_aux_layer_10": 0.095703125, "loss_aux_layer_11": 0.101318359375, "loss_aux_layer_12": 0.1087646484375, "loss_aux_layer_13": 0.1163330078125, "loss_aux_layer_14": 0.128662109375, "loss_aux_layer_15": 0.14013671875, "loss_aux_layer_16": 0.151611328125, "loss_aux_layer_17": 0.1591796875, "loss_aux_layer_18": 0.168212890625, "loss_aux_layer_19": 0.16943359375, "loss_aux_layer_2": 0.0850830078125, "loss_aux_layer_20": 0.17626953125, "loss_aux_layer_21": 0.181884765625, "loss_aux_layer_22": 0.202880859375, "loss_aux_layer_23": 0.24462890625, "loss_aux_layer_3": 0.0970458984375, "loss_aux_layer_4": 0.099365234375, "loss_aux_layer_5": 0.1007080078125, "loss_aux_layer_6": 0.1033935546875, "loss_aux_layer_7": 0.0987548828125, "loss_aux_layer_8": 0.0965576171875, "loss_aux_layer_9": 0.0947265625, "step": 939, "total_loss": 0.7923477739095688 }, { "epoch": 0.18610176202732132, "grad_norm": 2.316213607788086, "learning_rate": 5e-05, "llm_loss": 0.6034110262989998, "loss": 2.9008, "loss_aux_layer_0": 0.028076171875, "loss_aux_layer_1": 0.072998046875, "loss_aux_layer_10": 0.0892333984375, "loss_aux_layer_11": 0.094970703125, "loss_aux_layer_12": 0.1029052734375, "loss_aux_layer_13": 0.1107177734375, "loss_aux_layer_14": 0.12353515625, "loss_aux_layer_15": 0.135986328125, "loss_aux_layer_16": 0.14794921875, "loss_aux_layer_17": 0.15576171875, "loss_aux_layer_18": 0.165283203125, "loss_aux_layer_19": 0.167236328125, "loss_aux_layer_2": 0.0809326171875, "loss_aux_layer_20": 0.173583984375, "loss_aux_layer_21": 0.180419921875, "loss_aux_layer_22": 0.202392578125, "loss_aux_layer_23": 0.245849609375, "loss_aux_layer_3": 0.091064453125, "loss_aux_layer_4": 0.093017578125, "loss_aux_layer_5": 0.09423828125, "loss_aux_layer_6": 0.0963134765625, "loss_aux_layer_7": 0.0916748046875, "loss_aux_layer_8": 0.0902099609375, "loss_aux_layer_9": 0.0882568359375, "step": 940, "total_loss": 0.725189208984375 }, { "epoch": 0.18629974262522273, "grad_norm": 1.5574427843093872, "learning_rate": 5e-05, "llm_loss": 0.6131033599376678, "loss": 2.9357, "loss_aux_layer_0": 0.026123046875, "loss_aux_layer_1": 0.0697021484375, "loss_aux_layer_10": 0.090087890625, "loss_aux_layer_11": 0.09521484375, "loss_aux_layer_12": 0.1033935546875, "loss_aux_layer_13": 0.1112060546875, "loss_aux_layer_14": 0.1239013671875, "loss_aux_layer_15": 0.13525390625, "loss_aux_layer_16": 0.147216796875, "loss_aux_layer_17": 0.1552734375, "loss_aux_layer_18": 0.165283203125, "loss_aux_layer_19": 0.166748046875, "loss_aux_layer_2": 0.0767822265625, "loss_aux_layer_20": 0.17333984375, "loss_aux_layer_21": 0.1787109375, "loss_aux_layer_22": 0.19921875, "loss_aux_layer_23": 0.241455078125, "loss_aux_layer_3": 0.0887451171875, "loss_aux_layer_4": 0.0909423828125, "loss_aux_layer_5": 0.09326171875, "loss_aux_layer_6": 0.0960693359375, "loss_aux_layer_7": 0.092041015625, "loss_aux_layer_8": 0.09033203125, "loss_aux_layer_9": 0.0889892578125, "step": 941, "total_loss": 0.733920231461525 }, { "epoch": 0.18649772322312413, "grad_norm": 1.339356780052185, "learning_rate": 5e-05, "llm_loss": 0.6569471955299377, "loss": 3.1182, "loss_aux_layer_0": 0.02740478515625, "loss_aux_layer_1": 0.0731201171875, "loss_aux_layer_10": 0.091552734375, "loss_aux_layer_11": 0.0975341796875, "loss_aux_layer_12": 0.10546875, "loss_aux_layer_13": 0.113525390625, "loss_aux_layer_14": 0.1260986328125, "loss_aux_layer_15": 0.13818359375, "loss_aux_layer_16": 0.149658203125, "loss_aux_layer_17": 0.1572265625, "loss_aux_layer_18": 0.166748046875, "loss_aux_layer_19": 0.16796875, "loss_aux_layer_2": 0.0787353515625, "loss_aux_layer_20": 0.174072265625, "loss_aux_layer_21": 0.179931640625, "loss_aux_layer_22": 0.200927734375, "loss_aux_layer_23": 0.242431640625, "loss_aux_layer_3": 0.0904541015625, "loss_aux_layer_4": 0.093017578125, "loss_aux_layer_5": 0.0947265625, "loss_aux_layer_6": 0.0975341796875, "loss_aux_layer_7": 0.093505859375, "loss_aux_layer_8": 0.091796875, "loss_aux_layer_9": 0.0904541015625, "step": 942, "total_loss": 0.7795490324497223 }, { "epoch": 0.18669570382102554, "grad_norm": 1.9038084745407104, "learning_rate": 5e-05, "llm_loss": 0.587944284081459, "loss": 2.8527, "loss_aux_layer_0": 0.02734375, "loss_aux_layer_1": 0.074462890625, "loss_aux_layer_10": 0.0947265625, "loss_aux_layer_11": 0.1004638671875, "loss_aux_layer_12": 0.108642578125, "loss_aux_layer_13": 0.1170654296875, "loss_aux_layer_14": 0.1295166015625, "loss_aux_layer_15": 0.14111328125, "loss_aux_layer_16": 0.152587890625, "loss_aux_layer_17": 0.160400390625, "loss_aux_layer_18": 0.169677734375, "loss_aux_layer_19": 0.169677734375, "loss_aux_layer_2": 0.081787109375, "loss_aux_layer_20": 0.175048828125, "loss_aux_layer_21": 0.179931640625, "loss_aux_layer_22": 0.199951171875, "loss_aux_layer_23": 0.24072265625, "loss_aux_layer_3": 0.0946044921875, "loss_aux_layer_4": 0.0975341796875, "loss_aux_layer_5": 0.0999755859375, "loss_aux_layer_6": 0.102783203125, "loss_aux_layer_7": 0.09765625, "loss_aux_layer_8": 0.0955810546875, "loss_aux_layer_9": 0.09375, "step": 943, "total_loss": 0.7131744474172592 }, { "epoch": 0.18689368441892695, "grad_norm": 1.4023690223693848, "learning_rate": 5e-05, "llm_loss": 0.639451339840889, "loss": 3.0561, "loss_aux_layer_0": 0.02484130859375, "loss_aux_layer_1": 0.0723876953125, "loss_aux_layer_10": 0.0936279296875, "loss_aux_layer_11": 0.099853515625, "loss_aux_layer_12": 0.107666015625, "loss_aux_layer_13": 0.11572265625, "loss_aux_layer_14": 0.1275634765625, "loss_aux_layer_15": 0.138916015625, "loss_aux_layer_16": 0.15087890625, "loss_aux_layer_17": 0.157958984375, "loss_aux_layer_18": 0.167724609375, "loss_aux_layer_19": 0.169189453125, "loss_aux_layer_2": 0.0806884765625, "loss_aux_layer_20": 0.175537109375, "loss_aux_layer_21": 0.1826171875, "loss_aux_layer_22": 0.205810546875, "loss_aux_layer_23": 0.25048828125, "loss_aux_layer_3": 0.0931396484375, "loss_aux_layer_4": 0.095458984375, "loss_aux_layer_5": 0.097412109375, "loss_aux_layer_6": 0.10009765625, "loss_aux_layer_7": 0.0958251953125, "loss_aux_layer_8": 0.093994140625, "loss_aux_layer_9": 0.0924072265625, "step": 944, "total_loss": 0.7640293538570404 }, { "epoch": 0.18709166501682836, "grad_norm": 1.1874544620513916, "learning_rate": 5e-05, "llm_loss": 0.6811078488826752, "loss": 3.2292, "loss_aux_layer_0": 0.024810791015625, "loss_aux_layer_1": 0.0743408203125, "loss_aux_layer_10": 0.0966796875, "loss_aux_layer_11": 0.102294921875, "loss_aux_layer_12": 0.11083984375, "loss_aux_layer_13": 0.118408203125, "loss_aux_layer_14": 0.130615234375, "loss_aux_layer_15": 0.14208984375, "loss_aux_layer_16": 0.1533203125, "loss_aux_layer_17": 0.16064453125, "loss_aux_layer_18": 0.169921875, "loss_aux_layer_19": 0.17041015625, "loss_aux_layer_2": 0.0823974609375, "loss_aux_layer_20": 0.17578125, "loss_aux_layer_21": 0.18115234375, "loss_aux_layer_22": 0.20263671875, "loss_aux_layer_23": 0.244384765625, "loss_aux_layer_3": 0.0953369140625, "loss_aux_layer_4": 0.09814453125, "loss_aux_layer_5": 0.100341796875, "loss_aux_layer_6": 0.1031494140625, "loss_aux_layer_7": 0.0985107421875, "loss_aux_layer_8": 0.096923828125, "loss_aux_layer_9": 0.0953369140625, "step": 945, "total_loss": 0.8072973638772964 }, { "epoch": 0.18728964561472974, "grad_norm": 1.2267274856567383, "learning_rate": 5e-05, "llm_loss": 0.6271179169416428, "loss": 3.0081, "loss_aux_layer_0": 0.0260009765625, "loss_aux_layer_1": 0.0726318359375, "loss_aux_layer_10": 0.093994140625, "loss_aux_layer_11": 0.099853515625, "loss_aux_layer_12": 0.1082763671875, "loss_aux_layer_13": 0.116455078125, "loss_aux_layer_14": 0.129638671875, "loss_aux_layer_15": 0.141357421875, "loss_aux_layer_16": 0.153076171875, "loss_aux_layer_17": 0.160888671875, "loss_aux_layer_18": 0.169677734375, "loss_aux_layer_19": 0.1708984375, "loss_aux_layer_2": 0.080078125, "loss_aux_layer_20": 0.17724609375, "loss_aux_layer_21": 0.182861328125, "loss_aux_layer_22": 0.203369140625, "loss_aux_layer_23": 0.245361328125, "loss_aux_layer_3": 0.09228515625, "loss_aux_layer_4": 0.094970703125, "loss_aux_layer_5": 0.096923828125, "loss_aux_layer_6": 0.099853515625, "loss_aux_layer_7": 0.0955810546875, "loss_aux_layer_8": 0.0938720703125, "loss_aux_layer_9": 0.0926513671875, "step": 946, "total_loss": 0.7520227134227753 }, { "epoch": 0.18748762621263115, "grad_norm": 1.1743226051330566, "learning_rate": 5e-05, "llm_loss": 0.6800569593906403, "loss": 3.1962, "loss_aux_layer_0": 0.024749755859375, "loss_aux_layer_1": 0.0673828125, "loss_aux_layer_10": 0.088134765625, "loss_aux_layer_11": 0.093505859375, "loss_aux_layer_12": 0.1015625, "loss_aux_layer_13": 0.1099853515625, "loss_aux_layer_14": 0.122802734375, "loss_aux_layer_15": 0.13427734375, "loss_aux_layer_16": 0.146240234375, "loss_aux_layer_17": 0.154541015625, "loss_aux_layer_18": 0.1640625, "loss_aux_layer_19": 0.1650390625, "loss_aux_layer_2": 0.0743408203125, "loss_aux_layer_20": 0.17138671875, "loss_aux_layer_21": 0.177490234375, "loss_aux_layer_22": 0.19775390625, "loss_aux_layer_23": 0.23974609375, "loss_aux_layer_3": 0.0860595703125, "loss_aux_layer_4": 0.088623046875, "loss_aux_layer_5": 0.0906982421875, "loss_aux_layer_6": 0.09326171875, "loss_aux_layer_7": 0.0894775390625, "loss_aux_layer_8": 0.0880126953125, "loss_aux_layer_9": 0.086669921875, "step": 947, "total_loss": 0.7990462332963943 }, { "epoch": 0.18768560681053256, "grad_norm": 1.310736060142517, "learning_rate": 5e-05, "llm_loss": 0.6709482669830322, "loss": 3.1859, "loss_aux_layer_0": 0.025238037109375, "loss_aux_layer_1": 0.0743408203125, "loss_aux_layer_10": 0.095703125, "loss_aux_layer_11": 0.101806640625, "loss_aux_layer_12": 0.1097412109375, "loss_aux_layer_13": 0.1175537109375, "loss_aux_layer_14": 0.1297607421875, "loss_aux_layer_15": 0.141357421875, "loss_aux_layer_16": 0.15234375, "loss_aux_layer_17": 0.1591796875, "loss_aux_layer_18": 0.167236328125, "loss_aux_layer_19": 0.168212890625, "loss_aux_layer_2": 0.083740234375, "loss_aux_layer_20": 0.173095703125, "loss_aux_layer_21": 0.179931640625, "loss_aux_layer_22": 0.2021484375, "loss_aux_layer_23": 0.2451171875, "loss_aux_layer_3": 0.0960693359375, "loss_aux_layer_4": 0.0985107421875, "loss_aux_layer_5": 0.1002197265625, "loss_aux_layer_6": 0.102783203125, "loss_aux_layer_7": 0.09814453125, "loss_aux_layer_8": 0.0960693359375, "loss_aux_layer_9": 0.0943603515625, "step": 948, "total_loss": 0.7964805662631989 }, { "epoch": 0.18788358740843397, "grad_norm": 0.9623796939849854, "learning_rate": 5e-05, "llm_loss": 0.6396494656801224, "loss": 3.047, "loss_aux_layer_0": 0.0255126953125, "loss_aux_layer_1": 0.072998046875, "loss_aux_layer_10": 0.0904541015625, "loss_aux_layer_11": 0.0960693359375, "loss_aux_layer_12": 0.1044921875, "loss_aux_layer_13": 0.1131591796875, "loss_aux_layer_14": 0.126220703125, "loss_aux_layer_15": 0.1376953125, "loss_aux_layer_16": 0.14892578125, "loss_aux_layer_17": 0.156494140625, "loss_aux_layer_18": 0.164794921875, "loss_aux_layer_19": 0.165771484375, "loss_aux_layer_2": 0.08056640625, "loss_aux_layer_20": 0.172119140625, "loss_aux_layer_21": 0.178466796875, "loss_aux_layer_22": 0.201171875, "loss_aux_layer_23": 0.2421875, "loss_aux_layer_3": 0.092041015625, "loss_aux_layer_4": 0.0941162109375, "loss_aux_layer_5": 0.0953369140625, "loss_aux_layer_6": 0.09814453125, "loss_aux_layer_7": 0.0936279296875, "loss_aux_layer_8": 0.09130859375, "loss_aux_layer_9": 0.08935546875, "step": 949, "total_loss": 0.7617556899785995 }, { "epoch": 0.18808156800633538, "grad_norm": 1.0207661390304565, "learning_rate": 5e-05, "llm_loss": 0.5773279815912247, "loss": 2.7892, "loss_aux_layer_0": 0.026763916015625, "loss_aux_layer_1": 0.0692138671875, "loss_aux_layer_10": 0.087646484375, "loss_aux_layer_11": 0.09326171875, "loss_aux_layer_12": 0.1009521484375, "loss_aux_layer_13": 0.109130859375, "loss_aux_layer_14": 0.1224365234375, "loss_aux_layer_15": 0.1343994140625, "loss_aux_layer_16": 0.145751953125, "loss_aux_layer_17": 0.15380859375, "loss_aux_layer_18": 0.1640625, "loss_aux_layer_19": 0.166015625, "loss_aux_layer_2": 0.0765380859375, "loss_aux_layer_20": 0.173095703125, "loss_aux_layer_21": 0.18017578125, "loss_aux_layer_22": 0.20166015625, "loss_aux_layer_23": 0.24560546875, "loss_aux_layer_3": 0.087890625, "loss_aux_layer_4": 0.0899658203125, "loss_aux_layer_5": 0.091552734375, "loss_aux_layer_6": 0.0941162109375, "loss_aux_layer_7": 0.0899658203125, "loss_aux_layer_8": 0.08837890625, "loss_aux_layer_9": 0.0867919921875, "step": 950, "total_loss": 0.6973120123147964 }, { "epoch": 0.1882795486042368, "grad_norm": 1.090744137763977, "learning_rate": 5e-05, "llm_loss": 0.6450579911470413, "loss": 3.0804, "loss_aux_layer_0": 0.027069091796875, "loss_aux_layer_1": 0.075439453125, "loss_aux_layer_10": 0.0946044921875, "loss_aux_layer_11": 0.1004638671875, "loss_aux_layer_12": 0.1080322265625, "loss_aux_layer_13": 0.1160888671875, "loss_aux_layer_14": 0.1282958984375, "loss_aux_layer_15": 0.1396484375, "loss_aux_layer_16": 0.150634765625, "loss_aux_layer_17": 0.157958984375, "loss_aux_layer_18": 0.1669921875, "loss_aux_layer_19": 0.16796875, "loss_aux_layer_2": 0.0828857421875, "loss_aux_layer_20": 0.174560546875, "loss_aux_layer_21": 0.180419921875, "loss_aux_layer_22": 0.2021484375, "loss_aux_layer_23": 0.244384765625, "loss_aux_layer_3": 0.095458984375, "loss_aux_layer_4": 0.0980224609375, "loss_aux_layer_5": 0.099853515625, "loss_aux_layer_6": 0.10302734375, "loss_aux_layer_7": 0.09814453125, "loss_aux_layer_8": 0.095703125, "loss_aux_layer_9": 0.0936279296875, "step": 951, "total_loss": 0.7701047211885452 }, { "epoch": 0.1884775292021382, "grad_norm": 0.9464146494865417, "learning_rate": 5e-05, "llm_loss": 0.629631981253624, "loss": 3.021, "loss_aux_layer_0": 0.025634765625, "loss_aux_layer_1": 0.076416015625, "loss_aux_layer_10": 0.0955810546875, "loss_aux_layer_11": 0.1015625, "loss_aux_layer_12": 0.1094970703125, "loss_aux_layer_13": 0.1170654296875, "loss_aux_layer_14": 0.129150390625, "loss_aux_layer_15": 0.140380859375, "loss_aux_layer_16": 0.1513671875, "loss_aux_layer_17": 0.158203125, "loss_aux_layer_18": 0.1669921875, "loss_aux_layer_19": 0.16845703125, "loss_aux_layer_2": 0.083984375, "loss_aux_layer_20": 0.17431640625, "loss_aux_layer_21": 0.18017578125, "loss_aux_layer_22": 0.202392578125, "loss_aux_layer_23": 0.244873046875, "loss_aux_layer_3": 0.096435546875, "loss_aux_layer_4": 0.09912109375, "loss_aux_layer_5": 0.100341796875, "loss_aux_layer_6": 0.1029052734375, "loss_aux_layer_7": 0.098388671875, "loss_aux_layer_8": 0.0963134765625, "loss_aux_layer_9": 0.094482421875, "step": 952, "total_loss": 0.7552378922700882 }, { "epoch": 0.1886755098000396, "grad_norm": 1.6955678462982178, "learning_rate": 5e-05, "llm_loss": 0.7495379447937012, "loss": 3.4861, "loss_aux_layer_0": 0.02545166015625, "loss_aux_layer_1": 0.0704345703125, "loss_aux_layer_10": 0.091796875, "loss_aux_layer_11": 0.0977783203125, "loss_aux_layer_12": 0.1064453125, "loss_aux_layer_13": 0.1146240234375, "loss_aux_layer_14": 0.1268310546875, "loss_aux_layer_15": 0.13818359375, "loss_aux_layer_16": 0.14990234375, "loss_aux_layer_17": 0.156982421875, "loss_aux_layer_18": 0.165771484375, "loss_aux_layer_19": 0.16552734375, "loss_aux_layer_2": 0.078369140625, "loss_aux_layer_20": 0.171630859375, "loss_aux_layer_21": 0.177490234375, "loss_aux_layer_22": 0.1982421875, "loss_aux_layer_23": 0.23974609375, "loss_aux_layer_3": 0.0906982421875, "loss_aux_layer_4": 0.09326171875, "loss_aux_layer_5": 0.094970703125, "loss_aux_layer_6": 0.0977783203125, "loss_aux_layer_7": 0.09326171875, "loss_aux_layer_8": 0.091796875, "loss_aux_layer_9": 0.0902099609375, "step": 953, "total_loss": 0.8715289533138275 }, { "epoch": 0.188873490397941, "grad_norm": 1.5551754236221313, "learning_rate": 5e-05, "llm_loss": 0.6335493922233582, "loss": 3.0194, "loss_aux_layer_0": 0.025634765625, "loss_aux_layer_1": 0.072021484375, "loss_aux_layer_10": 0.0909423828125, "loss_aux_layer_11": 0.0960693359375, "loss_aux_layer_12": 0.103759765625, "loss_aux_layer_13": 0.112060546875, "loss_aux_layer_14": 0.12451171875, "loss_aux_layer_15": 0.135986328125, "loss_aux_layer_16": 0.1474609375, "loss_aux_layer_17": 0.155517578125, "loss_aux_layer_18": 0.1640625, "loss_aux_layer_19": 0.165283203125, "loss_aux_layer_2": 0.078857421875, "loss_aux_layer_20": 0.172119140625, "loss_aux_layer_21": 0.177734375, "loss_aux_layer_22": 0.197998046875, "loss_aux_layer_23": 0.239990234375, "loss_aux_layer_3": 0.0902099609375, "loss_aux_layer_4": 0.09326171875, "loss_aux_layer_5": 0.0950927734375, "loss_aux_layer_6": 0.09765625, "loss_aux_layer_7": 0.0933837890625, "loss_aux_layer_8": 0.091552734375, "loss_aux_layer_9": 0.0897216796875, "step": 954, "total_loss": 0.7548473924398422 }, { "epoch": 0.1890714709958424, "grad_norm": 1.7468187808990479, "learning_rate": 5e-05, "llm_loss": 0.54584851115942, "loss": 2.7004, "loss_aux_layer_0": 0.0272216796875, "loss_aux_layer_1": 0.0770263671875, "loss_aux_layer_10": 0.0989990234375, "loss_aux_layer_11": 0.10498046875, "loss_aux_layer_12": 0.112548828125, "loss_aux_layer_13": 0.1201171875, "loss_aux_layer_14": 0.13232421875, "loss_aux_layer_15": 0.1435546875, "loss_aux_layer_16": 0.155029296875, "loss_aux_layer_17": 0.162353515625, "loss_aux_layer_18": 0.172119140625, "loss_aux_layer_19": 0.173095703125, "loss_aux_layer_2": 0.08740234375, "loss_aux_layer_20": 0.178955078125, "loss_aux_layer_21": 0.185546875, "loss_aux_layer_22": 0.208251953125, "loss_aux_layer_23": 0.2509765625, "loss_aux_layer_3": 0.0999755859375, "loss_aux_layer_4": 0.1024169921875, "loss_aux_layer_5": 0.1038818359375, "loss_aux_layer_6": 0.106689453125, "loss_aux_layer_7": 0.1016845703125, "loss_aux_layer_8": 0.10009765625, "loss_aux_layer_9": 0.0977783203125, "step": 955, "total_loss": 0.6751082688570023 }, { "epoch": 0.1892694515937438, "grad_norm": 1.3580307960510254, "learning_rate": 5e-05, "llm_loss": 0.619478315114975, "loss": 2.9687, "loss_aux_layer_0": 0.0250244140625, "loss_aux_layer_1": 0.0712890625, "loss_aux_layer_10": 0.0919189453125, "loss_aux_layer_11": 0.09814453125, "loss_aux_layer_12": 0.1058349609375, "loss_aux_layer_13": 0.1141357421875, "loss_aux_layer_14": 0.1258544921875, "loss_aux_layer_15": 0.137451171875, "loss_aux_layer_16": 0.149169921875, "loss_aux_layer_17": 0.156982421875, "loss_aux_layer_18": 0.166259765625, "loss_aux_layer_19": 0.167236328125, "loss_aux_layer_2": 0.0797119140625, "loss_aux_layer_20": 0.173095703125, "loss_aux_layer_21": 0.178955078125, "loss_aux_layer_22": 0.201416015625, "loss_aux_layer_23": 0.242919921875, "loss_aux_layer_3": 0.0919189453125, "loss_aux_layer_4": 0.0946044921875, "loss_aux_layer_5": 0.09619140625, "loss_aux_layer_6": 0.0989990234375, "loss_aux_layer_7": 0.0948486328125, "loss_aux_layer_8": 0.092529296875, "loss_aux_layer_9": 0.0908203125, "step": 956, "total_loss": 0.7421778738498688 }, { "epoch": 0.18946743219164522, "grad_norm": 1.6310371160507202, "learning_rate": 5e-05, "llm_loss": 0.6074537932872772, "loss": 2.936, "loss_aux_layer_0": 0.031890869140625, "loss_aux_layer_1": 0.0797119140625, "loss_aux_layer_10": 0.0960693359375, "loss_aux_layer_11": 0.10205078125, "loss_aux_layer_12": 0.1103515625, "loss_aux_layer_13": 0.1182861328125, "loss_aux_layer_14": 0.130126953125, "loss_aux_layer_15": 0.141845703125, "loss_aux_layer_16": 0.15283203125, "loss_aux_layer_17": 0.16015625, "loss_aux_layer_18": 0.169189453125, "loss_aux_layer_19": 0.169921875, "loss_aux_layer_2": 0.0833740234375, "loss_aux_layer_20": 0.17529296875, "loss_aux_layer_21": 0.180419921875, "loss_aux_layer_22": 0.20263671875, "loss_aux_layer_23": 0.244384765625, "loss_aux_layer_3": 0.0960693359375, "loss_aux_layer_4": 0.0986328125, "loss_aux_layer_5": 0.1002197265625, "loss_aux_layer_6": 0.10302734375, "loss_aux_layer_7": 0.098876953125, "loss_aux_layer_8": 0.096923828125, "loss_aux_layer_9": 0.0946044921875, "step": 957, "total_loss": 0.7340019345283508 }, { "epoch": 0.18966541278954663, "grad_norm": 1.6952584981918335, "learning_rate": 5e-05, "llm_loss": 0.6123815476894379, "loss": 2.9498, "loss_aux_layer_0": 0.025238037109375, "loss_aux_layer_1": 0.0726318359375, "loss_aux_layer_10": 0.093505859375, "loss_aux_layer_11": 0.099609375, "loss_aux_layer_12": 0.1085205078125, "loss_aux_layer_13": 0.1170654296875, "loss_aux_layer_14": 0.129638671875, "loss_aux_layer_15": 0.1416015625, "loss_aux_layer_16": 0.1533203125, "loss_aux_layer_17": 0.160888671875, "loss_aux_layer_18": 0.169677734375, "loss_aux_layer_19": 0.171142578125, "loss_aux_layer_2": 0.07958984375, "loss_aux_layer_20": 0.177490234375, "loss_aux_layer_21": 0.184326171875, "loss_aux_layer_22": 0.206298828125, "loss_aux_layer_23": 0.25, "loss_aux_layer_3": 0.0916748046875, "loss_aux_layer_4": 0.0946044921875, "loss_aux_layer_5": 0.0966796875, "loss_aux_layer_6": 0.09912109375, "loss_aux_layer_7": 0.0947265625, "loss_aux_layer_8": 0.0931396484375, "loss_aux_layer_9": 0.0916748046875, "step": 958, "total_loss": 0.7374508380889893 }, { "epoch": 0.18986339338744804, "grad_norm": 1.5929861068725586, "learning_rate": 5e-05, "llm_loss": 0.6402821838855743, "loss": 3.0538, "loss_aux_layer_0": 0.026031494140625, "loss_aux_layer_1": 0.071044921875, "loss_aux_layer_10": 0.091796875, "loss_aux_layer_11": 0.0975341796875, "loss_aux_layer_12": 0.10546875, "loss_aux_layer_13": 0.1141357421875, "loss_aux_layer_14": 0.1270751953125, "loss_aux_layer_15": 0.139404296875, "loss_aux_layer_16": 0.1513671875, "loss_aux_layer_17": 0.15869140625, "loss_aux_layer_18": 0.168212890625, "loss_aux_layer_19": 0.16943359375, "loss_aux_layer_2": 0.0784912109375, "loss_aux_layer_20": 0.17529296875, "loss_aux_layer_21": 0.180908203125, "loss_aux_layer_22": 0.201416015625, "loss_aux_layer_23": 0.244140625, "loss_aux_layer_3": 0.0908203125, "loss_aux_layer_4": 0.0933837890625, "loss_aux_layer_5": 0.095458984375, "loss_aux_layer_6": 0.098388671875, "loss_aux_layer_7": 0.0941162109375, "loss_aux_layer_8": 0.09228515625, "loss_aux_layer_9": 0.0909423828125, "step": 959, "total_loss": 0.7634430229663849 }, { "epoch": 0.19006137398534945, "grad_norm": 1.6072310209274292, "learning_rate": 5e-05, "llm_loss": 0.6236449778079987, "loss": 3.0024, "loss_aux_layer_0": 0.025146484375, "loss_aux_layer_1": 0.07421875, "loss_aux_layer_10": 0.0960693359375, "loss_aux_layer_11": 0.1021728515625, "loss_aux_layer_12": 0.1104736328125, "loss_aux_layer_13": 0.1190185546875, "loss_aux_layer_14": 0.132568359375, "loss_aux_layer_15": 0.144287109375, "loss_aux_layer_16": 0.155517578125, "loss_aux_layer_17": 0.163330078125, "loss_aux_layer_18": 0.17236328125, "loss_aux_layer_19": 0.172607421875, "loss_aux_layer_2": 0.08251953125, "loss_aux_layer_20": 0.17822265625, "loss_aux_layer_21": 0.18359375, "loss_aux_layer_22": 0.204833984375, "loss_aux_layer_23": 0.24658203125, "loss_aux_layer_3": 0.094970703125, "loss_aux_layer_4": 0.0975341796875, "loss_aux_layer_5": 0.0992431640625, "loss_aux_layer_6": 0.102294921875, "loss_aux_layer_7": 0.09814453125, "loss_aux_layer_8": 0.09619140625, "loss_aux_layer_9": 0.094482421875, "step": 960, "total_loss": 0.7505984157323837 }, { "epoch": 0.19025935458325083, "grad_norm": 1.1013528108596802, "learning_rate": 5e-05, "llm_loss": 0.631349116563797, "loss": 3.0265, "loss_aux_layer_0": 0.024871826171875, "loss_aux_layer_1": 0.0745849609375, "loss_aux_layer_10": 0.0948486328125, "loss_aux_layer_11": 0.1009521484375, "loss_aux_layer_12": 0.1090087890625, "loss_aux_layer_13": 0.1170654296875, "loss_aux_layer_14": 0.12890625, "loss_aux_layer_15": 0.139892578125, "loss_aux_layer_16": 0.15087890625, "loss_aux_layer_17": 0.159423828125, "loss_aux_layer_18": 0.16796875, "loss_aux_layer_19": 0.1689453125, "loss_aux_layer_2": 0.08251953125, "loss_aux_layer_20": 0.176025390625, "loss_aux_layer_21": 0.18115234375, "loss_aux_layer_22": 0.203857421875, "loss_aux_layer_23": 0.2451171875, "loss_aux_layer_3": 0.094482421875, "loss_aux_layer_4": 0.0975341796875, "loss_aux_layer_5": 0.099609375, "loss_aux_layer_6": 0.102294921875, "loss_aux_layer_7": 0.0975341796875, "loss_aux_layer_8": 0.095458984375, "loss_aux_layer_9": 0.0936279296875, "step": 961, "total_loss": 0.7566291689872742 }, { "epoch": 0.19045733518115224, "grad_norm": 1.5426384210586548, "learning_rate": 5e-05, "llm_loss": 0.7321645617485046, "loss": 3.4344, "loss_aux_layer_0": 0.028228759765625, "loss_aux_layer_1": 0.0753173828125, "loss_aux_layer_10": 0.095458984375, "loss_aux_layer_11": 0.1014404296875, "loss_aux_layer_12": 0.109130859375, "loss_aux_layer_13": 0.1171875, "loss_aux_layer_14": 0.1297607421875, "loss_aux_layer_15": 0.141357421875, "loss_aux_layer_16": 0.153076171875, "loss_aux_layer_17": 0.1611328125, "loss_aux_layer_18": 0.1708984375, "loss_aux_layer_19": 0.171630859375, "loss_aux_layer_2": 0.0833740234375, "loss_aux_layer_20": 0.177490234375, "loss_aux_layer_21": 0.1826171875, "loss_aux_layer_22": 0.20458984375, "loss_aux_layer_23": 0.24560546875, "loss_aux_layer_3": 0.0955810546875, "loss_aux_layer_4": 0.0987548828125, "loss_aux_layer_5": 0.1002197265625, "loss_aux_layer_6": 0.10302734375, "loss_aux_layer_7": 0.0982666015625, "loss_aux_layer_8": 0.095947265625, "loss_aux_layer_9": 0.093994140625, "step": 962, "total_loss": 0.8585907965898514 }, { "epoch": 0.19065531577905365, "grad_norm": 1.7013344764709473, "learning_rate": 5e-05, "llm_loss": 0.7014570385217667, "loss": 3.3013, "loss_aux_layer_0": 0.025054931640625, "loss_aux_layer_1": 0.07421875, "loss_aux_layer_10": 0.093994140625, "loss_aux_layer_11": 0.10009765625, "loss_aux_layer_12": 0.108154296875, "loss_aux_layer_13": 0.11572265625, "loss_aux_layer_14": 0.1279296875, "loss_aux_layer_15": 0.13916015625, "loss_aux_layer_16": 0.1494140625, "loss_aux_layer_17": 0.156982421875, "loss_aux_layer_18": 0.165283203125, "loss_aux_layer_19": 0.165283203125, "loss_aux_layer_2": 0.083251953125, "loss_aux_layer_20": 0.17138671875, "loss_aux_layer_21": 0.17724609375, "loss_aux_layer_22": 0.198486328125, "loss_aux_layer_23": 0.23974609375, "loss_aux_layer_3": 0.09619140625, "loss_aux_layer_4": 0.0985107421875, "loss_aux_layer_5": 0.0997314453125, "loss_aux_layer_6": 0.1024169921875, "loss_aux_layer_7": 0.097412109375, "loss_aux_layer_8": 0.0948486328125, "loss_aux_layer_9": 0.0927734375, "step": 963, "total_loss": 0.8253296315670013 }, { "epoch": 0.19085329637695506, "grad_norm": 1.2785420417785645, "learning_rate": 5e-05, "llm_loss": 0.6236854195594788, "loss": 2.9896, "loss_aux_layer_0": 0.02630615234375, "loss_aux_layer_1": 0.073486328125, "loss_aux_layer_10": 0.0931396484375, "loss_aux_layer_11": 0.09912109375, "loss_aux_layer_12": 0.107177734375, "loss_aux_layer_13": 0.114990234375, "loss_aux_layer_14": 0.127197265625, "loss_aux_layer_15": 0.13916015625, "loss_aux_layer_16": 0.15087890625, "loss_aux_layer_17": 0.157958984375, "loss_aux_layer_18": 0.167236328125, "loss_aux_layer_19": 0.168701171875, "loss_aux_layer_2": 0.0802001953125, "loss_aux_layer_20": 0.1748046875, "loss_aux_layer_21": 0.18017578125, "loss_aux_layer_22": 0.2001953125, "loss_aux_layer_23": 0.241943359375, "loss_aux_layer_3": 0.0926513671875, "loss_aux_layer_4": 0.09521484375, "loss_aux_layer_5": 0.0970458984375, "loss_aux_layer_6": 0.0999755859375, "loss_aux_layer_7": 0.095458984375, "loss_aux_layer_8": 0.0933837890625, "loss_aux_layer_9": 0.0919189453125, "step": 964, "total_loss": 0.7474100738763809 }, { "epoch": 0.19105127697485647, "grad_norm": 0.9276214241981506, "learning_rate": 5e-05, "llm_loss": 0.6628706008195877, "loss": 3.1415, "loss_aux_layer_0": 0.0255126953125, "loss_aux_layer_1": 0.0706787109375, "loss_aux_layer_10": 0.092041015625, "loss_aux_layer_11": 0.097900390625, "loss_aux_layer_12": 0.1063232421875, "loss_aux_layer_13": 0.1146240234375, "loss_aux_layer_14": 0.126220703125, "loss_aux_layer_15": 0.137939453125, "loss_aux_layer_16": 0.148681640625, "loss_aux_layer_17": 0.156494140625, "loss_aux_layer_18": 0.165771484375, "loss_aux_layer_19": 0.16650390625, "loss_aux_layer_2": 0.0794677734375, "loss_aux_layer_20": 0.172607421875, "loss_aux_layer_21": 0.1787109375, "loss_aux_layer_22": 0.2001953125, "loss_aux_layer_23": 0.241943359375, "loss_aux_layer_3": 0.092041015625, "loss_aux_layer_4": 0.094482421875, "loss_aux_layer_5": 0.0958251953125, "loss_aux_layer_6": 0.09912109375, "loss_aux_layer_7": 0.0946044921875, "loss_aux_layer_8": 0.092529296875, "loss_aux_layer_9": 0.0906982421875, "step": 965, "total_loss": 0.7853867560625076 }, { "epoch": 0.19124925757275787, "grad_norm": 1.2480173110961914, "learning_rate": 5e-05, "llm_loss": 0.6444105058908463, "loss": 3.0842, "loss_aux_layer_0": 0.02618408203125, "loss_aux_layer_1": 0.074462890625, "loss_aux_layer_10": 0.09619140625, "loss_aux_layer_11": 0.1024169921875, "loss_aux_layer_12": 0.1102294921875, "loss_aux_layer_13": 0.1185302734375, "loss_aux_layer_14": 0.1304931640625, "loss_aux_layer_15": 0.142333984375, "loss_aux_layer_16": 0.154052734375, "loss_aux_layer_17": 0.160888671875, "loss_aux_layer_18": 0.170166015625, "loss_aux_layer_19": 0.170654296875, "loss_aux_layer_2": 0.08251953125, "loss_aux_layer_20": 0.1767578125, "loss_aux_layer_21": 0.182861328125, "loss_aux_layer_22": 0.2060546875, "loss_aux_layer_23": 0.249755859375, "loss_aux_layer_3": 0.0950927734375, "loss_aux_layer_4": 0.097900390625, "loss_aux_layer_5": 0.0997314453125, "loss_aux_layer_6": 0.102783203125, "loss_aux_layer_7": 0.0985107421875, "loss_aux_layer_8": 0.096435546875, "loss_aux_layer_9": 0.0946044921875, "step": 966, "total_loss": 0.7710611373186111 }, { "epoch": 0.19144723817065928, "grad_norm": 1.5776108503341675, "learning_rate": 5e-05, "llm_loss": 0.6120376139879227, "loss": 2.9382, "loss_aux_layer_0": 0.02679443359375, "loss_aux_layer_1": 0.0714111328125, "loss_aux_layer_10": 0.09130859375, "loss_aux_layer_11": 0.0970458984375, "loss_aux_layer_12": 0.1048583984375, "loss_aux_layer_13": 0.1131591796875, "loss_aux_layer_14": 0.12548828125, "loss_aux_layer_15": 0.137939453125, "loss_aux_layer_16": 0.14990234375, "loss_aux_layer_17": 0.157470703125, "loss_aux_layer_18": 0.166748046875, "loss_aux_layer_19": 0.16796875, "loss_aux_layer_2": 0.0789794921875, "loss_aux_layer_20": 0.174072265625, "loss_aux_layer_21": 0.180908203125, "loss_aux_layer_22": 0.20166015625, "loss_aux_layer_23": 0.24462890625, "loss_aux_layer_3": 0.0902099609375, "loss_aux_layer_4": 0.0926513671875, "loss_aux_layer_5": 0.0943603515625, "loss_aux_layer_6": 0.09765625, "loss_aux_layer_7": 0.09326171875, "loss_aux_layer_8": 0.091552734375, "loss_aux_layer_9": 0.0902099609375, "step": 967, "total_loss": 0.7345408797264099 }, { "epoch": 0.1916452187685607, "grad_norm": 2.1426374912261963, "learning_rate": 5e-05, "llm_loss": 0.5322445780038834, "loss": 2.624, "loss_aux_layer_0": 0.027801513671875, "loss_aux_layer_1": 0.0726318359375, "loss_aux_layer_10": 0.0906982421875, "loss_aux_layer_11": 0.096923828125, "loss_aux_layer_12": 0.105712890625, "loss_aux_layer_13": 0.1143798828125, "loss_aux_layer_14": 0.127197265625, "loss_aux_layer_15": 0.13916015625, "loss_aux_layer_16": 0.150390625, "loss_aux_layer_17": 0.157958984375, "loss_aux_layer_18": 0.168212890625, "loss_aux_layer_19": 0.170166015625, "loss_aux_layer_2": 0.079345703125, "loss_aux_layer_20": 0.1767578125, "loss_aux_layer_21": 0.184326171875, "loss_aux_layer_22": 0.207275390625, "loss_aux_layer_23": 0.25, "loss_aux_layer_3": 0.09130859375, "loss_aux_layer_4": 0.0933837890625, "loss_aux_layer_5": 0.0946044921875, "loss_aux_layer_6": 0.0975341796875, "loss_aux_layer_7": 0.0928955078125, "loss_aux_layer_8": 0.0914306640625, "loss_aux_layer_9": 0.08984375, "step": 968, "total_loss": 0.6559984385967255 }, { "epoch": 0.19184319936646207, "grad_norm": 1.786871314048767, "learning_rate": 5e-05, "llm_loss": 0.5803475975990295, "loss": 2.7948, "loss_aux_layer_0": 0.025146484375, "loss_aux_layer_1": 0.0673828125, "loss_aux_layer_10": 0.0853271484375, "loss_aux_layer_11": 0.0911865234375, "loss_aux_layer_12": 0.0992431640625, "loss_aux_layer_13": 0.10791015625, "loss_aux_layer_14": 0.12109375, "loss_aux_layer_15": 0.1336669921875, "loss_aux_layer_16": 0.145751953125, "loss_aux_layer_17": 0.15380859375, "loss_aux_layer_18": 0.1640625, "loss_aux_layer_19": 0.16650390625, "loss_aux_layer_2": 0.0750732421875, "loss_aux_layer_20": 0.173095703125, "loss_aux_layer_21": 0.1787109375, "loss_aux_layer_22": 0.198974609375, "loss_aux_layer_23": 0.24072265625, "loss_aux_layer_3": 0.0855712890625, "loss_aux_layer_4": 0.0877685546875, "loss_aux_layer_5": 0.0894775390625, "loss_aux_layer_6": 0.0919189453125, "loss_aux_layer_7": 0.087646484375, "loss_aux_layer_8": 0.0858154296875, "loss_aux_layer_9": 0.0843505859375, "step": 969, "total_loss": 0.6986987292766571 }, { "epoch": 0.19204117996436348, "grad_norm": 1.5962550640106201, "learning_rate": 5e-05, "llm_loss": 0.5657854154706001, "loss": 2.7454, "loss_aux_layer_0": 0.02520751953125, "loss_aux_layer_1": 0.06890869140625, "loss_aux_layer_10": 0.089111328125, "loss_aux_layer_11": 0.094482421875, "loss_aux_layer_12": 0.1024169921875, "loss_aux_layer_13": 0.1103515625, "loss_aux_layer_14": 0.1231689453125, "loss_aux_layer_15": 0.135009765625, "loss_aux_layer_16": 0.1474609375, "loss_aux_layer_17": 0.1552734375, "loss_aux_layer_18": 0.165771484375, "loss_aux_layer_19": 0.16748046875, "loss_aux_layer_2": 0.0767822265625, "loss_aux_layer_20": 0.173583984375, "loss_aux_layer_21": 0.179443359375, "loss_aux_layer_22": 0.200439453125, "loss_aux_layer_23": 0.243408203125, "loss_aux_layer_3": 0.0880126953125, "loss_aux_layer_4": 0.0902099609375, "loss_aux_layer_5": 0.09228515625, "loss_aux_layer_6": 0.095458984375, "loss_aux_layer_7": 0.091064453125, "loss_aux_layer_8": 0.08935546875, "loss_aux_layer_9": 0.087890625, "step": 970, "total_loss": 0.6863449662923813 }, { "epoch": 0.1922391605622649, "grad_norm": 1.2202882766723633, "learning_rate": 5e-05, "llm_loss": 0.7536587417125702, "loss": 3.4934, "loss_aux_layer_0": 0.0247802734375, "loss_aux_layer_1": 0.0684814453125, "loss_aux_layer_10": 0.08837890625, "loss_aux_layer_11": 0.093994140625, "loss_aux_layer_12": 0.10205078125, "loss_aux_layer_13": 0.110107421875, "loss_aux_layer_14": 0.12255859375, "loss_aux_layer_15": 0.13427734375, "loss_aux_layer_16": 0.14599609375, "loss_aux_layer_17": 0.154052734375, "loss_aux_layer_18": 0.163330078125, "loss_aux_layer_19": 0.165771484375, "loss_aux_layer_2": 0.0762939453125, "loss_aux_layer_20": 0.17236328125, "loss_aux_layer_21": 0.1787109375, "loss_aux_layer_22": 0.19921875, "loss_aux_layer_23": 0.240966796875, "loss_aux_layer_3": 0.0877685546875, "loss_aux_layer_4": 0.09033203125, "loss_aux_layer_5": 0.092041015625, "loss_aux_layer_6": 0.0950927734375, "loss_aux_layer_7": 0.0904541015625, "loss_aux_layer_8": 0.088623046875, "loss_aux_layer_9": 0.0870361328125, "step": 971, "total_loss": 0.8733533918857574 }, { "epoch": 0.1924371411601663, "grad_norm": 1.2055954933166504, "learning_rate": 5e-05, "llm_loss": 0.7095299810171127, "loss": 3.3253, "loss_aux_layer_0": 0.025482177734375, "loss_aux_layer_1": 0.0718994140625, "loss_aux_layer_10": 0.0909423828125, "loss_aux_layer_11": 0.0965576171875, "loss_aux_layer_12": 0.1048583984375, "loss_aux_layer_13": 0.1126708984375, "loss_aux_layer_14": 0.125244140625, "loss_aux_layer_15": 0.13671875, "loss_aux_layer_16": 0.1484375, "loss_aux_layer_17": 0.15625, "loss_aux_layer_18": 0.164794921875, "loss_aux_layer_19": 0.16650390625, "loss_aux_layer_2": 0.078857421875, "loss_aux_layer_20": 0.173095703125, "loss_aux_layer_21": 0.179443359375, "loss_aux_layer_22": 0.199462890625, "loss_aux_layer_23": 0.240478515625, "loss_aux_layer_3": 0.0908203125, "loss_aux_layer_4": 0.093505859375, "loss_aux_layer_5": 0.09521484375, "loss_aux_layer_6": 0.0977783203125, "loss_aux_layer_7": 0.093017578125, "loss_aux_layer_8": 0.0911865234375, "loss_aux_layer_9": 0.08935546875, "step": 972, "total_loss": 0.8313156366348267 }, { "epoch": 0.1926351217580677, "grad_norm": 1.150330662727356, "learning_rate": 5e-05, "llm_loss": 0.6182162314653397, "loss": 2.9793, "loss_aux_layer_0": 0.029144287109375, "loss_aux_layer_1": 0.0775146484375, "loss_aux_layer_10": 0.0958251953125, "loss_aux_layer_11": 0.101806640625, "loss_aux_layer_12": 0.1099853515625, "loss_aux_layer_13": 0.1177978515625, "loss_aux_layer_14": 0.1304931640625, "loss_aux_layer_15": 0.141845703125, "loss_aux_layer_16": 0.1533203125, "loss_aux_layer_17": 0.16064453125, "loss_aux_layer_18": 0.16943359375, "loss_aux_layer_19": 0.170166015625, "loss_aux_layer_2": 0.0836181640625, "loss_aux_layer_20": 0.17626953125, "loss_aux_layer_21": 0.1826171875, "loss_aux_layer_22": 0.20458984375, "loss_aux_layer_23": 0.2470703125, "loss_aux_layer_3": 0.095458984375, "loss_aux_layer_4": 0.0980224609375, "loss_aux_layer_5": 0.10009765625, "loss_aux_layer_6": 0.10302734375, "loss_aux_layer_7": 0.0982666015625, "loss_aux_layer_8": 0.0965576171875, "loss_aux_layer_9": 0.0946044921875, "step": 973, "total_loss": 0.7448218911886215 }, { "epoch": 0.19283310235596912, "grad_norm": 1.3709982633590698, "learning_rate": 5e-05, "llm_loss": 0.644666776061058, "loss": 3.0601, "loss_aux_layer_0": 0.026763916015625, "loss_aux_layer_1": 0.06884765625, "loss_aux_layer_10": 0.088623046875, "loss_aux_layer_11": 0.094482421875, "loss_aux_layer_12": 0.1021728515625, "loss_aux_layer_13": 0.1102294921875, "loss_aux_layer_14": 0.123046875, "loss_aux_layer_15": 0.135009765625, "loss_aux_layer_16": 0.147216796875, "loss_aux_layer_17": 0.1552734375, "loss_aux_layer_18": 0.1650390625, "loss_aux_layer_19": 0.167236328125, "loss_aux_layer_2": 0.0758056640625, "loss_aux_layer_20": 0.17333984375, "loss_aux_layer_21": 0.179443359375, "loss_aux_layer_22": 0.200439453125, "loss_aux_layer_23": 0.243408203125, "loss_aux_layer_3": 0.087646484375, "loss_aux_layer_4": 0.08984375, "loss_aux_layer_5": 0.092041015625, "loss_aux_layer_6": 0.09521484375, "loss_aux_layer_7": 0.090576171875, "loss_aux_layer_8": 0.089111328125, "loss_aux_layer_9": 0.0875244140625, "step": 974, "total_loss": 0.765025407075882 }, { "epoch": 0.19303108295387053, "grad_norm": 1.4719198942184448, "learning_rate": 5e-05, "llm_loss": 0.7171565294265747, "loss": 3.3408, "loss_aux_layer_0": 0.024749755859375, "loss_aux_layer_1": 0.0679931640625, "loss_aux_layer_10": 0.08642578125, "loss_aux_layer_11": 0.092529296875, "loss_aux_layer_12": 0.1007080078125, "loss_aux_layer_13": 0.1090087890625, "loss_aux_layer_14": 0.121826171875, "loss_aux_layer_15": 0.13330078125, "loss_aux_layer_16": 0.14453125, "loss_aux_layer_17": 0.152587890625, "loss_aux_layer_18": 0.161865234375, "loss_aux_layer_19": 0.16259765625, "loss_aux_layer_2": 0.0753173828125, "loss_aux_layer_20": 0.169677734375, "loss_aux_layer_21": 0.176025390625, "loss_aux_layer_22": 0.1962890625, "loss_aux_layer_23": 0.236572265625, "loss_aux_layer_3": 0.0872802734375, "loss_aux_layer_4": 0.0894775390625, "loss_aux_layer_5": 0.0906982421875, "loss_aux_layer_6": 0.0928955078125, "loss_aux_layer_7": 0.0882568359375, "loss_aux_layer_8": 0.086669921875, "loss_aux_layer_9": 0.0850830078125, "step": 975, "total_loss": 0.8351921737194061 }, { "epoch": 0.19322906355177194, "grad_norm": 2.280507802963257, "learning_rate": 5e-05, "llm_loss": 0.6361114978790283, "loss": 3.0238, "loss_aux_layer_0": 0.03271484375, "loss_aux_layer_1": 0.0723876953125, "loss_aux_layer_10": 0.08740234375, "loss_aux_layer_11": 0.093017578125, "loss_aux_layer_12": 0.100830078125, "loss_aux_layer_13": 0.1087646484375, "loss_aux_layer_14": 0.1214599609375, "loss_aux_layer_15": 0.13427734375, "loss_aux_layer_16": 0.145751953125, "loss_aux_layer_17": 0.15380859375, "loss_aux_layer_18": 0.16357421875, "loss_aux_layer_19": 0.16650390625, "loss_aux_layer_2": 0.0765380859375, "loss_aux_layer_20": 0.1728515625, "loss_aux_layer_21": 0.178466796875, "loss_aux_layer_22": 0.198974609375, "loss_aux_layer_23": 0.240966796875, "loss_aux_layer_3": 0.087890625, "loss_aux_layer_4": 0.089599609375, "loss_aux_layer_5": 0.092041015625, "loss_aux_layer_6": 0.0946044921875, "loss_aux_layer_7": 0.0899658203125, "loss_aux_layer_8": 0.087890625, "loss_aux_layer_9": 0.086181640625, "step": 976, "total_loss": 0.7559581696987152 }, { "epoch": 0.19342704414967332, "grad_norm": 2.052279233932495, "learning_rate": 5e-05, "llm_loss": 0.7798392176628113, "loss": 3.6179, "loss_aux_layer_0": 0.02630615234375, "loss_aux_layer_1": 0.0733642578125, "loss_aux_layer_10": 0.09326171875, "loss_aux_layer_11": 0.0992431640625, "loss_aux_layer_12": 0.1077880859375, "loss_aux_layer_13": 0.1163330078125, "loss_aux_layer_14": 0.1290283203125, "loss_aux_layer_15": 0.140625, "loss_aux_layer_16": 0.152587890625, "loss_aux_layer_17": 0.160400390625, "loss_aux_layer_18": 0.169189453125, "loss_aux_layer_19": 0.1708984375, "loss_aux_layer_2": 0.0809326171875, "loss_aux_layer_20": 0.176513671875, "loss_aux_layer_21": 0.181640625, "loss_aux_layer_22": 0.202392578125, "loss_aux_layer_23": 0.2431640625, "loss_aux_layer_3": 0.0933837890625, "loss_aux_layer_4": 0.0953369140625, "loss_aux_layer_5": 0.0968017578125, "loss_aux_layer_6": 0.099853515625, "loss_aux_layer_7": 0.095703125, "loss_aux_layer_8": 0.09375, "loss_aux_layer_9": 0.0919189453125, "step": 977, "total_loss": 0.9044651985168457 }, { "epoch": 0.19362502474757473, "grad_norm": 1.5593554973602295, "learning_rate": 5e-05, "llm_loss": 0.611185610294342, "loss": 2.9311, "loss_aux_layer_0": 0.02484130859375, "loss_aux_layer_1": 0.0704345703125, "loss_aux_layer_10": 0.090576171875, "loss_aux_layer_11": 0.096435546875, "loss_aux_layer_12": 0.1041259765625, "loss_aux_layer_13": 0.112060546875, "loss_aux_layer_14": 0.1240234375, "loss_aux_layer_15": 0.135986328125, "loss_aux_layer_16": 0.1474609375, "loss_aux_layer_17": 0.155029296875, "loss_aux_layer_18": 0.164794921875, "loss_aux_layer_19": 0.166748046875, "loss_aux_layer_2": 0.077880859375, "loss_aux_layer_20": 0.173583984375, "loss_aux_layer_21": 0.18017578125, "loss_aux_layer_22": 0.20166015625, "loss_aux_layer_23": 0.2451171875, "loss_aux_layer_3": 0.08984375, "loss_aux_layer_4": 0.0924072265625, "loss_aux_layer_5": 0.0941162109375, "loss_aux_layer_6": 0.0972900390625, "loss_aux_layer_7": 0.0928955078125, "loss_aux_layer_8": 0.091064453125, "loss_aux_layer_9": 0.08935546875, "step": 978, "total_loss": 0.7327786386013031 }, { "epoch": 0.19382300534547614, "grad_norm": 1.4687923192977905, "learning_rate": 5e-05, "llm_loss": 0.6736351847648621, "loss": 3.174, "loss_aux_layer_0": 0.02630615234375, "loss_aux_layer_1": 0.0684814453125, "loss_aux_layer_10": 0.089111328125, "loss_aux_layer_11": 0.094482421875, "loss_aux_layer_12": 0.102783203125, "loss_aux_layer_13": 0.1109619140625, "loss_aux_layer_14": 0.123291015625, "loss_aux_layer_15": 0.13525390625, "loss_aux_layer_16": 0.147216796875, "loss_aux_layer_17": 0.155029296875, "loss_aux_layer_18": 0.163818359375, "loss_aux_layer_19": 0.166015625, "loss_aux_layer_2": 0.0751953125, "loss_aux_layer_20": 0.1728515625, "loss_aux_layer_21": 0.178955078125, "loss_aux_layer_22": 0.20068359375, "loss_aux_layer_23": 0.242431640625, "loss_aux_layer_3": 0.0863037109375, "loss_aux_layer_4": 0.0888671875, "loss_aux_layer_5": 0.0902099609375, "loss_aux_layer_6": 0.0933837890625, "loss_aux_layer_7": 0.08935546875, "loss_aux_layer_8": 0.08837890625, "loss_aux_layer_9": 0.0872802734375, "step": 979, "total_loss": 0.7935103178024292 }, { "epoch": 0.19402098594337755, "grad_norm": 0.8847082853317261, "learning_rate": 5e-05, "llm_loss": 0.6469926536083221, "loss": 3.0815, "loss_aux_layer_0": 0.025390625, "loss_aux_layer_1": 0.07373046875, "loss_aux_layer_10": 0.0943603515625, "loss_aux_layer_11": 0.100341796875, "loss_aux_layer_12": 0.10791015625, "loss_aux_layer_13": 0.1151123046875, "loss_aux_layer_14": 0.126953125, "loss_aux_layer_15": 0.13818359375, "loss_aux_layer_16": 0.1484375, "loss_aux_layer_17": 0.1552734375, "loss_aux_layer_18": 0.16455078125, "loss_aux_layer_19": 0.165283203125, "loss_aux_layer_2": 0.0816650390625, "loss_aux_layer_20": 0.171630859375, "loss_aux_layer_21": 0.177490234375, "loss_aux_layer_22": 0.19921875, "loss_aux_layer_23": 0.240234375, "loss_aux_layer_3": 0.0943603515625, "loss_aux_layer_4": 0.0966796875, "loss_aux_layer_5": 0.0986328125, "loss_aux_layer_6": 0.1015625, "loss_aux_layer_7": 0.0966796875, "loss_aux_layer_8": 0.094482421875, "loss_aux_layer_9": 0.0927734375, "step": 980, "total_loss": 0.7703696638345718 }, { "epoch": 0.19421896654127896, "grad_norm": 1.4768998622894287, "learning_rate": 5e-05, "llm_loss": 0.5749422460794449, "loss": 2.772, "loss_aux_layer_0": 0.027099609375, "loss_aux_layer_1": 0.06707763671875, "loss_aux_layer_10": 0.0869140625, "loss_aux_layer_11": 0.092529296875, "loss_aux_layer_12": 0.100341796875, "loss_aux_layer_13": 0.10791015625, "loss_aux_layer_14": 0.1201171875, "loss_aux_layer_15": 0.13232421875, "loss_aux_layer_16": 0.14306640625, "loss_aux_layer_17": 0.151123046875, "loss_aux_layer_18": 0.16015625, "loss_aux_layer_19": 0.161865234375, "loss_aux_layer_2": 0.0755615234375, "loss_aux_layer_20": 0.1689453125, "loss_aux_layer_21": 0.1767578125, "loss_aux_layer_22": 0.19970703125, "loss_aux_layer_23": 0.242919921875, "loss_aux_layer_3": 0.08642578125, "loss_aux_layer_4": 0.08837890625, "loss_aux_layer_5": 0.090087890625, "loss_aux_layer_6": 0.093017578125, "loss_aux_layer_7": 0.0885009765625, "loss_aux_layer_8": 0.0872802734375, "loss_aux_layer_9": 0.085693359375, "step": 981, "total_loss": 0.6930026710033417 }, { "epoch": 0.19441694713918037, "grad_norm": 2.2135238647460938, "learning_rate": 5e-05, "llm_loss": 0.6566688120365143, "loss": 3.1321, "loss_aux_layer_0": 0.02471923828125, "loss_aux_layer_1": 0.0755615234375, "loss_aux_layer_10": 0.0970458984375, "loss_aux_layer_11": 0.103271484375, "loss_aux_layer_12": 0.1107177734375, "loss_aux_layer_13": 0.1182861328125, "loss_aux_layer_14": 0.129638671875, "loss_aux_layer_15": 0.140380859375, "loss_aux_layer_16": 0.15087890625, "loss_aux_layer_17": 0.15771484375, "loss_aux_layer_18": 0.16650390625, "loss_aux_layer_19": 0.16748046875, "loss_aux_layer_2": 0.0845947265625, "loss_aux_layer_20": 0.17431640625, "loss_aux_layer_21": 0.180908203125, "loss_aux_layer_22": 0.204345703125, "loss_aux_layer_23": 0.246826171875, "loss_aux_layer_3": 0.0975341796875, "loss_aux_layer_4": 0.10009765625, "loss_aux_layer_5": 0.1021728515625, "loss_aux_layer_6": 0.10498046875, "loss_aux_layer_7": 0.0999755859375, "loss_aux_layer_8": 0.0980224609375, "loss_aux_layer_9": 0.0958251953125, "step": 982, "total_loss": 0.7830177545547485 }, { "epoch": 0.19461492773708178, "grad_norm": 2.975057363510132, "learning_rate": 5e-05, "llm_loss": 0.7336403280496597, "loss": 3.4292, "loss_aux_layer_0": 0.028167724609375, "loss_aux_layer_1": 0.0732421875, "loss_aux_layer_10": 0.0936279296875, "loss_aux_layer_11": 0.0997314453125, "loss_aux_layer_12": 0.1080322265625, "loss_aux_layer_13": 0.1153564453125, "loss_aux_layer_14": 0.128173828125, "loss_aux_layer_15": 0.139404296875, "loss_aux_layer_16": 0.150390625, "loss_aux_layer_17": 0.158447265625, "loss_aux_layer_18": 0.1669921875, "loss_aux_layer_19": 0.16748046875, "loss_aux_layer_2": 0.0806884765625, "loss_aux_layer_20": 0.1728515625, "loss_aux_layer_21": 0.177490234375, "loss_aux_layer_22": 0.19677734375, "loss_aux_layer_23": 0.23779296875, "loss_aux_layer_3": 0.09375, "loss_aux_layer_4": 0.096923828125, "loss_aux_layer_5": 0.0986328125, "loss_aux_layer_6": 0.10107421875, "loss_aux_layer_7": 0.0963134765625, "loss_aux_layer_8": 0.093994140625, "loss_aux_layer_9": 0.0924072265625, "step": 983, "total_loss": 0.8573027998209 }, { "epoch": 0.19481290833498316, "grad_norm": 2.2145538330078125, "learning_rate": 5e-05, "llm_loss": 0.6737174540758133, "loss": 3.1933, "loss_aux_layer_0": 0.025238037109375, "loss_aux_layer_1": 0.072509765625, "loss_aux_layer_10": 0.09521484375, "loss_aux_layer_11": 0.10107421875, "loss_aux_layer_12": 0.1085205078125, "loss_aux_layer_13": 0.1165771484375, "loss_aux_layer_14": 0.1282958984375, "loss_aux_layer_15": 0.138916015625, "loss_aux_layer_16": 0.1494140625, "loss_aux_layer_17": 0.1572265625, "loss_aux_layer_18": 0.166748046875, "loss_aux_layer_19": 0.167236328125, "loss_aux_layer_2": 0.085205078125, "loss_aux_layer_20": 0.173095703125, "loss_aux_layer_21": 0.17822265625, "loss_aux_layer_22": 0.198974609375, "loss_aux_layer_23": 0.239501953125, "loss_aux_layer_3": 0.096435546875, "loss_aux_layer_4": 0.0994873046875, "loss_aux_layer_5": 0.1015625, "loss_aux_layer_6": 0.1036376953125, "loss_aux_layer_7": 0.0985107421875, "loss_aux_layer_8": 0.095458984375, "loss_aux_layer_9": 0.09375, "step": 984, "total_loss": 0.7983283549547195 }, { "epoch": 0.19501088893288457, "grad_norm": 1.5881224870681763, "learning_rate": 5e-05, "llm_loss": 0.613254651427269, "loss": 2.9598, "loss_aux_layer_0": 0.025482177734375, "loss_aux_layer_1": 0.077392578125, "loss_aux_layer_10": 0.097412109375, "loss_aux_layer_11": 0.103759765625, "loss_aux_layer_12": 0.111572265625, "loss_aux_layer_13": 0.1195068359375, "loss_aux_layer_14": 0.130859375, "loss_aux_layer_15": 0.1416015625, "loss_aux_layer_16": 0.15185546875, "loss_aux_layer_17": 0.158935546875, "loss_aux_layer_18": 0.16748046875, "loss_aux_layer_19": 0.168212890625, "loss_aux_layer_2": 0.0859375, "loss_aux_layer_20": 0.174072265625, "loss_aux_layer_21": 0.1796875, "loss_aux_layer_22": 0.201171875, "loss_aux_layer_23": 0.24169921875, "loss_aux_layer_3": 0.098876953125, "loss_aux_layer_4": 0.1016845703125, "loss_aux_layer_5": 0.103271484375, "loss_aux_layer_6": 0.10595703125, "loss_aux_layer_7": 0.1007080078125, "loss_aux_layer_8": 0.0982666015625, "loss_aux_layer_9": 0.0960693359375, "step": 985, "total_loss": 0.7399518638849258 }, { "epoch": 0.19520886953078598, "grad_norm": 3.913971424102783, "learning_rate": 5e-05, "llm_loss": 0.618747815489769, "loss": 2.9398, "loss_aux_layer_0": 0.02557373046875, "loss_aux_layer_1": 0.0654296875, "loss_aux_layer_10": 0.0845947265625, "loss_aux_layer_11": 0.08984375, "loss_aux_layer_12": 0.097900390625, "loss_aux_layer_13": 0.10595703125, "loss_aux_layer_14": 0.1182861328125, "loss_aux_layer_15": 0.1297607421875, "loss_aux_layer_16": 0.141845703125, "loss_aux_layer_17": 0.150390625, "loss_aux_layer_18": 0.159912109375, "loss_aux_layer_19": 0.16162109375, "loss_aux_layer_2": 0.0760498046875, "loss_aux_layer_20": 0.168212890625, "loss_aux_layer_21": 0.174072265625, "loss_aux_layer_22": 0.19384765625, "loss_aux_layer_23": 0.2353515625, "loss_aux_layer_3": 0.0850830078125, "loss_aux_layer_4": 0.0877685546875, "loss_aux_layer_5": 0.090576171875, "loss_aux_layer_6": 0.092041015625, "loss_aux_layer_7": 0.086669921875, "loss_aux_layer_8": 0.08447265625, "loss_aux_layer_9": 0.083251953125, "step": 986, "total_loss": 0.7349466681480408 }, { "epoch": 0.1954068501286874, "grad_norm": 3.8826022148132324, "learning_rate": 5e-05, "llm_loss": 0.6487789452075958, "loss": 3.1047, "loss_aux_layer_0": 0.025970458984375, "loss_aux_layer_1": 0.0755615234375, "loss_aux_layer_10": 0.09716796875, "loss_aux_layer_11": 0.103271484375, "loss_aux_layer_12": 0.111572265625, "loss_aux_layer_13": 0.11962890625, "loss_aux_layer_14": 0.1317138671875, "loss_aux_layer_15": 0.142578125, "loss_aux_layer_16": 0.153076171875, "loss_aux_layer_17": 0.16015625, "loss_aux_layer_18": 0.168701171875, "loss_aux_layer_19": 0.170166015625, "loss_aux_layer_2": 0.0877685546875, "loss_aux_layer_20": 0.17578125, "loss_aux_layer_21": 0.18115234375, "loss_aux_layer_22": 0.201904296875, "loss_aux_layer_23": 0.24462890625, "loss_aux_layer_3": 0.1002197265625, "loss_aux_layer_4": 0.1025390625, "loss_aux_layer_5": 0.103759765625, "loss_aux_layer_6": 0.105712890625, "loss_aux_layer_7": 0.1002197265625, "loss_aux_layer_8": 0.0982666015625, "loss_aux_layer_9": 0.0960693359375, "step": 987, "total_loss": 0.7761855572462082 }, { "epoch": 0.1956048307265888, "grad_norm": 2.173517942428589, "learning_rate": 5e-05, "llm_loss": 0.5769026875495911, "loss": 2.7961, "loss_aux_layer_0": 0.02655029296875, "loss_aux_layer_1": 0.0703125, "loss_aux_layer_10": 0.0911865234375, "loss_aux_layer_11": 0.0965576171875, "loss_aux_layer_12": 0.1043701171875, "loss_aux_layer_13": 0.1119384765625, "loss_aux_layer_14": 0.124267578125, "loss_aux_layer_15": 0.13623046875, "loss_aux_layer_16": 0.148193359375, "loss_aux_layer_17": 0.155029296875, "loss_aux_layer_18": 0.164794921875, "loss_aux_layer_19": 0.16748046875, "loss_aux_layer_2": 0.079345703125, "loss_aux_layer_20": 0.17431640625, "loss_aux_layer_21": 0.181640625, "loss_aux_layer_22": 0.203125, "loss_aux_layer_23": 0.246337890625, "loss_aux_layer_3": 0.0909423828125, "loss_aux_layer_4": 0.0931396484375, "loss_aux_layer_5": 0.09521484375, "loss_aux_layer_6": 0.0972900390625, "loss_aux_layer_7": 0.092529296875, "loss_aux_layer_8": 0.0908203125, "loss_aux_layer_9": 0.08984375, "step": 988, "total_loss": 0.6990129053592682 }, { "epoch": 0.1958028113244902, "grad_norm": 1.8278356790542603, "learning_rate": 5e-05, "llm_loss": 0.5979356914758682, "loss": 2.8795, "loss_aux_layer_0": 0.027557373046875, "loss_aux_layer_1": 0.0711669921875, "loss_aux_layer_10": 0.0924072265625, "loss_aux_layer_11": 0.09814453125, "loss_aux_layer_12": 0.105224609375, "loss_aux_layer_13": 0.11279296875, "loss_aux_layer_14": 0.1240234375, "loss_aux_layer_15": 0.1343994140625, "loss_aux_layer_16": 0.1455078125, "loss_aux_layer_17": 0.153076171875, "loss_aux_layer_18": 0.162841796875, "loss_aux_layer_19": 0.16455078125, "loss_aux_layer_2": 0.0819091796875, "loss_aux_layer_20": 0.170166015625, "loss_aux_layer_21": 0.177978515625, "loss_aux_layer_22": 0.198974609375, "loss_aux_layer_23": 0.242919921875, "loss_aux_layer_3": 0.0927734375, "loss_aux_layer_4": 0.0950927734375, "loss_aux_layer_5": 0.096923828125, "loss_aux_layer_6": 0.099365234375, "loss_aux_layer_7": 0.094970703125, "loss_aux_layer_8": 0.093017578125, "loss_aux_layer_9": 0.091064453125, "step": 989, "total_loss": 0.7198836356401443 }, { "epoch": 0.19600079192239162, "grad_norm": 1.7333292961120605, "learning_rate": 5e-05, "llm_loss": 0.6110113263130188, "loss": 2.9332, "loss_aux_layer_0": 0.02581787109375, "loss_aux_layer_1": 0.0714111328125, "loss_aux_layer_10": 0.0909423828125, "loss_aux_layer_11": 0.0966796875, "loss_aux_layer_12": 0.104736328125, "loss_aux_layer_13": 0.113037109375, "loss_aux_layer_14": 0.1258544921875, "loss_aux_layer_15": 0.137451171875, "loss_aux_layer_16": 0.14892578125, "loss_aux_layer_17": 0.157470703125, "loss_aux_layer_18": 0.166015625, "loss_aux_layer_19": 0.167236328125, "loss_aux_layer_2": 0.0821533203125, "loss_aux_layer_20": 0.1728515625, "loss_aux_layer_21": 0.178955078125, "loss_aux_layer_22": 0.2001953125, "loss_aux_layer_23": 0.241943359375, "loss_aux_layer_3": 0.0916748046875, "loss_aux_layer_4": 0.093994140625, "loss_aux_layer_5": 0.09619140625, "loss_aux_layer_6": 0.0977783203125, "loss_aux_layer_7": 0.093017578125, "loss_aux_layer_8": 0.0914306640625, "loss_aux_layer_9": 0.08984375, "step": 990, "total_loss": 0.7333046495914459 }, { "epoch": 0.19619877252029302, "grad_norm": 1.293610692024231, "learning_rate": 5e-05, "llm_loss": 0.6717724949121475, "loss": 3.1717, "loss_aux_layer_0": 0.025054931640625, "loss_aux_layer_1": 0.0718994140625, "loss_aux_layer_10": 0.08984375, "loss_aux_layer_11": 0.095458984375, "loss_aux_layer_12": 0.103271484375, "loss_aux_layer_13": 0.1112060546875, "loss_aux_layer_14": 0.1234130859375, "loss_aux_layer_15": 0.134765625, "loss_aux_layer_16": 0.146240234375, "loss_aux_layer_17": 0.154052734375, "loss_aux_layer_18": 0.16357421875, "loss_aux_layer_19": 0.166015625, "loss_aux_layer_2": 0.078857421875, "loss_aux_layer_20": 0.172607421875, "loss_aux_layer_21": 0.1796875, "loss_aux_layer_22": 0.200927734375, "loss_aux_layer_23": 0.243408203125, "loss_aux_layer_3": 0.0904541015625, "loss_aux_layer_4": 0.0931396484375, "loss_aux_layer_5": 0.094970703125, "loss_aux_layer_6": 0.09765625, "loss_aux_layer_7": 0.092529296875, "loss_aux_layer_8": 0.0904541015625, "loss_aux_layer_9": 0.088623046875, "step": 991, "total_loss": 0.7929237633943558 }, { "epoch": 0.1963967531181944, "grad_norm": 1.7701185941696167, "learning_rate": 5e-05, "llm_loss": 0.5805663168430328, "loss": 2.7934, "loss_aux_layer_0": 0.02532958984375, "loss_aux_layer_1": 0.0682373046875, "loss_aux_layer_10": 0.0860595703125, "loss_aux_layer_11": 0.0914306640625, "loss_aux_layer_12": 0.098876953125, "loss_aux_layer_13": 0.106201171875, "loss_aux_layer_14": 0.118408203125, "loss_aux_layer_15": 0.13037109375, "loss_aux_layer_16": 0.142578125, "loss_aux_layer_17": 0.150146484375, "loss_aux_layer_18": 0.15966796875, "loss_aux_layer_19": 0.163330078125, "loss_aux_layer_2": 0.076416015625, "loss_aux_layer_20": 0.170166015625, "loss_aux_layer_21": 0.177734375, "loss_aux_layer_22": 0.199951171875, "loss_aux_layer_23": 0.243896484375, "loss_aux_layer_3": 0.0867919921875, "loss_aux_layer_4": 0.0887451171875, "loss_aux_layer_5": 0.0904541015625, "loss_aux_layer_6": 0.0927734375, "loss_aux_layer_7": 0.0882568359375, "loss_aux_layer_8": 0.0865478515625, "loss_aux_layer_9": 0.0850830078125, "step": 992, "total_loss": 0.6983400285243988 }, { "epoch": 0.19659473371609582, "grad_norm": 1.7171252965927124, "learning_rate": 5e-05, "llm_loss": 0.6595899239182472, "loss": 3.1422, "loss_aux_layer_0": 0.0257568359375, "loss_aux_layer_1": 0.072265625, "loss_aux_layer_10": 0.09375, "loss_aux_layer_11": 0.099853515625, "loss_aux_layer_12": 0.108154296875, "loss_aux_layer_13": 0.1163330078125, "loss_aux_layer_14": 0.1298828125, "loss_aux_layer_15": 0.141845703125, "loss_aux_layer_16": 0.154052734375, "loss_aux_layer_17": 0.161865234375, "loss_aux_layer_18": 0.17138671875, "loss_aux_layer_19": 0.17333984375, "loss_aux_layer_2": 0.08056640625, "loss_aux_layer_20": 0.180419921875, "loss_aux_layer_21": 0.18701171875, "loss_aux_layer_22": 0.209716796875, "loss_aux_layer_23": 0.25146484375, "loss_aux_layer_3": 0.0924072265625, "loss_aux_layer_4": 0.0948486328125, "loss_aux_layer_5": 0.0965576171875, "loss_aux_layer_6": 0.0999755859375, "loss_aux_layer_7": 0.0955810546875, "loss_aux_layer_8": 0.0936279296875, "loss_aux_layer_9": 0.0924072265625, "step": 993, "total_loss": 0.7855431139469147 }, { "epoch": 0.19679271431399722, "grad_norm": 1.303661584854126, "learning_rate": 5e-05, "llm_loss": 0.5743348896503448, "loss": 2.7853, "loss_aux_layer_0": 0.025970458984375, "loss_aux_layer_1": 0.0699462890625, "loss_aux_layer_10": 0.091796875, "loss_aux_layer_11": 0.09716796875, "loss_aux_layer_12": 0.1051025390625, "loss_aux_layer_13": 0.11279296875, "loss_aux_layer_14": 0.125244140625, "loss_aux_layer_15": 0.13671875, "loss_aux_layer_16": 0.14794921875, "loss_aux_layer_17": 0.15576171875, "loss_aux_layer_18": 0.164794921875, "loss_aux_layer_19": 0.167236328125, "loss_aux_layer_2": 0.0780029296875, "loss_aux_layer_20": 0.173095703125, "loss_aux_layer_21": 0.18017578125, "loss_aux_layer_22": 0.202880859375, "loss_aux_layer_23": 0.24609375, "loss_aux_layer_3": 0.0902099609375, "loss_aux_layer_4": 0.0926513671875, "loss_aux_layer_5": 0.0943603515625, "loss_aux_layer_6": 0.0970458984375, "loss_aux_layer_7": 0.0927734375, "loss_aux_layer_8": 0.091552734375, "loss_aux_layer_9": 0.090087890625, "step": 994, "total_loss": 0.6963335126638412 }, { "epoch": 0.19699069491189863, "grad_norm": 1.5258309841156006, "learning_rate": 5e-05, "llm_loss": 0.6103292405605316, "loss": 2.9287, "loss_aux_layer_0": 0.0252685546875, "loss_aux_layer_1": 0.06982421875, "loss_aux_layer_10": 0.0904541015625, "loss_aux_layer_11": 0.096435546875, "loss_aux_layer_12": 0.104736328125, "loss_aux_layer_13": 0.11279296875, "loss_aux_layer_14": 0.125244140625, "loss_aux_layer_15": 0.13720703125, "loss_aux_layer_16": 0.14892578125, "loss_aux_layer_17": 0.1572265625, "loss_aux_layer_18": 0.166748046875, "loss_aux_layer_19": 0.169189453125, "loss_aux_layer_2": 0.0767822265625, "loss_aux_layer_20": 0.17578125, "loss_aux_layer_21": 0.181640625, "loss_aux_layer_22": 0.20166015625, "loss_aux_layer_23": 0.243408203125, "loss_aux_layer_3": 0.088623046875, "loss_aux_layer_4": 0.091064453125, "loss_aux_layer_5": 0.0933837890625, "loss_aux_layer_6": 0.096435546875, "loss_aux_layer_7": 0.092041015625, "loss_aux_layer_8": 0.090087890625, "loss_aux_layer_9": 0.0892333984375, "step": 995, "total_loss": 0.7321778237819672 }, { "epoch": 0.19718867550980004, "grad_norm": 1.0775295495986938, "learning_rate": 5e-05, "llm_loss": 0.7367961257696152, "loss": 3.4297, "loss_aux_layer_0": 0.025177001953125, "loss_aux_layer_1": 0.0677490234375, "loss_aux_layer_10": 0.0892333984375, "loss_aux_layer_11": 0.0953369140625, "loss_aux_layer_12": 0.1041259765625, "loss_aux_layer_13": 0.112548828125, "loss_aux_layer_14": 0.12548828125, "loss_aux_layer_15": 0.13720703125, "loss_aux_layer_16": 0.148681640625, "loss_aux_layer_17": 0.1572265625, "loss_aux_layer_18": 0.165771484375, "loss_aux_layer_19": 0.167236328125, "loss_aux_layer_2": 0.0753173828125, "loss_aux_layer_20": 0.17333984375, "loss_aux_layer_21": 0.178466796875, "loss_aux_layer_22": 0.199951171875, "loss_aux_layer_23": 0.241943359375, "loss_aux_layer_3": 0.0869140625, "loss_aux_layer_4": 0.08984375, "loss_aux_layer_5": 0.091796875, "loss_aux_layer_6": 0.094482421875, "loss_aux_layer_7": 0.0904541015625, "loss_aux_layer_8": 0.0887451171875, "loss_aux_layer_9": 0.087646484375, "step": 996, "total_loss": 0.8574175089597702 }, { "epoch": 0.19738665610770145, "grad_norm": 1.6038686037063599, "learning_rate": 5e-05, "llm_loss": 0.7245538681745529, "loss": 3.3939, "loss_aux_layer_0": 0.02679443359375, "loss_aux_layer_1": 0.075927734375, "loss_aux_layer_10": 0.0946044921875, "loss_aux_layer_11": 0.100341796875, "loss_aux_layer_12": 0.1077880859375, "loss_aux_layer_13": 0.1153564453125, "loss_aux_layer_14": 0.1270751953125, "loss_aux_layer_15": 0.13818359375, "loss_aux_layer_16": 0.148681640625, "loss_aux_layer_17": 0.15576171875, "loss_aux_layer_18": 0.16455078125, "loss_aux_layer_19": 0.165771484375, "loss_aux_layer_2": 0.08203125, "loss_aux_layer_20": 0.17236328125, "loss_aux_layer_21": 0.177490234375, "loss_aux_layer_22": 0.19873046875, "loss_aux_layer_23": 0.2392578125, "loss_aux_layer_3": 0.0947265625, "loss_aux_layer_4": 0.09814453125, "loss_aux_layer_5": 0.10009765625, "loss_aux_layer_6": 0.10302734375, "loss_aux_layer_7": 0.09814453125, "loss_aux_layer_8": 0.0958251953125, "loss_aux_layer_9": 0.0938720703125, "step": 997, "total_loss": 0.8484873175621033 }, { "epoch": 0.19758463670560286, "grad_norm": 1.0815836191177368, "learning_rate": 5e-05, "llm_loss": 0.6730568706989288, "loss": 3.1784, "loss_aux_layer_0": 0.025970458984375, "loss_aux_layer_1": 0.0694580078125, "loss_aux_layer_10": 0.0916748046875, "loss_aux_layer_11": 0.097412109375, "loss_aux_layer_12": 0.1053466796875, "loss_aux_layer_13": 0.113525390625, "loss_aux_layer_14": 0.125244140625, "loss_aux_layer_15": 0.13623046875, "loss_aux_layer_16": 0.14794921875, "loss_aux_layer_17": 0.1552734375, "loss_aux_layer_18": 0.16357421875, "loss_aux_layer_19": 0.165283203125, "loss_aux_layer_2": 0.0772705078125, "loss_aux_layer_20": 0.17138671875, "loss_aux_layer_21": 0.17822265625, "loss_aux_layer_22": 0.200439453125, "loss_aux_layer_23": 0.242431640625, "loss_aux_layer_3": 0.0902099609375, "loss_aux_layer_4": 0.0928955078125, "loss_aux_layer_5": 0.0946044921875, "loss_aux_layer_6": 0.0975341796875, "loss_aux_layer_7": 0.0931396484375, "loss_aux_layer_8": 0.091552734375, "loss_aux_layer_9": 0.0902099609375, "step": 998, "total_loss": 0.7945896089076996 }, { "epoch": 0.19778261730350424, "grad_norm": 1.8521652221679688, "learning_rate": 5e-05, "llm_loss": 0.6519103348255157, "loss": 3.1026, "loss_aux_layer_0": 0.027496337890625, "loss_aux_layer_1": 0.07275390625, "loss_aux_layer_10": 0.0927734375, "loss_aux_layer_11": 0.0987548828125, "loss_aux_layer_12": 0.106689453125, "loss_aux_layer_13": 0.1142578125, "loss_aux_layer_14": 0.1265869140625, "loss_aux_layer_15": 0.138916015625, "loss_aux_layer_16": 0.150146484375, "loss_aux_layer_17": 0.157958984375, "loss_aux_layer_18": 0.166748046875, "loss_aux_layer_19": 0.167724609375, "loss_aux_layer_2": 0.0806884765625, "loss_aux_layer_20": 0.173828125, "loss_aux_layer_21": 0.18017578125, "loss_aux_layer_22": 0.2021484375, "loss_aux_layer_23": 0.243896484375, "loss_aux_layer_3": 0.0933837890625, "loss_aux_layer_4": 0.0960693359375, "loss_aux_layer_5": 0.097412109375, "loss_aux_layer_6": 0.100341796875, "loss_aux_layer_7": 0.0955810546875, "loss_aux_layer_8": 0.0936279296875, "loss_aux_layer_9": 0.0919189453125, "step": 999, "total_loss": 0.7756389379501343 }, { "epoch": 0.19798059790140565, "grad_norm": 0.8703190088272095, "learning_rate": 5e-05, "llm_loss": 0.6978929787874222, "loss": 3.2819, "loss_aux_layer_0": 0.026123046875, "loss_aux_layer_1": 0.0699462890625, "loss_aux_layer_10": 0.0911865234375, "loss_aux_layer_11": 0.096923828125, "loss_aux_layer_12": 0.10546875, "loss_aux_layer_13": 0.11328125, "loss_aux_layer_14": 0.1260986328125, "loss_aux_layer_15": 0.137451171875, "loss_aux_layer_16": 0.14892578125, "loss_aux_layer_17": 0.157470703125, "loss_aux_layer_18": 0.167236328125, "loss_aux_layer_19": 0.16845703125, "loss_aux_layer_2": 0.0787353515625, "loss_aux_layer_20": 0.174560546875, "loss_aux_layer_21": 0.181396484375, "loss_aux_layer_22": 0.203369140625, "loss_aux_layer_23": 0.24609375, "loss_aux_layer_3": 0.0908203125, "loss_aux_layer_4": 0.0928955078125, "loss_aux_layer_5": 0.094482421875, "loss_aux_layer_6": 0.09716796875, "loss_aux_layer_7": 0.0927734375, "loss_aux_layer_8": 0.09130859375, "loss_aux_layer_9": 0.089599609375, "step": 1000, "total_loss": 0.8204749673604965 }, { "epoch": 0.19817857849930706, "grad_norm": 1.7471532821655273, "learning_rate": 5e-05, "llm_loss": 0.6008066684007645, "loss": 2.8872, "loss_aux_layer_0": 0.02679443359375, "loss_aux_layer_1": 0.07080078125, "loss_aux_layer_10": 0.0892333984375, "loss_aux_layer_11": 0.0955810546875, "loss_aux_layer_12": 0.104248046875, "loss_aux_layer_13": 0.11181640625, "loss_aux_layer_14": 0.124267578125, "loss_aux_layer_15": 0.13525390625, "loss_aux_layer_16": 0.146728515625, "loss_aux_layer_17": 0.15478515625, "loss_aux_layer_18": 0.1640625, "loss_aux_layer_19": 0.165771484375, "loss_aux_layer_2": 0.0775146484375, "loss_aux_layer_20": 0.17236328125, "loss_aux_layer_21": 0.179443359375, "loss_aux_layer_22": 0.201904296875, "loss_aux_layer_23": 0.24560546875, "loss_aux_layer_3": 0.089599609375, "loss_aux_layer_4": 0.091796875, "loss_aux_layer_5": 0.0931396484375, "loss_aux_layer_6": 0.095458984375, "loss_aux_layer_7": 0.0908203125, "loss_aux_layer_8": 0.0888671875, "loss_aux_layer_9": 0.0875244140625, "step": 1001, "total_loss": 0.721795380115509 }, { "epoch": 0.19837655909720847, "grad_norm": 1.6184782981872559, "learning_rate": 5e-05, "llm_loss": 0.6511450558900833, "loss": 3.0839, "loss_aux_layer_0": 0.025970458984375, "loss_aux_layer_1": 0.0657958984375, "loss_aux_layer_10": 0.087158203125, "loss_aux_layer_11": 0.0928955078125, "loss_aux_layer_12": 0.1014404296875, "loss_aux_layer_13": 0.1104736328125, "loss_aux_layer_14": 0.123779296875, "loss_aux_layer_15": 0.135986328125, "loss_aux_layer_16": 0.14794921875, "loss_aux_layer_17": 0.156005859375, "loss_aux_layer_18": 0.166015625, "loss_aux_layer_19": 0.1689453125, "loss_aux_layer_2": 0.0732421875, "loss_aux_layer_20": 0.17529296875, "loss_aux_layer_21": 0.181396484375, "loss_aux_layer_22": 0.203369140625, "loss_aux_layer_23": 0.247802734375, "loss_aux_layer_3": 0.0849609375, "loss_aux_layer_4": 0.0872802734375, "loss_aux_layer_5": 0.0889892578125, "loss_aux_layer_6": 0.0916748046875, "loss_aux_layer_7": 0.087890625, "loss_aux_layer_8": 0.08642578125, "loss_aux_layer_9": 0.08544921875, "step": 1002, "total_loss": 0.770986795425415 }, { "epoch": 0.19857453969510988, "grad_norm": 1.3439595699310303, "learning_rate": 5e-05, "llm_loss": 0.5015664994716644, "loss": 2.4887, "loss_aux_layer_0": 0.0255126953125, "loss_aux_layer_1": 0.0689697265625, "loss_aux_layer_10": 0.090087890625, "loss_aux_layer_11": 0.0955810546875, "loss_aux_layer_12": 0.1029052734375, "loss_aux_layer_13": 0.1104736328125, "loss_aux_layer_14": 0.1231689453125, "loss_aux_layer_15": 0.134521484375, "loss_aux_layer_16": 0.14599609375, "loss_aux_layer_17": 0.154052734375, "loss_aux_layer_18": 0.163818359375, "loss_aux_layer_19": 0.165771484375, "loss_aux_layer_2": 0.0771484375, "loss_aux_layer_20": 0.172607421875, "loss_aux_layer_21": 0.1787109375, "loss_aux_layer_22": 0.19970703125, "loss_aux_layer_23": 0.24267578125, "loss_aux_layer_3": 0.0894775390625, "loss_aux_layer_4": 0.092041015625, "loss_aux_layer_5": 0.0936279296875, "loss_aux_layer_6": 0.0965576171875, "loss_aux_layer_7": 0.0924072265625, "loss_aux_layer_8": 0.090576171875, "loss_aux_layer_9": 0.0888671875, "step": 1003, "total_loss": 0.6221870630979538 }, { "epoch": 0.1987725202930113, "grad_norm": 1.5835082530975342, "learning_rate": 5e-05, "llm_loss": 0.5787209123373032, "loss": 2.8071, "loss_aux_layer_0": 0.025146484375, "loss_aux_layer_1": 0.071533203125, "loss_aux_layer_10": 0.0921630859375, "loss_aux_layer_11": 0.0985107421875, "loss_aux_layer_12": 0.1065673828125, "loss_aux_layer_13": 0.1148681640625, "loss_aux_layer_14": 0.1268310546875, "loss_aux_layer_15": 0.137939453125, "loss_aux_layer_16": 0.148681640625, "loss_aux_layer_17": 0.156005859375, "loss_aux_layer_18": 0.1650390625, "loss_aux_layer_19": 0.166748046875, "loss_aux_layer_2": 0.080078125, "loss_aux_layer_20": 0.173095703125, "loss_aux_layer_21": 0.1796875, "loss_aux_layer_22": 0.200927734375, "loss_aux_layer_23": 0.244873046875, "loss_aux_layer_3": 0.093017578125, "loss_aux_layer_4": 0.095458984375, "loss_aux_layer_5": 0.09716796875, "loss_aux_layer_6": 0.0997314453125, "loss_aux_layer_7": 0.094970703125, "loss_aux_layer_8": 0.0928955078125, "loss_aux_layer_9": 0.0914306640625, "step": 1004, "total_loss": 0.7017770111560822 }, { "epoch": 0.1989705008909127, "grad_norm": 1.7775917053222656, "learning_rate": 5e-05, "llm_loss": 0.6703303754329681, "loss": 3.1558, "loss_aux_layer_0": 0.024017333984375, "loss_aux_layer_1": 0.068359375, "loss_aux_layer_10": 0.08837890625, "loss_aux_layer_11": 0.0938720703125, "loss_aux_layer_12": 0.10107421875, "loss_aux_layer_13": 0.10888671875, "loss_aux_layer_14": 0.1209716796875, "loss_aux_layer_15": 0.133056640625, "loss_aux_layer_16": 0.14501953125, "loss_aux_layer_17": 0.152587890625, "loss_aux_layer_18": 0.1630859375, "loss_aux_layer_19": 0.16455078125, "loss_aux_layer_2": 0.075439453125, "loss_aux_layer_20": 0.1708984375, "loss_aux_layer_21": 0.1767578125, "loss_aux_layer_22": 0.19677734375, "loss_aux_layer_23": 0.237060546875, "loss_aux_layer_3": 0.087158203125, "loss_aux_layer_4": 0.08935546875, "loss_aux_layer_5": 0.09130859375, "loss_aux_layer_6": 0.093505859375, "loss_aux_layer_7": 0.089599609375, "loss_aux_layer_8": 0.0880126953125, "loss_aux_layer_9": 0.0869140625, "step": 1005, "total_loss": 0.7889598309993744 }, { "epoch": 0.1991684814888141, "grad_norm": 1.3295234441757202, "learning_rate": 5e-05, "llm_loss": 0.5493840798735619, "loss": 2.7051, "loss_aux_layer_0": 0.024810791015625, "loss_aux_layer_1": 0.07568359375, "loss_aux_layer_10": 0.0970458984375, "loss_aux_layer_11": 0.1031494140625, "loss_aux_layer_12": 0.1103515625, "loss_aux_layer_13": 0.11767578125, "loss_aux_layer_14": 0.129638671875, "loss_aux_layer_15": 0.140869140625, "loss_aux_layer_16": 0.151611328125, "loss_aux_layer_17": 0.157470703125, "loss_aux_layer_18": 0.1669921875, "loss_aux_layer_19": 0.16845703125, "loss_aux_layer_2": 0.085693359375, "loss_aux_layer_20": 0.1748046875, "loss_aux_layer_21": 0.182373046875, "loss_aux_layer_22": 0.20458984375, "loss_aux_layer_23": 0.24853515625, "loss_aux_layer_3": 0.098876953125, "loss_aux_layer_4": 0.1015625, "loss_aux_layer_5": 0.1033935546875, "loss_aux_layer_6": 0.106201171875, "loss_aux_layer_7": 0.10107421875, "loss_aux_layer_8": 0.0987548828125, "loss_aux_layer_9": 0.096435546875, "step": 1006, "total_loss": 0.676283672451973 }, { "epoch": 0.1993664620867155, "grad_norm": 1.4986562728881836, "learning_rate": 5e-05, "llm_loss": 0.6638448238372803, "loss": 3.1474, "loss_aux_layer_0": 0.02471923828125, "loss_aux_layer_1": 0.069580078125, "loss_aux_layer_10": 0.0936279296875, "loss_aux_layer_11": 0.099609375, "loss_aux_layer_12": 0.107421875, "loss_aux_layer_13": 0.114990234375, "loss_aux_layer_14": 0.126708984375, "loss_aux_layer_15": 0.138671875, "loss_aux_layer_16": 0.150146484375, "loss_aux_layer_17": 0.157470703125, "loss_aux_layer_18": 0.166015625, "loss_aux_layer_19": 0.16748046875, "loss_aux_layer_2": 0.0784912109375, "loss_aux_layer_20": 0.173583984375, "loss_aux_layer_21": 0.179443359375, "loss_aux_layer_22": 0.201171875, "loss_aux_layer_23": 0.242919921875, "loss_aux_layer_3": 0.091552734375, "loss_aux_layer_4": 0.0941162109375, "loss_aux_layer_5": 0.0958251953125, "loss_aux_layer_6": 0.0986328125, "loss_aux_layer_7": 0.0946044921875, "loss_aux_layer_8": 0.0933837890625, "loss_aux_layer_9": 0.092041015625, "step": 1007, "total_loss": 0.7868430763483047 }, { "epoch": 0.1995644426846169, "grad_norm": 1.5175613164901733, "learning_rate": 5e-05, "llm_loss": 0.610330730676651, "loss": 2.9201, "loss_aux_layer_0": 0.02593994140625, "loss_aux_layer_1": 0.0701904296875, "loss_aux_layer_10": 0.0889892578125, "loss_aux_layer_11": 0.094482421875, "loss_aux_layer_12": 0.1021728515625, "loss_aux_layer_13": 0.1097412109375, "loss_aux_layer_14": 0.1224365234375, "loss_aux_layer_15": 0.133544921875, "loss_aux_layer_16": 0.145263671875, "loss_aux_layer_17": 0.15234375, "loss_aux_layer_18": 0.161865234375, "loss_aux_layer_19": 0.1630859375, "loss_aux_layer_2": 0.0770263671875, "loss_aux_layer_20": 0.169677734375, "loss_aux_layer_21": 0.17724609375, "loss_aux_layer_22": 0.1982421875, "loss_aux_layer_23": 0.240478515625, "loss_aux_layer_3": 0.08935546875, "loss_aux_layer_4": 0.091552734375, "loss_aux_layer_5": 0.0938720703125, "loss_aux_layer_6": 0.09619140625, "loss_aux_layer_7": 0.0914306640625, "loss_aux_layer_8": 0.0894775390625, "loss_aux_layer_9": 0.087646484375, "step": 1008, "total_loss": 0.7300287932157516 }, { "epoch": 0.1997624232825183, "grad_norm": 1.2698837518692017, "learning_rate": 5e-05, "llm_loss": 0.5662638396024704, "loss": 2.7467, "loss_aux_layer_0": 0.025299072265625, "loss_aux_layer_1": 0.0693359375, "loss_aux_layer_10": 0.0897216796875, "loss_aux_layer_11": 0.0955810546875, "loss_aux_layer_12": 0.1036376953125, "loss_aux_layer_13": 0.112060546875, "loss_aux_layer_14": 0.12451171875, "loss_aux_layer_15": 0.1357421875, "loss_aux_layer_16": 0.1474609375, "loss_aux_layer_17": 0.15478515625, "loss_aux_layer_18": 0.163818359375, "loss_aux_layer_19": 0.16552734375, "loss_aux_layer_2": 0.076904296875, "loss_aux_layer_20": 0.17138671875, "loss_aux_layer_21": 0.17822265625, "loss_aux_layer_22": 0.197998046875, "loss_aux_layer_23": 0.239501953125, "loss_aux_layer_3": 0.0889892578125, "loss_aux_layer_4": 0.0911865234375, "loss_aux_layer_5": 0.0926513671875, "loss_aux_layer_6": 0.0955810546875, "loss_aux_layer_7": 0.0916748046875, "loss_aux_layer_8": 0.08984375, "loss_aux_layer_9": 0.0882568359375, "step": 1009, "total_loss": 0.6866818964481354 }, { "epoch": 0.19996040388041972, "grad_norm": 1.738901972770691, "learning_rate": 5e-05, "llm_loss": 0.5928874611854553, "loss": 2.8543, "loss_aux_layer_0": 0.024444580078125, "loss_aux_layer_1": 0.0679931640625, "loss_aux_layer_10": 0.08984375, "loss_aux_layer_11": 0.09619140625, "loss_aux_layer_12": 0.104736328125, "loss_aux_layer_13": 0.1136474609375, "loss_aux_layer_14": 0.126220703125, "loss_aux_layer_15": 0.1376953125, "loss_aux_layer_16": 0.149169921875, "loss_aux_layer_17": 0.1572265625, "loss_aux_layer_18": 0.16650390625, "loss_aux_layer_19": 0.1669921875, "loss_aux_layer_2": 0.0765380859375, "loss_aux_layer_20": 0.171630859375, "loss_aux_layer_21": 0.177490234375, "loss_aux_layer_22": 0.196533203125, "loss_aux_layer_23": 0.237060546875, "loss_aux_layer_3": 0.088623046875, "loss_aux_layer_4": 0.0909423828125, "loss_aux_layer_5": 0.092529296875, "loss_aux_layer_6": 0.09521484375, "loss_aux_layer_7": 0.0914306640625, "loss_aux_layer_8": 0.090087890625, "loss_aux_layer_9": 0.0885009765625, "step": 1010, "total_loss": 0.7135702073574066 }, { "epoch": 0.20015838447832113, "grad_norm": 1.5264983177185059, "learning_rate": 5e-05, "llm_loss": 0.5849151313304901, "loss": 2.829, "loss_aux_layer_0": 0.029876708984375, "loss_aux_layer_1": 0.0740966796875, "loss_aux_layer_10": 0.091796875, "loss_aux_layer_11": 0.0975341796875, "loss_aux_layer_12": 0.1051025390625, "loss_aux_layer_13": 0.1129150390625, "loss_aux_layer_14": 0.124755859375, "loss_aux_layer_15": 0.136474609375, "loss_aux_layer_16": 0.147705078125, "loss_aux_layer_17": 0.155517578125, "loss_aux_layer_18": 0.16455078125, "loss_aux_layer_19": 0.166015625, "loss_aux_layer_2": 0.0791015625, "loss_aux_layer_20": 0.172607421875, "loss_aux_layer_21": 0.17919921875, "loss_aux_layer_22": 0.199951171875, "loss_aux_layer_23": 0.2421875, "loss_aux_layer_3": 0.091796875, "loss_aux_layer_4": 0.0941162109375, "loss_aux_layer_5": 0.095703125, "loss_aux_layer_6": 0.0985107421875, "loss_aux_layer_7": 0.093994140625, "loss_aux_layer_8": 0.0921630859375, "loss_aux_layer_9": 0.0906982421875, "step": 1011, "total_loss": 0.7072598338127136 }, { "epoch": 0.20035636507622254, "grad_norm": 1.1480296850204468, "learning_rate": 5e-05, "llm_loss": 0.6843364983797073, "loss": 3.2099, "loss_aux_layer_0": 0.025604248046875, "loss_aux_layer_1": 0.0672607421875, "loss_aux_layer_10": 0.0877685546875, "loss_aux_layer_11": 0.0933837890625, "loss_aux_layer_12": 0.101318359375, "loss_aux_layer_13": 0.109619140625, "loss_aux_layer_14": 0.12158203125, "loss_aux_layer_15": 0.1328125, "loss_aux_layer_16": 0.144775390625, "loss_aux_layer_17": 0.15283203125, "loss_aux_layer_18": 0.162353515625, "loss_aux_layer_19": 0.163330078125, "loss_aux_layer_2": 0.073974609375, "loss_aux_layer_20": 0.16943359375, "loss_aux_layer_21": 0.17529296875, "loss_aux_layer_22": 0.195068359375, "loss_aux_layer_23": 0.2373046875, "loss_aux_layer_3": 0.086181640625, "loss_aux_layer_4": 0.088623046875, "loss_aux_layer_5": 0.090576171875, "loss_aux_layer_6": 0.093505859375, "loss_aux_layer_7": 0.0889892578125, "loss_aux_layer_8": 0.0877685546875, "loss_aux_layer_9": 0.0863037109375, "step": 1012, "total_loss": 0.8024829179048538 }, { "epoch": 0.20055434567412395, "grad_norm": 1.6391719579696655, "learning_rate": 5e-05, "llm_loss": 0.6111386716365814, "loss": 2.9462, "loss_aux_layer_0": 0.024993896484375, "loss_aux_layer_1": 0.0723876953125, "loss_aux_layer_10": 0.0946044921875, "loss_aux_layer_11": 0.10107421875, "loss_aux_layer_12": 0.1090087890625, "loss_aux_layer_13": 0.117431640625, "loss_aux_layer_14": 0.129638671875, "loss_aux_layer_15": 0.141357421875, "loss_aux_layer_16": 0.15283203125, "loss_aux_layer_17": 0.160400390625, "loss_aux_layer_18": 0.16845703125, "loss_aux_layer_19": 0.169189453125, "loss_aux_layer_2": 0.081787109375, "loss_aux_layer_20": 0.17529296875, "loss_aux_layer_21": 0.18212890625, "loss_aux_layer_22": 0.203857421875, "loss_aux_layer_23": 0.24609375, "loss_aux_layer_3": 0.0947265625, "loss_aux_layer_4": 0.0977783203125, "loss_aux_layer_5": 0.09912109375, "loss_aux_layer_6": 0.101806640625, "loss_aux_layer_7": 0.0970458984375, "loss_aux_layer_8": 0.094970703125, "loss_aux_layer_9": 0.093505859375, "step": 1013, "total_loss": 0.7365429252386093 }, { "epoch": 0.20075232627202533, "grad_norm": 1.4163784980773926, "learning_rate": 5e-05, "llm_loss": 0.6225013956427574, "loss": 2.9662, "loss_aux_layer_0": 0.024658203125, "loss_aux_layer_1": 0.0679931640625, "loss_aux_layer_10": 0.08837890625, "loss_aux_layer_11": 0.093994140625, "loss_aux_layer_12": 0.101806640625, "loss_aux_layer_13": 0.109619140625, "loss_aux_layer_14": 0.1219482421875, "loss_aux_layer_15": 0.13427734375, "loss_aux_layer_16": 0.145751953125, "loss_aux_layer_17": 0.15380859375, "loss_aux_layer_18": 0.1630859375, "loss_aux_layer_19": 0.164794921875, "loss_aux_layer_2": 0.0758056640625, "loss_aux_layer_20": 0.171875, "loss_aux_layer_21": 0.177001953125, "loss_aux_layer_22": 0.196533203125, "loss_aux_layer_23": 0.2373046875, "loss_aux_layer_3": 0.087646484375, "loss_aux_layer_4": 0.090087890625, "loss_aux_layer_5": 0.0916748046875, "loss_aux_layer_6": 0.0943603515625, "loss_aux_layer_7": 0.090087890625, "loss_aux_layer_8": 0.0885009765625, "loss_aux_layer_9": 0.0869140625, "step": 1014, "total_loss": 0.7415439635515213 }, { "epoch": 0.20095030686992674, "grad_norm": 1.2760003805160522, "learning_rate": 5e-05, "llm_loss": 0.6994903236627579, "loss": 3.2813, "loss_aux_layer_0": 0.02618408203125, "loss_aux_layer_1": 0.0712890625, "loss_aux_layer_10": 0.0914306640625, "loss_aux_layer_11": 0.09716796875, "loss_aux_layer_12": 0.1051025390625, "loss_aux_layer_13": 0.11279296875, "loss_aux_layer_14": 0.1241455078125, "loss_aux_layer_15": 0.135498046875, "loss_aux_layer_16": 0.146728515625, "loss_aux_layer_17": 0.155029296875, "loss_aux_layer_18": 0.16357421875, "loss_aux_layer_19": 0.164306640625, "loss_aux_layer_2": 0.078125, "loss_aux_layer_20": 0.170654296875, "loss_aux_layer_21": 0.176025390625, "loss_aux_layer_22": 0.195556640625, "loss_aux_layer_23": 0.234619140625, "loss_aux_layer_3": 0.0908203125, "loss_aux_layer_4": 0.093505859375, "loss_aux_layer_5": 0.0950927734375, "loss_aux_layer_6": 0.0977783203125, "loss_aux_layer_7": 0.093505859375, "loss_aux_layer_8": 0.091796875, "loss_aux_layer_9": 0.0899658203125, "step": 1015, "total_loss": 0.8203303962945938 }, { "epoch": 0.20114828746782815, "grad_norm": 1.6511261463165283, "learning_rate": 5e-05, "llm_loss": 0.6039071530103683, "loss": 2.8887, "loss_aux_layer_0": 0.0283203125, "loss_aux_layer_1": 0.0677490234375, "loss_aux_layer_10": 0.0872802734375, "loss_aux_layer_11": 0.0924072265625, "loss_aux_layer_12": 0.099853515625, "loss_aux_layer_13": 0.1075439453125, "loss_aux_layer_14": 0.1199951171875, "loss_aux_layer_15": 0.131591796875, "loss_aux_layer_16": 0.1435546875, "loss_aux_layer_17": 0.152099609375, "loss_aux_layer_18": 0.161865234375, "loss_aux_layer_19": 0.1640625, "loss_aux_layer_2": 0.0740966796875, "loss_aux_layer_20": 0.170166015625, "loss_aux_layer_21": 0.176513671875, "loss_aux_layer_22": 0.19775390625, "loss_aux_layer_23": 0.2412109375, "loss_aux_layer_3": 0.08642578125, "loss_aux_layer_4": 0.0889892578125, "loss_aux_layer_5": 0.0908203125, "loss_aux_layer_6": 0.0936279296875, "loss_aux_layer_7": 0.089111328125, "loss_aux_layer_8": 0.0875244140625, "loss_aux_layer_9": 0.0860595703125, "step": 1016, "total_loss": 0.7221684902906418 }, { "epoch": 0.20134626806572956, "grad_norm": 1.582266092300415, "learning_rate": 5e-05, "llm_loss": 0.5972115024924278, "loss": 2.9038, "loss_aux_layer_0": 0.028076171875, "loss_aux_layer_1": 0.0751953125, "loss_aux_layer_10": 0.097412109375, "loss_aux_layer_11": 0.1033935546875, "loss_aux_layer_12": 0.1114501953125, "loss_aux_layer_13": 0.1199951171875, "loss_aux_layer_14": 0.132568359375, "loss_aux_layer_15": 0.14404296875, "loss_aux_layer_16": 0.15576171875, "loss_aux_layer_17": 0.162841796875, "loss_aux_layer_18": 0.172607421875, "loss_aux_layer_19": 0.17333984375, "loss_aux_layer_2": 0.0836181640625, "loss_aux_layer_20": 0.179931640625, "loss_aux_layer_21": 0.18798828125, "loss_aux_layer_22": 0.211669921875, "loss_aux_layer_23": 0.255126953125, "loss_aux_layer_3": 0.0965576171875, "loss_aux_layer_4": 0.099365234375, "loss_aux_layer_5": 0.1014404296875, "loss_aux_layer_6": 0.1041259765625, "loss_aux_layer_7": 0.0992431640625, "loss_aux_layer_8": 0.0975341796875, "loss_aux_layer_9": 0.095703125, "step": 1017, "total_loss": 0.7259445041418076 }, { "epoch": 0.20154424866363096, "grad_norm": 1.073304295539856, "learning_rate": 5e-05, "llm_loss": 0.516712561249733, "loss": 2.5417, "loss_aux_layer_0": 0.025238037109375, "loss_aux_layer_1": 0.0675048828125, "loss_aux_layer_10": 0.0888671875, "loss_aux_layer_11": 0.0946044921875, "loss_aux_layer_12": 0.10205078125, "loss_aux_layer_13": 0.1094970703125, "loss_aux_layer_14": 0.120849609375, "loss_aux_layer_15": 0.1322021484375, "loss_aux_layer_16": 0.143310546875, "loss_aux_layer_17": 0.1513671875, "loss_aux_layer_18": 0.1611328125, "loss_aux_layer_19": 0.16259765625, "loss_aux_layer_2": 0.0753173828125, "loss_aux_layer_20": 0.168701171875, "loss_aux_layer_21": 0.17626953125, "loss_aux_layer_22": 0.197998046875, "loss_aux_layer_23": 0.2412109375, "loss_aux_layer_3": 0.087646484375, "loss_aux_layer_4": 0.090087890625, "loss_aux_layer_5": 0.0919189453125, "loss_aux_layer_6": 0.0946044921875, "loss_aux_layer_7": 0.09033203125, "loss_aux_layer_8": 0.088623046875, "loss_aux_layer_9": 0.08740234375, "step": 1018, "total_loss": 0.635427862405777 }, { "epoch": 0.20174222926153237, "grad_norm": 1.3762400150299072, "learning_rate": 5e-05, "llm_loss": 0.6274792030453682, "loss": 3.0033, "loss_aux_layer_0": 0.028594970703125, "loss_aux_layer_1": 0.072265625, "loss_aux_layer_10": 0.091796875, "loss_aux_layer_11": 0.09765625, "loss_aux_layer_12": 0.1053466796875, "loss_aux_layer_13": 0.1138916015625, "loss_aux_layer_14": 0.1268310546875, "loss_aux_layer_15": 0.138916015625, "loss_aux_layer_16": 0.150634765625, "loss_aux_layer_17": 0.158447265625, "loss_aux_layer_18": 0.1689453125, "loss_aux_layer_19": 0.1708984375, "loss_aux_layer_2": 0.078857421875, "loss_aux_layer_20": 0.176513671875, "loss_aux_layer_21": 0.181884765625, "loss_aux_layer_22": 0.201171875, "loss_aux_layer_23": 0.242431640625, "loss_aux_layer_3": 0.0908203125, "loss_aux_layer_4": 0.0936279296875, "loss_aux_layer_5": 0.094970703125, "loss_aux_layer_6": 0.0982666015625, "loss_aux_layer_7": 0.0941162109375, "loss_aux_layer_8": 0.0924072265625, "loss_aux_layer_9": 0.090576171875, "step": 1019, "total_loss": 0.7508236169815063 }, { "epoch": 0.20194020985943378, "grad_norm": 1.5580662488937378, "learning_rate": 5e-05, "llm_loss": 0.6976830810308456, "loss": 3.2694, "loss_aux_layer_0": 0.024932861328125, "loss_aux_layer_1": 0.068115234375, "loss_aux_layer_10": 0.08935546875, "loss_aux_layer_11": 0.0950927734375, "loss_aux_layer_12": 0.1026611328125, "loss_aux_layer_13": 0.1102294921875, "loss_aux_layer_14": 0.122314453125, "loss_aux_layer_15": 0.134521484375, "loss_aux_layer_16": 0.145751953125, "loss_aux_layer_17": 0.154052734375, "loss_aux_layer_18": 0.16259765625, "loss_aux_layer_19": 0.16455078125, "loss_aux_layer_2": 0.075927734375, "loss_aux_layer_20": 0.171142578125, "loss_aux_layer_21": 0.1767578125, "loss_aux_layer_22": 0.19775390625, "loss_aux_layer_23": 0.238525390625, "loss_aux_layer_3": 0.0885009765625, "loss_aux_layer_4": 0.091064453125, "loss_aux_layer_5": 0.0927734375, "loss_aux_layer_6": 0.0958251953125, "loss_aux_layer_7": 0.091552734375, "loss_aux_layer_8": 0.08984375, "loss_aux_layer_9": 0.0880126953125, "step": 1020, "total_loss": 0.8173388689756393 }, { "epoch": 0.2021381904573352, "grad_norm": 1.333916425704956, "learning_rate": 5e-05, "llm_loss": 0.5879855901002884, "loss": 2.8604, "loss_aux_layer_0": 0.024749755859375, "loss_aux_layer_1": 0.073486328125, "loss_aux_layer_10": 0.09814453125, "loss_aux_layer_11": 0.104736328125, "loss_aux_layer_12": 0.112548828125, "loss_aux_layer_13": 0.120361328125, "loss_aux_layer_14": 0.13232421875, "loss_aux_layer_15": 0.1435546875, "loss_aux_layer_16": 0.154296875, "loss_aux_layer_17": 0.1611328125, "loss_aux_layer_18": 0.16943359375, "loss_aux_layer_19": 0.168212890625, "loss_aux_layer_2": 0.0845947265625, "loss_aux_layer_20": 0.17333984375, "loss_aux_layer_21": 0.180419921875, "loss_aux_layer_22": 0.20361328125, "loss_aux_layer_23": 0.2451171875, "loss_aux_layer_3": 0.09814453125, "loss_aux_layer_4": 0.1005859375, "loss_aux_layer_5": 0.1015625, "loss_aux_layer_6": 0.1043701171875, "loss_aux_layer_7": 0.1002197265625, "loss_aux_layer_8": 0.0986328125, "loss_aux_layer_9": 0.096435546875, "step": 1021, "total_loss": 0.7150909900665283 }, { "epoch": 0.20233617105523657, "grad_norm": 1.6460310220718384, "learning_rate": 5e-05, "llm_loss": 0.618524506688118, "loss": 2.9677, "loss_aux_layer_0": 0.0257568359375, "loss_aux_layer_1": 0.0699462890625, "loss_aux_layer_10": 0.0921630859375, "loss_aux_layer_11": 0.0982666015625, "loss_aux_layer_12": 0.1070556640625, "loss_aux_layer_13": 0.115966796875, "loss_aux_layer_14": 0.1290283203125, "loss_aux_layer_15": 0.14111328125, "loss_aux_layer_16": 0.15185546875, "loss_aux_layer_17": 0.15966796875, "loss_aux_layer_18": 0.169189453125, "loss_aux_layer_19": 0.169921875, "loss_aux_layer_2": 0.0777587890625, "loss_aux_layer_20": 0.17578125, "loss_aux_layer_21": 0.18115234375, "loss_aux_layer_22": 0.201416015625, "loss_aux_layer_23": 0.242431640625, "loss_aux_layer_3": 0.0906982421875, "loss_aux_layer_4": 0.0933837890625, "loss_aux_layer_5": 0.094970703125, "loss_aux_layer_6": 0.09814453125, "loss_aux_layer_7": 0.093505859375, "loss_aux_layer_8": 0.0916748046875, "loss_aux_layer_9": 0.09033203125, "step": 1022, "total_loss": 0.7419137507677078 }, { "epoch": 0.20253415165313798, "grad_norm": 0.9323902726173401, "learning_rate": 5e-05, "llm_loss": 0.6153440326452255, "loss": 2.937, "loss_aux_layer_0": 0.0257568359375, "loss_aux_layer_1": 0.0697021484375, "loss_aux_layer_10": 0.08935546875, "loss_aux_layer_11": 0.0950927734375, "loss_aux_layer_12": 0.102783203125, "loss_aux_layer_13": 0.110107421875, "loss_aux_layer_14": 0.12109375, "loss_aux_layer_15": 0.1318359375, "loss_aux_layer_16": 0.14208984375, "loss_aux_layer_17": 0.14990234375, "loss_aux_layer_18": 0.15966796875, "loss_aux_layer_19": 0.161865234375, "loss_aux_layer_2": 0.076904296875, "loss_aux_layer_20": 0.16796875, "loss_aux_layer_21": 0.1748046875, "loss_aux_layer_22": 0.196044921875, "loss_aux_layer_23": 0.2373046875, "loss_aux_layer_3": 0.0894775390625, "loss_aux_layer_4": 0.0919189453125, "loss_aux_layer_5": 0.093505859375, "loss_aux_layer_6": 0.096435546875, "loss_aux_layer_7": 0.092529296875, "loss_aux_layer_8": 0.090087890625, "loss_aux_layer_9": 0.0882568359375, "step": 1023, "total_loss": 0.7342460751533508 }, { "epoch": 0.2027321322510394, "grad_norm": 2.3263771533966064, "learning_rate": 5e-05, "llm_loss": 0.6817083954811096, "loss": 3.2208, "loss_aux_layer_0": 0.030792236328125, "loss_aux_layer_1": 0.0704345703125, "loss_aux_layer_10": 0.0909423828125, "loss_aux_layer_11": 0.0970458984375, "loss_aux_layer_12": 0.1063232421875, "loss_aux_layer_13": 0.114990234375, "loss_aux_layer_14": 0.1290283203125, "loss_aux_layer_15": 0.1416015625, "loss_aux_layer_16": 0.15380859375, "loss_aux_layer_17": 0.162841796875, "loss_aux_layer_18": 0.171875, "loss_aux_layer_19": 0.173095703125, "loss_aux_layer_2": 0.0755615234375, "loss_aux_layer_20": 0.17822265625, "loss_aux_layer_21": 0.18408203125, "loss_aux_layer_22": 0.203125, "loss_aux_layer_23": 0.24462890625, "loss_aux_layer_3": 0.0875244140625, "loss_aux_layer_4": 0.090087890625, "loss_aux_layer_5": 0.09228515625, "loss_aux_layer_6": 0.0947265625, "loss_aux_layer_7": 0.0908203125, "loss_aux_layer_8": 0.0899658203125, "loss_aux_layer_9": 0.089111328125, "step": 1024, "total_loss": 0.8052033632993698 }, { "epoch": 0.2029301128489408, "grad_norm": 1.0298258066177368, "learning_rate": 5e-05, "llm_loss": 0.6464149504899979, "loss": 3.0674, "loss_aux_layer_0": 0.024810791015625, "loss_aux_layer_1": 0.070556640625, "loss_aux_layer_10": 0.09033203125, "loss_aux_layer_11": 0.096435546875, "loss_aux_layer_12": 0.104248046875, "loss_aux_layer_13": 0.112060546875, "loss_aux_layer_14": 0.1241455078125, "loss_aux_layer_15": 0.13525390625, "loss_aux_layer_16": 0.145751953125, "loss_aux_layer_17": 0.153564453125, "loss_aux_layer_18": 0.161865234375, "loss_aux_layer_19": 0.163330078125, "loss_aux_layer_2": 0.0782470703125, "loss_aux_layer_20": 0.169921875, "loss_aux_layer_21": 0.176025390625, "loss_aux_layer_22": 0.197998046875, "loss_aux_layer_23": 0.240234375, "loss_aux_layer_3": 0.0899658203125, "loss_aux_layer_4": 0.092529296875, "loss_aux_layer_5": 0.0941162109375, "loss_aux_layer_6": 0.0970458984375, "loss_aux_layer_7": 0.0924072265625, "loss_aux_layer_8": 0.090576171875, "loss_aux_layer_9": 0.0887451171875, "step": 1025, "total_loss": 0.7668418139219284 }, { "epoch": 0.2031280934468422, "grad_norm": 1.5534323453903198, "learning_rate": 5e-05, "llm_loss": 0.7290381491184235, "loss": 3.4172, "loss_aux_layer_0": 0.02545166015625, "loss_aux_layer_1": 0.072509765625, "loss_aux_layer_10": 0.0950927734375, "loss_aux_layer_11": 0.1011962890625, "loss_aux_layer_12": 0.108642578125, "loss_aux_layer_13": 0.1170654296875, "loss_aux_layer_14": 0.1297607421875, "loss_aux_layer_15": 0.141845703125, "loss_aux_layer_16": 0.153564453125, "loss_aux_layer_17": 0.161376953125, "loss_aux_layer_18": 0.170654296875, "loss_aux_layer_19": 0.172119140625, "loss_aux_layer_2": 0.0794677734375, "loss_aux_layer_20": 0.17822265625, "loss_aux_layer_21": 0.182861328125, "loss_aux_layer_22": 0.2021484375, "loss_aux_layer_23": 0.24267578125, "loss_aux_layer_3": 0.092529296875, "loss_aux_layer_4": 0.095703125, "loss_aux_layer_5": 0.09716796875, "loss_aux_layer_6": 0.1004638671875, "loss_aux_layer_7": 0.0968017578125, "loss_aux_layer_8": 0.09521484375, "loss_aux_layer_9": 0.09375, "step": 1026, "total_loss": 0.8543030172586441 }, { "epoch": 0.20332607404474362, "grad_norm": 1.127432942390442, "learning_rate": 5e-05, "llm_loss": 0.5267993062734604, "loss": 2.5935, "loss_aux_layer_0": 0.026885986328125, "loss_aux_layer_1": 0.0711669921875, "loss_aux_layer_10": 0.090576171875, "loss_aux_layer_11": 0.096435546875, "loss_aux_layer_12": 0.1044921875, "loss_aux_layer_13": 0.1124267578125, "loss_aux_layer_14": 0.1246337890625, "loss_aux_layer_15": 0.13623046875, "loss_aux_layer_16": 0.1474609375, "loss_aux_layer_17": 0.15478515625, "loss_aux_layer_18": 0.16455078125, "loss_aux_layer_19": 0.16650390625, "loss_aux_layer_2": 0.0780029296875, "loss_aux_layer_20": 0.173095703125, "loss_aux_layer_21": 0.18017578125, "loss_aux_layer_22": 0.202880859375, "loss_aux_layer_23": 0.245849609375, "loss_aux_layer_3": 0.08984375, "loss_aux_layer_4": 0.0916748046875, "loss_aux_layer_5": 0.0928955078125, "loss_aux_layer_6": 0.0958251953125, "loss_aux_layer_7": 0.0921630859375, "loss_aux_layer_8": 0.090576171875, "loss_aux_layer_9": 0.089111328125, "step": 1027, "total_loss": 0.6483826190233231 }, { "epoch": 0.20352405464264503, "grad_norm": 1.1840894222259521, "learning_rate": 5e-05, "llm_loss": 0.5993949770927429, "loss": 2.8912, "loss_aux_layer_0": 0.024505615234375, "loss_aux_layer_1": 0.0712890625, "loss_aux_layer_10": 0.093017578125, "loss_aux_layer_11": 0.099609375, "loss_aux_layer_12": 0.107666015625, "loss_aux_layer_13": 0.1160888671875, "loss_aux_layer_14": 0.1280517578125, "loss_aux_layer_15": 0.139892578125, "loss_aux_layer_16": 0.151123046875, "loss_aux_layer_17": 0.1591796875, "loss_aux_layer_18": 0.16796875, "loss_aux_layer_19": 0.169677734375, "loss_aux_layer_2": 0.077880859375, "loss_aux_layer_20": 0.175537109375, "loss_aux_layer_21": 0.1806640625, "loss_aux_layer_22": 0.201904296875, "loss_aux_layer_23": 0.244140625, "loss_aux_layer_3": 0.0904541015625, "loss_aux_layer_4": 0.09326171875, "loss_aux_layer_5": 0.094970703125, "loss_aux_layer_6": 0.09814453125, "loss_aux_layer_7": 0.09375, "loss_aux_layer_8": 0.0921630859375, "loss_aux_layer_9": 0.0909423828125, "step": 1028, "total_loss": 0.7228077501058578 }, { "epoch": 0.20372203524054644, "grad_norm": 1.563109040260315, "learning_rate": 5e-05, "llm_loss": 0.6659422367811203, "loss": 3.1502, "loss_aux_layer_0": 0.02545166015625, "loss_aux_layer_1": 0.0684814453125, "loss_aux_layer_10": 0.0908203125, "loss_aux_layer_11": 0.0960693359375, "loss_aux_layer_12": 0.103759765625, "loss_aux_layer_13": 0.1121826171875, "loss_aux_layer_14": 0.12451171875, "loss_aux_layer_15": 0.13623046875, "loss_aux_layer_16": 0.148193359375, "loss_aux_layer_17": 0.156494140625, "loss_aux_layer_18": 0.166259765625, "loss_aux_layer_19": 0.16845703125, "loss_aux_layer_2": 0.075927734375, "loss_aux_layer_20": 0.175537109375, "loss_aux_layer_21": 0.181640625, "loss_aux_layer_22": 0.202880859375, "loss_aux_layer_23": 0.245361328125, "loss_aux_layer_3": 0.087646484375, "loss_aux_layer_4": 0.0904541015625, "loss_aux_layer_5": 0.0928955078125, "loss_aux_layer_6": 0.095947265625, "loss_aux_layer_7": 0.0919189453125, "loss_aux_layer_8": 0.0908203125, "loss_aux_layer_9": 0.089599609375, "step": 1029, "total_loss": 0.7875403165817261 }, { "epoch": 0.20392001583844782, "grad_norm": 2.1638050079345703, "learning_rate": 5e-05, "llm_loss": 0.6687003076076508, "loss": 3.1487, "loss_aux_layer_0": 0.025421142578125, "loss_aux_layer_1": 0.06683349609375, "loss_aux_layer_10": 0.0877685546875, "loss_aux_layer_11": 0.0926513671875, "loss_aux_layer_12": 0.1002197265625, "loss_aux_layer_13": 0.108154296875, "loss_aux_layer_14": 0.1202392578125, "loss_aux_layer_15": 0.132080078125, "loss_aux_layer_16": 0.143798828125, "loss_aux_layer_17": 0.151611328125, "loss_aux_layer_18": 0.161865234375, "loss_aux_layer_19": 0.16455078125, "loss_aux_layer_2": 0.074951171875, "loss_aux_layer_20": 0.172119140625, "loss_aux_layer_21": 0.177734375, "loss_aux_layer_22": 0.198486328125, "loss_aux_layer_23": 0.24072265625, "loss_aux_layer_3": 0.0869140625, "loss_aux_layer_4": 0.08935546875, "loss_aux_layer_5": 0.0908203125, "loss_aux_layer_6": 0.0936279296875, "loss_aux_layer_7": 0.0892333984375, "loss_aux_layer_8": 0.087646484375, "loss_aux_layer_9": 0.086181640625, "step": 1030, "total_loss": 0.7871803492307663 }, { "epoch": 0.20411799643634923, "grad_norm": 2.4585115909576416, "learning_rate": 5e-05, "llm_loss": 0.73436239361763, "loss": 3.4365, "loss_aux_layer_0": 0.02667236328125, "loss_aux_layer_1": 0.073486328125, "loss_aux_layer_10": 0.0931396484375, "loss_aux_layer_11": 0.0987548828125, "loss_aux_layer_12": 0.1064453125, "loss_aux_layer_13": 0.1143798828125, "loss_aux_layer_14": 0.12744140625, "loss_aux_layer_15": 0.13916015625, "loss_aux_layer_16": 0.150634765625, "loss_aux_layer_17": 0.158935546875, "loss_aux_layer_18": 0.168701171875, "loss_aux_layer_19": 0.170166015625, "loss_aux_layer_2": 0.08447265625, "loss_aux_layer_20": 0.177001953125, "loss_aux_layer_21": 0.182861328125, "loss_aux_layer_22": 0.204345703125, "loss_aux_layer_23": 0.24609375, "loss_aux_layer_3": 0.0947265625, "loss_aux_layer_4": 0.0972900390625, "loss_aux_layer_5": 0.0985107421875, "loss_aux_layer_6": 0.1004638671875, "loss_aux_layer_7": 0.09521484375, "loss_aux_layer_8": 0.093505859375, "loss_aux_layer_9": 0.091552734375, "step": 1031, "total_loss": 0.8591184169054031 }, { "epoch": 0.20431597703425064, "grad_norm": 2.7195630073547363, "learning_rate": 5e-05, "llm_loss": 0.6692844927310944, "loss": 3.1657, "loss_aux_layer_0": 0.0262451171875, "loss_aux_layer_1": 0.06982421875, "loss_aux_layer_10": 0.092529296875, "loss_aux_layer_11": 0.09814453125, "loss_aux_layer_12": 0.1063232421875, "loss_aux_layer_13": 0.1141357421875, "loss_aux_layer_14": 0.12646484375, "loss_aux_layer_15": 0.137939453125, "loss_aux_layer_16": 0.1494140625, "loss_aux_layer_17": 0.15673828125, "loss_aux_layer_18": 0.165283203125, "loss_aux_layer_19": 0.16650390625, "loss_aux_layer_2": 0.0780029296875, "loss_aux_layer_20": 0.172119140625, "loss_aux_layer_21": 0.178466796875, "loss_aux_layer_22": 0.1982421875, "loss_aux_layer_23": 0.24072265625, "loss_aux_layer_3": 0.090087890625, "loss_aux_layer_4": 0.0927734375, "loss_aux_layer_5": 0.0950927734375, "loss_aux_layer_6": 0.0975341796875, "loss_aux_layer_7": 0.09423828125, "loss_aux_layer_8": 0.0928955078125, "loss_aux_layer_9": 0.091552734375, "step": 1032, "total_loss": 0.7914306074380875 }, { "epoch": 0.20451395763215205, "grad_norm": 2.5599188804626465, "learning_rate": 5e-05, "llm_loss": 0.5854532420635223, "loss": 2.8235, "loss_aux_layer_0": 0.0242919921875, "loss_aux_layer_1": 0.06805419921875, "loss_aux_layer_10": 0.08984375, "loss_aux_layer_11": 0.0950927734375, "loss_aux_layer_12": 0.1033935546875, "loss_aux_layer_13": 0.1114501953125, "loss_aux_layer_14": 0.123291015625, "loss_aux_layer_15": 0.135498046875, "loss_aux_layer_16": 0.14794921875, "loss_aux_layer_17": 0.155517578125, "loss_aux_layer_18": 0.164306640625, "loss_aux_layer_19": 0.166259765625, "loss_aux_layer_2": 0.0753173828125, "loss_aux_layer_20": 0.1728515625, "loss_aux_layer_21": 0.17822265625, "loss_aux_layer_22": 0.19921875, "loss_aux_layer_23": 0.24072265625, "loss_aux_layer_3": 0.089111328125, "loss_aux_layer_4": 0.091552734375, "loss_aux_layer_5": 0.0927734375, "loss_aux_layer_6": 0.0958251953125, "loss_aux_layer_7": 0.0914306640625, "loss_aux_layer_8": 0.08984375, "loss_aux_layer_9": 0.0885009765625, "step": 1033, "total_loss": 0.7058812975883484 }, { "epoch": 0.20471193823005346, "grad_norm": 4.10886812210083, "learning_rate": 5e-05, "llm_loss": 0.5474261045455933, "loss": 2.6875, "loss_aux_layer_0": 0.024566650390625, "loss_aux_layer_1": 0.0714111328125, "loss_aux_layer_10": 0.09521484375, "loss_aux_layer_11": 0.100830078125, "loss_aux_layer_12": 0.1083984375, "loss_aux_layer_13": 0.1160888671875, "loss_aux_layer_14": 0.127197265625, "loss_aux_layer_15": 0.137939453125, "loss_aux_layer_16": 0.148681640625, "loss_aux_layer_17": 0.156005859375, "loss_aux_layer_18": 0.1650390625, "loss_aux_layer_19": 0.165283203125, "loss_aux_layer_2": 0.083984375, "loss_aux_layer_20": 0.171630859375, "loss_aux_layer_21": 0.17919921875, "loss_aux_layer_22": 0.20166015625, "loss_aux_layer_23": 0.243896484375, "loss_aux_layer_3": 0.09619140625, "loss_aux_layer_4": 0.09912109375, "loss_aux_layer_5": 0.1021728515625, "loss_aux_layer_6": 0.1036376953125, "loss_aux_layer_7": 0.0982666015625, "loss_aux_layer_8": 0.09619140625, "loss_aux_layer_9": 0.0947265625, "step": 1034, "total_loss": 0.6718839555978775 }, { "epoch": 0.20490991882795487, "grad_norm": 2.539973020553589, "learning_rate": 5e-05, "llm_loss": 0.6761990487575531, "loss": 3.1785, "loss_aux_layer_0": 0.025115966796875, "loss_aux_layer_1": 0.067138671875, "loss_aux_layer_10": 0.088134765625, "loss_aux_layer_11": 0.0931396484375, "loss_aux_layer_12": 0.1005859375, "loss_aux_layer_13": 0.1083984375, "loss_aux_layer_14": 0.1201171875, "loss_aux_layer_15": 0.1307373046875, "loss_aux_layer_16": 0.141845703125, "loss_aux_layer_17": 0.14990234375, "loss_aux_layer_18": 0.159423828125, "loss_aux_layer_19": 0.160888671875, "loss_aux_layer_2": 0.0797119140625, "loss_aux_layer_20": 0.167724609375, "loss_aux_layer_21": 0.17529296875, "loss_aux_layer_22": 0.197265625, "loss_aux_layer_23": 0.240478515625, "loss_aux_layer_3": 0.090576171875, "loss_aux_layer_4": 0.0924072265625, "loss_aux_layer_5": 0.0924072265625, "loss_aux_layer_6": 0.0948486328125, "loss_aux_layer_7": 0.0902099609375, "loss_aux_layer_8": 0.08837890625, "loss_aux_layer_9": 0.0869140625, "step": 1035, "total_loss": 0.7946151942014694 }, { "epoch": 0.20510789942585628, "grad_norm": 3.133392333984375, "learning_rate": 5e-05, "llm_loss": 0.6562544256448746, "loss": 3.1043, "loss_aux_layer_0": 0.025146484375, "loss_aux_layer_1": 0.0679931640625, "loss_aux_layer_10": 0.0894775390625, "loss_aux_layer_11": 0.0953369140625, "loss_aux_layer_12": 0.1029052734375, "loss_aux_layer_13": 0.1112060546875, "loss_aux_layer_14": 0.1224365234375, "loss_aux_layer_15": 0.134033203125, "loss_aux_layer_16": 0.145263671875, "loss_aux_layer_17": 0.153564453125, "loss_aux_layer_18": 0.162353515625, "loss_aux_layer_19": 0.1630859375, "loss_aux_layer_2": 0.079833984375, "loss_aux_layer_20": 0.169677734375, "loss_aux_layer_21": 0.1748046875, "loss_aux_layer_22": 0.195556640625, "loss_aux_layer_23": 0.23583984375, "loss_aux_layer_3": 0.09228515625, "loss_aux_layer_4": 0.0943603515625, "loss_aux_layer_5": 0.093994140625, "loss_aux_layer_6": 0.096923828125, "loss_aux_layer_7": 0.091796875, "loss_aux_layer_8": 0.08984375, "loss_aux_layer_9": 0.0882568359375, "step": 1036, "total_loss": 0.7760734260082245 }, { "epoch": 0.20530588002375766, "grad_norm": 2.1568844318389893, "learning_rate": 5e-05, "llm_loss": 0.6473293006420135, "loss": 3.0651, "loss_aux_layer_0": 0.024810791015625, "loss_aux_layer_1": 0.0684814453125, "loss_aux_layer_10": 0.0894775390625, "loss_aux_layer_11": 0.09521484375, "loss_aux_layer_12": 0.1025390625, "loss_aux_layer_13": 0.110107421875, "loss_aux_layer_14": 0.1214599609375, "loss_aux_layer_15": 0.13232421875, "loss_aux_layer_16": 0.142578125, "loss_aux_layer_17": 0.151123046875, "loss_aux_layer_18": 0.159912109375, "loss_aux_layer_19": 0.160888671875, "loss_aux_layer_2": 0.0767822265625, "loss_aux_layer_20": 0.16796875, "loss_aux_layer_21": 0.175537109375, "loss_aux_layer_22": 0.197265625, "loss_aux_layer_23": 0.23828125, "loss_aux_layer_3": 0.0889892578125, "loss_aux_layer_4": 0.091552734375, "loss_aux_layer_5": 0.0933837890625, "loss_aux_layer_6": 0.0960693359375, "loss_aux_layer_7": 0.092041015625, "loss_aux_layer_8": 0.089599609375, "loss_aux_layer_9": 0.0885009765625, "step": 1037, "total_loss": 0.7662837952375412 }, { "epoch": 0.20550386062165907, "grad_norm": 2.210386037826538, "learning_rate": 5e-05, "llm_loss": 0.668491929769516, "loss": 3.1559, "loss_aux_layer_0": 0.024993896484375, "loss_aux_layer_1": 0.07025146484375, "loss_aux_layer_10": 0.091552734375, "loss_aux_layer_11": 0.0975341796875, "loss_aux_layer_12": 0.1044921875, "loss_aux_layer_13": 0.1116943359375, "loss_aux_layer_14": 0.12353515625, "loss_aux_layer_15": 0.1343994140625, "loss_aux_layer_16": 0.145263671875, "loss_aux_layer_17": 0.15234375, "loss_aux_layer_18": 0.160888671875, "loss_aux_layer_19": 0.16162109375, "loss_aux_layer_2": 0.0794677734375, "loss_aux_layer_20": 0.16796875, "loss_aux_layer_21": 0.175048828125, "loss_aux_layer_22": 0.197265625, "loss_aux_layer_23": 0.240478515625, "loss_aux_layer_3": 0.09130859375, "loss_aux_layer_4": 0.0933837890625, "loss_aux_layer_5": 0.0950927734375, "loss_aux_layer_6": 0.0980224609375, "loss_aux_layer_7": 0.093505859375, "loss_aux_layer_8": 0.091796875, "loss_aux_layer_9": 0.0904541015625, "step": 1038, "total_loss": 0.7889680117368698 }, { "epoch": 0.20570184121956048, "grad_norm": 3.7392079830169678, "learning_rate": 5e-05, "llm_loss": 0.5419385805726051, "loss": 2.6551, "loss_aux_layer_0": 0.0247802734375, "loss_aux_layer_1": 0.06951904296875, "loss_aux_layer_10": 0.0928955078125, "loss_aux_layer_11": 0.098388671875, "loss_aux_layer_12": 0.1055908203125, "loss_aux_layer_13": 0.1131591796875, "loss_aux_layer_14": 0.12451171875, "loss_aux_layer_15": 0.135498046875, "loss_aux_layer_16": 0.146484375, "loss_aux_layer_17": 0.154052734375, "loss_aux_layer_18": 0.163330078125, "loss_aux_layer_19": 0.164306640625, "loss_aux_layer_2": 0.0802001953125, "loss_aux_layer_20": 0.17041015625, "loss_aux_layer_21": 0.176025390625, "loss_aux_layer_22": 0.1962890625, "loss_aux_layer_23": 0.237060546875, "loss_aux_layer_3": 0.0947265625, "loss_aux_layer_4": 0.0970458984375, "loss_aux_layer_5": 0.0999755859375, "loss_aux_layer_6": 0.1005859375, "loss_aux_layer_7": 0.0947265625, "loss_aux_layer_8": 0.092529296875, "loss_aux_layer_9": 0.0914306640625, "step": 1039, "total_loss": 0.6637687385082245 }, { "epoch": 0.2058998218174619, "grad_norm": 3.3921189308166504, "learning_rate": 5e-05, "llm_loss": 0.6241206377744675, "loss": 3.0068, "loss_aux_layer_0": 0.024566650390625, "loss_aux_layer_1": 0.0706787109375, "loss_aux_layer_10": 0.097900390625, "loss_aux_layer_11": 0.1036376953125, "loss_aux_layer_12": 0.1112060546875, "loss_aux_layer_13": 0.1192626953125, "loss_aux_layer_14": 0.13134765625, "loss_aux_layer_15": 0.143310546875, "loss_aux_layer_16": 0.153564453125, "loss_aux_layer_17": 0.161376953125, "loss_aux_layer_18": 0.169921875, "loss_aux_layer_19": 0.169677734375, "loss_aux_layer_2": 0.092529296875, "loss_aux_layer_20": 0.17529296875, "loss_aux_layer_21": 0.18115234375, "loss_aux_layer_22": 0.203125, "loss_aux_layer_23": 0.245849609375, "loss_aux_layer_3": 0.1002197265625, "loss_aux_layer_4": 0.1036376953125, "loss_aux_layer_5": 0.104248046875, "loss_aux_layer_6": 0.1055908203125, "loss_aux_layer_7": 0.10009765625, "loss_aux_layer_8": 0.0982666015625, "loss_aux_layer_9": 0.0963134765625, "step": 1040, "total_loss": 0.7516979575157166 }, { "epoch": 0.2060978024153633, "grad_norm": 1.8041054010391235, "learning_rate": 5e-05, "llm_loss": 0.5981845259666443, "loss": 2.881, "loss_aux_layer_0": 0.02606201171875, "loss_aux_layer_1": 0.0699462890625, "loss_aux_layer_10": 0.0909423828125, "loss_aux_layer_11": 0.0966796875, "loss_aux_layer_12": 0.1041259765625, "loss_aux_layer_13": 0.112548828125, "loss_aux_layer_14": 0.1248779296875, "loss_aux_layer_15": 0.13671875, "loss_aux_layer_16": 0.1484375, "loss_aux_layer_17": 0.1552734375, "loss_aux_layer_18": 0.164306640625, "loss_aux_layer_19": 0.165771484375, "loss_aux_layer_2": 0.081298828125, "loss_aux_layer_20": 0.17333984375, "loss_aux_layer_21": 0.179931640625, "loss_aux_layer_22": 0.201171875, "loss_aux_layer_23": 0.241943359375, "loss_aux_layer_3": 0.092041015625, "loss_aux_layer_4": 0.094482421875, "loss_aux_layer_5": 0.096435546875, "loss_aux_layer_6": 0.0980224609375, "loss_aux_layer_7": 0.0936279296875, "loss_aux_layer_8": 0.091552734375, "loss_aux_layer_9": 0.08984375, "step": 1041, "total_loss": 0.7202595919370651 }, { "epoch": 0.2062957830132647, "grad_norm": 2.450263023376465, "learning_rate": 5e-05, "llm_loss": 0.6581035852432251, "loss": 3.1244, "loss_aux_layer_0": 0.0245361328125, "loss_aux_layer_1": 0.0706787109375, "loss_aux_layer_10": 0.0928955078125, "loss_aux_layer_11": 0.0987548828125, "loss_aux_layer_12": 0.1070556640625, "loss_aux_layer_13": 0.115478515625, "loss_aux_layer_14": 0.1278076171875, "loss_aux_layer_15": 0.138916015625, "loss_aux_layer_16": 0.150146484375, "loss_aux_layer_17": 0.1572265625, "loss_aux_layer_18": 0.16552734375, "loss_aux_layer_19": 0.166748046875, "loss_aux_layer_2": 0.0872802734375, "loss_aux_layer_20": 0.172607421875, "loss_aux_layer_21": 0.17822265625, "loss_aux_layer_22": 0.19873046875, "loss_aux_layer_23": 0.239501953125, "loss_aux_layer_3": 0.0931396484375, "loss_aux_layer_4": 0.0953369140625, "loss_aux_layer_5": 0.095703125, "loss_aux_layer_6": 0.098388671875, "loss_aux_layer_7": 0.093994140625, "loss_aux_layer_8": 0.0926513671875, "loss_aux_layer_9": 0.091064453125, "step": 1042, "total_loss": 0.7811016589403152 }, { "epoch": 0.20649376361116611, "grad_norm": 1.5170323848724365, "learning_rate": 5e-05, "llm_loss": 0.6387656331062317, "loss": 3.0417, "loss_aux_layer_0": 0.024688720703125, "loss_aux_layer_1": 0.0673828125, "loss_aux_layer_10": 0.091796875, "loss_aux_layer_11": 0.0972900390625, "loss_aux_layer_12": 0.1048583984375, "loss_aux_layer_13": 0.1131591796875, "loss_aux_layer_14": 0.125244140625, "loss_aux_layer_15": 0.13671875, "loss_aux_layer_16": 0.14892578125, "loss_aux_layer_17": 0.156005859375, "loss_aux_layer_18": 0.165283203125, "loss_aux_layer_19": 0.16748046875, "loss_aux_layer_2": 0.0794677734375, "loss_aux_layer_20": 0.173583984375, "loss_aux_layer_21": 0.17919921875, "loss_aux_layer_22": 0.200439453125, "loss_aux_layer_23": 0.2431640625, "loss_aux_layer_3": 0.0894775390625, "loss_aux_layer_4": 0.09228515625, "loss_aux_layer_5": 0.0938720703125, "loss_aux_layer_6": 0.095947265625, "loss_aux_layer_7": 0.0921630859375, "loss_aux_layer_8": 0.09130859375, "loss_aux_layer_9": 0.0902099609375, "step": 1043, "total_loss": 0.760422870516777 }, { "epoch": 0.20669174420906752, "grad_norm": 1.366398811340332, "learning_rate": 5e-05, "llm_loss": 0.6355501264333725, "loss": 3.0194, "loss_aux_layer_0": 0.024932861328125, "loss_aux_layer_1": 0.067626953125, "loss_aux_layer_10": 0.0897216796875, "loss_aux_layer_11": 0.094970703125, "loss_aux_layer_12": 0.1024169921875, "loss_aux_layer_13": 0.1103515625, "loss_aux_layer_14": 0.1224365234375, "loss_aux_layer_15": 0.134521484375, "loss_aux_layer_16": 0.145751953125, "loss_aux_layer_17": 0.1533203125, "loss_aux_layer_18": 0.16259765625, "loss_aux_layer_19": 0.163818359375, "loss_aux_layer_2": 0.0767822265625, "loss_aux_layer_20": 0.170654296875, "loss_aux_layer_21": 0.17626953125, "loss_aux_layer_22": 0.196044921875, "loss_aux_layer_23": 0.237548828125, "loss_aux_layer_3": 0.0877685546875, "loss_aux_layer_4": 0.0904541015625, "loss_aux_layer_5": 0.0919189453125, "loss_aux_layer_6": 0.0947265625, "loss_aux_layer_7": 0.0908203125, "loss_aux_layer_8": 0.08935546875, "loss_aux_layer_9": 0.088134765625, "step": 1044, "total_loss": 0.754846841096878 }, { "epoch": 0.2068897248069689, "grad_norm": 1.6549599170684814, "learning_rate": 5e-05, "llm_loss": 0.5424822121858597, "loss": 2.6684, "loss_aux_layer_0": 0.026336669921875, "loss_aux_layer_1": 0.0714111328125, "loss_aux_layer_10": 0.09375, "loss_aux_layer_11": 0.099609375, "loss_aux_layer_12": 0.107421875, "loss_aux_layer_13": 0.115234375, "loss_aux_layer_14": 0.128173828125, "loss_aux_layer_15": 0.139892578125, "loss_aux_layer_16": 0.151611328125, "loss_aux_layer_17": 0.159912109375, "loss_aux_layer_18": 0.169677734375, "loss_aux_layer_19": 0.1708984375, "loss_aux_layer_2": 0.0819091796875, "loss_aux_layer_20": 0.1767578125, "loss_aux_layer_21": 0.182861328125, "loss_aux_layer_22": 0.203369140625, "loss_aux_layer_23": 0.24560546875, "loss_aux_layer_3": 0.0924072265625, "loss_aux_layer_4": 0.0948486328125, "loss_aux_layer_5": 0.0970458984375, "loss_aux_layer_6": 0.0999755859375, "loss_aux_layer_7": 0.095703125, "loss_aux_layer_8": 0.0941162109375, "loss_aux_layer_9": 0.0926513671875, "step": 1045, "total_loss": 0.6671120226383209 }, { "epoch": 0.20708770540487031, "grad_norm": 1.4194848537445068, "learning_rate": 5e-05, "llm_loss": 0.5733784735202789, "loss": 2.7637, "loss_aux_layer_0": 0.0257568359375, "loss_aux_layer_1": 0.0667724609375, "loss_aux_layer_10": 0.0869140625, "loss_aux_layer_11": 0.09228515625, "loss_aux_layer_12": 0.1004638671875, "loss_aux_layer_13": 0.108154296875, "loss_aux_layer_14": 0.12060546875, "loss_aux_layer_15": 0.132080078125, "loss_aux_layer_16": 0.14306640625, "loss_aux_layer_17": 0.15087890625, "loss_aux_layer_18": 0.16015625, "loss_aux_layer_19": 0.162109375, "loss_aux_layer_2": 0.074951171875, "loss_aux_layer_20": 0.168701171875, "loss_aux_layer_21": 0.176025390625, "loss_aux_layer_22": 0.19775390625, "loss_aux_layer_23": 0.240966796875, "loss_aux_layer_3": 0.085205078125, "loss_aux_layer_4": 0.0872802734375, "loss_aux_layer_5": 0.0889892578125, "loss_aux_layer_6": 0.0916748046875, "loss_aux_layer_7": 0.0875244140625, "loss_aux_layer_8": 0.0865478515625, "loss_aux_layer_9": 0.0853271484375, "step": 1046, "total_loss": 0.6909162700176239 }, { "epoch": 0.20728568600277172, "grad_norm": 1.0898404121398926, "learning_rate": 5e-05, "llm_loss": 0.543922133743763, "loss": 2.6365, "loss_aux_layer_0": 0.025482177734375, "loss_aux_layer_1": 0.0654296875, "loss_aux_layer_10": 0.08544921875, "loss_aux_layer_11": 0.090576171875, "loss_aux_layer_12": 0.0982666015625, "loss_aux_layer_13": 0.1058349609375, "loss_aux_layer_14": 0.117431640625, "loss_aux_layer_15": 0.12841796875, "loss_aux_layer_16": 0.1396484375, "loss_aux_layer_17": 0.1474609375, "loss_aux_layer_18": 0.156982421875, "loss_aux_layer_19": 0.15966796875, "loss_aux_layer_2": 0.0723876953125, "loss_aux_layer_20": 0.16650390625, "loss_aux_layer_21": 0.17333984375, "loss_aux_layer_22": 0.193603515625, "loss_aux_layer_23": 0.234375, "loss_aux_layer_3": 0.083740234375, "loss_aux_layer_4": 0.0860595703125, "loss_aux_layer_5": 0.0880126953125, "loss_aux_layer_6": 0.090576171875, "loss_aux_layer_7": 0.0863037109375, "loss_aux_layer_8": 0.085205078125, "loss_aux_layer_9": 0.083984375, "step": 1047, "total_loss": 0.6591182202100754 }, { "epoch": 0.20748366660067313, "grad_norm": 1.3046172857284546, "learning_rate": 5e-05, "llm_loss": 0.57542584836483, "loss": 2.8006, "loss_aux_layer_0": 0.025421142578125, "loss_aux_layer_1": 0.0716552734375, "loss_aux_layer_10": 0.093994140625, "loss_aux_layer_11": 0.0999755859375, "loss_aux_layer_12": 0.10791015625, "loss_aux_layer_13": 0.1158447265625, "loss_aux_layer_14": 0.128173828125, "loss_aux_layer_15": 0.140380859375, "loss_aux_layer_16": 0.1513671875, "loss_aux_layer_17": 0.158935546875, "loss_aux_layer_18": 0.16845703125, "loss_aux_layer_19": 0.169677734375, "loss_aux_layer_2": 0.08154296875, "loss_aux_layer_20": 0.17578125, "loss_aux_layer_21": 0.18310546875, "loss_aux_layer_22": 0.20458984375, "loss_aux_layer_23": 0.2470703125, "loss_aux_layer_3": 0.09375, "loss_aux_layer_4": 0.0960693359375, "loss_aux_layer_5": 0.09765625, "loss_aux_layer_6": 0.0999755859375, "loss_aux_layer_7": 0.0955810546875, "loss_aux_layer_8": 0.09375, "loss_aux_layer_9": 0.092529296875, "step": 1048, "total_loss": 0.7001430690288544 }, { "epoch": 0.20768164719857454, "grad_norm": 1.5228360891342163, "learning_rate": 5e-05, "llm_loss": 0.5702069997787476, "loss": 2.7812, "loss_aux_layer_0": 0.0245361328125, "loss_aux_layer_1": 0.07568359375, "loss_aux_layer_10": 0.096435546875, "loss_aux_layer_11": 0.10205078125, "loss_aux_layer_12": 0.109375, "loss_aux_layer_13": 0.1168212890625, "loss_aux_layer_14": 0.128173828125, "loss_aux_layer_15": 0.138671875, "loss_aux_layer_16": 0.149169921875, "loss_aux_layer_17": 0.15625, "loss_aux_layer_18": 0.165283203125, "loss_aux_layer_19": 0.16650390625, "loss_aux_layer_2": 0.085205078125, "loss_aux_layer_20": 0.1728515625, "loss_aux_layer_21": 0.178955078125, "loss_aux_layer_22": 0.201171875, "loss_aux_layer_23": 0.2421875, "loss_aux_layer_3": 0.0963134765625, "loss_aux_layer_4": 0.099853515625, "loss_aux_layer_5": 0.1011962890625, "loss_aux_layer_6": 0.1038818359375, "loss_aux_layer_7": 0.099365234375, "loss_aux_layer_8": 0.0970458984375, "loss_aux_layer_9": 0.095458984375, "step": 1049, "total_loss": 0.6953079402446747 }, { "epoch": 0.20787962779647595, "grad_norm": 2.4812889099121094, "learning_rate": 5e-05, "llm_loss": 0.6198265999555588, "loss": 2.9765, "loss_aux_layer_0": 0.02423095703125, "loss_aux_layer_1": 0.0714111328125, "loss_aux_layer_10": 0.0938720703125, "loss_aux_layer_11": 0.099853515625, "loss_aux_layer_12": 0.1080322265625, "loss_aux_layer_13": 0.1162109375, "loss_aux_layer_14": 0.1280517578125, "loss_aux_layer_15": 0.139404296875, "loss_aux_layer_16": 0.151123046875, "loss_aux_layer_17": 0.158203125, "loss_aux_layer_18": 0.1669921875, "loss_aux_layer_19": 0.168701171875, "loss_aux_layer_2": 0.0823974609375, "loss_aux_layer_20": 0.1748046875, "loss_aux_layer_21": 0.1806640625, "loss_aux_layer_22": 0.202392578125, "loss_aux_layer_23": 0.244873046875, "loss_aux_layer_3": 0.0936279296875, "loss_aux_layer_4": 0.0965576171875, "loss_aux_layer_5": 0.097900390625, "loss_aux_layer_6": 0.1005859375, "loss_aux_layer_7": 0.0958251953125, "loss_aux_layer_8": 0.09423828125, "loss_aux_layer_9": 0.0927734375, "step": 1050, "total_loss": 0.7441331297159195 }, { "epoch": 0.20807760839437736, "grad_norm": 1.3851828575134277, "learning_rate": 5e-05, "llm_loss": 0.7015658169984818, "loss": 3.3076, "loss_aux_layer_0": 0.029083251953125, "loss_aux_layer_1": 0.07421875, "loss_aux_layer_10": 0.0936279296875, "loss_aux_layer_11": 0.099853515625, "loss_aux_layer_12": 0.1082763671875, "loss_aux_layer_13": 0.1168212890625, "loss_aux_layer_14": 0.130126953125, "loss_aux_layer_15": 0.141357421875, "loss_aux_layer_16": 0.15380859375, "loss_aux_layer_17": 0.161865234375, "loss_aux_layer_18": 0.170654296875, "loss_aux_layer_19": 0.17138671875, "loss_aux_layer_2": 0.0823974609375, "loss_aux_layer_20": 0.177490234375, "loss_aux_layer_21": 0.181884765625, "loss_aux_layer_22": 0.20263671875, "loss_aux_layer_23": 0.243896484375, "loss_aux_layer_3": 0.093505859375, "loss_aux_layer_4": 0.095703125, "loss_aux_layer_5": 0.0972900390625, "loss_aux_layer_6": 0.099609375, "loss_aux_layer_7": 0.0955810546875, "loss_aux_layer_8": 0.0938720703125, "loss_aux_layer_9": 0.09228515625, "step": 1051, "total_loss": 0.8268887996673584 }, { "epoch": 0.20827558899227874, "grad_norm": 1.6021215915679932, "learning_rate": 5e-05, "llm_loss": 0.5607172697782516, "loss": 2.7492, "loss_aux_layer_0": 0.026092529296875, "loss_aux_layer_1": 0.07470703125, "loss_aux_layer_10": 0.0980224609375, "loss_aux_layer_11": 0.10400390625, "loss_aux_layer_12": 0.112060546875, "loss_aux_layer_13": 0.1199951171875, "loss_aux_layer_14": 0.1314697265625, "loss_aux_layer_15": 0.142333984375, "loss_aux_layer_16": 0.152587890625, "loss_aux_layer_17": 0.1591796875, "loss_aux_layer_18": 0.167724609375, "loss_aux_layer_19": 0.16748046875, "loss_aux_layer_2": 0.0860595703125, "loss_aux_layer_20": 0.173095703125, "loss_aux_layer_21": 0.179443359375, "loss_aux_layer_22": 0.200927734375, "loss_aux_layer_23": 0.24267578125, "loss_aux_layer_3": 0.0977783203125, "loss_aux_layer_4": 0.1005859375, "loss_aux_layer_5": 0.10205078125, "loss_aux_layer_6": 0.1044921875, "loss_aux_layer_7": 0.099853515625, "loss_aux_layer_8": 0.0982666015625, "loss_aux_layer_9": 0.0965576171875, "step": 1052, "total_loss": 0.6873026043176651 }, { "epoch": 0.20847356959018015, "grad_norm": 1.4296969175338745, "learning_rate": 5e-05, "llm_loss": 0.6352377682924271, "loss": 3.0136, "loss_aux_layer_0": 0.0264892578125, "loss_aux_layer_1": 0.0667724609375, "loss_aux_layer_10": 0.0887451171875, "loss_aux_layer_11": 0.093994140625, "loss_aux_layer_12": 0.101806640625, "loss_aux_layer_13": 0.1097412109375, "loss_aux_layer_14": 0.1221923828125, "loss_aux_layer_15": 0.1337890625, "loss_aux_layer_16": 0.145751953125, "loss_aux_layer_17": 0.1533203125, "loss_aux_layer_18": 0.161865234375, "loss_aux_layer_19": 0.162841796875, "loss_aux_layer_2": 0.0753173828125, "loss_aux_layer_20": 0.168701171875, "loss_aux_layer_21": 0.173095703125, "loss_aux_layer_22": 0.193115234375, "loss_aux_layer_23": 0.2333984375, "loss_aux_layer_3": 0.08642578125, "loss_aux_layer_4": 0.0892333984375, "loss_aux_layer_5": 0.0908203125, "loss_aux_layer_6": 0.09375, "loss_aux_layer_7": 0.0892333984375, "loss_aux_layer_8": 0.088134765625, "loss_aux_layer_9": 0.0870361328125, "step": 1053, "total_loss": 0.7533951699733734 }, { "epoch": 0.20867155018808156, "grad_norm": 1.4124608039855957, "learning_rate": 5e-05, "llm_loss": 0.6118550598621368, "loss": 2.9395, "loss_aux_layer_0": 0.023651123046875, "loss_aux_layer_1": 0.0701904296875, "loss_aux_layer_10": 0.093505859375, "loss_aux_layer_11": 0.0997314453125, "loss_aux_layer_12": 0.1077880859375, "loss_aux_layer_13": 0.115966796875, "loss_aux_layer_14": 0.1278076171875, "loss_aux_layer_15": 0.138671875, "loss_aux_layer_16": 0.149169921875, "loss_aux_layer_17": 0.156494140625, "loss_aux_layer_18": 0.16552734375, "loss_aux_layer_19": 0.166259765625, "loss_aux_layer_2": 0.0806884765625, "loss_aux_layer_20": 0.172119140625, "loss_aux_layer_21": 0.178466796875, "loss_aux_layer_22": 0.19970703125, "loss_aux_layer_23": 0.2412109375, "loss_aux_layer_3": 0.0926513671875, "loss_aux_layer_4": 0.095458984375, "loss_aux_layer_5": 0.0970458984375, "loss_aux_layer_6": 0.099853515625, "loss_aux_layer_7": 0.09521484375, "loss_aux_layer_8": 0.0936279296875, "loss_aux_layer_9": 0.091796875, "step": 1054, "total_loss": 0.7348712533712387 }, { "epoch": 0.20886953078598297, "grad_norm": 0.8124503493309021, "learning_rate": 5e-05, "llm_loss": 0.6683681309223175, "loss": 3.1397, "loss_aux_layer_0": 0.025146484375, "loss_aux_layer_1": 0.065185546875, "loss_aux_layer_10": 0.0870361328125, "loss_aux_layer_11": 0.0927734375, "loss_aux_layer_12": 0.10009765625, "loss_aux_layer_13": 0.107177734375, "loss_aux_layer_14": 0.118896484375, "loss_aux_layer_15": 0.1298828125, "loss_aux_layer_16": 0.14111328125, "loss_aux_layer_17": 0.148681640625, "loss_aux_layer_18": 0.15673828125, "loss_aux_layer_19": 0.15869140625, "loss_aux_layer_2": 0.0736083984375, "loss_aux_layer_20": 0.166748046875, "loss_aux_layer_21": 0.174072265625, "loss_aux_layer_22": 0.196533203125, "loss_aux_layer_23": 0.239013671875, "loss_aux_layer_3": 0.0850830078125, "loss_aux_layer_4": 0.0877685546875, "loss_aux_layer_5": 0.08984375, "loss_aux_layer_6": 0.0928955078125, "loss_aux_layer_7": 0.0882568359375, "loss_aux_layer_8": 0.0867919921875, "loss_aux_layer_9": 0.0860595703125, "step": 1055, "total_loss": 0.7849233746528625 }, { "epoch": 0.20906751138388438, "grad_norm": 1.4216344356536865, "learning_rate": 5e-05, "llm_loss": 0.6589899510145187, "loss": 3.1194, "loss_aux_layer_0": 0.02789306640625, "loss_aux_layer_1": 0.07080078125, "loss_aux_layer_10": 0.0897216796875, "loss_aux_layer_11": 0.0955810546875, "loss_aux_layer_12": 0.1033935546875, "loss_aux_layer_13": 0.111572265625, "loss_aux_layer_14": 0.1236572265625, "loss_aux_layer_15": 0.135986328125, "loss_aux_layer_16": 0.147216796875, "loss_aux_layer_17": 0.1552734375, "loss_aux_layer_18": 0.164306640625, "loss_aux_layer_19": 0.166259765625, "loss_aux_layer_2": 0.0770263671875, "loss_aux_layer_20": 0.1728515625, "loss_aux_layer_21": 0.178955078125, "loss_aux_layer_22": 0.20166015625, "loss_aux_layer_23": 0.24365234375, "loss_aux_layer_3": 0.0882568359375, "loss_aux_layer_4": 0.0909423828125, "loss_aux_layer_5": 0.092041015625, "loss_aux_layer_6": 0.0950927734375, "loss_aux_layer_7": 0.0909423828125, "loss_aux_layer_8": 0.0892333984375, "loss_aux_layer_9": 0.088134765625, "step": 1056, "total_loss": 0.779847040772438 }, { "epoch": 0.2092654919817858, "grad_norm": 1.5989892482757568, "learning_rate": 5e-05, "llm_loss": 0.6503040194511414, "loss": 3.072, "loss_aux_layer_0": 0.025115966796875, "loss_aux_layer_1": 0.066650390625, "loss_aux_layer_10": 0.087158203125, "loss_aux_layer_11": 0.0928955078125, "loss_aux_layer_12": 0.1009521484375, "loss_aux_layer_13": 0.1090087890625, "loss_aux_layer_14": 0.1209716796875, "loss_aux_layer_15": 0.13232421875, "loss_aux_layer_16": 0.14306640625, "loss_aux_layer_17": 0.15087890625, "loss_aux_layer_18": 0.160400390625, "loss_aux_layer_19": 0.161865234375, "loss_aux_layer_2": 0.075927734375, "loss_aux_layer_20": 0.168212890625, "loss_aux_layer_21": 0.17431640625, "loss_aux_layer_22": 0.19482421875, "loss_aux_layer_23": 0.23681640625, "loss_aux_layer_3": 0.0870361328125, "loss_aux_layer_4": 0.08935546875, "loss_aux_layer_5": 0.090576171875, "loss_aux_layer_6": 0.0931396484375, "loss_aux_layer_7": 0.0889892578125, "loss_aux_layer_8": 0.0872802734375, "loss_aux_layer_9": 0.0858154296875, "step": 1057, "total_loss": 0.7679987549781799 }, { "epoch": 0.2094634725796872, "grad_norm": 1.6104815006256104, "learning_rate": 5e-05, "llm_loss": 0.5386709570884705, "loss": 2.6544, "loss_aux_layer_0": 0.02825927734375, "loss_aux_layer_1": 0.0765380859375, "loss_aux_layer_10": 0.0947265625, "loss_aux_layer_11": 0.1007080078125, "loss_aux_layer_12": 0.108154296875, "loss_aux_layer_13": 0.11572265625, "loss_aux_layer_14": 0.1273193359375, "loss_aux_layer_15": 0.13818359375, "loss_aux_layer_16": 0.149658203125, "loss_aux_layer_17": 0.156982421875, "loss_aux_layer_18": 0.165771484375, "loss_aux_layer_19": 0.16748046875, "loss_aux_layer_2": 0.083251953125, "loss_aux_layer_20": 0.173828125, "loss_aux_layer_21": 0.180908203125, "loss_aux_layer_22": 0.202880859375, "loss_aux_layer_23": 0.245361328125, "loss_aux_layer_3": 0.095458984375, "loss_aux_layer_4": 0.098388671875, "loss_aux_layer_5": 0.099365234375, "loss_aux_layer_6": 0.1021728515625, "loss_aux_layer_7": 0.09765625, "loss_aux_layer_8": 0.095458984375, "loss_aux_layer_9": 0.09375, "step": 1058, "total_loss": 0.6635966002941132 }, { "epoch": 0.2096614531775886, "grad_norm": 1.1886019706726074, "learning_rate": 5e-05, "llm_loss": 0.7934093475341797, "loss": 3.6552, "loss_aux_layer_0": 0.02337646484375, "loss_aux_layer_1": 0.068115234375, "loss_aux_layer_10": 0.0909423828125, "loss_aux_layer_11": 0.096435546875, "loss_aux_layer_12": 0.1038818359375, "loss_aux_layer_13": 0.11181640625, "loss_aux_layer_14": 0.12353515625, "loss_aux_layer_15": 0.134765625, "loss_aux_layer_16": 0.145751953125, "loss_aux_layer_17": 0.153564453125, "loss_aux_layer_18": 0.162841796875, "loss_aux_layer_19": 0.165283203125, "loss_aux_layer_2": 0.076416015625, "loss_aux_layer_20": 0.171142578125, "loss_aux_layer_21": 0.177001953125, "loss_aux_layer_22": 0.19775390625, "loss_aux_layer_23": 0.2392578125, "loss_aux_layer_3": 0.089599609375, "loss_aux_layer_4": 0.0926513671875, "loss_aux_layer_5": 0.094482421875, "loss_aux_layer_6": 0.0972900390625, "loss_aux_layer_7": 0.0928955078125, "loss_aux_layer_8": 0.0911865234375, "loss_aux_layer_9": 0.0894775390625, "step": 1059, "total_loss": 0.9137934148311615 }, { "epoch": 0.20985943377549, "grad_norm": 1.5063389539718628, "learning_rate": 5e-05, "llm_loss": 0.6354876160621643, "loss": 3.032, "loss_aux_layer_0": 0.025360107421875, "loss_aux_layer_1": 0.0711669921875, "loss_aux_layer_10": 0.0933837890625, "loss_aux_layer_11": 0.0994873046875, "loss_aux_layer_12": 0.10693359375, "loss_aux_layer_13": 0.1146240234375, "loss_aux_layer_14": 0.12646484375, "loss_aux_layer_15": 0.137451171875, "loss_aux_layer_16": 0.149169921875, "loss_aux_layer_17": 0.15625, "loss_aux_layer_18": 0.166015625, "loss_aux_layer_19": 0.16650390625, "loss_aux_layer_2": 0.0789794921875, "loss_aux_layer_20": 0.172607421875, "loss_aux_layer_21": 0.177734375, "loss_aux_layer_22": 0.19873046875, "loss_aux_layer_23": 0.239501953125, "loss_aux_layer_3": 0.0916748046875, "loss_aux_layer_4": 0.0941162109375, "loss_aux_layer_5": 0.0958251953125, "loss_aux_layer_6": 0.0986328125, "loss_aux_layer_7": 0.094482421875, "loss_aux_layer_8": 0.0931396484375, "loss_aux_layer_9": 0.092041015625, "step": 1060, "total_loss": 0.7580102980136871 }, { "epoch": 0.2100574143733914, "grad_norm": 1.1112436056137085, "learning_rate": 5e-05, "llm_loss": 0.6526033729314804, "loss": 3.0804, "loss_aux_layer_0": 0.025665283203125, "loss_aux_layer_1": 0.06787109375, "loss_aux_layer_10": 0.087890625, "loss_aux_layer_11": 0.09326171875, "loss_aux_layer_12": 0.1004638671875, "loss_aux_layer_13": 0.108154296875, "loss_aux_layer_14": 0.1197509765625, "loss_aux_layer_15": 0.13134765625, "loss_aux_layer_16": 0.14208984375, "loss_aux_layer_17": 0.150146484375, "loss_aux_layer_18": 0.15966796875, "loss_aux_layer_19": 0.1611328125, "loss_aux_layer_2": 0.0750732421875, "loss_aux_layer_20": 0.16796875, "loss_aux_layer_21": 0.174560546875, "loss_aux_layer_22": 0.1962890625, "loss_aux_layer_23": 0.238525390625, "loss_aux_layer_3": 0.086181640625, "loss_aux_layer_4": 0.08837890625, "loss_aux_layer_5": 0.0899658203125, "loss_aux_layer_6": 0.0926513671875, "loss_aux_layer_7": 0.088623046875, "loss_aux_layer_8": 0.08740234375, "loss_aux_layer_9": 0.08642578125, "step": 1061, "total_loss": 0.770103707909584 }, { "epoch": 0.2102553949712928, "grad_norm": 1.4527455568313599, "learning_rate": 5e-05, "llm_loss": 0.606033094227314, "loss": 2.9119, "loss_aux_layer_0": 0.026641845703125, "loss_aux_layer_1": 0.0704345703125, "loss_aux_layer_10": 0.0916748046875, "loss_aux_layer_11": 0.09765625, "loss_aux_layer_12": 0.10546875, "loss_aux_layer_13": 0.1134033203125, "loss_aux_layer_14": 0.1256103515625, "loss_aux_layer_15": 0.136962890625, "loss_aux_layer_16": 0.148193359375, "loss_aux_layer_17": 0.155517578125, "loss_aux_layer_18": 0.1650390625, "loss_aux_layer_19": 0.1669921875, "loss_aux_layer_2": 0.0791015625, "loss_aux_layer_20": 0.173095703125, "loss_aux_layer_21": 0.1796875, "loss_aux_layer_22": 0.20166015625, "loss_aux_layer_23": 0.24365234375, "loss_aux_layer_3": 0.0904541015625, "loss_aux_layer_4": 0.0926513671875, "loss_aux_layer_5": 0.093994140625, "loss_aux_layer_6": 0.0966796875, "loss_aux_layer_7": 0.0921630859375, "loss_aux_layer_8": 0.090576171875, "loss_aux_layer_9": 0.089599609375, "step": 1062, "total_loss": 0.7279775440692902 }, { "epoch": 0.21045337556919422, "grad_norm": 1.2080810070037842, "learning_rate": 5e-05, "llm_loss": 0.663512647151947, "loss": 3.132, "loss_aux_layer_0": 0.02508544921875, "loss_aux_layer_1": 0.06671142578125, "loss_aux_layer_10": 0.0885009765625, "loss_aux_layer_11": 0.0946044921875, "loss_aux_layer_12": 0.1024169921875, "loss_aux_layer_13": 0.1103515625, "loss_aux_layer_14": 0.1229248046875, "loss_aux_layer_15": 0.134033203125, "loss_aux_layer_16": 0.145263671875, "loss_aux_layer_17": 0.1533203125, "loss_aux_layer_18": 0.162841796875, "loss_aux_layer_19": 0.16455078125, "loss_aux_layer_2": 0.0750732421875, "loss_aux_layer_20": 0.171630859375, "loss_aux_layer_21": 0.1787109375, "loss_aux_layer_22": 0.20068359375, "loss_aux_layer_23": 0.24267578125, "loss_aux_layer_3": 0.08740234375, "loss_aux_layer_4": 0.0897216796875, "loss_aux_layer_5": 0.091552734375, "loss_aux_layer_6": 0.0943603515625, "loss_aux_layer_7": 0.08984375, "loss_aux_layer_8": 0.08837890625, "loss_aux_layer_9": 0.0867919921875, "step": 1063, "total_loss": 0.7829913347959518 }, { "epoch": 0.21065135616709563, "grad_norm": 1.4577580690383911, "learning_rate": 5e-05, "llm_loss": 0.6308604776859283, "loss": 2.989, "loss_aux_layer_0": 0.024688720703125, "loss_aux_layer_1": 0.0638427734375, "loss_aux_layer_10": 0.0858154296875, "loss_aux_layer_11": 0.0914306640625, "loss_aux_layer_12": 0.0986328125, "loss_aux_layer_13": 0.10595703125, "loss_aux_layer_14": 0.1182861328125, "loss_aux_layer_15": 0.1297607421875, "loss_aux_layer_16": 0.140869140625, "loss_aux_layer_17": 0.149169921875, "loss_aux_layer_18": 0.15869140625, "loss_aux_layer_19": 0.161376953125, "loss_aux_layer_2": 0.07470703125, "loss_aux_layer_20": 0.16845703125, "loss_aux_layer_21": 0.17529296875, "loss_aux_layer_22": 0.19677734375, "loss_aux_layer_23": 0.238525390625, "loss_aux_layer_3": 0.0853271484375, "loss_aux_layer_4": 0.0875244140625, "loss_aux_layer_5": 0.0887451171875, "loss_aux_layer_6": 0.09130859375, "loss_aux_layer_7": 0.0870361328125, "loss_aux_layer_8": 0.0855712890625, "loss_aux_layer_9": 0.084716796875, "step": 1064, "total_loss": 0.7472508698701859 }, { "epoch": 0.21084933676499704, "grad_norm": 1.434922218322754, "learning_rate": 5e-05, "llm_loss": 0.6511425971984863, "loss": 3.0759, "loss_aux_layer_0": 0.023590087890625, "loss_aux_layer_1": 0.06549072265625, "loss_aux_layer_10": 0.08837890625, "loss_aux_layer_11": 0.09375, "loss_aux_layer_12": 0.1011962890625, "loss_aux_layer_13": 0.10888671875, "loss_aux_layer_14": 0.1204833984375, "loss_aux_layer_15": 0.132080078125, "loss_aux_layer_16": 0.14306640625, "loss_aux_layer_17": 0.1513671875, "loss_aux_layer_18": 0.1611328125, "loss_aux_layer_19": 0.162353515625, "loss_aux_layer_2": 0.0758056640625, "loss_aux_layer_20": 0.1689453125, "loss_aux_layer_21": 0.1748046875, "loss_aux_layer_22": 0.194580078125, "loss_aux_layer_23": 0.236083984375, "loss_aux_layer_3": 0.08740234375, "loss_aux_layer_4": 0.0899658203125, "loss_aux_layer_5": 0.091064453125, "loss_aux_layer_6": 0.0938720703125, "loss_aux_layer_7": 0.0894775390625, "loss_aux_layer_8": 0.088134765625, "loss_aux_layer_9": 0.0869140625, "step": 1065, "total_loss": 0.7689714282751083 }, { "epoch": 0.21104731736289845, "grad_norm": 1.7241978645324707, "learning_rate": 5e-05, "llm_loss": 0.6233316510915756, "loss": 2.9983, "loss_aux_layer_0": 0.027679443359375, "loss_aux_layer_1": 0.073486328125, "loss_aux_layer_10": 0.09619140625, "loss_aux_layer_11": 0.1029052734375, "loss_aux_layer_12": 0.110595703125, "loss_aux_layer_13": 0.1190185546875, "loss_aux_layer_14": 0.130615234375, "loss_aux_layer_15": 0.142578125, "loss_aux_layer_16": 0.1533203125, "loss_aux_layer_17": 0.15966796875, "loss_aux_layer_18": 0.16845703125, "loss_aux_layer_19": 0.169189453125, "loss_aux_layer_2": 0.083984375, "loss_aux_layer_20": 0.17529296875, "loss_aux_layer_21": 0.182373046875, "loss_aux_layer_22": 0.205078125, "loss_aux_layer_23": 0.24658203125, "loss_aux_layer_3": 0.0955810546875, "loss_aux_layer_4": 0.098388671875, "loss_aux_layer_5": 0.099609375, "loss_aux_layer_6": 0.101806640625, "loss_aux_layer_7": 0.0975341796875, "loss_aux_layer_8": 0.09619140625, "loss_aux_layer_9": 0.094482421875, "step": 1066, "total_loss": 0.7495687454938889 }, { "epoch": 0.21124529796079985, "grad_norm": 1.222422480583191, "learning_rate": 5e-05, "llm_loss": 0.5828482657670975, "loss": 2.7999, "loss_aux_layer_0": 0.025054931640625, "loss_aux_layer_1": 0.0650634765625, "loss_aux_layer_10": 0.0860595703125, "loss_aux_layer_11": 0.0916748046875, "loss_aux_layer_12": 0.0994873046875, "loss_aux_layer_13": 0.1075439453125, "loss_aux_layer_14": 0.119140625, "loss_aux_layer_15": 0.13037109375, "loss_aux_layer_16": 0.1416015625, "loss_aux_layer_17": 0.150390625, "loss_aux_layer_18": 0.1591796875, "loss_aux_layer_19": 0.161865234375, "loss_aux_layer_2": 0.0740966796875, "loss_aux_layer_20": 0.16943359375, "loss_aux_layer_21": 0.17724609375, "loss_aux_layer_22": 0.198974609375, "loss_aux_layer_23": 0.24169921875, "loss_aux_layer_3": 0.0853271484375, "loss_aux_layer_4": 0.0877685546875, "loss_aux_layer_5": 0.0892333984375, "loss_aux_layer_6": 0.091552734375, "loss_aux_layer_7": 0.0877685546875, "loss_aux_layer_8": 0.0863037109375, "loss_aux_layer_9": 0.0848388671875, "step": 1067, "total_loss": 0.6999860256910324 }, { "epoch": 0.21144327855870124, "grad_norm": 1.3102182149887085, "learning_rate": 5e-05, "llm_loss": 0.5762145668268204, "loss": 2.796, "loss_aux_layer_0": 0.02642822265625, "loss_aux_layer_1": 0.0673828125, "loss_aux_layer_10": 0.0919189453125, "loss_aux_layer_11": 0.0975341796875, "loss_aux_layer_12": 0.1055908203125, "loss_aux_layer_13": 0.1136474609375, "loss_aux_layer_14": 0.126708984375, "loss_aux_layer_15": 0.138671875, "loss_aux_layer_16": 0.150146484375, "loss_aux_layer_17": 0.158203125, "loss_aux_layer_18": 0.167236328125, "loss_aux_layer_19": 0.17041015625, "loss_aux_layer_2": 0.0760498046875, "loss_aux_layer_20": 0.176513671875, "loss_aux_layer_21": 0.182373046875, "loss_aux_layer_22": 0.205322265625, "loss_aux_layer_23": 0.24853515625, "loss_aux_layer_3": 0.0887451171875, "loss_aux_layer_4": 0.0916748046875, "loss_aux_layer_5": 0.09326171875, "loss_aux_layer_6": 0.0966796875, "loss_aux_layer_7": 0.0924072265625, "loss_aux_layer_8": 0.091064453125, "loss_aux_layer_9": 0.09033203125, "step": 1068, "total_loss": 0.6990074813365936 }, { "epoch": 0.21164125915660265, "grad_norm": 1.9818732738494873, "learning_rate": 5e-05, "llm_loss": 0.7063381224870682, "loss": 3.2983, "loss_aux_layer_0": 0.025970458984375, "loss_aux_layer_1": 0.06689453125, "loss_aux_layer_10": 0.088623046875, "loss_aux_layer_11": 0.0941162109375, "loss_aux_layer_12": 0.1024169921875, "loss_aux_layer_13": 0.1099853515625, "loss_aux_layer_14": 0.122314453125, "loss_aux_layer_15": 0.133544921875, "loss_aux_layer_16": 0.14453125, "loss_aux_layer_17": 0.152587890625, "loss_aux_layer_18": 0.1611328125, "loss_aux_layer_19": 0.16259765625, "loss_aux_layer_2": 0.0750732421875, "loss_aux_layer_20": 0.16845703125, "loss_aux_layer_21": 0.17431640625, "loss_aux_layer_22": 0.194580078125, "loss_aux_layer_23": 0.235595703125, "loss_aux_layer_3": 0.08740234375, "loss_aux_layer_4": 0.08935546875, "loss_aux_layer_5": 0.091064453125, "loss_aux_layer_6": 0.0933837890625, "loss_aux_layer_7": 0.0888671875, "loss_aux_layer_8": 0.0877685546875, "loss_aux_layer_9": 0.0867919921875, "step": 1069, "total_loss": 0.8245662301778793 }, { "epoch": 0.21183923975450406, "grad_norm": 1.8641505241394043, "learning_rate": 5e-05, "llm_loss": 0.6559775322675705, "loss": 3.1052, "loss_aux_layer_0": 0.025970458984375, "loss_aux_layer_1": 0.06805419921875, "loss_aux_layer_10": 0.08984375, "loss_aux_layer_11": 0.0958251953125, "loss_aux_layer_12": 0.104248046875, "loss_aux_layer_13": 0.1126708984375, "loss_aux_layer_14": 0.12451171875, "loss_aux_layer_15": 0.1357421875, "loss_aux_layer_16": 0.14697265625, "loss_aux_layer_17": 0.155029296875, "loss_aux_layer_18": 0.163330078125, "loss_aux_layer_19": 0.164306640625, "loss_aux_layer_2": 0.077392578125, "loss_aux_layer_20": 0.171142578125, "loss_aux_layer_21": 0.1767578125, "loss_aux_layer_22": 0.1982421875, "loss_aux_layer_23": 0.239501953125, "loss_aux_layer_3": 0.0882568359375, "loss_aux_layer_4": 0.0914306640625, "loss_aux_layer_5": 0.0933837890625, "loss_aux_layer_6": 0.0958251953125, "loss_aux_layer_7": 0.0914306640625, "loss_aux_layer_8": 0.0899658203125, "loss_aux_layer_9": 0.088623046875, "step": 1070, "total_loss": 0.7762999385595322 }, { "epoch": 0.21203722035240546, "grad_norm": 0.8538686037063599, "learning_rate": 5e-05, "llm_loss": 0.6415567398071289, "loss": 3.0296, "loss_aux_layer_0": 0.025726318359375, "loss_aux_layer_1": 0.0670166015625, "loss_aux_layer_10": 0.0869140625, "loss_aux_layer_11": 0.0924072265625, "loss_aux_layer_12": 0.099853515625, "loss_aux_layer_13": 0.10693359375, "loss_aux_layer_14": 0.1180419921875, "loss_aux_layer_15": 0.12890625, "loss_aux_layer_16": 0.1396484375, "loss_aux_layer_17": 0.147705078125, "loss_aux_layer_18": 0.15576171875, "loss_aux_layer_19": 0.1572265625, "loss_aux_layer_2": 0.07470703125, "loss_aux_layer_20": 0.164306640625, "loss_aux_layer_21": 0.170166015625, "loss_aux_layer_22": 0.19140625, "loss_aux_layer_23": 0.231689453125, "loss_aux_layer_3": 0.0865478515625, "loss_aux_layer_4": 0.0892333984375, "loss_aux_layer_5": 0.0906982421875, "loss_aux_layer_6": 0.09326171875, "loss_aux_layer_7": 0.089111328125, "loss_aux_layer_8": 0.08740234375, "loss_aux_layer_9": 0.0855712890625, "step": 1071, "total_loss": 0.7574100494384766 }, { "epoch": 0.21223520095030687, "grad_norm": 2.380711078643799, "learning_rate": 5e-05, "llm_loss": 0.579231783747673, "loss": 2.7914, "loss_aux_layer_0": 0.025543212890625, "loss_aux_layer_1": 0.06689453125, "loss_aux_layer_10": 0.0882568359375, "loss_aux_layer_11": 0.0943603515625, "loss_aux_layer_12": 0.1015625, "loss_aux_layer_13": 0.1094970703125, "loss_aux_layer_14": 0.1212158203125, "loss_aux_layer_15": 0.1328125, "loss_aux_layer_16": 0.1435546875, "loss_aux_layer_17": 0.151611328125, "loss_aux_layer_18": 0.1611328125, "loss_aux_layer_19": 0.162353515625, "loss_aux_layer_2": 0.0758056640625, "loss_aux_layer_20": 0.169189453125, "loss_aux_layer_21": 0.176513671875, "loss_aux_layer_22": 0.19873046875, "loss_aux_layer_23": 0.24072265625, "loss_aux_layer_3": 0.087158203125, "loss_aux_layer_4": 0.08984375, "loss_aux_layer_5": 0.091796875, "loss_aux_layer_6": 0.093994140625, "loss_aux_layer_7": 0.0894775390625, "loss_aux_layer_8": 0.0880126953125, "loss_aux_layer_9": 0.0867919921875, "step": 1072, "total_loss": 0.6978523433208466 }, { "epoch": 0.21243318154820828, "grad_norm": 1.9283804893493652, "learning_rate": 5e-05, "llm_loss": 0.6099755167961121, "loss": 2.931, "loss_aux_layer_0": 0.027130126953125, "loss_aux_layer_1": 0.0716552734375, "loss_aux_layer_10": 0.0926513671875, "loss_aux_layer_11": 0.098876953125, "loss_aux_layer_12": 0.1060791015625, "loss_aux_layer_13": 0.1136474609375, "loss_aux_layer_14": 0.1260986328125, "loss_aux_layer_15": 0.13818359375, "loss_aux_layer_16": 0.1484375, "loss_aux_layer_17": 0.156494140625, "loss_aux_layer_18": 0.1650390625, "loss_aux_layer_19": 0.166015625, "loss_aux_layer_2": 0.0811767578125, "loss_aux_layer_20": 0.172119140625, "loss_aux_layer_21": 0.177978515625, "loss_aux_layer_22": 0.199951171875, "loss_aux_layer_23": 0.240966796875, "loss_aux_layer_3": 0.093017578125, "loss_aux_layer_4": 0.095947265625, "loss_aux_layer_5": 0.097412109375, "loss_aux_layer_6": 0.099853515625, "loss_aux_layer_7": 0.0946044921875, "loss_aux_layer_8": 0.0926513671875, "loss_aux_layer_9": 0.091064453125, "step": 1073, "total_loss": 0.7327540963888168 }, { "epoch": 0.2126311621461097, "grad_norm": 1.1021801233291626, "learning_rate": 5e-05, "llm_loss": 0.6434116810560226, "loss": 3.0667, "loss_aux_layer_0": 0.025970458984375, "loss_aux_layer_1": 0.0704345703125, "loss_aux_layer_10": 0.0931396484375, "loss_aux_layer_11": 0.0992431640625, "loss_aux_layer_12": 0.1075439453125, "loss_aux_layer_13": 0.11572265625, "loss_aux_layer_14": 0.1273193359375, "loss_aux_layer_15": 0.138671875, "loss_aux_layer_16": 0.14990234375, "loss_aux_layer_17": 0.15771484375, "loss_aux_layer_18": 0.166748046875, "loss_aux_layer_19": 0.167724609375, "loss_aux_layer_2": 0.0787353515625, "loss_aux_layer_20": 0.173583984375, "loss_aux_layer_21": 0.179931640625, "loss_aux_layer_22": 0.203125, "loss_aux_layer_23": 0.243408203125, "loss_aux_layer_3": 0.0909423828125, "loss_aux_layer_4": 0.0941162109375, "loss_aux_layer_5": 0.0955810546875, "loss_aux_layer_6": 0.098876953125, "loss_aux_layer_7": 0.094482421875, "loss_aux_layer_8": 0.0928955078125, "loss_aux_layer_9": 0.0914306640625, "step": 1074, "total_loss": 0.766684278845787 }, { "epoch": 0.21282914274401107, "grad_norm": 2.3627707958221436, "learning_rate": 5e-05, "llm_loss": 0.6704481542110443, "loss": 3.1681, "loss_aux_layer_0": 0.02435302734375, "loss_aux_layer_1": 0.0684814453125, "loss_aux_layer_10": 0.093505859375, "loss_aux_layer_11": 0.098876953125, "loss_aux_layer_12": 0.10693359375, "loss_aux_layer_13": 0.1146240234375, "loss_aux_layer_14": 0.1263427734375, "loss_aux_layer_15": 0.13671875, "loss_aux_layer_16": 0.1474609375, "loss_aux_layer_17": 0.1552734375, "loss_aux_layer_18": 0.1630859375, "loss_aux_layer_19": 0.163330078125, "loss_aux_layer_2": 0.0799560546875, "loss_aux_layer_20": 0.16943359375, "loss_aux_layer_21": 0.174560546875, "loss_aux_layer_22": 0.1943359375, "loss_aux_layer_23": 0.23388671875, "loss_aux_layer_3": 0.0921630859375, "loss_aux_layer_4": 0.095703125, "loss_aux_layer_5": 0.097900390625, "loss_aux_layer_6": 0.1005859375, "loss_aux_layer_7": 0.09521484375, "loss_aux_layer_8": 0.093505859375, "loss_aux_layer_9": 0.0921630859375, "step": 1075, "total_loss": 0.792034775018692 }, { "epoch": 0.21302712334191248, "grad_norm": 1.7539879083633423, "learning_rate": 5e-05, "llm_loss": 0.6930158883333206, "loss": 3.2517, "loss_aux_layer_0": 0.023956298828125, "loss_aux_layer_1": 0.067626953125, "loss_aux_layer_10": 0.0899658203125, "loss_aux_layer_11": 0.09521484375, "loss_aux_layer_12": 0.10302734375, "loss_aux_layer_13": 0.110595703125, "loss_aux_layer_14": 0.12255859375, "loss_aux_layer_15": 0.134033203125, "loss_aux_layer_16": 0.145263671875, "loss_aux_layer_17": 0.153076171875, "loss_aux_layer_18": 0.16259765625, "loss_aux_layer_19": 0.16357421875, "loss_aux_layer_2": 0.076904296875, "loss_aux_layer_20": 0.171142578125, "loss_aux_layer_21": 0.176513671875, "loss_aux_layer_22": 0.19970703125, "loss_aux_layer_23": 0.2412109375, "loss_aux_layer_3": 0.089111328125, "loss_aux_layer_4": 0.0919189453125, "loss_aux_layer_5": 0.09375, "loss_aux_layer_6": 0.095947265625, "loss_aux_layer_7": 0.0919189453125, "loss_aux_layer_8": 0.08984375, "loss_aux_layer_9": 0.0885009765625, "step": 1076, "total_loss": 0.8129304498434067 }, { "epoch": 0.2132251039398139, "grad_norm": 2.0432469844818115, "learning_rate": 5e-05, "llm_loss": 0.6512560397386551, "loss": 3.0952, "loss_aux_layer_0": 0.023895263671875, "loss_aux_layer_1": 0.0694580078125, "loss_aux_layer_10": 0.092529296875, "loss_aux_layer_11": 0.0982666015625, "loss_aux_layer_12": 0.1058349609375, "loss_aux_layer_13": 0.113525390625, "loss_aux_layer_14": 0.1253662109375, "loss_aux_layer_15": 0.13623046875, "loss_aux_layer_16": 0.1474609375, "loss_aux_layer_17": 0.155029296875, "loss_aux_layer_18": 0.16455078125, "loss_aux_layer_19": 0.16650390625, "loss_aux_layer_2": 0.0806884765625, "loss_aux_layer_20": 0.17236328125, "loss_aux_layer_21": 0.17919921875, "loss_aux_layer_22": 0.200439453125, "loss_aux_layer_23": 0.2431640625, "loss_aux_layer_3": 0.0933837890625, "loss_aux_layer_4": 0.0966796875, "loss_aux_layer_5": 0.0982666015625, "loss_aux_layer_6": 0.1005859375, "loss_aux_layer_7": 0.0948486328125, "loss_aux_layer_8": 0.0928955078125, "loss_aux_layer_9": 0.091064453125, "step": 1077, "total_loss": 0.7738078385591507 }, { "epoch": 0.2134230845377153, "grad_norm": 3.061217784881592, "learning_rate": 5e-05, "llm_loss": 0.6840234100818634, "loss": 3.202, "loss_aux_layer_0": 0.023773193359375, "loss_aux_layer_1": 0.06451416015625, "loss_aux_layer_10": 0.08740234375, "loss_aux_layer_11": 0.0931396484375, "loss_aux_layer_12": 0.1005859375, "loss_aux_layer_13": 0.1080322265625, "loss_aux_layer_14": 0.119873046875, "loss_aux_layer_15": 0.1304931640625, "loss_aux_layer_16": 0.1416015625, "loss_aux_layer_17": 0.14990234375, "loss_aux_layer_18": 0.158203125, "loss_aux_layer_19": 0.15966796875, "loss_aux_layer_2": 0.075927734375, "loss_aux_layer_20": 0.165771484375, "loss_aux_layer_21": 0.170654296875, "loss_aux_layer_22": 0.191162109375, "loss_aux_layer_23": 0.231201171875, "loss_aux_layer_3": 0.0870361328125, "loss_aux_layer_4": 0.0897216796875, "loss_aux_layer_5": 0.0911865234375, "loss_aux_layer_6": 0.093505859375, "loss_aux_layer_7": 0.089111328125, "loss_aux_layer_8": 0.0872802734375, "loss_aux_layer_9": 0.0860595703125, "step": 1078, "total_loss": 0.8005072176456451 }, { "epoch": 0.2136210651356167, "grad_norm": 1.0900274515151978, "learning_rate": 5e-05, "llm_loss": 0.6134267449378967, "loss": 2.9228, "loss_aux_layer_0": 0.024993896484375, "loss_aux_layer_1": 0.06512451171875, "loss_aux_layer_10": 0.0860595703125, "loss_aux_layer_11": 0.09130859375, "loss_aux_layer_12": 0.099365234375, "loss_aux_layer_13": 0.1075439453125, "loss_aux_layer_14": 0.120361328125, "loss_aux_layer_15": 0.13232421875, "loss_aux_layer_16": 0.143798828125, "loss_aux_layer_17": 0.152099609375, "loss_aux_layer_18": 0.161865234375, "loss_aux_layer_19": 0.16357421875, "loss_aux_layer_2": 0.0721435546875, "loss_aux_layer_20": 0.17041015625, "loss_aux_layer_21": 0.177490234375, "loss_aux_layer_22": 0.19873046875, "loss_aux_layer_23": 0.2412109375, "loss_aux_layer_3": 0.0836181640625, "loss_aux_layer_4": 0.0865478515625, "loss_aux_layer_5": 0.0882568359375, "loss_aux_layer_6": 0.0911865234375, "loss_aux_layer_7": 0.0869140625, "loss_aux_layer_8": 0.0855712890625, "loss_aux_layer_9": 0.0845947265625, "step": 1079, "total_loss": 0.7307037562131882 }, { "epoch": 0.21381904573351812, "grad_norm": 2.082979917526245, "learning_rate": 5e-05, "llm_loss": 0.6188114881515503, "loss": 2.9462, "loss_aux_layer_0": 0.023284912109375, "loss_aux_layer_1": 0.06536865234375, "loss_aux_layer_10": 0.088623046875, "loss_aux_layer_11": 0.09423828125, "loss_aux_layer_12": 0.1014404296875, "loss_aux_layer_13": 0.109375, "loss_aux_layer_14": 0.1212158203125, "loss_aux_layer_15": 0.132568359375, "loss_aux_layer_16": 0.143310546875, "loss_aux_layer_17": 0.151123046875, "loss_aux_layer_18": 0.159912109375, "loss_aux_layer_19": 0.1611328125, "loss_aux_layer_2": 0.07421875, "loss_aux_layer_20": 0.167724609375, "loss_aux_layer_21": 0.173583984375, "loss_aux_layer_22": 0.194580078125, "loss_aux_layer_23": 0.23583984375, "loss_aux_layer_3": 0.0872802734375, "loss_aux_layer_4": 0.0899658203125, "loss_aux_layer_5": 0.0921630859375, "loss_aux_layer_6": 0.0943603515625, "loss_aux_layer_7": 0.0897216796875, "loss_aux_layer_8": 0.088134765625, "loss_aux_layer_9": 0.0870361328125, "step": 1080, "total_loss": 0.7365516275167465 }, { "epoch": 0.21401702633141953, "grad_norm": 1.5993032455444336, "learning_rate": 5e-05, "llm_loss": 0.6321681141853333, "loss": 3.025, "loss_aux_layer_0": 0.0250244140625, "loss_aux_layer_1": 0.0701904296875, "loss_aux_layer_10": 0.0933837890625, "loss_aux_layer_11": 0.099609375, "loss_aux_layer_12": 0.107666015625, "loss_aux_layer_13": 0.1156005859375, "loss_aux_layer_14": 0.1278076171875, "loss_aux_layer_15": 0.139404296875, "loss_aux_layer_16": 0.15087890625, "loss_aux_layer_17": 0.158447265625, "loss_aux_layer_18": 0.167236328125, "loss_aux_layer_19": 0.16943359375, "loss_aux_layer_2": 0.0784912109375, "loss_aux_layer_20": 0.176513671875, "loss_aux_layer_21": 0.18408203125, "loss_aux_layer_22": 0.208251953125, "loss_aux_layer_23": 0.25048828125, "loss_aux_layer_3": 0.0908203125, "loss_aux_layer_4": 0.0933837890625, "loss_aux_layer_5": 0.0948486328125, "loss_aux_layer_6": 0.09765625, "loss_aux_layer_7": 0.093994140625, "loss_aux_layer_8": 0.0928955078125, "loss_aux_layer_9": 0.0916748046875, "step": 1081, "total_loss": 0.7562386095523834 }, { "epoch": 0.21421500692932094, "grad_norm": 2.5608322620391846, "learning_rate": 5e-05, "llm_loss": 0.6608536690473557, "loss": 3.1278, "loss_aux_layer_0": 0.025054931640625, "loss_aux_layer_1": 0.0697021484375, "loss_aux_layer_10": 0.091552734375, "loss_aux_layer_11": 0.0975341796875, "loss_aux_layer_12": 0.1048583984375, "loss_aux_layer_13": 0.1129150390625, "loss_aux_layer_14": 0.12451171875, "loss_aux_layer_15": 0.135498046875, "loss_aux_layer_16": 0.147216796875, "loss_aux_layer_17": 0.154296875, "loss_aux_layer_18": 0.163330078125, "loss_aux_layer_19": 0.164794921875, "loss_aux_layer_2": 0.0791015625, "loss_aux_layer_20": 0.1708984375, "loss_aux_layer_21": 0.17626953125, "loss_aux_layer_22": 0.197021484375, "loss_aux_layer_23": 0.23681640625, "loss_aux_layer_3": 0.091552734375, "loss_aux_layer_4": 0.09423828125, "loss_aux_layer_5": 0.0963134765625, "loss_aux_layer_6": 0.0982666015625, "loss_aux_layer_7": 0.093017578125, "loss_aux_layer_8": 0.091552734375, "loss_aux_layer_9": 0.0899658203125, "step": 1082, "total_loss": 0.7819474041461945 }, { "epoch": 0.21441298752722232, "grad_norm": 1.3536959886550903, "learning_rate": 5e-05, "llm_loss": 0.5508774816989899, "loss": 2.7016, "loss_aux_layer_0": 0.025054931640625, "loss_aux_layer_1": 0.0703125, "loss_aux_layer_10": 0.0946044921875, "loss_aux_layer_11": 0.1011962890625, "loss_aux_layer_12": 0.1090087890625, "loss_aux_layer_13": 0.11669921875, "loss_aux_layer_14": 0.12841796875, "loss_aux_layer_15": 0.139404296875, "loss_aux_layer_16": 0.150634765625, "loss_aux_layer_17": 0.15673828125, "loss_aux_layer_18": 0.166748046875, "loss_aux_layer_19": 0.16796875, "loss_aux_layer_2": 0.0811767578125, "loss_aux_layer_20": 0.1748046875, "loss_aux_layer_21": 0.181640625, "loss_aux_layer_22": 0.204345703125, "loss_aux_layer_23": 0.2470703125, "loss_aux_layer_3": 0.0933837890625, "loss_aux_layer_4": 0.0965576171875, "loss_aux_layer_5": 0.0982666015625, "loss_aux_layer_6": 0.1011962890625, "loss_aux_layer_7": 0.09619140625, "loss_aux_layer_8": 0.0946044921875, "loss_aux_layer_9": 0.0928955078125, "step": 1083, "total_loss": 0.6754101812839508 }, { "epoch": 0.21461096812512373, "grad_norm": 1.3446980714797974, "learning_rate": 5e-05, "llm_loss": 0.6904955953359604, "loss": 3.2217, "loss_aux_layer_0": 0.0247802734375, "loss_aux_layer_1": 0.0635986328125, "loss_aux_layer_10": 0.0845947265625, "loss_aux_layer_11": 0.090087890625, "loss_aux_layer_12": 0.0972900390625, "loss_aux_layer_13": 0.1051025390625, "loss_aux_layer_14": 0.11767578125, "loss_aux_layer_15": 0.128662109375, "loss_aux_layer_16": 0.140625, "loss_aux_layer_17": 0.1484375, "loss_aux_layer_18": 0.157470703125, "loss_aux_layer_19": 0.158935546875, "loss_aux_layer_2": 0.073486328125, "loss_aux_layer_20": 0.166015625, "loss_aux_layer_21": 0.172119140625, "loss_aux_layer_22": 0.19189453125, "loss_aux_layer_23": 0.232421875, "loss_aux_layer_3": 0.084228515625, "loss_aux_layer_4": 0.0869140625, "loss_aux_layer_5": 0.0885009765625, "loss_aux_layer_6": 0.0908203125, "loss_aux_layer_7": 0.0860595703125, "loss_aux_layer_8": 0.0845947265625, "loss_aux_layer_9": 0.0830078125, "step": 1084, "total_loss": 0.8054214417934418 }, { "epoch": 0.21480894872302514, "grad_norm": 1.2241368293762207, "learning_rate": 5e-05, "llm_loss": 0.5615702420473099, "loss": 2.7224, "loss_aux_layer_0": 0.02471923828125, "loss_aux_layer_1": 0.06689453125, "loss_aux_layer_10": 0.0889892578125, "loss_aux_layer_11": 0.0946044921875, "loss_aux_layer_12": 0.1026611328125, "loss_aux_layer_13": 0.1107177734375, "loss_aux_layer_14": 0.12255859375, "loss_aux_layer_15": 0.134765625, "loss_aux_layer_16": 0.14599609375, "loss_aux_layer_17": 0.1533203125, "loss_aux_layer_18": 0.162841796875, "loss_aux_layer_19": 0.163818359375, "loss_aux_layer_2": 0.07470703125, "loss_aux_layer_20": 0.169921875, "loss_aux_layer_21": 0.17529296875, "loss_aux_layer_22": 0.196044921875, "loss_aux_layer_23": 0.23828125, "loss_aux_layer_3": 0.0872802734375, "loss_aux_layer_4": 0.0899658203125, "loss_aux_layer_5": 0.0914306640625, "loss_aux_layer_6": 0.0943603515625, "loss_aux_layer_7": 0.0904541015625, "loss_aux_layer_8": 0.0889892578125, "loss_aux_layer_9": 0.087646484375, "step": 1085, "total_loss": 0.6805922985076904 }, { "epoch": 0.21500692932092655, "grad_norm": 1.946886420249939, "learning_rate": 5e-05, "llm_loss": 0.6091530919075012, "loss": 2.9191, "loss_aux_layer_0": 0.027130126953125, "loss_aux_layer_1": 0.0714111328125, "loss_aux_layer_10": 0.090087890625, "loss_aux_layer_11": 0.0960693359375, "loss_aux_layer_12": 0.103759765625, "loss_aux_layer_13": 0.1114501953125, "loss_aux_layer_14": 0.1240234375, "loss_aux_layer_15": 0.135498046875, "loss_aux_layer_16": 0.146484375, "loss_aux_layer_17": 0.15478515625, "loss_aux_layer_18": 0.1630859375, "loss_aux_layer_19": 0.164306640625, "loss_aux_layer_2": 0.0804443359375, "loss_aux_layer_20": 0.170654296875, "loss_aux_layer_21": 0.17626953125, "loss_aux_layer_22": 0.196533203125, "loss_aux_layer_23": 0.2373046875, "loss_aux_layer_3": 0.09033203125, "loss_aux_layer_4": 0.0928955078125, "loss_aux_layer_5": 0.094482421875, "loss_aux_layer_6": 0.0968017578125, "loss_aux_layer_7": 0.091796875, "loss_aux_layer_8": 0.0904541015625, "loss_aux_layer_9": 0.0887451171875, "step": 1086, "total_loss": 0.7297862023115158 }, { "epoch": 0.21520490991882796, "grad_norm": 1.064130187034607, "learning_rate": 5e-05, "llm_loss": 0.6782003492116928, "loss": 3.1751, "loss_aux_layer_0": 0.02410888671875, "loss_aux_layer_1": 0.06402587890625, "loss_aux_layer_10": 0.0855712890625, "loss_aux_layer_11": 0.0914306640625, "loss_aux_layer_12": 0.0987548828125, "loss_aux_layer_13": 0.1065673828125, "loss_aux_layer_14": 0.1185302734375, "loss_aux_layer_15": 0.130615234375, "loss_aux_layer_16": 0.141845703125, "loss_aux_layer_17": 0.149658203125, "loss_aux_layer_18": 0.159423828125, "loss_aux_layer_19": 0.161376953125, "loss_aux_layer_2": 0.07080078125, "loss_aux_layer_20": 0.16845703125, "loss_aux_layer_21": 0.174072265625, "loss_aux_layer_22": 0.19482421875, "loss_aux_layer_23": 0.23583984375, "loss_aux_layer_3": 0.0826416015625, "loss_aux_layer_4": 0.08544921875, "loss_aux_layer_5": 0.0870361328125, "loss_aux_layer_6": 0.0899658203125, "loss_aux_layer_7": 0.0858154296875, "loss_aux_layer_8": 0.0845947265625, "loss_aux_layer_9": 0.08349609375, "step": 1087, "total_loss": 0.7937695980072021 }, { "epoch": 0.21540289051672937, "grad_norm": 1.8249469995498657, "learning_rate": 5e-05, "llm_loss": 0.6736612915992737, "loss": 3.1658, "loss_aux_layer_0": 0.0252685546875, "loss_aux_layer_1": 0.0679931640625, "loss_aux_layer_10": 0.0880126953125, "loss_aux_layer_11": 0.093505859375, "loss_aux_layer_12": 0.10107421875, "loss_aux_layer_13": 0.1087646484375, "loss_aux_layer_14": 0.1209716796875, "loss_aux_layer_15": 0.1324462890625, "loss_aux_layer_16": 0.14404296875, "loss_aux_layer_17": 0.15185546875, "loss_aux_layer_18": 0.160888671875, "loss_aux_layer_19": 0.161865234375, "loss_aux_layer_2": 0.0762939453125, "loss_aux_layer_20": 0.168212890625, "loss_aux_layer_21": 0.173828125, "loss_aux_layer_22": 0.1923828125, "loss_aux_layer_23": 0.2333984375, "loss_aux_layer_3": 0.08740234375, "loss_aux_layer_4": 0.08984375, "loss_aux_layer_5": 0.0911865234375, "loss_aux_layer_6": 0.093994140625, "loss_aux_layer_7": 0.0897216796875, "loss_aux_layer_8": 0.087646484375, "loss_aux_layer_9": 0.0865478515625, "step": 1088, "total_loss": 0.7914621531963348 }, { "epoch": 0.21560087111463078, "grad_norm": 1.531193733215332, "learning_rate": 5e-05, "llm_loss": 0.6322687417268753, "loss": 3.011, "loss_aux_layer_0": 0.024688720703125, "loss_aux_layer_1": 0.06732177734375, "loss_aux_layer_10": 0.09033203125, "loss_aux_layer_11": 0.09619140625, "loss_aux_layer_12": 0.1038818359375, "loss_aux_layer_13": 0.1123046875, "loss_aux_layer_14": 0.124755859375, "loss_aux_layer_15": 0.136474609375, "loss_aux_layer_16": 0.147705078125, "loss_aux_layer_17": 0.15576171875, "loss_aux_layer_18": 0.164794921875, "loss_aux_layer_19": 0.166015625, "loss_aux_layer_2": 0.0743408203125, "loss_aux_layer_20": 0.172119140625, "loss_aux_layer_21": 0.178955078125, "loss_aux_layer_22": 0.19921875, "loss_aux_layer_23": 0.24072265625, "loss_aux_layer_3": 0.0875244140625, "loss_aux_layer_4": 0.0906982421875, "loss_aux_layer_5": 0.09228515625, "loss_aux_layer_6": 0.0950927734375, "loss_aux_layer_7": 0.0914306640625, "loss_aux_layer_8": 0.08984375, "loss_aux_layer_9": 0.0889892578125, "step": 1089, "total_loss": 0.7527506947517395 }, { "epoch": 0.21579885171253216, "grad_norm": 1.3179984092712402, "learning_rate": 5e-05, "llm_loss": 0.6133746951818466, "loss": 2.9227, "loss_aux_layer_0": 0.02447509765625, "loss_aux_layer_1": 0.06591796875, "loss_aux_layer_10": 0.0870361328125, "loss_aux_layer_11": 0.092529296875, "loss_aux_layer_12": 0.10009765625, "loss_aux_layer_13": 0.10791015625, "loss_aux_layer_14": 0.1199951171875, "loss_aux_layer_15": 0.130859375, "loss_aux_layer_16": 0.141845703125, "loss_aux_layer_17": 0.14990234375, "loss_aux_layer_18": 0.1591796875, "loss_aux_layer_19": 0.1611328125, "loss_aux_layer_2": 0.074462890625, "loss_aux_layer_20": 0.168212890625, "loss_aux_layer_21": 0.1748046875, "loss_aux_layer_22": 0.197509765625, "loss_aux_layer_23": 0.2392578125, "loss_aux_layer_3": 0.0865478515625, "loss_aux_layer_4": 0.08935546875, "loss_aux_layer_5": 0.0908203125, "loss_aux_layer_6": 0.09326171875, "loss_aux_layer_7": 0.0887451171875, "loss_aux_layer_8": 0.0869140625, "loss_aux_layer_9": 0.0853271484375, "step": 1090, "total_loss": 0.7306866347789764 }, { "epoch": 0.21599683231043357, "grad_norm": 1.1745818853378296, "learning_rate": 5e-05, "llm_loss": 0.6878840178251266, "loss": 3.2288, "loss_aux_layer_0": 0.023834228515625, "loss_aux_layer_1": 0.067138671875, "loss_aux_layer_10": 0.0908203125, "loss_aux_layer_11": 0.0968017578125, "loss_aux_layer_12": 0.1048583984375, "loss_aux_layer_13": 0.1121826171875, "loss_aux_layer_14": 0.123046875, "loss_aux_layer_15": 0.1337890625, "loss_aux_layer_16": 0.144287109375, "loss_aux_layer_17": 0.151611328125, "loss_aux_layer_18": 0.16015625, "loss_aux_layer_19": 0.1611328125, "loss_aux_layer_2": 0.0755615234375, "loss_aux_layer_20": 0.16748046875, "loss_aux_layer_21": 0.175048828125, "loss_aux_layer_22": 0.19677734375, "loss_aux_layer_23": 0.23876953125, "loss_aux_layer_3": 0.08837890625, "loss_aux_layer_4": 0.091064453125, "loss_aux_layer_5": 0.0931396484375, "loss_aux_layer_6": 0.09619140625, "loss_aux_layer_7": 0.092041015625, "loss_aux_layer_8": 0.090576171875, "loss_aux_layer_9": 0.0892333984375, "step": 1091, "total_loss": 0.8072077929973602 }, { "epoch": 0.21619481290833498, "grad_norm": 1.3468892574310303, "learning_rate": 5e-05, "llm_loss": 0.5918448865413666, "loss": 2.8367, "loss_aux_layer_0": 0.025146484375, "loss_aux_layer_1": 0.06500244140625, "loss_aux_layer_10": 0.087646484375, "loss_aux_layer_11": 0.0936279296875, "loss_aux_layer_12": 0.1011962890625, "loss_aux_layer_13": 0.1092529296875, "loss_aux_layer_14": 0.1209716796875, "loss_aux_layer_15": 0.132080078125, "loss_aux_layer_16": 0.1435546875, "loss_aux_layer_17": 0.151611328125, "loss_aux_layer_18": 0.16064453125, "loss_aux_layer_19": 0.162353515625, "loss_aux_layer_2": 0.0726318359375, "loss_aux_layer_20": 0.1689453125, "loss_aux_layer_21": 0.1748046875, "loss_aux_layer_22": 0.1953125, "loss_aux_layer_23": 0.236328125, "loss_aux_layer_3": 0.08447265625, "loss_aux_layer_4": 0.0875244140625, "loss_aux_layer_5": 0.08935546875, "loss_aux_layer_6": 0.0924072265625, "loss_aux_layer_7": 0.088134765625, "loss_aux_layer_8": 0.0867919921875, "loss_aux_layer_9": 0.0858154296875, "step": 1092, "total_loss": 0.7091643065214157 }, { "epoch": 0.21639279350623639, "grad_norm": 1.376523733139038, "learning_rate": 5e-05, "llm_loss": 0.659725621342659, "loss": 3.122, "loss_aux_layer_0": 0.025634765625, "loss_aux_layer_1": 0.0679931640625, "loss_aux_layer_10": 0.0902099609375, "loss_aux_layer_11": 0.0963134765625, "loss_aux_layer_12": 0.103759765625, "loss_aux_layer_13": 0.11181640625, "loss_aux_layer_14": 0.1240234375, "loss_aux_layer_15": 0.135986328125, "loss_aux_layer_16": 0.147216796875, "loss_aux_layer_17": 0.155517578125, "loss_aux_layer_18": 0.16552734375, "loss_aux_layer_19": 0.167236328125, "loss_aux_layer_2": 0.07568359375, "loss_aux_layer_20": 0.173828125, "loss_aux_layer_21": 0.179931640625, "loss_aux_layer_22": 0.200439453125, "loss_aux_layer_23": 0.241455078125, "loss_aux_layer_3": 0.0877685546875, "loss_aux_layer_4": 0.0909423828125, "loss_aux_layer_5": 0.092529296875, "loss_aux_layer_6": 0.09521484375, "loss_aux_layer_7": 0.09130859375, "loss_aux_layer_8": 0.090087890625, "loss_aux_layer_9": 0.088623046875, "step": 1093, "total_loss": 0.7804897278547287 }, { "epoch": 0.2165907741041378, "grad_norm": 1.629018783569336, "learning_rate": 5e-05, "llm_loss": 0.6338806748390198, "loss": 3.0238, "loss_aux_layer_0": 0.0272216796875, "loss_aux_layer_1": 0.0699462890625, "loss_aux_layer_10": 0.0908203125, "loss_aux_layer_11": 0.0966796875, "loss_aux_layer_12": 0.104248046875, "loss_aux_layer_13": 0.11181640625, "loss_aux_layer_14": 0.124755859375, "loss_aux_layer_15": 0.136962890625, "loss_aux_layer_16": 0.14892578125, "loss_aux_layer_17": 0.1572265625, "loss_aux_layer_18": 0.16650390625, "loss_aux_layer_19": 0.168212890625, "loss_aux_layer_2": 0.0782470703125, "loss_aux_layer_20": 0.17529296875, "loss_aux_layer_21": 0.18115234375, "loss_aux_layer_22": 0.202392578125, "loss_aux_layer_23": 0.243896484375, "loss_aux_layer_3": 0.08984375, "loss_aux_layer_4": 0.093017578125, "loss_aux_layer_5": 0.093994140625, "loss_aux_layer_6": 0.096923828125, "loss_aux_layer_7": 0.092041015625, "loss_aux_layer_8": 0.090576171875, "loss_aux_layer_9": 0.0892333984375, "step": 1094, "total_loss": 0.7559551149606705 }, { "epoch": 0.2167887547020392, "grad_norm": 1.3992102146148682, "learning_rate": 5e-05, "llm_loss": 0.5649345964193344, "loss": 2.7382, "loss_aux_layer_0": 0.02386474609375, "loss_aux_layer_1": 0.06793212890625, "loss_aux_layer_10": 0.0899658203125, "loss_aux_layer_11": 0.0958251953125, "loss_aux_layer_12": 0.1036376953125, "loss_aux_layer_13": 0.1112060546875, "loss_aux_layer_14": 0.1231689453125, "loss_aux_layer_15": 0.13427734375, "loss_aux_layer_16": 0.1455078125, "loss_aux_layer_17": 0.152587890625, "loss_aux_layer_18": 0.160888671875, "loss_aux_layer_19": 0.162109375, "loss_aux_layer_2": 0.0767822265625, "loss_aux_layer_20": 0.168701171875, "loss_aux_layer_21": 0.175537109375, "loss_aux_layer_22": 0.197998046875, "loss_aux_layer_23": 0.239990234375, "loss_aux_layer_3": 0.0888671875, "loss_aux_layer_4": 0.0919189453125, "loss_aux_layer_5": 0.093505859375, "loss_aux_layer_6": 0.0960693359375, "loss_aux_layer_7": 0.091552734375, "loss_aux_layer_8": 0.090087890625, "loss_aux_layer_9": 0.08837890625, "step": 1095, "total_loss": 0.6845587193965912 }, { "epoch": 0.21698673529994061, "grad_norm": 1.6945598125457764, "learning_rate": 5e-05, "llm_loss": 0.6696109473705292, "loss": 3.1579, "loss_aux_layer_0": 0.024932861328125, "loss_aux_layer_1": 0.0675048828125, "loss_aux_layer_10": 0.0897216796875, "loss_aux_layer_11": 0.0953369140625, "loss_aux_layer_12": 0.1029052734375, "loss_aux_layer_13": 0.1104736328125, "loss_aux_layer_14": 0.123291015625, "loss_aux_layer_15": 0.134765625, "loss_aux_layer_16": 0.14599609375, "loss_aux_layer_17": 0.153076171875, "loss_aux_layer_18": 0.162353515625, "loss_aux_layer_19": 0.1640625, "loss_aux_layer_2": 0.0758056640625, "loss_aux_layer_20": 0.1708984375, "loss_aux_layer_21": 0.17822265625, "loss_aux_layer_22": 0.199951171875, "loss_aux_layer_23": 0.241943359375, "loss_aux_layer_3": 0.0877685546875, "loss_aux_layer_4": 0.090576171875, "loss_aux_layer_5": 0.0926513671875, "loss_aux_layer_6": 0.09521484375, "loss_aux_layer_7": 0.0914306640625, "loss_aux_layer_8": 0.089599609375, "loss_aux_layer_9": 0.0880126953125, "step": 1096, "total_loss": 0.7894731909036636 }, { "epoch": 0.21718471589784202, "grad_norm": 1.5334417819976807, "learning_rate": 5e-05, "llm_loss": 0.5491135269403458, "loss": 2.6688, "loss_aux_layer_0": 0.025970458984375, "loss_aux_layer_1": 0.066650390625, "loss_aux_layer_10": 0.0870361328125, "loss_aux_layer_11": 0.092529296875, "loss_aux_layer_12": 0.0997314453125, "loss_aux_layer_13": 0.1072998046875, "loss_aux_layer_14": 0.1192626953125, "loss_aux_layer_15": 0.1307373046875, "loss_aux_layer_16": 0.142333984375, "loss_aux_layer_17": 0.150146484375, "loss_aux_layer_18": 0.16064453125, "loss_aux_layer_19": 0.16357421875, "loss_aux_layer_2": 0.0740966796875, "loss_aux_layer_20": 0.1708984375, "loss_aux_layer_21": 0.178466796875, "loss_aux_layer_22": 0.20068359375, "loss_aux_layer_23": 0.243896484375, "loss_aux_layer_3": 0.0865478515625, "loss_aux_layer_4": 0.089111328125, "loss_aux_layer_5": 0.0908203125, "loss_aux_layer_6": 0.0924072265625, "loss_aux_layer_7": 0.087890625, "loss_aux_layer_8": 0.0869140625, "loss_aux_layer_9": 0.08544921875, "step": 1097, "total_loss": 0.6671900302171707 }, { "epoch": 0.2173826964957434, "grad_norm": 1.2340219020843506, "learning_rate": 5e-05, "llm_loss": 0.6621438190340996, "loss": 3.1272, "loss_aux_layer_0": 0.02459716796875, "loss_aux_layer_1": 0.06817626953125, "loss_aux_layer_10": 0.091064453125, "loss_aux_layer_11": 0.0965576171875, "loss_aux_layer_12": 0.1036376953125, "loss_aux_layer_13": 0.1104736328125, "loss_aux_layer_14": 0.12158203125, "loss_aux_layer_15": 0.13232421875, "loss_aux_layer_16": 0.143310546875, "loss_aux_layer_17": 0.151123046875, "loss_aux_layer_18": 0.160400390625, "loss_aux_layer_19": 0.16259765625, "loss_aux_layer_2": 0.0758056640625, "loss_aux_layer_20": 0.16943359375, "loss_aux_layer_21": 0.176513671875, "loss_aux_layer_22": 0.19970703125, "loss_aux_layer_23": 0.24169921875, "loss_aux_layer_3": 0.0887451171875, "loss_aux_layer_4": 0.091796875, "loss_aux_layer_5": 0.0933837890625, "loss_aux_layer_6": 0.0963134765625, "loss_aux_layer_7": 0.09228515625, "loss_aux_layer_8": 0.0909423828125, "loss_aux_layer_9": 0.08935546875, "step": 1098, "total_loss": 0.7818105518817902 }, { "epoch": 0.21758067709364481, "grad_norm": 1.223228931427002, "learning_rate": 5e-05, "llm_loss": 0.6363938897848129, "loss": 3.0263, "loss_aux_layer_0": 0.026458740234375, "loss_aux_layer_1": 0.06597900390625, "loss_aux_layer_10": 0.09033203125, "loss_aux_layer_11": 0.09619140625, "loss_aux_layer_12": 0.10400390625, "loss_aux_layer_13": 0.1123046875, "loss_aux_layer_14": 0.125244140625, "loss_aux_layer_15": 0.136474609375, "loss_aux_layer_16": 0.147705078125, "loss_aux_layer_17": 0.1552734375, "loss_aux_layer_18": 0.164306640625, "loss_aux_layer_19": 0.165283203125, "loss_aux_layer_2": 0.07421875, "loss_aux_layer_20": 0.171875, "loss_aux_layer_21": 0.178466796875, "loss_aux_layer_22": 0.19970703125, "loss_aux_layer_23": 0.240234375, "loss_aux_layer_3": 0.08642578125, "loss_aux_layer_4": 0.0897216796875, "loss_aux_layer_5": 0.0914306640625, "loss_aux_layer_6": 0.0943603515625, "loss_aux_layer_7": 0.0902099609375, "loss_aux_layer_8": 0.08935546875, "loss_aux_layer_9": 0.0885009765625, "step": 1099, "total_loss": 0.7565648406744003 }, { "epoch": 0.21777865769154622, "grad_norm": 0.8082004189491272, "learning_rate": 5e-05, "llm_loss": 0.5898593813180923, "loss": 2.8334, "loss_aux_layer_0": 0.027008056640625, "loss_aux_layer_1": 0.06787109375, "loss_aux_layer_10": 0.089111328125, "loss_aux_layer_11": 0.0947265625, "loss_aux_layer_12": 0.10205078125, "loss_aux_layer_13": 0.10986328125, "loss_aux_layer_14": 0.1214599609375, "loss_aux_layer_15": 0.1328125, "loss_aux_layer_16": 0.143798828125, "loss_aux_layer_17": 0.151123046875, "loss_aux_layer_18": 0.159423828125, "loss_aux_layer_19": 0.16162109375, "loss_aux_layer_2": 0.0751953125, "loss_aux_layer_20": 0.168212890625, "loss_aux_layer_21": 0.174560546875, "loss_aux_layer_22": 0.197509765625, "loss_aux_layer_23": 0.239013671875, "loss_aux_layer_3": 0.08740234375, "loss_aux_layer_4": 0.0899658203125, "loss_aux_layer_5": 0.091552734375, "loss_aux_layer_6": 0.0943603515625, "loss_aux_layer_7": 0.090087890625, "loss_aux_layer_8": 0.088623046875, "loss_aux_layer_9": 0.0875244140625, "step": 1100, "total_loss": 0.7083595991134644 }, { "epoch": 0.21797663828944763, "grad_norm": 1.2829071283340454, "learning_rate": 5e-05, "llm_loss": 0.7265191376209259, "loss": 3.3807, "loss_aux_layer_0": 0.0245361328125, "loss_aux_layer_1": 0.0684814453125, "loss_aux_layer_10": 0.090576171875, "loss_aux_layer_11": 0.096435546875, "loss_aux_layer_12": 0.103515625, "loss_aux_layer_13": 0.1107177734375, "loss_aux_layer_14": 0.1214599609375, "loss_aux_layer_15": 0.131591796875, "loss_aux_layer_16": 0.14208984375, "loss_aux_layer_17": 0.1494140625, "loss_aux_layer_18": 0.15771484375, "loss_aux_layer_19": 0.158935546875, "loss_aux_layer_2": 0.0775146484375, "loss_aux_layer_20": 0.166015625, "loss_aux_layer_21": 0.17236328125, "loss_aux_layer_22": 0.1943359375, "loss_aux_layer_23": 0.234619140625, "loss_aux_layer_3": 0.0904541015625, "loss_aux_layer_4": 0.0933837890625, "loss_aux_layer_5": 0.09423828125, "loss_aux_layer_6": 0.0968017578125, "loss_aux_layer_7": 0.0926513671875, "loss_aux_layer_8": 0.0909423828125, "loss_aux_layer_9": 0.0892333984375, "step": 1101, "total_loss": 0.8451782912015915 }, { "epoch": 0.21817461888734904, "grad_norm": 1.076915979385376, "learning_rate": 5e-05, "llm_loss": 0.601551428437233, "loss": 2.869, "loss_aux_layer_0": 0.024566650390625, "loss_aux_layer_1": 0.0633544921875, "loss_aux_layer_10": 0.08642578125, "loss_aux_layer_11": 0.0916748046875, "loss_aux_layer_12": 0.09912109375, "loss_aux_layer_13": 0.10693359375, "loss_aux_layer_14": 0.11865234375, "loss_aux_layer_15": 0.1302490234375, "loss_aux_layer_16": 0.140625, "loss_aux_layer_17": 0.14892578125, "loss_aux_layer_18": 0.158203125, "loss_aux_layer_19": 0.160888671875, "loss_aux_layer_2": 0.0701904296875, "loss_aux_layer_20": 0.168212890625, "loss_aux_layer_21": 0.175048828125, "loss_aux_layer_22": 0.196044921875, "loss_aux_layer_23": 0.2373046875, "loss_aux_layer_3": 0.0819091796875, "loss_aux_layer_4": 0.0848388671875, "loss_aux_layer_5": 0.086669921875, "loss_aux_layer_6": 0.0902099609375, "loss_aux_layer_7": 0.0867919921875, "loss_aux_layer_8": 0.085693359375, "loss_aux_layer_9": 0.084716796875, "step": 1102, "total_loss": 0.7172446548938751 }, { "epoch": 0.21837259948525045, "grad_norm": 1.4586505889892578, "learning_rate": 5e-05, "llm_loss": 0.6785108894109726, "loss": 3.1803, "loss_aux_layer_0": 0.024444580078125, "loss_aux_layer_1": 0.0657958984375, "loss_aux_layer_10": 0.088134765625, "loss_aux_layer_11": 0.0933837890625, "loss_aux_layer_12": 0.1007080078125, "loss_aux_layer_13": 0.1077880859375, "loss_aux_layer_14": 0.119140625, "loss_aux_layer_15": 0.1298828125, "loss_aux_layer_16": 0.140380859375, "loss_aux_layer_17": 0.14892578125, "loss_aux_layer_18": 0.157470703125, "loss_aux_layer_19": 0.158935546875, "loss_aux_layer_2": 0.07373046875, "loss_aux_layer_20": 0.166015625, "loss_aux_layer_21": 0.173095703125, "loss_aux_layer_22": 0.19384765625, "loss_aux_layer_23": 0.23486328125, "loss_aux_layer_3": 0.086181640625, "loss_aux_layer_4": 0.0888671875, "loss_aux_layer_5": 0.0902099609375, "loss_aux_layer_6": 0.0928955078125, "loss_aux_layer_7": 0.0887451171875, "loss_aux_layer_8": 0.087890625, "loss_aux_layer_9": 0.086181640625, "step": 1103, "total_loss": 0.7950753420591354 }, { "epoch": 0.21857058008315186, "grad_norm": 2.2384934425354004, "learning_rate": 5e-05, "llm_loss": 0.6048709154129028, "loss": 2.9066, "loss_aux_layer_0": 0.0260009765625, "loss_aux_layer_1": 0.0711669921875, "loss_aux_layer_10": 0.093505859375, "loss_aux_layer_11": 0.0992431640625, "loss_aux_layer_12": 0.1064453125, "loss_aux_layer_13": 0.1142578125, "loss_aux_layer_14": 0.125732421875, "loss_aux_layer_15": 0.13671875, "loss_aux_layer_16": 0.147216796875, "loss_aux_layer_17": 0.15380859375, "loss_aux_layer_18": 0.16259765625, "loss_aux_layer_19": 0.162109375, "loss_aux_layer_2": 0.080322265625, "loss_aux_layer_20": 0.16796875, "loss_aux_layer_21": 0.174560546875, "loss_aux_layer_22": 0.196533203125, "loss_aux_layer_23": 0.2373046875, "loss_aux_layer_3": 0.0936279296875, "loss_aux_layer_4": 0.09619140625, "loss_aux_layer_5": 0.0972900390625, "loss_aux_layer_6": 0.0997314453125, "loss_aux_layer_7": 0.09521484375, "loss_aux_layer_8": 0.0936279296875, "loss_aux_layer_9": 0.0919189453125, "step": 1104, "total_loss": 0.7266519367694855 }, { "epoch": 0.21876856068105327, "grad_norm": 1.7589117288589478, "learning_rate": 5e-05, "llm_loss": 0.6430857628583908, "loss": 3.0291, "loss_aux_layer_0": 0.024200439453125, "loss_aux_layer_1": 0.06072998046875, "loss_aux_layer_10": 0.084716796875, "loss_aux_layer_11": 0.0902099609375, "loss_aux_layer_12": 0.09765625, "loss_aux_layer_13": 0.1051025390625, "loss_aux_layer_14": 0.1170654296875, "loss_aux_layer_15": 0.1285400390625, "loss_aux_layer_16": 0.1396484375, "loss_aux_layer_17": 0.148193359375, "loss_aux_layer_18": 0.157958984375, "loss_aux_layer_19": 0.159423828125, "loss_aux_layer_2": 0.06982421875, "loss_aux_layer_20": 0.16650390625, "loss_aux_layer_21": 0.17236328125, "loss_aux_layer_22": 0.1923828125, "loss_aux_layer_23": 0.233154296875, "loss_aux_layer_3": 0.081787109375, "loss_aux_layer_4": 0.0845947265625, "loss_aux_layer_5": 0.086181640625, "loss_aux_layer_6": 0.0888671875, "loss_aux_layer_7": 0.0848388671875, "loss_aux_layer_8": 0.083740234375, "loss_aux_layer_9": 0.082763671875, "step": 1105, "total_loss": 0.7572803199291229 }, { "epoch": 0.21896654127895465, "grad_norm": 1.8925095796585083, "learning_rate": 5e-05, "llm_loss": 0.6414453238248825, "loss": 3.0497, "loss_aux_layer_0": 0.0283203125, "loss_aux_layer_1": 0.069580078125, "loss_aux_layer_10": 0.0902099609375, "loss_aux_layer_11": 0.0955810546875, "loss_aux_layer_12": 0.1031494140625, "loss_aux_layer_13": 0.1107177734375, "loss_aux_layer_14": 0.123779296875, "loss_aux_layer_15": 0.1357421875, "loss_aux_layer_16": 0.147705078125, "loss_aux_layer_17": 0.1552734375, "loss_aux_layer_18": 0.1650390625, "loss_aux_layer_19": 0.16650390625, "loss_aux_layer_2": 0.0782470703125, "loss_aux_layer_20": 0.1728515625, "loss_aux_layer_21": 0.178466796875, "loss_aux_layer_22": 0.198974609375, "loss_aux_layer_23": 0.240478515625, "loss_aux_layer_3": 0.09033203125, "loss_aux_layer_4": 0.09228515625, "loss_aux_layer_5": 0.0933837890625, "loss_aux_layer_6": 0.095947265625, "loss_aux_layer_7": 0.0911865234375, "loss_aux_layer_8": 0.0899658203125, "loss_aux_layer_9": 0.088623046875, "step": 1106, "total_loss": 0.7624198496341705 }, { "epoch": 0.21916452187685606, "grad_norm": 1.5980606079101562, "learning_rate": 5e-05, "llm_loss": 0.6387901604175568, "loss": 3.0403, "loss_aux_layer_0": 0.025360107421875, "loss_aux_layer_1": 0.067626953125, "loss_aux_layer_10": 0.0921630859375, "loss_aux_layer_11": 0.098388671875, "loss_aux_layer_12": 0.10595703125, "loss_aux_layer_13": 0.1136474609375, "loss_aux_layer_14": 0.125732421875, "loss_aux_layer_15": 0.136474609375, "loss_aux_layer_16": 0.147216796875, "loss_aux_layer_17": 0.15478515625, "loss_aux_layer_18": 0.16259765625, "loss_aux_layer_19": 0.163818359375, "loss_aux_layer_2": 0.07666015625, "loss_aux_layer_20": 0.169921875, "loss_aux_layer_21": 0.176513671875, "loss_aux_layer_22": 0.198974609375, "loss_aux_layer_23": 0.24072265625, "loss_aux_layer_3": 0.090087890625, "loss_aux_layer_4": 0.093017578125, "loss_aux_layer_5": 0.09521484375, "loss_aux_layer_6": 0.09814453125, "loss_aux_layer_7": 0.093994140625, "loss_aux_layer_8": 0.0924072265625, "loss_aux_layer_9": 0.091064453125, "step": 1107, "total_loss": 0.7600844949483871 }, { "epoch": 0.21936250247475747, "grad_norm": 1.371624231338501, "learning_rate": 5e-05, "llm_loss": 0.6648349463939667, "loss": 3.125, "loss_aux_layer_0": 0.024383544921875, "loss_aux_layer_1": 0.0650634765625, "loss_aux_layer_10": 0.0875244140625, "loss_aux_layer_11": 0.0931396484375, "loss_aux_layer_12": 0.1009521484375, "loss_aux_layer_13": 0.108642578125, "loss_aux_layer_14": 0.1201171875, "loss_aux_layer_15": 0.131103515625, "loss_aux_layer_16": 0.142578125, "loss_aux_layer_17": 0.1494140625, "loss_aux_layer_18": 0.158447265625, "loss_aux_layer_19": 0.16064453125, "loss_aux_layer_2": 0.0716552734375, "loss_aux_layer_20": 0.167724609375, "loss_aux_layer_21": 0.174072265625, "loss_aux_layer_22": 0.194091796875, "loss_aux_layer_23": 0.235595703125, "loss_aux_layer_3": 0.0833740234375, "loss_aux_layer_4": 0.086181640625, "loss_aux_layer_5": 0.088134765625, "loss_aux_layer_6": 0.09130859375, "loss_aux_layer_7": 0.0875244140625, "loss_aux_layer_8": 0.0865478515625, "loss_aux_layer_9": 0.0858154296875, "step": 1108, "total_loss": 0.7812396287918091 }, { "epoch": 0.21956048307265888, "grad_norm": 1.6736054420471191, "learning_rate": 5e-05, "llm_loss": 0.6432134807109833, "loss": 3.0423, "loss_aux_layer_0": 0.024566650390625, "loss_aux_layer_1": 0.067138671875, "loss_aux_layer_10": 0.0877685546875, "loss_aux_layer_11": 0.0933837890625, "loss_aux_layer_12": 0.1007080078125, "loss_aux_layer_13": 0.10791015625, "loss_aux_layer_14": 0.119384765625, "loss_aux_layer_15": 0.130126953125, "loss_aux_layer_16": 0.140625, "loss_aux_layer_17": 0.14794921875, "loss_aux_layer_18": 0.1572265625, "loss_aux_layer_19": 0.159912109375, "loss_aux_layer_2": 0.07470703125, "loss_aux_layer_20": 0.16748046875, "loss_aux_layer_21": 0.175048828125, "loss_aux_layer_22": 0.1982421875, "loss_aux_layer_23": 0.240478515625, "loss_aux_layer_3": 0.0870361328125, "loss_aux_layer_4": 0.0892333984375, "loss_aux_layer_5": 0.0911865234375, "loss_aux_layer_6": 0.0936279296875, "loss_aux_layer_7": 0.0889892578125, "loss_aux_layer_8": 0.08740234375, "loss_aux_layer_9": 0.08642578125, "step": 1109, "total_loss": 0.7605843991041183 }, { "epoch": 0.2197584636705603, "grad_norm": 1.2169737815856934, "learning_rate": 5e-05, "llm_loss": 0.650787815451622, "loss": 3.0884, "loss_aux_layer_0": 0.025970458984375, "loss_aux_layer_1": 0.068359375, "loss_aux_layer_10": 0.0914306640625, "loss_aux_layer_11": 0.0975341796875, "loss_aux_layer_12": 0.1048583984375, "loss_aux_layer_13": 0.1126708984375, "loss_aux_layer_14": 0.12451171875, "loss_aux_layer_15": 0.13623046875, "loss_aux_layer_16": 0.14794921875, "loss_aux_layer_17": 0.15625, "loss_aux_layer_18": 0.164306640625, "loss_aux_layer_19": 0.164794921875, "loss_aux_layer_2": 0.076904296875, "loss_aux_layer_20": 0.1708984375, "loss_aux_layer_21": 0.177978515625, "loss_aux_layer_22": 0.199951171875, "loss_aux_layer_23": 0.24169921875, "loss_aux_layer_3": 0.0897216796875, "loss_aux_layer_4": 0.0926513671875, "loss_aux_layer_5": 0.0941162109375, "loss_aux_layer_6": 0.09716796875, "loss_aux_layer_7": 0.093017578125, "loss_aux_layer_8": 0.0916748046875, "loss_aux_layer_9": 0.090087890625, "step": 1110, "total_loss": 0.7720932960510254 }, { "epoch": 0.2199564442684617, "grad_norm": 1.8005552291870117, "learning_rate": 5e-05, "llm_loss": 0.6473695784807205, "loss": 3.0759, "loss_aux_layer_0": 0.025726318359375, "loss_aux_layer_1": 0.0694580078125, "loss_aux_layer_10": 0.0926513671875, "loss_aux_layer_11": 0.09814453125, "loss_aux_layer_12": 0.10546875, "loss_aux_layer_13": 0.1124267578125, "loss_aux_layer_14": 0.1240234375, "loss_aux_layer_15": 0.135009765625, "loss_aux_layer_16": 0.14599609375, "loss_aux_layer_17": 0.153564453125, "loss_aux_layer_18": 0.16259765625, "loss_aux_layer_19": 0.1640625, "loss_aux_layer_2": 0.0775146484375, "loss_aux_layer_20": 0.1708984375, "loss_aux_layer_21": 0.177978515625, "loss_aux_layer_22": 0.20166015625, "loss_aux_layer_23": 0.2421875, "loss_aux_layer_3": 0.0908203125, "loss_aux_layer_4": 0.0938720703125, "loss_aux_layer_5": 0.095947265625, "loss_aux_layer_6": 0.098876953125, "loss_aux_layer_7": 0.094970703125, "loss_aux_layer_8": 0.09326171875, "loss_aux_layer_9": 0.09130859375, "step": 1111, "total_loss": 0.7689656317234039 }, { "epoch": 0.2201544248663631, "grad_norm": 1.087158203125, "learning_rate": 5e-05, "llm_loss": 0.6107042729854584, "loss": 2.913, "loss_aux_layer_0": 0.026611328125, "loss_aux_layer_1": 0.066650390625, "loss_aux_layer_10": 0.08740234375, "loss_aux_layer_11": 0.093017578125, "loss_aux_layer_12": 0.1007080078125, "loss_aux_layer_13": 0.1077880859375, "loss_aux_layer_14": 0.120361328125, "loss_aux_layer_15": 0.13232421875, "loss_aux_layer_16": 0.1435546875, "loss_aux_layer_17": 0.1513671875, "loss_aux_layer_18": 0.160400390625, "loss_aux_layer_19": 0.161865234375, "loss_aux_layer_2": 0.0726318359375, "loss_aux_layer_20": 0.168701171875, "loss_aux_layer_21": 0.17529296875, "loss_aux_layer_22": 0.197265625, "loss_aux_layer_23": 0.238525390625, "loss_aux_layer_3": 0.085205078125, "loss_aux_layer_4": 0.0880126953125, "loss_aux_layer_5": 0.0897216796875, "loss_aux_layer_6": 0.0928955078125, "loss_aux_layer_7": 0.088623046875, "loss_aux_layer_8": 0.0872802734375, "loss_aux_layer_9": 0.0858154296875, "step": 1112, "total_loss": 0.7282428741455078 }, { "epoch": 0.2203524054642645, "grad_norm": 1.84640371799469, "learning_rate": 5e-05, "llm_loss": 0.6428373008966446, "loss": 3.0287, "loss_aux_layer_0": 0.02581787109375, "loss_aux_layer_1": 0.06268310546875, "loss_aux_layer_10": 0.0830078125, "loss_aux_layer_11": 0.0882568359375, "loss_aux_layer_12": 0.0955810546875, "loss_aux_layer_13": 0.1033935546875, "loss_aux_layer_14": 0.1165771484375, "loss_aux_layer_15": 0.129150390625, "loss_aux_layer_16": 0.141845703125, "loss_aux_layer_17": 0.150146484375, "loss_aux_layer_18": 0.16015625, "loss_aux_layer_19": 0.162841796875, "loss_aux_layer_2": 0.0687255859375, "loss_aux_layer_20": 0.170166015625, "loss_aux_layer_21": 0.175537109375, "loss_aux_layer_22": 0.1943359375, "loss_aux_layer_23": 0.235107421875, "loss_aux_layer_3": 0.080078125, "loss_aux_layer_4": 0.08251953125, "loss_aux_layer_5": 0.0843505859375, "loss_aux_layer_6": 0.0872802734375, "loss_aux_layer_7": 0.083251953125, "loss_aux_layer_8": 0.08203125, "loss_aux_layer_9": 0.081298828125, "step": 1113, "total_loss": 0.7571866661310196 }, { "epoch": 0.2205503860621659, "grad_norm": 2.278855562210083, "learning_rate": 5e-05, "llm_loss": 0.6500896215438843, "loss": 3.0786, "loss_aux_layer_0": 0.025115966796875, "loss_aux_layer_1": 0.06658935546875, "loss_aux_layer_10": 0.08984375, "loss_aux_layer_11": 0.095703125, "loss_aux_layer_12": 0.103271484375, "loss_aux_layer_13": 0.1107177734375, "loss_aux_layer_14": 0.1229248046875, "loss_aux_layer_15": 0.134521484375, "loss_aux_layer_16": 0.145263671875, "loss_aux_layer_17": 0.15283203125, "loss_aux_layer_18": 0.162109375, "loss_aux_layer_19": 0.163818359375, "loss_aux_layer_2": 0.0745849609375, "loss_aux_layer_20": 0.170654296875, "loss_aux_layer_21": 0.177734375, "loss_aux_layer_22": 0.199462890625, "loss_aux_layer_23": 0.24267578125, "loss_aux_layer_3": 0.0872802734375, "loss_aux_layer_4": 0.0899658203125, "loss_aux_layer_5": 0.091796875, "loss_aux_layer_6": 0.0948486328125, "loss_aux_layer_7": 0.0904541015625, "loss_aux_layer_8": 0.0897216796875, "loss_aux_layer_9": 0.08837890625, "step": 1114, "total_loss": 0.769659548997879 }, { "epoch": 0.2207483666600673, "grad_norm": 1.4552333354949951, "learning_rate": 5e-05, "llm_loss": 0.6431630551815033, "loss": 3.0271, "loss_aux_layer_0": 0.024993896484375, "loss_aux_layer_1": 0.0625, "loss_aux_layer_10": 0.0836181640625, "loss_aux_layer_11": 0.0887451171875, "loss_aux_layer_12": 0.095947265625, "loss_aux_layer_13": 0.103759765625, "loss_aux_layer_14": 0.11572265625, "loss_aux_layer_15": 0.127685546875, "loss_aux_layer_16": 0.138671875, "loss_aux_layer_17": 0.146240234375, "loss_aux_layer_18": 0.1552734375, "loss_aux_layer_19": 0.15771484375, "loss_aux_layer_2": 0.06903076171875, "loss_aux_layer_20": 0.1650390625, "loss_aux_layer_21": 0.171630859375, "loss_aux_layer_22": 0.192138671875, "loss_aux_layer_23": 0.2333984375, "loss_aux_layer_3": 0.081787109375, "loss_aux_layer_4": 0.08447265625, "loss_aux_layer_5": 0.0865478515625, "loss_aux_layer_6": 0.0894775390625, "loss_aux_layer_7": 0.08544921875, "loss_aux_layer_8": 0.0838623046875, "loss_aux_layer_9": 0.0823974609375, "step": 1115, "total_loss": 0.7567655146121979 }, { "epoch": 0.22094634725796872, "grad_norm": 1.6995059251785278, "learning_rate": 5e-05, "llm_loss": 0.5951076000928879, "loss": 2.8633, "loss_aux_layer_0": 0.02642822265625, "loss_aux_layer_1": 0.06890869140625, "loss_aux_layer_10": 0.0914306640625, "loss_aux_layer_11": 0.0972900390625, "loss_aux_layer_12": 0.1048583984375, "loss_aux_layer_13": 0.1126708984375, "loss_aux_layer_14": 0.1241455078125, "loss_aux_layer_15": 0.13525390625, "loss_aux_layer_16": 0.146484375, "loss_aux_layer_17": 0.15283203125, "loss_aux_layer_18": 0.161865234375, "loss_aux_layer_19": 0.1630859375, "loss_aux_layer_2": 0.0770263671875, "loss_aux_layer_20": 0.169677734375, "loss_aux_layer_21": 0.1767578125, "loss_aux_layer_22": 0.199462890625, "loss_aux_layer_23": 0.240234375, "loss_aux_layer_3": 0.08984375, "loss_aux_layer_4": 0.0927734375, "loss_aux_layer_5": 0.0947265625, "loss_aux_layer_6": 0.0972900390625, "loss_aux_layer_7": 0.0928955078125, "loss_aux_layer_8": 0.091552734375, "loss_aux_layer_9": 0.08984375, "step": 1116, "total_loss": 0.7158252894878387 }, { "epoch": 0.22114432785587013, "grad_norm": 1.6814934015274048, "learning_rate": 5e-05, "llm_loss": 0.6626685112714767, "loss": 3.1246, "loss_aux_layer_0": 0.02581787109375, "loss_aux_layer_1": 0.0660400390625, "loss_aux_layer_10": 0.0885009765625, "loss_aux_layer_11": 0.09423828125, "loss_aux_layer_12": 0.1016845703125, "loss_aux_layer_13": 0.109619140625, "loss_aux_layer_14": 0.121826171875, "loss_aux_layer_15": 0.133056640625, "loss_aux_layer_16": 0.14453125, "loss_aux_layer_17": 0.152587890625, "loss_aux_layer_18": 0.16162109375, "loss_aux_layer_19": 0.16357421875, "loss_aux_layer_2": 0.07470703125, "loss_aux_layer_20": 0.170166015625, "loss_aux_layer_21": 0.176025390625, "loss_aux_layer_22": 0.197265625, "loss_aux_layer_23": 0.23828125, "loss_aux_layer_3": 0.0865478515625, "loss_aux_layer_4": 0.0888671875, "loss_aux_layer_5": 0.0904541015625, "loss_aux_layer_6": 0.0933837890625, "loss_aux_layer_7": 0.089111328125, "loss_aux_layer_8": 0.088134765625, "loss_aux_layer_9": 0.0870361328125, "step": 1117, "total_loss": 0.7811579555273056 }, { "epoch": 0.22134230845377154, "grad_norm": 1.4881556034088135, "learning_rate": 5e-05, "llm_loss": 0.6715016961097717, "loss": 3.1606, "loss_aux_layer_0": 0.023956298828125, "loss_aux_layer_1": 0.0673828125, "loss_aux_layer_10": 0.089599609375, "loss_aux_layer_11": 0.0960693359375, "loss_aux_layer_12": 0.1036376953125, "loss_aux_layer_13": 0.111328125, "loss_aux_layer_14": 0.1224365234375, "loss_aux_layer_15": 0.1328125, "loss_aux_layer_16": 0.143310546875, "loss_aux_layer_17": 0.150634765625, "loss_aux_layer_18": 0.159423828125, "loss_aux_layer_19": 0.1611328125, "loss_aux_layer_2": 0.075927734375, "loss_aux_layer_20": 0.16748046875, "loss_aux_layer_21": 0.173583984375, "loss_aux_layer_22": 0.1953125, "loss_aux_layer_23": 0.236572265625, "loss_aux_layer_3": 0.0885009765625, "loss_aux_layer_4": 0.09130859375, "loss_aux_layer_5": 0.0927734375, "loss_aux_layer_6": 0.095458984375, "loss_aux_layer_7": 0.091064453125, "loss_aux_layer_8": 0.0894775390625, "loss_aux_layer_9": 0.087890625, "step": 1118, "total_loss": 0.7901618182659149 }, { "epoch": 0.22154028905167295, "grad_norm": 1.4767835140228271, "learning_rate": 5e-05, "llm_loss": 0.5836783200502396, "loss": 2.805, "loss_aux_layer_0": 0.024932861328125, "loss_aux_layer_1": 0.0645751953125, "loss_aux_layer_10": 0.0880126953125, "loss_aux_layer_11": 0.0938720703125, "loss_aux_layer_12": 0.1009521484375, "loss_aux_layer_13": 0.108642578125, "loss_aux_layer_14": 0.1207275390625, "loss_aux_layer_15": 0.131591796875, "loss_aux_layer_16": 0.1435546875, "loss_aux_layer_17": 0.15087890625, "loss_aux_layer_18": 0.15966796875, "loss_aux_layer_19": 0.161865234375, "loss_aux_layer_2": 0.072998046875, "loss_aux_layer_20": 0.16845703125, "loss_aux_layer_21": 0.17578125, "loss_aux_layer_22": 0.197998046875, "loss_aux_layer_23": 0.23876953125, "loss_aux_layer_3": 0.0850830078125, "loss_aux_layer_4": 0.0880126953125, "loss_aux_layer_5": 0.08984375, "loss_aux_layer_6": 0.0928955078125, "loss_aux_layer_7": 0.0887451171875, "loss_aux_layer_8": 0.087646484375, "loss_aux_layer_9": 0.08642578125, "step": 1119, "total_loss": 0.7012388557195663 }, { "epoch": 0.22173826964957435, "grad_norm": 1.2217140197753906, "learning_rate": 5e-05, "llm_loss": 0.6760660633444786, "loss": 3.1719, "loss_aux_layer_0": 0.02587890625, "loss_aux_layer_1": 0.06298828125, "loss_aux_layer_10": 0.086669921875, "loss_aux_layer_11": 0.0921630859375, "loss_aux_layer_12": 0.0997314453125, "loss_aux_layer_13": 0.1077880859375, "loss_aux_layer_14": 0.1202392578125, "loss_aux_layer_15": 0.1318359375, "loss_aux_layer_16": 0.142822265625, "loss_aux_layer_17": 0.150634765625, "loss_aux_layer_18": 0.16015625, "loss_aux_layer_19": 0.16162109375, "loss_aux_layer_2": 0.0721435546875, "loss_aux_layer_20": 0.168701171875, "loss_aux_layer_21": 0.17626953125, "loss_aux_layer_22": 0.19873046875, "loss_aux_layer_23": 0.2412109375, "loss_aux_layer_3": 0.083984375, "loss_aux_layer_4": 0.0863037109375, "loss_aux_layer_5": 0.0880126953125, "loss_aux_layer_6": 0.0906982421875, "loss_aux_layer_7": 0.0869140625, "loss_aux_layer_8": 0.0859375, "loss_aux_layer_9": 0.0850830078125, "step": 1120, "total_loss": 0.7929863780736923 }, { "epoch": 0.22193625024747574, "grad_norm": 1.2753798961639404, "learning_rate": 5e-05, "llm_loss": 0.6542513370513916, "loss": 3.0813, "loss_aux_layer_0": 0.02569580078125, "loss_aux_layer_1": 0.063232421875, "loss_aux_layer_10": 0.0865478515625, "loss_aux_layer_11": 0.0919189453125, "loss_aux_layer_12": 0.099365234375, "loss_aux_layer_13": 0.10693359375, "loss_aux_layer_14": 0.1185302734375, "loss_aux_layer_15": 0.129638671875, "loss_aux_layer_16": 0.140869140625, "loss_aux_layer_17": 0.149169921875, "loss_aux_layer_18": 0.158935546875, "loss_aux_layer_19": 0.1611328125, "loss_aux_layer_2": 0.07080078125, "loss_aux_layer_20": 0.16796875, "loss_aux_layer_21": 0.17431640625, "loss_aux_layer_22": 0.195068359375, "loss_aux_layer_23": 0.2373046875, "loss_aux_layer_3": 0.083984375, "loss_aux_layer_4": 0.086669921875, "loss_aux_layer_5": 0.088134765625, "loss_aux_layer_6": 0.0911865234375, "loss_aux_layer_7": 0.087646484375, "loss_aux_layer_8": 0.086181640625, "loss_aux_layer_9": 0.0850830078125, "step": 1121, "total_loss": 0.7703186124563217 }, { "epoch": 0.22213423084537715, "grad_norm": 1.2724789381027222, "learning_rate": 5e-05, "llm_loss": 0.6208052039146423, "loss": 2.9618, "loss_aux_layer_0": 0.02435302734375, "loss_aux_layer_1": 0.067138671875, "loss_aux_layer_10": 0.091796875, "loss_aux_layer_11": 0.0975341796875, "loss_aux_layer_12": 0.10498046875, "loss_aux_layer_13": 0.1126708984375, "loss_aux_layer_14": 0.1241455078125, "loss_aux_layer_15": 0.135009765625, "loss_aux_layer_16": 0.1455078125, "loss_aux_layer_17": 0.15283203125, "loss_aux_layer_18": 0.1611328125, "loss_aux_layer_19": 0.161376953125, "loss_aux_layer_2": 0.07568359375, "loss_aux_layer_20": 0.16845703125, "loss_aux_layer_21": 0.17431640625, "loss_aux_layer_22": 0.195068359375, "loss_aux_layer_23": 0.23681640625, "loss_aux_layer_3": 0.08837890625, "loss_aux_layer_4": 0.0916748046875, "loss_aux_layer_5": 0.0931396484375, "loss_aux_layer_6": 0.0960693359375, "loss_aux_layer_7": 0.09228515625, "loss_aux_layer_8": 0.0914306640625, "loss_aux_layer_9": 0.09033203125, "step": 1122, "total_loss": 0.740453839302063 }, { "epoch": 0.22233221144327855, "grad_norm": 1.2448264360427856, "learning_rate": 5e-05, "llm_loss": 0.6362779438495636, "loss": 3.0081, "loss_aux_layer_0": 0.02691650390625, "loss_aux_layer_1": 0.06634521484375, "loss_aux_layer_10": 0.08642578125, "loss_aux_layer_11": 0.092041015625, "loss_aux_layer_12": 0.0989990234375, "loss_aux_layer_13": 0.1060791015625, "loss_aux_layer_14": 0.1173095703125, "loss_aux_layer_15": 0.12841796875, "loss_aux_layer_16": 0.1396484375, "loss_aux_layer_17": 0.1474609375, "loss_aux_layer_18": 0.15673828125, "loss_aux_layer_19": 0.159423828125, "loss_aux_layer_2": 0.07275390625, "loss_aux_layer_20": 0.16650390625, "loss_aux_layer_21": 0.171875, "loss_aux_layer_22": 0.19287109375, "loss_aux_layer_23": 0.232666015625, "loss_aux_layer_3": 0.08544921875, "loss_aux_layer_4": 0.088134765625, "loss_aux_layer_5": 0.08984375, "loss_aux_layer_6": 0.0921630859375, "loss_aux_layer_7": 0.088134765625, "loss_aux_layer_8": 0.0865478515625, "loss_aux_layer_9": 0.0850830078125, "step": 1123, "total_loss": 0.7520187199115753 }, { "epoch": 0.22253019204117996, "grad_norm": 1.1012864112854004, "learning_rate": 5e-05, "llm_loss": 0.5934167802333832, "loss": 2.8331, "loss_aux_layer_0": 0.025146484375, "loss_aux_layer_1": 0.0654296875, "loss_aux_layer_10": 0.0860595703125, "loss_aux_layer_11": 0.09130859375, "loss_aux_layer_12": 0.09814453125, "loss_aux_layer_13": 0.104736328125, "loss_aux_layer_14": 0.115966796875, "loss_aux_layer_15": 0.1263427734375, "loss_aux_layer_16": 0.13720703125, "loss_aux_layer_17": 0.14501953125, "loss_aux_layer_18": 0.15380859375, "loss_aux_layer_19": 0.15673828125, "loss_aux_layer_2": 0.072509765625, "loss_aux_layer_20": 0.164306640625, "loss_aux_layer_21": 0.171630859375, "loss_aux_layer_22": 0.1943359375, "loss_aux_layer_23": 0.236083984375, "loss_aux_layer_3": 0.0850830078125, "loss_aux_layer_4": 0.087646484375, "loss_aux_layer_5": 0.089111328125, "loss_aux_layer_6": 0.091796875, "loss_aux_layer_7": 0.0877685546875, "loss_aux_layer_8": 0.0859375, "loss_aux_layer_9": 0.084716796875, "step": 1124, "total_loss": 0.7082827240228653 }, { "epoch": 0.22272817263908137, "grad_norm": 0.8587628602981567, "learning_rate": 5e-05, "llm_loss": 0.5460153669118881, "loss": 2.6509, "loss_aux_layer_0": 0.02642822265625, "loss_aux_layer_1": 0.0667724609375, "loss_aux_layer_10": 0.0870361328125, "loss_aux_layer_11": 0.092529296875, "loss_aux_layer_12": 0.0999755859375, "loss_aux_layer_13": 0.1072998046875, "loss_aux_layer_14": 0.1187744140625, "loss_aux_layer_15": 0.130126953125, "loss_aux_layer_16": 0.140380859375, "loss_aux_layer_17": 0.148193359375, "loss_aux_layer_18": 0.157958984375, "loss_aux_layer_19": 0.16015625, "loss_aux_layer_2": 0.073486328125, "loss_aux_layer_20": 0.166748046875, "loss_aux_layer_21": 0.17431640625, "loss_aux_layer_22": 0.194580078125, "loss_aux_layer_23": 0.2353515625, "loss_aux_layer_3": 0.086181640625, "loss_aux_layer_4": 0.0888671875, "loss_aux_layer_5": 0.090087890625, "loss_aux_layer_6": 0.0931396484375, "loss_aux_layer_7": 0.0887451171875, "loss_aux_layer_8": 0.08740234375, "loss_aux_layer_9": 0.0859375, "step": 1125, "total_loss": 0.6627233177423477 }, { "epoch": 0.22292615323698278, "grad_norm": 1.2266381978988647, "learning_rate": 5e-05, "llm_loss": 0.6761036217212677, "loss": 3.1672, "loss_aux_layer_0": 0.0257568359375, "loss_aux_layer_1": 0.0645751953125, "loss_aux_layer_10": 0.0872802734375, "loss_aux_layer_11": 0.0927734375, "loss_aux_layer_12": 0.1002197265625, "loss_aux_layer_13": 0.1075439453125, "loss_aux_layer_14": 0.118896484375, "loss_aux_layer_15": 0.130126953125, "loss_aux_layer_16": 0.14111328125, "loss_aux_layer_17": 0.1484375, "loss_aux_layer_18": 0.157470703125, "loss_aux_layer_19": 0.1591796875, "loss_aux_layer_2": 0.072021484375, "loss_aux_layer_20": 0.166015625, "loss_aux_layer_21": 0.171142578125, "loss_aux_layer_22": 0.19091796875, "loss_aux_layer_23": 0.230224609375, "loss_aux_layer_3": 0.08447265625, "loss_aux_layer_4": 0.0872802734375, "loss_aux_layer_5": 0.0889892578125, "loss_aux_layer_6": 0.092041015625, "loss_aux_layer_7": 0.0880126953125, "loss_aux_layer_8": 0.086669921875, "loss_aux_layer_9": 0.08544921875, "step": 1126, "total_loss": 0.7918014973402023 }, { "epoch": 0.2231241338348842, "grad_norm": 1.0161317586898804, "learning_rate": 5e-05, "llm_loss": 0.5772299766540527, "loss": 2.7952, "loss_aux_layer_0": 0.026519775390625, "loss_aux_layer_1": 0.0679931640625, "loss_aux_layer_10": 0.091796875, "loss_aux_layer_11": 0.0977783203125, "loss_aux_layer_12": 0.10498046875, "loss_aux_layer_13": 0.112548828125, "loss_aux_layer_14": 0.1246337890625, "loss_aux_layer_15": 0.13671875, "loss_aux_layer_16": 0.1474609375, "loss_aux_layer_17": 0.155029296875, "loss_aux_layer_18": 0.16357421875, "loss_aux_layer_19": 0.164794921875, "loss_aux_layer_2": 0.0777587890625, "loss_aux_layer_20": 0.171142578125, "loss_aux_layer_21": 0.178955078125, "loss_aux_layer_22": 0.200927734375, "loss_aux_layer_23": 0.24365234375, "loss_aux_layer_3": 0.090087890625, "loss_aux_layer_4": 0.0928955078125, "loss_aux_layer_5": 0.0948486328125, "loss_aux_layer_6": 0.0977783203125, "loss_aux_layer_7": 0.0933837890625, "loss_aux_layer_8": 0.0921630859375, "loss_aux_layer_9": 0.0906982421875, "step": 1127, "total_loss": 0.6988045126199722 }, { "epoch": 0.22332211443278557, "grad_norm": 0.8648751974105835, "learning_rate": 5e-05, "llm_loss": 0.6013593226671219, "loss": 2.8885, "loss_aux_layer_0": 0.026092529296875, "loss_aux_layer_1": 0.068115234375, "loss_aux_layer_10": 0.091796875, "loss_aux_layer_11": 0.09765625, "loss_aux_layer_12": 0.1058349609375, "loss_aux_layer_13": 0.11328125, "loss_aux_layer_14": 0.1256103515625, "loss_aux_layer_15": 0.13671875, "loss_aux_layer_16": 0.147216796875, "loss_aux_layer_17": 0.154296875, "loss_aux_layer_18": 0.162353515625, "loss_aux_layer_19": 0.163818359375, "loss_aux_layer_2": 0.0762939453125, "loss_aux_layer_20": 0.169921875, "loss_aux_layer_21": 0.17578125, "loss_aux_layer_22": 0.1962890625, "loss_aux_layer_23": 0.2373046875, "loss_aux_layer_3": 0.0897216796875, "loss_aux_layer_4": 0.093017578125, "loss_aux_layer_5": 0.094970703125, "loss_aux_layer_6": 0.0977783203125, "loss_aux_layer_7": 0.09326171875, "loss_aux_layer_8": 0.0916748046875, "loss_aux_layer_9": 0.090087890625, "step": 1128, "total_loss": 0.7221197932958603 }, { "epoch": 0.22352009503068698, "grad_norm": 1.0203652381896973, "learning_rate": 5e-05, "llm_loss": 0.6570687144994736, "loss": 3.0983, "loss_aux_layer_0": 0.023529052734375, "loss_aux_layer_1": 0.06500244140625, "loss_aux_layer_10": 0.08984375, "loss_aux_layer_11": 0.095458984375, "loss_aux_layer_12": 0.1024169921875, "loss_aux_layer_13": 0.109619140625, "loss_aux_layer_14": 0.1204833984375, "loss_aux_layer_15": 0.1309814453125, "loss_aux_layer_16": 0.141357421875, "loss_aux_layer_17": 0.14892578125, "loss_aux_layer_18": 0.157470703125, "loss_aux_layer_19": 0.158447265625, "loss_aux_layer_2": 0.0745849609375, "loss_aux_layer_20": 0.165283203125, "loss_aux_layer_21": 0.171630859375, "loss_aux_layer_22": 0.1943359375, "loss_aux_layer_23": 0.23583984375, "loss_aux_layer_3": 0.0880126953125, "loss_aux_layer_4": 0.0908203125, "loss_aux_layer_5": 0.0924072265625, "loss_aux_layer_6": 0.094970703125, "loss_aux_layer_7": 0.0908203125, "loss_aux_layer_8": 0.089599609375, "loss_aux_layer_9": 0.08837890625, "step": 1129, "total_loss": 0.7745648920536041 }, { "epoch": 0.2237180756285884, "grad_norm": 0.8847259283065796, "learning_rate": 5e-05, "llm_loss": 0.5546069070696831, "loss": 2.6882, "loss_aux_layer_0": 0.028564453125, "loss_aux_layer_1": 0.067138671875, "loss_aux_layer_10": 0.088623046875, "loss_aux_layer_11": 0.0938720703125, "loss_aux_layer_12": 0.10107421875, "loss_aux_layer_13": 0.108642578125, "loss_aux_layer_14": 0.12060546875, "loss_aux_layer_15": 0.1312255859375, "loss_aux_layer_16": 0.142578125, "loss_aux_layer_17": 0.150390625, "loss_aux_layer_18": 0.158935546875, "loss_aux_layer_19": 0.159912109375, "loss_aux_layer_2": 0.0732421875, "loss_aux_layer_20": 0.166015625, "loss_aux_layer_21": 0.173095703125, "loss_aux_layer_22": 0.1953125, "loss_aux_layer_23": 0.236328125, "loss_aux_layer_3": 0.0860595703125, "loss_aux_layer_4": 0.0888671875, "loss_aux_layer_5": 0.0904541015625, "loss_aux_layer_6": 0.093505859375, "loss_aux_layer_7": 0.0894775390625, "loss_aux_layer_8": 0.08837890625, "loss_aux_layer_9": 0.0870361328125, "step": 1130, "total_loss": 0.6720595210790634 }, { "epoch": 0.2239160562264898, "grad_norm": 1.0437488555908203, "learning_rate": 5e-05, "llm_loss": 0.6336624324321747, "loss": 2.9788, "loss_aux_layer_0": 0.024688720703125, "loss_aux_layer_1": 0.0616455078125, "loss_aux_layer_10": 0.0830078125, "loss_aux_layer_11": 0.088134765625, "loss_aux_layer_12": 0.094970703125, "loss_aux_layer_13": 0.1019287109375, "loss_aux_layer_14": 0.1126708984375, "loss_aux_layer_15": 0.1231689453125, "loss_aux_layer_16": 0.1337890625, "loss_aux_layer_17": 0.141357421875, "loss_aux_layer_18": 0.150390625, "loss_aux_layer_19": 0.152099609375, "loss_aux_layer_2": 0.0689697265625, "loss_aux_layer_20": 0.16015625, "loss_aux_layer_21": 0.16796875, "loss_aux_layer_22": 0.1884765625, "loss_aux_layer_23": 0.228759765625, "loss_aux_layer_3": 0.0806884765625, "loss_aux_layer_4": 0.0833740234375, "loss_aux_layer_5": 0.0849609375, "loss_aux_layer_6": 0.0875244140625, "loss_aux_layer_7": 0.08349609375, "loss_aux_layer_8": 0.0823974609375, "loss_aux_layer_9": 0.0811767578125, "step": 1131, "total_loss": 0.7447123825550079 }, { "epoch": 0.2241140368243912, "grad_norm": 1.2280046939849854, "learning_rate": 5e-05, "llm_loss": 0.5753832831978798, "loss": 2.7447, "loss_aux_layer_0": 0.025115966796875, "loss_aux_layer_1": 0.05853271484375, "loss_aux_layer_10": 0.0799560546875, "loss_aux_layer_11": 0.085205078125, "loss_aux_layer_12": 0.092529296875, "loss_aux_layer_13": 0.100341796875, "loss_aux_layer_14": 0.113037109375, "loss_aux_layer_15": 0.1248779296875, "loss_aux_layer_16": 0.13720703125, "loss_aux_layer_17": 0.14599609375, "loss_aux_layer_18": 0.1552734375, "loss_aux_layer_19": 0.158447265625, "loss_aux_layer_2": 0.06488037109375, "loss_aux_layer_20": 0.165771484375, "loss_aux_layer_21": 0.171875, "loss_aux_layer_22": 0.193115234375, "loss_aux_layer_23": 0.234619140625, "loss_aux_layer_3": 0.075927734375, "loss_aux_layer_4": 0.078369140625, "loss_aux_layer_5": 0.0799560546875, "loss_aux_layer_6": 0.0828857421875, "loss_aux_layer_7": 0.079345703125, "loss_aux_layer_8": 0.0784912109375, "loss_aux_layer_9": 0.0780029296875, "step": 1132, "total_loss": 0.6861861050128937 }, { "epoch": 0.22431201742229262, "grad_norm": 1.2393473386764526, "learning_rate": 5e-05, "llm_loss": 0.6420157626271248, "loss": 3.0388, "loss_aux_layer_0": 0.025665283203125, "loss_aux_layer_1": 0.066650390625, "loss_aux_layer_10": 0.0875244140625, "loss_aux_layer_11": 0.0936279296875, "loss_aux_layer_12": 0.101318359375, "loss_aux_layer_13": 0.1094970703125, "loss_aux_layer_14": 0.121826171875, "loss_aux_layer_15": 0.132568359375, "loss_aux_layer_16": 0.1435546875, "loss_aux_layer_17": 0.1513671875, "loss_aux_layer_18": 0.15966796875, "loss_aux_layer_19": 0.1611328125, "loss_aux_layer_2": 0.0731201171875, "loss_aux_layer_20": 0.16796875, "loss_aux_layer_21": 0.174560546875, "loss_aux_layer_22": 0.197021484375, "loss_aux_layer_23": 0.23828125, "loss_aux_layer_3": 0.085693359375, "loss_aux_layer_4": 0.0882568359375, "loss_aux_layer_5": 0.0897216796875, "loss_aux_layer_6": 0.0926513671875, "loss_aux_layer_7": 0.0889892578125, "loss_aux_layer_8": 0.087646484375, "loss_aux_layer_9": 0.0858154296875, "step": 1133, "total_loss": 0.759703129529953 }, { "epoch": 0.22450999802019403, "grad_norm": 1.3550105094909668, "learning_rate": 5e-05, "llm_loss": 0.6932096630334854, "loss": 3.2369, "loss_aux_layer_0": 0.02520751953125, "loss_aux_layer_1": 0.06475830078125, "loss_aux_layer_10": 0.08740234375, "loss_aux_layer_11": 0.0926513671875, "loss_aux_layer_12": 0.099853515625, "loss_aux_layer_13": 0.1070556640625, "loss_aux_layer_14": 0.1190185546875, "loss_aux_layer_15": 0.1292724609375, "loss_aux_layer_16": 0.14013671875, "loss_aux_layer_17": 0.1474609375, "loss_aux_layer_18": 0.1572265625, "loss_aux_layer_19": 0.158935546875, "loss_aux_layer_2": 0.072265625, "loss_aux_layer_20": 0.166015625, "loss_aux_layer_21": 0.172119140625, "loss_aux_layer_22": 0.194091796875, "loss_aux_layer_23": 0.235595703125, "loss_aux_layer_3": 0.0850830078125, "loss_aux_layer_4": 0.087646484375, "loss_aux_layer_5": 0.0892333984375, "loss_aux_layer_6": 0.0924072265625, "loss_aux_layer_7": 0.0885009765625, "loss_aux_layer_8": 0.0869140625, "loss_aux_layer_9": 0.0859375, "step": 1134, "total_loss": 0.8092344850301743 }, { "epoch": 0.22470797861809544, "grad_norm": 0.9520754814147949, "learning_rate": 5e-05, "llm_loss": 0.5748120471835136, "loss": 2.7815, "loss_aux_layer_0": 0.0238037109375, "loss_aux_layer_1": 0.066650390625, "loss_aux_layer_10": 0.092529296875, "loss_aux_layer_11": 0.0982666015625, "loss_aux_layer_12": 0.1055908203125, "loss_aux_layer_13": 0.1134033203125, "loss_aux_layer_14": 0.12451171875, "loss_aux_layer_15": 0.1357421875, "loss_aux_layer_16": 0.146240234375, "loss_aux_layer_17": 0.15380859375, "loss_aux_layer_18": 0.162841796875, "loss_aux_layer_19": 0.163330078125, "loss_aux_layer_2": 0.07568359375, "loss_aux_layer_20": 0.169921875, "loss_aux_layer_21": 0.17626953125, "loss_aux_layer_22": 0.19775390625, "loss_aux_layer_23": 0.2392578125, "loss_aux_layer_3": 0.0888671875, "loss_aux_layer_4": 0.092041015625, "loss_aux_layer_5": 0.0936279296875, "loss_aux_layer_6": 0.0968017578125, "loss_aux_layer_7": 0.09326171875, "loss_aux_layer_8": 0.092041015625, "loss_aux_layer_9": 0.0908203125, "step": 1135, "total_loss": 0.6953647434711456 }, { "epoch": 0.22490595921599682, "grad_norm": 2.487880229949951, "learning_rate": 5e-05, "llm_loss": 0.6125191897153854, "loss": 2.8959, "loss_aux_layer_0": 0.02783203125, "loss_aux_layer_1": 0.06146240234375, "loss_aux_layer_10": 0.08056640625, "loss_aux_layer_11": 0.0855712890625, "loss_aux_layer_12": 0.093017578125, "loss_aux_layer_13": 0.1011962890625, "loss_aux_layer_14": 0.1136474609375, "loss_aux_layer_15": 0.1258544921875, "loss_aux_layer_16": 0.1376953125, "loss_aux_layer_17": 0.145751953125, "loss_aux_layer_18": 0.154541015625, "loss_aux_layer_19": 0.1572265625, "loss_aux_layer_2": 0.06634521484375, "loss_aux_layer_20": 0.164306640625, "loss_aux_layer_21": 0.1708984375, "loss_aux_layer_22": 0.192138671875, "loss_aux_layer_23": 0.233642578125, "loss_aux_layer_3": 0.077880859375, "loss_aux_layer_4": 0.0802001953125, "loss_aux_layer_5": 0.0816650390625, "loss_aux_layer_6": 0.08447265625, "loss_aux_layer_7": 0.0806884765625, "loss_aux_layer_8": 0.0797119140625, "loss_aux_layer_9": 0.07861328125, "step": 1136, "total_loss": 0.7239679843187332 }, { "epoch": 0.22510393981389823, "grad_norm": 2.5547969341278076, "learning_rate": 5e-05, "llm_loss": 0.6136593669652939, "loss": 2.9243, "loss_aux_layer_0": 0.0272216796875, "loss_aux_layer_1": 0.0633544921875, "loss_aux_layer_10": 0.08642578125, "loss_aux_layer_11": 0.0914306640625, "loss_aux_layer_12": 0.0992431640625, "loss_aux_layer_13": 0.107421875, "loss_aux_layer_14": 0.1195068359375, "loss_aux_layer_15": 0.131591796875, "loss_aux_layer_16": 0.142578125, "loss_aux_layer_17": 0.151123046875, "loss_aux_layer_18": 0.159912109375, "loss_aux_layer_19": 0.16259765625, "loss_aux_layer_2": 0.0731201171875, "loss_aux_layer_20": 0.170166015625, "loss_aux_layer_21": 0.177734375, "loss_aux_layer_22": 0.199462890625, "loss_aux_layer_23": 0.2421875, "loss_aux_layer_3": 0.0848388671875, "loss_aux_layer_4": 0.0872802734375, "loss_aux_layer_5": 0.089599609375, "loss_aux_layer_6": 0.0919189453125, "loss_aux_layer_7": 0.08740234375, "loss_aux_layer_8": 0.0865478515625, "loss_aux_layer_9": 0.0849609375, "step": 1137, "total_loss": 0.7310779839754105 }, { "epoch": 0.22530192041179964, "grad_norm": 2.132204055786133, "learning_rate": 5e-05, "llm_loss": 0.5942329987883568, "loss": 2.8526, "loss_aux_layer_0": 0.025665283203125, "loss_aux_layer_1": 0.06634521484375, "loss_aux_layer_10": 0.089111328125, "loss_aux_layer_11": 0.094970703125, "loss_aux_layer_12": 0.1025390625, "loss_aux_layer_13": 0.110595703125, "loss_aux_layer_14": 0.122314453125, "loss_aux_layer_15": 0.1337890625, "loss_aux_layer_16": 0.1455078125, "loss_aux_layer_17": 0.1533203125, "loss_aux_layer_18": 0.162353515625, "loss_aux_layer_19": 0.163330078125, "loss_aux_layer_2": 0.0736083984375, "loss_aux_layer_20": 0.170166015625, "loss_aux_layer_21": 0.177001953125, "loss_aux_layer_22": 0.197265625, "loss_aux_layer_23": 0.238525390625, "loss_aux_layer_3": 0.0865478515625, "loss_aux_layer_4": 0.0888671875, "loss_aux_layer_5": 0.0908203125, "loss_aux_layer_6": 0.09423828125, "loss_aux_layer_7": 0.0899658203125, "loss_aux_layer_8": 0.0887451171875, "loss_aux_layer_9": 0.0875244140625, "step": 1138, "total_loss": 0.7131377905607224 }, { "epoch": 0.22549990100970105, "grad_norm": 1.785855770111084, "learning_rate": 5e-05, "llm_loss": 0.6155270040035248, "loss": 2.9374, "loss_aux_layer_0": 0.024810791015625, "loss_aux_layer_1": 0.065185546875, "loss_aux_layer_10": 0.090576171875, "loss_aux_layer_11": 0.095947265625, "loss_aux_layer_12": 0.1036376953125, "loss_aux_layer_13": 0.11083984375, "loss_aux_layer_14": 0.1217041015625, "loss_aux_layer_15": 0.1324462890625, "loss_aux_layer_16": 0.1435546875, "loss_aux_layer_17": 0.150634765625, "loss_aux_layer_18": 0.15966796875, "loss_aux_layer_19": 0.1611328125, "loss_aux_layer_2": 0.07568359375, "loss_aux_layer_20": 0.167236328125, "loss_aux_layer_21": 0.17333984375, "loss_aux_layer_22": 0.1953125, "loss_aux_layer_23": 0.2353515625, "loss_aux_layer_3": 0.08935546875, "loss_aux_layer_4": 0.092529296875, "loss_aux_layer_5": 0.0947265625, "loss_aux_layer_6": 0.09716796875, "loss_aux_layer_7": 0.0919189453125, "loss_aux_layer_8": 0.090087890625, "loss_aux_layer_9": 0.0888671875, "step": 1139, "total_loss": 0.7343472093343735 }, { "epoch": 0.22569788160760246, "grad_norm": 1.6551557779312134, "learning_rate": 5e-05, "llm_loss": 0.6310880035161972, "loss": 2.985, "loss_aux_layer_0": 0.024322509765625, "loss_aux_layer_1": 0.06304931640625, "loss_aux_layer_10": 0.0850830078125, "loss_aux_layer_11": 0.0906982421875, "loss_aux_layer_12": 0.0982666015625, "loss_aux_layer_13": 0.105712890625, "loss_aux_layer_14": 0.1173095703125, "loss_aux_layer_15": 0.12890625, "loss_aux_layer_16": 0.139892578125, "loss_aux_layer_17": 0.148681640625, "loss_aux_layer_18": 0.1572265625, "loss_aux_layer_19": 0.159423828125, "loss_aux_layer_2": 0.0716552734375, "loss_aux_layer_20": 0.1669921875, "loss_aux_layer_21": 0.173095703125, "loss_aux_layer_22": 0.193603515625, "loss_aux_layer_23": 0.233642578125, "loss_aux_layer_3": 0.0845947265625, "loss_aux_layer_4": 0.0870361328125, "loss_aux_layer_5": 0.08837890625, "loss_aux_layer_6": 0.0911865234375, "loss_aux_layer_7": 0.0869140625, "loss_aux_layer_8": 0.0848388671875, "loss_aux_layer_9": 0.0833740234375, "step": 1140, "total_loss": 0.7462605535984039 }, { "epoch": 0.22589586220550387, "grad_norm": 2.086967706680298, "learning_rate": 5e-05, "llm_loss": 0.6487609446048737, "loss": 3.036, "loss_aux_layer_0": 0.026580810546875, "loss_aux_layer_1": 0.059326171875, "loss_aux_layer_10": 0.07958984375, "loss_aux_layer_11": 0.0845947265625, "loss_aux_layer_12": 0.0919189453125, "loss_aux_layer_13": 0.099609375, "loss_aux_layer_14": 0.1119384765625, "loss_aux_layer_15": 0.1234130859375, "loss_aux_layer_16": 0.135009765625, "loss_aux_layer_17": 0.1435546875, "loss_aux_layer_18": 0.1533203125, "loss_aux_layer_19": 0.15576171875, "loss_aux_layer_2": 0.06719970703125, "loss_aux_layer_20": 0.1630859375, "loss_aux_layer_21": 0.16943359375, "loss_aux_layer_22": 0.1884765625, "loss_aux_layer_23": 0.23046875, "loss_aux_layer_3": 0.0777587890625, "loss_aux_layer_4": 0.080322265625, "loss_aux_layer_5": 0.082275390625, "loss_aux_layer_6": 0.0848388671875, "loss_aux_layer_7": 0.0804443359375, "loss_aux_layer_8": 0.079345703125, "loss_aux_layer_9": 0.078125, "step": 1141, "total_loss": 0.7589913755655289 }, { "epoch": 0.22609384280340528, "grad_norm": 1.5714091062545776, "learning_rate": 5e-05, "llm_loss": 0.5462693721055984, "loss": 2.6559, "loss_aux_layer_0": 0.02459716796875, "loss_aux_layer_1": 0.06365966796875, "loss_aux_layer_10": 0.0870361328125, "loss_aux_layer_11": 0.0931396484375, "loss_aux_layer_12": 0.10107421875, "loss_aux_layer_13": 0.109130859375, "loss_aux_layer_14": 0.12109375, "loss_aux_layer_15": 0.1326904296875, "loss_aux_layer_16": 0.144287109375, "loss_aux_layer_17": 0.152587890625, "loss_aux_layer_18": 0.162109375, "loss_aux_layer_19": 0.1640625, "loss_aux_layer_2": 0.072265625, "loss_aux_layer_20": 0.170654296875, "loss_aux_layer_21": 0.177978515625, "loss_aux_layer_22": 0.2001953125, "loss_aux_layer_23": 0.24169921875, "loss_aux_layer_3": 0.084228515625, "loss_aux_layer_4": 0.0865478515625, "loss_aux_layer_5": 0.0877685546875, "loss_aux_layer_6": 0.090576171875, "loss_aux_layer_7": 0.0869140625, "loss_aux_layer_8": 0.0859375, "loss_aux_layer_9": 0.0848388671875, "step": 1142, "total_loss": 0.6639809161424637 }, { "epoch": 0.22629182340130669, "grad_norm": 1.9363491535186768, "learning_rate": 5e-05, "llm_loss": 0.7405392825603485, "loss": 3.4323, "loss_aux_layer_0": 0.024017333984375, "loss_aux_layer_1": 0.065185546875, "loss_aux_layer_10": 0.0897216796875, "loss_aux_layer_11": 0.0947265625, "loss_aux_layer_12": 0.10205078125, "loss_aux_layer_13": 0.1097412109375, "loss_aux_layer_14": 0.121337890625, "loss_aux_layer_15": 0.1318359375, "loss_aux_layer_16": 0.142578125, "loss_aux_layer_17": 0.150390625, "loss_aux_layer_18": 0.1591796875, "loss_aux_layer_19": 0.160400390625, "loss_aux_layer_2": 0.075439453125, "loss_aux_layer_20": 0.166259765625, "loss_aux_layer_21": 0.172119140625, "loss_aux_layer_22": 0.192138671875, "loss_aux_layer_23": 0.232177734375, "loss_aux_layer_3": 0.0869140625, "loss_aux_layer_4": 0.0899658203125, "loss_aux_layer_5": 0.0916748046875, "loss_aux_layer_6": 0.094970703125, "loss_aux_layer_7": 0.0908203125, "loss_aux_layer_8": 0.0894775390625, "loss_aux_layer_9": 0.0880126953125, "step": 1143, "total_loss": 0.8580739200115204 }, { "epoch": 0.22648980399920807, "grad_norm": 1.6275497674942017, "learning_rate": 5e-05, "llm_loss": 0.6224101781845093, "loss": 2.9699, "loss_aux_layer_0": 0.026153564453125, "loss_aux_layer_1": 0.06732177734375, "loss_aux_layer_10": 0.0902099609375, "loss_aux_layer_11": 0.0960693359375, "loss_aux_layer_12": 0.1041259765625, "loss_aux_layer_13": 0.112060546875, "loss_aux_layer_14": 0.125, "loss_aux_layer_15": 0.1356201171875, "loss_aux_layer_16": 0.14697265625, "loss_aux_layer_17": 0.154052734375, "loss_aux_layer_18": 0.162353515625, "loss_aux_layer_19": 0.163330078125, "loss_aux_layer_2": 0.0758056640625, "loss_aux_layer_20": 0.169677734375, "loss_aux_layer_21": 0.176025390625, "loss_aux_layer_22": 0.19775390625, "loss_aux_layer_23": 0.24072265625, "loss_aux_layer_3": 0.088623046875, "loss_aux_layer_4": 0.09130859375, "loss_aux_layer_5": 0.0926513671875, "loss_aux_layer_6": 0.095458984375, "loss_aux_layer_7": 0.09130859375, "loss_aux_layer_8": 0.08984375, "loss_aux_layer_9": 0.0882568359375, "step": 1144, "total_loss": 0.7424758970737457 }, { "epoch": 0.22668778459710948, "grad_norm": 1.724866271018982, "learning_rate": 5e-05, "llm_loss": 0.6819659322500229, "loss": 3.2031, "loss_aux_layer_0": 0.029449462890625, "loss_aux_layer_1": 0.06829833984375, "loss_aux_layer_10": 0.0894775390625, "loss_aux_layer_11": 0.0950927734375, "loss_aux_layer_12": 0.1026611328125, "loss_aux_layer_13": 0.1104736328125, "loss_aux_layer_14": 0.12255859375, "loss_aux_layer_15": 0.134033203125, "loss_aux_layer_16": 0.14501953125, "loss_aux_layer_17": 0.153564453125, "loss_aux_layer_18": 0.161865234375, "loss_aux_layer_19": 0.1630859375, "loss_aux_layer_2": 0.073486328125, "loss_aux_layer_20": 0.169921875, "loss_aux_layer_21": 0.17529296875, "loss_aux_layer_22": 0.1962890625, "loss_aux_layer_23": 0.236083984375, "loss_aux_layer_3": 0.08642578125, "loss_aux_layer_4": 0.0887451171875, "loss_aux_layer_5": 0.09033203125, "loss_aux_layer_6": 0.09326171875, "loss_aux_layer_7": 0.08935546875, "loss_aux_layer_8": 0.0885009765625, "loss_aux_layer_9": 0.087646484375, "step": 1145, "total_loss": 0.800775945186615 }, { "epoch": 0.22688576519501089, "grad_norm": 1.8087784051895142, "learning_rate": 5e-05, "llm_loss": 0.6658012270927429, "loss": 3.1293, "loss_aux_layer_0": 0.02459716796875, "loss_aux_layer_1": 0.06365966796875, "loss_aux_layer_10": 0.0865478515625, "loss_aux_layer_11": 0.092041015625, "loss_aux_layer_12": 0.099365234375, "loss_aux_layer_13": 0.1075439453125, "loss_aux_layer_14": 0.119384765625, "loss_aux_layer_15": 0.13134765625, "loss_aux_layer_16": 0.142333984375, "loss_aux_layer_17": 0.14990234375, "loss_aux_layer_18": 0.159423828125, "loss_aux_layer_19": 0.161376953125, "loss_aux_layer_2": 0.072265625, "loss_aux_layer_20": 0.168701171875, "loss_aux_layer_21": 0.174560546875, "loss_aux_layer_22": 0.195556640625, "loss_aux_layer_23": 0.23583984375, "loss_aux_layer_3": 0.0850830078125, "loss_aux_layer_4": 0.087158203125, "loss_aux_layer_5": 0.0888671875, "loss_aux_layer_6": 0.091552734375, "loss_aux_layer_7": 0.08740234375, "loss_aux_layer_8": 0.08642578125, "loss_aux_layer_9": 0.0853271484375, "step": 1146, "total_loss": 0.7823303639888763 }, { "epoch": 0.2270837457929123, "grad_norm": 2.869732618331909, "learning_rate": 5e-05, "llm_loss": 0.5993388593196869, "loss": 2.8676, "loss_aux_layer_0": 0.02496337890625, "loss_aux_layer_1": 0.0650634765625, "loss_aux_layer_10": 0.0880126953125, "loss_aux_layer_11": 0.093505859375, "loss_aux_layer_12": 0.1007080078125, "loss_aux_layer_13": 0.1082763671875, "loss_aux_layer_14": 0.1209716796875, "loss_aux_layer_15": 0.13232421875, "loss_aux_layer_16": 0.1435546875, "loss_aux_layer_17": 0.151123046875, "loss_aux_layer_18": 0.160888671875, "loss_aux_layer_19": 0.163330078125, "loss_aux_layer_2": 0.0723876953125, "loss_aux_layer_20": 0.169677734375, "loss_aux_layer_21": 0.17578125, "loss_aux_layer_22": 0.19580078125, "loss_aux_layer_23": 0.237548828125, "loss_aux_layer_3": 0.085205078125, "loss_aux_layer_4": 0.087646484375, "loss_aux_layer_5": 0.0897216796875, "loss_aux_layer_6": 0.09228515625, "loss_aux_layer_7": 0.0888671875, "loss_aux_layer_8": 0.08740234375, "loss_aux_layer_9": 0.08642578125, "step": 1147, "total_loss": 0.7168898433446884 }, { "epoch": 0.2272817263908137, "grad_norm": 1.7566126585006714, "learning_rate": 5e-05, "llm_loss": 0.6069810688495636, "loss": 2.914, "loss_aux_layer_0": 0.02862548828125, "loss_aux_layer_1": 0.07177734375, "loss_aux_layer_10": 0.093017578125, "loss_aux_layer_11": 0.09912109375, "loss_aux_layer_12": 0.1064453125, "loss_aux_layer_13": 0.1138916015625, "loss_aux_layer_14": 0.1256103515625, "loss_aux_layer_15": 0.13671875, "loss_aux_layer_16": 0.147216796875, "loss_aux_layer_17": 0.15478515625, "loss_aux_layer_18": 0.16259765625, "loss_aux_layer_19": 0.1640625, "loss_aux_layer_2": 0.0762939453125, "loss_aux_layer_20": 0.17041015625, "loss_aux_layer_21": 0.176513671875, "loss_aux_layer_22": 0.197265625, "loss_aux_layer_23": 0.2373046875, "loss_aux_layer_3": 0.0894775390625, "loss_aux_layer_4": 0.093017578125, "loss_aux_layer_5": 0.0948486328125, "loss_aux_layer_6": 0.0982666015625, "loss_aux_layer_7": 0.0943603515625, "loss_aux_layer_8": 0.09326171875, "loss_aux_layer_9": 0.091796875, "step": 1148, "total_loss": 0.7285099923610687 }, { "epoch": 0.2274797069887151, "grad_norm": 2.5631277561187744, "learning_rate": 5e-05, "llm_loss": 0.6511960625648499, "loss": 3.0612, "loss_aux_layer_0": 0.024688720703125, "loss_aux_layer_1": 0.06414794921875, "loss_aux_layer_10": 0.08447265625, "loss_aux_layer_11": 0.089599609375, "loss_aux_layer_12": 0.096923828125, "loss_aux_layer_13": 0.104248046875, "loss_aux_layer_14": 0.1156005859375, "loss_aux_layer_15": 0.12646484375, "loss_aux_layer_16": 0.137939453125, "loss_aux_layer_17": 0.1455078125, "loss_aux_layer_18": 0.155029296875, "loss_aux_layer_19": 0.15771484375, "loss_aux_layer_2": 0.0718994140625, "loss_aux_layer_20": 0.164794921875, "loss_aux_layer_21": 0.1708984375, "loss_aux_layer_22": 0.1904296875, "loss_aux_layer_23": 0.232177734375, "loss_aux_layer_3": 0.084228515625, "loss_aux_layer_4": 0.0870361328125, "loss_aux_layer_5": 0.0892333984375, "loss_aux_layer_6": 0.0909423828125, "loss_aux_layer_7": 0.0865478515625, "loss_aux_layer_8": 0.0848388671875, "loss_aux_layer_9": 0.083251953125, "step": 1149, "total_loss": 0.7653035223484039 }, { "epoch": 0.22767768758661652, "grad_norm": 1.4479470252990723, "learning_rate": 5e-05, "llm_loss": 0.5697089582681656, "loss": 2.7527, "loss_aux_layer_0": 0.025482177734375, "loss_aux_layer_1": 0.0667724609375, "loss_aux_layer_10": 0.0888671875, "loss_aux_layer_11": 0.0946044921875, "loss_aux_layer_12": 0.102294921875, "loss_aux_layer_13": 0.1102294921875, "loss_aux_layer_14": 0.122314453125, "loss_aux_layer_15": 0.133544921875, "loss_aux_layer_16": 0.14453125, "loss_aux_layer_17": 0.152099609375, "loss_aux_layer_18": 0.1611328125, "loss_aux_layer_19": 0.162109375, "loss_aux_layer_2": 0.0743408203125, "loss_aux_layer_20": 0.168212890625, "loss_aux_layer_21": 0.173828125, "loss_aux_layer_22": 0.196044921875, "loss_aux_layer_23": 0.2373046875, "loss_aux_layer_3": 0.0870361328125, "loss_aux_layer_4": 0.090087890625, "loss_aux_layer_5": 0.0916748046875, "loss_aux_layer_6": 0.094482421875, "loss_aux_layer_7": 0.0899658203125, "loss_aux_layer_8": 0.0885009765625, "loss_aux_layer_9": 0.0872802734375, "step": 1150, "total_loss": 0.6881705522537231 }, { "epoch": 0.2278756681845179, "grad_norm": 1.9491486549377441, "learning_rate": 5e-05, "llm_loss": 0.5540680289268494, "loss": 2.6925, "loss_aux_layer_0": 0.02508544921875, "loss_aux_layer_1": 0.06671142578125, "loss_aux_layer_10": 0.0909423828125, "loss_aux_layer_11": 0.096923828125, "loss_aux_layer_12": 0.1041259765625, "loss_aux_layer_13": 0.111572265625, "loss_aux_layer_14": 0.12255859375, "loss_aux_layer_15": 0.1328125, "loss_aux_layer_16": 0.143798828125, "loss_aux_layer_17": 0.151611328125, "loss_aux_layer_18": 0.1591796875, "loss_aux_layer_19": 0.16015625, "loss_aux_layer_2": 0.07568359375, "loss_aux_layer_20": 0.166748046875, "loss_aux_layer_21": 0.173095703125, "loss_aux_layer_22": 0.1953125, "loss_aux_layer_23": 0.236083984375, "loss_aux_layer_3": 0.0887451171875, "loss_aux_layer_4": 0.0921630859375, "loss_aux_layer_5": 0.0941162109375, "loss_aux_layer_6": 0.0970458984375, "loss_aux_layer_7": 0.092529296875, "loss_aux_layer_8": 0.0909423828125, "loss_aux_layer_9": 0.0894775390625, "step": 1151, "total_loss": 0.6731307655572891 }, { "epoch": 0.2280736487824193, "grad_norm": 1.3991519212722778, "learning_rate": 5e-05, "llm_loss": 0.6527725458145142, "loss": 3.0758, "loss_aux_layer_0": 0.027008056640625, "loss_aux_layer_1": 0.0662841796875, "loss_aux_layer_10": 0.0870361328125, "loss_aux_layer_11": 0.0921630859375, "loss_aux_layer_12": 0.09912109375, "loss_aux_layer_13": 0.1065673828125, "loss_aux_layer_14": 0.117919921875, "loss_aux_layer_15": 0.12890625, "loss_aux_layer_16": 0.1396484375, "loss_aux_layer_17": 0.147705078125, "loss_aux_layer_18": 0.156494140625, "loss_aux_layer_19": 0.159912109375, "loss_aux_layer_2": 0.072021484375, "loss_aux_layer_20": 0.167236328125, "loss_aux_layer_21": 0.173583984375, "loss_aux_layer_22": 0.196044921875, "loss_aux_layer_23": 0.237548828125, "loss_aux_layer_3": 0.084228515625, "loss_aux_layer_4": 0.0870361328125, "loss_aux_layer_5": 0.0887451171875, "loss_aux_layer_6": 0.0916748046875, "loss_aux_layer_7": 0.0880126953125, "loss_aux_layer_8": 0.0869140625, "loss_aux_layer_9": 0.085693359375, "step": 1152, "total_loss": 0.7689437568187714 }, { "epoch": 0.22827162938032072, "grad_norm": 1.0916799306869507, "learning_rate": 5e-05, "llm_loss": 0.6748979240655899, "loss": 3.1587, "loss_aux_layer_0": 0.024322509765625, "loss_aux_layer_1": 0.064208984375, "loss_aux_layer_10": 0.087158203125, "loss_aux_layer_11": 0.0926513671875, "loss_aux_layer_12": 0.09912109375, "loss_aux_layer_13": 0.106201171875, "loss_aux_layer_14": 0.1168212890625, "loss_aux_layer_15": 0.12744140625, "loss_aux_layer_16": 0.137939453125, "loss_aux_layer_17": 0.14599609375, "loss_aux_layer_18": 0.154541015625, "loss_aux_layer_19": 0.155517578125, "loss_aux_layer_2": 0.0728759765625, "loss_aux_layer_20": 0.162841796875, "loss_aux_layer_21": 0.169189453125, "loss_aux_layer_22": 0.1904296875, "loss_aux_layer_23": 0.23046875, "loss_aux_layer_3": 0.084716796875, "loss_aux_layer_4": 0.0877685546875, "loss_aux_layer_5": 0.089599609375, "loss_aux_layer_6": 0.092529296875, "loss_aux_layer_7": 0.0885009765625, "loss_aux_layer_8": 0.0875244140625, "loss_aux_layer_9": 0.0858154296875, "step": 1153, "total_loss": 0.789676621556282 }, { "epoch": 0.22846960997822213, "grad_norm": 1.1922706365585327, "learning_rate": 5e-05, "llm_loss": 0.6433283910155296, "loss": 3.0372, "loss_aux_layer_0": 0.024505615234375, "loss_aux_layer_1": 0.0667724609375, "loss_aux_layer_10": 0.08740234375, "loss_aux_layer_11": 0.09326171875, "loss_aux_layer_12": 0.100341796875, "loss_aux_layer_13": 0.107666015625, "loss_aux_layer_14": 0.11865234375, "loss_aux_layer_15": 0.129150390625, "loss_aux_layer_16": 0.140625, "loss_aux_layer_17": 0.14794921875, "loss_aux_layer_18": 0.156494140625, "loss_aux_layer_19": 0.157958984375, "loss_aux_layer_2": 0.0748291015625, "loss_aux_layer_20": 0.16455078125, "loss_aux_layer_21": 0.170166015625, "loss_aux_layer_22": 0.19091796875, "loss_aux_layer_23": 0.229736328125, "loss_aux_layer_3": 0.0865478515625, "loss_aux_layer_4": 0.0894775390625, "loss_aux_layer_5": 0.0906982421875, "loss_aux_layer_6": 0.093017578125, "loss_aux_layer_7": 0.089111328125, "loss_aux_layer_8": 0.087646484375, "loss_aux_layer_9": 0.0859375, "step": 1154, "total_loss": 0.7593113780021667 }, { "epoch": 0.22866759057612354, "grad_norm": 1.0876600742340088, "learning_rate": 5e-05, "llm_loss": 0.6243837773799896, "loss": 2.9324, "loss_aux_layer_0": 0.0234375, "loss_aux_layer_1": 0.05816650390625, "loss_aux_layer_10": 0.0799560546875, "loss_aux_layer_11": 0.0845947265625, "loss_aux_layer_12": 0.091796875, "loss_aux_layer_13": 0.0989990234375, "loss_aux_layer_14": 0.10986328125, "loss_aux_layer_15": 0.1207275390625, "loss_aux_layer_16": 0.1317138671875, "loss_aux_layer_17": 0.140625, "loss_aux_layer_18": 0.14990234375, "loss_aux_layer_19": 0.153076171875, "loss_aux_layer_2": 0.06536865234375, "loss_aux_layer_20": 0.160888671875, "loss_aux_layer_21": 0.1669921875, "loss_aux_layer_22": 0.1875, "loss_aux_layer_23": 0.227294921875, "loss_aux_layer_3": 0.0762939453125, "loss_aux_layer_4": 0.0791015625, "loss_aux_layer_5": 0.0810546875, "loss_aux_layer_6": 0.083740234375, "loss_aux_layer_7": 0.0802001953125, "loss_aux_layer_8": 0.0794677734375, "loss_aux_layer_9": 0.0784912109375, "step": 1155, "total_loss": 0.7331015318632126 }, { "epoch": 0.22886557117402495, "grad_norm": 1.0671731233596802, "learning_rate": 5e-05, "llm_loss": 0.6698109954595566, "loss": 3.1236, "loss_aux_layer_0": 0.02392578125, "loss_aux_layer_1": 0.05975341796875, "loss_aux_layer_10": 0.081298828125, "loss_aux_layer_11": 0.0867919921875, "loss_aux_layer_12": 0.0943603515625, "loss_aux_layer_13": 0.102294921875, "loss_aux_layer_14": 0.1146240234375, "loss_aux_layer_15": 0.1258544921875, "loss_aux_layer_16": 0.13720703125, "loss_aux_layer_17": 0.1455078125, "loss_aux_layer_18": 0.154052734375, "loss_aux_layer_19": 0.156005859375, "loss_aux_layer_2": 0.06787109375, "loss_aux_layer_20": 0.162353515625, "loss_aux_layer_21": 0.16845703125, "loss_aux_layer_22": 0.1884765625, "loss_aux_layer_23": 0.22802734375, "loss_aux_layer_3": 0.0787353515625, "loss_aux_layer_4": 0.0811767578125, "loss_aux_layer_5": 0.0826416015625, "loss_aux_layer_6": 0.0855712890625, "loss_aux_layer_7": 0.08154296875, "loss_aux_layer_8": 0.080810546875, "loss_aux_layer_9": 0.07958984375, "step": 1156, "total_loss": 0.7808983325958252 }, { "epoch": 0.22906355177192636, "grad_norm": 1.1571509838104248, "learning_rate": 5e-05, "llm_loss": 0.7329362034797668, "loss": 3.3842, "loss_aux_layer_0": 0.025238037109375, "loss_aux_layer_1": 0.06268310546875, "loss_aux_layer_10": 0.0841064453125, "loss_aux_layer_11": 0.0889892578125, "loss_aux_layer_12": 0.095947265625, "loss_aux_layer_13": 0.1033935546875, "loss_aux_layer_14": 0.115478515625, "loss_aux_layer_15": 0.12646484375, "loss_aux_layer_16": 0.138427734375, "loss_aux_layer_17": 0.14599609375, "loss_aux_layer_18": 0.155029296875, "loss_aux_layer_19": 0.157470703125, "loss_aux_layer_2": 0.0693359375, "loss_aux_layer_20": 0.164794921875, "loss_aux_layer_21": 0.1708984375, "loss_aux_layer_22": 0.18994140625, "loss_aux_layer_23": 0.23046875, "loss_aux_layer_3": 0.081298828125, "loss_aux_layer_4": 0.0841064453125, "loss_aux_layer_5": 0.0858154296875, "loss_aux_layer_6": 0.088623046875, "loss_aux_layer_7": 0.0845947265625, "loss_aux_layer_8": 0.0836181640625, "loss_aux_layer_9": 0.08251953125, "step": 1157, "total_loss": 0.846040740609169 }, { "epoch": 0.22926153236982777, "grad_norm": 2.049626350402832, "learning_rate": 5e-05, "llm_loss": 0.6379738003015518, "loss": 3.0236, "loss_aux_layer_0": 0.024444580078125, "loss_aux_layer_1": 0.06494140625, "loss_aux_layer_10": 0.088623046875, "loss_aux_layer_11": 0.093994140625, "loss_aux_layer_12": 0.1015625, "loss_aux_layer_13": 0.1092529296875, "loss_aux_layer_14": 0.120849609375, "loss_aux_layer_15": 0.1318359375, "loss_aux_layer_16": 0.14404296875, "loss_aux_layer_17": 0.1513671875, "loss_aux_layer_18": 0.160888671875, "loss_aux_layer_19": 0.1630859375, "loss_aux_layer_2": 0.073974609375, "loss_aux_layer_20": 0.1689453125, "loss_aux_layer_21": 0.1748046875, "loss_aux_layer_22": 0.1953125, "loss_aux_layer_23": 0.235595703125, "loss_aux_layer_3": 0.0865478515625, "loss_aux_layer_4": 0.0899658203125, "loss_aux_layer_5": 0.092041015625, "loss_aux_layer_6": 0.0936279296875, "loss_aux_layer_7": 0.08935546875, "loss_aux_layer_8": 0.0882568359375, "loss_aux_layer_9": 0.0869140625, "step": 1158, "total_loss": 0.7558980137109756 }, { "epoch": 0.22945951296772915, "grad_norm": 1.719138503074646, "learning_rate": 5e-05, "llm_loss": 0.594500258564949, "loss": 2.8411, "loss_aux_layer_0": 0.025665283203125, "loss_aux_layer_1": 0.065673828125, "loss_aux_layer_10": 0.0875244140625, "loss_aux_layer_11": 0.0927734375, "loss_aux_layer_12": 0.10009765625, "loss_aux_layer_13": 0.107421875, "loss_aux_layer_14": 0.1185302734375, "loss_aux_layer_15": 0.128662109375, "loss_aux_layer_16": 0.13916015625, "loss_aux_layer_17": 0.147216796875, "loss_aux_layer_18": 0.15576171875, "loss_aux_layer_19": 0.1572265625, "loss_aux_layer_2": 0.0738525390625, "loss_aux_layer_20": 0.16455078125, "loss_aux_layer_21": 0.170654296875, "loss_aux_layer_22": 0.1923828125, "loss_aux_layer_23": 0.232666015625, "loss_aux_layer_3": 0.08544921875, "loss_aux_layer_4": 0.08837890625, "loss_aux_layer_5": 0.090087890625, "loss_aux_layer_6": 0.0926513671875, "loss_aux_layer_7": 0.0887451171875, "loss_aux_layer_8": 0.08740234375, "loss_aux_layer_9": 0.08642578125, "step": 1159, "total_loss": 0.710281252861023 }, { "epoch": 0.22965749356563056, "grad_norm": 1.146241307258606, "learning_rate": 5e-05, "llm_loss": 0.6471432745456696, "loss": 3.0608, "loss_aux_layer_0": 0.025177001953125, "loss_aux_layer_1": 0.066650390625, "loss_aux_layer_10": 0.0902099609375, "loss_aux_layer_11": 0.095703125, "loss_aux_layer_12": 0.102783203125, "loss_aux_layer_13": 0.1102294921875, "loss_aux_layer_14": 0.121337890625, "loss_aux_layer_15": 0.1314697265625, "loss_aux_layer_16": 0.142578125, "loss_aux_layer_17": 0.1494140625, "loss_aux_layer_18": 0.158447265625, "loss_aux_layer_19": 0.159423828125, "loss_aux_layer_2": 0.075927734375, "loss_aux_layer_20": 0.165283203125, "loss_aux_layer_21": 0.171142578125, "loss_aux_layer_22": 0.19287109375, "loss_aux_layer_23": 0.233642578125, "loss_aux_layer_3": 0.088623046875, "loss_aux_layer_4": 0.091796875, "loss_aux_layer_5": 0.093505859375, "loss_aux_layer_6": 0.0963134765625, "loss_aux_layer_7": 0.092041015625, "loss_aux_layer_8": 0.0904541015625, "loss_aux_layer_9": 0.0887451171875, "step": 1160, "total_loss": 0.7652067244052887 }, { "epoch": 0.22985547416353197, "grad_norm": 2.224433183670044, "learning_rate": 5e-05, "llm_loss": 0.7052761316299438, "loss": 3.2904, "loss_aux_layer_0": 0.0238037109375, "loss_aux_layer_1": 0.06329345703125, "loss_aux_layer_10": 0.0877685546875, "loss_aux_layer_11": 0.09423828125, "loss_aux_layer_12": 0.101806640625, "loss_aux_layer_13": 0.1094970703125, "loss_aux_layer_14": 0.1214599609375, "loss_aux_layer_15": 0.133056640625, "loss_aux_layer_16": 0.14453125, "loss_aux_layer_17": 0.15283203125, "loss_aux_layer_18": 0.16162109375, "loss_aux_layer_19": 0.1630859375, "loss_aux_layer_2": 0.072265625, "loss_aux_layer_20": 0.1689453125, "loss_aux_layer_21": 0.174560546875, "loss_aux_layer_22": 0.19482421875, "loss_aux_layer_23": 0.235595703125, "loss_aux_layer_3": 0.0841064453125, "loss_aux_layer_4": 0.0872802734375, "loss_aux_layer_5": 0.0892333984375, "loss_aux_layer_6": 0.092041015625, "loss_aux_layer_7": 0.0875244140625, "loss_aux_layer_8": 0.08642578125, "loss_aux_layer_9": 0.085693359375, "step": 1161, "total_loss": 0.8225942850112915 }, { "epoch": 0.23005345476143338, "grad_norm": 1.7655760049819946, "learning_rate": 5e-05, "llm_loss": 0.5767179578542709, "loss": 2.7757, "loss_aux_layer_0": 0.026611328125, "loss_aux_layer_1": 0.0655517578125, "loss_aux_layer_10": 0.0885009765625, "loss_aux_layer_11": 0.093994140625, "loss_aux_layer_12": 0.1014404296875, "loss_aux_layer_13": 0.1092529296875, "loss_aux_layer_14": 0.1209716796875, "loss_aux_layer_15": 0.131591796875, "loss_aux_layer_16": 0.142578125, "loss_aux_layer_17": 0.15087890625, "loss_aux_layer_18": 0.159912109375, "loss_aux_layer_19": 0.1611328125, "loss_aux_layer_2": 0.0723876953125, "loss_aux_layer_20": 0.167236328125, "loss_aux_layer_21": 0.173583984375, "loss_aux_layer_22": 0.1943359375, "loss_aux_layer_23": 0.23583984375, "loss_aux_layer_3": 0.08447265625, "loss_aux_layer_4": 0.087890625, "loss_aux_layer_5": 0.0897216796875, "loss_aux_layer_6": 0.0927734375, "loss_aux_layer_7": 0.0888671875, "loss_aux_layer_8": 0.08740234375, "loss_aux_layer_9": 0.0865478515625, "step": 1162, "total_loss": 0.6939278692007065 }, { "epoch": 0.2302514353593348, "grad_norm": 1.4022902250289917, "learning_rate": 5e-05, "llm_loss": 0.5207635685801506, "loss": 2.5492, "loss_aux_layer_0": 0.025299072265625, "loss_aux_layer_1": 0.0635986328125, "loss_aux_layer_10": 0.0865478515625, "loss_aux_layer_11": 0.0921630859375, "loss_aux_layer_12": 0.099609375, "loss_aux_layer_13": 0.107177734375, "loss_aux_layer_14": 0.118896484375, "loss_aux_layer_15": 0.1297607421875, "loss_aux_layer_16": 0.14111328125, "loss_aux_layer_17": 0.14892578125, "loss_aux_layer_18": 0.15869140625, "loss_aux_layer_19": 0.161376953125, "loss_aux_layer_2": 0.0721435546875, "loss_aux_layer_20": 0.168212890625, "loss_aux_layer_21": 0.17578125, "loss_aux_layer_22": 0.1982421875, "loss_aux_layer_23": 0.2412109375, "loss_aux_layer_3": 0.0838623046875, "loss_aux_layer_4": 0.0867919921875, "loss_aux_layer_5": 0.0885009765625, "loss_aux_layer_6": 0.0916748046875, "loss_aux_layer_7": 0.0875244140625, "loss_aux_layer_8": 0.0858154296875, "loss_aux_layer_9": 0.084716796875, "step": 1163, "total_loss": 0.6372931897640228 }, { "epoch": 0.2304494159572362, "grad_norm": 2.5379862785339355, "learning_rate": 5e-05, "llm_loss": 0.595915898680687, "loss": 2.8576, "loss_aux_layer_0": 0.023895263671875, "loss_aux_layer_1": 0.06634521484375, "loss_aux_layer_10": 0.08984375, "loss_aux_layer_11": 0.095458984375, "loss_aux_layer_12": 0.10302734375, "loss_aux_layer_13": 0.1103515625, "loss_aux_layer_14": 0.12158203125, "loss_aux_layer_15": 0.1318359375, "loss_aux_layer_16": 0.142578125, "loss_aux_layer_17": 0.151123046875, "loss_aux_layer_18": 0.159912109375, "loss_aux_layer_19": 0.160400390625, "loss_aux_layer_2": 0.0760498046875, "loss_aux_layer_20": 0.1669921875, "loss_aux_layer_21": 0.173828125, "loss_aux_layer_22": 0.1962890625, "loss_aux_layer_23": 0.237060546875, "loss_aux_layer_3": 0.0882568359375, "loss_aux_layer_4": 0.0911865234375, "loss_aux_layer_5": 0.092529296875, "loss_aux_layer_6": 0.09521484375, "loss_aux_layer_7": 0.0909423828125, "loss_aux_layer_8": 0.0894775390625, "loss_aux_layer_9": 0.08837890625, "step": 1164, "total_loss": 0.7143992781639099 }, { "epoch": 0.2306473965551376, "grad_norm": 1.480021595954895, "learning_rate": 5e-05, "llm_loss": 0.6111943125724792, "loss": 2.9242, "loss_aux_layer_0": 0.024658203125, "loss_aux_layer_1": 0.0679931640625, "loss_aux_layer_10": 0.0902099609375, "loss_aux_layer_11": 0.09619140625, "loss_aux_layer_12": 0.103759765625, "loss_aux_layer_13": 0.111328125, "loss_aux_layer_14": 0.1229248046875, "loss_aux_layer_15": 0.1337890625, "loss_aux_layer_16": 0.14453125, "loss_aux_layer_17": 0.151611328125, "loss_aux_layer_18": 0.16064453125, "loss_aux_layer_19": 0.161865234375, "loss_aux_layer_2": 0.0762939453125, "loss_aux_layer_20": 0.16943359375, "loss_aux_layer_21": 0.177490234375, "loss_aux_layer_22": 0.2001953125, "loss_aux_layer_23": 0.242431640625, "loss_aux_layer_3": 0.0882568359375, "loss_aux_layer_4": 0.091552734375, "loss_aux_layer_5": 0.09375, "loss_aux_layer_6": 0.096435546875, "loss_aux_layer_7": 0.092041015625, "loss_aux_layer_8": 0.0906982421875, "loss_aux_layer_9": 0.0889892578125, "step": 1165, "total_loss": 0.7310487776994705 }, { "epoch": 0.230845377153039, "grad_norm": 1.9823311567306519, "learning_rate": 5e-05, "llm_loss": 0.6422764509916306, "loss": 3.0228, "loss_aux_layer_0": 0.0255126953125, "loss_aux_layer_1": 0.05987548828125, "loss_aux_layer_10": 0.0833740234375, "loss_aux_layer_11": 0.088623046875, "loss_aux_layer_12": 0.09619140625, "loss_aux_layer_13": 0.1038818359375, "loss_aux_layer_14": 0.11669921875, "loss_aux_layer_15": 0.128662109375, "loss_aux_layer_16": 0.1396484375, "loss_aux_layer_17": 0.14794921875, "loss_aux_layer_18": 0.1572265625, "loss_aux_layer_19": 0.158935546875, "loss_aux_layer_2": 0.068115234375, "loss_aux_layer_20": 0.166015625, "loss_aux_layer_21": 0.172119140625, "loss_aux_layer_22": 0.193115234375, "loss_aux_layer_23": 0.234375, "loss_aux_layer_3": 0.0797119140625, "loss_aux_layer_4": 0.082275390625, "loss_aux_layer_5": 0.084228515625, "loss_aux_layer_6": 0.0872802734375, "loss_aux_layer_7": 0.0836181640625, "loss_aux_layer_8": 0.08251953125, "loss_aux_layer_9": 0.0816650390625, "step": 1166, "total_loss": 0.755694255232811 }, { "epoch": 0.2310433577509404, "grad_norm": 2.170109272003174, "learning_rate": 5e-05, "llm_loss": 0.6570770442485809, "loss": 3.0849, "loss_aux_layer_0": 0.02532958984375, "loss_aux_layer_1": 0.06170654296875, "loss_aux_layer_10": 0.0836181640625, "loss_aux_layer_11": 0.0887451171875, "loss_aux_layer_12": 0.0965576171875, "loss_aux_layer_13": 0.1048583984375, "loss_aux_layer_14": 0.1170654296875, "loss_aux_layer_15": 0.12890625, "loss_aux_layer_16": 0.140380859375, "loss_aux_layer_17": 0.14892578125, "loss_aux_layer_18": 0.15869140625, "loss_aux_layer_19": 0.16015625, "loss_aux_layer_2": 0.0701904296875, "loss_aux_layer_20": 0.16650390625, "loss_aux_layer_21": 0.17333984375, "loss_aux_layer_22": 0.193359375, "loss_aux_layer_23": 0.234130859375, "loss_aux_layer_3": 0.0814208984375, "loss_aux_layer_4": 0.0841064453125, "loss_aux_layer_5": 0.0855712890625, "loss_aux_layer_6": 0.0875244140625, "loss_aux_layer_7": 0.0833740234375, "loss_aux_layer_8": 0.082763671875, "loss_aux_layer_9": 0.0816650390625, "step": 1167, "total_loss": 0.7712259590625763 }, { "epoch": 0.2312413383488418, "grad_norm": 1.0648163557052612, "learning_rate": 5e-05, "llm_loss": 0.6596725583076477, "loss": 3.091, "loss_aux_layer_0": 0.02362060546875, "loss_aux_layer_1": 0.0618896484375, "loss_aux_layer_10": 0.0849609375, "loss_aux_layer_11": 0.0902099609375, "loss_aux_layer_12": 0.0972900390625, "loss_aux_layer_13": 0.1043701171875, "loss_aux_layer_14": 0.1156005859375, "loss_aux_layer_15": 0.1265869140625, "loss_aux_layer_16": 0.137451171875, "loss_aux_layer_17": 0.145751953125, "loss_aux_layer_18": 0.154296875, "loss_aux_layer_19": 0.156005859375, "loss_aux_layer_2": 0.0704345703125, "loss_aux_layer_20": 0.16259765625, "loss_aux_layer_21": 0.168701171875, "loss_aux_layer_22": 0.188232421875, "loss_aux_layer_23": 0.228759765625, "loss_aux_layer_3": 0.082275390625, "loss_aux_layer_4": 0.0853271484375, "loss_aux_layer_5": 0.0870361328125, "loss_aux_layer_6": 0.0894775390625, "loss_aux_layer_7": 0.085693359375, "loss_aux_layer_8": 0.0845947265625, "loss_aux_layer_9": 0.0833740234375, "step": 1168, "total_loss": 0.7727511674165726 }, { "epoch": 0.23143931894674322, "grad_norm": 1.546248435974121, "learning_rate": 5e-05, "llm_loss": 0.6008226126432419, "loss": 2.8769, "loss_aux_layer_0": 0.025421142578125, "loss_aux_layer_1": 0.0640869140625, "loss_aux_layer_10": 0.088623046875, "loss_aux_layer_11": 0.0941162109375, "loss_aux_layer_12": 0.10205078125, "loss_aux_layer_13": 0.109619140625, "loss_aux_layer_14": 0.120849609375, "loss_aux_layer_15": 0.1318359375, "loss_aux_layer_16": 0.142578125, "loss_aux_layer_17": 0.15087890625, "loss_aux_layer_18": 0.15966796875, "loss_aux_layer_19": 0.161865234375, "loss_aux_layer_2": 0.0732421875, "loss_aux_layer_20": 0.169189453125, "loss_aux_layer_21": 0.177978515625, "loss_aux_layer_22": 0.202392578125, "loss_aux_layer_23": 0.244384765625, "loss_aux_layer_3": 0.0859375, "loss_aux_layer_4": 0.088623046875, "loss_aux_layer_5": 0.090087890625, "loss_aux_layer_6": 0.093017578125, "loss_aux_layer_7": 0.0894775390625, "loss_aux_layer_8": 0.088623046875, "loss_aux_layer_9": 0.0870361328125, "step": 1169, "total_loss": 0.7192335277795792 }, { "epoch": 0.23163729954464463, "grad_norm": 1.2373130321502686, "learning_rate": 5e-05, "llm_loss": 0.6414952725172043, "loss": 3.0349, "loss_aux_layer_0": 0.025970458984375, "loss_aux_layer_1": 0.06500244140625, "loss_aux_layer_10": 0.087890625, "loss_aux_layer_11": 0.09326171875, "loss_aux_layer_12": 0.100830078125, "loss_aux_layer_13": 0.108154296875, "loss_aux_layer_14": 0.119873046875, "loss_aux_layer_15": 0.130859375, "loss_aux_layer_16": 0.141845703125, "loss_aux_layer_17": 0.149658203125, "loss_aux_layer_18": 0.158203125, "loss_aux_layer_19": 0.160888671875, "loss_aux_layer_2": 0.0732421875, "loss_aux_layer_20": 0.16845703125, "loss_aux_layer_21": 0.17529296875, "loss_aux_layer_22": 0.19775390625, "loss_aux_layer_23": 0.23828125, "loss_aux_layer_3": 0.0849609375, "loss_aux_layer_4": 0.0882568359375, "loss_aux_layer_5": 0.08984375, "loss_aux_layer_6": 0.0927734375, "loss_aux_layer_7": 0.0889892578125, "loss_aux_layer_8": 0.0875244140625, "loss_aux_layer_9": 0.08642578125, "step": 1170, "total_loss": 0.7587328106164932 }, { "epoch": 0.23183528014254604, "grad_norm": 1.667547583580017, "learning_rate": 5e-05, "llm_loss": 0.6654282212257385, "loss": 3.13, "loss_aux_layer_0": 0.024658203125, "loss_aux_layer_1": 0.0634765625, "loss_aux_layer_10": 0.0877685546875, "loss_aux_layer_11": 0.09326171875, "loss_aux_layer_12": 0.1007080078125, "loss_aux_layer_13": 0.1085205078125, "loss_aux_layer_14": 0.1199951171875, "loss_aux_layer_15": 0.130859375, "loss_aux_layer_16": 0.142578125, "loss_aux_layer_17": 0.150390625, "loss_aux_layer_18": 0.159423828125, "loss_aux_layer_19": 0.161376953125, "loss_aux_layer_2": 0.0723876953125, "loss_aux_layer_20": 0.16845703125, "loss_aux_layer_21": 0.17529296875, "loss_aux_layer_22": 0.19677734375, "loss_aux_layer_23": 0.238037109375, "loss_aux_layer_3": 0.0848388671875, "loss_aux_layer_4": 0.08740234375, "loss_aux_layer_5": 0.089111328125, "loss_aux_layer_6": 0.0919189453125, "loss_aux_layer_7": 0.0885009765625, "loss_aux_layer_8": 0.087646484375, "loss_aux_layer_9": 0.0863037109375, "step": 1171, "total_loss": 0.7825042903423309 }, { "epoch": 0.23203326074044744, "grad_norm": 2.029050588607788, "learning_rate": 5e-05, "llm_loss": 0.6170593798160553, "loss": 2.9459, "loss_aux_layer_0": 0.026641845703125, "loss_aux_layer_1": 0.06585693359375, "loss_aux_layer_10": 0.0906982421875, "loss_aux_layer_11": 0.09619140625, "loss_aux_layer_12": 0.1031494140625, "loss_aux_layer_13": 0.1107177734375, "loss_aux_layer_14": 0.1217041015625, "loss_aux_layer_15": 0.1328125, "loss_aux_layer_16": 0.14306640625, "loss_aux_layer_17": 0.150634765625, "loss_aux_layer_18": 0.159423828125, "loss_aux_layer_19": 0.160888671875, "loss_aux_layer_2": 0.076416015625, "loss_aux_layer_20": 0.168212890625, "loss_aux_layer_21": 0.176025390625, "loss_aux_layer_22": 0.198486328125, "loss_aux_layer_23": 0.240478515625, "loss_aux_layer_3": 0.0892333984375, "loss_aux_layer_4": 0.09228515625, "loss_aux_layer_5": 0.093994140625, "loss_aux_layer_6": 0.0966796875, "loss_aux_layer_7": 0.0921630859375, "loss_aux_layer_8": 0.0909423828125, "loss_aux_layer_9": 0.08935546875, "step": 1172, "total_loss": 0.7364870607852936 }, { "epoch": 0.23223124133834885, "grad_norm": 1.9444036483764648, "learning_rate": 5e-05, "llm_loss": 0.6347829550504684, "loss": 3.002, "loss_aux_layer_0": 0.03228759765625, "loss_aux_layer_1": 0.06756591796875, "loss_aux_layer_10": 0.0870361328125, "loss_aux_layer_11": 0.0924072265625, "loss_aux_layer_12": 0.099609375, "loss_aux_layer_13": 0.1068115234375, "loss_aux_layer_14": 0.1187744140625, "loss_aux_layer_15": 0.1298828125, "loss_aux_layer_16": 0.140869140625, "loss_aux_layer_17": 0.148681640625, "loss_aux_layer_18": 0.156982421875, "loss_aux_layer_19": 0.158447265625, "loss_aux_layer_2": 0.0721435546875, "loss_aux_layer_20": 0.16455078125, "loss_aux_layer_21": 0.169921875, "loss_aux_layer_22": 0.189208984375, "loss_aux_layer_23": 0.228759765625, "loss_aux_layer_3": 0.0838623046875, "loss_aux_layer_4": 0.086669921875, "loss_aux_layer_5": 0.089111328125, "loss_aux_layer_6": 0.092041015625, "loss_aux_layer_7": 0.08837890625, "loss_aux_layer_8": 0.0870361328125, "loss_aux_layer_9": 0.085693359375, "step": 1173, "total_loss": 0.7505026012659073 }, { "epoch": 0.23242922193625024, "grad_norm": 2.0458426475524902, "learning_rate": 5e-05, "llm_loss": 0.5282999202609062, "loss": 2.5883, "loss_aux_layer_0": 0.024993896484375, "loss_aux_layer_1": 0.0660400390625, "loss_aux_layer_10": 0.089111328125, "loss_aux_layer_11": 0.0950927734375, "loss_aux_layer_12": 0.1026611328125, "loss_aux_layer_13": 0.1104736328125, "loss_aux_layer_14": 0.1217041015625, "loss_aux_layer_15": 0.1331787109375, "loss_aux_layer_16": 0.144775390625, "loss_aux_layer_17": 0.152587890625, "loss_aux_layer_18": 0.1611328125, "loss_aux_layer_19": 0.162353515625, "loss_aux_layer_2": 0.074951171875, "loss_aux_layer_20": 0.168701171875, "loss_aux_layer_21": 0.17578125, "loss_aux_layer_22": 0.19873046875, "loss_aux_layer_23": 0.240234375, "loss_aux_layer_3": 0.0867919921875, "loss_aux_layer_4": 0.0897216796875, "loss_aux_layer_5": 0.0908203125, "loss_aux_layer_6": 0.09375, "loss_aux_layer_7": 0.08984375, "loss_aux_layer_8": 0.088623046875, "loss_aux_layer_9": 0.08740234375, "step": 1174, "total_loss": 0.647072896361351 }, { "epoch": 0.23262720253415164, "grad_norm": 1.8034933805465698, "learning_rate": 5e-05, "llm_loss": 0.6303353756666183, "loss": 3.0066, "loss_aux_layer_0": 0.024810791015625, "loss_aux_layer_1": 0.0687255859375, "loss_aux_layer_10": 0.0936279296875, "loss_aux_layer_11": 0.0997314453125, "loss_aux_layer_12": 0.1068115234375, "loss_aux_layer_13": 0.114501953125, "loss_aux_layer_14": 0.1256103515625, "loss_aux_layer_15": 0.135986328125, "loss_aux_layer_16": 0.14697265625, "loss_aux_layer_17": 0.15380859375, "loss_aux_layer_18": 0.1630859375, "loss_aux_layer_19": 0.16357421875, "loss_aux_layer_2": 0.0771484375, "loss_aux_layer_20": 0.1689453125, "loss_aux_layer_21": 0.1748046875, "loss_aux_layer_22": 0.195556640625, "loss_aux_layer_23": 0.235107421875, "loss_aux_layer_3": 0.0908203125, "loss_aux_layer_4": 0.094482421875, "loss_aux_layer_5": 0.0965576171875, "loss_aux_layer_6": 0.099609375, "loss_aux_layer_7": 0.0958251953125, "loss_aux_layer_8": 0.0938720703125, "loss_aux_layer_9": 0.0919189453125, "step": 1175, "total_loss": 0.7516587823629379 }, { "epoch": 0.23282518313205305, "grad_norm": 1.2764654159545898, "learning_rate": 5e-05, "llm_loss": 0.5767592266201973, "loss": 2.7704, "loss_aux_layer_0": 0.0252685546875, "loss_aux_layer_1": 0.06280517578125, "loss_aux_layer_10": 0.0880126953125, "loss_aux_layer_11": 0.0931396484375, "loss_aux_layer_12": 0.10009765625, "loss_aux_layer_13": 0.107177734375, "loss_aux_layer_14": 0.1195068359375, "loss_aux_layer_15": 0.1298828125, "loss_aux_layer_16": 0.140869140625, "loss_aux_layer_17": 0.14892578125, "loss_aux_layer_18": 0.157470703125, "loss_aux_layer_19": 0.1591796875, "loss_aux_layer_2": 0.0716552734375, "loss_aux_layer_20": 0.166015625, "loss_aux_layer_21": 0.171875, "loss_aux_layer_22": 0.1923828125, "loss_aux_layer_23": 0.232177734375, "loss_aux_layer_3": 0.083740234375, "loss_aux_layer_4": 0.0869140625, "loss_aux_layer_5": 0.088623046875, "loss_aux_layer_6": 0.091796875, "loss_aux_layer_7": 0.0889892578125, "loss_aux_layer_8": 0.087646484375, "loss_aux_layer_9": 0.0863037109375, "step": 1176, "total_loss": 0.6926029622554779 }, { "epoch": 0.23302316372995446, "grad_norm": 1.8427644968032837, "learning_rate": 5e-05, "llm_loss": 0.7475440800189972, "loss": 3.4466, "loss_aux_layer_0": 0.02789306640625, "loss_aux_layer_1": 0.06390380859375, "loss_aux_layer_10": 0.0850830078125, "loss_aux_layer_11": 0.0904541015625, "loss_aux_layer_12": 0.0972900390625, "loss_aux_layer_13": 0.1044921875, "loss_aux_layer_14": 0.1160888671875, "loss_aux_layer_15": 0.126953125, "loss_aux_layer_16": 0.137939453125, "loss_aux_layer_17": 0.145751953125, "loss_aux_layer_18": 0.154541015625, "loss_aux_layer_19": 0.156005859375, "loss_aux_layer_2": 0.0723876953125, "loss_aux_layer_20": 0.162841796875, "loss_aux_layer_21": 0.169189453125, "loss_aux_layer_22": 0.191650390625, "loss_aux_layer_23": 0.231689453125, "loss_aux_layer_3": 0.0836181640625, "loss_aux_layer_4": 0.0865478515625, "loss_aux_layer_5": 0.0877685546875, "loss_aux_layer_6": 0.0909423828125, "loss_aux_layer_7": 0.0869140625, "loss_aux_layer_8": 0.0850830078125, "loss_aux_layer_9": 0.083740234375, "step": 1177, "total_loss": 0.8616484999656677 }, { "epoch": 0.23322114432785587, "grad_norm": 1.2640572786331177, "learning_rate": 5e-05, "llm_loss": 0.7037684470415115, "loss": 3.2524, "loss_aux_layer_0": 0.02349853515625, "loss_aux_layer_1": 0.0574951171875, "loss_aux_layer_10": 0.079345703125, "loss_aux_layer_11": 0.0848388671875, "loss_aux_layer_12": 0.0924072265625, "loss_aux_layer_13": 0.100341796875, "loss_aux_layer_14": 0.11181640625, "loss_aux_layer_15": 0.123779296875, "loss_aux_layer_16": 0.13525390625, "loss_aux_layer_17": 0.143798828125, "loss_aux_layer_18": 0.153076171875, "loss_aux_layer_19": 0.154541015625, "loss_aux_layer_2": 0.06396484375, "loss_aux_layer_20": 0.162109375, "loss_aux_layer_21": 0.16845703125, "loss_aux_layer_22": 0.1884765625, "loss_aux_layer_23": 0.228515625, "loss_aux_layer_3": 0.0755615234375, "loss_aux_layer_4": 0.078125, "loss_aux_layer_5": 0.0797119140625, "loss_aux_layer_6": 0.082763671875, "loss_aux_layer_7": 0.0791015625, "loss_aux_layer_8": 0.0787353515625, "loss_aux_layer_9": 0.077880859375, "step": 1178, "total_loss": 0.8131082952022552 }, { "epoch": 0.23341912492575728, "grad_norm": 1.1804587841033936, "learning_rate": 5e-05, "llm_loss": 0.657636359333992, "loss": 3.1004, "loss_aux_layer_0": 0.0267333984375, "loss_aux_layer_1": 0.06494140625, "loss_aux_layer_10": 0.0880126953125, "loss_aux_layer_11": 0.0936279296875, "loss_aux_layer_12": 0.1014404296875, "loss_aux_layer_13": 0.1092529296875, "loss_aux_layer_14": 0.120849609375, "loss_aux_layer_15": 0.131591796875, "loss_aux_layer_16": 0.142578125, "loss_aux_layer_17": 0.15087890625, "loss_aux_layer_18": 0.15966796875, "loss_aux_layer_19": 0.1611328125, "loss_aux_layer_2": 0.0736083984375, "loss_aux_layer_20": 0.167724609375, "loss_aux_layer_21": 0.17431640625, "loss_aux_layer_22": 0.19580078125, "loss_aux_layer_23": 0.2373046875, "loss_aux_layer_3": 0.0848388671875, "loss_aux_layer_4": 0.0877685546875, "loss_aux_layer_5": 0.0902099609375, "loss_aux_layer_6": 0.0931396484375, "loss_aux_layer_7": 0.0887451171875, "loss_aux_layer_8": 0.08740234375, "loss_aux_layer_9": 0.0863037109375, "step": 1179, "total_loss": 0.7750978916883469 }, { "epoch": 0.2336171055236587, "grad_norm": 1.2638981342315674, "learning_rate": 5e-05, "llm_loss": 0.5687796175479889, "loss": 2.7327, "loss_aux_layer_0": 0.026458740234375, "loss_aux_layer_1": 0.06298828125, "loss_aux_layer_10": 0.0841064453125, "loss_aux_layer_11": 0.0894775390625, "loss_aux_layer_12": 0.096923828125, "loss_aux_layer_13": 0.1044921875, "loss_aux_layer_14": 0.1163330078125, "loss_aux_layer_15": 0.12841796875, "loss_aux_layer_16": 0.139892578125, "loss_aux_layer_17": 0.1474609375, "loss_aux_layer_18": 0.1572265625, "loss_aux_layer_19": 0.159423828125, "loss_aux_layer_2": 0.0704345703125, "loss_aux_layer_20": 0.166259765625, "loss_aux_layer_21": 0.17333984375, "loss_aux_layer_22": 0.195068359375, "loss_aux_layer_23": 0.237060546875, "loss_aux_layer_3": 0.0814208984375, "loss_aux_layer_4": 0.083740234375, "loss_aux_layer_5": 0.0855712890625, "loss_aux_layer_6": 0.0887451171875, "loss_aux_layer_7": 0.08447265625, "loss_aux_layer_8": 0.08349609375, "loss_aux_layer_9": 0.08251953125, "step": 1180, "total_loss": 0.6831744313240051 }, { "epoch": 0.2338150861215601, "grad_norm": 1.2199187278747559, "learning_rate": 5e-05, "llm_loss": 0.6697566956281662, "loss": 3.1392, "loss_aux_layer_0": 0.028411865234375, "loss_aux_layer_1": 0.06390380859375, "loss_aux_layer_10": 0.0858154296875, "loss_aux_layer_11": 0.0909423828125, "loss_aux_layer_12": 0.097900390625, "loss_aux_layer_13": 0.1051025390625, "loss_aux_layer_14": 0.1168212890625, "loss_aux_layer_15": 0.1278076171875, "loss_aux_layer_16": 0.139892578125, "loss_aux_layer_17": 0.1474609375, "loss_aux_layer_18": 0.156982421875, "loss_aux_layer_19": 0.159423828125, "loss_aux_layer_2": 0.0703125, "loss_aux_layer_20": 0.166259765625, "loss_aux_layer_21": 0.17236328125, "loss_aux_layer_22": 0.193603515625, "loss_aux_layer_23": 0.234130859375, "loss_aux_layer_3": 0.0826416015625, "loss_aux_layer_4": 0.0860595703125, "loss_aux_layer_5": 0.087890625, "loss_aux_layer_6": 0.090576171875, "loss_aux_layer_7": 0.086669921875, "loss_aux_layer_8": 0.0855712890625, "loss_aux_layer_9": 0.084228515625, "step": 1181, "total_loss": 0.7848071604967117 }, { "epoch": 0.23401306671946148, "grad_norm": 1.238169550895691, "learning_rate": 5e-05, "llm_loss": 0.6048736125230789, "loss": 2.8983, "loss_aux_layer_0": 0.02557373046875, "loss_aux_layer_1": 0.0648193359375, "loss_aux_layer_10": 0.08984375, "loss_aux_layer_11": 0.0953369140625, "loss_aux_layer_12": 0.1024169921875, "loss_aux_layer_13": 0.1099853515625, "loss_aux_layer_14": 0.1219482421875, "loss_aux_layer_15": 0.133056640625, "loss_aux_layer_16": 0.144775390625, "loss_aux_layer_17": 0.152587890625, "loss_aux_layer_18": 0.161865234375, "loss_aux_layer_19": 0.16455078125, "loss_aux_layer_2": 0.075439453125, "loss_aux_layer_20": 0.17138671875, "loss_aux_layer_21": 0.178466796875, "loss_aux_layer_22": 0.201171875, "loss_aux_layer_23": 0.243408203125, "loss_aux_layer_3": 0.087646484375, "loss_aux_layer_4": 0.09033203125, "loss_aux_layer_5": 0.0924072265625, "loss_aux_layer_6": 0.0950927734375, "loss_aux_layer_7": 0.091552734375, "loss_aux_layer_8": 0.0902099609375, "loss_aux_layer_9": 0.088623046875, "step": 1182, "total_loss": 0.7245812118053436 }, { "epoch": 0.2342110473173629, "grad_norm": 1.272699236869812, "learning_rate": 5e-05, "llm_loss": 0.6183622628450394, "loss": 2.9279, "loss_aux_layer_0": 0.02783203125, "loss_aux_layer_1": 0.0626220703125, "loss_aux_layer_10": 0.0848388671875, "loss_aux_layer_11": 0.090087890625, "loss_aux_layer_12": 0.096923828125, "loss_aux_layer_13": 0.1041259765625, "loss_aux_layer_14": 0.1151123046875, "loss_aux_layer_15": 0.126220703125, "loss_aux_layer_16": 0.1376953125, "loss_aux_layer_17": 0.145751953125, "loss_aux_layer_18": 0.15478515625, "loss_aux_layer_19": 0.157470703125, "loss_aux_layer_2": 0.0697021484375, "loss_aux_layer_20": 0.163818359375, "loss_aux_layer_21": 0.171142578125, "loss_aux_layer_22": 0.192626953125, "loss_aux_layer_23": 0.23388671875, "loss_aux_layer_3": 0.081298828125, "loss_aux_layer_4": 0.083984375, "loss_aux_layer_5": 0.08544921875, "loss_aux_layer_6": 0.08837890625, "loss_aux_layer_7": 0.0850830078125, "loss_aux_layer_8": 0.0841064453125, "loss_aux_layer_9": 0.083251953125, "step": 1183, "total_loss": 0.7319763451814651 }, { "epoch": 0.2344090279152643, "grad_norm": 1.9648065567016602, "learning_rate": 5e-05, "llm_loss": 0.6395069658756256, "loss": 3.0178, "loss_aux_layer_0": 0.0235595703125, "loss_aux_layer_1": 0.0609130859375, "loss_aux_layer_10": 0.0858154296875, "loss_aux_layer_11": 0.0919189453125, "loss_aux_layer_12": 0.0994873046875, "loss_aux_layer_13": 0.1070556640625, "loss_aux_layer_14": 0.1185302734375, "loss_aux_layer_15": 0.130126953125, "loss_aux_layer_16": 0.14111328125, "loss_aux_layer_17": 0.1494140625, "loss_aux_layer_18": 0.157958984375, "loss_aux_layer_19": 0.159423828125, "loss_aux_layer_2": 0.070556640625, "loss_aux_layer_20": 0.166015625, "loss_aux_layer_21": 0.171630859375, "loss_aux_layer_22": 0.19140625, "loss_aux_layer_23": 0.231201171875, "loss_aux_layer_3": 0.0826416015625, "loss_aux_layer_4": 0.085693359375, "loss_aux_layer_5": 0.087646484375, "loss_aux_layer_6": 0.09033203125, "loss_aux_layer_7": 0.0865478515625, "loss_aux_layer_8": 0.08544921875, "loss_aux_layer_9": 0.0843505859375, "step": 1184, "total_loss": 0.7544465363025665 }, { "epoch": 0.2346070085131657, "grad_norm": 2.590104818344116, "learning_rate": 5e-05, "llm_loss": 0.6463034749031067, "loss": 3.0475, "loss_aux_layer_0": 0.023834228515625, "loss_aux_layer_1": 0.06280517578125, "loss_aux_layer_10": 0.086181640625, "loss_aux_layer_11": 0.0916748046875, "loss_aux_layer_12": 0.09912109375, "loss_aux_layer_13": 0.107177734375, "loss_aux_layer_14": 0.1190185546875, "loss_aux_layer_15": 0.130126953125, "loss_aux_layer_16": 0.14111328125, "loss_aux_layer_17": 0.149169921875, "loss_aux_layer_18": 0.158935546875, "loss_aux_layer_19": 0.159912109375, "loss_aux_layer_2": 0.0712890625, "loss_aux_layer_20": 0.1669921875, "loss_aux_layer_21": 0.173583984375, "loss_aux_layer_22": 0.193359375, "loss_aux_layer_23": 0.233642578125, "loss_aux_layer_3": 0.0830078125, "loss_aux_layer_4": 0.086181640625, "loss_aux_layer_5": 0.0877685546875, "loss_aux_layer_6": 0.09033203125, "loss_aux_layer_7": 0.08740234375, "loss_aux_layer_8": 0.08642578125, "loss_aux_layer_9": 0.0849609375, "step": 1185, "total_loss": 0.7618861496448517 }, { "epoch": 0.23480498911106712, "grad_norm": 2.0421695709228516, "learning_rate": 5e-05, "llm_loss": 0.6237212717533112, "loss": 2.9872, "loss_aux_layer_0": 0.025665283203125, "loss_aux_layer_1": 0.06884765625, "loss_aux_layer_10": 0.09375, "loss_aux_layer_11": 0.0997314453125, "loss_aux_layer_12": 0.1072998046875, "loss_aux_layer_13": 0.1153564453125, "loss_aux_layer_14": 0.1282958984375, "loss_aux_layer_15": 0.139892578125, "loss_aux_layer_16": 0.15087890625, "loss_aux_layer_17": 0.158447265625, "loss_aux_layer_18": 0.167236328125, "loss_aux_layer_19": 0.16796875, "loss_aux_layer_2": 0.0765380859375, "loss_aux_layer_20": 0.173583984375, "loss_aux_layer_21": 0.178955078125, "loss_aux_layer_22": 0.199462890625, "loss_aux_layer_23": 0.240478515625, "loss_aux_layer_3": 0.0908203125, "loss_aux_layer_4": 0.0946044921875, "loss_aux_layer_5": 0.0966796875, "loss_aux_layer_6": 0.0994873046875, "loss_aux_layer_7": 0.094970703125, "loss_aux_layer_8": 0.0936279296875, "loss_aux_layer_9": 0.0921630859375, "step": 1186, "total_loss": 0.7468082159757614 }, { "epoch": 0.23500296970896853, "grad_norm": 1.6651413440704346, "learning_rate": 5e-05, "llm_loss": 0.6672796756029129, "loss": 3.1374, "loss_aux_layer_0": 0.026214599609375, "loss_aux_layer_1": 0.06158447265625, "loss_aux_layer_10": 0.08642578125, "loss_aux_layer_11": 0.091796875, "loss_aux_layer_12": 0.0989990234375, "loss_aux_layer_13": 0.1068115234375, "loss_aux_layer_14": 0.11962890625, "loss_aux_layer_15": 0.131591796875, "loss_aux_layer_16": 0.143310546875, "loss_aux_layer_17": 0.151611328125, "loss_aux_layer_18": 0.160888671875, "loss_aux_layer_19": 0.162841796875, "loss_aux_layer_2": 0.0716552734375, "loss_aux_layer_20": 0.169921875, "loss_aux_layer_21": 0.17724609375, "loss_aux_layer_22": 0.19921875, "loss_aux_layer_23": 0.2421875, "loss_aux_layer_3": 0.0830078125, "loss_aux_layer_4": 0.0860595703125, "loss_aux_layer_5": 0.0887451171875, "loss_aux_layer_6": 0.0921630859375, "loss_aux_layer_7": 0.08740234375, "loss_aux_layer_8": 0.086181640625, "loss_aux_layer_9": 0.0850830078125, "step": 1187, "total_loss": 0.7843538075685501 }, { "epoch": 0.23520095030686994, "grad_norm": 2.9337973594665527, "learning_rate": 5e-05, "llm_loss": 0.6595662385225296, "loss": 3.1073, "loss_aux_layer_0": 0.023529052734375, "loss_aux_layer_1": 0.0638427734375, "loss_aux_layer_10": 0.0865478515625, "loss_aux_layer_11": 0.0924072265625, "loss_aux_layer_12": 0.1002197265625, "loss_aux_layer_13": 0.1087646484375, "loss_aux_layer_14": 0.12109375, "loss_aux_layer_15": 0.133056640625, "loss_aux_layer_16": 0.144287109375, "loss_aux_layer_17": 0.152099609375, "loss_aux_layer_18": 0.160888671875, "loss_aux_layer_19": 0.162353515625, "loss_aux_layer_2": 0.0731201171875, "loss_aux_layer_20": 0.1689453125, "loss_aux_layer_21": 0.176025390625, "loss_aux_layer_22": 0.197509765625, "loss_aux_layer_23": 0.23974609375, "loss_aux_layer_3": 0.084228515625, "loss_aux_layer_4": 0.0870361328125, "loss_aux_layer_5": 0.08837890625, "loss_aux_layer_6": 0.0919189453125, "loss_aux_layer_7": 0.0875244140625, "loss_aux_layer_8": 0.0859375, "loss_aux_layer_9": 0.0848388671875, "step": 1188, "total_loss": 0.7768183052539825 }, { "epoch": 0.23539893090477132, "grad_norm": 2.5854554176330566, "learning_rate": 5e-05, "llm_loss": 0.5954093486070633, "loss": 2.8519, "loss_aux_layer_0": 0.024200439453125, "loss_aux_layer_1": 0.0614013671875, "loss_aux_layer_10": 0.088134765625, "loss_aux_layer_11": 0.0936279296875, "loss_aux_layer_12": 0.100830078125, "loss_aux_layer_13": 0.10888671875, "loss_aux_layer_14": 0.1212158203125, "loss_aux_layer_15": 0.13330078125, "loss_aux_layer_16": 0.14404296875, "loss_aux_layer_17": 0.15234375, "loss_aux_layer_18": 0.161376953125, "loss_aux_layer_19": 0.162841796875, "loss_aux_layer_2": 0.0748291015625, "loss_aux_layer_20": 0.168701171875, "loss_aux_layer_21": 0.1748046875, "loss_aux_layer_22": 0.195068359375, "loss_aux_layer_23": 0.23583984375, "loss_aux_layer_3": 0.08544921875, "loss_aux_layer_4": 0.089111328125, "loss_aux_layer_5": 0.09130859375, "loss_aux_layer_6": 0.0927734375, "loss_aux_layer_7": 0.0882568359375, "loss_aux_layer_8": 0.0867919921875, "loss_aux_layer_9": 0.0859375, "step": 1189, "total_loss": 0.7129832655191422 }, { "epoch": 0.23559691150267273, "grad_norm": 2.644176721572876, "learning_rate": 5e-05, "llm_loss": 0.6755333095788956, "loss": 3.1767, "loss_aux_layer_0": 0.025787353515625, "loss_aux_layer_1": 0.065185546875, "loss_aux_layer_10": 0.088623046875, "loss_aux_layer_11": 0.0943603515625, "loss_aux_layer_12": 0.1011962890625, "loss_aux_layer_13": 0.10888671875, "loss_aux_layer_14": 0.120849609375, "loss_aux_layer_15": 0.131591796875, "loss_aux_layer_16": 0.143310546875, "loss_aux_layer_17": 0.150390625, "loss_aux_layer_18": 0.1591796875, "loss_aux_layer_19": 0.159912109375, "loss_aux_layer_2": 0.0787353515625, "loss_aux_layer_20": 0.167236328125, "loss_aux_layer_21": 0.174560546875, "loss_aux_layer_22": 0.197998046875, "loss_aux_layer_23": 0.23974609375, "loss_aux_layer_3": 0.0882568359375, "loss_aux_layer_4": 0.091796875, "loss_aux_layer_5": 0.0950927734375, "loss_aux_layer_6": 0.09716796875, "loss_aux_layer_7": 0.09130859375, "loss_aux_layer_8": 0.08935546875, "loss_aux_layer_9": 0.0872802734375, "step": 1190, "total_loss": 0.794177234172821 }, { "epoch": 0.23579489210057414, "grad_norm": 4.157247543334961, "learning_rate": 5e-05, "llm_loss": 0.6882794350385666, "loss": 3.2219, "loss_aux_layer_0": 0.026947021484375, "loss_aux_layer_1": 0.06451416015625, "loss_aux_layer_10": 0.087158203125, "loss_aux_layer_11": 0.0921630859375, "loss_aux_layer_12": 0.0994873046875, "loss_aux_layer_13": 0.107177734375, "loss_aux_layer_14": 0.1202392578125, "loss_aux_layer_15": 0.1317138671875, "loss_aux_layer_16": 0.143310546875, "loss_aux_layer_17": 0.1513671875, "loss_aux_layer_18": 0.16064453125, "loss_aux_layer_19": 0.163330078125, "loss_aux_layer_2": 0.074951171875, "loss_aux_layer_20": 0.169677734375, "loss_aux_layer_21": 0.17529296875, "loss_aux_layer_22": 0.193603515625, "loss_aux_layer_23": 0.234130859375, "loss_aux_layer_3": 0.0860595703125, "loss_aux_layer_4": 0.0887451171875, "loss_aux_layer_5": 0.090087890625, "loss_aux_layer_6": 0.092529296875, "loss_aux_layer_7": 0.0877685546875, "loss_aux_layer_8": 0.0865478515625, "loss_aux_layer_9": 0.0853271484375, "step": 1191, "total_loss": 0.8054726868867874 }, { "epoch": 0.23599287269847555, "grad_norm": 2.481997013092041, "learning_rate": 5e-05, "llm_loss": 0.6007043048739433, "loss": 2.8728, "loss_aux_layer_0": 0.024139404296875, "loss_aux_layer_1": 0.06488037109375, "loss_aux_layer_10": 0.0889892578125, "loss_aux_layer_11": 0.094482421875, "loss_aux_layer_12": 0.1014404296875, "loss_aux_layer_13": 0.108642578125, "loss_aux_layer_14": 0.1195068359375, "loss_aux_layer_15": 0.1302490234375, "loss_aux_layer_16": 0.140869140625, "loss_aux_layer_17": 0.148681640625, "loss_aux_layer_18": 0.15771484375, "loss_aux_layer_19": 0.15869140625, "loss_aux_layer_2": 0.0750732421875, "loss_aux_layer_20": 0.1650390625, "loss_aux_layer_21": 0.17236328125, "loss_aux_layer_22": 0.1962890625, "loss_aux_layer_23": 0.23681640625, "loss_aux_layer_3": 0.088623046875, "loss_aux_layer_4": 0.0914306640625, "loss_aux_layer_5": 0.09326171875, "loss_aux_layer_6": 0.095947265625, "loss_aux_layer_7": 0.090576171875, "loss_aux_layer_8": 0.089111328125, "loss_aux_layer_9": 0.0875244140625, "step": 1192, "total_loss": 0.7182012647390366 }, { "epoch": 0.23619085329637696, "grad_norm": 1.5013189315795898, "learning_rate": 5e-05, "llm_loss": 0.6439436674118042, "loss": 3.0244, "loss_aux_layer_0": 0.024810791015625, "loss_aux_layer_1": 0.05999755859375, "loss_aux_layer_10": 0.0826416015625, "loss_aux_layer_11": 0.0880126953125, "loss_aux_layer_12": 0.09521484375, "loss_aux_layer_13": 0.1026611328125, "loss_aux_layer_14": 0.11474609375, "loss_aux_layer_15": 0.12646484375, "loss_aux_layer_16": 0.137939453125, "loss_aux_layer_17": 0.146240234375, "loss_aux_layer_18": 0.156005859375, "loss_aux_layer_19": 0.158447265625, "loss_aux_layer_2": 0.06781005859375, "loss_aux_layer_20": 0.165283203125, "loss_aux_layer_21": 0.17041015625, "loss_aux_layer_22": 0.189208984375, "loss_aux_layer_23": 0.22900390625, "loss_aux_layer_3": 0.0794677734375, "loss_aux_layer_4": 0.08251953125, "loss_aux_layer_5": 0.0838623046875, "loss_aux_layer_6": 0.0860595703125, "loss_aux_layer_7": 0.082275390625, "loss_aux_layer_8": 0.0814208984375, "loss_aux_layer_9": 0.08056640625, "step": 1193, "total_loss": 0.7560886144638062 }, { "epoch": 0.23638883389427837, "grad_norm": 2.609356641769409, "learning_rate": 5e-05, "llm_loss": 0.6204312518239021, "loss": 2.9411, "loss_aux_layer_0": 0.02691650390625, "loss_aux_layer_1": 0.06329345703125, "loss_aux_layer_10": 0.0850830078125, "loss_aux_layer_11": 0.0902099609375, "loss_aux_layer_12": 0.09716796875, "loss_aux_layer_13": 0.1048583984375, "loss_aux_layer_14": 0.1162109375, "loss_aux_layer_15": 0.127685546875, "loss_aux_layer_16": 0.138427734375, "loss_aux_layer_17": 0.146240234375, "loss_aux_layer_18": 0.156005859375, "loss_aux_layer_19": 0.157958984375, "loss_aux_layer_2": 0.0726318359375, "loss_aux_layer_20": 0.165283203125, "loss_aux_layer_21": 0.17236328125, "loss_aux_layer_22": 0.194091796875, "loss_aux_layer_23": 0.2373046875, "loss_aux_layer_3": 0.0826416015625, "loss_aux_layer_4": 0.0855712890625, "loss_aux_layer_5": 0.089599609375, "loss_aux_layer_6": 0.091064453125, "loss_aux_layer_7": 0.08642578125, "loss_aux_layer_8": 0.0848388671875, "loss_aux_layer_9": 0.0838623046875, "step": 1194, "total_loss": 0.7352679371833801 }, { "epoch": 0.23658681449217978, "grad_norm": 1.7492352724075317, "learning_rate": 5e-05, "llm_loss": 0.6597016304731369, "loss": 3.0899, "loss_aux_layer_0": 0.02435302734375, "loss_aux_layer_1": 0.0611572265625, "loss_aux_layer_10": 0.0838623046875, "loss_aux_layer_11": 0.089111328125, "loss_aux_layer_12": 0.0960693359375, "loss_aux_layer_13": 0.1036376953125, "loss_aux_layer_14": 0.115234375, "loss_aux_layer_15": 0.1263427734375, "loss_aux_layer_16": 0.137451171875, "loss_aux_layer_17": 0.14599609375, "loss_aux_layer_18": 0.155029296875, "loss_aux_layer_19": 0.157470703125, "loss_aux_layer_2": 0.0689697265625, "loss_aux_layer_20": 0.16455078125, "loss_aux_layer_21": 0.170654296875, "loss_aux_layer_22": 0.1904296875, "loss_aux_layer_23": 0.23095703125, "loss_aux_layer_3": 0.08056640625, "loss_aux_layer_4": 0.0833740234375, "loss_aux_layer_5": 0.085205078125, "loss_aux_layer_6": 0.0875244140625, "loss_aux_layer_7": 0.0838623046875, "loss_aux_layer_8": 0.082763671875, "loss_aux_layer_9": 0.0819091796875, "step": 1195, "total_loss": 0.7724756896495819 }, { "epoch": 0.23678479509008118, "grad_norm": 1.7415366172790527, "learning_rate": 5e-05, "llm_loss": 0.5851604416966438, "loss": 2.8156, "loss_aux_layer_0": 0.024169921875, "loss_aux_layer_1": 0.0655517578125, "loss_aux_layer_10": 0.0919189453125, "loss_aux_layer_11": 0.0972900390625, "loss_aux_layer_12": 0.1046142578125, "loss_aux_layer_13": 0.11181640625, "loss_aux_layer_14": 0.1226806640625, "loss_aux_layer_15": 0.133544921875, "loss_aux_layer_16": 0.14404296875, "loss_aux_layer_17": 0.15087890625, "loss_aux_layer_18": 0.158447265625, "loss_aux_layer_19": 0.1591796875, "loss_aux_layer_2": 0.0750732421875, "loss_aux_layer_20": 0.1650390625, "loss_aux_layer_21": 0.172119140625, "loss_aux_layer_22": 0.194580078125, "loss_aux_layer_23": 0.236083984375, "loss_aux_layer_3": 0.087646484375, "loss_aux_layer_4": 0.091064453125, "loss_aux_layer_5": 0.093017578125, "loss_aux_layer_6": 0.096435546875, "loss_aux_layer_7": 0.0928955078125, "loss_aux_layer_8": 0.0914306640625, "loss_aux_layer_9": 0.09033203125, "step": 1196, "total_loss": 0.7039067149162292 }, { "epoch": 0.23698277568798257, "grad_norm": 1.6467010974884033, "learning_rate": 5e-05, "llm_loss": 0.5787461996078491, "loss": 2.7572, "loss_aux_layer_0": 0.024566650390625, "loss_aux_layer_1": 0.058349609375, "loss_aux_layer_10": 0.0810546875, "loss_aux_layer_11": 0.0865478515625, "loss_aux_layer_12": 0.0933837890625, "loss_aux_layer_13": 0.1004638671875, "loss_aux_layer_14": 0.1116943359375, "loss_aux_layer_15": 0.1226806640625, "loss_aux_layer_16": 0.134033203125, "loss_aux_layer_17": 0.142333984375, "loss_aux_layer_18": 0.15185546875, "loss_aux_layer_19": 0.15478515625, "loss_aux_layer_2": 0.06744384765625, "loss_aux_layer_20": 0.162109375, "loss_aux_layer_21": 0.16943359375, "loss_aux_layer_22": 0.190673828125, "loss_aux_layer_23": 0.234130859375, "loss_aux_layer_3": 0.0782470703125, "loss_aux_layer_4": 0.08056640625, "loss_aux_layer_5": 0.0823974609375, "loss_aux_layer_6": 0.085205078125, "loss_aux_layer_7": 0.08154296875, "loss_aux_layer_8": 0.0804443359375, "loss_aux_layer_9": 0.079345703125, "step": 1197, "total_loss": 0.689301535487175 }, { "epoch": 0.23718075628588398, "grad_norm": 1.6777604818344116, "learning_rate": 5e-05, "llm_loss": 0.49048538506031036, "loss": 2.4443, "loss_aux_layer_0": 0.025360107421875, "loss_aux_layer_1": 0.06646728515625, "loss_aux_layer_10": 0.0928955078125, "loss_aux_layer_11": 0.0989990234375, "loss_aux_layer_12": 0.1063232421875, "loss_aux_layer_13": 0.114501953125, "loss_aux_layer_14": 0.12646484375, "loss_aux_layer_15": 0.137451171875, "loss_aux_layer_16": 0.147705078125, "loss_aux_layer_17": 0.15478515625, "loss_aux_layer_18": 0.16357421875, "loss_aux_layer_19": 0.16357421875, "loss_aux_layer_2": 0.07568359375, "loss_aux_layer_20": 0.1689453125, "loss_aux_layer_21": 0.1748046875, "loss_aux_layer_22": 0.19482421875, "loss_aux_layer_23": 0.2353515625, "loss_aux_layer_3": 0.0888671875, "loss_aux_layer_4": 0.091796875, "loss_aux_layer_5": 0.0941162109375, "loss_aux_layer_6": 0.0963134765625, "loss_aux_layer_7": 0.0924072265625, "loss_aux_layer_8": 0.091796875, "loss_aux_layer_9": 0.0908203125, "step": 1198, "total_loss": 0.611063688993454 }, { "epoch": 0.23737873688378539, "grad_norm": 1.0724941492080688, "learning_rate": 5e-05, "llm_loss": 0.5958815813064575, "loss": 2.8355, "loss_aux_layer_0": 0.02484130859375, "loss_aux_layer_1": 0.06060791015625, "loss_aux_layer_10": 0.0838623046875, "loss_aux_layer_11": 0.0892333984375, "loss_aux_layer_12": 0.0963134765625, "loss_aux_layer_13": 0.103759765625, "loss_aux_layer_14": 0.115966796875, "loss_aux_layer_15": 0.12744140625, "loss_aux_layer_16": 0.13916015625, "loss_aux_layer_17": 0.1474609375, "loss_aux_layer_18": 0.155517578125, "loss_aux_layer_19": 0.157470703125, "loss_aux_layer_2": 0.0684814453125, "loss_aux_layer_20": 0.16455078125, "loss_aux_layer_21": 0.170166015625, "loss_aux_layer_22": 0.191162109375, "loss_aux_layer_23": 0.232177734375, "loss_aux_layer_3": 0.080078125, "loss_aux_layer_4": 0.0826416015625, "loss_aux_layer_5": 0.0845947265625, "loss_aux_layer_6": 0.087646484375, "loss_aux_layer_7": 0.0836181640625, "loss_aux_layer_8": 0.082763671875, "loss_aux_layer_9": 0.0819091796875, "step": 1199, "total_loss": 0.7088863998651505 }, { "epoch": 0.2375767174816868, "grad_norm": 1.6559919118881226, "learning_rate": 5e-05, "llm_loss": 0.5869181826710701, "loss": 2.8144, "loss_aux_layer_0": 0.0245361328125, "loss_aux_layer_1": 0.0623779296875, "loss_aux_layer_10": 0.087890625, "loss_aux_layer_11": 0.09326171875, "loss_aux_layer_12": 0.1005859375, "loss_aux_layer_13": 0.108154296875, "loss_aux_layer_14": 0.1201171875, "loss_aux_layer_15": 0.13134765625, "loss_aux_layer_16": 0.142578125, "loss_aux_layer_17": 0.150634765625, "loss_aux_layer_18": 0.159423828125, "loss_aux_layer_19": 0.160888671875, "loss_aux_layer_2": 0.07232666015625, "loss_aux_layer_20": 0.167236328125, "loss_aux_layer_21": 0.174560546875, "loss_aux_layer_22": 0.195068359375, "loss_aux_layer_23": 0.23583984375, "loss_aux_layer_3": 0.0849609375, "loss_aux_layer_4": 0.0875244140625, "loss_aux_layer_5": 0.088623046875, "loss_aux_layer_6": 0.0916748046875, "loss_aux_layer_7": 0.087890625, "loss_aux_layer_8": 0.087158203125, "loss_aux_layer_9": 0.0860595703125, "step": 1200, "total_loss": 0.7036005556583405 }, { "epoch": 0.2377746980795882, "grad_norm": 1.0712339878082275, "learning_rate": 5e-05, "llm_loss": 0.5331423133611679, "loss": 2.5978, "loss_aux_layer_0": 0.026336669921875, "loss_aux_layer_1": 0.06329345703125, "loss_aux_layer_10": 0.08544921875, "loss_aux_layer_11": 0.091064453125, "loss_aux_layer_12": 0.09814453125, "loss_aux_layer_13": 0.106201171875, "loss_aux_layer_14": 0.11865234375, "loss_aux_layer_15": 0.1300048828125, "loss_aux_layer_16": 0.14111328125, "loss_aux_layer_17": 0.149169921875, "loss_aux_layer_18": 0.158447265625, "loss_aux_layer_19": 0.161865234375, "loss_aux_layer_2": 0.072509765625, "loss_aux_layer_20": 0.16845703125, "loss_aux_layer_21": 0.17724609375, "loss_aux_layer_22": 0.199462890625, "loss_aux_layer_23": 0.24365234375, "loss_aux_layer_3": 0.08349609375, "loss_aux_layer_4": 0.0855712890625, "loss_aux_layer_5": 0.0870361328125, "loss_aux_layer_6": 0.0894775390625, "loss_aux_layer_7": 0.0855712890625, "loss_aux_layer_8": 0.0845947265625, "loss_aux_layer_9": 0.0836181640625, "step": 1201, "total_loss": 0.6494374871253967 }, { "epoch": 0.2379726786774896, "grad_norm": 1.3697110414505005, "learning_rate": 5e-05, "llm_loss": 0.7451434284448624, "loss": 3.4276, "loss_aux_layer_0": 0.02740478515625, "loss_aux_layer_1": 0.0609130859375, "loss_aux_layer_10": 0.08154296875, "loss_aux_layer_11": 0.087158203125, "loss_aux_layer_12": 0.094482421875, "loss_aux_layer_13": 0.102294921875, "loss_aux_layer_14": 0.1146240234375, "loss_aux_layer_15": 0.12646484375, "loss_aux_layer_16": 0.13818359375, "loss_aux_layer_17": 0.14697265625, "loss_aux_layer_18": 0.15625, "loss_aux_layer_19": 0.158447265625, "loss_aux_layer_2": 0.06640625, "loss_aux_layer_20": 0.164794921875, "loss_aux_layer_21": 0.17041015625, "loss_aux_layer_22": 0.18994140625, "loss_aux_layer_23": 0.22900390625, "loss_aux_layer_3": 0.07763671875, "loss_aux_layer_4": 0.080322265625, "loss_aux_layer_5": 0.08203125, "loss_aux_layer_6": 0.0850830078125, "loss_aux_layer_7": 0.0816650390625, "loss_aux_layer_8": 0.080810546875, "loss_aux_layer_9": 0.0799560546875, "step": 1202, "total_loss": 0.8569087982177734 }, { "epoch": 0.23817065927539102, "grad_norm": 0.9790863394737244, "learning_rate": 5e-05, "llm_loss": 0.6043817847967148, "loss": 2.8773, "loss_aux_layer_0": 0.025177001953125, "loss_aux_layer_1": 0.06341552734375, "loss_aux_layer_10": 0.0863037109375, "loss_aux_layer_11": 0.0919189453125, "loss_aux_layer_12": 0.0987548828125, "loss_aux_layer_13": 0.106201171875, "loss_aux_layer_14": 0.1171875, "loss_aux_layer_15": 0.1279296875, "loss_aux_layer_16": 0.138671875, "loss_aux_layer_17": 0.14599609375, "loss_aux_layer_18": 0.155029296875, "loss_aux_layer_19": 0.15625, "loss_aux_layer_2": 0.0726318359375, "loss_aux_layer_20": 0.16259765625, "loss_aux_layer_21": 0.17138671875, "loss_aux_layer_22": 0.1943359375, "loss_aux_layer_23": 0.237060546875, "loss_aux_layer_3": 0.083984375, "loss_aux_layer_4": 0.0867919921875, "loss_aux_layer_5": 0.0885009765625, "loss_aux_layer_6": 0.0908203125, "loss_aux_layer_7": 0.0870361328125, "loss_aux_layer_8": 0.0859375, "loss_aux_layer_9": 0.084716796875, "step": 1203, "total_loss": 0.7193230986595154 }, { "epoch": 0.2383686398732924, "grad_norm": 0.8939967751502991, "learning_rate": 5e-05, "llm_loss": 0.577002763748169, "loss": 2.778, "loss_aux_layer_0": 0.02606201171875, "loss_aux_layer_1": 0.0654296875, "loss_aux_layer_10": 0.0888671875, "loss_aux_layer_11": 0.0943603515625, "loss_aux_layer_12": 0.10107421875, "loss_aux_layer_13": 0.10888671875, "loss_aux_layer_14": 0.1201171875, "loss_aux_layer_15": 0.130859375, "loss_aux_layer_16": 0.141845703125, "loss_aux_layer_17": 0.148681640625, "loss_aux_layer_18": 0.157470703125, "loss_aux_layer_19": 0.15966796875, "loss_aux_layer_2": 0.0736083984375, "loss_aux_layer_20": 0.16650390625, "loss_aux_layer_21": 0.1748046875, "loss_aux_layer_22": 0.19775390625, "loss_aux_layer_23": 0.239013671875, "loss_aux_layer_3": 0.0858154296875, "loss_aux_layer_4": 0.0885009765625, "loss_aux_layer_5": 0.09033203125, "loss_aux_layer_6": 0.0936279296875, "loss_aux_layer_7": 0.0899658203125, "loss_aux_layer_8": 0.0887451171875, "loss_aux_layer_9": 0.087158203125, "step": 1204, "total_loss": 0.6945060938596725 }, { "epoch": 0.2385666204711938, "grad_norm": 1.0050082206726074, "learning_rate": 5e-05, "llm_loss": 0.580662727355957, "loss": 2.7824, "loss_aux_layer_0": 0.02471923828125, "loss_aux_layer_1": 0.06219482421875, "loss_aux_layer_10": 0.0859375, "loss_aux_layer_11": 0.0914306640625, "loss_aux_layer_12": 0.098876953125, "loss_aux_layer_13": 0.1063232421875, "loss_aux_layer_14": 0.117431640625, "loss_aux_layer_15": 0.128662109375, "loss_aux_layer_16": 0.139404296875, "loss_aux_layer_17": 0.146728515625, "loss_aux_layer_18": 0.15625, "loss_aux_layer_19": 0.158447265625, "loss_aux_layer_2": 0.0703125, "loss_aux_layer_20": 0.1650390625, "loss_aux_layer_21": 0.172607421875, "loss_aux_layer_22": 0.19384765625, "loss_aux_layer_23": 0.236083984375, "loss_aux_layer_3": 0.082763671875, "loss_aux_layer_4": 0.08544921875, "loss_aux_layer_5": 0.0872802734375, "loss_aux_layer_6": 0.09033203125, "loss_aux_layer_7": 0.086669921875, "loss_aux_layer_8": 0.0859375, "loss_aux_layer_9": 0.08447265625, "step": 1205, "total_loss": 0.695603534579277 }, { "epoch": 0.23876460106909522, "grad_norm": 0.9936111569404602, "learning_rate": 5e-05, "llm_loss": 0.6792219281196594, "loss": 3.1834, "loss_aux_layer_0": 0.02386474609375, "loss_aux_layer_1": 0.06341552734375, "loss_aux_layer_10": 0.088623046875, "loss_aux_layer_11": 0.094482421875, "loss_aux_layer_12": 0.1014404296875, "loss_aux_layer_13": 0.10888671875, "loss_aux_layer_14": 0.1199951171875, "loss_aux_layer_15": 0.1307373046875, "loss_aux_layer_16": 0.141845703125, "loss_aux_layer_17": 0.14892578125, "loss_aux_layer_18": 0.158203125, "loss_aux_layer_19": 0.159423828125, "loss_aux_layer_2": 0.0723876953125, "loss_aux_layer_20": 0.165771484375, "loss_aux_layer_21": 0.1728515625, "loss_aux_layer_22": 0.1943359375, "loss_aux_layer_23": 0.235107421875, "loss_aux_layer_3": 0.0850830078125, "loss_aux_layer_4": 0.088134765625, "loss_aux_layer_5": 0.0897216796875, "loss_aux_layer_6": 0.0927734375, "loss_aux_layer_7": 0.0889892578125, "loss_aux_layer_8": 0.087646484375, "loss_aux_layer_9": 0.0869140625, "step": 1206, "total_loss": 0.7958477139472961 }, { "epoch": 0.23896258166699663, "grad_norm": 0.8300755620002747, "learning_rate": 5e-05, "llm_loss": 0.5745754688978195, "loss": 2.7837, "loss_aux_layer_0": 0.026336669921875, "loss_aux_layer_1": 0.0679931640625, "loss_aux_layer_10": 0.0927734375, "loss_aux_layer_11": 0.098388671875, "loss_aux_layer_12": 0.105224609375, "loss_aux_layer_13": 0.1129150390625, "loss_aux_layer_14": 0.124267578125, "loss_aux_layer_15": 0.1351318359375, "loss_aux_layer_16": 0.14599609375, "loss_aux_layer_17": 0.154052734375, "loss_aux_layer_18": 0.162353515625, "loss_aux_layer_19": 0.164794921875, "loss_aux_layer_2": 0.0767822265625, "loss_aux_layer_20": 0.17041015625, "loss_aux_layer_21": 0.177978515625, "loss_aux_layer_22": 0.200439453125, "loss_aux_layer_23": 0.242919921875, "loss_aux_layer_3": 0.08984375, "loss_aux_layer_4": 0.093017578125, "loss_aux_layer_5": 0.0948486328125, "loss_aux_layer_6": 0.0977783203125, "loss_aux_layer_7": 0.093505859375, "loss_aux_layer_8": 0.092529296875, "loss_aux_layer_9": 0.0911865234375, "step": 1207, "total_loss": 0.6959162056446075 }, { "epoch": 0.23916056226489804, "grad_norm": 1.086415410041809, "learning_rate": 5e-05, "llm_loss": 0.6846066564321518, "loss": 3.1857, "loss_aux_layer_0": 0.023773193359375, "loss_aux_layer_1": 0.05914306640625, "loss_aux_layer_10": 0.0838623046875, "loss_aux_layer_11": 0.089111328125, "loss_aux_layer_12": 0.0966796875, "loss_aux_layer_13": 0.10400390625, "loss_aux_layer_14": 0.115478515625, "loss_aux_layer_15": 0.1265869140625, "loss_aux_layer_16": 0.1376953125, "loss_aux_layer_17": 0.145263671875, "loss_aux_layer_18": 0.154052734375, "loss_aux_layer_19": 0.15576171875, "loss_aux_layer_2": 0.06781005859375, "loss_aux_layer_20": 0.162841796875, "loss_aux_layer_21": 0.167236328125, "loss_aux_layer_22": 0.186279296875, "loss_aux_layer_23": 0.2255859375, "loss_aux_layer_3": 0.07958984375, "loss_aux_layer_4": 0.0823974609375, "loss_aux_layer_5": 0.0841064453125, "loss_aux_layer_6": 0.0870361328125, "loss_aux_layer_7": 0.08349609375, "loss_aux_layer_8": 0.083251953125, "loss_aux_layer_9": 0.08203125, "step": 1208, "total_loss": 0.7964247912168503 }, { "epoch": 0.23935854286279945, "grad_norm": 1.0191084146499634, "learning_rate": 5e-05, "llm_loss": 0.5691576898097992, "loss": 2.7276, "loss_aux_layer_0": 0.02410888671875, "loss_aux_layer_1": 0.060546875, "loss_aux_layer_10": 0.083740234375, "loss_aux_layer_11": 0.089599609375, "loss_aux_layer_12": 0.0963134765625, "loss_aux_layer_13": 0.104248046875, "loss_aux_layer_14": 0.1153564453125, "loss_aux_layer_15": 0.12646484375, "loss_aux_layer_16": 0.137451171875, "loss_aux_layer_17": 0.145751953125, "loss_aux_layer_18": 0.154296875, "loss_aux_layer_19": 0.156494140625, "loss_aux_layer_2": 0.0687255859375, "loss_aux_layer_20": 0.16357421875, "loss_aux_layer_21": 0.17041015625, "loss_aux_layer_22": 0.1904296875, "loss_aux_layer_23": 0.23046875, "loss_aux_layer_3": 0.08056640625, "loss_aux_layer_4": 0.0833740234375, "loss_aux_layer_5": 0.085205078125, "loss_aux_layer_6": 0.088134765625, "loss_aux_layer_7": 0.084716796875, "loss_aux_layer_8": 0.0838623046875, "loss_aux_layer_9": 0.0823974609375, "step": 1209, "total_loss": 0.6819020062685013 }, { "epoch": 0.23955652346070086, "grad_norm": 0.9519187808036804, "learning_rate": 5e-05, "llm_loss": 0.6325609982013702, "loss": 2.9877, "loss_aux_layer_0": 0.02374267578125, "loss_aux_layer_1": 0.06298828125, "loss_aux_layer_10": 0.0858154296875, "loss_aux_layer_11": 0.09130859375, "loss_aux_layer_12": 0.0982666015625, "loss_aux_layer_13": 0.106201171875, "loss_aux_layer_14": 0.1177978515625, "loss_aux_layer_15": 0.1287841796875, "loss_aux_layer_16": 0.139892578125, "loss_aux_layer_17": 0.148193359375, "loss_aux_layer_18": 0.156982421875, "loss_aux_layer_19": 0.15771484375, "loss_aux_layer_2": 0.0701904296875, "loss_aux_layer_20": 0.16455078125, "loss_aux_layer_21": 0.170166015625, "loss_aux_layer_22": 0.190185546875, "loss_aux_layer_23": 0.22998046875, "loss_aux_layer_3": 0.082763671875, "loss_aux_layer_4": 0.08544921875, "loss_aux_layer_5": 0.0872802734375, "loss_aux_layer_6": 0.090087890625, "loss_aux_layer_7": 0.086181640625, "loss_aux_layer_8": 0.0855712890625, "loss_aux_layer_9": 0.084228515625, "step": 1210, "total_loss": 0.7469264417886734 }, { "epoch": 0.23975450405860227, "grad_norm": 1.0429269075393677, "learning_rate": 5e-05, "llm_loss": 0.5155173093080521, "loss": 2.5228, "loss_aux_layer_0": 0.02386474609375, "loss_aux_layer_1": 0.0618896484375, "loss_aux_layer_10": 0.0858154296875, "loss_aux_layer_11": 0.0916748046875, "loss_aux_layer_12": 0.0987548828125, "loss_aux_layer_13": 0.1068115234375, "loss_aux_layer_14": 0.117919921875, "loss_aux_layer_15": 0.1290283203125, "loss_aux_layer_16": 0.1396484375, "loss_aux_layer_17": 0.147705078125, "loss_aux_layer_18": 0.15625, "loss_aux_layer_19": 0.158447265625, "loss_aux_layer_2": 0.07080078125, "loss_aux_layer_20": 0.165771484375, "loss_aux_layer_21": 0.17431640625, "loss_aux_layer_22": 0.19677734375, "loss_aux_layer_23": 0.2392578125, "loss_aux_layer_3": 0.08251953125, "loss_aux_layer_4": 0.0853271484375, "loss_aux_layer_5": 0.0870361328125, "loss_aux_layer_6": 0.0897216796875, "loss_aux_layer_7": 0.0858154296875, "loss_aux_layer_8": 0.0848388671875, "loss_aux_layer_9": 0.083984375, "step": 1211, "total_loss": 0.6307009905576706 }, { "epoch": 0.23995248465650365, "grad_norm": 1.390647530555725, "learning_rate": 5e-05, "llm_loss": 0.6467273384332657, "loss": 3.0428, "loss_aux_layer_0": 0.023956298828125, "loss_aux_layer_1": 0.06146240234375, "loss_aux_layer_10": 0.0849609375, "loss_aux_layer_11": 0.090576171875, "loss_aux_layer_12": 0.097412109375, "loss_aux_layer_13": 0.1048583984375, "loss_aux_layer_14": 0.1162109375, "loss_aux_layer_15": 0.1273193359375, "loss_aux_layer_16": 0.138427734375, "loss_aux_layer_17": 0.146484375, "loss_aux_layer_18": 0.1552734375, "loss_aux_layer_19": 0.15771484375, "loss_aux_layer_2": 0.069580078125, "loss_aux_layer_20": 0.165283203125, "loss_aux_layer_21": 0.171630859375, "loss_aux_layer_22": 0.193115234375, "loss_aux_layer_23": 0.23291015625, "loss_aux_layer_3": 0.0816650390625, "loss_aux_layer_4": 0.0850830078125, "loss_aux_layer_5": 0.0869140625, "loss_aux_layer_6": 0.08984375, "loss_aux_layer_7": 0.0860595703125, "loss_aux_layer_8": 0.0850830078125, "loss_aux_layer_9": 0.083740234375, "step": 1212, "total_loss": 0.7606911212205887 }, { "epoch": 0.24015046525440506, "grad_norm": 0.9257344007492065, "learning_rate": 5e-05, "llm_loss": 0.6786288768053055, "loss": 3.1699, "loss_aux_layer_0": 0.024322509765625, "loss_aux_layer_1": 0.059814453125, "loss_aux_layer_10": 0.0828857421875, "loss_aux_layer_11": 0.08837890625, "loss_aux_layer_12": 0.095458984375, "loss_aux_layer_13": 0.103271484375, "loss_aux_layer_14": 0.1153564453125, "loss_aux_layer_15": 0.1275634765625, "loss_aux_layer_16": 0.13916015625, "loss_aux_layer_17": 0.148193359375, "loss_aux_layer_18": 0.158203125, "loss_aux_layer_19": 0.1611328125, "loss_aux_layer_2": 0.0684814453125, "loss_aux_layer_20": 0.168701171875, "loss_aux_layer_21": 0.174560546875, "loss_aux_layer_22": 0.196044921875, "loss_aux_layer_23": 0.237060546875, "loss_aux_layer_3": 0.080078125, "loss_aux_layer_4": 0.0831298828125, "loss_aux_layer_5": 0.0848388671875, "loss_aux_layer_6": 0.0880126953125, "loss_aux_layer_7": 0.083740234375, "loss_aux_layer_8": 0.0823974609375, "loss_aux_layer_9": 0.0811767578125, "step": 1213, "total_loss": 0.7924715131521225 }, { "epoch": 0.24034844585230647, "grad_norm": 1.0353111028671265, "learning_rate": 5e-05, "llm_loss": 0.669820249080658, "loss": 3.1394, "loss_aux_layer_0": 0.02337646484375, "loss_aux_layer_1": 0.06243896484375, "loss_aux_layer_10": 0.0867919921875, "loss_aux_layer_11": 0.092041015625, "loss_aux_layer_12": 0.099365234375, "loss_aux_layer_13": 0.1068115234375, "loss_aux_layer_14": 0.11865234375, "loss_aux_layer_15": 0.1295166015625, "loss_aux_layer_16": 0.139892578125, "loss_aux_layer_17": 0.1474609375, "loss_aux_layer_18": 0.156494140625, "loss_aux_layer_19": 0.15771484375, "loss_aux_layer_2": 0.0706787109375, "loss_aux_layer_20": 0.16455078125, "loss_aux_layer_21": 0.171142578125, "loss_aux_layer_22": 0.192138671875, "loss_aux_layer_23": 0.231689453125, "loss_aux_layer_3": 0.08349609375, "loss_aux_layer_4": 0.086669921875, "loss_aux_layer_5": 0.08837890625, "loss_aux_layer_6": 0.0914306640625, "loss_aux_layer_7": 0.0877685546875, "loss_aux_layer_8": 0.0865478515625, "loss_aux_layer_9": 0.0853271484375, "step": 1214, "total_loss": 0.7848472148180008 }, { "epoch": 0.24054642645020788, "grad_norm": 1.13998281955719, "learning_rate": 5e-05, "llm_loss": 0.6997064799070358, "loss": 3.2542, "loss_aux_layer_0": 0.02484130859375, "loss_aux_layer_1": 0.06280517578125, "loss_aux_layer_10": 0.0860595703125, "loss_aux_layer_11": 0.0919189453125, "loss_aux_layer_12": 0.0987548828125, "loss_aux_layer_13": 0.1055908203125, "loss_aux_layer_14": 0.1168212890625, "loss_aux_layer_15": 0.1273193359375, "loss_aux_layer_16": 0.1376953125, "loss_aux_layer_17": 0.1455078125, "loss_aux_layer_18": 0.154541015625, "loss_aux_layer_19": 0.156005859375, "loss_aux_layer_2": 0.0701904296875, "loss_aux_layer_20": 0.16259765625, "loss_aux_layer_21": 0.169189453125, "loss_aux_layer_22": 0.19140625, "loss_aux_layer_23": 0.23046875, "loss_aux_layer_3": 0.0823974609375, "loss_aux_layer_4": 0.0855712890625, "loss_aux_layer_5": 0.087158203125, "loss_aux_layer_6": 0.08984375, "loss_aux_layer_7": 0.0865478515625, "loss_aux_layer_8": 0.08544921875, "loss_aux_layer_9": 0.084228515625, "step": 1215, "total_loss": 0.8135537505149841 }, { "epoch": 0.2407444070481093, "grad_norm": 0.8573062419891357, "learning_rate": 5e-05, "llm_loss": 0.6103245615959167, "loss": 2.8951, "loss_aux_layer_0": 0.02484130859375, "loss_aux_layer_1": 0.0609130859375, "loss_aux_layer_10": 0.0845947265625, "loss_aux_layer_11": 0.0897216796875, "loss_aux_layer_12": 0.0965576171875, "loss_aux_layer_13": 0.104248046875, "loss_aux_layer_14": 0.115966796875, "loss_aux_layer_15": 0.126953125, "loss_aux_layer_16": 0.13818359375, "loss_aux_layer_17": 0.146484375, "loss_aux_layer_18": 0.156005859375, "loss_aux_layer_19": 0.157958984375, "loss_aux_layer_2": 0.0687255859375, "loss_aux_layer_20": 0.165283203125, "loss_aux_layer_21": 0.171142578125, "loss_aux_layer_22": 0.19189453125, "loss_aux_layer_23": 0.232421875, "loss_aux_layer_3": 0.0809326171875, "loss_aux_layer_4": 0.08349609375, "loss_aux_layer_5": 0.0855712890625, "loss_aux_layer_6": 0.08837890625, "loss_aux_layer_7": 0.0848388671875, "loss_aux_layer_8": 0.083984375, "loss_aux_layer_9": 0.0828857421875, "step": 1216, "total_loss": 0.7237804979085922 }, { "epoch": 0.2409423876460107, "grad_norm": 1.1789166927337646, "learning_rate": 5e-05, "llm_loss": 0.6100417226552963, "loss": 2.898, "loss_aux_layer_0": 0.026824951171875, "loss_aux_layer_1": 0.061767578125, "loss_aux_layer_10": 0.0848388671875, "loss_aux_layer_11": 0.0902099609375, "loss_aux_layer_12": 0.09716796875, "loss_aux_layer_13": 0.1044921875, "loss_aux_layer_14": 0.1162109375, "loss_aux_layer_15": 0.127197265625, "loss_aux_layer_16": 0.138427734375, "loss_aux_layer_17": 0.146728515625, "loss_aux_layer_18": 0.15673828125, "loss_aux_layer_19": 0.16015625, "loss_aux_layer_2": 0.0687255859375, "loss_aux_layer_20": 0.1669921875, "loss_aux_layer_21": 0.173583984375, "loss_aux_layer_22": 0.197021484375, "loss_aux_layer_23": 0.239013671875, "loss_aux_layer_3": 0.0806884765625, "loss_aux_layer_4": 0.083251953125, "loss_aux_layer_5": 0.08544921875, "loss_aux_layer_6": 0.08837890625, "loss_aux_layer_7": 0.084716796875, "loss_aux_layer_8": 0.084228515625, "loss_aux_layer_9": 0.083251953125, "step": 1217, "total_loss": 0.7244937568902969 }, { "epoch": 0.2411403682439121, "grad_norm": 1.0635770559310913, "learning_rate": 5e-05, "llm_loss": 0.6427600830793381, "loss": 3.0013, "loss_aux_layer_0": 0.025115966796875, "loss_aux_layer_1": 0.05474853515625, "loss_aux_layer_10": 0.078125, "loss_aux_layer_11": 0.0831298828125, "loss_aux_layer_12": 0.0899658203125, "loss_aux_layer_13": 0.0975341796875, "loss_aux_layer_14": 0.109375, "loss_aux_layer_15": 0.12060546875, "loss_aux_layer_16": 0.132568359375, "loss_aux_layer_17": 0.141357421875, "loss_aux_layer_18": 0.151123046875, "loss_aux_layer_19": 0.154052734375, "loss_aux_layer_2": 0.06170654296875, "loss_aux_layer_20": 0.161865234375, "loss_aux_layer_21": 0.16796875, "loss_aux_layer_22": 0.187255859375, "loss_aux_layer_23": 0.22705078125, "loss_aux_layer_3": 0.0728759765625, "loss_aux_layer_4": 0.0758056640625, "loss_aux_layer_5": 0.0777587890625, "loss_aux_layer_6": 0.080810546875, "loss_aux_layer_7": 0.07763671875, "loss_aux_layer_8": 0.0770263671875, "loss_aux_layer_9": 0.0765380859375, "step": 1218, "total_loss": 0.7503143697977066 }, { "epoch": 0.24133834884181352, "grad_norm": 1.063928484916687, "learning_rate": 5e-05, "llm_loss": 0.6609726995229721, "loss": 3.1116, "loss_aux_layer_0": 0.030975341796875, "loss_aux_layer_1": 0.0672607421875, "loss_aux_layer_10": 0.087646484375, "loss_aux_layer_11": 0.093505859375, "loss_aux_layer_12": 0.1002197265625, "loss_aux_layer_13": 0.1077880859375, "loss_aux_layer_14": 0.119140625, "loss_aux_layer_15": 0.1304931640625, "loss_aux_layer_16": 0.14111328125, "loss_aux_layer_17": 0.149169921875, "loss_aux_layer_18": 0.15869140625, "loss_aux_layer_19": 0.160400390625, "loss_aux_layer_2": 0.072021484375, "loss_aux_layer_20": 0.16748046875, "loss_aux_layer_21": 0.174072265625, "loss_aux_layer_22": 0.196044921875, "loss_aux_layer_23": 0.236572265625, "loss_aux_layer_3": 0.084228515625, "loss_aux_layer_4": 0.0870361328125, "loss_aux_layer_5": 0.0887451171875, "loss_aux_layer_6": 0.0919189453125, "loss_aux_layer_7": 0.0882568359375, "loss_aux_layer_8": 0.087158203125, "loss_aux_layer_9": 0.0859375, "step": 1219, "total_loss": 0.777894601225853 }, { "epoch": 0.2415363294397149, "grad_norm": 0.865451991558075, "learning_rate": 5e-05, "llm_loss": 0.5941593423485756, "loss": 2.8125, "loss_aux_layer_0": 0.02349853515625, "loss_aux_layer_1": 0.05609130859375, "loss_aux_layer_10": 0.0806884765625, "loss_aux_layer_11": 0.085693359375, "loss_aux_layer_12": 0.092041015625, "loss_aux_layer_13": 0.099365234375, "loss_aux_layer_14": 0.1107177734375, "loss_aux_layer_15": 0.1219482421875, "loss_aux_layer_16": 0.13330078125, "loss_aux_layer_17": 0.141357421875, "loss_aux_layer_18": 0.150634765625, "loss_aux_layer_19": 0.153076171875, "loss_aux_layer_2": 0.064208984375, "loss_aux_layer_20": 0.16064453125, "loss_aux_layer_21": 0.167724609375, "loss_aux_layer_22": 0.1875, "loss_aux_layer_23": 0.227783203125, "loss_aux_layer_3": 0.0758056640625, "loss_aux_layer_4": 0.07861328125, "loss_aux_layer_5": 0.080810546875, "loss_aux_layer_6": 0.083984375, "loss_aux_layer_7": 0.0806884765625, "loss_aux_layer_8": 0.0797119140625, "loss_aux_layer_9": 0.0791015625, "step": 1220, "total_loss": 0.7031333073973656 }, { "epoch": 0.2417343100376163, "grad_norm": 1.1634656190872192, "learning_rate": 5e-05, "llm_loss": 0.5720752328634262, "loss": 2.7422, "loss_aux_layer_0": 0.02447509765625, "loss_aux_layer_1": 0.06280517578125, "loss_aux_layer_10": 0.085205078125, "loss_aux_layer_11": 0.090576171875, "loss_aux_layer_12": 0.097412109375, "loss_aux_layer_13": 0.1051025390625, "loss_aux_layer_14": 0.116455078125, "loss_aux_layer_15": 0.1265869140625, "loss_aux_layer_16": 0.13818359375, "loss_aux_layer_17": 0.145751953125, "loss_aux_layer_18": 0.15478515625, "loss_aux_layer_19": 0.156005859375, "loss_aux_layer_2": 0.06982421875, "loss_aux_layer_20": 0.163330078125, "loss_aux_layer_21": 0.16943359375, "loss_aux_layer_22": 0.190185546875, "loss_aux_layer_23": 0.230712890625, "loss_aux_layer_3": 0.08203125, "loss_aux_layer_4": 0.08447265625, "loss_aux_layer_5": 0.0863037109375, "loss_aux_layer_6": 0.0894775390625, "loss_aux_layer_7": 0.0858154296875, "loss_aux_layer_8": 0.0848388671875, "loss_aux_layer_9": 0.083740234375, "step": 1221, "total_loss": 0.6855453103780746 }, { "epoch": 0.24193229063551772, "grad_norm": 1.2396323680877686, "learning_rate": 5e-05, "llm_loss": 0.6914525181055069, "loss": 3.2163, "loss_aux_layer_0": 0.024505615234375, "loss_aux_layer_1": 0.0604248046875, "loss_aux_layer_10": 0.0845947265625, "loss_aux_layer_11": 0.0899658203125, "loss_aux_layer_12": 0.09716796875, "loss_aux_layer_13": 0.1046142578125, "loss_aux_layer_14": 0.1163330078125, "loss_aux_layer_15": 0.12744140625, "loss_aux_layer_16": 0.138427734375, "loss_aux_layer_17": 0.146240234375, "loss_aux_layer_18": 0.154052734375, "loss_aux_layer_19": 0.1552734375, "loss_aux_layer_2": 0.06884765625, "loss_aux_layer_20": 0.162109375, "loss_aux_layer_21": 0.1669921875, "loss_aux_layer_22": 0.187744140625, "loss_aux_layer_23": 0.227294921875, "loss_aux_layer_3": 0.081298828125, "loss_aux_layer_4": 0.0841064453125, "loss_aux_layer_5": 0.0855712890625, "loss_aux_layer_6": 0.088134765625, "loss_aux_layer_7": 0.084716796875, "loss_aux_layer_8": 0.083984375, "loss_aux_layer_9": 0.082763671875, "step": 1222, "total_loss": 0.8040708005428314 }, { "epoch": 0.24213027123341913, "grad_norm": 1.8128119707107544, "learning_rate": 5e-05, "llm_loss": 0.5966124385595322, "loss": 2.86, "loss_aux_layer_0": 0.02508544921875, "loss_aux_layer_1": 0.06610107421875, "loss_aux_layer_10": 0.090087890625, "loss_aux_layer_11": 0.095947265625, "loss_aux_layer_12": 0.1024169921875, "loss_aux_layer_13": 0.1097412109375, "loss_aux_layer_14": 0.1202392578125, "loss_aux_layer_15": 0.1309814453125, "loss_aux_layer_16": 0.142578125, "loss_aux_layer_17": 0.149658203125, "loss_aux_layer_18": 0.1591796875, "loss_aux_layer_19": 0.160888671875, "loss_aux_layer_2": 0.0750732421875, "loss_aux_layer_20": 0.16748046875, "loss_aux_layer_21": 0.174072265625, "loss_aux_layer_22": 0.197265625, "loss_aux_layer_23": 0.238525390625, "loss_aux_layer_3": 0.087646484375, "loss_aux_layer_4": 0.090576171875, "loss_aux_layer_5": 0.0924072265625, "loss_aux_layer_6": 0.0953369140625, "loss_aux_layer_7": 0.091552734375, "loss_aux_layer_8": 0.090087890625, "loss_aux_layer_9": 0.0885009765625, "step": 1223, "total_loss": 0.7149886637926102 }, { "epoch": 0.24232825183132053, "grad_norm": 2.327989101409912, "learning_rate": 5e-05, "llm_loss": 0.5872641205787659, "loss": 2.8038, "loss_aux_layer_0": 0.024871826171875, "loss_aux_layer_1": 0.06219482421875, "loss_aux_layer_10": 0.0841064453125, "loss_aux_layer_11": 0.0894775390625, "loss_aux_layer_12": 0.0965576171875, "loss_aux_layer_13": 0.104736328125, "loss_aux_layer_14": 0.1165771484375, "loss_aux_layer_15": 0.1278076171875, "loss_aux_layer_16": 0.138916015625, "loss_aux_layer_17": 0.14697265625, "loss_aux_layer_18": 0.156494140625, "loss_aux_layer_19": 0.15771484375, "loss_aux_layer_2": 0.0693359375, "loss_aux_layer_20": 0.163818359375, "loss_aux_layer_21": 0.171630859375, "loss_aux_layer_22": 0.19189453125, "loss_aux_layer_23": 0.2333984375, "loss_aux_layer_3": 0.0814208984375, "loss_aux_layer_4": 0.084228515625, "loss_aux_layer_5": 0.0860595703125, "loss_aux_layer_6": 0.0888671875, "loss_aux_layer_7": 0.0849609375, "loss_aux_layer_8": 0.083740234375, "loss_aux_layer_9": 0.0823974609375, "step": 1224, "total_loss": 0.7009425759315491 }, { "epoch": 0.24252623242922194, "grad_norm": 1.7478291988372803, "learning_rate": 5e-05, "llm_loss": 0.6419389843940735, "loss": 3.0309, "loss_aux_layer_0": 0.0235595703125, "loss_aux_layer_1": 0.061279296875, "loss_aux_layer_10": 0.087646484375, "loss_aux_layer_11": 0.09326171875, "loss_aux_layer_12": 0.1004638671875, "loss_aux_layer_13": 0.107666015625, "loss_aux_layer_14": 0.1195068359375, "loss_aux_layer_15": 0.131103515625, "loss_aux_layer_16": 0.1416015625, "loss_aux_layer_17": 0.149169921875, "loss_aux_layer_18": 0.157958984375, "loss_aux_layer_19": 0.159423828125, "loss_aux_layer_2": 0.0712890625, "loss_aux_layer_20": 0.16552734375, "loss_aux_layer_21": 0.172119140625, "loss_aux_layer_22": 0.192626953125, "loss_aux_layer_23": 0.23291015625, "loss_aux_layer_3": 0.083740234375, "loss_aux_layer_4": 0.086669921875, "loss_aux_layer_5": 0.0885009765625, "loss_aux_layer_6": 0.0916748046875, "loss_aux_layer_7": 0.0877685546875, "loss_aux_layer_8": 0.0870361328125, "loss_aux_layer_9": 0.086181640625, "step": 1225, "total_loss": 0.7577148824930191 }, { "epoch": 0.24272421302712335, "grad_norm": 1.3562952280044556, "learning_rate": 5e-05, "llm_loss": 0.6217976212501526, "loss": 2.9442, "loss_aux_layer_0": 0.026519775390625, "loss_aux_layer_1": 0.06182861328125, "loss_aux_layer_10": 0.0850830078125, "loss_aux_layer_11": 0.090576171875, "loss_aux_layer_12": 0.09765625, "loss_aux_layer_13": 0.1048583984375, "loss_aux_layer_14": 0.11669921875, "loss_aux_layer_15": 0.1278076171875, "loss_aux_layer_16": 0.13916015625, "loss_aux_layer_17": 0.147216796875, "loss_aux_layer_18": 0.15625, "loss_aux_layer_19": 0.15869140625, "loss_aux_layer_2": 0.0692138671875, "loss_aux_layer_20": 0.165771484375, "loss_aux_layer_21": 0.17236328125, "loss_aux_layer_22": 0.193359375, "loss_aux_layer_23": 0.2333984375, "loss_aux_layer_3": 0.081787109375, "loss_aux_layer_4": 0.08447265625, "loss_aux_layer_5": 0.086181640625, "loss_aux_layer_6": 0.0892333984375, "loss_aux_layer_7": 0.085693359375, "loss_aux_layer_8": 0.0845947265625, "loss_aux_layer_9": 0.08349609375, "step": 1226, "total_loss": 0.7360410988330841 }, { "epoch": 0.24292219362502473, "grad_norm": 1.9775257110595703, "learning_rate": 5e-05, "llm_loss": 0.6805709302425385, "loss": 3.1799, "loss_aux_layer_0": 0.026519775390625, "loss_aux_layer_1": 0.06207275390625, "loss_aux_layer_10": 0.086181640625, "loss_aux_layer_11": 0.0911865234375, "loss_aux_layer_12": 0.098388671875, "loss_aux_layer_13": 0.1060791015625, "loss_aux_layer_14": 0.1181640625, "loss_aux_layer_15": 0.1292724609375, "loss_aux_layer_16": 0.1396484375, "loss_aux_layer_17": 0.148193359375, "loss_aux_layer_18": 0.156494140625, "loss_aux_layer_19": 0.158935546875, "loss_aux_layer_2": 0.069091796875, "loss_aux_layer_20": 0.16552734375, "loss_aux_layer_21": 0.17138671875, "loss_aux_layer_22": 0.19287109375, "loss_aux_layer_23": 0.233642578125, "loss_aux_layer_3": 0.08056640625, "loss_aux_layer_4": 0.08349609375, "loss_aux_layer_5": 0.0850830078125, "loss_aux_layer_6": 0.088134765625, "loss_aux_layer_7": 0.0853271484375, "loss_aux_layer_8": 0.0849609375, "loss_aux_layer_9": 0.08447265625, "step": 1227, "total_loss": 0.7949713319540024 }, { "epoch": 0.24312017422292614, "grad_norm": 1.2249592542648315, "learning_rate": 5e-05, "llm_loss": 0.5782286077737808, "loss": 2.7632, "loss_aux_layer_0": 0.023223876953125, "loss_aux_layer_1": 0.05938720703125, "loss_aux_layer_10": 0.083251953125, "loss_aux_layer_11": 0.088623046875, "loss_aux_layer_12": 0.0955810546875, "loss_aux_layer_13": 0.1036376953125, "loss_aux_layer_14": 0.1151123046875, "loss_aux_layer_15": 0.1265869140625, "loss_aux_layer_16": 0.1376953125, "loss_aux_layer_17": 0.1455078125, "loss_aux_layer_18": 0.156005859375, "loss_aux_layer_19": 0.158203125, "loss_aux_layer_2": 0.0667724609375, "loss_aux_layer_20": 0.16552734375, "loss_aux_layer_21": 0.171630859375, "loss_aux_layer_22": 0.1923828125, "loss_aux_layer_23": 0.233154296875, "loss_aux_layer_3": 0.0791015625, "loss_aux_layer_4": 0.0819091796875, "loss_aux_layer_5": 0.0838623046875, "loss_aux_layer_6": 0.0869140625, "loss_aux_layer_7": 0.08349609375, "loss_aux_layer_8": 0.0826416015625, "loss_aux_layer_9": 0.0816650390625, "step": 1228, "total_loss": 0.6908057332038879 }, { "epoch": 0.24331815482082755, "grad_norm": 1.7830997705459595, "learning_rate": 5e-05, "llm_loss": 0.5836289077997208, "loss": 2.7889, "loss_aux_layer_0": 0.0233154296875, "loss_aux_layer_1": 0.06005859375, "loss_aux_layer_10": 0.085693359375, "loss_aux_layer_11": 0.091064453125, "loss_aux_layer_12": 0.0977783203125, "loss_aux_layer_13": 0.104736328125, "loss_aux_layer_14": 0.115966796875, "loss_aux_layer_15": 0.12744140625, "loss_aux_layer_16": 0.138671875, "loss_aux_layer_17": 0.145751953125, "loss_aux_layer_18": 0.15478515625, "loss_aux_layer_19": 0.1572265625, "loss_aux_layer_2": 0.068603515625, "loss_aux_layer_20": 0.1640625, "loss_aux_layer_21": 0.170654296875, "loss_aux_layer_22": 0.19287109375, "loss_aux_layer_23": 0.234619140625, "loss_aux_layer_3": 0.08056640625, "loss_aux_layer_4": 0.083740234375, "loss_aux_layer_5": 0.0858154296875, "loss_aux_layer_6": 0.0888671875, "loss_aux_layer_7": 0.0855712890625, "loss_aux_layer_8": 0.084716796875, "loss_aux_layer_9": 0.083740234375, "step": 1229, "total_loss": 0.6972204148769379 }, { "epoch": 0.24351613541872896, "grad_norm": 1.5988341569900513, "learning_rate": 5e-05, "llm_loss": 0.6810897588729858, "loss": 3.1664, "loss_aux_layer_0": 0.023193359375, "loss_aux_layer_1": 0.05792236328125, "loss_aux_layer_10": 0.0819091796875, "loss_aux_layer_11": 0.0875244140625, "loss_aux_layer_12": 0.09423828125, "loss_aux_layer_13": 0.1014404296875, "loss_aux_layer_14": 0.1123046875, "loss_aux_layer_15": 0.1229248046875, "loss_aux_layer_16": 0.134521484375, "loss_aux_layer_17": 0.142578125, "loss_aux_layer_18": 0.15185546875, "loss_aux_layer_19": 0.15478515625, "loss_aux_layer_2": 0.066650390625, "loss_aux_layer_20": 0.162109375, "loss_aux_layer_21": 0.16796875, "loss_aux_layer_22": 0.187744140625, "loss_aux_layer_23": 0.228759765625, "loss_aux_layer_3": 0.0792236328125, "loss_aux_layer_4": 0.081787109375, "loss_aux_layer_5": 0.0831298828125, "loss_aux_layer_6": 0.0858154296875, "loss_aux_layer_7": 0.0821533203125, "loss_aux_layer_8": 0.0814208984375, "loss_aux_layer_9": 0.0804443359375, "step": 1230, "total_loss": 0.7916093468666077 }, { "epoch": 0.24371411601663037, "grad_norm": 1.7724900245666504, "learning_rate": 5e-05, "llm_loss": 0.7489113211631775, "loss": 3.4475, "loss_aux_layer_0": 0.026397705078125, "loss_aux_layer_1": 0.06146240234375, "loss_aux_layer_10": 0.0836181640625, "loss_aux_layer_11": 0.088623046875, "loss_aux_layer_12": 0.0960693359375, "loss_aux_layer_13": 0.103271484375, "loss_aux_layer_14": 0.11572265625, "loss_aux_layer_15": 0.127197265625, "loss_aux_layer_16": 0.138427734375, "loss_aux_layer_17": 0.146728515625, "loss_aux_layer_18": 0.15478515625, "loss_aux_layer_19": 0.156982421875, "loss_aux_layer_2": 0.06884765625, "loss_aux_layer_20": 0.163818359375, "loss_aux_layer_21": 0.169921875, "loss_aux_layer_22": 0.19091796875, "loss_aux_layer_23": 0.23046875, "loss_aux_layer_3": 0.0806884765625, "loss_aux_layer_4": 0.0833740234375, "loss_aux_layer_5": 0.085205078125, "loss_aux_layer_6": 0.0882568359375, "loss_aux_layer_7": 0.0845947265625, "loss_aux_layer_8": 0.0836181640625, "loss_aux_layer_9": 0.0821533203125, "step": 1231, "total_loss": 0.8618629276752472 }, { "epoch": 0.24391209661453178, "grad_norm": 1.9282854795455933, "learning_rate": 5e-05, "llm_loss": 0.6653992235660553, "loss": 3.1155, "loss_aux_layer_0": 0.02392578125, "loss_aux_layer_1": 0.06011962890625, "loss_aux_layer_10": 0.0848388671875, "loss_aux_layer_11": 0.0902099609375, "loss_aux_layer_12": 0.097412109375, "loss_aux_layer_13": 0.1053466796875, "loss_aux_layer_14": 0.117431640625, "loss_aux_layer_15": 0.128662109375, "loss_aux_layer_16": 0.13916015625, "loss_aux_layer_17": 0.146484375, "loss_aux_layer_18": 0.1552734375, "loss_aux_layer_19": 0.156494140625, "loss_aux_layer_2": 0.0689697265625, "loss_aux_layer_20": 0.163818359375, "loss_aux_layer_21": 0.170654296875, "loss_aux_layer_22": 0.19189453125, "loss_aux_layer_23": 0.231201171875, "loss_aux_layer_3": 0.0811767578125, "loss_aux_layer_4": 0.083984375, "loss_aux_layer_5": 0.0853271484375, "loss_aux_layer_6": 0.0885009765625, "loss_aux_layer_7": 0.0849609375, "loss_aux_layer_8": 0.084228515625, "loss_aux_layer_9": 0.0833740234375, "step": 1232, "total_loss": 0.7788862884044647 }, { "epoch": 0.2441100772124332, "grad_norm": 1.8047183752059937, "learning_rate": 5e-05, "llm_loss": 0.6967184990644455, "loss": 3.249, "loss_aux_layer_0": 0.02496337890625, "loss_aux_layer_1": 0.064453125, "loss_aux_layer_10": 0.0870361328125, "loss_aux_layer_11": 0.0926513671875, "loss_aux_layer_12": 0.099853515625, "loss_aux_layer_13": 0.1068115234375, "loss_aux_layer_14": 0.1185302734375, "loss_aux_layer_15": 0.1292724609375, "loss_aux_layer_16": 0.139404296875, "loss_aux_layer_17": 0.147216796875, "loss_aux_layer_18": 0.156005859375, "loss_aux_layer_19": 0.158203125, "loss_aux_layer_2": 0.0728759765625, "loss_aux_layer_20": 0.16455078125, "loss_aux_layer_21": 0.170654296875, "loss_aux_layer_22": 0.19189453125, "loss_aux_layer_23": 0.231689453125, "loss_aux_layer_3": 0.0853271484375, "loss_aux_layer_4": 0.088134765625, "loss_aux_layer_5": 0.0894775390625, "loss_aux_layer_6": 0.09228515625, "loss_aux_layer_7": 0.08837890625, "loss_aux_layer_8": 0.087158203125, "loss_aux_layer_9": 0.08544921875, "step": 1233, "total_loss": 0.8122553378343582 }, { "epoch": 0.2443080578103346, "grad_norm": 1.5696009397506714, "learning_rate": 5e-05, "llm_loss": 0.6541447192430496, "loss": 3.0831, "loss_aux_layer_0": 0.023529052734375, "loss_aux_layer_1": 0.0631103515625, "loss_aux_layer_10": 0.089599609375, "loss_aux_layer_11": 0.0950927734375, "loss_aux_layer_12": 0.1019287109375, "loss_aux_layer_13": 0.1094970703125, "loss_aux_layer_14": 0.1199951171875, "loss_aux_layer_15": 0.13037109375, "loss_aux_layer_16": 0.141357421875, "loss_aux_layer_17": 0.148681640625, "loss_aux_layer_18": 0.156982421875, "loss_aux_layer_19": 0.1591796875, "loss_aux_layer_2": 0.072509765625, "loss_aux_layer_20": 0.16552734375, "loss_aux_layer_21": 0.171630859375, "loss_aux_layer_22": 0.192626953125, "loss_aux_layer_23": 0.2333984375, "loss_aux_layer_3": 0.0850830078125, "loss_aux_layer_4": 0.0885009765625, "loss_aux_layer_5": 0.0904541015625, "loss_aux_layer_6": 0.09326171875, "loss_aux_layer_7": 0.0899658203125, "loss_aux_layer_8": 0.0892333984375, "loss_aux_layer_9": 0.087890625, "step": 1234, "total_loss": 0.7707870006561279 }, { "epoch": 0.24450603840823598, "grad_norm": 1.684175729751587, "learning_rate": 5e-05, "llm_loss": 0.6699339598417282, "loss": 3.143, "loss_aux_layer_0": 0.02618408203125, "loss_aux_layer_1": 0.062744140625, "loss_aux_layer_10": 0.08740234375, "loss_aux_layer_11": 0.0926513671875, "loss_aux_layer_12": 0.099609375, "loss_aux_layer_13": 0.10693359375, "loss_aux_layer_14": 0.11865234375, "loss_aux_layer_15": 0.12890625, "loss_aux_layer_16": 0.140625, "loss_aux_layer_17": 0.148193359375, "loss_aux_layer_18": 0.15771484375, "loss_aux_layer_19": 0.15966796875, "loss_aux_layer_2": 0.072265625, "loss_aux_layer_20": 0.16650390625, "loss_aux_layer_21": 0.172119140625, "loss_aux_layer_22": 0.19287109375, "loss_aux_layer_23": 0.23388671875, "loss_aux_layer_3": 0.0843505859375, "loss_aux_layer_4": 0.08740234375, "loss_aux_layer_5": 0.0888671875, "loss_aux_layer_6": 0.0919189453125, "loss_aux_layer_7": 0.087890625, "loss_aux_layer_8": 0.0870361328125, "loss_aux_layer_9": 0.0858154296875, "step": 1235, "total_loss": 0.7857510447502136 }, { "epoch": 0.2447040190061374, "grad_norm": 0.9770814776420593, "learning_rate": 5e-05, "llm_loss": 0.5784689635038376, "loss": 2.778, "loss_aux_layer_0": 0.02447509765625, "loss_aux_layer_1": 0.0615234375, "loss_aux_layer_10": 0.08740234375, "loss_aux_layer_11": 0.093017578125, "loss_aux_layer_12": 0.1002197265625, "loss_aux_layer_13": 0.108154296875, "loss_aux_layer_14": 0.1201171875, "loss_aux_layer_15": 0.131591796875, "loss_aux_layer_16": 0.143310546875, "loss_aux_layer_17": 0.150390625, "loss_aux_layer_18": 0.1591796875, "loss_aux_layer_19": 0.1611328125, "loss_aux_layer_2": 0.0693359375, "loss_aux_layer_20": 0.167724609375, "loss_aux_layer_21": 0.17333984375, "loss_aux_layer_22": 0.193115234375, "loss_aux_layer_23": 0.23486328125, "loss_aux_layer_3": 0.0823974609375, "loss_aux_layer_4": 0.0858154296875, "loss_aux_layer_5": 0.087646484375, "loss_aux_layer_6": 0.0909423828125, "loss_aux_layer_7": 0.0872802734375, "loss_aux_layer_8": 0.08642578125, "loss_aux_layer_9": 0.0855712890625, "step": 1236, "total_loss": 0.6945020705461502 }, { "epoch": 0.2449019996040388, "grad_norm": 1.2829681634902954, "learning_rate": 5e-05, "llm_loss": 0.5222148597240448, "loss": 2.551, "loss_aux_layer_0": 0.02728271484375, "loss_aux_layer_1": 0.06317138671875, "loss_aux_layer_10": 0.086669921875, "loss_aux_layer_11": 0.09228515625, "loss_aux_layer_12": 0.098876953125, "loss_aux_layer_13": 0.106201171875, "loss_aux_layer_14": 0.1170654296875, "loss_aux_layer_15": 0.12744140625, "loss_aux_layer_16": 0.138671875, "loss_aux_layer_17": 0.146240234375, "loss_aux_layer_18": 0.15576171875, "loss_aux_layer_19": 0.15869140625, "loss_aux_layer_2": 0.07080078125, "loss_aux_layer_20": 0.166015625, "loss_aux_layer_21": 0.173828125, "loss_aux_layer_22": 0.19580078125, "loss_aux_layer_23": 0.237060546875, "loss_aux_layer_3": 0.0841064453125, "loss_aux_layer_4": 0.0867919921875, "loss_aux_layer_5": 0.0887451171875, "loss_aux_layer_6": 0.091796875, "loss_aux_layer_7": 0.0877685546875, "loss_aux_layer_8": 0.0867919921875, "loss_aux_layer_9": 0.085205078125, "step": 1237, "total_loss": 0.6377609223127365 }, { "epoch": 0.2450999802019402, "grad_norm": 0.9429695010185242, "learning_rate": 5e-05, "llm_loss": 0.5941322073340416, "loss": 2.8307, "loss_aux_layer_0": 0.0252685546875, "loss_aux_layer_1": 0.061767578125, "loss_aux_layer_10": 0.084228515625, "loss_aux_layer_11": 0.0897216796875, "loss_aux_layer_12": 0.0968017578125, "loss_aux_layer_13": 0.1046142578125, "loss_aux_layer_14": 0.1168212890625, "loss_aux_layer_15": 0.127685546875, "loss_aux_layer_16": 0.138427734375, "loss_aux_layer_17": 0.14599609375, "loss_aux_layer_18": 0.15576171875, "loss_aux_layer_19": 0.157470703125, "loss_aux_layer_2": 0.069091796875, "loss_aux_layer_20": 0.164794921875, "loss_aux_layer_21": 0.171142578125, "loss_aux_layer_22": 0.192626953125, "loss_aux_layer_23": 0.23388671875, "loss_aux_layer_3": 0.0810546875, "loss_aux_layer_4": 0.083984375, "loss_aux_layer_5": 0.0853271484375, "loss_aux_layer_6": 0.0882568359375, "loss_aux_layer_7": 0.084228515625, "loss_aux_layer_8": 0.08349609375, "loss_aux_layer_9": 0.0826416015625, "step": 1238, "total_loss": 0.7076677680015564 }, { "epoch": 0.24529796079984162, "grad_norm": 1.4282091856002808, "learning_rate": 5e-05, "llm_loss": 0.6453545838594437, "loss": 3.0276, "loss_aux_layer_0": 0.024200439453125, "loss_aux_layer_1": 0.05987548828125, "loss_aux_layer_10": 0.0836181640625, "loss_aux_layer_11": 0.089111328125, "loss_aux_layer_12": 0.095947265625, "loss_aux_layer_13": 0.1033935546875, "loss_aux_layer_14": 0.1146240234375, "loss_aux_layer_15": 0.125732421875, "loss_aux_layer_16": 0.13623046875, "loss_aux_layer_17": 0.14404296875, "loss_aux_layer_18": 0.152587890625, "loss_aux_layer_19": 0.15478515625, "loss_aux_layer_2": 0.0673828125, "loss_aux_layer_20": 0.1611328125, "loss_aux_layer_21": 0.167236328125, "loss_aux_layer_22": 0.1884765625, "loss_aux_layer_23": 0.228271484375, "loss_aux_layer_3": 0.0792236328125, "loss_aux_layer_4": 0.0823974609375, "loss_aux_layer_5": 0.0841064453125, "loss_aux_layer_6": 0.0870361328125, "loss_aux_layer_7": 0.0836181640625, "loss_aux_layer_8": 0.0828857421875, "loss_aux_layer_9": 0.081787109375, "step": 1239, "total_loss": 0.7569025158882141 }, { "epoch": 0.24549594139774303, "grad_norm": 1.7880951166152954, "learning_rate": 5e-05, "llm_loss": 0.6727142482995987, "loss": 3.146, "loss_aux_layer_0": 0.028076171875, "loss_aux_layer_1": 0.0640869140625, "loss_aux_layer_10": 0.084228515625, "loss_aux_layer_11": 0.0894775390625, "loss_aux_layer_12": 0.0966796875, "loss_aux_layer_13": 0.1041259765625, "loss_aux_layer_14": 0.115478515625, "loss_aux_layer_15": 0.12646484375, "loss_aux_layer_16": 0.137939453125, "loss_aux_layer_17": 0.1455078125, "loss_aux_layer_18": 0.1552734375, "loss_aux_layer_19": 0.158203125, "loss_aux_layer_2": 0.06854248046875, "loss_aux_layer_20": 0.16455078125, "loss_aux_layer_21": 0.171142578125, "loss_aux_layer_22": 0.192138671875, "loss_aux_layer_23": 0.233642578125, "loss_aux_layer_3": 0.08154296875, "loss_aux_layer_4": 0.0845947265625, "loss_aux_layer_5": 0.0869140625, "loss_aux_layer_6": 0.0897216796875, "loss_aux_layer_7": 0.085693359375, "loss_aux_layer_8": 0.0843505859375, "loss_aux_layer_9": 0.0826416015625, "step": 1240, "total_loss": 0.7864988744258881 }, { "epoch": 0.24569392199564444, "grad_norm": 3.4398252964019775, "learning_rate": 5e-05, "llm_loss": 0.6078224927186966, "loss": 2.8895, "loss_aux_layer_0": 0.025726318359375, "loss_aux_layer_1": 0.06195068359375, "loss_aux_layer_10": 0.08642578125, "loss_aux_layer_11": 0.091796875, "loss_aux_layer_12": 0.0986328125, "loss_aux_layer_13": 0.1063232421875, "loss_aux_layer_14": 0.11767578125, "loss_aux_layer_15": 0.12841796875, "loss_aux_layer_16": 0.13916015625, "loss_aux_layer_17": 0.146240234375, "loss_aux_layer_18": 0.154541015625, "loss_aux_layer_19": 0.15673828125, "loss_aux_layer_2": 0.06982421875, "loss_aux_layer_20": 0.163818359375, "loss_aux_layer_21": 0.170166015625, "loss_aux_layer_22": 0.191650390625, "loss_aux_layer_23": 0.232666015625, "loss_aux_layer_3": 0.0830078125, "loss_aux_layer_4": 0.08642578125, "loss_aux_layer_5": 0.08935546875, "loss_aux_layer_6": 0.0911865234375, "loss_aux_layer_7": 0.087158203125, "loss_aux_layer_8": 0.0858154296875, "loss_aux_layer_9": 0.0845947265625, "step": 1241, "total_loss": 0.72236767411232 }, { "epoch": 0.24589190259354582, "grad_norm": 2.874178886413574, "learning_rate": 5e-05, "llm_loss": 0.5774007737636566, "loss": 2.7688, "loss_aux_layer_0": 0.024139404296875, "loss_aux_layer_1": 0.06109619140625, "loss_aux_layer_10": 0.0863037109375, "loss_aux_layer_11": 0.0916748046875, "loss_aux_layer_12": 0.098388671875, "loss_aux_layer_13": 0.10595703125, "loss_aux_layer_14": 0.116943359375, "loss_aux_layer_15": 0.127685546875, "loss_aux_layer_16": 0.137939453125, "loss_aux_layer_17": 0.144775390625, "loss_aux_layer_18": 0.1533203125, "loss_aux_layer_19": 0.155029296875, "loss_aux_layer_2": 0.0740966796875, "loss_aux_layer_20": 0.16259765625, "loss_aux_layer_21": 0.170166015625, "loss_aux_layer_22": 0.193359375, "loss_aux_layer_23": 0.23486328125, "loss_aux_layer_3": 0.0853271484375, "loss_aux_layer_4": 0.0892333984375, "loss_aux_layer_5": 0.092529296875, "loss_aux_layer_6": 0.092041015625, "loss_aux_layer_7": 0.086669921875, "loss_aux_layer_8": 0.0855712890625, "loss_aux_layer_9": 0.0848388671875, "step": 1242, "total_loss": 0.6922036111354828 }, { "epoch": 0.24608988319144723, "grad_norm": 1.6343727111816406, "learning_rate": 5e-05, "llm_loss": 0.6100845709443092, "loss": 2.9084, "loss_aux_layer_0": 0.023834228515625, "loss_aux_layer_1": 0.06298828125, "loss_aux_layer_10": 0.0902099609375, "loss_aux_layer_11": 0.095458984375, "loss_aux_layer_12": 0.1019287109375, "loss_aux_layer_13": 0.109130859375, "loss_aux_layer_14": 0.1202392578125, "loss_aux_layer_15": 0.131103515625, "loss_aux_layer_16": 0.141357421875, "loss_aux_layer_17": 0.148193359375, "loss_aux_layer_18": 0.15673828125, "loss_aux_layer_19": 0.157958984375, "loss_aux_layer_2": 0.0740966796875, "loss_aux_layer_20": 0.164306640625, "loss_aux_layer_21": 0.170654296875, "loss_aux_layer_22": 0.191650390625, "loss_aux_layer_23": 0.23193359375, "loss_aux_layer_3": 0.08740234375, "loss_aux_layer_4": 0.0908203125, "loss_aux_layer_5": 0.0943603515625, "loss_aux_layer_6": 0.095458984375, "loss_aux_layer_7": 0.091064453125, "loss_aux_layer_8": 0.089599609375, "loss_aux_layer_9": 0.088623046875, "step": 1243, "total_loss": 0.7271058112382889 }, { "epoch": 0.24628786378934864, "grad_norm": 3.761047840118408, "learning_rate": 5e-05, "llm_loss": 0.5930713266134262, "loss": 2.8372, "loss_aux_layer_0": 0.025787353515625, "loss_aux_layer_1": 0.0648193359375, "loss_aux_layer_10": 0.0855712890625, "loss_aux_layer_11": 0.091064453125, "loss_aux_layer_12": 0.0980224609375, "loss_aux_layer_13": 0.105712890625, "loss_aux_layer_14": 0.1170654296875, "loss_aux_layer_15": 0.128173828125, "loss_aux_layer_16": 0.139892578125, "loss_aux_layer_17": 0.148681640625, "loss_aux_layer_18": 0.15771484375, "loss_aux_layer_19": 0.159912109375, "loss_aux_layer_2": 0.0771484375, "loss_aux_layer_20": 0.1669921875, "loss_aux_layer_21": 0.174560546875, "loss_aux_layer_22": 0.19677734375, "loss_aux_layer_23": 0.239013671875, "loss_aux_layer_3": 0.085693359375, "loss_aux_layer_4": 0.0889892578125, "loss_aux_layer_5": 0.0902099609375, "loss_aux_layer_6": 0.0916748046875, "loss_aux_layer_7": 0.0869140625, "loss_aux_layer_8": 0.086181640625, "loss_aux_layer_9": 0.0843505859375, "step": 1244, "total_loss": 0.7093008458614349 }, { "epoch": 0.24648584438725005, "grad_norm": 1.785300374031067, "learning_rate": 5e-05, "llm_loss": 0.6327503472566605, "loss": 2.9873, "loss_aux_layer_0": 0.024627685546875, "loss_aux_layer_1": 0.060791015625, "loss_aux_layer_10": 0.085693359375, "loss_aux_layer_11": 0.091064453125, "loss_aux_layer_12": 0.09814453125, "loss_aux_layer_13": 0.1053466796875, "loss_aux_layer_14": 0.116455078125, "loss_aux_layer_15": 0.1273193359375, "loss_aux_layer_16": 0.13818359375, "loss_aux_layer_17": 0.145751953125, "loss_aux_layer_18": 0.15478515625, "loss_aux_layer_19": 0.156494140625, "loss_aux_layer_2": 0.0718994140625, "loss_aux_layer_20": 0.163330078125, "loss_aux_layer_21": 0.17041015625, "loss_aux_layer_22": 0.190673828125, "loss_aux_layer_23": 0.2314453125, "loss_aux_layer_3": 0.08349609375, "loss_aux_layer_4": 0.0863037109375, "loss_aux_layer_5": 0.0887451171875, "loss_aux_layer_6": 0.0909423828125, "loss_aux_layer_7": 0.0865478515625, "loss_aux_layer_8": 0.085205078125, "loss_aux_layer_9": 0.0841064453125, "step": 1245, "total_loss": 0.7468247264623642 }, { "epoch": 0.24668382498515146, "grad_norm": 2.290415048599243, "learning_rate": 5e-05, "llm_loss": 0.5722166150808334, "loss": 2.7516, "loss_aux_layer_0": 0.026092529296875, "loss_aux_layer_1": 0.06243896484375, "loss_aux_layer_10": 0.0872802734375, "loss_aux_layer_11": 0.0927734375, "loss_aux_layer_12": 0.099853515625, "loss_aux_layer_13": 0.107421875, "loss_aux_layer_14": 0.1195068359375, "loss_aux_layer_15": 0.13037109375, "loss_aux_layer_16": 0.141845703125, "loss_aux_layer_17": 0.14892578125, "loss_aux_layer_18": 0.157958984375, "loss_aux_layer_19": 0.1591796875, "loss_aux_layer_2": 0.0726318359375, "loss_aux_layer_20": 0.165283203125, "loss_aux_layer_21": 0.170654296875, "loss_aux_layer_22": 0.1904296875, "loss_aux_layer_23": 0.231201171875, "loss_aux_layer_3": 0.0836181640625, "loss_aux_layer_4": 0.086669921875, "loss_aux_layer_5": 0.08935546875, "loss_aux_layer_6": 0.09228515625, "loss_aux_layer_7": 0.0877685546875, "loss_aux_layer_8": 0.0865478515625, "loss_aux_layer_9": 0.085693359375, "step": 1246, "total_loss": 0.6879030168056488 }, { "epoch": 0.24688180558305287, "grad_norm": 1.4174572229385376, "learning_rate": 5e-05, "llm_loss": 0.6626674234867096, "loss": 3.1166, "loss_aux_layer_0": 0.028411865234375, "loss_aux_layer_1": 0.06463623046875, "loss_aux_layer_10": 0.088134765625, "loss_aux_layer_11": 0.09375, "loss_aux_layer_12": 0.1009521484375, "loss_aux_layer_13": 0.1077880859375, "loss_aux_layer_14": 0.1195068359375, "loss_aux_layer_15": 0.1298828125, "loss_aux_layer_16": 0.140625, "loss_aux_layer_17": 0.1484375, "loss_aux_layer_18": 0.1572265625, "loss_aux_layer_19": 0.15869140625, "loss_aux_layer_2": 0.07275390625, "loss_aux_layer_20": 0.165283203125, "loss_aux_layer_21": 0.171630859375, "loss_aux_layer_22": 0.19384765625, "loss_aux_layer_23": 0.234130859375, "loss_aux_layer_3": 0.0855712890625, "loss_aux_layer_4": 0.08837890625, "loss_aux_layer_5": 0.0902099609375, "loss_aux_layer_6": 0.0926513671875, "loss_aux_layer_7": 0.0888671875, "loss_aux_layer_8": 0.087646484375, "loss_aux_layer_9": 0.0869140625, "step": 1247, "total_loss": 0.779156357049942 }, { "epoch": 0.24707978618095428, "grad_norm": 1.7138785123825073, "learning_rate": 5e-05, "llm_loss": 0.6181203275918961, "loss": 2.9378, "loss_aux_layer_0": 0.024078369140625, "loss_aux_layer_1": 0.06536865234375, "loss_aux_layer_10": 0.088623046875, "loss_aux_layer_11": 0.0941162109375, "loss_aux_layer_12": 0.1009521484375, "loss_aux_layer_13": 0.1080322265625, "loss_aux_layer_14": 0.119140625, "loss_aux_layer_15": 0.1297607421875, "loss_aux_layer_16": 0.139404296875, "loss_aux_layer_17": 0.14697265625, "loss_aux_layer_18": 0.15576171875, "loss_aux_layer_19": 0.15673828125, "loss_aux_layer_2": 0.0767822265625, "loss_aux_layer_20": 0.163818359375, "loss_aux_layer_21": 0.170654296875, "loss_aux_layer_22": 0.193603515625, "loss_aux_layer_23": 0.234619140625, "loss_aux_layer_3": 0.0860595703125, "loss_aux_layer_4": 0.0892333984375, "loss_aux_layer_5": 0.09033203125, "loss_aux_layer_6": 0.0931396484375, "loss_aux_layer_7": 0.0892333984375, "loss_aux_layer_8": 0.088134765625, "loss_aux_layer_9": 0.0867919921875, "step": 1248, "total_loss": 0.7344398200511932 }, { "epoch": 0.24727776677885568, "grad_norm": 1.571441888809204, "learning_rate": 5e-05, "llm_loss": 0.6132816672325134, "loss": 2.9134, "loss_aux_layer_0": 0.023590087890625, "loss_aux_layer_1": 0.06048583984375, "loss_aux_layer_10": 0.0859375, "loss_aux_layer_11": 0.091796875, "loss_aux_layer_12": 0.09912109375, "loss_aux_layer_13": 0.1065673828125, "loss_aux_layer_14": 0.118408203125, "loss_aux_layer_15": 0.129150390625, "loss_aux_layer_16": 0.14013671875, "loss_aux_layer_17": 0.14794921875, "loss_aux_layer_18": 0.15673828125, "loss_aux_layer_19": 0.158935546875, "loss_aux_layer_2": 0.0714111328125, "loss_aux_layer_20": 0.16552734375, "loss_aux_layer_21": 0.173095703125, "loss_aux_layer_22": 0.19580078125, "loss_aux_layer_23": 0.2373046875, "loss_aux_layer_3": 0.0831298828125, "loss_aux_layer_4": 0.08544921875, "loss_aux_layer_5": 0.0869140625, "loss_aux_layer_6": 0.089599609375, "loss_aux_layer_7": 0.0858154296875, "loss_aux_layer_8": 0.085205078125, "loss_aux_layer_9": 0.0841064453125, "step": 1249, "total_loss": 0.7283476889133453 }, { "epoch": 0.24747574737675707, "grad_norm": 1.6421773433685303, "learning_rate": 5e-05, "llm_loss": 0.5406917333602905, "loss": 2.6519, "loss_aux_layer_0": 0.026580810546875, "loss_aux_layer_1": 0.0692138671875, "loss_aux_layer_10": 0.0943603515625, "loss_aux_layer_11": 0.100341796875, "loss_aux_layer_12": 0.107666015625, "loss_aux_layer_13": 0.11474609375, "loss_aux_layer_14": 0.12646484375, "loss_aux_layer_15": 0.13671875, "loss_aux_layer_16": 0.147216796875, "loss_aux_layer_17": 0.154052734375, "loss_aux_layer_18": 0.162353515625, "loss_aux_layer_19": 0.1640625, "loss_aux_layer_2": 0.0794677734375, "loss_aux_layer_20": 0.17041015625, "loss_aux_layer_21": 0.17626953125, "loss_aux_layer_22": 0.19873046875, "loss_aux_layer_23": 0.23974609375, "loss_aux_layer_3": 0.0921630859375, "loss_aux_layer_4": 0.0950927734375, "loss_aux_layer_5": 0.09716796875, "loss_aux_layer_6": 0.099853515625, "loss_aux_layer_7": 0.0955810546875, "loss_aux_layer_8": 0.0943603515625, "loss_aux_layer_9": 0.0926513671875, "step": 1250, "total_loss": 0.6629829853773117 }, { "epoch": 0.24767372797465848, "grad_norm": 1.2577790021896362, "learning_rate": 5e-05, "llm_loss": 0.6072070598602295, "loss": 2.8892, "loss_aux_layer_0": 0.024566650390625, "loss_aux_layer_1": 0.06341552734375, "loss_aux_layer_10": 0.087890625, "loss_aux_layer_11": 0.0936279296875, "loss_aux_layer_12": 0.100341796875, "loss_aux_layer_13": 0.1070556640625, "loss_aux_layer_14": 0.1177978515625, "loss_aux_layer_15": 0.1282958984375, "loss_aux_layer_16": 0.13818359375, "loss_aux_layer_17": 0.1455078125, "loss_aux_layer_18": 0.153076171875, "loss_aux_layer_19": 0.154541015625, "loss_aux_layer_2": 0.07373046875, "loss_aux_layer_20": 0.161865234375, "loss_aux_layer_21": 0.1689453125, "loss_aux_layer_22": 0.191650390625, "loss_aux_layer_23": 0.232177734375, "loss_aux_layer_3": 0.0855712890625, "loss_aux_layer_4": 0.0887451171875, "loss_aux_layer_5": 0.0899658203125, "loss_aux_layer_6": 0.092529296875, "loss_aux_layer_7": 0.0888671875, "loss_aux_layer_8": 0.0877685546875, "loss_aux_layer_9": 0.0859375, "step": 1251, "total_loss": 0.7223046123981476 }, { "epoch": 0.24787170857255988, "grad_norm": 2.1592047214508057, "learning_rate": 5e-05, "llm_loss": 0.6131749898195267, "loss": 2.918, "loss_aux_layer_0": 0.02557373046875, "loss_aux_layer_1": 0.062744140625, "loss_aux_layer_10": 0.0865478515625, "loss_aux_layer_11": 0.09228515625, "loss_aux_layer_12": 0.0994873046875, "loss_aux_layer_13": 0.1077880859375, "loss_aux_layer_14": 0.1199951171875, "loss_aux_layer_15": 0.131591796875, "loss_aux_layer_16": 0.1435546875, "loss_aux_layer_17": 0.151123046875, "loss_aux_layer_18": 0.16015625, "loss_aux_layer_19": 0.162353515625, "loss_aux_layer_2": 0.071044921875, "loss_aux_layer_20": 0.168701171875, "loss_aux_layer_21": 0.1748046875, "loss_aux_layer_22": 0.195068359375, "loss_aux_layer_23": 0.23486328125, "loss_aux_layer_3": 0.08349609375, "loss_aux_layer_4": 0.08642578125, "loss_aux_layer_5": 0.08837890625, "loss_aux_layer_6": 0.0902099609375, "loss_aux_layer_7": 0.0860595703125, "loss_aux_layer_8": 0.085205078125, "loss_aux_layer_9": 0.0843505859375, "step": 1252, "total_loss": 0.729487806558609 }, { "epoch": 0.2480696891704613, "grad_norm": 2.030597686767578, "learning_rate": 5e-05, "llm_loss": 0.5061198621988297, "loss": 2.4758, "loss_aux_layer_0": 0.024444580078125, "loss_aux_layer_1": 0.0606689453125, "loss_aux_layer_10": 0.0838623046875, "loss_aux_layer_11": 0.0889892578125, "loss_aux_layer_12": 0.0958251953125, "loss_aux_layer_13": 0.1031494140625, "loss_aux_layer_14": 0.1146240234375, "loss_aux_layer_15": 0.1256103515625, "loss_aux_layer_16": 0.13720703125, "loss_aux_layer_17": 0.144287109375, "loss_aux_layer_18": 0.15380859375, "loss_aux_layer_19": 0.155517578125, "loss_aux_layer_2": 0.0693359375, "loss_aux_layer_20": 0.162353515625, "loss_aux_layer_21": 0.170654296875, "loss_aux_layer_22": 0.19287109375, "loss_aux_layer_23": 0.235107421875, "loss_aux_layer_3": 0.0810546875, "loss_aux_layer_4": 0.08349609375, "loss_aux_layer_5": 0.0855712890625, "loss_aux_layer_6": 0.0877685546875, "loss_aux_layer_7": 0.08447265625, "loss_aux_layer_8": 0.0836181640625, "loss_aux_layer_9": 0.0826416015625, "step": 1253, "total_loss": 0.618937760591507 }, { "epoch": 0.2482676697683627, "grad_norm": 1.6010435819625854, "learning_rate": 5e-05, "llm_loss": 0.5671851485967636, "loss": 2.7329, "loss_aux_layer_0": 0.026123046875, "loss_aux_layer_1": 0.064208984375, "loss_aux_layer_10": 0.0872802734375, "loss_aux_layer_11": 0.092529296875, "loss_aux_layer_12": 0.099853515625, "loss_aux_layer_13": 0.1070556640625, "loss_aux_layer_14": 0.118896484375, "loss_aux_layer_15": 0.129638671875, "loss_aux_layer_16": 0.140869140625, "loss_aux_layer_17": 0.148193359375, "loss_aux_layer_18": 0.156494140625, "loss_aux_layer_19": 0.157958984375, "loss_aux_layer_2": 0.073486328125, "loss_aux_layer_20": 0.164306640625, "loss_aux_layer_21": 0.171630859375, "loss_aux_layer_22": 0.193115234375, "loss_aux_layer_23": 0.23388671875, "loss_aux_layer_3": 0.0850830078125, "loss_aux_layer_4": 0.08837890625, "loss_aux_layer_5": 0.0906982421875, "loss_aux_layer_6": 0.093017578125, "loss_aux_layer_7": 0.0888671875, "loss_aux_layer_8": 0.0875244140625, "loss_aux_layer_9": 0.086181640625, "step": 1254, "total_loss": 0.6832277625799179 }, { "epoch": 0.2484656503662641, "grad_norm": 1.4651374816894531, "learning_rate": 5e-05, "llm_loss": 0.5921217352151871, "loss": 2.8433, "loss_aux_layer_0": 0.024749755859375, "loss_aux_layer_1": 0.06591796875, "loss_aux_layer_10": 0.091064453125, "loss_aux_layer_11": 0.0966796875, "loss_aux_layer_12": 0.1036376953125, "loss_aux_layer_13": 0.111328125, "loss_aux_layer_14": 0.1219482421875, "loss_aux_layer_15": 0.13232421875, "loss_aux_layer_16": 0.14306640625, "loss_aux_layer_17": 0.150146484375, "loss_aux_layer_18": 0.15869140625, "loss_aux_layer_19": 0.16015625, "loss_aux_layer_2": 0.074951171875, "loss_aux_layer_20": 0.166015625, "loss_aux_layer_21": 0.173095703125, "loss_aux_layer_22": 0.196044921875, "loss_aux_layer_23": 0.235595703125, "loss_aux_layer_3": 0.0880126953125, "loss_aux_layer_4": 0.0916748046875, "loss_aux_layer_5": 0.0933837890625, "loss_aux_layer_6": 0.09619140625, "loss_aux_layer_7": 0.092529296875, "loss_aux_layer_8": 0.09130859375, "loss_aux_layer_9": 0.089599609375, "step": 1255, "total_loss": 0.7108248025178909 }, { "epoch": 0.24866363096416552, "grad_norm": 2.1985790729522705, "learning_rate": 5e-05, "llm_loss": 0.7115809619426727, "loss": 3.2942, "loss_aux_layer_0": 0.024749755859375, "loss_aux_layer_1": 0.05950927734375, "loss_aux_layer_10": 0.0823974609375, "loss_aux_layer_11": 0.0875244140625, "loss_aux_layer_12": 0.0943603515625, "loss_aux_layer_13": 0.101806640625, "loss_aux_layer_14": 0.11376953125, "loss_aux_layer_15": 0.125244140625, "loss_aux_layer_16": 0.136474609375, "loss_aux_layer_17": 0.143798828125, "loss_aux_layer_18": 0.153076171875, "loss_aux_layer_19": 0.15625, "loss_aux_layer_2": 0.0689697265625, "loss_aux_layer_20": 0.16259765625, "loss_aux_layer_21": 0.170166015625, "loss_aux_layer_22": 0.19140625, "loss_aux_layer_23": 0.232666015625, "loss_aux_layer_3": 0.0804443359375, "loss_aux_layer_4": 0.083251953125, "loss_aux_layer_5": 0.0845947265625, "loss_aux_layer_6": 0.0870361328125, "loss_aux_layer_7": 0.0831298828125, "loss_aux_layer_8": 0.08203125, "loss_aux_layer_9": 0.0810546875, "step": 1256, "total_loss": 0.8235610574483871 }, { "epoch": 0.24886161156206693, "grad_norm": 2.21638560295105, "learning_rate": 5e-05, "llm_loss": 0.6146818250417709, "loss": 2.9308, "loss_aux_layer_0": 0.025604248046875, "loss_aux_layer_1": 0.06201171875, "loss_aux_layer_10": 0.0882568359375, "loss_aux_layer_11": 0.0938720703125, "loss_aux_layer_12": 0.1011962890625, "loss_aux_layer_13": 0.109375, "loss_aux_layer_14": 0.1217041015625, "loss_aux_layer_15": 0.1319580078125, "loss_aux_layer_16": 0.1435546875, "loss_aux_layer_17": 0.15234375, "loss_aux_layer_18": 0.160888671875, "loss_aux_layer_19": 0.163330078125, "loss_aux_layer_2": 0.07232666015625, "loss_aux_layer_20": 0.170654296875, "loss_aux_layer_21": 0.177490234375, "loss_aux_layer_22": 0.20166015625, "loss_aux_layer_23": 0.243896484375, "loss_aux_layer_3": 0.0836181640625, "loss_aux_layer_4": 0.086669921875, "loss_aux_layer_5": 0.088623046875, "loss_aux_layer_6": 0.091796875, "loss_aux_layer_7": 0.087890625, "loss_aux_layer_8": 0.0870361328125, "loss_aux_layer_9": 0.0863037109375, "step": 1257, "total_loss": 0.7326887100934982 }, { "epoch": 0.2490595921599683, "grad_norm": 1.990093469619751, "learning_rate": 5e-05, "llm_loss": 0.6363052129745483, "loss": 3.0054, "loss_aux_layer_0": 0.025634765625, "loss_aux_layer_1": 0.06072998046875, "loss_aux_layer_10": 0.0867919921875, "loss_aux_layer_11": 0.0921630859375, "loss_aux_layer_12": 0.0989990234375, "loss_aux_layer_13": 0.106689453125, "loss_aux_layer_14": 0.117919921875, "loss_aux_layer_15": 0.1287841796875, "loss_aux_layer_16": 0.14013671875, "loss_aux_layer_17": 0.14794921875, "loss_aux_layer_18": 0.1572265625, "loss_aux_layer_19": 0.15869140625, "loss_aux_layer_2": 0.0694580078125, "loss_aux_layer_20": 0.16552734375, "loss_aux_layer_21": 0.17236328125, "loss_aux_layer_22": 0.193115234375, "loss_aux_layer_23": 0.23388671875, "loss_aux_layer_3": 0.08203125, "loss_aux_layer_4": 0.085693359375, "loss_aux_layer_5": 0.0875244140625, "loss_aux_layer_6": 0.0909423828125, "loss_aux_layer_7": 0.08740234375, "loss_aux_layer_8": 0.08642578125, "loss_aux_layer_9": 0.0849609375, "step": 1258, "total_loss": 0.7513412088155746 }, { "epoch": 0.24925757275786972, "grad_norm": 2.476531744003296, "learning_rate": 5e-05, "llm_loss": 0.5572950020432472, "loss": 2.6989, "loss_aux_layer_0": 0.02520751953125, "loss_aux_layer_1": 0.060546875, "loss_aux_layer_10": 0.0870361328125, "loss_aux_layer_11": 0.0936279296875, "loss_aux_layer_12": 0.101806640625, "loss_aux_layer_13": 0.10986328125, "loss_aux_layer_14": 0.1219482421875, "loss_aux_layer_15": 0.1339111328125, "loss_aux_layer_16": 0.14501953125, "loss_aux_layer_17": 0.152587890625, "loss_aux_layer_18": 0.162353515625, "loss_aux_layer_19": 0.164306640625, "loss_aux_layer_2": 0.069091796875, "loss_aux_layer_20": 0.1708984375, "loss_aux_layer_21": 0.178466796875, "loss_aux_layer_22": 0.200439453125, "loss_aux_layer_23": 0.242431640625, "loss_aux_layer_3": 0.0814208984375, "loss_aux_layer_4": 0.0843505859375, "loss_aux_layer_5": 0.08642578125, "loss_aux_layer_6": 0.089599609375, "loss_aux_layer_7": 0.0859375, "loss_aux_layer_8": 0.0853271484375, "loss_aux_layer_9": 0.0848388671875, "step": 1259, "total_loss": 0.6747183948755264 }, { "epoch": 0.24945555335577113, "grad_norm": 1.7869118452072144, "learning_rate": 5e-05, "llm_loss": 0.6967117488384247, "loss": 3.2304, "loss_aux_layer_0": 0.02471923828125, "loss_aux_layer_1": 0.05841064453125, "loss_aux_layer_10": 0.0833740234375, "loss_aux_layer_11": 0.088623046875, "loss_aux_layer_12": 0.0955810546875, "loss_aux_layer_13": 0.1026611328125, "loss_aux_layer_14": 0.11376953125, "loss_aux_layer_15": 0.123779296875, "loss_aux_layer_16": 0.134765625, "loss_aux_layer_17": 0.142578125, "loss_aux_layer_18": 0.151123046875, "loss_aux_layer_19": 0.153076171875, "loss_aux_layer_2": 0.06756591796875, "loss_aux_layer_20": 0.16015625, "loss_aux_layer_21": 0.16552734375, "loss_aux_layer_22": 0.185791015625, "loss_aux_layer_23": 0.2255859375, "loss_aux_layer_3": 0.079833984375, "loss_aux_layer_4": 0.0830078125, "loss_aux_layer_5": 0.0843505859375, "loss_aux_layer_6": 0.087646484375, "loss_aux_layer_7": 0.083984375, "loss_aux_layer_8": 0.082763671875, "loss_aux_layer_9": 0.081787109375, "step": 1260, "total_loss": 0.8076123148202896 }, { "epoch": 0.24965353395367254, "grad_norm": 1.8455225229263306, "learning_rate": 5e-05, "llm_loss": 0.6508462354540825, "loss": 3.0703, "loss_aux_layer_0": 0.025177001953125, "loss_aux_layer_1": 0.061279296875, "loss_aux_layer_10": 0.0867919921875, "loss_aux_layer_11": 0.0927734375, "loss_aux_layer_12": 0.10009765625, "loss_aux_layer_13": 0.108642578125, "loss_aux_layer_14": 0.1209716796875, "loss_aux_layer_15": 0.1328125, "loss_aux_layer_16": 0.143798828125, "loss_aux_layer_17": 0.152099609375, "loss_aux_layer_18": 0.161376953125, "loss_aux_layer_19": 0.163330078125, "loss_aux_layer_2": 0.0692138671875, "loss_aux_layer_20": 0.169189453125, "loss_aux_layer_21": 0.17626953125, "loss_aux_layer_22": 0.19775390625, "loss_aux_layer_23": 0.240234375, "loss_aux_layer_3": 0.0816650390625, "loss_aux_layer_4": 0.084716796875, "loss_aux_layer_5": 0.0867919921875, "loss_aux_layer_6": 0.08984375, "loss_aux_layer_7": 0.08642578125, "loss_aux_layer_8": 0.085693359375, "loss_aux_layer_9": 0.0849609375, "step": 1261, "total_loss": 0.7675729542970657 }, { "epoch": 0.24985151455157395, "grad_norm": 1.2556819915771484, "learning_rate": 5e-05, "llm_loss": 0.5861700773239136, "loss": 2.7986, "loss_aux_layer_0": 0.0269775390625, "loss_aux_layer_1": 0.0618896484375, "loss_aux_layer_10": 0.08544921875, "loss_aux_layer_11": 0.090576171875, "loss_aux_layer_12": 0.0980224609375, "loss_aux_layer_13": 0.105224609375, "loss_aux_layer_14": 0.1165771484375, "loss_aux_layer_15": 0.1273193359375, "loss_aux_layer_16": 0.138427734375, "loss_aux_layer_17": 0.146240234375, "loss_aux_layer_18": 0.155029296875, "loss_aux_layer_19": 0.156005859375, "loss_aux_layer_2": 0.0697021484375, "loss_aux_layer_20": 0.162353515625, "loss_aux_layer_21": 0.168212890625, "loss_aux_layer_22": 0.188232421875, "loss_aux_layer_23": 0.22802734375, "loss_aux_layer_3": 0.0821533203125, "loss_aux_layer_4": 0.0849609375, "loss_aux_layer_5": 0.0867919921875, "loss_aux_layer_6": 0.0894775390625, "loss_aux_layer_7": 0.0860595703125, "loss_aux_layer_8": 0.0853271484375, "loss_aux_layer_9": 0.084228515625, "step": 1262, "total_loss": 0.6996427029371262 }, { "epoch": 0.25004949514947533, "grad_norm": 1.7807408571243286, "learning_rate": 5e-05, "llm_loss": 0.5859292894601822, "loss": 2.8159, "loss_aux_layer_0": 0.024139404296875, "loss_aux_layer_1": 0.0623779296875, "loss_aux_layer_10": 0.0887451171875, "loss_aux_layer_11": 0.0948486328125, "loss_aux_layer_12": 0.102783203125, "loss_aux_layer_13": 0.1104736328125, "loss_aux_layer_14": 0.12255859375, "loss_aux_layer_15": 0.13427734375, "loss_aux_layer_16": 0.145751953125, "loss_aux_layer_17": 0.153564453125, "loss_aux_layer_18": 0.162841796875, "loss_aux_layer_19": 0.16357421875, "loss_aux_layer_2": 0.07177734375, "loss_aux_layer_20": 0.169189453125, "loss_aux_layer_21": 0.17529296875, "loss_aux_layer_22": 0.1962890625, "loss_aux_layer_23": 0.237548828125, "loss_aux_layer_3": 0.08447265625, "loss_aux_layer_4": 0.0872802734375, "loss_aux_layer_5": 0.089111328125, "loss_aux_layer_6": 0.0924072265625, "loss_aux_layer_7": 0.0885009765625, "loss_aux_layer_8": 0.0875244140625, "loss_aux_layer_9": 0.086669921875, "step": 1263, "total_loss": 0.70396888256073 }, { "epoch": 0.25024747574737677, "grad_norm": 1.161712408065796, "learning_rate": 5e-05, "llm_loss": 0.598863810300827, "loss": 2.8586, "loss_aux_layer_0": 0.02484130859375, "loss_aux_layer_1": 0.06121826171875, "loss_aux_layer_10": 0.087890625, "loss_aux_layer_11": 0.0936279296875, "loss_aux_layer_12": 0.101318359375, "loss_aux_layer_13": 0.1087646484375, "loss_aux_layer_14": 0.1201171875, "loss_aux_layer_15": 0.130859375, "loss_aux_layer_16": 0.1416015625, "loss_aux_layer_17": 0.149169921875, "loss_aux_layer_18": 0.156982421875, "loss_aux_layer_19": 0.158447265625, "loss_aux_layer_2": 0.0704345703125, "loss_aux_layer_20": 0.164794921875, "loss_aux_layer_21": 0.1708984375, "loss_aux_layer_22": 0.193603515625, "loss_aux_layer_23": 0.235107421875, "loss_aux_layer_3": 0.0831298828125, "loss_aux_layer_4": 0.0863037109375, "loss_aux_layer_5": 0.0882568359375, "loss_aux_layer_6": 0.0911865234375, "loss_aux_layer_7": 0.0877685546875, "loss_aux_layer_8": 0.0869140625, "loss_aux_layer_9": 0.0860595703125, "step": 1264, "total_loss": 0.7146528959274292 }, { "epoch": 0.25044545634527815, "grad_norm": 1.433027982711792, "learning_rate": 5e-05, "llm_loss": 0.6374986916780472, "loss": 3.011, "loss_aux_layer_0": 0.026092529296875, "loss_aux_layer_1": 0.06182861328125, "loss_aux_layer_10": 0.085693359375, "loss_aux_layer_11": 0.0914306640625, "loss_aux_layer_12": 0.0985107421875, "loss_aux_layer_13": 0.106201171875, "loss_aux_layer_14": 0.1182861328125, "loss_aux_layer_15": 0.129638671875, "loss_aux_layer_16": 0.140869140625, "loss_aux_layer_17": 0.148681640625, "loss_aux_layer_18": 0.157470703125, "loss_aux_layer_19": 0.159423828125, "loss_aux_layer_2": 0.069580078125, "loss_aux_layer_20": 0.166748046875, "loss_aux_layer_21": 0.173583984375, "loss_aux_layer_22": 0.1962890625, "loss_aux_layer_23": 0.23779296875, "loss_aux_layer_3": 0.08203125, "loss_aux_layer_4": 0.0848388671875, "loss_aux_layer_5": 0.0867919921875, "loss_aux_layer_6": 0.089599609375, "loss_aux_layer_7": 0.0859375, "loss_aux_layer_8": 0.0848388671875, "loss_aux_layer_9": 0.0841064453125, "step": 1265, "total_loss": 0.752755343914032 }, { "epoch": 0.2506434369431796, "grad_norm": 1.3629333972930908, "learning_rate": 5e-05, "llm_loss": 0.6807184219360352, "loss": 3.1691, "loss_aux_layer_0": 0.025970458984375, "loss_aux_layer_1": 0.0587158203125, "loss_aux_layer_10": 0.081787109375, "loss_aux_layer_11": 0.0872802734375, "loss_aux_layer_12": 0.094482421875, "loss_aux_layer_13": 0.1016845703125, "loss_aux_layer_14": 0.113525390625, "loss_aux_layer_15": 0.1251220703125, "loss_aux_layer_16": 0.13623046875, "loss_aux_layer_17": 0.145263671875, "loss_aux_layer_18": 0.155029296875, "loss_aux_layer_19": 0.157470703125, "loss_aux_layer_2": 0.0665283203125, "loss_aux_layer_20": 0.164794921875, "loss_aux_layer_21": 0.1708984375, "loss_aux_layer_22": 0.191650390625, "loss_aux_layer_23": 0.2314453125, "loss_aux_layer_3": 0.0780029296875, "loss_aux_layer_4": 0.0806884765625, "loss_aux_layer_5": 0.0821533203125, "loss_aux_layer_6": 0.0850830078125, "loss_aux_layer_7": 0.08203125, "loss_aux_layer_8": 0.080810546875, "loss_aux_layer_9": 0.080078125, "step": 1266, "total_loss": 0.7922762036323547 }, { "epoch": 0.25084141754108097, "grad_norm": 1.2485545873641968, "learning_rate": 5e-05, "llm_loss": 0.6246088147163391, "loss": 2.9491, "loss_aux_layer_0": 0.02423095703125, "loss_aux_layer_1": 0.0589599609375, "loss_aux_layer_10": 0.0838623046875, "loss_aux_layer_11": 0.0894775390625, "loss_aux_layer_12": 0.0966796875, "loss_aux_layer_13": 0.1043701171875, "loss_aux_layer_14": 0.115966796875, "loss_aux_layer_15": 0.1270751953125, "loss_aux_layer_16": 0.138671875, "loss_aux_layer_17": 0.146484375, "loss_aux_layer_18": 0.15576171875, "loss_aux_layer_19": 0.157470703125, "loss_aux_layer_2": 0.068359375, "loss_aux_layer_20": 0.16357421875, "loss_aux_layer_21": 0.169921875, "loss_aux_layer_22": 0.189453125, "loss_aux_layer_23": 0.22998046875, "loss_aux_layer_3": 0.0797119140625, "loss_aux_layer_4": 0.0828857421875, "loss_aux_layer_5": 0.0845947265625, "loss_aux_layer_6": 0.087646484375, "loss_aux_layer_7": 0.0838623046875, "loss_aux_layer_8": 0.0828857421875, "loss_aux_layer_9": 0.08203125, "step": 1267, "total_loss": 0.7372846156358719 }, { "epoch": 0.2510393981389824, "grad_norm": 0.9912510514259338, "learning_rate": 5e-05, "llm_loss": 0.5642274618148804, "loss": 2.7229, "loss_aux_layer_0": 0.02764892578125, "loss_aux_layer_1": 0.065673828125, "loss_aux_layer_10": 0.0877685546875, "loss_aux_layer_11": 0.0933837890625, "loss_aux_layer_12": 0.100341796875, "loss_aux_layer_13": 0.1075439453125, "loss_aux_layer_14": 0.118896484375, "loss_aux_layer_15": 0.13037109375, "loss_aux_layer_16": 0.141357421875, "loss_aux_layer_17": 0.148681640625, "loss_aux_layer_18": 0.158203125, "loss_aux_layer_19": 0.159912109375, "loss_aux_layer_2": 0.072021484375, "loss_aux_layer_20": 0.166259765625, "loss_aux_layer_21": 0.172607421875, "loss_aux_layer_22": 0.195068359375, "loss_aux_layer_23": 0.236572265625, "loss_aux_layer_3": 0.084228515625, "loss_aux_layer_4": 0.0870361328125, "loss_aux_layer_5": 0.0888671875, "loss_aux_layer_6": 0.0919189453125, "loss_aux_layer_7": 0.0880126953125, "loss_aux_layer_8": 0.08740234375, "loss_aux_layer_9": 0.0860595703125, "step": 1268, "total_loss": 0.6807166934013367 }, { "epoch": 0.2512373787368838, "grad_norm": 1.3107316493988037, "learning_rate": 5e-05, "llm_loss": 0.6234727054834366, "loss": 2.9488, "loss_aux_layer_0": 0.02386474609375, "loss_aux_layer_1": 0.062255859375, "loss_aux_layer_10": 0.0863037109375, "loss_aux_layer_11": 0.09130859375, "loss_aux_layer_12": 0.0982666015625, "loss_aux_layer_13": 0.105224609375, "loss_aux_layer_14": 0.1163330078125, "loss_aux_layer_15": 0.1268310546875, "loss_aux_layer_16": 0.137939453125, "loss_aux_layer_17": 0.14599609375, "loss_aux_layer_18": 0.154541015625, "loss_aux_layer_19": 0.1552734375, "loss_aux_layer_2": 0.0706787109375, "loss_aux_layer_20": 0.161865234375, "loss_aux_layer_21": 0.167724609375, "loss_aux_layer_22": 0.18896484375, "loss_aux_layer_23": 0.22998046875, "loss_aux_layer_3": 0.082763671875, "loss_aux_layer_4": 0.0860595703125, "loss_aux_layer_5": 0.087890625, "loss_aux_layer_6": 0.0909423828125, "loss_aux_layer_7": 0.0872802734375, "loss_aux_layer_8": 0.0859375, "loss_aux_layer_9": 0.084716796875, "step": 1269, "total_loss": 0.737197756767273 }, { "epoch": 0.25143535933478517, "grad_norm": 0.8807740211486816, "learning_rate": 5e-05, "llm_loss": 0.6396261006593704, "loss": 3.0051, "loss_aux_layer_0": 0.02435302734375, "loss_aux_layer_1": 0.05792236328125, "loss_aux_layer_10": 0.083251953125, "loss_aux_layer_11": 0.0889892578125, "loss_aux_layer_12": 0.0963134765625, "loss_aux_layer_13": 0.1033935546875, "loss_aux_layer_14": 0.1146240234375, "loss_aux_layer_15": 0.1248779296875, "loss_aux_layer_16": 0.13525390625, "loss_aux_layer_17": 0.1435546875, "loss_aux_layer_18": 0.153076171875, "loss_aux_layer_19": 0.155517578125, "loss_aux_layer_2": 0.06689453125, "loss_aux_layer_20": 0.1630859375, "loss_aux_layer_21": 0.1689453125, "loss_aux_layer_22": 0.18994140625, "loss_aux_layer_23": 0.230224609375, "loss_aux_layer_3": 0.078857421875, "loss_aux_layer_4": 0.08154296875, "loss_aux_layer_5": 0.083740234375, "loss_aux_layer_6": 0.0865478515625, "loss_aux_layer_7": 0.08349609375, "loss_aux_layer_8": 0.0828857421875, "loss_aux_layer_9": 0.081787109375, "step": 1270, "total_loss": 0.7512837052345276 }, { "epoch": 0.2516333399326866, "grad_norm": 0.9022597074508667, "learning_rate": 5e-05, "llm_loss": 0.5067209750413895, "loss": 2.4905, "loss_aux_layer_0": 0.025115966796875, "loss_aux_layer_1": 0.062255859375, "loss_aux_layer_10": 0.0869140625, "loss_aux_layer_11": 0.093017578125, "loss_aux_layer_12": 0.100341796875, "loss_aux_layer_13": 0.107666015625, "loss_aux_layer_14": 0.119140625, "loss_aux_layer_15": 0.129638671875, "loss_aux_layer_16": 0.140625, "loss_aux_layer_17": 0.148193359375, "loss_aux_layer_18": 0.15771484375, "loss_aux_layer_19": 0.16015625, "loss_aux_layer_2": 0.0714111328125, "loss_aux_layer_20": 0.16650390625, "loss_aux_layer_21": 0.174072265625, "loss_aux_layer_22": 0.195556640625, "loss_aux_layer_23": 0.238525390625, "loss_aux_layer_3": 0.083251953125, "loss_aux_layer_4": 0.0859375, "loss_aux_layer_5": 0.087890625, "loss_aux_layer_6": 0.090576171875, "loss_aux_layer_7": 0.087158203125, "loss_aux_layer_8": 0.0859375, "loss_aux_layer_9": 0.0848388671875, "step": 1271, "total_loss": 0.6226356774568558 }, { "epoch": 0.251831320530588, "grad_norm": 1.0526574850082397, "learning_rate": 5e-05, "llm_loss": 0.5996371060609818, "loss": 2.8615, "loss_aux_layer_0": 0.031463623046875, "loss_aux_layer_1": 0.0662841796875, "loss_aux_layer_10": 0.0865478515625, "loss_aux_layer_11": 0.09228515625, "loss_aux_layer_12": 0.098876953125, "loss_aux_layer_13": 0.105712890625, "loss_aux_layer_14": 0.1171875, "loss_aux_layer_15": 0.128173828125, "loss_aux_layer_16": 0.139892578125, "loss_aux_layer_17": 0.1474609375, "loss_aux_layer_18": 0.15673828125, "loss_aux_layer_19": 0.15966796875, "loss_aux_layer_2": 0.0701904296875, "loss_aux_layer_20": 0.166748046875, "loss_aux_layer_21": 0.174072265625, "loss_aux_layer_22": 0.1962890625, "loss_aux_layer_23": 0.239990234375, "loss_aux_layer_3": 0.0821533203125, "loss_aux_layer_4": 0.0850830078125, "loss_aux_layer_5": 0.086669921875, "loss_aux_layer_6": 0.08984375, "loss_aux_layer_7": 0.086669921875, "loss_aux_layer_8": 0.085693359375, "loss_aux_layer_9": 0.0845947265625, "step": 1272, "total_loss": 0.7153856754302979 }, { "epoch": 0.2520293011284894, "grad_norm": 1.0378763675689697, "learning_rate": 5e-05, "llm_loss": 0.6954503953456879, "loss": 3.2187, "loss_aux_layer_0": 0.023956298828125, "loss_aux_layer_1": 0.0567626953125, "loss_aux_layer_10": 0.0806884765625, "loss_aux_layer_11": 0.085693359375, "loss_aux_layer_12": 0.093017578125, "loss_aux_layer_13": 0.100830078125, "loss_aux_layer_14": 0.1123046875, "loss_aux_layer_15": 0.1231689453125, "loss_aux_layer_16": 0.1337890625, "loss_aux_layer_17": 0.142578125, "loss_aux_layer_18": 0.152099609375, "loss_aux_layer_19": 0.1533203125, "loss_aux_layer_2": 0.06402587890625, "loss_aux_layer_20": 0.160400390625, "loss_aux_layer_21": 0.167724609375, "loss_aux_layer_22": 0.187744140625, "loss_aux_layer_23": 0.228515625, "loss_aux_layer_3": 0.0755615234375, "loss_aux_layer_4": 0.0782470703125, "loss_aux_layer_5": 0.079833984375, "loss_aux_layer_6": 0.082763671875, "loss_aux_layer_7": 0.0797119140625, "loss_aux_layer_8": 0.079345703125, "loss_aux_layer_9": 0.078857421875, "step": 1273, "total_loss": 0.8046729862689972 }, { "epoch": 0.2522272817263908, "grad_norm": 1.6573415994644165, "learning_rate": 5e-05, "llm_loss": 0.559471845626831, "loss": 2.7078, "loss_aux_layer_0": 0.02545166015625, "loss_aux_layer_1": 0.06298828125, "loss_aux_layer_10": 0.0885009765625, "loss_aux_layer_11": 0.09423828125, "loss_aux_layer_12": 0.1007080078125, "loss_aux_layer_13": 0.108154296875, "loss_aux_layer_14": 0.120361328125, "loss_aux_layer_15": 0.1319580078125, "loss_aux_layer_16": 0.14306640625, "loss_aux_layer_17": 0.150634765625, "loss_aux_layer_18": 0.1591796875, "loss_aux_layer_19": 0.161865234375, "loss_aux_layer_2": 0.0714111328125, "loss_aux_layer_20": 0.16845703125, "loss_aux_layer_21": 0.17529296875, "loss_aux_layer_22": 0.198486328125, "loss_aux_layer_23": 0.240966796875, "loss_aux_layer_3": 0.08447265625, "loss_aux_layer_4": 0.08740234375, "loss_aux_layer_5": 0.0894775390625, "loss_aux_layer_6": 0.0927734375, "loss_aux_layer_7": 0.089111328125, "loss_aux_layer_8": 0.0880126953125, "loss_aux_layer_9": 0.0869140625, "step": 1274, "total_loss": 0.676959291100502 }, { "epoch": 0.25242526232429224, "grad_norm": 1.6581377983093262, "learning_rate": 5e-05, "llm_loss": 0.6408894956111908, "loss": 3.0085, "loss_aux_layer_0": 0.027191162109375, "loss_aux_layer_1": 0.060791015625, "loss_aux_layer_10": 0.0831298828125, "loss_aux_layer_11": 0.088134765625, "loss_aux_layer_12": 0.0950927734375, "loss_aux_layer_13": 0.1024169921875, "loss_aux_layer_14": 0.113525390625, "loss_aux_layer_15": 0.1240234375, "loss_aux_layer_16": 0.135009765625, "loss_aux_layer_17": 0.142822265625, "loss_aux_layer_18": 0.1513671875, "loss_aux_layer_19": 0.15380859375, "loss_aux_layer_2": 0.06683349609375, "loss_aux_layer_20": 0.161376953125, "loss_aux_layer_21": 0.167724609375, "loss_aux_layer_22": 0.188720703125, "loss_aux_layer_23": 0.2294921875, "loss_aux_layer_3": 0.0791015625, "loss_aux_layer_4": 0.08203125, "loss_aux_layer_5": 0.0836181640625, "loss_aux_layer_6": 0.086669921875, "loss_aux_layer_7": 0.083251953125, "loss_aux_layer_8": 0.082275390625, "loss_aux_layer_9": 0.081298828125, "step": 1275, "total_loss": 0.752122089266777 }, { "epoch": 0.2526232429221936, "grad_norm": 1.421006441116333, "learning_rate": 5e-05, "llm_loss": 0.5639861226081848, "loss": 2.718, "loss_aux_layer_0": 0.024322509765625, "loss_aux_layer_1": 0.06097412109375, "loss_aux_layer_10": 0.088134765625, "loss_aux_layer_11": 0.093505859375, "loss_aux_layer_12": 0.100341796875, "loss_aux_layer_13": 0.1072998046875, "loss_aux_layer_14": 0.1181640625, "loss_aux_layer_15": 0.128173828125, "loss_aux_layer_16": 0.139404296875, "loss_aux_layer_17": 0.147216796875, "loss_aux_layer_18": 0.15673828125, "loss_aux_layer_19": 0.15869140625, "loss_aux_layer_2": 0.0689697265625, "loss_aux_layer_20": 0.16552734375, "loss_aux_layer_21": 0.172607421875, "loss_aux_layer_22": 0.19482421875, "loss_aux_layer_23": 0.234619140625, "loss_aux_layer_3": 0.0828857421875, "loss_aux_layer_4": 0.08642578125, "loss_aux_layer_5": 0.0887451171875, "loss_aux_layer_6": 0.092041015625, "loss_aux_layer_7": 0.0888671875, "loss_aux_layer_8": 0.0875244140625, "loss_aux_layer_9": 0.086181640625, "step": 1276, "total_loss": 0.67949278652668 }, { "epoch": 0.252821223520095, "grad_norm": 1.8614085912704468, "learning_rate": 5e-05, "llm_loss": 0.5510532483458519, "loss": 2.6418, "loss_aux_layer_0": 0.024169921875, "loss_aux_layer_1": 0.05731201171875, "loss_aux_layer_10": 0.080078125, "loss_aux_layer_11": 0.08544921875, "loss_aux_layer_12": 0.0926513671875, "loss_aux_layer_13": 0.0999755859375, "loss_aux_layer_14": 0.111328125, "loss_aux_layer_15": 0.1220703125, "loss_aux_layer_16": 0.13330078125, "loss_aux_layer_17": 0.141845703125, "loss_aux_layer_18": 0.15087890625, "loss_aux_layer_19": 0.153564453125, "loss_aux_layer_2": 0.0653076171875, "loss_aux_layer_20": 0.160888671875, "loss_aux_layer_21": 0.16748046875, "loss_aux_layer_22": 0.188720703125, "loss_aux_layer_23": 0.23046875, "loss_aux_layer_3": 0.0772705078125, "loss_aux_layer_4": 0.0799560546875, "loss_aux_layer_5": 0.0809326171875, "loss_aux_layer_6": 0.083740234375, "loss_aux_layer_7": 0.080322265625, "loss_aux_layer_8": 0.0794677734375, "loss_aux_layer_9": 0.0784912109375, "step": 1277, "total_loss": 0.660455122590065 }, { "epoch": 0.25301920411799644, "grad_norm": 2.7718772888183594, "learning_rate": 5e-05, "llm_loss": 0.6326511651277542, "loss": 2.9855, "loss_aux_layer_0": 0.0252685546875, "loss_aux_layer_1": 0.06103515625, "loss_aux_layer_10": 0.085693359375, "loss_aux_layer_11": 0.0911865234375, "loss_aux_layer_12": 0.09814453125, "loss_aux_layer_13": 0.105712890625, "loss_aux_layer_14": 0.1173095703125, "loss_aux_layer_15": 0.1278076171875, "loss_aux_layer_16": 0.138427734375, "loss_aux_layer_17": 0.146240234375, "loss_aux_layer_18": 0.154296875, "loss_aux_layer_19": 0.1552734375, "loss_aux_layer_2": 0.0703125, "loss_aux_layer_20": 0.162109375, "loss_aux_layer_21": 0.16845703125, "loss_aux_layer_22": 0.18896484375, "loss_aux_layer_23": 0.228515625, "loss_aux_layer_3": 0.0823974609375, "loss_aux_layer_4": 0.086181640625, "loss_aux_layer_5": 0.087646484375, "loss_aux_layer_6": 0.0909423828125, "loss_aux_layer_7": 0.087158203125, "loss_aux_layer_8": 0.0858154296875, "loss_aux_layer_9": 0.084228515625, "step": 1278, "total_loss": 0.7463846206665039 }, { "epoch": 0.2532171847158978, "grad_norm": 2.219233274459839, "learning_rate": 5e-05, "llm_loss": 0.5846325904130936, "loss": 2.7943, "loss_aux_layer_0": 0.028045654296875, "loss_aux_layer_1": 0.062255859375, "loss_aux_layer_10": 0.084716796875, "loss_aux_layer_11": 0.0902099609375, "loss_aux_layer_12": 0.0972900390625, "loss_aux_layer_13": 0.1046142578125, "loss_aux_layer_14": 0.116455078125, "loss_aux_layer_15": 0.1273193359375, "loss_aux_layer_16": 0.13818359375, "loss_aux_layer_17": 0.145751953125, "loss_aux_layer_18": 0.154296875, "loss_aux_layer_19": 0.15673828125, "loss_aux_layer_2": 0.0692138671875, "loss_aux_layer_20": 0.163818359375, "loss_aux_layer_21": 0.171630859375, "loss_aux_layer_22": 0.193115234375, "loss_aux_layer_23": 0.235107421875, "loss_aux_layer_3": 0.0823974609375, "loss_aux_layer_4": 0.0843505859375, "loss_aux_layer_5": 0.0863037109375, "loss_aux_layer_6": 0.089111328125, "loss_aux_layer_7": 0.0855712890625, "loss_aux_layer_8": 0.08447265625, "loss_aux_layer_9": 0.0831298828125, "step": 1279, "total_loss": 0.698579341173172 }, { "epoch": 0.25341516531379926, "grad_norm": 1.7840455770492554, "learning_rate": 5e-05, "llm_loss": 0.5957076400518417, "loss": 2.8505, "loss_aux_layer_0": 0.0252685546875, "loss_aux_layer_1": 0.0625, "loss_aux_layer_10": 0.0899658203125, "loss_aux_layer_11": 0.095458984375, "loss_aux_layer_12": 0.10205078125, "loss_aux_layer_13": 0.10888671875, "loss_aux_layer_14": 0.1201171875, "loss_aux_layer_15": 0.130615234375, "loss_aux_layer_16": 0.140625, "loss_aux_layer_17": 0.148193359375, "loss_aux_layer_18": 0.156494140625, "loss_aux_layer_19": 0.157958984375, "loss_aux_layer_2": 0.07470703125, "loss_aux_layer_20": 0.16357421875, "loss_aux_layer_21": 0.170166015625, "loss_aux_layer_22": 0.1923828125, "loss_aux_layer_23": 0.231689453125, "loss_aux_layer_3": 0.0877685546875, "loss_aux_layer_4": 0.0909423828125, "loss_aux_layer_5": 0.093017578125, "loss_aux_layer_6": 0.09521484375, "loss_aux_layer_7": 0.0909423828125, "loss_aux_layer_8": 0.0899658203125, "loss_aux_layer_9": 0.0882568359375, "step": 1280, "total_loss": 0.7126330435276031 }, { "epoch": 0.25361314591170064, "grad_norm": 1.8976596593856812, "learning_rate": 5e-05, "llm_loss": 0.5830715447664261, "loss": 2.7999, "loss_aux_layer_0": 0.02398681640625, "loss_aux_layer_1": 0.0599365234375, "loss_aux_layer_10": 0.0880126953125, "loss_aux_layer_11": 0.0936279296875, "loss_aux_layer_12": 0.100341796875, "loss_aux_layer_13": 0.10791015625, "loss_aux_layer_14": 0.120361328125, "loss_aux_layer_15": 0.1314697265625, "loss_aux_layer_16": 0.14306640625, "loss_aux_layer_17": 0.150390625, "loss_aux_layer_18": 0.1591796875, "loss_aux_layer_19": 0.161376953125, "loss_aux_layer_2": 0.07000732421875, "loss_aux_layer_20": 0.16845703125, "loss_aux_layer_21": 0.17578125, "loss_aux_layer_22": 0.197998046875, "loss_aux_layer_23": 0.23876953125, "loss_aux_layer_3": 0.083740234375, "loss_aux_layer_4": 0.0869140625, "loss_aux_layer_5": 0.0894775390625, "loss_aux_layer_6": 0.092529296875, "loss_aux_layer_7": 0.0882568359375, "loss_aux_layer_8": 0.087646484375, "loss_aux_layer_9": 0.08642578125, "step": 1281, "total_loss": 0.699965313076973 }, { "epoch": 0.2538111265096021, "grad_norm": 1.688167929649353, "learning_rate": 5e-05, "llm_loss": 0.594752624630928, "loss": 2.8431, "loss_aux_layer_0": 0.02496337890625, "loss_aux_layer_1": 0.06298828125, "loss_aux_layer_10": 0.0875244140625, "loss_aux_layer_11": 0.09326171875, "loss_aux_layer_12": 0.0997314453125, "loss_aux_layer_13": 0.107177734375, "loss_aux_layer_14": 0.1190185546875, "loss_aux_layer_15": 0.129638671875, "loss_aux_layer_16": 0.14013671875, "loss_aux_layer_17": 0.147705078125, "loss_aux_layer_18": 0.156494140625, "loss_aux_layer_19": 0.15869140625, "loss_aux_layer_2": 0.0723876953125, "loss_aux_layer_20": 0.166015625, "loss_aux_layer_21": 0.171875, "loss_aux_layer_22": 0.193359375, "loss_aux_layer_23": 0.23388671875, "loss_aux_layer_3": 0.0853271484375, "loss_aux_layer_4": 0.088623046875, "loss_aux_layer_5": 0.0908203125, "loss_aux_layer_6": 0.0927734375, "loss_aux_layer_7": 0.0889892578125, "loss_aux_layer_8": 0.0875244140625, "loss_aux_layer_9": 0.086181640625, "step": 1282, "total_loss": 0.7107661366462708 }, { "epoch": 0.25400910710750346, "grad_norm": 1.290854573249817, "learning_rate": 5e-05, "llm_loss": 0.703216165304184, "loss": 3.269, "loss_aux_layer_0": 0.024566650390625, "loss_aux_layer_1": 0.05950927734375, "loss_aux_layer_10": 0.08544921875, "loss_aux_layer_11": 0.0908203125, "loss_aux_layer_12": 0.09814453125, "loss_aux_layer_13": 0.10498046875, "loss_aux_layer_14": 0.1168212890625, "loss_aux_layer_15": 0.1282958984375, "loss_aux_layer_16": 0.13916015625, "loss_aux_layer_17": 0.147216796875, "loss_aux_layer_18": 0.15625, "loss_aux_layer_19": 0.15771484375, "loss_aux_layer_2": 0.0689697265625, "loss_aux_layer_20": 0.164794921875, "loss_aux_layer_21": 0.1708984375, "loss_aux_layer_22": 0.191650390625, "loss_aux_layer_23": 0.232666015625, "loss_aux_layer_3": 0.081787109375, "loss_aux_layer_4": 0.0850830078125, "loss_aux_layer_5": 0.087158203125, "loss_aux_layer_6": 0.0902099609375, "loss_aux_layer_7": 0.0863037109375, "loss_aux_layer_8": 0.0850830078125, "loss_aux_layer_9": 0.083740234375, "step": 1283, "total_loss": 0.8172468543052673 }, { "epoch": 0.25420708770540484, "grad_norm": 1.8063509464263916, "learning_rate": 5e-05, "llm_loss": 0.5689778253436089, "loss": 2.736, "loss_aux_layer_0": 0.02734375, "loss_aux_layer_1": 0.06103515625, "loss_aux_layer_10": 0.0858154296875, "loss_aux_layer_11": 0.0911865234375, "loss_aux_layer_12": 0.0977783203125, "loss_aux_layer_13": 0.105224609375, "loss_aux_layer_14": 0.1171875, "loss_aux_layer_15": 0.1279296875, "loss_aux_layer_16": 0.138916015625, "loss_aux_layer_17": 0.146728515625, "loss_aux_layer_18": 0.156005859375, "loss_aux_layer_19": 0.1591796875, "loss_aux_layer_2": 0.07049560546875, "loss_aux_layer_20": 0.166259765625, "loss_aux_layer_21": 0.173828125, "loss_aux_layer_22": 0.1962890625, "loss_aux_layer_23": 0.238525390625, "loss_aux_layer_3": 0.082275390625, "loss_aux_layer_4": 0.0849609375, "loss_aux_layer_5": 0.0870361328125, "loss_aux_layer_6": 0.0899658203125, "loss_aux_layer_7": 0.08642578125, "loss_aux_layer_8": 0.0853271484375, "loss_aux_layer_9": 0.084228515625, "step": 1284, "total_loss": 0.6840086281299591 }, { "epoch": 0.2544050683033063, "grad_norm": 1.2790162563323975, "learning_rate": 5e-05, "llm_loss": 0.6139280050992966, "loss": 2.9269, "loss_aux_layer_0": 0.025421142578125, "loss_aux_layer_1": 0.06475830078125, "loss_aux_layer_10": 0.089599609375, "loss_aux_layer_11": 0.0953369140625, "loss_aux_layer_12": 0.102294921875, "loss_aux_layer_13": 0.109619140625, "loss_aux_layer_14": 0.1212158203125, "loss_aux_layer_15": 0.1318359375, "loss_aux_layer_16": 0.143310546875, "loss_aux_layer_17": 0.150390625, "loss_aux_layer_18": 0.158935546875, "loss_aux_layer_19": 0.16015625, "loss_aux_layer_2": 0.0736083984375, "loss_aux_layer_20": 0.166748046875, "loss_aux_layer_21": 0.17333984375, "loss_aux_layer_22": 0.19482421875, "loss_aux_layer_23": 0.236572265625, "loss_aux_layer_3": 0.0863037109375, "loss_aux_layer_4": 0.0894775390625, "loss_aux_layer_5": 0.09130859375, "loss_aux_layer_6": 0.09423828125, "loss_aux_layer_7": 0.0906982421875, "loss_aux_layer_8": 0.0894775390625, "loss_aux_layer_9": 0.0880126953125, "step": 1285, "total_loss": 0.7317139506340027 }, { "epoch": 0.25460304890120766, "grad_norm": 1.9290387630462646, "learning_rate": 5e-05, "llm_loss": 0.7017326503992081, "loss": 3.2584, "loss_aux_layer_0": 0.024688720703125, "loss_aux_layer_1": 0.05975341796875, "loss_aux_layer_10": 0.0860595703125, "loss_aux_layer_11": 0.0919189453125, "loss_aux_layer_12": 0.0982666015625, "loss_aux_layer_13": 0.105224609375, "loss_aux_layer_14": 0.1160888671875, "loss_aux_layer_15": 0.1263427734375, "loss_aux_layer_16": 0.136474609375, "loss_aux_layer_17": 0.14404296875, "loss_aux_layer_18": 0.152099609375, "loss_aux_layer_19": 0.15283203125, "loss_aux_layer_2": 0.069580078125, "loss_aux_layer_20": 0.15966796875, "loss_aux_layer_21": 0.166259765625, "loss_aux_layer_22": 0.1875, "loss_aux_layer_23": 0.2275390625, "loss_aux_layer_3": 0.08251953125, "loss_aux_layer_4": 0.0859375, "loss_aux_layer_5": 0.0875244140625, "loss_aux_layer_6": 0.0904541015625, "loss_aux_layer_7": 0.0869140625, "loss_aux_layer_8": 0.0859375, "loss_aux_layer_9": 0.084716796875, "step": 1286, "total_loss": 0.8146089017391205 }, { "epoch": 0.2548010294991091, "grad_norm": 1.7592958211898804, "learning_rate": 5e-05, "llm_loss": 0.6122603416442871, "loss": 2.8905, "loss_aux_layer_0": 0.025787353515625, "loss_aux_layer_1": 0.05670166015625, "loss_aux_layer_10": 0.081298828125, "loss_aux_layer_11": 0.08642578125, "loss_aux_layer_12": 0.0927734375, "loss_aux_layer_13": 0.10009765625, "loss_aux_layer_14": 0.1114501953125, "loss_aux_layer_15": 0.1229248046875, "loss_aux_layer_16": 0.13427734375, "loss_aux_layer_17": 0.142333984375, "loss_aux_layer_18": 0.152099609375, "loss_aux_layer_19": 0.155029296875, "loss_aux_layer_2": 0.0655517578125, "loss_aux_layer_20": 0.16259765625, "loss_aux_layer_21": 0.169189453125, "loss_aux_layer_22": 0.1904296875, "loss_aux_layer_23": 0.2314453125, "loss_aux_layer_3": 0.0772705078125, "loss_aux_layer_4": 0.0802001953125, "loss_aux_layer_5": 0.0821533203125, "loss_aux_layer_6": 0.0850830078125, "loss_aux_layer_7": 0.081787109375, "loss_aux_layer_8": 0.0809326171875, "loss_aux_layer_9": 0.079833984375, "step": 1287, "total_loss": 0.7226127237081528 }, { "epoch": 0.2549990100970105, "grad_norm": 1.4213284254074097, "learning_rate": 5e-05, "llm_loss": 0.6441207081079483, "loss": 3.0154, "loss_aux_layer_0": 0.024200439453125, "loss_aux_layer_1": 0.0557861328125, "loss_aux_layer_10": 0.0814208984375, "loss_aux_layer_11": 0.0867919921875, "loss_aux_layer_12": 0.09375, "loss_aux_layer_13": 0.1007080078125, "loss_aux_layer_14": 0.112060546875, "loss_aux_layer_15": 0.122802734375, "loss_aux_layer_16": 0.134033203125, "loss_aux_layer_17": 0.14306640625, "loss_aux_layer_18": 0.151123046875, "loss_aux_layer_19": 0.15380859375, "loss_aux_layer_2": 0.06475830078125, "loss_aux_layer_20": 0.1611328125, "loss_aux_layer_21": 0.166748046875, "loss_aux_layer_22": 0.18798828125, "loss_aux_layer_23": 0.227783203125, "loss_aux_layer_3": 0.0767822265625, "loss_aux_layer_4": 0.0797119140625, "loss_aux_layer_5": 0.0816650390625, "loss_aux_layer_6": 0.0848388671875, "loss_aux_layer_7": 0.0814208984375, "loss_aux_layer_8": 0.080322265625, "loss_aux_layer_9": 0.0797119140625, "step": 1288, "total_loss": 0.7538401633501053 }, { "epoch": 0.2551969906949119, "grad_norm": 1.1743139028549194, "learning_rate": 5e-05, "llm_loss": 0.5859153717756271, "loss": 2.7973, "loss_aux_layer_0": 0.023712158203125, "loss_aux_layer_1": 0.060546875, "loss_aux_layer_10": 0.085693359375, "loss_aux_layer_11": 0.09130859375, "loss_aux_layer_12": 0.097900390625, "loss_aux_layer_13": 0.1048583984375, "loss_aux_layer_14": 0.11572265625, "loss_aux_layer_15": 0.1260986328125, "loss_aux_layer_16": 0.137939453125, "loss_aux_layer_17": 0.1455078125, "loss_aux_layer_18": 0.154541015625, "loss_aux_layer_19": 0.156494140625, "loss_aux_layer_2": 0.0689697265625, "loss_aux_layer_20": 0.1630859375, "loss_aux_layer_21": 0.169921875, "loss_aux_layer_22": 0.189697265625, "loss_aux_layer_23": 0.23046875, "loss_aux_layer_3": 0.0821533203125, "loss_aux_layer_4": 0.0850830078125, "loss_aux_layer_5": 0.0869140625, "loss_aux_layer_6": 0.0897216796875, "loss_aux_layer_7": 0.086181640625, "loss_aux_layer_8": 0.0850830078125, "loss_aux_layer_9": 0.0841064453125, "step": 1289, "total_loss": 0.6993148624897003 }, { "epoch": 0.2553949712928133, "grad_norm": 1.304418683052063, "learning_rate": 5e-05, "llm_loss": 0.6258173584938049, "loss": 2.9415, "loss_aux_layer_0": 0.02593994140625, "loss_aux_layer_1": 0.0565185546875, "loss_aux_layer_10": 0.07958984375, "loss_aux_layer_11": 0.0849609375, "loss_aux_layer_12": 0.092041015625, "loss_aux_layer_13": 0.0999755859375, "loss_aux_layer_14": 0.1123046875, "loss_aux_layer_15": 0.123291015625, "loss_aux_layer_16": 0.134765625, "loss_aux_layer_17": 0.14306640625, "loss_aux_layer_18": 0.152099609375, "loss_aux_layer_19": 0.155029296875, "loss_aux_layer_2": 0.0643310546875, "loss_aux_layer_20": 0.162109375, "loss_aux_layer_21": 0.168701171875, "loss_aux_layer_22": 0.189208984375, "loss_aux_layer_23": 0.22998046875, "loss_aux_layer_3": 0.075927734375, "loss_aux_layer_4": 0.07861328125, "loss_aux_layer_5": 0.08056640625, "loss_aux_layer_6": 0.0833740234375, "loss_aux_layer_7": 0.0797119140625, "loss_aux_layer_8": 0.0789794921875, "loss_aux_layer_9": 0.078125, "step": 1290, "total_loss": 0.7353640198707581 }, { "epoch": 0.25559295189071474, "grad_norm": 1.2014409303665161, "learning_rate": 5e-05, "llm_loss": 0.5982755124568939, "loss": 2.8304, "loss_aux_layer_0": 0.025146484375, "loss_aux_layer_1": 0.05908203125, "loss_aux_layer_10": 0.0811767578125, "loss_aux_layer_11": 0.086181640625, "loss_aux_layer_12": 0.0924072265625, "loss_aux_layer_13": 0.099853515625, "loss_aux_layer_14": 0.1104736328125, "loss_aux_layer_15": 0.1212158203125, "loss_aux_layer_16": 0.131591796875, "loss_aux_layer_17": 0.13916015625, "loss_aux_layer_18": 0.14892578125, "loss_aux_layer_19": 0.151123046875, "loss_aux_layer_2": 0.06597900390625, "loss_aux_layer_20": 0.15869140625, "loss_aux_layer_21": 0.166015625, "loss_aux_layer_22": 0.187744140625, "loss_aux_layer_23": 0.228271484375, "loss_aux_layer_3": 0.0780029296875, "loss_aux_layer_4": 0.0809326171875, "loss_aux_layer_5": 0.0830078125, "loss_aux_layer_6": 0.08544921875, "loss_aux_layer_7": 0.0821533203125, "loss_aux_layer_8": 0.0810546875, "loss_aux_layer_9": 0.0799560546875, "step": 1291, "total_loss": 0.707604244351387 }, { "epoch": 0.2557909324886161, "grad_norm": 1.2909471988677979, "learning_rate": 5e-05, "llm_loss": 0.5985634475946426, "loss": 2.8549, "loss_aux_layer_0": 0.0250244140625, "loss_aux_layer_1": 0.06427001953125, "loss_aux_layer_10": 0.0894775390625, "loss_aux_layer_11": 0.09521484375, "loss_aux_layer_12": 0.1015625, "loss_aux_layer_13": 0.1085205078125, "loss_aux_layer_14": 0.1187744140625, "loss_aux_layer_15": 0.1279296875, "loss_aux_layer_16": 0.13720703125, "loss_aux_layer_17": 0.14404296875, "loss_aux_layer_18": 0.152099609375, "loss_aux_layer_19": 0.152587890625, "loss_aux_layer_2": 0.0731201171875, "loss_aux_layer_20": 0.15869140625, "loss_aux_layer_21": 0.165771484375, "loss_aux_layer_22": 0.1884765625, "loss_aux_layer_23": 0.229248046875, "loss_aux_layer_3": 0.0869140625, "loss_aux_layer_4": 0.090087890625, "loss_aux_layer_5": 0.0919189453125, "loss_aux_layer_6": 0.09423828125, "loss_aux_layer_7": 0.0906982421875, "loss_aux_layer_8": 0.08935546875, "loss_aux_layer_9": 0.087890625, "step": 1292, "total_loss": 0.7137296497821808 }, { "epoch": 0.2559889130865175, "grad_norm": 0.8896702527999878, "learning_rate": 5e-05, "llm_loss": 0.5523487105965614, "loss": 2.6416, "loss_aux_layer_0": 0.0250244140625, "loss_aux_layer_1": 0.056396484375, "loss_aux_layer_10": 0.0799560546875, "loss_aux_layer_11": 0.0850830078125, "loss_aux_layer_12": 0.0911865234375, "loss_aux_layer_13": 0.097900390625, "loss_aux_layer_14": 0.10888671875, "loss_aux_layer_15": 0.1199951171875, "loss_aux_layer_16": 0.1309814453125, "loss_aux_layer_17": 0.139404296875, "loss_aux_layer_18": 0.14794921875, "loss_aux_layer_19": 0.150634765625, "loss_aux_layer_2": 0.06353759765625, "loss_aux_layer_20": 0.158203125, "loss_aux_layer_21": 0.16552734375, "loss_aux_layer_22": 0.187744140625, "loss_aux_layer_23": 0.228515625, "loss_aux_layer_3": 0.0751953125, "loss_aux_layer_4": 0.0784912109375, "loss_aux_layer_5": 0.0806884765625, "loss_aux_layer_6": 0.0838623046875, "loss_aux_layer_7": 0.0806884765625, "loss_aux_layer_8": 0.07958984375, "loss_aux_layer_9": 0.0784912109375, "step": 1293, "total_loss": 0.6603992283344269 }, { "epoch": 0.25618689368441894, "grad_norm": 1.372287392616272, "learning_rate": 5e-05, "llm_loss": 0.578053392469883, "loss": 2.7706, "loss_aux_layer_0": 0.024658203125, "loss_aux_layer_1": 0.062744140625, "loss_aux_layer_10": 0.0860595703125, "loss_aux_layer_11": 0.0916748046875, "loss_aux_layer_12": 0.0986328125, "loss_aux_layer_13": 0.10595703125, "loss_aux_layer_14": 0.1173095703125, "loss_aux_layer_15": 0.1279296875, "loss_aux_layer_16": 0.138427734375, "loss_aux_layer_17": 0.14599609375, "loss_aux_layer_18": 0.155517578125, "loss_aux_layer_19": 0.1572265625, "loss_aux_layer_2": 0.0709228515625, "loss_aux_layer_20": 0.16455078125, "loss_aux_layer_21": 0.171630859375, "loss_aux_layer_22": 0.19287109375, "loss_aux_layer_23": 0.233642578125, "loss_aux_layer_3": 0.08349609375, "loss_aux_layer_4": 0.086181640625, "loss_aux_layer_5": 0.087890625, "loss_aux_layer_6": 0.09033203125, "loss_aux_layer_7": 0.0869140625, "loss_aux_layer_8": 0.085693359375, "loss_aux_layer_9": 0.08447265625, "step": 1294, "total_loss": 0.6926529258489609 }, { "epoch": 0.2563848742823203, "grad_norm": 1.050334095954895, "learning_rate": 5e-05, "llm_loss": 0.6071358025074005, "loss": 2.8729, "loss_aux_layer_0": 0.027252197265625, "loss_aux_layer_1": 0.0579833984375, "loss_aux_layer_10": 0.08154296875, "loss_aux_layer_11": 0.086669921875, "loss_aux_layer_12": 0.09326171875, "loss_aux_layer_13": 0.100830078125, "loss_aux_layer_14": 0.11279296875, "loss_aux_layer_15": 0.1240234375, "loss_aux_layer_16": 0.135498046875, "loss_aux_layer_17": 0.14404296875, "loss_aux_layer_18": 0.15283203125, "loss_aux_layer_19": 0.156005859375, "loss_aux_layer_2": 0.0657958984375, "loss_aux_layer_20": 0.164306640625, "loss_aux_layer_21": 0.1708984375, "loss_aux_layer_22": 0.190673828125, "loss_aux_layer_23": 0.231201171875, "loss_aux_layer_3": 0.0777587890625, "loss_aux_layer_4": 0.080810546875, "loss_aux_layer_5": 0.0828857421875, "loss_aux_layer_6": 0.0859375, "loss_aux_layer_7": 0.0823974609375, "loss_aux_layer_8": 0.081298828125, "loss_aux_layer_9": 0.0799560546875, "step": 1295, "total_loss": 0.7182261198759079 }, { "epoch": 0.25658285488022176, "grad_norm": 2.0076634883880615, "learning_rate": 5e-05, "llm_loss": 0.6280806660652161, "loss": 2.9613, "loss_aux_layer_0": 0.024993896484375, "loss_aux_layer_1": 0.06103515625, "loss_aux_layer_10": 0.0836181640625, "loss_aux_layer_11": 0.0888671875, "loss_aux_layer_12": 0.0950927734375, "loss_aux_layer_13": 0.1019287109375, "loss_aux_layer_14": 0.11328125, "loss_aux_layer_15": 0.1240234375, "loss_aux_layer_16": 0.1341552734375, "loss_aux_layer_17": 0.14208984375, "loss_aux_layer_18": 0.15185546875, "loss_aux_layer_19": 0.154052734375, "loss_aux_layer_2": 0.0701904296875, "loss_aux_layer_20": 0.161376953125, "loss_aux_layer_21": 0.169189453125, "loss_aux_layer_22": 0.191650390625, "loss_aux_layer_23": 0.234130859375, "loss_aux_layer_3": 0.082275390625, "loss_aux_layer_4": 0.084716796875, "loss_aux_layer_5": 0.0865478515625, "loss_aux_layer_6": 0.0889892578125, "loss_aux_layer_7": 0.0850830078125, "loss_aux_layer_8": 0.0836181640625, "loss_aux_layer_9": 0.0821533203125, "step": 1296, "total_loss": 0.7403235137462616 }, { "epoch": 0.25678083547812314, "grad_norm": 1.353153109550476, "learning_rate": 5e-05, "llm_loss": 0.6618471741676331, "loss": 3.085, "loss_aux_layer_0": 0.023040771484375, "loss_aux_layer_1": 0.05596923828125, "loss_aux_layer_10": 0.08203125, "loss_aux_layer_11": 0.08740234375, "loss_aux_layer_12": 0.0941162109375, "loss_aux_layer_13": 0.1014404296875, "loss_aux_layer_14": 0.11279296875, "loss_aux_layer_15": 0.1234130859375, "loss_aux_layer_16": 0.1337890625, "loss_aux_layer_17": 0.1416015625, "loss_aux_layer_18": 0.14990234375, "loss_aux_layer_19": 0.151123046875, "loss_aux_layer_2": 0.065673828125, "loss_aux_layer_20": 0.157958984375, "loss_aux_layer_21": 0.164794921875, "loss_aux_layer_22": 0.185302734375, "loss_aux_layer_23": 0.224365234375, "loss_aux_layer_3": 0.07763671875, "loss_aux_layer_4": 0.0806884765625, "loss_aux_layer_5": 0.08251953125, "loss_aux_layer_6": 0.0855712890625, "loss_aux_layer_7": 0.082275390625, "loss_aux_layer_8": 0.081787109375, "loss_aux_layer_9": 0.080322265625, "step": 1297, "total_loss": 0.7712482064962387 }, { "epoch": 0.2569788160760246, "grad_norm": 1.2743409872055054, "learning_rate": 5e-05, "llm_loss": 0.6347986906766891, "loss": 2.9983, "loss_aux_layer_0": 0.02471923828125, "loss_aux_layer_1": 0.0594482421875, "loss_aux_layer_10": 0.085693359375, "loss_aux_layer_11": 0.0911865234375, "loss_aux_layer_12": 0.0980224609375, "loss_aux_layer_13": 0.1055908203125, "loss_aux_layer_14": 0.1175537109375, "loss_aux_layer_15": 0.12939453125, "loss_aux_layer_16": 0.141845703125, "loss_aux_layer_17": 0.149658203125, "loss_aux_layer_18": 0.158935546875, "loss_aux_layer_19": 0.16162109375, "loss_aux_layer_2": 0.069580078125, "loss_aux_layer_20": 0.167724609375, "loss_aux_layer_21": 0.172607421875, "loss_aux_layer_22": 0.1923828125, "loss_aux_layer_23": 0.23291015625, "loss_aux_layer_3": 0.081787109375, "loss_aux_layer_4": 0.08447265625, "loss_aux_layer_5": 0.0863037109375, "loss_aux_layer_6": 0.0892333984375, "loss_aux_layer_7": 0.085693359375, "loss_aux_layer_8": 0.084716796875, "loss_aux_layer_9": 0.0838623046875, "step": 1298, "total_loss": 0.7495786547660828 }, { "epoch": 0.25717679667392596, "grad_norm": 1.1916447877883911, "learning_rate": 5e-05, "llm_loss": 0.5853987336158752, "loss": 2.8005, "loss_aux_layer_0": 0.023162841796875, "loss_aux_layer_1": 0.0589599609375, "loss_aux_layer_10": 0.0867919921875, "loss_aux_layer_11": 0.0926513671875, "loss_aux_layer_12": 0.0997314453125, "loss_aux_layer_13": 0.107177734375, "loss_aux_layer_14": 0.1185302734375, "loss_aux_layer_15": 0.129150390625, "loss_aux_layer_16": 0.14013671875, "loss_aux_layer_17": 0.148681640625, "loss_aux_layer_18": 0.1572265625, "loss_aux_layer_19": 0.158935546875, "loss_aux_layer_2": 0.06982421875, "loss_aux_layer_20": 0.165771484375, "loss_aux_layer_21": 0.17138671875, "loss_aux_layer_22": 0.191650390625, "loss_aux_layer_23": 0.230224609375, "loss_aux_layer_3": 0.0828857421875, "loss_aux_layer_4": 0.0858154296875, "loss_aux_layer_5": 0.08740234375, "loss_aux_layer_6": 0.0899658203125, "loss_aux_layer_7": 0.0867919921875, "loss_aux_layer_8": 0.0858154296875, "loss_aux_layer_9": 0.084716796875, "step": 1299, "total_loss": 0.7001217901706696 }, { "epoch": 0.25737477727182734, "grad_norm": 1.1067131757736206, "learning_rate": 5e-05, "llm_loss": 0.6216893792152405, "loss": 2.9149, "loss_aux_layer_0": 0.022735595703125, "loss_aux_layer_1": 0.0545654296875, "loss_aux_layer_10": 0.078369140625, "loss_aux_layer_11": 0.0830078125, "loss_aux_layer_12": 0.089599609375, "loss_aux_layer_13": 0.09716796875, "loss_aux_layer_14": 0.1082763671875, "loss_aux_layer_15": 0.1187744140625, "loss_aux_layer_16": 0.1298828125, "loss_aux_layer_17": 0.138427734375, "loss_aux_layer_18": 0.1484375, "loss_aux_layer_19": 0.151123046875, "loss_aux_layer_2": 0.06414794921875, "loss_aux_layer_20": 0.15869140625, "loss_aux_layer_21": 0.166015625, "loss_aux_layer_22": 0.185302734375, "loss_aux_layer_23": 0.224853515625, "loss_aux_layer_3": 0.075439453125, "loss_aux_layer_4": 0.0780029296875, "loss_aux_layer_5": 0.07958984375, "loss_aux_layer_6": 0.0826416015625, "loss_aux_layer_7": 0.0792236328125, "loss_aux_layer_8": 0.077880859375, "loss_aux_layer_9": 0.076904296875, "step": 1300, "total_loss": 0.7287245392799377 }, { "epoch": 0.2575727578697288, "grad_norm": 1.1546233892440796, "learning_rate": 5e-05, "llm_loss": 0.6334140598773956, "loss": 2.9705, "loss_aux_layer_0": 0.02325439453125, "loss_aux_layer_1": 0.05670166015625, "loss_aux_layer_10": 0.0810546875, "loss_aux_layer_11": 0.0865478515625, "loss_aux_layer_12": 0.0927734375, "loss_aux_layer_13": 0.1002197265625, "loss_aux_layer_14": 0.1109619140625, "loss_aux_layer_15": 0.1214599609375, "loss_aux_layer_16": 0.132568359375, "loss_aux_layer_17": 0.140869140625, "loss_aux_layer_18": 0.14990234375, "loss_aux_layer_19": 0.15283203125, "loss_aux_layer_2": 0.06640625, "loss_aux_layer_20": 0.15966796875, "loss_aux_layer_21": 0.166259765625, "loss_aux_layer_22": 0.18603515625, "loss_aux_layer_23": 0.226318359375, "loss_aux_layer_3": 0.078369140625, "loss_aux_layer_4": 0.0809326171875, "loss_aux_layer_5": 0.0823974609375, "loss_aux_layer_6": 0.0849609375, "loss_aux_layer_7": 0.0811767578125, "loss_aux_layer_8": 0.080322265625, "loss_aux_layer_9": 0.079345703125, "step": 1301, "total_loss": 0.7426207363605499 }, { "epoch": 0.25777073846763016, "grad_norm": 1.3983545303344727, "learning_rate": 5e-05, "llm_loss": 0.7183940708637238, "loss": 3.3143, "loss_aux_layer_0": 0.0234375, "loss_aux_layer_1": 0.05657958984375, "loss_aux_layer_10": 0.081298828125, "loss_aux_layer_11": 0.086669921875, "loss_aux_layer_12": 0.093505859375, "loss_aux_layer_13": 0.1011962890625, "loss_aux_layer_14": 0.11279296875, "loss_aux_layer_15": 0.1236572265625, "loss_aux_layer_16": 0.135009765625, "loss_aux_layer_17": 0.1435546875, "loss_aux_layer_18": 0.15234375, "loss_aux_layer_19": 0.153564453125, "loss_aux_layer_2": 0.06689453125, "loss_aux_layer_20": 0.16064453125, "loss_aux_layer_21": 0.1669921875, "loss_aux_layer_22": 0.1884765625, "loss_aux_layer_23": 0.228515625, "loss_aux_layer_3": 0.0782470703125, "loss_aux_layer_4": 0.0810546875, "loss_aux_layer_5": 0.08251953125, "loss_aux_layer_6": 0.0855712890625, "loss_aux_layer_7": 0.0819091796875, "loss_aux_layer_8": 0.080810546875, "loss_aux_layer_9": 0.07958984375, "step": 1302, "total_loss": 0.8285719156265259 }, { "epoch": 0.2579687190655316, "grad_norm": 2.149601936340332, "learning_rate": 5e-05, "llm_loss": 0.6235194057226181, "loss": 2.9476, "loss_aux_layer_0": 0.025726318359375, "loss_aux_layer_1": 0.06011962890625, "loss_aux_layer_10": 0.0848388671875, "loss_aux_layer_11": 0.0902099609375, "loss_aux_layer_12": 0.09716796875, "loss_aux_layer_13": 0.1044921875, "loss_aux_layer_14": 0.11572265625, "loss_aux_layer_15": 0.1263427734375, "loss_aux_layer_16": 0.1376953125, "loss_aux_layer_17": 0.145263671875, "loss_aux_layer_18": 0.154541015625, "loss_aux_layer_19": 0.15673828125, "loss_aux_layer_2": 0.070556640625, "loss_aux_layer_20": 0.163330078125, "loss_aux_layer_21": 0.169921875, "loss_aux_layer_22": 0.191650390625, "loss_aux_layer_23": 0.23291015625, "loss_aux_layer_3": 0.0819091796875, "loss_aux_layer_4": 0.08447265625, "loss_aux_layer_5": 0.086181640625, "loss_aux_layer_6": 0.088623046875, "loss_aux_layer_7": 0.08544921875, "loss_aux_layer_8": 0.084228515625, "loss_aux_layer_9": 0.0828857421875, "step": 1303, "total_loss": 0.7368928492069244 }, { "epoch": 0.258166699663433, "grad_norm": 1.3603177070617676, "learning_rate": 5e-05, "llm_loss": 0.6088439077138901, "loss": 2.8928, "loss_aux_layer_0": 0.023193359375, "loss_aux_layer_1": 0.05914306640625, "loss_aux_layer_10": 0.0865478515625, "loss_aux_layer_11": 0.0919189453125, "loss_aux_layer_12": 0.0986328125, "loss_aux_layer_13": 0.10595703125, "loss_aux_layer_14": 0.1173095703125, "loss_aux_layer_15": 0.12841796875, "loss_aux_layer_16": 0.1396484375, "loss_aux_layer_17": 0.14794921875, "loss_aux_layer_18": 0.156982421875, "loss_aux_layer_19": 0.158203125, "loss_aux_layer_2": 0.0706787109375, "loss_aux_layer_20": 0.1640625, "loss_aux_layer_21": 0.169189453125, "loss_aux_layer_22": 0.1884765625, "loss_aux_layer_23": 0.227294921875, "loss_aux_layer_3": 0.0838623046875, "loss_aux_layer_4": 0.08740234375, "loss_aux_layer_5": 0.0888671875, "loss_aux_layer_6": 0.09130859375, "loss_aux_layer_7": 0.0877685546875, "loss_aux_layer_8": 0.0863037109375, "loss_aux_layer_9": 0.0850830078125, "step": 1304, "total_loss": 0.723207101225853 }, { "epoch": 0.2583646802613344, "grad_norm": 1.6279000043869019, "learning_rate": 5e-05, "llm_loss": 0.6292210519313812, "loss": 2.9741, "loss_aux_layer_0": 0.02667236328125, "loss_aux_layer_1": 0.063720703125, "loss_aux_layer_10": 0.086181640625, "loss_aux_layer_11": 0.0916748046875, "loss_aux_layer_12": 0.09814453125, "loss_aux_layer_13": 0.10498046875, "loss_aux_layer_14": 0.1156005859375, "loss_aux_layer_15": 0.1258544921875, "loss_aux_layer_16": 0.135986328125, "loss_aux_layer_17": 0.14404296875, "loss_aux_layer_18": 0.15283203125, "loss_aux_layer_19": 0.15478515625, "loss_aux_layer_2": 0.0733642578125, "loss_aux_layer_20": 0.162353515625, "loss_aux_layer_21": 0.170166015625, "loss_aux_layer_22": 0.19287109375, "loss_aux_layer_23": 0.23388671875, "loss_aux_layer_3": 0.0849609375, "loss_aux_layer_4": 0.08740234375, "loss_aux_layer_5": 0.0885009765625, "loss_aux_layer_6": 0.0909423828125, "loss_aux_layer_7": 0.0872802734375, "loss_aux_layer_8": 0.0860595703125, "loss_aux_layer_9": 0.0845947265625, "step": 1305, "total_loss": 0.7435149550437927 }, { "epoch": 0.2585626608592358, "grad_norm": 1.7521082162857056, "learning_rate": 5e-05, "llm_loss": 0.6168878376483917, "loss": 2.9145, "loss_aux_layer_0": 0.025360107421875, "loss_aux_layer_1": 0.05780029296875, "loss_aux_layer_10": 0.083984375, "loss_aux_layer_11": 0.089111328125, "loss_aux_layer_12": 0.095703125, "loss_aux_layer_13": 0.10302734375, "loss_aux_layer_14": 0.1142578125, "loss_aux_layer_15": 0.125, "loss_aux_layer_16": 0.1357421875, "loss_aux_layer_17": 0.142578125, "loss_aux_layer_18": 0.151123046875, "loss_aux_layer_19": 0.153076171875, "loss_aux_layer_2": 0.06884765625, "loss_aux_layer_20": 0.159912109375, "loss_aux_layer_21": 0.16650390625, "loss_aux_layer_22": 0.1875, "loss_aux_layer_23": 0.227783203125, "loss_aux_layer_3": 0.081298828125, "loss_aux_layer_4": 0.0848388671875, "loss_aux_layer_5": 0.0870361328125, "loss_aux_layer_6": 0.08935546875, "loss_aux_layer_7": 0.085205078125, "loss_aux_layer_8": 0.0841064453125, "loss_aux_layer_9": 0.0826416015625, "step": 1306, "total_loss": 0.7286266684532166 }, { "epoch": 0.2587606414571372, "grad_norm": 1.5364875793457031, "learning_rate": 5e-05, "llm_loss": 0.5939894616603851, "loss": 2.8203, "loss_aux_layer_0": 0.023162841796875, "loss_aux_layer_1": 0.05670166015625, "loss_aux_layer_10": 0.083740234375, "loss_aux_layer_11": 0.0889892578125, "loss_aux_layer_12": 0.095703125, "loss_aux_layer_13": 0.1024169921875, "loss_aux_layer_14": 0.113525390625, "loss_aux_layer_15": 0.1243896484375, "loss_aux_layer_16": 0.134765625, "loss_aux_layer_17": 0.142822265625, "loss_aux_layer_18": 0.151123046875, "loss_aux_layer_19": 0.153564453125, "loss_aux_layer_2": 0.0682373046875, "loss_aux_layer_20": 0.160400390625, "loss_aux_layer_21": 0.16650390625, "loss_aux_layer_22": 0.1875, "loss_aux_layer_23": 0.2275390625, "loss_aux_layer_3": 0.08056640625, "loss_aux_layer_4": 0.08349609375, "loss_aux_layer_5": 0.0848388671875, "loss_aux_layer_6": 0.08740234375, "loss_aux_layer_7": 0.0838623046875, "loss_aux_layer_8": 0.083251953125, "loss_aux_layer_9": 0.0819091796875, "step": 1307, "total_loss": 0.7050834894180298 }, { "epoch": 0.2589586220550386, "grad_norm": 1.6722321510314941, "learning_rate": 5e-05, "llm_loss": 0.6044445633888245, "loss": 2.8674, "loss_aux_layer_0": 0.029052734375, "loss_aux_layer_1": 0.0623779296875, "loss_aux_layer_10": 0.0849609375, "loss_aux_layer_11": 0.08984375, "loss_aux_layer_12": 0.096435546875, "loss_aux_layer_13": 0.1033935546875, "loss_aux_layer_14": 0.114013671875, "loss_aux_layer_15": 0.1239013671875, "loss_aux_layer_16": 0.134521484375, "loss_aux_layer_17": 0.14208984375, "loss_aux_layer_18": 0.150634765625, "loss_aux_layer_19": 0.15283203125, "loss_aux_layer_2": 0.0706787109375, "loss_aux_layer_20": 0.159423828125, "loss_aux_layer_21": 0.16650390625, "loss_aux_layer_22": 0.1875, "loss_aux_layer_23": 0.22802734375, "loss_aux_layer_3": 0.082763671875, "loss_aux_layer_4": 0.085693359375, "loss_aux_layer_5": 0.0877685546875, "loss_aux_layer_6": 0.090087890625, "loss_aux_layer_7": 0.0865478515625, "loss_aux_layer_8": 0.085205078125, "loss_aux_layer_9": 0.0833740234375, "step": 1308, "total_loss": 0.7168563902378082 }, { "epoch": 0.25915660265294, "grad_norm": 2.3192007541656494, "learning_rate": 5e-05, "llm_loss": 0.6958673447370529, "loss": 3.2346, "loss_aux_layer_0": 0.025054931640625, "loss_aux_layer_1": 0.0579833984375, "loss_aux_layer_10": 0.0841064453125, "loss_aux_layer_11": 0.08935546875, "loss_aux_layer_12": 0.096435546875, "loss_aux_layer_13": 0.1041259765625, "loss_aux_layer_14": 0.1156005859375, "loss_aux_layer_15": 0.126953125, "loss_aux_layer_16": 0.138427734375, "loss_aux_layer_17": 0.146484375, "loss_aux_layer_18": 0.154541015625, "loss_aux_layer_19": 0.156494140625, "loss_aux_layer_2": 0.06884765625, "loss_aux_layer_20": 0.16357421875, "loss_aux_layer_21": 0.170166015625, "loss_aux_layer_22": 0.19091796875, "loss_aux_layer_23": 0.22998046875, "loss_aux_layer_3": 0.080810546875, "loss_aux_layer_4": 0.083251953125, "loss_aux_layer_5": 0.0853271484375, "loss_aux_layer_6": 0.0877685546875, "loss_aux_layer_7": 0.0843505859375, "loss_aux_layer_8": 0.0836181640625, "loss_aux_layer_9": 0.082275390625, "step": 1309, "total_loss": 0.8086437582969666 }, { "epoch": 0.25935458325084143, "grad_norm": 1.7818682193756104, "learning_rate": 5e-05, "llm_loss": 0.6068730801343918, "loss": 2.8926, "loss_aux_layer_0": 0.022003173828125, "loss_aux_layer_1": 0.061279296875, "loss_aux_layer_10": 0.08984375, "loss_aux_layer_11": 0.095703125, "loss_aux_layer_12": 0.102783203125, "loss_aux_layer_13": 0.10986328125, "loss_aux_layer_14": 0.1199951171875, "loss_aux_layer_15": 0.130615234375, "loss_aux_layer_16": 0.140869140625, "loss_aux_layer_17": 0.147705078125, "loss_aux_layer_18": 0.15576171875, "loss_aux_layer_19": 0.15673828125, "loss_aux_layer_2": 0.0723876953125, "loss_aux_layer_20": 0.16259765625, "loss_aux_layer_21": 0.1689453125, "loss_aux_layer_22": 0.189697265625, "loss_aux_layer_23": 0.22900390625, "loss_aux_layer_3": 0.087890625, "loss_aux_layer_4": 0.0909423828125, "loss_aux_layer_5": 0.092529296875, "loss_aux_layer_6": 0.0943603515625, "loss_aux_layer_7": 0.0906982421875, "loss_aux_layer_8": 0.0894775390625, "loss_aux_layer_9": 0.088134765625, "step": 1310, "total_loss": 0.7231548875570297 }, { "epoch": 0.2595525638487428, "grad_norm": 1.4831933975219727, "learning_rate": 5e-05, "llm_loss": 0.6040475368499756, "loss": 2.8665, "loss_aux_layer_0": 0.022796630859375, "loss_aux_layer_1": 0.056884765625, "loss_aux_layer_10": 0.08544921875, "loss_aux_layer_11": 0.0908203125, "loss_aux_layer_12": 0.09765625, "loss_aux_layer_13": 0.10498046875, "loss_aux_layer_14": 0.115478515625, "loss_aux_layer_15": 0.1260986328125, "loss_aux_layer_16": 0.136474609375, "loss_aux_layer_17": 0.14453125, "loss_aux_layer_18": 0.1533203125, "loss_aux_layer_19": 0.155517578125, "loss_aux_layer_2": 0.06884765625, "loss_aux_layer_20": 0.162109375, "loss_aux_layer_21": 0.16796875, "loss_aux_layer_22": 0.189208984375, "loss_aux_layer_23": 0.2294921875, "loss_aux_layer_3": 0.081298828125, "loss_aux_layer_4": 0.0848388671875, "loss_aux_layer_5": 0.086181640625, "loss_aux_layer_6": 0.0887451171875, "loss_aux_layer_7": 0.0850830078125, "loss_aux_layer_8": 0.08447265625, "loss_aux_layer_9": 0.083740234375, "step": 1311, "total_loss": 0.7166345119476318 }, { "epoch": 0.25975054444664425, "grad_norm": 1.60126531124115, "learning_rate": 5e-05, "llm_loss": 0.5976260602474213, "loss": 2.8668, "loss_aux_layer_0": 0.0252685546875, "loss_aux_layer_1": 0.0623779296875, "loss_aux_layer_10": 0.090087890625, "loss_aux_layer_11": 0.095703125, "loss_aux_layer_12": 0.102783203125, "loss_aux_layer_13": 0.1103515625, "loss_aux_layer_14": 0.1226806640625, "loss_aux_layer_15": 0.133544921875, "loss_aux_layer_16": 0.144775390625, "loss_aux_layer_17": 0.15185546875, "loss_aux_layer_18": 0.160888671875, "loss_aux_layer_19": 0.162353515625, "loss_aux_layer_2": 0.07470703125, "loss_aux_layer_20": 0.169189453125, "loss_aux_layer_21": 0.175537109375, "loss_aux_layer_22": 0.19873046875, "loss_aux_layer_23": 0.239501953125, "loss_aux_layer_3": 0.088623046875, "loss_aux_layer_4": 0.091064453125, "loss_aux_layer_5": 0.0927734375, "loss_aux_layer_6": 0.0950927734375, "loss_aux_layer_7": 0.0909423828125, "loss_aux_layer_8": 0.090087890625, "loss_aux_layer_9": 0.08837890625, "step": 1312, "total_loss": 0.7166989743709564 }, { "epoch": 0.25994852504454563, "grad_norm": 1.376412272453308, "learning_rate": 5e-05, "llm_loss": 0.5732910335063934, "loss": 2.7362, "loss_aux_layer_0": 0.02386474609375, "loss_aux_layer_1": 0.05682373046875, "loss_aux_layer_10": 0.0830078125, "loss_aux_layer_11": 0.08837890625, "loss_aux_layer_12": 0.094970703125, "loss_aux_layer_13": 0.1021728515625, "loss_aux_layer_14": 0.1136474609375, "loss_aux_layer_15": 0.1241455078125, "loss_aux_layer_16": 0.135498046875, "loss_aux_layer_17": 0.143310546875, "loss_aux_layer_18": 0.151611328125, "loss_aux_layer_19": 0.153076171875, "loss_aux_layer_2": 0.06591796875, "loss_aux_layer_20": 0.160400390625, "loss_aux_layer_21": 0.16748046875, "loss_aux_layer_22": 0.18798828125, "loss_aux_layer_23": 0.228271484375, "loss_aux_layer_3": 0.078857421875, "loss_aux_layer_4": 0.0819091796875, "loss_aux_layer_5": 0.08349609375, "loss_aux_layer_6": 0.0869140625, "loss_aux_layer_7": 0.0831298828125, "loss_aux_layer_8": 0.082275390625, "loss_aux_layer_9": 0.081298828125, "step": 1313, "total_loss": 0.6840517371892929 }, { "epoch": 0.26014650564244707, "grad_norm": 1.0974985361099243, "learning_rate": 5e-05, "llm_loss": 0.5559939295053482, "loss": 2.6904, "loss_aux_layer_0": 0.026275634765625, "loss_aux_layer_1": 0.0633544921875, "loss_aux_layer_10": 0.08984375, "loss_aux_layer_11": 0.0953369140625, "loss_aux_layer_12": 0.1016845703125, "loss_aux_layer_13": 0.108642578125, "loss_aux_layer_14": 0.119384765625, "loss_aux_layer_15": 0.129638671875, "loss_aux_layer_16": 0.139892578125, "loss_aux_layer_17": 0.14697265625, "loss_aux_layer_18": 0.155029296875, "loss_aux_layer_19": 0.15673828125, "loss_aux_layer_2": 0.0738525390625, "loss_aux_layer_20": 0.162841796875, "loss_aux_layer_21": 0.170166015625, "loss_aux_layer_22": 0.19189453125, "loss_aux_layer_23": 0.231689453125, "loss_aux_layer_3": 0.0870361328125, "loss_aux_layer_4": 0.0906982421875, "loss_aux_layer_5": 0.092529296875, "loss_aux_layer_6": 0.0953369140625, "loss_aux_layer_7": 0.09130859375, "loss_aux_layer_8": 0.0899658203125, "loss_aux_layer_9": 0.0885009765625, "step": 1314, "total_loss": 0.6726034730672836 }, { "epoch": 0.26034448624034845, "grad_norm": 1.180257797241211, "learning_rate": 5e-05, "llm_loss": 0.6317901909351349, "loss": 2.9788, "loss_aux_layer_0": 0.023468017578125, "loss_aux_layer_1": 0.0576171875, "loss_aux_layer_10": 0.0838623046875, "loss_aux_layer_11": 0.08935546875, "loss_aux_layer_12": 0.0960693359375, "loss_aux_layer_13": 0.1036376953125, "loss_aux_layer_14": 0.1151123046875, "loss_aux_layer_15": 0.12646484375, "loss_aux_layer_16": 0.137939453125, "loss_aux_layer_17": 0.14599609375, "loss_aux_layer_18": 0.155517578125, "loss_aux_layer_19": 0.157470703125, "loss_aux_layer_2": 0.0673828125, "loss_aux_layer_20": 0.165283203125, "loss_aux_layer_21": 0.172119140625, "loss_aux_layer_22": 0.194091796875, "loss_aux_layer_23": 0.23486328125, "loss_aux_layer_3": 0.0794677734375, "loss_aux_layer_4": 0.082275390625, "loss_aux_layer_5": 0.08447265625, "loss_aux_layer_6": 0.0877685546875, "loss_aux_layer_7": 0.0841064453125, "loss_aux_layer_8": 0.0828857421875, "loss_aux_layer_9": 0.0821533203125, "step": 1315, "total_loss": 0.7446939647197723 }, { "epoch": 0.26054246683824983, "grad_norm": 1.406132698059082, "learning_rate": 5e-05, "llm_loss": 0.6508791744709015, "loss": 3.0462, "loss_aux_layer_0": 0.02508544921875, "loss_aux_layer_1": 0.05877685546875, "loss_aux_layer_10": 0.083740234375, "loss_aux_layer_11": 0.0887451171875, "loss_aux_layer_12": 0.0950927734375, "loss_aux_layer_13": 0.1016845703125, "loss_aux_layer_14": 0.112548828125, "loss_aux_layer_15": 0.122802734375, "loss_aux_layer_16": 0.1329345703125, "loss_aux_layer_17": 0.140869140625, "loss_aux_layer_18": 0.150146484375, "loss_aux_layer_19": 0.151611328125, "loss_aux_layer_2": 0.068603515625, "loss_aux_layer_20": 0.158203125, "loss_aux_layer_21": 0.164794921875, "loss_aux_layer_22": 0.184326171875, "loss_aux_layer_23": 0.224609375, "loss_aux_layer_3": 0.08056640625, "loss_aux_layer_4": 0.0836181640625, "loss_aux_layer_5": 0.0859375, "loss_aux_layer_6": 0.0889892578125, "loss_aux_layer_7": 0.0853271484375, "loss_aux_layer_8": 0.0841064453125, "loss_aux_layer_9": 0.0826416015625, "step": 1316, "total_loss": 0.7615497708320618 }, { "epoch": 0.26074044743615127, "grad_norm": 1.0991103649139404, "learning_rate": 5e-05, "llm_loss": 0.6153599247336388, "loss": 2.9091, "loss_aux_layer_0": 0.02325439453125, "loss_aux_layer_1": 0.05877685546875, "loss_aux_layer_10": 0.083984375, "loss_aux_layer_11": 0.0894775390625, "loss_aux_layer_12": 0.0965576171875, "loss_aux_layer_13": 0.103759765625, "loss_aux_layer_14": 0.11474609375, "loss_aux_layer_15": 0.1248779296875, "loss_aux_layer_16": 0.135498046875, "loss_aux_layer_17": 0.1435546875, "loss_aux_layer_18": 0.152099609375, "loss_aux_layer_19": 0.154052734375, "loss_aux_layer_2": 0.0689697265625, "loss_aux_layer_20": 0.160400390625, "loss_aux_layer_21": 0.16748046875, "loss_aux_layer_22": 0.190185546875, "loss_aux_layer_23": 0.2314453125, "loss_aux_layer_3": 0.0810546875, "loss_aux_layer_4": 0.083740234375, "loss_aux_layer_5": 0.0849609375, "loss_aux_layer_6": 0.0875244140625, "loss_aux_layer_7": 0.0841064453125, "loss_aux_layer_8": 0.083251953125, "loss_aux_layer_9": 0.082275390625, "step": 1317, "total_loss": 0.727286159992218 }, { "epoch": 0.26093842803405265, "grad_norm": 1.1758301258087158, "learning_rate": 5e-05, "llm_loss": 0.5467003434896469, "loss": 2.6289, "loss_aux_layer_0": 0.0244140625, "loss_aux_layer_1": 0.05755615234375, "loss_aux_layer_10": 0.0811767578125, "loss_aux_layer_11": 0.086181640625, "loss_aux_layer_12": 0.093017578125, "loss_aux_layer_13": 0.100341796875, "loss_aux_layer_14": 0.1116943359375, "loss_aux_layer_15": 0.12255859375, "loss_aux_layer_16": 0.13330078125, "loss_aux_layer_17": 0.141845703125, "loss_aux_layer_18": 0.151123046875, "loss_aux_layer_19": 0.154052734375, "loss_aux_layer_2": 0.06707763671875, "loss_aux_layer_20": 0.161865234375, "loss_aux_layer_21": 0.169677734375, "loss_aux_layer_22": 0.192138671875, "loss_aux_layer_23": 0.234375, "loss_aux_layer_3": 0.0789794921875, "loss_aux_layer_4": 0.0814208984375, "loss_aux_layer_5": 0.0828857421875, "loss_aux_layer_6": 0.08544921875, "loss_aux_layer_7": 0.08203125, "loss_aux_layer_8": 0.080810546875, "loss_aux_layer_9": 0.0794677734375, "step": 1318, "total_loss": 0.657235637307167 }, { "epoch": 0.2611364086319541, "grad_norm": 1.1380459070205688, "learning_rate": 5e-05, "llm_loss": 0.5718232542276382, "loss": 2.7455, "loss_aux_layer_0": 0.02679443359375, "loss_aux_layer_1": 0.05975341796875, "loss_aux_layer_10": 0.0850830078125, "loss_aux_layer_11": 0.090576171875, "loss_aux_layer_12": 0.097412109375, "loss_aux_layer_13": 0.10498046875, "loss_aux_layer_14": 0.1171875, "loss_aux_layer_15": 0.128662109375, "loss_aux_layer_16": 0.140380859375, "loss_aux_layer_17": 0.149169921875, "loss_aux_layer_18": 0.158447265625, "loss_aux_layer_19": 0.160888671875, "loss_aux_layer_2": 0.0682373046875, "loss_aux_layer_20": 0.168212890625, "loss_aux_layer_21": 0.17431640625, "loss_aux_layer_22": 0.193603515625, "loss_aux_layer_23": 0.234375, "loss_aux_layer_3": 0.0804443359375, "loss_aux_layer_4": 0.083740234375, "loss_aux_layer_5": 0.0855712890625, "loss_aux_layer_6": 0.08837890625, "loss_aux_layer_7": 0.085205078125, "loss_aux_layer_8": 0.0843505859375, "loss_aux_layer_9": 0.08349609375, "step": 1319, "total_loss": 0.6863801926374435 }, { "epoch": 0.26133438922985547, "grad_norm": 0.9720070958137512, "learning_rate": 5e-05, "llm_loss": 0.5584797561168671, "loss": 2.671, "loss_aux_layer_0": 0.02294921875, "loss_aux_layer_1": 0.05682373046875, "loss_aux_layer_10": 0.0826416015625, "loss_aux_layer_11": 0.087646484375, "loss_aux_layer_12": 0.0938720703125, "loss_aux_layer_13": 0.1005859375, "loss_aux_layer_14": 0.1103515625, "loss_aux_layer_15": 0.1207275390625, "loss_aux_layer_16": 0.130859375, "loss_aux_layer_17": 0.138916015625, "loss_aux_layer_18": 0.1474609375, "loss_aux_layer_19": 0.1494140625, "loss_aux_layer_2": 0.0675048828125, "loss_aux_layer_20": 0.15673828125, "loss_aux_layer_21": 0.1630859375, "loss_aux_layer_22": 0.18505859375, "loss_aux_layer_23": 0.22412109375, "loss_aux_layer_3": 0.079833984375, "loss_aux_layer_4": 0.0830078125, "loss_aux_layer_5": 0.08447265625, "loss_aux_layer_6": 0.0875244140625, "loss_aux_layer_7": 0.083984375, "loss_aux_layer_8": 0.082763671875, "loss_aux_layer_9": 0.08154296875, "step": 1320, "total_loss": 0.6677460223436356 }, { "epoch": 0.2615323698277569, "grad_norm": 0.9667856097221375, "learning_rate": 5e-05, "llm_loss": 0.6524083018302917, "loss": 3.0536, "loss_aux_layer_0": 0.0238037109375, "loss_aux_layer_1": 0.0599365234375, "loss_aux_layer_10": 0.08447265625, "loss_aux_layer_11": 0.0899658203125, "loss_aux_layer_12": 0.0958251953125, "loss_aux_layer_13": 0.10302734375, "loss_aux_layer_14": 0.1131591796875, "loss_aux_layer_15": 0.123779296875, "loss_aux_layer_16": 0.133544921875, "loss_aux_layer_17": 0.140869140625, "loss_aux_layer_18": 0.14990234375, "loss_aux_layer_19": 0.15185546875, "loss_aux_layer_2": 0.069580078125, "loss_aux_layer_20": 0.158203125, "loss_aux_layer_21": 0.164306640625, "loss_aux_layer_22": 0.185546875, "loss_aux_layer_23": 0.223876953125, "loss_aux_layer_3": 0.081787109375, "loss_aux_layer_4": 0.084716796875, "loss_aux_layer_5": 0.085693359375, "loss_aux_layer_6": 0.0885009765625, "loss_aux_layer_7": 0.0849609375, "loss_aux_layer_8": 0.083740234375, "loss_aux_layer_9": 0.0828857421875, "step": 1321, "total_loss": 0.7634055018424988 }, { "epoch": 0.2617303504256583, "grad_norm": 1.442841649055481, "learning_rate": 5e-05, "llm_loss": 0.6521730422973633, "loss": 3.0456, "loss_aux_layer_0": 0.0238037109375, "loss_aux_layer_1": 0.05560302734375, "loss_aux_layer_10": 0.08203125, "loss_aux_layer_11": 0.0870361328125, "loss_aux_layer_12": 0.0931396484375, "loss_aux_layer_13": 0.10009765625, "loss_aux_layer_14": 0.11083984375, "loss_aux_layer_15": 0.1212158203125, "loss_aux_layer_16": 0.13232421875, "loss_aux_layer_17": 0.140380859375, "loss_aux_layer_18": 0.149658203125, "loss_aux_layer_19": 0.15185546875, "loss_aux_layer_2": 0.0655517578125, "loss_aux_layer_20": 0.1591796875, "loss_aux_layer_21": 0.1650390625, "loss_aux_layer_22": 0.185546875, "loss_aux_layer_23": 0.225830078125, "loss_aux_layer_3": 0.0777587890625, "loss_aux_layer_4": 0.0810546875, "loss_aux_layer_5": 0.0831298828125, "loss_aux_layer_6": 0.0859375, "loss_aux_layer_7": 0.0823974609375, "loss_aux_layer_8": 0.08154296875, "loss_aux_layer_9": 0.0804443359375, "step": 1322, "total_loss": 0.7614094614982605 }, { "epoch": 0.26192833102355967, "grad_norm": 1.3640203475952148, "learning_rate": 5e-05, "llm_loss": 0.6124205589294434, "loss": 2.8958, "loss_aux_layer_0": 0.02520751953125, "loss_aux_layer_1": 0.058349609375, "loss_aux_layer_10": 0.0833740234375, "loss_aux_layer_11": 0.088623046875, "loss_aux_layer_12": 0.09521484375, "loss_aux_layer_13": 0.1021728515625, "loss_aux_layer_14": 0.113525390625, "loss_aux_layer_15": 0.1243896484375, "loss_aux_layer_16": 0.135009765625, "loss_aux_layer_17": 0.14208984375, "loss_aux_layer_18": 0.1513671875, "loss_aux_layer_19": 0.153076171875, "loss_aux_layer_2": 0.0682373046875, "loss_aux_layer_20": 0.16064453125, "loss_aux_layer_21": 0.1689453125, "loss_aux_layer_22": 0.19091796875, "loss_aux_layer_23": 0.2314453125, "loss_aux_layer_3": 0.080078125, "loss_aux_layer_4": 0.082763671875, "loss_aux_layer_5": 0.08447265625, "loss_aux_layer_6": 0.087646484375, "loss_aux_layer_7": 0.0841064453125, "loss_aux_layer_8": 0.083251953125, "loss_aux_layer_9": 0.0819091796875, "step": 1323, "total_loss": 0.7239571958780289 }, { "epoch": 0.2621263116214611, "grad_norm": 1.0830702781677246, "learning_rate": 5e-05, "llm_loss": 0.673868715763092, "loss": 3.1327, "loss_aux_layer_0": 0.023773193359375, "loss_aux_layer_1": 0.05670166015625, "loss_aux_layer_10": 0.081787109375, "loss_aux_layer_11": 0.0865478515625, "loss_aux_layer_12": 0.0927734375, "loss_aux_layer_13": 0.0999755859375, "loss_aux_layer_14": 0.11083984375, "loss_aux_layer_15": 0.121337890625, "loss_aux_layer_16": 0.13232421875, "loss_aux_layer_17": 0.139892578125, "loss_aux_layer_18": 0.149658203125, "loss_aux_layer_19": 0.152587890625, "loss_aux_layer_2": 0.0657958984375, "loss_aux_layer_20": 0.15966796875, "loss_aux_layer_21": 0.166259765625, "loss_aux_layer_22": 0.18701171875, "loss_aux_layer_23": 0.226318359375, "loss_aux_layer_3": 0.0777587890625, "loss_aux_layer_4": 0.080810546875, "loss_aux_layer_5": 0.0823974609375, "loss_aux_layer_6": 0.0855712890625, "loss_aux_layer_7": 0.0821533203125, "loss_aux_layer_8": 0.08154296875, "loss_aux_layer_9": 0.0804443359375, "step": 1324, "total_loss": 0.783187136054039 }, { "epoch": 0.2623242922193625, "grad_norm": 1.294057846069336, "learning_rate": 5e-05, "llm_loss": 0.5719746649265289, "loss": 2.7619, "loss_aux_layer_0": 0.0245361328125, "loss_aux_layer_1": 0.0635986328125, "loss_aux_layer_10": 0.0908203125, "loss_aux_layer_11": 0.0966796875, "loss_aux_layer_12": 0.1033935546875, "loss_aux_layer_13": 0.11083984375, "loss_aux_layer_14": 0.121826171875, "loss_aux_layer_15": 0.1328125, "loss_aux_layer_16": 0.14306640625, "loss_aux_layer_17": 0.15087890625, "loss_aux_layer_18": 0.15966796875, "loss_aux_layer_19": 0.16064453125, "loss_aux_layer_2": 0.0743408203125, "loss_aux_layer_20": 0.16650390625, "loss_aux_layer_21": 0.173583984375, "loss_aux_layer_22": 0.196044921875, "loss_aux_layer_23": 0.237548828125, "loss_aux_layer_3": 0.0872802734375, "loss_aux_layer_4": 0.0906982421875, "loss_aux_layer_5": 0.09228515625, "loss_aux_layer_6": 0.0953369140625, "loss_aux_layer_7": 0.0919189453125, "loss_aux_layer_8": 0.0906982421875, "loss_aux_layer_9": 0.0889892578125, "step": 1325, "total_loss": 0.6904733031988144 }, { "epoch": 0.2625222728172639, "grad_norm": 0.8892141580581665, "learning_rate": 5e-05, "llm_loss": 0.6308819055557251, "loss": 2.9546, "loss_aux_layer_0": 0.023101806640625, "loss_aux_layer_1": 0.05133056640625, "loss_aux_layer_10": 0.07763671875, "loss_aux_layer_11": 0.08251953125, "loss_aux_layer_12": 0.0892333984375, "loss_aux_layer_13": 0.0966796875, "loss_aux_layer_14": 0.109130859375, "loss_aux_layer_15": 0.1207275390625, "loss_aux_layer_16": 0.1331787109375, "loss_aux_layer_17": 0.142578125, "loss_aux_layer_18": 0.15185546875, "loss_aux_layer_19": 0.156005859375, "loss_aux_layer_2": 0.06121826171875, "loss_aux_layer_20": 0.163330078125, "loss_aux_layer_21": 0.16943359375, "loss_aux_layer_22": 0.191162109375, "loss_aux_layer_23": 0.233154296875, "loss_aux_layer_3": 0.0726318359375, "loss_aux_layer_4": 0.0751953125, "loss_aux_layer_5": 0.0771484375, "loss_aux_layer_6": 0.0799560546875, "loss_aux_layer_7": 0.0770263671875, "loss_aux_layer_8": 0.0765380859375, "loss_aux_layer_9": 0.0760498046875, "step": 1326, "total_loss": 0.7386602908372879 }, { "epoch": 0.2627202534151653, "grad_norm": 1.892386794090271, "learning_rate": 5e-05, "llm_loss": 0.6299464479088783, "loss": 2.974, "loss_aux_layer_0": 0.0242919921875, "loss_aux_layer_1": 0.05877685546875, "loss_aux_layer_10": 0.0853271484375, "loss_aux_layer_11": 0.09033203125, "loss_aux_layer_12": 0.0968017578125, "loss_aux_layer_13": 0.1044921875, "loss_aux_layer_14": 0.1162109375, "loss_aux_layer_15": 0.126953125, "loss_aux_layer_16": 0.138671875, "loss_aux_layer_17": 0.14599609375, "loss_aux_layer_18": 0.15576171875, "loss_aux_layer_19": 0.1572265625, "loss_aux_layer_2": 0.0697021484375, "loss_aux_layer_20": 0.1640625, "loss_aux_layer_21": 0.169677734375, "loss_aux_layer_22": 0.1904296875, "loss_aux_layer_23": 0.23095703125, "loss_aux_layer_3": 0.0816650390625, "loss_aux_layer_4": 0.084716796875, "loss_aux_layer_5": 0.086669921875, "loss_aux_layer_6": 0.09033203125, "loss_aux_layer_7": 0.086669921875, "loss_aux_layer_8": 0.08544921875, "loss_aux_layer_9": 0.084228515625, "step": 1327, "total_loss": 0.7435059994459152 }, { "epoch": 0.26291823401306674, "grad_norm": 1.515329122543335, "learning_rate": 5e-05, "llm_loss": 0.605332687497139, "loss": 2.8481, "loss_aux_layer_0": 0.023956298828125, "loss_aux_layer_1": 0.0537109375, "loss_aux_layer_10": 0.0767822265625, "loss_aux_layer_11": 0.0816650390625, "loss_aux_layer_12": 0.088134765625, "loss_aux_layer_13": 0.0955810546875, "loss_aux_layer_14": 0.1072998046875, "loss_aux_layer_15": 0.1185302734375, "loss_aux_layer_16": 0.1297607421875, "loss_aux_layer_17": 0.138427734375, "loss_aux_layer_18": 0.147705078125, "loss_aux_layer_19": 0.151123046875, "loss_aux_layer_2": 0.06353759765625, "loss_aux_layer_20": 0.158935546875, "loss_aux_layer_21": 0.1669921875, "loss_aux_layer_22": 0.188720703125, "loss_aux_layer_23": 0.231201171875, "loss_aux_layer_3": 0.07373046875, "loss_aux_layer_4": 0.0765380859375, "loss_aux_layer_5": 0.078369140625, "loss_aux_layer_6": 0.0806884765625, "loss_aux_layer_7": 0.0771484375, "loss_aux_layer_8": 0.0762939453125, "loss_aux_layer_9": 0.0755615234375, "step": 1328, "total_loss": 0.7120241969823837 }, { "epoch": 0.2631162146109681, "grad_norm": 2.114396095275879, "learning_rate": 5e-05, "llm_loss": 0.5694160461425781, "loss": 2.7443, "loss_aux_layer_0": 0.023712158203125, "loss_aux_layer_1": 0.06072998046875, "loss_aux_layer_10": 0.089111328125, "loss_aux_layer_11": 0.094970703125, "loss_aux_layer_12": 0.101806640625, "loss_aux_layer_13": 0.1092529296875, "loss_aux_layer_14": 0.1204833984375, "loss_aux_layer_15": 0.131103515625, "loss_aux_layer_16": 0.1416015625, "loss_aux_layer_17": 0.148193359375, "loss_aux_layer_18": 0.156982421875, "loss_aux_layer_19": 0.158935546875, "loss_aux_layer_2": 0.0721435546875, "loss_aux_layer_20": 0.165283203125, "loss_aux_layer_21": 0.17236328125, "loss_aux_layer_22": 0.19384765625, "loss_aux_layer_23": 0.23486328125, "loss_aux_layer_3": 0.0848388671875, "loss_aux_layer_4": 0.0887451171875, "loss_aux_layer_5": 0.090576171875, "loss_aux_layer_6": 0.09375, "loss_aux_layer_7": 0.0899658203125, "loss_aux_layer_8": 0.0885009765625, "loss_aux_layer_9": 0.0877685546875, "step": 1329, "total_loss": 0.6860636174678802 }, { "epoch": 0.2633141952088695, "grad_norm": 1.3250043392181396, "learning_rate": 5e-05, "llm_loss": 0.6297250241041183, "loss": 2.9577, "loss_aux_layer_0": 0.02325439453125, "loss_aux_layer_1": 0.05548095703125, "loss_aux_layer_10": 0.0819091796875, "loss_aux_layer_11": 0.0870361328125, "loss_aux_layer_12": 0.093505859375, "loss_aux_layer_13": 0.10107421875, "loss_aux_layer_14": 0.1124267578125, "loss_aux_layer_15": 0.123046875, "loss_aux_layer_16": 0.13427734375, "loss_aux_layer_17": 0.14306640625, "loss_aux_layer_18": 0.151611328125, "loss_aux_layer_19": 0.15380859375, "loss_aux_layer_2": 0.0653076171875, "loss_aux_layer_20": 0.160888671875, "loss_aux_layer_21": 0.166748046875, "loss_aux_layer_22": 0.1865234375, "loss_aux_layer_23": 0.2255859375, "loss_aux_layer_3": 0.07666015625, "loss_aux_layer_4": 0.0799560546875, "loss_aux_layer_5": 0.081787109375, "loss_aux_layer_6": 0.0853271484375, "loss_aux_layer_7": 0.082275390625, "loss_aux_layer_8": 0.0811767578125, "loss_aux_layer_9": 0.0802001953125, "step": 1330, "total_loss": 0.7394314259290695 }, { "epoch": 0.26351217580677094, "grad_norm": 1.5251424312591553, "learning_rate": 5e-05, "llm_loss": 0.6291209757328033, "loss": 2.9655, "loss_aux_layer_0": 0.02459716796875, "loss_aux_layer_1": 0.05914306640625, "loss_aux_layer_10": 0.0843505859375, "loss_aux_layer_11": 0.0897216796875, "loss_aux_layer_12": 0.0965576171875, "loss_aux_layer_13": 0.103759765625, "loss_aux_layer_14": 0.11474609375, "loss_aux_layer_15": 0.1251220703125, "loss_aux_layer_16": 0.135986328125, "loss_aux_layer_17": 0.14453125, "loss_aux_layer_18": 0.153076171875, "loss_aux_layer_19": 0.154541015625, "loss_aux_layer_2": 0.06927490234375, "loss_aux_layer_20": 0.161376953125, "loss_aux_layer_21": 0.167236328125, "loss_aux_layer_22": 0.188232421875, "loss_aux_layer_23": 0.2275390625, "loss_aux_layer_3": 0.0814208984375, "loss_aux_layer_4": 0.0843505859375, "loss_aux_layer_5": 0.08642578125, "loss_aux_layer_6": 0.08935546875, "loss_aux_layer_7": 0.0858154296875, "loss_aux_layer_8": 0.0843505859375, "loss_aux_layer_9": 0.082763671875, "step": 1331, "total_loss": 0.7413636893033981 }, { "epoch": 0.2637101564046723, "grad_norm": 1.2990273237228394, "learning_rate": 5e-05, "llm_loss": 0.647789403796196, "loss": 3.0368, "loss_aux_layer_0": 0.023284912109375, "loss_aux_layer_1": 0.0540771484375, "loss_aux_layer_10": 0.0831298828125, "loss_aux_layer_11": 0.088134765625, "loss_aux_layer_12": 0.0947265625, "loss_aux_layer_13": 0.1029052734375, "loss_aux_layer_14": 0.1146240234375, "loss_aux_layer_15": 0.126220703125, "loss_aux_layer_16": 0.137451171875, "loss_aux_layer_17": 0.145263671875, "loss_aux_layer_18": 0.154541015625, "loss_aux_layer_19": 0.15673828125, "loss_aux_layer_2": 0.065185546875, "loss_aux_layer_20": 0.163818359375, "loss_aux_layer_21": 0.170166015625, "loss_aux_layer_22": 0.190673828125, "loss_aux_layer_23": 0.230224609375, "loss_aux_layer_3": 0.07763671875, "loss_aux_layer_4": 0.0806884765625, "loss_aux_layer_5": 0.08251953125, "loss_aux_layer_6": 0.0855712890625, "loss_aux_layer_7": 0.0826416015625, "loss_aux_layer_8": 0.08203125, "loss_aux_layer_9": 0.081298828125, "step": 1332, "total_loss": 0.7591887414455414 }, { "epoch": 0.26390813700257376, "grad_norm": 2.070596694946289, "learning_rate": 5e-05, "llm_loss": 0.6362598836421967, "loss": 2.9743, "loss_aux_layer_0": 0.02392578125, "loss_aux_layer_1": 0.0533447265625, "loss_aux_layer_10": 0.079345703125, "loss_aux_layer_11": 0.083984375, "loss_aux_layer_12": 0.090576171875, "loss_aux_layer_13": 0.0980224609375, "loss_aux_layer_14": 0.1094970703125, "loss_aux_layer_15": 0.120361328125, "loss_aux_layer_16": 0.13232421875, "loss_aux_layer_17": 0.14013671875, "loss_aux_layer_18": 0.148681640625, "loss_aux_layer_19": 0.150634765625, "loss_aux_layer_2": 0.0635986328125, "loss_aux_layer_20": 0.157958984375, "loss_aux_layer_21": 0.164306640625, "loss_aux_layer_22": 0.18408203125, "loss_aux_layer_23": 0.2236328125, "loss_aux_layer_3": 0.0750732421875, "loss_aux_layer_4": 0.0777587890625, "loss_aux_layer_5": 0.0794677734375, "loss_aux_layer_6": 0.08251953125, "loss_aux_layer_7": 0.080078125, "loss_aux_layer_8": 0.078857421875, "loss_aux_layer_9": 0.0775146484375, "step": 1333, "total_loss": 0.74356509745121 }, { "epoch": 0.26410611760047514, "grad_norm": 2.053251266479492, "learning_rate": 5e-05, "llm_loss": 0.6608269810676575, "loss": 3.0942, "loss_aux_layer_0": 0.023712158203125, "loss_aux_layer_1": 0.05987548828125, "loss_aux_layer_10": 0.08544921875, "loss_aux_layer_11": 0.0909423828125, "loss_aux_layer_12": 0.0977783203125, "loss_aux_layer_13": 0.10498046875, "loss_aux_layer_14": 0.11572265625, "loss_aux_layer_15": 0.1260986328125, "loss_aux_layer_16": 0.136474609375, "loss_aux_layer_17": 0.144287109375, "loss_aux_layer_18": 0.152587890625, "loss_aux_layer_19": 0.154296875, "loss_aux_layer_2": 0.071044921875, "loss_aux_layer_20": 0.1611328125, "loss_aux_layer_21": 0.167236328125, "loss_aux_layer_22": 0.185546875, "loss_aux_layer_23": 0.224853515625, "loss_aux_layer_3": 0.083251953125, "loss_aux_layer_4": 0.0863037109375, "loss_aux_layer_5": 0.0875244140625, "loss_aux_layer_6": 0.090576171875, "loss_aux_layer_7": 0.086669921875, "loss_aux_layer_8": 0.0850830078125, "loss_aux_layer_9": 0.0838623046875, "step": 1334, "total_loss": 0.7735600173473358 }, { "epoch": 0.2643040981983766, "grad_norm": 1.8892918825149536, "learning_rate": 5e-05, "llm_loss": 0.6152320504188538, "loss": 2.909, "loss_aux_layer_0": 0.02276611328125, "loss_aux_layer_1": 0.0584716796875, "loss_aux_layer_10": 0.083984375, "loss_aux_layer_11": 0.08935546875, "loss_aux_layer_12": 0.0963134765625, "loss_aux_layer_13": 0.1041259765625, "loss_aux_layer_14": 0.11572265625, "loss_aux_layer_15": 0.1265869140625, "loss_aux_layer_16": 0.136962890625, "loss_aux_layer_17": 0.144775390625, "loss_aux_layer_18": 0.15380859375, "loss_aux_layer_19": 0.154296875, "loss_aux_layer_2": 0.0699462890625, "loss_aux_layer_20": 0.160888671875, "loss_aux_layer_21": 0.1669921875, "loss_aux_layer_22": 0.187744140625, "loss_aux_layer_23": 0.2265625, "loss_aux_layer_3": 0.0814208984375, "loss_aux_layer_4": 0.083984375, "loss_aux_layer_5": 0.085693359375, "loss_aux_layer_6": 0.087890625, "loss_aux_layer_7": 0.08447265625, "loss_aux_layer_8": 0.0836181640625, "loss_aux_layer_9": 0.0823974609375, "step": 1335, "total_loss": 0.7272501438856125 }, { "epoch": 0.26450207879627796, "grad_norm": 1.4323656558990479, "learning_rate": 5e-05, "llm_loss": 0.5319535583257675, "loss": 2.5799, "loss_aux_layer_0": 0.025238037109375, "loss_aux_layer_1": 0.057861328125, "loss_aux_layer_10": 0.0843505859375, "loss_aux_layer_11": 0.08984375, "loss_aux_layer_12": 0.0965576171875, "loss_aux_layer_13": 0.104248046875, "loss_aux_layer_14": 0.115478515625, "loss_aux_layer_15": 0.1265869140625, "loss_aux_layer_16": 0.13720703125, "loss_aux_layer_17": 0.14501953125, "loss_aux_layer_18": 0.15478515625, "loss_aux_layer_19": 0.15673828125, "loss_aux_layer_2": 0.068359375, "loss_aux_layer_20": 0.163330078125, "loss_aux_layer_21": 0.17138671875, "loss_aux_layer_22": 0.193359375, "loss_aux_layer_23": 0.235595703125, "loss_aux_layer_3": 0.080322265625, "loss_aux_layer_4": 0.0831298828125, "loss_aux_layer_5": 0.0850830078125, "loss_aux_layer_6": 0.088134765625, "loss_aux_layer_7": 0.0845947265625, "loss_aux_layer_8": 0.0836181640625, "loss_aux_layer_9": 0.082763671875, "step": 1336, "total_loss": 0.6449825763702393 }, { "epoch": 0.26470005939417934, "grad_norm": 2.538496494293213, "learning_rate": 5e-05, "llm_loss": 0.5924983620643616, "loss": 2.8089, "loss_aux_layer_0": 0.02496337890625, "loss_aux_layer_1": 0.0587158203125, "loss_aux_layer_10": 0.0823974609375, "loss_aux_layer_11": 0.0872802734375, "loss_aux_layer_12": 0.0936279296875, "loss_aux_layer_13": 0.100830078125, "loss_aux_layer_14": 0.1114501953125, "loss_aux_layer_15": 0.12158203125, "loss_aux_layer_16": 0.132080078125, "loss_aux_layer_17": 0.139404296875, "loss_aux_layer_18": 0.148193359375, "loss_aux_layer_19": 0.150390625, "loss_aux_layer_2": 0.0684814453125, "loss_aux_layer_20": 0.1572265625, "loss_aux_layer_21": 0.1640625, "loss_aux_layer_22": 0.184326171875, "loss_aux_layer_23": 0.22314453125, "loss_aux_layer_3": 0.0809326171875, "loss_aux_layer_4": 0.0836181640625, "loss_aux_layer_5": 0.0848388671875, "loss_aux_layer_6": 0.087646484375, "loss_aux_layer_7": 0.0838623046875, "loss_aux_layer_8": 0.08251953125, "loss_aux_layer_9": 0.0814208984375, "step": 1337, "total_loss": 0.7022128701210022 }, { "epoch": 0.2648980399920808, "grad_norm": 1.6581140756607056, "learning_rate": 5e-05, "llm_loss": 0.6082580387592316, "loss": 2.8744, "loss_aux_layer_0": 0.0233154296875, "loss_aux_layer_1": 0.05596923828125, "loss_aux_layer_10": 0.082763671875, "loss_aux_layer_11": 0.087890625, "loss_aux_layer_12": 0.0943603515625, "loss_aux_layer_13": 0.101318359375, "loss_aux_layer_14": 0.112060546875, "loss_aux_layer_15": 0.12255859375, "loss_aux_layer_16": 0.13330078125, "loss_aux_layer_17": 0.140869140625, "loss_aux_layer_18": 0.149658203125, "loss_aux_layer_19": 0.15234375, "loss_aux_layer_2": 0.0677490234375, "loss_aux_layer_20": 0.15966796875, "loss_aux_layer_21": 0.166748046875, "loss_aux_layer_22": 0.18896484375, "loss_aux_layer_23": 0.22998046875, "loss_aux_layer_3": 0.080078125, "loss_aux_layer_4": 0.0823974609375, "loss_aux_layer_5": 0.083740234375, "loss_aux_layer_6": 0.0865478515625, "loss_aux_layer_7": 0.0828857421875, "loss_aux_layer_8": 0.0821533203125, "loss_aux_layer_9": 0.0810546875, "step": 1338, "total_loss": 0.7186118215322495 }, { "epoch": 0.26509602058998216, "grad_norm": 1.5578337907791138, "learning_rate": 5e-05, "llm_loss": 0.5780190229415894, "loss": 2.7644, "loss_aux_layer_0": 0.023681640625, "loss_aux_layer_1": 0.058837890625, "loss_aux_layer_10": 0.08642578125, "loss_aux_layer_11": 0.091552734375, "loss_aux_layer_12": 0.0980224609375, "loss_aux_layer_13": 0.10498046875, "loss_aux_layer_14": 0.1162109375, "loss_aux_layer_15": 0.12646484375, "loss_aux_layer_16": 0.13671875, "loss_aux_layer_17": 0.14453125, "loss_aux_layer_18": 0.153076171875, "loss_aux_layer_19": 0.1552734375, "loss_aux_layer_2": 0.070068359375, "loss_aux_layer_20": 0.16064453125, "loss_aux_layer_21": 0.166015625, "loss_aux_layer_22": 0.1865234375, "loss_aux_layer_23": 0.22705078125, "loss_aux_layer_3": 0.0831298828125, "loss_aux_layer_4": 0.086669921875, "loss_aux_layer_5": 0.088134765625, "loss_aux_layer_6": 0.0914306640625, "loss_aux_layer_7": 0.08740234375, "loss_aux_layer_8": 0.0863037109375, "loss_aux_layer_9": 0.0845947265625, "step": 1339, "total_loss": 0.6910920441150665 }, { "epoch": 0.2652940011878836, "grad_norm": 1.5505315065383911, "learning_rate": 5e-05, "llm_loss": 0.689532920718193, "loss": 3.2033, "loss_aux_layer_0": 0.0242919921875, "loss_aux_layer_1": 0.05859375, "loss_aux_layer_10": 0.0853271484375, "loss_aux_layer_11": 0.0904541015625, "loss_aux_layer_12": 0.096923828125, "loss_aux_layer_13": 0.1038818359375, "loss_aux_layer_14": 0.1146240234375, "loss_aux_layer_15": 0.1239013671875, "loss_aux_layer_16": 0.133544921875, "loss_aux_layer_17": 0.14111328125, "loss_aux_layer_18": 0.149658203125, "loss_aux_layer_19": 0.15087890625, "loss_aux_layer_2": 0.0684814453125, "loss_aux_layer_20": 0.1572265625, "loss_aux_layer_21": 0.164794921875, "loss_aux_layer_22": 0.185546875, "loss_aux_layer_23": 0.2255859375, "loss_aux_layer_3": 0.080810546875, "loss_aux_layer_4": 0.08447265625, "loss_aux_layer_5": 0.0863037109375, "loss_aux_layer_6": 0.0894775390625, "loss_aux_layer_7": 0.086181640625, "loss_aux_layer_8": 0.085205078125, "loss_aux_layer_9": 0.083740234375, "step": 1340, "total_loss": 0.8008230626583099 }, { "epoch": 0.265491981785785, "grad_norm": 1.9339640140533447, "learning_rate": 5e-05, "llm_loss": 0.5825861096382141, "loss": 2.7725, "loss_aux_layer_0": 0.025360107421875, "loss_aux_layer_1": 0.0579833984375, "loss_aux_layer_10": 0.082763671875, "loss_aux_layer_11": 0.088134765625, "loss_aux_layer_12": 0.094482421875, "loss_aux_layer_13": 0.1015625, "loss_aux_layer_14": 0.1121826171875, "loss_aux_layer_15": 0.12255859375, "loss_aux_layer_16": 0.1329345703125, "loss_aux_layer_17": 0.140380859375, "loss_aux_layer_18": 0.148681640625, "loss_aux_layer_19": 0.151123046875, "loss_aux_layer_2": 0.068603515625, "loss_aux_layer_20": 0.158447265625, "loss_aux_layer_21": 0.16650390625, "loss_aux_layer_22": 0.188720703125, "loss_aux_layer_23": 0.228759765625, "loss_aux_layer_3": 0.0804443359375, "loss_aux_layer_4": 0.0828857421875, "loss_aux_layer_5": 0.084716796875, "loss_aux_layer_6": 0.08740234375, "loss_aux_layer_7": 0.083740234375, "loss_aux_layer_8": 0.08251953125, "loss_aux_layer_9": 0.08154296875, "step": 1341, "total_loss": 0.6931257247924805 }, { "epoch": 0.2656899623836864, "grad_norm": 1.130313754081726, "learning_rate": 5e-05, "llm_loss": 0.5279036164283752, "loss": 2.5686, "loss_aux_layer_0": 0.024078369140625, "loss_aux_layer_1": 0.05804443359375, "loss_aux_layer_10": 0.0848388671875, "loss_aux_layer_11": 0.09033203125, "loss_aux_layer_12": 0.097412109375, "loss_aux_layer_13": 0.1053466796875, "loss_aux_layer_14": 0.1171875, "loss_aux_layer_15": 0.128173828125, "loss_aux_layer_16": 0.1396484375, "loss_aux_layer_17": 0.1474609375, "loss_aux_layer_18": 0.15673828125, "loss_aux_layer_19": 0.158935546875, "loss_aux_layer_2": 0.0692138671875, "loss_aux_layer_20": 0.166015625, "loss_aux_layer_21": 0.17333984375, "loss_aux_layer_22": 0.1962890625, "loss_aux_layer_23": 0.2373046875, "loss_aux_layer_3": 0.0816650390625, "loss_aux_layer_4": 0.0841064453125, "loss_aux_layer_5": 0.0855712890625, "loss_aux_layer_6": 0.08837890625, "loss_aux_layer_7": 0.0849609375, "loss_aux_layer_8": 0.083984375, "loss_aux_layer_9": 0.0828857421875, "step": 1342, "total_loss": 0.6421560347080231 }, { "epoch": 0.2658879429815878, "grad_norm": 1.3063218593597412, "learning_rate": 5e-05, "llm_loss": 0.6358319371938705, "loss": 2.9713, "loss_aux_layer_0": 0.024169921875, "loss_aux_layer_1": 0.05535888671875, "loss_aux_layer_10": 0.0797119140625, "loss_aux_layer_11": 0.084716796875, "loss_aux_layer_12": 0.09130859375, "loss_aux_layer_13": 0.09814453125, "loss_aux_layer_14": 0.10888671875, "loss_aux_layer_15": 0.119140625, "loss_aux_layer_16": 0.1298828125, "loss_aux_layer_17": 0.137451171875, "loss_aux_layer_18": 0.145751953125, "loss_aux_layer_19": 0.148193359375, "loss_aux_layer_2": 0.0648193359375, "loss_aux_layer_20": 0.155517578125, "loss_aux_layer_21": 0.161865234375, "loss_aux_layer_22": 0.18310546875, "loss_aux_layer_23": 0.223876953125, "loss_aux_layer_3": 0.076416015625, "loss_aux_layer_4": 0.079345703125, "loss_aux_layer_5": 0.080810546875, "loss_aux_layer_6": 0.0836181640625, "loss_aux_layer_7": 0.07958984375, "loss_aux_layer_8": 0.0789794921875, "loss_aux_layer_9": 0.077880859375, "step": 1343, "total_loss": 0.7428294271230698 }, { "epoch": 0.26608592357948924, "grad_norm": 1.5995270013809204, "learning_rate": 5e-05, "llm_loss": 0.546184591948986, "loss": 2.6174, "loss_aux_layer_0": 0.02581787109375, "loss_aux_layer_1": 0.055419921875, "loss_aux_layer_10": 0.077880859375, "loss_aux_layer_11": 0.0828857421875, "loss_aux_layer_12": 0.0897216796875, "loss_aux_layer_13": 0.0975341796875, "loss_aux_layer_14": 0.109130859375, "loss_aux_layer_15": 0.120849609375, "loss_aux_layer_16": 0.1328125, "loss_aux_layer_17": 0.141357421875, "loss_aux_layer_18": 0.150634765625, "loss_aux_layer_19": 0.153564453125, "loss_aux_layer_2": 0.06378173828125, "loss_aux_layer_20": 0.161376953125, "loss_aux_layer_21": 0.16796875, "loss_aux_layer_22": 0.189208984375, "loss_aux_layer_23": 0.230224609375, "loss_aux_layer_3": 0.0751953125, "loss_aux_layer_4": 0.077392578125, "loss_aux_layer_5": 0.079345703125, "loss_aux_layer_6": 0.08203125, "loss_aux_layer_7": 0.0784912109375, "loss_aux_layer_8": 0.07763671875, "loss_aux_layer_9": 0.07666015625, "step": 1344, "total_loss": 0.6543533504009247 }, { "epoch": 0.2662839041773906, "grad_norm": 2.267181634902954, "learning_rate": 5e-05, "llm_loss": 0.6086202710866928, "loss": 2.8792, "loss_aux_layer_0": 0.023956298828125, "loss_aux_layer_1": 0.0594482421875, "loss_aux_layer_10": 0.082763671875, "loss_aux_layer_11": 0.087890625, "loss_aux_layer_12": 0.0946044921875, "loss_aux_layer_13": 0.1026611328125, "loss_aux_layer_14": 0.1142578125, "loss_aux_layer_15": 0.1251220703125, "loss_aux_layer_16": 0.136474609375, "loss_aux_layer_17": 0.144287109375, "loss_aux_layer_18": 0.152587890625, "loss_aux_layer_19": 0.15380859375, "loss_aux_layer_2": 0.0684814453125, "loss_aux_layer_20": 0.16015625, "loss_aux_layer_21": 0.16748046875, "loss_aux_layer_22": 0.18798828125, "loss_aux_layer_23": 0.22802734375, "loss_aux_layer_3": 0.0791015625, "loss_aux_layer_4": 0.0826416015625, "loss_aux_layer_5": 0.08447265625, "loss_aux_layer_6": 0.0863037109375, "loss_aux_layer_7": 0.082763671875, "loss_aux_layer_8": 0.081787109375, "loss_aux_layer_9": 0.0810546875, "step": 1345, "total_loss": 0.7197967767715454 }, { "epoch": 0.266481884775292, "grad_norm": 2.078171968460083, "learning_rate": 5e-05, "llm_loss": 0.6308841332793236, "loss": 2.9835, "loss_aux_layer_0": 0.023529052734375, "loss_aux_layer_1": 0.05950927734375, "loss_aux_layer_10": 0.0867919921875, "loss_aux_layer_11": 0.092041015625, "loss_aux_layer_12": 0.0989990234375, "loss_aux_layer_13": 0.106689453125, "loss_aux_layer_14": 0.11767578125, "loss_aux_layer_15": 0.1282958984375, "loss_aux_layer_16": 0.139404296875, "loss_aux_layer_17": 0.146484375, "loss_aux_layer_18": 0.155517578125, "loss_aux_layer_19": 0.15771484375, "loss_aux_layer_2": 0.0721435546875, "loss_aux_layer_20": 0.16455078125, "loss_aux_layer_21": 0.1708984375, "loss_aux_layer_22": 0.19287109375, "loss_aux_layer_23": 0.233154296875, "loss_aux_layer_3": 0.08447265625, "loss_aux_layer_4": 0.0872802734375, "loss_aux_layer_5": 0.0888671875, "loss_aux_layer_6": 0.0919189453125, "loss_aux_layer_7": 0.088134765625, "loss_aux_layer_8": 0.0870361328125, "loss_aux_layer_9": 0.085205078125, "step": 1346, "total_loss": 0.7458660900592804 }, { "epoch": 0.26667986537319344, "grad_norm": 2.0241053104400635, "learning_rate": 5e-05, "llm_loss": 0.5514426454901695, "loss": 2.6385, "loss_aux_layer_0": 0.024139404296875, "loss_aux_layer_1": 0.05487060546875, "loss_aux_layer_10": 0.0794677734375, "loss_aux_layer_11": 0.084716796875, "loss_aux_layer_12": 0.091064453125, "loss_aux_layer_13": 0.0980224609375, "loss_aux_layer_14": 0.1090087890625, "loss_aux_layer_15": 0.1195068359375, "loss_aux_layer_16": 0.13037109375, "loss_aux_layer_17": 0.138916015625, "loss_aux_layer_18": 0.148193359375, "loss_aux_layer_19": 0.15087890625, "loss_aux_layer_2": 0.06524658203125, "loss_aux_layer_20": 0.15869140625, "loss_aux_layer_21": 0.166015625, "loss_aux_layer_22": 0.188232421875, "loss_aux_layer_23": 0.228759765625, "loss_aux_layer_3": 0.0770263671875, "loss_aux_layer_4": 0.0797119140625, "loss_aux_layer_5": 0.081787109375, "loss_aux_layer_6": 0.083984375, "loss_aux_layer_7": 0.0804443359375, "loss_aux_layer_8": 0.079345703125, "loss_aux_layer_9": 0.0780029296875, "step": 1347, "total_loss": 0.6596203297376633 }, { "epoch": 0.2668778459710948, "grad_norm": 1.9385651350021362, "learning_rate": 5e-05, "llm_loss": 0.5906921625137329, "loss": 2.8249, "loss_aux_layer_0": 0.02520751953125, "loss_aux_layer_1": 0.05999755859375, "loss_aux_layer_10": 0.086669921875, "loss_aux_layer_11": 0.09228515625, "loss_aux_layer_12": 0.099365234375, "loss_aux_layer_13": 0.106689453125, "loss_aux_layer_14": 0.1180419921875, "loss_aux_layer_15": 0.128662109375, "loss_aux_layer_16": 0.139404296875, "loss_aux_layer_17": 0.14697265625, "loss_aux_layer_18": 0.156005859375, "loss_aux_layer_19": 0.158203125, "loss_aux_layer_2": 0.07373046875, "loss_aux_layer_20": 0.165283203125, "loss_aux_layer_21": 0.172607421875, "loss_aux_layer_22": 0.195556640625, "loss_aux_layer_23": 0.2373046875, "loss_aux_layer_3": 0.0843505859375, "loss_aux_layer_4": 0.087158203125, "loss_aux_layer_5": 0.088623046875, "loss_aux_layer_6": 0.091064453125, "loss_aux_layer_7": 0.08740234375, "loss_aux_layer_8": 0.0863037109375, "loss_aux_layer_9": 0.0850830078125, "step": 1348, "total_loss": 0.7062135636806488 }, { "epoch": 0.26707582656899626, "grad_norm": 1.2493489980697632, "learning_rate": 5e-05, "llm_loss": 0.5920539945363998, "loss": 2.8147, "loss_aux_layer_0": 0.0240478515625, "loss_aux_layer_1": 0.05718994140625, "loss_aux_layer_10": 0.08349609375, "loss_aux_layer_11": 0.0887451171875, "loss_aux_layer_12": 0.0953369140625, "loss_aux_layer_13": 0.1026611328125, "loss_aux_layer_14": 0.1131591796875, "loss_aux_layer_15": 0.12353515625, "loss_aux_layer_16": 0.13427734375, "loss_aux_layer_17": 0.142578125, "loss_aux_layer_18": 0.15087890625, "loss_aux_layer_19": 0.1533203125, "loss_aux_layer_2": 0.06793212890625, "loss_aux_layer_20": 0.160888671875, "loss_aux_layer_21": 0.16845703125, "loss_aux_layer_22": 0.1923828125, "loss_aux_layer_23": 0.233154296875, "loss_aux_layer_3": 0.0806884765625, "loss_aux_layer_4": 0.083740234375, "loss_aux_layer_5": 0.0853271484375, "loss_aux_layer_6": 0.0880126953125, "loss_aux_layer_7": 0.0843505859375, "loss_aux_layer_8": 0.083251953125, "loss_aux_layer_9": 0.0819091796875, "step": 1349, "total_loss": 0.7036823928356171 }, { "epoch": 0.26727380716689764, "grad_norm": 2.27935528755188, "learning_rate": 5e-05, "llm_loss": 0.6379538625478745, "loss": 3.0107, "loss_aux_layer_0": 0.024688720703125, "loss_aux_layer_1": 0.05926513671875, "loss_aux_layer_10": 0.0859375, "loss_aux_layer_11": 0.0914306640625, "loss_aux_layer_12": 0.098388671875, "loss_aux_layer_13": 0.10595703125, "loss_aux_layer_14": 0.117919921875, "loss_aux_layer_15": 0.1282958984375, "loss_aux_layer_16": 0.139404296875, "loss_aux_layer_17": 0.14794921875, "loss_aux_layer_18": 0.156982421875, "loss_aux_layer_19": 0.158935546875, "loss_aux_layer_2": 0.071533203125, "loss_aux_layer_20": 0.1650390625, "loss_aux_layer_21": 0.172119140625, "loss_aux_layer_22": 0.194580078125, "loss_aux_layer_23": 0.235107421875, "loss_aux_layer_3": 0.08251953125, "loss_aux_layer_4": 0.0858154296875, "loss_aux_layer_5": 0.08740234375, "loss_aux_layer_6": 0.0897216796875, "loss_aux_layer_7": 0.0859375, "loss_aux_layer_8": 0.084716796875, "loss_aux_layer_9": 0.0841064453125, "step": 1350, "total_loss": 0.752666100859642 }, { "epoch": 0.2674717877647991, "grad_norm": 0.9108063578605652, "learning_rate": 5e-05, "llm_loss": 0.6747933924198151, "loss": 3.1367, "loss_aux_layer_0": 0.023406982421875, "loss_aux_layer_1": 0.05615234375, "loss_aux_layer_10": 0.08154296875, "loss_aux_layer_11": 0.087158203125, "loss_aux_layer_12": 0.0938720703125, "loss_aux_layer_13": 0.1011962890625, "loss_aux_layer_14": 0.112060546875, "loss_aux_layer_15": 0.12255859375, "loss_aux_layer_16": 0.133544921875, "loss_aux_layer_17": 0.141845703125, "loss_aux_layer_18": 0.150146484375, "loss_aux_layer_19": 0.152587890625, "loss_aux_layer_2": 0.0653076171875, "loss_aux_layer_20": 0.15966796875, "loss_aux_layer_21": 0.166259765625, "loss_aux_layer_22": 0.186767578125, "loss_aux_layer_23": 0.225830078125, "loss_aux_layer_3": 0.0771484375, "loss_aux_layer_4": 0.080322265625, "loss_aux_layer_5": 0.0816650390625, "loss_aux_layer_6": 0.0843505859375, "loss_aux_layer_7": 0.0811767578125, "loss_aux_layer_8": 0.08056640625, "loss_aux_layer_9": 0.079833984375, "step": 1351, "total_loss": 0.784166008234024 }, { "epoch": 0.26766976836270046, "grad_norm": 1.9903795719146729, "learning_rate": 5e-05, "llm_loss": 0.6131850332021713, "loss": 2.9057, "loss_aux_layer_0": 0.024993896484375, "loss_aux_layer_1": 0.06011962890625, "loss_aux_layer_10": 0.0850830078125, "loss_aux_layer_11": 0.090576171875, "loss_aux_layer_12": 0.0972900390625, "loss_aux_layer_13": 0.1043701171875, "loss_aux_layer_14": 0.115478515625, "loss_aux_layer_15": 0.12548828125, "loss_aux_layer_16": 0.13671875, "loss_aux_layer_17": 0.144287109375, "loss_aux_layer_18": 0.1533203125, "loss_aux_layer_19": 0.155517578125, "loss_aux_layer_2": 0.0693359375, "loss_aux_layer_20": 0.162841796875, "loss_aux_layer_21": 0.17041015625, "loss_aux_layer_22": 0.1923828125, "loss_aux_layer_23": 0.232177734375, "loss_aux_layer_3": 0.0811767578125, "loss_aux_layer_4": 0.0848388671875, "loss_aux_layer_5": 0.0867919921875, "loss_aux_layer_6": 0.0899658203125, "loss_aux_layer_7": 0.0859375, "loss_aux_layer_8": 0.08447265625, "loss_aux_layer_9": 0.08349609375, "step": 1352, "total_loss": 0.7264212965965271 }, { "epoch": 0.26786774896060184, "grad_norm": 2.258514404296875, "learning_rate": 5e-05, "llm_loss": 0.5604672133922577, "loss": 2.6774, "loss_aux_layer_0": 0.02294921875, "loss_aux_layer_1": 0.0556640625, "loss_aux_layer_10": 0.08154296875, "loss_aux_layer_11": 0.08642578125, "loss_aux_layer_12": 0.092529296875, "loss_aux_layer_13": 0.099609375, "loss_aux_layer_14": 0.110595703125, "loss_aux_layer_15": 0.1207275390625, "loss_aux_layer_16": 0.13134765625, "loss_aux_layer_17": 0.138916015625, "loss_aux_layer_18": 0.147216796875, "loss_aux_layer_19": 0.149658203125, "loss_aux_layer_2": 0.066162109375, "loss_aux_layer_20": 0.15673828125, "loss_aux_layer_21": 0.1650390625, "loss_aux_layer_22": 0.18701171875, "loss_aux_layer_23": 0.227783203125, "loss_aux_layer_3": 0.0782470703125, "loss_aux_layer_4": 0.08154296875, "loss_aux_layer_5": 0.083251953125, "loss_aux_layer_6": 0.0860595703125, "loss_aux_layer_7": 0.0826416015625, "loss_aux_layer_8": 0.08154296875, "loss_aux_layer_9": 0.0802001953125, "step": 1353, "total_loss": 0.6693557053804398 }, { "epoch": 0.2680657295585033, "grad_norm": 1.2765980958938599, "learning_rate": 5e-05, "llm_loss": 0.6135463267564774, "loss": 2.9145, "loss_aux_layer_0": 0.024017333984375, "loss_aux_layer_1": 0.0623779296875, "loss_aux_layer_10": 0.08740234375, "loss_aux_layer_11": 0.09326171875, "loss_aux_layer_12": 0.0997314453125, "loss_aux_layer_13": 0.107421875, "loss_aux_layer_14": 0.1185302734375, "loss_aux_layer_15": 0.12890625, "loss_aux_layer_16": 0.139892578125, "loss_aux_layer_17": 0.147216796875, "loss_aux_layer_18": 0.155517578125, "loss_aux_layer_19": 0.157470703125, "loss_aux_layer_2": 0.072265625, "loss_aux_layer_20": 0.163818359375, "loss_aux_layer_21": 0.169677734375, "loss_aux_layer_22": 0.191162109375, "loss_aux_layer_23": 0.2294921875, "loss_aux_layer_3": 0.0843505859375, "loss_aux_layer_4": 0.0875244140625, "loss_aux_layer_5": 0.0889892578125, "loss_aux_layer_6": 0.0916748046875, "loss_aux_layer_7": 0.0882568359375, "loss_aux_layer_8": 0.0869140625, "loss_aux_layer_9": 0.085693359375, "step": 1354, "total_loss": 0.7286228835582733 }, { "epoch": 0.26826371015640466, "grad_norm": 2.4599313735961914, "learning_rate": 5e-05, "llm_loss": 0.6224515736103058, "loss": 2.9285, "loss_aux_layer_0": 0.024169921875, "loss_aux_layer_1": 0.05438232421875, "loss_aux_layer_10": 0.0810546875, "loss_aux_layer_11": 0.086181640625, "loss_aux_layer_12": 0.0926513671875, "loss_aux_layer_13": 0.10009765625, "loss_aux_layer_14": 0.111572265625, "loss_aux_layer_15": 0.12255859375, "loss_aux_layer_16": 0.133544921875, "loss_aux_layer_17": 0.142578125, "loss_aux_layer_18": 0.151611328125, "loss_aux_layer_19": 0.154052734375, "loss_aux_layer_2": 0.06561279296875, "loss_aux_layer_20": 0.160888671875, "loss_aux_layer_21": 0.16650390625, "loss_aux_layer_22": 0.18798828125, "loss_aux_layer_23": 0.22802734375, "loss_aux_layer_3": 0.0782470703125, "loss_aux_layer_4": 0.0811767578125, "loss_aux_layer_5": 0.0821533203125, "loss_aux_layer_6": 0.0850830078125, "loss_aux_layer_7": 0.08154296875, "loss_aux_layer_8": 0.080810546875, "loss_aux_layer_9": 0.079833984375, "step": 1355, "total_loss": 0.7321236580610275 }, { "epoch": 0.2684616907543061, "grad_norm": 1.0956645011901855, "learning_rate": 5e-05, "llm_loss": 0.5668695047497749, "loss": 2.7042, "loss_aux_layer_0": 0.024658203125, "loss_aux_layer_1": 0.05535888671875, "loss_aux_layer_10": 0.080810546875, "loss_aux_layer_11": 0.0860595703125, "loss_aux_layer_12": 0.0924072265625, "loss_aux_layer_13": 0.0992431640625, "loss_aux_layer_14": 0.1104736328125, "loss_aux_layer_15": 0.1209716796875, "loss_aux_layer_16": 0.1319580078125, "loss_aux_layer_17": 0.14013671875, "loss_aux_layer_18": 0.149658203125, "loss_aux_layer_19": 0.15283203125, "loss_aux_layer_2": 0.0653076171875, "loss_aux_layer_20": 0.16064453125, "loss_aux_layer_21": 0.167724609375, "loss_aux_layer_22": 0.188232421875, "loss_aux_layer_23": 0.229248046875, "loss_aux_layer_3": 0.0772705078125, "loss_aux_layer_4": 0.0799560546875, "loss_aux_layer_5": 0.08203125, "loss_aux_layer_6": 0.0845947265625, "loss_aux_layer_7": 0.0814208984375, "loss_aux_layer_8": 0.080322265625, "loss_aux_layer_9": 0.0789794921875, "step": 1356, "total_loss": 0.6760623753070831 }, { "epoch": 0.2686596713522075, "grad_norm": 2.1480085849761963, "learning_rate": 5e-05, "llm_loss": 0.5115576460957527, "loss": 2.505, "loss_aux_layer_0": 0.023681640625, "loss_aux_layer_1": 0.05938720703125, "loss_aux_layer_10": 0.0872802734375, "loss_aux_layer_11": 0.0927734375, "loss_aux_layer_12": 0.0994873046875, "loss_aux_layer_13": 0.1068115234375, "loss_aux_layer_14": 0.1175537109375, "loss_aux_layer_15": 0.128173828125, "loss_aux_layer_16": 0.138671875, "loss_aux_layer_17": 0.146728515625, "loss_aux_layer_18": 0.155029296875, "loss_aux_layer_19": 0.15673828125, "loss_aux_layer_2": 0.0711669921875, "loss_aux_layer_20": 0.16357421875, "loss_aux_layer_21": 0.17041015625, "loss_aux_layer_22": 0.19189453125, "loss_aux_layer_23": 0.232421875, "loss_aux_layer_3": 0.08349609375, "loss_aux_layer_4": 0.086669921875, "loss_aux_layer_5": 0.088623046875, "loss_aux_layer_6": 0.091064453125, "loss_aux_layer_7": 0.088134765625, "loss_aux_layer_8": 0.087158203125, "loss_aux_layer_9": 0.0859375, "step": 1357, "total_loss": 0.6262555867433548 }, { "epoch": 0.2688576519501089, "grad_norm": 1.248384714126587, "learning_rate": 5e-05, "llm_loss": 0.6343340426683426, "loss": 2.9667, "loss_aux_layer_0": 0.024688720703125, "loss_aux_layer_1": 0.05438232421875, "loss_aux_layer_10": 0.079833984375, "loss_aux_layer_11": 0.084228515625, "loss_aux_layer_12": 0.091064453125, "loss_aux_layer_13": 0.0982666015625, "loss_aux_layer_14": 0.1090087890625, "loss_aux_layer_15": 0.1190185546875, "loss_aux_layer_16": 0.129638671875, "loss_aux_layer_17": 0.137939453125, "loss_aux_layer_18": 0.147216796875, "loss_aux_layer_19": 0.150390625, "loss_aux_layer_2": 0.0638427734375, "loss_aux_layer_20": 0.15771484375, "loss_aux_layer_21": 0.165283203125, "loss_aux_layer_22": 0.185791015625, "loss_aux_layer_23": 0.226318359375, "loss_aux_layer_3": 0.0755615234375, "loss_aux_layer_4": 0.078369140625, "loss_aux_layer_5": 0.079833984375, "loss_aux_layer_6": 0.0823974609375, "loss_aux_layer_7": 0.07958984375, "loss_aux_layer_8": 0.07861328125, "loss_aux_layer_9": 0.0780029296875, "step": 1358, "total_loss": 0.7416863292455673 }, { "epoch": 0.2690556325480103, "grad_norm": 1.3866877555847168, "learning_rate": 5e-05, "llm_loss": 0.5534501075744629, "loss": 2.6633, "loss_aux_layer_0": 0.024322509765625, "loss_aux_layer_1": 0.056884765625, "loss_aux_layer_10": 0.0833740234375, "loss_aux_layer_11": 0.088623046875, "loss_aux_layer_12": 0.095458984375, "loss_aux_layer_13": 0.1026611328125, "loss_aux_layer_14": 0.11376953125, "loss_aux_layer_15": 0.124755859375, "loss_aux_layer_16": 0.136474609375, "loss_aux_layer_17": 0.144775390625, "loss_aux_layer_18": 0.154052734375, "loss_aux_layer_19": 0.15673828125, "loss_aux_layer_2": 0.067138671875, "loss_aux_layer_20": 0.1640625, "loss_aux_layer_21": 0.171875, "loss_aux_layer_22": 0.194580078125, "loss_aux_layer_23": 0.23583984375, "loss_aux_layer_3": 0.079345703125, "loss_aux_layer_4": 0.082275390625, "loss_aux_layer_5": 0.083984375, "loss_aux_layer_6": 0.0870361328125, "loss_aux_layer_7": 0.0838623046875, "loss_aux_layer_8": 0.0830078125, "loss_aux_layer_9": 0.08154296875, "step": 1359, "total_loss": 0.665826141834259 }, { "epoch": 0.2692536131459117, "grad_norm": 1.405951738357544, "learning_rate": 5e-05, "llm_loss": 0.5294087380170822, "loss": 2.5613, "loss_aux_layer_0": 0.023193359375, "loss_aux_layer_1": 0.055419921875, "loss_aux_layer_10": 0.082275390625, "loss_aux_layer_11": 0.0872802734375, "loss_aux_layer_12": 0.09423828125, "loss_aux_layer_13": 0.1015625, "loss_aux_layer_14": 0.11279296875, "loss_aux_layer_15": 0.124267578125, "loss_aux_layer_16": 0.134765625, "loss_aux_layer_17": 0.143310546875, "loss_aux_layer_18": 0.152099609375, "loss_aux_layer_19": 0.15478515625, "loss_aux_layer_2": 0.06634521484375, "loss_aux_layer_20": 0.162109375, "loss_aux_layer_21": 0.169677734375, "loss_aux_layer_22": 0.19140625, "loss_aux_layer_23": 0.232177734375, "loss_aux_layer_3": 0.0789794921875, "loss_aux_layer_4": 0.0819091796875, "loss_aux_layer_5": 0.0836181640625, "loss_aux_layer_6": 0.0859375, "loss_aux_layer_7": 0.0828857421875, "loss_aux_layer_8": 0.08154296875, "loss_aux_layer_9": 0.08056640625, "step": 1360, "total_loss": 0.6403326243162155 }, { "epoch": 0.2694515937438131, "grad_norm": 1.6166940927505493, "learning_rate": 5e-05, "llm_loss": 0.5934208333492279, "loss": 2.8209, "loss_aux_layer_0": 0.024749755859375, "loss_aux_layer_1": 0.05908203125, "loss_aux_layer_10": 0.08447265625, "loss_aux_layer_11": 0.0899658203125, "loss_aux_layer_12": 0.09619140625, "loss_aux_layer_13": 0.103515625, "loss_aux_layer_14": 0.1141357421875, "loss_aux_layer_15": 0.1246337890625, "loss_aux_layer_16": 0.135009765625, "loss_aux_layer_17": 0.142333984375, "loss_aux_layer_18": 0.150634765625, "loss_aux_layer_19": 0.152099609375, "loss_aux_layer_2": 0.06884765625, "loss_aux_layer_20": 0.15869140625, "loss_aux_layer_21": 0.166015625, "loss_aux_layer_22": 0.188232421875, "loss_aux_layer_23": 0.229248046875, "loss_aux_layer_3": 0.0816650390625, "loss_aux_layer_4": 0.0849609375, "loss_aux_layer_5": 0.0865478515625, "loss_aux_layer_6": 0.0889892578125, "loss_aux_layer_7": 0.085693359375, "loss_aux_layer_8": 0.0843505859375, "loss_aux_layer_9": 0.0828857421875, "step": 1361, "total_loss": 0.705216720700264 }, { "epoch": 0.2696495743417145, "grad_norm": 2.9878041744232178, "learning_rate": 5e-05, "llm_loss": 0.6479092836380005, "loss": 3.0535, "loss_aux_layer_0": 0.025146484375, "loss_aux_layer_1": 0.06109619140625, "loss_aux_layer_10": 0.08740234375, "loss_aux_layer_11": 0.0928955078125, "loss_aux_layer_12": 0.099609375, "loss_aux_layer_13": 0.1065673828125, "loss_aux_layer_14": 0.117431640625, "loss_aux_layer_15": 0.128173828125, "loss_aux_layer_16": 0.138916015625, "loss_aux_layer_17": 0.14697265625, "loss_aux_layer_18": 0.15576171875, "loss_aux_layer_19": 0.157470703125, "loss_aux_layer_2": 0.0718994140625, "loss_aux_layer_20": 0.164306640625, "loss_aux_layer_21": 0.17041015625, "loss_aux_layer_22": 0.1923828125, "loss_aux_layer_23": 0.234130859375, "loss_aux_layer_3": 0.08447265625, "loss_aux_layer_4": 0.088134765625, "loss_aux_layer_5": 0.09130859375, "loss_aux_layer_6": 0.0933837890625, "loss_aux_layer_7": 0.089111328125, "loss_aux_layer_8": 0.087646484375, "loss_aux_layer_9": 0.086181640625, "step": 1362, "total_loss": 0.7633754163980484 }, { "epoch": 0.26984755493961593, "grad_norm": 1.771222472190857, "learning_rate": 5e-05, "llm_loss": 0.584342934191227, "loss": 2.7737, "loss_aux_layer_0": 0.024169921875, "loss_aux_layer_1": 0.0538330078125, "loss_aux_layer_10": 0.0802001953125, "loss_aux_layer_11": 0.085205078125, "loss_aux_layer_12": 0.091552734375, "loss_aux_layer_13": 0.09912109375, "loss_aux_layer_14": 0.110595703125, "loss_aux_layer_15": 0.1220703125, "loss_aux_layer_16": 0.1334228515625, "loss_aux_layer_17": 0.141357421875, "loss_aux_layer_18": 0.15087890625, "loss_aux_layer_19": 0.15380859375, "loss_aux_layer_2": 0.06561279296875, "loss_aux_layer_20": 0.160888671875, "loss_aux_layer_21": 0.1669921875, "loss_aux_layer_22": 0.1875, "loss_aux_layer_23": 0.22802734375, "loss_aux_layer_3": 0.076416015625, "loss_aux_layer_4": 0.0797119140625, "loss_aux_layer_5": 0.0821533203125, "loss_aux_layer_6": 0.0848388671875, "loss_aux_layer_7": 0.0811767578125, "loss_aux_layer_8": 0.0799560546875, "loss_aux_layer_9": 0.078857421875, "step": 1363, "total_loss": 0.6934270858764648 }, { "epoch": 0.2700455355375173, "grad_norm": 1.4596798419952393, "learning_rate": 5e-05, "llm_loss": 0.5354110077023506, "loss": 2.5655, "loss_aux_layer_0": 0.0234375, "loss_aux_layer_1": 0.0531005859375, "loss_aux_layer_10": 0.0770263671875, "loss_aux_layer_11": 0.08154296875, "loss_aux_layer_12": 0.087646484375, "loss_aux_layer_13": 0.0947265625, "loss_aux_layer_14": 0.1055908203125, "loss_aux_layer_15": 0.1162109375, "loss_aux_layer_16": 0.1273193359375, "loss_aux_layer_17": 0.135986328125, "loss_aux_layer_18": 0.145751953125, "loss_aux_layer_19": 0.14990234375, "loss_aux_layer_2": 0.0640869140625, "loss_aux_layer_20": 0.15771484375, "loss_aux_layer_21": 0.1650390625, "loss_aux_layer_22": 0.18603515625, "loss_aux_layer_23": 0.226318359375, "loss_aux_layer_3": 0.073974609375, "loss_aux_layer_4": 0.07763671875, "loss_aux_layer_5": 0.0804443359375, "loss_aux_layer_6": 0.0826416015625, "loss_aux_layer_7": 0.0784912109375, "loss_aux_layer_8": 0.0771484375, "loss_aux_layer_9": 0.0758056640625, "step": 1364, "total_loss": 0.641382023692131 }, { "epoch": 0.27024351613541875, "grad_norm": 1.9964007139205933, "learning_rate": 5e-05, "llm_loss": 0.5978411436080933, "loss": 2.8424, "loss_aux_layer_0": 0.025543212890625, "loss_aux_layer_1": 0.05950927734375, "loss_aux_layer_10": 0.085693359375, "loss_aux_layer_11": 0.0908203125, "loss_aux_layer_12": 0.0975341796875, "loss_aux_layer_13": 0.104248046875, "loss_aux_layer_14": 0.1153564453125, "loss_aux_layer_15": 0.1260986328125, "loss_aux_layer_16": 0.13623046875, "loss_aux_layer_17": 0.143798828125, "loss_aux_layer_18": 0.15234375, "loss_aux_layer_19": 0.154541015625, "loss_aux_layer_2": 0.071533203125, "loss_aux_layer_20": 0.160888671875, "loss_aux_layer_21": 0.16796875, "loss_aux_layer_22": 0.188232421875, "loss_aux_layer_23": 0.228759765625, "loss_aux_layer_3": 0.08203125, "loss_aux_layer_4": 0.0848388671875, "loss_aux_layer_5": 0.086181640625, "loss_aux_layer_6": 0.089111328125, "loss_aux_layer_7": 0.0859375, "loss_aux_layer_8": 0.0853271484375, "loss_aux_layer_9": 0.0843505859375, "step": 1365, "total_loss": 0.7106105834245682 }, { "epoch": 0.27044149673332013, "grad_norm": 1.2011135816574097, "learning_rate": 5e-05, "llm_loss": 0.5468245521187782, "loss": 2.625, "loss_aux_layer_0": 0.02447509765625, "loss_aux_layer_1": 0.056884765625, "loss_aux_layer_10": 0.082275390625, "loss_aux_layer_11": 0.08740234375, "loss_aux_layer_12": 0.09375, "loss_aux_layer_13": 0.10009765625, "loss_aux_layer_14": 0.11083984375, "loss_aux_layer_15": 0.12060546875, "loss_aux_layer_16": 0.130615234375, "loss_aux_layer_17": 0.13818359375, "loss_aux_layer_18": 0.147216796875, "loss_aux_layer_19": 0.150146484375, "loss_aux_layer_2": 0.067626953125, "loss_aux_layer_20": 0.157470703125, "loss_aux_layer_21": 0.164794921875, "loss_aux_layer_22": 0.186279296875, "loss_aux_layer_23": 0.226318359375, "loss_aux_layer_3": 0.0797119140625, "loss_aux_layer_4": 0.0826416015625, "loss_aux_layer_5": 0.0843505859375, "loss_aux_layer_6": 0.0870361328125, "loss_aux_layer_7": 0.0836181640625, "loss_aux_layer_8": 0.082275390625, "loss_aux_layer_9": 0.0809326171875, "step": 1366, "total_loss": 0.6562537550926208 }, { "epoch": 0.27063947733122157, "grad_norm": 1.6078343391418457, "learning_rate": 5e-05, "llm_loss": 0.5983452126383781, "loss": 2.8458, "loss_aux_layer_0": 0.0262451171875, "loss_aux_layer_1": 0.0592041015625, "loss_aux_layer_10": 0.08447265625, "loss_aux_layer_11": 0.0897216796875, "loss_aux_layer_12": 0.0965576171875, "loss_aux_layer_13": 0.103271484375, "loss_aux_layer_14": 0.11474609375, "loss_aux_layer_15": 0.1256103515625, "loss_aux_layer_16": 0.137451171875, "loss_aux_layer_17": 0.144775390625, "loss_aux_layer_18": 0.154052734375, "loss_aux_layer_19": 0.156005859375, "loss_aux_layer_2": 0.06927490234375, "loss_aux_layer_20": 0.16357421875, "loss_aux_layer_21": 0.17138671875, "loss_aux_layer_22": 0.19384765625, "loss_aux_layer_23": 0.23291015625, "loss_aux_layer_3": 0.0804443359375, "loss_aux_layer_4": 0.0836181640625, "loss_aux_layer_5": 0.086181640625, "loss_aux_layer_6": 0.0892333984375, "loss_aux_layer_7": 0.085205078125, "loss_aux_layer_8": 0.083984375, "loss_aux_layer_9": 0.08251953125, "step": 1367, "total_loss": 0.7114482820034027 }, { "epoch": 0.27083745792912295, "grad_norm": 1.526332139968872, "learning_rate": 5e-05, "llm_loss": 0.6686852425336838, "loss": 3.1168, "loss_aux_layer_0": 0.024322509765625, "loss_aux_layer_1": 0.055419921875, "loss_aux_layer_10": 0.0836181640625, "loss_aux_layer_11": 0.0885009765625, "loss_aux_layer_12": 0.094970703125, "loss_aux_layer_13": 0.1016845703125, "loss_aux_layer_14": 0.1121826171875, "loss_aux_layer_15": 0.1219482421875, "loss_aux_layer_16": 0.13232421875, "loss_aux_layer_17": 0.140625, "loss_aux_layer_18": 0.149169921875, "loss_aux_layer_19": 0.15234375, "loss_aux_layer_2": 0.0682373046875, "loss_aux_layer_20": 0.1591796875, "loss_aux_layer_21": 0.165771484375, "loss_aux_layer_22": 0.187744140625, "loss_aux_layer_23": 0.227783203125, "loss_aux_layer_3": 0.0797119140625, "loss_aux_layer_4": 0.083740234375, "loss_aux_layer_5": 0.086181640625, "loss_aux_layer_6": 0.0882568359375, "loss_aux_layer_7": 0.084228515625, "loss_aux_layer_8": 0.0830078125, "loss_aux_layer_9": 0.0819091796875, "step": 1368, "total_loss": 0.7792018055915833 }, { "epoch": 0.27103543852702433, "grad_norm": 1.2571014165878296, "learning_rate": 5e-05, "llm_loss": 0.6128269359469414, "loss": 2.8839, "loss_aux_layer_0": 0.0244140625, "loss_aux_layer_1": 0.056640625, "loss_aux_layer_10": 0.0814208984375, "loss_aux_layer_11": 0.08642578125, "loss_aux_layer_12": 0.0927734375, "loss_aux_layer_13": 0.0994873046875, "loss_aux_layer_14": 0.1097412109375, "loss_aux_layer_15": 0.1195068359375, "loss_aux_layer_16": 0.1300048828125, "loss_aux_layer_17": 0.1376953125, "loss_aux_layer_18": 0.146484375, "loss_aux_layer_19": 0.148681640625, "loss_aux_layer_2": 0.067138671875, "loss_aux_layer_20": 0.155517578125, "loss_aux_layer_21": 0.16162109375, "loss_aux_layer_22": 0.182373046875, "loss_aux_layer_23": 0.221923828125, "loss_aux_layer_3": 0.07861328125, "loss_aux_layer_4": 0.0819091796875, "loss_aux_layer_5": 0.0831298828125, "loss_aux_layer_6": 0.0858154296875, "loss_aux_layer_7": 0.082275390625, "loss_aux_layer_8": 0.0814208984375, "loss_aux_layer_9": 0.0799560546875, "step": 1369, "total_loss": 0.7209710627794266 }, { "epoch": 0.27123341912492577, "grad_norm": 1.4737595319747925, "learning_rate": 5e-05, "llm_loss": 0.6986356228590012, "loss": 3.2456, "loss_aux_layer_0": 0.023834228515625, "loss_aux_layer_1": 0.05792236328125, "loss_aux_layer_10": 0.0848388671875, "loss_aux_layer_11": 0.0902099609375, "loss_aux_layer_12": 0.096923828125, "loss_aux_layer_13": 0.1048583984375, "loss_aux_layer_14": 0.11572265625, "loss_aux_layer_15": 0.1263427734375, "loss_aux_layer_16": 0.13623046875, "loss_aux_layer_17": 0.14404296875, "loss_aux_layer_18": 0.15283203125, "loss_aux_layer_19": 0.1552734375, "loss_aux_layer_2": 0.069091796875, "loss_aux_layer_20": 0.16162109375, "loss_aux_layer_21": 0.16796875, "loss_aux_layer_22": 0.1904296875, "loss_aux_layer_23": 0.231689453125, "loss_aux_layer_3": 0.081298828125, "loss_aux_layer_4": 0.0848388671875, "loss_aux_layer_5": 0.087158203125, "loss_aux_layer_6": 0.0897216796875, "loss_aux_layer_7": 0.0858154296875, "loss_aux_layer_8": 0.0848388671875, "loss_aux_layer_9": 0.083251953125, "step": 1370, "total_loss": 0.8113969564437866 }, { "epoch": 0.27143139972282715, "grad_norm": 1.2655915021896362, "learning_rate": 5e-05, "llm_loss": 0.6280775219202042, "loss": 2.9646, "loss_aux_layer_0": 0.0244140625, "loss_aux_layer_1": 0.05810546875, "loss_aux_layer_10": 0.0848388671875, "loss_aux_layer_11": 0.08984375, "loss_aux_layer_12": 0.096435546875, "loss_aux_layer_13": 0.1033935546875, "loss_aux_layer_14": 0.11474609375, "loss_aux_layer_15": 0.1258544921875, "loss_aux_layer_16": 0.136962890625, "loss_aux_layer_17": 0.144287109375, "loss_aux_layer_18": 0.15283203125, "loss_aux_layer_19": 0.15576171875, "loss_aux_layer_2": 0.0692138671875, "loss_aux_layer_20": 0.162841796875, "loss_aux_layer_21": 0.169921875, "loss_aux_layer_22": 0.19189453125, "loss_aux_layer_23": 0.23388671875, "loss_aux_layer_3": 0.081298828125, "loss_aux_layer_4": 0.0848388671875, "loss_aux_layer_5": 0.087158203125, "loss_aux_layer_6": 0.0902099609375, "loss_aux_layer_7": 0.0863037109375, "loss_aux_layer_8": 0.0848388671875, "loss_aux_layer_9": 0.0836181640625, "step": 1371, "total_loss": 0.7411481589078903 }, { "epoch": 0.2716293803207286, "grad_norm": 1.670616626739502, "learning_rate": 5e-05, "llm_loss": 0.5952649265527725, "loss": 2.8376, "loss_aux_layer_0": 0.023834228515625, "loss_aux_layer_1": 0.05810546875, "loss_aux_layer_10": 0.0860595703125, "loss_aux_layer_11": 0.0916748046875, "loss_aux_layer_12": 0.098876953125, "loss_aux_layer_13": 0.1065673828125, "loss_aux_layer_14": 0.1181640625, "loss_aux_layer_15": 0.1290283203125, "loss_aux_layer_16": 0.1396484375, "loss_aux_layer_17": 0.14794921875, "loss_aux_layer_18": 0.157470703125, "loss_aux_layer_19": 0.159423828125, "loss_aux_layer_2": 0.0684814453125, "loss_aux_layer_20": 0.165283203125, "loss_aux_layer_21": 0.17138671875, "loss_aux_layer_22": 0.19140625, "loss_aux_layer_23": 0.2314453125, "loss_aux_layer_3": 0.080322265625, "loss_aux_layer_4": 0.0836181640625, "loss_aux_layer_5": 0.0855712890625, "loss_aux_layer_6": 0.0888671875, "loss_aux_layer_7": 0.0859375, "loss_aux_layer_8": 0.0848388671875, "loss_aux_layer_9": 0.0845947265625, "step": 1372, "total_loss": 0.7094003558158875 }, { "epoch": 0.27182736091862997, "grad_norm": 1.297201156616211, "learning_rate": 5e-05, "llm_loss": 0.5936165302991867, "loss": 2.8113, "loss_aux_layer_0": 0.02490234375, "loss_aux_layer_1": 0.05450439453125, "loss_aux_layer_10": 0.079833984375, "loss_aux_layer_11": 0.0849609375, "loss_aux_layer_12": 0.0919189453125, "loss_aux_layer_13": 0.0989990234375, "loss_aux_layer_14": 0.1104736328125, "loss_aux_layer_15": 0.1217041015625, "loss_aux_layer_16": 0.132568359375, "loss_aux_layer_17": 0.141357421875, "loss_aux_layer_18": 0.150390625, "loss_aux_layer_19": 0.1533203125, "loss_aux_layer_2": 0.063720703125, "loss_aux_layer_20": 0.161376953125, "loss_aux_layer_21": 0.1689453125, "loss_aux_layer_22": 0.19189453125, "loss_aux_layer_23": 0.2333984375, "loss_aux_layer_3": 0.0751953125, "loss_aux_layer_4": 0.0784912109375, "loss_aux_layer_5": 0.0804443359375, "loss_aux_layer_6": 0.0836181640625, "loss_aux_layer_7": 0.0804443359375, "loss_aux_layer_8": 0.0794677734375, "loss_aux_layer_9": 0.078369140625, "step": 1373, "total_loss": 0.7028157263994217 }, { "epoch": 0.2720253415165314, "grad_norm": 1.5470494031906128, "learning_rate": 5e-05, "llm_loss": 0.5191290825605392, "loss": 2.5421, "loss_aux_layer_0": 0.022613525390625, "loss_aux_layer_1": 0.05810546875, "loss_aux_layer_10": 0.0885009765625, "loss_aux_layer_11": 0.09423828125, "loss_aux_layer_12": 0.1011962890625, "loss_aux_layer_13": 0.1090087890625, "loss_aux_layer_14": 0.1202392578125, "loss_aux_layer_15": 0.13134765625, "loss_aux_layer_16": 0.14306640625, "loss_aux_layer_17": 0.15087890625, "loss_aux_layer_18": 0.159912109375, "loss_aux_layer_19": 0.161376953125, "loss_aux_layer_2": 0.0697021484375, "loss_aux_layer_20": 0.167236328125, "loss_aux_layer_21": 0.173828125, "loss_aux_layer_22": 0.1953125, "loss_aux_layer_23": 0.236328125, "loss_aux_layer_3": 0.0826416015625, "loss_aux_layer_4": 0.08642578125, "loss_aux_layer_5": 0.08837890625, "loss_aux_layer_6": 0.0914306640625, "loss_aux_layer_7": 0.0882568359375, "loss_aux_layer_8": 0.087646484375, "loss_aux_layer_9": 0.0865478515625, "step": 1374, "total_loss": 0.6355288624763489 }, { "epoch": 0.2722233221144328, "grad_norm": 1.2125040292739868, "learning_rate": 5e-05, "llm_loss": 0.5780403539538383, "loss": 2.765, "loss_aux_layer_0": 0.025787353515625, "loss_aux_layer_1": 0.0599365234375, "loss_aux_layer_10": 0.08544921875, "loss_aux_layer_11": 0.0911865234375, "loss_aux_layer_12": 0.09716796875, "loss_aux_layer_13": 0.1046142578125, "loss_aux_layer_14": 0.1156005859375, "loss_aux_layer_15": 0.1263427734375, "loss_aux_layer_16": 0.13720703125, "loss_aux_layer_17": 0.144287109375, "loss_aux_layer_18": 0.15380859375, "loss_aux_layer_19": 0.1552734375, "loss_aux_layer_2": 0.0699462890625, "loss_aux_layer_20": 0.162353515625, "loss_aux_layer_21": 0.16796875, "loss_aux_layer_22": 0.189208984375, "loss_aux_layer_23": 0.2294921875, "loss_aux_layer_3": 0.0823974609375, "loss_aux_layer_4": 0.085693359375, "loss_aux_layer_5": 0.0872802734375, "loss_aux_layer_6": 0.090087890625, "loss_aux_layer_7": 0.08642578125, "loss_aux_layer_8": 0.0850830078125, "loss_aux_layer_9": 0.0838623046875, "step": 1375, "total_loss": 0.6912500709295273 }, { "epoch": 0.27242130271233417, "grad_norm": 2.0494930744171143, "learning_rate": 5e-05, "llm_loss": 0.5971136689186096, "loss": 2.8217, "loss_aux_layer_0": 0.026824951171875, "loss_aux_layer_1": 0.05560302734375, "loss_aux_layer_10": 0.078857421875, "loss_aux_layer_11": 0.083984375, "loss_aux_layer_12": 0.0909423828125, "loss_aux_layer_13": 0.098876953125, "loss_aux_layer_14": 0.1102294921875, "loss_aux_layer_15": 0.1212158203125, "loss_aux_layer_16": 0.1328125, "loss_aux_layer_17": 0.140380859375, "loss_aux_layer_18": 0.1494140625, "loss_aux_layer_19": 0.151611328125, "loss_aux_layer_2": 0.064453125, "loss_aux_layer_20": 0.15869140625, "loss_aux_layer_21": 0.167236328125, "loss_aux_layer_22": 0.188720703125, "loss_aux_layer_23": 0.2294921875, "loss_aux_layer_3": 0.0753173828125, "loss_aux_layer_4": 0.077880859375, "loss_aux_layer_5": 0.0797119140625, "loss_aux_layer_6": 0.0821533203125, "loss_aux_layer_7": 0.0792236328125, "loss_aux_layer_8": 0.078369140625, "loss_aux_layer_9": 0.077392578125, "step": 1376, "total_loss": 0.7054286599159241 }, { "epoch": 0.2726192833102356, "grad_norm": 1.6633785963058472, "learning_rate": 5e-05, "llm_loss": 0.6420328468084335, "loss": 3.0092, "loss_aux_layer_0": 0.023223876953125, "loss_aux_layer_1": 0.05560302734375, "loss_aux_layer_10": 0.0823974609375, "loss_aux_layer_11": 0.087890625, "loss_aux_layer_12": 0.0948486328125, "loss_aux_layer_13": 0.102294921875, "loss_aux_layer_14": 0.11328125, "loss_aux_layer_15": 0.123779296875, "loss_aux_layer_16": 0.135009765625, "loss_aux_layer_17": 0.142578125, "loss_aux_layer_18": 0.152099609375, "loss_aux_layer_19": 0.153564453125, "loss_aux_layer_2": 0.06640625, "loss_aux_layer_20": 0.159912109375, "loss_aux_layer_21": 0.166259765625, "loss_aux_layer_22": 0.185791015625, "loss_aux_layer_23": 0.22509765625, "loss_aux_layer_3": 0.0787353515625, "loss_aux_layer_4": 0.0816650390625, "loss_aux_layer_5": 0.0833740234375, "loss_aux_layer_6": 0.086181640625, "loss_aux_layer_7": 0.0831298828125, "loss_aux_layer_8": 0.0821533203125, "loss_aux_layer_9": 0.0810546875, "step": 1377, "total_loss": 0.7522947788238525 }, { "epoch": 0.272817263908137, "grad_norm": 1.6752071380615234, "learning_rate": 5e-05, "llm_loss": 0.6230457276105881, "loss": 2.9374, "loss_aux_layer_0": 0.0250244140625, "loss_aux_layer_1": 0.0552978515625, "loss_aux_layer_10": 0.08203125, "loss_aux_layer_11": 0.0872802734375, "loss_aux_layer_12": 0.093994140625, "loss_aux_layer_13": 0.1014404296875, "loss_aux_layer_14": 0.1134033203125, "loss_aux_layer_15": 0.1248779296875, "loss_aux_layer_16": 0.13623046875, "loss_aux_layer_17": 0.144287109375, "loss_aux_layer_18": 0.1533203125, "loss_aux_layer_19": 0.156005859375, "loss_aux_layer_2": 0.0650634765625, "loss_aux_layer_20": 0.16259765625, "loss_aux_layer_21": 0.170654296875, "loss_aux_layer_22": 0.19482421875, "loss_aux_layer_23": 0.236083984375, "loss_aux_layer_3": 0.0771484375, "loss_aux_layer_4": 0.080322265625, "loss_aux_layer_5": 0.08203125, "loss_aux_layer_6": 0.08544921875, "loss_aux_layer_7": 0.082275390625, "loss_aux_layer_8": 0.08154296875, "loss_aux_layer_9": 0.080322265625, "step": 1378, "total_loss": 0.7343423813581467 }, { "epoch": 0.2730152445060384, "grad_norm": 2.313260555267334, "learning_rate": 5e-05, "llm_loss": 0.5820596888661385, "loss": 2.752, "loss_aux_layer_0": 0.0252685546875, "loss_aux_layer_1": 0.05133056640625, "loss_aux_layer_10": 0.0760498046875, "loss_aux_layer_11": 0.0810546875, "loss_aux_layer_12": 0.0882568359375, "loss_aux_layer_13": 0.0958251953125, "loss_aux_layer_14": 0.1080322265625, "loss_aux_layer_15": 0.119384765625, "loss_aux_layer_16": 0.13134765625, "loss_aux_layer_17": 0.1396484375, "loss_aux_layer_18": 0.14892578125, "loss_aux_layer_19": 0.15185546875, "loss_aux_layer_2": 0.060546875, "loss_aux_layer_20": 0.1591796875, "loss_aux_layer_21": 0.16650390625, "loss_aux_layer_22": 0.186767578125, "loss_aux_layer_23": 0.228515625, "loss_aux_layer_3": 0.0712890625, "loss_aux_layer_4": 0.07373046875, "loss_aux_layer_5": 0.07568359375, "loss_aux_layer_6": 0.0787353515625, "loss_aux_layer_7": 0.07568359375, "loss_aux_layer_8": 0.0750732421875, "loss_aux_layer_9": 0.0743408203125, "step": 1379, "total_loss": 0.6879944950342178 }, { "epoch": 0.2732132251039398, "grad_norm": 2.108628273010254, "learning_rate": 5e-05, "llm_loss": 0.6816458702087402, "loss": 3.1814, "loss_aux_layer_0": 0.025299072265625, "loss_aux_layer_1": 0.0596923828125, "loss_aux_layer_10": 0.085205078125, "loss_aux_layer_11": 0.0908203125, "loss_aux_layer_12": 0.09716796875, "loss_aux_layer_13": 0.10498046875, "loss_aux_layer_14": 0.1163330078125, "loss_aux_layer_15": 0.1273193359375, "loss_aux_layer_16": 0.13818359375, "loss_aux_layer_17": 0.145751953125, "loss_aux_layer_18": 0.154296875, "loss_aux_layer_19": 0.156005859375, "loss_aux_layer_2": 0.0694580078125, "loss_aux_layer_20": 0.162353515625, "loss_aux_layer_21": 0.16943359375, "loss_aux_layer_22": 0.19140625, "loss_aux_layer_23": 0.23193359375, "loss_aux_layer_3": 0.0831298828125, "loss_aux_layer_4": 0.0859375, "loss_aux_layer_5": 0.087646484375, "loss_aux_layer_6": 0.0904541015625, "loss_aux_layer_7": 0.0863037109375, "loss_aux_layer_8": 0.0850830078125, "loss_aux_layer_9": 0.083740234375, "step": 1380, "total_loss": 0.795348197221756 }, { "epoch": 0.27341120570184124, "grad_norm": 1.4750972986221313, "learning_rate": 5e-05, "llm_loss": 0.5989890396595001, "loss": 2.8561, "loss_aux_layer_0": 0.02587890625, "loss_aux_layer_1": 0.0592041015625, "loss_aux_layer_10": 0.0841064453125, "loss_aux_layer_11": 0.0897216796875, "loss_aux_layer_12": 0.0968017578125, "loss_aux_layer_13": 0.1043701171875, "loss_aux_layer_14": 0.1165771484375, "loss_aux_layer_15": 0.12841796875, "loss_aux_layer_16": 0.14013671875, "loss_aux_layer_17": 0.1484375, "loss_aux_layer_18": 0.158203125, "loss_aux_layer_19": 0.161376953125, "loss_aux_layer_2": 0.069091796875, "loss_aux_layer_20": 0.168212890625, "loss_aux_layer_21": 0.176513671875, "loss_aux_layer_22": 0.199951171875, "loss_aux_layer_23": 0.242919921875, "loss_aux_layer_3": 0.0811767578125, "loss_aux_layer_4": 0.083740234375, "loss_aux_layer_5": 0.08544921875, "loss_aux_layer_6": 0.0887451171875, "loss_aux_layer_7": 0.0853271484375, "loss_aux_layer_8": 0.0841064453125, "loss_aux_layer_9": 0.08251953125, "step": 1381, "total_loss": 0.7140311002731323 }, { "epoch": 0.2736091862997426, "grad_norm": 1.4608911275863647, "learning_rate": 5e-05, "llm_loss": 0.589062437415123, "loss": 2.8082, "loss_aux_layer_0": 0.024322509765625, "loss_aux_layer_1": 0.05908203125, "loss_aux_layer_10": 0.0853271484375, "loss_aux_layer_11": 0.090576171875, "loss_aux_layer_12": 0.09765625, "loss_aux_layer_13": 0.10498046875, "loss_aux_layer_14": 0.1160888671875, "loss_aux_layer_15": 0.12646484375, "loss_aux_layer_16": 0.137451171875, "loss_aux_layer_17": 0.144775390625, "loss_aux_layer_18": 0.15380859375, "loss_aux_layer_19": 0.15576171875, "loss_aux_layer_2": 0.0682373046875, "loss_aux_layer_20": 0.162109375, "loss_aux_layer_21": 0.168701171875, "loss_aux_layer_22": 0.18994140625, "loss_aux_layer_23": 0.23095703125, "loss_aux_layer_3": 0.0809326171875, "loss_aux_layer_4": 0.084228515625, "loss_aux_layer_5": 0.0863037109375, "loss_aux_layer_6": 0.0889892578125, "loss_aux_layer_7": 0.0859375, "loss_aux_layer_8": 0.084716796875, "loss_aux_layer_9": 0.0836181640625, "step": 1382, "total_loss": 0.7020428329706192 }, { "epoch": 0.273807166897644, "grad_norm": 1.3921641111373901, "learning_rate": 5e-05, "llm_loss": 0.6330687329173088, "loss": 2.9745, "loss_aux_layer_0": 0.025115966796875, "loss_aux_layer_1": 0.0565185546875, "loss_aux_layer_10": 0.0814208984375, "loss_aux_layer_11": 0.0869140625, "loss_aux_layer_12": 0.09326171875, "loss_aux_layer_13": 0.101318359375, "loss_aux_layer_14": 0.112548828125, "loss_aux_layer_15": 0.123291015625, "loss_aux_layer_16": 0.13427734375, "loss_aux_layer_17": 0.14306640625, "loss_aux_layer_18": 0.152099609375, "loss_aux_layer_19": 0.154296875, "loss_aux_layer_2": 0.0672607421875, "loss_aux_layer_20": 0.1611328125, "loss_aux_layer_21": 0.168212890625, "loss_aux_layer_22": 0.189697265625, "loss_aux_layer_23": 0.230224609375, "loss_aux_layer_3": 0.0787353515625, "loss_aux_layer_4": 0.0816650390625, "loss_aux_layer_5": 0.0831298828125, "loss_aux_layer_6": 0.08544921875, "loss_aux_layer_7": 0.082275390625, "loss_aux_layer_8": 0.0814208984375, "loss_aux_layer_9": 0.080078125, "step": 1383, "total_loss": 0.7436358332633972 }, { "epoch": 0.27400514749554544, "grad_norm": 1.3948520421981812, "learning_rate": 5e-05, "llm_loss": 0.6208263337612152, "loss": 2.918, "loss_aux_layer_0": 0.0240478515625, "loss_aux_layer_1": 0.05535888671875, "loss_aux_layer_10": 0.080810546875, "loss_aux_layer_11": 0.0858154296875, "loss_aux_layer_12": 0.0921630859375, "loss_aux_layer_13": 0.099609375, "loss_aux_layer_14": 0.110595703125, "loss_aux_layer_15": 0.1209716796875, "loss_aux_layer_16": 0.131591796875, "loss_aux_layer_17": 0.139892578125, "loss_aux_layer_18": 0.14794921875, "loss_aux_layer_19": 0.150634765625, "loss_aux_layer_2": 0.0657958984375, "loss_aux_layer_20": 0.157958984375, "loss_aux_layer_21": 0.16552734375, "loss_aux_layer_22": 0.187255859375, "loss_aux_layer_23": 0.228515625, "loss_aux_layer_3": 0.0771484375, "loss_aux_layer_4": 0.079833984375, "loss_aux_layer_5": 0.08154296875, "loss_aux_layer_6": 0.084228515625, "loss_aux_layer_7": 0.0811767578125, "loss_aux_layer_8": 0.080322265625, "loss_aux_layer_9": 0.0791015625, "step": 1384, "total_loss": 0.7294891029596329 }, { "epoch": 0.2742031280934468, "grad_norm": 1.5017740726470947, "learning_rate": 5e-05, "llm_loss": 0.7442249357700348, "loss": 3.4156, "loss_aux_layer_0": 0.023040771484375, "loss_aux_layer_1": 0.05596923828125, "loss_aux_layer_10": 0.0814208984375, "loss_aux_layer_11": 0.086669921875, "loss_aux_layer_12": 0.093994140625, "loss_aux_layer_13": 0.102294921875, "loss_aux_layer_14": 0.11376953125, "loss_aux_layer_15": 0.1241455078125, "loss_aux_layer_16": 0.134521484375, "loss_aux_layer_17": 0.142822265625, "loss_aux_layer_18": 0.1513671875, "loss_aux_layer_19": 0.15283203125, "loss_aux_layer_2": 0.0654296875, "loss_aux_layer_20": 0.16064453125, "loss_aux_layer_21": 0.165771484375, "loss_aux_layer_22": 0.185791015625, "loss_aux_layer_23": 0.225341796875, "loss_aux_layer_3": 0.0767822265625, "loss_aux_layer_4": 0.0802001953125, "loss_aux_layer_5": 0.08203125, "loss_aux_layer_6": 0.0845947265625, "loss_aux_layer_7": 0.0816650390625, "loss_aux_layer_8": 0.0806884765625, "loss_aux_layer_9": 0.0797119140625, "step": 1385, "total_loss": 0.8538893461227417 }, { "epoch": 0.27440110869134826, "grad_norm": 1.2756588459014893, "learning_rate": 5e-05, "llm_loss": 0.5601929575204849, "loss": 2.6943, "loss_aux_layer_0": 0.025360107421875, "loss_aux_layer_1": 0.05731201171875, "loss_aux_layer_10": 0.08447265625, "loss_aux_layer_11": 0.0897216796875, "loss_aux_layer_12": 0.0968017578125, "loss_aux_layer_13": 0.104248046875, "loss_aux_layer_14": 0.1162109375, "loss_aux_layer_15": 0.1273193359375, "loss_aux_layer_16": 0.138427734375, "loss_aux_layer_17": 0.146728515625, "loss_aux_layer_18": 0.156005859375, "loss_aux_layer_19": 0.157958984375, "loss_aux_layer_2": 0.0687255859375, "loss_aux_layer_20": 0.164794921875, "loss_aux_layer_21": 0.17138671875, "loss_aux_layer_22": 0.19287109375, "loss_aux_layer_23": 0.23388671875, "loss_aux_layer_3": 0.0804443359375, "loss_aux_layer_4": 0.0833740234375, "loss_aux_layer_5": 0.0849609375, "loss_aux_layer_6": 0.088134765625, "loss_aux_layer_7": 0.0849609375, "loss_aux_layer_8": 0.083984375, "loss_aux_layer_9": 0.0830078125, "step": 1386, "total_loss": 0.6735784858465195 }, { "epoch": 0.27459908928924964, "grad_norm": 1.5467357635498047, "learning_rate": 5e-05, "llm_loss": 0.6894551068544388, "loss": 3.2107, "loss_aux_layer_0": 0.02508544921875, "loss_aux_layer_1": 0.05902099609375, "loss_aux_layer_10": 0.0850830078125, "loss_aux_layer_11": 0.090576171875, "loss_aux_layer_12": 0.0977783203125, "loss_aux_layer_13": 0.105712890625, "loss_aux_layer_14": 0.1170654296875, "loss_aux_layer_15": 0.128173828125, "loss_aux_layer_16": 0.138916015625, "loss_aux_layer_17": 0.146484375, "loss_aux_layer_18": 0.155029296875, "loss_aux_layer_19": 0.15673828125, "loss_aux_layer_2": 0.0677490234375, "loss_aux_layer_20": 0.163330078125, "loss_aux_layer_21": 0.17041015625, "loss_aux_layer_22": 0.1904296875, "loss_aux_layer_23": 0.230224609375, "loss_aux_layer_3": 0.0794677734375, "loss_aux_layer_4": 0.0831298828125, "loss_aux_layer_5": 0.08544921875, "loss_aux_layer_6": 0.088623046875, "loss_aux_layer_7": 0.0849609375, "loss_aux_layer_8": 0.0841064453125, "loss_aux_layer_9": 0.0830078125, "step": 1387, "total_loss": 0.8026641756296158 }, { "epoch": 0.2747970698871511, "grad_norm": 1.517199993133545, "learning_rate": 5e-05, "llm_loss": 0.6125182807445526, "loss": 2.9051, "loss_aux_layer_0": 0.025238037109375, "loss_aux_layer_1": 0.06024169921875, "loss_aux_layer_10": 0.08544921875, "loss_aux_layer_11": 0.0909423828125, "loss_aux_layer_12": 0.0975341796875, "loss_aux_layer_13": 0.10498046875, "loss_aux_layer_14": 0.116455078125, "loss_aux_layer_15": 0.127685546875, "loss_aux_layer_16": 0.138671875, "loss_aux_layer_17": 0.14697265625, "loss_aux_layer_18": 0.155517578125, "loss_aux_layer_19": 0.156982421875, "loss_aux_layer_2": 0.0694580078125, "loss_aux_layer_20": 0.163330078125, "loss_aux_layer_21": 0.169921875, "loss_aux_layer_22": 0.19140625, "loss_aux_layer_23": 0.232177734375, "loss_aux_layer_3": 0.08154296875, "loss_aux_layer_4": 0.084716796875, "loss_aux_layer_5": 0.086669921875, "loss_aux_layer_6": 0.08935546875, "loss_aux_layer_7": 0.0859375, "loss_aux_layer_8": 0.0850830078125, "loss_aux_layer_9": 0.083984375, "step": 1388, "total_loss": 0.7262728959321976 }, { "epoch": 0.27499505048505246, "grad_norm": 1.3001865148544312, "learning_rate": 5e-05, "llm_loss": 0.5339260697364807, "loss": 2.5628, "loss_aux_layer_0": 0.023651123046875, "loss_aux_layer_1": 0.05419921875, "loss_aux_layer_10": 0.079833984375, "loss_aux_layer_11": 0.08447265625, "loss_aux_layer_12": 0.0906982421875, "loss_aux_layer_13": 0.0977783203125, "loss_aux_layer_14": 0.108154296875, "loss_aux_layer_15": 0.1177978515625, "loss_aux_layer_16": 0.1290283203125, "loss_aux_layer_17": 0.137451171875, "loss_aux_layer_18": 0.146728515625, "loss_aux_layer_19": 0.14990234375, "loss_aux_layer_2": 0.0634765625, "loss_aux_layer_20": 0.156494140625, "loss_aux_layer_21": 0.163330078125, "loss_aux_layer_22": 0.18408203125, "loss_aux_layer_23": 0.223388671875, "loss_aux_layer_3": 0.074951171875, "loss_aux_layer_4": 0.078125, "loss_aux_layer_5": 0.0799560546875, "loss_aux_layer_6": 0.0826416015625, "loss_aux_layer_7": 0.0797119140625, "loss_aux_layer_8": 0.0787353515625, "loss_aux_layer_9": 0.077880859375, "step": 1389, "total_loss": 0.640712171792984 }, { "epoch": 0.27519303108295384, "grad_norm": 1.2411091327667236, "learning_rate": 5e-05, "llm_loss": 0.6132033616304398, "loss": 2.8929, "loss_aux_layer_0": 0.025177001953125, "loss_aux_layer_1": 0.0562744140625, "loss_aux_layer_10": 0.0814208984375, "loss_aux_layer_11": 0.086669921875, "loss_aux_layer_12": 0.0933837890625, "loss_aux_layer_13": 0.1009521484375, "loss_aux_layer_14": 0.11181640625, "loss_aux_layer_15": 0.1226806640625, "loss_aux_layer_16": 0.13427734375, "loss_aux_layer_17": 0.142333984375, "loss_aux_layer_18": 0.15087890625, "loss_aux_layer_19": 0.1533203125, "loss_aux_layer_2": 0.06475830078125, "loss_aux_layer_20": 0.16064453125, "loss_aux_layer_21": 0.167724609375, "loss_aux_layer_22": 0.19091796875, "loss_aux_layer_23": 0.231689453125, "loss_aux_layer_3": 0.07666015625, "loss_aux_layer_4": 0.0799560546875, "loss_aux_layer_5": 0.0821533203125, "loss_aux_layer_6": 0.085205078125, "loss_aux_layer_7": 0.08154296875, "loss_aux_layer_8": 0.080810546875, "loss_aux_layer_9": 0.079833984375, "step": 1390, "total_loss": 0.7232286632061005 }, { "epoch": 0.2753910116808553, "grad_norm": 1.4097572565078735, "learning_rate": 5e-05, "llm_loss": 0.574313223361969, "loss": 2.7476, "loss_aux_layer_0": 0.023406982421875, "loss_aux_layer_1": 0.05633544921875, "loss_aux_layer_10": 0.08544921875, "loss_aux_layer_11": 0.0909423828125, "loss_aux_layer_12": 0.09765625, "loss_aux_layer_13": 0.10498046875, "loss_aux_layer_14": 0.1160888671875, "loss_aux_layer_15": 0.126708984375, "loss_aux_layer_16": 0.13720703125, "loss_aux_layer_17": 0.14453125, "loss_aux_layer_18": 0.153076171875, "loss_aux_layer_19": 0.15478515625, "loss_aux_layer_2": 0.0684814453125, "loss_aux_layer_20": 0.160888671875, "loss_aux_layer_21": 0.167724609375, "loss_aux_layer_22": 0.189453125, "loss_aux_layer_23": 0.229736328125, "loss_aux_layer_3": 0.0804443359375, "loss_aux_layer_4": 0.084228515625, "loss_aux_layer_5": 0.0863037109375, "loss_aux_layer_6": 0.0889892578125, "loss_aux_layer_7": 0.0858154296875, "loss_aux_layer_8": 0.0849609375, "loss_aux_layer_9": 0.083740234375, "step": 1391, "total_loss": 0.686892107129097 }, { "epoch": 0.27558899227875666, "grad_norm": 1.6060352325439453, "learning_rate": 5e-05, "llm_loss": 0.5686093121767044, "loss": 2.722, "loss_aux_layer_0": 0.029296875, "loss_aux_layer_1": 0.0595703125, "loss_aux_layer_10": 0.08349609375, "loss_aux_layer_11": 0.088623046875, "loss_aux_layer_12": 0.095458984375, "loss_aux_layer_13": 0.1026611328125, "loss_aux_layer_14": 0.1143798828125, "loss_aux_layer_15": 0.124755859375, "loss_aux_layer_16": 0.13525390625, "loss_aux_layer_17": 0.142333984375, "loss_aux_layer_18": 0.151123046875, "loss_aux_layer_19": 0.154296875, "loss_aux_layer_2": 0.06695556640625, "loss_aux_layer_20": 0.161376953125, "loss_aux_layer_21": 0.16943359375, "loss_aux_layer_22": 0.192138671875, "loss_aux_layer_23": 0.233642578125, "loss_aux_layer_3": 0.0787353515625, "loss_aux_layer_4": 0.08203125, "loss_aux_layer_5": 0.0838623046875, "loss_aux_layer_6": 0.0869140625, "loss_aux_layer_7": 0.083740234375, "loss_aux_layer_8": 0.0831298828125, "loss_aux_layer_9": 0.0819091796875, "step": 1392, "total_loss": 0.6804994493722916 }, { "epoch": 0.2757869728766581, "grad_norm": 2.034728527069092, "learning_rate": 5e-05, "llm_loss": 0.5951050817966461, "loss": 2.8096, "loss_aux_layer_0": 0.0289306640625, "loss_aux_layer_1": 0.05615234375, "loss_aux_layer_10": 0.078125, "loss_aux_layer_11": 0.0828857421875, "loss_aux_layer_12": 0.0894775390625, "loss_aux_layer_13": 0.0966796875, "loss_aux_layer_14": 0.108154296875, "loss_aux_layer_15": 0.11865234375, "loss_aux_layer_16": 0.1304931640625, "loss_aux_layer_17": 0.137939453125, "loss_aux_layer_18": 0.146728515625, "loss_aux_layer_19": 0.150146484375, "loss_aux_layer_2": 0.065673828125, "loss_aux_layer_20": 0.1572265625, "loss_aux_layer_21": 0.1640625, "loss_aux_layer_22": 0.185546875, "loss_aux_layer_23": 0.22705078125, "loss_aux_layer_3": 0.075927734375, "loss_aux_layer_4": 0.0787353515625, "loss_aux_layer_5": 0.0804443359375, "loss_aux_layer_6": 0.0828857421875, "loss_aux_layer_7": 0.0791015625, "loss_aux_layer_8": 0.07763671875, "loss_aux_layer_9": 0.076416015625, "step": 1393, "total_loss": 0.7023979127407074 }, { "epoch": 0.2759849534745595, "grad_norm": 1.4185377359390259, "learning_rate": 5e-05, "llm_loss": 0.5844494849443436, "loss": 2.7997, "loss_aux_layer_0": 0.023468017578125, "loss_aux_layer_1": 0.05963134765625, "loss_aux_layer_10": 0.0894775390625, "loss_aux_layer_11": 0.0948486328125, "loss_aux_layer_12": 0.100830078125, "loss_aux_layer_13": 0.1080322265625, "loss_aux_layer_14": 0.1182861328125, "loss_aux_layer_15": 0.1282958984375, "loss_aux_layer_16": 0.138671875, "loss_aux_layer_17": 0.145263671875, "loss_aux_layer_18": 0.15380859375, "loss_aux_layer_19": 0.155517578125, "loss_aux_layer_2": 0.0721435546875, "loss_aux_layer_20": 0.162353515625, "loss_aux_layer_21": 0.169677734375, "loss_aux_layer_22": 0.192626953125, "loss_aux_layer_23": 0.23291015625, "loss_aux_layer_3": 0.0849609375, "loss_aux_layer_4": 0.0882568359375, "loss_aux_layer_5": 0.090087890625, "loss_aux_layer_6": 0.0936279296875, "loss_aux_layer_7": 0.0906982421875, "loss_aux_layer_8": 0.089599609375, "loss_aux_layer_9": 0.088134765625, "step": 1394, "total_loss": 0.699935644865036 }, { "epoch": 0.2761829340724609, "grad_norm": 2.163367986679077, "learning_rate": 5e-05, "llm_loss": 0.615996852517128, "loss": 2.9033, "loss_aux_layer_0": 0.024871826171875, "loss_aux_layer_1": 0.05548095703125, "loss_aux_layer_10": 0.08251953125, "loss_aux_layer_11": 0.0875244140625, "loss_aux_layer_12": 0.09326171875, "loss_aux_layer_13": 0.10009765625, "loss_aux_layer_14": 0.11083984375, "loss_aux_layer_15": 0.1214599609375, "loss_aux_layer_16": 0.132080078125, "loss_aux_layer_17": 0.139892578125, "loss_aux_layer_18": 0.1484375, "loss_aux_layer_19": 0.150390625, "loss_aux_layer_2": 0.068359375, "loss_aux_layer_20": 0.157958984375, "loss_aux_layer_21": 0.16455078125, "loss_aux_layer_22": 0.1875, "loss_aux_layer_23": 0.22802734375, "loss_aux_layer_3": 0.080078125, "loss_aux_layer_4": 0.0830078125, "loss_aux_layer_5": 0.0843505859375, "loss_aux_layer_6": 0.0870361328125, "loss_aux_layer_7": 0.083984375, "loss_aux_layer_8": 0.0826416015625, "loss_aux_layer_9": 0.0811767578125, "step": 1395, "total_loss": 0.7258258312940598 }, { "epoch": 0.2763809146703623, "grad_norm": 1.4921927452087402, "learning_rate": 5e-05, "llm_loss": 0.7836344838142395, "loss": 3.5849, "loss_aux_layer_0": 0.023468017578125, "loss_aux_layer_1": 0.05780029296875, "loss_aux_layer_10": 0.085205078125, "loss_aux_layer_11": 0.0908203125, "loss_aux_layer_12": 0.09765625, "loss_aux_layer_13": 0.1048583984375, "loss_aux_layer_14": 0.1151123046875, "loss_aux_layer_15": 0.12548828125, "loss_aux_layer_16": 0.135498046875, "loss_aux_layer_17": 0.1435546875, "loss_aux_layer_18": 0.152099609375, "loss_aux_layer_19": 0.15380859375, "loss_aux_layer_2": 0.06884765625, "loss_aux_layer_20": 0.160888671875, "loss_aux_layer_21": 0.16845703125, "loss_aux_layer_22": 0.192138671875, "loss_aux_layer_23": 0.232666015625, "loss_aux_layer_3": 0.080810546875, "loss_aux_layer_4": 0.084228515625, "loss_aux_layer_5": 0.0860595703125, "loss_aux_layer_6": 0.0889892578125, "loss_aux_layer_7": 0.085693359375, "loss_aux_layer_8": 0.084716796875, "loss_aux_layer_9": 0.0836181640625, "step": 1396, "total_loss": 0.896228164434433 }, { "epoch": 0.27657889526826374, "grad_norm": 1.3867452144622803, "learning_rate": 5e-05, "llm_loss": 0.638010635972023, "loss": 2.9873, "loss_aux_layer_0": 0.0235595703125, "loss_aux_layer_1": 0.05255126953125, "loss_aux_layer_10": 0.08056640625, "loss_aux_layer_11": 0.08544921875, "loss_aux_layer_12": 0.0919189453125, "loss_aux_layer_13": 0.099853515625, "loss_aux_layer_14": 0.111328125, "loss_aux_layer_15": 0.1226806640625, "loss_aux_layer_16": 0.13427734375, "loss_aux_layer_17": 0.142578125, "loss_aux_layer_18": 0.1513671875, "loss_aux_layer_19": 0.1533203125, "loss_aux_layer_2": 0.064697265625, "loss_aux_layer_20": 0.16015625, "loss_aux_layer_21": 0.166259765625, "loss_aux_layer_22": 0.1865234375, "loss_aux_layer_23": 0.226318359375, "loss_aux_layer_3": 0.075927734375, "loss_aux_layer_4": 0.0789794921875, "loss_aux_layer_5": 0.0809326171875, "loss_aux_layer_6": 0.08349609375, "loss_aux_layer_7": 0.08056640625, "loss_aux_layer_8": 0.0797119140625, "loss_aux_layer_9": 0.078857421875, "step": 1397, "total_loss": 0.7468164265155792 }, { "epoch": 0.2767768758661651, "grad_norm": 1.3009048700332642, "learning_rate": 5e-05, "llm_loss": 0.6115424558520317, "loss": 2.8931, "loss_aux_layer_0": 0.02618408203125, "loss_aux_layer_1": 0.058349609375, "loss_aux_layer_10": 0.0838623046875, "loss_aux_layer_11": 0.08935546875, "loss_aux_layer_12": 0.095703125, "loss_aux_layer_13": 0.103271484375, "loss_aux_layer_14": 0.1138916015625, "loss_aux_layer_15": 0.12451171875, "loss_aux_layer_16": 0.13525390625, "loss_aux_layer_17": 0.14306640625, "loss_aux_layer_18": 0.15234375, "loss_aux_layer_19": 0.154052734375, "loss_aux_layer_2": 0.0693359375, "loss_aux_layer_20": 0.160400390625, "loss_aux_layer_21": 0.165771484375, "loss_aux_layer_22": 0.187744140625, "loss_aux_layer_23": 0.228271484375, "loss_aux_layer_3": 0.0809326171875, "loss_aux_layer_4": 0.083740234375, "loss_aux_layer_5": 0.08544921875, "loss_aux_layer_6": 0.08837890625, "loss_aux_layer_7": 0.0849609375, "loss_aux_layer_8": 0.083984375, "loss_aux_layer_9": 0.08251953125, "step": 1398, "total_loss": 0.7232660353183746 }, { "epoch": 0.2769748564640665, "grad_norm": 1.3044977188110352, "learning_rate": 5e-05, "llm_loss": 0.6717548072338104, "loss": 3.1246, "loss_aux_layer_0": 0.023101806640625, "loss_aux_layer_1": 0.05364990234375, "loss_aux_layer_10": 0.08154296875, "loss_aux_layer_11": 0.0865478515625, "loss_aux_layer_12": 0.093017578125, "loss_aux_layer_13": 0.1007080078125, "loss_aux_layer_14": 0.112548828125, "loss_aux_layer_15": 0.12353515625, "loss_aux_layer_16": 0.134765625, "loss_aux_layer_17": 0.142333984375, "loss_aux_layer_18": 0.150390625, "loss_aux_layer_19": 0.15283203125, "loss_aux_layer_2": 0.0645751953125, "loss_aux_layer_20": 0.15966796875, "loss_aux_layer_21": 0.16650390625, "loss_aux_layer_22": 0.186767578125, "loss_aux_layer_23": 0.227294921875, "loss_aux_layer_3": 0.07666015625, "loss_aux_layer_4": 0.0799560546875, "loss_aux_layer_5": 0.081787109375, "loss_aux_layer_6": 0.0849609375, "loss_aux_layer_7": 0.08154296875, "loss_aux_layer_8": 0.0806884765625, "loss_aux_layer_9": 0.0799560546875, "step": 1399, "total_loss": 0.7811417579650879 }, { "epoch": 0.27717283706196794, "grad_norm": 1.0667521953582764, "learning_rate": 5e-05, "llm_loss": 0.6737585812807083, "loss": 3.1282, "loss_aux_layer_0": 0.022369384765625, "loss_aux_layer_1": 0.05645751953125, "loss_aux_layer_10": 0.08203125, "loss_aux_layer_11": 0.0875244140625, "loss_aux_layer_12": 0.0938720703125, "loss_aux_layer_13": 0.1011962890625, "loss_aux_layer_14": 0.111328125, "loss_aux_layer_15": 0.1212158203125, "loss_aux_layer_16": 0.131591796875, "loss_aux_layer_17": 0.1396484375, "loss_aux_layer_18": 0.147216796875, "loss_aux_layer_19": 0.148193359375, "loss_aux_layer_2": 0.0667724609375, "loss_aux_layer_20": 0.154541015625, "loss_aux_layer_21": 0.160400390625, "loss_aux_layer_22": 0.180419921875, "loss_aux_layer_23": 0.218017578125, "loss_aux_layer_3": 0.0784912109375, "loss_aux_layer_4": 0.0819091796875, "loss_aux_layer_5": 0.083251953125, "loss_aux_layer_6": 0.08642578125, "loss_aux_layer_7": 0.0833740234375, "loss_aux_layer_8": 0.08203125, "loss_aux_layer_9": 0.0810546875, "step": 1400, "total_loss": 0.7820533812046051 }, { "epoch": 0.2773708176598693, "grad_norm": 1.4206351041793823, "learning_rate": 5e-05, "llm_loss": 0.6954103261232376, "loss": 3.2375, "loss_aux_layer_0": 0.024566650390625, "loss_aux_layer_1": 0.05792236328125, "loss_aux_layer_10": 0.0869140625, "loss_aux_layer_11": 0.0924072265625, "loss_aux_layer_12": 0.09912109375, "loss_aux_layer_13": 0.1064453125, "loss_aux_layer_14": 0.1170654296875, "loss_aux_layer_15": 0.127197265625, "loss_aux_layer_16": 0.138427734375, "loss_aux_layer_17": 0.145751953125, "loss_aux_layer_18": 0.154052734375, "loss_aux_layer_19": 0.15625, "loss_aux_layer_2": 0.0709228515625, "loss_aux_layer_20": 0.162109375, "loss_aux_layer_21": 0.168212890625, "loss_aux_layer_22": 0.188720703125, "loss_aux_layer_23": 0.228759765625, "loss_aux_layer_3": 0.08349609375, "loss_aux_layer_4": 0.0869140625, "loss_aux_layer_5": 0.0887451171875, "loss_aux_layer_6": 0.0914306640625, "loss_aux_layer_7": 0.087890625, "loss_aux_layer_8": 0.0867919921875, "loss_aux_layer_9": 0.08544921875, "step": 1401, "total_loss": 0.8093691617250443 }, { "epoch": 0.27756879825777075, "grad_norm": 1.129948377609253, "learning_rate": 5e-05, "llm_loss": 0.6810193359851837, "loss": 3.1798, "loss_aux_layer_0": 0.0255126953125, "loss_aux_layer_1": 0.05865478515625, "loss_aux_layer_10": 0.0865478515625, "loss_aux_layer_11": 0.0919189453125, "loss_aux_layer_12": 0.098876953125, "loss_aux_layer_13": 0.106201171875, "loss_aux_layer_14": 0.1175537109375, "loss_aux_layer_15": 0.1275634765625, "loss_aux_layer_16": 0.13818359375, "loss_aux_layer_17": 0.1455078125, "loss_aux_layer_18": 0.15478515625, "loss_aux_layer_19": 0.156005859375, "loss_aux_layer_2": 0.06982421875, "loss_aux_layer_20": 0.161865234375, "loss_aux_layer_21": 0.168701171875, "loss_aux_layer_22": 0.1904296875, "loss_aux_layer_23": 0.2314453125, "loss_aux_layer_3": 0.0823974609375, "loss_aux_layer_4": 0.0858154296875, "loss_aux_layer_5": 0.0877685546875, "loss_aux_layer_6": 0.090576171875, "loss_aux_layer_7": 0.0872802734375, "loss_aux_layer_8": 0.08642578125, "loss_aux_layer_9": 0.0850830078125, "step": 1402, "total_loss": 0.794937938451767 }, { "epoch": 0.27776677885567214, "grad_norm": 1.29505455493927, "learning_rate": 5e-05, "llm_loss": 0.6953589022159576, "loss": 3.2375, "loss_aux_layer_0": 0.025238037109375, "loss_aux_layer_1": 0.05908203125, "loss_aux_layer_10": 0.086181640625, "loss_aux_layer_11": 0.0919189453125, "loss_aux_layer_12": 0.0985107421875, "loss_aux_layer_13": 0.1060791015625, "loss_aux_layer_14": 0.1171875, "loss_aux_layer_15": 0.1278076171875, "loss_aux_layer_16": 0.138916015625, "loss_aux_layer_17": 0.14599609375, "loss_aux_layer_18": 0.155029296875, "loss_aux_layer_19": 0.15576171875, "loss_aux_layer_2": 0.07080078125, "loss_aux_layer_20": 0.162353515625, "loss_aux_layer_21": 0.1689453125, "loss_aux_layer_22": 0.191162109375, "loss_aux_layer_23": 0.231201171875, "loss_aux_layer_3": 0.0826416015625, "loss_aux_layer_4": 0.0859375, "loss_aux_layer_5": 0.08740234375, "loss_aux_layer_6": 0.090576171875, "loss_aux_layer_7": 0.087158203125, "loss_aux_layer_8": 0.0859375, "loss_aux_layer_9": 0.0843505859375, "step": 1403, "total_loss": 0.8093636631965637 }, { "epoch": 0.2779647594535736, "grad_norm": 1.2670085430145264, "learning_rate": 5e-05, "llm_loss": 0.5738522261381149, "loss": 2.7374, "loss_aux_layer_0": 0.02386474609375, "loss_aux_layer_1": 0.05596923828125, "loss_aux_layer_10": 0.082763671875, "loss_aux_layer_11": 0.088134765625, "loss_aux_layer_12": 0.0948486328125, "loss_aux_layer_13": 0.102294921875, "loss_aux_layer_14": 0.113037109375, "loss_aux_layer_15": 0.123779296875, "loss_aux_layer_16": 0.134033203125, "loss_aux_layer_17": 0.141845703125, "loss_aux_layer_18": 0.1513671875, "loss_aux_layer_19": 0.15380859375, "loss_aux_layer_2": 0.06646728515625, "loss_aux_layer_20": 0.160888671875, "loss_aux_layer_21": 0.16748046875, "loss_aux_layer_22": 0.188232421875, "loss_aux_layer_23": 0.227294921875, "loss_aux_layer_3": 0.0784912109375, "loss_aux_layer_4": 0.0814208984375, "loss_aux_layer_5": 0.083251953125, "loss_aux_layer_6": 0.086181640625, "loss_aux_layer_7": 0.0826416015625, "loss_aux_layer_8": 0.08203125, "loss_aux_layer_9": 0.081298828125, "step": 1404, "total_loss": 0.684342235326767 }, { "epoch": 0.27816274005147495, "grad_norm": 1.1158498525619507, "learning_rate": 5e-05, "llm_loss": 0.7002184242010117, "loss": 3.2378, "loss_aux_layer_0": 0.022705078125, "loss_aux_layer_1": 0.054931640625, "loss_aux_layer_10": 0.0828857421875, "loss_aux_layer_11": 0.0882568359375, "loss_aux_layer_12": 0.09423828125, "loss_aux_layer_13": 0.1009521484375, "loss_aux_layer_14": 0.1112060546875, "loss_aux_layer_15": 0.121337890625, "loss_aux_layer_16": 0.131591796875, "loss_aux_layer_17": 0.138671875, "loss_aux_layer_18": 0.146728515625, "loss_aux_layer_19": 0.149658203125, "loss_aux_layer_2": 0.06658935546875, "loss_aux_layer_20": 0.157470703125, "loss_aux_layer_21": 0.1640625, "loss_aux_layer_22": 0.185791015625, "loss_aux_layer_23": 0.225830078125, "loss_aux_layer_3": 0.0791015625, "loss_aux_layer_4": 0.0821533203125, "loss_aux_layer_5": 0.0836181640625, "loss_aux_layer_6": 0.086669921875, "loss_aux_layer_7": 0.0833740234375, "loss_aux_layer_8": 0.082275390625, "loss_aux_layer_9": 0.081298828125, "step": 1405, "total_loss": 0.8094435185194016 }, { "epoch": 0.27836072064937634, "grad_norm": 1.39328932762146, "learning_rate": 5e-05, "llm_loss": 0.6327411979436874, "loss": 2.9607, "loss_aux_layer_0": 0.025482177734375, "loss_aux_layer_1": 0.0535888671875, "loss_aux_layer_10": 0.07763671875, "loss_aux_layer_11": 0.0826416015625, "loss_aux_layer_12": 0.089111328125, "loss_aux_layer_13": 0.0966796875, "loss_aux_layer_14": 0.108642578125, "loss_aux_layer_15": 0.119873046875, "loss_aux_layer_16": 0.130859375, "loss_aux_layer_17": 0.14013671875, "loss_aux_layer_18": 0.149169921875, "loss_aux_layer_19": 0.153076171875, "loss_aux_layer_2": 0.0623779296875, "loss_aux_layer_20": 0.16064453125, "loss_aux_layer_21": 0.1669921875, "loss_aux_layer_22": 0.189697265625, "loss_aux_layer_23": 0.2314453125, "loss_aux_layer_3": 0.073974609375, "loss_aux_layer_4": 0.0762939453125, "loss_aux_layer_5": 0.0780029296875, "loss_aux_layer_6": 0.0809326171875, "loss_aux_layer_7": 0.0777587890625, "loss_aux_layer_8": 0.0771484375, "loss_aux_layer_9": 0.075927734375, "step": 1406, "total_loss": 0.7401816993951797 }, { "epoch": 0.2785587012472778, "grad_norm": 1.720499038696289, "learning_rate": 5e-05, "llm_loss": 0.5107621550559998, "loss": 2.4769, "loss_aux_layer_0": 0.02288818359375, "loss_aux_layer_1": 0.05364990234375, "loss_aux_layer_10": 0.0806884765625, "loss_aux_layer_11": 0.085693359375, "loss_aux_layer_12": 0.091796875, "loss_aux_layer_13": 0.098876953125, "loss_aux_layer_14": 0.1097412109375, "loss_aux_layer_15": 0.1204833984375, "loss_aux_layer_16": 0.13134765625, "loss_aux_layer_17": 0.1396484375, "loss_aux_layer_18": 0.148681640625, "loss_aux_layer_19": 0.15185546875, "loss_aux_layer_2": 0.0648193359375, "loss_aux_layer_20": 0.15966796875, "loss_aux_layer_21": 0.165771484375, "loss_aux_layer_22": 0.186279296875, "loss_aux_layer_23": 0.225830078125, "loss_aux_layer_3": 0.07666015625, "loss_aux_layer_4": 0.080078125, "loss_aux_layer_5": 0.08203125, "loss_aux_layer_6": 0.085205078125, "loss_aux_layer_7": 0.0814208984375, "loss_aux_layer_8": 0.08056640625, "loss_aux_layer_9": 0.0794677734375, "step": 1407, "total_loss": 0.6192221194505692 }, { "epoch": 0.27875668184517916, "grad_norm": 0.8997940421104431, "learning_rate": 5e-05, "llm_loss": 0.5593018978834152, "loss": 2.6868, "loss_aux_layer_0": 0.02325439453125, "loss_aux_layer_1": 0.055908203125, "loss_aux_layer_10": 0.0845947265625, "loss_aux_layer_11": 0.0899658203125, "loss_aux_layer_12": 0.0970458984375, "loss_aux_layer_13": 0.1046142578125, "loss_aux_layer_14": 0.11572265625, "loss_aux_layer_15": 0.1260986328125, "loss_aux_layer_16": 0.13671875, "loss_aux_layer_17": 0.14453125, "loss_aux_layer_18": 0.153076171875, "loss_aux_layer_19": 0.155517578125, "loss_aux_layer_2": 0.06646728515625, "loss_aux_layer_20": 0.16259765625, "loss_aux_layer_21": 0.170166015625, "loss_aux_layer_22": 0.192626953125, "loss_aux_layer_23": 0.23388671875, "loss_aux_layer_3": 0.0789794921875, "loss_aux_layer_4": 0.0823974609375, "loss_aux_layer_5": 0.08447265625, "loss_aux_layer_6": 0.087646484375, "loss_aux_layer_7": 0.08447265625, "loss_aux_layer_8": 0.083740234375, "loss_aux_layer_9": 0.082763671875, "step": 1408, "total_loss": 0.6716974824666977 }, { "epoch": 0.2789546624430806, "grad_norm": 1.009126901626587, "learning_rate": 5e-05, "llm_loss": 0.5686463117599487, "loss": 2.7209, "loss_aux_layer_0": 0.023590087890625, "loss_aux_layer_1": 0.0556640625, "loss_aux_layer_10": 0.083740234375, "loss_aux_layer_11": 0.0894775390625, "loss_aux_layer_12": 0.0960693359375, "loss_aux_layer_13": 0.103271484375, "loss_aux_layer_14": 0.1143798828125, "loss_aux_layer_15": 0.1251220703125, "loss_aux_layer_16": 0.135498046875, "loss_aux_layer_17": 0.143310546875, "loss_aux_layer_18": 0.151611328125, "loss_aux_layer_19": 0.1533203125, "loss_aux_layer_2": 0.067138671875, "loss_aux_layer_20": 0.160400390625, "loss_aux_layer_21": 0.16845703125, "loss_aux_layer_22": 0.1904296875, "loss_aux_layer_23": 0.23046875, "loss_aux_layer_3": 0.07958984375, "loss_aux_layer_4": 0.082763671875, "loss_aux_layer_5": 0.0845947265625, "loss_aux_layer_6": 0.087646484375, "loss_aux_layer_7": 0.0845947265625, "loss_aux_layer_8": 0.0836181640625, "loss_aux_layer_9": 0.0823974609375, "step": 1409, "total_loss": 0.6802255660295486 }, { "epoch": 0.279152643040982, "grad_norm": 1.0241844654083252, "learning_rate": 5e-05, "llm_loss": 0.5922405868768692, "loss": 2.8114, "loss_aux_layer_0": 0.024261474609375, "loss_aux_layer_1": 0.05645751953125, "loss_aux_layer_10": 0.0838623046875, "loss_aux_layer_11": 0.0889892578125, "loss_aux_layer_12": 0.094970703125, "loss_aux_layer_13": 0.1019287109375, "loss_aux_layer_14": 0.1124267578125, "loss_aux_layer_15": 0.1226806640625, "loss_aux_layer_16": 0.133544921875, "loss_aux_layer_17": 0.14111328125, "loss_aux_layer_18": 0.14990234375, "loss_aux_layer_19": 0.152099609375, "loss_aux_layer_2": 0.0672607421875, "loss_aux_layer_20": 0.1591796875, "loss_aux_layer_21": 0.16552734375, "loss_aux_layer_22": 0.186279296875, "loss_aux_layer_23": 0.226806640625, "loss_aux_layer_3": 0.0799560546875, "loss_aux_layer_4": 0.0831298828125, "loss_aux_layer_5": 0.0850830078125, "loss_aux_layer_6": 0.0880126953125, "loss_aux_layer_7": 0.0848388671875, "loss_aux_layer_8": 0.083740234375, "loss_aux_layer_9": 0.08251953125, "step": 1410, "total_loss": 0.7028494477272034 }, { "epoch": 0.2793506236388834, "grad_norm": 0.8916775584220886, "learning_rate": 5e-05, "llm_loss": 0.6670347452163696, "loss": 3.1095, "loss_aux_layer_0": 0.0238037109375, "loss_aux_layer_1": 0.0560302734375, "loss_aux_layer_10": 0.0830078125, "loss_aux_layer_11": 0.0882568359375, "loss_aux_layer_12": 0.0946044921875, "loss_aux_layer_13": 0.101806640625, "loss_aux_layer_14": 0.1131591796875, "loss_aux_layer_15": 0.123779296875, "loss_aux_layer_16": 0.134521484375, "loss_aux_layer_17": 0.14208984375, "loss_aux_layer_18": 0.150390625, "loss_aux_layer_19": 0.152587890625, "loss_aux_layer_2": 0.0654296875, "loss_aux_layer_20": 0.15966796875, "loss_aux_layer_21": 0.166748046875, "loss_aux_layer_22": 0.18798828125, "loss_aux_layer_23": 0.2275390625, "loss_aux_layer_3": 0.0777587890625, "loss_aux_layer_4": 0.0811767578125, "loss_aux_layer_5": 0.08349609375, "loss_aux_layer_6": 0.0869140625, "loss_aux_layer_7": 0.083740234375, "loss_aux_layer_8": 0.08251953125, "loss_aux_layer_9": 0.0811767578125, "step": 1411, "total_loss": 0.777383491396904 }, { "epoch": 0.2795486042367848, "grad_norm": 1.6315867900848389, "learning_rate": 5e-05, "llm_loss": 0.5867514759302139, "loss": 2.7922, "loss_aux_layer_0": 0.0267333984375, "loss_aux_layer_1": 0.05487060546875, "loss_aux_layer_10": 0.08251953125, "loss_aux_layer_11": 0.088134765625, "loss_aux_layer_12": 0.09521484375, "loss_aux_layer_13": 0.103271484375, "loss_aux_layer_14": 0.114501953125, "loss_aux_layer_15": 0.125244140625, "loss_aux_layer_16": 0.136474609375, "loss_aux_layer_17": 0.14453125, "loss_aux_layer_18": 0.153076171875, "loss_aux_layer_19": 0.155517578125, "loss_aux_layer_2": 0.06512451171875, "loss_aux_layer_20": 0.162109375, "loss_aux_layer_21": 0.169189453125, "loss_aux_layer_22": 0.19091796875, "loss_aux_layer_23": 0.231201171875, "loss_aux_layer_3": 0.077392578125, "loss_aux_layer_4": 0.080810546875, "loss_aux_layer_5": 0.0826416015625, "loss_aux_layer_6": 0.0858154296875, "loss_aux_layer_7": 0.082763671875, "loss_aux_layer_8": 0.0819091796875, "loss_aux_layer_9": 0.080810546875, "step": 1412, "total_loss": 0.6980527341365814 }, { "epoch": 0.2797465848346862, "grad_norm": 1.5119524002075195, "learning_rate": 5e-05, "llm_loss": 0.6106058806180954, "loss": 2.8623, "loss_aux_layer_0": 0.0223388671875, "loss_aux_layer_1": 0.0540771484375, "loss_aux_layer_10": 0.07861328125, "loss_aux_layer_11": 0.0833740234375, "loss_aux_layer_12": 0.0892333984375, "loss_aux_layer_13": 0.09619140625, "loss_aux_layer_14": 0.106689453125, "loss_aux_layer_15": 0.116455078125, "loss_aux_layer_16": 0.1273193359375, "loss_aux_layer_17": 0.135009765625, "loss_aux_layer_18": 0.14306640625, "loss_aux_layer_19": 0.1455078125, "loss_aux_layer_2": 0.063720703125, "loss_aux_layer_20": 0.152099609375, "loss_aux_layer_21": 0.158447265625, "loss_aux_layer_22": 0.17822265625, "loss_aux_layer_23": 0.2158203125, "loss_aux_layer_3": 0.0753173828125, "loss_aux_layer_4": 0.078857421875, "loss_aux_layer_5": 0.0804443359375, "loss_aux_layer_6": 0.0830078125, "loss_aux_layer_7": 0.079833984375, "loss_aux_layer_8": 0.07861328125, "loss_aux_layer_9": 0.0775146484375, "step": 1413, "total_loss": 0.7155871242284775 }, { "epoch": 0.2799445654325876, "grad_norm": 1.7448481321334839, "learning_rate": 5e-05, "llm_loss": 0.5933872610330582, "loss": 2.8146, "loss_aux_layer_0": 0.0234375, "loss_aux_layer_1": 0.05572509765625, "loss_aux_layer_10": 0.0831298828125, "loss_aux_layer_11": 0.0887451171875, "loss_aux_layer_12": 0.09521484375, "loss_aux_layer_13": 0.1025390625, "loss_aux_layer_14": 0.113037109375, "loss_aux_layer_15": 0.123779296875, "loss_aux_layer_16": 0.134033203125, "loss_aux_layer_17": 0.141845703125, "loss_aux_layer_18": 0.150390625, "loss_aux_layer_19": 0.1513671875, "loss_aux_layer_2": 0.0670166015625, "loss_aux_layer_20": 0.158203125, "loss_aux_layer_21": 0.16552734375, "loss_aux_layer_22": 0.18603515625, "loss_aux_layer_23": 0.22607421875, "loss_aux_layer_3": 0.0794677734375, "loss_aux_layer_4": 0.082275390625, "loss_aux_layer_5": 0.083984375, "loss_aux_layer_6": 0.08642578125, "loss_aux_layer_7": 0.0836181640625, "loss_aux_layer_8": 0.082763671875, "loss_aux_layer_9": 0.0816650390625, "step": 1414, "total_loss": 0.703640416264534 }, { "epoch": 0.280142546030489, "grad_norm": 1.0619218349456787, "learning_rate": 5e-05, "llm_loss": 0.6359870135784149, "loss": 2.9678, "loss_aux_layer_0": 0.023590087890625, "loss_aux_layer_1": 0.0521240234375, "loss_aux_layer_10": 0.0787353515625, "loss_aux_layer_11": 0.083984375, "loss_aux_layer_12": 0.0906982421875, "loss_aux_layer_13": 0.09765625, "loss_aux_layer_14": 0.108154296875, "loss_aux_layer_15": 0.1187744140625, "loss_aux_layer_16": 0.1295166015625, "loss_aux_layer_17": 0.137451171875, "loss_aux_layer_18": 0.14599609375, "loss_aux_layer_19": 0.148681640625, "loss_aux_layer_2": 0.06256103515625, "loss_aux_layer_20": 0.155517578125, "loss_aux_layer_21": 0.162109375, "loss_aux_layer_22": 0.1826171875, "loss_aux_layer_23": 0.222412109375, "loss_aux_layer_3": 0.0733642578125, "loss_aux_layer_4": 0.076416015625, "loss_aux_layer_5": 0.0784912109375, "loss_aux_layer_6": 0.0811767578125, "loss_aux_layer_7": 0.0784912109375, "loss_aux_layer_8": 0.0775146484375, "loss_aux_layer_9": 0.0770263671875, "step": 1415, "total_loss": 0.7419481724500656 }, { "epoch": 0.28034052662839043, "grad_norm": 1.7482799291610718, "learning_rate": 5e-05, "llm_loss": 0.5851682275533676, "loss": 2.7792, "loss_aux_layer_0": 0.022857666015625, "loss_aux_layer_1": 0.0565185546875, "loss_aux_layer_10": 0.0836181640625, "loss_aux_layer_11": 0.0889892578125, "loss_aux_layer_12": 0.0953369140625, "loss_aux_layer_13": 0.10205078125, "loss_aux_layer_14": 0.1123046875, "loss_aux_layer_15": 0.1220703125, "loss_aux_layer_16": 0.13232421875, "loss_aux_layer_17": 0.1396484375, "loss_aux_layer_18": 0.147705078125, "loss_aux_layer_19": 0.1494140625, "loss_aux_layer_2": 0.06787109375, "loss_aux_layer_20": 0.15625, "loss_aux_layer_21": 0.162109375, "loss_aux_layer_22": 0.183349609375, "loss_aux_layer_23": 0.22216796875, "loss_aux_layer_3": 0.0804443359375, "loss_aux_layer_4": 0.0833740234375, "loss_aux_layer_5": 0.0853271484375, "loss_aux_layer_6": 0.0877685546875, "loss_aux_layer_7": 0.0843505859375, "loss_aux_layer_8": 0.08349609375, "loss_aux_layer_9": 0.082275390625, "step": 1416, "total_loss": 0.6948029398918152 }, { "epoch": 0.2805385072262918, "grad_norm": 1.8142144680023193, "learning_rate": 5e-05, "llm_loss": 0.6646926999092102, "loss": 3.0927, "loss_aux_layer_0": 0.024993896484375, "loss_aux_layer_1": 0.05419921875, "loss_aux_layer_10": 0.080322265625, "loss_aux_layer_11": 0.08544921875, "loss_aux_layer_12": 0.091796875, "loss_aux_layer_13": 0.0989990234375, "loss_aux_layer_14": 0.1104736328125, "loss_aux_layer_15": 0.121337890625, "loss_aux_layer_16": 0.13232421875, "loss_aux_layer_17": 0.140625, "loss_aux_layer_18": 0.1494140625, "loss_aux_layer_19": 0.15185546875, "loss_aux_layer_2": 0.06494140625, "loss_aux_layer_20": 0.15869140625, "loss_aux_layer_21": 0.164794921875, "loss_aux_layer_22": 0.1865234375, "loss_aux_layer_23": 0.2255859375, "loss_aux_layer_3": 0.076904296875, "loss_aux_layer_4": 0.0797119140625, "loss_aux_layer_5": 0.0811767578125, "loss_aux_layer_6": 0.0841064453125, "loss_aux_layer_7": 0.0806884765625, "loss_aux_layer_8": 0.0799560546875, "loss_aux_layer_9": 0.07861328125, "step": 1417, "total_loss": 0.7731750905513763 }, { "epoch": 0.28073648782419325, "grad_norm": 1.4789575338363647, "learning_rate": 5e-05, "llm_loss": 0.5933241248130798, "loss": 2.8158, "loss_aux_layer_0": 0.02239990234375, "loss_aux_layer_1": 0.05572509765625, "loss_aux_layer_10": 0.082763671875, "loss_aux_layer_11": 0.088134765625, "loss_aux_layer_12": 0.094482421875, "loss_aux_layer_13": 0.10205078125, "loss_aux_layer_14": 0.113037109375, "loss_aux_layer_15": 0.1236572265625, "loss_aux_layer_16": 0.13427734375, "loss_aux_layer_17": 0.14208984375, "loss_aux_layer_18": 0.15087890625, "loss_aux_layer_19": 0.15283203125, "loss_aux_layer_2": 0.067138671875, "loss_aux_layer_20": 0.159912109375, "loss_aux_layer_21": 0.166748046875, "loss_aux_layer_22": 0.188720703125, "loss_aux_layer_23": 0.228271484375, "loss_aux_layer_3": 0.0794677734375, "loss_aux_layer_4": 0.082763671875, "loss_aux_layer_5": 0.0848388671875, "loss_aux_layer_6": 0.08740234375, "loss_aux_layer_7": 0.0836181640625, "loss_aux_layer_8": 0.0826416015625, "loss_aux_layer_9": 0.0816650390625, "step": 1418, "total_loss": 0.7039507925510406 }, { "epoch": 0.28093446842209463, "grad_norm": 1.2499452829360962, "learning_rate": 5e-05, "llm_loss": 0.5547657012939453, "loss": 2.6678, "loss_aux_layer_0": 0.026947021484375, "loss_aux_layer_1": 0.05877685546875, "loss_aux_layer_10": 0.0831298828125, "loss_aux_layer_11": 0.0885009765625, "loss_aux_layer_12": 0.094970703125, "loss_aux_layer_13": 0.1025390625, "loss_aux_layer_14": 0.114013671875, "loss_aux_layer_15": 0.125, "loss_aux_layer_16": 0.13623046875, "loss_aux_layer_17": 0.14306640625, "loss_aux_layer_18": 0.15283203125, "loss_aux_layer_19": 0.15576171875, "loss_aux_layer_2": 0.0673828125, "loss_aux_layer_20": 0.16259765625, "loss_aux_layer_21": 0.169677734375, "loss_aux_layer_22": 0.1923828125, "loss_aux_layer_23": 0.234130859375, "loss_aux_layer_3": 0.079833984375, "loss_aux_layer_4": 0.0828857421875, "loss_aux_layer_5": 0.0845947265625, "loss_aux_layer_6": 0.0872802734375, "loss_aux_layer_7": 0.0841064453125, "loss_aux_layer_8": 0.083251953125, "loss_aux_layer_9": 0.08203125, "step": 1419, "total_loss": 0.6669389307498932 }, { "epoch": 0.28113244901999607, "grad_norm": 2.111928701400757, "learning_rate": 5e-05, "llm_loss": 0.6043940633535385, "loss": 2.8611, "loss_aux_layer_0": 0.0245361328125, "loss_aux_layer_1": 0.05670166015625, "loss_aux_layer_10": 0.0819091796875, "loss_aux_layer_11": 0.087158203125, "loss_aux_layer_12": 0.093505859375, "loss_aux_layer_13": 0.10107421875, "loss_aux_layer_14": 0.1123046875, "loss_aux_layer_15": 0.123291015625, "loss_aux_layer_16": 0.1337890625, "loss_aux_layer_17": 0.1416015625, "loss_aux_layer_18": 0.150634765625, "loss_aux_layer_19": 0.1533203125, "loss_aux_layer_2": 0.0684814453125, "loss_aux_layer_20": 0.16064453125, "loss_aux_layer_21": 0.16845703125, "loss_aux_layer_22": 0.1904296875, "loss_aux_layer_23": 0.23046875, "loss_aux_layer_3": 0.0799560546875, "loss_aux_layer_4": 0.0831298828125, "loss_aux_layer_5": 0.0850830078125, "loss_aux_layer_6": 0.0875244140625, "loss_aux_layer_7": 0.0836181640625, "loss_aux_layer_8": 0.082275390625, "loss_aux_layer_9": 0.0806884765625, "step": 1420, "total_loss": 0.7152854204177856 }, { "epoch": 0.28133042961789745, "grad_norm": 1.2448310852050781, "learning_rate": 5e-05, "llm_loss": 0.6202588081359863, "loss": 2.9117, "loss_aux_layer_0": 0.0230712890625, "loss_aux_layer_1": 0.0537109375, "loss_aux_layer_10": 0.0809326171875, "loss_aux_layer_11": 0.0859375, "loss_aux_layer_12": 0.091796875, "loss_aux_layer_13": 0.0986328125, "loss_aux_layer_14": 0.1097412109375, "loss_aux_layer_15": 0.1199951171875, "loss_aux_layer_16": 0.130126953125, "loss_aux_layer_17": 0.137939453125, "loss_aux_layer_18": 0.14794921875, "loss_aux_layer_19": 0.150146484375, "loss_aux_layer_2": 0.064208984375, "loss_aux_layer_20": 0.156982421875, "loss_aux_layer_21": 0.1630859375, "loss_aux_layer_22": 0.183349609375, "loss_aux_layer_23": 0.222412109375, "loss_aux_layer_3": 0.076416015625, "loss_aux_layer_4": 0.07958984375, "loss_aux_layer_5": 0.0819091796875, "loss_aux_layer_6": 0.0845947265625, "loss_aux_layer_7": 0.081298828125, "loss_aux_layer_8": 0.0804443359375, "loss_aux_layer_9": 0.07958984375, "step": 1421, "total_loss": 0.7279192805290222 }, { "epoch": 0.28152841021579883, "grad_norm": 1.5839871168136597, "learning_rate": 5e-05, "llm_loss": 0.6113898754119873, "loss": 2.8865, "loss_aux_layer_0": 0.0250244140625, "loss_aux_layer_1": 0.055908203125, "loss_aux_layer_10": 0.0819091796875, "loss_aux_layer_11": 0.087158203125, "loss_aux_layer_12": 0.0936279296875, "loss_aux_layer_13": 0.1007080078125, "loss_aux_layer_14": 0.1116943359375, "loss_aux_layer_15": 0.1226806640625, "loss_aux_layer_16": 0.1339111328125, "loss_aux_layer_17": 0.14208984375, "loss_aux_layer_18": 0.151611328125, "loss_aux_layer_19": 0.154296875, "loss_aux_layer_2": 0.06689453125, "loss_aux_layer_20": 0.161376953125, "loss_aux_layer_21": 0.16796875, "loss_aux_layer_22": 0.1884765625, "loss_aux_layer_23": 0.22900390625, "loss_aux_layer_3": 0.077880859375, "loss_aux_layer_4": 0.0806884765625, "loss_aux_layer_5": 0.0828857421875, "loss_aux_layer_6": 0.0858154296875, "loss_aux_layer_7": 0.0823974609375, "loss_aux_layer_8": 0.081298828125, "loss_aux_layer_9": 0.0802001953125, "step": 1422, "total_loss": 0.7216233313083649 }, { "epoch": 0.28172639081370027, "grad_norm": 1.6431611776351929, "learning_rate": 5e-05, "llm_loss": 0.6237646490335464, "loss": 2.9452, "loss_aux_layer_0": 0.0242919921875, "loss_aux_layer_1": 0.056640625, "loss_aux_layer_10": 0.0855712890625, "loss_aux_layer_11": 0.091064453125, "loss_aux_layer_12": 0.0972900390625, "loss_aux_layer_13": 0.104736328125, "loss_aux_layer_14": 0.1160888671875, "loss_aux_layer_15": 0.12646484375, "loss_aux_layer_16": 0.136474609375, "loss_aux_layer_17": 0.144287109375, "loss_aux_layer_18": 0.152587890625, "loss_aux_layer_19": 0.154052734375, "loss_aux_layer_2": 0.0692138671875, "loss_aux_layer_20": 0.160400390625, "loss_aux_layer_21": 0.1669921875, "loss_aux_layer_22": 0.18798828125, "loss_aux_layer_23": 0.228271484375, "loss_aux_layer_3": 0.0810546875, "loss_aux_layer_4": 0.0845947265625, "loss_aux_layer_5": 0.086669921875, "loss_aux_layer_6": 0.089599609375, "loss_aux_layer_7": 0.0863037109375, "loss_aux_layer_8": 0.08544921875, "loss_aux_layer_9": 0.083984375, "step": 1423, "total_loss": 0.736290767788887 }, { "epoch": 0.28192437141160165, "grad_norm": 1.5355675220489502, "learning_rate": 5e-05, "llm_loss": 0.6058123856782913, "loss": 2.8496, "loss_aux_layer_0": 0.022979736328125, "loss_aux_layer_1": 0.0535888671875, "loss_aux_layer_10": 0.079345703125, "loss_aux_layer_11": 0.0843505859375, "loss_aux_layer_12": 0.0904541015625, "loss_aux_layer_13": 0.0977783203125, "loss_aux_layer_14": 0.1082763671875, "loss_aux_layer_15": 0.1182861328125, "loss_aux_layer_16": 0.128662109375, "loss_aux_layer_17": 0.136962890625, "loss_aux_layer_18": 0.1455078125, "loss_aux_layer_19": 0.147705078125, "loss_aux_layer_2": 0.06329345703125, "loss_aux_layer_20": 0.1552734375, "loss_aux_layer_21": 0.16259765625, "loss_aux_layer_22": 0.18359375, "loss_aux_layer_23": 0.223388671875, "loss_aux_layer_3": 0.0751953125, "loss_aux_layer_4": 0.0784912109375, "loss_aux_layer_5": 0.080322265625, "loss_aux_layer_6": 0.0831298828125, "loss_aux_layer_7": 0.080078125, "loss_aux_layer_8": 0.079345703125, "loss_aux_layer_9": 0.078369140625, "step": 1424, "total_loss": 0.7124043852090836 }, { "epoch": 0.2821223520095031, "grad_norm": 1.446699619293213, "learning_rate": 5e-05, "llm_loss": 0.6985877007246017, "loss": 3.2274, "loss_aux_layer_0": 0.024261474609375, "loss_aux_layer_1": 0.05517578125, "loss_aux_layer_10": 0.0810546875, "loss_aux_layer_11": 0.0859375, "loss_aux_layer_12": 0.092529296875, "loss_aux_layer_13": 0.0997314453125, "loss_aux_layer_14": 0.1107177734375, "loss_aux_layer_15": 0.12060546875, "loss_aux_layer_16": 0.131591796875, "loss_aux_layer_17": 0.13916015625, "loss_aux_layer_18": 0.1474609375, "loss_aux_layer_19": 0.1494140625, "loss_aux_layer_2": 0.066162109375, "loss_aux_layer_20": 0.15625, "loss_aux_layer_21": 0.162841796875, "loss_aux_layer_22": 0.1845703125, "loss_aux_layer_23": 0.22412109375, "loss_aux_layer_3": 0.0771484375, "loss_aux_layer_4": 0.0806884765625, "loss_aux_layer_5": 0.082275390625, "loss_aux_layer_6": 0.0850830078125, "loss_aux_layer_7": 0.081787109375, "loss_aux_layer_8": 0.08056640625, "loss_aux_layer_9": 0.079345703125, "step": 1425, "total_loss": 0.8068436980247498 }, { "epoch": 0.28232033260740447, "grad_norm": 1.3135203123092651, "learning_rate": 5e-05, "llm_loss": 0.6321958601474762, "loss": 2.9626, "loss_aux_layer_0": 0.02349853515625, "loss_aux_layer_1": 0.0540771484375, "loss_aux_layer_10": 0.081298828125, "loss_aux_layer_11": 0.0863037109375, "loss_aux_layer_12": 0.0924072265625, "loss_aux_layer_13": 0.0994873046875, "loss_aux_layer_14": 0.1099853515625, "loss_aux_layer_15": 0.120361328125, "loss_aux_layer_16": 0.1309814453125, "loss_aux_layer_17": 0.138671875, "loss_aux_layer_18": 0.147216796875, "loss_aux_layer_19": 0.14892578125, "loss_aux_layer_2": 0.0657958984375, "loss_aux_layer_20": 0.15673828125, "loss_aux_layer_21": 0.16357421875, "loss_aux_layer_22": 0.187255859375, "loss_aux_layer_23": 0.22802734375, "loss_aux_layer_3": 0.0775146484375, "loss_aux_layer_4": 0.0802001953125, "loss_aux_layer_5": 0.0821533203125, "loss_aux_layer_6": 0.0849609375, "loss_aux_layer_7": 0.081787109375, "loss_aux_layer_8": 0.0810546875, "loss_aux_layer_9": 0.0799560546875, "step": 1426, "total_loss": 0.7406516671180725 }, { "epoch": 0.2825183132053059, "grad_norm": 1.3306671380996704, "learning_rate": 5e-05, "llm_loss": 0.7329294383525848, "loss": 3.3736, "loss_aux_layer_0": 0.02294921875, "loss_aux_layer_1": 0.05517578125, "loss_aux_layer_10": 0.0831298828125, "loss_aux_layer_11": 0.0882568359375, "loss_aux_layer_12": 0.094482421875, "loss_aux_layer_13": 0.1014404296875, "loss_aux_layer_14": 0.112548828125, "loss_aux_layer_15": 0.122802734375, "loss_aux_layer_16": 0.1337890625, "loss_aux_layer_17": 0.141845703125, "loss_aux_layer_18": 0.150634765625, "loss_aux_layer_19": 0.15283203125, "loss_aux_layer_2": 0.06597900390625, "loss_aux_layer_20": 0.16015625, "loss_aux_layer_21": 0.16650390625, "loss_aux_layer_22": 0.189208984375, "loss_aux_layer_23": 0.22998046875, "loss_aux_layer_3": 0.0784912109375, "loss_aux_layer_4": 0.0819091796875, "loss_aux_layer_5": 0.0838623046875, "loss_aux_layer_6": 0.0867919921875, "loss_aux_layer_7": 0.083984375, "loss_aux_layer_8": 0.0828857421875, "loss_aux_layer_9": 0.081298828125, "step": 1427, "total_loss": 0.8434049785137177 }, { "epoch": 0.2827162938032073, "grad_norm": 1.0149387121200562, "learning_rate": 5e-05, "llm_loss": 0.5548081994056702, "loss": 2.6709, "loss_aux_layer_0": 0.023284912109375, "loss_aux_layer_1": 0.057373046875, "loss_aux_layer_10": 0.0855712890625, "loss_aux_layer_11": 0.0909423828125, "loss_aux_layer_12": 0.0970458984375, "loss_aux_layer_13": 0.1044921875, "loss_aux_layer_14": 0.1153564453125, "loss_aux_layer_15": 0.1253662109375, "loss_aux_layer_16": 0.13623046875, "loss_aux_layer_17": 0.14404296875, "loss_aux_layer_18": 0.15234375, "loss_aux_layer_19": 0.154296875, "loss_aux_layer_2": 0.068359375, "loss_aux_layer_20": 0.1611328125, "loss_aux_layer_21": 0.169189453125, "loss_aux_layer_22": 0.193359375, "loss_aux_layer_23": 0.23388671875, "loss_aux_layer_3": 0.0810546875, "loss_aux_layer_4": 0.0845947265625, "loss_aux_layer_5": 0.0863037109375, "loss_aux_layer_6": 0.0894775390625, "loss_aux_layer_7": 0.08642578125, "loss_aux_layer_8": 0.085205078125, "loss_aux_layer_9": 0.083740234375, "step": 1428, "total_loss": 0.6677246689796448 }, { "epoch": 0.28291427440110867, "grad_norm": 1.5315356254577637, "learning_rate": 5e-05, "llm_loss": 0.6362524181604385, "loss": 2.972, "loss_aux_layer_0": 0.022857666015625, "loss_aux_layer_1": 0.05316162109375, "loss_aux_layer_10": 0.080078125, "loss_aux_layer_11": 0.0848388671875, "loss_aux_layer_12": 0.090576171875, "loss_aux_layer_13": 0.0975341796875, "loss_aux_layer_14": 0.10791015625, "loss_aux_layer_15": 0.1182861328125, "loss_aux_layer_16": 0.1287841796875, "loss_aux_layer_17": 0.136474609375, "loss_aux_layer_18": 0.1455078125, "loss_aux_layer_19": 0.147705078125, "loss_aux_layer_2": 0.064697265625, "loss_aux_layer_20": 0.155029296875, "loss_aux_layer_21": 0.162841796875, "loss_aux_layer_22": 0.18359375, "loss_aux_layer_23": 0.22265625, "loss_aux_layer_3": 0.0762939453125, "loss_aux_layer_4": 0.0792236328125, "loss_aux_layer_5": 0.0811767578125, "loss_aux_layer_6": 0.0838623046875, "loss_aux_layer_7": 0.08056640625, "loss_aux_layer_8": 0.07958984375, "loss_aux_layer_9": 0.078369140625, "step": 1429, "total_loss": 0.742994949221611 }, { "epoch": 0.2831122549990101, "grad_norm": 1.1940439939498901, "learning_rate": 5e-05, "llm_loss": 0.6082964688539505, "loss": 2.8587, "loss_aux_layer_0": 0.023284912109375, "loss_aux_layer_1": 0.0528564453125, "loss_aux_layer_10": 0.078857421875, "loss_aux_layer_11": 0.0841064453125, "loss_aux_layer_12": 0.0906982421875, "loss_aux_layer_13": 0.0980224609375, "loss_aux_layer_14": 0.1085205078125, "loss_aux_layer_15": 0.118408203125, "loss_aux_layer_16": 0.12890625, "loss_aux_layer_17": 0.137451171875, "loss_aux_layer_18": 0.1455078125, "loss_aux_layer_19": 0.14794921875, "loss_aux_layer_2": 0.06390380859375, "loss_aux_layer_20": 0.15478515625, "loss_aux_layer_21": 0.16162109375, "loss_aux_layer_22": 0.183349609375, "loss_aux_layer_23": 0.22265625, "loss_aux_layer_3": 0.074951171875, "loss_aux_layer_4": 0.078125, "loss_aux_layer_5": 0.079833984375, "loss_aux_layer_6": 0.0826416015625, "loss_aux_layer_7": 0.07958984375, "loss_aux_layer_8": 0.0787353515625, "loss_aux_layer_9": 0.0775146484375, "step": 1430, "total_loss": 0.7146743908524513 }, { "epoch": 0.2833102355969115, "grad_norm": 1.4508085250854492, "learning_rate": 5e-05, "llm_loss": 0.579053670167923, "loss": 2.734, "loss_aux_layer_0": 0.02398681640625, "loss_aux_layer_1": 0.05224609375, "loss_aux_layer_10": 0.0771484375, "loss_aux_layer_11": 0.08203125, "loss_aux_layer_12": 0.0885009765625, "loss_aux_layer_13": 0.095947265625, "loss_aux_layer_14": 0.1064453125, "loss_aux_layer_15": 0.1168212890625, "loss_aux_layer_16": 0.12744140625, "loss_aux_layer_17": 0.135498046875, "loss_aux_layer_18": 0.14404296875, "loss_aux_layer_19": 0.14697265625, "loss_aux_layer_2": 0.06182861328125, "loss_aux_layer_20": 0.1533203125, "loss_aux_layer_21": 0.159912109375, "loss_aux_layer_22": 0.17919921875, "loss_aux_layer_23": 0.2197265625, "loss_aux_layer_3": 0.0728759765625, "loss_aux_layer_4": 0.0755615234375, "loss_aux_layer_5": 0.07763671875, "loss_aux_layer_6": 0.0802001953125, "loss_aux_layer_7": 0.077392578125, "loss_aux_layer_8": 0.0765380859375, "loss_aux_layer_9": 0.07568359375, "step": 1431, "total_loss": 0.6834950000047684 }, { "epoch": 0.2835082161948129, "grad_norm": 1.0426313877105713, "learning_rate": 5e-05, "llm_loss": 0.6152193546295166, "loss": 2.9018, "loss_aux_layer_0": 0.029541015625, "loss_aux_layer_1": 0.0579833984375, "loss_aux_layer_10": 0.0831298828125, "loss_aux_layer_11": 0.088134765625, "loss_aux_layer_12": 0.094970703125, "loss_aux_layer_13": 0.102294921875, "loss_aux_layer_14": 0.113037109375, "loss_aux_layer_15": 0.123291015625, "loss_aux_layer_16": 0.1337890625, "loss_aux_layer_17": 0.140869140625, "loss_aux_layer_18": 0.1494140625, "loss_aux_layer_19": 0.151611328125, "loss_aux_layer_2": 0.06707763671875, "loss_aux_layer_20": 0.15771484375, "loss_aux_layer_21": 0.163818359375, "loss_aux_layer_22": 0.18408203125, "loss_aux_layer_23": 0.224365234375, "loss_aux_layer_3": 0.078857421875, "loss_aux_layer_4": 0.0821533203125, "loss_aux_layer_5": 0.083984375, "loss_aux_layer_6": 0.0870361328125, "loss_aux_layer_7": 0.0836181640625, "loss_aux_layer_8": 0.082763671875, "loss_aux_layer_9": 0.08154296875, "step": 1432, "total_loss": 0.7254383563995361 }, { "epoch": 0.2837061967927143, "grad_norm": 1.4635570049285889, "learning_rate": 5e-05, "llm_loss": 0.586011677980423, "loss": 2.7764, "loss_aux_layer_0": 0.02325439453125, "loss_aux_layer_1": 0.05517578125, "loss_aux_layer_10": 0.080322265625, "loss_aux_layer_11": 0.0853271484375, "loss_aux_layer_12": 0.0916748046875, "loss_aux_layer_13": 0.0989990234375, "loss_aux_layer_14": 0.1099853515625, "loss_aux_layer_15": 0.1207275390625, "loss_aux_layer_16": 0.13134765625, "loss_aux_layer_17": 0.138671875, "loss_aux_layer_18": 0.1474609375, "loss_aux_layer_19": 0.14990234375, "loss_aux_layer_2": 0.0665283203125, "loss_aux_layer_20": 0.157470703125, "loss_aux_layer_21": 0.164306640625, "loss_aux_layer_22": 0.18505859375, "loss_aux_layer_23": 0.224853515625, "loss_aux_layer_3": 0.0771484375, "loss_aux_layer_4": 0.0799560546875, "loss_aux_layer_5": 0.0814208984375, "loss_aux_layer_6": 0.0841064453125, "loss_aux_layer_7": 0.0811767578125, "loss_aux_layer_8": 0.0802001953125, "loss_aux_layer_9": 0.0787353515625, "step": 1433, "total_loss": 0.6940936595201492 }, { "epoch": 0.28390417739061574, "grad_norm": 1.4270440340042114, "learning_rate": 5e-05, "llm_loss": 0.572545625269413, "loss": 2.7038, "loss_aux_layer_0": 0.0252685546875, "loss_aux_layer_1": 0.0518798828125, "loss_aux_layer_10": 0.0751953125, "loss_aux_layer_11": 0.0797119140625, "loss_aux_layer_12": 0.08544921875, "loss_aux_layer_13": 0.091552734375, "loss_aux_layer_14": 0.1025390625, "loss_aux_layer_15": 0.113525390625, "loss_aux_layer_16": 0.12451171875, "loss_aux_layer_17": 0.133056640625, "loss_aux_layer_18": 0.142578125, "loss_aux_layer_19": 0.14697265625, "loss_aux_layer_2": 0.060302734375, "loss_aux_layer_20": 0.1552734375, "loss_aux_layer_21": 0.162353515625, "loss_aux_layer_22": 0.182861328125, "loss_aux_layer_23": 0.222900390625, "loss_aux_layer_3": 0.0712890625, "loss_aux_layer_4": 0.0743408203125, "loss_aux_layer_5": 0.076416015625, "loss_aux_layer_6": 0.0789794921875, "loss_aux_layer_7": 0.0758056640625, "loss_aux_layer_8": 0.0750732421875, "loss_aux_layer_9": 0.07373046875, "step": 1434, "total_loss": 0.6759474575519562 }, { "epoch": 0.2841021579885171, "grad_norm": 1.0515927076339722, "learning_rate": 5e-05, "llm_loss": 0.6677646487951279, "loss": 3.0949, "loss_aux_layer_0": 0.024078369140625, "loss_aux_layer_1": 0.05206298828125, "loss_aux_layer_10": 0.07763671875, "loss_aux_layer_11": 0.0823974609375, "loss_aux_layer_12": 0.0887451171875, "loss_aux_layer_13": 0.0958251953125, "loss_aux_layer_14": 0.1072998046875, "loss_aux_layer_15": 0.1182861328125, "loss_aux_layer_16": 0.130126953125, "loss_aux_layer_17": 0.13818359375, "loss_aux_layer_18": 0.147216796875, "loss_aux_layer_19": 0.14990234375, "loss_aux_layer_2": 0.0615234375, "loss_aux_layer_20": 0.156982421875, "loss_aux_layer_21": 0.16357421875, "loss_aux_layer_22": 0.183837890625, "loss_aux_layer_23": 0.223388671875, "loss_aux_layer_3": 0.072998046875, "loss_aux_layer_4": 0.0765380859375, "loss_aux_layer_5": 0.0787353515625, "loss_aux_layer_6": 0.081787109375, "loss_aux_layer_7": 0.078369140625, "loss_aux_layer_8": 0.0772705078125, "loss_aux_layer_9": 0.0760498046875, "step": 1435, "total_loss": 0.7737243920564651 }, { "epoch": 0.2843001385864185, "grad_norm": 1.463126301765442, "learning_rate": 5e-05, "llm_loss": 0.5672233328223228, "loss": 2.7138, "loss_aux_layer_0": 0.023681640625, "loss_aux_layer_1": 0.05535888671875, "loss_aux_layer_10": 0.083251953125, "loss_aux_layer_11": 0.088623046875, "loss_aux_layer_12": 0.09521484375, "loss_aux_layer_13": 0.1025390625, "loss_aux_layer_14": 0.113525390625, "loss_aux_layer_15": 0.1241455078125, "loss_aux_layer_16": 0.135986328125, "loss_aux_layer_17": 0.143798828125, "loss_aux_layer_18": 0.152587890625, "loss_aux_layer_19": 0.15478515625, "loss_aux_layer_2": 0.06561279296875, "loss_aux_layer_20": 0.162353515625, "loss_aux_layer_21": 0.168701171875, "loss_aux_layer_22": 0.190673828125, "loss_aux_layer_23": 0.2314453125, "loss_aux_layer_3": 0.0777587890625, "loss_aux_layer_4": 0.0814208984375, "loss_aux_layer_5": 0.0836181640625, "loss_aux_layer_6": 0.0869140625, "loss_aux_layer_7": 0.083740234375, "loss_aux_layer_8": 0.082763671875, "loss_aux_layer_9": 0.081787109375, "step": 1436, "total_loss": 0.6784467101097107 }, { "epoch": 0.28449811918431994, "grad_norm": 1.0601164102554321, "learning_rate": 5e-05, "llm_loss": 0.5090620592236519, "loss": 2.47, "loss_aux_layer_0": 0.0235595703125, "loss_aux_layer_1": 0.05535888671875, "loss_aux_layer_10": 0.0804443359375, "loss_aux_layer_11": 0.085693359375, "loss_aux_layer_12": 0.0921630859375, "loss_aux_layer_13": 0.099365234375, "loss_aux_layer_14": 0.109619140625, "loss_aux_layer_15": 0.1197509765625, "loss_aux_layer_16": 0.13037109375, "loss_aux_layer_17": 0.138427734375, "loss_aux_layer_18": 0.14697265625, "loss_aux_layer_19": 0.149169921875, "loss_aux_layer_2": 0.067138671875, "loss_aux_layer_20": 0.15673828125, "loss_aux_layer_21": 0.1640625, "loss_aux_layer_22": 0.1865234375, "loss_aux_layer_23": 0.2265625, "loss_aux_layer_3": 0.0784912109375, "loss_aux_layer_4": 0.0811767578125, "loss_aux_layer_5": 0.082763671875, "loss_aux_layer_6": 0.085693359375, "loss_aux_layer_7": 0.082275390625, "loss_aux_layer_8": 0.080810546875, "loss_aux_layer_9": 0.0792236328125, "step": 1437, "total_loss": 0.6174914836883545 }, { "epoch": 0.2846960997822213, "grad_norm": 1.168541669845581, "learning_rate": 5e-05, "llm_loss": 0.6093466877937317, "loss": 2.8727, "loss_aux_layer_0": 0.024078369140625, "loss_aux_layer_1": 0.0548095703125, "loss_aux_layer_10": 0.0810546875, "loss_aux_layer_11": 0.086181640625, "loss_aux_layer_12": 0.0924072265625, "loss_aux_layer_13": 0.099365234375, "loss_aux_layer_14": 0.1107177734375, "loss_aux_layer_15": 0.12109375, "loss_aux_layer_16": 0.132080078125, "loss_aux_layer_17": 0.14013671875, "loss_aux_layer_18": 0.1494140625, "loss_aux_layer_19": 0.1513671875, "loss_aux_layer_2": 0.0654296875, "loss_aux_layer_20": 0.158447265625, "loss_aux_layer_21": 0.165283203125, "loss_aux_layer_22": 0.186767578125, "loss_aux_layer_23": 0.226806640625, "loss_aux_layer_3": 0.076904296875, "loss_aux_layer_4": 0.0799560546875, "loss_aux_layer_5": 0.08203125, "loss_aux_layer_6": 0.0849609375, "loss_aux_layer_7": 0.081787109375, "loss_aux_layer_8": 0.080810546875, "loss_aux_layer_9": 0.07958984375, "step": 1438, "total_loss": 0.7181655466556549 }, { "epoch": 0.28489408038012276, "grad_norm": 1.3216078281402588, "learning_rate": 5e-05, "llm_loss": 0.5908782631158829, "loss": 2.7949, "loss_aux_layer_0": 0.024322509765625, "loss_aux_layer_1": 0.05267333984375, "loss_aux_layer_10": 0.0794677734375, "loss_aux_layer_11": 0.0843505859375, "loss_aux_layer_12": 0.0909423828125, "loss_aux_layer_13": 0.09814453125, "loss_aux_layer_14": 0.1094970703125, "loss_aux_layer_15": 0.120361328125, "loss_aux_layer_16": 0.1318359375, "loss_aux_layer_17": 0.140625, "loss_aux_layer_18": 0.150634765625, "loss_aux_layer_19": 0.153076171875, "loss_aux_layer_2": 0.0625, "loss_aux_layer_20": 0.16015625, "loss_aux_layer_21": 0.16650390625, "loss_aux_layer_22": 0.186767578125, "loss_aux_layer_23": 0.2275390625, "loss_aux_layer_3": 0.0743408203125, "loss_aux_layer_4": 0.0771484375, "loss_aux_layer_5": 0.0791015625, "loss_aux_layer_6": 0.0821533203125, "loss_aux_layer_7": 0.079345703125, "loss_aux_layer_8": 0.0787353515625, "loss_aux_layer_9": 0.077880859375, "step": 1439, "total_loss": 0.6987141966819763 }, { "epoch": 0.28509206097802414, "grad_norm": 1.250522494316101, "learning_rate": 5e-05, "llm_loss": 0.541973702609539, "loss": 2.5978, "loss_aux_layer_0": 0.023468017578125, "loss_aux_layer_1": 0.05426025390625, "loss_aux_layer_10": 0.080322265625, "loss_aux_layer_11": 0.085205078125, "loss_aux_layer_12": 0.0911865234375, "loss_aux_layer_13": 0.0977783203125, "loss_aux_layer_14": 0.1080322265625, "loss_aux_layer_15": 0.1181640625, "loss_aux_layer_16": 0.12841796875, "loss_aux_layer_17": 0.135986328125, "loss_aux_layer_18": 0.14453125, "loss_aux_layer_19": 0.14794921875, "loss_aux_layer_2": 0.066162109375, "loss_aux_layer_20": 0.1552734375, "loss_aux_layer_21": 0.1630859375, "loss_aux_layer_22": 0.18603515625, "loss_aux_layer_23": 0.227294921875, "loss_aux_layer_3": 0.07763671875, "loss_aux_layer_4": 0.0806884765625, "loss_aux_layer_5": 0.08203125, "loss_aux_layer_6": 0.0850830078125, "loss_aux_layer_7": 0.0816650390625, "loss_aux_layer_8": 0.080322265625, "loss_aux_layer_9": 0.0791015625, "step": 1440, "total_loss": 0.6494378596544266 }, { "epoch": 0.2852900415759256, "grad_norm": 0.9770023822784424, "learning_rate": 5e-05, "llm_loss": 0.5819809436798096, "loss": 2.7491, "loss_aux_layer_0": 0.02301025390625, "loss_aux_layer_1": 0.05072021484375, "loss_aux_layer_10": 0.077392578125, "loss_aux_layer_11": 0.08203125, "loss_aux_layer_12": 0.087890625, "loss_aux_layer_13": 0.094970703125, "loss_aux_layer_14": 0.1055908203125, "loss_aux_layer_15": 0.11669921875, "loss_aux_layer_16": 0.127685546875, "loss_aux_layer_17": 0.135498046875, "loss_aux_layer_18": 0.14501953125, "loss_aux_layer_19": 0.14794921875, "loss_aux_layer_2": 0.06256103515625, "loss_aux_layer_20": 0.1552734375, "loss_aux_layer_21": 0.163818359375, "loss_aux_layer_22": 0.1845703125, "loss_aux_layer_23": 0.224609375, "loss_aux_layer_3": 0.07373046875, "loss_aux_layer_4": 0.07666015625, "loss_aux_layer_5": 0.0784912109375, "loss_aux_layer_6": 0.08154296875, "loss_aux_layer_7": 0.078125, "loss_aux_layer_8": 0.0772705078125, "loss_aux_layer_9": 0.076171875, "step": 1441, "total_loss": 0.6872783601284027 }, { "epoch": 0.28548802217382696, "grad_norm": 1.2382843494415283, "learning_rate": 5e-05, "llm_loss": 0.57694411277771, "loss": 2.7433, "loss_aux_layer_0": 0.023712158203125, "loss_aux_layer_1": 0.05487060546875, "loss_aux_layer_10": 0.0821533203125, "loss_aux_layer_11": 0.0872802734375, "loss_aux_layer_12": 0.0936279296875, "loss_aux_layer_13": 0.1002197265625, "loss_aux_layer_14": 0.1104736328125, "loss_aux_layer_15": 0.120849609375, "loss_aux_layer_16": 0.131103515625, "loss_aux_layer_17": 0.138916015625, "loss_aux_layer_18": 0.146728515625, "loss_aux_layer_19": 0.14892578125, "loss_aux_layer_2": 0.06591796875, "loss_aux_layer_20": 0.15625, "loss_aux_layer_21": 0.1640625, "loss_aux_layer_22": 0.186767578125, "loss_aux_layer_23": 0.22802734375, "loss_aux_layer_3": 0.0780029296875, "loss_aux_layer_4": 0.0811767578125, "loss_aux_layer_5": 0.0830078125, "loss_aux_layer_6": 0.0859375, "loss_aux_layer_7": 0.0828857421875, "loss_aux_layer_8": 0.0821533203125, "loss_aux_layer_9": 0.080810546875, "step": 1442, "total_loss": 0.6858250945806503 }, { "epoch": 0.2856860027717284, "grad_norm": 1.1409047842025757, "learning_rate": 5e-05, "llm_loss": 0.6613222807645798, "loss": 3.0838, "loss_aux_layer_0": 0.02520751953125, "loss_aux_layer_1": 0.05487060546875, "loss_aux_layer_10": 0.0819091796875, "loss_aux_layer_11": 0.0870361328125, "loss_aux_layer_12": 0.09375, "loss_aux_layer_13": 0.1004638671875, "loss_aux_layer_14": 0.1119384765625, "loss_aux_layer_15": 0.122314453125, "loss_aux_layer_16": 0.133056640625, "loss_aux_layer_17": 0.140625, "loss_aux_layer_18": 0.149169921875, "loss_aux_layer_19": 0.15185546875, "loss_aux_layer_2": 0.0653076171875, "loss_aux_layer_20": 0.158447265625, "loss_aux_layer_21": 0.16552734375, "loss_aux_layer_22": 0.1875, "loss_aux_layer_23": 0.226806640625, "loss_aux_layer_3": 0.0772705078125, "loss_aux_layer_4": 0.0810546875, "loss_aux_layer_5": 0.08349609375, "loss_aux_layer_6": 0.08642578125, "loss_aux_layer_7": 0.0833740234375, "loss_aux_layer_8": 0.082275390625, "loss_aux_layer_9": 0.0806884765625, "step": 1443, "total_loss": 0.7709416002035141 }, { "epoch": 0.2858839833696298, "grad_norm": 1.148313283920288, "learning_rate": 5e-05, "llm_loss": 0.6850923597812653, "loss": 3.1823, "loss_aux_layer_0": 0.024566650390625, "loss_aux_layer_1": 0.054443359375, "loss_aux_layer_10": 0.082275390625, "loss_aux_layer_11": 0.0875244140625, "loss_aux_layer_12": 0.0938720703125, "loss_aux_layer_13": 0.1015625, "loss_aux_layer_14": 0.113525390625, "loss_aux_layer_15": 0.1246337890625, "loss_aux_layer_16": 0.1357421875, "loss_aux_layer_17": 0.144287109375, "loss_aux_layer_18": 0.15283203125, "loss_aux_layer_19": 0.15478515625, "loss_aux_layer_2": 0.06500244140625, "loss_aux_layer_20": 0.161865234375, "loss_aux_layer_21": 0.168212890625, "loss_aux_layer_22": 0.18896484375, "loss_aux_layer_23": 0.228515625, "loss_aux_layer_3": 0.07666015625, "loss_aux_layer_4": 0.0799560546875, "loss_aux_layer_5": 0.08203125, "loss_aux_layer_6": 0.08544921875, "loss_aux_layer_7": 0.0826416015625, "loss_aux_layer_8": 0.08154296875, "loss_aux_layer_9": 0.08056640625, "step": 1444, "total_loss": 0.7955812960863113 }, { "epoch": 0.28608196396753116, "grad_norm": 2.3021628856658936, "learning_rate": 5e-05, "llm_loss": 0.5590920075774193, "loss": 2.6658, "loss_aux_layer_0": 0.0235595703125, "loss_aux_layer_1": 0.05340576171875, "loss_aux_layer_10": 0.080078125, "loss_aux_layer_11": 0.0853271484375, "loss_aux_layer_12": 0.0914306640625, "loss_aux_layer_13": 0.098388671875, "loss_aux_layer_14": 0.10888671875, "loss_aux_layer_15": 0.1185302734375, "loss_aux_layer_16": 0.1287841796875, "loss_aux_layer_17": 0.136474609375, "loss_aux_layer_18": 0.1455078125, "loss_aux_layer_19": 0.1484375, "loss_aux_layer_2": 0.0640869140625, "loss_aux_layer_20": 0.156494140625, "loss_aux_layer_21": 0.163818359375, "loss_aux_layer_22": 0.1875, "loss_aux_layer_23": 0.22802734375, "loss_aux_layer_3": 0.075927734375, "loss_aux_layer_4": 0.0789794921875, "loss_aux_layer_5": 0.081298828125, "loss_aux_layer_6": 0.083984375, "loss_aux_layer_7": 0.08056640625, "loss_aux_layer_8": 0.0794677734375, "loss_aux_layer_9": 0.0780029296875, "step": 1445, "total_loss": 0.6664621978998184 }, { "epoch": 0.2862799445654326, "grad_norm": 1.8896938562393188, "learning_rate": 5e-05, "llm_loss": 0.5270476713776588, "loss": 2.5505, "loss_aux_layer_0": 0.025238037109375, "loss_aux_layer_1": 0.05560302734375, "loss_aux_layer_10": 0.0830078125, "loss_aux_layer_11": 0.0882568359375, "loss_aux_layer_12": 0.09423828125, "loss_aux_layer_13": 0.1014404296875, "loss_aux_layer_14": 0.1126708984375, "loss_aux_layer_15": 0.12353515625, "loss_aux_layer_16": 0.1337890625, "loss_aux_layer_17": 0.141357421875, "loss_aux_layer_18": 0.150146484375, "loss_aux_layer_19": 0.15283203125, "loss_aux_layer_2": 0.066650390625, "loss_aux_layer_20": 0.159912109375, "loss_aux_layer_21": 0.16650390625, "loss_aux_layer_22": 0.18701171875, "loss_aux_layer_23": 0.2275390625, "loss_aux_layer_3": 0.079345703125, "loss_aux_layer_4": 0.082275390625, "loss_aux_layer_5": 0.084716796875, "loss_aux_layer_6": 0.0880126953125, "loss_aux_layer_7": 0.08447265625, "loss_aux_layer_8": 0.08349609375, "loss_aux_layer_9": 0.0819091796875, "step": 1446, "total_loss": 0.6376373171806335 }, { "epoch": 0.286477925163334, "grad_norm": 0.9970418810844421, "learning_rate": 5e-05, "llm_loss": 0.5843954905867577, "loss": 2.7778, "loss_aux_layer_0": 0.0225830078125, "loss_aux_layer_1": 0.05450439453125, "loss_aux_layer_10": 0.0819091796875, "loss_aux_layer_11": 0.087158203125, "loss_aux_layer_12": 0.0936279296875, "loss_aux_layer_13": 0.1009521484375, "loss_aux_layer_14": 0.1124267578125, "loss_aux_layer_15": 0.12353515625, "loss_aux_layer_16": 0.134521484375, "loss_aux_layer_17": 0.142333984375, "loss_aux_layer_18": 0.15185546875, "loss_aux_layer_19": 0.154541015625, "loss_aux_layer_2": 0.0653076171875, "loss_aux_layer_20": 0.160888671875, "loss_aux_layer_21": 0.1669921875, "loss_aux_layer_22": 0.186767578125, "loss_aux_layer_23": 0.2265625, "loss_aux_layer_3": 0.077880859375, "loss_aux_layer_4": 0.0811767578125, "loss_aux_layer_5": 0.0831298828125, "loss_aux_layer_6": 0.08642578125, "loss_aux_layer_7": 0.0833740234375, "loss_aux_layer_8": 0.0821533203125, "loss_aux_layer_9": 0.0809326171875, "step": 1447, "total_loss": 0.694439485669136 }, { "epoch": 0.2866759057612354, "grad_norm": 1.1972453594207764, "learning_rate": 5e-05, "llm_loss": 0.5719684064388275, "loss": 2.7243, "loss_aux_layer_0": 0.024383544921875, "loss_aux_layer_1": 0.05499267578125, "loss_aux_layer_10": 0.081787109375, "loss_aux_layer_11": 0.0869140625, "loss_aux_layer_12": 0.0928955078125, "loss_aux_layer_13": 0.100341796875, "loss_aux_layer_14": 0.1109619140625, "loss_aux_layer_15": 0.1214599609375, "loss_aux_layer_16": 0.1318359375, "loss_aux_layer_17": 0.138916015625, "loss_aux_layer_18": 0.1474609375, "loss_aux_layer_19": 0.150390625, "loss_aux_layer_2": 0.066162109375, "loss_aux_layer_20": 0.157470703125, "loss_aux_layer_21": 0.16455078125, "loss_aux_layer_22": 0.186279296875, "loss_aux_layer_23": 0.225830078125, "loss_aux_layer_3": 0.078125, "loss_aux_layer_4": 0.081787109375, "loss_aux_layer_5": 0.0841064453125, "loss_aux_layer_6": 0.086669921875, "loss_aux_layer_7": 0.0830078125, "loss_aux_layer_8": 0.0821533203125, "loss_aux_layer_9": 0.0806884765625, "step": 1448, "total_loss": 0.6810730546712875 }, { "epoch": 0.2868738863591368, "grad_norm": 1.1800470352172852, "learning_rate": 5e-05, "llm_loss": 0.6059781461954117, "loss": 2.8542, "loss_aux_layer_0": 0.024383544921875, "loss_aux_layer_1": 0.051513671875, "loss_aux_layer_10": 0.0802001953125, "loss_aux_layer_11": 0.0849609375, "loss_aux_layer_12": 0.091552734375, "loss_aux_layer_13": 0.0985107421875, "loss_aux_layer_14": 0.110107421875, "loss_aux_layer_15": 0.1204833984375, "loss_aux_layer_16": 0.131103515625, "loss_aux_layer_17": 0.138671875, "loss_aux_layer_18": 0.14697265625, "loss_aux_layer_19": 0.150634765625, "loss_aux_layer_2": 0.0623779296875, "loss_aux_layer_20": 0.157958984375, "loss_aux_layer_21": 0.164794921875, "loss_aux_layer_22": 0.18603515625, "loss_aux_layer_23": 0.22705078125, "loss_aux_layer_3": 0.07421875, "loss_aux_layer_4": 0.07763671875, "loss_aux_layer_5": 0.080078125, "loss_aux_layer_6": 0.08349609375, "loss_aux_layer_7": 0.080322265625, "loss_aux_layer_8": 0.0797119140625, "loss_aux_layer_9": 0.0787353515625, "step": 1449, "total_loss": 0.7135481685400009 }, { "epoch": 0.28707186695703824, "grad_norm": 1.3174009323120117, "learning_rate": 5e-05, "llm_loss": 0.6507560312747955, "loss": 3.032, "loss_aux_layer_0": 0.024505615234375, "loss_aux_layer_1": 0.05426025390625, "loss_aux_layer_10": 0.0802001953125, "loss_aux_layer_11": 0.0853271484375, "loss_aux_layer_12": 0.0914306640625, "loss_aux_layer_13": 0.09814453125, "loss_aux_layer_14": 0.1082763671875, "loss_aux_layer_15": 0.1182861328125, "loss_aux_layer_16": 0.128173828125, "loss_aux_layer_17": 0.135986328125, "loss_aux_layer_18": 0.144775390625, "loss_aux_layer_19": 0.147216796875, "loss_aux_layer_2": 0.065673828125, "loss_aux_layer_20": 0.154541015625, "loss_aux_layer_21": 0.16259765625, "loss_aux_layer_22": 0.185546875, "loss_aux_layer_23": 0.226806640625, "loss_aux_layer_3": 0.0770263671875, "loss_aux_layer_4": 0.079833984375, "loss_aux_layer_5": 0.0814208984375, "loss_aux_layer_6": 0.084228515625, "loss_aux_layer_7": 0.0809326171875, "loss_aux_layer_8": 0.0799560546875, "loss_aux_layer_9": 0.0784912109375, "step": 1450, "total_loss": 0.7580060809850693 }, { "epoch": 0.2872698475549396, "grad_norm": 1.3941956758499146, "learning_rate": 5e-05, "llm_loss": 0.6071843951940536, "loss": 2.873, "loss_aux_layer_0": 0.0228271484375, "loss_aux_layer_1": 0.0552978515625, "loss_aux_layer_10": 0.0831298828125, "loss_aux_layer_11": 0.0885009765625, "loss_aux_layer_12": 0.094970703125, "loss_aux_layer_13": 0.1024169921875, "loss_aux_layer_14": 0.1136474609375, "loss_aux_layer_15": 0.12451171875, "loss_aux_layer_16": 0.135009765625, "loss_aux_layer_17": 0.14208984375, "loss_aux_layer_18": 0.151611328125, "loss_aux_layer_19": 0.154296875, "loss_aux_layer_2": 0.0672607421875, "loss_aux_layer_20": 0.1611328125, "loss_aux_layer_21": 0.16796875, "loss_aux_layer_22": 0.190185546875, "loss_aux_layer_23": 0.230224609375, "loss_aux_layer_3": 0.0789794921875, "loss_aux_layer_4": 0.0819091796875, "loss_aux_layer_5": 0.0836181640625, "loss_aux_layer_6": 0.08642578125, "loss_aux_layer_7": 0.083740234375, "loss_aux_layer_8": 0.0828857421875, "loss_aux_layer_9": 0.08154296875, "step": 1451, "total_loss": 0.7182391285896301 }, { "epoch": 0.287467828152841, "grad_norm": 1.3800166845321655, "learning_rate": 5e-05, "llm_loss": 0.6576045453548431, "loss": 3.0555, "loss_aux_layer_0": 0.0240478515625, "loss_aux_layer_1": 0.0521240234375, "loss_aux_layer_10": 0.0784912109375, "loss_aux_layer_11": 0.0836181640625, "loss_aux_layer_12": 0.08984375, "loss_aux_layer_13": 0.097412109375, "loss_aux_layer_14": 0.10888671875, "loss_aux_layer_15": 0.11962890625, "loss_aux_layer_16": 0.1309814453125, "loss_aux_layer_17": 0.138427734375, "loss_aux_layer_18": 0.147216796875, "loss_aux_layer_19": 0.1494140625, "loss_aux_layer_2": 0.0615234375, "loss_aux_layer_20": 0.156494140625, "loss_aux_layer_21": 0.162841796875, "loss_aux_layer_22": 0.182861328125, "loss_aux_layer_23": 0.221923828125, "loss_aux_layer_3": 0.07373046875, "loss_aux_layer_4": 0.07666015625, "loss_aux_layer_5": 0.078857421875, "loss_aux_layer_6": 0.0816650390625, "loss_aux_layer_7": 0.07861328125, "loss_aux_layer_8": 0.07763671875, "loss_aux_layer_9": 0.07666015625, "step": 1452, "total_loss": 0.7638747096061707 }, { "epoch": 0.28766580875074244, "grad_norm": 1.4810960292816162, "learning_rate": 5e-05, "llm_loss": 0.687204048037529, "loss": 3.1926, "loss_aux_layer_0": 0.023712158203125, "loss_aux_layer_1": 0.0545654296875, "loss_aux_layer_10": 0.083251953125, "loss_aux_layer_11": 0.08837890625, "loss_aux_layer_12": 0.0950927734375, "loss_aux_layer_13": 0.1026611328125, "loss_aux_layer_14": 0.1134033203125, "loss_aux_layer_15": 0.124267578125, "loss_aux_layer_16": 0.135009765625, "loss_aux_layer_17": 0.1435546875, "loss_aux_layer_18": 0.152099609375, "loss_aux_layer_19": 0.154052734375, "loss_aux_layer_2": 0.06561279296875, "loss_aux_layer_20": 0.16064453125, "loss_aux_layer_21": 0.167236328125, "loss_aux_layer_22": 0.190185546875, "loss_aux_layer_23": 0.231201171875, "loss_aux_layer_3": 0.0782470703125, "loss_aux_layer_4": 0.08154296875, "loss_aux_layer_5": 0.0833740234375, "loss_aux_layer_6": 0.0865478515625, "loss_aux_layer_7": 0.08349609375, "loss_aux_layer_8": 0.0826416015625, "loss_aux_layer_9": 0.08154296875, "step": 1453, "total_loss": 0.7981491088867188 }, { "epoch": 0.2878637893486438, "grad_norm": 1.286110281944275, "learning_rate": 5e-05, "llm_loss": 0.6351252049207687, "loss": 2.9742, "loss_aux_layer_0": 0.024871826171875, "loss_aux_layer_1": 0.056396484375, "loss_aux_layer_10": 0.08154296875, "loss_aux_layer_11": 0.0867919921875, "loss_aux_layer_12": 0.0927734375, "loss_aux_layer_13": 0.0994873046875, "loss_aux_layer_14": 0.1102294921875, "loss_aux_layer_15": 0.1207275390625, "loss_aux_layer_16": 0.130615234375, "loss_aux_layer_17": 0.138427734375, "loss_aux_layer_18": 0.14697265625, "loss_aux_layer_19": 0.149169921875, "loss_aux_layer_2": 0.06689453125, "loss_aux_layer_20": 0.155517578125, "loss_aux_layer_21": 0.16162109375, "loss_aux_layer_22": 0.182373046875, "loss_aux_layer_23": 0.22119140625, "loss_aux_layer_3": 0.0791015625, "loss_aux_layer_4": 0.0821533203125, "loss_aux_layer_5": 0.083740234375, "loss_aux_layer_6": 0.08642578125, "loss_aux_layer_7": 0.0828857421875, "loss_aux_layer_8": 0.0819091796875, "loss_aux_layer_9": 0.0802001953125, "step": 1454, "total_loss": 0.7435476183891296 }, { "epoch": 0.28806176994654525, "grad_norm": 1.421482801437378, "learning_rate": 5e-05, "llm_loss": 0.5813621282577515, "loss": 2.7844, "loss_aux_layer_0": 0.02337646484375, "loss_aux_layer_1": 0.0579833984375, "loss_aux_layer_10": 0.087890625, "loss_aux_layer_11": 0.0927734375, "loss_aux_layer_12": 0.0994873046875, "loss_aux_layer_13": 0.1070556640625, "loss_aux_layer_14": 0.1182861328125, "loss_aux_layer_15": 0.1287841796875, "loss_aux_layer_16": 0.13916015625, "loss_aux_layer_17": 0.146484375, "loss_aux_layer_18": 0.15576171875, "loss_aux_layer_19": 0.1572265625, "loss_aux_layer_2": 0.06884765625, "loss_aux_layer_20": 0.16357421875, "loss_aux_layer_21": 0.17041015625, "loss_aux_layer_22": 0.19189453125, "loss_aux_layer_23": 0.23193359375, "loss_aux_layer_3": 0.0828857421875, "loss_aux_layer_4": 0.08642578125, "loss_aux_layer_5": 0.0887451171875, "loss_aux_layer_6": 0.0921630859375, "loss_aux_layer_7": 0.089111328125, "loss_aux_layer_8": 0.0875244140625, "loss_aux_layer_9": 0.0863037109375, "step": 1455, "total_loss": 0.6960983723402023 }, { "epoch": 0.28825975054444664, "grad_norm": 1.68048894405365, "learning_rate": 5e-05, "llm_loss": 0.7045239582657814, "loss": 3.2746, "loss_aux_layer_0": 0.023284912109375, "loss_aux_layer_1": 0.05645751953125, "loss_aux_layer_10": 0.08642578125, "loss_aux_layer_11": 0.092041015625, "loss_aux_layer_12": 0.098876953125, "loss_aux_layer_13": 0.106689453125, "loss_aux_layer_14": 0.1180419921875, "loss_aux_layer_15": 0.1292724609375, "loss_aux_layer_16": 0.139892578125, "loss_aux_layer_17": 0.148193359375, "loss_aux_layer_18": 0.156005859375, "loss_aux_layer_19": 0.15771484375, "loss_aux_layer_2": 0.0689697265625, "loss_aux_layer_20": 0.16357421875, "loss_aux_layer_21": 0.169677734375, "loss_aux_layer_22": 0.191162109375, "loss_aux_layer_23": 0.232177734375, "loss_aux_layer_3": 0.081298828125, "loss_aux_layer_4": 0.084716796875, "loss_aux_layer_5": 0.0872802734375, "loss_aux_layer_6": 0.0899658203125, "loss_aux_layer_7": 0.086669921875, "loss_aux_layer_8": 0.085693359375, "loss_aux_layer_9": 0.0848388671875, "step": 1456, "total_loss": 0.8186410814523697 }, { "epoch": 0.2884577311423481, "grad_norm": 1.610737681388855, "learning_rate": 5e-05, "llm_loss": 0.59966941177845, "loss": 2.8484, "loss_aux_layer_0": 0.023712158203125, "loss_aux_layer_1": 0.05548095703125, "loss_aux_layer_10": 0.0849609375, "loss_aux_layer_11": 0.0904541015625, "loss_aux_layer_12": 0.0966796875, "loss_aux_layer_13": 0.104248046875, "loss_aux_layer_14": 0.114990234375, "loss_aux_layer_15": 0.125244140625, "loss_aux_layer_16": 0.13623046875, "loss_aux_layer_17": 0.144287109375, "loss_aux_layer_18": 0.152099609375, "loss_aux_layer_19": 0.154541015625, "loss_aux_layer_2": 0.0679931640625, "loss_aux_layer_20": 0.161865234375, "loss_aux_layer_21": 0.168701171875, "loss_aux_layer_22": 0.1904296875, "loss_aux_layer_23": 0.232177734375, "loss_aux_layer_3": 0.0810546875, "loss_aux_layer_4": 0.0843505859375, "loss_aux_layer_5": 0.0860595703125, "loss_aux_layer_6": 0.0888671875, "loss_aux_layer_7": 0.0855712890625, "loss_aux_layer_8": 0.0845947265625, "loss_aux_layer_9": 0.08349609375, "step": 1457, "total_loss": 0.712090253829956 }, { "epoch": 0.28865571174024945, "grad_norm": 1.039942741394043, "learning_rate": 5e-05, "llm_loss": 0.5667577460408211, "loss": 2.6918, "loss_aux_layer_0": 0.0240478515625, "loss_aux_layer_1": 0.0518798828125, "loss_aux_layer_10": 0.0775146484375, "loss_aux_layer_11": 0.0823974609375, "loss_aux_layer_12": 0.0888671875, "loss_aux_layer_13": 0.0970458984375, "loss_aux_layer_14": 0.1080322265625, "loss_aux_layer_15": 0.118896484375, "loss_aux_layer_16": 0.1297607421875, "loss_aux_layer_17": 0.137939453125, "loss_aux_layer_18": 0.146484375, "loss_aux_layer_19": 0.149658203125, "loss_aux_layer_2": 0.06201171875, "loss_aux_layer_20": 0.15673828125, "loss_aux_layer_21": 0.164306640625, "loss_aux_layer_22": 0.185302734375, "loss_aux_layer_23": 0.22607421875, "loss_aux_layer_3": 0.07373046875, "loss_aux_layer_4": 0.0765380859375, "loss_aux_layer_5": 0.078369140625, "loss_aux_layer_6": 0.0811767578125, "loss_aux_layer_7": 0.078125, "loss_aux_layer_8": 0.0772705078125, "loss_aux_layer_9": 0.0760498046875, "step": 1458, "total_loss": 0.6729468405246735 }, { "epoch": 0.28885369233815084, "grad_norm": 1.485703706741333, "learning_rate": 5e-05, "llm_loss": 0.5684695541858673, "loss": 2.7194, "loss_aux_layer_0": 0.025726318359375, "loss_aux_layer_1": 0.0574951171875, "loss_aux_layer_10": 0.08447265625, "loss_aux_layer_11": 0.0897216796875, "loss_aux_layer_12": 0.0960693359375, "loss_aux_layer_13": 0.1031494140625, "loss_aux_layer_14": 0.11376953125, "loss_aux_layer_15": 0.1234130859375, "loss_aux_layer_16": 0.1337890625, "loss_aux_layer_17": 0.14111328125, "loss_aux_layer_18": 0.1494140625, "loss_aux_layer_19": 0.1513671875, "loss_aux_layer_2": 0.0682373046875, "loss_aux_layer_20": 0.15869140625, "loss_aux_layer_21": 0.166015625, "loss_aux_layer_22": 0.187255859375, "loss_aux_layer_23": 0.228515625, "loss_aux_layer_3": 0.0811767578125, "loss_aux_layer_4": 0.0843505859375, "loss_aux_layer_5": 0.086181640625, "loss_aux_layer_6": 0.089111328125, "loss_aux_layer_7": 0.0859375, "loss_aux_layer_8": 0.0845947265625, "loss_aux_layer_9": 0.083251953125, "step": 1459, "total_loss": 0.6798425018787384 }, { "epoch": 0.2890516729360523, "grad_norm": 1.03111732006073, "learning_rate": 5e-05, "llm_loss": 0.6016290783882141, "loss": 2.8484, "loss_aux_layer_0": 0.024169921875, "loss_aux_layer_1": 0.05615234375, "loss_aux_layer_10": 0.083740234375, "loss_aux_layer_11": 0.0888671875, "loss_aux_layer_12": 0.094970703125, "loss_aux_layer_13": 0.1024169921875, "loss_aux_layer_14": 0.112548828125, "loss_aux_layer_15": 0.1224365234375, "loss_aux_layer_16": 0.13330078125, "loss_aux_layer_17": 0.14013671875, "loss_aux_layer_18": 0.1494140625, "loss_aux_layer_19": 0.151611328125, "loss_aux_layer_2": 0.067626953125, "loss_aux_layer_20": 0.15869140625, "loss_aux_layer_21": 0.165283203125, "loss_aux_layer_22": 0.18603515625, "loss_aux_layer_23": 0.226806640625, "loss_aux_layer_3": 0.07958984375, "loss_aux_layer_4": 0.0830078125, "loss_aux_layer_5": 0.0849609375, "loss_aux_layer_6": 0.087890625, "loss_aux_layer_7": 0.0848388671875, "loss_aux_layer_8": 0.0838623046875, "loss_aux_layer_9": 0.0826416015625, "step": 1460, "total_loss": 0.7121109664440155 }, { "epoch": 0.28924965353395365, "grad_norm": 1.1770209074020386, "learning_rate": 5e-05, "llm_loss": 0.6666563451290131, "loss": 3.083, "loss_aux_layer_0": 0.02294921875, "loss_aux_layer_1": 0.05096435546875, "loss_aux_layer_10": 0.0771484375, "loss_aux_layer_11": 0.0816650390625, "loss_aux_layer_12": 0.087646484375, "loss_aux_layer_13": 0.09423828125, "loss_aux_layer_14": 0.1044921875, "loss_aux_layer_15": 0.1146240234375, "loss_aux_layer_16": 0.125, "loss_aux_layer_17": 0.13330078125, "loss_aux_layer_18": 0.141845703125, "loss_aux_layer_19": 0.14501953125, "loss_aux_layer_2": 0.06219482421875, "loss_aux_layer_20": 0.15234375, "loss_aux_layer_21": 0.15966796875, "loss_aux_layer_22": 0.18017578125, "loss_aux_layer_23": 0.220458984375, "loss_aux_layer_3": 0.0740966796875, "loss_aux_layer_4": 0.0771484375, "loss_aux_layer_5": 0.078857421875, "loss_aux_layer_6": 0.081787109375, "loss_aux_layer_7": 0.0784912109375, "loss_aux_layer_8": 0.0772705078125, "loss_aux_layer_9": 0.075927734375, "step": 1461, "total_loss": 0.7707469016313553 }, { "epoch": 0.2894476341318551, "grad_norm": 0.8965128660202026, "learning_rate": 5e-05, "llm_loss": 0.6194035410881042, "loss": 2.9141, "loss_aux_layer_0": 0.02264404296875, "loss_aux_layer_1": 0.0552978515625, "loss_aux_layer_10": 0.08349609375, "loss_aux_layer_11": 0.0887451171875, "loss_aux_layer_12": 0.094970703125, "loss_aux_layer_13": 0.101806640625, "loss_aux_layer_14": 0.111572265625, "loss_aux_layer_15": 0.120849609375, "loss_aux_layer_16": 0.13134765625, "loss_aux_layer_17": 0.1396484375, "loss_aux_layer_18": 0.147705078125, "loss_aux_layer_19": 0.149169921875, "loss_aux_layer_2": 0.06646728515625, "loss_aux_layer_20": 0.15576171875, "loss_aux_layer_21": 0.161865234375, "loss_aux_layer_22": 0.183837890625, "loss_aux_layer_23": 0.2236328125, "loss_aux_layer_3": 0.0789794921875, "loss_aux_layer_4": 0.082275390625, "loss_aux_layer_5": 0.0838623046875, "loss_aux_layer_6": 0.0867919921875, "loss_aux_layer_7": 0.083740234375, "loss_aux_layer_8": 0.082763671875, "loss_aux_layer_9": 0.0819091796875, "step": 1462, "total_loss": 0.7285275906324387 }, { "epoch": 0.2896456147297565, "grad_norm": 1.2927452325820923, "learning_rate": 5e-05, "llm_loss": 0.6206945776939392, "loss": 2.9257, "loss_aux_layer_0": 0.0240478515625, "loss_aux_layer_1": 0.0555419921875, "loss_aux_layer_10": 0.0831298828125, "loss_aux_layer_11": 0.0885009765625, "loss_aux_layer_12": 0.0948486328125, "loss_aux_layer_13": 0.10205078125, "loss_aux_layer_14": 0.11279296875, "loss_aux_layer_15": 0.123046875, "loss_aux_layer_16": 0.133544921875, "loss_aux_layer_17": 0.141845703125, "loss_aux_layer_18": 0.150146484375, "loss_aux_layer_19": 0.15234375, "loss_aux_layer_2": 0.0672607421875, "loss_aux_layer_20": 0.159912109375, "loss_aux_layer_21": 0.16748046875, "loss_aux_layer_22": 0.1904296875, "loss_aux_layer_23": 0.231689453125, "loss_aux_layer_3": 0.0791015625, "loss_aux_layer_4": 0.0819091796875, "loss_aux_layer_5": 0.0838623046875, "loss_aux_layer_6": 0.086669921875, "loss_aux_layer_7": 0.08349609375, "loss_aux_layer_8": 0.082763671875, "loss_aux_layer_9": 0.0816650390625, "step": 1463, "total_loss": 0.7314363718032837 }, { "epoch": 0.2898435953276579, "grad_norm": 1.1121532917022705, "learning_rate": 5e-05, "llm_loss": 0.5517299920320511, "loss": 2.6509, "loss_aux_layer_0": 0.024810791015625, "loss_aux_layer_1": 0.05615234375, "loss_aux_layer_10": 0.083984375, "loss_aux_layer_11": 0.089599609375, "loss_aux_layer_12": 0.095703125, "loss_aux_layer_13": 0.1024169921875, "loss_aux_layer_14": 0.1131591796875, "loss_aux_layer_15": 0.1240234375, "loss_aux_layer_16": 0.13427734375, "loss_aux_layer_17": 0.141845703125, "loss_aux_layer_18": 0.14990234375, "loss_aux_layer_19": 0.1513671875, "loss_aux_layer_2": 0.06787109375, "loss_aux_layer_20": 0.157958984375, "loss_aux_layer_21": 0.165283203125, "loss_aux_layer_22": 0.187744140625, "loss_aux_layer_23": 0.228759765625, "loss_aux_layer_3": 0.080078125, "loss_aux_layer_4": 0.0831298828125, "loss_aux_layer_5": 0.085205078125, "loss_aux_layer_6": 0.08837890625, "loss_aux_layer_7": 0.085205078125, "loss_aux_layer_8": 0.083984375, "loss_aux_layer_9": 0.0826416015625, "step": 1464, "total_loss": 0.6627279967069626 }, { "epoch": 0.2900415759255593, "grad_norm": 0.9662739634513855, "learning_rate": 5e-05, "llm_loss": 0.6131722256541252, "loss": 2.8791, "loss_aux_layer_0": 0.02288818359375, "loss_aux_layer_1": 0.0523681640625, "loss_aux_layer_10": 0.0789794921875, "loss_aux_layer_11": 0.084228515625, "loss_aux_layer_12": 0.09033203125, "loss_aux_layer_13": 0.0975341796875, "loss_aux_layer_14": 0.10791015625, "loss_aux_layer_15": 0.1182861328125, "loss_aux_layer_16": 0.128662109375, "loss_aux_layer_17": 0.13720703125, "loss_aux_layer_18": 0.146240234375, "loss_aux_layer_19": 0.149658203125, "loss_aux_layer_2": 0.0626220703125, "loss_aux_layer_20": 0.156982421875, "loss_aux_layer_21": 0.164794921875, "loss_aux_layer_22": 0.18505859375, "loss_aux_layer_23": 0.225341796875, "loss_aux_layer_3": 0.074462890625, "loss_aux_layer_4": 0.07763671875, "loss_aux_layer_5": 0.07958984375, "loss_aux_layer_6": 0.082275390625, "loss_aux_layer_7": 0.079345703125, "loss_aux_layer_8": 0.0787353515625, "loss_aux_layer_9": 0.077392578125, "step": 1465, "total_loss": 0.7197683751583099 }, { "epoch": 0.2902395565234607, "grad_norm": 1.4459172487258911, "learning_rate": 5e-05, "llm_loss": 0.6223901957273483, "loss": 2.8936, "loss_aux_layer_0": 0.02410888671875, "loss_aux_layer_1": 0.04949951171875, "loss_aux_layer_10": 0.073974609375, "loss_aux_layer_11": 0.07861328125, "loss_aux_layer_12": 0.0843505859375, "loss_aux_layer_13": 0.0911865234375, "loss_aux_layer_14": 0.1015625, "loss_aux_layer_15": 0.11181640625, "loss_aux_layer_16": 0.1226806640625, "loss_aux_layer_17": 0.1312255859375, "loss_aux_layer_18": 0.1396484375, "loss_aux_layer_19": 0.142578125, "loss_aux_layer_2": 0.05828857421875, "loss_aux_layer_20": 0.150146484375, "loss_aux_layer_21": 0.157470703125, "loss_aux_layer_22": 0.177734375, "loss_aux_layer_23": 0.21630859375, "loss_aux_layer_3": 0.06964111328125, "loss_aux_layer_4": 0.072509765625, "loss_aux_layer_5": 0.0740966796875, "loss_aux_layer_6": 0.0767822265625, "loss_aux_layer_7": 0.073974609375, "loss_aux_layer_8": 0.0733642578125, "loss_aux_layer_9": 0.072265625, "step": 1466, "total_loss": 0.7234114557504654 }, { "epoch": 0.2904375371213621, "grad_norm": 0.8956267237663269, "learning_rate": 5e-05, "llm_loss": 0.5661024302244186, "loss": 2.7112, "loss_aux_layer_0": 0.022735595703125, "loss_aux_layer_1": 0.05517578125, "loss_aux_layer_10": 0.084228515625, "loss_aux_layer_11": 0.0894775390625, "loss_aux_layer_12": 0.0955810546875, "loss_aux_layer_13": 0.103271484375, "loss_aux_layer_14": 0.1141357421875, "loss_aux_layer_15": 0.1248779296875, "loss_aux_layer_16": 0.135498046875, "loss_aux_layer_17": 0.142822265625, "loss_aux_layer_18": 0.15185546875, "loss_aux_layer_19": 0.154296875, "loss_aux_layer_2": 0.06634521484375, "loss_aux_layer_20": 0.1611328125, "loss_aux_layer_21": 0.16845703125, "loss_aux_layer_22": 0.1923828125, "loss_aux_layer_23": 0.233642578125, "loss_aux_layer_3": 0.0789794921875, "loss_aux_layer_4": 0.0823974609375, "loss_aux_layer_5": 0.0845947265625, "loss_aux_layer_6": 0.0880126953125, "loss_aux_layer_7": 0.0849609375, "loss_aux_layer_8": 0.083740234375, "loss_aux_layer_9": 0.0823974609375, "step": 1467, "total_loss": 0.6778106093406677 }, { "epoch": 0.2906355177192635, "grad_norm": 1.47757089138031, "learning_rate": 5e-05, "llm_loss": 0.5968684032559395, "loss": 2.8144, "loss_aux_layer_0": 0.022979736328125, "loss_aux_layer_1": 0.05255126953125, "loss_aux_layer_10": 0.07958984375, "loss_aux_layer_11": 0.083984375, "loss_aux_layer_12": 0.0902099609375, "loss_aux_layer_13": 0.0972900390625, "loss_aux_layer_14": 0.1082763671875, "loss_aux_layer_15": 0.118408203125, "loss_aux_layer_16": 0.12890625, "loss_aux_layer_17": 0.13720703125, "loss_aux_layer_18": 0.146728515625, "loss_aux_layer_19": 0.150146484375, "loss_aux_layer_2": 0.0631103515625, "loss_aux_layer_20": 0.1572265625, "loss_aux_layer_21": 0.1640625, "loss_aux_layer_22": 0.183837890625, "loss_aux_layer_23": 0.22314453125, "loss_aux_layer_3": 0.0750732421875, "loss_aux_layer_4": 0.078369140625, "loss_aux_layer_5": 0.080322265625, "loss_aux_layer_6": 0.0831298828125, "loss_aux_layer_7": 0.0802001953125, "loss_aux_layer_8": 0.0789794921875, "loss_aux_layer_9": 0.077880859375, "step": 1468, "total_loss": 0.7035885453224182 }, { "epoch": 0.29083349831716493, "grad_norm": 1.4645029306411743, "learning_rate": 5e-05, "llm_loss": 0.5269241780042648, "loss": 2.5412, "loss_aux_layer_0": 0.02294921875, "loss_aux_layer_1": 0.053466796875, "loss_aux_layer_10": 0.0804443359375, "loss_aux_layer_11": 0.0855712890625, "loss_aux_layer_12": 0.09228515625, "loss_aux_layer_13": 0.0994873046875, "loss_aux_layer_14": 0.110107421875, "loss_aux_layer_15": 0.1207275390625, "loss_aux_layer_16": 0.1317138671875, "loss_aux_layer_17": 0.1396484375, "loss_aux_layer_18": 0.1484375, "loss_aux_layer_19": 0.1513671875, "loss_aux_layer_2": 0.06488037109375, "loss_aux_layer_20": 0.15869140625, "loss_aux_layer_21": 0.166259765625, "loss_aux_layer_22": 0.18798828125, "loss_aux_layer_23": 0.228271484375, "loss_aux_layer_3": 0.076171875, "loss_aux_layer_4": 0.0792236328125, "loss_aux_layer_5": 0.080810546875, "loss_aux_layer_6": 0.083740234375, "loss_aux_layer_7": 0.0809326171875, "loss_aux_layer_8": 0.0799560546875, "loss_aux_layer_9": 0.078857421875, "step": 1469, "total_loss": 0.6353006958961487 }, { "epoch": 0.2910314789150663, "grad_norm": 1.553648591041565, "learning_rate": 5e-05, "llm_loss": 0.6762530952692032, "loss": 3.1412, "loss_aux_layer_0": 0.023040771484375, "loss_aux_layer_1": 0.05303955078125, "loss_aux_layer_10": 0.081298828125, "loss_aux_layer_11": 0.0865478515625, "loss_aux_layer_12": 0.0927734375, "loss_aux_layer_13": 0.099853515625, "loss_aux_layer_14": 0.1107177734375, "loss_aux_layer_15": 0.12109375, "loss_aux_layer_16": 0.13232421875, "loss_aux_layer_17": 0.140380859375, "loss_aux_layer_18": 0.14990234375, "loss_aux_layer_19": 0.15283203125, "loss_aux_layer_2": 0.064697265625, "loss_aux_layer_20": 0.159912109375, "loss_aux_layer_21": 0.16650390625, "loss_aux_layer_22": 0.1875, "loss_aux_layer_23": 0.2275390625, "loss_aux_layer_3": 0.0771484375, "loss_aux_layer_4": 0.080078125, "loss_aux_layer_5": 0.08203125, "loss_aux_layer_6": 0.0853271484375, "loss_aux_layer_7": 0.082275390625, "loss_aux_layer_8": 0.0811767578125, "loss_aux_layer_9": 0.0802001953125, "step": 1470, "total_loss": 0.7853041440248489 }, { "epoch": 0.29122945951296775, "grad_norm": 1.3350054025650024, "learning_rate": 5e-05, "llm_loss": 0.5854842811822891, "loss": 2.7765, "loss_aux_layer_0": 0.023712158203125, "loss_aux_layer_1": 0.05340576171875, "loss_aux_layer_10": 0.0794677734375, "loss_aux_layer_11": 0.0848388671875, "loss_aux_layer_12": 0.0911865234375, "loss_aux_layer_13": 0.098388671875, "loss_aux_layer_14": 0.110107421875, "loss_aux_layer_15": 0.1209716796875, "loss_aux_layer_16": 0.13232421875, "loss_aux_layer_17": 0.14013671875, "loss_aux_layer_18": 0.14990234375, "loss_aux_layer_19": 0.153076171875, "loss_aux_layer_2": 0.06396484375, "loss_aux_layer_20": 0.1611328125, "loss_aux_layer_21": 0.16796875, "loss_aux_layer_22": 0.190185546875, "loss_aux_layer_23": 0.23095703125, "loss_aux_layer_3": 0.075439453125, "loss_aux_layer_4": 0.0780029296875, "loss_aux_layer_5": 0.0802001953125, "loss_aux_layer_6": 0.0836181640625, "loss_aux_layer_7": 0.0804443359375, "loss_aux_layer_8": 0.07958984375, "loss_aux_layer_9": 0.0782470703125, "step": 1471, "total_loss": 0.6941149085760117 }, { "epoch": 0.29142744011086913, "grad_norm": 1.3970764875411987, "learning_rate": 5e-05, "llm_loss": 0.6251011788845062, "loss": 2.9371, "loss_aux_layer_0": 0.022491455078125, "loss_aux_layer_1": 0.05389404296875, "loss_aux_layer_10": 0.0811767578125, "loss_aux_layer_11": 0.0865478515625, "loss_aux_layer_12": 0.0933837890625, "loss_aux_layer_13": 0.1009521484375, "loss_aux_layer_14": 0.1119384765625, "loss_aux_layer_15": 0.123046875, "loss_aux_layer_16": 0.133544921875, "loss_aux_layer_17": 0.1416015625, "loss_aux_layer_18": 0.149658203125, "loss_aux_layer_19": 0.151611328125, "loss_aux_layer_2": 0.06524658203125, "loss_aux_layer_20": 0.158447265625, "loss_aux_layer_21": 0.165771484375, "loss_aux_layer_22": 0.18701171875, "loss_aux_layer_23": 0.227294921875, "loss_aux_layer_3": 0.076904296875, "loss_aux_layer_4": 0.0802001953125, "loss_aux_layer_5": 0.0821533203125, "loss_aux_layer_6": 0.0849609375, "loss_aux_layer_7": 0.08203125, "loss_aux_layer_8": 0.0809326171875, "loss_aux_layer_9": 0.0794677734375, "step": 1472, "total_loss": 0.734271451830864 }, { "epoch": 0.29162542070877057, "grad_norm": 1.0834054946899414, "learning_rate": 5e-05, "llm_loss": 0.5935707688331604, "loss": 2.8239, "loss_aux_layer_0": 0.026947021484375, "loss_aux_layer_1": 0.0576171875, "loss_aux_layer_10": 0.085205078125, "loss_aux_layer_11": 0.0909423828125, "loss_aux_layer_12": 0.0968017578125, "loss_aux_layer_13": 0.103759765625, "loss_aux_layer_14": 0.1148681640625, "loss_aux_layer_15": 0.1248779296875, "loss_aux_layer_16": 0.135009765625, "loss_aux_layer_17": 0.142333984375, "loss_aux_layer_18": 0.151123046875, "loss_aux_layer_19": 0.153076171875, "loss_aux_layer_2": 0.0693359375, "loss_aux_layer_20": 0.15966796875, "loss_aux_layer_21": 0.166259765625, "loss_aux_layer_22": 0.189697265625, "loss_aux_layer_23": 0.232421875, "loss_aux_layer_3": 0.08154296875, "loss_aux_layer_4": 0.0848388671875, "loss_aux_layer_5": 0.0867919921875, "loss_aux_layer_6": 0.0897216796875, "loss_aux_layer_7": 0.08642578125, "loss_aux_layer_8": 0.085205078125, "loss_aux_layer_9": 0.0841064453125, "step": 1473, "total_loss": 0.7059800326824188 }, { "epoch": 0.29182340130667195, "grad_norm": 1.4692891836166382, "learning_rate": 5e-05, "llm_loss": 0.6226870715618134, "loss": 2.9336, "loss_aux_layer_0": 0.025970458984375, "loss_aux_layer_1": 0.05755615234375, "loss_aux_layer_10": 0.082763671875, "loss_aux_layer_11": 0.088134765625, "loss_aux_layer_12": 0.0943603515625, "loss_aux_layer_13": 0.101806640625, "loss_aux_layer_14": 0.1124267578125, "loss_aux_layer_15": 0.12255859375, "loss_aux_layer_16": 0.133544921875, "loss_aux_layer_17": 0.141845703125, "loss_aux_layer_18": 0.150146484375, "loss_aux_layer_19": 0.152587890625, "loss_aux_layer_2": 0.0677490234375, "loss_aux_layer_20": 0.159423828125, "loss_aux_layer_21": 0.1669921875, "loss_aux_layer_22": 0.188720703125, "loss_aux_layer_23": 0.228271484375, "loss_aux_layer_3": 0.07958984375, "loss_aux_layer_4": 0.0831298828125, "loss_aux_layer_5": 0.084716796875, "loss_aux_layer_6": 0.08740234375, "loss_aux_layer_7": 0.0838623046875, "loss_aux_layer_8": 0.0826416015625, "loss_aux_layer_9": 0.081298828125, "step": 1474, "total_loss": 0.7334118485450745 }, { "epoch": 0.29202138190457333, "grad_norm": 0.9368181228637695, "learning_rate": 5e-05, "llm_loss": 0.580928698182106, "loss": 2.7391, "loss_aux_layer_0": 0.0220947265625, "loss_aux_layer_1": 0.05059814453125, "loss_aux_layer_10": 0.0780029296875, "loss_aux_layer_11": 0.08251953125, "loss_aux_layer_12": 0.08837890625, "loss_aux_layer_13": 0.094970703125, "loss_aux_layer_14": 0.10546875, "loss_aux_layer_15": 0.115478515625, "loss_aux_layer_16": 0.12548828125, "loss_aux_layer_17": 0.133544921875, "loss_aux_layer_18": 0.141845703125, "loss_aux_layer_19": 0.14453125, "loss_aux_layer_2": 0.0611572265625, "loss_aux_layer_20": 0.1513671875, "loss_aux_layer_21": 0.157958984375, "loss_aux_layer_22": 0.17822265625, "loss_aux_layer_23": 0.218017578125, "loss_aux_layer_3": 0.0732421875, "loss_aux_layer_4": 0.0765380859375, "loss_aux_layer_5": 0.07861328125, "loss_aux_layer_6": 0.081298828125, "loss_aux_layer_7": 0.07861328125, "loss_aux_layer_8": 0.0780029296875, "loss_aux_layer_9": 0.076904296875, "step": 1475, "total_loss": 0.6847865581512451 }, { "epoch": 0.29221936250247477, "grad_norm": 1.284374475479126, "learning_rate": 5e-05, "llm_loss": 0.6652000695466995, "loss": 3.0712, "loss_aux_layer_0": 0.023284912109375, "loss_aux_layer_1": 0.0498046875, "loss_aux_layer_10": 0.0750732421875, "loss_aux_layer_11": 0.0794677734375, "loss_aux_layer_12": 0.085693359375, "loss_aux_layer_13": 0.0924072265625, "loss_aux_layer_14": 0.103271484375, "loss_aux_layer_15": 0.114013671875, "loss_aux_layer_16": 0.1248779296875, "loss_aux_layer_17": 0.133056640625, "loss_aux_layer_18": 0.141845703125, "loss_aux_layer_19": 0.14501953125, "loss_aux_layer_2": 0.0609130859375, "loss_aux_layer_20": 0.15283203125, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.178955078125, "loss_aux_layer_23": 0.2177734375, "loss_aux_layer_3": 0.0716552734375, "loss_aux_layer_4": 0.0745849609375, "loss_aux_layer_5": 0.0762939453125, "loss_aux_layer_6": 0.0789794921875, "loss_aux_layer_7": 0.0760498046875, "loss_aux_layer_8": 0.0748291015625, "loss_aux_layer_9": 0.073486328125, "step": 1476, "total_loss": 0.7678089141845703 }, { "epoch": 0.29241734310037615, "grad_norm": 1.0256985425949097, "learning_rate": 5e-05, "llm_loss": 0.7267249673604965, "loss": 3.3262, "loss_aux_layer_0": 0.02337646484375, "loss_aux_layer_1": 0.04998779296875, "loss_aux_layer_10": 0.0775146484375, "loss_aux_layer_11": 0.082275390625, "loss_aux_layer_12": 0.0885009765625, "loss_aux_layer_13": 0.094970703125, "loss_aux_layer_14": 0.1060791015625, "loss_aux_layer_15": 0.1170654296875, "loss_aux_layer_16": 0.1280517578125, "loss_aux_layer_17": 0.136474609375, "loss_aux_layer_18": 0.145263671875, "loss_aux_layer_19": 0.147705078125, "loss_aux_layer_2": 0.0618896484375, "loss_aux_layer_20": 0.155029296875, "loss_aux_layer_21": 0.1611328125, "loss_aux_layer_22": 0.181884765625, "loss_aux_layer_23": 0.220703125, "loss_aux_layer_3": 0.072998046875, "loss_aux_layer_4": 0.075927734375, "loss_aux_layer_5": 0.07763671875, "loss_aux_layer_6": 0.0806884765625, "loss_aux_layer_7": 0.07763671875, "loss_aux_layer_8": 0.0767822265625, "loss_aux_layer_9": 0.07568359375, "step": 1477, "total_loss": 0.8315567970275879 }, { "epoch": 0.2926153236982776, "grad_norm": 1.0571614503860474, "learning_rate": 5e-05, "llm_loss": 0.6251863390207291, "loss": 2.9466, "loss_aux_layer_0": 0.02276611328125, "loss_aux_layer_1": 0.05523681640625, "loss_aux_layer_10": 0.0850830078125, "loss_aux_layer_11": 0.0909423828125, "loss_aux_layer_12": 0.0972900390625, "loss_aux_layer_13": 0.1046142578125, "loss_aux_layer_14": 0.1153564453125, "loss_aux_layer_15": 0.125, "loss_aux_layer_16": 0.135009765625, "loss_aux_layer_17": 0.14208984375, "loss_aux_layer_18": 0.15087890625, "loss_aux_layer_19": 0.152099609375, "loss_aux_layer_2": 0.06768798828125, "loss_aux_layer_20": 0.159423828125, "loss_aux_layer_21": 0.166015625, "loss_aux_layer_22": 0.187255859375, "loss_aux_layer_23": 0.226806640625, "loss_aux_layer_3": 0.080078125, "loss_aux_layer_4": 0.0836181640625, "loss_aux_layer_5": 0.0858154296875, "loss_aux_layer_6": 0.088623046875, "loss_aux_layer_7": 0.0855712890625, "loss_aux_layer_8": 0.084716796875, "loss_aux_layer_9": 0.083251953125, "step": 1478, "total_loss": 0.7366611659526825 }, { "epoch": 0.29281330429617897, "grad_norm": 1.3741790056228638, "learning_rate": 5e-05, "llm_loss": 0.5911531299352646, "loss": 2.7966, "loss_aux_layer_0": 0.023223876953125, "loss_aux_layer_1": 0.0537109375, "loss_aux_layer_10": 0.080322265625, "loss_aux_layer_11": 0.085693359375, "loss_aux_layer_12": 0.09228515625, "loss_aux_layer_13": 0.10009765625, "loss_aux_layer_14": 0.1104736328125, "loss_aux_layer_15": 0.120361328125, "loss_aux_layer_16": 0.130859375, "loss_aux_layer_17": 0.138671875, "loss_aux_layer_18": 0.14697265625, "loss_aux_layer_19": 0.14892578125, "loss_aux_layer_2": 0.0638427734375, "loss_aux_layer_20": 0.15625, "loss_aux_layer_21": 0.16357421875, "loss_aux_layer_22": 0.186767578125, "loss_aux_layer_23": 0.2275390625, "loss_aux_layer_3": 0.0758056640625, "loss_aux_layer_4": 0.0792236328125, "loss_aux_layer_5": 0.081298828125, "loss_aux_layer_6": 0.08447265625, "loss_aux_layer_7": 0.0816650390625, "loss_aux_layer_8": 0.0804443359375, "loss_aux_layer_9": 0.0789794921875, "step": 1479, "total_loss": 0.6991483122110367 }, { "epoch": 0.2930112848940804, "grad_norm": 1.6257237195968628, "learning_rate": 5e-05, "llm_loss": 0.6040390282869339, "loss": 2.8398, "loss_aux_layer_0": 0.026397705078125, "loss_aux_layer_1": 0.053466796875, "loss_aux_layer_10": 0.077880859375, "loss_aux_layer_11": 0.0831298828125, "loss_aux_layer_12": 0.089111328125, "loss_aux_layer_13": 0.095947265625, "loss_aux_layer_14": 0.1064453125, "loss_aux_layer_15": 0.1173095703125, "loss_aux_layer_16": 0.12841796875, "loss_aux_layer_17": 0.13671875, "loss_aux_layer_18": 0.146240234375, "loss_aux_layer_19": 0.1494140625, "loss_aux_layer_2": 0.06231689453125, "loss_aux_layer_20": 0.156494140625, "loss_aux_layer_21": 0.162841796875, "loss_aux_layer_22": 0.183349609375, "loss_aux_layer_23": 0.222900390625, "loss_aux_layer_3": 0.0736083984375, "loss_aux_layer_4": 0.076904296875, "loss_aux_layer_5": 0.0789794921875, "loss_aux_layer_6": 0.0814208984375, "loss_aux_layer_7": 0.0784912109375, "loss_aux_layer_8": 0.0775146484375, "loss_aux_layer_9": 0.0762939453125, "step": 1480, "total_loss": 0.7099544405937195 }, { "epoch": 0.2932092654919818, "grad_norm": 1.230639934539795, "learning_rate": 5e-05, "llm_loss": 0.6276087611913681, "loss": 2.9307, "loss_aux_layer_0": 0.024139404296875, "loss_aux_layer_1": 0.052734375, "loss_aux_layer_10": 0.0782470703125, "loss_aux_layer_11": 0.083251953125, "loss_aux_layer_12": 0.0892333984375, "loss_aux_layer_13": 0.0960693359375, "loss_aux_layer_14": 0.10595703125, "loss_aux_layer_15": 0.1162109375, "loss_aux_layer_16": 0.12646484375, "loss_aux_layer_17": 0.1343994140625, "loss_aux_layer_18": 0.143798828125, "loss_aux_layer_19": 0.146240234375, "loss_aux_layer_2": 0.06304931640625, "loss_aux_layer_20": 0.154052734375, "loss_aux_layer_21": 0.160888671875, "loss_aux_layer_22": 0.18115234375, "loss_aux_layer_23": 0.220703125, "loss_aux_layer_3": 0.073974609375, "loss_aux_layer_4": 0.0770263671875, "loss_aux_layer_5": 0.0787353515625, "loss_aux_layer_6": 0.08154296875, "loss_aux_layer_7": 0.07861328125, "loss_aux_layer_8": 0.077880859375, "loss_aux_layer_9": 0.0767822265625, "step": 1481, "total_loss": 0.7326733320951462 }, { "epoch": 0.29340724608988317, "grad_norm": 1.370875597000122, "learning_rate": 5e-05, "llm_loss": 0.6139998137950897, "loss": 2.8777, "loss_aux_layer_0": 0.02484130859375, "loss_aux_layer_1": 0.052490234375, "loss_aux_layer_10": 0.078857421875, "loss_aux_layer_11": 0.0841064453125, "loss_aux_layer_12": 0.0899658203125, "loss_aux_layer_13": 0.0968017578125, "loss_aux_layer_14": 0.107421875, "loss_aux_layer_15": 0.117431640625, "loss_aux_layer_16": 0.1280517578125, "loss_aux_layer_17": 0.135498046875, "loss_aux_layer_18": 0.1435546875, "loss_aux_layer_19": 0.146240234375, "loss_aux_layer_2": 0.062744140625, "loss_aux_layer_20": 0.153076171875, "loss_aux_layer_21": 0.158447265625, "loss_aux_layer_22": 0.1796875, "loss_aux_layer_23": 0.218994140625, "loss_aux_layer_3": 0.0745849609375, "loss_aux_layer_4": 0.0777587890625, "loss_aux_layer_5": 0.080078125, "loss_aux_layer_6": 0.0831298828125, "loss_aux_layer_7": 0.0799560546875, "loss_aux_layer_8": 0.0791015625, "loss_aux_layer_9": 0.0775146484375, "step": 1482, "total_loss": 0.7194269299507141 }, { "epoch": 0.2936052266877846, "grad_norm": 1.3630791902542114, "learning_rate": 5e-05, "llm_loss": 0.5968708842992783, "loss": 2.8158, "loss_aux_layer_0": 0.023529052734375, "loss_aux_layer_1": 0.05328369140625, "loss_aux_layer_10": 0.079345703125, "loss_aux_layer_11": 0.08447265625, "loss_aux_layer_12": 0.0904541015625, "loss_aux_layer_13": 0.0977783203125, "loss_aux_layer_14": 0.1083984375, "loss_aux_layer_15": 0.1187744140625, "loss_aux_layer_16": 0.12939453125, "loss_aux_layer_17": 0.13818359375, "loss_aux_layer_18": 0.14697265625, "loss_aux_layer_19": 0.14990234375, "loss_aux_layer_2": 0.0643310546875, "loss_aux_layer_20": 0.1572265625, "loss_aux_layer_21": 0.164306640625, "loss_aux_layer_22": 0.185791015625, "loss_aux_layer_23": 0.22509765625, "loss_aux_layer_3": 0.075439453125, "loss_aux_layer_4": 0.07861328125, "loss_aux_layer_5": 0.08056640625, "loss_aux_layer_6": 0.0830078125, "loss_aux_layer_7": 0.07958984375, "loss_aux_layer_8": 0.0784912109375, "loss_aux_layer_9": 0.0772705078125, "step": 1483, "total_loss": 0.703954815864563 }, { "epoch": 0.293803207285686, "grad_norm": 1.3827388286590576, "learning_rate": 5e-05, "llm_loss": 0.5171673148870468, "loss": 2.5137, "loss_aux_layer_0": 0.024383544921875, "loss_aux_layer_1": 0.0555419921875, "loss_aux_layer_10": 0.0853271484375, "loss_aux_layer_11": 0.0902099609375, "loss_aux_layer_12": 0.096435546875, "loss_aux_layer_13": 0.103759765625, "loss_aux_layer_14": 0.1138916015625, "loss_aux_layer_15": 0.1241455078125, "loss_aux_layer_16": 0.134521484375, "loss_aux_layer_17": 0.1416015625, "loss_aux_layer_18": 0.150634765625, "loss_aux_layer_19": 0.152099609375, "loss_aux_layer_2": 0.06695556640625, "loss_aux_layer_20": 0.157958984375, "loss_aux_layer_21": 0.165771484375, "loss_aux_layer_22": 0.187744140625, "loss_aux_layer_23": 0.22802734375, "loss_aux_layer_3": 0.079833984375, "loss_aux_layer_4": 0.08349609375, "loss_aux_layer_5": 0.0855712890625, "loss_aux_layer_6": 0.0887451171875, "loss_aux_layer_7": 0.08544921875, "loss_aux_layer_8": 0.0843505859375, "loss_aux_layer_9": 0.08349609375, "step": 1484, "total_loss": 0.6284218728542328 }, { "epoch": 0.2940011878835874, "grad_norm": 1.1819969415664673, "learning_rate": 5e-05, "llm_loss": 0.6806588023900986, "loss": 3.1634, "loss_aux_layer_0": 0.02801513671875, "loss_aux_layer_1": 0.055908203125, "loss_aux_layer_10": 0.081787109375, "loss_aux_layer_11": 0.087158203125, "loss_aux_layer_12": 0.093505859375, "loss_aux_layer_13": 0.100830078125, "loss_aux_layer_14": 0.1119384765625, "loss_aux_layer_15": 0.1224365234375, "loss_aux_layer_16": 0.1331787109375, "loss_aux_layer_17": 0.14111328125, "loss_aux_layer_18": 0.150146484375, "loss_aux_layer_19": 0.153076171875, "loss_aux_layer_2": 0.06561279296875, "loss_aux_layer_20": 0.159912109375, "loss_aux_layer_21": 0.1669921875, "loss_aux_layer_22": 0.18896484375, "loss_aux_layer_23": 0.229736328125, "loss_aux_layer_3": 0.077880859375, "loss_aux_layer_4": 0.081298828125, "loss_aux_layer_5": 0.083251953125, "loss_aux_layer_6": 0.0863037109375, "loss_aux_layer_7": 0.082763671875, "loss_aux_layer_8": 0.0816650390625, "loss_aux_layer_9": 0.08056640625, "step": 1485, "total_loss": 0.7908595502376556 }, { "epoch": 0.2941991684814888, "grad_norm": 1.0040334463119507, "learning_rate": 5e-05, "llm_loss": 0.5731280893087387, "loss": 2.7341, "loss_aux_layer_0": 0.02484130859375, "loss_aux_layer_1": 0.0548095703125, "loss_aux_layer_10": 0.0833740234375, "loss_aux_layer_11": 0.0885009765625, "loss_aux_layer_12": 0.0946044921875, "loss_aux_layer_13": 0.1014404296875, "loss_aux_layer_14": 0.1124267578125, "loss_aux_layer_15": 0.122802734375, "loss_aux_layer_16": 0.13330078125, "loss_aux_layer_17": 0.140869140625, "loss_aux_layer_18": 0.149169921875, "loss_aux_layer_19": 0.151611328125, "loss_aux_layer_2": 0.0673828125, "loss_aux_layer_20": 0.158447265625, "loss_aux_layer_21": 0.166015625, "loss_aux_layer_22": 0.1875, "loss_aux_layer_23": 0.22900390625, "loss_aux_layer_3": 0.0792236328125, "loss_aux_layer_4": 0.0826416015625, "loss_aux_layer_5": 0.08447265625, "loss_aux_layer_6": 0.08740234375, "loss_aux_layer_7": 0.08447265625, "loss_aux_layer_8": 0.083251953125, "loss_aux_layer_9": 0.0821533203125, "step": 1486, "total_loss": 0.6835155785083771 }, { "epoch": 0.29439714907939024, "grad_norm": 1.3813060522079468, "learning_rate": 5e-05, "llm_loss": 0.670221358537674, "loss": 3.1153, "loss_aux_layer_0": 0.023345947265625, "loss_aux_layer_1": 0.05194091796875, "loss_aux_layer_10": 0.0814208984375, "loss_aux_layer_11": 0.08642578125, "loss_aux_layer_12": 0.092529296875, "loss_aux_layer_13": 0.099365234375, "loss_aux_layer_14": 0.11083984375, "loss_aux_layer_15": 0.12158203125, "loss_aux_layer_16": 0.13330078125, "loss_aux_layer_17": 0.141357421875, "loss_aux_layer_18": 0.14990234375, "loss_aux_layer_19": 0.151611328125, "loss_aux_layer_2": 0.06414794921875, "loss_aux_layer_20": 0.158447265625, "loss_aux_layer_21": 0.16455078125, "loss_aux_layer_22": 0.185546875, "loss_aux_layer_23": 0.224853515625, "loss_aux_layer_3": 0.076416015625, "loss_aux_layer_4": 0.079833984375, "loss_aux_layer_5": 0.0816650390625, "loss_aux_layer_6": 0.0849609375, "loss_aux_layer_7": 0.0819091796875, "loss_aux_layer_8": 0.0810546875, "loss_aux_layer_9": 0.0797119140625, "step": 1487, "total_loss": 0.7788327783346176 }, { "epoch": 0.2945951296772916, "grad_norm": 1.1012901067733765, "learning_rate": 5e-05, "llm_loss": 0.5844649150967598, "loss": 2.777, "loss_aux_layer_0": 0.023162841796875, "loss_aux_layer_1": 0.0548095703125, "loss_aux_layer_10": 0.08251953125, "loss_aux_layer_11": 0.088134765625, "loss_aux_layer_12": 0.0948486328125, "loss_aux_layer_13": 0.101806640625, "loss_aux_layer_14": 0.1126708984375, "loss_aux_layer_15": 0.1226806640625, "loss_aux_layer_16": 0.133056640625, "loss_aux_layer_17": 0.141357421875, "loss_aux_layer_18": 0.149658203125, "loss_aux_layer_19": 0.150634765625, "loss_aux_layer_2": 0.066650390625, "loss_aux_layer_20": 0.15771484375, "loss_aux_layer_21": 0.164306640625, "loss_aux_layer_22": 0.185546875, "loss_aux_layer_23": 0.224365234375, "loss_aux_layer_3": 0.078857421875, "loss_aux_layer_4": 0.0821533203125, "loss_aux_layer_5": 0.0841064453125, "loss_aux_layer_6": 0.0870361328125, "loss_aux_layer_7": 0.084228515625, "loss_aux_layer_8": 0.0831298828125, "loss_aux_layer_9": 0.081298828125, "step": 1488, "total_loss": 0.6942487359046936 }, { "epoch": 0.294793110275193, "grad_norm": 0.8801693320274353, "learning_rate": 5e-05, "llm_loss": 0.6946549862623215, "loss": 3.1898, "loss_aux_layer_0": 0.02362060546875, "loss_aux_layer_1": 0.0489501953125, "loss_aux_layer_10": 0.0748291015625, "loss_aux_layer_11": 0.0794677734375, "loss_aux_layer_12": 0.08544921875, "loss_aux_layer_13": 0.0928955078125, "loss_aux_layer_14": 0.103271484375, "loss_aux_layer_15": 0.114013671875, "loss_aux_layer_16": 0.1243896484375, "loss_aux_layer_17": 0.133056640625, "loss_aux_layer_18": 0.14208984375, "loss_aux_layer_19": 0.1455078125, "loss_aux_layer_2": 0.05950927734375, "loss_aux_layer_20": 0.153076171875, "loss_aux_layer_21": 0.16064453125, "loss_aux_layer_22": 0.181884765625, "loss_aux_layer_23": 0.22216796875, "loss_aux_layer_3": 0.070556640625, "loss_aux_layer_4": 0.0733642578125, "loss_aux_layer_5": 0.0755615234375, "loss_aux_layer_6": 0.078369140625, "loss_aux_layer_7": 0.0755615234375, "loss_aux_layer_8": 0.0745849609375, "loss_aux_layer_9": 0.073486328125, "step": 1489, "total_loss": 0.797446146607399 }, { "epoch": 0.29499109087309444, "grad_norm": 1.0777668952941895, "learning_rate": 5e-05, "llm_loss": 0.6341166198253632, "loss": 2.9661, "loss_aux_layer_0": 0.02386474609375, "loss_aux_layer_1": 0.0509033203125, "loss_aux_layer_10": 0.0792236328125, "loss_aux_layer_11": 0.083984375, "loss_aux_layer_12": 0.0899658203125, "loss_aux_layer_13": 0.09716796875, "loss_aux_layer_14": 0.1082763671875, "loss_aux_layer_15": 0.1190185546875, "loss_aux_layer_16": 0.130859375, "loss_aux_layer_17": 0.139404296875, "loss_aux_layer_18": 0.148681640625, "loss_aux_layer_19": 0.153076171875, "loss_aux_layer_2": 0.06158447265625, "loss_aux_layer_20": 0.16064453125, "loss_aux_layer_21": 0.16650390625, "loss_aux_layer_22": 0.187255859375, "loss_aux_layer_23": 0.22802734375, "loss_aux_layer_3": 0.073486328125, "loss_aux_layer_4": 0.0771484375, "loss_aux_layer_5": 0.0791015625, "loss_aux_layer_6": 0.0828857421875, "loss_aux_layer_7": 0.0799560546875, "loss_aux_layer_8": 0.0787353515625, "loss_aux_layer_9": 0.07763671875, "step": 1490, "total_loss": 0.7415165454149246 }, { "epoch": 0.2951890714709958, "grad_norm": 1.4806785583496094, "learning_rate": 5e-05, "llm_loss": 0.6047451496124268, "loss": 2.8437, "loss_aux_layer_0": 0.024444580078125, "loss_aux_layer_1": 0.0516357421875, "loss_aux_layer_10": 0.0780029296875, "loss_aux_layer_11": 0.0830078125, "loss_aux_layer_12": 0.089111328125, "loss_aux_layer_13": 0.0958251953125, "loss_aux_layer_14": 0.106689453125, "loss_aux_layer_15": 0.1171875, "loss_aux_layer_16": 0.1279296875, "loss_aux_layer_17": 0.136474609375, "loss_aux_layer_18": 0.145751953125, "loss_aux_layer_19": 0.14892578125, "loss_aux_layer_2": 0.06268310546875, "loss_aux_layer_20": 0.156494140625, "loss_aux_layer_21": 0.1640625, "loss_aux_layer_22": 0.1865234375, "loss_aux_layer_23": 0.226806640625, "loss_aux_layer_3": 0.0743408203125, "loss_aux_layer_4": 0.07763671875, "loss_aux_layer_5": 0.0797119140625, "loss_aux_layer_6": 0.08203125, "loss_aux_layer_7": 0.0787353515625, "loss_aux_layer_8": 0.078125, "loss_aux_layer_9": 0.0767822265625, "step": 1491, "total_loss": 0.7109334319829941 }, { "epoch": 0.29538705206889726, "grad_norm": 1.3883806467056274, "learning_rate": 5e-05, "llm_loss": 0.5650915950536728, "loss": 2.6869, "loss_aux_layer_0": 0.02227783203125, "loss_aux_layer_1": 0.051025390625, "loss_aux_layer_10": 0.080078125, "loss_aux_layer_11": 0.0849609375, "loss_aux_layer_12": 0.0908203125, "loss_aux_layer_13": 0.0977783203125, "loss_aux_layer_14": 0.1083984375, "loss_aux_layer_15": 0.1185302734375, "loss_aux_layer_16": 0.1290283203125, "loss_aux_layer_17": 0.13671875, "loss_aux_layer_18": 0.14501953125, "loss_aux_layer_19": 0.14794921875, "loss_aux_layer_2": 0.063720703125, "loss_aux_layer_20": 0.155517578125, "loss_aux_layer_21": 0.16259765625, "loss_aux_layer_22": 0.18310546875, "loss_aux_layer_23": 0.222412109375, "loss_aux_layer_3": 0.07568359375, "loss_aux_layer_4": 0.0791015625, "loss_aux_layer_5": 0.0810546875, "loss_aux_layer_6": 0.0838623046875, "loss_aux_layer_7": 0.08056640625, "loss_aux_layer_8": 0.0794677734375, "loss_aux_layer_9": 0.07861328125, "step": 1492, "total_loss": 0.6717144548892975 }, { "epoch": 0.29558503266679864, "grad_norm": 1.2520651817321777, "learning_rate": 5e-05, "llm_loss": 0.6566452383995056, "loss": 3.051, "loss_aux_layer_0": 0.024658203125, "loss_aux_layer_1": 0.05242919921875, "loss_aux_layer_10": 0.0782470703125, "loss_aux_layer_11": 0.0833740234375, "loss_aux_layer_12": 0.08935546875, "loss_aux_layer_13": 0.096435546875, "loss_aux_layer_14": 0.1070556640625, "loss_aux_layer_15": 0.1175537109375, "loss_aux_layer_16": 0.128173828125, "loss_aux_layer_17": 0.13623046875, "loss_aux_layer_18": 0.14599609375, "loss_aux_layer_19": 0.1484375, "loss_aux_layer_2": 0.06304931640625, "loss_aux_layer_20": 0.15576171875, "loss_aux_layer_21": 0.1630859375, "loss_aux_layer_22": 0.184326171875, "loss_aux_layer_23": 0.22509765625, "loss_aux_layer_3": 0.074462890625, "loss_aux_layer_4": 0.0772705078125, "loss_aux_layer_5": 0.079345703125, "loss_aux_layer_6": 0.0821533203125, "loss_aux_layer_7": 0.0791015625, "loss_aux_layer_8": 0.0780029296875, "loss_aux_layer_9": 0.0767822265625, "step": 1493, "total_loss": 0.7627456337213516 }, { "epoch": 0.2957830132647001, "grad_norm": 1.885769248008728, "learning_rate": 5e-05, "llm_loss": 0.6371822655200958, "loss": 2.9849, "loss_aux_layer_0": 0.0240478515625, "loss_aux_layer_1": 0.0546875, "loss_aux_layer_10": 0.0821533203125, "loss_aux_layer_11": 0.08740234375, "loss_aux_layer_12": 0.0936279296875, "loss_aux_layer_13": 0.1004638671875, "loss_aux_layer_14": 0.1114501953125, "loss_aux_layer_15": 0.12158203125, "loss_aux_layer_16": 0.132080078125, "loss_aux_layer_17": 0.1396484375, "loss_aux_layer_18": 0.1484375, "loss_aux_layer_19": 0.150390625, "loss_aux_layer_2": 0.06549072265625, "loss_aux_layer_20": 0.157470703125, "loss_aux_layer_21": 0.163330078125, "loss_aux_layer_22": 0.183837890625, "loss_aux_layer_23": 0.223876953125, "loss_aux_layer_3": 0.077880859375, "loss_aux_layer_4": 0.0816650390625, "loss_aux_layer_5": 0.0841064453125, "loss_aux_layer_6": 0.0870361328125, "loss_aux_layer_7": 0.083251953125, "loss_aux_layer_8": 0.0821533203125, "loss_aux_layer_9": 0.080810546875, "step": 1494, "total_loss": 0.7462301254272461 }, { "epoch": 0.29598099386260146, "grad_norm": 1.4372612237930298, "learning_rate": 5e-05, "llm_loss": 0.6767890900373459, "loss": 3.1389, "loss_aux_layer_0": 0.0233154296875, "loss_aux_layer_1": 0.051513671875, "loss_aux_layer_10": 0.0792236328125, "loss_aux_layer_11": 0.08447265625, "loss_aux_layer_12": 0.0906982421875, "loss_aux_layer_13": 0.0982666015625, "loss_aux_layer_14": 0.1099853515625, "loss_aux_layer_15": 0.121337890625, "loss_aux_layer_16": 0.133056640625, "loss_aux_layer_17": 0.141845703125, "loss_aux_layer_18": 0.150390625, "loss_aux_layer_19": 0.153076171875, "loss_aux_layer_2": 0.06243896484375, "loss_aux_layer_20": 0.159423828125, "loss_aux_layer_21": 0.16650390625, "loss_aux_layer_22": 0.188720703125, "loss_aux_layer_23": 0.2294921875, "loss_aux_layer_3": 0.0740966796875, "loss_aux_layer_4": 0.0772705078125, "loss_aux_layer_5": 0.0791015625, "loss_aux_layer_6": 0.0819091796875, "loss_aux_layer_7": 0.0789794921875, "loss_aux_layer_8": 0.078369140625, "loss_aux_layer_9": 0.0775146484375, "step": 1495, "total_loss": 0.7847287952899933 }, { "epoch": 0.2961789744605029, "grad_norm": 1.881165623664856, "learning_rate": 5e-05, "llm_loss": 0.6948622763156891, "loss": 3.2071, "loss_aux_layer_0": 0.0247802734375, "loss_aux_layer_1": 0.05169677734375, "loss_aux_layer_10": 0.078857421875, "loss_aux_layer_11": 0.08349609375, "loss_aux_layer_12": 0.08935546875, "loss_aux_layer_13": 0.0963134765625, "loss_aux_layer_14": 0.107421875, "loss_aux_layer_15": 0.1185302734375, "loss_aux_layer_16": 0.13037109375, "loss_aux_layer_17": 0.138916015625, "loss_aux_layer_18": 0.1474609375, "loss_aux_layer_19": 0.1513671875, "loss_aux_layer_2": 0.0623779296875, "loss_aux_layer_20": 0.15869140625, "loss_aux_layer_21": 0.165283203125, "loss_aux_layer_22": 0.186767578125, "loss_aux_layer_23": 0.2275390625, "loss_aux_layer_3": 0.0738525390625, "loss_aux_layer_4": 0.0772705078125, "loss_aux_layer_5": 0.079345703125, "loss_aux_layer_6": 0.0823974609375, "loss_aux_layer_7": 0.0789794921875, "loss_aux_layer_8": 0.0780029296875, "loss_aux_layer_9": 0.077392578125, "step": 1496, "total_loss": 0.8017782270908356 }, { "epoch": 0.2963769550584043, "grad_norm": 1.46369206905365, "learning_rate": 5e-05, "llm_loss": 0.7177628427743912, "loss": 3.3022, "loss_aux_layer_0": 0.023040771484375, "loss_aux_layer_1": 0.05267333984375, "loss_aux_layer_10": 0.0802001953125, "loss_aux_layer_11": 0.085205078125, "loss_aux_layer_12": 0.09130859375, "loss_aux_layer_13": 0.09814453125, "loss_aux_layer_14": 0.109130859375, "loss_aux_layer_15": 0.1202392578125, "loss_aux_layer_16": 0.1307373046875, "loss_aux_layer_17": 0.138916015625, "loss_aux_layer_18": 0.147705078125, "loss_aux_layer_19": 0.149658203125, "loss_aux_layer_2": 0.0645751953125, "loss_aux_layer_20": 0.157470703125, "loss_aux_layer_21": 0.164306640625, "loss_aux_layer_22": 0.185302734375, "loss_aux_layer_23": 0.225830078125, "loss_aux_layer_3": 0.0767822265625, "loss_aux_layer_4": 0.080078125, "loss_aux_layer_5": 0.081787109375, "loss_aux_layer_6": 0.08447265625, "loss_aux_layer_7": 0.080810546875, "loss_aux_layer_8": 0.079833984375, "loss_aux_layer_9": 0.0784912109375, "step": 1497, "total_loss": 0.8255595266819 }, { "epoch": 0.29657493565630566, "grad_norm": 1.8295822143554688, "learning_rate": 5e-05, "llm_loss": 0.6955447047948837, "loss": 3.2052, "loss_aux_layer_0": 0.025604248046875, "loss_aux_layer_1": 0.05194091796875, "loss_aux_layer_10": 0.0771484375, "loss_aux_layer_11": 0.08203125, "loss_aux_layer_12": 0.0887451171875, "loss_aux_layer_13": 0.0955810546875, "loss_aux_layer_14": 0.1065673828125, "loss_aux_layer_15": 0.117431640625, "loss_aux_layer_16": 0.1278076171875, "loss_aux_layer_17": 0.135498046875, "loss_aux_layer_18": 0.14453125, "loss_aux_layer_19": 0.147216796875, "loss_aux_layer_2": 0.06292724609375, "loss_aux_layer_20": 0.155029296875, "loss_aux_layer_21": 0.162841796875, "loss_aux_layer_22": 0.18505859375, "loss_aux_layer_23": 0.22705078125, "loss_aux_layer_3": 0.07421875, "loss_aux_layer_4": 0.077392578125, "loss_aux_layer_5": 0.0794677734375, "loss_aux_layer_6": 0.082275390625, "loss_aux_layer_7": 0.07861328125, "loss_aux_layer_8": 0.0772705078125, "loss_aux_layer_9": 0.0758056640625, "step": 1498, "total_loss": 0.8012907952070236 }, { "epoch": 0.2967729162542071, "grad_norm": 1.6625608205795288, "learning_rate": 5e-05, "llm_loss": 0.7056552022695541, "loss": 3.2483, "loss_aux_layer_0": 0.024566650390625, "loss_aux_layer_1": 0.0513916015625, "loss_aux_layer_10": 0.07958984375, "loss_aux_layer_11": 0.08447265625, "loss_aux_layer_12": 0.090576171875, "loss_aux_layer_13": 0.0975341796875, "loss_aux_layer_14": 0.1077880859375, "loss_aux_layer_15": 0.1173095703125, "loss_aux_layer_16": 0.1279296875, "loss_aux_layer_17": 0.136474609375, "loss_aux_layer_18": 0.14501953125, "loss_aux_layer_19": 0.14794921875, "loss_aux_layer_2": 0.064453125, "loss_aux_layer_20": 0.15478515625, "loss_aux_layer_21": 0.16162109375, "loss_aux_layer_22": 0.18359375, "loss_aux_layer_23": 0.223388671875, "loss_aux_layer_3": 0.0758056640625, "loss_aux_layer_4": 0.0791015625, "loss_aux_layer_5": 0.08056640625, "loss_aux_layer_6": 0.083251953125, "loss_aux_layer_7": 0.080078125, "loss_aux_layer_8": 0.0787353515625, "loss_aux_layer_9": 0.07763671875, "step": 1499, "total_loss": 0.8120678961277008 }, { "epoch": 0.2969708968521085, "grad_norm": 1.6665496826171875, "learning_rate": 5e-05, "llm_loss": 0.6495231613516808, "loss": 3.0344, "loss_aux_layer_0": 0.022796630859375, "loss_aux_layer_1": 0.05206298828125, "loss_aux_layer_10": 0.079833984375, "loss_aux_layer_11": 0.0848388671875, "loss_aux_layer_12": 0.091064453125, "loss_aux_layer_13": 0.0986328125, "loss_aux_layer_14": 0.1107177734375, "loss_aux_layer_15": 0.12158203125, "loss_aux_layer_16": 0.1331787109375, "loss_aux_layer_17": 0.1416015625, "loss_aux_layer_18": 0.150634765625, "loss_aux_layer_19": 0.15380859375, "loss_aux_layer_2": 0.06396484375, "loss_aux_layer_20": 0.161376953125, "loss_aux_layer_21": 0.169189453125, "loss_aux_layer_22": 0.19189453125, "loss_aux_layer_23": 0.2333984375, "loss_aux_layer_3": 0.0755615234375, "loss_aux_layer_4": 0.0787353515625, "loss_aux_layer_5": 0.08056640625, "loss_aux_layer_6": 0.0836181640625, "loss_aux_layer_7": 0.08056640625, "loss_aux_layer_8": 0.079833984375, "loss_aux_layer_9": 0.0782470703125, "step": 1500, "total_loss": 0.758601576089859 }, { "epoch": 0.2971688774500099, "grad_norm": 1.744645595550537, "learning_rate": 5e-05, "llm_loss": 0.6108159571886063, "loss": 2.8878, "loss_aux_layer_0": 0.02301025390625, "loss_aux_layer_1": 0.05322265625, "loss_aux_layer_10": 0.083740234375, "loss_aux_layer_11": 0.0887451171875, "loss_aux_layer_12": 0.09521484375, "loss_aux_layer_13": 0.1021728515625, "loss_aux_layer_14": 0.113525390625, "loss_aux_layer_15": 0.1240234375, "loss_aux_layer_16": 0.134765625, "loss_aux_layer_17": 0.142333984375, "loss_aux_layer_18": 0.151123046875, "loss_aux_layer_19": 0.153076171875, "loss_aux_layer_2": 0.067626953125, "loss_aux_layer_20": 0.15966796875, "loss_aux_layer_21": 0.16796875, "loss_aux_layer_22": 0.191162109375, "loss_aux_layer_23": 0.232666015625, "loss_aux_layer_3": 0.0792236328125, "loss_aux_layer_4": 0.082275390625, "loss_aux_layer_5": 0.084228515625, "loss_aux_layer_6": 0.0875244140625, "loss_aux_layer_7": 0.0843505859375, "loss_aux_layer_8": 0.083251953125, "loss_aux_layer_9": 0.082275390625, "step": 1501, "total_loss": 0.7219418734312057 }, { "epoch": 0.2973668580479113, "grad_norm": 1.4799288511276245, "learning_rate": 5e-05, "llm_loss": 0.6483712196350098, "loss": 3.0335, "loss_aux_layer_0": 0.023895263671875, "loss_aux_layer_1": 0.05340576171875, "loss_aux_layer_10": 0.08251953125, "loss_aux_layer_11": 0.0872802734375, "loss_aux_layer_12": 0.0941162109375, "loss_aux_layer_13": 0.10107421875, "loss_aux_layer_14": 0.1123046875, "loss_aux_layer_15": 0.12353515625, "loss_aux_layer_16": 0.134033203125, "loss_aux_layer_17": 0.141845703125, "loss_aux_layer_18": 0.15087890625, "loss_aux_layer_19": 0.15283203125, "loss_aux_layer_2": 0.0662841796875, "loss_aux_layer_20": 0.15966796875, "loss_aux_layer_21": 0.166259765625, "loss_aux_layer_22": 0.187255859375, "loss_aux_layer_23": 0.2275390625, "loss_aux_layer_3": 0.07763671875, "loss_aux_layer_4": 0.0809326171875, "loss_aux_layer_5": 0.0836181640625, "loss_aux_layer_6": 0.086181640625, "loss_aux_layer_7": 0.0831298828125, "loss_aux_layer_8": 0.082275390625, "loss_aux_layer_9": 0.0811767578125, "step": 1502, "total_loss": 0.7583656013011932 }, { "epoch": 0.29756483864581273, "grad_norm": 1.270719051361084, "learning_rate": 5e-05, "llm_loss": 0.6156376451253891, "loss": 2.8858, "loss_aux_layer_0": 0.022857666015625, "loss_aux_layer_1": 0.05047607421875, "loss_aux_layer_10": 0.0772705078125, "loss_aux_layer_11": 0.08251953125, "loss_aux_layer_12": 0.0888671875, "loss_aux_layer_13": 0.09619140625, "loss_aux_layer_14": 0.10693359375, "loss_aux_layer_15": 0.1175537109375, "loss_aux_layer_16": 0.12890625, "loss_aux_layer_17": 0.13720703125, "loss_aux_layer_18": 0.146728515625, "loss_aux_layer_19": 0.1494140625, "loss_aux_layer_2": 0.0615234375, "loss_aux_layer_20": 0.156494140625, "loss_aux_layer_21": 0.1640625, "loss_aux_layer_22": 0.185546875, "loss_aux_layer_23": 0.225830078125, "loss_aux_layer_3": 0.0736083984375, "loss_aux_layer_4": 0.07666015625, "loss_aux_layer_5": 0.078369140625, "loss_aux_layer_6": 0.0810546875, "loss_aux_layer_7": 0.077880859375, "loss_aux_layer_8": 0.0767822265625, "loss_aux_layer_9": 0.075927734375, "step": 1503, "total_loss": 0.7214378118515015 }, { "epoch": 0.2977628192437141, "grad_norm": 1.360865831375122, "learning_rate": 5e-05, "llm_loss": 0.5981027483940125, "loss": 2.823, "loss_aux_layer_0": 0.0230712890625, "loss_aux_layer_1": 0.05438232421875, "loss_aux_layer_10": 0.0809326171875, "loss_aux_layer_11": 0.0859375, "loss_aux_layer_12": 0.0919189453125, "loss_aux_layer_13": 0.0985107421875, "loss_aux_layer_14": 0.1092529296875, "loss_aux_layer_15": 0.11962890625, "loss_aux_layer_16": 0.13037109375, "loss_aux_layer_17": 0.13818359375, "loss_aux_layer_18": 0.146484375, "loss_aux_layer_19": 0.148193359375, "loss_aux_layer_2": 0.06573486328125, "loss_aux_layer_20": 0.15478515625, "loss_aux_layer_21": 0.16162109375, "loss_aux_layer_22": 0.181884765625, "loss_aux_layer_23": 0.220947265625, "loss_aux_layer_3": 0.077880859375, "loss_aux_layer_4": 0.0811767578125, "loss_aux_layer_5": 0.08349609375, "loss_aux_layer_6": 0.08544921875, "loss_aux_layer_7": 0.0823974609375, "loss_aux_layer_8": 0.081298828125, "loss_aux_layer_9": 0.079833984375, "step": 1504, "total_loss": 0.7057404667139053 }, { "epoch": 0.2979607998416155, "grad_norm": 1.441229224205017, "learning_rate": 5e-05, "llm_loss": 0.6169548407196999, "loss": 2.9009, "loss_aux_layer_0": 0.022491455078125, "loss_aux_layer_1": 0.0528564453125, "loss_aux_layer_10": 0.08154296875, "loss_aux_layer_11": 0.0867919921875, "loss_aux_layer_12": 0.0924072265625, "loss_aux_layer_13": 0.09912109375, "loss_aux_layer_14": 0.109619140625, "loss_aux_layer_15": 0.11962890625, "loss_aux_layer_16": 0.130615234375, "loss_aux_layer_17": 0.13818359375, "loss_aux_layer_18": 0.146728515625, "loss_aux_layer_19": 0.149658203125, "loss_aux_layer_2": 0.06591796875, "loss_aux_layer_20": 0.1572265625, "loss_aux_layer_21": 0.1650390625, "loss_aux_layer_22": 0.18603515625, "loss_aux_layer_23": 0.22509765625, "loss_aux_layer_3": 0.07763671875, "loss_aux_layer_4": 0.0804443359375, "loss_aux_layer_5": 0.082275390625, "loss_aux_layer_6": 0.0850830078125, "loss_aux_layer_7": 0.08203125, "loss_aux_layer_8": 0.081298828125, "loss_aux_layer_9": 0.080078125, "step": 1505, "total_loss": 0.7252220064401627 }, { "epoch": 0.29815878043951693, "grad_norm": 1.5884137153625488, "learning_rate": 5e-05, "llm_loss": 0.6897167563438416, "loss": 3.1832, "loss_aux_layer_0": 0.02276611328125, "loss_aux_layer_1": 0.052001953125, "loss_aux_layer_10": 0.081298828125, "loss_aux_layer_11": 0.0858154296875, "loss_aux_layer_12": 0.091796875, "loss_aux_layer_13": 0.0985107421875, "loss_aux_layer_14": 0.1082763671875, "loss_aux_layer_15": 0.1171875, "loss_aux_layer_16": 0.1270751953125, "loss_aux_layer_17": 0.1353759765625, "loss_aux_layer_18": 0.142822265625, "loss_aux_layer_19": 0.144775390625, "loss_aux_layer_2": 0.06475830078125, "loss_aux_layer_20": 0.151123046875, "loss_aux_layer_21": 0.157958984375, "loss_aux_layer_22": 0.177978515625, "loss_aux_layer_23": 0.216552734375, "loss_aux_layer_3": 0.0771484375, "loss_aux_layer_4": 0.0809326171875, "loss_aux_layer_5": 0.0826416015625, "loss_aux_layer_6": 0.0853271484375, "loss_aux_layer_7": 0.0821533203125, "loss_aux_layer_8": 0.081298828125, "loss_aux_layer_9": 0.0799560546875, "step": 1506, "total_loss": 0.7957958579063416 }, { "epoch": 0.2983567610374183, "grad_norm": 1.2000114917755127, "learning_rate": 5e-05, "llm_loss": 0.586032509803772, "loss": 2.7921, "loss_aux_layer_0": 0.025177001953125, "loss_aux_layer_1": 0.0562744140625, "loss_aux_layer_10": 0.0849609375, "loss_aux_layer_11": 0.0902099609375, "loss_aux_layer_12": 0.096435546875, "loss_aux_layer_13": 0.103271484375, "loss_aux_layer_14": 0.1138916015625, "loss_aux_layer_15": 0.124267578125, "loss_aux_layer_16": 0.134765625, "loss_aux_layer_17": 0.142578125, "loss_aux_layer_18": 0.152099609375, "loss_aux_layer_19": 0.154052734375, "loss_aux_layer_2": 0.06787109375, "loss_aux_layer_20": 0.16064453125, "loss_aux_layer_21": 0.167236328125, "loss_aux_layer_22": 0.1884765625, "loss_aux_layer_23": 0.228759765625, "loss_aux_layer_3": 0.0809326171875, "loss_aux_layer_4": 0.084716796875, "loss_aux_layer_5": 0.08642578125, "loss_aux_layer_6": 0.089599609375, "loss_aux_layer_7": 0.0865478515625, "loss_aux_layer_8": 0.085205078125, "loss_aux_layer_9": 0.0836181640625, "step": 1507, "total_loss": 0.6980204284191132 }, { "epoch": 0.29855474163531975, "grad_norm": 1.6583614349365234, "learning_rate": 5e-05, "llm_loss": 0.6340101286768913, "loss": 2.9644, "loss_aux_layer_0": 0.02471923828125, "loss_aux_layer_1": 0.0533447265625, "loss_aux_layer_10": 0.0816650390625, "loss_aux_layer_11": 0.0863037109375, "loss_aux_layer_12": 0.09228515625, "loss_aux_layer_13": 0.0986328125, "loss_aux_layer_14": 0.1087646484375, "loss_aux_layer_15": 0.1181640625, "loss_aux_layer_16": 0.1282958984375, "loss_aux_layer_17": 0.135498046875, "loss_aux_layer_18": 0.14404296875, "loss_aux_layer_19": 0.146484375, "loss_aux_layer_2": 0.06500244140625, "loss_aux_layer_20": 0.153564453125, "loss_aux_layer_21": 0.160400390625, "loss_aux_layer_22": 0.181640625, "loss_aux_layer_23": 0.22021484375, "loss_aux_layer_3": 0.0772705078125, "loss_aux_layer_4": 0.080810546875, "loss_aux_layer_5": 0.082763671875, "loss_aux_layer_6": 0.0855712890625, "loss_aux_layer_7": 0.0828857421875, "loss_aux_layer_8": 0.08154296875, "loss_aux_layer_9": 0.0802001953125, "step": 1508, "total_loss": 0.7410966455936432 }, { "epoch": 0.29875272223322114, "grad_norm": 1.888659954071045, "learning_rate": 5e-05, "llm_loss": 0.5616400241851807, "loss": 2.6848, "loss_aux_layer_0": 0.024078369140625, "loss_aux_layer_1": 0.05340576171875, "loss_aux_layer_10": 0.08154296875, "loss_aux_layer_11": 0.0867919921875, "loss_aux_layer_12": 0.093994140625, "loss_aux_layer_13": 0.1015625, "loss_aux_layer_14": 0.113037109375, "loss_aux_layer_15": 0.12353515625, "loss_aux_layer_16": 0.1337890625, "loss_aux_layer_17": 0.141845703125, "loss_aux_layer_18": 0.150634765625, "loss_aux_layer_19": 0.1533203125, "loss_aux_layer_2": 0.0650634765625, "loss_aux_layer_20": 0.159423828125, "loss_aux_layer_21": 0.16650390625, "loss_aux_layer_22": 0.187744140625, "loss_aux_layer_23": 0.2275390625, "loss_aux_layer_3": 0.0765380859375, "loss_aux_layer_4": 0.079833984375, "loss_aux_layer_5": 0.081787109375, "loss_aux_layer_6": 0.084716796875, "loss_aux_layer_7": 0.0819091796875, "loss_aux_layer_8": 0.0809326171875, "loss_aux_layer_9": 0.079833984375, "step": 1509, "total_loss": 0.6712038666009903 }, { "epoch": 0.2989507028311226, "grad_norm": 1.485253930091858, "learning_rate": 5e-05, "llm_loss": 0.7125635743141174, "loss": 3.287, "loss_aux_layer_0": 0.024322509765625, "loss_aux_layer_1": 0.05474853515625, "loss_aux_layer_10": 0.081298828125, "loss_aux_layer_11": 0.0865478515625, "loss_aux_layer_12": 0.093017578125, "loss_aux_layer_13": 0.1005859375, "loss_aux_layer_14": 0.1116943359375, "loss_aux_layer_15": 0.1224365234375, "loss_aux_layer_16": 0.133056640625, "loss_aux_layer_17": 0.14013671875, "loss_aux_layer_18": 0.149169921875, "loss_aux_layer_19": 0.151611328125, "loss_aux_layer_2": 0.06658935546875, "loss_aux_layer_20": 0.15869140625, "loss_aux_layer_21": 0.16552734375, "loss_aux_layer_22": 0.1865234375, "loss_aux_layer_23": 0.226318359375, "loss_aux_layer_3": 0.077392578125, "loss_aux_layer_4": 0.080810546875, "loss_aux_layer_5": 0.08251953125, "loss_aux_layer_6": 0.0850830078125, "loss_aux_layer_7": 0.08203125, "loss_aux_layer_8": 0.0810546875, "loss_aux_layer_9": 0.07958984375, "step": 1510, "total_loss": 0.8217608481645584 }, { "epoch": 0.29914868342902395, "grad_norm": 1.2298628091812134, "learning_rate": 5e-05, "llm_loss": 0.596255399286747, "loss": 2.8136, "loss_aux_layer_0": 0.024444580078125, "loss_aux_layer_1": 0.05194091796875, "loss_aux_layer_10": 0.0789794921875, "loss_aux_layer_11": 0.083740234375, "loss_aux_layer_12": 0.0894775390625, "loss_aux_layer_13": 0.0968017578125, "loss_aux_layer_14": 0.1083984375, "loss_aux_layer_15": 0.11865234375, "loss_aux_layer_16": 0.1298828125, "loss_aux_layer_17": 0.1376953125, "loss_aux_layer_18": 0.147216796875, "loss_aux_layer_19": 0.149658203125, "loss_aux_layer_2": 0.0631103515625, "loss_aux_layer_20": 0.156982421875, "loss_aux_layer_21": 0.1650390625, "loss_aux_layer_22": 0.18701171875, "loss_aux_layer_23": 0.228271484375, "loss_aux_layer_3": 0.0751953125, "loss_aux_layer_4": 0.0782470703125, "loss_aux_layer_5": 0.0802001953125, "loss_aux_layer_6": 0.0833740234375, "loss_aux_layer_7": 0.080078125, "loss_aux_layer_8": 0.0792236328125, "loss_aux_layer_9": 0.0780029296875, "step": 1511, "total_loss": 0.7033876776695251 }, { "epoch": 0.29934666402692534, "grad_norm": 1.2362775802612305, "learning_rate": 5e-05, "llm_loss": 0.7023205608129501, "loss": 3.2287, "loss_aux_layer_0": 0.0240478515625, "loss_aux_layer_1": 0.04998779296875, "loss_aux_layer_10": 0.076171875, "loss_aux_layer_11": 0.0810546875, "loss_aux_layer_12": 0.086669921875, "loss_aux_layer_13": 0.093994140625, "loss_aux_layer_14": 0.10498046875, "loss_aux_layer_15": 0.11572265625, "loss_aux_layer_16": 0.127197265625, "loss_aux_layer_17": 0.1357421875, "loss_aux_layer_18": 0.145263671875, "loss_aux_layer_19": 0.149658203125, "loss_aux_layer_2": 0.0604248046875, "loss_aux_layer_20": 0.15771484375, "loss_aux_layer_21": 0.16552734375, "loss_aux_layer_22": 0.18798828125, "loss_aux_layer_23": 0.22998046875, "loss_aux_layer_3": 0.0706787109375, "loss_aux_layer_4": 0.0736083984375, "loss_aux_layer_5": 0.075927734375, "loss_aux_layer_6": 0.078857421875, "loss_aux_layer_7": 0.075927734375, "loss_aux_layer_8": 0.074951171875, "loss_aux_layer_9": 0.074462890625, "step": 1512, "total_loss": 0.8071649670600891 }, { "epoch": 0.2995446446248268, "grad_norm": 1.1021308898925781, "learning_rate": 5e-05, "llm_loss": 0.5721645429730415, "loss": 2.7312, "loss_aux_layer_0": 0.023284912109375, "loss_aux_layer_1": 0.05450439453125, "loss_aux_layer_10": 0.0845947265625, "loss_aux_layer_11": 0.0897216796875, "loss_aux_layer_12": 0.0958251953125, "loss_aux_layer_13": 0.10302734375, "loss_aux_layer_14": 0.11328125, "loss_aux_layer_15": 0.123046875, "loss_aux_layer_16": 0.1328125, "loss_aux_layer_17": 0.140380859375, "loss_aux_layer_18": 0.14892578125, "loss_aux_layer_19": 0.14990234375, "loss_aux_layer_2": 0.0677490234375, "loss_aux_layer_20": 0.156494140625, "loss_aux_layer_21": 0.16552734375, "loss_aux_layer_22": 0.18798828125, "loss_aux_layer_23": 0.228759765625, "loss_aux_layer_3": 0.0799560546875, "loss_aux_layer_4": 0.083251953125, "loss_aux_layer_5": 0.085205078125, "loss_aux_layer_6": 0.08837890625, "loss_aux_layer_7": 0.0849609375, "loss_aux_layer_8": 0.084228515625, "loss_aux_layer_9": 0.082763671875, "step": 1513, "total_loss": 0.6827889978885651 }, { "epoch": 0.29974262522272815, "grad_norm": 1.2769874334335327, "learning_rate": 5e-05, "llm_loss": 0.5389150157570839, "loss": 2.5909, "loss_aux_layer_0": 0.022857666015625, "loss_aux_layer_1": 0.05419921875, "loss_aux_layer_10": 0.082275390625, "loss_aux_layer_11": 0.087158203125, "loss_aux_layer_12": 0.093017578125, "loss_aux_layer_13": 0.0997314453125, "loss_aux_layer_14": 0.1104736328125, "loss_aux_layer_15": 0.120849609375, "loss_aux_layer_16": 0.13134765625, "loss_aux_layer_17": 0.138916015625, "loss_aux_layer_18": 0.147705078125, "loss_aux_layer_19": 0.149658203125, "loss_aux_layer_2": 0.06640625, "loss_aux_layer_20": 0.156982421875, "loss_aux_layer_21": 0.163818359375, "loss_aux_layer_22": 0.1845703125, "loss_aux_layer_23": 0.224609375, "loss_aux_layer_3": 0.078857421875, "loss_aux_layer_4": 0.0819091796875, "loss_aux_layer_5": 0.08349609375, "loss_aux_layer_6": 0.08642578125, "loss_aux_layer_7": 0.0833740234375, "loss_aux_layer_8": 0.08203125, "loss_aux_layer_9": 0.080810546875, "step": 1514, "total_loss": 0.6477127075195312 }, { "epoch": 0.2999406058206296, "grad_norm": 1.0293478965759277, "learning_rate": 5e-05, "llm_loss": 0.6236437261104584, "loss": 2.9155, "loss_aux_layer_0": 0.02325439453125, "loss_aux_layer_1": 0.0517578125, "loss_aux_layer_10": 0.0780029296875, "loss_aux_layer_11": 0.0828857421875, "loss_aux_layer_12": 0.0887451171875, "loss_aux_layer_13": 0.095703125, "loss_aux_layer_14": 0.10595703125, "loss_aux_layer_15": 0.1160888671875, "loss_aux_layer_16": 0.126708984375, "loss_aux_layer_17": 0.1357421875, "loss_aux_layer_18": 0.14501953125, "loss_aux_layer_19": 0.147216796875, "loss_aux_layer_2": 0.0625, "loss_aux_layer_20": 0.155029296875, "loss_aux_layer_21": 0.16162109375, "loss_aux_layer_22": 0.181884765625, "loss_aux_layer_23": 0.221435546875, "loss_aux_layer_3": 0.0738525390625, "loss_aux_layer_4": 0.0770263671875, "loss_aux_layer_5": 0.0791015625, "loss_aux_layer_6": 0.0821533203125, "loss_aux_layer_7": 0.078857421875, "loss_aux_layer_8": 0.077880859375, "loss_aux_layer_9": 0.07666015625, "step": 1515, "total_loss": 0.7288801074028015 }, { "epoch": 0.300138586418531, "grad_norm": 1.3433992862701416, "learning_rate": 5e-05, "llm_loss": 0.6455977857112885, "loss": 3.0112, "loss_aux_layer_0": 0.0225830078125, "loss_aux_layer_1": 0.05206298828125, "loss_aux_layer_10": 0.0810546875, "loss_aux_layer_11": 0.0863037109375, "loss_aux_layer_12": 0.0924072265625, "loss_aux_layer_13": 0.0997314453125, "loss_aux_layer_14": 0.1102294921875, "loss_aux_layer_15": 0.1202392578125, "loss_aux_layer_16": 0.130126953125, "loss_aux_layer_17": 0.13818359375, "loss_aux_layer_18": 0.14697265625, "loss_aux_layer_19": 0.14794921875, "loss_aux_layer_2": 0.0633544921875, "loss_aux_layer_20": 0.154541015625, "loss_aux_layer_21": 0.16162109375, "loss_aux_layer_22": 0.182861328125, "loss_aux_layer_23": 0.22265625, "loss_aux_layer_3": 0.0755615234375, "loss_aux_layer_4": 0.0789794921875, "loss_aux_layer_5": 0.0809326171875, "loss_aux_layer_6": 0.08349609375, "loss_aux_layer_7": 0.0809326171875, "loss_aux_layer_8": 0.0804443359375, "loss_aux_layer_9": 0.07958984375, "step": 1516, "total_loss": 0.7527879774570465 }, { "epoch": 0.3003365670164324, "grad_norm": 1.3098105192184448, "learning_rate": 5e-05, "llm_loss": 0.657230943441391, "loss": 3.0775, "loss_aux_layer_0": 0.022705078125, "loss_aux_layer_1": 0.05706787109375, "loss_aux_layer_10": 0.086181640625, "loss_aux_layer_11": 0.091552734375, "loss_aux_layer_12": 0.09765625, "loss_aux_layer_13": 0.1044921875, "loss_aux_layer_14": 0.1148681640625, "loss_aux_layer_15": 0.1246337890625, "loss_aux_layer_16": 0.134765625, "loss_aux_layer_17": 0.14111328125, "loss_aux_layer_18": 0.1494140625, "loss_aux_layer_19": 0.151123046875, "loss_aux_layer_2": 0.06982421875, "loss_aux_layer_20": 0.15771484375, "loss_aux_layer_21": 0.16552734375, "loss_aux_layer_22": 0.188232421875, "loss_aux_layer_23": 0.227783203125, "loss_aux_layer_3": 0.0826416015625, "loss_aux_layer_4": 0.086181640625, "loss_aux_layer_5": 0.088134765625, "loss_aux_layer_6": 0.091064453125, "loss_aux_layer_7": 0.0877685546875, "loss_aux_layer_8": 0.0863037109375, "loss_aux_layer_9": 0.0848388671875, "step": 1517, "total_loss": 0.7693684101104736 }, { "epoch": 0.3005345476143338, "grad_norm": 1.365853190422058, "learning_rate": 5e-05, "llm_loss": 0.6518736183643341, "loss": 3.0517, "loss_aux_layer_0": 0.02276611328125, "loss_aux_layer_1": 0.0562744140625, "loss_aux_layer_10": 0.0836181640625, "loss_aux_layer_11": 0.088623046875, "loss_aux_layer_12": 0.0948486328125, "loss_aux_layer_13": 0.1015625, "loss_aux_layer_14": 0.1119384765625, "loss_aux_layer_15": 0.1221923828125, "loss_aux_layer_16": 0.133056640625, "loss_aux_layer_17": 0.140869140625, "loss_aux_layer_18": 0.150390625, "loss_aux_layer_19": 0.15283203125, "loss_aux_layer_2": 0.0684814453125, "loss_aux_layer_20": 0.159912109375, "loss_aux_layer_21": 0.1669921875, "loss_aux_layer_22": 0.189208984375, "loss_aux_layer_23": 0.2294921875, "loss_aux_layer_3": 0.0806884765625, "loss_aux_layer_4": 0.0841064453125, "loss_aux_layer_5": 0.0859375, "loss_aux_layer_6": 0.0888671875, "loss_aux_layer_7": 0.085693359375, "loss_aux_layer_8": 0.084228515625, "loss_aux_layer_9": 0.08251953125, "step": 1518, "total_loss": 0.7629212141036987 }, { "epoch": 0.30073252821223523, "grad_norm": 1.0215305089950562, "learning_rate": 5e-05, "llm_loss": 0.6280486881732941, "loss": 2.9441, "loss_aux_layer_0": 0.02374267578125, "loss_aux_layer_1": 0.05181884765625, "loss_aux_layer_10": 0.079833984375, "loss_aux_layer_11": 0.0853271484375, "loss_aux_layer_12": 0.09228515625, "loss_aux_layer_13": 0.1002197265625, "loss_aux_layer_14": 0.1112060546875, "loss_aux_layer_15": 0.1217041015625, "loss_aux_layer_16": 0.132568359375, "loss_aux_layer_17": 0.14013671875, "loss_aux_layer_18": 0.1484375, "loss_aux_layer_19": 0.1513671875, "loss_aux_layer_2": 0.06219482421875, "loss_aux_layer_20": 0.158447265625, "loss_aux_layer_21": 0.165771484375, "loss_aux_layer_22": 0.1884765625, "loss_aux_layer_23": 0.229248046875, "loss_aux_layer_3": 0.073486328125, "loss_aux_layer_4": 0.076904296875, "loss_aux_layer_5": 0.0791015625, "loss_aux_layer_6": 0.082275390625, "loss_aux_layer_7": 0.0794677734375, "loss_aux_layer_8": 0.078857421875, "loss_aux_layer_9": 0.0780029296875, "step": 1519, "total_loss": 0.7360327541828156 }, { "epoch": 0.3009305088101366, "grad_norm": 1.1482707262039185, "learning_rate": 5e-05, "llm_loss": 0.5517894923686981, "loss": 2.6389, "loss_aux_layer_0": 0.02484130859375, "loss_aux_layer_1": 0.0543212890625, "loss_aux_layer_10": 0.0797119140625, "loss_aux_layer_11": 0.0845947265625, "loss_aux_layer_12": 0.090576171875, "loss_aux_layer_13": 0.0977783203125, "loss_aux_layer_14": 0.108642578125, "loss_aux_layer_15": 0.1195068359375, "loss_aux_layer_16": 0.1302490234375, "loss_aux_layer_17": 0.138671875, "loss_aux_layer_18": 0.14794921875, "loss_aux_layer_19": 0.150634765625, "loss_aux_layer_2": 0.0653076171875, "loss_aux_layer_20": 0.158203125, "loss_aux_layer_21": 0.16552734375, "loss_aux_layer_22": 0.1875, "loss_aux_layer_23": 0.227783203125, "loss_aux_layer_3": 0.07666015625, "loss_aux_layer_4": 0.079345703125, "loss_aux_layer_5": 0.0811767578125, "loss_aux_layer_6": 0.0838623046875, "loss_aux_layer_7": 0.0804443359375, "loss_aux_layer_8": 0.0797119140625, "loss_aux_layer_9": 0.078125, "step": 1520, "total_loss": 0.6597306579351425 }, { "epoch": 0.301128489408038, "grad_norm": 1.29452645778656, "learning_rate": 5e-05, "llm_loss": 0.5701974332332611, "loss": 2.7163, "loss_aux_layer_0": 0.02288818359375, "loss_aux_layer_1": 0.053466796875, "loss_aux_layer_10": 0.0826416015625, "loss_aux_layer_11": 0.0877685546875, "loss_aux_layer_12": 0.0936279296875, "loss_aux_layer_13": 0.101318359375, "loss_aux_layer_14": 0.1121826171875, "loss_aux_layer_15": 0.1219482421875, "loss_aux_layer_16": 0.13232421875, "loss_aux_layer_17": 0.1396484375, "loss_aux_layer_18": 0.1484375, "loss_aux_layer_19": 0.14990234375, "loss_aux_layer_2": 0.06549072265625, "loss_aux_layer_20": 0.15673828125, "loss_aux_layer_21": 0.16259765625, "loss_aux_layer_22": 0.183349609375, "loss_aux_layer_23": 0.22265625, "loss_aux_layer_3": 0.077880859375, "loss_aux_layer_4": 0.081298828125, "loss_aux_layer_5": 0.0833740234375, "loss_aux_layer_6": 0.08642578125, "loss_aux_layer_7": 0.083251953125, "loss_aux_layer_8": 0.0823974609375, "loss_aux_layer_9": 0.0810546875, "step": 1521, "total_loss": 0.6790817528963089 }, { "epoch": 0.30132647000593943, "grad_norm": 1.3502552509307861, "learning_rate": 5e-05, "llm_loss": 0.5963858217000961, "loss": 2.8297, "loss_aux_layer_0": 0.0240478515625, "loss_aux_layer_1": 0.05609130859375, "loss_aux_layer_10": 0.083740234375, "loss_aux_layer_11": 0.089111328125, "loss_aux_layer_12": 0.0947265625, "loss_aux_layer_13": 0.101806640625, "loss_aux_layer_14": 0.1126708984375, "loss_aux_layer_15": 0.123046875, "loss_aux_layer_16": 0.1337890625, "loss_aux_layer_17": 0.141845703125, "loss_aux_layer_18": 0.15087890625, "loss_aux_layer_19": 0.153564453125, "loss_aux_layer_2": 0.0682373046875, "loss_aux_layer_20": 0.16015625, "loss_aux_layer_21": 0.166748046875, "loss_aux_layer_22": 0.1884765625, "loss_aux_layer_23": 0.229248046875, "loss_aux_layer_3": 0.0797119140625, "loss_aux_layer_4": 0.0830078125, "loss_aux_layer_5": 0.0849609375, "loss_aux_layer_6": 0.088134765625, "loss_aux_layer_7": 0.0845947265625, "loss_aux_layer_8": 0.08349609375, "loss_aux_layer_9": 0.08203125, "step": 1522, "total_loss": 0.7074189335107803 }, { "epoch": 0.3015244506038408, "grad_norm": 1.0191693305969238, "learning_rate": 5e-05, "llm_loss": 0.5828668624162674, "loss": 2.7361, "loss_aux_layer_0": 0.022674560546875, "loss_aux_layer_1": 0.0478515625, "loss_aux_layer_10": 0.072998046875, "loss_aux_layer_11": 0.0772705078125, "loss_aux_layer_12": 0.083740234375, "loss_aux_layer_13": 0.0904541015625, "loss_aux_layer_14": 0.101318359375, "loss_aux_layer_15": 0.11181640625, "loss_aux_layer_16": 0.1226806640625, "loss_aux_layer_17": 0.131591796875, "loss_aux_layer_18": 0.141357421875, "loss_aux_layer_19": 0.145263671875, "loss_aux_layer_2": 0.05792236328125, "loss_aux_layer_20": 0.1533203125, "loss_aux_layer_21": 0.16015625, "loss_aux_layer_22": 0.1796875, "loss_aux_layer_23": 0.220458984375, "loss_aux_layer_3": 0.0684814453125, "loss_aux_layer_4": 0.0712890625, "loss_aux_layer_5": 0.073486328125, "loss_aux_layer_6": 0.0765380859375, "loss_aux_layer_7": 0.0733642578125, "loss_aux_layer_8": 0.0723876953125, "loss_aux_layer_9": 0.071533203125, "step": 1523, "total_loss": 0.6840194463729858 }, { "epoch": 0.30172243120174225, "grad_norm": 2.070026159286499, "learning_rate": 5e-05, "llm_loss": 0.6346209049224854, "loss": 2.9739, "loss_aux_layer_0": 0.02264404296875, "loss_aux_layer_1": 0.05364990234375, "loss_aux_layer_10": 0.08154296875, "loss_aux_layer_11": 0.086669921875, "loss_aux_layer_12": 0.0927734375, "loss_aux_layer_13": 0.0997314453125, "loss_aux_layer_14": 0.1102294921875, "loss_aux_layer_15": 0.12060546875, "loss_aux_layer_16": 0.132080078125, "loss_aux_layer_17": 0.1396484375, "loss_aux_layer_18": 0.14794921875, "loss_aux_layer_19": 0.15087890625, "loss_aux_layer_2": 0.065673828125, "loss_aux_layer_20": 0.157470703125, "loss_aux_layer_21": 0.16455078125, "loss_aux_layer_22": 0.186279296875, "loss_aux_layer_23": 0.226806640625, "loss_aux_layer_3": 0.077392578125, "loss_aux_layer_4": 0.0810546875, "loss_aux_layer_5": 0.0830078125, "loss_aux_layer_6": 0.0858154296875, "loss_aux_layer_7": 0.0828857421875, "loss_aux_layer_8": 0.081787109375, "loss_aux_layer_9": 0.0804443359375, "step": 1524, "total_loss": 0.7434691786766052 }, { "epoch": 0.30192041179964363, "grad_norm": 1.6251798868179321, "learning_rate": 5e-05, "llm_loss": 0.615864634513855, "loss": 2.8912, "loss_aux_layer_0": 0.021881103515625, "loss_aux_layer_1": 0.054443359375, "loss_aux_layer_10": 0.080322265625, "loss_aux_layer_11": 0.0850830078125, "loss_aux_layer_12": 0.0911865234375, "loss_aux_layer_13": 0.098388671875, "loss_aux_layer_14": 0.1085205078125, "loss_aux_layer_15": 0.118408203125, "loss_aux_layer_16": 0.12890625, "loss_aux_layer_17": 0.13671875, "loss_aux_layer_18": 0.145751953125, "loss_aux_layer_19": 0.1484375, "loss_aux_layer_2": 0.0648193359375, "loss_aux_layer_20": 0.154541015625, "loss_aux_layer_21": 0.1611328125, "loss_aux_layer_22": 0.182373046875, "loss_aux_layer_23": 0.2216796875, "loss_aux_layer_3": 0.0762939453125, "loss_aux_layer_4": 0.0799560546875, "loss_aux_layer_5": 0.081787109375, "loss_aux_layer_6": 0.0848388671875, "loss_aux_layer_7": 0.081787109375, "loss_aux_layer_8": 0.08056640625, "loss_aux_layer_9": 0.0789794921875, "step": 1525, "total_loss": 0.7228020429611206 }, { "epoch": 0.30211839239754507, "grad_norm": 1.5100831985473633, "learning_rate": 5e-05, "llm_loss": 0.5379632413387299, "loss": 2.5924, "loss_aux_layer_0": 0.02325439453125, "loss_aux_layer_1": 0.0550537109375, "loss_aux_layer_10": 0.0833740234375, "loss_aux_layer_11": 0.0887451171875, "loss_aux_layer_12": 0.0948486328125, "loss_aux_layer_13": 0.1015625, "loss_aux_layer_14": 0.112548828125, "loss_aux_layer_15": 0.1231689453125, "loss_aux_layer_16": 0.1337890625, "loss_aux_layer_17": 0.141357421875, "loss_aux_layer_18": 0.150146484375, "loss_aux_layer_19": 0.15185546875, "loss_aux_layer_2": 0.0665283203125, "loss_aux_layer_20": 0.158447265625, "loss_aux_layer_21": 0.164794921875, "loss_aux_layer_22": 0.185791015625, "loss_aux_layer_23": 0.22607421875, "loss_aux_layer_3": 0.0791015625, "loss_aux_layer_4": 0.082275390625, "loss_aux_layer_5": 0.084228515625, "loss_aux_layer_6": 0.0872802734375, "loss_aux_layer_7": 0.0841064453125, "loss_aux_layer_8": 0.0831298828125, "loss_aux_layer_9": 0.081787109375, "step": 1526, "total_loss": 0.6480972170829773 }, { "epoch": 0.30231637299544645, "grad_norm": 1.1527570486068726, "learning_rate": 5e-05, "llm_loss": 0.5896675735712051, "loss": 2.802, "loss_aux_layer_0": 0.024444580078125, "loss_aux_layer_1": 0.05609130859375, "loss_aux_layer_10": 0.0838623046875, "loss_aux_layer_11": 0.0897216796875, "loss_aux_layer_12": 0.095947265625, "loss_aux_layer_13": 0.1029052734375, "loss_aux_layer_14": 0.1136474609375, "loss_aux_layer_15": 0.124267578125, "loss_aux_layer_16": 0.135009765625, "loss_aux_layer_17": 0.142578125, "loss_aux_layer_18": 0.151611328125, "loss_aux_layer_19": 0.152587890625, "loss_aux_layer_2": 0.0667724609375, "loss_aux_layer_20": 0.158935546875, "loss_aux_layer_21": 0.1650390625, "loss_aux_layer_22": 0.186279296875, "loss_aux_layer_23": 0.2265625, "loss_aux_layer_3": 0.0789794921875, "loss_aux_layer_4": 0.08251953125, "loss_aux_layer_5": 0.08447265625, "loss_aux_layer_6": 0.0875244140625, "loss_aux_layer_7": 0.08447265625, "loss_aux_layer_8": 0.0836181640625, "loss_aux_layer_9": 0.082275390625, "step": 1527, "total_loss": 0.700506329536438 }, { "epoch": 0.30251435359334783, "grad_norm": 0.8653295636177063, "learning_rate": 5e-05, "llm_loss": 0.6156217753887177, "loss": 2.899, "loss_aux_layer_0": 0.02490234375, "loss_aux_layer_1": 0.055419921875, "loss_aux_layer_10": 0.081787109375, "loss_aux_layer_11": 0.086669921875, "loss_aux_layer_12": 0.0927734375, "loss_aux_layer_13": 0.099609375, "loss_aux_layer_14": 0.1103515625, "loss_aux_layer_15": 0.1207275390625, "loss_aux_layer_16": 0.131591796875, "loss_aux_layer_17": 0.14013671875, "loss_aux_layer_18": 0.14892578125, "loss_aux_layer_19": 0.15234375, "loss_aux_layer_2": 0.0650634765625, "loss_aux_layer_20": 0.158447265625, "loss_aux_layer_21": 0.166015625, "loss_aux_layer_22": 0.188232421875, "loss_aux_layer_23": 0.22802734375, "loss_aux_layer_3": 0.0770263671875, "loss_aux_layer_4": 0.080078125, "loss_aux_layer_5": 0.0823974609375, "loss_aux_layer_6": 0.08544921875, "loss_aux_layer_7": 0.08251953125, "loss_aux_layer_8": 0.0811767578125, "loss_aux_layer_9": 0.0802001953125, "step": 1528, "total_loss": 0.72475565969944 }, { "epoch": 0.30271233419124927, "grad_norm": 1.3928821086883545, "learning_rate": 5e-05, "llm_loss": 0.5898633897304535, "loss": 2.7728, "loss_aux_layer_0": 0.023223876953125, "loss_aux_layer_1": 0.04949951171875, "loss_aux_layer_10": 0.074462890625, "loss_aux_layer_11": 0.0789794921875, "loss_aux_layer_12": 0.0849609375, "loss_aux_layer_13": 0.0919189453125, "loss_aux_layer_14": 0.103515625, "loss_aux_layer_15": 0.114013671875, "loss_aux_layer_16": 0.1256103515625, "loss_aux_layer_17": 0.134033203125, "loss_aux_layer_18": 0.14404296875, "loss_aux_layer_19": 0.1484375, "loss_aux_layer_2": 0.06048583984375, "loss_aux_layer_20": 0.15625, "loss_aux_layer_21": 0.16357421875, "loss_aux_layer_22": 0.18359375, "loss_aux_layer_23": 0.223876953125, "loss_aux_layer_3": 0.07080078125, "loss_aux_layer_4": 0.0733642578125, "loss_aux_layer_5": 0.0751953125, "loss_aux_layer_6": 0.0777587890625, "loss_aux_layer_7": 0.0748291015625, "loss_aux_layer_8": 0.073974609375, "loss_aux_layer_9": 0.0731201171875, "step": 1529, "total_loss": 0.6931914985179901 }, { "epoch": 0.30291031478915065, "grad_norm": 0.9802279472351074, "learning_rate": 5e-05, "llm_loss": 0.5479738414287567, "loss": 2.6336, "loss_aux_layer_0": 0.0245361328125, "loss_aux_layer_1": 0.0556640625, "loss_aux_layer_10": 0.0833740234375, "loss_aux_layer_11": 0.0887451171875, "loss_aux_layer_12": 0.0948486328125, "loss_aux_layer_13": 0.1016845703125, "loss_aux_layer_14": 0.112548828125, "loss_aux_layer_15": 0.122314453125, "loss_aux_layer_16": 0.1324462890625, "loss_aux_layer_17": 0.139892578125, "loss_aux_layer_18": 0.149169921875, "loss_aux_layer_19": 0.1513671875, "loss_aux_layer_2": 0.06732177734375, "loss_aux_layer_20": 0.158203125, "loss_aux_layer_21": 0.166259765625, "loss_aux_layer_22": 0.189208984375, "loss_aux_layer_23": 0.22998046875, "loss_aux_layer_3": 0.079345703125, "loss_aux_layer_4": 0.0823974609375, "loss_aux_layer_5": 0.0843505859375, "loss_aux_layer_6": 0.08740234375, "loss_aux_layer_7": 0.083984375, "loss_aux_layer_8": 0.083251953125, "loss_aux_layer_9": 0.0819091796875, "step": 1530, "total_loss": 0.6583970636129379 }, { "epoch": 0.3031082953870521, "grad_norm": 1.3126939535140991, "learning_rate": 5e-05, "llm_loss": 0.6239206492900848, "loss": 2.9141, "loss_aux_layer_0": 0.022125244140625, "loss_aux_layer_1": 0.05157470703125, "loss_aux_layer_10": 0.0784912109375, "loss_aux_layer_11": 0.0836181640625, "loss_aux_layer_12": 0.0892333984375, "loss_aux_layer_13": 0.0958251953125, "loss_aux_layer_14": 0.1058349609375, "loss_aux_layer_15": 0.1162109375, "loss_aux_layer_16": 0.1263427734375, "loss_aux_layer_17": 0.13427734375, "loss_aux_layer_18": 0.142333984375, "loss_aux_layer_19": 0.144775390625, "loss_aux_layer_2": 0.063720703125, "loss_aux_layer_20": 0.15185546875, "loss_aux_layer_21": 0.158935546875, "loss_aux_layer_22": 0.17919921875, "loss_aux_layer_23": 0.218017578125, "loss_aux_layer_3": 0.0750732421875, "loss_aux_layer_4": 0.0780029296875, "loss_aux_layer_5": 0.0797119140625, "loss_aux_layer_6": 0.082275390625, "loss_aux_layer_7": 0.078857421875, "loss_aux_layer_8": 0.078125, "loss_aux_layer_9": 0.076904296875, "step": 1531, "total_loss": 0.7285300046205521 }, { "epoch": 0.30330627598495347, "grad_norm": 1.573298692703247, "learning_rate": 5e-05, "llm_loss": 0.6432210355997086, "loss": 3.0041, "loss_aux_layer_0": 0.022064208984375, "loss_aux_layer_1": 0.0526123046875, "loss_aux_layer_10": 0.0810546875, "loss_aux_layer_11": 0.086181640625, "loss_aux_layer_12": 0.0924072265625, "loss_aux_layer_13": 0.0994873046875, "loss_aux_layer_14": 0.1099853515625, "loss_aux_layer_15": 0.1201171875, "loss_aux_layer_16": 0.130859375, "loss_aux_layer_17": 0.138671875, "loss_aux_layer_18": 0.147216796875, "loss_aux_layer_19": 0.149658203125, "loss_aux_layer_2": 0.06390380859375, "loss_aux_layer_20": 0.15625, "loss_aux_layer_21": 0.164306640625, "loss_aux_layer_22": 0.18505859375, "loss_aux_layer_23": 0.225830078125, "loss_aux_layer_3": 0.0760498046875, "loss_aux_layer_4": 0.0792236328125, "loss_aux_layer_5": 0.08154296875, "loss_aux_layer_6": 0.084716796875, "loss_aux_layer_7": 0.0814208984375, "loss_aux_layer_8": 0.080322265625, "loss_aux_layer_9": 0.07958984375, "step": 1532, "total_loss": 0.7510302811861038 }, { "epoch": 0.3035042565828549, "grad_norm": 1.1454766988754272, "learning_rate": 5e-05, "llm_loss": 0.6261947154998779, "loss": 2.921, "loss_aux_layer_0": 0.025390625, "loss_aux_layer_1": 0.0506591796875, "loss_aux_layer_10": 0.076904296875, "loss_aux_layer_11": 0.0816650390625, "loss_aux_layer_12": 0.0875244140625, "loss_aux_layer_13": 0.09423828125, "loss_aux_layer_14": 0.105224609375, "loss_aux_layer_15": 0.1153564453125, "loss_aux_layer_16": 0.12646484375, "loss_aux_layer_17": 0.134521484375, "loss_aux_layer_18": 0.1435546875, "loss_aux_layer_19": 0.146728515625, "loss_aux_layer_2": 0.06134033203125, "loss_aux_layer_20": 0.1533203125, "loss_aux_layer_21": 0.16064453125, "loss_aux_layer_22": 0.180419921875, "loss_aux_layer_23": 0.220458984375, "loss_aux_layer_3": 0.072265625, "loss_aux_layer_4": 0.0750732421875, "loss_aux_layer_5": 0.0770263671875, "loss_aux_layer_6": 0.080078125, "loss_aux_layer_7": 0.0770263671875, "loss_aux_layer_8": 0.0762939453125, "loss_aux_layer_9": 0.0755615234375, "step": 1533, "total_loss": 0.7302596718072891 }, { "epoch": 0.3037022371807563, "grad_norm": 1.4401743412017822, "learning_rate": 5e-05, "llm_loss": 0.6200275421142578, "loss": 2.909, "loss_aux_layer_0": 0.02398681640625, "loss_aux_layer_1": 0.052734375, "loss_aux_layer_10": 0.078857421875, "loss_aux_layer_11": 0.083740234375, "loss_aux_layer_12": 0.089599609375, "loss_aux_layer_13": 0.0968017578125, "loss_aux_layer_14": 0.107421875, "loss_aux_layer_15": 0.1182861328125, "loss_aux_layer_16": 0.12890625, "loss_aux_layer_17": 0.13671875, "loss_aux_layer_18": 0.146484375, "loss_aux_layer_19": 0.1494140625, "loss_aux_layer_2": 0.06365966796875, "loss_aux_layer_20": 0.15673828125, "loss_aux_layer_21": 0.166259765625, "loss_aux_layer_22": 0.18896484375, "loss_aux_layer_23": 0.23046875, "loss_aux_layer_3": 0.075439453125, "loss_aux_layer_4": 0.0784912109375, "loss_aux_layer_5": 0.08056640625, "loss_aux_layer_6": 0.083251953125, "loss_aux_layer_7": 0.080322265625, "loss_aux_layer_8": 0.0789794921875, "loss_aux_layer_9": 0.0777587890625, "step": 1534, "total_loss": 0.7272472381591797 }, { "epoch": 0.30390021777865767, "grad_norm": 1.6004180908203125, "learning_rate": 5e-05, "llm_loss": 0.6668702811002731, "loss": 3.0991, "loss_aux_layer_0": 0.023529052734375, "loss_aux_layer_1": 0.05548095703125, "loss_aux_layer_10": 0.081298828125, "loss_aux_layer_11": 0.08642578125, "loss_aux_layer_12": 0.0924072265625, "loss_aux_layer_13": 0.0992431640625, "loss_aux_layer_14": 0.109375, "loss_aux_layer_15": 0.1192626953125, "loss_aux_layer_16": 0.1295166015625, "loss_aux_layer_17": 0.137451171875, "loss_aux_layer_18": 0.145751953125, "loss_aux_layer_19": 0.14794921875, "loss_aux_layer_2": 0.0665283203125, "loss_aux_layer_20": 0.15478515625, "loss_aux_layer_21": 0.161376953125, "loss_aux_layer_22": 0.182861328125, "loss_aux_layer_23": 0.221923828125, "loss_aux_layer_3": 0.07861328125, "loss_aux_layer_4": 0.0816650390625, "loss_aux_layer_5": 0.08349609375, "loss_aux_layer_6": 0.086181640625, "loss_aux_layer_7": 0.082763671875, "loss_aux_layer_8": 0.0816650390625, "loss_aux_layer_9": 0.080078125, "step": 1535, "total_loss": 0.7747653424739838 }, { "epoch": 0.3040981983765591, "grad_norm": 1.297401785850525, "learning_rate": 5e-05, "llm_loss": 0.6930228024721146, "loss": 3.1864, "loss_aux_layer_0": 0.02239990234375, "loss_aux_layer_1": 0.05047607421875, "loss_aux_layer_10": 0.0771484375, "loss_aux_layer_11": 0.08154296875, "loss_aux_layer_12": 0.0872802734375, "loss_aux_layer_13": 0.0938720703125, "loss_aux_layer_14": 0.103759765625, "loss_aux_layer_15": 0.113525390625, "loss_aux_layer_16": 0.1241455078125, "loss_aux_layer_17": 0.13232421875, "loss_aux_layer_18": 0.140869140625, "loss_aux_layer_19": 0.14404296875, "loss_aux_layer_2": 0.06182861328125, "loss_aux_layer_20": 0.15185546875, "loss_aux_layer_21": 0.15869140625, "loss_aux_layer_22": 0.18017578125, "loss_aux_layer_23": 0.2197265625, "loss_aux_layer_3": 0.0736083984375, "loss_aux_layer_4": 0.076904296875, "loss_aux_layer_5": 0.0787353515625, "loss_aux_layer_6": 0.0814208984375, "loss_aux_layer_7": 0.0784912109375, "loss_aux_layer_8": 0.077392578125, "loss_aux_layer_9": 0.0755615234375, "step": 1536, "total_loss": 0.7965910285711288 }, { "epoch": 0.3042961789744605, "grad_norm": 1.0537859201431274, "learning_rate": 5e-05, "llm_loss": 0.6144625544548035, "loss": 2.884, "loss_aux_layer_0": 0.023193359375, "loss_aux_layer_1": 0.05377197265625, "loss_aux_layer_10": 0.0797119140625, "loss_aux_layer_11": 0.084716796875, "loss_aux_layer_12": 0.0911865234375, "loss_aux_layer_13": 0.097900390625, "loss_aux_layer_14": 0.1082763671875, "loss_aux_layer_15": 0.1182861328125, "loss_aux_layer_16": 0.12841796875, "loss_aux_layer_17": 0.135986328125, "loss_aux_layer_18": 0.14501953125, "loss_aux_layer_19": 0.147216796875, "loss_aux_layer_2": 0.06378173828125, "loss_aux_layer_20": 0.15478515625, "loss_aux_layer_21": 0.16162109375, "loss_aux_layer_22": 0.183837890625, "loss_aux_layer_23": 0.22412109375, "loss_aux_layer_3": 0.07568359375, "loss_aux_layer_4": 0.07861328125, "loss_aux_layer_5": 0.080078125, "loss_aux_layer_6": 0.0830078125, "loss_aux_layer_7": 0.0802001953125, "loss_aux_layer_8": 0.079345703125, "loss_aux_layer_9": 0.0780029296875, "step": 1537, "total_loss": 0.7210035175085068 }, { "epoch": 0.3044941595723619, "grad_norm": 1.274925947189331, "learning_rate": 5e-05, "llm_loss": 0.6653977632522583, "loss": 3.0866, "loss_aux_layer_0": 0.023712158203125, "loss_aux_layer_1": 0.05010986328125, "loss_aux_layer_10": 0.078857421875, "loss_aux_layer_11": 0.0836181640625, "loss_aux_layer_12": 0.0892333984375, "loss_aux_layer_13": 0.096435546875, "loss_aux_layer_14": 0.107421875, "loss_aux_layer_15": 0.117919921875, "loss_aux_layer_16": 0.1285400390625, "loss_aux_layer_17": 0.137451171875, "loss_aux_layer_18": 0.146240234375, "loss_aux_layer_19": 0.148681640625, "loss_aux_layer_2": 0.06219482421875, "loss_aux_layer_20": 0.156494140625, "loss_aux_layer_21": 0.16455078125, "loss_aux_layer_22": 0.185302734375, "loss_aux_layer_23": 0.225830078125, "loss_aux_layer_3": 0.0736083984375, "loss_aux_layer_4": 0.0771484375, "loss_aux_layer_5": 0.079345703125, "loss_aux_layer_6": 0.0823974609375, "loss_aux_layer_7": 0.0792236328125, "loss_aux_layer_8": 0.0784912109375, "loss_aux_layer_9": 0.077392578125, "step": 1538, "total_loss": 0.7716549038887024 }, { "epoch": 0.3046921401702633, "grad_norm": 1.2727811336517334, "learning_rate": 5e-05, "llm_loss": 0.5304118990898132, "loss": 2.5431, "loss_aux_layer_0": 0.022918701171875, "loss_aux_layer_1": 0.05084228515625, "loss_aux_layer_10": 0.077880859375, "loss_aux_layer_11": 0.082763671875, "loss_aux_layer_12": 0.088623046875, "loss_aux_layer_13": 0.095458984375, "loss_aux_layer_14": 0.1063232421875, "loss_aux_layer_15": 0.11669921875, "loss_aux_layer_16": 0.127197265625, "loss_aux_layer_17": 0.135498046875, "loss_aux_layer_18": 0.144775390625, "loss_aux_layer_19": 0.1474609375, "loss_aux_layer_2": 0.06231689453125, "loss_aux_layer_20": 0.15576171875, "loss_aux_layer_21": 0.16357421875, "loss_aux_layer_22": 0.183837890625, "loss_aux_layer_23": 0.22412109375, "loss_aux_layer_3": 0.0736083984375, "loss_aux_layer_4": 0.07666015625, "loss_aux_layer_5": 0.07861328125, "loss_aux_layer_6": 0.081298828125, "loss_aux_layer_7": 0.078125, "loss_aux_layer_8": 0.077392578125, "loss_aux_layer_9": 0.07666015625, "step": 1539, "total_loss": 0.6357862204313278 }, { "epoch": 0.30489012076816474, "grad_norm": 1.2150704860687256, "learning_rate": 5e-05, "llm_loss": 0.5739014148712158, "loss": 2.7256, "loss_aux_layer_0": 0.0245361328125, "loss_aux_layer_1": 0.053466796875, "loss_aux_layer_10": 0.0797119140625, "loss_aux_layer_11": 0.0850830078125, "loss_aux_layer_12": 0.0911865234375, "loss_aux_layer_13": 0.098388671875, "loss_aux_layer_14": 0.1094970703125, "loss_aux_layer_15": 0.1199951171875, "loss_aux_layer_16": 0.131103515625, "loss_aux_layer_17": 0.1396484375, "loss_aux_layer_18": 0.1484375, "loss_aux_layer_19": 0.151611328125, "loss_aux_layer_2": 0.062744140625, "loss_aux_layer_20": 0.158447265625, "loss_aux_layer_21": 0.16455078125, "loss_aux_layer_22": 0.185791015625, "loss_aux_layer_23": 0.224609375, "loss_aux_layer_3": 0.0745849609375, "loss_aux_layer_4": 0.0775146484375, "loss_aux_layer_5": 0.079345703125, "loss_aux_layer_6": 0.0828857421875, "loss_aux_layer_7": 0.0797119140625, "loss_aux_layer_8": 0.0791015625, "loss_aux_layer_9": 0.078125, "step": 1540, "total_loss": 0.6813988089561462 }, { "epoch": 0.3050881013660661, "grad_norm": 1.2110635042190552, "learning_rate": 5e-05, "llm_loss": 0.6041911542415619, "loss": 2.8447, "loss_aux_layer_0": 0.023223876953125, "loss_aux_layer_1": 0.0531005859375, "loss_aux_layer_10": 0.0804443359375, "loss_aux_layer_11": 0.0855712890625, "loss_aux_layer_12": 0.0911865234375, "loss_aux_layer_13": 0.098388671875, "loss_aux_layer_14": 0.108642578125, "loss_aux_layer_15": 0.1187744140625, "loss_aux_layer_16": 0.1295166015625, "loss_aux_layer_17": 0.13720703125, "loss_aux_layer_18": 0.146240234375, "loss_aux_layer_19": 0.148681640625, "loss_aux_layer_2": 0.064453125, "loss_aux_layer_20": 0.155517578125, "loss_aux_layer_21": 0.162109375, "loss_aux_layer_22": 0.18310546875, "loss_aux_layer_23": 0.222412109375, "loss_aux_layer_3": 0.0758056640625, "loss_aux_layer_4": 0.0791015625, "loss_aux_layer_5": 0.0810546875, "loss_aux_layer_6": 0.0841064453125, "loss_aux_layer_7": 0.0810546875, "loss_aux_layer_8": 0.0799560546875, "loss_aux_layer_9": 0.0787353515625, "step": 1541, "total_loss": 0.7111751735210419 }, { "epoch": 0.3052860819639675, "grad_norm": 1.1690149307250977, "learning_rate": 5e-05, "llm_loss": 0.5999269783496857, "loss": 2.8254, "loss_aux_layer_0": 0.02386474609375, "loss_aux_layer_1": 0.0523681640625, "loss_aux_layer_10": 0.0791015625, "loss_aux_layer_11": 0.083984375, "loss_aux_layer_12": 0.0906982421875, "loss_aux_layer_13": 0.0977783203125, "loss_aux_layer_14": 0.1083984375, "loss_aux_layer_15": 0.1187744140625, "loss_aux_layer_16": 0.129150390625, "loss_aux_layer_17": 0.13671875, "loss_aux_layer_18": 0.14599609375, "loss_aux_layer_19": 0.148193359375, "loss_aux_layer_2": 0.0631103515625, "loss_aux_layer_20": 0.155517578125, "loss_aux_layer_21": 0.162109375, "loss_aux_layer_22": 0.183349609375, "loss_aux_layer_23": 0.222412109375, "loss_aux_layer_3": 0.0750732421875, "loss_aux_layer_4": 0.078369140625, "loss_aux_layer_5": 0.0799560546875, "loss_aux_layer_6": 0.0828857421875, "loss_aux_layer_7": 0.079833984375, "loss_aux_layer_8": 0.078857421875, "loss_aux_layer_9": 0.0775146484375, "step": 1542, "total_loss": 0.706344947218895 }, { "epoch": 0.30548406256186894, "grad_norm": 1.2792423963546753, "learning_rate": 5e-05, "llm_loss": 0.6576664745807648, "loss": 3.0606, "loss_aux_layer_0": 0.0252685546875, "loss_aux_layer_1": 0.0550537109375, "loss_aux_layer_10": 0.080810546875, "loss_aux_layer_11": 0.0858154296875, "loss_aux_layer_12": 0.091552734375, "loss_aux_layer_13": 0.098388671875, "loss_aux_layer_14": 0.108642578125, "loss_aux_layer_15": 0.1182861328125, "loss_aux_layer_16": 0.12841796875, "loss_aux_layer_17": 0.13623046875, "loss_aux_layer_18": 0.14453125, "loss_aux_layer_19": 0.146728515625, "loss_aux_layer_2": 0.06585693359375, "loss_aux_layer_20": 0.15380859375, "loss_aux_layer_21": 0.162353515625, "loss_aux_layer_22": 0.18505859375, "loss_aux_layer_23": 0.225341796875, "loss_aux_layer_3": 0.0780029296875, "loss_aux_layer_4": 0.080810546875, "loss_aux_layer_5": 0.082763671875, "loss_aux_layer_6": 0.085693359375, "loss_aux_layer_7": 0.08203125, "loss_aux_layer_8": 0.0806884765625, "loss_aux_layer_9": 0.079345703125, "step": 1543, "total_loss": 0.7651601880788803 }, { "epoch": 0.3056820431597703, "grad_norm": 1.2047544717788696, "learning_rate": 5e-05, "llm_loss": 0.6706185340881348, "loss": 3.1069, "loss_aux_layer_0": 0.02398681640625, "loss_aux_layer_1": 0.05242919921875, "loss_aux_layer_10": 0.079345703125, "loss_aux_layer_11": 0.084228515625, "loss_aux_layer_12": 0.090576171875, "loss_aux_layer_13": 0.09765625, "loss_aux_layer_14": 0.1085205078125, "loss_aux_layer_15": 0.1187744140625, "loss_aux_layer_16": 0.129150390625, "loss_aux_layer_17": 0.13720703125, "loss_aux_layer_18": 0.14501953125, "loss_aux_layer_19": 0.147216796875, "loss_aux_layer_2": 0.0625, "loss_aux_layer_20": 0.154052734375, "loss_aux_layer_21": 0.1611328125, "loss_aux_layer_22": 0.182861328125, "loss_aux_layer_23": 0.22314453125, "loss_aux_layer_3": 0.073974609375, "loss_aux_layer_4": 0.0772705078125, "loss_aux_layer_5": 0.079345703125, "loss_aux_layer_6": 0.0826416015625, "loss_aux_layer_7": 0.0797119140625, "loss_aux_layer_8": 0.07861328125, "loss_aux_layer_9": 0.0775146484375, "step": 1544, "total_loss": 0.7767343819141388 }, { "epoch": 0.30588002375767176, "grad_norm": 1.386101245880127, "learning_rate": 5e-05, "llm_loss": 0.6587212532758713, "loss": 3.0546, "loss_aux_layer_0": 0.02301025390625, "loss_aux_layer_1": 0.0517578125, "loss_aux_layer_10": 0.0787353515625, "loss_aux_layer_11": 0.08349609375, "loss_aux_layer_12": 0.089599609375, "loss_aux_layer_13": 0.0963134765625, "loss_aux_layer_14": 0.106689453125, "loss_aux_layer_15": 0.116943359375, "loss_aux_layer_16": 0.127685546875, "loss_aux_layer_17": 0.1357421875, "loss_aux_layer_18": 0.144287109375, "loss_aux_layer_19": 0.14697265625, "loss_aux_layer_2": 0.06268310546875, "loss_aux_layer_20": 0.153076171875, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.177734375, "loss_aux_layer_23": 0.21533203125, "loss_aux_layer_3": 0.0743408203125, "loss_aux_layer_4": 0.07763671875, "loss_aux_layer_5": 0.0797119140625, "loss_aux_layer_6": 0.0826416015625, "loss_aux_layer_7": 0.079345703125, "loss_aux_layer_8": 0.0782470703125, "loss_aux_layer_9": 0.0771484375, "step": 1545, "total_loss": 0.7636592835187912 }, { "epoch": 0.30607800435557314, "grad_norm": 1.0472662448883057, "learning_rate": 5e-05, "llm_loss": 0.5719125792384148, "loss": 2.7213, "loss_aux_layer_0": 0.022613525390625, "loss_aux_layer_1": 0.0545654296875, "loss_aux_layer_10": 0.0819091796875, "loss_aux_layer_11": 0.0870361328125, "loss_aux_layer_12": 0.0927734375, "loss_aux_layer_13": 0.0999755859375, "loss_aux_layer_14": 0.110107421875, "loss_aux_layer_15": 0.1207275390625, "loss_aux_layer_16": 0.1312255859375, "loss_aux_layer_17": 0.138671875, "loss_aux_layer_18": 0.14794921875, "loss_aux_layer_19": 0.14990234375, "loss_aux_layer_2": 0.0665283203125, "loss_aux_layer_20": 0.15625, "loss_aux_layer_21": 0.16259765625, "loss_aux_layer_22": 0.18212890625, "loss_aux_layer_23": 0.220703125, "loss_aux_layer_3": 0.0784912109375, "loss_aux_layer_4": 0.081787109375, "loss_aux_layer_5": 0.0836181640625, "loss_aux_layer_6": 0.08642578125, "loss_aux_layer_7": 0.0833740234375, "loss_aux_layer_8": 0.0819091796875, "loss_aux_layer_9": 0.08056640625, "step": 1546, "total_loss": 0.6803143471479416 }, { "epoch": 0.3062759849534746, "grad_norm": 1.0872986316680908, "learning_rate": 5e-05, "llm_loss": 0.5576419681310654, "loss": 2.6523, "loss_aux_layer_0": 0.023773193359375, "loss_aux_layer_1": 0.05145263671875, "loss_aux_layer_10": 0.077392578125, "loss_aux_layer_11": 0.08251953125, "loss_aux_layer_12": 0.08837890625, "loss_aux_layer_13": 0.0955810546875, "loss_aux_layer_14": 0.1063232421875, "loss_aux_layer_15": 0.1170654296875, "loss_aux_layer_16": 0.12841796875, "loss_aux_layer_17": 0.13623046875, "loss_aux_layer_18": 0.1455078125, "loss_aux_layer_19": 0.148193359375, "loss_aux_layer_2": 0.06182861328125, "loss_aux_layer_20": 0.155517578125, "loss_aux_layer_21": 0.16357421875, "loss_aux_layer_22": 0.1845703125, "loss_aux_layer_23": 0.224853515625, "loss_aux_layer_3": 0.0732421875, "loss_aux_layer_4": 0.0762939453125, "loss_aux_layer_5": 0.078125, "loss_aux_layer_6": 0.0809326171875, "loss_aux_layer_7": 0.077880859375, "loss_aux_layer_8": 0.0767822265625, "loss_aux_layer_9": 0.0758056640625, "step": 1547, "total_loss": 0.6630647927522659 }, { "epoch": 0.30647396555137596, "grad_norm": 0.9406499862670898, "learning_rate": 5e-05, "llm_loss": 0.5984862297773361, "loss": 2.8218, "loss_aux_layer_0": 0.023468017578125, "loss_aux_layer_1": 0.05120849609375, "loss_aux_layer_10": 0.079833984375, "loss_aux_layer_11": 0.0845947265625, "loss_aux_layer_12": 0.0906982421875, "loss_aux_layer_13": 0.0977783203125, "loss_aux_layer_14": 0.109130859375, "loss_aux_layer_15": 0.119873046875, "loss_aux_layer_16": 0.130859375, "loss_aux_layer_17": 0.138427734375, "loss_aux_layer_18": 0.147216796875, "loss_aux_layer_19": 0.14892578125, "loss_aux_layer_2": 0.06280517578125, "loss_aux_layer_20": 0.15576171875, "loss_aux_layer_21": 0.1630859375, "loss_aux_layer_22": 0.183837890625, "loss_aux_layer_23": 0.224365234375, "loss_aux_layer_3": 0.074462890625, "loss_aux_layer_4": 0.078125, "loss_aux_layer_5": 0.0804443359375, "loss_aux_layer_6": 0.0836181640625, "loss_aux_layer_7": 0.0806884765625, "loss_aux_layer_8": 0.0797119140625, "loss_aux_layer_9": 0.078369140625, "step": 1548, "total_loss": 0.7054530829191208 }, { "epoch": 0.3066719461492774, "grad_norm": 1.6701502799987793, "learning_rate": 5e-05, "llm_loss": 0.5861008986830711, "loss": 2.7835, "loss_aux_layer_0": 0.02252197265625, "loss_aux_layer_1": 0.0526123046875, "loss_aux_layer_10": 0.0838623046875, "loss_aux_layer_11": 0.0889892578125, "loss_aux_layer_12": 0.0947265625, "loss_aux_layer_13": 0.10205078125, "loss_aux_layer_14": 0.1123046875, "loss_aux_layer_15": 0.1226806640625, "loss_aux_layer_16": 0.1328125, "loss_aux_layer_17": 0.14013671875, "loss_aux_layer_18": 0.149169921875, "loss_aux_layer_19": 0.15087890625, "loss_aux_layer_2": 0.0655517578125, "loss_aux_layer_20": 0.157958984375, "loss_aux_layer_21": 0.164794921875, "loss_aux_layer_22": 0.186279296875, "loss_aux_layer_23": 0.226806640625, "loss_aux_layer_3": 0.0780029296875, "loss_aux_layer_4": 0.0816650390625, "loss_aux_layer_5": 0.084228515625, "loss_aux_layer_6": 0.0872802734375, "loss_aux_layer_7": 0.08447265625, "loss_aux_layer_8": 0.0833740234375, "loss_aux_layer_9": 0.082275390625, "step": 1549, "total_loss": 0.6958777010440826 }, { "epoch": 0.3068699267471788, "grad_norm": 1.419435739517212, "learning_rate": 5e-05, "llm_loss": 0.5683064833283424, "loss": 2.7042, "loss_aux_layer_0": 0.02239990234375, "loss_aux_layer_1": 0.05419921875, "loss_aux_layer_10": 0.0819091796875, "loss_aux_layer_11": 0.0869140625, "loss_aux_layer_12": 0.0924072265625, "loss_aux_layer_13": 0.0994873046875, "loss_aux_layer_14": 0.1094970703125, "loss_aux_layer_15": 0.119873046875, "loss_aux_layer_16": 0.13037109375, "loss_aux_layer_17": 0.13818359375, "loss_aux_layer_18": 0.146728515625, "loss_aux_layer_19": 0.148681640625, "loss_aux_layer_2": 0.06646728515625, "loss_aux_layer_20": 0.1552734375, "loss_aux_layer_21": 0.160888671875, "loss_aux_layer_22": 0.180419921875, "loss_aux_layer_23": 0.218505859375, "loss_aux_layer_3": 0.0780029296875, "loss_aux_layer_4": 0.0814208984375, "loss_aux_layer_5": 0.0833740234375, "loss_aux_layer_6": 0.0859375, "loss_aux_layer_7": 0.0826416015625, "loss_aux_layer_8": 0.0816650390625, "loss_aux_layer_9": 0.0804443359375, "step": 1550, "total_loss": 0.6760557740926743 }, { "epoch": 0.30706790734508016, "grad_norm": 0.9954577088356018, "learning_rate": 5e-05, "llm_loss": 0.5889966636896133, "loss": 2.7773, "loss_aux_layer_0": 0.0225830078125, "loss_aux_layer_1": 0.05169677734375, "loss_aux_layer_10": 0.0782470703125, "loss_aux_layer_11": 0.0833740234375, "loss_aux_layer_12": 0.0894775390625, "loss_aux_layer_13": 0.0963134765625, "loss_aux_layer_14": 0.1065673828125, "loss_aux_layer_15": 0.116455078125, "loss_aux_layer_16": 0.1268310546875, "loss_aux_layer_17": 0.134765625, "loss_aux_layer_18": 0.144287109375, "loss_aux_layer_19": 0.14697265625, "loss_aux_layer_2": 0.0626220703125, "loss_aux_layer_20": 0.154052734375, "loss_aux_layer_21": 0.16162109375, "loss_aux_layer_22": 0.183349609375, "loss_aux_layer_23": 0.223876953125, "loss_aux_layer_3": 0.0738525390625, "loss_aux_layer_4": 0.0770263671875, "loss_aux_layer_5": 0.078857421875, "loss_aux_layer_6": 0.08154296875, "loss_aux_layer_7": 0.078857421875, "loss_aux_layer_8": 0.0780029296875, "loss_aux_layer_9": 0.0767822265625, "step": 1551, "total_loss": 0.6943265199661255 }, { "epoch": 0.3072658879429816, "grad_norm": 1.8395274877548218, "learning_rate": 5e-05, "llm_loss": 0.6993534862995148, "loss": 3.2327, "loss_aux_layer_0": 0.024932861328125, "loss_aux_layer_1": 0.05535888671875, "loss_aux_layer_10": 0.08203125, "loss_aux_layer_11": 0.087158203125, "loss_aux_layer_12": 0.09326171875, "loss_aux_layer_13": 0.099853515625, "loss_aux_layer_14": 0.1099853515625, "loss_aux_layer_15": 0.119873046875, "loss_aux_layer_16": 0.130859375, "loss_aux_layer_17": 0.13818359375, "loss_aux_layer_18": 0.146728515625, "loss_aux_layer_19": 0.149658203125, "loss_aux_layer_2": 0.06768798828125, "loss_aux_layer_20": 0.15625, "loss_aux_layer_21": 0.16259765625, "loss_aux_layer_22": 0.184814453125, "loss_aux_layer_23": 0.225341796875, "loss_aux_layer_3": 0.078857421875, "loss_aux_layer_4": 0.0819091796875, "loss_aux_layer_5": 0.08349609375, "loss_aux_layer_6": 0.0867919921875, "loss_aux_layer_7": 0.0833740234375, "loss_aux_layer_8": 0.0821533203125, "loss_aux_layer_9": 0.08056640625, "step": 1552, "total_loss": 0.8081812560558319 }, { "epoch": 0.307463868540883, "grad_norm": 1.6624553203582764, "learning_rate": 5e-05, "llm_loss": 0.6856556832790375, "loss": 3.15, "loss_aux_layer_0": 0.023040771484375, "loss_aux_layer_1": 0.04962158203125, "loss_aux_layer_10": 0.075439453125, "loss_aux_layer_11": 0.0804443359375, "loss_aux_layer_12": 0.0863037109375, "loss_aux_layer_13": 0.09326171875, "loss_aux_layer_14": 0.103759765625, "loss_aux_layer_15": 0.114013671875, "loss_aux_layer_16": 0.1241455078125, "loss_aux_layer_17": 0.132080078125, "loss_aux_layer_18": 0.140380859375, "loss_aux_layer_19": 0.142333984375, "loss_aux_layer_2": 0.05975341796875, "loss_aux_layer_20": 0.1494140625, "loss_aux_layer_21": 0.156005859375, "loss_aux_layer_22": 0.17626953125, "loss_aux_layer_23": 0.21435546875, "loss_aux_layer_3": 0.0711669921875, "loss_aux_layer_4": 0.0740966796875, "loss_aux_layer_5": 0.0758056640625, "loss_aux_layer_6": 0.0784912109375, "loss_aux_layer_7": 0.07568359375, "loss_aux_layer_8": 0.0751953125, "loss_aux_layer_9": 0.0740966796875, "step": 1553, "total_loss": 0.7875111401081085 }, { "epoch": 0.3076618491387844, "grad_norm": 2.213548183441162, "learning_rate": 5e-05, "llm_loss": 0.5994330048561096, "loss": 2.8227, "loss_aux_layer_0": 0.024505615234375, "loss_aux_layer_1": 0.05438232421875, "loss_aux_layer_10": 0.0789794921875, "loss_aux_layer_11": 0.0838623046875, "loss_aux_layer_12": 0.089599609375, "loss_aux_layer_13": 0.096435546875, "loss_aux_layer_14": 0.1068115234375, "loss_aux_layer_15": 0.116943359375, "loss_aux_layer_16": 0.1275634765625, "loss_aux_layer_17": 0.1357421875, "loss_aux_layer_18": 0.144775390625, "loss_aux_layer_19": 0.1474609375, "loss_aux_layer_2": 0.065185546875, "loss_aux_layer_20": 0.155029296875, "loss_aux_layer_21": 0.160888671875, "loss_aux_layer_22": 0.1826171875, "loss_aux_layer_23": 0.222412109375, "loss_aux_layer_3": 0.0753173828125, "loss_aux_layer_4": 0.0794677734375, "loss_aux_layer_5": 0.0811767578125, "loss_aux_layer_6": 0.083740234375, "loss_aux_layer_7": 0.08056640625, "loss_aux_layer_8": 0.0791015625, "loss_aux_layer_9": 0.0777587890625, "step": 1554, "total_loss": 0.7056843936443329 }, { "epoch": 0.3078598297366858, "grad_norm": 1.1429860591888428, "learning_rate": 5e-05, "llm_loss": 0.6910087019205093, "loss": 3.167, "loss_aux_layer_0": 0.022613525390625, "loss_aux_layer_1": 0.04925537109375, "loss_aux_layer_10": 0.07421875, "loss_aux_layer_11": 0.0787353515625, "loss_aux_layer_12": 0.0843505859375, "loss_aux_layer_13": 0.0911865234375, "loss_aux_layer_14": 0.1009521484375, "loss_aux_layer_15": 0.11083984375, "loss_aux_layer_16": 0.1214599609375, "loss_aux_layer_17": 0.12939453125, "loss_aux_layer_18": 0.138427734375, "loss_aux_layer_19": 0.1416015625, "loss_aux_layer_2": 0.0595703125, "loss_aux_layer_20": 0.1494140625, "loss_aux_layer_21": 0.15625, "loss_aux_layer_22": 0.175537109375, "loss_aux_layer_23": 0.214599609375, "loss_aux_layer_3": 0.0701904296875, "loss_aux_layer_4": 0.073486328125, "loss_aux_layer_5": 0.0753173828125, "loss_aux_layer_6": 0.0780029296875, "loss_aux_layer_7": 0.0750732421875, "loss_aux_layer_8": 0.0743408203125, "loss_aux_layer_9": 0.0732421875, "step": 1555, "total_loss": 0.791741356253624 }, { "epoch": 0.30805781033458723, "grad_norm": 2.2815299034118652, "learning_rate": 5e-05, "llm_loss": 0.6164816170930862, "loss": 2.8972, "loss_aux_layer_0": 0.0230712890625, "loss_aux_layer_1": 0.054443359375, "loss_aux_layer_10": 0.0806884765625, "loss_aux_layer_11": 0.085693359375, "loss_aux_layer_12": 0.0921630859375, "loss_aux_layer_13": 0.0992431640625, "loss_aux_layer_14": 0.10986328125, "loss_aux_layer_15": 0.1199951171875, "loss_aux_layer_16": 0.130126953125, "loss_aux_layer_17": 0.137939453125, "loss_aux_layer_18": 0.146240234375, "loss_aux_layer_19": 0.14794921875, "loss_aux_layer_2": 0.06646728515625, "loss_aux_layer_20": 0.155029296875, "loss_aux_layer_21": 0.161865234375, "loss_aux_layer_22": 0.184326171875, "loss_aux_layer_23": 0.22412109375, "loss_aux_layer_3": 0.077392578125, "loss_aux_layer_4": 0.0806884765625, "loss_aux_layer_5": 0.0826416015625, "loss_aux_layer_6": 0.085205078125, "loss_aux_layer_7": 0.08203125, "loss_aux_layer_8": 0.0806884765625, "loss_aux_layer_9": 0.079345703125, "step": 1556, "total_loss": 0.7242911458015442 }, { "epoch": 0.3082557909324886, "grad_norm": 1.8688362836837769, "learning_rate": 5e-05, "llm_loss": 0.6591728627681732, "loss": 3.0622, "loss_aux_layer_0": 0.023712158203125, "loss_aux_layer_1": 0.05267333984375, "loss_aux_layer_10": 0.0787353515625, "loss_aux_layer_11": 0.08349609375, "loss_aux_layer_12": 0.0897216796875, "loss_aux_layer_13": 0.0968017578125, "loss_aux_layer_14": 0.107666015625, "loss_aux_layer_15": 0.1177978515625, "loss_aux_layer_16": 0.1280517578125, "loss_aux_layer_17": 0.1357421875, "loss_aux_layer_18": 0.144775390625, "loss_aux_layer_19": 0.1474609375, "loss_aux_layer_2": 0.0650634765625, "loss_aux_layer_20": 0.155029296875, "loss_aux_layer_21": 0.162841796875, "loss_aux_layer_22": 0.18505859375, "loss_aux_layer_23": 0.224853515625, "loss_aux_layer_3": 0.075439453125, "loss_aux_layer_4": 0.0792236328125, "loss_aux_layer_5": 0.080810546875, "loss_aux_layer_6": 0.0828857421875, "loss_aux_layer_7": 0.07958984375, "loss_aux_layer_8": 0.07861328125, "loss_aux_layer_9": 0.0772705078125, "step": 1557, "total_loss": 0.7655597031116486 }, { "epoch": 0.30845377153039, "grad_norm": 1.4484702348709106, "learning_rate": 5e-05, "llm_loss": 0.5900831818580627, "loss": 2.782, "loss_aux_layer_0": 0.024078369140625, "loss_aux_layer_1": 0.050048828125, "loss_aux_layer_10": 0.076904296875, "loss_aux_layer_11": 0.081787109375, "loss_aux_layer_12": 0.087890625, "loss_aux_layer_13": 0.0953369140625, "loss_aux_layer_14": 0.1058349609375, "loss_aux_layer_15": 0.116943359375, "loss_aux_layer_16": 0.1279296875, "loss_aux_layer_17": 0.13623046875, "loss_aux_layer_18": 0.14501953125, "loss_aux_layer_19": 0.148681640625, "loss_aux_layer_2": 0.06201171875, "loss_aux_layer_20": 0.15625, "loss_aux_layer_21": 0.163330078125, "loss_aux_layer_22": 0.185302734375, "loss_aux_layer_23": 0.22607421875, "loss_aux_layer_3": 0.0732421875, "loss_aux_layer_4": 0.0765380859375, "loss_aux_layer_5": 0.078369140625, "loss_aux_layer_6": 0.0806884765625, "loss_aux_layer_7": 0.0777587890625, "loss_aux_layer_8": 0.0771484375, "loss_aux_layer_9": 0.075927734375, "step": 1558, "total_loss": 0.6955010741949081 }, { "epoch": 0.30865175212829143, "grad_norm": 1.3581831455230713, "learning_rate": 5e-05, "llm_loss": 0.6581475883722305, "loss": 3.0564, "loss_aux_layer_0": 0.022186279296875, "loss_aux_layer_1": 0.05340576171875, "loss_aux_layer_10": 0.079833984375, "loss_aux_layer_11": 0.085205078125, "loss_aux_layer_12": 0.0911865234375, "loss_aux_layer_13": 0.0980224609375, "loss_aux_layer_14": 0.1077880859375, "loss_aux_layer_15": 0.1175537109375, "loss_aux_layer_16": 0.128173828125, "loss_aux_layer_17": 0.13525390625, "loss_aux_layer_18": 0.1435546875, "loss_aux_layer_19": 0.145751953125, "loss_aux_layer_2": 0.064453125, "loss_aux_layer_20": 0.152099609375, "loss_aux_layer_21": 0.159423828125, "loss_aux_layer_22": 0.180419921875, "loss_aux_layer_23": 0.22021484375, "loss_aux_layer_3": 0.0755615234375, "loss_aux_layer_4": 0.0787353515625, "loss_aux_layer_5": 0.080810546875, "loss_aux_layer_6": 0.083740234375, "loss_aux_layer_7": 0.0809326171875, "loss_aux_layer_8": 0.079833984375, "loss_aux_layer_9": 0.0782470703125, "step": 1559, "total_loss": 0.764103353023529 }, { "epoch": 0.3088497327261928, "grad_norm": 1.292136788368225, "learning_rate": 5e-05, "llm_loss": 0.6077136099338531, "loss": 2.8402, "loss_aux_layer_0": 0.02288818359375, "loss_aux_layer_1": 0.048583984375, "loss_aux_layer_10": 0.0748291015625, "loss_aux_layer_11": 0.07958984375, "loss_aux_layer_12": 0.085205078125, "loss_aux_layer_13": 0.0921630859375, "loss_aux_layer_14": 0.1026611328125, "loss_aux_layer_15": 0.1129150390625, "loss_aux_layer_16": 0.1236572265625, "loss_aux_layer_17": 0.132080078125, "loss_aux_layer_18": 0.1416015625, "loss_aux_layer_19": 0.1455078125, "loss_aux_layer_2": 0.05908203125, "loss_aux_layer_20": 0.153076171875, "loss_aux_layer_21": 0.160400390625, "loss_aux_layer_22": 0.18115234375, "loss_aux_layer_23": 0.222412109375, "loss_aux_layer_3": 0.0699462890625, "loss_aux_layer_4": 0.0732421875, "loss_aux_layer_5": 0.0751953125, "loss_aux_layer_6": 0.0782470703125, "loss_aux_layer_7": 0.0750732421875, "loss_aux_layer_8": 0.073974609375, "loss_aux_layer_9": 0.0732421875, "step": 1560, "total_loss": 0.7100619673728943 }, { "epoch": 0.30904771332409425, "grad_norm": 1.3989027738571167, "learning_rate": 5e-05, "llm_loss": 0.6735578030347824, "loss": 3.1186, "loss_aux_layer_0": 0.02337646484375, "loss_aux_layer_1": 0.054443359375, "loss_aux_layer_10": 0.0792236328125, "loss_aux_layer_11": 0.084228515625, "loss_aux_layer_12": 0.09033203125, "loss_aux_layer_13": 0.096923828125, "loss_aux_layer_14": 0.1077880859375, "loss_aux_layer_15": 0.11767578125, "loss_aux_layer_16": 0.1279296875, "loss_aux_layer_17": 0.13525390625, "loss_aux_layer_18": 0.14404296875, "loss_aux_layer_19": 0.14599609375, "loss_aux_layer_2": 0.0653076171875, "loss_aux_layer_20": 0.15283203125, "loss_aux_layer_21": 0.160400390625, "loss_aux_layer_22": 0.1806640625, "loss_aux_layer_23": 0.220703125, "loss_aux_layer_3": 0.0762939453125, "loss_aux_layer_4": 0.079345703125, "loss_aux_layer_5": 0.0809326171875, "loss_aux_layer_6": 0.083740234375, "loss_aux_layer_7": 0.0806884765625, "loss_aux_layer_8": 0.079345703125, "loss_aux_layer_9": 0.0777587890625, "step": 1561, "total_loss": 0.7796383202075958 }, { "epoch": 0.30924569392199563, "grad_norm": 1.6108314990997314, "learning_rate": 5e-05, "llm_loss": 0.6411966979503632, "loss": 2.9733, "loss_aux_layer_0": 0.023345947265625, "loss_aux_layer_1": 0.0489501953125, "loss_aux_layer_10": 0.0751953125, "loss_aux_layer_11": 0.07958984375, "loss_aux_layer_12": 0.0853271484375, "loss_aux_layer_13": 0.0921630859375, "loss_aux_layer_14": 0.102783203125, "loss_aux_layer_15": 0.1129150390625, "loss_aux_layer_16": 0.123779296875, "loss_aux_layer_17": 0.1318359375, "loss_aux_layer_18": 0.140625, "loss_aux_layer_19": 0.1435546875, "loss_aux_layer_2": 0.06005859375, "loss_aux_layer_20": 0.15087890625, "loss_aux_layer_21": 0.157470703125, "loss_aux_layer_22": 0.1787109375, "loss_aux_layer_23": 0.2177734375, "loss_aux_layer_3": 0.0709228515625, "loss_aux_layer_4": 0.073974609375, "loss_aux_layer_5": 0.076416015625, "loss_aux_layer_6": 0.0794677734375, "loss_aux_layer_7": 0.076416015625, "loss_aux_layer_8": 0.074951171875, "loss_aux_layer_9": 0.073974609375, "step": 1562, "total_loss": 0.7433358430862427 }, { "epoch": 0.30944367451989707, "grad_norm": 1.4726721048355103, "learning_rate": 5e-05, "llm_loss": 0.6880330294370651, "loss": 3.1695, "loss_aux_layer_0": 0.0234375, "loss_aux_layer_1": 0.05023193359375, "loss_aux_layer_10": 0.0772705078125, "loss_aux_layer_11": 0.08203125, "loss_aux_layer_12": 0.088134765625, "loss_aux_layer_13": 0.0946044921875, "loss_aux_layer_14": 0.105224609375, "loss_aux_layer_15": 0.1148681640625, "loss_aux_layer_16": 0.12548828125, "loss_aux_layer_17": 0.1337890625, "loss_aux_layer_18": 0.142333984375, "loss_aux_layer_19": 0.145263671875, "loss_aux_layer_2": 0.06256103515625, "loss_aux_layer_20": 0.15283203125, "loss_aux_layer_21": 0.159423828125, "loss_aux_layer_22": 0.181640625, "loss_aux_layer_23": 0.220703125, "loss_aux_layer_3": 0.0738525390625, "loss_aux_layer_4": 0.0771484375, "loss_aux_layer_5": 0.0791015625, "loss_aux_layer_6": 0.0819091796875, "loss_aux_layer_7": 0.0784912109375, "loss_aux_layer_8": 0.0771484375, "loss_aux_layer_9": 0.076171875, "step": 1563, "total_loss": 0.7923686504364014 }, { "epoch": 0.30964165511779845, "grad_norm": 1.3242305517196655, "learning_rate": 5e-05, "llm_loss": 0.5860560834407806, "loss": 2.7836, "loss_aux_layer_0": 0.025360107421875, "loss_aux_layer_1": 0.05511474609375, "loss_aux_layer_10": 0.08154296875, "loss_aux_layer_11": 0.0869140625, "loss_aux_layer_12": 0.093017578125, "loss_aux_layer_13": 0.10009765625, "loss_aux_layer_14": 0.111083984375, "loss_aux_layer_15": 0.12158203125, "loss_aux_layer_16": 0.1326904296875, "loss_aux_layer_17": 0.140625, "loss_aux_layer_18": 0.150146484375, "loss_aux_layer_19": 0.153076171875, "loss_aux_layer_2": 0.06695556640625, "loss_aux_layer_20": 0.16015625, "loss_aux_layer_21": 0.166259765625, "loss_aux_layer_22": 0.1884765625, "loss_aux_layer_23": 0.228271484375, "loss_aux_layer_3": 0.0782470703125, "loss_aux_layer_4": 0.0816650390625, "loss_aux_layer_5": 0.0833740234375, "loss_aux_layer_6": 0.08642578125, "loss_aux_layer_7": 0.0831298828125, "loss_aux_layer_8": 0.081787109375, "loss_aux_layer_9": 0.080322265625, "step": 1564, "total_loss": 0.6959103047847748 }, { "epoch": 0.30983963571569983, "grad_norm": 1.4738636016845703, "learning_rate": 5e-05, "llm_loss": 0.5931343734264374, "loss": 2.8043, "loss_aux_layer_0": 0.022491455078125, "loss_aux_layer_1": 0.051025390625, "loss_aux_layer_10": 0.080078125, "loss_aux_layer_11": 0.0855712890625, "loss_aux_layer_12": 0.091796875, "loss_aux_layer_13": 0.0992431640625, "loss_aux_layer_14": 0.110107421875, "loss_aux_layer_15": 0.1212158203125, "loss_aux_layer_16": 0.132568359375, "loss_aux_layer_17": 0.140625, "loss_aux_layer_18": 0.149169921875, "loss_aux_layer_19": 0.151611328125, "loss_aux_layer_2": 0.06298828125, "loss_aux_layer_20": 0.15869140625, "loss_aux_layer_21": 0.16552734375, "loss_aux_layer_22": 0.1865234375, "loss_aux_layer_23": 0.2275390625, "loss_aux_layer_3": 0.07470703125, "loss_aux_layer_4": 0.0780029296875, "loss_aux_layer_5": 0.080078125, "loss_aux_layer_6": 0.0831298828125, "loss_aux_layer_7": 0.0802001953125, "loss_aux_layer_8": 0.0792236328125, "loss_aux_layer_9": 0.07861328125, "step": 1565, "total_loss": 0.7010768204927444 }, { "epoch": 0.31003761631360127, "grad_norm": 2.1161766052246094, "learning_rate": 5e-05, "llm_loss": 0.5991052240133286, "loss": 2.805, "loss_aux_layer_0": 0.0250244140625, "loss_aux_layer_1": 0.05078125, "loss_aux_layer_10": 0.07421875, "loss_aux_layer_11": 0.0787353515625, "loss_aux_layer_12": 0.08447265625, "loss_aux_layer_13": 0.0906982421875, "loss_aux_layer_14": 0.101318359375, "loss_aux_layer_15": 0.1124267578125, "loss_aux_layer_16": 0.1234130859375, "loss_aux_layer_17": 0.1312255859375, "loss_aux_layer_18": 0.140869140625, "loss_aux_layer_19": 0.144775390625, "loss_aux_layer_2": 0.06158447265625, "loss_aux_layer_20": 0.15234375, "loss_aux_layer_21": 0.159423828125, "loss_aux_layer_22": 0.17919921875, "loss_aux_layer_23": 0.2197265625, "loss_aux_layer_3": 0.0711669921875, "loss_aux_layer_4": 0.0738525390625, "loss_aux_layer_5": 0.075439453125, "loss_aux_layer_6": 0.07861328125, "loss_aux_layer_7": 0.074951171875, "loss_aux_layer_8": 0.07373046875, "loss_aux_layer_9": 0.07275390625, "step": 1566, "total_loss": 0.701246440410614 }, { "epoch": 0.31023559691150265, "grad_norm": 0.7945410013198853, "learning_rate": 5e-05, "llm_loss": 0.6227360144257545, "loss": 2.9144, "loss_aux_layer_0": 0.024322509765625, "loss_aux_layer_1": 0.05206298828125, "loss_aux_layer_10": 0.0792236328125, "loss_aux_layer_11": 0.0838623046875, "loss_aux_layer_12": 0.08984375, "loss_aux_layer_13": 0.096923828125, "loss_aux_layer_14": 0.1072998046875, "loss_aux_layer_15": 0.1177978515625, "loss_aux_layer_16": 0.1280517578125, "loss_aux_layer_17": 0.136474609375, "loss_aux_layer_18": 0.14453125, "loss_aux_layer_19": 0.147216796875, "loss_aux_layer_2": 0.06243896484375, "loss_aux_layer_20": 0.154541015625, "loss_aux_layer_21": 0.1611328125, "loss_aux_layer_22": 0.182373046875, "loss_aux_layer_23": 0.220947265625, "loss_aux_layer_3": 0.0740966796875, "loss_aux_layer_4": 0.0777587890625, "loss_aux_layer_5": 0.080078125, "loss_aux_layer_6": 0.0831298828125, "loss_aux_layer_7": 0.080078125, "loss_aux_layer_8": 0.079345703125, "loss_aux_layer_9": 0.07763671875, "step": 1567, "total_loss": 0.7286103814840317 }, { "epoch": 0.3104335775094041, "grad_norm": 1.5682827234268188, "learning_rate": 5e-05, "llm_loss": 0.5749486684799194, "loss": 2.7354, "loss_aux_layer_0": 0.02374267578125, "loss_aux_layer_1": 0.05511474609375, "loss_aux_layer_10": 0.082763671875, "loss_aux_layer_11": 0.0880126953125, "loss_aux_layer_12": 0.0941162109375, "loss_aux_layer_13": 0.100830078125, "loss_aux_layer_14": 0.111328125, "loss_aux_layer_15": 0.120849609375, "loss_aux_layer_16": 0.13134765625, "loss_aux_layer_17": 0.138427734375, "loss_aux_layer_18": 0.147216796875, "loss_aux_layer_19": 0.149169921875, "loss_aux_layer_2": 0.067626953125, "loss_aux_layer_20": 0.15576171875, "loss_aux_layer_21": 0.162353515625, "loss_aux_layer_22": 0.182861328125, "loss_aux_layer_23": 0.221923828125, "loss_aux_layer_3": 0.0791015625, "loss_aux_layer_4": 0.082275390625, "loss_aux_layer_5": 0.084228515625, "loss_aux_layer_6": 0.0870361328125, "loss_aux_layer_7": 0.0836181640625, "loss_aux_layer_8": 0.082763671875, "loss_aux_layer_9": 0.081298828125, "step": 1568, "total_loss": 0.6838595420122147 }, { "epoch": 0.31063155810730547, "grad_norm": 1.1637012958526611, "learning_rate": 5e-05, "llm_loss": 0.6765955239534378, "loss": 3.1275, "loss_aux_layer_0": 0.0220947265625, "loss_aux_layer_1": 0.0516357421875, "loss_aux_layer_10": 0.07861328125, "loss_aux_layer_11": 0.0831298828125, "loss_aux_layer_12": 0.089111328125, "loss_aux_layer_13": 0.0955810546875, "loss_aux_layer_14": 0.10595703125, "loss_aux_layer_15": 0.1162109375, "loss_aux_layer_16": 0.126220703125, "loss_aux_layer_17": 0.134033203125, "loss_aux_layer_18": 0.14306640625, "loss_aux_layer_19": 0.146240234375, "loss_aux_layer_2": 0.0631103515625, "loss_aux_layer_20": 0.15380859375, "loss_aux_layer_21": 0.160888671875, "loss_aux_layer_22": 0.182373046875, "loss_aux_layer_23": 0.22119140625, "loss_aux_layer_3": 0.074951171875, "loss_aux_layer_4": 0.078125, "loss_aux_layer_5": 0.0802001953125, "loss_aux_layer_6": 0.083251953125, "loss_aux_layer_7": 0.0802001953125, "loss_aux_layer_8": 0.078857421875, "loss_aux_layer_9": 0.077392578125, "step": 1569, "total_loss": 0.7818646430969238 }, { "epoch": 0.3108295387052069, "grad_norm": 1.4478330612182617, "learning_rate": 5e-05, "llm_loss": 0.5938505679368973, "loss": 2.8124, "loss_aux_layer_0": 0.02301025390625, "loss_aux_layer_1": 0.057373046875, "loss_aux_layer_10": 0.0828857421875, "loss_aux_layer_11": 0.0877685546875, "loss_aux_layer_12": 0.0941162109375, "loss_aux_layer_13": 0.1005859375, "loss_aux_layer_14": 0.110595703125, "loss_aux_layer_15": 0.1204833984375, "loss_aux_layer_16": 0.130859375, "loss_aux_layer_17": 0.137939453125, "loss_aux_layer_18": 0.145751953125, "loss_aux_layer_19": 0.147705078125, "loss_aux_layer_2": 0.07000732421875, "loss_aux_layer_20": 0.15478515625, "loss_aux_layer_21": 0.162109375, "loss_aux_layer_22": 0.185302734375, "loss_aux_layer_23": 0.223876953125, "loss_aux_layer_3": 0.0811767578125, "loss_aux_layer_4": 0.084228515625, "loss_aux_layer_5": 0.08544921875, "loss_aux_layer_6": 0.0880126953125, "loss_aux_layer_7": 0.084716796875, "loss_aux_layer_8": 0.083251953125, "loss_aux_layer_9": 0.0814208984375, "step": 1570, "total_loss": 0.7031092047691345 }, { "epoch": 0.3110275193031083, "grad_norm": 1.13417649269104, "learning_rate": 5e-05, "llm_loss": 0.642524927854538, "loss": 2.9921, "loss_aux_layer_0": 0.02294921875, "loss_aux_layer_1": 0.05133056640625, "loss_aux_layer_10": 0.078857421875, "loss_aux_layer_11": 0.0838623046875, "loss_aux_layer_12": 0.090087890625, "loss_aux_layer_13": 0.0965576171875, "loss_aux_layer_14": 0.1068115234375, "loss_aux_layer_15": 0.1162109375, "loss_aux_layer_16": 0.1263427734375, "loss_aux_layer_17": 0.1337890625, "loss_aux_layer_18": 0.142578125, "loss_aux_layer_19": 0.146484375, "loss_aux_layer_2": 0.062744140625, "loss_aux_layer_20": 0.154052734375, "loss_aux_layer_21": 0.161376953125, "loss_aux_layer_22": 0.183349609375, "loss_aux_layer_23": 0.224609375, "loss_aux_layer_3": 0.074462890625, "loss_aux_layer_4": 0.07763671875, "loss_aux_layer_5": 0.07958984375, "loss_aux_layer_6": 0.08251953125, "loss_aux_layer_7": 0.0797119140625, "loss_aux_layer_8": 0.07861328125, "loss_aux_layer_9": 0.0772705078125, "step": 1571, "total_loss": 0.7480313330888748 }, { "epoch": 0.31122549990100973, "grad_norm": 1.6983040571212769, "learning_rate": 5e-05, "llm_loss": 0.6706201881170273, "loss": 3.1181, "loss_aux_layer_0": 0.029510498046875, "loss_aux_layer_1": 0.0556640625, "loss_aux_layer_10": 0.0806884765625, "loss_aux_layer_11": 0.0855712890625, "loss_aux_layer_12": 0.0916748046875, "loss_aux_layer_13": 0.0987548828125, "loss_aux_layer_14": 0.10986328125, "loss_aux_layer_15": 0.12060546875, "loss_aux_layer_16": 0.131591796875, "loss_aux_layer_17": 0.1396484375, "loss_aux_layer_18": 0.14794921875, "loss_aux_layer_19": 0.15087890625, "loss_aux_layer_2": 0.06561279296875, "loss_aux_layer_20": 0.157958984375, "loss_aux_layer_21": 0.164794921875, "loss_aux_layer_22": 0.187744140625, "loss_aux_layer_23": 0.2275390625, "loss_aux_layer_3": 0.0780029296875, "loss_aux_layer_4": 0.0806884765625, "loss_aux_layer_5": 0.082275390625, "loss_aux_layer_6": 0.085205078125, "loss_aux_layer_7": 0.08203125, "loss_aux_layer_8": 0.0806884765625, "loss_aux_layer_9": 0.0789794921875, "step": 1572, "total_loss": 0.7795353531837463 }, { "epoch": 0.3114234804989111, "grad_norm": 1.398979902267456, "learning_rate": 5e-05, "llm_loss": 0.5504758581519127, "loss": 2.6149, "loss_aux_layer_0": 0.0240478515625, "loss_aux_layer_1": 0.0504150390625, "loss_aux_layer_10": 0.0745849609375, "loss_aux_layer_11": 0.0792236328125, "loss_aux_layer_12": 0.08544921875, "loss_aux_layer_13": 0.0926513671875, "loss_aux_layer_14": 0.1038818359375, "loss_aux_layer_15": 0.1148681640625, "loss_aux_layer_16": 0.1259765625, "loss_aux_layer_17": 0.13427734375, "loss_aux_layer_18": 0.143798828125, "loss_aux_layer_19": 0.147216796875, "loss_aux_layer_2": 0.05950927734375, "loss_aux_layer_20": 0.154541015625, "loss_aux_layer_21": 0.16162109375, "loss_aux_layer_22": 0.182861328125, "loss_aux_layer_23": 0.223388671875, "loss_aux_layer_3": 0.0706787109375, "loss_aux_layer_4": 0.073486328125, "loss_aux_layer_5": 0.0751953125, "loss_aux_layer_6": 0.0782470703125, "loss_aux_layer_7": 0.0751953125, "loss_aux_layer_8": 0.0740966796875, "loss_aux_layer_9": 0.07275390625, "step": 1573, "total_loss": 0.6537126749753952 }, { "epoch": 0.3116214610968125, "grad_norm": 2.16475510597229, "learning_rate": 5e-05, "llm_loss": 0.5804224759340286, "loss": 2.7459, "loss_aux_layer_0": 0.024322509765625, "loss_aux_layer_1": 0.0511474609375, "loss_aux_layer_10": 0.0782470703125, "loss_aux_layer_11": 0.0831298828125, "loss_aux_layer_12": 0.0888671875, "loss_aux_layer_13": 0.0960693359375, "loss_aux_layer_14": 0.1068115234375, "loss_aux_layer_15": 0.1177978515625, "loss_aux_layer_16": 0.1287841796875, "loss_aux_layer_17": 0.1376953125, "loss_aux_layer_18": 0.146728515625, "loss_aux_layer_19": 0.14990234375, "loss_aux_layer_2": 0.06158447265625, "loss_aux_layer_20": 0.15673828125, "loss_aux_layer_21": 0.163818359375, "loss_aux_layer_22": 0.18505859375, "loss_aux_layer_23": 0.225341796875, "loss_aux_layer_3": 0.0738525390625, "loss_aux_layer_4": 0.0765380859375, "loss_aux_layer_5": 0.078369140625, "loss_aux_layer_6": 0.0816650390625, "loss_aux_layer_7": 0.0787353515625, "loss_aux_layer_8": 0.07763671875, "loss_aux_layer_9": 0.0767822265625, "step": 1574, "total_loss": 0.6864766180515289 }, { "epoch": 0.31181944169471393, "grad_norm": 1.6920320987701416, "learning_rate": 5e-05, "llm_loss": 0.6264041662216187, "loss": 2.9199, "loss_aux_layer_0": 0.022979736328125, "loss_aux_layer_1": 0.048828125, "loss_aux_layer_10": 0.0767822265625, "loss_aux_layer_11": 0.0816650390625, "loss_aux_layer_12": 0.087646484375, "loss_aux_layer_13": 0.0941162109375, "loss_aux_layer_14": 0.104248046875, "loss_aux_layer_15": 0.1143798828125, "loss_aux_layer_16": 0.124267578125, "loss_aux_layer_17": 0.1328125, "loss_aux_layer_18": 0.140625, "loss_aux_layer_19": 0.14404296875, "loss_aux_layer_2": 0.06036376953125, "loss_aux_layer_20": 0.1513671875, "loss_aux_layer_21": 0.159423828125, "loss_aux_layer_22": 0.182373046875, "loss_aux_layer_23": 0.2236328125, "loss_aux_layer_3": 0.072265625, "loss_aux_layer_4": 0.0751953125, "loss_aux_layer_5": 0.0777587890625, "loss_aux_layer_6": 0.08056640625, "loss_aux_layer_7": 0.07763671875, "loss_aux_layer_8": 0.0767822265625, "loss_aux_layer_9": 0.0755615234375, "step": 1575, "total_loss": 0.7299669682979584 }, { "epoch": 0.3120174222926153, "grad_norm": 1.2073087692260742, "learning_rate": 5e-05, "llm_loss": 0.6355860084295273, "loss": 2.98, "loss_aux_layer_0": 0.022308349609375, "loss_aux_layer_1": 0.05303955078125, "loss_aux_layer_10": 0.08251953125, "loss_aux_layer_11": 0.0875244140625, "loss_aux_layer_12": 0.0933837890625, "loss_aux_layer_13": 0.100830078125, "loss_aux_layer_14": 0.111572265625, "loss_aux_layer_15": 0.1217041015625, "loss_aux_layer_16": 0.13232421875, "loss_aux_layer_17": 0.14013671875, "loss_aux_layer_18": 0.149169921875, "loss_aux_layer_19": 0.15185546875, "loss_aux_layer_2": 0.0654296875, "loss_aux_layer_20": 0.158935546875, "loss_aux_layer_21": 0.165771484375, "loss_aux_layer_22": 0.18603515625, "loss_aux_layer_23": 0.225341796875, "loss_aux_layer_3": 0.0782470703125, "loss_aux_layer_4": 0.08154296875, "loss_aux_layer_5": 0.0836181640625, "loss_aux_layer_6": 0.086669921875, "loss_aux_layer_7": 0.0836181640625, "loss_aux_layer_8": 0.082763671875, "loss_aux_layer_9": 0.0814208984375, "step": 1576, "total_loss": 0.7449942231178284 }, { "epoch": 0.31221540289051675, "grad_norm": 1.7690134048461914, "learning_rate": 5e-05, "llm_loss": 0.6159028261899948, "loss": 2.8793, "loss_aux_layer_0": 0.023468017578125, "loss_aux_layer_1": 0.05029296875, "loss_aux_layer_10": 0.076171875, "loss_aux_layer_11": 0.0806884765625, "loss_aux_layer_12": 0.086181640625, "loss_aux_layer_13": 0.0931396484375, "loss_aux_layer_14": 0.1036376953125, "loss_aux_layer_15": 0.1141357421875, "loss_aux_layer_16": 0.1253662109375, "loss_aux_layer_17": 0.13330078125, "loss_aux_layer_18": 0.142578125, "loss_aux_layer_19": 0.146240234375, "loss_aux_layer_2": 0.0616455078125, "loss_aux_layer_20": 0.15380859375, "loss_aux_layer_21": 0.1611328125, "loss_aux_layer_22": 0.18212890625, "loss_aux_layer_23": 0.222412109375, "loss_aux_layer_3": 0.0723876953125, "loss_aux_layer_4": 0.0760498046875, "loss_aux_layer_5": 0.07958984375, "loss_aux_layer_6": 0.0814208984375, "loss_aux_layer_7": 0.0772705078125, "loss_aux_layer_8": 0.0762939453125, "loss_aux_layer_9": 0.0751953125, "step": 1577, "total_loss": 0.7198344022035599 }, { "epoch": 0.31241338348841813, "grad_norm": 1.1329935789108276, "learning_rate": 5e-05, "llm_loss": 0.5605447217822075, "loss": 2.6891, "loss_aux_layer_0": 0.023681640625, "loss_aux_layer_1": 0.0546875, "loss_aux_layer_10": 0.0853271484375, "loss_aux_layer_11": 0.0908203125, "loss_aux_layer_12": 0.09716796875, "loss_aux_layer_13": 0.104248046875, "loss_aux_layer_14": 0.114501953125, "loss_aux_layer_15": 0.124755859375, "loss_aux_layer_16": 0.135009765625, "loss_aux_layer_17": 0.142822265625, "loss_aux_layer_18": 0.15185546875, "loss_aux_layer_19": 0.15283203125, "loss_aux_layer_2": 0.068115234375, "loss_aux_layer_20": 0.16015625, "loss_aux_layer_21": 0.166015625, "loss_aux_layer_22": 0.1865234375, "loss_aux_layer_23": 0.225341796875, "loss_aux_layer_3": 0.0806884765625, "loss_aux_layer_4": 0.08447265625, "loss_aux_layer_5": 0.086669921875, "loss_aux_layer_6": 0.089599609375, "loss_aux_layer_7": 0.0863037109375, "loss_aux_layer_8": 0.0853271484375, "loss_aux_layer_9": 0.0841064453125, "step": 1578, "total_loss": 0.6722853034734726 }, { "epoch": 0.31261136408631957, "grad_norm": 1.3778289556503296, "learning_rate": 5e-05, "llm_loss": 0.634521558880806, "loss": 2.9683, "loss_aux_layer_0": 0.02227783203125, "loss_aux_layer_1": 0.05126953125, "loss_aux_layer_10": 0.0809326171875, "loss_aux_layer_11": 0.0859375, "loss_aux_layer_12": 0.0919189453125, "loss_aux_layer_13": 0.098876953125, "loss_aux_layer_14": 0.1094970703125, "loss_aux_layer_15": 0.1204833984375, "loss_aux_layer_16": 0.131591796875, "loss_aux_layer_17": 0.139404296875, "loss_aux_layer_18": 0.147705078125, "loss_aux_layer_19": 0.150390625, "loss_aux_layer_2": 0.063232421875, "loss_aux_layer_20": 0.15771484375, "loss_aux_layer_21": 0.163330078125, "loss_aux_layer_22": 0.183349609375, "loss_aux_layer_23": 0.2216796875, "loss_aux_layer_3": 0.0755615234375, "loss_aux_layer_4": 0.0792236328125, "loss_aux_layer_5": 0.0810546875, "loss_aux_layer_6": 0.084228515625, "loss_aux_layer_7": 0.0814208984375, "loss_aux_layer_8": 0.08056640625, "loss_aux_layer_9": 0.0794677734375, "step": 1579, "total_loss": 0.7420769184827805 }, { "epoch": 0.31280934468422095, "grad_norm": 1.5010380744934082, "learning_rate": 5e-05, "llm_loss": 0.5417063981294632, "loss": 2.5986, "loss_aux_layer_0": 0.025634765625, "loss_aux_layer_1": 0.05218505859375, "loss_aux_layer_10": 0.080322265625, "loss_aux_layer_11": 0.0850830078125, "loss_aux_layer_12": 0.091064453125, "loss_aux_layer_13": 0.098388671875, "loss_aux_layer_14": 0.1092529296875, "loss_aux_layer_15": 0.1195068359375, "loss_aux_layer_16": 0.130126953125, "loss_aux_layer_17": 0.138427734375, "loss_aux_layer_18": 0.1474609375, "loss_aux_layer_19": 0.150390625, "loss_aux_layer_2": 0.0643310546875, "loss_aux_layer_20": 0.157958984375, "loss_aux_layer_21": 0.165283203125, "loss_aux_layer_22": 0.1865234375, "loss_aux_layer_23": 0.2265625, "loss_aux_layer_3": 0.0760498046875, "loss_aux_layer_4": 0.0791015625, "loss_aux_layer_5": 0.0810546875, "loss_aux_layer_6": 0.084228515625, "loss_aux_layer_7": 0.081298828125, "loss_aux_layer_8": 0.0806884765625, "loss_aux_layer_9": 0.0792236328125, "step": 1580, "total_loss": 0.6496533006429672 }, { "epoch": 0.31300732528212233, "grad_norm": 0.8984509706497192, "learning_rate": 5e-05, "llm_loss": 0.5448343008756638, "loss": 2.6158, "loss_aux_layer_0": 0.022796630859375, "loss_aux_layer_1": 0.05145263671875, "loss_aux_layer_10": 0.082275390625, "loss_aux_layer_11": 0.0872802734375, "loss_aux_layer_12": 0.0938720703125, "loss_aux_layer_13": 0.1014404296875, "loss_aux_layer_14": 0.112060546875, "loss_aux_layer_15": 0.122802734375, "loss_aux_layer_16": 0.13427734375, "loss_aux_layer_17": 0.14208984375, "loss_aux_layer_18": 0.151123046875, "loss_aux_layer_19": 0.15283203125, "loss_aux_layer_2": 0.06390380859375, "loss_aux_layer_20": 0.15966796875, "loss_aux_layer_21": 0.165771484375, "loss_aux_layer_22": 0.18603515625, "loss_aux_layer_23": 0.22607421875, "loss_aux_layer_3": 0.076171875, "loss_aux_layer_4": 0.0789794921875, "loss_aux_layer_5": 0.0809326171875, "loss_aux_layer_6": 0.083984375, "loss_aux_layer_7": 0.081298828125, "loss_aux_layer_8": 0.0811767578125, "loss_aux_layer_9": 0.080322265625, "step": 1581, "total_loss": 0.6539573967456818 }, { "epoch": 0.31320530588002377, "grad_norm": 1.352264642715454, "learning_rate": 5e-05, "llm_loss": 0.6730760633945465, "loss": 3.1284, "loss_aux_layer_0": 0.023590087890625, "loss_aux_layer_1": 0.0537109375, "loss_aux_layer_10": 0.0821533203125, "loss_aux_layer_11": 0.0870361328125, "loss_aux_layer_12": 0.093017578125, "loss_aux_layer_13": 0.1005859375, "loss_aux_layer_14": 0.1109619140625, "loss_aux_layer_15": 0.121826171875, "loss_aux_layer_16": 0.1318359375, "loss_aux_layer_17": 0.139892578125, "loss_aux_layer_18": 0.148681640625, "loss_aux_layer_19": 0.150390625, "loss_aux_layer_2": 0.0657958984375, "loss_aux_layer_20": 0.157958984375, "loss_aux_layer_21": 0.1650390625, "loss_aux_layer_22": 0.185791015625, "loss_aux_layer_23": 0.22412109375, "loss_aux_layer_3": 0.077880859375, "loss_aux_layer_4": 0.0811767578125, "loss_aux_layer_5": 0.0830078125, "loss_aux_layer_6": 0.0859375, "loss_aux_layer_7": 0.0828857421875, "loss_aux_layer_8": 0.0819091796875, "loss_aux_layer_9": 0.0806884765625, "step": 1582, "total_loss": 0.782091036438942 }, { "epoch": 0.31340328647792515, "grad_norm": 1.143871545791626, "learning_rate": 5e-05, "llm_loss": 0.599749431014061, "loss": 2.8189, "loss_aux_layer_0": 0.024383544921875, "loss_aux_layer_1": 0.05169677734375, "loss_aux_layer_10": 0.0772705078125, "loss_aux_layer_11": 0.08203125, "loss_aux_layer_12": 0.087890625, "loss_aux_layer_13": 0.094970703125, "loss_aux_layer_14": 0.105712890625, "loss_aux_layer_15": 0.116455078125, "loss_aux_layer_16": 0.127197265625, "loss_aux_layer_17": 0.134765625, "loss_aux_layer_18": 0.143310546875, "loss_aux_layer_19": 0.146484375, "loss_aux_layer_2": 0.0623779296875, "loss_aux_layer_20": 0.15380859375, "loss_aux_layer_21": 0.1611328125, "loss_aux_layer_22": 0.182861328125, "loss_aux_layer_23": 0.223876953125, "loss_aux_layer_3": 0.073974609375, "loss_aux_layer_4": 0.0772705078125, "loss_aux_layer_5": 0.07861328125, "loss_aux_layer_6": 0.081298828125, "loss_aux_layer_7": 0.078125, "loss_aux_layer_8": 0.0772705078125, "loss_aux_layer_9": 0.0758056640625, "step": 1583, "total_loss": 0.704714760184288 }, { "epoch": 0.3136012670758266, "grad_norm": 1.1358810663223267, "learning_rate": 5e-05, "llm_loss": 0.640966609120369, "loss": 2.9665, "loss_aux_layer_0": 0.023162841796875, "loss_aux_layer_1": 0.04931640625, "loss_aux_layer_10": 0.075439453125, "loss_aux_layer_11": 0.080078125, "loss_aux_layer_12": 0.086181640625, "loss_aux_layer_13": 0.0926513671875, "loss_aux_layer_14": 0.1025390625, "loss_aux_layer_15": 0.112060546875, "loss_aux_layer_16": 0.1226806640625, "loss_aux_layer_17": 0.1298828125, "loss_aux_layer_18": 0.138671875, "loss_aux_layer_19": 0.140380859375, "loss_aux_layer_2": 0.05926513671875, "loss_aux_layer_20": 0.147216796875, "loss_aux_layer_21": 0.15283203125, "loss_aux_layer_22": 0.171630859375, "loss_aux_layer_23": 0.209716796875, "loss_aux_layer_3": 0.070556640625, "loss_aux_layer_4": 0.07373046875, "loss_aux_layer_5": 0.075439453125, "loss_aux_layer_6": 0.0782470703125, "loss_aux_layer_7": 0.07568359375, "loss_aux_layer_8": 0.07470703125, "loss_aux_layer_9": 0.07373046875, "step": 1584, "total_loss": 0.7416277229785919 }, { "epoch": 0.31379924767372797, "grad_norm": 1.1798638105392456, "learning_rate": 5e-05, "llm_loss": 0.6710387170314789, "loss": 3.0856, "loss_aux_layer_0": 0.022613525390625, "loss_aux_layer_1": 0.046630859375, "loss_aux_layer_10": 0.0733642578125, "loss_aux_layer_11": 0.077880859375, "loss_aux_layer_12": 0.083740234375, "loss_aux_layer_13": 0.091064453125, "loss_aux_layer_14": 0.1016845703125, "loss_aux_layer_15": 0.11181640625, "loss_aux_layer_16": 0.122314453125, "loss_aux_layer_17": 0.130615234375, "loss_aux_layer_18": 0.139404296875, "loss_aux_layer_19": 0.142333984375, "loss_aux_layer_2": 0.05767822265625, "loss_aux_layer_20": 0.14990234375, "loss_aux_layer_21": 0.156982421875, "loss_aux_layer_22": 0.176513671875, "loss_aux_layer_23": 0.21533203125, "loss_aux_layer_3": 0.06884765625, "loss_aux_layer_4": 0.0718994140625, "loss_aux_layer_5": 0.0738525390625, "loss_aux_layer_6": 0.07666015625, "loss_aux_layer_7": 0.0736083984375, "loss_aux_layer_8": 0.072509765625, "loss_aux_layer_9": 0.0716552734375, "step": 1585, "total_loss": 0.7714027762413025 }, { "epoch": 0.3139972282716294, "grad_norm": 1.2060790061950684, "learning_rate": 5e-05, "llm_loss": 0.6325880587100983, "loss": 2.9598, "loss_aux_layer_0": 0.02227783203125, "loss_aux_layer_1": 0.0513916015625, "loss_aux_layer_10": 0.0806884765625, "loss_aux_layer_11": 0.0858154296875, "loss_aux_layer_12": 0.0921630859375, "loss_aux_layer_13": 0.0989990234375, "loss_aux_layer_14": 0.109130859375, "loss_aux_layer_15": 0.1197509765625, "loss_aux_layer_16": 0.13037109375, "loss_aux_layer_17": 0.13818359375, "loss_aux_layer_18": 0.146484375, "loss_aux_layer_19": 0.14892578125, "loss_aux_layer_2": 0.0645751953125, "loss_aux_layer_20": 0.15625, "loss_aux_layer_21": 0.162841796875, "loss_aux_layer_22": 0.18359375, "loss_aux_layer_23": 0.221923828125, "loss_aux_layer_3": 0.076416015625, "loss_aux_layer_4": 0.0797119140625, "loss_aux_layer_5": 0.0819091796875, "loss_aux_layer_6": 0.08447265625, "loss_aux_layer_7": 0.081298828125, "loss_aux_layer_8": 0.0804443359375, "loss_aux_layer_9": 0.079345703125, "step": 1586, "total_loss": 0.7399506568908691 }, { "epoch": 0.3141952088695308, "grad_norm": 1.736839771270752, "learning_rate": 5e-05, "llm_loss": 0.6066425144672394, "loss": 2.8455, "loss_aux_layer_0": 0.0235595703125, "loss_aux_layer_1": 0.05047607421875, "loss_aux_layer_10": 0.07763671875, "loss_aux_layer_11": 0.082275390625, "loss_aux_layer_12": 0.0885009765625, "loss_aux_layer_13": 0.095458984375, "loss_aux_layer_14": 0.106201171875, "loss_aux_layer_15": 0.1163330078125, "loss_aux_layer_16": 0.12646484375, "loss_aux_layer_17": 0.134033203125, "loss_aux_layer_18": 0.143310546875, "loss_aux_layer_19": 0.14599609375, "loss_aux_layer_2": 0.06219482421875, "loss_aux_layer_20": 0.153564453125, "loss_aux_layer_21": 0.1611328125, "loss_aux_layer_22": 0.18212890625, "loss_aux_layer_23": 0.22314453125, "loss_aux_layer_3": 0.073486328125, "loss_aux_layer_4": 0.0765380859375, "loss_aux_layer_5": 0.0782470703125, "loss_aux_layer_6": 0.0811767578125, "loss_aux_layer_7": 0.078125, "loss_aux_layer_8": 0.0772705078125, "loss_aux_layer_9": 0.076416015625, "step": 1587, "total_loss": 0.7113799601793289 }, { "epoch": 0.31439318946743217, "grad_norm": 1.1158276796340942, "learning_rate": 5e-05, "llm_loss": 0.5508151352405548, "loss": 2.6283, "loss_aux_layer_0": 0.0245361328125, "loss_aux_layer_1": 0.05242919921875, "loss_aux_layer_10": 0.0791015625, "loss_aux_layer_11": 0.083984375, "loss_aux_layer_12": 0.0897216796875, "loss_aux_layer_13": 0.0965576171875, "loss_aux_layer_14": 0.1070556640625, "loss_aux_layer_15": 0.1173095703125, "loss_aux_layer_16": 0.12841796875, "loss_aux_layer_17": 0.136474609375, "loss_aux_layer_18": 0.14501953125, "loss_aux_layer_19": 0.148193359375, "loss_aux_layer_2": 0.0631103515625, "loss_aux_layer_20": 0.156005859375, "loss_aux_layer_21": 0.16357421875, "loss_aux_layer_22": 0.184814453125, "loss_aux_layer_23": 0.224365234375, "loss_aux_layer_3": 0.0745849609375, "loss_aux_layer_4": 0.077392578125, "loss_aux_layer_5": 0.0794677734375, "loss_aux_layer_6": 0.082275390625, "loss_aux_layer_7": 0.0797119140625, "loss_aux_layer_8": 0.0784912109375, "loss_aux_layer_9": 0.0775146484375, "step": 1588, "total_loss": 0.6570838391780853 }, { "epoch": 0.3145911700653336, "grad_norm": 1.3861634731292725, "learning_rate": 5e-05, "llm_loss": 0.6189781129360199, "loss": 2.9152, "loss_aux_layer_0": 0.024993896484375, "loss_aux_layer_1": 0.052978515625, "loss_aux_layer_10": 0.08203125, "loss_aux_layer_11": 0.0869140625, "loss_aux_layer_12": 0.093017578125, "loss_aux_layer_13": 0.1004638671875, "loss_aux_layer_14": 0.111328125, "loss_aux_layer_15": 0.1221923828125, "loss_aux_layer_16": 0.133056640625, "loss_aux_layer_17": 0.140869140625, "loss_aux_layer_18": 0.1494140625, "loss_aux_layer_19": 0.152099609375, "loss_aux_layer_2": 0.0660400390625, "loss_aux_layer_20": 0.1591796875, "loss_aux_layer_21": 0.16650390625, "loss_aux_layer_22": 0.189208984375, "loss_aux_layer_23": 0.231689453125, "loss_aux_layer_3": 0.078125, "loss_aux_layer_4": 0.0810546875, "loss_aux_layer_5": 0.0828857421875, "loss_aux_layer_6": 0.0860595703125, "loss_aux_layer_7": 0.0828857421875, "loss_aux_layer_8": 0.0816650390625, "loss_aux_layer_9": 0.0804443359375, "step": 1589, "total_loss": 0.7287994176149368 }, { "epoch": 0.314789150663235, "grad_norm": 1.0679329633712769, "learning_rate": 5e-05, "llm_loss": 0.5712498277425766, "loss": 2.7023, "loss_aux_layer_0": 0.023773193359375, "loss_aux_layer_1": 0.05157470703125, "loss_aux_layer_10": 0.0767822265625, "loss_aux_layer_11": 0.0819091796875, "loss_aux_layer_12": 0.088134765625, "loss_aux_layer_13": 0.0943603515625, "loss_aux_layer_14": 0.104248046875, "loss_aux_layer_15": 0.1143798828125, "loss_aux_layer_16": 0.124755859375, "loss_aux_layer_17": 0.1331787109375, "loss_aux_layer_18": 0.141845703125, "loss_aux_layer_19": 0.14501953125, "loss_aux_layer_2": 0.06280517578125, "loss_aux_layer_20": 0.15234375, "loss_aux_layer_21": 0.16064453125, "loss_aux_layer_22": 0.182861328125, "loss_aux_layer_23": 0.222900390625, "loss_aux_layer_3": 0.07403564453125, "loss_aux_layer_4": 0.0771484375, "loss_aux_layer_5": 0.07861328125, "loss_aux_layer_6": 0.0814208984375, "loss_aux_layer_7": 0.0784912109375, "loss_aux_layer_8": 0.0771484375, "loss_aux_layer_9": 0.07537841796875, "step": 1590, "total_loss": 0.6755781769752502 }, { "epoch": 0.3149871312611364, "grad_norm": 1.2434698343276978, "learning_rate": 5e-05, "llm_loss": 0.6488780677318573, "loss": 3.0251, "loss_aux_layer_0": 0.025665283203125, "loss_aux_layer_1": 0.05181884765625, "loss_aux_layer_10": 0.080078125, "loss_aux_layer_11": 0.0850830078125, "loss_aux_layer_12": 0.091064453125, "loss_aux_layer_13": 0.0980224609375, "loss_aux_layer_14": 0.10888671875, "loss_aux_layer_15": 0.119384765625, "loss_aux_layer_16": 0.1298828125, "loss_aux_layer_17": 0.13818359375, "loss_aux_layer_18": 0.14697265625, "loss_aux_layer_19": 0.150146484375, "loss_aux_layer_2": 0.06268310546875, "loss_aux_layer_20": 0.1572265625, "loss_aux_layer_21": 0.164794921875, "loss_aux_layer_22": 0.186279296875, "loss_aux_layer_23": 0.22705078125, "loss_aux_layer_3": 0.0748291015625, "loss_aux_layer_4": 0.077880859375, "loss_aux_layer_5": 0.0802001953125, "loss_aux_layer_6": 0.0833740234375, "loss_aux_layer_7": 0.0804443359375, "loss_aux_layer_8": 0.0797119140625, "loss_aux_layer_9": 0.0784912109375, "step": 1591, "total_loss": 0.7562864124774933 }, { "epoch": 0.3151851118590378, "grad_norm": 0.9012163877487183, "learning_rate": 5e-05, "llm_loss": 0.5295151099562645, "loss": 2.5435, "loss_aux_layer_0": 0.026397705078125, "loss_aux_layer_1": 0.05133056640625, "loss_aux_layer_10": 0.0780029296875, "loss_aux_layer_11": 0.08251953125, "loss_aux_layer_12": 0.08837890625, "loss_aux_layer_13": 0.095458984375, "loss_aux_layer_14": 0.1068115234375, "loss_aux_layer_15": 0.117919921875, "loss_aux_layer_16": 0.12890625, "loss_aux_layer_17": 0.13720703125, "loss_aux_layer_18": 0.147216796875, "loss_aux_layer_19": 0.150634765625, "loss_aux_layer_2": 0.0616455078125, "loss_aux_layer_20": 0.158447265625, "loss_aux_layer_21": 0.165771484375, "loss_aux_layer_22": 0.1865234375, "loss_aux_layer_23": 0.227294921875, "loss_aux_layer_3": 0.072998046875, "loss_aux_layer_4": 0.0760498046875, "loss_aux_layer_5": 0.078125, "loss_aux_layer_6": 0.0816650390625, "loss_aux_layer_7": 0.0784912109375, "loss_aux_layer_8": 0.0777587890625, "loss_aux_layer_9": 0.076904296875, "step": 1592, "total_loss": 0.6358736753463745 }, { "epoch": 0.31538309245693924, "grad_norm": 1.0244921445846558, "learning_rate": 5e-05, "llm_loss": 0.5959723442792892, "loss": 2.8051, "loss_aux_layer_0": 0.02264404296875, "loss_aux_layer_1": 0.05218505859375, "loss_aux_layer_10": 0.0789794921875, "loss_aux_layer_11": 0.0836181640625, "loss_aux_layer_12": 0.08984375, "loss_aux_layer_13": 0.0966796875, "loss_aux_layer_14": 0.10693359375, "loss_aux_layer_15": 0.116455078125, "loss_aux_layer_16": 0.1265869140625, "loss_aux_layer_17": 0.13525390625, "loss_aux_layer_18": 0.143798828125, "loss_aux_layer_19": 0.145751953125, "loss_aux_layer_2": 0.063720703125, "loss_aux_layer_20": 0.152587890625, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.180908203125, "loss_aux_layer_23": 0.219482421875, "loss_aux_layer_3": 0.0751953125, "loss_aux_layer_4": 0.078125, "loss_aux_layer_5": 0.0799560546875, "loss_aux_layer_6": 0.0828857421875, "loss_aux_layer_7": 0.0797119140625, "loss_aux_layer_8": 0.0789794921875, "loss_aux_layer_9": 0.0775146484375, "step": 1593, "total_loss": 0.7012748569250107 }, { "epoch": 0.3155810730548406, "grad_norm": 1.0380277633666992, "learning_rate": 5e-05, "llm_loss": 0.5937848314642906, "loss": 2.8197, "loss_aux_layer_0": 0.02386474609375, "loss_aux_layer_1": 0.0552978515625, "loss_aux_layer_10": 0.083740234375, "loss_aux_layer_11": 0.089111328125, "loss_aux_layer_12": 0.0955810546875, "loss_aux_layer_13": 0.1029052734375, "loss_aux_layer_14": 0.113525390625, "loss_aux_layer_15": 0.1236572265625, "loss_aux_layer_16": 0.134033203125, "loss_aux_layer_17": 0.142578125, "loss_aux_layer_18": 0.150634765625, "loss_aux_layer_19": 0.153076171875, "loss_aux_layer_2": 0.06793212890625, "loss_aux_layer_20": 0.159912109375, "loss_aux_layer_21": 0.166748046875, "loss_aux_layer_22": 0.18994140625, "loss_aux_layer_23": 0.229736328125, "loss_aux_layer_3": 0.0797119140625, "loss_aux_layer_4": 0.0831298828125, "loss_aux_layer_5": 0.0849609375, "loss_aux_layer_6": 0.0877685546875, "loss_aux_layer_7": 0.0845947265625, "loss_aux_layer_8": 0.083740234375, "loss_aux_layer_9": 0.082275390625, "step": 1594, "total_loss": 0.7049335092306137 }, { "epoch": 0.31577905365274206, "grad_norm": 0.9677935838699341, "learning_rate": 5e-05, "llm_loss": 0.6193811893463135, "loss": 2.9016, "loss_aux_layer_0": 0.02362060546875, "loss_aux_layer_1": 0.051025390625, "loss_aux_layer_10": 0.07958984375, "loss_aux_layer_11": 0.0843505859375, "loss_aux_layer_12": 0.0906982421875, "loss_aux_layer_13": 0.09716796875, "loss_aux_layer_14": 0.1075439453125, "loss_aux_layer_15": 0.1170654296875, "loss_aux_layer_16": 0.1275634765625, "loss_aux_layer_17": 0.135498046875, "loss_aux_layer_18": 0.1435546875, "loss_aux_layer_19": 0.146240234375, "loss_aux_layer_2": 0.06304931640625, "loss_aux_layer_20": 0.1533203125, "loss_aux_layer_21": 0.161865234375, "loss_aux_layer_22": 0.18408203125, "loss_aux_layer_23": 0.224609375, "loss_aux_layer_3": 0.07470703125, "loss_aux_layer_4": 0.0777587890625, "loss_aux_layer_5": 0.0799560546875, "loss_aux_layer_6": 0.083251953125, "loss_aux_layer_7": 0.0804443359375, "loss_aux_layer_8": 0.0794677734375, "loss_aux_layer_9": 0.078369140625, "step": 1595, "total_loss": 0.7253930419683456 }, { "epoch": 0.31597703425064344, "grad_norm": 1.1853004693984985, "learning_rate": 5e-05, "llm_loss": 0.6907575502991676, "loss": 3.1907, "loss_aux_layer_0": 0.02325439453125, "loss_aux_layer_1": 0.053466796875, "loss_aux_layer_10": 0.080810546875, "loss_aux_layer_11": 0.0858154296875, "loss_aux_layer_12": 0.091796875, "loss_aux_layer_13": 0.0985107421875, "loss_aux_layer_14": 0.1087646484375, "loss_aux_layer_15": 0.11865234375, "loss_aux_layer_16": 0.1290283203125, "loss_aux_layer_17": 0.13671875, "loss_aux_layer_18": 0.145263671875, "loss_aux_layer_19": 0.146728515625, "loss_aux_layer_2": 0.0655517578125, "loss_aux_layer_20": 0.15380859375, "loss_aux_layer_21": 0.16015625, "loss_aux_layer_22": 0.180908203125, "loss_aux_layer_23": 0.21923828125, "loss_aux_layer_3": 0.0772705078125, "loss_aux_layer_4": 0.0806884765625, "loss_aux_layer_5": 0.0823974609375, "loss_aux_layer_6": 0.085205078125, "loss_aux_layer_7": 0.0819091796875, "loss_aux_layer_8": 0.0806884765625, "loss_aux_layer_9": 0.079345703125, "step": 1596, "total_loss": 0.7976801693439484 }, { "epoch": 0.3161750148485448, "grad_norm": 1.5385183095932007, "learning_rate": 5e-05, "llm_loss": 0.5868865475058556, "loss": 2.7719, "loss_aux_layer_0": 0.022796630859375, "loss_aux_layer_1": 0.04974365234375, "loss_aux_layer_10": 0.0782470703125, "loss_aux_layer_11": 0.083251953125, "loss_aux_layer_12": 0.0894775390625, "loss_aux_layer_13": 0.0963134765625, "loss_aux_layer_14": 0.107177734375, "loss_aux_layer_15": 0.1175537109375, "loss_aux_layer_16": 0.129150390625, "loss_aux_layer_17": 0.1376953125, "loss_aux_layer_18": 0.146484375, "loss_aux_layer_19": 0.1494140625, "loss_aux_layer_2": 0.06170654296875, "loss_aux_layer_20": 0.156982421875, "loss_aux_layer_21": 0.164306640625, "loss_aux_layer_22": 0.185791015625, "loss_aux_layer_23": 0.226318359375, "loss_aux_layer_3": 0.0732421875, "loss_aux_layer_4": 0.0762939453125, "loss_aux_layer_5": 0.078369140625, "loss_aux_layer_6": 0.081787109375, "loss_aux_layer_7": 0.078857421875, "loss_aux_layer_8": 0.077880859375, "loss_aux_layer_9": 0.076904296875, "step": 1597, "total_loss": 0.6929726153612137 }, { "epoch": 0.31637299544644626, "grad_norm": 1.0606560707092285, "learning_rate": 5e-05, "llm_loss": 0.5220181047916412, "loss": 2.4984, "loss_aux_layer_0": 0.022857666015625, "loss_aux_layer_1": 0.0484619140625, "loss_aux_layer_10": 0.0755615234375, "loss_aux_layer_11": 0.0804443359375, "loss_aux_layer_12": 0.0865478515625, "loss_aux_layer_13": 0.0936279296875, "loss_aux_layer_14": 0.10400390625, "loss_aux_layer_15": 0.1146240234375, "loss_aux_layer_16": 0.1256103515625, "loss_aux_layer_17": 0.133544921875, "loss_aux_layer_18": 0.14208984375, "loss_aux_layer_19": 0.144287109375, "loss_aux_layer_2": 0.05902099609375, "loss_aux_layer_20": 0.1513671875, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.17919921875, "loss_aux_layer_23": 0.2197265625, "loss_aux_layer_3": 0.0701904296875, "loss_aux_layer_4": 0.0732421875, "loss_aux_layer_5": 0.0753173828125, "loss_aux_layer_6": 0.078369140625, "loss_aux_layer_7": 0.07568359375, "loss_aux_layer_8": 0.0750732421875, "loss_aux_layer_9": 0.07421875, "step": 1598, "total_loss": 0.624605342745781 }, { "epoch": 0.31657097604434764, "grad_norm": 1.307281494140625, "learning_rate": 5e-05, "llm_loss": 0.6205205470323563, "loss": 2.9016, "loss_aux_layer_0": 0.023406982421875, "loss_aux_layer_1": 0.05029296875, "loss_aux_layer_10": 0.078857421875, "loss_aux_layer_11": 0.0836181640625, "loss_aux_layer_12": 0.0897216796875, "loss_aux_layer_13": 0.0963134765625, "loss_aux_layer_14": 0.106201171875, "loss_aux_layer_15": 0.1163330078125, "loss_aux_layer_16": 0.1265869140625, "loss_aux_layer_17": 0.134765625, "loss_aux_layer_18": 0.142822265625, "loss_aux_layer_19": 0.145263671875, "loss_aux_layer_2": 0.06243896484375, "loss_aux_layer_20": 0.151611328125, "loss_aux_layer_21": 0.158203125, "loss_aux_layer_22": 0.179443359375, "loss_aux_layer_23": 0.21875, "loss_aux_layer_3": 0.07470703125, "loss_aux_layer_4": 0.0782470703125, "loss_aux_layer_5": 0.0799560546875, "loss_aux_layer_6": 0.082763671875, "loss_aux_layer_7": 0.0799560546875, "loss_aux_layer_8": 0.078857421875, "loss_aux_layer_9": 0.077392578125, "step": 1599, "total_loss": 0.7253988236188889 }, { "epoch": 0.3167689566422491, "grad_norm": 1.2657392024993896, "learning_rate": 5e-05, "llm_loss": 0.5989580899477005, "loss": 2.8247, "loss_aux_layer_0": 0.023193359375, "loss_aux_layer_1": 0.05084228515625, "loss_aux_layer_10": 0.0804443359375, "loss_aux_layer_11": 0.08544921875, "loss_aux_layer_12": 0.09130859375, "loss_aux_layer_13": 0.0986328125, "loss_aux_layer_14": 0.109375, "loss_aux_layer_15": 0.11962890625, "loss_aux_layer_16": 0.1298828125, "loss_aux_layer_17": 0.137939453125, "loss_aux_layer_18": 0.14599609375, "loss_aux_layer_19": 0.1484375, "loss_aux_layer_2": 0.0634765625, "loss_aux_layer_20": 0.15576171875, "loss_aux_layer_21": 0.162353515625, "loss_aux_layer_22": 0.183349609375, "loss_aux_layer_23": 0.222900390625, "loss_aux_layer_3": 0.0760498046875, "loss_aux_layer_4": 0.079833984375, "loss_aux_layer_5": 0.0819091796875, "loss_aux_layer_6": 0.0849609375, "loss_aux_layer_7": 0.081787109375, "loss_aux_layer_8": 0.0806884765625, "loss_aux_layer_9": 0.0789794921875, "step": 1600, "total_loss": 0.7061693668365479 }, { "epoch": 0.31696693724015046, "grad_norm": 1.3140478134155273, "learning_rate": 5e-05, "llm_loss": 0.6159578412771225, "loss": 2.887, "loss_aux_layer_0": 0.0225830078125, "loss_aux_layer_1": 0.0516357421875, "loss_aux_layer_10": 0.0797119140625, "loss_aux_layer_11": 0.0845947265625, "loss_aux_layer_12": 0.0904541015625, "loss_aux_layer_13": 0.0975341796875, "loss_aux_layer_14": 0.107421875, "loss_aux_layer_15": 0.1170654296875, "loss_aux_layer_16": 0.127685546875, "loss_aux_layer_17": 0.135498046875, "loss_aux_layer_18": 0.14404296875, "loss_aux_layer_19": 0.146240234375, "loss_aux_layer_2": 0.06365966796875, "loss_aux_layer_20": 0.153076171875, "loss_aux_layer_21": 0.159912109375, "loss_aux_layer_22": 0.180419921875, "loss_aux_layer_23": 0.2197265625, "loss_aux_layer_3": 0.0755615234375, "loss_aux_layer_4": 0.0789794921875, "loss_aux_layer_5": 0.0809326171875, "loss_aux_layer_6": 0.0836181640625, "loss_aux_layer_7": 0.0806884765625, "loss_aux_layer_8": 0.0797119140625, "loss_aux_layer_9": 0.078369140625, "step": 1601, "total_loss": 0.7217554301023483 }, { "epoch": 0.3171649178380519, "grad_norm": 1.0611025094985962, "learning_rate": 5e-05, "llm_loss": 0.5658134669065475, "loss": 2.6763, "loss_aux_layer_0": 0.0233154296875, "loss_aux_layer_1": 0.04864501953125, "loss_aux_layer_10": 0.0758056640625, "loss_aux_layer_11": 0.080322265625, "loss_aux_layer_12": 0.0863037109375, "loss_aux_layer_13": 0.0933837890625, "loss_aux_layer_14": 0.10400390625, "loss_aux_layer_15": 0.114501953125, "loss_aux_layer_16": 0.125, "loss_aux_layer_17": 0.1328125, "loss_aux_layer_18": 0.14208984375, "loss_aux_layer_19": 0.14453125, "loss_aux_layer_2": 0.06005859375, "loss_aux_layer_20": 0.152587890625, "loss_aux_layer_21": 0.16064453125, "loss_aux_layer_22": 0.182373046875, "loss_aux_layer_23": 0.22314453125, "loss_aux_layer_3": 0.0718994140625, "loss_aux_layer_4": 0.0748291015625, "loss_aux_layer_5": 0.07666015625, "loss_aux_layer_6": 0.0797119140625, "loss_aux_layer_7": 0.0765380859375, "loss_aux_layer_8": 0.0755615234375, "loss_aux_layer_9": 0.07421875, "step": 1602, "total_loss": 0.6690662950277328 }, { "epoch": 0.3173628984359533, "grad_norm": 1.0232024192810059, "learning_rate": 5e-05, "llm_loss": 0.6265232563018799, "loss": 2.9391, "loss_aux_layer_0": 0.021881103515625, "loss_aux_layer_1": 0.05224609375, "loss_aux_layer_10": 0.08203125, "loss_aux_layer_11": 0.087158203125, "loss_aux_layer_12": 0.09326171875, "loss_aux_layer_13": 0.1005859375, "loss_aux_layer_14": 0.111083984375, "loss_aux_layer_15": 0.1209716796875, "loss_aux_layer_16": 0.131103515625, "loss_aux_layer_17": 0.13916015625, "loss_aux_layer_18": 0.147705078125, "loss_aux_layer_19": 0.149658203125, "loss_aux_layer_2": 0.0650634765625, "loss_aux_layer_20": 0.15625, "loss_aux_layer_21": 0.1630859375, "loss_aux_layer_22": 0.183837890625, "loss_aux_layer_23": 0.222900390625, "loss_aux_layer_3": 0.077392578125, "loss_aux_layer_4": 0.0806884765625, "loss_aux_layer_5": 0.082763671875, "loss_aux_layer_6": 0.0858154296875, "loss_aux_layer_7": 0.082763671875, "loss_aux_layer_8": 0.081787109375, "loss_aux_layer_9": 0.0804443359375, "step": 1603, "total_loss": 0.7347778379917145 }, { "epoch": 0.31756087903385466, "grad_norm": 1.0134369134902954, "learning_rate": 5e-05, "llm_loss": 0.4303280636668205, "loss": 2.1367, "loss_aux_layer_0": 0.02313232421875, "loss_aux_layer_1": 0.04931640625, "loss_aux_layer_10": 0.076171875, "loss_aux_layer_11": 0.080810546875, "loss_aux_layer_12": 0.086669921875, "loss_aux_layer_13": 0.0938720703125, "loss_aux_layer_14": 0.105224609375, "loss_aux_layer_15": 0.1158447265625, "loss_aux_layer_16": 0.12646484375, "loss_aux_layer_17": 0.1337890625, "loss_aux_layer_18": 0.14306640625, "loss_aux_layer_19": 0.14599609375, "loss_aux_layer_2": 0.06011962890625, "loss_aux_layer_20": 0.15380859375, "loss_aux_layer_21": 0.161865234375, "loss_aux_layer_22": 0.18359375, "loss_aux_layer_23": 0.223876953125, "loss_aux_layer_3": 0.071533203125, "loss_aux_layer_4": 0.0743408203125, "loss_aux_layer_5": 0.076416015625, "loss_aux_layer_6": 0.0794677734375, "loss_aux_layer_7": 0.0765380859375, "loss_aux_layer_8": 0.0760498046875, "loss_aux_layer_9": 0.0748291015625, "step": 1604, "total_loss": 0.5341852754354477 }, { "epoch": 0.3177588596317561, "grad_norm": 1.0426199436187744, "learning_rate": 5e-05, "llm_loss": 0.5085141807794571, "loss": 2.4595, "loss_aux_layer_0": 0.023651123046875, "loss_aux_layer_1": 0.0518798828125, "loss_aux_layer_10": 0.0791015625, "loss_aux_layer_11": 0.083984375, "loss_aux_layer_12": 0.0894775390625, "loss_aux_layer_13": 0.09619140625, "loss_aux_layer_14": 0.107177734375, "loss_aux_layer_15": 0.1177978515625, "loss_aux_layer_16": 0.1285400390625, "loss_aux_layer_17": 0.1357421875, "loss_aux_layer_18": 0.14501953125, "loss_aux_layer_19": 0.148681640625, "loss_aux_layer_2": 0.06317138671875, "loss_aux_layer_20": 0.15576171875, "loss_aux_layer_21": 0.163330078125, "loss_aux_layer_22": 0.184814453125, "loss_aux_layer_23": 0.225341796875, "loss_aux_layer_3": 0.074951171875, "loss_aux_layer_4": 0.07763671875, "loss_aux_layer_5": 0.0799560546875, "loss_aux_layer_6": 0.083251953125, "loss_aux_layer_7": 0.080078125, "loss_aux_layer_8": 0.0787353515625, "loss_aux_layer_9": 0.07763671875, "step": 1605, "total_loss": 0.6148725599050522 }, { "epoch": 0.3179568402296575, "grad_norm": 1.2100955247879028, "learning_rate": 5e-05, "llm_loss": 0.5671726763248444, "loss": 2.6702, "loss_aux_layer_0": 0.02349853515625, "loss_aux_layer_1": 0.04840087890625, "loss_aux_layer_10": 0.0740966796875, "loss_aux_layer_11": 0.0787353515625, "loss_aux_layer_12": 0.08447265625, "loss_aux_layer_13": 0.0909423828125, "loss_aux_layer_14": 0.1014404296875, "loss_aux_layer_15": 0.112060546875, "loss_aux_layer_16": 0.1220703125, "loss_aux_layer_17": 0.129638671875, "loss_aux_layer_18": 0.137451171875, "loss_aux_layer_19": 0.140869140625, "loss_aux_layer_2": 0.05853271484375, "loss_aux_layer_20": 0.1484375, "loss_aux_layer_21": 0.155029296875, "loss_aux_layer_22": 0.175048828125, "loss_aux_layer_23": 0.212890625, "loss_aux_layer_3": 0.0692138671875, "loss_aux_layer_4": 0.072265625, "loss_aux_layer_5": 0.0743408203125, "loss_aux_layer_6": 0.0777587890625, "loss_aux_layer_7": 0.07470703125, "loss_aux_layer_8": 0.07373046875, "loss_aux_layer_9": 0.0726318359375, "step": 1606, "total_loss": 0.6675532460212708 }, { "epoch": 0.3181548208275589, "grad_norm": 1.0764786005020142, "learning_rate": 5e-05, "llm_loss": 0.6624502837657928, "loss": 3.0667, "loss_aux_layer_0": 0.02337646484375, "loss_aux_layer_1": 0.05126953125, "loss_aux_layer_10": 0.07763671875, "loss_aux_layer_11": 0.08251953125, "loss_aux_layer_12": 0.08837890625, "loss_aux_layer_13": 0.0948486328125, "loss_aux_layer_14": 0.104736328125, "loss_aux_layer_15": 0.1148681640625, "loss_aux_layer_16": 0.124755859375, "loss_aux_layer_17": 0.133544921875, "loss_aux_layer_18": 0.1416015625, "loss_aux_layer_19": 0.144287109375, "loss_aux_layer_2": 0.0626220703125, "loss_aux_layer_20": 0.151611328125, "loss_aux_layer_21": 0.158935546875, "loss_aux_layer_22": 0.180419921875, "loss_aux_layer_23": 0.219482421875, "loss_aux_layer_3": 0.07421875, "loss_aux_layer_4": 0.077392578125, "loss_aux_layer_5": 0.0791015625, "loss_aux_layer_6": 0.08203125, "loss_aux_layer_7": 0.0792236328125, "loss_aux_layer_8": 0.0780029296875, "loss_aux_layer_9": 0.07666015625, "step": 1607, "total_loss": 0.7666716426610947 }, { "epoch": 0.3183528014254603, "grad_norm": 1.2240101099014282, "learning_rate": 5e-05, "llm_loss": 0.5388412773609161, "loss": 2.5754, "loss_aux_layer_0": 0.02264404296875, "loss_aux_layer_1": 0.04974365234375, "loss_aux_layer_10": 0.07861328125, "loss_aux_layer_11": 0.0831298828125, "loss_aux_layer_12": 0.089111328125, "loss_aux_layer_13": 0.0958251953125, "loss_aux_layer_14": 0.106201171875, "loss_aux_layer_15": 0.1163330078125, "loss_aux_layer_16": 0.1268310546875, "loss_aux_layer_17": 0.1346435546875, "loss_aux_layer_18": 0.14404296875, "loss_aux_layer_19": 0.146484375, "loss_aux_layer_2": 0.0609130859375, "loss_aux_layer_20": 0.15380859375, "loss_aux_layer_21": 0.16162109375, "loss_aux_layer_22": 0.183837890625, "loss_aux_layer_23": 0.22509765625, "loss_aux_layer_3": 0.0728759765625, "loss_aux_layer_4": 0.076171875, "loss_aux_layer_5": 0.0782470703125, "loss_aux_layer_6": 0.0811767578125, "loss_aux_layer_7": 0.0784912109375, "loss_aux_layer_8": 0.077880859375, "loss_aux_layer_9": 0.0771484375, "step": 1608, "total_loss": 0.6438604146242142 }, { "epoch": 0.31855078202336173, "grad_norm": 0.8478697538375854, "learning_rate": 5e-05, "llm_loss": 0.6188554018735886, "loss": 2.8835, "loss_aux_layer_0": 0.022430419921875, "loss_aux_layer_1": 0.048095703125, "loss_aux_layer_10": 0.07568359375, "loss_aux_layer_11": 0.080078125, "loss_aux_layer_12": 0.0863037109375, "loss_aux_layer_13": 0.09326171875, "loss_aux_layer_14": 0.1036376953125, "loss_aux_layer_15": 0.11376953125, "loss_aux_layer_16": 0.124267578125, "loss_aux_layer_17": 0.132080078125, "loss_aux_layer_18": 0.141357421875, "loss_aux_layer_19": 0.144775390625, "loss_aux_layer_2": 0.05877685546875, "loss_aux_layer_20": 0.15185546875, "loss_aux_layer_21": 0.15771484375, "loss_aux_layer_22": 0.176513671875, "loss_aux_layer_23": 0.213623046875, "loss_aux_layer_3": 0.0701904296875, "loss_aux_layer_4": 0.073486328125, "loss_aux_layer_5": 0.07568359375, "loss_aux_layer_6": 0.078857421875, "loss_aux_layer_7": 0.075927734375, "loss_aux_layer_8": 0.075439453125, "loss_aux_layer_9": 0.07421875, "step": 1609, "total_loss": 0.7208651751279831 }, { "epoch": 0.3187487626212631, "grad_norm": 1.4092339277267456, "learning_rate": 5e-05, "llm_loss": 0.6263973265886307, "loss": 2.9445, "loss_aux_layer_0": 0.0224609375, "loss_aux_layer_1": 0.05230712890625, "loss_aux_layer_10": 0.082763671875, "loss_aux_layer_11": 0.08837890625, "loss_aux_layer_12": 0.0947265625, "loss_aux_layer_13": 0.101806640625, "loss_aux_layer_14": 0.11279296875, "loss_aux_layer_15": 0.1231689453125, "loss_aux_layer_16": 0.13330078125, "loss_aux_layer_17": 0.140869140625, "loss_aux_layer_18": 0.1494140625, "loss_aux_layer_19": 0.151611328125, "loss_aux_layer_2": 0.0648193359375, "loss_aux_layer_20": 0.158203125, "loss_aux_layer_21": 0.165283203125, "loss_aux_layer_22": 0.1875, "loss_aux_layer_23": 0.228271484375, "loss_aux_layer_3": 0.077392578125, "loss_aux_layer_4": 0.0809326171875, "loss_aux_layer_5": 0.0831298828125, "loss_aux_layer_6": 0.0867919921875, "loss_aux_layer_7": 0.0836181640625, "loss_aux_layer_8": 0.0826416015625, "loss_aux_layer_9": 0.0814208984375, "step": 1610, "total_loss": 0.7361259609460831 }, { "epoch": 0.3189467432191645, "grad_norm": 2.0385003089904785, "learning_rate": 5e-05, "llm_loss": 0.7518088221549988, "loss": 3.4118, "loss_aux_layer_0": 0.023223876953125, "loss_aux_layer_1": 0.04766845703125, "loss_aux_layer_10": 0.074951171875, "loss_aux_layer_11": 0.07958984375, "loss_aux_layer_12": 0.08544921875, "loss_aux_layer_13": 0.092041015625, "loss_aux_layer_14": 0.1029052734375, "loss_aux_layer_15": 0.113037109375, "loss_aux_layer_16": 0.1239013671875, "loss_aux_layer_17": 0.132080078125, "loss_aux_layer_18": 0.14013671875, "loss_aux_layer_19": 0.142578125, "loss_aux_layer_2": 0.05816650390625, "loss_aux_layer_20": 0.149658203125, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.175537109375, "loss_aux_layer_23": 0.213134765625, "loss_aux_layer_3": 0.0692138671875, "loss_aux_layer_4": 0.0723876953125, "loss_aux_layer_5": 0.0748291015625, "loss_aux_layer_6": 0.0780029296875, "loss_aux_layer_7": 0.0753173828125, "loss_aux_layer_8": 0.0745849609375, "loss_aux_layer_9": 0.0736083984375, "step": 1611, "total_loss": 0.8529418557882309 }, { "epoch": 0.31914472381706593, "grad_norm": 1.562624216079712, "learning_rate": 5e-05, "llm_loss": 0.5362366288900375, "loss": 2.5666, "loss_aux_layer_0": 0.023345947265625, "loss_aux_layer_1": 0.0496826171875, "loss_aux_layer_10": 0.0784912109375, "loss_aux_layer_11": 0.0831298828125, "loss_aux_layer_12": 0.0892333984375, "loss_aux_layer_13": 0.09619140625, "loss_aux_layer_14": 0.107421875, "loss_aux_layer_15": 0.1180419921875, "loss_aux_layer_16": 0.12890625, "loss_aux_layer_17": 0.136962890625, "loss_aux_layer_18": 0.145263671875, "loss_aux_layer_19": 0.14794921875, "loss_aux_layer_2": 0.0611572265625, "loss_aux_layer_20": 0.1552734375, "loss_aux_layer_21": 0.162109375, "loss_aux_layer_22": 0.18115234375, "loss_aux_layer_23": 0.220947265625, "loss_aux_layer_3": 0.0728759765625, "loss_aux_layer_4": 0.0765380859375, "loss_aux_layer_5": 0.0791015625, "loss_aux_layer_6": 0.08203125, "loss_aux_layer_7": 0.0789794921875, "loss_aux_layer_8": 0.077880859375, "loss_aux_layer_9": 0.0772705078125, "step": 1612, "total_loss": 0.6416545212268829 }, { "epoch": 0.3193427044149673, "grad_norm": 1.9093331098556519, "learning_rate": 5e-05, "llm_loss": 0.63679039478302, "loss": 2.9636, "loss_aux_layer_0": 0.0220947265625, "loss_aux_layer_1": 0.0477294921875, "loss_aux_layer_10": 0.076904296875, "loss_aux_layer_11": 0.0819091796875, "loss_aux_layer_12": 0.088134765625, "loss_aux_layer_13": 0.095703125, "loss_aux_layer_14": 0.1060791015625, "loss_aux_layer_15": 0.1168212890625, "loss_aux_layer_16": 0.12744140625, "loss_aux_layer_17": 0.1357421875, "loss_aux_layer_18": 0.14404296875, "loss_aux_layer_19": 0.146240234375, "loss_aux_layer_2": 0.059326171875, "loss_aux_layer_20": 0.15380859375, "loss_aux_layer_21": 0.1611328125, "loss_aux_layer_22": 0.183349609375, "loss_aux_layer_23": 0.2236328125, "loss_aux_layer_3": 0.0709228515625, "loss_aux_layer_4": 0.07373046875, "loss_aux_layer_5": 0.0762939453125, "loss_aux_layer_6": 0.0789794921875, "loss_aux_layer_7": 0.076416015625, "loss_aux_layer_8": 0.076171875, "loss_aux_layer_9": 0.0753173828125, "step": 1613, "total_loss": 0.7409003227949142 }, { "epoch": 0.31954068501286875, "grad_norm": 3.097033739089966, "learning_rate": 5e-05, "llm_loss": 0.5615016967058182, "loss": 2.6681, "loss_aux_layer_0": 0.022705078125, "loss_aux_layer_1": 0.0506591796875, "loss_aux_layer_10": 0.0791015625, "loss_aux_layer_11": 0.0838623046875, "loss_aux_layer_12": 0.0894775390625, "loss_aux_layer_13": 0.0960693359375, "loss_aux_layer_14": 0.1064453125, "loss_aux_layer_15": 0.11572265625, "loss_aux_layer_16": 0.1259765625, "loss_aux_layer_17": 0.13427734375, "loss_aux_layer_18": 0.1435546875, "loss_aux_layer_19": 0.14697265625, "loss_aux_layer_2": 0.06402587890625, "loss_aux_layer_20": 0.154541015625, "loss_aux_layer_21": 0.1611328125, "loss_aux_layer_22": 0.1806640625, "loss_aux_layer_23": 0.22021484375, "loss_aux_layer_3": 0.0755615234375, "loss_aux_layer_4": 0.0797119140625, "loss_aux_layer_5": 0.08154296875, "loss_aux_layer_6": 0.0836181640625, "loss_aux_layer_7": 0.080322265625, "loss_aux_layer_8": 0.0792236328125, "loss_aux_layer_9": 0.0780029296875, "step": 1614, "total_loss": 0.6670359522104263 }, { "epoch": 0.31973866561077013, "grad_norm": 2.1896896362304688, "learning_rate": 5e-05, "llm_loss": 0.6530687361955643, "loss": 3.0325, "loss_aux_layer_0": 0.02276611328125, "loss_aux_layer_1": 0.04931640625, "loss_aux_layer_10": 0.0770263671875, "loss_aux_layer_11": 0.081787109375, "loss_aux_layer_12": 0.08740234375, "loss_aux_layer_13": 0.094482421875, "loss_aux_layer_14": 0.105224609375, "loss_aux_layer_15": 0.115966796875, "loss_aux_layer_16": 0.1273193359375, "loss_aux_layer_17": 0.1357421875, "loss_aux_layer_18": 0.14453125, "loss_aux_layer_19": 0.1484375, "loss_aux_layer_2": 0.06304931640625, "loss_aux_layer_20": 0.156982421875, "loss_aux_layer_21": 0.163818359375, "loss_aux_layer_22": 0.1845703125, "loss_aux_layer_23": 0.224853515625, "loss_aux_layer_3": 0.0731201171875, "loss_aux_layer_4": 0.0762939453125, "loss_aux_layer_5": 0.0777587890625, "loss_aux_layer_6": 0.080810546875, "loss_aux_layer_7": 0.077880859375, "loss_aux_layer_8": 0.07666015625, "loss_aux_layer_9": 0.0755615234375, "step": 1615, "total_loss": 0.758121520280838 }, { "epoch": 0.31993664620867157, "grad_norm": 2.4984450340270996, "learning_rate": 5e-05, "llm_loss": 0.5591368153691292, "loss": 2.6602, "loss_aux_layer_0": 0.02288818359375, "loss_aux_layer_1": 0.05084228515625, "loss_aux_layer_10": 0.079833984375, "loss_aux_layer_11": 0.084716796875, "loss_aux_layer_12": 0.09033203125, "loss_aux_layer_13": 0.096923828125, "loss_aux_layer_14": 0.1070556640625, "loss_aux_layer_15": 0.1170654296875, "loss_aux_layer_16": 0.12744140625, "loss_aux_layer_17": 0.134765625, "loss_aux_layer_18": 0.143798828125, "loss_aux_layer_19": 0.145751953125, "loss_aux_layer_2": 0.06439208984375, "loss_aux_layer_20": 0.15283203125, "loss_aux_layer_21": 0.16064453125, "loss_aux_layer_22": 0.181396484375, "loss_aux_layer_23": 0.220458984375, "loss_aux_layer_3": 0.0762939453125, "loss_aux_layer_4": 0.0794677734375, "loss_aux_layer_5": 0.081298828125, "loss_aux_layer_6": 0.084228515625, "loss_aux_layer_7": 0.0811767578125, "loss_aux_layer_8": 0.0797119140625, "loss_aux_layer_9": 0.07861328125, "step": 1616, "total_loss": 0.6650534868240356 }, { "epoch": 0.32013462680657295, "grad_norm": 2.0901992321014404, "learning_rate": 5e-05, "llm_loss": 0.7004128992557526, "loss": 3.2217, "loss_aux_layer_0": 0.023345947265625, "loss_aux_layer_1": 0.05181884765625, "loss_aux_layer_10": 0.0772705078125, "loss_aux_layer_11": 0.0821533203125, "loss_aux_layer_12": 0.088134765625, "loss_aux_layer_13": 0.0946044921875, "loss_aux_layer_14": 0.1051025390625, "loss_aux_layer_15": 0.1153564453125, "loss_aux_layer_16": 0.1260986328125, "loss_aux_layer_17": 0.134521484375, "loss_aux_layer_18": 0.1435546875, "loss_aux_layer_19": 0.14697265625, "loss_aux_layer_2": 0.0643310546875, "loss_aux_layer_20": 0.154296875, "loss_aux_layer_21": 0.161376953125, "loss_aux_layer_22": 0.18212890625, "loss_aux_layer_23": 0.22265625, "loss_aux_layer_3": 0.0750732421875, "loss_aux_layer_4": 0.077880859375, "loss_aux_layer_5": 0.0792236328125, "loss_aux_layer_6": 0.0819091796875, "loss_aux_layer_7": 0.07861328125, "loss_aux_layer_8": 0.077880859375, "loss_aux_layer_9": 0.076171875, "step": 1617, "total_loss": 0.8054181337356567 }, { "epoch": 0.32033260740447433, "grad_norm": 1.9625005722045898, "learning_rate": 5e-05, "llm_loss": 0.580293670296669, "loss": 2.7306, "loss_aux_layer_0": 0.02349853515625, "loss_aux_layer_1": 0.048583984375, "loss_aux_layer_10": 0.0751953125, "loss_aux_layer_11": 0.0794677734375, "loss_aux_layer_12": 0.085205078125, "loss_aux_layer_13": 0.0916748046875, "loss_aux_layer_14": 0.1019287109375, "loss_aux_layer_15": 0.11181640625, "loss_aux_layer_16": 0.12255859375, "loss_aux_layer_17": 0.1302490234375, "loss_aux_layer_18": 0.139404296875, "loss_aux_layer_19": 0.1435546875, "loss_aux_layer_2": 0.06146240234375, "loss_aux_layer_20": 0.150634765625, "loss_aux_layer_21": 0.156982421875, "loss_aux_layer_22": 0.17822265625, "loss_aux_layer_23": 0.21728515625, "loss_aux_layer_3": 0.075439453125, "loss_aux_layer_4": 0.0780029296875, "loss_aux_layer_5": 0.0784912109375, "loss_aux_layer_6": 0.08056640625, "loss_aux_layer_7": 0.0767822265625, "loss_aux_layer_8": 0.0753173828125, "loss_aux_layer_9": 0.07421875, "step": 1618, "total_loss": 0.682654544711113 }, { "epoch": 0.32053058800237577, "grad_norm": 1.5402735471725464, "learning_rate": 5e-05, "llm_loss": 0.6367611289024353, "loss": 2.9878, "loss_aux_layer_0": 0.02276611328125, "loss_aux_layer_1": 0.05682373046875, "loss_aux_layer_10": 0.08544921875, "loss_aux_layer_11": 0.0904541015625, "loss_aux_layer_12": 0.0968017578125, "loss_aux_layer_13": 0.103271484375, "loss_aux_layer_14": 0.11328125, "loss_aux_layer_15": 0.123046875, "loss_aux_layer_16": 0.1328125, "loss_aux_layer_17": 0.13916015625, "loss_aux_layer_18": 0.14697265625, "loss_aux_layer_19": 0.1474609375, "loss_aux_layer_2": 0.069091796875, "loss_aux_layer_20": 0.1533203125, "loss_aux_layer_21": 0.15966796875, "loss_aux_layer_22": 0.18017578125, "loss_aux_layer_23": 0.218505859375, "loss_aux_layer_3": 0.082275390625, "loss_aux_layer_4": 0.086181640625, "loss_aux_layer_5": 0.0880126953125, "loss_aux_layer_6": 0.091064453125, "loss_aux_layer_7": 0.08740234375, "loss_aux_layer_8": 0.0858154296875, "loss_aux_layer_9": 0.0843505859375, "step": 1619, "total_loss": 0.7469505816698074 }, { "epoch": 0.32072856860027715, "grad_norm": 1.5963555574417114, "learning_rate": 5e-05, "llm_loss": 0.6331436187028885, "loss": 2.9591, "loss_aux_layer_0": 0.023162841796875, "loss_aux_layer_1": 0.05029296875, "loss_aux_layer_10": 0.0797119140625, "loss_aux_layer_11": 0.084716796875, "loss_aux_layer_12": 0.0908203125, "loss_aux_layer_13": 0.097900390625, "loss_aux_layer_14": 0.108642578125, "loss_aux_layer_15": 0.1195068359375, "loss_aux_layer_16": 0.1307373046875, "loss_aux_layer_17": 0.13818359375, "loss_aux_layer_18": 0.146484375, "loss_aux_layer_19": 0.148681640625, "loss_aux_layer_2": 0.06329345703125, "loss_aux_layer_20": 0.1552734375, "loss_aux_layer_21": 0.162109375, "loss_aux_layer_22": 0.181884765625, "loss_aux_layer_23": 0.221923828125, "loss_aux_layer_3": 0.0755615234375, "loss_aux_layer_4": 0.07861328125, "loss_aux_layer_5": 0.08056640625, "loss_aux_layer_6": 0.083251953125, "loss_aux_layer_7": 0.080322265625, "loss_aux_layer_8": 0.0797119140625, "loss_aux_layer_9": 0.078369140625, "step": 1620, "total_loss": 0.7397802770137787 }, { "epoch": 0.3209265491981786, "grad_norm": 1.4239860773086548, "learning_rate": 5e-05, "llm_loss": 0.5435566753149033, "loss": 2.5908, "loss_aux_layer_0": 0.023590087890625, "loss_aux_layer_1": 0.04876708984375, "loss_aux_layer_10": 0.07568359375, "loss_aux_layer_11": 0.0804443359375, "loss_aux_layer_12": 0.086669921875, "loss_aux_layer_13": 0.0941162109375, "loss_aux_layer_14": 0.1053466796875, "loss_aux_layer_15": 0.115966796875, "loss_aux_layer_16": 0.12744140625, "loss_aux_layer_17": 0.1357421875, "loss_aux_layer_18": 0.144775390625, "loss_aux_layer_19": 0.148193359375, "loss_aux_layer_2": 0.06097412109375, "loss_aux_layer_20": 0.155517578125, "loss_aux_layer_21": 0.162353515625, "loss_aux_layer_22": 0.181640625, "loss_aux_layer_23": 0.22216796875, "loss_aux_layer_3": 0.07275390625, "loss_aux_layer_4": 0.07568359375, "loss_aux_layer_5": 0.0771484375, "loss_aux_layer_6": 0.07958984375, "loss_aux_layer_7": 0.075927734375, "loss_aux_layer_8": 0.075439453125, "loss_aux_layer_9": 0.0743408203125, "step": 1621, "total_loss": 0.6477028727531433 }, { "epoch": 0.32112452979607997, "grad_norm": 1.2406036853790283, "learning_rate": 5e-05, "llm_loss": 0.6597555428743362, "loss": 3.0698, "loss_aux_layer_0": 0.02459716796875, "loss_aux_layer_1": 0.05340576171875, "loss_aux_layer_10": 0.08154296875, "loss_aux_layer_11": 0.08642578125, "loss_aux_layer_12": 0.0921630859375, "loss_aux_layer_13": 0.099365234375, "loss_aux_layer_14": 0.109375, "loss_aux_layer_15": 0.1190185546875, "loss_aux_layer_16": 0.1295166015625, "loss_aux_layer_17": 0.137451171875, "loss_aux_layer_18": 0.145751953125, "loss_aux_layer_19": 0.148681640625, "loss_aux_layer_2": 0.06353759765625, "loss_aux_layer_20": 0.15625, "loss_aux_layer_21": 0.1630859375, "loss_aux_layer_22": 0.1845703125, "loss_aux_layer_23": 0.225830078125, "loss_aux_layer_3": 0.07568359375, "loss_aux_layer_4": 0.0791015625, "loss_aux_layer_5": 0.0819091796875, "loss_aux_layer_6": 0.085205078125, "loss_aux_layer_7": 0.0819091796875, "loss_aux_layer_8": 0.0809326171875, "loss_aux_layer_9": 0.080078125, "step": 1622, "total_loss": 0.7674409002065659 }, { "epoch": 0.3213225103939814, "grad_norm": 1.1293020248413086, "learning_rate": 5e-05, "llm_loss": 0.5225486680865288, "loss": 2.5221, "loss_aux_layer_0": 0.02276611328125, "loss_aux_layer_1": 0.05120849609375, "loss_aux_layer_10": 0.0816650390625, "loss_aux_layer_11": 0.0865478515625, "loss_aux_layer_12": 0.0927734375, "loss_aux_layer_13": 0.099609375, "loss_aux_layer_14": 0.1107177734375, "loss_aux_layer_15": 0.1217041015625, "loss_aux_layer_16": 0.1328125, "loss_aux_layer_17": 0.1396484375, "loss_aux_layer_18": 0.14794921875, "loss_aux_layer_19": 0.14990234375, "loss_aux_layer_2": 0.06414794921875, "loss_aux_layer_20": 0.156494140625, "loss_aux_layer_21": 0.162841796875, "loss_aux_layer_22": 0.18359375, "loss_aux_layer_23": 0.22314453125, "loss_aux_layer_3": 0.0755615234375, "loss_aux_layer_4": 0.0789794921875, "loss_aux_layer_5": 0.0811767578125, "loss_aux_layer_6": 0.084716796875, "loss_aux_layer_7": 0.08203125, "loss_aux_layer_8": 0.0814208984375, "loss_aux_layer_9": 0.080322265625, "step": 1623, "total_loss": 0.6305132061243057 }, { "epoch": 0.3215204909918828, "grad_norm": 1.3948209285736084, "learning_rate": 5e-05, "llm_loss": 0.5764328092336655, "loss": 2.7279, "loss_aux_layer_0": 0.02313232421875, "loss_aux_layer_1": 0.05035400390625, "loss_aux_layer_10": 0.0782470703125, "loss_aux_layer_11": 0.0833740234375, "loss_aux_layer_12": 0.08984375, "loss_aux_layer_13": 0.09619140625, "loss_aux_layer_14": 0.106689453125, "loss_aux_layer_15": 0.1168212890625, "loss_aux_layer_16": 0.128173828125, "loss_aux_layer_17": 0.136474609375, "loss_aux_layer_18": 0.14501953125, "loss_aux_layer_19": 0.1474609375, "loss_aux_layer_2": 0.06298828125, "loss_aux_layer_20": 0.154541015625, "loss_aux_layer_21": 0.1611328125, "loss_aux_layer_22": 0.1826171875, "loss_aux_layer_23": 0.223876953125, "loss_aux_layer_3": 0.0743408203125, "loss_aux_layer_4": 0.077392578125, "loss_aux_layer_5": 0.0792236328125, "loss_aux_layer_6": 0.08203125, "loss_aux_layer_7": 0.078857421875, "loss_aux_layer_8": 0.0780029296875, "loss_aux_layer_9": 0.0767822265625, "step": 1624, "total_loss": 0.6819813698530197 }, { "epoch": 0.3217184715897842, "grad_norm": 1.3499995470046997, "learning_rate": 5e-05, "llm_loss": 0.5873429030179977, "loss": 2.7689, "loss_aux_layer_0": 0.022552490234375, "loss_aux_layer_1": 0.05029296875, "loss_aux_layer_10": 0.077880859375, "loss_aux_layer_11": 0.0830078125, "loss_aux_layer_12": 0.089111328125, "loss_aux_layer_13": 0.0960693359375, "loss_aux_layer_14": 0.1065673828125, "loss_aux_layer_15": 0.1163330078125, "loss_aux_layer_16": 0.127197265625, "loss_aux_layer_17": 0.135009765625, "loss_aux_layer_18": 0.1435546875, "loss_aux_layer_19": 0.146484375, "loss_aux_layer_2": 0.06097412109375, "loss_aux_layer_20": 0.15380859375, "loss_aux_layer_21": 0.1611328125, "loss_aux_layer_22": 0.182861328125, "loss_aux_layer_23": 0.222412109375, "loss_aux_layer_3": 0.072998046875, "loss_aux_layer_4": 0.076171875, "loss_aux_layer_5": 0.078369140625, "loss_aux_layer_6": 0.0816650390625, "loss_aux_layer_7": 0.07861328125, "loss_aux_layer_8": 0.0777587890625, "loss_aux_layer_9": 0.07666015625, "step": 1625, "total_loss": 0.6922168582677841 }, { "epoch": 0.3219164521876856, "grad_norm": 1.0996512174606323, "learning_rate": 5e-05, "llm_loss": 0.6372494548559189, "loss": 2.9765, "loss_aux_layer_0": 0.023895263671875, "loss_aux_layer_1": 0.0523681640625, "loss_aux_layer_10": 0.0797119140625, "loss_aux_layer_11": 0.0849609375, "loss_aux_layer_12": 0.0909423828125, "loss_aux_layer_13": 0.0982666015625, "loss_aux_layer_14": 0.10888671875, "loss_aux_layer_15": 0.1185302734375, "loss_aux_layer_16": 0.12890625, "loss_aux_layer_17": 0.136474609375, "loss_aux_layer_18": 0.145263671875, "loss_aux_layer_19": 0.148193359375, "loss_aux_layer_2": 0.06390380859375, "loss_aux_layer_20": 0.155029296875, "loss_aux_layer_21": 0.16259765625, "loss_aux_layer_22": 0.185546875, "loss_aux_layer_23": 0.225830078125, "loss_aux_layer_3": 0.0751953125, "loss_aux_layer_4": 0.07861328125, "loss_aux_layer_5": 0.080322265625, "loss_aux_layer_6": 0.08349609375, "loss_aux_layer_7": 0.080322265625, "loss_aux_layer_8": 0.07958984375, "loss_aux_layer_9": 0.078369140625, "step": 1626, "total_loss": 0.7441372871398926 }, { "epoch": 0.322114432785587, "grad_norm": 1.062037467956543, "learning_rate": 5e-05, "llm_loss": 0.5322768837213516, "loss": 2.5573, "loss_aux_layer_0": 0.02398681640625, "loss_aux_layer_1": 0.0499267578125, "loss_aux_layer_10": 0.078857421875, "loss_aux_layer_11": 0.0838623046875, "loss_aux_layer_12": 0.0908203125, "loss_aux_layer_13": 0.0986328125, "loss_aux_layer_14": 0.109619140625, "loss_aux_layer_15": 0.120849609375, "loss_aux_layer_16": 0.131591796875, "loss_aux_layer_17": 0.1396484375, "loss_aux_layer_18": 0.149169921875, "loss_aux_layer_19": 0.151123046875, "loss_aux_layer_2": 0.0614013671875, "loss_aux_layer_20": 0.15771484375, "loss_aux_layer_21": 0.164794921875, "loss_aux_layer_22": 0.185791015625, "loss_aux_layer_23": 0.225830078125, "loss_aux_layer_3": 0.0732421875, "loss_aux_layer_4": 0.076904296875, "loss_aux_layer_5": 0.078857421875, "loss_aux_layer_6": 0.08154296875, "loss_aux_layer_7": 0.0789794921875, "loss_aux_layer_8": 0.078369140625, "loss_aux_layer_9": 0.0772705078125, "step": 1627, "total_loss": 0.6393214166164398 }, { "epoch": 0.3223124133834884, "grad_norm": 1.1725767850875854, "learning_rate": 5e-05, "llm_loss": 0.6236113905906677, "loss": 2.8978, "loss_aux_layer_0": 0.022857666015625, "loss_aux_layer_1": 0.04864501953125, "loss_aux_layer_10": 0.073974609375, "loss_aux_layer_11": 0.0784912109375, "loss_aux_layer_12": 0.084716796875, "loss_aux_layer_13": 0.0914306640625, "loss_aux_layer_14": 0.1015625, "loss_aux_layer_15": 0.1119384765625, "loss_aux_layer_16": 0.1219482421875, "loss_aux_layer_17": 0.129638671875, "loss_aux_layer_18": 0.137939453125, "loss_aux_layer_19": 0.140869140625, "loss_aux_layer_2": 0.05987548828125, "loss_aux_layer_20": 0.1484375, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.175537109375, "loss_aux_layer_23": 0.2158203125, "loss_aux_layer_3": 0.0709228515625, "loss_aux_layer_4": 0.073974609375, "loss_aux_layer_5": 0.075439453125, "loss_aux_layer_6": 0.078125, "loss_aux_layer_7": 0.0750732421875, "loss_aux_layer_8": 0.0740966796875, "loss_aux_layer_9": 0.0726318359375, "step": 1628, "total_loss": 0.7244575321674347 }, { "epoch": 0.3225103939813898, "grad_norm": 1.0674420595169067, "learning_rate": 5e-05, "llm_loss": 0.5227595269680023, "loss": 2.5132, "loss_aux_layer_0": 0.02294921875, "loss_aux_layer_1": 0.05010986328125, "loss_aux_layer_10": 0.07861328125, "loss_aux_layer_11": 0.0828857421875, "loss_aux_layer_12": 0.0888671875, "loss_aux_layer_13": 0.0958251953125, "loss_aux_layer_14": 0.10693359375, "loss_aux_layer_15": 0.1173095703125, "loss_aux_layer_16": 0.1279296875, "loss_aux_layer_17": 0.135498046875, "loss_aux_layer_18": 0.145263671875, "loss_aux_layer_19": 0.14697265625, "loss_aux_layer_2": 0.06219482421875, "loss_aux_layer_20": 0.155029296875, "loss_aux_layer_21": 0.161865234375, "loss_aux_layer_22": 0.1826171875, "loss_aux_layer_23": 0.222900390625, "loss_aux_layer_3": 0.0738525390625, "loss_aux_layer_4": 0.077392578125, "loss_aux_layer_5": 0.0794677734375, "loss_aux_layer_6": 0.08251953125, "loss_aux_layer_7": 0.079345703125, "loss_aux_layer_8": 0.078369140625, "loss_aux_layer_9": 0.077392578125, "step": 1629, "total_loss": 0.628298357129097 }, { "epoch": 0.32270837457929125, "grad_norm": 1.072373628616333, "learning_rate": 5e-05, "llm_loss": 0.6303239613771439, "loss": 2.938, "loss_aux_layer_0": 0.0234375, "loss_aux_layer_1": 0.05078125, "loss_aux_layer_10": 0.0767822265625, "loss_aux_layer_11": 0.0816650390625, "loss_aux_layer_12": 0.0875244140625, "loss_aux_layer_13": 0.0948486328125, "loss_aux_layer_14": 0.1053466796875, "loss_aux_layer_15": 0.1160888671875, "loss_aux_layer_16": 0.12646484375, "loss_aux_layer_17": 0.135009765625, "loss_aux_layer_18": 0.142578125, "loss_aux_layer_19": 0.1455078125, "loss_aux_layer_2": 0.0616455078125, "loss_aux_layer_20": 0.15283203125, "loss_aux_layer_21": 0.159912109375, "loss_aux_layer_22": 0.18115234375, "loss_aux_layer_23": 0.220947265625, "loss_aux_layer_3": 0.0726318359375, "loss_aux_layer_4": 0.0760498046875, "loss_aux_layer_5": 0.07763671875, "loss_aux_layer_6": 0.0809326171875, "loss_aux_layer_7": 0.077880859375, "loss_aux_layer_8": 0.076904296875, "loss_aux_layer_9": 0.0755615234375, "step": 1630, "total_loss": 0.7344958782196045 }, { "epoch": 0.3229063551771926, "grad_norm": 1.0121570825576782, "learning_rate": 5e-05, "llm_loss": 0.5962361246347427, "loss": 2.8032, "loss_aux_layer_0": 0.024993896484375, "loss_aux_layer_1": 0.049560546875, "loss_aux_layer_10": 0.0765380859375, "loss_aux_layer_11": 0.0810546875, "loss_aux_layer_12": 0.0872802734375, "loss_aux_layer_13": 0.0941162109375, "loss_aux_layer_14": 0.1051025390625, "loss_aux_layer_15": 0.1158447265625, "loss_aux_layer_16": 0.126708984375, "loss_aux_layer_17": 0.134765625, "loss_aux_layer_18": 0.14404296875, "loss_aux_layer_19": 0.1474609375, "loss_aux_layer_2": 0.06036376953125, "loss_aux_layer_20": 0.155517578125, "loss_aux_layer_21": 0.16357421875, "loss_aux_layer_22": 0.185546875, "loss_aux_layer_23": 0.226318359375, "loss_aux_layer_3": 0.07135009765625, "loss_aux_layer_4": 0.07470703125, "loss_aux_layer_5": 0.07666015625, "loss_aux_layer_6": 0.079833984375, "loss_aux_layer_7": 0.076904296875, "loss_aux_layer_8": 0.076171875, "loss_aux_layer_9": 0.0751953125, "step": 1631, "total_loss": 0.7007883936166763 }, { "epoch": 0.32310433577509406, "grad_norm": 1.3454928398132324, "learning_rate": 5e-05, "llm_loss": 0.6590060591697693, "loss": 3.0579, "loss_aux_layer_0": 0.024200439453125, "loss_aux_layer_1": 0.0498046875, "loss_aux_layer_10": 0.0791015625, "loss_aux_layer_11": 0.0836181640625, "loss_aux_layer_12": 0.08984375, "loss_aux_layer_13": 0.0968017578125, "loss_aux_layer_14": 0.107177734375, "loss_aux_layer_15": 0.1165771484375, "loss_aux_layer_16": 0.1265869140625, "loss_aux_layer_17": 0.134765625, "loss_aux_layer_18": 0.1435546875, "loss_aux_layer_19": 0.145751953125, "loss_aux_layer_2": 0.06146240234375, "loss_aux_layer_20": 0.15380859375, "loss_aux_layer_21": 0.161865234375, "loss_aux_layer_22": 0.1845703125, "loss_aux_layer_23": 0.224609375, "loss_aux_layer_3": 0.073486328125, "loss_aux_layer_4": 0.0770263671875, "loss_aux_layer_5": 0.0791015625, "loss_aux_layer_6": 0.0823974609375, "loss_aux_layer_7": 0.0794677734375, "loss_aux_layer_8": 0.07861328125, "loss_aux_layer_9": 0.0771484375, "step": 1632, "total_loss": 0.764464795589447 }, { "epoch": 0.32330231637299545, "grad_norm": 1.092397689819336, "learning_rate": 5e-05, "llm_loss": 0.5281447768211365, "loss": 2.5204, "loss_aux_layer_0": 0.022430419921875, "loss_aux_layer_1": 0.04827880859375, "loss_aux_layer_10": 0.0750732421875, "loss_aux_layer_11": 0.07958984375, "loss_aux_layer_12": 0.0858154296875, "loss_aux_layer_13": 0.092529296875, "loss_aux_layer_14": 0.1029052734375, "loss_aux_layer_15": 0.1131591796875, "loss_aux_layer_16": 0.1236572265625, "loss_aux_layer_17": 0.132568359375, "loss_aux_layer_18": 0.141845703125, "loss_aux_layer_19": 0.144287109375, "loss_aux_layer_2": 0.05914306640625, "loss_aux_layer_20": 0.151611328125, "loss_aux_layer_21": 0.157958984375, "loss_aux_layer_22": 0.17822265625, "loss_aux_layer_23": 0.217041015625, "loss_aux_layer_3": 0.070068359375, "loss_aux_layer_4": 0.072998046875, "loss_aux_layer_5": 0.0750732421875, "loss_aux_layer_6": 0.078125, "loss_aux_layer_7": 0.07568359375, "loss_aux_layer_8": 0.0748291015625, "loss_aux_layer_9": 0.073486328125, "step": 1633, "total_loss": 0.630088284611702 }, { "epoch": 0.32350029697089683, "grad_norm": 1.1087501049041748, "learning_rate": 5e-05, "llm_loss": 0.5415138453245163, "loss": 2.5829, "loss_aux_layer_0": 0.0245361328125, "loss_aux_layer_1": 0.0474853515625, "loss_aux_layer_10": 0.07666015625, "loss_aux_layer_11": 0.0814208984375, "loss_aux_layer_12": 0.087158203125, "loss_aux_layer_13": 0.094482421875, "loss_aux_layer_14": 0.105224609375, "loss_aux_layer_15": 0.11572265625, "loss_aux_layer_16": 0.1270751953125, "loss_aux_layer_17": 0.1353759765625, "loss_aux_layer_18": 0.14501953125, "loss_aux_layer_19": 0.148193359375, "loss_aux_layer_2": 0.05841064453125, "loss_aux_layer_20": 0.15576171875, "loss_aux_layer_21": 0.1630859375, "loss_aux_layer_22": 0.18408203125, "loss_aux_layer_23": 0.224365234375, "loss_aux_layer_3": 0.06982421875, "loss_aux_layer_4": 0.0733642578125, "loss_aux_layer_5": 0.0760498046875, "loss_aux_layer_6": 0.07958984375, "loss_aux_layer_7": 0.0767822265625, "loss_aux_layer_8": 0.0760498046875, "loss_aux_layer_9": 0.0751953125, "step": 1634, "total_loss": 0.6457258313894272 }, { "epoch": 0.32369827756879826, "grad_norm": 1.2170414924621582, "learning_rate": 5e-05, "llm_loss": 0.6786434799432755, "loss": 3.1456, "loss_aux_layer_0": 0.025299072265625, "loss_aux_layer_1": 0.0538330078125, "loss_aux_layer_10": 0.081298828125, "loss_aux_layer_11": 0.0859375, "loss_aux_layer_12": 0.091796875, "loss_aux_layer_13": 0.098388671875, "loss_aux_layer_14": 0.1087646484375, "loss_aux_layer_15": 0.11865234375, "loss_aux_layer_16": 0.129150390625, "loss_aux_layer_17": 0.13671875, "loss_aux_layer_18": 0.14501953125, "loss_aux_layer_19": 0.147705078125, "loss_aux_layer_2": 0.06524658203125, "loss_aux_layer_20": 0.154296875, "loss_aux_layer_21": 0.16162109375, "loss_aux_layer_22": 0.18359375, "loss_aux_layer_23": 0.2236328125, "loss_aux_layer_3": 0.07763671875, "loss_aux_layer_4": 0.081298828125, "loss_aux_layer_5": 0.0833740234375, "loss_aux_layer_6": 0.0869140625, "loss_aux_layer_7": 0.08349609375, "loss_aux_layer_8": 0.0821533203125, "loss_aux_layer_9": 0.080322265625, "step": 1635, "total_loss": 0.7864042222499847 }, { "epoch": 0.32389625816669965, "grad_norm": 0.8679546117782593, "learning_rate": 5e-05, "llm_loss": 0.6410569846630096, "loss": 2.9882, "loss_aux_layer_0": 0.023956298828125, "loss_aux_layer_1": 0.052001953125, "loss_aux_layer_10": 0.0797119140625, "loss_aux_layer_11": 0.0848388671875, "loss_aux_layer_12": 0.0911865234375, "loss_aux_layer_13": 0.097900390625, "loss_aux_layer_14": 0.108154296875, "loss_aux_layer_15": 0.1180419921875, "loss_aux_layer_16": 0.12890625, "loss_aux_layer_17": 0.1376953125, "loss_aux_layer_18": 0.14501953125, "loss_aux_layer_19": 0.14697265625, "loss_aux_layer_2": 0.06219482421875, "loss_aux_layer_20": 0.153564453125, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.179931640625, "loss_aux_layer_23": 0.218505859375, "loss_aux_layer_3": 0.074462890625, "loss_aux_layer_4": 0.078125, "loss_aux_layer_5": 0.080322265625, "loss_aux_layer_6": 0.083740234375, "loss_aux_layer_7": 0.0809326171875, "loss_aux_layer_8": 0.079833984375, "loss_aux_layer_9": 0.078369140625, "step": 1636, "total_loss": 0.747048556804657 }, { "epoch": 0.3240942387646011, "grad_norm": 0.8540616035461426, "learning_rate": 5e-05, "llm_loss": 0.6160189658403397, "loss": 2.89, "loss_aux_layer_0": 0.02508544921875, "loss_aux_layer_1": 0.05224609375, "loss_aux_layer_10": 0.0797119140625, "loss_aux_layer_11": 0.0843505859375, "loss_aux_layer_12": 0.09033203125, "loss_aux_layer_13": 0.0972900390625, "loss_aux_layer_14": 0.10791015625, "loss_aux_layer_15": 0.1182861328125, "loss_aux_layer_16": 0.129638671875, "loss_aux_layer_17": 0.137451171875, "loss_aux_layer_18": 0.146240234375, "loss_aux_layer_19": 0.148193359375, "loss_aux_layer_2": 0.0634765625, "loss_aux_layer_20": 0.1552734375, "loss_aux_layer_21": 0.160888671875, "loss_aux_layer_22": 0.181396484375, "loss_aux_layer_23": 0.221435546875, "loss_aux_layer_3": 0.0753173828125, "loss_aux_layer_4": 0.0784912109375, "loss_aux_layer_5": 0.0806884765625, "loss_aux_layer_6": 0.083740234375, "loss_aux_layer_7": 0.08056640625, "loss_aux_layer_8": 0.0797119140625, "loss_aux_layer_9": 0.078369140625, "step": 1637, "total_loss": 0.7225085943937302 }, { "epoch": 0.32429221936250247, "grad_norm": 0.9370478987693787, "learning_rate": 5e-05, "llm_loss": 0.6220853626728058, "loss": 2.8965, "loss_aux_layer_0": 0.022064208984375, "loss_aux_layer_1": 0.048095703125, "loss_aux_layer_10": 0.0762939453125, "loss_aux_layer_11": 0.0811767578125, "loss_aux_layer_12": 0.0870361328125, "loss_aux_layer_13": 0.0933837890625, "loss_aux_layer_14": 0.103515625, "loss_aux_layer_15": 0.113525390625, "loss_aux_layer_16": 0.1236572265625, "loss_aux_layer_17": 0.13134765625, "loss_aux_layer_18": 0.140869140625, "loss_aux_layer_19": 0.143310546875, "loss_aux_layer_2": 0.0595703125, "loss_aux_layer_20": 0.150390625, "loss_aux_layer_21": 0.156494140625, "loss_aux_layer_22": 0.176025390625, "loss_aux_layer_23": 0.215087890625, "loss_aux_layer_3": 0.0711669921875, "loss_aux_layer_4": 0.0743408203125, "loss_aux_layer_5": 0.0762939453125, "loss_aux_layer_6": 0.0791015625, "loss_aux_layer_7": 0.0762939453125, "loss_aux_layer_8": 0.075439453125, "loss_aux_layer_9": 0.07470703125, "step": 1638, "total_loss": 0.7241154909133911 }, { "epoch": 0.3244901999604039, "grad_norm": 1.3485403060913086, "learning_rate": 5e-05, "llm_loss": 0.5767276287078857, "loss": 2.7514, "loss_aux_layer_0": 0.023468017578125, "loss_aux_layer_1": 0.05426025390625, "loss_aux_layer_10": 0.08447265625, "loss_aux_layer_11": 0.090087890625, "loss_aux_layer_12": 0.0966796875, "loss_aux_layer_13": 0.1036376953125, "loss_aux_layer_14": 0.1141357421875, "loss_aux_layer_15": 0.123779296875, "loss_aux_layer_16": 0.134033203125, "loss_aux_layer_17": 0.141845703125, "loss_aux_layer_18": 0.150634765625, "loss_aux_layer_19": 0.15185546875, "loss_aux_layer_2": 0.06689453125, "loss_aux_layer_20": 0.158447265625, "loss_aux_layer_21": 0.166015625, "loss_aux_layer_22": 0.1875, "loss_aux_layer_23": 0.2275390625, "loss_aux_layer_3": 0.0797119140625, "loss_aux_layer_4": 0.083740234375, "loss_aux_layer_5": 0.0858154296875, "loss_aux_layer_6": 0.0892333984375, "loss_aux_layer_7": 0.0858154296875, "loss_aux_layer_8": 0.08447265625, "loss_aux_layer_9": 0.0830078125, "step": 1639, "total_loss": 0.6878531128168106 }, { "epoch": 0.3246881805583053, "grad_norm": 1.2649881839752197, "learning_rate": 5e-05, "llm_loss": 0.6690400689840317, "loss": 3.0995, "loss_aux_layer_0": 0.02392578125, "loss_aux_layer_1": 0.05035400390625, "loss_aux_layer_10": 0.079345703125, "loss_aux_layer_11": 0.083984375, "loss_aux_layer_12": 0.0902099609375, "loss_aux_layer_13": 0.0970458984375, "loss_aux_layer_14": 0.107666015625, "loss_aux_layer_15": 0.1177978515625, "loss_aux_layer_16": 0.128662109375, "loss_aux_layer_17": 0.13671875, "loss_aux_layer_18": 0.14453125, "loss_aux_layer_19": 0.14697265625, "loss_aux_layer_2": 0.062744140625, "loss_aux_layer_20": 0.154052734375, "loss_aux_layer_21": 0.160400390625, "loss_aux_layer_22": 0.18212890625, "loss_aux_layer_23": 0.220703125, "loss_aux_layer_3": 0.07470703125, "loss_aux_layer_4": 0.077880859375, "loss_aux_layer_5": 0.0799560546875, "loss_aux_layer_6": 0.0830078125, "loss_aux_layer_7": 0.0799560546875, "loss_aux_layer_8": 0.0791015625, "loss_aux_layer_9": 0.0777587890625, "step": 1640, "total_loss": 0.7748734652996063 }, { "epoch": 0.32488616115620667, "grad_norm": 1.440529704093933, "learning_rate": 5e-05, "llm_loss": 0.6422866731882095, "loss": 2.9751, "loss_aux_layer_0": 0.023345947265625, "loss_aux_layer_1": 0.04766845703125, "loss_aux_layer_10": 0.0738525390625, "loss_aux_layer_11": 0.078369140625, "loss_aux_layer_12": 0.0845947265625, "loss_aux_layer_13": 0.091552734375, "loss_aux_layer_14": 0.101806640625, "loss_aux_layer_15": 0.1123046875, "loss_aux_layer_16": 0.123046875, "loss_aux_layer_17": 0.1314697265625, "loss_aux_layer_18": 0.14013671875, "loss_aux_layer_19": 0.144775390625, "loss_aux_layer_2": 0.05841064453125, "loss_aux_layer_20": 0.15234375, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.1796875, "loss_aux_layer_23": 0.219482421875, "loss_aux_layer_3": 0.06982421875, "loss_aux_layer_4": 0.072998046875, "loss_aux_layer_5": 0.073974609375, "loss_aux_layer_6": 0.077392578125, "loss_aux_layer_7": 0.0745849609375, "loss_aux_layer_8": 0.073486328125, "loss_aux_layer_9": 0.0721435546875, "step": 1641, "total_loss": 0.7437851130962372 }, { "epoch": 0.3250841417541081, "grad_norm": 1.3402823209762573, "learning_rate": 5e-05, "llm_loss": 0.5284875929355621, "loss": 2.5302, "loss_aux_layer_0": 0.02410888671875, "loss_aux_layer_1": 0.04937744140625, "loss_aux_layer_10": 0.0772705078125, "loss_aux_layer_11": 0.0823974609375, "loss_aux_layer_12": 0.08837890625, "loss_aux_layer_13": 0.0950927734375, "loss_aux_layer_14": 0.105224609375, "loss_aux_layer_15": 0.1156005859375, "loss_aux_layer_16": 0.1259765625, "loss_aux_layer_17": 0.134033203125, "loss_aux_layer_18": 0.142578125, "loss_aux_layer_19": 0.145751953125, "loss_aux_layer_2": 0.0614013671875, "loss_aux_layer_20": 0.152587890625, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.178955078125, "loss_aux_layer_23": 0.21875, "loss_aux_layer_3": 0.072998046875, "loss_aux_layer_4": 0.0762939453125, "loss_aux_layer_5": 0.0782470703125, "loss_aux_layer_6": 0.0814208984375, "loss_aux_layer_7": 0.0784912109375, "loss_aux_layer_8": 0.077392578125, "loss_aux_layer_9": 0.076171875, "step": 1642, "total_loss": 0.632538229227066 }, { "epoch": 0.3252821223520095, "grad_norm": 1.0602344274520874, "learning_rate": 5e-05, "llm_loss": 0.6351658403873444, "loss": 2.9754, "loss_aux_layer_0": 0.0252685546875, "loss_aux_layer_1": 0.05377197265625, "loss_aux_layer_10": 0.0819091796875, "loss_aux_layer_11": 0.08740234375, "loss_aux_layer_12": 0.093505859375, "loss_aux_layer_13": 0.1007080078125, "loss_aux_layer_14": 0.1109619140625, "loss_aux_layer_15": 0.12109375, "loss_aux_layer_16": 0.131591796875, "loss_aux_layer_17": 0.138671875, "loss_aux_layer_18": 0.146728515625, "loss_aux_layer_19": 0.1484375, "loss_aux_layer_2": 0.066162109375, "loss_aux_layer_20": 0.15576171875, "loss_aux_layer_21": 0.1630859375, "loss_aux_layer_22": 0.184326171875, "loss_aux_layer_23": 0.224365234375, "loss_aux_layer_3": 0.0782470703125, "loss_aux_layer_4": 0.0816650390625, "loss_aux_layer_5": 0.08349609375, "loss_aux_layer_6": 0.086181640625, "loss_aux_layer_7": 0.0828857421875, "loss_aux_layer_8": 0.081787109375, "loss_aux_layer_9": 0.0804443359375, "step": 1643, "total_loss": 0.7438454777002335 }, { "epoch": 0.3254801029499109, "grad_norm": 1.332727074623108, "learning_rate": 5e-05, "llm_loss": 0.6858408600091934, "loss": 3.1631, "loss_aux_layer_0": 0.021453857421875, "loss_aux_layer_1": 0.05047607421875, "loss_aux_layer_10": 0.0780029296875, "loss_aux_layer_11": 0.082763671875, "loss_aux_layer_12": 0.0889892578125, "loss_aux_layer_13": 0.09619140625, "loss_aux_layer_14": 0.1065673828125, "loss_aux_layer_15": 0.1160888671875, "loss_aux_layer_16": 0.1265869140625, "loss_aux_layer_17": 0.134765625, "loss_aux_layer_18": 0.1435546875, "loss_aux_layer_19": 0.146240234375, "loss_aux_layer_2": 0.0621337890625, "loss_aux_layer_20": 0.15380859375, "loss_aux_layer_21": 0.160888671875, "loss_aux_layer_22": 0.182373046875, "loss_aux_layer_23": 0.221923828125, "loss_aux_layer_3": 0.0740966796875, "loss_aux_layer_4": 0.077392578125, "loss_aux_layer_5": 0.0791015625, "loss_aux_layer_6": 0.081787109375, "loss_aux_layer_7": 0.0791015625, "loss_aux_layer_8": 0.077880859375, "loss_aux_layer_9": 0.0765380859375, "step": 1644, "total_loss": 0.7907772958278656 }, { "epoch": 0.3256780835478123, "grad_norm": 1.3737635612487793, "learning_rate": 5e-05, "llm_loss": 0.5734953284263611, "loss": 2.6916, "loss_aux_layer_0": 0.022064208984375, "loss_aux_layer_1": 0.047119140625, "loss_aux_layer_10": 0.072998046875, "loss_aux_layer_11": 0.07763671875, "loss_aux_layer_12": 0.0831298828125, "loss_aux_layer_13": 0.0894775390625, "loss_aux_layer_14": 0.0994873046875, "loss_aux_layer_15": 0.1094970703125, "loss_aux_layer_16": 0.119140625, "loss_aux_layer_17": 0.127197265625, "loss_aux_layer_18": 0.135986328125, "loss_aux_layer_19": 0.13916015625, "loss_aux_layer_2": 0.05877685546875, "loss_aux_layer_20": 0.146728515625, "loss_aux_layer_21": 0.15478515625, "loss_aux_layer_22": 0.176025390625, "loss_aux_layer_23": 0.21484375, "loss_aux_layer_3": 0.0697021484375, "loss_aux_layer_4": 0.072509765625, "loss_aux_layer_5": 0.07421875, "loss_aux_layer_6": 0.0767822265625, "loss_aux_layer_7": 0.07421875, "loss_aux_layer_8": 0.0731201171875, "loss_aux_layer_9": 0.072021484375, "step": 1645, "total_loss": 0.6728924959897995 }, { "epoch": 0.32587606414571374, "grad_norm": 1.0408234596252441, "learning_rate": 5e-05, "llm_loss": 0.5958559513092041, "loss": 2.8009, "loss_aux_layer_0": 0.02471923828125, "loss_aux_layer_1": 0.0491943359375, "loss_aux_layer_10": 0.076171875, "loss_aux_layer_11": 0.0810546875, "loss_aux_layer_12": 0.0870361328125, "loss_aux_layer_13": 0.0938720703125, "loss_aux_layer_14": 0.104736328125, "loss_aux_layer_15": 0.114990234375, "loss_aux_layer_16": 0.1259765625, "loss_aux_layer_17": 0.134521484375, "loss_aux_layer_18": 0.143798828125, "loss_aux_layer_19": 0.148193359375, "loss_aux_layer_2": 0.06060791015625, "loss_aux_layer_20": 0.156494140625, "loss_aux_layer_21": 0.1640625, "loss_aux_layer_22": 0.185791015625, "loss_aux_layer_23": 0.226318359375, "loss_aux_layer_3": 0.071533203125, "loss_aux_layer_4": 0.07421875, "loss_aux_layer_5": 0.076171875, "loss_aux_layer_6": 0.0792236328125, "loss_aux_layer_7": 0.0762939453125, "loss_aux_layer_8": 0.07568359375, "loss_aux_layer_9": 0.0745849609375, "step": 1646, "total_loss": 0.7002318054437637 }, { "epoch": 0.3260740447436151, "grad_norm": 1.3980658054351807, "learning_rate": 5e-05, "llm_loss": 0.6065976768732071, "loss": 2.842, "loss_aux_layer_0": 0.024169921875, "loss_aux_layer_1": 0.05010986328125, "loss_aux_layer_10": 0.076416015625, "loss_aux_layer_11": 0.081298828125, "loss_aux_layer_12": 0.0872802734375, "loss_aux_layer_13": 0.0941162109375, "loss_aux_layer_14": 0.1044921875, "loss_aux_layer_15": 0.11474609375, "loss_aux_layer_16": 0.1259765625, "loss_aux_layer_17": 0.134521484375, "loss_aux_layer_18": 0.143798828125, "loss_aux_layer_19": 0.146484375, "loss_aux_layer_2": 0.06134033203125, "loss_aux_layer_20": 0.153564453125, "loss_aux_layer_21": 0.16015625, "loss_aux_layer_22": 0.181396484375, "loss_aux_layer_23": 0.2216796875, "loss_aux_layer_3": 0.072509765625, "loss_aux_layer_4": 0.0750732421875, "loss_aux_layer_5": 0.076904296875, "loss_aux_layer_6": 0.0799560546875, "loss_aux_layer_7": 0.07666015625, "loss_aux_layer_8": 0.075927734375, "loss_aux_layer_9": 0.074951171875, "step": 1647, "total_loss": 0.7104995250701904 }, { "epoch": 0.32627202534151656, "grad_norm": 1.2359914779663086, "learning_rate": 5e-05, "llm_loss": 0.6016670167446136, "loss": 2.8208, "loss_aux_layer_0": 0.02264404296875, "loss_aux_layer_1": 0.04803466796875, "loss_aux_layer_10": 0.07666015625, "loss_aux_layer_11": 0.081298828125, "loss_aux_layer_12": 0.0872802734375, "loss_aux_layer_13": 0.094482421875, "loss_aux_layer_14": 0.10546875, "loss_aux_layer_15": 0.115966796875, "loss_aux_layer_16": 0.1265869140625, "loss_aux_layer_17": 0.135009765625, "loss_aux_layer_18": 0.142822265625, "loss_aux_layer_19": 0.145751953125, "loss_aux_layer_2": 0.05987548828125, "loss_aux_layer_20": 0.1533203125, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.18017578125, "loss_aux_layer_23": 0.218505859375, "loss_aux_layer_3": 0.0714111328125, "loss_aux_layer_4": 0.07470703125, "loss_aux_layer_5": 0.07666015625, "loss_aux_layer_6": 0.0799560546875, "loss_aux_layer_7": 0.0771484375, "loss_aux_layer_8": 0.0762939453125, "loss_aux_layer_9": 0.0753173828125, "step": 1648, "total_loss": 0.7052092105150223 }, { "epoch": 0.32647000593941794, "grad_norm": 1.0776461362838745, "learning_rate": 5e-05, "llm_loss": 0.6518570929765701, "loss": 3.0218, "loss_aux_layer_0": 0.026275634765625, "loss_aux_layer_1": 0.0496826171875, "loss_aux_layer_10": 0.076904296875, "loss_aux_layer_11": 0.0819091796875, "loss_aux_layer_12": 0.087646484375, "loss_aux_layer_13": 0.093994140625, "loss_aux_layer_14": 0.103759765625, "loss_aux_layer_15": 0.1136474609375, "loss_aux_layer_16": 0.124267578125, "loss_aux_layer_17": 0.13232421875, "loss_aux_layer_18": 0.140869140625, "loss_aux_layer_19": 0.1435546875, "loss_aux_layer_2": 0.06134033203125, "loss_aux_layer_20": 0.15087890625, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.181884765625, "loss_aux_layer_23": 0.221923828125, "loss_aux_layer_3": 0.0728759765625, "loss_aux_layer_4": 0.0758056640625, "loss_aux_layer_5": 0.07763671875, "loss_aux_layer_6": 0.0806884765625, "loss_aux_layer_7": 0.0775146484375, "loss_aux_layer_8": 0.076416015625, "loss_aux_layer_9": 0.075439453125, "step": 1649, "total_loss": 0.7554555088281631 }, { "epoch": 0.3266679865373193, "grad_norm": 1.0163465738296509, "learning_rate": 5e-05, "llm_loss": 0.5966175347566605, "loss": 2.79, "loss_aux_layer_0": 0.023223876953125, "loss_aux_layer_1": 0.0472412109375, "loss_aux_layer_10": 0.07421875, "loss_aux_layer_11": 0.0787353515625, "loss_aux_layer_12": 0.084228515625, "loss_aux_layer_13": 0.0906982421875, "loss_aux_layer_14": 0.1005859375, "loss_aux_layer_15": 0.1104736328125, "loss_aux_layer_16": 0.1209716796875, "loss_aux_layer_17": 0.129150390625, "loss_aux_layer_18": 0.13916015625, "loss_aux_layer_19": 0.142822265625, "loss_aux_layer_2": 0.05804443359375, "loss_aux_layer_20": 0.150634765625, "loss_aux_layer_21": 0.1572265625, "loss_aux_layer_22": 0.178955078125, "loss_aux_layer_23": 0.22021484375, "loss_aux_layer_3": 0.0692138671875, "loss_aux_layer_4": 0.0721435546875, "loss_aux_layer_5": 0.0743408203125, "loss_aux_layer_6": 0.0771484375, "loss_aux_layer_7": 0.074462890625, "loss_aux_layer_8": 0.0738525390625, "loss_aux_layer_9": 0.0726318359375, "step": 1650, "total_loss": 0.6975024491548538 }, { "epoch": 0.32686596713522076, "grad_norm": 0.8793151378631592, "learning_rate": 5e-05, "llm_loss": 0.5938966944813728, "loss": 2.7959, "loss_aux_layer_0": 0.0230712890625, "loss_aux_layer_1": 0.0509033203125, "loss_aux_layer_10": 0.0791015625, "loss_aux_layer_11": 0.083740234375, "loss_aux_layer_12": 0.089599609375, "loss_aux_layer_13": 0.0963134765625, "loss_aux_layer_14": 0.1060791015625, "loss_aux_layer_15": 0.1160888671875, "loss_aux_layer_16": 0.126220703125, "loss_aux_layer_17": 0.134521484375, "loss_aux_layer_18": 0.142822265625, "loss_aux_layer_19": 0.145263671875, "loss_aux_layer_2": 0.06298828125, "loss_aux_layer_20": 0.152587890625, "loss_aux_layer_21": 0.15966796875, "loss_aux_layer_22": 0.1826171875, "loss_aux_layer_23": 0.221923828125, "loss_aux_layer_3": 0.0743408203125, "loss_aux_layer_4": 0.0777587890625, "loss_aux_layer_5": 0.07958984375, "loss_aux_layer_6": 0.0823974609375, "loss_aux_layer_7": 0.0797119140625, "loss_aux_layer_8": 0.07861328125, "loss_aux_layer_9": 0.0772705078125, "step": 1651, "total_loss": 0.6989828795194626 }, { "epoch": 0.32706394773312214, "grad_norm": 1.0593886375427246, "learning_rate": 5e-05, "llm_loss": 0.6297499686479568, "loss": 2.9584, "loss_aux_layer_0": 0.023834228515625, "loss_aux_layer_1": 0.0556640625, "loss_aux_layer_10": 0.083984375, "loss_aux_layer_11": 0.089111328125, "loss_aux_layer_12": 0.0948486328125, "loss_aux_layer_13": 0.1014404296875, "loss_aux_layer_14": 0.111572265625, "loss_aux_layer_15": 0.1212158203125, "loss_aux_layer_16": 0.1317138671875, "loss_aux_layer_17": 0.138916015625, "loss_aux_layer_18": 0.14794921875, "loss_aux_layer_19": 0.149658203125, "loss_aux_layer_2": 0.06829833984375, "loss_aux_layer_20": 0.156494140625, "loss_aux_layer_21": 0.1630859375, "loss_aux_layer_22": 0.185302734375, "loss_aux_layer_23": 0.22509765625, "loss_aux_layer_3": 0.080322265625, "loss_aux_layer_4": 0.0836181640625, "loss_aux_layer_5": 0.0850830078125, "loss_aux_layer_6": 0.08837890625, "loss_aux_layer_7": 0.085205078125, "loss_aux_layer_8": 0.0838623046875, "loss_aux_layer_9": 0.08251953125, "step": 1652, "total_loss": 0.7396029084920883 }, { "epoch": 0.3272619283310236, "grad_norm": 0.978118360042572, "learning_rate": 5e-05, "llm_loss": 0.6432595700025558, "loss": 2.9813, "loss_aux_layer_0": 0.022308349609375, "loss_aux_layer_1": 0.04962158203125, "loss_aux_layer_10": 0.07666015625, "loss_aux_layer_11": 0.0814208984375, "loss_aux_layer_12": 0.0869140625, "loss_aux_layer_13": 0.093505859375, "loss_aux_layer_14": 0.103271484375, "loss_aux_layer_15": 0.112548828125, "loss_aux_layer_16": 0.1226806640625, "loss_aux_layer_17": 0.130126953125, "loss_aux_layer_18": 0.138916015625, "loss_aux_layer_19": 0.140869140625, "loss_aux_layer_2": 0.0614013671875, "loss_aux_layer_20": 0.14794921875, "loss_aux_layer_21": 0.15478515625, "loss_aux_layer_22": 0.175537109375, "loss_aux_layer_23": 0.215087890625, "loss_aux_layer_3": 0.0726318359375, "loss_aux_layer_4": 0.07568359375, "loss_aux_layer_5": 0.07763671875, "loss_aux_layer_6": 0.080810546875, "loss_aux_layer_7": 0.07763671875, "loss_aux_layer_8": 0.0765380859375, "loss_aux_layer_9": 0.075439453125, "step": 1653, "total_loss": 0.7453205585479736 }, { "epoch": 0.32745990892892496, "grad_norm": 1.1481908559799194, "learning_rate": 5e-05, "llm_loss": 0.6034228205680847, "loss": 2.8269, "loss_aux_layer_0": 0.024078369140625, "loss_aux_layer_1": 0.05169677734375, "loss_aux_layer_10": 0.077392578125, "loss_aux_layer_11": 0.0823974609375, "loss_aux_layer_12": 0.0877685546875, "loss_aux_layer_13": 0.0946044921875, "loss_aux_layer_14": 0.1046142578125, "loss_aux_layer_15": 0.1141357421875, "loss_aux_layer_16": 0.1240234375, "loss_aux_layer_17": 0.131103515625, "loss_aux_layer_18": 0.139892578125, "loss_aux_layer_19": 0.142333984375, "loss_aux_layer_2": 0.06298828125, "loss_aux_layer_20": 0.149169921875, "loss_aux_layer_21": 0.15625, "loss_aux_layer_22": 0.17626953125, "loss_aux_layer_23": 0.215087890625, "loss_aux_layer_3": 0.074462890625, "loss_aux_layer_4": 0.0775146484375, "loss_aux_layer_5": 0.0792236328125, "loss_aux_layer_6": 0.081787109375, "loss_aux_layer_7": 0.0789794921875, "loss_aux_layer_8": 0.07763671875, "loss_aux_layer_9": 0.076171875, "step": 1654, "total_loss": 0.7067321240901947 }, { "epoch": 0.3276578895268264, "grad_norm": 1.4033777713775635, "learning_rate": 5e-05, "llm_loss": 0.57502780854702, "loss": 2.7325, "loss_aux_layer_0": 0.02264404296875, "loss_aux_layer_1": 0.0501708984375, "loss_aux_layer_10": 0.0802001953125, "loss_aux_layer_11": 0.08544921875, "loss_aux_layer_12": 0.0916748046875, "loss_aux_layer_13": 0.09912109375, "loss_aux_layer_14": 0.109619140625, "loss_aux_layer_15": 0.12060546875, "loss_aux_layer_16": 0.131591796875, "loss_aux_layer_17": 0.13916015625, "loss_aux_layer_18": 0.1484375, "loss_aux_layer_19": 0.15087890625, "loss_aux_layer_2": 0.06341552734375, "loss_aux_layer_20": 0.157958984375, "loss_aux_layer_21": 0.166015625, "loss_aux_layer_22": 0.18994140625, "loss_aux_layer_23": 0.2314453125, "loss_aux_layer_3": 0.0748291015625, "loss_aux_layer_4": 0.0782470703125, "loss_aux_layer_5": 0.0802001953125, "loss_aux_layer_6": 0.083251953125, "loss_aux_layer_7": 0.080810546875, "loss_aux_layer_8": 0.079833984375, "loss_aux_layer_9": 0.078369140625, "step": 1655, "total_loss": 0.6831150352954865 }, { "epoch": 0.3278558701247278, "grad_norm": 0.9342002868652344, "learning_rate": 5e-05, "llm_loss": 0.5480460673570633, "loss": 2.6105, "loss_aux_layer_0": 0.02239990234375, "loss_aux_layer_1": 0.048583984375, "loss_aux_layer_10": 0.0765380859375, "loss_aux_layer_11": 0.081298828125, "loss_aux_layer_12": 0.08740234375, "loss_aux_layer_13": 0.0941162109375, "loss_aux_layer_14": 0.104736328125, "loss_aux_layer_15": 0.115478515625, "loss_aux_layer_16": 0.126220703125, "loss_aux_layer_17": 0.134033203125, "loss_aux_layer_18": 0.143310546875, "loss_aux_layer_19": 0.14794921875, "loss_aux_layer_2": 0.0611572265625, "loss_aux_layer_20": 0.15576171875, "loss_aux_layer_21": 0.163330078125, "loss_aux_layer_22": 0.185546875, "loss_aux_layer_23": 0.227294921875, "loss_aux_layer_3": 0.072265625, "loss_aux_layer_4": 0.0751953125, "loss_aux_layer_5": 0.0777587890625, "loss_aux_layer_6": 0.0804443359375, "loss_aux_layer_7": 0.077392578125, "loss_aux_layer_8": 0.0762939453125, "loss_aux_layer_9": 0.0751953125, "step": 1656, "total_loss": 0.6526224911212921 }, { "epoch": 0.32805385072262916, "grad_norm": 2.078994035720825, "learning_rate": 5e-05, "llm_loss": 0.6683386564254761, "loss": 3.0942, "loss_aux_layer_0": 0.0213623046875, "loss_aux_layer_1": 0.04913330078125, "loss_aux_layer_10": 0.07763671875, "loss_aux_layer_11": 0.0828857421875, "loss_aux_layer_12": 0.09033203125, "loss_aux_layer_13": 0.0975341796875, "loss_aux_layer_14": 0.108154296875, "loss_aux_layer_15": 0.1187744140625, "loss_aux_layer_16": 0.130126953125, "loss_aux_layer_17": 0.138671875, "loss_aux_layer_18": 0.1474609375, "loss_aux_layer_19": 0.1494140625, "loss_aux_layer_2": 0.06097412109375, "loss_aux_layer_20": 0.15576171875, "loss_aux_layer_21": 0.1611328125, "loss_aux_layer_22": 0.179931640625, "loss_aux_layer_23": 0.21826171875, "loss_aux_layer_3": 0.07275390625, "loss_aux_layer_4": 0.0758056640625, "loss_aux_layer_5": 0.07763671875, "loss_aux_layer_6": 0.08056640625, "loss_aux_layer_7": 0.0777587890625, "loss_aux_layer_8": 0.076904296875, "loss_aux_layer_9": 0.075927734375, "step": 1657, "total_loss": 0.773559108376503 }, { "epoch": 0.3282518313205306, "grad_norm": 1.5883336067199707, "learning_rate": 5e-05, "llm_loss": 0.6732512414455414, "loss": 3.1387, "loss_aux_layer_0": 0.02337646484375, "loss_aux_layer_1": 0.05413818359375, "loss_aux_layer_10": 0.0850830078125, "loss_aux_layer_11": 0.0906982421875, "loss_aux_layer_12": 0.0965576171875, "loss_aux_layer_13": 0.103759765625, "loss_aux_layer_14": 0.11474609375, "loss_aux_layer_15": 0.124755859375, "loss_aux_layer_16": 0.135009765625, "loss_aux_layer_17": 0.14208984375, "loss_aux_layer_18": 0.150146484375, "loss_aux_layer_19": 0.151611328125, "loss_aux_layer_2": 0.0677490234375, "loss_aux_layer_20": 0.157958984375, "loss_aux_layer_21": 0.164794921875, "loss_aux_layer_22": 0.1875, "loss_aux_layer_23": 0.227783203125, "loss_aux_layer_3": 0.08056640625, "loss_aux_layer_4": 0.08447265625, "loss_aux_layer_5": 0.08642578125, "loss_aux_layer_6": 0.089599609375, "loss_aux_layer_7": 0.08642578125, "loss_aux_layer_8": 0.085205078125, "loss_aux_layer_9": 0.083740234375, "step": 1658, "total_loss": 0.7846677452325821 }, { "epoch": 0.328449811918432, "grad_norm": 1.2976940870285034, "learning_rate": 5e-05, "llm_loss": 0.5783836022019386, "loss": 2.7336, "loss_aux_layer_0": 0.02313232421875, "loss_aux_layer_1": 0.04864501953125, "loss_aux_layer_10": 0.078125, "loss_aux_layer_11": 0.082763671875, "loss_aux_layer_12": 0.0885009765625, "loss_aux_layer_13": 0.09521484375, "loss_aux_layer_14": 0.1058349609375, "loss_aux_layer_15": 0.1163330078125, "loss_aux_layer_16": 0.1270751953125, "loss_aux_layer_17": 0.13525390625, "loss_aux_layer_18": 0.1435546875, "loss_aux_layer_19": 0.146728515625, "loss_aux_layer_2": 0.06109619140625, "loss_aux_layer_20": 0.154296875, "loss_aux_layer_21": 0.1611328125, "loss_aux_layer_22": 0.18310546875, "loss_aux_layer_23": 0.2236328125, "loss_aux_layer_3": 0.0738525390625, "loss_aux_layer_4": 0.076904296875, "loss_aux_layer_5": 0.0792236328125, "loss_aux_layer_6": 0.0821533203125, "loss_aux_layer_7": 0.0791015625, "loss_aux_layer_8": 0.078125, "loss_aux_layer_9": 0.0770263671875, "step": 1659, "total_loss": 0.683403804898262 }, { "epoch": 0.3286477925163334, "grad_norm": 1.3255646228790283, "learning_rate": 5e-05, "llm_loss": 0.6188408881425858, "loss": 2.8898, "loss_aux_layer_0": 0.02392578125, "loss_aux_layer_1": 0.0498046875, "loss_aux_layer_10": 0.0758056640625, "loss_aux_layer_11": 0.0806884765625, "loss_aux_layer_12": 0.0867919921875, "loss_aux_layer_13": 0.093505859375, "loss_aux_layer_14": 0.10400390625, "loss_aux_layer_15": 0.11474609375, "loss_aux_layer_16": 0.125732421875, "loss_aux_layer_17": 0.133544921875, "loss_aux_layer_18": 0.141357421875, "loss_aux_layer_19": 0.144775390625, "loss_aux_layer_2": 0.0611572265625, "loss_aux_layer_20": 0.15234375, "loss_aux_layer_21": 0.160400390625, "loss_aux_layer_22": 0.18359375, "loss_aux_layer_23": 0.22412109375, "loss_aux_layer_3": 0.072509765625, "loss_aux_layer_4": 0.0751953125, "loss_aux_layer_5": 0.0767822265625, "loss_aux_layer_6": 0.079345703125, "loss_aux_layer_7": 0.0762939453125, "loss_aux_layer_8": 0.07568359375, "loss_aux_layer_9": 0.0743408203125, "step": 1660, "total_loss": 0.7224498242139816 }, { "epoch": 0.3288457731142348, "grad_norm": 1.2024767398834229, "learning_rate": 5e-05, "llm_loss": 0.6106324642896652, "loss": 2.855, "loss_aux_layer_0": 0.022064208984375, "loss_aux_layer_1": 0.047607421875, "loss_aux_layer_10": 0.0762939453125, "loss_aux_layer_11": 0.0809326171875, "loss_aux_layer_12": 0.087158203125, "loss_aux_layer_13": 0.093994140625, "loss_aux_layer_14": 0.1046142578125, "loss_aux_layer_15": 0.1148681640625, "loss_aux_layer_16": 0.12548828125, "loss_aux_layer_17": 0.133544921875, "loss_aux_layer_18": 0.142333984375, "loss_aux_layer_19": 0.145751953125, "loss_aux_layer_2": 0.06005859375, "loss_aux_layer_20": 0.1533203125, "loss_aux_layer_21": 0.16015625, "loss_aux_layer_22": 0.1796875, "loss_aux_layer_23": 0.218505859375, "loss_aux_layer_3": 0.0711669921875, "loss_aux_layer_4": 0.07421875, "loss_aux_layer_5": 0.076171875, "loss_aux_layer_6": 0.0792236328125, "loss_aux_layer_7": 0.0765380859375, "loss_aux_layer_8": 0.0760498046875, "loss_aux_layer_9": 0.0750732421875, "step": 1661, "total_loss": 0.7137518376111984 }, { "epoch": 0.32904375371213623, "grad_norm": 1.4259594678878784, "learning_rate": 5e-05, "llm_loss": 0.5447199791669846, "loss": 2.6119, "loss_aux_layer_0": 0.023773193359375, "loss_aux_layer_1": 0.05169677734375, "loss_aux_layer_10": 0.0802001953125, "loss_aux_layer_11": 0.0855712890625, "loss_aux_layer_12": 0.0916748046875, "loss_aux_layer_13": 0.098876953125, "loss_aux_layer_14": 0.1092529296875, "loss_aux_layer_15": 0.1197509765625, "loss_aux_layer_16": 0.130615234375, "loss_aux_layer_17": 0.137451171875, "loss_aux_layer_18": 0.146484375, "loss_aux_layer_19": 0.14990234375, "loss_aux_layer_2": 0.065185546875, "loss_aux_layer_20": 0.157470703125, "loss_aux_layer_21": 0.166259765625, "loss_aux_layer_22": 0.188232421875, "loss_aux_layer_23": 0.228759765625, "loss_aux_layer_3": 0.0775146484375, "loss_aux_layer_4": 0.0806884765625, "loss_aux_layer_5": 0.082275390625, "loss_aux_layer_6": 0.0849609375, "loss_aux_layer_7": 0.0819091796875, "loss_aux_layer_8": 0.0806884765625, "loss_aux_layer_9": 0.0791015625, "step": 1662, "total_loss": 0.6529669314622879 }, { "epoch": 0.3292417343100376, "grad_norm": 1.0085694789886475, "learning_rate": 5e-05, "llm_loss": 0.6416104435920715, "loss": 2.9962, "loss_aux_layer_0": 0.02197265625, "loss_aux_layer_1": 0.05059814453125, "loss_aux_layer_10": 0.0816650390625, "loss_aux_layer_11": 0.086669921875, "loss_aux_layer_12": 0.0928955078125, "loss_aux_layer_13": 0.099853515625, "loss_aux_layer_14": 0.1099853515625, "loss_aux_layer_15": 0.1197509765625, "loss_aux_layer_16": 0.130126953125, "loss_aux_layer_17": 0.137939453125, "loss_aux_layer_18": 0.147705078125, "loss_aux_layer_19": 0.148681640625, "loss_aux_layer_2": 0.0640869140625, "loss_aux_layer_20": 0.155517578125, "loss_aux_layer_21": 0.16162109375, "loss_aux_layer_22": 0.182861328125, "loss_aux_layer_23": 0.220703125, "loss_aux_layer_3": 0.0762939453125, "loss_aux_layer_4": 0.0799560546875, "loss_aux_layer_5": 0.0816650390625, "loss_aux_layer_6": 0.0849609375, "loss_aux_layer_7": 0.0819091796875, "loss_aux_layer_8": 0.0810546875, "loss_aux_layer_9": 0.0799560546875, "step": 1663, "total_loss": 0.7490590214729309 }, { "epoch": 0.329439714907939, "grad_norm": 1.3469136953353882, "learning_rate": 5e-05, "llm_loss": 0.6434137374162674, "loss": 3.012, "loss_aux_layer_0": 0.0235595703125, "loss_aux_layer_1": 0.05230712890625, "loss_aux_layer_10": 0.08251953125, "loss_aux_layer_11": 0.087646484375, "loss_aux_layer_12": 0.0936279296875, "loss_aux_layer_13": 0.1007080078125, "loss_aux_layer_14": 0.112060546875, "loss_aux_layer_15": 0.12255859375, "loss_aux_layer_16": 0.1336669921875, "loss_aux_layer_17": 0.140869140625, "loss_aux_layer_18": 0.149658203125, "loss_aux_layer_19": 0.15185546875, "loss_aux_layer_2": 0.0654296875, "loss_aux_layer_20": 0.15869140625, "loss_aux_layer_21": 0.1650390625, "loss_aux_layer_22": 0.1875, "loss_aux_layer_23": 0.228759765625, "loss_aux_layer_3": 0.0777587890625, "loss_aux_layer_4": 0.0810546875, "loss_aux_layer_5": 0.0826416015625, "loss_aux_layer_6": 0.0863037109375, "loss_aux_layer_7": 0.0831298828125, "loss_aux_layer_8": 0.08203125, "loss_aux_layer_9": 0.080810546875, "step": 1664, "total_loss": 0.7529998421669006 }, { "epoch": 0.32963769550584043, "grad_norm": 1.0857288837432861, "learning_rate": 5e-05, "llm_loss": 0.6105442941188812, "loss": 2.8548, "loss_aux_layer_0": 0.0238037109375, "loss_aux_layer_1": 0.04931640625, "loss_aux_layer_10": 0.0758056640625, "loss_aux_layer_11": 0.080322265625, "loss_aux_layer_12": 0.086669921875, "loss_aux_layer_13": 0.093505859375, "loss_aux_layer_14": 0.1041259765625, "loss_aux_layer_15": 0.1142578125, "loss_aux_layer_16": 0.125, "loss_aux_layer_17": 0.133056640625, "loss_aux_layer_18": 0.14208984375, "loss_aux_layer_19": 0.144775390625, "loss_aux_layer_2": 0.06060791015625, "loss_aux_layer_20": 0.152099609375, "loss_aux_layer_21": 0.158447265625, "loss_aux_layer_22": 0.1796875, "loss_aux_layer_23": 0.220458984375, "loss_aux_layer_3": 0.072021484375, "loss_aux_layer_4": 0.0755615234375, "loss_aux_layer_5": 0.0772705078125, "loss_aux_layer_6": 0.0799560546875, "loss_aux_layer_7": 0.076904296875, "loss_aux_layer_8": 0.0758056640625, "loss_aux_layer_9": 0.0745849609375, "step": 1665, "total_loss": 0.7136970460414886 }, { "epoch": 0.3298356761037418, "grad_norm": 1.3112742900848389, "learning_rate": 5e-05, "llm_loss": 0.6144035011529922, "loss": 2.8826, "loss_aux_layer_0": 0.023773193359375, "loss_aux_layer_1": 0.0504150390625, "loss_aux_layer_10": 0.0789794921875, "loss_aux_layer_11": 0.084228515625, "loss_aux_layer_12": 0.0902099609375, "loss_aux_layer_13": 0.0970458984375, "loss_aux_layer_14": 0.107666015625, "loss_aux_layer_15": 0.11767578125, "loss_aux_layer_16": 0.1287841796875, "loss_aux_layer_17": 0.136474609375, "loss_aux_layer_18": 0.14501953125, "loss_aux_layer_19": 0.14794921875, "loss_aux_layer_2": 0.06207275390625, "loss_aux_layer_20": 0.155029296875, "loss_aux_layer_21": 0.162841796875, "loss_aux_layer_22": 0.184814453125, "loss_aux_layer_23": 0.22509765625, "loss_aux_layer_3": 0.074462890625, "loss_aux_layer_4": 0.07763671875, "loss_aux_layer_5": 0.0799560546875, "loss_aux_layer_6": 0.083251953125, "loss_aux_layer_7": 0.080078125, "loss_aux_layer_8": 0.0789794921875, "loss_aux_layer_9": 0.0777587890625, "step": 1666, "total_loss": 0.7206591814756393 }, { "epoch": 0.33003365670164325, "grad_norm": 0.9496421813964844, "learning_rate": 5e-05, "llm_loss": 0.5863818228244781, "loss": 2.7584, "loss_aux_layer_0": 0.0220947265625, "loss_aux_layer_1": 0.04754638671875, "loss_aux_layer_10": 0.0755615234375, "loss_aux_layer_11": 0.08056640625, "loss_aux_layer_12": 0.086669921875, "loss_aux_layer_13": 0.0936279296875, "loss_aux_layer_14": 0.1044921875, "loss_aux_layer_15": 0.11572265625, "loss_aux_layer_16": 0.1265869140625, "loss_aux_layer_17": 0.134765625, "loss_aux_layer_18": 0.143310546875, "loss_aux_layer_19": 0.14599609375, "loss_aux_layer_2": 0.05828857421875, "loss_aux_layer_20": 0.15380859375, "loss_aux_layer_21": 0.1611328125, "loss_aux_layer_22": 0.18212890625, "loss_aux_layer_23": 0.22216796875, "loss_aux_layer_3": 0.0699462890625, "loss_aux_layer_4": 0.0732421875, "loss_aux_layer_5": 0.0755615234375, "loss_aux_layer_6": 0.0787353515625, "loss_aux_layer_7": 0.0758056640625, "loss_aux_layer_8": 0.0750732421875, "loss_aux_layer_9": 0.07421875, "step": 1667, "total_loss": 0.6896045207977295 }, { "epoch": 0.33023163729954463, "grad_norm": 1.2834025621414185, "learning_rate": 5e-05, "llm_loss": 0.6802002936601639, "loss": 3.1455, "loss_aux_layer_0": 0.023162841796875, "loss_aux_layer_1": 0.051025390625, "loss_aux_layer_10": 0.079833984375, "loss_aux_layer_11": 0.0853271484375, "loss_aux_layer_12": 0.0911865234375, "loss_aux_layer_13": 0.0980224609375, "loss_aux_layer_14": 0.1085205078125, "loss_aux_layer_15": 0.1182861328125, "loss_aux_layer_16": 0.12841796875, "loss_aux_layer_17": 0.135986328125, "loss_aux_layer_18": 0.14453125, "loss_aux_layer_19": 0.147216796875, "loss_aux_layer_2": 0.062744140625, "loss_aux_layer_20": 0.15380859375, "loss_aux_layer_21": 0.1611328125, "loss_aux_layer_22": 0.181884765625, "loss_aux_layer_23": 0.2216796875, "loss_aux_layer_3": 0.07470703125, "loss_aux_layer_4": 0.0782470703125, "loss_aux_layer_5": 0.080078125, "loss_aux_layer_6": 0.083251953125, "loss_aux_layer_7": 0.08056640625, "loss_aux_layer_8": 0.0797119140625, "loss_aux_layer_9": 0.078369140625, "step": 1668, "total_loss": 0.7863866686820984 }, { "epoch": 0.33042961789744607, "grad_norm": 0.9651650786399841, "learning_rate": 5e-05, "llm_loss": 0.6076201796531677, "loss": 2.8564, "loss_aux_layer_0": 0.024444580078125, "loss_aux_layer_1": 0.0499267578125, "loss_aux_layer_10": 0.0794677734375, "loss_aux_layer_11": 0.0843505859375, "loss_aux_layer_12": 0.0899658203125, "loss_aux_layer_13": 0.0966796875, "loss_aux_layer_14": 0.10791015625, "loss_aux_layer_15": 0.1181640625, "loss_aux_layer_16": 0.12841796875, "loss_aux_layer_17": 0.136474609375, "loss_aux_layer_18": 0.14453125, "loss_aux_layer_19": 0.148193359375, "loss_aux_layer_2": 0.062255859375, "loss_aux_layer_20": 0.155517578125, "loss_aux_layer_21": 0.1630859375, "loss_aux_layer_22": 0.184814453125, "loss_aux_layer_23": 0.22607421875, "loss_aux_layer_3": 0.074462890625, "loss_aux_layer_4": 0.0782470703125, "loss_aux_layer_5": 0.0804443359375, "loss_aux_layer_6": 0.0838623046875, "loss_aux_layer_7": 0.08056640625, "loss_aux_layer_8": 0.079345703125, "loss_aux_layer_9": 0.077880859375, "step": 1669, "total_loss": 0.7141042798757553 }, { "epoch": 0.33062759849534745, "grad_norm": 1.384774088859558, "learning_rate": 5e-05, "llm_loss": 0.5909239649772644, "loss": 2.7961, "loss_aux_layer_0": 0.0245361328125, "loss_aux_layer_1": 0.05230712890625, "loss_aux_layer_10": 0.08154296875, "loss_aux_layer_11": 0.0863037109375, "loss_aux_layer_12": 0.092529296875, "loss_aux_layer_13": 0.0994873046875, "loss_aux_layer_14": 0.1102294921875, "loss_aux_layer_15": 0.120849609375, "loss_aux_layer_16": 0.132080078125, "loss_aux_layer_17": 0.13916015625, "loss_aux_layer_18": 0.14794921875, "loss_aux_layer_19": 0.150634765625, "loss_aux_layer_2": 0.06396484375, "loss_aux_layer_20": 0.157470703125, "loss_aux_layer_21": 0.16357421875, "loss_aux_layer_22": 0.1845703125, "loss_aux_layer_23": 0.2236328125, "loss_aux_layer_3": 0.075927734375, "loss_aux_layer_4": 0.07958984375, "loss_aux_layer_5": 0.081298828125, "loss_aux_layer_6": 0.084716796875, "loss_aux_layer_7": 0.0816650390625, "loss_aux_layer_8": 0.0809326171875, "loss_aux_layer_9": 0.079833984375, "step": 1670, "total_loss": 0.69903264939785 }, { "epoch": 0.3308255790932489, "grad_norm": 1.29607093334198, "learning_rate": 5e-05, "llm_loss": 0.6070275157690048, "loss": 2.8412, "loss_aux_layer_0": 0.0223388671875, "loss_aux_layer_1": 0.047119140625, "loss_aux_layer_10": 0.0760498046875, "loss_aux_layer_11": 0.0811767578125, "loss_aux_layer_12": 0.08740234375, "loss_aux_layer_13": 0.0948486328125, "loss_aux_layer_14": 0.105712890625, "loss_aux_layer_15": 0.11572265625, "loss_aux_layer_16": 0.1265869140625, "loss_aux_layer_17": 0.13525390625, "loss_aux_layer_18": 0.143798828125, "loss_aux_layer_19": 0.146240234375, "loss_aux_layer_2": 0.059814453125, "loss_aux_layer_20": 0.15283203125, "loss_aux_layer_21": 0.159423828125, "loss_aux_layer_22": 0.179931640625, "loss_aux_layer_23": 0.21875, "loss_aux_layer_3": 0.0703125, "loss_aux_layer_4": 0.073974609375, "loss_aux_layer_5": 0.0760498046875, "loss_aux_layer_6": 0.078857421875, "loss_aux_layer_7": 0.076171875, "loss_aux_layer_8": 0.0755615234375, "loss_aux_layer_9": 0.0748291015625, "step": 1671, "total_loss": 0.710306316614151 }, { "epoch": 0.33102355969115027, "grad_norm": 1.1463589668273926, "learning_rate": 5e-05, "llm_loss": 0.5650112703442574, "loss": 2.6982, "loss_aux_layer_0": 0.0247802734375, "loss_aux_layer_1": 0.0537109375, "loss_aux_layer_10": 0.0806884765625, "loss_aux_layer_11": 0.0859375, "loss_aux_layer_12": 0.092041015625, "loss_aux_layer_13": 0.099609375, "loss_aux_layer_14": 0.11083984375, "loss_aux_layer_15": 0.1217041015625, "loss_aux_layer_16": 0.13232421875, "loss_aux_layer_17": 0.14013671875, "loss_aux_layer_18": 0.1494140625, "loss_aux_layer_19": 0.15234375, "loss_aux_layer_2": 0.06591796875, "loss_aux_layer_20": 0.16015625, "loss_aux_layer_21": 0.16748046875, "loss_aux_layer_22": 0.191162109375, "loss_aux_layer_23": 0.23388671875, "loss_aux_layer_3": 0.0771484375, "loss_aux_layer_4": 0.080322265625, "loss_aux_layer_5": 0.082275390625, "loss_aux_layer_6": 0.0850830078125, "loss_aux_layer_7": 0.0819091796875, "loss_aux_layer_8": 0.080810546875, "loss_aux_layer_9": 0.0792236328125, "step": 1672, "total_loss": 0.6745561957359314 }, { "epoch": 0.33122154028905165, "grad_norm": 1.4124482870101929, "learning_rate": 5e-05, "llm_loss": 0.5982308387756348, "loss": 2.8125, "loss_aux_layer_0": 0.022613525390625, "loss_aux_layer_1": 0.05047607421875, "loss_aux_layer_10": 0.078857421875, "loss_aux_layer_11": 0.083984375, "loss_aux_layer_12": 0.0899658203125, "loss_aux_layer_13": 0.0966796875, "loss_aux_layer_14": 0.1063232421875, "loss_aux_layer_15": 0.1160888671875, "loss_aux_layer_16": 0.1259765625, "loss_aux_layer_17": 0.133544921875, "loss_aux_layer_18": 0.1416015625, "loss_aux_layer_19": 0.143798828125, "loss_aux_layer_2": 0.06329345703125, "loss_aux_layer_20": 0.1513671875, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.180419921875, "loss_aux_layer_23": 0.2197265625, "loss_aux_layer_3": 0.07568359375, "loss_aux_layer_4": 0.0789794921875, "loss_aux_layer_5": 0.0804443359375, "loss_aux_layer_6": 0.0831298828125, "loss_aux_layer_7": 0.080078125, "loss_aux_layer_8": 0.0789794921875, "loss_aux_layer_9": 0.077392578125, "step": 1673, "total_loss": 0.7031248658895493 }, { "epoch": 0.3314195208869531, "grad_norm": 1.2585010528564453, "learning_rate": 5e-05, "llm_loss": 0.6271653398871422, "loss": 2.931, "loss_aux_layer_0": 0.025634765625, "loss_aux_layer_1": 0.05023193359375, "loss_aux_layer_10": 0.0782470703125, "loss_aux_layer_11": 0.083251953125, "loss_aux_layer_12": 0.089111328125, "loss_aux_layer_13": 0.0958251953125, "loss_aux_layer_14": 0.10693359375, "loss_aux_layer_15": 0.1180419921875, "loss_aux_layer_16": 0.12890625, "loss_aux_layer_17": 0.136474609375, "loss_aux_layer_18": 0.144775390625, "loss_aux_layer_19": 0.1474609375, "loss_aux_layer_2": 0.0616455078125, "loss_aux_layer_20": 0.155029296875, "loss_aux_layer_21": 0.161865234375, "loss_aux_layer_22": 0.18408203125, "loss_aux_layer_23": 0.22314453125, "loss_aux_layer_3": 0.0728759765625, "loss_aux_layer_4": 0.0765380859375, "loss_aux_layer_5": 0.07861328125, "loss_aux_layer_6": 0.081787109375, "loss_aux_layer_7": 0.0789794921875, "loss_aux_layer_8": 0.078125, "loss_aux_layer_9": 0.076904296875, "step": 1674, "total_loss": 0.7327559888362885 }, { "epoch": 0.33161750148485447, "grad_norm": 1.5522881746292114, "learning_rate": 5e-05, "llm_loss": 0.611988291144371, "loss": 2.8485, "loss_aux_layer_0": 0.023223876953125, "loss_aux_layer_1": 0.04547119140625, "loss_aux_layer_10": 0.072021484375, "loss_aux_layer_11": 0.0765380859375, "loss_aux_layer_12": 0.082763671875, "loss_aux_layer_13": 0.0894775390625, "loss_aux_layer_14": 0.1002197265625, "loss_aux_layer_15": 0.1112060546875, "loss_aux_layer_16": 0.1224365234375, "loss_aux_layer_17": 0.130859375, "loss_aux_layer_18": 0.140625, "loss_aux_layer_19": 0.144775390625, "loss_aux_layer_2": 0.05615234375, "loss_aux_layer_20": 0.15234375, "loss_aux_layer_21": 0.159423828125, "loss_aux_layer_22": 0.178955078125, "loss_aux_layer_23": 0.220703125, "loss_aux_layer_3": 0.0665283203125, "loss_aux_layer_4": 0.0692138671875, "loss_aux_layer_5": 0.0711669921875, "loss_aux_layer_6": 0.074462890625, "loss_aux_layer_7": 0.07196044921875, "loss_aux_layer_8": 0.0714111328125, "loss_aux_layer_9": 0.0709228515625, "step": 1675, "total_loss": 0.7121169418096542 }, { "epoch": 0.3318154820827559, "grad_norm": 1.6095002889633179, "learning_rate": 5e-05, "llm_loss": 0.6034069061279297, "loss": 2.8289, "loss_aux_layer_0": 0.021942138671875, "loss_aux_layer_1": 0.050537109375, "loss_aux_layer_10": 0.07861328125, "loss_aux_layer_11": 0.083251953125, "loss_aux_layer_12": 0.089111328125, "loss_aux_layer_13": 0.095458984375, "loss_aux_layer_14": 0.1053466796875, "loss_aux_layer_15": 0.114990234375, "loss_aux_layer_16": 0.1246337890625, "loss_aux_layer_17": 0.1318359375, "loss_aux_layer_18": 0.140380859375, "loss_aux_layer_19": 0.142822265625, "loss_aux_layer_2": 0.063232421875, "loss_aux_layer_20": 0.14990234375, "loss_aux_layer_21": 0.15673828125, "loss_aux_layer_22": 0.177490234375, "loss_aux_layer_23": 0.217041015625, "loss_aux_layer_3": 0.0743408203125, "loss_aux_layer_4": 0.077392578125, "loss_aux_layer_5": 0.0794677734375, "loss_aux_layer_6": 0.08251953125, "loss_aux_layer_7": 0.07958984375, "loss_aux_layer_8": 0.0787353515625, "loss_aux_layer_9": 0.0771484375, "step": 1676, "total_loss": 0.7072198987007141 }, { "epoch": 0.3320134626806573, "grad_norm": 1.6155203580856323, "learning_rate": 5e-05, "llm_loss": 0.6213433369994164, "loss": 2.9175, "loss_aux_layer_0": 0.022552490234375, "loss_aux_layer_1": 0.05108642578125, "loss_aux_layer_10": 0.0816650390625, "loss_aux_layer_11": 0.0869140625, "loss_aux_layer_12": 0.0933837890625, "loss_aux_layer_13": 0.100830078125, "loss_aux_layer_14": 0.1112060546875, "loss_aux_layer_15": 0.120849609375, "loss_aux_layer_16": 0.13134765625, "loss_aux_layer_17": 0.138671875, "loss_aux_layer_18": 0.146484375, "loss_aux_layer_19": 0.148193359375, "loss_aux_layer_2": 0.06500244140625, "loss_aux_layer_20": 0.155029296875, "loss_aux_layer_21": 0.16162109375, "loss_aux_layer_22": 0.182373046875, "loss_aux_layer_23": 0.221435546875, "loss_aux_layer_3": 0.0772705078125, "loss_aux_layer_4": 0.0809326171875, "loss_aux_layer_5": 0.0831298828125, "loss_aux_layer_6": 0.086669921875, "loss_aux_layer_7": 0.08349609375, "loss_aux_layer_8": 0.0819091796875, "loss_aux_layer_9": 0.080322265625, "step": 1677, "total_loss": 0.7293733507394791 }, { "epoch": 0.3322114432785587, "grad_norm": 1.943565845489502, "learning_rate": 5e-05, "llm_loss": 0.5908292829990387, "loss": 2.7943, "loss_aux_layer_0": 0.023162841796875, "loss_aux_layer_1": 0.0511474609375, "loss_aux_layer_10": 0.078857421875, "loss_aux_layer_11": 0.084228515625, "loss_aux_layer_12": 0.0906982421875, "loss_aux_layer_13": 0.098388671875, "loss_aux_layer_14": 0.109130859375, "loss_aux_layer_15": 0.1197509765625, "loss_aux_layer_16": 0.130859375, "loss_aux_layer_17": 0.139404296875, "loss_aux_layer_18": 0.148193359375, "loss_aux_layer_19": 0.15234375, "loss_aux_layer_2": 0.064697265625, "loss_aux_layer_20": 0.159423828125, "loss_aux_layer_21": 0.16748046875, "loss_aux_layer_22": 0.189208984375, "loss_aux_layer_23": 0.230712890625, "loss_aux_layer_3": 0.0750732421875, "loss_aux_layer_4": 0.0777587890625, "loss_aux_layer_5": 0.0791015625, "loss_aux_layer_6": 0.081787109375, "loss_aux_layer_7": 0.0791015625, "loss_aux_layer_8": 0.078369140625, "loss_aux_layer_9": 0.077392578125, "step": 1678, "total_loss": 0.698580801486969 }, { "epoch": 0.3324094238764601, "grad_norm": 1.7012710571289062, "learning_rate": 5e-05, "llm_loss": 0.6399919539690018, "loss": 2.9739, "loss_aux_layer_0": 0.022430419921875, "loss_aux_layer_1": 0.0474853515625, "loss_aux_layer_10": 0.0755615234375, "loss_aux_layer_11": 0.0806884765625, "loss_aux_layer_12": 0.0870361328125, "loss_aux_layer_13": 0.0943603515625, "loss_aux_layer_14": 0.10546875, "loss_aux_layer_15": 0.1162109375, "loss_aux_layer_16": 0.1270751953125, "loss_aux_layer_17": 0.13623046875, "loss_aux_layer_18": 0.144287109375, "loss_aux_layer_19": 0.147216796875, "loss_aux_layer_2": 0.05938720703125, "loss_aux_layer_20": 0.154296875, "loss_aux_layer_21": 0.160400390625, "loss_aux_layer_22": 0.179931640625, "loss_aux_layer_23": 0.218505859375, "loss_aux_layer_3": 0.071044921875, "loss_aux_layer_4": 0.073974609375, "loss_aux_layer_5": 0.076171875, "loss_aux_layer_6": 0.078857421875, "loss_aux_layer_7": 0.075927734375, "loss_aux_layer_8": 0.0753173828125, "loss_aux_layer_9": 0.07421875, "step": 1679, "total_loss": 0.7434661090373993 }, { "epoch": 0.3326074044743615, "grad_norm": 2.177527904510498, "learning_rate": 5e-05, "llm_loss": 0.679828405380249, "loss": 3.1547, "loss_aux_layer_0": 0.02191162109375, "loss_aux_layer_1": 0.05145263671875, "loss_aux_layer_10": 0.082763671875, "loss_aux_layer_11": 0.087646484375, "loss_aux_layer_12": 0.09375, "loss_aux_layer_13": 0.100830078125, "loss_aux_layer_14": 0.1114501953125, "loss_aux_layer_15": 0.12158203125, "loss_aux_layer_16": 0.1322021484375, "loss_aux_layer_17": 0.1396484375, "loss_aux_layer_18": 0.1484375, "loss_aux_layer_19": 0.149658203125, "loss_aux_layer_2": 0.066650390625, "loss_aux_layer_20": 0.156005859375, "loss_aux_layer_21": 0.163330078125, "loss_aux_layer_22": 0.183837890625, "loss_aux_layer_23": 0.223876953125, "loss_aux_layer_3": 0.0782470703125, "loss_aux_layer_4": 0.0816650390625, "loss_aux_layer_5": 0.0833740234375, "loss_aux_layer_6": 0.0867919921875, "loss_aux_layer_7": 0.0836181640625, "loss_aux_layer_8": 0.0823974609375, "loss_aux_layer_9": 0.081298828125, "step": 1680, "total_loss": 0.7886647284030914 }, { "epoch": 0.3328053850722629, "grad_norm": 3.2508771419525146, "learning_rate": 5e-05, "llm_loss": 0.6460597366094589, "loss": 3.0179, "loss_aux_layer_0": 0.02459716796875, "loss_aux_layer_1": 0.05322265625, "loss_aux_layer_10": 0.0826416015625, "loss_aux_layer_11": 0.087890625, "loss_aux_layer_12": 0.0938720703125, "loss_aux_layer_13": 0.1007080078125, "loss_aux_layer_14": 0.1102294921875, "loss_aux_layer_15": 0.119873046875, "loss_aux_layer_16": 0.1298828125, "loss_aux_layer_17": 0.136962890625, "loss_aux_layer_18": 0.144775390625, "loss_aux_layer_19": 0.14697265625, "loss_aux_layer_2": 0.066650390625, "loss_aux_layer_20": 0.154052734375, "loss_aux_layer_21": 0.1611328125, "loss_aux_layer_22": 0.182861328125, "loss_aux_layer_23": 0.22314453125, "loss_aux_layer_3": 0.079345703125, "loss_aux_layer_4": 0.0831298828125, "loss_aux_layer_5": 0.0859375, "loss_aux_layer_6": 0.0872802734375, "loss_aux_layer_7": 0.08349609375, "loss_aux_layer_8": 0.082275390625, "loss_aux_layer_9": 0.0809326171875, "step": 1681, "total_loss": 0.75447678565979 }, { "epoch": 0.3330033656701643, "grad_norm": 1.540601372718811, "learning_rate": 5e-05, "llm_loss": 0.6534059047698975, "loss": 3.03, "loss_aux_layer_0": 0.023773193359375, "loss_aux_layer_1": 0.04779052734375, "loss_aux_layer_10": 0.077392578125, "loss_aux_layer_11": 0.0823974609375, "loss_aux_layer_12": 0.0880126953125, "loss_aux_layer_13": 0.0943603515625, "loss_aux_layer_14": 0.1043701171875, "loss_aux_layer_15": 0.1141357421875, "loss_aux_layer_16": 0.1246337890625, "loss_aux_layer_17": 0.13330078125, "loss_aux_layer_18": 0.142333984375, "loss_aux_layer_19": 0.1455078125, "loss_aux_layer_2": 0.0618896484375, "loss_aux_layer_20": 0.1533203125, "loss_aux_layer_21": 0.16064453125, "loss_aux_layer_22": 0.1826171875, "loss_aux_layer_23": 0.22216796875, "loss_aux_layer_3": 0.0728759765625, "loss_aux_layer_4": 0.076171875, "loss_aux_layer_5": 0.078125, "loss_aux_layer_6": 0.0814208984375, "loss_aux_layer_7": 0.078125, "loss_aux_layer_8": 0.0772705078125, "loss_aux_layer_9": 0.0762939453125, "step": 1682, "total_loss": 0.7575091570615768 }, { "epoch": 0.33320134626806575, "grad_norm": 1.6175477504730225, "learning_rate": 5e-05, "llm_loss": 0.5734997540712357, "loss": 2.7205, "loss_aux_layer_0": 0.023406982421875, "loss_aux_layer_1": 0.05023193359375, "loss_aux_layer_10": 0.0799560546875, "loss_aux_layer_11": 0.084716796875, "loss_aux_layer_12": 0.0902099609375, "loss_aux_layer_13": 0.0965576171875, "loss_aux_layer_14": 0.1070556640625, "loss_aux_layer_15": 0.1168212890625, "loss_aux_layer_16": 0.126708984375, "loss_aux_layer_17": 0.134521484375, "loss_aux_layer_18": 0.143798828125, "loss_aux_layer_19": 0.146728515625, "loss_aux_layer_2": 0.06591796875, "loss_aux_layer_20": 0.154052734375, "loss_aux_layer_21": 0.163330078125, "loss_aux_layer_22": 0.185302734375, "loss_aux_layer_23": 0.225830078125, "loss_aux_layer_3": 0.0771484375, "loss_aux_layer_4": 0.079833984375, "loss_aux_layer_5": 0.0819091796875, "loss_aux_layer_6": 0.084716796875, "loss_aux_layer_7": 0.0814208984375, "loss_aux_layer_8": 0.080322265625, "loss_aux_layer_9": 0.0787353515625, "step": 1683, "total_loss": 0.6801223009824753 }, { "epoch": 0.3333993268659671, "grad_norm": 1.492573857307434, "learning_rate": 5e-05, "llm_loss": 0.5844560265541077, "loss": 2.7707, "loss_aux_layer_0": 0.022674560546875, "loss_aux_layer_1": 0.05035400390625, "loss_aux_layer_10": 0.081298828125, "loss_aux_layer_11": 0.0865478515625, "loss_aux_layer_12": 0.0928955078125, "loss_aux_layer_13": 0.099365234375, "loss_aux_layer_14": 0.1103515625, "loss_aux_layer_15": 0.12060546875, "loss_aux_layer_16": 0.131103515625, "loss_aux_layer_17": 0.137939453125, "loss_aux_layer_18": 0.14697265625, "loss_aux_layer_19": 0.149658203125, "loss_aux_layer_2": 0.06536865234375, "loss_aux_layer_20": 0.15625, "loss_aux_layer_21": 0.163818359375, "loss_aux_layer_22": 0.18505859375, "loss_aux_layer_23": 0.225830078125, "loss_aux_layer_3": 0.0770263671875, "loss_aux_layer_4": 0.080810546875, "loss_aux_layer_5": 0.0826416015625, "loss_aux_layer_6": 0.0858154296875, "loss_aux_layer_7": 0.0828857421875, "loss_aux_layer_8": 0.081787109375, "loss_aux_layer_9": 0.0804443359375, "step": 1684, "total_loss": 0.692667543888092 }, { "epoch": 0.33359730746386856, "grad_norm": 1.337532639503479, "learning_rate": 5e-05, "llm_loss": 0.5942310392856598, "loss": 2.7963, "loss_aux_layer_0": 0.023101806640625, "loss_aux_layer_1": 0.04852294921875, "loss_aux_layer_10": 0.076904296875, "loss_aux_layer_11": 0.08203125, "loss_aux_layer_12": 0.088134765625, "loss_aux_layer_13": 0.0955810546875, "loss_aux_layer_14": 0.10595703125, "loss_aux_layer_15": 0.115966796875, "loss_aux_layer_16": 0.1275634765625, "loss_aux_layer_17": 0.135009765625, "loss_aux_layer_18": 0.1435546875, "loss_aux_layer_19": 0.147216796875, "loss_aux_layer_2": 0.0609130859375, "loss_aux_layer_20": 0.15478515625, "loss_aux_layer_21": 0.1630859375, "loss_aux_layer_22": 0.18505859375, "loss_aux_layer_23": 0.225341796875, "loss_aux_layer_3": 0.0721435546875, "loss_aux_layer_4": 0.0755615234375, "loss_aux_layer_5": 0.0775146484375, "loss_aux_layer_6": 0.0806884765625, "loss_aux_layer_7": 0.077880859375, "loss_aux_layer_8": 0.076904296875, "loss_aux_layer_9": 0.07568359375, "step": 1685, "total_loss": 0.6990748196840286 }, { "epoch": 0.33379528806176995, "grad_norm": 1.3165709972381592, "learning_rate": 5e-05, "llm_loss": 0.5753073841333389, "loss": 2.7339, "loss_aux_layer_0": 0.024871826171875, "loss_aux_layer_1": 0.0516357421875, "loss_aux_layer_10": 0.0804443359375, "loss_aux_layer_11": 0.08544921875, "loss_aux_layer_12": 0.09130859375, "loss_aux_layer_13": 0.0986328125, "loss_aux_layer_14": 0.1090087890625, "loss_aux_layer_15": 0.1195068359375, "loss_aux_layer_16": 0.1302490234375, "loss_aux_layer_17": 0.138427734375, "loss_aux_layer_18": 0.147216796875, "loss_aux_layer_19": 0.150634765625, "loss_aux_layer_2": 0.0650634765625, "loss_aux_layer_20": 0.1591796875, "loss_aux_layer_21": 0.16748046875, "loss_aux_layer_22": 0.189697265625, "loss_aux_layer_23": 0.231201171875, "loss_aux_layer_3": 0.0750732421875, "loss_aux_layer_4": 0.078369140625, "loss_aux_layer_5": 0.080810546875, "loss_aux_layer_6": 0.0836181640625, "loss_aux_layer_7": 0.0804443359375, "loss_aux_layer_8": 0.079345703125, "loss_aux_layer_9": 0.07861328125, "step": 1686, "total_loss": 0.6834626495838165 }, { "epoch": 0.3339932686596713, "grad_norm": 1.771622657775879, "learning_rate": 5e-05, "llm_loss": 0.5748388022184372, "loss": 2.7286, "loss_aux_layer_0": 0.025054931640625, "loss_aux_layer_1": 0.05133056640625, "loss_aux_layer_10": 0.0809326171875, "loss_aux_layer_11": 0.0859375, "loss_aux_layer_12": 0.091552734375, "loss_aux_layer_13": 0.09814453125, "loss_aux_layer_14": 0.1085205078125, "loss_aux_layer_15": 0.1192626953125, "loss_aux_layer_16": 0.129638671875, "loss_aux_layer_17": 0.1376953125, "loss_aux_layer_18": 0.1455078125, "loss_aux_layer_19": 0.1484375, "loss_aux_layer_2": 0.065673828125, "loss_aux_layer_20": 0.155029296875, "loss_aux_layer_21": 0.16162109375, "loss_aux_layer_22": 0.183837890625, "loss_aux_layer_23": 0.225830078125, "loss_aux_layer_3": 0.07568359375, "loss_aux_layer_4": 0.0787353515625, "loss_aux_layer_5": 0.0811767578125, "loss_aux_layer_6": 0.0848388671875, "loss_aux_layer_7": 0.08154296875, "loss_aux_layer_8": 0.08056640625, "loss_aux_layer_9": 0.0792236328125, "step": 1687, "total_loss": 0.6821443289518356 }, { "epoch": 0.33419124925757276, "grad_norm": 1.288301944732666, "learning_rate": 5e-05, "llm_loss": 0.6539255827665329, "loss": 3.0417, "loss_aux_layer_0": 0.02276611328125, "loss_aux_layer_1": 0.05023193359375, "loss_aux_layer_10": 0.0797119140625, "loss_aux_layer_11": 0.0850830078125, "loss_aux_layer_12": 0.0911865234375, "loss_aux_layer_13": 0.097900390625, "loss_aux_layer_14": 0.1087646484375, "loss_aux_layer_15": 0.1187744140625, "loss_aux_layer_16": 0.1292724609375, "loss_aux_layer_17": 0.13720703125, "loss_aux_layer_18": 0.145751953125, "loss_aux_layer_19": 0.1474609375, "loss_aux_layer_2": 0.06573486328125, "loss_aux_layer_20": 0.15478515625, "loss_aux_layer_21": 0.160400390625, "loss_aux_layer_22": 0.180908203125, "loss_aux_layer_23": 0.219482421875, "loss_aux_layer_3": 0.0765380859375, "loss_aux_layer_4": 0.0794677734375, "loss_aux_layer_5": 0.0810546875, "loss_aux_layer_6": 0.08447265625, "loss_aux_layer_7": 0.080810546875, "loss_aux_layer_8": 0.07958984375, "loss_aux_layer_9": 0.0782470703125, "step": 1688, "total_loss": 0.7604357302188873 }, { "epoch": 0.33438922985547415, "grad_norm": 1.1699196100234985, "learning_rate": 5e-05, "llm_loss": 0.5743250995874405, "loss": 2.7179, "loss_aux_layer_0": 0.02362060546875, "loss_aux_layer_1": 0.05029296875, "loss_aux_layer_10": 0.0772705078125, "loss_aux_layer_11": 0.082275390625, "loss_aux_layer_12": 0.08837890625, "loss_aux_layer_13": 0.0953369140625, "loss_aux_layer_14": 0.1063232421875, "loss_aux_layer_15": 0.11669921875, "loss_aux_layer_16": 0.12744140625, "loss_aux_layer_17": 0.1357421875, "loss_aux_layer_18": 0.14501953125, "loss_aux_layer_19": 0.147216796875, "loss_aux_layer_2": 0.0623779296875, "loss_aux_layer_20": 0.1552734375, "loss_aux_layer_21": 0.162353515625, "loss_aux_layer_22": 0.18310546875, "loss_aux_layer_23": 0.2236328125, "loss_aux_layer_3": 0.0733642578125, "loss_aux_layer_4": 0.0767822265625, "loss_aux_layer_5": 0.0782470703125, "loss_aux_layer_6": 0.0814208984375, "loss_aux_layer_7": 0.078369140625, "loss_aux_layer_8": 0.0771484375, "loss_aux_layer_9": 0.0758056640625, "step": 1689, "total_loss": 0.6794808954000473 }, { "epoch": 0.3345872104533756, "grad_norm": 1.48055100440979, "learning_rate": 5e-05, "llm_loss": 0.7035970985889435, "loss": 3.2321, "loss_aux_layer_0": 0.02374267578125, "loss_aux_layer_1": 0.05078125, "loss_aux_layer_10": 0.07763671875, "loss_aux_layer_11": 0.082763671875, "loss_aux_layer_12": 0.0885009765625, "loss_aux_layer_13": 0.0955810546875, "loss_aux_layer_14": 0.105712890625, "loss_aux_layer_15": 0.1158447265625, "loss_aux_layer_16": 0.1270751953125, "loss_aux_layer_17": 0.13427734375, "loss_aux_layer_18": 0.142822265625, "loss_aux_layer_19": 0.145263671875, "loss_aux_layer_2": 0.062255859375, "loss_aux_layer_20": 0.15283203125, "loss_aux_layer_21": 0.158935546875, "loss_aux_layer_22": 0.180419921875, "loss_aux_layer_23": 0.218017578125, "loss_aux_layer_3": 0.0740966796875, "loss_aux_layer_4": 0.0775146484375, "loss_aux_layer_5": 0.0787353515625, "loss_aux_layer_6": 0.0816650390625, "loss_aux_layer_7": 0.0787353515625, "loss_aux_layer_8": 0.07763671875, "loss_aux_layer_9": 0.0760498046875, "step": 1690, "total_loss": 0.808033362030983 }, { "epoch": 0.33478519105127696, "grad_norm": 1.1442620754241943, "learning_rate": 5e-05, "llm_loss": 0.5637632608413696, "loss": 2.6618, "loss_aux_layer_0": 0.022674560546875, "loss_aux_layer_1": 0.04766845703125, "loss_aux_layer_10": 0.075439453125, "loss_aux_layer_11": 0.07958984375, "loss_aux_layer_12": 0.08544921875, "loss_aux_layer_13": 0.0919189453125, "loss_aux_layer_14": 0.1019287109375, "loss_aux_layer_15": 0.1119384765625, "loss_aux_layer_16": 0.122314453125, "loss_aux_layer_17": 0.130859375, "loss_aux_layer_18": 0.139404296875, "loss_aux_layer_19": 0.142578125, "loss_aux_layer_2": 0.06072998046875, "loss_aux_layer_20": 0.150634765625, "loss_aux_layer_21": 0.1572265625, "loss_aux_layer_22": 0.177978515625, "loss_aux_layer_23": 0.21630859375, "loss_aux_layer_3": 0.0709228515625, "loss_aux_layer_4": 0.07421875, "loss_aux_layer_5": 0.076416015625, "loss_aux_layer_6": 0.0792236328125, "loss_aux_layer_7": 0.0760498046875, "loss_aux_layer_8": 0.074951171875, "loss_aux_layer_9": 0.0738525390625, "step": 1691, "total_loss": 0.6654584854841232 }, { "epoch": 0.3349831716491784, "grad_norm": 1.4147106409072876, "learning_rate": 5e-05, "llm_loss": 0.6085936576128006, "loss": 2.8529, "loss_aux_layer_0": 0.023590087890625, "loss_aux_layer_1": 0.04962158203125, "loss_aux_layer_10": 0.0767822265625, "loss_aux_layer_11": 0.0816650390625, "loss_aux_layer_12": 0.0877685546875, "loss_aux_layer_13": 0.0948486328125, "loss_aux_layer_14": 0.1060791015625, "loss_aux_layer_15": 0.1162109375, "loss_aux_layer_16": 0.12744140625, "loss_aux_layer_17": 0.135498046875, "loss_aux_layer_18": 0.144775390625, "loss_aux_layer_19": 0.148193359375, "loss_aux_layer_2": 0.0625, "loss_aux_layer_20": 0.155517578125, "loss_aux_layer_21": 0.162109375, "loss_aux_layer_22": 0.182861328125, "loss_aux_layer_23": 0.222900390625, "loss_aux_layer_3": 0.072021484375, "loss_aux_layer_4": 0.0750732421875, "loss_aux_layer_5": 0.0770263671875, "loss_aux_layer_6": 0.080322265625, "loss_aux_layer_7": 0.0770263671875, "loss_aux_layer_8": 0.0762939453125, "loss_aux_layer_9": 0.0750732421875, "step": 1692, "total_loss": 0.7132198810577393 }, { "epoch": 0.3351811522470798, "grad_norm": 1.48377525806427, "learning_rate": 5e-05, "llm_loss": 0.6517778933048248, "loss": 3.0165, "loss_aux_layer_0": 0.0245361328125, "loss_aux_layer_1": 0.05084228515625, "loss_aux_layer_10": 0.0758056640625, "loss_aux_layer_11": 0.0809326171875, "loss_aux_layer_12": 0.0865478515625, "loss_aux_layer_13": 0.09326171875, "loss_aux_layer_14": 0.1033935546875, "loss_aux_layer_15": 0.1131591796875, "loss_aux_layer_16": 0.123779296875, "loss_aux_layer_17": 0.131591796875, "loss_aux_layer_18": 0.140380859375, "loss_aux_layer_19": 0.1435546875, "loss_aux_layer_2": 0.06085205078125, "loss_aux_layer_20": 0.150390625, "loss_aux_layer_21": 0.156982421875, "loss_aux_layer_22": 0.17724609375, "loss_aux_layer_23": 0.216064453125, "loss_aux_layer_3": 0.0716552734375, "loss_aux_layer_4": 0.07470703125, "loss_aux_layer_5": 0.0760498046875, "loss_aux_layer_6": 0.078857421875, "loss_aux_layer_7": 0.076171875, "loss_aux_layer_8": 0.0753173828125, "loss_aux_layer_9": 0.0743408203125, "step": 1693, "total_loss": 0.7541241347789764 }, { "epoch": 0.33537913284498116, "grad_norm": 1.2931909561157227, "learning_rate": 5e-05, "llm_loss": 0.6052389740943909, "loss": 2.8598, "loss_aux_layer_0": 0.023284912109375, "loss_aux_layer_1": 0.054443359375, "loss_aux_layer_10": 0.0833740234375, "loss_aux_layer_11": 0.0888671875, "loss_aux_layer_12": 0.0950927734375, "loss_aux_layer_13": 0.102294921875, "loss_aux_layer_14": 0.11279296875, "loss_aux_layer_15": 0.12255859375, "loss_aux_layer_16": 0.13232421875, "loss_aux_layer_17": 0.1396484375, "loss_aux_layer_18": 0.147705078125, "loss_aux_layer_19": 0.149169921875, "loss_aux_layer_2": 0.0673828125, "loss_aux_layer_20": 0.156494140625, "loss_aux_layer_21": 0.163330078125, "loss_aux_layer_22": 0.185302734375, "loss_aux_layer_23": 0.224609375, "loss_aux_layer_3": 0.0797119140625, "loss_aux_layer_4": 0.082763671875, "loss_aux_layer_5": 0.084716796875, "loss_aux_layer_6": 0.087890625, "loss_aux_layer_7": 0.0845947265625, "loss_aux_layer_8": 0.0833740234375, "loss_aux_layer_9": 0.08203125, "step": 1694, "total_loss": 0.7149541229009628 }, { "epoch": 0.3355771134428826, "grad_norm": 1.4675253629684448, "learning_rate": 5e-05, "llm_loss": 0.670205220580101, "loss": 3.0987, "loss_aux_layer_0": 0.022857666015625, "loss_aux_layer_1": 0.0513916015625, "loss_aux_layer_10": 0.078125, "loss_aux_layer_11": 0.0831298828125, "loss_aux_layer_12": 0.08837890625, "loss_aux_layer_13": 0.0948486328125, "loss_aux_layer_14": 0.10498046875, "loss_aux_layer_15": 0.114990234375, "loss_aux_layer_16": 0.12548828125, "loss_aux_layer_17": 0.13232421875, "loss_aux_layer_18": 0.141357421875, "loss_aux_layer_19": 0.14453125, "loss_aux_layer_2": 0.06378173828125, "loss_aux_layer_20": 0.15234375, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.180908203125, "loss_aux_layer_23": 0.22021484375, "loss_aux_layer_3": 0.074462890625, "loss_aux_layer_4": 0.0777587890625, "loss_aux_layer_5": 0.0794677734375, "loss_aux_layer_6": 0.0826416015625, "loss_aux_layer_7": 0.07958984375, "loss_aux_layer_8": 0.078125, "loss_aux_layer_9": 0.0767822265625, "step": 1695, "total_loss": 0.7746659219264984 }, { "epoch": 0.335775094040784, "grad_norm": 0.9344261288642883, "learning_rate": 5e-05, "llm_loss": 0.5367997363209724, "loss": 2.5543, "loss_aux_layer_0": 0.024658203125, "loss_aux_layer_1": 0.049072265625, "loss_aux_layer_10": 0.073974609375, "loss_aux_layer_11": 0.07861328125, "loss_aux_layer_12": 0.0841064453125, "loss_aux_layer_13": 0.0908203125, "loss_aux_layer_14": 0.1016845703125, "loss_aux_layer_15": 0.112060546875, "loss_aux_layer_16": 0.1226806640625, "loss_aux_layer_17": 0.131103515625, "loss_aux_layer_18": 0.140380859375, "loss_aux_layer_19": 0.1435546875, "loss_aux_layer_2": 0.0595703125, "loss_aux_layer_20": 0.151611328125, "loss_aux_layer_21": 0.159423828125, "loss_aux_layer_22": 0.18115234375, "loss_aux_layer_23": 0.2216796875, "loss_aux_layer_3": 0.0701904296875, "loss_aux_layer_4": 0.0728759765625, "loss_aux_layer_5": 0.07470703125, "loss_aux_layer_6": 0.07763671875, "loss_aux_layer_7": 0.0748291015625, "loss_aux_layer_8": 0.0736083984375, "loss_aux_layer_9": 0.072509765625, "step": 1696, "total_loss": 0.6385723501443863 }, { "epoch": 0.3359730746386854, "grad_norm": 1.8695030212402344, "learning_rate": 5e-05, "llm_loss": 0.5936279594898224, "loss": 2.7897, "loss_aux_layer_0": 0.022369384765625, "loss_aux_layer_1": 0.04937744140625, "loss_aux_layer_10": 0.0767822265625, "loss_aux_layer_11": 0.081298828125, "loss_aux_layer_12": 0.0872802734375, "loss_aux_layer_13": 0.09375, "loss_aux_layer_14": 0.1043701171875, "loss_aux_layer_15": 0.114501953125, "loss_aux_layer_16": 0.1251220703125, "loss_aux_layer_17": 0.133056640625, "loss_aux_layer_18": 0.142333984375, "loss_aux_layer_19": 0.145263671875, "loss_aux_layer_2": 0.0626220703125, "loss_aux_layer_20": 0.153564453125, "loss_aux_layer_21": 0.16015625, "loss_aux_layer_22": 0.17919921875, "loss_aux_layer_23": 0.218017578125, "loss_aux_layer_3": 0.07421875, "loss_aux_layer_4": 0.0772705078125, "loss_aux_layer_5": 0.07861328125, "loss_aux_layer_6": 0.08154296875, "loss_aux_layer_7": 0.078369140625, "loss_aux_layer_8": 0.076904296875, "loss_aux_layer_9": 0.0755615234375, "step": 1697, "total_loss": 0.6974290460348129 }, { "epoch": 0.3361710552365868, "grad_norm": 1.3961821794509888, "learning_rate": 5e-05, "llm_loss": 0.6075388044118881, "loss": 2.8328, "loss_aux_layer_0": 0.022216796875, "loss_aux_layer_1": 0.04730224609375, "loss_aux_layer_10": 0.0740966796875, "loss_aux_layer_11": 0.078857421875, "loss_aux_layer_12": 0.08447265625, "loss_aux_layer_13": 0.091552734375, "loss_aux_layer_14": 0.1016845703125, "loss_aux_layer_15": 0.11181640625, "loss_aux_layer_16": 0.1221923828125, "loss_aux_layer_17": 0.1297607421875, "loss_aux_layer_18": 0.1376953125, "loss_aux_layer_19": 0.14111328125, "loss_aux_layer_2": 0.05853271484375, "loss_aux_layer_20": 0.149169921875, "loss_aux_layer_21": 0.15625, "loss_aux_layer_22": 0.17626953125, "loss_aux_layer_23": 0.21533203125, "loss_aux_layer_3": 0.0692138671875, "loss_aux_layer_4": 0.0728759765625, "loss_aux_layer_5": 0.0748291015625, "loss_aux_layer_6": 0.0780029296875, "loss_aux_layer_7": 0.0753173828125, "loss_aux_layer_8": 0.0740966796875, "loss_aux_layer_9": 0.0728759765625, "step": 1698, "total_loss": 0.708197683095932 }, { "epoch": 0.33636903583448824, "grad_norm": 1.5632730722427368, "learning_rate": 5e-05, "llm_loss": 0.6405972838401794, "loss": 3.01, "loss_aux_layer_0": 0.022674560546875, "loss_aux_layer_1": 0.05523681640625, "loss_aux_layer_10": 0.0848388671875, "loss_aux_layer_11": 0.0904541015625, "loss_aux_layer_12": 0.096435546875, "loss_aux_layer_13": 0.103515625, "loss_aux_layer_14": 0.1142578125, "loss_aux_layer_15": 0.1248779296875, "loss_aux_layer_16": 0.1357421875, "loss_aux_layer_17": 0.142822265625, "loss_aux_layer_18": 0.151123046875, "loss_aux_layer_19": 0.15283203125, "loss_aux_layer_2": 0.0679931640625, "loss_aux_layer_20": 0.158935546875, "loss_aux_layer_21": 0.1669921875, "loss_aux_layer_22": 0.190185546875, "loss_aux_layer_23": 0.230712890625, "loss_aux_layer_3": 0.0809326171875, "loss_aux_layer_4": 0.0843505859375, "loss_aux_layer_5": 0.0860595703125, "loss_aux_layer_6": 0.0892333984375, "loss_aux_layer_7": 0.0863037109375, "loss_aux_layer_8": 0.085205078125, "loss_aux_layer_9": 0.0838623046875, "step": 1699, "total_loss": 0.7525074630975723 }, { "epoch": 0.3365670164323896, "grad_norm": 1.6137906312942505, "learning_rate": 5e-05, "llm_loss": 0.6100230515003204, "loss": 2.8557, "loss_aux_layer_0": 0.0233154296875, "loss_aux_layer_1": 0.04827880859375, "loss_aux_layer_10": 0.07666015625, "loss_aux_layer_11": 0.08154296875, "loss_aux_layer_12": 0.0872802734375, "loss_aux_layer_13": 0.09423828125, "loss_aux_layer_14": 0.105224609375, "loss_aux_layer_15": 0.115234375, "loss_aux_layer_16": 0.1258544921875, "loss_aux_layer_17": 0.134033203125, "loss_aux_layer_18": 0.14208984375, "loss_aux_layer_19": 0.14501953125, "loss_aux_layer_2": 0.06036376953125, "loss_aux_layer_20": 0.153076171875, "loss_aux_layer_21": 0.160888671875, "loss_aux_layer_22": 0.183349609375, "loss_aux_layer_23": 0.2236328125, "loss_aux_layer_3": 0.072021484375, "loss_aux_layer_4": 0.0751953125, "loss_aux_layer_5": 0.0770263671875, "loss_aux_layer_6": 0.0804443359375, "loss_aux_layer_7": 0.077392578125, "loss_aux_layer_8": 0.0765380859375, "loss_aux_layer_9": 0.075439453125, "step": 1700, "total_loss": 0.7139257341623306 }, { "epoch": 0.33676499703029106, "grad_norm": 1.3939461708068848, "learning_rate": 5e-05, "llm_loss": 0.5429922342300415, "loss": 2.6044, "loss_aux_layer_0": 0.02264404296875, "loss_aux_layer_1": 0.0516357421875, "loss_aux_layer_10": 0.0806884765625, "loss_aux_layer_11": 0.0859375, "loss_aux_layer_12": 0.092041015625, "loss_aux_layer_13": 0.09912109375, "loss_aux_layer_14": 0.1097412109375, "loss_aux_layer_15": 0.1204833984375, "loss_aux_layer_16": 0.131591796875, "loss_aux_layer_17": 0.138427734375, "loss_aux_layer_18": 0.147705078125, "loss_aux_layer_19": 0.150146484375, "loss_aux_layer_2": 0.0640869140625, "loss_aux_layer_20": 0.157470703125, "loss_aux_layer_21": 0.16552734375, "loss_aux_layer_22": 0.18701171875, "loss_aux_layer_23": 0.22705078125, "loss_aux_layer_3": 0.0760498046875, "loss_aux_layer_4": 0.0794677734375, "loss_aux_layer_5": 0.0811767578125, "loss_aux_layer_6": 0.08447265625, "loss_aux_layer_7": 0.081787109375, "loss_aux_layer_8": 0.0804443359375, "loss_aux_layer_9": 0.0792236328125, "step": 1701, "total_loss": 0.6510950177907944 }, { "epoch": 0.33696297762819244, "grad_norm": 1.5617283582687378, "learning_rate": 5e-05, "llm_loss": 0.6325867474079132, "loss": 2.9451, "loss_aux_layer_0": 0.022247314453125, "loss_aux_layer_1": 0.04840087890625, "loss_aux_layer_10": 0.0765380859375, "loss_aux_layer_11": 0.081787109375, "loss_aux_layer_12": 0.0877685546875, "loss_aux_layer_13": 0.0943603515625, "loss_aux_layer_14": 0.10498046875, "loss_aux_layer_15": 0.1151123046875, "loss_aux_layer_16": 0.125244140625, "loss_aux_layer_17": 0.1328125, "loss_aux_layer_18": 0.14111328125, "loss_aux_layer_19": 0.144287109375, "loss_aux_layer_2": 0.06146240234375, "loss_aux_layer_20": 0.15234375, "loss_aux_layer_21": 0.159912109375, "loss_aux_layer_22": 0.181884765625, "loss_aux_layer_23": 0.2216796875, "loss_aux_layer_3": 0.0731201171875, "loss_aux_layer_4": 0.076171875, "loss_aux_layer_5": 0.07763671875, "loss_aux_layer_6": 0.080322265625, "loss_aux_layer_7": 0.07763671875, "loss_aux_layer_8": 0.0762939453125, "loss_aux_layer_9": 0.0750732421875, "step": 1702, "total_loss": 0.736271470785141 }, { "epoch": 0.3371609582260938, "grad_norm": 1.4549988508224487, "learning_rate": 5e-05, "llm_loss": 0.6476563513278961, "loss": 3.0063, "loss_aux_layer_0": 0.023712158203125, "loss_aux_layer_1": 0.04852294921875, "loss_aux_layer_10": 0.0760498046875, "loss_aux_layer_11": 0.0811767578125, "loss_aux_layer_12": 0.0870361328125, "loss_aux_layer_13": 0.093505859375, "loss_aux_layer_14": 0.1036376953125, "loss_aux_layer_15": 0.1141357421875, "loss_aux_layer_16": 0.124755859375, "loss_aux_layer_17": 0.133544921875, "loss_aux_layer_18": 0.142333984375, "loss_aux_layer_19": 0.145751953125, "loss_aux_layer_2": 0.06011962890625, "loss_aux_layer_20": 0.154052734375, "loss_aux_layer_21": 0.16259765625, "loss_aux_layer_22": 0.18603515625, "loss_aux_layer_23": 0.225830078125, "loss_aux_layer_3": 0.0716552734375, "loss_aux_layer_4": 0.0748291015625, "loss_aux_layer_5": 0.076904296875, "loss_aux_layer_6": 0.0799560546875, "loss_aux_layer_7": 0.0770263671875, "loss_aux_layer_8": 0.0760498046875, "loss_aux_layer_9": 0.07470703125, "step": 1703, "total_loss": 0.7515712380409241 }, { "epoch": 0.33735893882399526, "grad_norm": 1.0785317420959473, "learning_rate": 5e-05, "llm_loss": 0.5829618871212006, "loss": 2.7431, "loss_aux_layer_0": 0.02362060546875, "loss_aux_layer_1": 0.049560546875, "loss_aux_layer_10": 0.0758056640625, "loss_aux_layer_11": 0.0806884765625, "loss_aux_layer_12": 0.08642578125, "loss_aux_layer_13": 0.0928955078125, "loss_aux_layer_14": 0.1025390625, "loss_aux_layer_15": 0.11279296875, "loss_aux_layer_16": 0.123291015625, "loss_aux_layer_17": 0.131103515625, "loss_aux_layer_18": 0.1396484375, "loss_aux_layer_19": 0.14306640625, "loss_aux_layer_2": 0.06134033203125, "loss_aux_layer_20": 0.150634765625, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.18115234375, "loss_aux_layer_23": 0.222900390625, "loss_aux_layer_3": 0.0721435546875, "loss_aux_layer_4": 0.0748291015625, "loss_aux_layer_5": 0.076904296875, "loss_aux_layer_6": 0.0797119140625, "loss_aux_layer_7": 0.0767822265625, "loss_aux_layer_8": 0.0758056640625, "loss_aux_layer_9": 0.0745849609375, "step": 1704, "total_loss": 0.6857702285051346 }, { "epoch": 0.33755691942189664, "grad_norm": 1.633133053779602, "learning_rate": 5e-05, "llm_loss": 0.6023953706026077, "loss": 2.8114, "loss_aux_layer_0": 0.0238037109375, "loss_aux_layer_1": 0.04620361328125, "loss_aux_layer_10": 0.0723876953125, "loss_aux_layer_11": 0.07763671875, "loss_aux_layer_12": 0.0836181640625, "loss_aux_layer_13": 0.0908203125, "loss_aux_layer_14": 0.1014404296875, "loss_aux_layer_15": 0.1119384765625, "loss_aux_layer_16": 0.1224365234375, "loss_aux_layer_17": 0.130859375, "loss_aux_layer_18": 0.139892578125, "loss_aux_layer_19": 0.14306640625, "loss_aux_layer_2": 0.05767822265625, "loss_aux_layer_20": 0.150390625, "loss_aux_layer_21": 0.157470703125, "loss_aux_layer_22": 0.17822265625, "loss_aux_layer_23": 0.21826171875, "loss_aux_layer_3": 0.0682373046875, "loss_aux_layer_4": 0.071044921875, "loss_aux_layer_5": 0.0726318359375, "loss_aux_layer_6": 0.0755615234375, "loss_aux_layer_7": 0.0732421875, "loss_aux_layer_8": 0.072265625, "loss_aux_layer_9": 0.0711669921875, "step": 1705, "total_loss": 0.7028414458036423 }, { "epoch": 0.3377549000197981, "grad_norm": 1.6528918743133545, "learning_rate": 5e-05, "llm_loss": 0.6742136925458908, "loss": 3.1193, "loss_aux_layer_0": 0.02154541015625, "loss_aux_layer_1": 0.05126953125, "loss_aux_layer_10": 0.0802001953125, "loss_aux_layer_11": 0.0849609375, "loss_aux_layer_12": 0.091064453125, "loss_aux_layer_13": 0.097900390625, "loss_aux_layer_14": 0.10791015625, "loss_aux_layer_15": 0.11767578125, "loss_aux_layer_16": 0.1273193359375, "loss_aux_layer_17": 0.13427734375, "loss_aux_layer_18": 0.14306640625, "loss_aux_layer_19": 0.1455078125, "loss_aux_layer_2": 0.06365966796875, "loss_aux_layer_20": 0.151611328125, "loss_aux_layer_21": 0.158447265625, "loss_aux_layer_22": 0.177734375, "loss_aux_layer_23": 0.217041015625, "loss_aux_layer_3": 0.0760498046875, "loss_aux_layer_4": 0.079833984375, "loss_aux_layer_5": 0.08154296875, "loss_aux_layer_6": 0.084716796875, "loss_aux_layer_7": 0.08203125, "loss_aux_layer_8": 0.0806884765625, "loss_aux_layer_9": 0.0791015625, "step": 1706, "total_loss": 0.779836431145668 }, { "epoch": 0.33795288061769946, "grad_norm": 1.0857678651809692, "learning_rate": 5e-05, "llm_loss": 0.5536796748638153, "loss": 2.6141, "loss_aux_layer_0": 0.023895263671875, "loss_aux_layer_1": 0.04510498046875, "loss_aux_layer_10": 0.0721435546875, "loss_aux_layer_11": 0.076416015625, "loss_aux_layer_12": 0.08251953125, "loss_aux_layer_13": 0.08935546875, "loss_aux_layer_14": 0.099365234375, "loss_aux_layer_15": 0.1099853515625, "loss_aux_layer_16": 0.12109375, "loss_aux_layer_17": 0.129150390625, "loss_aux_layer_18": 0.13916015625, "loss_aux_layer_19": 0.1435546875, "loss_aux_layer_2": 0.056640625, "loss_aux_layer_20": 0.151611328125, "loss_aux_layer_21": 0.158935546875, "loss_aux_layer_22": 0.178955078125, "loss_aux_layer_23": 0.21875, "loss_aux_layer_3": 0.0670166015625, "loss_aux_layer_4": 0.070068359375, "loss_aux_layer_5": 0.0716552734375, "loss_aux_layer_6": 0.0751953125, "loss_aux_layer_7": 0.072509765625, "loss_aux_layer_8": 0.071533203125, "loss_aux_layer_9": 0.0706787109375, "step": 1707, "total_loss": 0.6535162180662155 }, { "epoch": 0.3381508612156009, "grad_norm": 1.318294644355774, "learning_rate": 5e-05, "llm_loss": 0.6175026968121529, "loss": 2.8933, "loss_aux_layer_0": 0.0228271484375, "loss_aux_layer_1": 0.0518798828125, "loss_aux_layer_10": 0.078857421875, "loss_aux_layer_11": 0.0838623046875, "loss_aux_layer_12": 0.089599609375, "loss_aux_layer_13": 0.0963134765625, "loss_aux_layer_14": 0.1064453125, "loss_aux_layer_15": 0.11669921875, "loss_aux_layer_16": 0.1275634765625, "loss_aux_layer_17": 0.1357421875, "loss_aux_layer_18": 0.14404296875, "loss_aux_layer_19": 0.146484375, "loss_aux_layer_2": 0.06378173828125, "loss_aux_layer_20": 0.153564453125, "loss_aux_layer_21": 0.1611328125, "loss_aux_layer_22": 0.183349609375, "loss_aux_layer_23": 0.222900390625, "loss_aux_layer_3": 0.0755615234375, "loss_aux_layer_4": 0.0784912109375, "loss_aux_layer_5": 0.0804443359375, "loss_aux_layer_6": 0.0831298828125, "loss_aux_layer_7": 0.0802001953125, "loss_aux_layer_8": 0.078857421875, "loss_aux_layer_9": 0.07763671875, "step": 1708, "total_loss": 0.7233135253190994 }, { "epoch": 0.3383488418135023, "grad_norm": 1.2817277908325195, "learning_rate": 5e-05, "llm_loss": 0.5620919913053513, "loss": 2.6813, "loss_aux_layer_0": 0.021942138671875, "loss_aux_layer_1": 0.05078125, "loss_aux_layer_10": 0.0823974609375, "loss_aux_layer_11": 0.087158203125, "loss_aux_layer_12": 0.093017578125, "loss_aux_layer_13": 0.0999755859375, "loss_aux_layer_14": 0.1102294921875, "loss_aux_layer_15": 0.119873046875, "loss_aux_layer_16": 0.13037109375, "loss_aux_layer_17": 0.138427734375, "loss_aux_layer_18": 0.1474609375, "loss_aux_layer_19": 0.149658203125, "loss_aux_layer_2": 0.0640869140625, "loss_aux_layer_20": 0.156982421875, "loss_aux_layer_21": 0.163818359375, "loss_aux_layer_22": 0.185791015625, "loss_aux_layer_23": 0.22509765625, "loss_aux_layer_3": 0.0760498046875, "loss_aux_layer_4": 0.0799560546875, "loss_aux_layer_5": 0.08251953125, "loss_aux_layer_6": 0.0860595703125, "loss_aux_layer_7": 0.083251953125, "loss_aux_layer_8": 0.08203125, "loss_aux_layer_9": 0.080810546875, "step": 1709, "total_loss": 0.6703203767538071 }, { "epoch": 0.33854682241140366, "grad_norm": 1.4698424339294434, "learning_rate": 5e-05, "llm_loss": 0.5811901986598969, "loss": 2.7466, "loss_aux_layer_0": 0.02294921875, "loss_aux_layer_1": 0.0478515625, "loss_aux_layer_10": 0.0770263671875, "loss_aux_layer_11": 0.08203125, "loss_aux_layer_12": 0.0880126953125, "loss_aux_layer_13": 0.09521484375, "loss_aux_layer_14": 0.106201171875, "loss_aux_layer_15": 0.1171875, "loss_aux_layer_16": 0.12841796875, "loss_aux_layer_17": 0.13623046875, "loss_aux_layer_18": 0.145263671875, "loss_aux_layer_19": 0.148193359375, "loss_aux_layer_2": 0.0615234375, "loss_aux_layer_20": 0.15625, "loss_aux_layer_21": 0.16455078125, "loss_aux_layer_22": 0.187744140625, "loss_aux_layer_23": 0.229248046875, "loss_aux_layer_3": 0.0728759765625, "loss_aux_layer_4": 0.0758056640625, "loss_aux_layer_5": 0.077880859375, "loss_aux_layer_6": 0.0809326171875, "loss_aux_layer_7": 0.07763671875, "loss_aux_layer_8": 0.0767822265625, "loss_aux_layer_9": 0.075439453125, "step": 1710, "total_loss": 0.6866468489170074 }, { "epoch": 0.3387448030093051, "grad_norm": 2.1754062175750732, "learning_rate": 5e-05, "llm_loss": 0.527565523982048, "loss": 2.5263, "loss_aux_layer_0": 0.0230712890625, "loss_aux_layer_1": 0.05029296875, "loss_aux_layer_10": 0.0772705078125, "loss_aux_layer_11": 0.08203125, "loss_aux_layer_12": 0.0877685546875, "loss_aux_layer_13": 0.094970703125, "loss_aux_layer_14": 0.1051025390625, "loss_aux_layer_15": 0.1151123046875, "loss_aux_layer_16": 0.1258544921875, "loss_aux_layer_17": 0.133056640625, "loss_aux_layer_18": 0.141357421875, "loss_aux_layer_19": 0.144287109375, "loss_aux_layer_2": 0.0614013671875, "loss_aux_layer_20": 0.152099609375, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.180419921875, "loss_aux_layer_23": 0.22021484375, "loss_aux_layer_3": 0.0732421875, "loss_aux_layer_4": 0.0765380859375, "loss_aux_layer_5": 0.0791015625, "loss_aux_layer_6": 0.08154296875, "loss_aux_layer_7": 0.07861328125, "loss_aux_layer_8": 0.0775146484375, "loss_aux_layer_9": 0.0762939453125, "step": 1711, "total_loss": 0.6315832436084747 }, { "epoch": 0.3389427836072065, "grad_norm": 1.7406723499298096, "learning_rate": 5e-05, "llm_loss": 0.6181260943412781, "loss": 2.897, "loss_aux_layer_0": 0.02569580078125, "loss_aux_layer_1": 0.0516357421875, "loss_aux_layer_10": 0.080078125, "loss_aux_layer_11": 0.0848388671875, "loss_aux_layer_12": 0.09033203125, "loss_aux_layer_13": 0.096923828125, "loss_aux_layer_14": 0.10693359375, "loss_aux_layer_15": 0.116943359375, "loss_aux_layer_16": 0.127685546875, "loss_aux_layer_17": 0.135498046875, "loss_aux_layer_18": 0.14404296875, "loss_aux_layer_19": 0.146484375, "loss_aux_layer_2": 0.0633544921875, "loss_aux_layer_20": 0.153564453125, "loss_aux_layer_21": 0.160400390625, "loss_aux_layer_22": 0.18115234375, "loss_aux_layer_23": 0.220947265625, "loss_aux_layer_3": 0.0758056640625, "loss_aux_layer_4": 0.0794677734375, "loss_aux_layer_5": 0.0811767578125, "loss_aux_layer_6": 0.0843505859375, "loss_aux_layer_7": 0.0814208984375, "loss_aux_layer_8": 0.0797119140625, "loss_aux_layer_9": 0.07861328125, "step": 1712, "total_loss": 0.724239706993103 }, { "epoch": 0.3391407642051079, "grad_norm": 1.3174517154693604, "learning_rate": 5e-05, "llm_loss": 0.6313579231500626, "loss": 2.9495, "loss_aux_layer_0": 0.02423095703125, "loss_aux_layer_1": 0.0506591796875, "loss_aux_layer_10": 0.080078125, "loss_aux_layer_11": 0.0850830078125, "loss_aux_layer_12": 0.0911865234375, "loss_aux_layer_13": 0.0980224609375, "loss_aux_layer_14": 0.1082763671875, "loss_aux_layer_15": 0.1182861328125, "loss_aux_layer_16": 0.1282958984375, "loss_aux_layer_17": 0.13623046875, "loss_aux_layer_18": 0.14404296875, "loss_aux_layer_19": 0.1455078125, "loss_aux_layer_2": 0.06329345703125, "loss_aux_layer_20": 0.15234375, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.180419921875, "loss_aux_layer_23": 0.218994140625, "loss_aux_layer_3": 0.0755615234375, "loss_aux_layer_4": 0.0791015625, "loss_aux_layer_5": 0.0809326171875, "loss_aux_layer_6": 0.084228515625, "loss_aux_layer_7": 0.081298828125, "loss_aux_layer_8": 0.0802001953125, "loss_aux_layer_9": 0.078857421875, "step": 1713, "total_loss": 0.7373852133750916 }, { "epoch": 0.3393387448030093, "grad_norm": 2.3734757900238037, "learning_rate": 5e-05, "llm_loss": 0.6142908930778503, "loss": 2.8719, "loss_aux_layer_0": 0.02215576171875, "loss_aux_layer_1": 0.04840087890625, "loss_aux_layer_10": 0.076171875, "loss_aux_layer_11": 0.0810546875, "loss_aux_layer_12": 0.08740234375, "loss_aux_layer_13": 0.0947265625, "loss_aux_layer_14": 0.1053466796875, "loss_aux_layer_15": 0.11572265625, "loss_aux_layer_16": 0.1260986328125, "loss_aux_layer_17": 0.13525390625, "loss_aux_layer_18": 0.144287109375, "loss_aux_layer_19": 0.146484375, "loss_aux_layer_2": 0.06072998046875, "loss_aux_layer_20": 0.153564453125, "loss_aux_layer_21": 0.158935546875, "loss_aux_layer_22": 0.178955078125, "loss_aux_layer_23": 0.21826171875, "loss_aux_layer_3": 0.0721435546875, "loss_aux_layer_4": 0.0758056640625, "loss_aux_layer_5": 0.0777587890625, "loss_aux_layer_6": 0.08056640625, "loss_aux_layer_7": 0.0772705078125, "loss_aux_layer_8": 0.075927734375, "loss_aux_layer_9": 0.0748291015625, "step": 1714, "total_loss": 0.7179759740829468 }, { "epoch": 0.33953672540091073, "grad_norm": 1.5889859199523926, "learning_rate": 5e-05, "llm_loss": 0.6700383722782135, "loss": 3.0898, "loss_aux_layer_0": 0.025054931640625, "loss_aux_layer_1": 0.0498046875, "loss_aux_layer_10": 0.0765380859375, "loss_aux_layer_11": 0.080810546875, "loss_aux_layer_12": 0.0867919921875, "loss_aux_layer_13": 0.0928955078125, "loss_aux_layer_14": 0.1031494140625, "loss_aux_layer_15": 0.1126708984375, "loss_aux_layer_16": 0.12255859375, "loss_aux_layer_17": 0.13134765625, "loss_aux_layer_18": 0.139892578125, "loss_aux_layer_19": 0.14306640625, "loss_aux_layer_2": 0.06072998046875, "loss_aux_layer_20": 0.150634765625, "loss_aux_layer_21": 0.156982421875, "loss_aux_layer_22": 0.177490234375, "loss_aux_layer_23": 0.21630859375, "loss_aux_layer_3": 0.072021484375, "loss_aux_layer_4": 0.0751953125, "loss_aux_layer_5": 0.0765380859375, "loss_aux_layer_6": 0.07958984375, "loss_aux_layer_7": 0.07666015625, "loss_aux_layer_8": 0.07568359375, "loss_aux_layer_9": 0.074951171875, "step": 1715, "total_loss": 0.7724500745534897 }, { "epoch": 0.3397347059988121, "grad_norm": 1.8777718544006348, "learning_rate": 5e-05, "llm_loss": 0.5479700416326523, "loss": 2.6069, "loss_aux_layer_0": 0.02435302734375, "loss_aux_layer_1": 0.04913330078125, "loss_aux_layer_10": 0.0765380859375, "loss_aux_layer_11": 0.081298828125, "loss_aux_layer_12": 0.0863037109375, "loss_aux_layer_13": 0.0927734375, "loss_aux_layer_14": 0.103271484375, "loss_aux_layer_15": 0.1138916015625, "loss_aux_layer_16": 0.1248779296875, "loss_aux_layer_17": 0.1331787109375, "loss_aux_layer_18": 0.142822265625, "loss_aux_layer_19": 0.146484375, "loss_aux_layer_2": 0.061279296875, "loss_aux_layer_20": 0.153564453125, "loss_aux_layer_21": 0.160400390625, "loss_aux_layer_22": 0.18212890625, "loss_aux_layer_23": 0.22216796875, "loss_aux_layer_3": 0.072509765625, "loss_aux_layer_4": 0.07568359375, "loss_aux_layer_5": 0.0775146484375, "loss_aux_layer_6": 0.080322265625, "loss_aux_layer_7": 0.0775146484375, "loss_aux_layer_8": 0.076416015625, "loss_aux_layer_9": 0.0753173828125, "step": 1716, "total_loss": 0.6517158448696136 }, { "epoch": 0.3399326865967135, "grad_norm": 1.2612653970718384, "learning_rate": 5e-05, "llm_loss": 0.6267106682062149, "loss": 2.9253, "loss_aux_layer_0": 0.02166748046875, "loss_aux_layer_1": 0.04901123046875, "loss_aux_layer_10": 0.077880859375, "loss_aux_layer_11": 0.0826416015625, "loss_aux_layer_12": 0.087890625, "loss_aux_layer_13": 0.0950927734375, "loss_aux_layer_14": 0.105224609375, "loss_aux_layer_15": 0.1151123046875, "loss_aux_layer_16": 0.1258544921875, "loss_aux_layer_17": 0.13427734375, "loss_aux_layer_18": 0.143798828125, "loss_aux_layer_19": 0.1474609375, "loss_aux_layer_2": 0.061767578125, "loss_aux_layer_20": 0.155029296875, "loss_aux_layer_21": 0.16162109375, "loss_aux_layer_22": 0.182373046875, "loss_aux_layer_23": 0.221923828125, "loss_aux_layer_3": 0.073486328125, "loss_aux_layer_4": 0.07666015625, "loss_aux_layer_5": 0.078369140625, "loss_aux_layer_6": 0.0811767578125, "loss_aux_layer_7": 0.078857421875, "loss_aux_layer_8": 0.077880859375, "loss_aux_layer_9": 0.0767822265625, "step": 1717, "total_loss": 0.7313295304775238 }, { "epoch": 0.34013066719461493, "grad_norm": 1.9302622079849243, "learning_rate": 5e-05, "llm_loss": 0.6104789599776268, "loss": 2.8623, "loss_aux_layer_0": 0.02197265625, "loss_aux_layer_1": 0.0501708984375, "loss_aux_layer_10": 0.0794677734375, "loss_aux_layer_11": 0.08447265625, "loss_aux_layer_12": 0.09033203125, "loss_aux_layer_13": 0.09716796875, "loss_aux_layer_14": 0.1068115234375, "loss_aux_layer_15": 0.1168212890625, "loss_aux_layer_16": 0.1270751953125, "loss_aux_layer_17": 0.134765625, "loss_aux_layer_18": 0.14306640625, "loss_aux_layer_19": 0.14501953125, "loss_aux_layer_2": 0.0626220703125, "loss_aux_layer_20": 0.15185546875, "loss_aux_layer_21": 0.15869140625, "loss_aux_layer_22": 0.1796875, "loss_aux_layer_23": 0.21923828125, "loss_aux_layer_3": 0.0745849609375, "loss_aux_layer_4": 0.078369140625, "loss_aux_layer_5": 0.0799560546875, "loss_aux_layer_6": 0.0831298828125, "loss_aux_layer_7": 0.0802001953125, "loss_aux_layer_8": 0.0792236328125, "loss_aux_layer_9": 0.0777587890625, "step": 1718, "total_loss": 0.7155703753232956 }, { "epoch": 0.3403286477925163, "grad_norm": 1.113331913948059, "learning_rate": 5e-05, "llm_loss": 0.5342021808028221, "loss": 2.552, "loss_aux_layer_0": 0.021728515625, "loss_aux_layer_1": 0.0484619140625, "loss_aux_layer_10": 0.0775146484375, "loss_aux_layer_11": 0.0821533203125, "loss_aux_layer_12": 0.0880126953125, "loss_aux_layer_13": 0.0946044921875, "loss_aux_layer_14": 0.105224609375, "loss_aux_layer_15": 0.1153564453125, "loss_aux_layer_16": 0.1260986328125, "loss_aux_layer_17": 0.1337890625, "loss_aux_layer_18": 0.142578125, "loss_aux_layer_19": 0.14501953125, "loss_aux_layer_2": 0.06121826171875, "loss_aux_layer_20": 0.153076171875, "loss_aux_layer_21": 0.159423828125, "loss_aux_layer_22": 0.17919921875, "loss_aux_layer_23": 0.2177734375, "loss_aux_layer_3": 0.0728759765625, "loss_aux_layer_4": 0.076171875, "loss_aux_layer_5": 0.078125, "loss_aux_layer_6": 0.0811767578125, "loss_aux_layer_7": 0.07861328125, "loss_aux_layer_8": 0.077392578125, "loss_aux_layer_9": 0.0760498046875, "step": 1719, "total_loss": 0.6380031108856201 }, { "epoch": 0.34052662839041775, "grad_norm": 1.2303316593170166, "learning_rate": 5e-05, "llm_loss": 0.6183263510465622, "loss": 2.8792, "loss_aux_layer_0": 0.023162841796875, "loss_aux_layer_1": 0.04681396484375, "loss_aux_layer_10": 0.0745849609375, "loss_aux_layer_11": 0.0789794921875, "loss_aux_layer_12": 0.0848388671875, "loss_aux_layer_13": 0.0916748046875, "loss_aux_layer_14": 0.1029052734375, "loss_aux_layer_15": 0.113037109375, "loss_aux_layer_16": 0.1231689453125, "loss_aux_layer_17": 0.132080078125, "loss_aux_layer_18": 0.140625, "loss_aux_layer_19": 0.1435546875, "loss_aux_layer_2": 0.05841064453125, "loss_aux_layer_20": 0.15087890625, "loss_aux_layer_21": 0.157958984375, "loss_aux_layer_22": 0.178955078125, "loss_aux_layer_23": 0.218505859375, "loss_aux_layer_3": 0.069580078125, "loss_aux_layer_4": 0.0723876953125, "loss_aux_layer_5": 0.07470703125, "loss_aux_layer_6": 0.0775146484375, "loss_aux_layer_7": 0.07470703125, "loss_aux_layer_8": 0.07373046875, "loss_aux_layer_9": 0.07275390625, "step": 1720, "total_loss": 0.7197988778352737 }, { "epoch": 0.34072460898831913, "grad_norm": 1.2278317213058472, "learning_rate": 5e-05, "llm_loss": 0.5739306062459946, "loss": 2.7206, "loss_aux_layer_0": 0.022735595703125, "loss_aux_layer_1": 0.04931640625, "loss_aux_layer_10": 0.0791015625, "loss_aux_layer_11": 0.0849609375, "loss_aux_layer_12": 0.0916748046875, "loss_aux_layer_13": 0.099365234375, "loss_aux_layer_14": 0.1099853515625, "loss_aux_layer_15": 0.1202392578125, "loss_aux_layer_16": 0.1307373046875, "loss_aux_layer_17": 0.138427734375, "loss_aux_layer_18": 0.1474609375, "loss_aux_layer_19": 0.149658203125, "loss_aux_layer_2": 0.06048583984375, "loss_aux_layer_20": 0.15673828125, "loss_aux_layer_21": 0.162841796875, "loss_aux_layer_22": 0.1826171875, "loss_aux_layer_23": 0.22216796875, "loss_aux_layer_3": 0.0726318359375, "loss_aux_layer_4": 0.0758056640625, "loss_aux_layer_5": 0.077880859375, "loss_aux_layer_6": 0.0809326171875, "loss_aux_layer_7": 0.0784912109375, "loss_aux_layer_8": 0.078125, "loss_aux_layer_9": 0.0772705078125, "step": 1721, "total_loss": 0.6801408529281616 }, { "epoch": 0.34092258958622057, "grad_norm": 1.261596918106079, "learning_rate": 5e-05, "llm_loss": 0.5452634319663048, "loss": 2.5889, "loss_aux_layer_0": 0.023162841796875, "loss_aux_layer_1": 0.04833984375, "loss_aux_layer_10": 0.0758056640625, "loss_aux_layer_11": 0.0811767578125, "loss_aux_layer_12": 0.0869140625, "loss_aux_layer_13": 0.093505859375, "loss_aux_layer_14": 0.1031494140625, "loss_aux_layer_15": 0.113037109375, "loss_aux_layer_16": 0.1234130859375, "loss_aux_layer_17": 0.13134765625, "loss_aux_layer_18": 0.1396484375, "loss_aux_layer_19": 0.142333984375, "loss_aux_layer_2": 0.05938720703125, "loss_aux_layer_20": 0.149658203125, "loss_aux_layer_21": 0.15771484375, "loss_aux_layer_22": 0.177490234375, "loss_aux_layer_23": 0.21630859375, "loss_aux_layer_3": 0.070556640625, "loss_aux_layer_4": 0.07373046875, "loss_aux_layer_5": 0.0755615234375, "loss_aux_layer_6": 0.0787353515625, "loss_aux_layer_7": 0.0762939453125, "loss_aux_layer_8": 0.075439453125, "loss_aux_layer_9": 0.074462890625, "step": 1722, "total_loss": 0.6472131162881851 }, { "epoch": 0.34112057018412195, "grad_norm": 1.3584188222885132, "learning_rate": 5e-05, "llm_loss": 0.6845347434282303, "loss": 3.1433, "loss_aux_layer_0": 0.02313232421875, "loss_aux_layer_1": 0.049072265625, "loss_aux_layer_10": 0.0753173828125, "loss_aux_layer_11": 0.0799560546875, "loss_aux_layer_12": 0.0859375, "loss_aux_layer_13": 0.092529296875, "loss_aux_layer_14": 0.1021728515625, "loss_aux_layer_15": 0.1116943359375, "loss_aux_layer_16": 0.1217041015625, "loss_aux_layer_17": 0.129150390625, "loss_aux_layer_18": 0.137939453125, "loss_aux_layer_19": 0.14013671875, "loss_aux_layer_2": 0.05950927734375, "loss_aux_layer_20": 0.147705078125, "loss_aux_layer_21": 0.155029296875, "loss_aux_layer_22": 0.17626953125, "loss_aux_layer_23": 0.215576171875, "loss_aux_layer_3": 0.0716552734375, "loss_aux_layer_4": 0.0745849609375, "loss_aux_layer_5": 0.076416015625, "loss_aux_layer_6": 0.0789794921875, "loss_aux_layer_7": 0.0762939453125, "loss_aux_layer_8": 0.0753173828125, "loss_aux_layer_9": 0.07421875, "step": 1723, "total_loss": 0.7858133614063263 }, { "epoch": 0.3413185507820234, "grad_norm": 1.1577491760253906, "learning_rate": 5e-05, "llm_loss": 0.6364518254995346, "loss": 2.9804, "loss_aux_layer_0": 0.02276611328125, "loss_aux_layer_1": 0.0517578125, "loss_aux_layer_10": 0.082275390625, "loss_aux_layer_11": 0.088134765625, "loss_aux_layer_12": 0.094482421875, "loss_aux_layer_13": 0.1011962890625, "loss_aux_layer_14": 0.11181640625, "loss_aux_layer_15": 0.121826171875, "loss_aux_layer_16": 0.1318359375, "loss_aux_layer_17": 0.138916015625, "loss_aux_layer_18": 0.147216796875, "loss_aux_layer_19": 0.14892578125, "loss_aux_layer_2": 0.0654296875, "loss_aux_layer_20": 0.15576171875, "loss_aux_layer_21": 0.16259765625, "loss_aux_layer_22": 0.184326171875, "loss_aux_layer_23": 0.22412109375, "loss_aux_layer_3": 0.077880859375, "loss_aux_layer_4": 0.0811767578125, "loss_aux_layer_5": 0.0831298828125, "loss_aux_layer_6": 0.0863037109375, "loss_aux_layer_7": 0.0831298828125, "loss_aux_layer_8": 0.0819091796875, "loss_aux_layer_9": 0.0806884765625, "step": 1724, "total_loss": 0.7451021671295166 }, { "epoch": 0.34151653137992477, "grad_norm": 1.6857576370239258, "learning_rate": 5e-05, "llm_loss": 0.6482366025447845, "loss": 3.0173, "loss_aux_layer_0": 0.0233154296875, "loss_aux_layer_1": 0.04998779296875, "loss_aux_layer_10": 0.079345703125, "loss_aux_layer_11": 0.084228515625, "loss_aux_layer_12": 0.0902099609375, "loss_aux_layer_13": 0.0965576171875, "loss_aux_layer_14": 0.1068115234375, "loss_aux_layer_15": 0.1171875, "loss_aux_layer_16": 0.1279296875, "loss_aux_layer_17": 0.1357421875, "loss_aux_layer_18": 0.14453125, "loss_aux_layer_19": 0.147705078125, "loss_aux_layer_2": 0.06201171875, "loss_aux_layer_20": 0.155029296875, "loss_aux_layer_21": 0.162353515625, "loss_aux_layer_22": 0.184326171875, "loss_aux_layer_23": 0.224853515625, "loss_aux_layer_3": 0.0743408203125, "loss_aux_layer_4": 0.077880859375, "loss_aux_layer_5": 0.0799560546875, "loss_aux_layer_6": 0.0833740234375, "loss_aux_layer_7": 0.08056640625, "loss_aux_layer_8": 0.0794677734375, "loss_aux_layer_9": 0.077880859375, "step": 1725, "total_loss": 0.7543266862630844 }, { "epoch": 0.34171451197782615, "grad_norm": 1.0484576225280762, "learning_rate": 5e-05, "llm_loss": 0.6367294043302536, "loss": 2.9681, "loss_aux_layer_0": 0.0218505859375, "loss_aux_layer_1": 0.049560546875, "loss_aux_layer_10": 0.0794677734375, "loss_aux_layer_11": 0.0843505859375, "loss_aux_layer_12": 0.0904541015625, "loss_aux_layer_13": 0.096923828125, "loss_aux_layer_14": 0.107177734375, "loss_aux_layer_15": 0.116943359375, "loss_aux_layer_16": 0.1270751953125, "loss_aux_layer_17": 0.134765625, "loss_aux_layer_18": 0.1435546875, "loss_aux_layer_19": 0.146484375, "loss_aux_layer_2": 0.06170654296875, "loss_aux_layer_20": 0.15380859375, "loss_aux_layer_21": 0.16064453125, "loss_aux_layer_22": 0.181884765625, "loss_aux_layer_23": 0.2216796875, "loss_aux_layer_3": 0.0736083984375, "loss_aux_layer_4": 0.0771484375, "loss_aux_layer_5": 0.0791015625, "loss_aux_layer_6": 0.082275390625, "loss_aux_layer_7": 0.07958984375, "loss_aux_layer_8": 0.0791015625, "loss_aux_layer_9": 0.0777587890625, "step": 1726, "total_loss": 0.7420147359371185 }, { "epoch": 0.3419124925757276, "grad_norm": 1.3435312509536743, "learning_rate": 5e-05, "llm_loss": 0.5794442296028137, "loss": 2.728, "loss_aux_layer_0": 0.0218505859375, "loss_aux_layer_1": 0.0458984375, "loss_aux_layer_10": 0.0758056640625, "loss_aux_layer_11": 0.0804443359375, "loss_aux_layer_12": 0.0867919921875, "loss_aux_layer_13": 0.0936279296875, "loss_aux_layer_14": 0.104248046875, "loss_aux_layer_15": 0.1142578125, "loss_aux_layer_16": 0.1251220703125, "loss_aux_layer_17": 0.13330078125, "loss_aux_layer_18": 0.142333984375, "loss_aux_layer_19": 0.1455078125, "loss_aux_layer_2": 0.05810546875, "loss_aux_layer_20": 0.153076171875, "loss_aux_layer_21": 0.16015625, "loss_aux_layer_22": 0.180419921875, "loss_aux_layer_23": 0.219970703125, "loss_aux_layer_3": 0.0701904296875, "loss_aux_layer_4": 0.0733642578125, "loss_aux_layer_5": 0.075439453125, "loss_aux_layer_6": 0.0780029296875, "loss_aux_layer_7": 0.0753173828125, "loss_aux_layer_8": 0.074462890625, "loss_aux_layer_9": 0.07373046875, "step": 1727, "total_loss": 0.6819912493228912 }, { "epoch": 0.34211047317362897, "grad_norm": 1.0383563041687012, "learning_rate": 5e-05, "llm_loss": 0.7201322913169861, "loss": 3.2989, "loss_aux_layer_0": 0.02374267578125, "loss_aux_layer_1": 0.04888916015625, "loss_aux_layer_10": 0.077392578125, "loss_aux_layer_11": 0.0826416015625, "loss_aux_layer_12": 0.0885009765625, "loss_aux_layer_13": 0.09521484375, "loss_aux_layer_14": 0.10546875, "loss_aux_layer_15": 0.1153564453125, "loss_aux_layer_16": 0.1256103515625, "loss_aux_layer_17": 0.134033203125, "loss_aux_layer_18": 0.142333984375, "loss_aux_layer_19": 0.144775390625, "loss_aux_layer_2": 0.06085205078125, "loss_aux_layer_20": 0.152099609375, "loss_aux_layer_21": 0.16015625, "loss_aux_layer_22": 0.18359375, "loss_aux_layer_23": 0.22412109375, "loss_aux_layer_3": 0.0732421875, "loss_aux_layer_4": 0.0770263671875, "loss_aux_layer_5": 0.0792236328125, "loss_aux_layer_6": 0.08251953125, "loss_aux_layer_7": 0.0794677734375, "loss_aux_layer_8": 0.0777587890625, "loss_aux_layer_9": 0.0760498046875, "step": 1728, "total_loss": 0.8247239887714386 }, { "epoch": 0.3423084537715304, "grad_norm": 0.8461883664131165, "learning_rate": 5e-05, "llm_loss": 0.560698002576828, "loss": 2.6719, "loss_aux_layer_0": 0.0242919921875, "loss_aux_layer_1": 0.0496826171875, "loss_aux_layer_10": 0.080078125, "loss_aux_layer_11": 0.0849609375, "loss_aux_layer_12": 0.0906982421875, "loss_aux_layer_13": 0.09765625, "loss_aux_layer_14": 0.10888671875, "loss_aux_layer_15": 0.1192626953125, "loss_aux_layer_16": 0.1298828125, "loss_aux_layer_17": 0.137451171875, "loss_aux_layer_18": 0.14697265625, "loss_aux_layer_19": 0.1494140625, "loss_aux_layer_2": 0.06304931640625, "loss_aux_layer_20": 0.15673828125, "loss_aux_layer_21": 0.164306640625, "loss_aux_layer_22": 0.1865234375, "loss_aux_layer_23": 0.228271484375, "loss_aux_layer_3": 0.074951171875, "loss_aux_layer_4": 0.078125, "loss_aux_layer_5": 0.0804443359375, "loss_aux_layer_6": 0.0838623046875, "loss_aux_layer_7": 0.080810546875, "loss_aux_layer_8": 0.0797119140625, "loss_aux_layer_9": 0.0784912109375, "step": 1729, "total_loss": 0.6679718792438507 }, { "epoch": 0.3425064343694318, "grad_norm": 1.4342597723007202, "learning_rate": 5e-05, "llm_loss": 0.5685007497668266, "loss": 2.6728, "loss_aux_layer_0": 0.02203369140625, "loss_aux_layer_1": 0.0450439453125, "loss_aux_layer_10": 0.071533203125, "loss_aux_layer_11": 0.0760498046875, "loss_aux_layer_12": 0.0819091796875, "loss_aux_layer_13": 0.088623046875, "loss_aux_layer_14": 0.09912109375, "loss_aux_layer_15": 0.1094970703125, "loss_aux_layer_16": 0.120849609375, "loss_aux_layer_17": 0.129638671875, "loss_aux_layer_18": 0.138916015625, "loss_aux_layer_19": 0.1435546875, "loss_aux_layer_2": 0.0556640625, "loss_aux_layer_20": 0.152099609375, "loss_aux_layer_21": 0.158935546875, "loss_aux_layer_22": 0.180419921875, "loss_aux_layer_23": 0.2197265625, "loss_aux_layer_3": 0.0672607421875, "loss_aux_layer_4": 0.0703125, "loss_aux_layer_5": 0.072021484375, "loss_aux_layer_6": 0.0748291015625, "loss_aux_layer_7": 0.0718994140625, "loss_aux_layer_8": 0.0709228515625, "loss_aux_layer_9": 0.070068359375, "step": 1730, "total_loss": 0.6681888103485107 }, { "epoch": 0.3427044149673332, "grad_norm": 1.237829566001892, "learning_rate": 5e-05, "llm_loss": 0.6303867101669312, "loss": 2.9413, "loss_aux_layer_0": 0.021942138671875, "loss_aux_layer_1": 0.04888916015625, "loss_aux_layer_10": 0.078369140625, "loss_aux_layer_11": 0.0836181640625, "loss_aux_layer_12": 0.089599609375, "loss_aux_layer_13": 0.0968017578125, "loss_aux_layer_14": 0.106689453125, "loss_aux_layer_15": 0.11669921875, "loss_aux_layer_16": 0.12744140625, "loss_aux_layer_17": 0.135009765625, "loss_aux_layer_18": 0.143798828125, "loss_aux_layer_19": 0.146728515625, "loss_aux_layer_2": 0.061767578125, "loss_aux_layer_20": 0.153564453125, "loss_aux_layer_21": 0.160400390625, "loss_aux_layer_22": 0.180908203125, "loss_aux_layer_23": 0.220458984375, "loss_aux_layer_3": 0.0736083984375, "loss_aux_layer_4": 0.0771484375, "loss_aux_layer_5": 0.0789794921875, "loss_aux_layer_6": 0.0821533203125, "loss_aux_layer_7": 0.0789794921875, "loss_aux_layer_8": 0.0782470703125, "loss_aux_layer_9": 0.076904296875, "step": 1731, "total_loss": 0.7353372871875763 }, { "epoch": 0.3429023955652346, "grad_norm": 1.2839503288269043, "learning_rate": 5e-05, "llm_loss": 0.5815446153283119, "loss": 2.7225, "loss_aux_layer_0": 0.02215576171875, "loss_aux_layer_1": 0.04510498046875, "loss_aux_layer_10": 0.0721435546875, "loss_aux_layer_11": 0.07666015625, "loss_aux_layer_12": 0.08251953125, "loss_aux_layer_13": 0.0889892578125, "loss_aux_layer_14": 0.0987548828125, "loss_aux_layer_15": 0.109130859375, "loss_aux_layer_16": 0.1195068359375, "loss_aux_layer_17": 0.1280517578125, "loss_aux_layer_18": 0.136962890625, "loss_aux_layer_19": 0.1416015625, "loss_aux_layer_2": 0.0565185546875, "loss_aux_layer_20": 0.149169921875, "loss_aux_layer_21": 0.15673828125, "loss_aux_layer_22": 0.177490234375, "loss_aux_layer_23": 0.217041015625, "loss_aux_layer_3": 0.0675048828125, "loss_aux_layer_4": 0.0701904296875, "loss_aux_layer_5": 0.0718994140625, "loss_aux_layer_6": 0.0748291015625, "loss_aux_layer_7": 0.072265625, "loss_aux_layer_8": 0.0716552734375, "loss_aux_layer_9": 0.070556640625, "step": 1732, "total_loss": 0.6806259900331497 }, { "epoch": 0.343100376163136, "grad_norm": 1.1959468126296997, "learning_rate": 5e-05, "llm_loss": 0.648695282638073, "loss": 2.9932, "loss_aux_layer_0": 0.022003173828125, "loss_aux_layer_1": 0.04541015625, "loss_aux_layer_10": 0.0738525390625, "loss_aux_layer_11": 0.0784912109375, "loss_aux_layer_12": 0.084228515625, "loss_aux_layer_13": 0.0906982421875, "loss_aux_layer_14": 0.100830078125, "loss_aux_layer_15": 0.11083984375, "loss_aux_layer_16": 0.1212158203125, "loss_aux_layer_17": 0.1295166015625, "loss_aux_layer_18": 0.137939453125, "loss_aux_layer_19": 0.140380859375, "loss_aux_layer_2": 0.05731201171875, "loss_aux_layer_20": 0.14794921875, "loss_aux_layer_21": 0.154541015625, "loss_aux_layer_22": 0.173583984375, "loss_aux_layer_23": 0.21142578125, "loss_aux_layer_3": 0.068359375, "loss_aux_layer_4": 0.0716552734375, "loss_aux_layer_5": 0.0736083984375, "loss_aux_layer_6": 0.0767822265625, "loss_aux_layer_7": 0.0738525390625, "loss_aux_layer_8": 0.0732421875, "loss_aux_layer_9": 0.0721435546875, "step": 1733, "total_loss": 0.7482911646366119 }, { "epoch": 0.3432983567610374, "grad_norm": 0.9442198872566223, "learning_rate": 5e-05, "llm_loss": 0.6244460493326187, "loss": 2.9194, "loss_aux_layer_0": 0.021636962890625, "loss_aux_layer_1": 0.049072265625, "loss_aux_layer_10": 0.07958984375, "loss_aux_layer_11": 0.0845947265625, "loss_aux_layer_12": 0.0899658203125, "loss_aux_layer_13": 0.096923828125, "loss_aux_layer_14": 0.1065673828125, "loss_aux_layer_15": 0.1162109375, "loss_aux_layer_16": 0.1260986328125, "loss_aux_layer_17": 0.134033203125, "loss_aux_layer_18": 0.142822265625, "loss_aux_layer_19": 0.1455078125, "loss_aux_layer_2": 0.062744140625, "loss_aux_layer_20": 0.153076171875, "loss_aux_layer_21": 0.16015625, "loss_aux_layer_22": 0.182373046875, "loss_aux_layer_23": 0.221923828125, "loss_aux_layer_3": 0.0748291015625, "loss_aux_layer_4": 0.07861328125, "loss_aux_layer_5": 0.080810546875, "loss_aux_layer_6": 0.0833740234375, "loss_aux_layer_7": 0.0806884765625, "loss_aux_layer_8": 0.0794677734375, "loss_aux_layer_9": 0.078125, "step": 1734, "total_loss": 0.7298615574836731 }, { "epoch": 0.3434963373589388, "grad_norm": 0.8657474517822266, "learning_rate": 5e-05, "llm_loss": 0.5699475631117821, "loss": 2.6985, "loss_aux_layer_0": 0.022491455078125, "loss_aux_layer_1": 0.04962158203125, "loss_aux_layer_10": 0.078369140625, "loss_aux_layer_11": 0.0833740234375, "loss_aux_layer_12": 0.0892333984375, "loss_aux_layer_13": 0.095947265625, "loss_aux_layer_14": 0.1055908203125, "loss_aux_layer_15": 0.115478515625, "loss_aux_layer_16": 0.12548828125, "loss_aux_layer_17": 0.133544921875, "loss_aux_layer_18": 0.142578125, "loss_aux_layer_19": 0.144775390625, "loss_aux_layer_2": 0.06195068359375, "loss_aux_layer_20": 0.152099609375, "loss_aux_layer_21": 0.16015625, "loss_aux_layer_22": 0.182373046875, "loss_aux_layer_23": 0.22216796875, "loss_aux_layer_3": 0.073974609375, "loss_aux_layer_4": 0.0771484375, "loss_aux_layer_5": 0.078857421875, "loss_aux_layer_6": 0.0819091796875, "loss_aux_layer_7": 0.078857421875, "loss_aux_layer_8": 0.0782470703125, "loss_aux_layer_9": 0.0767822265625, "step": 1735, "total_loss": 0.6746208071708679 }, { "epoch": 0.34369431795684025, "grad_norm": 1.219765067100525, "learning_rate": 5e-05, "llm_loss": 0.5616199374198914, "loss": 2.6649, "loss_aux_layer_0": 0.024749755859375, "loss_aux_layer_1": 0.04766845703125, "loss_aux_layer_10": 0.076904296875, "loss_aux_layer_11": 0.081787109375, "loss_aux_layer_12": 0.088134765625, "loss_aux_layer_13": 0.094970703125, "loss_aux_layer_14": 0.105712890625, "loss_aux_layer_15": 0.1163330078125, "loss_aux_layer_16": 0.1278076171875, "loss_aux_layer_17": 0.135986328125, "loss_aux_layer_18": 0.145263671875, "loss_aux_layer_19": 0.14892578125, "loss_aux_layer_2": 0.0594482421875, "loss_aux_layer_20": 0.15576171875, "loss_aux_layer_21": 0.163818359375, "loss_aux_layer_22": 0.185302734375, "loss_aux_layer_23": 0.227294921875, "loss_aux_layer_3": 0.0709228515625, "loss_aux_layer_4": 0.0736083984375, "loss_aux_layer_5": 0.0753173828125, "loss_aux_layer_6": 0.078369140625, "loss_aux_layer_7": 0.0760498046875, "loss_aux_layer_8": 0.0758056640625, "loss_aux_layer_9": 0.0751953125, "step": 1736, "total_loss": 0.6662220805883408 }, { "epoch": 0.3438922985547416, "grad_norm": 0.9670153260231018, "learning_rate": 5e-05, "llm_loss": 0.6084645241498947, "loss": 2.8496, "loss_aux_layer_0": 0.025238037109375, "loss_aux_layer_1": 0.04925537109375, "loss_aux_layer_10": 0.0770263671875, "loss_aux_layer_11": 0.0816650390625, "loss_aux_layer_12": 0.0877685546875, "loss_aux_layer_13": 0.0946044921875, "loss_aux_layer_14": 0.105224609375, "loss_aux_layer_15": 0.114990234375, "loss_aux_layer_16": 0.1258544921875, "loss_aux_layer_17": 0.1336669921875, "loss_aux_layer_18": 0.141845703125, "loss_aux_layer_19": 0.145751953125, "loss_aux_layer_2": 0.0609130859375, "loss_aux_layer_20": 0.1533203125, "loss_aux_layer_21": 0.15966796875, "loss_aux_layer_22": 0.1806640625, "loss_aux_layer_23": 0.22021484375, "loss_aux_layer_3": 0.0723876953125, "loss_aux_layer_4": 0.07568359375, "loss_aux_layer_5": 0.07763671875, "loss_aux_layer_6": 0.0806884765625, "loss_aux_layer_7": 0.077880859375, "loss_aux_layer_8": 0.076904296875, "loss_aux_layer_9": 0.075439453125, "step": 1737, "total_loss": 0.7123913168907166 }, { "epoch": 0.34409027915264306, "grad_norm": 1.293342113494873, "learning_rate": 5e-05, "llm_loss": 0.5828322619199753, "loss": 2.7385, "loss_aux_layer_0": 0.02227783203125, "loss_aux_layer_1": 0.04730224609375, "loss_aux_layer_10": 0.0762939453125, "loss_aux_layer_11": 0.0809326171875, "loss_aux_layer_12": 0.08642578125, "loss_aux_layer_13": 0.0933837890625, "loss_aux_layer_14": 0.1031494140625, "loss_aux_layer_15": 0.112548828125, "loss_aux_layer_16": 0.1226806640625, "loss_aux_layer_17": 0.13037109375, "loss_aux_layer_18": 0.139892578125, "loss_aux_layer_19": 0.142333984375, "loss_aux_layer_2": 0.0594482421875, "loss_aux_layer_20": 0.1494140625, "loss_aux_layer_21": 0.15625, "loss_aux_layer_22": 0.176025390625, "loss_aux_layer_23": 0.215576171875, "loss_aux_layer_3": 0.071044921875, "loss_aux_layer_4": 0.0745849609375, "loss_aux_layer_5": 0.076171875, "loss_aux_layer_6": 0.07958984375, "loss_aux_layer_7": 0.07666015625, "loss_aux_layer_8": 0.07568359375, "loss_aux_layer_9": 0.07470703125, "step": 1738, "total_loss": 0.6846342831850052 }, { "epoch": 0.34428825975054445, "grad_norm": 1.1827757358551025, "learning_rate": 5e-05, "llm_loss": 0.6096148490905762, "loss": 2.854, "loss_aux_layer_0": 0.02362060546875, "loss_aux_layer_1": 0.04864501953125, "loss_aux_layer_10": 0.077392578125, "loss_aux_layer_11": 0.0826416015625, "loss_aux_layer_12": 0.08837890625, "loss_aux_layer_13": 0.0950927734375, "loss_aux_layer_14": 0.1051025390625, "loss_aux_layer_15": 0.1148681640625, "loss_aux_layer_16": 0.125, "loss_aux_layer_17": 0.1337890625, "loss_aux_layer_18": 0.142333984375, "loss_aux_layer_19": 0.145263671875, "loss_aux_layer_2": 0.06085205078125, "loss_aux_layer_20": 0.152587890625, "loss_aux_layer_21": 0.159912109375, "loss_aux_layer_22": 0.181640625, "loss_aux_layer_23": 0.220947265625, "loss_aux_layer_3": 0.0718994140625, "loss_aux_layer_4": 0.0753173828125, "loss_aux_layer_5": 0.0772705078125, "loss_aux_layer_6": 0.08056640625, "loss_aux_layer_7": 0.0775146484375, "loss_aux_layer_8": 0.07666015625, "loss_aux_layer_9": 0.075439453125, "step": 1739, "total_loss": 0.7135048508644104 }, { "epoch": 0.3444862403484458, "grad_norm": 0.9655545949935913, "learning_rate": 5e-05, "llm_loss": 0.650790810585022, "loss": 3.0211, "loss_aux_layer_0": 0.02545166015625, "loss_aux_layer_1": 0.0491943359375, "loss_aux_layer_10": 0.077392578125, "loss_aux_layer_11": 0.0823974609375, "loss_aux_layer_12": 0.0885009765625, "loss_aux_layer_13": 0.0955810546875, "loss_aux_layer_14": 0.1058349609375, "loss_aux_layer_15": 0.1160888671875, "loss_aux_layer_16": 0.1263427734375, "loss_aux_layer_17": 0.1337890625, "loss_aux_layer_18": 0.14208984375, "loss_aux_layer_19": 0.14501953125, "loss_aux_layer_2": 0.06195068359375, "loss_aux_layer_20": 0.152099609375, "loss_aux_layer_21": 0.159423828125, "loss_aux_layer_22": 0.180908203125, "loss_aux_layer_23": 0.221435546875, "loss_aux_layer_3": 0.073486328125, "loss_aux_layer_4": 0.0767822265625, "loss_aux_layer_5": 0.0787353515625, "loss_aux_layer_6": 0.08203125, "loss_aux_layer_7": 0.0791015625, "loss_aux_layer_8": 0.0777587890625, "loss_aux_layer_9": 0.076171875, "step": 1740, "total_loss": 0.7552868127822876 }, { "epoch": 0.34468422094634726, "grad_norm": 1.2071020603179932, "learning_rate": 5e-05, "llm_loss": 0.6386882066726685, "loss": 2.9755, "loss_aux_layer_0": 0.022430419921875, "loss_aux_layer_1": 0.04937744140625, "loss_aux_layer_10": 0.0789794921875, "loss_aux_layer_11": 0.0841064453125, "loss_aux_layer_12": 0.0899658203125, "loss_aux_layer_13": 0.09716796875, "loss_aux_layer_14": 0.10791015625, "loss_aux_layer_15": 0.117919921875, "loss_aux_layer_16": 0.1282958984375, "loss_aux_layer_17": 0.13623046875, "loss_aux_layer_18": 0.14501953125, "loss_aux_layer_19": 0.146728515625, "loss_aux_layer_2": 0.061279296875, "loss_aux_layer_20": 0.154052734375, "loss_aux_layer_21": 0.15966796875, "loss_aux_layer_22": 0.1796875, "loss_aux_layer_23": 0.218017578125, "loss_aux_layer_3": 0.0733642578125, "loss_aux_layer_4": 0.0767822265625, "loss_aux_layer_5": 0.078857421875, "loss_aux_layer_6": 0.082275390625, "loss_aux_layer_7": 0.079833984375, "loss_aux_layer_8": 0.078857421875, "loss_aux_layer_9": 0.0777587890625, "step": 1741, "total_loss": 0.7438689470291138 }, { "epoch": 0.34488220154424865, "grad_norm": 1.1118438243865967, "learning_rate": 5e-05, "llm_loss": 0.5918785333633423, "loss": 2.7856, "loss_aux_layer_0": 0.022857666015625, "loss_aux_layer_1": 0.0501708984375, "loss_aux_layer_10": 0.0789794921875, "loss_aux_layer_11": 0.0841064453125, "loss_aux_layer_12": 0.0897216796875, "loss_aux_layer_13": 0.096435546875, "loss_aux_layer_14": 0.106689453125, "loss_aux_layer_15": 0.1160888671875, "loss_aux_layer_16": 0.12646484375, "loss_aux_layer_17": 0.134033203125, "loss_aux_layer_18": 0.14306640625, "loss_aux_layer_19": 0.145263671875, "loss_aux_layer_2": 0.06134033203125, "loss_aux_layer_20": 0.151611328125, "loss_aux_layer_21": 0.157958984375, "loss_aux_layer_22": 0.1787109375, "loss_aux_layer_23": 0.21826171875, "loss_aux_layer_3": 0.0733642578125, "loss_aux_layer_4": 0.0767822265625, "loss_aux_layer_5": 0.0787353515625, "loss_aux_layer_6": 0.0819091796875, "loss_aux_layer_7": 0.0792236328125, "loss_aux_layer_8": 0.078857421875, "loss_aux_layer_9": 0.07763671875, "step": 1742, "total_loss": 0.696392685174942 }, { "epoch": 0.3450801821421501, "grad_norm": 1.1264233589172363, "learning_rate": 5e-05, "llm_loss": 0.5497113168239594, "loss": 2.6121, "loss_aux_layer_0": 0.02203369140625, "loss_aux_layer_1": 0.048095703125, "loss_aux_layer_10": 0.0777587890625, "loss_aux_layer_11": 0.08251953125, "loss_aux_layer_12": 0.088134765625, "loss_aux_layer_13": 0.094970703125, "loss_aux_layer_14": 0.10498046875, "loss_aux_layer_15": 0.1148681640625, "loss_aux_layer_16": 0.1253662109375, "loss_aux_layer_17": 0.132568359375, "loss_aux_layer_18": 0.140869140625, "loss_aux_layer_19": 0.142822265625, "loss_aux_layer_2": 0.06121826171875, "loss_aux_layer_20": 0.150146484375, "loss_aux_layer_21": 0.15673828125, "loss_aux_layer_22": 0.1787109375, "loss_aux_layer_23": 0.218017578125, "loss_aux_layer_3": 0.0723876953125, "loss_aux_layer_4": 0.075927734375, "loss_aux_layer_5": 0.07763671875, "loss_aux_layer_6": 0.0811767578125, "loss_aux_layer_7": 0.078369140625, "loss_aux_layer_8": 0.077392578125, "loss_aux_layer_9": 0.076416015625, "step": 1743, "total_loss": 0.6530279517173767 }, { "epoch": 0.34527816274005146, "grad_norm": 1.5006500482559204, "learning_rate": 5e-05, "llm_loss": 0.5974815338850021, "loss": 2.7978, "loss_aux_layer_0": 0.02618408203125, "loss_aux_layer_1": 0.049072265625, "loss_aux_layer_10": 0.07470703125, "loss_aux_layer_11": 0.079345703125, "loss_aux_layer_12": 0.084716796875, "loss_aux_layer_13": 0.0909423828125, "loss_aux_layer_14": 0.1009521484375, "loss_aux_layer_15": 0.1112060546875, "loss_aux_layer_16": 0.1217041015625, "loss_aux_layer_17": 0.129150390625, "loss_aux_layer_18": 0.138427734375, "loss_aux_layer_19": 0.141845703125, "loss_aux_layer_2": 0.0611572265625, "loss_aux_layer_20": 0.149658203125, "loss_aux_layer_21": 0.158203125, "loss_aux_layer_22": 0.17919921875, "loss_aux_layer_23": 0.21875, "loss_aux_layer_3": 0.07244873046875, "loss_aux_layer_4": 0.07568359375, "loss_aux_layer_5": 0.0775146484375, "loss_aux_layer_6": 0.0802001953125, "loss_aux_layer_7": 0.076904296875, "loss_aux_layer_8": 0.0753173828125, "loss_aux_layer_9": 0.073486328125, "step": 1744, "total_loss": 0.6994417309761047 }, { "epoch": 0.3454761433379529, "grad_norm": 1.4021563529968262, "learning_rate": 5e-05, "llm_loss": 0.608873188495636, "loss": 2.835, "loss_aux_layer_0": 0.023040771484375, "loss_aux_layer_1": 0.04620361328125, "loss_aux_layer_10": 0.072021484375, "loss_aux_layer_11": 0.076904296875, "loss_aux_layer_12": 0.0828857421875, "loss_aux_layer_13": 0.0894775390625, "loss_aux_layer_14": 0.0992431640625, "loss_aux_layer_15": 0.1094970703125, "loss_aux_layer_16": 0.1202392578125, "loss_aux_layer_17": 0.128662109375, "loss_aux_layer_18": 0.13818359375, "loss_aux_layer_19": 0.14208984375, "loss_aux_layer_2": 0.05767822265625, "loss_aux_layer_20": 0.14990234375, "loss_aux_layer_21": 0.15771484375, "loss_aux_layer_22": 0.178955078125, "loss_aux_layer_23": 0.219970703125, "loss_aux_layer_3": 0.0687255859375, "loss_aux_layer_4": 0.0712890625, "loss_aux_layer_5": 0.0728759765625, "loss_aux_layer_6": 0.0758056640625, "loss_aux_layer_7": 0.0728759765625, "loss_aux_layer_8": 0.072021484375, "loss_aux_layer_9": 0.0706787109375, "step": 1745, "total_loss": 0.7087448388338089 }, { "epoch": 0.3456741239358543, "grad_norm": 1.1612141132354736, "learning_rate": 5e-05, "llm_loss": 0.561762273311615, "loss": 2.6617, "loss_aux_layer_0": 0.02215576171875, "loss_aux_layer_1": 0.04803466796875, "loss_aux_layer_10": 0.07763671875, "loss_aux_layer_11": 0.0828857421875, "loss_aux_layer_12": 0.0885009765625, "loss_aux_layer_13": 0.0958251953125, "loss_aux_layer_14": 0.1058349609375, "loss_aux_layer_15": 0.11572265625, "loss_aux_layer_16": 0.1259765625, "loss_aux_layer_17": 0.13330078125, "loss_aux_layer_18": 0.141357421875, "loss_aux_layer_19": 0.1435546875, "loss_aux_layer_2": 0.06121826171875, "loss_aux_layer_20": 0.150634765625, "loss_aux_layer_21": 0.15869140625, "loss_aux_layer_22": 0.18017578125, "loss_aux_layer_23": 0.21923828125, "loss_aux_layer_3": 0.0723876953125, "loss_aux_layer_4": 0.0758056640625, "loss_aux_layer_5": 0.0780029296875, "loss_aux_layer_6": 0.0804443359375, "loss_aux_layer_7": 0.0775146484375, "loss_aux_layer_8": 0.0770263671875, "loss_aux_layer_9": 0.075927734375, "step": 1746, "total_loss": 0.6654211282730103 }, { "epoch": 0.34587210453375566, "grad_norm": 1.1778349876403809, "learning_rate": 5e-05, "llm_loss": 0.5702899247407913, "loss": 2.6892, "loss_aux_layer_0": 0.021881103515625, "loss_aux_layer_1": 0.04803466796875, "loss_aux_layer_10": 0.0750732421875, "loss_aux_layer_11": 0.0799560546875, "loss_aux_layer_12": 0.085693359375, "loss_aux_layer_13": 0.0927734375, "loss_aux_layer_14": 0.1031494140625, "loss_aux_layer_15": 0.1134033203125, "loss_aux_layer_16": 0.123779296875, "loss_aux_layer_17": 0.132080078125, "loss_aux_layer_18": 0.140625, "loss_aux_layer_19": 0.1435546875, "loss_aux_layer_2": 0.05975341796875, "loss_aux_layer_20": 0.15087890625, "loss_aux_layer_21": 0.157958984375, "loss_aux_layer_22": 0.178466796875, "loss_aux_layer_23": 0.218017578125, "loss_aux_layer_3": 0.07061767578125, "loss_aux_layer_4": 0.07421875, "loss_aux_layer_5": 0.076171875, "loss_aux_layer_6": 0.07861328125, "loss_aux_layer_7": 0.0758056640625, "loss_aux_layer_8": 0.0745849609375, "loss_aux_layer_9": 0.073486328125, "step": 1747, "total_loss": 0.6723100394010544 }, { "epoch": 0.3460700851316571, "grad_norm": 1.3111330270767212, "learning_rate": 5e-05, "llm_loss": 0.5321481972932816, "loss": 2.5491, "loss_aux_layer_0": 0.023162841796875, "loss_aux_layer_1": 0.04998779296875, "loss_aux_layer_10": 0.0782470703125, "loss_aux_layer_11": 0.083251953125, "loss_aux_layer_12": 0.089111328125, "loss_aux_layer_13": 0.0958251953125, "loss_aux_layer_14": 0.10595703125, "loss_aux_layer_15": 0.1158447265625, "loss_aux_layer_16": 0.12646484375, "loss_aux_layer_17": 0.1337890625, "loss_aux_layer_18": 0.142333984375, "loss_aux_layer_19": 0.145263671875, "loss_aux_layer_2": 0.062744140625, "loss_aux_layer_20": 0.15283203125, "loss_aux_layer_21": 0.159912109375, "loss_aux_layer_22": 0.182861328125, "loss_aux_layer_23": 0.222900390625, "loss_aux_layer_3": 0.0751953125, "loss_aux_layer_4": 0.078369140625, "loss_aux_layer_5": 0.0799560546875, "loss_aux_layer_6": 0.0828857421875, "loss_aux_layer_7": 0.079833984375, "loss_aux_layer_8": 0.0787353515625, "loss_aux_layer_9": 0.0772705078125, "step": 1748, "total_loss": 0.6372683644294739 }, { "epoch": 0.3462680657295585, "grad_norm": 1.3268624544143677, "learning_rate": 5e-05, "llm_loss": 0.5941288471221924, "loss": 2.8035, "loss_aux_layer_0": 0.022705078125, "loss_aux_layer_1": 0.050048828125, "loss_aux_layer_10": 0.080078125, "loss_aux_layer_11": 0.0853271484375, "loss_aux_layer_12": 0.091552734375, "loss_aux_layer_13": 0.098388671875, "loss_aux_layer_14": 0.1090087890625, "loss_aux_layer_15": 0.119384765625, "loss_aux_layer_16": 0.1300048828125, "loss_aux_layer_17": 0.137451171875, "loss_aux_layer_18": 0.1455078125, "loss_aux_layer_19": 0.148193359375, "loss_aux_layer_2": 0.0633544921875, "loss_aux_layer_20": 0.155517578125, "loss_aux_layer_21": 0.161865234375, "loss_aux_layer_22": 0.183349609375, "loss_aux_layer_23": 0.22265625, "loss_aux_layer_3": 0.0750732421875, "loss_aux_layer_4": 0.078857421875, "loss_aux_layer_5": 0.0810546875, "loss_aux_layer_6": 0.083984375, "loss_aux_layer_7": 0.0809326171875, "loss_aux_layer_8": 0.07958984375, "loss_aux_layer_9": 0.0782470703125, "step": 1749, "total_loss": 0.7008737921714783 }, { "epoch": 0.3464660463274599, "grad_norm": 1.7521843910217285, "learning_rate": 5e-05, "llm_loss": 0.5507167279720306, "loss": 2.5994, "loss_aux_layer_0": 0.023223876953125, "loss_aux_layer_1": 0.04559326171875, "loss_aux_layer_10": 0.0712890625, "loss_aux_layer_11": 0.0760498046875, "loss_aux_layer_12": 0.08203125, "loss_aux_layer_13": 0.0889892578125, "loss_aux_layer_14": 0.0999755859375, "loss_aux_layer_15": 0.1104736328125, "loss_aux_layer_16": 0.12109375, "loss_aux_layer_17": 0.129638671875, "loss_aux_layer_18": 0.1376953125, "loss_aux_layer_19": 0.140380859375, "loss_aux_layer_2": 0.05706787109375, "loss_aux_layer_20": 0.147705078125, "loss_aux_layer_21": 0.156005859375, "loss_aux_layer_22": 0.1767578125, "loss_aux_layer_23": 0.21630859375, "loss_aux_layer_3": 0.067626953125, "loss_aux_layer_4": 0.0706787109375, "loss_aux_layer_5": 0.0721435546875, "loss_aux_layer_6": 0.0751953125, "loss_aux_layer_7": 0.0723876953125, "loss_aux_layer_8": 0.0711669921875, "loss_aux_layer_9": 0.0699462890625, "step": 1750, "total_loss": 0.6498478353023529 }, { "epoch": 0.3466640269253613, "grad_norm": 2.774320602416992, "learning_rate": 5e-05, "llm_loss": 0.631485290825367, "loss": 2.9424, "loss_aux_layer_0": 0.023406982421875, "loss_aux_layer_1": 0.050048828125, "loss_aux_layer_10": 0.077880859375, "loss_aux_layer_11": 0.0823974609375, "loss_aux_layer_12": 0.0887451171875, "loss_aux_layer_13": 0.09521484375, "loss_aux_layer_14": 0.1048583984375, "loss_aux_layer_15": 0.1143798828125, "loss_aux_layer_16": 0.12451171875, "loss_aux_layer_17": 0.133056640625, "loss_aux_layer_18": 0.141845703125, "loss_aux_layer_19": 0.144775390625, "loss_aux_layer_2": 0.0618896484375, "loss_aux_layer_20": 0.15234375, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.1796875, "loss_aux_layer_23": 0.2177734375, "loss_aux_layer_3": 0.073974609375, "loss_aux_layer_4": 0.07763671875, "loss_aux_layer_5": 0.0789794921875, "loss_aux_layer_6": 0.0821533203125, "loss_aux_layer_7": 0.0791015625, "loss_aux_layer_8": 0.0780029296875, "loss_aux_layer_9": 0.0765380859375, "step": 1751, "total_loss": 0.7355947345495224 }, { "epoch": 0.34686200752326274, "grad_norm": 2.220355272293091, "learning_rate": 5e-05, "llm_loss": 0.5606419295072556, "loss": 2.6686, "loss_aux_layer_0": 0.02374267578125, "loss_aux_layer_1": 0.0499267578125, "loss_aux_layer_10": 0.0789794921875, "loss_aux_layer_11": 0.084228515625, "loss_aux_layer_12": 0.09033203125, "loss_aux_layer_13": 0.097900390625, "loss_aux_layer_14": 0.1083984375, "loss_aux_layer_15": 0.1187744140625, "loss_aux_layer_16": 0.129150390625, "loss_aux_layer_17": 0.137939453125, "loss_aux_layer_18": 0.147216796875, "loss_aux_layer_19": 0.14990234375, "loss_aux_layer_2": 0.06304931640625, "loss_aux_layer_20": 0.156494140625, "loss_aux_layer_21": 0.16357421875, "loss_aux_layer_22": 0.184326171875, "loss_aux_layer_23": 0.224609375, "loss_aux_layer_3": 0.073974609375, "loss_aux_layer_4": 0.077392578125, "loss_aux_layer_5": 0.07958984375, "loss_aux_layer_6": 0.0821533203125, "loss_aux_layer_7": 0.0792236328125, "loss_aux_layer_8": 0.0782470703125, "loss_aux_layer_9": 0.077392578125, "step": 1752, "total_loss": 0.6671568602323532 }, { "epoch": 0.3470599881211641, "grad_norm": 1.6784507036209106, "learning_rate": 5e-05, "llm_loss": 0.6155462861061096, "loss": 2.8848, "loss_aux_layer_0": 0.0220947265625, "loss_aux_layer_1": 0.0496826171875, "loss_aux_layer_10": 0.0794677734375, "loss_aux_layer_11": 0.08447265625, "loss_aux_layer_12": 0.0902099609375, "loss_aux_layer_13": 0.096923828125, "loss_aux_layer_14": 0.1068115234375, "loss_aux_layer_15": 0.1170654296875, "loss_aux_layer_16": 0.1273193359375, "loss_aux_layer_17": 0.13525390625, "loss_aux_layer_18": 0.14306640625, "loss_aux_layer_19": 0.14501953125, "loss_aux_layer_2": 0.062255859375, "loss_aux_layer_20": 0.152587890625, "loss_aux_layer_21": 0.16015625, "loss_aux_layer_22": 0.18310546875, "loss_aux_layer_23": 0.222900390625, "loss_aux_layer_3": 0.074951171875, "loss_aux_layer_4": 0.07861328125, "loss_aux_layer_5": 0.08056640625, "loss_aux_layer_6": 0.083984375, "loss_aux_layer_7": 0.0810546875, "loss_aux_layer_8": 0.0797119140625, "loss_aux_layer_9": 0.0784912109375, "step": 1753, "total_loss": 0.7211929112672806 }, { "epoch": 0.34725796871906556, "grad_norm": 1.4622578620910645, "learning_rate": 5e-05, "llm_loss": 0.632148340344429, "loss": 2.9572, "loss_aux_layer_0": 0.023956298828125, "loss_aux_layer_1": 0.05133056640625, "loss_aux_layer_10": 0.0799560546875, "loss_aux_layer_11": 0.0850830078125, "loss_aux_layer_12": 0.09130859375, "loss_aux_layer_13": 0.0985107421875, "loss_aux_layer_14": 0.108642578125, "loss_aux_layer_15": 0.1187744140625, "loss_aux_layer_16": 0.1292724609375, "loss_aux_layer_17": 0.136962890625, "loss_aux_layer_18": 0.1455078125, "loss_aux_layer_19": 0.1484375, "loss_aux_layer_2": 0.06427001953125, "loss_aux_layer_20": 0.155517578125, "loss_aux_layer_21": 0.1630859375, "loss_aux_layer_22": 0.18408203125, "loss_aux_layer_23": 0.222900390625, "loss_aux_layer_3": 0.07666015625, "loss_aux_layer_4": 0.0799560546875, "loss_aux_layer_5": 0.08154296875, "loss_aux_layer_6": 0.084716796875, "loss_aux_layer_7": 0.0819091796875, "loss_aux_layer_8": 0.0804443359375, "loss_aux_layer_9": 0.078857421875, "step": 1754, "total_loss": 0.7393060177564621 }, { "epoch": 0.34745594931696694, "grad_norm": 1.6077446937561035, "learning_rate": 5e-05, "llm_loss": 0.5667828917503357, "loss": 2.6611, "loss_aux_layer_0": 0.0234375, "loss_aux_layer_1": 0.04376220703125, "loss_aux_layer_10": 0.07080078125, "loss_aux_layer_11": 0.075439453125, "loss_aux_layer_12": 0.0811767578125, "loss_aux_layer_13": 0.087890625, "loss_aux_layer_14": 0.097900390625, "loss_aux_layer_15": 0.1083984375, "loss_aux_layer_16": 0.1195068359375, "loss_aux_layer_17": 0.1275634765625, "loss_aux_layer_18": 0.137451171875, "loss_aux_layer_19": 0.141357421875, "loss_aux_layer_2": 0.05499267578125, "loss_aux_layer_20": 0.14990234375, "loss_aux_layer_21": 0.157958984375, "loss_aux_layer_22": 0.179443359375, "loss_aux_layer_23": 0.2197265625, "loss_aux_layer_3": 0.065673828125, "loss_aux_layer_4": 0.068359375, "loss_aux_layer_5": 0.0704345703125, "loss_aux_layer_6": 0.0733642578125, "loss_aux_layer_7": 0.070556640625, "loss_aux_layer_8": 0.0699462890625, "loss_aux_layer_9": 0.0689697265625, "step": 1755, "total_loss": 0.6652858555316925 }, { "epoch": 0.3476539299148683, "grad_norm": 1.2179838418960571, "learning_rate": 5e-05, "llm_loss": 0.6345520466566086, "loss": 2.9697, "loss_aux_layer_0": 0.023529052734375, "loss_aux_layer_1": 0.05224609375, "loss_aux_layer_10": 0.0811767578125, "loss_aux_layer_11": 0.086181640625, "loss_aux_layer_12": 0.0921630859375, "loss_aux_layer_13": 0.09912109375, "loss_aux_layer_14": 0.1099853515625, "loss_aux_layer_15": 0.1204833984375, "loss_aux_layer_16": 0.13134765625, "loss_aux_layer_17": 0.138916015625, "loss_aux_layer_18": 0.147216796875, "loss_aux_layer_19": 0.148681640625, "loss_aux_layer_2": 0.0640869140625, "loss_aux_layer_20": 0.156005859375, "loss_aux_layer_21": 0.16357421875, "loss_aux_layer_22": 0.18603515625, "loss_aux_layer_23": 0.22705078125, "loss_aux_layer_3": 0.0758056640625, "loss_aux_layer_4": 0.0792236328125, "loss_aux_layer_5": 0.0809326171875, "loss_aux_layer_6": 0.0841064453125, "loss_aux_layer_7": 0.0811767578125, "loss_aux_layer_8": 0.080322265625, "loss_aux_layer_9": 0.0792236328125, "step": 1756, "total_loss": 0.7424227893352509 }, { "epoch": 0.34785191051276976, "grad_norm": 1.6299169063568115, "learning_rate": 5e-05, "llm_loss": 0.5998004674911499, "loss": 2.8138, "loss_aux_layer_0": 0.02252197265625, "loss_aux_layer_1": 0.05059814453125, "loss_aux_layer_10": 0.0782470703125, "loss_aux_layer_11": 0.0830078125, "loss_aux_layer_12": 0.088623046875, "loss_aux_layer_13": 0.09521484375, "loss_aux_layer_14": 0.1048583984375, "loss_aux_layer_15": 0.114501953125, "loss_aux_layer_16": 0.12451171875, "loss_aux_layer_17": 0.1317138671875, "loss_aux_layer_18": 0.140625, "loss_aux_layer_19": 0.142333984375, "loss_aux_layer_2": 0.06341552734375, "loss_aux_layer_20": 0.148681640625, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.17578125, "loss_aux_layer_23": 0.21484375, "loss_aux_layer_3": 0.0750732421875, "loss_aux_layer_4": 0.0784912109375, "loss_aux_layer_5": 0.080078125, "loss_aux_layer_6": 0.0830078125, "loss_aux_layer_7": 0.080078125, "loss_aux_layer_8": 0.07861328125, "loss_aux_layer_9": 0.076904296875, "step": 1757, "total_loss": 0.7034551948308945 }, { "epoch": 0.34804989111067114, "grad_norm": 1.530730962753296, "learning_rate": 5e-05, "llm_loss": 0.6074034869670868, "loss": 2.8395, "loss_aux_layer_0": 0.022796630859375, "loss_aux_layer_1": 0.04925537109375, "loss_aux_layer_10": 0.0751953125, "loss_aux_layer_11": 0.0797119140625, "loss_aux_layer_12": 0.0853271484375, "loss_aux_layer_13": 0.092041015625, "loss_aux_layer_14": 0.1024169921875, "loss_aux_layer_15": 0.112548828125, "loss_aux_layer_16": 0.123046875, "loss_aux_layer_17": 0.131103515625, "loss_aux_layer_18": 0.139892578125, "loss_aux_layer_19": 0.143310546875, "loss_aux_layer_2": 0.061279296875, "loss_aux_layer_20": 0.15087890625, "loss_aux_layer_21": 0.158447265625, "loss_aux_layer_22": 0.1796875, "loss_aux_layer_23": 0.22021484375, "loss_aux_layer_3": 0.0728759765625, "loss_aux_layer_4": 0.075439453125, "loss_aux_layer_5": 0.0772705078125, "loss_aux_layer_6": 0.080322265625, "loss_aux_layer_7": 0.07666015625, "loss_aux_layer_8": 0.075439453125, "loss_aux_layer_9": 0.073974609375, "step": 1758, "total_loss": 0.7098840773105621 }, { "epoch": 0.3482478717085726, "grad_norm": 1.2724634408950806, "learning_rate": 5e-05, "llm_loss": 0.4933026656508446, "loss": 2.4058, "loss_aux_layer_0": 0.023162841796875, "loss_aux_layer_1": 0.051025390625, "loss_aux_layer_10": 0.0819091796875, "loss_aux_layer_11": 0.0870361328125, "loss_aux_layer_12": 0.0927734375, "loss_aux_layer_13": 0.099365234375, "loss_aux_layer_14": 0.109619140625, "loss_aux_layer_15": 0.1197509765625, "loss_aux_layer_16": 0.1300048828125, "loss_aux_layer_17": 0.1376953125, "loss_aux_layer_18": 0.1455078125, "loss_aux_layer_19": 0.1484375, "loss_aux_layer_2": 0.06475830078125, "loss_aux_layer_20": 0.1552734375, "loss_aux_layer_21": 0.1630859375, "loss_aux_layer_22": 0.185302734375, "loss_aux_layer_23": 0.226318359375, "loss_aux_layer_3": 0.07763671875, "loss_aux_layer_4": 0.0810546875, "loss_aux_layer_5": 0.082763671875, "loss_aux_layer_6": 0.086181640625, "loss_aux_layer_7": 0.0831298828125, "loss_aux_layer_8": 0.0816650390625, "loss_aux_layer_9": 0.0804443359375, "step": 1759, "total_loss": 0.6014404445886612 }, { "epoch": 0.34844585230647396, "grad_norm": 1.8950855731964111, "learning_rate": 5e-05, "llm_loss": 0.5867073684930801, "loss": 2.7619, "loss_aux_layer_0": 0.022918701171875, "loss_aux_layer_1": 0.04718017578125, "loss_aux_layer_10": 0.0772705078125, "loss_aux_layer_11": 0.08203125, "loss_aux_layer_12": 0.0887451171875, "loss_aux_layer_13": 0.095458984375, "loss_aux_layer_14": 0.106201171875, "loss_aux_layer_15": 0.1162109375, "loss_aux_layer_16": 0.126220703125, "loss_aux_layer_17": 0.134033203125, "loss_aux_layer_18": 0.141845703125, "loss_aux_layer_19": 0.144775390625, "loss_aux_layer_2": 0.05999755859375, "loss_aux_layer_20": 0.151611328125, "loss_aux_layer_21": 0.159912109375, "loss_aux_layer_22": 0.1806640625, "loss_aux_layer_23": 0.22119140625, "loss_aux_layer_3": 0.0721435546875, "loss_aux_layer_4": 0.0751953125, "loss_aux_layer_5": 0.0772705078125, "loss_aux_layer_6": 0.080322265625, "loss_aux_layer_7": 0.0775146484375, "loss_aux_layer_8": 0.0762939453125, "loss_aux_layer_9": 0.075439453125, "step": 1760, "total_loss": 0.6904687136411667 }, { "epoch": 0.3486438329043754, "grad_norm": 1.6422585248947144, "learning_rate": 5e-05, "llm_loss": 0.6019863188266754, "loss": 2.8227, "loss_aux_layer_0": 0.024566650390625, "loss_aux_layer_1": 0.0479736328125, "loss_aux_layer_10": 0.07666015625, "loss_aux_layer_11": 0.08154296875, "loss_aux_layer_12": 0.087158203125, "loss_aux_layer_13": 0.09375, "loss_aux_layer_14": 0.1046142578125, "loss_aux_layer_15": 0.1146240234375, "loss_aux_layer_16": 0.125732421875, "loss_aux_layer_17": 0.134033203125, "loss_aux_layer_18": 0.143310546875, "loss_aux_layer_19": 0.146240234375, "loss_aux_layer_2": 0.05975341796875, "loss_aux_layer_20": 0.1533203125, "loss_aux_layer_21": 0.160888671875, "loss_aux_layer_22": 0.181396484375, "loss_aux_layer_23": 0.22119140625, "loss_aux_layer_3": 0.0716552734375, "loss_aux_layer_4": 0.0748291015625, "loss_aux_layer_5": 0.0770263671875, "loss_aux_layer_6": 0.0799560546875, "loss_aux_layer_7": 0.076904296875, "loss_aux_layer_8": 0.0760498046875, "loss_aux_layer_9": 0.0751953125, "step": 1761, "total_loss": 0.7056740969419479 }, { "epoch": 0.3488418135022768, "grad_norm": 1.2725167274475098, "learning_rate": 5e-05, "llm_loss": 0.5394074618816376, "loss": 2.5587, "loss_aux_layer_0": 0.023162841796875, "loss_aux_layer_1": 0.04736328125, "loss_aux_layer_10": 0.0738525390625, "loss_aux_layer_11": 0.0782470703125, "loss_aux_layer_12": 0.083984375, "loss_aux_layer_13": 0.090576171875, "loss_aux_layer_14": 0.1004638671875, "loss_aux_layer_15": 0.1103515625, "loss_aux_layer_16": 0.120361328125, "loss_aux_layer_17": 0.1279296875, "loss_aux_layer_18": 0.136962890625, "loss_aux_layer_19": 0.140380859375, "loss_aux_layer_2": 0.05963134765625, "loss_aux_layer_20": 0.1484375, "loss_aux_layer_21": 0.155029296875, "loss_aux_layer_22": 0.175537109375, "loss_aux_layer_23": 0.21337890625, "loss_aux_layer_3": 0.070556640625, "loss_aux_layer_4": 0.07373046875, "loss_aux_layer_5": 0.07568359375, "loss_aux_layer_6": 0.0784912109375, "loss_aux_layer_7": 0.0751953125, "loss_aux_layer_8": 0.07421875, "loss_aux_layer_9": 0.07275390625, "step": 1762, "total_loss": 0.6396681666374207 }, { "epoch": 0.34903979410017816, "grad_norm": 1.3951821327209473, "learning_rate": 5e-05, "llm_loss": 0.6490688920021057, "loss": 2.994, "loss_aux_layer_0": 0.02288818359375, "loss_aux_layer_1": 0.046142578125, "loss_aux_layer_10": 0.07373046875, "loss_aux_layer_11": 0.0780029296875, "loss_aux_layer_12": 0.083740234375, "loss_aux_layer_13": 0.09033203125, "loss_aux_layer_14": 0.099609375, "loss_aux_layer_15": 0.109375, "loss_aux_layer_16": 0.1192626953125, "loss_aux_layer_17": 0.127197265625, "loss_aux_layer_18": 0.135986328125, "loss_aux_layer_19": 0.139404296875, "loss_aux_layer_2": 0.0579833984375, "loss_aux_layer_20": 0.14697265625, "loss_aux_layer_21": 0.1533203125, "loss_aux_layer_22": 0.174072265625, "loss_aux_layer_23": 0.21142578125, "loss_aux_layer_3": 0.0694580078125, "loss_aux_layer_4": 0.0728759765625, "loss_aux_layer_5": 0.0750732421875, "loss_aux_layer_6": 0.078125, "loss_aux_layer_7": 0.0751953125, "loss_aux_layer_8": 0.0740966796875, "loss_aux_layer_9": 0.0726318359375, "step": 1763, "total_loss": 0.7485038638114929 }, { "epoch": 0.3492377746980796, "grad_norm": 1.3080145120620728, "learning_rate": 5e-05, "llm_loss": 0.6037180870771408, "loss": 2.8274, "loss_aux_layer_0": 0.022216796875, "loss_aux_layer_1": 0.0506591796875, "loss_aux_layer_10": 0.0775146484375, "loss_aux_layer_11": 0.0823974609375, "loss_aux_layer_12": 0.0880126953125, "loss_aux_layer_13": 0.0947265625, "loss_aux_layer_14": 0.1043701171875, "loss_aux_layer_15": 0.1143798828125, "loss_aux_layer_16": 0.1248779296875, "loss_aux_layer_17": 0.13232421875, "loss_aux_layer_18": 0.140380859375, "loss_aux_layer_19": 0.14208984375, "loss_aux_layer_2": 0.0616455078125, "loss_aux_layer_20": 0.1494140625, "loss_aux_layer_21": 0.156005859375, "loss_aux_layer_22": 0.1767578125, "loss_aux_layer_23": 0.21484375, "loss_aux_layer_3": 0.073486328125, "loss_aux_layer_4": 0.076904296875, "loss_aux_layer_5": 0.07861328125, "loss_aux_layer_6": 0.0814208984375, "loss_aux_layer_7": 0.0784912109375, "loss_aux_layer_8": 0.0775146484375, "loss_aux_layer_9": 0.076171875, "step": 1764, "total_loss": 0.7068438231945038 }, { "epoch": 0.349435755295981, "grad_norm": 1.1952475309371948, "learning_rate": 5e-05, "llm_loss": 0.6577711254358292, "loss": 3.0474, "loss_aux_layer_0": 0.022552490234375, "loss_aux_layer_1": 0.04937744140625, "loss_aux_layer_10": 0.078125, "loss_aux_layer_11": 0.0831298828125, "loss_aux_layer_12": 0.0887451171875, "loss_aux_layer_13": 0.0958251953125, "loss_aux_layer_14": 0.105712890625, "loss_aux_layer_15": 0.1158447265625, "loss_aux_layer_16": 0.125732421875, "loss_aux_layer_17": 0.13427734375, "loss_aux_layer_18": 0.14208984375, "loss_aux_layer_19": 0.14404296875, "loss_aux_layer_2": 0.06134033203125, "loss_aux_layer_20": 0.1513671875, "loss_aux_layer_21": 0.158203125, "loss_aux_layer_22": 0.179931640625, "loss_aux_layer_23": 0.21875, "loss_aux_layer_3": 0.0732421875, "loss_aux_layer_4": 0.0765380859375, "loss_aux_layer_5": 0.078857421875, "loss_aux_layer_6": 0.081787109375, "loss_aux_layer_7": 0.07861328125, "loss_aux_layer_8": 0.0775146484375, "loss_aux_layer_9": 0.076416015625, "step": 1765, "total_loss": 0.761859729886055 }, { "epoch": 0.3496337358938824, "grad_norm": 1.1497752666473389, "learning_rate": 5e-05, "llm_loss": 0.5820792317390442, "loss": 2.7382, "loss_aux_layer_0": 0.022125244140625, "loss_aux_layer_1": 0.04547119140625, "loss_aux_layer_10": 0.075439453125, "loss_aux_layer_11": 0.080078125, "loss_aux_layer_12": 0.086181640625, "loss_aux_layer_13": 0.09326171875, "loss_aux_layer_14": 0.1038818359375, "loss_aux_layer_15": 0.1142578125, "loss_aux_layer_16": 0.125244140625, "loss_aux_layer_17": 0.13330078125, "loss_aux_layer_18": 0.1416015625, "loss_aux_layer_19": 0.14501953125, "loss_aux_layer_2": 0.05902099609375, "loss_aux_layer_20": 0.152099609375, "loss_aux_layer_21": 0.15966796875, "loss_aux_layer_22": 0.179931640625, "loss_aux_layer_23": 0.22021484375, "loss_aux_layer_3": 0.0704345703125, "loss_aux_layer_4": 0.0736083984375, "loss_aux_layer_5": 0.07568359375, "loss_aux_layer_6": 0.0784912109375, "loss_aux_layer_7": 0.07568359375, "loss_aux_layer_8": 0.0748291015625, "loss_aux_layer_9": 0.073974609375, "step": 1766, "total_loss": 0.6845555007457733 }, { "epoch": 0.3498317164917838, "grad_norm": 0.9063835740089417, "learning_rate": 5e-05, "llm_loss": 0.5404137521982193, "loss": 2.5635, "loss_aux_layer_0": 0.0235595703125, "loss_aux_layer_1": 0.04803466796875, "loss_aux_layer_10": 0.07373046875, "loss_aux_layer_11": 0.07861328125, "loss_aux_layer_12": 0.083984375, "loss_aux_layer_13": 0.0908203125, "loss_aux_layer_14": 0.1007080078125, "loss_aux_layer_15": 0.1103515625, "loss_aux_layer_16": 0.120849609375, "loss_aux_layer_17": 0.128662109375, "loss_aux_layer_18": 0.1376953125, "loss_aux_layer_19": 0.14111328125, "loss_aux_layer_2": 0.05859375, "loss_aux_layer_20": 0.148193359375, "loss_aux_layer_21": 0.156005859375, "loss_aux_layer_22": 0.177490234375, "loss_aux_layer_23": 0.21728515625, "loss_aux_layer_3": 0.0694580078125, "loss_aux_layer_4": 0.072509765625, "loss_aux_layer_5": 0.07470703125, "loss_aux_layer_6": 0.077392578125, "loss_aux_layer_7": 0.0745849609375, "loss_aux_layer_8": 0.07373046875, "loss_aux_layer_9": 0.0723876953125, "step": 1767, "total_loss": 0.6408823877573013 }, { "epoch": 0.35002969708968523, "grad_norm": 1.4291415214538574, "learning_rate": 5e-05, "llm_loss": 0.643616795539856, "loss": 2.9818, "loss_aux_layer_0": 0.02215576171875, "loss_aux_layer_1": 0.04766845703125, "loss_aux_layer_10": 0.0755615234375, "loss_aux_layer_11": 0.0802001953125, "loss_aux_layer_12": 0.0860595703125, "loss_aux_layer_13": 0.09326171875, "loss_aux_layer_14": 0.1029052734375, "loss_aux_layer_15": 0.11279296875, "loss_aux_layer_16": 0.123046875, "loss_aux_layer_17": 0.1312255859375, "loss_aux_layer_18": 0.1396484375, "loss_aux_layer_19": 0.142822265625, "loss_aux_layer_2": 0.06005859375, "loss_aux_layer_20": 0.150146484375, "loss_aux_layer_21": 0.156982421875, "loss_aux_layer_22": 0.1767578125, "loss_aux_layer_23": 0.215576171875, "loss_aux_layer_3": 0.071533203125, "loss_aux_layer_4": 0.07470703125, "loss_aux_layer_5": 0.0767822265625, "loss_aux_layer_6": 0.0789794921875, "loss_aux_layer_7": 0.076171875, "loss_aux_layer_8": 0.0753173828125, "loss_aux_layer_9": 0.0738525390625, "step": 1768, "total_loss": 0.7454393357038498 }, { "epoch": 0.3502276776875866, "grad_norm": 1.799355149269104, "learning_rate": 5e-05, "llm_loss": 0.5898408591747284, "loss": 2.7613, "loss_aux_layer_0": 0.022216796875, "loss_aux_layer_1": 0.045654296875, "loss_aux_layer_10": 0.0731201171875, "loss_aux_layer_11": 0.07763671875, "loss_aux_layer_12": 0.083251953125, "loss_aux_layer_13": 0.090087890625, "loss_aux_layer_14": 0.100341796875, "loss_aux_layer_15": 0.1107177734375, "loss_aux_layer_16": 0.121337890625, "loss_aux_layer_17": 0.1298828125, "loss_aux_layer_18": 0.139404296875, "loss_aux_layer_19": 0.1435546875, "loss_aux_layer_2": 0.05718994140625, "loss_aux_layer_20": 0.15087890625, "loss_aux_layer_21": 0.15771484375, "loss_aux_layer_22": 0.178955078125, "loss_aux_layer_23": 0.219482421875, "loss_aux_layer_3": 0.069091796875, "loss_aux_layer_4": 0.0721435546875, "loss_aux_layer_5": 0.0743408203125, "loss_aux_layer_6": 0.0767822265625, "loss_aux_layer_7": 0.0738525390625, "loss_aux_layer_8": 0.0731201171875, "loss_aux_layer_9": 0.07177734375, "step": 1769, "total_loss": 0.690331757068634 }, { "epoch": 0.350425658285488, "grad_norm": 1.1991567611694336, "learning_rate": 5e-05, "llm_loss": 0.6228612437844276, "loss": 2.9052, "loss_aux_layer_0": 0.02301025390625, "loss_aux_layer_1": 0.048828125, "loss_aux_layer_10": 0.0782470703125, "loss_aux_layer_11": 0.0831298828125, "loss_aux_layer_12": 0.0888671875, "loss_aux_layer_13": 0.09521484375, "loss_aux_layer_14": 0.1048583984375, "loss_aux_layer_15": 0.1146240234375, "loss_aux_layer_16": 0.1243896484375, "loss_aux_layer_17": 0.132080078125, "loss_aux_layer_18": 0.139892578125, "loss_aux_layer_19": 0.142333984375, "loss_aux_layer_2": 0.0614013671875, "loss_aux_layer_20": 0.148681640625, "loss_aux_layer_21": 0.156005859375, "loss_aux_layer_22": 0.177978515625, "loss_aux_layer_23": 0.216796875, "loss_aux_layer_3": 0.0738525390625, "loss_aux_layer_4": 0.0775146484375, "loss_aux_layer_5": 0.079345703125, "loss_aux_layer_6": 0.08203125, "loss_aux_layer_7": 0.078857421875, "loss_aux_layer_8": 0.0782470703125, "loss_aux_layer_9": 0.076904296875, "step": 1770, "total_loss": 0.7263048887252808 }, { "epoch": 0.35062363888338943, "grad_norm": 1.4714088439941406, "learning_rate": 5e-05, "llm_loss": 0.7824575304985046, "loss": 3.5479, "loss_aux_layer_0": 0.0233154296875, "loss_aux_layer_1": 0.0489501953125, "loss_aux_layer_10": 0.07666015625, "loss_aux_layer_11": 0.0814208984375, "loss_aux_layer_12": 0.08740234375, "loss_aux_layer_13": 0.0941162109375, "loss_aux_layer_14": 0.1053466796875, "loss_aux_layer_15": 0.1162109375, "loss_aux_layer_16": 0.1275634765625, "loss_aux_layer_17": 0.135986328125, "loss_aux_layer_18": 0.14501953125, "loss_aux_layer_19": 0.14794921875, "loss_aux_layer_2": 0.06109619140625, "loss_aux_layer_20": 0.155029296875, "loss_aux_layer_21": 0.161865234375, "loss_aux_layer_22": 0.1826171875, "loss_aux_layer_23": 0.221923828125, "loss_aux_layer_3": 0.0728759765625, "loss_aux_layer_4": 0.075927734375, "loss_aux_layer_5": 0.0775146484375, "loss_aux_layer_6": 0.08056640625, "loss_aux_layer_7": 0.0772705078125, "loss_aux_layer_8": 0.076416015625, "loss_aux_layer_9": 0.0750732421875, "step": 1771, "total_loss": 0.8869694173336029 }, { "epoch": 0.3508216194812908, "grad_norm": 0.8857694864273071, "learning_rate": 5e-05, "llm_loss": 0.6011599749326706, "loss": 2.7966, "loss_aux_layer_0": 0.024078369140625, "loss_aux_layer_1": 0.0443115234375, "loss_aux_layer_10": 0.070556640625, "loss_aux_layer_11": 0.0751953125, "loss_aux_layer_12": 0.0806884765625, "loss_aux_layer_13": 0.087158203125, "loss_aux_layer_14": 0.0977783203125, "loss_aux_layer_15": 0.107666015625, "loss_aux_layer_16": 0.1181640625, "loss_aux_layer_17": 0.1265869140625, "loss_aux_layer_18": 0.135986328125, "loss_aux_layer_19": 0.13916015625, "loss_aux_layer_2": 0.0556640625, "loss_aux_layer_20": 0.1474609375, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.177001953125, "loss_aux_layer_23": 0.216552734375, "loss_aux_layer_3": 0.06658935546875, "loss_aux_layer_4": 0.0693359375, "loss_aux_layer_5": 0.0711669921875, "loss_aux_layer_6": 0.0740966796875, "loss_aux_layer_7": 0.0712890625, "loss_aux_layer_8": 0.0704345703125, "loss_aux_layer_9": 0.0693359375, "step": 1772, "total_loss": 0.6991417706012726 }, { "epoch": 0.35101960007919225, "grad_norm": 1.240613341331482, "learning_rate": 5e-05, "llm_loss": 0.5771689563989639, "loss": 2.7217, "loss_aux_layer_0": 0.02392578125, "loss_aux_layer_1": 0.04931640625, "loss_aux_layer_10": 0.076416015625, "loss_aux_layer_11": 0.0810546875, "loss_aux_layer_12": 0.0867919921875, "loss_aux_layer_13": 0.09326171875, "loss_aux_layer_14": 0.1033935546875, "loss_aux_layer_15": 0.1142578125, "loss_aux_layer_16": 0.1248779296875, "loss_aux_layer_17": 0.132568359375, "loss_aux_layer_18": 0.140625, "loss_aux_layer_19": 0.143798828125, "loss_aux_layer_2": 0.06085205078125, "loss_aux_layer_20": 0.15087890625, "loss_aux_layer_21": 0.158447265625, "loss_aux_layer_22": 0.181396484375, "loss_aux_layer_23": 0.22119140625, "loss_aux_layer_3": 0.0723876953125, "loss_aux_layer_4": 0.0755615234375, "loss_aux_layer_5": 0.0777587890625, "loss_aux_layer_6": 0.0804443359375, "loss_aux_layer_7": 0.0772705078125, "loss_aux_layer_8": 0.076416015625, "loss_aux_layer_9": 0.074951171875, "step": 1773, "total_loss": 0.6804205924272537 }, { "epoch": 0.35121758067709363, "grad_norm": 1.355346441268921, "learning_rate": 5e-05, "llm_loss": 0.6308697313070297, "loss": 2.9234, "loss_aux_layer_0": 0.021942138671875, "loss_aux_layer_1": 0.04486083984375, "loss_aux_layer_10": 0.0728759765625, "loss_aux_layer_11": 0.07763671875, "loss_aux_layer_12": 0.0836181640625, "loss_aux_layer_13": 0.090576171875, "loss_aux_layer_14": 0.101318359375, "loss_aux_layer_15": 0.11181640625, "loss_aux_layer_16": 0.122802734375, "loss_aux_layer_17": 0.1309814453125, "loss_aux_layer_18": 0.139892578125, "loss_aux_layer_19": 0.14306640625, "loss_aux_layer_2": 0.05615234375, "loss_aux_layer_20": 0.150146484375, "loss_aux_layer_21": 0.1572265625, "loss_aux_layer_22": 0.176513671875, "loss_aux_layer_23": 0.21484375, "loss_aux_layer_3": 0.06744384765625, "loss_aux_layer_4": 0.07080078125, "loss_aux_layer_5": 0.072998046875, "loss_aux_layer_6": 0.07568359375, "loss_aux_layer_7": 0.072509765625, "loss_aux_layer_8": 0.07177734375, "loss_aux_layer_9": 0.0711669921875, "step": 1774, "total_loss": 0.730855792760849 }, { "epoch": 0.35141556127499507, "grad_norm": 1.2816137075424194, "learning_rate": 5e-05, "llm_loss": 0.7195421755313873, "loss": 3.292, "loss_aux_layer_0": 0.0235595703125, "loss_aux_layer_1": 0.04974365234375, "loss_aux_layer_10": 0.0784912109375, "loss_aux_layer_11": 0.0831298828125, "loss_aux_layer_12": 0.0892333984375, "loss_aux_layer_13": 0.0953369140625, "loss_aux_layer_14": 0.10498046875, "loss_aux_layer_15": 0.114501953125, "loss_aux_layer_16": 0.1240234375, "loss_aux_layer_17": 0.13134765625, "loss_aux_layer_18": 0.13916015625, "loss_aux_layer_19": 0.140625, "loss_aux_layer_2": 0.06219482421875, "loss_aux_layer_20": 0.14794921875, "loss_aux_layer_21": 0.155029296875, "loss_aux_layer_22": 0.176025390625, "loss_aux_layer_23": 0.21533203125, "loss_aux_layer_3": 0.0745849609375, "loss_aux_layer_4": 0.0777587890625, "loss_aux_layer_5": 0.0799560546875, "loss_aux_layer_6": 0.082763671875, "loss_aux_layer_7": 0.080078125, "loss_aux_layer_8": 0.0791015625, "loss_aux_layer_9": 0.077392578125, "step": 1775, "total_loss": 0.8230056613683701 }, { "epoch": 0.35161354187289645, "grad_norm": 1.138945460319519, "learning_rate": 5e-05, "llm_loss": 0.5992398113012314, "loss": 2.8054, "loss_aux_layer_0": 0.02276611328125, "loss_aux_layer_1": 0.04693603515625, "loss_aux_layer_10": 0.0745849609375, "loss_aux_layer_11": 0.0794677734375, "loss_aux_layer_12": 0.085205078125, "loss_aux_layer_13": 0.092041015625, "loss_aux_layer_14": 0.102783203125, "loss_aux_layer_15": 0.1129150390625, "loss_aux_layer_16": 0.12353515625, "loss_aux_layer_17": 0.1319580078125, "loss_aux_layer_18": 0.1416015625, "loss_aux_layer_19": 0.144287109375, "loss_aux_layer_2": 0.058837890625, "loss_aux_layer_20": 0.151611328125, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.18017578125, "loss_aux_layer_23": 0.220703125, "loss_aux_layer_3": 0.0701904296875, "loss_aux_layer_4": 0.073486328125, "loss_aux_layer_5": 0.075439453125, "loss_aux_layer_6": 0.0784912109375, "loss_aux_layer_7": 0.0755615234375, "loss_aux_layer_8": 0.07470703125, "loss_aux_layer_9": 0.0733642578125, "step": 1776, "total_loss": 0.7013422399759293 }, { "epoch": 0.3518115224707979, "grad_norm": 1.3351335525512695, "learning_rate": 5e-05, "llm_loss": 0.5903705507516861, "loss": 2.7771, "loss_aux_layer_0": 0.022064208984375, "loss_aux_layer_1": 0.04888916015625, "loss_aux_layer_10": 0.078369140625, "loss_aux_layer_11": 0.0828857421875, "loss_aux_layer_12": 0.088623046875, "loss_aux_layer_13": 0.0950927734375, "loss_aux_layer_14": 0.105224609375, "loss_aux_layer_15": 0.1148681640625, "loss_aux_layer_16": 0.125, "loss_aux_layer_17": 0.132568359375, "loss_aux_layer_18": 0.140625, "loss_aux_layer_19": 0.143310546875, "loss_aux_layer_2": 0.0618896484375, "loss_aux_layer_20": 0.150634765625, "loss_aux_layer_21": 0.157958984375, "loss_aux_layer_22": 0.179931640625, "loss_aux_layer_23": 0.219482421875, "loss_aux_layer_3": 0.073974609375, "loss_aux_layer_4": 0.0772705078125, "loss_aux_layer_5": 0.0789794921875, "loss_aux_layer_6": 0.0819091796875, "loss_aux_layer_7": 0.078857421875, "loss_aux_layer_8": 0.0782470703125, "loss_aux_layer_9": 0.0771484375, "step": 1777, "total_loss": 0.6942725777626038 }, { "epoch": 0.35200950306869927, "grad_norm": 0.9979242086410522, "learning_rate": 5e-05, "llm_loss": 0.5112373009324074, "loss": 2.4588, "loss_aux_layer_0": 0.022216796875, "loss_aux_layer_1": 0.0477294921875, "loss_aux_layer_10": 0.07666015625, "loss_aux_layer_11": 0.081298828125, "loss_aux_layer_12": 0.0870361328125, "loss_aux_layer_13": 0.093994140625, "loss_aux_layer_14": 0.104736328125, "loss_aux_layer_15": 0.1151123046875, "loss_aux_layer_16": 0.125732421875, "loss_aux_layer_17": 0.133544921875, "loss_aux_layer_18": 0.142578125, "loss_aux_layer_19": 0.145263671875, "loss_aux_layer_2": 0.06072998046875, "loss_aux_layer_20": 0.152587890625, "loss_aux_layer_21": 0.159912109375, "loss_aux_layer_22": 0.18017578125, "loss_aux_layer_23": 0.219970703125, "loss_aux_layer_3": 0.072021484375, "loss_aux_layer_4": 0.075439453125, "loss_aux_layer_5": 0.0775146484375, "loss_aux_layer_6": 0.080322265625, "loss_aux_layer_7": 0.0772705078125, "loss_aux_layer_8": 0.0765380859375, "loss_aux_layer_9": 0.075439453125, "step": 1778, "total_loss": 0.6147096902132034 }, { "epoch": 0.35220748366660065, "grad_norm": 1.2620000839233398, "learning_rate": 5e-05, "llm_loss": 0.5877647548913956, "loss": 2.7632, "loss_aux_layer_0": 0.02288818359375, "loss_aux_layer_1": 0.0465087890625, "loss_aux_layer_10": 0.075439453125, "loss_aux_layer_11": 0.0802001953125, "loss_aux_layer_12": 0.0860595703125, "loss_aux_layer_13": 0.0931396484375, "loss_aux_layer_14": 0.1038818359375, "loss_aux_layer_15": 0.1138916015625, "loss_aux_layer_16": 0.1243896484375, "loss_aux_layer_17": 0.132568359375, "loss_aux_layer_18": 0.14111328125, "loss_aux_layer_19": 0.144775390625, "loss_aux_layer_2": 0.0596923828125, "loss_aux_layer_20": 0.15283203125, "loss_aux_layer_21": 0.16162109375, "loss_aux_layer_22": 0.1845703125, "loss_aux_layer_23": 0.22412109375, "loss_aux_layer_3": 0.070556640625, "loss_aux_layer_4": 0.07373046875, "loss_aux_layer_5": 0.076171875, "loss_aux_layer_6": 0.0787353515625, "loss_aux_layer_7": 0.0760498046875, "loss_aux_layer_8": 0.0753173828125, "loss_aux_layer_9": 0.07421875, "step": 1779, "total_loss": 0.690801128745079 }, { "epoch": 0.3524054642645021, "grad_norm": 1.6867475509643555, "learning_rate": 5e-05, "llm_loss": 0.5798113942146301, "loss": 2.7398, "loss_aux_layer_0": 0.024993896484375, "loss_aux_layer_1": 0.04986572265625, "loss_aux_layer_10": 0.077880859375, "loss_aux_layer_11": 0.0826416015625, "loss_aux_layer_12": 0.0880126953125, "loss_aux_layer_13": 0.0943603515625, "loss_aux_layer_14": 0.10498046875, "loss_aux_layer_15": 0.1151123046875, "loss_aux_layer_16": 0.126220703125, "loss_aux_layer_17": 0.13427734375, "loss_aux_layer_18": 0.14306640625, "loss_aux_layer_19": 0.14697265625, "loss_aux_layer_2": 0.0628662109375, "loss_aux_layer_20": 0.155029296875, "loss_aux_layer_21": 0.161865234375, "loss_aux_layer_22": 0.184326171875, "loss_aux_layer_23": 0.224609375, "loss_aux_layer_3": 0.073974609375, "loss_aux_layer_4": 0.0771484375, "loss_aux_layer_5": 0.0789794921875, "loss_aux_layer_6": 0.0821533203125, "loss_aux_layer_7": 0.0789794921875, "loss_aux_layer_8": 0.0782470703125, "loss_aux_layer_9": 0.076416015625, "step": 1780, "total_loss": 0.6849468499422073 }, { "epoch": 0.35260344486240347, "grad_norm": 1.08309805393219, "learning_rate": 5e-05, "llm_loss": 0.5480494648218155, "loss": 2.6291, "loss_aux_layer_0": 0.023223876953125, "loss_aux_layer_1": 0.05255126953125, "loss_aux_layer_10": 0.082763671875, "loss_aux_layer_11": 0.0882568359375, "loss_aux_layer_12": 0.094482421875, "loss_aux_layer_13": 0.1014404296875, "loss_aux_layer_14": 0.111572265625, "loss_aux_layer_15": 0.1212158203125, "loss_aux_layer_16": 0.131591796875, "loss_aux_layer_17": 0.138427734375, "loss_aux_layer_18": 0.147216796875, "loss_aux_layer_19": 0.148681640625, "loss_aux_layer_2": 0.0648193359375, "loss_aux_layer_20": 0.155517578125, "loss_aux_layer_21": 0.1640625, "loss_aux_layer_22": 0.18896484375, "loss_aux_layer_23": 0.230224609375, "loss_aux_layer_3": 0.0772705078125, "loss_aux_layer_4": 0.0810546875, "loss_aux_layer_5": 0.0828857421875, "loss_aux_layer_6": 0.086669921875, "loss_aux_layer_7": 0.0841064453125, "loss_aux_layer_8": 0.0831298828125, "loss_aux_layer_9": 0.08154296875, "step": 1781, "total_loss": 0.6572677195072174 }, { "epoch": 0.3528014254603049, "grad_norm": 1.7689377069473267, "learning_rate": 5e-05, "llm_loss": 0.5943406671285629, "loss": 2.7846, "loss_aux_layer_0": 0.025543212890625, "loss_aux_layer_1": 0.048583984375, "loss_aux_layer_10": 0.0753173828125, "loss_aux_layer_11": 0.0802001953125, "loss_aux_layer_12": 0.0858154296875, "loss_aux_layer_13": 0.09228515625, "loss_aux_layer_14": 0.1025390625, "loss_aux_layer_15": 0.1123046875, "loss_aux_layer_16": 0.122314453125, "loss_aux_layer_17": 0.130615234375, "loss_aux_layer_18": 0.138671875, "loss_aux_layer_19": 0.14111328125, "loss_aux_layer_2": 0.06146240234375, "loss_aux_layer_20": 0.14794921875, "loss_aux_layer_21": 0.1552734375, "loss_aux_layer_22": 0.177001953125, "loss_aux_layer_23": 0.216552734375, "loss_aux_layer_3": 0.07177734375, "loss_aux_layer_4": 0.0748291015625, "loss_aux_layer_5": 0.0767822265625, "loss_aux_layer_6": 0.0799560546875, "loss_aux_layer_7": 0.0770263671875, "loss_aux_layer_8": 0.075439453125, "loss_aux_layer_9": 0.073974609375, "step": 1782, "total_loss": 0.6961419135332108 }, { "epoch": 0.3529994060582063, "grad_norm": 1.0710866451263428, "learning_rate": 5e-05, "llm_loss": 0.617811769247055, "loss": 2.8868, "loss_aux_layer_0": 0.0223388671875, "loss_aux_layer_1": 0.0474853515625, "loss_aux_layer_10": 0.07763671875, "loss_aux_layer_11": 0.0821533203125, "loss_aux_layer_12": 0.0882568359375, "loss_aux_layer_13": 0.09521484375, "loss_aux_layer_14": 0.10546875, "loss_aux_layer_15": 0.1156005859375, "loss_aux_layer_16": 0.1259765625, "loss_aux_layer_17": 0.134033203125, "loss_aux_layer_18": 0.142822265625, "loss_aux_layer_19": 0.1455078125, "loss_aux_layer_2": 0.060302734375, "loss_aux_layer_20": 0.152587890625, "loss_aux_layer_21": 0.159912109375, "loss_aux_layer_22": 0.1806640625, "loss_aux_layer_23": 0.219970703125, "loss_aux_layer_3": 0.072021484375, "loss_aux_layer_4": 0.0753173828125, "loss_aux_layer_5": 0.0772705078125, "loss_aux_layer_6": 0.0806884765625, "loss_aux_layer_7": 0.0780029296875, "loss_aux_layer_8": 0.0771484375, "loss_aux_layer_9": 0.076171875, "step": 1783, "total_loss": 0.7216890007257462 }, { "epoch": 0.3531973866561077, "grad_norm": 1.448542594909668, "learning_rate": 5e-05, "llm_loss": 0.5647599697113037, "loss": 2.6676, "loss_aux_layer_0": 0.021759033203125, "loss_aux_layer_1": 0.04730224609375, "loss_aux_layer_10": 0.0751953125, "loss_aux_layer_11": 0.080078125, "loss_aux_layer_12": 0.0858154296875, "loss_aux_layer_13": 0.092529296875, "loss_aux_layer_14": 0.1029052734375, "loss_aux_layer_15": 0.1136474609375, "loss_aux_layer_16": 0.1243896484375, "loss_aux_layer_17": 0.1328125, "loss_aux_layer_18": 0.1416015625, "loss_aux_layer_19": 0.144775390625, "loss_aux_layer_2": 0.05902099609375, "loss_aux_layer_20": 0.15234375, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.179931640625, "loss_aux_layer_23": 0.21875, "loss_aux_layer_3": 0.070068359375, "loss_aux_layer_4": 0.0733642578125, "loss_aux_layer_5": 0.0750732421875, "loss_aux_layer_6": 0.078125, "loss_aux_layer_7": 0.0750732421875, "loss_aux_layer_8": 0.0743408203125, "loss_aux_layer_9": 0.073486328125, "step": 1784, "total_loss": 0.6668991148471832 }, { "epoch": 0.3533953672540091, "grad_norm": 1.063332200050354, "learning_rate": 5e-05, "llm_loss": 0.6383031904697418, "loss": 2.9559, "loss_aux_layer_0": 0.023193359375, "loss_aux_layer_1": 0.0457763671875, "loss_aux_layer_10": 0.0740966796875, "loss_aux_layer_11": 0.0789794921875, "loss_aux_layer_12": 0.084716796875, "loss_aux_layer_13": 0.091552734375, "loss_aux_layer_14": 0.1019287109375, "loss_aux_layer_15": 0.1121826171875, "loss_aux_layer_16": 0.123291015625, "loss_aux_layer_17": 0.131591796875, "loss_aux_layer_18": 0.14013671875, "loss_aux_layer_19": 0.14306640625, "loss_aux_layer_2": 0.05743408203125, "loss_aux_layer_20": 0.150146484375, "loss_aux_layer_21": 0.155517578125, "loss_aux_layer_22": 0.175048828125, "loss_aux_layer_23": 0.21337890625, "loss_aux_layer_3": 0.0692138671875, "loss_aux_layer_4": 0.072265625, "loss_aux_layer_5": 0.07421875, "loss_aux_layer_6": 0.0777587890625, "loss_aux_layer_7": 0.0748291015625, "loss_aux_layer_8": 0.0740966796875, "loss_aux_layer_9": 0.07275390625, "step": 1785, "total_loss": 0.7389789670705795 }, { "epoch": 0.3535933478519105, "grad_norm": 0.9499844908714294, "learning_rate": 5e-05, "llm_loss": 0.6083690449595451, "loss": 2.8399, "loss_aux_layer_0": 0.022003173828125, "loss_aux_layer_1": 0.04547119140625, "loss_aux_layer_10": 0.074462890625, "loss_aux_layer_11": 0.079345703125, "loss_aux_layer_12": 0.0855712890625, "loss_aux_layer_13": 0.09228515625, "loss_aux_layer_14": 0.10302734375, "loss_aux_layer_15": 0.1129150390625, "loss_aux_layer_16": 0.1236572265625, "loss_aux_layer_17": 0.13232421875, "loss_aux_layer_18": 0.1416015625, "loss_aux_layer_19": 0.14453125, "loss_aux_layer_2": 0.05718994140625, "loss_aux_layer_20": 0.152099609375, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.1796875, "loss_aux_layer_23": 0.220458984375, "loss_aux_layer_3": 0.068603515625, "loss_aux_layer_4": 0.0718994140625, "loss_aux_layer_5": 0.0738525390625, "loss_aux_layer_6": 0.0772705078125, "loss_aux_layer_7": 0.0745849609375, "loss_aux_layer_8": 0.073974609375, "loss_aux_layer_9": 0.0731201171875, "step": 1786, "total_loss": 0.7099679261445999 }, { "epoch": 0.3537913284498119, "grad_norm": 0.8878889679908752, "learning_rate": 5e-05, "llm_loss": 0.5532575249671936, "loss": 2.6272, "loss_aux_layer_0": 0.023590087890625, "loss_aux_layer_1": 0.04742431640625, "loss_aux_layer_10": 0.077392578125, "loss_aux_layer_11": 0.082275390625, "loss_aux_layer_12": 0.0880126953125, "loss_aux_layer_13": 0.0943603515625, "loss_aux_layer_14": 0.10400390625, "loss_aux_layer_15": 0.113525390625, "loss_aux_layer_16": 0.1241455078125, "loss_aux_layer_17": 0.1317138671875, "loss_aux_layer_18": 0.14013671875, "loss_aux_layer_19": 0.143798828125, "loss_aux_layer_2": 0.05987548828125, "loss_aux_layer_20": 0.151611328125, "loss_aux_layer_21": 0.159423828125, "loss_aux_layer_22": 0.183349609375, "loss_aux_layer_23": 0.224853515625, "loss_aux_layer_3": 0.0716552734375, "loss_aux_layer_4": 0.0751953125, "loss_aux_layer_5": 0.0771484375, "loss_aux_layer_6": 0.08056640625, "loss_aux_layer_7": 0.077880859375, "loss_aux_layer_8": 0.076904296875, "loss_aux_layer_9": 0.0760498046875, "step": 1787, "total_loss": 0.6568019390106201 }, { "epoch": 0.3539893090477133, "grad_norm": 0.952355146408081, "learning_rate": 5e-05, "llm_loss": 0.6319374442100525, "loss": 2.9738, "loss_aux_layer_0": 0.0224609375, "loss_aux_layer_1": 0.05224609375, "loss_aux_layer_10": 0.0855712890625, "loss_aux_layer_11": 0.09130859375, "loss_aux_layer_12": 0.0977783203125, "loss_aux_layer_13": 0.10498046875, "loss_aux_layer_14": 0.1156005859375, "loss_aux_layer_15": 0.12548828125, "loss_aux_layer_16": 0.1357421875, "loss_aux_layer_17": 0.143310546875, "loss_aux_layer_18": 0.150390625, "loss_aux_layer_19": 0.15185546875, "loss_aux_layer_2": 0.06707763671875, "loss_aux_layer_20": 0.15771484375, "loss_aux_layer_21": 0.16455078125, "loss_aux_layer_22": 0.1865234375, "loss_aux_layer_23": 0.2265625, "loss_aux_layer_3": 0.080322265625, "loss_aux_layer_4": 0.0841064453125, "loss_aux_layer_5": 0.0863037109375, "loss_aux_layer_6": 0.0894775390625, "loss_aux_layer_7": 0.08642578125, "loss_aux_layer_8": 0.08544921875, "loss_aux_layer_9": 0.0841064453125, "step": 1788, "total_loss": 0.7434406131505966 }, { "epoch": 0.35418728964561474, "grad_norm": 0.8780178427696228, "learning_rate": 5e-05, "llm_loss": 0.525689035654068, "loss": 2.5159, "loss_aux_layer_0": 0.022705078125, "loss_aux_layer_1": 0.04827880859375, "loss_aux_layer_10": 0.076171875, "loss_aux_layer_11": 0.0809326171875, "loss_aux_layer_12": 0.08642578125, "loss_aux_layer_13": 0.0931396484375, "loss_aux_layer_14": 0.1036376953125, "loss_aux_layer_15": 0.113525390625, "loss_aux_layer_16": 0.1241455078125, "loss_aux_layer_17": 0.1319580078125, "loss_aux_layer_18": 0.14111328125, "loss_aux_layer_19": 0.144775390625, "loss_aux_layer_2": 0.06085205078125, "loss_aux_layer_20": 0.15234375, "loss_aux_layer_21": 0.15966796875, "loss_aux_layer_22": 0.18212890625, "loss_aux_layer_23": 0.22314453125, "loss_aux_layer_3": 0.0721435546875, "loss_aux_layer_4": 0.0753173828125, "loss_aux_layer_5": 0.0772705078125, "loss_aux_layer_6": 0.080322265625, "loss_aux_layer_7": 0.0775146484375, "loss_aux_layer_8": 0.0765380859375, "loss_aux_layer_9": 0.0753173828125, "step": 1789, "total_loss": 0.6289849877357483 }, { "epoch": 0.3543852702435161, "grad_norm": 1.0508904457092285, "learning_rate": 5e-05, "llm_loss": 0.6849881708621979, "loss": 3.1416, "loss_aux_layer_0": 0.023101806640625, "loss_aux_layer_1": 0.04718017578125, "loss_aux_layer_10": 0.0750732421875, "loss_aux_layer_11": 0.0797119140625, "loss_aux_layer_12": 0.0855712890625, "loss_aux_layer_13": 0.0919189453125, "loss_aux_layer_14": 0.1015625, "loss_aux_layer_15": 0.1112060546875, "loss_aux_layer_16": 0.12109375, "loss_aux_layer_17": 0.1287841796875, "loss_aux_layer_18": 0.136962890625, "loss_aux_layer_19": 0.139404296875, "loss_aux_layer_2": 0.05859375, "loss_aux_layer_20": 0.14599609375, "loss_aux_layer_21": 0.1533203125, "loss_aux_layer_22": 0.174560546875, "loss_aux_layer_23": 0.21435546875, "loss_aux_layer_3": 0.06982421875, "loss_aux_layer_4": 0.0733642578125, "loss_aux_layer_5": 0.075439453125, "loss_aux_layer_6": 0.078369140625, "loss_aux_layer_7": 0.0755615234375, "loss_aux_layer_8": 0.0748291015625, "loss_aux_layer_9": 0.0733642578125, "step": 1790, "total_loss": 0.7853942215442657 }, { "epoch": 0.35458325084141756, "grad_norm": 1.228793740272522, "learning_rate": 5e-05, "llm_loss": 0.6371152848005295, "loss": 2.9493, "loss_aux_layer_0": 0.02294921875, "loss_aux_layer_1": 0.04571533203125, "loss_aux_layer_10": 0.072998046875, "loss_aux_layer_11": 0.0775146484375, "loss_aux_layer_12": 0.0833740234375, "loss_aux_layer_13": 0.090087890625, "loss_aux_layer_14": 0.100341796875, "loss_aux_layer_15": 0.110595703125, "loss_aux_layer_16": 0.1212158203125, "loss_aux_layer_17": 0.12939453125, "loss_aux_layer_18": 0.1385498046875, "loss_aux_layer_19": 0.1416015625, "loss_aux_layer_2": 0.057861328125, "loss_aux_layer_20": 0.1494140625, "loss_aux_layer_21": 0.1572265625, "loss_aux_layer_22": 0.178466796875, "loss_aux_layer_23": 0.218017578125, "loss_aux_layer_3": 0.06842041015625, "loss_aux_layer_4": 0.0716552734375, "loss_aux_layer_5": 0.0736083984375, "loss_aux_layer_6": 0.0767822265625, "loss_aux_layer_7": 0.07440185546875, "loss_aux_layer_8": 0.07342529296875, "loss_aux_layer_9": 0.07177734375, "step": 1791, "total_loss": 0.7373297661542892 }, { "epoch": 0.35478123143931894, "grad_norm": 1.5858498811721802, "learning_rate": 5e-05, "llm_loss": 0.54561448097229, "loss": 2.5899, "loss_aux_layer_0": 0.024200439453125, "loss_aux_layer_1": 0.0469970703125, "loss_aux_layer_10": 0.074462890625, "loss_aux_layer_11": 0.079345703125, "loss_aux_layer_12": 0.085205078125, "loss_aux_layer_13": 0.0921630859375, "loss_aux_layer_14": 0.1019287109375, "loss_aux_layer_15": 0.11181640625, "loss_aux_layer_16": 0.122314453125, "loss_aux_layer_17": 0.130859375, "loss_aux_layer_18": 0.140380859375, "loss_aux_layer_19": 0.14306640625, "loss_aux_layer_2": 0.05865478515625, "loss_aux_layer_20": 0.151123046875, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.181884765625, "loss_aux_layer_23": 0.221435546875, "loss_aux_layer_3": 0.0699462890625, "loss_aux_layer_4": 0.0733642578125, "loss_aux_layer_5": 0.0751953125, "loss_aux_layer_6": 0.0777587890625, "loss_aux_layer_7": 0.0753173828125, "loss_aux_layer_8": 0.0745849609375, "loss_aux_layer_9": 0.0731201171875, "step": 1792, "total_loss": 0.6474675536155701 }, { "epoch": 0.3549792120372203, "grad_norm": 1.2455334663391113, "learning_rate": 5e-05, "llm_loss": 0.6348450630903244, "loss": 2.9382, "loss_aux_layer_0": 0.023101806640625, "loss_aux_layer_1": 0.044189453125, "loss_aux_layer_10": 0.0721435546875, "loss_aux_layer_11": 0.076904296875, "loss_aux_layer_12": 0.0830078125, "loss_aux_layer_13": 0.08984375, "loss_aux_layer_14": 0.0997314453125, "loss_aux_layer_15": 0.1102294921875, "loss_aux_layer_16": 0.1209716796875, "loss_aux_layer_17": 0.1298828125, "loss_aux_layer_18": 0.139892578125, "loss_aux_layer_19": 0.1435546875, "loss_aux_layer_2": 0.05615234375, "loss_aux_layer_20": 0.150634765625, "loss_aux_layer_21": 0.157470703125, "loss_aux_layer_22": 0.17919921875, "loss_aux_layer_23": 0.218994140625, "loss_aux_layer_3": 0.06646728515625, "loss_aux_layer_4": 0.06976318359375, "loss_aux_layer_5": 0.07171630859375, "loss_aux_layer_6": 0.0748291015625, "loss_aux_layer_7": 0.072021484375, "loss_aux_layer_8": 0.07135009765625, "loss_aux_layer_9": 0.070556640625, "step": 1793, "total_loss": 0.7345455288887024 }, { "epoch": 0.35517719263512176, "grad_norm": 1.3856905698776245, "learning_rate": 5e-05, "llm_loss": 0.7234542369842529, "loss": 3.3036, "loss_aux_layer_0": 0.0223388671875, "loss_aux_layer_1": 0.04705810546875, "loss_aux_layer_10": 0.076171875, "loss_aux_layer_11": 0.0809326171875, "loss_aux_layer_12": 0.0870361328125, "loss_aux_layer_13": 0.09375, "loss_aux_layer_14": 0.1038818359375, "loss_aux_layer_15": 0.1134033203125, "loss_aux_layer_16": 0.123779296875, "loss_aux_layer_17": 0.132080078125, "loss_aux_layer_18": 0.140625, "loss_aux_layer_19": 0.143310546875, "loss_aux_layer_2": 0.06024169921875, "loss_aux_layer_20": 0.15087890625, "loss_aux_layer_21": 0.156494140625, "loss_aux_layer_22": 0.176513671875, "loss_aux_layer_23": 0.2138671875, "loss_aux_layer_3": 0.072021484375, "loss_aux_layer_4": 0.07568359375, "loss_aux_layer_5": 0.0777587890625, "loss_aux_layer_6": 0.0806884765625, "loss_aux_layer_7": 0.077880859375, "loss_aux_layer_8": 0.0765380859375, "loss_aux_layer_9": 0.0750732421875, "step": 1794, "total_loss": 0.825887605547905 }, { "epoch": 0.35537517323302314, "grad_norm": 1.2190406322479248, "learning_rate": 5e-05, "llm_loss": 0.6798300743103027, "loss": 3.1225, "loss_aux_layer_0": 0.021881103515625, "loss_aux_layer_1": 0.04559326171875, "loss_aux_layer_10": 0.0743408203125, "loss_aux_layer_11": 0.078857421875, "loss_aux_layer_12": 0.0850830078125, "loss_aux_layer_13": 0.0916748046875, "loss_aux_layer_14": 0.10205078125, "loss_aux_layer_15": 0.1121826171875, "loss_aux_layer_16": 0.1231689453125, "loss_aux_layer_17": 0.13037109375, "loss_aux_layer_18": 0.139892578125, "loss_aux_layer_19": 0.142578125, "loss_aux_layer_2": 0.05767822265625, "loss_aux_layer_20": 0.150146484375, "loss_aux_layer_21": 0.1572265625, "loss_aux_layer_22": 0.17724609375, "loss_aux_layer_23": 0.216064453125, "loss_aux_layer_3": 0.0689697265625, "loss_aux_layer_4": 0.0721435546875, "loss_aux_layer_5": 0.07421875, "loss_aux_layer_6": 0.0770263671875, "loss_aux_layer_7": 0.0743408203125, "loss_aux_layer_8": 0.07373046875, "loss_aux_layer_9": 0.072998046875, "step": 1795, "total_loss": 0.7806205600500107 }, { "epoch": 0.3555731538309246, "grad_norm": 1.7910345792770386, "learning_rate": 5e-05, "llm_loss": 0.6464362144470215, "loss": 2.9967, "loss_aux_layer_0": 0.02398681640625, "loss_aux_layer_1": 0.04833984375, "loss_aux_layer_10": 0.0750732421875, "loss_aux_layer_11": 0.080078125, "loss_aux_layer_12": 0.086181640625, "loss_aux_layer_13": 0.0928955078125, "loss_aux_layer_14": 0.10302734375, "loss_aux_layer_15": 0.1134033203125, "loss_aux_layer_16": 0.1240234375, "loss_aux_layer_17": 0.132080078125, "loss_aux_layer_18": 0.140625, "loss_aux_layer_19": 0.143798828125, "loss_aux_layer_2": 0.060791015625, "loss_aux_layer_20": 0.15087890625, "loss_aux_layer_21": 0.158447265625, "loss_aux_layer_22": 0.180419921875, "loss_aux_layer_23": 0.220458984375, "loss_aux_layer_3": 0.0718994140625, "loss_aux_layer_4": 0.0751953125, "loss_aux_layer_5": 0.077392578125, "loss_aux_layer_6": 0.0802001953125, "loss_aux_layer_7": 0.076904296875, "loss_aux_layer_8": 0.07568359375, "loss_aux_layer_9": 0.0738525390625, "step": 1796, "total_loss": 0.7491750419139862 }, { "epoch": 0.35577113442882596, "grad_norm": 1.126822590827942, "learning_rate": 5e-05, "llm_loss": 0.611553393304348, "loss": 2.8799, "loss_aux_layer_0": 0.025360107421875, "loss_aux_layer_1": 0.051513671875, "loss_aux_layer_10": 0.081787109375, "loss_aux_layer_11": 0.0869140625, "loss_aux_layer_12": 0.0926513671875, "loss_aux_layer_13": 0.099609375, "loss_aux_layer_14": 0.1099853515625, "loss_aux_layer_15": 0.1197509765625, "loss_aux_layer_16": 0.130615234375, "loss_aux_layer_17": 0.138916015625, "loss_aux_layer_18": 0.14794921875, "loss_aux_layer_19": 0.150146484375, "loss_aux_layer_2": 0.06500244140625, "loss_aux_layer_20": 0.1572265625, "loss_aux_layer_21": 0.16357421875, "loss_aux_layer_22": 0.185546875, "loss_aux_layer_23": 0.225830078125, "loss_aux_layer_3": 0.0771484375, "loss_aux_layer_4": 0.080322265625, "loss_aux_layer_5": 0.0823974609375, "loss_aux_layer_6": 0.08544921875, "loss_aux_layer_7": 0.0826416015625, "loss_aux_layer_8": 0.0816650390625, "loss_aux_layer_9": 0.0802001953125, "step": 1797, "total_loss": 0.7199717015028 }, { "epoch": 0.3559691150267274, "grad_norm": 1.2484898567199707, "learning_rate": 5e-05, "llm_loss": 0.6046115607023239, "loss": 2.8263, "loss_aux_layer_0": 0.024444580078125, "loss_aux_layer_1": 0.04620361328125, "loss_aux_layer_10": 0.07470703125, "loss_aux_layer_11": 0.0794677734375, "loss_aux_layer_12": 0.08544921875, "loss_aux_layer_13": 0.0921630859375, "loss_aux_layer_14": 0.1024169921875, "loss_aux_layer_15": 0.1126708984375, "loss_aux_layer_16": 0.12353515625, "loss_aux_layer_17": 0.131591796875, "loss_aux_layer_18": 0.140869140625, "loss_aux_layer_19": 0.14404296875, "loss_aux_layer_2": 0.05859375, "loss_aux_layer_20": 0.151123046875, "loss_aux_layer_21": 0.158447265625, "loss_aux_layer_22": 0.18017578125, "loss_aux_layer_23": 0.22119140625, "loss_aux_layer_3": 0.0697021484375, "loss_aux_layer_4": 0.0731201171875, "loss_aux_layer_5": 0.0753173828125, "loss_aux_layer_6": 0.0782470703125, "loss_aux_layer_7": 0.07568359375, "loss_aux_layer_8": 0.0745849609375, "loss_aux_layer_9": 0.0732421875, "step": 1798, "total_loss": 0.7065701335668564 }, { "epoch": 0.3561670956246288, "grad_norm": 1.4735466241836548, "learning_rate": 5e-05, "llm_loss": 0.6239005774259567, "loss": 2.9037, "loss_aux_layer_0": 0.023406982421875, "loss_aux_layer_1": 0.04833984375, "loss_aux_layer_10": 0.0751953125, "loss_aux_layer_11": 0.080078125, "loss_aux_layer_12": 0.0859375, "loss_aux_layer_13": 0.0924072265625, "loss_aux_layer_14": 0.1025390625, "loss_aux_layer_15": 0.1124267578125, "loss_aux_layer_16": 0.12255859375, "loss_aux_layer_17": 0.130859375, "loss_aux_layer_18": 0.139404296875, "loss_aux_layer_19": 0.14208984375, "loss_aux_layer_2": 0.0609130859375, "loss_aux_layer_20": 0.149658203125, "loss_aux_layer_21": 0.156494140625, "loss_aux_layer_22": 0.177978515625, "loss_aux_layer_23": 0.216796875, "loss_aux_layer_3": 0.072021484375, "loss_aux_layer_4": 0.0750732421875, "loss_aux_layer_5": 0.0770263671875, "loss_aux_layer_6": 0.080078125, "loss_aux_layer_7": 0.0770263671875, "loss_aux_layer_8": 0.0755615234375, "loss_aux_layer_9": 0.07421875, "step": 1799, "total_loss": 0.7259313762187958 }, { "epoch": 0.3563650762225302, "grad_norm": 1.5298937559127808, "learning_rate": 5e-05, "llm_loss": 0.6362758874893188, "loss": 2.9642, "loss_aux_layer_0": 0.0224609375, "loss_aux_layer_1": 0.0489501953125, "loss_aux_layer_10": 0.0777587890625, "loss_aux_layer_11": 0.08251953125, "loss_aux_layer_12": 0.088623046875, "loss_aux_layer_13": 0.0953369140625, "loss_aux_layer_14": 0.106201171875, "loss_aux_layer_15": 0.11669921875, "loss_aux_layer_16": 0.127685546875, "loss_aux_layer_17": 0.135498046875, "loss_aux_layer_18": 0.14501953125, "loss_aux_layer_19": 0.14697265625, "loss_aux_layer_2": 0.061279296875, "loss_aux_layer_20": 0.154296875, "loss_aux_layer_21": 0.1611328125, "loss_aux_layer_22": 0.183349609375, "loss_aux_layer_23": 0.22314453125, "loss_aux_layer_3": 0.0721435546875, "loss_aux_layer_4": 0.07568359375, "loss_aux_layer_5": 0.0777587890625, "loss_aux_layer_6": 0.0814208984375, "loss_aux_layer_7": 0.078369140625, "loss_aux_layer_8": 0.0770263671875, "loss_aux_layer_9": 0.0760498046875, "step": 1800, "total_loss": 0.7410619556903839 }, { "epoch": 0.3565630568204316, "grad_norm": 1.1190170049667358, "learning_rate": 5e-05, "llm_loss": 0.6381145268678665, "loss": 2.9718, "loss_aux_layer_0": 0.023284912109375, "loss_aux_layer_1": 0.04736328125, "loss_aux_layer_10": 0.078125, "loss_aux_layer_11": 0.08349609375, "loss_aux_layer_12": 0.0892333984375, "loss_aux_layer_13": 0.096435546875, "loss_aux_layer_14": 0.10693359375, "loss_aux_layer_15": 0.1168212890625, "loss_aux_layer_16": 0.1275634765625, "loss_aux_layer_17": 0.1357421875, "loss_aux_layer_18": 0.144287109375, "loss_aux_layer_19": 0.146240234375, "loss_aux_layer_2": 0.0594482421875, "loss_aux_layer_20": 0.153564453125, "loss_aux_layer_21": 0.16015625, "loss_aux_layer_22": 0.1826171875, "loss_aux_layer_23": 0.22216796875, "loss_aux_layer_3": 0.07177734375, "loss_aux_layer_4": 0.0758056640625, "loss_aux_layer_5": 0.07861328125, "loss_aux_layer_6": 0.082275390625, "loss_aux_layer_7": 0.0789794921875, "loss_aux_layer_8": 0.0780029296875, "loss_aux_layer_9": 0.0767822265625, "step": 1801, "total_loss": 0.742943063378334 }, { "epoch": 0.356761037418333, "grad_norm": 1.2011183500289917, "learning_rate": 5e-05, "llm_loss": 0.6238647550344467, "loss": 2.8936, "loss_aux_layer_0": 0.0218505859375, "loss_aux_layer_1": 0.04718017578125, "loss_aux_layer_10": 0.073974609375, "loss_aux_layer_11": 0.0787353515625, "loss_aux_layer_12": 0.0843505859375, "loss_aux_layer_13": 0.0906982421875, "loss_aux_layer_14": 0.1002197265625, "loss_aux_layer_15": 0.1097412109375, "loss_aux_layer_16": 0.119873046875, "loss_aux_layer_17": 0.12841796875, "loss_aux_layer_18": 0.136474609375, "loss_aux_layer_19": 0.13916015625, "loss_aux_layer_2": 0.05889892578125, "loss_aux_layer_20": 0.14599609375, "loss_aux_layer_21": 0.153076171875, "loss_aux_layer_22": 0.17333984375, "loss_aux_layer_23": 0.212646484375, "loss_aux_layer_3": 0.06982421875, "loss_aux_layer_4": 0.07275390625, "loss_aux_layer_5": 0.0743408203125, "loss_aux_layer_6": 0.0772705078125, "loss_aux_layer_7": 0.074462890625, "loss_aux_layer_8": 0.0736083984375, "loss_aux_layer_9": 0.072509765625, "step": 1802, "total_loss": 0.7234073728322983 }, { "epoch": 0.3569590180162344, "grad_norm": 1.0739294290542603, "learning_rate": 5e-05, "llm_loss": 0.6766913682222366, "loss": 3.1298, "loss_aux_layer_0": 0.0228271484375, "loss_aux_layer_1": 0.050537109375, "loss_aux_layer_10": 0.0804443359375, "loss_aux_layer_11": 0.0855712890625, "loss_aux_layer_12": 0.09130859375, "loss_aux_layer_13": 0.098388671875, "loss_aux_layer_14": 0.1082763671875, "loss_aux_layer_15": 0.1180419921875, "loss_aux_layer_16": 0.128173828125, "loss_aux_layer_17": 0.135498046875, "loss_aux_layer_18": 0.143798828125, "loss_aux_layer_19": 0.1455078125, "loss_aux_layer_2": 0.0634765625, "loss_aux_layer_20": 0.152587890625, "loss_aux_layer_21": 0.158447265625, "loss_aux_layer_22": 0.17822265625, "loss_aux_layer_23": 0.216552734375, "loss_aux_layer_3": 0.0753173828125, "loss_aux_layer_4": 0.0791015625, "loss_aux_layer_5": 0.081298828125, "loss_aux_layer_6": 0.0843505859375, "loss_aux_layer_7": 0.081298828125, "loss_aux_layer_8": 0.080322265625, "loss_aux_layer_9": 0.0787353515625, "step": 1803, "total_loss": 0.7824395298957825 }, { "epoch": 0.3571569986141358, "grad_norm": 1.428978443145752, "learning_rate": 5e-05, "llm_loss": 0.5423443391919136, "loss": 2.5833, "loss_aux_layer_0": 0.022796630859375, "loss_aux_layer_1": 0.04833984375, "loss_aux_layer_10": 0.0771484375, "loss_aux_layer_11": 0.08203125, "loss_aux_layer_12": 0.087646484375, "loss_aux_layer_13": 0.0943603515625, "loss_aux_layer_14": 0.1038818359375, "loss_aux_layer_15": 0.1134033203125, "loss_aux_layer_16": 0.1234130859375, "loss_aux_layer_17": 0.13134765625, "loss_aux_layer_18": 0.139892578125, "loss_aux_layer_19": 0.14306640625, "loss_aux_layer_2": 0.06024169921875, "loss_aux_layer_20": 0.1513671875, "loss_aux_layer_21": 0.16015625, "loss_aux_layer_22": 0.18310546875, "loss_aux_layer_23": 0.223876953125, "loss_aux_layer_3": 0.072509765625, "loss_aux_layer_4": 0.07568359375, "loss_aux_layer_5": 0.077880859375, "loss_aux_layer_6": 0.080810546875, "loss_aux_layer_7": 0.0777587890625, "loss_aux_layer_8": 0.0771484375, "loss_aux_layer_9": 0.07568359375, "step": 1804, "total_loss": 0.6458132266998291 }, { "epoch": 0.35735497921203724, "grad_norm": 1.1336603164672852, "learning_rate": 5e-05, "llm_loss": 0.5472346618771553, "loss": 2.6039, "loss_aux_layer_0": 0.023040771484375, "loss_aux_layer_1": 0.04913330078125, "loss_aux_layer_10": 0.0780029296875, "loss_aux_layer_11": 0.0826416015625, "loss_aux_layer_12": 0.08837890625, "loss_aux_layer_13": 0.09521484375, "loss_aux_layer_14": 0.10498046875, "loss_aux_layer_15": 0.1146240234375, "loss_aux_layer_16": 0.1251220703125, "loss_aux_layer_17": 0.13232421875, "loss_aux_layer_18": 0.140380859375, "loss_aux_layer_19": 0.142333984375, "loss_aux_layer_2": 0.0625, "loss_aux_layer_20": 0.1494140625, "loss_aux_layer_21": 0.156494140625, "loss_aux_layer_22": 0.17724609375, "loss_aux_layer_23": 0.215576171875, "loss_aux_layer_3": 0.07470703125, "loss_aux_layer_4": 0.078369140625, "loss_aux_layer_5": 0.080322265625, "loss_aux_layer_6": 0.083251953125, "loss_aux_layer_7": 0.080078125, "loss_aux_layer_8": 0.0787353515625, "loss_aux_layer_9": 0.0771484375, "step": 1805, "total_loss": 0.650979146361351 }, { "epoch": 0.3575529598099386, "grad_norm": 1.166759729385376, "learning_rate": 5e-05, "llm_loss": 0.5117073357105255, "loss": 2.4626, "loss_aux_layer_0": 0.02471923828125, "loss_aux_layer_1": 0.04864501953125, "loss_aux_layer_10": 0.0770263671875, "loss_aux_layer_11": 0.08203125, "loss_aux_layer_12": 0.0877685546875, "loss_aux_layer_13": 0.09423828125, "loss_aux_layer_14": 0.1046142578125, "loss_aux_layer_15": 0.114990234375, "loss_aux_layer_16": 0.1253662109375, "loss_aux_layer_17": 0.1328125, "loss_aux_layer_18": 0.141845703125, "loss_aux_layer_19": 0.144287109375, "loss_aux_layer_2": 0.0609130859375, "loss_aux_layer_20": 0.15185546875, "loss_aux_layer_21": 0.16064453125, "loss_aux_layer_22": 0.183349609375, "loss_aux_layer_23": 0.2236328125, "loss_aux_layer_3": 0.072265625, "loss_aux_layer_4": 0.0755615234375, "loss_aux_layer_5": 0.0770263671875, "loss_aux_layer_6": 0.0802001953125, "loss_aux_layer_7": 0.077880859375, "loss_aux_layer_8": 0.076904296875, "loss_aux_layer_9": 0.0758056640625, "step": 1806, "total_loss": 0.6156465709209442 }, { "epoch": 0.35775094040784006, "grad_norm": 1.1518659591674805, "learning_rate": 5e-05, "llm_loss": 0.5585104823112488, "loss": 2.6446, "loss_aux_layer_0": 0.022216796875, "loss_aux_layer_1": 0.047119140625, "loss_aux_layer_10": 0.0745849609375, "loss_aux_layer_11": 0.0797119140625, "loss_aux_layer_12": 0.0855712890625, "loss_aux_layer_13": 0.0924072265625, "loss_aux_layer_14": 0.1031494140625, "loss_aux_layer_15": 0.11376953125, "loss_aux_layer_16": 0.12451171875, "loss_aux_layer_17": 0.132568359375, "loss_aux_layer_18": 0.141845703125, "loss_aux_layer_19": 0.144775390625, "loss_aux_layer_2": 0.05938720703125, "loss_aux_layer_20": 0.152099609375, "loss_aux_layer_21": 0.160400390625, "loss_aux_layer_22": 0.182861328125, "loss_aux_layer_23": 0.22265625, "loss_aux_layer_3": 0.07080078125, "loss_aux_layer_4": 0.0738525390625, "loss_aux_layer_5": 0.0755615234375, "loss_aux_layer_6": 0.078857421875, "loss_aux_layer_7": 0.075927734375, "loss_aux_layer_8": 0.0748291015625, "loss_aux_layer_9": 0.0736083984375, "step": 1807, "total_loss": 0.66114142537117 }, { "epoch": 0.35794892100574144, "grad_norm": 1.0499283075332642, "learning_rate": 5e-05, "llm_loss": 0.6490430980920792, "loss": 3.0068, "loss_aux_layer_0": 0.021942138671875, "loss_aux_layer_1": 0.0479736328125, "loss_aux_layer_10": 0.077392578125, "loss_aux_layer_11": 0.08251953125, "loss_aux_layer_12": 0.0880126953125, "loss_aux_layer_13": 0.094482421875, "loss_aux_layer_14": 0.1036376953125, "loss_aux_layer_15": 0.11279296875, "loss_aux_layer_16": 0.1226806640625, "loss_aux_layer_17": 0.13037109375, "loss_aux_layer_18": 0.138427734375, "loss_aux_layer_19": 0.141357421875, "loss_aux_layer_2": 0.060546875, "loss_aux_layer_20": 0.148193359375, "loss_aux_layer_21": 0.156494140625, "loss_aux_layer_22": 0.179443359375, "loss_aux_layer_23": 0.219482421875, "loss_aux_layer_3": 0.0723876953125, "loss_aux_layer_4": 0.0755615234375, "loss_aux_layer_5": 0.07763671875, "loss_aux_layer_6": 0.0810546875, "loss_aux_layer_7": 0.078369140625, "loss_aux_layer_8": 0.0775146484375, "loss_aux_layer_9": 0.0760498046875, "step": 1808, "total_loss": 0.7517121732234955 }, { "epoch": 0.3581469016036428, "grad_norm": 1.737323522567749, "learning_rate": 5e-05, "llm_loss": 0.5417932346463203, "loss": 2.5941, "loss_aux_layer_0": 0.025390625, "loss_aux_layer_1": 0.050048828125, "loss_aux_layer_10": 0.080078125, "loss_aux_layer_11": 0.085205078125, "loss_aux_layer_12": 0.091064453125, "loss_aux_layer_13": 0.097900390625, "loss_aux_layer_14": 0.108154296875, "loss_aux_layer_15": 0.1185302734375, "loss_aux_layer_16": 0.1290283203125, "loss_aux_layer_17": 0.136474609375, "loss_aux_layer_18": 0.145263671875, "loss_aux_layer_19": 0.1474609375, "loss_aux_layer_2": 0.06427001953125, "loss_aux_layer_20": 0.15478515625, "loss_aux_layer_21": 0.161865234375, "loss_aux_layer_22": 0.184326171875, "loss_aux_layer_23": 0.22509765625, "loss_aux_layer_3": 0.0760498046875, "loss_aux_layer_4": 0.078857421875, "loss_aux_layer_5": 0.0806884765625, "loss_aux_layer_6": 0.083251953125, "loss_aux_layer_7": 0.0802001953125, "loss_aux_layer_8": 0.0794677734375, "loss_aux_layer_9": 0.0782470703125, "step": 1809, "total_loss": 0.6485286056995392 }, { "epoch": 0.35834488220154426, "grad_norm": 1.1725400686264038, "learning_rate": 5e-05, "llm_loss": 0.6591563522815704, "loss": 3.0414, "loss_aux_layer_0": 0.022979736328125, "loss_aux_layer_1": 0.0477294921875, "loss_aux_layer_10": 0.0750732421875, "loss_aux_layer_11": 0.079833984375, "loss_aux_layer_12": 0.0858154296875, "loss_aux_layer_13": 0.092041015625, "loss_aux_layer_14": 0.1019287109375, "loss_aux_layer_15": 0.111572265625, "loss_aux_layer_16": 0.1220703125, "loss_aux_layer_17": 0.13037109375, "loss_aux_layer_18": 0.138671875, "loss_aux_layer_19": 0.14111328125, "loss_aux_layer_2": 0.05987548828125, "loss_aux_layer_20": 0.14892578125, "loss_aux_layer_21": 0.155029296875, "loss_aux_layer_22": 0.175537109375, "loss_aux_layer_23": 0.215087890625, "loss_aux_layer_3": 0.0709228515625, "loss_aux_layer_4": 0.073974609375, "loss_aux_layer_5": 0.076171875, "loss_aux_layer_6": 0.0791015625, "loss_aux_layer_7": 0.0762939453125, "loss_aux_layer_8": 0.0753173828125, "loss_aux_layer_9": 0.0738525390625, "step": 1810, "total_loss": 0.7603623121976852 }, { "epoch": 0.35854286279944564, "grad_norm": 1.274794578552246, "learning_rate": 5e-05, "llm_loss": 0.5936739817261696, "loss": 2.7916, "loss_aux_layer_0": 0.02386474609375, "loss_aux_layer_1": 0.047607421875, "loss_aux_layer_10": 0.0765380859375, "loss_aux_layer_11": 0.081298828125, "loss_aux_layer_12": 0.0869140625, "loss_aux_layer_13": 0.09375, "loss_aux_layer_14": 0.1044921875, "loss_aux_layer_15": 0.11474609375, "loss_aux_layer_16": 0.12548828125, "loss_aux_layer_17": 0.13330078125, "loss_aux_layer_18": 0.142822265625, "loss_aux_layer_19": 0.146728515625, "loss_aux_layer_2": 0.06060791015625, "loss_aux_layer_20": 0.155029296875, "loss_aux_layer_21": 0.162109375, "loss_aux_layer_22": 0.1845703125, "loss_aux_layer_23": 0.225341796875, "loss_aux_layer_3": 0.072509765625, "loss_aux_layer_4": 0.0755615234375, "loss_aux_layer_5": 0.077880859375, "loss_aux_layer_6": 0.0806884765625, "loss_aux_layer_7": 0.077392578125, "loss_aux_layer_8": 0.0765380859375, "loss_aux_layer_9": 0.0751953125, "step": 1811, "total_loss": 0.6979008615016937 }, { "epoch": 0.3587408433973471, "grad_norm": 1.161074161529541, "learning_rate": 5e-05, "llm_loss": 0.6052051037549973, "loss": 2.8201, "loss_aux_layer_0": 0.0220947265625, "loss_aux_layer_1": 0.0457763671875, "loss_aux_layer_10": 0.0733642578125, "loss_aux_layer_11": 0.078369140625, "loss_aux_layer_12": 0.083984375, "loss_aux_layer_13": 0.0906982421875, "loss_aux_layer_14": 0.1004638671875, "loss_aux_layer_15": 0.1099853515625, "loss_aux_layer_16": 0.1201171875, "loss_aux_layer_17": 0.1279296875, "loss_aux_layer_18": 0.135986328125, "loss_aux_layer_19": 0.13916015625, "loss_aux_layer_2": 0.05859375, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.154541015625, "loss_aux_layer_22": 0.177001953125, "loss_aux_layer_23": 0.216064453125, "loss_aux_layer_3": 0.0697021484375, "loss_aux_layer_4": 0.0726318359375, "loss_aux_layer_5": 0.0748291015625, "loss_aux_layer_6": 0.0775146484375, "loss_aux_layer_7": 0.07470703125, "loss_aux_layer_8": 0.073486328125, "loss_aux_layer_9": 0.0721435546875, "step": 1812, "total_loss": 0.705015704035759 }, { "epoch": 0.35893882399524846, "grad_norm": 1.2784887552261353, "learning_rate": 5e-05, "llm_loss": 0.6430376768112183, "loss": 2.9879, "loss_aux_layer_0": 0.023040771484375, "loss_aux_layer_1": 0.050048828125, "loss_aux_layer_10": 0.0770263671875, "loss_aux_layer_11": 0.0821533203125, "loss_aux_layer_12": 0.0880126953125, "loss_aux_layer_13": 0.0947265625, "loss_aux_layer_14": 0.1044921875, "loss_aux_layer_15": 0.1142578125, "loss_aux_layer_16": 0.1243896484375, "loss_aux_layer_17": 0.132568359375, "loss_aux_layer_18": 0.140380859375, "loss_aux_layer_19": 0.1435546875, "loss_aux_layer_2": 0.06256103515625, "loss_aux_layer_20": 0.151123046875, "loss_aux_layer_21": 0.158935546875, "loss_aux_layer_22": 0.18115234375, "loss_aux_layer_23": 0.220458984375, "loss_aux_layer_3": 0.0743408203125, "loss_aux_layer_4": 0.07763671875, "loss_aux_layer_5": 0.0791015625, "loss_aux_layer_6": 0.0821533203125, "loss_aux_layer_7": 0.0792236328125, "loss_aux_layer_8": 0.077392578125, "loss_aux_layer_9": 0.07568359375, "step": 1813, "total_loss": 0.7469735592603683 }, { "epoch": 0.3591368045931499, "grad_norm": 1.2284380197525024, "learning_rate": 5e-05, "llm_loss": 0.6504015177488327, "loss": 3.0244, "loss_aux_layer_0": 0.022918701171875, "loss_aux_layer_1": 0.04833984375, "loss_aux_layer_10": 0.07861328125, "loss_aux_layer_11": 0.08349609375, "loss_aux_layer_12": 0.0894775390625, "loss_aux_layer_13": 0.0966796875, "loss_aux_layer_14": 0.106689453125, "loss_aux_layer_15": 0.116455078125, "loss_aux_layer_16": 0.126953125, "loss_aux_layer_17": 0.13525390625, "loss_aux_layer_18": 0.144287109375, "loss_aux_layer_19": 0.146728515625, "loss_aux_layer_2": 0.06195068359375, "loss_aux_layer_20": 0.154296875, "loss_aux_layer_21": 0.161865234375, "loss_aux_layer_22": 0.185302734375, "loss_aux_layer_23": 0.225830078125, "loss_aux_layer_3": 0.0743408203125, "loss_aux_layer_4": 0.0777587890625, "loss_aux_layer_5": 0.0797119140625, "loss_aux_layer_6": 0.0831298828125, "loss_aux_layer_7": 0.079833984375, "loss_aux_layer_8": 0.0787353515625, "loss_aux_layer_9": 0.0770263671875, "step": 1814, "total_loss": 0.7561037391424179 }, { "epoch": 0.3593347851910513, "grad_norm": 1.8610612154006958, "learning_rate": 5e-05, "llm_loss": 0.6385893523693085, "loss": 2.9602, "loss_aux_layer_0": 0.02154541015625, "loss_aux_layer_1": 0.04608154296875, "loss_aux_layer_10": 0.0731201171875, "loss_aux_layer_11": 0.078125, "loss_aux_layer_12": 0.0841064453125, "loss_aux_layer_13": 0.091552734375, "loss_aux_layer_14": 0.1025390625, "loss_aux_layer_15": 0.1136474609375, "loss_aux_layer_16": 0.124755859375, "loss_aux_layer_17": 0.133056640625, "loss_aux_layer_18": 0.142822265625, "loss_aux_layer_19": 0.1455078125, "loss_aux_layer_2": 0.05755615234375, "loss_aux_layer_20": 0.15283203125, "loss_aux_layer_21": 0.15966796875, "loss_aux_layer_22": 0.18115234375, "loss_aux_layer_23": 0.22021484375, "loss_aux_layer_3": 0.068603515625, "loss_aux_layer_4": 0.0712890625, "loss_aux_layer_5": 0.072998046875, "loss_aux_layer_6": 0.0762939453125, "loss_aux_layer_7": 0.0733642578125, "loss_aux_layer_8": 0.072509765625, "loss_aux_layer_9": 0.071533203125, "step": 1815, "total_loss": 0.7400482147932053 }, { "epoch": 0.35953276578895266, "grad_norm": 1.7011414766311646, "learning_rate": 5e-05, "llm_loss": 0.5470990836620331, "loss": 2.5971, "loss_aux_layer_0": 0.0240478515625, "loss_aux_layer_1": 0.046142578125, "loss_aux_layer_10": 0.0738525390625, "loss_aux_layer_11": 0.0784912109375, "loss_aux_layer_12": 0.084716796875, "loss_aux_layer_13": 0.0914306640625, "loss_aux_layer_14": 0.1024169921875, "loss_aux_layer_15": 0.1131591796875, "loss_aux_layer_16": 0.1240234375, "loss_aux_layer_17": 0.132568359375, "loss_aux_layer_18": 0.142578125, "loss_aux_layer_19": 0.146240234375, "loss_aux_layer_2": 0.0570068359375, "loss_aux_layer_20": 0.154541015625, "loss_aux_layer_21": 0.1630859375, "loss_aux_layer_22": 0.18505859375, "loss_aux_layer_23": 0.226806640625, "loss_aux_layer_3": 0.06768798828125, "loss_aux_layer_4": 0.0706787109375, "loss_aux_layer_5": 0.0726318359375, "loss_aux_layer_6": 0.076171875, "loss_aux_layer_7": 0.073486328125, "loss_aux_layer_8": 0.0728759765625, "loss_aux_layer_9": 0.0723876953125, "step": 1816, "total_loss": 0.6492752879858017 }, { "epoch": 0.3597307463868541, "grad_norm": 1.2054314613342285, "learning_rate": 5e-05, "llm_loss": 0.6539898067712784, "loss": 3.0131, "loss_aux_layer_0": 0.025482177734375, "loss_aux_layer_1": 0.0458984375, "loss_aux_layer_10": 0.072265625, "loss_aux_layer_11": 0.076904296875, "loss_aux_layer_12": 0.0828857421875, "loss_aux_layer_13": 0.0897216796875, "loss_aux_layer_14": 0.0997314453125, "loss_aux_layer_15": 0.109619140625, "loss_aux_layer_16": 0.1202392578125, "loss_aux_layer_17": 0.12890625, "loss_aux_layer_18": 0.138427734375, "loss_aux_layer_19": 0.1416015625, "loss_aux_layer_2": 0.056396484375, "loss_aux_layer_20": 0.148681640625, "loss_aux_layer_21": 0.155029296875, "loss_aux_layer_22": 0.1748046875, "loss_aux_layer_23": 0.21337890625, "loss_aux_layer_3": 0.0677490234375, "loss_aux_layer_4": 0.0709228515625, "loss_aux_layer_5": 0.0728759765625, "loss_aux_layer_6": 0.07568359375, "loss_aux_layer_7": 0.0731201171875, "loss_aux_layer_8": 0.072021484375, "loss_aux_layer_9": 0.071044921875, "step": 1817, "total_loss": 0.7532784193754196 }, { "epoch": 0.3599287269847555, "grad_norm": 2.138916254043579, "learning_rate": 5e-05, "llm_loss": 0.5940933972597122, "loss": 2.7923, "loss_aux_layer_0": 0.023651123046875, "loss_aux_layer_1": 0.04681396484375, "loss_aux_layer_10": 0.0777587890625, "loss_aux_layer_11": 0.08251953125, "loss_aux_layer_12": 0.0888671875, "loss_aux_layer_13": 0.095947265625, "loss_aux_layer_14": 0.1065673828125, "loss_aux_layer_15": 0.1170654296875, "loss_aux_layer_16": 0.127197265625, "loss_aux_layer_17": 0.1353759765625, "loss_aux_layer_18": 0.144287109375, "loss_aux_layer_19": 0.146728515625, "loss_aux_layer_2": 0.05908203125, "loss_aux_layer_20": 0.1533203125, "loss_aux_layer_21": 0.15966796875, "loss_aux_layer_22": 0.180419921875, "loss_aux_layer_23": 0.220458984375, "loss_aux_layer_3": 0.07080078125, "loss_aux_layer_4": 0.073974609375, "loss_aux_layer_5": 0.076171875, "loss_aux_layer_6": 0.0799560546875, "loss_aux_layer_7": 0.0770263671875, "loss_aux_layer_8": 0.0765380859375, "loss_aux_layer_9": 0.075927734375, "step": 1818, "total_loss": 0.6980845928192139 }, { "epoch": 0.3601267075826569, "grad_norm": 1.3152848482131958, "learning_rate": 5e-05, "llm_loss": 0.5757649540901184, "loss": 2.7053, "loss_aux_layer_0": 0.025146484375, "loss_aux_layer_1": 0.045654296875, "loss_aux_layer_10": 0.0721435546875, "loss_aux_layer_11": 0.0771484375, "loss_aux_layer_12": 0.0828857421875, "loss_aux_layer_13": 0.0897216796875, "loss_aux_layer_14": 0.1004638671875, "loss_aux_layer_15": 0.110595703125, "loss_aux_layer_16": 0.121337890625, "loss_aux_layer_17": 0.130126953125, "loss_aux_layer_18": 0.13916015625, "loss_aux_layer_19": 0.143310546875, "loss_aux_layer_2": 0.05780029296875, "loss_aux_layer_20": 0.1513671875, "loss_aux_layer_21": 0.160400390625, "loss_aux_layer_22": 0.18212890625, "loss_aux_layer_23": 0.222412109375, "loss_aux_layer_3": 0.0677490234375, "loss_aux_layer_4": 0.070556640625, "loss_aux_layer_5": 0.0721435546875, "loss_aux_layer_6": 0.075439453125, "loss_aux_layer_7": 0.0728759765625, "loss_aux_layer_8": 0.0721435546875, "loss_aux_layer_9": 0.071044921875, "step": 1819, "total_loss": 0.6763211488723755 }, { "epoch": 0.3603246881805583, "grad_norm": 1.6970019340515137, "learning_rate": 5e-05, "llm_loss": 0.6932097375392914, "loss": 3.188, "loss_aux_layer_0": 0.023529052734375, "loss_aux_layer_1": 0.04864501953125, "loss_aux_layer_10": 0.0789794921875, "loss_aux_layer_11": 0.0838623046875, "loss_aux_layer_12": 0.089111328125, "loss_aux_layer_13": 0.0955810546875, "loss_aux_layer_14": 0.1055908203125, "loss_aux_layer_15": 0.1146240234375, "loss_aux_layer_16": 0.1243896484375, "loss_aux_layer_17": 0.13232421875, "loss_aux_layer_18": 0.140869140625, "loss_aux_layer_19": 0.142333984375, "loss_aux_layer_2": 0.0616455078125, "loss_aux_layer_20": 0.149658203125, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.17724609375, "loss_aux_layer_23": 0.2158203125, "loss_aux_layer_3": 0.0745849609375, "loss_aux_layer_4": 0.077392578125, "loss_aux_layer_5": 0.0791015625, "loss_aux_layer_6": 0.082763671875, "loss_aux_layer_7": 0.0806884765625, "loss_aux_layer_8": 0.0792236328125, "loss_aux_layer_9": 0.07763671875, "step": 1820, "total_loss": 0.7969960272312164 }, { "epoch": 0.36052266877845973, "grad_norm": 1.3527189493179321, "learning_rate": 5e-05, "llm_loss": 0.671680748462677, "loss": 3.1134, "loss_aux_layer_0": 0.024169921875, "loss_aux_layer_1": 0.04986572265625, "loss_aux_layer_10": 0.080078125, "loss_aux_layer_11": 0.0849609375, "loss_aux_layer_12": 0.0909423828125, "loss_aux_layer_13": 0.09765625, "loss_aux_layer_14": 0.1077880859375, "loss_aux_layer_15": 0.1181640625, "loss_aux_layer_16": 0.1298828125, "loss_aux_layer_17": 0.137939453125, "loss_aux_layer_18": 0.146484375, "loss_aux_layer_19": 0.149169921875, "loss_aux_layer_2": 0.06268310546875, "loss_aux_layer_20": 0.156494140625, "loss_aux_layer_21": 0.162841796875, "loss_aux_layer_22": 0.18408203125, "loss_aux_layer_23": 0.22216796875, "loss_aux_layer_3": 0.07470703125, "loss_aux_layer_4": 0.0780029296875, "loss_aux_layer_5": 0.080078125, "loss_aux_layer_6": 0.0833740234375, "loss_aux_layer_7": 0.080810546875, "loss_aux_layer_8": 0.07958984375, "loss_aux_layer_9": 0.078369140625, "step": 1821, "total_loss": 0.7783552408218384 }, { "epoch": 0.3607206493763611, "grad_norm": 1.0394108295440674, "learning_rate": 5e-05, "llm_loss": 0.5494381561875343, "loss": 2.592, "loss_aux_layer_0": 0.022796630859375, "loss_aux_layer_1": 0.04473876953125, "loss_aux_layer_10": 0.071533203125, "loss_aux_layer_11": 0.0760498046875, "loss_aux_layer_12": 0.082275390625, "loss_aux_layer_13": 0.089111328125, "loss_aux_layer_14": 0.099365234375, "loss_aux_layer_15": 0.1097412109375, "loss_aux_layer_16": 0.120849609375, "loss_aux_layer_17": 0.1292724609375, "loss_aux_layer_18": 0.13818359375, "loss_aux_layer_19": 0.140869140625, "loss_aux_layer_2": 0.05535888671875, "loss_aux_layer_20": 0.148193359375, "loss_aux_layer_21": 0.155517578125, "loss_aux_layer_22": 0.175537109375, "loss_aux_layer_23": 0.21484375, "loss_aux_layer_3": 0.0655517578125, "loss_aux_layer_4": 0.068603515625, "loss_aux_layer_5": 0.0709228515625, "loss_aux_layer_6": 0.073974609375, "loss_aux_layer_7": 0.0714111328125, "loss_aux_layer_8": 0.0709228515625, "loss_aux_layer_9": 0.0699462890625, "step": 1822, "total_loss": 0.6479876637458801 }, { "epoch": 0.3609186299742625, "grad_norm": 1.2573223114013672, "learning_rate": 5e-05, "llm_loss": 0.5953325629234314, "loss": 2.7905, "loss_aux_layer_0": 0.02313232421875, "loss_aux_layer_1": 0.04718017578125, "loss_aux_layer_10": 0.07568359375, "loss_aux_layer_11": 0.08056640625, "loss_aux_layer_12": 0.08642578125, "loss_aux_layer_13": 0.0926513671875, "loss_aux_layer_14": 0.1031494140625, "loss_aux_layer_15": 0.11279296875, "loss_aux_layer_16": 0.123046875, "loss_aux_layer_17": 0.13134765625, "loss_aux_layer_18": 0.14013671875, "loss_aux_layer_19": 0.14306640625, "loss_aux_layer_2": 0.059326171875, "loss_aux_layer_20": 0.151123046875, "loss_aux_layer_21": 0.15966796875, "loss_aux_layer_22": 0.1826171875, "loss_aux_layer_23": 0.2236328125, "loss_aux_layer_3": 0.06982421875, "loss_aux_layer_4": 0.0731201171875, "loss_aux_layer_5": 0.0750732421875, "loss_aux_layer_6": 0.078125, "loss_aux_layer_7": 0.07568359375, "loss_aux_layer_8": 0.074951171875, "loss_aux_layer_9": 0.0738525390625, "step": 1823, "total_loss": 0.6976359039545059 }, { "epoch": 0.36111661057216393, "grad_norm": 1.0213687419891357, "learning_rate": 5e-05, "llm_loss": 0.5782736390829086, "loss": 2.7292, "loss_aux_layer_0": 0.022552490234375, "loss_aux_layer_1": 0.04840087890625, "loss_aux_layer_10": 0.07763671875, "loss_aux_layer_11": 0.0826416015625, "loss_aux_layer_12": 0.088134765625, "loss_aux_layer_13": 0.0948486328125, "loss_aux_layer_14": 0.10498046875, "loss_aux_layer_15": 0.115234375, "loss_aux_layer_16": 0.1260986328125, "loss_aux_layer_17": 0.1334228515625, "loss_aux_layer_18": 0.14208984375, "loss_aux_layer_19": 0.14501953125, "loss_aux_layer_2": 0.0611572265625, "loss_aux_layer_20": 0.153076171875, "loss_aux_layer_21": 0.15966796875, "loss_aux_layer_22": 0.180419921875, "loss_aux_layer_23": 0.220947265625, "loss_aux_layer_3": 0.0726318359375, "loss_aux_layer_4": 0.075927734375, "loss_aux_layer_5": 0.077880859375, "loss_aux_layer_6": 0.081298828125, "loss_aux_layer_7": 0.078369140625, "loss_aux_layer_8": 0.0775146484375, "loss_aux_layer_9": 0.0760498046875, "step": 1824, "total_loss": 0.6822966039180756 }, { "epoch": 0.3613145911700653, "grad_norm": 1.3829208612442017, "learning_rate": 5e-05, "llm_loss": 0.5712406933307648, "loss": 2.7102, "loss_aux_layer_0": 0.023712158203125, "loss_aux_layer_1": 0.04864501953125, "loss_aux_layer_10": 0.07958984375, "loss_aux_layer_11": 0.0845947265625, "loss_aux_layer_12": 0.0904541015625, "loss_aux_layer_13": 0.0968017578125, "loss_aux_layer_14": 0.1070556640625, "loss_aux_layer_15": 0.1175537109375, "loss_aux_layer_16": 0.1278076171875, "loss_aux_layer_17": 0.135986328125, "loss_aux_layer_18": 0.144287109375, "loss_aux_layer_19": 0.1474609375, "loss_aux_layer_2": 0.06256103515625, "loss_aux_layer_20": 0.15478515625, "loss_aux_layer_21": 0.163330078125, "loss_aux_layer_22": 0.185791015625, "loss_aux_layer_23": 0.226806640625, "loss_aux_layer_3": 0.07470703125, "loss_aux_layer_4": 0.0777587890625, "loss_aux_layer_5": 0.0802001953125, "loss_aux_layer_6": 0.0836181640625, "loss_aux_layer_7": 0.0804443359375, "loss_aux_layer_8": 0.07958984375, "loss_aux_layer_9": 0.0780029296875, "step": 1825, "total_loss": 0.6775423288345337 }, { "epoch": 0.36151257176796675, "grad_norm": 1.35593581199646, "learning_rate": 5e-05, "llm_loss": 0.5562324449419975, "loss": 2.6469, "loss_aux_layer_0": 0.021759033203125, "loss_aux_layer_1": 0.04754638671875, "loss_aux_layer_10": 0.0784912109375, "loss_aux_layer_11": 0.083740234375, "loss_aux_layer_12": 0.0897216796875, "loss_aux_layer_13": 0.0972900390625, "loss_aux_layer_14": 0.1077880859375, "loss_aux_layer_15": 0.1181640625, "loss_aux_layer_16": 0.1287841796875, "loss_aux_layer_17": 0.136474609375, "loss_aux_layer_18": 0.145263671875, "loss_aux_layer_19": 0.147705078125, "loss_aux_layer_2": 0.0609130859375, "loss_aux_layer_20": 0.15478515625, "loss_aux_layer_21": 0.161865234375, "loss_aux_layer_22": 0.183349609375, "loss_aux_layer_23": 0.22314453125, "loss_aux_layer_3": 0.0731201171875, "loss_aux_layer_4": 0.0765380859375, "loss_aux_layer_5": 0.0787353515625, "loss_aux_layer_6": 0.08203125, "loss_aux_layer_7": 0.0789794921875, "loss_aux_layer_8": 0.0782470703125, "loss_aux_layer_9": 0.076904296875, "step": 1826, "total_loss": 0.6617366671562195 }, { "epoch": 0.36171055236586813, "grad_norm": 1.6927183866500854, "learning_rate": 5e-05, "llm_loss": 0.5265940502285957, "loss": 2.5075, "loss_aux_layer_0": 0.0223388671875, "loss_aux_layer_1": 0.0452880859375, "loss_aux_layer_10": 0.0736083984375, "loss_aux_layer_11": 0.0780029296875, "loss_aux_layer_12": 0.083251953125, "loss_aux_layer_13": 0.08984375, "loss_aux_layer_14": 0.0997314453125, "loss_aux_layer_15": 0.1097412109375, "loss_aux_layer_16": 0.119873046875, "loss_aux_layer_17": 0.128173828125, "loss_aux_layer_18": 0.137451171875, "loss_aux_layer_19": 0.1416015625, "loss_aux_layer_2": 0.0577392578125, "loss_aux_layer_20": 0.150146484375, "loss_aux_layer_21": 0.158203125, "loss_aux_layer_22": 0.17919921875, "loss_aux_layer_23": 0.219970703125, "loss_aux_layer_3": 0.069091796875, "loss_aux_layer_4": 0.0721435546875, "loss_aux_layer_5": 0.073974609375, "loss_aux_layer_6": 0.0770263671875, "loss_aux_layer_7": 0.07421875, "loss_aux_layer_8": 0.073486328125, "loss_aux_layer_9": 0.0723876953125, "step": 1827, "total_loss": 0.626869261264801 }, { "epoch": 0.36190853296376957, "grad_norm": 1.1748442649841309, "learning_rate": 5e-05, "llm_loss": 0.6378489136695862, "loss": 2.9536, "loss_aux_layer_0": 0.02349853515625, "loss_aux_layer_1": 0.0450439453125, "loss_aux_layer_10": 0.07373046875, "loss_aux_layer_11": 0.078125, "loss_aux_layer_12": 0.0838623046875, "loss_aux_layer_13": 0.090576171875, "loss_aux_layer_14": 0.1004638671875, "loss_aux_layer_15": 0.11083984375, "loss_aux_layer_16": 0.1221923828125, "loss_aux_layer_17": 0.130126953125, "loss_aux_layer_18": 0.139404296875, "loss_aux_layer_19": 0.1435546875, "loss_aux_layer_2": 0.056640625, "loss_aux_layer_20": 0.151123046875, "loss_aux_layer_21": 0.15869140625, "loss_aux_layer_22": 0.179931640625, "loss_aux_layer_23": 0.219482421875, "loss_aux_layer_3": 0.06793212890625, "loss_aux_layer_4": 0.07080078125, "loss_aux_layer_5": 0.0728759765625, "loss_aux_layer_6": 0.075927734375, "loss_aux_layer_7": 0.0732421875, "loss_aux_layer_8": 0.0728759765625, "loss_aux_layer_9": 0.072021484375, "step": 1828, "total_loss": 0.7384021133184433 }, { "epoch": 0.36210651356167095, "grad_norm": 1.6353363990783691, "learning_rate": 5e-05, "llm_loss": 0.5824861750006676, "loss": 2.7442, "loss_aux_layer_0": 0.02288818359375, "loss_aux_layer_1": 0.04766845703125, "loss_aux_layer_10": 0.076904296875, "loss_aux_layer_11": 0.0819091796875, "loss_aux_layer_12": 0.087646484375, "loss_aux_layer_13": 0.094970703125, "loss_aux_layer_14": 0.105224609375, "loss_aux_layer_15": 0.115478515625, "loss_aux_layer_16": 0.1258544921875, "loss_aux_layer_17": 0.134033203125, "loss_aux_layer_18": 0.142333984375, "loss_aux_layer_19": 0.14453125, "loss_aux_layer_2": 0.06072998046875, "loss_aux_layer_20": 0.1513671875, "loss_aux_layer_21": 0.157958984375, "loss_aux_layer_22": 0.179443359375, "loss_aux_layer_23": 0.21923828125, "loss_aux_layer_3": 0.07275390625, "loss_aux_layer_4": 0.0760498046875, "loss_aux_layer_5": 0.077880859375, "loss_aux_layer_6": 0.0804443359375, "loss_aux_layer_7": 0.077392578125, "loss_aux_layer_8": 0.0765380859375, "loss_aux_layer_9": 0.075439453125, "step": 1829, "total_loss": 0.6860544085502625 }, { "epoch": 0.3623044941595724, "grad_norm": 1.152531385421753, "learning_rate": 5e-05, "llm_loss": 0.6677465885877609, "loss": 3.0698, "loss_aux_layer_0": 0.022613525390625, "loss_aux_layer_1": 0.04583740234375, "loss_aux_layer_10": 0.0743408203125, "loss_aux_layer_11": 0.0789794921875, "loss_aux_layer_12": 0.08447265625, "loss_aux_layer_13": 0.0909423828125, "loss_aux_layer_14": 0.100341796875, "loss_aux_layer_15": 0.1097412109375, "loss_aux_layer_16": 0.1199951171875, "loss_aux_layer_17": 0.1279296875, "loss_aux_layer_18": 0.136474609375, "loss_aux_layer_19": 0.139404296875, "loss_aux_layer_2": 0.0574951171875, "loss_aux_layer_20": 0.146728515625, "loss_aux_layer_21": 0.154052734375, "loss_aux_layer_22": 0.175048828125, "loss_aux_layer_23": 0.21337890625, "loss_aux_layer_3": 0.0687255859375, "loss_aux_layer_4": 0.072509765625, "loss_aux_layer_5": 0.0745849609375, "loss_aux_layer_6": 0.0775146484375, "loss_aux_layer_7": 0.074951171875, "loss_aux_layer_8": 0.07421875, "loss_aux_layer_9": 0.072998046875, "step": 1830, "total_loss": 0.7674406617879868 }, { "epoch": 0.36250247475747377, "grad_norm": 1.6500054597854614, "learning_rate": 5e-05, "llm_loss": 0.6060950458049774, "loss": 2.8339, "loss_aux_layer_0": 0.02337646484375, "loss_aux_layer_1": 0.0472412109375, "loss_aux_layer_10": 0.076171875, "loss_aux_layer_11": 0.0811767578125, "loss_aux_layer_12": 0.08642578125, "loss_aux_layer_13": 0.0927734375, "loss_aux_layer_14": 0.1025390625, "loss_aux_layer_15": 0.11279296875, "loss_aux_layer_16": 0.123291015625, "loss_aux_layer_17": 0.131591796875, "loss_aux_layer_18": 0.14013671875, "loss_aux_layer_19": 0.143310546875, "loss_aux_layer_2": 0.05902099609375, "loss_aux_layer_20": 0.150390625, "loss_aux_layer_21": 0.158203125, "loss_aux_layer_22": 0.180908203125, "loss_aux_layer_23": 0.221923828125, "loss_aux_layer_3": 0.07080078125, "loss_aux_layer_4": 0.073974609375, "loss_aux_layer_5": 0.0758056640625, "loss_aux_layer_6": 0.0789794921875, "loss_aux_layer_7": 0.0765380859375, "loss_aux_layer_8": 0.075439453125, "loss_aux_layer_9": 0.0743408203125, "step": 1831, "total_loss": 0.7084659487009048 }, { "epoch": 0.36270045535537515, "grad_norm": 1.2372305393218994, "learning_rate": 5e-05, "llm_loss": 0.611206442117691, "loss": 2.8726, "loss_aux_layer_0": 0.022186279296875, "loss_aux_layer_1": 0.0491943359375, "loss_aux_layer_10": 0.0810546875, "loss_aux_layer_11": 0.0858154296875, "loss_aux_layer_12": 0.0916748046875, "loss_aux_layer_13": 0.09912109375, "loss_aux_layer_14": 0.1092529296875, "loss_aux_layer_15": 0.1197509765625, "loss_aux_layer_16": 0.130126953125, "loss_aux_layer_17": 0.1376953125, "loss_aux_layer_18": 0.14599609375, "loss_aux_layer_19": 0.1484375, "loss_aux_layer_2": 0.06317138671875, "loss_aux_layer_20": 0.1552734375, "loss_aux_layer_21": 0.16259765625, "loss_aux_layer_22": 0.183837890625, "loss_aux_layer_23": 0.223388671875, "loss_aux_layer_3": 0.0755615234375, "loss_aux_layer_4": 0.0787353515625, "loss_aux_layer_5": 0.0806884765625, "loss_aux_layer_6": 0.083740234375, "loss_aux_layer_7": 0.0806884765625, "loss_aux_layer_8": 0.0802001953125, "loss_aux_layer_9": 0.0792236328125, "step": 1832, "total_loss": 0.7181623131036758 }, { "epoch": 0.3628984359532766, "grad_norm": 1.5205904245376587, "learning_rate": 5e-05, "llm_loss": 0.6387577205896378, "loss": 2.9682, "loss_aux_layer_0": 0.022674560546875, "loss_aux_layer_1": 0.0465087890625, "loss_aux_layer_10": 0.0760498046875, "loss_aux_layer_11": 0.080810546875, "loss_aux_layer_12": 0.0865478515625, "loss_aux_layer_13": 0.0938720703125, "loss_aux_layer_14": 0.1043701171875, "loss_aux_layer_15": 0.1143798828125, "loss_aux_layer_16": 0.12548828125, "loss_aux_layer_17": 0.1337890625, "loss_aux_layer_18": 0.142578125, "loss_aux_layer_19": 0.14599609375, "loss_aux_layer_2": 0.0594482421875, "loss_aux_layer_20": 0.1533203125, "loss_aux_layer_21": 0.160888671875, "loss_aux_layer_22": 0.1826171875, "loss_aux_layer_23": 0.22314453125, "loss_aux_layer_3": 0.0706787109375, "loss_aux_layer_4": 0.07373046875, "loss_aux_layer_5": 0.075927734375, "loss_aux_layer_6": 0.0791015625, "loss_aux_layer_7": 0.076416015625, "loss_aux_layer_8": 0.0758056640625, "loss_aux_layer_9": 0.074951171875, "step": 1833, "total_loss": 0.7420560419559479 }, { "epoch": 0.36309641655117797, "grad_norm": 1.3677558898925781, "learning_rate": 5e-05, "llm_loss": 0.6390814185142517, "loss": 3.0019, "loss_aux_layer_0": 0.022552490234375, "loss_aux_layer_1": 0.0526123046875, "loss_aux_layer_10": 0.0848388671875, "loss_aux_layer_11": 0.0904541015625, "loss_aux_layer_12": 0.096923828125, "loss_aux_layer_13": 0.1046142578125, "loss_aux_layer_14": 0.1151123046875, "loss_aux_layer_15": 0.125, "loss_aux_layer_16": 0.1357421875, "loss_aux_layer_17": 0.1435546875, "loss_aux_layer_18": 0.1513671875, "loss_aux_layer_19": 0.153076171875, "loss_aux_layer_2": 0.06646728515625, "loss_aux_layer_20": 0.15966796875, "loss_aux_layer_21": 0.165771484375, "loss_aux_layer_22": 0.189453125, "loss_aux_layer_23": 0.2294921875, "loss_aux_layer_3": 0.0792236328125, "loss_aux_layer_4": 0.0828857421875, "loss_aux_layer_5": 0.0849609375, "loss_aux_layer_6": 0.0882568359375, "loss_aux_layer_7": 0.0850830078125, "loss_aux_layer_8": 0.0838623046875, "loss_aux_layer_9": 0.082763671875, "step": 1834, "total_loss": 0.7504714876413345 }, { "epoch": 0.3632943971490794, "grad_norm": 1.4321075677871704, "learning_rate": 5e-05, "llm_loss": 0.6163971871137619, "loss": 2.872, "loss_aux_layer_0": 0.024200439453125, "loss_aux_layer_1": 0.04547119140625, "loss_aux_layer_10": 0.07373046875, "loss_aux_layer_11": 0.078369140625, "loss_aux_layer_12": 0.0843505859375, "loss_aux_layer_13": 0.0914306640625, "loss_aux_layer_14": 0.1021728515625, "loss_aux_layer_15": 0.1123046875, "loss_aux_layer_16": 0.1226806640625, "loss_aux_layer_17": 0.130859375, "loss_aux_layer_18": 0.140380859375, "loss_aux_layer_19": 0.14453125, "loss_aux_layer_2": 0.05865478515625, "loss_aux_layer_20": 0.15185546875, "loss_aux_layer_21": 0.159912109375, "loss_aux_layer_22": 0.1806640625, "loss_aux_layer_23": 0.2216796875, "loss_aux_layer_3": 0.069580078125, "loss_aux_layer_4": 0.0726318359375, "loss_aux_layer_5": 0.0743408203125, "loss_aux_layer_6": 0.0772705078125, "loss_aux_layer_7": 0.074462890625, "loss_aux_layer_8": 0.07373046875, "loss_aux_layer_9": 0.0723876953125, "step": 1835, "total_loss": 0.7180013358592987 }, { "epoch": 0.3634923777469808, "grad_norm": 1.9740681648254395, "learning_rate": 5e-05, "llm_loss": 0.6369890868663788, "loss": 2.9747, "loss_aux_layer_0": 0.0220947265625, "loss_aux_layer_1": 0.0494384765625, "loss_aux_layer_10": 0.0802001953125, "loss_aux_layer_11": 0.0853271484375, "loss_aux_layer_12": 0.0914306640625, "loss_aux_layer_13": 0.098388671875, "loss_aux_layer_14": 0.1083984375, "loss_aux_layer_15": 0.11865234375, "loss_aux_layer_16": 0.1290283203125, "loss_aux_layer_17": 0.136474609375, "loss_aux_layer_18": 0.145751953125, "loss_aux_layer_19": 0.14794921875, "loss_aux_layer_2": 0.06341552734375, "loss_aux_layer_20": 0.154296875, "loss_aux_layer_21": 0.162109375, "loss_aux_layer_22": 0.18310546875, "loss_aux_layer_23": 0.223388671875, "loss_aux_layer_3": 0.075927734375, "loss_aux_layer_4": 0.0791015625, "loss_aux_layer_5": 0.0814208984375, "loss_aux_layer_6": 0.084228515625, "loss_aux_layer_7": 0.0810546875, "loss_aux_layer_8": 0.080322265625, "loss_aux_layer_9": 0.0789794921875, "step": 1836, "total_loss": 0.7436645478010178 }, { "epoch": 0.3636903583448822, "grad_norm": 1.4493191242218018, "learning_rate": 5e-05, "llm_loss": 0.6057265996932983, "loss": 2.8198, "loss_aux_layer_0": 0.02227783203125, "loss_aux_layer_1": 0.04669189453125, "loss_aux_layer_10": 0.074462890625, "loss_aux_layer_11": 0.0791015625, "loss_aux_layer_12": 0.0843505859375, "loss_aux_layer_13": 0.090576171875, "loss_aux_layer_14": 0.099853515625, "loss_aux_layer_15": 0.10888671875, "loss_aux_layer_16": 0.1181640625, "loss_aux_layer_17": 0.1258544921875, "loss_aux_layer_18": 0.134033203125, "loss_aux_layer_19": 0.136962890625, "loss_aux_layer_2": 0.05914306640625, "loss_aux_layer_20": 0.144287109375, "loss_aux_layer_21": 0.15087890625, "loss_aux_layer_22": 0.171875, "loss_aux_layer_23": 0.208740234375, "loss_aux_layer_3": 0.07177734375, "loss_aux_layer_4": 0.0748291015625, "loss_aux_layer_5": 0.076171875, "loss_aux_layer_6": 0.078857421875, "loss_aux_layer_7": 0.07568359375, "loss_aux_layer_8": 0.0745849609375, "loss_aux_layer_9": 0.072998046875, "step": 1837, "total_loss": 0.7049558013677597 }, { "epoch": 0.3638883389427836, "grad_norm": 2.07554030418396, "learning_rate": 5e-05, "llm_loss": 0.6526096910238266, "loss": 3.0218, "loss_aux_layer_0": 0.022857666015625, "loss_aux_layer_1": 0.04718017578125, "loss_aux_layer_10": 0.0765380859375, "loss_aux_layer_11": 0.081298828125, "loss_aux_layer_12": 0.086669921875, "loss_aux_layer_13": 0.093017578125, "loss_aux_layer_14": 0.103271484375, "loss_aux_layer_15": 0.11328125, "loss_aux_layer_16": 0.1239013671875, "loss_aux_layer_17": 0.1318359375, "loss_aux_layer_18": 0.140869140625, "loss_aux_layer_19": 0.144287109375, "loss_aux_layer_2": 0.05914306640625, "loss_aux_layer_20": 0.15185546875, "loss_aux_layer_21": 0.159423828125, "loss_aux_layer_22": 0.181640625, "loss_aux_layer_23": 0.22314453125, "loss_aux_layer_3": 0.07080078125, "loss_aux_layer_4": 0.0740966796875, "loss_aux_layer_5": 0.076171875, "loss_aux_layer_6": 0.0794677734375, "loss_aux_layer_7": 0.0767822265625, "loss_aux_layer_8": 0.0760498046875, "loss_aux_layer_9": 0.074951171875, "step": 1838, "total_loss": 0.755448505282402 }, { "epoch": 0.364086319540685, "grad_norm": 1.977992057800293, "learning_rate": 5e-05, "llm_loss": 0.6515390574932098, "loss": 3.0046, "loss_aux_layer_0": 0.02203369140625, "loss_aux_layer_1": 0.046142578125, "loss_aux_layer_10": 0.0731201171875, "loss_aux_layer_11": 0.07763671875, "loss_aux_layer_12": 0.0828857421875, "loss_aux_layer_13": 0.0894775390625, "loss_aux_layer_14": 0.0994873046875, "loss_aux_layer_15": 0.10888671875, "loss_aux_layer_16": 0.119384765625, "loss_aux_layer_17": 0.126708984375, "loss_aux_layer_18": 0.1357421875, "loss_aux_layer_19": 0.1396484375, "loss_aux_layer_2": 0.05938720703125, "loss_aux_layer_20": 0.147705078125, "loss_aux_layer_21": 0.156494140625, "loss_aux_layer_22": 0.177490234375, "loss_aux_layer_23": 0.217041015625, "loss_aux_layer_3": 0.0692138671875, "loss_aux_layer_4": 0.072021484375, "loss_aux_layer_5": 0.07373046875, "loss_aux_layer_6": 0.0765380859375, "loss_aux_layer_7": 0.073974609375, "loss_aux_layer_8": 0.0732421875, "loss_aux_layer_9": 0.072021484375, "step": 1839, "total_loss": 0.751146212220192 }, { "epoch": 0.3642843001385864, "grad_norm": 1.2920997142791748, "learning_rate": 5e-05, "llm_loss": 0.6724869906902313, "loss": 3.0911, "loss_aux_layer_0": 0.0220947265625, "loss_aux_layer_1": 0.0479736328125, "loss_aux_layer_10": 0.073974609375, "loss_aux_layer_11": 0.0787353515625, "loss_aux_layer_12": 0.0841064453125, "loss_aux_layer_13": 0.0904541015625, "loss_aux_layer_14": 0.10009765625, "loss_aux_layer_15": 0.109130859375, "loss_aux_layer_16": 0.1195068359375, "loss_aux_layer_17": 0.12744140625, "loss_aux_layer_18": 0.13623046875, "loss_aux_layer_19": 0.14013671875, "loss_aux_layer_2": 0.05987548828125, "loss_aux_layer_20": 0.147705078125, "loss_aux_layer_21": 0.155517578125, "loss_aux_layer_22": 0.177001953125, "loss_aux_layer_23": 0.216552734375, "loss_aux_layer_3": 0.07080078125, "loss_aux_layer_4": 0.07373046875, "loss_aux_layer_5": 0.0753173828125, "loss_aux_layer_6": 0.078125, "loss_aux_layer_7": 0.0753173828125, "loss_aux_layer_8": 0.074462890625, "loss_aux_layer_9": 0.07275390625, "step": 1840, "total_loss": 0.7727719098329544 }, { "epoch": 0.3644822807364878, "grad_norm": 1.3395922183990479, "learning_rate": 5e-05, "llm_loss": 0.5250381827354431, "loss": 2.5002, "loss_aux_layer_0": 0.023834228515625, "loss_aux_layer_1": 0.04541015625, "loss_aux_layer_10": 0.072021484375, "loss_aux_layer_11": 0.0765380859375, "loss_aux_layer_12": 0.08203125, "loss_aux_layer_13": 0.08837890625, "loss_aux_layer_14": 0.098876953125, "loss_aux_layer_15": 0.109130859375, "loss_aux_layer_16": 0.119873046875, "loss_aux_layer_17": 0.12841796875, "loss_aux_layer_18": 0.1376953125, "loss_aux_layer_19": 0.142333984375, "loss_aux_layer_2": 0.05810546875, "loss_aux_layer_20": 0.15087890625, "loss_aux_layer_21": 0.159423828125, "loss_aux_layer_22": 0.181640625, "loss_aux_layer_23": 0.22314453125, "loss_aux_layer_3": 0.0682373046875, "loss_aux_layer_4": 0.07080078125, "loss_aux_layer_5": 0.07275390625, "loss_aux_layer_6": 0.0755615234375, "loss_aux_layer_7": 0.072998046875, "loss_aux_layer_8": 0.0718994140625, "loss_aux_layer_9": 0.0706787109375, "step": 1841, "total_loss": 0.6250433027744293 }, { "epoch": 0.36468026133438924, "grad_norm": 1.6314494609832764, "learning_rate": 5e-05, "llm_loss": 0.6690325886011124, "loss": 3.0901, "loss_aux_layer_0": 0.0223388671875, "loss_aux_layer_1": 0.0477294921875, "loss_aux_layer_10": 0.0777587890625, "loss_aux_layer_11": 0.0826416015625, "loss_aux_layer_12": 0.0880126953125, "loss_aux_layer_13": 0.0946044921875, "loss_aux_layer_14": 0.10498046875, "loss_aux_layer_15": 0.1148681640625, "loss_aux_layer_16": 0.125244140625, "loss_aux_layer_17": 0.1326904296875, "loss_aux_layer_18": 0.141357421875, "loss_aux_layer_19": 0.14453125, "loss_aux_layer_2": 0.06072998046875, "loss_aux_layer_20": 0.1513671875, "loss_aux_layer_21": 0.158447265625, "loss_aux_layer_22": 0.18017578125, "loss_aux_layer_23": 0.21875, "loss_aux_layer_3": 0.0723876953125, "loss_aux_layer_4": 0.075927734375, "loss_aux_layer_5": 0.0777587890625, "loss_aux_layer_6": 0.0809326171875, "loss_aux_layer_7": 0.0777587890625, "loss_aux_layer_8": 0.0771484375, "loss_aux_layer_9": 0.075927734375, "step": 1842, "total_loss": 0.7725216299295425 }, { "epoch": 0.3648782419322906, "grad_norm": 1.2131503820419312, "learning_rate": 5e-05, "llm_loss": 0.5990197509527206, "loss": 2.8105, "loss_aux_layer_0": 0.022064208984375, "loss_aux_layer_1": 0.04962158203125, "loss_aux_layer_10": 0.07763671875, "loss_aux_layer_11": 0.0826416015625, "loss_aux_layer_12": 0.0882568359375, "loss_aux_layer_13": 0.0948486328125, "loss_aux_layer_14": 0.1051025390625, "loss_aux_layer_15": 0.1151123046875, "loss_aux_layer_16": 0.12548828125, "loss_aux_layer_17": 0.1328125, "loss_aux_layer_18": 0.140380859375, "loss_aux_layer_19": 0.142822265625, "loss_aux_layer_2": 0.0628662109375, "loss_aux_layer_20": 0.14990234375, "loss_aux_layer_21": 0.156982421875, "loss_aux_layer_22": 0.179443359375, "loss_aux_layer_23": 0.217529296875, "loss_aux_layer_3": 0.07373046875, "loss_aux_layer_4": 0.076904296875, "loss_aux_layer_5": 0.0784912109375, "loss_aux_layer_6": 0.08154296875, "loss_aux_layer_7": 0.0787353515625, "loss_aux_layer_8": 0.0777587890625, "loss_aux_layer_9": 0.0765380859375, "step": 1843, "total_loss": 0.7026272267103195 }, { "epoch": 0.36507622253019206, "grad_norm": 1.4957069158554077, "learning_rate": 5e-05, "llm_loss": 0.5372218117117882, "loss": 2.5652, "loss_aux_layer_0": 0.0230712890625, "loss_aux_layer_1": 0.04864501953125, "loss_aux_layer_10": 0.0770263671875, "loss_aux_layer_11": 0.0819091796875, "loss_aux_layer_12": 0.0875244140625, "loss_aux_layer_13": 0.094482421875, "loss_aux_layer_14": 0.104736328125, "loss_aux_layer_15": 0.1146240234375, "loss_aux_layer_16": 0.125244140625, "loss_aux_layer_17": 0.13330078125, "loss_aux_layer_18": 0.142578125, "loss_aux_layer_19": 0.145263671875, "loss_aux_layer_2": 0.0631103515625, "loss_aux_layer_20": 0.15283203125, "loss_aux_layer_21": 0.15966796875, "loss_aux_layer_22": 0.180419921875, "loss_aux_layer_23": 0.220458984375, "loss_aux_layer_3": 0.073974609375, "loss_aux_layer_4": 0.0770263671875, "loss_aux_layer_5": 0.0789794921875, "loss_aux_layer_6": 0.081787109375, "loss_aux_layer_7": 0.07861328125, "loss_aux_layer_8": 0.077392578125, "loss_aux_layer_9": 0.0755615234375, "step": 1844, "total_loss": 0.6412959843873978 }, { "epoch": 0.36527420312809344, "grad_norm": 1.1493011713027954, "learning_rate": 5e-05, "llm_loss": 0.647812232375145, "loss": 2.9909, "loss_aux_layer_0": 0.022796630859375, "loss_aux_layer_1": 0.04583740234375, "loss_aux_layer_10": 0.0738525390625, "loss_aux_layer_11": 0.0784912109375, "loss_aux_layer_12": 0.0843505859375, "loss_aux_layer_13": 0.0908203125, "loss_aux_layer_14": 0.10107421875, "loss_aux_layer_15": 0.111083984375, "loss_aux_layer_16": 0.12158203125, "loss_aux_layer_17": 0.129150390625, "loss_aux_layer_18": 0.137939453125, "loss_aux_layer_19": 0.14013671875, "loss_aux_layer_2": 0.05810546875, "loss_aux_layer_20": 0.14794921875, "loss_aux_layer_21": 0.15478515625, "loss_aux_layer_22": 0.17529296875, "loss_aux_layer_23": 0.214599609375, "loss_aux_layer_3": 0.0689697265625, "loss_aux_layer_4": 0.07177734375, "loss_aux_layer_5": 0.0736083984375, "loss_aux_layer_6": 0.0765380859375, "loss_aux_layer_7": 0.0738525390625, "loss_aux_layer_8": 0.072998046875, "loss_aux_layer_9": 0.072021484375, "step": 1845, "total_loss": 0.7477263361215591 }, { "epoch": 0.3654721837259948, "grad_norm": 1.4213120937347412, "learning_rate": 5e-05, "llm_loss": 0.528722807765007, "loss": 2.5191, "loss_aux_layer_0": 0.022430419921875, "loss_aux_layer_1": 0.04730224609375, "loss_aux_layer_10": 0.0750732421875, "loss_aux_layer_11": 0.0797119140625, "loss_aux_layer_12": 0.08544921875, "loss_aux_layer_13": 0.0919189453125, "loss_aux_layer_14": 0.1014404296875, "loss_aux_layer_15": 0.111083984375, "loss_aux_layer_16": 0.1214599609375, "loss_aux_layer_17": 0.12890625, "loss_aux_layer_18": 0.137939453125, "loss_aux_layer_19": 0.140380859375, "loss_aux_layer_2": 0.05999755859375, "loss_aux_layer_20": 0.14794921875, "loss_aux_layer_21": 0.155517578125, "loss_aux_layer_22": 0.17724609375, "loss_aux_layer_23": 0.216552734375, "loss_aux_layer_3": 0.0709228515625, "loss_aux_layer_4": 0.0743408203125, "loss_aux_layer_5": 0.076171875, "loss_aux_layer_6": 0.0789794921875, "loss_aux_layer_7": 0.0760498046875, "loss_aux_layer_8": 0.0751953125, "loss_aux_layer_9": 0.07373046875, "step": 1846, "total_loss": 0.629771277308464 }, { "epoch": 0.36567016432389626, "grad_norm": 1.1525278091430664, "learning_rate": 5e-05, "llm_loss": 0.5810021609067917, "loss": 2.7291, "loss_aux_layer_0": 0.021148681640625, "loss_aux_layer_1": 0.04620361328125, "loss_aux_layer_10": 0.0750732421875, "loss_aux_layer_11": 0.080078125, "loss_aux_layer_12": 0.0858154296875, "loss_aux_layer_13": 0.0926513671875, "loss_aux_layer_14": 0.102783203125, "loss_aux_layer_15": 0.112548828125, "loss_aux_layer_16": 0.123291015625, "loss_aux_layer_17": 0.131103515625, "loss_aux_layer_18": 0.14013671875, "loss_aux_layer_19": 0.142822265625, "loss_aux_layer_2": 0.058349609375, "loss_aux_layer_20": 0.14990234375, "loss_aux_layer_21": 0.156005859375, "loss_aux_layer_22": 0.1767578125, "loss_aux_layer_23": 0.216552734375, "loss_aux_layer_3": 0.0697021484375, "loss_aux_layer_4": 0.0728759765625, "loss_aux_layer_5": 0.074951171875, "loss_aux_layer_6": 0.077880859375, "loss_aux_layer_7": 0.0753173828125, "loss_aux_layer_8": 0.07470703125, "loss_aux_layer_9": 0.0736083984375, "step": 1847, "total_loss": 0.68228380382061 }, { "epoch": 0.36586814492179764, "grad_norm": 1.0037330389022827, "learning_rate": 5e-05, "llm_loss": 0.5872714519500732, "loss": 2.7495, "loss_aux_layer_0": 0.021331787109375, "loss_aux_layer_1": 0.04498291015625, "loss_aux_layer_10": 0.073486328125, "loss_aux_layer_11": 0.078125, "loss_aux_layer_12": 0.084228515625, "loss_aux_layer_13": 0.09130859375, "loss_aux_layer_14": 0.1015625, "loss_aux_layer_15": 0.111572265625, "loss_aux_layer_16": 0.12255859375, "loss_aux_layer_17": 0.1312255859375, "loss_aux_layer_18": 0.140380859375, "loss_aux_layer_19": 0.142822265625, "loss_aux_layer_2": 0.056640625, "loss_aux_layer_20": 0.150146484375, "loss_aux_layer_21": 0.156005859375, "loss_aux_layer_22": 0.17529296875, "loss_aux_layer_23": 0.213623046875, "loss_aux_layer_3": 0.0679931640625, "loss_aux_layer_4": 0.0712890625, "loss_aux_layer_5": 0.0733642578125, "loss_aux_layer_6": 0.0762939453125, "loss_aux_layer_7": 0.07366943359375, "loss_aux_layer_8": 0.07293701171875, "loss_aux_layer_9": 0.07183837890625, "step": 1848, "total_loss": 0.6873794198036194 }, { "epoch": 0.3660661255196991, "grad_norm": 1.1310309171676636, "learning_rate": 5e-05, "llm_loss": 0.5730343014001846, "loss": 2.6914, "loss_aux_layer_0": 0.023223876953125, "loss_aux_layer_1": 0.04669189453125, "loss_aux_layer_10": 0.0726318359375, "loss_aux_layer_11": 0.0775146484375, "loss_aux_layer_12": 0.0828857421875, "loss_aux_layer_13": 0.0894775390625, "loss_aux_layer_14": 0.099365234375, "loss_aux_layer_15": 0.1094970703125, "loss_aux_layer_16": 0.1201171875, "loss_aux_layer_17": 0.1278076171875, "loss_aux_layer_18": 0.13671875, "loss_aux_layer_19": 0.14013671875, "loss_aux_layer_2": 0.0582275390625, "loss_aux_layer_20": 0.147705078125, "loss_aux_layer_21": 0.15625, "loss_aux_layer_22": 0.1787109375, "loss_aux_layer_23": 0.21875, "loss_aux_layer_3": 0.06884765625, "loss_aux_layer_4": 0.07177734375, "loss_aux_layer_5": 0.07373046875, "loss_aux_layer_6": 0.076904296875, "loss_aux_layer_7": 0.073974609375, "loss_aux_layer_8": 0.0728759765625, "loss_aux_layer_9": 0.0711669921875, "step": 1849, "total_loss": 0.6728382408618927 }, { "epoch": 0.36626410611760046, "grad_norm": 1.0213338136672974, "learning_rate": 5e-05, "llm_loss": 0.5577910840511322, "loss": 2.6461, "loss_aux_layer_0": 0.024627685546875, "loss_aux_layer_1": 0.04937744140625, "loss_aux_layer_10": 0.0772705078125, "loss_aux_layer_11": 0.0821533203125, "loss_aux_layer_12": 0.0882568359375, "loss_aux_layer_13": 0.09521484375, "loss_aux_layer_14": 0.1053466796875, "loss_aux_layer_15": 0.115478515625, "loss_aux_layer_16": 0.125732421875, "loss_aux_layer_17": 0.133544921875, "loss_aux_layer_18": 0.14208984375, "loss_aux_layer_19": 0.143798828125, "loss_aux_layer_2": 0.0611572265625, "loss_aux_layer_20": 0.15087890625, "loss_aux_layer_21": 0.157470703125, "loss_aux_layer_22": 0.178466796875, "loss_aux_layer_23": 0.217041015625, "loss_aux_layer_3": 0.0728759765625, "loss_aux_layer_4": 0.0762939453125, "loss_aux_layer_5": 0.078369140625, "loss_aux_layer_6": 0.08154296875, "loss_aux_layer_7": 0.078857421875, "loss_aux_layer_8": 0.0777587890625, "loss_aux_layer_9": 0.076171875, "step": 1850, "total_loss": 0.6615182608366013 }, { "epoch": 0.3664620867155019, "grad_norm": 0.9441523551940918, "learning_rate": 5e-05, "llm_loss": 0.5788385570049286, "loss": 2.7187, "loss_aux_layer_0": 0.02264404296875, "loss_aux_layer_1": 0.04669189453125, "loss_aux_layer_10": 0.074951171875, "loss_aux_layer_11": 0.079833984375, "loss_aux_layer_12": 0.085205078125, "loss_aux_layer_13": 0.0914306640625, "loss_aux_layer_14": 0.10107421875, "loss_aux_layer_15": 0.110595703125, "loss_aux_layer_16": 0.1207275390625, "loss_aux_layer_17": 0.128173828125, "loss_aux_layer_18": 0.13720703125, "loss_aux_layer_19": 0.140625, "loss_aux_layer_2": 0.05908203125, "loss_aux_layer_20": 0.1484375, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.17822265625, "loss_aux_layer_23": 0.218505859375, "loss_aux_layer_3": 0.06982421875, "loss_aux_layer_4": 0.0732421875, "loss_aux_layer_5": 0.075439453125, "loss_aux_layer_6": 0.07861328125, "loss_aux_layer_7": 0.0755615234375, "loss_aux_layer_8": 0.0748291015625, "loss_aux_layer_9": 0.073486328125, "step": 1851, "total_loss": 0.6796864420175552 }, { "epoch": 0.3666600673134033, "grad_norm": 0.9382503628730774, "learning_rate": 5e-05, "llm_loss": 0.6729305982589722, "loss": 3.1018, "loss_aux_layer_0": 0.02392578125, "loss_aux_layer_1": 0.04693603515625, "loss_aux_layer_10": 0.0751953125, "loss_aux_layer_11": 0.0799560546875, "loss_aux_layer_12": 0.0855712890625, "loss_aux_layer_13": 0.0921630859375, "loss_aux_layer_14": 0.1026611328125, "loss_aux_layer_15": 0.113037109375, "loss_aux_layer_16": 0.123779296875, "loss_aux_layer_17": 0.1318359375, "loss_aux_layer_18": 0.139892578125, "loss_aux_layer_19": 0.1435546875, "loss_aux_layer_2": 0.06024169921875, "loss_aux_layer_20": 0.151611328125, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.181884765625, "loss_aux_layer_23": 0.222412109375, "loss_aux_layer_3": 0.0714111328125, "loss_aux_layer_4": 0.0745849609375, "loss_aux_layer_5": 0.076171875, "loss_aux_layer_6": 0.07958984375, "loss_aux_layer_7": 0.076416015625, "loss_aux_layer_8": 0.0751953125, "loss_aux_layer_9": 0.0736083984375, "step": 1852, "total_loss": 0.7754548937082291 }, { "epoch": 0.3668580479113047, "grad_norm": 1.1491056680679321, "learning_rate": 5e-05, "llm_loss": 0.6139324903488159, "loss": 2.8638, "loss_aux_layer_0": 0.02294921875, "loss_aux_layer_1": 0.04681396484375, "loss_aux_layer_10": 0.075927734375, "loss_aux_layer_11": 0.080810546875, "loss_aux_layer_12": 0.086669921875, "loss_aux_layer_13": 0.0931396484375, "loss_aux_layer_14": 0.102783203125, "loss_aux_layer_15": 0.112548828125, "loss_aux_layer_16": 0.1236572265625, "loss_aux_layer_17": 0.130859375, "loss_aux_layer_18": 0.139404296875, "loss_aux_layer_19": 0.141845703125, "loss_aux_layer_2": 0.05975341796875, "loss_aux_layer_20": 0.1494140625, "loss_aux_layer_21": 0.1572265625, "loss_aux_layer_22": 0.178955078125, "loss_aux_layer_23": 0.21875, "loss_aux_layer_3": 0.0714111328125, "loss_aux_layer_4": 0.0743408203125, "loss_aux_layer_5": 0.0760498046875, "loss_aux_layer_6": 0.0792236328125, "loss_aux_layer_7": 0.076416015625, "loss_aux_layer_8": 0.0755615234375, "loss_aux_layer_9": 0.0745849609375, "step": 1853, "total_loss": 0.7159624546766281 }, { "epoch": 0.3670560285092061, "grad_norm": 1.360874056816101, "learning_rate": 5e-05, "llm_loss": 0.5297851115465164, "loss": 2.5373, "loss_aux_layer_0": 0.023223876953125, "loss_aux_layer_1": 0.048828125, "loss_aux_layer_10": 0.078369140625, "loss_aux_layer_11": 0.08349609375, "loss_aux_layer_12": 0.0889892578125, "loss_aux_layer_13": 0.0958251953125, "loss_aux_layer_14": 0.1058349609375, "loss_aux_layer_15": 0.115966796875, "loss_aux_layer_16": 0.1258544921875, "loss_aux_layer_17": 0.13330078125, "loss_aux_layer_18": 0.1416015625, "loss_aux_layer_19": 0.143310546875, "loss_aux_layer_2": 0.0625, "loss_aux_layer_20": 0.150634765625, "loss_aux_layer_21": 0.158935546875, "loss_aux_layer_22": 0.181396484375, "loss_aux_layer_23": 0.220947265625, "loss_aux_layer_3": 0.0745849609375, "loss_aux_layer_4": 0.077880859375, "loss_aux_layer_5": 0.07958984375, "loss_aux_layer_6": 0.082763671875, "loss_aux_layer_7": 0.07958984375, "loss_aux_layer_8": 0.0782470703125, "loss_aux_layer_9": 0.0767822265625, "step": 1854, "total_loss": 0.6343241930007935 }, { "epoch": 0.3672540091071075, "grad_norm": 1.281448245048523, "learning_rate": 5e-05, "llm_loss": 0.6560031771659851, "loss": 3.0219, "loss_aux_layer_0": 0.0223388671875, "loss_aux_layer_1": 0.044677734375, "loss_aux_layer_10": 0.073486328125, "loss_aux_layer_11": 0.077880859375, "loss_aux_layer_12": 0.083251953125, "loss_aux_layer_13": 0.0897216796875, "loss_aux_layer_14": 0.0997314453125, "loss_aux_layer_15": 0.109130859375, "loss_aux_layer_16": 0.119384765625, "loss_aux_layer_17": 0.1279296875, "loss_aux_layer_18": 0.135986328125, "loss_aux_layer_19": 0.1396484375, "loss_aux_layer_2": 0.05682373046875, "loss_aux_layer_20": 0.147705078125, "loss_aux_layer_21": 0.1552734375, "loss_aux_layer_22": 0.177734375, "loss_aux_layer_23": 0.217529296875, "loss_aux_layer_3": 0.0684814453125, "loss_aux_layer_4": 0.0716552734375, "loss_aux_layer_5": 0.073486328125, "loss_aux_layer_6": 0.0767822265625, "loss_aux_layer_7": 0.07421875, "loss_aux_layer_8": 0.0731201171875, "loss_aux_layer_9": 0.072021484375, "step": 1855, "total_loss": 0.7554665058851242 }, { "epoch": 0.3674519897050089, "grad_norm": 0.9384003281593323, "learning_rate": 5e-05, "llm_loss": 0.6127926856279373, "loss": 2.8441, "loss_aux_layer_0": 0.022186279296875, "loss_aux_layer_1": 0.04345703125, "loss_aux_layer_10": 0.0706787109375, "loss_aux_layer_11": 0.0751953125, "loss_aux_layer_12": 0.0810546875, "loss_aux_layer_13": 0.0877685546875, "loss_aux_layer_14": 0.097900390625, "loss_aux_layer_15": 0.108642578125, "loss_aux_layer_16": 0.1197509765625, "loss_aux_layer_17": 0.128173828125, "loss_aux_layer_18": 0.1376953125, "loss_aux_layer_19": 0.1416015625, "loss_aux_layer_2": 0.05462646484375, "loss_aux_layer_20": 0.148681640625, "loss_aux_layer_21": 0.15673828125, "loss_aux_layer_22": 0.1767578125, "loss_aux_layer_23": 0.21630859375, "loss_aux_layer_3": 0.0657958984375, "loss_aux_layer_4": 0.0687255859375, "loss_aux_layer_5": 0.07080078125, "loss_aux_layer_6": 0.07373046875, "loss_aux_layer_7": 0.0711669921875, "loss_aux_layer_8": 0.0701904296875, "loss_aux_layer_9": 0.069091796875, "step": 1856, "total_loss": 0.711020827293396 }, { "epoch": 0.3676499703029103, "grad_norm": 0.9241388440132141, "learning_rate": 5e-05, "llm_loss": 0.6254349052906036, "loss": 2.9164, "loss_aux_layer_0": 0.023162841796875, "loss_aux_layer_1": 0.04840087890625, "loss_aux_layer_10": 0.0780029296875, "loss_aux_layer_11": 0.08349609375, "loss_aux_layer_12": 0.0892333984375, "loss_aux_layer_13": 0.09619140625, "loss_aux_layer_14": 0.105712890625, "loss_aux_layer_15": 0.11474609375, "loss_aux_layer_16": 0.125244140625, "loss_aux_layer_17": 0.1319580078125, "loss_aux_layer_18": 0.14013671875, "loss_aux_layer_19": 0.142333984375, "loss_aux_layer_2": 0.06121826171875, "loss_aux_layer_20": 0.149169921875, "loss_aux_layer_21": 0.157470703125, "loss_aux_layer_22": 0.17919921875, "loss_aux_layer_23": 0.218017578125, "loss_aux_layer_3": 0.0731201171875, "loss_aux_layer_4": 0.0767822265625, "loss_aux_layer_5": 0.078857421875, "loss_aux_layer_6": 0.0821533203125, "loss_aux_layer_7": 0.0791015625, "loss_aux_layer_8": 0.0780029296875, "loss_aux_layer_9": 0.0765380859375, "step": 1857, "total_loss": 0.7290964722633362 }, { "epoch": 0.36784795090081174, "grad_norm": 0.9119279980659485, "learning_rate": 5e-05, "llm_loss": 0.602703720331192, "loss": 2.826, "loss_aux_layer_0": 0.02423095703125, "loss_aux_layer_1": 0.04815673828125, "loss_aux_layer_10": 0.07763671875, "loss_aux_layer_11": 0.0826416015625, "loss_aux_layer_12": 0.0885009765625, "loss_aux_layer_13": 0.0953369140625, "loss_aux_layer_14": 0.1053466796875, "loss_aux_layer_15": 0.115234375, "loss_aux_layer_16": 0.1256103515625, "loss_aux_layer_17": 0.1337890625, "loss_aux_layer_18": 0.14208984375, "loss_aux_layer_19": 0.14453125, "loss_aux_layer_2": 0.06121826171875, "loss_aux_layer_20": 0.15185546875, "loss_aux_layer_21": 0.157958984375, "loss_aux_layer_22": 0.1787109375, "loss_aux_layer_23": 0.216796875, "loss_aux_layer_3": 0.0731201171875, "loss_aux_layer_4": 0.0765380859375, "loss_aux_layer_5": 0.0784912109375, "loss_aux_layer_6": 0.0811767578125, "loss_aux_layer_7": 0.078369140625, "loss_aux_layer_8": 0.0775146484375, "loss_aux_layer_9": 0.0762939453125, "step": 1858, "total_loss": 0.7065013349056244 }, { "epoch": 0.3680459314987131, "grad_norm": 0.8318908214569092, "learning_rate": 5e-05, "llm_loss": 0.5234675630927086, "loss": 2.5031, "loss_aux_layer_0": 0.022186279296875, "loss_aux_layer_1": 0.04656982421875, "loss_aux_layer_10": 0.0762939453125, "loss_aux_layer_11": 0.0810546875, "loss_aux_layer_12": 0.086669921875, "loss_aux_layer_13": 0.0928955078125, "loss_aux_layer_14": 0.1024169921875, "loss_aux_layer_15": 0.1123046875, "loss_aux_layer_16": 0.1224365234375, "loss_aux_layer_17": 0.130615234375, "loss_aux_layer_18": 0.139404296875, "loss_aux_layer_19": 0.142578125, "loss_aux_layer_2": 0.05889892578125, "loss_aux_layer_20": 0.150146484375, "loss_aux_layer_21": 0.158935546875, "loss_aux_layer_22": 0.1806640625, "loss_aux_layer_23": 0.22119140625, "loss_aux_layer_3": 0.07080078125, "loss_aux_layer_4": 0.074462890625, "loss_aux_layer_5": 0.0765380859375, "loss_aux_layer_6": 0.079833984375, "loss_aux_layer_7": 0.0771484375, "loss_aux_layer_8": 0.0762939453125, "loss_aux_layer_9": 0.0748291015625, "step": 1859, "total_loss": 0.6257836073637009 }, { "epoch": 0.36824391209661456, "grad_norm": 1.35850989818573, "learning_rate": 5e-05, "llm_loss": 0.6511708199977875, "loss": 3.0204, "loss_aux_layer_0": 0.022857666015625, "loss_aux_layer_1": 0.049072265625, "loss_aux_layer_10": 0.077880859375, "loss_aux_layer_11": 0.0830078125, "loss_aux_layer_12": 0.0888671875, "loss_aux_layer_13": 0.0953369140625, "loss_aux_layer_14": 0.1048583984375, "loss_aux_layer_15": 0.1143798828125, "loss_aux_layer_16": 0.1243896484375, "loss_aux_layer_17": 0.132080078125, "loss_aux_layer_18": 0.140380859375, "loss_aux_layer_19": 0.142822265625, "loss_aux_layer_2": 0.061767578125, "loss_aux_layer_20": 0.150634765625, "loss_aux_layer_21": 0.157958984375, "loss_aux_layer_22": 0.1806640625, "loss_aux_layer_23": 0.221435546875, "loss_aux_layer_3": 0.0738525390625, "loss_aux_layer_4": 0.0772705078125, "loss_aux_layer_5": 0.0791015625, "loss_aux_layer_6": 0.0821533203125, "loss_aux_layer_7": 0.079345703125, "loss_aux_layer_8": 0.078125, "loss_aux_layer_9": 0.0765380859375, "step": 1860, "total_loss": 0.7550984174013138 }, { "epoch": 0.36844189269451594, "grad_norm": 1.3004724979400635, "learning_rate": 5e-05, "llm_loss": 0.6377564519643784, "loss": 2.961, "loss_aux_layer_0": 0.02325439453125, "loss_aux_layer_1": 0.04632568359375, "loss_aux_layer_10": 0.0755615234375, "loss_aux_layer_11": 0.0804443359375, "loss_aux_layer_12": 0.0860595703125, "loss_aux_layer_13": 0.0928955078125, "loss_aux_layer_14": 0.1029052734375, "loss_aux_layer_15": 0.113037109375, "loss_aux_layer_16": 0.1234130859375, "loss_aux_layer_17": 0.131591796875, "loss_aux_layer_18": 0.14013671875, "loss_aux_layer_19": 0.143798828125, "loss_aux_layer_2": 0.0595703125, "loss_aux_layer_20": 0.151123046875, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.181640625, "loss_aux_layer_23": 0.22216796875, "loss_aux_layer_3": 0.0709228515625, "loss_aux_layer_4": 0.0743408203125, "loss_aux_layer_5": 0.0760498046875, "loss_aux_layer_6": 0.0792236328125, "loss_aux_layer_7": 0.0762939453125, "loss_aux_layer_8": 0.075439453125, "loss_aux_layer_9": 0.0740966796875, "step": 1861, "total_loss": 0.7402428984642029 }, { "epoch": 0.3686398732924173, "grad_norm": 1.1654223203659058, "learning_rate": 5e-05, "llm_loss": 0.5174256935715675, "loss": 2.4689, "loss_aux_layer_0": 0.02197265625, "loss_aux_layer_1": 0.04705810546875, "loss_aux_layer_10": 0.0745849609375, "loss_aux_layer_11": 0.079345703125, "loss_aux_layer_12": 0.0849609375, "loss_aux_layer_13": 0.0914306640625, "loss_aux_layer_14": 0.1007080078125, "loss_aux_layer_15": 0.1099853515625, "loss_aux_layer_16": 0.119384765625, "loss_aux_layer_17": 0.12744140625, "loss_aux_layer_18": 0.1357421875, "loss_aux_layer_19": 0.1376953125, "loss_aux_layer_2": 0.0596923828125, "loss_aux_layer_20": 0.14501953125, "loss_aux_layer_21": 0.15283203125, "loss_aux_layer_22": 0.173828125, "loss_aux_layer_23": 0.212158203125, "loss_aux_layer_3": 0.0704345703125, "loss_aux_layer_4": 0.073486328125, "loss_aux_layer_5": 0.0753173828125, "loss_aux_layer_6": 0.078369140625, "loss_aux_layer_7": 0.07568359375, "loss_aux_layer_8": 0.07470703125, "loss_aux_layer_9": 0.0732421875, "step": 1862, "total_loss": 0.6172341108322144 }, { "epoch": 0.36883785389031876, "grad_norm": 1.1159684658050537, "learning_rate": 5e-05, "llm_loss": 0.6546297818422318, "loss": 3.0171, "loss_aux_layer_0": 0.02215576171875, "loss_aux_layer_1": 0.0450439453125, "loss_aux_layer_10": 0.072998046875, "loss_aux_layer_11": 0.077880859375, "loss_aux_layer_12": 0.08349609375, "loss_aux_layer_13": 0.0897216796875, "loss_aux_layer_14": 0.099609375, "loss_aux_layer_15": 0.1092529296875, "loss_aux_layer_16": 0.119384765625, "loss_aux_layer_17": 0.1279296875, "loss_aux_layer_18": 0.1368408203125, "loss_aux_layer_19": 0.140380859375, "loss_aux_layer_2": 0.0576171875, "loss_aux_layer_20": 0.148193359375, "loss_aux_layer_21": 0.156494140625, "loss_aux_layer_22": 0.17822265625, "loss_aux_layer_23": 0.21826171875, "loss_aux_layer_3": 0.0684814453125, "loss_aux_layer_4": 0.071533203125, "loss_aux_layer_5": 0.073486328125, "loss_aux_layer_6": 0.0765380859375, "loss_aux_layer_7": 0.0738525390625, "loss_aux_layer_8": 0.0728759765625, "loss_aux_layer_9": 0.0716552734375, "step": 1863, "total_loss": 0.7542858719825745 }, { "epoch": 0.36903583448822014, "grad_norm": 0.8888379335403442, "learning_rate": 5e-05, "llm_loss": 0.5441784411668777, "loss": 2.5786, "loss_aux_layer_0": 0.022552490234375, "loss_aux_layer_1": 0.04486083984375, "loss_aux_layer_10": 0.0728759765625, "loss_aux_layer_11": 0.077880859375, "loss_aux_layer_12": 0.0836181640625, "loss_aux_layer_13": 0.090576171875, "loss_aux_layer_14": 0.1007080078125, "loss_aux_layer_15": 0.1112060546875, "loss_aux_layer_16": 0.12158203125, "loss_aux_layer_17": 0.1302490234375, "loss_aux_layer_18": 0.1396484375, "loss_aux_layer_19": 0.143310546875, "loss_aux_layer_2": 0.056396484375, "loss_aux_layer_20": 0.1513671875, "loss_aux_layer_21": 0.158447265625, "loss_aux_layer_22": 0.179931640625, "loss_aux_layer_23": 0.22021484375, "loss_aux_layer_3": 0.0677490234375, "loss_aux_layer_4": 0.0709228515625, "loss_aux_layer_5": 0.0731201171875, "loss_aux_layer_6": 0.076171875, "loss_aux_layer_7": 0.0736083984375, "loss_aux_layer_8": 0.072509765625, "loss_aux_layer_9": 0.07147216796875, "step": 1864, "total_loss": 0.6446489840745926 }, { "epoch": 0.3692338150861216, "grad_norm": 0.9234676957130432, "learning_rate": 5e-05, "llm_loss": 0.6181643307209015, "loss": 2.8732, "loss_aux_layer_0": 0.02410888671875, "loss_aux_layer_1": 0.0469970703125, "loss_aux_layer_10": 0.073486328125, "loss_aux_layer_11": 0.0782470703125, "loss_aux_layer_12": 0.0836181640625, "loss_aux_layer_13": 0.0899658203125, "loss_aux_layer_14": 0.0999755859375, "loss_aux_layer_15": 0.1097412109375, "loss_aux_layer_16": 0.120361328125, "loss_aux_layer_17": 0.128173828125, "loss_aux_layer_18": 0.136962890625, "loss_aux_layer_19": 0.1396484375, "loss_aux_layer_2": 0.0595703125, "loss_aux_layer_20": 0.14697265625, "loss_aux_layer_21": 0.15478515625, "loss_aux_layer_22": 0.17626953125, "loss_aux_layer_23": 0.21484375, "loss_aux_layer_3": 0.0704345703125, "loss_aux_layer_4": 0.0732421875, "loss_aux_layer_5": 0.0751953125, "loss_aux_layer_6": 0.078125, "loss_aux_layer_7": 0.075439453125, "loss_aux_layer_8": 0.07421875, "loss_aux_layer_9": 0.0726318359375, "step": 1865, "total_loss": 0.718295693397522 }, { "epoch": 0.36943179568402296, "grad_norm": 0.903470516204834, "learning_rate": 5e-05, "llm_loss": 0.615092545747757, "loss": 2.8623, "loss_aux_layer_0": 0.022216796875, "loss_aux_layer_1": 0.04644775390625, "loss_aux_layer_10": 0.075439453125, "loss_aux_layer_11": 0.0804443359375, "loss_aux_layer_12": 0.0858154296875, "loss_aux_layer_13": 0.0921630859375, "loss_aux_layer_14": 0.1014404296875, "loss_aux_layer_15": 0.1102294921875, "loss_aux_layer_16": 0.1197509765625, "loss_aux_layer_17": 0.127685546875, "loss_aux_layer_18": 0.135498046875, "loss_aux_layer_19": 0.137939453125, "loss_aux_layer_2": 0.05938720703125, "loss_aux_layer_20": 0.1455078125, "loss_aux_layer_21": 0.153564453125, "loss_aux_layer_22": 0.175537109375, "loss_aux_layer_23": 0.2158203125, "loss_aux_layer_3": 0.07080078125, "loss_aux_layer_4": 0.07421875, "loss_aux_layer_5": 0.076171875, "loss_aux_layer_6": 0.07958984375, "loss_aux_layer_7": 0.07666015625, "loss_aux_layer_8": 0.075439453125, "loss_aux_layer_9": 0.0740966796875, "step": 1866, "total_loss": 0.7155685126781464 }, { "epoch": 0.3696297762819244, "grad_norm": 0.8404643535614014, "learning_rate": 5e-05, "llm_loss": 0.5159295797348022, "loss": 2.4766, "loss_aux_layer_0": 0.022247314453125, "loss_aux_layer_1": 0.04608154296875, "loss_aux_layer_10": 0.0758056640625, "loss_aux_layer_11": 0.0809326171875, "loss_aux_layer_12": 0.0867919921875, "loss_aux_layer_13": 0.0933837890625, "loss_aux_layer_14": 0.1038818359375, "loss_aux_layer_15": 0.11376953125, "loss_aux_layer_16": 0.1243896484375, "loss_aux_layer_17": 0.133056640625, "loss_aux_layer_18": 0.142333984375, "loss_aux_layer_19": 0.1455078125, "loss_aux_layer_2": 0.0599365234375, "loss_aux_layer_20": 0.153076171875, "loss_aux_layer_21": 0.160888671875, "loss_aux_layer_22": 0.183349609375, "loss_aux_layer_23": 0.22412109375, "loss_aux_layer_3": 0.0712890625, "loss_aux_layer_4": 0.0743408203125, "loss_aux_layer_5": 0.0762939453125, "loss_aux_layer_6": 0.079345703125, "loss_aux_layer_7": 0.076416015625, "loss_aux_layer_8": 0.0758056640625, "loss_aux_layer_9": 0.0743408203125, "step": 1867, "total_loss": 0.6191447526216507 }, { "epoch": 0.3698277568798258, "grad_norm": 0.9630199074745178, "learning_rate": 5e-05, "llm_loss": 0.49295003712177277, "loss": 2.3717, "loss_aux_layer_0": 0.024200439453125, "loss_aux_layer_1": 0.04541015625, "loss_aux_layer_10": 0.072265625, "loss_aux_layer_11": 0.07666015625, "loss_aux_layer_12": 0.08203125, "loss_aux_layer_13": 0.0888671875, "loss_aux_layer_14": 0.098876953125, "loss_aux_layer_15": 0.1090087890625, "loss_aux_layer_16": 0.1197509765625, "loss_aux_layer_17": 0.1282958984375, "loss_aux_layer_18": 0.13720703125, "loss_aux_layer_19": 0.14111328125, "loss_aux_layer_2": 0.05780029296875, "loss_aux_layer_20": 0.150146484375, "loss_aux_layer_21": 0.158447265625, "loss_aux_layer_22": 0.180419921875, "loss_aux_layer_23": 0.220703125, "loss_aux_layer_3": 0.068603515625, "loss_aux_layer_4": 0.0716552734375, "loss_aux_layer_5": 0.073486328125, "loss_aux_layer_6": 0.07666015625, "loss_aux_layer_7": 0.0738525390625, "loss_aux_layer_8": 0.072509765625, "loss_aux_layer_9": 0.0712890625, "step": 1868, "total_loss": 0.5929146409034729 }, { "epoch": 0.37002573747772716, "grad_norm": 1.2627923488616943, "learning_rate": 5e-05, "llm_loss": 0.568109355866909, "loss": 2.6882, "loss_aux_layer_0": 0.022216796875, "loss_aux_layer_1": 0.047607421875, "loss_aux_layer_10": 0.0770263671875, "loss_aux_layer_11": 0.0821533203125, "loss_aux_layer_12": 0.087646484375, "loss_aux_layer_13": 0.0943603515625, "loss_aux_layer_14": 0.1046142578125, "loss_aux_layer_15": 0.1146240234375, "loss_aux_layer_16": 0.1246337890625, "loss_aux_layer_17": 0.1328125, "loss_aux_layer_18": 0.141845703125, "loss_aux_layer_19": 0.1455078125, "loss_aux_layer_2": 0.06097412109375, "loss_aux_layer_20": 0.1533203125, "loss_aux_layer_21": 0.1611328125, "loss_aux_layer_22": 0.18408203125, "loss_aux_layer_23": 0.22412109375, "loss_aux_layer_3": 0.0723876953125, "loss_aux_layer_4": 0.075439453125, "loss_aux_layer_5": 0.0775146484375, "loss_aux_layer_6": 0.0804443359375, "loss_aux_layer_7": 0.07763671875, "loss_aux_layer_8": 0.07666015625, "loss_aux_layer_9": 0.075439453125, "step": 1869, "total_loss": 0.6720490753650665 }, { "epoch": 0.3702237180756286, "grad_norm": 0.9906255006790161, "learning_rate": 5e-05, "llm_loss": 0.5382057949900627, "loss": 2.5549, "loss_aux_layer_0": 0.022003173828125, "loss_aux_layer_1": 0.04608154296875, "loss_aux_layer_10": 0.07373046875, "loss_aux_layer_11": 0.0784912109375, "loss_aux_layer_12": 0.084228515625, "loss_aux_layer_13": 0.091064453125, "loss_aux_layer_14": 0.101318359375, "loss_aux_layer_15": 0.111328125, "loss_aux_layer_16": 0.1220703125, "loss_aux_layer_17": 0.13037109375, "loss_aux_layer_18": 0.138916015625, "loss_aux_layer_19": 0.141357421875, "loss_aux_layer_2": 0.05804443359375, "loss_aux_layer_20": 0.14892578125, "loss_aux_layer_21": 0.15673828125, "loss_aux_layer_22": 0.1787109375, "loss_aux_layer_23": 0.217529296875, "loss_aux_layer_3": 0.06884765625, "loss_aux_layer_4": 0.0716552734375, "loss_aux_layer_5": 0.07373046875, "loss_aux_layer_6": 0.07666015625, "loss_aux_layer_7": 0.07421875, "loss_aux_layer_8": 0.0733642578125, "loss_aux_layer_9": 0.0721435546875, "step": 1870, "total_loss": 0.6387167274951935 }, { "epoch": 0.37042169867353, "grad_norm": 1.2152379751205444, "learning_rate": 5e-05, "llm_loss": 0.6053446680307388, "loss": 2.8346, "loss_aux_layer_0": 0.02154541015625, "loss_aux_layer_1": 0.04736328125, "loss_aux_layer_10": 0.0787353515625, "loss_aux_layer_11": 0.083984375, "loss_aux_layer_12": 0.089599609375, "loss_aux_layer_13": 0.096435546875, "loss_aux_layer_14": 0.10595703125, "loss_aux_layer_15": 0.115234375, "loss_aux_layer_16": 0.125, "loss_aux_layer_17": 0.132080078125, "loss_aux_layer_18": 0.139892578125, "loss_aux_layer_19": 0.141845703125, "loss_aux_layer_2": 0.060791015625, "loss_aux_layer_20": 0.149169921875, "loss_aux_layer_21": 0.156494140625, "loss_aux_layer_22": 0.177001953125, "loss_aux_layer_23": 0.21533203125, "loss_aux_layer_3": 0.07275390625, "loss_aux_layer_4": 0.0760498046875, "loss_aux_layer_5": 0.0782470703125, "loss_aux_layer_6": 0.08154296875, "loss_aux_layer_7": 0.0789794921875, "loss_aux_layer_8": 0.0784912109375, "loss_aux_layer_9": 0.0771484375, "step": 1871, "total_loss": 0.7086514383554459 }, { "epoch": 0.3706196792714314, "grad_norm": 1.0360159873962402, "learning_rate": 5e-05, "llm_loss": 0.5567634999752045, "loss": 2.6251, "loss_aux_layer_0": 0.021759033203125, "loss_aux_layer_1": 0.0455322265625, "loss_aux_layer_10": 0.073974609375, "loss_aux_layer_11": 0.0787353515625, "loss_aux_layer_12": 0.0841064453125, "loss_aux_layer_13": 0.0908203125, "loss_aux_layer_14": 0.100341796875, "loss_aux_layer_15": 0.1097412109375, "loss_aux_layer_16": 0.119873046875, "loss_aux_layer_17": 0.1278076171875, "loss_aux_layer_18": 0.13623046875, "loss_aux_layer_19": 0.13916015625, "loss_aux_layer_2": 0.05841064453125, "loss_aux_layer_20": 0.146240234375, "loss_aux_layer_21": 0.15380859375, "loss_aux_layer_22": 0.175048828125, "loss_aux_layer_23": 0.212646484375, "loss_aux_layer_3": 0.0693359375, "loss_aux_layer_4": 0.0726318359375, "loss_aux_layer_5": 0.0743408203125, "loss_aux_layer_6": 0.077392578125, "loss_aux_layer_7": 0.07470703125, "loss_aux_layer_8": 0.07373046875, "loss_aux_layer_9": 0.072509765625, "step": 1872, "total_loss": 0.6562781631946564 }, { "epoch": 0.3708176598693328, "grad_norm": 1.1466788053512573, "learning_rate": 5e-05, "llm_loss": 0.636578157544136, "loss": 2.9636, "loss_aux_layer_0": 0.021636962890625, "loss_aux_layer_1": 0.04803466796875, "loss_aux_layer_10": 0.07763671875, "loss_aux_layer_11": 0.0831298828125, "loss_aux_layer_12": 0.0889892578125, "loss_aux_layer_13": 0.09619140625, "loss_aux_layer_14": 0.105712890625, "loss_aux_layer_15": 0.115966796875, "loss_aux_layer_16": 0.1263427734375, "loss_aux_layer_17": 0.134033203125, "loss_aux_layer_18": 0.142822265625, "loss_aux_layer_19": 0.145751953125, "loss_aux_layer_2": 0.061279296875, "loss_aux_layer_20": 0.15283203125, "loss_aux_layer_21": 0.16015625, "loss_aux_layer_22": 0.18212890625, "loss_aux_layer_23": 0.2216796875, "loss_aux_layer_3": 0.0728759765625, "loss_aux_layer_4": 0.075927734375, "loss_aux_layer_5": 0.077880859375, "loss_aux_layer_6": 0.081298828125, "loss_aux_layer_7": 0.078369140625, "loss_aux_layer_8": 0.077392578125, "loss_aux_layer_9": 0.0760498046875, "step": 1873, "total_loss": 0.7409080862998962 }, { "epoch": 0.37101564046723423, "grad_norm": 1.1343284845352173, "learning_rate": 5e-05, "llm_loss": 0.6003232598304749, "loss": 2.7829, "loss_aux_layer_0": 0.02239990234375, "loss_aux_layer_1": 0.04193115234375, "loss_aux_layer_10": 0.0694580078125, "loss_aux_layer_11": 0.0736083984375, "loss_aux_layer_12": 0.079345703125, "loss_aux_layer_13": 0.085205078125, "loss_aux_layer_14": 0.0950927734375, "loss_aux_layer_15": 0.1051025390625, "loss_aux_layer_16": 0.11572265625, "loss_aux_layer_17": 0.12451171875, "loss_aux_layer_18": 0.13330078125, "loss_aux_layer_19": 0.1370849609375, "loss_aux_layer_2": 0.0531005859375, "loss_aux_layer_20": 0.14453125, "loss_aux_layer_21": 0.15087890625, "loss_aux_layer_22": 0.16943359375, "loss_aux_layer_23": 0.207275390625, "loss_aux_layer_3": 0.0640869140625, "loss_aux_layer_4": 0.0675048828125, "loss_aux_layer_5": 0.0694580078125, "loss_aux_layer_6": 0.07269287109375, "loss_aux_layer_7": 0.070068359375, "loss_aux_layer_8": 0.0693359375, "loss_aux_layer_9": 0.068115234375, "step": 1874, "total_loss": 0.6957356631755829 }, { "epoch": 0.3712136210651356, "grad_norm": 0.9364634156227112, "learning_rate": 5e-05, "llm_loss": 0.4789286479353905, "loss": 2.3103, "loss_aux_layer_0": 0.021942138671875, "loss_aux_layer_1": 0.04443359375, "loss_aux_layer_10": 0.071533203125, "loss_aux_layer_11": 0.076416015625, "loss_aux_layer_12": 0.08203125, "loss_aux_layer_13": 0.0887451171875, "loss_aux_layer_14": 0.099365234375, "loss_aux_layer_15": 0.1094970703125, "loss_aux_layer_16": 0.1202392578125, "loss_aux_layer_17": 0.1285400390625, "loss_aux_layer_18": 0.137939453125, "loss_aux_layer_19": 0.1416015625, "loss_aux_layer_2": 0.0562744140625, "loss_aux_layer_20": 0.148681640625, "loss_aux_layer_21": 0.155517578125, "loss_aux_layer_22": 0.17529296875, "loss_aux_layer_23": 0.214111328125, "loss_aux_layer_3": 0.0665283203125, "loss_aux_layer_4": 0.0694580078125, "loss_aux_layer_5": 0.071533203125, "loss_aux_layer_6": 0.074462890625, "loss_aux_layer_7": 0.0716552734375, "loss_aux_layer_8": 0.0711669921875, "loss_aux_layer_9": 0.0703125, "step": 1875, "total_loss": 0.5775673389434814 }, { "epoch": 0.37141160166303705, "grad_norm": 0.958733856678009, "learning_rate": 5e-05, "llm_loss": 0.6133460700511932, "loss": 2.8558, "loss_aux_layer_0": 0.023040771484375, "loss_aux_layer_1": 0.04571533203125, "loss_aux_layer_10": 0.07470703125, "loss_aux_layer_11": 0.0792236328125, "loss_aux_layer_12": 0.0845947265625, "loss_aux_layer_13": 0.0911865234375, "loss_aux_layer_14": 0.10107421875, "loss_aux_layer_15": 0.1109619140625, "loss_aux_layer_16": 0.1212158203125, "loss_aux_layer_17": 0.129638671875, "loss_aux_layer_18": 0.138427734375, "loss_aux_layer_19": 0.14208984375, "loss_aux_layer_2": 0.0577392578125, "loss_aux_layer_20": 0.149169921875, "loss_aux_layer_21": 0.156494140625, "loss_aux_layer_22": 0.177001953125, "loss_aux_layer_23": 0.215576171875, "loss_aux_layer_3": 0.0692138671875, "loss_aux_layer_4": 0.0723876953125, "loss_aux_layer_5": 0.0743408203125, "loss_aux_layer_6": 0.078125, "loss_aux_layer_7": 0.0753173828125, "loss_aux_layer_8": 0.074462890625, "loss_aux_layer_9": 0.0731201171875, "step": 1876, "total_loss": 0.7139428108930588 }, { "epoch": 0.37160958226093843, "grad_norm": 1.5518769025802612, "learning_rate": 5e-05, "llm_loss": 0.5988288894295692, "loss": 2.8048, "loss_aux_layer_0": 0.023406982421875, "loss_aux_layer_1": 0.0460205078125, "loss_aux_layer_10": 0.0750732421875, "loss_aux_layer_11": 0.07958984375, "loss_aux_layer_12": 0.0855712890625, "loss_aux_layer_13": 0.0921630859375, "loss_aux_layer_14": 0.1019287109375, "loss_aux_layer_15": 0.1123046875, "loss_aux_layer_16": 0.123046875, "loss_aux_layer_17": 0.130859375, "loss_aux_layer_18": 0.139892578125, "loss_aux_layer_19": 0.14404296875, "loss_aux_layer_2": 0.0584716796875, "loss_aux_layer_20": 0.15283203125, "loss_aux_layer_21": 0.1611328125, "loss_aux_layer_22": 0.1845703125, "loss_aux_layer_23": 0.225830078125, "loss_aux_layer_3": 0.06982421875, "loss_aux_layer_4": 0.0731201171875, "loss_aux_layer_5": 0.0748291015625, "loss_aux_layer_6": 0.0782470703125, "loss_aux_layer_7": 0.0753173828125, "loss_aux_layer_8": 0.07470703125, "loss_aux_layer_9": 0.073486328125, "step": 1877, "total_loss": 0.7011892944574356 }, { "epoch": 0.3718075628588398, "grad_norm": 0.8953622579574585, "learning_rate": 5e-05, "llm_loss": 0.5851104706525803, "loss": 2.7412, "loss_aux_layer_0": 0.023773193359375, "loss_aux_layer_1": 0.04669189453125, "loss_aux_layer_10": 0.0736083984375, "loss_aux_layer_11": 0.0784912109375, "loss_aux_layer_12": 0.083740234375, "loss_aux_layer_13": 0.0902099609375, "loss_aux_layer_14": 0.10009765625, "loss_aux_layer_15": 0.1099853515625, "loss_aux_layer_16": 0.119873046875, "loss_aux_layer_17": 0.1279296875, "loss_aux_layer_18": 0.13671875, "loss_aux_layer_19": 0.140380859375, "loss_aux_layer_2": 0.0589599609375, "loss_aux_layer_20": 0.148193359375, "loss_aux_layer_21": 0.156494140625, "loss_aux_layer_22": 0.17822265625, "loss_aux_layer_23": 0.21826171875, "loss_aux_layer_3": 0.06982421875, "loss_aux_layer_4": 0.0726318359375, "loss_aux_layer_5": 0.0743408203125, "loss_aux_layer_6": 0.0771484375, "loss_aux_layer_7": 0.0740966796875, "loss_aux_layer_8": 0.073486328125, "loss_aux_layer_9": 0.072021484375, "step": 1878, "total_loss": 0.6852879226207733 }, { "epoch": 0.37200554345674125, "grad_norm": 1.3393956422805786, "learning_rate": 5e-05, "llm_loss": 0.5386331230401993, "loss": 2.5643, "loss_aux_layer_0": 0.022796630859375, "loss_aux_layer_1": 0.046142578125, "loss_aux_layer_10": 0.0750732421875, "loss_aux_layer_11": 0.07958984375, "loss_aux_layer_12": 0.085205078125, "loss_aux_layer_13": 0.09228515625, "loss_aux_layer_14": 0.1024169921875, "loss_aux_layer_15": 0.1124267578125, "loss_aux_layer_16": 0.1231689453125, "loss_aux_layer_17": 0.1318359375, "loss_aux_layer_18": 0.140625, "loss_aux_layer_19": 0.14501953125, "loss_aux_layer_2": 0.05914306640625, "loss_aux_layer_20": 0.152587890625, "loss_aux_layer_21": 0.160888671875, "loss_aux_layer_22": 0.1826171875, "loss_aux_layer_23": 0.22314453125, "loss_aux_layer_3": 0.07080078125, "loss_aux_layer_4": 0.0740966796875, "loss_aux_layer_5": 0.0755615234375, "loss_aux_layer_6": 0.07861328125, "loss_aux_layer_7": 0.0760498046875, "loss_aux_layer_8": 0.0750732421875, "loss_aux_layer_9": 0.0736083984375, "step": 1879, "total_loss": 0.641086220741272 }, { "epoch": 0.37220352405464263, "grad_norm": 1.0294243097305298, "learning_rate": 5e-05, "llm_loss": 0.5843632221221924, "loss": 2.7466, "loss_aux_layer_0": 0.022857666015625, "loss_aux_layer_1": 0.047607421875, "loss_aux_layer_10": 0.07666015625, "loss_aux_layer_11": 0.0816650390625, "loss_aux_layer_12": 0.087158203125, "loss_aux_layer_13": 0.09375, "loss_aux_layer_14": 0.103271484375, "loss_aux_layer_15": 0.113037109375, "loss_aux_layer_16": 0.123291015625, "loss_aux_layer_17": 0.131103515625, "loss_aux_layer_18": 0.139892578125, "loss_aux_layer_19": 0.142578125, "loss_aux_layer_2": 0.0604248046875, "loss_aux_layer_20": 0.1494140625, "loss_aux_layer_21": 0.156005859375, "loss_aux_layer_22": 0.1767578125, "loss_aux_layer_23": 0.215087890625, "loss_aux_layer_3": 0.0716552734375, "loss_aux_layer_4": 0.0753173828125, "loss_aux_layer_5": 0.077392578125, "loss_aux_layer_6": 0.08056640625, "loss_aux_layer_7": 0.0777587890625, "loss_aux_layer_8": 0.076904296875, "loss_aux_layer_9": 0.075439453125, "step": 1880, "total_loss": 0.6866559088230133 }, { "epoch": 0.37240150465254407, "grad_norm": 1.2482714653015137, "learning_rate": 5e-05, "llm_loss": 0.5370814725756645, "loss": 2.5482, "loss_aux_layer_0": 0.022857666015625, "loss_aux_layer_1": 0.04608154296875, "loss_aux_layer_10": 0.0726318359375, "loss_aux_layer_11": 0.0775146484375, "loss_aux_layer_12": 0.0833740234375, "loss_aux_layer_13": 0.08984375, "loss_aux_layer_14": 0.0997314453125, "loss_aux_layer_15": 0.1097412109375, "loss_aux_layer_16": 0.120361328125, "loss_aux_layer_17": 0.1282958984375, "loss_aux_layer_18": 0.13720703125, "loss_aux_layer_19": 0.140380859375, "loss_aux_layer_2": 0.05908203125, "loss_aux_layer_20": 0.147705078125, "loss_aux_layer_21": 0.156494140625, "loss_aux_layer_22": 0.17822265625, "loss_aux_layer_23": 0.21826171875, "loss_aux_layer_3": 0.06982421875, "loss_aux_layer_4": 0.0723876953125, "loss_aux_layer_5": 0.0740966796875, "loss_aux_layer_6": 0.076904296875, "loss_aux_layer_7": 0.073974609375, "loss_aux_layer_8": 0.0731201171875, "loss_aux_layer_9": 0.0716552734375, "step": 1881, "total_loss": 0.6370421200990677 }, { "epoch": 0.37259948525044545, "grad_norm": 1.0377570390701294, "learning_rate": 5e-05, "llm_loss": 0.6003416925668716, "loss": 2.8155, "loss_aux_layer_0": 0.024078369140625, "loss_aux_layer_1": 0.0474853515625, "loss_aux_layer_10": 0.0760498046875, "loss_aux_layer_11": 0.0809326171875, "loss_aux_layer_12": 0.0867919921875, "loss_aux_layer_13": 0.0938720703125, "loss_aux_layer_14": 0.1043701171875, "loss_aux_layer_15": 0.1148681640625, "loss_aux_layer_16": 0.1258544921875, "loss_aux_layer_17": 0.133544921875, "loss_aux_layer_18": 0.14208984375, "loss_aux_layer_19": 0.1455078125, "loss_aux_layer_2": 0.059814453125, "loss_aux_layer_20": 0.15380859375, "loss_aux_layer_21": 0.161865234375, "loss_aux_layer_22": 0.183349609375, "loss_aux_layer_23": 0.222900390625, "loss_aux_layer_3": 0.0709228515625, "loss_aux_layer_4": 0.0740966796875, "loss_aux_layer_5": 0.076171875, "loss_aux_layer_6": 0.0794677734375, "loss_aux_layer_7": 0.0767822265625, "loss_aux_layer_8": 0.075927734375, "loss_aux_layer_9": 0.07470703125, "step": 1882, "total_loss": 0.7038773447275162 }, { "epoch": 0.3727974658483469, "grad_norm": 1.0698060989379883, "learning_rate": 5e-05, "llm_loss": 0.6387573331594467, "loss": 2.9459, "loss_aux_layer_0": 0.023040771484375, "loss_aux_layer_1": 0.04400634765625, "loss_aux_layer_10": 0.07122802734375, "loss_aux_layer_11": 0.0753173828125, "loss_aux_layer_12": 0.08056640625, "loss_aux_layer_13": 0.0867919921875, "loss_aux_layer_14": 0.09716796875, "loss_aux_layer_15": 0.1072998046875, "loss_aux_layer_16": 0.1175537109375, "loss_aux_layer_17": 0.1259765625, "loss_aux_layer_18": 0.135498046875, "loss_aux_layer_19": 0.138916015625, "loss_aux_layer_2": 0.0552978515625, "loss_aux_layer_20": 0.147216796875, "loss_aux_layer_21": 0.15478515625, "loss_aux_layer_22": 0.17578125, "loss_aux_layer_23": 0.215087890625, "loss_aux_layer_3": 0.06591796875, "loss_aux_layer_4": 0.069091796875, "loss_aux_layer_5": 0.07098388671875, "loss_aux_layer_6": 0.0743408203125, "loss_aux_layer_7": 0.07171630859375, "loss_aux_layer_8": 0.0709228515625, "loss_aux_layer_9": 0.07000732421875, "step": 1883, "total_loss": 0.7364631593227386 }, { "epoch": 0.37299544644624827, "grad_norm": 1.0844980478286743, "learning_rate": 5e-05, "llm_loss": 0.6365287601947784, "loss": 2.9382, "loss_aux_layer_0": 0.0244140625, "loss_aux_layer_1": 0.044677734375, "loss_aux_layer_10": 0.0714111328125, "loss_aux_layer_11": 0.075927734375, "loss_aux_layer_12": 0.0814208984375, "loss_aux_layer_13": 0.0875244140625, "loss_aux_layer_14": 0.0972900390625, "loss_aux_layer_15": 0.1075439453125, "loss_aux_layer_16": 0.117919921875, "loss_aux_layer_17": 0.1258544921875, "loss_aux_layer_18": 0.134521484375, "loss_aux_layer_19": 0.138671875, "loss_aux_layer_2": 0.05706787109375, "loss_aux_layer_20": 0.146240234375, "loss_aux_layer_21": 0.154296875, "loss_aux_layer_22": 0.176025390625, "loss_aux_layer_23": 0.21533203125, "loss_aux_layer_3": 0.067138671875, "loss_aux_layer_4": 0.0699462890625, "loss_aux_layer_5": 0.07177734375, "loss_aux_layer_6": 0.07470703125, "loss_aux_layer_7": 0.0718994140625, "loss_aux_layer_8": 0.0711669921875, "loss_aux_layer_9": 0.06982421875, "step": 1884, "total_loss": 0.7345454841852188 }, { "epoch": 0.37319342704414965, "grad_norm": 1.5033482313156128, "learning_rate": 5e-05, "llm_loss": 0.6661116033792496, "loss": 3.0798, "loss_aux_layer_0": 0.021636962890625, "loss_aux_layer_1": 0.0491943359375, "loss_aux_layer_10": 0.0782470703125, "loss_aux_layer_11": 0.08349609375, "loss_aux_layer_12": 0.0892333984375, "loss_aux_layer_13": 0.095947265625, "loss_aux_layer_14": 0.105712890625, "loss_aux_layer_15": 0.115234375, "loss_aux_layer_16": 0.1253662109375, "loss_aux_layer_17": 0.13330078125, "loss_aux_layer_18": 0.140869140625, "loss_aux_layer_19": 0.14306640625, "loss_aux_layer_2": 0.06231689453125, "loss_aux_layer_20": 0.149658203125, "loss_aux_layer_21": 0.15625, "loss_aux_layer_22": 0.177490234375, "loss_aux_layer_23": 0.216796875, "loss_aux_layer_3": 0.073974609375, "loss_aux_layer_4": 0.07763671875, "loss_aux_layer_5": 0.079345703125, "loss_aux_layer_6": 0.082763671875, "loss_aux_layer_7": 0.07958984375, "loss_aux_layer_8": 0.078369140625, "loss_aux_layer_9": 0.076904296875, "step": 1885, "total_loss": 0.7699560821056366 }, { "epoch": 0.3733914076420511, "grad_norm": 1.2964247465133667, "learning_rate": 5e-05, "llm_loss": 0.6109132170677185, "loss": 2.8496, "loss_aux_layer_0": 0.022430419921875, "loss_aux_layer_1": 0.0469970703125, "loss_aux_layer_10": 0.075439453125, "loss_aux_layer_11": 0.0804443359375, "loss_aux_layer_12": 0.0860595703125, "loss_aux_layer_13": 0.0928955078125, "loss_aux_layer_14": 0.1026611328125, "loss_aux_layer_15": 0.1124267578125, "loss_aux_layer_16": 0.12255859375, "loss_aux_layer_17": 0.1300048828125, "loss_aux_layer_18": 0.138671875, "loss_aux_layer_19": 0.140625, "loss_aux_layer_2": 0.05999755859375, "loss_aux_layer_20": 0.148193359375, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.177734375, "loss_aux_layer_23": 0.216796875, "loss_aux_layer_3": 0.07098388671875, "loss_aux_layer_4": 0.073974609375, "loss_aux_layer_5": 0.075927734375, "loss_aux_layer_6": 0.0791015625, "loss_aux_layer_7": 0.0762939453125, "loss_aux_layer_8": 0.0755615234375, "loss_aux_layer_9": 0.07421875, "step": 1886, "total_loss": 0.7123963981866837 }, { "epoch": 0.37358938823995247, "grad_norm": 0.9354525804519653, "learning_rate": 5e-05, "llm_loss": 0.5776949822902679, "loss": 2.6927, "loss_aux_layer_0": 0.021514892578125, "loss_aux_layer_1": 0.0435791015625, "loss_aux_layer_10": 0.06878662109375, "loss_aux_layer_11": 0.0732421875, "loss_aux_layer_12": 0.078857421875, "loss_aux_layer_13": 0.0850830078125, "loss_aux_layer_14": 0.0947265625, "loss_aux_layer_15": 0.103759765625, "loss_aux_layer_16": 0.11376953125, "loss_aux_layer_17": 0.1217041015625, "loss_aux_layer_18": 0.130859375, "loss_aux_layer_19": 0.134765625, "loss_aux_layer_2": 0.05511474609375, "loss_aux_layer_20": 0.1435546875, "loss_aux_layer_21": 0.15185546875, "loss_aux_layer_22": 0.174072265625, "loss_aux_layer_23": 0.21435546875, "loss_aux_layer_3": 0.0653076171875, "loss_aux_layer_4": 0.068115234375, "loss_aux_layer_5": 0.06964111328125, "loss_aux_layer_6": 0.07244873046875, "loss_aux_layer_7": 0.0697021484375, "loss_aux_layer_8": 0.06878662109375, "loss_aux_layer_9": 0.06744384765625, "step": 1887, "total_loss": 0.6731802225112915 }, { "epoch": 0.3737873688378539, "grad_norm": 1.0937491655349731, "learning_rate": 5e-05, "llm_loss": 0.615478053689003, "loss": 2.8573, "loss_aux_layer_0": 0.02191162109375, "loss_aux_layer_1": 0.04461669921875, "loss_aux_layer_10": 0.0728759765625, "loss_aux_layer_11": 0.077392578125, "loss_aux_layer_12": 0.0830078125, "loss_aux_layer_13": 0.089599609375, "loss_aux_layer_14": 0.0994873046875, "loss_aux_layer_15": 0.1090087890625, "loss_aux_layer_16": 0.118896484375, "loss_aux_layer_17": 0.126708984375, "loss_aux_layer_18": 0.1357421875, "loss_aux_layer_19": 0.138671875, "loss_aux_layer_2": 0.0577392578125, "loss_aux_layer_20": 0.146240234375, "loss_aux_layer_21": 0.15380859375, "loss_aux_layer_22": 0.174560546875, "loss_aux_layer_23": 0.214111328125, "loss_aux_layer_3": 0.068603515625, "loss_aux_layer_4": 0.0718994140625, "loss_aux_layer_5": 0.07373046875, "loss_aux_layer_6": 0.07666015625, "loss_aux_layer_7": 0.07373046875, "loss_aux_layer_8": 0.07275390625, "loss_aux_layer_9": 0.071533203125, "step": 1888, "total_loss": 0.7143306732177734 }, { "epoch": 0.3739853494357553, "grad_norm": 1.2686645984649658, "learning_rate": 5e-05, "llm_loss": 0.6102551817893982, "loss": 2.8386, "loss_aux_layer_0": 0.021453857421875, "loss_aux_layer_1": 0.04449462890625, "loss_aux_layer_10": 0.0732421875, "loss_aux_layer_11": 0.0780029296875, "loss_aux_layer_12": 0.083984375, "loss_aux_layer_13": 0.090576171875, "loss_aux_layer_14": 0.0999755859375, "loss_aux_layer_15": 0.1094970703125, "loss_aux_layer_16": 0.119384765625, "loss_aux_layer_17": 0.127685546875, "loss_aux_layer_18": 0.136474609375, "loss_aux_layer_19": 0.1396484375, "loss_aux_layer_2": 0.05706787109375, "loss_aux_layer_20": 0.147216796875, "loss_aux_layer_21": 0.155029296875, "loss_aux_layer_22": 0.1767578125, "loss_aux_layer_23": 0.216552734375, "loss_aux_layer_3": 0.0682373046875, "loss_aux_layer_4": 0.0716552734375, "loss_aux_layer_5": 0.073486328125, "loss_aux_layer_6": 0.0765380859375, "loss_aux_layer_7": 0.07373046875, "loss_aux_layer_8": 0.0728759765625, "loss_aux_layer_9": 0.07177734375, "step": 1889, "total_loss": 0.7096396386623383 }, { "epoch": 0.3741833300336567, "grad_norm": 0.8828977346420288, "learning_rate": 5e-05, "llm_loss": 0.5943282693624496, "loss": 2.7746, "loss_aux_layer_0": 0.0216064453125, "loss_aux_layer_1": 0.04486083984375, "loss_aux_layer_10": 0.073974609375, "loss_aux_layer_11": 0.0789794921875, "loss_aux_layer_12": 0.0843505859375, "loss_aux_layer_13": 0.0911865234375, "loss_aux_layer_14": 0.1007080078125, "loss_aux_layer_15": 0.1104736328125, "loss_aux_layer_16": 0.12060546875, "loss_aux_layer_17": 0.128173828125, "loss_aux_layer_18": 0.136962890625, "loss_aux_layer_19": 0.1396484375, "loss_aux_layer_2": 0.05670166015625, "loss_aux_layer_20": 0.14697265625, "loss_aux_layer_21": 0.153564453125, "loss_aux_layer_22": 0.173828125, "loss_aux_layer_23": 0.21240234375, "loss_aux_layer_3": 0.0679931640625, "loss_aux_layer_4": 0.071533203125, "loss_aux_layer_5": 0.0732421875, "loss_aux_layer_6": 0.0765380859375, "loss_aux_layer_7": 0.0740966796875, "loss_aux_layer_8": 0.0732421875, "loss_aux_layer_9": 0.072265625, "step": 1890, "total_loss": 0.6936514228582382 }, { "epoch": 0.3743813106315581, "grad_norm": 1.3087420463562012, "learning_rate": 5e-05, "llm_loss": 0.6051187366247177, "loss": 2.8371, "loss_aux_layer_0": 0.022857666015625, "loss_aux_layer_1": 0.04840087890625, "loss_aux_layer_10": 0.078125, "loss_aux_layer_11": 0.0836181640625, "loss_aux_layer_12": 0.089111328125, "loss_aux_layer_13": 0.0955810546875, "loss_aux_layer_14": 0.105224609375, "loss_aux_layer_15": 0.11474609375, "loss_aux_layer_16": 0.1248779296875, "loss_aux_layer_17": 0.1319580078125, "loss_aux_layer_18": 0.140869140625, "loss_aux_layer_19": 0.143798828125, "loss_aux_layer_2": 0.0611572265625, "loss_aux_layer_20": 0.150146484375, "loss_aux_layer_21": 0.158935546875, "loss_aux_layer_22": 0.181884765625, "loss_aux_layer_23": 0.222412109375, "loss_aux_layer_3": 0.0732421875, "loss_aux_layer_4": 0.0770263671875, "loss_aux_layer_5": 0.0792236328125, "loss_aux_layer_6": 0.082275390625, "loss_aux_layer_7": 0.0794677734375, "loss_aux_layer_8": 0.0784912109375, "loss_aux_layer_9": 0.0770263671875, "step": 1891, "total_loss": 0.7092707604169846 }, { "epoch": 0.3745792912294595, "grad_norm": 1.576898217201233, "learning_rate": 5e-05, "llm_loss": 0.6553872972726822, "loss": 3.0241, "loss_aux_layer_0": 0.0286865234375, "loss_aux_layer_1": 0.04766845703125, "loss_aux_layer_10": 0.0738525390625, "loss_aux_layer_11": 0.078125, "loss_aux_layer_12": 0.08349609375, "loss_aux_layer_13": 0.0897216796875, "loss_aux_layer_14": 0.099853515625, "loss_aux_layer_15": 0.1099853515625, "loss_aux_layer_16": 0.1202392578125, "loss_aux_layer_17": 0.12841796875, "loss_aux_layer_18": 0.136962890625, "loss_aux_layer_19": 0.140380859375, "loss_aux_layer_2": 0.05950927734375, "loss_aux_layer_20": 0.14794921875, "loss_aux_layer_21": 0.15625, "loss_aux_layer_22": 0.177978515625, "loss_aux_layer_23": 0.21728515625, "loss_aux_layer_3": 0.0701904296875, "loss_aux_layer_4": 0.0732421875, "loss_aux_layer_5": 0.0751953125, "loss_aux_layer_6": 0.078369140625, "loss_aux_layer_7": 0.0751953125, "loss_aux_layer_8": 0.07421875, "loss_aux_layer_9": 0.0726318359375, "step": 1892, "total_loss": 0.7560315579175949 }, { "epoch": 0.3747772718273609, "grad_norm": 1.4108372926712036, "learning_rate": 5e-05, "llm_loss": 0.5636182054877281, "loss": 2.6611, "loss_aux_layer_0": 0.02294921875, "loss_aux_layer_1": 0.0478515625, "loss_aux_layer_10": 0.0758056640625, "loss_aux_layer_11": 0.080810546875, "loss_aux_layer_12": 0.0860595703125, "loss_aux_layer_13": 0.092041015625, "loss_aux_layer_14": 0.1016845703125, "loss_aux_layer_15": 0.1112060546875, "loss_aux_layer_16": 0.1212158203125, "loss_aux_layer_17": 0.128662109375, "loss_aux_layer_18": 0.1376953125, "loss_aux_layer_19": 0.140625, "loss_aux_layer_2": 0.060791015625, "loss_aux_layer_20": 0.14794921875, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.177490234375, "loss_aux_layer_23": 0.21728515625, "loss_aux_layer_3": 0.072021484375, "loss_aux_layer_4": 0.0753173828125, "loss_aux_layer_5": 0.077392578125, "loss_aux_layer_6": 0.0804443359375, "loss_aux_layer_7": 0.07763671875, "loss_aux_layer_8": 0.076416015625, "loss_aux_layer_9": 0.0748291015625, "step": 1893, "total_loss": 0.6652717739343643 }, { "epoch": 0.3749752524252623, "grad_norm": 1.1216634511947632, "learning_rate": 5e-05, "llm_loss": 0.6051898449659348, "loss": 2.8281, "loss_aux_layer_0": 0.0213623046875, "loss_aux_layer_1": 0.046142578125, "loss_aux_layer_10": 0.07666015625, "loss_aux_layer_11": 0.0811767578125, "loss_aux_layer_12": 0.0867919921875, "loss_aux_layer_13": 0.0933837890625, "loss_aux_layer_14": 0.102783203125, "loss_aux_layer_15": 0.1123046875, "loss_aux_layer_16": 0.12255859375, "loss_aux_layer_17": 0.1307373046875, "loss_aux_layer_18": 0.138671875, "loss_aux_layer_19": 0.140380859375, "loss_aux_layer_2": 0.05987548828125, "loss_aux_layer_20": 0.148193359375, "loss_aux_layer_21": 0.1552734375, "loss_aux_layer_22": 0.176513671875, "loss_aux_layer_23": 0.215087890625, "loss_aux_layer_3": 0.0718994140625, "loss_aux_layer_4": 0.075439453125, "loss_aux_layer_5": 0.0777587890625, "loss_aux_layer_6": 0.080810546875, "loss_aux_layer_7": 0.0782470703125, "loss_aux_layer_8": 0.076904296875, "loss_aux_layer_9": 0.0753173828125, "step": 1894, "total_loss": 0.7070141285657883 }, { "epoch": 0.37517323302316374, "grad_norm": 1.1315044164657593, "learning_rate": 5e-05, "llm_loss": 0.51752570271492, "loss": 2.4764, "loss_aux_layer_0": 0.021636962890625, "loss_aux_layer_1": 0.04681396484375, "loss_aux_layer_10": 0.075927734375, "loss_aux_layer_11": 0.080810546875, "loss_aux_layer_12": 0.0865478515625, "loss_aux_layer_13": 0.0936279296875, "loss_aux_layer_14": 0.1036376953125, "loss_aux_layer_15": 0.1134033203125, "loss_aux_layer_16": 0.12353515625, "loss_aux_layer_17": 0.13037109375, "loss_aux_layer_18": 0.139892578125, "loss_aux_layer_19": 0.1416015625, "loss_aux_layer_2": 0.0596923828125, "loss_aux_layer_20": 0.148681640625, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.17529296875, "loss_aux_layer_23": 0.213623046875, "loss_aux_layer_3": 0.0709228515625, "loss_aux_layer_4": 0.07421875, "loss_aux_layer_5": 0.076416015625, "loss_aux_layer_6": 0.0789794921875, "loss_aux_layer_7": 0.0765380859375, "loss_aux_layer_8": 0.075439453125, "loss_aux_layer_9": 0.0743408203125, "step": 1895, "total_loss": 0.6190928220748901 }, { "epoch": 0.3753712136210651, "grad_norm": 0.8923473954200745, "learning_rate": 5e-05, "llm_loss": 0.5370820164680481, "loss": 2.5461, "loss_aux_layer_0": 0.0230712890625, "loss_aux_layer_1": 0.0455322265625, "loss_aux_layer_10": 0.0740966796875, "loss_aux_layer_11": 0.0787353515625, "loss_aux_layer_12": 0.083984375, "loss_aux_layer_13": 0.0904541015625, "loss_aux_layer_14": 0.0997314453125, "loss_aux_layer_15": 0.1094970703125, "loss_aux_layer_16": 0.1199951171875, "loss_aux_layer_17": 0.12841796875, "loss_aux_layer_18": 0.13671875, "loss_aux_layer_19": 0.139404296875, "loss_aux_layer_2": 0.05810546875, "loss_aux_layer_20": 0.14697265625, "loss_aux_layer_21": 0.153564453125, "loss_aux_layer_22": 0.173583984375, "loss_aux_layer_23": 0.210693359375, "loss_aux_layer_3": 0.069091796875, "loss_aux_layer_4": 0.0721435546875, "loss_aux_layer_5": 0.0740966796875, "loss_aux_layer_6": 0.0772705078125, "loss_aux_layer_7": 0.0748291015625, "loss_aux_layer_8": 0.0740966796875, "loss_aux_layer_9": 0.0728759765625, "step": 1896, "total_loss": 0.636531189084053 }, { "epoch": 0.37556919421896656, "grad_norm": 1.1580902338027954, "learning_rate": 5e-05, "llm_loss": 0.6299924701452255, "loss": 2.9244, "loss_aux_layer_0": 0.023345947265625, "loss_aux_layer_1": 0.0460205078125, "loss_aux_layer_10": 0.073974609375, "loss_aux_layer_11": 0.0789794921875, "loss_aux_layer_12": 0.0845947265625, "loss_aux_layer_13": 0.091552734375, "loss_aux_layer_14": 0.10205078125, "loss_aux_layer_15": 0.1124267578125, "loss_aux_layer_16": 0.123046875, "loss_aux_layer_17": 0.130859375, "loss_aux_layer_18": 0.138671875, "loss_aux_layer_19": 0.1416015625, "loss_aux_layer_2": 0.0592041015625, "loss_aux_layer_20": 0.149169921875, "loss_aux_layer_21": 0.15625, "loss_aux_layer_22": 0.177978515625, "loss_aux_layer_23": 0.217041015625, "loss_aux_layer_3": 0.0703125, "loss_aux_layer_4": 0.0731201171875, "loss_aux_layer_5": 0.0753173828125, "loss_aux_layer_6": 0.078125, "loss_aux_layer_7": 0.075439453125, "loss_aux_layer_8": 0.074462890625, "loss_aux_layer_9": 0.07275390625, "step": 1897, "total_loss": 0.7310876846313477 }, { "epoch": 0.37576717481686794, "grad_norm": 0.8388562202453613, "learning_rate": 5e-05, "llm_loss": 0.4933878406882286, "loss": 2.3788, "loss_aux_layer_0": 0.02191162109375, "loss_aux_layer_1": 0.04498291015625, "loss_aux_layer_10": 0.0758056640625, "loss_aux_layer_11": 0.08056640625, "loss_aux_layer_12": 0.0865478515625, "loss_aux_layer_13": 0.093505859375, "loss_aux_layer_14": 0.103515625, "loss_aux_layer_15": 0.113037109375, "loss_aux_layer_16": 0.123291015625, "loss_aux_layer_17": 0.130859375, "loss_aux_layer_18": 0.13916015625, "loss_aux_layer_19": 0.1416015625, "loss_aux_layer_2": 0.0577392578125, "loss_aux_layer_20": 0.149169921875, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.17578125, "loss_aux_layer_23": 0.21435546875, "loss_aux_layer_3": 0.0697021484375, "loss_aux_layer_4": 0.0731201171875, "loss_aux_layer_5": 0.0751953125, "loss_aux_layer_6": 0.0791015625, "loss_aux_layer_7": 0.076416015625, "loss_aux_layer_8": 0.07568359375, "loss_aux_layer_9": 0.07421875, "step": 1898, "total_loss": 0.5947038233280182 }, { "epoch": 0.3759651554147693, "grad_norm": 1.1861474514007568, "learning_rate": 5e-05, "llm_loss": 0.6460777372121811, "loss": 2.9996, "loss_aux_layer_0": 0.02593994140625, "loss_aux_layer_1": 0.048828125, "loss_aux_layer_10": 0.077392578125, "loss_aux_layer_11": 0.08251953125, "loss_aux_layer_12": 0.088623046875, "loss_aux_layer_13": 0.095458984375, "loss_aux_layer_14": 0.1053466796875, "loss_aux_layer_15": 0.115478515625, "loss_aux_layer_16": 0.1251220703125, "loss_aux_layer_17": 0.132568359375, "loss_aux_layer_18": 0.140625, "loss_aux_layer_19": 0.143310546875, "loss_aux_layer_2": 0.0616455078125, "loss_aux_layer_20": 0.150634765625, "loss_aux_layer_21": 0.158203125, "loss_aux_layer_22": 0.179931640625, "loss_aux_layer_23": 0.2197265625, "loss_aux_layer_3": 0.072998046875, "loss_aux_layer_4": 0.0760498046875, "loss_aux_layer_5": 0.0780029296875, "loss_aux_layer_6": 0.0806884765625, "loss_aux_layer_7": 0.078125, "loss_aux_layer_8": 0.0775146484375, "loss_aux_layer_9": 0.076171875, "step": 1899, "total_loss": 0.7498959749937057 }, { "epoch": 0.37616313601267076, "grad_norm": 1.3613736629486084, "learning_rate": 5e-05, "llm_loss": 0.5558721199631691, "loss": 2.6306, "loss_aux_layer_0": 0.021575927734375, "loss_aux_layer_1": 0.04718017578125, "loss_aux_layer_10": 0.0755615234375, "loss_aux_layer_11": 0.0806884765625, "loss_aux_layer_12": 0.0860595703125, "loss_aux_layer_13": 0.0928955078125, "loss_aux_layer_14": 0.1026611328125, "loss_aux_layer_15": 0.1126708984375, "loss_aux_layer_16": 0.123046875, "loss_aux_layer_17": 0.13037109375, "loss_aux_layer_18": 0.138427734375, "loss_aux_layer_19": 0.14111328125, "loss_aux_layer_2": 0.06005859375, "loss_aux_layer_20": 0.14892578125, "loss_aux_layer_21": 0.15673828125, "loss_aux_layer_22": 0.178466796875, "loss_aux_layer_23": 0.2177734375, "loss_aux_layer_3": 0.0716552734375, "loss_aux_layer_4": 0.0745849609375, "loss_aux_layer_5": 0.076416015625, "loss_aux_layer_6": 0.079345703125, "loss_aux_layer_7": 0.0767822265625, "loss_aux_layer_8": 0.0755615234375, "loss_aux_layer_9": 0.073974609375, "step": 1900, "total_loss": 0.6576617360115051 }, { "epoch": 0.37636111661057214, "grad_norm": 1.7127487659454346, "learning_rate": 5e-05, "llm_loss": 0.5866359919309616, "loss": 2.7378, "loss_aux_layer_0": 0.022430419921875, "loss_aux_layer_1": 0.04339599609375, "loss_aux_layer_10": 0.0709228515625, "loss_aux_layer_11": 0.075439453125, "loss_aux_layer_12": 0.08056640625, "loss_aux_layer_13": 0.0869140625, "loss_aux_layer_14": 0.0966796875, "loss_aux_layer_15": 0.1068115234375, "loss_aux_layer_16": 0.1177978515625, "loss_aux_layer_17": 0.1268310546875, "loss_aux_layer_18": 0.135986328125, "loss_aux_layer_19": 0.14013671875, "loss_aux_layer_2": 0.055419921875, "loss_aux_layer_20": 0.148193359375, "loss_aux_layer_21": 0.155517578125, "loss_aux_layer_22": 0.176025390625, "loss_aux_layer_23": 0.21484375, "loss_aux_layer_3": 0.06634521484375, "loss_aux_layer_4": 0.0693359375, "loss_aux_layer_5": 0.0714111328125, "loss_aux_layer_6": 0.07421875, "loss_aux_layer_7": 0.07177734375, "loss_aux_layer_8": 0.07080078125, "loss_aux_layer_9": 0.0693359375, "step": 1901, "total_loss": 0.6844564825296402 }, { "epoch": 0.3765590972084736, "grad_norm": 1.0188000202178955, "learning_rate": 5e-05, "llm_loss": 0.599478229880333, "loss": 2.8065, "loss_aux_layer_0": 0.022064208984375, "loss_aux_layer_1": 0.0477294921875, "loss_aux_layer_10": 0.0770263671875, "loss_aux_layer_11": 0.0819091796875, "loss_aux_layer_12": 0.0875244140625, "loss_aux_layer_13": 0.09375, "loss_aux_layer_14": 0.1033935546875, "loss_aux_layer_15": 0.1129150390625, "loss_aux_layer_16": 0.122802734375, "loss_aux_layer_17": 0.130615234375, "loss_aux_layer_18": 0.13916015625, "loss_aux_layer_19": 0.14111328125, "loss_aux_layer_2": 0.06109619140625, "loss_aux_layer_20": 0.1474609375, "loss_aux_layer_21": 0.155029296875, "loss_aux_layer_22": 0.17626953125, "loss_aux_layer_23": 0.214111328125, "loss_aux_layer_3": 0.07275390625, "loss_aux_layer_4": 0.0758056640625, "loss_aux_layer_5": 0.0777587890625, "loss_aux_layer_6": 0.0804443359375, "loss_aux_layer_7": 0.077880859375, "loss_aux_layer_8": 0.0772705078125, "loss_aux_layer_9": 0.07568359375, "step": 1902, "total_loss": 0.7016210556030273 }, { "epoch": 0.37675707780637496, "grad_norm": 1.9081733226776123, "learning_rate": 5e-05, "llm_loss": 0.5782787650823593, "loss": 2.7107, "loss_aux_layer_0": 0.022705078125, "loss_aux_layer_1": 0.0447998046875, "loss_aux_layer_10": 0.0723876953125, "loss_aux_layer_11": 0.0770263671875, "loss_aux_layer_12": 0.0826416015625, "loss_aux_layer_13": 0.0887451171875, "loss_aux_layer_14": 0.0982666015625, "loss_aux_layer_15": 0.108154296875, "loss_aux_layer_16": 0.11865234375, "loss_aux_layer_17": 0.1263427734375, "loss_aux_layer_18": 0.1357421875, "loss_aux_layer_19": 0.1396484375, "loss_aux_layer_2": 0.0576171875, "loss_aux_layer_20": 0.147216796875, "loss_aux_layer_21": 0.156005859375, "loss_aux_layer_22": 0.179443359375, "loss_aux_layer_23": 0.2216796875, "loss_aux_layer_3": 0.0689697265625, "loss_aux_layer_4": 0.071533203125, "loss_aux_layer_5": 0.0736083984375, "loss_aux_layer_6": 0.076171875, "loss_aux_layer_7": 0.0736083984375, "loss_aux_layer_8": 0.0726318359375, "loss_aux_layer_9": 0.0711669921875, "step": 1903, "total_loss": 0.6776814758777618 }, { "epoch": 0.3769550584042764, "grad_norm": 0.998802661895752, "learning_rate": 5e-05, "llm_loss": 0.5148710533976555, "loss": 2.4656, "loss_aux_layer_0": 0.021942138671875, "loss_aux_layer_1": 0.0460205078125, "loss_aux_layer_10": 0.0745849609375, "loss_aux_layer_11": 0.079345703125, "loss_aux_layer_12": 0.0848388671875, "loss_aux_layer_13": 0.0919189453125, "loss_aux_layer_14": 0.1021728515625, "loss_aux_layer_15": 0.11181640625, "loss_aux_layer_16": 0.122314453125, "loss_aux_layer_17": 0.1298828125, "loss_aux_layer_18": 0.138427734375, "loss_aux_layer_19": 0.14111328125, "loss_aux_layer_2": 0.05963134765625, "loss_aux_layer_20": 0.149658203125, "loss_aux_layer_21": 0.157958984375, "loss_aux_layer_22": 0.180419921875, "loss_aux_layer_23": 0.22119140625, "loss_aux_layer_3": 0.070556640625, "loss_aux_layer_4": 0.073974609375, "loss_aux_layer_5": 0.076171875, "loss_aux_layer_6": 0.0787353515625, "loss_aux_layer_7": 0.0758056640625, "loss_aux_layer_8": 0.07470703125, "loss_aux_layer_9": 0.0733642578125, "step": 1904, "total_loss": 0.6163946092128754 }, { "epoch": 0.3771530390021778, "grad_norm": 1.1610926389694214, "learning_rate": 5e-05, "llm_loss": 0.5832119733095169, "loss": 2.7512, "loss_aux_layer_0": 0.023193359375, "loss_aux_layer_1": 0.0479736328125, "loss_aux_layer_10": 0.0791015625, "loss_aux_layer_11": 0.084228515625, "loss_aux_layer_12": 0.0894775390625, "loss_aux_layer_13": 0.095947265625, "loss_aux_layer_14": 0.1060791015625, "loss_aux_layer_15": 0.1158447265625, "loss_aux_layer_16": 0.1260986328125, "loss_aux_layer_17": 0.1328125, "loss_aux_layer_18": 0.14111328125, "loss_aux_layer_19": 0.143310546875, "loss_aux_layer_2": 0.062744140625, "loss_aux_layer_20": 0.150634765625, "loss_aux_layer_21": 0.158203125, "loss_aux_layer_22": 0.180419921875, "loss_aux_layer_23": 0.218994140625, "loss_aux_layer_3": 0.074951171875, "loss_aux_layer_4": 0.078369140625, "loss_aux_layer_5": 0.0802001953125, "loss_aux_layer_6": 0.083251953125, "loss_aux_layer_7": 0.08056640625, "loss_aux_layer_8": 0.0794677734375, "loss_aux_layer_9": 0.077880859375, "step": 1905, "total_loss": 0.6878015995025635 }, { "epoch": 0.3773510196000792, "grad_norm": 1.1010513305664062, "learning_rate": 5e-05, "llm_loss": 0.5640187710523605, "loss": 2.6639, "loss_aux_layer_0": 0.02117919921875, "loss_aux_layer_1": 0.04705810546875, "loss_aux_layer_10": 0.0772705078125, "loss_aux_layer_11": 0.0821533203125, "loss_aux_layer_12": 0.0875244140625, "loss_aux_layer_13": 0.093994140625, "loss_aux_layer_14": 0.1029052734375, "loss_aux_layer_15": 0.112060546875, "loss_aux_layer_16": 0.1219482421875, "loss_aux_layer_17": 0.1292724609375, "loss_aux_layer_18": 0.136962890625, "loss_aux_layer_19": 0.139404296875, "loss_aux_layer_2": 0.06072998046875, "loss_aux_layer_20": 0.14697265625, "loss_aux_layer_21": 0.155029296875, "loss_aux_layer_22": 0.177734375, "loss_aux_layer_23": 0.21630859375, "loss_aux_layer_3": 0.0726318359375, "loss_aux_layer_4": 0.076171875, "loss_aux_layer_5": 0.0777587890625, "loss_aux_layer_6": 0.0809326171875, "loss_aux_layer_7": 0.0782470703125, "loss_aux_layer_8": 0.077392578125, "loss_aux_layer_9": 0.0760498046875, "step": 1906, "total_loss": 0.6659787893295288 }, { "epoch": 0.3775490001979806, "grad_norm": 1.2519989013671875, "learning_rate": 5e-05, "llm_loss": 0.6044654995203018, "loss": 2.8117, "loss_aux_layer_0": 0.021636962890625, "loss_aux_layer_1": 0.0433349609375, "loss_aux_layer_10": 0.07177734375, "loss_aux_layer_11": 0.0760498046875, "loss_aux_layer_12": 0.0816650390625, "loss_aux_layer_13": 0.0882568359375, "loss_aux_layer_14": 0.098388671875, "loss_aux_layer_15": 0.1085205078125, "loss_aux_layer_16": 0.119384765625, "loss_aux_layer_17": 0.127197265625, "loss_aux_layer_18": 0.136474609375, "loss_aux_layer_19": 0.139892578125, "loss_aux_layer_2": 0.05615234375, "loss_aux_layer_20": 0.14794921875, "loss_aux_layer_21": 0.1552734375, "loss_aux_layer_22": 0.17578125, "loss_aux_layer_23": 0.21533203125, "loss_aux_layer_3": 0.06732177734375, "loss_aux_layer_4": 0.0704345703125, "loss_aux_layer_5": 0.072509765625, "loss_aux_layer_6": 0.0751953125, "loss_aux_layer_7": 0.0723876953125, "loss_aux_layer_8": 0.0714111328125, "loss_aux_layer_9": 0.0703125, "step": 1907, "total_loss": 0.7029213905334473 }, { "epoch": 0.377746980795882, "grad_norm": 1.0328236818313599, "learning_rate": 5e-05, "llm_loss": 0.6196263879537582, "loss": 2.8816, "loss_aux_layer_0": 0.021453857421875, "loss_aux_layer_1": 0.0450439453125, "loss_aux_layer_10": 0.074951171875, "loss_aux_layer_11": 0.0797119140625, "loss_aux_layer_12": 0.085205078125, "loss_aux_layer_13": 0.0919189453125, "loss_aux_layer_14": 0.1016845703125, "loss_aux_layer_15": 0.111572265625, "loss_aux_layer_16": 0.1221923828125, "loss_aux_layer_17": 0.130126953125, "loss_aux_layer_18": 0.13916015625, "loss_aux_layer_19": 0.141357421875, "loss_aux_layer_2": 0.0579833984375, "loss_aux_layer_20": 0.1494140625, "loss_aux_layer_21": 0.156494140625, "loss_aux_layer_22": 0.177001953125, "loss_aux_layer_23": 0.216064453125, "loss_aux_layer_3": 0.0694580078125, "loss_aux_layer_4": 0.0726318359375, "loss_aux_layer_5": 0.0750732421875, "loss_aux_layer_6": 0.078125, "loss_aux_layer_7": 0.0751953125, "loss_aux_layer_8": 0.074462890625, "loss_aux_layer_9": 0.073486328125, "step": 1908, "total_loss": 0.7204111814498901 }, { "epoch": 0.3779449613937834, "grad_norm": 1.284481167793274, "learning_rate": 5e-05, "llm_loss": 0.6477325111627579, "loss": 2.9977, "loss_aux_layer_0": 0.024139404296875, "loss_aux_layer_1": 0.0465087890625, "loss_aux_layer_10": 0.0743408203125, "loss_aux_layer_11": 0.0792236328125, "loss_aux_layer_12": 0.084716796875, "loss_aux_layer_13": 0.09130859375, "loss_aux_layer_14": 0.1016845703125, "loss_aux_layer_15": 0.1116943359375, "loss_aux_layer_16": 0.1226806640625, "loss_aux_layer_17": 0.131103515625, "loss_aux_layer_18": 0.140869140625, "loss_aux_layer_19": 0.144287109375, "loss_aux_layer_2": 0.05816650390625, "loss_aux_layer_20": 0.151611328125, "loss_aux_layer_21": 0.160400390625, "loss_aux_layer_22": 0.181396484375, "loss_aux_layer_23": 0.2216796875, "loss_aux_layer_3": 0.0697021484375, "loss_aux_layer_4": 0.0723876953125, "loss_aux_layer_5": 0.073974609375, "loss_aux_layer_6": 0.0772705078125, "loss_aux_layer_7": 0.0748291015625, "loss_aux_layer_8": 0.07421875, "loss_aux_layer_9": 0.0733642578125, "step": 1909, "total_loss": 0.7494372576475143 }, { "epoch": 0.3781429419916848, "grad_norm": 1.2643799781799316, "learning_rate": 5e-05, "llm_loss": 0.6719421893358231, "loss": 3.0953, "loss_aux_layer_0": 0.021759033203125, "loss_aux_layer_1": 0.046142578125, "loss_aux_layer_10": 0.07666015625, "loss_aux_layer_11": 0.0814208984375, "loss_aux_layer_12": 0.0870361328125, "loss_aux_layer_13": 0.0931396484375, "loss_aux_layer_14": 0.102783203125, "loss_aux_layer_15": 0.1121826171875, "loss_aux_layer_16": 0.12158203125, "loss_aux_layer_17": 0.1295166015625, "loss_aux_layer_18": 0.137451171875, "loss_aux_layer_19": 0.140380859375, "loss_aux_layer_2": 0.059814453125, "loss_aux_layer_20": 0.1474609375, "loss_aux_layer_21": 0.15478515625, "loss_aux_layer_22": 0.17724609375, "loss_aux_layer_23": 0.2177734375, "loss_aux_layer_3": 0.07244873046875, "loss_aux_layer_4": 0.0760498046875, "loss_aux_layer_5": 0.0777587890625, "loss_aux_layer_6": 0.0809326171875, "loss_aux_layer_7": 0.078369140625, "loss_aux_layer_8": 0.0771484375, "loss_aux_layer_9": 0.07568359375, "step": 1910, "total_loss": 0.7738326191902161 }, { "epoch": 0.37834092258958624, "grad_norm": 1.0209242105484009, "learning_rate": 5e-05, "llm_loss": 0.6675977408885956, "loss": 3.0709, "loss_aux_layer_0": 0.021636962890625, "loss_aux_layer_1": 0.0439453125, "loss_aux_layer_10": 0.073486328125, "loss_aux_layer_11": 0.078125, "loss_aux_layer_12": 0.083740234375, "loss_aux_layer_13": 0.0902099609375, "loss_aux_layer_14": 0.1007080078125, "loss_aux_layer_15": 0.1104736328125, "loss_aux_layer_16": 0.1207275390625, "loss_aux_layer_17": 0.12939453125, "loss_aux_layer_18": 0.137939453125, "loss_aux_layer_19": 0.140625, "loss_aux_layer_2": 0.0574951171875, "loss_aux_layer_20": 0.1484375, "loss_aux_layer_21": 0.156494140625, "loss_aux_layer_22": 0.17822265625, "loss_aux_layer_23": 0.218505859375, "loss_aux_layer_3": 0.0693359375, "loss_aux_layer_4": 0.0723876953125, "loss_aux_layer_5": 0.0743408203125, "loss_aux_layer_6": 0.0772705078125, "loss_aux_layer_7": 0.074462890625, "loss_aux_layer_8": 0.0733642578125, "loss_aux_layer_9": 0.0721435546875, "step": 1911, "total_loss": 0.7677211463451385 }, { "epoch": 0.3785389031874876, "grad_norm": 1.7619816064834595, "learning_rate": 5e-05, "llm_loss": 0.6271891444921494, "loss": 2.928, "loss_aux_layer_0": 0.0234375, "loss_aux_layer_1": 0.0477294921875, "loss_aux_layer_10": 0.0772705078125, "loss_aux_layer_11": 0.0823974609375, "loss_aux_layer_12": 0.0888671875, "loss_aux_layer_13": 0.0958251953125, "loss_aux_layer_14": 0.1063232421875, "loss_aux_layer_15": 0.116943359375, "loss_aux_layer_16": 0.1275634765625, "loss_aux_layer_17": 0.135498046875, "loss_aux_layer_18": 0.14501953125, "loss_aux_layer_19": 0.147216796875, "loss_aux_layer_2": 0.06097412109375, "loss_aux_layer_20": 0.154296875, "loss_aux_layer_21": 0.16064453125, "loss_aux_layer_22": 0.181884765625, "loss_aux_layer_23": 0.221435546875, "loss_aux_layer_3": 0.0732421875, "loss_aux_layer_4": 0.0762939453125, "loss_aux_layer_5": 0.078125, "loss_aux_layer_6": 0.081298828125, "loss_aux_layer_7": 0.0784912109375, "loss_aux_layer_8": 0.0775146484375, "loss_aux_layer_9": 0.076171875, "step": 1912, "total_loss": 0.7319896817207336 }, { "epoch": 0.37873688378538906, "grad_norm": 1.2849677801132202, "learning_rate": 5e-05, "llm_loss": 0.59633419662714, "loss": 2.7681, "loss_aux_layer_0": 0.02191162109375, "loss_aux_layer_1": 0.04266357421875, "loss_aux_layer_10": 0.0697021484375, "loss_aux_layer_11": 0.07421875, "loss_aux_layer_12": 0.0799560546875, "loss_aux_layer_13": 0.086669921875, "loss_aux_layer_14": 0.0963134765625, "loss_aux_layer_15": 0.10595703125, "loss_aux_layer_16": 0.1163330078125, "loss_aux_layer_17": 0.123779296875, "loss_aux_layer_18": 0.1328125, "loss_aux_layer_19": 0.1357421875, "loss_aux_layer_2": 0.05517578125, "loss_aux_layer_20": 0.142822265625, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.0657958984375, "loss_aux_layer_4": 0.0684814453125, "loss_aux_layer_5": 0.070068359375, "loss_aux_layer_6": 0.0731201171875, "loss_aux_layer_7": 0.0706787109375, "loss_aux_layer_8": 0.0699462890625, "loss_aux_layer_9": 0.06884765625, "step": 1913, "total_loss": 0.6920303106307983 }, { "epoch": 0.37893486438329044, "grad_norm": 1.3966240882873535, "learning_rate": 5e-05, "llm_loss": 0.6631688475608826, "loss": 3.0416, "loss_aux_layer_0": 0.02203369140625, "loss_aux_layer_1": 0.04339599609375, "loss_aux_layer_10": 0.0699462890625, "loss_aux_layer_11": 0.0743408203125, "loss_aux_layer_12": 0.080078125, "loss_aux_layer_13": 0.0870361328125, "loss_aux_layer_14": 0.097412109375, "loss_aux_layer_15": 0.1077880859375, "loss_aux_layer_16": 0.11865234375, "loss_aux_layer_17": 0.1270751953125, "loss_aux_layer_18": 0.13671875, "loss_aux_layer_19": 0.14013671875, "loss_aux_layer_2": 0.0545654296875, "loss_aux_layer_20": 0.14794921875, "loss_aux_layer_21": 0.15478515625, "loss_aux_layer_22": 0.174560546875, "loss_aux_layer_23": 0.214111328125, "loss_aux_layer_3": 0.0648193359375, "loss_aux_layer_4": 0.06793212890625, "loss_aux_layer_5": 0.06964111328125, "loss_aux_layer_6": 0.072509765625, "loss_aux_layer_7": 0.0703125, "loss_aux_layer_8": 0.06951904296875, "loss_aux_layer_9": 0.068359375, "step": 1914, "total_loss": 0.7603981494903564 }, { "epoch": 0.3791328449811918, "grad_norm": 1.790451169013977, "learning_rate": 5e-05, "llm_loss": 0.5659500807523727, "loss": 2.6832, "loss_aux_layer_0": 0.023468017578125, "loss_aux_layer_1": 0.04888916015625, "loss_aux_layer_10": 0.078369140625, "loss_aux_layer_11": 0.08349609375, "loss_aux_layer_12": 0.0892333984375, "loss_aux_layer_13": 0.095947265625, "loss_aux_layer_14": 0.105712890625, "loss_aux_layer_15": 0.115234375, "loss_aux_layer_16": 0.125, "loss_aux_layer_17": 0.1324462890625, "loss_aux_layer_18": 0.140869140625, "loss_aux_layer_19": 0.143798828125, "loss_aux_layer_2": 0.0648193359375, "loss_aux_layer_20": 0.151123046875, "loss_aux_layer_21": 0.159912109375, "loss_aux_layer_22": 0.18359375, "loss_aux_layer_23": 0.224365234375, "loss_aux_layer_3": 0.07470703125, "loss_aux_layer_4": 0.077880859375, "loss_aux_layer_5": 0.0794677734375, "loss_aux_layer_6": 0.08251953125, "loss_aux_layer_7": 0.0794677734375, "loss_aux_layer_8": 0.07861328125, "loss_aux_layer_9": 0.076904296875, "step": 1915, "total_loss": 0.6708084642887115 }, { "epoch": 0.37933082557909326, "grad_norm": 1.1851118803024292, "learning_rate": 5e-05, "llm_loss": 0.5892174392938614, "loss": 2.7756, "loss_aux_layer_0": 0.021209716796875, "loss_aux_layer_1": 0.048828125, "loss_aux_layer_10": 0.0791015625, "loss_aux_layer_11": 0.0841064453125, "loss_aux_layer_12": 0.089599609375, "loss_aux_layer_13": 0.0960693359375, "loss_aux_layer_14": 0.10546875, "loss_aux_layer_15": 0.1163330078125, "loss_aux_layer_16": 0.127197265625, "loss_aux_layer_17": 0.134765625, "loss_aux_layer_18": 0.14306640625, "loss_aux_layer_19": 0.1455078125, "loss_aux_layer_2": 0.06182861328125, "loss_aux_layer_20": 0.15234375, "loss_aux_layer_21": 0.1591796875, "loss_aux_layer_22": 0.180908203125, "loss_aux_layer_23": 0.219482421875, "loss_aux_layer_3": 0.07373046875, "loss_aux_layer_4": 0.0770263671875, "loss_aux_layer_5": 0.0792236328125, "loss_aux_layer_6": 0.0821533203125, "loss_aux_layer_7": 0.07958984375, "loss_aux_layer_8": 0.078857421875, "loss_aux_layer_9": 0.0775146484375, "step": 1916, "total_loss": 0.6938997358083725 }, { "epoch": 0.37952880617699464, "grad_norm": 1.4097179174423218, "learning_rate": 5e-05, "llm_loss": 0.7082057446241379, "loss": 3.2399, "loss_aux_layer_0": 0.02288818359375, "loss_aux_layer_1": 0.04833984375, "loss_aux_layer_10": 0.076416015625, "loss_aux_layer_11": 0.0811767578125, "loss_aux_layer_12": 0.0867919921875, "loss_aux_layer_13": 0.09326171875, "loss_aux_layer_14": 0.1029052734375, "loss_aux_layer_15": 0.112548828125, "loss_aux_layer_16": 0.12255859375, "loss_aux_layer_17": 0.13037109375, "loss_aux_layer_18": 0.138427734375, "loss_aux_layer_19": 0.140869140625, "loss_aux_layer_2": 0.060546875, "loss_aux_layer_20": 0.148193359375, "loss_aux_layer_21": 0.1552734375, "loss_aux_layer_22": 0.176513671875, "loss_aux_layer_23": 0.215087890625, "loss_aux_layer_3": 0.0711669921875, "loss_aux_layer_4": 0.074462890625, "loss_aux_layer_5": 0.0765380859375, "loss_aux_layer_6": 0.0797119140625, "loss_aux_layer_7": 0.0771484375, "loss_aux_layer_8": 0.076171875, "loss_aux_layer_9": 0.0750732421875, "step": 1917, "total_loss": 0.8099818676710129 }, { "epoch": 0.3797267867748961, "grad_norm": 0.9215584993362427, "learning_rate": 5e-05, "llm_loss": 0.6412013322114944, "loss": 2.9666, "loss_aux_layer_0": 0.02423095703125, "loss_aux_layer_1": 0.04754638671875, "loss_aux_layer_10": 0.0750732421875, "loss_aux_layer_11": 0.0794677734375, "loss_aux_layer_12": 0.084716796875, "loss_aux_layer_13": 0.0908203125, "loss_aux_layer_14": 0.100830078125, "loss_aux_layer_15": 0.110595703125, "loss_aux_layer_16": 0.1207275390625, "loss_aux_layer_17": 0.12841796875, "loss_aux_layer_18": 0.136474609375, "loss_aux_layer_19": 0.139892578125, "loss_aux_layer_2": 0.05926513671875, "loss_aux_layer_20": 0.146728515625, "loss_aux_layer_21": 0.154296875, "loss_aux_layer_22": 0.174560546875, "loss_aux_layer_23": 0.214111328125, "loss_aux_layer_3": 0.070556640625, "loss_aux_layer_4": 0.0733642578125, "loss_aux_layer_5": 0.0753173828125, "loss_aux_layer_6": 0.0787353515625, "loss_aux_layer_7": 0.075927734375, "loss_aux_layer_8": 0.0750732421875, "loss_aux_layer_9": 0.0736083984375, "step": 1918, "total_loss": 0.7416433840990067 }, { "epoch": 0.37992476737279746, "grad_norm": 1.3378806114196777, "learning_rate": 5e-05, "llm_loss": 0.5631663501262665, "loss": 2.6543, "loss_aux_layer_0": 0.02166748046875, "loss_aux_layer_1": 0.04656982421875, "loss_aux_layer_10": 0.0733642578125, "loss_aux_layer_11": 0.0782470703125, "loss_aux_layer_12": 0.08349609375, "loss_aux_layer_13": 0.0897216796875, "loss_aux_layer_14": 0.0999755859375, "loss_aux_layer_15": 0.1102294921875, "loss_aux_layer_16": 0.1207275390625, "loss_aux_layer_17": 0.12890625, "loss_aux_layer_18": 0.13818359375, "loss_aux_layer_19": 0.1416015625, "loss_aux_layer_2": 0.0596923828125, "loss_aux_layer_20": 0.149169921875, "loss_aux_layer_21": 0.15673828125, "loss_aux_layer_22": 0.1787109375, "loss_aux_layer_23": 0.21826171875, "loss_aux_layer_3": 0.06982421875, "loss_aux_layer_4": 0.0726318359375, "loss_aux_layer_5": 0.07421875, "loss_aux_layer_6": 0.077392578125, "loss_aux_layer_7": 0.074462890625, "loss_aux_layer_8": 0.0736083984375, "loss_aux_layer_9": 0.072021484375, "step": 1919, "total_loss": 0.6635691821575165 }, { "epoch": 0.3801227479706989, "grad_norm": 1.0503596067428589, "learning_rate": 5e-05, "llm_loss": 0.6671519577503204, "loss": 3.0802, "loss_aux_layer_0": 0.02264404296875, "loss_aux_layer_1": 0.0467529296875, "loss_aux_layer_10": 0.0760498046875, "loss_aux_layer_11": 0.080810546875, "loss_aux_layer_12": 0.0867919921875, "loss_aux_layer_13": 0.0938720703125, "loss_aux_layer_14": 0.10400390625, "loss_aux_layer_15": 0.114013671875, "loss_aux_layer_16": 0.1251220703125, "loss_aux_layer_17": 0.1328125, "loss_aux_layer_18": 0.141357421875, "loss_aux_layer_19": 0.143798828125, "loss_aux_layer_2": 0.059326171875, "loss_aux_layer_20": 0.1513671875, "loss_aux_layer_21": 0.15966796875, "loss_aux_layer_22": 0.18212890625, "loss_aux_layer_23": 0.22216796875, "loss_aux_layer_3": 0.0701904296875, "loss_aux_layer_4": 0.0736083984375, "loss_aux_layer_5": 0.0760498046875, "loss_aux_layer_6": 0.079345703125, "loss_aux_layer_7": 0.07666015625, "loss_aux_layer_8": 0.0760498046875, "loss_aux_layer_9": 0.0748291015625, "step": 1920, "total_loss": 0.7700424641370773 }, { "epoch": 0.3803207285686003, "grad_norm": 1.059885025024414, "learning_rate": 5e-05, "llm_loss": 0.647431880235672, "loss": 3.0052, "loss_aux_layer_0": 0.02130126953125, "loss_aux_layer_1": 0.04876708984375, "loss_aux_layer_10": 0.0782470703125, "loss_aux_layer_11": 0.0836181640625, "loss_aux_layer_12": 0.0894775390625, "loss_aux_layer_13": 0.096435546875, "loss_aux_layer_14": 0.1060791015625, "loss_aux_layer_15": 0.115478515625, "loss_aux_layer_16": 0.1256103515625, "loss_aux_layer_17": 0.133056640625, "loss_aux_layer_18": 0.1416015625, "loss_aux_layer_19": 0.143798828125, "loss_aux_layer_2": 0.06109619140625, "loss_aux_layer_20": 0.150146484375, "loss_aux_layer_21": 0.157470703125, "loss_aux_layer_22": 0.179443359375, "loss_aux_layer_23": 0.21923828125, "loss_aux_layer_3": 0.07275390625, "loss_aux_layer_4": 0.0760498046875, "loss_aux_layer_5": 0.078125, "loss_aux_layer_6": 0.08154296875, "loss_aux_layer_7": 0.07861328125, "loss_aux_layer_8": 0.077880859375, "loss_aux_layer_9": 0.07666015625, "step": 1921, "total_loss": 0.7513011544942856 }, { "epoch": 0.38051870916650166, "grad_norm": 1.758824110031128, "learning_rate": 5e-05, "llm_loss": 0.6453435570001602, "loss": 2.9655, "loss_aux_layer_0": 0.021514892578125, "loss_aux_layer_1": 0.04248046875, "loss_aux_layer_10": 0.069091796875, "loss_aux_layer_11": 0.073486328125, "loss_aux_layer_12": 0.0791015625, "loss_aux_layer_13": 0.0859375, "loss_aux_layer_14": 0.095947265625, "loss_aux_layer_15": 0.1058349609375, "loss_aux_layer_16": 0.1162109375, "loss_aux_layer_17": 0.12451171875, "loss_aux_layer_18": 0.134033203125, "loss_aux_layer_19": 0.1376953125, "loss_aux_layer_2": 0.0535888671875, "loss_aux_layer_20": 0.14599609375, "loss_aux_layer_21": 0.15380859375, "loss_aux_layer_22": 0.17333984375, "loss_aux_layer_23": 0.21240234375, "loss_aux_layer_3": 0.06427001953125, "loss_aux_layer_4": 0.0675048828125, "loss_aux_layer_5": 0.0694580078125, "loss_aux_layer_6": 0.0718994140625, "loss_aux_layer_7": 0.0694580078125, "loss_aux_layer_8": 0.0687255859375, "loss_aux_layer_9": 0.06787109375, "step": 1922, "total_loss": 0.7413721978664398 }, { "epoch": 0.3807166897644031, "grad_norm": 0.9566324353218079, "learning_rate": 5e-05, "llm_loss": 0.6114950776100159, "loss": 2.8565, "loss_aux_layer_0": 0.02154541015625, "loss_aux_layer_1": 0.046630859375, "loss_aux_layer_10": 0.0775146484375, "loss_aux_layer_11": 0.082763671875, "loss_aux_layer_12": 0.0887451171875, "loss_aux_layer_13": 0.09521484375, "loss_aux_layer_14": 0.10498046875, "loss_aux_layer_15": 0.1146240234375, "loss_aux_layer_16": 0.124755859375, "loss_aux_layer_17": 0.132080078125, "loss_aux_layer_18": 0.139892578125, "loss_aux_layer_19": 0.141845703125, "loss_aux_layer_2": 0.06060791015625, "loss_aux_layer_20": 0.148681640625, "loss_aux_layer_21": 0.154541015625, "loss_aux_layer_22": 0.17529296875, "loss_aux_layer_23": 0.213134765625, "loss_aux_layer_3": 0.0723876953125, "loss_aux_layer_4": 0.075927734375, "loss_aux_layer_5": 0.0780029296875, "loss_aux_layer_6": 0.081298828125, "loss_aux_layer_7": 0.07861328125, "loss_aux_layer_8": 0.077392578125, "loss_aux_layer_9": 0.076171875, "step": 1923, "total_loss": 0.7141180336475372 }, { "epoch": 0.3809146703623045, "grad_norm": 1.4866588115692139, "learning_rate": 5e-05, "llm_loss": 0.5612056702375412, "loss": 2.627, "loss_aux_layer_0": 0.0230712890625, "loss_aux_layer_1": 0.04266357421875, "loss_aux_layer_10": 0.06884765625, "loss_aux_layer_11": 0.0733642578125, "loss_aux_layer_12": 0.0787353515625, "loss_aux_layer_13": 0.0845947265625, "loss_aux_layer_14": 0.0938720703125, "loss_aux_layer_15": 0.103759765625, "loss_aux_layer_16": 0.114501953125, "loss_aux_layer_17": 0.1229248046875, "loss_aux_layer_18": 0.132080078125, "loss_aux_layer_19": 0.136474609375, "loss_aux_layer_2": 0.054931640625, "loss_aux_layer_20": 0.144775390625, "loss_aux_layer_21": 0.15234375, "loss_aux_layer_22": 0.172607421875, "loss_aux_layer_23": 0.212158203125, "loss_aux_layer_3": 0.0648193359375, "loss_aux_layer_4": 0.06787109375, "loss_aux_layer_5": 0.070068359375, "loss_aux_layer_6": 0.07275390625, "loss_aux_layer_7": 0.0697021484375, "loss_aux_layer_8": 0.06884765625, "loss_aux_layer_9": 0.0675048828125, "step": 1924, "total_loss": 0.6567603349685669 }, { "epoch": 0.3811126509602059, "grad_norm": 0.9897788763046265, "learning_rate": 5e-05, "llm_loss": 0.65833979845047, "loss": 3.0358, "loss_aux_layer_0": 0.022125244140625, "loss_aux_layer_1": 0.0462646484375, "loss_aux_layer_10": 0.0748291015625, "loss_aux_layer_11": 0.0794677734375, "loss_aux_layer_12": 0.0850830078125, "loss_aux_layer_13": 0.0916748046875, "loss_aux_layer_14": 0.1015625, "loss_aux_layer_15": 0.111572265625, "loss_aux_layer_16": 0.1217041015625, "loss_aux_layer_17": 0.1297607421875, "loss_aux_layer_18": 0.137939453125, "loss_aux_layer_19": 0.140380859375, "loss_aux_layer_2": 0.05841064453125, "loss_aux_layer_20": 0.14794921875, "loss_aux_layer_21": 0.155029296875, "loss_aux_layer_22": 0.17626953125, "loss_aux_layer_23": 0.214599609375, "loss_aux_layer_3": 0.0697021484375, "loss_aux_layer_4": 0.072998046875, "loss_aux_layer_5": 0.074951171875, "loss_aux_layer_6": 0.078369140625, "loss_aux_layer_7": 0.07568359375, "loss_aux_layer_8": 0.0745849609375, "loss_aux_layer_9": 0.073486328125, "step": 1925, "total_loss": 0.7589611858129501 }, { "epoch": 0.3813106315581073, "grad_norm": 1.3796886205673218, "learning_rate": 5e-05, "llm_loss": 0.5991089269518852, "loss": 2.792, "loss_aux_layer_0": 0.021759033203125, "loss_aux_layer_1": 0.04449462890625, "loss_aux_layer_10": 0.072265625, "loss_aux_layer_11": 0.076904296875, "loss_aux_layer_12": 0.0826416015625, "loss_aux_layer_13": 0.08935546875, "loss_aux_layer_14": 0.0994873046875, "loss_aux_layer_15": 0.1094970703125, "loss_aux_layer_16": 0.1201171875, "loss_aux_layer_17": 0.1279296875, "loss_aux_layer_18": 0.13671875, "loss_aux_layer_19": 0.14013671875, "loss_aux_layer_2": 0.056884765625, "loss_aux_layer_20": 0.1474609375, "loss_aux_layer_21": 0.1552734375, "loss_aux_layer_22": 0.17626953125, "loss_aux_layer_23": 0.21533203125, "loss_aux_layer_3": 0.06695556640625, "loss_aux_layer_4": 0.070068359375, "loss_aux_layer_5": 0.072509765625, "loss_aux_layer_6": 0.0755615234375, "loss_aux_layer_7": 0.0728759765625, "loss_aux_layer_8": 0.072021484375, "loss_aux_layer_9": 0.07080078125, "step": 1926, "total_loss": 0.6979939490556717 }, { "epoch": 0.38150861215600873, "grad_norm": 0.8103665113449097, "learning_rate": 5e-05, "llm_loss": 0.622363954782486, "loss": 2.8743, "loss_aux_layer_0": 0.021514892578125, "loss_aux_layer_1": 0.04388427734375, "loss_aux_layer_10": 0.07080078125, "loss_aux_layer_11": 0.0753173828125, "loss_aux_layer_12": 0.0804443359375, "loss_aux_layer_13": 0.086669921875, "loss_aux_layer_14": 0.0955810546875, "loss_aux_layer_15": 0.1051025390625, "loss_aux_layer_16": 0.114990234375, "loss_aux_layer_17": 0.1226806640625, "loss_aux_layer_18": 0.131591796875, "loss_aux_layer_19": 0.134765625, "loss_aux_layer_2": 0.05621337890625, "loss_aux_layer_20": 0.14208984375, "loss_aux_layer_21": 0.149658203125, "loss_aux_layer_22": 0.1708984375, "loss_aux_layer_23": 0.210205078125, "loss_aux_layer_3": 0.067138671875, "loss_aux_layer_4": 0.0699462890625, "loss_aux_layer_5": 0.0721435546875, "loss_aux_layer_6": 0.0748291015625, "loss_aux_layer_7": 0.072021484375, "loss_aux_layer_8": 0.0712890625, "loss_aux_layer_9": 0.0697021484375, "step": 1927, "total_loss": 0.7185804843902588 }, { "epoch": 0.3817065927539101, "grad_norm": 1.1844482421875, "learning_rate": 5e-05, "llm_loss": 0.5901478379964828, "loss": 2.7532, "loss_aux_layer_0": 0.02276611328125, "loss_aux_layer_1": 0.044677734375, "loss_aux_layer_10": 0.07080078125, "loss_aux_layer_11": 0.0755615234375, "loss_aux_layer_12": 0.081787109375, "loss_aux_layer_13": 0.088623046875, "loss_aux_layer_14": 0.0985107421875, "loss_aux_layer_15": 0.108642578125, "loss_aux_layer_16": 0.1190185546875, "loss_aux_layer_17": 0.127197265625, "loss_aux_layer_18": 0.136474609375, "loss_aux_layer_19": 0.139404296875, "loss_aux_layer_2": 0.05596923828125, "loss_aux_layer_20": 0.14697265625, "loss_aux_layer_21": 0.154541015625, "loss_aux_layer_22": 0.1767578125, "loss_aux_layer_23": 0.218017578125, "loss_aux_layer_3": 0.066162109375, "loss_aux_layer_4": 0.0689697265625, "loss_aux_layer_5": 0.0709228515625, "loss_aux_layer_6": 0.073974609375, "loss_aux_layer_7": 0.0711669921875, "loss_aux_layer_8": 0.0703125, "loss_aux_layer_9": 0.0693359375, "step": 1928, "total_loss": 0.6883074790239334 }, { "epoch": 0.38190457335181155, "grad_norm": 1.1812553405761719, "learning_rate": 5e-05, "llm_loss": 0.6625460684299469, "loss": 3.0493, "loss_aux_layer_0": 0.02105712890625, "loss_aux_layer_1": 0.04486083984375, "loss_aux_layer_10": 0.07373046875, "loss_aux_layer_11": 0.0787353515625, "loss_aux_layer_12": 0.0845947265625, "loss_aux_layer_13": 0.0908203125, "loss_aux_layer_14": 0.100830078125, "loss_aux_layer_15": 0.1102294921875, "loss_aux_layer_16": 0.1201171875, "loss_aux_layer_17": 0.128173828125, "loss_aux_layer_18": 0.136474609375, "loss_aux_layer_19": 0.139404296875, "loss_aux_layer_2": 0.0579833984375, "loss_aux_layer_20": 0.147705078125, "loss_aux_layer_21": 0.1552734375, "loss_aux_layer_22": 0.176513671875, "loss_aux_layer_23": 0.21484375, "loss_aux_layer_3": 0.0693359375, "loss_aux_layer_4": 0.072265625, "loss_aux_layer_5": 0.073974609375, "loss_aux_layer_6": 0.0771484375, "loss_aux_layer_7": 0.0743408203125, "loss_aux_layer_8": 0.0733642578125, "loss_aux_layer_9": 0.072021484375, "step": 1929, "total_loss": 0.7623129785060883 }, { "epoch": 0.38210255394971293, "grad_norm": 1.1407541036605835, "learning_rate": 5e-05, "llm_loss": 0.6482900977134705, "loss": 3.0108, "loss_aux_layer_0": 0.023406982421875, "loss_aux_layer_1": 0.049560546875, "loss_aux_layer_10": 0.078857421875, "loss_aux_layer_11": 0.084228515625, "loss_aux_layer_12": 0.0894775390625, "loss_aux_layer_13": 0.09619140625, "loss_aux_layer_14": 0.1058349609375, "loss_aux_layer_15": 0.114990234375, "loss_aux_layer_16": 0.125244140625, "loss_aux_layer_17": 0.1328125, "loss_aux_layer_18": 0.140625, "loss_aux_layer_19": 0.142578125, "loss_aux_layer_2": 0.06304931640625, "loss_aux_layer_20": 0.149658203125, "loss_aux_layer_21": 0.1572265625, "loss_aux_layer_22": 0.1796875, "loss_aux_layer_23": 0.21923828125, "loss_aux_layer_3": 0.0748291015625, "loss_aux_layer_4": 0.078125, "loss_aux_layer_5": 0.079833984375, "loss_aux_layer_6": 0.0831298828125, "loss_aux_layer_7": 0.0802001953125, "loss_aux_layer_8": 0.0792236328125, "loss_aux_layer_9": 0.0772705078125, "step": 1930, "total_loss": 0.7526915222406387 }, { "epoch": 0.3823005345476143, "grad_norm": 0.9914405941963196, "learning_rate": 5e-05, "llm_loss": 0.650462418794632, "loss": 3.0098, "loss_aux_layer_0": 0.0216064453125, "loss_aux_layer_1": 0.04534912109375, "loss_aux_layer_10": 0.075927734375, "loss_aux_layer_11": 0.0809326171875, "loss_aux_layer_12": 0.086669921875, "loss_aux_layer_13": 0.093505859375, "loss_aux_layer_14": 0.103515625, "loss_aux_layer_15": 0.11328125, "loss_aux_layer_16": 0.1241455078125, "loss_aux_layer_17": 0.131591796875, "loss_aux_layer_18": 0.140869140625, "loss_aux_layer_19": 0.143310546875, "loss_aux_layer_2": 0.05877685546875, "loss_aux_layer_20": 0.15087890625, "loss_aux_layer_21": 0.156982421875, "loss_aux_layer_22": 0.17919921875, "loss_aux_layer_23": 0.217529296875, "loss_aux_layer_3": 0.0703125, "loss_aux_layer_4": 0.073486328125, "loss_aux_layer_5": 0.0753173828125, "loss_aux_layer_6": 0.07861328125, "loss_aux_layer_7": 0.076171875, "loss_aux_layer_8": 0.075439453125, "loss_aux_layer_9": 0.0740966796875, "step": 1931, "total_loss": 0.7524513900279999 }, { "epoch": 0.38249851514551575, "grad_norm": 1.3716498613357544, "learning_rate": 5e-05, "llm_loss": 0.6268030256032944, "loss": 2.9093, "loss_aux_layer_0": 0.0223388671875, "loss_aux_layer_1": 0.0440673828125, "loss_aux_layer_10": 0.0738525390625, "loss_aux_layer_11": 0.078125, "loss_aux_layer_12": 0.083740234375, "loss_aux_layer_13": 0.090576171875, "loss_aux_layer_14": 0.1004638671875, "loss_aux_layer_15": 0.1114501953125, "loss_aux_layer_16": 0.122314453125, "loss_aux_layer_17": 0.130615234375, "loss_aux_layer_18": 0.138916015625, "loss_aux_layer_19": 0.14208984375, "loss_aux_layer_2": 0.0570068359375, "loss_aux_layer_20": 0.14990234375, "loss_aux_layer_21": 0.156982421875, "loss_aux_layer_22": 0.1787109375, "loss_aux_layer_23": 0.217529296875, "loss_aux_layer_3": 0.06884765625, "loss_aux_layer_4": 0.072265625, "loss_aux_layer_5": 0.07421875, "loss_aux_layer_6": 0.07763671875, "loss_aux_layer_7": 0.074951171875, "loss_aux_layer_8": 0.073974609375, "loss_aux_layer_9": 0.0726318359375, "step": 1932, "total_loss": 0.7273217439651489 }, { "epoch": 0.38269649574341713, "grad_norm": 1.2910250425338745, "learning_rate": 5e-05, "llm_loss": 0.6126362532377243, "loss": 2.8411, "loss_aux_layer_0": 0.024444580078125, "loss_aux_layer_1": 0.04437255859375, "loss_aux_layer_10": 0.07080078125, "loss_aux_layer_11": 0.0751953125, "loss_aux_layer_12": 0.08056640625, "loss_aux_layer_13": 0.0872802734375, "loss_aux_layer_14": 0.09716796875, "loss_aux_layer_15": 0.10693359375, "loss_aux_layer_16": 0.1173095703125, "loss_aux_layer_17": 0.125, "loss_aux_layer_18": 0.134521484375, "loss_aux_layer_19": 0.13818359375, "loss_aux_layer_2": 0.05584716796875, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.154052734375, "loss_aux_layer_22": 0.17529296875, "loss_aux_layer_23": 0.21533203125, "loss_aux_layer_3": 0.066650390625, "loss_aux_layer_4": 0.069580078125, "loss_aux_layer_5": 0.071533203125, "loss_aux_layer_6": 0.07470703125, "loss_aux_layer_7": 0.0718994140625, "loss_aux_layer_8": 0.0709228515625, "loss_aux_layer_9": 0.069580078125, "step": 1933, "total_loss": 0.7102662473917007 }, { "epoch": 0.38289447634131857, "grad_norm": 1.2364537715911865, "learning_rate": 5e-05, "llm_loss": 0.5988131761550903, "loss": 2.7979, "loss_aux_layer_0": 0.02325439453125, "loss_aux_layer_1": 0.04486083984375, "loss_aux_layer_10": 0.072998046875, "loss_aux_layer_11": 0.077392578125, "loss_aux_layer_12": 0.0833740234375, "loss_aux_layer_13": 0.09033203125, "loss_aux_layer_14": 0.1007080078125, "loss_aux_layer_15": 0.1114501953125, "loss_aux_layer_16": 0.122314453125, "loss_aux_layer_17": 0.130615234375, "loss_aux_layer_18": 0.139404296875, "loss_aux_layer_19": 0.142822265625, "loss_aux_layer_2": 0.0577392578125, "loss_aux_layer_20": 0.150390625, "loss_aux_layer_21": 0.158203125, "loss_aux_layer_22": 0.1806640625, "loss_aux_layer_23": 0.2216796875, "loss_aux_layer_3": 0.0682373046875, "loss_aux_layer_4": 0.0711669921875, "loss_aux_layer_5": 0.0728759765625, "loss_aux_layer_6": 0.0762939453125, "loss_aux_layer_7": 0.0738525390625, "loss_aux_layer_8": 0.072998046875, "loss_aux_layer_9": 0.07177734375, "step": 1934, "total_loss": 0.6994765400886536 }, { "epoch": 0.38309245693921995, "grad_norm": 1.5565909147262573, "learning_rate": 5e-05, "llm_loss": 0.5665683001279831, "loss": 2.6574, "loss_aux_layer_0": 0.02239990234375, "loss_aux_layer_1": 0.04437255859375, "loss_aux_layer_10": 0.0714111328125, "loss_aux_layer_11": 0.07568359375, "loss_aux_layer_12": 0.0811767578125, "loss_aux_layer_13": 0.0872802734375, "loss_aux_layer_14": 0.096923828125, "loss_aux_layer_15": 0.1068115234375, "loss_aux_layer_16": 0.1171875, "loss_aux_layer_17": 0.1253662109375, "loss_aux_layer_18": 0.13427734375, "loss_aux_layer_19": 0.137939453125, "loss_aux_layer_2": 0.0567626953125, "loss_aux_layer_20": 0.14599609375, "loss_aux_layer_21": 0.153564453125, "loss_aux_layer_22": 0.1748046875, "loss_aux_layer_23": 0.2138671875, "loss_aux_layer_3": 0.0673828125, "loss_aux_layer_4": 0.070556640625, "loss_aux_layer_5": 0.0723876953125, "loss_aux_layer_6": 0.075439453125, "loss_aux_layer_7": 0.07275390625, "loss_aux_layer_8": 0.07177734375, "loss_aux_layer_9": 0.0703125, "step": 1935, "total_loss": 0.6643472611904144 }, { "epoch": 0.3832904375371214, "grad_norm": 1.2168442010879517, "learning_rate": 5e-05, "llm_loss": 0.6039854511618614, "loss": 2.8252, "loss_aux_layer_0": 0.023681640625, "loss_aux_layer_1": 0.04827880859375, "loss_aux_layer_10": 0.076416015625, "loss_aux_layer_11": 0.08154296875, "loss_aux_layer_12": 0.086669921875, "loss_aux_layer_13": 0.0928955078125, "loss_aux_layer_14": 0.1024169921875, "loss_aux_layer_15": 0.112060546875, "loss_aux_layer_16": 0.122314453125, "loss_aux_layer_17": 0.130126953125, "loss_aux_layer_18": 0.138427734375, "loss_aux_layer_19": 0.141357421875, "loss_aux_layer_2": 0.0609130859375, "loss_aux_layer_20": 0.149169921875, "loss_aux_layer_21": 0.156982421875, "loss_aux_layer_22": 0.179443359375, "loss_aux_layer_23": 0.21923828125, "loss_aux_layer_3": 0.0718994140625, "loss_aux_layer_4": 0.0751953125, "loss_aux_layer_5": 0.0772705078125, "loss_aux_layer_6": 0.080322265625, "loss_aux_layer_7": 0.077392578125, "loss_aux_layer_8": 0.076416015625, "loss_aux_layer_9": 0.0750732421875, "step": 1936, "total_loss": 0.706291675567627 }, { "epoch": 0.38348841813502277, "grad_norm": 1.0225733518600464, "learning_rate": 5e-05, "llm_loss": 0.6035228371620178, "loss": 2.8278, "loss_aux_layer_0": 0.023223876953125, "loss_aux_layer_1": 0.04779052734375, "loss_aux_layer_10": 0.077392578125, "loss_aux_layer_11": 0.08251953125, "loss_aux_layer_12": 0.088134765625, "loss_aux_layer_13": 0.0943603515625, "loss_aux_layer_14": 0.104248046875, "loss_aux_layer_15": 0.114013671875, "loss_aux_layer_16": 0.1243896484375, "loss_aux_layer_17": 0.132568359375, "loss_aux_layer_18": 0.141357421875, "loss_aux_layer_19": 0.143798828125, "loss_aux_layer_2": 0.06195068359375, "loss_aux_layer_20": 0.15087890625, "loss_aux_layer_21": 0.1572265625, "loss_aux_layer_22": 0.1787109375, "loss_aux_layer_23": 0.217529296875, "loss_aux_layer_3": 0.0732421875, "loss_aux_layer_4": 0.076904296875, "loss_aux_layer_5": 0.0784912109375, "loss_aux_layer_6": 0.0816650390625, "loss_aux_layer_7": 0.07861328125, "loss_aux_layer_8": 0.0775146484375, "loss_aux_layer_9": 0.075927734375, "step": 1937, "total_loss": 0.7069475650787354 }, { "epoch": 0.38368639873292415, "grad_norm": 1.5098552703857422, "learning_rate": 5e-05, "llm_loss": 0.7070634365081787, "loss": 3.2348, "loss_aux_layer_0": 0.02215576171875, "loss_aux_layer_1": 0.04547119140625, "loss_aux_layer_10": 0.0760498046875, "loss_aux_layer_11": 0.0806884765625, "loss_aux_layer_12": 0.08642578125, "loss_aux_layer_13": 0.093017578125, "loss_aux_layer_14": 0.1026611328125, "loss_aux_layer_15": 0.1124267578125, "loss_aux_layer_16": 0.122802734375, "loss_aux_layer_17": 0.130615234375, "loss_aux_layer_18": 0.139404296875, "loss_aux_layer_19": 0.142822265625, "loss_aux_layer_2": 0.05853271484375, "loss_aux_layer_20": 0.14990234375, "loss_aux_layer_21": 0.156494140625, "loss_aux_layer_22": 0.177001953125, "loss_aux_layer_23": 0.21630859375, "loss_aux_layer_3": 0.0703125, "loss_aux_layer_4": 0.0738525390625, "loss_aux_layer_5": 0.0760498046875, "loss_aux_layer_6": 0.0792236328125, "loss_aux_layer_7": 0.076416015625, "loss_aux_layer_8": 0.07568359375, "loss_aux_layer_9": 0.074462890625, "step": 1938, "total_loss": 0.8087069690227509 }, { "epoch": 0.3838843793308256, "grad_norm": 1.245707631111145, "learning_rate": 5e-05, "llm_loss": 0.7161936610937119, "loss": 3.2596, "loss_aux_layer_0": 0.02252197265625, "loss_aux_layer_1": 0.04461669921875, "loss_aux_layer_10": 0.072265625, "loss_aux_layer_11": 0.07666015625, "loss_aux_layer_12": 0.0821533203125, "loss_aux_layer_13": 0.08837890625, "loss_aux_layer_14": 0.098876953125, "loss_aux_layer_15": 0.1085205078125, "loss_aux_layer_16": 0.1192626953125, "loss_aux_layer_17": 0.12744140625, "loss_aux_layer_18": 0.135986328125, "loss_aux_layer_19": 0.137939453125, "loss_aux_layer_2": 0.058349609375, "loss_aux_layer_20": 0.145263671875, "loss_aux_layer_21": 0.154541015625, "loss_aux_layer_22": 0.175048828125, "loss_aux_layer_23": 0.2138671875, "loss_aux_layer_3": 0.06884765625, "loss_aux_layer_4": 0.0716552734375, "loss_aux_layer_5": 0.0731201171875, "loss_aux_layer_6": 0.076416015625, "loss_aux_layer_7": 0.0736083984375, "loss_aux_layer_8": 0.0723876953125, "loss_aux_layer_9": 0.0709228515625, "step": 1939, "total_loss": 0.814892902970314 }, { "epoch": 0.38408235992872697, "grad_norm": 1.104057788848877, "learning_rate": 5e-05, "llm_loss": 0.6649811789393425, "loss": 3.0513, "loss_aux_layer_0": 0.022308349609375, "loss_aux_layer_1": 0.044677734375, "loss_aux_layer_10": 0.0731201171875, "loss_aux_layer_11": 0.0775146484375, "loss_aux_layer_12": 0.083251953125, "loss_aux_layer_13": 0.0894775390625, "loss_aux_layer_14": 0.0982666015625, "loss_aux_layer_15": 0.107421875, "loss_aux_layer_16": 0.1173095703125, "loss_aux_layer_17": 0.125, "loss_aux_layer_18": 0.133056640625, "loss_aux_layer_19": 0.13525390625, "loss_aux_layer_2": 0.0579833984375, "loss_aux_layer_20": 0.142578125, "loss_aux_layer_21": 0.150390625, "loss_aux_layer_22": 0.17138671875, "loss_aux_layer_23": 0.2099609375, "loss_aux_layer_3": 0.06915283203125, "loss_aux_layer_4": 0.072021484375, "loss_aux_layer_5": 0.0736083984375, "loss_aux_layer_6": 0.076416015625, "loss_aux_layer_7": 0.0736083984375, "loss_aux_layer_8": 0.072509765625, "loss_aux_layer_9": 0.0712890625, "step": 1940, "total_loss": 0.7628160864114761 }, { "epoch": 0.3842803405266284, "grad_norm": 1.005798578262329, "learning_rate": 5e-05, "llm_loss": 0.5686485320329666, "loss": 2.6773, "loss_aux_layer_0": 0.02239990234375, "loss_aux_layer_1": 0.045166015625, "loss_aux_layer_10": 0.0740966796875, "loss_aux_layer_11": 0.0791015625, "loss_aux_layer_12": 0.0843505859375, "loss_aux_layer_13": 0.0909423828125, "loss_aux_layer_14": 0.1005859375, "loss_aux_layer_15": 0.1104736328125, "loss_aux_layer_16": 0.12109375, "loss_aux_layer_17": 0.12939453125, "loss_aux_layer_18": 0.13818359375, "loss_aux_layer_19": 0.14208984375, "loss_aux_layer_2": 0.05804443359375, "loss_aux_layer_20": 0.149658203125, "loss_aux_layer_21": 0.1572265625, "loss_aux_layer_22": 0.17919921875, "loss_aux_layer_23": 0.2197265625, "loss_aux_layer_3": 0.0689697265625, "loss_aux_layer_4": 0.0721435546875, "loss_aux_layer_5": 0.07421875, "loss_aux_layer_6": 0.077392578125, "loss_aux_layer_7": 0.07470703125, "loss_aux_layer_8": 0.0740966796875, "loss_aux_layer_9": 0.07275390625, "step": 1941, "total_loss": 0.6693206280469894 }, { "epoch": 0.3844783211245298, "grad_norm": 1.433553695678711, "learning_rate": 5e-05, "llm_loss": 0.7224726974964142, "loss": 3.283, "loss_aux_layer_0": 0.021209716796875, "loss_aux_layer_1": 0.04412841796875, "loss_aux_layer_10": 0.0733642578125, "loss_aux_layer_11": 0.077880859375, "loss_aux_layer_12": 0.08349609375, "loss_aux_layer_13": 0.0904541015625, "loss_aux_layer_14": 0.100341796875, "loss_aux_layer_15": 0.1099853515625, "loss_aux_layer_16": 0.1201171875, "loss_aux_layer_17": 0.1275634765625, "loss_aux_layer_18": 0.136474609375, "loss_aux_layer_19": 0.138427734375, "loss_aux_layer_2": 0.0570068359375, "loss_aux_layer_20": 0.1455078125, "loss_aux_layer_21": 0.14990234375, "loss_aux_layer_22": 0.16845703125, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.068359375, "loss_aux_layer_4": 0.0716552734375, "loss_aux_layer_5": 0.0738525390625, "loss_aux_layer_6": 0.0765380859375, "loss_aux_layer_7": 0.0740966796875, "loss_aux_layer_8": 0.073486328125, "loss_aux_layer_9": 0.072021484375, "step": 1942, "total_loss": 0.8207525014877319 }, { "epoch": 0.3846763017224312, "grad_norm": 1.278247594833374, "learning_rate": 5e-05, "llm_loss": 0.5905865728855133, "loss": 2.7617, "loss_aux_layer_0": 0.021881103515625, "loss_aux_layer_1": 0.0455322265625, "loss_aux_layer_10": 0.0736083984375, "loss_aux_layer_11": 0.078125, "loss_aux_layer_12": 0.0841064453125, "loss_aux_layer_13": 0.0908203125, "loss_aux_layer_14": 0.100830078125, "loss_aux_layer_15": 0.110595703125, "loss_aux_layer_16": 0.1204833984375, "loss_aux_layer_17": 0.1287841796875, "loss_aux_layer_18": 0.137451171875, "loss_aux_layer_19": 0.139892578125, "loss_aux_layer_2": 0.058349609375, "loss_aux_layer_20": 0.1474609375, "loss_aux_layer_21": 0.15478515625, "loss_aux_layer_22": 0.17626953125, "loss_aux_layer_23": 0.21484375, "loss_aux_layer_3": 0.0694580078125, "loss_aux_layer_4": 0.0726318359375, "loss_aux_layer_5": 0.0740966796875, "loss_aux_layer_6": 0.0772705078125, "loss_aux_layer_7": 0.0743408203125, "loss_aux_layer_8": 0.072998046875, "loss_aux_layer_9": 0.0718994140625, "step": 1943, "total_loss": 0.6904270648956299 }, { "epoch": 0.3848742823203326, "grad_norm": 1.2579550743103027, "learning_rate": 5e-05, "llm_loss": 0.5679228603839874, "loss": 2.6717, "loss_aux_layer_0": 0.0213623046875, "loss_aux_layer_1": 0.044921875, "loss_aux_layer_10": 0.073486328125, "loss_aux_layer_11": 0.0784912109375, "loss_aux_layer_12": 0.083984375, "loss_aux_layer_13": 0.0904541015625, "loss_aux_layer_14": 0.1005859375, "loss_aux_layer_15": 0.1104736328125, "loss_aux_layer_16": 0.120849609375, "loss_aux_layer_17": 0.12939453125, "loss_aux_layer_18": 0.13818359375, "loss_aux_layer_19": 0.140625, "loss_aux_layer_2": 0.05804443359375, "loss_aux_layer_20": 0.14794921875, "loss_aux_layer_21": 0.155517578125, "loss_aux_layer_22": 0.177001953125, "loss_aux_layer_23": 0.215576171875, "loss_aux_layer_3": 0.0694580078125, "loss_aux_layer_4": 0.072509765625, "loss_aux_layer_5": 0.074462890625, "loss_aux_layer_6": 0.0771484375, "loss_aux_layer_7": 0.07421875, "loss_aux_layer_8": 0.0733642578125, "loss_aux_layer_9": 0.072021484375, "step": 1944, "total_loss": 0.6679220050573349 }, { "epoch": 0.385072262918234, "grad_norm": 1.6609991788864136, "learning_rate": 5e-05, "llm_loss": 0.5802092105150223, "loss": 2.7186, "loss_aux_layer_0": 0.023468017578125, "loss_aux_layer_1": 0.04541015625, "loss_aux_layer_10": 0.07275390625, "loss_aux_layer_11": 0.0777587890625, "loss_aux_layer_12": 0.08349609375, "loss_aux_layer_13": 0.090576171875, "loss_aux_layer_14": 0.100830078125, "loss_aux_layer_15": 0.111083984375, "loss_aux_layer_16": 0.1220703125, "loss_aux_layer_17": 0.1297607421875, "loss_aux_layer_18": 0.13818359375, "loss_aux_layer_19": 0.140625, "loss_aux_layer_2": 0.05828857421875, "loss_aux_layer_20": 0.147705078125, "loss_aux_layer_21": 0.154541015625, "loss_aux_layer_22": 0.173583984375, "loss_aux_layer_23": 0.211669921875, "loss_aux_layer_3": 0.068359375, "loss_aux_layer_4": 0.07177734375, "loss_aux_layer_5": 0.0733642578125, "loss_aux_layer_6": 0.075927734375, "loss_aux_layer_7": 0.0731201171875, "loss_aux_layer_8": 0.072265625, "loss_aux_layer_9": 0.0712890625, "step": 1945, "total_loss": 0.6796610802412033 }, { "epoch": 0.3852702435161354, "grad_norm": 1.1463370323181152, "learning_rate": 5e-05, "llm_loss": 0.5831045135855675, "loss": 2.734, "loss_aux_layer_0": 0.021575927734375, "loss_aux_layer_1": 0.0445556640625, "loss_aux_layer_10": 0.073486328125, "loss_aux_layer_11": 0.078125, "loss_aux_layer_12": 0.0838623046875, "loss_aux_layer_13": 0.0908203125, "loss_aux_layer_14": 0.1014404296875, "loss_aux_layer_15": 0.1116943359375, "loss_aux_layer_16": 0.122802734375, "loss_aux_layer_17": 0.130859375, "loss_aux_layer_18": 0.140380859375, "loss_aux_layer_19": 0.143310546875, "loss_aux_layer_2": 0.056884765625, "loss_aux_layer_20": 0.150390625, "loss_aux_layer_21": 0.156982421875, "loss_aux_layer_22": 0.17822265625, "loss_aux_layer_23": 0.217529296875, "loss_aux_layer_3": 0.0682373046875, "loss_aux_layer_4": 0.0711669921875, "loss_aux_layer_5": 0.0732421875, "loss_aux_layer_6": 0.076416015625, "loss_aux_layer_7": 0.0736083984375, "loss_aux_layer_8": 0.07275390625, "loss_aux_layer_9": 0.0718994140625, "step": 1946, "total_loss": 0.6835108548402786 }, { "epoch": 0.3854682241140368, "grad_norm": 1.0873045921325684, "learning_rate": 5e-05, "llm_loss": 0.591036930680275, "loss": 2.7763, "loss_aux_layer_0": 0.02154541015625, "loss_aux_layer_1": 0.04766845703125, "loss_aux_layer_10": 0.07666015625, "loss_aux_layer_11": 0.081787109375, "loss_aux_layer_12": 0.08740234375, "loss_aux_layer_13": 0.0938720703125, "loss_aux_layer_14": 0.1038818359375, "loss_aux_layer_15": 0.1138916015625, "loss_aux_layer_16": 0.1241455078125, "loss_aux_layer_17": 0.131591796875, "loss_aux_layer_18": 0.140380859375, "loss_aux_layer_19": 0.142578125, "loss_aux_layer_2": 0.0614013671875, "loss_aux_layer_20": 0.14990234375, "loss_aux_layer_21": 0.158447265625, "loss_aux_layer_22": 0.180908203125, "loss_aux_layer_23": 0.220458984375, "loss_aux_layer_3": 0.0721435546875, "loss_aux_layer_4": 0.0758056640625, "loss_aux_layer_5": 0.0777587890625, "loss_aux_layer_6": 0.081298828125, "loss_aux_layer_7": 0.0782470703125, "loss_aux_layer_8": 0.076904296875, "loss_aux_layer_9": 0.0755615234375, "step": 1947, "total_loss": 0.6940785646438599 }, { "epoch": 0.38566620471193824, "grad_norm": 1.4866161346435547, "learning_rate": 5e-05, "llm_loss": 0.5604401603341103, "loss": 2.6435, "loss_aux_layer_0": 0.02105712890625, "loss_aux_layer_1": 0.04443359375, "loss_aux_layer_10": 0.073974609375, "loss_aux_layer_11": 0.0789794921875, "loss_aux_layer_12": 0.084716796875, "loss_aux_layer_13": 0.091552734375, "loss_aux_layer_14": 0.101318359375, "loss_aux_layer_15": 0.1112060546875, "loss_aux_layer_16": 0.12158203125, "loss_aux_layer_17": 0.129638671875, "loss_aux_layer_18": 0.138427734375, "loss_aux_layer_19": 0.14111328125, "loss_aux_layer_2": 0.05731201171875, "loss_aux_layer_20": 0.148193359375, "loss_aux_layer_21": 0.15673828125, "loss_aux_layer_22": 0.1787109375, "loss_aux_layer_23": 0.217529296875, "loss_aux_layer_3": 0.068603515625, "loss_aux_layer_4": 0.0718994140625, "loss_aux_layer_5": 0.0740966796875, "loss_aux_layer_6": 0.0772705078125, "loss_aux_layer_7": 0.07470703125, "loss_aux_layer_8": 0.073974609375, "loss_aux_layer_9": 0.0726318359375, "step": 1948, "total_loss": 0.6608824133872986 }, { "epoch": 0.3858641853098396, "grad_norm": 1.577508807182312, "learning_rate": 5e-05, "llm_loss": 0.6170725226402283, "loss": 2.8767, "loss_aux_layer_0": 0.022247314453125, "loss_aux_layer_1": 0.04730224609375, "loss_aux_layer_10": 0.0760498046875, "loss_aux_layer_11": 0.0809326171875, "loss_aux_layer_12": 0.086181640625, "loss_aux_layer_13": 0.0927734375, "loss_aux_layer_14": 0.1026611328125, "loss_aux_layer_15": 0.11279296875, "loss_aux_layer_16": 0.123046875, "loss_aux_layer_17": 0.130859375, "loss_aux_layer_18": 0.13916015625, "loss_aux_layer_19": 0.141357421875, "loss_aux_layer_2": 0.060546875, "loss_aux_layer_20": 0.148681640625, "loss_aux_layer_21": 0.15625, "loss_aux_layer_22": 0.177978515625, "loss_aux_layer_23": 0.2177734375, "loss_aux_layer_3": 0.0723876953125, "loss_aux_layer_4": 0.075439453125, "loss_aux_layer_5": 0.0770263671875, "loss_aux_layer_6": 0.080322265625, "loss_aux_layer_7": 0.077392578125, "loss_aux_layer_8": 0.0762939453125, "loss_aux_layer_9": 0.07470703125, "step": 1949, "total_loss": 0.7191740125417709 }, { "epoch": 0.38606216590774106, "grad_norm": 1.1624006032943726, "learning_rate": 5e-05, "llm_loss": 0.5775940716266632, "loss": 2.7241, "loss_aux_layer_0": 0.021759033203125, "loss_aux_layer_1": 0.047119140625, "loss_aux_layer_10": 0.078125, "loss_aux_layer_11": 0.0833740234375, "loss_aux_layer_12": 0.0888671875, "loss_aux_layer_13": 0.0958251953125, "loss_aux_layer_14": 0.105712890625, "loss_aux_layer_15": 0.1151123046875, "loss_aux_layer_16": 0.1253662109375, "loss_aux_layer_17": 0.13232421875, "loss_aux_layer_18": 0.140625, "loss_aux_layer_19": 0.14306640625, "loss_aux_layer_2": 0.0611572265625, "loss_aux_layer_20": 0.14990234375, "loss_aux_layer_21": 0.1572265625, "loss_aux_layer_22": 0.17919921875, "loss_aux_layer_23": 0.218017578125, "loss_aux_layer_3": 0.07275390625, "loss_aux_layer_4": 0.0758056640625, "loss_aux_layer_5": 0.0780029296875, "loss_aux_layer_6": 0.081298828125, "loss_aux_layer_7": 0.0784912109375, "loss_aux_layer_8": 0.077392578125, "loss_aux_layer_9": 0.0765380859375, "step": 1950, "total_loss": 0.6810257732868195 }, { "epoch": 0.38626014650564244, "grad_norm": 2.1812121868133545, "learning_rate": 5e-05, "llm_loss": 0.6277846843004227, "loss": 2.9105, "loss_aux_layer_0": 0.02264404296875, "loss_aux_layer_1": 0.04461669921875, "loss_aux_layer_10": 0.073486328125, "loss_aux_layer_11": 0.078125, "loss_aux_layer_12": 0.0836181640625, "loss_aux_layer_13": 0.0904541015625, "loss_aux_layer_14": 0.1002197265625, "loss_aux_layer_15": 0.1107177734375, "loss_aux_layer_16": 0.12109375, "loss_aux_layer_17": 0.129638671875, "loss_aux_layer_18": 0.137939453125, "loss_aux_layer_19": 0.140869140625, "loss_aux_layer_2": 0.05767822265625, "loss_aux_layer_20": 0.1484375, "loss_aux_layer_21": 0.1552734375, "loss_aux_layer_22": 0.17578125, "loss_aux_layer_23": 0.21435546875, "loss_aux_layer_3": 0.0682373046875, "loss_aux_layer_4": 0.0716552734375, "loss_aux_layer_5": 0.0738525390625, "loss_aux_layer_6": 0.0771484375, "loss_aux_layer_7": 0.0743408203125, "loss_aux_layer_8": 0.0731201171875, "loss_aux_layer_9": 0.072265625, "step": 1951, "total_loss": 0.7276148647069931 }, { "epoch": 0.3864581271035439, "grad_norm": 1.563355565071106, "learning_rate": 5e-05, "llm_loss": 0.6212491989135742, "loss": 2.8858, "loss_aux_layer_0": 0.021759033203125, "loss_aux_layer_1": 0.04498291015625, "loss_aux_layer_10": 0.0738525390625, "loss_aux_layer_11": 0.0787353515625, "loss_aux_layer_12": 0.084228515625, "loss_aux_layer_13": 0.0909423828125, "loss_aux_layer_14": 0.1004638671875, "loss_aux_layer_15": 0.1103515625, "loss_aux_layer_16": 0.12060546875, "loss_aux_layer_17": 0.1282958984375, "loss_aux_layer_18": 0.137451171875, "loss_aux_layer_19": 0.141357421875, "loss_aux_layer_2": 0.059326171875, "loss_aux_layer_20": 0.14892578125, "loss_aux_layer_21": 0.156494140625, "loss_aux_layer_22": 0.1767578125, "loss_aux_layer_23": 0.216552734375, "loss_aux_layer_3": 0.0697021484375, "loss_aux_layer_4": 0.0723876953125, "loss_aux_layer_5": 0.0740966796875, "loss_aux_layer_6": 0.0770263671875, "loss_aux_layer_7": 0.0743408203125, "loss_aux_layer_8": 0.0733642578125, "loss_aux_layer_9": 0.072509765625, "step": 1952, "total_loss": 0.7214585691690445 }, { "epoch": 0.38665610770144526, "grad_norm": 1.5366981029510498, "learning_rate": 5e-05, "llm_loss": 0.6339658498764038, "loss": 2.9446, "loss_aux_layer_0": 0.02178955078125, "loss_aux_layer_1": 0.04534912109375, "loss_aux_layer_10": 0.0753173828125, "loss_aux_layer_11": 0.0802001953125, "loss_aux_layer_12": 0.0858154296875, "loss_aux_layer_13": 0.0927734375, "loss_aux_layer_14": 0.1029052734375, "loss_aux_layer_15": 0.1129150390625, "loss_aux_layer_16": 0.1239013671875, "loss_aux_layer_17": 0.132080078125, "loss_aux_layer_18": 0.140869140625, "loss_aux_layer_19": 0.143798828125, "loss_aux_layer_2": 0.06103515625, "loss_aux_layer_20": 0.15087890625, "loss_aux_layer_21": 0.158203125, "loss_aux_layer_22": 0.1796875, "loss_aux_layer_23": 0.21875, "loss_aux_layer_3": 0.072021484375, "loss_aux_layer_4": 0.07470703125, "loss_aux_layer_5": 0.076171875, "loss_aux_layer_6": 0.0789794921875, "loss_aux_layer_7": 0.076171875, "loss_aux_layer_8": 0.0748291015625, "loss_aux_layer_9": 0.07373046875, "step": 1953, "total_loss": 0.7361463308334351 }, { "epoch": 0.38685408829934664, "grad_norm": 1.5346757173538208, "learning_rate": 5e-05, "llm_loss": 0.5461002886295319, "loss": 2.581, "loss_aux_layer_0": 0.02252197265625, "loss_aux_layer_1": 0.0457763671875, "loss_aux_layer_10": 0.0716552734375, "loss_aux_layer_11": 0.0765380859375, "loss_aux_layer_12": 0.0819091796875, "loss_aux_layer_13": 0.0887451171875, "loss_aux_layer_14": 0.0987548828125, "loss_aux_layer_15": 0.108642578125, "loss_aux_layer_16": 0.1187744140625, "loss_aux_layer_17": 0.1268310546875, "loss_aux_layer_18": 0.135498046875, "loss_aux_layer_19": 0.1396484375, "loss_aux_layer_2": 0.058349609375, "loss_aux_layer_20": 0.1484375, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.17919921875, "loss_aux_layer_23": 0.218505859375, "loss_aux_layer_3": 0.068359375, "loss_aux_layer_4": 0.0712890625, "loss_aux_layer_5": 0.0726318359375, "loss_aux_layer_6": 0.0758056640625, "loss_aux_layer_7": 0.072998046875, "loss_aux_layer_8": 0.072021484375, "loss_aux_layer_9": 0.0704345703125, "step": 1954, "total_loss": 0.6452458500862122 }, { "epoch": 0.3870520688972481, "grad_norm": 1.5188860893249512, "learning_rate": 5e-05, "llm_loss": 0.6704618334770203, "loss": 3.0763, "loss_aux_layer_0": 0.023345947265625, "loss_aux_layer_1": 0.044921875, "loss_aux_layer_10": 0.072021484375, "loss_aux_layer_11": 0.076171875, "loss_aux_layer_12": 0.081787109375, "loss_aux_layer_13": 0.08837890625, "loss_aux_layer_14": 0.0986328125, "loss_aux_layer_15": 0.1087646484375, "loss_aux_layer_16": 0.119384765625, "loss_aux_layer_17": 0.12744140625, "loss_aux_layer_18": 0.136474609375, "loss_aux_layer_19": 0.139404296875, "loss_aux_layer_2": 0.0579833984375, "loss_aux_layer_20": 0.14697265625, "loss_aux_layer_21": 0.154052734375, "loss_aux_layer_22": 0.17431640625, "loss_aux_layer_23": 0.212646484375, "loss_aux_layer_3": 0.068115234375, "loss_aux_layer_4": 0.071044921875, "loss_aux_layer_5": 0.072998046875, "loss_aux_layer_6": 0.075927734375, "loss_aux_layer_7": 0.0731201171875, "loss_aux_layer_8": 0.072021484375, "loss_aux_layer_9": 0.0706787109375, "step": 1955, "total_loss": 0.769081637263298 }, { "epoch": 0.38725004949514946, "grad_norm": 1.8252443075180054, "learning_rate": 5e-05, "llm_loss": 0.5932862386107445, "loss": 2.7679, "loss_aux_layer_0": 0.02117919921875, "loss_aux_layer_1": 0.044189453125, "loss_aux_layer_10": 0.07275390625, "loss_aux_layer_11": 0.0775146484375, "loss_aux_layer_12": 0.0830078125, "loss_aux_layer_13": 0.089599609375, "loss_aux_layer_14": 0.0997314453125, "loss_aux_layer_15": 0.1092529296875, "loss_aux_layer_16": 0.119140625, "loss_aux_layer_17": 0.1270751953125, "loss_aux_layer_18": 0.13525390625, "loss_aux_layer_19": 0.138427734375, "loss_aux_layer_2": 0.0567626953125, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.154541015625, "loss_aux_layer_22": 0.1748046875, "loss_aux_layer_23": 0.21484375, "loss_aux_layer_3": 0.0672607421875, "loss_aux_layer_4": 0.070556640625, "loss_aux_layer_5": 0.072509765625, "loss_aux_layer_6": 0.07568359375, "loss_aux_layer_7": 0.0732421875, "loss_aux_layer_8": 0.0723876953125, "loss_aux_layer_9": 0.0712890625, "step": 1956, "total_loss": 0.6919630914926529 }, { "epoch": 0.3874480300930509, "grad_norm": 1.1348016262054443, "learning_rate": 5e-05, "llm_loss": 0.5738383010029793, "loss": 2.7085, "loss_aux_layer_0": 0.0216064453125, "loss_aux_layer_1": 0.04705810546875, "loss_aux_layer_10": 0.0775146484375, "loss_aux_layer_11": 0.08251953125, "loss_aux_layer_12": 0.088134765625, "loss_aux_layer_13": 0.094970703125, "loss_aux_layer_14": 0.1048583984375, "loss_aux_layer_15": 0.114501953125, "loss_aux_layer_16": 0.125, "loss_aux_layer_17": 0.132568359375, "loss_aux_layer_18": 0.14111328125, "loss_aux_layer_19": 0.143310546875, "loss_aux_layer_2": 0.06072998046875, "loss_aux_layer_20": 0.150146484375, "loss_aux_layer_21": 0.156982421875, "loss_aux_layer_22": 0.17822265625, "loss_aux_layer_23": 0.21728515625, "loss_aux_layer_3": 0.0723876953125, "loss_aux_layer_4": 0.0762939453125, "loss_aux_layer_5": 0.0784912109375, "loss_aux_layer_6": 0.081787109375, "loss_aux_layer_7": 0.07861328125, "loss_aux_layer_8": 0.077880859375, "loss_aux_layer_9": 0.076171875, "step": 1957, "total_loss": 0.6771232634782791 }, { "epoch": 0.3876460106909523, "grad_norm": 1.1045218706130981, "learning_rate": 5e-05, "llm_loss": 0.5325639098882675, "loss": 2.5322, "loss_aux_layer_0": 0.022674560546875, "loss_aux_layer_1": 0.04461669921875, "loss_aux_layer_10": 0.07373046875, "loss_aux_layer_11": 0.078369140625, "loss_aux_layer_12": 0.0841064453125, "loss_aux_layer_13": 0.090576171875, "loss_aux_layer_14": 0.10009765625, "loss_aux_layer_15": 0.1102294921875, "loss_aux_layer_16": 0.120849609375, "loss_aux_layer_17": 0.12939453125, "loss_aux_layer_18": 0.137939453125, "loss_aux_layer_19": 0.141357421875, "loss_aux_layer_2": 0.05712890625, "loss_aux_layer_20": 0.149169921875, "loss_aux_layer_21": 0.157470703125, "loss_aux_layer_22": 0.18017578125, "loss_aux_layer_23": 0.220947265625, "loss_aux_layer_3": 0.068603515625, "loss_aux_layer_4": 0.0718994140625, "loss_aux_layer_5": 0.07421875, "loss_aux_layer_6": 0.07763671875, "loss_aux_layer_7": 0.07470703125, "loss_aux_layer_8": 0.073486328125, "loss_aux_layer_9": 0.0721435546875, "step": 1958, "total_loss": 0.6330473273992538 }, { "epoch": 0.3878439912888537, "grad_norm": 1.3159254789352417, "learning_rate": 5e-05, "llm_loss": 0.623021773993969, "loss": 2.8965, "loss_aux_layer_0": 0.02239990234375, "loss_aux_layer_1": 0.0457763671875, "loss_aux_layer_10": 0.075439453125, "loss_aux_layer_11": 0.0804443359375, "loss_aux_layer_12": 0.0860595703125, "loss_aux_layer_13": 0.092529296875, "loss_aux_layer_14": 0.10205078125, "loss_aux_layer_15": 0.11181640625, "loss_aux_layer_16": 0.1219482421875, "loss_aux_layer_17": 0.130126953125, "loss_aux_layer_18": 0.138671875, "loss_aux_layer_19": 0.141845703125, "loss_aux_layer_2": 0.05859375, "loss_aux_layer_20": 0.149169921875, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.177001953125, "loss_aux_layer_23": 0.2158203125, "loss_aux_layer_3": 0.069580078125, "loss_aux_layer_4": 0.0731201171875, "loss_aux_layer_5": 0.0751953125, "loss_aux_layer_6": 0.07861328125, "loss_aux_layer_7": 0.076171875, "loss_aux_layer_8": 0.0748291015625, "loss_aux_layer_9": 0.0738525390625, "step": 1959, "total_loss": 0.7241312712430954 }, { "epoch": 0.3880419718867551, "grad_norm": 1.5375587940216064, "learning_rate": 5e-05, "llm_loss": 0.6052512004971504, "loss": 2.838, "loss_aux_layer_0": 0.02239990234375, "loss_aux_layer_1": 0.0462646484375, "loss_aux_layer_10": 0.07861328125, "loss_aux_layer_11": 0.083740234375, "loss_aux_layer_12": 0.0894775390625, "loss_aux_layer_13": 0.0963134765625, "loss_aux_layer_14": 0.1063232421875, "loss_aux_layer_15": 0.1162109375, "loss_aux_layer_16": 0.126220703125, "loss_aux_layer_17": 0.134033203125, "loss_aux_layer_18": 0.142578125, "loss_aux_layer_19": 0.14453125, "loss_aux_layer_2": 0.06121826171875, "loss_aux_layer_20": 0.1513671875, "loss_aux_layer_21": 0.15869140625, "loss_aux_layer_22": 0.179931640625, "loss_aux_layer_23": 0.21923828125, "loss_aux_layer_3": 0.0731201171875, "loss_aux_layer_4": 0.07666015625, "loss_aux_layer_5": 0.07861328125, "loss_aux_layer_6": 0.0816650390625, "loss_aux_layer_7": 0.0791015625, "loss_aux_layer_8": 0.078125, "loss_aux_layer_9": 0.076904296875, "step": 1960, "total_loss": 0.7095070481300354 }, { "epoch": 0.3882399524846565, "grad_norm": 1.1147302389144897, "learning_rate": 5e-05, "llm_loss": 0.5952053368091583, "loss": 2.7974, "loss_aux_layer_0": 0.022705078125, "loss_aux_layer_1": 0.0482177734375, "loss_aux_layer_10": 0.077880859375, "loss_aux_layer_11": 0.0828857421875, "loss_aux_layer_12": 0.0887451171875, "loss_aux_layer_13": 0.09521484375, "loss_aux_layer_14": 0.1055908203125, "loss_aux_layer_15": 0.115234375, "loss_aux_layer_16": 0.12548828125, "loss_aux_layer_17": 0.133544921875, "loss_aux_layer_18": 0.14306640625, "loss_aux_layer_19": 0.146240234375, "loss_aux_layer_2": 0.06024169921875, "loss_aux_layer_20": 0.1533203125, "loss_aux_layer_21": 0.160888671875, "loss_aux_layer_22": 0.18212890625, "loss_aux_layer_23": 0.2216796875, "loss_aux_layer_3": 0.072265625, "loss_aux_layer_4": 0.075439453125, "loss_aux_layer_5": 0.0775146484375, "loss_aux_layer_6": 0.0804443359375, "loss_aux_layer_7": 0.077880859375, "loss_aux_layer_8": 0.076904296875, "loss_aux_layer_9": 0.076171875, "step": 1961, "total_loss": 0.6993394047021866 }, { "epoch": 0.3884379330825579, "grad_norm": 1.284048080444336, "learning_rate": 5e-05, "llm_loss": 0.5290324538946152, "loss": 2.5125, "loss_aux_layer_0": 0.023101806640625, "loss_aux_layer_1": 0.044189453125, "loss_aux_layer_10": 0.0726318359375, "loss_aux_layer_11": 0.077392578125, "loss_aux_layer_12": 0.0831298828125, "loss_aux_layer_13": 0.0894775390625, "loss_aux_layer_14": 0.09912109375, "loss_aux_layer_15": 0.10888671875, "loss_aux_layer_16": 0.1190185546875, "loss_aux_layer_17": 0.126953125, "loss_aux_layer_18": 0.135498046875, "loss_aux_layer_19": 0.139404296875, "loss_aux_layer_2": 0.0574951171875, "loss_aux_layer_20": 0.146728515625, "loss_aux_layer_21": 0.15478515625, "loss_aux_layer_22": 0.177001953125, "loss_aux_layer_23": 0.216552734375, "loss_aux_layer_3": 0.068603515625, "loss_aux_layer_4": 0.071533203125, "loss_aux_layer_5": 0.0732421875, "loss_aux_layer_6": 0.076171875, "loss_aux_layer_7": 0.0733642578125, "loss_aux_layer_8": 0.072509765625, "loss_aux_layer_9": 0.0711669921875, "step": 1962, "total_loss": 0.6281149089336395 }, { "epoch": 0.3886359136804593, "grad_norm": 1.3946810960769653, "learning_rate": 5e-05, "llm_loss": 0.5517875328660011, "loss": 2.6119, "loss_aux_layer_0": 0.02191162109375, "loss_aux_layer_1": 0.0447998046875, "loss_aux_layer_10": 0.076171875, "loss_aux_layer_11": 0.081298828125, "loss_aux_layer_12": 0.0869140625, "loss_aux_layer_13": 0.093017578125, "loss_aux_layer_14": 0.102294921875, "loss_aux_layer_15": 0.1116943359375, "loss_aux_layer_16": 0.121337890625, "loss_aux_layer_17": 0.1287841796875, "loss_aux_layer_18": 0.13671875, "loss_aux_layer_19": 0.139892578125, "loss_aux_layer_2": 0.059326171875, "loss_aux_layer_20": 0.147216796875, "loss_aux_layer_21": 0.154541015625, "loss_aux_layer_22": 0.17626953125, "loss_aux_layer_23": 0.2158203125, "loss_aux_layer_3": 0.0716552734375, "loss_aux_layer_4": 0.0748291015625, "loss_aux_layer_5": 0.076904296875, "loss_aux_layer_6": 0.0797119140625, "loss_aux_layer_7": 0.076904296875, "loss_aux_layer_8": 0.0758056640625, "loss_aux_layer_9": 0.0745849609375, "step": 1963, "total_loss": 0.6529790610074997 }, { "epoch": 0.38883389427836074, "grad_norm": 1.3456206321716309, "learning_rate": 5e-05, "llm_loss": 0.6388632655143738, "loss": 2.9541, "loss_aux_layer_0": 0.021026611328125, "loss_aux_layer_1": 0.04473876953125, "loss_aux_layer_10": 0.072998046875, "loss_aux_layer_11": 0.0780029296875, "loss_aux_layer_12": 0.083984375, "loss_aux_layer_13": 0.0906982421875, "loss_aux_layer_14": 0.1004638671875, "loss_aux_layer_15": 0.1104736328125, "loss_aux_layer_16": 0.1207275390625, "loss_aux_layer_17": 0.12939453125, "loss_aux_layer_18": 0.137939453125, "loss_aux_layer_19": 0.14111328125, "loss_aux_layer_2": 0.0572509765625, "loss_aux_layer_20": 0.149169921875, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.17626953125, "loss_aux_layer_23": 0.214599609375, "loss_aux_layer_3": 0.0682373046875, "loss_aux_layer_4": 0.071533203125, "loss_aux_layer_5": 0.0731201171875, "loss_aux_layer_6": 0.0762939453125, "loss_aux_layer_7": 0.07373046875, "loss_aux_layer_8": 0.0723876953125, "loss_aux_layer_9": 0.0711669921875, "step": 1964, "total_loss": 0.7385136187076569 }, { "epoch": 0.3890318748762621, "grad_norm": 1.5255351066589355, "learning_rate": 5e-05, "llm_loss": 0.5169929787516594, "loss": 2.4715, "loss_aux_layer_0": 0.023406982421875, "loss_aux_layer_1": 0.04449462890625, "loss_aux_layer_10": 0.073486328125, "loss_aux_layer_11": 0.078369140625, "loss_aux_layer_12": 0.084228515625, "loss_aux_layer_13": 0.091552734375, "loss_aux_layer_14": 0.10205078125, "loss_aux_layer_15": 0.11279296875, "loss_aux_layer_16": 0.123291015625, "loss_aux_layer_17": 0.131591796875, "loss_aux_layer_18": 0.140869140625, "loss_aux_layer_19": 0.143798828125, "loss_aux_layer_2": 0.05694580078125, "loss_aux_layer_20": 0.1513671875, "loss_aux_layer_21": 0.15869140625, "loss_aux_layer_22": 0.180908203125, "loss_aux_layer_23": 0.220458984375, "loss_aux_layer_3": 0.06787109375, "loss_aux_layer_4": 0.0703125, "loss_aux_layer_5": 0.072265625, "loss_aux_layer_6": 0.0753173828125, "loss_aux_layer_7": 0.0731201171875, "loss_aux_layer_8": 0.0723876953125, "loss_aux_layer_9": 0.0716552734375, "step": 1965, "total_loss": 0.6178709119558334 }, { "epoch": 0.38922985547416356, "grad_norm": 2.072361469268799, "learning_rate": 5e-05, "llm_loss": 0.5748245418071747, "loss": 2.6977, "loss_aux_layer_0": 0.022918701171875, "loss_aux_layer_1": 0.04559326171875, "loss_aux_layer_10": 0.073974609375, "loss_aux_layer_11": 0.0787353515625, "loss_aux_layer_12": 0.0843505859375, "loss_aux_layer_13": 0.0908203125, "loss_aux_layer_14": 0.1005859375, "loss_aux_layer_15": 0.110107421875, "loss_aux_layer_16": 0.1204833984375, "loss_aux_layer_17": 0.1279296875, "loss_aux_layer_18": 0.13623046875, "loss_aux_layer_19": 0.138916015625, "loss_aux_layer_2": 0.05865478515625, "loss_aux_layer_20": 0.14599609375, "loss_aux_layer_21": 0.153076171875, "loss_aux_layer_22": 0.1728515625, "loss_aux_layer_23": 0.211181640625, "loss_aux_layer_3": 0.0703125, "loss_aux_layer_4": 0.0733642578125, "loss_aux_layer_5": 0.074951171875, "loss_aux_layer_6": 0.0775146484375, "loss_aux_layer_7": 0.0748291015625, "loss_aux_layer_8": 0.073974609375, "loss_aux_layer_9": 0.07275390625, "step": 1966, "total_loss": 0.6744277030229568 }, { "epoch": 0.38942783607206494, "grad_norm": 1.4061675071716309, "learning_rate": 5e-05, "llm_loss": 0.6243800222873688, "loss": 2.8853, "loss_aux_layer_0": 0.021942138671875, "loss_aux_layer_1": 0.04351806640625, "loss_aux_layer_10": 0.0712890625, "loss_aux_layer_11": 0.07568359375, "loss_aux_layer_12": 0.08154296875, "loss_aux_layer_13": 0.08837890625, "loss_aux_layer_14": 0.09765625, "loss_aux_layer_15": 0.107177734375, "loss_aux_layer_16": 0.1168212890625, "loss_aux_layer_17": 0.1253662109375, "loss_aux_layer_18": 0.133544921875, "loss_aux_layer_19": 0.13671875, "loss_aux_layer_2": 0.0557861328125, "loss_aux_layer_20": 0.1435546875, "loss_aux_layer_21": 0.150146484375, "loss_aux_layer_22": 0.171630859375, "loss_aux_layer_23": 0.210693359375, "loss_aux_layer_3": 0.066650390625, "loss_aux_layer_4": 0.0697021484375, "loss_aux_layer_5": 0.0716552734375, "loss_aux_layer_6": 0.074462890625, "loss_aux_layer_7": 0.072265625, "loss_aux_layer_8": 0.0711669921875, "loss_aux_layer_9": 0.0699462890625, "step": 1967, "total_loss": 0.7213370501995087 }, { "epoch": 0.3896258166699663, "grad_norm": 1.368688941001892, "learning_rate": 5e-05, "llm_loss": 0.5599246844649315, "loss": 2.6487, "loss_aux_layer_0": 0.02301025390625, "loss_aux_layer_1": 0.04400634765625, "loss_aux_layer_10": 0.07421875, "loss_aux_layer_11": 0.0787353515625, "loss_aux_layer_12": 0.0849609375, "loss_aux_layer_13": 0.0919189453125, "loss_aux_layer_14": 0.1024169921875, "loss_aux_layer_15": 0.113037109375, "loss_aux_layer_16": 0.1241455078125, "loss_aux_layer_17": 0.13232421875, "loss_aux_layer_18": 0.140869140625, "loss_aux_layer_19": 0.14404296875, "loss_aux_layer_2": 0.058349609375, "loss_aux_layer_20": 0.15234375, "loss_aux_layer_21": 0.161865234375, "loss_aux_layer_22": 0.185546875, "loss_aux_layer_23": 0.227294921875, "loss_aux_layer_3": 0.069580078125, "loss_aux_layer_4": 0.072265625, "loss_aux_layer_5": 0.0745849609375, "loss_aux_layer_6": 0.0772705078125, "loss_aux_layer_7": 0.074462890625, "loss_aux_layer_8": 0.0738525390625, "loss_aux_layer_9": 0.07275390625, "step": 1968, "total_loss": 0.6621644198894501 }, { "epoch": 0.38982379726786776, "grad_norm": 0.9921329617500305, "learning_rate": 5e-05, "llm_loss": 0.6477275788784027, "loss": 2.9869, "loss_aux_layer_0": 0.022308349609375, "loss_aux_layer_1": 0.044677734375, "loss_aux_layer_10": 0.07275390625, "loss_aux_layer_11": 0.0771484375, "loss_aux_layer_12": 0.0826416015625, "loss_aux_layer_13": 0.089111328125, "loss_aux_layer_14": 0.0987548828125, "loss_aux_layer_15": 0.1085205078125, "loss_aux_layer_16": 0.1185302734375, "loss_aux_layer_17": 0.1268310546875, "loss_aux_layer_18": 0.135986328125, "loss_aux_layer_19": 0.13916015625, "loss_aux_layer_2": 0.05804443359375, "loss_aux_layer_20": 0.146728515625, "loss_aux_layer_21": 0.154541015625, "loss_aux_layer_22": 0.17626953125, "loss_aux_layer_23": 0.214111328125, "loss_aux_layer_3": 0.0692138671875, "loss_aux_layer_4": 0.0721435546875, "loss_aux_layer_5": 0.073974609375, "loss_aux_layer_6": 0.07666015625, "loss_aux_layer_7": 0.0738525390625, "loss_aux_layer_8": 0.07275390625, "loss_aux_layer_9": 0.0712890625, "step": 1969, "total_loss": 0.7467280477285385 }, { "epoch": 0.39002177786576914, "grad_norm": 1.0787335634231567, "learning_rate": 5e-05, "llm_loss": 0.5917064696550369, "loss": 2.7563, "loss_aux_layer_0": 0.02203369140625, "loss_aux_layer_1": 0.04290771484375, "loss_aux_layer_10": 0.07080078125, "loss_aux_layer_11": 0.0751953125, "loss_aux_layer_12": 0.0809326171875, "loss_aux_layer_13": 0.0875244140625, "loss_aux_layer_14": 0.097412109375, "loss_aux_layer_15": 0.107177734375, "loss_aux_layer_16": 0.11767578125, "loss_aux_layer_17": 0.1260986328125, "loss_aux_layer_18": 0.135009765625, "loss_aux_layer_19": 0.137939453125, "loss_aux_layer_2": 0.05499267578125, "loss_aux_layer_20": 0.145751953125, "loss_aux_layer_21": 0.153564453125, "loss_aux_layer_22": 0.174072265625, "loss_aux_layer_23": 0.21337890625, "loss_aux_layer_3": 0.06658935546875, "loss_aux_layer_4": 0.06982421875, "loss_aux_layer_5": 0.071533203125, "loss_aux_layer_6": 0.0743408203125, "loss_aux_layer_7": 0.071533203125, "loss_aux_layer_8": 0.0706787109375, "loss_aux_layer_9": 0.0693359375, "step": 1970, "total_loss": 0.6890797913074493 }, { "epoch": 0.3902197584636706, "grad_norm": 1.334814429283142, "learning_rate": 5e-05, "llm_loss": 0.5943415313959122, "loss": 2.778, "loss_aux_layer_0": 0.0242919921875, "loss_aux_layer_1": 0.0458984375, "loss_aux_layer_10": 0.0736083984375, "loss_aux_layer_11": 0.0782470703125, "loss_aux_layer_12": 0.0836181640625, "loss_aux_layer_13": 0.0899658203125, "loss_aux_layer_14": 0.0997314453125, "loss_aux_layer_15": 0.109375, "loss_aux_layer_16": 0.1197509765625, "loss_aux_layer_17": 0.1279296875, "loss_aux_layer_18": 0.136962890625, "loss_aux_layer_19": 0.139892578125, "loss_aux_layer_2": 0.05865478515625, "loss_aux_layer_20": 0.147705078125, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.177978515625, "loss_aux_layer_23": 0.21728515625, "loss_aux_layer_3": 0.0701904296875, "loss_aux_layer_4": 0.0732421875, "loss_aux_layer_5": 0.074951171875, "loss_aux_layer_6": 0.07763671875, "loss_aux_layer_7": 0.0748291015625, "loss_aux_layer_8": 0.0738525390625, "loss_aux_layer_9": 0.0726318359375, "step": 1971, "total_loss": 0.6944916397333145 }, { "epoch": 0.39041773906157196, "grad_norm": 1.050218105316162, "learning_rate": 5e-05, "llm_loss": 0.6268594264984131, "loss": 2.9175, "loss_aux_layer_0": 0.0242919921875, "loss_aux_layer_1": 0.047119140625, "loss_aux_layer_10": 0.075927734375, "loss_aux_layer_11": 0.080810546875, "loss_aux_layer_12": 0.08642578125, "loss_aux_layer_13": 0.093017578125, "loss_aux_layer_14": 0.1031494140625, "loss_aux_layer_15": 0.1129150390625, "loss_aux_layer_16": 0.123046875, "loss_aux_layer_17": 0.130859375, "loss_aux_layer_18": 0.138916015625, "loss_aux_layer_19": 0.141845703125, "loss_aux_layer_2": 0.06072998046875, "loss_aux_layer_20": 0.149169921875, "loss_aux_layer_21": 0.1572265625, "loss_aux_layer_22": 0.179931640625, "loss_aux_layer_23": 0.21923828125, "loss_aux_layer_3": 0.0726318359375, "loss_aux_layer_4": 0.07568359375, "loss_aux_layer_5": 0.077392578125, "loss_aux_layer_6": 0.0809326171875, "loss_aux_layer_7": 0.0777587890625, "loss_aux_layer_8": 0.07666015625, "loss_aux_layer_9": 0.0750732421875, "step": 1972, "total_loss": 0.7293860912322998 }, { "epoch": 0.3906157196594734, "grad_norm": 1.2651900053024292, "learning_rate": 5e-05, "llm_loss": 0.5626758933067322, "loss": 2.6574, "loss_aux_layer_0": 0.02508544921875, "loss_aux_layer_1": 0.047119140625, "loss_aux_layer_10": 0.075927734375, "loss_aux_layer_11": 0.0804443359375, "loss_aux_layer_12": 0.0859375, "loss_aux_layer_13": 0.092529296875, "loss_aux_layer_14": 0.101806640625, "loss_aux_layer_15": 0.1114501953125, "loss_aux_layer_16": 0.1212158203125, "loss_aux_layer_17": 0.12939453125, "loss_aux_layer_18": 0.137939453125, "loss_aux_layer_19": 0.140380859375, "loss_aux_layer_2": 0.06036376953125, "loss_aux_layer_20": 0.1484375, "loss_aux_layer_21": 0.155517578125, "loss_aux_layer_22": 0.17724609375, "loss_aux_layer_23": 0.2158203125, "loss_aux_layer_3": 0.0723876953125, "loss_aux_layer_4": 0.0753173828125, "loss_aux_layer_5": 0.0770263671875, "loss_aux_layer_6": 0.0804443359375, "loss_aux_layer_7": 0.0775146484375, "loss_aux_layer_8": 0.0765380859375, "loss_aux_layer_9": 0.0750732421875, "step": 1973, "total_loss": 0.6643561571836472 }, { "epoch": 0.3908137002573748, "grad_norm": 1.1120952367782593, "learning_rate": 5e-05, "llm_loss": 0.6508447229862213, "loss": 3.0022, "loss_aux_layer_0": 0.021881103515625, "loss_aux_layer_1": 0.04522705078125, "loss_aux_layer_10": 0.0748291015625, "loss_aux_layer_11": 0.07958984375, "loss_aux_layer_12": 0.0848388671875, "loss_aux_layer_13": 0.0911865234375, "loss_aux_layer_14": 0.10107421875, "loss_aux_layer_15": 0.1109619140625, "loss_aux_layer_16": 0.121826171875, "loss_aux_layer_17": 0.129638671875, "loss_aux_layer_18": 0.13818359375, "loss_aux_layer_19": 0.140380859375, "loss_aux_layer_2": 0.05731201171875, "loss_aux_layer_20": 0.147216796875, "loss_aux_layer_21": 0.1533203125, "loss_aux_layer_22": 0.171630859375, "loss_aux_layer_23": 0.208740234375, "loss_aux_layer_3": 0.06884765625, "loss_aux_layer_4": 0.0721435546875, "loss_aux_layer_5": 0.0743408203125, "loss_aux_layer_6": 0.0772705078125, "loss_aux_layer_7": 0.074951171875, "loss_aux_layer_8": 0.0743408203125, "loss_aux_layer_9": 0.073486328125, "step": 1974, "total_loss": 0.7505543977022171 }, { "epoch": 0.39101168085527616, "grad_norm": 1.001021146774292, "learning_rate": 5e-05, "llm_loss": 0.5877632945775986, "loss": 2.7368, "loss_aux_layer_0": 0.02264404296875, "loss_aux_layer_1": 0.0433349609375, "loss_aux_layer_10": 0.0706787109375, "loss_aux_layer_11": 0.07501220703125, "loss_aux_layer_12": 0.080078125, "loss_aux_layer_13": 0.0863037109375, "loss_aux_layer_14": 0.095703125, "loss_aux_layer_15": 0.10546875, "loss_aux_layer_16": 0.11572265625, "loss_aux_layer_17": 0.1234130859375, "loss_aux_layer_18": 0.132080078125, "loss_aux_layer_19": 0.1357421875, "loss_aux_layer_2": 0.05584716796875, "loss_aux_layer_20": 0.1435546875, "loss_aux_layer_21": 0.151123046875, "loss_aux_layer_22": 0.171875, "loss_aux_layer_23": 0.209716796875, "loss_aux_layer_3": 0.06689453125, "loss_aux_layer_4": 0.07000732421875, "loss_aux_layer_5": 0.07177734375, "loss_aux_layer_6": 0.0745849609375, "loss_aux_layer_7": 0.07196044921875, "loss_aux_layer_8": 0.07110595703125, "loss_aux_layer_9": 0.06964111328125, "step": 1975, "total_loss": 0.6842055320739746 }, { "epoch": 0.3912096614531776, "grad_norm": 1.064363956451416, "learning_rate": 5e-05, "llm_loss": 0.526262529194355, "loss": 2.5001, "loss_aux_layer_0": 0.02276611328125, "loss_aux_layer_1": 0.0450439453125, "loss_aux_layer_10": 0.07275390625, "loss_aux_layer_11": 0.0772705078125, "loss_aux_layer_12": 0.0823974609375, "loss_aux_layer_13": 0.0888671875, "loss_aux_layer_14": 0.0987548828125, "loss_aux_layer_15": 0.1085205078125, "loss_aux_layer_16": 0.119140625, "loss_aux_layer_17": 0.1270751953125, "loss_aux_layer_18": 0.1357421875, "loss_aux_layer_19": 0.138427734375, "loss_aux_layer_2": 0.05743408203125, "loss_aux_layer_20": 0.145751953125, "loss_aux_layer_21": 0.153564453125, "loss_aux_layer_22": 0.17431640625, "loss_aux_layer_23": 0.213134765625, "loss_aux_layer_3": 0.06878662109375, "loss_aux_layer_4": 0.07177734375, "loss_aux_layer_5": 0.0736083984375, "loss_aux_layer_6": 0.0767822265625, "loss_aux_layer_7": 0.0738525390625, "loss_aux_layer_8": 0.072998046875, "loss_aux_layer_9": 0.0714111328125, "step": 1976, "total_loss": 0.6250153183937073 }, { "epoch": 0.391407642051079, "grad_norm": 1.1633325815200806, "learning_rate": 5e-05, "llm_loss": 0.6814245730638504, "loss": 3.1237, "loss_aux_layer_0": 0.023590087890625, "loss_aux_layer_1": 0.04541015625, "loss_aux_layer_10": 0.0738525390625, "loss_aux_layer_11": 0.0782470703125, "loss_aux_layer_12": 0.083740234375, "loss_aux_layer_13": 0.08984375, "loss_aux_layer_14": 0.10009765625, "loss_aux_layer_15": 0.109619140625, "loss_aux_layer_16": 0.1199951171875, "loss_aux_layer_17": 0.1282958984375, "loss_aux_layer_18": 0.1365966796875, "loss_aux_layer_19": 0.138916015625, "loss_aux_layer_2": 0.0579833984375, "loss_aux_layer_20": 0.146240234375, "loss_aux_layer_21": 0.153564453125, "loss_aux_layer_22": 0.17529296875, "loss_aux_layer_23": 0.21435546875, "loss_aux_layer_3": 0.069091796875, "loss_aux_layer_4": 0.0721435546875, "loss_aux_layer_5": 0.073974609375, "loss_aux_layer_6": 0.077392578125, "loss_aux_layer_7": 0.0745849609375, "loss_aux_layer_8": 0.0736083984375, "loss_aux_layer_9": 0.072265625, "step": 1977, "total_loss": 0.7809138000011444 }, { "epoch": 0.3916056226489804, "grad_norm": 1.0924081802368164, "learning_rate": 5e-05, "llm_loss": 0.6277163177728653, "loss": 2.901, "loss_aux_layer_0": 0.022430419921875, "loss_aux_layer_1": 0.042236328125, "loss_aux_layer_10": 0.07080078125, "loss_aux_layer_11": 0.074951171875, "loss_aux_layer_12": 0.080322265625, "loss_aux_layer_13": 0.0865478515625, "loss_aux_layer_14": 0.096923828125, "loss_aux_layer_15": 0.1075439453125, "loss_aux_layer_16": 0.1187744140625, "loss_aux_layer_17": 0.1268310546875, "loss_aux_layer_18": 0.1357421875, "loss_aux_layer_19": 0.1396484375, "loss_aux_layer_2": 0.0543212890625, "loss_aux_layer_20": 0.147705078125, "loss_aux_layer_21": 0.1552734375, "loss_aux_layer_22": 0.1767578125, "loss_aux_layer_23": 0.214599609375, "loss_aux_layer_3": 0.065185546875, "loss_aux_layer_4": 0.06829833984375, "loss_aux_layer_5": 0.0704345703125, "loss_aux_layer_6": 0.0740966796875, "loss_aux_layer_7": 0.071533203125, "loss_aux_layer_8": 0.0706787109375, "loss_aux_layer_9": 0.0694580078125, "step": 1978, "total_loss": 0.7252622544765472 }, { "epoch": 0.3918036032468818, "grad_norm": 1.392308235168457, "learning_rate": 5e-05, "llm_loss": 0.6611218601465225, "loss": 3.0376, "loss_aux_layer_0": 0.022003173828125, "loss_aux_layer_1": 0.04522705078125, "loss_aux_layer_10": 0.0733642578125, "loss_aux_layer_11": 0.07763671875, "loss_aux_layer_12": 0.0830078125, "loss_aux_layer_13": 0.0892333984375, "loss_aux_layer_14": 0.098876953125, "loss_aux_layer_15": 0.1080322265625, "loss_aux_layer_16": 0.11767578125, "loss_aux_layer_17": 0.1259765625, "loss_aux_layer_18": 0.134033203125, "loss_aux_layer_19": 0.13720703125, "loss_aux_layer_2": 0.05780029296875, "loss_aux_layer_20": 0.144775390625, "loss_aux_layer_21": 0.15185546875, "loss_aux_layer_22": 0.172119140625, "loss_aux_layer_23": 0.210205078125, "loss_aux_layer_3": 0.06854248046875, "loss_aux_layer_4": 0.0716552734375, "loss_aux_layer_5": 0.0736083984375, "loss_aux_layer_6": 0.0767822265625, "loss_aux_layer_7": 0.073974609375, "loss_aux_layer_8": 0.0731201171875, "loss_aux_layer_9": 0.072021484375, "step": 1979, "total_loss": 0.7593976557254791 }, { "epoch": 0.39200158384478323, "grad_norm": 0.9669089913368225, "learning_rate": 5e-05, "llm_loss": 0.5937204509973526, "loss": 2.767, "loss_aux_layer_0": 0.023651123046875, "loss_aux_layer_1": 0.0440673828125, "loss_aux_layer_10": 0.0714111328125, "loss_aux_layer_11": 0.0760498046875, "loss_aux_layer_12": 0.081298828125, "loss_aux_layer_13": 0.0877685546875, "loss_aux_layer_14": 0.097412109375, "loss_aux_layer_15": 0.107177734375, "loss_aux_layer_16": 0.117431640625, "loss_aux_layer_17": 0.1258544921875, "loss_aux_layer_18": 0.13525390625, "loss_aux_layer_19": 0.138916015625, "loss_aux_layer_2": 0.05584716796875, "loss_aux_layer_20": 0.146728515625, "loss_aux_layer_21": 0.155029296875, "loss_aux_layer_22": 0.176513671875, "loss_aux_layer_23": 0.217041015625, "loss_aux_layer_3": 0.06640625, "loss_aux_layer_4": 0.0693359375, "loss_aux_layer_5": 0.0712890625, "loss_aux_layer_6": 0.07470703125, "loss_aux_layer_7": 0.0721435546875, "loss_aux_layer_8": 0.0712890625, "loss_aux_layer_9": 0.070068359375, "step": 1980, "total_loss": 0.6917527616024017 }, { "epoch": 0.3921995644426846, "grad_norm": 1.1854791641235352, "learning_rate": 5e-05, "llm_loss": 0.5445531383156776, "loss": 2.5734, "loss_aux_layer_0": 0.022003173828125, "loss_aux_layer_1": 0.04400634765625, "loss_aux_layer_10": 0.0731201171875, "loss_aux_layer_11": 0.0777587890625, "loss_aux_layer_12": 0.0828857421875, "loss_aux_layer_13": 0.0892333984375, "loss_aux_layer_14": 0.098388671875, "loss_aux_layer_15": 0.1083984375, "loss_aux_layer_16": 0.11865234375, "loss_aux_layer_17": 0.126708984375, "loss_aux_layer_18": 0.1357421875, "loss_aux_layer_19": 0.13916015625, "loss_aux_layer_2": 0.05670166015625, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.15380859375, "loss_aux_layer_22": 0.175537109375, "loss_aux_layer_23": 0.21484375, "loss_aux_layer_3": 0.06787109375, "loss_aux_layer_4": 0.0711669921875, "loss_aux_layer_5": 0.072998046875, "loss_aux_layer_6": 0.0765380859375, "loss_aux_layer_7": 0.0738525390625, "loss_aux_layer_8": 0.0728759765625, "loss_aux_layer_9": 0.072021484375, "step": 1981, "total_loss": 0.643346518278122 }, { "epoch": 0.39239754504058605, "grad_norm": 0.8370237946510315, "learning_rate": 5e-05, "llm_loss": 0.5847756713628769, "loss": 2.7514, "loss_aux_layer_0": 0.0216064453125, "loss_aux_layer_1": 0.04803466796875, "loss_aux_layer_10": 0.07861328125, "loss_aux_layer_11": 0.083740234375, "loss_aux_layer_12": 0.0892333984375, "loss_aux_layer_13": 0.095458984375, "loss_aux_layer_14": 0.104736328125, "loss_aux_layer_15": 0.1136474609375, "loss_aux_layer_16": 0.12353515625, "loss_aux_layer_17": 0.130859375, "loss_aux_layer_18": 0.13916015625, "loss_aux_layer_19": 0.14111328125, "loss_aux_layer_2": 0.06121826171875, "loss_aux_layer_20": 0.1484375, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.177490234375, "loss_aux_layer_23": 0.215576171875, "loss_aux_layer_3": 0.0731201171875, "loss_aux_layer_4": 0.07666015625, "loss_aux_layer_5": 0.078857421875, "loss_aux_layer_6": 0.0821533203125, "loss_aux_layer_7": 0.0792236328125, "loss_aux_layer_8": 0.07861328125, "loss_aux_layer_9": 0.0771484375, "step": 1982, "total_loss": 0.6878417879343033 }, { "epoch": 0.39259552563848743, "grad_norm": 1.4415969848632812, "learning_rate": 5e-05, "llm_loss": 0.590337723493576, "loss": 2.7623, "loss_aux_layer_0": 0.021484375, "loss_aux_layer_1": 0.0450439453125, "loss_aux_layer_10": 0.0743408203125, "loss_aux_layer_11": 0.0791015625, "loss_aux_layer_12": 0.084716796875, "loss_aux_layer_13": 0.091064453125, "loss_aux_layer_14": 0.10107421875, "loss_aux_layer_15": 0.1107177734375, "loss_aux_layer_16": 0.1212158203125, "loss_aux_layer_17": 0.1298828125, "loss_aux_layer_18": 0.13818359375, "loss_aux_layer_19": 0.140869140625, "loss_aux_layer_2": 0.057861328125, "loss_aux_layer_20": 0.148681640625, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.176513671875, "loss_aux_layer_23": 0.21533203125, "loss_aux_layer_3": 0.06890869140625, "loss_aux_layer_4": 0.072021484375, "loss_aux_layer_5": 0.0740966796875, "loss_aux_layer_6": 0.0775146484375, "loss_aux_layer_7": 0.0748291015625, "loss_aux_layer_8": 0.073974609375, "loss_aux_layer_9": 0.072998046875, "step": 1983, "total_loss": 0.6905750781297684 }, { "epoch": 0.3927935062363888, "grad_norm": 1.3685884475708008, "learning_rate": 5e-05, "llm_loss": 0.5788977891206741, "loss": 2.7251, "loss_aux_layer_0": 0.0235595703125, "loss_aux_layer_1": 0.04638671875, "loss_aux_layer_10": 0.075439453125, "loss_aux_layer_11": 0.0804443359375, "loss_aux_layer_12": 0.0855712890625, "loss_aux_layer_13": 0.0921630859375, "loss_aux_layer_14": 0.1026611328125, "loss_aux_layer_15": 0.1129150390625, "loss_aux_layer_16": 0.1236572265625, "loss_aux_layer_17": 0.132080078125, "loss_aux_layer_18": 0.140869140625, "loss_aux_layer_19": 0.14453125, "loss_aux_layer_2": 0.0596923828125, "loss_aux_layer_20": 0.151123046875, "loss_aux_layer_21": 0.15869140625, "loss_aux_layer_22": 0.18017578125, "loss_aux_layer_23": 0.2197265625, "loss_aux_layer_3": 0.07110595703125, "loss_aux_layer_4": 0.0745849609375, "loss_aux_layer_5": 0.0762939453125, "loss_aux_layer_6": 0.0799560546875, "loss_aux_layer_7": 0.07666015625, "loss_aux_layer_8": 0.0755615234375, "loss_aux_layer_9": 0.0740966796875, "step": 1984, "total_loss": 0.6812753826379776 }, { "epoch": 0.39299148683429025, "grad_norm": 1.0347524881362915, "learning_rate": 5e-05, "llm_loss": 0.5931652784347534, "loss": 2.7736, "loss_aux_layer_0": 0.022308349609375, "loss_aux_layer_1": 0.045654296875, "loss_aux_layer_10": 0.0748291015625, "loss_aux_layer_11": 0.0794677734375, "loss_aux_layer_12": 0.0849609375, "loss_aux_layer_13": 0.091552734375, "loss_aux_layer_14": 0.10107421875, "loss_aux_layer_15": 0.1103515625, "loss_aux_layer_16": 0.1201171875, "loss_aux_layer_17": 0.1280517578125, "loss_aux_layer_18": 0.135986328125, "loss_aux_layer_19": 0.138916015625, "loss_aux_layer_2": 0.05938720703125, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.154541015625, "loss_aux_layer_22": 0.176513671875, "loss_aux_layer_23": 0.214599609375, "loss_aux_layer_3": 0.0704345703125, "loss_aux_layer_4": 0.0733642578125, "loss_aux_layer_5": 0.0750732421875, "loss_aux_layer_6": 0.0782470703125, "loss_aux_layer_7": 0.07568359375, "loss_aux_layer_8": 0.07470703125, "loss_aux_layer_9": 0.0736083984375, "step": 1985, "total_loss": 0.6934086978435516 }, { "epoch": 0.39318946743219163, "grad_norm": 1.2131315469741821, "learning_rate": 5e-05, "llm_loss": 0.5984787344932556, "loss": 2.7696, "loss_aux_layer_0": 0.022125244140625, "loss_aux_layer_1": 0.0419921875, "loss_aux_layer_10": 0.068359375, "loss_aux_layer_11": 0.072509765625, "loss_aux_layer_12": 0.07763671875, "loss_aux_layer_13": 0.083740234375, "loss_aux_layer_14": 0.0931396484375, "loss_aux_layer_15": 0.1026611328125, "loss_aux_layer_16": 0.1126708984375, "loss_aux_layer_17": 0.1209716796875, "loss_aux_layer_18": 0.129150390625, "loss_aux_layer_19": 0.1328125, "loss_aux_layer_2": 0.05401611328125, "loss_aux_layer_20": 0.140380859375, "loss_aux_layer_21": 0.1484375, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.2080078125, "loss_aux_layer_3": 0.0640869140625, "loss_aux_layer_4": 0.0670166015625, "loss_aux_layer_5": 0.06884765625, "loss_aux_layer_6": 0.0718994140625, "loss_aux_layer_7": 0.0693359375, "loss_aux_layer_8": 0.068359375, "loss_aux_layer_9": 0.0670166015625, "step": 1986, "total_loss": 0.6924029290676117 }, { "epoch": 0.39338744803009307, "grad_norm": 0.9996520280838013, "learning_rate": 5e-05, "llm_loss": 0.6335257738828659, "loss": 2.9299, "loss_aux_layer_0": 0.020721435546875, "loss_aux_layer_1": 0.04534912109375, "loss_aux_layer_10": 0.07379150390625, "loss_aux_layer_11": 0.078857421875, "loss_aux_layer_12": 0.084228515625, "loss_aux_layer_13": 0.090576171875, "loss_aux_layer_14": 0.0994873046875, "loss_aux_layer_15": 0.108642578125, "loss_aux_layer_16": 0.1187744140625, "loss_aux_layer_17": 0.12646484375, "loss_aux_layer_18": 0.1343994140625, "loss_aux_layer_19": 0.1376953125, "loss_aux_layer_2": 0.05810546875, "loss_aux_layer_20": 0.14453125, "loss_aux_layer_21": 0.151611328125, "loss_aux_layer_22": 0.173583984375, "loss_aux_layer_23": 0.2109375, "loss_aux_layer_3": 0.06976318359375, "loss_aux_layer_4": 0.0728759765625, "loss_aux_layer_5": 0.07470703125, "loss_aux_layer_6": 0.0777587890625, "loss_aux_layer_7": 0.07501220703125, "loss_aux_layer_8": 0.0738525390625, "loss_aux_layer_9": 0.0723876953125, "step": 1987, "total_loss": 0.7324789017438889 }, { "epoch": 0.39358542862799445, "grad_norm": 1.0113743543624878, "learning_rate": 5e-05, "llm_loss": 0.5897558704018593, "loss": 2.756, "loss_aux_layer_0": 0.0225830078125, "loss_aux_layer_1": 0.044921875, "loss_aux_layer_10": 0.072509765625, "loss_aux_layer_11": 0.0775146484375, "loss_aux_layer_12": 0.0833740234375, "loss_aux_layer_13": 0.090087890625, "loss_aux_layer_14": 0.099853515625, "loss_aux_layer_15": 0.10986328125, "loss_aux_layer_16": 0.119873046875, "loss_aux_layer_17": 0.1280517578125, "loss_aux_layer_18": 0.13671875, "loss_aux_layer_19": 0.139892578125, "loss_aux_layer_2": 0.0576171875, "loss_aux_layer_20": 0.1474609375, "loss_aux_layer_21": 0.155029296875, "loss_aux_layer_22": 0.17578125, "loss_aux_layer_23": 0.21435546875, "loss_aux_layer_3": 0.06866455078125, "loss_aux_layer_4": 0.0716552734375, "loss_aux_layer_5": 0.0733642578125, "loss_aux_layer_6": 0.076171875, "loss_aux_layer_7": 0.073486328125, "loss_aux_layer_8": 0.072265625, "loss_aux_layer_9": 0.0711669921875, "step": 1988, "total_loss": 0.6889888942241669 }, { "epoch": 0.3937834092258959, "grad_norm": 1.0400702953338623, "learning_rate": 5e-05, "llm_loss": 0.5912958234548569, "loss": 2.7668, "loss_aux_layer_0": 0.02349853515625, "loss_aux_layer_1": 0.04583740234375, "loss_aux_layer_10": 0.075439453125, "loss_aux_layer_11": 0.0804443359375, "loss_aux_layer_12": 0.0855712890625, "loss_aux_layer_13": 0.0916748046875, "loss_aux_layer_14": 0.1009521484375, "loss_aux_layer_15": 0.1103515625, "loss_aux_layer_16": 0.1202392578125, "loss_aux_layer_17": 0.1279296875, "loss_aux_layer_18": 0.136962890625, "loss_aux_layer_19": 0.139404296875, "loss_aux_layer_2": 0.05859375, "loss_aux_layer_20": 0.147216796875, "loss_aux_layer_21": 0.15380859375, "loss_aux_layer_22": 0.174072265625, "loss_aux_layer_23": 0.212158203125, "loss_aux_layer_3": 0.070556640625, "loss_aux_layer_4": 0.07373046875, "loss_aux_layer_5": 0.0760498046875, "loss_aux_layer_6": 0.0791015625, "loss_aux_layer_7": 0.076416015625, "loss_aux_layer_8": 0.0751953125, "loss_aux_layer_9": 0.073974609375, "step": 1989, "total_loss": 0.6916930973529816 }, { "epoch": 0.39398138982379727, "grad_norm": 0.8785432577133179, "learning_rate": 5e-05, "llm_loss": 0.594764307141304, "loss": 2.7717, "loss_aux_layer_0": 0.0218505859375, "loss_aux_layer_1": 0.0426025390625, "loss_aux_layer_10": 0.0709228515625, "loss_aux_layer_11": 0.075439453125, "loss_aux_layer_12": 0.080810546875, "loss_aux_layer_13": 0.0875244140625, "loss_aux_layer_14": 0.09765625, "loss_aux_layer_15": 0.10791015625, "loss_aux_layer_16": 0.119384765625, "loss_aux_layer_17": 0.1279296875, "loss_aux_layer_18": 0.13720703125, "loss_aux_layer_19": 0.140869140625, "loss_aux_layer_2": 0.05462646484375, "loss_aux_layer_20": 0.1484375, "loss_aux_layer_21": 0.156494140625, "loss_aux_layer_22": 0.17822265625, "loss_aux_layer_23": 0.21826171875, "loss_aux_layer_3": 0.06585693359375, "loss_aux_layer_4": 0.06884765625, "loss_aux_layer_5": 0.07080078125, "loss_aux_layer_6": 0.073974609375, "loss_aux_layer_7": 0.0711669921875, "loss_aux_layer_8": 0.0704345703125, "loss_aux_layer_9": 0.0692138671875, "step": 1990, "total_loss": 0.6929320693016052 }, { "epoch": 0.39417937042169865, "grad_norm": 1.0748205184936523, "learning_rate": 5e-05, "llm_loss": 0.5554053634405136, "loss": 2.6326, "loss_aux_layer_0": 0.02154541015625, "loss_aux_layer_1": 0.0462646484375, "loss_aux_layer_10": 0.0780029296875, "loss_aux_layer_11": 0.0830078125, "loss_aux_layer_12": 0.088623046875, "loss_aux_layer_13": 0.0950927734375, "loss_aux_layer_14": 0.1043701171875, "loss_aux_layer_15": 0.1142578125, "loss_aux_layer_16": 0.1246337890625, "loss_aux_layer_17": 0.13232421875, "loss_aux_layer_18": 0.140625, "loss_aux_layer_19": 0.142578125, "loss_aux_layer_2": 0.06024169921875, "loss_aux_layer_20": 0.14892578125, "loss_aux_layer_21": 0.155029296875, "loss_aux_layer_22": 0.17578125, "loss_aux_layer_23": 0.21484375, "loss_aux_layer_3": 0.072509765625, "loss_aux_layer_4": 0.075927734375, "loss_aux_layer_5": 0.0780029296875, "loss_aux_layer_6": 0.0809326171875, "loss_aux_layer_7": 0.078369140625, "loss_aux_layer_8": 0.0775146484375, "loss_aux_layer_9": 0.076416015625, "step": 1991, "total_loss": 0.658146932721138 }, { "epoch": 0.3943773510196001, "grad_norm": 0.9651937484741211, "learning_rate": 5e-05, "llm_loss": 0.6715575456619263, "loss": 3.0869, "loss_aux_layer_0": 0.02264404296875, "loss_aux_layer_1": 0.0445556640625, "loss_aux_layer_10": 0.0740966796875, "loss_aux_layer_11": 0.0791015625, "loss_aux_layer_12": 0.08447265625, "loss_aux_layer_13": 0.0908203125, "loss_aux_layer_14": 0.1007080078125, "loss_aux_layer_15": 0.1104736328125, "loss_aux_layer_16": 0.1207275390625, "loss_aux_layer_17": 0.1282958984375, "loss_aux_layer_18": 0.136474609375, "loss_aux_layer_19": 0.1396484375, "loss_aux_layer_2": 0.0584716796875, "loss_aux_layer_20": 0.147216796875, "loss_aux_layer_21": 0.1552734375, "loss_aux_layer_22": 0.176513671875, "loss_aux_layer_23": 0.216552734375, "loss_aux_layer_3": 0.069580078125, "loss_aux_layer_4": 0.072998046875, "loss_aux_layer_5": 0.074951171875, "loss_aux_layer_6": 0.078369140625, "loss_aux_layer_7": 0.075439453125, "loss_aux_layer_8": 0.0740966796875, "loss_aux_layer_9": 0.07275390625, "step": 1992, "total_loss": 0.7717335671186447 }, { "epoch": 0.39457533161750147, "grad_norm": 1.0201061964035034, "learning_rate": 5e-05, "llm_loss": 0.6032223403453827, "loss": 2.8208, "loss_aux_layer_0": 0.02081298828125, "loss_aux_layer_1": 0.0457763671875, "loss_aux_layer_10": 0.0760498046875, "loss_aux_layer_11": 0.0810546875, "loss_aux_layer_12": 0.0867919921875, "loss_aux_layer_13": 0.093505859375, "loss_aux_layer_14": 0.103515625, "loss_aux_layer_15": 0.1134033203125, "loss_aux_layer_16": 0.1239013671875, "loss_aux_layer_17": 0.1318359375, "loss_aux_layer_18": 0.140625, "loss_aux_layer_19": 0.142822265625, "loss_aux_layer_2": 0.0592041015625, "loss_aux_layer_20": 0.1494140625, "loss_aux_layer_21": 0.155517578125, "loss_aux_layer_22": 0.177001953125, "loss_aux_layer_23": 0.216064453125, "loss_aux_layer_3": 0.0709228515625, "loss_aux_layer_4": 0.074462890625, "loss_aux_layer_5": 0.0765380859375, "loss_aux_layer_6": 0.0799560546875, "loss_aux_layer_7": 0.0772705078125, "loss_aux_layer_8": 0.075927734375, "loss_aux_layer_9": 0.0745849609375, "step": 1993, "total_loss": 0.7051893621683121 }, { "epoch": 0.3947733122154029, "grad_norm": 0.8784953355789185, "learning_rate": 5e-05, "llm_loss": 0.6571448966860771, "loss": 3.0151, "loss_aux_layer_0": 0.02117919921875, "loss_aux_layer_1": 0.04296875, "loss_aux_layer_10": 0.0716552734375, "loss_aux_layer_11": 0.0762939453125, "loss_aux_layer_12": 0.08154296875, "loss_aux_layer_13": 0.0877685546875, "loss_aux_layer_14": 0.0975341796875, "loss_aux_layer_15": 0.107177734375, "loss_aux_layer_16": 0.117431640625, "loss_aux_layer_17": 0.1253662109375, "loss_aux_layer_18": 0.13427734375, "loss_aux_layer_19": 0.13671875, "loss_aux_layer_2": 0.055419921875, "loss_aux_layer_20": 0.1435546875, "loss_aux_layer_21": 0.150146484375, "loss_aux_layer_22": 0.169189453125, "loss_aux_layer_23": 0.206298828125, "loss_aux_layer_3": 0.0665283203125, "loss_aux_layer_4": 0.069580078125, "loss_aux_layer_5": 0.0712890625, "loss_aux_layer_6": 0.07421875, "loss_aux_layer_7": 0.07177734375, "loss_aux_layer_8": 0.0711669921875, "loss_aux_layer_9": 0.0699462890625, "step": 1994, "total_loss": 0.7537652999162674 }, { "epoch": 0.3949712928133043, "grad_norm": 1.2787754535675049, "learning_rate": 5e-05, "llm_loss": 0.6636122167110443, "loss": 3.051, "loss_aux_layer_0": 0.021270751953125, "loss_aux_layer_1": 0.0445556640625, "loss_aux_layer_10": 0.0738525390625, "loss_aux_layer_11": 0.0782470703125, "loss_aux_layer_12": 0.0836181640625, "loss_aux_layer_13": 0.0902099609375, "loss_aux_layer_14": 0.099853515625, "loss_aux_layer_15": 0.1097412109375, "loss_aux_layer_16": 0.1201171875, "loss_aux_layer_17": 0.127685546875, "loss_aux_layer_18": 0.1357421875, "loss_aux_layer_19": 0.138671875, "loss_aux_layer_2": 0.05792236328125, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.152587890625, "loss_aux_layer_22": 0.1728515625, "loss_aux_layer_23": 0.210205078125, "loss_aux_layer_3": 0.0692138671875, "loss_aux_layer_4": 0.07275390625, "loss_aux_layer_5": 0.0748291015625, "loss_aux_layer_6": 0.077880859375, "loss_aux_layer_7": 0.074951171875, "loss_aux_layer_8": 0.073974609375, "loss_aux_layer_9": 0.072509765625, "step": 1995, "total_loss": 0.7627597153186798 }, { "epoch": 0.3951692734112057, "grad_norm": 1.0228558778762817, "learning_rate": 5e-05, "llm_loss": 0.5700962394475937, "loss": 2.6682, "loss_aux_layer_0": 0.021514892578125, "loss_aux_layer_1": 0.04364013671875, "loss_aux_layer_10": 0.0712890625, "loss_aux_layer_11": 0.075927734375, "loss_aux_layer_12": 0.081298828125, "loss_aux_layer_13": 0.0877685546875, "loss_aux_layer_14": 0.0972900390625, "loss_aux_layer_15": 0.1068115234375, "loss_aux_layer_16": 0.1170654296875, "loss_aux_layer_17": 0.1248779296875, "loss_aux_layer_18": 0.13330078125, "loss_aux_layer_19": 0.1357421875, "loss_aux_layer_2": 0.0572509765625, "loss_aux_layer_20": 0.142822265625, "loss_aux_layer_21": 0.150390625, "loss_aux_layer_22": 0.171142578125, "loss_aux_layer_23": 0.210205078125, "loss_aux_layer_3": 0.06793212890625, "loss_aux_layer_4": 0.070556640625, "loss_aux_layer_5": 0.072265625, "loss_aux_layer_6": 0.0751953125, "loss_aux_layer_7": 0.072509765625, "loss_aux_layer_8": 0.0712890625, "loss_aux_layer_9": 0.06964111328125, "step": 1996, "total_loss": 0.6670443713665009 }, { "epoch": 0.3953672540091071, "grad_norm": 0.9645127058029175, "learning_rate": 5e-05, "llm_loss": 0.5447434186935425, "loss": 2.5844, "loss_aux_layer_0": 0.023040771484375, "loss_aux_layer_1": 0.0450439453125, "loss_aux_layer_10": 0.07403564453125, "loss_aux_layer_11": 0.079345703125, "loss_aux_layer_12": 0.0850830078125, "loss_aux_layer_13": 0.0919189453125, "loss_aux_layer_14": 0.1025390625, "loss_aux_layer_15": 0.112548828125, "loss_aux_layer_16": 0.123046875, "loss_aux_layer_17": 0.131591796875, "loss_aux_layer_18": 0.140380859375, "loss_aux_layer_19": 0.1435546875, "loss_aux_layer_2": 0.05792236328125, "loss_aux_layer_20": 0.151123046875, "loss_aux_layer_21": 0.15869140625, "loss_aux_layer_22": 0.179931640625, "loss_aux_layer_23": 0.218994140625, "loss_aux_layer_3": 0.06915283203125, "loss_aux_layer_4": 0.07244873046875, "loss_aux_layer_5": 0.07415771484375, "loss_aux_layer_6": 0.0771484375, "loss_aux_layer_7": 0.074462890625, "loss_aux_layer_8": 0.0738525390625, "loss_aux_layer_9": 0.07269287109375, "step": 1997, "total_loss": 0.6460983604192734 }, { "epoch": 0.3955652346070085, "grad_norm": 1.0458604097366333, "learning_rate": 5e-05, "llm_loss": 0.5682845339179039, "loss": 2.6859, "loss_aux_layer_0": 0.02191162109375, "loss_aux_layer_1": 0.047119140625, "loss_aux_layer_10": 0.0777587890625, "loss_aux_layer_11": 0.0826416015625, "loss_aux_layer_12": 0.0882568359375, "loss_aux_layer_13": 0.094970703125, "loss_aux_layer_14": 0.104736328125, "loss_aux_layer_15": 0.1143798828125, "loss_aux_layer_16": 0.124755859375, "loss_aux_layer_17": 0.132080078125, "loss_aux_layer_18": 0.14013671875, "loss_aux_layer_19": 0.14208984375, "loss_aux_layer_2": 0.062255859375, "loss_aux_layer_20": 0.14892578125, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.17724609375, "loss_aux_layer_23": 0.2158203125, "loss_aux_layer_3": 0.07373046875, "loss_aux_layer_4": 0.076904296875, "loss_aux_layer_5": 0.0787353515625, "loss_aux_layer_6": 0.081787109375, "loss_aux_layer_7": 0.078857421875, "loss_aux_layer_8": 0.077880859375, "loss_aux_layer_9": 0.07666015625, "step": 1998, "total_loss": 0.6714641451835632 }, { "epoch": 0.3957632152049099, "grad_norm": 1.2287261486053467, "learning_rate": 5e-05, "llm_loss": 0.6191119104623795, "loss": 2.8678, "loss_aux_layer_0": 0.02130126953125, "loss_aux_layer_1": 0.04180908203125, "loss_aux_layer_10": 0.0712890625, "loss_aux_layer_11": 0.0760498046875, "loss_aux_layer_12": 0.0819091796875, "loss_aux_layer_13": 0.08837890625, "loss_aux_layer_14": 0.0977783203125, "loss_aux_layer_15": 0.1072998046875, "loss_aux_layer_16": 0.1177978515625, "loss_aux_layer_17": 0.126708984375, "loss_aux_layer_18": 0.1357421875, "loss_aux_layer_19": 0.13916015625, "loss_aux_layer_2": 0.0548095703125, "loss_aux_layer_20": 0.1474609375, "loss_aux_layer_21": 0.1552734375, "loss_aux_layer_22": 0.17626953125, "loss_aux_layer_23": 0.21630859375, "loss_aux_layer_3": 0.0662841796875, "loss_aux_layer_4": 0.06927490234375, "loss_aux_layer_5": 0.0712890625, "loss_aux_layer_6": 0.0743408203125, "loss_aux_layer_7": 0.0716552734375, "loss_aux_layer_8": 0.071044921875, "loss_aux_layer_9": 0.06976318359375, "step": 1999, "total_loss": 0.7169622629880905 }, { "epoch": 0.3959611958028113, "grad_norm": 1.1296865940093994, "learning_rate": 5e-05, "llm_loss": 0.6327068656682968, "loss": 2.9476, "loss_aux_layer_0": 0.021026611328125, "loss_aux_layer_1": 0.0469970703125, "loss_aux_layer_10": 0.07763671875, "loss_aux_layer_11": 0.082763671875, "loss_aux_layer_12": 0.0888671875, "loss_aux_layer_13": 0.0960693359375, "loss_aux_layer_14": 0.1065673828125, "loss_aux_layer_15": 0.1165771484375, "loss_aux_layer_16": 0.12744140625, "loss_aux_layer_17": 0.1353759765625, "loss_aux_layer_18": 0.143798828125, "loss_aux_layer_19": 0.145751953125, "loss_aux_layer_2": 0.0609130859375, "loss_aux_layer_20": 0.15234375, "loss_aux_layer_21": 0.15869140625, "loss_aux_layer_22": 0.178955078125, "loss_aux_layer_23": 0.216552734375, "loss_aux_layer_3": 0.0732421875, "loss_aux_layer_4": 0.0770263671875, "loss_aux_layer_5": 0.0789794921875, "loss_aux_layer_6": 0.0816650390625, "loss_aux_layer_7": 0.078857421875, "loss_aux_layer_8": 0.07763671875, "loss_aux_layer_9": 0.076171875, "step": 2000, "total_loss": 0.736901044845581 }, { "epoch": 0.39615917640071274, "grad_norm": 1.5180729627609253, "learning_rate": 5e-05, "llm_loss": 0.5654294341802597, "loss": 2.6828, "loss_aux_layer_0": 0.02691650390625, "loss_aux_layer_1": 0.04888916015625, "loss_aux_layer_10": 0.079833984375, "loss_aux_layer_11": 0.0848388671875, "loss_aux_layer_12": 0.0902099609375, "loss_aux_layer_13": 0.0965576171875, "loss_aux_layer_14": 0.1060791015625, "loss_aux_layer_15": 0.1153564453125, "loss_aux_layer_16": 0.125, "loss_aux_layer_17": 0.1328125, "loss_aux_layer_18": 0.141357421875, "loss_aux_layer_19": 0.143798828125, "loss_aux_layer_2": 0.0631103515625, "loss_aux_layer_20": 0.1513671875, "loss_aux_layer_21": 0.159423828125, "loss_aux_layer_22": 0.18212890625, "loss_aux_layer_23": 0.2216796875, "loss_aux_layer_3": 0.0751953125, "loss_aux_layer_4": 0.0782470703125, "loss_aux_layer_5": 0.080078125, "loss_aux_layer_6": 0.083251953125, "loss_aux_layer_7": 0.080810546875, "loss_aux_layer_8": 0.080078125, "loss_aux_layer_9": 0.07861328125, "step": 2001, "total_loss": 0.6706985384225845 }, { "epoch": 0.3963571569986141, "grad_norm": 1.1224690675735474, "learning_rate": 5e-05, "llm_loss": 0.5885337293148041, "loss": 2.7595, "loss_aux_layer_0": 0.021240234375, "loss_aux_layer_1": 0.045654296875, "loss_aux_layer_10": 0.0748291015625, "loss_aux_layer_11": 0.079833984375, "loss_aux_layer_12": 0.0853271484375, "loss_aux_layer_13": 0.09228515625, "loss_aux_layer_14": 0.1024169921875, "loss_aux_layer_15": 0.1124267578125, "loss_aux_layer_16": 0.123046875, "loss_aux_layer_17": 0.1307373046875, "loss_aux_layer_18": 0.1396484375, "loss_aux_layer_19": 0.142333984375, "loss_aux_layer_2": 0.05859375, "loss_aux_layer_20": 0.149658203125, "loss_aux_layer_21": 0.1572265625, "loss_aux_layer_22": 0.179443359375, "loss_aux_layer_23": 0.21875, "loss_aux_layer_3": 0.069580078125, "loss_aux_layer_4": 0.0728759765625, "loss_aux_layer_5": 0.0751953125, "loss_aux_layer_6": 0.078369140625, "loss_aux_layer_7": 0.0753173828125, "loss_aux_layer_8": 0.0745849609375, "loss_aux_layer_9": 0.0733642578125, "step": 2002, "total_loss": 0.6898663938045502 }, { "epoch": 0.39655513759651556, "grad_norm": 1.2028799057006836, "learning_rate": 5e-05, "llm_loss": 0.6752229779958725, "loss": 3.1028, "loss_aux_layer_0": 0.023040771484375, "loss_aux_layer_1": 0.04400634765625, "loss_aux_layer_10": 0.073974609375, "loss_aux_layer_11": 0.0784912109375, "loss_aux_layer_12": 0.083984375, "loss_aux_layer_13": 0.0906982421875, "loss_aux_layer_14": 0.1015625, "loss_aux_layer_15": 0.11181640625, "loss_aux_layer_16": 0.12255859375, "loss_aux_layer_17": 0.130126953125, "loss_aux_layer_18": 0.138671875, "loss_aux_layer_19": 0.1416015625, "loss_aux_layer_2": 0.0577392578125, "loss_aux_layer_20": 0.149169921875, "loss_aux_layer_21": 0.156005859375, "loss_aux_layer_22": 0.177001953125, "loss_aux_layer_23": 0.21630859375, "loss_aux_layer_3": 0.0692138671875, "loss_aux_layer_4": 0.072265625, "loss_aux_layer_5": 0.0743408203125, "loss_aux_layer_6": 0.077392578125, "loss_aux_layer_7": 0.0748291015625, "loss_aux_layer_8": 0.073974609375, "loss_aux_layer_9": 0.0726318359375, "step": 2003, "total_loss": 0.775700181722641 }, { "epoch": 0.39675311819441694, "grad_norm": 1.1832339763641357, "learning_rate": 5e-05, "llm_loss": 0.6113181412220001, "loss": 2.8262, "loss_aux_layer_0": 0.021209716796875, "loss_aux_layer_1": 0.041259765625, "loss_aux_layer_10": 0.06884765625, "loss_aux_layer_11": 0.0733642578125, "loss_aux_layer_12": 0.07861328125, "loss_aux_layer_13": 0.085205078125, "loss_aux_layer_14": 0.0948486328125, "loss_aux_layer_15": 0.1048583984375, "loss_aux_layer_16": 0.115478515625, "loss_aux_layer_17": 0.1236572265625, "loss_aux_layer_18": 0.133056640625, "loss_aux_layer_19": 0.136474609375, "loss_aux_layer_2": 0.05401611328125, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.1708984375, "loss_aux_layer_23": 0.2099609375, "loss_aux_layer_3": 0.0643310546875, "loss_aux_layer_4": 0.0675048828125, "loss_aux_layer_5": 0.0693359375, "loss_aux_layer_6": 0.0721435546875, "loss_aux_layer_7": 0.0694580078125, "loss_aux_layer_8": 0.0687255859375, "loss_aux_layer_9": 0.0672607421875, "step": 2004, "total_loss": 0.7065449208021164 }, { "epoch": 0.3969510987923184, "grad_norm": 1.2327251434326172, "learning_rate": 5e-05, "llm_loss": 0.6208120882511139, "loss": 2.8791, "loss_aux_layer_0": 0.02203369140625, "loss_aux_layer_1": 0.04437255859375, "loss_aux_layer_10": 0.0736083984375, "loss_aux_layer_11": 0.0780029296875, "loss_aux_layer_12": 0.08349609375, "loss_aux_layer_13": 0.0899658203125, "loss_aux_layer_14": 0.099853515625, "loss_aux_layer_15": 0.109375, "loss_aux_layer_16": 0.11962890625, "loss_aux_layer_17": 0.12744140625, "loss_aux_layer_18": 0.1357421875, "loss_aux_layer_19": 0.138427734375, "loss_aux_layer_2": 0.05755615234375, "loss_aux_layer_20": 0.14599609375, "loss_aux_layer_21": 0.15283203125, "loss_aux_layer_22": 0.172607421875, "loss_aux_layer_23": 0.210205078125, "loss_aux_layer_3": 0.06866455078125, "loss_aux_layer_4": 0.072021484375, "loss_aux_layer_5": 0.074462890625, "loss_aux_layer_6": 0.0777587890625, "loss_aux_layer_7": 0.0748291015625, "loss_aux_layer_8": 0.0736083984375, "loss_aux_layer_9": 0.072265625, "step": 2005, "total_loss": 0.7197728008031845 }, { "epoch": 0.39714907939021976, "grad_norm": 1.169789433479309, "learning_rate": 5e-05, "llm_loss": 0.6506556123495102, "loss": 3.006, "loss_aux_layer_0": 0.02056884765625, "loss_aux_layer_1": 0.04571533203125, "loss_aux_layer_10": 0.0762939453125, "loss_aux_layer_11": 0.081298828125, "loss_aux_layer_12": 0.086669921875, "loss_aux_layer_13": 0.09326171875, "loss_aux_layer_14": 0.102294921875, "loss_aux_layer_15": 0.1114501953125, "loss_aux_layer_16": 0.121337890625, "loss_aux_layer_17": 0.12841796875, "loss_aux_layer_18": 0.136474609375, "loss_aux_layer_19": 0.13916015625, "loss_aux_layer_2": 0.0599365234375, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.153076171875, "loss_aux_layer_22": 0.174560546875, "loss_aux_layer_23": 0.2138671875, "loss_aux_layer_3": 0.0712890625, "loss_aux_layer_4": 0.07470703125, "loss_aux_layer_5": 0.0765380859375, "loss_aux_layer_6": 0.0794677734375, "loss_aux_layer_7": 0.07666015625, "loss_aux_layer_8": 0.075927734375, "loss_aux_layer_9": 0.0745849609375, "step": 2006, "total_loss": 0.7514952123165131 }, { "epoch": 0.39734705998812114, "grad_norm": 1.188992977142334, "learning_rate": 5e-05, "llm_loss": 0.6018804162740707, "loss": 2.8082, "loss_aux_layer_0": 0.02130126953125, "loss_aux_layer_1": 0.04595947265625, "loss_aux_layer_10": 0.0745849609375, "loss_aux_layer_11": 0.0797119140625, "loss_aux_layer_12": 0.0850830078125, "loss_aux_layer_13": 0.091796875, "loss_aux_layer_14": 0.101318359375, "loss_aux_layer_15": 0.1107177734375, "loss_aux_layer_16": 0.12060546875, "loss_aux_layer_17": 0.12841796875, "loss_aux_layer_18": 0.136474609375, "loss_aux_layer_19": 0.138916015625, "loss_aux_layer_2": 0.05926513671875, "loss_aux_layer_20": 0.146240234375, "loss_aux_layer_21": 0.154541015625, "loss_aux_layer_22": 0.1748046875, "loss_aux_layer_23": 0.21533203125, "loss_aux_layer_3": 0.0704345703125, "loss_aux_layer_4": 0.0733642578125, "loss_aux_layer_5": 0.07470703125, "loss_aux_layer_6": 0.07763671875, "loss_aux_layer_7": 0.0751953125, "loss_aux_layer_8": 0.07421875, "loss_aux_layer_9": 0.0732421875, "step": 2007, "total_loss": 0.7020561248064041 }, { "epoch": 0.3975450405860226, "grad_norm": 0.9816415309906006, "learning_rate": 5e-05, "llm_loss": 0.6437874436378479, "loss": 2.9679, "loss_aux_layer_0": 0.0211181640625, "loss_aux_layer_1": 0.0445556640625, "loss_aux_layer_10": 0.07373046875, "loss_aux_layer_11": 0.078369140625, "loss_aux_layer_12": 0.0836181640625, "loss_aux_layer_13": 0.0894775390625, "loss_aux_layer_14": 0.0985107421875, "loss_aux_layer_15": 0.1075439453125, "loss_aux_layer_16": 0.1170654296875, "loss_aux_layer_17": 0.1248779296875, "loss_aux_layer_18": 0.13330078125, "loss_aux_layer_19": 0.13623046875, "loss_aux_layer_2": 0.05810546875, "loss_aux_layer_20": 0.1435546875, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.172119140625, "loss_aux_layer_23": 0.209716796875, "loss_aux_layer_3": 0.069091796875, "loss_aux_layer_4": 0.0721435546875, "loss_aux_layer_5": 0.073974609375, "loss_aux_layer_6": 0.076904296875, "loss_aux_layer_7": 0.07470703125, "loss_aux_layer_8": 0.07373046875, "loss_aux_layer_9": 0.0723876953125, "step": 2008, "total_loss": 0.7419655025005341 }, { "epoch": 0.39774302118392396, "grad_norm": 0.9561536312103271, "learning_rate": 5e-05, "llm_loss": 0.6220052465796471, "loss": 2.8758, "loss_aux_layer_0": 0.021209716796875, "loss_aux_layer_1": 0.04339599609375, "loss_aux_layer_10": 0.0718994140625, "loss_aux_layer_11": 0.0762939453125, "loss_aux_layer_12": 0.0811767578125, "loss_aux_layer_13": 0.0870361328125, "loss_aux_layer_14": 0.0963134765625, "loss_aux_layer_15": 0.10546875, "loss_aux_layer_16": 0.115478515625, "loss_aux_layer_17": 0.1241455078125, "loss_aux_layer_18": 0.13232421875, "loss_aux_layer_19": 0.1357421875, "loss_aux_layer_2": 0.05657958984375, "loss_aux_layer_20": 0.1435546875, "loss_aux_layer_21": 0.150634765625, "loss_aux_layer_22": 0.171875, "loss_aux_layer_23": 0.210693359375, "loss_aux_layer_3": 0.0677490234375, "loss_aux_layer_4": 0.070556640625, "loss_aux_layer_5": 0.07275390625, "loss_aux_layer_6": 0.0755615234375, "loss_aux_layer_7": 0.072998046875, "loss_aux_layer_8": 0.07177734375, "loss_aux_layer_9": 0.0706787109375, "step": 2009, "total_loss": 0.7189414799213409 }, { "epoch": 0.3979410017818254, "grad_norm": 1.0987548828125, "learning_rate": 5e-05, "llm_loss": 0.582354411482811, "loss": 2.7246, "loss_aux_layer_0": 0.021209716796875, "loss_aux_layer_1": 0.04296875, "loss_aux_layer_10": 0.0726318359375, "loss_aux_layer_11": 0.0775146484375, "loss_aux_layer_12": 0.08251953125, "loss_aux_layer_13": 0.089599609375, "loss_aux_layer_14": 0.0992431640625, "loss_aux_layer_15": 0.1090087890625, "loss_aux_layer_16": 0.119873046875, "loss_aux_layer_17": 0.1279296875, "loss_aux_layer_18": 0.13720703125, "loss_aux_layer_19": 0.140380859375, "loss_aux_layer_2": 0.05621337890625, "loss_aux_layer_20": 0.14794921875, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.175537109375, "loss_aux_layer_23": 0.2138671875, "loss_aux_layer_3": 0.067138671875, "loss_aux_layer_4": 0.0703125, "loss_aux_layer_5": 0.072509765625, "loss_aux_layer_6": 0.07568359375, "loss_aux_layer_7": 0.0728759765625, "loss_aux_layer_8": 0.072021484375, "loss_aux_layer_9": 0.0711669921875, "step": 2010, "total_loss": 0.681160032749176 }, { "epoch": 0.3981389823797268, "grad_norm": 1.322715401649475, "learning_rate": 5e-05, "llm_loss": 0.6079281866550446, "loss": 2.8262, "loss_aux_layer_0": 0.02215576171875, "loss_aux_layer_1": 0.04400634765625, "loss_aux_layer_10": 0.0728759765625, "loss_aux_layer_11": 0.07763671875, "loss_aux_layer_12": 0.0833740234375, "loss_aux_layer_13": 0.0894775390625, "loss_aux_layer_14": 0.09912109375, "loss_aux_layer_15": 0.1087646484375, "loss_aux_layer_16": 0.1185302734375, "loss_aux_layer_17": 0.1268310546875, "loss_aux_layer_18": 0.135009765625, "loss_aux_layer_19": 0.137451171875, "loss_aux_layer_2": 0.0574951171875, "loss_aux_layer_20": 0.1455078125, "loss_aux_layer_21": 0.152099609375, "loss_aux_layer_22": 0.17431640625, "loss_aux_layer_23": 0.212890625, "loss_aux_layer_3": 0.0684814453125, "loss_aux_layer_4": 0.0716552734375, "loss_aux_layer_5": 0.0736083984375, "loss_aux_layer_6": 0.0770263671875, "loss_aux_layer_7": 0.0740966796875, "loss_aux_layer_8": 0.072998046875, "loss_aux_layer_9": 0.071533203125, "step": 2011, "total_loss": 0.7065419107675552 }, { "epoch": 0.3983369629776282, "grad_norm": 1.2101842164993286, "learning_rate": 5e-05, "llm_loss": 0.6391470730304718, "loss": 2.9504, "loss_aux_layer_0": 0.022552490234375, "loss_aux_layer_1": 0.0435791015625, "loss_aux_layer_10": 0.0732421875, "loss_aux_layer_11": 0.077880859375, "loss_aux_layer_12": 0.0828857421875, "loss_aux_layer_13": 0.0892333984375, "loss_aux_layer_14": 0.0989990234375, "loss_aux_layer_15": 0.1082763671875, "loss_aux_layer_16": 0.1181640625, "loss_aux_layer_17": 0.12646484375, "loss_aux_layer_18": 0.135009765625, "loss_aux_layer_19": 0.13818359375, "loss_aux_layer_2": 0.0560302734375, "loss_aux_layer_20": 0.145263671875, "loss_aux_layer_21": 0.15234375, "loss_aux_layer_22": 0.17333984375, "loss_aux_layer_23": 0.212646484375, "loss_aux_layer_3": 0.067626953125, "loss_aux_layer_4": 0.0711669921875, "loss_aux_layer_5": 0.0733642578125, "loss_aux_layer_6": 0.07666015625, "loss_aux_layer_7": 0.073974609375, "loss_aux_layer_8": 0.0732421875, "loss_aux_layer_9": 0.0721435546875, "step": 2012, "total_loss": 0.7375916838645935 }, { "epoch": 0.3985349435755296, "grad_norm": 1.3372036218643188, "learning_rate": 5e-05, "llm_loss": 0.7024231851100922, "loss": 3.2015, "loss_aux_layer_0": 0.02154541015625, "loss_aux_layer_1": 0.0430908203125, "loss_aux_layer_10": 0.072021484375, "loss_aux_layer_11": 0.0767822265625, "loss_aux_layer_12": 0.082275390625, "loss_aux_layer_13": 0.0887451171875, "loss_aux_layer_14": 0.0987548828125, "loss_aux_layer_15": 0.108154296875, "loss_aux_layer_16": 0.1182861328125, "loss_aux_layer_17": 0.1268310546875, "loss_aux_layer_18": 0.135986328125, "loss_aux_layer_19": 0.138671875, "loss_aux_layer_2": 0.0556640625, "loss_aux_layer_20": 0.146240234375, "loss_aux_layer_21": 0.153076171875, "loss_aux_layer_22": 0.173828125, "loss_aux_layer_23": 0.2119140625, "loss_aux_layer_3": 0.06689453125, "loss_aux_layer_4": 0.06982421875, "loss_aux_layer_5": 0.0718994140625, "loss_aux_layer_6": 0.0750732421875, "loss_aux_layer_7": 0.07275390625, "loss_aux_layer_8": 0.072021484375, "loss_aux_layer_9": 0.070556640625, "step": 2013, "total_loss": 0.8003814518451691 }, { "epoch": 0.398732924173431, "grad_norm": 1.3921293020248413, "learning_rate": 5e-05, "llm_loss": 0.6474154442548752, "loss": 2.9934, "loss_aux_layer_0": 0.021270751953125, "loss_aux_layer_1": 0.04632568359375, "loss_aux_layer_10": 0.0767822265625, "loss_aux_layer_11": 0.0814208984375, "loss_aux_layer_12": 0.0870361328125, "loss_aux_layer_13": 0.09375, "loss_aux_layer_14": 0.1031494140625, "loss_aux_layer_15": 0.1123046875, "loss_aux_layer_16": 0.1217041015625, "loss_aux_layer_17": 0.1298828125, "loss_aux_layer_18": 0.138427734375, "loss_aux_layer_19": 0.139892578125, "loss_aux_layer_2": 0.05926513671875, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.15283203125, "loss_aux_layer_22": 0.173095703125, "loss_aux_layer_23": 0.210205078125, "loss_aux_layer_3": 0.070556640625, "loss_aux_layer_4": 0.0738525390625, "loss_aux_layer_5": 0.0760498046875, "loss_aux_layer_6": 0.0794677734375, "loss_aux_layer_7": 0.076904296875, "loss_aux_layer_8": 0.076416015625, "loss_aux_layer_9": 0.0750732421875, "step": 2014, "total_loss": 0.7483424544334412 }, { "epoch": 0.3989309047713324, "grad_norm": 1.5011835098266602, "learning_rate": 5e-05, "llm_loss": 0.5883641690015793, "loss": 2.7664, "loss_aux_layer_0": 0.0234375, "loss_aux_layer_1": 0.0469970703125, "loss_aux_layer_10": 0.0770263671875, "loss_aux_layer_11": 0.0819091796875, "loss_aux_layer_12": 0.0870361328125, "loss_aux_layer_13": 0.09326171875, "loss_aux_layer_14": 0.102783203125, "loss_aux_layer_15": 0.11279296875, "loss_aux_layer_16": 0.123046875, "loss_aux_layer_17": 0.130859375, "loss_aux_layer_18": 0.1396484375, "loss_aux_layer_19": 0.142822265625, "loss_aux_layer_2": 0.06134033203125, "loss_aux_layer_20": 0.150634765625, "loss_aux_layer_21": 0.15869140625, "loss_aux_layer_22": 0.18212890625, "loss_aux_layer_23": 0.2216796875, "loss_aux_layer_3": 0.0728759765625, "loss_aux_layer_4": 0.076416015625, "loss_aux_layer_5": 0.078369140625, "loss_aux_layer_6": 0.0814208984375, "loss_aux_layer_7": 0.07861328125, "loss_aux_layer_8": 0.07763671875, "loss_aux_layer_9": 0.0758056640625, "step": 2015, "total_loss": 0.6916072070598602 }, { "epoch": 0.3991288853692338, "grad_norm": 1.0129401683807373, "learning_rate": 5e-05, "llm_loss": 0.6326188445091248, "loss": 2.9329, "loss_aux_layer_0": 0.021820068359375, "loss_aux_layer_1": 0.0455322265625, "loss_aux_layer_10": 0.0758056640625, "loss_aux_layer_11": 0.0809326171875, "loss_aux_layer_12": 0.086181640625, "loss_aux_layer_13": 0.09228515625, "loss_aux_layer_14": 0.1015625, "loss_aux_layer_15": 0.110595703125, "loss_aux_layer_16": 0.120361328125, "loss_aux_layer_17": 0.1279296875, "loss_aux_layer_18": 0.1357421875, "loss_aux_layer_19": 0.137939453125, "loss_aux_layer_2": 0.0594482421875, "loss_aux_layer_20": 0.14501953125, "loss_aux_layer_21": 0.153076171875, "loss_aux_layer_22": 0.17529296875, "loss_aux_layer_23": 0.21484375, "loss_aux_layer_3": 0.071044921875, "loss_aux_layer_4": 0.0745849609375, "loss_aux_layer_5": 0.07666015625, "loss_aux_layer_6": 0.0797119140625, "loss_aux_layer_7": 0.0770263671875, "loss_aux_layer_8": 0.0760498046875, "loss_aux_layer_9": 0.0745849609375, "step": 2016, "total_loss": 0.7332287728786469 }, { "epoch": 0.39932686596713524, "grad_norm": 1.366734504699707, "learning_rate": 5e-05, "llm_loss": 0.6243275254964828, "loss": 2.8988, "loss_aux_layer_0": 0.021575927734375, "loss_aux_layer_1": 0.044189453125, "loss_aux_layer_10": 0.07373046875, "loss_aux_layer_11": 0.07861328125, "loss_aux_layer_12": 0.0848388671875, "loss_aux_layer_13": 0.091796875, "loss_aux_layer_14": 0.101806640625, "loss_aux_layer_15": 0.1119384765625, "loss_aux_layer_16": 0.1226806640625, "loss_aux_layer_17": 0.130615234375, "loss_aux_layer_18": 0.138671875, "loss_aux_layer_19": 0.140869140625, "loss_aux_layer_2": 0.05743408203125, "loss_aux_layer_20": 0.148681640625, "loss_aux_layer_21": 0.156494140625, "loss_aux_layer_22": 0.177490234375, "loss_aux_layer_23": 0.215576171875, "loss_aux_layer_3": 0.0689697265625, "loss_aux_layer_4": 0.072265625, "loss_aux_layer_5": 0.073974609375, "loss_aux_layer_6": 0.0770263671875, "loss_aux_layer_7": 0.074462890625, "loss_aux_layer_8": 0.073486328125, "loss_aux_layer_9": 0.0721435546875, "step": 2017, "total_loss": 0.7246990352869034 }, { "epoch": 0.3995248465650366, "grad_norm": 1.1537469625473022, "learning_rate": 5e-05, "llm_loss": 0.531566746532917, "loss": 2.5311, "loss_aux_layer_0": 0.022705078125, "loss_aux_layer_1": 0.0458984375, "loss_aux_layer_10": 0.075439453125, "loss_aux_layer_11": 0.080078125, "loss_aux_layer_12": 0.0853271484375, "loss_aux_layer_13": 0.0916748046875, "loss_aux_layer_14": 0.101806640625, "loss_aux_layer_15": 0.111572265625, "loss_aux_layer_16": 0.121826171875, "loss_aux_layer_17": 0.12939453125, "loss_aux_layer_18": 0.138427734375, "loss_aux_layer_19": 0.1416015625, "loss_aux_layer_2": 0.05908203125, "loss_aux_layer_20": 0.14892578125, "loss_aux_layer_21": 0.15625, "loss_aux_layer_22": 0.178466796875, "loss_aux_layer_23": 0.21875, "loss_aux_layer_3": 0.070068359375, "loss_aux_layer_4": 0.072998046875, "loss_aux_layer_5": 0.0750732421875, "loss_aux_layer_6": 0.0784912109375, "loss_aux_layer_7": 0.0758056640625, "loss_aux_layer_8": 0.0751953125, "loss_aux_layer_9": 0.0740966796875, "step": 2018, "total_loss": 0.6327827572822571 }, { "epoch": 0.39972282716293805, "grad_norm": 1.2356802225112915, "learning_rate": 5e-05, "llm_loss": 0.5963320583105087, "loss": 2.7756, "loss_aux_layer_0": 0.0224609375, "loss_aux_layer_1": 0.04254150390625, "loss_aux_layer_10": 0.0716552734375, "loss_aux_layer_11": 0.076416015625, "loss_aux_layer_12": 0.0819091796875, "loss_aux_layer_13": 0.0887451171875, "loss_aux_layer_14": 0.098388671875, "loss_aux_layer_15": 0.108154296875, "loss_aux_layer_16": 0.1185302734375, "loss_aux_layer_17": 0.1268310546875, "loss_aux_layer_18": 0.134521484375, "loss_aux_layer_19": 0.13818359375, "loss_aux_layer_2": 0.054931640625, "loss_aux_layer_20": 0.14599609375, "loss_aux_layer_21": 0.1533203125, "loss_aux_layer_22": 0.174560546875, "loss_aux_layer_23": 0.213134765625, "loss_aux_layer_3": 0.0654296875, "loss_aux_layer_4": 0.068603515625, "loss_aux_layer_5": 0.07080078125, "loss_aux_layer_6": 0.0738525390625, "loss_aux_layer_7": 0.0716552734375, "loss_aux_layer_8": 0.071044921875, "loss_aux_layer_9": 0.0701904296875, "step": 2019, "total_loss": 0.6939101219177246 }, { "epoch": 0.39992080776083944, "grad_norm": 1.2236566543579102, "learning_rate": 5e-05, "llm_loss": 0.605729416012764, "loss": 2.8156, "loss_aux_layer_0": 0.022430419921875, "loss_aux_layer_1": 0.0428466796875, "loss_aux_layer_10": 0.0716552734375, "loss_aux_layer_11": 0.0762939453125, "loss_aux_layer_12": 0.08203125, "loss_aux_layer_13": 0.0887451171875, "loss_aux_layer_14": 0.0992431640625, "loss_aux_layer_15": 0.10986328125, "loss_aux_layer_16": 0.1201171875, "loss_aux_layer_17": 0.1287841796875, "loss_aux_layer_18": 0.13720703125, "loss_aux_layer_19": 0.140380859375, "loss_aux_layer_2": 0.05511474609375, "loss_aux_layer_20": 0.14697265625, "loss_aux_layer_21": 0.153564453125, "loss_aux_layer_22": 0.173583984375, "loss_aux_layer_23": 0.212646484375, "loss_aux_layer_3": 0.06640625, "loss_aux_layer_4": 0.0694580078125, "loss_aux_layer_5": 0.0711669921875, "loss_aux_layer_6": 0.0745849609375, "loss_aux_layer_7": 0.0718994140625, "loss_aux_layer_8": 0.0711669921875, "loss_aux_layer_9": 0.0703125, "step": 2020, "total_loss": 0.7039003372192383 }, { "epoch": 0.4001187883587408, "grad_norm": 1.420838713645935, "learning_rate": 5e-05, "llm_loss": 0.6485612541437149, "loss": 2.9921, "loss_aux_layer_0": 0.02239990234375, "loss_aux_layer_1": 0.04437255859375, "loss_aux_layer_10": 0.0721435546875, "loss_aux_layer_11": 0.0770263671875, "loss_aux_layer_12": 0.0826416015625, "loss_aux_layer_13": 0.0889892578125, "loss_aux_layer_14": 0.098876953125, "loss_aux_layer_15": 0.1092529296875, "loss_aux_layer_16": 0.1202392578125, "loss_aux_layer_17": 0.1287841796875, "loss_aux_layer_18": 0.137451171875, "loss_aux_layer_19": 0.14111328125, "loss_aux_layer_2": 0.05682373046875, "loss_aux_layer_20": 0.148681640625, "loss_aux_layer_21": 0.156494140625, "loss_aux_layer_22": 0.177978515625, "loss_aux_layer_23": 0.21826171875, "loss_aux_layer_3": 0.0679931640625, "loss_aux_layer_4": 0.0714111328125, "loss_aux_layer_5": 0.0732421875, "loss_aux_layer_6": 0.0762939453125, "loss_aux_layer_7": 0.07373046875, "loss_aux_layer_8": 0.0726318359375, "loss_aux_layer_9": 0.0709228515625, "step": 2021, "total_loss": 0.7480304539203644 }, { "epoch": 0.40031676895664225, "grad_norm": 0.8328862190246582, "learning_rate": 5e-05, "llm_loss": 0.5733869671821594, "loss": 2.6991, "loss_aux_layer_0": 0.0238037109375, "loss_aux_layer_1": 0.04620361328125, "loss_aux_layer_10": 0.07568359375, "loss_aux_layer_11": 0.080810546875, "loss_aux_layer_12": 0.086181640625, "loss_aux_layer_13": 0.0928955078125, "loss_aux_layer_14": 0.1024169921875, "loss_aux_layer_15": 0.1124267578125, "loss_aux_layer_16": 0.12255859375, "loss_aux_layer_17": 0.13037109375, "loss_aux_layer_18": 0.138671875, "loss_aux_layer_19": 0.141357421875, "loss_aux_layer_2": 0.0592041015625, "loss_aux_layer_20": 0.1484375, "loss_aux_layer_21": 0.155517578125, "loss_aux_layer_22": 0.1767578125, "loss_aux_layer_23": 0.215087890625, "loss_aux_layer_3": 0.070556640625, "loss_aux_layer_4": 0.0738525390625, "loss_aux_layer_5": 0.0755615234375, "loss_aux_layer_6": 0.0787353515625, "loss_aux_layer_7": 0.0762939453125, "loss_aux_layer_8": 0.075439453125, "loss_aux_layer_9": 0.0743408203125, "step": 2022, "total_loss": 0.674771174788475 }, { "epoch": 0.40051474955454364, "grad_norm": 1.563701868057251, "learning_rate": 5e-05, "llm_loss": 0.5538321733474731, "loss": 2.5973, "loss_aux_layer_0": 0.022979736328125, "loss_aux_layer_1": 0.04156494140625, "loss_aux_layer_10": 0.068115234375, "loss_aux_layer_11": 0.072509765625, "loss_aux_layer_12": 0.0782470703125, "loss_aux_layer_13": 0.0850830078125, "loss_aux_layer_14": 0.09521484375, "loss_aux_layer_15": 0.104736328125, "loss_aux_layer_16": 0.1156005859375, "loss_aux_layer_17": 0.124267578125, "loss_aux_layer_18": 0.134033203125, "loss_aux_layer_19": 0.138427734375, "loss_aux_layer_2": 0.05291748046875, "loss_aux_layer_20": 0.14599609375, "loss_aux_layer_21": 0.15380859375, "loss_aux_layer_22": 0.17431640625, "loss_aux_layer_23": 0.2138671875, "loss_aux_layer_3": 0.063232421875, "loss_aux_layer_4": 0.06591796875, "loss_aux_layer_5": 0.0677490234375, "loss_aux_layer_6": 0.0706787109375, "loss_aux_layer_7": 0.0682373046875, "loss_aux_layer_8": 0.0677490234375, "loss_aux_layer_9": 0.0667724609375, "step": 2023, "total_loss": 0.649321049451828 }, { "epoch": 0.4007127301524451, "grad_norm": 1.2810863256454468, "learning_rate": 5e-05, "llm_loss": 0.5982437282800674, "loss": 2.7846, "loss_aux_layer_0": 0.022308349609375, "loss_aux_layer_1": 0.04351806640625, "loss_aux_layer_10": 0.0706787109375, "loss_aux_layer_11": 0.0755615234375, "loss_aux_layer_12": 0.0811767578125, "loss_aux_layer_13": 0.087890625, "loss_aux_layer_14": 0.0982666015625, "loss_aux_layer_15": 0.1083984375, "loss_aux_layer_16": 0.1192626953125, "loss_aux_layer_17": 0.1279296875, "loss_aux_layer_18": 0.1368408203125, "loss_aux_layer_19": 0.139892578125, "loss_aux_layer_2": 0.05609130859375, "loss_aux_layer_20": 0.147216796875, "loss_aux_layer_21": 0.154296875, "loss_aux_layer_22": 0.174560546875, "loss_aux_layer_23": 0.212646484375, "loss_aux_layer_3": 0.0665283203125, "loss_aux_layer_4": 0.069580078125, "loss_aux_layer_5": 0.0712890625, "loss_aux_layer_6": 0.0740966796875, "loss_aux_layer_7": 0.0714111328125, "loss_aux_layer_8": 0.0706787109375, "loss_aux_layer_9": 0.0694580078125, "step": 2024, "total_loss": 0.6961492151021957 }, { "epoch": 0.40091071075034646, "grad_norm": 0.9632026553153992, "learning_rate": 5e-05, "llm_loss": 0.593342587351799, "loss": 2.7618, "loss_aux_layer_0": 0.021759033203125, "loss_aux_layer_1": 0.0430908203125, "loss_aux_layer_10": 0.071533203125, "loss_aux_layer_11": 0.075927734375, "loss_aux_layer_12": 0.081298828125, "loss_aux_layer_13": 0.0872802734375, "loss_aux_layer_14": 0.0972900390625, "loss_aux_layer_15": 0.1068115234375, "loss_aux_layer_16": 0.116943359375, "loss_aux_layer_17": 0.12548828125, "loss_aux_layer_18": 0.134033203125, "loss_aux_layer_19": 0.136474609375, "loss_aux_layer_2": 0.05645751953125, "loss_aux_layer_20": 0.14453125, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.172119140625, "loss_aux_layer_23": 0.21044921875, "loss_aux_layer_3": 0.06719970703125, "loss_aux_layer_4": 0.070068359375, "loss_aux_layer_5": 0.0718994140625, "loss_aux_layer_6": 0.074951171875, "loss_aux_layer_7": 0.0721435546875, "loss_aux_layer_8": 0.0712890625, "loss_aux_layer_9": 0.0699462890625, "step": 2025, "total_loss": 0.6904411017894745 }, { "epoch": 0.4011086913482479, "grad_norm": 1.388533592224121, "learning_rate": 5e-05, "llm_loss": 0.6330250054597855, "loss": 2.9482, "loss_aux_layer_0": 0.022857666015625, "loss_aux_layer_1": 0.04766845703125, "loss_aux_layer_10": 0.0784912109375, "loss_aux_layer_11": 0.08349609375, "loss_aux_layer_12": 0.0888671875, "loss_aux_layer_13": 0.0958251953125, "loss_aux_layer_14": 0.10546875, "loss_aux_layer_15": 0.114990234375, "loss_aux_layer_16": 0.12548828125, "loss_aux_layer_17": 0.132568359375, "loss_aux_layer_18": 0.140869140625, "loss_aux_layer_19": 0.143310546875, "loss_aux_layer_2": 0.061279296875, "loss_aux_layer_20": 0.150390625, "loss_aux_layer_21": 0.157958984375, "loss_aux_layer_22": 0.180908203125, "loss_aux_layer_23": 0.219970703125, "loss_aux_layer_3": 0.073486328125, "loss_aux_layer_4": 0.0770263671875, "loss_aux_layer_5": 0.0791015625, "loss_aux_layer_6": 0.0821533203125, "loss_aux_layer_7": 0.0794677734375, "loss_aux_layer_8": 0.0784912109375, "loss_aux_layer_9": 0.07666015625, "step": 2026, "total_loss": 0.7370417714118958 }, { "epoch": 0.4013066719461493, "grad_norm": 1.1001750230789185, "learning_rate": 5e-05, "llm_loss": 0.5754515677690506, "loss": 2.6993, "loss_aux_layer_0": 0.023345947265625, "loss_aux_layer_1": 0.04547119140625, "loss_aux_layer_10": 0.0745849609375, "loss_aux_layer_11": 0.0791015625, "loss_aux_layer_12": 0.084228515625, "loss_aux_layer_13": 0.0902099609375, "loss_aux_layer_14": 0.0987548828125, "loss_aux_layer_15": 0.108154296875, "loss_aux_layer_16": 0.1180419921875, "loss_aux_layer_17": 0.1259765625, "loss_aux_layer_18": 0.134765625, "loss_aux_layer_19": 0.138427734375, "loss_aux_layer_2": 0.05743408203125, "loss_aux_layer_20": 0.146240234375, "loss_aux_layer_21": 0.15380859375, "loss_aux_layer_22": 0.17626953125, "loss_aux_layer_23": 0.2158203125, "loss_aux_layer_3": 0.06884765625, "loss_aux_layer_4": 0.072021484375, "loss_aux_layer_5": 0.073974609375, "loss_aux_layer_6": 0.077392578125, "loss_aux_layer_7": 0.074951171875, "loss_aux_layer_8": 0.07421875, "loss_aux_layer_9": 0.072998046875, "step": 2027, "total_loss": 0.6748307943344116 }, { "epoch": 0.40150465254405066, "grad_norm": 1.035913348197937, "learning_rate": 5e-05, "llm_loss": 0.5707335323095322, "loss": 2.672, "loss_aux_layer_0": 0.02294921875, "loss_aux_layer_1": 0.04339599609375, "loss_aux_layer_10": 0.0714111328125, "loss_aux_layer_11": 0.07666015625, "loss_aux_layer_12": 0.0821533203125, "loss_aux_layer_13": 0.0885009765625, "loss_aux_layer_14": 0.098388671875, "loss_aux_layer_15": 0.1077880859375, "loss_aux_layer_16": 0.117919921875, "loss_aux_layer_17": 0.1259765625, "loss_aux_layer_18": 0.134033203125, "loss_aux_layer_19": 0.136474609375, "loss_aux_layer_2": 0.0560302734375, "loss_aux_layer_20": 0.14306640625, "loss_aux_layer_21": 0.151123046875, "loss_aux_layer_22": 0.17138671875, "loss_aux_layer_23": 0.20947265625, "loss_aux_layer_3": 0.0670166015625, "loss_aux_layer_4": 0.070068359375, "loss_aux_layer_5": 0.0721435546875, "loss_aux_layer_6": 0.0751953125, "loss_aux_layer_7": 0.0721435546875, "loss_aux_layer_8": 0.0712890625, "loss_aux_layer_9": 0.0699462890625, "step": 2028, "total_loss": 0.6679928600788116 }, { "epoch": 0.4017026331419521, "grad_norm": 1.168257474899292, "learning_rate": 5e-05, "llm_loss": 0.6221473962068558, "loss": 2.8768, "loss_aux_layer_0": 0.02264404296875, "loss_aux_layer_1": 0.04266357421875, "loss_aux_layer_10": 0.070556640625, "loss_aux_layer_11": 0.0750732421875, "loss_aux_layer_12": 0.0806884765625, "loss_aux_layer_13": 0.0869140625, "loss_aux_layer_14": 0.096435546875, "loss_aux_layer_15": 0.1064453125, "loss_aux_layer_16": 0.1170654296875, "loss_aux_layer_17": 0.125732421875, "loss_aux_layer_18": 0.13427734375, "loss_aux_layer_19": 0.137939453125, "loss_aux_layer_2": 0.05560302734375, "loss_aux_layer_20": 0.14599609375, "loss_aux_layer_21": 0.153564453125, "loss_aux_layer_22": 0.17431640625, "loss_aux_layer_23": 0.21435546875, "loss_aux_layer_3": 0.0662841796875, "loss_aux_layer_4": 0.0689697265625, "loss_aux_layer_5": 0.0706787109375, "loss_aux_layer_6": 0.0732421875, "loss_aux_layer_7": 0.07080078125, "loss_aux_layer_8": 0.0701904296875, "loss_aux_layer_9": 0.0689697265625, "step": 2029, "total_loss": 0.7191977500915527 }, { "epoch": 0.4019006137398535, "grad_norm": 1.1880033016204834, "learning_rate": 5e-05, "llm_loss": 0.5729187279939651, "loss": 2.6753, "loss_aux_layer_0": 0.021575927734375, "loss_aux_layer_1": 0.0428466796875, "loss_aux_layer_10": 0.06982421875, "loss_aux_layer_11": 0.074462890625, "loss_aux_layer_12": 0.0797119140625, "loss_aux_layer_13": 0.086181640625, "loss_aux_layer_14": 0.095458984375, "loss_aux_layer_15": 0.10498046875, "loss_aux_layer_16": 0.1151123046875, "loss_aux_layer_17": 0.1239013671875, "loss_aux_layer_18": 0.132568359375, "loss_aux_layer_19": 0.136474609375, "loss_aux_layer_2": 0.0545654296875, "loss_aux_layer_20": 0.14404296875, "loss_aux_layer_21": 0.151123046875, "loss_aux_layer_22": 0.173095703125, "loss_aux_layer_23": 0.212158203125, "loss_aux_layer_3": 0.0650634765625, "loss_aux_layer_4": 0.0679931640625, "loss_aux_layer_5": 0.0699462890625, "loss_aux_layer_6": 0.07275390625, "loss_aux_layer_7": 0.0701904296875, "loss_aux_layer_8": 0.069580078125, "loss_aux_layer_9": 0.0682373046875, "step": 2030, "total_loss": 0.6688216626644135 }, { "epoch": 0.4020985943377549, "grad_norm": 1.1641318798065186, "learning_rate": 5e-05, "llm_loss": 0.7105147391557693, "loss": 3.2319, "loss_aux_layer_0": 0.021087646484375, "loss_aux_layer_1": 0.0430908203125, "loss_aux_layer_10": 0.072265625, "loss_aux_layer_11": 0.0765380859375, "loss_aux_layer_12": 0.0819091796875, "loss_aux_layer_13": 0.088134765625, "loss_aux_layer_14": 0.097900390625, "loss_aux_layer_15": 0.1072998046875, "loss_aux_layer_16": 0.11767578125, "loss_aux_layer_17": 0.126220703125, "loss_aux_layer_18": 0.134521484375, "loss_aux_layer_19": 0.137939453125, "loss_aux_layer_2": 0.05609130859375, "loss_aux_layer_20": 0.1455078125, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.1708984375, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.06707763671875, "loss_aux_layer_4": 0.0704345703125, "loss_aux_layer_5": 0.072509765625, "loss_aux_layer_6": 0.075927734375, "loss_aux_layer_7": 0.0733642578125, "loss_aux_layer_8": 0.0723876953125, "loss_aux_layer_9": 0.0706787109375, "step": 2031, "total_loss": 0.8079652935266495 }, { "epoch": 0.4022965749356563, "grad_norm": 1.17099130153656, "learning_rate": 5e-05, "llm_loss": 0.6092454344034195, "loss": 2.8509, "loss_aux_layer_0": 0.0230712890625, "loss_aux_layer_1": 0.04571533203125, "loss_aux_layer_10": 0.07666015625, "loss_aux_layer_11": 0.081787109375, "loss_aux_layer_12": 0.0877685546875, "loss_aux_layer_13": 0.094970703125, "loss_aux_layer_14": 0.1053466796875, "loss_aux_layer_15": 0.11572265625, "loss_aux_layer_16": 0.1258544921875, "loss_aux_layer_17": 0.13427734375, "loss_aux_layer_18": 0.143310546875, "loss_aux_layer_19": 0.146484375, "loss_aux_layer_2": 0.0601806640625, "loss_aux_layer_20": 0.153076171875, "loss_aux_layer_21": 0.16015625, "loss_aux_layer_22": 0.180908203125, "loss_aux_layer_23": 0.219970703125, "loss_aux_layer_3": 0.0711669921875, "loss_aux_layer_4": 0.0745849609375, "loss_aux_layer_5": 0.076416015625, "loss_aux_layer_6": 0.0797119140625, "loss_aux_layer_7": 0.0771484375, "loss_aux_layer_8": 0.076171875, "loss_aux_layer_9": 0.0745849609375, "step": 2032, "total_loss": 0.7127264142036438 }, { "epoch": 0.40249455553355773, "grad_norm": 1.9447323083877563, "learning_rate": 5e-05, "llm_loss": 0.6400223225355148, "loss": 2.9562, "loss_aux_layer_0": 0.020782470703125, "loss_aux_layer_1": 0.04522705078125, "loss_aux_layer_10": 0.07421875, "loss_aux_layer_11": 0.078857421875, "loss_aux_layer_12": 0.0841064453125, "loss_aux_layer_13": 0.090576171875, "loss_aux_layer_14": 0.0997314453125, "loss_aux_layer_15": 0.1092529296875, "loss_aux_layer_16": 0.119384765625, "loss_aux_layer_17": 0.12646484375, "loss_aux_layer_18": 0.134521484375, "loss_aux_layer_19": 0.13671875, "loss_aux_layer_2": 0.05859375, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.15185546875, "loss_aux_layer_22": 0.174560546875, "loss_aux_layer_23": 0.212646484375, "loss_aux_layer_3": 0.069580078125, "loss_aux_layer_4": 0.07275390625, "loss_aux_layer_5": 0.0745849609375, "loss_aux_layer_6": 0.0775146484375, "loss_aux_layer_7": 0.074951171875, "loss_aux_layer_8": 0.0740966796875, "loss_aux_layer_9": 0.0726318359375, "step": 2033, "total_loss": 0.7390493005514145 }, { "epoch": 0.4026925361314591, "grad_norm": 2.3974123001098633, "learning_rate": 5e-05, "llm_loss": 0.6056173592805862, "loss": 2.8129, "loss_aux_layer_0": 0.02215576171875, "loss_aux_layer_1": 0.04351806640625, "loss_aux_layer_10": 0.070556640625, "loss_aux_layer_11": 0.0751953125, "loss_aux_layer_12": 0.0810546875, "loss_aux_layer_13": 0.0880126953125, "loss_aux_layer_14": 0.097900390625, "loss_aux_layer_15": 0.10791015625, "loss_aux_layer_16": 0.1187744140625, "loss_aux_layer_17": 0.127685546875, "loss_aux_layer_18": 0.136474609375, "loss_aux_layer_19": 0.13916015625, "loss_aux_layer_2": 0.05584716796875, "loss_aux_layer_20": 0.146728515625, "loss_aux_layer_21": 0.1533203125, "loss_aux_layer_22": 0.174072265625, "loss_aux_layer_23": 0.212890625, "loss_aux_layer_3": 0.0657958984375, "loss_aux_layer_4": 0.0687255859375, "loss_aux_layer_5": 0.070556640625, "loss_aux_layer_6": 0.07421875, "loss_aux_layer_7": 0.0716552734375, "loss_aux_layer_8": 0.0706787109375, "loss_aux_layer_9": 0.0693359375, "step": 2034, "total_loss": 0.7032163739204407 }, { "epoch": 0.40289051672936055, "grad_norm": 1.3204132318496704, "learning_rate": 5e-05, "llm_loss": 0.5826040208339691, "loss": 2.7489, "loss_aux_layer_0": 0.02117919921875, "loss_aux_layer_1": 0.04827880859375, "loss_aux_layer_10": 0.0791015625, "loss_aux_layer_11": 0.08447265625, "loss_aux_layer_12": 0.09033203125, "loss_aux_layer_13": 0.096923828125, "loss_aux_layer_14": 0.106689453125, "loss_aux_layer_15": 0.116455078125, "loss_aux_layer_16": 0.126220703125, "loss_aux_layer_17": 0.134033203125, "loss_aux_layer_18": 0.141845703125, "loss_aux_layer_19": 0.143798828125, "loss_aux_layer_2": 0.061767578125, "loss_aux_layer_20": 0.15087890625, "loss_aux_layer_21": 0.15771484375, "loss_aux_layer_22": 0.1796875, "loss_aux_layer_23": 0.21826171875, "loss_aux_layer_3": 0.07421875, "loss_aux_layer_4": 0.07763671875, "loss_aux_layer_5": 0.079833984375, "loss_aux_layer_6": 0.083251953125, "loss_aux_layer_7": 0.0806884765625, "loss_aux_layer_8": 0.079345703125, "loss_aux_layer_9": 0.077880859375, "step": 2035, "total_loss": 0.6872203350067139 }, { "epoch": 0.40308849732726193, "grad_norm": 1.360859751701355, "learning_rate": 5e-05, "llm_loss": 0.6000520884990692, "loss": 2.7968, "loss_aux_layer_0": 0.02154541015625, "loss_aux_layer_1": 0.04486083984375, "loss_aux_layer_10": 0.07373046875, "loss_aux_layer_11": 0.078369140625, "loss_aux_layer_12": 0.083984375, "loss_aux_layer_13": 0.090087890625, "loss_aux_layer_14": 0.099365234375, "loss_aux_layer_15": 0.10888671875, "loss_aux_layer_16": 0.1187744140625, "loss_aux_layer_17": 0.1268310546875, "loss_aux_layer_18": 0.13525390625, "loss_aux_layer_19": 0.137939453125, "loss_aux_layer_2": 0.058349609375, "loss_aux_layer_20": 0.145751953125, "loss_aux_layer_21": 0.15283203125, "loss_aux_layer_22": 0.173583984375, "loss_aux_layer_23": 0.21240234375, "loss_aux_layer_3": 0.0697021484375, "loss_aux_layer_4": 0.07275390625, "loss_aux_layer_5": 0.0745849609375, "loss_aux_layer_6": 0.0780029296875, "loss_aux_layer_7": 0.0753173828125, "loss_aux_layer_8": 0.073974609375, "loss_aux_layer_9": 0.072509765625, "step": 2036, "total_loss": 0.699188768863678 }, { "epoch": 0.4032864779251633, "grad_norm": 1.2903523445129395, "learning_rate": 5e-05, "llm_loss": 0.536940261721611, "loss": 2.5439, "loss_aux_layer_0": 0.021636962890625, "loss_aux_layer_1": 0.04327392578125, "loss_aux_layer_10": 0.0718994140625, "loss_aux_layer_11": 0.0770263671875, "loss_aux_layer_12": 0.082275390625, "loss_aux_layer_13": 0.088623046875, "loss_aux_layer_14": 0.09912109375, "loss_aux_layer_15": 0.1090087890625, "loss_aux_layer_16": 0.1195068359375, "loss_aux_layer_17": 0.1279296875, "loss_aux_layer_18": 0.136474609375, "loss_aux_layer_19": 0.139892578125, "loss_aux_layer_2": 0.056640625, "loss_aux_layer_20": 0.14794921875, "loss_aux_layer_21": 0.156005859375, "loss_aux_layer_22": 0.17822265625, "loss_aux_layer_23": 0.2177734375, "loss_aux_layer_3": 0.0677490234375, "loss_aux_layer_4": 0.0709228515625, "loss_aux_layer_5": 0.07275390625, "loss_aux_layer_6": 0.0758056640625, "loss_aux_layer_7": 0.072998046875, "loss_aux_layer_8": 0.072265625, "loss_aux_layer_9": 0.0706787109375, "step": 2037, "total_loss": 0.6359872967004776 }, { "epoch": 0.40348445852306475, "grad_norm": 1.3607016801834106, "learning_rate": 5e-05, "llm_loss": 0.6221979260444641, "loss": 2.8707, "loss_aux_layer_0": 0.021331787109375, "loss_aux_layer_1": 0.04254150390625, "loss_aux_layer_10": 0.0703125, "loss_aux_layer_11": 0.074951171875, "loss_aux_layer_12": 0.079833984375, "loss_aux_layer_13": 0.08642578125, "loss_aux_layer_14": 0.095703125, "loss_aux_layer_15": 0.1055908203125, "loss_aux_layer_16": 0.1153564453125, "loss_aux_layer_17": 0.1231689453125, "loss_aux_layer_18": 0.13134765625, "loss_aux_layer_19": 0.134033203125, "loss_aux_layer_2": 0.05535888671875, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.149658203125, "loss_aux_layer_22": 0.170166015625, "loss_aux_layer_23": 0.20751953125, "loss_aux_layer_3": 0.065673828125, "loss_aux_layer_4": 0.068359375, "loss_aux_layer_5": 0.070068359375, "loss_aux_layer_6": 0.072998046875, "loss_aux_layer_7": 0.070556640625, "loss_aux_layer_8": 0.06982421875, "loss_aux_layer_9": 0.0689697265625, "step": 2038, "total_loss": 0.7176720052957535 }, { "epoch": 0.40368243912096613, "grad_norm": 1.5649274587631226, "learning_rate": 5e-05, "llm_loss": 0.6939990520477295, "loss": 3.1675, "loss_aux_layer_0": 0.022308349609375, "loss_aux_layer_1": 0.04193115234375, "loss_aux_layer_10": 0.0704345703125, "loss_aux_layer_11": 0.0750732421875, "loss_aux_layer_12": 0.0806884765625, "loss_aux_layer_13": 0.087646484375, "loss_aux_layer_14": 0.09765625, "loss_aux_layer_15": 0.1080322265625, "loss_aux_layer_16": 0.118896484375, "loss_aux_layer_17": 0.126953125, "loss_aux_layer_18": 0.13623046875, "loss_aux_layer_19": 0.1396484375, "loss_aux_layer_2": 0.05511474609375, "loss_aux_layer_20": 0.147216796875, "loss_aux_layer_21": 0.15478515625, "loss_aux_layer_22": 0.1767578125, "loss_aux_layer_23": 0.216064453125, "loss_aux_layer_3": 0.0665283203125, "loss_aux_layer_4": 0.0697021484375, "loss_aux_layer_5": 0.0716552734375, "loss_aux_layer_6": 0.07421875, "loss_aux_layer_7": 0.0714111328125, "loss_aux_layer_8": 0.0703125, "loss_aux_layer_9": 0.0692138671875, "step": 2039, "total_loss": 0.791885569691658 }, { "epoch": 0.40388041971886757, "grad_norm": 1.3118976354599, "learning_rate": 5e-05, "llm_loss": 0.610395073890686, "loss": 2.8343, "loss_aux_layer_0": 0.02081298828125, "loss_aux_layer_1": 0.04443359375, "loss_aux_layer_10": 0.0726318359375, "loss_aux_layer_11": 0.0775146484375, "loss_aux_layer_12": 0.0826416015625, "loss_aux_layer_13": 0.0888671875, "loss_aux_layer_14": 0.0985107421875, "loss_aux_layer_15": 0.10791015625, "loss_aux_layer_16": 0.1181640625, "loss_aux_layer_17": 0.1259765625, "loss_aux_layer_18": 0.134033203125, "loss_aux_layer_19": 0.136962890625, "loss_aux_layer_2": 0.05767822265625, "loss_aux_layer_20": 0.14501953125, "loss_aux_layer_21": 0.152587890625, "loss_aux_layer_22": 0.173828125, "loss_aux_layer_23": 0.211181640625, "loss_aux_layer_3": 0.06915283203125, "loss_aux_layer_4": 0.072021484375, "loss_aux_layer_5": 0.0733642578125, "loss_aux_layer_6": 0.0760498046875, "loss_aux_layer_7": 0.0732421875, "loss_aux_layer_8": 0.072265625, "loss_aux_layer_9": 0.0711669921875, "step": 2040, "total_loss": 0.7085779309272766 }, { "epoch": 0.40407840031676895, "grad_norm": 1.554093837738037, "learning_rate": 5e-05, "llm_loss": 0.6368406116962433, "loss": 2.9636, "loss_aux_layer_0": 0.02349853515625, "loss_aux_layer_1": 0.04718017578125, "loss_aux_layer_10": 0.0777587890625, "loss_aux_layer_11": 0.0826416015625, "loss_aux_layer_12": 0.088623046875, "loss_aux_layer_13": 0.095458984375, "loss_aux_layer_14": 0.1055908203125, "loss_aux_layer_15": 0.11572265625, "loss_aux_layer_16": 0.126220703125, "loss_aux_layer_17": 0.134033203125, "loss_aux_layer_18": 0.14208984375, "loss_aux_layer_19": 0.144287109375, "loss_aux_layer_2": 0.06158447265625, "loss_aux_layer_20": 0.151123046875, "loss_aux_layer_21": 0.158203125, "loss_aux_layer_22": 0.180419921875, "loss_aux_layer_23": 0.219482421875, "loss_aux_layer_3": 0.0736083984375, "loss_aux_layer_4": 0.0770263671875, "loss_aux_layer_5": 0.07861328125, "loss_aux_layer_6": 0.081787109375, "loss_aux_layer_7": 0.07861328125, "loss_aux_layer_8": 0.0777587890625, "loss_aux_layer_9": 0.076416015625, "step": 2041, "total_loss": 0.7408963143825531 }, { "epoch": 0.4042763809146704, "grad_norm": 1.2197542190551758, "learning_rate": 5e-05, "llm_loss": 0.6085781455039978, "loss": 2.8419, "loss_aux_layer_0": 0.021575927734375, "loss_aux_layer_1": 0.04693603515625, "loss_aux_layer_10": 0.07666015625, "loss_aux_layer_11": 0.0816650390625, "loss_aux_layer_12": 0.087158203125, "loss_aux_layer_13": 0.09326171875, "loss_aux_layer_14": 0.1026611328125, "loss_aux_layer_15": 0.112060546875, "loss_aux_layer_16": 0.1219482421875, "loss_aux_layer_17": 0.129150390625, "loss_aux_layer_18": 0.137939453125, "loss_aux_layer_19": 0.140380859375, "loss_aux_layer_2": 0.0609130859375, "loss_aux_layer_20": 0.148193359375, "loss_aux_layer_21": 0.155517578125, "loss_aux_layer_22": 0.178466796875, "loss_aux_layer_23": 0.21728515625, "loss_aux_layer_3": 0.0723876953125, "loss_aux_layer_4": 0.0755615234375, "loss_aux_layer_5": 0.0771484375, "loss_aux_layer_6": 0.080078125, "loss_aux_layer_7": 0.07763671875, "loss_aux_layer_8": 0.076416015625, "loss_aux_layer_9": 0.074951171875, "step": 2042, "total_loss": 0.7104731649160385 }, { "epoch": 0.40447436151257177, "grad_norm": 1.2789430618286133, "learning_rate": 5e-05, "llm_loss": 0.5293852090835571, "loss": 2.5085, "loss_aux_layer_0": 0.021759033203125, "loss_aux_layer_1": 0.0440673828125, "loss_aux_layer_10": 0.0718994140625, "loss_aux_layer_11": 0.0767822265625, "loss_aux_layer_12": 0.0821533203125, "loss_aux_layer_13": 0.0888671875, "loss_aux_layer_14": 0.098876953125, "loss_aux_layer_15": 0.1090087890625, "loss_aux_layer_16": 0.1195068359375, "loss_aux_layer_17": 0.1273193359375, "loss_aux_layer_18": 0.135498046875, "loss_aux_layer_19": 0.137451171875, "loss_aux_layer_2": 0.05706787109375, "loss_aux_layer_20": 0.14453125, "loss_aux_layer_21": 0.15087890625, "loss_aux_layer_22": 0.170166015625, "loss_aux_layer_23": 0.20703125, "loss_aux_layer_3": 0.06854248046875, "loss_aux_layer_4": 0.0712890625, "loss_aux_layer_5": 0.07275390625, "loss_aux_layer_6": 0.07568359375, "loss_aux_layer_7": 0.0728759765625, "loss_aux_layer_8": 0.071533203125, "loss_aux_layer_9": 0.0704345703125, "step": 2043, "total_loss": 0.6271361857652664 }, { "epoch": 0.40467234211047315, "grad_norm": 1.591612696647644, "learning_rate": 5e-05, "llm_loss": 0.6453066766262054, "loss": 2.9832, "loss_aux_layer_0": 0.021453857421875, "loss_aux_layer_1": 0.04571533203125, "loss_aux_layer_10": 0.073974609375, "loss_aux_layer_11": 0.0787353515625, "loss_aux_layer_12": 0.084228515625, "loss_aux_layer_13": 0.0908203125, "loss_aux_layer_14": 0.100830078125, "loss_aux_layer_15": 0.1103515625, "loss_aux_layer_16": 0.1202392578125, "loss_aux_layer_17": 0.128173828125, "loss_aux_layer_18": 0.136474609375, "loss_aux_layer_19": 0.14013671875, "loss_aux_layer_2": 0.05999755859375, "loss_aux_layer_20": 0.147705078125, "loss_aux_layer_21": 0.15478515625, "loss_aux_layer_22": 0.177490234375, "loss_aux_layer_23": 0.21630859375, "loss_aux_layer_3": 0.0711669921875, "loss_aux_layer_4": 0.074462890625, "loss_aux_layer_5": 0.0762939453125, "loss_aux_layer_6": 0.079345703125, "loss_aux_layer_7": 0.0758056640625, "loss_aux_layer_8": 0.0740966796875, "loss_aux_layer_9": 0.072509765625, "step": 2044, "total_loss": 0.7457937598228455 }, { "epoch": 0.4048703227083746, "grad_norm": 1.0494680404663086, "learning_rate": 5e-05, "llm_loss": 0.6153736263513565, "loss": 2.8677, "loss_aux_layer_0": 0.021728515625, "loss_aux_layer_1": 0.04559326171875, "loss_aux_layer_10": 0.0765380859375, "loss_aux_layer_11": 0.0814208984375, "loss_aux_layer_12": 0.0869140625, "loss_aux_layer_13": 0.09326171875, "loss_aux_layer_14": 0.103271484375, "loss_aux_layer_15": 0.112548828125, "loss_aux_layer_16": 0.122314453125, "loss_aux_layer_17": 0.130126953125, "loss_aux_layer_18": 0.13818359375, "loss_aux_layer_19": 0.14111328125, "loss_aux_layer_2": 0.059814453125, "loss_aux_layer_20": 0.14794921875, "loss_aux_layer_21": 0.15380859375, "loss_aux_layer_22": 0.17529296875, "loss_aux_layer_23": 0.213623046875, "loss_aux_layer_3": 0.0718994140625, "loss_aux_layer_4": 0.0753173828125, "loss_aux_layer_5": 0.0770263671875, "loss_aux_layer_6": 0.080078125, "loss_aux_layer_7": 0.077392578125, "loss_aux_layer_8": 0.076416015625, "loss_aux_layer_9": 0.0750732421875, "step": 2045, "total_loss": 0.7169267684221268 }, { "epoch": 0.40506830330627597, "grad_norm": 1.1011873483657837, "learning_rate": 5e-05, "llm_loss": 0.5935561954975128, "loss": 2.7741, "loss_aux_layer_0": 0.021942138671875, "loss_aux_layer_1": 0.04473876953125, "loss_aux_layer_10": 0.0745849609375, "loss_aux_layer_11": 0.079345703125, "loss_aux_layer_12": 0.085205078125, "loss_aux_layer_13": 0.0914306640625, "loss_aux_layer_14": 0.1007080078125, "loss_aux_layer_15": 0.1104736328125, "loss_aux_layer_16": 0.1209716796875, "loss_aux_layer_17": 0.12841796875, "loss_aux_layer_18": 0.1376953125, "loss_aux_layer_19": 0.1396484375, "loss_aux_layer_2": 0.058349609375, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.154052734375, "loss_aux_layer_22": 0.17431640625, "loss_aux_layer_23": 0.212158203125, "loss_aux_layer_3": 0.0699462890625, "loss_aux_layer_4": 0.0728759765625, "loss_aux_layer_5": 0.0748291015625, "loss_aux_layer_6": 0.077880859375, "loss_aux_layer_7": 0.0751953125, "loss_aux_layer_8": 0.07421875, "loss_aux_layer_9": 0.0732421875, "step": 2046, "total_loss": 0.6935225427150726 }, { "epoch": 0.4052662839041774, "grad_norm": 1.2062015533447266, "learning_rate": 5e-05, "llm_loss": 0.6626464128494263, "loss": 3.0559, "loss_aux_layer_0": 0.0230712890625, "loss_aux_layer_1": 0.046142578125, "loss_aux_layer_10": 0.0758056640625, "loss_aux_layer_11": 0.0810546875, "loss_aux_layer_12": 0.087158203125, "loss_aux_layer_13": 0.093017578125, "loss_aux_layer_14": 0.102783203125, "loss_aux_layer_15": 0.1119384765625, "loss_aux_layer_16": 0.121826171875, "loss_aux_layer_17": 0.129638671875, "loss_aux_layer_18": 0.137939453125, "loss_aux_layer_19": 0.1396484375, "loss_aux_layer_2": 0.0594482421875, "loss_aux_layer_20": 0.147216796875, "loss_aux_layer_21": 0.1552734375, "loss_aux_layer_22": 0.177490234375, "loss_aux_layer_23": 0.217041015625, "loss_aux_layer_3": 0.0706787109375, "loss_aux_layer_4": 0.0738525390625, "loss_aux_layer_5": 0.075927734375, "loss_aux_layer_6": 0.0791015625, "loss_aux_layer_7": 0.07666015625, "loss_aux_layer_8": 0.0758056640625, "loss_aux_layer_9": 0.074462890625, "step": 2047, "total_loss": 0.7639787644147873 }, { "epoch": 0.4054642645020788, "grad_norm": 1.0713517665863037, "learning_rate": 5e-05, "llm_loss": 0.5688999593257904, "loss": 2.6438, "loss_aux_layer_0": 0.02166748046875, "loss_aux_layer_1": 0.040771484375, "loss_aux_layer_10": 0.06689453125, "loss_aux_layer_11": 0.071533203125, "loss_aux_layer_12": 0.0762939453125, "loss_aux_layer_13": 0.0826416015625, "loss_aux_layer_14": 0.0909423828125, "loss_aux_layer_15": 0.099853515625, "loss_aux_layer_16": 0.1097412109375, "loss_aux_layer_17": 0.1180419921875, "loss_aux_layer_18": 0.1265869140625, "loss_aux_layer_19": 0.1300048828125, "loss_aux_layer_2": 0.0523681640625, "loss_aux_layer_20": 0.13818359375, "loss_aux_layer_21": 0.146484375, "loss_aux_layer_22": 0.16748046875, "loss_aux_layer_23": 0.206298828125, "loss_aux_layer_3": 0.0623779296875, "loss_aux_layer_4": 0.065185546875, "loss_aux_layer_5": 0.06671142578125, "loss_aux_layer_6": 0.069580078125, "loss_aux_layer_7": 0.0672607421875, "loss_aux_layer_8": 0.06658935546875, "loss_aux_layer_9": 0.06561279296875, "step": 2048, "total_loss": 0.660957008600235 }, { "epoch": 0.4056622450999802, "grad_norm": 1.2173126935958862, "learning_rate": 5e-05, "llm_loss": 0.6167673468589783, "loss": 2.8618, "loss_aux_layer_0": 0.021575927734375, "loss_aux_layer_1": 0.04412841796875, "loss_aux_layer_10": 0.073486328125, "loss_aux_layer_11": 0.078369140625, "loss_aux_layer_12": 0.08349609375, "loss_aux_layer_13": 0.090087890625, "loss_aux_layer_14": 0.099365234375, "loss_aux_layer_15": 0.10888671875, "loss_aux_layer_16": 0.1190185546875, "loss_aux_layer_17": 0.1268310546875, "loss_aux_layer_18": 0.135498046875, "loss_aux_layer_19": 0.138427734375, "loss_aux_layer_2": 0.05645751953125, "loss_aux_layer_20": 0.145751953125, "loss_aux_layer_21": 0.153564453125, "loss_aux_layer_22": 0.174072265625, "loss_aux_layer_23": 0.21337890625, "loss_aux_layer_3": 0.0672607421875, "loss_aux_layer_4": 0.0706787109375, "loss_aux_layer_5": 0.0728759765625, "loss_aux_layer_6": 0.076416015625, "loss_aux_layer_7": 0.07373046875, "loss_aux_layer_8": 0.072998046875, "loss_aux_layer_9": 0.0718994140625, "step": 2049, "total_loss": 0.7154513448476791 }, { "epoch": 0.4058602256978816, "grad_norm": 1.197952389717102, "learning_rate": 5e-05, "llm_loss": 0.5824682712554932, "loss": 2.7149, "loss_aux_layer_0": 0.022735595703125, "loss_aux_layer_1": 0.04400634765625, "loss_aux_layer_10": 0.06964111328125, "loss_aux_layer_11": 0.07373046875, "loss_aux_layer_12": 0.0792236328125, "loss_aux_layer_13": 0.085693359375, "loss_aux_layer_14": 0.094970703125, "loss_aux_layer_15": 0.1048583984375, "loss_aux_layer_16": 0.115234375, "loss_aux_layer_17": 0.1234130859375, "loss_aux_layer_18": 0.13232421875, "loss_aux_layer_19": 0.135986328125, "loss_aux_layer_2": 0.05645751953125, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.15234375, "loss_aux_layer_22": 0.173583984375, "loss_aux_layer_23": 0.212646484375, "loss_aux_layer_3": 0.06683349609375, "loss_aux_layer_4": 0.0692138671875, "loss_aux_layer_5": 0.07086181640625, "loss_aux_layer_6": 0.0736083984375, "loss_aux_layer_7": 0.07086181640625, "loss_aux_layer_8": 0.06976318359375, "loss_aux_layer_9": 0.06829833984375, "step": 2050, "total_loss": 0.678720086812973 }, { "epoch": 0.406058206295783, "grad_norm": 1.08740234375, "learning_rate": 5e-05, "llm_loss": 0.6024227440357208, "loss": 2.8242, "loss_aux_layer_0": 0.021697998046875, "loss_aux_layer_1": 0.0457763671875, "loss_aux_layer_10": 0.0762939453125, "loss_aux_layer_11": 0.081787109375, "loss_aux_layer_12": 0.0877685546875, "loss_aux_layer_13": 0.0955810546875, "loss_aux_layer_14": 0.10595703125, "loss_aux_layer_15": 0.115966796875, "loss_aux_layer_16": 0.126953125, "loss_aux_layer_17": 0.135009765625, "loss_aux_layer_18": 0.143310546875, "loss_aux_layer_19": 0.1455078125, "loss_aux_layer_2": 0.0596923828125, "loss_aux_layer_20": 0.15283203125, "loss_aux_layer_21": 0.15966796875, "loss_aux_layer_22": 0.181640625, "loss_aux_layer_23": 0.221923828125, "loss_aux_layer_3": 0.071044921875, "loss_aux_layer_4": 0.07421875, "loss_aux_layer_5": 0.076416015625, "loss_aux_layer_6": 0.0799560546875, "loss_aux_layer_7": 0.0772705078125, "loss_aux_layer_8": 0.076171875, "loss_aux_layer_9": 0.07470703125, "step": 2051, "total_loss": 0.7060526758432388 }, { "epoch": 0.4062561868936844, "grad_norm": 1.2629443407058716, "learning_rate": 5e-05, "llm_loss": 0.610373318195343, "loss": 2.8258, "loss_aux_layer_0": 0.021514892578125, "loss_aux_layer_1": 0.04388427734375, "loss_aux_layer_10": 0.07080078125, "loss_aux_layer_11": 0.07568359375, "loss_aux_layer_12": 0.080810546875, "loss_aux_layer_13": 0.0872802734375, "loss_aux_layer_14": 0.0963134765625, "loss_aux_layer_15": 0.1055908203125, "loss_aux_layer_16": 0.1156005859375, "loss_aux_layer_17": 0.1239013671875, "loss_aux_layer_18": 0.132080078125, "loss_aux_layer_19": 0.13525390625, "loss_aux_layer_2": 0.05474853515625, "loss_aux_layer_20": 0.14306640625, "loss_aux_layer_21": 0.14990234375, "loss_aux_layer_22": 0.16943359375, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.066162109375, "loss_aux_layer_4": 0.0694580078125, "loss_aux_layer_5": 0.071044921875, "loss_aux_layer_6": 0.0738525390625, "loss_aux_layer_7": 0.0716552734375, "loss_aux_layer_8": 0.0706787109375, "loss_aux_layer_9": 0.069580078125, "step": 2052, "total_loss": 0.706438809633255 }, { "epoch": 0.4064541674915858, "grad_norm": 1.2801179885864258, "learning_rate": 5e-05, "llm_loss": 0.6173688247799873, "loss": 2.8901, "loss_aux_layer_0": 0.022308349609375, "loss_aux_layer_1": 0.05047607421875, "loss_aux_layer_10": 0.080322265625, "loss_aux_layer_11": 0.0855712890625, "loss_aux_layer_12": 0.0911865234375, "loss_aux_layer_13": 0.097900390625, "loss_aux_layer_14": 0.10693359375, "loss_aux_layer_15": 0.1158447265625, "loss_aux_layer_16": 0.1258544921875, "loss_aux_layer_17": 0.1329345703125, "loss_aux_layer_18": 0.140869140625, "loss_aux_layer_19": 0.142333984375, "loss_aux_layer_2": 0.064697265625, "loss_aux_layer_20": 0.1494140625, "loss_aux_layer_21": 0.156494140625, "loss_aux_layer_22": 0.1787109375, "loss_aux_layer_23": 0.217529296875, "loss_aux_layer_3": 0.076416015625, "loss_aux_layer_4": 0.0797119140625, "loss_aux_layer_5": 0.081787109375, "loss_aux_layer_6": 0.085205078125, "loss_aux_layer_7": 0.0821533203125, "loss_aux_layer_8": 0.08056640625, "loss_aux_layer_9": 0.078857421875, "step": 2053, "total_loss": 0.7225298658013344 }, { "epoch": 0.40665214808948724, "grad_norm": 1.140264868736267, "learning_rate": 5e-05, "llm_loss": 0.678339809179306, "loss": 3.0924, "loss_aux_layer_0": 0.021087646484375, "loss_aux_layer_1": 0.04095458984375, "loss_aux_layer_10": 0.0684814453125, "loss_aux_layer_11": 0.0728759765625, "loss_aux_layer_12": 0.078369140625, "loss_aux_layer_13": 0.08447265625, "loss_aux_layer_14": 0.093994140625, "loss_aux_layer_15": 0.103515625, "loss_aux_layer_16": 0.114501953125, "loss_aux_layer_17": 0.123291015625, "loss_aux_layer_18": 0.132568359375, "loss_aux_layer_19": 0.13623046875, "loss_aux_layer_2": 0.052978515625, "loss_aux_layer_20": 0.14453125, "loss_aux_layer_21": 0.151611328125, "loss_aux_layer_22": 0.171630859375, "loss_aux_layer_23": 0.209228515625, "loss_aux_layer_3": 0.06365966796875, "loss_aux_layer_4": 0.06646728515625, "loss_aux_layer_5": 0.0684814453125, "loss_aux_layer_6": 0.0714111328125, "loss_aux_layer_7": 0.0689697265625, "loss_aux_layer_8": 0.0679931640625, "loss_aux_layer_9": 0.06695556640625, "step": 2054, "total_loss": 0.773101195693016 }, { "epoch": 0.4068501286873886, "grad_norm": 1.0938199758529663, "learning_rate": 5e-05, "llm_loss": 0.5666807293891907, "loss": 2.6657, "loss_aux_layer_0": 0.0211181640625, "loss_aux_layer_1": 0.0450439453125, "loss_aux_layer_10": 0.0745849609375, "loss_aux_layer_11": 0.0797119140625, "loss_aux_layer_12": 0.0850830078125, "loss_aux_layer_13": 0.0911865234375, "loss_aux_layer_14": 0.1009521484375, "loss_aux_layer_15": 0.1104736328125, "loss_aux_layer_16": 0.1204833984375, "loss_aux_layer_17": 0.128173828125, "loss_aux_layer_18": 0.136474609375, "loss_aux_layer_19": 0.138916015625, "loss_aux_layer_2": 0.0576171875, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.15380859375, "loss_aux_layer_22": 0.175048828125, "loss_aux_layer_23": 0.213623046875, "loss_aux_layer_3": 0.06866455078125, "loss_aux_layer_4": 0.072021484375, "loss_aux_layer_5": 0.0740966796875, "loss_aux_layer_6": 0.0775146484375, "loss_aux_layer_7": 0.074951171875, "loss_aux_layer_8": 0.0740966796875, "loss_aux_layer_9": 0.0731201171875, "step": 2055, "total_loss": 0.6664218604564667 }, { "epoch": 0.40704810928529006, "grad_norm": 1.3518329858779907, "learning_rate": 5e-05, "llm_loss": 0.5648486614227295, "loss": 2.6453, "loss_aux_layer_0": 0.0216064453125, "loss_aux_layer_1": 0.04339599609375, "loss_aux_layer_10": 0.0706787109375, "loss_aux_layer_11": 0.07568359375, "loss_aux_layer_12": 0.0809326171875, "loss_aux_layer_13": 0.087158203125, "loss_aux_layer_14": 0.0970458984375, "loss_aux_layer_15": 0.1065673828125, "loss_aux_layer_16": 0.1170654296875, "loss_aux_layer_17": 0.1246337890625, "loss_aux_layer_18": 0.13330078125, "loss_aux_layer_19": 0.13623046875, "loss_aux_layer_2": 0.05499267578125, "loss_aux_layer_20": 0.14404296875, "loss_aux_layer_21": 0.15087890625, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.209228515625, "loss_aux_layer_3": 0.0660400390625, "loss_aux_layer_4": 0.0689697265625, "loss_aux_layer_5": 0.07080078125, "loss_aux_layer_6": 0.073974609375, "loss_aux_layer_7": 0.071533203125, "loss_aux_layer_8": 0.0706787109375, "loss_aux_layer_9": 0.06927490234375, "step": 2056, "total_loss": 0.6613204479217529 }, { "epoch": 0.40724608988319144, "grad_norm": 0.9305853247642517, "learning_rate": 5e-05, "llm_loss": 0.6543234884738922, "loss": 3.0048, "loss_aux_layer_0": 0.02130126953125, "loss_aux_layer_1": 0.0426025390625, "loss_aux_layer_10": 0.07135009765625, "loss_aux_layer_11": 0.0758056640625, "loss_aux_layer_12": 0.0809326171875, "loss_aux_layer_13": 0.0872802734375, "loss_aux_layer_14": 0.096923828125, "loss_aux_layer_15": 0.1068115234375, "loss_aux_layer_16": 0.11669921875, "loss_aux_layer_17": 0.12451171875, "loss_aux_layer_18": 0.1326904296875, "loss_aux_layer_19": 0.13623046875, "loss_aux_layer_2": 0.05523681640625, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.152099609375, "loss_aux_layer_22": 0.172607421875, "loss_aux_layer_23": 0.2119140625, "loss_aux_layer_3": 0.06640625, "loss_aux_layer_4": 0.069580078125, "loss_aux_layer_5": 0.07177734375, "loss_aux_layer_6": 0.074951171875, "loss_aux_layer_7": 0.072265625, "loss_aux_layer_8": 0.0714111328125, "loss_aux_layer_9": 0.07012939453125, "step": 2057, "total_loss": 0.7512041181325912 }, { "epoch": 0.4074440704810929, "grad_norm": 1.6199686527252197, "learning_rate": 5e-05, "llm_loss": 0.5654435753822327, "loss": 2.6478, "loss_aux_layer_0": 0.02264404296875, "loss_aux_layer_1": 0.0435791015625, "loss_aux_layer_10": 0.0711669921875, "loss_aux_layer_11": 0.0760498046875, "loss_aux_layer_12": 0.0810546875, "loss_aux_layer_13": 0.0872802734375, "loss_aux_layer_14": 0.0966796875, "loss_aux_layer_15": 0.1058349609375, "loss_aux_layer_16": 0.1158447265625, "loss_aux_layer_17": 0.123291015625, "loss_aux_layer_18": 0.13232421875, "loss_aux_layer_19": 0.13525390625, "loss_aux_layer_2": 0.05596923828125, "loss_aux_layer_20": 0.142822265625, "loss_aux_layer_21": 0.150390625, "loss_aux_layer_22": 0.172119140625, "loss_aux_layer_23": 0.2099609375, "loss_aux_layer_3": 0.06634521484375, "loss_aux_layer_4": 0.0694580078125, "loss_aux_layer_5": 0.0711669921875, "loss_aux_layer_6": 0.0740966796875, "loss_aux_layer_7": 0.071533203125, "loss_aux_layer_8": 0.07080078125, "loss_aux_layer_9": 0.069580078125, "step": 2058, "total_loss": 0.6619391888380051 }, { "epoch": 0.40764205107899426, "grad_norm": 0.9365748167037964, "learning_rate": 5e-05, "llm_loss": 0.6103348582983017, "loss": 2.8229, "loss_aux_layer_0": 0.021209716796875, "loss_aux_layer_1": 0.04217529296875, "loss_aux_layer_10": 0.069580078125, "loss_aux_layer_11": 0.0740966796875, "loss_aux_layer_12": 0.07958984375, "loss_aux_layer_13": 0.0860595703125, "loss_aux_layer_14": 0.0953369140625, "loss_aux_layer_15": 0.1048583984375, "loss_aux_layer_16": 0.1151123046875, "loss_aux_layer_17": 0.1234130859375, "loss_aux_layer_18": 0.1319580078125, "loss_aux_layer_19": 0.134765625, "loss_aux_layer_2": 0.0540771484375, "loss_aux_layer_20": 0.142578125, "loss_aux_layer_21": 0.150634765625, "loss_aux_layer_22": 0.171875, "loss_aux_layer_23": 0.211669921875, "loss_aux_layer_3": 0.064453125, "loss_aux_layer_4": 0.0675048828125, "loss_aux_layer_5": 0.0693359375, "loss_aux_layer_6": 0.072509765625, "loss_aux_layer_7": 0.070068359375, "loss_aux_layer_8": 0.069091796875, "loss_aux_layer_9": 0.068115234375, "step": 2059, "total_loss": 0.7057299613952637 }, { "epoch": 0.40784003167689564, "grad_norm": 1.467581868171692, "learning_rate": 5e-05, "llm_loss": 0.6430933400988579, "loss": 2.9676, "loss_aux_layer_0": 0.02227783203125, "loss_aux_layer_1": 0.0450439453125, "loss_aux_layer_10": 0.07318115234375, "loss_aux_layer_11": 0.0777587890625, "loss_aux_layer_12": 0.0828857421875, "loss_aux_layer_13": 0.089111328125, "loss_aux_layer_14": 0.09912109375, "loss_aux_layer_15": 0.1090087890625, "loss_aux_layer_16": 0.119140625, "loss_aux_layer_17": 0.12646484375, "loss_aux_layer_18": 0.134765625, "loss_aux_layer_19": 0.137939453125, "loss_aux_layer_2": 0.05767822265625, "loss_aux_layer_20": 0.146240234375, "loss_aux_layer_21": 0.15380859375, "loss_aux_layer_22": 0.175537109375, "loss_aux_layer_23": 0.214111328125, "loss_aux_layer_3": 0.0682373046875, "loss_aux_layer_4": 0.0712890625, "loss_aux_layer_5": 0.072998046875, "loss_aux_layer_6": 0.0765380859375, "loss_aux_layer_7": 0.0738525390625, "loss_aux_layer_8": 0.07275390625, "loss_aux_layer_9": 0.07183837890625, "step": 2060, "total_loss": 0.7419122457504272 }, { "epoch": 0.4080380122747971, "grad_norm": 1.8432903289794922, "learning_rate": 5e-05, "llm_loss": 0.5514142960309982, "loss": 2.5906, "loss_aux_layer_0": 0.0223388671875, "loss_aux_layer_1": 0.042724609375, "loss_aux_layer_10": 0.06903076171875, "loss_aux_layer_11": 0.0736083984375, "loss_aux_layer_12": 0.079345703125, "loss_aux_layer_13": 0.086181640625, "loss_aux_layer_14": 0.0960693359375, "loss_aux_layer_15": 0.1058349609375, "loss_aux_layer_16": 0.11669921875, "loss_aux_layer_17": 0.125, "loss_aux_layer_18": 0.13427734375, "loss_aux_layer_19": 0.136962890625, "loss_aux_layer_2": 0.055419921875, "loss_aux_layer_20": 0.14404296875, "loss_aux_layer_21": 0.152099609375, "loss_aux_layer_22": 0.173828125, "loss_aux_layer_23": 0.21337890625, "loss_aux_layer_3": 0.06512451171875, "loss_aux_layer_4": 0.06829833984375, "loss_aux_layer_5": 0.07000732421875, "loss_aux_layer_6": 0.07275390625, "loss_aux_layer_7": 0.070068359375, "loss_aux_layer_8": 0.06927490234375, "loss_aux_layer_9": 0.067626953125, "step": 2061, "total_loss": 0.6476540416479111 }, { "epoch": 0.40823599287269846, "grad_norm": 1.196219801902771, "learning_rate": 5e-05, "llm_loss": 0.5620935410261154, "loss": 2.6325, "loss_aux_layer_0": 0.02288818359375, "loss_aux_layer_1": 0.04315185546875, "loss_aux_layer_10": 0.070068359375, "loss_aux_layer_11": 0.0745849609375, "loss_aux_layer_12": 0.0797119140625, "loss_aux_layer_13": 0.0858154296875, "loss_aux_layer_14": 0.095458984375, "loss_aux_layer_15": 0.1053466796875, "loss_aux_layer_16": 0.1156005859375, "loss_aux_layer_17": 0.1234130859375, "loss_aux_layer_18": 0.13232421875, "loss_aux_layer_19": 0.1357421875, "loss_aux_layer_2": 0.0546875, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.17236328125, "loss_aux_layer_23": 0.2109375, "loss_aux_layer_3": 0.0654296875, "loss_aux_layer_4": 0.0684814453125, "loss_aux_layer_5": 0.0703125, "loss_aux_layer_6": 0.0732421875, "loss_aux_layer_7": 0.07080078125, "loss_aux_layer_8": 0.0699462890625, "loss_aux_layer_9": 0.06884765625, "step": 2062, "total_loss": 0.6581162959337234 }, { "epoch": 0.4084339734705999, "grad_norm": 1.4041500091552734, "learning_rate": 5e-05, "llm_loss": 0.6280177235603333, "loss": 2.9021, "loss_aux_layer_0": 0.021759033203125, "loss_aux_layer_1": 0.044189453125, "loss_aux_layer_10": 0.0716552734375, "loss_aux_layer_11": 0.076171875, "loss_aux_layer_12": 0.081298828125, "loss_aux_layer_13": 0.0875244140625, "loss_aux_layer_14": 0.09716796875, "loss_aux_layer_15": 0.1070556640625, "loss_aux_layer_16": 0.1173095703125, "loss_aux_layer_17": 0.124755859375, "loss_aux_layer_18": 0.13330078125, "loss_aux_layer_19": 0.136962890625, "loss_aux_layer_2": 0.05694580078125, "loss_aux_layer_20": 0.144775390625, "loss_aux_layer_21": 0.151611328125, "loss_aux_layer_22": 0.173095703125, "loss_aux_layer_23": 0.21044921875, "loss_aux_layer_3": 0.06787109375, "loss_aux_layer_4": 0.0711669921875, "loss_aux_layer_5": 0.0732421875, "loss_aux_layer_6": 0.0762939453125, "loss_aux_layer_7": 0.0731201171875, "loss_aux_layer_8": 0.0721435546875, "loss_aux_layer_9": 0.070556640625, "step": 2063, "total_loss": 0.7255267947912216 }, { "epoch": 0.4086319540685013, "grad_norm": 1.2920774221420288, "learning_rate": 5e-05, "llm_loss": 0.5510287582874298, "loss": 2.5901, "loss_aux_layer_0": 0.022705078125, "loss_aux_layer_1": 0.0438232421875, "loss_aux_layer_10": 0.0703125, "loss_aux_layer_11": 0.0745849609375, "loss_aux_layer_12": 0.0802001953125, "loss_aux_layer_13": 0.0865478515625, "loss_aux_layer_14": 0.095947265625, "loss_aux_layer_15": 0.10546875, "loss_aux_layer_16": 0.11572265625, "loss_aux_layer_17": 0.1239013671875, "loss_aux_layer_18": 0.1337890625, "loss_aux_layer_19": 0.136962890625, "loss_aux_layer_2": 0.0562744140625, "loss_aux_layer_20": 0.14501953125, "loss_aux_layer_21": 0.15185546875, "loss_aux_layer_22": 0.17236328125, "loss_aux_layer_23": 0.209716796875, "loss_aux_layer_3": 0.06646728515625, "loss_aux_layer_4": 0.069091796875, "loss_aux_layer_5": 0.0709228515625, "loss_aux_layer_6": 0.073486328125, "loss_aux_layer_7": 0.0712890625, "loss_aux_layer_8": 0.070556640625, "loss_aux_layer_9": 0.0689697265625, "step": 2064, "total_loss": 0.6475310772657394 }, { "epoch": 0.4088299346664027, "grad_norm": 1.4958842992782593, "learning_rate": 5e-05, "llm_loss": 0.5576237589120865, "loss": 2.6124, "loss_aux_layer_0": 0.021728515625, "loss_aux_layer_1": 0.04400634765625, "loss_aux_layer_10": 0.07080078125, "loss_aux_layer_11": 0.0751953125, "loss_aux_layer_12": 0.080078125, "loss_aux_layer_13": 0.0863037109375, "loss_aux_layer_14": 0.095458984375, "loss_aux_layer_15": 0.1044921875, "loss_aux_layer_16": 0.114501953125, "loss_aux_layer_17": 0.1220703125, "loss_aux_layer_18": 0.130615234375, "loss_aux_layer_19": 0.1337890625, "loss_aux_layer_2": 0.0565185546875, "loss_aux_layer_20": 0.140869140625, "loss_aux_layer_21": 0.147705078125, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.206298828125, "loss_aux_layer_3": 0.066650390625, "loss_aux_layer_4": 0.069580078125, "loss_aux_layer_5": 0.0711669921875, "loss_aux_layer_6": 0.0740966796875, "loss_aux_layer_7": 0.0712890625, "loss_aux_layer_8": 0.0706787109375, "loss_aux_layer_9": 0.0693359375, "step": 2065, "total_loss": 0.6530934125185013 }, { "epoch": 0.4090279152643041, "grad_norm": 1.191209077835083, "learning_rate": 5e-05, "llm_loss": 0.5480584278702736, "loss": 2.5818, "loss_aux_layer_0": 0.02239990234375, "loss_aux_layer_1": 0.0443115234375, "loss_aux_layer_10": 0.0718994140625, "loss_aux_layer_11": 0.0767822265625, "loss_aux_layer_12": 0.0819091796875, "loss_aux_layer_13": 0.088134765625, "loss_aux_layer_14": 0.09716796875, "loss_aux_layer_15": 0.1063232421875, "loss_aux_layer_16": 0.115966796875, "loss_aux_layer_17": 0.1240234375, "loss_aux_layer_18": 0.13330078125, "loss_aux_layer_19": 0.13623046875, "loss_aux_layer_2": 0.05645751953125, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.15185546875, "loss_aux_layer_22": 0.172607421875, "loss_aux_layer_23": 0.211181640625, "loss_aux_layer_3": 0.0675048828125, "loss_aux_layer_4": 0.070556640625, "loss_aux_layer_5": 0.0726318359375, "loss_aux_layer_6": 0.07568359375, "loss_aux_layer_7": 0.0732421875, "loss_aux_layer_8": 0.072265625, "loss_aux_layer_9": 0.07080078125, "step": 2066, "total_loss": 0.645449697971344 }, { "epoch": 0.4092258958622055, "grad_norm": 1.198431372642517, "learning_rate": 5e-05, "llm_loss": 0.5110526978969574, "loss": 2.4526, "loss_aux_layer_0": 0.023101806640625, "loss_aux_layer_1": 0.04571533203125, "loss_aux_layer_10": 0.0760498046875, "loss_aux_layer_11": 0.0809326171875, "loss_aux_layer_12": 0.0863037109375, "loss_aux_layer_13": 0.093017578125, "loss_aux_layer_14": 0.1036376953125, "loss_aux_layer_15": 0.1134033203125, "loss_aux_layer_16": 0.123779296875, "loss_aux_layer_17": 0.1318359375, "loss_aux_layer_18": 0.140380859375, "loss_aux_layer_19": 0.1435546875, "loss_aux_layer_2": 0.058837890625, "loss_aux_layer_20": 0.151123046875, "loss_aux_layer_21": 0.158447265625, "loss_aux_layer_22": 0.1787109375, "loss_aux_layer_23": 0.2177734375, "loss_aux_layer_3": 0.0697021484375, "loss_aux_layer_4": 0.0728759765625, "loss_aux_layer_5": 0.0751953125, "loss_aux_layer_6": 0.078857421875, "loss_aux_layer_7": 0.0762939453125, "loss_aux_layer_8": 0.0758056640625, "loss_aux_layer_9": 0.07470703125, "step": 2067, "total_loss": 0.6131518185138702 }, { "epoch": 0.4094238764601069, "grad_norm": 1.3752996921539307, "learning_rate": 5e-05, "llm_loss": 0.6614114791154861, "loss": 3.0357, "loss_aux_layer_0": 0.02191162109375, "loss_aux_layer_1": 0.04241943359375, "loss_aux_layer_10": 0.0714111328125, "loss_aux_layer_11": 0.076171875, "loss_aux_layer_12": 0.0819091796875, "loss_aux_layer_13": 0.088623046875, "loss_aux_layer_14": 0.0987548828125, "loss_aux_layer_15": 0.1085205078125, "loss_aux_layer_16": 0.11865234375, "loss_aux_layer_17": 0.1268310546875, "loss_aux_layer_18": 0.135498046875, "loss_aux_layer_19": 0.13720703125, "loss_aux_layer_2": 0.0562744140625, "loss_aux_layer_20": 0.14453125, "loss_aux_layer_21": 0.152099609375, "loss_aux_layer_22": 0.17236328125, "loss_aux_layer_23": 0.210693359375, "loss_aux_layer_3": 0.0667724609375, "loss_aux_layer_4": 0.06982421875, "loss_aux_layer_5": 0.0714111328125, "loss_aux_layer_6": 0.07470703125, "loss_aux_layer_7": 0.0721435546875, "loss_aux_layer_8": 0.0714111328125, "loss_aux_layer_9": 0.0699462890625, "step": 2068, "total_loss": 0.758917823433876 }, { "epoch": 0.4096218570580083, "grad_norm": 1.01409912109375, "learning_rate": 5e-05, "llm_loss": 0.6228973269462585, "loss": 2.8696, "loss_aux_layer_0": 0.022125244140625, "loss_aux_layer_1": 0.04150390625, "loss_aux_layer_10": 0.0684814453125, "loss_aux_layer_11": 0.0728759765625, "loss_aux_layer_12": 0.0782470703125, "loss_aux_layer_13": 0.0843505859375, "loss_aux_layer_14": 0.093505859375, "loss_aux_layer_15": 0.1031494140625, "loss_aux_layer_16": 0.1129150390625, "loss_aux_layer_17": 0.121337890625, "loss_aux_layer_18": 0.1298828125, "loss_aux_layer_19": 0.133544921875, "loss_aux_layer_2": 0.05401611328125, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.150390625, "loss_aux_layer_22": 0.171875, "loss_aux_layer_23": 0.2119140625, "loss_aux_layer_3": 0.064208984375, "loss_aux_layer_4": 0.067138671875, "loss_aux_layer_5": 0.0689697265625, "loss_aux_layer_6": 0.07177734375, "loss_aux_layer_7": 0.06884765625, "loss_aux_layer_8": 0.068115234375, "loss_aux_layer_9": 0.0670166015625, "step": 2069, "total_loss": 0.7174123376607895 }, { "epoch": 0.40981983765590974, "grad_norm": 1.2480418682098389, "learning_rate": 5e-05, "llm_loss": 0.6433202475309372, "loss": 2.9739, "loss_aux_layer_0": 0.02203369140625, "loss_aux_layer_1": 0.04534912109375, "loss_aux_layer_10": 0.0751953125, "loss_aux_layer_11": 0.0797119140625, "loss_aux_layer_12": 0.085205078125, "loss_aux_layer_13": 0.091796875, "loss_aux_layer_14": 0.1016845703125, "loss_aux_layer_15": 0.11181640625, "loss_aux_layer_16": 0.12255859375, "loss_aux_layer_17": 0.130126953125, "loss_aux_layer_18": 0.138671875, "loss_aux_layer_19": 0.140625, "loss_aux_layer_2": 0.05810546875, "loss_aux_layer_20": 0.147216796875, "loss_aux_layer_21": 0.152587890625, "loss_aux_layer_22": 0.1728515625, "loss_aux_layer_23": 0.21044921875, "loss_aux_layer_3": 0.0692138671875, "loss_aux_layer_4": 0.0726318359375, "loss_aux_layer_5": 0.074462890625, "loss_aux_layer_6": 0.0780029296875, "loss_aux_layer_7": 0.0753173828125, "loss_aux_layer_8": 0.074462890625, "loss_aux_layer_9": 0.0733642578125, "step": 2070, "total_loss": 0.7434794157743454 }, { "epoch": 0.4100178182538111, "grad_norm": 1.109784722328186, "learning_rate": 5e-05, "llm_loss": 0.6155246645212173, "loss": 2.8693, "loss_aux_layer_0": 0.024169921875, "loss_aux_layer_1": 0.0458984375, "loss_aux_layer_10": 0.075927734375, "loss_aux_layer_11": 0.0806884765625, "loss_aux_layer_12": 0.08642578125, "loss_aux_layer_13": 0.093017578125, "loss_aux_layer_14": 0.10302734375, "loss_aux_layer_15": 0.113037109375, "loss_aux_layer_16": 0.123291015625, "loss_aux_layer_17": 0.131103515625, "loss_aux_layer_18": 0.139404296875, "loss_aux_layer_19": 0.141845703125, "loss_aux_layer_2": 0.06024169921875, "loss_aux_layer_20": 0.14892578125, "loss_aux_layer_21": 0.15625, "loss_aux_layer_22": 0.177490234375, "loss_aux_layer_23": 0.21630859375, "loss_aux_layer_3": 0.07098388671875, "loss_aux_layer_4": 0.07373046875, "loss_aux_layer_5": 0.0755615234375, "loss_aux_layer_6": 0.0791015625, "loss_aux_layer_7": 0.0765380859375, "loss_aux_layer_8": 0.0758056640625, "loss_aux_layer_9": 0.0743408203125, "step": 2071, "total_loss": 0.7173259109258652 }, { "epoch": 0.41021579885171255, "grad_norm": 0.8437215089797974, "learning_rate": 5e-05, "llm_loss": 0.5898223593831062, "loss": 2.7324, "loss_aux_layer_0": 0.022674560546875, "loss_aux_layer_1": 0.04144287109375, "loss_aux_layer_10": 0.0670166015625, "loss_aux_layer_11": 0.0711669921875, "loss_aux_layer_12": 0.0767822265625, "loss_aux_layer_13": 0.0830078125, "loss_aux_layer_14": 0.092529296875, "loss_aux_layer_15": 0.1019287109375, "loss_aux_layer_16": 0.1119384765625, "loss_aux_layer_17": 0.1201171875, "loss_aux_layer_18": 0.128662109375, "loss_aux_layer_19": 0.13232421875, "loss_aux_layer_2": 0.052978515625, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.209716796875, "loss_aux_layer_3": 0.06298828125, "loss_aux_layer_4": 0.0655517578125, "loss_aux_layer_5": 0.06732177734375, "loss_aux_layer_6": 0.0699462890625, "loss_aux_layer_7": 0.067626953125, "loss_aux_layer_8": 0.06689453125, "loss_aux_layer_9": 0.06573486328125, "step": 2072, "total_loss": 0.6830988675355911 }, { "epoch": 0.41041377944961394, "grad_norm": 1.489097237586975, "learning_rate": 5e-05, "llm_loss": 0.6213481426239014, "loss": 2.8619, "loss_aux_layer_0": 0.022674560546875, "loss_aux_layer_1": 0.04144287109375, "loss_aux_layer_10": 0.06787109375, "loss_aux_layer_11": 0.0723876953125, "loss_aux_layer_12": 0.07763671875, "loss_aux_layer_13": 0.08349609375, "loss_aux_layer_14": 0.093017578125, "loss_aux_layer_15": 0.1026611328125, "loss_aux_layer_16": 0.11328125, "loss_aux_layer_17": 0.1221923828125, "loss_aux_layer_18": 0.131591796875, "loss_aux_layer_19": 0.135498046875, "loss_aux_layer_2": 0.05322265625, "loss_aux_layer_20": 0.14306640625, "loss_aux_layer_21": 0.150146484375, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.207275390625, "loss_aux_layer_3": 0.06353759765625, "loss_aux_layer_4": 0.06640625, "loss_aux_layer_5": 0.068115234375, "loss_aux_layer_6": 0.0711669921875, "loss_aux_layer_7": 0.068603515625, "loss_aux_layer_8": 0.0673828125, "loss_aux_layer_9": 0.06640625, "step": 2073, "total_loss": 0.715468019247055 }, { "epoch": 0.4106117600475153, "grad_norm": 1.1741118431091309, "learning_rate": 5e-05, "llm_loss": 0.5591797530651093, "loss": 2.6282, "loss_aux_layer_0": 0.023345947265625, "loss_aux_layer_1": 0.04351806640625, "loss_aux_layer_10": 0.07177734375, "loss_aux_layer_11": 0.0762939453125, "loss_aux_layer_12": 0.0819091796875, "loss_aux_layer_13": 0.0885009765625, "loss_aux_layer_14": 0.09814453125, "loss_aux_layer_15": 0.1080322265625, "loss_aux_layer_16": 0.11865234375, "loss_aux_layer_17": 0.127197265625, "loss_aux_layer_18": 0.135498046875, "loss_aux_layer_19": 0.138427734375, "loss_aux_layer_2": 0.05584716796875, "loss_aux_layer_20": 0.146240234375, "loss_aux_layer_21": 0.153564453125, "loss_aux_layer_22": 0.173583984375, "loss_aux_layer_23": 0.21142578125, "loss_aux_layer_3": 0.0667724609375, "loss_aux_layer_4": 0.069580078125, "loss_aux_layer_5": 0.071533203125, "loss_aux_layer_6": 0.0745849609375, "loss_aux_layer_7": 0.0721435546875, "loss_aux_layer_8": 0.0716552734375, "loss_aux_layer_9": 0.0706787109375, "step": 2074, "total_loss": 0.6570397913455963 }, { "epoch": 0.41080974064541675, "grad_norm": 0.9650455713272095, "learning_rate": 5e-05, "llm_loss": 0.6051876991987228, "loss": 2.8224, "loss_aux_layer_0": 0.021331787109375, "loss_aux_layer_1": 0.045166015625, "loss_aux_layer_10": 0.0748291015625, "loss_aux_layer_11": 0.07958984375, "loss_aux_layer_12": 0.0850830078125, "loss_aux_layer_13": 0.09130859375, "loss_aux_layer_14": 0.1014404296875, "loss_aux_layer_15": 0.1112060546875, "loss_aux_layer_16": 0.121337890625, "loss_aux_layer_17": 0.129638671875, "loss_aux_layer_18": 0.137451171875, "loss_aux_layer_19": 0.14013671875, "loss_aux_layer_2": 0.05841064453125, "loss_aux_layer_20": 0.14697265625, "loss_aux_layer_21": 0.15380859375, "loss_aux_layer_22": 0.175048828125, "loss_aux_layer_23": 0.21435546875, "loss_aux_layer_3": 0.0701904296875, "loss_aux_layer_4": 0.073486328125, "loss_aux_layer_5": 0.0751953125, "loss_aux_layer_6": 0.07861328125, "loss_aux_layer_7": 0.0760498046875, "loss_aux_layer_8": 0.0753173828125, "loss_aux_layer_9": 0.07373046875, "step": 2075, "total_loss": 0.7056110203266144 }, { "epoch": 0.41100772124331814, "grad_norm": 1.113807201385498, "learning_rate": 5e-05, "llm_loss": 0.5919836014509201, "loss": 2.7661, "loss_aux_layer_0": 0.022003173828125, "loss_aux_layer_1": 0.045654296875, "loss_aux_layer_10": 0.074462890625, "loss_aux_layer_11": 0.079345703125, "loss_aux_layer_12": 0.0845947265625, "loss_aux_layer_13": 0.09130859375, "loss_aux_layer_14": 0.1009521484375, "loss_aux_layer_15": 0.1099853515625, "loss_aux_layer_16": 0.1201171875, "loss_aux_layer_17": 0.128173828125, "loss_aux_layer_18": 0.136962890625, "loss_aux_layer_19": 0.139404296875, "loss_aux_layer_2": 0.05841064453125, "loss_aux_layer_20": 0.146240234375, "loss_aux_layer_21": 0.152587890625, "loss_aux_layer_22": 0.171142578125, "loss_aux_layer_23": 0.208740234375, "loss_aux_layer_3": 0.0697021484375, "loss_aux_layer_4": 0.0728759765625, "loss_aux_layer_5": 0.074951171875, "loss_aux_layer_6": 0.078125, "loss_aux_layer_7": 0.075439453125, "loss_aux_layer_8": 0.074462890625, "loss_aux_layer_9": 0.072998046875, "step": 2076, "total_loss": 0.6915215998888016 }, { "epoch": 0.4112057018412196, "grad_norm": 0.9867318272590637, "learning_rate": 5e-05, "llm_loss": 0.6266270726919174, "loss": 2.8928, "loss_aux_layer_0": 0.022216796875, "loss_aux_layer_1": 0.0426025390625, "loss_aux_layer_10": 0.071044921875, "loss_aux_layer_11": 0.0758056640625, "loss_aux_layer_12": 0.0811767578125, "loss_aux_layer_13": 0.0870361328125, "loss_aux_layer_14": 0.0965576171875, "loss_aux_layer_15": 0.1055908203125, "loss_aux_layer_16": 0.11572265625, "loss_aux_layer_17": 0.12451171875, "loss_aux_layer_18": 0.132568359375, "loss_aux_layer_19": 0.13720703125, "loss_aux_layer_2": 0.0548095703125, "loss_aux_layer_20": 0.14453125, "loss_aux_layer_21": 0.151611328125, "loss_aux_layer_22": 0.17236328125, "loss_aux_layer_23": 0.211669921875, "loss_aux_layer_3": 0.065673828125, "loss_aux_layer_4": 0.0687255859375, "loss_aux_layer_5": 0.0706787109375, "loss_aux_layer_6": 0.073974609375, "loss_aux_layer_7": 0.0714111328125, "loss_aux_layer_8": 0.070556640625, "loss_aux_layer_9": 0.0694580078125, "step": 2077, "total_loss": 0.7231926620006561 }, { "epoch": 0.41140368243912095, "grad_norm": 1.0697649717330933, "learning_rate": 5e-05, "llm_loss": 0.5859321653842926, "loss": 2.7307, "loss_aux_layer_0": 0.021148681640625, "loss_aux_layer_1": 0.04278564453125, "loss_aux_layer_10": 0.07080078125, "loss_aux_layer_11": 0.075439453125, "loss_aux_layer_12": 0.0811767578125, "loss_aux_layer_13": 0.0872802734375, "loss_aux_layer_14": 0.0970458984375, "loss_aux_layer_15": 0.1068115234375, "loss_aux_layer_16": 0.1173095703125, "loss_aux_layer_17": 0.1253662109375, "loss_aux_layer_18": 0.1337890625, "loss_aux_layer_19": 0.136962890625, "loss_aux_layer_2": 0.05487060546875, "loss_aux_layer_20": 0.144775390625, "loss_aux_layer_21": 0.15283203125, "loss_aux_layer_22": 0.173583984375, "loss_aux_layer_23": 0.211669921875, "loss_aux_layer_3": 0.06549072265625, "loss_aux_layer_4": 0.068603515625, "loss_aux_layer_5": 0.0706787109375, "loss_aux_layer_6": 0.07373046875, "loss_aux_layer_7": 0.0711669921875, "loss_aux_layer_8": 0.0703125, "loss_aux_layer_9": 0.0693359375, "step": 2078, "total_loss": 0.6826819628477097 }, { "epoch": 0.4116016630370224, "grad_norm": 1.32660973072052, "learning_rate": 5e-05, "llm_loss": 0.4863905534148216, "loss": 2.3532, "loss_aux_layer_0": 0.02069091796875, "loss_aux_layer_1": 0.0447998046875, "loss_aux_layer_10": 0.07568359375, "loss_aux_layer_11": 0.080322265625, "loss_aux_layer_12": 0.0860595703125, "loss_aux_layer_13": 0.09326171875, "loss_aux_layer_14": 0.103515625, "loss_aux_layer_15": 0.1129150390625, "loss_aux_layer_16": 0.1234130859375, "loss_aux_layer_17": 0.1318359375, "loss_aux_layer_18": 0.140625, "loss_aux_layer_19": 0.14306640625, "loss_aux_layer_2": 0.0587158203125, "loss_aux_layer_20": 0.150634765625, "loss_aux_layer_21": 0.158203125, "loss_aux_layer_22": 0.179443359375, "loss_aux_layer_23": 0.218505859375, "loss_aux_layer_3": 0.0703125, "loss_aux_layer_4": 0.0736083984375, "loss_aux_layer_5": 0.07568359375, "loss_aux_layer_6": 0.079345703125, "loss_aux_layer_7": 0.07666015625, "loss_aux_layer_8": 0.0751953125, "loss_aux_layer_9": 0.0740966796875, "step": 2079, "total_loss": 0.5882952511310577 }, { "epoch": 0.4117996436349238, "grad_norm": 0.800989031791687, "learning_rate": 5e-05, "llm_loss": 0.5455707907676697, "loss": 2.5725, "loss_aux_layer_0": 0.021484375, "loss_aux_layer_1": 0.0430908203125, "loss_aux_layer_10": 0.07275390625, "loss_aux_layer_11": 0.0772705078125, "loss_aux_layer_12": 0.0826416015625, "loss_aux_layer_13": 0.0887451171875, "loss_aux_layer_14": 0.097900390625, "loss_aux_layer_15": 0.107421875, "loss_aux_layer_16": 0.1171875, "loss_aux_layer_17": 0.12548828125, "loss_aux_layer_18": 0.134033203125, "loss_aux_layer_19": 0.136474609375, "loss_aux_layer_2": 0.0557861328125, "loss_aux_layer_20": 0.1435546875, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.1728515625, "loss_aux_layer_23": 0.211181640625, "loss_aux_layer_3": 0.06689453125, "loss_aux_layer_4": 0.07000732421875, "loss_aux_layer_5": 0.072509765625, "loss_aux_layer_6": 0.0758056640625, "loss_aux_layer_7": 0.0732421875, "loss_aux_layer_8": 0.072265625, "loss_aux_layer_9": 0.0712890625, "step": 2080, "total_loss": 0.6431288421154022 }, { "epoch": 0.4119976242328252, "grad_norm": 1.2140965461730957, "learning_rate": 5e-05, "llm_loss": 0.5459932088851929, "loss": 2.5804, "loss_aux_layer_0": 0.02105712890625, "loss_aux_layer_1": 0.0450439453125, "loss_aux_layer_10": 0.0736083984375, "loss_aux_layer_11": 0.0787353515625, "loss_aux_layer_12": 0.0843505859375, "loss_aux_layer_13": 0.091064453125, "loss_aux_layer_14": 0.1002197265625, "loss_aux_layer_15": 0.1092529296875, "loss_aux_layer_16": 0.119384765625, "loss_aux_layer_17": 0.1270751953125, "loss_aux_layer_18": 0.135498046875, "loss_aux_layer_19": 0.1376953125, "loss_aux_layer_2": 0.058349609375, "loss_aux_layer_20": 0.14501953125, "loss_aux_layer_21": 0.153076171875, "loss_aux_layer_22": 0.173095703125, "loss_aux_layer_23": 0.21142578125, "loss_aux_layer_3": 0.069580078125, "loss_aux_layer_4": 0.072509765625, "loss_aux_layer_5": 0.0743408203125, "loss_aux_layer_6": 0.077392578125, "loss_aux_layer_7": 0.0750732421875, "loss_aux_layer_8": 0.07373046875, "loss_aux_layer_9": 0.0721435546875, "step": 2081, "total_loss": 0.6450901180505753 }, { "epoch": 0.4121956048307266, "grad_norm": 0.917401909828186, "learning_rate": 5e-05, "llm_loss": 0.6554591357707977, "loss": 3.0075, "loss_aux_layer_0": 0.0224609375, "loss_aux_layer_1": 0.043212890625, "loss_aux_layer_10": 0.0711669921875, "loss_aux_layer_11": 0.07568359375, "loss_aux_layer_12": 0.0810546875, "loss_aux_layer_13": 0.0872802734375, "loss_aux_layer_14": 0.0963134765625, "loss_aux_layer_15": 0.10546875, "loss_aux_layer_16": 0.1158447265625, "loss_aux_layer_17": 0.1241455078125, "loss_aux_layer_18": 0.13232421875, "loss_aux_layer_19": 0.135986328125, "loss_aux_layer_2": 0.05548095703125, "loss_aux_layer_20": 0.14306640625, "loss_aux_layer_21": 0.150146484375, "loss_aux_layer_22": 0.169921875, "loss_aux_layer_23": 0.20849609375, "loss_aux_layer_3": 0.06658935546875, "loss_aux_layer_4": 0.0694580078125, "loss_aux_layer_5": 0.0714111328125, "loss_aux_layer_6": 0.074462890625, "loss_aux_layer_7": 0.072509765625, "loss_aux_layer_8": 0.0714111328125, "loss_aux_layer_9": 0.0701904296875, "step": 2082, "total_loss": 0.7518702149391174 }, { "epoch": 0.412393585428628, "grad_norm": 1.1075215339660645, "learning_rate": 5e-05, "llm_loss": 0.6379765570163727, "loss": 2.9268, "loss_aux_layer_0": 0.02056884765625, "loss_aux_layer_1": 0.0413818359375, "loss_aux_layer_10": 0.0682373046875, "loss_aux_layer_11": 0.07275390625, "loss_aux_layer_12": 0.0780029296875, "loss_aux_layer_13": 0.083984375, "loss_aux_layer_14": 0.0933837890625, "loss_aux_layer_15": 0.1024169921875, "loss_aux_layer_16": 0.112548828125, "loss_aux_layer_17": 0.12060546875, "loss_aux_layer_18": 0.1290283203125, "loss_aux_layer_19": 0.133056640625, "loss_aux_layer_2": 0.053466796875, "loss_aux_layer_20": 0.14111328125, "loss_aux_layer_21": 0.148681640625, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.20654296875, "loss_aux_layer_3": 0.06396484375, "loss_aux_layer_4": 0.066650390625, "loss_aux_layer_5": 0.068603515625, "loss_aux_layer_6": 0.0711669921875, "loss_aux_layer_7": 0.06884765625, "loss_aux_layer_8": 0.0679931640625, "loss_aux_layer_9": 0.0670166015625, "step": 2083, "total_loss": 0.7317027151584625 }, { "epoch": 0.4125915660265294, "grad_norm": 0.9664706587791443, "learning_rate": 5e-05, "llm_loss": 0.654677078127861, "loss": 3.0033, "loss_aux_layer_0": 0.021881103515625, "loss_aux_layer_1": 0.0435791015625, "loss_aux_layer_10": 0.0709228515625, "loss_aux_layer_11": 0.0760498046875, "loss_aux_layer_12": 0.0810546875, "loss_aux_layer_13": 0.0867919921875, "loss_aux_layer_14": 0.095947265625, "loss_aux_layer_15": 0.105224609375, "loss_aux_layer_16": 0.1146240234375, "loss_aux_layer_17": 0.1229248046875, "loss_aux_layer_18": 0.130859375, "loss_aux_layer_19": 0.1337890625, "loss_aux_layer_2": 0.0552978515625, "loss_aux_layer_20": 0.14208984375, "loss_aux_layer_21": 0.150146484375, "loss_aux_layer_22": 0.171142578125, "loss_aux_layer_23": 0.208984375, "loss_aux_layer_3": 0.066650390625, "loss_aux_layer_4": 0.0697021484375, "loss_aux_layer_5": 0.072021484375, "loss_aux_layer_6": 0.074951171875, "loss_aux_layer_7": 0.072265625, "loss_aux_layer_8": 0.0711669921875, "loss_aux_layer_9": 0.069580078125, "step": 2084, "total_loss": 0.7508150935173035 }, { "epoch": 0.4127895466244308, "grad_norm": 1.0001139640808105, "learning_rate": 5e-05, "llm_loss": 0.5669358372688293, "loss": 2.6553, "loss_aux_layer_0": 0.023529052734375, "loss_aux_layer_1": 0.0426025390625, "loss_aux_layer_10": 0.0706787109375, "loss_aux_layer_11": 0.0753173828125, "loss_aux_layer_12": 0.080322265625, "loss_aux_layer_13": 0.086669921875, "loss_aux_layer_14": 0.0963134765625, "loss_aux_layer_15": 0.1055908203125, "loss_aux_layer_16": 0.1156005859375, "loss_aux_layer_17": 0.123779296875, "loss_aux_layer_18": 0.13232421875, "loss_aux_layer_19": 0.136474609375, "loss_aux_layer_2": 0.0556640625, "loss_aux_layer_20": 0.144775390625, "loss_aux_layer_21": 0.153564453125, "loss_aux_layer_22": 0.175048828125, "loss_aux_layer_23": 0.2138671875, "loss_aux_layer_3": 0.06640625, "loss_aux_layer_4": 0.0692138671875, "loss_aux_layer_5": 0.071533203125, "loss_aux_layer_6": 0.0745849609375, "loss_aux_layer_7": 0.07177734375, "loss_aux_layer_8": 0.07080078125, "loss_aux_layer_9": 0.0694580078125, "step": 2085, "total_loss": 0.6638361364603043 }, { "epoch": 0.41298752722233223, "grad_norm": 0.9905925393104553, "learning_rate": 5e-05, "llm_loss": 0.6805630624294281, "loss": 3.1003, "loss_aux_layer_0": 0.02081298828125, "loss_aux_layer_1": 0.04095458984375, "loss_aux_layer_10": 0.06884765625, "loss_aux_layer_11": 0.073486328125, "loss_aux_layer_12": 0.0784912109375, "loss_aux_layer_13": 0.0849609375, "loss_aux_layer_14": 0.09423828125, "loss_aux_layer_15": 0.1036376953125, "loss_aux_layer_16": 0.113525390625, "loss_aux_layer_17": 0.121826171875, "loss_aux_layer_18": 0.130615234375, "loss_aux_layer_19": 0.134033203125, "loss_aux_layer_2": 0.0535888671875, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.14990234375, "loss_aux_layer_22": 0.171142578125, "loss_aux_layer_23": 0.209716796875, "loss_aux_layer_3": 0.0638427734375, "loss_aux_layer_4": 0.06689453125, "loss_aux_layer_5": 0.0689697265625, "loss_aux_layer_6": 0.0718994140625, "loss_aux_layer_7": 0.0694580078125, "loss_aux_layer_8": 0.068603515625, "loss_aux_layer_9": 0.0673828125, "step": 2086, "total_loss": 0.7750764787197113 }, { "epoch": 0.4131855078202336, "grad_norm": 1.3392691612243652, "learning_rate": 5e-05, "llm_loss": 0.5954166501760483, "loss": 2.7789, "loss_aux_layer_0": 0.02032470703125, "loss_aux_layer_1": 0.04541015625, "loss_aux_layer_10": 0.074951171875, "loss_aux_layer_11": 0.0799560546875, "loss_aux_layer_12": 0.0853271484375, "loss_aux_layer_13": 0.0916748046875, "loss_aux_layer_14": 0.1009521484375, "loss_aux_layer_15": 0.1099853515625, "loss_aux_layer_16": 0.11962890625, "loss_aux_layer_17": 0.1273193359375, "loss_aux_layer_18": 0.13525390625, "loss_aux_layer_19": 0.13720703125, "loss_aux_layer_2": 0.0587158203125, "loss_aux_layer_20": 0.144287109375, "loss_aux_layer_21": 0.150146484375, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.0703125, "loss_aux_layer_4": 0.0738525390625, "loss_aux_layer_5": 0.07568359375, "loss_aux_layer_6": 0.078857421875, "loss_aux_layer_7": 0.0762939453125, "loss_aux_layer_8": 0.0753173828125, "loss_aux_layer_9": 0.0736083984375, "step": 2087, "total_loss": 0.6947257667779922 }, { "epoch": 0.41338348841813505, "grad_norm": 1.520229697227478, "learning_rate": 5e-05, "llm_loss": 0.6479868441820145, "loss": 2.9916, "loss_aux_layer_0": 0.022186279296875, "loss_aux_layer_1": 0.04400634765625, "loss_aux_layer_10": 0.072998046875, "loss_aux_layer_11": 0.0775146484375, "loss_aux_layer_12": 0.0831298828125, "loss_aux_layer_13": 0.08984375, "loss_aux_layer_14": 0.1002197265625, "loss_aux_layer_15": 0.1107177734375, "loss_aux_layer_16": 0.1212158203125, "loss_aux_layer_17": 0.1295166015625, "loss_aux_layer_18": 0.13818359375, "loss_aux_layer_19": 0.1416015625, "loss_aux_layer_2": 0.05810546875, "loss_aux_layer_20": 0.14892578125, "loss_aux_layer_21": 0.156494140625, "loss_aux_layer_22": 0.177490234375, "loss_aux_layer_23": 0.216064453125, "loss_aux_layer_3": 0.06890869140625, "loss_aux_layer_4": 0.072021484375, "loss_aux_layer_5": 0.07373046875, "loss_aux_layer_6": 0.0765380859375, "loss_aux_layer_7": 0.073974609375, "loss_aux_layer_8": 0.0728759765625, "loss_aux_layer_9": 0.0716552734375, "step": 2088, "total_loss": 0.7479104697704315 }, { "epoch": 0.41358146901603643, "grad_norm": 1.3414932489395142, "learning_rate": 5e-05, "llm_loss": 0.6203414648771286, "loss": 2.869, "loss_aux_layer_0": 0.02093505859375, "loss_aux_layer_1": 0.0430908203125, "loss_aux_layer_10": 0.07110595703125, "loss_aux_layer_11": 0.0758056640625, "loss_aux_layer_12": 0.08154296875, "loss_aux_layer_13": 0.0880126953125, "loss_aux_layer_14": 0.09716796875, "loss_aux_layer_15": 0.106689453125, "loss_aux_layer_16": 0.116943359375, "loss_aux_layer_17": 0.125244140625, "loss_aux_layer_18": 0.1336669921875, "loss_aux_layer_19": 0.13623046875, "loss_aux_layer_2": 0.055908203125, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.150634765625, "loss_aux_layer_22": 0.172607421875, "loss_aux_layer_23": 0.211181640625, "loss_aux_layer_3": 0.06658935546875, "loss_aux_layer_4": 0.06964111328125, "loss_aux_layer_5": 0.0716552734375, "loss_aux_layer_6": 0.074951171875, "loss_aux_layer_7": 0.07220458984375, "loss_aux_layer_8": 0.07110595703125, "loss_aux_layer_9": 0.0697021484375, "step": 2089, "total_loss": 0.7172608524560928 }, { "epoch": 0.4137794496139378, "grad_norm": 1.3739367723464966, "learning_rate": 5e-05, "llm_loss": 0.6263986229896545, "loss": 2.8992, "loss_aux_layer_0": 0.021514892578125, "loss_aux_layer_1": 0.0438232421875, "loss_aux_layer_10": 0.07177734375, "loss_aux_layer_11": 0.0770263671875, "loss_aux_layer_12": 0.082275390625, "loss_aux_layer_13": 0.0889892578125, "loss_aux_layer_14": 0.09912109375, "loss_aux_layer_15": 0.1087646484375, "loss_aux_layer_16": 0.118896484375, "loss_aux_layer_17": 0.126708984375, "loss_aux_layer_18": 0.13525390625, "loss_aux_layer_19": 0.13818359375, "loss_aux_layer_2": 0.05596923828125, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.154541015625, "loss_aux_layer_22": 0.178466796875, "loss_aux_layer_23": 0.21728515625, "loss_aux_layer_3": 0.0670166015625, "loss_aux_layer_4": 0.0699462890625, "loss_aux_layer_5": 0.071533203125, "loss_aux_layer_6": 0.074462890625, "loss_aux_layer_7": 0.072021484375, "loss_aux_layer_8": 0.071044921875, "loss_aux_layer_9": 0.0701904296875, "step": 2090, "total_loss": 0.7248100191354752 }, { "epoch": 0.41397743021183925, "grad_norm": 1.083669662475586, "learning_rate": 5e-05, "llm_loss": 0.5641700029373169, "loss": 2.6375, "loss_aux_layer_0": 0.021087646484375, "loss_aux_layer_1": 0.04058837890625, "loss_aux_layer_10": 0.0692138671875, "loss_aux_layer_11": 0.07373046875, "loss_aux_layer_12": 0.0789794921875, "loss_aux_layer_13": 0.0853271484375, "loss_aux_layer_14": 0.094970703125, "loss_aux_layer_15": 0.1046142578125, "loss_aux_layer_16": 0.115234375, "loss_aux_layer_17": 0.1229248046875, "loss_aux_layer_18": 0.132080078125, "loss_aux_layer_19": 0.1357421875, "loss_aux_layer_2": 0.05413818359375, "loss_aux_layer_20": 0.14306640625, "loss_aux_layer_21": 0.15087890625, "loss_aux_layer_22": 0.171142578125, "loss_aux_layer_23": 0.208984375, "loss_aux_layer_3": 0.06488037109375, "loss_aux_layer_4": 0.06781005859375, "loss_aux_layer_5": 0.0697021484375, "loss_aux_layer_6": 0.0726318359375, "loss_aux_layer_7": 0.0701904296875, "loss_aux_layer_8": 0.0692138671875, "loss_aux_layer_9": 0.0679931640625, "step": 2091, "total_loss": 0.6593770533800125 }, { "epoch": 0.41417541080974063, "grad_norm": 1.2966316938400269, "learning_rate": 5e-05, "llm_loss": 0.6470019966363907, "loss": 2.981, "loss_aux_layer_0": 0.02056884765625, "loss_aux_layer_1": 0.04376220703125, "loss_aux_layer_10": 0.0728759765625, "loss_aux_layer_11": 0.0775146484375, "loss_aux_layer_12": 0.08349609375, "loss_aux_layer_13": 0.0899658203125, "loss_aux_layer_14": 0.0994873046875, "loss_aux_layer_15": 0.10888671875, "loss_aux_layer_16": 0.118896484375, "loss_aux_layer_17": 0.12744140625, "loss_aux_layer_18": 0.135986328125, "loss_aux_layer_19": 0.137939453125, "loss_aux_layer_2": 0.05694580078125, "loss_aux_layer_20": 0.1455078125, "loss_aux_layer_21": 0.15234375, "loss_aux_layer_22": 0.1728515625, "loss_aux_layer_23": 0.209716796875, "loss_aux_layer_3": 0.0677490234375, "loss_aux_layer_4": 0.071044921875, "loss_aux_layer_5": 0.0728759765625, "loss_aux_layer_6": 0.0758056640625, "loss_aux_layer_7": 0.0733642578125, "loss_aux_layer_8": 0.0726318359375, "loss_aux_layer_9": 0.0711669921875, "step": 2092, "total_loss": 0.7452550828456879 }, { "epoch": 0.41437339140764207, "grad_norm": 1.1208198070526123, "learning_rate": 5e-05, "llm_loss": 0.6414160579442978, "loss": 2.9576, "loss_aux_layer_0": 0.02069091796875, "loss_aux_layer_1": 0.04302978515625, "loss_aux_layer_10": 0.07281494140625, "loss_aux_layer_11": 0.076904296875, "loss_aux_layer_12": 0.082275390625, "loss_aux_layer_13": 0.0888671875, "loss_aux_layer_14": 0.0987548828125, "loss_aux_layer_15": 0.1082763671875, "loss_aux_layer_16": 0.11865234375, "loss_aux_layer_17": 0.127197265625, "loss_aux_layer_18": 0.1361083984375, "loss_aux_layer_19": 0.138427734375, "loss_aux_layer_2": 0.05621337890625, "loss_aux_layer_20": 0.1455078125, "loss_aux_layer_21": 0.151611328125, "loss_aux_layer_22": 0.171875, "loss_aux_layer_23": 0.20947265625, "loss_aux_layer_3": 0.06744384765625, "loss_aux_layer_4": 0.07086181640625, "loss_aux_layer_5": 0.072998046875, "loss_aux_layer_6": 0.076416015625, "loss_aux_layer_7": 0.073486328125, "loss_aux_layer_8": 0.07244873046875, "loss_aux_layer_9": 0.07135009765625, "step": 2093, "total_loss": 0.7394119501113892 }, { "epoch": 0.41457137200554345, "grad_norm": 1.4019609689712524, "learning_rate": 5e-05, "llm_loss": 0.6003106459975243, "loss": 2.7892, "loss_aux_layer_0": 0.02056884765625, "loss_aux_layer_1": 0.0423583984375, "loss_aux_layer_10": 0.0716552734375, "loss_aux_layer_11": 0.0762939453125, "loss_aux_layer_12": 0.081787109375, "loss_aux_layer_13": 0.08837890625, "loss_aux_layer_14": 0.09814453125, "loss_aux_layer_15": 0.1077880859375, "loss_aux_layer_16": 0.1181640625, "loss_aux_layer_17": 0.125732421875, "loss_aux_layer_18": 0.134521484375, "loss_aux_layer_19": 0.137451171875, "loss_aux_layer_2": 0.054931640625, "loss_aux_layer_20": 0.144775390625, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.208251953125, "loss_aux_layer_3": 0.0660400390625, "loss_aux_layer_4": 0.0694580078125, "loss_aux_layer_5": 0.0712890625, "loss_aux_layer_6": 0.0745849609375, "loss_aux_layer_7": 0.0721435546875, "loss_aux_layer_8": 0.0712890625, "loss_aux_layer_9": 0.0703125, "step": 2094, "total_loss": 0.6972928196191788 }, { "epoch": 0.4147693526034449, "grad_norm": 0.9834194183349609, "learning_rate": 5e-05, "llm_loss": 0.5943032205104828, "loss": 2.7836, "loss_aux_layer_0": 0.021270751953125, "loss_aux_layer_1": 0.04595947265625, "loss_aux_layer_10": 0.077392578125, "loss_aux_layer_11": 0.0826416015625, "loss_aux_layer_12": 0.0880126953125, "loss_aux_layer_13": 0.094482421875, "loss_aux_layer_14": 0.103515625, "loss_aux_layer_15": 0.1126708984375, "loss_aux_layer_16": 0.1221923828125, "loss_aux_layer_17": 0.1295166015625, "loss_aux_layer_18": 0.136474609375, "loss_aux_layer_19": 0.138427734375, "loss_aux_layer_2": 0.06060791015625, "loss_aux_layer_20": 0.14501953125, "loss_aux_layer_21": 0.15283203125, "loss_aux_layer_22": 0.174560546875, "loss_aux_layer_23": 0.2138671875, "loss_aux_layer_3": 0.072509765625, "loss_aux_layer_4": 0.0760498046875, "loss_aux_layer_5": 0.077880859375, "loss_aux_layer_6": 0.0811767578125, "loss_aux_layer_7": 0.0784912109375, "loss_aux_layer_8": 0.077392578125, "loss_aux_layer_9": 0.0758056640625, "step": 2095, "total_loss": 0.6959032565355301 }, { "epoch": 0.41496733320134627, "grad_norm": 1.2013643980026245, "learning_rate": 5e-05, "llm_loss": 0.5407980978488922, "loss": 2.5671, "loss_aux_layer_0": 0.023406982421875, "loss_aux_layer_1": 0.04449462890625, "loss_aux_layer_10": 0.0745849609375, "loss_aux_layer_11": 0.07958984375, "loss_aux_layer_12": 0.085205078125, "loss_aux_layer_13": 0.091796875, "loss_aux_layer_14": 0.1015625, "loss_aux_layer_15": 0.1114501953125, "loss_aux_layer_16": 0.1220703125, "loss_aux_layer_17": 0.13037109375, "loss_aux_layer_18": 0.138671875, "loss_aux_layer_19": 0.1416015625, "loss_aux_layer_2": 0.05841064453125, "loss_aux_layer_20": 0.14892578125, "loss_aux_layer_21": 0.156982421875, "loss_aux_layer_22": 0.179443359375, "loss_aux_layer_23": 0.218994140625, "loss_aux_layer_3": 0.06982421875, "loss_aux_layer_4": 0.0726318359375, "loss_aux_layer_5": 0.07421875, "loss_aux_layer_6": 0.07763671875, "loss_aux_layer_7": 0.0750732421875, "loss_aux_layer_8": 0.0740966796875, "loss_aux_layer_9": 0.0728759765625, "step": 2096, "total_loss": 0.6417818516492844 }, { "epoch": 0.41516531379924765, "grad_norm": 1.1121587753295898, "learning_rate": 5e-05, "llm_loss": 0.6239236444234848, "loss": 2.8804, "loss_aux_layer_0": 0.0216064453125, "loss_aux_layer_1": 0.0423583984375, "loss_aux_layer_10": 0.07000732421875, "loss_aux_layer_11": 0.07470703125, "loss_aux_layer_12": 0.0802001953125, "loss_aux_layer_13": 0.0865478515625, "loss_aux_layer_14": 0.0970458984375, "loss_aux_layer_15": 0.1070556640625, "loss_aux_layer_16": 0.1171875, "loss_aux_layer_17": 0.12548828125, "loss_aux_layer_18": 0.1337890625, "loss_aux_layer_19": 0.13623046875, "loss_aux_layer_2": 0.05523681640625, "loss_aux_layer_20": 0.143310546875, "loss_aux_layer_21": 0.150390625, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.20849609375, "loss_aux_layer_3": 0.06597900390625, "loss_aux_layer_4": 0.06890869140625, "loss_aux_layer_5": 0.07061767578125, "loss_aux_layer_6": 0.073486328125, "loss_aux_layer_7": 0.07080078125, "loss_aux_layer_8": 0.06988525390625, "loss_aux_layer_9": 0.068603515625, "step": 2097, "total_loss": 0.7200972437858582 }, { "epoch": 0.4153632943971491, "grad_norm": 0.9870884418487549, "learning_rate": 5e-05, "llm_loss": 0.6045844405889511, "loss": 2.8135, "loss_aux_layer_0": 0.02154541015625, "loss_aux_layer_1": 0.0439453125, "loss_aux_layer_10": 0.072998046875, "loss_aux_layer_11": 0.0780029296875, "loss_aux_layer_12": 0.0831298828125, "loss_aux_layer_13": 0.089599609375, "loss_aux_layer_14": 0.0994873046875, "loss_aux_layer_15": 0.1092529296875, "loss_aux_layer_16": 0.1192626953125, "loss_aux_layer_17": 0.127197265625, "loss_aux_layer_18": 0.135986328125, "loss_aux_layer_19": 0.138916015625, "loss_aux_layer_2": 0.057861328125, "loss_aux_layer_20": 0.146240234375, "loss_aux_layer_21": 0.15283203125, "loss_aux_layer_22": 0.172607421875, "loss_aux_layer_23": 0.211669921875, "loss_aux_layer_3": 0.0689697265625, "loss_aux_layer_4": 0.072021484375, "loss_aux_layer_5": 0.0738525390625, "loss_aux_layer_6": 0.076904296875, "loss_aux_layer_7": 0.0740966796875, "loss_aux_layer_8": 0.0732421875, "loss_aux_layer_9": 0.07177734375, "step": 2098, "total_loss": 0.703378438949585 }, { "epoch": 0.41556127499505047, "grad_norm": 1.0248247385025024, "learning_rate": 5e-05, "llm_loss": 0.5919863879680634, "loss": 2.7548, "loss_aux_layer_0": 0.02081298828125, "loss_aux_layer_1": 0.0423583984375, "loss_aux_layer_10": 0.07171630859375, "loss_aux_layer_11": 0.076416015625, "loss_aux_layer_12": 0.0814208984375, "loss_aux_layer_13": 0.087646484375, "loss_aux_layer_14": 0.09716796875, "loss_aux_layer_15": 0.1060791015625, "loss_aux_layer_16": 0.11572265625, "loss_aux_layer_17": 0.1239013671875, "loss_aux_layer_18": 0.13232421875, "loss_aux_layer_19": 0.135498046875, "loss_aux_layer_2": 0.0567626953125, "loss_aux_layer_20": 0.14306640625, "loss_aux_layer_21": 0.14990234375, "loss_aux_layer_22": 0.171142578125, "loss_aux_layer_23": 0.209716796875, "loss_aux_layer_3": 0.067626953125, "loss_aux_layer_4": 0.070556640625, "loss_aux_layer_5": 0.07220458984375, "loss_aux_layer_6": 0.0753173828125, "loss_aux_layer_7": 0.07257080078125, "loss_aux_layer_8": 0.0714111328125, "loss_aux_layer_9": 0.0701904296875, "step": 2099, "total_loss": 0.6886956244707108 }, { "epoch": 0.4157592555929519, "grad_norm": 1.0420442819595337, "learning_rate": 5e-05, "llm_loss": 0.6951836496591568, "loss": 3.1648, "loss_aux_layer_0": 0.020751953125, "loss_aux_layer_1": 0.04217529296875, "loss_aux_layer_10": 0.07012939453125, "loss_aux_layer_11": 0.0748291015625, "loss_aux_layer_12": 0.08056640625, "loss_aux_layer_13": 0.0867919921875, "loss_aux_layer_14": 0.096435546875, "loss_aux_layer_15": 0.1063232421875, "loss_aux_layer_16": 0.11669921875, "loss_aux_layer_17": 0.1246337890625, "loss_aux_layer_18": 0.13330078125, "loss_aux_layer_19": 0.1357421875, "loss_aux_layer_2": 0.0550537109375, "loss_aux_layer_20": 0.14306640625, "loss_aux_layer_21": 0.150390625, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.2099609375, "loss_aux_layer_3": 0.065673828125, "loss_aux_layer_4": 0.068359375, "loss_aux_layer_5": 0.07049560546875, "loss_aux_layer_6": 0.07342529296875, "loss_aux_layer_7": 0.0706787109375, "loss_aux_layer_8": 0.06982421875, "loss_aux_layer_9": 0.06878662109375, "step": 2100, "total_loss": 0.7912084758281708 }, { "epoch": 0.4159572361908533, "grad_norm": 0.9672736525535583, "learning_rate": 5e-05, "llm_loss": 0.5684982314705849, "loss": 2.6802, "loss_aux_layer_0": 0.022705078125, "loss_aux_layer_1": 0.04510498046875, "loss_aux_layer_10": 0.075439453125, "loss_aux_layer_11": 0.080078125, "loss_aux_layer_12": 0.0859375, "loss_aux_layer_13": 0.09228515625, "loss_aux_layer_14": 0.1029052734375, "loss_aux_layer_15": 0.112548828125, "loss_aux_layer_16": 0.1226806640625, "loss_aux_layer_17": 0.131103515625, "loss_aux_layer_18": 0.139404296875, "loss_aux_layer_19": 0.142333984375, "loss_aux_layer_2": 0.05877685546875, "loss_aux_layer_20": 0.1494140625, "loss_aux_layer_21": 0.1572265625, "loss_aux_layer_22": 0.178466796875, "loss_aux_layer_23": 0.21826171875, "loss_aux_layer_3": 0.0699462890625, "loss_aux_layer_4": 0.073486328125, "loss_aux_layer_5": 0.0755615234375, "loss_aux_layer_6": 0.0792236328125, "loss_aux_layer_7": 0.076171875, "loss_aux_layer_8": 0.075439453125, "loss_aux_layer_9": 0.0738525390625, "step": 2101, "total_loss": 0.6700564175844193 }, { "epoch": 0.4161552167887547, "grad_norm": 1.2298043966293335, "learning_rate": 5e-05, "llm_loss": 0.6490414887666702, "loss": 2.9849, "loss_aux_layer_0": 0.020904541015625, "loss_aux_layer_1": 0.0411376953125, "loss_aux_layer_10": 0.070068359375, "loss_aux_layer_11": 0.074951171875, "loss_aux_layer_12": 0.081298828125, "loss_aux_layer_13": 0.088134765625, "loss_aux_layer_14": 0.098876953125, "loss_aux_layer_15": 0.1090087890625, "loss_aux_layer_16": 0.1197509765625, "loss_aux_layer_17": 0.12841796875, "loss_aux_layer_18": 0.13623046875, "loss_aux_layer_19": 0.1396484375, "loss_aux_layer_2": 0.0538330078125, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.153564453125, "loss_aux_layer_22": 0.17431640625, "loss_aux_layer_23": 0.212890625, "loss_aux_layer_3": 0.064697265625, "loss_aux_layer_4": 0.06787109375, "loss_aux_layer_5": 0.06964111328125, "loss_aux_layer_6": 0.0728759765625, "loss_aux_layer_7": 0.0704345703125, "loss_aux_layer_8": 0.0694580078125, "loss_aux_layer_9": 0.06829833984375, "step": 2102, "total_loss": 0.7462193220853806 }, { "epoch": 0.4163531973866561, "grad_norm": 1.5233899354934692, "learning_rate": 5e-05, "llm_loss": 0.6589197665452957, "loss": 3.025, "loss_aux_layer_0": 0.02191162109375, "loss_aux_layer_1": 0.04180908203125, "loss_aux_layer_10": 0.0714111328125, "loss_aux_layer_11": 0.076171875, "loss_aux_layer_12": 0.081787109375, "loss_aux_layer_13": 0.0887451171875, "loss_aux_layer_14": 0.09814453125, "loss_aux_layer_15": 0.1080322265625, "loss_aux_layer_16": 0.117919921875, "loss_aux_layer_17": 0.1256103515625, "loss_aux_layer_18": 0.134033203125, "loss_aux_layer_19": 0.13720703125, "loss_aux_layer_2": 0.0548095703125, "loss_aux_layer_20": 0.144775390625, "loss_aux_layer_21": 0.15234375, "loss_aux_layer_22": 0.17333984375, "loss_aux_layer_23": 0.212890625, "loss_aux_layer_3": 0.066162109375, "loss_aux_layer_4": 0.069580078125, "loss_aux_layer_5": 0.0712890625, "loss_aux_layer_6": 0.0745849609375, "loss_aux_layer_7": 0.072021484375, "loss_aux_layer_8": 0.0711669921875, "loss_aux_layer_9": 0.070068359375, "step": 2103, "total_loss": 0.756241112947464 }, { "epoch": 0.4165511779845575, "grad_norm": 1.3078511953353882, "learning_rate": 5e-05, "llm_loss": 0.5935753583908081, "loss": 2.7835, "loss_aux_layer_0": 0.021209716796875, "loss_aux_layer_1": 0.04595947265625, "loss_aux_layer_10": 0.0765380859375, "loss_aux_layer_11": 0.081787109375, "loss_aux_layer_12": 0.0872802734375, "loss_aux_layer_13": 0.0941162109375, "loss_aux_layer_14": 0.1041259765625, "loss_aux_layer_15": 0.1136474609375, "loss_aux_layer_16": 0.1239013671875, "loss_aux_layer_17": 0.131591796875, "loss_aux_layer_18": 0.14013671875, "loss_aux_layer_19": 0.141845703125, "loss_aux_layer_2": 0.06024169921875, "loss_aux_layer_20": 0.148681640625, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.177490234375, "loss_aux_layer_23": 0.216552734375, "loss_aux_layer_3": 0.0721435546875, "loss_aux_layer_4": 0.0751953125, "loss_aux_layer_5": 0.076904296875, "loss_aux_layer_6": 0.080322265625, "loss_aux_layer_7": 0.07763671875, "loss_aux_layer_8": 0.07666015625, "loss_aux_layer_9": 0.0751953125, "step": 2104, "total_loss": 0.6958683729171753 }, { "epoch": 0.4167491585824589, "grad_norm": 1.2489042282104492, "learning_rate": 5e-05, "llm_loss": 0.5679458677768707, "loss": 2.6763, "loss_aux_layer_0": 0.021453857421875, "loss_aux_layer_1": 0.04498291015625, "loss_aux_layer_10": 0.075439453125, "loss_aux_layer_11": 0.08056640625, "loss_aux_layer_12": 0.0863037109375, "loss_aux_layer_13": 0.093017578125, "loss_aux_layer_14": 0.102783203125, "loss_aux_layer_15": 0.11279296875, "loss_aux_layer_16": 0.12255859375, "loss_aux_layer_17": 0.130615234375, "loss_aux_layer_18": 0.139404296875, "loss_aux_layer_19": 0.1416015625, "loss_aux_layer_2": 0.05859375, "loss_aux_layer_20": 0.1484375, "loss_aux_layer_21": 0.155517578125, "loss_aux_layer_22": 0.17578125, "loss_aux_layer_23": 0.214599609375, "loss_aux_layer_3": 0.070556640625, "loss_aux_layer_4": 0.073486328125, "loss_aux_layer_5": 0.0750732421875, "loss_aux_layer_6": 0.0784912109375, "loss_aux_layer_7": 0.075927734375, "loss_aux_layer_8": 0.0751953125, "loss_aux_layer_9": 0.0740966796875, "step": 2105, "total_loss": 0.669068232178688 }, { "epoch": 0.4169471391803603, "grad_norm": 1.141282558441162, "learning_rate": 5e-05, "llm_loss": 0.6039148718118668, "loss": 2.7981, "loss_aux_layer_0": 0.021453857421875, "loss_aux_layer_1": 0.04278564453125, "loss_aux_layer_10": 0.070068359375, "loss_aux_layer_11": 0.0745849609375, "loss_aux_layer_12": 0.0794677734375, "loss_aux_layer_13": 0.0855712890625, "loss_aux_layer_14": 0.0950927734375, "loss_aux_layer_15": 0.104736328125, "loss_aux_layer_16": 0.1143798828125, "loss_aux_layer_17": 0.122802734375, "loss_aux_layer_18": 0.13037109375, "loss_aux_layer_19": 0.133544921875, "loss_aux_layer_2": 0.055908203125, "loss_aux_layer_20": 0.14208984375, "loss_aux_layer_21": 0.14990234375, "loss_aux_layer_22": 0.171630859375, "loss_aux_layer_23": 0.210205078125, "loss_aux_layer_3": 0.06640625, "loss_aux_layer_4": 0.069091796875, "loss_aux_layer_5": 0.07080078125, "loss_aux_layer_6": 0.07373046875, "loss_aux_layer_7": 0.0706787109375, "loss_aux_layer_8": 0.06982421875, "loss_aux_layer_9": 0.0687255859375, "step": 2106, "total_loss": 0.6995159536600113 }, { "epoch": 0.41714511977826174, "grad_norm": 1.1784594058990479, "learning_rate": 5e-05, "llm_loss": 0.6375571340322495, "loss": 2.9276, "loss_aux_layer_0": 0.0223388671875, "loss_aux_layer_1": 0.04052734375, "loss_aux_layer_10": 0.0679931640625, "loss_aux_layer_11": 0.0728759765625, "loss_aux_layer_12": 0.078125, "loss_aux_layer_13": 0.0841064453125, "loss_aux_layer_14": 0.09375, "loss_aux_layer_15": 0.103759765625, "loss_aux_layer_16": 0.1141357421875, "loss_aux_layer_17": 0.122802734375, "loss_aux_layer_18": 0.13134765625, "loss_aux_layer_19": 0.135009765625, "loss_aux_layer_2": 0.0523681640625, "loss_aux_layer_20": 0.142822265625, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.17138671875, "loss_aux_layer_23": 0.2099609375, "loss_aux_layer_3": 0.06292724609375, "loss_aux_layer_4": 0.065673828125, "loss_aux_layer_5": 0.067626953125, "loss_aux_layer_6": 0.0706787109375, "loss_aux_layer_7": 0.068115234375, "loss_aux_layer_8": 0.0677490234375, "loss_aux_layer_9": 0.066650390625, "step": 2107, "total_loss": 0.7319082617759705 }, { "epoch": 0.4173431003761631, "grad_norm": 1.140100121498108, "learning_rate": 5e-05, "llm_loss": 0.6196976006031036, "loss": 2.8865, "loss_aux_layer_0": 0.02099609375, "loss_aux_layer_1": 0.04595947265625, "loss_aux_layer_10": 0.0772705078125, "loss_aux_layer_11": 0.0823974609375, "loss_aux_layer_12": 0.087890625, "loss_aux_layer_13": 0.0941162109375, "loss_aux_layer_14": 0.103271484375, "loss_aux_layer_15": 0.1121826171875, "loss_aux_layer_16": 0.1217041015625, "loss_aux_layer_17": 0.129638671875, "loss_aux_layer_18": 0.137451171875, "loss_aux_layer_19": 0.14013671875, "loss_aux_layer_2": 0.05999755859375, "loss_aux_layer_20": 0.1474609375, "loss_aux_layer_21": 0.155029296875, "loss_aux_layer_22": 0.17626953125, "loss_aux_layer_23": 0.21533203125, "loss_aux_layer_3": 0.072265625, "loss_aux_layer_4": 0.0758056640625, "loss_aux_layer_5": 0.0777587890625, "loss_aux_layer_6": 0.081298828125, "loss_aux_layer_7": 0.078369140625, "loss_aux_layer_8": 0.0775146484375, "loss_aux_layer_9": 0.0760498046875, "step": 2108, "total_loss": 0.721627876162529 }, { "epoch": 0.41754108097406456, "grad_norm": 1.0933576822280884, "learning_rate": 5e-05, "llm_loss": 0.6184099316596985, "loss": 2.8609, "loss_aux_layer_0": 0.0208740234375, "loss_aux_layer_1": 0.04290771484375, "loss_aux_layer_10": 0.0711669921875, "loss_aux_layer_11": 0.0760498046875, "loss_aux_layer_12": 0.08154296875, "loss_aux_layer_13": 0.0888671875, "loss_aux_layer_14": 0.097900390625, "loss_aux_layer_15": 0.1072998046875, "loss_aux_layer_16": 0.1173095703125, "loss_aux_layer_17": 0.1258544921875, "loss_aux_layer_18": 0.13427734375, "loss_aux_layer_19": 0.13623046875, "loss_aux_layer_2": 0.05596923828125, "loss_aux_layer_20": 0.14306640625, "loss_aux_layer_21": 0.14990234375, "loss_aux_layer_22": 0.17041015625, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.066650390625, "loss_aux_layer_4": 0.0697021484375, "loss_aux_layer_5": 0.0718994140625, "loss_aux_layer_6": 0.0748291015625, "loss_aux_layer_7": 0.0721435546875, "loss_aux_layer_8": 0.0712890625, "loss_aux_layer_9": 0.06982421875, "step": 2109, "total_loss": 0.715222641825676 }, { "epoch": 0.41773906157196594, "grad_norm": 1.3740417957305908, "learning_rate": 5e-05, "llm_loss": 0.6065675616264343, "loss": 2.8222, "loss_aux_layer_0": 0.0201416015625, "loss_aux_layer_1": 0.04376220703125, "loss_aux_layer_10": 0.07373046875, "loss_aux_layer_11": 0.078857421875, "loss_aux_layer_12": 0.0841064453125, "loss_aux_layer_13": 0.0909423828125, "loss_aux_layer_14": 0.1005859375, "loss_aux_layer_15": 0.110595703125, "loss_aux_layer_16": 0.1207275390625, "loss_aux_layer_17": 0.128662109375, "loss_aux_layer_18": 0.136962890625, "loss_aux_layer_19": 0.138427734375, "loss_aux_layer_2": 0.05706787109375, "loss_aux_layer_20": 0.145263671875, "loss_aux_layer_21": 0.15185546875, "loss_aux_layer_22": 0.1728515625, "loss_aux_layer_23": 0.20947265625, "loss_aux_layer_3": 0.068115234375, "loss_aux_layer_4": 0.07177734375, "loss_aux_layer_5": 0.0738525390625, "loss_aux_layer_6": 0.0772705078125, "loss_aux_layer_7": 0.07470703125, "loss_aux_layer_8": 0.073486328125, "loss_aux_layer_9": 0.072265625, "step": 2110, "total_loss": 0.705552414059639 }, { "epoch": 0.4179370421698674, "grad_norm": 0.8728827238082886, "learning_rate": 5e-05, "llm_loss": 0.5731149166822433, "loss": 2.6786, "loss_aux_layer_0": 0.023712158203125, "loss_aux_layer_1": 0.04248046875, "loss_aux_layer_10": 0.070556640625, "loss_aux_layer_11": 0.0751953125, "loss_aux_layer_12": 0.0804443359375, "loss_aux_layer_13": 0.086669921875, "loss_aux_layer_14": 0.096435546875, "loss_aux_layer_15": 0.1060791015625, "loss_aux_layer_16": 0.1162109375, "loss_aux_layer_17": 0.1240234375, "loss_aux_layer_18": 0.13330078125, "loss_aux_layer_19": 0.136962890625, "loss_aux_layer_2": 0.0550537109375, "loss_aux_layer_20": 0.14501953125, "loss_aux_layer_21": 0.152587890625, "loss_aux_layer_22": 0.172119140625, "loss_aux_layer_23": 0.211181640625, "loss_aux_layer_3": 0.0655517578125, "loss_aux_layer_4": 0.0687255859375, "loss_aux_layer_5": 0.0703125, "loss_aux_layer_6": 0.0736083984375, "loss_aux_layer_7": 0.0711669921875, "loss_aux_layer_8": 0.0701904296875, "loss_aux_layer_9": 0.0692138671875, "step": 2111, "total_loss": 0.6696536391973495 }, { "epoch": 0.41813502276776876, "grad_norm": 1.0426312685012817, "learning_rate": 5e-05, "llm_loss": 0.5853195339441299, "loss": 2.7313, "loss_aux_layer_0": 0.020416259765625, "loss_aux_layer_1": 0.0428466796875, "loss_aux_layer_10": 0.0711669921875, "loss_aux_layer_11": 0.0758056640625, "loss_aux_layer_12": 0.081298828125, "loss_aux_layer_13": 0.087890625, "loss_aux_layer_14": 0.0975341796875, "loss_aux_layer_15": 0.107421875, "loss_aux_layer_16": 0.1175537109375, "loss_aux_layer_17": 0.125244140625, "loss_aux_layer_18": 0.13427734375, "loss_aux_layer_19": 0.137451171875, "loss_aux_layer_2": 0.0557861328125, "loss_aux_layer_20": 0.144775390625, "loss_aux_layer_21": 0.15380859375, "loss_aux_layer_22": 0.175537109375, "loss_aux_layer_23": 0.215087890625, "loss_aux_layer_3": 0.06646728515625, "loss_aux_layer_4": 0.06982421875, "loss_aux_layer_5": 0.07177734375, "loss_aux_layer_6": 0.074462890625, "loss_aux_layer_7": 0.072021484375, "loss_aux_layer_8": 0.0711669921875, "loss_aux_layer_9": 0.06982421875, "step": 2112, "total_loss": 0.6828133314847946 }, { "epoch": 0.41833300336567014, "grad_norm": 0.8813035488128662, "learning_rate": 5e-05, "llm_loss": 0.5686310678720474, "loss": 2.6676, "loss_aux_layer_0": 0.02178955078125, "loss_aux_layer_1": 0.04241943359375, "loss_aux_layer_10": 0.071533203125, "loss_aux_layer_11": 0.0760498046875, "loss_aux_layer_12": 0.0814208984375, "loss_aux_layer_13": 0.088134765625, "loss_aux_layer_14": 0.097900390625, "loss_aux_layer_15": 0.1083984375, "loss_aux_layer_16": 0.119384765625, "loss_aux_layer_17": 0.12744140625, "loss_aux_layer_18": 0.13623046875, "loss_aux_layer_19": 0.14013671875, "loss_aux_layer_2": 0.0555419921875, "loss_aux_layer_20": 0.1484375, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.177490234375, "loss_aux_layer_23": 0.216796875, "loss_aux_layer_3": 0.0665283203125, "loss_aux_layer_4": 0.0692138671875, "loss_aux_layer_5": 0.0711669921875, "loss_aux_layer_6": 0.0743408203125, "loss_aux_layer_7": 0.0718994140625, "loss_aux_layer_8": 0.071044921875, "loss_aux_layer_9": 0.0699462890625, "step": 2113, "total_loss": 0.6668986082077026 }, { "epoch": 0.4185309839635716, "grad_norm": 1.1315795183181763, "learning_rate": 5e-05, "llm_loss": 0.6302358657121658, "loss": 2.9066, "loss_aux_layer_0": 0.0208740234375, "loss_aux_layer_1": 0.04296875, "loss_aux_layer_10": 0.070556640625, "loss_aux_layer_11": 0.0750732421875, "loss_aux_layer_12": 0.0806884765625, "loss_aux_layer_13": 0.0872802734375, "loss_aux_layer_14": 0.0965576171875, "loss_aux_layer_15": 0.1058349609375, "loss_aux_layer_16": 0.115966796875, "loss_aux_layer_17": 0.1240234375, "loss_aux_layer_18": 0.1328125, "loss_aux_layer_19": 0.1357421875, "loss_aux_layer_2": 0.05548095703125, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.151611328125, "loss_aux_layer_22": 0.172119140625, "loss_aux_layer_23": 0.21044921875, "loss_aux_layer_3": 0.0660400390625, "loss_aux_layer_4": 0.0692138671875, "loss_aux_layer_5": 0.071044921875, "loss_aux_layer_6": 0.07421875, "loss_aux_layer_7": 0.071533203125, "loss_aux_layer_8": 0.070556640625, "loss_aux_layer_9": 0.0692138671875, "step": 2114, "total_loss": 0.7266376912593842 }, { "epoch": 0.41872896456147296, "grad_norm": 1.0943270921707153, "learning_rate": 5e-05, "llm_loss": 0.6690881252288818, "loss": 3.0703, "loss_aux_layer_0": 0.020660400390625, "loss_aux_layer_1": 0.044189453125, "loss_aux_layer_10": 0.0738525390625, "loss_aux_layer_11": 0.078125, "loss_aux_layer_12": 0.0836181640625, "loss_aux_layer_13": 0.089599609375, "loss_aux_layer_14": 0.0989990234375, "loss_aux_layer_15": 0.1087646484375, "loss_aux_layer_16": 0.1182861328125, "loss_aux_layer_17": 0.1258544921875, "loss_aux_layer_18": 0.1337890625, "loss_aux_layer_19": 0.13623046875, "loss_aux_layer_2": 0.05718994140625, "loss_aux_layer_20": 0.14404296875, "loss_aux_layer_21": 0.15185546875, "loss_aux_layer_22": 0.17431640625, "loss_aux_layer_23": 0.212646484375, "loss_aux_layer_3": 0.068359375, "loss_aux_layer_4": 0.0716552734375, "loss_aux_layer_5": 0.07373046875, "loss_aux_layer_6": 0.0770263671875, "loss_aux_layer_7": 0.0745849609375, "loss_aux_layer_8": 0.0736083984375, "loss_aux_layer_9": 0.072509765625, "step": 2115, "total_loss": 0.7675847560167313 }, { "epoch": 0.4189269451593744, "grad_norm": 1.721895456314087, "learning_rate": 5e-05, "llm_loss": 0.6223959475755692, "loss": 2.8874, "loss_aux_layer_0": 0.021484375, "loss_aux_layer_1": 0.04534912109375, "loss_aux_layer_10": 0.073486328125, "loss_aux_layer_11": 0.07861328125, "loss_aux_layer_12": 0.0838623046875, "loss_aux_layer_13": 0.0906982421875, "loss_aux_layer_14": 0.1002197265625, "loss_aux_layer_15": 0.1094970703125, "loss_aux_layer_16": 0.119384765625, "loss_aux_layer_17": 0.126708984375, "loss_aux_layer_18": 0.13525390625, "loss_aux_layer_19": 0.1376953125, "loss_aux_layer_2": 0.0594482421875, "loss_aux_layer_20": 0.14453125, "loss_aux_layer_21": 0.1533203125, "loss_aux_layer_22": 0.17578125, "loss_aux_layer_23": 0.214599609375, "loss_aux_layer_3": 0.070068359375, "loss_aux_layer_4": 0.0731201171875, "loss_aux_layer_5": 0.0745849609375, "loss_aux_layer_6": 0.07763671875, "loss_aux_layer_7": 0.074951171875, "loss_aux_layer_8": 0.0736083984375, "loss_aux_layer_9": 0.072265625, "step": 2116, "total_loss": 0.7218375951051712 }, { "epoch": 0.4191249257572758, "grad_norm": 1.133180856704712, "learning_rate": 5e-05, "llm_loss": 0.5658338367938995, "loss": 2.6616, "loss_aux_layer_0": 0.02386474609375, "loss_aux_layer_1": 0.04315185546875, "loss_aux_layer_10": 0.072998046875, "loss_aux_layer_11": 0.07763671875, "loss_aux_layer_12": 0.0831298828125, "loss_aux_layer_13": 0.0902099609375, "loss_aux_layer_14": 0.1002197265625, "loss_aux_layer_15": 0.110107421875, "loss_aux_layer_16": 0.1204833984375, "loss_aux_layer_17": 0.1279296875, "loss_aux_layer_18": 0.13720703125, "loss_aux_layer_19": 0.140380859375, "loss_aux_layer_2": 0.0567626953125, "loss_aux_layer_20": 0.148193359375, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.17724609375, "loss_aux_layer_23": 0.217041015625, "loss_aux_layer_3": 0.0679931640625, "loss_aux_layer_4": 0.071044921875, "loss_aux_layer_5": 0.073486328125, "loss_aux_layer_6": 0.07666015625, "loss_aux_layer_7": 0.073974609375, "loss_aux_layer_8": 0.072998046875, "loss_aux_layer_9": 0.0718994140625, "step": 2117, "total_loss": 0.6653894931077957 }, { "epoch": 0.4193229063551772, "grad_norm": 1.1113619804382324, "learning_rate": 5e-05, "llm_loss": 0.5865790396928787, "loss": 2.7245, "loss_aux_layer_0": 0.021209716796875, "loss_aux_layer_1": 0.0418701171875, "loss_aux_layer_10": 0.069580078125, "loss_aux_layer_11": 0.073974609375, "loss_aux_layer_12": 0.0791015625, "loss_aux_layer_13": 0.0855712890625, "loss_aux_layer_14": 0.0946044921875, "loss_aux_layer_15": 0.1038818359375, "loss_aux_layer_16": 0.1134033203125, "loss_aux_layer_17": 0.12109375, "loss_aux_layer_18": 0.1297607421875, "loss_aux_layer_19": 0.133056640625, "loss_aux_layer_2": 0.05438232421875, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.1484375, "loss_aux_layer_22": 0.169189453125, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.0648193359375, "loss_aux_layer_4": 0.0675048828125, "loss_aux_layer_5": 0.06964111328125, "loss_aux_layer_6": 0.0726318359375, "loss_aux_layer_7": 0.0699462890625, "loss_aux_layer_8": 0.0692138671875, "loss_aux_layer_9": 0.06805419921875, "step": 2118, "total_loss": 0.6811360269784927 }, { "epoch": 0.4195208869530786, "grad_norm": 1.1885039806365967, "learning_rate": 5e-05, "llm_loss": 0.6797555685043335, "loss": 3.1126, "loss_aux_layer_0": 0.021331787109375, "loss_aux_layer_1": 0.042724609375, "loss_aux_layer_10": 0.073974609375, "loss_aux_layer_11": 0.07861328125, "loss_aux_layer_12": 0.0841064453125, "loss_aux_layer_13": 0.09033203125, "loss_aux_layer_14": 0.0992431640625, "loss_aux_layer_15": 0.1085205078125, "loss_aux_layer_16": 0.11865234375, "loss_aux_layer_17": 0.126708984375, "loss_aux_layer_18": 0.134765625, "loss_aux_layer_19": 0.137451171875, "loss_aux_layer_2": 0.05633544921875, "loss_aux_layer_20": 0.14404296875, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.171875, "loss_aux_layer_23": 0.210693359375, "loss_aux_layer_3": 0.06787109375, "loss_aux_layer_4": 0.071533203125, "loss_aux_layer_5": 0.0736083984375, "loss_aux_layer_6": 0.0772705078125, "loss_aux_layer_7": 0.0743408203125, "loss_aux_layer_8": 0.073486328125, "loss_aux_layer_9": 0.072509765625, "step": 2119, "total_loss": 0.7781533002853394 }, { "epoch": 0.41971886755098, "grad_norm": 1.1104401350021362, "learning_rate": 5e-05, "llm_loss": 0.5489064753055573, "loss": 2.5798, "loss_aux_layer_0": 0.021148681640625, "loss_aux_layer_1": 0.04095458984375, "loss_aux_layer_10": 0.070556640625, "loss_aux_layer_11": 0.0751953125, "loss_aux_layer_12": 0.08056640625, "loss_aux_layer_13": 0.08740234375, "loss_aux_layer_14": 0.0968017578125, "loss_aux_layer_15": 0.1064453125, "loss_aux_layer_16": 0.1168212890625, "loss_aux_layer_17": 0.125, "loss_aux_layer_18": 0.133544921875, "loss_aux_layer_19": 0.13623046875, "loss_aux_layer_2": 0.05474853515625, "loss_aux_layer_20": 0.143310546875, "loss_aux_layer_21": 0.14990234375, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.208251953125, "loss_aux_layer_3": 0.0654296875, "loss_aux_layer_4": 0.06842041015625, "loss_aux_layer_5": 0.0701904296875, "loss_aux_layer_6": 0.0732421875, "loss_aux_layer_7": 0.0709228515625, "loss_aux_layer_8": 0.0701904296875, "loss_aux_layer_9": 0.0689697265625, "step": 2120, "total_loss": 0.6449546962976456 }, { "epoch": 0.4199168481488814, "grad_norm": 1.5867688655853271, "learning_rate": 5e-05, "llm_loss": 0.6205203160643578, "loss": 2.8627, "loss_aux_layer_0": 0.020782470703125, "loss_aux_layer_1": 0.04229736328125, "loss_aux_layer_10": 0.06982421875, "loss_aux_layer_11": 0.0743408203125, "loss_aux_layer_12": 0.0797119140625, "loss_aux_layer_13": 0.0858154296875, "loss_aux_layer_14": 0.094970703125, "loss_aux_layer_15": 0.1043701171875, "loss_aux_layer_16": 0.1142578125, "loss_aux_layer_17": 0.122314453125, "loss_aux_layer_18": 0.1302490234375, "loss_aux_layer_19": 0.133544921875, "loss_aux_layer_2": 0.0550537109375, "loss_aux_layer_20": 0.1416015625, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.207275390625, "loss_aux_layer_3": 0.06585693359375, "loss_aux_layer_4": 0.0689697265625, "loss_aux_layer_5": 0.0706787109375, "loss_aux_layer_6": 0.0736083984375, "loss_aux_layer_7": 0.071044921875, "loss_aux_layer_8": 0.06982421875, "loss_aux_layer_9": 0.0684814453125, "step": 2121, "total_loss": 0.7156825512647629 }, { "epoch": 0.4201148287467828, "grad_norm": 0.9092640280723572, "learning_rate": 5e-05, "llm_loss": 0.6164909154176712, "loss": 2.8469, "loss_aux_layer_0": 0.02178955078125, "loss_aux_layer_1": 0.043212890625, "loss_aux_layer_10": 0.07080078125, "loss_aux_layer_11": 0.0750732421875, "loss_aux_layer_12": 0.0802001953125, "loss_aux_layer_13": 0.0858154296875, "loss_aux_layer_14": 0.0947265625, "loss_aux_layer_15": 0.10400390625, "loss_aux_layer_16": 0.114013671875, "loss_aux_layer_17": 0.1217041015625, "loss_aux_layer_18": 0.130126953125, "loss_aux_layer_19": 0.133056640625, "loss_aux_layer_2": 0.05615234375, "loss_aux_layer_20": 0.14013671875, "loss_aux_layer_21": 0.1474609375, "loss_aux_layer_22": 0.166748046875, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.06689453125, "loss_aux_layer_4": 0.07012939453125, "loss_aux_layer_5": 0.07177734375, "loss_aux_layer_6": 0.074951171875, "loss_aux_layer_7": 0.0721435546875, "loss_aux_layer_8": 0.071044921875, "loss_aux_layer_9": 0.06951904296875, "step": 2122, "total_loss": 0.7117337286472321 }, { "epoch": 0.42031280934468424, "grad_norm": 1.3069727420806885, "learning_rate": 5e-05, "llm_loss": 0.5654287338256836, "loss": 2.6703, "loss_aux_layer_0": 0.02130126953125, "loss_aux_layer_1": 0.04595947265625, "loss_aux_layer_10": 0.07666015625, "loss_aux_layer_11": 0.08154296875, "loss_aux_layer_12": 0.0870361328125, "loss_aux_layer_13": 0.093505859375, "loss_aux_layer_14": 0.10302734375, "loss_aux_layer_15": 0.1116943359375, "loss_aux_layer_16": 0.1217041015625, "loss_aux_layer_17": 0.12939453125, "loss_aux_layer_18": 0.137939453125, "loss_aux_layer_19": 0.140625, "loss_aux_layer_2": 0.0616455078125, "loss_aux_layer_20": 0.148193359375, "loss_aux_layer_21": 0.155517578125, "loss_aux_layer_22": 0.178955078125, "loss_aux_layer_23": 0.218017578125, "loss_aux_layer_3": 0.0736083984375, "loss_aux_layer_4": 0.0762939453125, "loss_aux_layer_5": 0.0777587890625, "loss_aux_layer_6": 0.0806884765625, "loss_aux_layer_7": 0.0780029296875, "loss_aux_layer_8": 0.0770263671875, "loss_aux_layer_9": 0.0751953125, "step": 2123, "total_loss": 0.6675747185945511 }, { "epoch": 0.4205107899425856, "grad_norm": 1.0676182508468628, "learning_rate": 5e-05, "llm_loss": 0.6918591111898422, "loss": 3.1369, "loss_aux_layer_0": 0.023101806640625, "loss_aux_layer_1": 0.03912353515625, "loss_aux_layer_10": 0.0657958984375, "loss_aux_layer_11": 0.0701904296875, "loss_aux_layer_12": 0.075439453125, "loss_aux_layer_13": 0.0819091796875, "loss_aux_layer_14": 0.0919189453125, "loss_aux_layer_15": 0.101806640625, "loss_aux_layer_16": 0.112060546875, "loss_aux_layer_17": 0.120849609375, "loss_aux_layer_18": 0.1298828125, "loss_aux_layer_19": 0.133544921875, "loss_aux_layer_2": 0.05078125, "loss_aux_layer_20": 0.14111328125, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.16845703125, "loss_aux_layer_23": 0.206298828125, "loss_aux_layer_3": 0.06103515625, "loss_aux_layer_4": 0.06396484375, "loss_aux_layer_5": 0.065673828125, "loss_aux_layer_6": 0.0689697265625, "loss_aux_layer_7": 0.066162109375, "loss_aux_layer_8": 0.065673828125, "loss_aux_layer_9": 0.0643310546875, "step": 2124, "total_loss": 0.7842262983322144 }, { "epoch": 0.42070877054048705, "grad_norm": 0.9868884682655334, "learning_rate": 5e-05, "llm_loss": 0.6124349236488342, "loss": 2.8506, "loss_aux_layer_0": 0.0216064453125, "loss_aux_layer_1": 0.044921875, "loss_aux_layer_10": 0.0745849609375, "loss_aux_layer_11": 0.079345703125, "loss_aux_layer_12": 0.084716796875, "loss_aux_layer_13": 0.091552734375, "loss_aux_layer_14": 0.1009521484375, "loss_aux_layer_15": 0.110107421875, "loss_aux_layer_16": 0.1204833984375, "loss_aux_layer_17": 0.128173828125, "loss_aux_layer_18": 0.136474609375, "loss_aux_layer_19": 0.139404296875, "loss_aux_layer_2": 0.05828857421875, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.15478515625, "loss_aux_layer_22": 0.177490234375, "loss_aux_layer_23": 0.218017578125, "loss_aux_layer_3": 0.069580078125, "loss_aux_layer_4": 0.0726318359375, "loss_aux_layer_5": 0.0748291015625, "loss_aux_layer_6": 0.0782470703125, "loss_aux_layer_7": 0.0751953125, "loss_aux_layer_8": 0.07421875, "loss_aux_layer_9": 0.0731201171875, "step": 2125, "total_loss": 0.7126430571079254 }, { "epoch": 0.42090675113838844, "grad_norm": 0.9096347689628601, "learning_rate": 5e-05, "llm_loss": 0.6157954186201096, "loss": 2.8536, "loss_aux_layer_0": 0.022491455078125, "loss_aux_layer_1": 0.04315185546875, "loss_aux_layer_10": 0.07159423828125, "loss_aux_layer_11": 0.0762939453125, "loss_aux_layer_12": 0.081787109375, "loss_aux_layer_13": 0.088134765625, "loss_aux_layer_14": 0.0980224609375, "loss_aux_layer_15": 0.107177734375, "loss_aux_layer_16": 0.1171875, "loss_aux_layer_17": 0.125244140625, "loss_aux_layer_18": 0.134033203125, "loss_aux_layer_19": 0.135986328125, "loss_aux_layer_2": 0.05743408203125, "loss_aux_layer_20": 0.144287109375, "loss_aux_layer_21": 0.151611328125, "loss_aux_layer_22": 0.173583984375, "loss_aux_layer_23": 0.21142578125, "loss_aux_layer_3": 0.06854248046875, "loss_aux_layer_4": 0.07110595703125, "loss_aux_layer_5": 0.0728759765625, "loss_aux_layer_6": 0.075927734375, "loss_aux_layer_7": 0.07293701171875, "loss_aux_layer_8": 0.07183837890625, "loss_aux_layer_9": 0.07025146484375, "step": 2126, "total_loss": 0.7134001851081848 }, { "epoch": 0.4211047317362898, "grad_norm": 1.103347897529602, "learning_rate": 5e-05, "llm_loss": 0.6021338850259781, "loss": 2.8115, "loss_aux_layer_0": 0.02154541015625, "loss_aux_layer_1": 0.04449462890625, "loss_aux_layer_10": 0.0745849609375, "loss_aux_layer_11": 0.0791015625, "loss_aux_layer_12": 0.084228515625, "loss_aux_layer_13": 0.0908203125, "loss_aux_layer_14": 0.1005859375, "loss_aux_layer_15": 0.1104736328125, "loss_aux_layer_16": 0.1212158203125, "loss_aux_layer_17": 0.12939453125, "loss_aux_layer_18": 0.138427734375, "loss_aux_layer_19": 0.141845703125, "loss_aux_layer_2": 0.0587158203125, "loss_aux_layer_20": 0.14990234375, "loss_aux_layer_21": 0.156982421875, "loss_aux_layer_22": 0.1787109375, "loss_aux_layer_23": 0.2177734375, "loss_aux_layer_3": 0.0701904296875, "loss_aux_layer_4": 0.0731201171875, "loss_aux_layer_5": 0.0748291015625, "loss_aux_layer_6": 0.0777587890625, "loss_aux_layer_7": 0.0753173828125, "loss_aux_layer_8": 0.07470703125, "loss_aux_layer_9": 0.072998046875, "step": 2127, "total_loss": 0.7028873562812805 }, { "epoch": 0.42130271233419125, "grad_norm": 0.9993857145309448, "learning_rate": 5e-05, "llm_loss": 0.5234073996543884, "loss": 2.4851, "loss_aux_layer_0": 0.021331787109375, "loss_aux_layer_1": 0.04345703125, "loss_aux_layer_10": 0.0718994140625, "loss_aux_layer_11": 0.0772705078125, "loss_aux_layer_12": 0.0828857421875, "loss_aux_layer_13": 0.0894775390625, "loss_aux_layer_14": 0.0982666015625, "loss_aux_layer_15": 0.107666015625, "loss_aux_layer_16": 0.1177978515625, "loss_aux_layer_17": 0.1258544921875, "loss_aux_layer_18": 0.133544921875, "loss_aux_layer_19": 0.13623046875, "loss_aux_layer_2": 0.0567626953125, "loss_aux_layer_20": 0.1435546875, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.17431640625, "loss_aux_layer_23": 0.213623046875, "loss_aux_layer_3": 0.068115234375, "loss_aux_layer_4": 0.0711669921875, "loss_aux_layer_5": 0.07275390625, "loss_aux_layer_6": 0.0755615234375, "loss_aux_layer_7": 0.0728759765625, "loss_aux_layer_8": 0.0721435546875, "loss_aux_layer_9": 0.070556640625, "step": 2128, "total_loss": 0.6212861835956573 }, { "epoch": 0.42150069293209264, "grad_norm": 0.8168793320655823, "learning_rate": 5e-05, "llm_loss": 0.5635135546326637, "loss": 2.6559, "loss_aux_layer_0": 0.02081298828125, "loss_aux_layer_1": 0.04669189453125, "loss_aux_layer_10": 0.0758056640625, "loss_aux_layer_11": 0.0809326171875, "loss_aux_layer_12": 0.0859375, "loss_aux_layer_13": 0.091796875, "loss_aux_layer_14": 0.1005859375, "loss_aux_layer_15": 0.110107421875, "loss_aux_layer_16": 0.119873046875, "loss_aux_layer_17": 0.127685546875, "loss_aux_layer_18": 0.1357421875, "loss_aux_layer_19": 0.137939453125, "loss_aux_layer_2": 0.06085205078125, "loss_aux_layer_20": 0.145263671875, "loss_aux_layer_21": 0.15283203125, "loss_aux_layer_22": 0.17431640625, "loss_aux_layer_23": 0.2119140625, "loss_aux_layer_3": 0.072021484375, "loss_aux_layer_4": 0.0750732421875, "loss_aux_layer_5": 0.0767822265625, "loss_aux_layer_6": 0.0799560546875, "loss_aux_layer_7": 0.0771484375, "loss_aux_layer_8": 0.076171875, "loss_aux_layer_9": 0.07470703125, "step": 2129, "total_loss": 0.6639670729637146 }, { "epoch": 0.4216986735299941, "grad_norm": 0.9958680272102356, "learning_rate": 5e-05, "llm_loss": 0.6567647904157639, "loss": 3.0193, "loss_aux_layer_0": 0.022613525390625, "loss_aux_layer_1": 0.04339599609375, "loss_aux_layer_10": 0.073486328125, "loss_aux_layer_11": 0.078125, "loss_aux_layer_12": 0.083251953125, "loss_aux_layer_13": 0.089599609375, "loss_aux_layer_14": 0.098876953125, "loss_aux_layer_15": 0.10791015625, "loss_aux_layer_16": 0.1181640625, "loss_aux_layer_17": 0.1258544921875, "loss_aux_layer_18": 0.134765625, "loss_aux_layer_19": 0.136962890625, "loss_aux_layer_2": 0.056884765625, "loss_aux_layer_20": 0.14404296875, "loss_aux_layer_21": 0.150634765625, "loss_aux_layer_22": 0.171142578125, "loss_aux_layer_23": 0.209716796875, "loss_aux_layer_3": 0.0682373046875, "loss_aux_layer_4": 0.0712890625, "loss_aux_layer_5": 0.072998046875, "loss_aux_layer_6": 0.076416015625, "loss_aux_layer_7": 0.0740966796875, "loss_aux_layer_8": 0.073486328125, "loss_aux_layer_9": 0.07177734375, "step": 2130, "total_loss": 0.7548196911811829 }, { "epoch": 0.42189665412789545, "grad_norm": 1.4005424976348877, "learning_rate": 5e-05, "llm_loss": 0.6040135473012924, "loss": 2.7995, "loss_aux_layer_0": 0.02191162109375, "loss_aux_layer_1": 0.04400634765625, "loss_aux_layer_10": 0.0709228515625, "loss_aux_layer_11": 0.0751953125, "loss_aux_layer_12": 0.0804443359375, "loss_aux_layer_13": 0.086669921875, "loss_aux_layer_14": 0.095947265625, "loss_aux_layer_15": 0.1055908203125, "loss_aux_layer_16": 0.1158447265625, "loss_aux_layer_17": 0.1236572265625, "loss_aux_layer_18": 0.1319580078125, "loss_aux_layer_19": 0.13427734375, "loss_aux_layer_2": 0.05609130859375, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.148193359375, "loss_aux_layer_22": 0.1669921875, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.06707763671875, "loss_aux_layer_4": 0.06982421875, "loss_aux_layer_5": 0.0716552734375, "loss_aux_layer_6": 0.0748291015625, "loss_aux_layer_7": 0.0726318359375, "loss_aux_layer_8": 0.0718994140625, "loss_aux_layer_9": 0.0701904296875, "step": 2131, "total_loss": 0.6998851597309113 }, { "epoch": 0.4220946347257969, "grad_norm": 1.292719841003418, "learning_rate": 5e-05, "llm_loss": 0.5631238520145416, "loss": 2.652, "loss_aux_layer_0": 0.024810791015625, "loss_aux_layer_1": 0.0450439453125, "loss_aux_layer_10": 0.072998046875, "loss_aux_layer_11": 0.07763671875, "loss_aux_layer_12": 0.0831298828125, "loss_aux_layer_13": 0.08984375, "loss_aux_layer_14": 0.099365234375, "loss_aux_layer_15": 0.109619140625, "loss_aux_layer_16": 0.12060546875, "loss_aux_layer_17": 0.129150390625, "loss_aux_layer_18": 0.137939453125, "loss_aux_layer_19": 0.140625, "loss_aux_layer_2": 0.0582275390625, "loss_aux_layer_20": 0.148681640625, "loss_aux_layer_21": 0.15625, "loss_aux_layer_22": 0.17822265625, "loss_aux_layer_23": 0.216796875, "loss_aux_layer_3": 0.0687255859375, "loss_aux_layer_4": 0.0716552734375, "loss_aux_layer_5": 0.0733642578125, "loss_aux_layer_6": 0.076416015625, "loss_aux_layer_7": 0.0738525390625, "loss_aux_layer_8": 0.0728759765625, "loss_aux_layer_9": 0.0714111328125, "step": 2132, "total_loss": 0.6629912108182907 }, { "epoch": 0.4222926153236983, "grad_norm": 1.2578109502792358, "learning_rate": 5e-05, "llm_loss": 0.6511212140321732, "loss": 3.0007, "loss_aux_layer_0": 0.0238037109375, "loss_aux_layer_1": 0.04327392578125, "loss_aux_layer_10": 0.0732421875, "loss_aux_layer_11": 0.0780029296875, "loss_aux_layer_12": 0.08349609375, "loss_aux_layer_13": 0.0897216796875, "loss_aux_layer_14": 0.099609375, "loss_aux_layer_15": 0.1099853515625, "loss_aux_layer_16": 0.119873046875, "loss_aux_layer_17": 0.12841796875, "loss_aux_layer_18": 0.13671875, "loss_aux_layer_19": 0.138916015625, "loss_aux_layer_2": 0.05743408203125, "loss_aux_layer_20": 0.145751953125, "loss_aux_layer_21": 0.153076171875, "loss_aux_layer_22": 0.174072265625, "loss_aux_layer_23": 0.21240234375, "loss_aux_layer_3": 0.068603515625, "loss_aux_layer_4": 0.0716552734375, "loss_aux_layer_5": 0.0733642578125, "loss_aux_layer_6": 0.0767822265625, "loss_aux_layer_7": 0.0740966796875, "loss_aux_layer_8": 0.0731201171875, "loss_aux_layer_9": 0.0718994140625, "step": 2133, "total_loss": 0.7501866966485977 }, { "epoch": 0.4224905959215997, "grad_norm": 1.3982343673706055, "learning_rate": 5e-05, "llm_loss": 0.587317943572998, "loss": 2.7528, "loss_aux_layer_0": 0.021392822265625, "loss_aux_layer_1": 0.04425048828125, "loss_aux_layer_10": 0.0738525390625, "loss_aux_layer_11": 0.078857421875, "loss_aux_layer_12": 0.084716796875, "loss_aux_layer_13": 0.0911865234375, "loss_aux_layer_14": 0.1014404296875, "loss_aux_layer_15": 0.11181640625, "loss_aux_layer_16": 0.1229248046875, "loss_aux_layer_17": 0.13134765625, "loss_aux_layer_18": 0.14013671875, "loss_aux_layer_19": 0.142578125, "loss_aux_layer_2": 0.05780029296875, "loss_aux_layer_20": 0.1494140625, "loss_aux_layer_21": 0.156494140625, "loss_aux_layer_22": 0.179443359375, "loss_aux_layer_23": 0.218017578125, "loss_aux_layer_3": 0.069580078125, "loss_aux_layer_4": 0.0726318359375, "loss_aux_layer_5": 0.074462890625, "loss_aux_layer_6": 0.0775146484375, "loss_aux_layer_7": 0.074951171875, "loss_aux_layer_8": 0.073974609375, "loss_aux_layer_9": 0.072509765625, "step": 2134, "total_loss": 0.6881944388151169 }, { "epoch": 0.4226885765195011, "grad_norm": 1.2123539447784424, "learning_rate": 5e-05, "llm_loss": 0.560692235827446, "loss": 2.6277, "loss_aux_layer_0": 0.020721435546875, "loss_aux_layer_1": 0.042236328125, "loss_aux_layer_10": 0.07080078125, "loss_aux_layer_11": 0.075439453125, "loss_aux_layer_12": 0.08056640625, "loss_aux_layer_13": 0.087158203125, "loss_aux_layer_14": 0.0966796875, "loss_aux_layer_15": 0.10595703125, "loss_aux_layer_16": 0.1158447265625, "loss_aux_layer_17": 0.1236572265625, "loss_aux_layer_18": 0.132568359375, "loss_aux_layer_19": 0.135498046875, "loss_aux_layer_2": 0.0550537109375, "loss_aux_layer_20": 0.1435546875, "loss_aux_layer_21": 0.15087890625, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.208740234375, "loss_aux_layer_3": 0.06646728515625, "loss_aux_layer_4": 0.06939697265625, "loss_aux_layer_5": 0.0711669921875, "loss_aux_layer_6": 0.0743408203125, "loss_aux_layer_7": 0.07177734375, "loss_aux_layer_8": 0.071044921875, "loss_aux_layer_9": 0.06982421875, "step": 2135, "total_loss": 0.6569219082593918 }, { "epoch": 0.4228865571174025, "grad_norm": 1.583691954612732, "learning_rate": 5e-05, "llm_loss": 0.6267445087432861, "loss": 2.901, "loss_aux_layer_0": 0.02081298828125, "loss_aux_layer_1": 0.04351806640625, "loss_aux_layer_10": 0.071533203125, "loss_aux_layer_11": 0.07666015625, "loss_aux_layer_12": 0.0823974609375, "loss_aux_layer_13": 0.0892333984375, "loss_aux_layer_14": 0.099365234375, "loss_aux_layer_15": 0.1094970703125, "loss_aux_layer_16": 0.1195068359375, "loss_aux_layer_17": 0.1275634765625, "loss_aux_layer_18": 0.13671875, "loss_aux_layer_19": 0.13916015625, "loss_aux_layer_2": 0.056396484375, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.154296875, "loss_aux_layer_22": 0.176513671875, "loss_aux_layer_23": 0.216552734375, "loss_aux_layer_3": 0.0672607421875, "loss_aux_layer_4": 0.070068359375, "loss_aux_layer_5": 0.072021484375, "loss_aux_layer_6": 0.074951171875, "loss_aux_layer_7": 0.072265625, "loss_aux_layer_8": 0.0714111328125, "loss_aux_layer_9": 0.070068359375, "step": 2136, "total_loss": 0.725246399641037 }, { "epoch": 0.4230845377153039, "grad_norm": 1.4771153926849365, "learning_rate": 5e-05, "llm_loss": 0.6168328374624252, "loss": 2.864, "loss_aux_layer_0": 0.021697998046875, "loss_aux_layer_1": 0.04376220703125, "loss_aux_layer_10": 0.0732421875, "loss_aux_layer_11": 0.078125, "loss_aux_layer_12": 0.0833740234375, "loss_aux_layer_13": 0.08984375, "loss_aux_layer_14": 0.099853515625, "loss_aux_layer_15": 0.109375, "loss_aux_layer_16": 0.119384765625, "loss_aux_layer_17": 0.1280517578125, "loss_aux_layer_18": 0.1357421875, "loss_aux_layer_19": 0.138671875, "loss_aux_layer_2": 0.05755615234375, "loss_aux_layer_20": 0.146240234375, "loss_aux_layer_21": 0.154052734375, "loss_aux_layer_22": 0.176025390625, "loss_aux_layer_23": 0.215087890625, "loss_aux_layer_3": 0.068359375, "loss_aux_layer_4": 0.071533203125, "loss_aux_layer_5": 0.0736083984375, "loss_aux_layer_6": 0.076904296875, "loss_aux_layer_7": 0.07421875, "loss_aux_layer_8": 0.0733642578125, "loss_aux_layer_9": 0.072021484375, "step": 2137, "total_loss": 0.7159960567951202 }, { "epoch": 0.4232825183132053, "grad_norm": 1.243073582649231, "learning_rate": 5e-05, "llm_loss": 0.5992762595415115, "loss": 2.798, "loss_aux_layer_0": 0.022064208984375, "loss_aux_layer_1": 0.0452880859375, "loss_aux_layer_10": 0.0750732421875, "loss_aux_layer_11": 0.0799560546875, "loss_aux_layer_12": 0.085205078125, "loss_aux_layer_13": 0.0916748046875, "loss_aux_layer_14": 0.101318359375, "loss_aux_layer_15": 0.110595703125, "loss_aux_layer_16": 0.1202392578125, "loss_aux_layer_17": 0.128662109375, "loss_aux_layer_18": 0.136962890625, "loss_aux_layer_19": 0.1396484375, "loss_aux_layer_2": 0.0594482421875, "loss_aux_layer_20": 0.1474609375, "loss_aux_layer_21": 0.154052734375, "loss_aux_layer_22": 0.17431640625, "loss_aux_layer_23": 0.212890625, "loss_aux_layer_3": 0.0701904296875, "loss_aux_layer_4": 0.0732421875, "loss_aux_layer_5": 0.0753173828125, "loss_aux_layer_6": 0.0784912109375, "loss_aux_layer_7": 0.07568359375, "loss_aux_layer_8": 0.07470703125, "loss_aux_layer_9": 0.0736083984375, "step": 2138, "total_loss": 0.69950932264328 }, { "epoch": 0.42348049891110673, "grad_norm": 1.1748934984207153, "learning_rate": 5e-05, "llm_loss": 0.5905016660690308, "loss": 2.7592, "loss_aux_layer_0": 0.022796630859375, "loss_aux_layer_1": 0.04559326171875, "loss_aux_layer_10": 0.0731201171875, "loss_aux_layer_11": 0.077880859375, "loss_aux_layer_12": 0.08349609375, "loss_aux_layer_13": 0.0899658203125, "loss_aux_layer_14": 0.09912109375, "loss_aux_layer_15": 0.1092529296875, "loss_aux_layer_16": 0.1195068359375, "loss_aux_layer_17": 0.127197265625, "loss_aux_layer_18": 0.135986328125, "loss_aux_layer_19": 0.138916015625, "loss_aux_layer_2": 0.05828857421875, "loss_aux_layer_20": 0.14697265625, "loss_aux_layer_21": 0.15380859375, "loss_aux_layer_22": 0.175048828125, "loss_aux_layer_23": 0.2138671875, "loss_aux_layer_3": 0.0694580078125, "loss_aux_layer_4": 0.072509765625, "loss_aux_layer_5": 0.07421875, "loss_aux_layer_6": 0.0770263671875, "loss_aux_layer_7": 0.07421875, "loss_aux_layer_8": 0.0731201171875, "loss_aux_layer_9": 0.0718994140625, "step": 2139, "total_loss": 0.6898014098405838 }, { "epoch": 0.4236784795090081, "grad_norm": 1.5823532342910767, "learning_rate": 5e-05, "llm_loss": 0.6397222429513931, "loss": 2.9466, "loss_aux_layer_0": 0.023101806640625, "loss_aux_layer_1": 0.04388427734375, "loss_aux_layer_10": 0.0726318359375, "loss_aux_layer_11": 0.0770263671875, "loss_aux_layer_12": 0.0821533203125, "loss_aux_layer_13": 0.088134765625, "loss_aux_layer_14": 0.0972900390625, "loss_aux_layer_15": 0.1064453125, "loss_aux_layer_16": 0.1160888671875, "loss_aux_layer_17": 0.12353515625, "loss_aux_layer_18": 0.1318359375, "loss_aux_layer_19": 0.134765625, "loss_aux_layer_2": 0.05743408203125, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.149169921875, "loss_aux_layer_22": 0.170166015625, "loss_aux_layer_23": 0.208740234375, "loss_aux_layer_3": 0.0675048828125, "loss_aux_layer_4": 0.070556640625, "loss_aux_layer_5": 0.072509765625, "loss_aux_layer_6": 0.0755615234375, "loss_aux_layer_7": 0.072998046875, "loss_aux_layer_8": 0.0718994140625, "loss_aux_layer_9": 0.0711669921875, "step": 2140, "total_loss": 0.7366378009319305 }, { "epoch": 0.42387646010690955, "grad_norm": 1.1067243814468384, "learning_rate": 5e-05, "llm_loss": 0.7413654178380966, "loss": 3.3429, "loss_aux_layer_0": 0.020904541015625, "loss_aux_layer_1": 0.03985595703125, "loss_aux_layer_10": 0.0673828125, "loss_aux_layer_11": 0.072265625, "loss_aux_layer_12": 0.077880859375, "loss_aux_layer_13": 0.084228515625, "loss_aux_layer_14": 0.0936279296875, "loss_aux_layer_15": 0.1031494140625, "loss_aux_layer_16": 0.114013671875, "loss_aux_layer_17": 0.1231689453125, "loss_aux_layer_18": 0.1328125, "loss_aux_layer_19": 0.136962890625, "loss_aux_layer_2": 0.05126953125, "loss_aux_layer_20": 0.14501953125, "loss_aux_layer_21": 0.153076171875, "loss_aux_layer_22": 0.17431640625, "loss_aux_layer_23": 0.21337890625, "loss_aux_layer_3": 0.0614013671875, "loss_aux_layer_4": 0.064453125, "loss_aux_layer_5": 0.06640625, "loss_aux_layer_6": 0.0694580078125, "loss_aux_layer_7": 0.067138671875, "loss_aux_layer_8": 0.06640625, "loss_aux_layer_9": 0.0655517578125, "step": 2141, "total_loss": 0.8357297629117966 }, { "epoch": 0.42407444070481093, "grad_norm": 1.5447596311569214, "learning_rate": 5e-05, "llm_loss": 0.6204027980566025, "loss": 2.8639, "loss_aux_layer_0": 0.023193359375, "loss_aux_layer_1": 0.04248046875, "loss_aux_layer_10": 0.06982421875, "loss_aux_layer_11": 0.07421875, "loss_aux_layer_12": 0.0794677734375, "loss_aux_layer_13": 0.08544921875, "loss_aux_layer_14": 0.0947265625, "loss_aux_layer_15": 0.1043701171875, "loss_aux_layer_16": 0.1141357421875, "loss_aux_layer_17": 0.122314453125, "loss_aux_layer_18": 0.130859375, "loss_aux_layer_19": 0.135009765625, "loss_aux_layer_2": 0.05419921875, "loss_aux_layer_20": 0.14306640625, "loss_aux_layer_21": 0.15087890625, "loss_aux_layer_22": 0.17236328125, "loss_aux_layer_23": 0.21240234375, "loss_aux_layer_3": 0.06500244140625, "loss_aux_layer_4": 0.0679931640625, "loss_aux_layer_5": 0.0699462890625, "loss_aux_layer_6": 0.0733642578125, "loss_aux_layer_7": 0.0706787109375, "loss_aux_layer_8": 0.0701904296875, "loss_aux_layer_9": 0.068603515625, "step": 2142, "total_loss": 0.7159822881221771 }, { "epoch": 0.4242724213027123, "grad_norm": 1.5956367254257202, "learning_rate": 5e-05, "llm_loss": 0.6536488980054855, "loss": 3.0024, "loss_aux_layer_0": 0.021514892578125, "loss_aux_layer_1": 0.04302978515625, "loss_aux_layer_10": 0.07177734375, "loss_aux_layer_11": 0.076416015625, "loss_aux_layer_12": 0.08203125, "loss_aux_layer_13": 0.08837890625, "loss_aux_layer_14": 0.09765625, "loss_aux_layer_15": 0.107177734375, "loss_aux_layer_16": 0.117431640625, "loss_aux_layer_17": 0.125732421875, "loss_aux_layer_18": 0.134033203125, "loss_aux_layer_19": 0.13671875, "loss_aux_layer_2": 0.0550537109375, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.150390625, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.208984375, "loss_aux_layer_3": 0.0665283203125, "loss_aux_layer_4": 0.070068359375, "loss_aux_layer_5": 0.071533203125, "loss_aux_layer_6": 0.074462890625, "loss_aux_layer_7": 0.072021484375, "loss_aux_layer_8": 0.0712890625, "loss_aux_layer_9": 0.0703125, "step": 2143, "total_loss": 0.7505916357040405 }, { "epoch": 0.42447040190061375, "grad_norm": 1.056380033493042, "learning_rate": 5e-05, "llm_loss": 0.6373714506626129, "loss": 2.9168, "loss_aux_layer_0": 0.021484375, "loss_aux_layer_1": 0.03948974609375, "loss_aux_layer_10": 0.06536865234375, "loss_aux_layer_11": 0.069580078125, "loss_aux_layer_12": 0.0750732421875, "loss_aux_layer_13": 0.081298828125, "loss_aux_layer_14": 0.0911865234375, "loss_aux_layer_15": 0.10107421875, "loss_aux_layer_16": 0.1112060546875, "loss_aux_layer_17": 0.1195068359375, "loss_aux_layer_18": 0.1278076171875, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.05120849609375, "loss_aux_layer_20": 0.14111328125, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.20556640625, "loss_aux_layer_3": 0.061279296875, "loss_aux_layer_4": 0.0640869140625, "loss_aux_layer_5": 0.065673828125, "loss_aux_layer_6": 0.068359375, "loss_aux_layer_7": 0.06585693359375, "loss_aux_layer_8": 0.06524658203125, "loss_aux_layer_9": 0.06396484375, "step": 2144, "total_loss": 0.7291932702064514 }, { "epoch": 0.42466838249851513, "grad_norm": 2.2568459510803223, "learning_rate": 5e-05, "llm_loss": 0.5380657911300659, "loss": 2.5401, "loss_aux_layer_0": 0.02130126953125, "loss_aux_layer_1": 0.04351806640625, "loss_aux_layer_10": 0.0706787109375, "loss_aux_layer_11": 0.07568359375, "loss_aux_layer_12": 0.0811767578125, "loss_aux_layer_13": 0.0875244140625, "loss_aux_layer_14": 0.0966796875, "loss_aux_layer_15": 0.1058349609375, "loss_aux_layer_16": 0.115966796875, "loss_aux_layer_17": 0.1243896484375, "loss_aux_layer_18": 0.133056640625, "loss_aux_layer_19": 0.1357421875, "loss_aux_layer_2": 0.056396484375, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.15185546875, "loss_aux_layer_22": 0.17529296875, "loss_aux_layer_23": 0.214599609375, "loss_aux_layer_3": 0.067138671875, "loss_aux_layer_4": 0.0699462890625, "loss_aux_layer_5": 0.0714111328125, "loss_aux_layer_6": 0.07421875, "loss_aux_layer_7": 0.0716552734375, "loss_aux_layer_8": 0.07080078125, "loss_aux_layer_9": 0.0693359375, "step": 2145, "total_loss": 0.6350361853837967 }, { "epoch": 0.42486636309641657, "grad_norm": 1.410775065422058, "learning_rate": 5e-05, "llm_loss": 0.5830729305744171, "loss": 2.7153, "loss_aux_layer_0": 0.021697998046875, "loss_aux_layer_1": 0.04205322265625, "loss_aux_layer_10": 0.0704345703125, "loss_aux_layer_11": 0.0750732421875, "loss_aux_layer_12": 0.0799560546875, "loss_aux_layer_13": 0.086181640625, "loss_aux_layer_14": 0.0950927734375, "loss_aux_layer_15": 0.1044921875, "loss_aux_layer_16": 0.114501953125, "loss_aux_layer_17": 0.1224365234375, "loss_aux_layer_18": 0.131591796875, "loss_aux_layer_19": 0.135009765625, "loss_aux_layer_2": 0.05596923828125, "loss_aux_layer_20": 0.142578125, "loss_aux_layer_21": 0.1494140625, "loss_aux_layer_22": 0.17138671875, "loss_aux_layer_23": 0.2109375, "loss_aux_layer_3": 0.066162109375, "loss_aux_layer_4": 0.068603515625, "loss_aux_layer_5": 0.0704345703125, "loss_aux_layer_6": 0.073486328125, "loss_aux_layer_7": 0.071044921875, "loss_aux_layer_8": 0.0703125, "loss_aux_layer_9": 0.06884765625, "step": 2146, "total_loss": 0.678815171122551 }, { "epoch": 0.42506434369431795, "grad_norm": 1.7134042978286743, "learning_rate": 5e-05, "llm_loss": 0.6224173828959465, "loss": 2.8759, "loss_aux_layer_0": 0.02154541015625, "loss_aux_layer_1": 0.04229736328125, "loss_aux_layer_10": 0.0711669921875, "loss_aux_layer_11": 0.0755615234375, "loss_aux_layer_12": 0.080810546875, "loss_aux_layer_13": 0.0869140625, "loss_aux_layer_14": 0.0966796875, "loss_aux_layer_15": 0.1064453125, "loss_aux_layer_16": 0.1163330078125, "loss_aux_layer_17": 0.1246337890625, "loss_aux_layer_18": 0.133056640625, "loss_aux_layer_19": 0.13671875, "loss_aux_layer_2": 0.05584716796875, "loss_aux_layer_20": 0.14453125, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.171142578125, "loss_aux_layer_23": 0.2080078125, "loss_aux_layer_3": 0.0667724609375, "loss_aux_layer_4": 0.06982421875, "loss_aux_layer_5": 0.0712890625, "loss_aux_layer_6": 0.0745849609375, "loss_aux_layer_7": 0.0718994140625, "loss_aux_layer_8": 0.0706787109375, "loss_aux_layer_9": 0.0697021484375, "step": 2147, "total_loss": 0.7189775556325912 }, { "epoch": 0.4252623242922194, "grad_norm": 1.1300532817840576, "learning_rate": 5e-05, "llm_loss": 0.650131955742836, "loss": 2.9822, "loss_aux_layer_0": 0.02117919921875, "loss_aux_layer_1": 0.04144287109375, "loss_aux_layer_10": 0.069580078125, "loss_aux_layer_11": 0.0743408203125, "loss_aux_layer_12": 0.07958984375, "loss_aux_layer_13": 0.0858154296875, "loss_aux_layer_14": 0.0950927734375, "loss_aux_layer_15": 0.1048583984375, "loss_aux_layer_16": 0.114990234375, "loss_aux_layer_17": 0.1229248046875, "loss_aux_layer_18": 0.132080078125, "loss_aux_layer_19": 0.1357421875, "loss_aux_layer_2": 0.05352783203125, "loss_aux_layer_20": 0.14306640625, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.17236328125, "loss_aux_layer_23": 0.210693359375, "loss_aux_layer_3": 0.06427001953125, "loss_aux_layer_4": 0.067626953125, "loss_aux_layer_5": 0.0694580078125, "loss_aux_layer_6": 0.072509765625, "loss_aux_layer_7": 0.0701904296875, "loss_aux_layer_8": 0.0693359375, "loss_aux_layer_9": 0.0684814453125, "step": 2148, "total_loss": 0.7455421537160873 }, { "epoch": 0.42546030489012077, "grad_norm": 1.98903226852417, "learning_rate": 5e-05, "llm_loss": 0.525802306830883, "loss": 2.4935, "loss_aux_layer_0": 0.021636962890625, "loss_aux_layer_1": 0.04400634765625, "loss_aux_layer_10": 0.07177734375, "loss_aux_layer_11": 0.0767822265625, "loss_aux_layer_12": 0.0821533203125, "loss_aux_layer_13": 0.0887451171875, "loss_aux_layer_14": 0.09765625, "loss_aux_layer_15": 0.1075439453125, "loss_aux_layer_16": 0.117431640625, "loss_aux_layer_17": 0.125244140625, "loss_aux_layer_18": 0.1337890625, "loss_aux_layer_19": 0.13623046875, "loss_aux_layer_2": 0.05816650390625, "loss_aux_layer_20": 0.14306640625, "loss_aux_layer_21": 0.15087890625, "loss_aux_layer_22": 0.173095703125, "loss_aux_layer_23": 0.2119140625, "loss_aux_layer_3": 0.068359375, "loss_aux_layer_4": 0.07080078125, "loss_aux_layer_5": 0.0723876953125, "loss_aux_layer_6": 0.0751953125, "loss_aux_layer_7": 0.07275390625, "loss_aux_layer_8": 0.07177734375, "loss_aux_layer_9": 0.0704345703125, "step": 2149, "total_loss": 0.623367503285408 }, { "epoch": 0.42565828548802215, "grad_norm": 1.1862784624099731, "learning_rate": 5e-05, "llm_loss": 0.5607409775257111, "loss": 2.6258, "loss_aux_layer_0": 0.022796630859375, "loss_aux_layer_1": 0.04180908203125, "loss_aux_layer_10": 0.0697021484375, "loss_aux_layer_11": 0.073974609375, "loss_aux_layer_12": 0.07958984375, "loss_aux_layer_13": 0.086181640625, "loss_aux_layer_14": 0.0953369140625, "loss_aux_layer_15": 0.10498046875, "loss_aux_layer_16": 0.114990234375, "loss_aux_layer_17": 0.1239013671875, "loss_aux_layer_18": 0.1328125, "loss_aux_layer_19": 0.13623046875, "loss_aux_layer_2": 0.0550537109375, "loss_aux_layer_20": 0.144287109375, "loss_aux_layer_21": 0.150146484375, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.06591796875, "loss_aux_layer_4": 0.0689697265625, "loss_aux_layer_5": 0.07080078125, "loss_aux_layer_6": 0.0736083984375, "loss_aux_layer_7": 0.0709228515625, "loss_aux_layer_8": 0.0694580078125, "loss_aux_layer_9": 0.068115234375, "step": 2150, "total_loss": 0.6564576178789139 }, { "epoch": 0.4258562660859236, "grad_norm": 1.770674467086792, "learning_rate": 5e-05, "llm_loss": 0.5709977149963379, "loss": 2.6703, "loss_aux_layer_0": 0.021728515625, "loss_aux_layer_1": 0.04156494140625, "loss_aux_layer_10": 0.069580078125, "loss_aux_layer_11": 0.073974609375, "loss_aux_layer_12": 0.0794677734375, "loss_aux_layer_13": 0.086181640625, "loss_aux_layer_14": 0.0960693359375, "loss_aux_layer_15": 0.106201171875, "loss_aux_layer_16": 0.1171875, "loss_aux_layer_17": 0.12548828125, "loss_aux_layer_18": 0.13427734375, "loss_aux_layer_19": 0.1376953125, "loss_aux_layer_2": 0.0550537109375, "loss_aux_layer_20": 0.14599609375, "loss_aux_layer_21": 0.154052734375, "loss_aux_layer_22": 0.17578125, "loss_aux_layer_23": 0.21484375, "loss_aux_layer_3": 0.0650634765625, "loss_aux_layer_4": 0.067626953125, "loss_aux_layer_5": 0.07025146484375, "loss_aux_layer_6": 0.0728759765625, "loss_aux_layer_7": 0.0701904296875, "loss_aux_layer_8": 0.06927490234375, "loss_aux_layer_9": 0.068359375, "step": 2151, "total_loss": 0.6675815433263779 }, { "epoch": 0.42605424668382497, "grad_norm": 1.011090636253357, "learning_rate": 5e-05, "llm_loss": 0.6054261028766632, "loss": 2.8057, "loss_aux_layer_0": 0.02154541015625, "loss_aux_layer_1": 0.04083251953125, "loss_aux_layer_10": 0.06982421875, "loss_aux_layer_11": 0.0745849609375, "loss_aux_layer_12": 0.080078125, "loss_aux_layer_13": 0.0865478515625, "loss_aux_layer_14": 0.0965576171875, "loss_aux_layer_15": 0.1060791015625, "loss_aux_layer_16": 0.1162109375, "loss_aux_layer_17": 0.12451171875, "loss_aux_layer_18": 0.1328125, "loss_aux_layer_19": 0.135986328125, "loss_aux_layer_2": 0.0538330078125, "loss_aux_layer_20": 0.14404296875, "loss_aux_layer_21": 0.15234375, "loss_aux_layer_22": 0.173828125, "loss_aux_layer_23": 0.21240234375, "loss_aux_layer_3": 0.0643310546875, "loss_aux_layer_4": 0.0673828125, "loss_aux_layer_5": 0.0693359375, "loss_aux_layer_6": 0.0723876953125, "loss_aux_layer_7": 0.0699462890625, "loss_aux_layer_8": 0.0693359375, "loss_aux_layer_9": 0.0684814453125, "step": 2152, "total_loss": 0.7014178782701492 }, { "epoch": 0.4262522272817264, "grad_norm": 1.1042497158050537, "learning_rate": 5e-05, "llm_loss": 0.5080851018428802, "loss": 2.413, "loss_aux_layer_0": 0.02215576171875, "loss_aux_layer_1": 0.04290771484375, "loss_aux_layer_10": 0.0687255859375, "loss_aux_layer_11": 0.0733642578125, "loss_aux_layer_12": 0.07861328125, "loss_aux_layer_13": 0.0848388671875, "loss_aux_layer_14": 0.09423828125, "loss_aux_layer_15": 0.1041259765625, "loss_aux_layer_16": 0.1148681640625, "loss_aux_layer_17": 0.1226806640625, "loss_aux_layer_18": 0.131591796875, "loss_aux_layer_19": 0.135498046875, "loss_aux_layer_2": 0.0543212890625, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.171875, "loss_aux_layer_23": 0.210693359375, "loss_aux_layer_3": 0.064208984375, "loss_aux_layer_4": 0.067138671875, "loss_aux_layer_5": 0.0689697265625, "loss_aux_layer_6": 0.072021484375, "loss_aux_layer_7": 0.0694580078125, "loss_aux_layer_8": 0.06884765625, "loss_aux_layer_9": 0.0675048828125, "step": 2153, "total_loss": 0.6032572090625763 }, { "epoch": 0.4264502078796278, "grad_norm": 1.2922435998916626, "learning_rate": 5e-05, "llm_loss": 0.6239770725369453, "loss": 2.9048, "loss_aux_layer_0": 0.021484375, "loss_aux_layer_1": 0.04669189453125, "loss_aux_layer_10": 0.0771484375, "loss_aux_layer_11": 0.0821533203125, "loss_aux_layer_12": 0.087646484375, "loss_aux_layer_13": 0.094482421875, "loss_aux_layer_14": 0.10400390625, "loss_aux_layer_15": 0.11328125, "loss_aux_layer_16": 0.123291015625, "loss_aux_layer_17": 0.130126953125, "loss_aux_layer_18": 0.137939453125, "loss_aux_layer_19": 0.139404296875, "loss_aux_layer_2": 0.06170654296875, "loss_aux_layer_20": 0.146728515625, "loss_aux_layer_21": 0.154296875, "loss_aux_layer_22": 0.176513671875, "loss_aux_layer_23": 0.21484375, "loss_aux_layer_3": 0.0731201171875, "loss_aux_layer_4": 0.076416015625, "loss_aux_layer_5": 0.0782470703125, "loss_aux_layer_6": 0.0816650390625, "loss_aux_layer_7": 0.078857421875, "loss_aux_layer_8": 0.0775146484375, "loss_aux_layer_9": 0.0758056640625, "step": 2154, "total_loss": 0.72620490193367 }, { "epoch": 0.4266481884775292, "grad_norm": 1.025699496269226, "learning_rate": 5e-05, "llm_loss": 0.5386967808008194, "loss": 2.5338, "loss_aux_layer_0": 0.02142333984375, "loss_aux_layer_1": 0.0423583984375, "loss_aux_layer_10": 0.0687255859375, "loss_aux_layer_11": 0.0728759765625, "loss_aux_layer_12": 0.077880859375, "loss_aux_layer_13": 0.084228515625, "loss_aux_layer_14": 0.0933837890625, "loss_aux_layer_15": 0.1029052734375, "loss_aux_layer_16": 0.1131591796875, "loss_aux_layer_17": 0.1212158203125, "loss_aux_layer_18": 0.130615234375, "loss_aux_layer_19": 0.134033203125, "loss_aux_layer_2": 0.0548095703125, "loss_aux_layer_20": 0.14208984375, "loss_aux_layer_21": 0.150390625, "loss_aux_layer_22": 0.172119140625, "loss_aux_layer_23": 0.211669921875, "loss_aux_layer_3": 0.06494140625, "loss_aux_layer_4": 0.0677490234375, "loss_aux_layer_5": 0.0694580078125, "loss_aux_layer_6": 0.072265625, "loss_aux_layer_7": 0.0693359375, "loss_aux_layer_8": 0.0689697265625, "loss_aux_layer_9": 0.067626953125, "step": 2155, "total_loss": 0.6334565579891205 }, { "epoch": 0.4268461690754306, "grad_norm": 1.1584898233413696, "learning_rate": 5e-05, "llm_loss": 0.622760459780693, "loss": 2.889, "loss_aux_layer_0": 0.021484375, "loss_aux_layer_1": 0.04681396484375, "loss_aux_layer_10": 0.0748291015625, "loss_aux_layer_11": 0.0794677734375, "loss_aux_layer_12": 0.0848388671875, "loss_aux_layer_13": 0.0909423828125, "loss_aux_layer_14": 0.099609375, "loss_aux_layer_15": 0.10888671875, "loss_aux_layer_16": 0.11865234375, "loss_aux_layer_17": 0.1263427734375, "loss_aux_layer_18": 0.134521484375, "loss_aux_layer_19": 0.137451171875, "loss_aux_layer_2": 0.05975341796875, "loss_aux_layer_20": 0.144287109375, "loss_aux_layer_21": 0.151611328125, "loss_aux_layer_22": 0.1728515625, "loss_aux_layer_23": 0.211669921875, "loss_aux_layer_3": 0.07080078125, "loss_aux_layer_4": 0.0738525390625, "loss_aux_layer_5": 0.0755615234375, "loss_aux_layer_6": 0.078369140625, "loss_aux_layer_7": 0.075927734375, "loss_aux_layer_8": 0.0750732421875, "loss_aux_layer_9": 0.0736083984375, "step": 2156, "total_loss": 0.7222453653812408 }, { "epoch": 0.42704414967333204, "grad_norm": 1.112194538116455, "learning_rate": 5e-05, "llm_loss": 0.6021033674478531, "loss": 2.8058, "loss_aux_layer_0": 0.02337646484375, "loss_aux_layer_1": 0.04473876953125, "loss_aux_layer_10": 0.072998046875, "loss_aux_layer_11": 0.0780029296875, "loss_aux_layer_12": 0.0833740234375, "loss_aux_layer_13": 0.0894775390625, "loss_aux_layer_14": 0.0994873046875, "loss_aux_layer_15": 0.109375, "loss_aux_layer_16": 0.119873046875, "loss_aux_layer_17": 0.128173828125, "loss_aux_layer_18": 0.136474609375, "loss_aux_layer_19": 0.138671875, "loss_aux_layer_2": 0.05816650390625, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.153564453125, "loss_aux_layer_22": 0.175048828125, "loss_aux_layer_23": 0.214111328125, "loss_aux_layer_3": 0.0694580078125, "loss_aux_layer_4": 0.0723876953125, "loss_aux_layer_5": 0.0740966796875, "loss_aux_layer_6": 0.07763671875, "loss_aux_layer_7": 0.0745849609375, "loss_aux_layer_8": 0.073486328125, "loss_aux_layer_9": 0.07177734375, "step": 2157, "total_loss": 0.7014602273702621 }, { "epoch": 0.4272421302712334, "grad_norm": 1.4490187168121338, "learning_rate": 5e-05, "llm_loss": 0.6932093501091003, "loss": 3.1522, "loss_aux_layer_0": 0.021270751953125, "loss_aux_layer_1": 0.04193115234375, "loss_aux_layer_10": 0.068115234375, "loss_aux_layer_11": 0.0726318359375, "loss_aux_layer_12": 0.0780029296875, "loss_aux_layer_13": 0.084228515625, "loss_aux_layer_14": 0.093994140625, "loss_aux_layer_15": 0.104248046875, "loss_aux_layer_16": 0.1148681640625, "loss_aux_layer_17": 0.123779296875, "loss_aux_layer_18": 0.13232421875, "loss_aux_layer_19": 0.136474609375, "loss_aux_layer_2": 0.05316162109375, "loss_aux_layer_20": 0.14453125, "loss_aux_layer_21": 0.152099609375, "loss_aux_layer_22": 0.172119140625, "loss_aux_layer_23": 0.209716796875, "loss_aux_layer_3": 0.063232421875, "loss_aux_layer_4": 0.06622314453125, "loss_aux_layer_5": 0.06829833984375, "loss_aux_layer_6": 0.0711669921875, "loss_aux_layer_7": 0.0689697265625, "loss_aux_layer_8": 0.0679931640625, "loss_aux_layer_9": 0.06689453125, "step": 2158, "total_loss": 0.7880598902702332 }, { "epoch": 0.4274401108691348, "grad_norm": 1.125353217124939, "learning_rate": 5e-05, "llm_loss": 0.5577213615179062, "loss": 2.6324, "loss_aux_layer_0": 0.02166748046875, "loss_aux_layer_1": 0.0443115234375, "loss_aux_layer_10": 0.074462890625, "loss_aux_layer_11": 0.07958984375, "loss_aux_layer_12": 0.0853271484375, "loss_aux_layer_13": 0.0921630859375, "loss_aux_layer_14": 0.1016845703125, "loss_aux_layer_15": 0.111572265625, "loss_aux_layer_16": 0.1221923828125, "loss_aux_layer_17": 0.1297607421875, "loss_aux_layer_18": 0.137939453125, "loss_aux_layer_19": 0.14111328125, "loss_aux_layer_2": 0.05712890625, "loss_aux_layer_20": 0.14794921875, "loss_aux_layer_21": 0.15625, "loss_aux_layer_22": 0.177490234375, "loss_aux_layer_23": 0.21728515625, "loss_aux_layer_3": 0.0682373046875, "loss_aux_layer_4": 0.0716552734375, "loss_aux_layer_5": 0.07373046875, "loss_aux_layer_6": 0.0771484375, "loss_aux_layer_7": 0.07470703125, "loss_aux_layer_8": 0.07373046875, "loss_aux_layer_9": 0.0726318359375, "step": 2159, "total_loss": 0.6581052839756012 }, { "epoch": 0.42763809146703624, "grad_norm": 1.259692907333374, "learning_rate": 5e-05, "llm_loss": 0.5576165392994881, "loss": 2.6234, "loss_aux_layer_0": 0.02227783203125, "loss_aux_layer_1": 0.04449462890625, "loss_aux_layer_10": 0.0723876953125, "loss_aux_layer_11": 0.0772705078125, "loss_aux_layer_12": 0.0826416015625, "loss_aux_layer_13": 0.0888671875, "loss_aux_layer_14": 0.0982666015625, "loss_aux_layer_15": 0.1077880859375, "loss_aux_layer_16": 0.117919921875, "loss_aux_layer_17": 0.12548828125, "loss_aux_layer_18": 0.134521484375, "loss_aux_layer_19": 0.137451171875, "loss_aux_layer_2": 0.05755615234375, "loss_aux_layer_20": 0.144775390625, "loss_aux_layer_21": 0.152587890625, "loss_aux_layer_22": 0.1748046875, "loss_aux_layer_23": 0.213623046875, "loss_aux_layer_3": 0.068115234375, "loss_aux_layer_4": 0.0709228515625, "loss_aux_layer_5": 0.07275390625, "loss_aux_layer_6": 0.075927734375, "loss_aux_layer_7": 0.0732421875, "loss_aux_layer_8": 0.072509765625, "loss_aux_layer_9": 0.071044921875, "step": 2160, "total_loss": 0.6558557897806168 }, { "epoch": 0.4278360720649376, "grad_norm": 1.4935097694396973, "learning_rate": 5e-05, "llm_loss": 0.6459859311580658, "loss": 2.977, "loss_aux_layer_0": 0.023834228515625, "loss_aux_layer_1": 0.0423583984375, "loss_aux_layer_10": 0.072021484375, "loss_aux_layer_11": 0.0765380859375, "loss_aux_layer_12": 0.08203125, "loss_aux_layer_13": 0.0889892578125, "loss_aux_layer_14": 0.0992431640625, "loss_aux_layer_15": 0.1092529296875, "loss_aux_layer_16": 0.1201171875, "loss_aux_layer_17": 0.1287841796875, "loss_aux_layer_18": 0.13720703125, "loss_aux_layer_19": 0.14013671875, "loss_aux_layer_2": 0.0552978515625, "loss_aux_layer_20": 0.147216796875, "loss_aux_layer_21": 0.153564453125, "loss_aux_layer_22": 0.173095703125, "loss_aux_layer_23": 0.2109375, "loss_aux_layer_3": 0.0665283203125, "loss_aux_layer_4": 0.06982421875, "loss_aux_layer_5": 0.07177734375, "loss_aux_layer_6": 0.0753173828125, "loss_aux_layer_7": 0.072509765625, "loss_aux_layer_8": 0.0714111328125, "loss_aux_layer_9": 0.0703125, "step": 2161, "total_loss": 0.744253009557724 }, { "epoch": 0.42803405266283906, "grad_norm": 1.3073424100875854, "learning_rate": 5e-05, "llm_loss": 0.5792034566402435, "loss": 2.718, "loss_aux_layer_0": 0.0238037109375, "loss_aux_layer_1": 0.0445556640625, "loss_aux_layer_10": 0.0740966796875, "loss_aux_layer_11": 0.0789794921875, "loss_aux_layer_12": 0.084716796875, "loss_aux_layer_13": 0.0916748046875, "loss_aux_layer_14": 0.1016845703125, "loss_aux_layer_15": 0.1114501953125, "loss_aux_layer_16": 0.1217041015625, "loss_aux_layer_17": 0.12939453125, "loss_aux_layer_18": 0.13720703125, "loss_aux_layer_19": 0.140625, "loss_aux_layer_2": 0.058349609375, "loss_aux_layer_20": 0.148681640625, "loss_aux_layer_21": 0.155029296875, "loss_aux_layer_22": 0.17626953125, "loss_aux_layer_23": 0.21435546875, "loss_aux_layer_3": 0.0692138671875, "loss_aux_layer_4": 0.0723876953125, "loss_aux_layer_5": 0.074462890625, "loss_aux_layer_6": 0.0777587890625, "loss_aux_layer_7": 0.0748291015625, "loss_aux_layer_8": 0.0738525390625, "loss_aux_layer_9": 0.0726318359375, "step": 2162, "total_loss": 0.6794902980327606 }, { "epoch": 0.42823203326074044, "grad_norm": 1.3652310371398926, "learning_rate": 5e-05, "llm_loss": 0.5232508331537247, "loss": 2.4684, "loss_aux_layer_0": 0.02398681640625, "loss_aux_layer_1": 0.04052734375, "loss_aux_layer_10": 0.0675048828125, "loss_aux_layer_11": 0.07177734375, "loss_aux_layer_12": 0.0771484375, "loss_aux_layer_13": 0.0836181640625, "loss_aux_layer_14": 0.0931396484375, "loss_aux_layer_15": 0.10302734375, "loss_aux_layer_16": 0.113525390625, "loss_aux_layer_17": 0.12158203125, "loss_aux_layer_18": 0.1307373046875, "loss_aux_layer_19": 0.134765625, "loss_aux_layer_2": 0.0531005859375, "loss_aux_layer_20": 0.14208984375, "loss_aux_layer_21": 0.1494140625, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.2080078125, "loss_aux_layer_3": 0.06292724609375, "loss_aux_layer_4": 0.065673828125, "loss_aux_layer_5": 0.0677490234375, "loss_aux_layer_6": 0.0704345703125, "loss_aux_layer_7": 0.0679931640625, "loss_aux_layer_8": 0.06787109375, "loss_aux_layer_9": 0.0667724609375, "step": 2163, "total_loss": 0.6170927286148071 }, { "epoch": 0.4284300138586419, "grad_norm": 1.0281388759613037, "learning_rate": 5e-05, "llm_loss": 0.5339192599058151, "loss": 2.5121, "loss_aux_layer_0": 0.023590087890625, "loss_aux_layer_1": 0.04083251953125, "loss_aux_layer_10": 0.0673828125, "loss_aux_layer_11": 0.071533203125, "loss_aux_layer_12": 0.0767822265625, "loss_aux_layer_13": 0.0830078125, "loss_aux_layer_14": 0.0927734375, "loss_aux_layer_15": 0.1026611328125, "loss_aux_layer_16": 0.1138916015625, "loss_aux_layer_17": 0.1220703125, "loss_aux_layer_18": 0.131591796875, "loss_aux_layer_19": 0.136474609375, "loss_aux_layer_2": 0.05291748046875, "loss_aux_layer_20": 0.14453125, "loss_aux_layer_21": 0.15185546875, "loss_aux_layer_22": 0.17138671875, "loss_aux_layer_23": 0.20849609375, "loss_aux_layer_3": 0.06280517578125, "loss_aux_layer_4": 0.0654296875, "loss_aux_layer_5": 0.06732177734375, "loss_aux_layer_6": 0.0701904296875, "loss_aux_layer_7": 0.06787109375, "loss_aux_layer_8": 0.067138671875, "loss_aux_layer_9": 0.06585693359375, "step": 2164, "total_loss": 0.6280190944671631 }, { "epoch": 0.42862799445654326, "grad_norm": 1.2787690162658691, "learning_rate": 5e-05, "llm_loss": 0.5608193129301071, "loss": 2.6265, "loss_aux_layer_0": 0.020965576171875, "loss_aux_layer_1": 0.040771484375, "loss_aux_layer_10": 0.068115234375, "loss_aux_layer_11": 0.0732421875, "loss_aux_layer_12": 0.078857421875, "loss_aux_layer_13": 0.085693359375, "loss_aux_layer_14": 0.095703125, "loss_aux_layer_15": 0.10595703125, "loss_aux_layer_16": 0.1168212890625, "loss_aux_layer_17": 0.12548828125, "loss_aux_layer_18": 0.134765625, "loss_aux_layer_19": 0.138427734375, "loss_aux_layer_2": 0.05267333984375, "loss_aux_layer_20": 0.146728515625, "loss_aux_layer_21": 0.15478515625, "loss_aux_layer_22": 0.176025390625, "loss_aux_layer_23": 0.2158203125, "loss_aux_layer_3": 0.0625, "loss_aux_layer_4": 0.06536865234375, "loss_aux_layer_5": 0.0673828125, "loss_aux_layer_6": 0.07080078125, "loss_aux_layer_7": 0.0682373046875, "loss_aux_layer_8": 0.067626953125, "loss_aux_layer_9": 0.066650390625, "step": 2165, "total_loss": 0.6566254496574402 }, { "epoch": 0.42882597505444464, "grad_norm": 1.071581244468689, "learning_rate": 5e-05, "llm_loss": 0.6069945394992828, "loss": 2.7972, "loss_aux_layer_0": 0.021942138671875, "loss_aux_layer_1": 0.04156494140625, "loss_aux_layer_10": 0.0679931640625, "loss_aux_layer_11": 0.0723876953125, "loss_aux_layer_12": 0.0772705078125, "loss_aux_layer_13": 0.0828857421875, "loss_aux_layer_14": 0.09130859375, "loss_aux_layer_15": 0.0997314453125, "loss_aux_layer_16": 0.108642578125, "loss_aux_layer_17": 0.1165771484375, "loss_aux_layer_18": 0.125, "loss_aux_layer_19": 0.128662109375, "loss_aux_layer_2": 0.05426025390625, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.144287109375, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.064697265625, "loss_aux_layer_4": 0.0672607421875, "loss_aux_layer_5": 0.06884765625, "loss_aux_layer_6": 0.0718994140625, "loss_aux_layer_7": 0.0692138671875, "loss_aux_layer_8": 0.068115234375, "loss_aux_layer_9": 0.066650390625, "step": 2166, "total_loss": 0.6993092745542526 }, { "epoch": 0.4290239556523461, "grad_norm": 1.3760395050048828, "learning_rate": 5e-05, "llm_loss": 0.543370746076107, "loss": 2.545, "loss_aux_layer_0": 0.021026611328125, "loss_aux_layer_1": 0.04052734375, "loss_aux_layer_10": 0.0679931640625, "loss_aux_layer_11": 0.072021484375, "loss_aux_layer_12": 0.0767822265625, "loss_aux_layer_13": 0.082763671875, "loss_aux_layer_14": 0.0919189453125, "loss_aux_layer_15": 0.1014404296875, "loss_aux_layer_16": 0.11181640625, "loss_aux_layer_17": 0.11962890625, "loss_aux_layer_18": 0.128173828125, "loss_aux_layer_19": 0.1318359375, "loss_aux_layer_2": 0.05322265625, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.147705078125, "loss_aux_layer_22": 0.16748046875, "loss_aux_layer_23": 0.205322265625, "loss_aux_layer_3": 0.0634765625, "loss_aux_layer_4": 0.066162109375, "loss_aux_layer_5": 0.06793212890625, "loss_aux_layer_6": 0.0706787109375, "loss_aux_layer_7": 0.0682373046875, "loss_aux_layer_8": 0.06744384765625, "loss_aux_layer_9": 0.0662841796875, "step": 2167, "total_loss": 0.6362597346305847 }, { "epoch": 0.42922193625024746, "grad_norm": 1.1940616369247437, "learning_rate": 5e-05, "llm_loss": 0.634342759847641, "loss": 2.9371, "loss_aux_layer_0": 0.02032470703125, "loss_aux_layer_1": 0.04473876953125, "loss_aux_layer_10": 0.07470703125, "loss_aux_layer_11": 0.0797119140625, "loss_aux_layer_12": 0.0850830078125, "loss_aux_layer_13": 0.091552734375, "loss_aux_layer_14": 0.1009521484375, "loss_aux_layer_15": 0.1103515625, "loss_aux_layer_16": 0.1204833984375, "loss_aux_layer_17": 0.128662109375, "loss_aux_layer_18": 0.136962890625, "loss_aux_layer_19": 0.139404296875, "loss_aux_layer_2": 0.0587158203125, "loss_aux_layer_20": 0.14697265625, "loss_aux_layer_21": 0.154296875, "loss_aux_layer_22": 0.175048828125, "loss_aux_layer_23": 0.212646484375, "loss_aux_layer_3": 0.0697021484375, "loss_aux_layer_4": 0.0728759765625, "loss_aux_layer_5": 0.07470703125, "loss_aux_layer_6": 0.0777587890625, "loss_aux_layer_7": 0.0755615234375, "loss_aux_layer_8": 0.074462890625, "loss_aux_layer_9": 0.0731201171875, "step": 2168, "total_loss": 0.7342837005853653 }, { "epoch": 0.4294199168481489, "grad_norm": 0.9176029562950134, "learning_rate": 5e-05, "llm_loss": 0.5387669056653976, "loss": 2.5496, "loss_aux_layer_0": 0.020751953125, "loss_aux_layer_1": 0.04437255859375, "loss_aux_layer_10": 0.072998046875, "loss_aux_layer_11": 0.077880859375, "loss_aux_layer_12": 0.083251953125, "loss_aux_layer_13": 0.0899658203125, "loss_aux_layer_14": 0.099609375, "loss_aux_layer_15": 0.10888671875, "loss_aux_layer_16": 0.1192626953125, "loss_aux_layer_17": 0.1273193359375, "loss_aux_layer_18": 0.1357421875, "loss_aux_layer_19": 0.137451171875, "loss_aux_layer_2": 0.05816650390625, "loss_aux_layer_20": 0.14453125, "loss_aux_layer_21": 0.15234375, "loss_aux_layer_22": 0.17333984375, "loss_aux_layer_23": 0.2119140625, "loss_aux_layer_3": 0.0693359375, "loss_aux_layer_4": 0.072021484375, "loss_aux_layer_5": 0.0736083984375, "loss_aux_layer_6": 0.07666015625, "loss_aux_layer_7": 0.07373046875, "loss_aux_layer_8": 0.0728759765625, "loss_aux_layer_9": 0.0714111328125, "step": 2169, "total_loss": 0.63739113509655 }, { "epoch": 0.4296178974460503, "grad_norm": 1.1118448972702026, "learning_rate": 5e-05, "llm_loss": 0.6976463198661804, "loss": 3.1929, "loss_aux_layer_0": 0.019989013671875, "loss_aux_layer_1": 0.04449462890625, "loss_aux_layer_10": 0.0755615234375, "loss_aux_layer_11": 0.0806884765625, "loss_aux_layer_12": 0.0859375, "loss_aux_layer_13": 0.092529296875, "loss_aux_layer_14": 0.1016845703125, "loss_aux_layer_15": 0.1109619140625, "loss_aux_layer_16": 0.12060546875, "loss_aux_layer_17": 0.1292724609375, "loss_aux_layer_18": 0.137451171875, "loss_aux_layer_19": 0.139892578125, "loss_aux_layer_2": 0.058837890625, "loss_aux_layer_20": 0.14697265625, "loss_aux_layer_21": 0.154052734375, "loss_aux_layer_22": 0.17578125, "loss_aux_layer_23": 0.213134765625, "loss_aux_layer_3": 0.0703125, "loss_aux_layer_4": 0.0738525390625, "loss_aux_layer_5": 0.075927734375, "loss_aux_layer_6": 0.0792236328125, "loss_aux_layer_7": 0.07666015625, "loss_aux_layer_8": 0.0755615234375, "loss_aux_layer_9": 0.074462890625, "step": 2170, "total_loss": 0.7982275038957596 }, { "epoch": 0.4298158780439517, "grad_norm": 1.1287884712219238, "learning_rate": 5e-05, "llm_loss": 0.5934341251850128, "loss": 2.7575, "loss_aux_layer_0": 0.021728515625, "loss_aux_layer_1": 0.0416259765625, "loss_aux_layer_10": 0.0699462890625, "loss_aux_layer_11": 0.07470703125, "loss_aux_layer_12": 0.0797119140625, "loss_aux_layer_13": 0.086181640625, "loss_aux_layer_14": 0.095703125, "loss_aux_layer_15": 0.1053466796875, "loss_aux_layer_16": 0.1156005859375, "loss_aux_layer_17": 0.123779296875, "loss_aux_layer_18": 0.133056640625, "loss_aux_layer_19": 0.135986328125, "loss_aux_layer_2": 0.05487060546875, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.171630859375, "loss_aux_layer_23": 0.210693359375, "loss_aux_layer_3": 0.06524658203125, "loss_aux_layer_4": 0.0679931640625, "loss_aux_layer_5": 0.0703125, "loss_aux_layer_6": 0.0733642578125, "loss_aux_layer_7": 0.0706787109375, "loss_aux_layer_8": 0.070068359375, "loss_aux_layer_9": 0.0687255859375, "step": 2171, "total_loss": 0.6893796324729919 }, { "epoch": 0.4300138586418531, "grad_norm": 1.0472034215927124, "learning_rate": 5e-05, "llm_loss": 0.6387530714273453, "loss": 2.9444, "loss_aux_layer_0": 0.021087646484375, "loss_aux_layer_1": 0.042236328125, "loss_aux_layer_10": 0.0704345703125, "loss_aux_layer_11": 0.075439453125, "loss_aux_layer_12": 0.0804443359375, "loss_aux_layer_13": 0.0872802734375, "loss_aux_layer_14": 0.0972900390625, "loss_aux_layer_15": 0.1070556640625, "loss_aux_layer_16": 0.1177978515625, "loss_aux_layer_17": 0.1253662109375, "loss_aux_layer_18": 0.134033203125, "loss_aux_layer_19": 0.137451171875, "loss_aux_layer_2": 0.05572509765625, "loss_aux_layer_20": 0.14599609375, "loss_aux_layer_21": 0.15380859375, "loss_aux_layer_22": 0.17626953125, "loss_aux_layer_23": 0.215576171875, "loss_aux_layer_3": 0.0665283203125, "loss_aux_layer_4": 0.0693359375, "loss_aux_layer_5": 0.0714111328125, "loss_aux_layer_6": 0.0743408203125, "loss_aux_layer_7": 0.0714111328125, "loss_aux_layer_8": 0.0704345703125, "loss_aux_layer_9": 0.06890869140625, "step": 2172, "total_loss": 0.7361036837100983 }, { "epoch": 0.4302118392397545, "grad_norm": 1.503767728805542, "learning_rate": 5e-05, "llm_loss": 0.6967703998088837, "loss": 3.1634, "loss_aux_layer_0": 0.020751953125, "loss_aux_layer_1": 0.04205322265625, "loss_aux_layer_10": 0.069091796875, "loss_aux_layer_11": 0.07373046875, "loss_aux_layer_12": 0.0789794921875, "loss_aux_layer_13": 0.085205078125, "loss_aux_layer_14": 0.0943603515625, "loss_aux_layer_15": 0.1038818359375, "loss_aux_layer_16": 0.1138916015625, "loss_aux_layer_17": 0.121826171875, "loss_aux_layer_18": 0.129638671875, "loss_aux_layer_19": 0.132080078125, "loss_aux_layer_2": 0.0548095703125, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.1669921875, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.06494140625, "loss_aux_layer_4": 0.0679931640625, "loss_aux_layer_5": 0.069580078125, "loss_aux_layer_6": 0.072021484375, "loss_aux_layer_7": 0.069580078125, "loss_aux_layer_8": 0.0687255859375, "loss_aux_layer_9": 0.0677490234375, "step": 2173, "total_loss": 0.7908413261175156 }, { "epoch": 0.4304098198376559, "grad_norm": 1.0365386009216309, "learning_rate": 5e-05, "llm_loss": 0.5734231173992157, "loss": 2.678, "loss_aux_layer_0": 0.02069091796875, "loss_aux_layer_1": 0.04205322265625, "loss_aux_layer_10": 0.0704345703125, "loss_aux_layer_11": 0.0751953125, "loss_aux_layer_12": 0.0802001953125, "loss_aux_layer_13": 0.086181640625, "loss_aux_layer_14": 0.09521484375, "loss_aux_layer_15": 0.1048583984375, "loss_aux_layer_16": 0.1146240234375, "loss_aux_layer_17": 0.123046875, "loss_aux_layer_18": 0.131591796875, "loss_aux_layer_19": 0.135498046875, "loss_aux_layer_2": 0.05511474609375, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.15234375, "loss_aux_layer_22": 0.1748046875, "loss_aux_layer_23": 0.2138671875, "loss_aux_layer_3": 0.06561279296875, "loss_aux_layer_4": 0.0684814453125, "loss_aux_layer_5": 0.0701904296875, "loss_aux_layer_6": 0.0731201171875, "loss_aux_layer_7": 0.07080078125, "loss_aux_layer_8": 0.0699462890625, "loss_aux_layer_9": 0.0687255859375, "step": 2174, "total_loss": 0.669502779841423 }, { "epoch": 0.4306078004355573, "grad_norm": 1.1002951860427856, "learning_rate": 5e-05, "llm_loss": 0.5794489532709122, "loss": 2.7108, "loss_aux_layer_0": 0.020660400390625, "loss_aux_layer_1": 0.04266357421875, "loss_aux_layer_10": 0.072265625, "loss_aux_layer_11": 0.077392578125, "loss_aux_layer_12": 0.08251953125, "loss_aux_layer_13": 0.08935546875, "loss_aux_layer_14": 0.0989990234375, "loss_aux_layer_15": 0.108642578125, "loss_aux_layer_16": 0.119140625, "loss_aux_layer_17": 0.1270751953125, "loss_aux_layer_18": 0.135498046875, "loss_aux_layer_19": 0.138671875, "loss_aux_layer_2": 0.05645751953125, "loss_aux_layer_20": 0.14599609375, "loss_aux_layer_21": 0.153564453125, "loss_aux_layer_22": 0.173828125, "loss_aux_layer_23": 0.21240234375, "loss_aux_layer_3": 0.0677490234375, "loss_aux_layer_4": 0.0706787109375, "loss_aux_layer_5": 0.0726318359375, "loss_aux_layer_6": 0.07568359375, "loss_aux_layer_7": 0.0733642578125, "loss_aux_layer_8": 0.072265625, "loss_aux_layer_9": 0.07080078125, "step": 2175, "total_loss": 0.6776998490095139 }, { "epoch": 0.43080578103345873, "grad_norm": 1.411653757095337, "learning_rate": 5e-05, "llm_loss": 0.6198568046092987, "loss": 2.8665, "loss_aux_layer_0": 0.020416259765625, "loss_aux_layer_1": 0.04315185546875, "loss_aux_layer_10": 0.07177734375, "loss_aux_layer_11": 0.07666015625, "loss_aux_layer_12": 0.0821533203125, "loss_aux_layer_13": 0.0887451171875, "loss_aux_layer_14": 0.09814453125, "loss_aux_layer_15": 0.1072998046875, "loss_aux_layer_16": 0.1171875, "loss_aux_layer_17": 0.1241455078125, "loss_aux_layer_18": 0.1328125, "loss_aux_layer_19": 0.135986328125, "loss_aux_layer_2": 0.05584716796875, "loss_aux_layer_20": 0.142822265625, "loss_aux_layer_21": 0.150146484375, "loss_aux_layer_22": 0.1708984375, "loss_aux_layer_23": 0.209228515625, "loss_aux_layer_3": 0.0665283203125, "loss_aux_layer_4": 0.0694580078125, "loss_aux_layer_5": 0.071533203125, "loss_aux_layer_6": 0.0740966796875, "loss_aux_layer_7": 0.07177734375, "loss_aux_layer_8": 0.0711669921875, "loss_aux_layer_9": 0.0701904296875, "step": 2176, "total_loss": 0.7166367620229721 }, { "epoch": 0.4310037616313601, "grad_norm": 0.8862756490707397, "learning_rate": 5e-05, "llm_loss": 0.5495885536074638, "loss": 2.5773, "loss_aux_layer_0": 0.02227783203125, "loss_aux_layer_1": 0.04144287109375, "loss_aux_layer_10": 0.0679931640625, "loss_aux_layer_11": 0.072509765625, "loss_aux_layer_12": 0.07763671875, "loss_aux_layer_13": 0.083984375, "loss_aux_layer_14": 0.09375, "loss_aux_layer_15": 0.1036376953125, "loss_aux_layer_16": 0.1142578125, "loss_aux_layer_17": 0.1229248046875, "loss_aux_layer_18": 0.1317138671875, "loss_aux_layer_19": 0.135498046875, "loss_aux_layer_2": 0.053466796875, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.151611328125, "loss_aux_layer_22": 0.1728515625, "loss_aux_layer_23": 0.211669921875, "loss_aux_layer_3": 0.063720703125, "loss_aux_layer_4": 0.06640625, "loss_aux_layer_5": 0.068359375, "loss_aux_layer_6": 0.0714111328125, "loss_aux_layer_7": 0.0687255859375, "loss_aux_layer_8": 0.06787109375, "loss_aux_layer_9": 0.066650390625, "step": 2177, "total_loss": 0.644332766532898 }, { "epoch": 0.43120174222926155, "grad_norm": 1.193024754524231, "learning_rate": 5e-05, "llm_loss": 0.6235923171043396, "loss": 2.8819, "loss_aux_layer_0": 0.022186279296875, "loss_aux_layer_1": 0.04229736328125, "loss_aux_layer_10": 0.06982421875, "loss_aux_layer_11": 0.0745849609375, "loss_aux_layer_12": 0.080078125, "loss_aux_layer_13": 0.0869140625, "loss_aux_layer_14": 0.0970458984375, "loss_aux_layer_15": 0.106689453125, "loss_aux_layer_16": 0.117919921875, "loss_aux_layer_17": 0.125732421875, "loss_aux_layer_18": 0.13525390625, "loss_aux_layer_19": 0.138427734375, "loss_aux_layer_2": 0.0546875, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.154052734375, "loss_aux_layer_22": 0.17578125, "loss_aux_layer_23": 0.21435546875, "loss_aux_layer_3": 0.0653076171875, "loss_aux_layer_4": 0.0682373046875, "loss_aux_layer_5": 0.069580078125, "loss_aux_layer_6": 0.072509765625, "loss_aux_layer_7": 0.0699462890625, "loss_aux_layer_8": 0.0694580078125, "loss_aux_layer_9": 0.0679931640625, "step": 2178, "total_loss": 0.7204843312501907 }, { "epoch": 0.43139972282716293, "grad_norm": 1.2231031656265259, "learning_rate": 5e-05, "llm_loss": 0.5649882182478905, "loss": 2.6333, "loss_aux_layer_0": 0.02081298828125, "loss_aux_layer_1": 0.03961181640625, "loss_aux_layer_10": 0.06805419921875, "loss_aux_layer_11": 0.07275390625, "loss_aux_layer_12": 0.0780029296875, "loss_aux_layer_13": 0.08447265625, "loss_aux_layer_14": 0.093994140625, "loss_aux_layer_15": 0.103515625, "loss_aux_layer_16": 0.11279296875, "loss_aux_layer_17": 0.12109375, "loss_aux_layer_18": 0.13037109375, "loss_aux_layer_19": 0.1334228515625, "loss_aux_layer_2": 0.0521240234375, "loss_aux_layer_20": 0.14111328125, "loss_aux_layer_21": 0.1484375, "loss_aux_layer_22": 0.168701171875, "loss_aux_layer_23": 0.20703125, "loss_aux_layer_3": 0.06219482421875, "loss_aux_layer_4": 0.06494140625, "loss_aux_layer_5": 0.0667724609375, "loss_aux_layer_6": 0.069580078125, "loss_aux_layer_7": 0.0673828125, "loss_aux_layer_8": 0.0670166015625, "loss_aux_layer_9": 0.06634521484375, "step": 2179, "total_loss": 0.6583262234926224 }, { "epoch": 0.4315977034250643, "grad_norm": 1.7036234140396118, "learning_rate": 5e-05, "llm_loss": 0.6837854981422424, "loss": 3.1266, "loss_aux_layer_0": 0.0206298828125, "loss_aux_layer_1": 0.0439453125, "loss_aux_layer_10": 0.072509765625, "loss_aux_layer_11": 0.07666015625, "loss_aux_layer_12": 0.08203125, "loss_aux_layer_13": 0.088134765625, "loss_aux_layer_14": 0.0975341796875, "loss_aux_layer_15": 0.1072998046875, "loss_aux_layer_16": 0.117431640625, "loss_aux_layer_17": 0.12548828125, "loss_aux_layer_18": 0.1337890625, "loss_aux_layer_19": 0.13671875, "loss_aux_layer_2": 0.05718994140625, "loss_aux_layer_20": 0.14453125, "loss_aux_layer_21": 0.151123046875, "loss_aux_layer_22": 0.173095703125, "loss_aux_layer_23": 0.2119140625, "loss_aux_layer_3": 0.0682373046875, "loss_aux_layer_4": 0.071533203125, "loss_aux_layer_5": 0.0733642578125, "loss_aux_layer_6": 0.0767822265625, "loss_aux_layer_7": 0.073974609375, "loss_aux_layer_8": 0.0731201171875, "loss_aux_layer_9": 0.071533203125, "step": 2180, "total_loss": 0.7816565036773682 }, { "epoch": 0.43179568402296575, "grad_norm": 1.0331753492355347, "learning_rate": 5e-05, "llm_loss": 0.6044365465641022, "loss": 2.811, "loss_aux_layer_0": 0.022003173828125, "loss_aux_layer_1": 0.04473876953125, "loss_aux_layer_10": 0.073974609375, "loss_aux_layer_11": 0.07861328125, "loss_aux_layer_12": 0.083740234375, "loss_aux_layer_13": 0.0897216796875, "loss_aux_layer_14": 0.0987548828125, "loss_aux_layer_15": 0.1077880859375, "loss_aux_layer_16": 0.1171875, "loss_aux_layer_17": 0.1246337890625, "loss_aux_layer_18": 0.1318359375, "loss_aux_layer_19": 0.134521484375, "loss_aux_layer_2": 0.05853271484375, "loss_aux_layer_20": 0.142333984375, "loss_aux_layer_21": 0.150146484375, "loss_aux_layer_22": 0.173095703125, "loss_aux_layer_23": 0.210693359375, "loss_aux_layer_3": 0.0697021484375, "loss_aux_layer_4": 0.072998046875, "loss_aux_layer_5": 0.07470703125, "loss_aux_layer_6": 0.0780029296875, "loss_aux_layer_7": 0.0750732421875, "loss_aux_layer_8": 0.073974609375, "loss_aux_layer_9": 0.0723876953125, "step": 2181, "total_loss": 0.7027428895235062 }, { "epoch": 0.43199366462086713, "grad_norm": 1.2639235258102417, "learning_rate": 5e-05, "llm_loss": 0.5765101835131645, "loss": 2.7113, "loss_aux_layer_0": 0.02117919921875, "loss_aux_layer_1": 0.04547119140625, "loss_aux_layer_10": 0.075927734375, "loss_aux_layer_11": 0.08056640625, "loss_aux_layer_12": 0.085693359375, "loss_aux_layer_13": 0.09228515625, "loss_aux_layer_14": 0.1021728515625, "loss_aux_layer_15": 0.111572265625, "loss_aux_layer_16": 0.121826171875, "loss_aux_layer_17": 0.12939453125, "loss_aux_layer_18": 0.1376953125, "loss_aux_layer_19": 0.14013671875, "loss_aux_layer_2": 0.06011962890625, "loss_aux_layer_20": 0.147705078125, "loss_aux_layer_21": 0.1552734375, "loss_aux_layer_22": 0.17724609375, "loss_aux_layer_23": 0.21630859375, "loss_aux_layer_3": 0.0714111328125, "loss_aux_layer_4": 0.074462890625, "loss_aux_layer_5": 0.07666015625, "loss_aux_layer_6": 0.0802001953125, "loss_aux_layer_7": 0.0772705078125, "loss_aux_layer_8": 0.076171875, "loss_aux_layer_9": 0.0748291015625, "step": 2182, "total_loss": 0.6778172701597214 }, { "epoch": 0.43219164521876857, "grad_norm": 1.0729113817214966, "learning_rate": 5e-05, "llm_loss": 0.6024535298347473, "loss": 2.8024, "loss_aux_layer_0": 0.022186279296875, "loss_aux_layer_1": 0.04425048828125, "loss_aux_layer_10": 0.072998046875, "loss_aux_layer_11": 0.077880859375, "loss_aux_layer_12": 0.0830078125, "loss_aux_layer_13": 0.08935546875, "loss_aux_layer_14": 0.098388671875, "loss_aux_layer_15": 0.108154296875, "loss_aux_layer_16": 0.11767578125, "loss_aux_layer_17": 0.125732421875, "loss_aux_layer_18": 0.13427734375, "loss_aux_layer_19": 0.13671875, "loss_aux_layer_2": 0.05810546875, "loss_aux_layer_20": 0.14404296875, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.172607421875, "loss_aux_layer_23": 0.21044921875, "loss_aux_layer_3": 0.068603515625, "loss_aux_layer_4": 0.071533203125, "loss_aux_layer_5": 0.073486328125, "loss_aux_layer_6": 0.076416015625, "loss_aux_layer_7": 0.0738525390625, "loss_aux_layer_8": 0.072998046875, "loss_aux_layer_9": 0.071533203125, "step": 2183, "total_loss": 0.700594425201416 }, { "epoch": 0.43238962581666995, "grad_norm": 1.5290412902832031, "learning_rate": 5e-05, "llm_loss": 0.5776437968015671, "loss": 2.6868, "loss_aux_layer_0": 0.02264404296875, "loss_aux_layer_1": 0.04119873046875, "loss_aux_layer_10": 0.068359375, "loss_aux_layer_11": 0.0728759765625, "loss_aux_layer_12": 0.0780029296875, "loss_aux_layer_13": 0.083984375, "loss_aux_layer_14": 0.093017578125, "loss_aux_layer_15": 0.1024169921875, "loss_aux_layer_16": 0.1121826171875, "loss_aux_layer_17": 0.120849609375, "loss_aux_layer_18": 0.12939453125, "loss_aux_layer_19": 0.13232421875, "loss_aux_layer_2": 0.0543212890625, "loss_aux_layer_20": 0.140869140625, "loss_aux_layer_21": 0.1484375, "loss_aux_layer_22": 0.169921875, "loss_aux_layer_23": 0.20947265625, "loss_aux_layer_3": 0.064697265625, "loss_aux_layer_4": 0.0673828125, "loss_aux_layer_5": 0.0692138671875, "loss_aux_layer_6": 0.07177734375, "loss_aux_layer_7": 0.0689697265625, "loss_aux_layer_8": 0.0682373046875, "loss_aux_layer_9": 0.06689453125, "step": 2184, "total_loss": 0.6717097759246826 }, { "epoch": 0.4325876064145714, "grad_norm": 0.9426499009132385, "learning_rate": 5e-05, "llm_loss": 0.6380407065153122, "loss": 2.922, "loss_aux_layer_0": 0.02130126953125, "loss_aux_layer_1": 0.0386962890625, "loss_aux_layer_10": 0.065673828125, "loss_aux_layer_11": 0.06982421875, "loss_aux_layer_12": 0.075439453125, "loss_aux_layer_13": 0.0814208984375, "loss_aux_layer_14": 0.0911865234375, "loss_aux_layer_15": 0.100830078125, "loss_aux_layer_16": 0.1116943359375, "loss_aux_layer_17": 0.1201171875, "loss_aux_layer_18": 0.1290283203125, "loss_aux_layer_19": 0.133544921875, "loss_aux_layer_2": 0.0513916015625, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.149658203125, "loss_aux_layer_22": 0.171142578125, "loss_aux_layer_23": 0.210205078125, "loss_aux_layer_3": 0.06134033203125, "loss_aux_layer_4": 0.06390380859375, "loss_aux_layer_5": 0.06549072265625, "loss_aux_layer_6": 0.0684814453125, "loss_aux_layer_7": 0.06597900390625, "loss_aux_layer_8": 0.0654296875, "loss_aux_layer_9": 0.06427001953125, "step": 2185, "total_loss": 0.7304907143115997 }, { "epoch": 0.43278558701247277, "grad_norm": 1.517162561416626, "learning_rate": 5e-05, "llm_loss": 0.59007228910923, "loss": 2.753, "loss_aux_layer_0": 0.02099609375, "loss_aux_layer_1": 0.0428466796875, "loss_aux_layer_10": 0.072998046875, "loss_aux_layer_11": 0.07763671875, "loss_aux_layer_12": 0.083251953125, "loss_aux_layer_13": 0.090087890625, "loss_aux_layer_14": 0.0999755859375, "loss_aux_layer_15": 0.1099853515625, "loss_aux_layer_16": 0.1201171875, "loss_aux_layer_17": 0.1279296875, "loss_aux_layer_18": 0.135986328125, "loss_aux_layer_19": 0.137939453125, "loss_aux_layer_2": 0.05633544921875, "loss_aux_layer_20": 0.14501953125, "loss_aux_layer_21": 0.152099609375, "loss_aux_layer_22": 0.171630859375, "loss_aux_layer_23": 0.209228515625, "loss_aux_layer_3": 0.0670166015625, "loss_aux_layer_4": 0.0704345703125, "loss_aux_layer_5": 0.072509765625, "loss_aux_layer_6": 0.075439453125, "loss_aux_layer_7": 0.072998046875, "loss_aux_layer_8": 0.0723876953125, "loss_aux_layer_9": 0.0714111328125, "step": 2186, "total_loss": 0.6882513165473938 }, { "epoch": 0.4329835676103742, "grad_norm": 1.0622940063476562, "learning_rate": 5e-05, "llm_loss": 0.6378493458032608, "loss": 2.9359, "loss_aux_layer_0": 0.0216064453125, "loss_aux_layer_1": 0.04168701171875, "loss_aux_layer_10": 0.070068359375, "loss_aux_layer_11": 0.07470703125, "loss_aux_layer_12": 0.0799560546875, "loss_aux_layer_13": 0.0859375, "loss_aux_layer_14": 0.095947265625, "loss_aux_layer_15": 0.105712890625, "loss_aux_layer_16": 0.1162109375, "loss_aux_layer_17": 0.1248779296875, "loss_aux_layer_18": 0.133544921875, "loss_aux_layer_19": 0.1376953125, "loss_aux_layer_2": 0.0545654296875, "loss_aux_layer_20": 0.1455078125, "loss_aux_layer_21": 0.15234375, "loss_aux_layer_22": 0.17333984375, "loss_aux_layer_23": 0.2119140625, "loss_aux_layer_3": 0.0650634765625, "loss_aux_layer_4": 0.06787109375, "loss_aux_layer_5": 0.0693359375, "loss_aux_layer_6": 0.072265625, "loss_aux_layer_7": 0.06982421875, "loss_aux_layer_8": 0.0693359375, "loss_aux_layer_9": 0.068359375, "step": 2187, "total_loss": 0.7339716106653214 }, { "epoch": 0.4331815482082756, "grad_norm": 1.0343406200408936, "learning_rate": 5e-05, "llm_loss": 0.6214550286531448, "loss": 2.881, "loss_aux_layer_0": 0.021331787109375, "loss_aux_layer_1": 0.0438232421875, "loss_aux_layer_10": 0.0733642578125, "loss_aux_layer_11": 0.0780029296875, "loss_aux_layer_12": 0.0833740234375, "loss_aux_layer_13": 0.089111328125, "loss_aux_layer_14": 0.0982666015625, "loss_aux_layer_15": 0.1077880859375, "loss_aux_layer_16": 0.117431640625, "loss_aux_layer_17": 0.12548828125, "loss_aux_layer_18": 0.13330078125, "loss_aux_layer_19": 0.13720703125, "loss_aux_layer_2": 0.0574951171875, "loss_aux_layer_20": 0.145263671875, "loss_aux_layer_21": 0.15478515625, "loss_aux_layer_22": 0.17822265625, "loss_aux_layer_23": 0.218994140625, "loss_aux_layer_3": 0.068359375, "loss_aux_layer_4": 0.0714111328125, "loss_aux_layer_5": 0.0733642578125, "loss_aux_layer_6": 0.076416015625, "loss_aux_layer_7": 0.07373046875, "loss_aux_layer_8": 0.0731201171875, "loss_aux_layer_9": 0.071533203125, "step": 2188, "total_loss": 0.7202428132295609 }, { "epoch": 0.433379528806177, "grad_norm": 1.0607813596725464, "learning_rate": 5e-05, "llm_loss": 0.7246528565883636, "loss": 3.2988, "loss_aux_layer_0": 0.020904541015625, "loss_aux_layer_1": 0.044677734375, "loss_aux_layer_10": 0.07470703125, "loss_aux_layer_11": 0.07958984375, "loss_aux_layer_12": 0.0845947265625, "loss_aux_layer_13": 0.0909423828125, "loss_aux_layer_14": 0.1002197265625, "loss_aux_layer_15": 0.1097412109375, "loss_aux_layer_16": 0.119873046875, "loss_aux_layer_17": 0.1282958984375, "loss_aux_layer_18": 0.136474609375, "loss_aux_layer_19": 0.139404296875, "loss_aux_layer_2": 0.0587158203125, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.154052734375, "loss_aux_layer_22": 0.175537109375, "loss_aux_layer_23": 0.21337890625, "loss_aux_layer_3": 0.0704345703125, "loss_aux_layer_4": 0.073974609375, "loss_aux_layer_5": 0.07568359375, "loss_aux_layer_6": 0.0789794921875, "loss_aux_layer_7": 0.0760498046875, "loss_aux_layer_8": 0.074951171875, "loss_aux_layer_9": 0.0733642578125, "step": 2189, "total_loss": 0.8246896713972092 }, { "epoch": 0.4335775094040784, "grad_norm": 1.165020227432251, "learning_rate": 5e-05, "llm_loss": 0.5431810468435287, "loss": 2.5665, "loss_aux_layer_0": 0.02154541015625, "loss_aux_layer_1": 0.04443359375, "loss_aux_layer_10": 0.0723876953125, "loss_aux_layer_11": 0.0771484375, "loss_aux_layer_12": 0.0823974609375, "loss_aux_layer_13": 0.0889892578125, "loss_aux_layer_14": 0.0985107421875, "loss_aux_layer_15": 0.1080322265625, "loss_aux_layer_16": 0.117919921875, "loss_aux_layer_17": 0.126220703125, "loss_aux_layer_18": 0.134765625, "loss_aux_layer_19": 0.136962890625, "loss_aux_layer_2": 0.05780029296875, "loss_aux_layer_20": 0.144775390625, "loss_aux_layer_21": 0.153076171875, "loss_aux_layer_22": 0.175537109375, "loss_aux_layer_23": 0.214599609375, "loss_aux_layer_3": 0.068115234375, "loss_aux_layer_4": 0.0714111328125, "loss_aux_layer_5": 0.073486328125, "loss_aux_layer_6": 0.076416015625, "loss_aux_layer_7": 0.073974609375, "loss_aux_layer_8": 0.072998046875, "loss_aux_layer_9": 0.071533203125, "step": 2190, "total_loss": 0.6416286826133728 }, { "epoch": 0.4337754900019798, "grad_norm": 1.2390025854110718, "learning_rate": 5e-05, "llm_loss": 0.5124584138393402, "loss": 2.4435, "loss_aux_layer_0": 0.0220947265625, "loss_aux_layer_1": 0.04254150390625, "loss_aux_layer_10": 0.07177734375, "loss_aux_layer_11": 0.07666015625, "loss_aux_layer_12": 0.08203125, "loss_aux_layer_13": 0.088623046875, "loss_aux_layer_14": 0.0986328125, "loss_aux_layer_15": 0.108642578125, "loss_aux_layer_16": 0.1187744140625, "loss_aux_layer_17": 0.1268310546875, "loss_aux_layer_18": 0.1357421875, "loss_aux_layer_19": 0.138916015625, "loss_aux_layer_2": 0.05615234375, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.15478515625, "loss_aux_layer_22": 0.177978515625, "loss_aux_layer_23": 0.2177734375, "loss_aux_layer_3": 0.0667724609375, "loss_aux_layer_4": 0.0697021484375, "loss_aux_layer_5": 0.071533203125, "loss_aux_layer_6": 0.074951171875, "loss_aux_layer_7": 0.072509765625, "loss_aux_layer_8": 0.0716552734375, "loss_aux_layer_9": 0.0704345703125, "step": 2191, "total_loss": 0.6108763366937637 }, { "epoch": 0.43397347059988123, "grad_norm": 1.240959644317627, "learning_rate": 5e-05, "llm_loss": 0.6037567108869553, "loss": 2.804, "loss_aux_layer_0": 0.021759033203125, "loss_aux_layer_1": 0.04156494140625, "loss_aux_layer_10": 0.0706787109375, "loss_aux_layer_11": 0.0755615234375, "loss_aux_layer_12": 0.0809326171875, "loss_aux_layer_13": 0.087158203125, "loss_aux_layer_14": 0.0972900390625, "loss_aux_layer_15": 0.107666015625, "loss_aux_layer_16": 0.1185302734375, "loss_aux_layer_17": 0.1273193359375, "loss_aux_layer_18": 0.135498046875, "loss_aux_layer_19": 0.138671875, "loss_aux_layer_2": 0.05462646484375, "loss_aux_layer_20": 0.146728515625, "loss_aux_layer_21": 0.154296875, "loss_aux_layer_22": 0.1748046875, "loss_aux_layer_23": 0.212890625, "loss_aux_layer_3": 0.0655517578125, "loss_aux_layer_4": 0.0687255859375, "loss_aux_layer_5": 0.0706787109375, "loss_aux_layer_6": 0.0733642578125, "loss_aux_layer_7": 0.0709228515625, "loss_aux_layer_8": 0.070068359375, "loss_aux_layer_9": 0.069091796875, "step": 2192, "total_loss": 0.7010120898485184 }, { "epoch": 0.4341714511977826, "grad_norm": 1.5388588905334473, "learning_rate": 5e-05, "llm_loss": 0.6384487003087997, "loss": 2.9368, "loss_aux_layer_0": 0.02294921875, "loss_aux_layer_1": 0.04168701171875, "loss_aux_layer_10": 0.0692138671875, "loss_aux_layer_11": 0.0740966796875, "loss_aux_layer_12": 0.0797119140625, "loss_aux_layer_13": 0.0860595703125, "loss_aux_layer_14": 0.0958251953125, "loss_aux_layer_15": 0.10595703125, "loss_aux_layer_16": 0.1163330078125, "loss_aux_layer_17": 0.1239013671875, "loss_aux_layer_18": 0.13232421875, "loss_aux_layer_19": 0.135986328125, "loss_aux_layer_2": 0.0545654296875, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.151123046875, "loss_aux_layer_22": 0.171142578125, "loss_aux_layer_23": 0.210205078125, "loss_aux_layer_3": 0.0650634765625, "loss_aux_layer_4": 0.0679931640625, "loss_aux_layer_5": 0.06964111328125, "loss_aux_layer_6": 0.07275390625, "loss_aux_layer_7": 0.07025146484375, "loss_aux_layer_8": 0.069580078125, "loss_aux_layer_9": 0.06805419921875, "step": 2193, "total_loss": 0.7342077791690826 }, { "epoch": 0.43436943179568405, "grad_norm": 0.9870959520339966, "learning_rate": 5e-05, "llm_loss": 0.6014549881219864, "loss": 2.7938, "loss_aux_layer_0": 0.020751953125, "loss_aux_layer_1": 0.0423583984375, "loss_aux_layer_10": 0.0711669921875, "loss_aux_layer_11": 0.075927734375, "loss_aux_layer_12": 0.0814208984375, "loss_aux_layer_13": 0.08740234375, "loss_aux_layer_14": 0.0963134765625, "loss_aux_layer_15": 0.1055908203125, "loss_aux_layer_16": 0.1153564453125, "loss_aux_layer_17": 0.1234130859375, "loss_aux_layer_18": 0.132080078125, "loss_aux_layer_19": 0.1357421875, "loss_aux_layer_2": 0.0560302734375, "loss_aux_layer_20": 0.14404296875, "loss_aux_layer_21": 0.152587890625, "loss_aux_layer_22": 0.1748046875, "loss_aux_layer_23": 0.21533203125, "loss_aux_layer_3": 0.0672607421875, "loss_aux_layer_4": 0.0703125, "loss_aux_layer_5": 0.072021484375, "loss_aux_layer_6": 0.0748291015625, "loss_aux_layer_7": 0.072021484375, "loss_aux_layer_8": 0.0712890625, "loss_aux_layer_9": 0.06982421875, "step": 2194, "total_loss": 0.698440209031105 }, { "epoch": 0.43456741239358543, "grad_norm": 1.2141401767730713, "learning_rate": 5e-05, "llm_loss": 0.5874768123030663, "loss": 2.7174, "loss_aux_layer_0": 0.021148681640625, "loss_aux_layer_1": 0.0404052734375, "loss_aux_layer_10": 0.0670166015625, "loss_aux_layer_11": 0.0711669921875, "loss_aux_layer_12": 0.076171875, "loss_aux_layer_13": 0.08251953125, "loss_aux_layer_14": 0.0911865234375, "loss_aux_layer_15": 0.1002197265625, "loss_aux_layer_16": 0.110107421875, "loss_aux_layer_17": 0.1181640625, "loss_aux_layer_18": 0.126708984375, "loss_aux_layer_19": 0.1298828125, "loss_aux_layer_2": 0.0528564453125, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.145263671875, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.2041015625, "loss_aux_layer_3": 0.06298828125, "loss_aux_layer_4": 0.06585693359375, "loss_aux_layer_5": 0.067138671875, "loss_aux_layer_6": 0.0699462890625, "loss_aux_layer_7": 0.0675048828125, "loss_aux_layer_8": 0.066650390625, "loss_aux_layer_9": 0.0655517578125, "step": 2195, "total_loss": 0.6793620586395264 }, { "epoch": 0.4347653929914868, "grad_norm": 0.898274302482605, "learning_rate": 5e-05, "llm_loss": 0.6095045804977417, "loss": 2.8152, "loss_aux_layer_0": 0.022003173828125, "loss_aux_layer_1": 0.04046630859375, "loss_aux_layer_10": 0.068603515625, "loss_aux_layer_11": 0.0731201171875, "loss_aux_layer_12": 0.078369140625, "loss_aux_layer_13": 0.08447265625, "loss_aux_layer_14": 0.09375, "loss_aux_layer_15": 0.1029052734375, "loss_aux_layer_16": 0.1134033203125, "loss_aux_layer_17": 0.1220703125, "loss_aux_layer_18": 0.130615234375, "loss_aux_layer_19": 0.134033203125, "loss_aux_layer_2": 0.05340576171875, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.148193359375, "loss_aux_layer_22": 0.170166015625, "loss_aux_layer_23": 0.208251953125, "loss_aux_layer_3": 0.0638427734375, "loss_aux_layer_4": 0.067138671875, "loss_aux_layer_5": 0.069091796875, "loss_aux_layer_6": 0.072021484375, "loss_aux_layer_7": 0.069580078125, "loss_aux_layer_8": 0.06884765625, "loss_aux_layer_9": 0.067138671875, "step": 2196, "total_loss": 0.703802615404129 }, { "epoch": 0.43496337358938825, "grad_norm": 1.0908442735671997, "learning_rate": 5e-05, "llm_loss": 0.7231741547584534, "loss": 3.275, "loss_aux_layer_0": 0.021240234375, "loss_aux_layer_1": 0.04217529296875, "loss_aux_layer_10": 0.0703125, "loss_aux_layer_11": 0.0748291015625, "loss_aux_layer_12": 0.0799560546875, "loss_aux_layer_13": 0.0863037109375, "loss_aux_layer_14": 0.095947265625, "loss_aux_layer_15": 0.10546875, "loss_aux_layer_16": 0.1151123046875, "loss_aux_layer_17": 0.1234130859375, "loss_aux_layer_18": 0.131103515625, "loss_aux_layer_19": 0.133544921875, "loss_aux_layer_2": 0.0546875, "loss_aux_layer_20": 0.141357421875, "loss_aux_layer_21": 0.149169921875, "loss_aux_layer_22": 0.169921875, "loss_aux_layer_23": 0.2080078125, "loss_aux_layer_3": 0.0660400390625, "loss_aux_layer_4": 0.0692138671875, "loss_aux_layer_5": 0.0711669921875, "loss_aux_layer_6": 0.07421875, "loss_aux_layer_7": 0.071533203125, "loss_aux_layer_8": 0.070556640625, "loss_aux_layer_9": 0.06884765625, "step": 2197, "total_loss": 0.8187451809644699 }, { "epoch": 0.43516135418728963, "grad_norm": 0.8141379952430725, "learning_rate": 5e-05, "llm_loss": 0.5757324919104576, "loss": 2.6925, "loss_aux_layer_0": 0.020751953125, "loss_aux_layer_1": 0.0443115234375, "loss_aux_layer_10": 0.0733642578125, "loss_aux_layer_11": 0.078125, "loss_aux_layer_12": 0.0830078125, "loss_aux_layer_13": 0.0892333984375, "loss_aux_layer_14": 0.0980224609375, "loss_aux_layer_15": 0.1070556640625, "loss_aux_layer_16": 0.116455078125, "loss_aux_layer_17": 0.12451171875, "loss_aux_layer_18": 0.132080078125, "loss_aux_layer_19": 0.134521484375, "loss_aux_layer_2": 0.0579833984375, "loss_aux_layer_20": 0.14111328125, "loss_aux_layer_21": 0.148681640625, "loss_aux_layer_22": 0.169189453125, "loss_aux_layer_23": 0.206787109375, "loss_aux_layer_3": 0.0689697265625, "loss_aux_layer_4": 0.072021484375, "loss_aux_layer_5": 0.0738525390625, "loss_aux_layer_6": 0.0771484375, "loss_aux_layer_7": 0.074462890625, "loss_aux_layer_8": 0.07373046875, "loss_aux_layer_9": 0.072021484375, "step": 2198, "total_loss": 0.6731290221214294 }, { "epoch": 0.43535933478519107, "grad_norm": 0.9189867377281189, "learning_rate": 5e-05, "llm_loss": 0.5978357419371605, "loss": 2.7713, "loss_aux_layer_0": 0.021484375, "loss_aux_layer_1": 0.0413818359375, "loss_aux_layer_10": 0.070556640625, "loss_aux_layer_11": 0.0753173828125, "loss_aux_layer_12": 0.0802001953125, "loss_aux_layer_13": 0.08642578125, "loss_aux_layer_14": 0.09521484375, "loss_aux_layer_15": 0.104248046875, "loss_aux_layer_16": 0.11376953125, "loss_aux_layer_17": 0.12158203125, "loss_aux_layer_18": 0.130126953125, "loss_aux_layer_19": 0.133544921875, "loss_aux_layer_2": 0.05462646484375, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.206787109375, "loss_aux_layer_3": 0.065673828125, "loss_aux_layer_4": 0.068603515625, "loss_aux_layer_5": 0.0704345703125, "loss_aux_layer_6": 0.073486328125, "loss_aux_layer_7": 0.07080078125, "loss_aux_layer_8": 0.0703125, "loss_aux_layer_9": 0.069091796875, "step": 2199, "total_loss": 0.6928256899118423 }, { "epoch": 0.43555731538309245, "grad_norm": 0.8616800904273987, "learning_rate": 5e-05, "llm_loss": 0.5031132474541664, "loss": 2.3869, "loss_aux_layer_0": 0.02117919921875, "loss_aux_layer_1": 0.04052734375, "loss_aux_layer_10": 0.067626953125, "loss_aux_layer_11": 0.0721435546875, "loss_aux_layer_12": 0.0771484375, "loss_aux_layer_13": 0.08349609375, "loss_aux_layer_14": 0.0928955078125, "loss_aux_layer_15": 0.1021728515625, "loss_aux_layer_16": 0.1124267578125, "loss_aux_layer_17": 0.12109375, "loss_aux_layer_18": 0.130126953125, "loss_aux_layer_19": 0.13330078125, "loss_aux_layer_2": 0.0535888671875, "loss_aux_layer_20": 0.14111328125, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.169189453125, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.063720703125, "loss_aux_layer_4": 0.066650390625, "loss_aux_layer_5": 0.0682373046875, "loss_aux_layer_6": 0.0711669921875, "loss_aux_layer_7": 0.0684814453125, "loss_aux_layer_8": 0.0677490234375, "loss_aux_layer_9": 0.0662841796875, "step": 2200, "total_loss": 0.5967197567224503 }, { "epoch": 0.4357552959809939, "grad_norm": 0.7848851680755615, "learning_rate": 5e-05, "llm_loss": 0.5347007811069489, "loss": 2.5235, "loss_aux_layer_0": 0.021240234375, "loss_aux_layer_1": 0.04119873046875, "loss_aux_layer_10": 0.069091796875, "loss_aux_layer_11": 0.07373046875, "loss_aux_layer_12": 0.079345703125, "loss_aux_layer_13": 0.0858154296875, "loss_aux_layer_14": 0.095947265625, "loss_aux_layer_15": 0.10595703125, "loss_aux_layer_16": 0.1168212890625, "loss_aux_layer_17": 0.1248779296875, "loss_aux_layer_18": 0.133544921875, "loss_aux_layer_19": 0.13720703125, "loss_aux_layer_2": 0.05426025390625, "loss_aux_layer_20": 0.144775390625, "loss_aux_layer_21": 0.15283203125, "loss_aux_layer_22": 0.1748046875, "loss_aux_layer_23": 0.214599609375, "loss_aux_layer_3": 0.06475830078125, "loss_aux_layer_4": 0.067626953125, "loss_aux_layer_5": 0.069580078125, "loss_aux_layer_6": 0.0728759765625, "loss_aux_layer_7": 0.0699462890625, "loss_aux_layer_8": 0.069091796875, "loss_aux_layer_9": 0.0679931640625, "step": 2201, "total_loss": 0.6308702826499939 }, { "epoch": 0.43595327657889527, "grad_norm": 0.8821750283241272, "learning_rate": 5e-05, "llm_loss": 0.6239030659198761, "loss": 2.8674, "loss_aux_layer_0": 0.020965576171875, "loss_aux_layer_1": 0.0391845703125, "loss_aux_layer_10": 0.06689453125, "loss_aux_layer_11": 0.0712890625, "loss_aux_layer_12": 0.07666015625, "loss_aux_layer_13": 0.0828857421875, "loss_aux_layer_14": 0.0927734375, "loss_aux_layer_15": 0.1025390625, "loss_aux_layer_16": 0.1131591796875, "loss_aux_layer_17": 0.121826171875, "loss_aux_layer_18": 0.131103515625, "loss_aux_layer_19": 0.134033203125, "loss_aux_layer_2": 0.05145263671875, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.148193359375, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.0618896484375, "loss_aux_layer_4": 0.0650634765625, "loss_aux_layer_5": 0.06695556640625, "loss_aux_layer_6": 0.0701904296875, "loss_aux_layer_7": 0.0675048828125, "loss_aux_layer_8": 0.0667724609375, "loss_aux_layer_9": 0.0657958984375, "step": 2202, "total_loss": 0.7168495953083038 }, { "epoch": 0.43615125717679665, "grad_norm": 0.8588657379150391, "learning_rate": 5e-05, "llm_loss": 0.6213693097233772, "loss": 2.8559, "loss_aux_layer_0": 0.021148681640625, "loss_aux_layer_1": 0.0413818359375, "loss_aux_layer_10": 0.068115234375, "loss_aux_layer_11": 0.0726318359375, "loss_aux_layer_12": 0.0775146484375, "loss_aux_layer_13": 0.0836181640625, "loss_aux_layer_14": 0.092529296875, "loss_aux_layer_15": 0.10205078125, "loss_aux_layer_16": 0.1109619140625, "loss_aux_layer_17": 0.11865234375, "loss_aux_layer_18": 0.1273193359375, "loss_aux_layer_19": 0.1295166015625, "loss_aux_layer_2": 0.053955078125, "loss_aux_layer_20": 0.136962890625, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.20068359375, "loss_aux_layer_3": 0.064208984375, "loss_aux_layer_4": 0.067138671875, "loss_aux_layer_5": 0.0689697265625, "loss_aux_layer_6": 0.0721435546875, "loss_aux_layer_7": 0.0693359375, "loss_aux_layer_8": 0.06854248046875, "loss_aux_layer_9": 0.067138671875, "step": 2203, "total_loss": 0.7139756977558136 }, { "epoch": 0.4363492377746981, "grad_norm": 0.7868391275405884, "learning_rate": 5e-05, "llm_loss": 0.5261051580309868, "loss": 2.4939, "loss_aux_layer_0": 0.020782470703125, "loss_aux_layer_1": 0.0433349609375, "loss_aux_layer_10": 0.072509765625, "loss_aux_layer_11": 0.0775146484375, "loss_aux_layer_12": 0.082763671875, "loss_aux_layer_13": 0.0888671875, "loss_aux_layer_14": 0.098388671875, "loss_aux_layer_15": 0.1077880859375, "loss_aux_layer_16": 0.1173095703125, "loss_aux_layer_17": 0.1246337890625, "loss_aux_layer_18": 0.1324462890625, "loss_aux_layer_19": 0.135009765625, "loss_aux_layer_2": 0.05706787109375, "loss_aux_layer_20": 0.142822265625, "loss_aux_layer_21": 0.150146484375, "loss_aux_layer_22": 0.171630859375, "loss_aux_layer_23": 0.211181640625, "loss_aux_layer_3": 0.06781005859375, "loss_aux_layer_4": 0.07080078125, "loss_aux_layer_5": 0.07269287109375, "loss_aux_layer_6": 0.07568359375, "loss_aux_layer_7": 0.07305908203125, "loss_aux_layer_8": 0.07220458984375, "loss_aux_layer_9": 0.07122802734375, "step": 2204, "total_loss": 0.6234738826751709 }, { "epoch": 0.43654721837259947, "grad_norm": 1.2203853130340576, "learning_rate": 5e-05, "llm_loss": 0.6361196339130402, "loss": 2.9296, "loss_aux_layer_0": 0.02093505859375, "loss_aux_layer_1": 0.04278564453125, "loss_aux_layer_10": 0.071533203125, "loss_aux_layer_11": 0.0762939453125, "loss_aux_layer_12": 0.081298828125, "loss_aux_layer_13": 0.0870361328125, "loss_aux_layer_14": 0.09619140625, "loss_aux_layer_15": 0.1055908203125, "loss_aux_layer_16": 0.1151123046875, "loss_aux_layer_17": 0.1229248046875, "loss_aux_layer_18": 0.131103515625, "loss_aux_layer_19": 0.1337890625, "loss_aux_layer_2": 0.056640625, "loss_aux_layer_20": 0.140869140625, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.1708984375, "loss_aux_layer_23": 0.210693359375, "loss_aux_layer_3": 0.0672607421875, "loss_aux_layer_4": 0.070068359375, "loss_aux_layer_5": 0.0718994140625, "loss_aux_layer_6": 0.0750732421875, "loss_aux_layer_7": 0.0726318359375, "loss_aux_layer_8": 0.0714111328125, "loss_aux_layer_9": 0.070068359375, "step": 2205, "total_loss": 0.7324002236127853 }, { "epoch": 0.4367451989705009, "grad_norm": 0.8334263563156128, "learning_rate": 5e-05, "llm_loss": 0.5800063908100128, "loss": 2.7039, "loss_aux_layer_0": 0.02117919921875, "loss_aux_layer_1": 0.04180908203125, "loss_aux_layer_10": 0.071044921875, "loss_aux_layer_11": 0.0755615234375, "loss_aux_layer_12": 0.0804443359375, "loss_aux_layer_13": 0.086181640625, "loss_aux_layer_14": 0.0953369140625, "loss_aux_layer_15": 0.10400390625, "loss_aux_layer_16": 0.11376953125, "loss_aux_layer_17": 0.1219482421875, "loss_aux_layer_18": 0.1298828125, "loss_aux_layer_19": 0.13427734375, "loss_aux_layer_2": 0.0555419921875, "loss_aux_layer_20": 0.14208984375, "loss_aux_layer_21": 0.14990234375, "loss_aux_layer_22": 0.171875, "loss_aux_layer_23": 0.21044921875, "loss_aux_layer_3": 0.0670166015625, "loss_aux_layer_4": 0.070068359375, "loss_aux_layer_5": 0.07183837890625, "loss_aux_layer_6": 0.0753173828125, "loss_aux_layer_7": 0.072509765625, "loss_aux_layer_8": 0.07122802734375, "loss_aux_layer_9": 0.0699462890625, "step": 2206, "total_loss": 0.6759685724973679 }, { "epoch": 0.4369431795684023, "grad_norm": 1.0169951915740967, "learning_rate": 5e-05, "llm_loss": 0.6016772240400314, "loss": 2.8169, "loss_aux_layer_0": 0.021087646484375, "loss_aux_layer_1": 0.04632568359375, "loss_aux_layer_10": 0.0777587890625, "loss_aux_layer_11": 0.0831298828125, "loss_aux_layer_12": 0.088623046875, "loss_aux_layer_13": 0.095458984375, "loss_aux_layer_14": 0.1051025390625, "loss_aux_layer_15": 0.1148681640625, "loss_aux_layer_16": 0.1246337890625, "loss_aux_layer_17": 0.132080078125, "loss_aux_layer_18": 0.139404296875, "loss_aux_layer_19": 0.14111328125, "loss_aux_layer_2": 0.06085205078125, "loss_aux_layer_20": 0.1474609375, "loss_aux_layer_21": 0.154296875, "loss_aux_layer_22": 0.175537109375, "loss_aux_layer_23": 0.213623046875, "loss_aux_layer_3": 0.0726318359375, "loss_aux_layer_4": 0.0758056640625, "loss_aux_layer_5": 0.0780029296875, "loss_aux_layer_6": 0.0810546875, "loss_aux_layer_7": 0.0784912109375, "loss_aux_layer_8": 0.0777587890625, "loss_aux_layer_9": 0.076416015625, "step": 2207, "total_loss": 0.7042286694049835 }, { "epoch": 0.4371411601663037, "grad_norm": 0.7953896522521973, "learning_rate": 5e-05, "llm_loss": 0.6652550101280212, "loss": 3.0423, "loss_aux_layer_0": 0.0208740234375, "loss_aux_layer_1": 0.04168701171875, "loss_aux_layer_10": 0.07080078125, "loss_aux_layer_11": 0.07568359375, "loss_aux_layer_12": 0.0809326171875, "loss_aux_layer_13": 0.0870361328125, "loss_aux_layer_14": 0.096435546875, "loss_aux_layer_15": 0.1055908203125, "loss_aux_layer_16": 0.1151123046875, "loss_aux_layer_17": 0.123046875, "loss_aux_layer_18": 0.13134765625, "loss_aux_layer_19": 0.133544921875, "loss_aux_layer_2": 0.05462646484375, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.147705078125, "loss_aux_layer_22": 0.167724609375, "loss_aux_layer_23": 0.20556640625, "loss_aux_layer_3": 0.06549072265625, "loss_aux_layer_4": 0.06854248046875, "loss_aux_layer_5": 0.070556640625, "loss_aux_layer_6": 0.0736083984375, "loss_aux_layer_7": 0.0711669921875, "loss_aux_layer_8": 0.07037353515625, "loss_aux_layer_9": 0.0693359375, "step": 2208, "total_loss": 0.7605749219655991 }, { "epoch": 0.4373391407642051, "grad_norm": 0.992749035358429, "learning_rate": 5e-05, "llm_loss": 0.5547692626714706, "loss": 2.6215, "loss_aux_layer_0": 0.022186279296875, "loss_aux_layer_1": 0.04473876953125, "loss_aux_layer_10": 0.07470703125, "loss_aux_layer_11": 0.080078125, "loss_aux_layer_12": 0.0858154296875, "loss_aux_layer_13": 0.0927734375, "loss_aux_layer_14": 0.1025390625, "loss_aux_layer_15": 0.1119384765625, "loss_aux_layer_16": 0.1220703125, "loss_aux_layer_17": 0.1300048828125, "loss_aux_layer_18": 0.1376953125, "loss_aux_layer_19": 0.140380859375, "loss_aux_layer_2": 0.05816650390625, "loss_aux_layer_20": 0.147705078125, "loss_aux_layer_21": 0.1552734375, "loss_aux_layer_22": 0.17578125, "loss_aux_layer_23": 0.21337890625, "loss_aux_layer_3": 0.0694580078125, "loss_aux_layer_4": 0.0728759765625, "loss_aux_layer_5": 0.07470703125, "loss_aux_layer_6": 0.078369140625, "loss_aux_layer_7": 0.07568359375, "loss_aux_layer_8": 0.0750732421875, "loss_aux_layer_9": 0.073486328125, "step": 2209, "total_loss": 0.6553669422864914 }, { "epoch": 0.43753712136210654, "grad_norm": 0.8455342650413513, "learning_rate": 5e-05, "llm_loss": 0.4852147400379181, "loss": 2.3345, "loss_aux_layer_0": 0.02227783203125, "loss_aux_layer_1": 0.04327392578125, "loss_aux_layer_10": 0.0721435546875, "loss_aux_layer_11": 0.076904296875, "loss_aux_layer_12": 0.0821533203125, "loss_aux_layer_13": 0.0885009765625, "loss_aux_layer_14": 0.098388671875, "loss_aux_layer_15": 0.10791015625, "loss_aux_layer_16": 0.1180419921875, "loss_aux_layer_17": 0.126220703125, "loss_aux_layer_18": 0.134521484375, "loss_aux_layer_19": 0.137451171875, "loss_aux_layer_2": 0.0567626953125, "loss_aux_layer_20": 0.1455078125, "loss_aux_layer_21": 0.154296875, "loss_aux_layer_22": 0.177001953125, "loss_aux_layer_23": 0.216796875, "loss_aux_layer_3": 0.0677490234375, "loss_aux_layer_4": 0.0706787109375, "loss_aux_layer_5": 0.072509765625, "loss_aux_layer_6": 0.075927734375, "loss_aux_layer_7": 0.0732421875, "loss_aux_layer_8": 0.0723876953125, "loss_aux_layer_9": 0.0709228515625, "step": 2210, "total_loss": 0.5836345702409744 }, { "epoch": 0.4377351019600079, "grad_norm": 1.019761085510254, "learning_rate": 5e-05, "llm_loss": 0.644866332411766, "loss": 2.9648, "loss_aux_layer_0": 0.022613525390625, "loss_aux_layer_1": 0.04180908203125, "loss_aux_layer_10": 0.0701904296875, "loss_aux_layer_11": 0.0750732421875, "loss_aux_layer_12": 0.0804443359375, "loss_aux_layer_13": 0.0869140625, "loss_aux_layer_14": 0.0966796875, "loss_aux_layer_15": 0.1064453125, "loss_aux_layer_16": 0.116943359375, "loss_aux_layer_17": 0.125244140625, "loss_aux_layer_18": 0.1337890625, "loss_aux_layer_19": 0.136962890625, "loss_aux_layer_2": 0.05419921875, "loss_aux_layer_20": 0.14453125, "loss_aux_layer_21": 0.152099609375, "loss_aux_layer_22": 0.171875, "loss_aux_layer_23": 0.21044921875, "loss_aux_layer_3": 0.06488037109375, "loss_aux_layer_4": 0.0682373046875, "loss_aux_layer_5": 0.0701904296875, "loss_aux_layer_6": 0.072998046875, "loss_aux_layer_7": 0.0706787109375, "loss_aux_layer_8": 0.0697021484375, "loss_aux_layer_9": 0.0687255859375, "step": 2211, "total_loss": 0.7412047982215881 }, { "epoch": 0.4379330825579093, "grad_norm": 0.88743656873703, "learning_rate": 5e-05, "llm_loss": 0.5995589941740036, "loss": 2.7702, "loss_aux_layer_0": 0.021392822265625, "loss_aux_layer_1": 0.041015625, "loss_aux_layer_10": 0.0673828125, "loss_aux_layer_11": 0.07177734375, "loss_aux_layer_12": 0.076904296875, "loss_aux_layer_13": 0.0831298828125, "loss_aux_layer_14": 0.0926513671875, "loss_aux_layer_15": 0.10205078125, "loss_aux_layer_16": 0.112060546875, "loss_aux_layer_17": 0.1207275390625, "loss_aux_layer_18": 0.1287841796875, "loss_aux_layer_19": 0.13232421875, "loss_aux_layer_2": 0.0531005859375, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.1669921875, "loss_aux_layer_23": 0.203857421875, "loss_aux_layer_3": 0.063232421875, "loss_aux_layer_4": 0.0660400390625, "loss_aux_layer_5": 0.06787109375, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.06884765625, "loss_aux_layer_8": 0.0677490234375, "loss_aux_layer_9": 0.06640625, "step": 2212, "total_loss": 0.6925463527441025 }, { "epoch": 0.43813106315581074, "grad_norm": 1.2460873126983643, "learning_rate": 5e-05, "llm_loss": 0.5739506483078003, "loss": 2.6937, "loss_aux_layer_0": 0.021484375, "loss_aux_layer_1": 0.04376220703125, "loss_aux_layer_10": 0.07421875, "loss_aux_layer_11": 0.0792236328125, "loss_aux_layer_12": 0.0845947265625, "loss_aux_layer_13": 0.0909423828125, "loss_aux_layer_14": 0.10107421875, "loss_aux_layer_15": 0.110595703125, "loss_aux_layer_16": 0.1202392578125, "loss_aux_layer_17": 0.1275634765625, "loss_aux_layer_18": 0.135986328125, "loss_aux_layer_19": 0.138671875, "loss_aux_layer_2": 0.0574951171875, "loss_aux_layer_20": 0.145751953125, "loss_aux_layer_21": 0.1533203125, "loss_aux_layer_22": 0.174072265625, "loss_aux_layer_23": 0.213134765625, "loss_aux_layer_3": 0.06884765625, "loss_aux_layer_4": 0.0721435546875, "loss_aux_layer_5": 0.07421875, "loss_aux_layer_6": 0.0777587890625, "loss_aux_layer_7": 0.0751953125, "loss_aux_layer_8": 0.07421875, "loss_aux_layer_9": 0.0726318359375, "step": 2213, "total_loss": 0.6734131425619125 }, { "epoch": 0.4383290437537121, "grad_norm": 1.083551049232483, "learning_rate": 5e-05, "llm_loss": 0.5222615003585815, "loss": 2.4748, "loss_aux_layer_0": 0.020538330078125, "loss_aux_layer_1": 0.04193115234375, "loss_aux_layer_10": 0.07073974609375, "loss_aux_layer_11": 0.0753173828125, "loss_aux_layer_12": 0.080322265625, "loss_aux_layer_13": 0.0865478515625, "loss_aux_layer_14": 0.095947265625, "loss_aux_layer_15": 0.1055908203125, "loss_aux_layer_16": 0.115478515625, "loss_aux_layer_17": 0.123291015625, "loss_aux_layer_18": 0.132080078125, "loss_aux_layer_19": 0.134765625, "loss_aux_layer_2": 0.05560302734375, "loss_aux_layer_20": 0.14306640625, "loss_aux_layer_21": 0.15185546875, "loss_aux_layer_22": 0.17431640625, "loss_aux_layer_23": 0.2138671875, "loss_aux_layer_3": 0.06640625, "loss_aux_layer_4": 0.0693359375, "loss_aux_layer_5": 0.0711669921875, "loss_aux_layer_6": 0.0743408203125, "loss_aux_layer_7": 0.0716552734375, "loss_aux_layer_8": 0.07080078125, "loss_aux_layer_9": 0.06939697265625, "step": 2214, "total_loss": 0.618711993098259 }, { "epoch": 0.43852702435161356, "grad_norm": 1.2405978441238403, "learning_rate": 5e-05, "llm_loss": 0.6220309734344482, "loss": 2.8614, "loss_aux_layer_0": 0.020843505859375, "loss_aux_layer_1": 0.03973388671875, "loss_aux_layer_10": 0.0673828125, "loss_aux_layer_11": 0.0716552734375, "loss_aux_layer_12": 0.0770263671875, "loss_aux_layer_13": 0.0831298828125, "loss_aux_layer_14": 0.0927734375, "loss_aux_layer_15": 0.1024169921875, "loss_aux_layer_16": 0.1123046875, "loss_aux_layer_17": 0.12060546875, "loss_aux_layer_18": 0.12890625, "loss_aux_layer_19": 0.1328125, "loss_aux_layer_2": 0.05279541015625, "loss_aux_layer_20": 0.140869140625, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.169921875, "loss_aux_layer_23": 0.208740234375, "loss_aux_layer_3": 0.06280517578125, "loss_aux_layer_4": 0.0655517578125, "loss_aux_layer_5": 0.0675048828125, "loss_aux_layer_6": 0.0706787109375, "loss_aux_layer_7": 0.0682373046875, "loss_aux_layer_8": 0.0672607421875, "loss_aux_layer_9": 0.0660400390625, "step": 2215, "total_loss": 0.7153415083885193 }, { "epoch": 0.43872500494951494, "grad_norm": 1.2995948791503906, "learning_rate": 5e-05, "llm_loss": 0.5804157629609108, "loss": 2.7125, "loss_aux_layer_0": 0.02154541015625, "loss_aux_layer_1": 0.04266357421875, "loss_aux_layer_10": 0.0711669921875, "loss_aux_layer_11": 0.076171875, "loss_aux_layer_12": 0.0819091796875, "loss_aux_layer_13": 0.0887451171875, "loss_aux_layer_14": 0.09912109375, "loss_aux_layer_15": 0.1090087890625, "loss_aux_layer_16": 0.1192626953125, "loss_aux_layer_17": 0.1265869140625, "loss_aux_layer_18": 0.135009765625, "loss_aux_layer_19": 0.137939453125, "loss_aux_layer_2": 0.0557861328125, "loss_aux_layer_20": 0.145263671875, "loss_aux_layer_21": 0.152587890625, "loss_aux_layer_22": 0.17333984375, "loss_aux_layer_23": 0.213134765625, "loss_aux_layer_3": 0.06646728515625, "loss_aux_layer_4": 0.0693359375, "loss_aux_layer_5": 0.0711669921875, "loss_aux_layer_6": 0.0745849609375, "loss_aux_layer_7": 0.072021484375, "loss_aux_layer_8": 0.0711669921875, "loss_aux_layer_9": 0.0699462890625, "step": 2216, "total_loss": 0.6781160533428192 }, { "epoch": 0.4389229855474164, "grad_norm": 1.883179783821106, "learning_rate": 5e-05, "llm_loss": 0.6084965169429779, "loss": 2.8043, "loss_aux_layer_0": 0.021514892578125, "loss_aux_layer_1": 0.04052734375, "loss_aux_layer_10": 0.06622314453125, "loss_aux_layer_11": 0.0706787109375, "loss_aux_layer_12": 0.075927734375, "loss_aux_layer_13": 0.082275390625, "loss_aux_layer_14": 0.0911865234375, "loss_aux_layer_15": 0.100830078125, "loss_aux_layer_16": 0.1112060546875, "loss_aux_layer_17": 0.1190185546875, "loss_aux_layer_18": 0.1279296875, "loss_aux_layer_19": 0.1318359375, "loss_aux_layer_2": 0.0535888671875, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.209228515625, "loss_aux_layer_3": 0.0631103515625, "loss_aux_layer_4": 0.06561279296875, "loss_aux_layer_5": 0.067138671875, "loss_aux_layer_6": 0.0693359375, "loss_aux_layer_7": 0.0670166015625, "loss_aux_layer_8": 0.06591796875, "loss_aux_layer_9": 0.06494140625, "step": 2217, "total_loss": 0.701070174574852 }, { "epoch": 0.43912096614531776, "grad_norm": 1.7506452798843384, "learning_rate": 5e-05, "llm_loss": 0.6795927435159683, "loss": 3.1027, "loss_aux_layer_0": 0.021514892578125, "loss_aux_layer_1": 0.04168701171875, "loss_aux_layer_10": 0.070556640625, "loss_aux_layer_11": 0.0750732421875, "loss_aux_layer_12": 0.08056640625, "loss_aux_layer_13": 0.0875244140625, "loss_aux_layer_14": 0.09716796875, "loss_aux_layer_15": 0.1070556640625, "loss_aux_layer_16": 0.1175537109375, "loss_aux_layer_17": 0.1259765625, "loss_aux_layer_18": 0.13427734375, "loss_aux_layer_19": 0.136474609375, "loss_aux_layer_2": 0.05419921875, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.14990234375, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.206298828125, "loss_aux_layer_3": 0.0648193359375, "loss_aux_layer_4": 0.0679931640625, "loss_aux_layer_5": 0.07025146484375, "loss_aux_layer_6": 0.073486328125, "loss_aux_layer_7": 0.0709228515625, "loss_aux_layer_8": 0.070068359375, "loss_aux_layer_9": 0.06903076171875, "step": 2218, "total_loss": 0.7756662368774414 }, { "epoch": 0.43931894674321914, "grad_norm": 1.2990270853042603, "learning_rate": 5e-05, "llm_loss": 0.6596280187368393, "loss": 2.9981, "loss_aux_layer_0": 0.021942138671875, "loss_aux_layer_1": 0.03790283203125, "loss_aux_layer_10": 0.06396484375, "loss_aux_layer_11": 0.068603515625, "loss_aux_layer_12": 0.07373046875, "loss_aux_layer_13": 0.079345703125, "loss_aux_layer_14": 0.088623046875, "loss_aux_layer_15": 0.0982666015625, "loss_aux_layer_16": 0.1080322265625, "loss_aux_layer_17": 0.11669921875, "loss_aux_layer_18": 0.1253662109375, "loss_aux_layer_19": 0.12939453125, "loss_aux_layer_2": 0.04998779296875, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.1455078125, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.05926513671875, "loss_aux_layer_4": 0.0618896484375, "loss_aux_layer_5": 0.063720703125, "loss_aux_layer_6": 0.066650390625, "loss_aux_layer_7": 0.06439208984375, "loss_aux_layer_8": 0.0634765625, "loss_aux_layer_9": 0.0626220703125, "step": 2219, "total_loss": 0.7495153397321701 }, { "epoch": 0.4395169273411206, "grad_norm": 1.8470020294189453, "learning_rate": 5e-05, "llm_loss": 0.620688870549202, "loss": 2.8546, "loss_aux_layer_0": 0.0203857421875, "loss_aux_layer_1": 0.0401611328125, "loss_aux_layer_10": 0.0675048828125, "loss_aux_layer_11": 0.072021484375, "loss_aux_layer_12": 0.0775146484375, "loss_aux_layer_13": 0.083984375, "loss_aux_layer_14": 0.09375, "loss_aux_layer_15": 0.102783203125, "loss_aux_layer_16": 0.1129150390625, "loss_aux_layer_17": 0.121337890625, "loss_aux_layer_18": 0.1298828125, "loss_aux_layer_19": 0.13330078125, "loss_aux_layer_2": 0.0517578125, "loss_aux_layer_20": 0.140380859375, "loss_aux_layer_21": 0.146240234375, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.0621337890625, "loss_aux_layer_4": 0.0655517578125, "loss_aux_layer_5": 0.067626953125, "loss_aux_layer_6": 0.0704345703125, "loss_aux_layer_7": 0.0679931640625, "loss_aux_layer_8": 0.0673828125, "loss_aux_layer_9": 0.066162109375, "step": 2220, "total_loss": 0.7136475145816803 }, { "epoch": 0.43971490793902196, "grad_norm": 0.9833162426948547, "learning_rate": 5e-05, "llm_loss": 0.575930118560791, "loss": 2.6914, "loss_aux_layer_0": 0.021728515625, "loss_aux_layer_1": 0.04290771484375, "loss_aux_layer_10": 0.072509765625, "loss_aux_layer_11": 0.077392578125, "loss_aux_layer_12": 0.0826416015625, "loss_aux_layer_13": 0.0887451171875, "loss_aux_layer_14": 0.0982666015625, "loss_aux_layer_15": 0.1072998046875, "loss_aux_layer_16": 0.1173095703125, "loss_aux_layer_17": 0.124755859375, "loss_aux_layer_18": 0.1328125, "loss_aux_layer_19": 0.135498046875, "loss_aux_layer_2": 0.0556640625, "loss_aux_layer_20": 0.142578125, "loss_aux_layer_21": 0.1494140625, "loss_aux_layer_22": 0.169189453125, "loss_aux_layer_23": 0.2060546875, "loss_aux_layer_3": 0.0670166015625, "loss_aux_layer_4": 0.0704345703125, "loss_aux_layer_5": 0.0721435546875, "loss_aux_layer_6": 0.075439453125, "loss_aux_layer_7": 0.0731201171875, "loss_aux_layer_8": 0.0726318359375, "loss_aux_layer_9": 0.0712890625, "step": 2221, "total_loss": 0.6728555411100388 }, { "epoch": 0.4399128885369234, "grad_norm": 1.4053562879562378, "learning_rate": 5e-05, "llm_loss": 0.6966273933649063, "loss": 3.1674, "loss_aux_layer_0": 0.01995849609375, "loss_aux_layer_1": 0.04266357421875, "loss_aux_layer_10": 0.071533203125, "loss_aux_layer_11": 0.0760498046875, "loss_aux_layer_12": 0.0811767578125, "loss_aux_layer_13": 0.0870361328125, "loss_aux_layer_14": 0.0963134765625, "loss_aux_layer_15": 0.105224609375, "loss_aux_layer_16": 0.1146240234375, "loss_aux_layer_17": 0.1220703125, "loss_aux_layer_18": 0.130859375, "loss_aux_layer_19": 0.13330078125, "loss_aux_layer_2": 0.05517578125, "loss_aux_layer_20": 0.14013671875, "loss_aux_layer_21": 0.146240234375, "loss_aux_layer_22": 0.16455078125, "loss_aux_layer_23": 0.20166015625, "loss_aux_layer_3": 0.066650390625, "loss_aux_layer_4": 0.06982421875, "loss_aux_layer_5": 0.0716552734375, "loss_aux_layer_6": 0.0748291015625, "loss_aux_layer_7": 0.072265625, "loss_aux_layer_8": 0.071533203125, "loss_aux_layer_9": 0.0704345703125, "step": 2222, "total_loss": 0.791852131485939 }, { "epoch": 0.4401108691348248, "grad_norm": 1.0621459484100342, "learning_rate": 5e-05, "llm_loss": 0.5817667320370674, "loss": 2.7217, "loss_aux_layer_0": 0.02056884765625, "loss_aux_layer_1": 0.0433349609375, "loss_aux_layer_10": 0.072998046875, "loss_aux_layer_11": 0.077880859375, "loss_aux_layer_12": 0.0831298828125, "loss_aux_layer_13": 0.08935546875, "loss_aux_layer_14": 0.0992431640625, "loss_aux_layer_15": 0.10888671875, "loss_aux_layer_16": 0.1190185546875, "loss_aux_layer_17": 0.126953125, "loss_aux_layer_18": 0.1357421875, "loss_aux_layer_19": 0.138916015625, "loss_aux_layer_2": 0.05706787109375, "loss_aux_layer_20": 0.145751953125, "loss_aux_layer_21": 0.15283203125, "loss_aux_layer_22": 0.17431640625, "loss_aux_layer_23": 0.213623046875, "loss_aux_layer_3": 0.0682373046875, "loss_aux_layer_4": 0.0714111328125, "loss_aux_layer_5": 0.0731201171875, "loss_aux_layer_6": 0.0765380859375, "loss_aux_layer_7": 0.0738525390625, "loss_aux_layer_8": 0.072998046875, "loss_aux_layer_9": 0.0716552734375, "step": 2223, "total_loss": 0.6804370582103729 }, { "epoch": 0.4403088497327262, "grad_norm": 1.5813052654266357, "learning_rate": 5e-05, "llm_loss": 0.590189516544342, "loss": 2.7432, "loss_aux_layer_0": 0.02239990234375, "loss_aux_layer_1": 0.040771484375, "loss_aux_layer_10": 0.0697021484375, "loss_aux_layer_11": 0.0740966796875, "loss_aux_layer_12": 0.0794677734375, "loss_aux_layer_13": 0.0859375, "loss_aux_layer_14": 0.095947265625, "loss_aux_layer_15": 0.10595703125, "loss_aux_layer_16": 0.1165771484375, "loss_aux_layer_17": 0.1248779296875, "loss_aux_layer_18": 0.134033203125, "loss_aux_layer_19": 0.136474609375, "loss_aux_layer_2": 0.05316162109375, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.15087890625, "loss_aux_layer_22": 0.169921875, "loss_aux_layer_23": 0.20849609375, "loss_aux_layer_3": 0.0645751953125, "loss_aux_layer_4": 0.0675048828125, "loss_aux_layer_5": 0.0692138671875, "loss_aux_layer_6": 0.0723876953125, "loss_aux_layer_7": 0.0701904296875, "loss_aux_layer_8": 0.069580078125, "loss_aux_layer_9": 0.0684814453125, "step": 2224, "total_loss": 0.6858001351356506 }, { "epoch": 0.4405068303306276, "grad_norm": 1.1026220321655273, "learning_rate": 5e-05, "llm_loss": 0.5875121206045151, "loss": 2.7399, "loss_aux_layer_0": 0.020721435546875, "loss_aux_layer_1": 0.0419921875, "loss_aux_layer_10": 0.0718994140625, "loss_aux_layer_11": 0.0770263671875, "loss_aux_layer_12": 0.0826416015625, "loss_aux_layer_13": 0.0888671875, "loss_aux_layer_14": 0.0985107421875, "loss_aux_layer_15": 0.1077880859375, "loss_aux_layer_16": 0.117919921875, "loss_aux_layer_17": 0.1258544921875, "loss_aux_layer_18": 0.135009765625, "loss_aux_layer_19": 0.137451171875, "loss_aux_layer_2": 0.05548095703125, "loss_aux_layer_20": 0.14404296875, "loss_aux_layer_21": 0.151611328125, "loss_aux_layer_22": 0.17236328125, "loss_aux_layer_23": 0.210693359375, "loss_aux_layer_3": 0.0665283203125, "loss_aux_layer_4": 0.070068359375, "loss_aux_layer_5": 0.072021484375, "loss_aux_layer_6": 0.0750732421875, "loss_aux_layer_7": 0.0726318359375, "loss_aux_layer_8": 0.0718994140625, "loss_aux_layer_9": 0.070556640625, "step": 2225, "total_loss": 0.6849692016839981 }, { "epoch": 0.440704810928529, "grad_norm": 1.4780902862548828, "learning_rate": 5e-05, "llm_loss": 0.6756173223257065, "loss": 3.093, "loss_aux_layer_0": 0.0203857421875, "loss_aux_layer_1": 0.04180908203125, "loss_aux_layer_10": 0.0716552734375, "loss_aux_layer_11": 0.076416015625, "loss_aux_layer_12": 0.0819091796875, "loss_aux_layer_13": 0.0888671875, "loss_aux_layer_14": 0.0985107421875, "loss_aux_layer_15": 0.1080322265625, "loss_aux_layer_16": 0.1180419921875, "loss_aux_layer_17": 0.126708984375, "loss_aux_layer_18": 0.135009765625, "loss_aux_layer_19": 0.13818359375, "loss_aux_layer_2": 0.05487060546875, "loss_aux_layer_20": 0.145751953125, "loss_aux_layer_21": 0.154052734375, "loss_aux_layer_22": 0.175537109375, "loss_aux_layer_23": 0.214599609375, "loss_aux_layer_3": 0.0657958984375, "loss_aux_layer_4": 0.0689697265625, "loss_aux_layer_5": 0.0706787109375, "loss_aux_layer_6": 0.073974609375, "loss_aux_layer_7": 0.07177734375, "loss_aux_layer_8": 0.071044921875, "loss_aux_layer_9": 0.070068359375, "step": 2226, "total_loss": 0.7732589244842529 }, { "epoch": 0.4409027915264304, "grad_norm": 1.6845296621322632, "learning_rate": 5e-05, "llm_loss": 0.5579340755939484, "loss": 2.6201, "loss_aux_layer_0": 0.02130126953125, "loss_aux_layer_1": 0.0430908203125, "loss_aux_layer_10": 0.0711669921875, "loss_aux_layer_11": 0.076171875, "loss_aux_layer_12": 0.0816650390625, "loss_aux_layer_13": 0.088134765625, "loss_aux_layer_14": 0.0975341796875, "loss_aux_layer_15": 0.107177734375, "loss_aux_layer_16": 0.1171875, "loss_aux_layer_17": 0.12548828125, "loss_aux_layer_18": 0.13330078125, "loss_aux_layer_19": 0.136474609375, "loss_aux_layer_2": 0.0572509765625, "loss_aux_layer_20": 0.14404296875, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.172119140625, "loss_aux_layer_23": 0.2109375, "loss_aux_layer_3": 0.0672607421875, "loss_aux_layer_4": 0.0701904296875, "loss_aux_layer_5": 0.0716552734375, "loss_aux_layer_6": 0.0745849609375, "loss_aux_layer_7": 0.072021484375, "loss_aux_layer_8": 0.07080078125, "loss_aux_layer_9": 0.069580078125, "step": 2227, "total_loss": 0.6550373584032059 }, { "epoch": 0.4411007721243318, "grad_norm": 0.9604653120040894, "learning_rate": 5e-05, "llm_loss": 0.5887922793626785, "loss": 2.7465, "loss_aux_layer_0": 0.021636962890625, "loss_aux_layer_1": 0.04339599609375, "loss_aux_layer_10": 0.0716552734375, "loss_aux_layer_11": 0.0767822265625, "loss_aux_layer_12": 0.0819091796875, "loss_aux_layer_13": 0.088134765625, "loss_aux_layer_14": 0.097412109375, "loss_aux_layer_15": 0.106689453125, "loss_aux_layer_16": 0.11669921875, "loss_aux_layer_17": 0.1243896484375, "loss_aux_layer_18": 0.13330078125, "loss_aux_layer_19": 0.13671875, "loss_aux_layer_2": 0.05718994140625, "loss_aux_layer_20": 0.14453125, "loss_aux_layer_21": 0.152587890625, "loss_aux_layer_22": 0.175048828125, "loss_aux_layer_23": 0.215576171875, "loss_aux_layer_3": 0.06781005859375, "loss_aux_layer_4": 0.0711669921875, "loss_aux_layer_5": 0.0731201171875, "loss_aux_layer_6": 0.0760498046875, "loss_aux_layer_7": 0.0733642578125, "loss_aux_layer_8": 0.0723876953125, "loss_aux_layer_9": 0.0706787109375, "step": 2228, "total_loss": 0.6866190880537033 }, { "epoch": 0.44129875272223323, "grad_norm": 1.474799633026123, "learning_rate": 5e-05, "llm_loss": 0.5643870532512665, "loss": 2.6492, "loss_aux_layer_0": 0.021209716796875, "loss_aux_layer_1": 0.044189453125, "loss_aux_layer_10": 0.0723876953125, "loss_aux_layer_11": 0.0772705078125, "loss_aux_layer_12": 0.082763671875, "loss_aux_layer_13": 0.088623046875, "loss_aux_layer_14": 0.0977783203125, "loss_aux_layer_15": 0.1075439453125, "loss_aux_layer_16": 0.1170654296875, "loss_aux_layer_17": 0.125, "loss_aux_layer_18": 0.1328125, "loss_aux_layer_19": 0.135986328125, "loss_aux_layer_2": 0.05718994140625, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.151611328125, "loss_aux_layer_22": 0.173828125, "loss_aux_layer_23": 0.21337890625, "loss_aux_layer_3": 0.0682373046875, "loss_aux_layer_4": 0.071533203125, "loss_aux_layer_5": 0.0732421875, "loss_aux_layer_6": 0.076416015625, "loss_aux_layer_7": 0.0738525390625, "loss_aux_layer_8": 0.07275390625, "loss_aux_layer_9": 0.0712890625, "step": 2229, "total_loss": 0.6623122692108154 }, { "epoch": 0.4414967333201346, "grad_norm": 1.1135613918304443, "learning_rate": 5e-05, "llm_loss": 0.6025405526161194, "loss": 2.79, "loss_aux_layer_0": 0.02325439453125, "loss_aux_layer_1": 0.04229736328125, "loss_aux_layer_10": 0.0697021484375, "loss_aux_layer_11": 0.0743408203125, "loss_aux_layer_12": 0.0794677734375, "loss_aux_layer_13": 0.0858154296875, "loss_aux_layer_14": 0.094482421875, "loss_aux_layer_15": 0.1043701171875, "loss_aux_layer_16": 0.1142578125, "loss_aux_layer_17": 0.122314453125, "loss_aux_layer_18": 0.130859375, "loss_aux_layer_19": 0.1337890625, "loss_aux_layer_2": 0.0546875, "loss_aux_layer_20": 0.141357421875, "loss_aux_layer_21": 0.148681640625, "loss_aux_layer_22": 0.16845703125, "loss_aux_layer_23": 0.206298828125, "loss_aux_layer_3": 0.06488037109375, "loss_aux_layer_4": 0.068115234375, "loss_aux_layer_5": 0.06982421875, "loss_aux_layer_6": 0.072998046875, "loss_aux_layer_7": 0.0704345703125, "loss_aux_layer_8": 0.0693359375, "loss_aux_layer_9": 0.0687255859375, "step": 2230, "total_loss": 0.6975094676017761 }, { "epoch": 0.44169471391803605, "grad_norm": 1.0828148126602173, "learning_rate": 5e-05, "llm_loss": 0.6090920343995094, "loss": 2.8138, "loss_aux_layer_0": 0.021087646484375, "loss_aux_layer_1": 0.04052734375, "loss_aux_layer_10": 0.068603515625, "loss_aux_layer_11": 0.0731201171875, "loss_aux_layer_12": 0.078125, "loss_aux_layer_13": 0.08447265625, "loss_aux_layer_14": 0.0943603515625, "loss_aux_layer_15": 0.10400390625, "loss_aux_layer_16": 0.114501953125, "loss_aux_layer_17": 0.122802734375, "loss_aux_layer_18": 0.131591796875, "loss_aux_layer_19": 0.135498046875, "loss_aux_layer_2": 0.05255126953125, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.17041015625, "loss_aux_layer_23": 0.2080078125, "loss_aux_layer_3": 0.06280517578125, "loss_aux_layer_4": 0.0657958984375, "loss_aux_layer_5": 0.0677490234375, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.0684814453125, "loss_aux_layer_8": 0.067626953125, "loss_aux_layer_9": 0.06689453125, "step": 2231, "total_loss": 0.7034587413072586 }, { "epoch": 0.44189269451593743, "grad_norm": 1.0925452709197998, "learning_rate": 5e-05, "llm_loss": 0.645382285118103, "loss": 2.9615, "loss_aux_layer_0": 0.021331787109375, "loss_aux_layer_1": 0.04119873046875, "loss_aux_layer_10": 0.0684814453125, "loss_aux_layer_11": 0.0732421875, "loss_aux_layer_12": 0.0789794921875, "loss_aux_layer_13": 0.08544921875, "loss_aux_layer_14": 0.095947265625, "loss_aux_layer_15": 0.1055908203125, "loss_aux_layer_16": 0.1163330078125, "loss_aux_layer_17": 0.1243896484375, "loss_aux_layer_18": 0.13330078125, "loss_aux_layer_19": 0.13623046875, "loss_aux_layer_2": 0.05389404296875, "loss_aux_layer_20": 0.1435546875, "loss_aux_layer_21": 0.150634765625, "loss_aux_layer_22": 0.170166015625, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.06402587890625, "loss_aux_layer_4": 0.066650390625, "loss_aux_layer_5": 0.0682373046875, "loss_aux_layer_6": 0.0711669921875, "loss_aux_layer_7": 0.06884765625, "loss_aux_layer_8": 0.06787109375, "loss_aux_layer_9": 0.0667724609375, "step": 2232, "total_loss": 0.7403633147478104 }, { "epoch": 0.44209067511383887, "grad_norm": 0.9330046772956848, "learning_rate": 5e-05, "llm_loss": 0.6625866889953613, "loss": 3.0477, "loss_aux_layer_0": 0.020782470703125, "loss_aux_layer_1": 0.04473876953125, "loss_aux_layer_10": 0.0740966796875, "loss_aux_layer_11": 0.0792236328125, "loss_aux_layer_12": 0.08447265625, "loss_aux_layer_13": 0.09033203125, "loss_aux_layer_14": 0.0999755859375, "loss_aux_layer_15": 0.1092529296875, "loss_aux_layer_16": 0.1192626953125, "loss_aux_layer_17": 0.1273193359375, "loss_aux_layer_18": 0.1343994140625, "loss_aux_layer_19": 0.13720703125, "loss_aux_layer_2": 0.0595703125, "loss_aux_layer_20": 0.14501953125, "loss_aux_layer_21": 0.152587890625, "loss_aux_layer_22": 0.173828125, "loss_aux_layer_23": 0.212890625, "loss_aux_layer_3": 0.0706787109375, "loss_aux_layer_4": 0.073486328125, "loss_aux_layer_5": 0.0751953125, "loss_aux_layer_6": 0.077880859375, "loss_aux_layer_7": 0.0751953125, "loss_aux_layer_8": 0.07421875, "loss_aux_layer_9": 0.0723876953125, "step": 2233, "total_loss": 0.7619291990995407 }, { "epoch": 0.44228865571174025, "grad_norm": 1.3782562017440796, "learning_rate": 5e-05, "llm_loss": 0.5499032512307167, "loss": 2.5807, "loss_aux_layer_0": 0.021728515625, "loss_aux_layer_1": 0.04083251953125, "loss_aux_layer_10": 0.06884765625, "loss_aux_layer_11": 0.073486328125, "loss_aux_layer_12": 0.0787353515625, "loss_aux_layer_13": 0.0849609375, "loss_aux_layer_14": 0.0953369140625, "loss_aux_layer_15": 0.10498046875, "loss_aux_layer_16": 0.115478515625, "loss_aux_layer_17": 0.123779296875, "loss_aux_layer_18": 0.133056640625, "loss_aux_layer_19": 0.136474609375, "loss_aux_layer_2": 0.0535888671875, "loss_aux_layer_20": 0.14404296875, "loss_aux_layer_21": 0.15234375, "loss_aux_layer_22": 0.172607421875, "loss_aux_layer_23": 0.2119140625, "loss_aux_layer_3": 0.0640869140625, "loss_aux_layer_4": 0.0665283203125, "loss_aux_layer_5": 0.0682373046875, "loss_aux_layer_6": 0.0714111328125, "loss_aux_layer_7": 0.069091796875, "loss_aux_layer_8": 0.068359375, "loss_aux_layer_9": 0.0673828125, "step": 2234, "total_loss": 0.6451831012964249 }, { "epoch": 0.44248663630964163, "grad_norm": 1.086546778678894, "learning_rate": 5e-05, "llm_loss": 0.5647923648357391, "loss": 2.6514, "loss_aux_layer_0": 0.021759033203125, "loss_aux_layer_1": 0.04290771484375, "loss_aux_layer_10": 0.0718994140625, "loss_aux_layer_11": 0.0765380859375, "loss_aux_layer_12": 0.0816650390625, "loss_aux_layer_13": 0.0885009765625, "loss_aux_layer_14": 0.0986328125, "loss_aux_layer_15": 0.1090087890625, "loss_aux_layer_16": 0.119384765625, "loss_aux_layer_17": 0.126953125, "loss_aux_layer_18": 0.135498046875, "loss_aux_layer_19": 0.13916015625, "loss_aux_layer_2": 0.05615234375, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.153564453125, "loss_aux_layer_22": 0.1748046875, "loss_aux_layer_23": 0.212646484375, "loss_aux_layer_3": 0.06671142578125, "loss_aux_layer_4": 0.06982421875, "loss_aux_layer_5": 0.07177734375, "loss_aux_layer_6": 0.0748291015625, "loss_aux_layer_7": 0.0723876953125, "loss_aux_layer_8": 0.0716552734375, "loss_aux_layer_9": 0.0706787109375, "step": 2235, "total_loss": 0.662840723991394 }, { "epoch": 0.44268461690754307, "grad_norm": 0.8522313237190247, "learning_rate": 5e-05, "llm_loss": 0.5190161913633347, "loss": 2.4559, "loss_aux_layer_0": 0.0198974609375, "loss_aux_layer_1": 0.04083251953125, "loss_aux_layer_10": 0.069091796875, "loss_aux_layer_11": 0.0738525390625, "loss_aux_layer_12": 0.0792236328125, "loss_aux_layer_13": 0.0860595703125, "loss_aux_layer_14": 0.0953369140625, "loss_aux_layer_15": 0.104736328125, "loss_aux_layer_16": 0.11474609375, "loss_aux_layer_17": 0.122802734375, "loss_aux_layer_18": 0.131103515625, "loss_aux_layer_19": 0.134033203125, "loss_aux_layer_2": 0.0538330078125, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.15087890625, "loss_aux_layer_22": 0.1728515625, "loss_aux_layer_23": 0.2119140625, "loss_aux_layer_3": 0.06396484375, "loss_aux_layer_4": 0.0665283203125, "loss_aux_layer_5": 0.068359375, "loss_aux_layer_6": 0.0716552734375, "loss_aux_layer_7": 0.069091796875, "loss_aux_layer_8": 0.0687255859375, "loss_aux_layer_9": 0.06787109375, "step": 2236, "total_loss": 0.6139767318964005 }, { "epoch": 0.44288259750544445, "grad_norm": 0.9950181245803833, "learning_rate": 5e-05, "llm_loss": 0.6042586117982864, "loss": 2.8115, "loss_aux_layer_0": 0.021575927734375, "loss_aux_layer_1": 0.04302978515625, "loss_aux_layer_10": 0.0723876953125, "loss_aux_layer_11": 0.0772705078125, "loss_aux_layer_12": 0.082763671875, "loss_aux_layer_13": 0.0889892578125, "loss_aux_layer_14": 0.09912109375, "loss_aux_layer_15": 0.1090087890625, "loss_aux_layer_16": 0.119384765625, "loss_aux_layer_17": 0.1278076171875, "loss_aux_layer_18": 0.13671875, "loss_aux_layer_19": 0.1396484375, "loss_aux_layer_2": 0.056884765625, "loss_aux_layer_20": 0.147216796875, "loss_aux_layer_21": 0.154052734375, "loss_aux_layer_22": 0.173583984375, "loss_aux_layer_23": 0.212646484375, "loss_aux_layer_3": 0.0677490234375, "loss_aux_layer_4": 0.07080078125, "loss_aux_layer_5": 0.0728759765625, "loss_aux_layer_6": 0.0760498046875, "loss_aux_layer_7": 0.0736083984375, "loss_aux_layer_8": 0.0721435546875, "loss_aux_layer_9": 0.0709228515625, "step": 2237, "total_loss": 0.7028795331716537 }, { "epoch": 0.4430805781033459, "grad_norm": 0.9674140810966492, "learning_rate": 5e-05, "llm_loss": 0.6214996576309204, "loss": 2.8782, "loss_aux_layer_0": 0.021484375, "loss_aux_layer_1": 0.04351806640625, "loss_aux_layer_10": 0.072265625, "loss_aux_layer_11": 0.0771484375, "loss_aux_layer_12": 0.08251953125, "loss_aux_layer_13": 0.088623046875, "loss_aux_layer_14": 0.098388671875, "loss_aux_layer_15": 0.1080322265625, "loss_aux_layer_16": 0.11865234375, "loss_aux_layer_17": 0.1265869140625, "loss_aux_layer_18": 0.135009765625, "loss_aux_layer_19": 0.137939453125, "loss_aux_layer_2": 0.05694580078125, "loss_aux_layer_20": 0.14501953125, "loss_aux_layer_21": 0.15185546875, "loss_aux_layer_22": 0.17333984375, "loss_aux_layer_23": 0.21240234375, "loss_aux_layer_3": 0.0677490234375, "loss_aux_layer_4": 0.07080078125, "loss_aux_layer_5": 0.072509765625, "loss_aux_layer_6": 0.0758056640625, "loss_aux_layer_7": 0.0732421875, "loss_aux_layer_8": 0.0723876953125, "loss_aux_layer_9": 0.07080078125, "step": 2238, "total_loss": 0.7195519804954529 }, { "epoch": 0.44327855870124727, "grad_norm": 0.9322434663772583, "learning_rate": 5e-05, "llm_loss": 0.5701369345188141, "loss": 2.6598, "loss_aux_layer_0": 0.02044677734375, "loss_aux_layer_1": 0.0411376953125, "loss_aux_layer_10": 0.0703125, "loss_aux_layer_11": 0.074951171875, "loss_aux_layer_12": 0.080078125, "loss_aux_layer_13": 0.0859375, "loss_aux_layer_14": 0.094970703125, "loss_aux_layer_15": 0.10400390625, "loss_aux_layer_16": 0.1138916015625, "loss_aux_layer_17": 0.1220703125, "loss_aux_layer_18": 0.13037109375, "loss_aux_layer_19": 0.133544921875, "loss_aux_layer_2": 0.05419921875, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.148193359375, "loss_aux_layer_22": 0.167724609375, "loss_aux_layer_23": 0.206298828125, "loss_aux_layer_3": 0.065185546875, "loss_aux_layer_4": 0.0682373046875, "loss_aux_layer_5": 0.070068359375, "loss_aux_layer_6": 0.0732421875, "loss_aux_layer_7": 0.071044921875, "loss_aux_layer_8": 0.070556640625, "loss_aux_layer_9": 0.0689697265625, "step": 2239, "total_loss": 0.6649512350559235 }, { "epoch": 0.4434765392991487, "grad_norm": 0.9789408445358276, "learning_rate": 5e-05, "llm_loss": 0.6492301672697067, "loss": 2.996, "loss_aux_layer_0": 0.021514892578125, "loss_aux_layer_1": 0.04351806640625, "loss_aux_layer_10": 0.0743408203125, "loss_aux_layer_11": 0.0792236328125, "loss_aux_layer_12": 0.084716796875, "loss_aux_layer_13": 0.0909423828125, "loss_aux_layer_14": 0.100830078125, "loss_aux_layer_15": 0.1103515625, "loss_aux_layer_16": 0.1212158203125, "loss_aux_layer_17": 0.12890625, "loss_aux_layer_18": 0.13720703125, "loss_aux_layer_19": 0.139404296875, "loss_aux_layer_2": 0.0572509765625, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.154541015625, "loss_aux_layer_22": 0.17529296875, "loss_aux_layer_23": 0.21435546875, "loss_aux_layer_3": 0.06866455078125, "loss_aux_layer_4": 0.07220458984375, "loss_aux_layer_5": 0.0743408203125, "loss_aux_layer_6": 0.0775146484375, "loss_aux_layer_7": 0.0748291015625, "loss_aux_layer_8": 0.07421875, "loss_aux_layer_9": 0.07275390625, "step": 2240, "total_loss": 0.7490043938159943 }, { "epoch": 0.4436745198970501, "grad_norm": 1.1316547393798828, "learning_rate": 5e-05, "llm_loss": 0.6472126096487045, "loss": 2.9831, "loss_aux_layer_0": 0.0220947265625, "loss_aux_layer_1": 0.04449462890625, "loss_aux_layer_10": 0.0731201171875, "loss_aux_layer_11": 0.0780029296875, "loss_aux_layer_12": 0.0833740234375, "loss_aux_layer_13": 0.0897216796875, "loss_aux_layer_14": 0.0994873046875, "loss_aux_layer_15": 0.1090087890625, "loss_aux_layer_16": 0.118408203125, "loss_aux_layer_17": 0.1265869140625, "loss_aux_layer_18": 0.134521484375, "loss_aux_layer_19": 0.135498046875, "loss_aux_layer_2": 0.05908203125, "loss_aux_layer_20": 0.142822265625, "loss_aux_layer_21": 0.150146484375, "loss_aux_layer_22": 0.172607421875, "loss_aux_layer_23": 0.21142578125, "loss_aux_layer_3": 0.0697021484375, "loss_aux_layer_4": 0.07275390625, "loss_aux_layer_5": 0.0745849609375, "loss_aux_layer_6": 0.0775146484375, "loss_aux_layer_7": 0.07470703125, "loss_aux_layer_8": 0.073486328125, "loss_aux_layer_9": 0.07177734375, "step": 2241, "total_loss": 0.745778813958168 }, { "epoch": 0.44387250049495147, "grad_norm": 1.2878410816192627, "learning_rate": 5e-05, "llm_loss": 0.6053472757339478, "loss": 2.7968, "loss_aux_layer_0": 0.020233154296875, "loss_aux_layer_1": 0.040283203125, "loss_aux_layer_10": 0.068359375, "loss_aux_layer_11": 0.072998046875, "loss_aux_layer_12": 0.078125, "loss_aux_layer_13": 0.0841064453125, "loss_aux_layer_14": 0.093994140625, "loss_aux_layer_15": 0.103271484375, "loss_aux_layer_16": 0.11328125, "loss_aux_layer_17": 0.1219482421875, "loss_aux_layer_18": 0.130615234375, "loss_aux_layer_19": 0.13427734375, "loss_aux_layer_2": 0.05291748046875, "loss_aux_layer_20": 0.142333984375, "loss_aux_layer_21": 0.1494140625, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.2060546875, "loss_aux_layer_3": 0.0633544921875, "loss_aux_layer_4": 0.066162109375, "loss_aux_layer_5": 0.06817626953125, "loss_aux_layer_6": 0.07080078125, "loss_aux_layer_7": 0.06866455078125, "loss_aux_layer_8": 0.06781005859375, "loss_aux_layer_9": 0.06671142578125, "step": 2242, "total_loss": 0.6991994976997375 }, { "epoch": 0.4440704810928529, "grad_norm": 0.9846243262290955, "learning_rate": 5e-05, "llm_loss": 0.5952318534255028, "loss": 2.755, "loss_aux_layer_0": 0.02117919921875, "loss_aux_layer_1": 0.04058837890625, "loss_aux_layer_10": 0.069091796875, "loss_aux_layer_11": 0.073486328125, "loss_aux_layer_12": 0.078125, "loss_aux_layer_13": 0.0841064453125, "loss_aux_layer_14": 0.093017578125, "loss_aux_layer_15": 0.101806640625, "loss_aux_layer_16": 0.1116943359375, "loss_aux_layer_17": 0.1199951171875, "loss_aux_layer_18": 0.1282958984375, "loss_aux_layer_19": 0.1312255859375, "loss_aux_layer_2": 0.05401611328125, "loss_aux_layer_20": 0.139404296875, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.166748046875, "loss_aux_layer_23": 0.2041015625, "loss_aux_layer_3": 0.0643310546875, "loss_aux_layer_4": 0.067626953125, "loss_aux_layer_5": 0.0694580078125, "loss_aux_layer_6": 0.0726318359375, "loss_aux_layer_7": 0.0699462890625, "loss_aux_layer_8": 0.069091796875, "loss_aux_layer_9": 0.0677490234375, "step": 2243, "total_loss": 0.6887582093477249 }, { "epoch": 0.4442684616907543, "grad_norm": 0.8820673227310181, "learning_rate": 5e-05, "llm_loss": 0.6020971387624741, "loss": 2.7849, "loss_aux_layer_0": 0.020477294921875, "loss_aux_layer_1": 0.0404052734375, "loss_aux_layer_10": 0.0684814453125, "loss_aux_layer_11": 0.072998046875, "loss_aux_layer_12": 0.0777587890625, "loss_aux_layer_13": 0.083984375, "loss_aux_layer_14": 0.0938720703125, "loss_aux_layer_15": 0.1036376953125, "loss_aux_layer_16": 0.114013671875, "loss_aux_layer_17": 0.1224365234375, "loss_aux_layer_18": 0.13134765625, "loss_aux_layer_19": 0.134521484375, "loss_aux_layer_2": 0.0528564453125, "loss_aux_layer_20": 0.142822265625, "loss_aux_layer_21": 0.1494140625, "loss_aux_layer_22": 0.169921875, "loss_aux_layer_23": 0.20751953125, "loss_aux_layer_3": 0.06329345703125, "loss_aux_layer_4": 0.0662841796875, "loss_aux_layer_5": 0.068115234375, "loss_aux_layer_6": 0.0712890625, "loss_aux_layer_7": 0.0687255859375, "loss_aux_layer_8": 0.0682373046875, "loss_aux_layer_9": 0.06689453125, "step": 2244, "total_loss": 0.6962366998195648 }, { "epoch": 0.4444664422886557, "grad_norm": 1.1116081476211548, "learning_rate": 5e-05, "llm_loss": 0.6074871718883514, "loss": 2.8166, "loss_aux_layer_0": 0.020233154296875, "loss_aux_layer_1": 0.04180908203125, "loss_aux_layer_10": 0.07147216796875, "loss_aux_layer_11": 0.0755615234375, "loss_aux_layer_12": 0.0811767578125, "loss_aux_layer_13": 0.087646484375, "loss_aux_layer_14": 0.0970458984375, "loss_aux_layer_15": 0.1064453125, "loss_aux_layer_16": 0.1168212890625, "loss_aux_layer_17": 0.1246337890625, "loss_aux_layer_18": 0.1331787109375, "loss_aux_layer_19": 0.135498046875, "loss_aux_layer_2": 0.0552978515625, "loss_aux_layer_20": 0.14306640625, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.17236328125, "loss_aux_layer_23": 0.2109375, "loss_aux_layer_3": 0.06622314453125, "loss_aux_layer_4": 0.06939697265625, "loss_aux_layer_5": 0.071533203125, "loss_aux_layer_6": 0.074462890625, "loss_aux_layer_7": 0.0721435546875, "loss_aux_layer_8": 0.071533203125, "loss_aux_layer_9": 0.0704345703125, "step": 2245, "total_loss": 0.7041469812393188 }, { "epoch": 0.4446644228865571, "grad_norm": 0.9569673538208008, "learning_rate": 5e-05, "llm_loss": 0.5992028415203094, "loss": 2.7805, "loss_aux_layer_0": 0.02117919921875, "loss_aux_layer_1": 0.0418701171875, "loss_aux_layer_10": 0.070556640625, "loss_aux_layer_11": 0.0751953125, "loss_aux_layer_12": 0.0804443359375, "loss_aux_layer_13": 0.0865478515625, "loss_aux_layer_14": 0.095947265625, "loss_aux_layer_15": 0.105224609375, "loss_aux_layer_16": 0.11474609375, "loss_aux_layer_17": 0.1229248046875, "loss_aux_layer_18": 0.131103515625, "loss_aux_layer_19": 0.135009765625, "loss_aux_layer_2": 0.05535888671875, "loss_aux_layer_20": 0.142333984375, "loss_aux_layer_21": 0.150390625, "loss_aux_layer_22": 0.172119140625, "loss_aux_layer_23": 0.21142578125, "loss_aux_layer_3": 0.06585693359375, "loss_aux_layer_4": 0.06884765625, "loss_aux_layer_5": 0.0706787109375, "loss_aux_layer_6": 0.073974609375, "loss_aux_layer_7": 0.0714111328125, "loss_aux_layer_8": 0.0701904296875, "loss_aux_layer_9": 0.069091796875, "step": 2246, "total_loss": 0.6951171904802322 }, { "epoch": 0.44486240348445855, "grad_norm": 0.8466706275939941, "learning_rate": 5e-05, "llm_loss": 0.5724608674645424, "loss": 2.6501, "loss_aux_layer_0": 0.0203857421875, "loss_aux_layer_1": 0.03826904296875, "loss_aux_layer_10": 0.06463623046875, "loss_aux_layer_11": 0.06854248046875, "loss_aux_layer_12": 0.0738525390625, "loss_aux_layer_13": 0.080078125, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.0985107421875, "loss_aux_layer_16": 0.108642578125, "loss_aux_layer_17": 0.11669921875, "loss_aux_layer_18": 0.1253662109375, "loss_aux_layer_19": 0.1295166015625, "loss_aux_layer_2": 0.04998779296875, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.203857421875, "loss_aux_layer_3": 0.05975341796875, "loss_aux_layer_4": 0.06268310546875, "loss_aux_layer_5": 0.06439208984375, "loss_aux_layer_6": 0.06732177734375, "loss_aux_layer_7": 0.06488037109375, "loss_aux_layer_8": 0.06439208984375, "loss_aux_layer_9": 0.0634765625, "step": 2247, "total_loss": 0.6625269949436188 }, { "epoch": 0.44506038408235993, "grad_norm": 1.3087847232818604, "learning_rate": 5e-05, "llm_loss": 0.644932433962822, "loss": 2.9624, "loss_aux_layer_0": 0.02069091796875, "loss_aux_layer_1": 0.0416259765625, "loss_aux_layer_10": 0.06982421875, "loss_aux_layer_11": 0.0745849609375, "loss_aux_layer_12": 0.079833984375, "loss_aux_layer_13": 0.0858154296875, "loss_aux_layer_14": 0.0950927734375, "loss_aux_layer_15": 0.1051025390625, "loss_aux_layer_16": 0.114990234375, "loss_aux_layer_17": 0.12353515625, "loss_aux_layer_18": 0.1318359375, "loss_aux_layer_19": 0.13525390625, "loss_aux_layer_2": 0.05450439453125, "loss_aux_layer_20": 0.142333984375, "loss_aux_layer_21": 0.150390625, "loss_aux_layer_22": 0.172607421875, "loss_aux_layer_23": 0.212158203125, "loss_aux_layer_3": 0.065185546875, "loss_aux_layer_4": 0.0682373046875, "loss_aux_layer_5": 0.070068359375, "loss_aux_layer_6": 0.0733642578125, "loss_aux_layer_7": 0.0706787109375, "loss_aux_layer_8": 0.069580078125, "loss_aux_layer_9": 0.068359375, "step": 2248, "total_loss": 0.7406066358089447 }, { "epoch": 0.4452583646802613, "grad_norm": 1.0240910053253174, "learning_rate": 5e-05, "llm_loss": 0.648763582110405, "loss": 2.975, "loss_aux_layer_0": 0.020751953125, "loss_aux_layer_1": 0.04180908203125, "loss_aux_layer_10": 0.070068359375, "loss_aux_layer_11": 0.074462890625, "loss_aux_layer_12": 0.0797119140625, "loss_aux_layer_13": 0.0855712890625, "loss_aux_layer_14": 0.094970703125, "loss_aux_layer_15": 0.10400390625, "loss_aux_layer_16": 0.113525390625, "loss_aux_layer_17": 0.1217041015625, "loss_aux_layer_18": 0.130126953125, "loss_aux_layer_19": 0.133544921875, "loss_aux_layer_2": 0.0548095703125, "loss_aux_layer_20": 0.14111328125, "loss_aux_layer_21": 0.1484375, "loss_aux_layer_22": 0.169189453125, "loss_aux_layer_23": 0.20751953125, "loss_aux_layer_3": 0.065673828125, "loss_aux_layer_4": 0.0687255859375, "loss_aux_layer_5": 0.0701904296875, "loss_aux_layer_6": 0.073486328125, "loss_aux_layer_7": 0.071044921875, "loss_aux_layer_8": 0.0703125, "loss_aux_layer_9": 0.069091796875, "step": 2249, "total_loss": 0.7437411397695541 }, { "epoch": 0.44545634527816275, "grad_norm": 0.994479775428772, "learning_rate": 5e-05, "llm_loss": 0.5847332924604416, "loss": 2.7249, "loss_aux_layer_0": 0.0208740234375, "loss_aux_layer_1": 0.041015625, "loss_aux_layer_10": 0.0714111328125, "loss_aux_layer_11": 0.075927734375, "loss_aux_layer_12": 0.0809326171875, "loss_aux_layer_13": 0.0869140625, "loss_aux_layer_14": 0.0966796875, "loss_aux_layer_15": 0.1063232421875, "loss_aux_layer_16": 0.116455078125, "loss_aux_layer_17": 0.12451171875, "loss_aux_layer_18": 0.13330078125, "loss_aux_layer_19": 0.136474609375, "loss_aux_layer_2": 0.0548095703125, "loss_aux_layer_20": 0.14453125, "loss_aux_layer_21": 0.15185546875, "loss_aux_layer_22": 0.171630859375, "loss_aux_layer_23": 0.209228515625, "loss_aux_layer_3": 0.06585693359375, "loss_aux_layer_4": 0.0693359375, "loss_aux_layer_5": 0.0712890625, "loss_aux_layer_6": 0.074462890625, "loss_aux_layer_7": 0.0716552734375, "loss_aux_layer_8": 0.071044921875, "loss_aux_layer_9": 0.0701904296875, "step": 2250, "total_loss": 0.6812284588813782 }, { "epoch": 0.44565432587606413, "grad_norm": 1.1861087083816528, "learning_rate": 5e-05, "llm_loss": 0.6290530264377594, "loss": 2.8887, "loss_aux_layer_0": 0.02154541015625, "loss_aux_layer_1": 0.04058837890625, "loss_aux_layer_10": 0.069091796875, "loss_aux_layer_11": 0.0733642578125, "loss_aux_layer_12": 0.078369140625, "loss_aux_layer_13": 0.0838623046875, "loss_aux_layer_14": 0.0927734375, "loss_aux_layer_15": 0.101806640625, "loss_aux_layer_16": 0.1114501953125, "loss_aux_layer_17": 0.1192626953125, "loss_aux_layer_18": 0.12744140625, "loss_aux_layer_19": 0.13037109375, "loss_aux_layer_2": 0.05419921875, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.06488037109375, "loss_aux_layer_4": 0.0679931640625, "loss_aux_layer_5": 0.0697021484375, "loss_aux_layer_6": 0.0726318359375, "loss_aux_layer_7": 0.070068359375, "loss_aux_layer_8": 0.0694580078125, "loss_aux_layer_9": 0.06787109375, "step": 2251, "total_loss": 0.7221665978431702 }, { "epoch": 0.44585230647396557, "grad_norm": 0.9433459043502808, "learning_rate": 5e-05, "llm_loss": 0.5595581829547882, "loss": 2.6509, "loss_aux_layer_0": 0.02020263671875, "loss_aux_layer_1": 0.0472412109375, "loss_aux_layer_10": 0.0791015625, "loss_aux_layer_11": 0.0841064453125, "loss_aux_layer_12": 0.0894775390625, "loss_aux_layer_13": 0.095458984375, "loss_aux_layer_14": 0.1043701171875, "loss_aux_layer_15": 0.1136474609375, "loss_aux_layer_16": 0.12353515625, "loss_aux_layer_17": 0.130859375, "loss_aux_layer_18": 0.138427734375, "loss_aux_layer_19": 0.140869140625, "loss_aux_layer_2": 0.061767578125, "loss_aux_layer_20": 0.14794921875, "loss_aux_layer_21": 0.155029296875, "loss_aux_layer_22": 0.177001953125, "loss_aux_layer_23": 0.2158203125, "loss_aux_layer_3": 0.0740966796875, "loss_aux_layer_4": 0.07763671875, "loss_aux_layer_5": 0.0794677734375, "loss_aux_layer_6": 0.0830078125, "loss_aux_layer_7": 0.080078125, "loss_aux_layer_8": 0.0792236328125, "loss_aux_layer_9": 0.077880859375, "step": 2252, "total_loss": 0.6627287417650223 }, { "epoch": 0.44605028707186695, "grad_norm": 1.1560359001159668, "learning_rate": 5e-05, "llm_loss": 0.6548788100481033, "loss": 2.9976, "loss_aux_layer_0": 0.02117919921875, "loss_aux_layer_1": 0.04022216796875, "loss_aux_layer_10": 0.0687255859375, "loss_aux_layer_11": 0.0733642578125, "loss_aux_layer_12": 0.0789794921875, "loss_aux_layer_13": 0.0855712890625, "loss_aux_layer_14": 0.0950927734375, "loss_aux_layer_15": 0.1044921875, "loss_aux_layer_16": 0.114501953125, "loss_aux_layer_17": 0.1224365234375, "loss_aux_layer_18": 0.13037109375, "loss_aux_layer_19": 0.133544921875, "loss_aux_layer_2": 0.05340576171875, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.209228515625, "loss_aux_layer_3": 0.064208984375, "loss_aux_layer_4": 0.066650390625, "loss_aux_layer_5": 0.068603515625, "loss_aux_layer_6": 0.07177734375, "loss_aux_layer_7": 0.0694580078125, "loss_aux_layer_8": 0.0687255859375, "loss_aux_layer_9": 0.0675048828125, "step": 2253, "total_loss": 0.7493995577096939 }, { "epoch": 0.4462482676697684, "grad_norm": 1.3287798166275024, "learning_rate": 5e-05, "llm_loss": 0.5929891169071198, "loss": 2.7548, "loss_aux_layer_0": 0.02069091796875, "loss_aux_layer_1": 0.04119873046875, "loss_aux_layer_10": 0.0692138671875, "loss_aux_layer_11": 0.0738525390625, "loss_aux_layer_12": 0.0789794921875, "loss_aux_layer_13": 0.084716796875, "loss_aux_layer_14": 0.0938720703125, "loss_aux_layer_15": 0.103515625, "loss_aux_layer_16": 0.114013671875, "loss_aux_layer_17": 0.1224365234375, "loss_aux_layer_18": 0.13134765625, "loss_aux_layer_19": 0.135986328125, "loss_aux_layer_2": 0.05535888671875, "loss_aux_layer_20": 0.14404296875, "loss_aux_layer_21": 0.153076171875, "loss_aux_layer_22": 0.175537109375, "loss_aux_layer_23": 0.215576171875, "loss_aux_layer_3": 0.0655517578125, "loss_aux_layer_4": 0.0682373046875, "loss_aux_layer_5": 0.0701904296875, "loss_aux_layer_6": 0.0728759765625, "loss_aux_layer_7": 0.06982421875, "loss_aux_layer_8": 0.06890869140625, "loss_aux_layer_9": 0.06768798828125, "step": 2254, "total_loss": 0.6886920779943466 }, { "epoch": 0.44644624826766977, "grad_norm": 0.8791168928146362, "learning_rate": 5e-05, "llm_loss": 0.5671659335494041, "loss": 2.6602, "loss_aux_layer_0": 0.0198974609375, "loss_aux_layer_1": 0.0433349609375, "loss_aux_layer_10": 0.0738525390625, "loss_aux_layer_11": 0.0784912109375, "loss_aux_layer_12": 0.083740234375, "loss_aux_layer_13": 0.089599609375, "loss_aux_layer_14": 0.097900390625, "loss_aux_layer_15": 0.1068115234375, "loss_aux_layer_16": 0.11669921875, "loss_aux_layer_17": 0.1248779296875, "loss_aux_layer_18": 0.13330078125, "loss_aux_layer_19": 0.1357421875, "loss_aux_layer_2": 0.0574951171875, "loss_aux_layer_20": 0.14306640625, "loss_aux_layer_21": 0.14990234375, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.208251953125, "loss_aux_layer_3": 0.069091796875, "loss_aux_layer_4": 0.072509765625, "loss_aux_layer_5": 0.074462890625, "loss_aux_layer_6": 0.0775146484375, "loss_aux_layer_7": 0.074951171875, "loss_aux_layer_8": 0.0740966796875, "loss_aux_layer_9": 0.0726318359375, "step": 2255, "total_loss": 0.6650461405515671 }, { "epoch": 0.44664422886557115, "grad_norm": 1.6107207536697388, "learning_rate": 5e-05, "llm_loss": 0.6894830763339996, "loss": 3.1385, "loss_aux_layer_0": 0.020660400390625, "loss_aux_layer_1": 0.041259765625, "loss_aux_layer_10": 0.06976318359375, "loss_aux_layer_11": 0.073974609375, "loss_aux_layer_12": 0.0792236328125, "loss_aux_layer_13": 0.0855712890625, "loss_aux_layer_14": 0.0947265625, "loss_aux_layer_15": 0.1041259765625, "loss_aux_layer_16": 0.114013671875, "loss_aux_layer_17": 0.1220703125, "loss_aux_layer_18": 0.131103515625, "loss_aux_layer_19": 0.134033203125, "loss_aux_layer_2": 0.05426025390625, "loss_aux_layer_20": 0.14208984375, "loss_aux_layer_21": 0.150146484375, "loss_aux_layer_22": 0.1708984375, "loss_aux_layer_23": 0.20947265625, "loss_aux_layer_3": 0.0653076171875, "loss_aux_layer_4": 0.06842041015625, "loss_aux_layer_5": 0.0703125, "loss_aux_layer_6": 0.0732421875, "loss_aux_layer_7": 0.07025146484375, "loss_aux_layer_8": 0.0692138671875, "loss_aux_layer_9": 0.06842041015625, "step": 2256, "total_loss": 0.784618616104126 }, { "epoch": 0.4468422094634726, "grad_norm": 1.2878305912017822, "learning_rate": 5e-05, "llm_loss": 0.5979512184858322, "loss": 2.7652, "loss_aux_layer_0": 0.01983642578125, "loss_aux_layer_1": 0.04034423828125, "loss_aux_layer_10": 0.0677490234375, "loss_aux_layer_11": 0.072265625, "loss_aux_layer_12": 0.077392578125, "loss_aux_layer_13": 0.0838623046875, "loss_aux_layer_14": 0.0926513671875, "loss_aux_layer_15": 0.102294921875, "loss_aux_layer_16": 0.1124267578125, "loss_aux_layer_17": 0.1207275390625, "loss_aux_layer_18": 0.130126953125, "loss_aux_layer_19": 0.133056640625, "loss_aux_layer_2": 0.052978515625, "loss_aux_layer_20": 0.140869140625, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.169921875, "loss_aux_layer_23": 0.20849609375, "loss_aux_layer_3": 0.06280517578125, "loss_aux_layer_4": 0.0655517578125, "loss_aux_layer_5": 0.0672607421875, "loss_aux_layer_6": 0.0699462890625, "loss_aux_layer_7": 0.06756591796875, "loss_aux_layer_8": 0.06707763671875, "loss_aux_layer_9": 0.0660400390625, "step": 2257, "total_loss": 0.6912906914949417 }, { "epoch": 0.44704019006137397, "grad_norm": 1.210518479347229, "learning_rate": 5e-05, "llm_loss": 0.5859182924032211, "loss": 2.7341, "loss_aux_layer_0": 0.02044677734375, "loss_aux_layer_1": 0.0435791015625, "loss_aux_layer_10": 0.07177734375, "loss_aux_layer_11": 0.076416015625, "loss_aux_layer_12": 0.081787109375, "loss_aux_layer_13": 0.087890625, "loss_aux_layer_14": 0.0970458984375, "loss_aux_layer_15": 0.1065673828125, "loss_aux_layer_16": 0.1170654296875, "loss_aux_layer_17": 0.1251220703125, "loss_aux_layer_18": 0.1337890625, "loss_aux_layer_19": 0.13720703125, "loss_aux_layer_2": 0.0572509765625, "loss_aux_layer_20": 0.144775390625, "loss_aux_layer_21": 0.15283203125, "loss_aux_layer_22": 0.174072265625, "loss_aux_layer_23": 0.212890625, "loss_aux_layer_3": 0.068115234375, "loss_aux_layer_4": 0.07080078125, "loss_aux_layer_5": 0.0723876953125, "loss_aux_layer_6": 0.075439453125, "loss_aux_layer_7": 0.072998046875, "loss_aux_layer_8": 0.072021484375, "loss_aux_layer_9": 0.070556640625, "step": 2258, "total_loss": 0.6835159212350845 }, { "epoch": 0.4472381706592754, "grad_norm": 1.1911250352859497, "learning_rate": 5e-05, "llm_loss": 0.609006330370903, "loss": 2.8279, "loss_aux_layer_0": 0.020538330078125, "loss_aux_layer_1": 0.04315185546875, "loss_aux_layer_10": 0.07373046875, "loss_aux_layer_11": 0.078369140625, "loss_aux_layer_12": 0.0833740234375, "loss_aux_layer_13": 0.08984375, "loss_aux_layer_14": 0.0989990234375, "loss_aux_layer_15": 0.1083984375, "loss_aux_layer_16": 0.1180419921875, "loss_aux_layer_17": 0.125244140625, "loss_aux_layer_18": 0.134033203125, "loss_aux_layer_19": 0.135498046875, "loss_aux_layer_2": 0.05694580078125, "loss_aux_layer_20": 0.14306640625, "loss_aux_layer_21": 0.150146484375, "loss_aux_layer_22": 0.171142578125, "loss_aux_layer_23": 0.209716796875, "loss_aux_layer_3": 0.068603515625, "loss_aux_layer_4": 0.0718994140625, "loss_aux_layer_5": 0.0738525390625, "loss_aux_layer_6": 0.076904296875, "loss_aux_layer_7": 0.0743408203125, "loss_aux_layer_8": 0.0732421875, "loss_aux_layer_9": 0.0721435546875, "step": 2259, "total_loss": 0.7069655358791351 }, { "epoch": 0.4474361512571768, "grad_norm": 1.310897707939148, "learning_rate": 5e-05, "llm_loss": 0.6541295647621155, "loss": 2.9932, "loss_aux_layer_0": 0.0213623046875, "loss_aux_layer_1": 0.04229736328125, "loss_aux_layer_10": 0.0687255859375, "loss_aux_layer_11": 0.072998046875, "loss_aux_layer_12": 0.0782470703125, "loss_aux_layer_13": 0.0841064453125, "loss_aux_layer_14": 0.09375, "loss_aux_layer_15": 0.1026611328125, "loss_aux_layer_16": 0.113037109375, "loss_aux_layer_17": 0.12158203125, "loss_aux_layer_18": 0.129638671875, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.0557861328125, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.16748046875, "loss_aux_layer_23": 0.205078125, "loss_aux_layer_3": 0.06597900390625, "loss_aux_layer_4": 0.0687255859375, "loss_aux_layer_5": 0.0701904296875, "loss_aux_layer_6": 0.0731201171875, "loss_aux_layer_7": 0.0703125, "loss_aux_layer_8": 0.069091796875, "loss_aux_layer_9": 0.0675048828125, "step": 2260, "total_loss": 0.7483122050762177 }, { "epoch": 0.4476341318550782, "grad_norm": 1.1806201934814453, "learning_rate": 5e-05, "llm_loss": 0.5719613283872604, "loss": 2.6592, "loss_aux_layer_0": 0.02093505859375, "loss_aux_layer_1": 0.0419921875, "loss_aux_layer_10": 0.0672607421875, "loss_aux_layer_11": 0.0714111328125, "loss_aux_layer_12": 0.076416015625, "loss_aux_layer_13": 0.0823974609375, "loss_aux_layer_14": 0.09130859375, "loss_aux_layer_15": 0.1005859375, "loss_aux_layer_16": 0.1099853515625, "loss_aux_layer_17": 0.1175537109375, "loss_aux_layer_18": 0.1265869140625, "loss_aux_layer_19": 0.130126953125, "loss_aux_layer_2": 0.05401611328125, "loss_aux_layer_20": 0.138916015625, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.169189453125, "loss_aux_layer_23": 0.20703125, "loss_aux_layer_3": 0.06439208984375, "loss_aux_layer_4": 0.067138671875, "loss_aux_layer_5": 0.06884765625, "loss_aux_layer_6": 0.0714111328125, "loss_aux_layer_7": 0.06884765625, "loss_aux_layer_8": 0.0675048828125, "loss_aux_layer_9": 0.066162109375, "step": 2261, "total_loss": 0.6647897511720657 }, { "epoch": 0.4478321124529796, "grad_norm": 1.3310191631317139, "learning_rate": 5e-05, "llm_loss": 0.5343658700585365, "loss": 2.5287, "loss_aux_layer_0": 0.020050048828125, "loss_aux_layer_1": 0.044189453125, "loss_aux_layer_10": 0.0731201171875, "loss_aux_layer_11": 0.077880859375, "loss_aux_layer_12": 0.0833740234375, "loss_aux_layer_13": 0.089599609375, "loss_aux_layer_14": 0.0982666015625, "loss_aux_layer_15": 0.1070556640625, "loss_aux_layer_16": 0.1163330078125, "loss_aux_layer_17": 0.12353515625, "loss_aux_layer_18": 0.1318359375, "loss_aux_layer_19": 0.134521484375, "loss_aux_layer_2": 0.0587158203125, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.149169921875, "loss_aux_layer_22": 0.17041015625, "loss_aux_layer_23": 0.2099609375, "loss_aux_layer_3": 0.0706787109375, "loss_aux_layer_4": 0.0733642578125, "loss_aux_layer_5": 0.0748291015625, "loss_aux_layer_6": 0.0775146484375, "loss_aux_layer_7": 0.0748291015625, "loss_aux_layer_8": 0.07373046875, "loss_aux_layer_9": 0.0721435546875, "step": 2262, "total_loss": 0.6321791708469391 }, { "epoch": 0.44803009305088104, "grad_norm": 1.1712266206741333, "learning_rate": 5e-05, "llm_loss": 0.49492547661066055, "loss": 2.3629, "loss_aux_layer_0": 0.021392822265625, "loss_aux_layer_1": 0.04339599609375, "loss_aux_layer_10": 0.0712890625, "loss_aux_layer_11": 0.075927734375, "loss_aux_layer_12": 0.0810546875, "loss_aux_layer_13": 0.0872802734375, "loss_aux_layer_14": 0.0963134765625, "loss_aux_layer_15": 0.1051025390625, "loss_aux_layer_16": 0.1146240234375, "loss_aux_layer_17": 0.1220703125, "loss_aux_layer_18": 0.1298828125, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.05596923828125, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.147705078125, "loss_aux_layer_22": 0.168701171875, "loss_aux_layer_23": 0.207275390625, "loss_aux_layer_3": 0.06719970703125, "loss_aux_layer_4": 0.0701904296875, "loss_aux_layer_5": 0.0721435546875, "loss_aux_layer_6": 0.0753173828125, "loss_aux_layer_7": 0.072509765625, "loss_aux_layer_8": 0.0716552734375, "loss_aux_layer_9": 0.0701904296875, "step": 2263, "total_loss": 0.5907352566719055 }, { "epoch": 0.4482280736487824, "grad_norm": 1.155369758605957, "learning_rate": 5e-05, "llm_loss": 0.577287033200264, "loss": 2.687, "loss_aux_layer_0": 0.0211181640625, "loss_aux_layer_1": 0.04193115234375, "loss_aux_layer_10": 0.0693359375, "loss_aux_layer_11": 0.07373046875, "loss_aux_layer_12": 0.0787353515625, "loss_aux_layer_13": 0.0850830078125, "loss_aux_layer_14": 0.093994140625, "loss_aux_layer_15": 0.1029052734375, "loss_aux_layer_16": 0.11279296875, "loss_aux_layer_17": 0.1212158203125, "loss_aux_layer_18": 0.12939453125, "loss_aux_layer_19": 0.13232421875, "loss_aux_layer_2": 0.054443359375, "loss_aux_layer_20": 0.140380859375, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.208251953125, "loss_aux_layer_3": 0.06524658203125, "loss_aux_layer_4": 0.0682373046875, "loss_aux_layer_5": 0.070068359375, "loss_aux_layer_6": 0.072998046875, "loss_aux_layer_7": 0.0704345703125, "loss_aux_layer_8": 0.069580078125, "loss_aux_layer_9": 0.0679931640625, "step": 2264, "total_loss": 0.671741172671318 }, { "epoch": 0.4484260542466838, "grad_norm": 1.2260433435440063, "learning_rate": 5e-05, "llm_loss": 0.6360855624079704, "loss": 2.9364, "loss_aux_layer_0": 0.02105712890625, "loss_aux_layer_1": 0.04339599609375, "loss_aux_layer_10": 0.072021484375, "loss_aux_layer_11": 0.07666015625, "loss_aux_layer_12": 0.0816650390625, "loss_aux_layer_13": 0.088134765625, "loss_aux_layer_14": 0.097900390625, "loss_aux_layer_15": 0.107421875, "loss_aux_layer_16": 0.11767578125, "loss_aux_layer_17": 0.12548828125, "loss_aux_layer_18": 0.13427734375, "loss_aux_layer_19": 0.137939453125, "loss_aux_layer_2": 0.05694580078125, "loss_aux_layer_20": 0.145263671875, "loss_aux_layer_21": 0.1533203125, "loss_aux_layer_22": 0.175048828125, "loss_aux_layer_23": 0.214599609375, "loss_aux_layer_3": 0.06787109375, "loss_aux_layer_4": 0.071044921875, "loss_aux_layer_5": 0.07275390625, "loss_aux_layer_6": 0.0760498046875, "loss_aux_layer_7": 0.0732421875, "loss_aux_layer_8": 0.0718994140625, "loss_aux_layer_9": 0.070556640625, "step": 2265, "total_loss": 0.7340987920761108 }, { "epoch": 0.44862403484458524, "grad_norm": 1.3569369316101074, "learning_rate": 5e-05, "llm_loss": 0.613847404718399, "loss": 2.8318, "loss_aux_layer_0": 0.0211181640625, "loss_aux_layer_1": 0.04168701171875, "loss_aux_layer_10": 0.0689697265625, "loss_aux_layer_11": 0.073486328125, "loss_aux_layer_12": 0.078369140625, "loss_aux_layer_13": 0.08447265625, "loss_aux_layer_14": 0.0933837890625, "loss_aux_layer_15": 0.1025390625, "loss_aux_layer_16": 0.1119384765625, "loss_aux_layer_17": 0.1202392578125, "loss_aux_layer_18": 0.1287841796875, "loss_aux_layer_19": 0.132080078125, "loss_aux_layer_2": 0.0540771484375, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.208740234375, "loss_aux_layer_3": 0.0645751953125, "loss_aux_layer_4": 0.06781005859375, "loss_aux_layer_5": 0.06927490234375, "loss_aux_layer_6": 0.072509765625, "loss_aux_layer_7": 0.06982421875, "loss_aux_layer_8": 0.069091796875, "loss_aux_layer_9": 0.06781005859375, "step": 2266, "total_loss": 0.7079483717679977 }, { "epoch": 0.4488220154424866, "grad_norm": 1.2499916553497314, "learning_rate": 5e-05, "llm_loss": 0.5025198459625244, "loss": 2.3969, "loss_aux_layer_0": 0.022613525390625, "loss_aux_layer_1": 0.04290771484375, "loss_aux_layer_10": 0.0706787109375, "loss_aux_layer_11": 0.0753173828125, "loss_aux_layer_12": 0.080810546875, "loss_aux_layer_13": 0.0872802734375, "loss_aux_layer_14": 0.096923828125, "loss_aux_layer_15": 0.106201171875, "loss_aux_layer_16": 0.1162109375, "loss_aux_layer_17": 0.1240234375, "loss_aux_layer_18": 0.13232421875, "loss_aux_layer_19": 0.1357421875, "loss_aux_layer_2": 0.0562744140625, "loss_aux_layer_20": 0.1435546875, "loss_aux_layer_21": 0.151611328125, "loss_aux_layer_22": 0.17138671875, "loss_aux_layer_23": 0.21044921875, "loss_aux_layer_3": 0.0672607421875, "loss_aux_layer_4": 0.070068359375, "loss_aux_layer_5": 0.07177734375, "loss_aux_layer_6": 0.074462890625, "loss_aux_layer_7": 0.0721435546875, "loss_aux_layer_8": 0.07080078125, "loss_aux_layer_9": 0.0694580078125, "step": 2267, "total_loss": 0.5992143228650093 }, { "epoch": 0.44901999604038806, "grad_norm": 0.923915684223175, "learning_rate": 5e-05, "llm_loss": 0.5383199080824852, "loss": 2.5519, "loss_aux_layer_0": 0.02191162109375, "loss_aux_layer_1": 0.04510498046875, "loss_aux_layer_10": 0.0750732421875, "loss_aux_layer_11": 0.080322265625, "loss_aux_layer_12": 0.085693359375, "loss_aux_layer_13": 0.0916748046875, "loss_aux_layer_14": 0.1007080078125, "loss_aux_layer_15": 0.109375, "loss_aux_layer_16": 0.1190185546875, "loss_aux_layer_17": 0.12646484375, "loss_aux_layer_18": 0.13427734375, "loss_aux_layer_19": 0.13671875, "loss_aux_layer_2": 0.05859375, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.151611328125, "loss_aux_layer_22": 0.17431640625, "loss_aux_layer_23": 0.214111328125, "loss_aux_layer_3": 0.0704345703125, "loss_aux_layer_4": 0.073486328125, "loss_aux_layer_5": 0.0753173828125, "loss_aux_layer_6": 0.07861328125, "loss_aux_layer_7": 0.0760498046875, "loss_aux_layer_8": 0.075439453125, "loss_aux_layer_9": 0.073974609375, "step": 2268, "total_loss": 0.6379788815975189 }, { "epoch": 0.44921797663828944, "grad_norm": 1.1317546367645264, "learning_rate": 5e-05, "llm_loss": 0.6687412559986115, "loss": 3.073, "loss_aux_layer_0": 0.022796630859375, "loss_aux_layer_1": 0.045166015625, "loss_aux_layer_10": 0.0738525390625, "loss_aux_layer_11": 0.07861328125, "loss_aux_layer_12": 0.0836181640625, "loss_aux_layer_13": 0.0899658203125, "loss_aux_layer_14": 0.0997314453125, "loss_aux_layer_15": 0.109375, "loss_aux_layer_16": 0.1192626953125, "loss_aux_layer_17": 0.1268310546875, "loss_aux_layer_18": 0.135009765625, "loss_aux_layer_19": 0.13818359375, "loss_aux_layer_2": 0.05859375, "loss_aux_layer_20": 0.1455078125, "loss_aux_layer_21": 0.15380859375, "loss_aux_layer_22": 0.176513671875, "loss_aux_layer_23": 0.2158203125, "loss_aux_layer_3": 0.069580078125, "loss_aux_layer_4": 0.0726318359375, "loss_aux_layer_5": 0.07421875, "loss_aux_layer_6": 0.077880859375, "loss_aux_layer_7": 0.0748291015625, "loss_aux_layer_8": 0.07373046875, "loss_aux_layer_9": 0.07275390625, "step": 2269, "total_loss": 0.7682574838399887 }, { "epoch": 0.4494159572361909, "grad_norm": 1.3378441333770752, "learning_rate": 5e-05, "llm_loss": 0.589028850197792, "loss": 2.7244, "loss_aux_layer_0": 0.021331787109375, "loss_aux_layer_1": 0.039794921875, "loss_aux_layer_10": 0.06591796875, "loss_aux_layer_11": 0.070068359375, "loss_aux_layer_12": 0.0753173828125, "loss_aux_layer_13": 0.081787109375, "loss_aux_layer_14": 0.09130859375, "loss_aux_layer_15": 0.1009521484375, "loss_aux_layer_16": 0.1109619140625, "loss_aux_layer_17": 0.11865234375, "loss_aux_layer_18": 0.12744140625, "loss_aux_layer_19": 0.131103515625, "loss_aux_layer_2": 0.05181884765625, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.148681640625, "loss_aux_layer_22": 0.169189453125, "loss_aux_layer_23": 0.209228515625, "loss_aux_layer_3": 0.0614013671875, "loss_aux_layer_4": 0.064208984375, "loss_aux_layer_5": 0.06610107421875, "loss_aux_layer_6": 0.0687255859375, "loss_aux_layer_7": 0.06622314453125, "loss_aux_layer_8": 0.06561279296875, "loss_aux_layer_9": 0.06463623046875, "step": 2270, "total_loss": 0.681103840470314 }, { "epoch": 0.44961393783409226, "grad_norm": 1.209795594215393, "learning_rate": 5e-05, "llm_loss": 0.5666974782943726, "loss": 2.6477, "loss_aux_layer_0": 0.021209716796875, "loss_aux_layer_1": 0.04351806640625, "loss_aux_layer_10": 0.070556640625, "loss_aux_layer_11": 0.0750732421875, "loss_aux_layer_12": 0.080078125, "loss_aux_layer_13": 0.0858154296875, "loss_aux_layer_14": 0.094482421875, "loss_aux_layer_15": 0.1036376953125, "loss_aux_layer_16": 0.1131591796875, "loss_aux_layer_17": 0.1207275390625, "loss_aux_layer_18": 0.1287841796875, "loss_aux_layer_19": 0.132080078125, "loss_aux_layer_2": 0.0555419921875, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.208251953125, "loss_aux_layer_3": 0.06671142578125, "loss_aux_layer_4": 0.0697021484375, "loss_aux_layer_5": 0.071533203125, "loss_aux_layer_6": 0.0743408203125, "loss_aux_layer_7": 0.072021484375, "loss_aux_layer_8": 0.0711669921875, "loss_aux_layer_9": 0.0694580078125, "step": 2271, "total_loss": 0.6619228199124336 }, { "epoch": 0.44981191843199364, "grad_norm": 0.8513977527618408, "learning_rate": 5e-05, "llm_loss": 0.6108580678701401, "loss": 2.8246, "loss_aux_layer_0": 0.021240234375, "loss_aux_layer_1": 0.0428466796875, "loss_aux_layer_10": 0.070068359375, "loss_aux_layer_11": 0.0748291015625, "loss_aux_layer_12": 0.080078125, "loss_aux_layer_13": 0.0865478515625, "loss_aux_layer_14": 0.095458984375, "loss_aux_layer_15": 0.1044921875, "loss_aux_layer_16": 0.1141357421875, "loss_aux_layer_17": 0.1221923828125, "loss_aux_layer_18": 0.130615234375, "loss_aux_layer_19": 0.134765625, "loss_aux_layer_2": 0.05462646484375, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.208984375, "loss_aux_layer_3": 0.0654296875, "loss_aux_layer_4": 0.068359375, "loss_aux_layer_5": 0.070068359375, "loss_aux_layer_6": 0.0731201171875, "loss_aux_layer_7": 0.0706787109375, "loss_aux_layer_8": 0.0697021484375, "loss_aux_layer_9": 0.068603515625, "step": 2272, "total_loss": 0.7061527222394943 }, { "epoch": 0.4500098990298951, "grad_norm": 0.9595203399658203, "learning_rate": 5e-05, "llm_loss": 0.6000305712223053, "loss": 2.7889, "loss_aux_layer_0": 0.022308349609375, "loss_aux_layer_1": 0.043701171875, "loss_aux_layer_10": 0.0712890625, "loss_aux_layer_11": 0.0760498046875, "loss_aux_layer_12": 0.081298828125, "loss_aux_layer_13": 0.08740234375, "loss_aux_layer_14": 0.0968017578125, "loss_aux_layer_15": 0.1064453125, "loss_aux_layer_16": 0.116455078125, "loss_aux_layer_17": 0.1243896484375, "loss_aux_layer_18": 0.133056640625, "loss_aux_layer_19": 0.13623046875, "loss_aux_layer_2": 0.056884765625, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.15185546875, "loss_aux_layer_22": 0.173583984375, "loss_aux_layer_23": 0.21142578125, "loss_aux_layer_3": 0.0672607421875, "loss_aux_layer_4": 0.070068359375, "loss_aux_layer_5": 0.072021484375, "loss_aux_layer_6": 0.0750732421875, "loss_aux_layer_7": 0.072509765625, "loss_aux_layer_8": 0.071533203125, "loss_aux_layer_9": 0.0704345703125, "step": 2273, "total_loss": 0.6972223818302155 }, { "epoch": 0.45020787962779646, "grad_norm": 1.048325538635254, "learning_rate": 5e-05, "llm_loss": 0.485206738114357, "loss": 2.3212, "loss_aux_layer_0": 0.0201416015625, "loss_aux_layer_1": 0.04119873046875, "loss_aux_layer_10": 0.06884765625, "loss_aux_layer_11": 0.0732421875, "loss_aux_layer_12": 0.078369140625, "loss_aux_layer_13": 0.0848388671875, "loss_aux_layer_14": 0.0946044921875, "loss_aux_layer_15": 0.104736328125, "loss_aux_layer_16": 0.1151123046875, "loss_aux_layer_17": 0.12353515625, "loss_aux_layer_18": 0.133056640625, "loss_aux_layer_19": 0.135986328125, "loss_aux_layer_2": 0.0533447265625, "loss_aux_layer_20": 0.1435546875, "loss_aux_layer_21": 0.15234375, "loss_aux_layer_22": 0.172119140625, "loss_aux_layer_23": 0.2119140625, "loss_aux_layer_3": 0.06396484375, "loss_aux_layer_4": 0.06646728515625, "loss_aux_layer_5": 0.068359375, "loss_aux_layer_6": 0.0712890625, "loss_aux_layer_7": 0.06884765625, "loss_aux_layer_8": 0.068359375, "loss_aux_layer_9": 0.06744384765625, "step": 2274, "total_loss": 0.5802887007594109 }, { "epoch": 0.4504058602256979, "grad_norm": 0.8106091618537903, "learning_rate": 5e-05, "llm_loss": 0.5880024507641792, "loss": 2.7619, "loss_aux_layer_0": 0.0205078125, "loss_aux_layer_1": 0.04766845703125, "loss_aux_layer_10": 0.07861328125, "loss_aux_layer_11": 0.083740234375, "loss_aux_layer_12": 0.089111328125, "loss_aux_layer_13": 0.0950927734375, "loss_aux_layer_14": 0.103515625, "loss_aux_layer_15": 0.1123046875, "loss_aux_layer_16": 0.1221923828125, "loss_aux_layer_17": 0.12939453125, "loss_aux_layer_18": 0.137451171875, "loss_aux_layer_19": 0.139404296875, "loss_aux_layer_2": 0.061279296875, "loss_aux_layer_20": 0.145751953125, "loss_aux_layer_21": 0.15380859375, "loss_aux_layer_22": 0.177001953125, "loss_aux_layer_23": 0.216064453125, "loss_aux_layer_3": 0.073486328125, "loss_aux_layer_4": 0.0767822265625, "loss_aux_layer_5": 0.0787353515625, "loss_aux_layer_6": 0.08203125, "loss_aux_layer_7": 0.0794677734375, "loss_aux_layer_8": 0.0784912109375, "loss_aux_layer_9": 0.0771484375, "step": 2275, "total_loss": 0.69046950340271 }, { "epoch": 0.4506038408235993, "grad_norm": 0.9658493399620056, "learning_rate": 5e-05, "llm_loss": 0.5738458037376404, "loss": 2.6752, "loss_aux_layer_0": 0.0218505859375, "loss_aux_layer_1": 0.0404052734375, "loss_aux_layer_10": 0.0687255859375, "loss_aux_layer_11": 0.0733642578125, "loss_aux_layer_12": 0.07861328125, "loss_aux_layer_13": 0.0853271484375, "loss_aux_layer_14": 0.0948486328125, "loss_aux_layer_15": 0.1041259765625, "loss_aux_layer_16": 0.1143798828125, "loss_aux_layer_17": 0.12255859375, "loss_aux_layer_18": 0.13134765625, "loss_aux_layer_19": 0.13525390625, "loss_aux_layer_2": 0.05303955078125, "loss_aux_layer_20": 0.142822265625, "loss_aux_layer_21": 0.151611328125, "loss_aux_layer_22": 0.1728515625, "loss_aux_layer_23": 0.213134765625, "loss_aux_layer_3": 0.06353759765625, "loss_aux_layer_4": 0.06634521484375, "loss_aux_layer_5": 0.068359375, "loss_aux_layer_6": 0.0714111328125, "loss_aux_layer_7": 0.0692138671875, "loss_aux_layer_8": 0.068359375, "loss_aux_layer_9": 0.06719970703125, "step": 2276, "total_loss": 0.6687888503074646 }, { "epoch": 0.4508018214215007, "grad_norm": 1.0708667039871216, "learning_rate": 5e-05, "llm_loss": 0.5940264984965324, "loss": 2.7497, "loss_aux_layer_0": 0.021087646484375, "loss_aux_layer_1": 0.04119873046875, "loss_aux_layer_10": 0.0670166015625, "loss_aux_layer_11": 0.0716552734375, "loss_aux_layer_12": 0.0767822265625, "loss_aux_layer_13": 0.0828857421875, "loss_aux_layer_14": 0.0921630859375, "loss_aux_layer_15": 0.1015625, "loss_aux_layer_16": 0.11181640625, "loss_aux_layer_17": 0.1199951171875, "loss_aux_layer_18": 0.129150390625, "loss_aux_layer_19": 0.13330078125, "loss_aux_layer_2": 0.05352783203125, "loss_aux_layer_20": 0.14111328125, "loss_aux_layer_21": 0.149169921875, "loss_aux_layer_22": 0.1708984375, "loss_aux_layer_23": 0.209228515625, "loss_aux_layer_3": 0.06378173828125, "loss_aux_layer_4": 0.066162109375, "loss_aux_layer_5": 0.06787109375, "loss_aux_layer_6": 0.0703125, "loss_aux_layer_7": 0.06787109375, "loss_aux_layer_8": 0.0672607421875, "loss_aux_layer_9": 0.065673828125, "step": 2277, "total_loss": 0.6874271333217621 }, { "epoch": 0.4509998020194021, "grad_norm": 1.0635353326797485, "learning_rate": 5e-05, "llm_loss": 0.627622127532959, "loss": 2.8994, "loss_aux_layer_0": 0.022216796875, "loss_aux_layer_1": 0.04296875, "loss_aux_layer_10": 0.071533203125, "loss_aux_layer_11": 0.0765380859375, "loss_aux_layer_12": 0.08154296875, "loss_aux_layer_13": 0.0880126953125, "loss_aux_layer_14": 0.0977783203125, "loss_aux_layer_15": 0.107421875, "loss_aux_layer_16": 0.1175537109375, "loss_aux_layer_17": 0.12548828125, "loss_aux_layer_18": 0.1339111328125, "loss_aux_layer_19": 0.136474609375, "loss_aux_layer_2": 0.05670166015625, "loss_aux_layer_20": 0.14404296875, "loss_aux_layer_21": 0.152099609375, "loss_aux_layer_22": 0.172607421875, "loss_aux_layer_23": 0.210693359375, "loss_aux_layer_3": 0.06695556640625, "loss_aux_layer_4": 0.06964111328125, "loss_aux_layer_5": 0.07147216796875, "loss_aux_layer_6": 0.074462890625, "loss_aux_layer_7": 0.07177734375, "loss_aux_layer_8": 0.07110595703125, "loss_aux_layer_9": 0.07000732421875, "step": 2278, "total_loss": 0.7248537093400955 }, { "epoch": 0.4511977826173035, "grad_norm": 1.3800128698349, "learning_rate": 5e-05, "llm_loss": 0.6220678240060806, "loss": 2.8913, "loss_aux_layer_0": 0.02020263671875, "loss_aux_layer_1": 0.044189453125, "loss_aux_layer_10": 0.0758056640625, "loss_aux_layer_11": 0.0806884765625, "loss_aux_layer_12": 0.086181640625, "loss_aux_layer_13": 0.0928955078125, "loss_aux_layer_14": 0.1021728515625, "loss_aux_layer_15": 0.11181640625, "loss_aux_layer_16": 0.122314453125, "loss_aux_layer_17": 0.1304931640625, "loss_aux_layer_18": 0.138427734375, "loss_aux_layer_19": 0.140380859375, "loss_aux_layer_2": 0.0582275390625, "loss_aux_layer_20": 0.147705078125, "loss_aux_layer_21": 0.155029296875, "loss_aux_layer_22": 0.176513671875, "loss_aux_layer_23": 0.215087890625, "loss_aux_layer_3": 0.0694580078125, "loss_aux_layer_4": 0.0726318359375, "loss_aux_layer_5": 0.074951171875, "loss_aux_layer_6": 0.0780029296875, "loss_aux_layer_7": 0.07568359375, "loss_aux_layer_8": 0.0751953125, "loss_aux_layer_9": 0.0740966796875, "step": 2279, "total_loss": 0.7228344678878784 }, { "epoch": 0.4513957632152049, "grad_norm": 0.9238417744636536, "learning_rate": 5e-05, "llm_loss": 0.6440335959196091, "loss": 2.9543, "loss_aux_layer_0": 0.02093505859375, "loss_aux_layer_1": 0.041259765625, "loss_aux_layer_10": 0.06927490234375, "loss_aux_layer_11": 0.0738525390625, "loss_aux_layer_12": 0.0787353515625, "loss_aux_layer_13": 0.0848388671875, "loss_aux_layer_14": 0.094482421875, "loss_aux_layer_15": 0.103759765625, "loss_aux_layer_16": 0.1138916015625, "loss_aux_layer_17": 0.1221923828125, "loss_aux_layer_18": 0.1304931640625, "loss_aux_layer_19": 0.133056640625, "loss_aux_layer_2": 0.05426025390625, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.1474609375, "loss_aux_layer_22": 0.16845703125, "loss_aux_layer_23": 0.20556640625, "loss_aux_layer_3": 0.06500244140625, "loss_aux_layer_4": 0.0682373046875, "loss_aux_layer_5": 0.0701904296875, "loss_aux_layer_6": 0.0732421875, "loss_aux_layer_7": 0.0703125, "loss_aux_layer_8": 0.069580078125, "loss_aux_layer_9": 0.068115234375, "step": 2280, "total_loss": 0.7385708391666412 }, { "epoch": 0.4515937438131063, "grad_norm": 1.3241671323776245, "learning_rate": 5e-05, "llm_loss": 0.6323457881808281, "loss": 2.9123, "loss_aux_layer_0": 0.02105712890625, "loss_aux_layer_1": 0.04168701171875, "loss_aux_layer_10": 0.070068359375, "loss_aux_layer_11": 0.0748291015625, "loss_aux_layer_12": 0.08056640625, "loss_aux_layer_13": 0.087158203125, "loss_aux_layer_14": 0.0966796875, "loss_aux_layer_15": 0.1060791015625, "loss_aux_layer_16": 0.115966796875, "loss_aux_layer_17": 0.123779296875, "loss_aux_layer_18": 0.1328125, "loss_aux_layer_19": 0.1357421875, "loss_aux_layer_2": 0.05474853515625, "loss_aux_layer_20": 0.142822265625, "loss_aux_layer_21": 0.150146484375, "loss_aux_layer_22": 0.17041015625, "loss_aux_layer_23": 0.20849609375, "loss_aux_layer_3": 0.06561279296875, "loss_aux_layer_4": 0.0682373046875, "loss_aux_layer_5": 0.0699462890625, "loss_aux_layer_6": 0.0728759765625, "loss_aux_layer_7": 0.0703125, "loss_aux_layer_8": 0.0693359375, "loss_aux_layer_9": 0.068359375, "step": 2281, "total_loss": 0.7280667126178741 }, { "epoch": 0.45179172441100773, "grad_norm": 1.436267614364624, "learning_rate": 5e-05, "llm_loss": 0.503307394683361, "loss": 2.3858, "loss_aux_layer_0": 0.020843505859375, "loss_aux_layer_1": 0.0400390625, "loss_aux_layer_10": 0.06805419921875, "loss_aux_layer_11": 0.072509765625, "loss_aux_layer_12": 0.077880859375, "loss_aux_layer_13": 0.083740234375, "loss_aux_layer_14": 0.093017578125, "loss_aux_layer_15": 0.1016845703125, "loss_aux_layer_16": 0.1112060546875, "loss_aux_layer_17": 0.11962890625, "loss_aux_layer_18": 0.128173828125, "loss_aux_layer_19": 0.131103515625, "loss_aux_layer_2": 0.05364990234375, "loss_aux_layer_20": 0.138427734375, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.16845703125, "loss_aux_layer_23": 0.208251953125, "loss_aux_layer_3": 0.0638427734375, "loss_aux_layer_4": 0.06640625, "loss_aux_layer_5": 0.06793212890625, "loss_aux_layer_6": 0.07025146484375, "loss_aux_layer_7": 0.068359375, "loss_aux_layer_8": 0.0677490234375, "loss_aux_layer_9": 0.06683349609375, "step": 2282, "total_loss": 0.5964553132653236 }, { "epoch": 0.4519897050089091, "grad_norm": 1.1104012727737427, "learning_rate": 5e-05, "llm_loss": 0.591802716255188, "loss": 2.7511, "loss_aux_layer_0": 0.021026611328125, "loss_aux_layer_1": 0.042236328125, "loss_aux_layer_10": 0.06982421875, "loss_aux_layer_11": 0.0745849609375, "loss_aux_layer_12": 0.07958984375, "loss_aux_layer_13": 0.0860595703125, "loss_aux_layer_14": 0.0953369140625, "loss_aux_layer_15": 0.1046142578125, "loss_aux_layer_16": 0.11474609375, "loss_aux_layer_17": 0.1226806640625, "loss_aux_layer_18": 0.13134765625, "loss_aux_layer_19": 0.13525390625, "loss_aux_layer_2": 0.0560302734375, "loss_aux_layer_20": 0.1435546875, "loss_aux_layer_21": 0.152587890625, "loss_aux_layer_22": 0.175048828125, "loss_aux_layer_23": 0.213623046875, "loss_aux_layer_3": 0.06585693359375, "loss_aux_layer_4": 0.0682373046875, "loss_aux_layer_5": 0.070068359375, "loss_aux_layer_6": 0.07275390625, "loss_aux_layer_7": 0.0704345703125, "loss_aux_layer_8": 0.069580078125, "loss_aux_layer_9": 0.068603515625, "step": 2283, "total_loss": 0.6877830177545547 }, { "epoch": 0.45218768560681055, "grad_norm": 1.163408637046814, "learning_rate": 5e-05, "llm_loss": 0.6477529853582382, "loss": 2.9661, "loss_aux_layer_0": 0.021881103515625, "loss_aux_layer_1": 0.0404052734375, "loss_aux_layer_10": 0.0677490234375, "loss_aux_layer_11": 0.072021484375, "loss_aux_layer_12": 0.077880859375, "loss_aux_layer_13": 0.0845947265625, "loss_aux_layer_14": 0.0946044921875, "loss_aux_layer_15": 0.104248046875, "loss_aux_layer_16": 0.1143798828125, "loss_aux_layer_17": 0.122314453125, "loss_aux_layer_18": 0.130615234375, "loss_aux_layer_19": 0.133544921875, "loss_aux_layer_2": 0.0526123046875, "loss_aux_layer_20": 0.140869140625, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.20849609375, "loss_aux_layer_3": 0.0621337890625, "loss_aux_layer_4": 0.0650634765625, "loss_aux_layer_5": 0.067138671875, "loss_aux_layer_6": 0.0701904296875, "loss_aux_layer_7": 0.067626953125, "loss_aux_layer_8": 0.067138671875, "loss_aux_layer_9": 0.0662841796875, "step": 2284, "total_loss": 0.7415257394313812 }, { "epoch": 0.45238566620471193, "grad_norm": 1.3039056062698364, "learning_rate": 5e-05, "llm_loss": 0.6465263590216637, "loss": 2.9626, "loss_aux_layer_0": 0.020050048828125, "loss_aux_layer_1": 0.04058837890625, "loss_aux_layer_10": 0.068603515625, "loss_aux_layer_11": 0.0732421875, "loss_aux_layer_12": 0.078369140625, "loss_aux_layer_13": 0.08447265625, "loss_aux_layer_14": 0.0936279296875, "loss_aux_layer_15": 0.102783203125, "loss_aux_layer_16": 0.112548828125, "loss_aux_layer_17": 0.120849609375, "loss_aux_layer_18": 0.1298828125, "loss_aux_layer_19": 0.13330078125, "loss_aux_layer_2": 0.05377197265625, "loss_aux_layer_20": 0.14111328125, "loss_aux_layer_21": 0.148681640625, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.208251953125, "loss_aux_layer_3": 0.0643310546875, "loss_aux_layer_4": 0.0675048828125, "loss_aux_layer_5": 0.0692138671875, "loss_aux_layer_6": 0.0723876953125, "loss_aux_layer_7": 0.069580078125, "loss_aux_layer_8": 0.0687255859375, "loss_aux_layer_9": 0.0673828125, "step": 2285, "total_loss": 0.7406621426343918 }, { "epoch": 0.45258364680261337, "grad_norm": 1.5360532999038696, "learning_rate": 5e-05, "llm_loss": 0.6226554214954376, "loss": 2.8869, "loss_aux_layer_0": 0.020843505859375, "loss_aux_layer_1": 0.0428466796875, "loss_aux_layer_10": 0.07281494140625, "loss_aux_layer_11": 0.077392578125, "loss_aux_layer_12": 0.0831298828125, "loss_aux_layer_13": 0.0899658203125, "loss_aux_layer_14": 0.0997314453125, "loss_aux_layer_15": 0.1099853515625, "loss_aux_layer_16": 0.12060546875, "loss_aux_layer_17": 0.12890625, "loss_aux_layer_18": 0.137451171875, "loss_aux_layer_19": 0.139892578125, "loss_aux_layer_2": 0.05718994140625, "loss_aux_layer_20": 0.1474609375, "loss_aux_layer_21": 0.1552734375, "loss_aux_layer_22": 0.175048828125, "loss_aux_layer_23": 0.21337890625, "loss_aux_layer_3": 0.0682373046875, "loss_aux_layer_4": 0.071044921875, "loss_aux_layer_5": 0.0726318359375, "loss_aux_layer_6": 0.075927734375, "loss_aux_layer_7": 0.0736083984375, "loss_aux_layer_8": 0.07275390625, "loss_aux_layer_9": 0.0716552734375, "step": 2286, "total_loss": 0.7217215150594711 }, { "epoch": 0.45278162740051475, "grad_norm": 1.4431456327438354, "learning_rate": 5e-05, "llm_loss": 0.5295931622385979, "loss": 2.514, "loss_aux_layer_0": 0.02191162109375, "loss_aux_layer_1": 0.0428466796875, "loss_aux_layer_10": 0.072265625, "loss_aux_layer_11": 0.077392578125, "loss_aux_layer_12": 0.0828857421875, "loss_aux_layer_13": 0.0894775390625, "loss_aux_layer_14": 0.099365234375, "loss_aux_layer_15": 0.109375, "loss_aux_layer_16": 0.1195068359375, "loss_aux_layer_17": 0.12744140625, "loss_aux_layer_18": 0.13623046875, "loss_aux_layer_19": 0.138916015625, "loss_aux_layer_2": 0.05718994140625, "loss_aux_layer_20": 0.146240234375, "loss_aux_layer_21": 0.155029296875, "loss_aux_layer_22": 0.17822265625, "loss_aux_layer_23": 0.21923828125, "loss_aux_layer_3": 0.0672607421875, "loss_aux_layer_4": 0.0699462890625, "loss_aux_layer_5": 0.072021484375, "loss_aux_layer_6": 0.0751953125, "loss_aux_layer_7": 0.0728759765625, "loss_aux_layer_8": 0.07177734375, "loss_aux_layer_9": 0.0709228515625, "step": 2287, "total_loss": 0.6285063624382019 }, { "epoch": 0.45297960799841613, "grad_norm": 1.4789983034133911, "learning_rate": 5e-05, "llm_loss": 0.6150927543640137, "loss": 2.8455, "loss_aux_layer_0": 0.020751953125, "loss_aux_layer_1": 0.03985595703125, "loss_aux_layer_10": 0.06982421875, "loss_aux_layer_11": 0.07470703125, "loss_aux_layer_12": 0.0797119140625, "loss_aux_layer_13": 0.0865478515625, "loss_aux_layer_14": 0.0965576171875, "loss_aux_layer_15": 0.106201171875, "loss_aux_layer_16": 0.1165771484375, "loss_aux_layer_17": 0.12451171875, "loss_aux_layer_18": 0.133544921875, "loss_aux_layer_19": 0.13720703125, "loss_aux_layer_2": 0.05322265625, "loss_aux_layer_20": 0.145263671875, "loss_aux_layer_21": 0.15380859375, "loss_aux_layer_22": 0.17578125, "loss_aux_layer_23": 0.216796875, "loss_aux_layer_3": 0.06396484375, "loss_aux_layer_4": 0.0670166015625, "loss_aux_layer_5": 0.0692138671875, "loss_aux_layer_6": 0.072509765625, "loss_aux_layer_7": 0.06982421875, "loss_aux_layer_8": 0.0689697265625, "loss_aux_layer_9": 0.0682373046875, "step": 2288, "total_loss": 0.7113712728023529 }, { "epoch": 0.45317758859631757, "grad_norm": 2.808720111846924, "learning_rate": 5e-05, "llm_loss": 0.5433274209499359, "loss": 2.5473, "loss_aux_layer_0": 0.021087646484375, "loss_aux_layer_1": 0.03985595703125, "loss_aux_layer_10": 0.0672607421875, "loss_aux_layer_11": 0.07177734375, "loss_aux_layer_12": 0.077392578125, "loss_aux_layer_13": 0.0841064453125, "loss_aux_layer_14": 0.0941162109375, "loss_aux_layer_15": 0.1041259765625, "loss_aux_layer_16": 0.1146240234375, "loss_aux_layer_17": 0.12255859375, "loss_aux_layer_18": 0.13134765625, "loss_aux_layer_19": 0.1337890625, "loss_aux_layer_2": 0.05169677734375, "loss_aux_layer_20": 0.1416015625, "loss_aux_layer_21": 0.149169921875, "loss_aux_layer_22": 0.16943359375, "loss_aux_layer_23": 0.20849609375, "loss_aux_layer_3": 0.06170654296875, "loss_aux_layer_4": 0.0643310546875, "loss_aux_layer_5": 0.06640625, "loss_aux_layer_6": 0.0694580078125, "loss_aux_layer_7": 0.0672607421875, "loss_aux_layer_8": 0.06671142578125, "loss_aux_layer_9": 0.065673828125, "step": 2289, "total_loss": 0.6368344128131866 }, { "epoch": 0.45337556919421895, "grad_norm": 1.3217707872390747, "learning_rate": 5e-05, "llm_loss": 0.6271073520183563, "loss": 2.9035, "loss_aux_layer_0": 0.0201416015625, "loss_aux_layer_1": 0.04229736328125, "loss_aux_layer_10": 0.0716552734375, "loss_aux_layer_11": 0.076416015625, "loss_aux_layer_12": 0.082275390625, "loss_aux_layer_13": 0.0894775390625, "loss_aux_layer_14": 0.0999755859375, "loss_aux_layer_15": 0.110107421875, "loss_aux_layer_16": 0.1204833984375, "loss_aux_layer_17": 0.1287841796875, "loss_aux_layer_18": 0.137451171875, "loss_aux_layer_19": 0.139892578125, "loss_aux_layer_2": 0.05682373046875, "loss_aux_layer_20": 0.14794921875, "loss_aux_layer_21": 0.1552734375, "loss_aux_layer_22": 0.176025390625, "loss_aux_layer_23": 0.21484375, "loss_aux_layer_3": 0.0673828125, "loss_aux_layer_4": 0.0703125, "loss_aux_layer_5": 0.072021484375, "loss_aux_layer_6": 0.0753173828125, "loss_aux_layer_7": 0.07275390625, "loss_aux_layer_8": 0.0716552734375, "loss_aux_layer_9": 0.0704345703125, "step": 2290, "total_loss": 0.7258719503879547 }, { "epoch": 0.4535735497921204, "grad_norm": 1.8839874267578125, "learning_rate": 5e-05, "llm_loss": 0.5828127264976501, "loss": 2.711, "loss_aux_layer_0": 0.021759033203125, "loss_aux_layer_1": 0.04168701171875, "loss_aux_layer_10": 0.06884765625, "loss_aux_layer_11": 0.0736083984375, "loss_aux_layer_12": 0.0789794921875, "loss_aux_layer_13": 0.0853271484375, "loss_aux_layer_14": 0.094970703125, "loss_aux_layer_15": 0.1041259765625, "loss_aux_layer_16": 0.1143798828125, "loss_aux_layer_17": 0.12255859375, "loss_aux_layer_18": 0.131103515625, "loss_aux_layer_19": 0.134521484375, "loss_aux_layer_2": 0.0543212890625, "loss_aux_layer_20": 0.142578125, "loss_aux_layer_21": 0.150390625, "loss_aux_layer_22": 0.172119140625, "loss_aux_layer_23": 0.2099609375, "loss_aux_layer_3": 0.064208984375, "loss_aux_layer_4": 0.06719970703125, "loss_aux_layer_5": 0.06884765625, "loss_aux_layer_6": 0.07177734375, "loss_aux_layer_7": 0.06982421875, "loss_aux_layer_8": 0.06884765625, "loss_aux_layer_9": 0.06781005859375, "step": 2291, "total_loss": 0.6777616739273071 }, { "epoch": 0.45377153039002177, "grad_norm": 1.3411996364593506, "learning_rate": 5e-05, "llm_loss": 0.656318187713623, "loss": 2.9944, "loss_aux_layer_0": 0.020477294921875, "loss_aux_layer_1": 0.03875732421875, "loss_aux_layer_10": 0.066162109375, "loss_aux_layer_11": 0.07080078125, "loss_aux_layer_12": 0.076416015625, "loss_aux_layer_13": 0.082763671875, "loss_aux_layer_14": 0.0924072265625, "loss_aux_layer_15": 0.1019287109375, "loss_aux_layer_16": 0.1123046875, "loss_aux_layer_17": 0.1204833984375, "loss_aux_layer_18": 0.129150390625, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.05133056640625, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.147216796875, "loss_aux_layer_22": 0.16748046875, "loss_aux_layer_23": 0.2041015625, "loss_aux_layer_3": 0.0616455078125, "loss_aux_layer_4": 0.0645751953125, "loss_aux_layer_5": 0.06640625, "loss_aux_layer_6": 0.0693359375, "loss_aux_layer_7": 0.06689453125, "loss_aux_layer_8": 0.0660400390625, "loss_aux_layer_9": 0.0648193359375, "step": 2292, "total_loss": 0.7486037015914917 }, { "epoch": 0.4539695109879232, "grad_norm": 1.5785455703735352, "learning_rate": 5e-05, "llm_loss": 0.5670004934072495, "loss": 2.6464, "loss_aux_layer_0": 0.021453857421875, "loss_aux_layer_1": 0.0406494140625, "loss_aux_layer_10": 0.06787109375, "loss_aux_layer_11": 0.072265625, "loss_aux_layer_12": 0.077880859375, "loss_aux_layer_13": 0.084228515625, "loss_aux_layer_14": 0.0943603515625, "loss_aux_layer_15": 0.1046142578125, "loss_aux_layer_16": 0.115478515625, "loss_aux_layer_17": 0.123779296875, "loss_aux_layer_18": 0.1328125, "loss_aux_layer_19": 0.13623046875, "loss_aux_layer_2": 0.05364990234375, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.151123046875, "loss_aux_layer_22": 0.171142578125, "loss_aux_layer_23": 0.20947265625, "loss_aux_layer_3": 0.06304931640625, "loss_aux_layer_4": 0.0657958984375, "loss_aux_layer_5": 0.0677490234375, "loss_aux_layer_6": 0.070556640625, "loss_aux_layer_7": 0.0682373046875, "loss_aux_layer_8": 0.067626953125, "loss_aux_layer_9": 0.0662841796875, "step": 2293, "total_loss": 0.6615959256887436 }, { "epoch": 0.4541674915858246, "grad_norm": 1.1229537725448608, "learning_rate": 5e-05, "llm_loss": 0.5614857822656631, "loss": 2.6496, "loss_aux_layer_0": 0.021331787109375, "loss_aux_layer_1": 0.04425048828125, "loss_aux_layer_10": 0.075439453125, "loss_aux_layer_11": 0.080322265625, "loss_aux_layer_12": 0.0855712890625, "loss_aux_layer_13": 0.0919189453125, "loss_aux_layer_14": 0.101806640625, "loss_aux_layer_15": 0.1112060546875, "loss_aux_layer_16": 0.1209716796875, "loss_aux_layer_17": 0.129150390625, "loss_aux_layer_18": 0.137451171875, "loss_aux_layer_19": 0.140380859375, "loss_aux_layer_2": 0.05938720703125, "loss_aux_layer_20": 0.148193359375, "loss_aux_layer_21": 0.15478515625, "loss_aux_layer_22": 0.176025390625, "loss_aux_layer_23": 0.21484375, "loss_aux_layer_3": 0.071044921875, "loss_aux_layer_4": 0.0743408203125, "loss_aux_layer_5": 0.076416015625, "loss_aux_layer_6": 0.079833984375, "loss_aux_layer_7": 0.0767822265625, "loss_aux_layer_8": 0.075927734375, "loss_aux_layer_9": 0.0743408203125, "step": 2294, "total_loss": 0.6623988300561905 }, { "epoch": 0.45436547218372597, "grad_norm": 1.205080509185791, "learning_rate": 5e-05, "llm_loss": 0.647868275642395, "loss": 2.9773, "loss_aux_layer_0": 0.02117919921875, "loss_aux_layer_1": 0.041748046875, "loss_aux_layer_10": 0.072265625, "loss_aux_layer_11": 0.0767822265625, "loss_aux_layer_12": 0.08154296875, "loss_aux_layer_13": 0.087158203125, "loss_aux_layer_14": 0.0968017578125, "loss_aux_layer_15": 0.1063232421875, "loss_aux_layer_16": 0.115966796875, "loss_aux_layer_17": 0.1243896484375, "loss_aux_layer_18": 0.1328125, "loss_aux_layer_19": 0.1357421875, "loss_aux_layer_2": 0.05560302734375, "loss_aux_layer_20": 0.1435546875, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.20556640625, "loss_aux_layer_3": 0.06695556640625, "loss_aux_layer_4": 0.070068359375, "loss_aux_layer_5": 0.072021484375, "loss_aux_layer_6": 0.0750732421875, "loss_aux_layer_7": 0.0723876953125, "loss_aux_layer_8": 0.0716552734375, "loss_aux_layer_9": 0.0709228515625, "step": 2295, "total_loss": 0.7443266957998276 }, { "epoch": 0.4545634527816274, "grad_norm": 2.0035810470581055, "learning_rate": 5e-05, "llm_loss": 0.6444718688726425, "loss": 2.9698, "loss_aux_layer_0": 0.020477294921875, "loss_aux_layer_1": 0.0426025390625, "loss_aux_layer_10": 0.0723876953125, "loss_aux_layer_11": 0.077392578125, "loss_aux_layer_12": 0.0826416015625, "loss_aux_layer_13": 0.0889892578125, "loss_aux_layer_14": 0.098388671875, "loss_aux_layer_15": 0.10791015625, "loss_aux_layer_16": 0.1180419921875, "loss_aux_layer_17": 0.1260986328125, "loss_aux_layer_18": 0.13427734375, "loss_aux_layer_19": 0.13720703125, "loss_aux_layer_2": 0.0576171875, "loss_aux_layer_20": 0.14453125, "loss_aux_layer_21": 0.151123046875, "loss_aux_layer_22": 0.172607421875, "loss_aux_layer_23": 0.2099609375, "loss_aux_layer_3": 0.068603515625, "loss_aux_layer_4": 0.072021484375, "loss_aux_layer_5": 0.0738525390625, "loss_aux_layer_6": 0.0767822265625, "loss_aux_layer_7": 0.07373046875, "loss_aux_layer_8": 0.072509765625, "loss_aux_layer_9": 0.071044921875, "step": 2296, "total_loss": 0.7424390017986298 }, { "epoch": 0.4547614333795288, "grad_norm": 1.2042994499206543, "learning_rate": 5e-05, "llm_loss": 0.6242331117391586, "loss": 2.8755, "loss_aux_layer_0": 0.023345947265625, "loss_aux_layer_1": 0.04229736328125, "loss_aux_layer_10": 0.068359375, "loss_aux_layer_11": 0.0728759765625, "loss_aux_layer_12": 0.0780029296875, "loss_aux_layer_13": 0.0843505859375, "loss_aux_layer_14": 0.0941162109375, "loss_aux_layer_15": 0.1036376953125, "loss_aux_layer_16": 0.1138916015625, "loss_aux_layer_17": 0.12158203125, "loss_aux_layer_18": 0.13037109375, "loss_aux_layer_19": 0.133544921875, "loss_aux_layer_2": 0.05413818359375, "loss_aux_layer_20": 0.1416015625, "loss_aux_layer_21": 0.149658203125, "loss_aux_layer_22": 0.171875, "loss_aux_layer_23": 0.211669921875, "loss_aux_layer_3": 0.0640869140625, "loss_aux_layer_4": 0.06689453125, "loss_aux_layer_5": 0.0687255859375, "loss_aux_layer_6": 0.07177734375, "loss_aux_layer_7": 0.069091796875, "loss_aux_layer_8": 0.068115234375, "loss_aux_layer_9": 0.067138671875, "step": 2297, "total_loss": 0.7188797146081924 }, { "epoch": 0.4549594139774302, "grad_norm": 1.26747465133667, "learning_rate": 5e-05, "llm_loss": 0.69168721139431, "loss": 3.1526, "loss_aux_layer_0": 0.022796630859375, "loss_aux_layer_1": 0.04400634765625, "loss_aux_layer_10": 0.071533203125, "loss_aux_layer_11": 0.0762939453125, "loss_aux_layer_12": 0.0814208984375, "loss_aux_layer_13": 0.0867919921875, "loss_aux_layer_14": 0.095703125, "loss_aux_layer_15": 0.1046142578125, "loss_aux_layer_16": 0.1141357421875, "loss_aux_layer_17": 0.1214599609375, "loss_aux_layer_18": 0.130126953125, "loss_aux_layer_19": 0.13330078125, "loss_aux_layer_2": 0.05712890625, "loss_aux_layer_20": 0.140869140625, "loss_aux_layer_21": 0.148681640625, "loss_aux_layer_22": 0.1708984375, "loss_aux_layer_23": 0.2099609375, "loss_aux_layer_3": 0.0687255859375, "loss_aux_layer_4": 0.0718994140625, "loss_aux_layer_5": 0.0733642578125, "loss_aux_layer_6": 0.0760498046875, "loss_aux_layer_7": 0.0732421875, "loss_aux_layer_8": 0.072021484375, "loss_aux_layer_9": 0.0704345703125, "step": 2298, "total_loss": 0.7881521582603455 }, { "epoch": 0.4551573945753316, "grad_norm": 1.3090471029281616, "learning_rate": 5e-05, "llm_loss": 0.6278151273727417, "loss": 2.8949, "loss_aux_layer_0": 0.02008056640625, "loss_aux_layer_1": 0.04266357421875, "loss_aux_layer_10": 0.07080078125, "loss_aux_layer_11": 0.07568359375, "loss_aux_layer_12": 0.0809326171875, "loss_aux_layer_13": 0.08740234375, "loss_aux_layer_14": 0.0960693359375, "loss_aux_layer_15": 0.10546875, "loss_aux_layer_16": 0.114990234375, "loss_aux_layer_17": 0.122802734375, "loss_aux_layer_18": 0.130859375, "loss_aux_layer_19": 0.1337890625, "loss_aux_layer_2": 0.05633544921875, "loss_aux_layer_20": 0.14013671875, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.20751953125, "loss_aux_layer_3": 0.067626953125, "loss_aux_layer_4": 0.070556640625, "loss_aux_layer_5": 0.0718994140625, "loss_aux_layer_6": 0.074951171875, "loss_aux_layer_7": 0.072265625, "loss_aux_layer_8": 0.071044921875, "loss_aux_layer_9": 0.0697021484375, "step": 2299, "total_loss": 0.7237153500318527 }, { "epoch": 0.45535537517323305, "grad_norm": 1.2442216873168945, "learning_rate": 5e-05, "llm_loss": 0.60713991522789, "loss": 2.798, "loss_aux_layer_0": 0.022003173828125, "loss_aux_layer_1": 0.04150390625, "loss_aux_layer_10": 0.067138671875, "loss_aux_layer_11": 0.071533203125, "loss_aux_layer_12": 0.0767822265625, "loss_aux_layer_13": 0.0830078125, "loss_aux_layer_14": 0.0921630859375, "loss_aux_layer_15": 0.101806640625, "loss_aux_layer_16": 0.111083984375, "loss_aux_layer_17": 0.119140625, "loss_aux_layer_18": 0.1273193359375, "loss_aux_layer_19": 0.130126953125, "loss_aux_layer_2": 0.05377197265625, "loss_aux_layer_20": 0.13720703125, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.16455078125, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.0638427734375, "loss_aux_layer_4": 0.0662841796875, "loss_aux_layer_5": 0.067626953125, "loss_aux_layer_6": 0.0704345703125, "loss_aux_layer_7": 0.068115234375, "loss_aux_layer_8": 0.0672607421875, "loss_aux_layer_9": 0.06591796875, "step": 2300, "total_loss": 0.6994875222444534 }, { "epoch": 0.4555533557711344, "grad_norm": 1.7547054290771484, "learning_rate": 5e-05, "llm_loss": 0.6313611418008804, "loss": 2.909, "loss_aux_layer_0": 0.022613525390625, "loss_aux_layer_1": 0.04327392578125, "loss_aux_layer_10": 0.0709228515625, "loss_aux_layer_11": 0.0753173828125, "loss_aux_layer_12": 0.080322265625, "loss_aux_layer_13": 0.0865478515625, "loss_aux_layer_14": 0.0960693359375, "loss_aux_layer_15": 0.105712890625, "loss_aux_layer_16": 0.115478515625, "loss_aux_layer_17": 0.1236572265625, "loss_aux_layer_18": 0.1318359375, "loss_aux_layer_19": 0.13427734375, "loss_aux_layer_2": 0.05657958984375, "loss_aux_layer_20": 0.14111328125, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.16748046875, "loss_aux_layer_23": 0.204833984375, "loss_aux_layer_3": 0.0673828125, "loss_aux_layer_4": 0.0703125, "loss_aux_layer_5": 0.07177734375, "loss_aux_layer_6": 0.074951171875, "loss_aux_layer_7": 0.072265625, "loss_aux_layer_8": 0.0711669921875, "loss_aux_layer_9": 0.0697021484375, "step": 2301, "total_loss": 0.7272515147924423 }, { "epoch": 0.4557513363690358, "grad_norm": 1.2136831283569336, "learning_rate": 5e-05, "llm_loss": 0.6318882554769516, "loss": 2.9143, "loss_aux_layer_0": 0.0216064453125, "loss_aux_layer_1": 0.04290771484375, "loss_aux_layer_10": 0.072021484375, "loss_aux_layer_11": 0.07666015625, "loss_aux_layer_12": 0.08203125, "loss_aux_layer_13": 0.088134765625, "loss_aux_layer_14": 0.097412109375, "loss_aux_layer_15": 0.106689453125, "loss_aux_layer_16": 0.1163330078125, "loss_aux_layer_17": 0.1241455078125, "loss_aux_layer_18": 0.1318359375, "loss_aux_layer_19": 0.13427734375, "loss_aux_layer_2": 0.0570068359375, "loss_aux_layer_20": 0.140869140625, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.20654296875, "loss_aux_layer_3": 0.068115234375, "loss_aux_layer_4": 0.0709228515625, "loss_aux_layer_5": 0.0731201171875, "loss_aux_layer_6": 0.0760498046875, "loss_aux_layer_7": 0.0733642578125, "loss_aux_layer_8": 0.0721435546875, "loss_aux_layer_9": 0.070556640625, "step": 2302, "total_loss": 0.7285769879817963 }, { "epoch": 0.45594931696693725, "grad_norm": 1.355014443397522, "learning_rate": 5e-05, "llm_loss": 0.605949804186821, "loss": 2.8196, "loss_aux_layer_0": 0.02105712890625, "loss_aux_layer_1": 0.0430908203125, "loss_aux_layer_10": 0.072265625, "loss_aux_layer_11": 0.0771484375, "loss_aux_layer_12": 0.08251953125, "loss_aux_layer_13": 0.0892333984375, "loss_aux_layer_14": 0.0994873046875, "loss_aux_layer_15": 0.109130859375, "loss_aux_layer_16": 0.1195068359375, "loss_aux_layer_17": 0.1279296875, "loss_aux_layer_18": 0.13720703125, "loss_aux_layer_19": 0.139892578125, "loss_aux_layer_2": 0.05657958984375, "loss_aux_layer_20": 0.147216796875, "loss_aux_layer_21": 0.1552734375, "loss_aux_layer_22": 0.176513671875, "loss_aux_layer_23": 0.216796875, "loss_aux_layer_3": 0.0675048828125, "loss_aux_layer_4": 0.0704345703125, "loss_aux_layer_5": 0.0726318359375, "loss_aux_layer_6": 0.076171875, "loss_aux_layer_7": 0.0733642578125, "loss_aux_layer_8": 0.072509765625, "loss_aux_layer_9": 0.0712890625, "step": 2303, "total_loss": 0.7049111425876617 }, { "epoch": 0.4561472975648386, "grad_norm": 1.066913366317749, "learning_rate": 5e-05, "llm_loss": 0.6093509048223495, "loss": 2.8178, "loss_aux_layer_0": 0.020904541015625, "loss_aux_layer_1": 0.04241943359375, "loss_aux_layer_10": 0.0697021484375, "loss_aux_layer_11": 0.074462890625, "loss_aux_layer_12": 0.0792236328125, "loss_aux_layer_13": 0.0853271484375, "loss_aux_layer_14": 0.094482421875, "loss_aux_layer_15": 0.103759765625, "loss_aux_layer_16": 0.1134033203125, "loss_aux_layer_17": 0.12158203125, "loss_aux_layer_18": 0.1302490234375, "loss_aux_layer_19": 0.1328125, "loss_aux_layer_2": 0.055419921875, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.1484375, "loss_aux_layer_22": 0.171142578125, "loss_aux_layer_23": 0.2099609375, "loss_aux_layer_3": 0.06597900390625, "loss_aux_layer_4": 0.06890869140625, "loss_aux_layer_5": 0.0704345703125, "loss_aux_layer_6": 0.0733642578125, "loss_aux_layer_7": 0.07080078125, "loss_aux_layer_8": 0.06982421875, "loss_aux_layer_9": 0.0684814453125, "step": 2304, "total_loss": 0.704449862241745 }, { "epoch": 0.45634527816274006, "grad_norm": 0.9285058379173279, "learning_rate": 5e-05, "llm_loss": 0.6494471728801727, "loss": 2.9898, "loss_aux_layer_0": 0.02044677734375, "loss_aux_layer_1": 0.044677734375, "loss_aux_layer_10": 0.0732421875, "loss_aux_layer_11": 0.0780029296875, "loss_aux_layer_12": 0.0831298828125, "loss_aux_layer_13": 0.0892333984375, "loss_aux_layer_14": 0.0986328125, "loss_aux_layer_15": 0.10791015625, "loss_aux_layer_16": 0.11767578125, "loss_aux_layer_17": 0.1258544921875, "loss_aux_layer_18": 0.1337890625, "loss_aux_layer_19": 0.135986328125, "loss_aux_layer_2": 0.0579833984375, "loss_aux_layer_20": 0.143310546875, "loss_aux_layer_21": 0.150634765625, "loss_aux_layer_22": 0.171142578125, "loss_aux_layer_23": 0.208984375, "loss_aux_layer_3": 0.069091796875, "loss_aux_layer_4": 0.0721435546875, "loss_aux_layer_5": 0.0736083984375, "loss_aux_layer_6": 0.0767822265625, "loss_aux_layer_7": 0.07421875, "loss_aux_layer_8": 0.0733642578125, "loss_aux_layer_9": 0.0718994140625, "step": 2305, "total_loss": 0.7474491745233536 }, { "epoch": 0.45654325876064145, "grad_norm": 0.9180092215538025, "learning_rate": 5e-05, "llm_loss": 0.6222135275602341, "loss": 2.8815, "loss_aux_layer_0": 0.020355224609375, "loss_aux_layer_1": 0.04364013671875, "loss_aux_layer_10": 0.074951171875, "loss_aux_layer_11": 0.080078125, "loss_aux_layer_12": 0.0849609375, "loss_aux_layer_13": 0.09130859375, "loss_aux_layer_14": 0.10009765625, "loss_aux_layer_15": 0.1090087890625, "loss_aux_layer_16": 0.11865234375, "loss_aux_layer_17": 0.1265869140625, "loss_aux_layer_18": 0.134033203125, "loss_aux_layer_19": 0.13525390625, "loss_aux_layer_2": 0.0576171875, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.16748046875, "loss_aux_layer_23": 0.20361328125, "loss_aux_layer_3": 0.0693359375, "loss_aux_layer_4": 0.07275390625, "loss_aux_layer_5": 0.0745849609375, "loss_aux_layer_6": 0.077880859375, "loss_aux_layer_7": 0.0755615234375, "loss_aux_layer_8": 0.074951171875, "loss_aux_layer_9": 0.0733642578125, "step": 2306, "total_loss": 0.7203815579414368 }, { "epoch": 0.4567412393585429, "grad_norm": 0.9234588146209717, "learning_rate": 5e-05, "llm_loss": 0.6465259492397308, "loss": 2.9636, "loss_aux_layer_0": 0.021240234375, "loss_aux_layer_1": 0.0408935546875, "loss_aux_layer_10": 0.068359375, "loss_aux_layer_11": 0.0731201171875, "loss_aux_layer_12": 0.0784912109375, "loss_aux_layer_13": 0.084716796875, "loss_aux_layer_14": 0.0946044921875, "loss_aux_layer_15": 0.1036376953125, "loss_aux_layer_16": 0.1138916015625, "loss_aux_layer_17": 0.12255859375, "loss_aux_layer_18": 0.130859375, "loss_aux_layer_19": 0.1337890625, "loss_aux_layer_2": 0.05267333984375, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.149658203125, "loss_aux_layer_22": 0.17138671875, "loss_aux_layer_23": 0.210205078125, "loss_aux_layer_3": 0.06317138671875, "loss_aux_layer_4": 0.066162109375, "loss_aux_layer_5": 0.0679931640625, "loss_aux_layer_6": 0.0712890625, "loss_aux_layer_7": 0.069091796875, "loss_aux_layer_8": 0.068359375, "loss_aux_layer_9": 0.0670166015625, "step": 2307, "total_loss": 0.7408957332372665 }, { "epoch": 0.45693921995644426, "grad_norm": 1.6614691019058228, "learning_rate": 5e-05, "llm_loss": 0.6340740472078323, "loss": 2.9187, "loss_aux_layer_0": 0.0201416015625, "loss_aux_layer_1": 0.04132080078125, "loss_aux_layer_10": 0.07080078125, "loss_aux_layer_11": 0.0751953125, "loss_aux_layer_12": 0.080322265625, "loss_aux_layer_13": 0.0860595703125, "loss_aux_layer_14": 0.0953369140625, "loss_aux_layer_15": 0.104736328125, "loss_aux_layer_16": 0.11474609375, "loss_aux_layer_17": 0.12255859375, "loss_aux_layer_18": 0.13134765625, "loss_aux_layer_19": 0.134521484375, "loss_aux_layer_2": 0.0540771484375, "loss_aux_layer_20": 0.142578125, "loss_aux_layer_21": 0.150634765625, "loss_aux_layer_22": 0.17236328125, "loss_aux_layer_23": 0.21142578125, "loss_aux_layer_3": 0.0648193359375, "loss_aux_layer_4": 0.06787109375, "loss_aux_layer_5": 0.06982421875, "loss_aux_layer_6": 0.0731201171875, "loss_aux_layer_7": 0.07080078125, "loss_aux_layer_8": 0.0704345703125, "loss_aux_layer_9": 0.0692138671875, "step": 2308, "total_loss": 0.7296804189682007 }, { "epoch": 0.4571372005543457, "grad_norm": 1.1478173732757568, "learning_rate": 5e-05, "llm_loss": 0.6129767745733261, "loss": 2.8288, "loss_aux_layer_0": 0.020416259765625, "loss_aux_layer_1": 0.0418701171875, "loss_aux_layer_10": 0.06927490234375, "loss_aux_layer_11": 0.0736083984375, "loss_aux_layer_12": 0.0787353515625, "loss_aux_layer_13": 0.0845947265625, "loss_aux_layer_14": 0.09375, "loss_aux_layer_15": 0.10302734375, "loss_aux_layer_16": 0.1129150390625, "loss_aux_layer_17": 0.120849609375, "loss_aux_layer_18": 0.1295166015625, "loss_aux_layer_19": 0.132080078125, "loss_aux_layer_2": 0.05450439453125, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.147216796875, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.20654296875, "loss_aux_layer_3": 0.0653076171875, "loss_aux_layer_4": 0.0682373046875, "loss_aux_layer_5": 0.06988525390625, "loss_aux_layer_6": 0.0728759765625, "loss_aux_layer_7": 0.0704345703125, "loss_aux_layer_8": 0.069580078125, "loss_aux_layer_9": 0.0684814453125, "step": 2309, "total_loss": 0.7072112560272217 }, { "epoch": 0.4573351811522471, "grad_norm": 1.4193021059036255, "learning_rate": 5e-05, "llm_loss": 0.6296727955341339, "loss": 2.9039, "loss_aux_layer_0": 0.020904541015625, "loss_aux_layer_1": 0.04534912109375, "loss_aux_layer_10": 0.0721435546875, "loss_aux_layer_11": 0.07666015625, "loss_aux_layer_12": 0.08154296875, "loss_aux_layer_13": 0.08740234375, "loss_aux_layer_14": 0.0960693359375, "loss_aux_layer_15": 0.1046142578125, "loss_aux_layer_16": 0.1136474609375, "loss_aux_layer_17": 0.12109375, "loss_aux_layer_18": 0.12890625, "loss_aux_layer_19": 0.131103515625, "loss_aux_layer_2": 0.0592041015625, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.1474609375, "loss_aux_layer_22": 0.169189453125, "loss_aux_layer_23": 0.20751953125, "loss_aux_layer_3": 0.069580078125, "loss_aux_layer_4": 0.072265625, "loss_aux_layer_5": 0.0736083984375, "loss_aux_layer_6": 0.076416015625, "loss_aux_layer_7": 0.073486328125, "loss_aux_layer_8": 0.072265625, "loss_aux_layer_9": 0.07080078125, "step": 2310, "total_loss": 0.7259780019521713 }, { "epoch": 0.45753316175014846, "grad_norm": 1.1746313571929932, "learning_rate": 5e-05, "llm_loss": 0.6088474541902542, "loss": 2.812, "loss_aux_layer_0": 0.021148681640625, "loss_aux_layer_1": 0.04083251953125, "loss_aux_layer_10": 0.0687255859375, "loss_aux_layer_11": 0.072998046875, "loss_aux_layer_12": 0.0784912109375, "loss_aux_layer_13": 0.0850830078125, "loss_aux_layer_14": 0.0947265625, "loss_aux_layer_15": 0.1038818359375, "loss_aux_layer_16": 0.1141357421875, "loss_aux_layer_17": 0.1219482421875, "loss_aux_layer_18": 0.13037109375, "loss_aux_layer_19": 0.134033203125, "loss_aux_layer_2": 0.05352783203125, "loss_aux_layer_20": 0.1416015625, "loss_aux_layer_21": 0.1484375, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.2060546875, "loss_aux_layer_3": 0.06396484375, "loss_aux_layer_4": 0.0667724609375, "loss_aux_layer_5": 0.068359375, "loss_aux_layer_6": 0.0716552734375, "loss_aux_layer_7": 0.06884765625, "loss_aux_layer_8": 0.0679931640625, "loss_aux_layer_9": 0.0672607421875, "step": 2311, "total_loss": 0.7029913365840912 }, { "epoch": 0.4577311423480499, "grad_norm": 2.0339300632476807, "learning_rate": 5e-05, "llm_loss": 0.5919976234436035, "loss": 2.7452, "loss_aux_layer_0": 0.020233154296875, "loss_aux_layer_1": 0.04278564453125, "loss_aux_layer_10": 0.0684814453125, "loss_aux_layer_11": 0.0731201171875, "loss_aux_layer_12": 0.0782470703125, "loss_aux_layer_13": 0.08447265625, "loss_aux_layer_14": 0.09423828125, "loss_aux_layer_15": 0.1038818359375, "loss_aux_layer_16": 0.114013671875, "loss_aux_layer_17": 0.122314453125, "loss_aux_layer_18": 0.13037109375, "loss_aux_layer_19": 0.1328125, "loss_aux_layer_2": 0.05511474609375, "loss_aux_layer_20": 0.140380859375, "loss_aux_layer_21": 0.147216796875, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.06591796875, "loss_aux_layer_4": 0.068359375, "loss_aux_layer_5": 0.0697021484375, "loss_aux_layer_6": 0.072509765625, "loss_aux_layer_7": 0.0697021484375, "loss_aux_layer_8": 0.06884765625, "loss_aux_layer_9": 0.0675048828125, "step": 2312, "total_loss": 0.6862944066524506 }, { "epoch": 0.4579291229459513, "grad_norm": 1.3291763067245483, "learning_rate": 5e-05, "llm_loss": 0.5487587451934814, "loss": 2.5661, "loss_aux_layer_0": 0.02178955078125, "loss_aux_layer_1": 0.041748046875, "loss_aux_layer_10": 0.0679931640625, "loss_aux_layer_11": 0.0723876953125, "loss_aux_layer_12": 0.0770263671875, "loss_aux_layer_13": 0.082763671875, "loss_aux_layer_14": 0.091552734375, "loss_aux_layer_15": 0.1007080078125, "loss_aux_layer_16": 0.10986328125, "loss_aux_layer_17": 0.1175537109375, "loss_aux_layer_18": 0.126220703125, "loss_aux_layer_19": 0.129638671875, "loss_aux_layer_2": 0.05511474609375, "loss_aux_layer_20": 0.13720703125, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.203857421875, "loss_aux_layer_3": 0.0650634765625, "loss_aux_layer_4": 0.0677490234375, "loss_aux_layer_5": 0.069580078125, "loss_aux_layer_6": 0.072021484375, "loss_aux_layer_7": 0.0697021484375, "loss_aux_layer_8": 0.0684814453125, "loss_aux_layer_9": 0.06689453125, "step": 2313, "total_loss": 0.6415158212184906 }, { "epoch": 0.4581271035438527, "grad_norm": 1.1861355304718018, "learning_rate": 5e-05, "llm_loss": 0.5398144572973251, "loss": 2.5494, "loss_aux_layer_0": 0.021392822265625, "loss_aux_layer_1": 0.0428466796875, "loss_aux_layer_10": 0.0728759765625, "loss_aux_layer_11": 0.0772705078125, "loss_aux_layer_12": 0.082763671875, "loss_aux_layer_13": 0.0892333984375, "loss_aux_layer_14": 0.098388671875, "loss_aux_layer_15": 0.107421875, "loss_aux_layer_16": 0.116943359375, "loss_aux_layer_17": 0.1243896484375, "loss_aux_layer_18": 0.13330078125, "loss_aux_layer_19": 0.135498046875, "loss_aux_layer_2": 0.05792236328125, "loss_aux_layer_20": 0.142822265625, "loss_aux_layer_21": 0.14990234375, "loss_aux_layer_22": 0.169921875, "loss_aux_layer_23": 0.208251953125, "loss_aux_layer_3": 0.0689697265625, "loss_aux_layer_4": 0.07177734375, "loss_aux_layer_5": 0.07373046875, "loss_aux_layer_6": 0.0765380859375, "loss_aux_layer_7": 0.0740966796875, "loss_aux_layer_8": 0.0731201171875, "loss_aux_layer_9": 0.0718994140625, "step": 2314, "total_loss": 0.6373562812805176 }, { "epoch": 0.4583250841417541, "grad_norm": 1.4348132610321045, "learning_rate": 5e-05, "llm_loss": 0.6043047681450844, "loss": 2.8055, "loss_aux_layer_0": 0.02117919921875, "loss_aux_layer_1": 0.04254150390625, "loss_aux_layer_10": 0.071533203125, "loss_aux_layer_11": 0.075927734375, "loss_aux_layer_12": 0.0811767578125, "loss_aux_layer_13": 0.08740234375, "loss_aux_layer_14": 0.0965576171875, "loss_aux_layer_15": 0.10595703125, "loss_aux_layer_16": 0.115966796875, "loss_aux_layer_17": 0.1241455078125, "loss_aux_layer_18": 0.1328125, "loss_aux_layer_19": 0.135986328125, "loss_aux_layer_2": 0.056396484375, "loss_aux_layer_20": 0.14306640625, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.174560546875, "loss_aux_layer_23": 0.214111328125, "loss_aux_layer_3": 0.0677490234375, "loss_aux_layer_4": 0.070556640625, "loss_aux_layer_5": 0.07177734375, "loss_aux_layer_6": 0.074951171875, "loss_aux_layer_7": 0.072509765625, "loss_aux_layer_8": 0.0714111328125, "loss_aux_layer_9": 0.070068359375, "step": 2315, "total_loss": 0.7013740390539169 }, { "epoch": 0.45852306473965554, "grad_norm": 1.368390679359436, "learning_rate": 5e-05, "llm_loss": 0.5806214660406113, "loss": 2.7116, "loss_aux_layer_0": 0.02203369140625, "loss_aux_layer_1": 0.04315185546875, "loss_aux_layer_10": 0.0721435546875, "loss_aux_layer_11": 0.0771484375, "loss_aux_layer_12": 0.082275390625, "loss_aux_layer_13": 0.0882568359375, "loss_aux_layer_14": 0.097412109375, "loss_aux_layer_15": 0.106689453125, "loss_aux_layer_16": 0.1165771484375, "loss_aux_layer_17": 0.1241455078125, "loss_aux_layer_18": 0.131591796875, "loss_aux_layer_19": 0.134765625, "loss_aux_layer_2": 0.05816650390625, "loss_aux_layer_20": 0.14208984375, "loss_aux_layer_21": 0.1494140625, "loss_aux_layer_22": 0.171630859375, "loss_aux_layer_23": 0.20947265625, "loss_aux_layer_3": 0.0684814453125, "loss_aux_layer_4": 0.0716552734375, "loss_aux_layer_5": 0.073486328125, "loss_aux_layer_6": 0.076416015625, "loss_aux_layer_7": 0.07373046875, "loss_aux_layer_8": 0.07275390625, "loss_aux_layer_9": 0.0709228515625, "step": 2316, "total_loss": 0.6779026091098785 }, { "epoch": 0.4587210453375569, "grad_norm": 1.4676986932754517, "learning_rate": 5e-05, "llm_loss": 0.5334630385041237, "loss": 2.5197, "loss_aux_layer_0": 0.02105712890625, "loss_aux_layer_1": 0.04364013671875, "loss_aux_layer_10": 0.0712890625, "loss_aux_layer_11": 0.0755615234375, "loss_aux_layer_12": 0.0806884765625, "loss_aux_layer_13": 0.0869140625, "loss_aux_layer_14": 0.0955810546875, "loss_aux_layer_15": 0.1044921875, "loss_aux_layer_16": 0.1143798828125, "loss_aux_layer_17": 0.1217041015625, "loss_aux_layer_18": 0.13037109375, "loss_aux_layer_19": 0.1337890625, "loss_aux_layer_2": 0.05755615234375, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.150146484375, "loss_aux_layer_22": 0.171630859375, "loss_aux_layer_23": 0.2099609375, "loss_aux_layer_3": 0.068359375, "loss_aux_layer_4": 0.0712890625, "loss_aux_layer_5": 0.0728759765625, "loss_aux_layer_6": 0.07568359375, "loss_aux_layer_7": 0.0728759765625, "loss_aux_layer_8": 0.072021484375, "loss_aux_layer_9": 0.0704345703125, "step": 2317, "total_loss": 0.6299140155315399 }, { "epoch": 0.4589190259354583, "grad_norm": 1.3498597145080566, "learning_rate": 5e-05, "llm_loss": 0.6256930530071259, "loss": 2.8958, "loss_aux_layer_0": 0.021331787109375, "loss_aux_layer_1": 0.04351806640625, "loss_aux_layer_10": 0.07373046875, "loss_aux_layer_11": 0.078369140625, "loss_aux_layer_12": 0.0836181640625, "loss_aux_layer_13": 0.0899658203125, "loss_aux_layer_14": 0.0994873046875, "loss_aux_layer_15": 0.1085205078125, "loss_aux_layer_16": 0.1187744140625, "loss_aux_layer_17": 0.126708984375, "loss_aux_layer_18": 0.134765625, "loss_aux_layer_19": 0.13623046875, "loss_aux_layer_2": 0.0574951171875, "loss_aux_layer_20": 0.14306640625, "loss_aux_layer_21": 0.149169921875, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.20703125, "loss_aux_layer_3": 0.06982421875, "loss_aux_layer_4": 0.0728759765625, "loss_aux_layer_5": 0.0745849609375, "loss_aux_layer_6": 0.0780029296875, "loss_aux_layer_7": 0.0751953125, "loss_aux_layer_8": 0.0740966796875, "loss_aux_layer_9": 0.072265625, "step": 2318, "total_loss": 0.723947748541832 }, { "epoch": 0.45911700653335974, "grad_norm": 1.0545274019241333, "learning_rate": 5e-05, "llm_loss": 0.5693065524101257, "loss": 2.6537, "loss_aux_layer_0": 0.02203369140625, "loss_aux_layer_1": 0.04052734375, "loss_aux_layer_10": 0.068115234375, "loss_aux_layer_11": 0.0726318359375, "loss_aux_layer_12": 0.07763671875, "loss_aux_layer_13": 0.083740234375, "loss_aux_layer_14": 0.0933837890625, "loss_aux_layer_15": 0.1031494140625, "loss_aux_layer_16": 0.114013671875, "loss_aux_layer_17": 0.1229248046875, "loss_aux_layer_18": 0.131591796875, "loss_aux_layer_19": 0.134033203125, "loss_aux_layer_2": 0.05377197265625, "loss_aux_layer_20": 0.141357421875, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.169189453125, "loss_aux_layer_23": 0.20751953125, "loss_aux_layer_3": 0.064208984375, "loss_aux_layer_4": 0.0667724609375, "loss_aux_layer_5": 0.068359375, "loss_aux_layer_6": 0.071533203125, "loss_aux_layer_7": 0.0687255859375, "loss_aux_layer_8": 0.06787109375, "loss_aux_layer_9": 0.0667724609375, "step": 2319, "total_loss": 0.6634284853935242 }, { "epoch": 0.4593149871312611, "grad_norm": 1.4928803443908691, "learning_rate": 5e-05, "llm_loss": 0.554835170507431, "loss": 2.6014, "loss_aux_layer_0": 0.020965576171875, "loss_aux_layer_1": 0.042724609375, "loss_aux_layer_10": 0.069580078125, "loss_aux_layer_11": 0.07421875, "loss_aux_layer_12": 0.079345703125, "loss_aux_layer_13": 0.0855712890625, "loss_aux_layer_14": 0.094970703125, "loss_aux_layer_15": 0.104736328125, "loss_aux_layer_16": 0.1148681640625, "loss_aux_layer_17": 0.122802734375, "loss_aux_layer_18": 0.13134765625, "loss_aux_layer_19": 0.134521484375, "loss_aux_layer_2": 0.055419921875, "loss_aux_layer_20": 0.14208984375, "loss_aux_layer_21": 0.149658203125, "loss_aux_layer_22": 0.1708984375, "loss_aux_layer_23": 0.209716796875, "loss_aux_layer_3": 0.06658935546875, "loss_aux_layer_4": 0.069091796875, "loss_aux_layer_5": 0.0706787109375, "loss_aux_layer_6": 0.0736083984375, "loss_aux_layer_7": 0.0709228515625, "loss_aux_layer_8": 0.0697021484375, "loss_aux_layer_9": 0.0682373046875, "step": 2320, "total_loss": 0.6503555178642273 }, { "epoch": 0.45951296772916256, "grad_norm": 0.8658570051193237, "learning_rate": 5e-05, "llm_loss": 0.5860968828201294, "loss": 2.7318, "loss_aux_layer_0": 0.022186279296875, "loss_aux_layer_1": 0.04241943359375, "loss_aux_layer_10": 0.0712890625, "loss_aux_layer_11": 0.076171875, "loss_aux_layer_12": 0.081787109375, "loss_aux_layer_13": 0.088134765625, "loss_aux_layer_14": 0.097900390625, "loss_aux_layer_15": 0.10693359375, "loss_aux_layer_16": 0.11669921875, "loss_aux_layer_17": 0.125, "loss_aux_layer_18": 0.133056640625, "loss_aux_layer_19": 0.136474609375, "loss_aux_layer_2": 0.05517578125, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.151123046875, "loss_aux_layer_22": 0.17333984375, "loss_aux_layer_23": 0.212890625, "loss_aux_layer_3": 0.0657958984375, "loss_aux_layer_4": 0.06854248046875, "loss_aux_layer_5": 0.0703125, "loss_aux_layer_6": 0.073486328125, "loss_aux_layer_7": 0.0711669921875, "loss_aux_layer_8": 0.0706787109375, "loss_aux_layer_9": 0.0697021484375, "step": 2321, "total_loss": 0.6829526722431183 }, { "epoch": 0.45971094832706394, "grad_norm": 1.1736050844192505, "learning_rate": 5e-05, "llm_loss": 0.46101872622966766, "loss": 2.2325, "loss_aux_layer_0": 0.02105712890625, "loss_aux_layer_1": 0.0443115234375, "loss_aux_layer_10": 0.0716552734375, "loss_aux_layer_11": 0.07666015625, "loss_aux_layer_12": 0.08203125, "loss_aux_layer_13": 0.0882568359375, "loss_aux_layer_14": 0.097412109375, "loss_aux_layer_15": 0.106689453125, "loss_aux_layer_16": 0.1160888671875, "loss_aux_layer_17": 0.123779296875, "loss_aux_layer_18": 0.13232421875, "loss_aux_layer_19": 0.135009765625, "loss_aux_layer_2": 0.05780029296875, "loss_aux_layer_20": 0.14208984375, "loss_aux_layer_21": 0.149658203125, "loss_aux_layer_22": 0.171630859375, "loss_aux_layer_23": 0.2109375, "loss_aux_layer_3": 0.0687255859375, "loss_aux_layer_4": 0.071044921875, "loss_aux_layer_5": 0.0723876953125, "loss_aux_layer_6": 0.075439453125, "loss_aux_layer_7": 0.0728759765625, "loss_aux_layer_8": 0.0716552734375, "loss_aux_layer_9": 0.070556640625, "step": 2322, "total_loss": 0.5581181645393372 }, { "epoch": 0.4599089289249654, "grad_norm": 1.2122125625610352, "learning_rate": 5e-05, "llm_loss": 0.5661637037992477, "loss": 2.6599, "loss_aux_layer_0": 0.020751953125, "loss_aux_layer_1": 0.04473876953125, "loss_aux_layer_10": 0.0732421875, "loss_aux_layer_11": 0.078369140625, "loss_aux_layer_12": 0.0838623046875, "loss_aux_layer_13": 0.0904541015625, "loss_aux_layer_14": 0.1002197265625, "loss_aux_layer_15": 0.109130859375, "loss_aux_layer_16": 0.119384765625, "loss_aux_layer_17": 0.12744140625, "loss_aux_layer_18": 0.1357421875, "loss_aux_layer_19": 0.137451171875, "loss_aux_layer_2": 0.05810546875, "loss_aux_layer_20": 0.144287109375, "loss_aux_layer_21": 0.152099609375, "loss_aux_layer_22": 0.174072265625, "loss_aux_layer_23": 0.21240234375, "loss_aux_layer_3": 0.068603515625, "loss_aux_layer_4": 0.0716552734375, "loss_aux_layer_5": 0.0732421875, "loss_aux_layer_6": 0.0765380859375, "loss_aux_layer_7": 0.0740966796875, "loss_aux_layer_8": 0.0731201171875, "loss_aux_layer_9": 0.0718994140625, "step": 2323, "total_loss": 0.6649676859378815 }, { "epoch": 0.46010690952286676, "grad_norm": 1.0477464199066162, "learning_rate": 5e-05, "llm_loss": 0.6014991700649261, "loss": 2.7783, "loss_aux_layer_0": 0.02252197265625, "loss_aux_layer_1": 0.04052734375, "loss_aux_layer_10": 0.0672607421875, "loss_aux_layer_11": 0.07177734375, "loss_aux_layer_12": 0.076904296875, "loss_aux_layer_13": 0.0828857421875, "loss_aux_layer_14": 0.0926513671875, "loss_aux_layer_15": 0.1024169921875, "loss_aux_layer_16": 0.1126708984375, "loss_aux_layer_17": 0.12109375, "loss_aux_layer_18": 0.12841796875, "loss_aux_layer_19": 0.131103515625, "loss_aux_layer_2": 0.05267333984375, "loss_aux_layer_20": 0.138671875, "loss_aux_layer_21": 0.146484375, "loss_aux_layer_22": 0.168701171875, "loss_aux_layer_23": 0.207275390625, "loss_aux_layer_3": 0.0631103515625, "loss_aux_layer_4": 0.06591796875, "loss_aux_layer_5": 0.06787109375, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.068359375, "loss_aux_layer_8": 0.0672607421875, "loss_aux_layer_9": 0.06591796875, "step": 2324, "total_loss": 0.6945663541555405 }, { "epoch": 0.46030489012076814, "grad_norm": 1.2989530563354492, "learning_rate": 5e-05, "llm_loss": 0.551881343126297, "loss": 2.5888, "loss_aux_layer_0": 0.021697998046875, "loss_aux_layer_1": 0.04058837890625, "loss_aux_layer_10": 0.06884765625, "loss_aux_layer_11": 0.073486328125, "loss_aux_layer_12": 0.0791015625, "loss_aux_layer_13": 0.085693359375, "loss_aux_layer_14": 0.0955810546875, "loss_aux_layer_15": 0.1053466796875, "loss_aux_layer_16": 0.115478515625, "loss_aux_layer_17": 0.123779296875, "loss_aux_layer_18": 0.132568359375, "loss_aux_layer_19": 0.135986328125, "loss_aux_layer_2": 0.05255126953125, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.15185546875, "loss_aux_layer_22": 0.173095703125, "loss_aux_layer_23": 0.211669921875, "loss_aux_layer_3": 0.0635986328125, "loss_aux_layer_4": 0.06658935546875, "loss_aux_layer_5": 0.06884765625, "loss_aux_layer_6": 0.072021484375, "loss_aux_layer_7": 0.069580078125, "loss_aux_layer_8": 0.068603515625, "loss_aux_layer_9": 0.06756591796875, "step": 2325, "total_loss": 0.647189199924469 }, { "epoch": 0.4605028707186696, "grad_norm": 1.13531494140625, "learning_rate": 5e-05, "llm_loss": 0.4770507961511612, "loss": 2.2947, "loss_aux_layer_0": 0.0213623046875, "loss_aux_layer_1": 0.04205322265625, "loss_aux_layer_10": 0.07086181640625, "loss_aux_layer_11": 0.0755615234375, "loss_aux_layer_12": 0.0810546875, "loss_aux_layer_13": 0.08740234375, "loss_aux_layer_14": 0.0966796875, "loss_aux_layer_15": 0.10595703125, "loss_aux_layer_16": 0.115966796875, "loss_aux_layer_17": 0.1241455078125, "loss_aux_layer_18": 0.1328125, "loss_aux_layer_19": 0.13525390625, "loss_aux_layer_2": 0.05572509765625, "loss_aux_layer_20": 0.142822265625, "loss_aux_layer_21": 0.151611328125, "loss_aux_layer_22": 0.173583984375, "loss_aux_layer_23": 0.212646484375, "loss_aux_layer_3": 0.06671142578125, "loss_aux_layer_4": 0.0693359375, "loss_aux_layer_5": 0.07110595703125, "loss_aux_layer_6": 0.07421875, "loss_aux_layer_7": 0.07171630859375, "loss_aux_layer_8": 0.071044921875, "loss_aux_layer_9": 0.069580078125, "step": 2326, "total_loss": 0.5736759454011917 }, { "epoch": 0.46070085131657096, "grad_norm": 1.1937775611877441, "learning_rate": 5e-05, "llm_loss": 0.6139267683029175, "loss": 2.8263, "loss_aux_layer_0": 0.02197265625, "loss_aux_layer_1": 0.04071044921875, "loss_aux_layer_10": 0.068359375, "loss_aux_layer_11": 0.072509765625, "loss_aux_layer_12": 0.077392578125, "loss_aux_layer_13": 0.0830078125, "loss_aux_layer_14": 0.092041015625, "loss_aux_layer_15": 0.1011962890625, "loss_aux_layer_16": 0.1107177734375, "loss_aux_layer_17": 0.118408203125, "loss_aux_layer_18": 0.1265869140625, "loss_aux_layer_19": 0.12939453125, "loss_aux_layer_2": 0.0537109375, "loss_aux_layer_20": 0.136962890625, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.16650390625, "loss_aux_layer_23": 0.204833984375, "loss_aux_layer_3": 0.06402587890625, "loss_aux_layer_4": 0.06658935546875, "loss_aux_layer_5": 0.0682373046875, "loss_aux_layer_6": 0.0714111328125, "loss_aux_layer_7": 0.06884765625, "loss_aux_layer_8": 0.068115234375, "loss_aux_layer_9": 0.06689453125, "step": 2327, "total_loss": 0.7065813094377518 }, { "epoch": 0.4608988319144724, "grad_norm": 1.486429214477539, "learning_rate": 5e-05, "llm_loss": 0.517192080616951, "loss": 2.454, "loss_aux_layer_0": 0.021881103515625, "loss_aux_layer_1": 0.0423583984375, "loss_aux_layer_10": 0.0697021484375, "loss_aux_layer_11": 0.074462890625, "loss_aux_layer_12": 0.0797119140625, "loss_aux_layer_13": 0.0860595703125, "loss_aux_layer_14": 0.095458984375, "loss_aux_layer_15": 0.1051025390625, "loss_aux_layer_16": 0.1146240234375, "loss_aux_layer_17": 0.1229248046875, "loss_aux_layer_18": 0.131591796875, "loss_aux_layer_19": 0.135009765625, "loss_aux_layer_2": 0.0552978515625, "loss_aux_layer_20": 0.14306640625, "loss_aux_layer_21": 0.152587890625, "loss_aux_layer_22": 0.17578125, "loss_aux_layer_23": 0.2158203125, "loss_aux_layer_3": 0.066162109375, "loss_aux_layer_4": 0.0689697265625, "loss_aux_layer_5": 0.07080078125, "loss_aux_layer_6": 0.0738525390625, "loss_aux_layer_7": 0.0709228515625, "loss_aux_layer_8": 0.0699462890625, "loss_aux_layer_9": 0.06854248046875, "step": 2328, "total_loss": 0.613493487238884 }, { "epoch": 0.4610968125123738, "grad_norm": 1.2670173645019531, "learning_rate": 5e-05, "llm_loss": 0.6449623256921768, "loss": 2.9462, "loss_aux_layer_0": 0.020294189453125, "loss_aux_layer_1": 0.0379638671875, "loss_aux_layer_10": 0.06561279296875, "loss_aux_layer_11": 0.0704345703125, "loss_aux_layer_12": 0.075439453125, "loss_aux_layer_13": 0.0819091796875, "loss_aux_layer_14": 0.09130859375, "loss_aux_layer_15": 0.100830078125, "loss_aux_layer_16": 0.111328125, "loss_aux_layer_17": 0.1199951171875, "loss_aux_layer_18": 0.1287841796875, "loss_aux_layer_19": 0.132080078125, "loss_aux_layer_2": 0.05096435546875, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.16650390625, "loss_aux_layer_23": 0.20556640625, "loss_aux_layer_3": 0.0606689453125, "loss_aux_layer_4": 0.0635986328125, "loss_aux_layer_5": 0.06536865234375, "loss_aux_layer_6": 0.068359375, "loss_aux_layer_7": 0.065673828125, "loss_aux_layer_8": 0.06494140625, "loss_aux_layer_9": 0.06390380859375, "step": 2329, "total_loss": 0.7365430444478989 }, { "epoch": 0.4612947931102752, "grad_norm": 1.4109077453613281, "learning_rate": 5e-05, "llm_loss": 0.5736368149518967, "loss": 2.6758, "loss_aux_layer_0": 0.0224609375, "loss_aux_layer_1": 0.0411376953125, "loss_aux_layer_10": 0.0689697265625, "loss_aux_layer_11": 0.07373046875, "loss_aux_layer_12": 0.07861328125, "loss_aux_layer_13": 0.084716796875, "loss_aux_layer_14": 0.0943603515625, "loss_aux_layer_15": 0.1043701171875, "loss_aux_layer_16": 0.11474609375, "loss_aux_layer_17": 0.1231689453125, "loss_aux_layer_18": 0.1317138671875, "loss_aux_layer_19": 0.134765625, "loss_aux_layer_2": 0.0538330078125, "loss_aux_layer_20": 0.142822265625, "loss_aux_layer_21": 0.151611328125, "loss_aux_layer_22": 0.174072265625, "loss_aux_layer_23": 0.21435546875, "loss_aux_layer_3": 0.06396484375, "loss_aux_layer_4": 0.0667724609375, "loss_aux_layer_5": 0.06884765625, "loss_aux_layer_6": 0.0718994140625, "loss_aux_layer_7": 0.069580078125, "loss_aux_layer_8": 0.0689697265625, "loss_aux_layer_9": 0.06787109375, "step": 2330, "total_loss": 0.6689491122961044 }, { "epoch": 0.4614927737081766, "grad_norm": 1.8697775602340698, "learning_rate": 5e-05, "llm_loss": 0.6251748353242874, "loss": 2.8917, "loss_aux_layer_0": 0.020416259765625, "loss_aux_layer_1": 0.04168701171875, "loss_aux_layer_10": 0.072509765625, "loss_aux_layer_11": 0.0772705078125, "loss_aux_layer_12": 0.0823974609375, "loss_aux_layer_13": 0.0892333984375, "loss_aux_layer_14": 0.098876953125, "loss_aux_layer_15": 0.108642578125, "loss_aux_layer_16": 0.1187744140625, "loss_aux_layer_17": 0.12744140625, "loss_aux_layer_18": 0.135498046875, "loss_aux_layer_19": 0.137939453125, "loss_aux_layer_2": 0.0556640625, "loss_aux_layer_20": 0.14501953125, "loss_aux_layer_21": 0.15185546875, "loss_aux_layer_22": 0.1728515625, "loss_aux_layer_23": 0.210693359375, "loss_aux_layer_3": 0.066650390625, "loss_aux_layer_4": 0.06982421875, "loss_aux_layer_5": 0.07177734375, "loss_aux_layer_6": 0.0751953125, "loss_aux_layer_7": 0.07275390625, "loss_aux_layer_8": 0.0721435546875, "loss_aux_layer_9": 0.0709228515625, "step": 2331, "total_loss": 0.7229268997907639 }, { "epoch": 0.461690754306078, "grad_norm": 1.6621593236923218, "learning_rate": 5e-05, "llm_loss": 0.5600960403680801, "loss": 2.6274, "loss_aux_layer_0": 0.02093505859375, "loss_aux_layer_1": 0.04168701171875, "loss_aux_layer_10": 0.0693359375, "loss_aux_layer_11": 0.0738525390625, "loss_aux_layer_12": 0.079345703125, "loss_aux_layer_13": 0.0860595703125, "loss_aux_layer_14": 0.095947265625, "loss_aux_layer_15": 0.106201171875, "loss_aux_layer_16": 0.1168212890625, "loss_aux_layer_17": 0.125, "loss_aux_layer_18": 0.134033203125, "loss_aux_layer_19": 0.13818359375, "loss_aux_layer_2": 0.05517578125, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.15478515625, "loss_aux_layer_22": 0.177490234375, "loss_aux_layer_23": 0.218017578125, "loss_aux_layer_3": 0.0650634765625, "loss_aux_layer_4": 0.06787109375, "loss_aux_layer_5": 0.0694580078125, "loss_aux_layer_6": 0.07275390625, "loss_aux_layer_7": 0.06982421875, "loss_aux_layer_8": 0.0692138671875, "loss_aux_layer_9": 0.068115234375, "step": 2332, "total_loss": 0.6568443924188614 }, { "epoch": 0.4618887349039794, "grad_norm": 1.4577696323394775, "learning_rate": 5e-05, "llm_loss": 0.5475417375564575, "loss": 2.5707, "loss_aux_layer_0": 0.0211181640625, "loss_aux_layer_1": 0.041748046875, "loss_aux_layer_10": 0.070556640625, "loss_aux_layer_11": 0.074951171875, "loss_aux_layer_12": 0.0799560546875, "loss_aux_layer_13": 0.0860595703125, "loss_aux_layer_14": 0.09521484375, "loss_aux_layer_15": 0.104248046875, "loss_aux_layer_16": 0.11376953125, "loss_aux_layer_17": 0.121337890625, "loss_aux_layer_18": 0.1298828125, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.05584716796875, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.16748046875, "loss_aux_layer_23": 0.2060546875, "loss_aux_layer_3": 0.0667724609375, "loss_aux_layer_4": 0.06982421875, "loss_aux_layer_5": 0.0716552734375, "loss_aux_layer_6": 0.0745849609375, "loss_aux_layer_7": 0.072021484375, "loss_aux_layer_8": 0.071044921875, "loss_aux_layer_9": 0.0693359375, "step": 2333, "total_loss": 0.6426647305488586 }, { "epoch": 0.4620867155018808, "grad_norm": 1.1928542852401733, "learning_rate": 5e-05, "llm_loss": 0.5890195518732071, "loss": 2.7355, "loss_aux_layer_0": 0.0201416015625, "loss_aux_layer_1": 0.041015625, "loss_aux_layer_10": 0.070068359375, "loss_aux_layer_11": 0.074462890625, "loss_aux_layer_12": 0.0797119140625, "loss_aux_layer_13": 0.0860595703125, "loss_aux_layer_14": 0.0955810546875, "loss_aux_layer_15": 0.1043701171875, "loss_aux_layer_16": 0.114013671875, "loss_aux_layer_17": 0.1219482421875, "loss_aux_layer_18": 0.1304931640625, "loss_aux_layer_19": 0.1328125, "loss_aux_layer_2": 0.0550537109375, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.1474609375, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.0657958984375, "loss_aux_layer_4": 0.06927490234375, "loss_aux_layer_5": 0.0709228515625, "loss_aux_layer_6": 0.07373046875, "loss_aux_layer_7": 0.0712890625, "loss_aux_layer_8": 0.0704345703125, "loss_aux_layer_9": 0.0694580078125, "step": 2334, "total_loss": 0.6838853508234024 }, { "epoch": 0.46228469609978223, "grad_norm": 1.0927209854125977, "learning_rate": 5e-05, "llm_loss": 0.6093540862202644, "loss": 2.8203, "loss_aux_layer_0": 0.020477294921875, "loss_aux_layer_1": 0.041015625, "loss_aux_layer_10": 0.0712890625, "loss_aux_layer_11": 0.0760498046875, "loss_aux_layer_12": 0.0816650390625, "loss_aux_layer_13": 0.0877685546875, "loss_aux_layer_14": 0.0965576171875, "loss_aux_layer_15": 0.1053466796875, "loss_aux_layer_16": 0.1153564453125, "loss_aux_layer_17": 0.1229248046875, "loss_aux_layer_18": 0.13134765625, "loss_aux_layer_19": 0.1337890625, "loss_aux_layer_2": 0.05487060546875, "loss_aux_layer_20": 0.141357421875, "loss_aux_layer_21": 0.148681640625, "loss_aux_layer_22": 0.170166015625, "loss_aux_layer_23": 0.20849609375, "loss_aux_layer_3": 0.0655517578125, "loss_aux_layer_4": 0.0684814453125, "loss_aux_layer_5": 0.0704345703125, "loss_aux_layer_6": 0.0733642578125, "loss_aux_layer_7": 0.0712890625, "loss_aux_layer_8": 0.07080078125, "loss_aux_layer_9": 0.0697021484375, "step": 2335, "total_loss": 0.7050742506980896 }, { "epoch": 0.4624826766976836, "grad_norm": 1.2411330938339233, "learning_rate": 5e-05, "llm_loss": 0.5813298672437668, "loss": 2.7088, "loss_aux_layer_0": 0.0206298828125, "loss_aux_layer_1": 0.041015625, "loss_aux_layer_10": 0.0703125, "loss_aux_layer_11": 0.0748291015625, "loss_aux_layer_12": 0.080078125, "loss_aux_layer_13": 0.086669921875, "loss_aux_layer_14": 0.0968017578125, "loss_aux_layer_15": 0.10693359375, "loss_aux_layer_16": 0.1173095703125, "loss_aux_layer_17": 0.1256103515625, "loss_aux_layer_18": 0.133544921875, "loss_aux_layer_19": 0.13525390625, "loss_aux_layer_2": 0.05401611328125, "loss_aux_layer_20": 0.142578125, "loss_aux_layer_21": 0.1494140625, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.208251953125, "loss_aux_layer_3": 0.06488037109375, "loss_aux_layer_4": 0.0682373046875, "loss_aux_layer_5": 0.070068359375, "loss_aux_layer_6": 0.0736083984375, "loss_aux_layer_7": 0.071044921875, "loss_aux_layer_8": 0.0701904296875, "loss_aux_layer_9": 0.0693359375, "step": 2336, "total_loss": 0.6772120893001556 }, { "epoch": 0.46268065729558505, "grad_norm": 1.2266709804534912, "learning_rate": 5e-05, "llm_loss": 0.4991476237773895, "loss": 2.3782, "loss_aux_layer_0": 0.02044677734375, "loss_aux_layer_1": 0.04248046875, "loss_aux_layer_10": 0.071044921875, "loss_aux_layer_11": 0.0758056640625, "loss_aux_layer_12": 0.080810546875, "loss_aux_layer_13": 0.086669921875, "loss_aux_layer_14": 0.0950927734375, "loss_aux_layer_15": 0.103759765625, "loss_aux_layer_16": 0.113525390625, "loss_aux_layer_17": 0.1209716796875, "loss_aux_layer_18": 0.12939453125, "loss_aux_layer_19": 0.132080078125, "loss_aux_layer_2": 0.05682373046875, "loss_aux_layer_20": 0.140380859375, "loss_aux_layer_21": 0.147705078125, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.207275390625, "loss_aux_layer_3": 0.067626953125, "loss_aux_layer_4": 0.0701904296875, "loss_aux_layer_5": 0.07159423828125, "loss_aux_layer_6": 0.07421875, "loss_aux_layer_7": 0.072021484375, "loss_aux_layer_8": 0.07122802734375, "loss_aux_layer_9": 0.06982421875, "step": 2337, "total_loss": 0.5945613086223602 }, { "epoch": 0.46287863789348643, "grad_norm": 1.3761447668075562, "learning_rate": 5e-05, "llm_loss": 0.6497050523757935, "loss": 2.9648, "loss_aux_layer_0": 0.02117919921875, "loss_aux_layer_1": 0.037841796875, "loss_aux_layer_10": 0.0654296875, "loss_aux_layer_11": 0.06976318359375, "loss_aux_layer_12": 0.0748291015625, "loss_aux_layer_13": 0.0811767578125, "loss_aux_layer_14": 0.091064453125, "loss_aux_layer_15": 0.10107421875, "loss_aux_layer_16": 0.11181640625, "loss_aux_layer_17": 0.1207275390625, "loss_aux_layer_18": 0.1298828125, "loss_aux_layer_19": 0.1334228515625, "loss_aux_layer_2": 0.050048828125, "loss_aux_layer_20": 0.140869140625, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.1669921875, "loss_aux_layer_23": 0.2041015625, "loss_aux_layer_3": 0.05963134765625, "loss_aux_layer_4": 0.0625, "loss_aux_layer_5": 0.064453125, "loss_aux_layer_6": 0.06768798828125, "loss_aux_layer_7": 0.06524658203125, "loss_aux_layer_8": 0.06500244140625, "loss_aux_layer_9": 0.06390380859375, "step": 2338, "total_loss": 0.7411984950304031 }, { "epoch": 0.46307661849138787, "grad_norm": 1.564857006072998, "learning_rate": 5e-05, "llm_loss": 0.5366593226790428, "loss": 2.5273, "loss_aux_layer_0": 0.020233154296875, "loss_aux_layer_1": 0.0418701171875, "loss_aux_layer_10": 0.0709228515625, "loss_aux_layer_11": 0.07568359375, "loss_aux_layer_12": 0.080810546875, "loss_aux_layer_13": 0.0869140625, "loss_aux_layer_14": 0.0960693359375, "loss_aux_layer_15": 0.1051025390625, "loss_aux_layer_16": 0.114501953125, "loss_aux_layer_17": 0.1221923828125, "loss_aux_layer_18": 0.13037109375, "loss_aux_layer_19": 0.13232421875, "loss_aux_layer_2": 0.05511474609375, "loss_aux_layer_20": 0.139404296875, "loss_aux_layer_21": 0.147216796875, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.206298828125, "loss_aux_layer_3": 0.0660400390625, "loss_aux_layer_4": 0.069091796875, "loss_aux_layer_5": 0.0706787109375, "loss_aux_layer_6": 0.073974609375, "loss_aux_layer_7": 0.0712890625, "loss_aux_layer_8": 0.070556640625, "loss_aux_layer_9": 0.0694580078125, "step": 2339, "total_loss": 0.6318258345127106 }, { "epoch": 0.46327459908928925, "grad_norm": 1.3979051113128662, "learning_rate": 5e-05, "llm_loss": 0.5597880035638809, "loss": 2.6279, "loss_aux_layer_0": 0.02099609375, "loss_aux_layer_1": 0.0421142578125, "loss_aux_layer_10": 0.0718994140625, "loss_aux_layer_11": 0.0765380859375, "loss_aux_layer_12": 0.0819091796875, "loss_aux_layer_13": 0.0880126953125, "loss_aux_layer_14": 0.097412109375, "loss_aux_layer_15": 0.1065673828125, "loss_aux_layer_16": 0.1162109375, "loss_aux_layer_17": 0.1239013671875, "loss_aux_layer_18": 0.1318359375, "loss_aux_layer_19": 0.134765625, "loss_aux_layer_2": 0.05877685546875, "loss_aux_layer_20": 0.142822265625, "loss_aux_layer_21": 0.150146484375, "loss_aux_layer_22": 0.172119140625, "loss_aux_layer_23": 0.211181640625, "loss_aux_layer_3": 0.068603515625, "loss_aux_layer_4": 0.0714111328125, "loss_aux_layer_5": 0.072998046875, "loss_aux_layer_6": 0.075927734375, "loss_aux_layer_7": 0.0732421875, "loss_aux_layer_8": 0.0721435546875, "loss_aux_layer_9": 0.070556640625, "step": 2340, "total_loss": 0.6569696217775345 }, { "epoch": 0.46347257968719063, "grad_norm": 1.2478710412979126, "learning_rate": 5e-05, "llm_loss": 0.6001412123441696, "loss": 2.7795, "loss_aux_layer_0": 0.02239990234375, "loss_aux_layer_1": 0.04022216796875, "loss_aux_layer_10": 0.067626953125, "loss_aux_layer_11": 0.072265625, "loss_aux_layer_12": 0.0777587890625, "loss_aux_layer_13": 0.0841064453125, "loss_aux_layer_14": 0.0941162109375, "loss_aux_layer_15": 0.1036376953125, "loss_aux_layer_16": 0.1142578125, "loss_aux_layer_17": 0.1229248046875, "loss_aux_layer_18": 0.13232421875, "loss_aux_layer_19": 0.135986328125, "loss_aux_layer_2": 0.05377197265625, "loss_aux_layer_20": 0.14404296875, "loss_aux_layer_21": 0.15234375, "loss_aux_layer_22": 0.174072265625, "loss_aux_layer_23": 0.212890625, "loss_aux_layer_3": 0.0628662109375, "loss_aux_layer_4": 0.06549072265625, "loss_aux_layer_5": 0.0673828125, "loss_aux_layer_6": 0.0706787109375, "loss_aux_layer_7": 0.068359375, "loss_aux_layer_8": 0.0677490234375, "loss_aux_layer_9": 0.066650390625, "step": 2341, "total_loss": 0.69488525390625 }, { "epoch": 0.46367056028509207, "grad_norm": 1.0740019083023071, "learning_rate": 5e-05, "llm_loss": 0.6139687448740005, "loss": 2.8369, "loss_aux_layer_0": 0.02117919921875, "loss_aux_layer_1": 0.04132080078125, "loss_aux_layer_10": 0.0706787109375, "loss_aux_layer_11": 0.0753173828125, "loss_aux_layer_12": 0.0799560546875, "loss_aux_layer_13": 0.086181640625, "loss_aux_layer_14": 0.0948486328125, "loss_aux_layer_15": 0.10400390625, "loss_aux_layer_16": 0.114013671875, "loss_aux_layer_17": 0.121826171875, "loss_aux_layer_18": 0.1302490234375, "loss_aux_layer_19": 0.1334228515625, "loss_aux_layer_2": 0.05487060546875, "loss_aux_layer_20": 0.141357421875, "loss_aux_layer_21": 0.1494140625, "loss_aux_layer_22": 0.1708984375, "loss_aux_layer_23": 0.208984375, "loss_aux_layer_3": 0.0653076171875, "loss_aux_layer_4": 0.068359375, "loss_aux_layer_5": 0.0704345703125, "loss_aux_layer_6": 0.073486328125, "loss_aux_layer_7": 0.0709228515625, "loss_aux_layer_8": 0.0701904296875, "loss_aux_layer_9": 0.069091796875, "step": 2342, "total_loss": 0.7092323899269104 }, { "epoch": 0.46386854088299345, "grad_norm": 1.3165874481201172, "learning_rate": 5e-05, "llm_loss": 0.5477432161569595, "loss": 2.554, "loss_aux_layer_0": 0.02056884765625, "loss_aux_layer_1": 0.0386962890625, "loss_aux_layer_10": 0.0650634765625, "loss_aux_layer_11": 0.06903076171875, "loss_aux_layer_12": 0.07421875, "loss_aux_layer_13": 0.079833984375, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.0985107421875, "loss_aux_layer_16": 0.108642578125, "loss_aux_layer_17": 0.1170654296875, "loss_aux_layer_18": 0.125244140625, "loss_aux_layer_19": 0.1295166015625, "loss_aux_layer_2": 0.05194091796875, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.146240234375, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.205810546875, "loss_aux_layer_3": 0.06121826171875, "loss_aux_layer_4": 0.0638427734375, "loss_aux_layer_5": 0.06561279296875, "loss_aux_layer_6": 0.06829833984375, "loss_aux_layer_7": 0.06591796875, "loss_aux_layer_8": 0.0650634765625, "loss_aux_layer_9": 0.06390380859375, "step": 2343, "total_loss": 0.6385109424591064 }, { "epoch": 0.4640665214808949, "grad_norm": 1.5679676532745361, "learning_rate": 5e-05, "llm_loss": 0.5461175888776779, "loss": 2.5548, "loss_aux_layer_0": 0.021209716796875, "loss_aux_layer_1": 0.04058837890625, "loss_aux_layer_10": 0.0672607421875, "loss_aux_layer_11": 0.0712890625, "loss_aux_layer_12": 0.076416015625, "loss_aux_layer_13": 0.082275390625, "loss_aux_layer_14": 0.091796875, "loss_aux_layer_15": 0.10107421875, "loss_aux_layer_16": 0.111083984375, "loss_aux_layer_17": 0.118896484375, "loss_aux_layer_18": 0.1270751953125, "loss_aux_layer_19": 0.130126953125, "loss_aux_layer_2": 0.05340576171875, "loss_aux_layer_20": 0.137939453125, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.208251953125, "loss_aux_layer_3": 0.0633544921875, "loss_aux_layer_4": 0.066162109375, "loss_aux_layer_5": 0.06787109375, "loss_aux_layer_6": 0.0706787109375, "loss_aux_layer_7": 0.06787109375, "loss_aux_layer_8": 0.0672607421875, "loss_aux_layer_9": 0.0657958984375, "step": 2344, "total_loss": 0.6387107670307159 }, { "epoch": 0.46426450207879627, "grad_norm": 0.7790329456329346, "learning_rate": 5e-05, "llm_loss": 0.6432203054428101, "loss": 2.9505, "loss_aux_layer_0": 0.019775390625, "loss_aux_layer_1": 0.04144287109375, "loss_aux_layer_10": 0.0699462890625, "loss_aux_layer_11": 0.0745849609375, "loss_aux_layer_12": 0.0797119140625, "loss_aux_layer_13": 0.0858154296875, "loss_aux_layer_14": 0.0948486328125, "loss_aux_layer_15": 0.103515625, "loss_aux_layer_16": 0.1134033203125, "loss_aux_layer_17": 0.12060546875, "loss_aux_layer_18": 0.1287841796875, "loss_aux_layer_19": 0.131591796875, "loss_aux_layer_2": 0.0548095703125, "loss_aux_layer_20": 0.138916015625, "loss_aux_layer_21": 0.146484375, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.205322265625, "loss_aux_layer_3": 0.06585693359375, "loss_aux_layer_4": 0.06903076171875, "loss_aux_layer_5": 0.07080078125, "loss_aux_layer_6": 0.07373046875, "loss_aux_layer_7": 0.0709228515625, "loss_aux_layer_8": 0.070068359375, "loss_aux_layer_9": 0.0689697265625, "step": 2345, "total_loss": 0.7376361191272736 }, { "epoch": 0.4644624826766977, "grad_norm": 1.1015759706497192, "learning_rate": 5e-05, "llm_loss": 0.5013855472207069, "loss": 2.3952, "loss_aux_layer_0": 0.020538330078125, "loss_aux_layer_1": 0.0430908203125, "loss_aux_layer_10": 0.072265625, "loss_aux_layer_11": 0.076904296875, "loss_aux_layer_12": 0.08203125, "loss_aux_layer_13": 0.0877685546875, "loss_aux_layer_14": 0.096923828125, "loss_aux_layer_15": 0.1060791015625, "loss_aux_layer_16": 0.115478515625, "loss_aux_layer_17": 0.1236572265625, "loss_aux_layer_18": 0.131591796875, "loss_aux_layer_19": 0.13427734375, "loss_aux_layer_2": 0.05804443359375, "loss_aux_layer_20": 0.142333984375, "loss_aux_layer_21": 0.15087890625, "loss_aux_layer_22": 0.1748046875, "loss_aux_layer_23": 0.214599609375, "loss_aux_layer_3": 0.0689697265625, "loss_aux_layer_4": 0.0718994140625, "loss_aux_layer_5": 0.0736083984375, "loss_aux_layer_6": 0.0762939453125, "loss_aux_layer_7": 0.0736083984375, "loss_aux_layer_8": 0.0721435546875, "loss_aux_layer_9": 0.0709228515625, "step": 2346, "total_loss": 0.5988115072250366 }, { "epoch": 0.4646604632745991, "grad_norm": 0.9571413397789001, "learning_rate": 5e-05, "llm_loss": 0.49605921655893326, "loss": 2.3574, "loss_aux_layer_0": 0.021881103515625, "loss_aux_layer_1": 0.040283203125, "loss_aux_layer_10": 0.0682373046875, "loss_aux_layer_11": 0.0726318359375, "loss_aux_layer_12": 0.0772705078125, "loss_aux_layer_13": 0.0831298828125, "loss_aux_layer_14": 0.092041015625, "loss_aux_layer_15": 0.1015625, "loss_aux_layer_16": 0.111328125, "loss_aux_layer_17": 0.1192626953125, "loss_aux_layer_18": 0.1278076171875, "loss_aux_layer_19": 0.131591796875, "loss_aux_layer_2": 0.0533447265625, "loss_aux_layer_20": 0.139404296875, "loss_aux_layer_21": 0.148193359375, "loss_aux_layer_22": 0.169189453125, "loss_aux_layer_23": 0.208251953125, "loss_aux_layer_3": 0.0638427734375, "loss_aux_layer_4": 0.06640625, "loss_aux_layer_5": 0.06829833984375, "loss_aux_layer_6": 0.0712890625, "loss_aux_layer_7": 0.06884765625, "loss_aux_layer_8": 0.068359375, "loss_aux_layer_9": 0.0672607421875, "step": 2347, "total_loss": 0.5893622636795044 }, { "epoch": 0.46485844387250047, "grad_norm": 1.0411900281906128, "learning_rate": 5e-05, "llm_loss": 0.5708630084991455, "loss": 2.6476, "loss_aux_layer_0": 0.02056884765625, "loss_aux_layer_1": 0.03753662109375, "loss_aux_layer_10": 0.065673828125, "loss_aux_layer_11": 0.070068359375, "loss_aux_layer_12": 0.0755615234375, "loss_aux_layer_13": 0.0816650390625, "loss_aux_layer_14": 0.0908203125, "loss_aux_layer_15": 0.100341796875, "loss_aux_layer_16": 0.1103515625, "loss_aux_layer_17": 0.1182861328125, "loss_aux_layer_18": 0.126953125, "loss_aux_layer_19": 0.1298828125, "loss_aux_layer_2": 0.04974365234375, "loss_aux_layer_20": 0.13818359375, "loss_aux_layer_21": 0.145751953125, "loss_aux_layer_22": 0.166748046875, "loss_aux_layer_23": 0.204833984375, "loss_aux_layer_3": 0.06036376953125, "loss_aux_layer_4": 0.063232421875, "loss_aux_layer_5": 0.06488037109375, "loss_aux_layer_6": 0.06805419921875, "loss_aux_layer_7": 0.06573486328125, "loss_aux_layer_8": 0.0653076171875, "loss_aux_layer_9": 0.06439208984375, "step": 2348, "total_loss": 0.6618883609771729 }, { "epoch": 0.4650564244704019, "grad_norm": 0.9783336520195007, "learning_rate": 5e-05, "llm_loss": 0.5764635279774666, "loss": 2.6859, "loss_aux_layer_0": 0.020111083984375, "loss_aux_layer_1": 0.04119873046875, "loss_aux_layer_10": 0.0694580078125, "loss_aux_layer_11": 0.073974609375, "loss_aux_layer_12": 0.079345703125, "loss_aux_layer_13": 0.0855712890625, "loss_aux_layer_14": 0.09521484375, "loss_aux_layer_15": 0.1044921875, "loss_aux_layer_16": 0.1146240234375, "loss_aux_layer_17": 0.1234130859375, "loss_aux_layer_18": 0.132080078125, "loss_aux_layer_19": 0.134765625, "loss_aux_layer_2": 0.05413818359375, "loss_aux_layer_20": 0.142578125, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.170166015625, "loss_aux_layer_23": 0.208984375, "loss_aux_layer_3": 0.0645751953125, "loss_aux_layer_4": 0.067626953125, "loss_aux_layer_5": 0.0694580078125, "loss_aux_layer_6": 0.072509765625, "loss_aux_layer_7": 0.0701904296875, "loss_aux_layer_8": 0.0694580078125, "loss_aux_layer_9": 0.0679931640625, "step": 2349, "total_loss": 0.6714870035648346 }, { "epoch": 0.4652544050683033, "grad_norm": 1.1873149871826172, "learning_rate": 5e-05, "llm_loss": 0.6591108441352844, "loss": 3.0276, "loss_aux_layer_0": 0.019805908203125, "loss_aux_layer_1": 0.04412841796875, "loss_aux_layer_10": 0.0740966796875, "loss_aux_layer_11": 0.078857421875, "loss_aux_layer_12": 0.084228515625, "loss_aux_layer_13": 0.090576171875, "loss_aux_layer_14": 0.099609375, "loss_aux_layer_15": 0.1085205078125, "loss_aux_layer_16": 0.1181640625, "loss_aux_layer_17": 0.1253662109375, "loss_aux_layer_18": 0.1337890625, "loss_aux_layer_19": 0.135009765625, "loss_aux_layer_2": 0.0574951171875, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.205078125, "loss_aux_layer_3": 0.0692138671875, "loss_aux_layer_4": 0.0723876953125, "loss_aux_layer_5": 0.073974609375, "loss_aux_layer_6": 0.0775146484375, "loss_aux_layer_7": 0.0750732421875, "loss_aux_layer_8": 0.073974609375, "loss_aux_layer_9": 0.0726318359375, "step": 2350, "total_loss": 0.7569031268358231 }, { "epoch": 0.4654523856662047, "grad_norm": 0.9427902102470398, "learning_rate": 5e-05, "llm_loss": 0.707053080201149, "loss": 3.2025, "loss_aux_layer_0": 0.02056884765625, "loss_aux_layer_1": 0.04058837890625, "loss_aux_layer_10": 0.06829833984375, "loss_aux_layer_11": 0.07275390625, "loss_aux_layer_12": 0.0777587890625, "loss_aux_layer_13": 0.0836181640625, "loss_aux_layer_14": 0.0927734375, "loss_aux_layer_15": 0.101806640625, "loss_aux_layer_16": 0.112060546875, "loss_aux_layer_17": 0.1202392578125, "loss_aux_layer_18": 0.12890625, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.0538330078125, "loss_aux_layer_20": 0.14013671875, "loss_aux_layer_21": 0.148193359375, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.205322265625, "loss_aux_layer_3": 0.06427001953125, "loss_aux_layer_4": 0.0673828125, "loss_aux_layer_5": 0.069091796875, "loss_aux_layer_6": 0.0721435546875, "loss_aux_layer_7": 0.0693359375, "loss_aux_layer_8": 0.06842041015625, "loss_aux_layer_9": 0.06719970703125, "step": 2351, "total_loss": 0.8006323724985123 }, { "epoch": 0.4656503662641061, "grad_norm": 1.24184250831604, "learning_rate": 5e-05, "llm_loss": 0.665645956993103, "loss": 3.0383, "loss_aux_layer_0": 0.020263671875, "loss_aux_layer_1": 0.03948974609375, "loss_aux_layer_10": 0.0682373046875, "loss_aux_layer_11": 0.07275390625, "loss_aux_layer_12": 0.0777587890625, "loss_aux_layer_13": 0.0838623046875, "loss_aux_layer_14": 0.093505859375, "loss_aux_layer_15": 0.1033935546875, "loss_aux_layer_16": 0.113525390625, "loss_aux_layer_17": 0.1219482421875, "loss_aux_layer_18": 0.13037109375, "loss_aux_layer_19": 0.13330078125, "loss_aux_layer_2": 0.05279541015625, "loss_aux_layer_20": 0.14111328125, "loss_aux_layer_21": 0.1484375, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.20751953125, "loss_aux_layer_3": 0.06402587890625, "loss_aux_layer_4": 0.0670166015625, "loss_aux_layer_5": 0.0689697265625, "loss_aux_layer_6": 0.0718994140625, "loss_aux_layer_7": 0.0692138671875, "loss_aux_layer_8": 0.068359375, "loss_aux_layer_9": 0.0667724609375, "step": 2352, "total_loss": 0.7595851272344589 }, { "epoch": 0.46584834686200755, "grad_norm": 0.9477845430374146, "learning_rate": 5e-05, "llm_loss": 0.5038193389773369, "loss": 2.405, "loss_aux_layer_0": 0.020111083984375, "loss_aux_layer_1": 0.04254150390625, "loss_aux_layer_10": 0.07275390625, "loss_aux_layer_11": 0.0775146484375, "loss_aux_layer_12": 0.0828857421875, "loss_aux_layer_13": 0.0888671875, "loss_aux_layer_14": 0.09765625, "loss_aux_layer_15": 0.1065673828125, "loss_aux_layer_16": 0.116455078125, "loss_aux_layer_17": 0.124267578125, "loss_aux_layer_18": 0.132568359375, "loss_aux_layer_19": 0.135986328125, "loss_aux_layer_2": 0.05645751953125, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.173095703125, "loss_aux_layer_23": 0.2109375, "loss_aux_layer_3": 0.0679931640625, "loss_aux_layer_4": 0.0711669921875, "loss_aux_layer_5": 0.0726318359375, "loss_aux_layer_6": 0.0758056640625, "loss_aux_layer_7": 0.0733642578125, "loss_aux_layer_8": 0.0726318359375, "loss_aux_layer_9": 0.0716552734375, "step": 2353, "total_loss": 0.6012517884373665 }, { "epoch": 0.4660463274599089, "grad_norm": 1.062376618385315, "learning_rate": 5e-05, "llm_loss": 0.5549147427082062, "loss": 2.583, "loss_aux_layer_0": 0.020721435546875, "loss_aux_layer_1": 0.03955078125, "loss_aux_layer_10": 0.0654296875, "loss_aux_layer_11": 0.0699462890625, "loss_aux_layer_12": 0.07470703125, "loss_aux_layer_13": 0.0804443359375, "loss_aux_layer_14": 0.08984375, "loss_aux_layer_15": 0.0994873046875, "loss_aux_layer_16": 0.109619140625, "loss_aux_layer_17": 0.11767578125, "loss_aux_layer_18": 0.126220703125, "loss_aux_layer_19": 0.12939453125, "loss_aux_layer_2": 0.05194091796875, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.14404296875, "loss_aux_layer_22": 0.1640625, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.06201171875, "loss_aux_layer_4": 0.06475830078125, "loss_aux_layer_5": 0.0662841796875, "loss_aux_layer_6": 0.0689697265625, "loss_aux_layer_7": 0.06640625, "loss_aux_layer_8": 0.06573486328125, "loss_aux_layer_9": 0.064208984375, "step": 2354, "total_loss": 0.645749032497406 }, { "epoch": 0.4662443080578103, "grad_norm": 0.6780542731285095, "learning_rate": 5e-05, "llm_loss": 0.5286804735660553, "loss": 2.488, "loss_aux_layer_0": 0.021759033203125, "loss_aux_layer_1": 0.0396728515625, "loss_aux_layer_10": 0.06787109375, "loss_aux_layer_11": 0.072265625, "loss_aux_layer_12": 0.07763671875, "loss_aux_layer_13": 0.083740234375, "loss_aux_layer_14": 0.093505859375, "loss_aux_layer_15": 0.103271484375, "loss_aux_layer_16": 0.113525390625, "loss_aux_layer_17": 0.1217041015625, "loss_aux_layer_18": 0.130859375, "loss_aux_layer_19": 0.1337890625, "loss_aux_layer_2": 0.05224609375, "loss_aux_layer_20": 0.141357421875, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.1669921875, "loss_aux_layer_23": 0.203125, "loss_aux_layer_3": 0.06256103515625, "loss_aux_layer_4": 0.0655517578125, "loss_aux_layer_5": 0.067626953125, "loss_aux_layer_6": 0.0706787109375, "loss_aux_layer_7": 0.06805419921875, "loss_aux_layer_8": 0.0675048828125, "loss_aux_layer_9": 0.06658935546875, "step": 2355, "total_loss": 0.6219978332519531 }, { "epoch": 0.46644228865571175, "grad_norm": 1.1067664623260498, "learning_rate": 5e-05, "llm_loss": 0.6394871845841408, "loss": 2.9562, "loss_aux_layer_0": 0.02130126953125, "loss_aux_layer_1": 0.04266357421875, "loss_aux_layer_10": 0.0731201171875, "loss_aux_layer_11": 0.077880859375, "loss_aux_layer_12": 0.08349609375, "loss_aux_layer_13": 0.090087890625, "loss_aux_layer_14": 0.1004638671875, "loss_aux_layer_15": 0.10986328125, "loss_aux_layer_16": 0.120361328125, "loss_aux_layer_17": 0.128662109375, "loss_aux_layer_18": 0.137939453125, "loss_aux_layer_19": 0.141845703125, "loss_aux_layer_2": 0.05743408203125, "loss_aux_layer_20": 0.14892578125, "loss_aux_layer_21": 0.15576171875, "loss_aux_layer_22": 0.177490234375, "loss_aux_layer_23": 0.21630859375, "loss_aux_layer_3": 0.06781005859375, "loss_aux_layer_4": 0.071044921875, "loss_aux_layer_5": 0.072998046875, "loss_aux_layer_6": 0.076416015625, "loss_aux_layer_7": 0.0733642578125, "loss_aux_layer_8": 0.0726318359375, "loss_aux_layer_9": 0.071533203125, "step": 2356, "total_loss": 0.7390568107366562 }, { "epoch": 0.4666402692536131, "grad_norm": 0.8845146894454956, "learning_rate": 5e-05, "llm_loss": 0.5854804664850235, "loss": 2.7254, "loss_aux_layer_0": 0.019989013671875, "loss_aux_layer_1": 0.04248046875, "loss_aux_layer_10": 0.07183837890625, "loss_aux_layer_11": 0.0760498046875, "loss_aux_layer_12": 0.081298828125, "loss_aux_layer_13": 0.0872802734375, "loss_aux_layer_14": 0.0963134765625, "loss_aux_layer_15": 0.105712890625, "loss_aux_layer_16": 0.115478515625, "loss_aux_layer_17": 0.122802734375, "loss_aux_layer_18": 0.130615234375, "loss_aux_layer_19": 0.1328125, "loss_aux_layer_2": 0.0565185546875, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.147705078125, "loss_aux_layer_22": 0.16845703125, "loss_aux_layer_23": 0.20654296875, "loss_aux_layer_3": 0.0672607421875, "loss_aux_layer_4": 0.070068359375, "loss_aux_layer_5": 0.07177734375, "loss_aux_layer_6": 0.0753173828125, "loss_aux_layer_7": 0.07269287109375, "loss_aux_layer_8": 0.0718994140625, "loss_aux_layer_9": 0.0704345703125, "step": 2357, "total_loss": 0.6813537925481796 }, { "epoch": 0.46683824985151456, "grad_norm": 1.1779433488845825, "learning_rate": 5e-05, "llm_loss": 0.6104554682970047, "loss": 2.8129, "loss_aux_layer_0": 0.020355224609375, "loss_aux_layer_1": 0.0399169921875, "loss_aux_layer_10": 0.06787109375, "loss_aux_layer_11": 0.0726318359375, "loss_aux_layer_12": 0.077880859375, "loss_aux_layer_13": 0.083984375, "loss_aux_layer_14": 0.0928955078125, "loss_aux_layer_15": 0.1019287109375, "loss_aux_layer_16": 0.111328125, "loss_aux_layer_17": 0.1190185546875, "loss_aux_layer_18": 0.1270751953125, "loss_aux_layer_19": 0.130126953125, "loss_aux_layer_2": 0.05303955078125, "loss_aux_layer_20": 0.138427734375, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.2060546875, "loss_aux_layer_3": 0.0633544921875, "loss_aux_layer_4": 0.06591796875, "loss_aux_layer_5": 0.06787109375, "loss_aux_layer_6": 0.07080078125, "loss_aux_layer_7": 0.0679931640625, "loss_aux_layer_8": 0.067626953125, "loss_aux_layer_9": 0.0665283203125, "step": 2358, "total_loss": 0.7032370269298553 }, { "epoch": 0.46703623044941595, "grad_norm": 1.7917540073394775, "learning_rate": 5e-05, "llm_loss": 0.5459986254572868, "loss": 2.5493, "loss_aux_layer_0": 0.019317626953125, "loss_aux_layer_1": 0.03924560546875, "loss_aux_layer_10": 0.0662841796875, "loss_aux_layer_11": 0.07080078125, "loss_aux_layer_12": 0.07568359375, "loss_aux_layer_13": 0.0816650390625, "loss_aux_layer_14": 0.0906982421875, "loss_aux_layer_15": 0.099609375, "loss_aux_layer_16": 0.109130859375, "loss_aux_layer_17": 0.1175537109375, "loss_aux_layer_18": 0.1259765625, "loss_aux_layer_19": 0.129638671875, "loss_aux_layer_2": 0.0523681640625, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.203857421875, "loss_aux_layer_3": 0.06256103515625, "loss_aux_layer_4": 0.0653076171875, "loss_aux_layer_5": 0.06689453125, "loss_aux_layer_6": 0.0694580078125, "loss_aux_layer_7": 0.06689453125, "loss_aux_layer_8": 0.066162109375, "loss_aux_layer_9": 0.0648193359375, "step": 2359, "total_loss": 0.6373256593942642 }, { "epoch": 0.4672342110473174, "grad_norm": 1.1130248308181763, "learning_rate": 5e-05, "llm_loss": 0.6862860172986984, "loss": 3.1235, "loss_aux_layer_0": 0.021087646484375, "loss_aux_layer_1": 0.04095458984375, "loss_aux_layer_10": 0.06988525390625, "loss_aux_layer_11": 0.0743408203125, "loss_aux_layer_12": 0.079833984375, "loss_aux_layer_13": 0.0860595703125, "loss_aux_layer_14": 0.0953369140625, "loss_aux_layer_15": 0.104248046875, "loss_aux_layer_16": 0.114013671875, "loss_aux_layer_17": 0.121826171875, "loss_aux_layer_18": 0.130126953125, "loss_aux_layer_19": 0.1328125, "loss_aux_layer_2": 0.0543212890625, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.1474609375, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.20361328125, "loss_aux_layer_3": 0.0654296875, "loss_aux_layer_4": 0.068359375, "loss_aux_layer_5": 0.0701904296875, "loss_aux_layer_6": 0.073486328125, "loss_aux_layer_7": 0.0709228515625, "loss_aux_layer_8": 0.070068359375, "loss_aux_layer_9": 0.0687255859375, "step": 2360, "total_loss": 0.7808856070041656 }, { "epoch": 0.46743219164521876, "grad_norm": 1.385265827178955, "learning_rate": 5e-05, "llm_loss": 0.6377389878034592, "loss": 2.9374, "loss_aux_layer_0": 0.022186279296875, "loss_aux_layer_1": 0.04241943359375, "loss_aux_layer_10": 0.071533203125, "loss_aux_layer_11": 0.076171875, "loss_aux_layer_12": 0.081298828125, "loss_aux_layer_13": 0.0872802734375, "loss_aux_layer_14": 0.0965576171875, "loss_aux_layer_15": 0.10595703125, "loss_aux_layer_16": 0.1160888671875, "loss_aux_layer_17": 0.124267578125, "loss_aux_layer_18": 0.1328125, "loss_aux_layer_19": 0.135009765625, "loss_aux_layer_2": 0.056640625, "loss_aux_layer_20": 0.142333984375, "loss_aux_layer_21": 0.1494140625, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.20751953125, "loss_aux_layer_3": 0.0677490234375, "loss_aux_layer_4": 0.07080078125, "loss_aux_layer_5": 0.072509765625, "loss_aux_layer_6": 0.0755615234375, "loss_aux_layer_7": 0.07275390625, "loss_aux_layer_8": 0.0716552734375, "loss_aux_layer_9": 0.0703125, "step": 2361, "total_loss": 0.7343449145555496 }, { "epoch": 0.4676301722431202, "grad_norm": 1.4534298181533813, "learning_rate": 5e-05, "llm_loss": 0.5909821018576622, "loss": 2.7377, "loss_aux_layer_0": 0.020263671875, "loss_aux_layer_1": 0.0404052734375, "loss_aux_layer_10": 0.0684814453125, "loss_aux_layer_11": 0.0732421875, "loss_aux_layer_12": 0.0782470703125, "loss_aux_layer_13": 0.0843505859375, "loss_aux_layer_14": 0.0936279296875, "loss_aux_layer_15": 0.1026611328125, "loss_aux_layer_16": 0.1123046875, "loss_aux_layer_17": 0.1199951171875, "loss_aux_layer_18": 0.129150390625, "loss_aux_layer_19": 0.1318359375, "loss_aux_layer_2": 0.05364990234375, "loss_aux_layer_20": 0.13916015625, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.2041015625, "loss_aux_layer_3": 0.064453125, "loss_aux_layer_4": 0.0675048828125, "loss_aux_layer_5": 0.0692138671875, "loss_aux_layer_6": 0.0723876953125, "loss_aux_layer_7": 0.069580078125, "loss_aux_layer_8": 0.068359375, "loss_aux_layer_9": 0.0675048828125, "step": 2362, "total_loss": 0.6844167709350586 }, { "epoch": 0.4678281528410216, "grad_norm": 1.0772860050201416, "learning_rate": 5e-05, "llm_loss": 0.5653322637081146, "loss": 2.6355, "loss_aux_layer_0": 0.02130126953125, "loss_aux_layer_1": 0.041259765625, "loss_aux_layer_10": 0.0675048828125, "loss_aux_layer_11": 0.0716552734375, "loss_aux_layer_12": 0.0767822265625, "loss_aux_layer_13": 0.08349609375, "loss_aux_layer_14": 0.0933837890625, "loss_aux_layer_15": 0.1025390625, "loss_aux_layer_16": 0.1129150390625, "loss_aux_layer_17": 0.12109375, "loss_aux_layer_18": 0.1298828125, "loss_aux_layer_19": 0.13330078125, "loss_aux_layer_2": 0.05340576171875, "loss_aux_layer_20": 0.141357421875, "loss_aux_layer_21": 0.148681640625, "loss_aux_layer_22": 0.16943359375, "loss_aux_layer_23": 0.20751953125, "loss_aux_layer_3": 0.06298828125, "loss_aux_layer_4": 0.06573486328125, "loss_aux_layer_5": 0.0677490234375, "loss_aux_layer_6": 0.0712890625, "loss_aux_layer_7": 0.0684814453125, "loss_aux_layer_8": 0.0675048828125, "loss_aux_layer_9": 0.06622314453125, "step": 2363, "total_loss": 0.6588834375143051 }, { "epoch": 0.46802613343892296, "grad_norm": 2.2254245281219482, "learning_rate": 5e-05, "llm_loss": 0.6185203194618225, "loss": 2.8475, "loss_aux_layer_0": 0.020263671875, "loss_aux_layer_1": 0.0408935546875, "loss_aux_layer_10": 0.0682373046875, "loss_aux_layer_11": 0.0721435546875, "loss_aux_layer_12": 0.0771484375, "loss_aux_layer_13": 0.083251953125, "loss_aux_layer_14": 0.0924072265625, "loss_aux_layer_15": 0.1016845703125, "loss_aux_layer_16": 0.112060546875, "loss_aux_layer_17": 0.1201171875, "loss_aux_layer_18": 0.128662109375, "loss_aux_layer_19": 0.13134765625, "loss_aux_layer_2": 0.05426025390625, "loss_aux_layer_20": 0.138671875, "loss_aux_layer_21": 0.14697265625, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.207275390625, "loss_aux_layer_3": 0.064453125, "loss_aux_layer_4": 0.0673828125, "loss_aux_layer_5": 0.069091796875, "loss_aux_layer_6": 0.0718994140625, "loss_aux_layer_7": 0.0693359375, "loss_aux_layer_8": 0.068115234375, "loss_aux_layer_9": 0.067138671875, "step": 2364, "total_loss": 0.7118730992078781 }, { "epoch": 0.4682241140368244, "grad_norm": 1.3555266857147217, "learning_rate": 5e-05, "llm_loss": 0.5786503478884697, "loss": 2.7003, "loss_aux_layer_0": 0.020751953125, "loss_aux_layer_1": 0.04296875, "loss_aux_layer_10": 0.07177734375, "loss_aux_layer_11": 0.07666015625, "loss_aux_layer_12": 0.0819091796875, "loss_aux_layer_13": 0.0880126953125, "loss_aux_layer_14": 0.09716796875, "loss_aux_layer_15": 0.1064453125, "loss_aux_layer_16": 0.116455078125, "loss_aux_layer_17": 0.1239013671875, "loss_aux_layer_18": 0.132080078125, "loss_aux_layer_19": 0.13427734375, "loss_aux_layer_2": 0.0565185546875, "loss_aux_layer_20": 0.142333984375, "loss_aux_layer_21": 0.148681640625, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.205810546875, "loss_aux_layer_3": 0.0679931640625, "loss_aux_layer_4": 0.0709228515625, "loss_aux_layer_5": 0.0721435546875, "loss_aux_layer_6": 0.074951171875, "loss_aux_layer_7": 0.072265625, "loss_aux_layer_8": 0.0714111328125, "loss_aux_layer_9": 0.0699462890625, "step": 2365, "total_loss": 0.6750690639019012 }, { "epoch": 0.4684220946347258, "grad_norm": 1.3840264081954956, "learning_rate": 5e-05, "llm_loss": 0.5828538239002228, "loss": 2.6956, "loss_aux_layer_0": 0.020294189453125, "loss_aux_layer_1": 0.0400390625, "loss_aux_layer_10": 0.0665283203125, "loss_aux_layer_11": 0.070556640625, "loss_aux_layer_12": 0.07568359375, "loss_aux_layer_13": 0.0814208984375, "loss_aux_layer_14": 0.09033203125, "loss_aux_layer_15": 0.0989990234375, "loss_aux_layer_16": 0.1090087890625, "loss_aux_layer_17": 0.1170654296875, "loss_aux_layer_18": 0.1251220703125, "loss_aux_layer_19": 0.128173828125, "loss_aux_layer_2": 0.0526123046875, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.20263671875, "loss_aux_layer_3": 0.062255859375, "loss_aux_layer_4": 0.06488037109375, "loss_aux_layer_5": 0.0665283203125, "loss_aux_layer_6": 0.0694580078125, "loss_aux_layer_7": 0.0670166015625, "loss_aux_layer_8": 0.0665283203125, "loss_aux_layer_9": 0.06512451171875, "step": 2366, "total_loss": 0.6739008128643036 }, { "epoch": 0.4686200752326272, "grad_norm": 1.2702349424362183, "learning_rate": 5e-05, "llm_loss": 0.5927588567137718, "loss": 2.7612, "loss_aux_layer_0": 0.019805908203125, "loss_aux_layer_1": 0.0428466796875, "loss_aux_layer_10": 0.072021484375, "loss_aux_layer_11": 0.076904296875, "loss_aux_layer_12": 0.082275390625, "loss_aux_layer_13": 0.0887451171875, "loss_aux_layer_14": 0.098388671875, "loss_aux_layer_15": 0.1075439453125, "loss_aux_layer_16": 0.1175537109375, "loss_aux_layer_17": 0.125732421875, "loss_aux_layer_18": 0.134521484375, "loss_aux_layer_19": 0.13720703125, "loss_aux_layer_2": 0.056884765625, "loss_aux_layer_20": 0.14453125, "loss_aux_layer_21": 0.151611328125, "loss_aux_layer_22": 0.1728515625, "loss_aux_layer_23": 0.21044921875, "loss_aux_layer_3": 0.0677490234375, "loss_aux_layer_4": 0.0704345703125, "loss_aux_layer_5": 0.0718994140625, "loss_aux_layer_6": 0.0750732421875, "loss_aux_layer_7": 0.07275390625, "loss_aux_layer_8": 0.0718994140625, "loss_aux_layer_9": 0.070556640625, "step": 2367, "total_loss": 0.6903006434440613 }, { "epoch": 0.4688180558305286, "grad_norm": 1.367804765701294, "learning_rate": 5e-05, "llm_loss": 0.5971303880214691, "loss": 2.7634, "loss_aux_layer_0": 0.021270751953125, "loss_aux_layer_1": 0.04168701171875, "loss_aux_layer_10": 0.06884765625, "loss_aux_layer_11": 0.0736083984375, "loss_aux_layer_12": 0.078857421875, "loss_aux_layer_13": 0.0849609375, "loss_aux_layer_14": 0.094482421875, "loss_aux_layer_15": 0.1036376953125, "loss_aux_layer_16": 0.1129150390625, "loss_aux_layer_17": 0.1204833984375, "loss_aux_layer_18": 0.12841796875, "loss_aux_layer_19": 0.1307373046875, "loss_aux_layer_2": 0.0550537109375, "loss_aux_layer_20": 0.137939453125, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.165771484375, "loss_aux_layer_23": 0.20361328125, "loss_aux_layer_3": 0.06512451171875, "loss_aux_layer_4": 0.06787109375, "loss_aux_layer_5": 0.06915283203125, "loss_aux_layer_6": 0.072021484375, "loss_aux_layer_7": 0.06982421875, "loss_aux_layer_8": 0.06878662109375, "loss_aux_layer_9": 0.0675048828125, "step": 2368, "total_loss": 0.6908444762229919 }, { "epoch": 0.46901603642843004, "grad_norm": 1.13620126247406, "learning_rate": 5e-05, "llm_loss": 0.6280467808246613, "loss": 2.8875, "loss_aux_layer_0": 0.02130126953125, "loss_aux_layer_1": 0.04144287109375, "loss_aux_layer_10": 0.06854248046875, "loss_aux_layer_11": 0.0736083984375, "loss_aux_layer_12": 0.0787353515625, "loss_aux_layer_13": 0.084716796875, "loss_aux_layer_14": 0.0938720703125, "loss_aux_layer_15": 0.103515625, "loss_aux_layer_16": 0.11376953125, "loss_aux_layer_17": 0.121337890625, "loss_aux_layer_18": 0.1297607421875, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.05364990234375, "loss_aux_layer_20": 0.139404296875, "loss_aux_layer_21": 0.146484375, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.2060546875, "loss_aux_layer_3": 0.06414794921875, "loss_aux_layer_4": 0.06671142578125, "loss_aux_layer_5": 0.0684814453125, "loss_aux_layer_6": 0.0714111328125, "loss_aux_layer_7": 0.0687255859375, "loss_aux_layer_8": 0.068115234375, "loss_aux_layer_9": 0.067138671875, "step": 2369, "total_loss": 0.7218646556138992 }, { "epoch": 0.4692140170263314, "grad_norm": 0.8923109769821167, "learning_rate": 5e-05, "llm_loss": 0.5845535546541214, "loss": 2.712, "loss_aux_layer_0": 0.019866943359375, "loss_aux_layer_1": 0.0416259765625, "loss_aux_layer_10": 0.069580078125, "loss_aux_layer_11": 0.0745849609375, "loss_aux_layer_12": 0.079833984375, "loss_aux_layer_13": 0.085205078125, "loss_aux_layer_14": 0.0936279296875, "loss_aux_layer_15": 0.10205078125, "loss_aux_layer_16": 0.112060546875, "loss_aux_layer_17": 0.119384765625, "loss_aux_layer_18": 0.127685546875, "loss_aux_layer_19": 0.13037109375, "loss_aux_layer_2": 0.05401611328125, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.16455078125, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.0648193359375, "loss_aux_layer_4": 0.0675048828125, "loss_aux_layer_5": 0.0697021484375, "loss_aux_layer_6": 0.0726318359375, "loss_aux_layer_7": 0.0701904296875, "loss_aux_layer_8": 0.0697021484375, "loss_aux_layer_9": 0.0684814453125, "step": 2370, "total_loss": 0.6779902875423431 }, { "epoch": 0.4694119976242328, "grad_norm": 1.3581924438476562, "learning_rate": 5e-05, "llm_loss": 0.6347983777523041, "loss": 2.9202, "loss_aux_layer_0": 0.020782470703125, "loss_aux_layer_1": 0.0423583984375, "loss_aux_layer_10": 0.0703125, "loss_aux_layer_11": 0.0750732421875, "loss_aux_layer_12": 0.08056640625, "loss_aux_layer_13": 0.0865478515625, "loss_aux_layer_14": 0.0958251953125, "loss_aux_layer_15": 0.104736328125, "loss_aux_layer_16": 0.114990234375, "loss_aux_layer_17": 0.1226806640625, "loss_aux_layer_18": 0.130126953125, "loss_aux_layer_19": 0.1328125, "loss_aux_layer_2": 0.05633544921875, "loss_aux_layer_20": 0.140380859375, "loss_aux_layer_21": 0.147216796875, "loss_aux_layer_22": 0.16748046875, "loss_aux_layer_23": 0.20361328125, "loss_aux_layer_3": 0.067138671875, "loss_aux_layer_4": 0.0701904296875, "loss_aux_layer_5": 0.0714111328125, "loss_aux_layer_6": 0.07421875, "loss_aux_layer_7": 0.071533203125, "loss_aux_layer_8": 0.0706787109375, "loss_aux_layer_9": 0.069091796875, "step": 2371, "total_loss": 0.7300588339567184 }, { "epoch": 0.46960997822213424, "grad_norm": 0.949487566947937, "learning_rate": 5e-05, "llm_loss": 0.537249818444252, "loss": 2.5389, "loss_aux_layer_0": 0.020782470703125, "loss_aux_layer_1": 0.04296875, "loss_aux_layer_10": 0.0726318359375, "loss_aux_layer_11": 0.0777587890625, "loss_aux_layer_12": 0.0826416015625, "loss_aux_layer_13": 0.0889892578125, "loss_aux_layer_14": 0.098388671875, "loss_aux_layer_15": 0.1075439453125, "loss_aux_layer_16": 0.1173095703125, "loss_aux_layer_17": 0.124267578125, "loss_aux_layer_18": 0.132568359375, "loss_aux_layer_19": 0.135009765625, "loss_aux_layer_2": 0.0576171875, "loss_aux_layer_20": 0.14208984375, "loss_aux_layer_21": 0.150390625, "loss_aux_layer_22": 0.173095703125, "loss_aux_layer_23": 0.212158203125, "loss_aux_layer_3": 0.067626953125, "loss_aux_layer_4": 0.07080078125, "loss_aux_layer_5": 0.0723876953125, "loss_aux_layer_6": 0.0758056640625, "loss_aux_layer_7": 0.0728759765625, "loss_aux_layer_8": 0.0723876953125, "loss_aux_layer_9": 0.0711669921875, "step": 2372, "total_loss": 0.6347188651561737 }, { "epoch": 0.4698079588200356, "grad_norm": 1.0774598121643066, "learning_rate": 5e-05, "llm_loss": 0.6042049527168274, "loss": 2.7857, "loss_aux_layer_0": 0.021148681640625, "loss_aux_layer_1": 0.03985595703125, "loss_aux_layer_10": 0.06689453125, "loss_aux_layer_11": 0.0712890625, "loss_aux_layer_12": 0.075927734375, "loss_aux_layer_13": 0.0816650390625, "loss_aux_layer_14": 0.0909423828125, "loss_aux_layer_15": 0.0999755859375, "loss_aux_layer_16": 0.109619140625, "loss_aux_layer_17": 0.1175537109375, "loss_aux_layer_18": 0.1260986328125, "loss_aux_layer_19": 0.1298828125, "loss_aux_layer_2": 0.05352783203125, "loss_aux_layer_20": 0.137939453125, "loss_aux_layer_21": 0.147216796875, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.208251953125, "loss_aux_layer_3": 0.06304931640625, "loss_aux_layer_4": 0.06573486328125, "loss_aux_layer_5": 0.0677490234375, "loss_aux_layer_6": 0.070556640625, "loss_aux_layer_7": 0.06787109375, "loss_aux_layer_8": 0.0672607421875, "loss_aux_layer_9": 0.06591796875, "step": 2373, "total_loss": 0.6964278221130371 }, { "epoch": 0.47000593941793706, "grad_norm": 1.2176460027694702, "learning_rate": 5e-05, "llm_loss": 0.564696878194809, "loss": 2.6394, "loss_aux_layer_0": 0.022491455078125, "loss_aux_layer_1": 0.0426025390625, "loss_aux_layer_10": 0.0701904296875, "loss_aux_layer_11": 0.074951171875, "loss_aux_layer_12": 0.07958984375, "loss_aux_layer_13": 0.085205078125, "loss_aux_layer_14": 0.09423828125, "loss_aux_layer_15": 0.1036376953125, "loss_aux_layer_16": 0.11376953125, "loss_aux_layer_17": 0.122314453125, "loss_aux_layer_18": 0.130615234375, "loss_aux_layer_19": 0.13330078125, "loss_aux_layer_2": 0.055908203125, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.14697265625, "loss_aux_layer_22": 0.168701171875, "loss_aux_layer_23": 0.2080078125, "loss_aux_layer_3": 0.0665283203125, "loss_aux_layer_4": 0.069091796875, "loss_aux_layer_5": 0.07080078125, "loss_aux_layer_6": 0.0736083984375, "loss_aux_layer_7": 0.0709228515625, "loss_aux_layer_8": 0.070068359375, "loss_aux_layer_9": 0.06884765625, "step": 2374, "total_loss": 0.6598392426967621 }, { "epoch": 0.47020392001583844, "grad_norm": 1.29951810836792, "learning_rate": 5e-05, "llm_loss": 0.6191258579492569, "loss": 2.8466, "loss_aux_layer_0": 0.020599365234375, "loss_aux_layer_1": 0.037841796875, "loss_aux_layer_10": 0.0665283203125, "loss_aux_layer_11": 0.0709228515625, "loss_aux_layer_12": 0.076171875, "loss_aux_layer_13": 0.082275390625, "loss_aux_layer_14": 0.091796875, "loss_aux_layer_15": 0.10107421875, "loss_aux_layer_16": 0.1112060546875, "loss_aux_layer_17": 0.1199951171875, "loss_aux_layer_18": 0.12890625, "loss_aux_layer_19": 0.133056640625, "loss_aux_layer_2": 0.051025390625, "loss_aux_layer_20": 0.14111328125, "loss_aux_layer_21": 0.1494140625, "loss_aux_layer_22": 0.17041015625, "loss_aux_layer_23": 0.209716796875, "loss_aux_layer_3": 0.06109619140625, "loss_aux_layer_4": 0.064208984375, "loss_aux_layer_5": 0.06585693359375, "loss_aux_layer_6": 0.06903076171875, "loss_aux_layer_7": 0.06689453125, "loss_aux_layer_8": 0.06622314453125, "loss_aux_layer_9": 0.06512451171875, "step": 2375, "total_loss": 0.711646243929863 }, { "epoch": 0.4704019006137399, "grad_norm": 1.326037883758545, "learning_rate": 5e-05, "llm_loss": 0.6240338534116745, "loss": 2.8646, "loss_aux_layer_0": 0.020538330078125, "loss_aux_layer_1": 0.039794921875, "loss_aux_layer_10": 0.0662841796875, "loss_aux_layer_11": 0.0709228515625, "loss_aux_layer_12": 0.0760498046875, "loss_aux_layer_13": 0.0821533203125, "loss_aux_layer_14": 0.091552734375, "loss_aux_layer_15": 0.100830078125, "loss_aux_layer_16": 0.1109619140625, "loss_aux_layer_17": 0.1192626953125, "loss_aux_layer_18": 0.1279296875, "loss_aux_layer_19": 0.13134765625, "loss_aux_layer_2": 0.05218505859375, "loss_aux_layer_20": 0.138916015625, "loss_aux_layer_21": 0.14697265625, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.206298828125, "loss_aux_layer_3": 0.061767578125, "loss_aux_layer_4": 0.0648193359375, "loss_aux_layer_5": 0.066650390625, "loss_aux_layer_6": 0.0694580078125, "loss_aux_layer_7": 0.067138671875, "loss_aux_layer_8": 0.06640625, "loss_aux_layer_9": 0.0648193359375, "step": 2376, "total_loss": 0.716159775853157 }, { "epoch": 0.47059988121164126, "grad_norm": 1.1474523544311523, "learning_rate": 5e-05, "llm_loss": 0.5554503500461578, "loss": 2.5937, "loss_aux_layer_0": 0.02166748046875, "loss_aux_layer_1": 0.03961181640625, "loss_aux_layer_10": 0.06787109375, "loss_aux_layer_11": 0.0723876953125, "loss_aux_layer_12": 0.0775146484375, "loss_aux_layer_13": 0.0838623046875, "loss_aux_layer_14": 0.093017578125, "loss_aux_layer_15": 0.1019287109375, "loss_aux_layer_16": 0.1116943359375, "loss_aux_layer_17": 0.1199951171875, "loss_aux_layer_18": 0.1279296875, "loss_aux_layer_19": 0.131103515625, "loss_aux_layer_2": 0.053466796875, "loss_aux_layer_20": 0.138427734375, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.20556640625, "loss_aux_layer_3": 0.0633544921875, "loss_aux_layer_4": 0.0662841796875, "loss_aux_layer_5": 0.06781005859375, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.068359375, "loss_aux_layer_8": 0.0677490234375, "loss_aux_layer_9": 0.06683349609375, "step": 2377, "total_loss": 0.648428201675415 }, { "epoch": 0.47079786180954264, "grad_norm": 1.140200138092041, "learning_rate": 5e-05, "llm_loss": 0.5171389728784561, "loss": 2.4371, "loss_aux_layer_0": 0.020782470703125, "loss_aux_layer_1": 0.03753662109375, "loss_aux_layer_10": 0.065673828125, "loss_aux_layer_11": 0.0701904296875, "loss_aux_layer_12": 0.075439453125, "loss_aux_layer_13": 0.082275390625, "loss_aux_layer_14": 0.091796875, "loss_aux_layer_15": 0.1019287109375, "loss_aux_layer_16": 0.1126708984375, "loss_aux_layer_17": 0.12109375, "loss_aux_layer_18": 0.1300048828125, "loss_aux_layer_19": 0.1337890625, "loss_aux_layer_2": 0.04913330078125, "loss_aux_layer_20": 0.1416015625, "loss_aux_layer_21": 0.149658203125, "loss_aux_layer_22": 0.170166015625, "loss_aux_layer_23": 0.208740234375, "loss_aux_layer_3": 0.05908203125, "loss_aux_layer_4": 0.06207275390625, "loss_aux_layer_5": 0.06439208984375, "loss_aux_layer_6": 0.0677490234375, "loss_aux_layer_7": 0.0654296875, "loss_aux_layer_8": 0.06494140625, "loss_aux_layer_9": 0.0643310546875, "step": 2378, "total_loss": 0.6092648729681969 }, { "epoch": 0.4709958424074441, "grad_norm": 1.1738439798355103, "learning_rate": 5e-05, "llm_loss": 0.629721611738205, "loss": 2.8953, "loss_aux_layer_0": 0.020233154296875, "loss_aux_layer_1": 0.04034423828125, "loss_aux_layer_10": 0.0682373046875, "loss_aux_layer_11": 0.07275390625, "loss_aux_layer_12": 0.078369140625, "loss_aux_layer_13": 0.084716796875, "loss_aux_layer_14": 0.09375, "loss_aux_layer_15": 0.10302734375, "loss_aux_layer_16": 0.1136474609375, "loss_aux_layer_17": 0.1219482421875, "loss_aux_layer_18": 0.130615234375, "loss_aux_layer_19": 0.13427734375, "loss_aux_layer_2": 0.052978515625, "loss_aux_layer_20": 0.14208984375, "loss_aux_layer_21": 0.14990234375, "loss_aux_layer_22": 0.17041015625, "loss_aux_layer_23": 0.20849609375, "loss_aux_layer_3": 0.06298828125, "loss_aux_layer_4": 0.066162109375, "loss_aux_layer_5": 0.068603515625, "loss_aux_layer_6": 0.071533203125, "loss_aux_layer_7": 0.069091796875, "loss_aux_layer_8": 0.06817626953125, "loss_aux_layer_9": 0.06695556640625, "step": 2379, "total_loss": 0.7238209694623947 }, { "epoch": 0.47119382300534546, "grad_norm": 0.9352769255638123, "learning_rate": 5e-05, "llm_loss": 0.6858568787574768, "loss": 3.1296, "loss_aux_layer_0": 0.019866943359375, "loss_aux_layer_1": 0.0423583984375, "loss_aux_layer_10": 0.072509765625, "loss_aux_layer_11": 0.0771484375, "loss_aux_layer_12": 0.082763671875, "loss_aux_layer_13": 0.0887451171875, "loss_aux_layer_14": 0.09765625, "loss_aux_layer_15": 0.1064453125, "loss_aux_layer_16": 0.1165771484375, "loss_aux_layer_17": 0.1240234375, "loss_aux_layer_18": 0.132568359375, "loss_aux_layer_19": 0.134521484375, "loss_aux_layer_2": 0.056396484375, "loss_aux_layer_20": 0.141357421875, "loss_aux_layer_21": 0.1484375, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.20556640625, "loss_aux_layer_3": 0.0675048828125, "loss_aux_layer_4": 0.070556640625, "loss_aux_layer_5": 0.072265625, "loss_aux_layer_6": 0.0755615234375, "loss_aux_layer_7": 0.072998046875, "loss_aux_layer_8": 0.0721435546875, "loss_aux_layer_9": 0.07080078125, "step": 2380, "total_loss": 0.7823944538831711 }, { "epoch": 0.4713918036032469, "grad_norm": 0.8196918964385986, "learning_rate": 5e-05, "llm_loss": 0.6163325682282448, "loss": 2.8467, "loss_aux_layer_0": 0.02093505859375, "loss_aux_layer_1": 0.0416259765625, "loss_aux_layer_10": 0.07080078125, "loss_aux_layer_11": 0.07568359375, "loss_aux_layer_12": 0.0809326171875, "loss_aux_layer_13": 0.0867919921875, "loss_aux_layer_14": 0.095947265625, "loss_aux_layer_15": 0.1048583984375, "loss_aux_layer_16": 0.11474609375, "loss_aux_layer_17": 0.123291015625, "loss_aux_layer_18": 0.130859375, "loss_aux_layer_19": 0.1337890625, "loss_aux_layer_2": 0.054931640625, "loss_aux_layer_20": 0.140380859375, "loss_aux_layer_21": 0.14697265625, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.204833984375, "loss_aux_layer_3": 0.06622314453125, "loss_aux_layer_4": 0.06915283203125, "loss_aux_layer_5": 0.0709228515625, "loss_aux_layer_6": 0.0740966796875, "loss_aux_layer_7": 0.0716552734375, "loss_aux_layer_8": 0.07086181640625, "loss_aux_layer_9": 0.0694580078125, "step": 2381, "total_loss": 0.7116724997758865 }, { "epoch": 0.4715897842011483, "grad_norm": 1.0220603942871094, "learning_rate": 5e-05, "llm_loss": 0.6914677768945694, "loss": 3.14, "loss_aux_layer_0": 0.02099609375, "loss_aux_layer_1": 0.0404052734375, "loss_aux_layer_10": 0.068115234375, "loss_aux_layer_11": 0.0726318359375, "loss_aux_layer_12": 0.0780029296875, "loss_aux_layer_13": 0.0843505859375, "loss_aux_layer_14": 0.093505859375, "loss_aux_layer_15": 0.10302734375, "loss_aux_layer_16": 0.1131591796875, "loss_aux_layer_17": 0.121337890625, "loss_aux_layer_18": 0.129638671875, "loss_aux_layer_19": 0.1329345703125, "loss_aux_layer_2": 0.052978515625, "loss_aux_layer_20": 0.140380859375, "loss_aux_layer_21": 0.147705078125, "loss_aux_layer_22": 0.167724609375, "loss_aux_layer_23": 0.20654296875, "loss_aux_layer_3": 0.0633544921875, "loss_aux_layer_4": 0.06585693359375, "loss_aux_layer_5": 0.0679931640625, "loss_aux_layer_6": 0.0712890625, "loss_aux_layer_7": 0.06866455078125, "loss_aux_layer_8": 0.068115234375, "loss_aux_layer_9": 0.06689453125, "step": 2382, "total_loss": 0.785000666975975 }, { "epoch": 0.4717877647990497, "grad_norm": 1.0025193691253662, "learning_rate": 5e-05, "llm_loss": 0.6985554695129395, "loss": 3.1736, "loss_aux_layer_0": 0.021820068359375, "loss_aux_layer_1": 0.0408935546875, "loss_aux_layer_10": 0.0689697265625, "loss_aux_layer_11": 0.0736083984375, "loss_aux_layer_12": 0.0787353515625, "loss_aux_layer_13": 0.08447265625, "loss_aux_layer_14": 0.09423828125, "loss_aux_layer_15": 0.1036376953125, "loss_aux_layer_16": 0.11376953125, "loss_aux_layer_17": 0.12255859375, "loss_aux_layer_18": 0.131591796875, "loss_aux_layer_19": 0.13427734375, "loss_aux_layer_2": 0.053466796875, "loss_aux_layer_20": 0.142333984375, "loss_aux_layer_21": 0.150146484375, "loss_aux_layer_22": 0.17236328125, "loss_aux_layer_23": 0.2109375, "loss_aux_layer_3": 0.063720703125, "loss_aux_layer_4": 0.0670166015625, "loss_aux_layer_5": 0.06890869140625, "loss_aux_layer_6": 0.072265625, "loss_aux_layer_7": 0.06951904296875, "loss_aux_layer_8": 0.06854248046875, "loss_aux_layer_9": 0.0675048828125, "step": 2383, "total_loss": 0.7933993339538574 }, { "epoch": 0.4719857453969511, "grad_norm": 0.9376727342605591, "learning_rate": 5e-05, "llm_loss": 0.6389637216925621, "loss": 2.9147, "loss_aux_layer_0": 0.020355224609375, "loss_aux_layer_1": 0.038330078125, "loss_aux_layer_10": 0.065185546875, "loss_aux_layer_11": 0.0697021484375, "loss_aux_layer_12": 0.074462890625, "loss_aux_layer_13": 0.080322265625, "loss_aux_layer_14": 0.089599609375, "loss_aux_layer_15": 0.0985107421875, "loss_aux_layer_16": 0.1083984375, "loss_aux_layer_17": 0.1162109375, "loss_aux_layer_18": 0.1246337890625, "loss_aux_layer_19": 0.1275634765625, "loss_aux_layer_2": 0.051025390625, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.060546875, "loss_aux_layer_4": 0.063232421875, "loss_aux_layer_5": 0.0648193359375, "loss_aux_layer_6": 0.0679931640625, "loss_aux_layer_7": 0.06591796875, "loss_aux_layer_8": 0.06494140625, "loss_aux_layer_9": 0.0640869140625, "step": 2384, "total_loss": 0.7286798506975174 }, { "epoch": 0.4721837259948525, "grad_norm": 1.1025919914245605, "learning_rate": 5e-05, "llm_loss": 0.7151637822389603, "loss": 3.2323, "loss_aux_layer_0": 0.020965576171875, "loss_aux_layer_1": 0.03973388671875, "loss_aux_layer_10": 0.06756591796875, "loss_aux_layer_11": 0.072265625, "loss_aux_layer_12": 0.0770263671875, "loss_aux_layer_13": 0.0830078125, "loss_aux_layer_14": 0.092529296875, "loss_aux_layer_15": 0.102294921875, "loss_aux_layer_16": 0.112548828125, "loss_aux_layer_17": 0.1207275390625, "loss_aux_layer_18": 0.1298828125, "loss_aux_layer_19": 0.133056640625, "loss_aux_layer_2": 0.052001953125, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.148193359375, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.2060546875, "loss_aux_layer_3": 0.06219482421875, "loss_aux_layer_4": 0.06488037109375, "loss_aux_layer_5": 0.0667724609375, "loss_aux_layer_6": 0.0699462890625, "loss_aux_layer_7": 0.06768798828125, "loss_aux_layer_8": 0.066650390625, "loss_aux_layer_9": 0.06610107421875, "step": 2385, "total_loss": 0.808086484670639 }, { "epoch": 0.4723817065927539, "grad_norm": 0.7932115197181702, "learning_rate": 5e-05, "llm_loss": 0.614467978477478, "loss": 2.8317, "loss_aux_layer_0": 0.020172119140625, "loss_aux_layer_1": 0.0399169921875, "loss_aux_layer_10": 0.069091796875, "loss_aux_layer_11": 0.07373046875, "loss_aux_layer_12": 0.0789794921875, "loss_aux_layer_13": 0.0853271484375, "loss_aux_layer_14": 0.094482421875, "loss_aux_layer_15": 0.103515625, "loss_aux_layer_16": 0.11279296875, "loss_aux_layer_17": 0.1204833984375, "loss_aux_layer_18": 0.12841796875, "loss_aux_layer_19": 0.130615234375, "loss_aux_layer_2": 0.053466796875, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.1669921875, "loss_aux_layer_23": 0.205322265625, "loss_aux_layer_3": 0.06365966796875, "loss_aux_layer_4": 0.0667724609375, "loss_aux_layer_5": 0.0687255859375, "loss_aux_layer_6": 0.0716552734375, "loss_aux_layer_7": 0.0693359375, "loss_aux_layer_8": 0.0687255859375, "loss_aux_layer_9": 0.0675048828125, "step": 2386, "total_loss": 0.7079324722290039 }, { "epoch": 0.4725796871906553, "grad_norm": 0.9214038848876953, "learning_rate": 5e-05, "llm_loss": 0.6332480758428574, "loss": 2.9169, "loss_aux_layer_0": 0.021392822265625, "loss_aux_layer_1": 0.04052734375, "loss_aux_layer_10": 0.070068359375, "loss_aux_layer_11": 0.0750732421875, "loss_aux_layer_12": 0.0802001953125, "loss_aux_layer_13": 0.086669921875, "loss_aux_layer_14": 0.096435546875, "loss_aux_layer_15": 0.106201171875, "loss_aux_layer_16": 0.1165771484375, "loss_aux_layer_17": 0.1248779296875, "loss_aux_layer_18": 0.1337890625, "loss_aux_layer_19": 0.137451171875, "loss_aux_layer_2": 0.05364990234375, "loss_aux_layer_20": 0.14501953125, "loss_aux_layer_21": 0.152587890625, "loss_aux_layer_22": 0.172607421875, "loss_aux_layer_23": 0.211181640625, "loss_aux_layer_3": 0.06396484375, "loss_aux_layer_4": 0.06695556640625, "loss_aux_layer_5": 0.0689697265625, "loss_aux_layer_6": 0.072265625, "loss_aux_layer_7": 0.06982421875, "loss_aux_layer_8": 0.0692138671875, "loss_aux_layer_9": 0.0682373046875, "step": 2387, "total_loss": 0.7292258739471436 }, { "epoch": 0.47277766778855673, "grad_norm": 0.7710842490196228, "learning_rate": 5e-05, "llm_loss": 0.6456728577613831, "loss": 2.9525, "loss_aux_layer_0": 0.019744873046875, "loss_aux_layer_1": 0.03955078125, "loss_aux_layer_10": 0.0679931640625, "loss_aux_layer_11": 0.072509765625, "loss_aux_layer_12": 0.0775146484375, "loss_aux_layer_13": 0.0833740234375, "loss_aux_layer_14": 0.092529296875, "loss_aux_layer_15": 0.1014404296875, "loss_aux_layer_16": 0.111328125, "loss_aux_layer_17": 0.119384765625, "loss_aux_layer_18": 0.12744140625, "loss_aux_layer_19": 0.130859375, "loss_aux_layer_2": 0.05169677734375, "loss_aux_layer_20": 0.138427734375, "loss_aux_layer_21": 0.146484375, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.2041015625, "loss_aux_layer_3": 0.0623779296875, "loss_aux_layer_4": 0.06524658203125, "loss_aux_layer_5": 0.0673828125, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.0687255859375, "loss_aux_layer_8": 0.06787109375, "loss_aux_layer_9": 0.066650390625, "step": 2388, "total_loss": 0.7381344735622406 }, { "epoch": 0.4729756483864581, "grad_norm": 1.0749927759170532, "learning_rate": 5e-05, "llm_loss": 0.5571756958961487, "loss": 2.6119, "loss_aux_layer_0": 0.0203857421875, "loss_aux_layer_1": 0.04095458984375, "loss_aux_layer_10": 0.07177734375, "loss_aux_layer_11": 0.0767822265625, "loss_aux_layer_12": 0.08203125, "loss_aux_layer_13": 0.087890625, "loss_aux_layer_14": 0.0968017578125, "loss_aux_layer_15": 0.10546875, "loss_aux_layer_16": 0.115234375, "loss_aux_layer_17": 0.1226806640625, "loss_aux_layer_18": 0.1309814453125, "loss_aux_layer_19": 0.1336669921875, "loss_aux_layer_2": 0.0550537109375, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.16943359375, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.0660400390625, "loss_aux_layer_4": 0.06890869140625, "loss_aux_layer_5": 0.07073974609375, "loss_aux_layer_6": 0.0740966796875, "loss_aux_layer_7": 0.0718994140625, "loss_aux_layer_8": 0.0714111328125, "loss_aux_layer_9": 0.07025146484375, "step": 2389, "total_loss": 0.6529867500066757 }, { "epoch": 0.47317362898435955, "grad_norm": 0.8962938189506531, "learning_rate": 5e-05, "llm_loss": 0.527157373726368, "loss": 2.4795, "loss_aux_layer_0": 0.020843505859375, "loss_aux_layer_1": 0.03900146484375, "loss_aux_layer_10": 0.066650390625, "loss_aux_layer_11": 0.0712890625, "loss_aux_layer_12": 0.076416015625, "loss_aux_layer_13": 0.0826416015625, "loss_aux_layer_14": 0.09228515625, "loss_aux_layer_15": 0.10205078125, "loss_aux_layer_16": 0.1119384765625, "loss_aux_layer_17": 0.12109375, "loss_aux_layer_18": 0.12939453125, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.0523681640625, "loss_aux_layer_20": 0.14013671875, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.16943359375, "loss_aux_layer_23": 0.2080078125, "loss_aux_layer_3": 0.0621337890625, "loss_aux_layer_4": 0.064697265625, "loss_aux_layer_5": 0.066162109375, "loss_aux_layer_6": 0.069091796875, "loss_aux_layer_7": 0.06689453125, "loss_aux_layer_8": 0.06640625, "loss_aux_layer_9": 0.06512451171875, "step": 2390, "total_loss": 0.6198651492595673 }, { "epoch": 0.47337160958226093, "grad_norm": 0.9082748293876648, "learning_rate": 5e-05, "llm_loss": 0.6112455427646637, "loss": 2.8275, "loss_aux_layer_0": 0.02056884765625, "loss_aux_layer_1": 0.04034423828125, "loss_aux_layer_10": 0.071044921875, "loss_aux_layer_11": 0.07568359375, "loss_aux_layer_12": 0.0809326171875, "loss_aux_layer_13": 0.08740234375, "loss_aux_layer_14": 0.0968017578125, "loss_aux_layer_15": 0.1060791015625, "loss_aux_layer_16": 0.1160888671875, "loss_aux_layer_17": 0.124267578125, "loss_aux_layer_18": 0.131591796875, "loss_aux_layer_19": 0.13427734375, "loss_aux_layer_2": 0.053955078125, "loss_aux_layer_20": 0.1416015625, "loss_aux_layer_21": 0.1494140625, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.209716796875, "loss_aux_layer_3": 0.0648193359375, "loss_aux_layer_4": 0.067626953125, "loss_aux_layer_5": 0.0693359375, "loss_aux_layer_6": 0.0728759765625, "loss_aux_layer_7": 0.0703125, "loss_aux_layer_8": 0.0701904296875, "loss_aux_layer_9": 0.0692138671875, "step": 2391, "total_loss": 0.7068695574998856 }, { "epoch": 0.47356959018016237, "grad_norm": 1.0597659349441528, "learning_rate": 5e-05, "llm_loss": 0.49988964945077896, "loss": 2.3856, "loss_aux_layer_0": 0.0196533203125, "loss_aux_layer_1": 0.041259765625, "loss_aux_layer_10": 0.0711669921875, "loss_aux_layer_11": 0.0758056640625, "loss_aux_layer_12": 0.081298828125, "loss_aux_layer_13": 0.088134765625, "loss_aux_layer_14": 0.0980224609375, "loss_aux_layer_15": 0.107421875, "loss_aux_layer_16": 0.1175537109375, "loss_aux_layer_17": 0.125, "loss_aux_layer_18": 0.13427734375, "loss_aux_layer_19": 0.13623046875, "loss_aux_layer_2": 0.05426025390625, "loss_aux_layer_20": 0.1435546875, "loss_aux_layer_21": 0.15087890625, "loss_aux_layer_22": 0.171142578125, "loss_aux_layer_23": 0.2099609375, "loss_aux_layer_3": 0.065185546875, "loss_aux_layer_4": 0.0684814453125, "loss_aux_layer_5": 0.070556640625, "loss_aux_layer_6": 0.0740966796875, "loss_aux_layer_7": 0.07177734375, "loss_aux_layer_8": 0.0706787109375, "loss_aux_layer_9": 0.0697021484375, "step": 2392, "total_loss": 0.5963906645774841 }, { "epoch": 0.47376757077806375, "grad_norm": 0.7711087465286255, "learning_rate": 5e-05, "llm_loss": 0.5364978536963463, "loss": 2.5176, "loss_aux_layer_0": 0.021392822265625, "loss_aux_layer_1": 0.03936767578125, "loss_aux_layer_10": 0.0672607421875, "loss_aux_layer_11": 0.0721435546875, "loss_aux_layer_12": 0.0772705078125, "loss_aux_layer_13": 0.0831298828125, "loss_aux_layer_14": 0.0928955078125, "loss_aux_layer_15": 0.1025390625, "loss_aux_layer_16": 0.1124267578125, "loss_aux_layer_17": 0.1202392578125, "loss_aux_layer_18": 0.1287841796875, "loss_aux_layer_19": 0.131591796875, "loss_aux_layer_2": 0.0531005859375, "loss_aux_layer_20": 0.138671875, "loss_aux_layer_21": 0.146240234375, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.206298828125, "loss_aux_layer_3": 0.06304931640625, "loss_aux_layer_4": 0.0657958984375, "loss_aux_layer_5": 0.068115234375, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.0679931640625, "loss_aux_layer_8": 0.067138671875, "loss_aux_layer_9": 0.06591796875, "step": 2393, "total_loss": 0.6294028460979462 }, { "epoch": 0.47396555137596513, "grad_norm": 0.9872943162918091, "learning_rate": 5e-05, "llm_loss": 0.5284209996461868, "loss": 2.4982, "loss_aux_layer_0": 0.0205078125, "loss_aux_layer_1": 0.0416259765625, "loss_aux_layer_10": 0.0711669921875, "loss_aux_layer_11": 0.076416015625, "loss_aux_layer_12": 0.0816650390625, "loss_aux_layer_13": 0.0877685546875, "loss_aux_layer_14": 0.0966796875, "loss_aux_layer_15": 0.10546875, "loss_aux_layer_16": 0.1151123046875, "loss_aux_layer_17": 0.122802734375, "loss_aux_layer_18": 0.1307373046875, "loss_aux_layer_19": 0.1339111328125, "loss_aux_layer_2": 0.05511474609375, "loss_aux_layer_20": 0.1416015625, "loss_aux_layer_21": 0.150146484375, "loss_aux_layer_22": 0.1728515625, "loss_aux_layer_23": 0.21240234375, "loss_aux_layer_3": 0.06585693359375, "loss_aux_layer_4": 0.06884765625, "loss_aux_layer_5": 0.0709228515625, "loss_aux_layer_6": 0.073974609375, "loss_aux_layer_7": 0.0716552734375, "loss_aux_layer_8": 0.07080078125, "loss_aux_layer_9": 0.0697021484375, "step": 2394, "total_loss": 0.6245437487959862 }, { "epoch": 0.47416353197386657, "grad_norm": 1.1497282981872559, "learning_rate": 5e-05, "llm_loss": 0.5990994870662689, "loss": 2.7681, "loss_aux_layer_0": 0.02032470703125, "loss_aux_layer_1": 0.039794921875, "loss_aux_layer_10": 0.067626953125, "loss_aux_layer_11": 0.0721435546875, "loss_aux_layer_12": 0.0771484375, "loss_aux_layer_13": 0.0828857421875, "loss_aux_layer_14": 0.09228515625, "loss_aux_layer_15": 0.10107421875, "loss_aux_layer_16": 0.111083984375, "loss_aux_layer_17": 0.118896484375, "loss_aux_layer_18": 0.127685546875, "loss_aux_layer_19": 0.131103515625, "loss_aux_layer_2": 0.05291748046875, "loss_aux_layer_20": 0.139404296875, "loss_aux_layer_21": 0.147705078125, "loss_aux_layer_22": 0.169189453125, "loss_aux_layer_23": 0.2080078125, "loss_aux_layer_3": 0.063232421875, "loss_aux_layer_4": 0.066162109375, "loss_aux_layer_5": 0.06787109375, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.068603515625, "loss_aux_layer_8": 0.0679931640625, "loss_aux_layer_9": 0.066650390625, "step": 2395, "total_loss": 0.6920310705900192 }, { "epoch": 0.47436151257176795, "grad_norm": 1.241775631904602, "learning_rate": 5e-05, "llm_loss": 0.5498551577329636, "loss": 2.5782, "loss_aux_layer_0": 0.0203857421875, "loss_aux_layer_1": 0.0396728515625, "loss_aux_layer_10": 0.0684814453125, "loss_aux_layer_11": 0.072998046875, "loss_aux_layer_12": 0.078369140625, "loss_aux_layer_13": 0.084716796875, "loss_aux_layer_14": 0.094482421875, "loss_aux_layer_15": 0.104248046875, "loss_aux_layer_16": 0.1148681640625, "loss_aux_layer_17": 0.1234130859375, "loss_aux_layer_18": 0.132568359375, "loss_aux_layer_19": 0.135498046875, "loss_aux_layer_2": 0.0531005859375, "loss_aux_layer_20": 0.142822265625, "loss_aux_layer_21": 0.150634765625, "loss_aux_layer_22": 0.17236328125, "loss_aux_layer_23": 0.211669921875, "loss_aux_layer_3": 0.06365966796875, "loss_aux_layer_4": 0.0660400390625, "loss_aux_layer_5": 0.06787109375, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.068359375, "loss_aux_layer_8": 0.068115234375, "loss_aux_layer_9": 0.0670166015625, "step": 2396, "total_loss": 0.6445384845137596 }, { "epoch": 0.4745594931696694, "grad_norm": 0.9314337372779846, "learning_rate": 5e-05, "llm_loss": 0.5355919152498245, "loss": 2.5361, "loss_aux_layer_0": 0.020111083984375, "loss_aux_layer_1": 0.04248046875, "loss_aux_layer_10": 0.0733642578125, "loss_aux_layer_11": 0.0780029296875, "loss_aux_layer_12": 0.0833740234375, "loss_aux_layer_13": 0.0894775390625, "loss_aux_layer_14": 0.0985107421875, "loss_aux_layer_15": 0.1077880859375, "loss_aux_layer_16": 0.117919921875, "loss_aux_layer_17": 0.1256103515625, "loss_aux_layer_18": 0.134033203125, "loss_aux_layer_19": 0.136962890625, "loss_aux_layer_2": 0.05743408203125, "loss_aux_layer_20": 0.14453125, "loss_aux_layer_21": 0.15234375, "loss_aux_layer_22": 0.1748046875, "loss_aux_layer_23": 0.21337890625, "loss_aux_layer_3": 0.0689697265625, "loss_aux_layer_4": 0.0718994140625, "loss_aux_layer_5": 0.073974609375, "loss_aux_layer_6": 0.0770263671875, "loss_aux_layer_7": 0.074462890625, "loss_aux_layer_8": 0.073486328125, "loss_aux_layer_9": 0.072265625, "step": 2397, "total_loss": 0.6340335309505463 }, { "epoch": 0.47475747376757077, "grad_norm": 0.9404352307319641, "learning_rate": 5e-05, "llm_loss": 0.5712299346923828, "loss": 2.6496, "loss_aux_layer_0": 0.020416259765625, "loss_aux_layer_1": 0.03765869140625, "loss_aux_layer_10": 0.06524658203125, "loss_aux_layer_11": 0.06982421875, "loss_aux_layer_12": 0.0750732421875, "loss_aux_layer_13": 0.0810546875, "loss_aux_layer_14": 0.091064453125, "loss_aux_layer_15": 0.100830078125, "loss_aux_layer_16": 0.111083984375, "loss_aux_layer_17": 0.1199951171875, "loss_aux_layer_18": 0.1278076171875, "loss_aux_layer_19": 0.13134765625, "loss_aux_layer_2": 0.05029296875, "loss_aux_layer_20": 0.138427734375, "loss_aux_layer_21": 0.1455078125, "loss_aux_layer_22": 0.16650390625, "loss_aux_layer_23": 0.20556640625, "loss_aux_layer_3": 0.06011962890625, "loss_aux_layer_4": 0.06298828125, "loss_aux_layer_5": 0.0648193359375, "loss_aux_layer_6": 0.0677490234375, "loss_aux_layer_7": 0.06561279296875, "loss_aux_layer_8": 0.0648193359375, "loss_aux_layer_9": 0.06402587890625, "step": 2398, "total_loss": 0.6623905748128891 }, { "epoch": 0.4749554543654722, "grad_norm": 0.8449000120162964, "learning_rate": 5e-05, "llm_loss": 0.6626904606819153, "loss": 3.0165, "loss_aux_layer_0": 0.01959228515625, "loss_aux_layer_1": 0.0384521484375, "loss_aux_layer_10": 0.06610107421875, "loss_aux_layer_11": 0.0703125, "loss_aux_layer_12": 0.0755615234375, "loss_aux_layer_13": 0.081787109375, "loss_aux_layer_14": 0.091064453125, "loss_aux_layer_15": 0.1009521484375, "loss_aux_layer_16": 0.111083984375, "loss_aux_layer_17": 0.1195068359375, "loss_aux_layer_18": 0.128173828125, "loss_aux_layer_19": 0.131591796875, "loss_aux_layer_2": 0.0504150390625, "loss_aux_layer_20": 0.139404296875, "loss_aux_layer_21": 0.146240234375, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.060546875, "loss_aux_layer_4": 0.06353759765625, "loss_aux_layer_5": 0.0655517578125, "loss_aux_layer_6": 0.0687255859375, "loss_aux_layer_7": 0.06640625, "loss_aux_layer_8": 0.06585693359375, "loss_aux_layer_9": 0.064697265625, "step": 2399, "total_loss": 0.7541147917509079 }, { "epoch": 0.4751534349633736, "grad_norm": 1.2079966068267822, "learning_rate": 5e-05, "llm_loss": 0.6643569469451904, "loss": 3.0293, "loss_aux_layer_0": 0.01995849609375, "loss_aux_layer_1": 0.03955078125, "loss_aux_layer_10": 0.067138671875, "loss_aux_layer_11": 0.071533203125, "loss_aux_layer_12": 0.076171875, "loss_aux_layer_13": 0.08203125, "loss_aux_layer_14": 0.091552734375, "loss_aux_layer_15": 0.100830078125, "loss_aux_layer_16": 0.11083984375, "loss_aux_layer_17": 0.1190185546875, "loss_aux_layer_18": 0.1282958984375, "loss_aux_layer_19": 0.132080078125, "loss_aux_layer_2": 0.05340576171875, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.17041015625, "loss_aux_layer_23": 0.21044921875, "loss_aux_layer_3": 0.0635986328125, "loss_aux_layer_4": 0.0662841796875, "loss_aux_layer_5": 0.0679931640625, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.06817626953125, "loss_aux_layer_8": 0.0673828125, "loss_aux_layer_9": 0.06585693359375, "step": 2400, "total_loss": 0.7573214173316956 }, { "epoch": 0.47535141556127497, "grad_norm": 1.0028605461120605, "learning_rate": 5e-05, "llm_loss": 0.5165634155273438, "loss": 2.4423, "loss_aux_layer_0": 0.01947021484375, "loss_aux_layer_1": 0.03924560546875, "loss_aux_layer_10": 0.068359375, "loss_aux_layer_11": 0.07275390625, "loss_aux_layer_12": 0.077880859375, "loss_aux_layer_13": 0.083984375, "loss_aux_layer_14": 0.093505859375, "loss_aux_layer_15": 0.1029052734375, "loss_aux_layer_16": 0.1129150390625, "loss_aux_layer_17": 0.121337890625, "loss_aux_layer_18": 0.130859375, "loss_aux_layer_19": 0.13525390625, "loss_aux_layer_2": 0.05255126953125, "loss_aux_layer_20": 0.142578125, "loss_aux_layer_21": 0.150390625, "loss_aux_layer_22": 0.171142578125, "loss_aux_layer_23": 0.21044921875, "loss_aux_layer_3": 0.06292724609375, "loss_aux_layer_4": 0.06591796875, "loss_aux_layer_5": 0.0677490234375, "loss_aux_layer_6": 0.0711669921875, "loss_aux_layer_7": 0.0687255859375, "loss_aux_layer_8": 0.0679931640625, "loss_aux_layer_9": 0.06689453125, "step": 2401, "total_loss": 0.610577717423439 }, { "epoch": 0.4755493961591764, "grad_norm": 0.7192879319190979, "learning_rate": 5e-05, "llm_loss": 0.5594947338104248, "loss": 2.6271, "loss_aux_layer_0": 0.0208740234375, "loss_aux_layer_1": 0.0408935546875, "loss_aux_layer_10": 0.0718994140625, "loss_aux_layer_11": 0.076416015625, "loss_aux_layer_12": 0.081787109375, "loss_aux_layer_13": 0.08837890625, "loss_aux_layer_14": 0.097412109375, "loss_aux_layer_15": 0.10693359375, "loss_aux_layer_16": 0.1173095703125, "loss_aux_layer_17": 0.12548828125, "loss_aux_layer_18": 0.133544921875, "loss_aux_layer_19": 0.136962890625, "loss_aux_layer_2": 0.054931640625, "loss_aux_layer_20": 0.14404296875, "loss_aux_layer_21": 0.15234375, "loss_aux_layer_22": 0.17578125, "loss_aux_layer_23": 0.215087890625, "loss_aux_layer_3": 0.06610107421875, "loss_aux_layer_4": 0.0689697265625, "loss_aux_layer_5": 0.071044921875, "loss_aux_layer_6": 0.0745849609375, "loss_aux_layer_7": 0.072021484375, "loss_aux_layer_8": 0.071533203125, "loss_aux_layer_9": 0.0704345703125, "step": 2402, "total_loss": 0.6567854285240173 }, { "epoch": 0.4757473767570778, "grad_norm": 1.244797945022583, "learning_rate": 5e-05, "llm_loss": 0.5535410344600677, "loss": 2.5927, "loss_aux_layer_0": 0.020172119140625, "loss_aux_layer_1": 0.03961181640625, "loss_aux_layer_10": 0.0684814453125, "loss_aux_layer_11": 0.0731201171875, "loss_aux_layer_12": 0.078369140625, "loss_aux_layer_13": 0.084716796875, "loss_aux_layer_14": 0.094482421875, "loss_aux_layer_15": 0.1043701171875, "loss_aux_layer_16": 0.114501953125, "loss_aux_layer_17": 0.12353515625, "loss_aux_layer_18": 0.1324462890625, "loss_aux_layer_19": 0.1357421875, "loss_aux_layer_2": 0.052734375, "loss_aux_layer_20": 0.143310546875, "loss_aux_layer_21": 0.150634765625, "loss_aux_layer_22": 0.17138671875, "loss_aux_layer_23": 0.209716796875, "loss_aux_layer_3": 0.06317138671875, "loss_aux_layer_4": 0.0662841796875, "loss_aux_layer_5": 0.068359375, "loss_aux_layer_6": 0.0716552734375, "loss_aux_layer_7": 0.06915283203125, "loss_aux_layer_8": 0.06829833984375, "loss_aux_layer_9": 0.0670166015625, "step": 2403, "total_loss": 0.6481698602437973 }, { "epoch": 0.4759453573549792, "grad_norm": 1.1363215446472168, "learning_rate": 5e-05, "llm_loss": 0.6010894030332565, "loss": 2.8002, "loss_aux_layer_0": 0.0205078125, "loss_aux_layer_1": 0.04248046875, "loss_aux_layer_10": 0.0728759765625, "loss_aux_layer_11": 0.077880859375, "loss_aux_layer_12": 0.0831298828125, "loss_aux_layer_13": 0.089599609375, "loss_aux_layer_14": 0.099609375, "loss_aux_layer_15": 0.109619140625, "loss_aux_layer_16": 0.1195068359375, "loss_aux_layer_17": 0.1275634765625, "loss_aux_layer_18": 0.13623046875, "loss_aux_layer_19": 0.1396484375, "loss_aux_layer_2": 0.057373046875, "loss_aux_layer_20": 0.147216796875, "loss_aux_layer_21": 0.154541015625, "loss_aux_layer_22": 0.17626953125, "loss_aux_layer_23": 0.215576171875, "loss_aux_layer_3": 0.0684814453125, "loss_aux_layer_4": 0.0712890625, "loss_aux_layer_5": 0.072998046875, "loss_aux_layer_6": 0.0760498046875, "loss_aux_layer_7": 0.0732421875, "loss_aux_layer_8": 0.072509765625, "loss_aux_layer_9": 0.0714111328125, "step": 2404, "total_loss": 0.7000577449798584 }, { "epoch": 0.4761433379528806, "grad_norm": 1.158227562904358, "learning_rate": 5e-05, "llm_loss": 0.584450013935566, "loss": 2.7116, "loss_aux_layer_0": 0.020111083984375, "loss_aux_layer_1": 0.03985595703125, "loss_aux_layer_10": 0.0684814453125, "loss_aux_layer_11": 0.0732421875, "loss_aux_layer_12": 0.0782470703125, "loss_aux_layer_13": 0.0845947265625, "loss_aux_layer_14": 0.093994140625, "loss_aux_layer_15": 0.103515625, "loss_aux_layer_16": 0.113525390625, "loss_aux_layer_17": 0.1212158203125, "loss_aux_layer_18": 0.1295166015625, "loss_aux_layer_19": 0.13232421875, "loss_aux_layer_2": 0.05322265625, "loss_aux_layer_20": 0.13916015625, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.205078125, "loss_aux_layer_3": 0.0634765625, "loss_aux_layer_4": 0.0660400390625, "loss_aux_layer_5": 0.0679931640625, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.068603515625, "loss_aux_layer_8": 0.06787109375, "loss_aux_layer_9": 0.0672607421875, "step": 2405, "total_loss": 0.6778945177793503 }, { "epoch": 0.47634131855078204, "grad_norm": 1.2248733043670654, "learning_rate": 5e-05, "llm_loss": 0.6503502577543259, "loss": 2.9682, "loss_aux_layer_0": 0.019744873046875, "loss_aux_layer_1": 0.03924560546875, "loss_aux_layer_10": 0.066650390625, "loss_aux_layer_11": 0.0711669921875, "loss_aux_layer_12": 0.0760498046875, "loss_aux_layer_13": 0.0821533203125, "loss_aux_layer_14": 0.0909423828125, "loss_aux_layer_15": 0.099853515625, "loss_aux_layer_16": 0.1099853515625, "loss_aux_layer_17": 0.1182861328125, "loss_aux_layer_18": 0.1270751953125, "loss_aux_layer_19": 0.130615234375, "loss_aux_layer_2": 0.052490234375, "loss_aux_layer_20": 0.138427734375, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.166748046875, "loss_aux_layer_23": 0.205078125, "loss_aux_layer_3": 0.062255859375, "loss_aux_layer_4": 0.064697265625, "loss_aux_layer_5": 0.06610107421875, "loss_aux_layer_6": 0.0689697265625, "loss_aux_layer_7": 0.0665283203125, "loss_aux_layer_8": 0.06622314453125, "loss_aux_layer_9": 0.06512451171875, "step": 2406, "total_loss": 0.742050513625145 }, { "epoch": 0.4765392991486834, "grad_norm": 1.1749017238616943, "learning_rate": 5e-05, "llm_loss": 0.5785124897956848, "loss": 2.6975, "loss_aux_layer_0": 0.02056884765625, "loss_aux_layer_1": 0.039794921875, "loss_aux_layer_10": 0.0697021484375, "loss_aux_layer_11": 0.0743408203125, "loss_aux_layer_12": 0.0797119140625, "loss_aux_layer_13": 0.08642578125, "loss_aux_layer_14": 0.0968017578125, "loss_aux_layer_15": 0.1063232421875, "loss_aux_layer_16": 0.1165771484375, "loss_aux_layer_17": 0.12451171875, "loss_aux_layer_18": 0.13330078125, "loss_aux_layer_19": 0.1357421875, "loss_aux_layer_2": 0.05419921875, "loss_aux_layer_20": 0.143310546875, "loss_aux_layer_21": 0.151123046875, "loss_aux_layer_22": 0.17236328125, "loss_aux_layer_23": 0.211181640625, "loss_aux_layer_3": 0.064697265625, "loss_aux_layer_4": 0.06793212890625, "loss_aux_layer_5": 0.06982421875, "loss_aux_layer_6": 0.0732421875, "loss_aux_layer_7": 0.07061767578125, "loss_aux_layer_8": 0.06976318359375, "loss_aux_layer_9": 0.06866455078125, "step": 2407, "total_loss": 0.6743724793195724 }, { "epoch": 0.4767372797465848, "grad_norm": 1.0420955419540405, "learning_rate": 5e-05, "llm_loss": 0.537051372230053, "loss": 2.5232, "loss_aux_layer_0": 0.019073486328125, "loss_aux_layer_1": 0.040771484375, "loss_aux_layer_10": 0.06884765625, "loss_aux_layer_11": 0.073486328125, "loss_aux_layer_12": 0.0784912109375, "loss_aux_layer_13": 0.08447265625, "loss_aux_layer_14": 0.0933837890625, "loss_aux_layer_15": 0.1021728515625, "loss_aux_layer_16": 0.112060546875, "loss_aux_layer_17": 0.1207275390625, "loss_aux_layer_18": 0.12890625, "loss_aux_layer_19": 0.132080078125, "loss_aux_layer_2": 0.05377197265625, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.147705078125, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.206787109375, "loss_aux_layer_3": 0.06451416015625, "loss_aux_layer_4": 0.0675048828125, "loss_aux_layer_5": 0.069091796875, "loss_aux_layer_6": 0.072021484375, "loss_aux_layer_7": 0.0693359375, "loss_aux_layer_8": 0.068603515625, "loss_aux_layer_9": 0.06744384765625, "step": 2408, "total_loss": 0.6308059692382812 }, { "epoch": 0.47693526034448624, "grad_norm": 1.2050859928131104, "learning_rate": 5e-05, "llm_loss": 0.6177889704704285, "loss": 2.8398, "loss_aux_layer_0": 0.0198974609375, "loss_aux_layer_1": 0.03961181640625, "loss_aux_layer_10": 0.0679931640625, "loss_aux_layer_11": 0.0723876953125, "loss_aux_layer_12": 0.07763671875, "loss_aux_layer_13": 0.0841064453125, "loss_aux_layer_14": 0.0926513671875, "loss_aux_layer_15": 0.1016845703125, "loss_aux_layer_16": 0.1112060546875, "loss_aux_layer_17": 0.119140625, "loss_aux_layer_18": 0.126708984375, "loss_aux_layer_19": 0.12939453125, "loss_aux_layer_2": 0.05352783203125, "loss_aux_layer_20": 0.136474609375, "loss_aux_layer_21": 0.142822265625, "loss_aux_layer_22": 0.16357421875, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.0638427734375, "loss_aux_layer_4": 0.066650390625, "loss_aux_layer_5": 0.0682373046875, "loss_aux_layer_6": 0.0711669921875, "loss_aux_layer_7": 0.0687255859375, "loss_aux_layer_8": 0.06787109375, "loss_aux_layer_9": 0.06658935546875, "step": 2409, "total_loss": 0.7099596709012985 }, { "epoch": 0.4771332409423876, "grad_norm": 0.9486609697341919, "learning_rate": 5e-05, "llm_loss": 0.6760174930095673, "loss": 3.0665, "loss_aux_layer_0": 0.019805908203125, "loss_aux_layer_1": 0.038818359375, "loss_aux_layer_10": 0.0655517578125, "loss_aux_layer_11": 0.0694580078125, "loss_aux_layer_12": 0.07421875, "loss_aux_layer_13": 0.080322265625, "loss_aux_layer_14": 0.0897216796875, "loss_aux_layer_15": 0.0987548828125, "loss_aux_layer_16": 0.1085205078125, "loss_aux_layer_17": 0.1165771484375, "loss_aux_layer_18": 0.12548828125, "loss_aux_layer_19": 0.1290283203125, "loss_aux_layer_2": 0.05084228515625, "loss_aux_layer_20": 0.13720703125, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.20361328125, "loss_aux_layer_3": 0.06103515625, "loss_aux_layer_4": 0.06402587890625, "loss_aux_layer_5": 0.065673828125, "loss_aux_layer_6": 0.06878662109375, "loss_aux_layer_7": 0.06646728515625, "loss_aux_layer_8": 0.06549072265625, "loss_aux_layer_9": 0.0643310546875, "step": 2410, "total_loss": 0.7666273713111877 }, { "epoch": 0.47733122154028906, "grad_norm": 1.2370816469192505, "learning_rate": 5e-05, "llm_loss": 0.6157191470265388, "loss": 2.8379, "loss_aux_layer_0": 0.019622802734375, "loss_aux_layer_1": 0.041015625, "loss_aux_layer_10": 0.0692138671875, "loss_aux_layer_11": 0.073974609375, "loss_aux_layer_12": 0.0792236328125, "loss_aux_layer_13": 0.08544921875, "loss_aux_layer_14": 0.09423828125, "loss_aux_layer_15": 0.103271484375, "loss_aux_layer_16": 0.1131591796875, "loss_aux_layer_17": 0.1212158203125, "loss_aux_layer_18": 0.129638671875, "loss_aux_layer_19": 0.131591796875, "loss_aux_layer_2": 0.05401611328125, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.146484375, "loss_aux_layer_22": 0.166748046875, "loss_aux_layer_23": 0.20361328125, "loss_aux_layer_3": 0.06414794921875, "loss_aux_layer_4": 0.067138671875, "loss_aux_layer_5": 0.069091796875, "loss_aux_layer_6": 0.072021484375, "loss_aux_layer_7": 0.0697021484375, "loss_aux_layer_8": 0.0689697265625, "loss_aux_layer_9": 0.0677490234375, "step": 2411, "total_loss": 0.7094817906618118 }, { "epoch": 0.47752920213819045, "grad_norm": 0.9784076809883118, "learning_rate": 5e-05, "llm_loss": 0.6206605732440948, "loss": 2.8593, "loss_aux_layer_0": 0.01983642578125, "loss_aux_layer_1": 0.0406494140625, "loss_aux_layer_10": 0.06982421875, "loss_aux_layer_11": 0.074462890625, "loss_aux_layer_12": 0.0794677734375, "loss_aux_layer_13": 0.0855712890625, "loss_aux_layer_14": 0.0943603515625, "loss_aux_layer_15": 0.1031494140625, "loss_aux_layer_16": 0.11279296875, "loss_aux_layer_17": 0.120849609375, "loss_aux_layer_18": 0.128173828125, "loss_aux_layer_19": 0.130859375, "loss_aux_layer_2": 0.05474853515625, "loss_aux_layer_20": 0.138427734375, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.167724609375, "loss_aux_layer_23": 0.205810546875, "loss_aux_layer_3": 0.065185546875, "loss_aux_layer_4": 0.068359375, "loss_aux_layer_5": 0.06982421875, "loss_aux_layer_6": 0.072998046875, "loss_aux_layer_7": 0.07080078125, "loss_aux_layer_8": 0.0701904296875, "loss_aux_layer_9": 0.0684814453125, "step": 2412, "total_loss": 0.7148290723562241 }, { "epoch": 0.4777271827360919, "grad_norm": 1.4506198167800903, "learning_rate": 5e-05, "llm_loss": 0.6937623769044876, "loss": 3.1423, "loss_aux_layer_0": 0.020355224609375, "loss_aux_layer_1": 0.03875732421875, "loss_aux_layer_10": 0.0675048828125, "loss_aux_layer_11": 0.07196044921875, "loss_aux_layer_12": 0.0770263671875, "loss_aux_layer_13": 0.082763671875, "loss_aux_layer_14": 0.0919189453125, "loss_aux_layer_15": 0.1011962890625, "loss_aux_layer_16": 0.111083984375, "loss_aux_layer_17": 0.119140625, "loss_aux_layer_18": 0.12744140625, "loss_aux_layer_19": 0.130615234375, "loss_aux_layer_2": 0.0516357421875, "loss_aux_layer_20": 0.137939453125, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.164794921875, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.06195068359375, "loss_aux_layer_4": 0.06494140625, "loss_aux_layer_5": 0.0667724609375, "loss_aux_layer_6": 0.0697021484375, "loss_aux_layer_7": 0.0673828125, "loss_aux_layer_8": 0.06671142578125, "loss_aux_layer_9": 0.06573486328125, "step": 2413, "total_loss": 0.7855708748102188 }, { "epoch": 0.47792516333399326, "grad_norm": 0.8106880784034729, "learning_rate": 5e-05, "llm_loss": 0.5268816202878952, "loss": 2.476, "loss_aux_layer_0": 0.02105712890625, "loss_aux_layer_1": 0.03857421875, "loss_aux_layer_10": 0.0673828125, "loss_aux_layer_11": 0.0716552734375, "loss_aux_layer_12": 0.0767822265625, "loss_aux_layer_13": 0.0831298828125, "loss_aux_layer_14": 0.0921630859375, "loss_aux_layer_15": 0.1014404296875, "loss_aux_layer_16": 0.111328125, "loss_aux_layer_17": 0.1195068359375, "loss_aux_layer_18": 0.1275634765625, "loss_aux_layer_19": 0.13037109375, "loss_aux_layer_2": 0.0516357421875, "loss_aux_layer_20": 0.138427734375, "loss_aux_layer_21": 0.146240234375, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.06158447265625, "loss_aux_layer_4": 0.06463623046875, "loss_aux_layer_5": 0.0665283203125, "loss_aux_layer_6": 0.069580078125, "loss_aux_layer_7": 0.0672607421875, "loss_aux_layer_8": 0.06689453125, "loss_aux_layer_9": 0.0660400390625, "step": 2414, "total_loss": 0.6190027445554733 }, { "epoch": 0.4781231439318947, "grad_norm": 1.0484365224838257, "learning_rate": 5e-05, "llm_loss": 0.560425654053688, "loss": 2.6293, "loss_aux_layer_0": 0.02099609375, "loss_aux_layer_1": 0.04119873046875, "loss_aux_layer_10": 0.0714111328125, "loss_aux_layer_11": 0.0760498046875, "loss_aux_layer_12": 0.0809326171875, "loss_aux_layer_13": 0.087158203125, "loss_aux_layer_14": 0.0970458984375, "loss_aux_layer_15": 0.1068115234375, "loss_aux_layer_16": 0.116943359375, "loss_aux_layer_17": 0.124755859375, "loss_aux_layer_18": 0.1337890625, "loss_aux_layer_19": 0.136474609375, "loss_aux_layer_2": 0.055419921875, "loss_aux_layer_20": 0.14404296875, "loss_aux_layer_21": 0.151611328125, "loss_aux_layer_22": 0.173583984375, "loss_aux_layer_23": 0.211669921875, "loss_aux_layer_3": 0.06610107421875, "loss_aux_layer_4": 0.0692138671875, "loss_aux_layer_5": 0.071533203125, "loss_aux_layer_6": 0.0751953125, "loss_aux_layer_7": 0.0726318359375, "loss_aux_layer_8": 0.0714111328125, "loss_aux_layer_9": 0.070068359375, "step": 2415, "total_loss": 0.6573295891284943 }, { "epoch": 0.4783211245297961, "grad_norm": 1.5620129108428955, "learning_rate": 5e-05, "llm_loss": 0.6133856326341629, "loss": 2.8214, "loss_aux_layer_0": 0.019134521484375, "loss_aux_layer_1": 0.04052734375, "loss_aux_layer_10": 0.06817626953125, "loss_aux_layer_11": 0.072265625, "loss_aux_layer_12": 0.0772705078125, "loss_aux_layer_13": 0.0830078125, "loss_aux_layer_14": 0.091796875, "loss_aux_layer_15": 0.1005859375, "loss_aux_layer_16": 0.1103515625, "loss_aux_layer_17": 0.1181640625, "loss_aux_layer_18": 0.126220703125, "loss_aux_layer_19": 0.12890625, "loss_aux_layer_2": 0.05322265625, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.142822265625, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.19921875, "loss_aux_layer_3": 0.0635986328125, "loss_aux_layer_4": 0.06671142578125, "loss_aux_layer_5": 0.068603515625, "loss_aux_layer_6": 0.0718994140625, "loss_aux_layer_7": 0.0693359375, "loss_aux_layer_8": 0.0684814453125, "loss_aux_layer_9": 0.067138671875, "step": 2416, "total_loss": 0.7053591012954712 }, { "epoch": 0.47851910512769746, "grad_norm": 1.2325800657272339, "learning_rate": 5e-05, "llm_loss": 0.6220613718032837, "loss": 2.8682, "loss_aux_layer_0": 0.01971435546875, "loss_aux_layer_1": 0.040283203125, "loss_aux_layer_10": 0.0692138671875, "loss_aux_layer_11": 0.07373046875, "loss_aux_layer_12": 0.0792236328125, "loss_aux_layer_13": 0.085693359375, "loss_aux_layer_14": 0.0950927734375, "loss_aux_layer_15": 0.1043701171875, "loss_aux_layer_16": 0.11474609375, "loss_aux_layer_17": 0.1231689453125, "loss_aux_layer_18": 0.13232421875, "loss_aux_layer_19": 0.13525390625, "loss_aux_layer_2": 0.05364990234375, "loss_aux_layer_20": 0.142578125, "loss_aux_layer_21": 0.15087890625, "loss_aux_layer_22": 0.171875, "loss_aux_layer_23": 0.210205078125, "loss_aux_layer_3": 0.064208984375, "loss_aux_layer_4": 0.06689453125, "loss_aux_layer_5": 0.0687255859375, "loss_aux_layer_6": 0.071533203125, "loss_aux_layer_7": 0.0694580078125, "loss_aux_layer_8": 0.0687255859375, "loss_aux_layer_9": 0.06756591796875, "step": 2417, "total_loss": 0.7170413434505463 }, { "epoch": 0.4787170857255989, "grad_norm": 1.3374344110488892, "learning_rate": 5e-05, "llm_loss": 0.6570034027099609, "loss": 3.0133, "loss_aux_layer_0": 0.021514892578125, "loss_aux_layer_1": 0.04156494140625, "loss_aux_layer_10": 0.07135009765625, "loss_aux_layer_11": 0.076171875, "loss_aux_layer_12": 0.0810546875, "loss_aux_layer_13": 0.0872802734375, "loss_aux_layer_14": 0.0968017578125, "loss_aux_layer_15": 0.106201171875, "loss_aux_layer_16": 0.1163330078125, "loss_aux_layer_17": 0.1239013671875, "loss_aux_layer_18": 0.1319580078125, "loss_aux_layer_19": 0.134521484375, "loss_aux_layer_2": 0.05523681640625, "loss_aux_layer_20": 0.142578125, "loss_aux_layer_21": 0.149658203125, "loss_aux_layer_22": 0.17138671875, "loss_aux_layer_23": 0.20947265625, "loss_aux_layer_3": 0.0667724609375, "loss_aux_layer_4": 0.0693359375, "loss_aux_layer_5": 0.0712890625, "loss_aux_layer_6": 0.0743408203125, "loss_aux_layer_7": 0.07177734375, "loss_aux_layer_8": 0.0711669921875, "loss_aux_layer_9": 0.07000732421875, "step": 2418, "total_loss": 0.753324344754219 }, { "epoch": 0.4789150663235003, "grad_norm": 0.9992497563362122, "learning_rate": 5e-05, "llm_loss": 0.5707130432128906, "loss": 2.648, "loss_aux_layer_0": 0.020477294921875, "loss_aux_layer_1": 0.03924560546875, "loss_aux_layer_10": 0.06671142578125, "loss_aux_layer_11": 0.07080078125, "loss_aux_layer_12": 0.075439453125, "loss_aux_layer_13": 0.0811767578125, "loss_aux_layer_14": 0.090087890625, "loss_aux_layer_15": 0.099365234375, "loss_aux_layer_16": 0.1092529296875, "loss_aux_layer_17": 0.116943359375, "loss_aux_layer_18": 0.1256103515625, "loss_aux_layer_19": 0.12939453125, "loss_aux_layer_2": 0.0516357421875, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.145263671875, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.203857421875, "loss_aux_layer_3": 0.06201171875, "loss_aux_layer_4": 0.06488037109375, "loss_aux_layer_5": 0.06646728515625, "loss_aux_layer_6": 0.06982421875, "loss_aux_layer_7": 0.067138671875, "loss_aux_layer_8": 0.0665283203125, "loss_aux_layer_9": 0.0654296875, "step": 2419, "total_loss": 0.6620006263256073 }, { "epoch": 0.4791130469214017, "grad_norm": 0.9413600564002991, "learning_rate": 5e-05, "llm_loss": 0.46073681116104126, "loss": 2.2209, "loss_aux_layer_0": 0.0211181640625, "loss_aux_layer_1": 0.04107666015625, "loss_aux_layer_10": 0.070068359375, "loss_aux_layer_11": 0.074462890625, "loss_aux_layer_12": 0.079833984375, "loss_aux_layer_13": 0.085693359375, "loss_aux_layer_14": 0.0950927734375, "loss_aux_layer_15": 0.1038818359375, "loss_aux_layer_16": 0.11328125, "loss_aux_layer_17": 0.1209716796875, "loss_aux_layer_18": 0.129150390625, "loss_aux_layer_19": 0.1318359375, "loss_aux_layer_2": 0.05462646484375, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.148193359375, "loss_aux_layer_22": 0.169921875, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.06475830078125, "loss_aux_layer_4": 0.0675048828125, "loss_aux_layer_5": 0.069091796875, "loss_aux_layer_6": 0.072509765625, "loss_aux_layer_7": 0.0699462890625, "loss_aux_layer_8": 0.0693359375, "loss_aux_layer_9": 0.068603515625, "step": 2420, "total_loss": 0.5552214533090591 }, { "epoch": 0.4793110275193031, "grad_norm": 1.3183573484420776, "learning_rate": 5e-05, "llm_loss": 0.6553789526224136, "loss": 2.9982, "loss_aux_layer_0": 0.02001953125, "loss_aux_layer_1": 0.039794921875, "loss_aux_layer_10": 0.0679931640625, "loss_aux_layer_11": 0.0726318359375, "loss_aux_layer_12": 0.0780029296875, "loss_aux_layer_13": 0.084716796875, "loss_aux_layer_14": 0.094970703125, "loss_aux_layer_15": 0.1041259765625, "loss_aux_layer_16": 0.1143798828125, "loss_aux_layer_17": 0.1234130859375, "loss_aux_layer_18": 0.1328125, "loss_aux_layer_19": 0.135498046875, "loss_aux_layer_2": 0.05206298828125, "loss_aux_layer_20": 0.1435546875, "loss_aux_layer_21": 0.150634765625, "loss_aux_layer_22": 0.1708984375, "loss_aux_layer_23": 0.208251953125, "loss_aux_layer_3": 0.06243896484375, "loss_aux_layer_4": 0.065185546875, "loss_aux_layer_5": 0.06689453125, "loss_aux_layer_6": 0.070068359375, "loss_aux_layer_7": 0.06787109375, "loss_aux_layer_8": 0.0673828125, "loss_aux_layer_9": 0.0665283203125, "step": 2421, "total_loss": 0.7495521008968353 }, { "epoch": 0.47950900811720454, "grad_norm": 1.0355840921401978, "learning_rate": 5e-05, "llm_loss": 0.561887189745903, "loss": 2.6221, "loss_aux_layer_0": 0.0198974609375, "loss_aux_layer_1": 0.04010009765625, "loss_aux_layer_10": 0.068603515625, "loss_aux_layer_11": 0.073486328125, "loss_aux_layer_12": 0.078857421875, "loss_aux_layer_13": 0.0849609375, "loss_aux_layer_14": 0.0943603515625, "loss_aux_layer_15": 0.1033935546875, "loss_aux_layer_16": 0.11279296875, "loss_aux_layer_17": 0.120849609375, "loss_aux_layer_18": 0.1292724609375, "loss_aux_layer_19": 0.1318359375, "loss_aux_layer_2": 0.053466796875, "loss_aux_layer_20": 0.13916015625, "loss_aux_layer_21": 0.1474609375, "loss_aux_layer_22": 0.167724609375, "loss_aux_layer_23": 0.20556640625, "loss_aux_layer_3": 0.0638427734375, "loss_aux_layer_4": 0.066650390625, "loss_aux_layer_5": 0.0682373046875, "loss_aux_layer_6": 0.0712890625, "loss_aux_layer_7": 0.0692138671875, "loss_aux_layer_8": 0.0687255859375, "loss_aux_layer_9": 0.0677490234375, "step": 2422, "total_loss": 0.6555219441652298 }, { "epoch": 0.4797069887151059, "grad_norm": 0.9899460077285767, "learning_rate": 5e-05, "llm_loss": 0.6257736086845398, "loss": 2.8833, "loss_aux_layer_0": 0.019439697265625, "loss_aux_layer_1": 0.041748046875, "loss_aux_layer_10": 0.069580078125, "loss_aux_layer_11": 0.07421875, "loss_aux_layer_12": 0.0797119140625, "loss_aux_layer_13": 0.0863037109375, "loss_aux_layer_14": 0.0958251953125, "loss_aux_layer_15": 0.1048583984375, "loss_aux_layer_16": 0.115234375, "loss_aux_layer_17": 0.1229248046875, "loss_aux_layer_18": 0.131103515625, "loss_aux_layer_19": 0.134033203125, "loss_aux_layer_2": 0.0543212890625, "loss_aux_layer_20": 0.141357421875, "loss_aux_layer_21": 0.148681640625, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.065185546875, "loss_aux_layer_4": 0.06787109375, "loss_aux_layer_5": 0.0697021484375, "loss_aux_layer_6": 0.0726318359375, "loss_aux_layer_7": 0.0704345703125, "loss_aux_layer_8": 0.0697021484375, "loss_aux_layer_9": 0.068359375, "step": 2423, "total_loss": 0.7208137661218643 }, { "epoch": 0.4799049693130073, "grad_norm": 1.0293655395507812, "learning_rate": 5e-05, "llm_loss": 0.5362976044416428, "loss": 2.5138, "loss_aux_layer_0": 0.0194091796875, "loss_aux_layer_1": 0.04052734375, "loss_aux_layer_10": 0.0677490234375, "loss_aux_layer_11": 0.0721435546875, "loss_aux_layer_12": 0.0772705078125, "loss_aux_layer_13": 0.0830078125, "loss_aux_layer_14": 0.091552734375, "loss_aux_layer_15": 0.1004638671875, "loss_aux_layer_16": 0.1099853515625, "loss_aux_layer_17": 0.117431640625, "loss_aux_layer_18": 0.1259765625, "loss_aux_layer_19": 0.1295166015625, "loss_aux_layer_2": 0.05328369140625, "loss_aux_layer_20": 0.137939453125, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.203857421875, "loss_aux_layer_3": 0.06329345703125, "loss_aux_layer_4": 0.066162109375, "loss_aux_layer_5": 0.0677490234375, "loss_aux_layer_6": 0.07080078125, "loss_aux_layer_7": 0.068603515625, "loss_aux_layer_8": 0.0677490234375, "loss_aux_layer_9": 0.0665283203125, "step": 2424, "total_loss": 0.6284535378217697 }, { "epoch": 0.48010294991090874, "grad_norm": 1.1032166481018066, "learning_rate": 5e-05, "llm_loss": 0.5517181605100632, "loss": 2.5752, "loss_aux_layer_0": 0.019989013671875, "loss_aux_layer_1": 0.037841796875, "loss_aux_layer_10": 0.065673828125, "loss_aux_layer_11": 0.0701904296875, "loss_aux_layer_12": 0.0755615234375, "loss_aux_layer_13": 0.0823974609375, "loss_aux_layer_14": 0.092041015625, "loss_aux_layer_15": 0.101806640625, "loss_aux_layer_16": 0.1119384765625, "loss_aux_layer_17": 0.1199951171875, "loss_aux_layer_18": 0.12890625, "loss_aux_layer_19": 0.13232421875, "loss_aux_layer_2": 0.05010986328125, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.2109375, "loss_aux_layer_3": 0.06024169921875, "loss_aux_layer_4": 0.0628662109375, "loss_aux_layer_5": 0.06494140625, "loss_aux_layer_6": 0.068115234375, "loss_aux_layer_7": 0.0655517578125, "loss_aux_layer_8": 0.065185546875, "loss_aux_layer_9": 0.064453125, "step": 2425, "total_loss": 0.6437903195619583 }, { "epoch": 0.4803009305088101, "grad_norm": 1.1786391735076904, "learning_rate": 5e-05, "llm_loss": 0.5693593919277191, "loss": 2.6387, "loss_aux_layer_0": 0.020233154296875, "loss_aux_layer_1": 0.03875732421875, "loss_aux_layer_10": 0.06463623046875, "loss_aux_layer_11": 0.069091796875, "loss_aux_layer_12": 0.0743408203125, "loss_aux_layer_13": 0.0802001953125, "loss_aux_layer_14": 0.0899658203125, "loss_aux_layer_15": 0.0992431640625, "loss_aux_layer_16": 0.1094970703125, "loss_aux_layer_17": 0.117431640625, "loss_aux_layer_18": 0.1259765625, "loss_aux_layer_19": 0.129638671875, "loss_aux_layer_2": 0.0506591796875, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.14453125, "loss_aux_layer_22": 0.164306640625, "loss_aux_layer_23": 0.20263671875, "loss_aux_layer_3": 0.060546875, "loss_aux_layer_4": 0.06304931640625, "loss_aux_layer_5": 0.0648193359375, "loss_aux_layer_6": 0.06787109375, "loss_aux_layer_7": 0.06524658203125, "loss_aux_layer_8": 0.064453125, "loss_aux_layer_9": 0.06341552734375, "step": 2426, "total_loss": 0.6596809327602386 }, { "epoch": 0.48049891110671156, "grad_norm": 0.7731761336326599, "learning_rate": 5e-05, "llm_loss": 0.6046526879072189, "loss": 2.7934, "loss_aux_layer_0": 0.020263671875, "loss_aux_layer_1": 0.04022216796875, "loss_aux_layer_10": 0.06988525390625, "loss_aux_layer_11": 0.0743408203125, "loss_aux_layer_12": 0.07958984375, "loss_aux_layer_13": 0.0853271484375, "loss_aux_layer_14": 0.0943603515625, "loss_aux_layer_15": 0.1031494140625, "loss_aux_layer_16": 0.11279296875, "loss_aux_layer_17": 0.1209716796875, "loss_aux_layer_18": 0.1285400390625, "loss_aux_layer_19": 0.13134765625, "loss_aux_layer_2": 0.05340576171875, "loss_aux_layer_20": 0.138427734375, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.203125, "loss_aux_layer_3": 0.0640869140625, "loss_aux_layer_4": 0.0672607421875, "loss_aux_layer_5": 0.0689697265625, "loss_aux_layer_6": 0.072265625, "loss_aux_layer_7": 0.0701904296875, "loss_aux_layer_8": 0.0697021484375, "loss_aux_layer_9": 0.068359375, "step": 2427, "total_loss": 0.6983414888381958 }, { "epoch": 0.48069689170461294, "grad_norm": 1.4591162204742432, "learning_rate": 5e-05, "llm_loss": 0.6047935113310814, "loss": 2.7892, "loss_aux_layer_0": 0.019287109375, "loss_aux_layer_1": 0.038818359375, "loss_aux_layer_10": 0.0675048828125, "loss_aux_layer_11": 0.07177734375, "loss_aux_layer_12": 0.0767822265625, "loss_aux_layer_13": 0.0830078125, "loss_aux_layer_14": 0.09228515625, "loss_aux_layer_15": 0.101318359375, "loss_aux_layer_16": 0.111328125, "loss_aux_layer_17": 0.1195068359375, "loss_aux_layer_18": 0.12841796875, "loss_aux_layer_19": 0.13232421875, "loss_aux_layer_2": 0.05169677734375, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.147216796875, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.20654296875, "loss_aux_layer_3": 0.06219482421875, "loss_aux_layer_4": 0.06494140625, "loss_aux_layer_5": 0.06689453125, "loss_aux_layer_6": 0.0699462890625, "loss_aux_layer_7": 0.067626953125, "loss_aux_layer_8": 0.0667724609375, "loss_aux_layer_9": 0.0657958984375, "step": 2428, "total_loss": 0.6973058134317398 }, { "epoch": 0.4808948723025144, "grad_norm": 1.2638291120529175, "learning_rate": 5e-05, "llm_loss": 0.5329262241721153, "loss": 2.5143, "loss_aux_layer_0": 0.020233154296875, "loss_aux_layer_1": 0.04150390625, "loss_aux_layer_10": 0.07025146484375, "loss_aux_layer_11": 0.0748291015625, "loss_aux_layer_12": 0.0802001953125, "loss_aux_layer_13": 0.0859375, "loss_aux_layer_14": 0.0955810546875, "loss_aux_layer_15": 0.105224609375, "loss_aux_layer_16": 0.115234375, "loss_aux_layer_17": 0.1229248046875, "loss_aux_layer_18": 0.131103515625, "loss_aux_layer_19": 0.1346435546875, "loss_aux_layer_2": 0.05462646484375, "loss_aux_layer_20": 0.142333984375, "loss_aux_layer_21": 0.150390625, "loss_aux_layer_22": 0.171875, "loss_aux_layer_23": 0.211181640625, "loss_aux_layer_3": 0.06524658203125, "loss_aux_layer_4": 0.0682373046875, "loss_aux_layer_5": 0.07000732421875, "loss_aux_layer_6": 0.073486328125, "loss_aux_layer_7": 0.0709228515625, "loss_aux_layer_8": 0.0699462890625, "loss_aux_layer_9": 0.06890869140625, "step": 2429, "total_loss": 0.6285723000764847 }, { "epoch": 0.48109285290041576, "grad_norm": 1.1774240732192993, "learning_rate": 5e-05, "llm_loss": 0.578886479139328, "loss": 2.6955, "loss_aux_layer_0": 0.02081298828125, "loss_aux_layer_1": 0.0408935546875, "loss_aux_layer_10": 0.07037353515625, "loss_aux_layer_11": 0.0748291015625, "loss_aux_layer_12": 0.0797119140625, "loss_aux_layer_13": 0.0863037109375, "loss_aux_layer_14": 0.0953369140625, "loss_aux_layer_15": 0.104736328125, "loss_aux_layer_16": 0.1146240234375, "loss_aux_layer_17": 0.1219482421875, "loss_aux_layer_18": 0.1298828125, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.05535888671875, "loss_aux_layer_20": 0.140380859375, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.2060546875, "loss_aux_layer_3": 0.06573486328125, "loss_aux_layer_4": 0.06866455078125, "loss_aux_layer_5": 0.07049560546875, "loss_aux_layer_6": 0.0738525390625, "loss_aux_layer_7": 0.071044921875, "loss_aux_layer_8": 0.07012939453125, "loss_aux_layer_9": 0.0689697265625, "step": 2430, "total_loss": 0.6738845705986023 }, { "epoch": 0.48129083349831714, "grad_norm": 1.1825051307678223, "learning_rate": 5e-05, "llm_loss": 0.683749794960022, "loss": 3.105, "loss_aux_layer_0": 0.0218505859375, "loss_aux_layer_1": 0.039794921875, "loss_aux_layer_10": 0.067626953125, "loss_aux_layer_11": 0.072021484375, "loss_aux_layer_12": 0.07666015625, "loss_aux_layer_13": 0.0826416015625, "loss_aux_layer_14": 0.0921630859375, "loss_aux_layer_15": 0.1009521484375, "loss_aux_layer_16": 0.1103515625, "loss_aux_layer_17": 0.1180419921875, "loss_aux_layer_18": 0.1259765625, "loss_aux_layer_19": 0.128173828125, "loss_aux_layer_2": 0.05364990234375, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.16845703125, "loss_aux_layer_23": 0.20703125, "loss_aux_layer_3": 0.0634765625, "loss_aux_layer_4": 0.06610107421875, "loss_aux_layer_5": 0.06787109375, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.0687255859375, "loss_aux_layer_8": 0.06787109375, "loss_aux_layer_9": 0.0665283203125, "step": 2431, "total_loss": 0.7762614488601685 }, { "epoch": 0.4814888140962186, "grad_norm": 1.0997878313064575, "learning_rate": 5e-05, "llm_loss": 0.5870188325643539, "loss": 2.7357, "loss_aux_layer_0": 0.0196533203125, "loss_aux_layer_1": 0.04144287109375, "loss_aux_layer_10": 0.071044921875, "loss_aux_layer_11": 0.0762939453125, "loss_aux_layer_12": 0.0816650390625, "loss_aux_layer_13": 0.0880126953125, "loss_aux_layer_14": 0.09814453125, "loss_aux_layer_15": 0.1075439453125, "loss_aux_layer_16": 0.1182861328125, "loss_aux_layer_17": 0.12548828125, "loss_aux_layer_18": 0.1336669921875, "loss_aux_layer_19": 0.136474609375, "loss_aux_layer_2": 0.05535888671875, "loss_aux_layer_20": 0.1435546875, "loss_aux_layer_21": 0.151123046875, "loss_aux_layer_22": 0.1728515625, "loss_aux_layer_23": 0.2119140625, "loss_aux_layer_3": 0.0655517578125, "loss_aux_layer_4": 0.0689697265625, "loss_aux_layer_5": 0.071044921875, "loss_aux_layer_6": 0.074462890625, "loss_aux_layer_7": 0.07177734375, "loss_aux_layer_8": 0.071044921875, "loss_aux_layer_9": 0.06982421875, "step": 2432, "total_loss": 0.6839186400175095 }, { "epoch": 0.48168679469411996, "grad_norm": 0.9068273901939392, "learning_rate": 5e-05, "llm_loss": 0.6108271330595016, "loss": 2.8143, "loss_aux_layer_0": 0.021759033203125, "loss_aux_layer_1": 0.0390625, "loss_aux_layer_10": 0.0673828125, "loss_aux_layer_11": 0.0716552734375, "loss_aux_layer_12": 0.07666015625, "loss_aux_layer_13": 0.0826416015625, "loss_aux_layer_14": 0.0927734375, "loss_aux_layer_15": 0.102294921875, "loss_aux_layer_16": 0.1123046875, "loss_aux_layer_17": 0.12109375, "loss_aux_layer_18": 0.1295166015625, "loss_aux_layer_19": 0.1318359375, "loss_aux_layer_2": 0.0526123046875, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.146484375, "loss_aux_layer_22": 0.16650390625, "loss_aux_layer_23": 0.203857421875, "loss_aux_layer_3": 0.06231689453125, "loss_aux_layer_4": 0.0655517578125, "loss_aux_layer_5": 0.067626953125, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.068603515625, "loss_aux_layer_8": 0.067626953125, "loss_aux_layer_9": 0.0662841796875, "step": 2433, "total_loss": 0.7035835981369019 }, { "epoch": 0.4818847752920214, "grad_norm": 1.0108296871185303, "learning_rate": 5e-05, "llm_loss": 0.6287943869829178, "loss": 2.8975, "loss_aux_layer_0": 0.01953125, "loss_aux_layer_1": 0.0399169921875, "loss_aux_layer_10": 0.0697021484375, "loss_aux_layer_11": 0.07421875, "loss_aux_layer_12": 0.07958984375, "loss_aux_layer_13": 0.085693359375, "loss_aux_layer_14": 0.09521484375, "loss_aux_layer_15": 0.1051025390625, "loss_aux_layer_16": 0.11572265625, "loss_aux_layer_17": 0.1236572265625, "loss_aux_layer_18": 0.13232421875, "loss_aux_layer_19": 0.1357421875, "loss_aux_layer_2": 0.0533447265625, "loss_aux_layer_20": 0.14404296875, "loss_aux_layer_21": 0.152099609375, "loss_aux_layer_22": 0.173828125, "loss_aux_layer_23": 0.214111328125, "loss_aux_layer_3": 0.06378173828125, "loss_aux_layer_4": 0.06707763671875, "loss_aux_layer_5": 0.0689697265625, "loss_aux_layer_6": 0.0721435546875, "loss_aux_layer_7": 0.0699462890625, "loss_aux_layer_8": 0.0693359375, "loss_aux_layer_9": 0.06817626953125, "step": 2434, "total_loss": 0.7243690639734268 }, { "epoch": 0.4820827558899228, "grad_norm": 0.8334550261497498, "learning_rate": 5e-05, "llm_loss": 0.5217665731906891, "loss": 2.461, "loss_aux_layer_0": 0.020355224609375, "loss_aux_layer_1": 0.0404052734375, "loss_aux_layer_10": 0.0677490234375, "loss_aux_layer_11": 0.0721435546875, "loss_aux_layer_12": 0.0775146484375, "loss_aux_layer_13": 0.0841064453125, "loss_aux_layer_14": 0.093505859375, "loss_aux_layer_15": 0.1033935546875, "loss_aux_layer_16": 0.113037109375, "loss_aux_layer_17": 0.1207275390625, "loss_aux_layer_18": 0.12841796875, "loss_aux_layer_19": 0.1318359375, "loss_aux_layer_2": 0.0531005859375, "loss_aux_layer_20": 0.139404296875, "loss_aux_layer_21": 0.148193359375, "loss_aux_layer_22": 0.170166015625, "loss_aux_layer_23": 0.210205078125, "loss_aux_layer_3": 0.06304931640625, "loss_aux_layer_4": 0.065673828125, "loss_aux_layer_5": 0.067626953125, "loss_aux_layer_6": 0.0706787109375, "loss_aux_layer_7": 0.0682373046875, "loss_aux_layer_8": 0.067626953125, "loss_aux_layer_9": 0.0662841796875, "step": 2435, "total_loss": 0.6152524203062057 }, { "epoch": 0.4822807364878242, "grad_norm": 1.0398668050765991, "learning_rate": 5e-05, "llm_loss": 0.5906632244586945, "loss": 2.7492, "loss_aux_layer_0": 0.020233154296875, "loss_aux_layer_1": 0.042236328125, "loss_aux_layer_10": 0.071044921875, "loss_aux_layer_11": 0.075927734375, "loss_aux_layer_12": 0.0810546875, "loss_aux_layer_13": 0.0877685546875, "loss_aux_layer_14": 0.0972900390625, "loss_aux_layer_15": 0.1065673828125, "loss_aux_layer_16": 0.11669921875, "loss_aux_layer_17": 0.124267578125, "loss_aux_layer_18": 0.1328125, "loss_aux_layer_19": 0.134765625, "loss_aux_layer_2": 0.05645751953125, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.14990234375, "loss_aux_layer_22": 0.17236328125, "loss_aux_layer_23": 0.21142578125, "loss_aux_layer_3": 0.067138671875, "loss_aux_layer_4": 0.06982421875, "loss_aux_layer_5": 0.0716552734375, "loss_aux_layer_6": 0.0748291015625, "loss_aux_layer_7": 0.072021484375, "loss_aux_layer_8": 0.0711669921875, "loss_aux_layer_9": 0.06982421875, "step": 2436, "total_loss": 0.6873052418231964 }, { "epoch": 0.4824787170857256, "grad_norm": 1.0820785760879517, "learning_rate": 5e-05, "llm_loss": 0.6285629421472549, "loss": 2.9123, "loss_aux_layer_0": 0.020904541015625, "loss_aux_layer_1": 0.043212890625, "loss_aux_layer_10": 0.0736083984375, "loss_aux_layer_11": 0.0782470703125, "loss_aux_layer_12": 0.0838623046875, "loss_aux_layer_13": 0.0902099609375, "loss_aux_layer_14": 0.1005859375, "loss_aux_layer_15": 0.10986328125, "loss_aux_layer_16": 0.119873046875, "loss_aux_layer_17": 0.1279296875, "loss_aux_layer_18": 0.136474609375, "loss_aux_layer_19": 0.138916015625, "loss_aux_layer_2": 0.058837890625, "loss_aux_layer_20": 0.146728515625, "loss_aux_layer_21": 0.153564453125, "loss_aux_layer_22": 0.175048828125, "loss_aux_layer_23": 0.212890625, "loss_aux_layer_3": 0.0697021484375, "loss_aux_layer_4": 0.0726318359375, "loss_aux_layer_5": 0.074951171875, "loss_aux_layer_6": 0.077880859375, "loss_aux_layer_7": 0.0750732421875, "loss_aux_layer_8": 0.07421875, "loss_aux_layer_9": 0.072509765625, "step": 2437, "total_loss": 0.7280653715133667 }, { "epoch": 0.48267669768362703, "grad_norm": 0.8626799583435059, "learning_rate": 5e-05, "llm_loss": 0.5436205044388771, "loss": 2.552, "loss_aux_layer_0": 0.02001953125, "loss_aux_layer_1": 0.04095458984375, "loss_aux_layer_10": 0.06951904296875, "loss_aux_layer_11": 0.07421875, "loss_aux_layer_12": 0.0794677734375, "loss_aux_layer_13": 0.0855712890625, "loss_aux_layer_14": 0.0947265625, "loss_aux_layer_15": 0.1036376953125, "loss_aux_layer_16": 0.1136474609375, "loss_aux_layer_17": 0.121826171875, "loss_aux_layer_18": 0.1300048828125, "loss_aux_layer_19": 0.133544921875, "loss_aux_layer_2": 0.052978515625, "loss_aux_layer_20": 0.141357421875, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.169189453125, "loss_aux_layer_23": 0.20654296875, "loss_aux_layer_3": 0.06402587890625, "loss_aux_layer_4": 0.0672607421875, "loss_aux_layer_5": 0.06915283203125, "loss_aux_layer_6": 0.072265625, "loss_aux_layer_7": 0.06982421875, "loss_aux_layer_8": 0.06915283203125, "loss_aux_layer_9": 0.0679931640625, "step": 2438, "total_loss": 0.6379932761192322 }, { "epoch": 0.4828746782815284, "grad_norm": 1.3624908924102783, "learning_rate": 5e-05, "llm_loss": 0.567894384264946, "loss": 2.6491, "loss_aux_layer_0": 0.019683837890625, "loss_aux_layer_1": 0.04058837890625, "loss_aux_layer_10": 0.0699462890625, "loss_aux_layer_11": 0.074462890625, "loss_aux_layer_12": 0.07958984375, "loss_aux_layer_13": 0.0859375, "loss_aux_layer_14": 0.0950927734375, "loss_aux_layer_15": 0.10400390625, "loss_aux_layer_16": 0.11376953125, "loss_aux_layer_17": 0.1217041015625, "loss_aux_layer_18": 0.12939453125, "loss_aux_layer_19": 0.13232421875, "loss_aux_layer_2": 0.05364990234375, "loss_aux_layer_20": 0.139404296875, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.168701171875, "loss_aux_layer_23": 0.2060546875, "loss_aux_layer_3": 0.06439208984375, "loss_aux_layer_4": 0.06787109375, "loss_aux_layer_5": 0.06982421875, "loss_aux_layer_6": 0.0733642578125, "loss_aux_layer_7": 0.07080078125, "loss_aux_layer_8": 0.0697021484375, "loss_aux_layer_9": 0.0684814453125, "step": 2439, "total_loss": 0.6622826755046844 }, { "epoch": 0.4830726588794298, "grad_norm": 1.2652308940887451, "learning_rate": 5e-05, "llm_loss": 0.5761317908763885, "loss": 2.6796, "loss_aux_layer_0": 0.01995849609375, "loss_aux_layer_1": 0.0416259765625, "loss_aux_layer_10": 0.068115234375, "loss_aux_layer_11": 0.0728759765625, "loss_aux_layer_12": 0.0780029296875, "loss_aux_layer_13": 0.084228515625, "loss_aux_layer_14": 0.0933837890625, "loss_aux_layer_15": 0.1029052734375, "loss_aux_layer_16": 0.11328125, "loss_aux_layer_17": 0.120849609375, "loss_aux_layer_18": 0.12890625, "loss_aux_layer_19": 0.1318359375, "loss_aux_layer_2": 0.05413818359375, "loss_aux_layer_20": 0.139404296875, "loss_aux_layer_21": 0.148681640625, "loss_aux_layer_22": 0.169921875, "loss_aux_layer_23": 0.2080078125, "loss_aux_layer_3": 0.06427001953125, "loss_aux_layer_4": 0.0667724609375, "loss_aux_layer_5": 0.0682373046875, "loss_aux_layer_6": 0.0711669921875, "loss_aux_layer_7": 0.0689697265625, "loss_aux_layer_8": 0.068115234375, "loss_aux_layer_9": 0.0667724609375, "step": 2440, "total_loss": 0.6698962599039078 }, { "epoch": 0.48327063947733123, "grad_norm": 0.9659432768821716, "learning_rate": 5e-05, "llm_loss": 0.592651441693306, "loss": 2.7462, "loss_aux_layer_0": 0.019805908203125, "loss_aux_layer_1": 0.0421142578125, "loss_aux_layer_10": 0.0693359375, "loss_aux_layer_11": 0.0736083984375, "loss_aux_layer_12": 0.07861328125, "loss_aux_layer_13": 0.0850830078125, "loss_aux_layer_14": 0.0941162109375, "loss_aux_layer_15": 0.1029052734375, "loss_aux_layer_16": 0.1129150390625, "loss_aux_layer_17": 0.1207275390625, "loss_aux_layer_18": 0.1292724609375, "loss_aux_layer_19": 0.1318359375, "loss_aux_layer_2": 0.0545654296875, "loss_aux_layer_20": 0.13916015625, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.064697265625, "loss_aux_layer_4": 0.0677490234375, "loss_aux_layer_5": 0.069580078125, "loss_aux_layer_6": 0.07275390625, "loss_aux_layer_7": 0.0703125, "loss_aux_layer_8": 0.0693359375, "loss_aux_layer_9": 0.0682373046875, "step": 2441, "total_loss": 0.6865386217832565 }, { "epoch": 0.4834686200752326, "grad_norm": 1.6152369976043701, "learning_rate": 5e-05, "llm_loss": 0.5334742218255997, "loss": 2.5287, "loss_aux_layer_0": 0.022064208984375, "loss_aux_layer_1": 0.04345703125, "loss_aux_layer_10": 0.072998046875, "loss_aux_layer_11": 0.0775146484375, "loss_aux_layer_12": 0.0826416015625, "loss_aux_layer_13": 0.0889892578125, "loss_aux_layer_14": 0.0987548828125, "loss_aux_layer_15": 0.108154296875, "loss_aux_layer_16": 0.1187744140625, "loss_aux_layer_17": 0.1260986328125, "loss_aux_layer_18": 0.135498046875, "loss_aux_layer_19": 0.138427734375, "loss_aux_layer_2": 0.05731201171875, "loss_aux_layer_20": 0.146484375, "loss_aux_layer_21": 0.153564453125, "loss_aux_layer_22": 0.17578125, "loss_aux_layer_23": 0.213623046875, "loss_aux_layer_3": 0.068603515625, "loss_aux_layer_4": 0.071533203125, "loss_aux_layer_5": 0.073486328125, "loss_aux_layer_6": 0.0767822265625, "loss_aux_layer_7": 0.073974609375, "loss_aux_layer_8": 0.072998046875, "loss_aux_layer_9": 0.0716552734375, "step": 2442, "total_loss": 0.6321629732847214 }, { "epoch": 0.48366660067313405, "grad_norm": 1.2249728441238403, "learning_rate": 5e-05, "llm_loss": 0.628759890794754, "loss": 2.8911, "loss_aux_layer_0": 0.01953125, "loss_aux_layer_1": 0.04034423828125, "loss_aux_layer_10": 0.0701904296875, "loss_aux_layer_11": 0.0748291015625, "loss_aux_layer_12": 0.079833984375, "loss_aux_layer_13": 0.0858154296875, "loss_aux_layer_14": 0.094970703125, "loss_aux_layer_15": 0.103759765625, "loss_aux_layer_16": 0.1134033203125, "loss_aux_layer_17": 0.1212158203125, "loss_aux_layer_18": 0.12890625, "loss_aux_layer_19": 0.1318359375, "loss_aux_layer_2": 0.05377197265625, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.165771484375, "loss_aux_layer_23": 0.2021484375, "loss_aux_layer_3": 0.064453125, "loss_aux_layer_4": 0.06787109375, "loss_aux_layer_5": 0.0699462890625, "loss_aux_layer_6": 0.07275390625, "loss_aux_layer_7": 0.070556640625, "loss_aux_layer_8": 0.0699462890625, "loss_aux_layer_9": 0.0687255859375, "step": 2443, "total_loss": 0.7227751761674881 }, { "epoch": 0.48386458127103543, "grad_norm": 1.0836021900177002, "learning_rate": 5e-05, "llm_loss": 0.5798124521970749, "loss": 2.704, "loss_aux_layer_0": 0.0203857421875, "loss_aux_layer_1": 0.042724609375, "loss_aux_layer_10": 0.0723876953125, "loss_aux_layer_11": 0.077392578125, "loss_aux_layer_12": 0.0821533203125, "loss_aux_layer_13": 0.087890625, "loss_aux_layer_14": 0.0966796875, "loss_aux_layer_15": 0.1051025390625, "loss_aux_layer_16": 0.1143798828125, "loss_aux_layer_17": 0.12158203125, "loss_aux_layer_18": 0.1304931640625, "loss_aux_layer_19": 0.1328125, "loss_aux_layer_2": 0.0560302734375, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.14697265625, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.20703125, "loss_aux_layer_3": 0.0677490234375, "loss_aux_layer_4": 0.0711669921875, "loss_aux_layer_5": 0.072998046875, "loss_aux_layer_6": 0.076171875, "loss_aux_layer_7": 0.07373046875, "loss_aux_layer_8": 0.07275390625, "loss_aux_layer_9": 0.071044921875, "step": 2444, "total_loss": 0.6759883612394333 }, { "epoch": 0.48406256186893687, "grad_norm": 1.198257565498352, "learning_rate": 5e-05, "llm_loss": 0.678582638502121, "loss": 3.0872, "loss_aux_layer_0": 0.019775390625, "loss_aux_layer_1": 0.039794921875, "loss_aux_layer_10": 0.068359375, "loss_aux_layer_11": 0.07275390625, "loss_aux_layer_12": 0.0782470703125, "loss_aux_layer_13": 0.084228515625, "loss_aux_layer_14": 0.093505859375, "loss_aux_layer_15": 0.1025390625, "loss_aux_layer_16": 0.1124267578125, "loss_aux_layer_17": 0.120361328125, "loss_aux_layer_18": 0.128662109375, "loss_aux_layer_19": 0.13134765625, "loss_aux_layer_2": 0.05291748046875, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.14697265625, "loss_aux_layer_22": 0.16748046875, "loss_aux_layer_23": 0.203857421875, "loss_aux_layer_3": 0.06341552734375, "loss_aux_layer_4": 0.06646728515625, "loss_aux_layer_5": 0.068359375, "loss_aux_layer_6": 0.0712890625, "loss_aux_layer_7": 0.069091796875, "loss_aux_layer_8": 0.0682373046875, "loss_aux_layer_9": 0.0670166015625, "step": 2445, "total_loss": 0.7718068212270737 }, { "epoch": 0.48426054246683825, "grad_norm": 1.3241066932678223, "learning_rate": 5e-05, "llm_loss": 0.6858140230178833, "loss": 3.134, "loss_aux_layer_0": 0.01959228515625, "loss_aux_layer_1": 0.0439453125, "loss_aux_layer_10": 0.0748291015625, "loss_aux_layer_11": 0.0794677734375, "loss_aux_layer_12": 0.08447265625, "loss_aux_layer_13": 0.09033203125, "loss_aux_layer_14": 0.098876953125, "loss_aux_layer_15": 0.1070556640625, "loss_aux_layer_16": 0.1163330078125, "loss_aux_layer_17": 0.12353515625, "loss_aux_layer_18": 0.13134765625, "loss_aux_layer_19": 0.133056640625, "loss_aux_layer_2": 0.05841064453125, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.14697265625, "loss_aux_layer_22": 0.168701171875, "loss_aux_layer_23": 0.20654296875, "loss_aux_layer_3": 0.070068359375, "loss_aux_layer_4": 0.073486328125, "loss_aux_layer_5": 0.0751953125, "loss_aux_layer_6": 0.078369140625, "loss_aux_layer_7": 0.0758056640625, "loss_aux_layer_8": 0.0748291015625, "loss_aux_layer_9": 0.072998046875, "step": 2446, "total_loss": 0.7834885567426682 }, { "epoch": 0.48445852306473963, "grad_norm": 1.0255937576293945, "learning_rate": 5e-05, "llm_loss": 0.6312219351530075, "loss": 2.8979, "loss_aux_layer_0": 0.0198974609375, "loss_aux_layer_1": 0.03997802734375, "loss_aux_layer_10": 0.068603515625, "loss_aux_layer_11": 0.0731201171875, "loss_aux_layer_12": 0.0780029296875, "loss_aux_layer_13": 0.0838623046875, "loss_aux_layer_14": 0.0926513671875, "loss_aux_layer_15": 0.101318359375, "loss_aux_layer_16": 0.1102294921875, "loss_aux_layer_17": 0.11865234375, "loss_aux_layer_18": 0.126953125, "loss_aux_layer_19": 0.13037109375, "loss_aux_layer_2": 0.05352783203125, "loss_aux_layer_20": 0.138671875, "loss_aux_layer_21": 0.1474609375, "loss_aux_layer_22": 0.169921875, "loss_aux_layer_23": 0.208984375, "loss_aux_layer_3": 0.06402587890625, "loss_aux_layer_4": 0.0667724609375, "loss_aux_layer_5": 0.06878662109375, "loss_aux_layer_6": 0.0716552734375, "loss_aux_layer_7": 0.0693359375, "loss_aux_layer_8": 0.06842041015625, "loss_aux_layer_9": 0.06719970703125, "step": 2447, "total_loss": 0.7244795113801956 }, { "epoch": 0.48465650366264107, "grad_norm": 0.9564471244812012, "learning_rate": 5e-05, "llm_loss": 0.5921158045530319, "loss": 2.7493, "loss_aux_layer_0": 0.0206298828125, "loss_aux_layer_1": 0.04010009765625, "loss_aux_layer_10": 0.0684814453125, "loss_aux_layer_11": 0.072998046875, "loss_aux_layer_12": 0.07861328125, "loss_aux_layer_13": 0.0850830078125, "loss_aux_layer_14": 0.095458984375, "loss_aux_layer_15": 0.10546875, "loss_aux_layer_16": 0.1158447265625, "loss_aux_layer_17": 0.124267578125, "loss_aux_layer_18": 0.132568359375, "loss_aux_layer_19": 0.135986328125, "loss_aux_layer_2": 0.053466796875, "loss_aux_layer_20": 0.1435546875, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.17333984375, "loss_aux_layer_23": 0.213134765625, "loss_aux_layer_3": 0.06396484375, "loss_aux_layer_4": 0.066650390625, "loss_aux_layer_5": 0.0684814453125, "loss_aux_layer_6": 0.071533203125, "loss_aux_layer_7": 0.06890869140625, "loss_aux_layer_8": 0.06829833984375, "loss_aux_layer_9": 0.06719970703125, "step": 2448, "total_loss": 0.6873250752687454 }, { "epoch": 0.48485448426054245, "grad_norm": 0.9133462905883789, "learning_rate": 5e-05, "llm_loss": 0.5766196250915527, "loss": 2.6837, "loss_aux_layer_0": 0.020111083984375, "loss_aux_layer_1": 0.0406494140625, "loss_aux_layer_10": 0.06982421875, "loss_aux_layer_11": 0.0743408203125, "loss_aux_layer_12": 0.0794677734375, "loss_aux_layer_13": 0.0858154296875, "loss_aux_layer_14": 0.0946044921875, "loss_aux_layer_15": 0.1038818359375, "loss_aux_layer_16": 0.1136474609375, "loss_aux_layer_17": 0.121826171875, "loss_aux_layer_18": 0.130126953125, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.05426025390625, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.146484375, "loss_aux_layer_22": 0.167724609375, "loss_aux_layer_23": 0.206298828125, "loss_aux_layer_3": 0.0648193359375, "loss_aux_layer_4": 0.0677490234375, "loss_aux_layer_5": 0.0694580078125, "loss_aux_layer_6": 0.07275390625, "loss_aux_layer_7": 0.0701904296875, "loss_aux_layer_8": 0.0697021484375, "loss_aux_layer_9": 0.068359375, "step": 2449, "total_loss": 0.6709321588277817 }, { "epoch": 0.4850524648584439, "grad_norm": 0.9408311247825623, "learning_rate": 5e-05, "llm_loss": 0.5275881886482239, "loss": 2.5005, "loss_aux_layer_0": 0.0216064453125, "loss_aux_layer_1": 0.04241943359375, "loss_aux_layer_10": 0.072509765625, "loss_aux_layer_11": 0.076904296875, "loss_aux_layer_12": 0.08203125, "loss_aux_layer_13": 0.0882568359375, "loss_aux_layer_14": 0.0977783203125, "loss_aux_layer_15": 0.1068115234375, "loss_aux_layer_16": 0.1168212890625, "loss_aux_layer_17": 0.1251220703125, "loss_aux_layer_18": 0.1328125, "loss_aux_layer_19": 0.13623046875, "loss_aux_layer_2": 0.0565185546875, "loss_aux_layer_20": 0.143310546875, "loss_aux_layer_21": 0.151123046875, "loss_aux_layer_22": 0.173095703125, "loss_aux_layer_23": 0.212158203125, "loss_aux_layer_3": 0.067626953125, "loss_aux_layer_4": 0.0706787109375, "loss_aux_layer_5": 0.0726318359375, "loss_aux_layer_6": 0.076416015625, "loss_aux_layer_7": 0.07373046875, "loss_aux_layer_8": 0.07275390625, "loss_aux_layer_9": 0.0711669921875, "step": 2450, "total_loss": 0.6251222044229507 }, { "epoch": 0.48525044545634527, "grad_norm": 1.0763630867004395, "learning_rate": 5e-05, "llm_loss": 0.6758945286273956, "loss": 3.0728, "loss_aux_layer_0": 0.01953125, "loss_aux_layer_1": 0.038818359375, "loss_aux_layer_10": 0.06658935546875, "loss_aux_layer_11": 0.0709228515625, "loss_aux_layer_12": 0.0762939453125, "loss_aux_layer_13": 0.0823974609375, "loss_aux_layer_14": 0.0921630859375, "loss_aux_layer_15": 0.1014404296875, "loss_aux_layer_16": 0.111572265625, "loss_aux_layer_17": 0.12060546875, "loss_aux_layer_18": 0.129638671875, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.051513671875, "loss_aux_layer_20": 0.14013671875, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.06170654296875, "loss_aux_layer_4": 0.06451416015625, "loss_aux_layer_5": 0.06622314453125, "loss_aux_layer_6": 0.069580078125, "loss_aux_layer_7": 0.0670166015625, "loss_aux_layer_8": 0.066162109375, "loss_aux_layer_9": 0.06524658203125, "step": 2451, "total_loss": 0.7682078033685684 }, { "epoch": 0.4854484260542467, "grad_norm": 1.076987624168396, "learning_rate": 5e-05, "llm_loss": 0.6010674238204956, "loss": 2.7883, "loss_aux_layer_0": 0.020355224609375, "loss_aux_layer_1": 0.0416259765625, "loss_aux_layer_10": 0.0712890625, "loss_aux_layer_11": 0.07568359375, "loss_aux_layer_12": 0.0806884765625, "loss_aux_layer_13": 0.0865478515625, "loss_aux_layer_14": 0.0958251953125, "loss_aux_layer_15": 0.105224609375, "loss_aux_layer_16": 0.11474609375, "loss_aux_layer_17": 0.122802734375, "loss_aux_layer_18": 0.131591796875, "loss_aux_layer_19": 0.1337890625, "loss_aux_layer_2": 0.0560302734375, "loss_aux_layer_20": 0.1416015625, "loss_aux_layer_21": 0.1494140625, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.20947265625, "loss_aux_layer_3": 0.066650390625, "loss_aux_layer_4": 0.06982421875, "loss_aux_layer_5": 0.0716552734375, "loss_aux_layer_6": 0.0751953125, "loss_aux_layer_7": 0.072509765625, "loss_aux_layer_8": 0.0716552734375, "loss_aux_layer_9": 0.070068359375, "step": 2452, "total_loss": 0.6970792263746262 }, { "epoch": 0.4856464066521481, "grad_norm": 1.002610206604004, "learning_rate": 5e-05, "llm_loss": 0.5444555506110191, "loss": 2.5453, "loss_aux_layer_0": 0.0208740234375, "loss_aux_layer_1": 0.03857421875, "loss_aux_layer_10": 0.06634521484375, "loss_aux_layer_11": 0.071044921875, "loss_aux_layer_12": 0.0760498046875, "loss_aux_layer_13": 0.081787109375, "loss_aux_layer_14": 0.0909423828125, "loss_aux_layer_15": 0.1002197265625, "loss_aux_layer_16": 0.1094970703125, "loss_aux_layer_17": 0.1177978515625, "loss_aux_layer_18": 0.12646484375, "loss_aux_layer_19": 0.13037109375, "loss_aux_layer_2": 0.05084228515625, "loss_aux_layer_20": 0.138916015625, "loss_aux_layer_21": 0.147705078125, "loss_aux_layer_22": 0.170166015625, "loss_aux_layer_23": 0.210205078125, "loss_aux_layer_3": 0.06085205078125, "loss_aux_layer_4": 0.06365966796875, "loss_aux_layer_5": 0.0657958984375, "loss_aux_layer_6": 0.06884765625, "loss_aux_layer_7": 0.06671142578125, "loss_aux_layer_8": 0.0660400390625, "loss_aux_layer_9": 0.0650634765625, "step": 2453, "total_loss": 0.6363156884908676 }, { "epoch": 0.48584438725004947, "grad_norm": 0.9372820258140564, "learning_rate": 5e-05, "llm_loss": 0.6235079169273376, "loss": 2.8573, "loss_aux_layer_0": 0.0205078125, "loss_aux_layer_1": 0.03900146484375, "loss_aux_layer_10": 0.067138671875, "loss_aux_layer_11": 0.0716552734375, "loss_aux_layer_12": 0.076416015625, "loss_aux_layer_13": 0.082275390625, "loss_aux_layer_14": 0.09130859375, "loss_aux_layer_15": 0.0999755859375, "loss_aux_layer_16": 0.109619140625, "loss_aux_layer_17": 0.1170654296875, "loss_aux_layer_18": 0.124755859375, "loss_aux_layer_19": 0.12744140625, "loss_aux_layer_2": 0.0513916015625, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.19921875, "loss_aux_layer_3": 0.0618896484375, "loss_aux_layer_4": 0.06488037109375, "loss_aux_layer_5": 0.06689453125, "loss_aux_layer_6": 0.0697021484375, "loss_aux_layer_7": 0.067626953125, "loss_aux_layer_8": 0.06695556640625, "loss_aux_layer_9": 0.06585693359375, "step": 2454, "total_loss": 0.7143178731203079 }, { "epoch": 0.4860423678479509, "grad_norm": 1.069225788116455, "learning_rate": 5e-05, "llm_loss": 0.6951365619897842, "loss": 3.1553, "loss_aux_layer_0": 0.01934814453125, "loss_aux_layer_1": 0.04052734375, "loss_aux_layer_10": 0.0692138671875, "loss_aux_layer_11": 0.0736083984375, "loss_aux_layer_12": 0.078857421875, "loss_aux_layer_13": 0.0853271484375, "loss_aux_layer_14": 0.094482421875, "loss_aux_layer_15": 0.1031494140625, "loss_aux_layer_16": 0.1129150390625, "loss_aux_layer_17": 0.120849609375, "loss_aux_layer_18": 0.1287841796875, "loss_aux_layer_19": 0.131591796875, "loss_aux_layer_2": 0.05328369140625, "loss_aux_layer_20": 0.138916015625, "loss_aux_layer_21": 0.145751953125, "loss_aux_layer_22": 0.1669921875, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.0643310546875, "loss_aux_layer_4": 0.0673828125, "loss_aux_layer_5": 0.0693359375, "loss_aux_layer_6": 0.072509765625, "loss_aux_layer_7": 0.070068359375, "loss_aux_layer_8": 0.0693359375, "loss_aux_layer_9": 0.068115234375, "step": 2455, "total_loss": 0.788830429315567 }, { "epoch": 0.4862403484458523, "grad_norm": 1.0850446224212646, "learning_rate": 5e-05, "llm_loss": 0.5349448844790459, "loss": 2.5195, "loss_aux_layer_0": 0.020263671875, "loss_aux_layer_1": 0.04034423828125, "loss_aux_layer_10": 0.0689697265625, "loss_aux_layer_11": 0.0736083984375, "loss_aux_layer_12": 0.0792236328125, "loss_aux_layer_13": 0.0859375, "loss_aux_layer_14": 0.0955810546875, "loss_aux_layer_15": 0.1048583984375, "loss_aux_layer_16": 0.1148681640625, "loss_aux_layer_17": 0.122314453125, "loss_aux_layer_18": 0.131103515625, "loss_aux_layer_19": 0.134033203125, "loss_aux_layer_2": 0.05352783203125, "loss_aux_layer_20": 0.141357421875, "loss_aux_layer_21": 0.150146484375, "loss_aux_layer_22": 0.17236328125, "loss_aux_layer_23": 0.21142578125, "loss_aux_layer_3": 0.06414794921875, "loss_aux_layer_4": 0.067138671875, "loss_aux_layer_5": 0.069091796875, "loss_aux_layer_6": 0.072021484375, "loss_aux_layer_7": 0.0694580078125, "loss_aux_layer_8": 0.0687255859375, "loss_aux_layer_9": 0.0675048828125, "step": 2456, "total_loss": 0.6298747509717941 }, { "epoch": 0.4864383290437537, "grad_norm": 0.8874106407165527, "learning_rate": 5e-05, "llm_loss": 0.5801229178905487, "loss": 2.6997, "loss_aux_layer_0": 0.019866943359375, "loss_aux_layer_1": 0.041015625, "loss_aux_layer_10": 0.0704345703125, "loss_aux_layer_11": 0.0753173828125, "loss_aux_layer_12": 0.080810546875, "loss_aux_layer_13": 0.0870361328125, "loss_aux_layer_14": 0.095703125, "loss_aux_layer_15": 0.1043701171875, "loss_aux_layer_16": 0.11376953125, "loss_aux_layer_17": 0.121337890625, "loss_aux_layer_18": 0.1298828125, "loss_aux_layer_19": 0.131591796875, "loss_aux_layer_2": 0.054443359375, "loss_aux_layer_20": 0.13916015625, "loss_aux_layer_21": 0.1474609375, "loss_aux_layer_22": 0.169921875, "loss_aux_layer_23": 0.2080078125, "loss_aux_layer_3": 0.06500244140625, "loss_aux_layer_4": 0.0677490234375, "loss_aux_layer_5": 0.069580078125, "loss_aux_layer_6": 0.0726318359375, "loss_aux_layer_7": 0.0709228515625, "loss_aux_layer_8": 0.070068359375, "loss_aux_layer_9": 0.06903076171875, "step": 2457, "total_loss": 0.6749286204576492 }, { "epoch": 0.4866363096416551, "grad_norm": 0.8429979681968689, "learning_rate": 5e-05, "llm_loss": 0.6366555243730545, "loss": 2.9078, "loss_aux_layer_0": 0.01953125, "loss_aux_layer_1": 0.03875732421875, "loss_aux_layer_10": 0.066162109375, "loss_aux_layer_11": 0.0704345703125, "loss_aux_layer_12": 0.075439453125, "loss_aux_layer_13": 0.081298828125, "loss_aux_layer_14": 0.09033203125, "loss_aux_layer_15": 0.09912109375, "loss_aux_layer_16": 0.1085205078125, "loss_aux_layer_17": 0.1165771484375, "loss_aux_layer_18": 0.1246337890625, "loss_aux_layer_19": 0.12744140625, "loss_aux_layer_2": 0.05206298828125, "loss_aux_layer_20": 0.134521484375, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.06195068359375, "loss_aux_layer_4": 0.064697265625, "loss_aux_layer_5": 0.0662841796875, "loss_aux_layer_6": 0.069091796875, "loss_aux_layer_7": 0.06689453125, "loss_aux_layer_8": 0.06591796875, "loss_aux_layer_9": 0.0648193359375, "step": 2458, "total_loss": 0.7269420623779297 }, { "epoch": 0.48683429023955654, "grad_norm": 1.127665400505066, "learning_rate": 5e-05, "llm_loss": 0.6453473269939423, "loss": 2.9454, "loss_aux_layer_0": 0.020721435546875, "loss_aux_layer_1": 0.03875732421875, "loss_aux_layer_10": 0.0657958984375, "loss_aux_layer_11": 0.0703125, "loss_aux_layer_12": 0.0751953125, "loss_aux_layer_13": 0.0811767578125, "loss_aux_layer_14": 0.08984375, "loss_aux_layer_15": 0.0989990234375, "loss_aux_layer_16": 0.109130859375, "loss_aux_layer_17": 0.1175537109375, "loss_aux_layer_18": 0.1256103515625, "loss_aux_layer_19": 0.12939453125, "loss_aux_layer_2": 0.05108642578125, "loss_aux_layer_20": 0.136962890625, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.16650390625, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.06134033203125, "loss_aux_layer_4": 0.064208984375, "loss_aux_layer_5": 0.06597900390625, "loss_aux_layer_6": 0.0689697265625, "loss_aux_layer_7": 0.06689453125, "loss_aux_layer_8": 0.066162109375, "loss_aux_layer_9": 0.0645751953125, "step": 2459, "total_loss": 0.7363480627536774 }, { "epoch": 0.4870322708374579, "grad_norm": 0.9461537003517151, "learning_rate": 5e-05, "llm_loss": 0.5682020261883736, "loss": 2.6432, "loss_aux_layer_0": 0.019287109375, "loss_aux_layer_1": 0.03936767578125, "loss_aux_layer_10": 0.0672607421875, "loss_aux_layer_11": 0.07177734375, "loss_aux_layer_12": 0.0767822265625, "loss_aux_layer_13": 0.082763671875, "loss_aux_layer_14": 0.091796875, "loss_aux_layer_15": 0.10107421875, "loss_aux_layer_16": 0.111083984375, "loss_aux_layer_17": 0.1185302734375, "loss_aux_layer_18": 0.1275634765625, "loss_aux_layer_19": 0.13134765625, "loss_aux_layer_2": 0.05279541015625, "loss_aux_layer_20": 0.139404296875, "loss_aux_layer_21": 0.1474609375, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.206298828125, "loss_aux_layer_3": 0.06298828125, "loss_aux_layer_4": 0.06591796875, "loss_aux_layer_5": 0.06787109375, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.068359375, "loss_aux_layer_8": 0.0675048828125, "loss_aux_layer_9": 0.0662841796875, "step": 2460, "total_loss": 0.6608102321624756 }, { "epoch": 0.4872302514353593, "grad_norm": 0.9898368120193481, "learning_rate": 5e-05, "llm_loss": 0.5769117474555969, "loss": 2.6789, "loss_aux_layer_0": 0.02032470703125, "loss_aux_layer_1": 0.040283203125, "loss_aux_layer_10": 0.0679931640625, "loss_aux_layer_11": 0.0726318359375, "loss_aux_layer_12": 0.0775146484375, "loss_aux_layer_13": 0.08349609375, "loss_aux_layer_14": 0.0927734375, "loss_aux_layer_15": 0.10205078125, "loss_aux_layer_16": 0.1123046875, "loss_aux_layer_17": 0.12060546875, "loss_aux_layer_18": 0.128662109375, "loss_aux_layer_19": 0.131591796875, "loss_aux_layer_2": 0.05352783203125, "loss_aux_layer_20": 0.138671875, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.164306640625, "loss_aux_layer_23": 0.200927734375, "loss_aux_layer_3": 0.063720703125, "loss_aux_layer_4": 0.06683349609375, "loss_aux_layer_5": 0.06884765625, "loss_aux_layer_6": 0.07177734375, "loss_aux_layer_7": 0.06927490234375, "loss_aux_layer_8": 0.068359375, "loss_aux_layer_9": 0.0667724609375, "step": 2461, "total_loss": 0.6697354465723038 }, { "epoch": 0.48742823203326074, "grad_norm": 1.2396668195724487, "learning_rate": 5e-05, "llm_loss": 0.571699932217598, "loss": 2.6715, "loss_aux_layer_0": 0.01885986328125, "loss_aux_layer_1": 0.0419921875, "loss_aux_layer_10": 0.0731201171875, "loss_aux_layer_11": 0.077392578125, "loss_aux_layer_12": 0.0826416015625, "loss_aux_layer_13": 0.0885009765625, "loss_aux_layer_14": 0.0970458984375, "loss_aux_layer_15": 0.10546875, "loss_aux_layer_16": 0.11474609375, "loss_aux_layer_17": 0.1224365234375, "loss_aux_layer_18": 0.13037109375, "loss_aux_layer_19": 0.13232421875, "loss_aux_layer_2": 0.05645751953125, "loss_aux_layer_20": 0.138916015625, "loss_aux_layer_21": 0.146484375, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.20703125, "loss_aux_layer_3": 0.06793212890625, "loss_aux_layer_4": 0.071044921875, "loss_aux_layer_5": 0.07275390625, "loss_aux_layer_6": 0.0760498046875, "loss_aux_layer_7": 0.073486328125, "loss_aux_layer_8": 0.0728759765625, "loss_aux_layer_9": 0.071533203125, "step": 2462, "total_loss": 0.667874664068222 }, { "epoch": 0.4876262126311621, "grad_norm": 0.8123762607574463, "learning_rate": 5e-05, "llm_loss": 0.5944728925824165, "loss": 2.7388, "loss_aux_layer_0": 0.019012451171875, "loss_aux_layer_1": 0.03863525390625, "loss_aux_layer_10": 0.0654296875, "loss_aux_layer_11": 0.0697021484375, "loss_aux_layer_12": 0.07470703125, "loss_aux_layer_13": 0.08056640625, "loss_aux_layer_14": 0.0894775390625, "loss_aux_layer_15": 0.098388671875, "loss_aux_layer_16": 0.1083984375, "loss_aux_layer_17": 0.1165771484375, "loss_aux_layer_18": 0.1253662109375, "loss_aux_layer_19": 0.12890625, "loss_aux_layer_2": 0.05047607421875, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.14404296875, "loss_aux_layer_22": 0.164306640625, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.060546875, "loss_aux_layer_4": 0.06341552734375, "loss_aux_layer_5": 0.065185546875, "loss_aux_layer_6": 0.0682373046875, "loss_aux_layer_7": 0.06597900390625, "loss_aux_layer_8": 0.065185546875, "loss_aux_layer_9": 0.06402587890625, "step": 2463, "total_loss": 0.6847071647644043 }, { "epoch": 0.48782419322906356, "grad_norm": 1.0110740661621094, "learning_rate": 5e-05, "llm_loss": 0.6011863201856613, "loss": 2.7747, "loss_aux_layer_0": 0.019989013671875, "loss_aux_layer_1": 0.0394287109375, "loss_aux_layer_10": 0.0675048828125, "loss_aux_layer_11": 0.0718994140625, "loss_aux_layer_12": 0.0770263671875, "loss_aux_layer_13": 0.0828857421875, "loss_aux_layer_14": 0.09228515625, "loss_aux_layer_15": 0.1014404296875, "loss_aux_layer_16": 0.1114501953125, "loss_aux_layer_17": 0.11962890625, "loss_aux_layer_18": 0.12841796875, "loss_aux_layer_19": 0.131103515625, "loss_aux_layer_2": 0.0526123046875, "loss_aux_layer_20": 0.138671875, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.204833984375, "loss_aux_layer_3": 0.0625, "loss_aux_layer_4": 0.0653076171875, "loss_aux_layer_5": 0.067138671875, "loss_aux_layer_6": 0.0704345703125, "loss_aux_layer_7": 0.068115234375, "loss_aux_layer_8": 0.067626953125, "loss_aux_layer_9": 0.066162109375, "step": 2464, "total_loss": 0.6936709582805634 }, { "epoch": 0.48802217382696494, "grad_norm": 0.891762912273407, "learning_rate": 5e-05, "llm_loss": 0.5260510444641113, "loss": 2.4679, "loss_aux_layer_0": 0.019073486328125, "loss_aux_layer_1": 0.0384521484375, "loss_aux_layer_10": 0.0662841796875, "loss_aux_layer_11": 0.0706787109375, "loss_aux_layer_12": 0.0753173828125, "loss_aux_layer_13": 0.0816650390625, "loss_aux_layer_14": 0.0906982421875, "loss_aux_layer_15": 0.099609375, "loss_aux_layer_16": 0.109619140625, "loss_aux_layer_17": 0.1175537109375, "loss_aux_layer_18": 0.1259765625, "loss_aux_layer_19": 0.1292724609375, "loss_aux_layer_2": 0.0513916015625, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.14453125, "loss_aux_layer_22": 0.164306640625, "loss_aux_layer_23": 0.20166015625, "loss_aux_layer_3": 0.061279296875, "loss_aux_layer_4": 0.06427001953125, "loss_aux_layer_5": 0.06640625, "loss_aux_layer_6": 0.0693359375, "loss_aux_layer_7": 0.0670166015625, "loss_aux_layer_8": 0.066162109375, "loss_aux_layer_9": 0.0650634765625, "step": 2465, "total_loss": 0.6169753074645996 }, { "epoch": 0.4882201544248664, "grad_norm": 1.0188870429992676, "learning_rate": 5e-05, "llm_loss": 0.5887829214334488, "loss": 2.7215, "loss_aux_layer_0": 0.0186767578125, "loss_aux_layer_1": 0.0396728515625, "loss_aux_layer_10": 0.068115234375, "loss_aux_layer_11": 0.072265625, "loss_aux_layer_12": 0.0771484375, "loss_aux_layer_13": 0.0830078125, "loss_aux_layer_14": 0.091552734375, "loss_aux_layer_15": 0.1004638671875, "loss_aux_layer_16": 0.1102294921875, "loss_aux_layer_17": 0.118408203125, "loss_aux_layer_18": 0.1268310546875, "loss_aux_layer_19": 0.12939453125, "loss_aux_layer_2": 0.0528564453125, "loss_aux_layer_20": 0.136962890625, "loss_aux_layer_21": 0.142822265625, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.1982421875, "loss_aux_layer_3": 0.063232421875, "loss_aux_layer_4": 0.0660400390625, "loss_aux_layer_5": 0.06787109375, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.0684814453125, "loss_aux_layer_8": 0.06787109375, "loss_aux_layer_9": 0.066650390625, "step": 2466, "total_loss": 0.6803781539201736 }, { "epoch": 0.48841813502276776, "grad_norm": 1.054827094078064, "learning_rate": 5e-05, "llm_loss": 0.6676792651414871, "loss": 3.0335, "loss_aux_layer_0": 0.0194091796875, "loss_aux_layer_1": 0.038330078125, "loss_aux_layer_10": 0.0657958984375, "loss_aux_layer_11": 0.07000732421875, "loss_aux_layer_12": 0.0745849609375, "loss_aux_layer_13": 0.0806884765625, "loss_aux_layer_14": 0.08984375, "loss_aux_layer_15": 0.0992431640625, "loss_aux_layer_16": 0.1094970703125, "loss_aux_layer_17": 0.1177978515625, "loss_aux_layer_18": 0.1265869140625, "loss_aux_layer_19": 0.12939453125, "loss_aux_layer_2": 0.0513916015625, "loss_aux_layer_20": 0.13720703125, "loss_aux_layer_21": 0.14453125, "loss_aux_layer_22": 0.16455078125, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.0615234375, "loss_aux_layer_4": 0.064208984375, "loss_aux_layer_5": 0.0655517578125, "loss_aux_layer_6": 0.06866455078125, "loss_aux_layer_7": 0.0662841796875, "loss_aux_layer_8": 0.0657958984375, "loss_aux_layer_9": 0.06451416015625, "step": 2467, "total_loss": 0.7583818286657333 }, { "epoch": 0.4886161156206692, "grad_norm": 1.2014408111572266, "learning_rate": 5e-05, "llm_loss": 0.6068708896636963, "loss": 2.8038, "loss_aux_layer_0": 0.020355224609375, "loss_aux_layer_1": 0.04034423828125, "loss_aux_layer_10": 0.0694580078125, "loss_aux_layer_11": 0.0740966796875, "loss_aux_layer_12": 0.079345703125, "loss_aux_layer_13": 0.0855712890625, "loss_aux_layer_14": 0.0943603515625, "loss_aux_layer_15": 0.1036376953125, "loss_aux_layer_16": 0.1131591796875, "loss_aux_layer_17": 0.1217041015625, "loss_aux_layer_18": 0.129638671875, "loss_aux_layer_19": 0.132080078125, "loss_aux_layer_2": 0.0537109375, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.16845703125, "loss_aux_layer_23": 0.205810546875, "loss_aux_layer_3": 0.0643310546875, "loss_aux_layer_4": 0.0673828125, "loss_aux_layer_5": 0.0692138671875, "loss_aux_layer_6": 0.0723876953125, "loss_aux_layer_7": 0.06982421875, "loss_aux_layer_8": 0.0692138671875, "loss_aux_layer_9": 0.06787109375, "step": 2468, "total_loss": 0.700955793261528 }, { "epoch": 0.4888140962185706, "grad_norm": 1.2302558422088623, "learning_rate": 5e-05, "llm_loss": 0.6574559062719345, "loss": 3.0199, "loss_aux_layer_0": 0.019866943359375, "loss_aux_layer_1": 0.0416259765625, "loss_aux_layer_10": 0.0721435546875, "loss_aux_layer_11": 0.076904296875, "loss_aux_layer_12": 0.0821533203125, "loss_aux_layer_13": 0.088623046875, "loss_aux_layer_14": 0.098388671875, "loss_aux_layer_15": 0.108154296875, "loss_aux_layer_16": 0.1182861328125, "loss_aux_layer_17": 0.126708984375, "loss_aux_layer_18": 0.134521484375, "loss_aux_layer_19": 0.136962890625, "loss_aux_layer_2": 0.05615234375, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.1513671875, "loss_aux_layer_22": 0.173583984375, "loss_aux_layer_23": 0.212158203125, "loss_aux_layer_3": 0.067138671875, "loss_aux_layer_4": 0.0704345703125, "loss_aux_layer_5": 0.0718994140625, "loss_aux_layer_6": 0.0748291015625, "loss_aux_layer_7": 0.072509765625, "loss_aux_layer_8": 0.07177734375, "loss_aux_layer_9": 0.070556640625, "step": 2469, "total_loss": 0.7549746632575989 }, { "epoch": 0.48901207681647196, "grad_norm": 1.0551633834838867, "learning_rate": 5e-05, "llm_loss": 0.5397471934556961, "loss": 2.5285, "loss_aux_layer_0": 0.020233154296875, "loss_aux_layer_1": 0.0386962890625, "loss_aux_layer_10": 0.06683349609375, "loss_aux_layer_11": 0.071044921875, "loss_aux_layer_12": 0.0760498046875, "loss_aux_layer_13": 0.0821533203125, "loss_aux_layer_14": 0.09130859375, "loss_aux_layer_15": 0.10107421875, "loss_aux_layer_16": 0.111328125, "loss_aux_layer_17": 0.1197509765625, "loss_aux_layer_18": 0.128662109375, "loss_aux_layer_19": 0.1324462890625, "loss_aux_layer_2": 0.0517578125, "loss_aux_layer_20": 0.140380859375, "loss_aux_layer_21": 0.148681640625, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.208251953125, "loss_aux_layer_3": 0.06158447265625, "loss_aux_layer_4": 0.06414794921875, "loss_aux_layer_5": 0.06591796875, "loss_aux_layer_6": 0.0689697265625, "loss_aux_layer_7": 0.06689453125, "loss_aux_layer_8": 0.066162109375, "loss_aux_layer_9": 0.06536865234375, "step": 2470, "total_loss": 0.632114052772522 }, { "epoch": 0.4892100574143734, "grad_norm": 1.3835269212722778, "learning_rate": 5e-05, "llm_loss": 0.6856158822774887, "loss": 3.1208, "loss_aux_layer_0": 0.019134521484375, "loss_aux_layer_1": 0.0406494140625, "loss_aux_layer_10": 0.0701904296875, "loss_aux_layer_11": 0.0745849609375, "loss_aux_layer_12": 0.079833984375, "loss_aux_layer_13": 0.0859375, "loss_aux_layer_14": 0.094970703125, "loss_aux_layer_15": 0.103515625, "loss_aux_layer_16": 0.113037109375, "loss_aux_layer_17": 0.120849609375, "loss_aux_layer_18": 0.129638671875, "loss_aux_layer_19": 0.1328125, "loss_aux_layer_2": 0.0538330078125, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.168701171875, "loss_aux_layer_23": 0.2060546875, "loss_aux_layer_3": 0.06475830078125, "loss_aux_layer_4": 0.0679931640625, "loss_aux_layer_5": 0.0699462890625, "loss_aux_layer_6": 0.073486328125, "loss_aux_layer_7": 0.0714111328125, "loss_aux_layer_8": 0.0703125, "loss_aux_layer_9": 0.0692138671875, "step": 2471, "total_loss": 0.7801985889673233 }, { "epoch": 0.4894080380122748, "grad_norm": 1.0266523361206055, "learning_rate": 5e-05, "llm_loss": 0.6350018233060837, "loss": 2.9217, "loss_aux_layer_0": 0.018829345703125, "loss_aux_layer_1": 0.0404052734375, "loss_aux_layer_10": 0.070556640625, "loss_aux_layer_11": 0.0753173828125, "loss_aux_layer_12": 0.08056640625, "loss_aux_layer_13": 0.0869140625, "loss_aux_layer_14": 0.0955810546875, "loss_aux_layer_15": 0.104736328125, "loss_aux_layer_16": 0.11474609375, "loss_aux_layer_17": 0.1224365234375, "loss_aux_layer_18": 0.1307373046875, "loss_aux_layer_19": 0.134033203125, "loss_aux_layer_2": 0.05419921875, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.14990234375, "loss_aux_layer_22": 0.17236328125, "loss_aux_layer_23": 0.21044921875, "loss_aux_layer_3": 0.06494140625, "loss_aux_layer_4": 0.0679931640625, "loss_aux_layer_5": 0.0699462890625, "loss_aux_layer_6": 0.0731201171875, "loss_aux_layer_7": 0.07080078125, "loss_aux_layer_8": 0.070556640625, "loss_aux_layer_9": 0.0692138671875, "step": 2472, "total_loss": 0.7304364889860153 }, { "epoch": 0.4896060186101762, "grad_norm": 0.9010999798774719, "learning_rate": 5e-05, "llm_loss": 0.6188950836658478, "loss": 2.8525, "loss_aux_layer_0": 0.020843505859375, "loss_aux_layer_1": 0.04058837890625, "loss_aux_layer_10": 0.0694580078125, "loss_aux_layer_11": 0.07373046875, "loss_aux_layer_12": 0.0787353515625, "loss_aux_layer_13": 0.0845947265625, "loss_aux_layer_14": 0.09375, "loss_aux_layer_15": 0.1031494140625, "loss_aux_layer_16": 0.11328125, "loss_aux_layer_17": 0.1212158203125, "loss_aux_layer_18": 0.130126953125, "loss_aux_layer_19": 0.133056640625, "loss_aux_layer_2": 0.053955078125, "loss_aux_layer_20": 0.14111328125, "loss_aux_layer_21": 0.148193359375, "loss_aux_layer_22": 0.168701171875, "loss_aux_layer_23": 0.20703125, "loss_aux_layer_3": 0.064453125, "loss_aux_layer_4": 0.06707763671875, "loss_aux_layer_5": 0.06890869140625, "loss_aux_layer_6": 0.0721435546875, "loss_aux_layer_7": 0.06982421875, "loss_aux_layer_8": 0.06927490234375, "loss_aux_layer_9": 0.06817626953125, "step": 2473, "total_loss": 0.7131157219409943 }, { "epoch": 0.4898039992080776, "grad_norm": 1.1107256412506104, "learning_rate": 5e-05, "llm_loss": 0.5581668466329575, "loss": 2.6248, "loss_aux_layer_0": 0.019287109375, "loss_aux_layer_1": 0.04241943359375, "loss_aux_layer_10": 0.0731201171875, "loss_aux_layer_11": 0.0782470703125, "loss_aux_layer_12": 0.08349609375, "loss_aux_layer_13": 0.08984375, "loss_aux_layer_14": 0.0987548828125, "loss_aux_layer_15": 0.1080322265625, "loss_aux_layer_16": 0.117919921875, "loss_aux_layer_17": 0.1253662109375, "loss_aux_layer_18": 0.1337890625, "loss_aux_layer_19": 0.13623046875, "loss_aux_layer_2": 0.05718994140625, "loss_aux_layer_20": 0.143798828125, "loss_aux_layer_21": 0.152099609375, "loss_aux_layer_22": 0.173828125, "loss_aux_layer_23": 0.21240234375, "loss_aux_layer_3": 0.0679931640625, "loss_aux_layer_4": 0.07080078125, "loss_aux_layer_5": 0.072998046875, "loss_aux_layer_6": 0.0762939453125, "loss_aux_layer_7": 0.0740966796875, "loss_aux_layer_8": 0.0731201171875, "loss_aux_layer_9": 0.0718994140625, "step": 2474, "total_loss": 0.6562046110630035 }, { "epoch": 0.49000197980597904, "grad_norm": 1.1978211402893066, "learning_rate": 5e-05, "llm_loss": 0.5615810453891754, "loss": 2.6132, "loss_aux_layer_0": 0.0194091796875, "loss_aux_layer_1": 0.0386962890625, "loss_aux_layer_10": 0.0672607421875, "loss_aux_layer_11": 0.0718994140625, "loss_aux_layer_12": 0.0765380859375, "loss_aux_layer_13": 0.0821533203125, "loss_aux_layer_14": 0.091064453125, "loss_aux_layer_15": 0.0999755859375, "loss_aux_layer_16": 0.10986328125, "loss_aux_layer_17": 0.1175537109375, "loss_aux_layer_18": 0.1259765625, "loss_aux_layer_19": 0.129638671875, "loss_aux_layer_2": 0.0521240234375, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.2041015625, "loss_aux_layer_3": 0.06231689453125, "loss_aux_layer_4": 0.0654296875, "loss_aux_layer_5": 0.0675048828125, "loss_aux_layer_6": 0.070556640625, "loss_aux_layer_7": 0.068115234375, "loss_aux_layer_8": 0.0673828125, "loss_aux_layer_9": 0.0660400390625, "step": 2475, "total_loss": 0.6533027291297913 }, { "epoch": 0.4901999604038804, "grad_norm": 0.8855754733085632, "learning_rate": 5e-05, "llm_loss": 0.5601863265037537, "loss": 2.6258, "loss_aux_layer_0": 0.01861572265625, "loss_aux_layer_1": 0.04144287109375, "loss_aux_layer_10": 0.0721435546875, "loss_aux_layer_11": 0.076904296875, "loss_aux_layer_12": 0.082275390625, "loss_aux_layer_13": 0.08837890625, "loss_aux_layer_14": 0.0970458984375, "loss_aux_layer_15": 0.1058349609375, "loss_aux_layer_16": 0.115234375, "loss_aux_layer_17": 0.123046875, "loss_aux_layer_18": 0.131103515625, "loss_aux_layer_19": 0.133544921875, "loss_aux_layer_2": 0.0555419921875, "loss_aux_layer_20": 0.1416015625, "loss_aux_layer_21": 0.149169921875, "loss_aux_layer_22": 0.17041015625, "loss_aux_layer_23": 0.209228515625, "loss_aux_layer_3": 0.06658935546875, "loss_aux_layer_4": 0.06982421875, "loss_aux_layer_5": 0.0716552734375, "loss_aux_layer_6": 0.074951171875, "loss_aux_layer_7": 0.0726318359375, "loss_aux_layer_8": 0.072021484375, "loss_aux_layer_9": 0.07080078125, "step": 2476, "total_loss": 0.65645532310009 }, { "epoch": 0.4903979410017818, "grad_norm": 1.0090218782424927, "learning_rate": 5e-05, "llm_loss": 0.5302538126707077, "loss": 2.4989, "loss_aux_layer_0": 0.019989013671875, "loss_aux_layer_1": 0.04022216796875, "loss_aux_layer_10": 0.0689697265625, "loss_aux_layer_11": 0.0736083984375, "loss_aux_layer_12": 0.0792236328125, "loss_aux_layer_13": 0.08544921875, "loss_aux_layer_14": 0.094970703125, "loss_aux_layer_15": 0.1046142578125, "loss_aux_layer_16": 0.1142578125, "loss_aux_layer_17": 0.1224365234375, "loss_aux_layer_18": 0.131103515625, "loss_aux_layer_19": 0.13427734375, "loss_aux_layer_2": 0.0538330078125, "loss_aux_layer_20": 0.14208984375, "loss_aux_layer_21": 0.148681640625, "loss_aux_layer_22": 0.169189453125, "loss_aux_layer_23": 0.206298828125, "loss_aux_layer_3": 0.064697265625, "loss_aux_layer_4": 0.0670166015625, "loss_aux_layer_5": 0.06884765625, "loss_aux_layer_6": 0.0718994140625, "loss_aux_layer_7": 0.06939697265625, "loss_aux_layer_8": 0.068603515625, "loss_aux_layer_9": 0.06768798828125, "step": 2477, "total_loss": 0.6247278228402138 }, { "epoch": 0.49059592159968324, "grad_norm": 1.2133586406707764, "learning_rate": 5e-05, "llm_loss": 0.6269847527146339, "loss": 2.8598, "loss_aux_layer_0": 0.019317626953125, "loss_aux_layer_1": 0.03790283203125, "loss_aux_layer_10": 0.0638427734375, "loss_aux_layer_11": 0.0679931640625, "loss_aux_layer_12": 0.0726318359375, "loss_aux_layer_13": 0.078125, "loss_aux_layer_14": 0.0863037109375, "loss_aux_layer_15": 0.0947265625, "loss_aux_layer_16": 0.103759765625, "loss_aux_layer_17": 0.112060546875, "loss_aux_layer_18": 0.1202392578125, "loss_aux_layer_19": 0.1239013671875, "loss_aux_layer_2": 0.0501708984375, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.06024169921875, "loss_aux_layer_4": 0.062744140625, "loss_aux_layer_5": 0.064453125, "loss_aux_layer_6": 0.0670166015625, "loss_aux_layer_7": 0.0645751953125, "loss_aux_layer_8": 0.06396484375, "loss_aux_layer_9": 0.0628662109375, "step": 2478, "total_loss": 0.714948982000351 }, { "epoch": 0.4907939021975846, "grad_norm": 0.7870036959648132, "learning_rate": 5e-05, "llm_loss": 0.5376674607396126, "loss": 2.5282, "loss_aux_layer_0": 0.019866943359375, "loss_aux_layer_1": 0.0400390625, "loss_aux_layer_10": 0.06884765625, "loss_aux_layer_11": 0.07373046875, "loss_aux_layer_12": 0.07861328125, "loss_aux_layer_13": 0.0845947265625, "loss_aux_layer_14": 0.093994140625, "loss_aux_layer_15": 0.10302734375, "loss_aux_layer_16": 0.11328125, "loss_aux_layer_17": 0.1212158203125, "loss_aux_layer_18": 0.130126953125, "loss_aux_layer_19": 0.133544921875, "loss_aux_layer_2": 0.0535888671875, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.14990234375, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.208740234375, "loss_aux_layer_3": 0.0640869140625, "loss_aux_layer_4": 0.06719970703125, "loss_aux_layer_5": 0.069091796875, "loss_aux_layer_6": 0.072509765625, "loss_aux_layer_7": 0.070068359375, "loss_aux_layer_8": 0.0689697265625, "loss_aux_layer_9": 0.06787109375, "step": 2479, "total_loss": 0.6320519596338272 }, { "epoch": 0.49099188279548606, "grad_norm": 1.31172513961792, "learning_rate": 5e-05, "llm_loss": 0.670564740896225, "loss": 3.0614, "loss_aux_layer_0": 0.02008056640625, "loss_aux_layer_1": 0.040283203125, "loss_aux_layer_10": 0.06982421875, "loss_aux_layer_11": 0.07470703125, "loss_aux_layer_12": 0.0797119140625, "loss_aux_layer_13": 0.0858154296875, "loss_aux_layer_14": 0.0948486328125, "loss_aux_layer_15": 0.1038818359375, "loss_aux_layer_16": 0.1141357421875, "loss_aux_layer_17": 0.1221923828125, "loss_aux_layer_18": 0.130859375, "loss_aux_layer_19": 0.133544921875, "loss_aux_layer_2": 0.0545654296875, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.148193359375, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.206298828125, "loss_aux_layer_3": 0.0654296875, "loss_aux_layer_4": 0.06829833984375, "loss_aux_layer_5": 0.06982421875, "loss_aux_layer_6": 0.072998046875, "loss_aux_layer_7": 0.0706787109375, "loss_aux_layer_8": 0.0701904296875, "loss_aux_layer_9": 0.0687255859375, "step": 2480, "total_loss": 0.7653510868549347 }, { "epoch": 0.49118986339338744, "grad_norm": 0.916668713092804, "learning_rate": 5e-05, "llm_loss": 0.6450016051530838, "loss": 2.9593, "loss_aux_layer_0": 0.019744873046875, "loss_aux_layer_1": 0.04156494140625, "loss_aux_layer_10": 0.0706787109375, "loss_aux_layer_11": 0.0753173828125, "loss_aux_layer_12": 0.0804443359375, "loss_aux_layer_13": 0.0860595703125, "loss_aux_layer_14": 0.0950927734375, "loss_aux_layer_15": 0.1038818359375, "loss_aux_layer_16": 0.1131591796875, "loss_aux_layer_17": 0.12109375, "loss_aux_layer_18": 0.128662109375, "loss_aux_layer_19": 0.131103515625, "loss_aux_layer_2": 0.05584716796875, "loss_aux_layer_20": 0.138671875, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.168701171875, "loss_aux_layer_23": 0.20556640625, "loss_aux_layer_3": 0.06689453125, "loss_aux_layer_4": 0.069580078125, "loss_aux_layer_5": 0.0712890625, "loss_aux_layer_6": 0.0740966796875, "loss_aux_layer_7": 0.071533203125, "loss_aux_layer_8": 0.0706787109375, "loss_aux_layer_9": 0.0693359375, "step": 2481, "total_loss": 0.739813044667244 }, { "epoch": 0.4913878439912889, "grad_norm": 1.1833051443099976, "learning_rate": 5e-05, "llm_loss": 0.5628283247351646, "loss": 2.6098, "loss_aux_layer_0": 0.019317626953125, "loss_aux_layer_1": 0.0372314453125, "loss_aux_layer_10": 0.06365966796875, "loss_aux_layer_11": 0.0677490234375, "loss_aux_layer_12": 0.072998046875, "loss_aux_layer_13": 0.0794677734375, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.0985107421875, "loss_aux_layer_16": 0.1090087890625, "loss_aux_layer_17": 0.1168212890625, "loss_aux_layer_18": 0.1259765625, "loss_aux_layer_19": 0.129638671875, "loss_aux_layer_2": 0.04852294921875, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.205810546875, "loss_aux_layer_3": 0.05841064453125, "loss_aux_layer_4": 0.06085205078125, "loss_aux_layer_5": 0.062744140625, "loss_aux_layer_6": 0.06549072265625, "loss_aux_layer_7": 0.0634765625, "loss_aux_layer_8": 0.06298828125, "loss_aux_layer_9": 0.0623779296875, "step": 2482, "total_loss": 0.6524482667446136 }, { "epoch": 0.49158582458919026, "grad_norm": 1.2404085397720337, "learning_rate": 5e-05, "llm_loss": 0.583424299955368, "loss": 2.7202, "loss_aux_layer_0": 0.020599365234375, "loss_aux_layer_1": 0.04058837890625, "loss_aux_layer_10": 0.070556640625, "loss_aux_layer_11": 0.0755615234375, "loss_aux_layer_12": 0.0810546875, "loss_aux_layer_13": 0.087890625, "loss_aux_layer_14": 0.097412109375, "loss_aux_layer_15": 0.1068115234375, "loss_aux_layer_16": 0.1171875, "loss_aux_layer_17": 0.125, "loss_aux_layer_18": 0.133544921875, "loss_aux_layer_19": 0.136962890625, "loss_aux_layer_2": 0.053955078125, "loss_aux_layer_20": 0.14501953125, "loss_aux_layer_21": 0.152587890625, "loss_aux_layer_22": 0.175048828125, "loss_aux_layer_23": 0.214599609375, "loss_aux_layer_3": 0.064697265625, "loss_aux_layer_4": 0.0677490234375, "loss_aux_layer_5": 0.069580078125, "loss_aux_layer_6": 0.07275390625, "loss_aux_layer_7": 0.0703125, "loss_aux_layer_8": 0.0699462890625, "loss_aux_layer_9": 0.0689697265625, "step": 2483, "total_loss": 0.6800405532121658 }, { "epoch": 0.49178380518709164, "grad_norm": 1.422463297843933, "learning_rate": 5e-05, "llm_loss": 0.6071406900882721, "loss": 2.8111, "loss_aux_layer_0": 0.02044677734375, "loss_aux_layer_1": 0.0404052734375, "loss_aux_layer_10": 0.0703125, "loss_aux_layer_11": 0.074951171875, "loss_aux_layer_12": 0.0809326171875, "loss_aux_layer_13": 0.08740234375, "loss_aux_layer_14": 0.09619140625, "loss_aux_layer_15": 0.1051025390625, "loss_aux_layer_16": 0.1151123046875, "loss_aux_layer_17": 0.12255859375, "loss_aux_layer_18": 0.131103515625, "loss_aux_layer_19": 0.13427734375, "loss_aux_layer_2": 0.0543212890625, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.150146484375, "loss_aux_layer_22": 0.172607421875, "loss_aux_layer_23": 0.2119140625, "loss_aux_layer_3": 0.06524658203125, "loss_aux_layer_4": 0.0682373046875, "loss_aux_layer_5": 0.0697021484375, "loss_aux_layer_6": 0.07275390625, "loss_aux_layer_7": 0.0704345703125, "loss_aux_layer_8": 0.0697021484375, "loss_aux_layer_9": 0.0689697265625, "step": 2484, "total_loss": 0.7027675211429596 }, { "epoch": 0.4919817857849931, "grad_norm": 0.983638346195221, "learning_rate": 5e-05, "llm_loss": 0.5137787237763405, "loss": 2.422, "loss_aux_layer_0": 0.019439697265625, "loss_aux_layer_1": 0.03704833984375, "loss_aux_layer_10": 0.06646728515625, "loss_aux_layer_11": 0.0709228515625, "loss_aux_layer_12": 0.0760498046875, "loss_aux_layer_13": 0.0821533203125, "loss_aux_layer_14": 0.092041015625, "loss_aux_layer_15": 0.1014404296875, "loss_aux_layer_16": 0.1116943359375, "loss_aux_layer_17": 0.119384765625, "loss_aux_layer_18": 0.128173828125, "loss_aux_layer_19": 0.1318359375, "loss_aux_layer_2": 0.04986572265625, "loss_aux_layer_20": 0.14013671875, "loss_aux_layer_21": 0.147705078125, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.2060546875, "loss_aux_layer_3": 0.06024169921875, "loss_aux_layer_4": 0.0631103515625, "loss_aux_layer_5": 0.06494140625, "loss_aux_layer_6": 0.0679931640625, "loss_aux_layer_7": 0.06585693359375, "loss_aux_layer_8": 0.0653076171875, "loss_aux_layer_9": 0.06500244140625, "step": 2485, "total_loss": 0.6054950058460236 }, { "epoch": 0.49217976638289446, "grad_norm": 1.0888948440551758, "learning_rate": 5e-05, "llm_loss": 0.5489249527454376, "loss": 2.5683, "loss_aux_layer_0": 0.019775390625, "loss_aux_layer_1": 0.04022216796875, "loss_aux_layer_10": 0.06768798828125, "loss_aux_layer_11": 0.0721435546875, "loss_aux_layer_12": 0.0775146484375, "loss_aux_layer_13": 0.0833740234375, "loss_aux_layer_14": 0.0924072265625, "loss_aux_layer_15": 0.1014404296875, "loss_aux_layer_16": 0.111328125, "loss_aux_layer_17": 0.1187744140625, "loss_aux_layer_18": 0.128173828125, "loss_aux_layer_19": 0.1317138671875, "loss_aux_layer_2": 0.0528564453125, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.209228515625, "loss_aux_layer_3": 0.063232421875, "loss_aux_layer_4": 0.06591796875, "loss_aux_layer_5": 0.06781005859375, "loss_aux_layer_6": 0.07073974609375, "loss_aux_layer_7": 0.06829833984375, "loss_aux_layer_8": 0.06768798828125, "loss_aux_layer_9": 0.06640625, "step": 2486, "total_loss": 0.6420698463916779 }, { "epoch": 0.4923777469807959, "grad_norm": 0.9838801026344299, "learning_rate": 5e-05, "llm_loss": 0.6000413298606873, "loss": 2.7599, "loss_aux_layer_0": 0.019561767578125, "loss_aux_layer_1": 0.038330078125, "loss_aux_layer_10": 0.0645751953125, "loss_aux_layer_11": 0.0687255859375, "loss_aux_layer_12": 0.073486328125, "loss_aux_layer_13": 0.0799560546875, "loss_aux_layer_14": 0.08935546875, "loss_aux_layer_15": 0.098388671875, "loss_aux_layer_16": 0.1083984375, "loss_aux_layer_17": 0.11669921875, "loss_aux_layer_18": 0.125244140625, "loss_aux_layer_19": 0.129150390625, "loss_aux_layer_2": 0.05029296875, "loss_aux_layer_20": 0.13720703125, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.164794921875, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.06011962890625, "loss_aux_layer_4": 0.06280517578125, "loss_aux_layer_5": 0.0645751953125, "loss_aux_layer_6": 0.0675048828125, "loss_aux_layer_7": 0.065185546875, "loss_aux_layer_8": 0.0645751953125, "loss_aux_layer_9": 0.06329345703125, "step": 2487, "total_loss": 0.6899771094322205 }, { "epoch": 0.4925757275786973, "grad_norm": 1.249016523361206, "learning_rate": 5e-05, "llm_loss": 0.6313883513212204, "loss": 2.9127, "loss_aux_layer_0": 0.019989013671875, "loss_aux_layer_1": 0.04144287109375, "loss_aux_layer_10": 0.0728759765625, "loss_aux_layer_11": 0.07763671875, "loss_aux_layer_12": 0.0826416015625, "loss_aux_layer_13": 0.0888671875, "loss_aux_layer_14": 0.09814453125, "loss_aux_layer_15": 0.1070556640625, "loss_aux_layer_16": 0.1171875, "loss_aux_layer_17": 0.12451171875, "loss_aux_layer_18": 0.1326904296875, "loss_aux_layer_19": 0.134765625, "loss_aux_layer_2": 0.054931640625, "loss_aux_layer_20": 0.14208984375, "loss_aux_layer_21": 0.150146484375, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.208740234375, "loss_aux_layer_3": 0.066162109375, "loss_aux_layer_4": 0.0694580078125, "loss_aux_layer_5": 0.07177734375, "loss_aux_layer_6": 0.0751953125, "loss_aux_layer_7": 0.072998046875, "loss_aux_layer_8": 0.072265625, "loss_aux_layer_9": 0.071044921875, "step": 2488, "total_loss": 0.7281813770532608 }, { "epoch": 0.4927737081765987, "grad_norm": 1.2502919435501099, "learning_rate": 5e-05, "llm_loss": 0.5807463526725769, "loss": 2.6905, "loss_aux_layer_0": 0.02032470703125, "loss_aux_layer_1": 0.0390625, "loss_aux_layer_10": 0.06658935546875, "loss_aux_layer_11": 0.07061767578125, "loss_aux_layer_12": 0.07568359375, "loss_aux_layer_13": 0.081787109375, "loss_aux_layer_14": 0.0906982421875, "loss_aux_layer_15": 0.1002197265625, "loss_aux_layer_16": 0.110595703125, "loss_aux_layer_17": 0.1182861328125, "loss_aux_layer_18": 0.12646484375, "loss_aux_layer_19": 0.130126953125, "loss_aux_layer_2": 0.052978515625, "loss_aux_layer_20": 0.1383056640625, "loss_aux_layer_21": 0.146240234375, "loss_aux_layer_22": 0.1669921875, "loss_aux_layer_23": 0.205810546875, "loss_aux_layer_3": 0.0623779296875, "loss_aux_layer_4": 0.065185546875, "loss_aux_layer_5": 0.06689453125, "loss_aux_layer_6": 0.0699462890625, "loss_aux_layer_7": 0.067626953125, "loss_aux_layer_8": 0.0667724609375, "loss_aux_layer_9": 0.0654296875, "step": 2489, "total_loss": 0.672619491815567 }, { "epoch": 0.4929716887745001, "grad_norm": 1.0730799436569214, "learning_rate": 5e-05, "llm_loss": 0.53038090467453, "loss": 2.4899, "loss_aux_layer_0": 0.02001953125, "loss_aux_layer_1": 0.03955078125, "loss_aux_layer_10": 0.0675048828125, "loss_aux_layer_11": 0.0716552734375, "loss_aux_layer_12": 0.076416015625, "loss_aux_layer_13": 0.08203125, "loss_aux_layer_14": 0.0906982421875, "loss_aux_layer_15": 0.099853515625, "loss_aux_layer_16": 0.1092529296875, "loss_aux_layer_17": 0.11669921875, "loss_aux_layer_18": 0.125732421875, "loss_aux_layer_19": 0.1298828125, "loss_aux_layer_2": 0.05230712890625, "loss_aux_layer_20": 0.137939453125, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.16845703125, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.0626220703125, "loss_aux_layer_4": 0.06561279296875, "loss_aux_layer_5": 0.0672607421875, "loss_aux_layer_6": 0.070556640625, "loss_aux_layer_7": 0.0682373046875, "loss_aux_layer_8": 0.067138671875, "loss_aux_layer_9": 0.066162109375, "step": 2490, "total_loss": 0.6224842220544815 }, { "epoch": 0.49316966937240153, "grad_norm": 0.9663907885551453, "learning_rate": 5e-05, "llm_loss": 0.5870262533426285, "loss": 2.7339, "loss_aux_layer_0": 0.019622802734375, "loss_aux_layer_1": 0.04254150390625, "loss_aux_layer_10": 0.0721435546875, "loss_aux_layer_11": 0.0767822265625, "loss_aux_layer_12": 0.0816650390625, "loss_aux_layer_13": 0.0875244140625, "loss_aux_layer_14": 0.096435546875, "loss_aux_layer_15": 0.1051025390625, "loss_aux_layer_16": 0.11474609375, "loss_aux_layer_17": 0.122314453125, "loss_aux_layer_18": 0.13134765625, "loss_aux_layer_19": 0.13330078125, "loss_aux_layer_2": 0.05670166015625, "loss_aux_layer_20": 0.140869140625, "loss_aux_layer_21": 0.149169921875, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.20947265625, "loss_aux_layer_3": 0.0679931640625, "loss_aux_layer_4": 0.0709228515625, "loss_aux_layer_5": 0.0726318359375, "loss_aux_layer_6": 0.0758056640625, "loss_aux_layer_7": 0.0733642578125, "loss_aux_layer_8": 0.0726318359375, "loss_aux_layer_9": 0.0709228515625, "step": 2491, "total_loss": 0.6834689378738403 }, { "epoch": 0.4933676499703029, "grad_norm": 0.9525206089019775, "learning_rate": 5e-05, "llm_loss": 0.6164652109146118, "loss": 2.8462, "loss_aux_layer_0": 0.02020263671875, "loss_aux_layer_1": 0.0408935546875, "loss_aux_layer_10": 0.0699462890625, "loss_aux_layer_11": 0.0748291015625, "loss_aux_layer_12": 0.079833984375, "loss_aux_layer_13": 0.0855712890625, "loss_aux_layer_14": 0.0948486328125, "loss_aux_layer_15": 0.10400390625, "loss_aux_layer_16": 0.1138916015625, "loss_aux_layer_17": 0.12158203125, "loss_aux_layer_18": 0.1298828125, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.0550537109375, "loss_aux_layer_20": 0.14013671875, "loss_aux_layer_21": 0.148681640625, "loss_aux_layer_22": 0.17138671875, "loss_aux_layer_23": 0.210205078125, "loss_aux_layer_3": 0.0657958984375, "loss_aux_layer_4": 0.0684814453125, "loss_aux_layer_5": 0.0704345703125, "loss_aux_layer_6": 0.07373046875, "loss_aux_layer_7": 0.0709228515625, "loss_aux_layer_8": 0.0703125, "loss_aux_layer_9": 0.0684814453125, "step": 2492, "total_loss": 0.7115498632192612 }, { "epoch": 0.4935656305682043, "grad_norm": 1.0745635032653809, "learning_rate": 5e-05, "llm_loss": 0.6075523495674133, "loss": 2.7987, "loss_aux_layer_0": 0.01947021484375, "loss_aux_layer_1": 0.0396728515625, "loss_aux_layer_10": 0.06756591796875, "loss_aux_layer_11": 0.0718994140625, "loss_aux_layer_12": 0.0770263671875, "loss_aux_layer_13": 0.0828857421875, "loss_aux_layer_14": 0.0924072265625, "loss_aux_layer_15": 0.1015625, "loss_aux_layer_16": 0.1114501953125, "loss_aux_layer_17": 0.1192626953125, "loss_aux_layer_18": 0.1275634765625, "loss_aux_layer_19": 0.130126953125, "loss_aux_layer_2": 0.05242919921875, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.144287109375, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.2021484375, "loss_aux_layer_3": 0.0626220703125, "loss_aux_layer_4": 0.065673828125, "loss_aux_layer_5": 0.0672607421875, "loss_aux_layer_6": 0.0706787109375, "loss_aux_layer_7": 0.068115234375, "loss_aux_layer_8": 0.06744384765625, "loss_aux_layer_9": 0.06634521484375, "step": 2493, "total_loss": 0.6996744275093079 }, { "epoch": 0.49376361116610573, "grad_norm": 0.9910717010498047, "learning_rate": 5e-05, "llm_loss": 0.5327717885375023, "loss": 2.5161, "loss_aux_layer_0": 0.020233154296875, "loss_aux_layer_1": 0.0423583984375, "loss_aux_layer_10": 0.07177734375, "loss_aux_layer_11": 0.0765380859375, "loss_aux_layer_12": 0.0819091796875, "loss_aux_layer_13": 0.088623046875, "loss_aux_layer_14": 0.0980224609375, "loss_aux_layer_15": 0.107177734375, "loss_aux_layer_16": 0.116943359375, "loss_aux_layer_17": 0.124267578125, "loss_aux_layer_18": 0.1324462890625, "loss_aux_layer_19": 0.134521484375, "loss_aux_layer_2": 0.05596923828125, "loss_aux_layer_20": 0.141357421875, "loss_aux_layer_21": 0.148193359375, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.2060546875, "loss_aux_layer_3": 0.0665283203125, "loss_aux_layer_4": 0.06982421875, "loss_aux_layer_5": 0.0712890625, "loss_aux_layer_6": 0.0743408203125, "loss_aux_layer_7": 0.0718994140625, "loss_aux_layer_8": 0.0714111328125, "loss_aux_layer_9": 0.070068359375, "step": 2494, "total_loss": 0.629019632935524 }, { "epoch": 0.4939615917640071, "grad_norm": 0.7563391923904419, "learning_rate": 5e-05, "llm_loss": 0.5859686434268951, "loss": 2.7141, "loss_aux_layer_0": 0.01995849609375, "loss_aux_layer_1": 0.040283203125, "loss_aux_layer_10": 0.068359375, "loss_aux_layer_11": 0.072998046875, "loss_aux_layer_12": 0.0780029296875, "loss_aux_layer_13": 0.083984375, "loss_aux_layer_14": 0.093505859375, "loss_aux_layer_15": 0.1016845703125, "loss_aux_layer_16": 0.111328125, "loss_aux_layer_17": 0.1190185546875, "loss_aux_layer_18": 0.126708984375, "loss_aux_layer_19": 0.1300048828125, "loss_aux_layer_2": 0.05303955078125, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.165283203125, "loss_aux_layer_23": 0.201416015625, "loss_aux_layer_3": 0.06329345703125, "loss_aux_layer_4": 0.066162109375, "loss_aux_layer_5": 0.068115234375, "loss_aux_layer_6": 0.0711669921875, "loss_aux_layer_7": 0.06884765625, "loss_aux_layer_8": 0.068603515625, "loss_aux_layer_9": 0.06719970703125, "step": 2495, "total_loss": 0.678533211350441 }, { "epoch": 0.49415957236190855, "grad_norm": 1.1977298259735107, "learning_rate": 5e-05, "llm_loss": 0.5501709952950478, "loss": 2.5802, "loss_aux_layer_0": 0.01904296875, "loss_aux_layer_1": 0.0406494140625, "loss_aux_layer_10": 0.0703125, "loss_aux_layer_11": 0.0748291015625, "loss_aux_layer_12": 0.0799560546875, "loss_aux_layer_13": 0.086181640625, "loss_aux_layer_14": 0.0955810546875, "loss_aux_layer_15": 0.10498046875, "loss_aux_layer_16": 0.1146240234375, "loss_aux_layer_17": 0.122802734375, "loss_aux_layer_18": 0.130859375, "loss_aux_layer_19": 0.133056640625, "loss_aux_layer_2": 0.0543212890625, "loss_aux_layer_20": 0.140380859375, "loss_aux_layer_21": 0.14697265625, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.2060546875, "loss_aux_layer_3": 0.0654296875, "loss_aux_layer_4": 0.068359375, "loss_aux_layer_5": 0.070068359375, "loss_aux_layer_6": 0.0736083984375, "loss_aux_layer_7": 0.071044921875, "loss_aux_layer_8": 0.0704345703125, "loss_aux_layer_9": 0.0692138671875, "step": 2496, "total_loss": 0.6450503021478653 }, { "epoch": 0.49435755295980993, "grad_norm": 1.1332001686096191, "learning_rate": 5e-05, "llm_loss": 0.4908299595117569, "loss": 2.3431, "loss_aux_layer_0": 0.020172119140625, "loss_aux_layer_1": 0.04095458984375, "loss_aux_layer_10": 0.0699462890625, "loss_aux_layer_11": 0.0745849609375, "loss_aux_layer_12": 0.0794677734375, "loss_aux_layer_13": 0.0850830078125, "loss_aux_layer_14": 0.0943603515625, "loss_aux_layer_15": 0.1036376953125, "loss_aux_layer_16": 0.1131591796875, "loss_aux_layer_17": 0.1212158203125, "loss_aux_layer_18": 0.130126953125, "loss_aux_layer_19": 0.13330078125, "loss_aux_layer_2": 0.054931640625, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.1494140625, "loss_aux_layer_22": 0.1708984375, "loss_aux_layer_23": 0.21044921875, "loss_aux_layer_3": 0.0654296875, "loss_aux_layer_4": 0.0682373046875, "loss_aux_layer_5": 0.0703125, "loss_aux_layer_6": 0.0732421875, "loss_aux_layer_7": 0.07080078125, "loss_aux_layer_8": 0.070068359375, "loss_aux_layer_9": 0.068603515625, "step": 2497, "total_loss": 0.585764929652214 }, { "epoch": 0.49455553355771137, "grad_norm": 1.1413682699203491, "learning_rate": 5e-05, "llm_loss": 0.5858859047293663, "loss": 2.7089, "loss_aux_layer_0": 0.01873779296875, "loss_aux_layer_1": 0.0380859375, "loss_aux_layer_10": 0.0673828125, "loss_aux_layer_11": 0.0718994140625, "loss_aux_layer_12": 0.07666015625, "loss_aux_layer_13": 0.083251953125, "loss_aux_layer_14": 0.0921630859375, "loss_aux_layer_15": 0.1011962890625, "loss_aux_layer_16": 0.1107177734375, "loss_aux_layer_17": 0.1187744140625, "loss_aux_layer_18": 0.1265869140625, "loss_aux_layer_19": 0.129638671875, "loss_aux_layer_2": 0.05084228515625, "loss_aux_layer_20": 0.136474609375, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.20068359375, "loss_aux_layer_3": 0.06134033203125, "loss_aux_layer_4": 0.0643310546875, "loss_aux_layer_5": 0.06640625, "loss_aux_layer_6": 0.069580078125, "loss_aux_layer_7": 0.0673828125, "loss_aux_layer_8": 0.0667724609375, "loss_aux_layer_9": 0.0657958984375, "step": 2498, "total_loss": 0.6772167682647705 }, { "epoch": 0.49475351415561275, "grad_norm": 0.9892949461936951, "learning_rate": 5e-05, "llm_loss": 0.656543180346489, "loss": 3.0054, "loss_aux_layer_0": 0.019195556640625, "loss_aux_layer_1": 0.0413818359375, "loss_aux_layer_10": 0.0704345703125, "loss_aux_layer_11": 0.0748291015625, "loss_aux_layer_12": 0.080322265625, "loss_aux_layer_13": 0.0863037109375, "loss_aux_layer_14": 0.095458984375, "loss_aux_layer_15": 0.10400390625, "loss_aux_layer_16": 0.1138916015625, "loss_aux_layer_17": 0.121826171875, "loss_aux_layer_18": 0.1304931640625, "loss_aux_layer_19": 0.133056640625, "loss_aux_layer_2": 0.05499267578125, "loss_aux_layer_20": 0.140380859375, "loss_aux_layer_21": 0.147216796875, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.0655517578125, "loss_aux_layer_4": 0.0689697265625, "loss_aux_layer_5": 0.0709228515625, "loss_aux_layer_6": 0.0738525390625, "loss_aux_layer_7": 0.0714111328125, "loss_aux_layer_8": 0.0706787109375, "loss_aux_layer_9": 0.06915283203125, "step": 2499, "total_loss": 0.7513574808835983 }, { "epoch": 0.49495149475351413, "grad_norm": 0.8708381652832031, "learning_rate": 5e-05, "llm_loss": 0.5559673756361008, "loss": 2.5812, "loss_aux_layer_0": 0.019195556640625, "loss_aux_layer_1": 0.037841796875, "loss_aux_layer_10": 0.065185546875, "loss_aux_layer_11": 0.0692138671875, "loss_aux_layer_12": 0.0743408203125, "loss_aux_layer_13": 0.080322265625, "loss_aux_layer_14": 0.0894775390625, "loss_aux_layer_15": 0.09814453125, "loss_aux_layer_16": 0.1077880859375, "loss_aux_layer_17": 0.1156005859375, "loss_aux_layer_18": 0.1240234375, "loss_aux_layer_19": 0.1268310546875, "loss_aux_layer_2": 0.05072021484375, "loss_aux_layer_20": 0.13427734375, "loss_aux_layer_21": 0.141357421875, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.1982421875, "loss_aux_layer_3": 0.06048583984375, "loss_aux_layer_4": 0.0633544921875, "loss_aux_layer_5": 0.06500244140625, "loss_aux_layer_6": 0.06787109375, "loss_aux_layer_7": 0.0657958984375, "loss_aux_layer_8": 0.0650634765625, "loss_aux_layer_9": 0.06396484375, "step": 2500, "total_loss": 0.6453029662370682 }, { "epoch": 0.49514947535141557, "grad_norm": 1.3804898262023926, "learning_rate": 5e-05, "llm_loss": 0.6074353754520416, "loss": 2.8017, "loss_aux_layer_0": 0.020477294921875, "loss_aux_layer_1": 0.0413818359375, "loss_aux_layer_10": 0.0684814453125, "loss_aux_layer_11": 0.0726318359375, "loss_aux_layer_12": 0.078125, "loss_aux_layer_13": 0.08447265625, "loss_aux_layer_14": 0.093505859375, "loss_aux_layer_15": 0.1019287109375, "loss_aux_layer_16": 0.111083984375, "loss_aux_layer_17": 0.1187744140625, "loss_aux_layer_18": 0.1268310546875, "loss_aux_layer_19": 0.1300048828125, "loss_aux_layer_2": 0.054443359375, "loss_aux_layer_20": 0.13720703125, "loss_aux_layer_21": 0.14453125, "loss_aux_layer_22": 0.165283203125, "loss_aux_layer_23": 0.20263671875, "loss_aux_layer_3": 0.0648193359375, "loss_aux_layer_4": 0.06768798828125, "loss_aux_layer_5": 0.0692138671875, "loss_aux_layer_6": 0.0721435546875, "loss_aux_layer_7": 0.069580078125, "loss_aux_layer_8": 0.0687255859375, "loss_aux_layer_9": 0.06732177734375, "step": 2501, "total_loss": 0.7004216611385345 }, { "epoch": 0.49534745594931695, "grad_norm": 0.9052412509918213, "learning_rate": 5e-05, "llm_loss": 0.5482942909002304, "loss": 2.5621, "loss_aux_layer_0": 0.02056884765625, "loss_aux_layer_1": 0.0384521484375, "loss_aux_layer_10": 0.06658935546875, "loss_aux_layer_11": 0.0711669921875, "loss_aux_layer_12": 0.0760498046875, "loss_aux_layer_13": 0.0819091796875, "loss_aux_layer_14": 0.0914306640625, "loss_aux_layer_15": 0.100830078125, "loss_aux_layer_16": 0.111083984375, "loss_aux_layer_17": 0.11962890625, "loss_aux_layer_18": 0.127685546875, "loss_aux_layer_19": 0.131103515625, "loss_aux_layer_2": 0.0523681640625, "loss_aux_layer_20": 0.139404296875, "loss_aux_layer_21": 0.147216796875, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.206298828125, "loss_aux_layer_3": 0.06243896484375, "loss_aux_layer_4": 0.06512451171875, "loss_aux_layer_5": 0.0670166015625, "loss_aux_layer_6": 0.0699462890625, "loss_aux_layer_7": 0.0672607421875, "loss_aux_layer_8": 0.06658935546875, "loss_aux_layer_9": 0.0653076171875, "step": 2502, "total_loss": 0.640518069267273 }, { "epoch": 0.4955454365472184, "grad_norm": 1.0373494625091553, "learning_rate": 5e-05, "llm_loss": 0.6407963782548904, "loss": 2.9201, "loss_aux_layer_0": 0.01885986328125, "loss_aux_layer_1": 0.03759765625, "loss_aux_layer_10": 0.06427001953125, "loss_aux_layer_11": 0.068603515625, "loss_aux_layer_12": 0.0736083984375, "loss_aux_layer_13": 0.07958984375, "loss_aux_layer_14": 0.0885009765625, "loss_aux_layer_15": 0.0970458984375, "loss_aux_layer_16": 0.1068115234375, "loss_aux_layer_17": 0.1148681640625, "loss_aux_layer_18": 0.1239013671875, "loss_aux_layer_19": 0.12744140625, "loss_aux_layer_2": 0.04962158203125, "loss_aux_layer_20": 0.1357421875, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.1640625, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.05975341796875, "loss_aux_layer_4": 0.0623779296875, "loss_aux_layer_5": 0.06414794921875, "loss_aux_layer_6": 0.0670166015625, "loss_aux_layer_7": 0.0648193359375, "loss_aux_layer_8": 0.0640869140625, "loss_aux_layer_9": 0.062744140625, "step": 2503, "total_loss": 0.7300177067518234 }, { "epoch": 0.49574341714511977, "grad_norm": 0.9904662370681763, "learning_rate": 5e-05, "llm_loss": 0.5841702073812485, "loss": 2.7014, "loss_aux_layer_0": 0.019317626953125, "loss_aux_layer_1": 0.0386962890625, "loss_aux_layer_10": 0.0657958984375, "loss_aux_layer_11": 0.0703125, "loss_aux_layer_12": 0.0755615234375, "loss_aux_layer_13": 0.081298828125, "loss_aux_layer_14": 0.090087890625, "loss_aux_layer_15": 0.09912109375, "loss_aux_layer_16": 0.1092529296875, "loss_aux_layer_17": 0.1170654296875, "loss_aux_layer_18": 0.12548828125, "loss_aux_layer_19": 0.129638671875, "loss_aux_layer_2": 0.0511474609375, "loss_aux_layer_20": 0.13818359375, "loss_aux_layer_21": 0.14697265625, "loss_aux_layer_22": 0.16845703125, "loss_aux_layer_23": 0.206787109375, "loss_aux_layer_3": 0.0609130859375, "loss_aux_layer_4": 0.06378173828125, "loss_aux_layer_5": 0.0655517578125, "loss_aux_layer_6": 0.068359375, "loss_aux_layer_7": 0.066162109375, "loss_aux_layer_8": 0.0653076171875, "loss_aux_layer_9": 0.0643310546875, "step": 2504, "total_loss": 0.6753438413143158 }, { "epoch": 0.4959413977430212, "grad_norm": 0.9318678379058838, "learning_rate": 5e-05, "llm_loss": 0.5689133405685425, "loss": 2.6505, "loss_aux_layer_0": 0.01947021484375, "loss_aux_layer_1": 0.0394287109375, "loss_aux_layer_10": 0.06951904296875, "loss_aux_layer_11": 0.0740966796875, "loss_aux_layer_12": 0.0791015625, "loss_aux_layer_13": 0.0853271484375, "loss_aux_layer_14": 0.093994140625, "loss_aux_layer_15": 0.1026611328125, "loss_aux_layer_16": 0.1121826171875, "loss_aux_layer_17": 0.1204833984375, "loss_aux_layer_18": 0.1295166015625, "loss_aux_layer_19": 0.132080078125, "loss_aux_layer_2": 0.0528564453125, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.147216796875, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.205322265625, "loss_aux_layer_3": 0.0633544921875, "loss_aux_layer_4": 0.06658935546875, "loss_aux_layer_5": 0.06842041015625, "loss_aux_layer_6": 0.07177734375, "loss_aux_layer_7": 0.06982421875, "loss_aux_layer_8": 0.0692138671875, "loss_aux_layer_9": 0.0682373046875, "step": 2505, "total_loss": 0.6626217365264893 }, { "epoch": 0.4961393783409226, "grad_norm": 1.3216309547424316, "learning_rate": 5e-05, "llm_loss": 0.5720452517271042, "loss": 2.6757, "loss_aux_layer_0": 0.019378662109375, "loss_aux_layer_1": 0.04217529296875, "loss_aux_layer_10": 0.0733642578125, "loss_aux_layer_11": 0.078369140625, "loss_aux_layer_12": 0.083740234375, "loss_aux_layer_13": 0.0904541015625, "loss_aux_layer_14": 0.0992431640625, "loss_aux_layer_15": 0.1075439453125, "loss_aux_layer_16": 0.116943359375, "loss_aux_layer_17": 0.1246337890625, "loss_aux_layer_18": 0.13232421875, "loss_aux_layer_19": 0.133544921875, "loss_aux_layer_2": 0.05615234375, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.147705078125, "loss_aux_layer_22": 0.16845703125, "loss_aux_layer_23": 0.20556640625, "loss_aux_layer_3": 0.0675048828125, "loss_aux_layer_4": 0.070556640625, "loss_aux_layer_5": 0.072265625, "loss_aux_layer_6": 0.0758056640625, "loss_aux_layer_7": 0.0733642578125, "loss_aux_layer_8": 0.0728759765625, "loss_aux_layer_9": 0.0714111328125, "step": 2506, "total_loss": 0.6689160168170929 }, { "epoch": 0.49633735893882397, "grad_norm": 1.0429761409759521, "learning_rate": 5e-05, "llm_loss": 0.6670700460672379, "loss": 3.0505, "loss_aux_layer_0": 0.019378662109375, "loss_aux_layer_1": 0.0411376953125, "loss_aux_layer_10": 0.0704345703125, "loss_aux_layer_11": 0.0753173828125, "loss_aux_layer_12": 0.0806884765625, "loss_aux_layer_13": 0.0869140625, "loss_aux_layer_14": 0.0955810546875, "loss_aux_layer_15": 0.104736328125, "loss_aux_layer_16": 0.1143798828125, "loss_aux_layer_17": 0.1219482421875, "loss_aux_layer_18": 0.130615234375, "loss_aux_layer_19": 0.13330078125, "loss_aux_layer_2": 0.05584716796875, "loss_aux_layer_20": 0.14111328125, "loss_aux_layer_21": 0.149169921875, "loss_aux_layer_22": 0.1708984375, "loss_aux_layer_23": 0.2099609375, "loss_aux_layer_3": 0.066162109375, "loss_aux_layer_4": 0.0689697265625, "loss_aux_layer_5": 0.070556640625, "loss_aux_layer_6": 0.0740966796875, "loss_aux_layer_7": 0.0716552734375, "loss_aux_layer_8": 0.0706787109375, "loss_aux_layer_9": 0.0693359375, "step": 2507, "total_loss": 0.7626178860664368 }, { "epoch": 0.4965353395367254, "grad_norm": 1.0402792692184448, "learning_rate": 5e-05, "llm_loss": 0.697102315723896, "loss": 3.1528, "loss_aux_layer_0": 0.019927978515625, "loss_aux_layer_1": 0.03936767578125, "loss_aux_layer_10": 0.06634521484375, "loss_aux_layer_11": 0.070556640625, "loss_aux_layer_12": 0.075439453125, "loss_aux_layer_13": 0.0814208984375, "loss_aux_layer_14": 0.0904541015625, "loss_aux_layer_15": 0.0989990234375, "loss_aux_layer_16": 0.109130859375, "loss_aux_layer_17": 0.117431640625, "loss_aux_layer_18": 0.125244140625, "loss_aux_layer_19": 0.128173828125, "loss_aux_layer_2": 0.052490234375, "loss_aux_layer_20": 0.13623046875, "loss_aux_layer_21": 0.14453125, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.2021484375, "loss_aux_layer_3": 0.0626220703125, "loss_aux_layer_4": 0.0653076171875, "loss_aux_layer_5": 0.066650390625, "loss_aux_layer_6": 0.06982421875, "loss_aux_layer_7": 0.06732177734375, "loss_aux_layer_8": 0.06622314453125, "loss_aux_layer_9": 0.06500244140625, "step": 2508, "total_loss": 0.7881955504417419 }, { "epoch": 0.4967333201346268, "grad_norm": 1.4458062648773193, "learning_rate": 5e-05, "llm_loss": 0.6021535992622375, "loss": 2.7631, "loss_aux_layer_0": 0.02093505859375, "loss_aux_layer_1": 0.03717041015625, "loss_aux_layer_10": 0.06402587890625, "loss_aux_layer_11": 0.068115234375, "loss_aux_layer_12": 0.0728759765625, "loss_aux_layer_13": 0.0787353515625, "loss_aux_layer_14": 0.0877685546875, "loss_aux_layer_15": 0.096435546875, "loss_aux_layer_16": 0.105712890625, "loss_aux_layer_17": 0.1136474609375, "loss_aux_layer_18": 0.1221923828125, "loss_aux_layer_19": 0.1260986328125, "loss_aux_layer_2": 0.05096435546875, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.199462890625, "loss_aux_layer_3": 0.060302734375, "loss_aux_layer_4": 0.0626220703125, "loss_aux_layer_5": 0.06427001953125, "loss_aux_layer_6": 0.06689453125, "loss_aux_layer_7": 0.0645751953125, "loss_aux_layer_8": 0.06396484375, "loss_aux_layer_9": 0.06292724609375, "step": 2509, "total_loss": 0.6907642632722855 }, { "epoch": 0.4969313007325282, "grad_norm": 0.934426486492157, "learning_rate": 5e-05, "llm_loss": 0.5600289404392242, "loss": 2.6131, "loss_aux_layer_0": 0.019989013671875, "loss_aux_layer_1": 0.03961181640625, "loss_aux_layer_10": 0.068115234375, "loss_aux_layer_11": 0.072265625, "loss_aux_layer_12": 0.0771484375, "loss_aux_layer_13": 0.083251953125, "loss_aux_layer_14": 0.0928955078125, "loss_aux_layer_15": 0.1019287109375, "loss_aux_layer_16": 0.1121826171875, "loss_aux_layer_17": 0.1201171875, "loss_aux_layer_18": 0.1290283203125, "loss_aux_layer_19": 0.13232421875, "loss_aux_layer_2": 0.05322265625, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.1474609375, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.20654296875, "loss_aux_layer_3": 0.0633544921875, "loss_aux_layer_4": 0.06640625, "loss_aux_layer_5": 0.068359375, "loss_aux_layer_6": 0.07177734375, "loss_aux_layer_7": 0.069091796875, "loss_aux_layer_8": 0.0682373046875, "loss_aux_layer_9": 0.0670166015625, "step": 2510, "total_loss": 0.653271421790123 }, { "epoch": 0.4971292813304296, "grad_norm": 1.273500919342041, "learning_rate": 5e-05, "llm_loss": 0.6578999608755112, "loss": 2.9947, "loss_aux_layer_0": 0.019989013671875, "loss_aux_layer_1": 0.03802490234375, "loss_aux_layer_10": 0.0650634765625, "loss_aux_layer_11": 0.069580078125, "loss_aux_layer_12": 0.07470703125, "loss_aux_layer_13": 0.080810546875, "loss_aux_layer_14": 0.08935546875, "loss_aux_layer_15": 0.0982666015625, "loss_aux_layer_16": 0.10791015625, "loss_aux_layer_17": 0.1162109375, "loss_aux_layer_18": 0.1246337890625, "loss_aux_layer_19": 0.12890625, "loss_aux_layer_2": 0.05120849609375, "loss_aux_layer_20": 0.13720703125, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.208251953125, "loss_aux_layer_3": 0.0611572265625, "loss_aux_layer_4": 0.0635986328125, "loss_aux_layer_5": 0.06524658203125, "loss_aux_layer_6": 0.068359375, "loss_aux_layer_7": 0.066162109375, "loss_aux_layer_8": 0.0650634765625, "loss_aux_layer_9": 0.06390380859375, "step": 2511, "total_loss": 0.7486681640148163 }, { "epoch": 0.49732726192833104, "grad_norm": 0.9010961651802063, "learning_rate": 5e-05, "llm_loss": 0.5592603832483292, "loss": 2.6144, "loss_aux_layer_0": 0.019195556640625, "loss_aux_layer_1": 0.0396728515625, "loss_aux_layer_10": 0.069091796875, "loss_aux_layer_11": 0.073486328125, "loss_aux_layer_12": 0.0787353515625, "loss_aux_layer_13": 0.0849609375, "loss_aux_layer_14": 0.0941162109375, "loss_aux_layer_15": 0.1033935546875, "loss_aux_layer_16": 0.113525390625, "loss_aux_layer_17": 0.1214599609375, "loss_aux_layer_18": 0.129150390625, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.05401611328125, "loss_aux_layer_20": 0.140380859375, "loss_aux_layer_21": 0.1484375, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.208740234375, "loss_aux_layer_3": 0.06475830078125, "loss_aux_layer_4": 0.067626953125, "loss_aux_layer_5": 0.0697021484375, "loss_aux_layer_6": 0.07275390625, "loss_aux_layer_7": 0.070068359375, "loss_aux_layer_8": 0.0694580078125, "loss_aux_layer_9": 0.067626953125, "step": 2512, "total_loss": 0.6535881459712982 }, { "epoch": 0.4975252425262324, "grad_norm": 1.149725317955017, "learning_rate": 5e-05, "llm_loss": 0.6200633943080902, "loss": 2.8545, "loss_aux_layer_0": 0.02020263671875, "loss_aux_layer_1": 0.03997802734375, "loss_aux_layer_10": 0.0687255859375, "loss_aux_layer_11": 0.073486328125, "loss_aux_layer_12": 0.0784912109375, "loss_aux_layer_13": 0.0843505859375, "loss_aux_layer_14": 0.09375, "loss_aux_layer_15": 0.102783203125, "loss_aux_layer_16": 0.1124267578125, "loss_aux_layer_17": 0.1204833984375, "loss_aux_layer_18": 0.1282958984375, "loss_aux_layer_19": 0.131591796875, "loss_aux_layer_2": 0.05413818359375, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.16748046875, "loss_aux_layer_23": 0.20556640625, "loss_aux_layer_3": 0.0645751953125, "loss_aux_layer_4": 0.06744384765625, "loss_aux_layer_5": 0.069091796875, "loss_aux_layer_6": 0.072021484375, "loss_aux_layer_7": 0.0693359375, "loss_aux_layer_8": 0.068603515625, "loss_aux_layer_9": 0.0672607421875, "step": 2513, "total_loss": 0.7136207818984985 }, { "epoch": 0.49772322312413386, "grad_norm": 0.8834227323532104, "learning_rate": 5e-05, "llm_loss": 0.6075517758727074, "loss": 2.7852, "loss_aux_layer_0": 0.02099609375, "loss_aux_layer_1": 0.0374755859375, "loss_aux_layer_10": 0.063720703125, "loss_aux_layer_11": 0.0675048828125, "loss_aux_layer_12": 0.072509765625, "loss_aux_layer_13": 0.078125, "loss_aux_layer_14": 0.08740234375, "loss_aux_layer_15": 0.096435546875, "loss_aux_layer_16": 0.10595703125, "loss_aux_layer_17": 0.1136474609375, "loss_aux_layer_18": 0.12255859375, "loss_aux_layer_19": 0.12646484375, "loss_aux_layer_2": 0.04974365234375, "loss_aux_layer_20": 0.134765625, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.05975341796875, "loss_aux_layer_4": 0.0623779296875, "loss_aux_layer_5": 0.06378173828125, "loss_aux_layer_6": 0.066650390625, "loss_aux_layer_7": 0.0643310546875, "loss_aux_layer_8": 0.063720703125, "loss_aux_layer_9": 0.0623779296875, "step": 2514, "total_loss": 0.6963112205266953 }, { "epoch": 0.49792120372203524, "grad_norm": 1.1097592115402222, "learning_rate": 5e-05, "llm_loss": 0.5636245459318161, "loss": 2.6256, "loss_aux_layer_0": 0.01953125, "loss_aux_layer_1": 0.0400390625, "loss_aux_layer_10": 0.0679931640625, "loss_aux_layer_11": 0.0726318359375, "loss_aux_layer_12": 0.0775146484375, "loss_aux_layer_13": 0.083251953125, "loss_aux_layer_14": 0.0921630859375, "loss_aux_layer_15": 0.10107421875, "loss_aux_layer_16": 0.1104736328125, "loss_aux_layer_17": 0.1182861328125, "loss_aux_layer_18": 0.126708984375, "loss_aux_layer_19": 0.129638671875, "loss_aux_layer_2": 0.0538330078125, "loss_aux_layer_20": 0.13720703125, "loss_aux_layer_21": 0.145751953125, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.206298828125, "loss_aux_layer_3": 0.06439208984375, "loss_aux_layer_4": 0.067138671875, "loss_aux_layer_5": 0.069091796875, "loss_aux_layer_6": 0.07177734375, "loss_aux_layer_7": 0.0693359375, "loss_aux_layer_8": 0.0682373046875, "loss_aux_layer_9": 0.06689453125, "step": 2515, "total_loss": 0.6564013212919235 }, { "epoch": 0.4981191843199366, "grad_norm": 1.0467767715454102, "learning_rate": 5e-05, "llm_loss": 0.6167695373296738, "loss": 2.8368, "loss_aux_layer_0": 0.021026611328125, "loss_aux_layer_1": 0.03790283203125, "loss_aux_layer_10": 0.06640625, "loss_aux_layer_11": 0.071044921875, "loss_aux_layer_12": 0.0760498046875, "loss_aux_layer_13": 0.0821533203125, "loss_aux_layer_14": 0.092041015625, "loss_aux_layer_15": 0.1016845703125, "loss_aux_layer_16": 0.11181640625, "loss_aux_layer_17": 0.1207275390625, "loss_aux_layer_18": 0.12939453125, "loss_aux_layer_19": 0.1328125, "loss_aux_layer_2": 0.0511474609375, "loss_aux_layer_20": 0.140869140625, "loss_aux_layer_21": 0.1484375, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.20751953125, "loss_aux_layer_3": 0.061279296875, "loss_aux_layer_4": 0.06402587890625, "loss_aux_layer_5": 0.0657958984375, "loss_aux_layer_6": 0.069091796875, "loss_aux_layer_7": 0.0667724609375, "loss_aux_layer_8": 0.0662841796875, "loss_aux_layer_9": 0.0650634765625, "step": 2516, "total_loss": 0.7091947197914124 }, { "epoch": 0.49831716491783806, "grad_norm": 1.2451162338256836, "learning_rate": 5e-05, "llm_loss": 0.6113072037696838, "loss": 2.8133, "loss_aux_layer_0": 0.01995849609375, "loss_aux_layer_1": 0.0377197265625, "loss_aux_layer_10": 0.066650390625, "loss_aux_layer_11": 0.0712890625, "loss_aux_layer_12": 0.07666015625, "loss_aux_layer_13": 0.0826416015625, "loss_aux_layer_14": 0.092041015625, "loss_aux_layer_15": 0.1011962890625, "loss_aux_layer_16": 0.1114501953125, "loss_aux_layer_17": 0.1195068359375, "loss_aux_layer_18": 0.12841796875, "loss_aux_layer_19": 0.13134765625, "loss_aux_layer_2": 0.05072021484375, "loss_aux_layer_20": 0.138916015625, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.206787109375, "loss_aux_layer_3": 0.06134033203125, "loss_aux_layer_4": 0.0643310546875, "loss_aux_layer_5": 0.0660400390625, "loss_aux_layer_6": 0.0689697265625, "loss_aux_layer_7": 0.066650390625, "loss_aux_layer_8": 0.06640625, "loss_aux_layer_9": 0.06494140625, "step": 2517, "total_loss": 0.7033350318670273 }, { "epoch": 0.49851514551573944, "grad_norm": 1.2470518350601196, "learning_rate": 5e-05, "llm_loss": 0.6332698911428452, "loss": 2.901, "loss_aux_layer_0": 0.019378662109375, "loss_aux_layer_1": 0.040283203125, "loss_aux_layer_10": 0.0675048828125, "loss_aux_layer_11": 0.0723876953125, "loss_aux_layer_12": 0.0772705078125, "loss_aux_layer_13": 0.0831298828125, "loss_aux_layer_14": 0.0919189453125, "loss_aux_layer_15": 0.1005859375, "loss_aux_layer_16": 0.1097412109375, "loss_aux_layer_17": 0.1180419921875, "loss_aux_layer_18": 0.1263427734375, "loss_aux_layer_19": 0.12890625, "loss_aux_layer_2": 0.05328369140625, "loss_aux_layer_20": 0.136474609375, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.164306640625, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.063720703125, "loss_aux_layer_4": 0.0667724609375, "loss_aux_layer_5": 0.0682373046875, "loss_aux_layer_6": 0.0711669921875, "loss_aux_layer_7": 0.0689697265625, "loss_aux_layer_8": 0.06787109375, "loss_aux_layer_9": 0.06640625, "step": 2518, "total_loss": 0.7252392023801804 }, { "epoch": 0.4987131261136409, "grad_norm": 1.0330532789230347, "learning_rate": 5e-05, "llm_loss": 0.5853817611932755, "loss": 2.71, "loss_aux_layer_0": 0.019989013671875, "loss_aux_layer_1": 0.04034423828125, "loss_aux_layer_10": 0.06787109375, "loss_aux_layer_11": 0.072509765625, "loss_aux_layer_12": 0.077392578125, "loss_aux_layer_13": 0.0826416015625, "loss_aux_layer_14": 0.0909423828125, "loss_aux_layer_15": 0.0999755859375, "loss_aux_layer_16": 0.1094970703125, "loss_aux_layer_17": 0.117431640625, "loss_aux_layer_18": 0.125732421875, "loss_aux_layer_19": 0.1287841796875, "loss_aux_layer_2": 0.0533447265625, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.14453125, "loss_aux_layer_22": 0.165771484375, "loss_aux_layer_23": 0.203125, "loss_aux_layer_3": 0.06390380859375, "loss_aux_layer_4": 0.06671142578125, "loss_aux_layer_5": 0.068359375, "loss_aux_layer_6": 0.0712890625, "loss_aux_layer_7": 0.0689697265625, "loss_aux_layer_8": 0.0682373046875, "loss_aux_layer_9": 0.0667724609375, "step": 2519, "total_loss": 0.6775044649839401 }, { "epoch": 0.49891110671154226, "grad_norm": 1.1594619750976562, "learning_rate": 5e-05, "llm_loss": 0.5618718415498734, "loss": 2.6105, "loss_aux_layer_0": 0.0189208984375, "loss_aux_layer_1": 0.0377197265625, "loss_aux_layer_10": 0.0648193359375, "loss_aux_layer_11": 0.0693359375, "loss_aux_layer_12": 0.074462890625, "loss_aux_layer_13": 0.080810546875, "loss_aux_layer_14": 0.090576171875, "loss_aux_layer_15": 0.100341796875, "loss_aux_layer_16": 0.11083984375, "loss_aux_layer_17": 0.11865234375, "loss_aux_layer_18": 0.1279296875, "loss_aux_layer_19": 0.131591796875, "loss_aux_layer_2": 0.05059814453125, "loss_aux_layer_20": 0.13916015625, "loss_aux_layer_21": 0.146484375, "loss_aux_layer_22": 0.165283203125, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.05999755859375, "loss_aux_layer_4": 0.062744140625, "loss_aux_layer_5": 0.0643310546875, "loss_aux_layer_6": 0.06707763671875, "loss_aux_layer_7": 0.06500244140625, "loss_aux_layer_8": 0.0645751953125, "loss_aux_layer_9": 0.06365966796875, "step": 2520, "total_loss": 0.6526365131139755 }, { "epoch": 0.4991090873094437, "grad_norm": 0.8279119729995728, "learning_rate": 5e-05, "llm_loss": 0.6073795557022095, "loss": 2.7942, "loss_aux_layer_0": 0.019073486328125, "loss_aux_layer_1": 0.038330078125, "loss_aux_layer_10": 0.06610107421875, "loss_aux_layer_11": 0.0704345703125, "loss_aux_layer_12": 0.0758056640625, "loss_aux_layer_13": 0.082275390625, "loss_aux_layer_14": 0.09130859375, "loss_aux_layer_15": 0.100341796875, "loss_aux_layer_16": 0.1099853515625, "loss_aux_layer_17": 0.1177978515625, "loss_aux_layer_18": 0.12646484375, "loss_aux_layer_19": 0.12890625, "loss_aux_layer_2": 0.05108642578125, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.145263671875, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.2041015625, "loss_aux_layer_3": 0.06109619140625, "loss_aux_layer_4": 0.06390380859375, "loss_aux_layer_5": 0.0657958984375, "loss_aux_layer_6": 0.06903076171875, "loss_aux_layer_7": 0.0665283203125, "loss_aux_layer_8": 0.06585693359375, "loss_aux_layer_9": 0.06494140625, "step": 2521, "total_loss": 0.6985446065664291 }, { "epoch": 0.4993070679073451, "grad_norm": 1.0542771816253662, "learning_rate": 5e-05, "llm_loss": 0.6032026559114456, "loss": 2.7861, "loss_aux_layer_0": 0.02008056640625, "loss_aux_layer_1": 0.0391845703125, "loss_aux_layer_10": 0.06817626953125, "loss_aux_layer_11": 0.07269287109375, "loss_aux_layer_12": 0.077880859375, "loss_aux_layer_13": 0.084228515625, "loss_aux_layer_14": 0.093505859375, "loss_aux_layer_15": 0.1024169921875, "loss_aux_layer_16": 0.1124267578125, "loss_aux_layer_17": 0.1202392578125, "loss_aux_layer_18": 0.1287841796875, "loss_aux_layer_19": 0.1318359375, "loss_aux_layer_2": 0.052490234375, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.17041015625, "loss_aux_layer_23": 0.209716796875, "loss_aux_layer_3": 0.06280517578125, "loss_aux_layer_4": 0.06561279296875, "loss_aux_layer_5": 0.06732177734375, "loss_aux_layer_6": 0.07025146484375, "loss_aux_layer_7": 0.0679931640625, "loss_aux_layer_8": 0.06732177734375, "loss_aux_layer_9": 0.06671142578125, "step": 2522, "total_loss": 0.6965371519327164 }, { "epoch": 0.49950504850524646, "grad_norm": 0.8045823574066162, "learning_rate": 5e-05, "llm_loss": 0.6248120665550232, "loss": 2.8761, "loss_aux_layer_0": 0.018829345703125, "loss_aux_layer_1": 0.04034423828125, "loss_aux_layer_10": 0.06982421875, "loss_aux_layer_11": 0.0740966796875, "loss_aux_layer_12": 0.0792236328125, "loss_aux_layer_13": 0.0853271484375, "loss_aux_layer_14": 0.093994140625, "loss_aux_layer_15": 0.102783203125, "loss_aux_layer_16": 0.1123046875, "loss_aux_layer_17": 0.119873046875, "loss_aux_layer_18": 0.1285400390625, "loss_aux_layer_19": 0.132080078125, "loss_aux_layer_2": 0.05413818359375, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.1484375, "loss_aux_layer_22": 0.170166015625, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.0650634765625, "loss_aux_layer_4": 0.068115234375, "loss_aux_layer_5": 0.0697021484375, "loss_aux_layer_6": 0.0728759765625, "loss_aux_layer_7": 0.0704345703125, "loss_aux_layer_8": 0.0697021484375, "loss_aux_layer_9": 0.0684814453125, "step": 2523, "total_loss": 0.7190352380275726 }, { "epoch": 0.4997030291031479, "grad_norm": 1.0217645168304443, "learning_rate": 5e-05, "llm_loss": 0.5425020977854729, "loss": 2.5371, "loss_aux_layer_0": 0.018951416015625, "loss_aux_layer_1": 0.037841796875, "loss_aux_layer_10": 0.065673828125, "loss_aux_layer_11": 0.06982421875, "loss_aux_layer_12": 0.0748291015625, "loss_aux_layer_13": 0.0811767578125, "loss_aux_layer_14": 0.0909423828125, "loss_aux_layer_15": 0.1009521484375, "loss_aux_layer_16": 0.111083984375, "loss_aux_layer_17": 0.118896484375, "loss_aux_layer_18": 0.128662109375, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.05084228515625, "loss_aux_layer_20": 0.140869140625, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.207275390625, "loss_aux_layer_3": 0.0609130859375, "loss_aux_layer_4": 0.06365966796875, "loss_aux_layer_5": 0.0655517578125, "loss_aux_layer_6": 0.0684814453125, "loss_aux_layer_7": 0.066162109375, "loss_aux_layer_8": 0.0655517578125, "loss_aux_layer_9": 0.0645751953125, "step": 2524, "total_loss": 0.6342775076627731 }, { "epoch": 0.4999010097010493, "grad_norm": 0.9469785690307617, "learning_rate": 5e-05, "llm_loss": 0.5393990725278854, "loss": 2.519, "loss_aux_layer_0": 0.0191650390625, "loss_aux_layer_1": 0.03802490234375, "loss_aux_layer_10": 0.06536865234375, "loss_aux_layer_11": 0.0697021484375, "loss_aux_layer_12": 0.074462890625, "loss_aux_layer_13": 0.0804443359375, "loss_aux_layer_14": 0.089599609375, "loss_aux_layer_15": 0.0986328125, "loss_aux_layer_16": 0.10888671875, "loss_aux_layer_17": 0.11669921875, "loss_aux_layer_18": 0.12548828125, "loss_aux_layer_19": 0.1290283203125, "loss_aux_layer_2": 0.05072021484375, "loss_aux_layer_20": 0.136962890625, "loss_aux_layer_21": 0.144287109375, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.20361328125, "loss_aux_layer_3": 0.06072998046875, "loss_aux_layer_4": 0.06341552734375, "loss_aux_layer_5": 0.06488037109375, "loss_aux_layer_6": 0.068115234375, "loss_aux_layer_7": 0.06591796875, "loss_aux_layer_8": 0.06536865234375, "loss_aux_layer_9": 0.0640869140625, "step": 2525, "total_loss": 0.6297490894794464 }, { "epoch": 0.5000989902989507, "grad_norm": 0.9158158302307129, "learning_rate": 5e-05, "llm_loss": 0.5609366819262505, "loss": 2.6041, "loss_aux_layer_0": 0.0191650390625, "loss_aux_layer_1": 0.03692626953125, "loss_aux_layer_10": 0.06494140625, "loss_aux_layer_11": 0.0694580078125, "loss_aux_layer_12": 0.0743408203125, "loss_aux_layer_13": 0.0804443359375, "loss_aux_layer_14": 0.0894775390625, "loss_aux_layer_15": 0.0987548828125, "loss_aux_layer_16": 0.108642578125, "loss_aux_layer_17": 0.1173095703125, "loss_aux_layer_18": 0.1260986328125, "loss_aux_layer_19": 0.129638671875, "loss_aux_layer_2": 0.04937744140625, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.203857421875, "loss_aux_layer_3": 0.05914306640625, "loss_aux_layer_4": 0.062255859375, "loss_aux_layer_5": 0.06402587890625, "loss_aux_layer_6": 0.06719970703125, "loss_aux_layer_7": 0.06500244140625, "loss_aux_layer_8": 0.06439208984375, "loss_aux_layer_9": 0.06329345703125, "step": 2526, "total_loss": 0.6510345935821533 }, { "epoch": 0.5002969708968521, "grad_norm": 0.8668584227561951, "learning_rate": 5e-05, "llm_loss": 0.5680950954556465, "loss": 2.6382, "loss_aux_layer_0": 0.020294189453125, "loss_aux_layer_1": 0.0389404296875, "loss_aux_layer_10": 0.0665283203125, "loss_aux_layer_11": 0.0709228515625, "loss_aux_layer_12": 0.0758056640625, "loss_aux_layer_13": 0.0814208984375, "loss_aux_layer_14": 0.0906982421875, "loss_aux_layer_15": 0.099853515625, "loss_aux_layer_16": 0.109619140625, "loss_aux_layer_17": 0.1171875, "loss_aux_layer_18": 0.1258544921875, "loss_aux_layer_19": 0.12939453125, "loss_aux_layer_2": 0.05267333984375, "loss_aux_layer_20": 0.13720703125, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.2041015625, "loss_aux_layer_3": 0.0626220703125, "loss_aux_layer_4": 0.06524658203125, "loss_aux_layer_5": 0.0670166015625, "loss_aux_layer_6": 0.0701904296875, "loss_aux_layer_7": 0.0675048828125, "loss_aux_layer_8": 0.06646728515625, "loss_aux_layer_9": 0.065185546875, "step": 2527, "total_loss": 0.659553274512291 }, { "epoch": 0.5004949514947535, "grad_norm": 0.9982494115829468, "learning_rate": 5e-05, "llm_loss": 0.5749529004096985, "loss": 2.6801, "loss_aux_layer_0": 0.018768310546875, "loss_aux_layer_1": 0.04071044921875, "loss_aux_layer_10": 0.0706787109375, "loss_aux_layer_11": 0.0755615234375, "loss_aux_layer_12": 0.0804443359375, "loss_aux_layer_13": 0.08642578125, "loss_aux_layer_14": 0.0955810546875, "loss_aux_layer_15": 0.104736328125, "loss_aux_layer_16": 0.1146240234375, "loss_aux_layer_17": 0.1224365234375, "loss_aux_layer_18": 0.1309814453125, "loss_aux_layer_19": 0.1337890625, "loss_aux_layer_2": 0.054443359375, "loss_aux_layer_20": 0.14111328125, "loss_aux_layer_21": 0.1494140625, "loss_aux_layer_22": 0.169921875, "loss_aux_layer_23": 0.2080078125, "loss_aux_layer_3": 0.0650634765625, "loss_aux_layer_4": 0.0677490234375, "loss_aux_layer_5": 0.06982421875, "loss_aux_layer_6": 0.07275390625, "loss_aux_layer_7": 0.0704345703125, "loss_aux_layer_8": 0.0699462890625, "loss_aux_layer_9": 0.0689697265625, "step": 2528, "total_loss": 0.670035645365715 }, { "epoch": 0.500692932092655, "grad_norm": 0.7621088624000549, "learning_rate": 5e-05, "llm_loss": 0.5247781425714493, "loss": 2.4697, "loss_aux_layer_0": 0.01898193359375, "loss_aux_layer_1": 0.03973388671875, "loss_aux_layer_10": 0.06903076171875, "loss_aux_layer_11": 0.073486328125, "loss_aux_layer_12": 0.078369140625, "loss_aux_layer_13": 0.0838623046875, "loss_aux_layer_14": 0.0926513671875, "loss_aux_layer_15": 0.1015625, "loss_aux_layer_16": 0.11083984375, "loss_aux_layer_17": 0.11865234375, "loss_aux_layer_18": 0.1268310546875, "loss_aux_layer_19": 0.1300048828125, "loss_aux_layer_2": 0.0531005859375, "loss_aux_layer_20": 0.137939453125, "loss_aux_layer_21": 0.145263671875, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.20263671875, "loss_aux_layer_3": 0.063720703125, "loss_aux_layer_4": 0.06640625, "loss_aux_layer_5": 0.068115234375, "loss_aux_layer_6": 0.0712890625, "loss_aux_layer_7": 0.0693359375, "loss_aux_layer_8": 0.0684814453125, "loss_aux_layer_9": 0.06744384765625, "step": 2529, "total_loss": 0.6174241304397583 }, { "epoch": 0.5008909126905563, "grad_norm": 1.0712164640426636, "learning_rate": 5e-05, "llm_loss": 0.5951551496982574, "loss": 2.7602, "loss_aux_layer_0": 0.01971435546875, "loss_aux_layer_1": 0.041015625, "loss_aux_layer_10": 0.0701904296875, "loss_aux_layer_11": 0.07470703125, "loss_aux_layer_12": 0.0802001953125, "loss_aux_layer_13": 0.0860595703125, "loss_aux_layer_14": 0.0960693359375, "loss_aux_layer_15": 0.10498046875, "loss_aux_layer_16": 0.1148681640625, "loss_aux_layer_17": 0.123046875, "loss_aux_layer_18": 0.131591796875, "loss_aux_layer_19": 0.134033203125, "loss_aux_layer_2": 0.05462646484375, "loss_aux_layer_20": 0.14111328125, "loss_aux_layer_21": 0.1474609375, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.205078125, "loss_aux_layer_3": 0.064697265625, "loss_aux_layer_4": 0.06787109375, "loss_aux_layer_5": 0.069580078125, "loss_aux_layer_6": 0.0728759765625, "loss_aux_layer_7": 0.07080078125, "loss_aux_layer_8": 0.06982421875, "loss_aux_layer_9": 0.068603515625, "step": 2530, "total_loss": 0.690040647983551 }, { "epoch": 0.5010888932884577, "grad_norm": 0.822674036026001, "learning_rate": 5e-05, "llm_loss": 0.5269461274147034, "loss": 2.4905, "loss_aux_layer_0": 0.018646240234375, "loss_aux_layer_1": 0.041748046875, "loss_aux_layer_10": 0.0716552734375, "loss_aux_layer_11": 0.07666015625, "loss_aux_layer_12": 0.0816650390625, "loss_aux_layer_13": 0.0872802734375, "loss_aux_layer_14": 0.0960693359375, "loss_aux_layer_15": 0.1043701171875, "loss_aux_layer_16": 0.1138916015625, "loss_aux_layer_17": 0.121337890625, "loss_aux_layer_18": 0.1298828125, "loss_aux_layer_19": 0.13232421875, "loss_aux_layer_2": 0.055908203125, "loss_aux_layer_20": 0.139404296875, "loss_aux_layer_21": 0.147216796875, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.208984375, "loss_aux_layer_3": 0.06689453125, "loss_aux_layer_4": 0.0699462890625, "loss_aux_layer_5": 0.0718994140625, "loss_aux_layer_6": 0.0753173828125, "loss_aux_layer_7": 0.0728759765625, "loss_aux_layer_8": 0.0721435546875, "loss_aux_layer_9": 0.0704345703125, "step": 2531, "total_loss": 0.6226194947957993 }, { "epoch": 0.5012868738863592, "grad_norm": 1.0395716428756714, "learning_rate": 5e-05, "llm_loss": 0.6334914714097977, "loss": 2.8882, "loss_aux_layer_0": 0.02069091796875, "loss_aux_layer_1": 0.0377197265625, "loss_aux_layer_10": 0.06414794921875, "loss_aux_layer_11": 0.06842041015625, "loss_aux_layer_12": 0.0731201171875, "loss_aux_layer_13": 0.0789794921875, "loss_aux_layer_14": 0.087890625, "loss_aux_layer_15": 0.0965576171875, "loss_aux_layer_16": 0.10595703125, "loss_aux_layer_17": 0.114501953125, "loss_aux_layer_18": 0.1226806640625, "loss_aux_layer_19": 0.125732421875, "loss_aux_layer_2": 0.050048828125, "loss_aux_layer_20": 0.133544921875, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.198974609375, "loss_aux_layer_3": 0.05963134765625, "loss_aux_layer_4": 0.06231689453125, "loss_aux_layer_5": 0.06396484375, "loss_aux_layer_6": 0.0667724609375, "loss_aux_layer_7": 0.06463623046875, "loss_aux_layer_8": 0.06402587890625, "loss_aux_layer_9": 0.06298828125, "step": 2532, "total_loss": 0.7220582515001297 }, { "epoch": 0.5014848544842605, "grad_norm": 1.0927958488464355, "learning_rate": 5e-05, "llm_loss": 0.634501650929451, "loss": 2.895, "loss_aux_layer_0": 0.018463134765625, "loss_aux_layer_1": 0.0384521484375, "loss_aux_layer_10": 0.06549072265625, "loss_aux_layer_11": 0.0697021484375, "loss_aux_layer_12": 0.0743408203125, "loss_aux_layer_13": 0.0802001953125, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.09765625, "loss_aux_layer_16": 0.107177734375, "loss_aux_layer_17": 0.1151123046875, "loss_aux_layer_18": 0.123291015625, "loss_aux_layer_19": 0.1260986328125, "loss_aux_layer_2": 0.0511474609375, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.06109619140625, "loss_aux_layer_4": 0.06396484375, "loss_aux_layer_5": 0.0657958984375, "loss_aux_layer_6": 0.0687255859375, "loss_aux_layer_7": 0.06646728515625, "loss_aux_layer_8": 0.06573486328125, "loss_aux_layer_9": 0.06439208984375, "step": 2533, "total_loss": 0.7237536460161209 }, { "epoch": 0.5016828350821619, "grad_norm": 0.7914251685142517, "learning_rate": 5e-05, "llm_loss": 0.6293892860412598, "loss": 2.8908, "loss_aux_layer_0": 0.019317626953125, "loss_aux_layer_1": 0.03936767578125, "loss_aux_layer_10": 0.0684814453125, "loss_aux_layer_11": 0.072998046875, "loss_aux_layer_12": 0.078125, "loss_aux_layer_13": 0.084228515625, "loss_aux_layer_14": 0.0941162109375, "loss_aux_layer_15": 0.103271484375, "loss_aux_layer_16": 0.1131591796875, "loss_aux_layer_17": 0.1209716796875, "loss_aux_layer_18": 0.129150390625, "loss_aux_layer_19": 0.1318359375, "loss_aux_layer_2": 0.05255126953125, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.20361328125, "loss_aux_layer_3": 0.06317138671875, "loss_aux_layer_4": 0.066162109375, "loss_aux_layer_5": 0.0679931640625, "loss_aux_layer_6": 0.0716552734375, "loss_aux_layer_7": 0.0694580078125, "loss_aux_layer_8": 0.068603515625, "loss_aux_layer_9": 0.067138671875, "step": 2534, "total_loss": 0.7227109372615814 }, { "epoch": 0.5018808156800634, "grad_norm": 1.1161049604415894, "learning_rate": 5e-05, "llm_loss": 0.6018059030175209, "loss": 2.7756, "loss_aux_layer_0": 0.02020263671875, "loss_aux_layer_1": 0.03887939453125, "loss_aux_layer_10": 0.06689453125, "loss_aux_layer_11": 0.0712890625, "loss_aux_layer_12": 0.07666015625, "loss_aux_layer_13": 0.08251953125, "loss_aux_layer_14": 0.09130859375, "loss_aux_layer_15": 0.100341796875, "loss_aux_layer_16": 0.110107421875, "loss_aux_layer_17": 0.117919921875, "loss_aux_layer_18": 0.127197265625, "loss_aux_layer_19": 0.130615234375, "loss_aux_layer_2": 0.0518798828125, "loss_aux_layer_20": 0.13818359375, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.167724609375, "loss_aux_layer_23": 0.2060546875, "loss_aux_layer_3": 0.06268310546875, "loss_aux_layer_4": 0.0654296875, "loss_aux_layer_5": 0.0673828125, "loss_aux_layer_6": 0.0703125, "loss_aux_layer_7": 0.06787109375, "loss_aux_layer_8": 0.0675048828125, "loss_aux_layer_9": 0.0657958984375, "step": 2535, "total_loss": 0.6939014047384262 }, { "epoch": 0.5020787962779648, "grad_norm": 0.739864706993103, "learning_rate": 5e-05, "llm_loss": 0.5115683674812317, "loss": 2.3985, "loss_aux_layer_0": 0.018768310546875, "loss_aux_layer_1": 0.0357666015625, "loss_aux_layer_10": 0.06231689453125, "loss_aux_layer_11": 0.066162109375, "loss_aux_layer_12": 0.0711669921875, "loss_aux_layer_13": 0.0771484375, "loss_aux_layer_14": 0.0863037109375, "loss_aux_layer_15": 0.095458984375, "loss_aux_layer_16": 0.1055908203125, "loss_aux_layer_17": 0.114013671875, "loss_aux_layer_18": 0.122802734375, "loss_aux_layer_19": 0.1273193359375, "loss_aux_layer_2": 0.04791259765625, "loss_aux_layer_20": 0.135986328125, "loss_aux_layer_21": 0.14453125, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.206298828125, "loss_aux_layer_3": 0.05731201171875, "loss_aux_layer_4": 0.05975341796875, "loss_aux_layer_5": 0.0616455078125, "loss_aux_layer_6": 0.06439208984375, "loss_aux_layer_7": 0.06243896484375, "loss_aux_layer_8": 0.062255859375, "loss_aux_layer_9": 0.061279296875, "step": 2536, "total_loss": 0.5996188968420029 }, { "epoch": 0.5022767768758661, "grad_norm": 0.9902052283287048, "learning_rate": 5e-05, "llm_loss": 0.5181446000933647, "loss": 2.4376, "loss_aux_layer_0": 0.019927978515625, "loss_aux_layer_1": 0.037841796875, "loss_aux_layer_10": 0.06597900390625, "loss_aux_layer_11": 0.0703125, "loss_aux_layer_12": 0.0751953125, "loss_aux_layer_13": 0.0810546875, "loss_aux_layer_14": 0.0904541015625, "loss_aux_layer_15": 0.0997314453125, "loss_aux_layer_16": 0.1099853515625, "loss_aux_layer_17": 0.1175537109375, "loss_aux_layer_18": 0.126708984375, "loss_aux_layer_19": 0.131103515625, "loss_aux_layer_2": 0.05078125, "loss_aux_layer_20": 0.138671875, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.206298828125, "loss_aux_layer_3": 0.06072998046875, "loss_aux_layer_4": 0.06341552734375, "loss_aux_layer_5": 0.06536865234375, "loss_aux_layer_6": 0.068359375, "loss_aux_layer_7": 0.06622314453125, "loss_aux_layer_8": 0.0655517578125, "loss_aux_layer_9": 0.064453125, "step": 2537, "total_loss": 0.6093985140323639 }, { "epoch": 0.5024747574737676, "grad_norm": 0.9099463820457458, "learning_rate": 5e-05, "llm_loss": 0.6787717193365097, "loss": 3.0802, "loss_aux_layer_0": 0.019317626953125, "loss_aux_layer_1": 0.038818359375, "loss_aux_layer_10": 0.0667724609375, "loss_aux_layer_11": 0.0712890625, "loss_aux_layer_12": 0.076416015625, "loss_aux_layer_13": 0.08251953125, "loss_aux_layer_14": 0.0919189453125, "loss_aux_layer_15": 0.1005859375, "loss_aux_layer_16": 0.110595703125, "loss_aux_layer_17": 0.1180419921875, "loss_aux_layer_18": 0.12646484375, "loss_aux_layer_19": 0.128662109375, "loss_aux_layer_2": 0.0518798828125, "loss_aux_layer_20": 0.13623046875, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.16455078125, "loss_aux_layer_23": 0.2021484375, "loss_aux_layer_3": 0.06170654296875, "loss_aux_layer_4": 0.06439208984375, "loss_aux_layer_5": 0.06622314453125, "loss_aux_layer_6": 0.0693359375, "loss_aux_layer_7": 0.0670166015625, "loss_aux_layer_8": 0.06640625, "loss_aux_layer_9": 0.0653076171875, "step": 2538, "total_loss": 0.7700470983982086 }, { "epoch": 0.502672738071669, "grad_norm": 0.9574717283248901, "learning_rate": 5e-05, "llm_loss": 0.582717590034008, "loss": 2.7, "loss_aux_layer_0": 0.020416259765625, "loss_aux_layer_1": 0.038818359375, "loss_aux_layer_10": 0.06683349609375, "loss_aux_layer_11": 0.071044921875, "loss_aux_layer_12": 0.076171875, "loss_aux_layer_13": 0.0821533203125, "loss_aux_layer_14": 0.0916748046875, "loss_aux_layer_15": 0.1009521484375, "loss_aux_layer_16": 0.1109619140625, "loss_aux_layer_17": 0.118896484375, "loss_aux_layer_18": 0.1278076171875, "loss_aux_layer_19": 0.132080078125, "loss_aux_layer_2": 0.0521240234375, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.1474609375, "loss_aux_layer_22": 0.16845703125, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.06195068359375, "loss_aux_layer_4": 0.06451416015625, "loss_aux_layer_5": 0.0662841796875, "loss_aux_layer_6": 0.0693359375, "loss_aux_layer_7": 0.06689453125, "loss_aux_layer_8": 0.0662841796875, "loss_aux_layer_9": 0.0654296875, "step": 2539, "total_loss": 0.6749958992004395 }, { "epoch": 0.5028707186695703, "grad_norm": 0.9737014174461365, "learning_rate": 5e-05, "llm_loss": 0.697643369436264, "loss": 3.1715, "loss_aux_layer_0": 0.0196533203125, "loss_aux_layer_1": 0.04052734375, "loss_aux_layer_10": 0.07037353515625, "loss_aux_layer_11": 0.0748291015625, "loss_aux_layer_12": 0.080078125, "loss_aux_layer_13": 0.086181640625, "loss_aux_layer_14": 0.095947265625, "loss_aux_layer_15": 0.1044921875, "loss_aux_layer_16": 0.1142578125, "loss_aux_layer_17": 0.123046875, "loss_aux_layer_18": 0.131591796875, "loss_aux_layer_19": 0.134765625, "loss_aux_layer_2": 0.054443359375, "loss_aux_layer_20": 0.142333984375, "loss_aux_layer_21": 0.1494140625, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.20751953125, "loss_aux_layer_3": 0.0650634765625, "loss_aux_layer_4": 0.068115234375, "loss_aux_layer_5": 0.070068359375, "loss_aux_layer_6": 0.0731201171875, "loss_aux_layer_7": 0.07073974609375, "loss_aux_layer_8": 0.070068359375, "loss_aux_layer_9": 0.06884765625, "step": 2540, "total_loss": 0.7928690612316132 }, { "epoch": 0.5030686992674718, "grad_norm": 0.8392680287361145, "learning_rate": 5e-05, "llm_loss": 0.5381799191236496, "loss": 2.5164, "loss_aux_layer_0": 0.020263671875, "loss_aux_layer_1": 0.037353515625, "loss_aux_layer_10": 0.0650634765625, "loss_aux_layer_11": 0.0693359375, "loss_aux_layer_12": 0.0743408203125, "loss_aux_layer_13": 0.080322265625, "loss_aux_layer_14": 0.089599609375, "loss_aux_layer_15": 0.0989990234375, "loss_aux_layer_16": 0.1094970703125, "loss_aux_layer_17": 0.11767578125, "loss_aux_layer_18": 0.1263427734375, "loss_aux_layer_19": 0.13037109375, "loss_aux_layer_2": 0.05072021484375, "loss_aux_layer_20": 0.138427734375, "loss_aux_layer_21": 0.14697265625, "loss_aux_layer_22": 0.169189453125, "loss_aux_layer_23": 0.20849609375, "loss_aux_layer_3": 0.06036376953125, "loss_aux_layer_4": 0.06298828125, "loss_aux_layer_5": 0.064697265625, "loss_aux_layer_6": 0.0677490234375, "loss_aux_layer_7": 0.0655517578125, "loss_aux_layer_8": 0.0645751953125, "loss_aux_layer_9": 0.063720703125, "step": 2541, "total_loss": 0.629097044467926 }, { "epoch": 0.5032666798653732, "grad_norm": 1.3076400756835938, "learning_rate": 5e-05, "llm_loss": 0.6979030072689056, "loss": 3.1563, "loss_aux_layer_0": 0.019256591796875, "loss_aux_layer_1": 0.0389404296875, "loss_aux_layer_10": 0.06695556640625, "loss_aux_layer_11": 0.0711669921875, "loss_aux_layer_12": 0.07666015625, "loss_aux_layer_13": 0.0826416015625, "loss_aux_layer_14": 0.0919189453125, "loss_aux_layer_15": 0.100830078125, "loss_aux_layer_16": 0.11083984375, "loss_aux_layer_17": 0.118896484375, "loss_aux_layer_18": 0.126708984375, "loss_aux_layer_19": 0.129150390625, "loss_aux_layer_2": 0.05206298828125, "loss_aux_layer_20": 0.1357421875, "loss_aux_layer_21": 0.142578125, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.197998046875, "loss_aux_layer_3": 0.0623779296875, "loss_aux_layer_4": 0.06536865234375, "loss_aux_layer_5": 0.0667724609375, "loss_aux_layer_6": 0.06982421875, "loss_aux_layer_7": 0.06756591796875, "loss_aux_layer_8": 0.06695556640625, "loss_aux_layer_9": 0.0655517578125, "step": 2542, "total_loss": 0.7890771627426147 }, { "epoch": 0.5034646604632746, "grad_norm": 1.4276235103607178, "learning_rate": 5e-05, "llm_loss": 0.5375558286905289, "loss": 2.5289, "loss_aux_layer_0": 0.019378662109375, "loss_aux_layer_1": 0.0396728515625, "loss_aux_layer_10": 0.0703125, "loss_aux_layer_11": 0.0745849609375, "loss_aux_layer_12": 0.0797119140625, "loss_aux_layer_13": 0.0858154296875, "loss_aux_layer_14": 0.094482421875, "loss_aux_layer_15": 0.103515625, "loss_aux_layer_16": 0.113525390625, "loss_aux_layer_17": 0.1209716796875, "loss_aux_layer_18": 0.1292724609375, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.05419921875, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.149169921875, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.2099609375, "loss_aux_layer_3": 0.0648193359375, "loss_aux_layer_4": 0.0677490234375, "loss_aux_layer_5": 0.069580078125, "loss_aux_layer_6": 0.0726318359375, "loss_aux_layer_7": 0.0701904296875, "loss_aux_layer_8": 0.0694580078125, "loss_aux_layer_9": 0.068359375, "step": 2543, "total_loss": 0.6322299391031265 }, { "epoch": 0.503662641061176, "grad_norm": 0.9165369868278503, "learning_rate": 5e-05, "llm_loss": 0.6172615885734558, "loss": 2.8518, "loss_aux_layer_0": 0.01934814453125, "loss_aux_layer_1": 0.041015625, "loss_aux_layer_10": 0.0704345703125, "loss_aux_layer_11": 0.074951171875, "loss_aux_layer_12": 0.079833984375, "loss_aux_layer_13": 0.08642578125, "loss_aux_layer_14": 0.0958251953125, "loss_aux_layer_15": 0.1051025390625, "loss_aux_layer_16": 0.115234375, "loss_aux_layer_17": 0.1234130859375, "loss_aux_layer_18": 0.1319580078125, "loss_aux_layer_19": 0.135009765625, "loss_aux_layer_2": 0.055419921875, "loss_aux_layer_20": 0.142333984375, "loss_aux_layer_21": 0.149658203125, "loss_aux_layer_22": 0.171142578125, "loss_aux_layer_23": 0.208984375, "loss_aux_layer_3": 0.06591796875, "loss_aux_layer_4": 0.06884765625, "loss_aux_layer_5": 0.07080078125, "loss_aux_layer_6": 0.07373046875, "loss_aux_layer_7": 0.07147216796875, "loss_aux_layer_8": 0.07049560546875, "loss_aux_layer_9": 0.069091796875, "step": 2544, "total_loss": 0.7129475474357605 }, { "epoch": 0.5038606216590774, "grad_norm": 0.9295819401741028, "learning_rate": 5e-05, "llm_loss": 0.5964702516794205, "loss": 2.7526, "loss_aux_layer_0": 0.019195556640625, "loss_aux_layer_1": 0.038818359375, "loss_aux_layer_10": 0.0667724609375, "loss_aux_layer_11": 0.0711669921875, "loss_aux_layer_12": 0.0758056640625, "loss_aux_layer_13": 0.08154296875, "loss_aux_layer_14": 0.090576171875, "loss_aux_layer_15": 0.099609375, "loss_aux_layer_16": 0.109619140625, "loss_aux_layer_17": 0.1173095703125, "loss_aux_layer_18": 0.12646484375, "loss_aux_layer_19": 0.1300048828125, "loss_aux_layer_2": 0.0521240234375, "loss_aux_layer_20": 0.137939453125, "loss_aux_layer_21": 0.146240234375, "loss_aux_layer_22": 0.1669921875, "loss_aux_layer_23": 0.20654296875, "loss_aux_layer_3": 0.06231689453125, "loss_aux_layer_4": 0.06500244140625, "loss_aux_layer_5": 0.0665283203125, "loss_aux_layer_6": 0.0697021484375, "loss_aux_layer_7": 0.0675048828125, "loss_aux_layer_8": 0.06689453125, "loss_aux_layer_9": 0.0654296875, "step": 2545, "total_loss": 0.6881429851055145 }, { "epoch": 0.5040586022569788, "grad_norm": 0.9746739268302917, "learning_rate": 5e-05, "llm_loss": 0.5048261731863022, "loss": 2.3954, "loss_aux_layer_0": 0.019622802734375, "loss_aux_layer_1": 0.04058837890625, "loss_aux_layer_10": 0.0693359375, "loss_aux_layer_11": 0.073974609375, "loss_aux_layer_12": 0.0791015625, "loss_aux_layer_13": 0.0853271484375, "loss_aux_layer_14": 0.094482421875, "loss_aux_layer_15": 0.103271484375, "loss_aux_layer_16": 0.1129150390625, "loss_aux_layer_17": 0.120361328125, "loss_aux_layer_18": 0.1285400390625, "loss_aux_layer_19": 0.131103515625, "loss_aux_layer_2": 0.0538330078125, "loss_aux_layer_20": 0.138916015625, "loss_aux_layer_21": 0.147705078125, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.06427001953125, "loss_aux_layer_4": 0.0672607421875, "loss_aux_layer_5": 0.069091796875, "loss_aux_layer_6": 0.072265625, "loss_aux_layer_7": 0.0701904296875, "loss_aux_layer_8": 0.0693359375, "loss_aux_layer_9": 0.068115234375, "step": 2546, "total_loss": 0.5988535135984421 }, { "epoch": 0.5042565828548802, "grad_norm": 0.8996988534927368, "learning_rate": 5e-05, "llm_loss": 0.5767863765358925, "loss": 2.6948, "loss_aux_layer_0": 0.0201416015625, "loss_aux_layer_1": 0.0421142578125, "loss_aux_layer_10": 0.0723876953125, "loss_aux_layer_11": 0.0770263671875, "loss_aux_layer_12": 0.0821533203125, "loss_aux_layer_13": 0.088134765625, "loss_aux_layer_14": 0.097412109375, "loss_aux_layer_15": 0.1063232421875, "loss_aux_layer_16": 0.11572265625, "loss_aux_layer_17": 0.123291015625, "loss_aux_layer_18": 0.131591796875, "loss_aux_layer_19": 0.1337890625, "loss_aux_layer_2": 0.0574951171875, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.150634765625, "loss_aux_layer_22": 0.17236328125, "loss_aux_layer_23": 0.211181640625, "loss_aux_layer_3": 0.0682373046875, "loss_aux_layer_4": 0.0706787109375, "loss_aux_layer_5": 0.0723876953125, "loss_aux_layer_6": 0.0755615234375, "loss_aux_layer_7": 0.07275390625, "loss_aux_layer_8": 0.0721435546875, "loss_aux_layer_9": 0.07080078125, "step": 2547, "total_loss": 0.6737095713615417 }, { "epoch": 0.5044545634527816, "grad_norm": 3.5757486820220947, "learning_rate": 5e-05, "llm_loss": 0.5405697971582413, "loss": 2.5508, "loss_aux_layer_0": 0.019989013671875, "loss_aux_layer_1": 0.04107666015625, "loss_aux_layer_10": 0.0714111328125, "loss_aux_layer_11": 0.0760498046875, "loss_aux_layer_12": 0.080810546875, "loss_aux_layer_13": 0.0867919921875, "loss_aux_layer_14": 0.0968017578125, "loss_aux_layer_15": 0.1058349609375, "loss_aux_layer_16": 0.1158447265625, "loss_aux_layer_17": 0.12353515625, "loss_aux_layer_18": 0.132568359375, "loss_aux_layer_19": 0.135986328125, "loss_aux_layer_2": 0.05645751953125, "loss_aux_layer_20": 0.1435546875, "loss_aux_layer_21": 0.151611328125, "loss_aux_layer_22": 0.175537109375, "loss_aux_layer_23": 0.215576171875, "loss_aux_layer_3": 0.0675048828125, "loss_aux_layer_4": 0.0704345703125, "loss_aux_layer_5": 0.072509765625, "loss_aux_layer_6": 0.0755615234375, "loss_aux_layer_7": 0.072998046875, "loss_aux_layer_8": 0.0721435546875, "loss_aux_layer_9": 0.0703125, "step": 2548, "total_loss": 0.6376935839653015 }, { "epoch": 0.504652544050683, "grad_norm": 2.0152149200439453, "learning_rate": 5e-05, "llm_loss": 0.559138759970665, "loss": 2.6213, "loss_aux_layer_0": 0.019683837890625, "loss_aux_layer_1": 0.04193115234375, "loss_aux_layer_10": 0.0714111328125, "loss_aux_layer_11": 0.075927734375, "loss_aux_layer_12": 0.0810546875, "loss_aux_layer_13": 0.0872802734375, "loss_aux_layer_14": 0.096435546875, "loss_aux_layer_15": 0.1053466796875, "loss_aux_layer_16": 0.1146240234375, "loss_aux_layer_17": 0.1217041015625, "loss_aux_layer_18": 0.13037109375, "loss_aux_layer_19": 0.13330078125, "loss_aux_layer_2": 0.05828857421875, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.171142578125, "loss_aux_layer_23": 0.2099609375, "loss_aux_layer_3": 0.0677490234375, "loss_aux_layer_4": 0.070556640625, "loss_aux_layer_5": 0.0721435546875, "loss_aux_layer_6": 0.0751953125, "loss_aux_layer_7": 0.0726318359375, "loss_aux_layer_8": 0.07177734375, "loss_aux_layer_9": 0.0701904296875, "step": 2549, "total_loss": 0.655320480465889 }, { "epoch": 0.5048505246485845, "grad_norm": 1.477879524230957, "learning_rate": 5e-05, "llm_loss": 0.5893491953611374, "loss": 2.7362, "loss_aux_layer_0": 0.02001953125, "loss_aux_layer_1": 0.0426025390625, "loss_aux_layer_10": 0.0709228515625, "loss_aux_layer_11": 0.0753173828125, "loss_aux_layer_12": 0.080078125, "loss_aux_layer_13": 0.085693359375, "loss_aux_layer_14": 0.094482421875, "loss_aux_layer_15": 0.10302734375, "loss_aux_layer_16": 0.112548828125, "loss_aux_layer_17": 0.1197509765625, "loss_aux_layer_18": 0.127685546875, "loss_aux_layer_19": 0.1300048828125, "loss_aux_layer_2": 0.05975341796875, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.164306640625, "loss_aux_layer_23": 0.201416015625, "loss_aux_layer_3": 0.06884765625, "loss_aux_layer_4": 0.0716552734375, "loss_aux_layer_5": 0.07275390625, "loss_aux_layer_6": 0.0755615234375, "loss_aux_layer_7": 0.0726318359375, "loss_aux_layer_8": 0.0714111328125, "loss_aux_layer_9": 0.0697021484375, "step": 2550, "total_loss": 0.6840409487485886 }, { "epoch": 0.5050485052464858, "grad_norm": 1.2285329103469849, "learning_rate": 5e-05, "llm_loss": 0.5724230855703354, "loss": 2.6694, "loss_aux_layer_0": 0.01953125, "loss_aux_layer_1": 0.0406494140625, "loss_aux_layer_10": 0.06903076171875, "loss_aux_layer_11": 0.07373046875, "loss_aux_layer_12": 0.078857421875, "loss_aux_layer_13": 0.0848388671875, "loss_aux_layer_14": 0.0941162109375, "loss_aux_layer_15": 0.103271484375, "loss_aux_layer_16": 0.1134033203125, "loss_aux_layer_17": 0.121826171875, "loss_aux_layer_18": 0.130126953125, "loss_aux_layer_19": 0.13427734375, "loss_aux_layer_2": 0.056884765625, "loss_aux_layer_20": 0.1416015625, "loss_aux_layer_21": 0.149169921875, "loss_aux_layer_22": 0.17041015625, "loss_aux_layer_23": 0.2080078125, "loss_aux_layer_3": 0.0670166015625, "loss_aux_layer_4": 0.0694580078125, "loss_aux_layer_5": 0.0706787109375, "loss_aux_layer_6": 0.0733642578125, "loss_aux_layer_7": 0.07073974609375, "loss_aux_layer_8": 0.06939697265625, "loss_aux_layer_9": 0.0677490234375, "step": 2551, "total_loss": 0.6673484593629837 }, { "epoch": 0.5052464858443872, "grad_norm": 2.0827860832214355, "learning_rate": 5e-05, "llm_loss": 0.6051285415887833, "loss": 2.7931, "loss_aux_layer_0": 0.020538330078125, "loss_aux_layer_1": 0.04119873046875, "loss_aux_layer_10": 0.0682373046875, "loss_aux_layer_11": 0.0726318359375, "loss_aux_layer_12": 0.0770263671875, "loss_aux_layer_13": 0.0836181640625, "loss_aux_layer_14": 0.092529296875, "loss_aux_layer_15": 0.1021728515625, "loss_aux_layer_16": 0.11181640625, "loss_aux_layer_17": 0.1199951171875, "loss_aux_layer_18": 0.1290283203125, "loss_aux_layer_19": 0.13232421875, "loss_aux_layer_2": 0.05499267578125, "loss_aux_layer_20": 0.139404296875, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.0643310546875, "loss_aux_layer_4": 0.06689453125, "loss_aux_layer_5": 0.0684814453125, "loss_aux_layer_6": 0.070556640625, "loss_aux_layer_7": 0.068359375, "loss_aux_layer_8": 0.067626953125, "loss_aux_layer_9": 0.066650390625, "step": 2552, "total_loss": 0.6982868015766144 }, { "epoch": 0.5054444664422887, "grad_norm": 1.4629848003387451, "learning_rate": 5e-05, "llm_loss": 0.6561789512634277, "loss": 3.0142, "loss_aux_layer_0": 0.02105712890625, "loss_aux_layer_1": 0.04266357421875, "loss_aux_layer_10": 0.0716552734375, "loss_aux_layer_11": 0.0760498046875, "loss_aux_layer_12": 0.08154296875, "loss_aux_layer_13": 0.0877685546875, "loss_aux_layer_14": 0.0972900390625, "loss_aux_layer_15": 0.1070556640625, "loss_aux_layer_16": 0.1175537109375, "loss_aux_layer_17": 0.1258544921875, "loss_aux_layer_18": 0.134765625, "loss_aux_layer_19": 0.137451171875, "loss_aux_layer_2": 0.0579833984375, "loss_aux_layer_20": 0.14453125, "loss_aux_layer_21": 0.15185546875, "loss_aux_layer_22": 0.17333984375, "loss_aux_layer_23": 0.21142578125, "loss_aux_layer_3": 0.067626953125, "loss_aux_layer_4": 0.0701904296875, "loss_aux_layer_5": 0.071533203125, "loss_aux_layer_6": 0.0743408203125, "loss_aux_layer_7": 0.0718994140625, "loss_aux_layer_8": 0.0714111328125, "loss_aux_layer_9": 0.0699462890625, "step": 2553, "total_loss": 0.7535517811775208 }, { "epoch": 0.50564244704019, "grad_norm": 1.5928449630737305, "learning_rate": 5e-05, "llm_loss": 0.5469755828380585, "loss": 2.5614, "loss_aux_layer_0": 0.020965576171875, "loss_aux_layer_1": 0.0406494140625, "loss_aux_layer_10": 0.0672607421875, "loss_aux_layer_11": 0.0711669921875, "loss_aux_layer_12": 0.076171875, "loss_aux_layer_13": 0.0823974609375, "loss_aux_layer_14": 0.0924072265625, "loss_aux_layer_15": 0.1021728515625, "loss_aux_layer_16": 0.1124267578125, "loss_aux_layer_17": 0.1207275390625, "loss_aux_layer_18": 0.1290283203125, "loss_aux_layer_19": 0.132080078125, "loss_aux_layer_2": 0.055908203125, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.14697265625, "loss_aux_layer_22": 0.16748046875, "loss_aux_layer_23": 0.2060546875, "loss_aux_layer_3": 0.0657958984375, "loss_aux_layer_4": 0.0684814453125, "loss_aux_layer_5": 0.0693359375, "loss_aux_layer_6": 0.071533203125, "loss_aux_layer_7": 0.068603515625, "loss_aux_layer_8": 0.067626953125, "loss_aux_layer_9": 0.0662841796875, "step": 2554, "total_loss": 0.6403552442789078 }, { "epoch": 0.5058404276380914, "grad_norm": 2.5850508213043213, "learning_rate": 5e-05, "llm_loss": 0.6058389246463776, "loss": 2.8184, "loss_aux_layer_0": 0.0208740234375, "loss_aux_layer_1": 0.04388427734375, "loss_aux_layer_10": 0.073486328125, "loss_aux_layer_11": 0.078369140625, "loss_aux_layer_12": 0.0833740234375, "loss_aux_layer_13": 0.089599609375, "loss_aux_layer_14": 0.0987548828125, "loss_aux_layer_15": 0.10791015625, "loss_aux_layer_16": 0.117431640625, "loss_aux_layer_17": 0.12451171875, "loss_aux_layer_18": 0.1331787109375, "loss_aux_layer_19": 0.13525390625, "loss_aux_layer_2": 0.05987548828125, "loss_aux_layer_20": 0.142822265625, "loss_aux_layer_21": 0.151611328125, "loss_aux_layer_22": 0.173828125, "loss_aux_layer_23": 0.212890625, "loss_aux_layer_3": 0.0711669921875, "loss_aux_layer_4": 0.074462890625, "loss_aux_layer_5": 0.07666015625, "loss_aux_layer_6": 0.078369140625, "loss_aux_layer_7": 0.0753173828125, "loss_aux_layer_8": 0.073974609375, "loss_aux_layer_9": 0.072265625, "step": 2555, "total_loss": 0.704601913690567 }, { "epoch": 0.5060384082359929, "grad_norm": 3.2007436752319336, "learning_rate": 5e-05, "llm_loss": 0.5447712019085884, "loss": 2.5601, "loss_aux_layer_0": 0.020721435546875, "loss_aux_layer_1": 0.042236328125, "loss_aux_layer_10": 0.070556640625, "loss_aux_layer_11": 0.0745849609375, "loss_aux_layer_12": 0.07958984375, "loss_aux_layer_13": 0.0855712890625, "loss_aux_layer_14": 0.0947265625, "loss_aux_layer_15": 0.104248046875, "loss_aux_layer_16": 0.1138916015625, "loss_aux_layer_17": 0.1217041015625, "loss_aux_layer_18": 0.1300048828125, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.05908203125, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.16650390625, "loss_aux_layer_23": 0.2041015625, "loss_aux_layer_3": 0.0684814453125, "loss_aux_layer_4": 0.0716552734375, "loss_aux_layer_5": 0.0726318359375, "loss_aux_layer_6": 0.0750732421875, "loss_aux_layer_7": 0.0716552734375, "loss_aux_layer_8": 0.070556640625, "loss_aux_layer_9": 0.0692138671875, "step": 2556, "total_loss": 0.6400202214717865 }, { "epoch": 0.5062363888338943, "grad_norm": 1.9021612405776978, "learning_rate": 5e-05, "llm_loss": 0.5826305896043777, "loss": 2.6979, "loss_aux_layer_0": 0.0218505859375, "loss_aux_layer_1": 0.040283203125, "loss_aux_layer_10": 0.0662841796875, "loss_aux_layer_11": 0.0704345703125, "loss_aux_layer_12": 0.0753173828125, "loss_aux_layer_13": 0.081787109375, "loss_aux_layer_14": 0.0911865234375, "loss_aux_layer_15": 0.1007080078125, "loss_aux_layer_16": 0.11083984375, "loss_aux_layer_17": 0.119140625, "loss_aux_layer_18": 0.12744140625, "loss_aux_layer_19": 0.13037109375, "loss_aux_layer_2": 0.0545654296875, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.164794921875, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.06378173828125, "loss_aux_layer_4": 0.0657958984375, "loss_aux_layer_5": 0.067138671875, "loss_aux_layer_6": 0.0697021484375, "loss_aux_layer_7": 0.0672607421875, "loss_aux_layer_8": 0.066650390625, "loss_aux_layer_9": 0.06536865234375, "step": 2557, "total_loss": 0.6744805574417114 }, { "epoch": 0.5064343694317957, "grad_norm": 2.4792442321777344, "learning_rate": 5e-05, "llm_loss": 0.7092114984989166, "loss": 3.2182, "loss_aux_layer_0": 0.020965576171875, "loss_aux_layer_1": 0.04388427734375, "loss_aux_layer_10": 0.07061767578125, "loss_aux_layer_11": 0.074951171875, "loss_aux_layer_12": 0.0797119140625, "loss_aux_layer_13": 0.0853271484375, "loss_aux_layer_14": 0.09423828125, "loss_aux_layer_15": 0.102783203125, "loss_aux_layer_16": 0.1124267578125, "loss_aux_layer_17": 0.120361328125, "loss_aux_layer_18": 0.12841796875, "loss_aux_layer_19": 0.13134765625, "loss_aux_layer_2": 0.06268310546875, "loss_aux_layer_20": 0.138916015625, "loss_aux_layer_21": 0.1455078125, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.203125, "loss_aux_layer_3": 0.07073974609375, "loss_aux_layer_4": 0.072509765625, "loss_aux_layer_5": 0.0732421875, "loss_aux_layer_6": 0.07568359375, "loss_aux_layer_7": 0.072998046875, "loss_aux_layer_8": 0.07177734375, "loss_aux_layer_9": 0.0697021484375, "step": 2558, "total_loss": 0.8045492023229599 }, { "epoch": 0.5066323500296971, "grad_norm": 1.530514121055603, "learning_rate": 5e-05, "llm_loss": 0.5921317934989929, "loss": 2.7501, "loss_aux_layer_0": 0.02239990234375, "loss_aux_layer_1": 0.04193115234375, "loss_aux_layer_10": 0.06982421875, "loss_aux_layer_11": 0.073974609375, "loss_aux_layer_12": 0.0799560546875, "loss_aux_layer_13": 0.0865478515625, "loss_aux_layer_14": 0.0955810546875, "loss_aux_layer_15": 0.1044921875, "loss_aux_layer_16": 0.1138916015625, "loss_aux_layer_17": 0.1217041015625, "loss_aux_layer_18": 0.1298828125, "loss_aux_layer_19": 0.13232421875, "loss_aux_layer_2": 0.0579833984375, "loss_aux_layer_20": 0.140380859375, "loss_aux_layer_21": 0.1484375, "loss_aux_layer_22": 0.16943359375, "loss_aux_layer_23": 0.208740234375, "loss_aux_layer_3": 0.0672607421875, "loss_aux_layer_4": 0.0701904296875, "loss_aux_layer_5": 0.0711669921875, "loss_aux_layer_6": 0.0738525390625, "loss_aux_layer_7": 0.07080078125, "loss_aux_layer_8": 0.070068359375, "loss_aux_layer_9": 0.0687255859375, "step": 2559, "total_loss": 0.6875162422657013 }, { "epoch": 0.5068303306275985, "grad_norm": 1.5325132608413696, "learning_rate": 5e-05, "llm_loss": 0.6295140013098717, "loss": 2.9032, "loss_aux_layer_0": 0.022796630859375, "loss_aux_layer_1": 0.04278564453125, "loss_aux_layer_10": 0.0712890625, "loss_aux_layer_11": 0.0760498046875, "loss_aux_layer_12": 0.0810546875, "loss_aux_layer_13": 0.0870361328125, "loss_aux_layer_14": 0.0963134765625, "loss_aux_layer_15": 0.1055908203125, "loss_aux_layer_16": 0.11572265625, "loss_aux_layer_17": 0.123046875, "loss_aux_layer_18": 0.130859375, "loss_aux_layer_19": 0.1328125, "loss_aux_layer_2": 0.05902099609375, "loss_aux_layer_20": 0.14013671875, "loss_aux_layer_21": 0.147216796875, "loss_aux_layer_22": 0.168701171875, "loss_aux_layer_23": 0.20556640625, "loss_aux_layer_3": 0.0692138671875, "loss_aux_layer_4": 0.0718994140625, "loss_aux_layer_5": 0.0731201171875, "loss_aux_layer_6": 0.0758056640625, "loss_aux_layer_7": 0.0728759765625, "loss_aux_layer_8": 0.07177734375, "loss_aux_layer_9": 0.070068359375, "step": 2560, "total_loss": 0.7258115857839584 }, { "epoch": 0.5070283112254999, "grad_norm": 1.3335820436477661, "learning_rate": 5e-05, "llm_loss": 0.6467092335224152, "loss": 2.9672, "loss_aux_layer_0": 0.021942138671875, "loss_aux_layer_1": 0.0428466796875, "loss_aux_layer_10": 0.070068359375, "loss_aux_layer_11": 0.074951171875, "loss_aux_layer_12": 0.07958984375, "loss_aux_layer_13": 0.085693359375, "loss_aux_layer_14": 0.0955810546875, "loss_aux_layer_15": 0.105224609375, "loss_aux_layer_16": 0.1151123046875, "loss_aux_layer_17": 0.122802734375, "loss_aux_layer_18": 0.130615234375, "loss_aux_layer_19": 0.13330078125, "loss_aux_layer_2": 0.05633544921875, "loss_aux_layer_20": 0.140380859375, "loss_aux_layer_21": 0.14697265625, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.203857421875, "loss_aux_layer_3": 0.0665283203125, "loss_aux_layer_4": 0.0692138671875, "loss_aux_layer_5": 0.0704345703125, "loss_aux_layer_6": 0.07373046875, "loss_aux_layer_7": 0.071044921875, "loss_aux_layer_8": 0.0701904296875, "loss_aux_layer_9": 0.068603515625, "step": 2561, "total_loss": 0.7418062835931778 }, { "epoch": 0.5072262918234013, "grad_norm": 1.373609185218811, "learning_rate": 5e-05, "llm_loss": 0.6006008163094521, "loss": 2.774, "loss_aux_layer_0": 0.021820068359375, "loss_aux_layer_1": 0.0399169921875, "loss_aux_layer_10": 0.06658935546875, "loss_aux_layer_11": 0.07080078125, "loss_aux_layer_12": 0.0760498046875, "loss_aux_layer_13": 0.0826416015625, "loss_aux_layer_14": 0.0919189453125, "loss_aux_layer_15": 0.10205078125, "loss_aux_layer_16": 0.112548828125, "loss_aux_layer_17": 0.120361328125, "loss_aux_layer_18": 0.129638671875, "loss_aux_layer_19": 0.133056640625, "loss_aux_layer_2": 0.05389404296875, "loss_aux_layer_20": 0.140380859375, "loss_aux_layer_21": 0.1484375, "loss_aux_layer_22": 0.169189453125, "loss_aux_layer_23": 0.208251953125, "loss_aux_layer_3": 0.06304931640625, "loss_aux_layer_4": 0.0655517578125, "loss_aux_layer_5": 0.066650390625, "loss_aux_layer_6": 0.069091796875, "loss_aux_layer_7": 0.06640625, "loss_aux_layer_8": 0.0660400390625, "loss_aux_layer_9": 0.06488037109375, "step": 2562, "total_loss": 0.6934893280267715 }, { "epoch": 0.5074242724213027, "grad_norm": 0.9507111310958862, "learning_rate": 5e-05, "llm_loss": 0.685228243470192, "loss": 3.1265, "loss_aux_layer_0": 0.02142333984375, "loss_aux_layer_1": 0.0413818359375, "loss_aux_layer_10": 0.0712890625, "loss_aux_layer_11": 0.0758056640625, "loss_aux_layer_12": 0.08056640625, "loss_aux_layer_13": 0.0867919921875, "loss_aux_layer_14": 0.0965576171875, "loss_aux_layer_15": 0.1058349609375, "loss_aux_layer_16": 0.1160888671875, "loss_aux_layer_17": 0.123291015625, "loss_aux_layer_18": 0.132080078125, "loss_aux_layer_19": 0.134521484375, "loss_aux_layer_2": 0.05718994140625, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.149658203125, "loss_aux_layer_22": 0.17041015625, "loss_aux_layer_23": 0.209228515625, "loss_aux_layer_3": 0.067626953125, "loss_aux_layer_4": 0.07080078125, "loss_aux_layer_5": 0.072021484375, "loss_aux_layer_6": 0.075439453125, "loss_aux_layer_7": 0.0726318359375, "loss_aux_layer_8": 0.0714111328125, "loss_aux_layer_9": 0.06982421875, "step": 2563, "total_loss": 0.7816219925880432 }, { "epoch": 0.5076222530192042, "grad_norm": 1.1297303438186646, "learning_rate": 5e-05, "llm_loss": 0.6448443308472633, "loss": 2.9379, "loss_aux_layer_0": 0.021026611328125, "loss_aux_layer_1": 0.038330078125, "loss_aux_layer_10": 0.065185546875, "loss_aux_layer_11": 0.0689697265625, "loss_aux_layer_12": 0.073974609375, "loss_aux_layer_13": 0.0799560546875, "loss_aux_layer_14": 0.089599609375, "loss_aux_layer_15": 0.0989990234375, "loss_aux_layer_16": 0.1085205078125, "loss_aux_layer_17": 0.1165771484375, "loss_aux_layer_18": 0.125, "loss_aux_layer_19": 0.127685546875, "loss_aux_layer_2": 0.0504150390625, "loss_aux_layer_20": 0.1357421875, "loss_aux_layer_21": 0.142822265625, "loss_aux_layer_22": 0.162109375, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.0599365234375, "loss_aux_layer_4": 0.062744140625, "loss_aux_layer_5": 0.06451416015625, "loss_aux_layer_6": 0.0677490234375, "loss_aux_layer_7": 0.06524658203125, "loss_aux_layer_8": 0.0645751953125, "loss_aux_layer_9": 0.0638427734375, "step": 2564, "total_loss": 0.7344836741685867 }, { "epoch": 0.5078202336171055, "grad_norm": 0.9631912112236023, "learning_rate": 5e-05, "llm_loss": 0.6088298112154007, "loss": 2.8139, "loss_aux_layer_0": 0.021270751953125, "loss_aux_layer_1": 0.04351806640625, "loss_aux_layer_10": 0.0701904296875, "loss_aux_layer_11": 0.074462890625, "loss_aux_layer_12": 0.079345703125, "loss_aux_layer_13": 0.0853271484375, "loss_aux_layer_14": 0.09423828125, "loss_aux_layer_15": 0.10302734375, "loss_aux_layer_16": 0.112548828125, "loss_aux_layer_17": 0.119384765625, "loss_aux_layer_18": 0.127685546875, "loss_aux_layer_19": 0.130126953125, "loss_aux_layer_2": 0.0567626953125, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.16748046875, "loss_aux_layer_23": 0.2060546875, "loss_aux_layer_3": 0.0677490234375, "loss_aux_layer_4": 0.070556640625, "loss_aux_layer_5": 0.072021484375, "loss_aux_layer_6": 0.074951171875, "loss_aux_layer_7": 0.072265625, "loss_aux_layer_8": 0.0711669921875, "loss_aux_layer_9": 0.0693359375, "step": 2565, "total_loss": 0.7034869343042374 }, { "epoch": 0.5080182142150069, "grad_norm": 1.1965951919555664, "learning_rate": 5e-05, "llm_loss": 0.5533863306045532, "loss": 2.5903, "loss_aux_layer_0": 0.022064208984375, "loss_aux_layer_1": 0.04156494140625, "loss_aux_layer_10": 0.06915283203125, "loss_aux_layer_11": 0.0736083984375, "loss_aux_layer_12": 0.0784912109375, "loss_aux_layer_13": 0.084716796875, "loss_aux_layer_14": 0.093994140625, "loss_aux_layer_15": 0.1029052734375, "loss_aux_layer_16": 0.1126708984375, "loss_aux_layer_17": 0.1204833984375, "loss_aux_layer_18": 0.129150390625, "loss_aux_layer_19": 0.1319580078125, "loss_aux_layer_2": 0.05609130859375, "loss_aux_layer_20": 0.13916015625, "loss_aux_layer_21": 0.146484375, "loss_aux_layer_22": 0.168701171875, "loss_aux_layer_23": 0.207275390625, "loss_aux_layer_3": 0.0655517578125, "loss_aux_layer_4": 0.06817626953125, "loss_aux_layer_5": 0.069580078125, "loss_aux_layer_6": 0.07220458984375, "loss_aux_layer_7": 0.0697021484375, "loss_aux_layer_8": 0.0689697265625, "loss_aux_layer_9": 0.067626953125, "step": 2566, "total_loss": 0.6475792974233627 }, { "epoch": 0.5082161948129084, "grad_norm": 1.0463017225265503, "learning_rate": 5e-05, "llm_loss": 0.5244537442922592, "loss": 2.482, "loss_aux_layer_0": 0.02044677734375, "loss_aux_layer_1": 0.043212890625, "loss_aux_layer_10": 0.0716552734375, "loss_aux_layer_11": 0.0760498046875, "loss_aux_layer_12": 0.080810546875, "loss_aux_layer_13": 0.0869140625, "loss_aux_layer_14": 0.095458984375, "loss_aux_layer_15": 0.1041259765625, "loss_aux_layer_16": 0.114013671875, "loss_aux_layer_17": 0.1209716796875, "loss_aux_layer_18": 0.129638671875, "loss_aux_layer_19": 0.132080078125, "loss_aux_layer_2": 0.0582275390625, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.1484375, "loss_aux_layer_22": 0.17041015625, "loss_aux_layer_23": 0.209228515625, "loss_aux_layer_3": 0.0687255859375, "loss_aux_layer_4": 0.0712890625, "loss_aux_layer_5": 0.07275390625, "loss_aux_layer_6": 0.0753173828125, "loss_aux_layer_7": 0.07275390625, "loss_aux_layer_8": 0.0718994140625, "loss_aux_layer_9": 0.0704345703125, "step": 2567, "total_loss": 0.6204952672123909 }, { "epoch": 0.5084141754108097, "grad_norm": 1.2412225008010864, "learning_rate": 5e-05, "llm_loss": 0.6097474917769432, "loss": 2.8153, "loss_aux_layer_0": 0.0218505859375, "loss_aux_layer_1": 0.0416259765625, "loss_aux_layer_10": 0.068359375, "loss_aux_layer_11": 0.07275390625, "loss_aux_layer_12": 0.078125, "loss_aux_layer_13": 0.083984375, "loss_aux_layer_14": 0.09326171875, "loss_aux_layer_15": 0.1025390625, "loss_aux_layer_16": 0.112548828125, "loss_aux_layer_17": 0.120849609375, "loss_aux_layer_18": 0.129150390625, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.05511474609375, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.148193359375, "loss_aux_layer_22": 0.16943359375, "loss_aux_layer_23": 0.20849609375, "loss_aux_layer_3": 0.06524658203125, "loss_aux_layer_4": 0.0677490234375, "loss_aux_layer_5": 0.0692138671875, "loss_aux_layer_6": 0.07177734375, "loss_aux_layer_7": 0.0697021484375, "loss_aux_layer_8": 0.068603515625, "loss_aux_layer_9": 0.0670166015625, "step": 2568, "total_loss": 0.7038335204124451 }, { "epoch": 0.5086121560087111, "grad_norm": 1.4231593608856201, "learning_rate": 5e-05, "llm_loss": 0.6131703406572342, "loss": 2.8217, "loss_aux_layer_0": 0.019989013671875, "loss_aux_layer_1": 0.039306640625, "loss_aux_layer_10": 0.06793212890625, "loss_aux_layer_11": 0.072021484375, "loss_aux_layer_12": 0.07666015625, "loss_aux_layer_13": 0.08251953125, "loss_aux_layer_14": 0.091796875, "loss_aux_layer_15": 0.1007080078125, "loss_aux_layer_16": 0.1104736328125, "loss_aux_layer_17": 0.1181640625, "loss_aux_layer_18": 0.1268310546875, "loss_aux_layer_19": 0.130859375, "loss_aux_layer_2": 0.052978515625, "loss_aux_layer_20": 0.138427734375, "loss_aux_layer_21": 0.1455078125, "loss_aux_layer_22": 0.165771484375, "loss_aux_layer_23": 0.2041015625, "loss_aux_layer_3": 0.06341552734375, "loss_aux_layer_4": 0.06591796875, "loss_aux_layer_5": 0.0675048828125, "loss_aux_layer_6": 0.0703125, "loss_aux_layer_7": 0.06829833984375, "loss_aux_layer_8": 0.0677490234375, "loss_aux_layer_9": 0.06646728515625, "step": 2569, "total_loss": 0.7054242044687271 }, { "epoch": 0.5088101366066126, "grad_norm": 1.2737605571746826, "learning_rate": 5e-05, "llm_loss": 0.6183915287256241, "loss": 2.855, "loss_aux_layer_0": 0.022552490234375, "loss_aux_layer_1": 0.04193115234375, "loss_aux_layer_10": 0.06982421875, "loss_aux_layer_11": 0.07470703125, "loss_aux_layer_12": 0.079833984375, "loss_aux_layer_13": 0.085693359375, "loss_aux_layer_14": 0.09521484375, "loss_aux_layer_15": 0.1044921875, "loss_aux_layer_16": 0.1142578125, "loss_aux_layer_17": 0.1219482421875, "loss_aux_layer_18": 0.130615234375, "loss_aux_layer_19": 0.133544921875, "loss_aux_layer_2": 0.05621337890625, "loss_aux_layer_20": 0.140869140625, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.170166015625, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.066650390625, "loss_aux_layer_4": 0.0697021484375, "loss_aux_layer_5": 0.0711669921875, "loss_aux_layer_6": 0.073974609375, "loss_aux_layer_7": 0.0709228515625, "loss_aux_layer_8": 0.070068359375, "loss_aux_layer_9": 0.068603515625, "step": 2570, "total_loss": 0.713761180639267 }, { "epoch": 0.509008117204514, "grad_norm": 0.9194032549858093, "learning_rate": 5e-05, "llm_loss": 0.5632361099123955, "loss": 2.6064, "loss_aux_layer_0": 0.020263671875, "loss_aux_layer_1": 0.03802490234375, "loss_aux_layer_10": 0.06341552734375, "loss_aux_layer_11": 0.0675048828125, "loss_aux_layer_12": 0.072265625, "loss_aux_layer_13": 0.0780029296875, "loss_aux_layer_14": 0.086669921875, "loss_aux_layer_15": 0.0955810546875, "loss_aux_layer_16": 0.105224609375, "loss_aux_layer_17": 0.1131591796875, "loss_aux_layer_18": 0.1217041015625, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.05072021484375, "loss_aux_layer_20": 0.13330078125, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.0604248046875, "loss_aux_layer_4": 0.06317138671875, "loss_aux_layer_5": 0.0645751953125, "loss_aux_layer_6": 0.06689453125, "loss_aux_layer_7": 0.0643310546875, "loss_aux_layer_8": 0.063720703125, "loss_aux_layer_9": 0.0623779296875, "step": 2571, "total_loss": 0.6516119837760925 }, { "epoch": 0.5092060978024153, "grad_norm": 1.1891155242919922, "learning_rate": 5e-05, "llm_loss": 0.6381721273064613, "loss": 2.9303, "loss_aux_layer_0": 0.01922607421875, "loss_aux_layer_1": 0.0400390625, "loss_aux_layer_10": 0.06964111328125, "loss_aux_layer_11": 0.07421875, "loss_aux_layer_12": 0.0797119140625, "loss_aux_layer_13": 0.086181640625, "loss_aux_layer_14": 0.0955810546875, "loss_aux_layer_15": 0.10498046875, "loss_aux_layer_16": 0.115234375, "loss_aux_layer_17": 0.1229248046875, "loss_aux_layer_18": 0.130859375, "loss_aux_layer_19": 0.13330078125, "loss_aux_layer_2": 0.0537109375, "loss_aux_layer_20": 0.14013671875, "loss_aux_layer_21": 0.14697265625, "loss_aux_layer_22": 0.1669921875, "loss_aux_layer_23": 0.20361328125, "loss_aux_layer_3": 0.0645751953125, "loss_aux_layer_4": 0.06756591796875, "loss_aux_layer_5": 0.069580078125, "loss_aux_layer_6": 0.07281494140625, "loss_aux_layer_7": 0.07025146484375, "loss_aux_layer_8": 0.069580078125, "loss_aux_layer_9": 0.0684814453125, "step": 2572, "total_loss": 0.7325836420059204 }, { "epoch": 0.5094040784003168, "grad_norm": 1.4404455423355103, "learning_rate": 5e-05, "llm_loss": 0.5727745294570923, "loss": 2.6591, "loss_aux_layer_0": 0.020355224609375, "loss_aux_layer_1": 0.04022216796875, "loss_aux_layer_10": 0.068115234375, "loss_aux_layer_11": 0.0726318359375, "loss_aux_layer_12": 0.0771484375, "loss_aux_layer_13": 0.0830078125, "loss_aux_layer_14": 0.0914306640625, "loss_aux_layer_15": 0.10009765625, "loss_aux_layer_16": 0.109375, "loss_aux_layer_17": 0.11669921875, "loss_aux_layer_18": 0.125, "loss_aux_layer_19": 0.1275634765625, "loss_aux_layer_2": 0.05401611328125, "loss_aux_layer_20": 0.135498046875, "loss_aux_layer_21": 0.143310546875, "loss_aux_layer_22": 0.164794921875, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.06439208984375, "loss_aux_layer_4": 0.06707763671875, "loss_aux_layer_5": 0.0687255859375, "loss_aux_layer_6": 0.07177734375, "loss_aux_layer_7": 0.0692138671875, "loss_aux_layer_8": 0.06805419921875, "loss_aux_layer_9": 0.06695556640625, "step": 2573, "total_loss": 0.6647789925336838 }, { "epoch": 0.5096020589982182, "grad_norm": 0.998933732509613, "learning_rate": 5e-05, "llm_loss": 0.6433642581105232, "loss": 2.9391, "loss_aux_layer_0": 0.01953125, "loss_aux_layer_1": 0.0389404296875, "loss_aux_layer_10": 0.06622314453125, "loss_aux_layer_11": 0.070556640625, "loss_aux_layer_12": 0.07568359375, "loss_aux_layer_13": 0.0819091796875, "loss_aux_layer_14": 0.0909423828125, "loss_aux_layer_15": 0.0999755859375, "loss_aux_layer_16": 0.1094970703125, "loss_aux_layer_17": 0.1175537109375, "loss_aux_layer_18": 0.1265869140625, "loss_aux_layer_19": 0.129638671875, "loss_aux_layer_2": 0.05206298828125, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.16748046875, "loss_aux_layer_23": 0.205322265625, "loss_aux_layer_3": 0.0621337890625, "loss_aux_layer_4": 0.0645751953125, "loss_aux_layer_5": 0.06640625, "loss_aux_layer_6": 0.069091796875, "loss_aux_layer_7": 0.066650390625, "loss_aux_layer_8": 0.06591796875, "loss_aux_layer_9": 0.0648193359375, "step": 2574, "total_loss": 0.7347854971885681 }, { "epoch": 0.5098000395961195, "grad_norm": 1.4542784690856934, "learning_rate": 5e-05, "llm_loss": 0.5966738387942314, "loss": 2.7715, "loss_aux_layer_0": 0.01959228515625, "loss_aux_layer_1": 0.04071044921875, "loss_aux_layer_10": 0.0716552734375, "loss_aux_layer_11": 0.076171875, "loss_aux_layer_12": 0.08154296875, "loss_aux_layer_13": 0.087890625, "loss_aux_layer_14": 0.0972900390625, "loss_aux_layer_15": 0.1065673828125, "loss_aux_layer_16": 0.1163330078125, "loss_aux_layer_17": 0.1241455078125, "loss_aux_layer_18": 0.132568359375, "loss_aux_layer_19": 0.13427734375, "loss_aux_layer_2": 0.05548095703125, "loss_aux_layer_20": 0.140869140625, "loss_aux_layer_21": 0.1484375, "loss_aux_layer_22": 0.16845703125, "loss_aux_layer_23": 0.206787109375, "loss_aux_layer_3": 0.0670166015625, "loss_aux_layer_4": 0.0703125, "loss_aux_layer_5": 0.072265625, "loss_aux_layer_6": 0.075439453125, "loss_aux_layer_7": 0.07275390625, "loss_aux_layer_8": 0.072021484375, "loss_aux_layer_9": 0.0704345703125, "step": 2575, "total_loss": 0.6928645968437195 }, { "epoch": 0.509998020194021, "grad_norm": 1.0704896450042725, "learning_rate": 5e-05, "llm_loss": 0.5764754414558411, "loss": 2.7012, "loss_aux_layer_0": 0.019927978515625, "loss_aux_layer_1": 0.04266357421875, "loss_aux_layer_10": 0.0733642578125, "loss_aux_layer_11": 0.0784912109375, "loss_aux_layer_12": 0.083984375, "loss_aux_layer_13": 0.0904541015625, "loss_aux_layer_14": 0.0997314453125, "loss_aux_layer_15": 0.1090087890625, "loss_aux_layer_16": 0.118896484375, "loss_aux_layer_17": 0.126708984375, "loss_aux_layer_18": 0.13525390625, "loss_aux_layer_19": 0.137451171875, "loss_aux_layer_2": 0.05810546875, "loss_aux_layer_20": 0.14501953125, "loss_aux_layer_21": 0.15234375, "loss_aux_layer_22": 0.174560546875, "loss_aux_layer_23": 0.212890625, "loss_aux_layer_3": 0.069580078125, "loss_aux_layer_4": 0.072265625, "loss_aux_layer_5": 0.07373046875, "loss_aux_layer_6": 0.076904296875, "loss_aux_layer_7": 0.074462890625, "loss_aux_layer_8": 0.07373046875, "loss_aux_layer_9": 0.0721435546875, "step": 2576, "total_loss": 0.6753019988536835 }, { "epoch": 0.5101960007919224, "grad_norm": 1.3887687921524048, "learning_rate": 5e-05, "llm_loss": 0.6621583104133606, "loss": 3.0151, "loss_aux_layer_0": 0.020599365234375, "loss_aux_layer_1": 0.0386962890625, "loss_aux_layer_10": 0.0660400390625, "loss_aux_layer_11": 0.0703125, "loss_aux_layer_12": 0.0751953125, "loss_aux_layer_13": 0.0811767578125, "loss_aux_layer_14": 0.0904541015625, "loss_aux_layer_15": 0.10009765625, "loss_aux_layer_16": 0.10986328125, "loss_aux_layer_17": 0.1185302734375, "loss_aux_layer_18": 0.1270751953125, "loss_aux_layer_19": 0.130859375, "loss_aux_layer_2": 0.05133056640625, "loss_aux_layer_20": 0.138916015625, "loss_aux_layer_21": 0.146484375, "loss_aux_layer_22": 0.1669921875, "loss_aux_layer_23": 0.204833984375, "loss_aux_layer_3": 0.0626220703125, "loss_aux_layer_4": 0.06524658203125, "loss_aux_layer_5": 0.0665283203125, "loss_aux_layer_6": 0.0693359375, "loss_aux_layer_7": 0.06689453125, "loss_aux_layer_8": 0.0662841796875, "loss_aux_layer_9": 0.064697265625, "step": 2577, "total_loss": 0.7537830621004105 }, { "epoch": 0.5103939813898238, "grad_norm": 1.1556024551391602, "learning_rate": 5e-05, "llm_loss": 0.5640980005264282, "loss": 2.6392, "loss_aux_layer_0": 0.0203857421875, "loss_aux_layer_1": 0.0423583984375, "loss_aux_layer_10": 0.071044921875, "loss_aux_layer_11": 0.0758056640625, "loss_aux_layer_12": 0.0806884765625, "loss_aux_layer_13": 0.0867919921875, "loss_aux_layer_14": 0.0955810546875, "loss_aux_layer_15": 0.1044921875, "loss_aux_layer_16": 0.11474609375, "loss_aux_layer_17": 0.1221923828125, "loss_aux_layer_18": 0.1312255859375, "loss_aux_layer_19": 0.13330078125, "loss_aux_layer_2": 0.0562744140625, "loss_aux_layer_20": 0.140380859375, "loss_aux_layer_21": 0.147705078125, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.20654296875, "loss_aux_layer_3": 0.06787109375, "loss_aux_layer_4": 0.0704345703125, "loss_aux_layer_5": 0.0716552734375, "loss_aux_layer_6": 0.074951171875, "loss_aux_layer_7": 0.0721435546875, "loss_aux_layer_8": 0.0714111328125, "loss_aux_layer_9": 0.0699462890625, "step": 2578, "total_loss": 0.6597883999347687 }, { "epoch": 0.5105919619877252, "grad_norm": 0.9641776084899902, "learning_rate": 5e-05, "llm_loss": 0.5708374977111816, "loss": 2.6506, "loss_aux_layer_0": 0.02020263671875, "loss_aux_layer_1": 0.038818359375, "loss_aux_layer_10": 0.06597900390625, "loss_aux_layer_11": 0.07025146484375, "loss_aux_layer_12": 0.074951171875, "loss_aux_layer_13": 0.0810546875, "loss_aux_layer_14": 0.09033203125, "loss_aux_layer_15": 0.099853515625, "loss_aux_layer_16": 0.110595703125, "loss_aux_layer_17": 0.1185302734375, "loss_aux_layer_18": 0.127197265625, "loss_aux_layer_19": 0.1309814453125, "loss_aux_layer_2": 0.05145263671875, "loss_aux_layer_20": 0.139404296875, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.208984375, "loss_aux_layer_3": 0.0615234375, "loss_aux_layer_4": 0.0640869140625, "loss_aux_layer_5": 0.0660400390625, "loss_aux_layer_6": 0.0692138671875, "loss_aux_layer_7": 0.06640625, "loss_aux_layer_8": 0.06585693359375, "loss_aux_layer_9": 0.06451416015625, "step": 2579, "total_loss": 0.6626615524291992 }, { "epoch": 0.5107899425856266, "grad_norm": 1.201886534690857, "learning_rate": 5e-05, "llm_loss": 0.6942849159240723, "loss": 3.1496, "loss_aux_layer_0": 0.021026611328125, "loss_aux_layer_1": 0.04058837890625, "loss_aux_layer_10": 0.0670166015625, "loss_aux_layer_11": 0.07171630859375, "loss_aux_layer_12": 0.0767822265625, "loss_aux_layer_13": 0.0826416015625, "loss_aux_layer_14": 0.09228515625, "loss_aux_layer_15": 0.101806640625, "loss_aux_layer_16": 0.1119384765625, "loss_aux_layer_17": 0.1204833984375, "loss_aux_layer_18": 0.1285400390625, "loss_aux_layer_19": 0.13134765625, "loss_aux_layer_2": 0.05352783203125, "loss_aux_layer_20": 0.13916015625, "loss_aux_layer_21": 0.1474609375, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.2080078125, "loss_aux_layer_3": 0.064208984375, "loss_aux_layer_4": 0.06689453125, "loss_aux_layer_5": 0.06805419921875, "loss_aux_layer_6": 0.07073974609375, "loss_aux_layer_7": 0.068359375, "loss_aux_layer_8": 0.0672607421875, "loss_aux_layer_9": 0.0657958984375, "step": 2580, "total_loss": 0.787409633398056 }, { "epoch": 0.510987923183528, "grad_norm": 0.990127682685852, "learning_rate": 5e-05, "llm_loss": 0.5926339030265808, "loss": 2.7328, "loss_aux_layer_0": 0.020355224609375, "loss_aux_layer_1": 0.0389404296875, "loss_aux_layer_10": 0.06549072265625, "loss_aux_layer_11": 0.06982421875, "loss_aux_layer_12": 0.074462890625, "loss_aux_layer_13": 0.0799560546875, "loss_aux_layer_14": 0.088623046875, "loss_aux_layer_15": 0.09716796875, "loss_aux_layer_16": 0.107421875, "loss_aux_layer_17": 0.115966796875, "loss_aux_layer_18": 0.124755859375, "loss_aux_layer_19": 0.12841796875, "loss_aux_layer_2": 0.051513671875, "loss_aux_layer_20": 0.136962890625, "loss_aux_layer_21": 0.145751953125, "loss_aux_layer_22": 0.16650390625, "loss_aux_layer_23": 0.2041015625, "loss_aux_layer_3": 0.06182861328125, "loss_aux_layer_4": 0.0643310546875, "loss_aux_layer_5": 0.06591796875, "loss_aux_layer_6": 0.068603515625, "loss_aux_layer_7": 0.06640625, "loss_aux_layer_8": 0.06549072265625, "loss_aux_layer_9": 0.0640869140625, "step": 2581, "total_loss": 0.6832039058208466 }, { "epoch": 0.5111859037814295, "grad_norm": 0.9259170889854431, "learning_rate": 5e-05, "llm_loss": 0.6719924211502075, "loss": 3.0651, "loss_aux_layer_0": 0.019805908203125, "loss_aux_layer_1": 0.0406494140625, "loss_aux_layer_10": 0.0689697265625, "loss_aux_layer_11": 0.0736083984375, "loss_aux_layer_12": 0.0791015625, "loss_aux_layer_13": 0.0853271484375, "loss_aux_layer_14": 0.094482421875, "loss_aux_layer_15": 0.103515625, "loss_aux_layer_16": 0.1134033203125, "loss_aux_layer_17": 0.1212158203125, "loss_aux_layer_18": 0.1300048828125, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.05426025390625, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.147705078125, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.06500244140625, "loss_aux_layer_4": 0.06781005859375, "loss_aux_layer_5": 0.0694580078125, "loss_aux_layer_6": 0.072509765625, "loss_aux_layer_7": 0.070068359375, "loss_aux_layer_8": 0.0693359375, "loss_aux_layer_9": 0.06781005859375, "step": 2582, "total_loss": 0.7662749141454697 }, { "epoch": 0.5113838843793308, "grad_norm": 1.292633295059204, "learning_rate": 5e-05, "llm_loss": 0.6059688180685043, "loss": 2.7977, "loss_aux_layer_0": 0.02093505859375, "loss_aux_layer_1": 0.04071044921875, "loss_aux_layer_10": 0.06866455078125, "loss_aux_layer_11": 0.07305908203125, "loss_aux_layer_12": 0.078125, "loss_aux_layer_13": 0.08447265625, "loss_aux_layer_14": 0.0938720703125, "loss_aux_layer_15": 0.102783203125, "loss_aux_layer_16": 0.112548828125, "loss_aux_layer_17": 0.1202392578125, "loss_aux_layer_18": 0.1285400390625, "loss_aux_layer_19": 0.131103515625, "loss_aux_layer_2": 0.05377197265625, "loss_aux_layer_20": 0.138427734375, "loss_aux_layer_21": 0.145751953125, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.0648193359375, "loss_aux_layer_4": 0.0675048828125, "loss_aux_layer_5": 0.06890869140625, "loss_aux_layer_6": 0.07177734375, "loss_aux_layer_7": 0.06951904296875, "loss_aux_layer_8": 0.0687255859375, "loss_aux_layer_9": 0.06719970703125, "step": 2583, "total_loss": 0.6994152218103409 }, { "epoch": 0.5115818649772322, "grad_norm": 1.1812691688537598, "learning_rate": 5e-05, "llm_loss": 0.5358879119157791, "loss": 2.5017, "loss_aux_layer_0": 0.0201416015625, "loss_aux_layer_1": 0.03790283203125, "loss_aux_layer_10": 0.06475830078125, "loss_aux_layer_11": 0.0689697265625, "loss_aux_layer_12": 0.073486328125, "loss_aux_layer_13": 0.079345703125, "loss_aux_layer_14": 0.0880126953125, "loss_aux_layer_15": 0.0966796875, "loss_aux_layer_16": 0.106201171875, "loss_aux_layer_17": 0.11474609375, "loss_aux_layer_18": 0.1231689453125, "loss_aux_layer_19": 0.1268310546875, "loss_aux_layer_2": 0.05096435546875, "loss_aux_layer_20": 0.134765625, "loss_aux_layer_21": 0.142822265625, "loss_aux_layer_22": 0.164794921875, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.06109619140625, "loss_aux_layer_4": 0.06365966796875, "loss_aux_layer_5": 0.065185546875, "loss_aux_layer_6": 0.0679931640625, "loss_aux_layer_7": 0.0655517578125, "loss_aux_layer_8": 0.06475830078125, "loss_aux_layer_9": 0.063232421875, "step": 2584, "total_loss": 0.6254174113273621 }, { "epoch": 0.5117798455751337, "grad_norm": 0.7805721759796143, "learning_rate": 5e-05, "llm_loss": 0.5587444454431534, "loss": 2.6176, "loss_aux_layer_0": 0.020172119140625, "loss_aux_layer_1": 0.03973388671875, "loss_aux_layer_10": 0.0703125, "loss_aux_layer_11": 0.07470703125, "loss_aux_layer_12": 0.0802001953125, "loss_aux_layer_13": 0.086669921875, "loss_aux_layer_14": 0.0963134765625, "loss_aux_layer_15": 0.10498046875, "loss_aux_layer_16": 0.1146240234375, "loss_aux_layer_17": 0.1229248046875, "loss_aux_layer_18": 0.131103515625, "loss_aux_layer_19": 0.1337890625, "loss_aux_layer_2": 0.05377197265625, "loss_aux_layer_20": 0.1416015625, "loss_aux_layer_21": 0.15087890625, "loss_aux_layer_22": 0.173583984375, "loss_aux_layer_23": 0.212890625, "loss_aux_layer_3": 0.0650634765625, "loss_aux_layer_4": 0.068115234375, "loss_aux_layer_5": 0.0701904296875, "loss_aux_layer_6": 0.073486328125, "loss_aux_layer_7": 0.0709228515625, "loss_aux_layer_8": 0.0706787109375, "loss_aux_layer_9": 0.069091796875, "step": 2585, "total_loss": 0.654407188296318 }, { "epoch": 0.511977826173035, "grad_norm": 0.9400172233581543, "learning_rate": 5e-05, "llm_loss": 0.5766054168343544, "loss": 2.6797, "loss_aux_layer_0": 0.0203857421875, "loss_aux_layer_1": 0.0404052734375, "loss_aux_layer_10": 0.068115234375, "loss_aux_layer_11": 0.0728759765625, "loss_aux_layer_12": 0.077880859375, "loss_aux_layer_13": 0.084228515625, "loss_aux_layer_14": 0.0928955078125, "loss_aux_layer_15": 0.1014404296875, "loss_aux_layer_16": 0.111083984375, "loss_aux_layer_17": 0.119140625, "loss_aux_layer_18": 0.1275634765625, "loss_aux_layer_19": 0.13134765625, "loss_aux_layer_2": 0.05389404296875, "loss_aux_layer_20": 0.138916015625, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.16943359375, "loss_aux_layer_23": 0.207275390625, "loss_aux_layer_3": 0.06494140625, "loss_aux_layer_4": 0.0672607421875, "loss_aux_layer_5": 0.068603515625, "loss_aux_layer_6": 0.07177734375, "loss_aux_layer_7": 0.069091796875, "loss_aux_layer_8": 0.0682373046875, "loss_aux_layer_9": 0.0667724609375, "step": 2586, "total_loss": 0.6699327975511551 }, { "epoch": 0.5121758067709364, "grad_norm": 1.0507488250732422, "learning_rate": 5e-05, "llm_loss": 0.6136216223239899, "loss": 2.8254, "loss_aux_layer_0": 0.0191650390625, "loss_aux_layer_1": 0.03973388671875, "loss_aux_layer_10": 0.068115234375, "loss_aux_layer_11": 0.0726318359375, "loss_aux_layer_12": 0.078125, "loss_aux_layer_13": 0.084228515625, "loss_aux_layer_14": 0.0936279296875, "loss_aux_layer_15": 0.1026611328125, "loss_aux_layer_16": 0.1129150390625, "loss_aux_layer_17": 0.1207275390625, "loss_aux_layer_18": 0.12841796875, "loss_aux_layer_19": 0.130859375, "loss_aux_layer_2": 0.05279541015625, "loss_aux_layer_20": 0.137939453125, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.164794921875, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.0634765625, "loss_aux_layer_4": 0.0662841796875, "loss_aux_layer_5": 0.06787109375, "loss_aux_layer_6": 0.0711669921875, "loss_aux_layer_7": 0.068359375, "loss_aux_layer_8": 0.06781005859375, "loss_aux_layer_9": 0.06640625, "step": 2587, "total_loss": 0.706342488527298 }, { "epoch": 0.5123737873688379, "grad_norm": 0.8389606475830078, "learning_rate": 5e-05, "llm_loss": 0.5773358345031738, "loss": 2.6753, "loss_aux_layer_0": 0.01995849609375, "loss_aux_layer_1": 0.03875732421875, "loss_aux_layer_10": 0.06591796875, "loss_aux_layer_11": 0.0701904296875, "loss_aux_layer_12": 0.074951171875, "loss_aux_layer_13": 0.0809326171875, "loss_aux_layer_14": 0.0904541015625, "loss_aux_layer_15": 0.099853515625, "loss_aux_layer_16": 0.1103515625, "loss_aux_layer_17": 0.1182861328125, "loss_aux_layer_18": 0.126953125, "loss_aux_layer_19": 0.130859375, "loss_aux_layer_2": 0.05242919921875, "loss_aux_layer_20": 0.138916015625, "loss_aux_layer_21": 0.146484375, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.20263671875, "loss_aux_layer_3": 0.06256103515625, "loss_aux_layer_4": 0.06512451171875, "loss_aux_layer_5": 0.066650390625, "loss_aux_layer_6": 0.0697021484375, "loss_aux_layer_7": 0.0670166015625, "loss_aux_layer_8": 0.0660400390625, "loss_aux_layer_9": 0.064697265625, "step": 2588, "total_loss": 0.6688237339258194 }, { "epoch": 0.5125717679667393, "grad_norm": 1.0010164976119995, "learning_rate": 5e-05, "llm_loss": 0.6101404502987862, "loss": 2.7859, "loss_aux_layer_0": 0.02081298828125, "loss_aux_layer_1": 0.03662109375, "loss_aux_layer_10": 0.06146240234375, "loss_aux_layer_11": 0.065185546875, "loss_aux_layer_12": 0.0697021484375, "loss_aux_layer_13": 0.0750732421875, "loss_aux_layer_14": 0.083251953125, "loss_aux_layer_15": 0.092041015625, "loss_aux_layer_16": 0.1015625, "loss_aux_layer_17": 0.1097412109375, "loss_aux_layer_18": 0.1182861328125, "loss_aux_layer_19": 0.1229248046875, "loss_aux_layer_2": 0.04925537109375, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.162353515625, "loss_aux_layer_23": 0.200927734375, "loss_aux_layer_3": 0.05841064453125, "loss_aux_layer_4": 0.0609130859375, "loss_aux_layer_5": 0.06207275390625, "loss_aux_layer_6": 0.064697265625, "loss_aux_layer_7": 0.0623779296875, "loss_aux_layer_8": 0.061767578125, "loss_aux_layer_9": 0.0601806640625, "step": 2589, "total_loss": 0.6964716762304306 }, { "epoch": 0.5127697485646406, "grad_norm": 1.0220786333084106, "learning_rate": 5e-05, "llm_loss": 0.5674044787883759, "loss": 2.636, "loss_aux_layer_0": 0.020172119140625, "loss_aux_layer_1": 0.039306640625, "loss_aux_layer_10": 0.06695556640625, "loss_aux_layer_11": 0.0712890625, "loss_aux_layer_12": 0.0765380859375, "loss_aux_layer_13": 0.08251953125, "loss_aux_layer_14": 0.0914306640625, "loss_aux_layer_15": 0.1002197265625, "loss_aux_layer_16": 0.1099853515625, "loss_aux_layer_17": 0.117919921875, "loss_aux_layer_18": 0.12548828125, "loss_aux_layer_19": 0.129150390625, "loss_aux_layer_2": 0.05303955078125, "loss_aux_layer_20": 0.135986328125, "loss_aux_layer_21": 0.143310546875, "loss_aux_layer_22": 0.164306640625, "loss_aux_layer_23": 0.2021484375, "loss_aux_layer_3": 0.0634765625, "loss_aux_layer_4": 0.066162109375, "loss_aux_layer_5": 0.06744384765625, "loss_aux_layer_6": 0.07037353515625, "loss_aux_layer_7": 0.06787109375, "loss_aux_layer_8": 0.0673828125, "loss_aux_layer_9": 0.0660400390625, "step": 2590, "total_loss": 0.6590007692575455 }, { "epoch": 0.5129677291625421, "grad_norm": 1.0499088764190674, "learning_rate": 5e-05, "llm_loss": 0.5588525831699371, "loss": 2.6097, "loss_aux_layer_0": 0.020660400390625, "loss_aux_layer_1": 0.04058837890625, "loss_aux_layer_10": 0.0687255859375, "loss_aux_layer_11": 0.073486328125, "loss_aux_layer_12": 0.0782470703125, "loss_aux_layer_13": 0.083984375, "loss_aux_layer_14": 0.09326171875, "loss_aux_layer_15": 0.1026611328125, "loss_aux_layer_16": 0.1121826171875, "loss_aux_layer_17": 0.1199951171875, "loss_aux_layer_18": 0.129150390625, "loss_aux_layer_19": 0.1318359375, "loss_aux_layer_2": 0.053955078125, "loss_aux_layer_20": 0.138671875, "loss_aux_layer_21": 0.146240234375, "loss_aux_layer_22": 0.16650390625, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.0650634765625, "loss_aux_layer_4": 0.06787109375, "loss_aux_layer_5": 0.0694580078125, "loss_aux_layer_6": 0.0726318359375, "loss_aux_layer_7": 0.070068359375, "loss_aux_layer_8": 0.0694580078125, "loss_aux_layer_9": 0.0677490234375, "step": 2591, "total_loss": 0.6524361670017242 }, { "epoch": 0.5131657097604435, "grad_norm": 0.8755955100059509, "learning_rate": 5e-05, "llm_loss": 0.5411874279379845, "loss": 2.5383, "loss_aux_layer_0": 0.02252197265625, "loss_aux_layer_1": 0.0389404296875, "loss_aux_layer_10": 0.0679931640625, "loss_aux_layer_11": 0.0723876953125, "loss_aux_layer_12": 0.0772705078125, "loss_aux_layer_13": 0.0833740234375, "loss_aux_layer_14": 0.093017578125, "loss_aux_layer_15": 0.102294921875, "loss_aux_layer_16": 0.1124267578125, "loss_aux_layer_17": 0.1209716796875, "loss_aux_layer_18": 0.1287841796875, "loss_aux_layer_19": 0.13232421875, "loss_aux_layer_2": 0.0531005859375, "loss_aux_layer_20": 0.14013671875, "loss_aux_layer_21": 0.147705078125, "loss_aux_layer_22": 0.167724609375, "loss_aux_layer_23": 0.20556640625, "loss_aux_layer_3": 0.06365966796875, "loss_aux_layer_4": 0.0667724609375, "loss_aux_layer_5": 0.068603515625, "loss_aux_layer_6": 0.0714111328125, "loss_aux_layer_7": 0.0689697265625, "loss_aux_layer_8": 0.068115234375, "loss_aux_layer_9": 0.06689453125, "step": 2592, "total_loss": 0.6345627903938293 }, { "epoch": 0.5133636903583448, "grad_norm": 1.3794853687286377, "learning_rate": 5e-05, "llm_loss": 0.6585976630449295, "loss": 2.9951, "loss_aux_layer_0": 0.018829345703125, "loss_aux_layer_1": 0.0389404296875, "loss_aux_layer_10": 0.06610107421875, "loss_aux_layer_11": 0.0701904296875, "loss_aux_layer_12": 0.0760498046875, "loss_aux_layer_13": 0.0819091796875, "loss_aux_layer_14": 0.0909423828125, "loss_aux_layer_15": 0.099853515625, "loss_aux_layer_16": 0.1090087890625, "loss_aux_layer_17": 0.116943359375, "loss_aux_layer_18": 0.1246337890625, "loss_aux_layer_19": 0.1273193359375, "loss_aux_layer_2": 0.0513916015625, "loss_aux_layer_20": 0.134521484375, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.06170654296875, "loss_aux_layer_4": 0.0645751953125, "loss_aux_layer_5": 0.06634521484375, "loss_aux_layer_6": 0.069580078125, "loss_aux_layer_7": 0.067138671875, "loss_aux_layer_8": 0.0662841796875, "loss_aux_layer_9": 0.064697265625, "step": 2593, "total_loss": 0.7487845718860626 }, { "epoch": 0.5135616709562463, "grad_norm": 0.9424095153808594, "learning_rate": 5e-05, "llm_loss": 0.48194753378629684, "loss": 2.3, "loss_aux_layer_0": 0.01959228515625, "loss_aux_layer_1": 0.03900146484375, "loss_aux_layer_10": 0.06744384765625, "loss_aux_layer_11": 0.0718994140625, "loss_aux_layer_12": 0.0767822265625, "loss_aux_layer_13": 0.0828857421875, "loss_aux_layer_14": 0.092529296875, "loss_aux_layer_15": 0.10205078125, "loss_aux_layer_16": 0.1121826171875, "loss_aux_layer_17": 0.1201171875, "loss_aux_layer_18": 0.1298828125, "loss_aux_layer_19": 0.1337890625, "loss_aux_layer_2": 0.05224609375, "loss_aux_layer_20": 0.14111328125, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.207275390625, "loss_aux_layer_3": 0.0626220703125, "loss_aux_layer_4": 0.0653076171875, "loss_aux_layer_5": 0.06719970703125, "loss_aux_layer_6": 0.070068359375, "loss_aux_layer_7": 0.06781005859375, "loss_aux_layer_8": 0.06732177734375, "loss_aux_layer_9": 0.06622314453125, "step": 2594, "total_loss": 0.5750105679035187 }, { "epoch": 0.5137596515541477, "grad_norm": 1.0914274454116821, "learning_rate": 5e-05, "llm_loss": 0.49525976926088333, "loss": 2.3626, "loss_aux_layer_0": 0.020050048828125, "loss_aux_layer_1": 0.0401611328125, "loss_aux_layer_10": 0.07080078125, "loss_aux_layer_11": 0.075439453125, "loss_aux_layer_12": 0.08056640625, "loss_aux_layer_13": 0.0867919921875, "loss_aux_layer_14": 0.095947265625, "loss_aux_layer_15": 0.104736328125, "loss_aux_layer_16": 0.114501953125, "loss_aux_layer_17": 0.1221923828125, "loss_aux_layer_18": 0.131591796875, "loss_aux_layer_19": 0.134521484375, "loss_aux_layer_2": 0.05419921875, "loss_aux_layer_20": 0.141845703125, "loss_aux_layer_21": 0.1494140625, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.2080078125, "loss_aux_layer_3": 0.0655517578125, "loss_aux_layer_4": 0.0684814453125, "loss_aux_layer_5": 0.0703125, "loss_aux_layer_6": 0.0736083984375, "loss_aux_layer_7": 0.0714111328125, "loss_aux_layer_8": 0.070556640625, "loss_aux_layer_9": 0.0693359375, "step": 2595, "total_loss": 0.5906582027673721 }, { "epoch": 0.5139576321520491, "grad_norm": 0.8556384444236755, "learning_rate": 5e-05, "llm_loss": 0.5640091896057129, "loss": 2.6276, "loss_aux_layer_0": 0.019561767578125, "loss_aux_layer_1": 0.03936767578125, "loss_aux_layer_10": 0.0677490234375, "loss_aux_layer_11": 0.072021484375, "loss_aux_layer_12": 0.0767822265625, "loss_aux_layer_13": 0.0823974609375, "loss_aux_layer_14": 0.0914306640625, "loss_aux_layer_15": 0.1005859375, "loss_aux_layer_16": 0.11083984375, "loss_aux_layer_17": 0.11865234375, "loss_aux_layer_18": 0.1273193359375, "loss_aux_layer_19": 0.130859375, "loss_aux_layer_2": 0.05340576171875, "loss_aux_layer_20": 0.138671875, "loss_aux_layer_21": 0.1474609375, "loss_aux_layer_22": 0.17041015625, "loss_aux_layer_23": 0.2099609375, "loss_aux_layer_3": 0.0635986328125, "loss_aux_layer_4": 0.06646728515625, "loss_aux_layer_5": 0.06793212890625, "loss_aux_layer_6": 0.0711669921875, "loss_aux_layer_7": 0.06854248046875, "loss_aux_layer_8": 0.067626953125, "loss_aux_layer_9": 0.06646728515625, "step": 2596, "total_loss": 0.6568940430879593 }, { "epoch": 0.5141556127499505, "grad_norm": 0.9296279549598694, "learning_rate": 5e-05, "llm_loss": 0.5798207223415375, "loss": 2.6786, "loss_aux_layer_0": 0.0194091796875, "loss_aux_layer_1": 0.03741455078125, "loss_aux_layer_10": 0.0645751953125, "loss_aux_layer_11": 0.0687255859375, "loss_aux_layer_12": 0.0736083984375, "loss_aux_layer_13": 0.0797119140625, "loss_aux_layer_14": 0.0889892578125, "loss_aux_layer_15": 0.0982666015625, "loss_aux_layer_16": 0.1080322265625, "loss_aux_layer_17": 0.1165771484375, "loss_aux_layer_18": 0.125244140625, "loss_aux_layer_19": 0.1285400390625, "loss_aux_layer_2": 0.0501708984375, "loss_aux_layer_20": 0.136962890625, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.164306640625, "loss_aux_layer_23": 0.20068359375, "loss_aux_layer_3": 0.060302734375, "loss_aux_layer_4": 0.06304931640625, "loss_aux_layer_5": 0.0648193359375, "loss_aux_layer_6": 0.0677490234375, "loss_aux_layer_7": 0.0654296875, "loss_aux_layer_8": 0.06494140625, "loss_aux_layer_9": 0.0633544921875, "step": 2597, "total_loss": 0.6696565300226212 }, { "epoch": 0.5143535933478519, "grad_norm": 0.8996346592903137, "learning_rate": 5e-05, "llm_loss": 0.5572729036211967, "loss": 2.5862, "loss_aux_layer_0": 0.019927978515625, "loss_aux_layer_1": 0.036376953125, "loss_aux_layer_10": 0.06329345703125, "loss_aux_layer_11": 0.06787109375, "loss_aux_layer_12": 0.0728759765625, "loss_aux_layer_13": 0.0787353515625, "loss_aux_layer_14": 0.0885009765625, "loss_aux_layer_15": 0.097900390625, "loss_aux_layer_16": 0.1080322265625, "loss_aux_layer_17": 0.1156005859375, "loss_aux_layer_18": 0.1243896484375, "loss_aux_layer_19": 0.128173828125, "loss_aux_layer_2": 0.0489501953125, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.145751953125, "loss_aux_layer_22": 0.16748046875, "loss_aux_layer_23": 0.20654296875, "loss_aux_layer_3": 0.05841064453125, "loss_aux_layer_4": 0.06109619140625, "loss_aux_layer_5": 0.062744140625, "loss_aux_layer_6": 0.065673828125, "loss_aux_layer_7": 0.06341552734375, "loss_aux_layer_8": 0.06298828125, "loss_aux_layer_9": 0.0618896484375, "step": 2598, "total_loss": 0.6465611606836319 }, { "epoch": 0.5145515739457533, "grad_norm": 0.8426144123077393, "learning_rate": 5e-05, "llm_loss": 0.6176011711359024, "loss": 2.8372, "loss_aux_layer_0": 0.01885986328125, "loss_aux_layer_1": 0.0386962890625, "loss_aux_layer_10": 0.06732177734375, "loss_aux_layer_11": 0.0714111328125, "loss_aux_layer_12": 0.0762939453125, "loss_aux_layer_13": 0.082275390625, "loss_aux_layer_14": 0.0914306640625, "loss_aux_layer_15": 0.1005859375, "loss_aux_layer_16": 0.1104736328125, "loss_aux_layer_17": 0.1177978515625, "loss_aux_layer_18": 0.126220703125, "loss_aux_layer_19": 0.1295166015625, "loss_aux_layer_2": 0.05206298828125, "loss_aux_layer_20": 0.136962890625, "loss_aux_layer_21": 0.145263671875, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.204833984375, "loss_aux_layer_3": 0.06256103515625, "loss_aux_layer_4": 0.0650634765625, "loss_aux_layer_5": 0.06695556640625, "loss_aux_layer_6": 0.0697021484375, "loss_aux_layer_7": 0.06744384765625, "loss_aux_layer_8": 0.06695556640625, "loss_aux_layer_9": 0.0657958984375, "step": 2599, "total_loss": 0.7093039453029633 }, { "epoch": 0.5147495545436547, "grad_norm": 0.8543902635574341, "learning_rate": 5e-05, "llm_loss": 0.6881443411111832, "loss": 3.125, "loss_aux_layer_0": 0.01885986328125, "loss_aux_layer_1": 0.03851318359375, "loss_aux_layer_10": 0.06793212890625, "loss_aux_layer_11": 0.0726318359375, "loss_aux_layer_12": 0.0777587890625, "loss_aux_layer_13": 0.0838623046875, "loss_aux_layer_14": 0.093505859375, "loss_aux_layer_15": 0.1025390625, "loss_aux_layer_16": 0.11181640625, "loss_aux_layer_17": 0.1199951171875, "loss_aux_layer_18": 0.1287841796875, "loss_aux_layer_19": 0.1318359375, "loss_aux_layer_2": 0.05224609375, "loss_aux_layer_20": 0.13916015625, "loss_aux_layer_21": 0.147216796875, "loss_aux_layer_22": 0.168701171875, "loss_aux_layer_23": 0.206787109375, "loss_aux_layer_3": 0.0634765625, "loss_aux_layer_4": 0.06640625, "loss_aux_layer_5": 0.068115234375, "loss_aux_layer_6": 0.0714111328125, "loss_aux_layer_7": 0.06866455078125, "loss_aux_layer_8": 0.06805419921875, "loss_aux_layer_9": 0.06658935546875, "step": 2600, "total_loss": 0.7812515199184418 }, { "epoch": 0.5149475351415561, "grad_norm": 0.7940971851348877, "learning_rate": 5e-05, "llm_loss": 0.5897298604249954, "loss": 2.7233, "loss_aux_layer_0": 0.0194091796875, "loss_aux_layer_1": 0.0377197265625, "loss_aux_layer_10": 0.065185546875, "loss_aux_layer_11": 0.0697021484375, "loss_aux_layer_12": 0.074462890625, "loss_aux_layer_13": 0.0804443359375, "loss_aux_layer_14": 0.089599609375, "loss_aux_layer_15": 0.09912109375, "loss_aux_layer_16": 0.1092529296875, "loss_aux_layer_17": 0.1175537109375, "loss_aux_layer_18": 0.1258544921875, "loss_aux_layer_19": 0.129638671875, "loss_aux_layer_2": 0.05108642578125, "loss_aux_layer_20": 0.13818359375, "loss_aux_layer_21": 0.1474609375, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.2080078125, "loss_aux_layer_3": 0.06103515625, "loss_aux_layer_4": 0.0635986328125, "loss_aux_layer_5": 0.0655517578125, "loss_aux_layer_6": 0.0684814453125, "loss_aux_layer_7": 0.066162109375, "loss_aux_layer_8": 0.0655517578125, "loss_aux_layer_9": 0.06427001953125, "step": 2601, "total_loss": 0.6808146685361862 }, { "epoch": 0.5151455157394575, "grad_norm": 0.8102284669876099, "learning_rate": 5e-05, "llm_loss": 0.561596691608429, "loss": 2.6083, "loss_aux_layer_0": 0.019683837890625, "loss_aux_layer_1": 0.0379638671875, "loss_aux_layer_10": 0.06475830078125, "loss_aux_layer_11": 0.06878662109375, "loss_aux_layer_12": 0.073486328125, "loss_aux_layer_13": 0.0792236328125, "loss_aux_layer_14": 0.088623046875, "loss_aux_layer_15": 0.097900390625, "loss_aux_layer_16": 0.1082763671875, "loss_aux_layer_17": 0.11669921875, "loss_aux_layer_18": 0.125244140625, "loss_aux_layer_19": 0.1298828125, "loss_aux_layer_2": 0.05108642578125, "loss_aux_layer_20": 0.137939453125, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.2060546875, "loss_aux_layer_3": 0.06103515625, "loss_aux_layer_4": 0.0634765625, "loss_aux_layer_5": 0.065185546875, "loss_aux_layer_6": 0.0679931640625, "loss_aux_layer_7": 0.0655517578125, "loss_aux_layer_8": 0.06494140625, "loss_aux_layer_9": 0.0635986328125, "step": 2602, "total_loss": 0.6520873606204987 }, { "epoch": 0.515343496337359, "grad_norm": 1.0062553882598877, "learning_rate": 5e-05, "llm_loss": 0.6053114980459213, "loss": 2.7732, "loss_aux_layer_0": 0.01837158203125, "loss_aux_layer_1": 0.03790283203125, "loss_aux_layer_10": 0.0650634765625, "loss_aux_layer_11": 0.069091796875, "loss_aux_layer_12": 0.07373046875, "loss_aux_layer_13": 0.0791015625, "loss_aux_layer_14": 0.087158203125, "loss_aux_layer_15": 0.095458984375, "loss_aux_layer_16": 0.1046142578125, "loss_aux_layer_17": 0.111572265625, "loss_aux_layer_18": 0.119873046875, "loss_aux_layer_19": 0.1229248046875, "loss_aux_layer_2": 0.051513671875, "loss_aux_layer_20": 0.13037109375, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.0618896484375, "loss_aux_layer_4": 0.06427001953125, "loss_aux_layer_5": 0.0657958984375, "loss_aux_layer_6": 0.068359375, "loss_aux_layer_7": 0.06591796875, "loss_aux_layer_8": 0.06524658203125, "loss_aux_layer_9": 0.06365966796875, "step": 2603, "total_loss": 0.6932941377162933 }, { "epoch": 0.5155414769352603, "grad_norm": 1.563843846321106, "learning_rate": 5e-05, "llm_loss": 0.5232020542025566, "loss": 2.4782, "loss_aux_layer_0": 0.02056884765625, "loss_aux_layer_1": 0.04046630859375, "loss_aux_layer_10": 0.0712890625, "loss_aux_layer_11": 0.075927734375, "loss_aux_layer_12": 0.0814208984375, "loss_aux_layer_13": 0.0877685546875, "loss_aux_layer_14": 0.0972900390625, "loss_aux_layer_15": 0.106689453125, "loss_aux_layer_16": 0.1168212890625, "loss_aux_layer_17": 0.125, "loss_aux_layer_18": 0.1337890625, "loss_aux_layer_19": 0.1357421875, "loss_aux_layer_2": 0.0546875, "loss_aux_layer_20": 0.142578125, "loss_aux_layer_21": 0.149658203125, "loss_aux_layer_22": 0.1708984375, "loss_aux_layer_23": 0.20947265625, "loss_aux_layer_3": 0.06597900390625, "loss_aux_layer_4": 0.06903076171875, "loss_aux_layer_5": 0.07080078125, "loss_aux_layer_6": 0.0740966796875, "loss_aux_layer_7": 0.0718994140625, "loss_aux_layer_8": 0.071044921875, "loss_aux_layer_9": 0.069580078125, "step": 2604, "total_loss": 0.6195614784955978 }, { "epoch": 0.5157394575331617, "grad_norm": 0.9899893403053284, "learning_rate": 5e-05, "llm_loss": 0.6550180613994598, "loss": 2.9753, "loss_aux_layer_0": 0.019256591796875, "loss_aux_layer_1": 0.03643798828125, "loss_aux_layer_10": 0.0633544921875, "loss_aux_layer_11": 0.06768798828125, "loss_aux_layer_12": 0.07275390625, "loss_aux_layer_13": 0.0787353515625, "loss_aux_layer_14": 0.0880126953125, "loss_aux_layer_15": 0.096923828125, "loss_aux_layer_16": 0.10693359375, "loss_aux_layer_17": 0.11474609375, "loss_aux_layer_18": 0.1236572265625, "loss_aux_layer_19": 0.12744140625, "loss_aux_layer_2": 0.04876708984375, "loss_aux_layer_20": 0.135498046875, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.203125, "loss_aux_layer_3": 0.05877685546875, "loss_aux_layer_4": 0.06158447265625, "loss_aux_layer_5": 0.0633544921875, "loss_aux_layer_6": 0.06634521484375, "loss_aux_layer_7": 0.06396484375, "loss_aux_layer_8": 0.0633544921875, "loss_aux_layer_9": 0.06207275390625, "step": 2605, "total_loss": 0.7438369542360306 }, { "epoch": 0.5159374381310632, "grad_norm": 0.9188082814216614, "learning_rate": 5e-05, "llm_loss": 0.6168231666088104, "loss": 2.8227, "loss_aux_layer_0": 0.01861572265625, "loss_aux_layer_1": 0.03759765625, "loss_aux_layer_10": 0.063720703125, "loss_aux_layer_11": 0.068115234375, "loss_aux_layer_12": 0.0732421875, "loss_aux_layer_13": 0.079345703125, "loss_aux_layer_14": 0.0882568359375, "loss_aux_layer_15": 0.0968017578125, "loss_aux_layer_16": 0.106689453125, "loss_aux_layer_17": 0.115478515625, "loss_aux_layer_18": 0.1240234375, "loss_aux_layer_19": 0.127197265625, "loss_aux_layer_2": 0.05023193359375, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.162353515625, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.06011962890625, "loss_aux_layer_4": 0.06256103515625, "loss_aux_layer_5": 0.0640869140625, "loss_aux_layer_6": 0.0670166015625, "loss_aux_layer_7": 0.0645751953125, "loss_aux_layer_8": 0.0638427734375, "loss_aux_layer_9": 0.06231689453125, "step": 2606, "total_loss": 0.7056860774755478 }, { "epoch": 0.5161354187289645, "grad_norm": 1.031673550605774, "learning_rate": 5e-05, "llm_loss": 0.6204952895641327, "loss": 2.8484, "loss_aux_layer_0": 0.01995849609375, "loss_aux_layer_1": 0.03961181640625, "loss_aux_layer_10": 0.068115234375, "loss_aux_layer_11": 0.0723876953125, "loss_aux_layer_12": 0.077392578125, "loss_aux_layer_13": 0.0830078125, "loss_aux_layer_14": 0.0916748046875, "loss_aux_layer_15": 0.10009765625, "loss_aux_layer_16": 0.109130859375, "loss_aux_layer_17": 0.116943359375, "loss_aux_layer_18": 0.1246337890625, "loss_aux_layer_19": 0.126708984375, "loss_aux_layer_2": 0.05364990234375, "loss_aux_layer_20": 0.1337890625, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.200927734375, "loss_aux_layer_3": 0.0640869140625, "loss_aux_layer_4": 0.06695556640625, "loss_aux_layer_5": 0.068603515625, "loss_aux_layer_6": 0.071533203125, "loss_aux_layer_7": 0.0692138671875, "loss_aux_layer_8": 0.068359375, "loss_aux_layer_9": 0.06689453125, "step": 2607, "total_loss": 0.7120945900678635 }, { "epoch": 0.516333399326866, "grad_norm": 0.8257365822792053, "learning_rate": 5e-05, "llm_loss": 0.6138027906417847, "loss": 2.8233, "loss_aux_layer_0": 0.0194091796875, "loss_aux_layer_1": 0.03839111328125, "loss_aux_layer_10": 0.0677490234375, "loss_aux_layer_11": 0.0721435546875, "loss_aux_layer_12": 0.0771484375, "loss_aux_layer_13": 0.0830078125, "loss_aux_layer_14": 0.09228515625, "loss_aux_layer_15": 0.1009521484375, "loss_aux_layer_16": 0.110595703125, "loss_aux_layer_17": 0.11865234375, "loss_aux_layer_18": 0.1265869140625, "loss_aux_layer_19": 0.1298828125, "loss_aux_layer_2": 0.052001953125, "loss_aux_layer_20": 0.13720703125, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.202880859375, "loss_aux_layer_3": 0.06268310546875, "loss_aux_layer_4": 0.0655517578125, "loss_aux_layer_5": 0.0673828125, "loss_aux_layer_6": 0.071044921875, "loss_aux_layer_7": 0.0684814453125, "loss_aux_layer_8": 0.067626953125, "loss_aux_layer_9": 0.0662841796875, "step": 2608, "total_loss": 0.7058298289775848 }, { "epoch": 0.5165313799247674, "grad_norm": 1.3657089471817017, "learning_rate": 5e-05, "llm_loss": 0.5299271941184998, "loss": 2.4864, "loss_aux_layer_0": 0.019195556640625, "loss_aux_layer_1": 0.03887939453125, "loss_aux_layer_10": 0.065673828125, "loss_aux_layer_11": 0.07025146484375, "loss_aux_layer_12": 0.0751953125, "loss_aux_layer_13": 0.081298828125, "loss_aux_layer_14": 0.0902099609375, "loss_aux_layer_15": 0.099365234375, "loss_aux_layer_16": 0.109375, "loss_aux_layer_17": 0.1173095703125, "loss_aux_layer_18": 0.1263427734375, "loss_aux_layer_19": 0.130615234375, "loss_aux_layer_2": 0.0518798828125, "loss_aux_layer_20": 0.138671875, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.170166015625, "loss_aux_layer_23": 0.2099609375, "loss_aux_layer_3": 0.06207275390625, "loss_aux_layer_4": 0.06463623046875, "loss_aux_layer_5": 0.06622314453125, "loss_aux_layer_6": 0.06890869140625, "loss_aux_layer_7": 0.06640625, "loss_aux_layer_8": 0.0655517578125, "loss_aux_layer_9": 0.0643310546875, "step": 2609, "total_loss": 0.6215989142656326 }, { "epoch": 0.5167293605226688, "grad_norm": 0.8002610802650452, "learning_rate": 5e-05, "llm_loss": 0.5460524782538414, "loss": 2.5514, "loss_aux_layer_0": 0.018707275390625, "loss_aux_layer_1": 0.03936767578125, "loss_aux_layer_10": 0.0677490234375, "loss_aux_layer_11": 0.072021484375, "loss_aux_layer_12": 0.0765380859375, "loss_aux_layer_13": 0.0826416015625, "loss_aux_layer_14": 0.0914306640625, "loss_aux_layer_15": 0.100341796875, "loss_aux_layer_16": 0.1099853515625, "loss_aux_layer_17": 0.1180419921875, "loss_aux_layer_18": 0.126220703125, "loss_aux_layer_19": 0.1290283203125, "loss_aux_layer_2": 0.05218505859375, "loss_aux_layer_20": 0.13623046875, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.164794921875, "loss_aux_layer_23": 0.203125, "loss_aux_layer_3": 0.0633544921875, "loss_aux_layer_4": 0.066162109375, "loss_aux_layer_5": 0.0677490234375, "loss_aux_layer_6": 0.07080078125, "loss_aux_layer_7": 0.0684814453125, "loss_aux_layer_8": 0.0675048828125, "loss_aux_layer_9": 0.0662841796875, "step": 2610, "total_loss": 0.6378483772277832 }, { "epoch": 0.5169273411205701, "grad_norm": 0.8883377313613892, "learning_rate": 5e-05, "llm_loss": 0.5097914561629295, "loss": 2.413, "loss_aux_layer_0": 0.01885986328125, "loss_aux_layer_1": 0.03900146484375, "loss_aux_layer_10": 0.067626953125, "loss_aux_layer_11": 0.072021484375, "loss_aux_layer_12": 0.0772705078125, "loss_aux_layer_13": 0.08349609375, "loss_aux_layer_14": 0.0933837890625, "loss_aux_layer_15": 0.1029052734375, "loss_aux_layer_16": 0.1129150390625, "loss_aux_layer_17": 0.120849609375, "loss_aux_layer_18": 0.1298828125, "loss_aux_layer_19": 0.133056640625, "loss_aux_layer_2": 0.0531005859375, "loss_aux_layer_20": 0.141357421875, "loss_aux_layer_21": 0.149169921875, "loss_aux_layer_22": 0.170166015625, "loss_aux_layer_23": 0.208740234375, "loss_aux_layer_3": 0.063232421875, "loss_aux_layer_4": 0.065673828125, "loss_aux_layer_5": 0.0675048828125, "loss_aux_layer_6": 0.0706787109375, "loss_aux_layer_7": 0.068115234375, "loss_aux_layer_8": 0.0675048828125, "loss_aux_layer_9": 0.06610107421875, "step": 2611, "total_loss": 0.6032540649175644 }, { "epoch": 0.5171253217184716, "grad_norm": 0.9551738500595093, "learning_rate": 5e-05, "llm_loss": 0.5722969025373459, "loss": 2.6608, "loss_aux_layer_0": 0.019287109375, "loss_aux_layer_1": 0.0399169921875, "loss_aux_layer_10": 0.0689697265625, "loss_aux_layer_11": 0.0736083984375, "loss_aux_layer_12": 0.0789794921875, "loss_aux_layer_13": 0.084716796875, "loss_aux_layer_14": 0.0936279296875, "loss_aux_layer_15": 0.1024169921875, "loss_aux_layer_16": 0.1121826171875, "loss_aux_layer_17": 0.1197509765625, "loss_aux_layer_18": 0.1279296875, "loss_aux_layer_19": 0.13037109375, "loss_aux_layer_2": 0.05328369140625, "loss_aux_layer_20": 0.13720703125, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.164794921875, "loss_aux_layer_23": 0.20263671875, "loss_aux_layer_3": 0.06396484375, "loss_aux_layer_4": 0.0665283203125, "loss_aux_layer_5": 0.06817626953125, "loss_aux_layer_6": 0.0712890625, "loss_aux_layer_7": 0.0689697265625, "loss_aux_layer_8": 0.0684814453125, "loss_aux_layer_9": 0.0673828125, "step": 2612, "total_loss": 0.6651953160762787 }, { "epoch": 0.517323302316373, "grad_norm": 0.7668949365615845, "learning_rate": 5e-05, "llm_loss": 0.5352108627557755, "loss": 2.5002, "loss_aux_layer_0": 0.01885986328125, "loss_aux_layer_1": 0.036865234375, "loss_aux_layer_10": 0.0653076171875, "loss_aux_layer_11": 0.06976318359375, "loss_aux_layer_12": 0.07470703125, "loss_aux_layer_13": 0.0806884765625, "loss_aux_layer_14": 0.08984375, "loss_aux_layer_15": 0.098876953125, "loss_aux_layer_16": 0.1087646484375, "loss_aux_layer_17": 0.1170654296875, "loss_aux_layer_18": 0.124755859375, "loss_aux_layer_19": 0.126953125, "loss_aux_layer_2": 0.04998779296875, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.16357421875, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.0604248046875, "loss_aux_layer_4": 0.0631103515625, "loss_aux_layer_5": 0.06500244140625, "loss_aux_layer_6": 0.068115234375, "loss_aux_layer_7": 0.065673828125, "loss_aux_layer_8": 0.06512451171875, "loss_aux_layer_9": 0.06390380859375, "step": 2613, "total_loss": 0.6250588893890381 }, { "epoch": 0.5175212829142743, "grad_norm": 0.9860411286354065, "learning_rate": 5e-05, "llm_loss": 0.6038727015256882, "loss": 2.7792, "loss_aux_layer_0": 0.0194091796875, "loss_aux_layer_1": 0.03765869140625, "loss_aux_layer_10": 0.06640625, "loss_aux_layer_11": 0.0709228515625, "loss_aux_layer_12": 0.075927734375, "loss_aux_layer_13": 0.08154296875, "loss_aux_layer_14": 0.090576171875, "loss_aux_layer_15": 0.099853515625, "loss_aux_layer_16": 0.1094970703125, "loss_aux_layer_17": 0.1177978515625, "loss_aux_layer_18": 0.12548828125, "loss_aux_layer_19": 0.128173828125, "loss_aux_layer_2": 0.05096435546875, "loss_aux_layer_20": 0.135986328125, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.20263671875, "loss_aux_layer_3": 0.0616455078125, "loss_aux_layer_4": 0.06451416015625, "loss_aux_layer_5": 0.06640625, "loss_aux_layer_6": 0.0693359375, "loss_aux_layer_7": 0.0667724609375, "loss_aux_layer_8": 0.06622314453125, "loss_aux_layer_9": 0.0650634765625, "step": 2614, "total_loss": 0.6948077529668808 }, { "epoch": 0.5177192635121758, "grad_norm": 0.972651481628418, "learning_rate": 5e-05, "llm_loss": 0.6094591096043587, "loss": 2.8014, "loss_aux_layer_0": 0.019744873046875, "loss_aux_layer_1": 0.03875732421875, "loss_aux_layer_10": 0.0667724609375, "loss_aux_layer_11": 0.0711669921875, "loss_aux_layer_12": 0.0760498046875, "loss_aux_layer_13": 0.081787109375, "loss_aux_layer_14": 0.090576171875, "loss_aux_layer_15": 0.099609375, "loss_aux_layer_16": 0.1092529296875, "loss_aux_layer_17": 0.11669921875, "loss_aux_layer_18": 0.124755859375, "loss_aux_layer_19": 0.128173828125, "loss_aux_layer_2": 0.051513671875, "loss_aux_layer_20": 0.135986328125, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.19970703125, "loss_aux_layer_3": 0.06231689453125, "loss_aux_layer_4": 0.06500244140625, "loss_aux_layer_5": 0.066650390625, "loss_aux_layer_6": 0.0697021484375, "loss_aux_layer_7": 0.067626953125, "loss_aux_layer_8": 0.06689453125, "loss_aux_layer_9": 0.06549072265625, "step": 2615, "total_loss": 0.7003404498100281 }, { "epoch": 0.5179172441100772, "grad_norm": 0.8566115498542786, "learning_rate": 5e-05, "llm_loss": 0.5546651631593704, "loss": 2.5747, "loss_aux_layer_0": 0.018310546875, "loss_aux_layer_1": 0.03656005859375, "loss_aux_layer_10": 0.06475830078125, "loss_aux_layer_11": 0.069091796875, "loss_aux_layer_12": 0.0740966796875, "loss_aux_layer_13": 0.0799560546875, "loss_aux_layer_14": 0.0887451171875, "loss_aux_layer_15": 0.0972900390625, "loss_aux_layer_16": 0.1065673828125, "loss_aux_layer_17": 0.1142578125, "loss_aux_layer_18": 0.1229248046875, "loss_aux_layer_19": 0.1258544921875, "loss_aux_layer_2": 0.0496826171875, "loss_aux_layer_20": 0.133544921875, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.164306640625, "loss_aux_layer_23": 0.202880859375, "loss_aux_layer_3": 0.05999755859375, "loss_aux_layer_4": 0.0623779296875, "loss_aux_layer_5": 0.0640869140625, "loss_aux_layer_6": 0.066650390625, "loss_aux_layer_7": 0.0648193359375, "loss_aux_layer_8": 0.0645751953125, "loss_aux_layer_9": 0.06341552734375, "step": 2616, "total_loss": 0.6436777859926224 }, { "epoch": 0.5181152247079787, "grad_norm": 0.9507678151130676, "learning_rate": 5e-05, "llm_loss": 0.5664311572909355, "loss": 2.636, "loss_aux_layer_0": 0.01910400390625, "loss_aux_layer_1": 0.0400390625, "loss_aux_layer_10": 0.06756591796875, "loss_aux_layer_11": 0.0721435546875, "loss_aux_layer_12": 0.0770263671875, "loss_aux_layer_13": 0.0826416015625, "loss_aux_layer_14": 0.0919189453125, "loss_aux_layer_15": 0.1014404296875, "loss_aux_layer_16": 0.111328125, "loss_aux_layer_17": 0.119384765625, "loss_aux_layer_18": 0.127685546875, "loss_aux_layer_19": 0.1302490234375, "loss_aux_layer_2": 0.0531005859375, "loss_aux_layer_20": 0.137939453125, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.16748046875, "loss_aux_layer_23": 0.205078125, "loss_aux_layer_3": 0.06365966796875, "loss_aux_layer_4": 0.06640625, "loss_aux_layer_5": 0.06787109375, "loss_aux_layer_6": 0.07122802734375, "loss_aux_layer_7": 0.0684814453125, "loss_aux_layer_8": 0.0677490234375, "loss_aux_layer_9": 0.0662841796875, "step": 2617, "total_loss": 0.6590070277452469 }, { "epoch": 0.51831320530588, "grad_norm": 0.7755681872367859, "learning_rate": 5e-05, "llm_loss": 0.5576235577464104, "loss": 2.5939, "loss_aux_layer_0": 0.0185546875, "loss_aux_layer_1": 0.03802490234375, "loss_aux_layer_10": 0.066162109375, "loss_aux_layer_11": 0.0706787109375, "loss_aux_layer_12": 0.0758056640625, "loss_aux_layer_13": 0.0816650390625, "loss_aux_layer_14": 0.0908203125, "loss_aux_layer_15": 0.0997314453125, "loss_aux_layer_16": 0.109130859375, "loss_aux_layer_17": 0.116455078125, "loss_aux_layer_18": 0.125244140625, "loss_aux_layer_19": 0.128173828125, "loss_aux_layer_2": 0.05126953125, "loss_aux_layer_20": 0.13623046875, "loss_aux_layer_21": 0.144287109375, "loss_aux_layer_22": 0.164794921875, "loss_aux_layer_23": 0.20263671875, "loss_aux_layer_3": 0.0618896484375, "loss_aux_layer_4": 0.06475830078125, "loss_aux_layer_5": 0.0665283203125, "loss_aux_layer_6": 0.0694580078125, "loss_aux_layer_7": 0.067138671875, "loss_aux_layer_8": 0.06671142578125, "loss_aux_layer_9": 0.06488037109375, "step": 2618, "total_loss": 0.6484815627336502 }, { "epoch": 0.5185111859037814, "grad_norm": 0.8600853085517883, "learning_rate": 5e-05, "llm_loss": 0.5327941924333572, "loss": 2.4965, "loss_aux_layer_0": 0.019805908203125, "loss_aux_layer_1": 0.038330078125, "loss_aux_layer_10": 0.0657958984375, "loss_aux_layer_11": 0.070556640625, "loss_aux_layer_12": 0.0750732421875, "loss_aux_layer_13": 0.080810546875, "loss_aux_layer_14": 0.08984375, "loss_aux_layer_15": 0.0987548828125, "loss_aux_layer_16": 0.10888671875, "loss_aux_layer_17": 0.11669921875, "loss_aux_layer_18": 0.1259765625, "loss_aux_layer_19": 0.130126953125, "loss_aux_layer_2": 0.05169677734375, "loss_aux_layer_20": 0.13818359375, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.167724609375, "loss_aux_layer_23": 0.207275390625, "loss_aux_layer_3": 0.06201171875, "loss_aux_layer_4": 0.064697265625, "loss_aux_layer_5": 0.0662841796875, "loss_aux_layer_6": 0.0689697265625, "loss_aux_layer_7": 0.06689453125, "loss_aux_layer_8": 0.0660400390625, "loss_aux_layer_9": 0.064697265625, "step": 2619, "total_loss": 0.6241272389888763 }, { "epoch": 0.5187091665016829, "grad_norm": 0.8666519522666931, "learning_rate": 5e-05, "llm_loss": 0.5081689208745956, "loss": 2.4068, "loss_aux_layer_0": 0.0194091796875, "loss_aux_layer_1": 0.03875732421875, "loss_aux_layer_10": 0.068115234375, "loss_aux_layer_11": 0.07275390625, "loss_aux_layer_12": 0.07763671875, "loss_aux_layer_13": 0.0836181640625, "loss_aux_layer_14": 0.092529296875, "loss_aux_layer_15": 0.1019287109375, "loss_aux_layer_16": 0.111328125, "loss_aux_layer_17": 0.1192626953125, "loss_aux_layer_18": 0.1282958984375, "loss_aux_layer_19": 0.131591796875, "loss_aux_layer_2": 0.052490234375, "loss_aux_layer_20": 0.139404296875, "loss_aux_layer_21": 0.148681640625, "loss_aux_layer_22": 0.172607421875, "loss_aux_layer_23": 0.212890625, "loss_aux_layer_3": 0.0634765625, "loss_aux_layer_4": 0.06640625, "loss_aux_layer_5": 0.068359375, "loss_aux_layer_6": 0.071533203125, "loss_aux_layer_7": 0.069091796875, "loss_aux_layer_8": 0.0682373046875, "loss_aux_layer_9": 0.06689453125, "step": 2620, "total_loss": 0.601710855960846 }, { "epoch": 0.5189071470995842, "grad_norm": 1.0709738731384277, "learning_rate": 5e-05, "llm_loss": 0.5907554924488068, "loss": 2.7296, "loss_aux_layer_0": 0.018951416015625, "loss_aux_layer_1": 0.03912353515625, "loss_aux_layer_10": 0.067626953125, "loss_aux_layer_11": 0.0721435546875, "loss_aux_layer_12": 0.0771484375, "loss_aux_layer_13": 0.0831298828125, "loss_aux_layer_14": 0.091796875, "loss_aux_layer_15": 0.1005859375, "loss_aux_layer_16": 0.1102294921875, "loss_aux_layer_17": 0.1182861328125, "loss_aux_layer_18": 0.1263427734375, "loss_aux_layer_19": 0.1290283203125, "loss_aux_layer_2": 0.05224609375, "loss_aux_layer_20": 0.136474609375, "loss_aux_layer_21": 0.14404296875, "loss_aux_layer_22": 0.16357421875, "loss_aux_layer_23": 0.20068359375, "loss_aux_layer_3": 0.062744140625, "loss_aux_layer_4": 0.06536865234375, "loss_aux_layer_5": 0.0672607421875, "loss_aux_layer_6": 0.0704345703125, "loss_aux_layer_7": 0.068359375, "loss_aux_layer_8": 0.067626953125, "loss_aux_layer_9": 0.06634521484375, "step": 2621, "total_loss": 0.6823991686105728 }, { "epoch": 0.5191051276974856, "grad_norm": 0.8350309729576111, "learning_rate": 5e-05, "llm_loss": 0.5509018898010254, "loss": 2.5731, "loss_aux_layer_0": 0.01953125, "loss_aux_layer_1": 0.0389404296875, "loss_aux_layer_10": 0.0682373046875, "loss_aux_layer_11": 0.072998046875, "loss_aux_layer_12": 0.078125, "loss_aux_layer_13": 0.083984375, "loss_aux_layer_14": 0.0928955078125, "loss_aux_layer_15": 0.1016845703125, "loss_aux_layer_16": 0.1109619140625, "loss_aux_layer_17": 0.1187744140625, "loss_aux_layer_18": 0.126953125, "loss_aux_layer_19": 0.130615234375, "loss_aux_layer_2": 0.05194091796875, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.145263671875, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.0625, "loss_aux_layer_4": 0.0655517578125, "loss_aux_layer_5": 0.067138671875, "loss_aux_layer_6": 0.0701904296875, "loss_aux_layer_7": 0.06805419921875, "loss_aux_layer_8": 0.0675048828125, "loss_aux_layer_9": 0.06640625, "step": 2622, "total_loss": 0.6432662159204483 }, { "epoch": 0.5193031082953871, "grad_norm": 1.0249435901641846, "learning_rate": 5e-05, "llm_loss": 0.6153397262096405, "loss": 2.8277, "loss_aux_layer_0": 0.0201416015625, "loss_aux_layer_1": 0.0384521484375, "loss_aux_layer_10": 0.06695556640625, "loss_aux_layer_11": 0.071533203125, "loss_aux_layer_12": 0.076416015625, "loss_aux_layer_13": 0.0819091796875, "loss_aux_layer_14": 0.0908203125, "loss_aux_layer_15": 0.099609375, "loss_aux_layer_16": 0.10888671875, "loss_aux_layer_17": 0.1168212890625, "loss_aux_layer_18": 0.1253662109375, "loss_aux_layer_19": 0.129150390625, "loss_aux_layer_2": 0.05230712890625, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.203857421875, "loss_aux_layer_3": 0.0628662109375, "loss_aux_layer_4": 0.0657958984375, "loss_aux_layer_5": 0.06756591796875, "loss_aux_layer_6": 0.070556640625, "loss_aux_layer_7": 0.06805419921875, "loss_aux_layer_8": 0.0673828125, "loss_aux_layer_9": 0.065673828125, "step": 2623, "total_loss": 0.7069228291511536 }, { "epoch": 0.5195010888932885, "grad_norm": 0.8675332069396973, "learning_rate": 5e-05, "llm_loss": 0.6706084609031677, "loss": 3.0533, "loss_aux_layer_0": 0.018402099609375, "loss_aux_layer_1": 0.039794921875, "loss_aux_layer_10": 0.0689697265625, "loss_aux_layer_11": 0.0736083984375, "loss_aux_layer_12": 0.078369140625, "loss_aux_layer_13": 0.0843505859375, "loss_aux_layer_14": 0.093505859375, "loss_aux_layer_15": 0.102294921875, "loss_aux_layer_16": 0.1121826171875, "loss_aux_layer_17": 0.1202392578125, "loss_aux_layer_18": 0.12841796875, "loss_aux_layer_19": 0.130859375, "loss_aux_layer_2": 0.052978515625, "loss_aux_layer_20": 0.137939453125, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.163330078125, "loss_aux_layer_23": 0.19921875, "loss_aux_layer_3": 0.06378173828125, "loss_aux_layer_4": 0.0667724609375, "loss_aux_layer_5": 0.068603515625, "loss_aux_layer_6": 0.07177734375, "loss_aux_layer_7": 0.069580078125, "loss_aux_layer_8": 0.068603515625, "loss_aux_layer_9": 0.0675048828125, "step": 2624, "total_loss": 0.7633127570152283 }, { "epoch": 0.5196990694911898, "grad_norm": 1.1074275970458984, "learning_rate": 5e-05, "llm_loss": 0.589442528784275, "loss": 2.7207, "loss_aux_layer_0": 0.019195556640625, "loss_aux_layer_1": 0.03790283203125, "loss_aux_layer_10": 0.06591796875, "loss_aux_layer_11": 0.0704345703125, "loss_aux_layer_12": 0.0753173828125, "loss_aux_layer_13": 0.0811767578125, "loss_aux_layer_14": 0.090087890625, "loss_aux_layer_15": 0.0989990234375, "loss_aux_layer_16": 0.1087646484375, "loss_aux_layer_17": 0.1173095703125, "loss_aux_layer_18": 0.12548828125, "loss_aux_layer_19": 0.12939453125, "loss_aux_layer_2": 0.0504150390625, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.145751953125, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.06085205078125, "loss_aux_layer_4": 0.06353759765625, "loss_aux_layer_5": 0.06524658203125, "loss_aux_layer_6": 0.068359375, "loss_aux_layer_7": 0.066162109375, "loss_aux_layer_8": 0.0654296875, "loss_aux_layer_9": 0.0643310546875, "step": 2625, "total_loss": 0.6801628470420837 }, { "epoch": 0.5198970500890913, "grad_norm": 0.8827704787254333, "learning_rate": 5e-05, "llm_loss": 0.6706751137971878, "loss": 3.045, "loss_aux_layer_0": 0.018218994140625, "loss_aux_layer_1": 0.038330078125, "loss_aux_layer_10": 0.0662841796875, "loss_aux_layer_11": 0.0706787109375, "loss_aux_layer_12": 0.0758056640625, "loss_aux_layer_13": 0.08154296875, "loss_aux_layer_14": 0.0902099609375, "loss_aux_layer_15": 0.09912109375, "loss_aux_layer_16": 0.108642578125, "loss_aux_layer_17": 0.1160888671875, "loss_aux_layer_18": 0.124267578125, "loss_aux_layer_19": 0.1270751953125, "loss_aux_layer_2": 0.05145263671875, "loss_aux_layer_20": 0.135498046875, "loss_aux_layer_21": 0.14306640625, "loss_aux_layer_22": 0.16455078125, "loss_aux_layer_23": 0.20166015625, "loss_aux_layer_3": 0.06201171875, "loss_aux_layer_4": 0.06463623046875, "loss_aux_layer_5": 0.06640625, "loss_aux_layer_6": 0.0693359375, "loss_aux_layer_7": 0.06719970703125, "loss_aux_layer_8": 0.06640625, "loss_aux_layer_9": 0.06512451171875, "step": 2626, "total_loss": 0.7612524032592773 }, { "epoch": 0.5200950306869927, "grad_norm": 0.6982759237289429, "learning_rate": 5e-05, "llm_loss": 0.5454066321253777, "loss": 2.5241, "loss_aux_layer_0": 0.0184326171875, "loss_aux_layer_1": 0.035888671875, "loss_aux_layer_10": 0.06134033203125, "loss_aux_layer_11": 0.0653076171875, "loss_aux_layer_12": 0.0701904296875, "loss_aux_layer_13": 0.075439453125, "loss_aux_layer_14": 0.083740234375, "loss_aux_layer_15": 0.0921630859375, "loss_aux_layer_16": 0.1016845703125, "loss_aux_layer_17": 0.109619140625, "loss_aux_layer_18": 0.118408203125, "loss_aux_layer_19": 0.1226806640625, "loss_aux_layer_2": 0.0472412109375, "loss_aux_layer_20": 0.131591796875, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.197021484375, "loss_aux_layer_3": 0.056884765625, "loss_aux_layer_4": 0.05938720703125, "loss_aux_layer_5": 0.06121826171875, "loss_aux_layer_6": 0.06390380859375, "loss_aux_layer_7": 0.0618896484375, "loss_aux_layer_8": 0.0615234375, "loss_aux_layer_9": 0.060302734375, "step": 2627, "total_loss": 0.6310284584760666 }, { "epoch": 0.5202930112848941, "grad_norm": 0.89215487241745, "learning_rate": 5e-05, "llm_loss": 0.5480315759778023, "loss": 2.5576, "loss_aux_layer_0": 0.0189208984375, "loss_aux_layer_1": 0.0377197265625, "loss_aux_layer_10": 0.06610107421875, "loss_aux_layer_11": 0.0704345703125, "loss_aux_layer_12": 0.0755615234375, "loss_aux_layer_13": 0.081787109375, "loss_aux_layer_14": 0.0908203125, "loss_aux_layer_15": 0.0997314453125, "loss_aux_layer_16": 0.1094970703125, "loss_aux_layer_17": 0.1173095703125, "loss_aux_layer_18": 0.125732421875, "loss_aux_layer_19": 0.1292724609375, "loss_aux_layer_2": 0.05157470703125, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.206787109375, "loss_aux_layer_3": 0.06207275390625, "loss_aux_layer_4": 0.0643310546875, "loss_aux_layer_5": 0.06622314453125, "loss_aux_layer_6": 0.06884765625, "loss_aux_layer_7": 0.0667724609375, "loss_aux_layer_8": 0.06634521484375, "loss_aux_layer_9": 0.06500244140625, "step": 2628, "total_loss": 0.6393932700157166 }, { "epoch": 0.5204909918827955, "grad_norm": 1.1438422203063965, "learning_rate": 5e-05, "llm_loss": 0.5437897443771362, "loss": 2.5465, "loss_aux_layer_0": 0.018341064453125, "loss_aux_layer_1": 0.0391845703125, "loss_aux_layer_10": 0.06793212890625, "loss_aux_layer_11": 0.072265625, "loss_aux_layer_12": 0.0771484375, "loss_aux_layer_13": 0.0833740234375, "loss_aux_layer_14": 0.0927734375, "loss_aux_layer_15": 0.1014404296875, "loss_aux_layer_16": 0.111083984375, "loss_aux_layer_17": 0.1187744140625, "loss_aux_layer_18": 0.127197265625, "loss_aux_layer_19": 0.130615234375, "loss_aux_layer_2": 0.05255126953125, "loss_aux_layer_20": 0.138427734375, "loss_aux_layer_21": 0.147216796875, "loss_aux_layer_22": 0.16943359375, "loss_aux_layer_23": 0.2080078125, "loss_aux_layer_3": 0.06341552734375, "loss_aux_layer_4": 0.06622314453125, "loss_aux_layer_5": 0.06781005859375, "loss_aux_layer_6": 0.0711669921875, "loss_aux_layer_7": 0.06903076171875, "loss_aux_layer_8": 0.06829833984375, "loss_aux_layer_9": 0.06695556640625, "step": 2629, "total_loss": 0.6366309076547623 }, { "epoch": 0.5206889724806969, "grad_norm": 1.1804966926574707, "learning_rate": 5e-05, "llm_loss": 0.598652109503746, "loss": 2.7752, "loss_aux_layer_0": 0.01953125, "loss_aux_layer_1": 0.03887939453125, "loss_aux_layer_10": 0.06951904296875, "loss_aux_layer_11": 0.0740966796875, "loss_aux_layer_12": 0.0794677734375, "loss_aux_layer_13": 0.0855712890625, "loss_aux_layer_14": 0.0953369140625, "loss_aux_layer_15": 0.105224609375, "loss_aux_layer_16": 0.1153564453125, "loss_aux_layer_17": 0.122802734375, "loss_aux_layer_18": 0.1309814453125, "loss_aux_layer_19": 0.134521484375, "loss_aux_layer_2": 0.05352783203125, "loss_aux_layer_20": 0.142333984375, "loss_aux_layer_21": 0.15087890625, "loss_aux_layer_22": 0.1728515625, "loss_aux_layer_23": 0.21240234375, "loss_aux_layer_3": 0.0643310546875, "loss_aux_layer_4": 0.0672607421875, "loss_aux_layer_5": 0.0692138671875, "loss_aux_layer_6": 0.0723876953125, "loss_aux_layer_7": 0.07012939453125, "loss_aux_layer_8": 0.06939697265625, "loss_aux_layer_9": 0.0679931640625, "step": 2630, "total_loss": 0.6938121169805527 }, { "epoch": 0.5208869530785983, "grad_norm": 1.026657223701477, "learning_rate": 5e-05, "llm_loss": 0.6655859053134918, "loss": 3.0213, "loss_aux_layer_0": 0.01800537109375, "loss_aux_layer_1": 0.0380859375, "loss_aux_layer_10": 0.06671142578125, "loss_aux_layer_11": 0.0711669921875, "loss_aux_layer_12": 0.0758056640625, "loss_aux_layer_13": 0.081298828125, "loss_aux_layer_14": 0.08935546875, "loss_aux_layer_15": 0.09765625, "loss_aux_layer_16": 0.1063232421875, "loss_aux_layer_17": 0.1138916015625, "loss_aux_layer_18": 0.122314453125, "loss_aux_layer_19": 0.12548828125, "loss_aux_layer_2": 0.05120849609375, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.06201171875, "loss_aux_layer_4": 0.06500244140625, "loss_aux_layer_5": 0.06671142578125, "loss_aux_layer_6": 0.070068359375, "loss_aux_layer_7": 0.0675048828125, "loss_aux_layer_8": 0.066650390625, "loss_aux_layer_9": 0.0653076171875, "step": 2631, "total_loss": 0.75533527135849 }, { "epoch": 0.5210849336764997, "grad_norm": 1.3712272644042969, "learning_rate": 5e-05, "llm_loss": 0.5606711059808731, "loss": 2.609, "loss_aux_layer_0": 0.018951416015625, "loss_aux_layer_1": 0.03826904296875, "loss_aux_layer_10": 0.0662841796875, "loss_aux_layer_11": 0.0709228515625, "loss_aux_layer_12": 0.076171875, "loss_aux_layer_13": 0.082275390625, "loss_aux_layer_14": 0.0916748046875, "loss_aux_layer_15": 0.1007080078125, "loss_aux_layer_16": 0.110595703125, "loss_aux_layer_17": 0.1182861328125, "loss_aux_layer_18": 0.12646484375, "loss_aux_layer_19": 0.129638671875, "loss_aux_layer_2": 0.05157470703125, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.1455078125, "loss_aux_layer_22": 0.1669921875, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.06207275390625, "loss_aux_layer_4": 0.06494140625, "loss_aux_layer_5": 0.0665283203125, "loss_aux_layer_6": 0.0693359375, "loss_aux_layer_7": 0.067138671875, "loss_aux_layer_8": 0.0665283203125, "loss_aux_layer_9": 0.0650634765625, "step": 2632, "total_loss": 0.6522494107484818 }, { "epoch": 0.5212829142744011, "grad_norm": 1.4188406467437744, "learning_rate": 5e-05, "llm_loss": 0.6124108135700226, "loss": 2.8223, "loss_aux_layer_0": 0.019439697265625, "loss_aux_layer_1": 0.03973388671875, "loss_aux_layer_10": 0.0684814453125, "loss_aux_layer_11": 0.0728759765625, "loss_aux_layer_12": 0.0777587890625, "loss_aux_layer_13": 0.083740234375, "loss_aux_layer_14": 0.0927734375, "loss_aux_layer_15": 0.1015625, "loss_aux_layer_16": 0.1112060546875, "loss_aux_layer_17": 0.119140625, "loss_aux_layer_18": 0.12744140625, "loss_aux_layer_19": 0.13037109375, "loss_aux_layer_2": 0.05377197265625, "loss_aux_layer_20": 0.137939453125, "loss_aux_layer_21": 0.146240234375, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.0643310546875, "loss_aux_layer_4": 0.0670166015625, "loss_aux_layer_5": 0.06878662109375, "loss_aux_layer_6": 0.071533203125, "loss_aux_layer_7": 0.06927490234375, "loss_aux_layer_8": 0.0687255859375, "loss_aux_layer_9": 0.06732177734375, "step": 2633, "total_loss": 0.7055673003196716 }, { "epoch": 0.5214808948723025, "grad_norm": 1.090432047843933, "learning_rate": 5e-05, "llm_loss": 0.45190392434597015, "loss": 2.1745, "loss_aux_layer_0": 0.01898193359375, "loss_aux_layer_1": 0.03802490234375, "loss_aux_layer_10": 0.06719970703125, "loss_aux_layer_11": 0.071533203125, "loss_aux_layer_12": 0.07666015625, "loss_aux_layer_13": 0.08251953125, "loss_aux_layer_14": 0.0914306640625, "loss_aux_layer_15": 0.1005859375, "loss_aux_layer_16": 0.1103515625, "loss_aux_layer_17": 0.118408203125, "loss_aux_layer_18": 0.126708984375, "loss_aux_layer_19": 0.1297607421875, "loss_aux_layer_2": 0.05157470703125, "loss_aux_layer_20": 0.13720703125, "loss_aux_layer_21": 0.14453125, "loss_aux_layer_22": 0.165771484375, "loss_aux_layer_23": 0.203857421875, "loss_aux_layer_3": 0.0625, "loss_aux_layer_4": 0.06536865234375, "loss_aux_layer_5": 0.067138671875, "loss_aux_layer_6": 0.0701904296875, "loss_aux_layer_7": 0.06793212890625, "loss_aux_layer_8": 0.06719970703125, "loss_aux_layer_9": 0.06585693359375, "step": 2634, "total_loss": 0.5436189323663712 }, { "epoch": 0.521678875470204, "grad_norm": 1.4603257179260254, "learning_rate": 5e-05, "llm_loss": 0.548846036195755, "loss": 2.5642, "loss_aux_layer_0": 0.018890380859375, "loss_aux_layer_1": 0.03839111328125, "loss_aux_layer_10": 0.067626953125, "loss_aux_layer_11": 0.07177734375, "loss_aux_layer_12": 0.0762939453125, "loss_aux_layer_13": 0.08251953125, "loss_aux_layer_14": 0.091552734375, "loss_aux_layer_15": 0.1007080078125, "loss_aux_layer_16": 0.1104736328125, "loss_aux_layer_17": 0.1182861328125, "loss_aux_layer_18": 0.1275634765625, "loss_aux_layer_19": 0.130859375, "loss_aux_layer_2": 0.05267333984375, "loss_aux_layer_20": 0.138916015625, "loss_aux_layer_21": 0.146240234375, "loss_aux_layer_22": 0.1669921875, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.0628662109375, "loss_aux_layer_4": 0.06561279296875, "loss_aux_layer_5": 0.0675048828125, "loss_aux_layer_6": 0.0704345703125, "loss_aux_layer_7": 0.0682373046875, "loss_aux_layer_8": 0.0673828125, "loss_aux_layer_9": 0.06634521484375, "step": 2635, "total_loss": 0.6410503685474396 }, { "epoch": 0.5218768560681053, "grad_norm": 1.273038387298584, "learning_rate": 5e-05, "llm_loss": 0.6203988194465637, "loss": 2.8551, "loss_aux_layer_0": 0.018768310546875, "loss_aux_layer_1": 0.03948974609375, "loss_aux_layer_10": 0.068359375, "loss_aux_layer_11": 0.07275390625, "loss_aux_layer_12": 0.07763671875, "loss_aux_layer_13": 0.0836181640625, "loss_aux_layer_14": 0.093017578125, "loss_aux_layer_15": 0.1024169921875, "loss_aux_layer_16": 0.1123046875, "loss_aux_layer_17": 0.1199951171875, "loss_aux_layer_18": 0.1292724609375, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.05352783203125, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.1484375, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.205078125, "loss_aux_layer_3": 0.0645751953125, "loss_aux_layer_4": 0.06689453125, "loss_aux_layer_5": 0.0687255859375, "loss_aux_layer_6": 0.0716552734375, "loss_aux_layer_7": 0.0692138671875, "loss_aux_layer_8": 0.068115234375, "loss_aux_layer_9": 0.06689453125, "step": 2636, "total_loss": 0.7137723565101624 }, { "epoch": 0.5220748366660067, "grad_norm": 1.495668888092041, "learning_rate": 5e-05, "llm_loss": 0.522859200835228, "loss": 2.4733, "loss_aux_layer_0": 0.018707275390625, "loss_aux_layer_1": 0.04058837890625, "loss_aux_layer_10": 0.07080078125, "loss_aux_layer_11": 0.0753173828125, "loss_aux_layer_12": 0.08056640625, "loss_aux_layer_13": 0.0872802734375, "loss_aux_layer_14": 0.0963134765625, "loss_aux_layer_15": 0.10546875, "loss_aux_layer_16": 0.1153564453125, "loss_aux_layer_17": 0.1229248046875, "loss_aux_layer_18": 0.131103515625, "loss_aux_layer_19": 0.13330078125, "loss_aux_layer_2": 0.0550537109375, "loss_aux_layer_20": 0.140869140625, "loss_aux_layer_21": 0.148681640625, "loss_aux_layer_22": 0.170166015625, "loss_aux_layer_23": 0.2080078125, "loss_aux_layer_3": 0.06634521484375, "loss_aux_layer_4": 0.06884765625, "loss_aux_layer_5": 0.0706787109375, "loss_aux_layer_6": 0.0736083984375, "loss_aux_layer_7": 0.0711669921875, "loss_aux_layer_8": 0.0706787109375, "loss_aux_layer_9": 0.0694580078125, "step": 2637, "total_loss": 0.6183258891105652 }, { "epoch": 0.5222728172639082, "grad_norm": 1.6168948411941528, "learning_rate": 5e-05, "llm_loss": 0.6782733500003815, "loss": 3.0794, "loss_aux_layer_0": 0.020050048828125, "loss_aux_layer_1": 0.03814697265625, "loss_aux_layer_10": 0.06787109375, "loss_aux_layer_11": 0.0721435546875, "loss_aux_layer_12": 0.0771484375, "loss_aux_layer_13": 0.0826416015625, "loss_aux_layer_14": 0.0914306640625, "loss_aux_layer_15": 0.100341796875, "loss_aux_layer_16": 0.1097412109375, "loss_aux_layer_17": 0.1175537109375, "loss_aux_layer_18": 0.125244140625, "loss_aux_layer_19": 0.128173828125, "loss_aux_layer_2": 0.05230712890625, "loss_aux_layer_20": 0.135986328125, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.0628662109375, "loss_aux_layer_4": 0.065673828125, "loss_aux_layer_5": 0.067138671875, "loss_aux_layer_6": 0.0699462890625, "loss_aux_layer_7": 0.067626953125, "loss_aux_layer_8": 0.0675048828125, "loss_aux_layer_9": 0.06640625, "step": 2638, "total_loss": 0.7698512077331543 }, { "epoch": 0.5224707978618095, "grad_norm": 1.3752068281173706, "learning_rate": 5e-05, "llm_loss": 0.5598003715276718, "loss": 2.5955, "loss_aux_layer_0": 0.018829345703125, "loss_aux_layer_1": 0.0364990234375, "loss_aux_layer_10": 0.06390380859375, "loss_aux_layer_11": 0.0679931640625, "loss_aux_layer_12": 0.0728759765625, "loss_aux_layer_13": 0.0792236328125, "loss_aux_layer_14": 0.0885009765625, "loss_aux_layer_15": 0.09765625, "loss_aux_layer_16": 0.107421875, "loss_aux_layer_17": 0.1158447265625, "loss_aux_layer_18": 0.1239013671875, "loss_aux_layer_19": 0.127685546875, "loss_aux_layer_2": 0.050048828125, "loss_aux_layer_20": 0.135498046875, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.201416015625, "loss_aux_layer_3": 0.05975341796875, "loss_aux_layer_4": 0.06219482421875, "loss_aux_layer_5": 0.06390380859375, "loss_aux_layer_6": 0.066650390625, "loss_aux_layer_7": 0.064208984375, "loss_aux_layer_8": 0.0634765625, "loss_aux_layer_9": 0.0625, "step": 2639, "total_loss": 0.648883730173111 }, { "epoch": 0.5226687784597109, "grad_norm": 1.147925853729248, "learning_rate": 5e-05, "llm_loss": 0.5831701159477234, "loss": 2.7, "loss_aux_layer_0": 0.0196533203125, "loss_aux_layer_1": 0.03802490234375, "loss_aux_layer_10": 0.0667724609375, "loss_aux_layer_11": 0.071044921875, "loss_aux_layer_12": 0.0760498046875, "loss_aux_layer_13": 0.0821533203125, "loss_aux_layer_14": 0.0909423828125, "loss_aux_layer_15": 0.10009765625, "loss_aux_layer_16": 0.1099853515625, "loss_aux_layer_17": 0.1180419921875, "loss_aux_layer_18": 0.126708984375, "loss_aux_layer_19": 0.130126953125, "loss_aux_layer_2": 0.051513671875, "loss_aux_layer_20": 0.138427734375, "loss_aux_layer_21": 0.146240234375, "loss_aux_layer_22": 0.167724609375, "loss_aux_layer_23": 0.20654296875, "loss_aux_layer_3": 0.0621337890625, "loss_aux_layer_4": 0.0650634765625, "loss_aux_layer_5": 0.0670166015625, "loss_aux_layer_6": 0.0699462890625, "loss_aux_layer_7": 0.067626953125, "loss_aux_layer_8": 0.06689453125, "loss_aux_layer_9": 0.0655517578125, "step": 2640, "total_loss": 0.6749970316886902 }, { "epoch": 0.5228667590576124, "grad_norm": 1.3638283014297485, "learning_rate": 5e-05, "llm_loss": 0.5966368317604065, "loss": 2.7559, "loss_aux_layer_0": 0.018646240234375, "loss_aux_layer_1": 0.0394287109375, "loss_aux_layer_10": 0.067138671875, "loss_aux_layer_11": 0.07177734375, "loss_aux_layer_12": 0.0767822265625, "loss_aux_layer_13": 0.0828857421875, "loss_aux_layer_14": 0.0916748046875, "loss_aux_layer_15": 0.1007080078125, "loss_aux_layer_16": 0.1107177734375, "loss_aux_layer_17": 0.1190185546875, "loss_aux_layer_18": 0.1278076171875, "loss_aux_layer_19": 0.130859375, "loss_aux_layer_2": 0.05303955078125, "loss_aux_layer_20": 0.138427734375, "loss_aux_layer_21": 0.1455078125, "loss_aux_layer_22": 0.166748046875, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.06396484375, "loss_aux_layer_4": 0.0665283203125, "loss_aux_layer_5": 0.06787109375, "loss_aux_layer_6": 0.07080078125, "loss_aux_layer_7": 0.068115234375, "loss_aux_layer_8": 0.0675048828125, "loss_aux_layer_9": 0.06573486328125, "step": 2641, "total_loss": 0.6889682412147522 }, { "epoch": 0.5230647396555138, "grad_norm": 1.025451898574829, "learning_rate": 5e-05, "llm_loss": 0.518770232796669, "loss": 2.4531, "loss_aux_layer_0": 0.019256591796875, "loss_aux_layer_1": 0.03948974609375, "loss_aux_layer_10": 0.0699462890625, "loss_aux_layer_11": 0.074462890625, "loss_aux_layer_12": 0.0794677734375, "loss_aux_layer_13": 0.0855712890625, "loss_aux_layer_14": 0.094482421875, "loss_aux_layer_15": 0.1033935546875, "loss_aux_layer_16": 0.1138916015625, "loss_aux_layer_17": 0.121826171875, "loss_aux_layer_18": 0.1298828125, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.05340576171875, "loss_aux_layer_20": 0.14013671875, "loss_aux_layer_21": 0.148681640625, "loss_aux_layer_22": 0.169921875, "loss_aux_layer_23": 0.2080078125, "loss_aux_layer_3": 0.0643310546875, "loss_aux_layer_4": 0.06756591796875, "loss_aux_layer_5": 0.0697021484375, "loss_aux_layer_6": 0.0731201171875, "loss_aux_layer_7": 0.0706787109375, "loss_aux_layer_8": 0.06982421875, "loss_aux_layer_9": 0.0684814453125, "step": 2642, "total_loss": 0.6132676005363464 }, { "epoch": 0.5232627202534151, "grad_norm": 0.9070179462432861, "learning_rate": 5e-05, "llm_loss": 0.5942181497812271, "loss": 2.7331, "loss_aux_layer_0": 0.018310546875, "loss_aux_layer_1": 0.03753662109375, "loss_aux_layer_10": 0.06396484375, "loss_aux_layer_11": 0.0684814453125, "loss_aux_layer_12": 0.0733642578125, "loss_aux_layer_13": 0.0794677734375, "loss_aux_layer_14": 0.088134765625, "loss_aux_layer_15": 0.09716796875, "loss_aux_layer_16": 0.106689453125, "loss_aux_layer_17": 0.11474609375, "loss_aux_layer_18": 0.123291015625, "loss_aux_layer_19": 0.126708984375, "loss_aux_layer_2": 0.04949951171875, "loss_aux_layer_20": 0.13427734375, "loss_aux_layer_21": 0.142822265625, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.2021484375, "loss_aux_layer_3": 0.0599365234375, "loss_aux_layer_4": 0.06280517578125, "loss_aux_layer_5": 0.064697265625, "loss_aux_layer_6": 0.0673828125, "loss_aux_layer_7": 0.06494140625, "loss_aux_layer_8": 0.064208984375, "loss_aux_layer_9": 0.062744140625, "step": 2643, "total_loss": 0.6832785755395889 }, { "epoch": 0.5234607008513166, "grad_norm": 1.5688366889953613, "learning_rate": 5e-05, "llm_loss": 0.5828232616186142, "loss": 2.7084, "loss_aux_layer_0": 0.0181884765625, "loss_aux_layer_1": 0.0384521484375, "loss_aux_layer_10": 0.0687255859375, "loss_aux_layer_11": 0.07373046875, "loss_aux_layer_12": 0.0789794921875, "loss_aux_layer_13": 0.0849609375, "loss_aux_layer_14": 0.0943603515625, "loss_aux_layer_15": 0.1038818359375, "loss_aux_layer_16": 0.1142578125, "loss_aux_layer_17": 0.1224365234375, "loss_aux_layer_18": 0.13037109375, "loss_aux_layer_19": 0.134033203125, "loss_aux_layer_2": 0.0517578125, "loss_aux_layer_20": 0.14208984375, "loss_aux_layer_21": 0.150634765625, "loss_aux_layer_22": 0.172607421875, "loss_aux_layer_23": 0.21142578125, "loss_aux_layer_3": 0.06304931640625, "loss_aux_layer_4": 0.0660400390625, "loss_aux_layer_5": 0.0679931640625, "loss_aux_layer_6": 0.0711669921875, "loss_aux_layer_7": 0.06884765625, "loss_aux_layer_8": 0.0684814453125, "loss_aux_layer_9": 0.0672607421875, "step": 2644, "total_loss": 0.6770993620157242 }, { "epoch": 0.523658681449218, "grad_norm": 1.2666682004928589, "learning_rate": 5e-05, "llm_loss": 0.5877829045057297, "loss": 2.7173, "loss_aux_layer_0": 0.01959228515625, "loss_aux_layer_1": 0.0380859375, "loss_aux_layer_10": 0.066650390625, "loss_aux_layer_11": 0.07122802734375, "loss_aux_layer_12": 0.076171875, "loss_aux_layer_13": 0.0823974609375, "loss_aux_layer_14": 0.0914306640625, "loss_aux_layer_15": 0.100341796875, "loss_aux_layer_16": 0.1099853515625, "loss_aux_layer_17": 0.1182861328125, "loss_aux_layer_18": 0.1268310546875, "loss_aux_layer_19": 0.130126953125, "loss_aux_layer_2": 0.05145263671875, "loss_aux_layer_20": 0.13720703125, "loss_aux_layer_21": 0.14453125, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.20361328125, "loss_aux_layer_3": 0.0618896484375, "loss_aux_layer_4": 0.0650634765625, "loss_aux_layer_5": 0.067138671875, "loss_aux_layer_6": 0.06982421875, "loss_aux_layer_7": 0.06732177734375, "loss_aux_layer_8": 0.066650390625, "loss_aux_layer_9": 0.06549072265625, "step": 2645, "total_loss": 0.6793186068534851 }, { "epoch": 0.5238566620471193, "grad_norm": 1.0740127563476562, "learning_rate": 5e-05, "llm_loss": 0.6156536787748337, "loss": 2.8243, "loss_aux_layer_0": 0.01971435546875, "loss_aux_layer_1": 0.03863525390625, "loss_aux_layer_10": 0.0672607421875, "loss_aux_layer_11": 0.0712890625, "loss_aux_layer_12": 0.075927734375, "loss_aux_layer_13": 0.0814208984375, "loss_aux_layer_14": 0.0902099609375, "loss_aux_layer_15": 0.0987548828125, "loss_aux_layer_16": 0.1075439453125, "loss_aux_layer_17": 0.1153564453125, "loss_aux_layer_18": 0.1231689453125, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.05206298828125, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.162109375, "loss_aux_layer_23": 0.199462890625, "loss_aux_layer_3": 0.06292724609375, "loss_aux_layer_4": 0.06573486328125, "loss_aux_layer_5": 0.067626953125, "loss_aux_layer_6": 0.07080078125, "loss_aux_layer_7": 0.068115234375, "loss_aux_layer_8": 0.0672607421875, "loss_aux_layer_9": 0.0657958984375, "step": 2646, "total_loss": 0.7060659676790237 }, { "epoch": 0.5240546426450208, "grad_norm": 0.8613045811653137, "learning_rate": 5e-05, "llm_loss": 0.6126193255186081, "loss": 2.8213, "loss_aux_layer_0": 0.02008056640625, "loss_aux_layer_1": 0.0389404296875, "loss_aux_layer_10": 0.0670166015625, "loss_aux_layer_11": 0.0714111328125, "loss_aux_layer_12": 0.076171875, "loss_aux_layer_13": 0.08203125, "loss_aux_layer_14": 0.091796875, "loss_aux_layer_15": 0.1014404296875, "loss_aux_layer_16": 0.1114501953125, "loss_aux_layer_17": 0.119873046875, "loss_aux_layer_18": 0.128173828125, "loss_aux_layer_19": 0.132080078125, "loss_aux_layer_2": 0.05218505859375, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.169921875, "loss_aux_layer_23": 0.208251953125, "loss_aux_layer_3": 0.0626220703125, "loss_aux_layer_4": 0.06573486328125, "loss_aux_layer_5": 0.0675048828125, "loss_aux_layer_6": 0.0701904296875, "loss_aux_layer_7": 0.06787109375, "loss_aux_layer_8": 0.06689453125, "loss_aux_layer_9": 0.06561279296875, "step": 2647, "total_loss": 0.7053340971469879 }, { "epoch": 0.5242526232429222, "grad_norm": 1.093846321105957, "learning_rate": 5e-05, "llm_loss": 0.6402203738689423, "loss": 2.9247, "loss_aux_layer_0": 0.02130126953125, "loss_aux_layer_1": 0.03863525390625, "loss_aux_layer_10": 0.0660400390625, "loss_aux_layer_11": 0.070556640625, "loss_aux_layer_12": 0.0755615234375, "loss_aux_layer_13": 0.0814208984375, "loss_aux_layer_14": 0.0906982421875, "loss_aux_layer_15": 0.0994873046875, "loss_aux_layer_16": 0.1090087890625, "loss_aux_layer_17": 0.1171875, "loss_aux_layer_18": 0.1251220703125, "loss_aux_layer_19": 0.1279296875, "loss_aux_layer_2": 0.0517578125, "loss_aux_layer_20": 0.135498046875, "loss_aux_layer_21": 0.143310546875, "loss_aux_layer_22": 0.164306640625, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.06207275390625, "loss_aux_layer_4": 0.06500244140625, "loss_aux_layer_5": 0.0670166015625, "loss_aux_layer_6": 0.06982421875, "loss_aux_layer_7": 0.0675048828125, "loss_aux_layer_8": 0.06634521484375, "loss_aux_layer_9": 0.0648193359375, "step": 2648, "total_loss": 0.7311636954545975 }, { "epoch": 0.5244506038408236, "grad_norm": 1.1690764427185059, "learning_rate": 5e-05, "llm_loss": 0.5690418183803558, "loss": 2.6441, "loss_aux_layer_0": 0.01910400390625, "loss_aux_layer_1": 0.03887939453125, "loss_aux_layer_10": 0.06646728515625, "loss_aux_layer_11": 0.0709228515625, "loss_aux_layer_12": 0.076171875, "loss_aux_layer_13": 0.0823974609375, "loss_aux_layer_14": 0.0921630859375, "loss_aux_layer_15": 0.10107421875, "loss_aux_layer_16": 0.1109619140625, "loss_aux_layer_17": 0.1195068359375, "loss_aux_layer_18": 0.128173828125, "loss_aux_layer_19": 0.131103515625, "loss_aux_layer_2": 0.05267333984375, "loss_aux_layer_20": 0.13916015625, "loss_aux_layer_21": 0.146240234375, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.062744140625, "loss_aux_layer_4": 0.06536865234375, "loss_aux_layer_5": 0.06689453125, "loss_aux_layer_6": 0.069580078125, "loss_aux_layer_7": 0.067138671875, "loss_aux_layer_8": 0.06646728515625, "loss_aux_layer_9": 0.06500244140625, "step": 2649, "total_loss": 0.6610279977321625 }, { "epoch": 0.524648584438725, "grad_norm": 1.0297662019729614, "learning_rate": 5e-05, "llm_loss": 0.5189007967710495, "loss": 2.4347, "loss_aux_layer_0": 0.020721435546875, "loss_aux_layer_1": 0.0377197265625, "loss_aux_layer_10": 0.06494140625, "loss_aux_layer_11": 0.0692138671875, "loss_aux_layer_12": 0.0738525390625, "loss_aux_layer_13": 0.0799560546875, "loss_aux_layer_14": 0.0897216796875, "loss_aux_layer_15": 0.0982666015625, "loss_aux_layer_16": 0.1080322265625, "loss_aux_layer_17": 0.1156005859375, "loss_aux_layer_18": 0.1240234375, "loss_aux_layer_19": 0.127197265625, "loss_aux_layer_2": 0.0498046875, "loss_aux_layer_20": 0.13525390625, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.06024169921875, "loss_aux_layer_4": 0.0628662109375, "loss_aux_layer_5": 0.06451416015625, "loss_aux_layer_6": 0.067626953125, "loss_aux_layer_7": 0.0653076171875, "loss_aux_layer_8": 0.064697265625, "loss_aux_layer_9": 0.06329345703125, "step": 2650, "total_loss": 0.608672246336937 }, { "epoch": 0.5248465650366264, "grad_norm": 1.131057858467102, "learning_rate": 5e-05, "llm_loss": 0.6527578085660934, "loss": 2.9752, "loss_aux_layer_0": 0.02099609375, "loss_aux_layer_1": 0.03936767578125, "loss_aux_layer_10": 0.06683349609375, "loss_aux_layer_11": 0.0711669921875, "loss_aux_layer_12": 0.0755615234375, "loss_aux_layer_13": 0.08154296875, "loss_aux_layer_14": 0.0904541015625, "loss_aux_layer_15": 0.09912109375, "loss_aux_layer_16": 0.109375, "loss_aux_layer_17": 0.1171875, "loss_aux_layer_18": 0.125, "loss_aux_layer_19": 0.128173828125, "loss_aux_layer_2": 0.05242919921875, "loss_aux_layer_20": 0.1357421875, "loss_aux_layer_21": 0.142822265625, "loss_aux_layer_22": 0.16357421875, "loss_aux_layer_23": 0.20068359375, "loss_aux_layer_3": 0.06268310546875, "loss_aux_layer_4": 0.06536865234375, "loss_aux_layer_5": 0.0670166015625, "loss_aux_layer_6": 0.06976318359375, "loss_aux_layer_7": 0.06756591796875, "loss_aux_layer_8": 0.06689453125, "loss_aux_layer_9": 0.0655517578125, "step": 2651, "total_loss": 0.7437938898801804 }, { "epoch": 0.5250445456345278, "grad_norm": 0.8849340081214905, "learning_rate": 5e-05, "llm_loss": 0.6155584454536438, "loss": 2.8334, "loss_aux_layer_0": 0.01983642578125, "loss_aux_layer_1": 0.038818359375, "loss_aux_layer_10": 0.0684814453125, "loss_aux_layer_11": 0.0731201171875, "loss_aux_layer_12": 0.0780029296875, "loss_aux_layer_13": 0.0843505859375, "loss_aux_layer_14": 0.093505859375, "loss_aux_layer_15": 0.1024169921875, "loss_aux_layer_16": 0.11181640625, "loss_aux_layer_17": 0.1195068359375, "loss_aux_layer_18": 0.12744140625, "loss_aux_layer_19": 0.1300048828125, "loss_aux_layer_2": 0.05291748046875, "loss_aux_layer_20": 0.137939453125, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.205078125, "loss_aux_layer_3": 0.06317138671875, "loss_aux_layer_4": 0.0660400390625, "loss_aux_layer_5": 0.06793212890625, "loss_aux_layer_6": 0.07080078125, "loss_aux_layer_7": 0.06866455078125, "loss_aux_layer_8": 0.0679931640625, "loss_aux_layer_9": 0.06689453125, "step": 2652, "total_loss": 0.7083598524332047 }, { "epoch": 0.5252425262324292, "grad_norm": 0.8394860029220581, "learning_rate": 5e-05, "llm_loss": 0.540286660194397, "loss": 2.5334, "loss_aux_layer_0": 0.0194091796875, "loss_aux_layer_1": 0.03985595703125, "loss_aux_layer_10": 0.06884765625, "loss_aux_layer_11": 0.0738525390625, "loss_aux_layer_12": 0.0789794921875, "loss_aux_layer_13": 0.0853271484375, "loss_aux_layer_14": 0.09375, "loss_aux_layer_15": 0.1029052734375, "loss_aux_layer_16": 0.1123046875, "loss_aux_layer_17": 0.119873046875, "loss_aux_layer_18": 0.1273193359375, "loss_aux_layer_19": 0.130126953125, "loss_aux_layer_2": 0.05279541015625, "loss_aux_layer_20": 0.137939453125, "loss_aux_layer_21": 0.145751953125, "loss_aux_layer_22": 0.16650390625, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.06353759765625, "loss_aux_layer_4": 0.066162109375, "loss_aux_layer_5": 0.0679931640625, "loss_aux_layer_6": 0.071044921875, "loss_aux_layer_7": 0.0689697265625, "loss_aux_layer_8": 0.0684814453125, "loss_aux_layer_9": 0.0673828125, "step": 2653, "total_loss": 0.6333622187376022 }, { "epoch": 0.5254405068303306, "grad_norm": 0.9614883661270142, "learning_rate": 5e-05, "llm_loss": 0.5680904686450958, "loss": 2.6254, "loss_aux_layer_0": 0.02117919921875, "loss_aux_layer_1": 0.03729248046875, "loss_aux_layer_10": 0.06329345703125, "loss_aux_layer_11": 0.0673828125, "loss_aux_layer_12": 0.072265625, "loss_aux_layer_13": 0.077880859375, "loss_aux_layer_14": 0.0867919921875, "loss_aux_layer_15": 0.0960693359375, "loss_aux_layer_16": 0.10595703125, "loss_aux_layer_17": 0.1141357421875, "loss_aux_layer_18": 0.12255859375, "loss_aux_layer_19": 0.12646484375, "loss_aux_layer_2": 0.04901123046875, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.142333984375, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.201416015625, "loss_aux_layer_3": 0.05902099609375, "loss_aux_layer_4": 0.06134033203125, "loss_aux_layer_5": 0.0628662109375, "loss_aux_layer_6": 0.06561279296875, "loss_aux_layer_7": 0.06353759765625, "loss_aux_layer_8": 0.062744140625, "loss_aux_layer_9": 0.06182861328125, "step": 2654, "total_loss": 0.656357616186142 }, { "epoch": 0.525638487428232, "grad_norm": 0.7588055729866028, "learning_rate": 5e-05, "llm_loss": 0.5379864126443863, "loss": 2.5283, "loss_aux_layer_0": 0.0189208984375, "loss_aux_layer_1": 0.04052734375, "loss_aux_layer_10": 0.06982421875, "loss_aux_layer_11": 0.07470703125, "loss_aux_layer_12": 0.07958984375, "loss_aux_layer_13": 0.0858154296875, "loss_aux_layer_14": 0.0947265625, "loss_aux_layer_15": 0.1031494140625, "loss_aux_layer_16": 0.1123046875, "loss_aux_layer_17": 0.1195068359375, "loss_aux_layer_18": 0.1273193359375, "loss_aux_layer_19": 0.1304931640625, "loss_aux_layer_2": 0.0543212890625, "loss_aux_layer_20": 0.138671875, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.168701171875, "loss_aux_layer_23": 0.20751953125, "loss_aux_layer_3": 0.06536865234375, "loss_aux_layer_4": 0.06817626953125, "loss_aux_layer_5": 0.06982421875, "loss_aux_layer_6": 0.07275390625, "loss_aux_layer_7": 0.070556640625, "loss_aux_layer_8": 0.06988525390625, "loss_aux_layer_9": 0.06854248046875, "step": 2655, "total_loss": 0.632071778178215 }, { "epoch": 0.5258364680261335, "grad_norm": 0.9356706738471985, "learning_rate": 5e-05, "llm_loss": 0.6955376863479614, "loss": 3.1475, "loss_aux_layer_0": 0.02093505859375, "loss_aux_layer_1": 0.040283203125, "loss_aux_layer_10": 0.0677490234375, "loss_aux_layer_11": 0.0721435546875, "loss_aux_layer_12": 0.07666015625, "loss_aux_layer_13": 0.0823974609375, "loss_aux_layer_14": 0.0914306640625, "loss_aux_layer_15": 0.100341796875, "loss_aux_layer_16": 0.109375, "loss_aux_layer_17": 0.1173095703125, "loss_aux_layer_18": 0.12451171875, "loss_aux_layer_19": 0.1270751953125, "loss_aux_layer_2": 0.05328369140625, "loss_aux_layer_20": 0.13427734375, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.06396484375, "loss_aux_layer_4": 0.06695556640625, "loss_aux_layer_5": 0.0684814453125, "loss_aux_layer_6": 0.0714111328125, "loss_aux_layer_7": 0.06884765625, "loss_aux_layer_8": 0.068115234375, "loss_aux_layer_9": 0.0665283203125, "step": 2656, "total_loss": 0.7868632674217224 }, { "epoch": 0.5260344486240348, "grad_norm": 0.9042444229125977, "learning_rate": 5e-05, "llm_loss": 0.5679133012890816, "loss": 2.6342, "loss_aux_layer_0": 0.019927978515625, "loss_aux_layer_1": 0.03912353515625, "loss_aux_layer_10": 0.06689453125, "loss_aux_layer_11": 0.0712890625, "loss_aux_layer_12": 0.0760498046875, "loss_aux_layer_13": 0.08154296875, "loss_aux_layer_14": 0.090087890625, "loss_aux_layer_15": 0.0986328125, "loss_aux_layer_16": 0.1077880859375, "loss_aux_layer_17": 0.1153564453125, "loss_aux_layer_18": 0.1239013671875, "loss_aux_layer_19": 0.1263427734375, "loss_aux_layer_2": 0.0523681640625, "loss_aux_layer_20": 0.1337890625, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.20068359375, "loss_aux_layer_3": 0.06268310546875, "loss_aux_layer_4": 0.0655517578125, "loss_aux_layer_5": 0.0675048828125, "loss_aux_layer_6": 0.0704345703125, "loss_aux_layer_7": 0.06787109375, "loss_aux_layer_8": 0.0670166015625, "loss_aux_layer_9": 0.0657958984375, "step": 2657, "total_loss": 0.6585523933172226 }, { "epoch": 0.5262324292219362, "grad_norm": 0.937308669090271, "learning_rate": 5e-05, "llm_loss": 0.5738498792052269, "loss": 2.6478, "loss_aux_layer_0": 0.019561767578125, "loss_aux_layer_1": 0.0364990234375, "loss_aux_layer_10": 0.0628662109375, "loss_aux_layer_11": 0.06707763671875, "loss_aux_layer_12": 0.0721435546875, "loss_aux_layer_13": 0.077880859375, "loss_aux_layer_14": 0.0872802734375, "loss_aux_layer_15": 0.0960693359375, "loss_aux_layer_16": 0.1063232421875, "loss_aux_layer_17": 0.1142578125, "loss_aux_layer_18": 0.1224365234375, "loss_aux_layer_19": 0.1265869140625, "loss_aux_layer_2": 0.0489501953125, "loss_aux_layer_20": 0.134765625, "loss_aux_layer_21": 0.142822265625, "loss_aux_layer_22": 0.163330078125, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.05859375, "loss_aux_layer_4": 0.06103515625, "loss_aux_layer_5": 0.06256103515625, "loss_aux_layer_6": 0.0653076171875, "loss_aux_layer_7": 0.06298828125, "loss_aux_layer_8": 0.0625, "loss_aux_layer_9": 0.06146240234375, "step": 2658, "total_loss": 0.6619520336389542 }, { "epoch": 0.5264304098198377, "grad_norm": 1.2031949758529663, "learning_rate": 5e-05, "llm_loss": 0.6015787422657013, "loss": 2.783, "loss_aux_layer_0": 0.01861572265625, "loss_aux_layer_1": 0.04058837890625, "loss_aux_layer_10": 0.0704345703125, "loss_aux_layer_11": 0.0750732421875, "loss_aux_layer_12": 0.080078125, "loss_aux_layer_13": 0.0860595703125, "loss_aux_layer_14": 0.0950927734375, "loss_aux_layer_15": 0.10400390625, "loss_aux_layer_16": 0.1138916015625, "loss_aux_layer_17": 0.12158203125, "loss_aux_layer_18": 0.130126953125, "loss_aux_layer_19": 0.132080078125, "loss_aux_layer_2": 0.0537109375, "loss_aux_layer_20": 0.13916015625, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.165283203125, "loss_aux_layer_23": 0.201416015625, "loss_aux_layer_3": 0.06512451171875, "loss_aux_layer_4": 0.06842041015625, "loss_aux_layer_5": 0.0703125, "loss_aux_layer_6": 0.0732421875, "loss_aux_layer_7": 0.071044921875, "loss_aux_layer_8": 0.0701904296875, "loss_aux_layer_9": 0.0687255859375, "step": 2659, "total_loss": 0.6957601308822632 }, { "epoch": 0.526628390417739, "grad_norm": 1.00339937210083, "learning_rate": 5e-05, "llm_loss": 0.5395667403936386, "loss": 2.5249, "loss_aux_layer_0": 0.01953125, "loss_aux_layer_1": 0.038818359375, "loss_aux_layer_10": 0.06549072265625, "loss_aux_layer_11": 0.06982421875, "loss_aux_layer_12": 0.0748291015625, "loss_aux_layer_13": 0.0806884765625, "loss_aux_layer_14": 0.0899658203125, "loss_aux_layer_15": 0.099609375, "loss_aux_layer_16": 0.110107421875, "loss_aux_layer_17": 0.1182861328125, "loss_aux_layer_18": 0.126708984375, "loss_aux_layer_19": 0.13037109375, "loss_aux_layer_2": 0.05194091796875, "loss_aux_layer_20": 0.138916015625, "loss_aux_layer_21": 0.147705078125, "loss_aux_layer_22": 0.171142578125, "loss_aux_layer_23": 0.210205078125, "loss_aux_layer_3": 0.06170654296875, "loss_aux_layer_4": 0.064453125, "loss_aux_layer_5": 0.0662841796875, "loss_aux_layer_6": 0.06842041015625, "loss_aux_layer_7": 0.06634521484375, "loss_aux_layer_8": 0.0655517578125, "loss_aux_layer_9": 0.06390380859375, "step": 2660, "total_loss": 0.631231352686882 }, { "epoch": 0.5268263710156404, "grad_norm": 1.3032031059265137, "learning_rate": 5e-05, "llm_loss": 0.6141183376312256, "loss": 2.823, "loss_aux_layer_0": 0.01837158203125, "loss_aux_layer_1": 0.0382080078125, "loss_aux_layer_10": 0.066650390625, "loss_aux_layer_11": 0.0711669921875, "loss_aux_layer_12": 0.076416015625, "loss_aux_layer_13": 0.0828857421875, "loss_aux_layer_14": 0.092041015625, "loss_aux_layer_15": 0.10107421875, "loss_aux_layer_16": 0.1109619140625, "loss_aux_layer_17": 0.1190185546875, "loss_aux_layer_18": 0.12744140625, "loss_aux_layer_19": 0.13037109375, "loss_aux_layer_2": 0.05157470703125, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.1455078125, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.06201171875, "loss_aux_layer_4": 0.064697265625, "loss_aux_layer_5": 0.06640625, "loss_aux_layer_6": 0.0689697265625, "loss_aux_layer_7": 0.066650390625, "loss_aux_layer_8": 0.0662841796875, "loss_aux_layer_9": 0.0653076171875, "step": 2661, "total_loss": 0.7057432234287262 }, { "epoch": 0.5270243516135419, "grad_norm": 1.3886884450912476, "learning_rate": 5e-05, "llm_loss": 0.6273043900728226, "loss": 2.8753, "loss_aux_layer_0": 0.01922607421875, "loss_aux_layer_1": 0.0384521484375, "loss_aux_layer_10": 0.06646728515625, "loss_aux_layer_11": 0.07080078125, "loss_aux_layer_12": 0.075927734375, "loss_aux_layer_13": 0.0821533203125, "loss_aux_layer_14": 0.091064453125, "loss_aux_layer_15": 0.100341796875, "loss_aux_layer_16": 0.110107421875, "loss_aux_layer_17": 0.1185302734375, "loss_aux_layer_18": 0.1268310546875, "loss_aux_layer_19": 0.1297607421875, "loss_aux_layer_2": 0.05169677734375, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.1455078125, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.2060546875, "loss_aux_layer_3": 0.06182861328125, "loss_aux_layer_4": 0.0643310546875, "loss_aux_layer_5": 0.06591796875, "loss_aux_layer_6": 0.06903076171875, "loss_aux_layer_7": 0.0667724609375, "loss_aux_layer_8": 0.06597900390625, "loss_aux_layer_9": 0.06475830078125, "step": 2662, "total_loss": 0.7188246697187424 }, { "epoch": 0.5272223322114433, "grad_norm": 1.0865817070007324, "learning_rate": 5e-05, "llm_loss": 0.6447484940290451, "loss": 2.9444, "loss_aux_layer_0": 0.02117919921875, "loss_aux_layer_1": 0.03900146484375, "loss_aux_layer_10": 0.0657958984375, "loss_aux_layer_11": 0.0701904296875, "loss_aux_layer_12": 0.0750732421875, "loss_aux_layer_13": 0.0810546875, "loss_aux_layer_14": 0.090087890625, "loss_aux_layer_15": 0.0994873046875, "loss_aux_layer_16": 0.109375, "loss_aux_layer_17": 0.117431640625, "loss_aux_layer_18": 0.1259765625, "loss_aux_layer_19": 0.129638671875, "loss_aux_layer_2": 0.05242919921875, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.145751953125, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.204833984375, "loss_aux_layer_3": 0.06207275390625, "loss_aux_layer_4": 0.06475830078125, "loss_aux_layer_5": 0.066650390625, "loss_aux_layer_6": 0.069091796875, "loss_aux_layer_7": 0.06689453125, "loss_aux_layer_8": 0.0660400390625, "loss_aux_layer_9": 0.0645751953125, "step": 2663, "total_loss": 0.7360920459032059 }, { "epoch": 0.5274203128093446, "grad_norm": 1.2938297986984253, "learning_rate": 5e-05, "llm_loss": 0.5065843090415001, "loss": 2.3918, "loss_aux_layer_0": 0.01934814453125, "loss_aux_layer_1": 0.03851318359375, "loss_aux_layer_10": 0.0657958984375, "loss_aux_layer_11": 0.0701904296875, "loss_aux_layer_12": 0.0750732421875, "loss_aux_layer_13": 0.0810546875, "loss_aux_layer_14": 0.0904541015625, "loss_aux_layer_15": 0.0994873046875, "loss_aux_layer_16": 0.109375, "loss_aux_layer_17": 0.1177978515625, "loss_aux_layer_18": 0.126708984375, "loss_aux_layer_19": 0.1298828125, "loss_aux_layer_2": 0.05181884765625, "loss_aux_layer_20": 0.13818359375, "loss_aux_layer_21": 0.146240234375, "loss_aux_layer_22": 0.16748046875, "loss_aux_layer_23": 0.2060546875, "loss_aux_layer_3": 0.06195068359375, "loss_aux_layer_4": 0.0645751953125, "loss_aux_layer_5": 0.0665283203125, "loss_aux_layer_6": 0.0689697265625, "loss_aux_layer_7": 0.066650390625, "loss_aux_layer_8": 0.0660400390625, "loss_aux_layer_9": 0.06451416015625, "step": 2664, "total_loss": 0.5979461371898651 }, { "epoch": 0.5276182934072461, "grad_norm": 1.5089964866638184, "learning_rate": 5e-05, "llm_loss": 0.6130790114402771, "loss": 2.8243, "loss_aux_layer_0": 0.019622802734375, "loss_aux_layer_1": 0.03863525390625, "loss_aux_layer_10": 0.067138671875, "loss_aux_layer_11": 0.072021484375, "loss_aux_layer_12": 0.0770263671875, "loss_aux_layer_13": 0.0831298828125, "loss_aux_layer_14": 0.092529296875, "loss_aux_layer_15": 0.1021728515625, "loss_aux_layer_16": 0.112548828125, "loss_aux_layer_17": 0.1202392578125, "loss_aux_layer_18": 0.12890625, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.05291748046875, "loss_aux_layer_20": 0.14013671875, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.16943359375, "loss_aux_layer_23": 0.20849609375, "loss_aux_layer_3": 0.06317138671875, "loss_aux_layer_4": 0.065673828125, "loss_aux_layer_5": 0.0672607421875, "loss_aux_layer_6": 0.0699462890625, "loss_aux_layer_7": 0.067626953125, "loss_aux_layer_8": 0.06689453125, "loss_aux_layer_9": 0.06549072265625, "step": 2665, "total_loss": 0.7060808688402176 }, { "epoch": 0.5278162740051475, "grad_norm": 1.150591254234314, "learning_rate": 5e-05, "llm_loss": 0.6500707417726517, "loss": 2.969, "loss_aux_layer_0": 0.018707275390625, "loss_aux_layer_1": 0.03912353515625, "loss_aux_layer_10": 0.068115234375, "loss_aux_layer_11": 0.0726318359375, "loss_aux_layer_12": 0.077392578125, "loss_aux_layer_13": 0.08349609375, "loss_aux_layer_14": 0.092529296875, "loss_aux_layer_15": 0.1015625, "loss_aux_layer_16": 0.111083984375, "loss_aux_layer_17": 0.1185302734375, "loss_aux_layer_18": 0.1270751953125, "loss_aux_layer_19": 0.130126953125, "loss_aux_layer_2": 0.05194091796875, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.165771484375, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.0623779296875, "loss_aux_layer_4": 0.06524658203125, "loss_aux_layer_5": 0.0670166015625, "loss_aux_layer_6": 0.0704345703125, "loss_aux_layer_7": 0.0682373046875, "loss_aux_layer_8": 0.068115234375, "loss_aux_layer_9": 0.06671142578125, "step": 2666, "total_loss": 0.742243081331253 }, { "epoch": 0.5280142546030488, "grad_norm": 1.2618913650512695, "learning_rate": 5e-05, "llm_loss": 0.5782425105571747, "loss": 2.6882, "loss_aux_layer_0": 0.01837158203125, "loss_aux_layer_1": 0.03875732421875, "loss_aux_layer_10": 0.0699462890625, "loss_aux_layer_11": 0.07421875, "loss_aux_layer_12": 0.0791015625, "loss_aux_layer_13": 0.085205078125, "loss_aux_layer_14": 0.09423828125, "loss_aux_layer_15": 0.1036376953125, "loss_aux_layer_16": 0.11328125, "loss_aux_layer_17": 0.121337890625, "loss_aux_layer_18": 0.1297607421875, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.0528564453125, "loss_aux_layer_20": 0.139404296875, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.205810546875, "loss_aux_layer_3": 0.0634765625, "loss_aux_layer_4": 0.066650390625, "loss_aux_layer_5": 0.068603515625, "loss_aux_layer_6": 0.0718994140625, "loss_aux_layer_7": 0.069580078125, "loss_aux_layer_8": 0.069091796875, "loss_aux_layer_9": 0.068115234375, "step": 2667, "total_loss": 0.6720516383647919 }, { "epoch": 0.5282122352009503, "grad_norm": 1.1378743648529053, "learning_rate": 5e-05, "llm_loss": 0.5667004734277725, "loss": 2.6309, "loss_aux_layer_0": 0.018585205078125, "loss_aux_layer_1": 0.03753662109375, "loss_aux_layer_10": 0.0660400390625, "loss_aux_layer_11": 0.0701904296875, "loss_aux_layer_12": 0.0748291015625, "loss_aux_layer_13": 0.0810546875, "loss_aux_layer_14": 0.09033203125, "loss_aux_layer_15": 0.0994873046875, "loss_aux_layer_16": 0.109130859375, "loss_aux_layer_17": 0.1175537109375, "loss_aux_layer_18": 0.1259765625, "loss_aux_layer_19": 0.1298828125, "loss_aux_layer_2": 0.05084228515625, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.1455078125, "loss_aux_layer_22": 0.166748046875, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.0614013671875, "loss_aux_layer_4": 0.06463623046875, "loss_aux_layer_5": 0.066162109375, "loss_aux_layer_6": 0.069091796875, "loss_aux_layer_7": 0.066650390625, "loss_aux_layer_8": 0.06597900390625, "loss_aux_layer_9": 0.0645751953125, "step": 2668, "total_loss": 0.6577229797840118 }, { "epoch": 0.5284102157988517, "grad_norm": 1.0820894241333008, "learning_rate": 5e-05, "llm_loss": 0.6202670931816101, "loss": 2.8399, "loss_aux_layer_0": 0.01922607421875, "loss_aux_layer_1": 0.03668212890625, "loss_aux_layer_10": 0.0640869140625, "loss_aux_layer_11": 0.0682373046875, "loss_aux_layer_12": 0.072998046875, "loss_aux_layer_13": 0.0794677734375, "loss_aux_layer_14": 0.0889892578125, "loss_aux_layer_15": 0.098388671875, "loss_aux_layer_16": 0.1083984375, "loss_aux_layer_17": 0.116455078125, "loss_aux_layer_18": 0.125, "loss_aux_layer_19": 0.12939453125, "loss_aux_layer_2": 0.04925537109375, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.146240234375, "loss_aux_layer_22": 0.166748046875, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.05902099609375, "loss_aux_layer_4": 0.06182861328125, "loss_aux_layer_5": 0.06365966796875, "loss_aux_layer_6": 0.066650390625, "loss_aux_layer_7": 0.06427001953125, "loss_aux_layer_8": 0.06390380859375, "loss_aux_layer_9": 0.062744140625, "step": 2669, "total_loss": 0.7099795341491699 }, { "epoch": 0.5286081963967532, "grad_norm": 1.3581281900405884, "learning_rate": 5e-05, "llm_loss": 0.6287106275558472, "loss": 2.858, "loss_aux_layer_0": 0.01873779296875, "loss_aux_layer_1": 0.035369873046875, "loss_aux_layer_10": 0.0609130859375, "loss_aux_layer_11": 0.0648193359375, "loss_aux_layer_12": 0.0697021484375, "loss_aux_layer_13": 0.0753173828125, "loss_aux_layer_14": 0.083984375, "loss_aux_layer_15": 0.0927734375, "loss_aux_layer_16": 0.1025390625, "loss_aux_layer_17": 0.1103515625, "loss_aux_layer_18": 0.1185302734375, "loss_aux_layer_19": 0.122314453125, "loss_aux_layer_2": 0.04766845703125, "loss_aux_layer_20": 0.13037109375, "loss_aux_layer_21": 0.138916015625, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.19970703125, "loss_aux_layer_3": 0.05712890625, "loss_aux_layer_4": 0.05963134765625, "loss_aux_layer_5": 0.06134033203125, "loss_aux_layer_6": 0.064208984375, "loss_aux_layer_7": 0.0621337890625, "loss_aux_layer_8": 0.06134033203125, "loss_aux_layer_9": 0.05987548828125, "step": 2670, "total_loss": 0.7145071476697922 }, { "epoch": 0.5288061769946545, "grad_norm": 0.8723993897438049, "learning_rate": 5e-05, "llm_loss": 0.6013958007097244, "loss": 2.7693, "loss_aux_layer_0": 0.0186767578125, "loss_aux_layer_1": 0.038818359375, "loss_aux_layer_10": 0.06744384765625, "loss_aux_layer_11": 0.07177734375, "loss_aux_layer_12": 0.07666015625, "loss_aux_layer_13": 0.0830078125, "loss_aux_layer_14": 0.091796875, "loss_aux_layer_15": 0.10009765625, "loss_aux_layer_16": 0.1094970703125, "loss_aux_layer_17": 0.116943359375, "loss_aux_layer_18": 0.125, "loss_aux_layer_19": 0.127685546875, "loss_aux_layer_2": 0.05218505859375, "loss_aux_layer_20": 0.134521484375, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.162109375, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.0625, "loss_aux_layer_4": 0.0653076171875, "loss_aux_layer_5": 0.06689453125, "loss_aux_layer_6": 0.0699462890625, "loss_aux_layer_7": 0.06781005859375, "loss_aux_layer_8": 0.0673828125, "loss_aux_layer_9": 0.06573486328125, "step": 2671, "total_loss": 0.6923259645700455 }, { "epoch": 0.5290041575925559, "grad_norm": 0.9421612024307251, "learning_rate": 5e-05, "llm_loss": 0.5814016908407211, "loss": 2.6933, "loss_aux_layer_0": 0.018707275390625, "loss_aux_layer_1": 0.03936767578125, "loss_aux_layer_10": 0.0672607421875, "loss_aux_layer_11": 0.071533203125, "loss_aux_layer_12": 0.07666015625, "loss_aux_layer_13": 0.0823974609375, "loss_aux_layer_14": 0.091064453125, "loss_aux_layer_15": 0.0994873046875, "loss_aux_layer_16": 0.1090087890625, "loss_aux_layer_17": 0.11669921875, "loss_aux_layer_18": 0.12548828125, "loss_aux_layer_19": 0.12890625, "loss_aux_layer_2": 0.0533447265625, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.146240234375, "loss_aux_layer_22": 0.167724609375, "loss_aux_layer_23": 0.205078125, "loss_aux_layer_3": 0.0635986328125, "loss_aux_layer_4": 0.06640625, "loss_aux_layer_5": 0.067626953125, "loss_aux_layer_6": 0.070556640625, "loss_aux_layer_7": 0.0682373046875, "loss_aux_layer_8": 0.0675048828125, "loss_aux_layer_9": 0.0660400390625, "step": 2672, "total_loss": 0.6733127012848854 }, { "epoch": 0.5292021381904574, "grad_norm": 1.0719228982925415, "learning_rate": 5e-05, "llm_loss": 0.6292765736579895, "loss": 2.8837, "loss_aux_layer_0": 0.0194091796875, "loss_aux_layer_1": 0.037841796875, "loss_aux_layer_10": 0.0665283203125, "loss_aux_layer_11": 0.0709228515625, "loss_aux_layer_12": 0.076171875, "loss_aux_layer_13": 0.082275390625, "loss_aux_layer_14": 0.0916748046875, "loss_aux_layer_15": 0.1007080078125, "loss_aux_layer_16": 0.110595703125, "loss_aux_layer_17": 0.1181640625, "loss_aux_layer_18": 0.126220703125, "loss_aux_layer_19": 0.12939453125, "loss_aux_layer_2": 0.05224609375, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.16650390625, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.06256103515625, "loss_aux_layer_4": 0.06494140625, "loss_aux_layer_5": 0.0667724609375, "loss_aux_layer_6": 0.070068359375, "loss_aux_layer_7": 0.067626953125, "loss_aux_layer_8": 0.0667724609375, "loss_aux_layer_9": 0.065185546875, "step": 2673, "total_loss": 0.7209138572216034 }, { "epoch": 0.5294001187883587, "grad_norm": 0.9764331579208374, "learning_rate": 5e-05, "llm_loss": 0.5761348605155945, "loss": 2.6677, "loss_aux_layer_0": 0.019866943359375, "loss_aux_layer_1": 0.039306640625, "loss_aux_layer_10": 0.065185546875, "loss_aux_layer_11": 0.069580078125, "loss_aux_layer_12": 0.0743408203125, "loss_aux_layer_13": 0.0804443359375, "loss_aux_layer_14": 0.08935546875, "loss_aux_layer_15": 0.09814453125, "loss_aux_layer_16": 0.1077880859375, "loss_aux_layer_17": 0.115234375, "loss_aux_layer_18": 0.123779296875, "loss_aux_layer_19": 0.127685546875, "loss_aux_layer_2": 0.05303955078125, "loss_aux_layer_20": 0.13623046875, "loss_aux_layer_21": 0.1455078125, "loss_aux_layer_22": 0.16748046875, "loss_aux_layer_23": 0.20654296875, "loss_aux_layer_3": 0.06292724609375, "loss_aux_layer_4": 0.06494140625, "loss_aux_layer_5": 0.06634521484375, "loss_aux_layer_6": 0.0692138671875, "loss_aux_layer_7": 0.0665283203125, "loss_aux_layer_8": 0.06549072265625, "loss_aux_layer_9": 0.06402587890625, "step": 2674, "total_loss": 0.6669194400310516 }, { "epoch": 0.5295980993862601, "grad_norm": 0.8961948752403259, "learning_rate": 5e-05, "llm_loss": 0.5506050139665604, "loss": 2.5541, "loss_aux_layer_0": 0.0185546875, "loss_aux_layer_1": 0.03692626953125, "loss_aux_layer_10": 0.0633544921875, "loss_aux_layer_11": 0.0673828125, "loss_aux_layer_12": 0.0721435546875, "loss_aux_layer_13": 0.078125, "loss_aux_layer_14": 0.0870361328125, "loss_aux_layer_15": 0.09619140625, "loss_aux_layer_16": 0.106201171875, "loss_aux_layer_17": 0.114013671875, "loss_aux_layer_18": 0.122802734375, "loss_aux_layer_19": 0.126220703125, "loss_aux_layer_2": 0.049072265625, "loss_aux_layer_20": 0.133544921875, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.160888671875, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.058837890625, "loss_aux_layer_4": 0.06134033203125, "loss_aux_layer_5": 0.06304931640625, "loss_aux_layer_6": 0.065673828125, "loss_aux_layer_7": 0.06353759765625, "loss_aux_layer_8": 0.06292724609375, "loss_aux_layer_9": 0.06195068359375, "step": 2675, "total_loss": 0.6385366320610046 }, { "epoch": 0.5297960799841616, "grad_norm": 1.0250574350357056, "learning_rate": 5e-05, "llm_loss": 0.6441062018275261, "loss": 2.941, "loss_aux_layer_0": 0.018829345703125, "loss_aux_layer_1": 0.0382080078125, "loss_aux_layer_10": 0.0665283203125, "loss_aux_layer_11": 0.07080078125, "loss_aux_layer_12": 0.0758056640625, "loss_aux_layer_13": 0.08154296875, "loss_aux_layer_14": 0.0904541015625, "loss_aux_layer_15": 0.0989990234375, "loss_aux_layer_16": 0.1085205078125, "loss_aux_layer_17": 0.1165771484375, "loss_aux_layer_18": 0.12548828125, "loss_aux_layer_19": 0.1285400390625, "loss_aux_layer_2": 0.052978515625, "loss_aux_layer_20": 0.136474609375, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.06329345703125, "loss_aux_layer_4": 0.0657958984375, "loss_aux_layer_5": 0.0672607421875, "loss_aux_layer_6": 0.0703125, "loss_aux_layer_7": 0.0677490234375, "loss_aux_layer_8": 0.0667724609375, "loss_aux_layer_9": 0.0653076171875, "step": 2676, "total_loss": 0.7352393269538879 }, { "epoch": 0.529994060582063, "grad_norm": 0.9405000805854797, "learning_rate": 5e-05, "llm_loss": 0.5593263655900955, "loss": 2.6108, "loss_aux_layer_0": 0.019683837890625, "loss_aux_layer_1": 0.03936767578125, "loss_aux_layer_10": 0.06781005859375, "loss_aux_layer_11": 0.072265625, "loss_aux_layer_12": 0.0772705078125, "loss_aux_layer_13": 0.08349609375, "loss_aux_layer_14": 0.0928955078125, "loss_aux_layer_15": 0.101806640625, "loss_aux_layer_16": 0.112060546875, "loss_aux_layer_17": 0.12060546875, "loss_aux_layer_18": 0.12890625, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.0528564453125, "loss_aux_layer_20": 0.14013671875, "loss_aux_layer_21": 0.148193359375, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.208984375, "loss_aux_layer_3": 0.063720703125, "loss_aux_layer_4": 0.06622314453125, "loss_aux_layer_5": 0.06793212890625, "loss_aux_layer_6": 0.07080078125, "loss_aux_layer_7": 0.068603515625, "loss_aux_layer_8": 0.06787109375, "loss_aux_layer_9": 0.06658935546875, "step": 2677, "total_loss": 0.652707114815712 }, { "epoch": 0.5301920411799643, "grad_norm": 0.9906818270683289, "learning_rate": 5e-05, "llm_loss": 0.6417784541845322, "loss": 2.9453, "loss_aux_layer_0": 0.019622802734375, "loss_aux_layer_1": 0.04046630859375, "loss_aux_layer_10": 0.0704345703125, "loss_aux_layer_11": 0.0751953125, "loss_aux_layer_12": 0.0802001953125, "loss_aux_layer_13": 0.0865478515625, "loss_aux_layer_14": 0.095458984375, "loss_aux_layer_15": 0.1046142578125, "loss_aux_layer_16": 0.1142578125, "loss_aux_layer_17": 0.121826171875, "loss_aux_layer_18": 0.12939453125, "loss_aux_layer_19": 0.131591796875, "loss_aux_layer_2": 0.05462646484375, "loss_aux_layer_20": 0.13818359375, "loss_aux_layer_21": 0.145263671875, "loss_aux_layer_22": 0.1669921875, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.0655517578125, "loss_aux_layer_4": 0.068603515625, "loss_aux_layer_5": 0.0706787109375, "loss_aux_layer_6": 0.0740966796875, "loss_aux_layer_7": 0.0716552734375, "loss_aux_layer_8": 0.0704345703125, "loss_aux_layer_9": 0.0689697265625, "step": 2678, "total_loss": 0.7363211065530777 }, { "epoch": 0.5303900217778658, "grad_norm": 1.1219911575317383, "learning_rate": 5e-05, "llm_loss": 0.6082280874252319, "loss": 2.7832, "loss_aux_layer_0": 0.01934814453125, "loss_aux_layer_1": 0.036376953125, "loss_aux_layer_10": 0.06317138671875, "loss_aux_layer_11": 0.067138671875, "loss_aux_layer_12": 0.07177734375, "loss_aux_layer_13": 0.077392578125, "loss_aux_layer_14": 0.0860595703125, "loss_aux_layer_15": 0.094970703125, "loss_aux_layer_16": 0.1046142578125, "loss_aux_layer_17": 0.1126708984375, "loss_aux_layer_18": 0.1209716796875, "loss_aux_layer_19": 0.125, "loss_aux_layer_2": 0.0487060546875, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.058837890625, "loss_aux_layer_4": 0.0616455078125, "loss_aux_layer_5": 0.0631103515625, "loss_aux_layer_6": 0.0657958984375, "loss_aux_layer_7": 0.06378173828125, "loss_aux_layer_8": 0.06329345703125, "loss_aux_layer_9": 0.06207275390625, "step": 2679, "total_loss": 0.6957959085702896 }, { "epoch": 0.5305880023757672, "grad_norm": 1.1259692907333374, "learning_rate": 5e-05, "llm_loss": 0.563555896282196, "loss": 2.6109, "loss_aux_layer_0": 0.019866943359375, "loss_aux_layer_1": 0.03741455078125, "loss_aux_layer_10": 0.0633544921875, "loss_aux_layer_11": 0.06756591796875, "loss_aux_layer_12": 0.072509765625, "loss_aux_layer_13": 0.078857421875, "loss_aux_layer_14": 0.0887451171875, "loss_aux_layer_15": 0.09814453125, "loss_aux_layer_16": 0.108154296875, "loss_aux_layer_17": 0.1163330078125, "loss_aux_layer_18": 0.125244140625, "loss_aux_layer_19": 0.12890625, "loss_aux_layer_2": 0.0496826171875, "loss_aux_layer_20": 0.13623046875, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.1640625, "loss_aux_layer_23": 0.2021484375, "loss_aux_layer_3": 0.05926513671875, "loss_aux_layer_4": 0.06182861328125, "loss_aux_layer_5": 0.06329345703125, "loss_aux_layer_6": 0.06610107421875, "loss_aux_layer_7": 0.0634765625, "loss_aux_layer_8": 0.06298828125, "loss_aux_layer_9": 0.06195068359375, "step": 2680, "total_loss": 0.652714416384697 }, { "epoch": 0.5307859829736686, "grad_norm": 1.0752164125442505, "learning_rate": 5e-05, "llm_loss": 0.6322983950376511, "loss": 2.884, "loss_aux_layer_0": 0.01910400390625, "loss_aux_layer_1": 0.03656005859375, "loss_aux_layer_10": 0.06414794921875, "loss_aux_layer_11": 0.0684814453125, "loss_aux_layer_12": 0.0733642578125, "loss_aux_layer_13": 0.0791015625, "loss_aux_layer_14": 0.08837890625, "loss_aux_layer_15": 0.09716796875, "loss_aux_layer_16": 0.1072998046875, "loss_aux_layer_17": 0.1153564453125, "loss_aux_layer_18": 0.1231689453125, "loss_aux_layer_19": 0.1260986328125, "loss_aux_layer_2": 0.04962158203125, "loss_aux_layer_20": 0.1337890625, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.19970703125, "loss_aux_layer_3": 0.05963134765625, "loss_aux_layer_4": 0.06219482421875, "loss_aux_layer_5": 0.06396484375, "loss_aux_layer_6": 0.067138671875, "loss_aux_layer_7": 0.06463623046875, "loss_aux_layer_8": 0.0640869140625, "loss_aux_layer_9": 0.0628662109375, "step": 2681, "total_loss": 0.7209998369216919 }, { "epoch": 0.53098396357157, "grad_norm": 1.019666314125061, "learning_rate": 5e-05, "llm_loss": 0.5445947647094727, "loss": 2.5425, "loss_aux_layer_0": 0.0186767578125, "loss_aux_layer_1": 0.03857421875, "loss_aux_layer_10": 0.06689453125, "loss_aux_layer_11": 0.0714111328125, "loss_aux_layer_12": 0.075927734375, "loss_aux_layer_13": 0.081787109375, "loss_aux_layer_14": 0.0906982421875, "loss_aux_layer_15": 0.099609375, "loss_aux_layer_16": 0.109130859375, "loss_aux_layer_17": 0.1165771484375, "loss_aux_layer_18": 0.125, "loss_aux_layer_19": 0.1279296875, "loss_aux_layer_2": 0.0518798828125, "loss_aux_layer_20": 0.1357421875, "loss_aux_layer_21": 0.14453125, "loss_aux_layer_22": 0.164794921875, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.0625, "loss_aux_layer_4": 0.06494140625, "loss_aux_layer_5": 0.066650390625, "loss_aux_layer_6": 0.0694580078125, "loss_aux_layer_7": 0.0675048828125, "loss_aux_layer_8": 0.066650390625, "loss_aux_layer_9": 0.0655517578125, "step": 2682, "total_loss": 0.635636642575264 }, { "epoch": 0.5311819441694714, "grad_norm": 0.9558697938919067, "learning_rate": 5e-05, "llm_loss": 0.5582231432199478, "loss": 2.5959, "loss_aux_layer_0": 0.01959228515625, "loss_aux_layer_1": 0.038818359375, "loss_aux_layer_10": 0.0653076171875, "loss_aux_layer_11": 0.06951904296875, "loss_aux_layer_12": 0.0745849609375, "loss_aux_layer_13": 0.0804443359375, "loss_aux_layer_14": 0.08984375, "loss_aux_layer_15": 0.0992431640625, "loss_aux_layer_16": 0.1092529296875, "loss_aux_layer_17": 0.11767578125, "loss_aux_layer_18": 0.1260986328125, "loss_aux_layer_19": 0.1295166015625, "loss_aux_layer_2": 0.05133056640625, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.061279296875, "loss_aux_layer_4": 0.06402587890625, "loss_aux_layer_5": 0.0655517578125, "loss_aux_layer_6": 0.06854248046875, "loss_aux_layer_7": 0.06622314453125, "loss_aux_layer_8": 0.06536865234375, "loss_aux_layer_9": 0.0640869140625, "step": 2683, "total_loss": 0.6489693820476532 }, { "epoch": 0.5313799247673728, "grad_norm": 0.9510367512702942, "learning_rate": 5e-05, "llm_loss": 0.5759840682148933, "loss": 2.6562, "loss_aux_layer_0": 0.019622802734375, "loss_aux_layer_1": 0.03753662109375, "loss_aux_layer_10": 0.0631103515625, "loss_aux_layer_11": 0.0675048828125, "loss_aux_layer_12": 0.072265625, "loss_aux_layer_13": 0.0780029296875, "loss_aux_layer_14": 0.0867919921875, "loss_aux_layer_15": 0.095703125, "loss_aux_layer_16": 0.1053466796875, "loss_aux_layer_17": 0.1131591796875, "loss_aux_layer_18": 0.1214599609375, "loss_aux_layer_19": 0.1251220703125, "loss_aux_layer_2": 0.05023193359375, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.162109375, "loss_aux_layer_23": 0.200927734375, "loss_aux_layer_3": 0.05950927734375, "loss_aux_layer_4": 0.0618896484375, "loss_aux_layer_5": 0.06329345703125, "loss_aux_layer_6": 0.0660400390625, "loss_aux_layer_7": 0.063720703125, "loss_aux_layer_8": 0.06317138671875, "loss_aux_layer_9": 0.06182861328125, "step": 2684, "total_loss": 0.6640555560588837 }, { "epoch": 0.5315779053652742, "grad_norm": 1.1096186637878418, "learning_rate": 5e-05, "llm_loss": 0.5733180791139603, "loss": 2.6446, "loss_aux_layer_0": 0.018798828125, "loss_aux_layer_1": 0.0374755859375, "loss_aux_layer_10": 0.06475830078125, "loss_aux_layer_11": 0.06878662109375, "loss_aux_layer_12": 0.0732421875, "loss_aux_layer_13": 0.0784912109375, "loss_aux_layer_14": 0.0870361328125, "loss_aux_layer_15": 0.095703125, "loss_aux_layer_16": 0.105224609375, "loss_aux_layer_17": 0.1126708984375, "loss_aux_layer_18": 0.1201171875, "loss_aux_layer_19": 0.123291015625, "loss_aux_layer_2": 0.051025390625, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.194580078125, "loss_aux_layer_3": 0.060791015625, "loss_aux_layer_4": 0.06329345703125, "loss_aux_layer_5": 0.064697265625, "loss_aux_layer_6": 0.0673828125, "loss_aux_layer_7": 0.06500244140625, "loss_aux_layer_8": 0.06451416015625, "loss_aux_layer_9": 0.06341552734375, "step": 2685, "total_loss": 0.6611429154872894 }, { "epoch": 0.5317758859631756, "grad_norm": 1.089708924293518, "learning_rate": 5e-05, "llm_loss": 0.5944744497537613, "loss": 2.7573, "loss_aux_layer_0": 0.018951416015625, "loss_aux_layer_1": 0.040283203125, "loss_aux_layer_10": 0.0706787109375, "loss_aux_layer_11": 0.075439453125, "loss_aux_layer_12": 0.0806884765625, "loss_aux_layer_13": 0.086669921875, "loss_aux_layer_14": 0.0960693359375, "loss_aux_layer_15": 0.1048583984375, "loss_aux_layer_16": 0.1143798828125, "loss_aux_layer_17": 0.1220703125, "loss_aux_layer_18": 0.1304931640625, "loss_aux_layer_19": 0.13232421875, "loss_aux_layer_2": 0.0548095703125, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.167724609375, "loss_aux_layer_23": 0.2041015625, "loss_aux_layer_3": 0.065673828125, "loss_aux_layer_4": 0.0687255859375, "loss_aux_layer_5": 0.070556640625, "loss_aux_layer_6": 0.07373046875, "loss_aux_layer_7": 0.0712890625, "loss_aux_layer_8": 0.0704345703125, "loss_aux_layer_9": 0.069091796875, "step": 2686, "total_loss": 0.6893196105957031 }, { "epoch": 0.531973866561077, "grad_norm": 0.9817060232162476, "learning_rate": 5e-05, "llm_loss": 0.6127163022756577, "loss": 2.8198, "loss_aux_layer_0": 0.018829345703125, "loss_aux_layer_1": 0.03851318359375, "loss_aux_layer_10": 0.0679931640625, "loss_aux_layer_11": 0.0723876953125, "loss_aux_layer_12": 0.0771484375, "loss_aux_layer_13": 0.08349609375, "loss_aux_layer_14": 0.092529296875, "loss_aux_layer_15": 0.101806640625, "loss_aux_layer_16": 0.1116943359375, "loss_aux_layer_17": 0.119384765625, "loss_aux_layer_18": 0.127685546875, "loss_aux_layer_19": 0.13037109375, "loss_aux_layer_2": 0.0518798828125, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.1455078125, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.203125, "loss_aux_layer_3": 0.062744140625, "loss_aux_layer_4": 0.06561279296875, "loss_aux_layer_5": 0.0672607421875, "loss_aux_layer_6": 0.0704345703125, "loss_aux_layer_7": 0.0679931640625, "loss_aux_layer_8": 0.0675048828125, "loss_aux_layer_9": 0.06640625, "step": 2687, "total_loss": 0.7049404233694077 }, { "epoch": 0.5321718471589785, "grad_norm": 0.9545955657958984, "learning_rate": 5e-05, "llm_loss": 0.6574083045125008, "loss": 2.9971, "loss_aux_layer_0": 0.01837158203125, "loss_aux_layer_1": 0.0380859375, "loss_aux_layer_10": 0.0675048828125, "loss_aux_layer_11": 0.072021484375, "loss_aux_layer_12": 0.0770263671875, "loss_aux_layer_13": 0.0830078125, "loss_aux_layer_14": 0.09228515625, "loss_aux_layer_15": 0.1011962890625, "loss_aux_layer_16": 0.1112060546875, "loss_aux_layer_17": 0.11962890625, "loss_aux_layer_18": 0.128173828125, "loss_aux_layer_19": 0.13134765625, "loss_aux_layer_2": 0.05145263671875, "loss_aux_layer_20": 0.138427734375, "loss_aux_layer_21": 0.1455078125, "loss_aux_layer_22": 0.16455078125, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.06207275390625, "loss_aux_layer_4": 0.06488037109375, "loss_aux_layer_5": 0.0665283203125, "loss_aux_layer_6": 0.06982421875, "loss_aux_layer_7": 0.0675048828125, "loss_aux_layer_8": 0.06689453125, "loss_aux_layer_9": 0.06561279296875, "step": 2688, "total_loss": 0.7492763996124268 }, { "epoch": 0.5323698277568798, "grad_norm": 1.3066232204437256, "learning_rate": 5e-05, "llm_loss": 0.6106272041797638, "loss": 2.8122, "loss_aux_layer_0": 0.018280029296875, "loss_aux_layer_1": 0.0379638671875, "loss_aux_layer_10": 0.0679931640625, "loss_aux_layer_11": 0.0723876953125, "loss_aux_layer_12": 0.077392578125, "loss_aux_layer_13": 0.08349609375, "loss_aux_layer_14": 0.093017578125, "loss_aux_layer_15": 0.10205078125, "loss_aux_layer_16": 0.1119384765625, "loss_aux_layer_17": 0.120361328125, "loss_aux_layer_18": 0.12841796875, "loss_aux_layer_19": 0.1309814453125, "loss_aux_layer_2": 0.05224609375, "loss_aux_layer_20": 0.13818359375, "loss_aux_layer_21": 0.145263671875, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.06304931640625, "loss_aux_layer_4": 0.06573486328125, "loss_aux_layer_5": 0.06756591796875, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.0684814453125, "loss_aux_layer_8": 0.06787109375, "loss_aux_layer_9": 0.06646728515625, "step": 2689, "total_loss": 0.7030384093523026 }, { "epoch": 0.5325678083547812, "grad_norm": 1.4823524951934814, "learning_rate": 5e-05, "llm_loss": 0.6015101820230484, "loss": 2.7663, "loss_aux_layer_0": 0.01898193359375, "loss_aux_layer_1": 0.03753662109375, "loss_aux_layer_10": 0.0654296875, "loss_aux_layer_11": 0.069580078125, "loss_aux_layer_12": 0.0743408203125, "loss_aux_layer_13": 0.080322265625, "loss_aux_layer_14": 0.0892333984375, "loss_aux_layer_15": 0.09814453125, "loss_aux_layer_16": 0.1080322265625, "loss_aux_layer_17": 0.1156005859375, "loss_aux_layer_18": 0.1239013671875, "loss_aux_layer_19": 0.12744140625, "loss_aux_layer_2": 0.0518798828125, "loss_aux_layer_20": 0.134765625, "loss_aux_layer_21": 0.142578125, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.0621337890625, "loss_aux_layer_4": 0.06500244140625, "loss_aux_layer_5": 0.0665283203125, "loss_aux_layer_6": 0.0692138671875, "loss_aux_layer_7": 0.06689453125, "loss_aux_layer_8": 0.06591796875, "loss_aux_layer_9": 0.06451416015625, "step": 2690, "total_loss": 0.6915799677371979 }, { "epoch": 0.5327657889526827, "grad_norm": 1.1073479652404785, "learning_rate": 5e-05, "llm_loss": 0.585308700799942, "loss": 2.7066, "loss_aux_layer_0": 0.01898193359375, "loss_aux_layer_1": 0.03875732421875, "loss_aux_layer_10": 0.0675048828125, "loss_aux_layer_11": 0.0718994140625, "loss_aux_layer_12": 0.0770263671875, "loss_aux_layer_13": 0.082763671875, "loss_aux_layer_14": 0.091552734375, "loss_aux_layer_15": 0.1004638671875, "loss_aux_layer_16": 0.1099853515625, "loss_aux_layer_17": 0.1180419921875, "loss_aux_layer_18": 0.126220703125, "loss_aux_layer_19": 0.1282958984375, "loss_aux_layer_2": 0.052978515625, "loss_aux_layer_20": 0.135498046875, "loss_aux_layer_21": 0.142822265625, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.0634765625, "loss_aux_layer_4": 0.06610107421875, "loss_aux_layer_5": 0.0675048828125, "loss_aux_layer_6": 0.0706787109375, "loss_aux_layer_7": 0.0679931640625, "loss_aux_layer_8": 0.06719970703125, "loss_aux_layer_9": 0.06597900390625, "step": 2691, "total_loss": 0.6766560971736908 }, { "epoch": 0.532963769550584, "grad_norm": 1.1511950492858887, "learning_rate": 5e-05, "llm_loss": 0.630201980471611, "loss": 2.8828, "loss_aux_layer_0": 0.01885986328125, "loss_aux_layer_1": 0.03765869140625, "loss_aux_layer_10": 0.0650634765625, "loss_aux_layer_11": 0.069091796875, "loss_aux_layer_12": 0.0740966796875, "loss_aux_layer_13": 0.0802001953125, "loss_aux_layer_14": 0.0902099609375, "loss_aux_layer_15": 0.0992431640625, "loss_aux_layer_16": 0.1099853515625, "loss_aux_layer_17": 0.1180419921875, "loss_aux_layer_18": 0.126708984375, "loss_aux_layer_19": 0.1298828125, "loss_aux_layer_2": 0.0506591796875, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.14404296875, "loss_aux_layer_22": 0.165771484375, "loss_aux_layer_23": 0.20361328125, "loss_aux_layer_3": 0.0604248046875, "loss_aux_layer_4": 0.06317138671875, "loss_aux_layer_5": 0.06488037109375, "loss_aux_layer_6": 0.0682373046875, "loss_aux_layer_7": 0.0655517578125, "loss_aux_layer_8": 0.06500244140625, "loss_aux_layer_9": 0.06378173828125, "step": 2692, "total_loss": 0.7206913530826569 }, { "epoch": 0.5331617501484854, "grad_norm": 1.3947277069091797, "learning_rate": 5e-05, "llm_loss": 0.5485896617174149, "loss": 2.562, "loss_aux_layer_0": 0.0201416015625, "loss_aux_layer_1": 0.0394287109375, "loss_aux_layer_10": 0.0682373046875, "loss_aux_layer_11": 0.072998046875, "loss_aux_layer_12": 0.0777587890625, "loss_aux_layer_13": 0.0836181640625, "loss_aux_layer_14": 0.0919189453125, "loss_aux_layer_15": 0.1002197265625, "loss_aux_layer_16": 0.1092529296875, "loss_aux_layer_17": 0.1171875, "loss_aux_layer_18": 0.125, "loss_aux_layer_19": 0.1278076171875, "loss_aux_layer_2": 0.05322265625, "loss_aux_layer_20": 0.1357421875, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.20166015625, "loss_aux_layer_3": 0.0635986328125, "loss_aux_layer_4": 0.066162109375, "loss_aux_layer_5": 0.06787109375, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.0689697265625, "loss_aux_layer_8": 0.0682373046875, "loss_aux_layer_9": 0.0670166015625, "step": 2693, "total_loss": 0.6404939740896225 }, { "epoch": 0.5333597307463869, "grad_norm": 1.0687189102172852, "learning_rate": 5e-05, "llm_loss": 0.5069155618548393, "loss": 2.3932, "loss_aux_layer_0": 0.018524169921875, "loss_aux_layer_1": 0.0380859375, "loss_aux_layer_10": 0.067626953125, "loss_aux_layer_11": 0.07220458984375, "loss_aux_layer_12": 0.0770263671875, "loss_aux_layer_13": 0.0833740234375, "loss_aux_layer_14": 0.0924072265625, "loss_aux_layer_15": 0.101318359375, "loss_aux_layer_16": 0.11083984375, "loss_aux_layer_17": 0.1181640625, "loss_aux_layer_18": 0.12646484375, "loss_aux_layer_19": 0.128662109375, "loss_aux_layer_2": 0.0516357421875, "loss_aux_layer_20": 0.1357421875, "loss_aux_layer_21": 0.143310546875, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.19970703125, "loss_aux_layer_3": 0.0623779296875, "loss_aux_layer_4": 0.06500244140625, "loss_aux_layer_5": 0.06689453125, "loss_aux_layer_6": 0.06988525390625, "loss_aux_layer_7": 0.06781005859375, "loss_aux_layer_8": 0.06744384765625, "loss_aux_layer_9": 0.06634521484375, "step": 2694, "total_loss": 0.5983036160469055 }, { "epoch": 0.5335577113442883, "grad_norm": 1.326369285583496, "learning_rate": 5e-05, "llm_loss": 0.6050223261117935, "loss": 2.7886, "loss_aux_layer_0": 0.018951416015625, "loss_aux_layer_1": 0.03875732421875, "loss_aux_layer_10": 0.0677490234375, "loss_aux_layer_11": 0.0723876953125, "loss_aux_layer_12": 0.0770263671875, "loss_aux_layer_13": 0.082763671875, "loss_aux_layer_14": 0.091552734375, "loss_aux_layer_15": 0.10009765625, "loss_aux_layer_16": 0.109375, "loss_aux_layer_17": 0.116943359375, "loss_aux_layer_18": 0.1253662109375, "loss_aux_layer_19": 0.12890625, "loss_aux_layer_2": 0.052490234375, "loss_aux_layer_20": 0.13720703125, "loss_aux_layer_21": 0.1455078125, "loss_aux_layer_22": 0.16845703125, "loss_aux_layer_23": 0.206787109375, "loss_aux_layer_3": 0.063232421875, "loss_aux_layer_4": 0.06591796875, "loss_aux_layer_5": 0.06787109375, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.068603515625, "loss_aux_layer_8": 0.06787109375, "loss_aux_layer_9": 0.06646728515625, "step": 2695, "total_loss": 0.697144404053688 }, { "epoch": 0.5337556919421896, "grad_norm": 0.953703761100769, "learning_rate": 5e-05, "llm_loss": 0.6009307354688644, "loss": 2.7685, "loss_aux_layer_0": 0.020355224609375, "loss_aux_layer_1": 0.0389404296875, "loss_aux_layer_10": 0.067626953125, "loss_aux_layer_11": 0.0718994140625, "loss_aux_layer_12": 0.0765380859375, "loss_aux_layer_13": 0.0823974609375, "loss_aux_layer_14": 0.09130859375, "loss_aux_layer_15": 0.1002197265625, "loss_aux_layer_16": 0.109375, "loss_aux_layer_17": 0.1170654296875, "loss_aux_layer_18": 0.125732421875, "loss_aux_layer_19": 0.1279296875, "loss_aux_layer_2": 0.0531005859375, "loss_aux_layer_20": 0.13427734375, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.160888671875, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.06378173828125, "loss_aux_layer_4": 0.0662841796875, "loss_aux_layer_5": 0.0679931640625, "loss_aux_layer_6": 0.0706787109375, "loss_aux_layer_7": 0.068359375, "loss_aux_layer_8": 0.06787109375, "loss_aux_layer_9": 0.0662841796875, "step": 2696, "total_loss": 0.6921237707138062 }, { "epoch": 0.5339536725400911, "grad_norm": 1.2511893510818481, "learning_rate": 5e-05, "llm_loss": 0.5590416640043259, "loss": 2.6066, "loss_aux_layer_0": 0.018096923828125, "loss_aux_layer_1": 0.03875732421875, "loss_aux_layer_10": 0.0679931640625, "loss_aux_layer_11": 0.0728759765625, "loss_aux_layer_12": 0.0784912109375, "loss_aux_layer_13": 0.084716796875, "loss_aux_layer_14": 0.09375, "loss_aux_layer_15": 0.1024169921875, "loss_aux_layer_16": 0.1119384765625, "loss_aux_layer_17": 0.11962890625, "loss_aux_layer_18": 0.1282958984375, "loss_aux_layer_19": 0.13037109375, "loss_aux_layer_2": 0.05242919921875, "loss_aux_layer_20": 0.137939453125, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.166748046875, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.06280517578125, "loss_aux_layer_4": 0.065673828125, "loss_aux_layer_5": 0.06732177734375, "loss_aux_layer_6": 0.0701904296875, "loss_aux_layer_7": 0.06829833984375, "loss_aux_layer_8": 0.0675048828125, "loss_aux_layer_9": 0.06646728515625, "step": 2697, "total_loss": 0.6516470909118652 }, { "epoch": 0.5341516531379925, "grad_norm": 0.9777070879936218, "learning_rate": 5e-05, "llm_loss": 0.5925420224666595, "loss": 2.7371, "loss_aux_layer_0": 0.01904296875, "loss_aux_layer_1": 0.03851318359375, "loss_aux_layer_10": 0.066650390625, "loss_aux_layer_11": 0.07080078125, "loss_aux_layer_12": 0.07568359375, "loss_aux_layer_13": 0.0816650390625, "loss_aux_layer_14": 0.0908203125, "loss_aux_layer_15": 0.0999755859375, "loss_aux_layer_16": 0.1094970703125, "loss_aux_layer_17": 0.117431640625, "loss_aux_layer_18": 0.126220703125, "loss_aux_layer_19": 0.130859375, "loss_aux_layer_2": 0.0521240234375, "loss_aux_layer_20": 0.138916015625, "loss_aux_layer_21": 0.14697265625, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.20654296875, "loss_aux_layer_3": 0.0623779296875, "loss_aux_layer_4": 0.064697265625, "loss_aux_layer_5": 0.06640625, "loss_aux_layer_6": 0.069580078125, "loss_aux_layer_7": 0.0670166015625, "loss_aux_layer_8": 0.0665283203125, "loss_aux_layer_9": 0.0648193359375, "step": 2698, "total_loss": 0.6842869371175766 }, { "epoch": 0.5343496337358938, "grad_norm": 1.1948076486587524, "learning_rate": 5e-05, "llm_loss": 0.6183391511440277, "loss": 2.8357, "loss_aux_layer_0": 0.018951416015625, "loss_aux_layer_1": 0.03765869140625, "loss_aux_layer_10": 0.066162109375, "loss_aux_layer_11": 0.0704345703125, "loss_aux_layer_12": 0.075439453125, "loss_aux_layer_13": 0.0811767578125, "loss_aux_layer_14": 0.0908203125, "loss_aux_layer_15": 0.0997314453125, "loss_aux_layer_16": 0.109375, "loss_aux_layer_17": 0.117919921875, "loss_aux_layer_18": 0.126708984375, "loss_aux_layer_19": 0.12890625, "loss_aux_layer_2": 0.05120849609375, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.198974609375, "loss_aux_layer_3": 0.06158447265625, "loss_aux_layer_4": 0.06427001953125, "loss_aux_layer_5": 0.0660400390625, "loss_aux_layer_6": 0.0687255859375, "loss_aux_layer_7": 0.0665283203125, "loss_aux_layer_8": 0.0657958984375, "loss_aux_layer_9": 0.0648193359375, "step": 2699, "total_loss": 0.7089362889528275 }, { "epoch": 0.5345476143337953, "grad_norm": 1.0879576206207275, "learning_rate": 5e-05, "llm_loss": 0.5361279994249344, "loss": 2.4961, "loss_aux_layer_0": 0.01806640625, "loss_aux_layer_1": 0.036376953125, "loss_aux_layer_10": 0.06243896484375, "loss_aux_layer_11": 0.066650390625, "loss_aux_layer_12": 0.0718994140625, "loss_aux_layer_13": 0.078125, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.0958251953125, "loss_aux_layer_16": 0.105712890625, "loss_aux_layer_17": 0.11376953125, "loss_aux_layer_18": 0.1226806640625, "loss_aux_layer_19": 0.1260986328125, "loss_aux_layer_2": 0.0489501953125, "loss_aux_layer_20": 0.13427734375, "loss_aux_layer_21": 0.142333984375, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.0584716796875, "loss_aux_layer_4": 0.0609130859375, "loss_aux_layer_5": 0.06231689453125, "loss_aux_layer_6": 0.0648193359375, "loss_aux_layer_7": 0.0628662109375, "loss_aux_layer_8": 0.06219482421875, "loss_aux_layer_9": 0.06103515625, "step": 2700, "total_loss": 0.6240182965993881 }, { "epoch": 0.5347455949316967, "grad_norm": 1.1820560693740845, "learning_rate": 5e-05, "llm_loss": 0.5408252105116844, "loss": 2.5162, "loss_aux_layer_0": 0.018798828125, "loss_aux_layer_1": 0.03594970703125, "loss_aux_layer_10": 0.0634765625, "loss_aux_layer_11": 0.06768798828125, "loss_aux_layer_12": 0.072998046875, "loss_aux_layer_13": 0.0787353515625, "loss_aux_layer_14": 0.0880126953125, "loss_aux_layer_15": 0.0970458984375, "loss_aux_layer_16": 0.1064453125, "loss_aux_layer_17": 0.114501953125, "loss_aux_layer_18": 0.1229248046875, "loss_aux_layer_19": 0.126953125, "loss_aux_layer_2": 0.0487060546875, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.199462890625, "loss_aux_layer_3": 0.0587158203125, "loss_aux_layer_4": 0.06146240234375, "loss_aux_layer_5": 0.062744140625, "loss_aux_layer_6": 0.065673828125, "loss_aux_layer_7": 0.06353759765625, "loss_aux_layer_8": 0.0628662109375, "loss_aux_layer_9": 0.06201171875, "step": 2701, "total_loss": 0.629039466381073 }, { "epoch": 0.5349435755295981, "grad_norm": 1.0513421297073364, "learning_rate": 5e-05, "llm_loss": 0.620131179690361, "loss": 2.85, "loss_aux_layer_0": 0.01849365234375, "loss_aux_layer_1": 0.0386962890625, "loss_aux_layer_10": 0.06781005859375, "loss_aux_layer_11": 0.0723876953125, "loss_aux_layer_12": 0.0777587890625, "loss_aux_layer_13": 0.083740234375, "loss_aux_layer_14": 0.0931396484375, "loss_aux_layer_15": 0.101806640625, "loss_aux_layer_16": 0.1112060546875, "loss_aux_layer_17": 0.1192626953125, "loss_aux_layer_18": 0.126953125, "loss_aux_layer_19": 0.130126953125, "loss_aux_layer_2": 0.0526123046875, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.165771484375, "loss_aux_layer_23": 0.203125, "loss_aux_layer_3": 0.06341552734375, "loss_aux_layer_4": 0.0660400390625, "loss_aux_layer_5": 0.06787109375, "loss_aux_layer_6": 0.07080078125, "loss_aux_layer_7": 0.068603515625, "loss_aux_layer_8": 0.0682373046875, "loss_aux_layer_9": 0.06671142578125, "step": 2702, "total_loss": 0.7125096321105957 }, { "epoch": 0.5351415561274995, "grad_norm": 1.2358607053756714, "learning_rate": 5e-05, "llm_loss": 0.5884301513433456, "loss": 2.7122, "loss_aux_layer_0": 0.019073486328125, "loss_aux_layer_1": 0.03924560546875, "loss_aux_layer_10": 0.0657958984375, "loss_aux_layer_11": 0.070068359375, "loss_aux_layer_12": 0.0748291015625, "loss_aux_layer_13": 0.08056640625, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.0980224609375, "loss_aux_layer_16": 0.107177734375, "loss_aux_layer_17": 0.114990234375, "loss_aux_layer_18": 0.1231689453125, "loss_aux_layer_19": 0.125732421875, "loss_aux_layer_2": 0.05291748046875, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.196044921875, "loss_aux_layer_3": 0.06268310546875, "loss_aux_layer_4": 0.0653076171875, "loss_aux_layer_5": 0.066650390625, "loss_aux_layer_6": 0.069580078125, "loss_aux_layer_7": 0.06689453125, "loss_aux_layer_8": 0.0660400390625, "loss_aux_layer_9": 0.06463623046875, "step": 2703, "total_loss": 0.6780572384595871 }, { "epoch": 0.5353395367254009, "grad_norm": 0.9715719223022461, "learning_rate": 5e-05, "llm_loss": 0.5494033172726631, "loss": 2.5623, "loss_aux_layer_0": 0.019378662109375, "loss_aux_layer_1": 0.038330078125, "loss_aux_layer_10": 0.067138671875, "loss_aux_layer_11": 0.071533203125, "loss_aux_layer_12": 0.076171875, "loss_aux_layer_13": 0.0816650390625, "loss_aux_layer_14": 0.09033203125, "loss_aux_layer_15": 0.0989990234375, "loss_aux_layer_16": 0.1085205078125, "loss_aux_layer_17": 0.1163330078125, "loss_aux_layer_18": 0.124755859375, "loss_aux_layer_19": 0.12841796875, "loss_aux_layer_2": 0.05169677734375, "loss_aux_layer_20": 0.135986328125, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.166748046875, "loss_aux_layer_23": 0.205078125, "loss_aux_layer_3": 0.062255859375, "loss_aux_layer_4": 0.06524658203125, "loss_aux_layer_5": 0.06689453125, "loss_aux_layer_6": 0.0701904296875, "loss_aux_layer_7": 0.067626953125, "loss_aux_layer_8": 0.06689453125, "loss_aux_layer_9": 0.06536865234375, "step": 2704, "total_loss": 0.6405782550573349 }, { "epoch": 0.5355375173233023, "grad_norm": 1.7982405424118042, "learning_rate": 5e-05, "llm_loss": 0.5970104411244392, "loss": 2.7485, "loss_aux_layer_0": 0.019317626953125, "loss_aux_layer_1": 0.0367431640625, "loss_aux_layer_10": 0.0634765625, "loss_aux_layer_11": 0.0679931640625, "loss_aux_layer_12": 0.0733642578125, "loss_aux_layer_13": 0.079345703125, "loss_aux_layer_14": 0.0888671875, "loss_aux_layer_15": 0.0985107421875, "loss_aux_layer_16": 0.1087646484375, "loss_aux_layer_17": 0.11669921875, "loss_aux_layer_18": 0.1251220703125, "loss_aux_layer_19": 0.129638671875, "loss_aux_layer_2": 0.05035400390625, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.146240234375, "loss_aux_layer_22": 0.168701171875, "loss_aux_layer_23": 0.208740234375, "loss_aux_layer_3": 0.06011962890625, "loss_aux_layer_4": 0.0625, "loss_aux_layer_5": 0.06353759765625, "loss_aux_layer_6": 0.0665283203125, "loss_aux_layer_7": 0.06396484375, "loss_aux_layer_8": 0.06353759765625, "loss_aux_layer_9": 0.06219482421875, "step": 2705, "total_loss": 0.6871305853128433 }, { "epoch": 0.5357354979212037, "grad_norm": 1.51096773147583, "learning_rate": 5e-05, "llm_loss": 0.6175028830766678, "loss": 2.8349, "loss_aux_layer_0": 0.018951416015625, "loss_aux_layer_1": 0.03875732421875, "loss_aux_layer_10": 0.0677490234375, "loss_aux_layer_11": 0.072021484375, "loss_aux_layer_12": 0.076904296875, "loss_aux_layer_13": 0.082275390625, "loss_aux_layer_14": 0.090576171875, "loss_aux_layer_15": 0.0989990234375, "loss_aux_layer_16": 0.10791015625, "loss_aux_layer_17": 0.11572265625, "loss_aux_layer_18": 0.1241455078125, "loss_aux_layer_19": 0.126953125, "loss_aux_layer_2": 0.05340576171875, "loss_aux_layer_20": 0.13427734375, "loss_aux_layer_21": 0.142822265625, "loss_aux_layer_22": 0.1640625, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.0635986328125, "loss_aux_layer_4": 0.0662841796875, "loss_aux_layer_5": 0.0679931640625, "loss_aux_layer_6": 0.0706787109375, "loss_aux_layer_7": 0.0684814453125, "loss_aux_layer_8": 0.0679931640625, "loss_aux_layer_9": 0.0662841796875, "step": 2706, "total_loss": 0.7087245285511017 }, { "epoch": 0.5359334785191051, "grad_norm": 1.0948327779769897, "learning_rate": 5e-05, "llm_loss": 0.6473729759454727, "loss": 2.9705, "loss_aux_layer_0": 0.018707275390625, "loss_aux_layer_1": 0.04058837890625, "loss_aux_layer_10": 0.070556640625, "loss_aux_layer_11": 0.0755615234375, "loss_aux_layer_12": 0.0806884765625, "loss_aux_layer_13": 0.086669921875, "loss_aux_layer_14": 0.0960693359375, "loss_aux_layer_15": 0.105224609375, "loss_aux_layer_16": 0.11474609375, "loss_aux_layer_17": 0.1224365234375, "loss_aux_layer_18": 0.130126953125, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.05517578125, "loss_aux_layer_20": 0.140380859375, "loss_aux_layer_21": 0.148193359375, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.0660400390625, "loss_aux_layer_4": 0.0689697265625, "loss_aux_layer_5": 0.0703125, "loss_aux_layer_6": 0.073974609375, "loss_aux_layer_7": 0.071533203125, "loss_aux_layer_8": 0.0706787109375, "loss_aux_layer_9": 0.0692138671875, "step": 2707, "total_loss": 0.7426308989524841 }, { "epoch": 0.5361314591170065, "grad_norm": 1.3534595966339111, "learning_rate": 5e-05, "llm_loss": 0.5549970865249634, "loss": 2.5835, "loss_aux_layer_0": 0.01812744140625, "loss_aux_layer_1": 0.037353515625, "loss_aux_layer_10": 0.066650390625, "loss_aux_layer_11": 0.071044921875, "loss_aux_layer_12": 0.0758056640625, "loss_aux_layer_13": 0.0814208984375, "loss_aux_layer_14": 0.090087890625, "loss_aux_layer_15": 0.0992431640625, "loss_aux_layer_16": 0.10888671875, "loss_aux_layer_17": 0.1168212890625, "loss_aux_layer_18": 0.125244140625, "loss_aux_layer_19": 0.1287841796875, "loss_aux_layer_2": 0.05084228515625, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.06134033203125, "loss_aux_layer_4": 0.06439208984375, "loss_aux_layer_5": 0.066162109375, "loss_aux_layer_6": 0.0692138671875, "loss_aux_layer_7": 0.06671142578125, "loss_aux_layer_8": 0.06591796875, "loss_aux_layer_9": 0.06494140625, "step": 2708, "total_loss": 0.6458716243505478 }, { "epoch": 0.536329439714908, "grad_norm": 1.0892308950424194, "learning_rate": 5e-05, "llm_loss": 0.60540971159935, "loss": 2.7688, "loss_aux_layer_0": 0.017974853515625, "loss_aux_layer_1": 0.03558349609375, "loss_aux_layer_10": 0.06243896484375, "loss_aux_layer_11": 0.06671142578125, "loss_aux_layer_12": 0.071533203125, "loss_aux_layer_13": 0.0771484375, "loss_aux_layer_14": 0.0863037109375, "loss_aux_layer_15": 0.0950927734375, "loss_aux_layer_16": 0.1048583984375, "loss_aux_layer_17": 0.11279296875, "loss_aux_layer_18": 0.1214599609375, "loss_aux_layer_19": 0.1246337890625, "loss_aux_layer_2": 0.04815673828125, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.138916015625, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.0582275390625, "loss_aux_layer_4": 0.06097412109375, "loss_aux_layer_5": 0.06268310546875, "loss_aux_layer_6": 0.0654296875, "loss_aux_layer_7": 0.06292724609375, "loss_aux_layer_8": 0.06231689453125, "loss_aux_layer_9": 0.0611572265625, "step": 2709, "total_loss": 0.6922050565481186 }, { "epoch": 0.5365274203128093, "grad_norm": 1.4771701097488403, "learning_rate": 5e-05, "llm_loss": 0.6247285753488541, "loss": 2.8582, "loss_aux_layer_0": 0.01898193359375, "loss_aux_layer_1": 0.03778076171875, "loss_aux_layer_10": 0.0654296875, "loss_aux_layer_11": 0.06982421875, "loss_aux_layer_12": 0.07470703125, "loss_aux_layer_13": 0.080322265625, "loss_aux_layer_14": 0.0894775390625, "loss_aux_layer_15": 0.09814453125, "loss_aux_layer_16": 0.1077880859375, "loss_aux_layer_17": 0.1158447265625, "loss_aux_layer_18": 0.1243896484375, "loss_aux_layer_19": 0.127197265625, "loss_aux_layer_2": 0.05133056640625, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.201416015625, "loss_aux_layer_3": 0.06097412109375, "loss_aux_layer_4": 0.06365966796875, "loss_aux_layer_5": 0.06512451171875, "loss_aux_layer_6": 0.067626953125, "loss_aux_layer_7": 0.0653076171875, "loss_aux_layer_8": 0.0648193359375, "loss_aux_layer_9": 0.06378173828125, "step": 2710, "total_loss": 0.7145416587591171 }, { "epoch": 0.5367254009107107, "grad_norm": 1.2391985654830933, "learning_rate": 5e-05, "llm_loss": 0.5363203883171082, "loss": 2.5067, "loss_aux_layer_0": 0.01861572265625, "loss_aux_layer_1": 0.038330078125, "loss_aux_layer_10": 0.0655517578125, "loss_aux_layer_11": 0.0697021484375, "loss_aux_layer_12": 0.074462890625, "loss_aux_layer_13": 0.080078125, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.09814453125, "loss_aux_layer_16": 0.10791015625, "loss_aux_layer_17": 0.115966796875, "loss_aux_layer_18": 0.124267578125, "loss_aux_layer_19": 0.1275634765625, "loss_aux_layer_2": 0.0518798828125, "loss_aux_layer_20": 0.13525390625, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.061767578125, "loss_aux_layer_4": 0.064453125, "loss_aux_layer_5": 0.0660400390625, "loss_aux_layer_6": 0.0689697265625, "loss_aux_layer_7": 0.06640625, "loss_aux_layer_8": 0.0657958984375, "loss_aux_layer_9": 0.064453125, "step": 2711, "total_loss": 0.6266818195581436 }, { "epoch": 0.5369233815086122, "grad_norm": 0.9665358662605286, "learning_rate": 5e-05, "llm_loss": 0.6297368854284286, "loss": 2.8892, "loss_aux_layer_0": 0.01849365234375, "loss_aux_layer_1": 0.03997802734375, "loss_aux_layer_10": 0.0692138671875, "loss_aux_layer_11": 0.0736083984375, "loss_aux_layer_12": 0.0782470703125, "loss_aux_layer_13": 0.0841064453125, "loss_aux_layer_14": 0.0927734375, "loss_aux_layer_15": 0.1014404296875, "loss_aux_layer_16": 0.110595703125, "loss_aux_layer_17": 0.1185302734375, "loss_aux_layer_18": 0.1258544921875, "loss_aux_layer_19": 0.1285400390625, "loss_aux_layer_2": 0.0538330078125, "loss_aux_layer_20": 0.135986328125, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.163330078125, "loss_aux_layer_23": 0.20068359375, "loss_aux_layer_3": 0.0650634765625, "loss_aux_layer_4": 0.068115234375, "loss_aux_layer_5": 0.06982421875, "loss_aux_layer_6": 0.0728759765625, "loss_aux_layer_7": 0.070556640625, "loss_aux_layer_8": 0.069580078125, "loss_aux_layer_9": 0.06787109375, "step": 2712, "total_loss": 0.7222893983125687 }, { "epoch": 0.5371213621065135, "grad_norm": 0.9546138644218445, "learning_rate": 5e-05, "llm_loss": 0.5396162495017052, "loss": 2.5354, "loss_aux_layer_0": 0.018646240234375, "loss_aux_layer_1": 0.03826904296875, "loss_aux_layer_10": 0.068359375, "loss_aux_layer_11": 0.07275390625, "loss_aux_layer_12": 0.0777587890625, "loss_aux_layer_13": 0.0841064453125, "loss_aux_layer_14": 0.093994140625, "loss_aux_layer_15": 0.103759765625, "loss_aux_layer_16": 0.1136474609375, "loss_aux_layer_17": 0.12109375, "loss_aux_layer_18": 0.1297607421875, "loss_aux_layer_19": 0.1328125, "loss_aux_layer_2": 0.05413818359375, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.14990234375, "loss_aux_layer_22": 0.171630859375, "loss_aux_layer_23": 0.2119140625, "loss_aux_layer_3": 0.0645751953125, "loss_aux_layer_4": 0.067138671875, "loss_aux_layer_5": 0.0689697265625, "loss_aux_layer_6": 0.072021484375, "loss_aux_layer_7": 0.0694580078125, "loss_aux_layer_8": 0.0687255859375, "loss_aux_layer_9": 0.0673828125, "step": 2713, "total_loss": 0.633845180273056 }, { "epoch": 0.537319342704415, "grad_norm": 1.3439830541610718, "learning_rate": 5e-05, "llm_loss": 0.5590144395828247, "loss": 2.5883, "loss_aux_layer_0": 0.018951416015625, "loss_aux_layer_1": 0.03533935546875, "loss_aux_layer_10": 0.06195068359375, "loss_aux_layer_11": 0.066162109375, "loss_aux_layer_12": 0.07080078125, "loss_aux_layer_13": 0.0771484375, "loss_aux_layer_14": 0.0870361328125, "loss_aux_layer_15": 0.096923828125, "loss_aux_layer_16": 0.107177734375, "loss_aux_layer_17": 0.1151123046875, "loss_aux_layer_18": 0.1239013671875, "loss_aux_layer_19": 0.1282958984375, "loss_aux_layer_2": 0.048095703125, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.14404296875, "loss_aux_layer_22": 0.164794921875, "loss_aux_layer_23": 0.202880859375, "loss_aux_layer_3": 0.05706787109375, "loss_aux_layer_4": 0.05987548828125, "loss_aux_layer_5": 0.06170654296875, "loss_aux_layer_6": 0.064697265625, "loss_aux_layer_7": 0.0626220703125, "loss_aux_layer_8": 0.0618896484375, "loss_aux_layer_9": 0.060546875, "step": 2714, "total_loss": 0.6470741480588913 }, { "epoch": 0.5375173233023164, "grad_norm": 1.2387117147445679, "learning_rate": 5e-05, "llm_loss": 0.6019500195980072, "loss": 2.7811, "loss_aux_layer_0": 0.019378662109375, "loss_aux_layer_1": 0.03863525390625, "loss_aux_layer_10": 0.06884765625, "loss_aux_layer_11": 0.0733642578125, "loss_aux_layer_12": 0.078369140625, "loss_aux_layer_13": 0.084228515625, "loss_aux_layer_14": 0.0933837890625, "loss_aux_layer_15": 0.102783203125, "loss_aux_layer_16": 0.1126708984375, "loss_aux_layer_17": 0.12109375, "loss_aux_layer_18": 0.12939453125, "loss_aux_layer_19": 0.132568359375, "loss_aux_layer_2": 0.05242919921875, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.14697265625, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.205322265625, "loss_aux_layer_3": 0.0633544921875, "loss_aux_layer_4": 0.06591796875, "loss_aux_layer_5": 0.0679931640625, "loss_aux_layer_6": 0.0711669921875, "loss_aux_layer_7": 0.0689697265625, "loss_aux_layer_8": 0.0682373046875, "loss_aux_layer_9": 0.0672607421875, "step": 2715, "total_loss": 0.6952824592590332 }, { "epoch": 0.5377153039002178, "grad_norm": 1.0024007558822632, "learning_rate": 5e-05, "llm_loss": 0.5716159716248512, "loss": 2.6281, "loss_aux_layer_0": 0.018798828125, "loss_aux_layer_1": 0.03509521484375, "loss_aux_layer_10": 0.06085205078125, "loss_aux_layer_11": 0.06488037109375, "loss_aux_layer_12": 0.0694580078125, "loss_aux_layer_13": 0.0751953125, "loss_aux_layer_14": 0.0836181640625, "loss_aux_layer_15": 0.0926513671875, "loss_aux_layer_16": 0.1024169921875, "loss_aux_layer_17": 0.110595703125, "loss_aux_layer_18": 0.119140625, "loss_aux_layer_19": 0.1226806640625, "loss_aux_layer_2": 0.04718017578125, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.056396484375, "loss_aux_layer_4": 0.05889892578125, "loss_aux_layer_5": 0.06060791015625, "loss_aux_layer_6": 0.06317138671875, "loss_aux_layer_7": 0.06103515625, "loss_aux_layer_8": 0.060546875, "loss_aux_layer_9": 0.05938720703125, "step": 2716, "total_loss": 0.6570154577493668 }, { "epoch": 0.5379132844981191, "grad_norm": 1.3173127174377441, "learning_rate": 5e-05, "llm_loss": 0.5868026837706566, "loss": 2.7249, "loss_aux_layer_0": 0.018524169921875, "loss_aux_layer_1": 0.04180908203125, "loss_aux_layer_10": 0.0712890625, "loss_aux_layer_11": 0.075927734375, "loss_aux_layer_12": 0.0804443359375, "loss_aux_layer_13": 0.0863037109375, "loss_aux_layer_14": 0.0947265625, "loss_aux_layer_15": 0.103271484375, "loss_aux_layer_16": 0.1121826171875, "loss_aux_layer_17": 0.119140625, "loss_aux_layer_18": 0.1265869140625, "loss_aux_layer_19": 0.1292724609375, "loss_aux_layer_2": 0.056884765625, "loss_aux_layer_20": 0.136474609375, "loss_aux_layer_21": 0.144287109375, "loss_aux_layer_22": 0.16650390625, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.0675048828125, "loss_aux_layer_4": 0.0704345703125, "loss_aux_layer_5": 0.07177734375, "loss_aux_layer_6": 0.074951171875, "loss_aux_layer_7": 0.0723876953125, "loss_aux_layer_8": 0.0714111328125, "loss_aux_layer_9": 0.0697021484375, "step": 2717, "total_loss": 0.6812169253826141 }, { "epoch": 0.5381112650960206, "grad_norm": 0.8914618492126465, "learning_rate": 5e-05, "llm_loss": 0.523427702486515, "loss": 2.4502, "loss_aux_layer_0": 0.018646240234375, "loss_aux_layer_1": 0.03533935546875, "loss_aux_layer_10": 0.06329345703125, "loss_aux_layer_11": 0.06744384765625, "loss_aux_layer_12": 0.0721435546875, "loss_aux_layer_13": 0.078369140625, "loss_aux_layer_14": 0.088134765625, "loss_aux_layer_15": 0.0980224609375, "loss_aux_layer_16": 0.1085205078125, "loss_aux_layer_17": 0.1168212890625, "loss_aux_layer_18": 0.1259765625, "loss_aux_layer_19": 0.12890625, "loss_aux_layer_2": 0.04876708984375, "loss_aux_layer_20": 0.136962890625, "loss_aux_layer_21": 0.1455078125, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.204833984375, "loss_aux_layer_3": 0.058349609375, "loss_aux_layer_4": 0.0609130859375, "loss_aux_layer_5": 0.06292724609375, "loss_aux_layer_6": 0.06597900390625, "loss_aux_layer_7": 0.06341552734375, "loss_aux_layer_8": 0.06292724609375, "loss_aux_layer_9": 0.06207275390625, "step": 2718, "total_loss": 0.612558588385582 }, { "epoch": 0.538309245693922, "grad_norm": 1.389124870300293, "learning_rate": 5e-05, "llm_loss": 0.6527021080255508, "loss": 2.9707, "loss_aux_layer_0": 0.018524169921875, "loss_aux_layer_1": 0.03863525390625, "loss_aux_layer_10": 0.06512451171875, "loss_aux_layer_11": 0.0694580078125, "loss_aux_layer_12": 0.073974609375, "loss_aux_layer_13": 0.0802001953125, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.09814453125, "loss_aux_layer_16": 0.1080322265625, "loss_aux_layer_17": 0.1160888671875, "loss_aux_layer_18": 0.12451171875, "loss_aux_layer_19": 0.1279296875, "loss_aux_layer_2": 0.05181884765625, "loss_aux_layer_20": 0.1357421875, "loss_aux_layer_21": 0.143310546875, "loss_aux_layer_22": 0.163330078125, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.061767578125, "loss_aux_layer_4": 0.06427001953125, "loss_aux_layer_5": 0.065673828125, "loss_aux_layer_6": 0.068359375, "loss_aux_layer_7": 0.06585693359375, "loss_aux_layer_8": 0.065185546875, "loss_aux_layer_9": 0.06402587890625, "step": 2719, "total_loss": 0.7426796704530716 }, { "epoch": 0.5385072262918233, "grad_norm": 0.8541128635406494, "learning_rate": 5e-05, "llm_loss": 0.5593801066279411, "loss": 2.5898, "loss_aux_layer_0": 0.01922607421875, "loss_aux_layer_1": 0.03662109375, "loss_aux_layer_10": 0.0634765625, "loss_aux_layer_11": 0.0675048828125, "loss_aux_layer_12": 0.0721435546875, "loss_aux_layer_13": 0.0780029296875, "loss_aux_layer_14": 0.08642578125, "loss_aux_layer_15": 0.0953369140625, "loss_aux_layer_16": 0.105224609375, "loss_aux_layer_17": 0.1134033203125, "loss_aux_layer_18": 0.12158203125, "loss_aux_layer_19": 0.1251220703125, "loss_aux_layer_2": 0.049560546875, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.141357421875, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.059326171875, "loss_aux_layer_4": 0.06195068359375, "loss_aux_layer_5": 0.063720703125, "loss_aux_layer_6": 0.06634521484375, "loss_aux_layer_7": 0.0640869140625, "loss_aux_layer_8": 0.06365966796875, "loss_aux_layer_9": 0.06219482421875, "step": 2720, "total_loss": 0.6474542915821075 }, { "epoch": 0.5387052068897248, "grad_norm": 1.6571199893951416, "learning_rate": 5e-05, "llm_loss": 0.6120481640100479, "loss": 2.8339, "loss_aux_layer_0": 0.017791748046875, "loss_aux_layer_1": 0.041748046875, "loss_aux_layer_10": 0.0726318359375, "loss_aux_layer_11": 0.0770263671875, "loss_aux_layer_12": 0.081787109375, "loss_aux_layer_13": 0.087890625, "loss_aux_layer_14": 0.0970458984375, "loss_aux_layer_15": 0.1055908203125, "loss_aux_layer_16": 0.11474609375, "loss_aux_layer_17": 0.12255859375, "loss_aux_layer_18": 0.130859375, "loss_aux_layer_19": 0.13330078125, "loss_aux_layer_2": 0.05780029296875, "loss_aux_layer_20": 0.140869140625, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.20703125, "loss_aux_layer_3": 0.0687255859375, "loss_aux_layer_4": 0.071533203125, "loss_aux_layer_5": 0.072998046875, "loss_aux_layer_6": 0.0762939453125, "loss_aux_layer_7": 0.0738525390625, "loss_aux_layer_8": 0.0728759765625, "loss_aux_layer_9": 0.0711669921875, "step": 2721, "total_loss": 0.7084722220897675 }, { "epoch": 0.5389031874876262, "grad_norm": 1.0384409427642822, "learning_rate": 5e-05, "llm_loss": 0.6289426237344742, "loss": 2.8761, "loss_aux_layer_0": 0.018585205078125, "loss_aux_layer_1": 0.037353515625, "loss_aux_layer_10": 0.0655517578125, "loss_aux_layer_11": 0.06982421875, "loss_aux_layer_12": 0.074951171875, "loss_aux_layer_13": 0.0810546875, "loss_aux_layer_14": 0.0899658203125, "loss_aux_layer_15": 0.09912109375, "loss_aux_layer_16": 0.1087646484375, "loss_aux_layer_17": 0.1170654296875, "loss_aux_layer_18": 0.125732421875, "loss_aux_layer_19": 0.12841796875, "loss_aux_layer_2": 0.05084228515625, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.142578125, "loss_aux_layer_22": 0.16357421875, "loss_aux_layer_23": 0.19970703125, "loss_aux_layer_3": 0.06134033203125, "loss_aux_layer_4": 0.0638427734375, "loss_aux_layer_5": 0.065185546875, "loss_aux_layer_6": 0.0682373046875, "loss_aux_layer_7": 0.066162109375, "loss_aux_layer_8": 0.0654296875, "loss_aux_layer_9": 0.06396484375, "step": 2722, "total_loss": 0.7190270125865936 }, { "epoch": 0.5391011680855277, "grad_norm": 1.1750272512435913, "learning_rate": 5e-05, "llm_loss": 0.6815078258514404, "loss": 3.0874, "loss_aux_layer_0": 0.018951416015625, "loss_aux_layer_1": 0.038818359375, "loss_aux_layer_10": 0.0662841796875, "loss_aux_layer_11": 0.07049560546875, "loss_aux_layer_12": 0.0753173828125, "loss_aux_layer_13": 0.080810546875, "loss_aux_layer_14": 0.08935546875, "loss_aux_layer_15": 0.0977783203125, "loss_aux_layer_16": 0.107421875, "loss_aux_layer_17": 0.115234375, "loss_aux_layer_18": 0.123291015625, "loss_aux_layer_19": 0.126220703125, "loss_aux_layer_2": 0.0528564453125, "loss_aux_layer_20": 0.1337890625, "loss_aux_layer_21": 0.141357421875, "loss_aux_layer_22": 0.163330078125, "loss_aux_layer_23": 0.2021484375, "loss_aux_layer_3": 0.0633544921875, "loss_aux_layer_4": 0.06591796875, "loss_aux_layer_5": 0.0672607421875, "loss_aux_layer_6": 0.06988525390625, "loss_aux_layer_7": 0.0675048828125, "loss_aux_layer_8": 0.0665283203125, "loss_aux_layer_9": 0.0650634765625, "step": 2723, "total_loss": 0.7718568742275238 }, { "epoch": 0.539299148683429, "grad_norm": 1.2297557592391968, "learning_rate": 5e-05, "llm_loss": 0.644441083073616, "loss": 2.9366, "loss_aux_layer_0": 0.019287109375, "loss_aux_layer_1": 0.0367431640625, "loss_aux_layer_10": 0.06463623046875, "loss_aux_layer_11": 0.0689697265625, "loss_aux_layer_12": 0.07373046875, "loss_aux_layer_13": 0.0794677734375, "loss_aux_layer_14": 0.0888671875, "loss_aux_layer_15": 0.09814453125, "loss_aux_layer_16": 0.1085205078125, "loss_aux_layer_17": 0.116943359375, "loss_aux_layer_18": 0.1253662109375, "loss_aux_layer_19": 0.1292724609375, "loss_aux_layer_2": 0.04986572265625, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.144287109375, "loss_aux_layer_22": 0.1640625, "loss_aux_layer_23": 0.200927734375, "loss_aux_layer_3": 0.0596923828125, "loss_aux_layer_4": 0.0625, "loss_aux_layer_5": 0.06439208984375, "loss_aux_layer_6": 0.0675048828125, "loss_aux_layer_7": 0.06500244140625, "loss_aux_layer_8": 0.06439208984375, "loss_aux_layer_9": 0.06317138671875, "step": 2724, "total_loss": 0.7341544926166534 }, { "epoch": 0.5394971292813304, "grad_norm": 1.0946451425552368, "learning_rate": 5e-05, "llm_loss": 0.6051219552755356, "loss": 2.7791, "loss_aux_layer_0": 0.019622802734375, "loss_aux_layer_1": 0.0379638671875, "loss_aux_layer_10": 0.06536865234375, "loss_aux_layer_11": 0.069580078125, "loss_aux_layer_12": 0.07421875, "loss_aux_layer_13": 0.079833984375, "loss_aux_layer_14": 0.088623046875, "loss_aux_layer_15": 0.09765625, "loss_aux_layer_16": 0.107177734375, "loss_aux_layer_17": 0.1156005859375, "loss_aux_layer_18": 0.1239013671875, "loss_aux_layer_19": 0.1265869140625, "loss_aux_layer_2": 0.051025390625, "loss_aux_layer_20": 0.134521484375, "loss_aux_layer_21": 0.14306640625, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.0611572265625, "loss_aux_layer_4": 0.06402587890625, "loss_aux_layer_5": 0.0655517578125, "loss_aux_layer_6": 0.068359375, "loss_aux_layer_7": 0.06610107421875, "loss_aux_layer_8": 0.06524658203125, "loss_aux_layer_9": 0.06390380859375, "step": 2725, "total_loss": 0.6947766840457916 }, { "epoch": 0.5396951098792319, "grad_norm": 0.9545626640319824, "learning_rate": 5e-05, "llm_loss": 0.5779683440923691, "loss": 2.6772, "loss_aux_layer_0": 0.017974853515625, "loss_aux_layer_1": 0.0380859375, "loss_aux_layer_10": 0.0662841796875, "loss_aux_layer_11": 0.0706787109375, "loss_aux_layer_12": 0.07568359375, "loss_aux_layer_13": 0.0814208984375, "loss_aux_layer_14": 0.0904541015625, "loss_aux_layer_15": 0.0994873046875, "loss_aux_layer_16": 0.1092529296875, "loss_aux_layer_17": 0.1170654296875, "loss_aux_layer_18": 0.126708984375, "loss_aux_layer_19": 0.1300048828125, "loss_aux_layer_2": 0.05157470703125, "loss_aux_layer_20": 0.13818359375, "loss_aux_layer_21": 0.146240234375, "loss_aux_layer_22": 0.166748046875, "loss_aux_layer_23": 0.20556640625, "loss_aux_layer_3": 0.06201171875, "loss_aux_layer_4": 0.06475830078125, "loss_aux_layer_5": 0.06658935546875, "loss_aux_layer_6": 0.0694580078125, "loss_aux_layer_7": 0.067138671875, "loss_aux_layer_8": 0.0662841796875, "loss_aux_layer_9": 0.06488037109375, "step": 2726, "total_loss": 0.6692995205521584 }, { "epoch": 0.5398930904771332, "grad_norm": 1.148530125617981, "learning_rate": 5e-05, "llm_loss": 0.6475231200456619, "loss": 2.9465, "loss_aux_layer_0": 0.019073486328125, "loss_aux_layer_1": 0.0374755859375, "loss_aux_layer_10": 0.06494140625, "loss_aux_layer_11": 0.069091796875, "loss_aux_layer_12": 0.07421875, "loss_aux_layer_13": 0.0799560546875, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.0975341796875, "loss_aux_layer_16": 0.10693359375, "loss_aux_layer_17": 0.114990234375, "loss_aux_layer_18": 0.1231689453125, "loss_aux_layer_19": 0.125732421875, "loss_aux_layer_2": 0.050537109375, "loss_aux_layer_20": 0.13330078125, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.0609130859375, "loss_aux_layer_4": 0.0634765625, "loss_aux_layer_5": 0.0650634765625, "loss_aux_layer_6": 0.0679931640625, "loss_aux_layer_7": 0.06573486328125, "loss_aux_layer_8": 0.06475830078125, "loss_aux_layer_9": 0.0633544921875, "step": 2727, "total_loss": 0.7366268187761307 }, { "epoch": 0.5400910710750346, "grad_norm": 0.8768374919891357, "learning_rate": 5e-05, "llm_loss": 0.5824037343263626, "loss": 2.6827, "loss_aux_layer_0": 0.01922607421875, "loss_aux_layer_1": 0.0364990234375, "loss_aux_layer_10": 0.0643310546875, "loss_aux_layer_11": 0.068359375, "loss_aux_layer_12": 0.0736083984375, "loss_aux_layer_13": 0.0789794921875, "loss_aux_layer_14": 0.087890625, "loss_aux_layer_15": 0.09619140625, "loss_aux_layer_16": 0.10546875, "loss_aux_layer_17": 0.113037109375, "loss_aux_layer_18": 0.12158203125, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.04998779296875, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.160888671875, "loss_aux_layer_23": 0.199462890625, "loss_aux_layer_3": 0.0599365234375, "loss_aux_layer_4": 0.06231689453125, "loss_aux_layer_5": 0.0643310546875, "loss_aux_layer_6": 0.0670166015625, "loss_aux_layer_7": 0.064697265625, "loss_aux_layer_8": 0.0640869140625, "loss_aux_layer_9": 0.06304931640625, "step": 2728, "total_loss": 0.6706640720367432 }, { "epoch": 0.5402890516729361, "grad_norm": 0.9180760383605957, "learning_rate": 5e-05, "llm_loss": 0.5285412818193436, "loss": 2.4863, "loss_aux_layer_0": 0.018280029296875, "loss_aux_layer_1": 0.03875732421875, "loss_aux_layer_10": 0.0679931640625, "loss_aux_layer_11": 0.072509765625, "loss_aux_layer_12": 0.0775146484375, "loss_aux_layer_13": 0.0831298828125, "loss_aux_layer_14": 0.0927734375, "loss_aux_layer_15": 0.102294921875, "loss_aux_layer_16": 0.11181640625, "loss_aux_layer_17": 0.119873046875, "loss_aux_layer_18": 0.12841796875, "loss_aux_layer_19": 0.13232421875, "loss_aux_layer_2": 0.05316162109375, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.1474609375, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.206787109375, "loss_aux_layer_3": 0.06341552734375, "loss_aux_layer_4": 0.06640625, "loss_aux_layer_5": 0.0682373046875, "loss_aux_layer_6": 0.0712890625, "loss_aux_layer_7": 0.06884765625, "loss_aux_layer_8": 0.068115234375, "loss_aux_layer_9": 0.0665283203125, "step": 2729, "total_loss": 0.6215849220752716 }, { "epoch": 0.5404870322708375, "grad_norm": 0.9436808824539185, "learning_rate": 5e-05, "llm_loss": 0.5788165926933289, "loss": 2.6786, "loss_aux_layer_0": 0.018524169921875, "loss_aux_layer_1": 0.03912353515625, "loss_aux_layer_10": 0.06646728515625, "loss_aux_layer_11": 0.0711669921875, "loss_aux_layer_12": 0.0758056640625, "loss_aux_layer_13": 0.0811767578125, "loss_aux_layer_14": 0.0902099609375, "loss_aux_layer_15": 0.0987548828125, "loss_aux_layer_16": 0.108642578125, "loss_aux_layer_17": 0.116455078125, "loss_aux_layer_18": 0.124755859375, "loss_aux_layer_19": 0.127685546875, "loss_aux_layer_2": 0.05218505859375, "loss_aux_layer_20": 0.13623046875, "loss_aux_layer_21": 0.14306640625, "loss_aux_layer_22": 0.1640625, "loss_aux_layer_23": 0.200927734375, "loss_aux_layer_3": 0.0626220703125, "loss_aux_layer_4": 0.0655517578125, "loss_aux_layer_5": 0.06689453125, "loss_aux_layer_6": 0.0701904296875, "loss_aux_layer_7": 0.0675048828125, "loss_aux_layer_8": 0.06671142578125, "loss_aux_layer_9": 0.0650634765625, "step": 2730, "total_loss": 0.669644370675087 }, { "epoch": 0.5406850128687388, "grad_norm": 0.9495300650596619, "learning_rate": 5e-05, "llm_loss": 0.6560205817222595, "loss": 2.9823, "loss_aux_layer_0": 0.018402099609375, "loss_aux_layer_1": 0.0386962890625, "loss_aux_layer_10": 0.06573486328125, "loss_aux_layer_11": 0.0701904296875, "loss_aux_layer_12": 0.0743408203125, "loss_aux_layer_13": 0.0799560546875, "loss_aux_layer_14": 0.088623046875, "loss_aux_layer_15": 0.0972900390625, "loss_aux_layer_16": 0.106689453125, "loss_aux_layer_17": 0.1146240234375, "loss_aux_layer_18": 0.122802734375, "loss_aux_layer_19": 0.1263427734375, "loss_aux_layer_2": 0.051513671875, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.197998046875, "loss_aux_layer_3": 0.06207275390625, "loss_aux_layer_4": 0.06494140625, "loss_aux_layer_5": 0.066162109375, "loss_aux_layer_6": 0.069091796875, "loss_aux_layer_7": 0.0667724609375, "loss_aux_layer_8": 0.06591796875, "loss_aux_layer_9": 0.06439208984375, "step": 2731, "total_loss": 0.7455731481313705 }, { "epoch": 0.5408829934666403, "grad_norm": 0.9641822576522827, "learning_rate": 5e-05, "llm_loss": 0.5728230625391006, "loss": 2.6542, "loss_aux_layer_0": 0.019012451171875, "loss_aux_layer_1": 0.0384521484375, "loss_aux_layer_10": 0.066650390625, "loss_aux_layer_11": 0.07080078125, "loss_aux_layer_12": 0.074951171875, "loss_aux_layer_13": 0.0804443359375, "loss_aux_layer_14": 0.0888671875, "loss_aux_layer_15": 0.09716796875, "loss_aux_layer_16": 0.106689453125, "loss_aux_layer_17": 0.1141357421875, "loss_aux_layer_18": 0.12255859375, "loss_aux_layer_19": 0.12646484375, "loss_aux_layer_2": 0.05218505859375, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.14404296875, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.2060546875, "loss_aux_layer_3": 0.062744140625, "loss_aux_layer_4": 0.06536865234375, "loss_aux_layer_5": 0.06732177734375, "loss_aux_layer_6": 0.0704345703125, "loss_aux_layer_7": 0.0682373046875, "loss_aux_layer_8": 0.06719970703125, "loss_aux_layer_9": 0.06561279296875, "step": 2732, "total_loss": 0.6635388284921646 }, { "epoch": 0.5410809740645417, "grad_norm": 0.9639847874641418, "learning_rate": 5e-05, "llm_loss": 0.5842453464865685, "loss": 2.6989, "loss_aux_layer_0": 0.019927978515625, "loss_aux_layer_1": 0.0394287109375, "loss_aux_layer_10": 0.0665283203125, "loss_aux_layer_11": 0.07080078125, "loss_aux_layer_12": 0.0758056640625, "loss_aux_layer_13": 0.0819091796875, "loss_aux_layer_14": 0.090576171875, "loss_aux_layer_15": 0.0992431640625, "loss_aux_layer_16": 0.108642578125, "loss_aux_layer_17": 0.1163330078125, "loss_aux_layer_18": 0.12451171875, "loss_aux_layer_19": 0.126708984375, "loss_aux_layer_2": 0.052490234375, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.0628662109375, "loss_aux_layer_4": 0.06549072265625, "loss_aux_layer_5": 0.0670166015625, "loss_aux_layer_6": 0.0697021484375, "loss_aux_layer_7": 0.0673828125, "loss_aux_layer_8": 0.0667724609375, "loss_aux_layer_9": 0.0654296875, "step": 2733, "total_loss": 0.6747215241193771 }, { "epoch": 0.5412789546624431, "grad_norm": 0.7757923007011414, "learning_rate": 5e-05, "llm_loss": 0.5124718770384789, "loss": 2.4211, "loss_aux_layer_0": 0.0185546875, "loss_aux_layer_1": 0.040283203125, "loss_aux_layer_10": 0.0687255859375, "loss_aux_layer_11": 0.0733642578125, "loss_aux_layer_12": 0.078369140625, "loss_aux_layer_13": 0.084228515625, "loss_aux_layer_14": 0.093017578125, "loss_aux_layer_15": 0.1016845703125, "loss_aux_layer_16": 0.1112060546875, "loss_aux_layer_17": 0.118408203125, "loss_aux_layer_18": 0.1268310546875, "loss_aux_layer_19": 0.12939453125, "loss_aux_layer_2": 0.05413818359375, "loss_aux_layer_20": 0.13720703125, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.165771484375, "loss_aux_layer_23": 0.203857421875, "loss_aux_layer_3": 0.0648193359375, "loss_aux_layer_4": 0.06707763671875, "loss_aux_layer_5": 0.0684814453125, "loss_aux_layer_6": 0.071533203125, "loss_aux_layer_7": 0.06915283203125, "loss_aux_layer_8": 0.06842041015625, "loss_aux_layer_9": 0.06719970703125, "step": 2734, "total_loss": 0.6052795797586441 }, { "epoch": 0.5414769352603445, "grad_norm": 0.8135132789611816, "learning_rate": 5e-05, "llm_loss": 0.5506402403116226, "loss": 2.5684, "loss_aux_layer_0": 0.01861572265625, "loss_aux_layer_1": 0.03936767578125, "loss_aux_layer_10": 0.0672607421875, "loss_aux_layer_11": 0.0716552734375, "loss_aux_layer_12": 0.07666015625, "loss_aux_layer_13": 0.0823974609375, "loss_aux_layer_14": 0.0911865234375, "loss_aux_layer_15": 0.0994873046875, "loss_aux_layer_16": 0.1092529296875, "loss_aux_layer_17": 0.1173095703125, "loss_aux_layer_18": 0.1251220703125, "loss_aux_layer_19": 0.1280517578125, "loss_aux_layer_2": 0.052978515625, "loss_aux_layer_20": 0.1357421875, "loss_aux_layer_21": 0.14404296875, "loss_aux_layer_22": 0.164794921875, "loss_aux_layer_23": 0.20263671875, "loss_aux_layer_3": 0.06329345703125, "loss_aux_layer_4": 0.06591796875, "loss_aux_layer_5": 0.0675048828125, "loss_aux_layer_6": 0.070556640625, "loss_aux_layer_7": 0.068115234375, "loss_aux_layer_8": 0.0675048828125, "loss_aux_layer_9": 0.06585693359375, "step": 2735, "total_loss": 0.6421055346727371 }, { "epoch": 0.5416749158582459, "grad_norm": 0.9705295562744141, "learning_rate": 5e-05, "llm_loss": 0.6190841570496559, "loss": 2.8378, "loss_aux_layer_0": 0.019256591796875, "loss_aux_layer_1": 0.0377197265625, "loss_aux_layer_10": 0.0660400390625, "loss_aux_layer_11": 0.0703125, "loss_aux_layer_12": 0.0748291015625, "loss_aux_layer_13": 0.0806884765625, "loss_aux_layer_14": 0.089599609375, "loss_aux_layer_15": 0.0986328125, "loss_aux_layer_16": 0.1083984375, "loss_aux_layer_17": 0.1165771484375, "loss_aux_layer_18": 0.124755859375, "loss_aux_layer_19": 0.1275634765625, "loss_aux_layer_2": 0.05047607421875, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.203125, "loss_aux_layer_3": 0.06060791015625, "loss_aux_layer_4": 0.0634765625, "loss_aux_layer_5": 0.065673828125, "loss_aux_layer_6": 0.0689697265625, "loss_aux_layer_7": 0.06683349609375, "loss_aux_layer_8": 0.06591796875, "loss_aux_layer_9": 0.064697265625, "step": 2736, "total_loss": 0.7094575315713882 }, { "epoch": 0.5418728964561473, "grad_norm": 1.4419156312942505, "learning_rate": 5e-05, "llm_loss": 0.6404575705528259, "loss": 2.9152, "loss_aux_layer_0": 0.01806640625, "loss_aux_layer_1": 0.03826904296875, "loss_aux_layer_10": 0.0655517578125, "loss_aux_layer_11": 0.070068359375, "loss_aux_layer_12": 0.074462890625, "loss_aux_layer_13": 0.0799560546875, "loss_aux_layer_14": 0.0882568359375, "loss_aux_layer_15": 0.0968017578125, "loss_aux_layer_16": 0.1060791015625, "loss_aux_layer_17": 0.11376953125, "loss_aux_layer_18": 0.12109375, "loss_aux_layer_19": 0.123291015625, "loss_aux_layer_2": 0.05059814453125, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.061279296875, "loss_aux_layer_4": 0.06353759765625, "loss_aux_layer_5": 0.06488037109375, "loss_aux_layer_6": 0.0677490234375, "loss_aux_layer_7": 0.0655517578125, "loss_aux_layer_8": 0.06536865234375, "loss_aux_layer_9": 0.06414794921875, "step": 2737, "total_loss": 0.7288016676902771 }, { "epoch": 0.5420708770540487, "grad_norm": 1.3877779245376587, "learning_rate": 5e-05, "llm_loss": 0.5849717482924461, "loss": 2.6901, "loss_aux_layer_0": 0.018798828125, "loss_aux_layer_1": 0.0369873046875, "loss_aux_layer_10": 0.06378173828125, "loss_aux_layer_11": 0.068115234375, "loss_aux_layer_12": 0.0728759765625, "loss_aux_layer_13": 0.0787353515625, "loss_aux_layer_14": 0.087890625, "loss_aux_layer_15": 0.0963134765625, "loss_aux_layer_16": 0.105712890625, "loss_aux_layer_17": 0.1134033203125, "loss_aux_layer_18": 0.1212158203125, "loss_aux_layer_19": 0.1239013671875, "loss_aux_layer_2": 0.0491943359375, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.0589599609375, "loss_aux_layer_4": 0.06134033203125, "loss_aux_layer_5": 0.0631103515625, "loss_aux_layer_6": 0.06591796875, "loss_aux_layer_7": 0.06402587890625, "loss_aux_layer_8": 0.0634765625, "loss_aux_layer_9": 0.06207275390625, "step": 2738, "total_loss": 0.6725129783153534 }, { "epoch": 0.5422688576519501, "grad_norm": 1.5367050170898438, "learning_rate": 5e-05, "llm_loss": 0.5837013125419617, "loss": 2.7116, "loss_aux_layer_0": 0.018463134765625, "loss_aux_layer_1": 0.0396728515625, "loss_aux_layer_10": 0.0692138671875, "loss_aux_layer_11": 0.07373046875, "loss_aux_layer_12": 0.0791015625, "loss_aux_layer_13": 0.08544921875, "loss_aux_layer_14": 0.0950927734375, "loss_aux_layer_15": 0.1041259765625, "loss_aux_layer_16": 0.114501953125, "loss_aux_layer_17": 0.1220703125, "loss_aux_layer_18": 0.13037109375, "loss_aux_layer_19": 0.1328125, "loss_aux_layer_2": 0.05419921875, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.16748046875, "loss_aux_layer_23": 0.205078125, "loss_aux_layer_3": 0.06524658203125, "loss_aux_layer_4": 0.0677490234375, "loss_aux_layer_5": 0.0694580078125, "loss_aux_layer_6": 0.0726318359375, "loss_aux_layer_7": 0.0704345703125, "loss_aux_layer_8": 0.0693359375, "loss_aux_layer_9": 0.06787109375, "step": 2739, "total_loss": 0.6778915971517563 }, { "epoch": 0.5424668382498515, "grad_norm": 1.2024123668670654, "learning_rate": 5e-05, "llm_loss": 0.519469365477562, "loss": 2.443, "loss_aux_layer_0": 0.018096923828125, "loss_aux_layer_1": 0.03717041015625, "loss_aux_layer_10": 0.06610107421875, "loss_aux_layer_11": 0.0703125, "loss_aux_layer_12": 0.07568359375, "loss_aux_layer_13": 0.081787109375, "loss_aux_layer_14": 0.091064453125, "loss_aux_layer_15": 0.100341796875, "loss_aux_layer_16": 0.1103515625, "loss_aux_layer_17": 0.1185302734375, "loss_aux_layer_18": 0.126953125, "loss_aux_layer_19": 0.1298828125, "loss_aux_layer_2": 0.05072021484375, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.145751953125, "loss_aux_layer_22": 0.166748046875, "loss_aux_layer_23": 0.205078125, "loss_aux_layer_3": 0.06121826171875, "loss_aux_layer_4": 0.06396484375, "loss_aux_layer_5": 0.06610107421875, "loss_aux_layer_6": 0.0689697265625, "loss_aux_layer_7": 0.0670166015625, "loss_aux_layer_8": 0.06640625, "loss_aux_layer_9": 0.0650634765625, "step": 2740, "total_loss": 0.6107488721609116 }, { "epoch": 0.542664818847753, "grad_norm": 1.342604160308838, "learning_rate": 5e-05, "llm_loss": 0.6125211492180824, "loss": 2.8162, "loss_aux_layer_0": 0.018798828125, "loss_aux_layer_1": 0.03826904296875, "loss_aux_layer_10": 0.0672607421875, "loss_aux_layer_11": 0.0714111328125, "loss_aux_layer_12": 0.07666015625, "loss_aux_layer_13": 0.0831298828125, "loss_aux_layer_14": 0.0919189453125, "loss_aux_layer_15": 0.1007080078125, "loss_aux_layer_16": 0.10986328125, "loss_aux_layer_17": 0.1175537109375, "loss_aux_layer_18": 0.1253662109375, "loss_aux_layer_19": 0.12841796875, "loss_aux_layer_2": 0.0526123046875, "loss_aux_layer_20": 0.135986328125, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.205078125, "loss_aux_layer_3": 0.0625, "loss_aux_layer_4": 0.06500244140625, "loss_aux_layer_5": 0.06671142578125, "loss_aux_layer_6": 0.0694580078125, "loss_aux_layer_7": 0.0672607421875, "loss_aux_layer_8": 0.06689453125, "loss_aux_layer_9": 0.06561279296875, "step": 2741, "total_loss": 0.7040589898824692 }, { "epoch": 0.5428627994456543, "grad_norm": 1.1118570566177368, "learning_rate": 5e-05, "llm_loss": 0.6009097397327423, "loss": 2.765, "loss_aux_layer_0": 0.019439697265625, "loss_aux_layer_1": 0.03778076171875, "loss_aux_layer_10": 0.0653076171875, "loss_aux_layer_11": 0.0694580078125, "loss_aux_layer_12": 0.07421875, "loss_aux_layer_13": 0.080078125, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.09814453125, "loss_aux_layer_16": 0.1077880859375, "loss_aux_layer_17": 0.1162109375, "loss_aux_layer_18": 0.1248779296875, "loss_aux_layer_19": 0.12841796875, "loss_aux_layer_2": 0.0518798828125, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.20263671875, "loss_aux_layer_3": 0.061767578125, "loss_aux_layer_4": 0.064453125, "loss_aux_layer_5": 0.06585693359375, "loss_aux_layer_6": 0.06884765625, "loss_aux_layer_7": 0.0662841796875, "loss_aux_layer_8": 0.06549072265625, "loss_aux_layer_9": 0.06402587890625, "step": 2742, "total_loss": 0.6912419348955154 }, { "epoch": 0.5430607800435557, "grad_norm": 1.4141809940338135, "learning_rate": 5e-05, "llm_loss": 0.6101306080818176, "loss": 2.7914, "loss_aux_layer_0": 0.018646240234375, "loss_aux_layer_1": 0.03509521484375, "loss_aux_layer_10": 0.062744140625, "loss_aux_layer_11": 0.066650390625, "loss_aux_layer_12": 0.07177734375, "loss_aux_layer_13": 0.078125, "loss_aux_layer_14": 0.0870361328125, "loss_aux_layer_15": 0.0963134765625, "loss_aux_layer_16": 0.1063232421875, "loss_aux_layer_17": 0.1148681640625, "loss_aux_layer_18": 0.123291015625, "loss_aux_layer_19": 0.126953125, "loss_aux_layer_2": 0.0478515625, "loss_aux_layer_20": 0.134765625, "loss_aux_layer_21": 0.142578125, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.05755615234375, "loss_aux_layer_4": 0.0601806640625, "loss_aux_layer_5": 0.06170654296875, "loss_aux_layer_6": 0.0648193359375, "loss_aux_layer_7": 0.062744140625, "loss_aux_layer_8": 0.0621337890625, "loss_aux_layer_9": 0.06109619140625, "step": 2743, "total_loss": 0.6978580057621002 }, { "epoch": 0.5432587606414572, "grad_norm": 1.1092908382415771, "learning_rate": 5e-05, "llm_loss": 0.5822520107030869, "loss": 2.69, "loss_aux_layer_0": 0.01861572265625, "loss_aux_layer_1": 0.03839111328125, "loss_aux_layer_10": 0.0653076171875, "loss_aux_layer_11": 0.069580078125, "loss_aux_layer_12": 0.07421875, "loss_aux_layer_13": 0.0797119140625, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.0987548828125, "loss_aux_layer_16": 0.1083984375, "loss_aux_layer_17": 0.116455078125, "loss_aux_layer_18": 0.125, "loss_aux_layer_19": 0.1290283203125, "loss_aux_layer_2": 0.05145263671875, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.14453125, "loss_aux_layer_22": 0.164794921875, "loss_aux_layer_23": 0.20263671875, "loss_aux_layer_3": 0.0614013671875, "loss_aux_layer_4": 0.06390380859375, "loss_aux_layer_5": 0.06536865234375, "loss_aux_layer_6": 0.068115234375, "loss_aux_layer_7": 0.0660400390625, "loss_aux_layer_8": 0.06512451171875, "loss_aux_layer_9": 0.0638427734375, "step": 2744, "total_loss": 0.6725004464387894 }, { "epoch": 0.5434567412393585, "grad_norm": 1.2046641111373901, "learning_rate": 5e-05, "llm_loss": 0.6688064932823181, "loss": 3.03, "loss_aux_layer_0": 0.01824951171875, "loss_aux_layer_1": 0.0362548828125, "loss_aux_layer_10": 0.06402587890625, "loss_aux_layer_11": 0.068359375, "loss_aux_layer_12": 0.0733642578125, "loss_aux_layer_13": 0.07958984375, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.09814453125, "loss_aux_layer_16": 0.10791015625, "loss_aux_layer_17": 0.1163330078125, "loss_aux_layer_18": 0.124267578125, "loss_aux_layer_19": 0.1270751953125, "loss_aux_layer_2": 0.0494384765625, "loss_aux_layer_20": 0.134521484375, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.05950927734375, "loss_aux_layer_4": 0.0623779296875, "loss_aux_layer_5": 0.06396484375, "loss_aux_layer_6": 0.06671142578125, "loss_aux_layer_7": 0.06463623046875, "loss_aux_layer_8": 0.06402587890625, "loss_aux_layer_9": 0.062744140625, "step": 2745, "total_loss": 0.7574938982725143 }, { "epoch": 0.5436547218372599, "grad_norm": 0.969358503818512, "learning_rate": 5e-05, "llm_loss": 0.6171216666698456, "loss": 2.8333, "loss_aux_layer_0": 0.01812744140625, "loss_aux_layer_1": 0.037353515625, "loss_aux_layer_10": 0.0670166015625, "loss_aux_layer_11": 0.0709228515625, "loss_aux_layer_12": 0.075439453125, "loss_aux_layer_13": 0.08154296875, "loss_aux_layer_14": 0.0904541015625, "loss_aux_layer_15": 0.0994873046875, "loss_aux_layer_16": 0.109619140625, "loss_aux_layer_17": 0.1177978515625, "loss_aux_layer_18": 0.126220703125, "loss_aux_layer_19": 0.1298828125, "loss_aux_layer_2": 0.05010986328125, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.16650390625, "loss_aux_layer_23": 0.203857421875, "loss_aux_layer_3": 0.06097412109375, "loss_aux_layer_4": 0.0643310546875, "loss_aux_layer_5": 0.06640625, "loss_aux_layer_6": 0.0699462890625, "loss_aux_layer_7": 0.067626953125, "loss_aux_layer_8": 0.0672607421875, "loss_aux_layer_9": 0.0657958984375, "step": 2746, "total_loss": 0.7083301246166229 }, { "epoch": 0.5438527024351614, "grad_norm": 1.294278860092163, "learning_rate": 5e-05, "llm_loss": 0.5495655983686447, "loss": 2.5621, "loss_aux_layer_0": 0.0191650390625, "loss_aux_layer_1": 0.0382080078125, "loss_aux_layer_10": 0.0665283203125, "loss_aux_layer_11": 0.0709228515625, "loss_aux_layer_12": 0.0755615234375, "loss_aux_layer_13": 0.08154296875, "loss_aux_layer_14": 0.09130859375, "loss_aux_layer_15": 0.1002197265625, "loss_aux_layer_16": 0.110107421875, "loss_aux_layer_17": 0.1182861328125, "loss_aux_layer_18": 0.1263427734375, "loss_aux_layer_19": 0.1290283203125, "loss_aux_layer_2": 0.0511474609375, "loss_aux_layer_20": 0.136474609375, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.16357421875, "loss_aux_layer_23": 0.200927734375, "loss_aux_layer_3": 0.0616455078125, "loss_aux_layer_4": 0.0643310546875, "loss_aux_layer_5": 0.06561279296875, "loss_aux_layer_6": 0.0692138671875, "loss_aux_layer_7": 0.06719970703125, "loss_aux_layer_8": 0.066650390625, "loss_aux_layer_9": 0.0653076171875, "step": 2747, "total_loss": 0.6405297666788101 }, { "epoch": 0.5440506830330628, "grad_norm": 0.9527243971824646, "learning_rate": 5e-05, "llm_loss": 0.6021129190921783, "loss": 2.7879, "loss_aux_layer_0": 0.020843505859375, "loss_aux_layer_1": 0.039794921875, "loss_aux_layer_10": 0.069091796875, "loss_aux_layer_11": 0.073486328125, "loss_aux_layer_12": 0.0787353515625, "loss_aux_layer_13": 0.0848388671875, "loss_aux_layer_14": 0.0943603515625, "loss_aux_layer_15": 0.1033935546875, "loss_aux_layer_16": 0.1131591796875, "loss_aux_layer_17": 0.120849609375, "loss_aux_layer_18": 0.129150390625, "loss_aux_layer_19": 0.13330078125, "loss_aux_layer_2": 0.05438232421875, "loss_aux_layer_20": 0.141357421875, "loss_aux_layer_21": 0.149658203125, "loss_aux_layer_22": 0.173583984375, "loss_aux_layer_23": 0.21240234375, "loss_aux_layer_3": 0.0648193359375, "loss_aux_layer_4": 0.0682373046875, "loss_aux_layer_5": 0.0699462890625, "loss_aux_layer_6": 0.072998046875, "loss_aux_layer_7": 0.0704345703125, "loss_aux_layer_8": 0.069580078125, "loss_aux_layer_9": 0.06787109375, "step": 2748, "total_loss": 0.6969731599092484 }, { "epoch": 0.5442486636309641, "grad_norm": 0.9884406924247742, "learning_rate": 5e-05, "llm_loss": 0.5380324423313141, "loss": 2.5196, "loss_aux_layer_0": 0.01947021484375, "loss_aux_layer_1": 0.03802490234375, "loss_aux_layer_10": 0.06671142578125, "loss_aux_layer_11": 0.0711669921875, "loss_aux_layer_12": 0.0760498046875, "loss_aux_layer_13": 0.0816650390625, "loss_aux_layer_14": 0.09033203125, "loss_aux_layer_15": 0.099365234375, "loss_aux_layer_16": 0.109375, "loss_aux_layer_17": 0.1175537109375, "loss_aux_layer_18": 0.126220703125, "loss_aux_layer_19": 0.1300048828125, "loss_aux_layer_2": 0.05206298828125, "loss_aux_layer_20": 0.137939453125, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.169921875, "loss_aux_layer_23": 0.21044921875, "loss_aux_layer_3": 0.06207275390625, "loss_aux_layer_4": 0.0648193359375, "loss_aux_layer_5": 0.06658935546875, "loss_aux_layer_6": 0.0693359375, "loss_aux_layer_7": 0.0670166015625, "loss_aux_layer_8": 0.066650390625, "loss_aux_layer_9": 0.0653076171875, "step": 2749, "total_loss": 0.6298930794000626 }, { "epoch": 0.5444466442288656, "grad_norm": 1.0940983295440674, "learning_rate": 5e-05, "llm_loss": 0.5926051810383797, "loss": 2.7397, "loss_aux_layer_0": 0.02069091796875, "loss_aux_layer_1": 0.039306640625, "loss_aux_layer_10": 0.0679931640625, "loss_aux_layer_11": 0.07275390625, "loss_aux_layer_12": 0.0780029296875, "loss_aux_layer_13": 0.0836181640625, "loss_aux_layer_14": 0.0926513671875, "loss_aux_layer_15": 0.101318359375, "loss_aux_layer_16": 0.110595703125, "loss_aux_layer_17": 0.1182861328125, "loss_aux_layer_18": 0.1259765625, "loss_aux_layer_19": 0.12890625, "loss_aux_layer_2": 0.05218505859375, "loss_aux_layer_20": 0.136474609375, "loss_aux_layer_21": 0.144287109375, "loss_aux_layer_22": 0.167724609375, "loss_aux_layer_23": 0.205810546875, "loss_aux_layer_3": 0.06298828125, "loss_aux_layer_4": 0.0655517578125, "loss_aux_layer_5": 0.0672607421875, "loss_aux_layer_6": 0.0706787109375, "loss_aux_layer_7": 0.068359375, "loss_aux_layer_8": 0.0677490234375, "loss_aux_layer_9": 0.0662841796875, "step": 2750, "total_loss": 0.6849272102117538 }, { "epoch": 0.544644624826767, "grad_norm": 1.1414145231246948, "learning_rate": 5e-05, "llm_loss": 0.5945896953344345, "loss": 2.731, "loss_aux_layer_0": 0.019012451171875, "loss_aux_layer_1": 0.03656005859375, "loss_aux_layer_10": 0.06341552734375, "loss_aux_layer_11": 0.06744384765625, "loss_aux_layer_12": 0.072509765625, "loss_aux_layer_13": 0.078369140625, "loss_aux_layer_14": 0.087158203125, "loss_aux_layer_15": 0.095947265625, "loss_aux_layer_16": 0.10595703125, "loss_aux_layer_17": 0.1142578125, "loss_aux_layer_18": 0.123046875, "loss_aux_layer_19": 0.1263427734375, "loss_aux_layer_2": 0.04962158203125, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.19970703125, "loss_aux_layer_3": 0.05938720703125, "loss_aux_layer_4": 0.06195068359375, "loss_aux_layer_5": 0.0634765625, "loss_aux_layer_6": 0.066162109375, "loss_aux_layer_7": 0.063720703125, "loss_aux_layer_8": 0.06298828125, "loss_aux_layer_9": 0.06170654296875, "step": 2751, "total_loss": 0.6827459335327148 }, { "epoch": 0.5448426054246683, "grad_norm": 0.9062030911445618, "learning_rate": 5e-05, "llm_loss": 0.6196193993091583, "loss": 2.8324, "loss_aux_layer_0": 0.018707275390625, "loss_aux_layer_1": 0.03753662109375, "loss_aux_layer_10": 0.06488037109375, "loss_aux_layer_11": 0.06915283203125, "loss_aux_layer_12": 0.07373046875, "loss_aux_layer_13": 0.0797119140625, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.0977783203125, "loss_aux_layer_16": 0.1070556640625, "loss_aux_layer_17": 0.114501953125, "loss_aux_layer_18": 0.1221923828125, "loss_aux_layer_19": 0.1246337890625, "loss_aux_layer_2": 0.0498046875, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.196044921875, "loss_aux_layer_3": 0.0599365234375, "loss_aux_layer_4": 0.062744140625, "loss_aux_layer_5": 0.064453125, "loss_aux_layer_6": 0.06732177734375, "loss_aux_layer_7": 0.06524658203125, "loss_aux_layer_8": 0.0645751953125, "loss_aux_layer_9": 0.063232421875, "step": 2752, "total_loss": 0.7080947756767273 }, { "epoch": 0.5450405860225698, "grad_norm": 1.0165350437164307, "learning_rate": 5e-05, "llm_loss": 0.5733677446842194, "loss": 2.6453, "loss_aux_layer_0": 0.020263671875, "loss_aux_layer_1": 0.03759765625, "loss_aux_layer_10": 0.063720703125, "loss_aux_layer_11": 0.0679931640625, "loss_aux_layer_12": 0.072509765625, "loss_aux_layer_13": 0.077880859375, "loss_aux_layer_14": 0.0865478515625, "loss_aux_layer_15": 0.0950927734375, "loss_aux_layer_16": 0.1048583984375, "loss_aux_layer_17": 0.1124267578125, "loss_aux_layer_18": 0.12109375, "loss_aux_layer_19": 0.1248779296875, "loss_aux_layer_2": 0.0499267578125, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.162109375, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.05987548828125, "loss_aux_layer_4": 0.06207275390625, "loss_aux_layer_5": 0.063720703125, "loss_aux_layer_6": 0.06658935546875, "loss_aux_layer_7": 0.06427001953125, "loss_aux_layer_8": 0.06378173828125, "loss_aux_layer_9": 0.0625, "step": 2753, "total_loss": 0.6613359153270721 }, { "epoch": 0.5452385666204712, "grad_norm": 0.8910807967185974, "learning_rate": 5e-05, "llm_loss": 0.6571834683418274, "loss": 2.9804, "loss_aux_layer_0": 0.0181884765625, "loss_aux_layer_1": 0.03692626953125, "loss_aux_layer_10": 0.06378173828125, "loss_aux_layer_11": 0.0679931640625, "loss_aux_layer_12": 0.072509765625, "loss_aux_layer_13": 0.078369140625, "loss_aux_layer_14": 0.0875244140625, "loss_aux_layer_15": 0.096435546875, "loss_aux_layer_16": 0.106201171875, "loss_aux_layer_17": 0.11376953125, "loss_aux_layer_18": 0.1214599609375, "loss_aux_layer_19": 0.1241455078125, "loss_aux_layer_2": 0.05029296875, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.197021484375, "loss_aux_layer_3": 0.05999755859375, "loss_aux_layer_4": 0.06231689453125, "loss_aux_layer_5": 0.0638427734375, "loss_aux_layer_6": 0.06671142578125, "loss_aux_layer_7": 0.06451416015625, "loss_aux_layer_8": 0.06390380859375, "loss_aux_layer_9": 0.06243896484375, "step": 2754, "total_loss": 0.7450947612524033 }, { "epoch": 0.5454365472183726, "grad_norm": 1.0162582397460938, "learning_rate": 5e-05, "llm_loss": 0.6297528147697449, "loss": 2.8904, "loss_aux_layer_0": 0.0198974609375, "loss_aux_layer_1": 0.0390625, "loss_aux_layer_10": 0.06890869140625, "loss_aux_layer_11": 0.0732421875, "loss_aux_layer_12": 0.078125, "loss_aux_layer_13": 0.084228515625, "loss_aux_layer_14": 0.093505859375, "loss_aux_layer_15": 0.1025390625, "loss_aux_layer_16": 0.1114501953125, "loss_aux_layer_17": 0.119140625, "loss_aux_layer_18": 0.127197265625, "loss_aux_layer_19": 0.1297607421875, "loss_aux_layer_2": 0.05377197265625, "loss_aux_layer_20": 0.136474609375, "loss_aux_layer_21": 0.14453125, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.06427001953125, "loss_aux_layer_4": 0.067138671875, "loss_aux_layer_5": 0.0689697265625, "loss_aux_layer_6": 0.072021484375, "loss_aux_layer_7": 0.06964111328125, "loss_aux_layer_8": 0.0687255859375, "loss_aux_layer_9": 0.0675048828125, "step": 2755, "total_loss": 0.7226024121046066 }, { "epoch": 0.545634527816274, "grad_norm": 1.610368013381958, "learning_rate": 5e-05, "llm_loss": 0.601951390504837, "loss": 2.775, "loss_aux_layer_0": 0.01934814453125, "loss_aux_layer_1": 0.0380859375, "loss_aux_layer_10": 0.0673828125, "loss_aux_layer_11": 0.0721435546875, "loss_aux_layer_12": 0.0770263671875, "loss_aux_layer_13": 0.0828857421875, "loss_aux_layer_14": 0.09228515625, "loss_aux_layer_15": 0.1009521484375, "loss_aux_layer_16": 0.111083984375, "loss_aux_layer_17": 0.119140625, "loss_aux_layer_18": 0.126953125, "loss_aux_layer_19": 0.1298828125, "loss_aux_layer_2": 0.0523681640625, "loss_aux_layer_20": 0.13720703125, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.20361328125, "loss_aux_layer_3": 0.0618896484375, "loss_aux_layer_4": 0.0648193359375, "loss_aux_layer_5": 0.06646728515625, "loss_aux_layer_6": 0.069580078125, "loss_aux_layer_7": 0.06744384765625, "loss_aux_layer_8": 0.066650390625, "loss_aux_layer_9": 0.0655517578125, "step": 2756, "total_loss": 0.6937488913536072 }, { "epoch": 0.5458325084141754, "grad_norm": 1.0854095220565796, "learning_rate": 5e-05, "llm_loss": 0.5768044888973236, "loss": 2.6735, "loss_aux_layer_0": 0.01824951171875, "loss_aux_layer_1": 0.03973388671875, "loss_aux_layer_10": 0.0687255859375, "loss_aux_layer_11": 0.0732421875, "loss_aux_layer_12": 0.077880859375, "loss_aux_layer_13": 0.0833740234375, "loss_aux_layer_14": 0.091796875, "loss_aux_layer_15": 0.0999755859375, "loss_aux_layer_16": 0.10888671875, "loss_aux_layer_17": 0.115966796875, "loss_aux_layer_18": 0.12353515625, "loss_aux_layer_19": 0.12646484375, "loss_aux_layer_2": 0.05364990234375, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.142333984375, "loss_aux_layer_22": 0.16357421875, "loss_aux_layer_23": 0.200927734375, "loss_aux_layer_3": 0.06402587890625, "loss_aux_layer_4": 0.0667724609375, "loss_aux_layer_5": 0.068359375, "loss_aux_layer_6": 0.0716552734375, "loss_aux_layer_7": 0.0692138671875, "loss_aux_layer_8": 0.068603515625, "loss_aux_layer_9": 0.0672607421875, "step": 2757, "total_loss": 0.66838438808918 }, { "epoch": 0.5460304890120768, "grad_norm": 1.1651173830032349, "learning_rate": 5e-05, "llm_loss": 0.5147832036018372, "loss": 2.4377, "loss_aux_layer_0": 0.019317626953125, "loss_aux_layer_1": 0.038818359375, "loss_aux_layer_10": 0.06842041015625, "loss_aux_layer_11": 0.0732421875, "loss_aux_layer_12": 0.0787353515625, "loss_aux_layer_13": 0.0853271484375, "loss_aux_layer_14": 0.0950927734375, "loss_aux_layer_15": 0.1048583984375, "loss_aux_layer_16": 0.115478515625, "loss_aux_layer_17": 0.123291015625, "loss_aux_layer_18": 0.1318359375, "loss_aux_layer_19": 0.135009765625, "loss_aux_layer_2": 0.052734375, "loss_aux_layer_20": 0.142578125, "loss_aux_layer_21": 0.150634765625, "loss_aux_layer_22": 0.173095703125, "loss_aux_layer_23": 0.213134765625, "loss_aux_layer_3": 0.06268310546875, "loss_aux_layer_4": 0.06536865234375, "loss_aux_layer_5": 0.0673828125, "loss_aux_layer_6": 0.0706787109375, "loss_aux_layer_7": 0.0684814453125, "loss_aux_layer_8": 0.06793212890625, "loss_aux_layer_9": 0.06689453125, "step": 2758, "total_loss": 0.6094231605529785 }, { "epoch": 0.5462284696099782, "grad_norm": 0.9375972747802734, "learning_rate": 5e-05, "llm_loss": 0.5742010176181793, "loss": 2.6604, "loss_aux_layer_0": 0.018890380859375, "loss_aux_layer_1": 0.03857421875, "loss_aux_layer_10": 0.0662841796875, "loss_aux_layer_11": 0.0704345703125, "loss_aux_layer_12": 0.0751953125, "loss_aux_layer_13": 0.081298828125, "loss_aux_layer_14": 0.090576171875, "loss_aux_layer_15": 0.0997314453125, "loss_aux_layer_16": 0.109619140625, "loss_aux_layer_17": 0.11767578125, "loss_aux_layer_18": 0.1258544921875, "loss_aux_layer_19": 0.128662109375, "loss_aux_layer_2": 0.0511474609375, "loss_aux_layer_20": 0.13623046875, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.164306640625, "loss_aux_layer_23": 0.20166015625, "loss_aux_layer_3": 0.0615234375, "loss_aux_layer_4": 0.064453125, "loss_aux_layer_5": 0.06640625, "loss_aux_layer_6": 0.0697021484375, "loss_aux_layer_7": 0.0673828125, "loss_aux_layer_8": 0.06640625, "loss_aux_layer_9": 0.0650634765625, "step": 2759, "total_loss": 0.6650969982147217 }, { "epoch": 0.5464264502078796, "grad_norm": 1.4678711891174316, "learning_rate": 5e-05, "llm_loss": 0.53846874833107, "loss": 2.5251, "loss_aux_layer_0": 0.01806640625, "loss_aux_layer_1": 0.04046630859375, "loss_aux_layer_10": 0.06884765625, "loss_aux_layer_11": 0.0733642578125, "loss_aux_layer_12": 0.078369140625, "loss_aux_layer_13": 0.08447265625, "loss_aux_layer_14": 0.0931396484375, "loss_aux_layer_15": 0.1016845703125, "loss_aux_layer_16": 0.11083984375, "loss_aux_layer_17": 0.118408203125, "loss_aux_layer_18": 0.1263427734375, "loss_aux_layer_19": 0.1287841796875, "loss_aux_layer_2": 0.05426025390625, "loss_aux_layer_20": 0.13623046875, "loss_aux_layer_21": 0.143310546875, "loss_aux_layer_22": 0.165771484375, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.06488037109375, "loss_aux_layer_4": 0.06787109375, "loss_aux_layer_5": 0.0693359375, "loss_aux_layer_6": 0.072509765625, "loss_aux_layer_7": 0.0699462890625, "loss_aux_layer_8": 0.0693359375, "loss_aux_layer_9": 0.067626953125, "step": 2760, "total_loss": 0.6312787234783173 }, { "epoch": 0.546624430805781, "grad_norm": 0.9765082001686096, "learning_rate": 5e-05, "llm_loss": 0.6422915607690811, "loss": 2.9308, "loss_aux_layer_0": 0.0184326171875, "loss_aux_layer_1": 0.0386962890625, "loss_aux_layer_10": 0.0662841796875, "loss_aux_layer_11": 0.07080078125, "loss_aux_layer_12": 0.07568359375, "loss_aux_layer_13": 0.0814208984375, "loss_aux_layer_14": 0.09033203125, "loss_aux_layer_15": 0.098876953125, "loss_aux_layer_16": 0.108642578125, "loss_aux_layer_17": 0.11669921875, "loss_aux_layer_18": 0.12451171875, "loss_aux_layer_19": 0.1270751953125, "loss_aux_layer_2": 0.05126953125, "loss_aux_layer_20": 0.134765625, "loss_aux_layer_21": 0.142578125, "loss_aux_layer_22": 0.163330078125, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.06182861328125, "loss_aux_layer_4": 0.0645751953125, "loss_aux_layer_5": 0.0662841796875, "loss_aux_layer_6": 0.0689697265625, "loss_aux_layer_7": 0.06695556640625, "loss_aux_layer_8": 0.06622314453125, "loss_aux_layer_9": 0.0650634765625, "step": 2761, "total_loss": 0.7327107191085815 }, { "epoch": 0.5468224114036825, "grad_norm": 1.6384398937225342, "learning_rate": 5e-05, "llm_loss": 0.5167718455195427, "loss": 2.439, "loss_aux_layer_0": 0.018829345703125, "loss_aux_layer_1": 0.0391845703125, "loss_aux_layer_10": 0.067626953125, "loss_aux_layer_11": 0.0721435546875, "loss_aux_layer_12": 0.0771484375, "loss_aux_layer_13": 0.083251953125, "loss_aux_layer_14": 0.09228515625, "loss_aux_layer_15": 0.101318359375, "loss_aux_layer_16": 0.111572265625, "loss_aux_layer_17": 0.1195068359375, "loss_aux_layer_18": 0.1280517578125, "loss_aux_layer_19": 0.132080078125, "loss_aux_layer_2": 0.0526123046875, "loss_aux_layer_20": 0.140380859375, "loss_aux_layer_21": 0.14892578125, "loss_aux_layer_22": 0.170166015625, "loss_aux_layer_23": 0.208251953125, "loss_aux_layer_3": 0.06280517578125, "loss_aux_layer_4": 0.065673828125, "loss_aux_layer_5": 0.0673828125, "loss_aux_layer_6": 0.07037353515625, "loss_aux_layer_7": 0.068115234375, "loss_aux_layer_8": 0.06732177734375, "loss_aux_layer_9": 0.0662841796875, "step": 2762, "total_loss": 0.6097456961870193 }, { "epoch": 0.5470203920015838, "grad_norm": 0.8498905897140503, "learning_rate": 5e-05, "llm_loss": 0.6080735325813293, "loss": 2.7872, "loss_aux_layer_0": 0.017425537109375, "loss_aux_layer_1": 0.037109375, "loss_aux_layer_10": 0.06585693359375, "loss_aux_layer_11": 0.070068359375, "loss_aux_layer_12": 0.074951171875, "loss_aux_layer_13": 0.0806884765625, "loss_aux_layer_14": 0.0889892578125, "loss_aux_layer_15": 0.09716796875, "loss_aux_layer_16": 0.1063232421875, "loss_aux_layer_17": 0.1138916015625, "loss_aux_layer_18": 0.122802734375, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.0499267578125, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.194580078125, "loss_aux_layer_3": 0.060546875, "loss_aux_layer_4": 0.0633544921875, "loss_aux_layer_5": 0.065185546875, "loss_aux_layer_6": 0.06842041015625, "loss_aux_layer_7": 0.06634521484375, "loss_aux_layer_8": 0.06591796875, "loss_aux_layer_9": 0.064697265625, "step": 2763, "total_loss": 0.6967949420213699 }, { "epoch": 0.5472183725994852, "grad_norm": 1.6275266408920288, "learning_rate": 5e-05, "llm_loss": 0.6398159712553024, "loss": 2.9097, "loss_aux_layer_0": 0.0191650390625, "loss_aux_layer_1": 0.03656005859375, "loss_aux_layer_10": 0.06329345703125, "loss_aux_layer_11": 0.06768798828125, "loss_aux_layer_12": 0.072509765625, "loss_aux_layer_13": 0.0784912109375, "loss_aux_layer_14": 0.087158203125, "loss_aux_layer_15": 0.0958251953125, "loss_aux_layer_16": 0.105224609375, "loss_aux_layer_17": 0.11328125, "loss_aux_layer_18": 0.1214599609375, "loss_aux_layer_19": 0.12451171875, "loss_aux_layer_2": 0.04925537109375, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.16064453125, "loss_aux_layer_23": 0.197021484375, "loss_aux_layer_3": 0.05914306640625, "loss_aux_layer_4": 0.0615234375, "loss_aux_layer_5": 0.06298828125, "loss_aux_layer_6": 0.06585693359375, "loss_aux_layer_7": 0.0635986328125, "loss_aux_layer_8": 0.06304931640625, "loss_aux_layer_9": 0.061767578125, "step": 2764, "total_loss": 0.7274244725704193 }, { "epoch": 0.5474163531973867, "grad_norm": 1.194703221321106, "learning_rate": 5e-05, "llm_loss": 0.6227519810199738, "loss": 2.8528, "loss_aux_layer_0": 0.018096923828125, "loss_aux_layer_1": 0.0382080078125, "loss_aux_layer_10": 0.066650390625, "loss_aux_layer_11": 0.0711669921875, "loss_aux_layer_12": 0.075439453125, "loss_aux_layer_13": 0.080810546875, "loss_aux_layer_14": 0.089599609375, "loss_aux_layer_15": 0.09814453125, "loss_aux_layer_16": 0.1072998046875, "loss_aux_layer_17": 0.1151123046875, "loss_aux_layer_18": 0.123779296875, "loss_aux_layer_19": 0.1268310546875, "loss_aux_layer_2": 0.0518798828125, "loss_aux_layer_20": 0.13427734375, "loss_aux_layer_21": 0.142822265625, "loss_aux_layer_22": 0.16455078125, "loss_aux_layer_23": 0.20263671875, "loss_aux_layer_3": 0.062255859375, "loss_aux_layer_4": 0.06524658203125, "loss_aux_layer_5": 0.0670166015625, "loss_aux_layer_6": 0.06982421875, "loss_aux_layer_7": 0.06756591796875, "loss_aux_layer_8": 0.0665283203125, "loss_aux_layer_9": 0.06536865234375, "step": 2765, "total_loss": 0.7132065892219543 }, { "epoch": 0.547614333795288, "grad_norm": 1.1364052295684814, "learning_rate": 5e-05, "llm_loss": 0.6302456259727478, "loss": 2.8794, "loss_aux_layer_0": 0.01904296875, "loss_aux_layer_1": 0.03680419921875, "loss_aux_layer_10": 0.063232421875, "loss_aux_layer_11": 0.0677490234375, "loss_aux_layer_12": 0.07275390625, "loss_aux_layer_13": 0.0787353515625, "loss_aux_layer_14": 0.088623046875, "loss_aux_layer_15": 0.0986328125, "loss_aux_layer_16": 0.109130859375, "loss_aux_layer_17": 0.1170654296875, "loss_aux_layer_18": 0.1256103515625, "loss_aux_layer_19": 0.1290283203125, "loss_aux_layer_2": 0.0501708984375, "loss_aux_layer_20": 0.136474609375, "loss_aux_layer_21": 0.145263671875, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.20556640625, "loss_aux_layer_3": 0.059326171875, "loss_aux_layer_4": 0.0616455078125, "loss_aux_layer_5": 0.0634765625, "loss_aux_layer_6": 0.06640625, "loss_aux_layer_7": 0.064208984375, "loss_aux_layer_8": 0.0633544921875, "loss_aux_layer_9": 0.06201171875, "step": 2766, "total_loss": 0.7198539674282074 }, { "epoch": 0.5478123143931894, "grad_norm": 1.6609498262405396, "learning_rate": 5e-05, "llm_loss": 0.5495295226573944, "loss": 2.5692, "loss_aux_layer_0": 0.018707275390625, "loss_aux_layer_1": 0.03729248046875, "loss_aux_layer_10": 0.06646728515625, "loss_aux_layer_11": 0.071044921875, "loss_aux_layer_12": 0.0762939453125, "loss_aux_layer_13": 0.0826416015625, "loss_aux_layer_14": 0.09228515625, "loss_aux_layer_15": 0.1019287109375, "loss_aux_layer_16": 0.112548828125, "loss_aux_layer_17": 0.1209716796875, "loss_aux_layer_18": 0.130126953125, "loss_aux_layer_19": 0.1337890625, "loss_aux_layer_2": 0.05096435546875, "loss_aux_layer_20": 0.141357421875, "loss_aux_layer_21": 0.1494140625, "loss_aux_layer_22": 0.171630859375, "loss_aux_layer_23": 0.21044921875, "loss_aux_layer_3": 0.06146240234375, "loss_aux_layer_4": 0.064208984375, "loss_aux_layer_5": 0.06597900390625, "loss_aux_layer_6": 0.068603515625, "loss_aux_layer_7": 0.06658935546875, "loss_aux_layer_8": 0.06634521484375, "loss_aux_layer_9": 0.06524658203125, "step": 2767, "total_loss": 0.6422955095767975 }, { "epoch": 0.5480102949910909, "grad_norm": 1.1543288230895996, "learning_rate": 5e-05, "llm_loss": 0.6251793950796127, "loss": 2.8514, "loss_aux_layer_0": 0.019256591796875, "loss_aux_layer_1": 0.03656005859375, "loss_aux_layer_10": 0.06268310546875, "loss_aux_layer_11": 0.06683349609375, "loss_aux_layer_12": 0.0712890625, "loss_aux_layer_13": 0.076904296875, "loss_aux_layer_14": 0.086181640625, "loss_aux_layer_15": 0.095458984375, "loss_aux_layer_16": 0.105712890625, "loss_aux_layer_17": 0.1136474609375, "loss_aux_layer_18": 0.1221923828125, "loss_aux_layer_19": 0.1258544921875, "loss_aux_layer_2": 0.04949951171875, "loss_aux_layer_20": 0.1337890625, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.05926513671875, "loss_aux_layer_4": 0.06146240234375, "loss_aux_layer_5": 0.06268310546875, "loss_aux_layer_6": 0.0655517578125, "loss_aux_layer_7": 0.06292724609375, "loss_aux_layer_8": 0.062255859375, "loss_aux_layer_9": 0.06121826171875, "step": 2768, "total_loss": 0.7128478288650513 }, { "epoch": 0.5482082755889923, "grad_norm": 1.1006532907485962, "learning_rate": 5e-05, "llm_loss": 0.5979933589696884, "loss": 2.7563, "loss_aux_layer_0": 0.018707275390625, "loss_aux_layer_1": 0.03839111328125, "loss_aux_layer_10": 0.06585693359375, "loss_aux_layer_11": 0.070068359375, "loss_aux_layer_12": 0.074951171875, "loss_aux_layer_13": 0.0804443359375, "loss_aux_layer_14": 0.0899658203125, "loss_aux_layer_15": 0.098876953125, "loss_aux_layer_16": 0.1087646484375, "loss_aux_layer_17": 0.1165771484375, "loss_aux_layer_18": 0.12548828125, "loss_aux_layer_19": 0.1290283203125, "loss_aux_layer_2": 0.052490234375, "loss_aux_layer_20": 0.136962890625, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.16650390625, "loss_aux_layer_23": 0.2060546875, "loss_aux_layer_3": 0.06243896484375, "loss_aux_layer_4": 0.064697265625, "loss_aux_layer_5": 0.0662841796875, "loss_aux_layer_6": 0.0694580078125, "loss_aux_layer_7": 0.06707763671875, "loss_aux_layer_8": 0.06610107421875, "loss_aux_layer_9": 0.064697265625, "step": 2769, "total_loss": 0.6890757977962494 }, { "epoch": 0.5484062561868936, "grad_norm": 1.1411244869232178, "learning_rate": 5e-05, "llm_loss": 0.6093659698963165, "loss": 2.7884, "loss_aux_layer_0": 0.0184326171875, "loss_aux_layer_1": 0.03643798828125, "loss_aux_layer_10": 0.06390380859375, "loss_aux_layer_11": 0.067626953125, "loss_aux_layer_12": 0.0723876953125, "loss_aux_layer_13": 0.0782470703125, "loss_aux_layer_14": 0.0870361328125, "loss_aux_layer_15": 0.095703125, "loss_aux_layer_16": 0.1048583984375, "loss_aux_layer_17": 0.113037109375, "loss_aux_layer_18": 0.1217041015625, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.0494384765625, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.14013671875, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.05938720703125, "loss_aux_layer_4": 0.06195068359375, "loss_aux_layer_5": 0.0634765625, "loss_aux_layer_6": 0.06640625, "loss_aux_layer_7": 0.06427001953125, "loss_aux_layer_8": 0.06390380859375, "loss_aux_layer_9": 0.06268310546875, "step": 2770, "total_loss": 0.6970933228731155 }, { "epoch": 0.5486042367847951, "grad_norm": 0.9615731835365295, "learning_rate": 5e-05, "llm_loss": 0.5966806858778, "loss": 2.7523, "loss_aux_layer_0": 0.02008056640625, "loss_aux_layer_1": 0.0379638671875, "loss_aux_layer_10": 0.0667724609375, "loss_aux_layer_11": 0.0711669921875, "loss_aux_layer_12": 0.0758056640625, "loss_aux_layer_13": 0.081787109375, "loss_aux_layer_14": 0.0911865234375, "loss_aux_layer_15": 0.1005859375, "loss_aux_layer_16": 0.1103515625, "loss_aux_layer_17": 0.118408203125, "loss_aux_layer_18": 0.12646484375, "loss_aux_layer_19": 0.12939453125, "loss_aux_layer_2": 0.05126953125, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.14453125, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.06207275390625, "loss_aux_layer_4": 0.06463623046875, "loss_aux_layer_5": 0.066162109375, "loss_aux_layer_6": 0.0694580078125, "loss_aux_layer_7": 0.067138671875, "loss_aux_layer_8": 0.06640625, "loss_aux_layer_9": 0.0654296875, "step": 2771, "total_loss": 0.6880808472633362 }, { "epoch": 0.5488022173826965, "grad_norm": 0.9652315378189087, "learning_rate": 5e-05, "llm_loss": 0.5193121060729027, "loss": 2.4347, "loss_aux_layer_0": 0.0181884765625, "loss_aux_layer_1": 0.03753662109375, "loss_aux_layer_10": 0.06500244140625, "loss_aux_layer_11": 0.0692138671875, "loss_aux_layer_12": 0.0738525390625, "loss_aux_layer_13": 0.0794677734375, "loss_aux_layer_14": 0.08837890625, "loss_aux_layer_15": 0.09716796875, "loss_aux_layer_16": 0.1064453125, "loss_aux_layer_17": 0.1143798828125, "loss_aux_layer_18": 0.1229248046875, "loss_aux_layer_19": 0.1259765625, "loss_aux_layer_2": 0.05126953125, "loss_aux_layer_20": 0.1337890625, "loss_aux_layer_21": 0.142333984375, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.0614013671875, "loss_aux_layer_4": 0.0640869140625, "loss_aux_layer_5": 0.06573486328125, "loss_aux_layer_6": 0.0687255859375, "loss_aux_layer_7": 0.06640625, "loss_aux_layer_8": 0.065673828125, "loss_aux_layer_9": 0.0640869140625, "step": 2772, "total_loss": 0.6086812168359756 }, { "epoch": 0.5490001979805978, "grad_norm": 1.06857430934906, "learning_rate": 5e-05, "llm_loss": 0.6057868152856827, "loss": 2.7896, "loss_aux_layer_0": 0.0194091796875, "loss_aux_layer_1": 0.03857421875, "loss_aux_layer_10": 0.0677490234375, "loss_aux_layer_11": 0.072021484375, "loss_aux_layer_12": 0.076904296875, "loss_aux_layer_13": 0.0828857421875, "loss_aux_layer_14": 0.092041015625, "loss_aux_layer_15": 0.1007080078125, "loss_aux_layer_16": 0.10986328125, "loss_aux_layer_17": 0.1177978515625, "loss_aux_layer_18": 0.1259765625, "loss_aux_layer_19": 0.12841796875, "loss_aux_layer_2": 0.052734375, "loss_aux_layer_20": 0.13623046875, "loss_aux_layer_21": 0.143310546875, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.0633544921875, "loss_aux_layer_4": 0.066162109375, "loss_aux_layer_5": 0.06768798828125, "loss_aux_layer_6": 0.07080078125, "loss_aux_layer_7": 0.068359375, "loss_aux_layer_8": 0.06768798828125, "loss_aux_layer_9": 0.066162109375, "step": 2773, "total_loss": 0.69740791618824 }, { "epoch": 0.5491981785784993, "grad_norm": 1.1720300912857056, "learning_rate": 5e-05, "llm_loss": 0.5794231444597244, "loss": 2.6786, "loss_aux_layer_0": 0.022491455078125, "loss_aux_layer_1": 0.0384521484375, "loss_aux_layer_10": 0.066162109375, "loss_aux_layer_11": 0.070556640625, "loss_aux_layer_12": 0.07568359375, "loss_aux_layer_13": 0.0814208984375, "loss_aux_layer_14": 0.090087890625, "loss_aux_layer_15": 0.0986328125, "loss_aux_layer_16": 0.10791015625, "loss_aux_layer_17": 0.1162109375, "loss_aux_layer_18": 0.124267578125, "loss_aux_layer_19": 0.12744140625, "loss_aux_layer_2": 0.0516357421875, "loss_aux_layer_20": 0.134521484375, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.197265625, "loss_aux_layer_3": 0.0616455078125, "loss_aux_layer_4": 0.064697265625, "loss_aux_layer_5": 0.0662841796875, "loss_aux_layer_6": 0.0692138671875, "loss_aux_layer_7": 0.0670166015625, "loss_aux_layer_8": 0.0662841796875, "loss_aux_layer_9": 0.06494140625, "step": 2774, "total_loss": 0.6696525067090988 }, { "epoch": 0.5493961591764007, "grad_norm": 0.7607387900352478, "learning_rate": 5e-05, "llm_loss": 0.5549655258655548, "loss": 2.57, "loss_aux_layer_0": 0.018218994140625, "loss_aux_layer_1": 0.03509521484375, "loss_aux_layer_10": 0.0618896484375, "loss_aux_layer_11": 0.06591796875, "loss_aux_layer_12": 0.0711669921875, "loss_aux_layer_13": 0.077392578125, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.096435546875, "loss_aux_layer_16": 0.1068115234375, "loss_aux_layer_17": 0.1153564453125, "loss_aux_layer_18": 0.1241455078125, "loss_aux_layer_19": 0.12744140625, "loss_aux_layer_2": 0.04705810546875, "loss_aux_layer_20": 0.135498046875, "loss_aux_layer_21": 0.142822265625, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.056884765625, "loss_aux_layer_4": 0.0595703125, "loss_aux_layer_5": 0.06103515625, "loss_aux_layer_6": 0.0640869140625, "loss_aux_layer_7": 0.0618896484375, "loss_aux_layer_8": 0.0614013671875, "loss_aux_layer_9": 0.06036376953125, "step": 2775, "total_loss": 0.642488494515419 }, { "epoch": 0.5495941397743022, "grad_norm": 1.1330885887145996, "learning_rate": 5e-05, "llm_loss": 0.5869150608778, "loss": 2.699, "loss_aux_layer_0": 0.019775390625, "loss_aux_layer_1": 0.037841796875, "loss_aux_layer_10": 0.0648193359375, "loss_aux_layer_11": 0.069091796875, "loss_aux_layer_12": 0.073486328125, "loss_aux_layer_13": 0.0789794921875, "loss_aux_layer_14": 0.0877685546875, "loss_aux_layer_15": 0.096435546875, "loss_aux_layer_16": 0.10546875, "loss_aux_layer_17": 0.1131591796875, "loss_aux_layer_18": 0.12109375, "loss_aux_layer_19": 0.1236572265625, "loss_aux_layer_2": 0.0504150390625, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.193603515625, "loss_aux_layer_3": 0.06011962890625, "loss_aux_layer_4": 0.0626220703125, "loss_aux_layer_5": 0.06414794921875, "loss_aux_layer_6": 0.067138671875, "loss_aux_layer_7": 0.0650634765625, "loss_aux_layer_8": 0.06463623046875, "loss_aux_layer_9": 0.06341552734375, "step": 2776, "total_loss": 0.6747465431690216 }, { "epoch": 0.5497921203722035, "grad_norm": 0.8615005016326904, "learning_rate": 5e-05, "llm_loss": 0.6392744481563568, "loss": 2.9033, "loss_aux_layer_0": 0.019287109375, "loss_aux_layer_1": 0.03485107421875, "loss_aux_layer_10": 0.06182861328125, "loss_aux_layer_11": 0.06561279296875, "loss_aux_layer_12": 0.0703125, "loss_aux_layer_13": 0.0760498046875, "loss_aux_layer_14": 0.085205078125, "loss_aux_layer_15": 0.0941162109375, "loss_aux_layer_16": 0.104248046875, "loss_aux_layer_17": 0.11279296875, "loss_aux_layer_18": 0.1219482421875, "loss_aux_layer_19": 0.1259765625, "loss_aux_layer_2": 0.046630859375, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.141357421875, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.198974609375, "loss_aux_layer_3": 0.0560302734375, "loss_aux_layer_4": 0.05859375, "loss_aux_layer_5": 0.06060791015625, "loss_aux_layer_6": 0.0638427734375, "loss_aux_layer_7": 0.0615234375, "loss_aux_layer_8": 0.0611572265625, "loss_aux_layer_9": 0.060302734375, "step": 2777, "total_loss": 0.7258137166500092 }, { "epoch": 0.5499901009701049, "grad_norm": 1.0900812149047852, "learning_rate": 5e-05, "llm_loss": 0.6271399855613708, "loss": 2.861, "loss_aux_layer_0": 0.01776123046875, "loss_aux_layer_1": 0.03631591796875, "loss_aux_layer_10": 0.063720703125, "loss_aux_layer_11": 0.06781005859375, "loss_aux_layer_12": 0.072509765625, "loss_aux_layer_13": 0.078125, "loss_aux_layer_14": 0.086669921875, "loss_aux_layer_15": 0.095458984375, "loss_aux_layer_16": 0.1051025390625, "loss_aux_layer_17": 0.1131591796875, "loss_aux_layer_18": 0.1219482421875, "loss_aux_layer_19": 0.1253662109375, "loss_aux_layer_2": 0.04913330078125, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.20166015625, "loss_aux_layer_3": 0.05950927734375, "loss_aux_layer_4": 0.0621337890625, "loss_aux_layer_5": 0.06378173828125, "loss_aux_layer_6": 0.06658935546875, "loss_aux_layer_7": 0.06427001953125, "loss_aux_layer_8": 0.0634765625, "loss_aux_layer_9": 0.06219482421875, "step": 2778, "total_loss": 0.7152442932128906 }, { "epoch": 0.5501880815680064, "grad_norm": 1.238682508468628, "learning_rate": 5e-05, "llm_loss": 0.5961359441280365, "loss": 2.7395, "loss_aux_layer_0": 0.018768310546875, "loss_aux_layer_1": 0.03839111328125, "loss_aux_layer_10": 0.0645751953125, "loss_aux_layer_11": 0.0687255859375, "loss_aux_layer_12": 0.0732421875, "loss_aux_layer_13": 0.0791015625, "loss_aux_layer_14": 0.087890625, "loss_aux_layer_15": 0.0966796875, "loss_aux_layer_16": 0.1064453125, "loss_aux_layer_17": 0.1141357421875, "loss_aux_layer_18": 0.1226806640625, "loss_aux_layer_19": 0.1251220703125, "loss_aux_layer_2": 0.051025390625, "loss_aux_layer_20": 0.132568359375, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.0615234375, "loss_aux_layer_4": 0.06396484375, "loss_aux_layer_5": 0.0654296875, "loss_aux_layer_6": 0.0684814453125, "loss_aux_layer_7": 0.06591796875, "loss_aux_layer_8": 0.06463623046875, "loss_aux_layer_9": 0.06341552734375, "step": 2779, "total_loss": 0.6848819702863693 }, { "epoch": 0.5503860621659077, "grad_norm": 1.3607453107833862, "learning_rate": 5e-05, "llm_loss": 0.5703548192977905, "loss": 2.6534, "loss_aux_layer_0": 0.019317626953125, "loss_aux_layer_1": 0.0389404296875, "loss_aux_layer_10": 0.06768798828125, "loss_aux_layer_11": 0.072021484375, "loss_aux_layer_12": 0.076904296875, "loss_aux_layer_13": 0.082763671875, "loss_aux_layer_14": 0.092529296875, "loss_aux_layer_15": 0.1021728515625, "loss_aux_layer_16": 0.112548828125, "loss_aux_layer_17": 0.120361328125, "loss_aux_layer_18": 0.129150390625, "loss_aux_layer_19": 0.132080078125, "loss_aux_layer_2": 0.05279541015625, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.1474609375, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.20556640625, "loss_aux_layer_3": 0.06365966796875, "loss_aux_layer_4": 0.06634521484375, "loss_aux_layer_5": 0.06787109375, "loss_aux_layer_6": 0.0712890625, "loss_aux_layer_7": 0.0689697265625, "loss_aux_layer_8": 0.0677490234375, "loss_aux_layer_9": 0.0665283203125, "step": 2780, "total_loss": 0.6633612364530563 }, { "epoch": 0.5505840427638091, "grad_norm": 0.8427018523216248, "learning_rate": 5e-05, "llm_loss": 0.606967106461525, "loss": 2.7741, "loss_aux_layer_0": 0.01861572265625, "loss_aux_layer_1": 0.0352783203125, "loss_aux_layer_10": 0.06195068359375, "loss_aux_layer_11": 0.0657958984375, "loss_aux_layer_12": 0.0704345703125, "loss_aux_layer_13": 0.076171875, "loss_aux_layer_14": 0.0855712890625, "loss_aux_layer_15": 0.094482421875, "loss_aux_layer_16": 0.1041259765625, "loss_aux_layer_17": 0.1124267578125, "loss_aux_layer_18": 0.1204833984375, "loss_aux_layer_19": 0.1240234375, "loss_aux_layer_2": 0.0478515625, "loss_aux_layer_20": 0.132568359375, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.1982421875, "loss_aux_layer_3": 0.05731201171875, "loss_aux_layer_4": 0.05975341796875, "loss_aux_layer_5": 0.06170654296875, "loss_aux_layer_6": 0.06439208984375, "loss_aux_layer_7": 0.0623779296875, "loss_aux_layer_8": 0.061767578125, "loss_aux_layer_9": 0.06060791015625, "step": 2781, "total_loss": 0.6935349553823471 }, { "epoch": 0.5507820233617106, "grad_norm": 1.3552509546279907, "learning_rate": 5e-05, "llm_loss": 0.5594897866249084, "loss": 2.5914, "loss_aux_layer_0": 0.018646240234375, "loss_aux_layer_1": 0.03692626953125, "loss_aux_layer_10": 0.0634765625, "loss_aux_layer_11": 0.0677490234375, "loss_aux_layer_12": 0.07275390625, "loss_aux_layer_13": 0.0784912109375, "loss_aux_layer_14": 0.0875244140625, "loss_aux_layer_15": 0.0968017578125, "loss_aux_layer_16": 0.1063232421875, "loss_aux_layer_17": 0.1141357421875, "loss_aux_layer_18": 0.1217041015625, "loss_aux_layer_19": 0.12451171875, "loss_aux_layer_2": 0.05059814453125, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.05999755859375, "loss_aux_layer_4": 0.062255859375, "loss_aux_layer_5": 0.0640869140625, "loss_aux_layer_6": 0.066650390625, "loss_aux_layer_7": 0.0645751953125, "loss_aux_layer_8": 0.0638427734375, "loss_aux_layer_9": 0.06219482421875, "step": 2782, "total_loss": 0.6478540748357773 }, { "epoch": 0.550980003959612, "grad_norm": 1.1614158153533936, "learning_rate": 5e-05, "llm_loss": 0.5391795784235001, "loss": 2.5188, "loss_aux_layer_0": 0.01904296875, "loss_aux_layer_1": 0.0384521484375, "loss_aux_layer_10": 0.066650390625, "loss_aux_layer_11": 0.071044921875, "loss_aux_layer_12": 0.075927734375, "loss_aux_layer_13": 0.081787109375, "loss_aux_layer_14": 0.090576171875, "loss_aux_layer_15": 0.0994873046875, "loss_aux_layer_16": 0.1087646484375, "loss_aux_layer_17": 0.1168212890625, "loss_aux_layer_18": 0.12548828125, "loss_aux_layer_19": 0.1278076171875, "loss_aux_layer_2": 0.05145263671875, "loss_aux_layer_20": 0.135498046875, "loss_aux_layer_21": 0.142578125, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.19970703125, "loss_aux_layer_3": 0.061767578125, "loss_aux_layer_4": 0.064453125, "loss_aux_layer_5": 0.06597900390625, "loss_aux_layer_6": 0.06884765625, "loss_aux_layer_7": 0.06683349609375, "loss_aux_layer_8": 0.066162109375, "loss_aux_layer_9": 0.065185546875, "step": 2783, "total_loss": 0.6296878755092621 }, { "epoch": 0.5511779845575133, "grad_norm": 1.1010814905166626, "learning_rate": 5e-05, "llm_loss": 0.6046730726957321, "loss": 2.7738, "loss_aux_layer_0": 0.017852783203125, "loss_aux_layer_1": 0.03759765625, "loss_aux_layer_10": 0.064697265625, "loss_aux_layer_11": 0.0693359375, "loss_aux_layer_12": 0.0743408203125, "loss_aux_layer_13": 0.0802001953125, "loss_aux_layer_14": 0.0887451171875, "loss_aux_layer_15": 0.09716796875, "loss_aux_layer_16": 0.1063232421875, "loss_aux_layer_17": 0.1142578125, "loss_aux_layer_18": 0.121826171875, "loss_aux_layer_19": 0.124267578125, "loss_aux_layer_2": 0.0509033203125, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.19970703125, "loss_aux_layer_3": 0.0604248046875, "loss_aux_layer_4": 0.06292724609375, "loss_aux_layer_5": 0.06451416015625, "loss_aux_layer_6": 0.06707763671875, "loss_aux_layer_7": 0.0650634765625, "loss_aux_layer_8": 0.06463623046875, "loss_aux_layer_9": 0.06329345703125, "step": 2784, "total_loss": 0.6934400945901871 }, { "epoch": 0.5513759651554148, "grad_norm": 1.4048205614089966, "learning_rate": 5e-05, "llm_loss": 0.6352780312299728, "loss": 2.8981, "loss_aux_layer_0": 0.018768310546875, "loss_aux_layer_1": 0.03662109375, "loss_aux_layer_10": 0.065185546875, "loss_aux_layer_11": 0.0694580078125, "loss_aux_layer_12": 0.0740966796875, "loss_aux_layer_13": 0.0797119140625, "loss_aux_layer_14": 0.0885009765625, "loss_aux_layer_15": 0.09716796875, "loss_aux_layer_16": 0.1063232421875, "loss_aux_layer_17": 0.1141357421875, "loss_aux_layer_18": 0.122802734375, "loss_aux_layer_19": 0.1268310546875, "loss_aux_layer_2": 0.04962158203125, "loss_aux_layer_20": 0.1346435546875, "loss_aux_layer_21": 0.14306640625, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.06011962890625, "loss_aux_layer_4": 0.062744140625, "loss_aux_layer_5": 0.06463623046875, "loss_aux_layer_6": 0.0673828125, "loss_aux_layer_7": 0.0654296875, "loss_aux_layer_8": 0.064697265625, "loss_aux_layer_9": 0.063720703125, "step": 2785, "total_loss": 0.7245147973299026 }, { "epoch": 0.5515739457533162, "grad_norm": 1.1158866882324219, "learning_rate": 5e-05, "llm_loss": 0.6241467595100403, "loss": 2.8632, "loss_aux_layer_0": 0.019287109375, "loss_aux_layer_1": 0.0384521484375, "loss_aux_layer_10": 0.06622314453125, "loss_aux_layer_11": 0.0706787109375, "loss_aux_layer_12": 0.076171875, "loss_aux_layer_13": 0.08203125, "loss_aux_layer_14": 0.091552734375, "loss_aux_layer_15": 0.1007080078125, "loss_aux_layer_16": 0.111083984375, "loss_aux_layer_17": 0.118408203125, "loss_aux_layer_18": 0.1270751953125, "loss_aux_layer_19": 0.1302490234375, "loss_aux_layer_2": 0.05230712890625, "loss_aux_layer_20": 0.137939453125, "loss_aux_layer_21": 0.1455078125, "loss_aux_layer_22": 0.16650390625, "loss_aux_layer_23": 0.203857421875, "loss_aux_layer_3": 0.0623779296875, "loss_aux_layer_4": 0.06488037109375, "loss_aux_layer_5": 0.0665283203125, "loss_aux_layer_6": 0.069580078125, "loss_aux_layer_7": 0.06707763671875, "loss_aux_layer_8": 0.06622314453125, "loss_aux_layer_9": 0.06488037109375, "step": 2786, "total_loss": 0.7157934457063675 }, { "epoch": 0.5517719263512176, "grad_norm": 1.2475427389144897, "learning_rate": 5e-05, "llm_loss": 0.6108535081148148, "loss": 2.7978, "loss_aux_layer_0": 0.01947021484375, "loss_aux_layer_1": 0.03668212890625, "loss_aux_layer_10": 0.063720703125, "loss_aux_layer_11": 0.068115234375, "loss_aux_layer_12": 0.0732421875, "loss_aux_layer_13": 0.079345703125, "loss_aux_layer_14": 0.0880126953125, "loss_aux_layer_15": 0.0966796875, "loss_aux_layer_16": 0.1065673828125, "loss_aux_layer_17": 0.114501953125, "loss_aux_layer_18": 0.123291015625, "loss_aux_layer_19": 0.1270751953125, "loss_aux_layer_2": 0.0494384765625, "loss_aux_layer_20": 0.134521484375, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.0592041015625, "loss_aux_layer_4": 0.0618896484375, "loss_aux_layer_5": 0.06353759765625, "loss_aux_layer_6": 0.0662841796875, "loss_aux_layer_7": 0.06427001953125, "loss_aux_layer_8": 0.0635986328125, "loss_aux_layer_9": 0.06243896484375, "step": 2787, "total_loss": 0.6994619518518448 }, { "epoch": 0.551969906949119, "grad_norm": 0.971691906452179, "learning_rate": 5e-05, "llm_loss": 0.5991533324122429, "loss": 2.7584, "loss_aux_layer_0": 0.020172119140625, "loss_aux_layer_1": 0.03802490234375, "loss_aux_layer_10": 0.06549072265625, "loss_aux_layer_11": 0.0701904296875, "loss_aux_layer_12": 0.0753173828125, "loss_aux_layer_13": 0.081787109375, "loss_aux_layer_14": 0.091064453125, "loss_aux_layer_15": 0.0999755859375, "loss_aux_layer_16": 0.10888671875, "loss_aux_layer_17": 0.116943359375, "loss_aux_layer_18": 0.1248779296875, "loss_aux_layer_19": 0.127685546875, "loss_aux_layer_2": 0.05084228515625, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.14306640625, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.06121826171875, "loss_aux_layer_4": 0.06378173828125, "loss_aux_layer_5": 0.0654296875, "loss_aux_layer_6": 0.06829833984375, "loss_aux_layer_7": 0.06597900390625, "loss_aux_layer_8": 0.06549072265625, "loss_aux_layer_9": 0.06414794921875, "step": 2788, "total_loss": 0.6896038055419922 }, { "epoch": 0.5521678875470204, "grad_norm": 1.3362584114074707, "learning_rate": 5e-05, "llm_loss": 0.6303506344556808, "loss": 2.8862, "loss_aux_layer_0": 0.018829345703125, "loss_aux_layer_1": 0.03790283203125, "loss_aux_layer_10": 0.066650390625, "loss_aux_layer_11": 0.0712890625, "loss_aux_layer_12": 0.0755615234375, "loss_aux_layer_13": 0.081298828125, "loss_aux_layer_14": 0.090576171875, "loss_aux_layer_15": 0.099853515625, "loss_aux_layer_16": 0.1097412109375, "loss_aux_layer_17": 0.118408203125, "loss_aux_layer_18": 0.126708984375, "loss_aux_layer_19": 0.129638671875, "loss_aux_layer_2": 0.0517578125, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.20263671875, "loss_aux_layer_3": 0.06195068359375, "loss_aux_layer_4": 0.064697265625, "loss_aux_layer_5": 0.066650390625, "loss_aux_layer_6": 0.0693359375, "loss_aux_layer_7": 0.06689453125, "loss_aux_layer_8": 0.06640625, "loss_aux_layer_9": 0.0653076171875, "step": 2789, "total_loss": 0.7215471863746643 }, { "epoch": 0.5523658681449218, "grad_norm": 0.852023720741272, "learning_rate": 5e-05, "llm_loss": 0.5153110027313232, "loss": 2.414, "loss_aux_layer_0": 0.01910400390625, "loss_aux_layer_1": 0.03680419921875, "loss_aux_layer_10": 0.063720703125, "loss_aux_layer_11": 0.06768798828125, "loss_aux_layer_12": 0.0723876953125, "loss_aux_layer_13": 0.0782470703125, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.095703125, "loss_aux_layer_16": 0.1055908203125, "loss_aux_layer_17": 0.1141357421875, "loss_aux_layer_18": 0.12255859375, "loss_aux_layer_19": 0.1256103515625, "loss_aux_layer_2": 0.050048828125, "loss_aux_layer_20": 0.1337890625, "loss_aux_layer_21": 0.141357421875, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.1982421875, "loss_aux_layer_3": 0.05987548828125, "loss_aux_layer_4": 0.062255859375, "loss_aux_layer_5": 0.063720703125, "loss_aux_layer_6": 0.06658935546875, "loss_aux_layer_7": 0.064208984375, "loss_aux_layer_8": 0.063720703125, "loss_aux_layer_9": 0.062255859375, "step": 2790, "total_loss": 0.6035093814134598 }, { "epoch": 0.5525638487428232, "grad_norm": 1.346889853477478, "learning_rate": 5e-05, "llm_loss": 0.587434247136116, "loss": 2.7265, "loss_aux_layer_0": 0.021484375, "loss_aux_layer_1": 0.03924560546875, "loss_aux_layer_10": 0.0694580078125, "loss_aux_layer_11": 0.0740966796875, "loss_aux_layer_12": 0.0797119140625, "loss_aux_layer_13": 0.086181640625, "loss_aux_layer_14": 0.095703125, "loss_aux_layer_15": 0.1051025390625, "loss_aux_layer_16": 0.115478515625, "loss_aux_layer_17": 0.1234130859375, "loss_aux_layer_18": 0.1314697265625, "loss_aux_layer_19": 0.1331787109375, "loss_aux_layer_2": 0.05255126953125, "loss_aux_layer_20": 0.140380859375, "loss_aux_layer_21": 0.147216796875, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.06365966796875, "loss_aux_layer_4": 0.06640625, "loss_aux_layer_5": 0.06817626953125, "loss_aux_layer_6": 0.0712890625, "loss_aux_layer_7": 0.06939697265625, "loss_aux_layer_8": 0.0687255859375, "loss_aux_layer_9": 0.06787109375, "step": 2791, "total_loss": 0.6816345006227493 }, { "epoch": 0.5527618293407246, "grad_norm": 1.3142316341400146, "learning_rate": 5e-05, "llm_loss": 0.553662046790123, "loss": 2.5694, "loss_aux_layer_0": 0.018280029296875, "loss_aux_layer_1": 0.03521728515625, "loss_aux_layer_10": 0.0631103515625, "loss_aux_layer_11": 0.067138671875, "loss_aux_layer_12": 0.0721435546875, "loss_aux_layer_13": 0.078125, "loss_aux_layer_14": 0.0875244140625, "loss_aux_layer_15": 0.0968017578125, "loss_aux_layer_16": 0.106689453125, "loss_aux_layer_17": 0.114990234375, "loss_aux_layer_18": 0.123779296875, "loss_aux_layer_19": 0.1273193359375, "loss_aux_layer_2": 0.04937744140625, "loss_aux_layer_20": 0.135498046875, "loss_aux_layer_21": 0.144287109375, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.204833984375, "loss_aux_layer_3": 0.05914306640625, "loss_aux_layer_4": 0.06134033203125, "loss_aux_layer_5": 0.06280517578125, "loss_aux_layer_6": 0.065673828125, "loss_aux_layer_7": 0.06341552734375, "loss_aux_layer_8": 0.06268310546875, "loss_aux_layer_9": 0.06170654296875, "step": 2792, "total_loss": 0.6423402577638626 }, { "epoch": 0.552959809938626, "grad_norm": 1.322453260421753, "learning_rate": 5e-05, "llm_loss": 0.6020790785551071, "loss": 2.7708, "loss_aux_layer_0": 0.021759033203125, "loss_aux_layer_1": 0.03790283203125, "loss_aux_layer_10": 0.06573486328125, "loss_aux_layer_11": 0.0699462890625, "loss_aux_layer_12": 0.07470703125, "loss_aux_layer_13": 0.0804443359375, "loss_aux_layer_14": 0.0889892578125, "loss_aux_layer_15": 0.09814453125, "loss_aux_layer_16": 0.1075439453125, "loss_aux_layer_17": 0.1156005859375, "loss_aux_layer_18": 0.124267578125, "loss_aux_layer_19": 0.12841796875, "loss_aux_layer_2": 0.0517578125, "loss_aux_layer_20": 0.136474609375, "loss_aux_layer_21": 0.145263671875, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.0618896484375, "loss_aux_layer_4": 0.0643310546875, "loss_aux_layer_5": 0.06622314453125, "loss_aux_layer_6": 0.0689697265625, "loss_aux_layer_7": 0.066650390625, "loss_aux_layer_8": 0.0657958984375, "loss_aux_layer_9": 0.06427001953125, "step": 2793, "total_loss": 0.6927042007446289 }, { "epoch": 0.5531577905365275, "grad_norm": 1.662668228149414, "learning_rate": 5e-05, "llm_loss": 0.5906579494476318, "loss": 2.7258, "loss_aux_layer_0": 0.019805908203125, "loss_aux_layer_1": 0.03839111328125, "loss_aux_layer_10": 0.06591796875, "loss_aux_layer_11": 0.070556640625, "loss_aux_layer_12": 0.0753173828125, "loss_aux_layer_13": 0.0810546875, "loss_aux_layer_14": 0.090087890625, "loss_aux_layer_15": 0.09912109375, "loss_aux_layer_16": 0.1090087890625, "loss_aux_layer_17": 0.1165771484375, "loss_aux_layer_18": 0.125, "loss_aux_layer_19": 0.1285400390625, "loss_aux_layer_2": 0.05230712890625, "loss_aux_layer_20": 0.136474609375, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.203125, "loss_aux_layer_3": 0.06268310546875, "loss_aux_layer_4": 0.06494140625, "loss_aux_layer_5": 0.0662841796875, "loss_aux_layer_6": 0.0687255859375, "loss_aux_layer_7": 0.06646728515625, "loss_aux_layer_8": 0.06561279296875, "loss_aux_layer_9": 0.064697265625, "step": 2794, "total_loss": 0.6814494878053665 }, { "epoch": 0.5533557711344288, "grad_norm": 0.9482941031455994, "learning_rate": 5e-05, "llm_loss": 0.569000206887722, "loss": 2.6271, "loss_aux_layer_0": 0.019195556640625, "loss_aux_layer_1": 0.03668212890625, "loss_aux_layer_10": 0.0633544921875, "loss_aux_layer_11": 0.0677490234375, "loss_aux_layer_12": 0.072509765625, "loss_aux_layer_13": 0.0784912109375, "loss_aux_layer_14": 0.0872802734375, "loss_aux_layer_15": 0.09619140625, "loss_aux_layer_16": 0.1058349609375, "loss_aux_layer_17": 0.114013671875, "loss_aux_layer_18": 0.1219482421875, "loss_aux_layer_19": 0.125, "loss_aux_layer_2": 0.04962158203125, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.1943359375, "loss_aux_layer_3": 0.0599365234375, "loss_aux_layer_4": 0.06256103515625, "loss_aux_layer_5": 0.064208984375, "loss_aux_layer_6": 0.0667724609375, "loss_aux_layer_7": 0.0643310546875, "loss_aux_layer_8": 0.06365966796875, "loss_aux_layer_9": 0.06219482421875, "step": 2795, "total_loss": 0.656774640083313 }, { "epoch": 0.5535537517323302, "grad_norm": 1.3454172611236572, "learning_rate": 5e-05, "llm_loss": 0.559580035507679, "loss": 2.6001, "loss_aux_layer_0": 0.02197265625, "loss_aux_layer_1": 0.03814697265625, "loss_aux_layer_10": 0.06591796875, "loss_aux_layer_11": 0.0701904296875, "loss_aux_layer_12": 0.0755615234375, "loss_aux_layer_13": 0.08154296875, "loss_aux_layer_14": 0.0906982421875, "loss_aux_layer_15": 0.0994873046875, "loss_aux_layer_16": 0.10888671875, "loss_aux_layer_17": 0.1165771484375, "loss_aux_layer_18": 0.124755859375, "loss_aux_layer_19": 0.127685546875, "loss_aux_layer_2": 0.0511474609375, "loss_aux_layer_20": 0.13525390625, "loss_aux_layer_21": 0.142333984375, "loss_aux_layer_22": 0.1640625, "loss_aux_layer_23": 0.20068359375, "loss_aux_layer_3": 0.0615234375, "loss_aux_layer_4": 0.06402587890625, "loss_aux_layer_5": 0.06573486328125, "loss_aux_layer_6": 0.0684814453125, "loss_aux_layer_7": 0.0662841796875, "loss_aux_layer_8": 0.06591796875, "loss_aux_layer_9": 0.06463623046875, "step": 2796, "total_loss": 0.6500328332185745 }, { "epoch": 0.5537517323302317, "grad_norm": 1.0784481763839722, "learning_rate": 5e-05, "llm_loss": 0.537302240729332, "loss": 2.5255, "loss_aux_layer_0": 0.018035888671875, "loss_aux_layer_1": 0.0377197265625, "loss_aux_layer_10": 0.068115234375, "loss_aux_layer_11": 0.0726318359375, "loss_aux_layer_12": 0.077880859375, "loss_aux_layer_13": 0.0843505859375, "loss_aux_layer_14": 0.0941162109375, "loss_aux_layer_15": 0.1038818359375, "loss_aux_layer_16": 0.11474609375, "loss_aux_layer_17": 0.1231689453125, "loss_aux_layer_18": 0.132080078125, "loss_aux_layer_19": 0.135498046875, "loss_aux_layer_2": 0.05267333984375, "loss_aux_layer_20": 0.143310546875, "loss_aux_layer_21": 0.15087890625, "loss_aux_layer_22": 0.171630859375, "loss_aux_layer_23": 0.209716796875, "loss_aux_layer_3": 0.06292724609375, "loss_aux_layer_4": 0.0654296875, "loss_aux_layer_5": 0.06707763671875, "loss_aux_layer_6": 0.0701904296875, "loss_aux_layer_7": 0.0679931640625, "loss_aux_layer_8": 0.067626953125, "loss_aux_layer_9": 0.0665283203125, "step": 2797, "total_loss": 0.6313793361186981 }, { "epoch": 0.553949712928133, "grad_norm": 1.2684156894683838, "learning_rate": 5e-05, "llm_loss": 0.5496087670326233, "loss": 2.5724, "loss_aux_layer_0": 0.021453857421875, "loss_aux_layer_1": 0.0406494140625, "loss_aux_layer_10": 0.069580078125, "loss_aux_layer_11": 0.0740966796875, "loss_aux_layer_12": 0.0791015625, "loss_aux_layer_13": 0.0849609375, "loss_aux_layer_14": 0.093505859375, "loss_aux_layer_15": 0.1021728515625, "loss_aux_layer_16": 0.111083984375, "loss_aux_layer_17": 0.1185302734375, "loss_aux_layer_18": 0.1263427734375, "loss_aux_layer_19": 0.129638671875, "loss_aux_layer_2": 0.054443359375, "loss_aux_layer_20": 0.136962890625, "loss_aux_layer_21": 0.14453125, "loss_aux_layer_22": 0.166748046875, "loss_aux_layer_23": 0.20556640625, "loss_aux_layer_3": 0.0653076171875, "loss_aux_layer_4": 0.068115234375, "loss_aux_layer_5": 0.0699462890625, "loss_aux_layer_6": 0.072998046875, "loss_aux_layer_7": 0.070556640625, "loss_aux_layer_8": 0.06982421875, "loss_aux_layer_9": 0.068115234375, "step": 2798, "total_loss": 0.6430947333574295 }, { "epoch": 0.5541476935260344, "grad_norm": 1.1351205110549927, "learning_rate": 5e-05, "llm_loss": 0.5734614878892899, "loss": 2.6422, "loss_aux_layer_0": 0.018646240234375, "loss_aux_layer_1": 0.03680419921875, "loss_aux_layer_10": 0.06256103515625, "loss_aux_layer_11": 0.066650390625, "loss_aux_layer_12": 0.0714111328125, "loss_aux_layer_13": 0.0775146484375, "loss_aux_layer_14": 0.0859375, "loss_aux_layer_15": 0.094970703125, "loss_aux_layer_16": 0.104248046875, "loss_aux_layer_17": 0.112060546875, "loss_aux_layer_18": 0.1209716796875, "loss_aux_layer_19": 0.12451171875, "loss_aux_layer_2": 0.048828125, "loss_aux_layer_20": 0.132568359375, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.0589599609375, "loss_aux_layer_4": 0.0609130859375, "loss_aux_layer_5": 0.06256103515625, "loss_aux_layer_6": 0.065185546875, "loss_aux_layer_7": 0.06304931640625, "loss_aux_layer_8": 0.06243896484375, "loss_aux_layer_9": 0.06134033203125, "step": 2799, "total_loss": 0.6605543792247772 }, { "epoch": 0.5543456741239359, "grad_norm": 1.2017879486083984, "learning_rate": 5e-05, "llm_loss": 0.5792004838585854, "loss": 2.6788, "loss_aux_layer_0": 0.020294189453125, "loss_aux_layer_1": 0.0386962890625, "loss_aux_layer_10": 0.0670166015625, "loss_aux_layer_11": 0.0711669921875, "loss_aux_layer_12": 0.0760498046875, "loss_aux_layer_13": 0.0814208984375, "loss_aux_layer_14": 0.0902099609375, "loss_aux_layer_15": 0.098388671875, "loss_aux_layer_16": 0.1075439453125, "loss_aux_layer_17": 0.115234375, "loss_aux_layer_18": 0.1236572265625, "loss_aux_layer_19": 0.126220703125, "loss_aux_layer_2": 0.052734375, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.06317138671875, "loss_aux_layer_4": 0.06597900390625, "loss_aux_layer_5": 0.067626953125, "loss_aux_layer_6": 0.0704345703125, "loss_aux_layer_7": 0.06787109375, "loss_aux_layer_8": 0.0670166015625, "loss_aux_layer_9": 0.065673828125, "step": 2800, "total_loss": 0.6697040498256683 }, { "epoch": 0.5545436547218373, "grad_norm": 1.0245698690414429, "learning_rate": 5e-05, "llm_loss": 0.6623624563217163, "loss": 2.9988, "loss_aux_layer_0": 0.019500732421875, "loss_aux_layer_1": 0.035400390625, "loss_aux_layer_10": 0.0628662109375, "loss_aux_layer_11": 0.06695556640625, "loss_aux_layer_12": 0.0718994140625, "loss_aux_layer_13": 0.07763671875, "loss_aux_layer_14": 0.0865478515625, "loss_aux_layer_15": 0.09521484375, "loss_aux_layer_16": 0.1046142578125, "loss_aux_layer_17": 0.1129150390625, "loss_aux_layer_18": 0.1214599609375, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.047607421875, "loss_aux_layer_20": 0.133544921875, "loss_aux_layer_21": 0.141357421875, "loss_aux_layer_22": 0.162353515625, "loss_aux_layer_23": 0.19970703125, "loss_aux_layer_3": 0.0574951171875, "loss_aux_layer_4": 0.06036376953125, "loss_aux_layer_5": 0.0621337890625, "loss_aux_layer_6": 0.0653076171875, "loss_aux_layer_7": 0.06298828125, "loss_aux_layer_8": 0.06231689453125, "loss_aux_layer_9": 0.0611572265625, "step": 2801, "total_loss": 0.7497085481882095 }, { "epoch": 0.5547416353197386, "grad_norm": 1.0230623483657837, "learning_rate": 5e-05, "llm_loss": 0.7080684453248978, "loss": 3.2012, "loss_aux_layer_0": 0.018218994140625, "loss_aux_layer_1": 0.0384521484375, "loss_aux_layer_10": 0.06817626953125, "loss_aux_layer_11": 0.07281494140625, "loss_aux_layer_12": 0.077880859375, "loss_aux_layer_13": 0.083984375, "loss_aux_layer_14": 0.092529296875, "loss_aux_layer_15": 0.10107421875, "loss_aux_layer_16": 0.11083984375, "loss_aux_layer_17": 0.118408203125, "loss_aux_layer_18": 0.1265869140625, "loss_aux_layer_19": 0.12939453125, "loss_aux_layer_2": 0.05255126953125, "loss_aux_layer_20": 0.136962890625, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.203857421875, "loss_aux_layer_3": 0.06365966796875, "loss_aux_layer_4": 0.066162109375, "loss_aux_layer_5": 0.0677490234375, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.068603515625, "loss_aux_layer_8": 0.06787109375, "loss_aux_layer_9": 0.066650390625, "step": 2802, "total_loss": 0.8003056943416595 }, { "epoch": 0.5549396159176401, "grad_norm": 1.0202410221099854, "learning_rate": 5e-05, "llm_loss": 0.4968806356191635, "loss": 2.3519, "loss_aux_layer_0": 0.020111083984375, "loss_aux_layer_1": 0.03778076171875, "loss_aux_layer_10": 0.06640625, "loss_aux_layer_11": 0.0706787109375, "loss_aux_layer_12": 0.07568359375, "loss_aux_layer_13": 0.08154296875, "loss_aux_layer_14": 0.09033203125, "loss_aux_layer_15": 0.09912109375, "loss_aux_layer_16": 0.1087646484375, "loss_aux_layer_17": 0.1170654296875, "loss_aux_layer_18": 0.1260986328125, "loss_aux_layer_19": 0.129150390625, "loss_aux_layer_2": 0.05133056640625, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.145751953125, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.06182861328125, "loss_aux_layer_4": 0.06427001953125, "loss_aux_layer_5": 0.06597900390625, "loss_aux_layer_6": 0.0687255859375, "loss_aux_layer_7": 0.066650390625, "loss_aux_layer_8": 0.066162109375, "loss_aux_layer_9": 0.06494140625, "step": 2803, "total_loss": 0.5879821330308914 }, { "epoch": 0.5551375965155415, "grad_norm": 1.4635847806930542, "learning_rate": 5e-05, "llm_loss": 0.5991402119398117, "loss": 2.7643, "loss_aux_layer_0": 0.01800537109375, "loss_aux_layer_1": 0.03900146484375, "loss_aux_layer_10": 0.068115234375, "loss_aux_layer_11": 0.072509765625, "loss_aux_layer_12": 0.0775146484375, "loss_aux_layer_13": 0.0831298828125, "loss_aux_layer_14": 0.0919189453125, "loss_aux_layer_15": 0.1002197265625, "loss_aux_layer_16": 0.1094970703125, "loss_aux_layer_17": 0.1171875, "loss_aux_layer_18": 0.1251220703125, "loss_aux_layer_19": 0.128173828125, "loss_aux_layer_2": 0.05352783203125, "loss_aux_layer_20": 0.1357421875, "loss_aux_layer_21": 0.143310546875, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.202880859375, "loss_aux_layer_3": 0.0643310546875, "loss_aux_layer_4": 0.0672607421875, "loss_aux_layer_5": 0.0689697265625, "loss_aux_layer_6": 0.07177734375, "loss_aux_layer_7": 0.069091796875, "loss_aux_layer_8": 0.068115234375, "loss_aux_layer_9": 0.06689453125, "step": 2804, "total_loss": 0.6910855323076248 }, { "epoch": 0.5553355771134428, "grad_norm": 1.4465564489364624, "learning_rate": 5e-05, "llm_loss": 0.5742234736680984, "loss": 2.6499, "loss_aux_layer_0": 0.018157958984375, "loss_aux_layer_1": 0.03619384765625, "loss_aux_layer_10": 0.0634765625, "loss_aux_layer_11": 0.0673828125, "loss_aux_layer_12": 0.0726318359375, "loss_aux_layer_13": 0.07861328125, "loss_aux_layer_14": 0.08740234375, "loss_aux_layer_15": 0.096435546875, "loss_aux_layer_16": 0.10595703125, "loss_aux_layer_17": 0.114013671875, "loss_aux_layer_18": 0.1226806640625, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.049560546875, "loss_aux_layer_20": 0.13330078125, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.163330078125, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.0595703125, "loss_aux_layer_4": 0.061767578125, "loss_aux_layer_5": 0.0631103515625, "loss_aux_layer_6": 0.0655517578125, "loss_aux_layer_7": 0.06353759765625, "loss_aux_layer_8": 0.06317138671875, "loss_aux_layer_9": 0.06207275390625, "step": 2805, "total_loss": 0.662479966878891 }, { "epoch": 0.5555335577113443, "grad_norm": 0.9478911757469177, "learning_rate": 5e-05, "llm_loss": 0.5661232471466064, "loss": 2.64, "loss_aux_layer_0": 0.01806640625, "loss_aux_layer_1": 0.03887939453125, "loss_aux_layer_10": 0.070068359375, "loss_aux_layer_11": 0.0743408203125, "loss_aux_layer_12": 0.07958984375, "loss_aux_layer_13": 0.08544921875, "loss_aux_layer_14": 0.094970703125, "loss_aux_layer_15": 0.1036376953125, "loss_aux_layer_16": 0.1131591796875, "loss_aux_layer_17": 0.12060546875, "loss_aux_layer_18": 0.12890625, "loss_aux_layer_19": 0.13134765625, "loss_aux_layer_2": 0.052978515625, "loss_aux_layer_20": 0.138671875, "loss_aux_layer_21": 0.146484375, "loss_aux_layer_22": 0.16748046875, "loss_aux_layer_23": 0.205078125, "loss_aux_layer_3": 0.06414794921875, "loss_aux_layer_4": 0.0673828125, "loss_aux_layer_5": 0.0693359375, "loss_aux_layer_6": 0.072998046875, "loss_aux_layer_7": 0.0706787109375, "loss_aux_layer_8": 0.0699462890625, "loss_aux_layer_9": 0.0684814453125, "step": 2806, "total_loss": 0.6600013673305511 }, { "epoch": 0.5557315383092457, "grad_norm": 1.1661244630813599, "learning_rate": 5e-05, "llm_loss": 0.5942059606313705, "loss": 2.7407, "loss_aux_layer_0": 0.017822265625, "loss_aux_layer_1": 0.0374755859375, "loss_aux_layer_10": 0.0670166015625, "loss_aux_layer_11": 0.0712890625, "loss_aux_layer_12": 0.076171875, "loss_aux_layer_13": 0.08203125, "loss_aux_layer_14": 0.0916748046875, "loss_aux_layer_15": 0.100830078125, "loss_aux_layer_16": 0.1104736328125, "loss_aux_layer_17": 0.11865234375, "loss_aux_layer_18": 0.1265869140625, "loss_aux_layer_19": 0.12890625, "loss_aux_layer_2": 0.05157470703125, "loss_aux_layer_20": 0.13623046875, "loss_aux_layer_21": 0.143310546875, "loss_aux_layer_22": 0.163330078125, "loss_aux_layer_23": 0.198974609375, "loss_aux_layer_3": 0.06195068359375, "loss_aux_layer_4": 0.06488037109375, "loss_aux_layer_5": 0.066162109375, "loss_aux_layer_6": 0.069091796875, "loss_aux_layer_7": 0.06689453125, "loss_aux_layer_8": 0.0665283203125, "loss_aux_layer_9": 0.0654296875, "step": 2807, "total_loss": 0.6851658672094345 }, { "epoch": 0.5559295189071471, "grad_norm": 1.783542275428772, "learning_rate": 5e-05, "llm_loss": 0.5791031569242477, "loss": 2.6622, "loss_aux_layer_0": 0.018096923828125, "loss_aux_layer_1": 0.03558349609375, "loss_aux_layer_10": 0.06170654296875, "loss_aux_layer_11": 0.065673828125, "loss_aux_layer_12": 0.0704345703125, "loss_aux_layer_13": 0.076171875, "loss_aux_layer_14": 0.0848388671875, "loss_aux_layer_15": 0.0933837890625, "loss_aux_layer_16": 0.1025390625, "loss_aux_layer_17": 0.1107177734375, "loss_aux_layer_18": 0.1199951171875, "loss_aux_layer_19": 0.123779296875, "loss_aux_layer_2": 0.0484619140625, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.0576171875, "loss_aux_layer_4": 0.0599365234375, "loss_aux_layer_5": 0.0614013671875, "loss_aux_layer_6": 0.06414794921875, "loss_aux_layer_7": 0.0621337890625, "loss_aux_layer_8": 0.06146240234375, "loss_aux_layer_9": 0.06036376953125, "step": 2808, "total_loss": 0.6655609458684921 }, { "epoch": 0.5561274995050485, "grad_norm": 1.4460744857788086, "learning_rate": 5e-05, "llm_loss": 0.6479585319757462, "loss": 2.9559, "loss_aux_layer_0": 0.01812744140625, "loss_aux_layer_1": 0.0380859375, "loss_aux_layer_10": 0.06671142578125, "loss_aux_layer_11": 0.071044921875, "loss_aux_layer_12": 0.0758056640625, "loss_aux_layer_13": 0.08203125, "loss_aux_layer_14": 0.0908203125, "loss_aux_layer_15": 0.099853515625, "loss_aux_layer_16": 0.109619140625, "loss_aux_layer_17": 0.1175537109375, "loss_aux_layer_18": 0.125732421875, "loss_aux_layer_19": 0.128662109375, "loss_aux_layer_2": 0.05279541015625, "loss_aux_layer_20": 0.13623046875, "loss_aux_layer_21": 0.143310546875, "loss_aux_layer_22": 0.16357421875, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.06292724609375, "loss_aux_layer_4": 0.06512451171875, "loss_aux_layer_5": 0.06640625, "loss_aux_layer_6": 0.0694580078125, "loss_aux_layer_7": 0.0673828125, "loss_aux_layer_8": 0.0665283203125, "loss_aux_layer_9": 0.06512451171875, "step": 2809, "total_loss": 0.7389853149652481 }, { "epoch": 0.5563254801029499, "grad_norm": 1.4333417415618896, "learning_rate": 5e-05, "llm_loss": 0.6112416386604309, "loss": 2.8204, "loss_aux_layer_0": 0.0184326171875, "loss_aux_layer_1": 0.03936767578125, "loss_aux_layer_10": 0.06878662109375, "loss_aux_layer_11": 0.0732421875, "loss_aux_layer_12": 0.0782470703125, "loss_aux_layer_13": 0.0843505859375, "loss_aux_layer_14": 0.0936279296875, "loss_aux_layer_15": 0.10302734375, "loss_aux_layer_16": 0.113037109375, "loss_aux_layer_17": 0.1212158203125, "loss_aux_layer_18": 0.1300048828125, "loss_aux_layer_19": 0.133544921875, "loss_aux_layer_2": 0.054443359375, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.168701171875, "loss_aux_layer_23": 0.206787109375, "loss_aux_layer_3": 0.064453125, "loss_aux_layer_4": 0.06719970703125, "loss_aux_layer_5": 0.068603515625, "loss_aux_layer_6": 0.071533203125, "loss_aux_layer_7": 0.06927490234375, "loss_aux_layer_8": 0.06878662109375, "loss_aux_layer_9": 0.06744384765625, "step": 2810, "total_loss": 0.7050946056842804 }, { "epoch": 0.5565234607008513, "grad_norm": 1.2733287811279297, "learning_rate": 5e-05, "llm_loss": 0.6061557084321976, "loss": 2.7767, "loss_aux_layer_0": 0.018280029296875, "loss_aux_layer_1": 0.036376953125, "loss_aux_layer_10": 0.06396484375, "loss_aux_layer_11": 0.0677490234375, "loss_aux_layer_12": 0.072265625, "loss_aux_layer_13": 0.0780029296875, "loss_aux_layer_14": 0.0867919921875, "loss_aux_layer_15": 0.0955810546875, "loss_aux_layer_16": 0.10546875, "loss_aux_layer_17": 0.113525390625, "loss_aux_layer_18": 0.12255859375, "loss_aux_layer_19": 0.1263427734375, "loss_aux_layer_2": 0.04937744140625, "loss_aux_layer_20": 0.133544921875, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.197265625, "loss_aux_layer_3": 0.05950927734375, "loss_aux_layer_4": 0.06243896484375, "loss_aux_layer_5": 0.0640869140625, "loss_aux_layer_6": 0.06689453125, "loss_aux_layer_7": 0.0648193359375, "loss_aux_layer_8": 0.0643310546875, "loss_aux_layer_9": 0.06317138671875, "step": 2811, "total_loss": 0.6941750794649124 }, { "epoch": 0.5567214412987527, "grad_norm": 1.3884024620056152, "learning_rate": 5e-05, "llm_loss": 0.5754608809947968, "loss": 2.6639, "loss_aux_layer_0": 0.01922607421875, "loss_aux_layer_1": 0.038330078125, "loss_aux_layer_10": 0.06561279296875, "loss_aux_layer_11": 0.0697021484375, "loss_aux_layer_12": 0.0748291015625, "loss_aux_layer_13": 0.080810546875, "loss_aux_layer_14": 0.090087890625, "loss_aux_layer_15": 0.098876953125, "loss_aux_layer_16": 0.108642578125, "loss_aux_layer_17": 0.11669921875, "loss_aux_layer_18": 0.124755859375, "loss_aux_layer_19": 0.128173828125, "loss_aux_layer_2": 0.0513916015625, "loss_aux_layer_20": 0.135986328125, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.0616455078125, "loss_aux_layer_4": 0.064453125, "loss_aux_layer_5": 0.0655517578125, "loss_aux_layer_6": 0.06890869140625, "loss_aux_layer_7": 0.06640625, "loss_aux_layer_8": 0.06585693359375, "loss_aux_layer_9": 0.06439208984375, "step": 2812, "total_loss": 0.665965236723423 }, { "epoch": 0.5569194218966541, "grad_norm": 1.0530091524124146, "learning_rate": 5e-05, "llm_loss": 0.586841993033886, "loss": 2.6952, "loss_aux_layer_0": 0.018157958984375, "loss_aux_layer_1": 0.03631591796875, "loss_aux_layer_10": 0.06280517578125, "loss_aux_layer_11": 0.0670166015625, "loss_aux_layer_12": 0.0716552734375, "loss_aux_layer_13": 0.0771484375, "loss_aux_layer_14": 0.085693359375, "loss_aux_layer_15": 0.093994140625, "loss_aux_layer_16": 0.1041259765625, "loss_aux_layer_17": 0.1121826171875, "loss_aux_layer_18": 0.121337890625, "loss_aux_layer_19": 0.1246337890625, "loss_aux_layer_2": 0.0489501953125, "loss_aux_layer_20": 0.1318359375, "loss_aux_layer_21": 0.1396484375, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.058837890625, "loss_aux_layer_4": 0.0615234375, "loss_aux_layer_5": 0.0628662109375, "loss_aux_layer_6": 0.065673828125, "loss_aux_layer_7": 0.06341552734375, "loss_aux_layer_8": 0.06280517578125, "loss_aux_layer_9": 0.0615234375, "step": 2813, "total_loss": 0.6737964302301407 }, { "epoch": 0.5571174024945555, "grad_norm": 1.013704538345337, "learning_rate": 5e-05, "llm_loss": 0.5069548562169075, "loss": 2.4068, "loss_aux_layer_0": 0.017852783203125, "loss_aux_layer_1": 0.04022216796875, "loss_aux_layer_10": 0.07080078125, "loss_aux_layer_11": 0.0755615234375, "loss_aux_layer_12": 0.08056640625, "loss_aux_layer_13": 0.086669921875, "loss_aux_layer_14": 0.0950927734375, "loss_aux_layer_15": 0.10400390625, "loss_aux_layer_16": 0.11376953125, "loss_aux_layer_17": 0.120849609375, "loss_aux_layer_18": 0.129150390625, "loss_aux_layer_19": 0.132080078125, "loss_aux_layer_2": 0.05474853515625, "loss_aux_layer_20": 0.14013671875, "loss_aux_layer_21": 0.147705078125, "loss_aux_layer_22": 0.169189453125, "loss_aux_layer_23": 0.207275390625, "loss_aux_layer_3": 0.06561279296875, "loss_aux_layer_4": 0.0687255859375, "loss_aux_layer_5": 0.0699462890625, "loss_aux_layer_6": 0.073486328125, "loss_aux_layer_7": 0.0711669921875, "loss_aux_layer_8": 0.070556640625, "loss_aux_layer_9": 0.069091796875, "step": 2814, "total_loss": 0.6017046123743057 }, { "epoch": 0.557315383092457, "grad_norm": 1.1043821573257446, "learning_rate": 5e-05, "llm_loss": 0.5850865021348, "loss": 2.6943, "loss_aux_layer_0": 0.01947021484375, "loss_aux_layer_1": 0.03668212890625, "loss_aux_layer_10": 0.06317138671875, "loss_aux_layer_11": 0.06689453125, "loss_aux_layer_12": 0.0721435546875, "loss_aux_layer_13": 0.0780029296875, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.095703125, "loss_aux_layer_16": 0.1053466796875, "loss_aux_layer_17": 0.1136474609375, "loss_aux_layer_18": 0.1219482421875, "loss_aux_layer_19": 0.1263427734375, "loss_aux_layer_2": 0.04962158203125, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.14404296875, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.05889892578125, "loss_aux_layer_4": 0.06121826171875, "loss_aux_layer_5": 0.06256103515625, "loss_aux_layer_6": 0.06591796875, "loss_aux_layer_7": 0.06365966796875, "loss_aux_layer_8": 0.0633544921875, "loss_aux_layer_9": 0.0621337890625, "step": 2815, "total_loss": 0.6735658645629883 }, { "epoch": 0.5575133636903583, "grad_norm": 1.2575196027755737, "learning_rate": 5e-05, "llm_loss": 0.6019226312637329, "loss": 2.7656, "loss_aux_layer_0": 0.018096923828125, "loss_aux_layer_1": 0.035888671875, "loss_aux_layer_10": 0.06396484375, "loss_aux_layer_11": 0.068359375, "loss_aux_layer_12": 0.0736083984375, "loss_aux_layer_13": 0.079833984375, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.0986328125, "loss_aux_layer_16": 0.108642578125, "loss_aux_layer_17": 0.1168212890625, "loss_aux_layer_18": 0.1256103515625, "loss_aux_layer_19": 0.12890625, "loss_aux_layer_2": 0.04925537109375, "loss_aux_layer_20": 0.136962890625, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.165283203125, "loss_aux_layer_23": 0.203125, "loss_aux_layer_3": 0.058837890625, "loss_aux_layer_4": 0.0614013671875, "loss_aux_layer_5": 0.06317138671875, "loss_aux_layer_6": 0.0667724609375, "loss_aux_layer_7": 0.06439208984375, "loss_aux_layer_8": 0.06390380859375, "loss_aux_layer_9": 0.06256103515625, "step": 2816, "total_loss": 0.6914108395576477 }, { "epoch": 0.5577113442882597, "grad_norm": 0.9330902695655823, "learning_rate": 5e-05, "llm_loss": 0.5755104869604111, "loss": 2.6549, "loss_aux_layer_0": 0.019287109375, "loss_aux_layer_1": 0.03564453125, "loss_aux_layer_10": 0.06219482421875, "loss_aux_layer_11": 0.06622314453125, "loss_aux_layer_12": 0.07122802734375, "loss_aux_layer_13": 0.0771484375, "loss_aux_layer_14": 0.0867919921875, "loss_aux_layer_15": 0.096923828125, "loss_aux_layer_16": 0.107421875, "loss_aux_layer_17": 0.115478515625, "loss_aux_layer_18": 0.1243896484375, "loss_aux_layer_19": 0.12841796875, "loss_aux_layer_2": 0.0482177734375, "loss_aux_layer_20": 0.1357421875, "loss_aux_layer_21": 0.14404296875, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.0579833984375, "loss_aux_layer_4": 0.06060791015625, "loss_aux_layer_5": 0.06219482421875, "loss_aux_layer_6": 0.06512451171875, "loss_aux_layer_7": 0.06292724609375, "loss_aux_layer_8": 0.062255859375, "loss_aux_layer_9": 0.06109619140625, "step": 2817, "total_loss": 0.6637209206819534 }, { "epoch": 0.5579093248861612, "grad_norm": 1.0608247518539429, "learning_rate": 5e-05, "llm_loss": 0.5894396603107452, "loss": 2.715, "loss_aux_layer_0": 0.019287109375, "loss_aux_layer_1": 0.03692626953125, "loss_aux_layer_10": 0.0638427734375, "loss_aux_layer_11": 0.0677490234375, "loss_aux_layer_12": 0.0726318359375, "loss_aux_layer_13": 0.0782470703125, "loss_aux_layer_14": 0.087890625, "loss_aux_layer_15": 0.0977783203125, "loss_aux_layer_16": 0.1080322265625, "loss_aux_layer_17": 0.1160888671875, "loss_aux_layer_18": 0.12451171875, "loss_aux_layer_19": 0.1282958984375, "loss_aux_layer_2": 0.04974365234375, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.20361328125, "loss_aux_layer_3": 0.05950927734375, "loss_aux_layer_4": 0.06207275390625, "loss_aux_layer_5": 0.0635986328125, "loss_aux_layer_6": 0.0662841796875, "loss_aux_layer_7": 0.064208984375, "loss_aux_layer_8": 0.0635986328125, "loss_aux_layer_9": 0.0623779296875, "step": 2818, "total_loss": 0.6787474155426025 }, { "epoch": 0.5581073054840625, "grad_norm": 1.1413283348083496, "learning_rate": 5e-05, "llm_loss": 0.5493763238191605, "loss": 2.542, "loss_aux_layer_0": 0.019775390625, "loss_aux_layer_1": 0.03533935546875, "loss_aux_layer_10": 0.0614013671875, "loss_aux_layer_11": 0.065185546875, "loss_aux_layer_12": 0.06982421875, "loss_aux_layer_13": 0.0755615234375, "loss_aux_layer_14": 0.084228515625, "loss_aux_layer_15": 0.0928955078125, "loss_aux_layer_16": 0.1024169921875, "loss_aux_layer_17": 0.110595703125, "loss_aux_layer_18": 0.1195068359375, "loss_aux_layer_19": 0.1226806640625, "loss_aux_layer_2": 0.04827880859375, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.05780029296875, "loss_aux_layer_4": 0.05999755859375, "loss_aux_layer_5": 0.06121826171875, "loss_aux_layer_6": 0.06402587890625, "loss_aux_layer_7": 0.06182861328125, "loss_aux_layer_8": 0.06121826171875, "loss_aux_layer_9": 0.05999755859375, "step": 2819, "total_loss": 0.635500431060791 }, { "epoch": 0.558305286081964, "grad_norm": 0.8388750553131104, "learning_rate": 5e-05, "llm_loss": 0.646287351846695, "loss": 2.9443, "loss_aux_layer_0": 0.017791748046875, "loss_aux_layer_1": 0.0367431640625, "loss_aux_layer_10": 0.0653076171875, "loss_aux_layer_11": 0.0697021484375, "loss_aux_layer_12": 0.074462890625, "loss_aux_layer_13": 0.0806884765625, "loss_aux_layer_14": 0.0897216796875, "loss_aux_layer_15": 0.0992431640625, "loss_aux_layer_16": 0.109130859375, "loss_aux_layer_17": 0.1173095703125, "loss_aux_layer_18": 0.125244140625, "loss_aux_layer_19": 0.128662109375, "loss_aux_layer_2": 0.0499267578125, "loss_aux_layer_20": 0.135986328125, "loss_aux_layer_21": 0.14306640625, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.1982421875, "loss_aux_layer_3": 0.06060791015625, "loss_aux_layer_4": 0.06341552734375, "loss_aux_layer_5": 0.0650634765625, "loss_aux_layer_6": 0.068115234375, "loss_aux_layer_7": 0.06585693359375, "loss_aux_layer_8": 0.06512451171875, "loss_aux_layer_9": 0.06396484375, "step": 2820, "total_loss": 0.7360872179269791 }, { "epoch": 0.5585032666798654, "grad_norm": 1.1203378438949585, "learning_rate": 5e-05, "llm_loss": 0.5851424485445023, "loss": 2.6994, "loss_aux_layer_0": 0.01849365234375, "loss_aux_layer_1": 0.03851318359375, "loss_aux_layer_10": 0.0667724609375, "loss_aux_layer_11": 0.0714111328125, "loss_aux_layer_12": 0.076171875, "loss_aux_layer_13": 0.0819091796875, "loss_aux_layer_14": 0.090087890625, "loss_aux_layer_15": 0.0980224609375, "loss_aux_layer_16": 0.1070556640625, "loss_aux_layer_17": 0.114501953125, "loss_aux_layer_18": 0.1221923828125, "loss_aux_layer_19": 0.1246337890625, "loss_aux_layer_2": 0.0518798828125, "loss_aux_layer_20": 0.132568359375, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.06231689453125, "loss_aux_layer_4": 0.06463623046875, "loss_aux_layer_5": 0.066162109375, "loss_aux_layer_6": 0.0693359375, "loss_aux_layer_7": 0.06719970703125, "loss_aux_layer_8": 0.0665283203125, "loss_aux_layer_9": 0.0653076171875, "step": 2821, "total_loss": 0.674852192401886 }, { "epoch": 0.5587012472777668, "grad_norm": 0.929607093334198, "learning_rate": 5e-05, "llm_loss": 0.5895976275205612, "loss": 2.7197, "loss_aux_layer_0": 0.020172119140625, "loss_aux_layer_1": 0.03753662109375, "loss_aux_layer_10": 0.0657958984375, "loss_aux_layer_11": 0.06982421875, "loss_aux_layer_12": 0.074462890625, "loss_aux_layer_13": 0.0804443359375, "loss_aux_layer_14": 0.0894775390625, "loss_aux_layer_15": 0.0987548828125, "loss_aux_layer_16": 0.108642578125, "loss_aux_layer_17": 0.116943359375, "loss_aux_layer_18": 0.125244140625, "loss_aux_layer_19": 0.1287841796875, "loss_aux_layer_2": 0.0511474609375, "loss_aux_layer_20": 0.1357421875, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.16357421875, "loss_aux_layer_23": 0.20068359375, "loss_aux_layer_3": 0.0616455078125, "loss_aux_layer_4": 0.06427001953125, "loss_aux_layer_5": 0.066162109375, "loss_aux_layer_6": 0.0689697265625, "loss_aux_layer_7": 0.06640625, "loss_aux_layer_8": 0.06573486328125, "loss_aux_layer_9": 0.064208984375, "step": 2822, "total_loss": 0.6799325346946716 }, { "epoch": 0.5588992278756681, "grad_norm": 0.9391000866889954, "learning_rate": 5e-05, "llm_loss": 0.619384840130806, "loss": 2.8436, "loss_aux_layer_0": 0.018218994140625, "loss_aux_layer_1": 0.03839111328125, "loss_aux_layer_10": 0.06768798828125, "loss_aux_layer_11": 0.072021484375, "loss_aux_layer_12": 0.0770263671875, "loss_aux_layer_13": 0.0830078125, "loss_aux_layer_14": 0.0919189453125, "loss_aux_layer_15": 0.100341796875, "loss_aux_layer_16": 0.109619140625, "loss_aux_layer_17": 0.118408203125, "loss_aux_layer_18": 0.1259765625, "loss_aux_layer_19": 0.12841796875, "loss_aux_layer_2": 0.05255126953125, "loss_aux_layer_20": 0.135986328125, "loss_aux_layer_21": 0.143310546875, "loss_aux_layer_22": 0.16455078125, "loss_aux_layer_23": 0.20263671875, "loss_aux_layer_3": 0.062744140625, "loss_aux_layer_4": 0.0654296875, "loss_aux_layer_5": 0.0670166015625, "loss_aux_layer_6": 0.0699462890625, "loss_aux_layer_7": 0.06787109375, "loss_aux_layer_8": 0.0672607421875, "loss_aux_layer_9": 0.066162109375, "step": 2823, "total_loss": 0.7109102457761765 }, { "epoch": 0.5590972084735696, "grad_norm": 1.0446995496749878, "learning_rate": 5e-05, "llm_loss": 0.6461468040943146, "loss": 2.9617, "loss_aux_layer_0": 0.01837158203125, "loss_aux_layer_1": 0.03900146484375, "loss_aux_layer_10": 0.0694580078125, "loss_aux_layer_11": 0.0738525390625, "loss_aux_layer_12": 0.078857421875, "loss_aux_layer_13": 0.0850830078125, "loss_aux_layer_14": 0.094482421875, "loss_aux_layer_15": 0.1029052734375, "loss_aux_layer_16": 0.11328125, "loss_aux_layer_17": 0.120849609375, "loss_aux_layer_18": 0.1297607421875, "loss_aux_layer_19": 0.133056640625, "loss_aux_layer_2": 0.05340576171875, "loss_aux_layer_20": 0.140869140625, "loss_aux_layer_21": 0.149658203125, "loss_aux_layer_22": 0.170654296875, "loss_aux_layer_23": 0.209716796875, "loss_aux_layer_3": 0.06427001953125, "loss_aux_layer_4": 0.0670166015625, "loss_aux_layer_5": 0.06903076171875, "loss_aux_layer_6": 0.072021484375, "loss_aux_layer_7": 0.06982421875, "loss_aux_layer_8": 0.069091796875, "loss_aux_layer_9": 0.068115234375, "step": 2824, "total_loss": 0.7404315024614334 }, { "epoch": 0.559295189071471, "grad_norm": 1.095118522644043, "learning_rate": 5e-05, "llm_loss": 0.6353760957717896, "loss": 2.9011, "loss_aux_layer_0": 0.018096923828125, "loss_aux_layer_1": 0.03759765625, "loss_aux_layer_10": 0.06622314453125, "loss_aux_layer_11": 0.0706787109375, "loss_aux_layer_12": 0.0758056640625, "loss_aux_layer_13": 0.0814208984375, "loss_aux_layer_14": 0.09033203125, "loss_aux_layer_15": 0.09912109375, "loss_aux_layer_16": 0.10888671875, "loss_aux_layer_17": 0.1162109375, "loss_aux_layer_18": 0.1241455078125, "loss_aux_layer_19": 0.126953125, "loss_aux_layer_2": 0.05120849609375, "loss_aux_layer_20": 0.134521484375, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.061767578125, "loss_aux_layer_4": 0.06463623046875, "loss_aux_layer_5": 0.0660400390625, "loss_aux_layer_6": 0.0693359375, "loss_aux_layer_7": 0.067138671875, "loss_aux_layer_8": 0.066162109375, "loss_aux_layer_9": 0.06488037109375, "step": 2825, "total_loss": 0.7252703905105591 }, { "epoch": 0.5594931696693723, "grad_norm": 0.9666634798049927, "learning_rate": 5e-05, "llm_loss": 0.5031559765338898, "loss": 2.3759, "loss_aux_layer_0": 0.01861572265625, "loss_aux_layer_1": 0.03765869140625, "loss_aux_layer_10": 0.0660400390625, "loss_aux_layer_11": 0.0703125, "loss_aux_layer_12": 0.074951171875, "loss_aux_layer_13": 0.0809326171875, "loss_aux_layer_14": 0.090087890625, "loss_aux_layer_15": 0.099365234375, "loss_aux_layer_16": 0.109130859375, "loss_aux_layer_17": 0.1171875, "loss_aux_layer_18": 0.125244140625, "loss_aux_layer_19": 0.12841796875, "loss_aux_layer_2": 0.05120849609375, "loss_aux_layer_20": 0.136474609375, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.205810546875, "loss_aux_layer_3": 0.06146240234375, "loss_aux_layer_4": 0.064208984375, "loss_aux_layer_5": 0.065673828125, "loss_aux_layer_6": 0.068603515625, "loss_aux_layer_7": 0.0665283203125, "loss_aux_layer_8": 0.06561279296875, "loss_aux_layer_9": 0.06451416015625, "step": 2826, "total_loss": 0.5939780324697495 }, { "epoch": 0.5596911502672738, "grad_norm": 1.0799072980880737, "learning_rate": 5e-05, "llm_loss": 0.5836227387189865, "loss": 2.7036, "loss_aux_layer_0": 0.018890380859375, "loss_aux_layer_1": 0.03851318359375, "loss_aux_layer_10": 0.06787109375, "loss_aux_layer_11": 0.0726318359375, "loss_aux_layer_12": 0.0775146484375, "loss_aux_layer_13": 0.083740234375, "loss_aux_layer_14": 0.09326171875, "loss_aux_layer_15": 0.1026611328125, "loss_aux_layer_16": 0.11279296875, "loss_aux_layer_17": 0.120361328125, "loss_aux_layer_18": 0.1280517578125, "loss_aux_layer_19": 0.1300048828125, "loss_aux_layer_2": 0.05181884765625, "loss_aux_layer_20": 0.13720703125, "loss_aux_layer_21": 0.14404296875, "loss_aux_layer_22": 0.16455078125, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.06292724609375, "loss_aux_layer_4": 0.0657958984375, "loss_aux_layer_5": 0.0675048828125, "loss_aux_layer_6": 0.07080078125, "loss_aux_layer_7": 0.068359375, "loss_aux_layer_8": 0.067626953125, "loss_aux_layer_9": 0.0662841796875, "step": 2827, "total_loss": 0.6759107559919357 }, { "epoch": 0.5598891308651752, "grad_norm": 1.1677393913269043, "learning_rate": 5e-05, "llm_loss": 0.5934345126152039, "loss": 2.7215, "loss_aux_layer_0": 0.0184326171875, "loss_aux_layer_1": 0.035888671875, "loss_aux_layer_10": 0.0633544921875, "loss_aux_layer_11": 0.06689453125, "loss_aux_layer_12": 0.0712890625, "loss_aux_layer_13": 0.0767822265625, "loss_aux_layer_14": 0.0855712890625, "loss_aux_layer_15": 0.0941162109375, "loss_aux_layer_16": 0.103759765625, "loss_aux_layer_17": 0.1119384765625, "loss_aux_layer_18": 0.120361328125, "loss_aux_layer_19": 0.1236572265625, "loss_aux_layer_2": 0.04815673828125, "loss_aux_layer_20": 0.1318359375, "loss_aux_layer_21": 0.1396484375, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.05865478515625, "loss_aux_layer_4": 0.06146240234375, "loss_aux_layer_5": 0.06298828125, "loss_aux_layer_6": 0.06573486328125, "loss_aux_layer_7": 0.0635986328125, "loss_aux_layer_8": 0.06298828125, "loss_aux_layer_9": 0.06231689453125, "step": 2828, "total_loss": 0.6803759187459946 }, { "epoch": 0.5600871114630767, "grad_norm": 1.034267783164978, "learning_rate": 5e-05, "llm_loss": 0.5660527646541595, "loss": 2.6128, "loss_aux_layer_0": 0.017730712890625, "loss_aux_layer_1": 0.03631591796875, "loss_aux_layer_10": 0.06378173828125, "loss_aux_layer_11": 0.067626953125, "loss_aux_layer_12": 0.0726318359375, "loss_aux_layer_13": 0.0784912109375, "loss_aux_layer_14": 0.0872802734375, "loss_aux_layer_15": 0.095703125, "loss_aux_layer_16": 0.1046142578125, "loss_aux_layer_17": 0.1124267578125, "loss_aux_layer_18": 0.12109375, "loss_aux_layer_19": 0.1240234375, "loss_aux_layer_2": 0.04901123046875, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.19482421875, "loss_aux_layer_3": 0.05889892578125, "loss_aux_layer_4": 0.06158447265625, "loss_aux_layer_5": 0.06292724609375, "loss_aux_layer_6": 0.06597900390625, "loss_aux_layer_7": 0.0640869140625, "loss_aux_layer_8": 0.0633544921875, "loss_aux_layer_9": 0.0625, "step": 2829, "total_loss": 0.6531877517700195 }, { "epoch": 0.560285092060978, "grad_norm": 0.9039034247398376, "learning_rate": 5e-05, "llm_loss": 0.4823332577943802, "loss": 2.2753, "loss_aux_layer_0": 0.018951416015625, "loss_aux_layer_1": 0.036376953125, "loss_aux_layer_10": 0.06195068359375, "loss_aux_layer_11": 0.0662841796875, "loss_aux_layer_12": 0.07080078125, "loss_aux_layer_13": 0.076416015625, "loss_aux_layer_14": 0.0850830078125, "loss_aux_layer_15": 0.093994140625, "loss_aux_layer_16": 0.103759765625, "loss_aux_layer_17": 0.111572265625, "loss_aux_layer_18": 0.12060546875, "loss_aux_layer_19": 0.12451171875, "loss_aux_layer_2": 0.04803466796875, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.14013671875, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.05792236328125, "loss_aux_layer_4": 0.06024169921875, "loss_aux_layer_5": 0.06201171875, "loss_aux_layer_6": 0.064697265625, "loss_aux_layer_7": 0.06243896484375, "loss_aux_layer_8": 0.06195068359375, "loss_aux_layer_9": 0.060791015625, "step": 2830, "total_loss": 0.5688301175832748 }, { "epoch": 0.5604830726588794, "grad_norm": 1.1267908811569214, "learning_rate": 5e-05, "llm_loss": 0.5839103907346725, "loss": 2.6792, "loss_aux_layer_0": 0.018707275390625, "loss_aux_layer_1": 0.03582763671875, "loss_aux_layer_10": 0.06134033203125, "loss_aux_layer_11": 0.0653076171875, "loss_aux_layer_12": 0.0699462890625, "loss_aux_layer_13": 0.0758056640625, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.093994140625, "loss_aux_layer_16": 0.1038818359375, "loss_aux_layer_17": 0.1112060546875, "loss_aux_layer_18": 0.119873046875, "loss_aux_layer_19": 0.123046875, "loss_aux_layer_2": 0.04754638671875, "loss_aux_layer_20": 0.131103515625, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.05712890625, "loss_aux_layer_4": 0.0596923828125, "loss_aux_layer_5": 0.06121826171875, "loss_aux_layer_6": 0.06396484375, "loss_aux_layer_7": 0.06195068359375, "loss_aux_layer_8": 0.061279296875, "loss_aux_layer_9": 0.06005859375, "step": 2831, "total_loss": 0.6697887480258942 }, { "epoch": 0.5606810532567809, "grad_norm": 1.4888254404067993, "learning_rate": 5e-05, "llm_loss": 0.6740041226148605, "loss": 3.0571, "loss_aux_layer_0": 0.017852783203125, "loss_aux_layer_1": 0.0369873046875, "loss_aux_layer_10": 0.0648193359375, "loss_aux_layer_11": 0.0694580078125, "loss_aux_layer_12": 0.0748291015625, "loss_aux_layer_13": 0.0810546875, "loss_aux_layer_14": 0.0906982421875, "loss_aux_layer_15": 0.1002197265625, "loss_aux_layer_16": 0.10986328125, "loss_aux_layer_17": 0.11767578125, "loss_aux_layer_18": 0.126220703125, "loss_aux_layer_19": 0.129150390625, "loss_aux_layer_2": 0.0501708984375, "loss_aux_layer_20": 0.136474609375, "loss_aux_layer_21": 0.14404296875, "loss_aux_layer_22": 0.164794921875, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.060302734375, "loss_aux_layer_4": 0.06292724609375, "loss_aux_layer_5": 0.064697265625, "loss_aux_layer_6": 0.0673828125, "loss_aux_layer_7": 0.0653076171875, "loss_aux_layer_8": 0.064697265625, "loss_aux_layer_9": 0.06353759765625, "step": 2832, "total_loss": 0.7642771899700165 }, { "epoch": 0.5608790338546823, "grad_norm": 1.0214072465896606, "learning_rate": 5e-05, "llm_loss": 0.5997340530157089, "loss": 2.7616, "loss_aux_layer_0": 0.0186767578125, "loss_aux_layer_1": 0.03826904296875, "loss_aux_layer_10": 0.06597900390625, "loss_aux_layer_11": 0.070556640625, "loss_aux_layer_12": 0.0750732421875, "loss_aux_layer_13": 0.0814208984375, "loss_aux_layer_14": 0.09033203125, "loss_aux_layer_15": 0.099609375, "loss_aux_layer_16": 0.1092529296875, "loss_aux_layer_17": 0.117431640625, "loss_aux_layer_18": 0.125244140625, "loss_aux_layer_19": 0.1279296875, "loss_aux_layer_2": 0.0509033203125, "loss_aux_layer_20": 0.1357421875, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.202880859375, "loss_aux_layer_3": 0.06170654296875, "loss_aux_layer_4": 0.06463623046875, "loss_aux_layer_5": 0.0662841796875, "loss_aux_layer_6": 0.0684814453125, "loss_aux_layer_7": 0.06640625, "loss_aux_layer_8": 0.0657958984375, "loss_aux_layer_9": 0.06463623046875, "step": 2833, "total_loss": 0.6904019415378571 }, { "epoch": 0.5610770144525836, "grad_norm": 1.0433160066604614, "learning_rate": 5e-05, "llm_loss": 0.5055701583623886, "loss": 2.3912, "loss_aux_layer_0": 0.01861572265625, "loss_aux_layer_1": 0.03900146484375, "loss_aux_layer_10": 0.06597900390625, "loss_aux_layer_11": 0.0703125, "loss_aux_layer_12": 0.0755615234375, "loss_aux_layer_13": 0.081787109375, "loss_aux_layer_14": 0.09130859375, "loss_aux_layer_15": 0.10107421875, "loss_aux_layer_16": 0.1116943359375, "loss_aux_layer_17": 0.119873046875, "loss_aux_layer_18": 0.1285400390625, "loss_aux_layer_19": 0.1318359375, "loss_aux_layer_2": 0.0526123046875, "loss_aux_layer_20": 0.139404296875, "loss_aux_layer_21": 0.147216796875, "loss_aux_layer_22": 0.169189453125, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.06268310546875, "loss_aux_layer_4": 0.06494140625, "loss_aux_layer_5": 0.0665283203125, "loss_aux_layer_6": 0.0693359375, "loss_aux_layer_7": 0.0670166015625, "loss_aux_layer_8": 0.066162109375, "loss_aux_layer_9": 0.06494140625, "step": 2834, "total_loss": 0.5978012681007385 }, { "epoch": 0.5612749950504851, "grad_norm": 0.9389290809631348, "learning_rate": 5e-05, "llm_loss": 0.6044652909040451, "loss": 2.7728, "loss_aux_layer_0": 0.019378662109375, "loss_aux_layer_1": 0.03729248046875, "loss_aux_layer_10": 0.065185546875, "loss_aux_layer_11": 0.069091796875, "loss_aux_layer_12": 0.07373046875, "loss_aux_layer_13": 0.0794677734375, "loss_aux_layer_14": 0.0880126953125, "loss_aux_layer_15": 0.0963134765625, "loss_aux_layer_16": 0.105712890625, "loss_aux_layer_17": 0.113037109375, "loss_aux_layer_18": 0.1217041015625, "loss_aux_layer_19": 0.1246337890625, "loss_aux_layer_2": 0.050537109375, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.198974609375, "loss_aux_layer_3": 0.06060791015625, "loss_aux_layer_4": 0.0631103515625, "loss_aux_layer_5": 0.06475830078125, "loss_aux_layer_6": 0.067626953125, "loss_aux_layer_7": 0.0657958984375, "loss_aux_layer_8": 0.0653076171875, "loss_aux_layer_9": 0.06414794921875, "step": 2835, "total_loss": 0.6932096183300018 }, { "epoch": 0.5614729756483865, "grad_norm": 0.9298157691955566, "learning_rate": 5e-05, "llm_loss": 0.5406146347522736, "loss": 2.5187, "loss_aux_layer_0": 0.01806640625, "loss_aux_layer_1": 0.037353515625, "loss_aux_layer_10": 0.06475830078125, "loss_aux_layer_11": 0.06884765625, "loss_aux_layer_12": 0.073486328125, "loss_aux_layer_13": 0.0792236328125, "loss_aux_layer_14": 0.087890625, "loss_aux_layer_15": 0.096923828125, "loss_aux_layer_16": 0.1065673828125, "loss_aux_layer_17": 0.114501953125, "loss_aux_layer_18": 0.1231689453125, "loss_aux_layer_19": 0.12646484375, "loss_aux_layer_2": 0.050048828125, "loss_aux_layer_20": 0.13427734375, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.06024169921875, "loss_aux_layer_4": 0.06280517578125, "loss_aux_layer_5": 0.064697265625, "loss_aux_layer_6": 0.0677490234375, "loss_aux_layer_7": 0.0655517578125, "loss_aux_layer_8": 0.0648193359375, "loss_aux_layer_9": 0.06353759765625, "step": 2836, "total_loss": 0.6296690404415131 }, { "epoch": 0.5616709562462878, "grad_norm": 0.9606127738952637, "learning_rate": 5e-05, "llm_loss": 0.5936983823776245, "loss": 2.7191, "loss_aux_layer_0": 0.01861572265625, "loss_aux_layer_1": 0.03643798828125, "loss_aux_layer_10": 0.0618896484375, "loss_aux_layer_11": 0.0657958984375, "loss_aux_layer_12": 0.0704345703125, "loss_aux_layer_13": 0.0762939453125, "loss_aux_layer_14": 0.0849609375, "loss_aux_layer_15": 0.093505859375, "loss_aux_layer_16": 0.10302734375, "loss_aux_layer_17": 0.111083984375, "loss_aux_layer_18": 0.1195068359375, "loss_aux_layer_19": 0.123046875, "loss_aux_layer_2": 0.04833984375, "loss_aux_layer_20": 0.1309814453125, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.057861328125, "loss_aux_layer_4": 0.06036376953125, "loss_aux_layer_5": 0.06195068359375, "loss_aux_layer_6": 0.0648193359375, "loss_aux_layer_7": 0.0626220703125, "loss_aux_layer_8": 0.06195068359375, "loss_aux_layer_9": 0.06048583984375, "step": 2837, "total_loss": 0.6797757595777512 }, { "epoch": 0.5618689368441893, "grad_norm": 0.9503970146179199, "learning_rate": 5e-05, "llm_loss": 0.5902250558137894, "loss": 2.723, "loss_aux_layer_0": 0.018585205078125, "loss_aux_layer_1": 0.037841796875, "loss_aux_layer_10": 0.06597900390625, "loss_aux_layer_11": 0.0703125, "loss_aux_layer_12": 0.0750732421875, "loss_aux_layer_13": 0.0810546875, "loss_aux_layer_14": 0.09033203125, "loss_aux_layer_15": 0.098876953125, "loss_aux_layer_16": 0.10791015625, "loss_aux_layer_17": 0.11572265625, "loss_aux_layer_18": 0.123779296875, "loss_aux_layer_19": 0.1268310546875, "loss_aux_layer_2": 0.052001953125, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.165283203125, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.06231689453125, "loss_aux_layer_4": 0.06500244140625, "loss_aux_layer_5": 0.0665283203125, "loss_aux_layer_6": 0.069580078125, "loss_aux_layer_7": 0.06671142578125, "loss_aux_layer_8": 0.0662841796875, "loss_aux_layer_9": 0.06475830078125, "step": 2838, "total_loss": 0.6807381361722946 }, { "epoch": 0.5620669174420907, "grad_norm": 0.8675849437713623, "learning_rate": 5e-05, "llm_loss": 0.5140592604875565, "loss": 2.4112, "loss_aux_layer_0": 0.01837158203125, "loss_aux_layer_1": 0.03692626953125, "loss_aux_layer_10": 0.06463623046875, "loss_aux_layer_11": 0.06884765625, "loss_aux_layer_12": 0.07373046875, "loss_aux_layer_13": 0.0797119140625, "loss_aux_layer_14": 0.0887451171875, "loss_aux_layer_15": 0.0972900390625, "loss_aux_layer_16": 0.106689453125, "loss_aux_layer_17": 0.114990234375, "loss_aux_layer_18": 0.123779296875, "loss_aux_layer_19": 0.1259765625, "loss_aux_layer_2": 0.0498046875, "loss_aux_layer_20": 0.133544921875, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.06005859375, "loss_aux_layer_4": 0.06298828125, "loss_aux_layer_5": 0.06451416015625, "loss_aux_layer_6": 0.0673828125, "loss_aux_layer_7": 0.06488037109375, "loss_aux_layer_8": 0.06451416015625, "loss_aux_layer_9": 0.06353759765625, "step": 2839, "total_loss": 0.6027929484844208 }, { "epoch": 0.5622648980399921, "grad_norm": 0.8227452635765076, "learning_rate": 5e-05, "llm_loss": 0.5373003706336021, "loss": 2.5116, "loss_aux_layer_0": 0.018798828125, "loss_aux_layer_1": 0.03997802734375, "loss_aux_layer_10": 0.06689453125, "loss_aux_layer_11": 0.0714111328125, "loss_aux_layer_12": 0.0762939453125, "loss_aux_layer_13": 0.0819091796875, "loss_aux_layer_14": 0.0902099609375, "loss_aux_layer_15": 0.098388671875, "loss_aux_layer_16": 0.1075439453125, "loss_aux_layer_17": 0.1151123046875, "loss_aux_layer_18": 0.123291015625, "loss_aux_layer_19": 0.1263427734375, "loss_aux_layer_2": 0.05255126953125, "loss_aux_layer_20": 0.133544921875, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.16455078125, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.0628662109375, "loss_aux_layer_4": 0.0654296875, "loss_aux_layer_5": 0.06683349609375, "loss_aux_layer_6": 0.0697021484375, "loss_aux_layer_7": 0.067138671875, "loss_aux_layer_8": 0.0665283203125, "loss_aux_layer_9": 0.06536865234375, "step": 2840, "total_loss": 0.6279075294733047 }, { "epoch": 0.5624628786378935, "grad_norm": 1.0419530868530273, "learning_rate": 5e-05, "llm_loss": 0.5619971305131912, "loss": 2.6142, "loss_aux_layer_0": 0.0196533203125, "loss_aux_layer_1": 0.0382080078125, "loss_aux_layer_10": 0.06640625, "loss_aux_layer_11": 0.0706787109375, "loss_aux_layer_12": 0.07562255859375, "loss_aux_layer_13": 0.0814208984375, "loss_aux_layer_14": 0.09033203125, "loss_aux_layer_15": 0.0989990234375, "loss_aux_layer_16": 0.109375, "loss_aux_layer_17": 0.117431640625, "loss_aux_layer_18": 0.1260986328125, "loss_aux_layer_19": 0.1295166015625, "loss_aux_layer_2": 0.05242919921875, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.208251953125, "loss_aux_layer_3": 0.0626220703125, "loss_aux_layer_4": 0.06500244140625, "loss_aux_layer_5": 0.06634521484375, "loss_aux_layer_6": 0.0692138671875, "loss_aux_layer_7": 0.06689453125, "loss_aux_layer_8": 0.06634521484375, "loss_aux_layer_9": 0.06524658203125, "step": 2841, "total_loss": 0.6535533368587494 }, { "epoch": 0.5626608592357949, "grad_norm": 0.9090951681137085, "learning_rate": 5e-05, "llm_loss": 0.5705046653747559, "loss": 2.6379, "loss_aux_layer_0": 0.018310546875, "loss_aux_layer_1": 0.03765869140625, "loss_aux_layer_10": 0.065185546875, "loss_aux_layer_11": 0.0697021484375, "loss_aux_layer_12": 0.0743408203125, "loss_aux_layer_13": 0.079833984375, "loss_aux_layer_14": 0.0885009765625, "loss_aux_layer_15": 0.09716796875, "loss_aux_layer_16": 0.106689453125, "loss_aux_layer_17": 0.1141357421875, "loss_aux_layer_18": 0.121826171875, "loss_aux_layer_19": 0.1251220703125, "loss_aux_layer_2": 0.05078125, "loss_aux_layer_20": 0.132568359375, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.06097412109375, "loss_aux_layer_4": 0.0635986328125, "loss_aux_layer_5": 0.06512451171875, "loss_aux_layer_6": 0.068359375, "loss_aux_layer_7": 0.0660400390625, "loss_aux_layer_8": 0.0653076171875, "loss_aux_layer_9": 0.06396484375, "step": 2842, "total_loss": 0.6594791412353516 }, { "epoch": 0.5628588398336963, "grad_norm": 0.9719980955123901, "learning_rate": 5e-05, "llm_loss": 0.5345443934202194, "loss": 2.5004, "loss_aux_layer_0": 0.018341064453125, "loss_aux_layer_1": 0.0377197265625, "loss_aux_layer_10": 0.06549072265625, "loss_aux_layer_11": 0.0699462890625, "loss_aux_layer_12": 0.07470703125, "loss_aux_layer_13": 0.080322265625, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.097900390625, "loss_aux_layer_16": 0.1075439453125, "loss_aux_layer_17": 0.1156005859375, "loss_aux_layer_18": 0.12451171875, "loss_aux_layer_19": 0.12890625, "loss_aux_layer_2": 0.050537109375, "loss_aux_layer_20": 0.13720703125, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.20654296875, "loss_aux_layer_3": 0.06085205078125, "loss_aux_layer_4": 0.0635986328125, "loss_aux_layer_5": 0.06524658203125, "loss_aux_layer_6": 0.068115234375, "loss_aux_layer_7": 0.066162109375, "loss_aux_layer_8": 0.06549072265625, "loss_aux_layer_9": 0.06427001953125, "step": 2843, "total_loss": 0.6250882893800735 }, { "epoch": 0.5630568204315977, "grad_norm": 1.004329800605774, "learning_rate": 5e-05, "llm_loss": 0.6406424641609192, "loss": 2.9329, "loss_aux_layer_0": 0.018157958984375, "loss_aux_layer_1": 0.04034423828125, "loss_aux_layer_10": 0.06982421875, "loss_aux_layer_11": 0.0740966796875, "loss_aux_layer_12": 0.0789794921875, "loss_aux_layer_13": 0.0850830078125, "loss_aux_layer_14": 0.09375, "loss_aux_layer_15": 0.1024169921875, "loss_aux_layer_16": 0.111083984375, "loss_aux_layer_17": 0.118896484375, "loss_aux_layer_18": 0.12646484375, "loss_aux_layer_19": 0.128173828125, "loss_aux_layer_2": 0.054443359375, "loss_aux_layer_20": 0.134765625, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.160888671875, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.065673828125, "loss_aux_layer_4": 0.068603515625, "loss_aux_layer_5": 0.0701904296875, "loss_aux_layer_6": 0.0733642578125, "loss_aux_layer_7": 0.071044921875, "loss_aux_layer_8": 0.070068359375, "loss_aux_layer_9": 0.0684814453125, "step": 2844, "total_loss": 0.7332215011119843 }, { "epoch": 0.5632548010294991, "grad_norm": 1.0564881563186646, "learning_rate": 5e-05, "llm_loss": 0.49387097358703613, "loss": 2.3428, "loss_aux_layer_0": 0.019012451171875, "loss_aux_layer_1": 0.03790283203125, "loss_aux_layer_10": 0.0670166015625, "loss_aux_layer_11": 0.0714111328125, "loss_aux_layer_12": 0.0767822265625, "loss_aux_layer_13": 0.0830078125, "loss_aux_layer_14": 0.092041015625, "loss_aux_layer_15": 0.1009521484375, "loss_aux_layer_16": 0.110595703125, "loss_aux_layer_17": 0.1182861328125, "loss_aux_layer_18": 0.1268310546875, "loss_aux_layer_19": 0.1298828125, "loss_aux_layer_2": 0.0521240234375, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.146484375, "loss_aux_layer_22": 0.167724609375, "loss_aux_layer_23": 0.205810546875, "loss_aux_layer_3": 0.06219482421875, "loss_aux_layer_4": 0.064697265625, "loss_aux_layer_5": 0.0662841796875, "loss_aux_layer_6": 0.0693359375, "loss_aux_layer_7": 0.06689453125, "loss_aux_layer_8": 0.0662841796875, "loss_aux_layer_9": 0.065185546875, "step": 2845, "total_loss": 0.5857046842575073 }, { "epoch": 0.5634527816274005, "grad_norm": 1.1347527503967285, "learning_rate": 5e-05, "llm_loss": 0.594969242811203, "loss": 2.7257, "loss_aux_layer_0": 0.0191650390625, "loss_aux_layer_1": 0.0357666015625, "loss_aux_layer_10": 0.06121826171875, "loss_aux_layer_11": 0.06494140625, "loss_aux_layer_12": 0.06982421875, "loss_aux_layer_13": 0.0755615234375, "loss_aux_layer_14": 0.0849609375, "loss_aux_layer_15": 0.094482421875, "loss_aux_layer_16": 0.1043701171875, "loss_aux_layer_17": 0.112548828125, "loss_aux_layer_18": 0.1212158203125, "loss_aux_layer_19": 0.1248779296875, "loss_aux_layer_2": 0.0487060546875, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.1396484375, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.057861328125, "loss_aux_layer_4": 0.06011962890625, "loss_aux_layer_5": 0.061279296875, "loss_aux_layer_6": 0.0640869140625, "loss_aux_layer_7": 0.061767578125, "loss_aux_layer_8": 0.06121826171875, "loss_aux_layer_9": 0.0599365234375, "step": 2846, "total_loss": 0.6814142167568207 }, { "epoch": 0.563650762225302, "grad_norm": 1.0616047382354736, "learning_rate": 5e-05, "llm_loss": 0.6614383608102798, "loss": 2.9983, "loss_aux_layer_0": 0.01812744140625, "loss_aux_layer_1": 0.03704833984375, "loss_aux_layer_10": 0.0638427734375, "loss_aux_layer_11": 0.0679931640625, "loss_aux_layer_12": 0.07275390625, "loss_aux_layer_13": 0.07861328125, "loss_aux_layer_14": 0.087158203125, "loss_aux_layer_15": 0.09619140625, "loss_aux_layer_16": 0.105712890625, "loss_aux_layer_17": 0.1136474609375, "loss_aux_layer_18": 0.121826171875, "loss_aux_layer_19": 0.125, "loss_aux_layer_2": 0.04986572265625, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.1982421875, "loss_aux_layer_3": 0.06005859375, "loss_aux_layer_4": 0.06256103515625, "loss_aux_layer_5": 0.06396484375, "loss_aux_layer_6": 0.066650390625, "loss_aux_layer_7": 0.06451416015625, "loss_aux_layer_8": 0.0638427734375, "loss_aux_layer_9": 0.0626220703125, "step": 2847, "total_loss": 0.7495755106210709 }, { "epoch": 0.5638487428232033, "grad_norm": 1.0379022359848022, "learning_rate": 5e-05, "llm_loss": 0.5289783179759979, "loss": 2.4931, "loss_aux_layer_0": 0.019805908203125, "loss_aux_layer_1": 0.0401611328125, "loss_aux_layer_10": 0.06982421875, "loss_aux_layer_11": 0.0740966796875, "loss_aux_layer_12": 0.078857421875, "loss_aux_layer_13": 0.084716796875, "loss_aux_layer_14": 0.0938720703125, "loss_aux_layer_15": 0.1026611328125, "loss_aux_layer_16": 0.112548828125, "loss_aux_layer_17": 0.1201171875, "loss_aux_layer_18": 0.129150390625, "loss_aux_layer_19": 0.13232421875, "loss_aux_layer_2": 0.05474853515625, "loss_aux_layer_20": 0.1396484375, "loss_aux_layer_21": 0.1484375, "loss_aux_layer_22": 0.169677734375, "loss_aux_layer_23": 0.20751953125, "loss_aux_layer_3": 0.0657958984375, "loss_aux_layer_4": 0.0682373046875, "loss_aux_layer_5": 0.0694580078125, "loss_aux_layer_6": 0.0726318359375, "loss_aux_layer_7": 0.070556640625, "loss_aux_layer_8": 0.0697021484375, "loss_aux_layer_9": 0.0684814453125, "step": 2848, "total_loss": 0.6232693940401077 }, { "epoch": 0.5640467234211047, "grad_norm": 1.5348188877105713, "learning_rate": 5e-05, "llm_loss": 0.5728416740894318, "loss": 2.6588, "loss_aux_layer_0": 0.01837158203125, "loss_aux_layer_1": 0.0384521484375, "loss_aux_layer_10": 0.0679931640625, "loss_aux_layer_11": 0.0718994140625, "loss_aux_layer_12": 0.0767822265625, "loss_aux_layer_13": 0.0826416015625, "loss_aux_layer_14": 0.09130859375, "loss_aux_layer_15": 0.1005859375, "loss_aux_layer_16": 0.1103515625, "loss_aux_layer_17": 0.1180419921875, "loss_aux_layer_18": 0.126220703125, "loss_aux_layer_19": 0.1295166015625, "loss_aux_layer_2": 0.05316162109375, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.14453125, "loss_aux_layer_22": 0.165283203125, "loss_aux_layer_23": 0.20166015625, "loss_aux_layer_3": 0.063232421875, "loss_aux_layer_4": 0.06597900390625, "loss_aux_layer_5": 0.06781005859375, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.06866455078125, "loss_aux_layer_8": 0.0679931640625, "loss_aux_layer_9": 0.0667724609375, "step": 2849, "total_loss": 0.6647108793258667 }, { "epoch": 0.5642447040190062, "grad_norm": 1.0918389558792114, "learning_rate": 5e-05, "llm_loss": 0.519496813416481, "loss": 2.4544, "loss_aux_layer_0": 0.017242431640625, "loss_aux_layer_1": 0.041015625, "loss_aux_layer_10": 0.071533203125, "loss_aux_layer_11": 0.0762939453125, "loss_aux_layer_12": 0.081298828125, "loss_aux_layer_13": 0.0870361328125, "loss_aux_layer_14": 0.09521484375, "loss_aux_layer_15": 0.1033935546875, "loss_aux_layer_16": 0.1119384765625, "loss_aux_layer_17": 0.1192626953125, "loss_aux_layer_18": 0.127197265625, "loss_aux_layer_19": 0.128662109375, "loss_aux_layer_2": 0.05560302734375, "loss_aux_layer_20": 0.1357421875, "loss_aux_layer_21": 0.14306640625, "loss_aux_layer_22": 0.165283203125, "loss_aux_layer_23": 0.20166015625, "loss_aux_layer_3": 0.06689453125, "loss_aux_layer_4": 0.06982421875, "loss_aux_layer_5": 0.0716552734375, "loss_aux_layer_6": 0.07470703125, "loss_aux_layer_7": 0.0723876953125, "loss_aux_layer_8": 0.071533203125, "loss_aux_layer_9": 0.070068359375, "step": 2850, "total_loss": 0.6135906428098679 }, { "epoch": 0.5644426846169075, "grad_norm": 1.1473289728164673, "learning_rate": 5e-05, "llm_loss": 0.5098302140831947, "loss": 2.3977, "loss_aux_layer_0": 0.01800537109375, "loss_aux_layer_1": 0.036865234375, "loss_aux_layer_10": 0.06494140625, "loss_aux_layer_11": 0.069091796875, "loss_aux_layer_12": 0.0738525390625, "loss_aux_layer_13": 0.0799560546875, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.0980224609375, "loss_aux_layer_16": 0.107666015625, "loss_aux_layer_17": 0.1156005859375, "loss_aux_layer_18": 0.1240234375, "loss_aux_layer_19": 0.1270751953125, "loss_aux_layer_2": 0.0504150390625, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.143310546875, "loss_aux_layer_22": 0.164794921875, "loss_aux_layer_23": 0.20361328125, "loss_aux_layer_3": 0.05999755859375, "loss_aux_layer_4": 0.06292724609375, "loss_aux_layer_5": 0.0648193359375, "loss_aux_layer_6": 0.06768798828125, "loss_aux_layer_7": 0.0653076171875, "loss_aux_layer_8": 0.06463623046875, "loss_aux_layer_9": 0.0634765625, "step": 2851, "total_loss": 0.5994364321231842 }, { "epoch": 0.5646406652148089, "grad_norm": 1.0286290645599365, "learning_rate": 5e-05, "llm_loss": 0.6099917888641357, "loss": 2.7936, "loss_aux_layer_0": 0.018310546875, "loss_aux_layer_1": 0.037109375, "loss_aux_layer_10": 0.06439208984375, "loss_aux_layer_11": 0.06884765625, "loss_aux_layer_12": 0.073486328125, "loss_aux_layer_13": 0.0792236328125, "loss_aux_layer_14": 0.0877685546875, "loss_aux_layer_15": 0.0965576171875, "loss_aux_layer_16": 0.106201171875, "loss_aux_layer_17": 0.11376953125, "loss_aux_layer_18": 0.1219482421875, "loss_aux_layer_19": 0.1253662109375, "loss_aux_layer_2": 0.04974365234375, "loss_aux_layer_20": 0.13330078125, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.198974609375, "loss_aux_layer_3": 0.0599365234375, "loss_aux_layer_4": 0.0623779296875, "loss_aux_layer_5": 0.0638427734375, "loss_aux_layer_6": 0.0670166015625, "loss_aux_layer_7": 0.0645751953125, "loss_aux_layer_8": 0.06396484375, "loss_aux_layer_9": 0.06292724609375, "step": 2852, "total_loss": 0.6984047293663025 }, { "epoch": 0.5648386458127104, "grad_norm": 1.1341608762741089, "learning_rate": 5e-05, "llm_loss": 0.49748557806015015, "loss": 2.3466, "loss_aux_layer_0": 0.018341064453125, "loss_aux_layer_1": 0.03607177734375, "loss_aux_layer_10": 0.0634765625, "loss_aux_layer_11": 0.0679931640625, "loss_aux_layer_12": 0.0726318359375, "loss_aux_layer_13": 0.07861328125, "loss_aux_layer_14": 0.0882568359375, "loss_aux_layer_15": 0.097900390625, "loss_aux_layer_16": 0.1082763671875, "loss_aux_layer_17": 0.1162109375, "loss_aux_layer_18": 0.1246337890625, "loss_aux_layer_19": 0.1287841796875, "loss_aux_layer_2": 0.048583984375, "loss_aux_layer_20": 0.136474609375, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.166748046875, "loss_aux_layer_23": 0.206298828125, "loss_aux_layer_3": 0.05828857421875, "loss_aux_layer_4": 0.06072998046875, "loss_aux_layer_5": 0.06243896484375, "loss_aux_layer_6": 0.06536865234375, "loss_aux_layer_7": 0.063232421875, "loss_aux_layer_8": 0.06317138671875, "loss_aux_layer_9": 0.06219482421875, "step": 2853, "total_loss": 0.5866398960351944 }, { "epoch": 0.5650366264106118, "grad_norm": 0.9951832890510559, "learning_rate": 5e-05, "llm_loss": 0.6851468831300735, "loss": 3.092, "loss_aux_layer_0": 0.01800537109375, "loss_aux_layer_1": 0.03619384765625, "loss_aux_layer_10": 0.0631103515625, "loss_aux_layer_11": 0.0672607421875, "loss_aux_layer_12": 0.072265625, "loss_aux_layer_13": 0.077880859375, "loss_aux_layer_14": 0.087158203125, "loss_aux_layer_15": 0.0963134765625, "loss_aux_layer_16": 0.106201171875, "loss_aux_layer_17": 0.1141357421875, "loss_aux_layer_18": 0.122314453125, "loss_aux_layer_19": 0.12548828125, "loss_aux_layer_2": 0.0489501953125, "loss_aux_layer_20": 0.133544921875, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.20068359375, "loss_aux_layer_3": 0.05865478515625, "loss_aux_layer_4": 0.061279296875, "loss_aux_layer_5": 0.06292724609375, "loss_aux_layer_6": 0.06549072265625, "loss_aux_layer_7": 0.0628662109375, "loss_aux_layer_8": 0.06256103515625, "loss_aux_layer_9": 0.06146240234375, "step": 2854, "total_loss": 0.7730109840631485 }, { "epoch": 0.5652346070085131, "grad_norm": 0.9489693641662598, "learning_rate": 5e-05, "llm_loss": 0.5837100744247437, "loss": 2.7026, "loss_aux_layer_0": 0.0185546875, "loss_aux_layer_1": 0.0380859375, "loss_aux_layer_10": 0.0667724609375, "loss_aux_layer_11": 0.071044921875, "loss_aux_layer_12": 0.0760498046875, "loss_aux_layer_13": 0.0819091796875, "loss_aux_layer_14": 0.0909423828125, "loss_aux_layer_15": 0.1002197265625, "loss_aux_layer_16": 0.1103515625, "loss_aux_layer_17": 0.1185302734375, "loss_aux_layer_18": 0.1270751953125, "loss_aux_layer_19": 0.131103515625, "loss_aux_layer_2": 0.05133056640625, "loss_aux_layer_20": 0.139892578125, "loss_aux_layer_21": 0.147705078125, "loss_aux_layer_22": 0.16943359375, "loss_aux_layer_23": 0.207763671875, "loss_aux_layer_3": 0.0615234375, "loss_aux_layer_4": 0.06439208984375, "loss_aux_layer_5": 0.066162109375, "loss_aux_layer_6": 0.0692138671875, "loss_aux_layer_7": 0.06689453125, "loss_aux_layer_8": 0.06640625, "loss_aux_layer_9": 0.065185546875, "step": 2855, "total_loss": 0.6756589263677597 }, { "epoch": 0.5654325876064146, "grad_norm": 1.1115471124649048, "learning_rate": 5e-05, "llm_loss": 0.5296217650175095, "loss": 2.4775, "loss_aux_layer_0": 0.018402099609375, "loss_aux_layer_1": 0.0364990234375, "loss_aux_layer_10": 0.0655517578125, "loss_aux_layer_11": 0.0693359375, "loss_aux_layer_12": 0.073974609375, "loss_aux_layer_13": 0.0799560546875, "loss_aux_layer_14": 0.0887451171875, "loss_aux_layer_15": 0.0977783203125, "loss_aux_layer_16": 0.107666015625, "loss_aux_layer_17": 0.115478515625, "loss_aux_layer_18": 0.1239013671875, "loss_aux_layer_19": 0.1278076171875, "loss_aux_layer_2": 0.0499267578125, "loss_aux_layer_20": 0.13525390625, "loss_aux_layer_21": 0.14404296875, "loss_aux_layer_22": 0.165771484375, "loss_aux_layer_23": 0.204833984375, "loss_aux_layer_3": 0.0601806640625, "loss_aux_layer_4": 0.06280517578125, "loss_aux_layer_5": 0.0643310546875, "loss_aux_layer_6": 0.0672607421875, "loss_aux_layer_7": 0.0654296875, "loss_aux_layer_8": 0.0650634765625, "loss_aux_layer_9": 0.0643310546875, "step": 2856, "total_loss": 0.6193768233060837 }, { "epoch": 0.565630568204316, "grad_norm": 0.9030406475067139, "learning_rate": 5e-05, "llm_loss": 0.5779212117195129, "loss": 2.6838, "loss_aux_layer_0": 0.017608642578125, "loss_aux_layer_1": 0.03900146484375, "loss_aux_layer_10": 0.070068359375, "loss_aux_layer_11": 0.0743408203125, "loss_aux_layer_12": 0.079345703125, "loss_aux_layer_13": 0.0853271484375, "loss_aux_layer_14": 0.093994140625, "loss_aux_layer_15": 0.1025390625, "loss_aux_layer_16": 0.1112060546875, "loss_aux_layer_17": 0.1187744140625, "loss_aux_layer_18": 0.126953125, "loss_aux_layer_19": 0.1290283203125, "loss_aux_layer_2": 0.0535888671875, "loss_aux_layer_20": 0.135986328125, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.203125, "loss_aux_layer_3": 0.0645751953125, "loss_aux_layer_4": 0.0673828125, "loss_aux_layer_5": 0.0694580078125, "loss_aux_layer_6": 0.0726318359375, "loss_aux_layer_7": 0.0703125, "loss_aux_layer_8": 0.06982421875, "loss_aux_layer_9": 0.0687255859375, "step": 2857, "total_loss": 0.670960396528244 }, { "epoch": 0.5658285488022173, "grad_norm": 1.1022746562957764, "learning_rate": 5e-05, "llm_loss": 0.6060922592878342, "loss": 2.7983, "loss_aux_layer_0": 0.018707275390625, "loss_aux_layer_1": 0.03863525390625, "loss_aux_layer_10": 0.0697021484375, "loss_aux_layer_11": 0.0743408203125, "loss_aux_layer_12": 0.07958984375, "loss_aux_layer_13": 0.0853271484375, "loss_aux_layer_14": 0.0947265625, "loss_aux_layer_15": 0.103271484375, "loss_aux_layer_16": 0.11279296875, "loss_aux_layer_17": 0.1201171875, "loss_aux_layer_18": 0.12890625, "loss_aux_layer_19": 0.13134765625, "loss_aux_layer_2": 0.0526123046875, "loss_aux_layer_20": 0.13818359375, "loss_aux_layer_21": 0.145263671875, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.0635986328125, "loss_aux_layer_4": 0.0667724609375, "loss_aux_layer_5": 0.06884765625, "loss_aux_layer_6": 0.072265625, "loss_aux_layer_7": 0.0699462890625, "loss_aux_layer_8": 0.069580078125, "loss_aux_layer_9": 0.0682373046875, "step": 2858, "total_loss": 0.6995843052864075 }, { "epoch": 0.5660265294001188, "grad_norm": 0.9386233687400818, "learning_rate": 5e-05, "llm_loss": 0.559467226266861, "loss": 2.6021, "loss_aux_layer_0": 0.01922607421875, "loss_aux_layer_1": 0.0372314453125, "loss_aux_layer_10": 0.0657958984375, "loss_aux_layer_11": 0.07000732421875, "loss_aux_layer_12": 0.0751953125, "loss_aux_layer_13": 0.0811767578125, "loss_aux_layer_14": 0.0908203125, "loss_aux_layer_15": 0.1002197265625, "loss_aux_layer_16": 0.1102294921875, "loss_aux_layer_17": 0.1175537109375, "loss_aux_layer_18": 0.126220703125, "loss_aux_layer_19": 0.12939453125, "loss_aux_layer_2": 0.0504150390625, "loss_aux_layer_20": 0.13720703125, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.206298828125, "loss_aux_layer_3": 0.06085205078125, "loss_aux_layer_4": 0.0635986328125, "loss_aux_layer_5": 0.06524658203125, "loss_aux_layer_6": 0.068603515625, "loss_aux_layer_7": 0.06640625, "loss_aux_layer_8": 0.06561279296875, "loss_aux_layer_9": 0.064697265625, "step": 2859, "total_loss": 0.6505313515663147 }, { "epoch": 0.5662245099980202, "grad_norm": 0.8641242384910583, "learning_rate": 5e-05, "llm_loss": 0.586181640625, "loss": 2.7027, "loss_aux_layer_0": 0.018310546875, "loss_aux_layer_1": 0.03704833984375, "loss_aux_layer_10": 0.064697265625, "loss_aux_layer_11": 0.0692138671875, "loss_aux_layer_12": 0.073974609375, "loss_aux_layer_13": 0.0797119140625, "loss_aux_layer_14": 0.0888671875, "loss_aux_layer_15": 0.0980224609375, "loss_aux_layer_16": 0.107666015625, "loss_aux_layer_17": 0.11572265625, "loss_aux_layer_18": 0.1241455078125, "loss_aux_layer_19": 0.126953125, "loss_aux_layer_2": 0.04998779296875, "loss_aux_layer_20": 0.134521484375, "loss_aux_layer_21": 0.14306640625, "loss_aux_layer_22": 0.16455078125, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.06048583984375, "loss_aux_layer_4": 0.06292724609375, "loss_aux_layer_5": 0.064697265625, "loss_aux_layer_6": 0.0677490234375, "loss_aux_layer_7": 0.0653076171875, "loss_aux_layer_8": 0.064697265625, "loss_aux_layer_9": 0.0633544921875, "step": 2860, "total_loss": 0.6756722033023834 }, { "epoch": 0.5664224905959216, "grad_norm": 0.7697184085845947, "learning_rate": 5e-05, "llm_loss": 0.5458498299121857, "loss": 2.5401, "loss_aux_layer_0": 0.01861572265625, "loss_aux_layer_1": 0.03692626953125, "loss_aux_layer_10": 0.06591796875, "loss_aux_layer_11": 0.070068359375, "loss_aux_layer_12": 0.0748291015625, "loss_aux_layer_13": 0.080810546875, "loss_aux_layer_14": 0.0894775390625, "loss_aux_layer_15": 0.0982666015625, "loss_aux_layer_16": 0.1070556640625, "loss_aux_layer_17": 0.114501953125, "loss_aux_layer_18": 0.12255859375, "loss_aux_layer_19": 0.1260986328125, "loss_aux_layer_2": 0.04998779296875, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.1982421875, "loss_aux_layer_3": 0.0601806640625, "loss_aux_layer_4": 0.06298828125, "loss_aux_layer_5": 0.06488037109375, "loss_aux_layer_6": 0.06787109375, "loss_aux_layer_7": 0.0657958984375, "loss_aux_layer_8": 0.06561279296875, "loss_aux_layer_9": 0.0645751953125, "step": 2861, "total_loss": 0.6350263059139252 }, { "epoch": 0.566620471193823, "grad_norm": 1.1279597282409668, "learning_rate": 5e-05, "llm_loss": 0.5759005695581436, "loss": 2.6791, "loss_aux_layer_0": 0.018218994140625, "loss_aux_layer_1": 0.03948974609375, "loss_aux_layer_10": 0.0703125, "loss_aux_layer_11": 0.0750732421875, "loss_aux_layer_12": 0.080322265625, "loss_aux_layer_13": 0.0863037109375, "loss_aux_layer_14": 0.095703125, "loss_aux_layer_15": 0.1044921875, "loss_aux_layer_16": 0.11376953125, "loss_aux_layer_17": 0.120849609375, "loss_aux_layer_18": 0.128662109375, "loss_aux_layer_19": 0.130615234375, "loss_aux_layer_2": 0.0538330078125, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.164794921875, "loss_aux_layer_23": 0.20166015625, "loss_aux_layer_3": 0.0648193359375, "loss_aux_layer_4": 0.06787109375, "loss_aux_layer_5": 0.0697021484375, "loss_aux_layer_6": 0.072998046875, "loss_aux_layer_7": 0.07080078125, "loss_aux_layer_8": 0.0704345703125, "loss_aux_layer_9": 0.069091796875, "step": 2862, "total_loss": 0.6697658151388168 }, { "epoch": 0.5668184517917244, "grad_norm": 0.9710238575935364, "learning_rate": 5e-05, "llm_loss": 0.5841189473867416, "loss": 2.6913, "loss_aux_layer_0": 0.01898193359375, "loss_aux_layer_1": 0.0360107421875, "loss_aux_layer_10": 0.0635986328125, "loss_aux_layer_11": 0.0679931640625, "loss_aux_layer_12": 0.072998046875, "loss_aux_layer_13": 0.0789794921875, "loss_aux_layer_14": 0.088134765625, "loss_aux_layer_15": 0.0972900390625, "loss_aux_layer_16": 0.1070556640625, "loss_aux_layer_17": 0.114501953125, "loss_aux_layer_18": 0.123046875, "loss_aux_layer_19": 0.1273193359375, "loss_aux_layer_2": 0.0487060546875, "loss_aux_layer_20": 0.13525390625, "loss_aux_layer_21": 0.143310546875, "loss_aux_layer_22": 0.1640625, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.05889892578125, "loss_aux_layer_4": 0.06146240234375, "loss_aux_layer_5": 0.0631103515625, "loss_aux_layer_6": 0.06610107421875, "loss_aux_layer_7": 0.06396484375, "loss_aux_layer_8": 0.0634765625, "loss_aux_layer_9": 0.06243896484375, "step": 2863, "total_loss": 0.6728195548057556 }, { "epoch": 0.5670164323896258, "grad_norm": 1.0581530332565308, "learning_rate": 5e-05, "llm_loss": 0.6224811673164368, "loss": 2.8558, "loss_aux_layer_0": 0.019317626953125, "loss_aux_layer_1": 0.03887939453125, "loss_aux_layer_10": 0.06671142578125, "loss_aux_layer_11": 0.0712890625, "loss_aux_layer_12": 0.075927734375, "loss_aux_layer_13": 0.0819091796875, "loss_aux_layer_14": 0.0908203125, "loss_aux_layer_15": 0.0997314453125, "loss_aux_layer_16": 0.109619140625, "loss_aux_layer_17": 0.1175537109375, "loss_aux_layer_18": 0.125732421875, "loss_aux_layer_19": 0.129150390625, "loss_aux_layer_2": 0.05291748046875, "loss_aux_layer_20": 0.136474609375, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.20263671875, "loss_aux_layer_3": 0.063232421875, "loss_aux_layer_4": 0.06549072265625, "loss_aux_layer_5": 0.06732177734375, "loss_aux_layer_6": 0.0703125, "loss_aux_layer_7": 0.0677490234375, "loss_aux_layer_8": 0.06707763671875, "loss_aux_layer_9": 0.06561279296875, "step": 2864, "total_loss": 0.7139417678117752 }, { "epoch": 0.5672144129875272, "grad_norm": 0.8665580749511719, "learning_rate": 5e-05, "llm_loss": 0.6028969883918762, "loss": 2.7754, "loss_aux_layer_0": 0.01824951171875, "loss_aux_layer_1": 0.0389404296875, "loss_aux_layer_10": 0.06683349609375, "loss_aux_layer_11": 0.0712890625, "loss_aux_layer_12": 0.0762939453125, "loss_aux_layer_13": 0.08203125, "loss_aux_layer_14": 0.0909423828125, "loss_aux_layer_15": 0.0994873046875, "loss_aux_layer_16": 0.109130859375, "loss_aux_layer_17": 0.1165771484375, "loss_aux_layer_18": 0.12451171875, "loss_aux_layer_19": 0.127685546875, "loss_aux_layer_2": 0.052001953125, "loss_aux_layer_20": 0.135498046875, "loss_aux_layer_21": 0.14306640625, "loss_aux_layer_22": 0.164794921875, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.06243896484375, "loss_aux_layer_4": 0.06488037109375, "loss_aux_layer_5": 0.0667724609375, "loss_aux_layer_6": 0.070068359375, "loss_aux_layer_7": 0.06768798828125, "loss_aux_layer_8": 0.0670166015625, "loss_aux_layer_9": 0.06549072265625, "step": 2865, "total_loss": 0.6938527226448059 }, { "epoch": 0.5674123935854286, "grad_norm": 1.2327017784118652, "learning_rate": 5e-05, "llm_loss": 0.6316504925489426, "loss": 2.8863, "loss_aux_layer_0": 0.018768310546875, "loss_aux_layer_1": 0.03643798828125, "loss_aux_layer_10": 0.0655517578125, "loss_aux_layer_11": 0.06982421875, "loss_aux_layer_12": 0.074951171875, "loss_aux_layer_13": 0.080810546875, "loss_aux_layer_14": 0.08984375, "loss_aux_layer_15": 0.09912109375, "loss_aux_layer_16": 0.1087646484375, "loss_aux_layer_17": 0.1170654296875, "loss_aux_layer_18": 0.1253662109375, "loss_aux_layer_19": 0.128662109375, "loss_aux_layer_2": 0.04998779296875, "loss_aux_layer_20": 0.135986328125, "loss_aux_layer_21": 0.143310546875, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.06024169921875, "loss_aux_layer_4": 0.06292724609375, "loss_aux_layer_5": 0.06494140625, "loss_aux_layer_6": 0.06787109375, "loss_aux_layer_7": 0.0657958984375, "loss_aux_layer_8": 0.065185546875, "loss_aux_layer_9": 0.0640869140625, "step": 2866, "total_loss": 0.7215856313705444 }, { "epoch": 0.56761037418333, "grad_norm": 1.1110072135925293, "learning_rate": 5e-05, "llm_loss": 0.6410236358642578, "loss": 2.9141, "loss_aux_layer_0": 0.019287109375, "loss_aux_layer_1": 0.0357666015625, "loss_aux_layer_10": 0.06158447265625, "loss_aux_layer_11": 0.0657958984375, "loss_aux_layer_12": 0.0709228515625, "loss_aux_layer_13": 0.0767822265625, "loss_aux_layer_14": 0.0863037109375, "loss_aux_layer_15": 0.095458984375, "loss_aux_layer_16": 0.1055908203125, "loss_aux_layer_17": 0.1138916015625, "loss_aux_layer_18": 0.12255859375, "loss_aux_layer_19": 0.1260986328125, "loss_aux_layer_2": 0.04827880859375, "loss_aux_layer_20": 0.13427734375, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.16357421875, "loss_aux_layer_23": 0.20166015625, "loss_aux_layer_3": 0.0582275390625, "loss_aux_layer_4": 0.0604248046875, "loss_aux_layer_5": 0.06219482421875, "loss_aux_layer_6": 0.06488037109375, "loss_aux_layer_7": 0.06268310546875, "loss_aux_layer_8": 0.061767578125, "loss_aux_layer_9": 0.06024169921875, "step": 2867, "total_loss": 0.7285262644290924 }, { "epoch": 0.5678083547812315, "grad_norm": 1.3057198524475098, "learning_rate": 5e-05, "llm_loss": 0.6053005009889603, "loss": 2.7779, "loss_aux_layer_0": 0.018341064453125, "loss_aux_layer_1": 0.0360107421875, "loss_aux_layer_10": 0.06427001953125, "loss_aux_layer_11": 0.0684814453125, "loss_aux_layer_12": 0.0732421875, "loss_aux_layer_13": 0.078857421875, "loss_aux_layer_14": 0.088134765625, "loss_aux_layer_15": 0.09765625, "loss_aux_layer_16": 0.1075439453125, "loss_aux_layer_17": 0.1156005859375, "loss_aux_layer_18": 0.1243896484375, "loss_aux_layer_19": 0.128173828125, "loss_aux_layer_2": 0.04974365234375, "loss_aux_layer_20": 0.135986328125, "loss_aux_layer_21": 0.144287109375, "loss_aux_layer_22": 0.16455078125, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.05950927734375, "loss_aux_layer_4": 0.06170654296875, "loss_aux_layer_5": 0.06353759765625, "loss_aux_layer_6": 0.06622314453125, "loss_aux_layer_7": 0.0643310546875, "loss_aux_layer_8": 0.06414794921875, "loss_aux_layer_9": 0.0631103515625, "step": 2868, "total_loss": 0.6944638341665268 }, { "epoch": 0.5680063353791328, "grad_norm": 1.459205985069275, "learning_rate": 5e-05, "llm_loss": 0.5855107456445694, "loss": 2.6971, "loss_aux_layer_0": 0.018157958984375, "loss_aux_layer_1": 0.03643798828125, "loss_aux_layer_10": 0.06427001953125, "loss_aux_layer_11": 0.0687255859375, "loss_aux_layer_12": 0.07373046875, "loss_aux_layer_13": 0.0797119140625, "loss_aux_layer_14": 0.0887451171875, "loss_aux_layer_15": 0.0977783203125, "loss_aux_layer_16": 0.107666015625, "loss_aux_layer_17": 0.1162109375, "loss_aux_layer_18": 0.1240234375, "loss_aux_layer_19": 0.1265869140625, "loss_aux_layer_2": 0.04864501953125, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.162353515625, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.05877685546875, "loss_aux_layer_4": 0.06158447265625, "loss_aux_layer_5": 0.06341552734375, "loss_aux_layer_6": 0.0665283203125, "loss_aux_layer_7": 0.06427001953125, "loss_aux_layer_8": 0.06378173828125, "loss_aux_layer_9": 0.062744140625, "step": 2869, "total_loss": 0.6742772459983826 }, { "epoch": 0.5682043159770342, "grad_norm": 1.267823576927185, "learning_rate": 5e-05, "llm_loss": 0.5109849721193314, "loss": 2.4083, "loss_aux_layer_0": 0.01934814453125, "loss_aux_layer_1": 0.03619384765625, "loss_aux_layer_10": 0.066162109375, "loss_aux_layer_11": 0.0704345703125, "loss_aux_layer_12": 0.0750732421875, "loss_aux_layer_13": 0.0814208984375, "loss_aux_layer_14": 0.0909423828125, "loss_aux_layer_15": 0.1004638671875, "loss_aux_layer_16": 0.110107421875, "loss_aux_layer_17": 0.1182861328125, "loss_aux_layer_18": 0.12646484375, "loss_aux_layer_19": 0.1302490234375, "loss_aux_layer_2": 0.04986572265625, "loss_aux_layer_20": 0.138427734375, "loss_aux_layer_21": 0.146728515625, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.20654296875, "loss_aux_layer_3": 0.0599365234375, "loss_aux_layer_4": 0.062744140625, "loss_aux_layer_5": 0.0645751953125, "loss_aux_layer_6": 0.0677490234375, "loss_aux_layer_7": 0.0657958984375, "loss_aux_layer_8": 0.06536865234375, "loss_aux_layer_9": 0.064697265625, "step": 2870, "total_loss": 0.6020675450563431 }, { "epoch": 0.5684022965749357, "grad_norm": 1.7076326608657837, "learning_rate": 5e-05, "llm_loss": 0.5733486860990524, "loss": 2.6398, "loss_aux_layer_0": 0.017791748046875, "loss_aux_layer_1": 0.0340576171875, "loss_aux_layer_10": 0.0611572265625, "loss_aux_layer_11": 0.0654296875, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.076416015625, "loss_aux_layer_14": 0.085693359375, "loss_aux_layer_15": 0.0950927734375, "loss_aux_layer_16": 0.1053466796875, "loss_aux_layer_17": 0.1134033203125, "loss_aux_layer_18": 0.1221923828125, "loss_aux_layer_19": 0.1259765625, "loss_aux_layer_2": 0.04608154296875, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.142822265625, "loss_aux_layer_22": 0.163330078125, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.0557861328125, "loss_aux_layer_4": 0.0584716796875, "loss_aux_layer_5": 0.05999755859375, "loss_aux_layer_6": 0.06298828125, "loss_aux_layer_7": 0.061279296875, "loss_aux_layer_8": 0.06085205078125, "loss_aux_layer_9": 0.0599365234375, "step": 2871, "total_loss": 0.6599501073360443 }, { "epoch": 0.568600277172837, "grad_norm": 1.083848476409912, "learning_rate": 5e-05, "llm_loss": 0.6285617053508759, "loss": 2.8681, "loss_aux_layer_0": 0.017913818359375, "loss_aux_layer_1": 0.035888671875, "loss_aux_layer_10": 0.06243896484375, "loss_aux_layer_11": 0.066650390625, "loss_aux_layer_12": 0.0712890625, "loss_aux_layer_13": 0.0772705078125, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.096435546875, "loss_aux_layer_16": 0.106689453125, "loss_aux_layer_17": 0.114501953125, "loss_aux_layer_18": 0.122802734375, "loss_aux_layer_19": 0.1273193359375, "loss_aux_layer_2": 0.0487060546875, "loss_aux_layer_20": 0.135986328125, "loss_aux_layer_21": 0.144287109375, "loss_aux_layer_22": 0.166748046875, "loss_aux_layer_23": 0.205078125, "loss_aux_layer_3": 0.05853271484375, "loss_aux_layer_4": 0.06103515625, "loss_aux_layer_5": 0.06298828125, "loss_aux_layer_6": 0.06573486328125, "loss_aux_layer_7": 0.0633544921875, "loss_aux_layer_8": 0.0628662109375, "loss_aux_layer_9": 0.06158447265625, "step": 2872, "total_loss": 0.7170166671276093 }, { "epoch": 0.5687982577707384, "grad_norm": 1.4144973754882812, "learning_rate": 5e-05, "llm_loss": 0.4595436528325081, "loss": 2.1968, "loss_aux_layer_0": 0.017578125, "loss_aux_layer_1": 0.03662109375, "loss_aux_layer_10": 0.06634521484375, "loss_aux_layer_11": 0.070556640625, "loss_aux_layer_12": 0.0753173828125, "loss_aux_layer_13": 0.0814208984375, "loss_aux_layer_14": 0.0904541015625, "loss_aux_layer_15": 0.0992431640625, "loss_aux_layer_16": 0.1092529296875, "loss_aux_layer_17": 0.1168212890625, "loss_aux_layer_18": 0.1248779296875, "loss_aux_layer_19": 0.127685546875, "loss_aux_layer_2": 0.0501708984375, "loss_aux_layer_20": 0.134521484375, "loss_aux_layer_21": 0.141357421875, "loss_aux_layer_22": 0.16064453125, "loss_aux_layer_23": 0.197998046875, "loss_aux_layer_3": 0.06036376953125, "loss_aux_layer_4": 0.0628662109375, "loss_aux_layer_5": 0.06451416015625, "loss_aux_layer_6": 0.06756591796875, "loss_aux_layer_7": 0.0654296875, "loss_aux_layer_8": 0.06561279296875, "loss_aux_layer_9": 0.06475830078125, "step": 2873, "total_loss": 0.5491999983787537 }, { "epoch": 0.5689962383686399, "grad_norm": 1.136043906211853, "learning_rate": 5e-05, "llm_loss": 0.5291290357708931, "loss": 2.4822, "loss_aux_layer_0": 0.01800537109375, "loss_aux_layer_1": 0.03741455078125, "loss_aux_layer_10": 0.066650390625, "loss_aux_layer_11": 0.0711669921875, "loss_aux_layer_12": 0.076416015625, "loss_aux_layer_13": 0.0821533203125, "loss_aux_layer_14": 0.09130859375, "loss_aux_layer_15": 0.100830078125, "loss_aux_layer_16": 0.1109619140625, "loss_aux_layer_17": 0.1187744140625, "loss_aux_layer_18": 0.1273193359375, "loss_aux_layer_19": 0.13037109375, "loss_aux_layer_2": 0.05133056640625, "loss_aux_layer_20": 0.13818359375, "loss_aux_layer_21": 0.14599609375, "loss_aux_layer_22": 0.165771484375, "loss_aux_layer_23": 0.202880859375, "loss_aux_layer_3": 0.0616455078125, "loss_aux_layer_4": 0.06427001953125, "loss_aux_layer_5": 0.065673828125, "loss_aux_layer_6": 0.069091796875, "loss_aux_layer_7": 0.06695556640625, "loss_aux_layer_8": 0.0662841796875, "loss_aux_layer_9": 0.065185546875, "step": 2874, "total_loss": 0.6205522269010544 }, { "epoch": 0.5691942189665413, "grad_norm": 1.205739974975586, "learning_rate": 5e-05, "llm_loss": 0.5754127949476242, "loss": 2.6516, "loss_aux_layer_0": 0.018341064453125, "loss_aux_layer_1": 0.03497314453125, "loss_aux_layer_10": 0.06243896484375, "loss_aux_layer_11": 0.06658935546875, "loss_aux_layer_12": 0.0714111328125, "loss_aux_layer_13": 0.0771484375, "loss_aux_layer_14": 0.0863037109375, "loss_aux_layer_15": 0.095458984375, "loss_aux_layer_16": 0.1060791015625, "loss_aux_layer_17": 0.1142578125, "loss_aux_layer_18": 0.122802734375, "loss_aux_layer_19": 0.1265869140625, "loss_aux_layer_2": 0.04779052734375, "loss_aux_layer_20": 0.13427734375, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.0579833984375, "loss_aux_layer_4": 0.06048583984375, "loss_aux_layer_5": 0.06207275390625, "loss_aux_layer_6": 0.06488037109375, "loss_aux_layer_7": 0.06280517578125, "loss_aux_layer_8": 0.06243896484375, "loss_aux_layer_9": 0.0614013671875, "step": 2875, "total_loss": 0.6629000008106232 }, { "epoch": 0.5693921995644426, "grad_norm": 1.1607725620269775, "learning_rate": 5e-05, "llm_loss": 0.5719536989927292, "loss": 2.6527, "loss_aux_layer_0": 0.01837158203125, "loss_aux_layer_1": 0.03826904296875, "loss_aux_layer_10": 0.06689453125, "loss_aux_layer_11": 0.0709228515625, "loss_aux_layer_12": 0.0760498046875, "loss_aux_layer_13": 0.08203125, "loss_aux_layer_14": 0.0911865234375, "loss_aux_layer_15": 0.10009765625, "loss_aux_layer_16": 0.1097412109375, "loss_aux_layer_17": 0.117431640625, "loss_aux_layer_18": 0.1251220703125, "loss_aux_layer_19": 0.12890625, "loss_aux_layer_2": 0.0516357421875, "loss_aux_layer_20": 0.136962890625, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.06195068359375, "loss_aux_layer_4": 0.064453125, "loss_aux_layer_5": 0.0662841796875, "loss_aux_layer_6": 0.0692138671875, "loss_aux_layer_7": 0.0670166015625, "loss_aux_layer_8": 0.06634521484375, "loss_aux_layer_9": 0.0653076171875, "step": 2876, "total_loss": 0.6631675660610199 }, { "epoch": 0.5695901801623441, "grad_norm": 1.1846753358840942, "learning_rate": 5e-05, "llm_loss": 0.5575742125511169, "loss": 2.5754, "loss_aux_layer_0": 0.017852783203125, "loss_aux_layer_1": 0.0345458984375, "loss_aux_layer_10": 0.0618896484375, "loss_aux_layer_11": 0.06597900390625, "loss_aux_layer_12": 0.0709228515625, "loss_aux_layer_13": 0.0775146484375, "loss_aux_layer_14": 0.086669921875, "loss_aux_layer_15": 0.095458984375, "loss_aux_layer_16": 0.1053466796875, "loss_aux_layer_17": 0.113525390625, "loss_aux_layer_18": 0.1217041015625, "loss_aux_layer_19": 0.124267578125, "loss_aux_layer_2": 0.04669189453125, "loss_aux_layer_20": 0.1318359375, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.1943359375, "loss_aux_layer_3": 0.0567626953125, "loss_aux_layer_4": 0.059326171875, "loss_aux_layer_5": 0.06103515625, "loss_aux_layer_6": 0.06414794921875, "loss_aux_layer_7": 0.0621337890625, "loss_aux_layer_8": 0.06158447265625, "loss_aux_layer_9": 0.0604248046875, "step": 2877, "total_loss": 0.6438614130020142 }, { "epoch": 0.5697881607602455, "grad_norm": 1.0583198070526123, "learning_rate": 5e-05, "llm_loss": 0.5468601733446121, "loss": 2.5341, "loss_aux_layer_0": 0.01849365234375, "loss_aux_layer_1": 0.03472900390625, "loss_aux_layer_10": 0.06195068359375, "loss_aux_layer_11": 0.06591796875, "loss_aux_layer_12": 0.0709228515625, "loss_aux_layer_13": 0.0765380859375, "loss_aux_layer_14": 0.085693359375, "loss_aux_layer_15": 0.0947265625, "loss_aux_layer_16": 0.104248046875, "loss_aux_layer_17": 0.1123046875, "loss_aux_layer_18": 0.120361328125, "loss_aux_layer_19": 0.12451171875, "loss_aux_layer_2": 0.047119140625, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.162353515625, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.0570068359375, "loss_aux_layer_4": 0.05938720703125, "loss_aux_layer_5": 0.06109619140625, "loss_aux_layer_6": 0.06378173828125, "loss_aux_layer_7": 0.06201171875, "loss_aux_layer_8": 0.06134033203125, "loss_aux_layer_9": 0.060546875, "step": 2878, "total_loss": 0.6335211992263794 }, { "epoch": 0.5699861413581468, "grad_norm": 1.2085638046264648, "learning_rate": 5e-05, "llm_loss": 0.5502027422189713, "loss": 2.5625, "loss_aux_layer_0": 0.018951416015625, "loss_aux_layer_1": 0.03594970703125, "loss_aux_layer_10": 0.065185546875, "loss_aux_layer_11": 0.06903076171875, "loss_aux_layer_12": 0.0738525390625, "loss_aux_layer_13": 0.07958984375, "loss_aux_layer_14": 0.0894775390625, "loss_aux_layer_15": 0.0992431640625, "loss_aux_layer_16": 0.109619140625, "loss_aux_layer_17": 0.117919921875, "loss_aux_layer_18": 0.126220703125, "loss_aux_layer_19": 0.13037109375, "loss_aux_layer_2": 0.04931640625, "loss_aux_layer_20": 0.138427734375, "loss_aux_layer_21": 0.14697265625, "loss_aux_layer_22": 0.1689453125, "loss_aux_layer_23": 0.208251953125, "loss_aux_layer_3": 0.059326171875, "loss_aux_layer_4": 0.0615234375, "loss_aux_layer_5": 0.06317138671875, "loss_aux_layer_6": 0.066162109375, "loss_aux_layer_7": 0.064208984375, "loss_aux_layer_8": 0.064208984375, "loss_aux_layer_9": 0.06365966796875, "step": 2879, "total_loss": 0.6406175792217255 }, { "epoch": 0.5701841219560483, "grad_norm": 0.9110801815986633, "learning_rate": 5e-05, "llm_loss": 0.542820081114769, "loss": 2.5125, "loss_aux_layer_0": 0.017974853515625, "loss_aux_layer_1": 0.0347900390625, "loss_aux_layer_10": 0.06072998046875, "loss_aux_layer_11": 0.06463623046875, "loss_aux_layer_12": 0.069580078125, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.08447265625, "loss_aux_layer_15": 0.093505859375, "loss_aux_layer_16": 0.10302734375, "loss_aux_layer_17": 0.111083984375, "loss_aux_layer_18": 0.1197509765625, "loss_aux_layer_19": 0.123291015625, "loss_aux_layer_2": 0.04681396484375, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.194580078125, "loss_aux_layer_3": 0.056396484375, "loss_aux_layer_4": 0.05859375, "loss_aux_layer_5": 0.0604248046875, "loss_aux_layer_6": 0.063232421875, "loss_aux_layer_7": 0.0609130859375, "loss_aux_layer_8": 0.06060791015625, "loss_aux_layer_9": 0.0595703125, "step": 2880, "total_loss": 0.6281139850616455 }, { "epoch": 0.5703821025539497, "grad_norm": 1.0375382900238037, "learning_rate": 5e-05, "llm_loss": 0.64584019780159, "loss": 2.9322, "loss_aux_layer_0": 0.018524169921875, "loss_aux_layer_1": 0.0364990234375, "loss_aux_layer_10": 0.06390380859375, "loss_aux_layer_11": 0.06781005859375, "loss_aux_layer_12": 0.0726318359375, "loss_aux_layer_13": 0.078369140625, "loss_aux_layer_14": 0.087158203125, "loss_aux_layer_15": 0.0958251953125, "loss_aux_layer_16": 0.10546875, "loss_aux_layer_17": 0.1129150390625, "loss_aux_layer_18": 0.120849609375, "loss_aux_layer_19": 0.1240234375, "loss_aux_layer_2": 0.0491943359375, "loss_aux_layer_20": 0.131103515625, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.05926513671875, "loss_aux_layer_4": 0.06195068359375, "loss_aux_layer_5": 0.0635986328125, "loss_aux_layer_6": 0.0667724609375, "loss_aux_layer_7": 0.0645751953125, "loss_aux_layer_8": 0.0638427734375, "loss_aux_layer_9": 0.06280517578125, "step": 2881, "total_loss": 0.733060210943222 }, { "epoch": 0.5705800831518512, "grad_norm": 1.0520167350769043, "learning_rate": 5e-05, "llm_loss": 0.562623105943203, "loss": 2.5965, "loss_aux_layer_0": 0.018280029296875, "loss_aux_layer_1": 0.03533935546875, "loss_aux_layer_10": 0.0626220703125, "loss_aux_layer_11": 0.0665283203125, "loss_aux_layer_12": 0.0712890625, "loss_aux_layer_13": 0.0767822265625, "loss_aux_layer_14": 0.0855712890625, "loss_aux_layer_15": 0.09423828125, "loss_aux_layer_16": 0.104248046875, "loss_aux_layer_17": 0.1121826171875, "loss_aux_layer_18": 0.120361328125, "loss_aux_layer_19": 0.123779296875, "loss_aux_layer_2": 0.0479736328125, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.138916015625, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.196044921875, "loss_aux_layer_3": 0.0577392578125, "loss_aux_layer_4": 0.060302734375, "loss_aux_layer_5": 0.0621337890625, "loss_aux_layer_6": 0.06488037109375, "loss_aux_layer_7": 0.06292724609375, "loss_aux_layer_8": 0.0623779296875, "loss_aux_layer_9": 0.0611572265625, "step": 2882, "total_loss": 0.6491259634494781 }, { "epoch": 0.5707780637497525, "grad_norm": 0.8277285099029541, "learning_rate": 5e-05, "llm_loss": 0.5087189674377441, "loss": 2.4137, "loss_aux_layer_0": 0.018890380859375, "loss_aux_layer_1": 0.04058837890625, "loss_aux_layer_10": 0.0694580078125, "loss_aux_layer_11": 0.073974609375, "loss_aux_layer_12": 0.078857421875, "loss_aux_layer_13": 0.0849609375, "loss_aux_layer_14": 0.0941162109375, "loss_aux_layer_15": 0.1033935546875, "loss_aux_layer_16": 0.1134033203125, "loss_aux_layer_17": 0.1212158203125, "loss_aux_layer_18": 0.129638671875, "loss_aux_layer_19": 0.1326904296875, "loss_aux_layer_2": 0.05438232421875, "loss_aux_layer_20": 0.140625, "loss_aux_layer_21": 0.14990234375, "loss_aux_layer_22": 0.17236328125, "loss_aux_layer_23": 0.21142578125, "loss_aux_layer_3": 0.0650634765625, "loss_aux_layer_4": 0.06787109375, "loss_aux_layer_5": 0.06982421875, "loss_aux_layer_6": 0.07275390625, "loss_aux_layer_7": 0.0701904296875, "loss_aux_layer_8": 0.069580078125, "loss_aux_layer_9": 0.0679931640625, "step": 2883, "total_loss": 0.6034168899059296 }, { "epoch": 0.5709760443476539, "grad_norm": 0.9730314016342163, "learning_rate": 5e-05, "llm_loss": 0.5648497194051743, "loss": 2.6196, "loss_aux_layer_0": 0.018280029296875, "loss_aux_layer_1": 0.0377197265625, "loss_aux_layer_10": 0.0654296875, "loss_aux_layer_11": 0.0697021484375, "loss_aux_layer_12": 0.074462890625, "loss_aux_layer_13": 0.080078125, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.097900390625, "loss_aux_layer_16": 0.107421875, "loss_aux_layer_17": 0.114990234375, "loss_aux_layer_18": 0.1236572265625, "loss_aux_layer_19": 0.127685546875, "loss_aux_layer_2": 0.05157470703125, "loss_aux_layer_20": 0.13525390625, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.16455078125, "loss_aux_layer_23": 0.20361328125, "loss_aux_layer_3": 0.06170654296875, "loss_aux_layer_4": 0.0640869140625, "loss_aux_layer_5": 0.0655517578125, "loss_aux_layer_6": 0.068359375, "loss_aux_layer_7": 0.0660400390625, "loss_aux_layer_8": 0.0653076171875, "loss_aux_layer_9": 0.0643310546875, "step": 2884, "total_loss": 0.6548894494771957 }, { "epoch": 0.5711740249455554, "grad_norm": 0.8037052750587463, "learning_rate": 5e-05, "llm_loss": 0.6259276866912842, "loss": 2.8731, "loss_aux_layer_0": 0.017974853515625, "loss_aux_layer_1": 0.03839111328125, "loss_aux_layer_10": 0.06781005859375, "loss_aux_layer_11": 0.072021484375, "loss_aux_layer_12": 0.0772705078125, "loss_aux_layer_13": 0.0833740234375, "loss_aux_layer_14": 0.092529296875, "loss_aux_layer_15": 0.1014404296875, "loss_aux_layer_16": 0.1109619140625, "loss_aux_layer_17": 0.1190185546875, "loss_aux_layer_18": 0.127197265625, "loss_aux_layer_19": 0.13037109375, "loss_aux_layer_2": 0.05218505859375, "loss_aux_layer_20": 0.137939453125, "loss_aux_layer_21": 0.145751953125, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.20654296875, "loss_aux_layer_3": 0.062744140625, "loss_aux_layer_4": 0.06536865234375, "loss_aux_layer_5": 0.0673828125, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.06842041015625, "loss_aux_layer_8": 0.0675048828125, "loss_aux_layer_9": 0.0662841796875, "step": 2885, "total_loss": 0.718280479311943 }, { "epoch": 0.5713720055434568, "grad_norm": 0.9743044972419739, "learning_rate": 5e-05, "llm_loss": 0.5019599944353104, "loss": 2.3701, "loss_aux_layer_0": 0.018890380859375, "loss_aux_layer_1": 0.03631591796875, "loss_aux_layer_10": 0.06488037109375, "loss_aux_layer_11": 0.06884765625, "loss_aux_layer_12": 0.0736083984375, "loss_aux_layer_13": 0.0797119140625, "loss_aux_layer_14": 0.089599609375, "loss_aux_layer_15": 0.099365234375, "loss_aux_layer_16": 0.1099853515625, "loss_aux_layer_17": 0.1177978515625, "loss_aux_layer_18": 0.1268310546875, "loss_aux_layer_19": 0.131103515625, "loss_aux_layer_2": 0.04931640625, "loss_aux_layer_20": 0.138916015625, "loss_aux_layer_21": 0.14794921875, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.20703125, "loss_aux_layer_3": 0.059326171875, "loss_aux_layer_4": 0.06195068359375, "loss_aux_layer_5": 0.06390380859375, "loss_aux_layer_6": 0.0670166015625, "loss_aux_layer_7": 0.06494140625, "loss_aux_layer_8": 0.064697265625, "loss_aux_layer_9": 0.0634765625, "step": 2886, "total_loss": 0.5925197005271912 }, { "epoch": 0.5715699861413581, "grad_norm": 0.8990205526351929, "learning_rate": 5e-05, "llm_loss": 0.5911189168691635, "loss": 2.7085, "loss_aux_layer_0": 0.017608642578125, "loss_aux_layer_1": 0.03472900390625, "loss_aux_layer_10": 0.0616455078125, "loss_aux_layer_11": 0.0655517578125, "loss_aux_layer_12": 0.0704345703125, "loss_aux_layer_13": 0.0762939453125, "loss_aux_layer_14": 0.0849609375, "loss_aux_layer_15": 0.0941162109375, "loss_aux_layer_16": 0.1038818359375, "loss_aux_layer_17": 0.1119384765625, "loss_aux_layer_18": 0.120361328125, "loss_aux_layer_19": 0.1240234375, "loss_aux_layer_2": 0.04742431640625, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.19482421875, "loss_aux_layer_3": 0.056884765625, "loss_aux_layer_4": 0.0594482421875, "loss_aux_layer_5": 0.06134033203125, "loss_aux_layer_6": 0.0643310546875, "loss_aux_layer_7": 0.0621337890625, "loss_aux_layer_8": 0.06158447265625, "loss_aux_layer_9": 0.0604248046875, "step": 2887, "total_loss": 0.6771127730607986 }, { "epoch": 0.5717679667392596, "grad_norm": 0.9254148006439209, "learning_rate": 5e-05, "llm_loss": 0.6306794285774231, "loss": 2.8764, "loss_aux_layer_0": 0.017578125, "loss_aux_layer_1": 0.03729248046875, "loss_aux_layer_10": 0.06561279296875, "loss_aux_layer_11": 0.06982421875, "loss_aux_layer_12": 0.0745849609375, "loss_aux_layer_13": 0.0799560546875, "loss_aux_layer_14": 0.0885009765625, "loss_aux_layer_15": 0.0968017578125, "loss_aux_layer_16": 0.1060791015625, "loss_aux_layer_17": 0.11376953125, "loss_aux_layer_18": 0.1221923828125, "loss_aux_layer_19": 0.1243896484375, "loss_aux_layer_2": 0.05059814453125, "loss_aux_layer_20": 0.1317138671875, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.060791015625, "loss_aux_layer_4": 0.06353759765625, "loss_aux_layer_5": 0.0650634765625, "loss_aux_layer_6": 0.0682373046875, "loss_aux_layer_7": 0.0662841796875, "loss_aux_layer_8": 0.065673828125, "loss_aux_layer_9": 0.06427001953125, "step": 2888, "total_loss": 0.719098225235939 }, { "epoch": 0.571965947337161, "grad_norm": 0.7648727893829346, "learning_rate": 5e-05, "llm_loss": 0.5640872418880463, "loss": 2.6127, "loss_aux_layer_0": 0.017059326171875, "loss_aux_layer_1": 0.037353515625, "loss_aux_layer_10": 0.0653076171875, "loss_aux_layer_11": 0.0694580078125, "loss_aux_layer_12": 0.07421875, "loss_aux_layer_13": 0.0802001953125, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.097412109375, "loss_aux_layer_16": 0.1065673828125, "loss_aux_layer_17": 0.1143798828125, "loss_aux_layer_18": 0.12255859375, "loss_aux_layer_19": 0.1251220703125, "loss_aux_layer_2": 0.05059814453125, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.162109375, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.06097412109375, "loss_aux_layer_4": 0.06353759765625, "loss_aux_layer_5": 0.065185546875, "loss_aux_layer_6": 0.068115234375, "loss_aux_layer_7": 0.06610107421875, "loss_aux_layer_8": 0.0653076171875, "loss_aux_layer_9": 0.06402587890625, "step": 2889, "total_loss": 0.653176337480545 }, { "epoch": 0.5721639279350623, "grad_norm": 0.981876015663147, "learning_rate": 5e-05, "llm_loss": 0.5582735687494278, "loss": 2.5876, "loss_aux_layer_0": 0.017364501953125, "loss_aux_layer_1": 0.03704833984375, "loss_aux_layer_10": 0.0654296875, "loss_aux_layer_11": 0.069580078125, "loss_aux_layer_12": 0.074462890625, "loss_aux_layer_13": 0.080078125, "loss_aux_layer_14": 0.088134765625, "loss_aux_layer_15": 0.096435546875, "loss_aux_layer_16": 0.1058349609375, "loss_aux_layer_17": 0.1136474609375, "loss_aux_layer_18": 0.1217041015625, "loss_aux_layer_19": 0.1243896484375, "loss_aux_layer_2": 0.05108642578125, "loss_aux_layer_20": 0.131591796875, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.061279296875, "loss_aux_layer_4": 0.06365966796875, "loss_aux_layer_5": 0.0650634765625, "loss_aux_layer_6": 0.06805419921875, "loss_aux_layer_7": 0.06585693359375, "loss_aux_layer_8": 0.065185546875, "loss_aux_layer_9": 0.06427001953125, "step": 2890, "total_loss": 0.6469064801931381 }, { "epoch": 0.5723619085329638, "grad_norm": 0.7683637738227844, "learning_rate": 5e-05, "llm_loss": 0.6309520751237869, "loss": 2.8804, "loss_aux_layer_0": 0.01776123046875, "loss_aux_layer_1": 0.0384521484375, "loss_aux_layer_10": 0.06622314453125, "loss_aux_layer_11": 0.0706787109375, "loss_aux_layer_12": 0.0755615234375, "loss_aux_layer_13": 0.0811767578125, "loss_aux_layer_14": 0.0897216796875, "loss_aux_layer_15": 0.0980224609375, "loss_aux_layer_16": 0.1065673828125, "loss_aux_layer_17": 0.1143798828125, "loss_aux_layer_18": 0.1219482421875, "loss_aux_layer_19": 0.123779296875, "loss_aux_layer_2": 0.0518798828125, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.062255859375, "loss_aux_layer_4": 0.06475830078125, "loss_aux_layer_5": 0.06646728515625, "loss_aux_layer_6": 0.0694580078125, "loss_aux_layer_7": 0.067138671875, "loss_aux_layer_8": 0.0662841796875, "loss_aux_layer_9": 0.064697265625, "step": 2891, "total_loss": 0.720108225941658 }, { "epoch": 0.5725598891308652, "grad_norm": 1.0524780750274658, "learning_rate": 5e-05, "llm_loss": 0.588848702609539, "loss": 2.7099, "loss_aux_layer_0": 0.017791748046875, "loss_aux_layer_1": 0.0362548828125, "loss_aux_layer_10": 0.0635986328125, "loss_aux_layer_11": 0.0679931640625, "loss_aux_layer_12": 0.0731201171875, "loss_aux_layer_13": 0.0791015625, "loss_aux_layer_14": 0.08837890625, "loss_aux_layer_15": 0.0972900390625, "loss_aux_layer_16": 0.107177734375, "loss_aux_layer_17": 0.1148681640625, "loss_aux_layer_18": 0.123779296875, "loss_aux_layer_19": 0.126708984375, "loss_aux_layer_2": 0.04913330078125, "loss_aux_layer_20": 0.134521484375, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.163330078125, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.0592041015625, "loss_aux_layer_4": 0.06182861328125, "loss_aux_layer_5": 0.0634765625, "loss_aux_layer_6": 0.0662841796875, "loss_aux_layer_7": 0.0638427734375, "loss_aux_layer_8": 0.06317138671875, "loss_aux_layer_9": 0.06201171875, "step": 2892, "total_loss": 0.6774765104055405 }, { "epoch": 0.5727578697287666, "grad_norm": 0.9261307120323181, "learning_rate": 5e-05, "llm_loss": 0.6605893969535828, "loss": 2.9887, "loss_aux_layer_0": 0.017120361328125, "loss_aux_layer_1": 0.03521728515625, "loss_aux_layer_10": 0.062255859375, "loss_aux_layer_11": 0.06640625, "loss_aux_layer_12": 0.0714111328125, "loss_aux_layer_13": 0.0771484375, "loss_aux_layer_14": 0.0859375, "loss_aux_layer_15": 0.09423828125, "loss_aux_layer_16": 0.10400390625, "loss_aux_layer_17": 0.1121826171875, "loss_aux_layer_18": 0.1212158203125, "loss_aux_layer_19": 0.12451171875, "loss_aux_layer_2": 0.0477294921875, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.057861328125, "loss_aux_layer_4": 0.060546875, "loss_aux_layer_5": 0.0618896484375, "loss_aux_layer_6": 0.06488037109375, "loss_aux_layer_7": 0.06268310546875, "loss_aux_layer_8": 0.06195068359375, "loss_aux_layer_9": 0.0609130859375, "step": 2893, "total_loss": 0.7471672147512436 }, { "epoch": 0.572955850326668, "grad_norm": 0.983761191368103, "learning_rate": 5e-05, "llm_loss": 0.5956572443246841, "loss": 2.7282, "loss_aux_layer_0": 0.017547607421875, "loss_aux_layer_1": 0.03515625, "loss_aux_layer_10": 0.06170654296875, "loss_aux_layer_11": 0.0660400390625, "loss_aux_layer_12": 0.0709228515625, "loss_aux_layer_13": 0.0770263671875, "loss_aux_layer_14": 0.0858154296875, "loss_aux_layer_15": 0.0948486328125, "loss_aux_layer_16": 0.1048583984375, "loss_aux_layer_17": 0.112548828125, "loss_aux_layer_18": 0.12060546875, "loss_aux_layer_19": 0.1243896484375, "loss_aux_layer_2": 0.046875, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.14013671875, "loss_aux_layer_22": 0.160888671875, "loss_aux_layer_23": 0.199462890625, "loss_aux_layer_3": 0.0562744140625, "loss_aux_layer_4": 0.058837890625, "loss_aux_layer_5": 0.06060791015625, "loss_aux_layer_6": 0.06341552734375, "loss_aux_layer_7": 0.06146240234375, "loss_aux_layer_8": 0.06097412109375, "loss_aux_layer_9": 0.06024169921875, "step": 2894, "total_loss": 0.682051882147789 }, { "epoch": 0.5731538309245694, "grad_norm": 1.221243977546692, "learning_rate": 5e-05, "llm_loss": 0.5929348468780518, "loss": 2.7247, "loss_aux_layer_0": 0.017333984375, "loss_aux_layer_1": 0.03668212890625, "loss_aux_layer_10": 0.06494140625, "loss_aux_layer_11": 0.069091796875, "loss_aux_layer_12": 0.0736083984375, "loss_aux_layer_13": 0.0794677734375, "loss_aux_layer_14": 0.088134765625, "loss_aux_layer_15": 0.0965576171875, "loss_aux_layer_16": 0.1058349609375, "loss_aux_layer_17": 0.1138916015625, "loss_aux_layer_18": 0.1221923828125, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.04974365234375, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.14013671875, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.05987548828125, "loss_aux_layer_4": 0.06256103515625, "loss_aux_layer_5": 0.0643310546875, "loss_aux_layer_6": 0.06732177734375, "loss_aux_layer_7": 0.0650634765625, "loss_aux_layer_8": 0.064453125, "loss_aux_layer_9": 0.06329345703125, "step": 2895, "total_loss": 0.6811858713626862 }, { "epoch": 0.5733518115224708, "grad_norm": 1.1579484939575195, "learning_rate": 5e-05, "llm_loss": 0.5797596424818039, "loss": 2.6643, "loss_aux_layer_0": 0.016937255859375, "loss_aux_layer_1": 0.034423828125, "loss_aux_layer_10": 0.0616455078125, "loss_aux_layer_11": 0.0657958984375, "loss_aux_layer_12": 0.07080078125, "loss_aux_layer_13": 0.0767822265625, "loss_aux_layer_14": 0.085693359375, "loss_aux_layer_15": 0.0947265625, "loss_aux_layer_16": 0.1043701171875, "loss_aux_layer_17": 0.1121826171875, "loss_aux_layer_18": 0.12109375, "loss_aux_layer_19": 0.1246337890625, "loss_aux_layer_2": 0.04705810546875, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.05694580078125, "loss_aux_layer_4": 0.05914306640625, "loss_aux_layer_5": 0.06103515625, "loss_aux_layer_6": 0.06378173828125, "loss_aux_layer_7": 0.06182861328125, "loss_aux_layer_8": 0.06121826171875, "loss_aux_layer_9": 0.06005859375, "step": 2896, "total_loss": 0.6660791635513306 }, { "epoch": 0.5735497921203722, "grad_norm": 0.9089160561561584, "learning_rate": 5e-05, "llm_loss": 0.5572132095694542, "loss": 2.5852, "loss_aux_layer_0": 0.0184326171875, "loss_aux_layer_1": 0.036376953125, "loss_aux_layer_10": 0.06402587890625, "loss_aux_layer_11": 0.068115234375, "loss_aux_layer_12": 0.0728759765625, "loss_aux_layer_13": 0.078125, "loss_aux_layer_14": 0.0872802734375, "loss_aux_layer_15": 0.0965576171875, "loss_aux_layer_16": 0.1064453125, "loss_aux_layer_17": 0.1141357421875, "loss_aux_layer_18": 0.123291015625, "loss_aux_layer_19": 0.1275634765625, "loss_aux_layer_2": 0.04998779296875, "loss_aux_layer_20": 0.1356201171875, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.164794921875, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.06048583984375, "loss_aux_layer_4": 0.062744140625, "loss_aux_layer_5": 0.06439208984375, "loss_aux_layer_6": 0.06744384765625, "loss_aux_layer_7": 0.0648193359375, "loss_aux_layer_8": 0.06414794921875, "loss_aux_layer_9": 0.06268310546875, "step": 2897, "total_loss": 0.6463065147399902 }, { "epoch": 0.5737477727182736, "grad_norm": 1.1874548196792603, "learning_rate": 5e-05, "llm_loss": 0.6151105612516403, "loss": 2.8049, "loss_aux_layer_0": 0.017578125, "loss_aux_layer_1": 0.0350341796875, "loss_aux_layer_10": 0.06243896484375, "loss_aux_layer_11": 0.0662841796875, "loss_aux_layer_12": 0.07080078125, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.085205078125, "loss_aux_layer_15": 0.0938720703125, "loss_aux_layer_16": 0.103515625, "loss_aux_layer_17": 0.11181640625, "loss_aux_layer_18": 0.1207275390625, "loss_aux_layer_19": 0.1240234375, "loss_aux_layer_2": 0.04736328125, "loss_aux_layer_20": 0.1318359375, "loss_aux_layer_21": 0.138916015625, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.19482421875, "loss_aux_layer_3": 0.05731201171875, "loss_aux_layer_4": 0.0599365234375, "loss_aux_layer_5": 0.0616455078125, "loss_aux_layer_6": 0.0643310546875, "loss_aux_layer_7": 0.06231689453125, "loss_aux_layer_8": 0.0618896484375, "loss_aux_layer_9": 0.06103515625, "step": 2898, "total_loss": 0.7012363970279694 }, { "epoch": 0.573945753316175, "grad_norm": 0.7862069606781006, "learning_rate": 5e-05, "llm_loss": 0.5711123645305634, "loss": 2.6364, "loss_aux_layer_0": 0.01739501953125, "loss_aux_layer_1": 0.03594970703125, "loss_aux_layer_10": 0.06439208984375, "loss_aux_layer_11": 0.0684814453125, "loss_aux_layer_12": 0.0731201171875, "loss_aux_layer_13": 0.0787353515625, "loss_aux_layer_14": 0.0872802734375, "loss_aux_layer_15": 0.0955810546875, "loss_aux_layer_16": 0.1051025390625, "loss_aux_layer_17": 0.1129150390625, "loss_aux_layer_18": 0.1214599609375, "loss_aux_layer_19": 0.124755859375, "loss_aux_layer_2": 0.048828125, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.16064453125, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.05914306640625, "loss_aux_layer_4": 0.0618896484375, "loss_aux_layer_5": 0.06390380859375, "loss_aux_layer_6": 0.06689453125, "loss_aux_layer_7": 0.06475830078125, "loss_aux_layer_8": 0.06414794921875, "loss_aux_layer_9": 0.063232421875, "step": 2899, "total_loss": 0.6590884327888489 }, { "epoch": 0.5741437339140765, "grad_norm": 1.0269973278045654, "learning_rate": 5e-05, "llm_loss": 0.6028715670108795, "loss": 2.7693, "loss_aux_layer_0": 0.01824951171875, "loss_aux_layer_1": 0.0364990234375, "loss_aux_layer_10": 0.0640869140625, "loss_aux_layer_11": 0.0684814453125, "loss_aux_layer_12": 0.07373046875, "loss_aux_layer_13": 0.07958984375, "loss_aux_layer_14": 0.0892333984375, "loss_aux_layer_15": 0.0986328125, "loss_aux_layer_16": 0.1087646484375, "loss_aux_layer_17": 0.1162109375, "loss_aux_layer_18": 0.1248779296875, "loss_aux_layer_19": 0.12841796875, "loss_aux_layer_2": 0.04949951171875, "loss_aux_layer_20": 0.13720703125, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.05938720703125, "loss_aux_layer_4": 0.06182861328125, "loss_aux_layer_5": 0.063232421875, "loss_aux_layer_6": 0.06610107421875, "loss_aux_layer_7": 0.0640869140625, "loss_aux_layer_8": 0.06390380859375, "loss_aux_layer_9": 0.06292724609375, "step": 2900, "total_loss": 0.6923319548368454 }, { "epoch": 0.5743417145119778, "grad_norm": 0.8953850269317627, "learning_rate": 5e-05, "llm_loss": 0.509459599852562, "loss": 2.3805, "loss_aux_layer_0": 0.017242431640625, "loss_aux_layer_1": 0.0340576171875, "loss_aux_layer_10": 0.06048583984375, "loss_aux_layer_11": 0.0640869140625, "loss_aux_layer_12": 0.0687255859375, "loss_aux_layer_13": 0.0745849609375, "loss_aux_layer_14": 0.083740234375, "loss_aux_layer_15": 0.0926513671875, "loss_aux_layer_16": 0.10302734375, "loss_aux_layer_17": 0.1114501953125, "loss_aux_layer_18": 0.1201171875, "loss_aux_layer_19": 0.1239013671875, "loss_aux_layer_2": 0.04632568359375, "loss_aux_layer_20": 0.132568359375, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.05572509765625, "loss_aux_layer_4": 0.0582275390625, "loss_aux_layer_5": 0.05987548828125, "loss_aux_layer_6": 0.06280517578125, "loss_aux_layer_7": 0.06060791015625, "loss_aux_layer_8": 0.06036376953125, "loss_aux_layer_9": 0.0592041015625, "step": 2901, "total_loss": 0.5951143503189087 }, { "epoch": 0.5745396951098792, "grad_norm": 0.7817211747169495, "learning_rate": 5e-05, "llm_loss": 0.5080520883202553, "loss": 2.3973, "loss_aux_layer_0": 0.0185546875, "loss_aux_layer_1": 0.03887939453125, "loss_aux_layer_10": 0.0673828125, "loss_aux_layer_11": 0.0718994140625, "loss_aux_layer_12": 0.07666015625, "loss_aux_layer_13": 0.08251953125, "loss_aux_layer_14": 0.09130859375, "loss_aux_layer_15": 0.099365234375, "loss_aux_layer_16": 0.1082763671875, "loss_aux_layer_17": 0.1158447265625, "loss_aux_layer_18": 0.12451171875, "loss_aux_layer_19": 0.127197265625, "loss_aux_layer_2": 0.0523681640625, "loss_aux_layer_20": 0.135498046875, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.2041015625, "loss_aux_layer_3": 0.062744140625, "loss_aux_layer_4": 0.0653076171875, "loss_aux_layer_5": 0.0670166015625, "loss_aux_layer_6": 0.0703125, "loss_aux_layer_7": 0.06787109375, "loss_aux_layer_8": 0.0675048828125, "loss_aux_layer_9": 0.066162109375, "step": 2902, "total_loss": 0.5993262007832527 }, { "epoch": 0.5747376757077807, "grad_norm": 1.1071863174438477, "learning_rate": 5e-05, "llm_loss": 0.5986122190952301, "loss": 2.7404, "loss_aux_layer_0": 0.0174560546875, "loss_aux_layer_1": 0.03485107421875, "loss_aux_layer_10": 0.06243896484375, "loss_aux_layer_11": 0.0667724609375, "loss_aux_layer_12": 0.0714111328125, "loss_aux_layer_13": 0.0770263671875, "loss_aux_layer_14": 0.0855712890625, "loss_aux_layer_15": 0.0943603515625, "loss_aux_layer_16": 0.1036376953125, "loss_aux_layer_17": 0.11181640625, "loss_aux_layer_18": 0.1204833984375, "loss_aux_layer_19": 0.1236572265625, "loss_aux_layer_2": 0.04766845703125, "loss_aux_layer_20": 0.1309814453125, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.05767822265625, "loss_aux_layer_4": 0.06048583984375, "loss_aux_layer_5": 0.0623779296875, "loss_aux_layer_6": 0.0650634765625, "loss_aux_layer_7": 0.06304931640625, "loss_aux_layer_8": 0.06268310546875, "loss_aux_layer_9": 0.0614013671875, "step": 2903, "total_loss": 0.6851119846105576 }, { "epoch": 0.574935656305682, "grad_norm": 0.8980089426040649, "learning_rate": 5e-05, "llm_loss": 0.5578684285283089, "loss": 2.5821, "loss_aux_layer_0": 0.017547607421875, "loss_aux_layer_1": 0.0355224609375, "loss_aux_layer_10": 0.06298828125, "loss_aux_layer_11": 0.067138671875, "loss_aux_layer_12": 0.0718994140625, "loss_aux_layer_13": 0.0780029296875, "loss_aux_layer_14": 0.0872802734375, "loss_aux_layer_15": 0.095947265625, "loss_aux_layer_16": 0.105712890625, "loss_aux_layer_17": 0.113525390625, "loss_aux_layer_18": 0.121826171875, "loss_aux_layer_19": 0.1251220703125, "loss_aux_layer_2": 0.04864501953125, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.198974609375, "loss_aux_layer_3": 0.05877685546875, "loss_aux_layer_4": 0.06134033203125, "loss_aux_layer_5": 0.06304931640625, "loss_aux_layer_6": 0.06585693359375, "loss_aux_layer_7": 0.063720703125, "loss_aux_layer_8": 0.06304931640625, "loss_aux_layer_9": 0.061767578125, "step": 2904, "total_loss": 0.645520955324173 }, { "epoch": 0.5751336369035834, "grad_norm": 0.8947350978851318, "learning_rate": 5e-05, "llm_loss": 0.6557125747203827, "loss": 2.9681, "loss_aux_layer_0": 0.017547607421875, "loss_aux_layer_1": 0.03570556640625, "loss_aux_layer_10": 0.0621337890625, "loss_aux_layer_11": 0.06622314453125, "loss_aux_layer_12": 0.07080078125, "loss_aux_layer_13": 0.076416015625, "loss_aux_layer_14": 0.085205078125, "loss_aux_layer_15": 0.093994140625, "loss_aux_layer_16": 0.103271484375, "loss_aux_layer_17": 0.1112060546875, "loss_aux_layer_18": 0.119873046875, "loss_aux_layer_19": 0.1231689453125, "loss_aux_layer_2": 0.04779052734375, "loss_aux_layer_20": 0.131591796875, "loss_aux_layer_21": 0.1396484375, "loss_aux_layer_22": 0.16064453125, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.057373046875, "loss_aux_layer_4": 0.06011962890625, "loss_aux_layer_5": 0.0615234375, "loss_aux_layer_6": 0.06451416015625, "loss_aux_layer_7": 0.06243896484375, "loss_aux_layer_8": 0.06201171875, "loss_aux_layer_9": 0.0609130859375, "step": 2905, "total_loss": 0.7420250177383423 }, { "epoch": 0.5753316175014849, "grad_norm": 1.6143043041229248, "learning_rate": 5e-05, "llm_loss": 0.5812806859612465, "loss": 2.6945, "loss_aux_layer_0": 0.01751708984375, "loss_aux_layer_1": 0.03863525390625, "loss_aux_layer_10": 0.068359375, "loss_aux_layer_11": 0.0731201171875, "loss_aux_layer_12": 0.0777587890625, "loss_aux_layer_13": 0.083984375, "loss_aux_layer_14": 0.093017578125, "loss_aux_layer_15": 0.1019287109375, "loss_aux_layer_16": 0.1114501953125, "loss_aux_layer_17": 0.119384765625, "loss_aux_layer_18": 0.1273193359375, "loss_aux_layer_19": 0.1295166015625, "loss_aux_layer_2": 0.05267333984375, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.14404296875, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.0633544921875, "loss_aux_layer_4": 0.06640625, "loss_aux_layer_5": 0.0679931640625, "loss_aux_layer_6": 0.0711669921875, "loss_aux_layer_7": 0.068603515625, "loss_aux_layer_8": 0.0679931640625, "loss_aux_layer_9": 0.0667724609375, "step": 2906, "total_loss": 0.6736356317996979 }, { "epoch": 0.5755295980993863, "grad_norm": 1.1712381839752197, "learning_rate": 5e-05, "llm_loss": 0.6817148923873901, "loss": 3.0865, "loss_aux_layer_0": 0.0177001953125, "loss_aux_layer_1": 0.0361328125, "loss_aux_layer_10": 0.063720703125, "loss_aux_layer_11": 0.06817626953125, "loss_aux_layer_12": 0.073486328125, "loss_aux_layer_13": 0.0797119140625, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.0986328125, "loss_aux_layer_16": 0.1083984375, "loss_aux_layer_17": 0.116943359375, "loss_aux_layer_18": 0.1256103515625, "loss_aux_layer_19": 0.12890625, "loss_aux_layer_2": 0.04962158203125, "loss_aux_layer_20": 0.136962890625, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.20751953125, "loss_aux_layer_3": 0.0599365234375, "loss_aux_layer_4": 0.0623779296875, "loss_aux_layer_5": 0.0640869140625, "loss_aux_layer_6": 0.06719970703125, "loss_aux_layer_7": 0.06475830078125, "loss_aux_layer_8": 0.06378173828125, "loss_aux_layer_9": 0.06231689453125, "step": 2907, "total_loss": 0.7716350257396698 }, { "epoch": 0.5757275786972876, "grad_norm": 1.059193730354309, "learning_rate": 5e-05, "llm_loss": 0.5332968607544899, "loss": 2.4738, "loss_aux_layer_0": 0.018341064453125, "loss_aux_layer_1": 0.0350341796875, "loss_aux_layer_10": 0.0609130859375, "loss_aux_layer_11": 0.06494140625, "loss_aux_layer_12": 0.0693359375, "loss_aux_layer_13": 0.074462890625, "loss_aux_layer_14": 0.083251953125, "loss_aux_layer_15": 0.092041015625, "loss_aux_layer_16": 0.1011962890625, "loss_aux_layer_17": 0.109375, "loss_aux_layer_18": 0.117919921875, "loss_aux_layer_19": 0.1219482421875, "loss_aux_layer_2": 0.04730224609375, "loss_aux_layer_20": 0.1302490234375, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.197998046875, "loss_aux_layer_3": 0.056640625, "loss_aux_layer_4": 0.0589599609375, "loss_aux_layer_5": 0.060546875, "loss_aux_layer_6": 0.0633544921875, "loss_aux_layer_7": 0.0611572265625, "loss_aux_layer_8": 0.06072998046875, "loss_aux_layer_9": 0.05963134765625, "step": 2908, "total_loss": 0.6184501200914383 }, { "epoch": 0.5759255592951891, "grad_norm": 1.3083852529525757, "learning_rate": 5e-05, "llm_loss": 0.5603487119078636, "loss": 2.6003, "loss_aux_layer_0": 0.017791748046875, "loss_aux_layer_1": 0.0362548828125, "loss_aux_layer_10": 0.06463623046875, "loss_aux_layer_11": 0.0689697265625, "loss_aux_layer_12": 0.0738525390625, "loss_aux_layer_13": 0.0799560546875, "loss_aux_layer_14": 0.0892333984375, "loss_aux_layer_15": 0.0985107421875, "loss_aux_layer_16": 0.1083984375, "loss_aux_layer_17": 0.11669921875, "loss_aux_layer_18": 0.1251220703125, "loss_aux_layer_19": 0.128662109375, "loss_aux_layer_2": 0.05047607421875, "loss_aux_layer_20": 0.13623046875, "loss_aux_layer_21": 0.14404296875, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.20263671875, "loss_aux_layer_3": 0.06024169921875, "loss_aux_layer_4": 0.06268310546875, "loss_aux_layer_5": 0.064453125, "loss_aux_layer_6": 0.0675048828125, "loss_aux_layer_7": 0.065185546875, "loss_aux_layer_8": 0.0648193359375, "loss_aux_layer_9": 0.06341552734375, "step": 2909, "total_loss": 0.6500791013240814 }, { "epoch": 0.5761235398930905, "grad_norm": 1.1412876844406128, "learning_rate": 5e-05, "llm_loss": 0.6272540986537933, "loss": 2.863, "loss_aux_layer_0": 0.017120361328125, "loss_aux_layer_1": 0.0355224609375, "loss_aux_layer_10": 0.0653076171875, "loss_aux_layer_11": 0.0693359375, "loss_aux_layer_12": 0.07421875, "loss_aux_layer_13": 0.0797119140625, "loss_aux_layer_14": 0.08837890625, "loss_aux_layer_15": 0.096923828125, "loss_aux_layer_16": 0.1060791015625, "loss_aux_layer_17": 0.1143798828125, "loss_aux_layer_18": 0.1220703125, "loss_aux_layer_19": 0.124755859375, "loss_aux_layer_2": 0.0494384765625, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.19970703125, "loss_aux_layer_3": 0.059814453125, "loss_aux_layer_4": 0.06243896484375, "loss_aux_layer_5": 0.0643310546875, "loss_aux_layer_6": 0.0672607421875, "loss_aux_layer_7": 0.06494140625, "loss_aux_layer_8": 0.0645751953125, "loss_aux_layer_9": 0.0638427734375, "step": 2910, "total_loss": 0.71575927734375 }, { "epoch": 0.5763215204909918, "grad_norm": 1.0756995677947998, "learning_rate": 5e-05, "llm_loss": 0.5996647030115128, "loss": 2.7503, "loss_aux_layer_0": 0.018096923828125, "loss_aux_layer_1": 0.03643798828125, "loss_aux_layer_10": 0.06427001953125, "loss_aux_layer_11": 0.0684814453125, "loss_aux_layer_12": 0.072998046875, "loss_aux_layer_13": 0.07861328125, "loss_aux_layer_14": 0.08740234375, "loss_aux_layer_15": 0.0960693359375, "loss_aux_layer_16": 0.1055908203125, "loss_aux_layer_17": 0.11376953125, "loss_aux_layer_18": 0.12158203125, "loss_aux_layer_19": 0.125, "loss_aux_layer_2": 0.04913330078125, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.0592041015625, "loss_aux_layer_4": 0.06201171875, "loss_aux_layer_5": 0.06365966796875, "loss_aux_layer_6": 0.066650390625, "loss_aux_layer_7": 0.064453125, "loss_aux_layer_8": 0.06396484375, "loss_aux_layer_9": 0.0628662109375, "step": 2911, "total_loss": 0.6875720173120499 }, { "epoch": 0.5765195010888933, "grad_norm": 1.5757184028625488, "learning_rate": 5e-05, "llm_loss": 0.5895212292671204, "loss": 2.7173, "loss_aux_layer_0": 0.017822265625, "loss_aux_layer_1": 0.03741455078125, "loss_aux_layer_10": 0.06622314453125, "loss_aux_layer_11": 0.0703125, "loss_aux_layer_12": 0.0748291015625, "loss_aux_layer_13": 0.08056640625, "loss_aux_layer_14": 0.08935546875, "loss_aux_layer_15": 0.097900390625, "loss_aux_layer_16": 0.1075439453125, "loss_aux_layer_17": 0.1148681640625, "loss_aux_layer_18": 0.1229248046875, "loss_aux_layer_19": 0.126953125, "loss_aux_layer_2": 0.051513671875, "loss_aux_layer_20": 0.134765625, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.0618896484375, "loss_aux_layer_4": 0.0648193359375, "loss_aux_layer_5": 0.06622314453125, "loss_aux_layer_6": 0.0694580078125, "loss_aux_layer_7": 0.06689453125, "loss_aux_layer_8": 0.066162109375, "loss_aux_layer_9": 0.06488037109375, "step": 2912, "total_loss": 0.6793280243873596 }, { "epoch": 0.5767174816867947, "grad_norm": 0.9374587535858154, "learning_rate": 5e-05, "llm_loss": 0.6063647866249084, "loss": 2.7704, "loss_aux_layer_0": 0.01751708984375, "loss_aux_layer_1": 0.03515625, "loss_aux_layer_10": 0.0631103515625, "loss_aux_layer_11": 0.0670166015625, "loss_aux_layer_12": 0.0714111328125, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.0849609375, "loss_aux_layer_15": 0.09326171875, "loss_aux_layer_16": 0.1029052734375, "loss_aux_layer_17": 0.1112060546875, "loss_aux_layer_18": 0.1195068359375, "loss_aux_layer_19": 0.123291015625, "loss_aux_layer_2": 0.04779052734375, "loss_aux_layer_20": 0.131103515625, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.057861328125, "loss_aux_layer_4": 0.06072998046875, "loss_aux_layer_5": 0.0626220703125, "loss_aux_layer_6": 0.06561279296875, "loss_aux_layer_7": 0.06353759765625, "loss_aux_layer_8": 0.06317138671875, "loss_aux_layer_9": 0.061767578125, "step": 2913, "total_loss": 0.6926100254058838 }, { "epoch": 0.5769154622846961, "grad_norm": 1.3460681438446045, "learning_rate": 5e-05, "llm_loss": 0.6081484779715538, "loss": 2.7991, "loss_aux_layer_0": 0.018829345703125, "loss_aux_layer_1": 0.03924560546875, "loss_aux_layer_10": 0.0679931640625, "loss_aux_layer_11": 0.0723876953125, "loss_aux_layer_12": 0.0772705078125, "loss_aux_layer_13": 0.082763671875, "loss_aux_layer_14": 0.0909423828125, "loss_aux_layer_15": 0.09912109375, "loss_aux_layer_16": 0.1080322265625, "loss_aux_layer_17": 0.1156005859375, "loss_aux_layer_18": 0.123779296875, "loss_aux_layer_19": 0.1260986328125, "loss_aux_layer_2": 0.05450439453125, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.142578125, "loss_aux_layer_22": 0.164794921875, "loss_aux_layer_23": 0.203125, "loss_aux_layer_3": 0.06463623046875, "loss_aux_layer_4": 0.06744384765625, "loss_aux_layer_5": 0.06903076171875, "loss_aux_layer_6": 0.0718994140625, "loss_aux_layer_7": 0.0693359375, "loss_aux_layer_8": 0.06817626953125, "loss_aux_layer_9": 0.0670166015625, "step": 2914, "total_loss": 0.6997732147574425 }, { "epoch": 0.5771134428825975, "grad_norm": 0.9648331999778748, "learning_rate": 5e-05, "llm_loss": 0.5882919132709503, "loss": 2.7177, "loss_aux_layer_0": 0.016998291015625, "loss_aux_layer_1": 0.038330078125, "loss_aux_layer_10": 0.06744384765625, "loss_aux_layer_11": 0.07177734375, "loss_aux_layer_12": 0.0767822265625, "loss_aux_layer_13": 0.0826416015625, "loss_aux_layer_14": 0.0909423828125, "loss_aux_layer_15": 0.0997314453125, "loss_aux_layer_16": 0.109375, "loss_aux_layer_17": 0.1175537109375, "loss_aux_layer_18": 0.1251220703125, "loss_aux_layer_19": 0.1280517578125, "loss_aux_layer_2": 0.05218505859375, "loss_aux_layer_20": 0.13525390625, "loss_aux_layer_21": 0.142578125, "loss_aux_layer_22": 0.1640625, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.06304931640625, "loss_aux_layer_4": 0.06573486328125, "loss_aux_layer_5": 0.067138671875, "loss_aux_layer_6": 0.0703125, "loss_aux_layer_7": 0.0679931640625, "loss_aux_layer_8": 0.0673828125, "loss_aux_layer_9": 0.066162109375, "step": 2915, "total_loss": 0.679432600736618 }, { "epoch": 0.5773114234804989, "grad_norm": 1.3005187511444092, "learning_rate": 5e-05, "llm_loss": 0.5529751777648926, "loss": 2.5769, "loss_aux_layer_0": 0.019500732421875, "loss_aux_layer_1": 0.03753662109375, "loss_aux_layer_10": 0.0670166015625, "loss_aux_layer_11": 0.0712890625, "loss_aux_layer_12": 0.076416015625, "loss_aux_layer_13": 0.082275390625, "loss_aux_layer_14": 0.09130859375, "loss_aux_layer_15": 0.099853515625, "loss_aux_layer_16": 0.1094970703125, "loss_aux_layer_17": 0.1171875, "loss_aux_layer_18": 0.125244140625, "loss_aux_layer_19": 0.12841796875, "loss_aux_layer_2": 0.05145263671875, "loss_aux_layer_20": 0.135986328125, "loss_aux_layer_21": 0.14453125, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.06182861328125, "loss_aux_layer_4": 0.064453125, "loss_aux_layer_5": 0.06640625, "loss_aux_layer_6": 0.06951904296875, "loss_aux_layer_7": 0.06719970703125, "loss_aux_layer_8": 0.0667724609375, "loss_aux_layer_9": 0.06573486328125, "step": 2916, "total_loss": 0.6442158967256546 }, { "epoch": 0.5775094040784003, "grad_norm": 1.1126136779785156, "learning_rate": 5e-05, "llm_loss": 0.536744236946106, "loss": 2.5118, "loss_aux_layer_0": 0.0184326171875, "loss_aux_layer_1": 0.038818359375, "loss_aux_layer_10": 0.06768798828125, "loss_aux_layer_11": 0.07177734375, "loss_aux_layer_12": 0.0765380859375, "loss_aux_layer_13": 0.082275390625, "loss_aux_layer_14": 0.0914306640625, "loss_aux_layer_15": 0.0992431640625, "loss_aux_layer_16": 0.1080322265625, "loss_aux_layer_17": 0.1158447265625, "loss_aux_layer_18": 0.124267578125, "loss_aux_layer_19": 0.127685546875, "loss_aux_layer_2": 0.0531005859375, "loss_aux_layer_20": 0.134765625, "loss_aux_layer_21": 0.142578125, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.06396484375, "loss_aux_layer_4": 0.06671142578125, "loss_aux_layer_5": 0.06842041015625, "loss_aux_layer_6": 0.071044921875, "loss_aux_layer_7": 0.06842041015625, "loss_aux_layer_8": 0.06768798828125, "loss_aux_layer_9": 0.06646728515625, "step": 2917, "total_loss": 0.6279386430978775 }, { "epoch": 0.5777073846763017, "grad_norm": 1.3303593397140503, "learning_rate": 5e-05, "llm_loss": 0.5413000583648682, "loss": 2.527, "loss_aux_layer_0": 0.018402099609375, "loss_aux_layer_1": 0.03759765625, "loss_aux_layer_10": 0.06573486328125, "loss_aux_layer_11": 0.06982421875, "loss_aux_layer_12": 0.0750732421875, "loss_aux_layer_13": 0.0809326171875, "loss_aux_layer_14": 0.0904541015625, "loss_aux_layer_15": 0.099609375, "loss_aux_layer_16": 0.1092529296875, "loss_aux_layer_17": 0.1170654296875, "loss_aux_layer_18": 0.125, "loss_aux_layer_19": 0.1279296875, "loss_aux_layer_2": 0.051513671875, "loss_aux_layer_20": 0.13525390625, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.16455078125, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.0615234375, "loss_aux_layer_4": 0.0640869140625, "loss_aux_layer_5": 0.06591796875, "loss_aux_layer_6": 0.06884765625, "loss_aux_layer_7": 0.0665283203125, "loss_aux_layer_8": 0.06585693359375, "loss_aux_layer_9": 0.06451416015625, "step": 2918, "total_loss": 0.6317468583583832 }, { "epoch": 0.5779053652742031, "grad_norm": 1.040086030960083, "learning_rate": 5e-05, "llm_loss": 0.5388139933347702, "loss": 2.4975, "loss_aux_layer_0": 0.019195556640625, "loss_aux_layer_1": 0.034912109375, "loss_aux_layer_10": 0.06011962890625, "loss_aux_layer_11": 0.06402587890625, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.07373046875, "loss_aux_layer_14": 0.0831298828125, "loss_aux_layer_15": 0.09228515625, "loss_aux_layer_16": 0.10205078125, "loss_aux_layer_17": 0.1107177734375, "loss_aux_layer_18": 0.1195068359375, "loss_aux_layer_19": 0.1239013671875, "loss_aux_layer_2": 0.04718017578125, "loss_aux_layer_20": 0.132568359375, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.199462890625, "loss_aux_layer_3": 0.0570068359375, "loss_aux_layer_4": 0.05908203125, "loss_aux_layer_5": 0.060546875, "loss_aux_layer_6": 0.063232421875, "loss_aux_layer_7": 0.06072998046875, "loss_aux_layer_8": 0.06024169921875, "loss_aux_layer_9": 0.05902099609375, "step": 2919, "total_loss": 0.6243692189455032 }, { "epoch": 0.5781033458721045, "grad_norm": 0.9107378125190735, "learning_rate": 5e-05, "llm_loss": 0.602122500538826, "loss": 2.7668, "loss_aux_layer_0": 0.01800537109375, "loss_aux_layer_1": 0.03631591796875, "loss_aux_layer_10": 0.064208984375, "loss_aux_layer_11": 0.0682373046875, "loss_aux_layer_12": 0.0733642578125, "loss_aux_layer_13": 0.0792236328125, "loss_aux_layer_14": 0.0888671875, "loss_aux_layer_15": 0.09814453125, "loss_aux_layer_16": 0.1082763671875, "loss_aux_layer_17": 0.1165771484375, "loss_aux_layer_18": 0.125, "loss_aux_layer_19": 0.129150390625, "loss_aux_layer_2": 0.0491943359375, "loss_aux_layer_20": 0.13720703125, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.166748046875, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.05963134765625, "loss_aux_layer_4": 0.0621337890625, "loss_aux_layer_5": 0.06378173828125, "loss_aux_layer_6": 0.06640625, "loss_aux_layer_7": 0.064208984375, "loss_aux_layer_8": 0.0634765625, "loss_aux_layer_9": 0.06268310546875, "step": 2920, "total_loss": 0.6916976124048233 }, { "epoch": 0.578301326470006, "grad_norm": 1.6518986225128174, "learning_rate": 5e-05, "llm_loss": 0.5587737858295441, "loss": 2.5912, "loss_aux_layer_0": 0.01910400390625, "loss_aux_layer_1": 0.03704833984375, "loss_aux_layer_10": 0.064697265625, "loss_aux_layer_11": 0.06890869140625, "loss_aux_layer_12": 0.07421875, "loss_aux_layer_13": 0.0799560546875, "loss_aux_layer_14": 0.08935546875, "loss_aux_layer_15": 0.0982666015625, "loss_aux_layer_16": 0.107666015625, "loss_aux_layer_17": 0.1153564453125, "loss_aux_layer_18": 0.12353515625, "loss_aux_layer_19": 0.126220703125, "loss_aux_layer_2": 0.04998779296875, "loss_aux_layer_20": 0.1337890625, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.0595703125, "loss_aux_layer_4": 0.062255859375, "loss_aux_layer_5": 0.06396484375, "loss_aux_layer_6": 0.06695556640625, "loss_aux_layer_7": 0.06488037109375, "loss_aux_layer_8": 0.0643310546875, "loss_aux_layer_9": 0.063232421875, "step": 2921, "total_loss": 0.6477897614240646 }, { "epoch": 0.5784993070679073, "grad_norm": 0.915503978729248, "learning_rate": 5e-05, "llm_loss": 0.5250067189335823, "loss": 2.4491, "loss_aux_layer_0": 0.017974853515625, "loss_aux_layer_1": 0.03582763671875, "loss_aux_layer_10": 0.06280517578125, "loss_aux_layer_11": 0.0670166015625, "loss_aux_layer_12": 0.0716552734375, "loss_aux_layer_13": 0.077392578125, "loss_aux_layer_14": 0.08642578125, "loss_aux_layer_15": 0.09521484375, "loss_aux_layer_16": 0.104736328125, "loss_aux_layer_17": 0.1124267578125, "loss_aux_layer_18": 0.120361328125, "loss_aux_layer_19": 0.124267578125, "loss_aux_layer_2": 0.048583984375, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.160888671875, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.0587158203125, "loss_aux_layer_4": 0.06121826171875, "loss_aux_layer_5": 0.0631103515625, "loss_aux_layer_6": 0.06610107421875, "loss_aux_layer_7": 0.06390380859375, "loss_aux_layer_8": 0.06317138671875, "loss_aux_layer_9": 0.06170654296875, "step": 2922, "total_loss": 0.6122829914093018 }, { "epoch": 0.5786972876658087, "grad_norm": 1.4224717617034912, "learning_rate": 5e-05, "llm_loss": 0.6323388740420341, "loss": 2.8942, "loss_aux_layer_0": 0.017578125, "loss_aux_layer_1": 0.03753662109375, "loss_aux_layer_10": 0.0667724609375, "loss_aux_layer_11": 0.0711669921875, "loss_aux_layer_12": 0.076171875, "loss_aux_layer_13": 0.08203125, "loss_aux_layer_14": 0.0909423828125, "loss_aux_layer_15": 0.0999755859375, "loss_aux_layer_16": 0.109375, "loss_aux_layer_17": 0.1168212890625, "loss_aux_layer_18": 0.12548828125, "loss_aux_layer_19": 0.128662109375, "loss_aux_layer_2": 0.0521240234375, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.144287109375, "loss_aux_layer_22": 0.165283203125, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.062255859375, "loss_aux_layer_4": 0.06512451171875, "loss_aux_layer_5": 0.067138671875, "loss_aux_layer_6": 0.0699462890625, "loss_aux_layer_7": 0.0677490234375, "loss_aux_layer_8": 0.06683349609375, "loss_aux_layer_9": 0.0654296875, "step": 2923, "total_loss": 0.7235489189624786 }, { "epoch": 0.5788952682637102, "grad_norm": 1.3768566846847534, "learning_rate": 5e-05, "llm_loss": 0.49236439168453217, "loss": 2.3192, "loss_aux_layer_0": 0.01959228515625, "loss_aux_layer_1": 0.03564453125, "loss_aux_layer_10": 0.0616455078125, "loss_aux_layer_11": 0.06573486328125, "loss_aux_layer_12": 0.0701904296875, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.0941162109375, "loss_aux_layer_16": 0.104736328125, "loss_aux_layer_17": 0.11279296875, "loss_aux_layer_18": 0.121826171875, "loss_aux_layer_19": 0.1268310546875, "loss_aux_layer_2": 0.0482177734375, "loss_aux_layer_20": 0.1357421875, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.205078125, "loss_aux_layer_3": 0.05718994140625, "loss_aux_layer_4": 0.0596923828125, "loss_aux_layer_5": 0.0614013671875, "loss_aux_layer_6": 0.0640869140625, "loss_aux_layer_7": 0.0618896484375, "loss_aux_layer_8": 0.0611572265625, "loss_aux_layer_9": 0.060302734375, "step": 2924, "total_loss": 0.579808235168457 }, { "epoch": 0.5790932488616115, "grad_norm": 1.0351676940917969, "learning_rate": 5e-05, "llm_loss": 0.6620399206876755, "loss": 2.9988, "loss_aux_layer_0": 0.017486572265625, "loss_aux_layer_1": 0.03607177734375, "loss_aux_layer_10": 0.06378173828125, "loss_aux_layer_11": 0.0679931640625, "loss_aux_layer_12": 0.0723876953125, "loss_aux_layer_13": 0.078125, "loss_aux_layer_14": 0.086669921875, "loss_aux_layer_15": 0.0953369140625, "loss_aux_layer_16": 0.104736328125, "loss_aux_layer_17": 0.1131591796875, "loss_aux_layer_18": 0.1214599609375, "loss_aux_layer_19": 0.1248779296875, "loss_aux_layer_2": 0.04962158203125, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.14013671875, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.0592041015625, "loss_aux_layer_4": 0.0621337890625, "loss_aux_layer_5": 0.063720703125, "loss_aux_layer_6": 0.06689453125, "loss_aux_layer_7": 0.06488037109375, "loss_aux_layer_8": 0.064208984375, "loss_aux_layer_9": 0.0625, "step": 2925, "total_loss": 0.7496941387653351 }, { "epoch": 0.579291229459513, "grad_norm": 1.0395550727844238, "learning_rate": 5e-05, "llm_loss": 0.5172697007656097, "loss": 2.4342, "loss_aux_layer_0": 0.020538330078125, "loss_aux_layer_1": 0.03668212890625, "loss_aux_layer_10": 0.0660400390625, "loss_aux_layer_11": 0.0704345703125, "loss_aux_layer_12": 0.0753173828125, "loss_aux_layer_13": 0.081298828125, "loss_aux_layer_14": 0.090576171875, "loss_aux_layer_15": 0.0994873046875, "loss_aux_layer_16": 0.109619140625, "loss_aux_layer_17": 0.1173095703125, "loss_aux_layer_18": 0.1260986328125, "loss_aux_layer_19": 0.13037109375, "loss_aux_layer_2": 0.0513916015625, "loss_aux_layer_20": 0.138427734375, "loss_aux_layer_21": 0.147216796875, "loss_aux_layer_22": 0.168701171875, "loss_aux_layer_23": 0.207275390625, "loss_aux_layer_3": 0.06121826171875, "loss_aux_layer_4": 0.0634765625, "loss_aux_layer_5": 0.065185546875, "loss_aux_layer_6": 0.0682373046875, "loss_aux_layer_7": 0.06591796875, "loss_aux_layer_8": 0.06549072265625, "loss_aux_layer_9": 0.06439208984375, "step": 2926, "total_loss": 0.6085401326417923 }, { "epoch": 0.5794892100574144, "grad_norm": 1.3049696683883667, "learning_rate": 5e-05, "llm_loss": 0.5928495600819588, "loss": 2.7252, "loss_aux_layer_0": 0.018768310546875, "loss_aux_layer_1": 0.0364990234375, "loss_aux_layer_10": 0.0635986328125, "loss_aux_layer_11": 0.0677490234375, "loss_aux_layer_12": 0.0723876953125, "loss_aux_layer_13": 0.07763671875, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.095703125, "loss_aux_layer_16": 0.10595703125, "loss_aux_layer_17": 0.114013671875, "loss_aux_layer_18": 0.1226806640625, "loss_aux_layer_19": 0.126220703125, "loss_aux_layer_2": 0.05010986328125, "loss_aux_layer_20": 0.134765625, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.05987548828125, "loss_aux_layer_4": 0.06243896484375, "loss_aux_layer_5": 0.06402587890625, "loss_aux_layer_6": 0.066650390625, "loss_aux_layer_7": 0.064453125, "loss_aux_layer_8": 0.063720703125, "loss_aux_layer_9": 0.0625, "step": 2927, "total_loss": 0.6812890321016312 }, { "epoch": 0.5796871906553158, "grad_norm": 0.8935714364051819, "learning_rate": 5e-05, "llm_loss": 0.5745637714862823, "loss": 2.6631, "loss_aux_layer_0": 0.018218994140625, "loss_aux_layer_1": 0.03814697265625, "loss_aux_layer_10": 0.067626953125, "loss_aux_layer_11": 0.072265625, "loss_aux_layer_12": 0.0775146484375, "loss_aux_layer_13": 0.08251953125, "loss_aux_layer_14": 0.0914306640625, "loss_aux_layer_15": 0.099853515625, "loss_aux_layer_16": 0.108642578125, "loss_aux_layer_17": 0.116455078125, "loss_aux_layer_18": 0.1239013671875, "loss_aux_layer_19": 0.126708984375, "loss_aux_layer_2": 0.052490234375, "loss_aux_layer_20": 0.134521484375, "loss_aux_layer_21": 0.142578125, "loss_aux_layer_22": 0.164794921875, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.0628662109375, "loss_aux_layer_4": 0.0657958984375, "loss_aux_layer_5": 0.0675048828125, "loss_aux_layer_6": 0.07080078125, "loss_aux_layer_7": 0.068359375, "loss_aux_layer_8": 0.0677490234375, "loss_aux_layer_9": 0.0662841796875, "step": 2928, "total_loss": 0.6657840013504028 }, { "epoch": 0.5798851712532171, "grad_norm": 1.4020323753356934, "learning_rate": 5e-05, "llm_loss": 0.528980016708374, "loss": 2.474, "loss_aux_layer_0": 0.019989013671875, "loss_aux_layer_1": 0.03662109375, "loss_aux_layer_10": 0.0633544921875, "loss_aux_layer_11": 0.067626953125, "loss_aux_layer_12": 0.0731201171875, "loss_aux_layer_13": 0.07958984375, "loss_aux_layer_14": 0.08935546875, "loss_aux_layer_15": 0.0982666015625, "loss_aux_layer_16": 0.108154296875, "loss_aux_layer_17": 0.1156005859375, "loss_aux_layer_18": 0.124267578125, "loss_aux_layer_19": 0.1273193359375, "loss_aux_layer_2": 0.04998779296875, "loss_aux_layer_20": 0.135498046875, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.1669921875, "loss_aux_layer_23": 0.206298828125, "loss_aux_layer_3": 0.059814453125, "loss_aux_layer_4": 0.06195068359375, "loss_aux_layer_5": 0.063720703125, "loss_aux_layer_6": 0.0665283203125, "loss_aux_layer_7": 0.0640869140625, "loss_aux_layer_8": 0.0634765625, "loss_aux_layer_9": 0.0621337890625, "step": 2929, "total_loss": 0.6184983849525452 }, { "epoch": 0.5800831518511186, "grad_norm": 1.269429087638855, "learning_rate": 5e-05, "llm_loss": 0.5828195661306381, "loss": 2.6858, "loss_aux_layer_0": 0.01702880859375, "loss_aux_layer_1": 0.03594970703125, "loss_aux_layer_10": 0.063720703125, "loss_aux_layer_11": 0.068359375, "loss_aux_layer_12": 0.0733642578125, "loss_aux_layer_13": 0.079833984375, "loss_aux_layer_14": 0.0887451171875, "loss_aux_layer_15": 0.097900390625, "loss_aux_layer_16": 0.1075439453125, "loss_aux_layer_17": 0.1156005859375, "loss_aux_layer_18": 0.1241455078125, "loss_aux_layer_19": 0.1270751953125, "loss_aux_layer_2": 0.0487060546875, "loss_aux_layer_20": 0.134765625, "loss_aux_layer_21": 0.142333984375, "loss_aux_layer_22": 0.163330078125, "loss_aux_layer_23": 0.200927734375, "loss_aux_layer_3": 0.0587158203125, "loss_aux_layer_4": 0.06134033203125, "loss_aux_layer_5": 0.06298828125, "loss_aux_layer_6": 0.06585693359375, "loss_aux_layer_7": 0.06378173828125, "loss_aux_layer_8": 0.06341552734375, "loss_aux_layer_9": 0.06219482421875, "step": 2930, "total_loss": 0.6714601069688797 }, { "epoch": 0.58028113244902, "grad_norm": 1.4475045204162598, "learning_rate": 5e-05, "llm_loss": 0.6164785102009773, "loss": 2.8133, "loss_aux_layer_0": 0.0185546875, "loss_aux_layer_1": 0.03631591796875, "loss_aux_layer_10": 0.06256103515625, "loss_aux_layer_11": 0.066650390625, "loss_aux_layer_12": 0.0714111328125, "loss_aux_layer_13": 0.0767822265625, "loss_aux_layer_14": 0.0853271484375, "loss_aux_layer_15": 0.0941162109375, "loss_aux_layer_16": 0.103271484375, "loss_aux_layer_17": 0.1114501953125, "loss_aux_layer_18": 0.119384765625, "loss_aux_layer_19": 0.1231689453125, "loss_aux_layer_2": 0.049560546875, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.05938720703125, "loss_aux_layer_4": 0.0618896484375, "loss_aux_layer_5": 0.06353759765625, "loss_aux_layer_6": 0.066162109375, "loss_aux_layer_7": 0.06378173828125, "loss_aux_layer_8": 0.06280517578125, "loss_aux_layer_9": 0.06146240234375, "step": 2931, "total_loss": 0.7033154517412186 }, { "epoch": 0.5804791130469213, "grad_norm": 0.989224910736084, "learning_rate": 5e-05, "llm_loss": 0.564254105091095, "loss": 2.603, "loss_aux_layer_0": 0.0186767578125, "loss_aux_layer_1": 0.035888671875, "loss_aux_layer_10": 0.06304931640625, "loss_aux_layer_11": 0.06689453125, "loss_aux_layer_12": 0.0714111328125, "loss_aux_layer_13": 0.076904296875, "loss_aux_layer_14": 0.0860595703125, "loss_aux_layer_15": 0.0947265625, "loss_aux_layer_16": 0.1038818359375, "loss_aux_layer_17": 0.1116943359375, "loss_aux_layer_18": 0.119384765625, "loss_aux_layer_19": 0.1224365234375, "loss_aux_layer_2": 0.0482177734375, "loss_aux_layer_20": 0.1300048828125, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.1943359375, "loss_aux_layer_3": 0.058349609375, "loss_aux_layer_4": 0.06146240234375, "loss_aux_layer_5": 0.0631103515625, "loss_aux_layer_6": 0.0662841796875, "loss_aux_layer_7": 0.0638427734375, "loss_aux_layer_8": 0.0628662109375, "loss_aux_layer_9": 0.0615234375, "step": 2932, "total_loss": 0.6507507562637329 }, { "epoch": 0.5806770936448228, "grad_norm": 1.077925682067871, "learning_rate": 5e-05, "llm_loss": 0.6492842137813568, "loss": 2.9561, "loss_aux_layer_0": 0.01702880859375, "loss_aux_layer_1": 0.03668212890625, "loss_aux_layer_10": 0.06451416015625, "loss_aux_layer_11": 0.069091796875, "loss_aux_layer_12": 0.073974609375, "loss_aux_layer_13": 0.0804443359375, "loss_aux_layer_14": 0.0899658203125, "loss_aux_layer_15": 0.0992431640625, "loss_aux_layer_16": 0.1092529296875, "loss_aux_layer_17": 0.1173095703125, "loss_aux_layer_18": 0.1258544921875, "loss_aux_layer_19": 0.12939453125, "loss_aux_layer_2": 0.04937744140625, "loss_aux_layer_20": 0.136962890625, "loss_aux_layer_21": 0.14453125, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.2021484375, "loss_aux_layer_3": 0.0594482421875, "loss_aux_layer_4": 0.0621337890625, "loss_aux_layer_5": 0.06365966796875, "loss_aux_layer_6": 0.06640625, "loss_aux_layer_7": 0.06439208984375, "loss_aux_layer_8": 0.06390380859375, "loss_aux_layer_9": 0.06292724609375, "step": 2933, "total_loss": 0.7390269488096237 }, { "epoch": 0.5808750742427242, "grad_norm": 1.0062320232391357, "learning_rate": 5e-05, "llm_loss": 0.5917931199073792, "loss": 2.6999, "loss_aux_layer_0": 0.01812744140625, "loss_aux_layer_1": 0.034423828125, "loss_aux_layer_10": 0.05853271484375, "loss_aux_layer_11": 0.06243896484375, "loss_aux_layer_12": 0.06689453125, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.0809326171875, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.099853515625, "loss_aux_layer_17": 0.1082763671875, "loss_aux_layer_18": 0.1168212890625, "loss_aux_layer_19": 0.1204833984375, "loss_aux_layer_2": 0.04644775390625, "loss_aux_layer_20": 0.1285400390625, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.0556640625, "loss_aux_layer_4": 0.0577392578125, "loss_aux_layer_5": 0.05908203125, "loss_aux_layer_6": 0.06146240234375, "loss_aux_layer_7": 0.05926513671875, "loss_aux_layer_8": 0.0587158203125, "loss_aux_layer_9": 0.05743408203125, "step": 2934, "total_loss": 0.6749783307313919 }, { "epoch": 0.5810730548406257, "grad_norm": 1.146764874458313, "learning_rate": 5e-05, "llm_loss": 0.6140669882297516, "loss": 2.7838, "loss_aux_layer_0": 0.018524169921875, "loss_aux_layer_1": 0.033477783203125, "loss_aux_layer_10": 0.05780029296875, "loss_aux_layer_11": 0.06134033203125, "loss_aux_layer_12": 0.06591796875, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.0799560546875, "loss_aux_layer_15": 0.08837890625, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.105712890625, "loss_aux_layer_18": 0.1146240234375, "loss_aux_layer_19": 0.1185302734375, "loss_aux_layer_2": 0.04534912109375, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.05413818359375, "loss_aux_layer_4": 0.056640625, "loss_aux_layer_5": 0.05804443359375, "loss_aux_layer_6": 0.06048583984375, "loss_aux_layer_7": 0.05841064453125, "loss_aux_layer_8": 0.05792236328125, "loss_aux_layer_9": 0.0570068359375, "step": 2935, "total_loss": 0.6959483772516251 }, { "epoch": 0.581271035438527, "grad_norm": 1.312379240989685, "learning_rate": 5e-05, "llm_loss": 0.6164431124925613, "loss": 2.8103, "loss_aux_layer_0": 0.017578125, "loss_aux_layer_1": 0.03466796875, "loss_aux_layer_10": 0.06011962890625, "loss_aux_layer_11": 0.064208984375, "loss_aux_layer_12": 0.0689697265625, "loss_aux_layer_13": 0.0748291015625, "loss_aux_layer_14": 0.0848388671875, "loss_aux_layer_15": 0.0943603515625, "loss_aux_layer_16": 0.104736328125, "loss_aux_layer_17": 0.1131591796875, "loss_aux_layer_18": 0.1220703125, "loss_aux_layer_19": 0.125732421875, "loss_aux_layer_2": 0.04742431640625, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.197998046875, "loss_aux_layer_3": 0.056640625, "loss_aux_layer_4": 0.05902099609375, "loss_aux_layer_5": 0.06060791015625, "loss_aux_layer_6": 0.0631103515625, "loss_aux_layer_7": 0.06072998046875, "loss_aux_layer_8": 0.06011962890625, "loss_aux_layer_9": 0.0587158203125, "step": 2936, "total_loss": 0.7025768160820007 }, { "epoch": 0.5814690160364284, "grad_norm": 1.1376056671142578, "learning_rate": 5e-05, "llm_loss": 0.5812542885541916, "loss": 2.6576, "loss_aux_layer_0": 0.019439697265625, "loss_aux_layer_1": 0.034912109375, "loss_aux_layer_10": 0.05999755859375, "loss_aux_layer_11": 0.0638427734375, "loss_aux_layer_12": 0.068115234375, "loss_aux_layer_13": 0.07373046875, "loss_aux_layer_14": 0.081787109375, "loss_aux_layer_15": 0.090087890625, "loss_aux_layer_16": 0.09912109375, "loss_aux_layer_17": 0.107421875, "loss_aux_layer_18": 0.1153564453125, "loss_aux_layer_19": 0.118408203125, "loss_aux_layer_2": 0.04705810546875, "loss_aux_layer_20": 0.1256103515625, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.0565185546875, "loss_aux_layer_4": 0.0587158203125, "loss_aux_layer_5": 0.0601806640625, "loss_aux_layer_6": 0.06268310546875, "loss_aux_layer_7": 0.06072998046875, "loss_aux_layer_8": 0.0601806640625, "loss_aux_layer_9": 0.05902099609375, "step": 2937, "total_loss": 0.6644065678119659 }, { "epoch": 0.5816669966343299, "grad_norm": 1.2678005695343018, "learning_rate": 5e-05, "llm_loss": 0.47657257318496704, "loss": 2.2475, "loss_aux_layer_0": 0.01934814453125, "loss_aux_layer_1": 0.033905029296875, "loss_aux_layer_10": 0.06011962890625, "loss_aux_layer_11": 0.0638427734375, "loss_aux_layer_12": 0.0684814453125, "loss_aux_layer_13": 0.0738525390625, "loss_aux_layer_14": 0.0831298828125, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.1024169921875, "loss_aux_layer_17": 0.1103515625, "loss_aux_layer_18": 0.1199951171875, "loss_aux_layer_19": 0.1246337890625, "loss_aux_layer_2": 0.04644775390625, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.199462890625, "loss_aux_layer_3": 0.05548095703125, "loss_aux_layer_4": 0.0576171875, "loss_aux_layer_5": 0.05926513671875, "loss_aux_layer_6": 0.062255859375, "loss_aux_layer_7": 0.05999755859375, "loss_aux_layer_8": 0.0595703125, "loss_aux_layer_9": 0.0589599609375, "step": 2938, "total_loss": 0.561874732375145 }, { "epoch": 0.5818649772322313, "grad_norm": 0.9161737561225891, "learning_rate": 5e-05, "llm_loss": 0.5996579974889755, "loss": 2.7511, "loss_aux_layer_0": 0.018402099609375, "loss_aux_layer_1": 0.03717041015625, "loss_aux_layer_10": 0.064208984375, "loss_aux_layer_11": 0.068359375, "loss_aux_layer_12": 0.072998046875, "loss_aux_layer_13": 0.07861328125, "loss_aux_layer_14": 0.087158203125, "loss_aux_layer_15": 0.0955810546875, "loss_aux_layer_16": 0.1044921875, "loss_aux_layer_17": 0.112548828125, "loss_aux_layer_18": 0.1199951171875, "loss_aux_layer_19": 0.123046875, "loss_aux_layer_2": 0.05096435546875, "loss_aux_layer_20": 0.131103515625, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.198974609375, "loss_aux_layer_3": 0.0611572265625, "loss_aux_layer_4": 0.0635986328125, "loss_aux_layer_5": 0.0650634765625, "loss_aux_layer_6": 0.0675048828125, "loss_aux_layer_7": 0.0653076171875, "loss_aux_layer_8": 0.0645751953125, "loss_aux_layer_9": 0.0628662109375, "step": 2939, "total_loss": 0.6877750009298325 }, { "epoch": 0.5820629578301326, "grad_norm": 1.1923856735229492, "learning_rate": 5e-05, "llm_loss": 0.5300597697496414, "loss": 2.4869, "loss_aux_layer_0": 0.019622802734375, "loss_aux_layer_1": 0.0384521484375, "loss_aux_layer_10": 0.06689453125, "loss_aux_layer_11": 0.0709228515625, "loss_aux_layer_12": 0.0755615234375, "loss_aux_layer_13": 0.0816650390625, "loss_aux_layer_14": 0.090576171875, "loss_aux_layer_15": 0.099853515625, "loss_aux_layer_16": 0.1094970703125, "loss_aux_layer_17": 0.1173095703125, "loss_aux_layer_18": 0.1263427734375, "loss_aux_layer_19": 0.129150390625, "loss_aux_layer_2": 0.0523681640625, "loss_aux_layer_20": 0.136962890625, "loss_aux_layer_21": 0.145751953125, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.207275390625, "loss_aux_layer_3": 0.0625, "loss_aux_layer_4": 0.064697265625, "loss_aux_layer_5": 0.066650390625, "loss_aux_layer_6": 0.0697021484375, "loss_aux_layer_7": 0.067626953125, "loss_aux_layer_8": 0.067138671875, "loss_aux_layer_9": 0.0657958984375, "step": 2940, "total_loss": 0.6217219978570938 }, { "epoch": 0.5822609384280341, "grad_norm": 0.9529680609703064, "learning_rate": 5e-05, "llm_loss": 0.647405281662941, "loss": 2.947, "loss_aux_layer_0": 0.017608642578125, "loss_aux_layer_1": 0.0362548828125, "loss_aux_layer_10": 0.065185546875, "loss_aux_layer_11": 0.0693359375, "loss_aux_layer_12": 0.0740966796875, "loss_aux_layer_13": 0.080078125, "loss_aux_layer_14": 0.0888671875, "loss_aux_layer_15": 0.0972900390625, "loss_aux_layer_16": 0.1070556640625, "loss_aux_layer_17": 0.114990234375, "loss_aux_layer_18": 0.12353515625, "loss_aux_layer_19": 0.1259765625, "loss_aux_layer_2": 0.0498046875, "loss_aux_layer_20": 0.134521484375, "loss_aux_layer_21": 0.14306640625, "loss_aux_layer_22": 0.1640625, "loss_aux_layer_23": 0.201416015625, "loss_aux_layer_3": 0.060302734375, "loss_aux_layer_4": 0.06304931640625, "loss_aux_layer_5": 0.064697265625, "loss_aux_layer_6": 0.0679931640625, "loss_aux_layer_7": 0.06591796875, "loss_aux_layer_8": 0.0653076171875, "loss_aux_layer_9": 0.06396484375, "step": 2941, "total_loss": 0.736756294965744 }, { "epoch": 0.5824589190259355, "grad_norm": 1.0542243719100952, "learning_rate": 5e-05, "llm_loss": 0.6529696583747864, "loss": 2.9635, "loss_aux_layer_0": 0.01873779296875, "loss_aux_layer_1": 0.03668212890625, "loss_aux_layer_10": 0.0635986328125, "loss_aux_layer_11": 0.06787109375, "loss_aux_layer_12": 0.07275390625, "loss_aux_layer_13": 0.0787353515625, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.0958251953125, "loss_aux_layer_16": 0.10546875, "loss_aux_layer_17": 0.1136474609375, "loss_aux_layer_18": 0.1221923828125, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.0494384765625, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.160888671875, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.0595703125, "loss_aux_layer_4": 0.0621337890625, "loss_aux_layer_5": 0.06329345703125, "loss_aux_layer_6": 0.06597900390625, "loss_aux_layer_7": 0.0638427734375, "loss_aux_layer_8": 0.06329345703125, "loss_aux_layer_9": 0.0621337890625, "step": 2942, "total_loss": 0.7408833205699921 }, { "epoch": 0.5826568996238368, "grad_norm": 1.1831029653549194, "learning_rate": 5e-05, "llm_loss": 0.5540601462125778, "loss": 2.5634, "loss_aux_layer_0": 0.0181884765625, "loss_aux_layer_1": 0.03607177734375, "loss_aux_layer_10": 0.06353759765625, "loss_aux_layer_11": 0.06756591796875, "loss_aux_layer_12": 0.0721435546875, "loss_aux_layer_13": 0.07763671875, "loss_aux_layer_14": 0.0860595703125, "loss_aux_layer_15": 0.09423828125, "loss_aux_layer_16": 0.1033935546875, "loss_aux_layer_17": 0.1107177734375, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.1221923828125, "loss_aux_layer_2": 0.04962158203125, "loss_aux_layer_20": 0.1302490234375, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.0595703125, "loss_aux_layer_4": 0.06195068359375, "loss_aux_layer_5": 0.0634765625, "loss_aux_layer_6": 0.06646728515625, "loss_aux_layer_7": 0.0640869140625, "loss_aux_layer_8": 0.06317138671875, "loss_aux_layer_9": 0.06231689453125, "step": 2943, "total_loss": 0.6408572196960449 }, { "epoch": 0.5828548802217383, "grad_norm": 1.231058955192566, "learning_rate": 5e-05, "llm_loss": 0.5170990973711014, "loss": 2.4255, "loss_aux_layer_0": 0.01751708984375, "loss_aux_layer_1": 0.03656005859375, "loss_aux_layer_10": 0.0653076171875, "loss_aux_layer_11": 0.0694580078125, "loss_aux_layer_12": 0.074462890625, "loss_aux_layer_13": 0.080322265625, "loss_aux_layer_14": 0.0889892578125, "loss_aux_layer_15": 0.0977783203125, "loss_aux_layer_16": 0.107421875, "loss_aux_layer_17": 0.1153564453125, "loss_aux_layer_18": 0.123779296875, "loss_aux_layer_19": 0.1270751953125, "loss_aux_layer_2": 0.04931640625, "loss_aux_layer_20": 0.134765625, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.05963134765625, "loss_aux_layer_4": 0.06256103515625, "loss_aux_layer_5": 0.064697265625, "loss_aux_layer_6": 0.068115234375, "loss_aux_layer_7": 0.06591796875, "loss_aux_layer_8": 0.0650634765625, "loss_aux_layer_9": 0.06390380859375, "step": 2944, "total_loss": 0.6063774824142456 }, { "epoch": 0.5830528608196397, "grad_norm": 1.0252594947814941, "learning_rate": 5e-05, "llm_loss": 0.5900150388479233, "loss": 2.7299, "loss_aux_layer_0": 0.0174560546875, "loss_aux_layer_1": 0.03857421875, "loss_aux_layer_10": 0.06884765625, "loss_aux_layer_11": 0.0733642578125, "loss_aux_layer_12": 0.0782470703125, "loss_aux_layer_13": 0.08447265625, "loss_aux_layer_14": 0.0928955078125, "loss_aux_layer_15": 0.101318359375, "loss_aux_layer_16": 0.11083984375, "loss_aux_layer_17": 0.118408203125, "loss_aux_layer_18": 0.1263427734375, "loss_aux_layer_19": 0.1287841796875, "loss_aux_layer_2": 0.05279541015625, "loss_aux_layer_20": 0.1357421875, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.2041015625, "loss_aux_layer_3": 0.06365966796875, "loss_aux_layer_4": 0.06658935546875, "loss_aux_layer_5": 0.068359375, "loss_aux_layer_6": 0.0718994140625, "loss_aux_layer_7": 0.0694580078125, "loss_aux_layer_8": 0.0687255859375, "loss_aux_layer_9": 0.0675048828125, "step": 2945, "total_loss": 0.6824734508991241 }, { "epoch": 0.5832508414175411, "grad_norm": 1.0478297472000122, "learning_rate": 5e-05, "llm_loss": 0.6440834105014801, "loss": 2.9339, "loss_aux_layer_0": 0.018463134765625, "loss_aux_layer_1": 0.0377197265625, "loss_aux_layer_10": 0.0654296875, "loss_aux_layer_11": 0.0693359375, "loss_aux_layer_12": 0.0738525390625, "loss_aux_layer_13": 0.079345703125, "loss_aux_layer_14": 0.08837890625, "loss_aux_layer_15": 0.0975341796875, "loss_aux_layer_16": 0.107177734375, "loss_aux_layer_17": 0.114990234375, "loss_aux_layer_18": 0.122802734375, "loss_aux_layer_19": 0.12646484375, "loss_aux_layer_2": 0.0509033203125, "loss_aux_layer_20": 0.1337890625, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.06097412109375, "loss_aux_layer_4": 0.0638427734375, "loss_aux_layer_5": 0.0653076171875, "loss_aux_layer_6": 0.0684814453125, "loss_aux_layer_7": 0.0660400390625, "loss_aux_layer_8": 0.0653076171875, "loss_aux_layer_9": 0.06402587890625, "step": 2946, "total_loss": 0.7334678024053574 }, { "epoch": 0.5834488220154425, "grad_norm": 0.9221655130386353, "learning_rate": 5e-05, "llm_loss": 0.5173808336257935, "loss": 2.4199, "loss_aux_layer_0": 0.0179443359375, "loss_aux_layer_1": 0.0355224609375, "loss_aux_layer_10": 0.06329345703125, "loss_aux_layer_11": 0.067626953125, "loss_aux_layer_12": 0.072265625, "loss_aux_layer_13": 0.0782470703125, "loss_aux_layer_14": 0.0865478515625, "loss_aux_layer_15": 0.0955810546875, "loss_aux_layer_16": 0.105224609375, "loss_aux_layer_17": 0.1131591796875, "loss_aux_layer_18": 0.1220703125, "loss_aux_layer_19": 0.1259765625, "loss_aux_layer_2": 0.04840087890625, "loss_aux_layer_20": 0.133544921875, "loss_aux_layer_21": 0.141357421875, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.198974609375, "loss_aux_layer_3": 0.05810546875, "loss_aux_layer_4": 0.060791015625, "loss_aux_layer_5": 0.0625, "loss_aux_layer_6": 0.06549072265625, "loss_aux_layer_7": 0.063720703125, "loss_aux_layer_8": 0.06292724609375, "loss_aux_layer_9": 0.061767578125, "step": 2947, "total_loss": 0.6049743443727493 }, { "epoch": 0.5836468026133439, "grad_norm": 0.9405415654182434, "learning_rate": 5e-05, "llm_loss": 0.552626721560955, "loss": 2.5788, "loss_aux_layer_0": 0.017913818359375, "loss_aux_layer_1": 0.0384521484375, "loss_aux_layer_10": 0.067626953125, "loss_aux_layer_11": 0.072021484375, "loss_aux_layer_12": 0.076904296875, "loss_aux_layer_13": 0.0831298828125, "loss_aux_layer_14": 0.0924072265625, "loss_aux_layer_15": 0.1014404296875, "loss_aux_layer_16": 0.111328125, "loss_aux_layer_17": 0.1192626953125, "loss_aux_layer_18": 0.12744140625, "loss_aux_layer_19": 0.13037109375, "loss_aux_layer_2": 0.0517578125, "loss_aux_layer_20": 0.137939453125, "loss_aux_layer_21": 0.145751953125, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.062255859375, "loss_aux_layer_4": 0.0648193359375, "loss_aux_layer_5": 0.0670166015625, "loss_aux_layer_6": 0.0699462890625, "loss_aux_layer_7": 0.0675048828125, "loss_aux_layer_8": 0.067138671875, "loss_aux_layer_9": 0.0662841796875, "step": 2948, "total_loss": 0.6446921825408936 }, { "epoch": 0.5838447832112453, "grad_norm": 0.9916725158691406, "learning_rate": 5e-05, "llm_loss": 0.5740359425544739, "loss": 2.6523, "loss_aux_layer_0": 0.01800537109375, "loss_aux_layer_1": 0.03704833984375, "loss_aux_layer_10": 0.06488037109375, "loss_aux_layer_11": 0.0692138671875, "loss_aux_layer_12": 0.0740966796875, "loss_aux_layer_13": 0.0799560546875, "loss_aux_layer_14": 0.0885009765625, "loss_aux_layer_15": 0.0970458984375, "loss_aux_layer_16": 0.1063232421875, "loss_aux_layer_17": 0.1141357421875, "loss_aux_layer_18": 0.12255859375, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.05072021484375, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.141357421875, "loss_aux_layer_22": 0.163330078125, "loss_aux_layer_23": 0.20068359375, "loss_aux_layer_3": 0.06103515625, "loss_aux_layer_4": 0.0633544921875, "loss_aux_layer_5": 0.06494140625, "loss_aux_layer_6": 0.067626953125, "loss_aux_layer_7": 0.0653076171875, "loss_aux_layer_8": 0.06488037109375, "loss_aux_layer_9": 0.063720703125, "step": 2949, "total_loss": 0.6630812436342239 }, { "epoch": 0.5840427638091467, "grad_norm": 0.9918791055679321, "learning_rate": 5e-05, "llm_loss": 0.5624944120645523, "loss": 2.5959, "loss_aux_layer_0": 0.0172119140625, "loss_aux_layer_1": 0.034820556640625, "loss_aux_layer_10": 0.0615234375, "loss_aux_layer_11": 0.06524658203125, "loss_aux_layer_12": 0.0703125, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.08544921875, "loss_aux_layer_15": 0.094482421875, "loss_aux_layer_16": 0.104248046875, "loss_aux_layer_17": 0.1124267578125, "loss_aux_layer_18": 0.120849609375, "loss_aux_layer_19": 0.1251220703125, "loss_aux_layer_2": 0.048095703125, "loss_aux_layer_20": 0.132568359375, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.198974609375, "loss_aux_layer_3": 0.05731201171875, "loss_aux_layer_4": 0.05950927734375, "loss_aux_layer_5": 0.06103515625, "loss_aux_layer_6": 0.063720703125, "loss_aux_layer_7": 0.06182861328125, "loss_aux_layer_8": 0.061279296875, "loss_aux_layer_9": 0.05999755859375, "step": 2950, "total_loss": 0.6489657983183861 }, { "epoch": 0.5842407444070481, "grad_norm": 1.2145358324050903, "learning_rate": 5e-05, "llm_loss": 0.6144003644585609, "loss": 2.812, "loss_aux_layer_0": 0.01812744140625, "loss_aux_layer_1": 0.03570556640625, "loss_aux_layer_10": 0.06463623046875, "loss_aux_layer_11": 0.068603515625, "loss_aux_layer_12": 0.072998046875, "loss_aux_layer_13": 0.0789794921875, "loss_aux_layer_14": 0.0880126953125, "loss_aux_layer_15": 0.0968017578125, "loss_aux_layer_16": 0.1063232421875, "loss_aux_layer_17": 0.114501953125, "loss_aux_layer_18": 0.1224365234375, "loss_aux_layer_19": 0.125732421875, "loss_aux_layer_2": 0.04925537109375, "loss_aux_layer_20": 0.133544921875, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.162353515625, "loss_aux_layer_23": 0.19970703125, "loss_aux_layer_3": 0.059814453125, "loss_aux_layer_4": 0.06280517578125, "loss_aux_layer_5": 0.064697265625, "loss_aux_layer_6": 0.067626953125, "loss_aux_layer_7": 0.06536865234375, "loss_aux_layer_8": 0.06463623046875, "loss_aux_layer_9": 0.063232421875, "step": 2951, "total_loss": 0.7030094861984253 }, { "epoch": 0.5844387250049495, "grad_norm": 0.8070013523101807, "learning_rate": 5e-05, "llm_loss": 0.5544318407773972, "loss": 2.5766, "loss_aux_layer_0": 0.017730712890625, "loss_aux_layer_1": 0.03729248046875, "loss_aux_layer_10": 0.06549072265625, "loss_aux_layer_11": 0.06982421875, "loss_aux_layer_12": 0.0743408203125, "loss_aux_layer_13": 0.0799560546875, "loss_aux_layer_14": 0.08837890625, "loss_aux_layer_15": 0.096923828125, "loss_aux_layer_16": 0.1064453125, "loss_aux_layer_17": 0.1143798828125, "loss_aux_layer_18": 0.1229248046875, "loss_aux_layer_19": 0.1265869140625, "loss_aux_layer_2": 0.05059814453125, "loss_aux_layer_20": 0.13427734375, "loss_aux_layer_21": 0.14306640625, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.061279296875, "loss_aux_layer_4": 0.06378173828125, "loss_aux_layer_5": 0.06549072265625, "loss_aux_layer_6": 0.068603515625, "loss_aux_layer_7": 0.06622314453125, "loss_aux_layer_8": 0.06561279296875, "loss_aux_layer_9": 0.06427001953125, "step": 2952, "total_loss": 0.6441576033830643 }, { "epoch": 0.584636705602851, "grad_norm": 1.319713830947876, "learning_rate": 5e-05, "llm_loss": 0.6311309933662415, "loss": 2.8715, "loss_aux_layer_0": 0.017669677734375, "loss_aux_layer_1": 0.035888671875, "loss_aux_layer_10": 0.06268310546875, "loss_aux_layer_11": 0.06689453125, "loss_aux_layer_12": 0.071533203125, "loss_aux_layer_13": 0.0771484375, "loss_aux_layer_14": 0.085693359375, "loss_aux_layer_15": 0.0947265625, "loss_aux_layer_16": 0.1036376953125, "loss_aux_layer_17": 0.1114501953125, "loss_aux_layer_18": 0.119384765625, "loss_aux_layer_19": 0.1229248046875, "loss_aux_layer_2": 0.04937744140625, "loss_aux_layer_20": 0.131103515625, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.197021484375, "loss_aux_layer_3": 0.05889892578125, "loss_aux_layer_4": 0.0614013671875, "loss_aux_layer_5": 0.0631103515625, "loss_aux_layer_6": 0.06573486328125, "loss_aux_layer_7": 0.063232421875, "loss_aux_layer_8": 0.06243896484375, "loss_aux_layer_9": 0.06134033203125, "step": 2953, "total_loss": 0.7178708910942078 }, { "epoch": 0.5848346862007523, "grad_norm": 0.8077349066734314, "learning_rate": 5e-05, "llm_loss": 0.5196457952260971, "loss": 2.4197, "loss_aux_layer_0": 0.0172119140625, "loss_aux_layer_1": 0.033935546875, "loss_aux_layer_10": 0.0604248046875, "loss_aux_layer_11": 0.064697265625, "loss_aux_layer_12": 0.0697021484375, "loss_aux_layer_13": 0.0758056640625, "loss_aux_layer_14": 0.0845947265625, "loss_aux_layer_15": 0.0933837890625, "loss_aux_layer_16": 0.1033935546875, "loss_aux_layer_17": 0.111328125, "loss_aux_layer_18": 0.119873046875, "loss_aux_layer_19": 0.1229248046875, "loss_aux_layer_2": 0.04638671875, "loss_aux_layer_20": 0.131103515625, "loss_aux_layer_21": 0.138916015625, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.197021484375, "loss_aux_layer_3": 0.05572509765625, "loss_aux_layer_4": 0.0579833984375, "loss_aux_layer_5": 0.059814453125, "loss_aux_layer_6": 0.0628662109375, "loss_aux_layer_7": 0.0606689453125, "loss_aux_layer_8": 0.060302734375, "loss_aux_layer_9": 0.05926513671875, "step": 2954, "total_loss": 0.6049166023731232 }, { "epoch": 0.5850326667986537, "grad_norm": 1.4471758604049683, "learning_rate": 5e-05, "llm_loss": 0.6382012218236923, "loss": 2.9052, "loss_aux_layer_0": 0.01708984375, "loss_aux_layer_1": 0.03570556640625, "loss_aux_layer_10": 0.06195068359375, "loss_aux_layer_11": 0.06658935546875, "loss_aux_layer_12": 0.0721435546875, "loss_aux_layer_13": 0.07861328125, "loss_aux_layer_14": 0.088134765625, "loss_aux_layer_15": 0.09765625, "loss_aux_layer_16": 0.1080322265625, "loss_aux_layer_17": 0.1165771484375, "loss_aux_layer_18": 0.125, "loss_aux_layer_19": 0.1282958984375, "loss_aux_layer_2": 0.04827880859375, "loss_aux_layer_20": 0.135986328125, "loss_aux_layer_21": 0.142578125, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.19921875, "loss_aux_layer_3": 0.0576171875, "loss_aux_layer_4": 0.05999755859375, "loss_aux_layer_5": 0.06182861328125, "loss_aux_layer_6": 0.06475830078125, "loss_aux_layer_7": 0.06256103515625, "loss_aux_layer_8": 0.06207275390625, "loss_aux_layer_9": 0.06072998046875, "step": 2955, "total_loss": 0.7262943536043167 }, { "epoch": 0.5852306473965552, "grad_norm": 1.2103276252746582, "learning_rate": 5e-05, "llm_loss": 0.5660713762044907, "loss": 2.6137, "loss_aux_layer_0": 0.017181396484375, "loss_aux_layer_1": 0.034912109375, "loss_aux_layer_10": 0.0625, "loss_aux_layer_11": 0.06634521484375, "loss_aux_layer_12": 0.0714111328125, "loss_aux_layer_13": 0.07763671875, "loss_aux_layer_14": 0.08642578125, "loss_aux_layer_15": 0.095458984375, "loss_aux_layer_16": 0.10546875, "loss_aux_layer_17": 0.1131591796875, "loss_aux_layer_18": 0.1217041015625, "loss_aux_layer_19": 0.12548828125, "loss_aux_layer_2": 0.04815673828125, "loss_aux_layer_20": 0.13330078125, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.20068359375, "loss_aux_layer_3": 0.05792236328125, "loss_aux_layer_4": 0.06048583984375, "loss_aux_layer_5": 0.06231689453125, "loss_aux_layer_6": 0.06512451171875, "loss_aux_layer_7": 0.06292724609375, "loss_aux_layer_8": 0.0625, "loss_aux_layer_9": 0.06103515625, "step": 2956, "total_loss": 0.6534193605184555 }, { "epoch": 0.5854286279944565, "grad_norm": 1.1529821157455444, "learning_rate": 5e-05, "llm_loss": 0.5289349555969238, "loss": 2.4783, "loss_aux_layer_0": 0.017303466796875, "loss_aux_layer_1": 0.037353515625, "loss_aux_layer_10": 0.06689453125, "loss_aux_layer_11": 0.0709228515625, "loss_aux_layer_12": 0.0758056640625, "loss_aux_layer_13": 0.0816650390625, "loss_aux_layer_14": 0.090087890625, "loss_aux_layer_15": 0.0986328125, "loss_aux_layer_16": 0.1082763671875, "loss_aux_layer_17": 0.115966796875, "loss_aux_layer_18": 0.123779296875, "loss_aux_layer_19": 0.1270751953125, "loss_aux_layer_2": 0.0518798828125, "loss_aux_layer_20": 0.134765625, "loss_aux_layer_21": 0.142822265625, "loss_aux_layer_22": 0.16357421875, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.06292724609375, "loss_aux_layer_4": 0.06561279296875, "loss_aux_layer_5": 0.0670166015625, "loss_aux_layer_6": 0.0703125, "loss_aux_layer_7": 0.06787109375, "loss_aux_layer_8": 0.0672607421875, "loss_aux_layer_9": 0.0657958984375, "step": 2957, "total_loss": 0.6195836067199707 }, { "epoch": 0.5856266085923579, "grad_norm": 1.086047649383545, "learning_rate": 5e-05, "llm_loss": 0.5859630256891251, "loss": 2.6957, "loss_aux_layer_0": 0.016754150390625, "loss_aux_layer_1": 0.0360107421875, "loss_aux_layer_10": 0.06463623046875, "loss_aux_layer_11": 0.06884765625, "loss_aux_layer_12": 0.0736083984375, "loss_aux_layer_13": 0.0791015625, "loss_aux_layer_14": 0.0875244140625, "loss_aux_layer_15": 0.0958251953125, "loss_aux_layer_16": 0.1053466796875, "loss_aux_layer_17": 0.1126708984375, "loss_aux_layer_18": 0.1212158203125, "loss_aux_layer_19": 0.124755859375, "loss_aux_layer_2": 0.04974365234375, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.197265625, "loss_aux_layer_3": 0.05975341796875, "loss_aux_layer_4": 0.0625, "loss_aux_layer_5": 0.0640869140625, "loss_aux_layer_6": 0.0670166015625, "loss_aux_layer_7": 0.06475830078125, "loss_aux_layer_8": 0.06414794921875, "loss_aux_layer_9": 0.06317138671875, "step": 2958, "total_loss": 0.6739330589771271 }, { "epoch": 0.5858245891902594, "grad_norm": 1.0396963357925415, "learning_rate": 5e-05, "llm_loss": 0.7000039666891098, "loss": 3.1418, "loss_aux_layer_0": 0.017425537109375, "loss_aux_layer_1": 0.035797119140625, "loss_aux_layer_10": 0.06256103515625, "loss_aux_layer_11": 0.06646728515625, "loss_aux_layer_12": 0.07098388671875, "loss_aux_layer_13": 0.076416015625, "loss_aux_layer_14": 0.0849609375, "loss_aux_layer_15": 0.0933837890625, "loss_aux_layer_16": 0.1025390625, "loss_aux_layer_17": 0.11083984375, "loss_aux_layer_18": 0.1192626953125, "loss_aux_layer_19": 0.121826171875, "loss_aux_layer_2": 0.04791259765625, "loss_aux_layer_20": 0.1292724609375, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.190673828125, "loss_aux_layer_3": 0.05804443359375, "loss_aux_layer_4": 0.060302734375, "loss_aux_layer_5": 0.0618896484375, "loss_aux_layer_6": 0.06488037109375, "loss_aux_layer_7": 0.062744140625, "loss_aux_layer_8": 0.062255859375, "loss_aux_layer_9": 0.06109619140625, "step": 2959, "total_loss": 0.7854615300893784 }, { "epoch": 0.5860225697881608, "grad_norm": 1.2026116847991943, "learning_rate": 5e-05, "llm_loss": 0.5866203978657722, "loss": 2.7049, "loss_aux_layer_0": 0.017852783203125, "loss_aux_layer_1": 0.036346435546875, "loss_aux_layer_10": 0.0643310546875, "loss_aux_layer_11": 0.06842041015625, "loss_aux_layer_12": 0.072998046875, "loss_aux_layer_13": 0.0792236328125, "loss_aux_layer_14": 0.0885009765625, "loss_aux_layer_15": 0.0977783203125, "loss_aux_layer_16": 0.1080322265625, "loss_aux_layer_17": 0.1162109375, "loss_aux_layer_18": 0.125244140625, "loss_aux_layer_19": 0.1282958984375, "loss_aux_layer_2": 0.0501708984375, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.167236328125, "loss_aux_layer_23": 0.204833984375, "loss_aux_layer_3": 0.0599365234375, "loss_aux_layer_4": 0.0621337890625, "loss_aux_layer_5": 0.063720703125, "loss_aux_layer_6": 0.066650390625, "loss_aux_layer_7": 0.06439208984375, "loss_aux_layer_8": 0.0638427734375, "loss_aux_layer_9": 0.0628662109375, "step": 2960, "total_loss": 0.6762218326330185 }, { "epoch": 0.5862205503860621, "grad_norm": 0.9889198541641235, "learning_rate": 5e-05, "llm_loss": 0.5906494855880737, "loss": 2.7138, "loss_aux_layer_0": 0.017547607421875, "loss_aux_layer_1": 0.0350341796875, "loss_aux_layer_10": 0.06353759765625, "loss_aux_layer_11": 0.0677490234375, "loss_aux_layer_12": 0.072509765625, "loss_aux_layer_13": 0.0782470703125, "loss_aux_layer_14": 0.0875244140625, "loss_aux_layer_15": 0.096435546875, "loss_aux_layer_16": 0.1060791015625, "loss_aux_layer_17": 0.11376953125, "loss_aux_layer_18": 0.1224365234375, "loss_aux_layer_19": 0.12548828125, "loss_aux_layer_2": 0.04840087890625, "loss_aux_layer_20": 0.133544921875, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.199462890625, "loss_aux_layer_3": 0.05828857421875, "loss_aux_layer_4": 0.060791015625, "loss_aux_layer_5": 0.0626220703125, "loss_aux_layer_6": 0.06591796875, "loss_aux_layer_7": 0.06365966796875, "loss_aux_layer_8": 0.06329345703125, "loss_aux_layer_9": 0.06219482421875, "step": 2961, "total_loss": 0.6784426718950272 }, { "epoch": 0.5864185309839636, "grad_norm": 0.8155779242515564, "learning_rate": 5e-05, "llm_loss": 0.566554993391037, "loss": 2.597, "loss_aux_layer_0": 0.018646240234375, "loss_aux_layer_1": 0.033203125, "loss_aux_layer_10": 0.0579833984375, "loss_aux_layer_11": 0.0616455078125, "loss_aux_layer_12": 0.0660400390625, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.080078125, "loss_aux_layer_15": 0.0889892578125, "loss_aux_layer_16": 0.098876953125, "loss_aux_layer_17": 0.10693359375, "loss_aux_layer_18": 0.1156005859375, "loss_aux_layer_19": 0.1201171875, "loss_aux_layer_2": 0.0445556640625, "loss_aux_layer_20": 0.128662109375, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.05401611328125, "loss_aux_layer_4": 0.05615234375, "loss_aux_layer_5": 0.0576171875, "loss_aux_layer_6": 0.060302734375, "loss_aux_layer_7": 0.05828857421875, "loss_aux_layer_8": 0.05780029296875, "loss_aux_layer_9": 0.0567626953125, "step": 2962, "total_loss": 0.6492615789175034 }, { "epoch": 0.586616511581865, "grad_norm": 1.1371161937713623, "learning_rate": 5e-05, "llm_loss": 0.5668377503752708, "loss": 2.633, "loss_aux_layer_0": 0.0174560546875, "loss_aux_layer_1": 0.037353515625, "loss_aux_layer_10": 0.0662841796875, "loss_aux_layer_11": 0.07080078125, "loss_aux_layer_12": 0.076171875, "loss_aux_layer_13": 0.0821533203125, "loss_aux_layer_14": 0.091552734375, "loss_aux_layer_15": 0.1009521484375, "loss_aux_layer_16": 0.1107177734375, "loss_aux_layer_17": 0.1185302734375, "loss_aux_layer_18": 0.1270751953125, "loss_aux_layer_19": 0.130126953125, "loss_aux_layer_2": 0.05181884765625, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.1455078125, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.204833984375, "loss_aux_layer_3": 0.06201171875, "loss_aux_layer_4": 0.06439208984375, "loss_aux_layer_5": 0.0660400390625, "loss_aux_layer_6": 0.0687255859375, "loss_aux_layer_7": 0.066650390625, "loss_aux_layer_8": 0.0657958984375, "loss_aux_layer_9": 0.064453125, "step": 2963, "total_loss": 0.658246248960495 }, { "epoch": 0.5868144921797663, "grad_norm": 1.1703070402145386, "learning_rate": 5e-05, "llm_loss": 0.5859590768814087, "loss": 2.716, "loss_aux_layer_0": 0.016876220703125, "loss_aux_layer_1": 0.038330078125, "loss_aux_layer_10": 0.069091796875, "loss_aux_layer_11": 0.0736083984375, "loss_aux_layer_12": 0.078857421875, "loss_aux_layer_13": 0.0848388671875, "loss_aux_layer_14": 0.09375, "loss_aux_layer_15": 0.1026611328125, "loss_aux_layer_16": 0.1121826171875, "loss_aux_layer_17": 0.119873046875, "loss_aux_layer_18": 0.1280517578125, "loss_aux_layer_19": 0.1307373046875, "loss_aux_layer_2": 0.0537109375, "loss_aux_layer_20": 0.1376953125, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.203125, "loss_aux_layer_3": 0.06463623046875, "loss_aux_layer_4": 0.0672607421875, "loss_aux_layer_5": 0.06884765625, "loss_aux_layer_6": 0.0718994140625, "loss_aux_layer_7": 0.06982421875, "loss_aux_layer_8": 0.0689697265625, "loss_aux_layer_9": 0.0677490234375, "step": 2964, "total_loss": 0.6789887100458145 }, { "epoch": 0.5870124727776678, "grad_norm": 1.2072094678878784, "learning_rate": 5e-05, "llm_loss": 0.5986211448907852, "loss": 2.7418, "loss_aux_layer_0": 0.01861572265625, "loss_aux_layer_1": 0.03564453125, "loss_aux_layer_10": 0.06256103515625, "loss_aux_layer_11": 0.066650390625, "loss_aux_layer_12": 0.0712890625, "loss_aux_layer_13": 0.076904296875, "loss_aux_layer_14": 0.0855712890625, "loss_aux_layer_15": 0.0946044921875, "loss_aux_layer_16": 0.10400390625, "loss_aux_layer_17": 0.1119384765625, "loss_aux_layer_18": 0.120361328125, "loss_aux_layer_19": 0.1236572265625, "loss_aux_layer_2": 0.04833984375, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.199462890625, "loss_aux_layer_3": 0.05859375, "loss_aux_layer_4": 0.06072998046875, "loss_aux_layer_5": 0.0621337890625, "loss_aux_layer_6": 0.065185546875, "loss_aux_layer_7": 0.06268310546875, "loss_aux_layer_8": 0.06243896484375, "loss_aux_layer_9": 0.0609130859375, "step": 2965, "total_loss": 0.6854395121335983 }, { "epoch": 0.5872104533755692, "grad_norm": 0.8859086036682129, "learning_rate": 5e-05, "llm_loss": 0.5813353210687637, "loss": 2.6787, "loss_aux_layer_0": 0.017486572265625, "loss_aux_layer_1": 0.03717041015625, "loss_aux_layer_10": 0.06500244140625, "loss_aux_layer_11": 0.0692138671875, "loss_aux_layer_12": 0.0743408203125, "loss_aux_layer_13": 0.080078125, "loss_aux_layer_14": 0.0888671875, "loss_aux_layer_15": 0.09716796875, "loss_aux_layer_16": 0.1063232421875, "loss_aux_layer_17": 0.114013671875, "loss_aux_layer_18": 0.12158203125, "loss_aux_layer_19": 0.1240234375, "loss_aux_layer_2": 0.05047607421875, "loss_aux_layer_20": 0.1312255859375, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.0606689453125, "loss_aux_layer_4": 0.06317138671875, "loss_aux_layer_5": 0.06451416015625, "loss_aux_layer_6": 0.06732177734375, "loss_aux_layer_7": 0.06512451171875, "loss_aux_layer_8": 0.064697265625, "loss_aux_layer_9": 0.0635986328125, "step": 2966, "total_loss": 0.6696840971708298 }, { "epoch": 0.5874084339734706, "grad_norm": 1.0795965194702148, "learning_rate": 5e-05, "llm_loss": 0.6205632537603378, "loss": 2.8358, "loss_aux_layer_0": 0.018035888671875, "loss_aux_layer_1": 0.036376953125, "loss_aux_layer_10": 0.06329345703125, "loss_aux_layer_11": 0.0675048828125, "loss_aux_layer_12": 0.072021484375, "loss_aux_layer_13": 0.077880859375, "loss_aux_layer_14": 0.0867919921875, "loss_aux_layer_15": 0.095703125, "loss_aux_layer_16": 0.105712890625, "loss_aux_layer_17": 0.11474609375, "loss_aux_layer_18": 0.1231689453125, "loss_aux_layer_19": 0.127197265625, "loss_aux_layer_2": 0.0496826171875, "loss_aux_layer_20": 0.13525390625, "loss_aux_layer_21": 0.142578125, "loss_aux_layer_22": 0.163330078125, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.0595703125, "loss_aux_layer_4": 0.06219482421875, "loss_aux_layer_5": 0.0638427734375, "loss_aux_layer_6": 0.0662841796875, "loss_aux_layer_7": 0.06414794921875, "loss_aux_layer_8": 0.06365966796875, "loss_aux_layer_9": 0.06231689453125, "step": 2967, "total_loss": 0.7089511454105377 }, { "epoch": 0.587606414571372, "grad_norm": 1.0993013381958008, "learning_rate": 5e-05, "llm_loss": 0.6152328699827194, "loss": 2.803, "loss_aux_layer_0": 0.01751708984375, "loss_aux_layer_1": 0.034912109375, "loss_aux_layer_10": 0.0614013671875, "loss_aux_layer_11": 0.0653076171875, "loss_aux_layer_12": 0.0699462890625, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.084228515625, "loss_aux_layer_15": 0.093017578125, "loss_aux_layer_16": 0.1024169921875, "loss_aux_layer_17": 0.11083984375, "loss_aux_layer_18": 0.118896484375, "loss_aux_layer_19": 0.1221923828125, "loss_aux_layer_2": 0.04779052734375, "loss_aux_layer_20": 0.130126953125, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.19482421875, "loss_aux_layer_3": 0.05767822265625, "loss_aux_layer_4": 0.06011962890625, "loss_aux_layer_5": 0.0618896484375, "loss_aux_layer_6": 0.0645751953125, "loss_aux_layer_7": 0.06207275390625, "loss_aux_layer_8": 0.0614013671875, "loss_aux_layer_9": 0.06005859375, "step": 2968, "total_loss": 0.7007414549589157 }, { "epoch": 0.5878043951692734, "grad_norm": 0.7779133319854736, "learning_rate": 5e-05, "llm_loss": 0.61720772087574, "loss": 2.8127, "loss_aux_layer_0": 0.0181884765625, "loss_aux_layer_1": 0.0333251953125, "loss_aux_layer_10": 0.06024169921875, "loss_aux_layer_11": 0.064208984375, "loss_aux_layer_12": 0.069091796875, "loss_aux_layer_13": 0.0745849609375, "loss_aux_layer_14": 0.083740234375, "loss_aux_layer_15": 0.093017578125, "loss_aux_layer_16": 0.102783203125, "loss_aux_layer_17": 0.111572265625, "loss_aux_layer_18": 0.12109375, "loss_aux_layer_19": 0.125732421875, "loss_aux_layer_2": 0.0455322265625, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.2041015625, "loss_aux_layer_3": 0.05511474609375, "loss_aux_layer_4": 0.0572509765625, "loss_aux_layer_5": 0.0589599609375, "loss_aux_layer_6": 0.0618896484375, "loss_aux_layer_7": 0.059814453125, "loss_aux_layer_8": 0.05963134765625, "loss_aux_layer_9": 0.05865478515625, "step": 2969, "total_loss": 0.7031631916761398 }, { "epoch": 0.5880023757671748, "grad_norm": 0.9659715294837952, "learning_rate": 5e-05, "llm_loss": 0.6319541335105896, "loss": 2.8823, "loss_aux_layer_0": 0.017242431640625, "loss_aux_layer_1": 0.0372314453125, "loss_aux_layer_10": 0.0645751953125, "loss_aux_layer_11": 0.0689697265625, "loss_aux_layer_12": 0.0736083984375, "loss_aux_layer_13": 0.0792236328125, "loss_aux_layer_14": 0.087646484375, "loss_aux_layer_15": 0.09619140625, "loss_aux_layer_16": 0.1058349609375, "loss_aux_layer_17": 0.11376953125, "loss_aux_layer_18": 0.121826171875, "loss_aux_layer_19": 0.125, "loss_aux_layer_2": 0.050537109375, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.162353515625, "loss_aux_layer_23": 0.20068359375, "loss_aux_layer_3": 0.0604248046875, "loss_aux_layer_4": 0.06298828125, "loss_aux_layer_5": 0.06463623046875, "loss_aux_layer_6": 0.067626953125, "loss_aux_layer_7": 0.0653076171875, "loss_aux_layer_8": 0.06494140625, "loss_aux_layer_9": 0.06341552734375, "step": 2970, "total_loss": 0.7205687612295151 }, { "epoch": 0.5882003563650762, "grad_norm": 0.7950299978256226, "learning_rate": 5e-05, "llm_loss": 0.6107423603534698, "loss": 2.806, "loss_aux_layer_0": 0.017120361328125, "loss_aux_layer_1": 0.03778076171875, "loss_aux_layer_10": 0.067626953125, "loss_aux_layer_11": 0.0721435546875, "loss_aux_layer_12": 0.07666015625, "loss_aux_layer_13": 0.082275390625, "loss_aux_layer_14": 0.09130859375, "loss_aux_layer_15": 0.099853515625, "loss_aux_layer_16": 0.109375, "loss_aux_layer_17": 0.116943359375, "loss_aux_layer_18": 0.125244140625, "loss_aux_layer_19": 0.127685546875, "loss_aux_layer_2": 0.05169677734375, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.197998046875, "loss_aux_layer_3": 0.062255859375, "loss_aux_layer_4": 0.0650634765625, "loss_aux_layer_5": 0.06689453125, "loss_aux_layer_6": 0.0703125, "loss_aux_layer_7": 0.0679931640625, "loss_aux_layer_8": 0.0675048828125, "loss_aux_layer_9": 0.06622314453125, "step": 2971, "total_loss": 0.7015115022659302 }, { "epoch": 0.5883983369629776, "grad_norm": 0.8298354744911194, "learning_rate": 5e-05, "llm_loss": 0.49793053418397903, "loss": 2.3302, "loss_aux_layer_0": 0.018402099609375, "loss_aux_layer_1": 0.03497314453125, "loss_aux_layer_10": 0.06011962890625, "loss_aux_layer_11": 0.06378173828125, "loss_aux_layer_12": 0.068603515625, "loss_aux_layer_13": 0.0743408203125, "loss_aux_layer_14": 0.0833740234375, "loss_aux_layer_15": 0.092041015625, "loss_aux_layer_16": 0.1015625, "loss_aux_layer_17": 0.1097412109375, "loss_aux_layer_18": 0.1180419921875, "loss_aux_layer_19": 0.121826171875, "loss_aux_layer_2": 0.04656982421875, "loss_aux_layer_20": 0.1300048828125, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.05584716796875, "loss_aux_layer_4": 0.05841064453125, "loss_aux_layer_5": 0.059814453125, "loss_aux_layer_6": 0.0625, "loss_aux_layer_7": 0.0606689453125, "loss_aux_layer_8": 0.06005859375, "loss_aux_layer_9": 0.05889892578125, "step": 2972, "total_loss": 0.5825400426983833 }, { "epoch": 0.588596317560879, "grad_norm": 0.950456440448761, "learning_rate": 5e-05, "llm_loss": 0.5746317654848099, "loss": 2.6594, "loss_aux_layer_0": 0.0172119140625, "loss_aux_layer_1": 0.03802490234375, "loss_aux_layer_10": 0.0672607421875, "loss_aux_layer_11": 0.0716552734375, "loss_aux_layer_12": 0.076171875, "loss_aux_layer_13": 0.0819091796875, "loss_aux_layer_14": 0.090576171875, "loss_aux_layer_15": 0.0989990234375, "loss_aux_layer_16": 0.1082763671875, "loss_aux_layer_17": 0.1151123046875, "loss_aux_layer_18": 0.1234130859375, "loss_aux_layer_19": 0.1253662109375, "loss_aux_layer_2": 0.0513916015625, "loss_aux_layer_20": 0.1336669921875, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.0618896484375, "loss_aux_layer_4": 0.0643310546875, "loss_aux_layer_5": 0.066162109375, "loss_aux_layer_6": 0.0694580078125, "loss_aux_layer_7": 0.0673828125, "loss_aux_layer_8": 0.06689453125, "loss_aux_layer_9": 0.065673828125, "step": 2973, "total_loss": 0.664843738079071 }, { "epoch": 0.5887942981587805, "grad_norm": 1.033046841621399, "learning_rate": 5e-05, "llm_loss": 0.5766390413045883, "loss": 2.6549, "loss_aux_layer_0": 0.01800537109375, "loss_aux_layer_1": 0.03466796875, "loss_aux_layer_10": 0.06231689453125, "loss_aux_layer_11": 0.06622314453125, "loss_aux_layer_12": 0.070556640625, "loss_aux_layer_13": 0.076416015625, "loss_aux_layer_14": 0.08544921875, "loss_aux_layer_15": 0.0943603515625, "loss_aux_layer_16": 0.1043701171875, "loss_aux_layer_17": 0.11279296875, "loss_aux_layer_18": 0.1217041015625, "loss_aux_layer_19": 0.126220703125, "loss_aux_layer_2": 0.0474853515625, "loss_aux_layer_20": 0.134765625, "loss_aux_layer_21": 0.142578125, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.0567626953125, "loss_aux_layer_4": 0.059814453125, "loss_aux_layer_5": 0.06182861328125, "loss_aux_layer_6": 0.06494140625, "loss_aux_layer_7": 0.0626220703125, "loss_aux_layer_8": 0.0621337890625, "loss_aux_layer_9": 0.06085205078125, "step": 2974, "total_loss": 0.6637331694364548 }, { "epoch": 0.5889922787566818, "grad_norm": 0.8740327954292297, "learning_rate": 5e-05, "llm_loss": 0.646135225892067, "loss": 2.9246, "loss_aux_layer_0": 0.016937255859375, "loss_aux_layer_1": 0.0347900390625, "loss_aux_layer_10": 0.06134033203125, "loss_aux_layer_11": 0.0654296875, "loss_aux_layer_12": 0.06982421875, "loss_aux_layer_13": 0.0753173828125, "loss_aux_layer_14": 0.083740234375, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.10205078125, "loss_aux_layer_17": 0.10986328125, "loss_aux_layer_18": 0.1177978515625, "loss_aux_layer_19": 0.1212158203125, "loss_aux_layer_2": 0.0474853515625, "loss_aux_layer_20": 0.1287841796875, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.193603515625, "loss_aux_layer_3": 0.05706787109375, "loss_aux_layer_4": 0.05999755859375, "loss_aux_layer_5": 0.06109619140625, "loss_aux_layer_6": 0.064208984375, "loss_aux_layer_7": 0.06195068359375, "loss_aux_layer_8": 0.06134033203125, "loss_aux_layer_9": 0.0599365234375, "step": 2975, "total_loss": 0.7311385869979858 }, { "epoch": 0.5891902593545832, "grad_norm": 1.1237808465957642, "learning_rate": 5e-05, "llm_loss": 0.641731932759285, "loss": 2.9163, "loss_aux_layer_0": 0.017486572265625, "loss_aux_layer_1": 0.03643798828125, "loss_aux_layer_10": 0.0625, "loss_aux_layer_11": 0.06695556640625, "loss_aux_layer_12": 0.07159423828125, "loss_aux_layer_13": 0.07763671875, "loss_aux_layer_14": 0.0863037109375, "loss_aux_layer_15": 0.09521484375, "loss_aux_layer_16": 0.1048583984375, "loss_aux_layer_17": 0.113037109375, "loss_aux_layer_18": 0.1217041015625, "loss_aux_layer_19": 0.1253662109375, "loss_aux_layer_2": 0.04888916015625, "loss_aux_layer_20": 0.13330078125, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.199462890625, "loss_aux_layer_3": 0.05816650390625, "loss_aux_layer_4": 0.0606689453125, "loss_aux_layer_5": 0.06201171875, "loss_aux_layer_6": 0.0648193359375, "loss_aux_layer_7": 0.06280517578125, "loss_aux_layer_8": 0.06243896484375, "loss_aux_layer_9": 0.06097412109375, "step": 2976, "total_loss": 0.729072630405426 }, { "epoch": 0.5893882399524847, "grad_norm": 0.9976817965507507, "learning_rate": 5e-05, "llm_loss": 0.5818267166614532, "loss": 2.6709, "loss_aux_layer_0": 0.01898193359375, "loss_aux_layer_1": 0.03564453125, "loss_aux_layer_10": 0.0616455078125, "loss_aux_layer_11": 0.06561279296875, "loss_aux_layer_12": 0.070556640625, "loss_aux_layer_13": 0.076171875, "loss_aux_layer_14": 0.0849609375, "loss_aux_layer_15": 0.0936279296875, "loss_aux_layer_16": 0.10302734375, "loss_aux_layer_17": 0.1109619140625, "loss_aux_layer_18": 0.1195068359375, "loss_aux_layer_19": 0.12255859375, "loss_aux_layer_2": 0.04833984375, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.0574951171875, "loss_aux_layer_4": 0.05999755859375, "loss_aux_layer_5": 0.06158447265625, "loss_aux_layer_6": 0.0645751953125, "loss_aux_layer_7": 0.06231689453125, "loss_aux_layer_8": 0.06170654296875, "loss_aux_layer_9": 0.06048583984375, "step": 2977, "total_loss": 0.6677245050668716 }, { "epoch": 0.589586220550386, "grad_norm": 0.8723309636116028, "learning_rate": 5e-05, "llm_loss": 0.5174033045768738, "loss": 2.4168, "loss_aux_layer_0": 0.0172119140625, "loss_aux_layer_1": 0.03564453125, "loss_aux_layer_10": 0.0635986328125, "loss_aux_layer_11": 0.067626953125, "loss_aux_layer_12": 0.0718994140625, "loss_aux_layer_13": 0.07763671875, "loss_aux_layer_14": 0.0863037109375, "loss_aux_layer_15": 0.094482421875, "loss_aux_layer_16": 0.1038818359375, "loss_aux_layer_17": 0.111572265625, "loss_aux_layer_18": 0.1199951171875, "loss_aux_layer_19": 0.123046875, "loss_aux_layer_2": 0.0484619140625, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.05828857421875, "loss_aux_layer_4": 0.06097412109375, "loss_aux_layer_5": 0.06292724609375, "loss_aux_layer_6": 0.066162109375, "loss_aux_layer_7": 0.0640869140625, "loss_aux_layer_8": 0.0635986328125, "loss_aux_layer_9": 0.0623779296875, "step": 2978, "total_loss": 0.6042038798332214 }, { "epoch": 0.5897842011482874, "grad_norm": 0.9726953506469727, "learning_rate": 5e-05, "llm_loss": 0.520762637257576, "loss": 2.4429, "loss_aux_layer_0": 0.017425537109375, "loss_aux_layer_1": 0.03619384765625, "loss_aux_layer_10": 0.0645751953125, "loss_aux_layer_11": 0.069091796875, "loss_aux_layer_12": 0.0740966796875, "loss_aux_layer_13": 0.0802001953125, "loss_aux_layer_14": 0.08935546875, "loss_aux_layer_15": 0.098876953125, "loss_aux_layer_16": 0.1087646484375, "loss_aux_layer_17": 0.1170654296875, "loss_aux_layer_18": 0.125732421875, "loss_aux_layer_19": 0.1290283203125, "loss_aux_layer_2": 0.04913330078125, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.167724609375, "loss_aux_layer_23": 0.20654296875, "loss_aux_layer_3": 0.059326171875, "loss_aux_layer_4": 0.06195068359375, "loss_aux_layer_5": 0.0638427734375, "loss_aux_layer_6": 0.06640625, "loss_aux_layer_7": 0.0645751953125, "loss_aux_layer_8": 0.06390380859375, "loss_aux_layer_9": 0.06280517578125, "step": 2979, "total_loss": 0.6107233911752701 }, { "epoch": 0.5899821817461889, "grad_norm": 0.9919626116752625, "learning_rate": 5e-05, "llm_loss": 0.5683285742998123, "loss": 2.615, "loss_aux_layer_0": 0.017730712890625, "loss_aux_layer_1": 0.0350341796875, "loss_aux_layer_10": 0.06146240234375, "loss_aux_layer_11": 0.065185546875, "loss_aux_layer_12": 0.069580078125, "loss_aux_layer_13": 0.074951171875, "loss_aux_layer_14": 0.083740234375, "loss_aux_layer_15": 0.09228515625, "loss_aux_layer_16": 0.1021728515625, "loss_aux_layer_17": 0.1107177734375, "loss_aux_layer_18": 0.119140625, "loss_aux_layer_19": 0.1229248046875, "loss_aux_layer_2": 0.04718017578125, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.05657958984375, "loss_aux_layer_4": 0.05938720703125, "loss_aux_layer_5": 0.0609130859375, "loss_aux_layer_6": 0.06390380859375, "loss_aux_layer_7": 0.06182861328125, "loss_aux_layer_8": 0.06134033203125, "loss_aux_layer_9": 0.05999755859375, "step": 2980, "total_loss": 0.6537477523088455 }, { "epoch": 0.5901801623440903, "grad_norm": 0.9821814298629761, "learning_rate": 5e-05, "llm_loss": 0.6557115763425827, "loss": 2.9721, "loss_aux_layer_0": 0.017181396484375, "loss_aux_layer_1": 0.036376953125, "loss_aux_layer_10": 0.0643310546875, "loss_aux_layer_11": 0.0687255859375, "loss_aux_layer_12": 0.0732421875, "loss_aux_layer_13": 0.078857421875, "loss_aux_layer_14": 0.0875244140625, "loss_aux_layer_15": 0.095703125, "loss_aux_layer_16": 0.1048583984375, "loss_aux_layer_17": 0.1121826171875, "loss_aux_layer_18": 0.1197509765625, "loss_aux_layer_19": 0.12255859375, "loss_aux_layer_2": 0.0494384765625, "loss_aux_layer_20": 0.13037109375, "loss_aux_layer_21": 0.136962890625, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.0595703125, "loss_aux_layer_4": 0.06243896484375, "loss_aux_layer_5": 0.06414794921875, "loss_aux_layer_6": 0.067138671875, "loss_aux_layer_7": 0.0650634765625, "loss_aux_layer_8": 0.06439208984375, "loss_aux_layer_9": 0.06304931640625, "step": 2981, "total_loss": 0.7430133819580078 }, { "epoch": 0.5903781429419916, "grad_norm": 0.989920437335968, "learning_rate": 5e-05, "llm_loss": 0.6790968179702759, "loss": 3.0618, "loss_aux_layer_0": 0.0174560546875, "loss_aux_layer_1": 0.0352783203125, "loss_aux_layer_10": 0.0640869140625, "loss_aux_layer_11": 0.068115234375, "loss_aux_layer_12": 0.07269287109375, "loss_aux_layer_13": 0.07861328125, "loss_aux_layer_14": 0.0867919921875, "loss_aux_layer_15": 0.094970703125, "loss_aux_layer_16": 0.10400390625, "loss_aux_layer_17": 0.1116943359375, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.12109375, "loss_aux_layer_2": 0.0479736328125, "loss_aux_layer_20": 0.1282958984375, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.05804443359375, "loss_aux_layer_4": 0.06103515625, "loss_aux_layer_5": 0.062744140625, "loss_aux_layer_6": 0.0657958984375, "loss_aux_layer_7": 0.06414794921875, "loss_aux_layer_8": 0.06365966796875, "loss_aux_layer_9": 0.0626220703125, "step": 2982, "total_loss": 0.7654544115066528 }, { "epoch": 0.5905761235398931, "grad_norm": 0.8335828185081482, "learning_rate": 5e-05, "llm_loss": 0.5198038518428802, "loss": 2.4305, "loss_aux_layer_0": 0.01690673828125, "loss_aux_layer_1": 0.0357666015625, "loss_aux_layer_10": 0.06365966796875, "loss_aux_layer_11": 0.0677490234375, "loss_aux_layer_12": 0.0726318359375, "loss_aux_layer_13": 0.0780029296875, "loss_aux_layer_14": 0.0870361328125, "loss_aux_layer_15": 0.0955810546875, "loss_aux_layer_16": 0.105224609375, "loss_aux_layer_17": 0.1134033203125, "loss_aux_layer_18": 0.1219482421875, "loss_aux_layer_19": 0.1251220703125, "loss_aux_layer_2": 0.04876708984375, "loss_aux_layer_20": 0.13330078125, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.19921875, "loss_aux_layer_3": 0.05908203125, "loss_aux_layer_4": 0.06170654296875, "loss_aux_layer_5": 0.06353759765625, "loss_aux_layer_6": 0.0665283203125, "loss_aux_layer_7": 0.064453125, "loss_aux_layer_8": 0.06365966796875, "loss_aux_layer_9": 0.062255859375, "step": 2983, "total_loss": 0.6076293140649796 }, { "epoch": 0.5907741041377945, "grad_norm": 0.9826087355613708, "learning_rate": 5e-05, "llm_loss": 0.6495579928159714, "loss": 2.9582, "loss_aux_layer_0": 0.01751708984375, "loss_aux_layer_1": 0.03759765625, "loss_aux_layer_10": 0.06622314453125, "loss_aux_layer_11": 0.07080078125, "loss_aux_layer_12": 0.07568359375, "loss_aux_layer_13": 0.0810546875, "loss_aux_layer_14": 0.090087890625, "loss_aux_layer_15": 0.0992431640625, "loss_aux_layer_16": 0.1087646484375, "loss_aux_layer_17": 0.116455078125, "loss_aux_layer_18": 0.123779296875, "loss_aux_layer_19": 0.126708984375, "loss_aux_layer_2": 0.05072021484375, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.06103515625, "loss_aux_layer_4": 0.0638427734375, "loss_aux_layer_5": 0.06573486328125, "loss_aux_layer_6": 0.069091796875, "loss_aux_layer_7": 0.06658935546875, "loss_aux_layer_8": 0.06597900390625, "loss_aux_layer_9": 0.064697265625, "step": 2984, "total_loss": 0.739551305770874 }, { "epoch": 0.590972084735696, "grad_norm": 0.8147421479225159, "learning_rate": 5e-05, "llm_loss": 0.5764967948198318, "loss": 2.6579, "loss_aux_layer_0": 0.017333984375, "loss_aux_layer_1": 0.03619384765625, "loss_aux_layer_10": 0.06378173828125, "loss_aux_layer_11": 0.068115234375, "loss_aux_layer_12": 0.0728759765625, "loss_aux_layer_13": 0.078857421875, "loss_aux_layer_14": 0.0877685546875, "loss_aux_layer_15": 0.0966796875, "loss_aux_layer_16": 0.1060791015625, "loss_aux_layer_17": 0.1143798828125, "loss_aux_layer_18": 0.12255859375, "loss_aux_layer_19": 0.12548828125, "loss_aux_layer_2": 0.049072265625, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.05902099609375, "loss_aux_layer_4": 0.06182861328125, "loss_aux_layer_5": 0.06341552734375, "loss_aux_layer_6": 0.06640625, "loss_aux_layer_7": 0.06427001953125, "loss_aux_layer_8": 0.0635986328125, "loss_aux_layer_9": 0.0625, "step": 2985, "total_loss": 0.6644664108753204 }, { "epoch": 0.5911700653335973, "grad_norm": 0.9549041986465454, "learning_rate": 5e-05, "llm_loss": 0.6305827796459198, "loss": 2.8703, "loss_aux_layer_0": 0.0167236328125, "loss_aux_layer_1": 0.0357666015625, "loss_aux_layer_10": 0.06182861328125, "loss_aux_layer_11": 0.06622314453125, "loss_aux_layer_12": 0.0712890625, "loss_aux_layer_13": 0.0771484375, "loss_aux_layer_14": 0.0859375, "loss_aux_layer_15": 0.0948486328125, "loss_aux_layer_16": 0.1048583984375, "loss_aux_layer_17": 0.11328125, "loss_aux_layer_18": 0.1219482421875, "loss_aux_layer_19": 0.1253662109375, "loss_aux_layer_2": 0.04791259765625, "loss_aux_layer_20": 0.133544921875, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.20068359375, "loss_aux_layer_3": 0.0574951171875, "loss_aux_layer_4": 0.05987548828125, "loss_aux_layer_5": 0.0611572265625, "loss_aux_layer_6": 0.06390380859375, "loss_aux_layer_7": 0.0621337890625, "loss_aux_layer_8": 0.06158447265625, "loss_aux_layer_9": 0.060546875, "step": 2986, "total_loss": 0.71758733689785 }, { "epoch": 0.5913680459314987, "grad_norm": 0.9280440211296082, "learning_rate": 5e-05, "llm_loss": 0.5957162082195282, "loss": 2.7269, "loss_aux_layer_0": 0.01641845703125, "loss_aux_layer_1": 0.035491943359375, "loss_aux_layer_10": 0.06201171875, "loss_aux_layer_11": 0.066162109375, "loss_aux_layer_12": 0.07098388671875, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.085205078125, "loss_aux_layer_15": 0.093994140625, "loss_aux_layer_16": 0.1036376953125, "loss_aux_layer_17": 0.1119384765625, "loss_aux_layer_18": 0.1201171875, "loss_aux_layer_19": 0.123046875, "loss_aux_layer_2": 0.0479736328125, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.0574951171875, "loss_aux_layer_4": 0.0599365234375, "loss_aux_layer_5": 0.06134033203125, "loss_aux_layer_6": 0.06414794921875, "loss_aux_layer_7": 0.06219482421875, "loss_aux_layer_8": 0.06195068359375, "loss_aux_layer_9": 0.06060791015625, "step": 2987, "total_loss": 0.6817155182361603 }, { "epoch": 0.5915660265294002, "grad_norm": 0.8388981223106384, "learning_rate": 5e-05, "llm_loss": 0.5741352736949921, "loss": 2.6501, "loss_aux_layer_0": 0.01678466796875, "loss_aux_layer_1": 0.03558349609375, "loss_aux_layer_10": 0.06402587890625, "loss_aux_layer_11": 0.0684814453125, "loss_aux_layer_12": 0.073486328125, "loss_aux_layer_13": 0.0794677734375, "loss_aux_layer_14": 0.08837890625, "loss_aux_layer_15": 0.09765625, "loss_aux_layer_16": 0.107421875, "loss_aux_layer_17": 0.115478515625, "loss_aux_layer_18": 0.12353515625, "loss_aux_layer_19": 0.126953125, "loss_aux_layer_2": 0.04833984375, "loss_aux_layer_20": 0.134521484375, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.162109375, "loss_aux_layer_23": 0.19921875, "loss_aux_layer_3": 0.058349609375, "loss_aux_layer_4": 0.0611572265625, "loss_aux_layer_5": 0.0628662109375, "loss_aux_layer_6": 0.06591796875, "loss_aux_layer_7": 0.06396484375, "loss_aux_layer_8": 0.063232421875, "loss_aux_layer_9": 0.06231689453125, "step": 2988, "total_loss": 0.6625266373157501 }, { "epoch": 0.5917640071273015, "grad_norm": 0.8422707915306091, "learning_rate": 5e-05, "llm_loss": 0.6115688532590866, "loss": 2.8035, "loss_aux_layer_0": 0.016693115234375, "loss_aux_layer_1": 0.0372314453125, "loss_aux_layer_10": 0.06549072265625, "loss_aux_layer_11": 0.06982421875, "loss_aux_layer_12": 0.0743408203125, "loss_aux_layer_13": 0.080078125, "loss_aux_layer_14": 0.0888671875, "loss_aux_layer_15": 0.0975341796875, "loss_aux_layer_16": 0.1072998046875, "loss_aux_layer_17": 0.115234375, "loss_aux_layer_18": 0.1231689453125, "loss_aux_layer_19": 0.126220703125, "loss_aux_layer_2": 0.0506591796875, "loss_aux_layer_20": 0.1337890625, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.163330078125, "loss_aux_layer_23": 0.20068359375, "loss_aux_layer_3": 0.060791015625, "loss_aux_layer_4": 0.0634765625, "loss_aux_layer_5": 0.06494140625, "loss_aux_layer_6": 0.0679931640625, "loss_aux_layer_7": 0.065673828125, "loss_aux_layer_8": 0.06500244140625, "loss_aux_layer_9": 0.06378173828125, "step": 2989, "total_loss": 0.7008813768625259 }, { "epoch": 0.5919619877252029, "grad_norm": 1.015068531036377, "learning_rate": 5e-05, "llm_loss": 0.6033438146114349, "loss": 2.7634, "loss_aux_layer_0": 0.016845703125, "loss_aux_layer_1": 0.03570556640625, "loss_aux_layer_10": 0.0634765625, "loss_aux_layer_11": 0.0675048828125, "loss_aux_layer_12": 0.072509765625, "loss_aux_layer_13": 0.078369140625, "loss_aux_layer_14": 0.0875244140625, "loss_aux_layer_15": 0.095947265625, "loss_aux_layer_16": 0.1051025390625, "loss_aux_layer_17": 0.11328125, "loss_aux_layer_18": 0.12158203125, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.048828125, "loss_aux_layer_20": 0.132568359375, "loss_aux_layer_21": 0.14013671875, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.05889892578125, "loss_aux_layer_4": 0.0616455078125, "loss_aux_layer_5": 0.06298828125, "loss_aux_layer_6": 0.0657958984375, "loss_aux_layer_7": 0.063720703125, "loss_aux_layer_8": 0.06329345703125, "loss_aux_layer_9": 0.06219482421875, "step": 2990, "total_loss": 0.6908492743968964 }, { "epoch": 0.5921599683231044, "grad_norm": 0.9375059604644775, "learning_rate": 5e-05, "llm_loss": 0.6446777507662773, "loss": 2.9298, "loss_aux_layer_0": 0.016815185546875, "loss_aux_layer_1": 0.035400390625, "loss_aux_layer_10": 0.06292724609375, "loss_aux_layer_11": 0.0672607421875, "loss_aux_layer_12": 0.0721435546875, "loss_aux_layer_13": 0.0777587890625, "loss_aux_layer_14": 0.0870361328125, "loss_aux_layer_15": 0.095947265625, "loss_aux_layer_16": 0.1053466796875, "loss_aux_layer_17": 0.1136474609375, "loss_aux_layer_18": 0.122802734375, "loss_aux_layer_19": 0.126708984375, "loss_aux_layer_2": 0.0482177734375, "loss_aux_layer_20": 0.13427734375, "loss_aux_layer_21": 0.142333984375, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.201416015625, "loss_aux_layer_3": 0.05816650390625, "loss_aux_layer_4": 0.06072998046875, "loss_aux_layer_5": 0.06231689453125, "loss_aux_layer_6": 0.065185546875, "loss_aux_layer_7": 0.06304931640625, "loss_aux_layer_8": 0.0626220703125, "loss_aux_layer_9": 0.0616455078125, "step": 2991, "total_loss": 0.7324547320604324 }, { "epoch": 0.5923579489210058, "grad_norm": 0.9908597469329834, "learning_rate": 5e-05, "llm_loss": 0.6101559698581696, "loss": 2.8017, "loss_aux_layer_0": 0.01629638671875, "loss_aux_layer_1": 0.03863525390625, "loss_aux_layer_10": 0.0670166015625, "loss_aux_layer_11": 0.0714111328125, "loss_aux_layer_12": 0.076416015625, "loss_aux_layer_13": 0.0821533203125, "loss_aux_layer_14": 0.0906982421875, "loss_aux_layer_15": 0.0994873046875, "loss_aux_layer_16": 0.1087646484375, "loss_aux_layer_17": 0.1162109375, "loss_aux_layer_18": 0.1236572265625, "loss_aux_layer_19": 0.1260986328125, "loss_aux_layer_2": 0.05224609375, "loss_aux_layer_20": 0.13330078125, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.160888671875, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.0631103515625, "loss_aux_layer_4": 0.0657958984375, "loss_aux_layer_5": 0.0673828125, "loss_aux_layer_6": 0.0699462890625, "loss_aux_layer_7": 0.0677490234375, "loss_aux_layer_8": 0.06689453125, "loss_aux_layer_9": 0.065673828125, "step": 2992, "total_loss": 0.7004198879003525 }, { "epoch": 0.5925559295189071, "grad_norm": 0.810196042060852, "learning_rate": 5e-05, "llm_loss": 0.6151665970683098, "loss": 2.8077, "loss_aux_layer_0": 0.0169677734375, "loss_aux_layer_1": 0.03509521484375, "loss_aux_layer_10": 0.062744140625, "loss_aux_layer_11": 0.06683349609375, "loss_aux_layer_12": 0.07196044921875, "loss_aux_layer_13": 0.077880859375, "loss_aux_layer_14": 0.08642578125, "loss_aux_layer_15": 0.0946044921875, "loss_aux_layer_16": 0.1041259765625, "loss_aux_layer_17": 0.1124267578125, "loss_aux_layer_18": 0.120849609375, "loss_aux_layer_19": 0.1246337890625, "loss_aux_layer_2": 0.0477294921875, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.05755615234375, "loss_aux_layer_4": 0.06024169921875, "loss_aux_layer_5": 0.0616455078125, "loss_aux_layer_6": 0.0645751953125, "loss_aux_layer_7": 0.062744140625, "loss_aux_layer_8": 0.0621337890625, "loss_aux_layer_9": 0.06121826171875, "step": 2993, "total_loss": 0.7019178420305252 }, { "epoch": 0.5927539101168086, "grad_norm": 1.119570016860962, "learning_rate": 5e-05, "llm_loss": 0.6147126406431198, "loss": 2.8087, "loss_aux_layer_0": 0.015960693359375, "loss_aux_layer_1": 0.0352783203125, "loss_aux_layer_10": 0.063232421875, "loss_aux_layer_11": 0.067626953125, "loss_aux_layer_12": 0.0726318359375, "loss_aux_layer_13": 0.07861328125, "loss_aux_layer_14": 0.0877685546875, "loss_aux_layer_15": 0.0966796875, "loss_aux_layer_16": 0.1064453125, "loss_aux_layer_17": 0.114501953125, "loss_aux_layer_18": 0.123046875, "loss_aux_layer_19": 0.12548828125, "loss_aux_layer_2": 0.04815673828125, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.058349609375, "loss_aux_layer_4": 0.06109619140625, "loss_aux_layer_5": 0.06268310546875, "loss_aux_layer_6": 0.0657958984375, "loss_aux_layer_7": 0.0638427734375, "loss_aux_layer_8": 0.0631103515625, "loss_aux_layer_9": 0.06201171875, "step": 2994, "total_loss": 0.7021631598472595 }, { "epoch": 0.59295189071471, "grad_norm": 0.8186125755310059, "learning_rate": 5e-05, "llm_loss": 0.5818441957235336, "loss": 2.6798, "loss_aux_layer_0": 0.016937255859375, "loss_aux_layer_1": 0.03607177734375, "loss_aux_layer_10": 0.0643310546875, "loss_aux_layer_11": 0.0684814453125, "loss_aux_layer_12": 0.0732421875, "loss_aux_layer_13": 0.0787353515625, "loss_aux_layer_14": 0.08740234375, "loss_aux_layer_15": 0.0963134765625, "loss_aux_layer_16": 0.1055908203125, "loss_aux_layer_17": 0.11376953125, "loss_aux_layer_18": 0.1219482421875, "loss_aux_layer_19": 0.1251220703125, "loss_aux_layer_2": 0.049072265625, "loss_aux_layer_20": 0.13330078125, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.1982421875, "loss_aux_layer_3": 0.05950927734375, "loss_aux_layer_4": 0.0621337890625, "loss_aux_layer_5": 0.06365966796875, "loss_aux_layer_6": 0.0667724609375, "loss_aux_layer_7": 0.06475830078125, "loss_aux_layer_8": 0.0640869140625, "loss_aux_layer_9": 0.0628662109375, "step": 2995, "total_loss": 0.6699593365192413 }, { "epoch": 0.5931498713126113, "grad_norm": 0.967726469039917, "learning_rate": 5e-05, "llm_loss": 0.6034128367900848, "loss": 2.772, "loss_aux_layer_0": 0.0166015625, "loss_aux_layer_1": 0.03729248046875, "loss_aux_layer_10": 0.0653076171875, "loss_aux_layer_11": 0.06982421875, "loss_aux_layer_12": 0.0743408203125, "loss_aux_layer_13": 0.079833984375, "loss_aux_layer_14": 0.0885009765625, "loss_aux_layer_15": 0.09765625, "loss_aux_layer_16": 0.107177734375, "loss_aux_layer_17": 0.11474609375, "loss_aux_layer_18": 0.1231689453125, "loss_aux_layer_19": 0.1268310546875, "loss_aux_layer_2": 0.05108642578125, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.200927734375, "loss_aux_layer_3": 0.0616455078125, "loss_aux_layer_4": 0.064208984375, "loss_aux_layer_5": 0.0657958984375, "loss_aux_layer_6": 0.068359375, "loss_aux_layer_7": 0.06591796875, "loss_aux_layer_8": 0.0653076171875, "loss_aux_layer_9": 0.0640869140625, "step": 2996, "total_loss": 0.6930096745491028 }, { "epoch": 0.5933478519105128, "grad_norm": 1.0495620965957642, "learning_rate": 5e-05, "llm_loss": 0.583393394947052, "loss": 2.687, "loss_aux_layer_0": 0.017791748046875, "loss_aux_layer_1": 0.0362548828125, "loss_aux_layer_10": 0.0643310546875, "loss_aux_layer_11": 0.06890869140625, "loss_aux_layer_12": 0.07373046875, "loss_aux_layer_13": 0.079345703125, "loss_aux_layer_14": 0.0877685546875, "loss_aux_layer_15": 0.09619140625, "loss_aux_layer_16": 0.105712890625, "loss_aux_layer_17": 0.1138916015625, "loss_aux_layer_18": 0.1220703125, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.04931640625, "loss_aux_layer_20": 0.132568359375, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.0592041015625, "loss_aux_layer_4": 0.0621337890625, "loss_aux_layer_5": 0.06414794921875, "loss_aux_layer_6": 0.0667724609375, "loss_aux_layer_7": 0.064697265625, "loss_aux_layer_8": 0.0643310546875, "loss_aux_layer_9": 0.06304931640625, "step": 2997, "total_loss": 0.6717545688152313 }, { "epoch": 0.5935458325084142, "grad_norm": 0.9684380888938904, "learning_rate": 5e-05, "llm_loss": 0.6251322031021118, "loss": 2.8349, "loss_aux_layer_0": 0.017547607421875, "loss_aux_layer_1": 0.03338623046875, "loss_aux_layer_10": 0.05987548828125, "loss_aux_layer_11": 0.06378173828125, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.07421875, "loss_aux_layer_14": 0.08251953125, "loss_aux_layer_15": 0.09130859375, "loss_aux_layer_16": 0.10107421875, "loss_aux_layer_17": 0.109130859375, "loss_aux_layer_18": 0.116943359375, "loss_aux_layer_19": 0.1207275390625, "loss_aux_layer_2": 0.04571533203125, "loss_aux_layer_20": 0.1282958984375, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05499267578125, "loss_aux_layer_4": 0.05755615234375, "loss_aux_layer_5": 0.0592041015625, "loss_aux_layer_6": 0.06219482421875, "loss_aux_layer_7": 0.0601806640625, "loss_aux_layer_8": 0.0599365234375, "loss_aux_layer_9": 0.0584716796875, "step": 2998, "total_loss": 0.7087297886610031 }, { "epoch": 0.5937438131063156, "grad_norm": 1.255767583847046, "learning_rate": 5e-05, "llm_loss": 0.6267460435628891, "loss": 2.8605, "loss_aux_layer_0": 0.016845703125, "loss_aux_layer_1": 0.0352783203125, "loss_aux_layer_10": 0.06329345703125, "loss_aux_layer_11": 0.06787109375, "loss_aux_layer_12": 0.072998046875, "loss_aux_layer_13": 0.078857421875, "loss_aux_layer_14": 0.0877685546875, "loss_aux_layer_15": 0.096923828125, "loss_aux_layer_16": 0.1070556640625, "loss_aux_layer_17": 0.114990234375, "loss_aux_layer_18": 0.1229248046875, "loss_aux_layer_19": 0.1268310546875, "loss_aux_layer_2": 0.0484619140625, "loss_aux_layer_20": 0.134521484375, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.2021484375, "loss_aux_layer_3": 0.05853271484375, "loss_aux_layer_4": 0.06109619140625, "loss_aux_layer_5": 0.0628662109375, "loss_aux_layer_6": 0.0655517578125, "loss_aux_layer_7": 0.0635986328125, "loss_aux_layer_8": 0.06298828125, "loss_aux_layer_9": 0.06182861328125, "step": 2999, "total_loss": 0.7151238322257996 }, { "epoch": 0.593941793704217, "grad_norm": 0.9242568612098694, "learning_rate": 5e-05, "llm_loss": 0.5943934544920921, "loss": 2.7272, "loss_aux_layer_0": 0.017242431640625, "loss_aux_layer_1": 0.03582763671875, "loss_aux_layer_10": 0.06390380859375, "loss_aux_layer_11": 0.06817626953125, "loss_aux_layer_12": 0.0731201171875, "loss_aux_layer_13": 0.078857421875, "loss_aux_layer_14": 0.08740234375, "loss_aux_layer_15": 0.0960693359375, "loss_aux_layer_16": 0.1048583984375, "loss_aux_layer_17": 0.112548828125, "loss_aux_layer_18": 0.1204833984375, "loss_aux_layer_19": 0.122802734375, "loss_aux_layer_2": 0.04913330078125, "loss_aux_layer_20": 0.1304931640625, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.0592041015625, "loss_aux_layer_4": 0.06182861328125, "loss_aux_layer_5": 0.06365966796875, "loss_aux_layer_6": 0.066650390625, "loss_aux_layer_7": 0.064453125, "loss_aux_layer_8": 0.0638427734375, "loss_aux_layer_9": 0.0626220703125, "step": 3000, "total_loss": 0.6818077117204666 }, { "epoch": 0.5941397743021184, "grad_norm": 0.8848668932914734, "learning_rate": 5e-05, "llm_loss": 0.5505339056253433, "loss": 2.551, "loss_aux_layer_0": 0.017547607421875, "loss_aux_layer_1": 0.03485107421875, "loss_aux_layer_10": 0.062744140625, "loss_aux_layer_11": 0.06671142578125, "loss_aux_layer_12": 0.0711669921875, "loss_aux_layer_13": 0.076416015625, "loss_aux_layer_14": 0.0850830078125, "loss_aux_layer_15": 0.0938720703125, "loss_aux_layer_16": 0.103759765625, "loss_aux_layer_17": 0.1119384765625, "loss_aux_layer_18": 0.1207275390625, "loss_aux_layer_19": 0.1251220703125, "loss_aux_layer_2": 0.048583984375, "loss_aux_layer_20": 0.13330078125, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.20068359375, "loss_aux_layer_3": 0.05810546875, "loss_aux_layer_4": 0.0606689453125, "loss_aux_layer_5": 0.0628662109375, "loss_aux_layer_6": 0.06591796875, "loss_aux_layer_7": 0.0635986328125, "loss_aux_layer_8": 0.0631103515625, "loss_aux_layer_9": 0.06158447265625, "step": 3001, "total_loss": 0.6377396434545517 }, { "epoch": 0.5943377549000198, "grad_norm": 1.0708955526351929, "learning_rate": 5e-05, "llm_loss": 0.7110982090234756, "loss": 3.2095, "loss_aux_layer_0": 0.016876220703125, "loss_aux_layer_1": 0.0374755859375, "loss_aux_layer_10": 0.0670166015625, "loss_aux_layer_11": 0.071533203125, "loss_aux_layer_12": 0.0770263671875, "loss_aux_layer_13": 0.083251953125, "loss_aux_layer_14": 0.092529296875, "loss_aux_layer_15": 0.1015625, "loss_aux_layer_16": 0.111328125, "loss_aux_layer_17": 0.1199951171875, "loss_aux_layer_18": 0.1270751953125, "loss_aux_layer_19": 0.1287841796875, "loss_aux_layer_2": 0.05224609375, "loss_aux_layer_20": 0.1357421875, "loss_aux_layer_21": 0.142578125, "loss_aux_layer_22": 0.162109375, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.0628662109375, "loss_aux_layer_4": 0.0653076171875, "loss_aux_layer_5": 0.0667724609375, "loss_aux_layer_6": 0.0697021484375, "loss_aux_layer_7": 0.067626953125, "loss_aux_layer_8": 0.06689453125, "loss_aux_layer_9": 0.0657958984375, "step": 3002, "total_loss": 0.8023850321769714 }, { "epoch": 0.5945357354979212, "grad_norm": 0.9143344163894653, "learning_rate": 5e-05, "llm_loss": 0.7191047370433807, "loss": 3.217, "loss_aux_layer_0": 0.017364501953125, "loss_aux_layer_1": 0.033447265625, "loss_aux_layer_10": 0.06048583984375, "loss_aux_layer_11": 0.0643310546875, "loss_aux_layer_12": 0.068603515625, "loss_aux_layer_13": 0.07421875, "loss_aux_layer_14": 0.08349609375, "loss_aux_layer_15": 0.0926513671875, "loss_aux_layer_16": 0.102294921875, "loss_aux_layer_17": 0.110107421875, "loss_aux_layer_18": 0.118408203125, "loss_aux_layer_19": 0.1231689453125, "loss_aux_layer_2": 0.04620361328125, "loss_aux_layer_20": 0.131591796875, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.16064453125, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.05584716796875, "loss_aux_layer_4": 0.05841064453125, "loss_aux_layer_5": 0.0604248046875, "loss_aux_layer_6": 0.0631103515625, "loss_aux_layer_7": 0.06097412109375, "loss_aux_layer_8": 0.0604248046875, "loss_aux_layer_9": 0.05914306640625, "step": 3003, "total_loss": 0.8042575120925903 }, { "epoch": 0.5947337160958226, "grad_norm": 1.3593508005142212, "learning_rate": 5e-05, "llm_loss": 0.5724265724420547, "loss": 2.6613, "loss_aux_layer_0": 0.019317626953125, "loss_aux_layer_1": 0.03912353515625, "loss_aux_layer_10": 0.068115234375, "loss_aux_layer_11": 0.07275390625, "loss_aux_layer_12": 0.0780029296875, "loss_aux_layer_13": 0.083984375, "loss_aux_layer_14": 0.0933837890625, "loss_aux_layer_15": 0.1024169921875, "loss_aux_layer_16": 0.1114501953125, "loss_aux_layer_17": 0.119140625, "loss_aux_layer_18": 0.12646484375, "loss_aux_layer_19": 0.12890625, "loss_aux_layer_2": 0.0540771484375, "loss_aux_layer_20": 0.136474609375, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.20654296875, "loss_aux_layer_3": 0.06451416015625, "loss_aux_layer_4": 0.06689453125, "loss_aux_layer_5": 0.068603515625, "loss_aux_layer_6": 0.0714111328125, "loss_aux_layer_7": 0.069091796875, "loss_aux_layer_8": 0.0684814453125, "loss_aux_layer_9": 0.0670166015625, "step": 3004, "total_loss": 0.6653253734111786 }, { "epoch": 0.594931696693724, "grad_norm": 1.0609391927719116, "learning_rate": 5e-05, "llm_loss": 0.5875162482261658, "loss": 2.7253, "loss_aux_layer_0": 0.01873779296875, "loss_aux_layer_1": 0.03985595703125, "loss_aux_layer_10": 0.0701904296875, "loss_aux_layer_11": 0.07470703125, "loss_aux_layer_12": 0.079833984375, "loss_aux_layer_13": 0.0855712890625, "loss_aux_layer_14": 0.0946044921875, "loss_aux_layer_15": 0.103515625, "loss_aux_layer_16": 0.112548828125, "loss_aux_layer_17": 0.119873046875, "loss_aux_layer_18": 0.1275634765625, "loss_aux_layer_19": 0.1292724609375, "loss_aux_layer_2": 0.0543212890625, "loss_aux_layer_20": 0.13623046875, "loss_aux_layer_21": 0.14453125, "loss_aux_layer_22": 0.1669921875, "loss_aux_layer_23": 0.205322265625, "loss_aux_layer_3": 0.0653076171875, "loss_aux_layer_4": 0.0682373046875, "loss_aux_layer_5": 0.0703125, "loss_aux_layer_6": 0.0733642578125, "loss_aux_layer_7": 0.0712890625, "loss_aux_layer_8": 0.070556640625, "loss_aux_layer_9": 0.0689697265625, "step": 3005, "total_loss": 0.6813215762376785 }, { "epoch": 0.5951296772916255, "grad_norm": 0.9175757765769958, "learning_rate": 5e-05, "llm_loss": 0.5667060315608978, "loss": 2.6207, "loss_aux_layer_0": 0.01690673828125, "loss_aux_layer_1": 0.03582763671875, "loss_aux_layer_10": 0.06402587890625, "loss_aux_layer_11": 0.06842041015625, "loss_aux_layer_12": 0.0733642578125, "loss_aux_layer_13": 0.0792236328125, "loss_aux_layer_14": 0.0882568359375, "loss_aux_layer_15": 0.0968017578125, "loss_aux_layer_16": 0.1060791015625, "loss_aux_layer_17": 0.1141357421875, "loss_aux_layer_18": 0.1226806640625, "loss_aux_layer_19": 0.1263427734375, "loss_aux_layer_2": 0.04937744140625, "loss_aux_layer_20": 0.1337890625, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.200927734375, "loss_aux_layer_3": 0.0592041015625, "loss_aux_layer_4": 0.061767578125, "loss_aux_layer_5": 0.06365966796875, "loss_aux_layer_6": 0.0667724609375, "loss_aux_layer_7": 0.0645751953125, "loss_aux_layer_8": 0.06390380859375, "loss_aux_layer_9": 0.06256103515625, "step": 3006, "total_loss": 0.6551788598299026 }, { "epoch": 0.5953276578895268, "grad_norm": 1.1220194101333618, "learning_rate": 5e-05, "llm_loss": 0.5698409080505371, "loss": 2.6437, "loss_aux_layer_0": 0.017303466796875, "loss_aux_layer_1": 0.037353515625, "loss_aux_layer_10": 0.06707763671875, "loss_aux_layer_11": 0.071533203125, "loss_aux_layer_12": 0.076416015625, "loss_aux_layer_13": 0.0823974609375, "loss_aux_layer_14": 0.091064453125, "loss_aux_layer_15": 0.099609375, "loss_aux_layer_16": 0.1087646484375, "loss_aux_layer_17": 0.11669921875, "loss_aux_layer_18": 0.124755859375, "loss_aux_layer_19": 0.1278076171875, "loss_aux_layer_2": 0.05133056640625, "loss_aux_layer_20": 0.13525390625, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.0616455078125, "loss_aux_layer_4": 0.0648193359375, "loss_aux_layer_5": 0.06671142578125, "loss_aux_layer_6": 0.0699462890625, "loss_aux_layer_7": 0.0679931640625, "loss_aux_layer_8": 0.06707763671875, "loss_aux_layer_9": 0.06573486328125, "step": 3007, "total_loss": 0.6609348654747009 }, { "epoch": 0.5955256384874282, "grad_norm": 0.9471302032470703, "learning_rate": 5e-05, "llm_loss": 0.6185665801167488, "loss": 2.8334, "loss_aux_layer_0": 0.01763916015625, "loss_aux_layer_1": 0.03717041015625, "loss_aux_layer_10": 0.0662841796875, "loss_aux_layer_11": 0.07080078125, "loss_aux_layer_12": 0.07568359375, "loss_aux_layer_13": 0.08154296875, "loss_aux_layer_14": 0.090576171875, "loss_aux_layer_15": 0.0989990234375, "loss_aux_layer_16": 0.1083984375, "loss_aux_layer_17": 0.1160888671875, "loss_aux_layer_18": 0.1239013671875, "loss_aux_layer_19": 0.1259765625, "loss_aux_layer_2": 0.05059814453125, "loss_aux_layer_20": 0.13330078125, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.198974609375, "loss_aux_layer_3": 0.060791015625, "loss_aux_layer_4": 0.06390380859375, "loss_aux_layer_5": 0.065673828125, "loss_aux_layer_6": 0.0689697265625, "loss_aux_layer_7": 0.06689453125, "loss_aux_layer_8": 0.0662841796875, "loss_aux_layer_9": 0.06500244140625, "step": 3008, "total_loss": 0.7083457857370377 }, { "epoch": 0.5957236190853297, "grad_norm": 0.9504278302192688, "learning_rate": 5e-05, "llm_loss": 0.5608688294887543, "loss": 2.6019, "loss_aux_layer_0": 0.0169677734375, "loss_aux_layer_1": 0.03778076171875, "loss_aux_layer_10": 0.06610107421875, "loss_aux_layer_11": 0.0706787109375, "loss_aux_layer_12": 0.075927734375, "loss_aux_layer_13": 0.0811767578125, "loss_aux_layer_14": 0.0894775390625, "loss_aux_layer_15": 0.09814453125, "loss_aux_layer_16": 0.1075439453125, "loss_aux_layer_17": 0.1151123046875, "loss_aux_layer_18": 0.1229248046875, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.0513916015625, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.14013671875, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.0618896484375, "loss_aux_layer_4": 0.06488037109375, "loss_aux_layer_5": 0.06634521484375, "loss_aux_layer_6": 0.0693359375, "loss_aux_layer_7": 0.0670166015625, "loss_aux_layer_8": 0.0662841796875, "loss_aux_layer_9": 0.064697265625, "step": 3009, "total_loss": 0.65047986805439 }, { "epoch": 0.595921599683231, "grad_norm": 1.0524578094482422, "learning_rate": 5e-05, "llm_loss": 0.5447532534599304, "loss": 2.5433, "loss_aux_layer_0": 0.01959228515625, "loss_aux_layer_1": 0.03814697265625, "loss_aux_layer_10": 0.067138671875, "loss_aux_layer_11": 0.0712890625, "loss_aux_layer_12": 0.076171875, "loss_aux_layer_13": 0.0819091796875, "loss_aux_layer_14": 0.091064453125, "loss_aux_layer_15": 0.0999755859375, "loss_aux_layer_16": 0.1099853515625, "loss_aux_layer_17": 0.1175537109375, "loss_aux_layer_18": 0.1256103515625, "loss_aux_layer_19": 0.12841796875, "loss_aux_layer_2": 0.05218505859375, "loss_aux_layer_20": 0.1357421875, "loss_aux_layer_21": 0.142822265625, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.0626220703125, "loss_aux_layer_4": 0.06536865234375, "loss_aux_layer_5": 0.0670166015625, "loss_aux_layer_6": 0.0701904296875, "loss_aux_layer_7": 0.0679931640625, "loss_aux_layer_8": 0.0672607421875, "loss_aux_layer_9": 0.0660400390625, "step": 3010, "total_loss": 0.635819211602211 }, { "epoch": 0.5961195802811324, "grad_norm": 0.8529261350631714, "learning_rate": 5e-05, "llm_loss": 0.5627154856920242, "loss": 2.5999, "loss_aux_layer_0": 0.016143798828125, "loss_aux_layer_1": 0.0362548828125, "loss_aux_layer_10": 0.06463623046875, "loss_aux_layer_11": 0.06915283203125, "loss_aux_layer_12": 0.0736083984375, "loss_aux_layer_13": 0.0789794921875, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.0948486328125, "loss_aux_layer_16": 0.1036376953125, "loss_aux_layer_17": 0.111328125, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.121826171875, "loss_aux_layer_2": 0.0489501953125, "loss_aux_layer_20": 0.12939453125, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.05926513671875, "loss_aux_layer_4": 0.06219482421875, "loss_aux_layer_5": 0.06378173828125, "loss_aux_layer_6": 0.06695556640625, "loss_aux_layer_7": 0.06512451171875, "loss_aux_layer_8": 0.064697265625, "loss_aux_layer_9": 0.0633544921875, "step": 3011, "total_loss": 0.6499711573123932 }, { "epoch": 0.5963175608790339, "grad_norm": 0.9632887840270996, "learning_rate": 5e-05, "llm_loss": 0.601238414645195, "loss": 2.7573, "loss_aux_layer_0": 0.017242431640625, "loss_aux_layer_1": 0.03570556640625, "loss_aux_layer_10": 0.0643310546875, "loss_aux_layer_11": 0.0682373046875, "loss_aux_layer_12": 0.0732421875, "loss_aux_layer_13": 0.0791015625, "loss_aux_layer_14": 0.0882568359375, "loss_aux_layer_15": 0.0968017578125, "loss_aux_layer_16": 0.1063232421875, "loss_aux_layer_17": 0.114501953125, "loss_aux_layer_18": 0.12255859375, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.0487060546875, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.160888671875, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.05902099609375, "loss_aux_layer_4": 0.0614013671875, "loss_aux_layer_5": 0.063232421875, "loss_aux_layer_6": 0.0665283203125, "loss_aux_layer_7": 0.064453125, "loss_aux_layer_8": 0.0640869140625, "loss_aux_layer_9": 0.06304931640625, "step": 3012, "total_loss": 0.6893350332975388 }, { "epoch": 0.5965155414769353, "grad_norm": 1.0292813777923584, "learning_rate": 5e-05, "llm_loss": 0.6536365523934364, "loss": 2.9836, "loss_aux_layer_0": 0.017242431640625, "loss_aux_layer_1": 0.0386962890625, "loss_aux_layer_10": 0.067626953125, "loss_aux_layer_11": 0.0726318359375, "loss_aux_layer_12": 0.0775146484375, "loss_aux_layer_13": 0.083740234375, "loss_aux_layer_14": 0.0926513671875, "loss_aux_layer_15": 0.1016845703125, "loss_aux_layer_16": 0.1114501953125, "loss_aux_layer_17": 0.118896484375, "loss_aux_layer_18": 0.127197265625, "loss_aux_layer_19": 0.130126953125, "loss_aux_layer_2": 0.0526123046875, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.2041015625, "loss_aux_layer_3": 0.06292724609375, "loss_aux_layer_4": 0.0660400390625, "loss_aux_layer_5": 0.067626953125, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.0687255859375, "loss_aux_layer_8": 0.0677490234375, "loss_aux_layer_9": 0.06640625, "step": 3013, "total_loss": 0.7459021508693695 }, { "epoch": 0.5967135220748366, "grad_norm": 1.0238120555877686, "learning_rate": 5e-05, "llm_loss": 0.6547431498765945, "loss": 2.9702, "loss_aux_layer_0": 0.017364501953125, "loss_aux_layer_1": 0.03717041015625, "loss_aux_layer_10": 0.06451416015625, "loss_aux_layer_11": 0.0687255859375, "loss_aux_layer_12": 0.0731201171875, "loss_aux_layer_13": 0.0789794921875, "loss_aux_layer_14": 0.087646484375, "loss_aux_layer_15": 0.0960693359375, "loss_aux_layer_16": 0.10498046875, "loss_aux_layer_17": 0.1124267578125, "loss_aux_layer_18": 0.1204833984375, "loss_aux_layer_19": 0.1234130859375, "loss_aux_layer_2": 0.05023193359375, "loss_aux_layer_20": 0.131103515625, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.1943359375, "loss_aux_layer_3": 0.06060791015625, "loss_aux_layer_4": 0.063232421875, "loss_aux_layer_5": 0.064697265625, "loss_aux_layer_6": 0.067626953125, "loss_aux_layer_7": 0.0653076171875, "loss_aux_layer_8": 0.06451416015625, "loss_aux_layer_9": 0.0631103515625, "step": 3014, "total_loss": 0.7425382435321808 }, { "epoch": 0.5969115026727381, "grad_norm": 1.066652536392212, "learning_rate": 5e-05, "llm_loss": 0.5292289853096008, "loss": 2.4585, "loss_aux_layer_0": 0.0167236328125, "loss_aux_layer_1": 0.035003662109375, "loss_aux_layer_10": 0.06195068359375, "loss_aux_layer_11": 0.06597900390625, "loss_aux_layer_12": 0.0704345703125, "loss_aux_layer_13": 0.075927734375, "loss_aux_layer_14": 0.084228515625, "loss_aux_layer_15": 0.0924072265625, "loss_aux_layer_16": 0.101318359375, "loss_aux_layer_17": 0.1090087890625, "loss_aux_layer_18": 0.1173095703125, "loss_aux_layer_19": 0.120849609375, "loss_aux_layer_2": 0.04827880859375, "loss_aux_layer_20": 0.1292724609375, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.05780029296875, "loss_aux_layer_4": 0.06024169921875, "loss_aux_layer_5": 0.06201171875, "loss_aux_layer_6": 0.06494140625, "loss_aux_layer_7": 0.0628662109375, "loss_aux_layer_8": 0.06207275390625, "loss_aux_layer_9": 0.06060791015625, "step": 3015, "total_loss": 0.6146296709775925 }, { "epoch": 0.5971094832706395, "grad_norm": 0.8782765865325928, "learning_rate": 5e-05, "llm_loss": 0.4959460645914078, "loss": 2.3253, "loss_aux_layer_0": 0.016632080078125, "loss_aux_layer_1": 0.03460693359375, "loss_aux_layer_10": 0.06036376953125, "loss_aux_layer_11": 0.064697265625, "loss_aux_layer_12": 0.0694580078125, "loss_aux_layer_13": 0.075439453125, "loss_aux_layer_14": 0.084228515625, "loss_aux_layer_15": 0.09326171875, "loss_aux_layer_16": 0.103271484375, "loss_aux_layer_17": 0.1114501953125, "loss_aux_layer_18": 0.119873046875, "loss_aux_layer_19": 0.123291015625, "loss_aux_layer_2": 0.04681396484375, "loss_aux_layer_20": 0.131103515625, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.197998046875, "loss_aux_layer_3": 0.05615234375, "loss_aux_layer_4": 0.05853271484375, "loss_aux_layer_5": 0.06005859375, "loss_aux_layer_6": 0.06280517578125, "loss_aux_layer_7": 0.06060791015625, "loss_aux_layer_8": 0.06005859375, "loss_aux_layer_9": 0.0589599609375, "step": 3016, "total_loss": 0.5813161134719849 }, { "epoch": 0.5973074638685408, "grad_norm": 1.0131772756576538, "learning_rate": 5e-05, "llm_loss": 0.6125975996255875, "loss": 2.8022, "loss_aux_layer_0": 0.01702880859375, "loss_aux_layer_1": 0.03570556640625, "loss_aux_layer_10": 0.063232421875, "loss_aux_layer_11": 0.0675048828125, "loss_aux_layer_12": 0.072509765625, "loss_aux_layer_13": 0.078369140625, "loss_aux_layer_14": 0.08740234375, "loss_aux_layer_15": 0.0960693359375, "loss_aux_layer_16": 0.1055908203125, "loss_aux_layer_17": 0.11328125, "loss_aux_layer_18": 0.1217041015625, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.0487060546875, "loss_aux_layer_20": 0.1337890625, "loss_aux_layer_21": 0.142333984375, "loss_aux_layer_22": 0.1640625, "loss_aux_layer_23": 0.200927734375, "loss_aux_layer_3": 0.05877685546875, "loss_aux_layer_4": 0.06121826171875, "loss_aux_layer_5": 0.06280517578125, "loss_aux_layer_6": 0.06591796875, "loss_aux_layer_7": 0.0635986328125, "loss_aux_layer_8": 0.06304931640625, "loss_aux_layer_9": 0.06201171875, "step": 3017, "total_loss": 0.7005486190319061 }, { "epoch": 0.5975054444664423, "grad_norm": 1.0334686040878296, "learning_rate": 5e-05, "llm_loss": 0.5219128131866455, "loss": 2.4467, "loss_aux_layer_0": 0.01751708984375, "loss_aux_layer_1": 0.03790283203125, "loss_aux_layer_10": 0.0672607421875, "loss_aux_layer_11": 0.071533203125, "loss_aux_layer_12": 0.076416015625, "loss_aux_layer_13": 0.082275390625, "loss_aux_layer_14": 0.0904541015625, "loss_aux_layer_15": 0.0982666015625, "loss_aux_layer_16": 0.107177734375, "loss_aux_layer_17": 0.1141357421875, "loss_aux_layer_18": 0.121826171875, "loss_aux_layer_19": 0.125, "loss_aux_layer_2": 0.05206298828125, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.0628662109375, "loss_aux_layer_4": 0.0654296875, "loss_aux_layer_5": 0.06683349609375, "loss_aux_layer_6": 0.0699462890625, "loss_aux_layer_7": 0.06768798828125, "loss_aux_layer_8": 0.0670166015625, "loss_aux_layer_9": 0.06573486328125, "step": 3018, "total_loss": 0.6116830557584763 }, { "epoch": 0.5977034250643437, "grad_norm": 0.9132184386253357, "learning_rate": 5e-05, "llm_loss": 0.5410832613706589, "loss": 2.5157, "loss_aux_layer_0": 0.017181396484375, "loss_aux_layer_1": 0.03619384765625, "loss_aux_layer_10": 0.06365966796875, "loss_aux_layer_11": 0.0679931640625, "loss_aux_layer_12": 0.0728759765625, "loss_aux_layer_13": 0.07861328125, "loss_aux_layer_14": 0.087646484375, "loss_aux_layer_15": 0.0960693359375, "loss_aux_layer_16": 0.1051025390625, "loss_aux_layer_17": 0.11328125, "loss_aux_layer_18": 0.1212158203125, "loss_aux_layer_19": 0.123779296875, "loss_aux_layer_2": 0.0498046875, "loss_aux_layer_20": 0.1318359375, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.19921875, "loss_aux_layer_3": 0.0599365234375, "loss_aux_layer_4": 0.062255859375, "loss_aux_layer_5": 0.0640869140625, "loss_aux_layer_6": 0.066650390625, "loss_aux_layer_7": 0.06414794921875, "loss_aux_layer_8": 0.06365966796875, "loss_aux_layer_9": 0.0621337890625, "step": 3019, "total_loss": 0.6289195418357849 }, { "epoch": 0.5979014056622451, "grad_norm": 1.1404117345809937, "learning_rate": 5e-05, "llm_loss": 0.580141082406044, "loss": 2.6676, "loss_aux_layer_0": 0.017425537109375, "loss_aux_layer_1": 0.03558349609375, "loss_aux_layer_10": 0.06298828125, "loss_aux_layer_11": 0.0670166015625, "loss_aux_layer_12": 0.0716552734375, "loss_aux_layer_13": 0.0775146484375, "loss_aux_layer_14": 0.0863037109375, "loss_aux_layer_15": 0.0950927734375, "loss_aux_layer_16": 0.104248046875, "loss_aux_layer_17": 0.1121826171875, "loss_aux_layer_18": 0.1199951171875, "loss_aux_layer_19": 0.1224365234375, "loss_aux_layer_2": 0.04815673828125, "loss_aux_layer_20": 0.129638671875, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.0582275390625, "loss_aux_layer_4": 0.0609130859375, "loss_aux_layer_5": 0.0625, "loss_aux_layer_6": 0.0655517578125, "loss_aux_layer_7": 0.0634765625, "loss_aux_layer_8": 0.06304931640625, "loss_aux_layer_9": 0.061767578125, "step": 3020, "total_loss": 0.6668934226036072 }, { "epoch": 0.5980993862601465, "grad_norm": 1.0081826448440552, "learning_rate": 5e-05, "llm_loss": 0.5610925853252411, "loss": 2.6011, "loss_aux_layer_0": 0.017669677734375, "loss_aux_layer_1": 0.03662109375, "loss_aux_layer_10": 0.06585693359375, "loss_aux_layer_11": 0.06988525390625, "loss_aux_layer_12": 0.0745849609375, "loss_aux_layer_13": 0.0802001953125, "loss_aux_layer_14": 0.0885009765625, "loss_aux_layer_15": 0.096923828125, "loss_aux_layer_16": 0.1063232421875, "loss_aux_layer_17": 0.1136474609375, "loss_aux_layer_18": 0.1220703125, "loss_aux_layer_19": 0.12451171875, "loss_aux_layer_2": 0.0498046875, "loss_aux_layer_20": 0.132568359375, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.060546875, "loss_aux_layer_4": 0.0634765625, "loss_aux_layer_5": 0.06536865234375, "loss_aux_layer_6": 0.06854248046875, "loss_aux_layer_7": 0.06671142578125, "loss_aux_layer_8": 0.0662841796875, "loss_aux_layer_9": 0.064697265625, "step": 3021, "total_loss": 0.6502628922462463 }, { "epoch": 0.5982973668580479, "grad_norm": 0.9762638807296753, "learning_rate": 5e-05, "llm_loss": 0.5559601560235023, "loss": 2.5714, "loss_aux_layer_0": 0.01678466796875, "loss_aux_layer_1": 0.03533935546875, "loss_aux_layer_10": 0.06219482421875, "loss_aux_layer_11": 0.0665283203125, "loss_aux_layer_12": 0.071533203125, "loss_aux_layer_13": 0.0771484375, "loss_aux_layer_14": 0.0860595703125, "loss_aux_layer_15": 0.0946044921875, "loss_aux_layer_16": 0.1043701171875, "loss_aux_layer_17": 0.1129150390625, "loss_aux_layer_18": 0.1212158203125, "loss_aux_layer_19": 0.12451171875, "loss_aux_layer_2": 0.04791259765625, "loss_aux_layer_20": 0.132568359375, "loss_aux_layer_21": 0.1396484375, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.05743408203125, "loss_aux_layer_4": 0.06011962890625, "loss_aux_layer_5": 0.06182861328125, "loss_aux_layer_6": 0.06494140625, "loss_aux_layer_7": 0.06280517578125, "loss_aux_layer_8": 0.06219482421875, "loss_aux_layer_9": 0.060791015625, "step": 3022, "total_loss": 0.6428411453962326 }, { "epoch": 0.5984953474559493, "grad_norm": 1.2972339391708374, "learning_rate": 5e-05, "llm_loss": 0.5670172274112701, "loss": 2.6319, "loss_aux_layer_0": 0.01702880859375, "loss_aux_layer_1": 0.038330078125, "loss_aux_layer_10": 0.06689453125, "loss_aux_layer_11": 0.071533203125, "loss_aux_layer_12": 0.076416015625, "loss_aux_layer_13": 0.0823974609375, "loss_aux_layer_14": 0.0908203125, "loss_aux_layer_15": 0.0997314453125, "loss_aux_layer_16": 0.109130859375, "loss_aux_layer_17": 0.1170654296875, "loss_aux_layer_18": 0.1246337890625, "loss_aux_layer_19": 0.12744140625, "loss_aux_layer_2": 0.05194091796875, "loss_aux_layer_20": 0.135498046875, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.202880859375, "loss_aux_layer_3": 0.0625, "loss_aux_layer_4": 0.06488037109375, "loss_aux_layer_5": 0.066650390625, "loss_aux_layer_6": 0.0692138671875, "loss_aux_layer_7": 0.0670166015625, "loss_aux_layer_8": 0.0665283203125, "loss_aux_layer_9": 0.06524658203125, "step": 3023, "total_loss": 0.6579635143280029 }, { "epoch": 0.5986933280538507, "grad_norm": 0.7650340795516968, "learning_rate": 5e-05, "llm_loss": 0.5943406373262405, "loss": 2.7209, "loss_aux_layer_0": 0.01690673828125, "loss_aux_layer_1": 0.034027099609375, "loss_aux_layer_10": 0.0618896484375, "loss_aux_layer_11": 0.0657958984375, "loss_aux_layer_12": 0.0706787109375, "loss_aux_layer_13": 0.0762939453125, "loss_aux_layer_14": 0.085205078125, "loss_aux_layer_15": 0.0941162109375, "loss_aux_layer_16": 0.1036376953125, "loss_aux_layer_17": 0.112060546875, "loss_aux_layer_18": 0.120361328125, "loss_aux_layer_19": 0.1241455078125, "loss_aux_layer_2": 0.04644775390625, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.056640625, "loss_aux_layer_4": 0.05914306640625, "loss_aux_layer_5": 0.06097412109375, "loss_aux_layer_6": 0.0638427734375, "loss_aux_layer_7": 0.0618896484375, "loss_aux_layer_8": 0.06121826171875, "loss_aux_layer_9": 0.06005859375, "step": 3024, "total_loss": 0.6802268177270889 }, { "epoch": 0.5988913086517521, "grad_norm": 1.257677674293518, "learning_rate": 5e-05, "llm_loss": 0.5403682440519333, "loss": 2.533, "loss_aux_layer_0": 0.0185546875, "loss_aux_layer_1": 0.038330078125, "loss_aux_layer_10": 0.06817626953125, "loss_aux_layer_11": 0.07275390625, "loss_aux_layer_12": 0.077880859375, "loss_aux_layer_13": 0.0836181640625, "loss_aux_layer_14": 0.0924072265625, "loss_aux_layer_15": 0.1014404296875, "loss_aux_layer_16": 0.1112060546875, "loss_aux_layer_17": 0.119140625, "loss_aux_layer_18": 0.12744140625, "loss_aux_layer_19": 0.13134765625, "loss_aux_layer_2": 0.05218505859375, "loss_aux_layer_20": 0.13916015625, "loss_aux_layer_21": 0.1474609375, "loss_aux_layer_22": 0.16943359375, "loss_aux_layer_23": 0.207275390625, "loss_aux_layer_3": 0.06298828125, "loss_aux_layer_4": 0.06597900390625, "loss_aux_layer_5": 0.06781005859375, "loss_aux_layer_6": 0.07080078125, "loss_aux_layer_7": 0.06878662109375, "loss_aux_layer_8": 0.0682373046875, "loss_aux_layer_9": 0.06695556640625, "step": 3025, "total_loss": 0.6332454085350037 }, { "epoch": 0.5990892892496535, "grad_norm": 0.8479840755462646, "learning_rate": 5e-05, "llm_loss": 0.6355032846331596, "loss": 2.8946, "loss_aux_layer_0": 0.01629638671875, "loss_aux_layer_1": 0.03619384765625, "loss_aux_layer_10": 0.06353759765625, "loss_aux_layer_11": 0.0679931640625, "loss_aux_layer_12": 0.07293701171875, "loss_aux_layer_13": 0.078857421875, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.09619140625, "loss_aux_layer_16": 0.1058349609375, "loss_aux_layer_17": 0.114013671875, "loss_aux_layer_18": 0.122314453125, "loss_aux_layer_19": 0.1253662109375, "loss_aux_layer_2": 0.0491943359375, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.059326171875, "loss_aux_layer_4": 0.06182861328125, "loss_aux_layer_5": 0.0635986328125, "loss_aux_layer_6": 0.06622314453125, "loss_aux_layer_7": 0.0640869140625, "loss_aux_layer_8": 0.063720703125, "loss_aux_layer_9": 0.06219482421875, "step": 3026, "total_loss": 0.7236551493406296 }, { "epoch": 0.599287269847555, "grad_norm": 1.183509111404419, "learning_rate": 5e-05, "llm_loss": 0.5990249216556549, "loss": 2.7591, "loss_aux_layer_0": 0.01708984375, "loss_aux_layer_1": 0.036865234375, "loss_aux_layer_10": 0.0650634765625, "loss_aux_layer_11": 0.06951904296875, "loss_aux_layer_12": 0.0745849609375, "loss_aux_layer_13": 0.08056640625, "loss_aux_layer_14": 0.08984375, "loss_aux_layer_15": 0.09912109375, "loss_aux_layer_16": 0.1090087890625, "loss_aux_layer_17": 0.1171875, "loss_aux_layer_18": 0.1256103515625, "loss_aux_layer_19": 0.12939453125, "loss_aux_layer_2": 0.05072021484375, "loss_aux_layer_20": 0.137451171875, "loss_aux_layer_21": 0.146484375, "loss_aux_layer_22": 0.169189453125, "loss_aux_layer_23": 0.206787109375, "loss_aux_layer_3": 0.0611572265625, "loss_aux_layer_4": 0.06353759765625, "loss_aux_layer_5": 0.06524658203125, "loss_aux_layer_6": 0.0684814453125, "loss_aux_layer_7": 0.06597900390625, "loss_aux_layer_8": 0.0654296875, "loss_aux_layer_9": 0.0638427734375, "step": 3027, "total_loss": 0.6897712498903275 }, { "epoch": 0.5994852504454563, "grad_norm": 0.8766260147094727, "learning_rate": 5e-05, "llm_loss": 0.5239111483097076, "loss": 2.451, "loss_aux_layer_0": 0.01751708984375, "loss_aux_layer_1": 0.036376953125, "loss_aux_layer_10": 0.06439208984375, "loss_aux_layer_11": 0.0689697265625, "loss_aux_layer_12": 0.0736083984375, "loss_aux_layer_13": 0.079345703125, "loss_aux_layer_14": 0.08837890625, "loss_aux_layer_15": 0.09716796875, "loss_aux_layer_16": 0.1068115234375, "loss_aux_layer_17": 0.114501953125, "loss_aux_layer_18": 0.122802734375, "loss_aux_layer_19": 0.1263427734375, "loss_aux_layer_2": 0.050048828125, "loss_aux_layer_20": 0.133544921875, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.162109375, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.0601806640625, "loss_aux_layer_4": 0.06292724609375, "loss_aux_layer_5": 0.06475830078125, "loss_aux_layer_6": 0.0673828125, "loss_aux_layer_7": 0.0653076171875, "loss_aux_layer_8": 0.064453125, "loss_aux_layer_9": 0.06329345703125, "step": 3028, "total_loss": 0.6127580106258392 }, { "epoch": 0.5996832310433577, "grad_norm": 1.1045855283737183, "learning_rate": 5e-05, "llm_loss": 0.5356073975563049, "loss": 2.4892, "loss_aux_layer_0": 0.017120361328125, "loss_aux_layer_1": 0.03387451171875, "loss_aux_layer_10": 0.0616455078125, "loss_aux_layer_11": 0.06591796875, "loss_aux_layer_12": 0.071044921875, "loss_aux_layer_13": 0.0767822265625, "loss_aux_layer_14": 0.0859375, "loss_aux_layer_15": 0.0950927734375, "loss_aux_layer_16": 0.10498046875, "loss_aux_layer_17": 0.1131591796875, "loss_aux_layer_18": 0.1224365234375, "loss_aux_layer_19": 0.126708984375, "loss_aux_layer_2": 0.0462646484375, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.142578125, "loss_aux_layer_22": 0.162109375, "loss_aux_layer_23": 0.199462890625, "loss_aux_layer_3": 0.055908203125, "loss_aux_layer_4": 0.05853271484375, "loss_aux_layer_5": 0.0604248046875, "loss_aux_layer_6": 0.06304931640625, "loss_aux_layer_7": 0.0614013671875, "loss_aux_layer_8": 0.06109619140625, "loss_aux_layer_9": 0.0601806640625, "step": 3029, "total_loss": 0.6223041862249374 }, { "epoch": 0.5998812116412592, "grad_norm": 0.927230954170227, "learning_rate": 5e-05, "llm_loss": 0.5341801792383194, "loss": 2.4909, "loss_aux_layer_0": 0.0166015625, "loss_aux_layer_1": 0.035888671875, "loss_aux_layer_10": 0.06402587890625, "loss_aux_layer_11": 0.0687255859375, "loss_aux_layer_12": 0.0740966796875, "loss_aux_layer_13": 0.0802001953125, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.0980224609375, "loss_aux_layer_16": 0.107421875, "loss_aux_layer_17": 0.1153564453125, "loss_aux_layer_18": 0.12353515625, "loss_aux_layer_19": 0.126708984375, "loss_aux_layer_2": 0.04876708984375, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.19921875, "loss_aux_layer_3": 0.05902099609375, "loss_aux_layer_4": 0.061767578125, "loss_aux_layer_5": 0.063232421875, "loss_aux_layer_6": 0.066162109375, "loss_aux_layer_7": 0.06402587890625, "loss_aux_layer_8": 0.06341552734375, "loss_aux_layer_9": 0.06243896484375, "step": 3030, "total_loss": 0.6227317154407501 }, { "epoch": 0.6000791922391605, "grad_norm": 0.9755420684814453, "learning_rate": 5e-05, "llm_loss": 0.580309271812439, "loss": 2.6783, "loss_aux_layer_0": 0.017913818359375, "loss_aux_layer_1": 0.0357666015625, "loss_aux_layer_10": 0.064208984375, "loss_aux_layer_11": 0.0684814453125, "loss_aux_layer_12": 0.073486328125, "loss_aux_layer_13": 0.0797119140625, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.09765625, "loss_aux_layer_16": 0.107177734375, "loss_aux_layer_17": 0.115234375, "loss_aux_layer_18": 0.1234130859375, "loss_aux_layer_19": 0.1268310546875, "loss_aux_layer_2": 0.04901123046875, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.14404296875, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.2041015625, "loss_aux_layer_3": 0.05950927734375, "loss_aux_layer_4": 0.06231689453125, "loss_aux_layer_5": 0.06402587890625, "loss_aux_layer_6": 0.0672607421875, "loss_aux_layer_7": 0.06494140625, "loss_aux_layer_8": 0.06414794921875, "loss_aux_layer_9": 0.06292724609375, "step": 3031, "total_loss": 0.669580489397049 }, { "epoch": 0.600277172837062, "grad_norm": 1.0024962425231934, "learning_rate": 5e-05, "llm_loss": 0.6164143085479736, "loss": 2.8222, "loss_aux_layer_0": 0.0167236328125, "loss_aux_layer_1": 0.03656005859375, "loss_aux_layer_10": 0.0657958984375, "loss_aux_layer_11": 0.0701904296875, "loss_aux_layer_12": 0.0748291015625, "loss_aux_layer_13": 0.08056640625, "loss_aux_layer_14": 0.0894775390625, "loss_aux_layer_15": 0.09814453125, "loss_aux_layer_16": 0.1072998046875, "loss_aux_layer_17": 0.1151123046875, "loss_aux_layer_18": 0.1236572265625, "loss_aux_layer_19": 0.1263427734375, "loss_aux_layer_2": 0.05010986328125, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.16064453125, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.0604248046875, "loss_aux_layer_4": 0.063232421875, "loss_aux_layer_5": 0.0650634765625, "loss_aux_layer_6": 0.068115234375, "loss_aux_layer_7": 0.06610107421875, "loss_aux_layer_8": 0.06524658203125, "loss_aux_layer_9": 0.06402587890625, "step": 3032, "total_loss": 0.7055461555719376 }, { "epoch": 0.6004751534349634, "grad_norm": 1.0473053455352783, "learning_rate": 5e-05, "llm_loss": 0.5288019701838493, "loss": 2.468, "loss_aux_layer_0": 0.01739501953125, "loss_aux_layer_1": 0.0352783203125, "loss_aux_layer_10": 0.06268310546875, "loss_aux_layer_11": 0.06695556640625, "loss_aux_layer_12": 0.0716552734375, "loss_aux_layer_13": 0.078125, "loss_aux_layer_14": 0.087890625, "loss_aux_layer_15": 0.0972900390625, "loss_aux_layer_16": 0.107666015625, "loss_aux_layer_17": 0.115234375, "loss_aux_layer_18": 0.1240234375, "loss_aux_layer_19": 0.1273193359375, "loss_aux_layer_2": 0.0479736328125, "loss_aux_layer_20": 0.135498046875, "loss_aux_layer_21": 0.14404296875, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.05810546875, "loss_aux_layer_4": 0.060546875, "loss_aux_layer_5": 0.06207275390625, "loss_aux_layer_6": 0.06500244140625, "loss_aux_layer_7": 0.0628662109375, "loss_aux_layer_8": 0.062255859375, "loss_aux_layer_9": 0.0614013671875, "step": 3033, "total_loss": 0.6169911623001099 }, { "epoch": 0.6006731340328648, "grad_norm": 1.285885214805603, "learning_rate": 5e-05, "llm_loss": 0.5934188961982727, "loss": 2.7129, "loss_aux_layer_0": 0.018646240234375, "loss_aux_layer_1": 0.03326416015625, "loss_aux_layer_10": 0.0611572265625, "loss_aux_layer_11": 0.06500244140625, "loss_aux_layer_12": 0.06951904296875, "loss_aux_layer_13": 0.07470703125, "loss_aux_layer_14": 0.0833740234375, "loss_aux_layer_15": 0.0921630859375, "loss_aux_layer_16": 0.1016845703125, "loss_aux_layer_17": 0.109619140625, "loss_aux_layer_18": 0.1180419921875, "loss_aux_layer_19": 0.1220703125, "loss_aux_layer_2": 0.04547119140625, "loss_aux_layer_20": 0.13037109375, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.05523681640625, "loss_aux_layer_4": 0.057861328125, "loss_aux_layer_5": 0.059814453125, "loss_aux_layer_6": 0.0628662109375, "loss_aux_layer_7": 0.06103515625, "loss_aux_layer_8": 0.060546875, "loss_aux_layer_9": 0.05975341796875, "step": 3034, "total_loss": 0.6782164126634598 }, { "epoch": 0.6008711146307661, "grad_norm": 0.9847946763038635, "learning_rate": 5e-05, "llm_loss": 0.6237266212701797, "loss": 2.8483, "loss_aux_layer_0": 0.018585205078125, "loss_aux_layer_1": 0.03607177734375, "loss_aux_layer_10": 0.064453125, "loss_aux_layer_11": 0.0684814453125, "loss_aux_layer_12": 0.0731201171875, "loss_aux_layer_13": 0.0787353515625, "loss_aux_layer_14": 0.0875244140625, "loss_aux_layer_15": 0.0960693359375, "loss_aux_layer_16": 0.1051025390625, "loss_aux_layer_17": 0.11328125, "loss_aux_layer_18": 0.1217041015625, "loss_aux_layer_19": 0.1248779296875, "loss_aux_layer_2": 0.04962158203125, "loss_aux_layer_20": 0.13330078125, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.06005859375, "loss_aux_layer_4": 0.0625, "loss_aux_layer_5": 0.064208984375, "loss_aux_layer_6": 0.0670166015625, "loss_aux_layer_7": 0.0648193359375, "loss_aux_layer_8": 0.064453125, "loss_aux_layer_9": 0.0628662109375, "step": 3035, "total_loss": 0.7120761424303055 }, { "epoch": 0.6010690952286676, "grad_norm": 0.9524043202400208, "learning_rate": 5e-05, "llm_loss": 0.5825062841176987, "loss": 2.6723, "loss_aux_layer_0": 0.018096923828125, "loss_aux_layer_1": 0.03570556640625, "loss_aux_layer_10": 0.06085205078125, "loss_aux_layer_11": 0.06488037109375, "loss_aux_layer_12": 0.06982421875, "loss_aux_layer_13": 0.075439453125, "loss_aux_layer_14": 0.084228515625, "loss_aux_layer_15": 0.0926513671875, "loss_aux_layer_16": 0.10205078125, "loss_aux_layer_17": 0.109375, "loss_aux_layer_18": 0.11865234375, "loss_aux_layer_19": 0.122802734375, "loss_aux_layer_2": 0.047607421875, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.198974609375, "loss_aux_layer_3": 0.0572509765625, "loss_aux_layer_4": 0.0594482421875, "loss_aux_layer_5": 0.06097412109375, "loss_aux_layer_6": 0.06341552734375, "loss_aux_layer_7": 0.06134033203125, "loss_aux_layer_8": 0.06072998046875, "loss_aux_layer_9": 0.0596923828125, "step": 3036, "total_loss": 0.6680813878774643 }, { "epoch": 0.601267075826569, "grad_norm": 1.0283708572387695, "learning_rate": 5e-05, "llm_loss": 0.6004262566566467, "loss": 2.7459, "loss_aux_layer_0": 0.017059326171875, "loss_aux_layer_1": 0.034423828125, "loss_aux_layer_10": 0.0601806640625, "loss_aux_layer_11": 0.06439208984375, "loss_aux_layer_12": 0.0694580078125, "loss_aux_layer_13": 0.075439453125, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.093505859375, "loss_aux_layer_16": 0.1036376953125, "loss_aux_layer_17": 0.1116943359375, "loss_aux_layer_18": 0.1201171875, "loss_aux_layer_19": 0.12451171875, "loss_aux_layer_2": 0.04632568359375, "loss_aux_layer_20": 0.13330078125, "loss_aux_layer_21": 0.142333984375, "loss_aux_layer_22": 0.16455078125, "loss_aux_layer_23": 0.20263671875, "loss_aux_layer_3": 0.0556640625, "loss_aux_layer_4": 0.0579833984375, "loss_aux_layer_5": 0.059814453125, "loss_aux_layer_6": 0.06280517578125, "loss_aux_layer_7": 0.060791015625, "loss_aux_layer_8": 0.0601806640625, "loss_aux_layer_9": 0.058837890625, "step": 3037, "total_loss": 0.6864661425352097 }, { "epoch": 0.6014650564244705, "grad_norm": 1.1312581300735474, "learning_rate": 5e-05, "llm_loss": 0.5928554236888885, "loss": 2.7251, "loss_aux_layer_0": 0.017425537109375, "loss_aux_layer_1": 0.03594970703125, "loss_aux_layer_10": 0.06390380859375, "loss_aux_layer_11": 0.0684814453125, "loss_aux_layer_12": 0.0736083984375, "loss_aux_layer_13": 0.0794677734375, "loss_aux_layer_14": 0.088623046875, "loss_aux_layer_15": 0.0975341796875, "loss_aux_layer_16": 0.1070556640625, "loss_aux_layer_17": 0.1153564453125, "loss_aux_layer_18": 0.123779296875, "loss_aux_layer_19": 0.126953125, "loss_aux_layer_2": 0.04888916015625, "loss_aux_layer_20": 0.134521484375, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.197998046875, "loss_aux_layer_3": 0.059326171875, "loss_aux_layer_4": 0.0616455078125, "loss_aux_layer_5": 0.06298828125, "loss_aux_layer_6": 0.06591796875, "loss_aux_layer_7": 0.0640869140625, "loss_aux_layer_8": 0.06353759765625, "loss_aux_layer_9": 0.06231689453125, "step": 3038, "total_loss": 0.681265726685524 }, { "epoch": 0.6016630370223718, "grad_norm": 1.194262981414795, "learning_rate": 5e-05, "llm_loss": 0.5925916880369186, "loss": 2.729, "loss_aux_layer_0": 0.017425537109375, "loss_aux_layer_1": 0.03680419921875, "loss_aux_layer_10": 0.064453125, "loss_aux_layer_11": 0.068359375, "loss_aux_layer_12": 0.0733642578125, "loss_aux_layer_13": 0.0791015625, "loss_aux_layer_14": 0.088623046875, "loss_aux_layer_15": 0.0982666015625, "loss_aux_layer_16": 0.108154296875, "loss_aux_layer_17": 0.1156005859375, "loss_aux_layer_18": 0.1241455078125, "loss_aux_layer_19": 0.12744140625, "loss_aux_layer_2": 0.05023193359375, "loss_aux_layer_20": 0.1357421875, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.20556640625, "loss_aux_layer_3": 0.06072998046875, "loss_aux_layer_4": 0.0628662109375, "loss_aux_layer_5": 0.064453125, "loss_aux_layer_6": 0.06719970703125, "loss_aux_layer_7": 0.06512451171875, "loss_aux_layer_8": 0.06414794921875, "loss_aux_layer_9": 0.06317138671875, "step": 3039, "total_loss": 0.682241827249527 }, { "epoch": 0.6018610176202732, "grad_norm": 1.0764098167419434, "learning_rate": 5e-05, "llm_loss": 0.5907347649335861, "loss": 2.7119, "loss_aux_layer_0": 0.016845703125, "loss_aux_layer_1": 0.0345458984375, "loss_aux_layer_10": 0.06231689453125, "loss_aux_layer_11": 0.0665283203125, "loss_aux_layer_12": 0.071533203125, "loss_aux_layer_13": 0.077392578125, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.096435546875, "loss_aux_layer_16": 0.1058349609375, "loss_aux_layer_17": 0.1138916015625, "loss_aux_layer_18": 0.12255859375, "loss_aux_layer_19": 0.1263427734375, "loss_aux_layer_2": 0.04730224609375, "loss_aux_layer_20": 0.13427734375, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.198974609375, "loss_aux_layer_3": 0.0572509765625, "loss_aux_layer_4": 0.05987548828125, "loss_aux_layer_5": 0.061767578125, "loss_aux_layer_6": 0.0645751953125, "loss_aux_layer_7": 0.0625, "loss_aux_layer_8": 0.06201171875, "loss_aux_layer_9": 0.06109619140625, "step": 3040, "total_loss": 0.677968367934227 }, { "epoch": 0.6020589982181747, "grad_norm": 1.2786505222320557, "learning_rate": 5e-05, "llm_loss": 0.6293617337942123, "loss": 2.8696, "loss_aux_layer_0": 0.016693115234375, "loss_aux_layer_1": 0.03631591796875, "loss_aux_layer_10": 0.06390380859375, "loss_aux_layer_11": 0.068359375, "loss_aux_layer_12": 0.0731201171875, "loss_aux_layer_13": 0.0794677734375, "loss_aux_layer_14": 0.08837890625, "loss_aux_layer_15": 0.096923828125, "loss_aux_layer_16": 0.1064453125, "loss_aux_layer_17": 0.1146240234375, "loss_aux_layer_18": 0.1224365234375, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.0484619140625, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.141357421875, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.0582275390625, "loss_aux_layer_4": 0.06085205078125, "loss_aux_layer_5": 0.06256103515625, "loss_aux_layer_6": 0.06549072265625, "loss_aux_layer_7": 0.06353759765625, "loss_aux_layer_8": 0.06298828125, "loss_aux_layer_9": 0.06207275390625, "step": 3041, "total_loss": 0.7173969596624374 }, { "epoch": 0.602256978816076, "grad_norm": 1.0717318058013916, "learning_rate": 5e-05, "llm_loss": 0.6021295040845871, "loss": 2.7702, "loss_aux_layer_0": 0.017333984375, "loss_aux_layer_1": 0.03826904296875, "loss_aux_layer_10": 0.0670166015625, "loss_aux_layer_11": 0.0714111328125, "loss_aux_layer_12": 0.0765380859375, "loss_aux_layer_13": 0.081787109375, "loss_aux_layer_14": 0.09033203125, "loss_aux_layer_15": 0.0982666015625, "loss_aux_layer_16": 0.1072998046875, "loss_aux_layer_17": 0.114990234375, "loss_aux_layer_18": 0.1231689453125, "loss_aux_layer_19": 0.1263427734375, "loss_aux_layer_2": 0.05242919921875, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.16357421875, "loss_aux_layer_23": 0.200927734375, "loss_aux_layer_3": 0.06268310546875, "loss_aux_layer_4": 0.06512451171875, "loss_aux_layer_5": 0.066650390625, "loss_aux_layer_6": 0.0697021484375, "loss_aux_layer_7": 0.0677490234375, "loss_aux_layer_8": 0.0670166015625, "loss_aux_layer_9": 0.065673828125, "step": 3042, "total_loss": 0.6925544440746307 }, { "epoch": 0.6024549594139774, "grad_norm": 1.1983369588851929, "learning_rate": 5e-05, "llm_loss": 0.5870878100395203, "loss": 2.6919, "loss_aux_layer_0": 0.017486572265625, "loss_aux_layer_1": 0.03515625, "loss_aux_layer_10": 0.06170654296875, "loss_aux_layer_11": 0.0657958984375, "loss_aux_layer_12": 0.07080078125, "loss_aux_layer_13": 0.0771484375, "loss_aux_layer_14": 0.0859375, "loss_aux_layer_15": 0.094970703125, "loss_aux_layer_16": 0.104248046875, "loss_aux_layer_17": 0.1121826171875, "loss_aux_layer_18": 0.1201171875, "loss_aux_layer_19": 0.123046875, "loss_aux_layer_2": 0.04803466796875, "loss_aux_layer_20": 0.130126953125, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.193603515625, "loss_aux_layer_3": 0.05767822265625, "loss_aux_layer_4": 0.06005859375, "loss_aux_layer_5": 0.06158447265625, "loss_aux_layer_6": 0.0643310546875, "loss_aux_layer_7": 0.06182861328125, "loss_aux_layer_8": 0.0614013671875, "loss_aux_layer_9": 0.06024169921875, "step": 3043, "total_loss": 0.672969862818718 }, { "epoch": 0.6026529400118789, "grad_norm": 1.2910823822021484, "learning_rate": 5e-05, "llm_loss": 0.5514290556311607, "loss": 2.5588, "loss_aux_layer_0": 0.016754150390625, "loss_aux_layer_1": 0.0364990234375, "loss_aux_layer_10": 0.0645751953125, "loss_aux_layer_11": 0.0687255859375, "loss_aux_layer_12": 0.0732421875, "loss_aux_layer_13": 0.0792236328125, "loss_aux_layer_14": 0.087890625, "loss_aux_layer_15": 0.0963134765625, "loss_aux_layer_16": 0.1051025390625, "loss_aux_layer_17": 0.1131591796875, "loss_aux_layer_18": 0.1207275390625, "loss_aux_layer_19": 0.1239013671875, "loss_aux_layer_2": 0.050537109375, "loss_aux_layer_20": 0.131591796875, "loss_aux_layer_21": 0.1396484375, "loss_aux_layer_22": 0.160888671875, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.06085205078125, "loss_aux_layer_4": 0.0634765625, "loss_aux_layer_5": 0.0653076171875, "loss_aux_layer_6": 0.0682373046875, "loss_aux_layer_7": 0.0657958984375, "loss_aux_layer_8": 0.06494140625, "loss_aux_layer_9": 0.06329345703125, "step": 3044, "total_loss": 0.6396943032741547 }, { "epoch": 0.6028509206097803, "grad_norm": 1.1278775930404663, "learning_rate": 5e-05, "llm_loss": 0.5281066596508026, "loss": 2.4652, "loss_aux_layer_0": 0.01776123046875, "loss_aux_layer_1": 0.035888671875, "loss_aux_layer_10": 0.06378173828125, "loss_aux_layer_11": 0.068115234375, "loss_aux_layer_12": 0.07275390625, "loss_aux_layer_13": 0.0782470703125, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.0958251953125, "loss_aux_layer_16": 0.105712890625, "loss_aux_layer_17": 0.1138916015625, "loss_aux_layer_18": 0.1219482421875, "loss_aux_layer_19": 0.1256103515625, "loss_aux_layer_2": 0.04949951171875, "loss_aux_layer_20": 0.133544921875, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.20068359375, "loss_aux_layer_3": 0.05950927734375, "loss_aux_layer_4": 0.06201171875, "loss_aux_layer_5": 0.0634765625, "loss_aux_layer_6": 0.0665283203125, "loss_aux_layer_7": 0.06402587890625, "loss_aux_layer_8": 0.063720703125, "loss_aux_layer_9": 0.0623779296875, "step": 3045, "total_loss": 0.6163004040718079 }, { "epoch": 0.6030489012076816, "grad_norm": 0.9810858368873596, "learning_rate": 5e-05, "llm_loss": 0.597419336438179, "loss": 2.7466, "loss_aux_layer_0": 0.0177001953125, "loss_aux_layer_1": 0.03729248046875, "loss_aux_layer_10": 0.065185546875, "loss_aux_layer_11": 0.06939697265625, "loss_aux_layer_12": 0.0740966796875, "loss_aux_layer_13": 0.0802001953125, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.0980224609375, "loss_aux_layer_16": 0.10791015625, "loss_aux_layer_17": 0.1163330078125, "loss_aux_layer_18": 0.12353515625, "loss_aux_layer_19": 0.126953125, "loss_aux_layer_2": 0.05023193359375, "loss_aux_layer_20": 0.13427734375, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.197265625, "loss_aux_layer_3": 0.0606689453125, "loss_aux_layer_4": 0.0634765625, "loss_aux_layer_5": 0.0650634765625, "loss_aux_layer_6": 0.06805419921875, "loss_aux_layer_7": 0.06573486328125, "loss_aux_layer_8": 0.06524658203125, "loss_aux_layer_9": 0.06414794921875, "step": 3046, "total_loss": 0.6866583824157715 }, { "epoch": 0.6032468818055831, "grad_norm": 1.1741063594818115, "learning_rate": 5e-05, "llm_loss": 0.6458039879798889, "loss": 2.9373, "loss_aux_layer_0": 0.017333984375, "loss_aux_layer_1": 0.03692626953125, "loss_aux_layer_10": 0.0648193359375, "loss_aux_layer_11": 0.069091796875, "loss_aux_layer_12": 0.0736083984375, "loss_aux_layer_13": 0.0794677734375, "loss_aux_layer_14": 0.0885009765625, "loss_aux_layer_15": 0.0972900390625, "loss_aux_layer_16": 0.1068115234375, "loss_aux_layer_17": 0.1143798828125, "loss_aux_layer_18": 0.12255859375, "loss_aux_layer_19": 0.1246337890625, "loss_aux_layer_2": 0.05035400390625, "loss_aux_layer_20": 0.1318359375, "loss_aux_layer_21": 0.1396484375, "loss_aux_layer_22": 0.16064453125, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.060302734375, "loss_aux_layer_4": 0.06280517578125, "loss_aux_layer_5": 0.06439208984375, "loss_aux_layer_6": 0.0673828125, "loss_aux_layer_7": 0.0654296875, "loss_aux_layer_8": 0.0648193359375, "loss_aux_layer_9": 0.06365966796875, "step": 3047, "total_loss": 0.7343129217624664 }, { "epoch": 0.6034448624034845, "grad_norm": 1.0704444646835327, "learning_rate": 5e-05, "llm_loss": 0.6073468774557114, "loss": 2.7798, "loss_aux_layer_0": 0.01727294921875, "loss_aux_layer_1": 0.03533935546875, "loss_aux_layer_10": 0.064208984375, "loss_aux_layer_11": 0.06829833984375, "loss_aux_layer_12": 0.0726318359375, "loss_aux_layer_13": 0.078125, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.09521484375, "loss_aux_layer_16": 0.10498046875, "loss_aux_layer_17": 0.1129150390625, "loss_aux_layer_18": 0.12158203125, "loss_aux_layer_19": 0.1246337890625, "loss_aux_layer_2": 0.0479736328125, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.1982421875, "loss_aux_layer_3": 0.05828857421875, "loss_aux_layer_4": 0.061279296875, "loss_aux_layer_5": 0.06317138671875, "loss_aux_layer_6": 0.0665283203125, "loss_aux_layer_7": 0.0643310546875, "loss_aux_layer_8": 0.06396484375, "loss_aux_layer_9": 0.06292724609375, "step": 3048, "total_loss": 0.6949526146054268 }, { "epoch": 0.6036428430013858, "grad_norm": 1.0702005624771118, "learning_rate": 5e-05, "llm_loss": 0.5792012810707092, "loss": 2.664, "loss_aux_layer_0": 0.017913818359375, "loss_aux_layer_1": 0.0361328125, "loss_aux_layer_10": 0.0635986328125, "loss_aux_layer_11": 0.06787109375, "loss_aux_layer_12": 0.0726318359375, "loss_aux_layer_13": 0.078369140625, "loss_aux_layer_14": 0.086669921875, "loss_aux_layer_15": 0.0948486328125, "loss_aux_layer_16": 0.10400390625, "loss_aux_layer_17": 0.1114501953125, "loss_aux_layer_18": 0.1195068359375, "loss_aux_layer_19": 0.121826171875, "loss_aux_layer_2": 0.04949951171875, "loss_aux_layer_20": 0.129638671875, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.193603515625, "loss_aux_layer_3": 0.0592041015625, "loss_aux_layer_4": 0.06182861328125, "loss_aux_layer_5": 0.06341552734375, "loss_aux_layer_6": 0.0665283203125, "loss_aux_layer_7": 0.06463623046875, "loss_aux_layer_8": 0.06396484375, "loss_aux_layer_9": 0.06256103515625, "step": 3049, "total_loss": 0.6660066097974777 }, { "epoch": 0.6038408235992873, "grad_norm": 0.832287609577179, "learning_rate": 5e-05, "llm_loss": 0.5456563234329224, "loss": 2.5334, "loss_aux_layer_0": 0.01849365234375, "loss_aux_layer_1": 0.0355224609375, "loss_aux_layer_10": 0.063720703125, "loss_aux_layer_11": 0.0682373046875, "loss_aux_layer_12": 0.0731201171875, "loss_aux_layer_13": 0.0789794921875, "loss_aux_layer_14": 0.0875244140625, "loss_aux_layer_15": 0.0960693359375, "loss_aux_layer_16": 0.105224609375, "loss_aux_layer_17": 0.113037109375, "loss_aux_layer_18": 0.12109375, "loss_aux_layer_19": 0.1239013671875, "loss_aux_layer_2": 0.04852294921875, "loss_aux_layer_20": 0.131591796875, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.05902099609375, "loss_aux_layer_4": 0.06182861328125, "loss_aux_layer_5": 0.06341552734375, "loss_aux_layer_6": 0.06622314453125, "loss_aux_layer_7": 0.06427001953125, "loss_aux_layer_8": 0.0638427734375, "loss_aux_layer_9": 0.0625, "step": 3050, "total_loss": 0.6333391517400742 }, { "epoch": 0.6040388041971887, "grad_norm": 0.9119130373001099, "learning_rate": 5e-05, "llm_loss": 0.5287080034613609, "loss": 2.4684, "loss_aux_layer_0": 0.017913818359375, "loss_aux_layer_1": 0.035430908203125, "loss_aux_layer_10": 0.06268310546875, "loss_aux_layer_11": 0.06671142578125, "loss_aux_layer_12": 0.07177734375, "loss_aux_layer_13": 0.07763671875, "loss_aux_layer_14": 0.0870361328125, "loss_aux_layer_15": 0.0960693359375, "loss_aux_layer_16": 0.106201171875, "loss_aux_layer_17": 0.114501953125, "loss_aux_layer_18": 0.1234130859375, "loss_aux_layer_19": 0.1278076171875, "loss_aux_layer_2": 0.0484619140625, "loss_aux_layer_20": 0.135986328125, "loss_aux_layer_21": 0.14404296875, "loss_aux_layer_22": 0.166748046875, "loss_aux_layer_23": 0.205322265625, "loss_aux_layer_3": 0.05828857421875, "loss_aux_layer_4": 0.06085205078125, "loss_aux_layer_5": 0.06256103515625, "loss_aux_layer_6": 0.065673828125, "loss_aux_layer_7": 0.06317138671875, "loss_aux_layer_8": 0.0626220703125, "loss_aux_layer_9": 0.061279296875, "step": 3051, "total_loss": 0.6171023547649384 }, { "epoch": 0.6042367847950901, "grad_norm": 1.056797742843628, "learning_rate": 5e-05, "llm_loss": 0.6033025309443474, "loss": 2.7678, "loss_aux_layer_0": 0.017120361328125, "loss_aux_layer_1": 0.03668212890625, "loss_aux_layer_10": 0.064208984375, "loss_aux_layer_11": 0.06842041015625, "loss_aux_layer_12": 0.073486328125, "loss_aux_layer_13": 0.07958984375, "loss_aux_layer_14": 0.0889892578125, "loss_aux_layer_15": 0.0975341796875, "loss_aux_layer_16": 0.1070556640625, "loss_aux_layer_17": 0.114990234375, "loss_aux_layer_18": 0.122802734375, "loss_aux_layer_19": 0.1260986328125, "loss_aux_layer_2": 0.04986572265625, "loss_aux_layer_20": 0.133544921875, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.198974609375, "loss_aux_layer_3": 0.05999755859375, "loss_aux_layer_4": 0.0626220703125, "loss_aux_layer_5": 0.064208984375, "loss_aux_layer_6": 0.0670166015625, "loss_aux_layer_7": 0.0648193359375, "loss_aux_layer_8": 0.06427001953125, "loss_aux_layer_9": 0.06280517578125, "step": 3052, "total_loss": 0.69195157289505 }, { "epoch": 0.6044347653929915, "grad_norm": 1.0638787746429443, "learning_rate": 5e-05, "llm_loss": 0.6145090013742447, "loss": 2.8213, "loss_aux_layer_0": 0.01690673828125, "loss_aux_layer_1": 0.0379638671875, "loss_aux_layer_10": 0.06707763671875, "loss_aux_layer_11": 0.0714111328125, "loss_aux_layer_12": 0.07666015625, "loss_aux_layer_13": 0.082275390625, "loss_aux_layer_14": 0.0911865234375, "loss_aux_layer_15": 0.10009765625, "loss_aux_layer_16": 0.1094970703125, "loss_aux_layer_17": 0.1170654296875, "loss_aux_layer_18": 0.12451171875, "loss_aux_layer_19": 0.126953125, "loss_aux_layer_2": 0.05218505859375, "loss_aux_layer_20": 0.134765625, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.19921875, "loss_aux_layer_3": 0.06292724609375, "loss_aux_layer_4": 0.065673828125, "loss_aux_layer_5": 0.06732177734375, "loss_aux_layer_6": 0.0704345703125, "loss_aux_layer_7": 0.06805419921875, "loss_aux_layer_8": 0.0672607421875, "loss_aux_layer_9": 0.06549072265625, "step": 3053, "total_loss": 0.7053190171718597 }, { "epoch": 0.6046327459908929, "grad_norm": 0.8112555742263794, "learning_rate": 5e-05, "llm_loss": 0.5522722229361534, "loss": 2.5604, "loss_aux_layer_0": 0.017578125, "loss_aux_layer_1": 0.035888671875, "loss_aux_layer_10": 0.06329345703125, "loss_aux_layer_11": 0.0675048828125, "loss_aux_layer_12": 0.072021484375, "loss_aux_layer_13": 0.077880859375, "loss_aux_layer_14": 0.0867919921875, "loss_aux_layer_15": 0.0955810546875, "loss_aux_layer_16": 0.10546875, "loss_aux_layer_17": 0.1129150390625, "loss_aux_layer_18": 0.1217041015625, "loss_aux_layer_19": 0.12548828125, "loss_aux_layer_2": 0.04888916015625, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.058837890625, "loss_aux_layer_4": 0.06134033203125, "loss_aux_layer_5": 0.0631103515625, "loss_aux_layer_6": 0.0660400390625, "loss_aux_layer_7": 0.063720703125, "loss_aux_layer_8": 0.06329345703125, "loss_aux_layer_9": 0.06195068359375, "step": 3054, "total_loss": 0.6401093751192093 }, { "epoch": 0.6048307265887943, "grad_norm": 1.0509536266326904, "learning_rate": 5e-05, "llm_loss": 0.6409040838479996, "loss": 2.9123, "loss_aux_layer_0": 0.01715087890625, "loss_aux_layer_1": 0.036376953125, "loss_aux_layer_10": 0.0640869140625, "loss_aux_layer_11": 0.068115234375, "loss_aux_layer_12": 0.072998046875, "loss_aux_layer_13": 0.07861328125, "loss_aux_layer_14": 0.0872802734375, "loss_aux_layer_15": 0.095458984375, "loss_aux_layer_16": 0.1044921875, "loss_aux_layer_17": 0.112060546875, "loss_aux_layer_18": 0.119873046875, "loss_aux_layer_19": 0.1219482421875, "loss_aux_layer_2": 0.04986572265625, "loss_aux_layer_20": 0.12939453125, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.059814453125, "loss_aux_layer_4": 0.0623779296875, "loss_aux_layer_5": 0.063720703125, "loss_aux_layer_6": 0.0667724609375, "loss_aux_layer_7": 0.06451416015625, "loss_aux_layer_8": 0.0638427734375, "loss_aux_layer_9": 0.06256103515625, "step": 3055, "total_loss": 0.7280685752630234 }, { "epoch": 0.6050287071866957, "grad_norm": 0.9261244535446167, "learning_rate": 5e-05, "llm_loss": 0.5248123407363892, "loss": 2.4494, "loss_aux_layer_0": 0.016876220703125, "loss_aux_layer_1": 0.0341796875, "loss_aux_layer_10": 0.06280517578125, "loss_aux_layer_11": 0.0667724609375, "loss_aux_layer_12": 0.0718994140625, "loss_aux_layer_13": 0.077392578125, "loss_aux_layer_14": 0.086181640625, "loss_aux_layer_15": 0.0953369140625, "loss_aux_layer_16": 0.104736328125, "loss_aux_layer_17": 0.1134033203125, "loss_aux_layer_18": 0.1220703125, "loss_aux_layer_19": 0.126220703125, "loss_aux_layer_2": 0.0477294921875, "loss_aux_layer_20": 0.13427734375, "loss_aux_layer_21": 0.142578125, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.203125, "loss_aux_layer_3": 0.05767822265625, "loss_aux_layer_4": 0.06011962890625, "loss_aux_layer_5": 0.06182861328125, "loss_aux_layer_6": 0.064697265625, "loss_aux_layer_7": 0.06268310546875, "loss_aux_layer_8": 0.062255859375, "loss_aux_layer_9": 0.06134033203125, "step": 3056, "total_loss": 0.6123469173908234 }, { "epoch": 0.6052266877845971, "grad_norm": 0.896218478679657, "learning_rate": 5e-05, "llm_loss": 0.5179304108023643, "loss": 2.42, "loss_aux_layer_0": 0.01788330078125, "loss_aux_layer_1": 0.03466796875, "loss_aux_layer_10": 0.0634765625, "loss_aux_layer_11": 0.067626953125, "loss_aux_layer_12": 0.0723876953125, "loss_aux_layer_13": 0.077880859375, "loss_aux_layer_14": 0.0860595703125, "loss_aux_layer_15": 0.0943603515625, "loss_aux_layer_16": 0.104248046875, "loss_aux_layer_17": 0.1119384765625, "loss_aux_layer_18": 0.1207275390625, "loss_aux_layer_19": 0.1241455078125, "loss_aux_layer_2": 0.04754638671875, "loss_aux_layer_20": 0.131591796875, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.16064453125, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.05792236328125, "loss_aux_layer_4": 0.06072998046875, "loss_aux_layer_5": 0.06268310546875, "loss_aux_layer_6": 0.06561279296875, "loss_aux_layer_7": 0.06365966796875, "loss_aux_layer_8": 0.0631103515625, "loss_aux_layer_9": 0.06201171875, "step": 3057, "total_loss": 0.6049932688474655 }, { "epoch": 0.6054246683824985, "grad_norm": 0.9997830390930176, "learning_rate": 5e-05, "llm_loss": 0.6057744398713112, "loss": 2.7729, "loss_aux_layer_0": 0.016510009765625, "loss_aux_layer_1": 0.03558349609375, "loss_aux_layer_10": 0.0638427734375, "loss_aux_layer_11": 0.06817626953125, "loss_aux_layer_12": 0.0728759765625, "loss_aux_layer_13": 0.0784912109375, "loss_aux_layer_14": 0.087158203125, "loss_aux_layer_15": 0.0958251953125, "loss_aux_layer_16": 0.105712890625, "loss_aux_layer_17": 0.1136474609375, "loss_aux_layer_18": 0.1224365234375, "loss_aux_layer_19": 0.1246337890625, "loss_aux_layer_2": 0.04864501953125, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.19482421875, "loss_aux_layer_3": 0.05877685546875, "loss_aux_layer_4": 0.06170654296875, "loss_aux_layer_5": 0.0634765625, "loss_aux_layer_6": 0.0665283203125, "loss_aux_layer_7": 0.06427001953125, "loss_aux_layer_8": 0.063720703125, "loss_aux_layer_9": 0.06231689453125, "step": 3058, "total_loss": 0.6932359933853149 }, { "epoch": 0.6056226489804, "grad_norm": 1.146288275718689, "learning_rate": 5e-05, "llm_loss": 0.5731576830148697, "loss": 2.6392, "loss_aux_layer_0": 0.016387939453125, "loss_aux_layer_1": 0.03594970703125, "loss_aux_layer_10": 0.06317138671875, "loss_aux_layer_11": 0.06732177734375, "loss_aux_layer_12": 0.0716552734375, "loss_aux_layer_13": 0.077392578125, "loss_aux_layer_14": 0.0855712890625, "loss_aux_layer_15": 0.0938720703125, "loss_aux_layer_16": 0.10302734375, "loss_aux_layer_17": 0.1107177734375, "loss_aux_layer_18": 0.118896484375, "loss_aux_layer_19": 0.1224365234375, "loss_aux_layer_2": 0.0487060546875, "loss_aux_layer_20": 0.130126953125, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.0589599609375, "loss_aux_layer_4": 0.06146240234375, "loss_aux_layer_5": 0.0631103515625, "loss_aux_layer_6": 0.066162109375, "loss_aux_layer_7": 0.064208984375, "loss_aux_layer_8": 0.0634765625, "loss_aux_layer_9": 0.0616455078125, "step": 3059, "total_loss": 0.6597879528999329 }, { "epoch": 0.6058206295783013, "grad_norm": 1.0631823539733887, "learning_rate": 5e-05, "llm_loss": 0.5138922706246376, "loss": 2.4043, "loss_aux_layer_0": 0.0177001953125, "loss_aux_layer_1": 0.0345458984375, "loss_aux_layer_10": 0.062255859375, "loss_aux_layer_11": 0.06658935546875, "loss_aux_layer_12": 0.07080078125, "loss_aux_layer_13": 0.076416015625, "loss_aux_layer_14": 0.0860595703125, "loss_aux_layer_15": 0.0953369140625, "loss_aux_layer_16": 0.1053466796875, "loss_aux_layer_17": 0.1136474609375, "loss_aux_layer_18": 0.1221923828125, "loss_aux_layer_19": 0.1260986328125, "loss_aux_layer_2": 0.04705810546875, "loss_aux_layer_20": 0.1339111328125, "loss_aux_layer_21": 0.142333984375, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.202880859375, "loss_aux_layer_3": 0.05694580078125, "loss_aux_layer_4": 0.05950927734375, "loss_aux_layer_5": 0.06109619140625, "loss_aux_layer_6": 0.06402587890625, "loss_aux_layer_7": 0.06201171875, "loss_aux_layer_8": 0.06158447265625, "loss_aux_layer_9": 0.06060791015625, "step": 3060, "total_loss": 0.6010718643665314 }, { "epoch": 0.6060186101762027, "grad_norm": 1.1714869737625122, "learning_rate": 5e-05, "llm_loss": 0.672959566116333, "loss": 3.0391, "loss_aux_layer_0": 0.017303466796875, "loss_aux_layer_1": 0.03570556640625, "loss_aux_layer_10": 0.06378173828125, "loss_aux_layer_11": 0.0679931640625, "loss_aux_layer_12": 0.0723876953125, "loss_aux_layer_13": 0.0777587890625, "loss_aux_layer_14": 0.0860595703125, "loss_aux_layer_15": 0.0947265625, "loss_aux_layer_16": 0.103759765625, "loss_aux_layer_17": 0.1116943359375, "loss_aux_layer_18": 0.1192626953125, "loss_aux_layer_19": 0.1224365234375, "loss_aux_layer_2": 0.04931640625, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.136962890625, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.194580078125, "loss_aux_layer_3": 0.05975341796875, "loss_aux_layer_4": 0.06195068359375, "loss_aux_layer_5": 0.0634765625, "loss_aux_layer_6": 0.066162109375, "loss_aux_layer_7": 0.06414794921875, "loss_aux_layer_8": 0.06365966796875, "loss_aux_layer_9": 0.06231689453125, "step": 3061, "total_loss": 0.7597723007202148 }, { "epoch": 0.6062165907741042, "grad_norm": 0.8897770643234253, "learning_rate": 5e-05, "llm_loss": 0.4852791279554367, "loss": 2.2971, "loss_aux_layer_0": 0.017120361328125, "loss_aux_layer_1": 0.036865234375, "loss_aux_layer_10": 0.06512451171875, "loss_aux_layer_11": 0.0697021484375, "loss_aux_layer_12": 0.073974609375, "loss_aux_layer_13": 0.07958984375, "loss_aux_layer_14": 0.0880126953125, "loss_aux_layer_15": 0.0966796875, "loss_aux_layer_16": 0.1060791015625, "loss_aux_layer_17": 0.11376953125, "loss_aux_layer_18": 0.121826171875, "loss_aux_layer_19": 0.1253662109375, "loss_aux_layer_2": 0.05029296875, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.16455078125, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.06048583984375, "loss_aux_layer_4": 0.0628662109375, "loss_aux_layer_5": 0.0645751953125, "loss_aux_layer_6": 0.068115234375, "loss_aux_layer_7": 0.0657958984375, "loss_aux_layer_8": 0.06488037109375, "loss_aux_layer_9": 0.06353759765625, "step": 3062, "total_loss": 0.5742754489183426 }, { "epoch": 0.6064145713720055, "grad_norm": 0.9195343255996704, "learning_rate": 5e-05, "llm_loss": 0.5582736879587173, "loss": 2.5854, "loss_aux_layer_0": 0.016510009765625, "loss_aux_layer_1": 0.03662109375, "loss_aux_layer_10": 0.0645751953125, "loss_aux_layer_11": 0.0687255859375, "loss_aux_layer_12": 0.07373046875, "loss_aux_layer_13": 0.07958984375, "loss_aux_layer_14": 0.0877685546875, "loss_aux_layer_15": 0.0958251953125, "loss_aux_layer_16": 0.1044921875, "loss_aux_layer_17": 0.112060546875, "loss_aux_layer_18": 0.1202392578125, "loss_aux_layer_19": 0.1231689453125, "loss_aux_layer_2": 0.05029296875, "loss_aux_layer_20": 0.1307373046875, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.199462890625, "loss_aux_layer_3": 0.06072998046875, "loss_aux_layer_4": 0.06298828125, "loss_aux_layer_5": 0.0648193359375, "loss_aux_layer_6": 0.0673828125, "loss_aux_layer_7": 0.0654296875, "loss_aux_layer_8": 0.0645751953125, "loss_aux_layer_9": 0.06304931640625, "step": 3063, "total_loss": 0.6463451087474823 }, { "epoch": 0.6066125519699069, "grad_norm": 1.0342966318130493, "learning_rate": 5e-05, "llm_loss": 0.5969664603471756, "loss": 2.7243, "loss_aux_layer_0": 0.016815185546875, "loss_aux_layer_1": 0.03424072265625, "loss_aux_layer_10": 0.060302734375, "loss_aux_layer_11": 0.06414794921875, "loss_aux_layer_12": 0.0689697265625, "loss_aux_layer_13": 0.0751953125, "loss_aux_layer_14": 0.0836181640625, "loss_aux_layer_15": 0.092041015625, "loss_aux_layer_16": 0.1015625, "loss_aux_layer_17": 0.109130859375, "loss_aux_layer_18": 0.11767578125, "loss_aux_layer_19": 0.120849609375, "loss_aux_layer_2": 0.046630859375, "loss_aux_layer_20": 0.128662109375, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.055908203125, "loss_aux_layer_4": 0.05828857421875, "loss_aux_layer_5": 0.0596923828125, "loss_aux_layer_6": 0.0626220703125, "loss_aux_layer_7": 0.060791015625, "loss_aux_layer_8": 0.06011962890625, "loss_aux_layer_9": 0.0589599609375, "step": 3064, "total_loss": 0.6810869723558426 }, { "epoch": 0.6068105325678084, "grad_norm": 1.2448192834854126, "learning_rate": 5e-05, "llm_loss": 0.5900433510541916, "loss": 2.709, "loss_aux_layer_0": 0.016265869140625, "loss_aux_layer_1": 0.03631591796875, "loss_aux_layer_10": 0.06365966796875, "loss_aux_layer_11": 0.06768798828125, "loss_aux_layer_12": 0.0723876953125, "loss_aux_layer_13": 0.0782470703125, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.0955810546875, "loss_aux_layer_16": 0.10546875, "loss_aux_layer_17": 0.113525390625, "loss_aux_layer_18": 0.122314453125, "loss_aux_layer_19": 0.1251220703125, "loss_aux_layer_2": 0.04901123046875, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.1943359375, "loss_aux_layer_3": 0.058837890625, "loss_aux_layer_4": 0.0611572265625, "loss_aux_layer_5": 0.062744140625, "loss_aux_layer_6": 0.065673828125, "loss_aux_layer_7": 0.06378173828125, "loss_aux_layer_8": 0.0631103515625, "loss_aux_layer_9": 0.0621337890625, "step": 3065, "total_loss": 0.6772541105747223 }, { "epoch": 0.6070085131657098, "grad_norm": 1.5905070304870605, "learning_rate": 5e-05, "llm_loss": 0.5363325700163841, "loss": 2.4967, "loss_aux_layer_0": 0.017364501953125, "loss_aux_layer_1": 0.03582763671875, "loss_aux_layer_10": 0.06292724609375, "loss_aux_layer_11": 0.06683349609375, "loss_aux_layer_12": 0.0721435546875, "loss_aux_layer_13": 0.0780029296875, "loss_aux_layer_14": 0.0867919921875, "loss_aux_layer_15": 0.095703125, "loss_aux_layer_16": 0.105712890625, "loss_aux_layer_17": 0.1138916015625, "loss_aux_layer_18": 0.1221923828125, "loss_aux_layer_19": 0.126220703125, "loss_aux_layer_2": 0.0484619140625, "loss_aux_layer_20": 0.13427734375, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.16357421875, "loss_aux_layer_23": 0.201416015625, "loss_aux_layer_3": 0.05841064453125, "loss_aux_layer_4": 0.060791015625, "loss_aux_layer_5": 0.06256103515625, "loss_aux_layer_6": 0.06561279296875, "loss_aux_layer_7": 0.06341552734375, "loss_aux_layer_8": 0.0626220703125, "loss_aux_layer_9": 0.06146240234375, "step": 3066, "total_loss": 0.6241686046123505 }, { "epoch": 0.6072064937636111, "grad_norm": 1.5384283065795898, "learning_rate": 5e-05, "llm_loss": 0.5722307413816452, "loss": 2.6387, "loss_aux_layer_0": 0.01678466796875, "loss_aux_layer_1": 0.03619384765625, "loss_aux_layer_10": 0.0638427734375, "loss_aux_layer_11": 0.0677490234375, "loss_aux_layer_12": 0.072265625, "loss_aux_layer_13": 0.077880859375, "loss_aux_layer_14": 0.086669921875, "loss_aux_layer_15": 0.09521484375, "loss_aux_layer_16": 0.104736328125, "loss_aux_layer_17": 0.1121826171875, "loss_aux_layer_18": 0.1204833984375, "loss_aux_layer_19": 0.1234130859375, "loss_aux_layer_2": 0.04949951171875, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.197021484375, "loss_aux_layer_3": 0.0601806640625, "loss_aux_layer_4": 0.0626220703125, "loss_aux_layer_5": 0.0643310546875, "loss_aux_layer_6": 0.067138671875, "loss_aux_layer_7": 0.0650634765625, "loss_aux_layer_8": 0.0643310546875, "loss_aux_layer_9": 0.062744140625, "step": 3067, "total_loss": 0.6596849113702774 }, { "epoch": 0.6074044743615126, "grad_norm": 1.2672821283340454, "learning_rate": 5e-05, "llm_loss": 0.60942342877388, "loss": 2.7852, "loss_aux_layer_0": 0.01702880859375, "loss_aux_layer_1": 0.03515625, "loss_aux_layer_10": 0.06231689453125, "loss_aux_layer_11": 0.0665283203125, "loss_aux_layer_12": 0.0714111328125, "loss_aux_layer_13": 0.077392578125, "loss_aux_layer_14": 0.0863037109375, "loss_aux_layer_15": 0.0948486328125, "loss_aux_layer_16": 0.1043701171875, "loss_aux_layer_17": 0.1124267578125, "loss_aux_layer_18": 0.12060546875, "loss_aux_layer_19": 0.124267578125, "loss_aux_layer_2": 0.0478515625, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.199462890625, "loss_aux_layer_3": 0.05743408203125, "loss_aux_layer_4": 0.06011962890625, "loss_aux_layer_5": 0.06195068359375, "loss_aux_layer_6": 0.06494140625, "loss_aux_layer_7": 0.06298828125, "loss_aux_layer_8": 0.062255859375, "loss_aux_layer_9": 0.06103515625, "step": 3068, "total_loss": 0.6962957680225372 }, { "epoch": 0.607602454959414, "grad_norm": 1.039413571357727, "learning_rate": 5e-05, "llm_loss": 0.6214499622583389, "loss": 2.849, "loss_aux_layer_0": 0.01611328125, "loss_aux_layer_1": 0.037841796875, "loss_aux_layer_10": 0.0675048828125, "loss_aux_layer_11": 0.0721435546875, "loss_aux_layer_12": 0.0772705078125, "loss_aux_layer_13": 0.0831298828125, "loss_aux_layer_14": 0.0916748046875, "loss_aux_layer_15": 0.099853515625, "loss_aux_layer_16": 0.109375, "loss_aux_layer_17": 0.1173095703125, "loss_aux_layer_18": 0.1256103515625, "loss_aux_layer_19": 0.1275634765625, "loss_aux_layer_2": 0.052001953125, "loss_aux_layer_20": 0.13427734375, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.160888671875, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.06292724609375, "loss_aux_layer_4": 0.0655517578125, "loss_aux_layer_5": 0.0670166015625, "loss_aux_layer_6": 0.0703125, "loss_aux_layer_7": 0.068115234375, "loss_aux_layer_8": 0.0672607421875, "loss_aux_layer_9": 0.0660400390625, "step": 3069, "total_loss": 0.7122405618429184 }, { "epoch": 0.6078004355573153, "grad_norm": 1.057925820350647, "learning_rate": 5e-05, "llm_loss": 0.6361582726240158, "loss": 2.9094, "loss_aux_layer_0": 0.01702880859375, "loss_aux_layer_1": 0.03753662109375, "loss_aux_layer_10": 0.0673828125, "loss_aux_layer_11": 0.07177734375, "loss_aux_layer_12": 0.0767822265625, "loss_aux_layer_13": 0.082763671875, "loss_aux_layer_14": 0.09130859375, "loss_aux_layer_15": 0.0994873046875, "loss_aux_layer_16": 0.1087646484375, "loss_aux_layer_17": 0.1168212890625, "loss_aux_layer_18": 0.12548828125, "loss_aux_layer_19": 0.1285400390625, "loss_aux_layer_2": 0.051513671875, "loss_aux_layer_20": 0.135986328125, "loss_aux_layer_21": 0.144287109375, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.06207275390625, "loss_aux_layer_4": 0.0645751953125, "loss_aux_layer_5": 0.0662841796875, "loss_aux_layer_6": 0.0694580078125, "loss_aux_layer_7": 0.0675048828125, "loss_aux_layer_8": 0.067138671875, "loss_aux_layer_9": 0.066162109375, "step": 3070, "total_loss": 0.7273579239845276 }, { "epoch": 0.6079984161552168, "grad_norm": 0.7890492081642151, "learning_rate": 5e-05, "llm_loss": 0.5581352412700653, "loss": 2.578, "loss_aux_layer_0": 0.016815185546875, "loss_aux_layer_1": 0.035888671875, "loss_aux_layer_10": 0.06317138671875, "loss_aux_layer_11": 0.067138671875, "loss_aux_layer_12": 0.0714111328125, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.0848388671875, "loss_aux_layer_15": 0.09326171875, "loss_aux_layer_16": 0.1024169921875, "loss_aux_layer_17": 0.110107421875, "loss_aux_layer_18": 0.1182861328125, "loss_aux_layer_19": 0.1217041015625, "loss_aux_layer_2": 0.04986572265625, "loss_aux_layer_20": 0.129638671875, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.0594482421875, "loss_aux_layer_4": 0.0621337890625, "loss_aux_layer_5": 0.063720703125, "loss_aux_layer_6": 0.066162109375, "loss_aux_layer_7": 0.06390380859375, "loss_aux_layer_8": 0.063232421875, "loss_aux_layer_9": 0.06182861328125, "step": 3071, "total_loss": 0.6444947272539139 }, { "epoch": 0.6081963967531182, "grad_norm": 0.9946773052215576, "learning_rate": 5e-05, "llm_loss": 0.5886736661195755, "loss": 2.6994, "loss_aux_layer_0": 0.0169677734375, "loss_aux_layer_1": 0.03448486328125, "loss_aux_layer_10": 0.0614013671875, "loss_aux_layer_11": 0.0654296875, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.0758056640625, "loss_aux_layer_14": 0.084228515625, "loss_aux_layer_15": 0.09326171875, "loss_aux_layer_16": 0.1029052734375, "loss_aux_layer_17": 0.1112060546875, "loss_aux_layer_18": 0.119384765625, "loss_aux_layer_19": 0.12353515625, "loss_aux_layer_2": 0.0478515625, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.057373046875, "loss_aux_layer_4": 0.05975341796875, "loss_aux_layer_5": 0.0614013671875, "loss_aux_layer_6": 0.06402587890625, "loss_aux_layer_7": 0.06207275390625, "loss_aux_layer_8": 0.06146240234375, "loss_aux_layer_9": 0.0601806640625, "step": 3072, "total_loss": 0.6748394519090652 }, { "epoch": 0.6083943773510196, "grad_norm": 0.9070671796798706, "learning_rate": 5e-05, "llm_loss": 0.531896561384201, "loss": 2.4896, "loss_aux_layer_0": 0.017059326171875, "loss_aux_layer_1": 0.03656005859375, "loss_aux_layer_10": 0.06591796875, "loss_aux_layer_11": 0.0704345703125, "loss_aux_layer_12": 0.075439453125, "loss_aux_layer_13": 0.0816650390625, "loss_aux_layer_14": 0.0909423828125, "loss_aux_layer_15": 0.099609375, "loss_aux_layer_16": 0.1097412109375, "loss_aux_layer_17": 0.1168212890625, "loss_aux_layer_18": 0.1256103515625, "loss_aux_layer_19": 0.1290283203125, "loss_aux_layer_2": 0.0499267578125, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.165771484375, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.0604248046875, "loss_aux_layer_4": 0.0628662109375, "loss_aux_layer_5": 0.0648193359375, "loss_aux_layer_6": 0.0673828125, "loss_aux_layer_7": 0.065673828125, "loss_aux_layer_8": 0.0650634765625, "loss_aux_layer_9": 0.06427001953125, "step": 3073, "total_loss": 0.622390478849411 }, { "epoch": 0.608592357948921, "grad_norm": 1.017293930053711, "learning_rate": 5e-05, "llm_loss": 0.5334826409816742, "loss": 2.484, "loss_aux_layer_0": 0.016632080078125, "loss_aux_layer_1": 0.035186767578125, "loss_aux_layer_10": 0.063720703125, "loss_aux_layer_11": 0.06781005859375, "loss_aux_layer_12": 0.07257080078125, "loss_aux_layer_13": 0.078369140625, "loss_aux_layer_14": 0.087158203125, "loss_aux_layer_15": 0.096435546875, "loss_aux_layer_16": 0.1058349609375, "loss_aux_layer_17": 0.1136474609375, "loss_aux_layer_18": 0.1217041015625, "loss_aux_layer_19": 0.1248779296875, "loss_aux_layer_2": 0.04913330078125, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.0592041015625, "loss_aux_layer_4": 0.06170654296875, "loss_aux_layer_5": 0.063232421875, "loss_aux_layer_6": 0.06610107421875, "loss_aux_layer_7": 0.06439208984375, "loss_aux_layer_8": 0.063720703125, "loss_aux_layer_9": 0.06256103515625, "step": 3074, "total_loss": 0.6210048422217369 }, { "epoch": 0.6087903385468224, "grad_norm": 1.0175583362579346, "learning_rate": 5e-05, "llm_loss": 0.6631246507167816, "loss": 3.0045, "loss_aux_layer_0": 0.016876220703125, "loss_aux_layer_1": 0.0357666015625, "loss_aux_layer_10": 0.06353759765625, "loss_aux_layer_11": 0.0677490234375, "loss_aux_layer_12": 0.0726318359375, "loss_aux_layer_13": 0.0787353515625, "loss_aux_layer_14": 0.087646484375, "loss_aux_layer_15": 0.096435546875, "loss_aux_layer_16": 0.1058349609375, "loss_aux_layer_17": 0.1136474609375, "loss_aux_layer_18": 0.1217041015625, "loss_aux_layer_19": 0.1246337890625, "loss_aux_layer_2": 0.049560546875, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.14013671875, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.0596923828125, "loss_aux_layer_4": 0.06219482421875, "loss_aux_layer_5": 0.06390380859375, "loss_aux_layer_6": 0.06689453125, "loss_aux_layer_7": 0.06463623046875, "loss_aux_layer_8": 0.063720703125, "loss_aux_layer_9": 0.06243896484375, "step": 3075, "total_loss": 0.7511314302682877 }, { "epoch": 0.6089883191447238, "grad_norm": 0.8455734252929688, "learning_rate": 5e-05, "llm_loss": 0.6029753237962723, "loss": 2.7414, "loss_aux_layer_0": 0.01666259765625, "loss_aux_layer_1": 0.033172607421875, "loss_aux_layer_10": 0.05914306640625, "loss_aux_layer_11": 0.0628662109375, "loss_aux_layer_12": 0.06732177734375, "loss_aux_layer_13": 0.0726318359375, "loss_aux_layer_14": 0.0811767578125, "loss_aux_layer_15": 0.08984375, "loss_aux_layer_16": 0.099365234375, "loss_aux_layer_17": 0.1072998046875, "loss_aux_layer_18": 0.1151123046875, "loss_aux_layer_19": 0.1181640625, "loss_aux_layer_2": 0.04534912109375, "loss_aux_layer_20": 0.1260986328125, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.0548095703125, "loss_aux_layer_4": 0.05731201171875, "loss_aux_layer_5": 0.05877685546875, "loss_aux_layer_6": 0.06158447265625, "loss_aux_layer_7": 0.0594482421875, "loss_aux_layer_8": 0.05889892578125, "loss_aux_layer_9": 0.05767822265625, "step": 3076, "total_loss": 0.6853381544351578 }, { "epoch": 0.6091862997426252, "grad_norm": 1.4350138902664185, "learning_rate": 5e-05, "llm_loss": 0.5714034289121628, "loss": 2.6457, "loss_aux_layer_0": 0.016082763671875, "loss_aux_layer_1": 0.03656005859375, "loss_aux_layer_10": 0.06494140625, "loss_aux_layer_11": 0.069580078125, "loss_aux_layer_12": 0.07421875, "loss_aux_layer_13": 0.0804443359375, "loss_aux_layer_14": 0.0889892578125, "loss_aux_layer_15": 0.0982666015625, "loss_aux_layer_16": 0.1083984375, "loss_aux_layer_17": 0.1163330078125, "loss_aux_layer_18": 0.124755859375, "loss_aux_layer_19": 0.1287841796875, "loss_aux_layer_2": 0.050048828125, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.1669921875, "loss_aux_layer_23": 0.205078125, "loss_aux_layer_3": 0.06072998046875, "loss_aux_layer_4": 0.06304931640625, "loss_aux_layer_5": 0.06451416015625, "loss_aux_layer_6": 0.0670166015625, "loss_aux_layer_7": 0.065185546875, "loss_aux_layer_8": 0.06494140625, "loss_aux_layer_9": 0.0635986328125, "step": 3077, "total_loss": 0.6614160537719727 }, { "epoch": 0.6093842803405266, "grad_norm": 1.0731722116470337, "learning_rate": 5e-05, "llm_loss": 0.6550763845443726, "loss": 2.9847, "loss_aux_layer_0": 0.01837158203125, "loss_aux_layer_1": 0.0361328125, "loss_aux_layer_10": 0.06549072265625, "loss_aux_layer_11": 0.06976318359375, "loss_aux_layer_12": 0.07470703125, "loss_aux_layer_13": 0.08056640625, "loss_aux_layer_14": 0.090576171875, "loss_aux_layer_15": 0.10009765625, "loss_aux_layer_16": 0.1103515625, "loss_aux_layer_17": 0.119140625, "loss_aux_layer_18": 0.1278076171875, "loss_aux_layer_19": 0.1312255859375, "loss_aux_layer_2": 0.0506591796875, "loss_aux_layer_20": 0.138671875, "loss_aux_layer_21": 0.146240234375, "loss_aux_layer_22": 0.167724609375, "loss_aux_layer_23": 0.205810546875, "loss_aux_layer_3": 0.06048583984375, "loss_aux_layer_4": 0.06329345703125, "loss_aux_layer_5": 0.06500244140625, "loss_aux_layer_6": 0.06829833984375, "loss_aux_layer_7": 0.06585693359375, "loss_aux_layer_8": 0.0653076171875, "loss_aux_layer_9": 0.0640869140625, "step": 3078, "total_loss": 0.7461856305599213 }, { "epoch": 0.609582260938428, "grad_norm": 2.4632670879364014, "learning_rate": 5e-05, "llm_loss": 0.5774805322289467, "loss": 2.6595, "loss_aux_layer_0": 0.0166015625, "loss_aux_layer_1": 0.03515625, "loss_aux_layer_10": 0.06427001953125, "loss_aux_layer_11": 0.068359375, "loss_aux_layer_12": 0.0728759765625, "loss_aux_layer_13": 0.0784912109375, "loss_aux_layer_14": 0.0875244140625, "loss_aux_layer_15": 0.09619140625, "loss_aux_layer_16": 0.105224609375, "loss_aux_layer_17": 0.1126708984375, "loss_aux_layer_18": 0.12060546875, "loss_aux_layer_19": 0.123046875, "loss_aux_layer_2": 0.04925537109375, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.0592041015625, "loss_aux_layer_4": 0.06219482421875, "loss_aux_layer_5": 0.06390380859375, "loss_aux_layer_6": 0.0675048828125, "loss_aux_layer_7": 0.06494140625, "loss_aux_layer_8": 0.0640869140625, "loss_aux_layer_9": 0.06304931640625, "step": 3079, "total_loss": 0.6648859083652496 }, { "epoch": 0.6097802415363295, "grad_norm": 0.9605275392532349, "learning_rate": 5e-05, "llm_loss": 0.5056448876857758, "loss": 2.3938, "loss_aux_layer_0": 0.01641845703125, "loss_aux_layer_1": 0.03839111328125, "loss_aux_layer_10": 0.0687255859375, "loss_aux_layer_11": 0.0732421875, "loss_aux_layer_12": 0.0782470703125, "loss_aux_layer_13": 0.084228515625, "loss_aux_layer_14": 0.093505859375, "loss_aux_layer_15": 0.1021728515625, "loss_aux_layer_16": 0.112060546875, "loss_aux_layer_17": 0.1195068359375, "loss_aux_layer_18": 0.1279296875, "loss_aux_layer_19": 0.131103515625, "loss_aux_layer_2": 0.05194091796875, "loss_aux_layer_20": 0.13818359375, "loss_aux_layer_21": 0.146484375, "loss_aux_layer_22": 0.16796875, "loss_aux_layer_23": 0.20654296875, "loss_aux_layer_3": 0.062744140625, "loss_aux_layer_4": 0.0654296875, "loss_aux_layer_5": 0.067626953125, "loss_aux_layer_6": 0.07080078125, "loss_aux_layer_7": 0.0689697265625, "loss_aux_layer_8": 0.0682373046875, "loss_aux_layer_9": 0.067138671875, "step": 3080, "total_loss": 0.5984510332345963 }, { "epoch": 0.6099782221342308, "grad_norm": 1.8420567512512207, "learning_rate": 5e-05, "llm_loss": 0.5115516632795334, "loss": 2.4181, "loss_aux_layer_0": 0.01678466796875, "loss_aux_layer_1": 0.03826904296875, "loss_aux_layer_10": 0.0687255859375, "loss_aux_layer_11": 0.0728759765625, "loss_aux_layer_12": 0.0775146484375, "loss_aux_layer_13": 0.0833740234375, "loss_aux_layer_14": 0.0924072265625, "loss_aux_layer_15": 0.101318359375, "loss_aux_layer_16": 0.111083984375, "loss_aux_layer_17": 0.1185302734375, "loss_aux_layer_18": 0.126708984375, "loss_aux_layer_19": 0.130615234375, "loss_aux_layer_2": 0.0531005859375, "loss_aux_layer_20": 0.138427734375, "loss_aux_layer_21": 0.147705078125, "loss_aux_layer_22": 0.17041015625, "loss_aux_layer_23": 0.210205078125, "loss_aux_layer_3": 0.0635986328125, "loss_aux_layer_4": 0.06591796875, "loss_aux_layer_5": 0.0677490234375, "loss_aux_layer_6": 0.0706787109375, "loss_aux_layer_7": 0.0693359375, "loss_aux_layer_8": 0.0689697265625, "loss_aux_layer_9": 0.067626953125, "step": 3081, "total_loss": 0.60452950745821 }, { "epoch": 0.6101762027321322, "grad_norm": 0.9336682558059692, "learning_rate": 5e-05, "llm_loss": 0.5378291681408882, "loss": 2.5141, "loss_aux_layer_0": 0.016815185546875, "loss_aux_layer_1": 0.03778076171875, "loss_aux_layer_10": 0.0679931640625, "loss_aux_layer_11": 0.0721435546875, "loss_aux_layer_12": 0.0767822265625, "loss_aux_layer_13": 0.08251953125, "loss_aux_layer_14": 0.091552734375, "loss_aux_layer_15": 0.1002197265625, "loss_aux_layer_16": 0.1097412109375, "loss_aux_layer_17": 0.116943359375, "loss_aux_layer_18": 0.125244140625, "loss_aux_layer_19": 0.127197265625, "loss_aux_layer_2": 0.0521240234375, "loss_aux_layer_20": 0.1337890625, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.06298828125, "loss_aux_layer_4": 0.06585693359375, "loss_aux_layer_5": 0.0677490234375, "loss_aux_layer_6": 0.0706787109375, "loss_aux_layer_7": 0.0684814453125, "loss_aux_layer_8": 0.0679931640625, "loss_aux_layer_9": 0.0667724609375, "step": 3082, "total_loss": 0.6285349726676941 }, { "epoch": 0.6103741833300337, "grad_norm": 1.2660006284713745, "learning_rate": 5e-05, "llm_loss": 0.5717521011829376, "loss": 2.6452, "loss_aux_layer_0": 0.017852783203125, "loss_aux_layer_1": 0.037353515625, "loss_aux_layer_10": 0.0655517578125, "loss_aux_layer_11": 0.070068359375, "loss_aux_layer_12": 0.0748291015625, "loss_aux_layer_13": 0.0806884765625, "loss_aux_layer_14": 0.0897216796875, "loss_aux_layer_15": 0.09814453125, "loss_aux_layer_16": 0.107666015625, "loss_aux_layer_17": 0.115478515625, "loss_aux_layer_18": 0.123779296875, "loss_aux_layer_19": 0.127197265625, "loss_aux_layer_2": 0.0506591796875, "loss_aux_layer_20": 0.134765625, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.162353515625, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.060791015625, "loss_aux_layer_4": 0.06341552734375, "loss_aux_layer_5": 0.06500244140625, "loss_aux_layer_6": 0.0677490234375, "loss_aux_layer_7": 0.06561279296875, "loss_aux_layer_8": 0.0654296875, "loss_aux_layer_9": 0.0643310546875, "step": 3083, "total_loss": 0.6612955182790756 }, { "epoch": 0.610572163927935, "grad_norm": 0.9125070571899414, "learning_rate": 5e-05, "llm_loss": 0.5845064371824265, "loss": 2.6956, "loss_aux_layer_0": 0.017974853515625, "loss_aux_layer_1": 0.0372314453125, "loss_aux_layer_10": 0.06494140625, "loss_aux_layer_11": 0.069580078125, "loss_aux_layer_12": 0.07421875, "loss_aux_layer_13": 0.079833984375, "loss_aux_layer_14": 0.0889892578125, "loss_aux_layer_15": 0.09765625, "loss_aux_layer_16": 0.1075439453125, "loss_aux_layer_17": 0.1151123046875, "loss_aux_layer_18": 0.1229248046875, "loss_aux_layer_19": 0.1258544921875, "loss_aux_layer_2": 0.05029296875, "loss_aux_layer_20": 0.1337890625, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.1640625, "loss_aux_layer_23": 0.2021484375, "loss_aux_layer_3": 0.06036376953125, "loss_aux_layer_4": 0.06329345703125, "loss_aux_layer_5": 0.06500244140625, "loss_aux_layer_6": 0.068359375, "loss_aux_layer_7": 0.06597900390625, "loss_aux_layer_8": 0.065185546875, "loss_aux_layer_9": 0.06365966796875, "step": 3084, "total_loss": 0.6739024966955185 }, { "epoch": 0.6107701445258364, "grad_norm": 1.1228379011154175, "learning_rate": 5e-05, "llm_loss": 0.6347427368164062, "loss": 2.8776, "loss_aux_layer_0": 0.018341064453125, "loss_aux_layer_1": 0.03369140625, "loss_aux_layer_10": 0.060546875, "loss_aux_layer_11": 0.064208984375, "loss_aux_layer_12": 0.0689697265625, "loss_aux_layer_13": 0.074462890625, "loss_aux_layer_14": 0.0833740234375, "loss_aux_layer_15": 0.091796875, "loss_aux_layer_16": 0.101318359375, "loss_aux_layer_17": 0.109619140625, "loss_aux_layer_18": 0.1175537109375, "loss_aux_layer_19": 0.121826171875, "loss_aux_layer_2": 0.04583740234375, "loss_aux_layer_20": 0.129638671875, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.055908203125, "loss_aux_layer_4": 0.05853271484375, "loss_aux_layer_5": 0.06011962890625, "loss_aux_layer_6": 0.06329345703125, "loss_aux_layer_7": 0.0609130859375, "loss_aux_layer_8": 0.06060791015625, "loss_aux_layer_9": 0.05938720703125, "step": 3085, "total_loss": 0.7194029688835144 }, { "epoch": 0.6109681251237379, "grad_norm": 0.9043318629264832, "learning_rate": 5e-05, "llm_loss": 0.5508510917425156, "loss": 2.5488, "loss_aux_layer_0": 0.017974853515625, "loss_aux_layer_1": 0.03466796875, "loss_aux_layer_10": 0.0616455078125, "loss_aux_layer_11": 0.0655517578125, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.0755615234375, "loss_aux_layer_14": 0.08447265625, "loss_aux_layer_15": 0.0936279296875, "loss_aux_layer_16": 0.103759765625, "loss_aux_layer_17": 0.1116943359375, "loss_aux_layer_18": 0.120849609375, "loss_aux_layer_19": 0.124755859375, "loss_aux_layer_2": 0.047119140625, "loss_aux_layer_20": 0.13330078125, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.162109375, "loss_aux_layer_23": 0.199462890625, "loss_aux_layer_3": 0.05694580078125, "loss_aux_layer_4": 0.0594482421875, "loss_aux_layer_5": 0.0611572265625, "loss_aux_layer_6": 0.0640869140625, "loss_aux_layer_7": 0.0618896484375, "loss_aux_layer_8": 0.061279296875, "loss_aux_layer_9": 0.0604248046875, "step": 3086, "total_loss": 0.6372031271457672 }, { "epoch": 0.6111661057216393, "grad_norm": 0.9001898169517517, "learning_rate": 5e-05, "llm_loss": 0.6008778885006905, "loss": 2.7468, "loss_aux_layer_0": 0.018310546875, "loss_aux_layer_1": 0.035400390625, "loss_aux_layer_10": 0.06207275390625, "loss_aux_layer_11": 0.0660400390625, "loss_aux_layer_12": 0.070556640625, "loss_aux_layer_13": 0.076171875, "loss_aux_layer_14": 0.0848388671875, "loss_aux_layer_15": 0.0936279296875, "loss_aux_layer_16": 0.103515625, "loss_aux_layer_17": 0.111572265625, "loss_aux_layer_18": 0.11962890625, "loss_aux_layer_19": 0.123779296875, "loss_aux_layer_2": 0.04730224609375, "loss_aux_layer_20": 0.13037109375, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.19482421875, "loss_aux_layer_3": 0.056884765625, "loss_aux_layer_4": 0.05963134765625, "loss_aux_layer_5": 0.0614013671875, "loss_aux_layer_6": 0.06439208984375, "loss_aux_layer_7": 0.0621337890625, "loss_aux_layer_8": 0.061767578125, "loss_aux_layer_9": 0.06060791015625, "step": 3087, "total_loss": 0.6867044568061829 }, { "epoch": 0.6113640863195406, "grad_norm": 0.9675999283790588, "learning_rate": 5e-05, "llm_loss": 0.5789206624031067, "loss": 2.6825, "loss_aux_layer_0": 0.01739501953125, "loss_aux_layer_1": 0.03912353515625, "loss_aux_layer_10": 0.06787109375, "loss_aux_layer_11": 0.0726318359375, "loss_aux_layer_12": 0.0771484375, "loss_aux_layer_13": 0.0830078125, "loss_aux_layer_14": 0.091552734375, "loss_aux_layer_15": 0.1002197265625, "loss_aux_layer_16": 0.109130859375, "loss_aux_layer_17": 0.1168212890625, "loss_aux_layer_18": 0.12451171875, "loss_aux_layer_19": 0.127685546875, "loss_aux_layer_2": 0.0533447265625, "loss_aux_layer_20": 0.134765625, "loss_aux_layer_21": 0.142822265625, "loss_aux_layer_22": 0.165771484375, "loss_aux_layer_23": 0.203125, "loss_aux_layer_3": 0.0638427734375, "loss_aux_layer_4": 0.066650390625, "loss_aux_layer_5": 0.068115234375, "loss_aux_layer_6": 0.071533203125, "loss_aux_layer_7": 0.0693359375, "loss_aux_layer_8": 0.0682373046875, "loss_aux_layer_9": 0.066650390625, "step": 3088, "total_loss": 0.670622393488884 }, { "epoch": 0.6115620669174421, "grad_norm": 1.281402349472046, "learning_rate": 5e-05, "llm_loss": 0.610721543431282, "loss": 2.7893, "loss_aux_layer_0": 0.01910400390625, "loss_aux_layer_1": 0.035675048828125, "loss_aux_layer_10": 0.06207275390625, "loss_aux_layer_11": 0.066162109375, "loss_aux_layer_12": 0.0711669921875, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.0858154296875, "loss_aux_layer_15": 0.0948486328125, "loss_aux_layer_16": 0.104248046875, "loss_aux_layer_17": 0.11181640625, "loss_aux_layer_18": 0.1201171875, "loss_aux_layer_19": 0.1234130859375, "loss_aux_layer_2": 0.04803466796875, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.0579833984375, "loss_aux_layer_4": 0.06036376953125, "loss_aux_layer_5": 0.06182861328125, "loss_aux_layer_6": 0.0648193359375, "loss_aux_layer_7": 0.06280517578125, "loss_aux_layer_8": 0.06243896484375, "loss_aux_layer_9": 0.06097412109375, "step": 3089, "total_loss": 0.6973149925470352 }, { "epoch": 0.6117600475153435, "grad_norm": 1.2310692071914673, "learning_rate": 5e-05, "llm_loss": 0.5131177455186844, "loss": 2.4043, "loss_aux_layer_0": 0.01708984375, "loss_aux_layer_1": 0.03680419921875, "loss_aux_layer_10": 0.0634765625, "loss_aux_layer_11": 0.067626953125, "loss_aux_layer_12": 0.072265625, "loss_aux_layer_13": 0.077880859375, "loss_aux_layer_14": 0.0863037109375, "loss_aux_layer_15": 0.094970703125, "loss_aux_layer_16": 0.10498046875, "loss_aux_layer_17": 0.113037109375, "loss_aux_layer_18": 0.12158203125, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.04974365234375, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.05950927734375, "loss_aux_layer_4": 0.06219482421875, "loss_aux_layer_5": 0.0638427734375, "loss_aux_layer_6": 0.06689453125, "loss_aux_layer_7": 0.064697265625, "loss_aux_layer_8": 0.06402587890625, "loss_aux_layer_9": 0.0621337890625, "step": 3090, "total_loss": 0.601070836186409 }, { "epoch": 0.611958028113245, "grad_norm": 0.9974672198295593, "learning_rate": 5e-05, "llm_loss": 0.5725623220205307, "loss": 2.643, "loss_aux_layer_0": 0.016937255859375, "loss_aux_layer_1": 0.0364990234375, "loss_aux_layer_10": 0.064453125, "loss_aux_layer_11": 0.0684814453125, "loss_aux_layer_12": 0.0733642578125, "loss_aux_layer_13": 0.0792236328125, "loss_aux_layer_14": 0.0882568359375, "loss_aux_layer_15": 0.0968017578125, "loss_aux_layer_16": 0.106201171875, "loss_aux_layer_17": 0.113525390625, "loss_aux_layer_18": 0.1221923828125, "loss_aux_layer_19": 0.1248779296875, "loss_aux_layer_2": 0.04931640625, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.06005859375, "loss_aux_layer_4": 0.06256103515625, "loss_aux_layer_5": 0.06402587890625, "loss_aux_layer_6": 0.0670166015625, "loss_aux_layer_7": 0.0650634765625, "loss_aux_layer_8": 0.06451416015625, "loss_aux_layer_9": 0.06329345703125, "step": 3091, "total_loss": 0.6607433259487152 }, { "epoch": 0.6121560087111463, "grad_norm": 0.9106980562210083, "learning_rate": 5e-05, "llm_loss": 0.626710444688797, "loss": 2.8576, "loss_aux_layer_0": 0.016998291015625, "loss_aux_layer_1": 0.0357666015625, "loss_aux_layer_10": 0.0635986328125, "loss_aux_layer_11": 0.0677490234375, "loss_aux_layer_12": 0.0721435546875, "loss_aux_layer_13": 0.07763671875, "loss_aux_layer_14": 0.08642578125, "loss_aux_layer_15": 0.095458984375, "loss_aux_layer_16": 0.1051025390625, "loss_aux_layer_17": 0.1131591796875, "loss_aux_layer_18": 0.12158203125, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.0489501953125, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.199462890625, "loss_aux_layer_3": 0.058837890625, "loss_aux_layer_4": 0.0615234375, "loss_aux_layer_5": 0.0631103515625, "loss_aux_layer_6": 0.06591796875, "loss_aux_layer_7": 0.0640869140625, "loss_aux_layer_8": 0.06378173828125, "loss_aux_layer_9": 0.0623779296875, "step": 3092, "total_loss": 0.7143981754779816 }, { "epoch": 0.6123539893090477, "grad_norm": 1.3285385370254517, "learning_rate": 5e-05, "llm_loss": 0.5442851930856705, "loss": 2.5284, "loss_aux_layer_0": 0.01654052734375, "loss_aux_layer_1": 0.03533935546875, "loss_aux_layer_10": 0.0635986328125, "loss_aux_layer_11": 0.0673828125, "loss_aux_layer_12": 0.072021484375, "loss_aux_layer_13": 0.07763671875, "loss_aux_layer_14": 0.08642578125, "loss_aux_layer_15": 0.0953369140625, "loss_aux_layer_16": 0.1051025390625, "loss_aux_layer_17": 0.1134033203125, "loss_aux_layer_18": 0.1219482421875, "loss_aux_layer_19": 0.1260986328125, "loss_aux_layer_2": 0.048583984375, "loss_aux_layer_20": 0.1337890625, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.163330078125, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.0584716796875, "loss_aux_layer_4": 0.06121826171875, "loss_aux_layer_5": 0.06298828125, "loss_aux_layer_6": 0.0657958984375, "loss_aux_layer_7": 0.06378173828125, "loss_aux_layer_8": 0.06304931640625, "loss_aux_layer_9": 0.06219482421875, "step": 3093, "total_loss": 0.632090836763382 }, { "epoch": 0.6125519699069492, "grad_norm": 0.8520404100418091, "learning_rate": 5e-05, "llm_loss": 0.6296044513583183, "loss": 2.8705, "loss_aux_layer_0": 0.016326904296875, "loss_aux_layer_1": 0.035186767578125, "loss_aux_layer_10": 0.0635986328125, "loss_aux_layer_11": 0.06781005859375, "loss_aux_layer_12": 0.07275390625, "loss_aux_layer_13": 0.0787353515625, "loss_aux_layer_14": 0.088134765625, "loss_aux_layer_15": 0.0970458984375, "loss_aux_layer_16": 0.1070556640625, "loss_aux_layer_17": 0.1148681640625, "loss_aux_layer_18": 0.1234130859375, "loss_aux_layer_19": 0.12646484375, "loss_aux_layer_2": 0.04815673828125, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.141357421875, "loss_aux_layer_22": 0.162109375, "loss_aux_layer_23": 0.198974609375, "loss_aux_layer_3": 0.05853271484375, "loss_aux_layer_4": 0.06121826171875, "loss_aux_layer_5": 0.062744140625, "loss_aux_layer_6": 0.065673828125, "loss_aux_layer_7": 0.0635986328125, "loss_aux_layer_8": 0.06304931640625, "loss_aux_layer_9": 0.06207275390625, "step": 3094, "total_loss": 0.7176328748464584 }, { "epoch": 0.6127499505048505, "grad_norm": 0.9134960174560547, "learning_rate": 5e-05, "llm_loss": 0.5403398647904396, "loss": 2.5177, "loss_aux_layer_0": 0.017364501953125, "loss_aux_layer_1": 0.03680419921875, "loss_aux_layer_10": 0.06494140625, "loss_aux_layer_11": 0.0694580078125, "loss_aux_layer_12": 0.0740966796875, "loss_aux_layer_13": 0.080078125, "loss_aux_layer_14": 0.0888671875, "loss_aux_layer_15": 0.097412109375, "loss_aux_layer_16": 0.1070556640625, "loss_aux_layer_17": 0.1146240234375, "loss_aux_layer_18": 0.1229248046875, "loss_aux_layer_19": 0.1259765625, "loss_aux_layer_2": 0.0498046875, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.142333984375, "loss_aux_layer_22": 0.162353515625, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.06005859375, "loss_aux_layer_4": 0.06317138671875, "loss_aux_layer_5": 0.06494140625, "loss_aux_layer_6": 0.0679931640625, "loss_aux_layer_7": 0.06585693359375, "loss_aux_layer_8": 0.06536865234375, "loss_aux_layer_9": 0.06390380859375, "step": 3095, "total_loss": 0.6294302046298981 }, { "epoch": 0.6129479311027519, "grad_norm": 0.9596572518348694, "learning_rate": 5e-05, "llm_loss": 0.5967923998832703, "loss": 2.7298, "loss_aux_layer_0": 0.016510009765625, "loss_aux_layer_1": 0.03546142578125, "loss_aux_layer_10": 0.062255859375, "loss_aux_layer_11": 0.06640625, "loss_aux_layer_12": 0.0706787109375, "loss_aux_layer_13": 0.0762939453125, "loss_aux_layer_14": 0.0843505859375, "loss_aux_layer_15": 0.0931396484375, "loss_aux_layer_16": 0.1024169921875, "loss_aux_layer_17": 0.110595703125, "loss_aux_layer_18": 0.1185302734375, "loss_aux_layer_19": 0.122314453125, "loss_aux_layer_2": 0.04705810546875, "loss_aux_layer_20": 0.130126953125, "loss_aux_layer_21": 0.138916015625, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.0567626953125, "loss_aux_layer_4": 0.05950927734375, "loss_aux_layer_5": 0.06121826171875, "loss_aux_layer_6": 0.06396484375, "loss_aux_layer_7": 0.06207275390625, "loss_aux_layer_8": 0.06182861328125, "loss_aux_layer_9": 0.060546875, "step": 3096, "total_loss": 0.6824566721916199 }, { "epoch": 0.6131459117006534, "grad_norm": 1.180555820465088, "learning_rate": 5e-05, "llm_loss": 0.5854467451572418, "loss": 2.687, "loss_aux_layer_0": 0.016845703125, "loss_aux_layer_1": 0.0350341796875, "loss_aux_layer_10": 0.06170654296875, "loss_aux_layer_11": 0.06585693359375, "loss_aux_layer_12": 0.070556640625, "loss_aux_layer_13": 0.075927734375, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.0936279296875, "loss_aux_layer_16": 0.103515625, "loss_aux_layer_17": 0.1116943359375, "loss_aux_layer_18": 0.1201171875, "loss_aux_layer_19": 0.123779296875, "loss_aux_layer_2": 0.04766845703125, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.1396484375, "loss_aux_layer_22": 0.160888671875, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.057373046875, "loss_aux_layer_4": 0.06011962890625, "loss_aux_layer_5": 0.0618896484375, "loss_aux_layer_6": 0.0650634765625, "loss_aux_layer_7": 0.06304931640625, "loss_aux_layer_8": 0.06207275390625, "loss_aux_layer_9": 0.0606689453125, "step": 3097, "total_loss": 0.6717598140239716 }, { "epoch": 0.6133438922985548, "grad_norm": 1.0065432786941528, "learning_rate": 5e-05, "llm_loss": 0.5977984964847565, "loss": 2.7541, "loss_aux_layer_0": 0.017120361328125, "loss_aux_layer_1": 0.03759765625, "loss_aux_layer_10": 0.0673828125, "loss_aux_layer_11": 0.07177734375, "loss_aux_layer_12": 0.0762939453125, "loss_aux_layer_13": 0.0819091796875, "loss_aux_layer_14": 0.0904541015625, "loss_aux_layer_15": 0.0992431640625, "loss_aux_layer_16": 0.10888671875, "loss_aux_layer_17": 0.1162109375, "loss_aux_layer_18": 0.1243896484375, "loss_aux_layer_19": 0.1270751953125, "loss_aux_layer_2": 0.05120849609375, "loss_aux_layer_20": 0.134765625, "loss_aux_layer_21": 0.142822265625, "loss_aux_layer_22": 0.164306640625, "loss_aux_layer_23": 0.2021484375, "loss_aux_layer_3": 0.06170654296875, "loss_aux_layer_4": 0.06475830078125, "loss_aux_layer_5": 0.066650390625, "loss_aux_layer_6": 0.0697021484375, "loss_aux_layer_7": 0.06768798828125, "loss_aux_layer_8": 0.06719970703125, "loss_aux_layer_9": 0.0660400390625, "step": 3098, "total_loss": 0.6885315477848053 }, { "epoch": 0.6135418728964561, "grad_norm": 0.8253668546676636, "learning_rate": 5e-05, "llm_loss": 0.5725317150354385, "loss": 2.6505, "loss_aux_layer_0": 0.0169677734375, "loss_aux_layer_1": 0.03607177734375, "loss_aux_layer_10": 0.0657958984375, "loss_aux_layer_11": 0.0704345703125, "loss_aux_layer_12": 0.075439453125, "loss_aux_layer_13": 0.081298828125, "loss_aux_layer_14": 0.09033203125, "loss_aux_layer_15": 0.0994873046875, "loss_aux_layer_16": 0.1094970703125, "loss_aux_layer_17": 0.1171875, "loss_aux_layer_18": 0.125732421875, "loss_aux_layer_19": 0.129150390625, "loss_aux_layer_2": 0.0498046875, "loss_aux_layer_20": 0.135986328125, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.20166015625, "loss_aux_layer_3": 0.06024169921875, "loss_aux_layer_4": 0.06268310546875, "loss_aux_layer_5": 0.064453125, "loss_aux_layer_6": 0.0677490234375, "loss_aux_layer_7": 0.06585693359375, "loss_aux_layer_8": 0.0653076171875, "loss_aux_layer_9": 0.06427001953125, "step": 3099, "total_loss": 0.6626297831535339 }, { "epoch": 0.6137398534943576, "grad_norm": 1.1801509857177734, "learning_rate": 5e-05, "llm_loss": 0.5330308973789215, "loss": 2.4896, "loss_aux_layer_0": 0.017974853515625, "loss_aux_layer_1": 0.03662109375, "loss_aux_layer_10": 0.065185546875, "loss_aux_layer_11": 0.0693359375, "loss_aux_layer_12": 0.0736083984375, "loss_aux_layer_13": 0.079345703125, "loss_aux_layer_14": 0.0885009765625, "loss_aux_layer_15": 0.097412109375, "loss_aux_layer_16": 0.1070556640625, "loss_aux_layer_17": 0.1151123046875, "loss_aux_layer_18": 0.1234130859375, "loss_aux_layer_19": 0.127197265625, "loss_aux_layer_2": 0.0504150390625, "loss_aux_layer_20": 0.134765625, "loss_aux_layer_21": 0.142333984375, "loss_aux_layer_22": 0.1640625, "loss_aux_layer_23": 0.201416015625, "loss_aux_layer_3": 0.06085205078125, "loss_aux_layer_4": 0.063232421875, "loss_aux_layer_5": 0.0648193359375, "loss_aux_layer_6": 0.0677490234375, "loss_aux_layer_7": 0.0655517578125, "loss_aux_layer_8": 0.06500244140625, "loss_aux_layer_9": 0.0640869140625, "step": 3100, "total_loss": 0.6224066317081451 }, { "epoch": 0.613937834092259, "grad_norm": 0.8812125325202942, "learning_rate": 5e-05, "llm_loss": 0.5615832507610321, "loss": 2.5901, "loss_aux_layer_0": 0.0163116455078125, "loss_aux_layer_1": 0.03509521484375, "loss_aux_layer_10": 0.06268310546875, "loss_aux_layer_11": 0.066650390625, "loss_aux_layer_12": 0.0712890625, "loss_aux_layer_13": 0.0772705078125, "loss_aux_layer_14": 0.085693359375, "loss_aux_layer_15": 0.0938720703125, "loss_aux_layer_16": 0.1029052734375, "loss_aux_layer_17": 0.1104736328125, "loss_aux_layer_18": 0.119140625, "loss_aux_layer_19": 0.121826171875, "loss_aux_layer_2": 0.0479736328125, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.05804443359375, "loss_aux_layer_4": 0.06048583984375, "loss_aux_layer_5": 0.06219482421875, "loss_aux_layer_6": 0.06494140625, "loss_aux_layer_7": 0.06304931640625, "loss_aux_layer_8": 0.062255859375, "loss_aux_layer_9": 0.061279296875, "step": 3101, "total_loss": 0.647517591714859 }, { "epoch": 0.6141358146901603, "grad_norm": 0.8861916661262512, "learning_rate": 5e-05, "llm_loss": 0.5482999607920647, "loss": 2.5464, "loss_aux_layer_0": 0.017547607421875, "loss_aux_layer_1": 0.0357666015625, "loss_aux_layer_10": 0.0638427734375, "loss_aux_layer_11": 0.06787109375, "loss_aux_layer_12": 0.07275390625, "loss_aux_layer_13": 0.0787353515625, "loss_aux_layer_14": 0.087646484375, "loss_aux_layer_15": 0.0966796875, "loss_aux_layer_16": 0.10693359375, "loss_aux_layer_17": 0.1148681640625, "loss_aux_layer_18": 0.1234130859375, "loss_aux_layer_19": 0.1268310546875, "loss_aux_layer_2": 0.0484619140625, "loss_aux_layer_20": 0.134521484375, "loss_aux_layer_21": 0.142333984375, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.05841064453125, "loss_aux_layer_4": 0.0611572265625, "loss_aux_layer_5": 0.06292724609375, "loss_aux_layer_6": 0.066162109375, "loss_aux_layer_7": 0.0638427734375, "loss_aux_layer_8": 0.06341552734375, "loss_aux_layer_9": 0.0623779296875, "step": 3102, "total_loss": 0.6366048157215118 }, { "epoch": 0.6143337952880618, "grad_norm": 1.1399152278900146, "learning_rate": 5e-05, "llm_loss": 0.5243785232305527, "loss": 2.4467, "loss_aux_layer_0": 0.017486572265625, "loss_aux_layer_1": 0.03570556640625, "loss_aux_layer_10": 0.06280517578125, "loss_aux_layer_11": 0.06707763671875, "loss_aux_layer_12": 0.07177734375, "loss_aux_layer_13": 0.0771484375, "loss_aux_layer_14": 0.086181640625, "loss_aux_layer_15": 0.09521484375, "loss_aux_layer_16": 0.1048583984375, "loss_aux_layer_17": 0.1124267578125, "loss_aux_layer_18": 0.1209716796875, "loss_aux_layer_19": 0.125, "loss_aux_layer_2": 0.0479736328125, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.162353515625, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.0579833984375, "loss_aux_layer_4": 0.0604248046875, "loss_aux_layer_5": 0.062255859375, "loss_aux_layer_6": 0.06536865234375, "loss_aux_layer_7": 0.0633544921875, "loss_aux_layer_8": 0.062744140625, "loss_aux_layer_9": 0.06158447265625, "step": 3103, "total_loss": 0.6116753965616226 }, { "epoch": 0.6145317758859632, "grad_norm": 1.0052516460418701, "learning_rate": 5e-05, "llm_loss": 0.5317915230989456, "loss": 2.4744, "loss_aux_layer_0": 0.01708984375, "loss_aux_layer_1": 0.03533935546875, "loss_aux_layer_10": 0.0623779296875, "loss_aux_layer_11": 0.06658935546875, "loss_aux_layer_12": 0.0712890625, "loss_aux_layer_13": 0.0771484375, "loss_aux_layer_14": 0.0860595703125, "loss_aux_layer_15": 0.0953369140625, "loss_aux_layer_16": 0.1046142578125, "loss_aux_layer_17": 0.1124267578125, "loss_aux_layer_18": 0.120849609375, "loss_aux_layer_19": 0.123779296875, "loss_aux_layer_2": 0.0484619140625, "loss_aux_layer_20": 0.131591796875, "loss_aux_layer_21": 0.1396484375, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.197021484375, "loss_aux_layer_3": 0.05841064453125, "loss_aux_layer_4": 0.06097412109375, "loss_aux_layer_5": 0.0626220703125, "loss_aux_layer_6": 0.0653076171875, "loss_aux_layer_7": 0.063232421875, "loss_aux_layer_8": 0.0625, "loss_aux_layer_9": 0.06103515625, "step": 3104, "total_loss": 0.6185964792966843 }, { "epoch": 0.6147297564838646, "grad_norm": 0.9783931970596313, "learning_rate": 5e-05, "llm_loss": 0.5929429829120636, "loss": 2.7153, "loss_aux_layer_0": 0.017974853515625, "loss_aux_layer_1": 0.03466796875, "loss_aux_layer_10": 0.0615234375, "loss_aux_layer_11": 0.0653076171875, "loss_aux_layer_12": 0.0699462890625, "loss_aux_layer_13": 0.0755615234375, "loss_aux_layer_14": 0.0843505859375, "loss_aux_layer_15": 0.093017578125, "loss_aux_layer_16": 0.102783203125, "loss_aux_layer_17": 0.111083984375, "loss_aux_layer_18": 0.119873046875, "loss_aux_layer_19": 0.12353515625, "loss_aux_layer_2": 0.0477294921875, "loss_aux_layer_20": 0.1319580078125, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.057373046875, "loss_aux_layer_4": 0.0599365234375, "loss_aux_layer_5": 0.0615234375, "loss_aux_layer_6": 0.06414794921875, "loss_aux_layer_7": 0.06231689453125, "loss_aux_layer_8": 0.0615234375, "loss_aux_layer_9": 0.060302734375, "step": 3105, "total_loss": 0.6788239479064941 }, { "epoch": 0.614927737081766, "grad_norm": 1.0919315814971924, "learning_rate": 5e-05, "llm_loss": 0.6193975806236267, "loss": 2.8309, "loss_aux_layer_0": 0.01678466796875, "loss_aux_layer_1": 0.03515625, "loss_aux_layer_10": 0.0633544921875, "loss_aux_layer_11": 0.067626953125, "loss_aux_layer_12": 0.072021484375, "loss_aux_layer_13": 0.0782470703125, "loss_aux_layer_14": 0.087646484375, "loss_aux_layer_15": 0.0968017578125, "loss_aux_layer_16": 0.1064453125, "loss_aux_layer_17": 0.1143798828125, "loss_aux_layer_18": 0.1239013671875, "loss_aux_layer_19": 0.127197265625, "loss_aux_layer_2": 0.04840087890625, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.143310546875, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.201416015625, "loss_aux_layer_3": 0.0584716796875, "loss_aux_layer_4": 0.0611572265625, "loss_aux_layer_5": 0.06292724609375, "loss_aux_layer_6": 0.0660400390625, "loss_aux_layer_7": 0.0638427734375, "loss_aux_layer_8": 0.06317138671875, "loss_aux_layer_9": 0.0618896484375, "step": 3106, "total_loss": 0.7077171355485916 }, { "epoch": 0.6151257176796674, "grad_norm": 0.8854255080223083, "learning_rate": 5e-05, "llm_loss": 0.5964924097061157, "loss": 2.7193, "loss_aux_layer_0": 0.01654052734375, "loss_aux_layer_1": 0.03369140625, "loss_aux_layer_10": 0.0594482421875, "loss_aux_layer_11": 0.0631103515625, "loss_aux_layer_12": 0.0675048828125, "loss_aux_layer_13": 0.07275390625, "loss_aux_layer_14": 0.0816650390625, "loss_aux_layer_15": 0.0904541015625, "loss_aux_layer_16": 0.099853515625, "loss_aux_layer_17": 0.108154296875, "loss_aux_layer_18": 0.1173095703125, "loss_aux_layer_19": 0.12109375, "loss_aux_layer_2": 0.04608154296875, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.190673828125, "loss_aux_layer_3": 0.05572509765625, "loss_aux_layer_4": 0.05792236328125, "loss_aux_layer_5": 0.0594482421875, "loss_aux_layer_6": 0.06219482421875, "loss_aux_layer_7": 0.06011962890625, "loss_aux_layer_8": 0.0594482421875, "loss_aux_layer_9": 0.05810546875, "step": 3107, "total_loss": 0.679833859205246 }, { "epoch": 0.6153236982775688, "grad_norm": 1.0448673963546753, "learning_rate": 5e-05, "llm_loss": 0.5942361950874329, "loss": 2.7328, "loss_aux_layer_0": 0.016876220703125, "loss_aux_layer_1": 0.0360107421875, "loss_aux_layer_10": 0.06494140625, "loss_aux_layer_11": 0.0693359375, "loss_aux_layer_12": 0.07421875, "loss_aux_layer_13": 0.0802001953125, "loss_aux_layer_14": 0.0892333984375, "loss_aux_layer_15": 0.0980224609375, "loss_aux_layer_16": 0.1070556640625, "loss_aux_layer_17": 0.114990234375, "loss_aux_layer_18": 0.1226806640625, "loss_aux_layer_19": 0.1258544921875, "loss_aux_layer_2": 0.0498046875, "loss_aux_layer_20": 0.1337890625, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.200927734375, "loss_aux_layer_3": 0.0599365234375, "loss_aux_layer_4": 0.0623779296875, "loss_aux_layer_5": 0.06396484375, "loss_aux_layer_6": 0.06689453125, "loss_aux_layer_7": 0.06494140625, "loss_aux_layer_8": 0.06463623046875, "loss_aux_layer_9": 0.0634765625, "step": 3108, "total_loss": 0.6832111775875092 }, { "epoch": 0.6155216788754702, "grad_norm": 0.750688910484314, "learning_rate": 5e-05, "llm_loss": 0.6077259331941605, "loss": 2.7916, "loss_aux_layer_0": 0.016357421875, "loss_aux_layer_1": 0.03765869140625, "loss_aux_layer_10": 0.0670166015625, "loss_aux_layer_11": 0.071533203125, "loss_aux_layer_12": 0.0762939453125, "loss_aux_layer_13": 0.0819091796875, "loss_aux_layer_14": 0.0904541015625, "loss_aux_layer_15": 0.0985107421875, "loss_aux_layer_16": 0.10791015625, "loss_aux_layer_17": 0.1153564453125, "loss_aux_layer_18": 0.1236572265625, "loss_aux_layer_19": 0.1260986328125, "loss_aux_layer_2": 0.0518798828125, "loss_aux_layer_20": 0.133544921875, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.160888671875, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.062744140625, "loss_aux_layer_4": 0.0654296875, "loss_aux_layer_5": 0.0670166015625, "loss_aux_layer_6": 0.070068359375, "loss_aux_layer_7": 0.0679931640625, "loss_aux_layer_8": 0.067138671875, "loss_aux_layer_9": 0.0657958984375, "step": 3109, "total_loss": 0.6978971660137177 }, { "epoch": 0.6157196594733716, "grad_norm": 1.0987433195114136, "learning_rate": 5e-05, "llm_loss": 0.49809375405311584, "loss": 2.3286, "loss_aux_layer_0": 0.016937255859375, "loss_aux_layer_1": 0.032379150390625, "loss_aux_layer_10": 0.05908203125, "loss_aux_layer_11": 0.06304931640625, "loss_aux_layer_12": 0.0675048828125, "loss_aux_layer_13": 0.0732421875, "loss_aux_layer_14": 0.08203125, "loss_aux_layer_15": 0.091552734375, "loss_aux_layer_16": 0.1014404296875, "loss_aux_layer_17": 0.10986328125, "loss_aux_layer_18": 0.118896484375, "loss_aux_layer_19": 0.123046875, "loss_aux_layer_2": 0.0445556640625, "loss_aux_layer_20": 0.1318359375, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.197998046875, "loss_aux_layer_3": 0.0543212890625, "loss_aux_layer_4": 0.05645751953125, "loss_aux_layer_5": 0.057861328125, "loss_aux_layer_6": 0.0606689453125, "loss_aux_layer_7": 0.0587158203125, "loss_aux_layer_8": 0.05828857421875, "loss_aux_layer_9": 0.0574951171875, "step": 3110, "total_loss": 0.5821510851383209 }, { "epoch": 0.615917640071273, "grad_norm": 0.8466034531593323, "learning_rate": 5e-05, "llm_loss": 0.5008621960878372, "loss": 2.3604, "loss_aux_layer_0": 0.01611328125, "loss_aux_layer_1": 0.03631591796875, "loss_aux_layer_10": 0.06524658203125, "loss_aux_layer_11": 0.069580078125, "loss_aux_layer_12": 0.07421875, "loss_aux_layer_13": 0.080078125, "loss_aux_layer_14": 0.0892333984375, "loss_aux_layer_15": 0.0980224609375, "loss_aux_layer_16": 0.1077880859375, "loss_aux_layer_17": 0.1153564453125, "loss_aux_layer_18": 0.1234130859375, "loss_aux_layer_19": 0.1259765625, "loss_aux_layer_2": 0.049560546875, "loss_aux_layer_20": 0.13330078125, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.16357421875, "loss_aux_layer_23": 0.20166015625, "loss_aux_layer_3": 0.05987548828125, "loss_aux_layer_4": 0.06280517578125, "loss_aux_layer_5": 0.064697265625, "loss_aux_layer_6": 0.06787109375, "loss_aux_layer_7": 0.06591796875, "loss_aux_layer_8": 0.06536865234375, "loss_aux_layer_9": 0.0640869140625, "step": 3111, "total_loss": 0.5901058316230774 }, { "epoch": 0.6161156206691745, "grad_norm": 0.8751788139343262, "learning_rate": 5e-05, "llm_loss": 0.6023076474666595, "loss": 2.7671, "loss_aux_layer_0": 0.0161590576171875, "loss_aux_layer_1": 0.03619384765625, "loss_aux_layer_10": 0.0660400390625, "loss_aux_layer_11": 0.0703125, "loss_aux_layer_12": 0.0750732421875, "loss_aux_layer_13": 0.0809326171875, "loss_aux_layer_14": 0.0899658203125, "loss_aux_layer_15": 0.0985107421875, "loss_aux_layer_16": 0.1075439453125, "loss_aux_layer_17": 0.115234375, "loss_aux_layer_18": 0.1236572265625, "loss_aux_layer_19": 0.12646484375, "loss_aux_layer_2": 0.050537109375, "loss_aux_layer_20": 0.13330078125, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.199462890625, "loss_aux_layer_3": 0.06103515625, "loss_aux_layer_4": 0.0635986328125, "loss_aux_layer_5": 0.065185546875, "loss_aux_layer_6": 0.068115234375, "loss_aux_layer_7": 0.0660400390625, "loss_aux_layer_8": 0.06561279296875, "loss_aux_layer_9": 0.06439208984375, "step": 3112, "total_loss": 0.6917689442634583 }, { "epoch": 0.6163136012670758, "grad_norm": 1.0091001987457275, "learning_rate": 5e-05, "llm_loss": 0.6322897672653198, "loss": 2.8695, "loss_aux_layer_0": 0.016387939453125, "loss_aux_layer_1": 0.033538818359375, "loss_aux_layer_10": 0.0611572265625, "loss_aux_layer_11": 0.065185546875, "loss_aux_layer_12": 0.07000732421875, "loss_aux_layer_13": 0.0753173828125, "loss_aux_layer_14": 0.083740234375, "loss_aux_layer_15": 0.09228515625, "loss_aux_layer_16": 0.101806640625, "loss_aux_layer_17": 0.1099853515625, "loss_aux_layer_18": 0.118408203125, "loss_aux_layer_19": 0.1219482421875, "loss_aux_layer_2": 0.04620361328125, "loss_aux_layer_20": 0.1297607421875, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.05633544921875, "loss_aux_layer_4": 0.05889892578125, "loss_aux_layer_5": 0.060791015625, "loss_aux_layer_6": 0.06353759765625, "loss_aux_layer_7": 0.06146240234375, "loss_aux_layer_8": 0.06097412109375, "loss_aux_layer_9": 0.05963134765625, "step": 3113, "total_loss": 0.7173829823732376 }, { "epoch": 0.6165115818649772, "grad_norm": 0.8285719752311707, "learning_rate": 5e-05, "llm_loss": 0.5493471398949623, "loss": 2.5453, "loss_aux_layer_0": 0.0166015625, "loss_aux_layer_1": 0.0347900390625, "loss_aux_layer_10": 0.06219482421875, "loss_aux_layer_11": 0.0662841796875, "loss_aux_layer_12": 0.0709228515625, "loss_aux_layer_13": 0.076904296875, "loss_aux_layer_14": 0.086181640625, "loss_aux_layer_15": 0.095458984375, "loss_aux_layer_16": 0.10498046875, "loss_aux_layer_17": 0.113037109375, "loss_aux_layer_18": 0.12158203125, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.048095703125, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.0577392578125, "loss_aux_layer_4": 0.060546875, "loss_aux_layer_5": 0.0621337890625, "loss_aux_layer_6": 0.0650634765625, "loss_aux_layer_7": 0.06280517578125, "loss_aux_layer_8": 0.06195068359375, "loss_aux_layer_9": 0.06072998046875, "step": 3114, "total_loss": 0.6363245993852615 }, { "epoch": 0.6167095624628787, "grad_norm": 1.0451271533966064, "learning_rate": 5e-05, "llm_loss": 0.6945685148239136, "loss": 3.137, "loss_aux_layer_0": 0.015838623046875, "loss_aux_layer_1": 0.036865234375, "loss_aux_layer_10": 0.06561279296875, "loss_aux_layer_11": 0.070068359375, "loss_aux_layer_12": 0.0748291015625, "loss_aux_layer_13": 0.080810546875, "loss_aux_layer_14": 0.0899658203125, "loss_aux_layer_15": 0.0985107421875, "loss_aux_layer_16": 0.1083984375, "loss_aux_layer_17": 0.1160888671875, "loss_aux_layer_18": 0.124267578125, "loss_aux_layer_19": 0.127197265625, "loss_aux_layer_2": 0.05010986328125, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.142333984375, "loss_aux_layer_22": 0.16357421875, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.060546875, "loss_aux_layer_4": 0.06329345703125, "loss_aux_layer_5": 0.0650634765625, "loss_aux_layer_6": 0.0682373046875, "loss_aux_layer_7": 0.06610107421875, "loss_aux_layer_8": 0.06536865234375, "loss_aux_layer_9": 0.0643310546875, "step": 3115, "total_loss": 0.7842569798231125 }, { "epoch": 0.61690754306078, "grad_norm": 1.2226744890213013, "learning_rate": 5e-05, "llm_loss": 0.612877145409584, "loss": 2.8043, "loss_aux_layer_0": 0.0161285400390625, "loss_aux_layer_1": 0.03656005859375, "loss_aux_layer_10": 0.06396484375, "loss_aux_layer_11": 0.0682373046875, "loss_aux_layer_12": 0.0732421875, "loss_aux_layer_13": 0.0791015625, "loss_aux_layer_14": 0.0877685546875, "loss_aux_layer_15": 0.09619140625, "loss_aux_layer_16": 0.1055908203125, "loss_aux_layer_17": 0.113525390625, "loss_aux_layer_18": 0.1217041015625, "loss_aux_layer_19": 0.124267578125, "loss_aux_layer_2": 0.05023193359375, "loss_aux_layer_20": 0.132568359375, "loss_aux_layer_21": 0.14013671875, "loss_aux_layer_22": 0.162353515625, "loss_aux_layer_23": 0.19921875, "loss_aux_layer_3": 0.060302734375, "loss_aux_layer_4": 0.06256103515625, "loss_aux_layer_5": 0.064453125, "loss_aux_layer_6": 0.0670166015625, "loss_aux_layer_7": 0.064697265625, "loss_aux_layer_8": 0.06402587890625, "loss_aux_layer_9": 0.06243896484375, "step": 3116, "total_loss": 0.7010653764009476 }, { "epoch": 0.6171055236586814, "grad_norm": 0.9884774684906006, "learning_rate": 5e-05, "llm_loss": 0.5772449970245361, "loss": 2.6538, "loss_aux_layer_0": 0.016693115234375, "loss_aux_layer_1": 0.035400390625, "loss_aux_layer_10": 0.06317138671875, "loss_aux_layer_11": 0.067626953125, "loss_aux_layer_12": 0.0723876953125, "loss_aux_layer_13": 0.07763671875, "loss_aux_layer_14": 0.0860595703125, "loss_aux_layer_15": 0.094482421875, "loss_aux_layer_16": 0.10400390625, "loss_aux_layer_17": 0.111572265625, "loss_aux_layer_18": 0.119140625, "loss_aux_layer_19": 0.1224365234375, "loss_aux_layer_2": 0.04864501953125, "loss_aux_layer_20": 0.129638671875, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.05841064453125, "loss_aux_layer_4": 0.06103515625, "loss_aux_layer_5": 0.0628662109375, "loss_aux_layer_6": 0.0654296875, "loss_aux_layer_7": 0.06329345703125, "loss_aux_layer_8": 0.0626220703125, "loss_aux_layer_9": 0.0616455078125, "step": 3117, "total_loss": 0.6634522527456284 }, { "epoch": 0.6173035042565829, "grad_norm": 1.0291777849197388, "learning_rate": 5e-05, "llm_loss": 0.6716775745153427, "loss": 3.0384, "loss_aux_layer_0": 0.0157470703125, "loss_aux_layer_1": 0.0350341796875, "loss_aux_layer_10": 0.06353759765625, "loss_aux_layer_11": 0.06787109375, "loss_aux_layer_12": 0.072998046875, "loss_aux_layer_13": 0.0787353515625, "loss_aux_layer_14": 0.0877685546875, "loss_aux_layer_15": 0.0968017578125, "loss_aux_layer_16": 0.1065673828125, "loss_aux_layer_17": 0.115234375, "loss_aux_layer_18": 0.1240234375, "loss_aux_layer_19": 0.126953125, "loss_aux_layer_2": 0.0477294921875, "loss_aux_layer_20": 0.13427734375, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.162109375, "loss_aux_layer_23": 0.19970703125, "loss_aux_layer_3": 0.0574951171875, "loss_aux_layer_4": 0.0601806640625, "loss_aux_layer_5": 0.0618896484375, "loss_aux_layer_6": 0.06494140625, "loss_aux_layer_7": 0.06304931640625, "loss_aux_layer_8": 0.06280517578125, "loss_aux_layer_9": 0.0618896484375, "step": 3118, "total_loss": 0.7596053183078766 }, { "epoch": 0.6175014848544843, "grad_norm": 1.2309917211532593, "learning_rate": 5e-05, "llm_loss": 0.6244971603155136, "loss": 2.8362, "loss_aux_layer_0": 0.0159912109375, "loss_aux_layer_1": 0.03424072265625, "loss_aux_layer_10": 0.05999755859375, "loss_aux_layer_11": 0.06402587890625, "loss_aux_layer_12": 0.0687255859375, "loss_aux_layer_13": 0.0743408203125, "loss_aux_layer_14": 0.083251953125, "loss_aux_layer_15": 0.092041015625, "loss_aux_layer_16": 0.1014404296875, "loss_aux_layer_17": 0.109619140625, "loss_aux_layer_18": 0.1180419921875, "loss_aux_layer_19": 0.12109375, "loss_aux_layer_2": 0.04656982421875, "loss_aux_layer_20": 0.12939453125, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.0560302734375, "loss_aux_layer_4": 0.05828857421875, "loss_aux_layer_5": 0.05987548828125, "loss_aux_layer_6": 0.06268310546875, "loss_aux_layer_7": 0.060546875, "loss_aux_layer_8": 0.06011962890625, "loss_aux_layer_9": 0.0587158203125, "step": 3119, "total_loss": 0.7090558111667633 }, { "epoch": 0.6176994654523856, "grad_norm": 0.9576053619384766, "learning_rate": 5e-05, "llm_loss": 0.5992956012487411, "loss": 2.7538, "loss_aux_layer_0": 0.016387939453125, "loss_aux_layer_1": 0.0364990234375, "loss_aux_layer_10": 0.06561279296875, "loss_aux_layer_11": 0.070068359375, "loss_aux_layer_12": 0.074951171875, "loss_aux_layer_13": 0.08056640625, "loss_aux_layer_14": 0.088623046875, "loss_aux_layer_15": 0.09716796875, "loss_aux_layer_16": 0.1063232421875, "loss_aux_layer_17": 0.114013671875, "loss_aux_layer_18": 0.1221923828125, "loss_aux_layer_19": 0.125, "loss_aux_layer_2": 0.05059814453125, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.162353515625, "loss_aux_layer_23": 0.19970703125, "loss_aux_layer_3": 0.06146240234375, "loss_aux_layer_4": 0.06414794921875, "loss_aux_layer_5": 0.065673828125, "loss_aux_layer_6": 0.068603515625, "loss_aux_layer_7": 0.06640625, "loss_aux_layer_8": 0.06561279296875, "loss_aux_layer_9": 0.064208984375, "step": 3120, "total_loss": 0.6884536445140839 }, { "epoch": 0.6178974460502871, "grad_norm": 0.9593137502670288, "learning_rate": 5e-05, "llm_loss": 0.5144835487008095, "loss": 2.4038, "loss_aux_layer_0": 0.016143798828125, "loss_aux_layer_1": 0.0350341796875, "loss_aux_layer_10": 0.062255859375, "loss_aux_layer_11": 0.06640625, "loss_aux_layer_12": 0.071044921875, "loss_aux_layer_13": 0.0767822265625, "loss_aux_layer_14": 0.0850830078125, "loss_aux_layer_15": 0.09375, "loss_aux_layer_16": 0.1036376953125, "loss_aux_layer_17": 0.111083984375, "loss_aux_layer_18": 0.1199951171875, "loss_aux_layer_19": 0.1241455078125, "loss_aux_layer_2": 0.04766845703125, "loss_aux_layer_20": 0.1318359375, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.0574951171875, "loss_aux_layer_4": 0.06005859375, "loss_aux_layer_5": 0.06182861328125, "loss_aux_layer_6": 0.0648193359375, "loss_aux_layer_7": 0.06268310546875, "loss_aux_layer_8": 0.06195068359375, "loss_aux_layer_9": 0.06085205078125, "step": 3121, "total_loss": 0.6009473204612732 }, { "epoch": 0.6180954266481885, "grad_norm": 0.8772031664848328, "learning_rate": 5e-05, "llm_loss": 0.5054189115762711, "loss": 2.3775, "loss_aux_layer_0": 0.016571044921875, "loss_aux_layer_1": 0.0361328125, "loss_aux_layer_10": 0.06427001953125, "loss_aux_layer_11": 0.0687255859375, "loss_aux_layer_12": 0.0738525390625, "loss_aux_layer_13": 0.0799560546875, "loss_aux_layer_14": 0.0892333984375, "loss_aux_layer_15": 0.09814453125, "loss_aux_layer_16": 0.107666015625, "loss_aux_layer_17": 0.1156005859375, "loss_aux_layer_18": 0.1239013671875, "loss_aux_layer_19": 0.126953125, "loss_aux_layer_2": 0.0494384765625, "loss_aux_layer_20": 0.13427734375, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.16357421875, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.0599365234375, "loss_aux_layer_4": 0.06231689453125, "loss_aux_layer_5": 0.06390380859375, "loss_aux_layer_6": 0.06689453125, "loss_aux_layer_7": 0.06463623046875, "loss_aux_layer_8": 0.06402587890625, "loss_aux_layer_9": 0.0628662109375, "step": 3122, "total_loss": 0.5943747162818909 }, { "epoch": 0.6182934072460898, "grad_norm": 0.973210334777832, "learning_rate": 5e-05, "llm_loss": 0.5063304156064987, "loss": 2.369, "loss_aux_layer_0": 0.016143798828125, "loss_aux_layer_1": 0.03424072265625, "loss_aux_layer_10": 0.0616455078125, "loss_aux_layer_11": 0.06573486328125, "loss_aux_layer_12": 0.0703125, "loss_aux_layer_13": 0.0760498046875, "loss_aux_layer_14": 0.0849609375, "loss_aux_layer_15": 0.0938720703125, "loss_aux_layer_16": 0.103759765625, "loss_aux_layer_17": 0.111572265625, "loss_aux_layer_18": 0.1199951171875, "loss_aux_layer_19": 0.1236572265625, "loss_aux_layer_2": 0.04779052734375, "loss_aux_layer_20": 0.131591796875, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.05706787109375, "loss_aux_layer_4": 0.0594482421875, "loss_aux_layer_5": 0.0609130859375, "loss_aux_layer_6": 0.06365966796875, "loss_aux_layer_7": 0.06182861328125, "loss_aux_layer_8": 0.06146240234375, "loss_aux_layer_9": 0.06011962890625, "step": 3123, "total_loss": 0.592249870300293 }, { "epoch": 0.6184913878439913, "grad_norm": 0.9101972579956055, "learning_rate": 5e-05, "llm_loss": 0.6162610501050949, "loss": 2.7975, "loss_aux_layer_0": 0.015869140625, "loss_aux_layer_1": 0.0333251953125, "loss_aux_layer_10": 0.05889892578125, "loss_aux_layer_11": 0.06298828125, "loss_aux_layer_12": 0.06756591796875, "loss_aux_layer_13": 0.0731201171875, "loss_aux_layer_14": 0.0816650390625, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.099365234375, "loss_aux_layer_17": 0.1077880859375, "loss_aux_layer_18": 0.1162109375, "loss_aux_layer_19": 0.1197509765625, "loss_aux_layer_2": 0.04522705078125, "loss_aux_layer_20": 0.12841796875, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.05462646484375, "loss_aux_layer_4": 0.05706787109375, "loss_aux_layer_5": 0.05841064453125, "loss_aux_layer_6": 0.06109619140625, "loss_aux_layer_7": 0.0592041015625, "loss_aux_layer_8": 0.0587158203125, "loss_aux_layer_9": 0.05743408203125, "step": 3124, "total_loss": 0.6993709802627563 }, { "epoch": 0.6186893684418927, "grad_norm": 0.9577646851539612, "learning_rate": 5e-05, "llm_loss": 0.5027034878730774, "loss": 2.3669, "loss_aux_layer_0": 0.016632080078125, "loss_aux_layer_1": 0.03594970703125, "loss_aux_layer_10": 0.06414794921875, "loss_aux_layer_11": 0.068359375, "loss_aux_layer_12": 0.0728759765625, "loss_aux_layer_13": 0.078857421875, "loss_aux_layer_14": 0.0880126953125, "loss_aux_layer_15": 0.09716796875, "loss_aux_layer_16": 0.1070556640625, "loss_aux_layer_17": 0.1146240234375, "loss_aux_layer_18": 0.1234130859375, "loss_aux_layer_19": 0.127197265625, "loss_aux_layer_2": 0.04925537109375, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.14453125, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.205322265625, "loss_aux_layer_3": 0.05902099609375, "loss_aux_layer_4": 0.06146240234375, "loss_aux_layer_5": 0.06329345703125, "loss_aux_layer_6": 0.06640625, "loss_aux_layer_7": 0.06463623046875, "loss_aux_layer_8": 0.06414794921875, "loss_aux_layer_9": 0.06292724609375, "step": 3125, "total_loss": 0.5917244404554367 }, { "epoch": 0.6188873490397941, "grad_norm": 0.8206095695495605, "learning_rate": 5e-05, "llm_loss": 0.5604757964611053, "loss": 2.5903, "loss_aux_layer_0": 0.01593017578125, "loss_aux_layer_1": 0.03460693359375, "loss_aux_layer_10": 0.06353759765625, "loss_aux_layer_11": 0.0677490234375, "loss_aux_layer_12": 0.0726318359375, "loss_aux_layer_13": 0.0784912109375, "loss_aux_layer_14": 0.08740234375, "loss_aux_layer_15": 0.0960693359375, "loss_aux_layer_16": 0.10546875, "loss_aux_layer_17": 0.113525390625, "loss_aux_layer_18": 0.12109375, "loss_aux_layer_19": 0.1240234375, "loss_aux_layer_2": 0.0479736328125, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.138916015625, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.05804443359375, "loss_aux_layer_4": 0.06072998046875, "loss_aux_layer_5": 0.0625, "loss_aux_layer_6": 0.06561279296875, "loss_aux_layer_7": 0.0634765625, "loss_aux_layer_8": 0.06317138671875, "loss_aux_layer_9": 0.06219482421875, "step": 3126, "total_loss": 0.6475852131843567 }, { "epoch": 0.6190853296376955, "grad_norm": 0.9392625689506531, "learning_rate": 5e-05, "llm_loss": 0.5542929023504257, "loss": 2.5498, "loss_aux_layer_0": 0.0159454345703125, "loss_aux_layer_1": 0.03436279296875, "loss_aux_layer_10": 0.05999755859375, "loss_aux_layer_11": 0.0640869140625, "loss_aux_layer_12": 0.0684814453125, "loss_aux_layer_13": 0.07373046875, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.089599609375, "loss_aux_layer_16": 0.098876953125, "loss_aux_layer_17": 0.1070556640625, "loss_aux_layer_18": 0.1148681640625, "loss_aux_layer_19": 0.1180419921875, "loss_aux_layer_2": 0.0465087890625, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.0557861328125, "loss_aux_layer_4": 0.05841064453125, "loss_aux_layer_5": 0.0596923828125, "loss_aux_layer_6": 0.06280517578125, "loss_aux_layer_7": 0.06048583984375, "loss_aux_layer_8": 0.0599365234375, "loss_aux_layer_9": 0.058837890625, "step": 3127, "total_loss": 0.6374562233686447 }, { "epoch": 0.6192833102355969, "grad_norm": 0.8470501899719238, "learning_rate": 5e-05, "llm_loss": 0.5279221534729004, "loss": 2.454, "loss_aux_layer_0": 0.01629638671875, "loss_aux_layer_1": 0.034423828125, "loss_aux_layer_10": 0.06134033203125, "loss_aux_layer_11": 0.065185546875, "loss_aux_layer_12": 0.06982421875, "loss_aux_layer_13": 0.0751953125, "loss_aux_layer_14": 0.0838623046875, "loss_aux_layer_15": 0.0924072265625, "loss_aux_layer_16": 0.1019287109375, "loss_aux_layer_17": 0.1099853515625, "loss_aux_layer_18": 0.11865234375, "loss_aux_layer_19": 0.1224365234375, "loss_aux_layer_2": 0.0478515625, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.0574951171875, "loss_aux_layer_4": 0.0599365234375, "loss_aux_layer_5": 0.06146240234375, "loss_aux_layer_6": 0.0643310546875, "loss_aux_layer_7": 0.0621337890625, "loss_aux_layer_8": 0.06134033203125, "loss_aux_layer_9": 0.06011962890625, "step": 3128, "total_loss": 0.6134887635707855 }, { "epoch": 0.6194812908334983, "grad_norm": 0.8422656059265137, "learning_rate": 5e-05, "llm_loss": 0.6070053577423096, "loss": 2.775, "loss_aux_layer_0": 0.01556396484375, "loss_aux_layer_1": 0.035400390625, "loss_aux_layer_10": 0.06475830078125, "loss_aux_layer_11": 0.0689697265625, "loss_aux_layer_12": 0.0736083984375, "loss_aux_layer_13": 0.078857421875, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.0946044921875, "loss_aux_layer_16": 0.10400390625, "loss_aux_layer_17": 0.112060546875, "loss_aux_layer_18": 0.11962890625, "loss_aux_layer_19": 0.1219482421875, "loss_aux_layer_2": 0.04833984375, "loss_aux_layer_20": 0.12939453125, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.05889892578125, "loss_aux_layer_4": 0.06195068359375, "loss_aux_layer_5": 0.063720703125, "loss_aux_layer_6": 0.0670166015625, "loss_aux_layer_7": 0.06500244140625, "loss_aux_layer_8": 0.0643310546875, "loss_aux_layer_9": 0.06317138671875, "step": 3129, "total_loss": 0.693753644824028 }, { "epoch": 0.6196792714313997, "grad_norm": 0.8277801871299744, "learning_rate": 5e-05, "llm_loss": 0.47281256318092346, "loss": 2.2482, "loss_aux_layer_0": 0.01617431640625, "loss_aux_layer_1": 0.0362548828125, "loss_aux_layer_10": 0.0660400390625, "loss_aux_layer_11": 0.0701904296875, "loss_aux_layer_12": 0.0748291015625, "loss_aux_layer_13": 0.080322265625, "loss_aux_layer_14": 0.0887451171875, "loss_aux_layer_15": 0.0970458984375, "loss_aux_layer_16": 0.106201171875, "loss_aux_layer_17": 0.1141357421875, "loss_aux_layer_18": 0.122802734375, "loss_aux_layer_19": 0.125732421875, "loss_aux_layer_2": 0.04998779296875, "loss_aux_layer_20": 0.133544921875, "loss_aux_layer_21": 0.141357421875, "loss_aux_layer_22": 0.163330078125, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.060546875, "loss_aux_layer_4": 0.0633544921875, "loss_aux_layer_5": 0.06512451171875, "loss_aux_layer_6": 0.0682373046875, "loss_aux_layer_7": 0.066162109375, "loss_aux_layer_8": 0.06561279296875, "loss_aux_layer_9": 0.06451416015625, "step": 3130, "total_loss": 0.562043085694313 }, { "epoch": 0.6198772520293011, "grad_norm": 0.7853823900222778, "learning_rate": 5e-05, "llm_loss": 0.5520437657833099, "loss": 2.5625, "loss_aux_layer_0": 0.015533447265625, "loss_aux_layer_1": 0.03564453125, "loss_aux_layer_10": 0.0653076171875, "loss_aux_layer_11": 0.0697021484375, "loss_aux_layer_12": 0.0743408203125, "loss_aux_layer_13": 0.08056640625, "loss_aux_layer_14": 0.08935546875, "loss_aux_layer_15": 0.097900390625, "loss_aux_layer_16": 0.107177734375, "loss_aux_layer_17": 0.11474609375, "loss_aux_layer_18": 0.1220703125, "loss_aux_layer_19": 0.1241455078125, "loss_aux_layer_2": 0.04949951171875, "loss_aux_layer_20": 0.1318359375, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.160888671875, "loss_aux_layer_23": 0.197265625, "loss_aux_layer_3": 0.06005859375, "loss_aux_layer_4": 0.06292724609375, "loss_aux_layer_5": 0.0648193359375, "loss_aux_layer_6": 0.06781005859375, "loss_aux_layer_7": 0.065673828125, "loss_aux_layer_8": 0.06500244140625, "loss_aux_layer_9": 0.0638427734375, "step": 3131, "total_loss": 0.6406247019767761 }, { "epoch": 0.6200752326272025, "grad_norm": 0.847317099571228, "learning_rate": 5e-05, "llm_loss": 0.6981715559959412, "loss": 3.1273, "loss_aux_layer_0": 0.01580810546875, "loss_aux_layer_1": 0.0347900390625, "loss_aux_layer_10": 0.05999755859375, "loss_aux_layer_11": 0.06402587890625, "loss_aux_layer_12": 0.0684814453125, "loss_aux_layer_13": 0.073974609375, "loss_aux_layer_14": 0.0826416015625, "loss_aux_layer_15": 0.0911865234375, "loss_aux_layer_16": 0.100830078125, "loss_aux_layer_17": 0.109130859375, "loss_aux_layer_18": 0.1165771484375, "loss_aux_layer_19": 0.11962890625, "loss_aux_layer_2": 0.04656982421875, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.05621337890625, "loss_aux_layer_4": 0.05889892578125, "loss_aux_layer_5": 0.0601806640625, "loss_aux_layer_6": 0.06280517578125, "loss_aux_layer_7": 0.06085205078125, "loss_aux_layer_8": 0.0601806640625, "loss_aux_layer_9": 0.05859375, "step": 3132, "total_loss": 0.7818276137113571 }, { "epoch": 0.620273213225104, "grad_norm": 0.9312999248504639, "learning_rate": 5e-05, "llm_loss": 0.5800295919179916, "loss": 2.6589, "loss_aux_layer_0": 0.016082763671875, "loss_aux_layer_1": 0.03472900390625, "loss_aux_layer_10": 0.0611572265625, "loss_aux_layer_11": 0.06536865234375, "loss_aux_layer_12": 0.0699462890625, "loss_aux_layer_13": 0.075439453125, "loss_aux_layer_14": 0.08349609375, "loss_aux_layer_15": 0.092041015625, "loss_aux_layer_16": 0.1014404296875, "loss_aux_layer_17": 0.1094970703125, "loss_aux_layer_18": 0.1177978515625, "loss_aux_layer_19": 0.121337890625, "loss_aux_layer_2": 0.04669189453125, "loss_aux_layer_20": 0.128662109375, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.194580078125, "loss_aux_layer_3": 0.0562744140625, "loss_aux_layer_4": 0.0587158203125, "loss_aux_layer_5": 0.06024169921875, "loss_aux_layer_6": 0.0633544921875, "loss_aux_layer_7": 0.061279296875, "loss_aux_layer_8": 0.06097412109375, "loss_aux_layer_9": 0.05975341796875, "step": 3133, "total_loss": 0.6647175997495651 }, { "epoch": 0.6204711938230053, "grad_norm": 0.7826575636863708, "learning_rate": 5e-05, "llm_loss": 0.5769537389278412, "loss": 2.6381, "loss_aux_layer_0": 0.0157318115234375, "loss_aux_layer_1": 0.03338623046875, "loss_aux_layer_10": 0.05902099609375, "loss_aux_layer_11": 0.0628662109375, "loss_aux_layer_12": 0.06719970703125, "loss_aux_layer_13": 0.0726318359375, "loss_aux_layer_14": 0.0810546875, "loss_aux_layer_15": 0.08984375, "loss_aux_layer_16": 0.09912109375, "loss_aux_layer_17": 0.107421875, "loss_aux_layer_18": 0.1153564453125, "loss_aux_layer_19": 0.1197509765625, "loss_aux_layer_2": 0.0450439453125, "loss_aux_layer_20": 0.127685546875, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.05426025390625, "loss_aux_layer_4": 0.05657958984375, "loss_aux_layer_5": 0.05828857421875, "loss_aux_layer_6": 0.0611572265625, "loss_aux_layer_7": 0.05908203125, "loss_aux_layer_8": 0.0584716796875, "loss_aux_layer_9": 0.0576171875, "step": 3134, "total_loss": 0.6595213860273361 }, { "epoch": 0.6206691744209067, "grad_norm": 1.0348280668258667, "learning_rate": 5e-05, "llm_loss": 0.5704769566655159, "loss": 2.6259, "loss_aux_layer_0": 0.01617431640625, "loss_aux_layer_1": 0.0347900390625, "loss_aux_layer_10": 0.06219482421875, "loss_aux_layer_11": 0.06640625, "loss_aux_layer_12": 0.07080078125, "loss_aux_layer_13": 0.076416015625, "loss_aux_layer_14": 0.0850830078125, "loss_aux_layer_15": 0.0936279296875, "loss_aux_layer_16": 0.1036376953125, "loss_aux_layer_17": 0.11181640625, "loss_aux_layer_18": 0.120361328125, "loss_aux_layer_19": 0.1231689453125, "loss_aux_layer_2": 0.04803466796875, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.1943359375, "loss_aux_layer_3": 0.05780029296875, "loss_aux_layer_4": 0.06011962890625, "loss_aux_layer_5": 0.06158447265625, "loss_aux_layer_6": 0.064697265625, "loss_aux_layer_7": 0.06268310546875, "loss_aux_layer_8": 0.06219482421875, "loss_aux_layer_9": 0.0609130859375, "step": 3135, "total_loss": 0.6564636528491974 }, { "epoch": 0.6208671550188082, "grad_norm": 0.786223292350769, "learning_rate": 5e-05, "llm_loss": 0.5486416518688202, "loss": 2.5362, "loss_aux_layer_0": 0.0153350830078125, "loss_aux_layer_1": 0.03521728515625, "loss_aux_layer_10": 0.06256103515625, "loss_aux_layer_11": 0.06671142578125, "loss_aux_layer_12": 0.071044921875, "loss_aux_layer_13": 0.076416015625, "loss_aux_layer_14": 0.0845947265625, "loss_aux_layer_15": 0.0924072265625, "loss_aux_layer_16": 0.10107421875, "loss_aux_layer_17": 0.109130859375, "loss_aux_layer_18": 0.117431640625, "loss_aux_layer_19": 0.1209716796875, "loss_aux_layer_2": 0.04754638671875, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.19482421875, "loss_aux_layer_3": 0.05755615234375, "loss_aux_layer_4": 0.06024169921875, "loss_aux_layer_5": 0.06195068359375, "loss_aux_layer_6": 0.0650634765625, "loss_aux_layer_7": 0.0631103515625, "loss_aux_layer_8": 0.0625, "loss_aux_layer_9": 0.06121826171875, "step": 3136, "total_loss": 0.6340499371290207 }, { "epoch": 0.6210651356167095, "grad_norm": 0.8756604194641113, "learning_rate": 5e-05, "llm_loss": 0.6087488085031509, "loss": 2.7844, "loss_aux_layer_0": 0.0164947509765625, "loss_aux_layer_1": 0.0340576171875, "loss_aux_layer_10": 0.06207275390625, "loss_aux_layer_11": 0.06585693359375, "loss_aux_layer_12": 0.070556640625, "loss_aux_layer_13": 0.076416015625, "loss_aux_layer_14": 0.0860595703125, "loss_aux_layer_15": 0.0947265625, "loss_aux_layer_16": 0.1053466796875, "loss_aux_layer_17": 0.1146240234375, "loss_aux_layer_18": 0.1229248046875, "loss_aux_layer_19": 0.12744140625, "loss_aux_layer_2": 0.04693603515625, "loss_aux_layer_20": 0.13623046875, "loss_aux_layer_21": 0.144287109375, "loss_aux_layer_22": 0.164306640625, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.05694580078125, "loss_aux_layer_4": 0.059326171875, "loss_aux_layer_5": 0.06109619140625, "loss_aux_layer_6": 0.064208984375, "loss_aux_layer_7": 0.06219482421875, "loss_aux_layer_8": 0.06170654296875, "loss_aux_layer_9": 0.0606689453125, "step": 3137, "total_loss": 0.6961021572351456 }, { "epoch": 0.6212631162146109, "grad_norm": 0.825632631778717, "learning_rate": 5e-05, "llm_loss": 0.5995901376008987, "loss": 2.7487, "loss_aux_layer_0": 0.01556396484375, "loss_aux_layer_1": 0.03521728515625, "loss_aux_layer_10": 0.0626220703125, "loss_aux_layer_11": 0.06689453125, "loss_aux_layer_12": 0.0718994140625, "loss_aux_layer_13": 0.0780029296875, "loss_aux_layer_14": 0.0875244140625, "loss_aux_layer_15": 0.09619140625, "loss_aux_layer_16": 0.1058349609375, "loss_aux_layer_17": 0.1143798828125, "loss_aux_layer_18": 0.1229248046875, "loss_aux_layer_19": 0.12646484375, "loss_aux_layer_2": 0.0482177734375, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.19970703125, "loss_aux_layer_3": 0.057861328125, "loss_aux_layer_4": 0.0604248046875, "loss_aux_layer_5": 0.06219482421875, "loss_aux_layer_6": 0.065185546875, "loss_aux_layer_7": 0.06304931640625, "loss_aux_layer_8": 0.06243896484375, "loss_aux_layer_9": 0.06109619140625, "step": 3138, "total_loss": 0.6871668696403503 }, { "epoch": 0.6214610968125124, "grad_norm": 0.8893534541130066, "learning_rate": 5e-05, "llm_loss": 0.5711206793785095, "loss": 2.6235, "loss_aux_layer_0": 0.016082763671875, "loss_aux_layer_1": 0.0340576171875, "loss_aux_layer_10": 0.06103515625, "loss_aux_layer_11": 0.065185546875, "loss_aux_layer_12": 0.0697021484375, "loss_aux_layer_13": 0.075439453125, "loss_aux_layer_14": 0.083740234375, "loss_aux_layer_15": 0.09228515625, "loss_aux_layer_16": 0.101806640625, "loss_aux_layer_17": 0.110107421875, "loss_aux_layer_18": 0.1181640625, "loss_aux_layer_19": 0.121337890625, "loss_aux_layer_2": 0.04644775390625, "loss_aux_layer_20": 0.12939453125, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.0556640625, "loss_aux_layer_4": 0.05828857421875, "loss_aux_layer_5": 0.06005859375, "loss_aux_layer_6": 0.06304931640625, "loss_aux_layer_7": 0.06097412109375, "loss_aux_layer_8": 0.06048583984375, "loss_aux_layer_9": 0.05950927734375, "step": 3139, "total_loss": 0.6558703184127808 }, { "epoch": 0.6216590774104138, "grad_norm": 1.0393104553222656, "learning_rate": 5e-05, "llm_loss": 0.5535986796021461, "loss": 2.5804, "loss_aux_layer_0": 0.0155792236328125, "loss_aux_layer_1": 0.03662109375, "loss_aux_layer_10": 0.06787109375, "loss_aux_layer_11": 0.0723876953125, "loss_aux_layer_12": 0.0770263671875, "loss_aux_layer_13": 0.08251953125, "loss_aux_layer_14": 0.0914306640625, "loss_aux_layer_15": 0.1004638671875, "loss_aux_layer_16": 0.110107421875, "loss_aux_layer_17": 0.1177978515625, "loss_aux_layer_18": 0.1259765625, "loss_aux_layer_19": 0.12841796875, "loss_aux_layer_2": 0.0513916015625, "loss_aux_layer_20": 0.13623046875, "loss_aux_layer_21": 0.144287109375, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.20361328125, "loss_aux_layer_3": 0.062255859375, "loss_aux_layer_4": 0.0655517578125, "loss_aux_layer_5": 0.06768798828125, "loss_aux_layer_6": 0.07080078125, "loss_aux_layer_7": 0.06829833984375, "loss_aux_layer_8": 0.06768798828125, "loss_aux_layer_9": 0.06640625, "step": 3140, "total_loss": 0.6451033502817154 }, { "epoch": 0.6218570580083151, "grad_norm": 0.7485698461532593, "learning_rate": 5e-05, "llm_loss": 0.5317050367593765, "loss": 2.4581, "loss_aux_layer_0": 0.015899658203125, "loss_aux_layer_1": 0.033050537109375, "loss_aux_layer_10": 0.05853271484375, "loss_aux_layer_11": 0.0623779296875, "loss_aux_layer_12": 0.067138671875, "loss_aux_layer_13": 0.072021484375, "loss_aux_layer_14": 0.080322265625, "loss_aux_layer_15": 0.0888671875, "loss_aux_layer_16": 0.09814453125, "loss_aux_layer_17": 0.1060791015625, "loss_aux_layer_18": 0.115234375, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.04522705078125, "loss_aux_layer_20": 0.1280517578125, "loss_aux_layer_21": 0.136962890625, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.05419921875, "loss_aux_layer_4": 0.0565185546875, "loss_aux_layer_5": 0.05810546875, "loss_aux_layer_6": 0.0609130859375, "loss_aux_layer_7": 0.058837890625, "loss_aux_layer_8": 0.0582275390625, "loss_aux_layer_9": 0.0570068359375, "step": 3141, "total_loss": 0.6145205497741699 }, { "epoch": 0.6220550386062166, "grad_norm": 1.0812056064605713, "learning_rate": 5e-05, "llm_loss": 0.5886794179677963, "loss": 2.7105, "loss_aux_layer_0": 0.0154876708984375, "loss_aux_layer_1": 0.03582763671875, "loss_aux_layer_10": 0.06500244140625, "loss_aux_layer_11": 0.0692138671875, "loss_aux_layer_12": 0.073974609375, "loss_aux_layer_13": 0.080078125, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.09765625, "loss_aux_layer_16": 0.1072998046875, "loss_aux_layer_17": 0.115478515625, "loss_aux_layer_18": 0.123291015625, "loss_aux_layer_19": 0.12646484375, "loss_aux_layer_2": 0.04937744140625, "loss_aux_layer_20": 0.1337890625, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.20068359375, "loss_aux_layer_3": 0.059814453125, "loss_aux_layer_4": 0.06256103515625, "loss_aux_layer_5": 0.0645751953125, "loss_aux_layer_6": 0.0677490234375, "loss_aux_layer_7": 0.0653076171875, "loss_aux_layer_8": 0.0645751953125, "loss_aux_layer_9": 0.0633544921875, "step": 3142, "total_loss": 0.677637055516243 }, { "epoch": 0.622253019204118, "grad_norm": 0.9425358176231384, "learning_rate": 5e-05, "llm_loss": 0.5266808643937111, "loss": 2.4584, "loss_aux_layer_0": 0.0159149169921875, "loss_aux_layer_1": 0.0355224609375, "loss_aux_layer_10": 0.0633544921875, "loss_aux_layer_11": 0.0673828125, "loss_aux_layer_12": 0.072509765625, "loss_aux_layer_13": 0.0784912109375, "loss_aux_layer_14": 0.087646484375, "loss_aux_layer_15": 0.096435546875, "loss_aux_layer_16": 0.1065673828125, "loss_aux_layer_17": 0.1143798828125, "loss_aux_layer_18": 0.1231689453125, "loss_aux_layer_19": 0.12646484375, "loss_aux_layer_2": 0.0482177734375, "loss_aux_layer_20": 0.134521484375, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.19921875, "loss_aux_layer_3": 0.0584716796875, "loss_aux_layer_4": 0.06097412109375, "loss_aux_layer_5": 0.06268310546875, "loss_aux_layer_6": 0.0655517578125, "loss_aux_layer_7": 0.0633544921875, "loss_aux_layer_8": 0.06298828125, "loss_aux_layer_9": 0.06201171875, "step": 3143, "total_loss": 0.6145992428064346 }, { "epoch": 0.6224509998020195, "grad_norm": 1.1812602281570435, "learning_rate": 5e-05, "llm_loss": 0.6350644826889038, "loss": 2.8998, "loss_aux_layer_0": 0.016632080078125, "loss_aux_layer_1": 0.03631591796875, "loss_aux_layer_10": 0.065673828125, "loss_aux_layer_11": 0.06982421875, "loss_aux_layer_12": 0.074951171875, "loss_aux_layer_13": 0.080810546875, "loss_aux_layer_14": 0.089599609375, "loss_aux_layer_15": 0.0985107421875, "loss_aux_layer_16": 0.1082763671875, "loss_aux_layer_17": 0.116455078125, "loss_aux_layer_18": 0.124267578125, "loss_aux_layer_19": 0.12744140625, "loss_aux_layer_2": 0.04998779296875, "loss_aux_layer_20": 0.13525390625, "loss_aux_layer_21": 0.14306640625, "loss_aux_layer_22": 0.164306640625, "loss_aux_layer_23": 0.20166015625, "loss_aux_layer_3": 0.060791015625, "loss_aux_layer_4": 0.06341552734375, "loss_aux_layer_5": 0.06524658203125, "loss_aux_layer_6": 0.0684814453125, "loss_aux_layer_7": 0.0662841796875, "loss_aux_layer_8": 0.06549072265625, "loss_aux_layer_9": 0.064208984375, "step": 3144, "total_loss": 0.7249487042427063 }, { "epoch": 0.6226489803999208, "grad_norm": 0.9595181345939636, "learning_rate": 5e-05, "llm_loss": 0.6015327572822571, "loss": 2.7574, "loss_aux_layer_0": 0.016021728515625, "loss_aux_layer_1": 0.03692626953125, "loss_aux_layer_10": 0.06463623046875, "loss_aux_layer_11": 0.0689697265625, "loss_aux_layer_12": 0.0738525390625, "loss_aux_layer_13": 0.079345703125, "loss_aux_layer_14": 0.0877685546875, "loss_aux_layer_15": 0.0960693359375, "loss_aux_layer_16": 0.1053466796875, "loss_aux_layer_17": 0.1126708984375, "loss_aux_layer_18": 0.1207275390625, "loss_aux_layer_19": 0.1239013671875, "loss_aux_layer_2": 0.0496826171875, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.05999755859375, "loss_aux_layer_4": 0.0626220703125, "loss_aux_layer_5": 0.0643310546875, "loss_aux_layer_6": 0.0673828125, "loss_aux_layer_7": 0.0653076171875, "loss_aux_layer_8": 0.06475830078125, "loss_aux_layer_9": 0.063232421875, "step": 3145, "total_loss": 0.6893376410007477 }, { "epoch": 0.6228469609978222, "grad_norm": 1.0926965475082397, "learning_rate": 5e-05, "llm_loss": 0.5466097742319107, "loss": 2.545, "loss_aux_layer_0": 0.0155792236328125, "loss_aux_layer_1": 0.0369873046875, "loss_aux_layer_10": 0.06671142578125, "loss_aux_layer_11": 0.0714111328125, "loss_aux_layer_12": 0.0758056640625, "loss_aux_layer_13": 0.0814208984375, "loss_aux_layer_14": 0.0899658203125, "loss_aux_layer_15": 0.0982666015625, "loss_aux_layer_16": 0.107421875, "loss_aux_layer_17": 0.114990234375, "loss_aux_layer_18": 0.123046875, "loss_aux_layer_19": 0.126220703125, "loss_aux_layer_2": 0.0504150390625, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.162353515625, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.0609130859375, "loss_aux_layer_4": 0.06390380859375, "loss_aux_layer_5": 0.06561279296875, "loss_aux_layer_6": 0.0689697265625, "loss_aux_layer_7": 0.0667724609375, "loss_aux_layer_8": 0.066162109375, "loss_aux_layer_9": 0.0650634765625, "step": 3146, "total_loss": 0.6362600177526474 }, { "epoch": 0.6230449415957237, "grad_norm": 1.093747854232788, "learning_rate": 5e-05, "llm_loss": 0.5594237893819809, "loss": 2.5838, "loss_aux_layer_0": 0.01611328125, "loss_aux_layer_1": 0.03497314453125, "loss_aux_layer_10": 0.061767578125, "loss_aux_layer_11": 0.066162109375, "loss_aux_layer_12": 0.071044921875, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.0855712890625, "loss_aux_layer_15": 0.09423828125, "loss_aux_layer_16": 0.1038818359375, "loss_aux_layer_17": 0.112060546875, "loss_aux_layer_18": 0.12060546875, "loss_aux_layer_19": 0.12451171875, "loss_aux_layer_2": 0.0484619140625, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.14013671875, "loss_aux_layer_22": 0.160888671875, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.0579833984375, "loss_aux_layer_4": 0.0604248046875, "loss_aux_layer_5": 0.06201171875, "loss_aux_layer_6": 0.064697265625, "loss_aux_layer_7": 0.06243896484375, "loss_aux_layer_8": 0.061767578125, "loss_aux_layer_9": 0.0604248046875, "step": 3147, "total_loss": 0.6459391415119171 }, { "epoch": 0.623242922193625, "grad_norm": 1.2258284091949463, "learning_rate": 5e-05, "llm_loss": 0.5377689898014069, "loss": 2.5101, "loss_aux_layer_0": 0.01641845703125, "loss_aux_layer_1": 0.03656005859375, "loss_aux_layer_10": 0.06573486328125, "loss_aux_layer_11": 0.0699462890625, "loss_aux_layer_12": 0.07470703125, "loss_aux_layer_13": 0.0804443359375, "loss_aux_layer_14": 0.0887451171875, "loss_aux_layer_15": 0.0972900390625, "loss_aux_layer_16": 0.1070556640625, "loss_aux_layer_17": 0.114990234375, "loss_aux_layer_18": 0.1239013671875, "loss_aux_layer_19": 0.1270751953125, "loss_aux_layer_2": 0.04974365234375, "loss_aux_layer_20": 0.13525390625, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.165771484375, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.0601806640625, "loss_aux_layer_4": 0.06292724609375, "loss_aux_layer_5": 0.06500244140625, "loss_aux_layer_6": 0.0682373046875, "loss_aux_layer_7": 0.06610107421875, "loss_aux_layer_8": 0.06549072265625, "loss_aux_layer_9": 0.0643310546875, "step": 3148, "total_loss": 0.6275191903114319 }, { "epoch": 0.6234409027915264, "grad_norm": 1.1179431676864624, "learning_rate": 5e-05, "llm_loss": 0.5158255323767662, "loss": 2.4051, "loss_aux_layer_0": 0.0155181884765625, "loss_aux_layer_1": 0.033233642578125, "loss_aux_layer_10": 0.06103515625, "loss_aux_layer_11": 0.06494140625, "loss_aux_layer_12": 0.06939697265625, "loss_aux_layer_13": 0.0748291015625, "loss_aux_layer_14": 0.0833740234375, "loss_aux_layer_15": 0.0921630859375, "loss_aux_layer_16": 0.1016845703125, "loss_aux_layer_17": 0.10986328125, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.1231689453125, "loss_aux_layer_2": 0.04595947265625, "loss_aux_layer_20": 0.131591796875, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.0557861328125, "loss_aux_layer_4": 0.05841064453125, "loss_aux_layer_5": 0.060302734375, "loss_aux_layer_6": 0.063232421875, "loss_aux_layer_7": 0.061279296875, "loss_aux_layer_8": 0.06072998046875, "loss_aux_layer_9": 0.05963134765625, "step": 3149, "total_loss": 0.6012807041406631 }, { "epoch": 0.6236388833894279, "grad_norm": 0.9670693874359131, "learning_rate": 5e-05, "llm_loss": 0.5263665467500687, "loss": 2.4491, "loss_aux_layer_0": 0.01715087890625, "loss_aux_layer_1": 0.03399658203125, "loss_aux_layer_10": 0.06103515625, "loss_aux_layer_11": 0.06524658203125, "loss_aux_layer_12": 0.0697021484375, "loss_aux_layer_13": 0.0755615234375, "loss_aux_layer_14": 0.0841064453125, "loss_aux_layer_15": 0.0928955078125, "loss_aux_layer_16": 0.1029052734375, "loss_aux_layer_17": 0.1109619140625, "loss_aux_layer_18": 0.1197509765625, "loss_aux_layer_19": 0.1243896484375, "loss_aux_layer_2": 0.0472412109375, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.0567626953125, "loss_aux_layer_4": 0.05908203125, "loss_aux_layer_5": 0.06048583984375, "loss_aux_layer_6": 0.06329345703125, "loss_aux_layer_7": 0.0611572265625, "loss_aux_layer_8": 0.060791015625, "loss_aux_layer_9": 0.0594482421875, "step": 3150, "total_loss": 0.612276241183281 }, { "epoch": 0.6238368639873293, "grad_norm": 0.9839885830879211, "learning_rate": 5e-05, "llm_loss": 0.577060654759407, "loss": 2.6535, "loss_aux_layer_0": 0.016510009765625, "loss_aux_layer_1": 0.03424072265625, "loss_aux_layer_10": 0.0614013671875, "loss_aux_layer_11": 0.06549072265625, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.0848388671875, "loss_aux_layer_15": 0.0941162109375, "loss_aux_layer_16": 0.1036376953125, "loss_aux_layer_17": 0.1119384765625, "loss_aux_layer_18": 0.120361328125, "loss_aux_layer_19": 0.12451171875, "loss_aux_layer_2": 0.047119140625, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.05743408203125, "loss_aux_layer_4": 0.05999755859375, "loss_aux_layer_5": 0.0614013671875, "loss_aux_layer_6": 0.06451416015625, "loss_aux_layer_7": 0.0623779296875, "loss_aux_layer_8": 0.0614013671875, "loss_aux_layer_9": 0.05999755859375, "step": 3151, "total_loss": 0.6633827835321426 }, { "epoch": 0.6240348445852306, "grad_norm": 0.8671945333480835, "learning_rate": 5e-05, "llm_loss": 0.5896802395582199, "loss": 2.7019, "loss_aux_layer_0": 0.017852783203125, "loss_aux_layer_1": 0.03472900390625, "loss_aux_layer_10": 0.0614013671875, "loss_aux_layer_11": 0.0655517578125, "loss_aux_layer_12": 0.0703125, "loss_aux_layer_13": 0.0762939453125, "loss_aux_layer_14": 0.08544921875, "loss_aux_layer_15": 0.0943603515625, "loss_aux_layer_16": 0.1041259765625, "loss_aux_layer_17": 0.1114501953125, "loss_aux_layer_18": 0.1201171875, "loss_aux_layer_19": 0.123291015625, "loss_aux_layer_2": 0.046630859375, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.05645751953125, "loss_aux_layer_4": 0.05889892578125, "loss_aux_layer_5": 0.06048583984375, "loss_aux_layer_6": 0.06378173828125, "loss_aux_layer_7": 0.06170654296875, "loss_aux_layer_8": 0.06121826171875, "loss_aux_layer_9": 0.0601806640625, "step": 3152, "total_loss": 0.6754792928695679 }, { "epoch": 0.6242328251831321, "grad_norm": 1.0780926942825317, "learning_rate": 5e-05, "llm_loss": 0.5780482143163681, "loss": 2.6804, "loss_aux_layer_0": 0.015625, "loss_aux_layer_1": 0.0374755859375, "loss_aux_layer_10": 0.0682373046875, "loss_aux_layer_11": 0.0732421875, "loss_aux_layer_12": 0.078369140625, "loss_aux_layer_13": 0.084228515625, "loss_aux_layer_14": 0.093017578125, "loss_aux_layer_15": 0.1014404296875, "loss_aux_layer_16": 0.11083984375, "loss_aux_layer_17": 0.1187744140625, "loss_aux_layer_18": 0.1263427734375, "loss_aux_layer_19": 0.12841796875, "loss_aux_layer_2": 0.0516357421875, "loss_aux_layer_20": 0.13623046875, "loss_aux_layer_21": 0.14453125, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.203125, "loss_aux_layer_3": 0.0631103515625, "loss_aux_layer_4": 0.066162109375, "loss_aux_layer_5": 0.067626953125, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.0687255859375, "loss_aux_layer_8": 0.0682373046875, "loss_aux_layer_9": 0.06689453125, "step": 3153, "total_loss": 0.6701009124517441 }, { "epoch": 0.6244308057810335, "grad_norm": 0.8112866282463074, "learning_rate": 5e-05, "llm_loss": 0.6144284009933472, "loss": 2.7999, "loss_aux_layer_0": 0.01617431640625, "loss_aux_layer_1": 0.03448486328125, "loss_aux_layer_10": 0.06182861328125, "loss_aux_layer_11": 0.06585693359375, "loss_aux_layer_12": 0.0706787109375, "loss_aux_layer_13": 0.0765380859375, "loss_aux_layer_14": 0.085205078125, "loss_aux_layer_15": 0.093994140625, "loss_aux_layer_16": 0.1036376953125, "loss_aux_layer_17": 0.111572265625, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.1221923828125, "loss_aux_layer_2": 0.04693603515625, "loss_aux_layer_20": 0.1297607421875, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.056884765625, "loss_aux_layer_4": 0.0596923828125, "loss_aux_layer_5": 0.061279296875, "loss_aux_layer_6": 0.064453125, "loss_aux_layer_7": 0.06207275390625, "loss_aux_layer_8": 0.0615234375, "loss_aux_layer_9": 0.0601806640625, "step": 3154, "total_loss": 0.6999645531177521 }, { "epoch": 0.6246287863789348, "grad_norm": 0.940903902053833, "learning_rate": 5e-05, "llm_loss": 0.6202411502599716, "loss": 2.8214, "loss_aux_layer_0": 0.016326904296875, "loss_aux_layer_1": 0.03472900390625, "loss_aux_layer_10": 0.0616455078125, "loss_aux_layer_11": 0.0655517578125, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.0843505859375, "loss_aux_layer_15": 0.0927734375, "loss_aux_layer_16": 0.1024169921875, "loss_aux_layer_17": 0.1107177734375, "loss_aux_layer_18": 0.11865234375, "loss_aux_layer_19": 0.1212158203125, "loss_aux_layer_2": 0.046875, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.05694580078125, "loss_aux_layer_4": 0.0594482421875, "loss_aux_layer_5": 0.0611572265625, "loss_aux_layer_6": 0.06414794921875, "loss_aux_layer_7": 0.0621337890625, "loss_aux_layer_8": 0.06170654296875, "loss_aux_layer_9": 0.0604248046875, "step": 3155, "total_loss": 0.7053581178188324 }, { "epoch": 0.6248267669768363, "grad_norm": 0.8604971170425415, "learning_rate": 5e-05, "llm_loss": 0.56639464199543, "loss": 2.6194, "loss_aux_layer_0": 0.0170135498046875, "loss_aux_layer_1": 0.03582763671875, "loss_aux_layer_10": 0.0640869140625, "loss_aux_layer_11": 0.0684814453125, "loss_aux_layer_12": 0.0731201171875, "loss_aux_layer_13": 0.0792236328125, "loss_aux_layer_14": 0.087890625, "loss_aux_layer_15": 0.0970458984375, "loss_aux_layer_16": 0.1065673828125, "loss_aux_layer_17": 0.1143798828125, "loss_aux_layer_18": 0.12255859375, "loss_aux_layer_19": 0.126220703125, "loss_aux_layer_2": 0.04925537109375, "loss_aux_layer_20": 0.1337890625, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.05926513671875, "loss_aux_layer_4": 0.06182861328125, "loss_aux_layer_5": 0.0633544921875, "loss_aux_layer_6": 0.0662841796875, "loss_aux_layer_7": 0.064208984375, "loss_aux_layer_8": 0.0638427734375, "loss_aux_layer_9": 0.06280517578125, "step": 3156, "total_loss": 0.6548438668251038 }, { "epoch": 0.6250247475747377, "grad_norm": 0.8742203116416931, "learning_rate": 5e-05, "llm_loss": 0.5161950141191483, "loss": 2.4181, "loss_aux_layer_0": 0.016387939453125, "loss_aux_layer_1": 0.03619384765625, "loss_aux_layer_10": 0.06365966796875, "loss_aux_layer_11": 0.06817626953125, "loss_aux_layer_12": 0.0731201171875, "loss_aux_layer_13": 0.0791015625, "loss_aux_layer_14": 0.0882568359375, "loss_aux_layer_15": 0.09716796875, "loss_aux_layer_16": 0.1065673828125, "loss_aux_layer_17": 0.1143798828125, "loss_aux_layer_18": 0.1226806640625, "loss_aux_layer_19": 0.12548828125, "loss_aux_layer_2": 0.04962158203125, "loss_aux_layer_20": 0.13330078125, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.200927734375, "loss_aux_layer_3": 0.0595703125, "loss_aux_layer_4": 0.06170654296875, "loss_aux_layer_5": 0.06341552734375, "loss_aux_layer_6": 0.06640625, "loss_aux_layer_7": 0.06396484375, "loss_aux_layer_8": 0.0634765625, "loss_aux_layer_9": 0.0621337890625, "step": 3157, "total_loss": 0.604529894888401 }, { "epoch": 0.6252227281726391, "grad_norm": 0.8082841038703918, "learning_rate": 5e-05, "llm_loss": 0.5634667575359344, "loss": 2.5996, "loss_aux_layer_0": 0.016265869140625, "loss_aux_layer_1": 0.03521728515625, "loss_aux_layer_10": 0.06402587890625, "loss_aux_layer_11": 0.068115234375, "loss_aux_layer_12": 0.072509765625, "loss_aux_layer_13": 0.0782470703125, "loss_aux_layer_14": 0.08642578125, "loss_aux_layer_15": 0.094482421875, "loss_aux_layer_16": 0.1036376953125, "loss_aux_layer_17": 0.1114501953125, "loss_aux_layer_18": 0.11962890625, "loss_aux_layer_19": 0.1224365234375, "loss_aux_layer_2": 0.048095703125, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.0587158203125, "loss_aux_layer_4": 0.0615234375, "loss_aux_layer_5": 0.0631103515625, "loss_aux_layer_6": 0.06634521484375, "loss_aux_layer_7": 0.064208984375, "loss_aux_layer_8": 0.063720703125, "loss_aux_layer_9": 0.0625, "step": 3158, "total_loss": 0.6498923003673553 }, { "epoch": 0.6254207087705405, "grad_norm": 0.8800146579742432, "learning_rate": 5e-05, "llm_loss": 0.5514352023601532, "loss": 2.5523, "loss_aux_layer_0": 0.0159149169921875, "loss_aux_layer_1": 0.03497314453125, "loss_aux_layer_10": 0.0638427734375, "loss_aux_layer_11": 0.06793212890625, "loss_aux_layer_12": 0.072265625, "loss_aux_layer_13": 0.07763671875, "loss_aux_layer_14": 0.0863037109375, "loss_aux_layer_15": 0.09423828125, "loss_aux_layer_16": 0.10302734375, "loss_aux_layer_17": 0.1107177734375, "loss_aux_layer_18": 0.1187744140625, "loss_aux_layer_19": 0.121826171875, "loss_aux_layer_2": 0.0479736328125, "loss_aux_layer_20": 0.129638671875, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.197265625, "loss_aux_layer_3": 0.05853271484375, "loss_aux_layer_4": 0.061279296875, "loss_aux_layer_5": 0.0631103515625, "loss_aux_layer_6": 0.06610107421875, "loss_aux_layer_7": 0.0643310546875, "loss_aux_layer_8": 0.06365966796875, "loss_aux_layer_9": 0.06243896484375, "step": 3159, "total_loss": 0.6380874067544937 }, { "epoch": 0.6256186893684419, "grad_norm": 0.9564681053161621, "learning_rate": 5e-05, "llm_loss": 0.5456402897834778, "loss": 2.5191, "loss_aux_layer_0": 0.016204833984375, "loss_aux_layer_1": 0.03436279296875, "loss_aux_layer_10": 0.05975341796875, "loss_aux_layer_11": 0.06365966796875, "loss_aux_layer_12": 0.0682373046875, "loss_aux_layer_13": 0.07373046875, "loss_aux_layer_14": 0.082275390625, "loss_aux_layer_15": 0.0906982421875, "loss_aux_layer_16": 0.1002197265625, "loss_aux_layer_17": 0.1080322265625, "loss_aux_layer_18": 0.11669921875, "loss_aux_layer_19": 0.1202392578125, "loss_aux_layer_2": 0.04693603515625, "loss_aux_layer_20": 0.128662109375, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.197021484375, "loss_aux_layer_3": 0.05621337890625, "loss_aux_layer_4": 0.05841064453125, "loss_aux_layer_5": 0.0599365234375, "loss_aux_layer_6": 0.0626220703125, "loss_aux_layer_7": 0.06048583984375, "loss_aux_layer_8": 0.05987548828125, "loss_aux_layer_9": 0.058837890625, "step": 3160, "total_loss": 0.629784420132637 }, { "epoch": 0.6258166699663433, "grad_norm": 0.8363344669342041, "learning_rate": 5e-05, "llm_loss": 0.548556312918663, "loss": 2.5518, "loss_aux_layer_0": 0.016845703125, "loss_aux_layer_1": 0.0350341796875, "loss_aux_layer_10": 0.06500244140625, "loss_aux_layer_11": 0.06915283203125, "loss_aux_layer_12": 0.0740966796875, "loss_aux_layer_13": 0.079833984375, "loss_aux_layer_14": 0.0888671875, "loss_aux_layer_15": 0.0977783203125, "loss_aux_layer_16": 0.1077880859375, "loss_aux_layer_17": 0.1158447265625, "loss_aux_layer_18": 0.12451171875, "loss_aux_layer_19": 0.128173828125, "loss_aux_layer_2": 0.04876708984375, "loss_aux_layer_20": 0.135986328125, "loss_aux_layer_21": 0.143798828125, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.05908203125, "loss_aux_layer_4": 0.0618896484375, "loss_aux_layer_5": 0.06365966796875, "loss_aux_layer_6": 0.0667724609375, "loss_aux_layer_7": 0.0648193359375, "loss_aux_layer_8": 0.064697265625, "loss_aux_layer_9": 0.0633544921875, "step": 3161, "total_loss": 0.6379453837871552 }, { "epoch": 0.6260146505642447, "grad_norm": 1.0560864210128784, "learning_rate": 5e-05, "llm_loss": 0.6181824207305908, "loss": 2.8175, "loss_aux_layer_0": 0.016082763671875, "loss_aux_layer_1": 0.03424072265625, "loss_aux_layer_10": 0.06256103515625, "loss_aux_layer_11": 0.06640625, "loss_aux_layer_12": 0.07080078125, "loss_aux_layer_13": 0.076171875, "loss_aux_layer_14": 0.0849609375, "loss_aux_layer_15": 0.0933837890625, "loss_aux_layer_16": 0.1026611328125, "loss_aux_layer_17": 0.1103515625, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.123291015625, "loss_aux_layer_2": 0.04766845703125, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.198974609375, "loss_aux_layer_3": 0.05743408203125, "loss_aux_layer_4": 0.05999755859375, "loss_aux_layer_5": 0.06195068359375, "loss_aux_layer_6": 0.06512451171875, "loss_aux_layer_7": 0.0631103515625, "loss_aux_layer_8": 0.0625, "loss_aux_layer_9": 0.0611572265625, "step": 3162, "total_loss": 0.7043848633766174 }, { "epoch": 0.6262126311621461, "grad_norm": 0.8897595405578613, "learning_rate": 5e-05, "llm_loss": 0.5462114363908768, "loss": 2.542, "loss_aux_layer_0": 0.01690673828125, "loss_aux_layer_1": 0.03692626953125, "loss_aux_layer_10": 0.0655517578125, "loss_aux_layer_11": 0.06982421875, "loss_aux_layer_12": 0.0743408203125, "loss_aux_layer_13": 0.0797119140625, "loss_aux_layer_14": 0.0887451171875, "loss_aux_layer_15": 0.0965576171875, "loss_aux_layer_16": 0.1063232421875, "loss_aux_layer_17": 0.1141357421875, "loss_aux_layer_18": 0.122314453125, "loss_aux_layer_19": 0.12548828125, "loss_aux_layer_2": 0.0504150390625, "loss_aux_layer_20": 0.1339111328125, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.163330078125, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.06085205078125, "loss_aux_layer_4": 0.063720703125, "loss_aux_layer_5": 0.0655517578125, "loss_aux_layer_6": 0.06884765625, "loss_aux_layer_7": 0.06671142578125, "loss_aux_layer_8": 0.06561279296875, "loss_aux_layer_9": 0.064453125, "step": 3163, "total_loss": 0.6354962736368179 }, { "epoch": 0.6264106117600475, "grad_norm": 0.8983618021011353, "learning_rate": 5e-05, "llm_loss": 0.5338451713323593, "loss": 2.4843, "loss_aux_layer_0": 0.015960693359375, "loss_aux_layer_1": 0.03570556640625, "loss_aux_layer_10": 0.06329345703125, "loss_aux_layer_11": 0.067626953125, "loss_aux_layer_12": 0.0723876953125, "loss_aux_layer_13": 0.0780029296875, "loss_aux_layer_14": 0.0867919921875, "loss_aux_layer_15": 0.095458984375, "loss_aux_layer_16": 0.1046142578125, "loss_aux_layer_17": 0.1123046875, "loss_aux_layer_18": 0.1197509765625, "loss_aux_layer_19": 0.1229248046875, "loss_aux_layer_2": 0.04876708984375, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.05889892578125, "loss_aux_layer_4": 0.061767578125, "loss_aux_layer_5": 0.0633544921875, "loss_aux_layer_6": 0.0662841796875, "loss_aux_layer_7": 0.06390380859375, "loss_aux_layer_8": 0.06353759765625, "loss_aux_layer_9": 0.06207275390625, "step": 3164, "total_loss": 0.6210755258798599 }, { "epoch": 0.626608592357949, "grad_norm": 0.798056423664093, "learning_rate": 5e-05, "llm_loss": 0.6328385323286057, "loss": 2.884, "loss_aux_layer_0": 0.0154571533203125, "loss_aux_layer_1": 0.0362548828125, "loss_aux_layer_10": 0.0648193359375, "loss_aux_layer_11": 0.0689697265625, "loss_aux_layer_12": 0.0738525390625, "loss_aux_layer_13": 0.07958984375, "loss_aux_layer_14": 0.08837890625, "loss_aux_layer_15": 0.096923828125, "loss_aux_layer_16": 0.1064453125, "loss_aux_layer_17": 0.1143798828125, "loss_aux_layer_18": 0.121826171875, "loss_aux_layer_19": 0.1241455078125, "loss_aux_layer_2": 0.04986572265625, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.194580078125, "loss_aux_layer_3": 0.0604248046875, "loss_aux_layer_4": 0.06329345703125, "loss_aux_layer_5": 0.06512451171875, "loss_aux_layer_6": 0.0682373046875, "loss_aux_layer_7": 0.06591796875, "loss_aux_layer_8": 0.06500244140625, "loss_aux_layer_9": 0.06353759765625, "step": 3165, "total_loss": 0.7210055291652679 }, { "epoch": 0.6268065729558503, "grad_norm": 1.2240365743637085, "learning_rate": 5e-05, "llm_loss": 0.5814740359783173, "loss": 2.6713, "loss_aux_layer_0": 0.016357421875, "loss_aux_layer_1": 0.0343017578125, "loss_aux_layer_10": 0.06182861328125, "loss_aux_layer_11": 0.06610107421875, "loss_aux_layer_12": 0.0706787109375, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.0843505859375, "loss_aux_layer_15": 0.09326171875, "loss_aux_layer_16": 0.10302734375, "loss_aux_layer_17": 0.1114501953125, "loss_aux_layer_18": 0.1192626953125, "loss_aux_layer_19": 0.123779296875, "loss_aux_layer_2": 0.0477294921875, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.20068359375, "loss_aux_layer_3": 0.05743408203125, "loss_aux_layer_4": 0.05987548828125, "loss_aux_layer_5": 0.0616455078125, "loss_aux_layer_6": 0.0645751953125, "loss_aux_layer_7": 0.06243896484375, "loss_aux_layer_8": 0.06195068359375, "loss_aux_layer_9": 0.06060791015625, "step": 3166, "total_loss": 0.6678345650434494 }, { "epoch": 0.6270045535537517, "grad_norm": 1.348314881324768, "learning_rate": 5e-05, "llm_loss": 0.6235923618078232, "loss": 2.8482, "loss_aux_layer_0": 0.01544189453125, "loss_aux_layer_1": 0.0364990234375, "loss_aux_layer_10": 0.06561279296875, "loss_aux_layer_11": 0.06982421875, "loss_aux_layer_12": 0.07421875, "loss_aux_layer_13": 0.079833984375, "loss_aux_layer_14": 0.0885009765625, "loss_aux_layer_15": 0.0970458984375, "loss_aux_layer_16": 0.1063232421875, "loss_aux_layer_17": 0.11376953125, "loss_aux_layer_18": 0.121826171875, "loss_aux_layer_19": 0.1243896484375, "loss_aux_layer_2": 0.05059814453125, "loss_aux_layer_20": 0.1314697265625, "loss_aux_layer_21": 0.138916015625, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.0611572265625, "loss_aux_layer_4": 0.06390380859375, "loss_aux_layer_5": 0.0654296875, "loss_aux_layer_6": 0.068359375, "loss_aux_layer_7": 0.06610107421875, "loss_aux_layer_8": 0.06561279296875, "loss_aux_layer_9": 0.064208984375, "step": 3167, "total_loss": 0.7120574563741684 }, { "epoch": 0.6272025341516532, "grad_norm": 0.9708284735679626, "learning_rate": 5e-05, "llm_loss": 0.5480433478951454, "loss": 2.5253, "loss_aux_layer_0": 0.0162353515625, "loss_aux_layer_1": 0.033447265625, "loss_aux_layer_10": 0.0594482421875, "loss_aux_layer_11": 0.06341552734375, "loss_aux_layer_12": 0.0679931640625, "loss_aux_layer_13": 0.0736083984375, "loss_aux_layer_14": 0.081787109375, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.099365234375, "loss_aux_layer_17": 0.1075439453125, "loss_aux_layer_18": 0.1158447265625, "loss_aux_layer_19": 0.1201171875, "loss_aux_layer_2": 0.0455322265625, "loss_aux_layer_20": 0.1279296875, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.19482421875, "loss_aux_layer_3": 0.0548095703125, "loss_aux_layer_4": 0.0570068359375, "loss_aux_layer_5": 0.05859375, "loss_aux_layer_6": 0.0611572265625, "loss_aux_layer_7": 0.059326171875, "loss_aux_layer_8": 0.05902099609375, "loss_aux_layer_9": 0.05828857421875, "step": 3168, "total_loss": 0.6313300430774689 }, { "epoch": 0.6274005147495545, "grad_norm": 1.4886974096298218, "learning_rate": 5e-05, "llm_loss": 0.5381295382976532, "loss": 2.4959, "loss_aux_layer_0": 0.01702880859375, "loss_aux_layer_1": 0.03466796875, "loss_aux_layer_10": 0.06170654296875, "loss_aux_layer_11": 0.0657958984375, "loss_aux_layer_12": 0.07049560546875, "loss_aux_layer_13": 0.0755615234375, "loss_aux_layer_14": 0.0843505859375, "loss_aux_layer_15": 0.0933837890625, "loss_aux_layer_16": 0.1029052734375, "loss_aux_layer_17": 0.11083984375, "loss_aux_layer_18": 0.1187744140625, "loss_aux_layer_19": 0.1226806640625, "loss_aux_layer_2": 0.0477294921875, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.05755615234375, "loss_aux_layer_4": 0.05975341796875, "loss_aux_layer_5": 0.061279296875, "loss_aux_layer_6": 0.06396484375, "loss_aux_layer_7": 0.06182861328125, "loss_aux_layer_8": 0.0615234375, "loss_aux_layer_9": 0.0601806640625, "step": 3169, "total_loss": 0.6239711791276932 }, { "epoch": 0.6275984953474559, "grad_norm": 1.1771337985992432, "learning_rate": 5e-05, "llm_loss": 0.6643873006105423, "loss": 2.9838, "loss_aux_layer_0": 0.0167236328125, "loss_aux_layer_1": 0.0328369140625, "loss_aux_layer_10": 0.0584716796875, "loss_aux_layer_11": 0.0623779296875, "loss_aux_layer_12": 0.06689453125, "loss_aux_layer_13": 0.072265625, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.0892333984375, "loss_aux_layer_16": 0.0986328125, "loss_aux_layer_17": 0.1064453125, "loss_aux_layer_18": 0.1142578125, "loss_aux_layer_19": 0.1173095703125, "loss_aux_layer_2": 0.0450439453125, "loss_aux_layer_20": 0.1248779296875, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.0540771484375, "loss_aux_layer_4": 0.05657958984375, "loss_aux_layer_5": 0.0579833984375, "loss_aux_layer_6": 0.0606689453125, "loss_aux_layer_7": 0.05865478515625, "loss_aux_layer_8": 0.05810546875, "loss_aux_layer_9": 0.05694580078125, "step": 3170, "total_loss": 0.7459585666656494 }, { "epoch": 0.6277964759453574, "grad_norm": 1.3229237794876099, "learning_rate": 5e-05, "llm_loss": 0.5719450116157532, "loss": 2.6382, "loss_aux_layer_0": 0.017242431640625, "loss_aux_layer_1": 0.03619384765625, "loss_aux_layer_10": 0.06378173828125, "loss_aux_layer_11": 0.067626953125, "loss_aux_layer_12": 0.0716552734375, "loss_aux_layer_13": 0.0772705078125, "loss_aux_layer_14": 0.0860595703125, "loss_aux_layer_15": 0.0947265625, "loss_aux_layer_16": 0.1043701171875, "loss_aux_layer_17": 0.1123046875, "loss_aux_layer_18": 0.1212158203125, "loss_aux_layer_19": 0.124755859375, "loss_aux_layer_2": 0.04949951171875, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.141357421875, "loss_aux_layer_22": 0.162353515625, "loss_aux_layer_23": 0.20068359375, "loss_aux_layer_3": 0.0592041015625, "loss_aux_layer_4": 0.06134033203125, "loss_aux_layer_5": 0.0631103515625, "loss_aux_layer_6": 0.06597900390625, "loss_aux_layer_7": 0.063720703125, "loss_aux_layer_8": 0.06317138671875, "loss_aux_layer_9": 0.062255859375, "step": 3171, "total_loss": 0.6595548689365387 }, { "epoch": 0.6279944565432588, "grad_norm": 1.0384891033172607, "learning_rate": 5e-05, "llm_loss": 0.5455024689435959, "loss": 2.5087, "loss_aux_layer_0": 0.01605224609375, "loss_aux_layer_1": 0.033843994140625, "loss_aux_layer_10": 0.05865478515625, "loss_aux_layer_11": 0.06219482421875, "loss_aux_layer_12": 0.06689453125, "loss_aux_layer_13": 0.072265625, "loss_aux_layer_14": 0.08056640625, "loss_aux_layer_15": 0.0887451171875, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.10595703125, "loss_aux_layer_18": 0.1141357421875, "loss_aux_layer_19": 0.1165771484375, "loss_aux_layer_2": 0.046142578125, "loss_aux_layer_20": 0.124267578125, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.0550537109375, "loss_aux_layer_4": 0.0576171875, "loss_aux_layer_5": 0.05914306640625, "loss_aux_layer_6": 0.06170654296875, "loss_aux_layer_7": 0.05950927734375, "loss_aux_layer_8": 0.05889892578125, "loss_aux_layer_9": 0.05743408203125, "step": 3172, "total_loss": 0.6271775960922241 }, { "epoch": 0.6281924371411601, "grad_norm": 1.0905189514160156, "learning_rate": 5e-05, "llm_loss": 0.5310350805521011, "loss": 2.473, "loss_aux_layer_0": 0.016448974609375, "loss_aux_layer_1": 0.03631591796875, "loss_aux_layer_10": 0.06378173828125, "loss_aux_layer_11": 0.06787109375, "loss_aux_layer_12": 0.072509765625, "loss_aux_layer_13": 0.078125, "loss_aux_layer_14": 0.0865478515625, "loss_aux_layer_15": 0.09521484375, "loss_aux_layer_16": 0.104248046875, "loss_aux_layer_17": 0.111572265625, "loss_aux_layer_18": 0.11962890625, "loss_aux_layer_19": 0.12255859375, "loss_aux_layer_2": 0.050048828125, "loss_aux_layer_20": 0.130126953125, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.196044921875, "loss_aux_layer_3": 0.05975341796875, "loss_aux_layer_4": 0.0625, "loss_aux_layer_5": 0.0640869140625, "loss_aux_layer_6": 0.0667724609375, "loss_aux_layer_7": 0.064453125, "loss_aux_layer_8": 0.0638427734375, "loss_aux_layer_9": 0.06256103515625, "step": 3173, "total_loss": 0.6182394474744797 }, { "epoch": 0.6283904177390616, "grad_norm": 0.8821781873703003, "learning_rate": 5e-05, "llm_loss": 0.5641409009695053, "loss": 2.6049, "loss_aux_layer_0": 0.01605224609375, "loss_aux_layer_1": 0.03497314453125, "loss_aux_layer_10": 0.06256103515625, "loss_aux_layer_11": 0.06689453125, "loss_aux_layer_12": 0.071533203125, "loss_aux_layer_13": 0.077392578125, "loss_aux_layer_14": 0.0859375, "loss_aux_layer_15": 0.0948486328125, "loss_aux_layer_16": 0.104248046875, "loss_aux_layer_17": 0.1119384765625, "loss_aux_layer_18": 0.12109375, "loss_aux_layer_19": 0.1241455078125, "loss_aux_layer_2": 0.04840087890625, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.19921875, "loss_aux_layer_3": 0.058349609375, "loss_aux_layer_4": 0.0609130859375, "loss_aux_layer_5": 0.06256103515625, "loss_aux_layer_6": 0.06561279296875, "loss_aux_layer_7": 0.063232421875, "loss_aux_layer_8": 0.06268310546875, "loss_aux_layer_9": 0.0615234375, "step": 3174, "total_loss": 0.6512160450220108 }, { "epoch": 0.628588398336963, "grad_norm": 1.2622642517089844, "learning_rate": 5e-05, "llm_loss": 0.572942778468132, "loss": 2.6387, "loss_aux_layer_0": 0.0162353515625, "loss_aux_layer_1": 0.034912109375, "loss_aux_layer_10": 0.06219482421875, "loss_aux_layer_11": 0.06671142578125, "loss_aux_layer_12": 0.0716552734375, "loss_aux_layer_13": 0.0775146484375, "loss_aux_layer_14": 0.0860595703125, "loss_aux_layer_15": 0.0950927734375, "loss_aux_layer_16": 0.1043701171875, "loss_aux_layer_17": 0.112060546875, "loss_aux_layer_18": 0.120361328125, "loss_aux_layer_19": 0.1239013671875, "loss_aux_layer_2": 0.04840087890625, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.160888671875, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.05816650390625, "loss_aux_layer_4": 0.06060791015625, "loss_aux_layer_5": 0.06219482421875, "loss_aux_layer_6": 0.0650634765625, "loss_aux_layer_7": 0.06280517578125, "loss_aux_layer_8": 0.0621337890625, "loss_aux_layer_9": 0.06085205078125, "step": 3175, "total_loss": 0.659678652882576 }, { "epoch": 0.6287863789348643, "grad_norm": 0.9016545414924622, "learning_rate": 5e-05, "llm_loss": 0.6306658685207367, "loss": 2.8696, "loss_aux_layer_0": 0.0158843994140625, "loss_aux_layer_1": 0.03594970703125, "loss_aux_layer_10": 0.06298828125, "loss_aux_layer_11": 0.0670166015625, "loss_aux_layer_12": 0.0714111328125, "loss_aux_layer_13": 0.076904296875, "loss_aux_layer_14": 0.085693359375, "loss_aux_layer_15": 0.0938720703125, "loss_aux_layer_16": 0.1029052734375, "loss_aux_layer_17": 0.110595703125, "loss_aux_layer_18": 0.118896484375, "loss_aux_layer_19": 0.121826171875, "loss_aux_layer_2": 0.0498046875, "loss_aux_layer_20": 0.130126953125, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.05926513671875, "loss_aux_layer_4": 0.061767578125, "loss_aux_layer_5": 0.063232421875, "loss_aux_layer_6": 0.0662841796875, "loss_aux_layer_7": 0.06396484375, "loss_aux_layer_8": 0.0631103515625, "loss_aux_layer_9": 0.06158447265625, "step": 3176, "total_loss": 0.7174105793237686 }, { "epoch": 0.6289843595327658, "grad_norm": 0.874445915222168, "learning_rate": 5e-05, "llm_loss": 0.5974317044019699, "loss": 2.7286, "loss_aux_layer_0": 0.0153961181640625, "loss_aux_layer_1": 0.0343017578125, "loss_aux_layer_10": 0.06146240234375, "loss_aux_layer_11": 0.0655517578125, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.0758056640625, "loss_aux_layer_14": 0.0843505859375, "loss_aux_layer_15": 0.0926513671875, "loss_aux_layer_16": 0.101806640625, "loss_aux_layer_17": 0.1099853515625, "loss_aux_layer_18": 0.117919921875, "loss_aux_layer_19": 0.1209716796875, "loss_aux_layer_2": 0.047119140625, "loss_aux_layer_20": 0.1285400390625, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.05682373046875, "loss_aux_layer_4": 0.05963134765625, "loss_aux_layer_5": 0.0609130859375, "loss_aux_layer_6": 0.063720703125, "loss_aux_layer_7": 0.0615234375, "loss_aux_layer_8": 0.06097412109375, "loss_aux_layer_9": 0.05999755859375, "step": 3177, "total_loss": 0.6821454614400864 }, { "epoch": 0.6291823401306672, "grad_norm": 0.9682436585426331, "learning_rate": 5e-05, "llm_loss": 0.5484947487711906, "loss": 2.5486, "loss_aux_layer_0": 0.0166015625, "loss_aux_layer_1": 0.03509521484375, "loss_aux_layer_10": 0.06365966796875, "loss_aux_layer_11": 0.06787109375, "loss_aux_layer_12": 0.0728759765625, "loss_aux_layer_13": 0.078857421875, "loss_aux_layer_14": 0.087890625, "loss_aux_layer_15": 0.0965576171875, "loss_aux_layer_16": 0.1064453125, "loss_aux_layer_17": 0.1148681640625, "loss_aux_layer_18": 0.12353515625, "loss_aux_layer_19": 0.127197265625, "loss_aux_layer_2": 0.0484619140625, "loss_aux_layer_20": 0.1357421875, "loss_aux_layer_21": 0.144287109375, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.05853271484375, "loss_aux_layer_4": 0.06109619140625, "loss_aux_layer_5": 0.0626220703125, "loss_aux_layer_6": 0.06561279296875, "loss_aux_layer_7": 0.06365966796875, "loss_aux_layer_8": 0.0633544921875, "loss_aux_layer_9": 0.0623779296875, "step": 3178, "total_loss": 0.637155070900917 }, { "epoch": 0.6293803207285686, "grad_norm": 0.8420716524124146, "learning_rate": 5e-05, "llm_loss": 0.5882116183638573, "loss": 2.6958, "loss_aux_layer_0": 0.0160064697265625, "loss_aux_layer_1": 0.0355224609375, "loss_aux_layer_10": 0.062744140625, "loss_aux_layer_11": 0.066650390625, "loss_aux_layer_12": 0.071044921875, "loss_aux_layer_13": 0.0765380859375, "loss_aux_layer_14": 0.0850830078125, "loss_aux_layer_15": 0.0936279296875, "loss_aux_layer_16": 0.10302734375, "loss_aux_layer_17": 0.111083984375, "loss_aux_layer_18": 0.1192626953125, "loss_aux_layer_19": 0.1220703125, "loss_aux_layer_2": 0.04833984375, "loss_aux_layer_20": 0.1297607421875, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.0584716796875, "loss_aux_layer_4": 0.06085205078125, "loss_aux_layer_5": 0.062255859375, "loss_aux_layer_6": 0.06524658203125, "loss_aux_layer_7": 0.06304931640625, "loss_aux_layer_8": 0.0625, "loss_aux_layer_9": 0.061279296875, "step": 3179, "total_loss": 0.6739397943019867 }, { "epoch": 0.62957830132647, "grad_norm": 0.9351300597190857, "learning_rate": 5e-05, "llm_loss": 0.6071975529193878, "loss": 2.7732, "loss_aux_layer_0": 0.0157470703125, "loss_aux_layer_1": 0.03533935546875, "loss_aux_layer_10": 0.063232421875, "loss_aux_layer_11": 0.067138671875, "loss_aux_layer_12": 0.0718994140625, "loss_aux_layer_13": 0.077392578125, "loss_aux_layer_14": 0.0859375, "loss_aux_layer_15": 0.0946044921875, "loss_aux_layer_16": 0.1038818359375, "loss_aux_layer_17": 0.111083984375, "loss_aux_layer_18": 0.1192626953125, "loss_aux_layer_19": 0.1220703125, "loss_aux_layer_2": 0.04852294921875, "loss_aux_layer_20": 0.1295166015625, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.05877685546875, "loss_aux_layer_4": 0.06158447265625, "loss_aux_layer_5": 0.06317138671875, "loss_aux_layer_6": 0.0660400390625, "loss_aux_layer_7": 0.063720703125, "loss_aux_layer_8": 0.0631103515625, "loss_aux_layer_9": 0.0618896484375, "step": 3180, "total_loss": 0.6933080554008484 }, { "epoch": 0.6297762819243714, "grad_norm": 0.8579524755477905, "learning_rate": 5e-05, "llm_loss": 0.5500345528125763, "loss": 2.5299, "loss_aux_layer_0": 0.0167236328125, "loss_aux_layer_1": 0.0321044921875, "loss_aux_layer_10": 0.05810546875, "loss_aux_layer_11": 0.061767578125, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.089599609375, "loss_aux_layer_16": 0.09912109375, "loss_aux_layer_17": 0.1068115234375, "loss_aux_layer_18": 0.1151123046875, "loss_aux_layer_19": 0.1201171875, "loss_aux_layer_2": 0.0443115234375, "loss_aux_layer_20": 0.1282958984375, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.194580078125, "loss_aux_layer_3": 0.05340576171875, "loss_aux_layer_4": 0.05572509765625, "loss_aux_layer_5": 0.0574951171875, "loss_aux_layer_6": 0.0601806640625, "loss_aux_layer_7": 0.05816650390625, "loss_aux_layer_8": 0.0576171875, "loss_aux_layer_9": 0.05670166015625, "step": 3181, "total_loss": 0.6324870958924294 }, { "epoch": 0.6299742625222728, "grad_norm": 0.9259898662567139, "learning_rate": 5e-05, "llm_loss": 0.5652410238981247, "loss": 2.6171, "loss_aux_layer_0": 0.0149993896484375, "loss_aux_layer_1": 0.0357666015625, "loss_aux_layer_10": 0.0655517578125, "loss_aux_layer_11": 0.0699462890625, "loss_aux_layer_12": 0.07470703125, "loss_aux_layer_13": 0.08056640625, "loss_aux_layer_14": 0.0889892578125, "loss_aux_layer_15": 0.0972900390625, "loss_aux_layer_16": 0.106689453125, "loss_aux_layer_17": 0.114013671875, "loss_aux_layer_18": 0.1221923828125, "loss_aux_layer_19": 0.1248779296875, "loss_aux_layer_2": 0.0498046875, "loss_aux_layer_20": 0.132568359375, "loss_aux_layer_21": 0.141357421875, "loss_aux_layer_22": 0.16357421875, "loss_aux_layer_23": 0.20166015625, "loss_aux_layer_3": 0.0606689453125, "loss_aux_layer_4": 0.0633544921875, "loss_aux_layer_5": 0.06512451171875, "loss_aux_layer_6": 0.068115234375, "loss_aux_layer_7": 0.06591796875, "loss_aux_layer_8": 0.06524658203125, "loss_aux_layer_9": 0.0638427734375, "step": 3182, "total_loss": 0.6542700380086899 }, { "epoch": 0.6301722431201742, "grad_norm": 1.0867177248001099, "learning_rate": 5e-05, "llm_loss": 0.6193433403968811, "loss": 2.7982, "loss_aux_layer_0": 0.0156097412109375, "loss_aux_layer_1": 0.0325927734375, "loss_aux_layer_10": 0.05767822265625, "loss_aux_layer_11": 0.061279296875, "loss_aux_layer_12": 0.0653076171875, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.086669921875, "loss_aux_layer_16": 0.095703125, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.1123046875, "loss_aux_layer_19": 0.115966796875, "loss_aux_layer_2": 0.044189453125, "loss_aux_layer_20": 0.1239013671875, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.18212890625, "loss_aux_layer_3": 0.05322265625, "loss_aux_layer_4": 0.0556640625, "loss_aux_layer_5": 0.057373046875, "loss_aux_layer_6": 0.06005859375, "loss_aux_layer_7": 0.05816650390625, "loss_aux_layer_8": 0.0579833984375, "loss_aux_layer_9": 0.05645751953125, "step": 3183, "total_loss": 0.6995428949594498 }, { "epoch": 0.6303702237180756, "grad_norm": 0.9445160031318665, "learning_rate": 5e-05, "llm_loss": 0.5524494647979736, "loss": 2.5564, "loss_aux_layer_0": 0.016143798828125, "loss_aux_layer_1": 0.03564453125, "loss_aux_layer_10": 0.0626220703125, "loss_aux_layer_11": 0.0667724609375, "loss_aux_layer_12": 0.0714111328125, "loss_aux_layer_13": 0.0771484375, "loss_aux_layer_14": 0.0858154296875, "loss_aux_layer_15": 0.09423828125, "loss_aux_layer_16": 0.1038818359375, "loss_aux_layer_17": 0.1112060546875, "loss_aux_layer_18": 0.11962890625, "loss_aux_layer_19": 0.1234130859375, "loss_aux_layer_2": 0.04827880859375, "loss_aux_layer_20": 0.131591796875, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.057861328125, "loss_aux_layer_4": 0.06048583984375, "loss_aux_layer_5": 0.06201171875, "loss_aux_layer_6": 0.06475830078125, "loss_aux_layer_7": 0.0628662109375, "loss_aux_layer_8": 0.06243896484375, "loss_aux_layer_9": 0.06121826171875, "step": 3184, "total_loss": 0.6391087248921394 }, { "epoch": 0.630568204315977, "grad_norm": 0.9091463088989258, "learning_rate": 5e-05, "llm_loss": 0.5758705958724022, "loss": 2.6485, "loss_aux_layer_0": 0.016448974609375, "loss_aux_layer_1": 0.0341796875, "loss_aux_layer_10": 0.06201171875, "loss_aux_layer_11": 0.0660400390625, "loss_aux_layer_12": 0.0706787109375, "loss_aux_layer_13": 0.076416015625, "loss_aux_layer_14": 0.085693359375, "loss_aux_layer_15": 0.09423828125, "loss_aux_layer_16": 0.103759765625, "loss_aux_layer_17": 0.111328125, "loss_aux_layer_18": 0.11962890625, "loss_aux_layer_19": 0.1231689453125, "loss_aux_layer_2": 0.0472412109375, "loss_aux_layer_20": 0.131591796875, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.19921875, "loss_aux_layer_3": 0.05712890625, "loss_aux_layer_4": 0.0596923828125, "loss_aux_layer_5": 0.0614013671875, "loss_aux_layer_6": 0.0645751953125, "loss_aux_layer_7": 0.06219482421875, "loss_aux_layer_8": 0.0616455078125, "loss_aux_layer_9": 0.06072998046875, "step": 3185, "total_loss": 0.6621328145265579 }, { "epoch": 0.6307661849138785, "grad_norm": 1.1169928312301636, "learning_rate": 5e-05, "llm_loss": 0.5598304718732834, "loss": 2.5768, "loss_aux_layer_0": 0.016021728515625, "loss_aux_layer_1": 0.03411865234375, "loss_aux_layer_10": 0.06036376953125, "loss_aux_layer_11": 0.0643310546875, "loss_aux_layer_12": 0.0687255859375, "loss_aux_layer_13": 0.0743408203125, "loss_aux_layer_14": 0.0830078125, "loss_aux_layer_15": 0.091552734375, "loss_aux_layer_16": 0.100830078125, "loss_aux_layer_17": 0.1087646484375, "loss_aux_layer_18": 0.1173095703125, "loss_aux_layer_19": 0.120849609375, "loss_aux_layer_2": 0.046630859375, "loss_aux_layer_20": 0.129150390625, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.05584716796875, "loss_aux_layer_4": 0.0582275390625, "loss_aux_layer_5": 0.05987548828125, "loss_aux_layer_6": 0.06292724609375, "loss_aux_layer_7": 0.0606689453125, "loss_aux_layer_8": 0.05999755859375, "loss_aux_layer_9": 0.05902099609375, "step": 3186, "total_loss": 0.6442067176103592 }, { "epoch": 0.6309641655117798, "grad_norm": 1.2220817804336548, "learning_rate": 5e-05, "llm_loss": 0.5336701571941376, "loss": 2.4864, "loss_aux_layer_0": 0.0161895751953125, "loss_aux_layer_1": 0.03497314453125, "loss_aux_layer_10": 0.06298828125, "loss_aux_layer_11": 0.06707763671875, "loss_aux_layer_12": 0.0716552734375, "loss_aux_layer_13": 0.0775146484375, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.095947265625, "loss_aux_layer_16": 0.1058349609375, "loss_aux_layer_17": 0.1134033203125, "loss_aux_layer_18": 0.122314453125, "loss_aux_layer_19": 0.1263427734375, "loss_aux_layer_2": 0.04779052734375, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.14306640625, "loss_aux_layer_22": 0.16455078125, "loss_aux_layer_23": 0.203125, "loss_aux_layer_3": 0.0579833984375, "loss_aux_layer_4": 0.060791015625, "loss_aux_layer_5": 0.06280517578125, "loss_aux_layer_6": 0.06573486328125, "loss_aux_layer_7": 0.063720703125, "loss_aux_layer_8": 0.06304931640625, "loss_aux_layer_9": 0.06170654296875, "step": 3187, "total_loss": 0.6216096729040146 }, { "epoch": 0.6311621461096812, "grad_norm": 1.1348549127578735, "learning_rate": 5e-05, "llm_loss": 0.6535456776618958, "loss": 2.9803, "loss_aux_layer_0": 0.01629638671875, "loss_aux_layer_1": 0.0377197265625, "loss_aux_layer_10": 0.0672607421875, "loss_aux_layer_11": 0.071533203125, "loss_aux_layer_12": 0.0762939453125, "loss_aux_layer_13": 0.0823974609375, "loss_aux_layer_14": 0.09130859375, "loss_aux_layer_15": 0.099853515625, "loss_aux_layer_16": 0.10986328125, "loss_aux_layer_17": 0.1171875, "loss_aux_layer_18": 0.125244140625, "loss_aux_layer_19": 0.128662109375, "loss_aux_layer_2": 0.05230712890625, "loss_aux_layer_20": 0.13671875, "loss_aux_layer_21": 0.1455078125, "loss_aux_layer_22": 0.168212890625, "loss_aux_layer_23": 0.205810546875, "loss_aux_layer_3": 0.06256103515625, "loss_aux_layer_4": 0.065185546875, "loss_aux_layer_5": 0.066650390625, "loss_aux_layer_6": 0.069580078125, "loss_aux_layer_7": 0.0673828125, "loss_aux_layer_8": 0.0670166015625, "loss_aux_layer_9": 0.0660400390625, "step": 3188, "total_loss": 0.7450659275054932 }, { "epoch": 0.6313601267075827, "grad_norm": 0.9966735243797302, "learning_rate": 5e-05, "llm_loss": 0.5911605805158615, "loss": 2.71, "loss_aux_layer_0": 0.016326904296875, "loss_aux_layer_1": 0.03546142578125, "loss_aux_layer_10": 0.06231689453125, "loss_aux_layer_11": 0.06640625, "loss_aux_layer_12": 0.0712890625, "loss_aux_layer_13": 0.076904296875, "loss_aux_layer_14": 0.0853271484375, "loss_aux_layer_15": 0.09423828125, "loss_aux_layer_16": 0.103515625, "loss_aux_layer_17": 0.1112060546875, "loss_aux_layer_18": 0.1201171875, "loss_aux_layer_19": 0.1239013671875, "loss_aux_layer_2": 0.04815673828125, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.05804443359375, "loss_aux_layer_4": 0.0604248046875, "loss_aux_layer_5": 0.06201171875, "loss_aux_layer_6": 0.064697265625, "loss_aux_layer_7": 0.06256103515625, "loss_aux_layer_8": 0.06195068359375, "loss_aux_layer_9": 0.06085205078125, "step": 3189, "total_loss": 0.6774953007698059 }, { "epoch": 0.6315581073054841, "grad_norm": 1.0411180257797241, "learning_rate": 5e-05, "llm_loss": 0.5662200748920441, "loss": 2.6076, "loss_aux_layer_0": 0.016204833984375, "loss_aux_layer_1": 0.03509521484375, "loss_aux_layer_10": 0.06182861328125, "loss_aux_layer_11": 0.0657958984375, "loss_aux_layer_12": 0.0701904296875, "loss_aux_layer_13": 0.075927734375, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.09326171875, "loss_aux_layer_16": 0.102783203125, "loss_aux_layer_17": 0.1109619140625, "loss_aux_layer_18": 0.119140625, "loss_aux_layer_19": 0.1221923828125, "loss_aux_layer_2": 0.048095703125, "loss_aux_layer_20": 0.129638671875, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.05755615234375, "loss_aux_layer_4": 0.06011962890625, "loss_aux_layer_5": 0.06182861328125, "loss_aux_layer_6": 0.0643310546875, "loss_aux_layer_7": 0.06207275390625, "loss_aux_layer_8": 0.06170654296875, "loss_aux_layer_9": 0.06048583984375, "step": 3190, "total_loss": 0.6518906354904175 }, { "epoch": 0.6317560879033854, "grad_norm": 1.0904505252838135, "learning_rate": 5e-05, "llm_loss": 0.6397997289896011, "loss": 2.9106, "loss_aux_layer_0": 0.016754150390625, "loss_aux_layer_1": 0.03533935546875, "loss_aux_layer_10": 0.06524658203125, "loss_aux_layer_11": 0.069580078125, "loss_aux_layer_12": 0.074462890625, "loss_aux_layer_13": 0.0802001953125, "loss_aux_layer_14": 0.0885009765625, "loss_aux_layer_15": 0.0965576171875, "loss_aux_layer_16": 0.10595703125, "loss_aux_layer_17": 0.1131591796875, "loss_aux_layer_18": 0.1207275390625, "loss_aux_layer_19": 0.1236572265625, "loss_aux_layer_2": 0.04913330078125, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.05963134765625, "loss_aux_layer_4": 0.06231689453125, "loss_aux_layer_5": 0.06390380859375, "loss_aux_layer_6": 0.06695556640625, "loss_aux_layer_7": 0.06512451171875, "loss_aux_layer_8": 0.0645751953125, "loss_aux_layer_9": 0.06365966796875, "step": 3191, "total_loss": 0.7276587337255478 }, { "epoch": 0.6319540685012869, "grad_norm": 0.884175717830658, "learning_rate": 5e-05, "llm_loss": 0.5717195719480515, "loss": 2.6472, "loss_aux_layer_0": 0.0167236328125, "loss_aux_layer_1": 0.03741455078125, "loss_aux_layer_10": 0.06732177734375, "loss_aux_layer_11": 0.071533203125, "loss_aux_layer_12": 0.0765380859375, "loss_aux_layer_13": 0.08203125, "loss_aux_layer_14": 0.0908203125, "loss_aux_layer_15": 0.09912109375, "loss_aux_layer_16": 0.1082763671875, "loss_aux_layer_17": 0.1156005859375, "loss_aux_layer_18": 0.122802734375, "loss_aux_layer_19": 0.1253662109375, "loss_aux_layer_2": 0.05181884765625, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.1396484375, "loss_aux_layer_22": 0.160888671875, "loss_aux_layer_23": 0.197021484375, "loss_aux_layer_3": 0.0626220703125, "loss_aux_layer_4": 0.06536865234375, "loss_aux_layer_5": 0.0670166015625, "loss_aux_layer_6": 0.070068359375, "loss_aux_layer_7": 0.0679931640625, "loss_aux_layer_8": 0.06719970703125, "loss_aux_layer_9": 0.0660400390625, "step": 3192, "total_loss": 0.6618064939975739 }, { "epoch": 0.6321520490991883, "grad_norm": 1.0218966007232666, "learning_rate": 5e-05, "llm_loss": 0.637041836977005, "loss": 2.8832, "loss_aux_layer_0": 0.016693115234375, "loss_aux_layer_1": 0.03472900390625, "loss_aux_layer_10": 0.0606689453125, "loss_aux_layer_11": 0.06463623046875, "loss_aux_layer_12": 0.0694580078125, "loss_aux_layer_13": 0.0748291015625, "loss_aux_layer_14": 0.082763671875, "loss_aux_layer_15": 0.0909423828125, "loss_aux_layer_16": 0.0999755859375, "loss_aux_layer_17": 0.1077880859375, "loss_aux_layer_18": 0.115966796875, "loss_aux_layer_19": 0.119140625, "loss_aux_layer_2": 0.046630859375, "loss_aux_layer_20": 0.1265869140625, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.05670166015625, "loss_aux_layer_4": 0.05902099609375, "loss_aux_layer_5": 0.06024169921875, "loss_aux_layer_6": 0.06304931640625, "loss_aux_layer_7": 0.060791015625, "loss_aux_layer_8": 0.060302734375, "loss_aux_layer_9": 0.0595703125, "step": 3193, "total_loss": 0.7207974791526794 }, { "epoch": 0.6323500296970896, "grad_norm": 1.0995765924453735, "learning_rate": 5e-05, "llm_loss": 0.5776070207357407, "loss": 2.6674, "loss_aux_layer_0": 0.01702880859375, "loss_aux_layer_1": 0.03656005859375, "loss_aux_layer_10": 0.06524658203125, "loss_aux_layer_11": 0.0697021484375, "loss_aux_layer_12": 0.0743408203125, "loss_aux_layer_13": 0.0802001953125, "loss_aux_layer_14": 0.089599609375, "loss_aux_layer_15": 0.098388671875, "loss_aux_layer_16": 0.1080322265625, "loss_aux_layer_17": 0.1153564453125, "loss_aux_layer_18": 0.123779296875, "loss_aux_layer_19": 0.1265869140625, "loss_aux_layer_2": 0.050048828125, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.0606689453125, "loss_aux_layer_4": 0.06317138671875, "loss_aux_layer_5": 0.06475830078125, "loss_aux_layer_6": 0.0675048828125, "loss_aux_layer_7": 0.0655517578125, "loss_aux_layer_8": 0.0650634765625, "loss_aux_layer_9": 0.06353759765625, "step": 3194, "total_loss": 0.666853278875351 }, { "epoch": 0.6325480102949911, "grad_norm": 1.0149904489517212, "learning_rate": 5e-05, "llm_loss": 0.5277667194604874, "loss": 2.4689, "loss_aux_layer_0": 0.015869140625, "loss_aux_layer_1": 0.0352783203125, "loss_aux_layer_10": 0.0645751953125, "loss_aux_layer_11": 0.0687255859375, "loss_aux_layer_12": 0.073486328125, "loss_aux_layer_13": 0.0799560546875, "loss_aux_layer_14": 0.0889892578125, "loss_aux_layer_15": 0.09814453125, "loss_aux_layer_16": 0.1087646484375, "loss_aux_layer_17": 0.116943359375, "loss_aux_layer_18": 0.1260986328125, "loss_aux_layer_19": 0.12939453125, "loss_aux_layer_2": 0.0489501953125, "loss_aux_layer_20": 0.136962890625, "loss_aux_layer_21": 0.14501953125, "loss_aux_layer_22": 0.16552734375, "loss_aux_layer_23": 0.203369140625, "loss_aux_layer_3": 0.05902099609375, "loss_aux_layer_4": 0.0615234375, "loss_aux_layer_5": 0.06304931640625, "loss_aux_layer_6": 0.066162109375, "loss_aux_layer_7": 0.06427001953125, "loss_aux_layer_8": 0.0638427734375, "loss_aux_layer_9": 0.06280517578125, "step": 3195, "total_loss": 0.6172221153974533 }, { "epoch": 0.6327459908928925, "grad_norm": 1.4855040311813354, "learning_rate": 5e-05, "llm_loss": 0.6341314688324928, "loss": 2.8806, "loss_aux_layer_0": 0.0177001953125, "loss_aux_layer_1": 0.034912109375, "loss_aux_layer_10": 0.06158447265625, "loss_aux_layer_11": 0.0655517578125, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.0753173828125, "loss_aux_layer_14": 0.0843505859375, "loss_aux_layer_15": 0.09326171875, "loss_aux_layer_16": 0.10302734375, "loss_aux_layer_17": 0.1107177734375, "loss_aux_layer_18": 0.119873046875, "loss_aux_layer_19": 0.123779296875, "loss_aux_layer_2": 0.0478515625, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.16064453125, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.0574951171875, "loss_aux_layer_4": 0.059814453125, "loss_aux_layer_5": 0.06103515625, "loss_aux_layer_6": 0.06396484375, "loss_aux_layer_7": 0.06195068359375, "loss_aux_layer_8": 0.06134033203125, "loss_aux_layer_9": 0.06024169921875, "step": 3196, "total_loss": 0.7201544940471649 }, { "epoch": 0.632943971490794, "grad_norm": 1.2185792922973633, "learning_rate": 5e-05, "llm_loss": 0.5224825590848923, "loss": 2.4356, "loss_aux_layer_0": 0.0157470703125, "loss_aux_layer_1": 0.03485107421875, "loss_aux_layer_10": 0.06298828125, "loss_aux_layer_11": 0.067138671875, "loss_aux_layer_12": 0.0718994140625, "loss_aux_layer_13": 0.07763671875, "loss_aux_layer_14": 0.086181640625, "loss_aux_layer_15": 0.0947265625, "loss_aux_layer_16": 0.1041259765625, "loss_aux_layer_17": 0.1121826171875, "loss_aux_layer_18": 0.1204833984375, "loss_aux_layer_19": 0.1234130859375, "loss_aux_layer_2": 0.0487060546875, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.05859375, "loss_aux_layer_4": 0.06097412109375, "loss_aux_layer_5": 0.06231689453125, "loss_aux_layer_6": 0.0650634765625, "loss_aux_layer_7": 0.063232421875, "loss_aux_layer_8": 0.06268310546875, "loss_aux_layer_9": 0.0616455078125, "step": 3197, "total_loss": 0.6088937744498253 }, { "epoch": 0.6331419520886953, "grad_norm": 1.08066725730896, "learning_rate": 5e-05, "llm_loss": 0.5756548196077347, "loss": 2.6546, "loss_aux_layer_0": 0.016937255859375, "loss_aux_layer_1": 0.03594970703125, "loss_aux_layer_10": 0.0638427734375, "loss_aux_layer_11": 0.06787109375, "loss_aux_layer_12": 0.072509765625, "loss_aux_layer_13": 0.0782470703125, "loss_aux_layer_14": 0.0867919921875, "loss_aux_layer_15": 0.09521484375, "loss_aux_layer_16": 0.1044921875, "loss_aux_layer_17": 0.112060546875, "loss_aux_layer_18": 0.1207275390625, "loss_aux_layer_19": 0.1248779296875, "loss_aux_layer_2": 0.049072265625, "loss_aux_layer_20": 0.13330078125, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.200927734375, "loss_aux_layer_3": 0.05938720703125, "loss_aux_layer_4": 0.06201171875, "loss_aux_layer_5": 0.06378173828125, "loss_aux_layer_6": 0.06683349609375, "loss_aux_layer_7": 0.0648193359375, "loss_aux_layer_8": 0.0640869140625, "loss_aux_layer_9": 0.0626220703125, "step": 3198, "total_loss": 0.6636423915624619 }, { "epoch": 0.6333399326865967, "grad_norm": 1.2530951499938965, "learning_rate": 5e-05, "llm_loss": 0.5590041428804398, "loss": 2.6023, "loss_aux_layer_0": 0.014892578125, "loss_aux_layer_1": 0.03826904296875, "loss_aux_layer_10": 0.0684814453125, "loss_aux_layer_11": 0.072998046875, "loss_aux_layer_12": 0.078125, "loss_aux_layer_13": 0.083984375, "loss_aux_layer_14": 0.092041015625, "loss_aux_layer_15": 0.1005859375, "loss_aux_layer_16": 0.109619140625, "loss_aux_layer_17": 0.1170654296875, "loss_aux_layer_18": 0.125244140625, "loss_aux_layer_19": 0.127197265625, "loss_aux_layer_2": 0.05291748046875, "loss_aux_layer_20": 0.134521484375, "loss_aux_layer_21": 0.142333984375, "loss_aux_layer_22": 0.164306640625, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.06390380859375, "loss_aux_layer_4": 0.0665283203125, "loss_aux_layer_5": 0.06787109375, "loss_aux_layer_6": 0.0714111328125, "loss_aux_layer_7": 0.0694580078125, "loss_aux_layer_8": 0.0687255859375, "loss_aux_layer_9": 0.06689453125, "step": 3199, "total_loss": 0.6505759507417679 }, { "epoch": 0.6335379132844982, "grad_norm": 1.0633600950241089, "learning_rate": 5e-05, "llm_loss": 0.626555547118187, "loss": 2.8421, "loss_aux_layer_0": 0.0160064697265625, "loss_aux_layer_1": 0.0333251953125, "loss_aux_layer_10": 0.05889892578125, "loss_aux_layer_11": 0.06280517578125, "loss_aux_layer_12": 0.0673828125, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.0823974609375, "loss_aux_layer_15": 0.0911865234375, "loss_aux_layer_16": 0.1011962890625, "loss_aux_layer_17": 0.1103515625, "loss_aux_layer_18": 0.1187744140625, "loss_aux_layer_19": 0.122802734375, "loss_aux_layer_2": 0.0455322265625, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.0550537109375, "loss_aux_layer_4": 0.05706787109375, "loss_aux_layer_5": 0.05841064453125, "loss_aux_layer_6": 0.06109619140625, "loss_aux_layer_7": 0.05908203125, "loss_aux_layer_8": 0.0587158203125, "loss_aux_layer_9": 0.0576171875, "step": 3200, "total_loss": 0.7105329036712646 }, { "epoch": 0.6337358938823995, "grad_norm": 0.844231903553009, "learning_rate": 5e-05, "llm_loss": 0.5644014552235603, "loss": 2.5983, "loss_aux_layer_0": 0.0153656005859375, "loss_aux_layer_1": 0.034027099609375, "loss_aux_layer_10": 0.06085205078125, "loss_aux_layer_11": 0.06524658203125, "loss_aux_layer_12": 0.07000732421875, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.0838623046875, "loss_aux_layer_15": 0.0927734375, "loss_aux_layer_16": 0.1021728515625, "loss_aux_layer_17": 0.110595703125, "loss_aux_layer_18": 0.119384765625, "loss_aux_layer_19": 0.1231689453125, "loss_aux_layer_2": 0.04681396484375, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.056640625, "loss_aux_layer_4": 0.0589599609375, "loss_aux_layer_5": 0.0604248046875, "loss_aux_layer_6": 0.06341552734375, "loss_aux_layer_7": 0.06134033203125, "loss_aux_layer_8": 0.06085205078125, "loss_aux_layer_9": 0.05963134765625, "step": 3201, "total_loss": 0.6495681554079056 }, { "epoch": 0.6339338744803009, "grad_norm": 1.3265336751937866, "learning_rate": 5e-05, "llm_loss": 0.5931997150182724, "loss": 2.7285, "loss_aux_layer_0": 0.016510009765625, "loss_aux_layer_1": 0.0350341796875, "loss_aux_layer_10": 0.06414794921875, "loss_aux_layer_11": 0.0684814453125, "loss_aux_layer_12": 0.0731201171875, "loss_aux_layer_13": 0.079345703125, "loss_aux_layer_14": 0.0885009765625, "loss_aux_layer_15": 0.0972900390625, "loss_aux_layer_16": 0.107177734375, "loss_aux_layer_17": 0.1148681640625, "loss_aux_layer_18": 0.1236572265625, "loss_aux_layer_19": 0.126953125, "loss_aux_layer_2": 0.0494384765625, "loss_aux_layer_20": 0.13525390625, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.20361328125, "loss_aux_layer_3": 0.05908203125, "loss_aux_layer_4": 0.06170654296875, "loss_aux_layer_5": 0.06396484375, "loss_aux_layer_6": 0.0670166015625, "loss_aux_layer_7": 0.064697265625, "loss_aux_layer_8": 0.06378173828125, "loss_aux_layer_9": 0.06268310546875, "step": 3202, "total_loss": 0.682136282324791 }, { "epoch": 0.6341318550782024, "grad_norm": 1.1234537363052368, "learning_rate": 5e-05, "llm_loss": 0.5883261263370514, "loss": 2.6853, "loss_aux_layer_0": 0.0157928466796875, "loss_aux_layer_1": 0.0335693359375, "loss_aux_layer_10": 0.05828857421875, "loss_aux_layer_11": 0.06195068359375, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.089599609375, "loss_aux_layer_16": 0.09912109375, "loss_aux_layer_17": 0.1072998046875, "loss_aux_layer_18": 0.1158447265625, "loss_aux_layer_19": 0.11962890625, "loss_aux_layer_2": 0.04571533203125, "loss_aux_layer_20": 0.128173828125, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.0545654296875, "loss_aux_layer_4": 0.05694580078125, "loss_aux_layer_5": 0.05859375, "loss_aux_layer_6": 0.0616455078125, "loss_aux_layer_7": 0.05938720703125, "loss_aux_layer_8": 0.05865478515625, "loss_aux_layer_9": 0.05718994140625, "step": 3203, "total_loss": 0.6713357120752335 }, { "epoch": 0.6343298356761038, "grad_norm": 1.262184739112854, "learning_rate": 5e-05, "llm_loss": 0.6065228432416916, "loss": 2.7648, "loss_aux_layer_0": 0.017303466796875, "loss_aux_layer_1": 0.0345458984375, "loss_aux_layer_10": 0.06072998046875, "loss_aux_layer_11": 0.06494140625, "loss_aux_layer_12": 0.0694580078125, "loss_aux_layer_13": 0.0748291015625, "loss_aux_layer_14": 0.0828857421875, "loss_aux_layer_15": 0.0914306640625, "loss_aux_layer_16": 0.100341796875, "loss_aux_layer_17": 0.10791015625, "loss_aux_layer_18": 0.1162109375, "loss_aux_layer_19": 0.1201171875, "loss_aux_layer_2": 0.04803466796875, "loss_aux_layer_20": 0.1285400390625, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.197265625, "loss_aux_layer_3": 0.05706787109375, "loss_aux_layer_4": 0.05908203125, "loss_aux_layer_5": 0.06060791015625, "loss_aux_layer_6": 0.06341552734375, "loss_aux_layer_7": 0.06109619140625, "loss_aux_layer_8": 0.06060791015625, "loss_aux_layer_9": 0.059326171875, "step": 3204, "total_loss": 0.69119031727314 }, { "epoch": 0.6345278162740051, "grad_norm": 1.087127447128296, "learning_rate": 5e-05, "llm_loss": 0.5661437511444092, "loss": 2.6088, "loss_aux_layer_0": 0.0166015625, "loss_aux_layer_1": 0.03363037109375, "loss_aux_layer_10": 0.0609130859375, "loss_aux_layer_11": 0.06494140625, "loss_aux_layer_12": 0.0697021484375, "loss_aux_layer_13": 0.0751953125, "loss_aux_layer_14": 0.084228515625, "loss_aux_layer_15": 0.0933837890625, "loss_aux_layer_16": 0.1029052734375, "loss_aux_layer_17": 0.1112060546875, "loss_aux_layer_18": 0.1192626953125, "loss_aux_layer_19": 0.1241455078125, "loss_aux_layer_2": 0.04779052734375, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.163330078125, "loss_aux_layer_23": 0.20263671875, "loss_aux_layer_3": 0.0570068359375, "loss_aux_layer_4": 0.059326171875, "loss_aux_layer_5": 0.060791015625, "loss_aux_layer_6": 0.0638427734375, "loss_aux_layer_7": 0.0614013671875, "loss_aux_layer_8": 0.06085205078125, "loss_aux_layer_9": 0.0595703125, "step": 3205, "total_loss": 0.6522043794393539 }, { "epoch": 0.6347257968719066, "grad_norm": 1.8687341213226318, "learning_rate": 5e-05, "llm_loss": 0.5453202575445175, "loss": 2.5324, "loss_aux_layer_0": 0.017425537109375, "loss_aux_layer_1": 0.0341796875, "loss_aux_layer_10": 0.0635986328125, "loss_aux_layer_11": 0.0679931640625, "loss_aux_layer_12": 0.0728759765625, "loss_aux_layer_13": 0.079345703125, "loss_aux_layer_14": 0.0887451171875, "loss_aux_layer_15": 0.09765625, "loss_aux_layer_16": 0.10693359375, "loss_aux_layer_17": 0.1146240234375, "loss_aux_layer_18": 0.12255859375, "loss_aux_layer_19": 0.12548828125, "loss_aux_layer_2": 0.04766845703125, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.141357421875, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.057373046875, "loss_aux_layer_4": 0.05987548828125, "loss_aux_layer_5": 0.06158447265625, "loss_aux_layer_6": 0.06500244140625, "loss_aux_layer_7": 0.06304931640625, "loss_aux_layer_8": 0.06292724609375, "loss_aux_layer_9": 0.062255859375, "step": 3206, "total_loss": 0.6331100165843964 }, { "epoch": 0.634923777469808, "grad_norm": 0.8731239438056946, "learning_rate": 5e-05, "llm_loss": 0.5708561316132545, "loss": 2.6411, "loss_aux_layer_0": 0.015594482421875, "loss_aux_layer_1": 0.03704833984375, "loss_aux_layer_10": 0.06597900390625, "loss_aux_layer_11": 0.0703125, "loss_aux_layer_12": 0.074951171875, "loss_aux_layer_13": 0.0811767578125, "loss_aux_layer_14": 0.0899658203125, "loss_aux_layer_15": 0.0985107421875, "loss_aux_layer_16": 0.1080322265625, "loss_aux_layer_17": 0.1160888671875, "loss_aux_layer_18": 0.12451171875, "loss_aux_layer_19": 0.1263427734375, "loss_aux_layer_2": 0.0506591796875, "loss_aux_layer_20": 0.133544921875, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.06103515625, "loss_aux_layer_4": 0.063720703125, "loss_aux_layer_5": 0.06536865234375, "loss_aux_layer_6": 0.0684814453125, "loss_aux_layer_7": 0.06640625, "loss_aux_layer_8": 0.06622314453125, "loss_aux_layer_9": 0.06494140625, "step": 3207, "total_loss": 0.6602694541215897 }, { "epoch": 0.6351217580677093, "grad_norm": 1.2395133972167969, "learning_rate": 5e-05, "llm_loss": 0.562592476606369, "loss": 2.5993, "loss_aux_layer_0": 0.016754150390625, "loss_aux_layer_1": 0.03411865234375, "loss_aux_layer_10": 0.06201171875, "loss_aux_layer_11": 0.066162109375, "loss_aux_layer_12": 0.0709228515625, "loss_aux_layer_13": 0.0770263671875, "loss_aux_layer_14": 0.086181640625, "loss_aux_layer_15": 0.0955810546875, "loss_aux_layer_16": 0.1055908203125, "loss_aux_layer_17": 0.1136474609375, "loss_aux_layer_18": 0.1229248046875, "loss_aux_layer_19": 0.127197265625, "loss_aux_layer_2": 0.04693603515625, "loss_aux_layer_20": 0.13525390625, "loss_aux_layer_21": 0.143310546875, "loss_aux_layer_22": 0.163330078125, "loss_aux_layer_23": 0.20068359375, "loss_aux_layer_3": 0.05694580078125, "loss_aux_layer_4": 0.05926513671875, "loss_aux_layer_5": 0.06103515625, "loss_aux_layer_6": 0.06396484375, "loss_aux_layer_7": 0.062255859375, "loss_aux_layer_8": 0.06170654296875, "loss_aux_layer_9": 0.06072998046875, "step": 3208, "total_loss": 0.6498226970434189 }, { "epoch": 0.6353197386656108, "grad_norm": 1.0359857082366943, "learning_rate": 5e-05, "llm_loss": 0.6030183136463165, "loss": 2.7558, "loss_aux_layer_0": 0.016571044921875, "loss_aux_layer_1": 0.0347900390625, "loss_aux_layer_10": 0.061767578125, "loss_aux_layer_11": 0.06597900390625, "loss_aux_layer_12": 0.0704345703125, "loss_aux_layer_13": 0.075927734375, "loss_aux_layer_14": 0.0850830078125, "loss_aux_layer_15": 0.0938720703125, "loss_aux_layer_16": 0.103515625, "loss_aux_layer_17": 0.111083984375, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.122314453125, "loss_aux_layer_2": 0.04754638671875, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.05743408203125, "loss_aux_layer_4": 0.0599365234375, "loss_aux_layer_5": 0.0616455078125, "loss_aux_layer_6": 0.06451416015625, "loss_aux_layer_7": 0.06256103515625, "loss_aux_layer_8": 0.06201171875, "loss_aux_layer_9": 0.060546875, "step": 3209, "total_loss": 0.6889503002166748 }, { "epoch": 0.6355177192635122, "grad_norm": 1.2441893815994263, "learning_rate": 5e-05, "llm_loss": 0.5703288167715073, "loss": 2.633, "loss_aux_layer_0": 0.016693115234375, "loss_aux_layer_1": 0.035797119140625, "loss_aux_layer_10": 0.06463623046875, "loss_aux_layer_11": 0.06884765625, "loss_aux_layer_12": 0.0733642578125, "loss_aux_layer_13": 0.0789794921875, "loss_aux_layer_14": 0.0872802734375, "loss_aux_layer_15": 0.095703125, "loss_aux_layer_16": 0.104736328125, "loss_aux_layer_17": 0.112548828125, "loss_aux_layer_18": 0.12109375, "loss_aux_layer_19": 0.1241455078125, "loss_aux_layer_2": 0.0501708984375, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.197998046875, "loss_aux_layer_3": 0.06005859375, "loss_aux_layer_4": 0.0628662109375, "loss_aux_layer_5": 0.0645751953125, "loss_aux_layer_6": 0.06781005859375, "loss_aux_layer_7": 0.06536865234375, "loss_aux_layer_8": 0.0645751953125, "loss_aux_layer_9": 0.0631103515625, "step": 3210, "total_loss": 0.6582566350698471 }, { "epoch": 0.6357156998614136, "grad_norm": 1.269736647605896, "learning_rate": 5e-05, "llm_loss": 0.5028318762779236, "loss": 2.3676, "loss_aux_layer_0": 0.01617431640625, "loss_aux_layer_1": 0.0361328125, "loss_aux_layer_10": 0.06549072265625, "loss_aux_layer_11": 0.069580078125, "loss_aux_layer_12": 0.0740966796875, "loss_aux_layer_13": 0.07958984375, "loss_aux_layer_14": 0.088134765625, "loss_aux_layer_15": 0.0968017578125, "loss_aux_layer_16": 0.106201171875, "loss_aux_layer_17": 0.113525390625, "loss_aux_layer_18": 0.1220703125, "loss_aux_layer_19": 0.12548828125, "loss_aux_layer_2": 0.05047607421875, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.201416015625, "loss_aux_layer_3": 0.0609130859375, "loss_aux_layer_4": 0.0634765625, "loss_aux_layer_5": 0.06549072265625, "loss_aux_layer_6": 0.06884765625, "loss_aux_layer_7": 0.06658935546875, "loss_aux_layer_8": 0.0657958984375, "loss_aux_layer_9": 0.06427001953125, "step": 3211, "total_loss": 0.5918942987918854 }, { "epoch": 0.635913680459315, "grad_norm": 1.010009527206421, "learning_rate": 5e-05, "llm_loss": 0.5459452122449875, "loss": 2.5192, "loss_aux_layer_0": 0.016937255859375, "loss_aux_layer_1": 0.034912109375, "loss_aux_layer_10": 0.06005859375, "loss_aux_layer_11": 0.06396484375, "loss_aux_layer_12": 0.068603515625, "loss_aux_layer_13": 0.0738525390625, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.091064453125, "loss_aux_layer_16": 0.1005859375, "loss_aux_layer_17": 0.1082763671875, "loss_aux_layer_18": 0.1160888671875, "loss_aux_layer_19": 0.11962890625, "loss_aux_layer_2": 0.04754638671875, "loss_aux_layer_20": 0.127197265625, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.05718994140625, "loss_aux_layer_4": 0.059326171875, "loss_aux_layer_5": 0.0606689453125, "loss_aux_layer_6": 0.0633544921875, "loss_aux_layer_7": 0.0611572265625, "loss_aux_layer_8": 0.06048583984375, "loss_aux_layer_9": 0.05877685546875, "step": 3212, "total_loss": 0.6297932416200638 }, { "epoch": 0.6361116610572164, "grad_norm": 1.1179327964782715, "learning_rate": 5e-05, "llm_loss": 0.5141706764698029, "loss": 2.416, "loss_aux_layer_0": 0.016265869140625, "loss_aux_layer_1": 0.03759765625, "loss_aux_layer_10": 0.067138671875, "loss_aux_layer_11": 0.071533203125, "loss_aux_layer_12": 0.076171875, "loss_aux_layer_13": 0.0819091796875, "loss_aux_layer_14": 0.090576171875, "loss_aux_layer_15": 0.098876953125, "loss_aux_layer_16": 0.108154296875, "loss_aux_layer_17": 0.115234375, "loss_aux_layer_18": 0.123046875, "loss_aux_layer_19": 0.125, "loss_aux_layer_2": 0.05133056640625, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.197021484375, "loss_aux_layer_3": 0.0621337890625, "loss_aux_layer_4": 0.06494140625, "loss_aux_layer_5": 0.0665283203125, "loss_aux_layer_6": 0.0699462890625, "loss_aux_layer_7": 0.0679931640625, "loss_aux_layer_8": 0.067138671875, "loss_aux_layer_9": 0.065673828125, "step": 3213, "total_loss": 0.6039968580007553 }, { "epoch": 0.6363096416551178, "grad_norm": 0.9820958375930786, "learning_rate": 5e-05, "llm_loss": 0.5240155458450317, "loss": 2.4394, "loss_aux_layer_0": 0.0162811279296875, "loss_aux_layer_1": 0.03497314453125, "loss_aux_layer_10": 0.061767578125, "loss_aux_layer_11": 0.0660400390625, "loss_aux_layer_12": 0.070556640625, "loss_aux_layer_13": 0.075927734375, "loss_aux_layer_14": 0.084228515625, "loss_aux_layer_15": 0.0927734375, "loss_aux_layer_16": 0.1021728515625, "loss_aux_layer_17": 0.1104736328125, "loss_aux_layer_18": 0.1195068359375, "loss_aux_layer_19": 0.123046875, "loss_aux_layer_2": 0.0477294921875, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.1982421875, "loss_aux_layer_3": 0.05706787109375, "loss_aux_layer_4": 0.059814453125, "loss_aux_layer_5": 0.06121826171875, "loss_aux_layer_6": 0.06427001953125, "loss_aux_layer_7": 0.0625, "loss_aux_layer_8": 0.06195068359375, "loss_aux_layer_9": 0.06048583984375, "step": 3214, "total_loss": 0.6098457723855972 }, { "epoch": 0.6365076222530192, "grad_norm": 1.5047396421432495, "learning_rate": 5e-05, "llm_loss": 0.5646283477544785, "loss": 2.61, "loss_aux_layer_0": 0.019866943359375, "loss_aux_layer_1": 0.03643798828125, "loss_aux_layer_10": 0.063232421875, "loss_aux_layer_11": 0.06689453125, "loss_aux_layer_12": 0.0714111328125, "loss_aux_layer_13": 0.0772705078125, "loss_aux_layer_14": 0.0867919921875, "loss_aux_layer_15": 0.095703125, "loss_aux_layer_16": 0.1058349609375, "loss_aux_layer_17": 0.11328125, "loss_aux_layer_18": 0.121337890625, "loss_aux_layer_19": 0.1251220703125, "loss_aux_layer_2": 0.05023193359375, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.19970703125, "loss_aux_layer_3": 0.05963134765625, "loss_aux_layer_4": 0.06182861328125, "loss_aux_layer_5": 0.063232421875, "loss_aux_layer_6": 0.0660400390625, "loss_aux_layer_7": 0.063720703125, "loss_aux_layer_8": 0.06304931640625, "loss_aux_layer_9": 0.06182861328125, "step": 3215, "total_loss": 0.6525018215179443 }, { "epoch": 0.6367056028509206, "grad_norm": 0.8612713813781738, "learning_rate": 5e-05, "llm_loss": 0.5191578194499016, "loss": 2.4289, "loss_aux_layer_0": 0.0162353515625, "loss_aux_layer_1": 0.0341796875, "loss_aux_layer_10": 0.06298828125, "loss_aux_layer_11": 0.06689453125, "loss_aux_layer_12": 0.071533203125, "loss_aux_layer_13": 0.0772705078125, "loss_aux_layer_14": 0.08642578125, "loss_aux_layer_15": 0.0958251953125, "loss_aux_layer_16": 0.1064453125, "loss_aux_layer_17": 0.114013671875, "loss_aux_layer_18": 0.123046875, "loss_aux_layer_19": 0.126953125, "loss_aux_layer_2": 0.048095703125, "loss_aux_layer_20": 0.135498046875, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.166015625, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.0576171875, "loss_aux_layer_4": 0.06011962890625, "loss_aux_layer_5": 0.06201171875, "loss_aux_layer_6": 0.065185546875, "loss_aux_layer_7": 0.06317138671875, "loss_aux_layer_8": 0.062744140625, "loss_aux_layer_9": 0.061767578125, "step": 3216, "total_loss": 0.6072323322296143 }, { "epoch": 0.636903583448822, "grad_norm": 1.6283764839172363, "learning_rate": 5e-05, "llm_loss": 0.6752782613039017, "loss": 3.0335, "loss_aux_layer_0": 0.01776123046875, "loss_aux_layer_1": 0.0333251953125, "loss_aux_layer_10": 0.0584716796875, "loss_aux_layer_11": 0.06219482421875, "loss_aux_layer_12": 0.06689453125, "loss_aux_layer_13": 0.0726318359375, "loss_aux_layer_14": 0.0816650390625, "loss_aux_layer_15": 0.0904541015625, "loss_aux_layer_16": 0.099853515625, "loss_aux_layer_17": 0.1083984375, "loss_aux_layer_18": 0.1173095703125, "loss_aux_layer_19": 0.120849609375, "loss_aux_layer_2": 0.04632568359375, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.0550537109375, "loss_aux_layer_4": 0.05712890625, "loss_aux_layer_5": 0.058349609375, "loss_aux_layer_6": 0.06085205078125, "loss_aux_layer_7": 0.05889892578125, "loss_aux_layer_8": 0.0584716796875, "loss_aux_layer_9": 0.0572509765625, "step": 3217, "total_loss": 0.758366048336029 }, { "epoch": 0.6371015640467235, "grad_norm": 1.0178604125976562, "learning_rate": 5e-05, "llm_loss": 0.5372404381632805, "loss": 2.4887, "loss_aux_layer_0": 0.01641845703125, "loss_aux_layer_1": 0.0343017578125, "loss_aux_layer_10": 0.06085205078125, "loss_aux_layer_11": 0.0648193359375, "loss_aux_layer_12": 0.0692138671875, "loss_aux_layer_13": 0.0751953125, "loss_aux_layer_14": 0.0838623046875, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.1024169921875, "loss_aux_layer_17": 0.110595703125, "loss_aux_layer_18": 0.119140625, "loss_aux_layer_19": 0.1219482421875, "loss_aux_layer_2": 0.04693603515625, "loss_aux_layer_20": 0.130126953125, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.05633544921875, "loss_aux_layer_4": 0.058837890625, "loss_aux_layer_5": 0.06036376953125, "loss_aux_layer_6": 0.063232421875, "loss_aux_layer_7": 0.0611572265625, "loss_aux_layer_8": 0.06085205078125, "loss_aux_layer_9": 0.0594482421875, "step": 3218, "total_loss": 0.6221857070922852 }, { "epoch": 0.6372995446446248, "grad_norm": 0.9748142957687378, "learning_rate": 5e-05, "llm_loss": 0.49725624173879623, "loss": 2.3407, "loss_aux_layer_0": 0.017059326171875, "loss_aux_layer_1": 0.03485107421875, "loss_aux_layer_10": 0.06353759765625, "loss_aux_layer_11": 0.0673828125, "loss_aux_layer_12": 0.0718994140625, "loss_aux_layer_13": 0.077880859375, "loss_aux_layer_14": 0.086669921875, "loss_aux_layer_15": 0.095458984375, "loss_aux_layer_16": 0.105712890625, "loss_aux_layer_17": 0.11376953125, "loss_aux_layer_18": 0.1229248046875, "loss_aux_layer_19": 0.1265869140625, "loss_aux_layer_2": 0.04803466796875, "loss_aux_layer_20": 0.13427734375, "loss_aux_layer_21": 0.142578125, "loss_aux_layer_22": 0.163330078125, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.05810546875, "loss_aux_layer_4": 0.06072998046875, "loss_aux_layer_5": 0.06268310546875, "loss_aux_layer_6": 0.065673828125, "loss_aux_layer_7": 0.06365966796875, "loss_aux_layer_8": 0.06298828125, "loss_aux_layer_9": 0.06195068359375, "step": 3219, "total_loss": 0.5851701647043228 }, { "epoch": 0.6374975252425262, "grad_norm": 1.069703221321106, "learning_rate": 5e-05, "llm_loss": 0.52914859354496, "loss": 2.4701, "loss_aux_layer_0": 0.0164794921875, "loss_aux_layer_1": 0.036376953125, "loss_aux_layer_10": 0.064697265625, "loss_aux_layer_11": 0.069091796875, "loss_aux_layer_12": 0.07373046875, "loss_aux_layer_13": 0.07958984375, "loss_aux_layer_14": 0.088134765625, "loss_aux_layer_15": 0.0968017578125, "loss_aux_layer_16": 0.1060791015625, "loss_aux_layer_17": 0.11328125, "loss_aux_layer_18": 0.1214599609375, "loss_aux_layer_19": 0.1241455078125, "loss_aux_layer_2": 0.0504150390625, "loss_aux_layer_20": 0.1318359375, "loss_aux_layer_21": 0.1396484375, "loss_aux_layer_22": 0.160888671875, "loss_aux_layer_23": 0.197998046875, "loss_aux_layer_3": 0.0606689453125, "loss_aux_layer_4": 0.06304931640625, "loss_aux_layer_5": 0.06494140625, "loss_aux_layer_6": 0.067626953125, "loss_aux_layer_7": 0.06549072265625, "loss_aux_layer_8": 0.0645751953125, "loss_aux_layer_9": 0.0634765625, "step": 3220, "total_loss": 0.6175275295972824 }, { "epoch": 0.6376955058404277, "grad_norm": 1.0136828422546387, "learning_rate": 5e-05, "llm_loss": 0.6136586666107178, "loss": 2.8112, "loss_aux_layer_0": 0.01751708984375, "loss_aux_layer_1": 0.0362548828125, "loss_aux_layer_10": 0.0648193359375, "loss_aux_layer_11": 0.0689697265625, "loss_aux_layer_12": 0.0738525390625, "loss_aux_layer_13": 0.080078125, "loss_aux_layer_14": 0.0887451171875, "loss_aux_layer_15": 0.0975341796875, "loss_aux_layer_16": 0.1070556640625, "loss_aux_layer_17": 0.1146240234375, "loss_aux_layer_18": 0.12255859375, "loss_aux_layer_19": 0.125732421875, "loss_aux_layer_2": 0.0504150390625, "loss_aux_layer_20": 0.133544921875, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.1640625, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.06060791015625, "loss_aux_layer_4": 0.0631103515625, "loss_aux_layer_5": 0.06463623046875, "loss_aux_layer_6": 0.0675048828125, "loss_aux_layer_7": 0.06536865234375, "loss_aux_layer_8": 0.064697265625, "loss_aux_layer_9": 0.0634765625, "step": 3221, "total_loss": 0.7028084099292755 }, { "epoch": 0.637893486438329, "grad_norm": 0.9531010985374451, "learning_rate": 5e-05, "llm_loss": 0.5618340000510216, "loss": 2.5947, "loss_aux_layer_0": 0.0160369873046875, "loss_aux_layer_1": 0.034912109375, "loss_aux_layer_10": 0.06298828125, "loss_aux_layer_11": 0.067138671875, "loss_aux_layer_12": 0.071533203125, "loss_aux_layer_13": 0.0771484375, "loss_aux_layer_14": 0.085693359375, "loss_aux_layer_15": 0.0941162109375, "loss_aux_layer_16": 0.1036376953125, "loss_aux_layer_17": 0.1116943359375, "loss_aux_layer_18": 0.1201171875, "loss_aux_layer_19": 0.1240234375, "loss_aux_layer_2": 0.04852294921875, "loss_aux_layer_20": 0.131591796875, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.160888671875, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.05841064453125, "loss_aux_layer_4": 0.06085205078125, "loss_aux_layer_5": 0.06243896484375, "loss_aux_layer_6": 0.0654296875, "loss_aux_layer_7": 0.063232421875, "loss_aux_layer_8": 0.06280517578125, "loss_aux_layer_9": 0.06182861328125, "step": 3222, "total_loss": 0.6486750990152359 }, { "epoch": 0.6380914670362304, "grad_norm": 1.1066906452178955, "learning_rate": 5e-05, "llm_loss": 0.6691318452358246, "loss": 3.0059, "loss_aux_layer_0": 0.017547607421875, "loss_aux_layer_1": 0.0325927734375, "loss_aux_layer_10": 0.05828857421875, "loss_aux_layer_11": 0.0621337890625, "loss_aux_layer_12": 0.06689453125, "loss_aux_layer_13": 0.072509765625, "loss_aux_layer_14": 0.081298828125, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.099853515625, "loss_aux_layer_17": 0.1077880859375, "loss_aux_layer_18": 0.11572265625, "loss_aux_layer_19": 0.1197509765625, "loss_aux_layer_2": 0.04449462890625, "loss_aux_layer_20": 0.1275634765625, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.0535888671875, "loss_aux_layer_4": 0.0557861328125, "loss_aux_layer_5": 0.05755615234375, "loss_aux_layer_6": 0.06011962890625, "loss_aux_layer_7": 0.05810546875, "loss_aux_layer_8": 0.05792236328125, "loss_aux_layer_9": 0.0570068359375, "step": 3223, "total_loss": 0.7514693886041641 }, { "epoch": 0.6382894476341319, "grad_norm": 1.40193510055542, "learning_rate": 5e-05, "llm_loss": 0.5117685496807098, "loss": 2.393, "loss_aux_layer_0": 0.0154571533203125, "loss_aux_layer_1": 0.0341796875, "loss_aux_layer_10": 0.06182861328125, "loss_aux_layer_11": 0.066162109375, "loss_aux_layer_12": 0.07080078125, "loss_aux_layer_13": 0.076904296875, "loss_aux_layer_14": 0.086181640625, "loss_aux_layer_15": 0.09521484375, "loss_aux_layer_16": 0.1051025390625, "loss_aux_layer_17": 0.1129150390625, "loss_aux_layer_18": 0.121826171875, "loss_aux_layer_19": 0.1248779296875, "loss_aux_layer_2": 0.04766845703125, "loss_aux_layer_20": 0.132568359375, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.197265625, "loss_aux_layer_3": 0.0574951171875, "loss_aux_layer_4": 0.05975341796875, "loss_aux_layer_5": 0.06134033203125, "loss_aux_layer_6": 0.06402587890625, "loss_aux_layer_7": 0.0618896484375, "loss_aux_layer_8": 0.0615234375, "loss_aux_layer_9": 0.06036376953125, "step": 3224, "total_loss": 0.5982508361339569 }, { "epoch": 0.6384874282320333, "grad_norm": 1.3313442468643188, "learning_rate": 5e-05, "llm_loss": 0.5315362811088562, "loss": 2.4757, "loss_aux_layer_0": 0.017333984375, "loss_aux_layer_1": 0.03497314453125, "loss_aux_layer_10": 0.0626220703125, "loss_aux_layer_11": 0.0667724609375, "loss_aux_layer_12": 0.0716552734375, "loss_aux_layer_13": 0.077392578125, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.095703125, "loss_aux_layer_16": 0.1055908203125, "loss_aux_layer_17": 0.11328125, "loss_aux_layer_18": 0.1220703125, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.0484619140625, "loss_aux_layer_20": 0.132568359375, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.162353515625, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.05804443359375, "loss_aux_layer_4": 0.06060791015625, "loss_aux_layer_5": 0.062255859375, "loss_aux_layer_6": 0.065185546875, "loss_aux_layer_7": 0.063232421875, "loss_aux_layer_8": 0.0626220703125, "loss_aux_layer_9": 0.0614013671875, "step": 3225, "total_loss": 0.6189302802085876 }, { "epoch": 0.6386854088299346, "grad_norm": 1.1608076095581055, "learning_rate": 5e-05, "llm_loss": 0.5909712761640549, "loss": 2.7043, "loss_aux_layer_0": 0.016082763671875, "loss_aux_layer_1": 0.03521728515625, "loss_aux_layer_10": 0.062255859375, "loss_aux_layer_11": 0.0662841796875, "loss_aux_layer_12": 0.0706787109375, "loss_aux_layer_13": 0.075927734375, "loss_aux_layer_14": 0.0843505859375, "loss_aux_layer_15": 0.0926513671875, "loss_aux_layer_16": 0.1014404296875, "loss_aux_layer_17": 0.10888671875, "loss_aux_layer_18": 0.116943359375, "loss_aux_layer_19": 0.11962890625, "loss_aux_layer_2": 0.0487060546875, "loss_aux_layer_20": 0.1273193359375, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.05828857421875, "loss_aux_layer_4": 0.060546875, "loss_aux_layer_5": 0.0623779296875, "loss_aux_layer_6": 0.0653076171875, "loss_aux_layer_7": 0.06317138671875, "loss_aux_layer_8": 0.06256103515625, "loss_aux_layer_9": 0.06109619140625, "step": 3226, "total_loss": 0.676066130399704 }, { "epoch": 0.6388833894278361, "grad_norm": 1.1030552387237549, "learning_rate": 5e-05, "llm_loss": 0.64778633415699, "loss": 2.9229, "loss_aux_layer_0": 0.0164947509765625, "loss_aux_layer_1": 0.032958984375, "loss_aux_layer_10": 0.0584716796875, "loss_aux_layer_11": 0.06243896484375, "loss_aux_layer_12": 0.0670166015625, "loss_aux_layer_13": 0.072509765625, "loss_aux_layer_14": 0.0814208984375, "loss_aux_layer_15": 0.0904541015625, "loss_aux_layer_16": 0.1002197265625, "loss_aux_layer_17": 0.108154296875, "loss_aux_layer_18": 0.11669921875, "loss_aux_layer_19": 0.120849609375, "loss_aux_layer_2": 0.04498291015625, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.156494140625, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.0540771484375, "loss_aux_layer_4": 0.0565185546875, "loss_aux_layer_5": 0.05780029296875, "loss_aux_layer_6": 0.06024169921875, "loss_aux_layer_7": 0.0584716796875, "loss_aux_layer_8": 0.0579833984375, "loss_aux_layer_9": 0.05731201171875, "step": 3227, "total_loss": 0.730726957321167 }, { "epoch": 0.6390813700257375, "grad_norm": 1.1282376050949097, "learning_rate": 5e-05, "llm_loss": 0.5825138092041016, "loss": 2.6701, "loss_aux_layer_0": 0.0159912109375, "loss_aux_layer_1": 0.0340576171875, "loss_aux_layer_10": 0.06158447265625, "loss_aux_layer_11": 0.06573486328125, "loss_aux_layer_12": 0.0701904296875, "loss_aux_layer_13": 0.075927734375, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.0933837890625, "loss_aux_layer_16": 0.1025390625, "loss_aux_layer_17": 0.1104736328125, "loss_aux_layer_18": 0.11865234375, "loss_aux_layer_19": 0.1214599609375, "loss_aux_layer_2": 0.047119140625, "loss_aux_layer_20": 0.129150390625, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.0567626953125, "loss_aux_layer_4": 0.0595703125, "loss_aux_layer_5": 0.06121826171875, "loss_aux_layer_6": 0.0640869140625, "loss_aux_layer_7": 0.06207275390625, "loss_aux_layer_8": 0.06146240234375, "loss_aux_layer_9": 0.06048583984375, "step": 3228, "total_loss": 0.6675341129302979 }, { "epoch": 0.6392793506236388, "grad_norm": 1.060205340385437, "learning_rate": 5e-05, "llm_loss": 0.6107128262519836, "loss": 2.7848, "loss_aux_layer_0": 0.015533447265625, "loss_aux_layer_1": 0.03436279296875, "loss_aux_layer_10": 0.06146240234375, "loss_aux_layer_11": 0.065673828125, "loss_aux_layer_12": 0.0704345703125, "loss_aux_layer_13": 0.0760498046875, "loss_aux_layer_14": 0.0848388671875, "loss_aux_layer_15": 0.0936279296875, "loss_aux_layer_16": 0.1033935546875, "loss_aux_layer_17": 0.1112060546875, "loss_aux_layer_18": 0.1197509765625, "loss_aux_layer_19": 0.1226806640625, "loss_aux_layer_2": 0.0474853515625, "loss_aux_layer_20": 0.13037109375, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.0572509765625, "loss_aux_layer_4": 0.0595703125, "loss_aux_layer_5": 0.06103515625, "loss_aux_layer_6": 0.0640869140625, "loss_aux_layer_7": 0.06182861328125, "loss_aux_layer_8": 0.0611572265625, "loss_aux_layer_9": 0.06005859375, "step": 3229, "total_loss": 0.6961977481842041 }, { "epoch": 0.6394773312215403, "grad_norm": 1.1155848503112793, "learning_rate": 5e-05, "llm_loss": 0.6364578008651733, "loss": 2.8944, "loss_aux_layer_0": 0.01690673828125, "loss_aux_layer_1": 0.03570556640625, "loss_aux_layer_10": 0.06280517578125, "loss_aux_layer_11": 0.06683349609375, "loss_aux_layer_12": 0.0714111328125, "loss_aux_layer_13": 0.0771484375, "loss_aux_layer_14": 0.0860595703125, "loss_aux_layer_15": 0.094970703125, "loss_aux_layer_16": 0.1046142578125, "loss_aux_layer_17": 0.1126708984375, "loss_aux_layer_18": 0.1209716796875, "loss_aux_layer_19": 0.12451171875, "loss_aux_layer_2": 0.04925537109375, "loss_aux_layer_20": 0.1318359375, "loss_aux_layer_21": 0.1396484375, "loss_aux_layer_22": 0.16064453125, "loss_aux_layer_23": 0.197998046875, "loss_aux_layer_3": 0.05902099609375, "loss_aux_layer_4": 0.0614013671875, "loss_aux_layer_5": 0.06292724609375, "loss_aux_layer_6": 0.0657958984375, "loss_aux_layer_7": 0.06341552734375, "loss_aux_layer_8": 0.06292724609375, "loss_aux_layer_9": 0.06146240234375, "step": 3230, "total_loss": 0.7236010283231735 }, { "epoch": 0.6396753118194417, "grad_norm": 1.026533603668213, "learning_rate": 5e-05, "llm_loss": 0.5990232229232788, "loss": 2.7438, "loss_aux_layer_0": 0.0154571533203125, "loss_aux_layer_1": 0.035400390625, "loss_aux_layer_10": 0.06317138671875, "loss_aux_layer_11": 0.06732177734375, "loss_aux_layer_12": 0.0721435546875, "loss_aux_layer_13": 0.0777587890625, "loss_aux_layer_14": 0.0863037109375, "loss_aux_layer_15": 0.0948486328125, "loss_aux_layer_16": 0.1046142578125, "loss_aux_layer_17": 0.113037109375, "loss_aux_layer_18": 0.12109375, "loss_aux_layer_19": 0.12451171875, "loss_aux_layer_2": 0.0484619140625, "loss_aux_layer_20": 0.131591796875, "loss_aux_layer_21": 0.138916015625, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.05810546875, "loss_aux_layer_4": 0.06103515625, "loss_aux_layer_5": 0.06256103515625, "loss_aux_layer_6": 0.06561279296875, "loss_aux_layer_7": 0.0634765625, "loss_aux_layer_8": 0.06298828125, "loss_aux_layer_9": 0.061767578125, "step": 3231, "total_loss": 0.6859483569860458 }, { "epoch": 0.6398732924173431, "grad_norm": 1.084661841392517, "learning_rate": 5e-05, "llm_loss": 0.5493271499872208, "loss": 2.5566, "loss_aux_layer_0": 0.017120361328125, "loss_aux_layer_1": 0.03558349609375, "loss_aux_layer_10": 0.0654296875, "loss_aux_layer_11": 0.06982421875, "loss_aux_layer_12": 0.074462890625, "loss_aux_layer_13": 0.080322265625, "loss_aux_layer_14": 0.08935546875, "loss_aux_layer_15": 0.098388671875, "loss_aux_layer_16": 0.1075439453125, "loss_aux_layer_17": 0.115478515625, "loss_aux_layer_18": 0.1239013671875, "loss_aux_layer_19": 0.127685546875, "loss_aux_layer_2": 0.05059814453125, "loss_aux_layer_20": 0.13525390625, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.06048583984375, "loss_aux_layer_4": 0.06317138671875, "loss_aux_layer_5": 0.06494140625, "loss_aux_layer_6": 0.0679931640625, "loss_aux_layer_7": 0.0655517578125, "loss_aux_layer_8": 0.06524658203125, "loss_aux_layer_9": 0.064208984375, "step": 3232, "total_loss": 0.6391529142856598 }, { "epoch": 0.6400712730152445, "grad_norm": 0.8914464712142944, "learning_rate": 5e-05, "llm_loss": 0.5184827446937561, "loss": 2.4282, "loss_aux_layer_0": 0.0167083740234375, "loss_aux_layer_1": 0.0357666015625, "loss_aux_layer_10": 0.065185546875, "loss_aux_layer_11": 0.069580078125, "loss_aux_layer_12": 0.07421875, "loss_aux_layer_13": 0.079833984375, "loss_aux_layer_14": 0.0885009765625, "loss_aux_layer_15": 0.0972900390625, "loss_aux_layer_16": 0.1068115234375, "loss_aux_layer_17": 0.1142578125, "loss_aux_layer_18": 0.12255859375, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.05010986328125, "loss_aux_layer_20": 0.132568359375, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.05999755859375, "loss_aux_layer_4": 0.0628662109375, "loss_aux_layer_5": 0.064453125, "loss_aux_layer_6": 0.0677490234375, "loss_aux_layer_7": 0.0655517578125, "loss_aux_layer_8": 0.06488037109375, "loss_aux_layer_9": 0.06353759765625, "step": 3233, "total_loss": 0.6070572137832642 }, { "epoch": 0.6402692536131459, "grad_norm": 1.1570383310317993, "learning_rate": 5e-05, "llm_loss": 0.5541711151599884, "loss": 2.5695, "loss_aux_layer_0": 0.017120361328125, "loss_aux_layer_1": 0.03546142578125, "loss_aux_layer_10": 0.064697265625, "loss_aux_layer_11": 0.0692138671875, "loss_aux_layer_12": 0.07421875, "loss_aux_layer_13": 0.079345703125, "loss_aux_layer_14": 0.0880126953125, "loss_aux_layer_15": 0.096923828125, "loss_aux_layer_16": 0.1065673828125, "loss_aux_layer_17": 0.1146240234375, "loss_aux_layer_18": 0.1224365234375, "loss_aux_layer_19": 0.12451171875, "loss_aux_layer_2": 0.049560546875, "loss_aux_layer_20": 0.1318359375, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.16064453125, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.0596923828125, "loss_aux_layer_4": 0.06231689453125, "loss_aux_layer_5": 0.0635986328125, "loss_aux_layer_6": 0.0667724609375, "loss_aux_layer_7": 0.06475830078125, "loss_aux_layer_8": 0.06427001953125, "loss_aux_layer_9": 0.06317138671875, "step": 3234, "total_loss": 0.6423752754926682 }, { "epoch": 0.6404672342110473, "grad_norm": 0.9945852160453796, "learning_rate": 5e-05, "llm_loss": 0.5937249884009361, "loss": 2.7239, "loss_aux_layer_0": 0.015655517578125, "loss_aux_layer_1": 0.0341796875, "loss_aux_layer_10": 0.0631103515625, "loss_aux_layer_11": 0.0672607421875, "loss_aux_layer_12": 0.0723876953125, "loss_aux_layer_13": 0.0780029296875, "loss_aux_layer_14": 0.08740234375, "loss_aux_layer_15": 0.0960693359375, "loss_aux_layer_16": 0.10595703125, "loss_aux_layer_17": 0.114013671875, "loss_aux_layer_18": 0.122314453125, "loss_aux_layer_19": 0.126220703125, "loss_aux_layer_2": 0.04754638671875, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.0577392578125, "loss_aux_layer_4": 0.060302734375, "loss_aux_layer_5": 0.0618896484375, "loss_aux_layer_6": 0.0653076171875, "loss_aux_layer_7": 0.0633544921875, "loss_aux_layer_8": 0.06280517578125, "loss_aux_layer_9": 0.06182861328125, "step": 3235, "total_loss": 0.6809861212968826 }, { "epoch": 0.6406652148089487, "grad_norm": 1.0689421892166138, "learning_rate": 5e-05, "llm_loss": 0.6015593558549881, "loss": 2.7375, "loss_aux_layer_0": 0.017425537109375, "loss_aux_layer_1": 0.0333251953125, "loss_aux_layer_10": 0.059326171875, "loss_aux_layer_11": 0.0631103515625, "loss_aux_layer_12": 0.0673828125, "loss_aux_layer_13": 0.0732421875, "loss_aux_layer_14": 0.0814208984375, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.098876953125, "loss_aux_layer_17": 0.10693359375, "loss_aux_layer_18": 0.115234375, "loss_aux_layer_19": 0.118408203125, "loss_aux_layer_2": 0.0452880859375, "loss_aux_layer_20": 0.1258544921875, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.05511474609375, "loss_aux_layer_4": 0.0576171875, "loss_aux_layer_5": 0.0589599609375, "loss_aux_layer_6": 0.06195068359375, "loss_aux_layer_7": 0.06011962890625, "loss_aux_layer_8": 0.0594482421875, "loss_aux_layer_9": 0.05804443359375, "step": 3236, "total_loss": 0.6843748241662979 }, { "epoch": 0.6408631954068501, "grad_norm": 0.9163230657577515, "learning_rate": 5e-05, "llm_loss": 0.5252045020461082, "loss": 2.4435, "loss_aux_layer_0": 0.0154876708984375, "loss_aux_layer_1": 0.0333251953125, "loss_aux_layer_10": 0.0615234375, "loss_aux_layer_11": 0.0654296875, "loss_aux_layer_12": 0.06982421875, "loss_aux_layer_13": 0.0755615234375, "loss_aux_layer_14": 0.0843505859375, "loss_aux_layer_15": 0.0928955078125, "loss_aux_layer_16": 0.103271484375, "loss_aux_layer_17": 0.11181640625, "loss_aux_layer_18": 0.1202392578125, "loss_aux_layer_19": 0.1239013671875, "loss_aux_layer_2": 0.04638671875, "loss_aux_layer_20": 0.1318359375, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.160888671875, "loss_aux_layer_23": 0.1982421875, "loss_aux_layer_3": 0.05615234375, "loss_aux_layer_4": 0.05877685546875, "loss_aux_layer_5": 0.06048583984375, "loss_aux_layer_6": 0.0635986328125, "loss_aux_layer_7": 0.0615234375, "loss_aux_layer_8": 0.061279296875, "loss_aux_layer_9": 0.06005859375, "step": 3237, "total_loss": 0.6108763217926025 }, { "epoch": 0.6410611760047515, "grad_norm": 0.8630931973457336, "learning_rate": 5e-05, "llm_loss": 0.5561895370483398, "loss": 2.5629, "loss_aux_layer_0": 0.016357421875, "loss_aux_layer_1": 0.033233642578125, "loss_aux_layer_10": 0.06011962890625, "loss_aux_layer_11": 0.06439208984375, "loss_aux_layer_12": 0.0689697265625, "loss_aux_layer_13": 0.074951171875, "loss_aux_layer_14": 0.083984375, "loss_aux_layer_15": 0.093017578125, "loss_aux_layer_16": 0.1025390625, "loss_aux_layer_17": 0.110595703125, "loss_aux_layer_18": 0.1192626953125, "loss_aux_layer_19": 0.1224365234375, "loss_aux_layer_2": 0.04559326171875, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.194580078125, "loss_aux_layer_3": 0.05523681640625, "loss_aux_layer_4": 0.05767822265625, "loss_aux_layer_5": 0.05938720703125, "loss_aux_layer_6": 0.06219482421875, "loss_aux_layer_7": 0.06005859375, "loss_aux_layer_8": 0.05975341796875, "loss_aux_layer_9": 0.0587158203125, "step": 3238, "total_loss": 0.6407148838043213 }, { "epoch": 0.641259156602653, "grad_norm": 1.0640747547149658, "learning_rate": 5e-05, "llm_loss": 0.559629887342453, "loss": 2.5736, "loss_aux_layer_0": 0.0159912109375, "loss_aux_layer_1": 0.033355712890625, "loss_aux_layer_10": 0.06024169921875, "loss_aux_layer_11": 0.0640869140625, "loss_aux_layer_12": 0.06866455078125, "loss_aux_layer_13": 0.07421875, "loss_aux_layer_14": 0.082763671875, "loss_aux_layer_15": 0.091552734375, "loss_aux_layer_16": 0.10107421875, "loss_aux_layer_17": 0.1092529296875, "loss_aux_layer_18": 0.1170654296875, "loss_aux_layer_19": 0.1207275390625, "loss_aux_layer_2": 0.045654296875, "loss_aux_layer_20": 0.1282958984375, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.05523681640625, "loss_aux_layer_4": 0.0577392578125, "loss_aux_layer_5": 0.0594482421875, "loss_aux_layer_6": 0.0625, "loss_aux_layer_7": 0.060546875, "loss_aux_layer_8": 0.06005859375, "loss_aux_layer_9": 0.0589599609375, "step": 3239, "total_loss": 0.64340740442276 }, { "epoch": 0.6414571372005543, "grad_norm": 1.0259395837783813, "learning_rate": 5e-05, "llm_loss": 0.6290106773376465, "loss": 2.8554, "loss_aux_layer_0": 0.0155181884765625, "loss_aux_layer_1": 0.03326416015625, "loss_aux_layer_10": 0.0604248046875, "loss_aux_layer_11": 0.064453125, "loss_aux_layer_12": 0.069091796875, "loss_aux_layer_13": 0.07470703125, "loss_aux_layer_14": 0.0838623046875, "loss_aux_layer_15": 0.0927734375, "loss_aux_layer_16": 0.1024169921875, "loss_aux_layer_17": 0.1107177734375, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.123291015625, "loss_aux_layer_2": 0.04571533203125, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.138916015625, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.05572509765625, "loss_aux_layer_4": 0.05816650390625, "loss_aux_layer_5": 0.059814453125, "loss_aux_layer_6": 0.06268310546875, "loss_aux_layer_7": 0.060546875, "loss_aux_layer_8": 0.06005859375, "loss_aux_layer_9": 0.05908203125, "step": 3240, "total_loss": 0.7138543128967285 }, { "epoch": 0.6416551177984557, "grad_norm": 1.20474112033844, "learning_rate": 5e-05, "llm_loss": 0.5585999339818954, "loss": 2.5772, "loss_aux_layer_0": 0.015960693359375, "loss_aux_layer_1": 0.034454345703125, "loss_aux_layer_10": 0.06182861328125, "loss_aux_layer_11": 0.06591796875, "loss_aux_layer_12": 0.07037353515625, "loss_aux_layer_13": 0.07586669921875, "loss_aux_layer_14": 0.0849609375, "loss_aux_layer_15": 0.0938720703125, "loss_aux_layer_16": 0.103271484375, "loss_aux_layer_17": 0.1109619140625, "loss_aux_layer_18": 0.1195068359375, "loss_aux_layer_19": 0.1234130859375, "loss_aux_layer_2": 0.0474853515625, "loss_aux_layer_20": 0.1314697265625, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.056884765625, "loss_aux_layer_4": 0.0592041015625, "loss_aux_layer_5": 0.06072998046875, "loss_aux_layer_6": 0.0633544921875, "loss_aux_layer_7": 0.0615234375, "loss_aux_layer_8": 0.06134033203125, "loss_aux_layer_9": 0.060302734375, "step": 3241, "total_loss": 0.6443099677562714 }, { "epoch": 0.6418530983963572, "grad_norm": 0.8555484414100647, "learning_rate": 5e-05, "llm_loss": 0.5480814874172211, "loss": 2.5393, "loss_aux_layer_0": 0.01531982421875, "loss_aux_layer_1": 0.03497314453125, "loss_aux_layer_10": 0.063720703125, "loss_aux_layer_11": 0.06787109375, "loss_aux_layer_12": 0.0726318359375, "loss_aux_layer_13": 0.0784912109375, "loss_aux_layer_14": 0.086669921875, "loss_aux_layer_15": 0.09521484375, "loss_aux_layer_16": 0.1043701171875, "loss_aux_layer_17": 0.1121826171875, "loss_aux_layer_18": 0.119873046875, "loss_aux_layer_19": 0.122314453125, "loss_aux_layer_2": 0.0489501953125, "loss_aux_layer_20": 0.1295166015625, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.0592041015625, "loss_aux_layer_4": 0.06195068359375, "loss_aux_layer_5": 0.0635986328125, "loss_aux_layer_6": 0.06622314453125, "loss_aux_layer_7": 0.064453125, "loss_aux_layer_8": 0.0638427734375, "loss_aux_layer_9": 0.0626220703125, "step": 3242, "total_loss": 0.6348244398832321 }, { "epoch": 0.6420510789942586, "grad_norm": 1.031801700592041, "learning_rate": 5e-05, "llm_loss": 0.5307598263025284, "loss": 2.4771, "loss_aux_layer_0": 0.01593017578125, "loss_aux_layer_1": 0.0352783203125, "loss_aux_layer_10": 0.06414794921875, "loss_aux_layer_11": 0.068359375, "loss_aux_layer_12": 0.0731201171875, "loss_aux_layer_13": 0.0791015625, "loss_aux_layer_14": 0.0882568359375, "loss_aux_layer_15": 0.0972900390625, "loss_aux_layer_16": 0.1075439453125, "loss_aux_layer_17": 0.1148681640625, "loss_aux_layer_18": 0.122802734375, "loss_aux_layer_19": 0.125732421875, "loss_aux_layer_2": 0.04925537109375, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.20361328125, "loss_aux_layer_3": 0.0592041015625, "loss_aux_layer_4": 0.06158447265625, "loss_aux_layer_5": 0.0633544921875, "loss_aux_layer_6": 0.066162109375, "loss_aux_layer_7": 0.0638427734375, "loss_aux_layer_8": 0.06341552734375, "loss_aux_layer_9": 0.0626220703125, "step": 3243, "total_loss": 0.6192780435085297 }, { "epoch": 0.6422490595921599, "grad_norm": 1.206060767173767, "learning_rate": 5e-05, "llm_loss": 0.5765514075756073, "loss": 2.6562, "loss_aux_layer_0": 0.01544189453125, "loss_aux_layer_1": 0.0350341796875, "loss_aux_layer_10": 0.063232421875, "loss_aux_layer_11": 0.0673828125, "loss_aux_layer_12": 0.0721435546875, "loss_aux_layer_13": 0.077880859375, "loss_aux_layer_14": 0.0870361328125, "loss_aux_layer_15": 0.095703125, "loss_aux_layer_16": 0.10546875, "loss_aux_layer_17": 0.1136474609375, "loss_aux_layer_18": 0.1214599609375, "loss_aux_layer_19": 0.125, "loss_aux_layer_2": 0.04815673828125, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.05816650390625, "loss_aux_layer_4": 0.060791015625, "loss_aux_layer_5": 0.06256103515625, "loss_aux_layer_6": 0.065673828125, "loss_aux_layer_7": 0.0635986328125, "loss_aux_layer_8": 0.06298828125, "loss_aux_layer_9": 0.06195068359375, "step": 3244, "total_loss": 0.6640589833259583 }, { "epoch": 0.6424470401900614, "grad_norm": 1.0005230903625488, "learning_rate": 5e-05, "llm_loss": 0.5789519473910332, "loss": 2.6619, "loss_aux_layer_0": 0.015625, "loss_aux_layer_1": 0.034423828125, "loss_aux_layer_10": 0.0625, "loss_aux_layer_11": 0.06622314453125, "loss_aux_layer_12": 0.07080078125, "loss_aux_layer_13": 0.076171875, "loss_aux_layer_14": 0.0850830078125, "loss_aux_layer_15": 0.0943603515625, "loss_aux_layer_16": 0.1041259765625, "loss_aux_layer_17": 0.112060546875, "loss_aux_layer_18": 0.1204833984375, "loss_aux_layer_19": 0.1239013671875, "loss_aux_layer_2": 0.0474853515625, "loss_aux_layer_20": 0.132568359375, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.19921875, "loss_aux_layer_3": 0.0572509765625, "loss_aux_layer_4": 0.059814453125, "loss_aux_layer_5": 0.06170654296875, "loss_aux_layer_6": 0.064697265625, "loss_aux_layer_7": 0.0625, "loss_aux_layer_8": 0.06243896484375, "loss_aux_layer_9": 0.06121826171875, "step": 3245, "total_loss": 0.6654811948537827 }, { "epoch": 0.6426450207879628, "grad_norm": 1.2708803415298462, "learning_rate": 5e-05, "llm_loss": 0.5393733382225037, "loss": 2.5056, "loss_aux_layer_0": 0.0153961181640625, "loss_aux_layer_1": 0.033721923828125, "loss_aux_layer_10": 0.06280517578125, "loss_aux_layer_11": 0.06689453125, "loss_aux_layer_12": 0.0714111328125, "loss_aux_layer_13": 0.0772705078125, "loss_aux_layer_14": 0.086181640625, "loss_aux_layer_15": 0.0950927734375, "loss_aux_layer_16": 0.104736328125, "loss_aux_layer_17": 0.1129150390625, "loss_aux_layer_18": 0.1217041015625, "loss_aux_layer_19": 0.12548828125, "loss_aux_layer_2": 0.0472412109375, "loss_aux_layer_20": 0.1337890625, "loss_aux_layer_21": 0.141357421875, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.057373046875, "loss_aux_layer_4": 0.05975341796875, "loss_aux_layer_5": 0.06146240234375, "loss_aux_layer_6": 0.064697265625, "loss_aux_layer_7": 0.06256103515625, "loss_aux_layer_8": 0.0623779296875, "loss_aux_layer_9": 0.06121826171875, "step": 3246, "total_loss": 0.6263882219791412 }, { "epoch": 0.6428430013858641, "grad_norm": 0.8962730169296265, "learning_rate": 5e-05, "llm_loss": 0.5395707488059998, "loss": 2.5043, "loss_aux_layer_0": 0.015594482421875, "loss_aux_layer_1": 0.035400390625, "loss_aux_layer_10": 0.06341552734375, "loss_aux_layer_11": 0.0677490234375, "loss_aux_layer_12": 0.0718994140625, "loss_aux_layer_13": 0.07763671875, "loss_aux_layer_14": 0.085693359375, "loss_aux_layer_15": 0.0943603515625, "loss_aux_layer_16": 0.1031494140625, "loss_aux_layer_17": 0.1109619140625, "loss_aux_layer_18": 0.1187744140625, "loss_aux_layer_19": 0.1219482421875, "loss_aux_layer_2": 0.0478515625, "loss_aux_layer_20": 0.13037109375, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.197998046875, "loss_aux_layer_3": 0.0579833984375, "loss_aux_layer_4": 0.06048583984375, "loss_aux_layer_5": 0.0621337890625, "loss_aux_layer_6": 0.06494140625, "loss_aux_layer_7": 0.063232421875, "loss_aux_layer_8": 0.06298828125, "loss_aux_layer_9": 0.0618896484375, "step": 3247, "total_loss": 0.6260642260313034 }, { "epoch": 0.6430409819837656, "grad_norm": 1.1585516929626465, "learning_rate": 5e-05, "llm_loss": 0.647014245390892, "loss": 2.933, "loss_aux_layer_0": 0.0164337158203125, "loss_aux_layer_1": 0.03570556640625, "loss_aux_layer_10": 0.06268310546875, "loss_aux_layer_11": 0.0665283203125, "loss_aux_layer_12": 0.0709228515625, "loss_aux_layer_13": 0.0760498046875, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.093505859375, "loss_aux_layer_16": 0.10302734375, "loss_aux_layer_17": 0.1109619140625, "loss_aux_layer_18": 0.118896484375, "loss_aux_layer_19": 0.1226806640625, "loss_aux_layer_2": 0.04840087890625, "loss_aux_layer_20": 0.13037109375, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.058349609375, "loss_aux_layer_4": 0.06103515625, "loss_aux_layer_5": 0.06243896484375, "loss_aux_layer_6": 0.06494140625, "loss_aux_layer_7": 0.062744140625, "loss_aux_layer_8": 0.06231689453125, "loss_aux_layer_9": 0.06109619140625, "step": 3248, "total_loss": 0.733244314789772 }, { "epoch": 0.643238962581667, "grad_norm": 0.6510619521141052, "learning_rate": 5e-05, "llm_loss": 0.49172861129045486, "loss": 2.3291, "loss_aux_layer_0": 0.0155181884765625, "loss_aux_layer_1": 0.03680419921875, "loss_aux_layer_10": 0.06671142578125, "loss_aux_layer_11": 0.0714111328125, "loss_aux_layer_12": 0.076416015625, "loss_aux_layer_13": 0.082275390625, "loss_aux_layer_14": 0.091064453125, "loss_aux_layer_15": 0.09912109375, "loss_aux_layer_16": 0.108154296875, "loss_aux_layer_17": 0.1156005859375, "loss_aux_layer_18": 0.1239013671875, "loss_aux_layer_19": 0.1263427734375, "loss_aux_layer_2": 0.05108642578125, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.166748046875, "loss_aux_layer_23": 0.20458984375, "loss_aux_layer_3": 0.061767578125, "loss_aux_layer_4": 0.06427001953125, "loss_aux_layer_5": 0.06591796875, "loss_aux_layer_6": 0.06884765625, "loss_aux_layer_7": 0.06683349609375, "loss_aux_layer_8": 0.0665283203125, "loss_aux_layer_9": 0.06524658203125, "step": 3249, "total_loss": 0.5822789520025253 }, { "epoch": 0.6434369431795685, "grad_norm": 1.0636494159698486, "learning_rate": 5e-05, "llm_loss": 0.5760001093149185, "loss": 2.6466, "loss_aux_layer_0": 0.0164794921875, "loss_aux_layer_1": 0.03558349609375, "loss_aux_layer_10": 0.062255859375, "loss_aux_layer_11": 0.0662841796875, "loss_aux_layer_12": 0.07080078125, "loss_aux_layer_13": 0.0765380859375, "loss_aux_layer_14": 0.0850830078125, "loss_aux_layer_15": 0.093505859375, "loss_aux_layer_16": 0.1026611328125, "loss_aux_layer_17": 0.110107421875, "loss_aux_layer_18": 0.1180419921875, "loss_aux_layer_19": 0.1212158203125, "loss_aux_layer_2": 0.04803466796875, "loss_aux_layer_20": 0.1290283203125, "loss_aux_layer_21": 0.136962890625, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.057861328125, "loss_aux_layer_4": 0.06024169921875, "loss_aux_layer_5": 0.0618896484375, "loss_aux_layer_6": 0.06494140625, "loss_aux_layer_7": 0.062744140625, "loss_aux_layer_8": 0.062255859375, "loss_aux_layer_9": 0.0609130859375, "step": 3250, "total_loss": 0.6616491973400116 }, { "epoch": 0.6436349237774698, "grad_norm": 0.9302506446838379, "learning_rate": 5e-05, "llm_loss": 0.5843480154871941, "loss": 2.6814, "loss_aux_layer_0": 0.015625, "loss_aux_layer_1": 0.03564453125, "loss_aux_layer_10": 0.06341552734375, "loss_aux_layer_11": 0.0672607421875, "loss_aux_layer_12": 0.0716552734375, "loss_aux_layer_13": 0.076904296875, "loss_aux_layer_14": 0.0850830078125, "loss_aux_layer_15": 0.093505859375, "loss_aux_layer_16": 0.1031494140625, "loss_aux_layer_17": 0.110595703125, "loss_aux_layer_18": 0.118408203125, "loss_aux_layer_19": 0.12158203125, "loss_aux_layer_2": 0.04931640625, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.0595703125, "loss_aux_layer_4": 0.06170654296875, "loss_aux_layer_5": 0.06317138671875, "loss_aux_layer_6": 0.06591796875, "loss_aux_layer_7": 0.0638427734375, "loss_aux_layer_8": 0.0634765625, "loss_aux_layer_9": 0.06231689453125, "step": 3251, "total_loss": 0.6703571230173111 }, { "epoch": 0.6438329043753712, "grad_norm": 1.2868430614471436, "learning_rate": 5e-05, "llm_loss": 0.5231392756104469, "loss": 2.4452, "loss_aux_layer_0": 0.01629638671875, "loss_aux_layer_1": 0.03533935546875, "loss_aux_layer_10": 0.06298828125, "loss_aux_layer_11": 0.0675048828125, "loss_aux_layer_12": 0.0723876953125, "loss_aux_layer_13": 0.0784912109375, "loss_aux_layer_14": 0.08740234375, "loss_aux_layer_15": 0.096435546875, "loss_aux_layer_16": 0.1064453125, "loss_aux_layer_17": 0.114501953125, "loss_aux_layer_18": 0.1234130859375, "loss_aux_layer_19": 0.1270751953125, "loss_aux_layer_2": 0.04791259765625, "loss_aux_layer_20": 0.13525390625, "loss_aux_layer_21": 0.14306640625, "loss_aux_layer_22": 0.1650390625, "loss_aux_layer_23": 0.203125, "loss_aux_layer_3": 0.05780029296875, "loss_aux_layer_4": 0.060302734375, "loss_aux_layer_5": 0.062255859375, "loss_aux_layer_6": 0.0653076171875, "loss_aux_layer_7": 0.0631103515625, "loss_aux_layer_8": 0.0626220703125, "loss_aux_layer_9": 0.06158447265625, "step": 3252, "total_loss": 0.611298680305481 }, { "epoch": 0.6440308849732727, "grad_norm": 0.8773806691169739, "learning_rate": 5e-05, "llm_loss": 0.5173555165529251, "loss": 2.3987, "loss_aux_layer_0": 0.01629638671875, "loss_aux_layer_1": 0.031707763671875, "loss_aux_layer_10": 0.0577392578125, "loss_aux_layer_11": 0.06146240234375, "loss_aux_layer_12": 0.0657958984375, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.079833984375, "loss_aux_layer_15": 0.089111328125, "loss_aux_layer_16": 0.0985107421875, "loss_aux_layer_17": 0.10693359375, "loss_aux_layer_18": 0.1156005859375, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.04345703125, "loss_aux_layer_20": 0.1285400390625, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.197021484375, "loss_aux_layer_3": 0.0526123046875, "loss_aux_layer_4": 0.05511474609375, "loss_aux_layer_5": 0.056884765625, "loss_aux_layer_6": 0.0594482421875, "loss_aux_layer_7": 0.05743408203125, "loss_aux_layer_8": 0.0570068359375, "loss_aux_layer_9": 0.05621337890625, "step": 3253, "total_loss": 0.5996652394533157 }, { "epoch": 0.644228865571174, "grad_norm": 1.0458428859710693, "learning_rate": 5e-05, "llm_loss": 0.5900265127420425, "loss": 2.7064, "loss_aux_layer_0": 0.01580810546875, "loss_aux_layer_1": 0.035186767578125, "loss_aux_layer_10": 0.06304931640625, "loss_aux_layer_11": 0.06756591796875, "loss_aux_layer_12": 0.072265625, "loss_aux_layer_13": 0.077880859375, "loss_aux_layer_14": 0.0867919921875, "loss_aux_layer_15": 0.0955810546875, "loss_aux_layer_16": 0.1048583984375, "loss_aux_layer_17": 0.1123046875, "loss_aux_layer_18": 0.1204833984375, "loss_aux_layer_19": 0.1231689453125, "loss_aux_layer_2": 0.0482177734375, "loss_aux_layer_20": 0.1307373046875, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.05828857421875, "loss_aux_layer_4": 0.06085205078125, "loss_aux_layer_5": 0.0623779296875, "loss_aux_layer_6": 0.065185546875, "loss_aux_layer_7": 0.0634765625, "loss_aux_layer_8": 0.0626220703125, "loss_aux_layer_9": 0.0616455078125, "step": 3254, "total_loss": 0.6766079664230347 }, { "epoch": 0.6444268461690754, "grad_norm": 1.0150905847549438, "learning_rate": 5e-05, "llm_loss": 0.5982857346534729, "loss": 2.7337, "loss_aux_layer_0": 0.015777587890625, "loss_aux_layer_1": 0.0338134765625, "loss_aux_layer_10": 0.0614013671875, "loss_aux_layer_11": 0.0653076171875, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.0758056640625, "loss_aux_layer_14": 0.08447265625, "loss_aux_layer_15": 0.0931396484375, "loss_aux_layer_16": 0.1026611328125, "loss_aux_layer_17": 0.1104736328125, "loss_aux_layer_18": 0.118408203125, "loss_aux_layer_19": 0.121826171875, "loss_aux_layer_2": 0.04638671875, "loss_aux_layer_20": 0.129638671875, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.05615234375, "loss_aux_layer_4": 0.0589599609375, "loss_aux_layer_5": 0.06036376953125, "loss_aux_layer_6": 0.0634765625, "loss_aux_layer_7": 0.06170654296875, "loss_aux_layer_8": 0.061279296875, "loss_aux_layer_9": 0.06024169921875, "step": 3255, "total_loss": 0.6834357380867004 }, { "epoch": 0.6446248267669769, "grad_norm": 0.9707311391830444, "learning_rate": 5e-05, "llm_loss": 0.555298462510109, "loss": 2.5664, "loss_aux_layer_0": 0.0156097412109375, "loss_aux_layer_1": 0.034698486328125, "loss_aux_layer_10": 0.0626220703125, "loss_aux_layer_11": 0.06707763671875, "loss_aux_layer_12": 0.0718994140625, "loss_aux_layer_13": 0.07763671875, "loss_aux_layer_14": 0.0860595703125, "loss_aux_layer_15": 0.0943603515625, "loss_aux_layer_16": 0.103515625, "loss_aux_layer_17": 0.1114501953125, "loss_aux_layer_18": 0.1199951171875, "loss_aux_layer_19": 0.123046875, "loss_aux_layer_2": 0.0474853515625, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.0577392578125, "loss_aux_layer_4": 0.0601806640625, "loss_aux_layer_5": 0.06195068359375, "loss_aux_layer_6": 0.06500244140625, "loss_aux_layer_7": 0.0631103515625, "loss_aux_layer_8": 0.062744140625, "loss_aux_layer_9": 0.06146240234375, "step": 3256, "total_loss": 0.6415947079658508 }, { "epoch": 0.6448228073648783, "grad_norm": 0.874225914478302, "learning_rate": 5e-05, "llm_loss": 0.5630401819944382, "loss": 2.6017, "loss_aux_layer_0": 0.0163116455078125, "loss_aux_layer_1": 0.034912109375, "loss_aux_layer_10": 0.0640869140625, "loss_aux_layer_11": 0.068603515625, "loss_aux_layer_12": 0.0733642578125, "loss_aux_layer_13": 0.078857421875, "loss_aux_layer_14": 0.08740234375, "loss_aux_layer_15": 0.095703125, "loss_aux_layer_16": 0.1051025390625, "loss_aux_layer_17": 0.112548828125, "loss_aux_layer_18": 0.1202392578125, "loss_aux_layer_19": 0.1236572265625, "loss_aux_layer_2": 0.0484619140625, "loss_aux_layer_20": 0.131103515625, "loss_aux_layer_21": 0.138916015625, "loss_aux_layer_22": 0.16064453125, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.058349609375, "loss_aux_layer_4": 0.0611572265625, "loss_aux_layer_5": 0.0628662109375, "loss_aux_layer_6": 0.06610107421875, "loss_aux_layer_7": 0.064208984375, "loss_aux_layer_8": 0.0634765625, "loss_aux_layer_9": 0.06268310546875, "step": 3257, "total_loss": 0.6504215598106384 }, { "epoch": 0.6450207879627796, "grad_norm": 1.054987907409668, "learning_rate": 5e-05, "llm_loss": 0.5671238377690315, "loss": 2.613, "loss_aux_layer_0": 0.015960693359375, "loss_aux_layer_1": 0.03515625, "loss_aux_layer_10": 0.06243896484375, "loss_aux_layer_11": 0.0665283203125, "loss_aux_layer_12": 0.071044921875, "loss_aux_layer_13": 0.076416015625, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.0931396484375, "loss_aux_layer_16": 0.102783203125, "loss_aux_layer_17": 0.1102294921875, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.12255859375, "loss_aux_layer_2": 0.0482177734375, "loss_aux_layer_20": 0.13037109375, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.16064453125, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.057861328125, "loss_aux_layer_4": 0.0601806640625, "loss_aux_layer_5": 0.061767578125, "loss_aux_layer_6": 0.0645751953125, "loss_aux_layer_7": 0.06280517578125, "loss_aux_layer_8": 0.06207275390625, "loss_aux_layer_9": 0.0611572265625, "step": 3258, "total_loss": 0.6532588750123978 }, { "epoch": 0.645218768560681, "grad_norm": 0.9574115872383118, "learning_rate": 5e-05, "llm_loss": 0.5799346715211868, "loss": 2.6571, "loss_aux_layer_0": 0.0150909423828125, "loss_aux_layer_1": 0.03387451171875, "loss_aux_layer_10": 0.0616455078125, "loss_aux_layer_11": 0.0655517578125, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.075439453125, "loss_aux_layer_14": 0.0838623046875, "loss_aux_layer_15": 0.0919189453125, "loss_aux_layer_16": 0.1011962890625, "loss_aux_layer_17": 0.1090087890625, "loss_aux_layer_18": 0.1171875, "loss_aux_layer_19": 0.119873046875, "loss_aux_layer_2": 0.04669189453125, "loss_aux_layer_20": 0.1270751953125, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.056640625, "loss_aux_layer_4": 0.05908203125, "loss_aux_layer_5": 0.060791015625, "loss_aux_layer_6": 0.0635986328125, "loss_aux_layer_7": 0.0618896484375, "loss_aux_layer_8": 0.06134033203125, "loss_aux_layer_9": 0.06011962890625, "step": 3259, "total_loss": 0.6642783433198929 }, { "epoch": 0.6454167491585825, "grad_norm": 1.0042237043380737, "learning_rate": 5e-05, "llm_loss": 0.5699363574385643, "loss": 2.6203, "loss_aux_layer_0": 0.016021728515625, "loss_aux_layer_1": 0.03509521484375, "loss_aux_layer_10": 0.06201171875, "loss_aux_layer_11": 0.066162109375, "loss_aux_layer_12": 0.0706787109375, "loss_aux_layer_13": 0.075927734375, "loss_aux_layer_14": 0.0841064453125, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.101318359375, "loss_aux_layer_17": 0.1092529296875, "loss_aux_layer_18": 0.1170654296875, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.04833984375, "loss_aux_layer_20": 0.128173828125, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.05841064453125, "loss_aux_layer_4": 0.0606689453125, "loss_aux_layer_5": 0.0621337890625, "loss_aux_layer_6": 0.0648193359375, "loss_aux_layer_7": 0.06292724609375, "loss_aux_layer_8": 0.06219482421875, "loss_aux_layer_9": 0.060791015625, "step": 3260, "total_loss": 0.6550773084163666 }, { "epoch": 0.6456147297564838, "grad_norm": 0.9865071177482605, "learning_rate": 5e-05, "llm_loss": 0.6179729253053665, "loss": 2.8155, "loss_aux_layer_0": 0.0165557861328125, "loss_aux_layer_1": 0.03582763671875, "loss_aux_layer_10": 0.06256103515625, "loss_aux_layer_11": 0.06658935546875, "loss_aux_layer_12": 0.071044921875, "loss_aux_layer_13": 0.076171875, "loss_aux_layer_14": 0.08447265625, "loss_aux_layer_15": 0.0928955078125, "loss_aux_layer_16": 0.102294921875, "loss_aux_layer_17": 0.110107421875, "loss_aux_layer_18": 0.118408203125, "loss_aux_layer_19": 0.1212158203125, "loss_aux_layer_2": 0.04876708984375, "loss_aux_layer_20": 0.12939453125, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.196044921875, "loss_aux_layer_3": 0.05853271484375, "loss_aux_layer_4": 0.0609130859375, "loss_aux_layer_5": 0.0623779296875, "loss_aux_layer_6": 0.06488037109375, "loss_aux_layer_7": 0.06292724609375, "loss_aux_layer_8": 0.06243896484375, "loss_aux_layer_9": 0.06134033203125, "step": 3261, "total_loss": 0.7038698345422745 }, { "epoch": 0.6458127103543853, "grad_norm": 1.003319501876831, "learning_rate": 5e-05, "llm_loss": 0.5741363316774368, "loss": 2.6635, "loss_aux_layer_0": 0.0160369873046875, "loss_aux_layer_1": 0.03778076171875, "loss_aux_layer_10": 0.0679931640625, "loss_aux_layer_11": 0.0723876953125, "loss_aux_layer_12": 0.077392578125, "loss_aux_layer_13": 0.0830078125, "loss_aux_layer_14": 0.0919189453125, "loss_aux_layer_15": 0.100341796875, "loss_aux_layer_16": 0.1099853515625, "loss_aux_layer_17": 0.1171875, "loss_aux_layer_18": 0.12548828125, "loss_aux_layer_19": 0.1287841796875, "loss_aux_layer_2": 0.052490234375, "loss_aux_layer_20": 0.13623046875, "loss_aux_layer_21": 0.14453125, "loss_aux_layer_22": 0.165771484375, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.0634765625, "loss_aux_layer_4": 0.0660400390625, "loss_aux_layer_5": 0.06793212890625, "loss_aux_layer_6": 0.0709228515625, "loss_aux_layer_7": 0.0687255859375, "loss_aux_layer_8": 0.068115234375, "loss_aux_layer_9": 0.066650390625, "step": 3262, "total_loss": 0.6658840179443359 }, { "epoch": 0.6460106909522867, "grad_norm": 0.9362697005271912, "learning_rate": 5e-05, "llm_loss": 0.538045659661293, "loss": 2.4993, "loss_aux_layer_0": 0.016632080078125, "loss_aux_layer_1": 0.034423828125, "loss_aux_layer_10": 0.0628662109375, "loss_aux_layer_11": 0.06689453125, "loss_aux_layer_12": 0.0714111328125, "loss_aux_layer_13": 0.0767822265625, "loss_aux_layer_14": 0.08544921875, "loss_aux_layer_15": 0.0941162109375, "loss_aux_layer_16": 0.1036376953125, "loss_aux_layer_17": 0.1116943359375, "loss_aux_layer_18": 0.1199951171875, "loss_aux_layer_19": 0.1231689453125, "loss_aux_layer_2": 0.0484619140625, "loss_aux_layer_20": 0.1309814453125, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.19921875, "loss_aux_layer_3": 0.0589599609375, "loss_aux_layer_4": 0.0615234375, "loss_aux_layer_5": 0.06292724609375, "loss_aux_layer_6": 0.0655517578125, "loss_aux_layer_7": 0.06365966796875, "loss_aux_layer_8": 0.06298828125, "loss_aux_layer_9": 0.06158447265625, "step": 3263, "total_loss": 0.6248319894075394 }, { "epoch": 0.6462086715501881, "grad_norm": 1.0377165079116821, "learning_rate": 5e-05, "llm_loss": 0.5348226726055145, "loss": 2.4946, "loss_aux_layer_0": 0.016693115234375, "loss_aux_layer_1": 0.0355224609375, "loss_aux_layer_10": 0.0640869140625, "loss_aux_layer_11": 0.068603515625, "loss_aux_layer_12": 0.0732421875, "loss_aux_layer_13": 0.0789794921875, "loss_aux_layer_14": 0.087890625, "loss_aux_layer_15": 0.0968017578125, "loss_aux_layer_16": 0.106201171875, "loss_aux_layer_17": 0.1146240234375, "loss_aux_layer_18": 0.123046875, "loss_aux_layer_19": 0.1270751953125, "loss_aux_layer_2": 0.0498046875, "loss_aux_layer_20": 0.134521484375, "loss_aux_layer_21": 0.14306640625, "loss_aux_layer_22": 0.164306640625, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.060302734375, "loss_aux_layer_4": 0.0626220703125, "loss_aux_layer_5": 0.06402587890625, "loss_aux_layer_6": 0.06683349609375, "loss_aux_layer_7": 0.06494140625, "loss_aux_layer_8": 0.06402587890625, "loss_aux_layer_9": 0.06280517578125, "step": 3264, "total_loss": 0.6236608773469925 }, { "epoch": 0.6464066521480895, "grad_norm": 1.1725165843963623, "learning_rate": 5e-05, "llm_loss": 0.49600864946842194, "loss": 2.3279, "loss_aux_layer_0": 0.01751708984375, "loss_aux_layer_1": 0.03485107421875, "loss_aux_layer_10": 0.0625, "loss_aux_layer_11": 0.066650390625, "loss_aux_layer_12": 0.0711669921875, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.08447265625, "loss_aux_layer_15": 0.0927734375, "loss_aux_layer_16": 0.102294921875, "loss_aux_layer_17": 0.1103515625, "loss_aux_layer_18": 0.118408203125, "loss_aux_layer_19": 0.12158203125, "loss_aux_layer_2": 0.0482177734375, "loss_aux_layer_20": 0.129638671875, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.05816650390625, "loss_aux_layer_4": 0.06060791015625, "loss_aux_layer_5": 0.06207275390625, "loss_aux_layer_6": 0.06494140625, "loss_aux_layer_7": 0.06280517578125, "loss_aux_layer_8": 0.06243896484375, "loss_aux_layer_9": 0.06109619140625, "step": 3265, "total_loss": 0.581974670290947 }, { "epoch": 0.6466046327459909, "grad_norm": 0.8602327108383179, "learning_rate": 5e-05, "llm_loss": 0.6734252423048019, "loss": 3.0389, "loss_aux_layer_0": 0.0161895751953125, "loss_aux_layer_1": 0.03387451171875, "loss_aux_layer_10": 0.06243896484375, "loss_aux_layer_11": 0.0667724609375, "loss_aux_layer_12": 0.0716552734375, "loss_aux_layer_13": 0.0775146484375, "loss_aux_layer_14": 0.0865478515625, "loss_aux_layer_15": 0.0953369140625, "loss_aux_layer_16": 0.1048583984375, "loss_aux_layer_17": 0.113037109375, "loss_aux_layer_18": 0.1207275390625, "loss_aux_layer_19": 0.1236572265625, "loss_aux_layer_2": 0.04718017578125, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.05712890625, "loss_aux_layer_4": 0.059814453125, "loss_aux_layer_5": 0.06158447265625, "loss_aux_layer_6": 0.064697265625, "loss_aux_layer_7": 0.0623779296875, "loss_aux_layer_8": 0.06219482421875, "loss_aux_layer_9": 0.06103515625, "step": 3266, "total_loss": 0.7597170621156693 }, { "epoch": 0.6468026133438923, "grad_norm": 1.2701070308685303, "learning_rate": 5e-05, "llm_loss": 0.6423828452825546, "loss": 2.9075, "loss_aux_layer_0": 0.01763916015625, "loss_aux_layer_1": 0.034149169921875, "loss_aux_layer_10": 0.06011962890625, "loss_aux_layer_11": 0.064208984375, "loss_aux_layer_12": 0.068603515625, "loss_aux_layer_13": 0.0740966796875, "loss_aux_layer_14": 0.0826416015625, "loss_aux_layer_15": 0.0911865234375, "loss_aux_layer_16": 0.1007080078125, "loss_aux_layer_17": 0.10888671875, "loss_aux_layer_18": 0.1170654296875, "loss_aux_layer_19": 0.120849609375, "loss_aux_layer_2": 0.0472412109375, "loss_aux_layer_20": 0.1287841796875, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.05670166015625, "loss_aux_layer_4": 0.0589599609375, "loss_aux_layer_5": 0.060302734375, "loss_aux_layer_6": 0.0628662109375, "loss_aux_layer_7": 0.060546875, "loss_aux_layer_8": 0.05999755859375, "loss_aux_layer_9": 0.058837890625, "step": 3267, "total_loss": 0.7268633097410202 }, { "epoch": 0.6470005939417937, "grad_norm": 1.0889058113098145, "learning_rate": 5e-05, "llm_loss": 0.6157092899084091, "loss": 2.8013, "loss_aux_layer_0": 0.0159149169921875, "loss_aux_layer_1": 0.035125732421875, "loss_aux_layer_10": 0.0618896484375, "loss_aux_layer_11": 0.06610107421875, "loss_aux_layer_12": 0.0704345703125, "loss_aux_layer_13": 0.0753173828125, "loss_aux_layer_14": 0.0833740234375, "loss_aux_layer_15": 0.09130859375, "loss_aux_layer_16": 0.10009765625, "loss_aux_layer_17": 0.108154296875, "loss_aux_layer_18": 0.1162109375, "loss_aux_layer_19": 0.119140625, "loss_aux_layer_2": 0.0478515625, "loss_aux_layer_20": 0.126953125, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.0577392578125, "loss_aux_layer_4": 0.0604248046875, "loss_aux_layer_5": 0.06207275390625, "loss_aux_layer_6": 0.0648193359375, "loss_aux_layer_7": 0.0626220703125, "loss_aux_layer_8": 0.06207275390625, "loss_aux_layer_9": 0.0606689453125, "step": 3268, "total_loss": 0.7003134042024612 }, { "epoch": 0.6471985745396951, "grad_norm": 1.180232286453247, "learning_rate": 5e-05, "llm_loss": 0.6043761968612671, "loss": 2.7557, "loss_aux_layer_0": 0.017425537109375, "loss_aux_layer_1": 0.034759521484375, "loss_aux_layer_10": 0.06195068359375, "loss_aux_layer_11": 0.065673828125, "loss_aux_layer_12": 0.0697021484375, "loss_aux_layer_13": 0.0750732421875, "loss_aux_layer_14": 0.0828857421875, "loss_aux_layer_15": 0.0906982421875, "loss_aux_layer_16": 0.099365234375, "loss_aux_layer_17": 0.1072998046875, "loss_aux_layer_18": 0.115478515625, "loss_aux_layer_19": 0.1190185546875, "loss_aux_layer_2": 0.04742431640625, "loss_aux_layer_20": 0.127197265625, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.1943359375, "loss_aux_layer_3": 0.05712890625, "loss_aux_layer_4": 0.05975341796875, "loss_aux_layer_5": 0.06146240234375, "loss_aux_layer_6": 0.0645751953125, "loss_aux_layer_7": 0.0623779296875, "loss_aux_layer_8": 0.06182861328125, "loss_aux_layer_9": 0.06060791015625, "step": 3269, "total_loss": 0.6889303922653198 }, { "epoch": 0.6473965551375965, "grad_norm": 0.8494411706924438, "learning_rate": 5e-05, "llm_loss": 0.6092684715986252, "loss": 2.7889, "loss_aux_layer_0": 0.0154266357421875, "loss_aux_layer_1": 0.0364990234375, "loss_aux_layer_10": 0.06414794921875, "loss_aux_layer_11": 0.0687255859375, "loss_aux_layer_12": 0.073486328125, "loss_aux_layer_13": 0.0792236328125, "loss_aux_layer_14": 0.0877685546875, "loss_aux_layer_15": 0.0963134765625, "loss_aux_layer_16": 0.106201171875, "loss_aux_layer_17": 0.1136474609375, "loss_aux_layer_18": 0.121826171875, "loss_aux_layer_19": 0.1243896484375, "loss_aux_layer_2": 0.05035400390625, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.138916015625, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.06036376953125, "loss_aux_layer_4": 0.06268310546875, "loss_aux_layer_5": 0.06402587890625, "loss_aux_layer_6": 0.06719970703125, "loss_aux_layer_7": 0.06494140625, "loss_aux_layer_8": 0.064208984375, "loss_aux_layer_9": 0.0628662109375, "step": 3270, "total_loss": 0.6972136944532394 }, { "epoch": 0.647594535735498, "grad_norm": 1.1028587818145752, "learning_rate": 5e-05, "llm_loss": 0.5937681943178177, "loss": 2.7137, "loss_aux_layer_0": 0.016876220703125, "loss_aux_layer_1": 0.03399658203125, "loss_aux_layer_10": 0.060546875, "loss_aux_layer_11": 0.06451416015625, "loss_aux_layer_12": 0.0692138671875, "loss_aux_layer_13": 0.0748291015625, "loss_aux_layer_14": 0.0833740234375, "loss_aux_layer_15": 0.092041015625, "loss_aux_layer_16": 0.1014404296875, "loss_aux_layer_17": 0.1092529296875, "loss_aux_layer_18": 0.11767578125, "loss_aux_layer_19": 0.1207275390625, "loss_aux_layer_2": 0.04718017578125, "loss_aux_layer_20": 0.128662109375, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.0567626953125, "loss_aux_layer_4": 0.05889892578125, "loss_aux_layer_5": 0.06036376953125, "loss_aux_layer_6": 0.06317138671875, "loss_aux_layer_7": 0.06085205078125, "loss_aux_layer_8": 0.06036376953125, "loss_aux_layer_9": 0.059326171875, "step": 3271, "total_loss": 0.6784227192401886 }, { "epoch": 0.6477925163333993, "grad_norm": 0.8689622282981873, "learning_rate": 5e-05, "llm_loss": 0.5789402723312378, "loss": 2.6631, "loss_aux_layer_0": 0.016448974609375, "loss_aux_layer_1": 0.035003662109375, "loss_aux_layer_10": 0.0628662109375, "loss_aux_layer_11": 0.0672607421875, "loss_aux_layer_12": 0.0721435546875, "loss_aux_layer_13": 0.0777587890625, "loss_aux_layer_14": 0.0865478515625, "loss_aux_layer_15": 0.09521484375, "loss_aux_layer_16": 0.1046142578125, "loss_aux_layer_17": 0.1129150390625, "loss_aux_layer_18": 0.120849609375, "loss_aux_layer_19": 0.123291015625, "loss_aux_layer_2": 0.048095703125, "loss_aux_layer_20": 0.1312255859375, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.197265625, "loss_aux_layer_3": 0.05780029296875, "loss_aux_layer_4": 0.060546875, "loss_aux_layer_5": 0.06207275390625, "loss_aux_layer_6": 0.065185546875, "loss_aux_layer_7": 0.0631103515625, "loss_aux_layer_8": 0.06280517578125, "loss_aux_layer_9": 0.06158447265625, "step": 3272, "total_loss": 0.6657751649618149 }, { "epoch": 0.6479904969313007, "grad_norm": 0.882270872592926, "learning_rate": 5e-05, "llm_loss": 0.5417878776788712, "loss": 2.5064, "loss_aux_layer_0": 0.01678466796875, "loss_aux_layer_1": 0.03485107421875, "loss_aux_layer_10": 0.06048583984375, "loss_aux_layer_11": 0.06439208984375, "loss_aux_layer_12": 0.0689697265625, "loss_aux_layer_13": 0.07470703125, "loss_aux_layer_14": 0.0830078125, "loss_aux_layer_15": 0.0919189453125, "loss_aux_layer_16": 0.1014404296875, "loss_aux_layer_17": 0.1090087890625, "loss_aux_layer_18": 0.117431640625, "loss_aux_layer_19": 0.1209716796875, "loss_aux_layer_2": 0.04803466796875, "loss_aux_layer_20": 0.129150390625, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.196044921875, "loss_aux_layer_3": 0.057373046875, "loss_aux_layer_4": 0.0595703125, "loss_aux_layer_5": 0.06097412109375, "loss_aux_layer_6": 0.0633544921875, "loss_aux_layer_7": 0.061279296875, "loss_aux_layer_8": 0.06085205078125, "loss_aux_layer_9": 0.05950927734375, "step": 3273, "total_loss": 0.6266111731529236 }, { "epoch": 0.6481884775292022, "grad_norm": 0.7666045427322388, "learning_rate": 5e-05, "llm_loss": 0.5726488754153252, "loss": 2.6401, "loss_aux_layer_0": 0.0161590576171875, "loss_aux_layer_1": 0.03662109375, "loss_aux_layer_10": 0.06414794921875, "loss_aux_layer_11": 0.06854248046875, "loss_aux_layer_12": 0.0731201171875, "loss_aux_layer_13": 0.07861328125, "loss_aux_layer_14": 0.0867919921875, "loss_aux_layer_15": 0.0946044921875, "loss_aux_layer_16": 0.103515625, "loss_aux_layer_17": 0.111083984375, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.121826171875, "loss_aux_layer_2": 0.050048828125, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.06024169921875, "loss_aux_layer_4": 0.062744140625, "loss_aux_layer_5": 0.064208984375, "loss_aux_layer_6": 0.06732177734375, "loss_aux_layer_7": 0.06512451171875, "loss_aux_layer_8": 0.064453125, "loss_aux_layer_9": 0.0628662109375, "step": 3274, "total_loss": 0.6600195467472076 }, { "epoch": 0.6483864581271035, "grad_norm": 1.0566388368606567, "learning_rate": 5e-05, "llm_loss": 0.6556382030248642, "loss": 2.9552, "loss_aux_layer_0": 0.0153045654296875, "loss_aux_layer_1": 0.03314208984375, "loss_aux_layer_10": 0.05938720703125, "loss_aux_layer_11": 0.063232421875, "loss_aux_layer_12": 0.067626953125, "loss_aux_layer_13": 0.0728759765625, "loss_aux_layer_14": 0.0814208984375, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.099853515625, "loss_aux_layer_17": 0.10791015625, "loss_aux_layer_18": 0.1163330078125, "loss_aux_layer_19": 0.1207275390625, "loss_aux_layer_2": 0.045166015625, "loss_aux_layer_20": 0.1287841796875, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.193115234375, "loss_aux_layer_3": 0.05499267578125, "loss_aux_layer_4": 0.05706787109375, "loss_aux_layer_5": 0.058837890625, "loss_aux_layer_6": 0.0615234375, "loss_aux_layer_7": 0.05926513671875, "loss_aux_layer_8": 0.05877685546875, "loss_aux_layer_9": 0.057861328125, "step": 3275, "total_loss": 0.7387986034154892 }, { "epoch": 0.6485844387250049, "grad_norm": 0.7277897000312805, "learning_rate": 5e-05, "llm_loss": 0.5388428792357445, "loss": 2.4971, "loss_aux_layer_0": 0.015411376953125, "loss_aux_layer_1": 0.03570556640625, "loss_aux_layer_10": 0.0631103515625, "loss_aux_layer_11": 0.0672607421875, "loss_aux_layer_12": 0.07177734375, "loss_aux_layer_13": 0.0771484375, "loss_aux_layer_14": 0.0848388671875, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.1007080078125, "loss_aux_layer_17": 0.10791015625, "loss_aux_layer_18": 0.1156005859375, "loss_aux_layer_19": 0.11865234375, "loss_aux_layer_2": 0.04864501953125, "loss_aux_layer_20": 0.1260986328125, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.19482421875, "loss_aux_layer_3": 0.05902099609375, "loss_aux_layer_4": 0.06121826171875, "loss_aux_layer_5": 0.06298828125, "loss_aux_layer_6": 0.06573486328125, "loss_aux_layer_7": 0.06365966796875, "loss_aux_layer_8": 0.06317138671875, "loss_aux_layer_9": 0.06182861328125, "step": 3276, "total_loss": 0.6242823302745819 }, { "epoch": 0.6487824193229064, "grad_norm": 0.8066539764404297, "learning_rate": 5e-05, "llm_loss": 0.5295125246047974, "loss": 2.4692, "loss_aux_layer_0": 0.0146026611328125, "loss_aux_layer_1": 0.03668212890625, "loss_aux_layer_10": 0.0655517578125, "loss_aux_layer_11": 0.0699462890625, "loss_aux_layer_12": 0.074462890625, "loss_aux_layer_13": 0.080078125, "loss_aux_layer_14": 0.0882568359375, "loss_aux_layer_15": 0.0968017578125, "loss_aux_layer_16": 0.1058349609375, "loss_aux_layer_17": 0.1131591796875, "loss_aux_layer_18": 0.1209716796875, "loss_aux_layer_19": 0.122802734375, "loss_aux_layer_2": 0.04986572265625, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.0604248046875, "loss_aux_layer_4": 0.06304931640625, "loss_aux_layer_5": 0.064453125, "loss_aux_layer_6": 0.0675048828125, "loss_aux_layer_7": 0.06561279296875, "loss_aux_layer_8": 0.0650634765625, "loss_aux_layer_9": 0.06390380859375, "step": 3277, "total_loss": 0.6173001751303673 }, { "epoch": 0.6489803999208078, "grad_norm": 1.0048781633377075, "learning_rate": 5e-05, "llm_loss": 0.5112271010875702, "loss": 2.386, "loss_aux_layer_0": 0.0162353515625, "loss_aux_layer_1": 0.03411865234375, "loss_aux_layer_10": 0.0626220703125, "loss_aux_layer_11": 0.066650390625, "loss_aux_layer_12": 0.0711669921875, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.0849609375, "loss_aux_layer_15": 0.093017578125, "loss_aux_layer_16": 0.1021728515625, "loss_aux_layer_17": 0.1094970703125, "loss_aux_layer_18": 0.1173095703125, "loss_aux_layer_19": 0.1204833984375, "loss_aux_layer_2": 0.04754638671875, "loss_aux_layer_20": 0.1278076171875, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.0572509765625, "loss_aux_layer_4": 0.05963134765625, "loss_aux_layer_5": 0.06109619140625, "loss_aux_layer_6": 0.0640869140625, "loss_aux_layer_7": 0.06219482421875, "loss_aux_layer_8": 0.06207275390625, "loss_aux_layer_9": 0.0611572265625, "step": 3278, "total_loss": 0.5964886844158173 }, { "epoch": 0.6491783805187091, "grad_norm": 0.7841517925262451, "learning_rate": 5e-05, "llm_loss": 0.569513127207756, "loss": 2.6306, "loss_aux_layer_0": 0.016143798828125, "loss_aux_layer_1": 0.0364990234375, "loss_aux_layer_10": 0.06524658203125, "loss_aux_layer_11": 0.069580078125, "loss_aux_layer_12": 0.073974609375, "loss_aux_layer_13": 0.0799560546875, "loss_aux_layer_14": 0.0882568359375, "loss_aux_layer_15": 0.0966796875, "loss_aux_layer_16": 0.1058349609375, "loss_aux_layer_17": 0.1126708984375, "loss_aux_layer_18": 0.1207275390625, "loss_aux_layer_19": 0.123046875, "loss_aux_layer_2": 0.05059814453125, "loss_aux_layer_20": 0.130126953125, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.06085205078125, "loss_aux_layer_4": 0.0634765625, "loss_aux_layer_5": 0.06500244140625, "loss_aux_layer_6": 0.0679931640625, "loss_aux_layer_7": 0.06597900390625, "loss_aux_layer_8": 0.065185546875, "loss_aux_layer_9": 0.0638427734375, "step": 3279, "total_loss": 0.6576451361179352 }, { "epoch": 0.6493763611166106, "grad_norm": 0.9967234134674072, "learning_rate": 5e-05, "llm_loss": 0.5062868893146515, "loss": 2.3718, "loss_aux_layer_0": 0.016571044921875, "loss_aux_layer_1": 0.03497314453125, "loss_aux_layer_10": 0.062744140625, "loss_aux_layer_11": 0.06689453125, "loss_aux_layer_12": 0.0712890625, "loss_aux_layer_13": 0.077392578125, "loss_aux_layer_14": 0.086669921875, "loss_aux_layer_15": 0.09521484375, "loss_aux_layer_16": 0.10498046875, "loss_aux_layer_17": 0.1124267578125, "loss_aux_layer_18": 0.120849609375, "loss_aux_layer_19": 0.123779296875, "loss_aux_layer_2": 0.048095703125, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.0577392578125, "loss_aux_layer_4": 0.06024169921875, "loss_aux_layer_5": 0.0616455078125, "loss_aux_layer_6": 0.0648193359375, "loss_aux_layer_7": 0.062744140625, "loss_aux_layer_8": 0.062255859375, "loss_aux_layer_9": 0.06134033203125, "step": 3280, "total_loss": 0.5929576382040977 }, { "epoch": 0.649574341714512, "grad_norm": 0.7697166800498962, "learning_rate": 5e-05, "llm_loss": 0.6063642501831055, "loss": 2.7671, "loss_aux_layer_0": 0.0150299072265625, "loss_aux_layer_1": 0.034271240234375, "loss_aux_layer_10": 0.062744140625, "loss_aux_layer_11": 0.06695556640625, "loss_aux_layer_12": 0.0712890625, "loss_aux_layer_13": 0.076416015625, "loss_aux_layer_14": 0.0845947265625, "loss_aux_layer_15": 0.0926513671875, "loss_aux_layer_16": 0.1016845703125, "loss_aux_layer_17": 0.109130859375, "loss_aux_layer_18": 0.116943359375, "loss_aux_layer_19": 0.120361328125, "loss_aux_layer_2": 0.0474853515625, "loss_aux_layer_20": 0.128662109375, "loss_aux_layer_21": 0.136962890625, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.1943359375, "loss_aux_layer_3": 0.0576171875, "loss_aux_layer_4": 0.0604248046875, "loss_aux_layer_5": 0.06207275390625, "loss_aux_layer_6": 0.06524658203125, "loss_aux_layer_7": 0.06329345703125, "loss_aux_layer_8": 0.06256103515625, "loss_aux_layer_9": 0.06146240234375, "step": 3281, "total_loss": 0.6917676031589508 }, { "epoch": 0.6497723223124133, "grad_norm": 1.035983681678772, "learning_rate": 5e-05, "llm_loss": 0.5809284448623657, "loss": 2.6752, "loss_aux_layer_0": 0.01605224609375, "loss_aux_layer_1": 0.03570556640625, "loss_aux_layer_10": 0.0648193359375, "loss_aux_layer_11": 0.06890869140625, "loss_aux_layer_12": 0.073974609375, "loss_aux_layer_13": 0.0794677734375, "loss_aux_layer_14": 0.087890625, "loss_aux_layer_15": 0.09619140625, "loss_aux_layer_16": 0.1055908203125, "loss_aux_layer_17": 0.1134033203125, "loss_aux_layer_18": 0.120849609375, "loss_aux_layer_19": 0.123779296875, "loss_aux_layer_2": 0.04974365234375, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.196044921875, "loss_aux_layer_3": 0.0599365234375, "loss_aux_layer_4": 0.062744140625, "loss_aux_layer_5": 0.06439208984375, "loss_aux_layer_6": 0.0675048828125, "loss_aux_layer_7": 0.06536865234375, "loss_aux_layer_8": 0.06463623046875, "loss_aux_layer_9": 0.06341552734375, "step": 3282, "total_loss": 0.6687996685504913 }, { "epoch": 0.6499703029103148, "grad_norm": 0.8514280319213867, "learning_rate": 5e-05, "llm_loss": 0.5906095504760742, "loss": 2.6913, "loss_aux_layer_0": 0.01513671875, "loss_aux_layer_1": 0.03265380859375, "loss_aux_layer_10": 0.0592041015625, "loss_aux_layer_11": 0.06329345703125, "loss_aux_layer_12": 0.06787109375, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.081787109375, "loss_aux_layer_15": 0.090087890625, "loss_aux_layer_16": 0.0992431640625, "loss_aux_layer_17": 0.1075439453125, "loss_aux_layer_18": 0.1153564453125, "loss_aux_layer_19": 0.1185302734375, "loss_aux_layer_2": 0.04486083984375, "loss_aux_layer_20": 0.1259765625, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.05413818359375, "loss_aux_layer_4": 0.05657958984375, "loss_aux_layer_5": 0.05828857421875, "loss_aux_layer_6": 0.0609130859375, "loss_aux_layer_7": 0.0589599609375, "loss_aux_layer_8": 0.05853271484375, "loss_aux_layer_9": 0.05755615234375, "step": 3283, "total_loss": 0.6728283911943436 }, { "epoch": 0.6501682835082162, "grad_norm": 1.0938299894332886, "learning_rate": 5e-05, "llm_loss": 0.6034159660339355, "loss": 2.7615, "loss_aux_layer_0": 0.0167236328125, "loss_aux_layer_1": 0.03485107421875, "loss_aux_layer_10": 0.06317138671875, "loss_aux_layer_11": 0.06744384765625, "loss_aux_layer_12": 0.0718994140625, "loss_aux_layer_13": 0.0771484375, "loss_aux_layer_14": 0.085693359375, "loss_aux_layer_15": 0.0943603515625, "loss_aux_layer_16": 0.1041259765625, "loss_aux_layer_17": 0.11181640625, "loss_aux_layer_18": 0.1204833984375, "loss_aux_layer_19": 0.1239013671875, "loss_aux_layer_2": 0.048828125, "loss_aux_layer_20": 0.1312255859375, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.16064453125, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.05877685546875, "loss_aux_layer_4": 0.06134033203125, "loss_aux_layer_5": 0.062744140625, "loss_aux_layer_6": 0.0655517578125, "loss_aux_layer_7": 0.0634765625, "loss_aux_layer_8": 0.06292724609375, "loss_aux_layer_9": 0.061767578125, "step": 3284, "total_loss": 0.6903701424598694 }, { "epoch": 0.6503662641061176, "grad_norm": 0.7405329346656799, "learning_rate": 5e-05, "llm_loss": 0.5315520390868187, "loss": 2.4794, "loss_aux_layer_0": 0.01507568359375, "loss_aux_layer_1": 0.03472900390625, "loss_aux_layer_10": 0.06414794921875, "loss_aux_layer_11": 0.068359375, "loss_aux_layer_12": 0.0732421875, "loss_aux_layer_13": 0.079345703125, "loss_aux_layer_14": 0.0887451171875, "loss_aux_layer_15": 0.097900390625, "loss_aux_layer_16": 0.1077880859375, "loss_aux_layer_17": 0.11572265625, "loss_aux_layer_18": 0.124267578125, "loss_aux_layer_19": 0.1270751953125, "loss_aux_layer_2": 0.04791259765625, "loss_aux_layer_20": 0.1346435546875, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.0579833984375, "loss_aux_layer_4": 0.06072998046875, "loss_aux_layer_5": 0.0628662109375, "loss_aux_layer_6": 0.06622314453125, "loss_aux_layer_7": 0.06427001953125, "loss_aux_layer_8": 0.063720703125, "loss_aux_layer_9": 0.0626220703125, "step": 3285, "total_loss": 0.6198421567678452 }, { "epoch": 0.650564244704019, "grad_norm": 0.8676871657371521, "learning_rate": 5e-05, "llm_loss": 0.5826119929552078, "loss": 2.679, "loss_aux_layer_0": 0.0169677734375, "loss_aux_layer_1": 0.0341796875, "loss_aux_layer_10": 0.06317138671875, "loss_aux_layer_11": 0.067626953125, "loss_aux_layer_12": 0.072509765625, "loss_aux_layer_13": 0.0784912109375, "loss_aux_layer_14": 0.0870361328125, "loss_aux_layer_15": 0.0958251953125, "loss_aux_layer_16": 0.1055908203125, "loss_aux_layer_17": 0.11328125, "loss_aux_layer_18": 0.1214599609375, "loss_aux_layer_19": 0.12451171875, "loss_aux_layer_2": 0.04742431640625, "loss_aux_layer_20": 0.131591796875, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.05743408203125, "loss_aux_layer_4": 0.06036376953125, "loss_aux_layer_5": 0.06219482421875, "loss_aux_layer_6": 0.06536865234375, "loss_aux_layer_7": 0.063232421875, "loss_aux_layer_8": 0.06317138671875, "loss_aux_layer_9": 0.06170654296875, "step": 3286, "total_loss": 0.6697380691766739 }, { "epoch": 0.6507622253019204, "grad_norm": 0.8518181443214417, "learning_rate": 5e-05, "llm_loss": 0.5900744199752808, "loss": 2.7245, "loss_aux_layer_0": 0.0151519775390625, "loss_aux_layer_1": 0.03692626953125, "loss_aux_layer_10": 0.067626953125, "loss_aux_layer_11": 0.0723876953125, "loss_aux_layer_12": 0.077392578125, "loss_aux_layer_13": 0.08349609375, "loss_aux_layer_14": 0.0921630859375, "loss_aux_layer_15": 0.1007080078125, "loss_aux_layer_16": 0.10986328125, "loss_aux_layer_17": 0.1168212890625, "loss_aux_layer_18": 0.1248779296875, "loss_aux_layer_19": 0.127197265625, "loss_aux_layer_2": 0.0517578125, "loss_aux_layer_20": 0.133544921875, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.06256103515625, "loss_aux_layer_4": 0.06536865234375, "loss_aux_layer_5": 0.067138671875, "loss_aux_layer_6": 0.0703125, "loss_aux_layer_7": 0.0679931640625, "loss_aux_layer_8": 0.0675048828125, "loss_aux_layer_9": 0.06622314453125, "step": 3287, "total_loss": 0.6811260432004929 }, { "epoch": 0.6509602058998218, "grad_norm": 0.8724444508552551, "learning_rate": 5e-05, "llm_loss": 0.4998857229948044, "loss": 2.3344, "loss_aux_layer_0": 0.0161285400390625, "loss_aux_layer_1": 0.0335693359375, "loss_aux_layer_10": 0.060302734375, "loss_aux_layer_11": 0.0643310546875, "loss_aux_layer_12": 0.0687255859375, "loss_aux_layer_13": 0.0738525390625, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.089599609375, "loss_aux_layer_16": 0.098876953125, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.115478515625, "loss_aux_layer_19": 0.1195068359375, "loss_aux_layer_2": 0.0460205078125, "loss_aux_layer_20": 0.1278076171875, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.05560302734375, "loss_aux_layer_4": 0.0579833984375, "loss_aux_layer_5": 0.05975341796875, "loss_aux_layer_6": 0.0625, "loss_aux_layer_7": 0.06048583984375, "loss_aux_layer_8": 0.0601806640625, "loss_aux_layer_9": 0.0589599609375, "step": 3288, "total_loss": 0.583589568734169 }, { "epoch": 0.6511581864977232, "grad_norm": 0.8889641165733337, "learning_rate": 5e-05, "llm_loss": 0.6236932128667831, "loss": 2.8395, "loss_aux_layer_0": 0.0164794921875, "loss_aux_layer_1": 0.034912109375, "loss_aux_layer_10": 0.0631103515625, "loss_aux_layer_11": 0.06732177734375, "loss_aux_layer_12": 0.072021484375, "loss_aux_layer_13": 0.0777587890625, "loss_aux_layer_14": 0.086669921875, "loss_aux_layer_15": 0.095458984375, "loss_aux_layer_16": 0.1048583984375, "loss_aux_layer_17": 0.1129150390625, "loss_aux_layer_18": 0.1207275390625, "loss_aux_layer_19": 0.1231689453125, "loss_aux_layer_2": 0.04742431640625, "loss_aux_layer_20": 0.13037109375, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.0572509765625, "loss_aux_layer_4": 0.06011962890625, "loss_aux_layer_5": 0.06170654296875, "loss_aux_layer_6": 0.06512451171875, "loss_aux_layer_7": 0.0631103515625, "loss_aux_layer_8": 0.062744140625, "loss_aux_layer_9": 0.06170654296875, "step": 3289, "total_loss": 0.7098752409219742 }, { "epoch": 0.6513561670956246, "grad_norm": 0.777217447757721, "learning_rate": 5e-05, "llm_loss": 0.6474409848451614, "loss": 2.9262, "loss_aux_layer_0": 0.0150299072265625, "loss_aux_layer_1": 0.033477783203125, "loss_aux_layer_10": 0.06072998046875, "loss_aux_layer_11": 0.06500244140625, "loss_aux_layer_12": 0.0694580078125, "loss_aux_layer_13": 0.0745849609375, "loss_aux_layer_14": 0.0826416015625, "loss_aux_layer_15": 0.09130859375, "loss_aux_layer_16": 0.1009521484375, "loss_aux_layer_17": 0.1085205078125, "loss_aux_layer_18": 0.116943359375, "loss_aux_layer_19": 0.12060546875, "loss_aux_layer_2": 0.04632568359375, "loss_aux_layer_20": 0.12841796875, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.156494140625, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.05596923828125, "loss_aux_layer_4": 0.0584716796875, "loss_aux_layer_5": 0.0599365234375, "loss_aux_layer_6": 0.06298828125, "loss_aux_layer_7": 0.06097412109375, "loss_aux_layer_8": 0.0606689453125, "loss_aux_layer_9": 0.0594482421875, "step": 3290, "total_loss": 0.7315402179956436 }, { "epoch": 0.651554147693526, "grad_norm": 0.8848134279251099, "learning_rate": 5e-05, "llm_loss": 0.5643804669380188, "loss": 2.6035, "loss_aux_layer_0": 0.0172119140625, "loss_aux_layer_1": 0.0343017578125, "loss_aux_layer_10": 0.0628662109375, "loss_aux_layer_11": 0.06719970703125, "loss_aux_layer_12": 0.0718994140625, "loss_aux_layer_13": 0.0777587890625, "loss_aux_layer_14": 0.0859375, "loss_aux_layer_15": 0.09423828125, "loss_aux_layer_16": 0.103271484375, "loss_aux_layer_17": 0.11083984375, "loss_aux_layer_18": 0.1187744140625, "loss_aux_layer_19": 0.1219482421875, "loss_aux_layer_2": 0.04803466796875, "loss_aux_layer_20": 0.129638671875, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.05792236328125, "loss_aux_layer_4": 0.06024169921875, "loss_aux_layer_5": 0.06201171875, "loss_aux_layer_6": 0.06475830078125, "loss_aux_layer_7": 0.06268310546875, "loss_aux_layer_8": 0.0625, "loss_aux_layer_9": 0.06158447265625, "step": 3291, "total_loss": 0.6508808135986328 }, { "epoch": 0.6517521282914275, "grad_norm": 0.8541923761367798, "learning_rate": 5e-05, "llm_loss": 0.5336437523365021, "loss": 2.4764, "loss_aux_layer_0": 0.01483154296875, "loss_aux_layer_1": 0.0341796875, "loss_aux_layer_10": 0.06146240234375, "loss_aux_layer_11": 0.0655517578125, "loss_aux_layer_12": 0.0703125, "loss_aux_layer_13": 0.0762939453125, "loss_aux_layer_14": 0.0845947265625, "loss_aux_layer_15": 0.0931396484375, "loss_aux_layer_16": 0.1026611328125, "loss_aux_layer_17": 0.1107177734375, "loss_aux_layer_18": 0.1195068359375, "loss_aux_layer_19": 0.122802734375, "loss_aux_layer_2": 0.0467529296875, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.05633544921875, "loss_aux_layer_4": 0.05877685546875, "loss_aux_layer_5": 0.06036376953125, "loss_aux_layer_6": 0.06378173828125, "loss_aux_layer_7": 0.06158447265625, "loss_aux_layer_8": 0.06121826171875, "loss_aux_layer_9": 0.05999755859375, "step": 3292, "total_loss": 0.6191013008356094 }, { "epoch": 0.6519501088893288, "grad_norm": 0.8640865683555603, "learning_rate": 5e-05, "llm_loss": 0.5593778118491173, "loss": 2.5823, "loss_aux_layer_0": 0.015380859375, "loss_aux_layer_1": 0.0341796875, "loss_aux_layer_10": 0.06207275390625, "loss_aux_layer_11": 0.06640625, "loss_aux_layer_12": 0.071044921875, "loss_aux_layer_13": 0.0770263671875, "loss_aux_layer_14": 0.086181640625, "loss_aux_layer_15": 0.094970703125, "loss_aux_layer_16": 0.1043701171875, "loss_aux_layer_17": 0.11181640625, "loss_aux_layer_18": 0.120361328125, "loss_aux_layer_19": 0.123779296875, "loss_aux_layer_2": 0.04693603515625, "loss_aux_layer_20": 0.131103515625, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.05712890625, "loss_aux_layer_4": 0.05963134765625, "loss_aux_layer_5": 0.06134033203125, "loss_aux_layer_6": 0.06439208984375, "loss_aux_layer_7": 0.062255859375, "loss_aux_layer_8": 0.0618896484375, "loss_aux_layer_9": 0.060791015625, "step": 3293, "total_loss": 0.6455722451210022 }, { "epoch": 0.6521480894872302, "grad_norm": 1.0537818670272827, "learning_rate": 5e-05, "llm_loss": 0.5489486306905746, "loss": 2.5334, "loss_aux_layer_0": 0.0149078369140625, "loss_aux_layer_1": 0.03277587890625, "loss_aux_layer_10": 0.0606689453125, "loss_aux_layer_11": 0.0645751953125, "loss_aux_layer_12": 0.0692138671875, "loss_aux_layer_13": 0.074462890625, "loss_aux_layer_14": 0.0836181640625, "loss_aux_layer_15": 0.0924072265625, "loss_aux_layer_16": 0.101806640625, "loss_aux_layer_17": 0.109375, "loss_aux_layer_18": 0.117431640625, "loss_aux_layer_19": 0.12109375, "loss_aux_layer_2": 0.0455322265625, "loss_aux_layer_20": 0.128662109375, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.05548095703125, "loss_aux_layer_4": 0.05810546875, "loss_aux_layer_5": 0.06011962890625, "loss_aux_layer_6": 0.06292724609375, "loss_aux_layer_7": 0.06103515625, "loss_aux_layer_8": 0.0604248046875, "loss_aux_layer_9": 0.0592041015625, "step": 3294, "total_loss": 0.6333559304475784 }, { "epoch": 0.6523460700851317, "grad_norm": 0.852296769618988, "learning_rate": 5e-05, "llm_loss": 0.6499200463294983, "loss": 2.9464, "loss_aux_layer_0": 0.015289306640625, "loss_aux_layer_1": 0.0343017578125, "loss_aux_layer_10": 0.0626220703125, "loss_aux_layer_11": 0.06695556640625, "loss_aux_layer_12": 0.07177734375, "loss_aux_layer_13": 0.077392578125, "loss_aux_layer_14": 0.086669921875, "loss_aux_layer_15": 0.09521484375, "loss_aux_layer_16": 0.1044921875, "loss_aux_layer_17": 0.1124267578125, "loss_aux_layer_18": 0.1212158203125, "loss_aux_layer_19": 0.1239013671875, "loss_aux_layer_2": 0.04766845703125, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.05780029296875, "loss_aux_layer_4": 0.060302734375, "loss_aux_layer_5": 0.062255859375, "loss_aux_layer_6": 0.0653076171875, "loss_aux_layer_7": 0.06329345703125, "loss_aux_layer_8": 0.0626220703125, "loss_aux_layer_9": 0.0614013671875, "step": 3295, "total_loss": 0.736607551574707 }, { "epoch": 0.6525440506830331, "grad_norm": 1.0243946313858032, "learning_rate": 5e-05, "llm_loss": 0.6234098374843597, "loss": 2.8382, "loss_aux_layer_0": 0.01483154296875, "loss_aux_layer_1": 0.03546142578125, "loss_aux_layer_10": 0.06341552734375, "loss_aux_layer_11": 0.067626953125, "loss_aux_layer_12": 0.072265625, "loss_aux_layer_13": 0.0777587890625, "loss_aux_layer_14": 0.0863037109375, "loss_aux_layer_15": 0.0941162109375, "loss_aux_layer_16": 0.1033935546875, "loss_aux_layer_17": 0.1112060546875, "loss_aux_layer_18": 0.119384765625, "loss_aux_layer_19": 0.121826171875, "loss_aux_layer_2": 0.0487060546875, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.190673828125, "loss_aux_layer_3": 0.05914306640625, "loss_aux_layer_4": 0.061767578125, "loss_aux_layer_5": 0.0634765625, "loss_aux_layer_6": 0.0665283203125, "loss_aux_layer_7": 0.06439208984375, "loss_aux_layer_8": 0.0635986328125, "loss_aux_layer_9": 0.0621337890625, "step": 3296, "total_loss": 0.7095496654510498 }, { "epoch": 0.6527420312809344, "grad_norm": 1.606284737586975, "learning_rate": 5e-05, "llm_loss": 0.5207776427268982, "loss": 2.4191, "loss_aux_layer_0": 0.0147857666015625, "loss_aux_layer_1": 0.03192138671875, "loss_aux_layer_10": 0.05908203125, "loss_aux_layer_11": 0.06304931640625, "loss_aux_layer_12": 0.0677490234375, "loss_aux_layer_13": 0.0732421875, "loss_aux_layer_14": 0.08251953125, "loss_aux_layer_15": 0.0914306640625, "loss_aux_layer_16": 0.1014404296875, "loss_aux_layer_17": 0.109130859375, "loss_aux_layer_18": 0.1173095703125, "loss_aux_layer_19": 0.1212158203125, "loss_aux_layer_2": 0.04461669921875, "loss_aux_layer_20": 0.130126953125, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.162353515625, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.0543212890625, "loss_aux_layer_4": 0.05645751953125, "loss_aux_layer_5": 0.05828857421875, "loss_aux_layer_6": 0.06103515625, "loss_aux_layer_7": 0.058837890625, "loss_aux_layer_8": 0.0584716796875, "loss_aux_layer_9": 0.0574951171875, "step": 3297, "total_loss": 0.6047657579183578 }, { "epoch": 0.6529400118788359, "grad_norm": 1.3628947734832764, "learning_rate": 5e-05, "llm_loss": 0.5863337218761444, "loss": 2.6876, "loss_aux_layer_0": 0.014678955078125, "loss_aux_layer_1": 0.033782958984375, "loss_aux_layer_10": 0.06182861328125, "loss_aux_layer_11": 0.0660400390625, "loss_aux_layer_12": 0.07080078125, "loss_aux_layer_13": 0.076416015625, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.093505859375, "loss_aux_layer_16": 0.1031494140625, "loss_aux_layer_17": 0.1109619140625, "loss_aux_layer_18": 0.118896484375, "loss_aux_layer_19": 0.1219482421875, "loss_aux_layer_2": 0.04730224609375, "loss_aux_layer_20": 0.129638671875, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.197021484375, "loss_aux_layer_3": 0.05731201171875, "loss_aux_layer_4": 0.059814453125, "loss_aux_layer_5": 0.0614013671875, "loss_aux_layer_6": 0.0640869140625, "loss_aux_layer_7": 0.06195068359375, "loss_aux_layer_8": 0.0615234375, "loss_aux_layer_9": 0.0604248046875, "step": 3298, "total_loss": 0.6719082891941071 }, { "epoch": 0.6531379924767373, "grad_norm": 1.1142548322677612, "learning_rate": 5e-05, "llm_loss": 0.5519881621003151, "loss": 2.5377, "loss_aux_layer_0": 0.0147247314453125, "loss_aux_layer_1": 0.0330810546875, "loss_aux_layer_10": 0.05902099609375, "loss_aux_layer_11": 0.06292724609375, "loss_aux_layer_12": 0.06781005859375, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.08154296875, "loss_aux_layer_15": 0.090087890625, "loss_aux_layer_16": 0.09912109375, "loss_aux_layer_17": 0.1070556640625, "loss_aux_layer_18": 0.1151123046875, "loss_aux_layer_19": 0.1181640625, "loss_aux_layer_2": 0.04522705078125, "loss_aux_layer_20": 0.1256103515625, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.05474853515625, "loss_aux_layer_4": 0.057373046875, "loss_aux_layer_5": 0.0587158203125, "loss_aux_layer_6": 0.0615234375, "loss_aux_layer_7": 0.05950927734375, "loss_aux_layer_8": 0.0589599609375, "loss_aux_layer_9": 0.05755615234375, "step": 3299, "total_loss": 0.634418211877346 }, { "epoch": 0.6533359730746386, "grad_norm": 1.0624058246612549, "learning_rate": 5e-05, "llm_loss": 0.5484985411167145, "loss": 2.545, "loss_aux_layer_0": 0.01507568359375, "loss_aux_layer_1": 0.035888671875, "loss_aux_layer_10": 0.064208984375, "loss_aux_layer_11": 0.068359375, "loss_aux_layer_12": 0.072998046875, "loss_aux_layer_13": 0.0784912109375, "loss_aux_layer_14": 0.087158203125, "loss_aux_layer_15": 0.0955810546875, "loss_aux_layer_16": 0.10498046875, "loss_aux_layer_17": 0.1124267578125, "loss_aux_layer_18": 0.1209716796875, "loss_aux_layer_19": 0.1240234375, "loss_aux_layer_2": 0.0504150390625, "loss_aux_layer_20": 0.1318359375, "loss_aux_layer_21": 0.1396484375, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.0604248046875, "loss_aux_layer_4": 0.06292724609375, "loss_aux_layer_5": 0.064453125, "loss_aux_layer_6": 0.067626953125, "loss_aux_layer_7": 0.0653076171875, "loss_aux_layer_8": 0.0645751953125, "loss_aux_layer_9": 0.0628662109375, "step": 3300, "total_loss": 0.6362546235322952 }, { "epoch": 0.6535339536725401, "grad_norm": 1.1704515218734741, "learning_rate": 5e-05, "llm_loss": 0.5774716734886169, "loss": 2.6513, "loss_aux_layer_0": 0.0156402587890625, "loss_aux_layer_1": 0.033843994140625, "loss_aux_layer_10": 0.0601806640625, "loss_aux_layer_11": 0.06390380859375, "loss_aux_layer_12": 0.068603515625, "loss_aux_layer_13": 0.0743408203125, "loss_aux_layer_14": 0.083740234375, "loss_aux_layer_15": 0.0928955078125, "loss_aux_layer_16": 0.1031494140625, "loss_aux_layer_17": 0.1112060546875, "loss_aux_layer_18": 0.1201171875, "loss_aux_layer_19": 0.1240234375, "loss_aux_layer_2": 0.0467529296875, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.056396484375, "loss_aux_layer_4": 0.05841064453125, "loss_aux_layer_5": 0.06011962890625, "loss_aux_layer_6": 0.06298828125, "loss_aux_layer_7": 0.06085205078125, "loss_aux_layer_8": 0.06024169921875, "loss_aux_layer_9": 0.05914306640625, "step": 3301, "total_loss": 0.6628130376338959 }, { "epoch": 0.6537319342704415, "grad_norm": 1.2069026231765747, "learning_rate": 5e-05, "llm_loss": 0.5379516556859016, "loss": 2.4958, "loss_aux_layer_0": 0.0156097412109375, "loss_aux_layer_1": 0.0345458984375, "loss_aux_layer_10": 0.0616455078125, "loss_aux_layer_11": 0.0657958984375, "loss_aux_layer_12": 0.0704345703125, "loss_aux_layer_13": 0.076416015625, "loss_aux_layer_14": 0.085205078125, "loss_aux_layer_15": 0.093994140625, "loss_aux_layer_16": 0.103271484375, "loss_aux_layer_17": 0.1114501953125, "loss_aux_layer_18": 0.1197509765625, "loss_aux_layer_19": 0.1231689453125, "loss_aux_layer_2": 0.04803466796875, "loss_aux_layer_20": 0.131103515625, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.05731201171875, "loss_aux_layer_4": 0.05950927734375, "loss_aux_layer_5": 0.061279296875, "loss_aux_layer_6": 0.06396484375, "loss_aux_layer_7": 0.061767578125, "loss_aux_layer_8": 0.061279296875, "loss_aux_layer_9": 0.06011962890625, "step": 3302, "total_loss": 0.6239609867334366 }, { "epoch": 0.653929914868343, "grad_norm": 1.0529778003692627, "learning_rate": 5e-05, "llm_loss": 0.6001143455505371, "loss": 2.7292, "loss_aux_layer_0": 0.0151519775390625, "loss_aux_layer_1": 0.032623291015625, "loss_aux_layer_10": 0.05865478515625, "loss_aux_layer_11": 0.0623779296875, "loss_aux_layer_12": 0.06689453125, "loss_aux_layer_13": 0.072509765625, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.0892333984375, "loss_aux_layer_16": 0.098388671875, "loss_aux_layer_17": 0.1058349609375, "loss_aux_layer_18": 0.1138916015625, "loss_aux_layer_19": 0.117919921875, "loss_aux_layer_2": 0.0455322265625, "loss_aux_layer_20": 0.12548828125, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.05450439453125, "loss_aux_layer_4": 0.0570068359375, "loss_aux_layer_5": 0.0584716796875, "loss_aux_layer_6": 0.06134033203125, "loss_aux_layer_7": 0.05926513671875, "loss_aux_layer_8": 0.05859375, "loss_aux_layer_9": 0.057373046875, "step": 3303, "total_loss": 0.6822971403598785 }, { "epoch": 0.6541278954662443, "grad_norm": 0.9508535861968994, "learning_rate": 5e-05, "llm_loss": 0.6215964257717133, "loss": 2.8313, "loss_aux_layer_0": 0.0159454345703125, "loss_aux_layer_1": 0.03460693359375, "loss_aux_layer_10": 0.06219482421875, "loss_aux_layer_11": 0.06640625, "loss_aux_layer_12": 0.0704345703125, "loss_aux_layer_13": 0.0760498046875, "loss_aux_layer_14": 0.0849609375, "loss_aux_layer_15": 0.093505859375, "loss_aux_layer_16": 0.1025390625, "loss_aux_layer_17": 0.1102294921875, "loss_aux_layer_18": 0.118896484375, "loss_aux_layer_19": 0.123046875, "loss_aux_layer_2": 0.0482177734375, "loss_aux_layer_20": 0.13037109375, "loss_aux_layer_21": 0.1396484375, "loss_aux_layer_22": 0.162109375, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.05780029296875, "loss_aux_layer_4": 0.06005859375, "loss_aux_layer_5": 0.06170654296875, "loss_aux_layer_6": 0.06463623046875, "loss_aux_layer_7": 0.06256103515625, "loss_aux_layer_8": 0.0618896484375, "loss_aux_layer_9": 0.060791015625, "step": 3304, "total_loss": 0.7078255265951157 }, { "epoch": 0.6543258760641457, "grad_norm": 1.2110614776611328, "learning_rate": 5e-05, "llm_loss": 0.6561856716871262, "loss": 2.977, "loss_aux_layer_0": 0.015045166015625, "loss_aux_layer_1": 0.0340576171875, "loss_aux_layer_10": 0.06243896484375, "loss_aux_layer_11": 0.06689453125, "loss_aux_layer_12": 0.0716552734375, "loss_aux_layer_13": 0.07763671875, "loss_aux_layer_14": 0.087158203125, "loss_aux_layer_15": 0.0965576171875, "loss_aux_layer_16": 0.106689453125, "loss_aux_layer_17": 0.1148681640625, "loss_aux_layer_18": 0.1231689453125, "loss_aux_layer_19": 0.1270751953125, "loss_aux_layer_2": 0.04779052734375, "loss_aux_layer_20": 0.13525390625, "loss_aux_layer_21": 0.14404296875, "loss_aux_layer_22": 0.1669921875, "loss_aux_layer_23": 0.205322265625, "loss_aux_layer_3": 0.0574951171875, "loss_aux_layer_4": 0.05987548828125, "loss_aux_layer_5": 0.06219482421875, "loss_aux_layer_6": 0.0653076171875, "loss_aux_layer_7": 0.06292724609375, "loss_aux_layer_8": 0.0623779296875, "loss_aux_layer_9": 0.06109619140625, "step": 3305, "total_loss": 0.7442474663257599 }, { "epoch": 0.6545238566620472, "grad_norm": 0.7875997424125671, "learning_rate": 5e-05, "llm_loss": 0.5167191922664642, "loss": 2.4071, "loss_aux_layer_0": 0.014892578125, "loss_aux_layer_1": 0.0333251953125, "loss_aux_layer_10": 0.0614013671875, "loss_aux_layer_11": 0.0654296875, "loss_aux_layer_12": 0.0703125, "loss_aux_layer_13": 0.075927734375, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.0933837890625, "loss_aux_layer_16": 0.1031494140625, "loss_aux_layer_17": 0.1114501953125, "loss_aux_layer_18": 0.119873046875, "loss_aux_layer_19": 0.12255859375, "loss_aux_layer_2": 0.04620361328125, "loss_aux_layer_20": 0.130126953125, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.05615234375, "loss_aux_layer_4": 0.05877685546875, "loss_aux_layer_5": 0.06072998046875, "loss_aux_layer_6": 0.06378173828125, "loss_aux_layer_7": 0.06182861328125, "loss_aux_layer_8": 0.06121826171875, "loss_aux_layer_9": 0.06005859375, "step": 3306, "total_loss": 0.6017677038908005 }, { "epoch": 0.6547218372599485, "grad_norm": 0.9410637617111206, "learning_rate": 5e-05, "llm_loss": 0.52470862865448, "loss": 2.4491, "loss_aux_layer_0": 0.0153350830078125, "loss_aux_layer_1": 0.0362548828125, "loss_aux_layer_10": 0.064453125, "loss_aux_layer_11": 0.0687255859375, "loss_aux_layer_12": 0.0731201171875, "loss_aux_layer_13": 0.0787353515625, "loss_aux_layer_14": 0.0867919921875, "loss_aux_layer_15": 0.094970703125, "loss_aux_layer_16": 0.1038818359375, "loss_aux_layer_17": 0.1112060546875, "loss_aux_layer_18": 0.119873046875, "loss_aux_layer_19": 0.1229248046875, "loss_aux_layer_2": 0.05029296875, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.0609130859375, "loss_aux_layer_4": 0.0633544921875, "loss_aux_layer_5": 0.06488037109375, "loss_aux_layer_6": 0.06787109375, "loss_aux_layer_7": 0.0655517578125, "loss_aux_layer_8": 0.06500244140625, "loss_aux_layer_9": 0.06329345703125, "step": 3307, "total_loss": 0.6122725456953049 }, { "epoch": 0.6549198178578499, "grad_norm": 0.8977957963943481, "learning_rate": 5e-05, "llm_loss": 0.5731899812817574, "loss": 2.6221, "loss_aux_layer_0": 0.015045166015625, "loss_aux_layer_1": 0.03314208984375, "loss_aux_layer_10": 0.0589599609375, "loss_aux_layer_11": 0.06292724609375, "loss_aux_layer_12": 0.0672607421875, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.08056640625, "loss_aux_layer_15": 0.08837890625, "loss_aux_layer_16": 0.0975341796875, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.1171875, "loss_aux_layer_2": 0.04571533203125, "loss_aux_layer_20": 0.125244140625, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.05511474609375, "loss_aux_layer_4": 0.0577392578125, "loss_aux_layer_5": 0.059326171875, "loss_aux_layer_6": 0.062255859375, "loss_aux_layer_7": 0.05999755859375, "loss_aux_layer_8": 0.0594482421875, "loss_aux_layer_9": 0.0577392578125, "step": 3308, "total_loss": 0.6555126458406448 }, { "epoch": 0.6551177984557514, "grad_norm": 1.2219429016113281, "learning_rate": 5e-05, "llm_loss": 0.6203311383724213, "loss": 2.8168, "loss_aux_layer_0": 0.015411376953125, "loss_aux_layer_1": 0.03314208984375, "loss_aux_layer_10": 0.0616455078125, "loss_aux_layer_11": 0.06591796875, "loss_aux_layer_12": 0.070556640625, "loss_aux_layer_13": 0.0755615234375, "loss_aux_layer_14": 0.083740234375, "loss_aux_layer_15": 0.0916748046875, "loss_aux_layer_16": 0.10009765625, "loss_aux_layer_17": 0.1082763671875, "loss_aux_layer_18": 0.1158447265625, "loss_aux_layer_19": 0.1187744140625, "loss_aux_layer_2": 0.04595947265625, "loss_aux_layer_20": 0.12646484375, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.190673828125, "loss_aux_layer_3": 0.0557861328125, "loss_aux_layer_4": 0.05853271484375, "loss_aux_layer_5": 0.05999755859375, "loss_aux_layer_6": 0.0633544921875, "loss_aux_layer_7": 0.06134033203125, "loss_aux_layer_8": 0.06103515625, "loss_aux_layer_9": 0.0601806640625, "step": 3309, "total_loss": 0.7042124420404434 }, { "epoch": 0.6553157790536528, "grad_norm": 1.2596083879470825, "learning_rate": 5e-05, "llm_loss": 0.6213376373052597, "loss": 2.8182, "loss_aux_layer_0": 0.0149993896484375, "loss_aux_layer_1": 0.0335693359375, "loss_aux_layer_10": 0.0601806640625, "loss_aux_layer_11": 0.064208984375, "loss_aux_layer_12": 0.068603515625, "loss_aux_layer_13": 0.0740966796875, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.0902099609375, "loss_aux_layer_16": 0.0994873046875, "loss_aux_layer_17": 0.107421875, "loss_aux_layer_18": 0.1151123046875, "loss_aux_layer_19": 0.11865234375, "loss_aux_layer_2": 0.0467529296875, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.0562744140625, "loss_aux_layer_4": 0.05877685546875, "loss_aux_layer_5": 0.06024169921875, "loss_aux_layer_6": 0.06304931640625, "loss_aux_layer_7": 0.060791015625, "loss_aux_layer_8": 0.06024169921875, "loss_aux_layer_9": 0.05889892578125, "step": 3310, "total_loss": 0.7045512050390244 }, { "epoch": 0.6555137596515541, "grad_norm": 1.1103607416152954, "learning_rate": 5e-05, "llm_loss": 0.5716145932674408, "loss": 2.633, "loss_aux_layer_0": 0.015411376953125, "loss_aux_layer_1": 0.03472900390625, "loss_aux_layer_10": 0.0631103515625, "loss_aux_layer_11": 0.0673828125, "loss_aux_layer_12": 0.0721435546875, "loss_aux_layer_13": 0.0777587890625, "loss_aux_layer_14": 0.0865478515625, "loss_aux_layer_15": 0.09521484375, "loss_aux_layer_16": 0.1046142578125, "loss_aux_layer_17": 0.1123046875, "loss_aux_layer_18": 0.12060546875, "loss_aux_layer_19": 0.1239013671875, "loss_aux_layer_2": 0.0482177734375, "loss_aux_layer_20": 0.131103515625, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.05810546875, "loss_aux_layer_4": 0.06060791015625, "loss_aux_layer_5": 0.06207275390625, "loss_aux_layer_6": 0.06524658203125, "loss_aux_layer_7": 0.063232421875, "loss_aux_layer_8": 0.06256103515625, "loss_aux_layer_9": 0.06158447265625, "step": 3311, "total_loss": 0.6582545787096024 }, { "epoch": 0.6557117402494556, "grad_norm": 0.8213244080543518, "learning_rate": 5e-05, "llm_loss": 0.5564379245042801, "loss": 2.5797, "loss_aux_layer_0": 0.0149078369140625, "loss_aux_layer_1": 0.035400390625, "loss_aux_layer_10": 0.06488037109375, "loss_aux_layer_11": 0.0692138671875, "loss_aux_layer_12": 0.0740966796875, "loss_aux_layer_13": 0.0799560546875, "loss_aux_layer_14": 0.0887451171875, "loss_aux_layer_15": 0.0970458984375, "loss_aux_layer_16": 0.1063232421875, "loss_aux_layer_17": 0.1138916015625, "loss_aux_layer_18": 0.121826171875, "loss_aux_layer_19": 0.1246337890625, "loss_aux_layer_2": 0.04949951171875, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.20166015625, "loss_aux_layer_3": 0.05950927734375, "loss_aux_layer_4": 0.06201171875, "loss_aux_layer_5": 0.06390380859375, "loss_aux_layer_6": 0.067138671875, "loss_aux_layer_7": 0.06512451171875, "loss_aux_layer_8": 0.0645751953125, "loss_aux_layer_9": 0.06353759765625, "step": 3312, "total_loss": 0.6449308097362518 }, { "epoch": 0.655909720847357, "grad_norm": 0.9360404014587402, "learning_rate": 5e-05, "llm_loss": 0.5557257980108261, "loss": 2.5633, "loss_aux_layer_0": 0.01507568359375, "loss_aux_layer_1": 0.03448486328125, "loss_aux_layer_10": 0.06231689453125, "loss_aux_layer_11": 0.06640625, "loss_aux_layer_12": 0.071044921875, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.085205078125, "loss_aux_layer_15": 0.093017578125, "loss_aux_layer_16": 0.1021728515625, "loss_aux_layer_17": 0.109619140625, "loss_aux_layer_18": 0.1175537109375, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.04791259765625, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.05804443359375, "loss_aux_layer_4": 0.0606689453125, "loss_aux_layer_5": 0.06219482421875, "loss_aux_layer_6": 0.06494140625, "loss_aux_layer_7": 0.06280517578125, "loss_aux_layer_8": 0.06219482421875, "loss_aux_layer_9": 0.06097412109375, "step": 3313, "total_loss": 0.6408371478319168 }, { "epoch": 0.6561077014452583, "grad_norm": 0.9679827094078064, "learning_rate": 5e-05, "llm_loss": 0.5918208509683609, "loss": 2.7013, "loss_aux_layer_0": 0.0146484375, "loss_aux_layer_1": 0.03350830078125, "loss_aux_layer_10": 0.06060791015625, "loss_aux_layer_11": 0.06439208984375, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.07470703125, "loss_aux_layer_14": 0.0836181640625, "loss_aux_layer_15": 0.0919189453125, "loss_aux_layer_16": 0.10107421875, "loss_aux_layer_17": 0.108642578125, "loss_aux_layer_18": 0.1170654296875, "loss_aux_layer_19": 0.119873046875, "loss_aux_layer_2": 0.0458984375, "loss_aux_layer_20": 0.127685546875, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.05572509765625, "loss_aux_layer_4": 0.05859375, "loss_aux_layer_5": 0.060302734375, "loss_aux_layer_6": 0.06329345703125, "loss_aux_layer_7": 0.06134033203125, "loss_aux_layer_8": 0.0606689453125, "loss_aux_layer_9": 0.05926513671875, "step": 3314, "total_loss": 0.6753147095441818 }, { "epoch": 0.6563056820431598, "grad_norm": 0.9164804220199585, "learning_rate": 5e-05, "llm_loss": 0.5546807944774628, "loss": 2.5589, "loss_aux_layer_0": 0.016204833984375, "loss_aux_layer_1": 0.0341796875, "loss_aux_layer_10": 0.06048583984375, "loss_aux_layer_11": 0.0645751953125, "loss_aux_layer_12": 0.0692138671875, "loss_aux_layer_13": 0.0748291015625, "loss_aux_layer_14": 0.0841064453125, "loss_aux_layer_15": 0.0927734375, "loss_aux_layer_16": 0.1026611328125, "loss_aux_layer_17": 0.1107177734375, "loss_aux_layer_18": 0.1197509765625, "loss_aux_layer_19": 0.122802734375, "loss_aux_layer_2": 0.04644775390625, "loss_aux_layer_20": 0.131103515625, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.05572509765625, "loss_aux_layer_4": 0.05828857421875, "loss_aux_layer_5": 0.0595703125, "loss_aux_layer_6": 0.0623779296875, "loss_aux_layer_7": 0.0604248046875, "loss_aux_layer_8": 0.06024169921875, "loss_aux_layer_9": 0.05914306640625, "step": 3315, "total_loss": 0.6397239714860916 }, { "epoch": 0.6565036626410612, "grad_norm": 0.8527910709381104, "learning_rate": 5e-05, "llm_loss": 0.5740574225783348, "loss": 2.6451, "loss_aux_layer_0": 0.0154571533203125, "loss_aux_layer_1": 0.03436279296875, "loss_aux_layer_10": 0.0628662109375, "loss_aux_layer_11": 0.0672607421875, "loss_aux_layer_12": 0.072021484375, "loss_aux_layer_13": 0.07763671875, "loss_aux_layer_14": 0.086669921875, "loss_aux_layer_15": 0.0955810546875, "loss_aux_layer_16": 0.1051025390625, "loss_aux_layer_17": 0.113037109375, "loss_aux_layer_18": 0.1209716796875, "loss_aux_layer_19": 0.1243896484375, "loss_aux_layer_2": 0.04779052734375, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.057861328125, "loss_aux_layer_4": 0.060546875, "loss_aux_layer_5": 0.0623779296875, "loss_aux_layer_6": 0.0655517578125, "loss_aux_layer_7": 0.063232421875, "loss_aux_layer_8": 0.0628662109375, "loss_aux_layer_9": 0.0615234375, "step": 3316, "total_loss": 0.6612845957279205 }, { "epoch": 0.6567016432389626, "grad_norm": 0.8864243030548096, "learning_rate": 5e-05, "llm_loss": 0.5948449671268463, "loss": 2.7226, "loss_aux_layer_0": 0.0154266357421875, "loss_aux_layer_1": 0.03375244140625, "loss_aux_layer_10": 0.0616455078125, "loss_aux_layer_11": 0.0657958984375, "loss_aux_layer_12": 0.070556640625, "loss_aux_layer_13": 0.076416015625, "loss_aux_layer_14": 0.0850830078125, "loss_aux_layer_15": 0.0936279296875, "loss_aux_layer_16": 0.1029052734375, "loss_aux_layer_17": 0.1112060546875, "loss_aux_layer_18": 0.119384765625, "loss_aux_layer_19": 0.1231689453125, "loss_aux_layer_2": 0.046875, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.138916015625, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.1982421875, "loss_aux_layer_3": 0.0567626953125, "loss_aux_layer_4": 0.05950927734375, "loss_aux_layer_5": 0.06109619140625, "loss_aux_layer_6": 0.06402587890625, "loss_aux_layer_7": 0.06170654296875, "loss_aux_layer_8": 0.0615234375, "loss_aux_layer_9": 0.06024169921875, "step": 3317, "total_loss": 0.6806504428386688 }, { "epoch": 0.656899623836864, "grad_norm": 0.8221163153648376, "learning_rate": 5e-05, "llm_loss": 0.551258385181427, "loss": 2.5566, "loss_aux_layer_0": 0.015655517578125, "loss_aux_layer_1": 0.034942626953125, "loss_aux_layer_10": 0.0634765625, "loss_aux_layer_11": 0.06793212890625, "loss_aux_layer_12": 0.0728759765625, "loss_aux_layer_13": 0.0787353515625, "loss_aux_layer_14": 0.0877685546875, "loss_aux_layer_15": 0.0968017578125, "loss_aux_layer_16": 0.106689453125, "loss_aux_layer_17": 0.1141357421875, "loss_aux_layer_18": 0.1219482421875, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.04864501953125, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.141357421875, "loss_aux_layer_22": 0.162353515625, "loss_aux_layer_23": 0.19970703125, "loss_aux_layer_3": 0.058837890625, "loss_aux_layer_4": 0.0611572265625, "loss_aux_layer_5": 0.06298828125, "loss_aux_layer_6": 0.066162109375, "loss_aux_layer_7": 0.06390380859375, "loss_aux_layer_8": 0.06317138671875, "loss_aux_layer_9": 0.06195068359375, "step": 3318, "total_loss": 0.6391406804323196 }, { "epoch": 0.6570976044347654, "grad_norm": 0.8901565670967102, "learning_rate": 5e-05, "llm_loss": 0.653911754488945, "loss": 2.9612, "loss_aux_layer_0": 0.0148773193359375, "loss_aux_layer_1": 0.03369140625, "loss_aux_layer_10": 0.06304931640625, "loss_aux_layer_11": 0.0673828125, "loss_aux_layer_12": 0.07177734375, "loss_aux_layer_13": 0.077392578125, "loss_aux_layer_14": 0.0860595703125, "loss_aux_layer_15": 0.0947265625, "loss_aux_layer_16": 0.1041259765625, "loss_aux_layer_17": 0.112060546875, "loss_aux_layer_18": 0.120361328125, "loss_aux_layer_19": 0.123291015625, "loss_aux_layer_2": 0.04742431640625, "loss_aux_layer_20": 0.13037109375, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.197021484375, "loss_aux_layer_3": 0.057373046875, "loss_aux_layer_4": 0.06005859375, "loss_aux_layer_5": 0.061767578125, "loss_aux_layer_6": 0.0650634765625, "loss_aux_layer_7": 0.06298828125, "loss_aux_layer_8": 0.06256103515625, "loss_aux_layer_9": 0.06158447265625, "step": 3319, "total_loss": 0.7402905374765396 }, { "epoch": 0.6572955850326668, "grad_norm": 0.8512123227119446, "learning_rate": 5e-05, "llm_loss": 0.6351570039987564, "loss": 2.8796, "loss_aux_layer_0": 0.0153045654296875, "loss_aux_layer_1": 0.03375244140625, "loss_aux_layer_10": 0.060791015625, "loss_aux_layer_11": 0.065185546875, "loss_aux_layer_12": 0.0697021484375, "loss_aux_layer_13": 0.0750732421875, "loss_aux_layer_14": 0.0838623046875, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.1016845703125, "loss_aux_layer_17": 0.1094970703125, "loss_aux_layer_18": 0.11767578125, "loss_aux_layer_19": 0.1212158203125, "loss_aux_layer_2": 0.046630859375, "loss_aux_layer_20": 0.129638671875, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.05596923828125, "loss_aux_layer_4": 0.05853271484375, "loss_aux_layer_5": 0.0601806640625, "loss_aux_layer_6": 0.06304931640625, "loss_aux_layer_7": 0.06109619140625, "loss_aux_layer_8": 0.060546875, "loss_aux_layer_9": 0.0592041015625, "step": 3320, "total_loss": 0.7199011147022247 }, { "epoch": 0.6574935656305682, "grad_norm": 1.0357911586761475, "learning_rate": 5e-05, "llm_loss": 0.5406126230955124, "loss": 2.5061, "loss_aux_layer_0": 0.0150146484375, "loss_aux_layer_1": 0.03369140625, "loss_aux_layer_10": 0.06243896484375, "loss_aux_layer_11": 0.0662841796875, "loss_aux_layer_12": 0.070556640625, "loss_aux_layer_13": 0.075927734375, "loss_aux_layer_14": 0.0841064453125, "loss_aux_layer_15": 0.0928955078125, "loss_aux_layer_16": 0.1024169921875, "loss_aux_layer_17": 0.1103515625, "loss_aux_layer_18": 0.1187744140625, "loss_aux_layer_19": 0.1224365234375, "loss_aux_layer_2": 0.046630859375, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.19970703125, "loss_aux_layer_3": 0.05645751953125, "loss_aux_layer_4": 0.05938720703125, "loss_aux_layer_5": 0.06158447265625, "loss_aux_layer_6": 0.0648193359375, "loss_aux_layer_7": 0.062744140625, "loss_aux_layer_8": 0.06243896484375, "loss_aux_layer_9": 0.061279296875, "step": 3321, "total_loss": 0.6265207827091217 }, { "epoch": 0.6576915462284696, "grad_norm": 1.3562177419662476, "learning_rate": 5e-05, "llm_loss": 0.6030270159244537, "loss": 2.7517, "loss_aux_layer_0": 0.014404296875, "loss_aux_layer_1": 0.03350830078125, "loss_aux_layer_10": 0.060791015625, "loss_aux_layer_11": 0.0648193359375, "loss_aux_layer_12": 0.0693359375, "loss_aux_layer_13": 0.075439453125, "loss_aux_layer_14": 0.08447265625, "loss_aux_layer_15": 0.093505859375, "loss_aux_layer_16": 0.10302734375, "loss_aux_layer_17": 0.110595703125, "loss_aux_layer_18": 0.119140625, "loss_aux_layer_19": 0.12255859375, "loss_aux_layer_2": 0.04644775390625, "loss_aux_layer_20": 0.130126953125, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.194580078125, "loss_aux_layer_3": 0.05621337890625, "loss_aux_layer_4": 0.05865478515625, "loss_aux_layer_5": 0.0604248046875, "loss_aux_layer_6": 0.06329345703125, "loss_aux_layer_7": 0.06109619140625, "loss_aux_layer_8": 0.0606689453125, "loss_aux_layer_9": 0.0594482421875, "step": 3322, "total_loss": 0.6879284381866455 }, { "epoch": 0.657889526826371, "grad_norm": 1.390873670578003, "learning_rate": 5e-05, "llm_loss": 0.6430835723876953, "loss": 2.9303, "loss_aux_layer_0": 0.0145111083984375, "loss_aux_layer_1": 0.0364990234375, "loss_aux_layer_10": 0.06597900390625, "loss_aux_layer_11": 0.0704345703125, "loss_aux_layer_12": 0.075439453125, "loss_aux_layer_13": 0.081298828125, "loss_aux_layer_14": 0.0899658203125, "loss_aux_layer_15": 0.0987548828125, "loss_aux_layer_16": 0.108642578125, "loss_aux_layer_17": 0.115966796875, "loss_aux_layer_18": 0.12353515625, "loss_aux_layer_19": 0.1259765625, "loss_aux_layer_2": 0.05023193359375, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.06121826171875, "loss_aux_layer_4": 0.063720703125, "loss_aux_layer_5": 0.06573486328125, "loss_aux_layer_6": 0.0687255859375, "loss_aux_layer_7": 0.066650390625, "loss_aux_layer_8": 0.06591796875, "loss_aux_layer_9": 0.06451416015625, "step": 3323, "total_loss": 0.7325731068849564 }, { "epoch": 0.6580875074242725, "grad_norm": 0.8499297499656677, "learning_rate": 5e-05, "llm_loss": 0.6715237945318222, "loss": 3.0347, "loss_aux_layer_0": 0.01641845703125, "loss_aux_layer_1": 0.03594970703125, "loss_aux_layer_10": 0.06439208984375, "loss_aux_layer_11": 0.0684814453125, "loss_aux_layer_12": 0.07275390625, "loss_aux_layer_13": 0.0784912109375, "loss_aux_layer_14": 0.0872802734375, "loss_aux_layer_15": 0.0955810546875, "loss_aux_layer_16": 0.1051025390625, "loss_aux_layer_17": 0.112548828125, "loss_aux_layer_18": 0.120849609375, "loss_aux_layer_19": 0.123046875, "loss_aux_layer_2": 0.0491943359375, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.193115234375, "loss_aux_layer_3": 0.05926513671875, "loss_aux_layer_4": 0.06195068359375, "loss_aux_layer_5": 0.06353759765625, "loss_aux_layer_6": 0.066650390625, "loss_aux_layer_7": 0.06463623046875, "loss_aux_layer_8": 0.064208984375, "loss_aux_layer_9": 0.0628662109375, "step": 3324, "total_loss": 0.758680909872055 }, { "epoch": 0.6582854880221738, "grad_norm": 1.1532760858535767, "learning_rate": 5e-05, "llm_loss": 0.48867590725421906, "loss": 2.2883, "loss_aux_layer_0": 0.014556884765625, "loss_aux_layer_1": 0.033050537109375, "loss_aux_layer_10": 0.05987548828125, "loss_aux_layer_11": 0.063720703125, "loss_aux_layer_12": 0.0679931640625, "loss_aux_layer_13": 0.0732421875, "loss_aux_layer_14": 0.081787109375, "loss_aux_layer_15": 0.0904541015625, "loss_aux_layer_16": 0.099853515625, "loss_aux_layer_17": 0.1072998046875, "loss_aux_layer_18": 0.115966796875, "loss_aux_layer_19": 0.1190185546875, "loss_aux_layer_2": 0.045654296875, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.05499267578125, "loss_aux_layer_4": 0.05731201171875, "loss_aux_layer_5": 0.05908203125, "loss_aux_layer_6": 0.06182861328125, "loss_aux_layer_7": 0.06005859375, "loss_aux_layer_8": 0.05963134765625, "loss_aux_layer_9": 0.05859375, "step": 3325, "total_loss": 0.5720744282007217 }, { "epoch": 0.6584834686200752, "grad_norm": 1.3407983779907227, "learning_rate": 5e-05, "llm_loss": 0.6979639232158661, "loss": 3.1444, "loss_aux_layer_0": 0.015167236328125, "loss_aux_layer_1": 0.03515625, "loss_aux_layer_10": 0.0638427734375, "loss_aux_layer_11": 0.068115234375, "loss_aux_layer_12": 0.0728759765625, "loss_aux_layer_13": 0.0784912109375, "loss_aux_layer_14": 0.087158203125, "loss_aux_layer_15": 0.095947265625, "loss_aux_layer_16": 0.1058349609375, "loss_aux_layer_17": 0.11376953125, "loss_aux_layer_18": 0.1224365234375, "loss_aux_layer_19": 0.1260986328125, "loss_aux_layer_2": 0.0479736328125, "loss_aux_layer_20": 0.134521484375, "loss_aux_layer_21": 0.142578125, "loss_aux_layer_22": 0.164306640625, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.0584716796875, "loss_aux_layer_4": 0.06121826171875, "loss_aux_layer_5": 0.062744140625, "loss_aux_layer_6": 0.06591796875, "loss_aux_layer_7": 0.0643310546875, "loss_aux_layer_8": 0.063720703125, "loss_aux_layer_9": 0.06243896484375, "step": 3326, "total_loss": 0.7860942780971527 }, { "epoch": 0.6586814492179767, "grad_norm": 1.07346773147583, "learning_rate": 5e-05, "llm_loss": 0.5866841077804565, "loss": 2.6936, "loss_aux_layer_0": 0.01568603515625, "loss_aux_layer_1": 0.03375244140625, "loss_aux_layer_10": 0.06298828125, "loss_aux_layer_11": 0.0670166015625, "loss_aux_layer_12": 0.0716552734375, "loss_aux_layer_13": 0.0777587890625, "loss_aux_layer_14": 0.08642578125, "loss_aux_layer_15": 0.0950927734375, "loss_aux_layer_16": 0.1048583984375, "loss_aux_layer_17": 0.1124267578125, "loss_aux_layer_18": 0.12060546875, "loss_aux_layer_19": 0.1239013671875, "loss_aux_layer_2": 0.04730224609375, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.14013671875, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.05718994140625, "loss_aux_layer_4": 0.0596923828125, "loss_aux_layer_5": 0.061279296875, "loss_aux_layer_6": 0.0643310546875, "loss_aux_layer_7": 0.06231689453125, "loss_aux_layer_8": 0.06207275390625, "loss_aux_layer_9": 0.06109619140625, "step": 3327, "total_loss": 0.673397108912468 }, { "epoch": 0.658879429815878, "grad_norm": 1.290927529335022, "learning_rate": 5e-05, "llm_loss": 0.6319126486778259, "loss": 2.8698, "loss_aux_layer_0": 0.015045166015625, "loss_aux_layer_1": 0.0350341796875, "loss_aux_layer_10": 0.06268310546875, "loss_aux_layer_11": 0.0670166015625, "loss_aux_layer_12": 0.071533203125, "loss_aux_layer_13": 0.076904296875, "loss_aux_layer_14": 0.08544921875, "loss_aux_layer_15": 0.093505859375, "loss_aux_layer_16": 0.1024169921875, "loss_aux_layer_17": 0.1104736328125, "loss_aux_layer_18": 0.1182861328125, "loss_aux_layer_19": 0.120361328125, "loss_aux_layer_2": 0.04791259765625, "loss_aux_layer_20": 0.127197265625, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.05810546875, "loss_aux_layer_4": 0.060546875, "loss_aux_layer_5": 0.0618896484375, "loss_aux_layer_6": 0.06524658203125, "loss_aux_layer_7": 0.06317138671875, "loss_aux_layer_8": 0.0625, "loss_aux_layer_9": 0.06121826171875, "step": 3328, "total_loss": 0.7174399197101593 }, { "epoch": 0.6590774104137794, "grad_norm": 0.9595226049423218, "learning_rate": 5e-05, "llm_loss": 0.5590404346585274, "loss": 2.5806, "loss_aux_layer_0": 0.0157623291015625, "loss_aux_layer_1": 0.03399658203125, "loss_aux_layer_10": 0.06158447265625, "loss_aux_layer_11": 0.0657958984375, "loss_aux_layer_12": 0.0701904296875, "loss_aux_layer_13": 0.0760498046875, "loss_aux_layer_14": 0.0848388671875, "loss_aux_layer_15": 0.0936279296875, "loss_aux_layer_16": 0.1036376953125, "loss_aux_layer_17": 0.1114501953125, "loss_aux_layer_18": 0.119873046875, "loss_aux_layer_19": 0.1239013671875, "loss_aux_layer_2": 0.04681396484375, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.162353515625, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.05645751953125, "loss_aux_layer_4": 0.05889892578125, "loss_aux_layer_5": 0.06060791015625, "loss_aux_layer_6": 0.06378173828125, "loss_aux_layer_7": 0.06182861328125, "loss_aux_layer_8": 0.06146240234375, "loss_aux_layer_9": 0.06024169921875, "step": 3329, "total_loss": 0.6451507210731506 }, { "epoch": 0.6592753910116809, "grad_norm": 1.1538376808166504, "learning_rate": 5e-05, "llm_loss": 0.536698967218399, "loss": 2.4921, "loss_aux_layer_0": 0.0152130126953125, "loss_aux_layer_1": 0.03466796875, "loss_aux_layer_10": 0.06231689453125, "loss_aux_layer_11": 0.0662841796875, "loss_aux_layer_12": 0.0712890625, "loss_aux_layer_13": 0.0772705078125, "loss_aux_layer_14": 0.086181640625, "loss_aux_layer_15": 0.0948486328125, "loss_aux_layer_16": 0.1048583984375, "loss_aux_layer_17": 0.1126708984375, "loss_aux_layer_18": 0.121337890625, "loss_aux_layer_19": 0.124267578125, "loss_aux_layer_2": 0.04681396484375, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.14013671875, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.056640625, "loss_aux_layer_4": 0.05914306640625, "loss_aux_layer_5": 0.06085205078125, "loss_aux_layer_6": 0.06378173828125, "loss_aux_layer_7": 0.06207275390625, "loss_aux_layer_8": 0.06170654296875, "loss_aux_layer_9": 0.06072998046875, "step": 3330, "total_loss": 0.623014286160469 }, { "epoch": 0.6594733716095823, "grad_norm": 0.7831919193267822, "learning_rate": 5e-05, "llm_loss": 0.5666948407888412, "loss": 2.6, "loss_aux_layer_0": 0.0149383544921875, "loss_aux_layer_1": 0.032257080078125, "loss_aux_layer_10": 0.0584716796875, "loss_aux_layer_11": 0.06256103515625, "loss_aux_layer_12": 0.067138671875, "loss_aux_layer_13": 0.0728759765625, "loss_aux_layer_14": 0.08154296875, "loss_aux_layer_15": 0.09033203125, "loss_aux_layer_16": 0.100341796875, "loss_aux_layer_17": 0.10888671875, "loss_aux_layer_18": 0.117431640625, "loss_aux_layer_19": 0.1212158203125, "loss_aux_layer_2": 0.044677734375, "loss_aux_layer_20": 0.129638671875, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.05426025390625, "loss_aux_layer_4": 0.056396484375, "loss_aux_layer_5": 0.05792236328125, "loss_aux_layer_6": 0.0606689453125, "loss_aux_layer_7": 0.058837890625, "loss_aux_layer_8": 0.05841064453125, "loss_aux_layer_9": 0.05731201171875, "step": 3331, "total_loss": 0.6499944776296616 }, { "epoch": 0.6596713522074836, "grad_norm": 1.122044563293457, "learning_rate": 5e-05, "llm_loss": 0.5460617914795876, "loss": 2.5217, "loss_aux_layer_0": 0.01715087890625, "loss_aux_layer_1": 0.0343017578125, "loss_aux_layer_10": 0.0604248046875, "loss_aux_layer_11": 0.06463623046875, "loss_aux_layer_12": 0.0692138671875, "loss_aux_layer_13": 0.07470703125, "loss_aux_layer_14": 0.0830078125, "loss_aux_layer_15": 0.091552734375, "loss_aux_layer_16": 0.1005859375, "loss_aux_layer_17": 0.1085205078125, "loss_aux_layer_18": 0.116943359375, "loss_aux_layer_19": 0.121337890625, "loss_aux_layer_2": 0.0465087890625, "loss_aux_layer_20": 0.12939453125, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.05609130859375, "loss_aux_layer_4": 0.058349609375, "loss_aux_layer_5": 0.059814453125, "loss_aux_layer_6": 0.06231689453125, "loss_aux_layer_7": 0.060302734375, "loss_aux_layer_8": 0.0599365234375, "loss_aux_layer_9": 0.05889892578125, "step": 3332, "total_loss": 0.630430668592453 }, { "epoch": 0.6598693328053851, "grad_norm": 1.0096559524536133, "learning_rate": 5e-05, "llm_loss": 0.6140780299901962, "loss": 2.7984, "loss_aux_layer_0": 0.01458740234375, "loss_aux_layer_1": 0.0341796875, "loss_aux_layer_10": 0.061767578125, "loss_aux_layer_11": 0.06597900390625, "loss_aux_layer_12": 0.0703125, "loss_aux_layer_13": 0.075927734375, "loss_aux_layer_14": 0.0845947265625, "loss_aux_layer_15": 0.0928955078125, "loss_aux_layer_16": 0.1024169921875, "loss_aux_layer_17": 0.1104736328125, "loss_aux_layer_18": 0.1185302734375, "loss_aux_layer_19": 0.1219482421875, "loss_aux_layer_2": 0.04730224609375, "loss_aux_layer_20": 0.130126953125, "loss_aux_layer_21": 0.138916015625, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.05718994140625, "loss_aux_layer_4": 0.05950927734375, "loss_aux_layer_5": 0.06103515625, "loss_aux_layer_6": 0.0638427734375, "loss_aux_layer_7": 0.06182861328125, "loss_aux_layer_8": 0.06134033203125, "loss_aux_layer_9": 0.06024169921875, "step": 3333, "total_loss": 0.6996045857667923 }, { "epoch": 0.6600673134032865, "grad_norm": 0.9741390943527222, "learning_rate": 5e-05, "llm_loss": 0.5576586574316025, "loss": 2.5579, "loss_aux_layer_0": 0.01629638671875, "loss_aux_layer_1": 0.033172607421875, "loss_aux_layer_10": 0.0594482421875, "loss_aux_layer_11": 0.063232421875, "loss_aux_layer_12": 0.067626953125, "loss_aux_layer_13": 0.0731201171875, "loss_aux_layer_14": 0.0810546875, "loss_aux_layer_15": 0.0892333984375, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.10595703125, "loss_aux_layer_18": 0.114013671875, "loss_aux_layer_19": 0.11669921875, "loss_aux_layer_2": 0.04571533203125, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.0552978515625, "loss_aux_layer_4": 0.0576171875, "loss_aux_layer_5": 0.0589599609375, "loss_aux_layer_6": 0.0616455078125, "loss_aux_layer_7": 0.059814453125, "loss_aux_layer_8": 0.05938720703125, "loss_aux_layer_9": 0.0579833984375, "step": 3334, "total_loss": 0.6394823044538498 }, { "epoch": 0.6602652940011878, "grad_norm": 1.0896135568618774, "learning_rate": 5e-05, "llm_loss": 0.5476124286651611, "loss": 2.5301, "loss_aux_layer_0": 0.01690673828125, "loss_aux_layer_1": 0.035400390625, "loss_aux_layer_10": 0.0615234375, "loss_aux_layer_11": 0.06561279296875, "loss_aux_layer_12": 0.0701904296875, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.083984375, "loss_aux_layer_15": 0.091796875, "loss_aux_layer_16": 0.10107421875, "loss_aux_layer_17": 0.109130859375, "loss_aux_layer_18": 0.1173095703125, "loss_aux_layer_19": 0.1202392578125, "loss_aux_layer_2": 0.048828125, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.193115234375, "loss_aux_layer_3": 0.058349609375, "loss_aux_layer_4": 0.060302734375, "loss_aux_layer_5": 0.06170654296875, "loss_aux_layer_6": 0.0643310546875, "loss_aux_layer_7": 0.06207275390625, "loss_aux_layer_8": 0.0614013671875, "loss_aux_layer_9": 0.06024169921875, "step": 3335, "total_loss": 0.6325176805257797 }, { "epoch": 0.6604632745990893, "grad_norm": 0.8494287133216858, "learning_rate": 5e-05, "llm_loss": 0.6721270382404327, "loss": 3.0342, "loss_aux_layer_0": 0.01495361328125, "loss_aux_layer_1": 0.0360107421875, "loss_aux_layer_10": 0.0631103515625, "loss_aux_layer_11": 0.06719970703125, "loss_aux_layer_12": 0.0718994140625, "loss_aux_layer_13": 0.0772705078125, "loss_aux_layer_14": 0.0860595703125, "loss_aux_layer_15": 0.0941162109375, "loss_aux_layer_16": 0.1031494140625, "loss_aux_layer_17": 0.111328125, "loss_aux_layer_18": 0.118896484375, "loss_aux_layer_19": 0.1212158203125, "loss_aux_layer_2": 0.049072265625, "loss_aux_layer_20": 0.129150390625, "loss_aux_layer_21": 0.136962890625, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.05908203125, "loss_aux_layer_4": 0.06158447265625, "loss_aux_layer_5": 0.06298828125, "loss_aux_layer_6": 0.0660400390625, "loss_aux_layer_7": 0.063720703125, "loss_aux_layer_8": 0.06304931640625, "loss_aux_layer_9": 0.06158447265625, "step": 3336, "total_loss": 0.7585482001304626 }, { "epoch": 0.6606612551969907, "grad_norm": 1.057553768157959, "learning_rate": 5e-05, "llm_loss": 0.6004260182380676, "loss": 2.7474, "loss_aux_layer_0": 0.01654052734375, "loss_aux_layer_1": 0.035400390625, "loss_aux_layer_10": 0.06304931640625, "loss_aux_layer_11": 0.0672607421875, "loss_aux_layer_12": 0.0718994140625, "loss_aux_layer_13": 0.0775146484375, "loss_aux_layer_14": 0.0860595703125, "loss_aux_layer_15": 0.0943603515625, "loss_aux_layer_16": 0.103271484375, "loss_aux_layer_17": 0.110595703125, "loss_aux_layer_18": 0.117919921875, "loss_aux_layer_19": 0.1207275390625, "loss_aux_layer_2": 0.04949951171875, "loss_aux_layer_20": 0.128173828125, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.0595703125, "loss_aux_layer_4": 0.061767578125, "loss_aux_layer_5": 0.063232421875, "loss_aux_layer_6": 0.06591796875, "loss_aux_layer_7": 0.06414794921875, "loss_aux_layer_8": 0.0633544921875, "loss_aux_layer_9": 0.06195068359375, "step": 3337, "total_loss": 0.6868449747562408 }, { "epoch": 0.6608592357948921, "grad_norm": 1.0789893865585327, "learning_rate": 5e-05, "llm_loss": 0.6147036403417587, "loss": 2.8018, "loss_aux_layer_0": 0.016571044921875, "loss_aux_layer_1": 0.03411865234375, "loss_aux_layer_10": 0.0621337890625, "loss_aux_layer_11": 0.0665283203125, "loss_aux_layer_12": 0.071044921875, "loss_aux_layer_13": 0.0765380859375, "loss_aux_layer_14": 0.0858154296875, "loss_aux_layer_15": 0.094482421875, "loss_aux_layer_16": 0.1043701171875, "loss_aux_layer_17": 0.11181640625, "loss_aux_layer_18": 0.119384765625, "loss_aux_layer_19": 0.122314453125, "loss_aux_layer_2": 0.0478515625, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.0576171875, "loss_aux_layer_4": 0.05963134765625, "loss_aux_layer_5": 0.0611572265625, "loss_aux_layer_6": 0.0640869140625, "loss_aux_layer_7": 0.06219482421875, "loss_aux_layer_8": 0.06195068359375, "loss_aux_layer_9": 0.06085205078125, "step": 3338, "total_loss": 0.7004494369029999 }, { "epoch": 0.6610572163927935, "grad_norm": 0.8921056985855103, "learning_rate": 5e-05, "llm_loss": 0.5560519620776176, "loss": 2.5488, "loss_aux_layer_0": 0.01556396484375, "loss_aux_layer_1": 0.031463623046875, "loss_aux_layer_10": 0.056884765625, "loss_aux_layer_11": 0.06072998046875, "loss_aux_layer_12": 0.06494140625, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.0963134765625, "loss_aux_layer_17": 0.1043701171875, "loss_aux_layer_18": 0.1131591796875, "loss_aux_layer_19": 0.1182861328125, "loss_aux_layer_2": 0.04290771484375, "loss_aux_layer_20": 0.1275634765625, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.19482421875, "loss_aux_layer_3": 0.05218505859375, "loss_aux_layer_4": 0.0545654296875, "loss_aux_layer_5": 0.05615234375, "loss_aux_layer_6": 0.05908203125, "loss_aux_layer_7": 0.05706787109375, "loss_aux_layer_8": 0.05670166015625, "loss_aux_layer_9": 0.0556640625, "step": 3339, "total_loss": 0.6372123509645462 }, { "epoch": 0.6612551969906949, "grad_norm": 1.0429201126098633, "learning_rate": 5e-05, "llm_loss": 0.6498157382011414, "loss": 2.9418, "loss_aux_layer_0": 0.0162353515625, "loss_aux_layer_1": 0.0352783203125, "loss_aux_layer_10": 0.0625, "loss_aux_layer_11": 0.067138671875, "loss_aux_layer_12": 0.0716552734375, "loss_aux_layer_13": 0.076904296875, "loss_aux_layer_14": 0.08544921875, "loss_aux_layer_15": 0.0936279296875, "loss_aux_layer_16": 0.1025390625, "loss_aux_layer_17": 0.110595703125, "loss_aux_layer_18": 0.118408203125, "loss_aux_layer_19": 0.1214599609375, "loss_aux_layer_2": 0.04864501953125, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.058349609375, "loss_aux_layer_4": 0.06072998046875, "loss_aux_layer_5": 0.06201171875, "loss_aux_layer_6": 0.06494140625, "loss_aux_layer_7": 0.06292724609375, "loss_aux_layer_8": 0.0621337890625, "loss_aux_layer_9": 0.06085205078125, "step": 3340, "total_loss": 0.7354497462511063 }, { "epoch": 0.6614531775885963, "grad_norm": 0.7987872362136841, "learning_rate": 5e-05, "llm_loss": 0.5264830440282822, "loss": 2.4465, "loss_aux_layer_0": 0.01654052734375, "loss_aux_layer_1": 0.03411865234375, "loss_aux_layer_10": 0.0609130859375, "loss_aux_layer_11": 0.06500244140625, "loss_aux_layer_12": 0.069580078125, "loss_aux_layer_13": 0.07470703125, "loss_aux_layer_14": 0.0836181640625, "loss_aux_layer_15": 0.0921630859375, "loss_aux_layer_16": 0.1016845703125, "loss_aux_layer_17": 0.1094970703125, "loss_aux_layer_18": 0.1177978515625, "loss_aux_layer_19": 0.1209716796875, "loss_aux_layer_2": 0.04791259765625, "loss_aux_layer_20": 0.1292724609375, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.05755615234375, "loss_aux_layer_4": 0.059814453125, "loss_aux_layer_5": 0.06134033203125, "loss_aux_layer_6": 0.06414794921875, "loss_aux_layer_7": 0.06201171875, "loss_aux_layer_8": 0.06109619140625, "loss_aux_layer_9": 0.0596923828125, "step": 3341, "total_loss": 0.6116242408752441 }, { "epoch": 0.6616511581864978, "grad_norm": 0.7278326153755188, "learning_rate": 5e-05, "llm_loss": 0.5985338389873505, "loss": 2.7229, "loss_aux_layer_0": 0.014739990234375, "loss_aux_layer_1": 0.03240966796875, "loss_aux_layer_10": 0.05889892578125, "loss_aux_layer_11": 0.06268310546875, "loss_aux_layer_12": 0.0673828125, "loss_aux_layer_13": 0.0726318359375, "loss_aux_layer_14": 0.0810546875, "loss_aux_layer_15": 0.0897216796875, "loss_aux_layer_16": 0.09912109375, "loss_aux_layer_17": 0.106689453125, "loss_aux_layer_18": 0.1151123046875, "loss_aux_layer_19": 0.1185302734375, "loss_aux_layer_2": 0.04461669921875, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.05426025390625, "loss_aux_layer_4": 0.05645751953125, "loss_aux_layer_5": 0.05816650390625, "loss_aux_layer_6": 0.06109619140625, "loss_aux_layer_7": 0.05908203125, "loss_aux_layer_8": 0.05865478515625, "loss_aux_layer_9": 0.0576171875, "step": 3342, "total_loss": 0.6807176917791367 }, { "epoch": 0.6618491387843991, "grad_norm": 0.9715556502342224, "learning_rate": 5e-05, "llm_loss": 0.5653572231531143, "loss": 2.6056, "loss_aux_layer_0": 0.015655517578125, "loss_aux_layer_1": 0.03466796875, "loss_aux_layer_10": 0.06158447265625, "loss_aux_layer_11": 0.06591796875, "loss_aux_layer_12": 0.0703125, "loss_aux_layer_13": 0.0760498046875, "loss_aux_layer_14": 0.085205078125, "loss_aux_layer_15": 0.0938720703125, "loss_aux_layer_16": 0.103515625, "loss_aux_layer_17": 0.111572265625, "loss_aux_layer_18": 0.1199951171875, "loss_aux_layer_19": 0.123779296875, "loss_aux_layer_2": 0.04742431640625, "loss_aux_layer_20": 0.131591796875, "loss_aux_layer_21": 0.14013671875, "loss_aux_layer_22": 0.16064453125, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.0572509765625, "loss_aux_layer_4": 0.0595703125, "loss_aux_layer_5": 0.06103515625, "loss_aux_layer_6": 0.06402587890625, "loss_aux_layer_7": 0.06182861328125, "loss_aux_layer_8": 0.061279296875, "loss_aux_layer_9": 0.0601806640625, "step": 3343, "total_loss": 0.6513934582471848 }, { "epoch": 0.6620471193823005, "grad_norm": 0.8100379109382629, "learning_rate": 5e-05, "llm_loss": 0.5942401438951492, "loss": 2.7291, "loss_aux_layer_0": 0.015228271484375, "loss_aux_layer_1": 0.03533935546875, "loss_aux_layer_10": 0.06512451171875, "loss_aux_layer_11": 0.069580078125, "loss_aux_layer_12": 0.0740966796875, "loss_aux_layer_13": 0.0799560546875, "loss_aux_layer_14": 0.08837890625, "loss_aux_layer_15": 0.0966796875, "loss_aux_layer_16": 0.1063232421875, "loss_aux_layer_17": 0.114013671875, "loss_aux_layer_18": 0.1221923828125, "loss_aux_layer_19": 0.1248779296875, "loss_aux_layer_2": 0.04840087890625, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.05877685546875, "loss_aux_layer_4": 0.06182861328125, "loss_aux_layer_5": 0.06390380859375, "loss_aux_layer_6": 0.06719970703125, "loss_aux_layer_7": 0.0650634765625, "loss_aux_layer_8": 0.06451416015625, "loss_aux_layer_9": 0.063720703125, "step": 3344, "total_loss": 0.6822750121355057 }, { "epoch": 0.662245099980202, "grad_norm": 0.8268072605133057, "learning_rate": 5e-05, "llm_loss": 0.6049049645662308, "loss": 2.7692, "loss_aux_layer_0": 0.0147705078125, "loss_aux_layer_1": 0.035888671875, "loss_aux_layer_10": 0.06427001953125, "loss_aux_layer_11": 0.068359375, "loss_aux_layer_12": 0.07275390625, "loss_aux_layer_13": 0.078125, "loss_aux_layer_14": 0.0865478515625, "loss_aux_layer_15": 0.0950927734375, "loss_aux_layer_16": 0.1043701171875, "loss_aux_layer_17": 0.1119384765625, "loss_aux_layer_18": 0.1197509765625, "loss_aux_layer_19": 0.123046875, "loss_aux_layer_2": 0.04925537109375, "loss_aux_layer_20": 0.1309814453125, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.160888671875, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.05975341796875, "loss_aux_layer_4": 0.06219482421875, "loss_aux_layer_5": 0.06390380859375, "loss_aux_layer_6": 0.06683349609375, "loss_aux_layer_7": 0.064697265625, "loss_aux_layer_8": 0.06414794921875, "loss_aux_layer_9": 0.0628662109375, "step": 3345, "total_loss": 0.6922983676195145 }, { "epoch": 0.6624430805781033, "grad_norm": 0.9671553373336792, "learning_rate": 5e-05, "llm_loss": 0.5649018585681915, "loss": 2.5966, "loss_aux_layer_0": 0.0151214599609375, "loss_aux_layer_1": 0.03253173828125, "loss_aux_layer_10": 0.060302734375, "loss_aux_layer_11": 0.0643310546875, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.0745849609375, "loss_aux_layer_14": 0.083740234375, "loss_aux_layer_15": 0.0924072265625, "loss_aux_layer_16": 0.10205078125, "loss_aux_layer_17": 0.10986328125, "loss_aux_layer_18": 0.118408203125, "loss_aux_layer_19": 0.122314453125, "loss_aux_layer_2": 0.0450439453125, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.19482421875, "loss_aux_layer_3": 0.05462646484375, "loss_aux_layer_4": 0.05706787109375, "loss_aux_layer_5": 0.0589599609375, "loss_aux_layer_6": 0.06201171875, "loss_aux_layer_7": 0.0601806640625, "loss_aux_layer_8": 0.05999755859375, "loss_aux_layer_9": 0.0589599609375, "step": 3346, "total_loss": 0.6491559594869614 }, { "epoch": 0.6626410611760047, "grad_norm": 0.6894689202308655, "learning_rate": 5e-05, "llm_loss": 0.5730845183134079, "loss": 2.639, "loss_aux_layer_0": 0.014923095703125, "loss_aux_layer_1": 0.0355224609375, "loss_aux_layer_10": 0.06390380859375, "loss_aux_layer_11": 0.068359375, "loss_aux_layer_12": 0.072998046875, "loss_aux_layer_13": 0.07861328125, "loss_aux_layer_14": 0.0872802734375, "loss_aux_layer_15": 0.0953369140625, "loss_aux_layer_16": 0.104736328125, "loss_aux_layer_17": 0.1124267578125, "loss_aux_layer_18": 0.1201171875, "loss_aux_layer_19": 0.12255859375, "loss_aux_layer_2": 0.04901123046875, "loss_aux_layer_20": 0.12939453125, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.05914306640625, "loss_aux_layer_4": 0.06201171875, "loss_aux_layer_5": 0.0634765625, "loss_aux_layer_6": 0.0662841796875, "loss_aux_layer_7": 0.0640869140625, "loss_aux_layer_8": 0.06341552734375, "loss_aux_layer_9": 0.0623779296875, "step": 3347, "total_loss": 0.6597490459680557 }, { "epoch": 0.6628390417739062, "grad_norm": 1.1306761503219604, "learning_rate": 5e-05, "llm_loss": 0.5937724113464355, "loss": 2.7135, "loss_aux_layer_0": 0.0145111083984375, "loss_aux_layer_1": 0.0345458984375, "loss_aux_layer_10": 0.0611572265625, "loss_aux_layer_11": 0.06524658203125, "loss_aux_layer_12": 0.06982421875, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.0843505859375, "loss_aux_layer_15": 0.0926513671875, "loss_aux_layer_16": 0.1019287109375, "loss_aux_layer_17": 0.1094970703125, "loss_aux_layer_18": 0.11767578125, "loss_aux_layer_19": 0.1207275390625, "loss_aux_layer_2": 0.0474853515625, "loss_aux_layer_20": 0.1280517578125, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.191162109375, "loss_aux_layer_3": 0.05731201171875, "loss_aux_layer_4": 0.0596923828125, "loss_aux_layer_5": 0.061279296875, "loss_aux_layer_6": 0.0640869140625, "loss_aux_layer_7": 0.06195068359375, "loss_aux_layer_8": 0.06121826171875, "loss_aux_layer_9": 0.0599365234375, "step": 3348, "total_loss": 0.6783802360296249 }, { "epoch": 0.6630370223718076, "grad_norm": 1.1159570217132568, "learning_rate": 5e-05, "llm_loss": 0.6236374825239182, "loss": 2.8474, "loss_aux_layer_0": 0.014556884765625, "loss_aux_layer_1": 0.0372314453125, "loss_aux_layer_10": 0.06640625, "loss_aux_layer_11": 0.0706787109375, "loss_aux_layer_12": 0.075439453125, "loss_aux_layer_13": 0.0810546875, "loss_aux_layer_14": 0.0888671875, "loss_aux_layer_15": 0.0966796875, "loss_aux_layer_16": 0.1048583984375, "loss_aux_layer_17": 0.1124267578125, "loss_aux_layer_18": 0.1201171875, "loss_aux_layer_19": 0.121337890625, "loss_aux_layer_2": 0.0516357421875, "loss_aux_layer_20": 0.128662109375, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.06219482421875, "loss_aux_layer_4": 0.06494140625, "loss_aux_layer_5": 0.0665283203125, "loss_aux_layer_6": 0.069580078125, "loss_aux_layer_7": 0.0673828125, "loss_aux_layer_8": 0.0667724609375, "loss_aux_layer_9": 0.0653076171875, "step": 3349, "total_loss": 0.7118547260761261 }, { "epoch": 0.6632350029697089, "grad_norm": 0.7618492841720581, "learning_rate": 5e-05, "llm_loss": 0.5895136594772339, "loss": 2.6841, "loss_aux_layer_0": 0.01458740234375, "loss_aux_layer_1": 0.03204345703125, "loss_aux_layer_10": 0.057861328125, "loss_aux_layer_11": 0.06182861328125, "loss_aux_layer_12": 0.06634521484375, "loss_aux_layer_13": 0.0718994140625, "loss_aux_layer_14": 0.0799560546875, "loss_aux_layer_15": 0.08837890625, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.10546875, "loss_aux_layer_18": 0.114013671875, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.04376220703125, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.052978515625, "loss_aux_layer_4": 0.0552978515625, "loss_aux_layer_5": 0.05670166015625, "loss_aux_layer_6": 0.05938720703125, "loss_aux_layer_7": 0.057373046875, "loss_aux_layer_8": 0.0572509765625, "loss_aux_layer_9": 0.05657958984375, "step": 3350, "total_loss": 0.6710311770439148 }, { "epoch": 0.6634329835676104, "grad_norm": 0.7997620701789856, "learning_rate": 5e-05, "llm_loss": 0.5247985348105431, "loss": 2.4394, "loss_aux_layer_0": 0.014678955078125, "loss_aux_layer_1": 0.03448486328125, "loss_aux_layer_10": 0.0623779296875, "loss_aux_layer_11": 0.06640625, "loss_aux_layer_12": 0.0709228515625, "loss_aux_layer_13": 0.0762939453125, "loss_aux_layer_14": 0.0843505859375, "loss_aux_layer_15": 0.0928955078125, "loss_aux_layer_16": 0.1019287109375, "loss_aux_layer_17": 0.1097412109375, "loss_aux_layer_18": 0.11767578125, "loss_aux_layer_19": 0.120361328125, "loss_aux_layer_2": 0.0477294921875, "loss_aux_layer_20": 0.1280517578125, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.0576171875, "loss_aux_layer_4": 0.0601806640625, "loss_aux_layer_5": 0.06170654296875, "loss_aux_layer_6": 0.0645751953125, "loss_aux_layer_7": 0.062744140625, "loss_aux_layer_8": 0.0623779296875, "loss_aux_layer_9": 0.06103515625, "step": 3351, "total_loss": 0.6098476350307465 }, { "epoch": 0.6636309641655118, "grad_norm": 1.0134090185165405, "learning_rate": 5e-05, "llm_loss": 0.6171045750379562, "loss": 2.8102, "loss_aux_layer_0": 0.0144195556640625, "loss_aux_layer_1": 0.03338623046875, "loss_aux_layer_10": 0.0618896484375, "loss_aux_layer_11": 0.06561279296875, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.0843505859375, "loss_aux_layer_15": 0.0928955078125, "loss_aux_layer_16": 0.1026611328125, "loss_aux_layer_17": 0.1107177734375, "loss_aux_layer_18": 0.118896484375, "loss_aux_layer_19": 0.1229248046875, "loss_aux_layer_2": 0.047119140625, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.0576171875, "loss_aux_layer_4": 0.0601806640625, "loss_aux_layer_5": 0.06195068359375, "loss_aux_layer_6": 0.06463623046875, "loss_aux_layer_7": 0.0625, "loss_aux_layer_8": 0.06201171875, "loss_aux_layer_9": 0.060791015625, "step": 3352, "total_loss": 0.7025482654571533 }, { "epoch": 0.6638289447634131, "grad_norm": 0.7896947860717773, "learning_rate": 5e-05, "llm_loss": 0.6288163065910339, "loss": 2.8598, "loss_aux_layer_0": 0.0150909423828125, "loss_aux_layer_1": 0.033447265625, "loss_aux_layer_10": 0.0623779296875, "loss_aux_layer_11": 0.0662841796875, "loss_aux_layer_12": 0.0711669921875, "loss_aux_layer_13": 0.0771484375, "loss_aux_layer_14": 0.0863037109375, "loss_aux_layer_15": 0.09521484375, "loss_aux_layer_16": 0.1048583984375, "loss_aux_layer_17": 0.113037109375, "loss_aux_layer_18": 0.1217041015625, "loss_aux_layer_19": 0.1240234375, "loss_aux_layer_2": 0.046630859375, "loss_aux_layer_20": 0.131591796875, "loss_aux_layer_21": 0.138916015625, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.0565185546875, "loss_aux_layer_4": 0.05926513671875, "loss_aux_layer_5": 0.06121826171875, "loss_aux_layer_6": 0.064697265625, "loss_aux_layer_7": 0.06256103515625, "loss_aux_layer_8": 0.062255859375, "loss_aux_layer_9": 0.06109619140625, "step": 3353, "total_loss": 0.7149520814418793 }, { "epoch": 0.6640269253613146, "grad_norm": 0.9913033843040466, "learning_rate": 5e-05, "llm_loss": 0.6357728093862534, "loss": 2.8905, "loss_aux_layer_0": 0.014678955078125, "loss_aux_layer_1": 0.03497314453125, "loss_aux_layer_10": 0.0640869140625, "loss_aux_layer_11": 0.068359375, "loss_aux_layer_12": 0.0731201171875, "loss_aux_layer_13": 0.0787353515625, "loss_aux_layer_14": 0.08740234375, "loss_aux_layer_15": 0.0955810546875, "loss_aux_layer_16": 0.104736328125, "loss_aux_layer_17": 0.1124267578125, "loss_aux_layer_18": 0.1201171875, "loss_aux_layer_19": 0.12255859375, "loss_aux_layer_2": 0.04833984375, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.05853271484375, "loss_aux_layer_4": 0.06121826171875, "loss_aux_layer_5": 0.06292724609375, "loss_aux_layer_6": 0.06622314453125, "loss_aux_layer_7": 0.06402587890625, "loss_aux_layer_8": 0.0635986328125, "loss_aux_layer_9": 0.0625, "step": 3354, "total_loss": 0.722630962729454 }, { "epoch": 0.664224905959216, "grad_norm": 0.9095171093940735, "learning_rate": 5e-05, "llm_loss": 0.6557068675756454, "loss": 2.965, "loss_aux_layer_0": 0.015777587890625, "loss_aux_layer_1": 0.03472900390625, "loss_aux_layer_10": 0.06292724609375, "loss_aux_layer_11": 0.06689453125, "loss_aux_layer_12": 0.071533203125, "loss_aux_layer_13": 0.076904296875, "loss_aux_layer_14": 0.085205078125, "loss_aux_layer_15": 0.09375, "loss_aux_layer_16": 0.102783203125, "loss_aux_layer_17": 0.1104736328125, "loss_aux_layer_18": 0.1181640625, "loss_aux_layer_19": 0.12060546875, "loss_aux_layer_2": 0.0478515625, "loss_aux_layer_20": 0.128662109375, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.0577392578125, "loss_aux_layer_4": 0.06048583984375, "loss_aux_layer_5": 0.06207275390625, "loss_aux_layer_6": 0.06536865234375, "loss_aux_layer_7": 0.0633544921875, "loss_aux_layer_8": 0.06280517578125, "loss_aux_layer_9": 0.06170654296875, "step": 3355, "total_loss": 0.7412588894367218 }, { "epoch": 0.6644228865571175, "grad_norm": 0.9267247319221497, "learning_rate": 5e-05, "llm_loss": 0.5317700356245041, "loss": 2.4601, "loss_aux_layer_0": 0.0147552490234375, "loss_aux_layer_1": 0.0323486328125, "loss_aux_layer_10": 0.05950927734375, "loss_aux_layer_11": 0.0635986328125, "loss_aux_layer_12": 0.0679931640625, "loss_aux_layer_13": 0.073486328125, "loss_aux_layer_14": 0.08203125, "loss_aux_layer_15": 0.0908203125, "loss_aux_layer_16": 0.1004638671875, "loss_aux_layer_17": 0.10791015625, "loss_aux_layer_18": 0.1163330078125, "loss_aux_layer_19": 0.120361328125, "loss_aux_layer_2": 0.0450439453125, "loss_aux_layer_20": 0.128173828125, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.1943359375, "loss_aux_layer_3": 0.05426025390625, "loss_aux_layer_4": 0.05682373046875, "loss_aux_layer_5": 0.05841064453125, "loss_aux_layer_6": 0.06134033203125, "loss_aux_layer_7": 0.0595703125, "loss_aux_layer_8": 0.05926513671875, "loss_aux_layer_9": 0.05810546875, "step": 3356, "total_loss": 0.6150139719247818 }, { "epoch": 0.6646208671550188, "grad_norm": 0.852209746837616, "learning_rate": 5e-05, "llm_loss": 0.6020997017621994, "loss": 2.7541, "loss_aux_layer_0": 0.0154876708984375, "loss_aux_layer_1": 0.0357666015625, "loss_aux_layer_10": 0.06378173828125, "loss_aux_layer_11": 0.0679931640625, "loss_aux_layer_12": 0.072509765625, "loss_aux_layer_13": 0.077880859375, "loss_aux_layer_14": 0.086181640625, "loss_aux_layer_15": 0.0941162109375, "loss_aux_layer_16": 0.102783203125, "loss_aux_layer_17": 0.1103515625, "loss_aux_layer_18": 0.118408203125, "loss_aux_layer_19": 0.1214599609375, "loss_aux_layer_2": 0.04962158203125, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.05950927734375, "loss_aux_layer_4": 0.0621337890625, "loss_aux_layer_5": 0.06378173828125, "loss_aux_layer_6": 0.06658935546875, "loss_aux_layer_7": 0.064453125, "loss_aux_layer_8": 0.063720703125, "loss_aux_layer_9": 0.062255859375, "step": 3357, "total_loss": 0.6885202974081039 }, { "epoch": 0.6648188477529202, "grad_norm": 0.9285509586334229, "learning_rate": 5e-05, "llm_loss": 0.6039684116840363, "loss": 2.7483, "loss_aux_layer_0": 0.0148162841796875, "loss_aux_layer_1": 0.03363037109375, "loss_aux_layer_10": 0.0596923828125, "loss_aux_layer_11": 0.06365966796875, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.07373046875, "loss_aux_layer_14": 0.08203125, "loss_aux_layer_15": 0.0904541015625, "loss_aux_layer_16": 0.0997314453125, "loss_aux_layer_17": 0.107666015625, "loss_aux_layer_18": 0.1163330078125, "loss_aux_layer_19": 0.1197509765625, "loss_aux_layer_2": 0.0458984375, "loss_aux_layer_20": 0.127685546875, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.05548095703125, "loss_aux_layer_4": 0.0577392578125, "loss_aux_layer_5": 0.0592041015625, "loss_aux_layer_6": 0.06182861328125, "loss_aux_layer_7": 0.06005859375, "loss_aux_layer_8": 0.05975341796875, "loss_aux_layer_9": 0.05865478515625, "step": 3358, "total_loss": 0.6870795488357544 }, { "epoch": 0.6650168283508217, "grad_norm": 1.1864322423934937, "learning_rate": 5e-05, "llm_loss": 0.585489347577095, "loss": 2.6719, "loss_aux_layer_0": 0.014862060546875, "loss_aux_layer_1": 0.032745361328125, "loss_aux_layer_10": 0.05963134765625, "loss_aux_layer_11": 0.06353759765625, "loss_aux_layer_12": 0.06787109375, "loss_aux_layer_13": 0.073486328125, "loss_aux_layer_14": 0.0816650390625, "loss_aux_layer_15": 0.0894775390625, "loss_aux_layer_16": 0.0985107421875, "loss_aux_layer_17": 0.106201171875, "loss_aux_layer_18": 0.1142578125, "loss_aux_layer_19": 0.117919921875, "loss_aux_layer_2": 0.04486083984375, "loss_aux_layer_20": 0.1258544921875, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.05438232421875, "loss_aux_layer_4": 0.05694580078125, "loss_aux_layer_5": 0.05853271484375, "loss_aux_layer_6": 0.0616455078125, "loss_aux_layer_7": 0.05963134765625, "loss_aux_layer_8": 0.05914306640625, "loss_aux_layer_9": 0.05816650390625, "step": 3359, "total_loss": 0.667985588312149 }, { "epoch": 0.665214808948723, "grad_norm": 0.9840710759162903, "learning_rate": 5e-05, "llm_loss": 0.5976013615727425, "loss": 2.731, "loss_aux_layer_0": 0.0147705078125, "loss_aux_layer_1": 0.0341796875, "loss_aux_layer_10": 0.06170654296875, "loss_aux_layer_11": 0.06591796875, "loss_aux_layer_12": 0.070556640625, "loss_aux_layer_13": 0.0758056640625, "loss_aux_layer_14": 0.0845947265625, "loss_aux_layer_15": 0.093017578125, "loss_aux_layer_16": 0.1025390625, "loss_aux_layer_17": 0.1102294921875, "loss_aux_layer_18": 0.1182861328125, "loss_aux_layer_19": 0.1217041015625, "loss_aux_layer_2": 0.04705810546875, "loss_aux_layer_20": 0.129638671875, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.05633544921875, "loss_aux_layer_4": 0.05889892578125, "loss_aux_layer_5": 0.06048583984375, "loss_aux_layer_6": 0.0635986328125, "loss_aux_layer_7": 0.06201171875, "loss_aux_layer_8": 0.06134033203125, "loss_aux_layer_9": 0.06024169921875, "step": 3360, "total_loss": 0.6827549189329147 }, { "epoch": 0.6654127895466244, "grad_norm": 0.9459938406944275, "learning_rate": 5e-05, "llm_loss": 0.5236087962985039, "loss": 2.4352, "loss_aux_layer_0": 0.0152740478515625, "loss_aux_layer_1": 0.03411865234375, "loss_aux_layer_10": 0.06158447265625, "loss_aux_layer_11": 0.0655517578125, "loss_aux_layer_12": 0.06982421875, "loss_aux_layer_13": 0.0755615234375, "loss_aux_layer_14": 0.0841064453125, "loss_aux_layer_15": 0.0927734375, "loss_aux_layer_16": 0.10205078125, "loss_aux_layer_17": 0.1094970703125, "loss_aux_layer_18": 0.1173095703125, "loss_aux_layer_19": 0.120361328125, "loss_aux_layer_2": 0.04779052734375, "loss_aux_layer_20": 0.1285400390625, "loss_aux_layer_21": 0.136962890625, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.05743408203125, "loss_aux_layer_4": 0.0596923828125, "loss_aux_layer_5": 0.06121826171875, "loss_aux_layer_6": 0.064208984375, "loss_aux_layer_7": 0.0621337890625, "loss_aux_layer_8": 0.0614013671875, "loss_aux_layer_9": 0.0601806640625, "step": 3361, "total_loss": 0.6087890267372131 }, { "epoch": 0.6656107701445259, "grad_norm": 0.9331032633781433, "learning_rate": 5e-05, "llm_loss": 0.6676749438047409, "loss": 3.0048, "loss_aux_layer_0": 0.0147705078125, "loss_aux_layer_1": 0.0335693359375, "loss_aux_layer_10": 0.061279296875, "loss_aux_layer_11": 0.0653076171875, "loss_aux_layer_12": 0.06982421875, "loss_aux_layer_13": 0.0753173828125, "loss_aux_layer_14": 0.08349609375, "loss_aux_layer_15": 0.091552734375, "loss_aux_layer_16": 0.1002197265625, "loss_aux_layer_17": 0.108154296875, "loss_aux_layer_18": 0.1153564453125, "loss_aux_layer_19": 0.1180419921875, "loss_aux_layer_2": 0.04632568359375, "loss_aux_layer_20": 0.1259765625, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.0562744140625, "loss_aux_layer_4": 0.05902099609375, "loss_aux_layer_5": 0.06060791015625, "loss_aux_layer_6": 0.06353759765625, "loss_aux_layer_7": 0.06158447265625, "loss_aux_layer_8": 0.06109619140625, "loss_aux_layer_9": 0.05999755859375, "step": 3362, "total_loss": 0.7511938363313675 }, { "epoch": 0.6658087507424273, "grad_norm": 0.8539632558822632, "learning_rate": 5e-05, "llm_loss": 0.7271002978086472, "loss": 3.2434, "loss_aux_layer_0": 0.014556884765625, "loss_aux_layer_1": 0.0335693359375, "loss_aux_layer_10": 0.06072998046875, "loss_aux_layer_11": 0.06451416015625, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.0743408203125, "loss_aux_layer_14": 0.083251953125, "loss_aux_layer_15": 0.0916748046875, "loss_aux_layer_16": 0.101318359375, "loss_aux_layer_17": 0.109130859375, "loss_aux_layer_18": 0.11669921875, "loss_aux_layer_19": 0.1202392578125, "loss_aux_layer_2": 0.04620361328125, "loss_aux_layer_20": 0.1279296875, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.05548095703125, "loss_aux_layer_4": 0.0579833984375, "loss_aux_layer_5": 0.05963134765625, "loss_aux_layer_6": 0.06304931640625, "loss_aux_layer_7": 0.0609130859375, "loss_aux_layer_8": 0.0604248046875, "loss_aux_layer_9": 0.05926513671875, "step": 3363, "total_loss": 0.8108382970094681 }, { "epoch": 0.6660067313403286, "grad_norm": 0.9509893655776978, "learning_rate": 5e-05, "llm_loss": 0.6130583360791206, "loss": 2.799, "loss_aux_layer_0": 0.0146636962890625, "loss_aux_layer_1": 0.03485107421875, "loss_aux_layer_10": 0.063232421875, "loss_aux_layer_11": 0.0672607421875, "loss_aux_layer_12": 0.0718994140625, "loss_aux_layer_13": 0.07763671875, "loss_aux_layer_14": 0.0867919921875, "loss_aux_layer_15": 0.0953369140625, "loss_aux_layer_16": 0.10498046875, "loss_aux_layer_17": 0.1124267578125, "loss_aux_layer_18": 0.1207275390625, "loss_aux_layer_19": 0.1236572265625, "loss_aux_layer_2": 0.04742431640625, "loss_aux_layer_20": 0.1312255859375, "loss_aux_layer_21": 0.138916015625, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.057373046875, "loss_aux_layer_4": 0.0601806640625, "loss_aux_layer_5": 0.0618896484375, "loss_aux_layer_6": 0.0650634765625, "loss_aux_layer_7": 0.063232421875, "loss_aux_layer_8": 0.06298828125, "loss_aux_layer_9": 0.06207275390625, "step": 3364, "total_loss": 0.6997556835412979 }, { "epoch": 0.66620471193823, "grad_norm": 0.858454167842865, "learning_rate": 5e-05, "llm_loss": 0.5850172936916351, "loss": 2.6785, "loss_aux_layer_0": 0.0150299072265625, "loss_aux_layer_1": 0.03436279296875, "loss_aux_layer_10": 0.0618896484375, "loss_aux_layer_11": 0.06591796875, "loss_aux_layer_12": 0.0699462890625, "loss_aux_layer_13": 0.0755615234375, "loss_aux_layer_14": 0.0843505859375, "loss_aux_layer_15": 0.0928955078125, "loss_aux_layer_16": 0.1021728515625, "loss_aux_layer_17": 0.110107421875, "loss_aux_layer_18": 0.117919921875, "loss_aux_layer_19": 0.1204833984375, "loss_aux_layer_2": 0.04730224609375, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.05706787109375, "loss_aux_layer_4": 0.05950927734375, "loss_aux_layer_5": 0.06103515625, "loss_aux_layer_6": 0.06402587890625, "loss_aux_layer_7": 0.06201171875, "loss_aux_layer_8": 0.06170654296875, "loss_aux_layer_9": 0.060546875, "step": 3365, "total_loss": 0.6696183085441589 }, { "epoch": 0.6664026925361315, "grad_norm": 0.8545876145362854, "learning_rate": 5e-05, "llm_loss": 0.6126875579357147, "loss": 2.8018, "loss_aux_layer_0": 0.0145263671875, "loss_aux_layer_1": 0.03546142578125, "loss_aux_layer_10": 0.06500244140625, "loss_aux_layer_11": 0.06927490234375, "loss_aux_layer_12": 0.073974609375, "loss_aux_layer_13": 0.0797119140625, "loss_aux_layer_14": 0.0880126953125, "loss_aux_layer_15": 0.0963134765625, "loss_aux_layer_16": 0.10546875, "loss_aux_layer_17": 0.11328125, "loss_aux_layer_18": 0.1212158203125, "loss_aux_layer_19": 0.123291015625, "loss_aux_layer_2": 0.04913330078125, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.06011962890625, "loss_aux_layer_4": 0.06298828125, "loss_aux_layer_5": 0.0643310546875, "loss_aux_layer_6": 0.06756591796875, "loss_aux_layer_7": 0.0657958984375, "loss_aux_layer_8": 0.06524658203125, "loss_aux_layer_9": 0.0635986328125, "step": 3366, "total_loss": 0.7004609555006027 }, { "epoch": 0.6666006731340328, "grad_norm": 1.033993124961853, "learning_rate": 5e-05, "llm_loss": 0.5849499553442001, "loss": 2.6796, "loss_aux_layer_0": 0.0144195556640625, "loss_aux_layer_1": 0.033935546875, "loss_aux_layer_10": 0.06195068359375, "loss_aux_layer_11": 0.06597900390625, "loss_aux_layer_12": 0.0704345703125, "loss_aux_layer_13": 0.0760498046875, "loss_aux_layer_14": 0.084228515625, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.1015625, "loss_aux_layer_17": 0.108642578125, "loss_aux_layer_18": 0.11669921875, "loss_aux_layer_19": 0.119140625, "loss_aux_layer_2": 0.046875, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.05755615234375, "loss_aux_layer_4": 0.05999755859375, "loss_aux_layer_5": 0.06201171875, "loss_aux_layer_6": 0.06500244140625, "loss_aux_layer_7": 0.06292724609375, "loss_aux_layer_8": 0.0621337890625, "loss_aux_layer_9": 0.06072998046875, "step": 3367, "total_loss": 0.6699121445417404 }, { "epoch": 0.6667986537319343, "grad_norm": 1.2497491836547852, "learning_rate": 5e-05, "llm_loss": 0.6066632717847824, "loss": 2.755, "loss_aux_layer_0": 0.014404296875, "loss_aux_layer_1": 0.032684326171875, "loss_aux_layer_10": 0.05810546875, "loss_aux_layer_11": 0.0621337890625, "loss_aux_layer_12": 0.06671142578125, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.0892333984375, "loss_aux_layer_16": 0.098876953125, "loss_aux_layer_17": 0.107177734375, "loss_aux_layer_18": 0.11572265625, "loss_aux_layer_19": 0.1195068359375, "loss_aux_layer_2": 0.04449462890625, "loss_aux_layer_20": 0.127197265625, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.0538330078125, "loss_aux_layer_4": 0.0560302734375, "loss_aux_layer_5": 0.0574951171875, "loss_aux_layer_6": 0.060302734375, "loss_aux_layer_7": 0.05816650390625, "loss_aux_layer_8": 0.05767822265625, "loss_aux_layer_9": 0.05670166015625, "step": 3368, "total_loss": 0.6887433528900146 }, { "epoch": 0.6669966343298357, "grad_norm": 1.0688284635543823, "learning_rate": 5e-05, "llm_loss": 0.5514495521783829, "loss": 2.5559, "loss_aux_layer_0": 0.015533447265625, "loss_aux_layer_1": 0.035400390625, "loss_aux_layer_10": 0.06427001953125, "loss_aux_layer_11": 0.068603515625, "loss_aux_layer_12": 0.072998046875, "loss_aux_layer_13": 0.07861328125, "loss_aux_layer_14": 0.0867919921875, "loss_aux_layer_15": 0.0948486328125, "loss_aux_layer_16": 0.1041259765625, "loss_aux_layer_17": 0.11181640625, "loss_aux_layer_18": 0.1199951171875, "loss_aux_layer_19": 0.1229248046875, "loss_aux_layer_2": 0.05010986328125, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.060546875, "loss_aux_layer_4": 0.06298828125, "loss_aux_layer_5": 0.0643310546875, "loss_aux_layer_6": 0.067138671875, "loss_aux_layer_7": 0.065185546875, "loss_aux_layer_8": 0.064697265625, "loss_aux_layer_9": 0.06329345703125, "step": 3369, "total_loss": 0.6389769017696381 }, { "epoch": 0.6671946149277371, "grad_norm": 0.959326982498169, "learning_rate": 5e-05, "llm_loss": 0.6332135796546936, "loss": 2.8864, "loss_aux_layer_0": 0.0148162841796875, "loss_aux_layer_1": 0.035308837890625, "loss_aux_layer_10": 0.065673828125, "loss_aux_layer_11": 0.07000732421875, "loss_aux_layer_12": 0.0748291015625, "loss_aux_layer_13": 0.0804443359375, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.09716796875, "loss_aux_layer_16": 0.1068115234375, "loss_aux_layer_17": 0.1143798828125, "loss_aux_layer_18": 0.1219482421875, "loss_aux_layer_19": 0.1248779296875, "loss_aux_layer_2": 0.04888916015625, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.196044921875, "loss_aux_layer_3": 0.05926513671875, "loss_aux_layer_4": 0.06231689453125, "loss_aux_layer_5": 0.0640869140625, "loss_aux_layer_6": 0.067626953125, "loss_aux_layer_7": 0.065673828125, "loss_aux_layer_8": 0.06536865234375, "loss_aux_layer_9": 0.06414794921875, "step": 3370, "total_loss": 0.7215973734855652 }, { "epoch": 0.6673925955256385, "grad_norm": 0.9372877478599548, "learning_rate": 5e-05, "llm_loss": 0.5351310968399048, "loss": 2.4643, "loss_aux_layer_0": 0.0149993896484375, "loss_aux_layer_1": 0.03173828125, "loss_aux_layer_10": 0.05694580078125, "loss_aux_layer_11": 0.06097412109375, "loss_aux_layer_12": 0.06549072265625, "loss_aux_layer_13": 0.0703125, "loss_aux_layer_14": 0.0791015625, "loss_aux_layer_15": 0.08740234375, "loss_aux_layer_16": 0.0968017578125, "loss_aux_layer_17": 0.104736328125, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.117919921875, "loss_aux_layer_2": 0.04351806640625, "loss_aux_layer_20": 0.1259765625, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.05242919921875, "loss_aux_layer_4": 0.0546875, "loss_aux_layer_5": 0.05621337890625, "loss_aux_layer_6": 0.05889892578125, "loss_aux_layer_7": 0.0570068359375, "loss_aux_layer_8": 0.0565185546875, "loss_aux_layer_9": 0.05560302734375, "step": 3371, "total_loss": 0.6160677671432495 }, { "epoch": 0.6675905761235399, "grad_norm": 1.0760713815689087, "learning_rate": 5e-05, "llm_loss": 0.6135392636060715, "loss": 2.7882, "loss_aux_layer_0": 0.0150146484375, "loss_aux_layer_1": 0.0333251953125, "loss_aux_layer_10": 0.05950927734375, "loss_aux_layer_11": 0.0633544921875, "loss_aux_layer_12": 0.0679931640625, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.08251953125, "loss_aux_layer_15": 0.09130859375, "loss_aux_layer_16": 0.1009521484375, "loss_aux_layer_17": 0.1090087890625, "loss_aux_layer_18": 0.1170654296875, "loss_aux_layer_19": 0.1204833984375, "loss_aux_layer_2": 0.04541015625, "loss_aux_layer_20": 0.1285400390625, "loss_aux_layer_21": 0.136962890625, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.194580078125, "loss_aux_layer_3": 0.05462646484375, "loss_aux_layer_4": 0.05694580078125, "loss_aux_layer_5": 0.05828857421875, "loss_aux_layer_6": 0.0611572265625, "loss_aux_layer_7": 0.05950927734375, "loss_aux_layer_8": 0.05926513671875, "loss_aux_layer_9": 0.0582275390625, "step": 3372, "total_loss": 0.6970600485801697 }, { "epoch": 0.6677885567214413, "grad_norm": 0.9736006259918213, "learning_rate": 5e-05, "llm_loss": 0.5952878892421722, "loss": 2.7085, "loss_aux_layer_0": 0.0154876708984375, "loss_aux_layer_1": 0.03192138671875, "loss_aux_layer_10": 0.05816650390625, "loss_aux_layer_11": 0.06201171875, "loss_aux_layer_12": 0.066650390625, "loss_aux_layer_13": 0.072021484375, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.089111328125, "loss_aux_layer_16": 0.098876953125, "loss_aux_layer_17": 0.1068115234375, "loss_aux_layer_18": 0.11474609375, "loss_aux_layer_19": 0.1182861328125, "loss_aux_layer_2": 0.043701171875, "loss_aux_layer_20": 0.1265869140625, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.052978515625, "loss_aux_layer_4": 0.05523681640625, "loss_aux_layer_5": 0.05694580078125, "loss_aux_layer_6": 0.06005859375, "loss_aux_layer_7": 0.05810546875, "loss_aux_layer_8": 0.057861328125, "loss_aux_layer_9": 0.056884765625, "step": 3373, "total_loss": 0.6771237850189209 }, { "epoch": 0.6679865373193427, "grad_norm": 0.9138309359550476, "learning_rate": 5e-05, "llm_loss": 0.6197781264781952, "loss": 2.8123, "loss_aux_layer_0": 0.014892578125, "loss_aux_layer_1": 0.03302001953125, "loss_aux_layer_10": 0.0596923828125, "loss_aux_layer_11": 0.06353759765625, "loss_aux_layer_12": 0.0679931640625, "loss_aux_layer_13": 0.0736083984375, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.09033203125, "loss_aux_layer_16": 0.0992431640625, "loss_aux_layer_17": 0.107177734375, "loss_aux_layer_18": 0.115966796875, "loss_aux_layer_19": 0.11962890625, "loss_aux_layer_2": 0.0457763671875, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.05487060546875, "loss_aux_layer_4": 0.05731201171875, "loss_aux_layer_5": 0.05889892578125, "loss_aux_layer_6": 0.06158447265625, "loss_aux_layer_7": 0.059814453125, "loss_aux_layer_8": 0.05926513671875, "loss_aux_layer_9": 0.05853271484375, "step": 3374, "total_loss": 0.7030802965164185 }, { "epoch": 0.6681845179172441, "grad_norm": 0.9872811436653137, "learning_rate": 5e-05, "llm_loss": 0.6150060072541237, "loss": 2.8143, "loss_aux_layer_0": 0.0148468017578125, "loss_aux_layer_1": 0.0357666015625, "loss_aux_layer_10": 0.06597900390625, "loss_aux_layer_11": 0.0704345703125, "loss_aux_layer_12": 0.074951171875, "loss_aux_layer_13": 0.0804443359375, "loss_aux_layer_14": 0.089111328125, "loss_aux_layer_15": 0.097412109375, "loss_aux_layer_16": 0.1064453125, "loss_aux_layer_17": 0.1134033203125, "loss_aux_layer_18": 0.121826171875, "loss_aux_layer_19": 0.1240234375, "loss_aux_layer_2": 0.05059814453125, "loss_aux_layer_20": 0.131103515625, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.061279296875, "loss_aux_layer_4": 0.06396484375, "loss_aux_layer_5": 0.06585693359375, "loss_aux_layer_6": 0.0689697265625, "loss_aux_layer_7": 0.066650390625, "loss_aux_layer_8": 0.06591796875, "loss_aux_layer_9": 0.064697265625, "step": 3375, "total_loss": 0.7035838216543198 }, { "epoch": 0.6683824985151455, "grad_norm": 1.2195665836334229, "learning_rate": 5e-05, "llm_loss": 0.5898961946368217, "loss": 2.6952, "loss_aux_layer_0": 0.0146636962890625, "loss_aux_layer_1": 0.033233642578125, "loss_aux_layer_10": 0.0606689453125, "loss_aux_layer_11": 0.06463623046875, "loss_aux_layer_12": 0.069091796875, "loss_aux_layer_13": 0.074462890625, "loss_aux_layer_14": 0.082763671875, "loss_aux_layer_15": 0.09130859375, "loss_aux_layer_16": 0.100341796875, "loss_aux_layer_17": 0.1077880859375, "loss_aux_layer_18": 0.1160888671875, "loss_aux_layer_19": 0.11962890625, "loss_aux_layer_2": 0.0458984375, "loss_aux_layer_20": 0.127685546875, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.055908203125, "loss_aux_layer_4": 0.0582275390625, "loss_aux_layer_5": 0.0599365234375, "loss_aux_layer_6": 0.06292724609375, "loss_aux_layer_7": 0.06103515625, "loss_aux_layer_8": 0.06048583984375, "loss_aux_layer_9": 0.059326171875, "step": 3376, "total_loss": 0.6737891584634781 }, { "epoch": 0.668580479113047, "grad_norm": 0.833050012588501, "learning_rate": 5e-05, "llm_loss": 0.5256197899580002, "loss": 2.4447, "loss_aux_layer_0": 0.0144805908203125, "loss_aux_layer_1": 0.033203125, "loss_aux_layer_10": 0.06109619140625, "loss_aux_layer_11": 0.065185546875, "loss_aux_layer_12": 0.06982421875, "loss_aux_layer_13": 0.0753173828125, "loss_aux_layer_14": 0.0843505859375, "loss_aux_layer_15": 0.09326171875, "loss_aux_layer_16": 0.103515625, "loss_aux_layer_17": 0.1116943359375, "loss_aux_layer_18": 0.120361328125, "loss_aux_layer_19": 0.1243896484375, "loss_aux_layer_2": 0.0462646484375, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.1982421875, "loss_aux_layer_3": 0.055908203125, "loss_aux_layer_4": 0.0582275390625, "loss_aux_layer_5": 0.05987548828125, "loss_aux_layer_6": 0.0626220703125, "loss_aux_layer_7": 0.06085205078125, "loss_aux_layer_8": 0.06048583984375, "loss_aux_layer_9": 0.05950927734375, "step": 3377, "total_loss": 0.6111666113138199 }, { "epoch": 0.6687784597109483, "grad_norm": 1.1447184085845947, "learning_rate": 5e-05, "llm_loss": 0.6046699583530426, "loss": 2.7552, "loss_aux_layer_0": 0.0141448974609375, "loss_aux_layer_1": 0.032867431640625, "loss_aux_layer_10": 0.06011962890625, "loss_aux_layer_11": 0.064208984375, "loss_aux_layer_12": 0.069091796875, "loss_aux_layer_13": 0.0751953125, "loss_aux_layer_14": 0.0838623046875, "loss_aux_layer_15": 0.0924072265625, "loss_aux_layer_16": 0.10205078125, "loss_aux_layer_17": 0.1102294921875, "loss_aux_layer_18": 0.11865234375, "loss_aux_layer_19": 0.12158203125, "loss_aux_layer_2": 0.04498291015625, "loss_aux_layer_20": 0.12939453125, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.194580078125, "loss_aux_layer_3": 0.05474853515625, "loss_aux_layer_4": 0.05712890625, "loss_aux_layer_5": 0.05865478515625, "loss_aux_layer_6": 0.06170654296875, "loss_aux_layer_7": 0.06011962890625, "loss_aux_layer_8": 0.05963134765625, "loss_aux_layer_9": 0.05859375, "step": 3378, "total_loss": 0.6887878030538559 }, { "epoch": 0.6689764403088497, "grad_norm": 1.1420947313308716, "learning_rate": 5e-05, "llm_loss": 0.5222631990909576, "loss": 2.4286, "loss_aux_layer_0": 0.01470947265625, "loss_aux_layer_1": 0.03485107421875, "loss_aux_layer_10": 0.06256103515625, "loss_aux_layer_11": 0.06640625, "loss_aux_layer_12": 0.07080078125, "loss_aux_layer_13": 0.076171875, "loss_aux_layer_14": 0.08447265625, "loss_aux_layer_15": 0.0926513671875, "loss_aux_layer_16": 0.1014404296875, "loss_aux_layer_17": 0.1087646484375, "loss_aux_layer_18": 0.116943359375, "loss_aux_layer_19": 0.11962890625, "loss_aux_layer_2": 0.047607421875, "loss_aux_layer_20": 0.1275634765625, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.0577392578125, "loss_aux_layer_4": 0.06072998046875, "loss_aux_layer_5": 0.062255859375, "loss_aux_layer_6": 0.0653076171875, "loss_aux_layer_7": 0.06317138671875, "loss_aux_layer_8": 0.0626220703125, "loss_aux_layer_9": 0.06134033203125, "step": 3379, "total_loss": 0.607156403362751 }, { "epoch": 0.6691744209067512, "grad_norm": 1.0373677015304565, "learning_rate": 5e-05, "llm_loss": 0.6111844480037689, "loss": 2.7788, "loss_aux_layer_0": 0.0146636962890625, "loss_aux_layer_1": 0.03289794921875, "loss_aux_layer_10": 0.05987548828125, "loss_aux_layer_11": 0.0635986328125, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.07373046875, "loss_aux_layer_14": 0.08251953125, "loss_aux_layer_15": 0.09130859375, "loss_aux_layer_16": 0.100830078125, "loss_aux_layer_17": 0.1092529296875, "loss_aux_layer_18": 0.11767578125, "loss_aux_layer_19": 0.12060546875, "loss_aux_layer_2": 0.0452880859375, "loss_aux_layer_20": 0.128173828125, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.05511474609375, "loss_aux_layer_4": 0.0574951171875, "loss_aux_layer_5": 0.05908203125, "loss_aux_layer_6": 0.0618896484375, "loss_aux_layer_7": 0.05987548828125, "loss_aux_layer_8": 0.0596923828125, "loss_aux_layer_9": 0.05841064453125, "step": 3380, "total_loss": 0.6946972161531448 }, { "epoch": 0.6693724015046525, "grad_norm": 1.4697679281234741, "learning_rate": 5e-05, "llm_loss": 0.5328005105257034, "loss": 2.4857, "loss_aux_layer_0": 0.015899658203125, "loss_aux_layer_1": 0.03570556640625, "loss_aux_layer_10": 0.06390380859375, "loss_aux_layer_11": 0.0682373046875, "loss_aux_layer_12": 0.0731201171875, "loss_aux_layer_13": 0.078857421875, "loss_aux_layer_14": 0.0880126953125, "loss_aux_layer_15": 0.096923828125, "loss_aux_layer_16": 0.1063232421875, "loss_aux_layer_17": 0.1138916015625, "loss_aux_layer_18": 0.1226806640625, "loss_aux_layer_19": 0.12548828125, "loss_aux_layer_2": 0.0499267578125, "loss_aux_layer_20": 0.13330078125, "loss_aux_layer_21": 0.142333984375, "loss_aux_layer_22": 0.16455078125, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.05987548828125, "loss_aux_layer_4": 0.06231689453125, "loss_aux_layer_5": 0.06378173828125, "loss_aux_layer_6": 0.06695556640625, "loss_aux_layer_7": 0.0648193359375, "loss_aux_layer_8": 0.06414794921875, "loss_aux_layer_9": 0.06298828125, "step": 3381, "total_loss": 0.6214360296726227 }, { "epoch": 0.6695703821025539, "grad_norm": 1.2461525201797485, "learning_rate": 5e-05, "llm_loss": 0.5384061187505722, "loss": 2.4915, "loss_aux_layer_0": 0.01544189453125, "loss_aux_layer_1": 0.03314208984375, "loss_aux_layer_10": 0.0601806640625, "loss_aux_layer_11": 0.06402587890625, "loss_aux_layer_12": 0.0687255859375, "loss_aux_layer_13": 0.07421875, "loss_aux_layer_14": 0.0833740234375, "loss_aux_layer_15": 0.0919189453125, "loss_aux_layer_16": 0.1016845703125, "loss_aux_layer_17": 0.1092529296875, "loss_aux_layer_18": 0.117919921875, "loss_aux_layer_19": 0.1217041015625, "loss_aux_layer_2": 0.04632568359375, "loss_aux_layer_20": 0.1297607421875, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.05657958984375, "loss_aux_layer_4": 0.05853271484375, "loss_aux_layer_5": 0.0596923828125, "loss_aux_layer_6": 0.06256103515625, "loss_aux_layer_7": 0.0604248046875, "loss_aux_layer_8": 0.05999755859375, "loss_aux_layer_9": 0.05889892578125, "step": 3382, "total_loss": 0.6228729784488678 }, { "epoch": 0.6697683627004554, "grad_norm": 1.1473875045776367, "learning_rate": 5e-05, "llm_loss": 0.6043356955051422, "loss": 2.7658, "loss_aux_layer_0": 0.0147552490234375, "loss_aux_layer_1": 0.03466796875, "loss_aux_layer_10": 0.06365966796875, "loss_aux_layer_11": 0.06781005859375, "loss_aux_layer_12": 0.072265625, "loss_aux_layer_13": 0.0784912109375, "loss_aux_layer_14": 0.0875244140625, "loss_aux_layer_15": 0.0963134765625, "loss_aux_layer_16": 0.10595703125, "loss_aux_layer_17": 0.1134033203125, "loss_aux_layer_18": 0.121337890625, "loss_aux_layer_19": 0.1236572265625, "loss_aux_layer_2": 0.04852294921875, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.19482421875, "loss_aux_layer_3": 0.05908203125, "loss_aux_layer_4": 0.0616455078125, "loss_aux_layer_5": 0.06329345703125, "loss_aux_layer_6": 0.06646728515625, "loss_aux_layer_7": 0.06439208984375, "loss_aux_layer_8": 0.06365966796875, "loss_aux_layer_9": 0.062255859375, "step": 3383, "total_loss": 0.6914487332105637 }, { "epoch": 0.6699663432983568, "grad_norm": 1.1209760904312134, "learning_rate": 5e-05, "llm_loss": 0.53705795109272, "loss": 2.4836, "loss_aux_layer_0": 0.0161285400390625, "loss_aux_layer_1": 0.0338134765625, "loss_aux_layer_10": 0.06024169921875, "loss_aux_layer_11": 0.06439208984375, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.07421875, "loss_aux_layer_14": 0.0830078125, "loss_aux_layer_15": 0.0914306640625, "loss_aux_layer_16": 0.1005859375, "loss_aux_layer_17": 0.1085205078125, "loss_aux_layer_18": 0.11669921875, "loss_aux_layer_19": 0.1197509765625, "loss_aux_layer_2": 0.04632568359375, "loss_aux_layer_20": 0.127197265625, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.156494140625, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.05584716796875, "loss_aux_layer_4": 0.05816650390625, "loss_aux_layer_5": 0.05975341796875, "loss_aux_layer_6": 0.062744140625, "loss_aux_layer_7": 0.06060791015625, "loss_aux_layer_8": 0.0599365234375, "loss_aux_layer_9": 0.058837890625, "step": 3384, "total_loss": 0.6208965182304382 }, { "epoch": 0.6701643238962581, "grad_norm": 1.1142617464065552, "learning_rate": 5e-05, "llm_loss": 0.6599044948816299, "loss": 2.9804, "loss_aux_layer_0": 0.014556884765625, "loss_aux_layer_1": 0.03509521484375, "loss_aux_layer_10": 0.06298828125, "loss_aux_layer_11": 0.0667724609375, "loss_aux_layer_12": 0.0711669921875, "loss_aux_layer_13": 0.0765380859375, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.0927734375, "loss_aux_layer_16": 0.101318359375, "loss_aux_layer_17": 0.10888671875, "loss_aux_layer_18": 0.116455078125, "loss_aux_layer_19": 0.1192626953125, "loss_aux_layer_2": 0.047607421875, "loss_aux_layer_20": 0.1268310546875, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.05841064453125, "loss_aux_layer_4": 0.06097412109375, "loss_aux_layer_5": 0.06280517578125, "loss_aux_layer_6": 0.065673828125, "loss_aux_layer_7": 0.063720703125, "loss_aux_layer_8": 0.06292724609375, "loss_aux_layer_9": 0.06146240234375, "step": 3385, "total_loss": 0.745107039809227 }, { "epoch": 0.6703623044941596, "grad_norm": 1.4562275409698486, "learning_rate": 5e-05, "llm_loss": 0.6030898243188858, "loss": 2.7633, "loss_aux_layer_0": 0.01507568359375, "loss_aux_layer_1": 0.03533935546875, "loss_aux_layer_10": 0.063720703125, "loss_aux_layer_11": 0.0677490234375, "loss_aux_layer_12": 0.07275390625, "loss_aux_layer_13": 0.078857421875, "loss_aux_layer_14": 0.0880126953125, "loss_aux_layer_15": 0.09716796875, "loss_aux_layer_16": 0.1068115234375, "loss_aux_layer_17": 0.1148681640625, "loss_aux_layer_18": 0.1229248046875, "loss_aux_layer_19": 0.1251220703125, "loss_aux_layer_2": 0.048583984375, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.059326171875, "loss_aux_layer_4": 0.06201171875, "loss_aux_layer_5": 0.06353759765625, "loss_aux_layer_6": 0.06640625, "loss_aux_layer_7": 0.0640869140625, "loss_aux_layer_8": 0.0634765625, "loss_aux_layer_9": 0.0623779296875, "step": 3386, "total_loss": 0.6908324956893921 }, { "epoch": 0.670560285092061, "grad_norm": 0.8627204298973083, "learning_rate": 5e-05, "llm_loss": 0.6007399260997772, "loss": 2.7398, "loss_aux_layer_0": 0.01458740234375, "loss_aux_layer_1": 0.033660888671875, "loss_aux_layer_10": 0.060546875, "loss_aux_layer_11": 0.06463623046875, "loss_aux_layer_12": 0.0692138671875, "loss_aux_layer_13": 0.074462890625, "loss_aux_layer_14": 0.0830078125, "loss_aux_layer_15": 0.0914306640625, "loss_aux_layer_16": 0.1005859375, "loss_aux_layer_17": 0.1083984375, "loss_aux_layer_18": 0.1168212890625, "loss_aux_layer_19": 0.120361328125, "loss_aux_layer_2": 0.04608154296875, "loss_aux_layer_20": 0.128173828125, "loss_aux_layer_21": 0.136962890625, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.055908203125, "loss_aux_layer_4": 0.05810546875, "loss_aux_layer_5": 0.0596923828125, "loss_aux_layer_6": 0.0626220703125, "loss_aux_layer_7": 0.0609130859375, "loss_aux_layer_8": 0.0604248046875, "loss_aux_layer_9": 0.059326171875, "step": 3387, "total_loss": 0.6849454641342163 }, { "epoch": 0.6707582656899623, "grad_norm": 1.1015515327453613, "learning_rate": 5e-05, "llm_loss": 0.5540027245879173, "loss": 2.5585, "loss_aux_layer_0": 0.015289306640625, "loss_aux_layer_1": 0.033721923828125, "loss_aux_layer_10": 0.06182861328125, "loss_aux_layer_11": 0.0657958984375, "loss_aux_layer_12": 0.07080078125, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.0858154296875, "loss_aux_layer_15": 0.0943603515625, "loss_aux_layer_16": 0.1038818359375, "loss_aux_layer_17": 0.1112060546875, "loss_aux_layer_18": 0.119384765625, "loss_aux_layer_19": 0.1220703125, "loss_aux_layer_2": 0.0469970703125, "loss_aux_layer_20": 0.1300048828125, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.05694580078125, "loss_aux_layer_4": 0.05938720703125, "loss_aux_layer_5": 0.0611572265625, "loss_aux_layer_6": 0.064208984375, "loss_aux_layer_7": 0.0621337890625, "loss_aux_layer_8": 0.06158447265625, "loss_aux_layer_9": 0.06060791015625, "step": 3388, "total_loss": 0.6396204978227615 }, { "epoch": 0.6709562462878638, "grad_norm": 0.9984591007232666, "learning_rate": 5e-05, "llm_loss": 0.5945158898830414, "loss": 2.7138, "loss_aux_layer_0": 0.015533447265625, "loss_aux_layer_1": 0.03289794921875, "loss_aux_layer_10": 0.05963134765625, "loss_aux_layer_11": 0.06396484375, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.07373046875, "loss_aux_layer_14": 0.082763671875, "loss_aux_layer_15": 0.091552734375, "loss_aux_layer_16": 0.1015625, "loss_aux_layer_17": 0.109619140625, "loss_aux_layer_18": 0.1180419921875, "loss_aux_layer_19": 0.12158203125, "loss_aux_layer_2": 0.04541015625, "loss_aux_layer_20": 0.12939453125, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.05499267578125, "loss_aux_layer_4": 0.05780029296875, "loss_aux_layer_5": 0.059326171875, "loss_aux_layer_6": 0.06207275390625, "loss_aux_layer_7": 0.06005859375, "loss_aux_layer_8": 0.0594482421875, "loss_aux_layer_9": 0.0582275390625, "step": 3389, "total_loss": 0.6784514784812927 }, { "epoch": 0.6711542268857652, "grad_norm": 1.4705320596694946, "learning_rate": 5e-05, "llm_loss": 0.6402013748884201, "loss": 2.9067, "loss_aux_layer_0": 0.014862060546875, "loss_aux_layer_1": 0.03466796875, "loss_aux_layer_10": 0.06292724609375, "loss_aux_layer_11": 0.067626953125, "loss_aux_layer_12": 0.0723876953125, "loss_aux_layer_13": 0.0780029296875, "loss_aux_layer_14": 0.08642578125, "loss_aux_layer_15": 0.0950927734375, "loss_aux_layer_16": 0.1048583984375, "loss_aux_layer_17": 0.112548828125, "loss_aux_layer_18": 0.1209716796875, "loss_aux_layer_19": 0.123291015625, "loss_aux_layer_2": 0.0474853515625, "loss_aux_layer_20": 0.13037109375, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.196044921875, "loss_aux_layer_3": 0.0576171875, "loss_aux_layer_4": 0.0601806640625, "loss_aux_layer_5": 0.0618896484375, "loss_aux_layer_6": 0.06512451171875, "loss_aux_layer_7": 0.0631103515625, "loss_aux_layer_8": 0.06256103515625, "loss_aux_layer_9": 0.0614013671875, "step": 3390, "total_loss": 0.7266753613948822 }, { "epoch": 0.6713522074836666, "grad_norm": 1.195763349533081, "learning_rate": 5e-05, "llm_loss": 0.5553762912750244, "loss": 2.5725, "loss_aux_layer_0": 0.01654052734375, "loss_aux_layer_1": 0.03472900390625, "loss_aux_layer_10": 0.063232421875, "loss_aux_layer_11": 0.067626953125, "loss_aux_layer_12": 0.0723876953125, "loss_aux_layer_13": 0.0780029296875, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.095458984375, "loss_aux_layer_16": 0.105224609375, "loss_aux_layer_17": 0.11279296875, "loss_aux_layer_18": 0.12158203125, "loss_aux_layer_19": 0.125, "loss_aux_layer_2": 0.04888916015625, "loss_aux_layer_20": 0.133056640625, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.163330078125, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.0592041015625, "loss_aux_layer_4": 0.06134033203125, "loss_aux_layer_5": 0.06304931640625, "loss_aux_layer_6": 0.0660400390625, "loss_aux_layer_7": 0.0638427734375, "loss_aux_layer_8": 0.06317138671875, "loss_aux_layer_9": 0.06182861328125, "step": 3391, "total_loss": 0.643127903342247 }, { "epoch": 0.671550188081568, "grad_norm": 0.8174457550048828, "learning_rate": 5e-05, "llm_loss": 0.5697546899318695, "loss": 2.6219, "loss_aux_layer_0": 0.0145721435546875, "loss_aux_layer_1": 0.03363037109375, "loss_aux_layer_10": 0.0611572265625, "loss_aux_layer_11": 0.0654296875, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.0849609375, "loss_aux_layer_15": 0.0936279296875, "loss_aux_layer_16": 0.10302734375, "loss_aux_layer_17": 0.1112060546875, "loss_aux_layer_18": 0.119873046875, "loss_aux_layer_19": 0.12353515625, "loss_aux_layer_2": 0.04693603515625, "loss_aux_layer_20": 0.131103515625, "loss_aux_layer_21": 0.1396484375, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.199462890625, "loss_aux_layer_3": 0.0565185546875, "loss_aux_layer_4": 0.05889892578125, "loss_aux_layer_5": 0.06072998046875, "loss_aux_layer_6": 0.06365966796875, "loss_aux_layer_7": 0.0615234375, "loss_aux_layer_8": 0.06103515625, "loss_aux_layer_9": 0.0599365234375, "step": 3392, "total_loss": 0.6554774343967438 }, { "epoch": 0.6717481686794694, "grad_norm": 1.0355720520019531, "learning_rate": 5e-05, "llm_loss": 0.5612142384052277, "loss": 2.5998, "loss_aux_layer_0": 0.01611328125, "loss_aux_layer_1": 0.03497314453125, "loss_aux_layer_10": 0.0653076171875, "loss_aux_layer_11": 0.06982421875, "loss_aux_layer_12": 0.0743408203125, "loss_aux_layer_13": 0.079833984375, "loss_aux_layer_14": 0.0885009765625, "loss_aux_layer_15": 0.0970458984375, "loss_aux_layer_16": 0.1065673828125, "loss_aux_layer_17": 0.113525390625, "loss_aux_layer_18": 0.1221923828125, "loss_aux_layer_19": 0.1253662109375, "loss_aux_layer_2": 0.04876708984375, "loss_aux_layer_20": 0.13330078125, "loss_aux_layer_21": 0.1416015625, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.20166015625, "loss_aux_layer_3": 0.05938720703125, "loss_aux_layer_4": 0.06207275390625, "loss_aux_layer_5": 0.0638427734375, "loss_aux_layer_6": 0.0675048828125, "loss_aux_layer_7": 0.0655517578125, "loss_aux_layer_8": 0.0650634765625, "loss_aux_layer_9": 0.06427001953125, "step": 3393, "total_loss": 0.649951159954071 }, { "epoch": 0.6719461492773708, "grad_norm": 0.9899518489837646, "learning_rate": 5e-05, "llm_loss": 0.5449923276901245, "loss": 2.5089, "loss_aux_layer_0": 0.014495849609375, "loss_aux_layer_1": 0.0325927734375, "loss_aux_layer_10": 0.05877685546875, "loss_aux_layer_11": 0.062744140625, "loss_aux_layer_12": 0.0673828125, "loss_aux_layer_13": 0.0728759765625, "loss_aux_layer_14": 0.0816650390625, "loss_aux_layer_15": 0.0902099609375, "loss_aux_layer_16": 0.099853515625, "loss_aux_layer_17": 0.1075439453125, "loss_aux_layer_18": 0.115478515625, "loss_aux_layer_19": 0.118896484375, "loss_aux_layer_2": 0.0447998046875, "loss_aux_layer_20": 0.1265869140625, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.0543212890625, "loss_aux_layer_4": 0.05682373046875, "loss_aux_layer_5": 0.0582275390625, "loss_aux_layer_6": 0.06097412109375, "loss_aux_layer_7": 0.05908203125, "loss_aux_layer_8": 0.0587158203125, "loss_aux_layer_9": 0.0574951171875, "step": 3394, "total_loss": 0.6272175163030624 }, { "epoch": 0.6721441298752723, "grad_norm": 1.007519006729126, "learning_rate": 5e-05, "llm_loss": 0.5726640373468399, "loss": 2.6376, "loss_aux_layer_0": 0.0164794921875, "loss_aux_layer_1": 0.03375244140625, "loss_aux_layer_10": 0.06268310546875, "loss_aux_layer_11": 0.0670166015625, "loss_aux_layer_12": 0.0718994140625, "loss_aux_layer_13": 0.077880859375, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.095458984375, "loss_aux_layer_16": 0.1046142578125, "loss_aux_layer_17": 0.1126708984375, "loss_aux_layer_18": 0.1204833984375, "loss_aux_layer_19": 0.12451171875, "loss_aux_layer_2": 0.04656982421875, "loss_aux_layer_20": 0.132568359375, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.162109375, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.0562744140625, "loss_aux_layer_4": 0.05902099609375, "loss_aux_layer_5": 0.060791015625, "loss_aux_layer_6": 0.06390380859375, "loss_aux_layer_7": 0.06182861328125, "loss_aux_layer_8": 0.06195068359375, "loss_aux_layer_9": 0.06097412109375, "step": 3395, "total_loss": 0.6593942940235138 }, { "epoch": 0.6723421104731736, "grad_norm": 0.8867025375366211, "learning_rate": 5e-05, "llm_loss": 0.5813748687505722, "loss": 2.6693, "loss_aux_layer_0": 0.014862060546875, "loss_aux_layer_1": 0.0341796875, "loss_aux_layer_10": 0.0623779296875, "loss_aux_layer_11": 0.0662841796875, "loss_aux_layer_12": 0.0709228515625, "loss_aux_layer_13": 0.076416015625, "loss_aux_layer_14": 0.0853271484375, "loss_aux_layer_15": 0.093505859375, "loss_aux_layer_16": 0.1031494140625, "loss_aux_layer_17": 0.111083984375, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.1217041015625, "loss_aux_layer_2": 0.04754638671875, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.05804443359375, "loss_aux_layer_4": 0.0606689453125, "loss_aux_layer_5": 0.0623779296875, "loss_aux_layer_6": 0.0655517578125, "loss_aux_layer_7": 0.06329345703125, "loss_aux_layer_8": 0.062744140625, "loss_aux_layer_9": 0.0611572265625, "step": 3396, "total_loss": 0.6673257499933243 }, { "epoch": 0.672540091071075, "grad_norm": 0.9511080980300903, "learning_rate": 5e-05, "llm_loss": 0.5455809384584427, "loss": 2.5166, "loss_aux_layer_0": 0.016265869140625, "loss_aux_layer_1": 0.03369140625, "loss_aux_layer_10": 0.05950927734375, "loss_aux_layer_11": 0.0635986328125, "loss_aux_layer_12": 0.0679931640625, "loss_aux_layer_13": 0.073974609375, "loss_aux_layer_14": 0.082763671875, "loss_aux_layer_15": 0.0914306640625, "loss_aux_layer_16": 0.100830078125, "loss_aux_layer_17": 0.1087646484375, "loss_aux_layer_18": 0.1170654296875, "loss_aux_layer_19": 0.1201171875, "loss_aux_layer_2": 0.0450439453125, "loss_aux_layer_20": 0.1279296875, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.05474853515625, "loss_aux_layer_4": 0.05706787109375, "loss_aux_layer_5": 0.058837890625, "loss_aux_layer_6": 0.06170654296875, "loss_aux_layer_7": 0.059814453125, "loss_aux_layer_8": 0.0592041015625, "loss_aux_layer_9": 0.05804443359375, "step": 3397, "total_loss": 0.6291544437408447 }, { "epoch": 0.6727380716689765, "grad_norm": 1.0668144226074219, "learning_rate": 5e-05, "llm_loss": 0.6589133813977242, "loss": 2.9789, "loss_aux_layer_0": 0.015350341796875, "loss_aux_layer_1": 0.0340576171875, "loss_aux_layer_10": 0.06298828125, "loss_aux_layer_11": 0.06695556640625, "loss_aux_layer_12": 0.0714111328125, "loss_aux_layer_13": 0.0765380859375, "loss_aux_layer_14": 0.0849609375, "loss_aux_layer_15": 0.09326171875, "loss_aux_layer_16": 0.1021728515625, "loss_aux_layer_17": 0.1103515625, "loss_aux_layer_18": 0.1181640625, "loss_aux_layer_19": 0.1214599609375, "loss_aux_layer_2": 0.04754638671875, "loss_aux_layer_20": 0.129150390625, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.0574951171875, "loss_aux_layer_4": 0.060546875, "loss_aux_layer_5": 0.06219482421875, "loss_aux_layer_6": 0.0654296875, "loss_aux_layer_7": 0.063720703125, "loss_aux_layer_8": 0.0631103515625, "loss_aux_layer_9": 0.06170654296875, "step": 3398, "total_loss": 0.7447342276573181 }, { "epoch": 0.6729360522668778, "grad_norm": 1.0617622137069702, "learning_rate": 5e-05, "llm_loss": 0.6351044178009033, "loss": 2.8829, "loss_aux_layer_0": 0.0154876708984375, "loss_aux_layer_1": 0.03399658203125, "loss_aux_layer_10": 0.06121826171875, "loss_aux_layer_11": 0.06536865234375, "loss_aux_layer_12": 0.0701904296875, "loss_aux_layer_13": 0.075927734375, "loss_aux_layer_14": 0.0849609375, "loss_aux_layer_15": 0.09375, "loss_aux_layer_16": 0.1033935546875, "loss_aux_layer_17": 0.1112060546875, "loss_aux_layer_18": 0.1199951171875, "loss_aux_layer_19": 0.1226806640625, "loss_aux_layer_2": 0.04730224609375, "loss_aux_layer_20": 0.13037109375, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.0567626953125, "loss_aux_layer_4": 0.059326171875, "loss_aux_layer_5": 0.06097412109375, "loss_aux_layer_6": 0.0640869140625, "loss_aux_layer_7": 0.0618896484375, "loss_aux_layer_8": 0.061279296875, "loss_aux_layer_9": 0.0601806640625, "step": 3399, "total_loss": 0.7207230478525162 }, { "epoch": 0.6731340328647792, "grad_norm": 1.0061143636703491, "learning_rate": 5e-05, "llm_loss": 0.5656515657901764, "loss": 2.5977, "loss_aux_layer_0": 0.016693115234375, "loss_aux_layer_1": 0.033660888671875, "loss_aux_layer_10": 0.060302734375, "loss_aux_layer_11": 0.06396484375, "loss_aux_layer_12": 0.068603515625, "loss_aux_layer_13": 0.0740966796875, "loss_aux_layer_14": 0.08251953125, "loss_aux_layer_15": 0.0906982421875, "loss_aux_layer_16": 0.0999755859375, "loss_aux_layer_17": 0.107666015625, "loss_aux_layer_18": 0.1158447265625, "loss_aux_layer_19": 0.1192626953125, "loss_aux_layer_2": 0.04595947265625, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.194580078125, "loss_aux_layer_3": 0.05572509765625, "loss_aux_layer_4": 0.05810546875, "loss_aux_layer_5": 0.0596923828125, "loss_aux_layer_6": 0.06256103515625, "loss_aux_layer_7": 0.0604248046875, "loss_aux_layer_8": 0.06024169921875, "loss_aux_layer_9": 0.05902099609375, "step": 3400, "total_loss": 0.6494158208370209 }, { "epoch": 0.6733320134626807, "grad_norm": 1.0834522247314453, "learning_rate": 5e-05, "llm_loss": 0.6800836622714996, "loss": 3.0532, "loss_aux_layer_0": 0.0145263671875, "loss_aux_layer_1": 0.03485107421875, "loss_aux_layer_10": 0.0606689453125, "loss_aux_layer_11": 0.0645751953125, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.0743408203125, "loss_aux_layer_14": 0.082763671875, "loss_aux_layer_15": 0.0908203125, "loss_aux_layer_16": 0.099853515625, "loss_aux_layer_17": 0.10791015625, "loss_aux_layer_18": 0.115478515625, "loss_aux_layer_19": 0.117919921875, "loss_aux_layer_2": 0.04779052734375, "loss_aux_layer_20": 0.125244140625, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.057373046875, "loss_aux_layer_4": 0.05975341796875, "loss_aux_layer_5": 0.0609130859375, "loss_aux_layer_6": 0.063720703125, "loss_aux_layer_7": 0.0616455078125, "loss_aux_layer_8": 0.06097412109375, "loss_aux_layer_9": 0.05975341796875, "step": 3401, "total_loss": 0.7632929682731628 }, { "epoch": 0.6735299940605821, "grad_norm": 0.9090831279754639, "learning_rate": 5e-05, "llm_loss": 0.5337074771523476, "loss": 2.4661, "loss_aux_layer_0": 0.01617431640625, "loss_aux_layer_1": 0.03387451171875, "loss_aux_layer_10": 0.05889892578125, "loss_aux_layer_11": 0.062744140625, "loss_aux_layer_12": 0.0670166015625, "loss_aux_layer_13": 0.072509765625, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.0887451171875, "loss_aux_layer_16": 0.09814453125, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.0462646484375, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.05560302734375, "loss_aux_layer_4": 0.057861328125, "loss_aux_layer_5": 0.059326171875, "loss_aux_layer_6": 0.0618896484375, "loss_aux_layer_7": 0.05963134765625, "loss_aux_layer_8": 0.05914306640625, "loss_aux_layer_9": 0.0577392578125, "step": 3402, "total_loss": 0.6165174841880798 }, { "epoch": 0.6737279746584834, "grad_norm": 0.9367561340332031, "learning_rate": 5e-05, "llm_loss": 0.6341207027435303, "loss": 2.8741, "loss_aux_layer_0": 0.0149078369140625, "loss_aux_layer_1": 0.03338623046875, "loss_aux_layer_10": 0.06060791015625, "loss_aux_layer_11": 0.064697265625, "loss_aux_layer_12": 0.06951904296875, "loss_aux_layer_13": 0.0751953125, "loss_aux_layer_14": 0.083740234375, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.1021728515625, "loss_aux_layer_17": 0.10986328125, "loss_aux_layer_18": 0.1180419921875, "loss_aux_layer_19": 0.1212158203125, "loss_aux_layer_2": 0.04608154296875, "loss_aux_layer_20": 0.1290283203125, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.055908203125, "loss_aux_layer_4": 0.05828857421875, "loss_aux_layer_5": 0.05987548828125, "loss_aux_layer_6": 0.0626220703125, "loss_aux_layer_7": 0.0606689453125, "loss_aux_layer_8": 0.0601806640625, "loss_aux_layer_9": 0.0592041015625, "step": 3403, "total_loss": 0.7185306400060654 }, { "epoch": 0.6739259552563849, "grad_norm": 1.08543860912323, "learning_rate": 5e-05, "llm_loss": 0.537991389632225, "loss": 2.4889, "loss_aux_layer_0": 0.015106201171875, "loss_aux_layer_1": 0.033203125, "loss_aux_layer_10": 0.06024169921875, "loss_aux_layer_11": 0.06414794921875, "loss_aux_layer_12": 0.0682373046875, "loss_aux_layer_13": 0.07373046875, "loss_aux_layer_14": 0.0823974609375, "loss_aux_layer_15": 0.0911865234375, "loss_aux_layer_16": 0.1009521484375, "loss_aux_layer_17": 0.1090087890625, "loss_aux_layer_18": 0.11767578125, "loss_aux_layer_19": 0.1214599609375, "loss_aux_layer_2": 0.04632568359375, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.0562744140625, "loss_aux_layer_4": 0.05816650390625, "loss_aux_layer_5": 0.05963134765625, "loss_aux_layer_6": 0.06219482421875, "loss_aux_layer_7": 0.060302734375, "loss_aux_layer_8": 0.05999755859375, "loss_aux_layer_9": 0.05902099609375, "step": 3404, "total_loss": 0.6222333386540413 }, { "epoch": 0.6741239358542863, "grad_norm": 1.4442613124847412, "learning_rate": 5e-05, "llm_loss": 0.5774411633610725, "loss": 2.6587, "loss_aux_layer_0": 0.01513671875, "loss_aux_layer_1": 0.0352783203125, "loss_aux_layer_10": 0.0631103515625, "loss_aux_layer_11": 0.0673828125, "loss_aux_layer_12": 0.0718994140625, "loss_aux_layer_13": 0.0780029296875, "loss_aux_layer_14": 0.0867919921875, "loss_aux_layer_15": 0.0955810546875, "loss_aux_layer_16": 0.1051025390625, "loss_aux_layer_17": 0.1131591796875, "loss_aux_layer_18": 0.1217041015625, "loss_aux_layer_19": 0.124267578125, "loss_aux_layer_2": 0.049072265625, "loss_aux_layer_20": 0.1318359375, "loss_aux_layer_21": 0.1396484375, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.197265625, "loss_aux_layer_3": 0.058837890625, "loss_aux_layer_4": 0.0611572265625, "loss_aux_layer_5": 0.062744140625, "loss_aux_layer_6": 0.0660400390625, "loss_aux_layer_7": 0.0640869140625, "loss_aux_layer_8": 0.0633544921875, "loss_aux_layer_9": 0.06195068359375, "step": 3405, "total_loss": 0.664664164185524 }, { "epoch": 0.6743219164521876, "grad_norm": 0.8059356808662415, "learning_rate": 5e-05, "llm_loss": 0.5308903157711029, "loss": 2.4586, "loss_aux_layer_0": 0.015594482421875, "loss_aux_layer_1": 0.0335693359375, "loss_aux_layer_10": 0.06024169921875, "loss_aux_layer_11": 0.0643310546875, "loss_aux_layer_12": 0.068603515625, "loss_aux_layer_13": 0.0740966796875, "loss_aux_layer_14": 0.0828857421875, "loss_aux_layer_15": 0.09130859375, "loss_aux_layer_16": 0.100830078125, "loss_aux_layer_17": 0.10888671875, "loss_aux_layer_18": 0.116943359375, "loss_aux_layer_19": 0.1202392578125, "loss_aux_layer_2": 0.04644775390625, "loss_aux_layer_20": 0.1279296875, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.0560302734375, "loss_aux_layer_4": 0.05865478515625, "loss_aux_layer_5": 0.05987548828125, "loss_aux_layer_6": 0.0626220703125, "loss_aux_layer_7": 0.060546875, "loss_aux_layer_8": 0.06005859375, "loss_aux_layer_9": 0.0589599609375, "step": 3406, "total_loss": 0.6146465241909027 }, { "epoch": 0.6745198970500891, "grad_norm": 1.13507080078125, "learning_rate": 5e-05, "llm_loss": 0.5781662091612816, "loss": 2.6518, "loss_aux_layer_0": 0.0155181884765625, "loss_aux_layer_1": 0.03375244140625, "loss_aux_layer_10": 0.06097412109375, "loss_aux_layer_11": 0.06475830078125, "loss_aux_layer_12": 0.0692138671875, "loss_aux_layer_13": 0.0748291015625, "loss_aux_layer_14": 0.08349609375, "loss_aux_layer_15": 0.0919189453125, "loss_aux_layer_16": 0.1009521484375, "loss_aux_layer_17": 0.1092529296875, "loss_aux_layer_18": 0.11767578125, "loss_aux_layer_19": 0.1207275390625, "loss_aux_layer_2": 0.04754638671875, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.05718994140625, "loss_aux_layer_4": 0.0594482421875, "loss_aux_layer_5": 0.06085205078125, "loss_aux_layer_6": 0.06365966796875, "loss_aux_layer_7": 0.06146240234375, "loss_aux_layer_8": 0.06085205078125, "loss_aux_layer_9": 0.05975341796875, "step": 3407, "total_loss": 0.6629403233528137 }, { "epoch": 0.6747178776479905, "grad_norm": 1.0924781560897827, "learning_rate": 5e-05, "llm_loss": 0.5925908386707306, "loss": 2.694, "loss_aux_layer_0": 0.0155487060546875, "loss_aux_layer_1": 0.03204345703125, "loss_aux_layer_10": 0.05706787109375, "loss_aux_layer_11": 0.0609130859375, "loss_aux_layer_12": 0.06488037109375, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.0789794921875, "loss_aux_layer_15": 0.0877685546875, "loss_aux_layer_16": 0.0975341796875, "loss_aux_layer_17": 0.10546875, "loss_aux_layer_18": 0.114013671875, "loss_aux_layer_19": 0.117919921875, "loss_aux_layer_2": 0.044189453125, "loss_aux_layer_20": 0.1259765625, "loss_aux_layer_21": 0.1329345703125, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.053466796875, "loss_aux_layer_4": 0.05560302734375, "loss_aux_layer_5": 0.05694580078125, "loss_aux_layer_6": 0.05938720703125, "loss_aux_layer_7": 0.05755615234375, "loss_aux_layer_8": 0.05682373046875, "loss_aux_layer_9": 0.05572509765625, "step": 3408, "total_loss": 0.673488199710846 }, { "epoch": 0.674915858245892, "grad_norm": 0.9450078010559082, "learning_rate": 5e-05, "llm_loss": 0.6404777020215988, "loss": 2.9056, "loss_aux_layer_0": 0.01544189453125, "loss_aux_layer_1": 0.034912109375, "loss_aux_layer_10": 0.0621337890625, "loss_aux_layer_11": 0.0665283203125, "loss_aux_layer_12": 0.0712890625, "loss_aux_layer_13": 0.0767822265625, "loss_aux_layer_14": 0.0850830078125, "loss_aux_layer_15": 0.09326171875, "loss_aux_layer_16": 0.1029052734375, "loss_aux_layer_17": 0.1109619140625, "loss_aux_layer_18": 0.119140625, "loss_aux_layer_19": 0.1224365234375, "loss_aux_layer_2": 0.048095703125, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.05792236328125, "loss_aux_layer_4": 0.060546875, "loss_aux_layer_5": 0.06182861328125, "loss_aux_layer_6": 0.06451416015625, "loss_aux_layer_7": 0.06256103515625, "loss_aux_layer_8": 0.0621337890625, "loss_aux_layer_9": 0.06085205078125, "step": 3409, "total_loss": 0.7263890653848648 }, { "epoch": 0.6751138388437933, "grad_norm": 1.064176321029663, "learning_rate": 5e-05, "llm_loss": 0.6249293982982635, "loss": 2.846, "loss_aux_layer_0": 0.015472412109375, "loss_aux_layer_1": 0.03509521484375, "loss_aux_layer_10": 0.06329345703125, "loss_aux_layer_11": 0.067138671875, "loss_aux_layer_12": 0.0716552734375, "loss_aux_layer_13": 0.0777587890625, "loss_aux_layer_14": 0.0865478515625, "loss_aux_layer_15": 0.09521484375, "loss_aux_layer_16": 0.1051025390625, "loss_aux_layer_17": 0.1129150390625, "loss_aux_layer_18": 0.1212158203125, "loss_aux_layer_19": 0.1236572265625, "loss_aux_layer_2": 0.0482177734375, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.193115234375, "loss_aux_layer_3": 0.05804443359375, "loss_aux_layer_4": 0.06085205078125, "loss_aux_layer_5": 0.06231689453125, "loss_aux_layer_6": 0.065673828125, "loss_aux_layer_7": 0.0635986328125, "loss_aux_layer_8": 0.06292724609375, "loss_aux_layer_9": 0.0618896484375, "step": 3410, "total_loss": 0.7114997208118439 }, { "epoch": 0.6753118194416947, "grad_norm": 0.7251352071762085, "learning_rate": 5e-05, "llm_loss": 0.5220286324620247, "loss": 2.4281, "loss_aux_layer_0": 0.01507568359375, "loss_aux_layer_1": 0.03375244140625, "loss_aux_layer_10": 0.06121826171875, "loss_aux_layer_11": 0.0650634765625, "loss_aux_layer_12": 0.069580078125, "loss_aux_layer_13": 0.0750732421875, "loss_aux_layer_14": 0.0838623046875, "loss_aux_layer_15": 0.0928955078125, "loss_aux_layer_16": 0.1026611328125, "loss_aux_layer_17": 0.1107177734375, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.1224365234375, "loss_aux_layer_2": 0.0460205078125, "loss_aux_layer_20": 0.13037109375, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.055908203125, "loss_aux_layer_4": 0.0584716796875, "loss_aux_layer_5": 0.05999755859375, "loss_aux_layer_6": 0.06298828125, "loss_aux_layer_7": 0.0609130859375, "loss_aux_layer_8": 0.06048583984375, "loss_aux_layer_9": 0.05987548828125, "step": 3411, "total_loss": 0.6070185899734497 }, { "epoch": 0.6755098000395962, "grad_norm": 1.0174733400344849, "learning_rate": 5e-05, "llm_loss": 0.5427248775959015, "loss": 2.5121, "loss_aux_layer_0": 0.0159149169921875, "loss_aux_layer_1": 0.034423828125, "loss_aux_layer_10": 0.06170654296875, "loss_aux_layer_11": 0.065673828125, "loss_aux_layer_12": 0.0701904296875, "loss_aux_layer_13": 0.0758056640625, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.0931396484375, "loss_aux_layer_16": 0.1024169921875, "loss_aux_layer_17": 0.11083984375, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.1224365234375, "loss_aux_layer_2": 0.04730224609375, "loss_aux_layer_20": 0.1300048828125, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.057373046875, "loss_aux_layer_4": 0.05987548828125, "loss_aux_layer_5": 0.06109619140625, "loss_aux_layer_6": 0.06396484375, "loss_aux_layer_7": 0.06170654296875, "loss_aux_layer_8": 0.06134033203125, "loss_aux_layer_9": 0.0604248046875, "step": 3412, "total_loss": 0.6280219554901123 }, { "epoch": 0.6757077806374975, "grad_norm": 0.7871980667114258, "learning_rate": 5e-05, "llm_loss": 0.5311177298426628, "loss": 2.4719, "loss_aux_layer_0": 0.0153961181640625, "loss_aux_layer_1": 0.0355224609375, "loss_aux_layer_10": 0.0631103515625, "loss_aux_layer_11": 0.0672607421875, "loss_aux_layer_12": 0.07177734375, "loss_aux_layer_13": 0.0772705078125, "loss_aux_layer_14": 0.0860595703125, "loss_aux_layer_15": 0.0947265625, "loss_aux_layer_16": 0.104248046875, "loss_aux_layer_17": 0.1116943359375, "loss_aux_layer_18": 0.1197509765625, "loss_aux_layer_19": 0.12255859375, "loss_aux_layer_2": 0.04962158203125, "loss_aux_layer_20": 0.13037109375, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.05950927734375, "loss_aux_layer_4": 0.06182861328125, "loss_aux_layer_5": 0.0633544921875, "loss_aux_layer_6": 0.0662841796875, "loss_aux_layer_7": 0.0638427734375, "loss_aux_layer_8": 0.0633544921875, "loss_aux_layer_9": 0.06182861328125, "step": 3413, "total_loss": 0.6179723739624023 }, { "epoch": 0.6759057612353989, "grad_norm": 0.9053924679756165, "learning_rate": 5e-05, "llm_loss": 0.5744871944189072, "loss": 2.6369, "loss_aux_layer_0": 0.0145111083984375, "loss_aux_layer_1": 0.03338623046875, "loss_aux_layer_10": 0.0609130859375, "loss_aux_layer_11": 0.06475830078125, "loss_aux_layer_12": 0.0693359375, "loss_aux_layer_13": 0.074951171875, "loss_aux_layer_14": 0.083251953125, "loss_aux_layer_15": 0.0921630859375, "loss_aux_layer_16": 0.101806640625, "loss_aux_layer_17": 0.1094970703125, "loss_aux_layer_18": 0.1180419921875, "loss_aux_layer_19": 0.12158203125, "loss_aux_layer_2": 0.04583740234375, "loss_aux_layer_20": 0.130126953125, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.05584716796875, "loss_aux_layer_4": 0.0584716796875, "loss_aux_layer_5": 0.06024169921875, "loss_aux_layer_6": 0.06304931640625, "loss_aux_layer_7": 0.06109619140625, "loss_aux_layer_8": 0.06072998046875, "loss_aux_layer_9": 0.0595703125, "step": 3414, "total_loss": 0.6592209935188293 }, { "epoch": 0.6761037418333004, "grad_norm": 0.8572723865509033, "learning_rate": 5e-05, "llm_loss": 0.5492700412869453, "loss": 2.5468, "loss_aux_layer_0": 0.016571044921875, "loss_aux_layer_1": 0.03460693359375, "loss_aux_layer_10": 0.0635986328125, "loss_aux_layer_11": 0.067626953125, "loss_aux_layer_12": 0.0723876953125, "loss_aux_layer_13": 0.077880859375, "loss_aux_layer_14": 0.086669921875, "loss_aux_layer_15": 0.0953369140625, "loss_aux_layer_16": 0.10498046875, "loss_aux_layer_17": 0.113037109375, "loss_aux_layer_18": 0.12109375, "loss_aux_layer_19": 0.125, "loss_aux_layer_2": 0.0478515625, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.16259765625, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.05780029296875, "loss_aux_layer_4": 0.06048583984375, "loss_aux_layer_5": 0.06219482421875, "loss_aux_layer_6": 0.06549072265625, "loss_aux_layer_7": 0.06329345703125, "loss_aux_layer_8": 0.0633544921875, "loss_aux_layer_9": 0.06231689453125, "step": 3415, "total_loss": 0.6366996616125107 }, { "epoch": 0.6763017224312018, "grad_norm": 0.854924738407135, "learning_rate": 5e-05, "llm_loss": 0.5604203790426254, "loss": 2.5651, "loss_aux_layer_0": 0.0151214599609375, "loss_aux_layer_1": 0.031402587890625, "loss_aux_layer_10": 0.05621337890625, "loss_aux_layer_11": 0.05987548828125, "loss_aux_layer_12": 0.06414794921875, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.0968017578125, "loss_aux_layer_17": 0.1053466796875, "loss_aux_layer_18": 0.114501953125, "loss_aux_layer_19": 0.119384765625, "loss_aux_layer_2": 0.043212890625, "loss_aux_layer_20": 0.127685546875, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.0518798828125, "loss_aux_layer_4": 0.0540771484375, "loss_aux_layer_5": 0.0556640625, "loss_aux_layer_6": 0.05828857421875, "loss_aux_layer_7": 0.05621337890625, "loss_aux_layer_8": 0.05572509765625, "loss_aux_layer_9": 0.0548095703125, "step": 3416, "total_loss": 0.641271635890007 }, { "epoch": 0.6764997030291031, "grad_norm": 0.9324678182601929, "learning_rate": 5e-05, "llm_loss": 0.5710003972053528, "loss": 2.6059, "loss_aux_layer_0": 0.0145416259765625, "loss_aux_layer_1": 0.030853271484375, "loss_aux_layer_10": 0.05596923828125, "loss_aux_layer_11": 0.06005859375, "loss_aux_layer_12": 0.06463623046875, "loss_aux_layer_13": 0.06982421875, "loss_aux_layer_14": 0.0789794921875, "loss_aux_layer_15": 0.087646484375, "loss_aux_layer_16": 0.097412109375, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.1138916015625, "loss_aux_layer_19": 0.11767578125, "loss_aux_layer_2": 0.04229736328125, "loss_aux_layer_20": 0.1259765625, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.051025390625, "loss_aux_layer_4": 0.05340576171875, "loss_aux_layer_5": 0.05517578125, "loss_aux_layer_6": 0.057861328125, "loss_aux_layer_7": 0.05584716796875, "loss_aux_layer_8": 0.05560302734375, "loss_aux_layer_9": 0.05474853515625, "step": 3417, "total_loss": 0.6514792889356613 }, { "epoch": 0.6766976836270046, "grad_norm": 0.8871491551399231, "learning_rate": 5e-05, "llm_loss": 0.6725255846977234, "loss": 3.0197, "loss_aux_layer_0": 0.0152435302734375, "loss_aux_layer_1": 0.03369140625, "loss_aux_layer_10": 0.0595703125, "loss_aux_layer_11": 0.06353759765625, "loss_aux_layer_12": 0.0679931640625, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.08984375, "loss_aux_layer_16": 0.0992431640625, "loss_aux_layer_17": 0.106689453125, "loss_aux_layer_18": 0.1143798828125, "loss_aux_layer_19": 0.116943359375, "loss_aux_layer_2": 0.04638671875, "loss_aux_layer_20": 0.1243896484375, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.055908203125, "loss_aux_layer_4": 0.05828857421875, "loss_aux_layer_5": 0.0595703125, "loss_aux_layer_6": 0.06243896484375, "loss_aux_layer_7": 0.06036376953125, "loss_aux_layer_8": 0.059814453125, "loss_aux_layer_9": 0.0584716796875, "step": 3418, "total_loss": 0.7549261450767517 }, { "epoch": 0.676895664224906, "grad_norm": 0.7731744050979614, "learning_rate": 5e-05, "llm_loss": 0.5032337233424187, "loss": 2.3502, "loss_aux_layer_0": 0.0148162841796875, "loss_aux_layer_1": 0.032440185546875, "loss_aux_layer_10": 0.05987548828125, "loss_aux_layer_11": 0.06402587890625, "loss_aux_layer_12": 0.0687255859375, "loss_aux_layer_13": 0.07421875, "loss_aux_layer_14": 0.0826416015625, "loss_aux_layer_15": 0.09130859375, "loss_aux_layer_16": 0.101318359375, "loss_aux_layer_17": 0.109130859375, "loss_aux_layer_18": 0.1181640625, "loss_aux_layer_19": 0.1221923828125, "loss_aux_layer_2": 0.0445556640625, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.0540771484375, "loss_aux_layer_4": 0.05670166015625, "loss_aux_layer_5": 0.05853271484375, "loss_aux_layer_6": 0.06158447265625, "loss_aux_layer_7": 0.05975341796875, "loss_aux_layer_8": 0.05938720703125, "loss_aux_layer_9": 0.05859375, "step": 3419, "total_loss": 0.5875587239861488 }, { "epoch": 0.6770936448228073, "grad_norm": 1.0426386594772339, "learning_rate": 5e-05, "llm_loss": 0.539532333612442, "loss": 2.4843, "loss_aux_layer_0": 0.014923095703125, "loss_aux_layer_1": 0.03338623046875, "loss_aux_layer_10": 0.05853271484375, "loss_aux_layer_11": 0.06243896484375, "loss_aux_layer_12": 0.06695556640625, "loss_aux_layer_13": 0.072509765625, "loss_aux_layer_14": 0.0802001953125, "loss_aux_layer_15": 0.088623046875, "loss_aux_layer_16": 0.09716796875, "loss_aux_layer_17": 0.1048583984375, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.1163330078125, "loss_aux_layer_2": 0.04541015625, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.05462646484375, "loss_aux_layer_4": 0.05712890625, "loss_aux_layer_5": 0.05853271484375, "loss_aux_layer_6": 0.0611572265625, "loss_aux_layer_7": 0.05914306640625, "loss_aux_layer_8": 0.0584716796875, "loss_aux_layer_9": 0.0570068359375, "step": 3420, "total_loss": 0.6210836619138718 }, { "epoch": 0.6772916254207088, "grad_norm": 0.8409425616264343, "learning_rate": 5e-05, "llm_loss": 0.5596286058425903, "loss": 2.5868, "loss_aux_layer_0": 0.0144805908203125, "loss_aux_layer_1": 0.0347900390625, "loss_aux_layer_10": 0.062255859375, "loss_aux_layer_11": 0.06622314453125, "loss_aux_layer_12": 0.0709228515625, "loss_aux_layer_13": 0.0767822265625, "loss_aux_layer_14": 0.086181640625, "loss_aux_layer_15": 0.094970703125, "loss_aux_layer_16": 0.1046142578125, "loss_aux_layer_17": 0.11279296875, "loss_aux_layer_18": 0.120849609375, "loss_aux_layer_19": 0.1241455078125, "loss_aux_layer_2": 0.04803466796875, "loss_aux_layer_20": 0.1328125, "loss_aux_layer_21": 0.14208984375, "loss_aux_layer_22": 0.165283203125, "loss_aux_layer_23": 0.20263671875, "loss_aux_layer_3": 0.05780029296875, "loss_aux_layer_4": 0.060302734375, "loss_aux_layer_5": 0.06182861328125, "loss_aux_layer_6": 0.06475830078125, "loss_aux_layer_7": 0.0626220703125, "loss_aux_layer_8": 0.06207275390625, "loss_aux_layer_9": 0.06085205078125, "step": 3421, "total_loss": 0.6467094719409943 }, { "epoch": 0.6774896060186102, "grad_norm": 0.8848226070404053, "learning_rate": 5e-05, "llm_loss": 0.6725333631038666, "loss": 3.0137, "loss_aux_layer_0": 0.0146942138671875, "loss_aux_layer_1": 0.032684326171875, "loss_aux_layer_10": 0.05804443359375, "loss_aux_layer_11": 0.06207275390625, "loss_aux_layer_12": 0.0665283203125, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.080078125, "loss_aux_layer_15": 0.0882568359375, "loss_aux_layer_16": 0.0970458984375, "loss_aux_layer_17": 0.1048583984375, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.1160888671875, "loss_aux_layer_2": 0.04461669921875, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.18603515625, "loss_aux_layer_3": 0.05389404296875, "loss_aux_layer_4": 0.05609130859375, "loss_aux_layer_5": 0.05755615234375, "loss_aux_layer_6": 0.06005859375, "loss_aux_layer_7": 0.05816650390625, "loss_aux_layer_8": 0.05767822265625, "loss_aux_layer_9": 0.056396484375, "step": 3422, "total_loss": 0.7534222304821014 }, { "epoch": 0.6776875866165116, "grad_norm": 1.150119423866272, "learning_rate": 5e-05, "llm_loss": 0.5413907319307327, "loss": 2.498, "loss_aux_layer_0": 0.015380859375, "loss_aux_layer_1": 0.032958984375, "loss_aux_layer_10": 0.058837890625, "loss_aux_layer_11": 0.06298828125, "loss_aux_layer_12": 0.0675048828125, "loss_aux_layer_13": 0.0732421875, "loss_aux_layer_14": 0.081787109375, "loss_aux_layer_15": 0.0904541015625, "loss_aux_layer_16": 0.099853515625, "loss_aux_layer_17": 0.1075439453125, "loss_aux_layer_18": 0.1156005859375, "loss_aux_layer_19": 0.119140625, "loss_aux_layer_2": 0.0452880859375, "loss_aux_layer_20": 0.127197265625, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.05474853515625, "loss_aux_layer_4": 0.05718994140625, "loss_aux_layer_5": 0.05877685546875, "loss_aux_layer_6": 0.06134033203125, "loss_aux_layer_7": 0.0592041015625, "loss_aux_layer_8": 0.0587158203125, "loss_aux_layer_9": 0.0574951171875, "step": 3423, "total_loss": 0.6244976371526718 }, { "epoch": 0.677885567214413, "grad_norm": 0.8322768211364746, "learning_rate": 5e-05, "llm_loss": 0.5718094706535339, "loss": 2.6343, "loss_aux_layer_0": 0.014495849609375, "loss_aux_layer_1": 0.033935546875, "loss_aux_layer_10": 0.06231689453125, "loss_aux_layer_11": 0.066650390625, "loss_aux_layer_12": 0.071533203125, "loss_aux_layer_13": 0.0772705078125, "loss_aux_layer_14": 0.0863037109375, "loss_aux_layer_15": 0.0947265625, "loss_aux_layer_16": 0.1044921875, "loss_aux_layer_17": 0.1121826171875, "loss_aux_layer_18": 0.12060546875, "loss_aux_layer_19": 0.1240234375, "loss_aux_layer_2": 0.047119140625, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.201171875, "loss_aux_layer_3": 0.05731201171875, "loss_aux_layer_4": 0.0599365234375, "loss_aux_layer_5": 0.061767578125, "loss_aux_layer_6": 0.06494140625, "loss_aux_layer_7": 0.06298828125, "loss_aux_layer_8": 0.06243896484375, "loss_aux_layer_9": 0.0609130859375, "step": 3424, "total_loss": 0.6585642546415329 }, { "epoch": 0.6780835478123144, "grad_norm": 1.1696659326553345, "learning_rate": 5e-05, "llm_loss": 0.5902460739016533, "loss": 2.6881, "loss_aux_layer_0": 0.0157623291015625, "loss_aux_layer_1": 0.032928466796875, "loss_aux_layer_10": 0.05841064453125, "loss_aux_layer_11": 0.06195068359375, "loss_aux_layer_12": 0.0665283203125, "loss_aux_layer_13": 0.0718994140625, "loss_aux_layer_14": 0.0802001953125, "loss_aux_layer_15": 0.0885009765625, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.1058349609375, "loss_aux_layer_18": 0.1142578125, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.04547119140625, "loss_aux_layer_20": 0.1258544921875, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.054443359375, "loss_aux_layer_4": 0.056640625, "loss_aux_layer_5": 0.05828857421875, "loss_aux_layer_6": 0.0606689453125, "loss_aux_layer_7": 0.05889892578125, "loss_aux_layer_8": 0.0582275390625, "loss_aux_layer_9": 0.05712890625, "step": 3425, "total_loss": 0.6720288246870041 }, { "epoch": 0.6782815284102158, "grad_norm": 1.0444740056991577, "learning_rate": 5e-05, "llm_loss": 0.5574379116296768, "loss": 2.5513, "loss_aux_layer_0": 0.0149383544921875, "loss_aux_layer_1": 0.03155517578125, "loss_aux_layer_10": 0.05694580078125, "loss_aux_layer_11": 0.06060791015625, "loss_aux_layer_12": 0.0650634765625, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.0792236328125, "loss_aux_layer_15": 0.08740234375, "loss_aux_layer_16": 0.0966796875, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.11279296875, "loss_aux_layer_19": 0.116455078125, "loss_aux_layer_2": 0.04315185546875, "loss_aux_layer_20": 0.124267578125, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.05224609375, "loss_aux_layer_4": 0.0548095703125, "loss_aux_layer_5": 0.05645751953125, "loss_aux_layer_6": 0.059326171875, "loss_aux_layer_7": 0.0577392578125, "loss_aux_layer_8": 0.05712890625, "loss_aux_layer_9": 0.05584716796875, "step": 3426, "total_loss": 0.6378239989280701 }, { "epoch": 0.6784795090081172, "grad_norm": 0.9548467397689819, "learning_rate": 5e-05, "llm_loss": 0.6437565237283707, "loss": 2.9139, "loss_aux_layer_0": 0.0160369873046875, "loss_aux_layer_1": 0.03265380859375, "loss_aux_layer_10": 0.060546875, "loss_aux_layer_11": 0.06463623046875, "loss_aux_layer_12": 0.0693359375, "loss_aux_layer_13": 0.0751953125, "loss_aux_layer_14": 0.0841064453125, "loss_aux_layer_15": 0.0926513671875, "loss_aux_layer_16": 0.102294921875, "loss_aux_layer_17": 0.1104736328125, "loss_aux_layer_18": 0.1185302734375, "loss_aux_layer_19": 0.1219482421875, "loss_aux_layer_2": 0.04547119140625, "loss_aux_layer_20": 0.130126953125, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.0550537109375, "loss_aux_layer_4": 0.05767822265625, "loss_aux_layer_5": 0.05926513671875, "loss_aux_layer_6": 0.062255859375, "loss_aux_layer_7": 0.06036376953125, "loss_aux_layer_8": 0.0601806640625, "loss_aux_layer_9": 0.05914306640625, "step": 3427, "total_loss": 0.7284783273935318 }, { "epoch": 0.6786774896060186, "grad_norm": 1.2225106954574585, "learning_rate": 5e-05, "llm_loss": 0.6488330215215683, "loss": 2.9255, "loss_aux_layer_0": 0.01483154296875, "loss_aux_layer_1": 0.0335693359375, "loss_aux_layer_10": 0.05963134765625, "loss_aux_layer_11": 0.06365966796875, "loss_aux_layer_12": 0.06787109375, "loss_aux_layer_13": 0.0732421875, "loss_aux_layer_14": 0.0816650390625, "loss_aux_layer_15": 0.08984375, "loss_aux_layer_16": 0.0989990234375, "loss_aux_layer_17": 0.106689453125, "loss_aux_layer_18": 0.1142578125, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.04595947265625, "loss_aux_layer_20": 0.1258544921875, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.05548095703125, "loss_aux_layer_4": 0.05780029296875, "loss_aux_layer_5": 0.0592041015625, "loss_aux_layer_6": 0.06195068359375, "loss_aux_layer_7": 0.05963134765625, "loss_aux_layer_8": 0.05914306640625, "loss_aux_layer_9": 0.0579833984375, "step": 3428, "total_loss": 0.7313726395368576 }, { "epoch": 0.67887547020392, "grad_norm": 1.012675166130066, "learning_rate": 5e-05, "llm_loss": 0.5558791756629944, "loss": 2.5591, "loss_aux_layer_0": 0.0150604248046875, "loss_aux_layer_1": 0.0340576171875, "loss_aux_layer_10": 0.06085205078125, "loss_aux_layer_11": 0.06463623046875, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.073974609375, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.0904541015625, "loss_aux_layer_16": 0.099609375, "loss_aux_layer_17": 0.107666015625, "loss_aux_layer_18": 0.1160888671875, "loss_aux_layer_19": 0.1195068359375, "loss_aux_layer_2": 0.046875, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05657958984375, "loss_aux_layer_4": 0.05950927734375, "loss_aux_layer_5": 0.0614013671875, "loss_aux_layer_6": 0.064208984375, "loss_aux_layer_7": 0.06195068359375, "loss_aux_layer_8": 0.06146240234375, "loss_aux_layer_9": 0.0599365234375, "step": 3429, "total_loss": 0.6397667527198792 }, { "epoch": 0.6790734508018215, "grad_norm": 0.767927348613739, "learning_rate": 5e-05, "llm_loss": 0.5392444506287575, "loss": 2.4953, "loss_aux_layer_0": 0.01507568359375, "loss_aux_layer_1": 0.03387451171875, "loss_aux_layer_10": 0.06097412109375, "loss_aux_layer_11": 0.06494140625, "loss_aux_layer_12": 0.069580078125, "loss_aux_layer_13": 0.074951171875, "loss_aux_layer_14": 0.08349609375, "loss_aux_layer_15": 0.0919189453125, "loss_aux_layer_16": 0.101318359375, "loss_aux_layer_17": 0.109375, "loss_aux_layer_18": 0.1173095703125, "loss_aux_layer_19": 0.12109375, "loss_aux_layer_2": 0.04669189453125, "loss_aux_layer_20": 0.129638671875, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.1943359375, "loss_aux_layer_3": 0.0560302734375, "loss_aux_layer_4": 0.05859375, "loss_aux_layer_5": 0.0604248046875, "loss_aux_layer_6": 0.0633544921875, "loss_aux_layer_7": 0.0615234375, "loss_aux_layer_8": 0.06121826171875, "loss_aux_layer_9": 0.05987548828125, "step": 3430, "total_loss": 0.6238231956958771 }, { "epoch": 0.6792714313997228, "grad_norm": 0.9989328980445862, "learning_rate": 5e-05, "llm_loss": 0.6689940840005875, "loss": 3.014, "loss_aux_layer_0": 0.0149383544921875, "loss_aux_layer_1": 0.034820556640625, "loss_aux_layer_10": 0.06195068359375, "loss_aux_layer_11": 0.06610107421875, "loss_aux_layer_12": 0.070556640625, "loss_aux_layer_13": 0.075927734375, "loss_aux_layer_14": 0.0841064453125, "loss_aux_layer_15": 0.0924072265625, "loss_aux_layer_16": 0.1007080078125, "loss_aux_layer_17": 0.1085205078125, "loss_aux_layer_18": 0.1158447265625, "loss_aux_layer_19": 0.1185302734375, "loss_aux_layer_2": 0.04815673828125, "loss_aux_layer_20": 0.1260986328125, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.05816650390625, "loss_aux_layer_4": 0.06072998046875, "loss_aux_layer_5": 0.06195068359375, "loss_aux_layer_6": 0.064697265625, "loss_aux_layer_7": 0.0628662109375, "loss_aux_layer_8": 0.0621337890625, "loss_aux_layer_9": 0.060791015625, "step": 3431, "total_loss": 0.7535039782524109 }, { "epoch": 0.6794694119976242, "grad_norm": 1.0311737060546875, "learning_rate": 5e-05, "llm_loss": 0.5250089690089226, "loss": 2.4503, "loss_aux_layer_0": 0.015167236328125, "loss_aux_layer_1": 0.0355224609375, "loss_aux_layer_10": 0.06414794921875, "loss_aux_layer_11": 0.06842041015625, "loss_aux_layer_12": 0.072998046875, "loss_aux_layer_13": 0.0787353515625, "loss_aux_layer_14": 0.0875244140625, "loss_aux_layer_15": 0.09619140625, "loss_aux_layer_16": 0.1058349609375, "loss_aux_layer_17": 0.113525390625, "loss_aux_layer_18": 0.1212158203125, "loss_aux_layer_19": 0.12353515625, "loss_aux_layer_2": 0.04931640625, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.05950927734375, "loss_aux_layer_4": 0.06201171875, "loss_aux_layer_5": 0.0635986328125, "loss_aux_layer_6": 0.0670166015625, "loss_aux_layer_7": 0.0648193359375, "loss_aux_layer_8": 0.06427001953125, "loss_aux_layer_9": 0.062744140625, "step": 3432, "total_loss": 0.6125791221857071 }, { "epoch": 0.6796673925955257, "grad_norm": 1.444069266319275, "learning_rate": 5e-05, "llm_loss": 0.6149930655956268, "loss": 2.7881, "loss_aux_layer_0": 0.0145111083984375, "loss_aux_layer_1": 0.03411865234375, "loss_aux_layer_10": 0.05908203125, "loss_aux_layer_11": 0.06304931640625, "loss_aux_layer_12": 0.0675048828125, "loss_aux_layer_13": 0.072509765625, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.0887451171875, "loss_aux_layer_16": 0.09765625, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.11328125, "loss_aux_layer_19": 0.1162109375, "loss_aux_layer_2": 0.0472412109375, "loss_aux_layer_20": 0.1239013671875, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.05670166015625, "loss_aux_layer_4": 0.05853271484375, "loss_aux_layer_5": 0.0596923828125, "loss_aux_layer_6": 0.0623779296875, "loss_aux_layer_7": 0.06005859375, "loss_aux_layer_8": 0.05926513671875, "loss_aux_layer_9": 0.0579833984375, "step": 3433, "total_loss": 0.6970316916704178 }, { "epoch": 0.679865373193427, "grad_norm": 0.9947223663330078, "learning_rate": 5e-05, "llm_loss": 0.5456160455942154, "loss": 2.5301, "loss_aux_layer_0": 0.01617431640625, "loss_aux_layer_1": 0.03460693359375, "loss_aux_layer_10": 0.06341552734375, "loss_aux_layer_11": 0.0677490234375, "loss_aux_layer_12": 0.0726318359375, "loss_aux_layer_13": 0.0782470703125, "loss_aux_layer_14": 0.086669921875, "loss_aux_layer_15": 0.0948486328125, "loss_aux_layer_16": 0.1041259765625, "loss_aux_layer_17": 0.11181640625, "loss_aux_layer_18": 0.1202392578125, "loss_aux_layer_19": 0.1231689453125, "loss_aux_layer_2": 0.04779052734375, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.05810546875, "loss_aux_layer_4": 0.060791015625, "loss_aux_layer_5": 0.06219482421875, "loss_aux_layer_6": 0.065185546875, "loss_aux_layer_7": 0.06329345703125, "loss_aux_layer_8": 0.06298828125, "loss_aux_layer_9": 0.06195068359375, "step": 3434, "total_loss": 0.6325224786996841 }, { "epoch": 0.6800633537913284, "grad_norm": 1.1978960037231445, "learning_rate": 5e-05, "llm_loss": 0.5601125657558441, "loss": 2.5754, "loss_aux_layer_0": 0.01611328125, "loss_aux_layer_1": 0.03387451171875, "loss_aux_layer_10": 0.05938720703125, "loss_aux_layer_11": 0.06353759765625, "loss_aux_layer_12": 0.06793212890625, "loss_aux_layer_13": 0.0736083984375, "loss_aux_layer_14": 0.08203125, "loss_aux_layer_15": 0.0908203125, "loss_aux_layer_16": 0.1002197265625, "loss_aux_layer_17": 0.1080322265625, "loss_aux_layer_18": 0.1163330078125, "loss_aux_layer_19": 0.1197509765625, "loss_aux_layer_2": 0.04681396484375, "loss_aux_layer_20": 0.128173828125, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.055908203125, "loss_aux_layer_4": 0.0579833984375, "loss_aux_layer_5": 0.05938720703125, "loss_aux_layer_6": 0.06201171875, "loss_aux_layer_7": 0.05987548828125, "loss_aux_layer_8": 0.0595703125, "loss_aux_layer_9": 0.05828857421875, "step": 3435, "total_loss": 0.643839418888092 }, { "epoch": 0.6802613343892299, "grad_norm": 1.0218136310577393, "learning_rate": 5e-05, "llm_loss": 0.5030557960271835, "loss": 2.3572, "loss_aux_layer_0": 0.0158538818359375, "loss_aux_layer_1": 0.0347900390625, "loss_aux_layer_10": 0.062255859375, "loss_aux_layer_11": 0.066650390625, "loss_aux_layer_12": 0.0709228515625, "loss_aux_layer_13": 0.0765380859375, "loss_aux_layer_14": 0.0853271484375, "loss_aux_layer_15": 0.093994140625, "loss_aux_layer_16": 0.1033935546875, "loss_aux_layer_17": 0.1112060546875, "loss_aux_layer_18": 0.119384765625, "loss_aux_layer_19": 0.1229248046875, "loss_aux_layer_2": 0.048583984375, "loss_aux_layer_20": 0.1302490234375, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.05853271484375, "loss_aux_layer_4": 0.060791015625, "loss_aux_layer_5": 0.06231689453125, "loss_aux_layer_6": 0.06488037109375, "loss_aux_layer_7": 0.06292724609375, "loss_aux_layer_8": 0.06256103515625, "loss_aux_layer_9": 0.0611572265625, "step": 3436, "total_loss": 0.5892939269542694 }, { "epoch": 0.6804593149871313, "grad_norm": 1.1249499320983887, "learning_rate": 5e-05, "llm_loss": 0.6148431152105331, "loss": 2.7942, "loss_aux_layer_0": 0.016998291015625, "loss_aux_layer_1": 0.0350341796875, "loss_aux_layer_10": 0.0604248046875, "loss_aux_layer_11": 0.06451416015625, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.0738525390625, "loss_aux_layer_14": 0.0826416015625, "loss_aux_layer_15": 0.091064453125, "loss_aux_layer_16": 0.1002197265625, "loss_aux_layer_17": 0.1077880859375, "loss_aux_layer_18": 0.115966796875, "loss_aux_layer_19": 0.118896484375, "loss_aux_layer_2": 0.0477294921875, "loss_aux_layer_20": 0.1268310546875, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.05743408203125, "loss_aux_layer_4": 0.05963134765625, "loss_aux_layer_5": 0.06097412109375, "loss_aux_layer_6": 0.0633544921875, "loss_aux_layer_7": 0.06134033203125, "loss_aux_layer_8": 0.06048583984375, "loss_aux_layer_9": 0.0592041015625, "step": 3437, "total_loss": 0.6985616683959961 }, { "epoch": 0.6806572955850326, "grad_norm": 0.9609230160713196, "learning_rate": 5e-05, "llm_loss": 0.6179137378931046, "loss": 2.8112, "loss_aux_layer_0": 0.015045166015625, "loss_aux_layer_1": 0.032928466796875, "loss_aux_layer_10": 0.0609130859375, "loss_aux_layer_11": 0.0650634765625, "loss_aux_layer_12": 0.069580078125, "loss_aux_layer_13": 0.0751953125, "loss_aux_layer_14": 0.0838623046875, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.1021728515625, "loss_aux_layer_17": 0.1104736328125, "loss_aux_layer_18": 0.119140625, "loss_aux_layer_19": 0.12255859375, "loss_aux_layer_2": 0.04522705078125, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.05535888671875, "loss_aux_layer_4": 0.05810546875, "loss_aux_layer_5": 0.05987548828125, "loss_aux_layer_6": 0.06292724609375, "loss_aux_layer_7": 0.06109619140625, "loss_aux_layer_8": 0.0609130859375, "loss_aux_layer_9": 0.05975341796875, "step": 3438, "total_loss": 0.7028020471334457 }, { "epoch": 0.6808552761829341, "grad_norm": 1.2052838802337646, "learning_rate": 5e-05, "llm_loss": 0.5675989240407944, "loss": 2.6085, "loss_aux_layer_0": 0.01678466796875, "loss_aux_layer_1": 0.0340576171875, "loss_aux_layer_10": 0.06121826171875, "loss_aux_layer_11": 0.0653076171875, "loss_aux_layer_12": 0.0692138671875, "loss_aux_layer_13": 0.07421875, "loss_aux_layer_14": 0.082763671875, "loss_aux_layer_15": 0.0909423828125, "loss_aux_layer_16": 0.100341796875, "loss_aux_layer_17": 0.108154296875, "loss_aux_layer_18": 0.1163330078125, "loss_aux_layer_19": 0.1202392578125, "loss_aux_layer_2": 0.047119140625, "loss_aux_layer_20": 0.12841796875, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.194580078125, "loss_aux_layer_3": 0.05694580078125, "loss_aux_layer_4": 0.05963134765625, "loss_aux_layer_5": 0.06134033203125, "loss_aux_layer_6": 0.06427001953125, "loss_aux_layer_7": 0.0618896484375, "loss_aux_layer_8": 0.06134033203125, "loss_aux_layer_9": 0.05999755859375, "step": 3439, "total_loss": 0.652120552957058 }, { "epoch": 0.6810532567808355, "grad_norm": 0.8469802737236023, "learning_rate": 5e-05, "llm_loss": 0.4887330234050751, "loss": 2.2953, "loss_aux_layer_0": 0.0147705078125, "loss_aux_layer_1": 0.0341796875, "loss_aux_layer_10": 0.0606689453125, "loss_aux_layer_11": 0.064697265625, "loss_aux_layer_12": 0.0692138671875, "loss_aux_layer_13": 0.074951171875, "loss_aux_layer_14": 0.08349609375, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.10205078125, "loss_aux_layer_17": 0.10986328125, "loss_aux_layer_18": 0.1185302734375, "loss_aux_layer_19": 0.1220703125, "loss_aux_layer_2": 0.0467529296875, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.1982421875, "loss_aux_layer_3": 0.0565185546875, "loss_aux_layer_4": 0.05859375, "loss_aux_layer_5": 0.0601806640625, "loss_aux_layer_6": 0.06341552734375, "loss_aux_layer_7": 0.06134033203125, "loss_aux_layer_8": 0.06072998046875, "loss_aux_layer_9": 0.0595703125, "step": 3440, "total_loss": 0.5738291591405869 }, { "epoch": 0.6812512373787368, "grad_norm": 0.9997222423553467, "learning_rate": 5e-05, "llm_loss": 0.6106540560722351, "loss": 2.7674, "loss_aux_layer_0": 0.015106201171875, "loss_aux_layer_1": 0.031890869140625, "loss_aux_layer_10": 0.0572509765625, "loss_aux_layer_11": 0.06103515625, "loss_aux_layer_12": 0.065185546875, "loss_aux_layer_13": 0.0703125, "loss_aux_layer_14": 0.079345703125, "loss_aux_layer_15": 0.0880126953125, "loss_aux_layer_16": 0.0975341796875, "loss_aux_layer_17": 0.10595703125, "loss_aux_layer_18": 0.1142578125, "loss_aux_layer_19": 0.1182861328125, "loss_aux_layer_2": 0.0445556640625, "loss_aux_layer_20": 0.126708984375, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.05340576171875, "loss_aux_layer_4": 0.05548095703125, "loss_aux_layer_5": 0.0567626953125, "loss_aux_layer_6": 0.0594482421875, "loss_aux_layer_7": 0.05743408203125, "loss_aux_layer_8": 0.05718994140625, "loss_aux_layer_9": 0.05609130859375, "step": 3441, "total_loss": 0.6918428838253021 }, { "epoch": 0.6814492179766383, "grad_norm": 0.7494639158248901, "learning_rate": 5e-05, "llm_loss": 0.5644537955522537, "loss": 2.5952, "loss_aux_layer_0": 0.01446533203125, "loss_aux_layer_1": 0.03387451171875, "loss_aux_layer_10": 0.06097412109375, "loss_aux_layer_11": 0.0650634765625, "loss_aux_layer_12": 0.069580078125, "loss_aux_layer_13": 0.074951171875, "loss_aux_layer_14": 0.08349609375, "loss_aux_layer_15": 0.0921630859375, "loss_aux_layer_16": 0.101318359375, "loss_aux_layer_17": 0.1090087890625, "loss_aux_layer_18": 0.1170654296875, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.0467529296875, "loss_aux_layer_20": 0.127685546875, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.05670166015625, "loss_aux_layer_4": 0.05908203125, "loss_aux_layer_5": 0.06060791015625, "loss_aux_layer_6": 0.06353759765625, "loss_aux_layer_7": 0.06146240234375, "loss_aux_layer_8": 0.0609130859375, "loss_aux_layer_9": 0.0596923828125, "step": 3442, "total_loss": 0.6488080620765686 }, { "epoch": 0.6816471985745397, "grad_norm": 0.9666947722434998, "learning_rate": 5e-05, "llm_loss": 0.5378178507089615, "loss": 2.5025, "loss_aux_layer_0": 0.01513671875, "loss_aux_layer_1": 0.0360107421875, "loss_aux_layer_10": 0.06402587890625, "loss_aux_layer_11": 0.06805419921875, "loss_aux_layer_12": 0.072509765625, "loss_aux_layer_13": 0.07861328125, "loss_aux_layer_14": 0.087158203125, "loss_aux_layer_15": 0.0958251953125, "loss_aux_layer_16": 0.10546875, "loss_aux_layer_17": 0.113037109375, "loss_aux_layer_18": 0.1212158203125, "loss_aux_layer_19": 0.1240234375, "loss_aux_layer_2": 0.049560546875, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.14013671875, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.059326171875, "loss_aux_layer_4": 0.0618896484375, "loss_aux_layer_5": 0.0635986328125, "loss_aux_layer_6": 0.0665283203125, "loss_aux_layer_7": 0.064697265625, "loss_aux_layer_8": 0.06414794921875, "loss_aux_layer_9": 0.06292724609375, "step": 3443, "total_loss": 0.6256294697523117 }, { "epoch": 0.6818451791724411, "grad_norm": 0.8366850018501282, "learning_rate": 5e-05, "llm_loss": 0.5640088319778442, "loss": 2.5975, "loss_aux_layer_0": 0.014923095703125, "loss_aux_layer_1": 0.03375244140625, "loss_aux_layer_10": 0.0611572265625, "loss_aux_layer_11": 0.0650634765625, "loss_aux_layer_12": 0.069580078125, "loss_aux_layer_13": 0.0753173828125, "loss_aux_layer_14": 0.0841064453125, "loss_aux_layer_15": 0.0933837890625, "loss_aux_layer_16": 0.102783203125, "loss_aux_layer_17": 0.1107177734375, "loss_aux_layer_18": 0.119140625, "loss_aux_layer_19": 0.123046875, "loss_aux_layer_2": 0.0460205078125, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.05560302734375, "loss_aux_layer_4": 0.05810546875, "loss_aux_layer_5": 0.06011962890625, "loss_aux_layer_6": 0.0631103515625, "loss_aux_layer_7": 0.06097412109375, "loss_aux_layer_8": 0.060546875, "loss_aux_layer_9": 0.05963134765625, "step": 3444, "total_loss": 0.6493712663650513 }, { "epoch": 0.6820431597703425, "grad_norm": 1.0135120153427124, "learning_rate": 5e-05, "llm_loss": 0.4807135686278343, "loss": 2.2689, "loss_aux_layer_0": 0.014404296875, "loss_aux_layer_1": 0.0352783203125, "loss_aux_layer_10": 0.06256103515625, "loss_aux_layer_11": 0.06683349609375, "loss_aux_layer_12": 0.071533203125, "loss_aux_layer_13": 0.0770263671875, "loss_aux_layer_14": 0.0855712890625, "loss_aux_layer_15": 0.0941162109375, "loss_aux_layer_16": 0.103515625, "loss_aux_layer_17": 0.1119384765625, "loss_aux_layer_18": 0.12060546875, "loss_aux_layer_19": 0.1240234375, "loss_aux_layer_2": 0.04766845703125, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.14013671875, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.19921875, "loss_aux_layer_3": 0.05743408203125, "loss_aux_layer_4": 0.059814453125, "loss_aux_layer_5": 0.061279296875, "loss_aux_layer_6": 0.06427001953125, "loss_aux_layer_7": 0.06231689453125, "loss_aux_layer_8": 0.06201171875, "loss_aux_layer_9": 0.06109619140625, "step": 3445, "total_loss": 0.5672147870063782 }, { "epoch": 0.6822411403682439, "grad_norm": 0.9631767272949219, "learning_rate": 5e-05, "llm_loss": 0.5519860982894897, "loss": 2.5504, "loss_aux_layer_0": 0.0152130126953125, "loss_aux_layer_1": 0.034912109375, "loss_aux_layer_10": 0.06182861328125, "loss_aux_layer_11": 0.06585693359375, "loss_aux_layer_12": 0.0704345703125, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.083984375, "loss_aux_layer_15": 0.0924072265625, "loss_aux_layer_16": 0.1016845703125, "loss_aux_layer_17": 0.1094970703125, "loss_aux_layer_18": 0.1182861328125, "loss_aux_layer_19": 0.1219482421875, "loss_aux_layer_2": 0.0482177734375, "loss_aux_layer_20": 0.130126953125, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.05816650390625, "loss_aux_layer_4": 0.06048583984375, "loss_aux_layer_5": 0.0621337890625, "loss_aux_layer_6": 0.06475830078125, "loss_aux_layer_7": 0.06268310546875, "loss_aux_layer_8": 0.0621337890625, "loss_aux_layer_9": 0.0606689453125, "step": 3446, "total_loss": 0.6376070827245712 }, { "epoch": 0.6824391209661453, "grad_norm": 0.9688128232955933, "learning_rate": 5e-05, "llm_loss": 0.5856874585151672, "loss": 2.6682, "loss_aux_layer_0": 0.0147247314453125, "loss_aux_layer_1": 0.03167724609375, "loss_aux_layer_10": 0.0572509765625, "loss_aux_layer_11": 0.0611572265625, "loss_aux_layer_12": 0.06591796875, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.0797119140625, "loss_aux_layer_15": 0.0882568359375, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.1058349609375, "loss_aux_layer_18": 0.11376953125, "loss_aux_layer_19": 0.117919921875, "loss_aux_layer_2": 0.0439453125, "loss_aux_layer_20": 0.1259765625, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05316162109375, "loss_aux_layer_4": 0.05535888671875, "loss_aux_layer_5": 0.05706787109375, "loss_aux_layer_6": 0.059814453125, "loss_aux_layer_7": 0.05780029296875, "loss_aux_layer_8": 0.05718994140625, "loss_aux_layer_9": 0.055908203125, "step": 3447, "total_loss": 0.6670552790164948 }, { "epoch": 0.6826371015640468, "grad_norm": 0.9657444357872009, "learning_rate": 5e-05, "llm_loss": 0.54585862159729, "loss": 2.524, "loss_aux_layer_0": 0.014678955078125, "loss_aux_layer_1": 0.035400390625, "loss_aux_layer_10": 0.06292724609375, "loss_aux_layer_11": 0.0670166015625, "loss_aux_layer_12": 0.0712890625, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.0849609375, "loss_aux_layer_15": 0.0926513671875, "loss_aux_layer_16": 0.1016845703125, "loss_aux_layer_17": 0.1085205078125, "loss_aux_layer_18": 0.1165771484375, "loss_aux_layer_19": 0.119384765625, "loss_aux_layer_2": 0.04888916015625, "loss_aux_layer_20": 0.126708984375, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.05902099609375, "loss_aux_layer_4": 0.06121826171875, "loss_aux_layer_5": 0.062744140625, "loss_aux_layer_6": 0.06549072265625, "loss_aux_layer_7": 0.0635986328125, "loss_aux_layer_8": 0.06298828125, "loss_aux_layer_9": 0.0615234375, "step": 3448, "total_loss": 0.63100815564394 }, { "epoch": 0.6828350821619481, "grad_norm": 0.8671439290046692, "learning_rate": 5e-05, "llm_loss": 0.5407462120056152, "loss": 2.494, "loss_aux_layer_0": 0.0146331787109375, "loss_aux_layer_1": 0.03271484375, "loss_aux_layer_10": 0.0589599609375, "loss_aux_layer_11": 0.06292724609375, "loss_aux_layer_12": 0.0670166015625, "loss_aux_layer_13": 0.0726318359375, "loss_aux_layer_14": 0.0809326171875, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.0997314453125, "loss_aux_layer_17": 0.1077880859375, "loss_aux_layer_18": 0.11669921875, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.04498291015625, "loss_aux_layer_20": 0.1275634765625, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.0545654296875, "loss_aux_layer_4": 0.056640625, "loss_aux_layer_5": 0.05810546875, "loss_aux_layer_6": 0.06085205078125, "loss_aux_layer_7": 0.05889892578125, "loss_aux_layer_8": 0.05865478515625, "loss_aux_layer_9": 0.0576171875, "step": 3449, "total_loss": 0.6235036104917526 }, { "epoch": 0.6830330627598495, "grad_norm": 0.9061557650566101, "learning_rate": 5e-05, "llm_loss": 0.5511184185743332, "loss": 2.5444, "loss_aux_layer_0": 0.0140838623046875, "loss_aux_layer_1": 0.0330810546875, "loss_aux_layer_10": 0.0606689453125, "loss_aux_layer_11": 0.06500244140625, "loss_aux_layer_12": 0.069580078125, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.0845947265625, "loss_aux_layer_15": 0.0933837890625, "loss_aux_layer_16": 0.102783203125, "loss_aux_layer_17": 0.111083984375, "loss_aux_layer_18": 0.1192626953125, "loss_aux_layer_19": 0.12255859375, "loss_aux_layer_2": 0.04583740234375, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.05560302734375, "loss_aux_layer_4": 0.05804443359375, "loss_aux_layer_5": 0.0596923828125, "loss_aux_layer_6": 0.0631103515625, "loss_aux_layer_7": 0.06103515625, "loss_aux_layer_8": 0.06060791015625, "loss_aux_layer_9": 0.0594482421875, "step": 3450, "total_loss": 0.6360963582992554 }, { "epoch": 0.683231043357751, "grad_norm": 0.868948221206665, "learning_rate": 5e-05, "llm_loss": 0.492237389087677, "loss": 2.3077, "loss_aux_layer_0": 0.0151519775390625, "loss_aux_layer_1": 0.033203125, "loss_aux_layer_10": 0.05987548828125, "loss_aux_layer_11": 0.06414794921875, "loss_aux_layer_12": 0.068603515625, "loss_aux_layer_13": 0.07421875, "loss_aux_layer_14": 0.0828857421875, "loss_aux_layer_15": 0.0919189453125, "loss_aux_layer_16": 0.1019287109375, "loss_aux_layer_17": 0.1099853515625, "loss_aux_layer_18": 0.118408203125, "loss_aux_layer_19": 0.1221923828125, "loss_aux_layer_2": 0.04559326171875, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.162109375, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.0550537109375, "loss_aux_layer_4": 0.05731201171875, "loss_aux_layer_5": 0.05877685546875, "loss_aux_layer_6": 0.0616455078125, "loss_aux_layer_7": 0.05975341796875, "loss_aux_layer_8": 0.05938720703125, "loss_aux_layer_9": 0.0584716796875, "step": 3451, "total_loss": 0.5769132077693939 }, { "epoch": 0.6834290239556523, "grad_norm": 0.8266246914863586, "learning_rate": 5e-05, "llm_loss": 0.5879824310541153, "loss": 2.6896, "loss_aux_layer_0": 0.0140380859375, "loss_aux_layer_1": 0.03533935546875, "loss_aux_layer_10": 0.0625, "loss_aux_layer_11": 0.066650390625, "loss_aux_layer_12": 0.07080078125, "loss_aux_layer_13": 0.075927734375, "loss_aux_layer_14": 0.0838623046875, "loss_aux_layer_15": 0.0914306640625, "loss_aux_layer_16": 0.0999755859375, "loss_aux_layer_17": 0.10791015625, "loss_aux_layer_18": 0.115478515625, "loss_aux_layer_19": 0.1182861328125, "loss_aux_layer_2": 0.04779052734375, "loss_aux_layer_20": 0.125732421875, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.05804443359375, "loss_aux_layer_4": 0.06048583984375, "loss_aux_layer_5": 0.0621337890625, "loss_aux_layer_6": 0.065185546875, "loss_aux_layer_7": 0.06317138671875, "loss_aux_layer_8": 0.0625, "loss_aux_layer_9": 0.06103515625, "step": 3452, "total_loss": 0.6724088490009308 }, { "epoch": 0.6836270045535537, "grad_norm": 0.9370095133781433, "learning_rate": 5e-05, "llm_loss": 0.5091704428195953, "loss": 2.3733, "loss_aux_layer_0": 0.01507568359375, "loss_aux_layer_1": 0.032684326171875, "loss_aux_layer_10": 0.06005859375, "loss_aux_layer_11": 0.064208984375, "loss_aux_layer_12": 0.06890869140625, "loss_aux_layer_13": 0.07470703125, "loss_aux_layer_14": 0.0831298828125, "loss_aux_layer_15": 0.0914306640625, "loss_aux_layer_16": 0.1011962890625, "loss_aux_layer_17": 0.1092529296875, "loss_aux_layer_18": 0.1180419921875, "loss_aux_layer_19": 0.121826171875, "loss_aux_layer_2": 0.045166015625, "loss_aux_layer_20": 0.129150390625, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.0548095703125, "loss_aux_layer_4": 0.0574951171875, "loss_aux_layer_5": 0.05914306640625, "loss_aux_layer_6": 0.0621337890625, "loss_aux_layer_7": 0.060302734375, "loss_aux_layer_8": 0.0599365234375, "loss_aux_layer_9": 0.0587158203125, "step": 3453, "total_loss": 0.5933281481266022 }, { "epoch": 0.6838249851514552, "grad_norm": 0.7990873456001282, "learning_rate": 5e-05, "llm_loss": 0.6033226102590561, "loss": 2.7356, "loss_aux_layer_0": 0.0141448974609375, "loss_aux_layer_1": 0.030914306640625, "loss_aux_layer_10": 0.05743408203125, "loss_aux_layer_11": 0.06097412109375, "loss_aux_layer_12": 0.065185546875, "loss_aux_layer_13": 0.07037353515625, "loss_aux_layer_14": 0.078857421875, "loss_aux_layer_15": 0.0875244140625, "loss_aux_layer_16": 0.0970458984375, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.1173095703125, "loss_aux_layer_2": 0.043212890625, "loss_aux_layer_20": 0.12548828125, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.05230712890625, "loss_aux_layer_4": 0.054931640625, "loss_aux_layer_5": 0.05657958984375, "loss_aux_layer_6": 0.059326171875, "loss_aux_layer_7": 0.057373046875, "loss_aux_layer_8": 0.0572509765625, "loss_aux_layer_9": 0.0560302734375, "step": 3454, "total_loss": 0.683905765414238 }, { "epoch": 0.6840229657493566, "grad_norm": 1.1144012212753296, "learning_rate": 5e-05, "llm_loss": 0.5156205594539642, "loss": 2.4054, "loss_aux_layer_0": 0.0147705078125, "loss_aux_layer_1": 0.0330810546875, "loss_aux_layer_10": 0.06158447265625, "loss_aux_layer_11": 0.06591796875, "loss_aux_layer_12": 0.071044921875, "loss_aux_layer_13": 0.076904296875, "loss_aux_layer_14": 0.0858154296875, "loss_aux_layer_15": 0.09423828125, "loss_aux_layer_16": 0.1033935546875, "loss_aux_layer_17": 0.1114501953125, "loss_aux_layer_18": 0.1197509765625, "loss_aux_layer_19": 0.122802734375, "loss_aux_layer_2": 0.04583740234375, "loss_aux_layer_20": 0.1307373046875, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.199462890625, "loss_aux_layer_3": 0.05572509765625, "loss_aux_layer_4": 0.0582275390625, "loss_aux_layer_5": 0.06011962890625, "loss_aux_layer_6": 0.06317138671875, "loss_aux_layer_7": 0.0614013671875, "loss_aux_layer_8": 0.061279296875, "loss_aux_layer_9": 0.06024169921875, "step": 3455, "total_loss": 0.6013427525758743 }, { "epoch": 0.6842209463472579, "grad_norm": 0.9510241150856018, "learning_rate": 5e-05, "llm_loss": 0.6182287186384201, "loss": 2.8105, "loss_aux_layer_0": 0.013824462890625, "loss_aux_layer_1": 0.03375244140625, "loss_aux_layer_10": 0.0616455078125, "loss_aux_layer_11": 0.0655517578125, "loss_aux_layer_12": 0.0699462890625, "loss_aux_layer_13": 0.0755615234375, "loss_aux_layer_14": 0.083984375, "loss_aux_layer_15": 0.092041015625, "loss_aux_layer_16": 0.1007080078125, "loss_aux_layer_17": 0.1085205078125, "loss_aux_layer_18": 0.11669921875, "loss_aux_layer_19": 0.119384765625, "loss_aux_layer_2": 0.046630859375, "loss_aux_layer_20": 0.1273193359375, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.05682373046875, "loss_aux_layer_4": 0.05938720703125, "loss_aux_layer_5": 0.06103515625, "loss_aux_layer_6": 0.0643310546875, "loss_aux_layer_7": 0.0623779296875, "loss_aux_layer_8": 0.06195068359375, "loss_aux_layer_9": 0.06060791015625, "step": 3456, "total_loss": 0.702626422047615 }, { "epoch": 0.6844189269451594, "grad_norm": 1.213684320449829, "learning_rate": 5e-05, "llm_loss": 0.5945495963096619, "loss": 2.7238, "loss_aux_layer_0": 0.013702392578125, "loss_aux_layer_1": 0.03424072265625, "loss_aux_layer_10": 0.0638427734375, "loss_aux_layer_11": 0.0679931640625, "loss_aux_layer_12": 0.072998046875, "loss_aux_layer_13": 0.07861328125, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.09521484375, "loss_aux_layer_16": 0.1043701171875, "loss_aux_layer_17": 0.111328125, "loss_aux_layer_18": 0.119384765625, "loss_aux_layer_19": 0.1217041015625, "loss_aux_layer_2": 0.04815673828125, "loss_aux_layer_20": 0.1290283203125, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.05877685546875, "loss_aux_layer_4": 0.06121826171875, "loss_aux_layer_5": 0.06304931640625, "loss_aux_layer_6": 0.06591796875, "loss_aux_layer_7": 0.0640869140625, "loss_aux_layer_8": 0.0635986328125, "loss_aux_layer_9": 0.0625, "step": 3457, "total_loss": 0.680961012840271 }, { "epoch": 0.6846169075430608, "grad_norm": 1.0019482374191284, "learning_rate": 5e-05, "llm_loss": 0.640005961060524, "loss": 2.879, "loss_aux_layer_0": 0.014404296875, "loss_aux_layer_1": 0.0316162109375, "loss_aux_layer_10": 0.0567626953125, "loss_aux_layer_11": 0.06060791015625, "loss_aux_layer_12": 0.06494140625, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.0865478515625, "loss_aux_layer_16": 0.0958251953125, "loss_aux_layer_17": 0.103515625, "loss_aux_layer_18": 0.1121826171875, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.0430908203125, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.052490234375, "loss_aux_layer_4": 0.054443359375, "loss_aux_layer_5": 0.055908203125, "loss_aux_layer_6": 0.05841064453125, "loss_aux_layer_7": 0.0567626953125, "loss_aux_layer_8": 0.05645751953125, "loss_aux_layer_9": 0.055419921875, "step": 3458, "total_loss": 0.7197571098804474 }, { "epoch": 0.6848148881409621, "grad_norm": 1.303063988685608, "learning_rate": 5e-05, "llm_loss": 0.5845577567815781, "loss": 2.6802, "loss_aux_layer_0": 0.0144500732421875, "loss_aux_layer_1": 0.03436279296875, "loss_aux_layer_10": 0.06304931640625, "loss_aux_layer_11": 0.0672607421875, "loss_aux_layer_12": 0.071533203125, "loss_aux_layer_13": 0.0772705078125, "loss_aux_layer_14": 0.0858154296875, "loss_aux_layer_15": 0.0936279296875, "loss_aux_layer_16": 0.1024169921875, "loss_aux_layer_17": 0.1102294921875, "loss_aux_layer_18": 0.1177978515625, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.04827880859375, "loss_aux_layer_20": 0.1279296875, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.0584716796875, "loss_aux_layer_4": 0.06109619140625, "loss_aux_layer_5": 0.06256103515625, "loss_aux_layer_6": 0.0657958984375, "loss_aux_layer_7": 0.06341552734375, "loss_aux_layer_8": 0.06280517578125, "loss_aux_layer_9": 0.0615234375, "step": 3459, "total_loss": 0.6700556129217148 }, { "epoch": 0.6850128687388636, "grad_norm": 1.0837326049804688, "learning_rate": 5e-05, "llm_loss": 0.7161044776439667, "loss": 3.2007, "loss_aux_layer_0": 0.0142669677734375, "loss_aux_layer_1": 0.0328369140625, "loss_aux_layer_10": 0.0606689453125, "loss_aux_layer_11": 0.0648193359375, "loss_aux_layer_12": 0.069580078125, "loss_aux_layer_13": 0.0751953125, "loss_aux_layer_14": 0.0838623046875, "loss_aux_layer_15": 0.0924072265625, "loss_aux_layer_16": 0.1016845703125, "loss_aux_layer_17": 0.1097412109375, "loss_aux_layer_18": 0.11767578125, "loss_aux_layer_19": 0.12060546875, "loss_aux_layer_2": 0.0462646484375, "loss_aux_layer_20": 0.12841796875, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.055908203125, "loss_aux_layer_4": 0.05859375, "loss_aux_layer_5": 0.06036376953125, "loss_aux_layer_6": 0.06341552734375, "loss_aux_layer_7": 0.0614013671875, "loss_aux_layer_8": 0.0609130859375, "loss_aux_layer_9": 0.0594482421875, "step": 3460, "total_loss": 0.8001807481050491 }, { "epoch": 0.685210849336765, "grad_norm": 1.2118406295776367, "learning_rate": 5e-05, "llm_loss": 0.5933753252029419, "loss": 2.7153, "loss_aux_layer_0": 0.0147247314453125, "loss_aux_layer_1": 0.03521728515625, "loss_aux_layer_10": 0.0628662109375, "loss_aux_layer_11": 0.066650390625, "loss_aux_layer_12": 0.0711669921875, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.101318359375, "loss_aux_layer_17": 0.1090087890625, "loss_aux_layer_18": 0.116455078125, "loss_aux_layer_19": 0.11962890625, "loss_aux_layer_2": 0.0489501953125, "loss_aux_layer_20": 0.1273193359375, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.156494140625, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.05908203125, "loss_aux_layer_4": 0.0616455078125, "loss_aux_layer_5": 0.06304931640625, "loss_aux_layer_6": 0.0660400390625, "loss_aux_layer_7": 0.0638427734375, "loss_aux_layer_8": 0.06329345703125, "loss_aux_layer_9": 0.0616455078125, "step": 3461, "total_loss": 0.6788143962621689 }, { "epoch": 0.6854088299346665, "grad_norm": 1.072474718093872, "learning_rate": 5e-05, "llm_loss": 0.6103080362081528, "loss": 2.7635, "loss_aux_layer_0": 0.014801025390625, "loss_aux_layer_1": 0.03179931640625, "loss_aux_layer_10": 0.05694580078125, "loss_aux_layer_11": 0.0604248046875, "loss_aux_layer_12": 0.06475830078125, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.0958251953125, "loss_aux_layer_17": 0.103759765625, "loss_aux_layer_18": 0.1116943359375, "loss_aux_layer_19": 0.1160888671875, "loss_aux_layer_2": 0.04449462890625, "loss_aux_layer_20": 0.12451171875, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.190673828125, "loss_aux_layer_3": 0.0538330078125, "loss_aux_layer_4": 0.055908203125, "loss_aux_layer_5": 0.05712890625, "loss_aux_layer_6": 0.05938720703125, "loss_aux_layer_7": 0.057373046875, "loss_aux_layer_8": 0.056884765625, "loss_aux_layer_9": 0.0556640625, "step": 3462, "total_loss": 0.6908663213253021 }, { "epoch": 0.6856068105325678, "grad_norm": 1.6314789056777954, "learning_rate": 5e-05, "llm_loss": 0.6327477991580963, "loss": 2.8667, "loss_aux_layer_0": 0.0140533447265625, "loss_aux_layer_1": 0.0340576171875, "loss_aux_layer_10": 0.06072998046875, "loss_aux_layer_11": 0.0648193359375, "loss_aux_layer_12": 0.0693359375, "loss_aux_layer_13": 0.07470703125, "loss_aux_layer_14": 0.0831298828125, "loss_aux_layer_15": 0.0921630859375, "loss_aux_layer_16": 0.1011962890625, "loss_aux_layer_17": 0.1092529296875, "loss_aux_layer_18": 0.116943359375, "loss_aux_layer_19": 0.1202392578125, "loss_aux_layer_2": 0.046875, "loss_aux_layer_20": 0.1275634765625, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.191162109375, "loss_aux_layer_3": 0.05633544921875, "loss_aux_layer_4": 0.05859375, "loss_aux_layer_5": 0.0599365234375, "loss_aux_layer_6": 0.0628662109375, "loss_aux_layer_7": 0.06097412109375, "loss_aux_layer_8": 0.0604248046875, "loss_aux_layer_9": 0.05908203125, "step": 3463, "total_loss": 0.716662734746933 }, { "epoch": 0.6858047911304692, "grad_norm": 1.117934226989746, "learning_rate": 5e-05, "llm_loss": 0.471756711602211, "loss": 2.2344, "loss_aux_layer_0": 0.0148468017578125, "loss_aux_layer_1": 0.0338134765625, "loss_aux_layer_10": 0.063232421875, "loss_aux_layer_11": 0.067626953125, "loss_aux_layer_12": 0.072021484375, "loss_aux_layer_13": 0.0777587890625, "loss_aux_layer_14": 0.0865478515625, "loss_aux_layer_15": 0.0950927734375, "loss_aux_layer_16": 0.10400390625, "loss_aux_layer_17": 0.112060546875, "loss_aux_layer_18": 0.120361328125, "loss_aux_layer_19": 0.1236572265625, "loss_aux_layer_2": 0.047607421875, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.14013671875, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.19970703125, "loss_aux_layer_3": 0.057861328125, "loss_aux_layer_4": 0.06036376953125, "loss_aux_layer_5": 0.06201171875, "loss_aux_layer_6": 0.06512451171875, "loss_aux_layer_7": 0.0631103515625, "loss_aux_layer_8": 0.06280517578125, "loss_aux_layer_9": 0.061767578125, "step": 3464, "total_loss": 0.5586060285568237 }, { "epoch": 0.6860027717283707, "grad_norm": 1.0758877992630005, "learning_rate": 5e-05, "llm_loss": 0.5138700306415558, "loss": 2.3979, "loss_aux_layer_0": 0.0149993896484375, "loss_aux_layer_1": 0.033416748046875, "loss_aux_layer_10": 0.06097412109375, "loss_aux_layer_11": 0.0654296875, "loss_aux_layer_12": 0.07012939453125, "loss_aux_layer_13": 0.075927734375, "loss_aux_layer_14": 0.0850830078125, "loss_aux_layer_15": 0.09326171875, "loss_aux_layer_16": 0.1029052734375, "loss_aux_layer_17": 0.1104736328125, "loss_aux_layer_18": 0.1192626953125, "loss_aux_layer_19": 0.122314453125, "loss_aux_layer_2": 0.0469970703125, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.0567626953125, "loss_aux_layer_4": 0.05902099609375, "loss_aux_layer_5": 0.0604248046875, "loss_aux_layer_6": 0.0634765625, "loss_aux_layer_7": 0.06134033203125, "loss_aux_layer_8": 0.06103515625, "loss_aux_layer_9": 0.0599365234375, "step": 3465, "total_loss": 0.5994775593280792 }, { "epoch": 0.686200752326272, "grad_norm": 1.6768510341644287, "learning_rate": 5e-05, "llm_loss": 0.5517214834690094, "loss": 2.5493, "loss_aux_layer_0": 0.0140838623046875, "loss_aux_layer_1": 0.03375244140625, "loss_aux_layer_10": 0.0614013671875, "loss_aux_layer_11": 0.0655517578125, "loss_aux_layer_12": 0.070556640625, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.0853271484375, "loss_aux_layer_15": 0.093994140625, "loss_aux_layer_16": 0.1031494140625, "loss_aux_layer_17": 0.1107177734375, "loss_aux_layer_18": 0.1192626953125, "loss_aux_layer_19": 0.122802734375, "loss_aux_layer_2": 0.04827880859375, "loss_aux_layer_20": 0.1302490234375, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.0579833984375, "loss_aux_layer_4": 0.05963134765625, "loss_aux_layer_5": 0.06109619140625, "loss_aux_layer_6": 0.06365966796875, "loss_aux_layer_7": 0.06170654296875, "loss_aux_layer_8": 0.0611572265625, "loss_aux_layer_9": 0.05987548828125, "step": 3466, "total_loss": 0.6373148113489151 }, { "epoch": 0.6863987329241734, "grad_norm": 1.3070040941238403, "learning_rate": 5e-05, "llm_loss": 0.4712565392255783, "loss": 2.2323, "loss_aux_layer_0": 0.0155792236328125, "loss_aux_layer_1": 0.03485107421875, "loss_aux_layer_10": 0.06396484375, "loss_aux_layer_11": 0.0679931640625, "loss_aux_layer_12": 0.0726318359375, "loss_aux_layer_13": 0.0784912109375, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.0953369140625, "loss_aux_layer_16": 0.104248046875, "loss_aux_layer_17": 0.1116943359375, "loss_aux_layer_18": 0.11962890625, "loss_aux_layer_19": 0.121826171875, "loss_aux_layer_2": 0.04779052734375, "loss_aux_layer_20": 0.129150390625, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.197265625, "loss_aux_layer_3": 0.0579833984375, "loss_aux_layer_4": 0.06072998046875, "loss_aux_layer_5": 0.06292724609375, "loss_aux_layer_6": 0.06610107421875, "loss_aux_layer_7": 0.06439208984375, "loss_aux_layer_8": 0.0638427734375, "loss_aux_layer_9": 0.06243896484375, "step": 3467, "total_loss": 0.5580738112330437 }, { "epoch": 0.6865967135220749, "grad_norm": 1.3490902185440063, "learning_rate": 5e-05, "llm_loss": 0.6552413702011108, "loss": 2.9697, "loss_aux_layer_0": 0.0140533447265625, "loss_aux_layer_1": 0.035430908203125, "loss_aux_layer_10": 0.06402587890625, "loss_aux_layer_11": 0.06842041015625, "loss_aux_layer_12": 0.0731201171875, "loss_aux_layer_13": 0.079345703125, "loss_aux_layer_14": 0.088134765625, "loss_aux_layer_15": 0.0965576171875, "loss_aux_layer_16": 0.1058349609375, "loss_aux_layer_17": 0.11328125, "loss_aux_layer_18": 0.1207275390625, "loss_aux_layer_19": 0.1236572265625, "loss_aux_layer_2": 0.04974365234375, "loss_aux_layer_20": 0.13037109375, "loss_aux_layer_21": 0.136962890625, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.0595703125, "loss_aux_layer_4": 0.06219482421875, "loss_aux_layer_5": 0.06353759765625, "loss_aux_layer_6": 0.06634521484375, "loss_aux_layer_7": 0.064208984375, "loss_aux_layer_8": 0.06353759765625, "loss_aux_layer_9": 0.06243896484375, "step": 3468, "total_loss": 0.7424178570508957 }, { "epoch": 0.6867946941199763, "grad_norm": 1.1264082193374634, "learning_rate": 5e-05, "llm_loss": 0.48187974840402603, "loss": 2.2662, "loss_aux_layer_0": 0.01519775390625, "loss_aux_layer_1": 0.034423828125, "loss_aux_layer_10": 0.06146240234375, "loss_aux_layer_11": 0.06549072265625, "loss_aux_layer_12": 0.06982421875, "loss_aux_layer_13": 0.0751953125, "loss_aux_layer_14": 0.08349609375, "loss_aux_layer_15": 0.091552734375, "loss_aux_layer_16": 0.100830078125, "loss_aux_layer_17": 0.1082763671875, "loss_aux_layer_18": 0.1163330078125, "loss_aux_layer_19": 0.119384765625, "loss_aux_layer_2": 0.04840087890625, "loss_aux_layer_20": 0.1268310546875, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.1943359375, "loss_aux_layer_3": 0.05810546875, "loss_aux_layer_4": 0.0604248046875, "loss_aux_layer_5": 0.061767578125, "loss_aux_layer_6": 0.064453125, "loss_aux_layer_7": 0.06207275390625, "loss_aux_layer_8": 0.0615234375, "loss_aux_layer_9": 0.06011962890625, "step": 3469, "total_loss": 0.566545695066452 }, { "epoch": 0.6869926747178776, "grad_norm": 1.1480883359909058, "learning_rate": 5e-05, "llm_loss": 0.5468098819255829, "loss": 2.5358, "loss_aux_layer_0": 0.014617919921875, "loss_aux_layer_1": 0.034912109375, "loss_aux_layer_10": 0.06207275390625, "loss_aux_layer_11": 0.0662841796875, "loss_aux_layer_12": 0.07080078125, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.0855712890625, "loss_aux_layer_15": 0.09521484375, "loss_aux_layer_16": 0.1053466796875, "loss_aux_layer_17": 0.1129150390625, "loss_aux_layer_18": 0.1220703125, "loss_aux_layer_19": 0.125732421875, "loss_aux_layer_2": 0.047607421875, "loss_aux_layer_20": 0.134033203125, "loss_aux_layer_21": 0.143310546875, "loss_aux_layer_22": 0.164306640625, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.057373046875, "loss_aux_layer_4": 0.0599365234375, "loss_aux_layer_5": 0.0614013671875, "loss_aux_layer_6": 0.06414794921875, "loss_aux_layer_7": 0.06219482421875, "loss_aux_layer_8": 0.0616455078125, "loss_aux_layer_9": 0.060546875, "step": 3470, "total_loss": 0.633942574262619 }, { "epoch": 0.687190655315779, "grad_norm": 1.1102980375289917, "learning_rate": 5e-05, "llm_loss": 0.562828928232193, "loss": 2.6052, "loss_aux_layer_0": 0.0149688720703125, "loss_aux_layer_1": 0.0350341796875, "loss_aux_layer_10": 0.06317138671875, "loss_aux_layer_11": 0.0675048828125, "loss_aux_layer_12": 0.07275390625, "loss_aux_layer_13": 0.0791015625, "loss_aux_layer_14": 0.0882568359375, "loss_aux_layer_15": 0.09765625, "loss_aux_layer_16": 0.108154296875, "loss_aux_layer_17": 0.1163330078125, "loss_aux_layer_18": 0.1246337890625, "loss_aux_layer_19": 0.1275634765625, "loss_aux_layer_2": 0.0491943359375, "loss_aux_layer_20": 0.135009765625, "loss_aux_layer_21": 0.142578125, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.05902099609375, "loss_aux_layer_4": 0.06121826171875, "loss_aux_layer_5": 0.06256103515625, "loss_aux_layer_6": 0.06524658203125, "loss_aux_layer_7": 0.0634765625, "loss_aux_layer_8": 0.06329345703125, "loss_aux_layer_9": 0.0621337890625, "step": 3471, "total_loss": 0.6512919068336487 }, { "epoch": 0.6873886359136805, "grad_norm": 1.2423456907272339, "learning_rate": 5e-05, "llm_loss": 0.5307074338197708, "loss": 2.4575, "loss_aux_layer_0": 0.0154266357421875, "loss_aux_layer_1": 0.03326416015625, "loss_aux_layer_10": 0.060302734375, "loss_aux_layer_11": 0.06396484375, "loss_aux_layer_12": 0.0682373046875, "loss_aux_layer_13": 0.0732421875, "loss_aux_layer_14": 0.081787109375, "loss_aux_layer_15": 0.090087890625, "loss_aux_layer_16": 0.0992431640625, "loss_aux_layer_17": 0.106689453125, "loss_aux_layer_18": 0.115234375, "loss_aux_layer_19": 0.119140625, "loss_aux_layer_2": 0.0465087890625, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.05621337890625, "loss_aux_layer_4": 0.05853271484375, "loss_aux_layer_5": 0.0601806640625, "loss_aux_layer_6": 0.06298828125, "loss_aux_layer_7": 0.06103515625, "loss_aux_layer_8": 0.06036376953125, "loss_aux_layer_9": 0.05902099609375, "step": 3472, "total_loss": 0.6143797487020493 }, { "epoch": 0.6875866165115818, "grad_norm": 1.8436598777770996, "learning_rate": 5e-05, "llm_loss": 0.5735106468200684, "loss": 2.6302, "loss_aux_layer_0": 0.0146484375, "loss_aux_layer_1": 0.033599853515625, "loss_aux_layer_10": 0.0604248046875, "loss_aux_layer_11": 0.064208984375, "loss_aux_layer_12": 0.06866455078125, "loss_aux_layer_13": 0.0743408203125, "loss_aux_layer_14": 0.0833740234375, "loss_aux_layer_15": 0.0916748046875, "loss_aux_layer_16": 0.1011962890625, "loss_aux_layer_17": 0.1094970703125, "loss_aux_layer_18": 0.1171875, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.04730224609375, "loss_aux_layer_20": 0.1280517578125, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.056884765625, "loss_aux_layer_4": 0.05914306640625, "loss_aux_layer_5": 0.0606689453125, "loss_aux_layer_6": 0.06304931640625, "loss_aux_layer_7": 0.06085205078125, "loss_aux_layer_8": 0.0601806640625, "loss_aux_layer_9": 0.05908203125, "step": 3473, "total_loss": 0.6575466245412827 }, { "epoch": 0.6877845971094833, "grad_norm": 1.149827241897583, "learning_rate": 5e-05, "llm_loss": 0.5868443250656128, "loss": 2.6915, "loss_aux_layer_0": 0.014801025390625, "loss_aux_layer_1": 0.03485107421875, "loss_aux_layer_10": 0.0625, "loss_aux_layer_11": 0.0665283203125, "loss_aux_layer_12": 0.0709228515625, "loss_aux_layer_13": 0.0762939453125, "loss_aux_layer_14": 0.0849609375, "loss_aux_layer_15": 0.09375, "loss_aux_layer_16": 0.10302734375, "loss_aux_layer_17": 0.11083984375, "loss_aux_layer_18": 0.1192626953125, "loss_aux_layer_19": 0.123046875, "loss_aux_layer_2": 0.0487060546875, "loss_aux_layer_20": 0.13037109375, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.05841064453125, "loss_aux_layer_4": 0.060546875, "loss_aux_layer_5": 0.06219482421875, "loss_aux_layer_6": 0.06524658203125, "loss_aux_layer_7": 0.06304931640625, "loss_aux_layer_8": 0.0623779296875, "loss_aux_layer_9": 0.0609130859375, "step": 3474, "total_loss": 0.672874391078949 }, { "epoch": 0.6879825777073847, "grad_norm": 1.2527790069580078, "learning_rate": 5e-05, "llm_loss": 0.5116779655218124, "loss": 2.3947, "loss_aux_layer_0": 0.015167236328125, "loss_aux_layer_1": 0.034515380859375, "loss_aux_layer_10": 0.06304931640625, "loss_aux_layer_11": 0.06732177734375, "loss_aux_layer_12": 0.0718994140625, "loss_aux_layer_13": 0.0775146484375, "loss_aux_layer_14": 0.08642578125, "loss_aux_layer_15": 0.0947265625, "loss_aux_layer_16": 0.104248046875, "loss_aux_layer_17": 0.1121826171875, "loss_aux_layer_18": 0.120849609375, "loss_aux_layer_19": 0.1239013671875, "loss_aux_layer_2": 0.0484619140625, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.198486328125, "loss_aux_layer_3": 0.058349609375, "loss_aux_layer_4": 0.0611572265625, "loss_aux_layer_5": 0.0628662109375, "loss_aux_layer_6": 0.065673828125, "loss_aux_layer_7": 0.063720703125, "loss_aux_layer_8": 0.06304931640625, "loss_aux_layer_9": 0.06170654296875, "step": 3475, "total_loss": 0.5986820533871651 }, { "epoch": 0.6881805583052861, "grad_norm": 1.0490614175796509, "learning_rate": 5e-05, "llm_loss": 0.632463201880455, "loss": 2.8782, "loss_aux_layer_0": 0.0149078369140625, "loss_aux_layer_1": 0.03515625, "loss_aux_layer_10": 0.063720703125, "loss_aux_layer_11": 0.0677490234375, "loss_aux_layer_12": 0.072509765625, "loss_aux_layer_13": 0.0777587890625, "loss_aux_layer_14": 0.086669921875, "loss_aux_layer_15": 0.095458984375, "loss_aux_layer_16": 0.104736328125, "loss_aux_layer_17": 0.1123046875, "loss_aux_layer_18": 0.1204833984375, "loss_aux_layer_19": 0.12353515625, "loss_aux_layer_2": 0.04852294921875, "loss_aux_layer_20": 0.1314697265625, "loss_aux_layer_21": 0.1396484375, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.05877685546875, "loss_aux_layer_4": 0.06134033203125, "loss_aux_layer_5": 0.06280517578125, "loss_aux_layer_6": 0.06591796875, "loss_aux_layer_7": 0.063720703125, "loss_aux_layer_8": 0.0635986328125, "loss_aux_layer_9": 0.0623779296875, "step": 3476, "total_loss": 0.7195495367050171 }, { "epoch": 0.6883785389031875, "grad_norm": 1.1629126071929932, "learning_rate": 5e-05, "llm_loss": 0.6194090098142624, "loss": 2.8087, "loss_aux_layer_0": 0.0154266357421875, "loss_aux_layer_1": 0.03302001953125, "loss_aux_layer_10": 0.05963134765625, "loss_aux_layer_11": 0.0635986328125, "loss_aux_layer_12": 0.06768798828125, "loss_aux_layer_13": 0.0731201171875, "loss_aux_layer_14": 0.081298828125, "loss_aux_layer_15": 0.0897216796875, "loss_aux_layer_16": 0.0987548828125, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.1148681640625, "loss_aux_layer_19": 0.118408203125, "loss_aux_layer_2": 0.0457763671875, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.05572509765625, "loss_aux_layer_4": 0.05804443359375, "loss_aux_layer_5": 0.0595703125, "loss_aux_layer_6": 0.062255859375, "loss_aux_layer_7": 0.06005859375, "loss_aux_layer_8": 0.05938720703125, "loss_aux_layer_9": 0.058349609375, "step": 3477, "total_loss": 0.7021712958812714 }, { "epoch": 0.6885765195010889, "grad_norm": 0.8892202377319336, "learning_rate": 5e-05, "llm_loss": 0.5263695642352104, "loss": 2.4486, "loss_aux_layer_0": 0.01580810546875, "loss_aux_layer_1": 0.03375244140625, "loss_aux_layer_10": 0.0616455078125, "loss_aux_layer_11": 0.06634521484375, "loss_aux_layer_12": 0.07080078125, "loss_aux_layer_13": 0.0762939453125, "loss_aux_layer_14": 0.0850830078125, "loss_aux_layer_15": 0.09375, "loss_aux_layer_16": 0.1033935546875, "loss_aux_layer_17": 0.11083984375, "loss_aux_layer_18": 0.119140625, "loss_aux_layer_19": 0.123291015625, "loss_aux_layer_2": 0.04644775390625, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.0567626953125, "loss_aux_layer_4": 0.0594482421875, "loss_aux_layer_5": 0.061279296875, "loss_aux_layer_6": 0.0645751953125, "loss_aux_layer_7": 0.0623779296875, "loss_aux_layer_8": 0.0616455078125, "loss_aux_layer_9": 0.0604248046875, "step": 3478, "total_loss": 0.6121536940336227 }, { "epoch": 0.6887745000989903, "grad_norm": 1.0007890462875366, "learning_rate": 5e-05, "llm_loss": 0.5318259671330452, "loss": 2.469, "loss_aux_layer_0": 0.0144500732421875, "loss_aux_layer_1": 0.032958984375, "loss_aux_layer_10": 0.06201171875, "loss_aux_layer_11": 0.06591796875, "loss_aux_layer_12": 0.07080078125, "loss_aux_layer_13": 0.0765380859375, "loss_aux_layer_14": 0.0848388671875, "loss_aux_layer_15": 0.093505859375, "loss_aux_layer_16": 0.102783203125, "loss_aux_layer_17": 0.1104736328125, "loss_aux_layer_18": 0.1187744140625, "loss_aux_layer_19": 0.1224365234375, "loss_aux_layer_2": 0.0462646484375, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.0565185546875, "loss_aux_layer_4": 0.05926513671875, "loss_aux_layer_5": 0.06121826171875, "loss_aux_layer_6": 0.06439208984375, "loss_aux_layer_7": 0.06219482421875, "loss_aux_layer_8": 0.0618896484375, "loss_aux_layer_9": 0.0606689453125, "step": 3479, "total_loss": 0.6172541975975037 }, { "epoch": 0.6889724806968917, "grad_norm": 1.075134515762329, "learning_rate": 5e-05, "llm_loss": 0.5712240934371948, "loss": 2.6184, "loss_aux_layer_0": 0.015716552734375, "loss_aux_layer_1": 0.033935546875, "loss_aux_layer_10": 0.05963134765625, "loss_aux_layer_11": 0.0634765625, "loss_aux_layer_12": 0.0679931640625, "loss_aux_layer_13": 0.073486328125, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.090576171875, "loss_aux_layer_16": 0.1002197265625, "loss_aux_layer_17": 0.1085205078125, "loss_aux_layer_18": 0.1173095703125, "loss_aux_layer_19": 0.1204833984375, "loss_aux_layer_2": 0.045654296875, "loss_aux_layer_20": 0.1287841796875, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.05523681640625, "loss_aux_layer_4": 0.057373046875, "loss_aux_layer_5": 0.058837890625, "loss_aux_layer_6": 0.06134033203125, "loss_aux_layer_7": 0.05938720703125, "loss_aux_layer_8": 0.0592041015625, "loss_aux_layer_9": 0.05828857421875, "step": 3480, "total_loss": 0.6545919328927994 }, { "epoch": 0.6891704612947931, "grad_norm": 0.9368428587913513, "learning_rate": 5e-05, "llm_loss": 0.5652660056948662, "loss": 2.5945, "loss_aux_layer_0": 0.0144805908203125, "loss_aux_layer_1": 0.033172607421875, "loss_aux_layer_10": 0.05950927734375, "loss_aux_layer_11": 0.06365966796875, "loss_aux_layer_12": 0.0682373046875, "loss_aux_layer_13": 0.07373046875, "loss_aux_layer_14": 0.08203125, "loss_aux_layer_15": 0.0904541015625, "loss_aux_layer_16": 0.0997314453125, "loss_aux_layer_17": 0.10791015625, "loss_aux_layer_18": 0.1162109375, "loss_aux_layer_19": 0.120361328125, "loss_aux_layer_2": 0.04534912109375, "loss_aux_layer_20": 0.1280517578125, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.194580078125, "loss_aux_layer_3": 0.05487060546875, "loss_aux_layer_4": 0.05731201171875, "loss_aux_layer_5": 0.05877685546875, "loss_aux_layer_6": 0.06158447265625, "loss_aux_layer_7": 0.05963134765625, "loss_aux_layer_8": 0.05908203125, "loss_aux_layer_9": 0.05816650390625, "step": 3481, "total_loss": 0.6486217528581619 }, { "epoch": 0.6893684418926945, "grad_norm": 1.1298694610595703, "learning_rate": 5e-05, "llm_loss": 0.5831395238637924, "loss": 2.6623, "loss_aux_layer_0": 0.015106201171875, "loss_aux_layer_1": 0.033203125, "loss_aux_layer_10": 0.058837890625, "loss_aux_layer_11": 0.06280517578125, "loss_aux_layer_12": 0.067138671875, "loss_aux_layer_13": 0.072509765625, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.089111328125, "loss_aux_layer_16": 0.098388671875, "loss_aux_layer_17": 0.106201171875, "loss_aux_layer_18": 0.1146240234375, "loss_aux_layer_19": 0.1185302734375, "loss_aux_layer_2": 0.04547119140625, "loss_aux_layer_20": 0.1259765625, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05523681640625, "loss_aux_layer_4": 0.05755615234375, "loss_aux_layer_5": 0.05908203125, "loss_aux_layer_6": 0.06195068359375, "loss_aux_layer_7": 0.0595703125, "loss_aux_layer_8": 0.0589599609375, "loss_aux_layer_9": 0.05767822265625, "step": 3482, "total_loss": 0.6655735373497009 }, { "epoch": 0.689566422490596, "grad_norm": 1.091604471206665, "learning_rate": 5e-05, "llm_loss": 0.5923157185316086, "loss": 2.6926, "loss_aux_layer_0": 0.0141754150390625, "loss_aux_layer_1": 0.0323486328125, "loss_aux_layer_10": 0.05767822265625, "loss_aux_layer_11": 0.06170654296875, "loss_aux_layer_12": 0.066162109375, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.0797119140625, "loss_aux_layer_15": 0.0875244140625, "loss_aux_layer_16": 0.0968017578125, "loss_aux_layer_17": 0.1043701171875, "loss_aux_layer_18": 0.1124267578125, "loss_aux_layer_19": 0.115966796875, "loss_aux_layer_2": 0.04461669921875, "loss_aux_layer_20": 0.1239013671875, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.05364990234375, "loss_aux_layer_4": 0.055908203125, "loss_aux_layer_5": 0.05718994140625, "loss_aux_layer_6": 0.05975341796875, "loss_aux_layer_7": 0.05792236328125, "loss_aux_layer_8": 0.05743408203125, "loss_aux_layer_9": 0.056396484375, "step": 3483, "total_loss": 0.6731463670730591 }, { "epoch": 0.6897644030884973, "grad_norm": 0.9398313164710999, "learning_rate": 5e-05, "llm_loss": 0.5999383106827736, "loss": 2.7358, "loss_aux_layer_0": 0.014495849609375, "loss_aux_layer_1": 0.0333251953125, "loss_aux_layer_10": 0.060302734375, "loss_aux_layer_11": 0.06427001953125, "loss_aux_layer_12": 0.0689697265625, "loss_aux_layer_13": 0.0750732421875, "loss_aux_layer_14": 0.0833740234375, "loss_aux_layer_15": 0.0916748046875, "loss_aux_layer_16": 0.1007080078125, "loss_aux_layer_17": 0.1090087890625, "loss_aux_layer_18": 0.1170654296875, "loss_aux_layer_19": 0.1204833984375, "loss_aux_layer_2": 0.0462646484375, "loss_aux_layer_20": 0.1279296875, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.1943359375, "loss_aux_layer_3": 0.0555419921875, "loss_aux_layer_4": 0.057861328125, "loss_aux_layer_5": 0.059326171875, "loss_aux_layer_6": 0.0623779296875, "loss_aux_layer_7": 0.0604248046875, "loss_aux_layer_8": 0.0599365234375, "loss_aux_layer_9": 0.05889892578125, "step": 3484, "total_loss": 0.6839380264282227 }, { "epoch": 0.6899623836863987, "grad_norm": 1.8009110689163208, "learning_rate": 5e-05, "llm_loss": 0.630969911813736, "loss": 2.8645, "loss_aux_layer_0": 0.0144805908203125, "loss_aux_layer_1": 0.034393310546875, "loss_aux_layer_10": 0.0614013671875, "loss_aux_layer_11": 0.06561279296875, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.075927734375, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.0928955078125, "loss_aux_layer_16": 0.102294921875, "loss_aux_layer_17": 0.1099853515625, "loss_aux_layer_18": 0.1180419921875, "loss_aux_layer_19": 0.1214599609375, "loss_aux_layer_2": 0.04803466796875, "loss_aux_layer_20": 0.12939453125, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.0574951171875, "loss_aux_layer_4": 0.05987548828125, "loss_aux_layer_5": 0.06158447265625, "loss_aux_layer_6": 0.06427001953125, "loss_aux_layer_7": 0.06219482421875, "loss_aux_layer_8": 0.0616455078125, "loss_aux_layer_9": 0.06005859375, "step": 3485, "total_loss": 0.7161349803209305 }, { "epoch": 0.6901603642843002, "grad_norm": 1.3226509094238281, "learning_rate": 5e-05, "llm_loss": 0.5718173161149025, "loss": 2.631, "loss_aux_layer_0": 0.01861572265625, "loss_aux_layer_1": 0.03497314453125, "loss_aux_layer_10": 0.06231689453125, "loss_aux_layer_11": 0.06622314453125, "loss_aux_layer_12": 0.070556640625, "loss_aux_layer_13": 0.0760498046875, "loss_aux_layer_14": 0.0848388671875, "loss_aux_layer_15": 0.093505859375, "loss_aux_layer_16": 0.1026611328125, "loss_aux_layer_17": 0.110107421875, "loss_aux_layer_18": 0.1182861328125, "loss_aux_layer_19": 0.122314453125, "loss_aux_layer_2": 0.04827880859375, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.05767822265625, "loss_aux_layer_4": 0.0601806640625, "loss_aux_layer_5": 0.06182861328125, "loss_aux_layer_6": 0.06488037109375, "loss_aux_layer_7": 0.06280517578125, "loss_aux_layer_8": 0.06219482421875, "loss_aux_layer_9": 0.06109619140625, "step": 3486, "total_loss": 0.6577482968568802 }, { "epoch": 0.6903583448822015, "grad_norm": 1.5790841579437256, "learning_rate": 5e-05, "llm_loss": 0.5992409437894821, "loss": 2.7418, "loss_aux_layer_0": 0.015533447265625, "loss_aux_layer_1": 0.035400390625, "loss_aux_layer_10": 0.06298828125, "loss_aux_layer_11": 0.0670166015625, "loss_aux_layer_12": 0.071533203125, "loss_aux_layer_13": 0.076904296875, "loss_aux_layer_14": 0.085205078125, "loss_aux_layer_15": 0.0936279296875, "loss_aux_layer_16": 0.1024169921875, "loss_aux_layer_17": 0.1102294921875, "loss_aux_layer_18": 0.1182861328125, "loss_aux_layer_19": 0.12109375, "loss_aux_layer_2": 0.051025390625, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.136962890625, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.06024169921875, "loss_aux_layer_4": 0.06158447265625, "loss_aux_layer_5": 0.06298828125, "loss_aux_layer_6": 0.0657958984375, "loss_aux_layer_7": 0.063720703125, "loss_aux_layer_8": 0.06298828125, "loss_aux_layer_9": 0.06170654296875, "step": 3487, "total_loss": 0.6854383945465088 }, { "epoch": 0.6905563254801029, "grad_norm": 1.1191186904907227, "learning_rate": 5e-05, "llm_loss": 0.5857966616749763, "loss": 2.7093, "loss_aux_layer_0": 0.01776123046875, "loss_aux_layer_1": 0.03875732421875, "loss_aux_layer_10": 0.0672607421875, "loss_aux_layer_11": 0.0714111328125, "loss_aux_layer_12": 0.0760498046875, "loss_aux_layer_13": 0.08203125, "loss_aux_layer_14": 0.0909423828125, "loss_aux_layer_15": 0.099609375, "loss_aux_layer_16": 0.108642578125, "loss_aux_layer_17": 0.1158447265625, "loss_aux_layer_18": 0.12451171875, "loss_aux_layer_19": 0.1273193359375, "loss_aux_layer_2": 0.0535888671875, "loss_aux_layer_20": 0.13525390625, "loss_aux_layer_21": 0.1435546875, "loss_aux_layer_22": 0.165771484375, "loss_aux_layer_23": 0.205810546875, "loss_aux_layer_3": 0.06414794921875, "loss_aux_layer_4": 0.0665283203125, "loss_aux_layer_5": 0.06829833984375, "loss_aux_layer_6": 0.071044921875, "loss_aux_layer_7": 0.06890869140625, "loss_aux_layer_8": 0.06787109375, "loss_aux_layer_9": 0.0662841796875, "step": 3488, "total_loss": 0.6773343533277512 }, { "epoch": 0.6907543060780044, "grad_norm": 1.349819302558899, "learning_rate": 5e-05, "llm_loss": 0.5093389302492142, "loss": 2.3773, "loss_aux_layer_0": 0.020751953125, "loss_aux_layer_1": 0.03558349609375, "loss_aux_layer_10": 0.06170654296875, "loss_aux_layer_11": 0.0657958984375, "loss_aux_layer_12": 0.0701904296875, "loss_aux_layer_13": 0.0755615234375, "loss_aux_layer_14": 0.0841064453125, "loss_aux_layer_15": 0.09228515625, "loss_aux_layer_16": 0.1014404296875, "loss_aux_layer_17": 0.1087646484375, "loss_aux_layer_18": 0.1170654296875, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.047607421875, "loss_aux_layer_20": 0.1279296875, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.0567626953125, "loss_aux_layer_4": 0.0592041015625, "loss_aux_layer_5": 0.060546875, "loss_aux_layer_6": 0.0631103515625, "loss_aux_layer_7": 0.0614013671875, "loss_aux_layer_8": 0.06121826171875, "loss_aux_layer_9": 0.06036376953125, "step": 3489, "total_loss": 0.5943159312009811 }, { "epoch": 0.6909522866759058, "grad_norm": 0.8462128639221191, "learning_rate": 5e-05, "llm_loss": 0.5810503214597702, "loss": 2.6642, "loss_aux_layer_0": 0.01446533203125, "loss_aux_layer_1": 0.0341796875, "loss_aux_layer_10": 0.06121826171875, "loss_aux_layer_11": 0.0653076171875, "loss_aux_layer_12": 0.0701904296875, "loss_aux_layer_13": 0.075439453125, "loss_aux_layer_14": 0.083984375, "loss_aux_layer_15": 0.0924072265625, "loss_aux_layer_16": 0.101806640625, "loss_aux_layer_17": 0.10986328125, "loss_aux_layer_18": 0.1185302734375, "loss_aux_layer_19": 0.1221923828125, "loss_aux_layer_2": 0.04742431640625, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.136962890625, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.05731201171875, "loss_aux_layer_4": 0.05975341796875, "loss_aux_layer_5": 0.06109619140625, "loss_aux_layer_6": 0.063720703125, "loss_aux_layer_7": 0.06182861328125, "loss_aux_layer_8": 0.061279296875, "loss_aux_layer_9": 0.06005859375, "step": 3490, "total_loss": 0.6660456731915474 }, { "epoch": 0.6911502672738071, "grad_norm": 1.3783152103424072, "learning_rate": 5e-05, "llm_loss": 0.5437157303094864, "loss": 2.5162, "loss_aux_layer_0": 0.020050048828125, "loss_aux_layer_1": 0.03607177734375, "loss_aux_layer_10": 0.06134033203125, "loss_aux_layer_11": 0.065673828125, "loss_aux_layer_12": 0.0703125, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.0841064453125, "loss_aux_layer_15": 0.0926513671875, "loss_aux_layer_16": 0.1014404296875, "loss_aux_layer_17": 0.1092529296875, "loss_aux_layer_18": 0.1173095703125, "loss_aux_layer_19": 0.1204833984375, "loss_aux_layer_2": 0.04827880859375, "loss_aux_layer_20": 0.128173828125, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.196044921875, "loss_aux_layer_3": 0.0574951171875, "loss_aux_layer_4": 0.05999755859375, "loss_aux_layer_5": 0.06109619140625, "loss_aux_layer_6": 0.06396484375, "loss_aux_layer_7": 0.06195068359375, "loss_aux_layer_8": 0.06134033203125, "loss_aux_layer_9": 0.06005859375, "step": 3491, "total_loss": 0.629046693444252 }, { "epoch": 0.6913482478717086, "grad_norm": 0.9952370524406433, "learning_rate": 5e-05, "llm_loss": 0.5570144653320312, "loss": 2.5559, "loss_aux_layer_0": 0.0169677734375, "loss_aux_layer_1": 0.033172607421875, "loss_aux_layer_10": 0.05841064453125, "loss_aux_layer_11": 0.06207275390625, "loss_aux_layer_12": 0.0662841796875, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.080322265625, "loss_aux_layer_15": 0.088623046875, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.106201171875, "loss_aux_layer_18": 0.114990234375, "loss_aux_layer_19": 0.118408203125, "loss_aux_layer_2": 0.0455322265625, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.0546875, "loss_aux_layer_4": 0.05694580078125, "loss_aux_layer_5": 0.05810546875, "loss_aux_layer_6": 0.06097412109375, "loss_aux_layer_7": 0.05902099609375, "loss_aux_layer_8": 0.05853271484375, "loss_aux_layer_9": 0.0572509765625, "step": 3492, "total_loss": 0.638974666595459 }, { "epoch": 0.69154622846961, "grad_norm": 1.1575003862380981, "learning_rate": 5e-05, "llm_loss": 0.5479885935783386, "loss": 2.5306, "loss_aux_layer_0": 0.0157318115234375, "loss_aux_layer_1": 0.03466796875, "loss_aux_layer_10": 0.0616455078125, "loss_aux_layer_11": 0.0655517578125, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.0755615234375, "loss_aux_layer_14": 0.0841064453125, "loss_aux_layer_15": 0.0921630859375, "loss_aux_layer_16": 0.1011962890625, "loss_aux_layer_17": 0.109130859375, "loss_aux_layer_18": 0.1170654296875, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.04754638671875, "loss_aux_layer_20": 0.1280517578125, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.05718994140625, "loss_aux_layer_4": 0.05963134765625, "loss_aux_layer_5": 0.06134033203125, "loss_aux_layer_6": 0.0640869140625, "loss_aux_layer_7": 0.06201171875, "loss_aux_layer_8": 0.06182861328125, "loss_aux_layer_9": 0.0604248046875, "step": 3493, "total_loss": 0.632649376988411 }, { "epoch": 0.6917442090675113, "grad_norm": 1.0174490213394165, "learning_rate": 5e-05, "llm_loss": 0.5934179574251175, "loss": 2.7043, "loss_aux_layer_0": 0.0159912109375, "loss_aux_layer_1": 0.03521728515625, "loss_aux_layer_10": 0.05950927734375, "loss_aux_layer_11": 0.063720703125, "loss_aux_layer_12": 0.06805419921875, "loss_aux_layer_13": 0.073486328125, "loss_aux_layer_14": 0.081298828125, "loss_aux_layer_15": 0.0894775390625, "loss_aux_layer_16": 0.0987548828125, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.114990234375, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.0469970703125, "loss_aux_layer_20": 0.125244140625, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.05621337890625, "loss_aux_layer_4": 0.05853271484375, "loss_aux_layer_5": 0.05938720703125, "loss_aux_layer_6": 0.0618896484375, "loss_aux_layer_7": 0.06011962890625, "loss_aux_layer_8": 0.05963134765625, "loss_aux_layer_9": 0.058349609375, "step": 3494, "total_loss": 0.6760822087526321 }, { "epoch": 0.6919421896654128, "grad_norm": 0.8881546258926392, "learning_rate": 5e-05, "llm_loss": 0.5862956717610359, "loss": 2.6918, "loss_aux_layer_0": 0.014892578125, "loss_aux_layer_1": 0.03533935546875, "loss_aux_layer_10": 0.06329345703125, "loss_aux_layer_11": 0.067626953125, "loss_aux_layer_12": 0.072021484375, "loss_aux_layer_13": 0.0772705078125, "loss_aux_layer_14": 0.085693359375, "loss_aux_layer_15": 0.0941162109375, "loss_aux_layer_16": 0.1036376953125, "loss_aux_layer_17": 0.111083984375, "loss_aux_layer_18": 0.119384765625, "loss_aux_layer_19": 0.1224365234375, "loss_aux_layer_2": 0.04913330078125, "loss_aux_layer_20": 0.130126953125, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.05914306640625, "loss_aux_layer_4": 0.0616455078125, "loss_aux_layer_5": 0.06292724609375, "loss_aux_layer_6": 0.06591796875, "loss_aux_layer_7": 0.0638427734375, "loss_aux_layer_8": 0.0633544921875, "loss_aux_layer_9": 0.06195068359375, "step": 3495, "total_loss": 0.6729487478733063 }, { "epoch": 0.6921401702633142, "grad_norm": 0.9613595604896545, "learning_rate": 5e-05, "llm_loss": 0.6017136499285698, "loss": 2.7457, "loss_aux_layer_0": 0.0158843994140625, "loss_aux_layer_1": 0.033935546875, "loss_aux_layer_10": 0.06134033203125, "loss_aux_layer_11": 0.06524658203125, "loss_aux_layer_12": 0.0694580078125, "loss_aux_layer_13": 0.07470703125, "loss_aux_layer_14": 0.0828857421875, "loss_aux_layer_15": 0.09130859375, "loss_aux_layer_16": 0.1004638671875, "loss_aux_layer_17": 0.1085205078125, "loss_aux_layer_18": 0.1170654296875, "loss_aux_layer_19": 0.1209716796875, "loss_aux_layer_2": 0.04730224609375, "loss_aux_layer_20": 0.128662109375, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.196044921875, "loss_aux_layer_3": 0.05706787109375, "loss_aux_layer_4": 0.059326171875, "loss_aux_layer_5": 0.0606689453125, "loss_aux_layer_6": 0.06378173828125, "loss_aux_layer_7": 0.06170654296875, "loss_aux_layer_8": 0.06134033203125, "loss_aux_layer_9": 0.0601806640625, "step": 3496, "total_loss": 0.6864179223775864 }, { "epoch": 0.6923381508612156, "grad_norm": 0.8545570373535156, "learning_rate": 5e-05, "llm_loss": 0.5133636444807053, "loss": 2.3877, "loss_aux_layer_0": 0.01568603515625, "loss_aux_layer_1": 0.0333251953125, "loss_aux_layer_10": 0.05889892578125, "loss_aux_layer_11": 0.06292724609375, "loss_aux_layer_12": 0.0673828125, "loss_aux_layer_13": 0.0726318359375, "loss_aux_layer_14": 0.08203125, "loss_aux_layer_15": 0.0909423828125, "loss_aux_layer_16": 0.1004638671875, "loss_aux_layer_17": 0.1085205078125, "loss_aux_layer_18": 0.117431640625, "loss_aux_layer_19": 0.120849609375, "loss_aux_layer_2": 0.045654296875, "loss_aux_layer_20": 0.1287841796875, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.0552978515625, "loss_aux_layer_4": 0.0576171875, "loss_aux_layer_5": 0.05902099609375, "loss_aux_layer_6": 0.06158447265625, "loss_aux_layer_7": 0.05938720703125, "loss_aux_layer_8": 0.0589599609375, "loss_aux_layer_9": 0.057861328125, "step": 3497, "total_loss": 0.5969365239143372 }, { "epoch": 0.692536131459117, "grad_norm": 1.229472041130066, "learning_rate": 5e-05, "llm_loss": 0.5383952260017395, "loss": 2.4965, "loss_aux_layer_0": 0.014984130859375, "loss_aux_layer_1": 0.03448486328125, "loss_aux_layer_10": 0.06195068359375, "loss_aux_layer_11": 0.066162109375, "loss_aux_layer_12": 0.0706787109375, "loss_aux_layer_13": 0.076171875, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.093017578125, "loss_aux_layer_16": 0.1019287109375, "loss_aux_layer_17": 0.10986328125, "loss_aux_layer_18": 0.117919921875, "loss_aux_layer_19": 0.12109375, "loss_aux_layer_2": 0.0482177734375, "loss_aux_layer_20": 0.12939453125, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.197998046875, "loss_aux_layer_3": 0.05810546875, "loss_aux_layer_4": 0.06036376953125, "loss_aux_layer_5": 0.06195068359375, "loss_aux_layer_6": 0.06463623046875, "loss_aux_layer_7": 0.0625, "loss_aux_layer_8": 0.06195068359375, "loss_aux_layer_9": 0.06060791015625, "step": 3498, "total_loss": 0.6241185516119003 }, { "epoch": 0.6927341120570184, "grad_norm": 0.7786878347396851, "learning_rate": 5e-05, "llm_loss": 0.5550594851374626, "loss": 2.5665, "loss_aux_layer_0": 0.0147705078125, "loss_aux_layer_1": 0.034698486328125, "loss_aux_layer_10": 0.0625, "loss_aux_layer_11": 0.06671142578125, "loss_aux_layer_12": 0.071533203125, "loss_aux_layer_13": 0.0772705078125, "loss_aux_layer_14": 0.08642578125, "loss_aux_layer_15": 0.0950927734375, "loss_aux_layer_16": 0.1048583984375, "loss_aux_layer_17": 0.1129150390625, "loss_aux_layer_18": 0.120849609375, "loss_aux_layer_19": 0.1239013671875, "loss_aux_layer_2": 0.04876708984375, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.05859375, "loss_aux_layer_4": 0.060791015625, "loss_aux_layer_5": 0.06231689453125, "loss_aux_layer_6": 0.0650634765625, "loss_aux_layer_7": 0.0628662109375, "loss_aux_layer_8": 0.06243896484375, "loss_aux_layer_9": 0.0611572265625, "step": 3499, "total_loss": 0.6416340470314026 }, { "epoch": 0.6929320926549198, "grad_norm": 1.1656197309494019, "learning_rate": 5e-05, "llm_loss": 0.6192118525505066, "loss": 2.8162, "loss_aux_layer_0": 0.01446533203125, "loss_aux_layer_1": 0.0340576171875, "loss_aux_layer_10": 0.0616455078125, "loss_aux_layer_11": 0.0657958984375, "loss_aux_layer_12": 0.0701904296875, "loss_aux_layer_13": 0.0758056640625, "loss_aux_layer_14": 0.0845947265625, "loss_aux_layer_15": 0.0931396484375, "loss_aux_layer_16": 0.1025390625, "loss_aux_layer_17": 0.1103515625, "loss_aux_layer_18": 0.1180419921875, "loss_aux_layer_19": 0.1202392578125, "loss_aux_layer_2": 0.04791259765625, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.057861328125, "loss_aux_layer_4": 0.06048583984375, "loss_aux_layer_5": 0.061767578125, "loss_aux_layer_6": 0.0645751953125, "loss_aux_layer_7": 0.0623779296875, "loss_aux_layer_8": 0.06182861328125, "loss_aux_layer_9": 0.060302734375, "step": 3500, "total_loss": 0.7040395140647888 }, { "epoch": 0.6931300732528213, "grad_norm": 0.9306498169898987, "learning_rate": 5e-05, "llm_loss": 0.5621632188558578, "loss": 2.5774, "loss_aux_layer_0": 0.0146484375, "loss_aux_layer_1": 0.030792236328125, "loss_aux_layer_10": 0.05755615234375, "loss_aux_layer_11": 0.06134033203125, "loss_aux_layer_12": 0.06591796875, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.08984375, "loss_aux_layer_16": 0.0994873046875, "loss_aux_layer_17": 0.107421875, "loss_aux_layer_18": 0.115966796875, "loss_aux_layer_19": 0.1201171875, "loss_aux_layer_2": 0.04278564453125, "loss_aux_layer_20": 0.12841796875, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.196044921875, "loss_aux_layer_3": 0.05224609375, "loss_aux_layer_4": 0.0546875, "loss_aux_layer_5": 0.0562744140625, "loss_aux_layer_6": 0.05926513671875, "loss_aux_layer_7": 0.05767822265625, "loss_aux_layer_8": 0.05755615234375, "loss_aux_layer_9": 0.0565185546875, "step": 3501, "total_loss": 0.6443465054035187 }, { "epoch": 0.6933280538507226, "grad_norm": 1.2476810216903687, "learning_rate": 5e-05, "llm_loss": 0.5919399559497833, "loss": 2.7117, "loss_aux_layer_0": 0.0143280029296875, "loss_aux_layer_1": 0.0338134765625, "loss_aux_layer_10": 0.06134033203125, "loss_aux_layer_11": 0.06536865234375, "loss_aux_layer_12": 0.0701904296875, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.084228515625, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.102294921875, "loss_aux_layer_17": 0.1107177734375, "loss_aux_layer_18": 0.1197509765625, "loss_aux_layer_19": 0.1239013671875, "loss_aux_layer_2": 0.04736328125, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.1630859375, "loss_aux_layer_23": 0.20166015625, "loss_aux_layer_3": 0.0567626953125, "loss_aux_layer_4": 0.059326171875, "loss_aux_layer_5": 0.0609130859375, "loss_aux_layer_6": 0.06378173828125, "loss_aux_layer_7": 0.061767578125, "loss_aux_layer_8": 0.061279296875, "loss_aux_layer_9": 0.06005859375, "step": 3502, "total_loss": 0.6779340356588364 }, { "epoch": 0.693526034448624, "grad_norm": 0.9354846477508545, "learning_rate": 5e-05, "llm_loss": 0.5315920785069466, "loss": 2.4777, "loss_aux_layer_0": 0.01495361328125, "loss_aux_layer_1": 0.03558349609375, "loss_aux_layer_10": 0.06427001953125, "loss_aux_layer_11": 0.0687255859375, "loss_aux_layer_12": 0.073486328125, "loss_aux_layer_13": 0.0791015625, "loss_aux_layer_14": 0.0877685546875, "loss_aux_layer_15": 0.0958251953125, "loss_aux_layer_16": 0.1051025390625, "loss_aux_layer_17": 0.113037109375, "loss_aux_layer_18": 0.1209716796875, "loss_aux_layer_19": 0.124267578125, "loss_aux_layer_2": 0.04937744140625, "loss_aux_layer_20": 0.1317138671875, "loss_aux_layer_21": 0.1396484375, "loss_aux_layer_22": 0.16064453125, "loss_aux_layer_23": 0.197998046875, "loss_aux_layer_3": 0.05987548828125, "loss_aux_layer_4": 0.0623779296875, "loss_aux_layer_5": 0.0640869140625, "loss_aux_layer_6": 0.0670166015625, "loss_aux_layer_7": 0.06500244140625, "loss_aux_layer_8": 0.0643310546875, "loss_aux_layer_9": 0.06304931640625, "step": 3503, "total_loss": 0.6194299608469009 }, { "epoch": 0.6937240150465255, "grad_norm": 1.0623257160186768, "learning_rate": 5e-05, "llm_loss": 0.5218249335885048, "loss": 2.4301, "loss_aux_layer_0": 0.015472412109375, "loss_aux_layer_1": 0.033935546875, "loss_aux_layer_10": 0.061279296875, "loss_aux_layer_11": 0.06549072265625, "loss_aux_layer_12": 0.0699462890625, "loss_aux_layer_13": 0.075439453125, "loss_aux_layer_14": 0.0843505859375, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.102294921875, "loss_aux_layer_17": 0.1099853515625, "loss_aux_layer_18": 0.1185302734375, "loss_aux_layer_19": 0.1224365234375, "loss_aux_layer_2": 0.04766845703125, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.0570068359375, "loss_aux_layer_4": 0.0595703125, "loss_aux_layer_5": 0.0609130859375, "loss_aux_layer_6": 0.0638427734375, "loss_aux_layer_7": 0.061767578125, "loss_aux_layer_8": 0.0611572265625, "loss_aux_layer_9": 0.06024169921875, "step": 3504, "total_loss": 0.6075137704610825 }, { "epoch": 0.6939219956444268, "grad_norm": 0.9210858941078186, "learning_rate": 5e-05, "llm_loss": 0.6049109548330307, "loss": 2.7551, "loss_aux_layer_0": 0.0147247314453125, "loss_aux_layer_1": 0.032440185546875, "loss_aux_layer_10": 0.05889892578125, "loss_aux_layer_11": 0.0631103515625, "loss_aux_layer_12": 0.06787109375, "loss_aux_layer_13": 0.0736083984375, "loss_aux_layer_14": 0.082763671875, "loss_aux_layer_15": 0.0914306640625, "loss_aux_layer_16": 0.101318359375, "loss_aux_layer_17": 0.1097412109375, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.1234130859375, "loss_aux_layer_2": 0.04486083984375, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.0540771484375, "loss_aux_layer_4": 0.05670166015625, "loss_aux_layer_5": 0.05828857421875, "loss_aux_layer_6": 0.06103515625, "loss_aux_layer_7": 0.0592041015625, "loss_aux_layer_8": 0.0589599609375, "loss_aux_layer_9": 0.05780029296875, "step": 3505, "total_loss": 0.6887868344783783 }, { "epoch": 0.6941199762423282, "grad_norm": 0.785225510597229, "learning_rate": 5e-05, "llm_loss": 0.5567302703857422, "loss": 2.5694, "loss_aux_layer_0": 0.0149688720703125, "loss_aux_layer_1": 0.034027099609375, "loss_aux_layer_10": 0.06219482421875, "loss_aux_layer_11": 0.06640625, "loss_aux_layer_12": 0.071044921875, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.0848388671875, "loss_aux_layer_15": 0.0933837890625, "loss_aux_layer_16": 0.1026611328125, "loss_aux_layer_17": 0.1102294921875, "loss_aux_layer_18": 0.1180419921875, "loss_aux_layer_19": 0.1212158203125, "loss_aux_layer_2": 0.04742431640625, "loss_aux_layer_20": 0.12841796875, "loss_aux_layer_21": 0.136962890625, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.057373046875, "loss_aux_layer_4": 0.0599365234375, "loss_aux_layer_5": 0.06146240234375, "loss_aux_layer_6": 0.0645751953125, "loss_aux_layer_7": 0.06243896484375, "loss_aux_layer_8": 0.06207275390625, "loss_aux_layer_9": 0.06072998046875, "step": 3506, "total_loss": 0.6423530727624893 }, { "epoch": 0.6943179568402297, "grad_norm": 0.9070850610733032, "learning_rate": 5e-05, "llm_loss": 0.6328261345624924, "loss": 2.8699, "loss_aux_layer_0": 0.01458740234375, "loss_aux_layer_1": 0.03302001953125, "loss_aux_layer_10": 0.0606689453125, "loss_aux_layer_11": 0.064697265625, "loss_aux_layer_12": 0.0694580078125, "loss_aux_layer_13": 0.0748291015625, "loss_aux_layer_14": 0.08349609375, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.1021728515625, "loss_aux_layer_17": 0.1102294921875, "loss_aux_layer_18": 0.119140625, "loss_aux_layer_19": 0.1220703125, "loss_aux_layer_2": 0.046142578125, "loss_aux_layer_20": 0.1300048828125, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.0556640625, "loss_aux_layer_4": 0.05816650390625, "loss_aux_layer_5": 0.0596923828125, "loss_aux_layer_6": 0.062744140625, "loss_aux_layer_7": 0.06072998046875, "loss_aux_layer_8": 0.060302734375, "loss_aux_layer_9": 0.059326171875, "step": 3507, "total_loss": 0.7174847573041916 }, { "epoch": 0.6945159374381311, "grad_norm": 0.9882844090461731, "learning_rate": 5e-05, "llm_loss": 0.614576905965805, "loss": 2.7886, "loss_aux_layer_0": 0.0147857666015625, "loss_aux_layer_1": 0.032623291015625, "loss_aux_layer_10": 0.0595703125, "loss_aux_layer_11": 0.0634765625, "loss_aux_layer_12": 0.0679931640625, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.0814208984375, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.0992431640625, "loss_aux_layer_17": 0.1075439453125, "loss_aux_layer_18": 0.1156005859375, "loss_aux_layer_19": 0.118896484375, "loss_aux_layer_2": 0.0450439453125, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.0545654296875, "loss_aux_layer_4": 0.0570068359375, "loss_aux_layer_5": 0.05859375, "loss_aux_layer_6": 0.0614013671875, "loss_aux_layer_7": 0.05938720703125, "loss_aux_layer_8": 0.05902099609375, "loss_aux_layer_9": 0.05810546875, "step": 3508, "total_loss": 0.6971489638090134 }, { "epoch": 0.6947139180360324, "grad_norm": 0.7075039744377136, "learning_rate": 5e-05, "llm_loss": 0.5427974984049797, "loss": 2.5317, "loss_aux_layer_0": 0.01495361328125, "loss_aux_layer_1": 0.0369873046875, "loss_aux_layer_10": 0.068115234375, "loss_aux_layer_11": 0.0726318359375, "loss_aux_layer_12": 0.0771484375, "loss_aux_layer_13": 0.08251953125, "loss_aux_layer_14": 0.0908203125, "loss_aux_layer_15": 0.0986328125, "loss_aux_layer_16": 0.107421875, "loss_aux_layer_17": 0.11474609375, "loss_aux_layer_18": 0.1226806640625, "loss_aux_layer_19": 0.1246337890625, "loss_aux_layer_2": 0.05206298828125, "loss_aux_layer_20": 0.131591796875, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.0625, "loss_aux_layer_4": 0.0657958984375, "loss_aux_layer_5": 0.067626953125, "loss_aux_layer_6": 0.0711669921875, "loss_aux_layer_7": 0.06884765625, "loss_aux_layer_8": 0.068115234375, "loss_aux_layer_9": 0.0667724609375, "step": 3509, "total_loss": 0.6329372376203537 }, { "epoch": 0.6949118986339339, "grad_norm": 1.0849295854568481, "learning_rate": 5e-05, "llm_loss": 0.6496310830116272, "loss": 2.9545, "loss_aux_layer_0": 0.0143585205078125, "loss_aux_layer_1": 0.0357666015625, "loss_aux_layer_10": 0.0672607421875, "loss_aux_layer_11": 0.0714111328125, "loss_aux_layer_12": 0.075927734375, "loss_aux_layer_13": 0.081298828125, "loss_aux_layer_14": 0.0897216796875, "loss_aux_layer_15": 0.09716796875, "loss_aux_layer_16": 0.106201171875, "loss_aux_layer_17": 0.114013671875, "loss_aux_layer_18": 0.12158203125, "loss_aux_layer_19": 0.1240234375, "loss_aux_layer_2": 0.050537109375, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.06103515625, "loss_aux_layer_4": 0.0640869140625, "loss_aux_layer_5": 0.06585693359375, "loss_aux_layer_6": 0.0693359375, "loss_aux_layer_7": 0.0673828125, "loss_aux_layer_8": 0.0667724609375, "loss_aux_layer_9": 0.065673828125, "step": 3510, "total_loss": 0.7386173009872437 }, { "epoch": 0.6951098792318353, "grad_norm": 0.6737386584281921, "learning_rate": 5e-05, "llm_loss": 0.5805055350065231, "loss": 2.6343, "loss_aux_layer_0": 0.0145263671875, "loss_aux_layer_1": 0.0308837890625, "loss_aux_layer_10": 0.0556640625, "loss_aux_layer_11": 0.0592041015625, "loss_aux_layer_12": 0.063232421875, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.0755615234375, "loss_aux_layer_15": 0.083740234375, "loss_aux_layer_16": 0.092529296875, "loss_aux_layer_17": 0.10009765625, "loss_aux_layer_18": 0.1083984375, "loss_aux_layer_19": 0.1123046875, "loss_aux_layer_2": 0.04248046875, "loss_aux_layer_20": 0.1207275390625, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.0513916015625, "loss_aux_layer_4": 0.05352783203125, "loss_aux_layer_5": 0.05499267578125, "loss_aux_layer_6": 0.0576171875, "loss_aux_layer_7": 0.055908203125, "loss_aux_layer_8": 0.05548095703125, "loss_aux_layer_9": 0.05438232421875, "step": 3511, "total_loss": 0.6585846543312073 }, { "epoch": 0.6953078598297366, "grad_norm": 0.9459106922149658, "learning_rate": 5e-05, "llm_loss": 0.6134175956249237, "loss": 2.7944, "loss_aux_layer_0": 0.0146331787109375, "loss_aux_layer_1": 0.033203125, "loss_aux_layer_10": 0.06036376953125, "loss_aux_layer_11": 0.0645751953125, "loss_aux_layer_12": 0.0692138671875, "loss_aux_layer_13": 0.0745849609375, "loss_aux_layer_14": 0.0833740234375, "loss_aux_layer_15": 0.0921630859375, "loss_aux_layer_16": 0.1016845703125, "loss_aux_layer_17": 0.109619140625, "loss_aux_layer_18": 0.11865234375, "loss_aux_layer_19": 0.122802734375, "loss_aux_layer_2": 0.0462646484375, "loss_aux_layer_20": 0.131103515625, "loss_aux_layer_21": 0.140869140625, "loss_aux_layer_22": 0.163818359375, "loss_aux_layer_23": 0.20263671875, "loss_aux_layer_3": 0.055419921875, "loss_aux_layer_4": 0.057861328125, "loss_aux_layer_5": 0.05963134765625, "loss_aux_layer_6": 0.0623779296875, "loss_aux_layer_7": 0.0606689453125, "loss_aux_layer_8": 0.0601806640625, "loss_aux_layer_9": 0.05908203125, "step": 3512, "total_loss": 0.698596864938736 }, { "epoch": 0.6955058404276381, "grad_norm": 0.9665195941925049, "learning_rate": 5e-05, "llm_loss": 0.6227994859218597, "loss": 2.8363, "loss_aux_layer_0": 0.0135955810546875, "loss_aux_layer_1": 0.03546142578125, "loss_aux_layer_10": 0.06390380859375, "loss_aux_layer_11": 0.06787109375, "loss_aux_layer_12": 0.0726318359375, "loss_aux_layer_13": 0.0777587890625, "loss_aux_layer_14": 0.0858154296875, "loss_aux_layer_15": 0.0936279296875, "loss_aux_layer_16": 0.1024169921875, "loss_aux_layer_17": 0.1099853515625, "loss_aux_layer_18": 0.1185302734375, "loss_aux_layer_19": 0.12158203125, "loss_aux_layer_2": 0.04974365234375, "loss_aux_layer_20": 0.1295166015625, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.059814453125, "loss_aux_layer_4": 0.0625, "loss_aux_layer_5": 0.06396484375, "loss_aux_layer_6": 0.06689453125, "loss_aux_layer_7": 0.0648193359375, "loss_aux_layer_8": 0.06402587890625, "loss_aux_layer_9": 0.06268310546875, "step": 3513, "total_loss": 0.7090639621019363 }, { "epoch": 0.6957038210255395, "grad_norm": 0.9842485189437866, "learning_rate": 5e-05, "llm_loss": 0.5811840742826462, "loss": 2.6658, "loss_aux_layer_0": 0.0152587890625, "loss_aux_layer_1": 0.03448486328125, "loss_aux_layer_10": 0.06243896484375, "loss_aux_layer_11": 0.0665283203125, "loss_aux_layer_12": 0.071044921875, "loss_aux_layer_13": 0.076416015625, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.0928955078125, "loss_aux_layer_16": 0.1016845703125, "loss_aux_layer_17": 0.1092529296875, "loss_aux_layer_18": 0.1173095703125, "loss_aux_layer_19": 0.1204833984375, "loss_aux_layer_2": 0.047607421875, "loss_aux_layer_20": 0.1282958984375, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.05755615234375, "loss_aux_layer_4": 0.05999755859375, "loss_aux_layer_5": 0.06146240234375, "loss_aux_layer_6": 0.06451416015625, "loss_aux_layer_7": 0.06243896484375, "loss_aux_layer_8": 0.06219482421875, "loss_aux_layer_9": 0.0611572265625, "step": 3514, "total_loss": 0.6664401888847351 }, { "epoch": 0.695901801623441, "grad_norm": 0.8235673904418945, "learning_rate": 5e-05, "llm_loss": 0.5857120454311371, "loss": 2.6846, "loss_aux_layer_0": 0.0146636962890625, "loss_aux_layer_1": 0.03369140625, "loss_aux_layer_10": 0.0618896484375, "loss_aux_layer_11": 0.06585693359375, "loss_aux_layer_12": 0.0704345703125, "loss_aux_layer_13": 0.0760498046875, "loss_aux_layer_14": 0.0853271484375, "loss_aux_layer_15": 0.09375, "loss_aux_layer_16": 0.1031494140625, "loss_aux_layer_17": 0.111328125, "loss_aux_layer_18": 0.119873046875, "loss_aux_layer_19": 0.1226806640625, "loss_aux_layer_2": 0.046875, "loss_aux_layer_20": 0.130126953125, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.1943359375, "loss_aux_layer_3": 0.05657958984375, "loss_aux_layer_4": 0.05914306640625, "loss_aux_layer_5": 0.06085205078125, "loss_aux_layer_6": 0.06402587890625, "loss_aux_layer_7": 0.06201171875, "loss_aux_layer_8": 0.06170654296875, "loss_aux_layer_9": 0.060546875, "step": 3515, "total_loss": 0.6711578369140625 }, { "epoch": 0.6960997822213423, "grad_norm": 1.3268802165985107, "learning_rate": 5e-05, "llm_loss": 0.5386504679918289, "loss": 2.5081, "loss_aux_layer_0": 0.014678955078125, "loss_aux_layer_1": 0.03472900390625, "loss_aux_layer_10": 0.06439208984375, "loss_aux_layer_11": 0.068603515625, "loss_aux_layer_12": 0.0733642578125, "loss_aux_layer_13": 0.0789794921875, "loss_aux_layer_14": 0.087890625, "loss_aux_layer_15": 0.09716796875, "loss_aux_layer_16": 0.10693359375, "loss_aux_layer_17": 0.1148681640625, "loss_aux_layer_18": 0.1231689453125, "loss_aux_layer_19": 0.125244140625, "loss_aux_layer_2": 0.04986572265625, "loss_aux_layer_20": 0.132568359375, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.16357421875, "loss_aux_layer_23": 0.20166015625, "loss_aux_layer_3": 0.0595703125, "loss_aux_layer_4": 0.061767578125, "loss_aux_layer_5": 0.06329345703125, "loss_aux_layer_6": 0.06646728515625, "loss_aux_layer_7": 0.0643310546875, "loss_aux_layer_8": 0.0640869140625, "loss_aux_layer_9": 0.06292724609375, "step": 3516, "total_loss": 0.6270149946212769 }, { "epoch": 0.6962977628192437, "grad_norm": 0.9910489320755005, "learning_rate": 5e-05, "llm_loss": 0.49775411933660507, "loss": 2.3239, "loss_aux_layer_0": 0.01605224609375, "loss_aux_layer_1": 0.03240966796875, "loss_aux_layer_10": 0.05914306640625, "loss_aux_layer_11": 0.06304931640625, "loss_aux_layer_12": 0.0677490234375, "loss_aux_layer_13": 0.0731201171875, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.0909423828125, "loss_aux_layer_16": 0.100341796875, "loss_aux_layer_17": 0.1083984375, "loss_aux_layer_18": 0.1165771484375, "loss_aux_layer_19": 0.1204833984375, "loss_aux_layer_2": 0.044677734375, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.136962890625, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.194580078125, "loss_aux_layer_3": 0.05389404296875, "loss_aux_layer_4": 0.05633544921875, "loss_aux_layer_5": 0.0579833984375, "loss_aux_layer_6": 0.06097412109375, "loss_aux_layer_7": 0.05902099609375, "loss_aux_layer_8": 0.05859375, "loss_aux_layer_9": 0.05767822265625, "step": 3517, "total_loss": 0.5809688866138458 }, { "epoch": 0.6964957434171452, "grad_norm": 1.3737536668777466, "learning_rate": 5e-05, "llm_loss": 0.7059423625469208, "loss": 3.1655, "loss_aux_layer_0": 0.0140838623046875, "loss_aux_layer_1": 0.0345458984375, "loss_aux_layer_10": 0.06219482421875, "loss_aux_layer_11": 0.0662841796875, "loss_aux_layer_12": 0.0709228515625, "loss_aux_layer_13": 0.076416015625, "loss_aux_layer_14": 0.0855712890625, "loss_aux_layer_15": 0.09423828125, "loss_aux_layer_16": 0.10302734375, "loss_aux_layer_17": 0.1112060546875, "loss_aux_layer_18": 0.119140625, "loss_aux_layer_19": 0.1214599609375, "loss_aux_layer_2": 0.04766845703125, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.05743408203125, "loss_aux_layer_4": 0.0601806640625, "loss_aux_layer_5": 0.0616455078125, "loss_aux_layer_6": 0.064697265625, "loss_aux_layer_7": 0.06256103515625, "loss_aux_layer_8": 0.0618896484375, "loss_aux_layer_9": 0.06072998046875, "step": 3518, "total_loss": 0.7913803607225418 }, { "epoch": 0.6966937240150465, "grad_norm": 1.0367448329925537, "learning_rate": 5e-05, "llm_loss": 0.6352167576551437, "loss": 2.8808, "loss_aux_layer_0": 0.014739990234375, "loss_aux_layer_1": 0.033935546875, "loss_aux_layer_10": 0.0614013671875, "loss_aux_layer_11": 0.0655517578125, "loss_aux_layer_12": 0.0699462890625, "loss_aux_layer_13": 0.075439453125, "loss_aux_layer_14": 0.084228515625, "loss_aux_layer_15": 0.0924072265625, "loss_aux_layer_16": 0.1019287109375, "loss_aux_layer_17": 0.109375, "loss_aux_layer_18": 0.1177978515625, "loss_aux_layer_19": 0.1209716796875, "loss_aux_layer_2": 0.0469970703125, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.19482421875, "loss_aux_layer_3": 0.05718994140625, "loss_aux_layer_4": 0.05950927734375, "loss_aux_layer_5": 0.06109619140625, "loss_aux_layer_6": 0.06427001953125, "loss_aux_layer_7": 0.06207275390625, "loss_aux_layer_8": 0.06158447265625, "loss_aux_layer_9": 0.06005859375, "step": 3519, "total_loss": 0.7201927155256271 }, { "epoch": 0.6968917046129479, "grad_norm": 1.22073495388031, "learning_rate": 5e-05, "llm_loss": 0.5957025587558746, "loss": 2.7176, "loss_aux_layer_0": 0.016510009765625, "loss_aux_layer_1": 0.03240966796875, "loss_aux_layer_10": 0.0601806640625, "loss_aux_layer_11": 0.064208984375, "loss_aux_layer_12": 0.069091796875, "loss_aux_layer_13": 0.0748291015625, "loss_aux_layer_14": 0.083740234375, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.101806640625, "loss_aux_layer_17": 0.1097412109375, "loss_aux_layer_18": 0.1175537109375, "loss_aux_layer_19": 0.1207275390625, "loss_aux_layer_2": 0.04541015625, "loss_aux_layer_20": 0.1275634765625, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.054931640625, "loss_aux_layer_4": 0.0574951171875, "loss_aux_layer_5": 0.058837890625, "loss_aux_layer_6": 0.0618896484375, "loss_aux_layer_7": 0.0599365234375, "loss_aux_layer_8": 0.0596923828125, "loss_aux_layer_9": 0.05865478515625, "step": 3520, "total_loss": 0.6794084310531616 }, { "epoch": 0.6970896852108494, "grad_norm": 0.8879185914993286, "learning_rate": 5e-05, "llm_loss": 0.5911338180303574, "loss": 2.688, "loss_aux_layer_0": 0.014312744140625, "loss_aux_layer_1": 0.0318603515625, "loss_aux_layer_10": 0.057373046875, "loss_aux_layer_11": 0.06109619140625, "loss_aux_layer_12": 0.06524658203125, "loss_aux_layer_13": 0.0704345703125, "loss_aux_layer_14": 0.07861328125, "loss_aux_layer_15": 0.087158203125, "loss_aux_layer_16": 0.09619140625, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.11328125, "loss_aux_layer_19": 0.1170654296875, "loss_aux_layer_2": 0.04376220703125, "loss_aux_layer_20": 0.1256103515625, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.0533447265625, "loss_aux_layer_4": 0.05584716796875, "loss_aux_layer_5": 0.0572509765625, "loss_aux_layer_6": 0.0599365234375, "loss_aux_layer_7": 0.05810546875, "loss_aux_layer_8": 0.0576171875, "loss_aux_layer_9": 0.0562744140625, "step": 3521, "total_loss": 0.6719885468482971 }, { "epoch": 0.6972876658087508, "grad_norm": 1.16294527053833, "learning_rate": 5e-05, "llm_loss": 0.6318521201610565, "loss": 2.8547, "loss_aux_layer_0": 0.0149078369140625, "loss_aux_layer_1": 0.031707763671875, "loss_aux_layer_10": 0.05792236328125, "loss_aux_layer_11": 0.06182861328125, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.0718994140625, "loss_aux_layer_14": 0.080078125, "loss_aux_layer_15": 0.08837890625, "loss_aux_layer_16": 0.097900390625, "loss_aux_layer_17": 0.10595703125, "loss_aux_layer_18": 0.1146240234375, "loss_aux_layer_19": 0.11865234375, "loss_aux_layer_2": 0.04437255859375, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.0538330078125, "loss_aux_layer_4": 0.05609130859375, "loss_aux_layer_5": 0.05767822265625, "loss_aux_layer_6": 0.0604248046875, "loss_aux_layer_7": 0.05877685546875, "loss_aux_layer_8": 0.05804443359375, "loss_aux_layer_9": 0.0567626953125, "step": 3522, "total_loss": 0.7136759608983994 }, { "epoch": 0.6974856464066521, "grad_norm": 0.9484425187110901, "learning_rate": 5e-05, "llm_loss": 0.529592826962471, "loss": 2.4711, "loss_aux_layer_0": 0.0145263671875, "loss_aux_layer_1": 0.03607177734375, "loss_aux_layer_10": 0.065185546875, "loss_aux_layer_11": 0.0697021484375, "loss_aux_layer_12": 0.073974609375, "loss_aux_layer_13": 0.079833984375, "loss_aux_layer_14": 0.088134765625, "loss_aux_layer_15": 0.0965576171875, "loss_aux_layer_16": 0.1055908203125, "loss_aux_layer_17": 0.11328125, "loss_aux_layer_18": 0.12060546875, "loss_aux_layer_19": 0.1224365234375, "loss_aux_layer_2": 0.04998779296875, "loss_aux_layer_20": 0.13037109375, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.162109375, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.06048583984375, "loss_aux_layer_4": 0.06317138671875, "loss_aux_layer_5": 0.0648193359375, "loss_aux_layer_6": 0.0675048828125, "loss_aux_layer_7": 0.065673828125, "loss_aux_layer_8": 0.06494140625, "loss_aux_layer_9": 0.0633544921875, "step": 3523, "total_loss": 0.6177691370248795 }, { "epoch": 0.6976836270045536, "grad_norm": 1.1508406400680542, "learning_rate": 5e-05, "llm_loss": 0.7081131786108017, "loss": 3.1789, "loss_aux_layer_0": 0.014129638671875, "loss_aux_layer_1": 0.03460693359375, "loss_aux_layer_10": 0.06341552734375, "loss_aux_layer_11": 0.0672607421875, "loss_aux_layer_12": 0.072021484375, "loss_aux_layer_13": 0.0775146484375, "loss_aux_layer_14": 0.0865478515625, "loss_aux_layer_15": 0.094970703125, "loss_aux_layer_16": 0.1051025390625, "loss_aux_layer_17": 0.1134033203125, "loss_aux_layer_18": 0.1212158203125, "loss_aux_layer_19": 0.12353515625, "loss_aux_layer_2": 0.04833984375, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.05859375, "loss_aux_layer_4": 0.061279296875, "loss_aux_layer_5": 0.06280517578125, "loss_aux_layer_6": 0.06591796875, "loss_aux_layer_7": 0.06390380859375, "loss_aux_layer_8": 0.0633544921875, "loss_aux_layer_9": 0.06182861328125, "step": 3524, "total_loss": 0.7947214394807816 }, { "epoch": 0.697881607602455, "grad_norm": 1.5486433506011963, "learning_rate": 5e-05, "llm_loss": 0.6054005473852158, "loss": 2.7678, "loss_aux_layer_0": 0.01422119140625, "loss_aux_layer_1": 0.033203125, "loss_aux_layer_10": 0.0628662109375, "loss_aux_layer_11": 0.0675048828125, "loss_aux_layer_12": 0.0718994140625, "loss_aux_layer_13": 0.077880859375, "loss_aux_layer_14": 0.086669921875, "loss_aux_layer_15": 0.0953369140625, "loss_aux_layer_16": 0.104736328125, "loss_aux_layer_17": 0.1126708984375, "loss_aux_layer_18": 0.120361328125, "loss_aux_layer_19": 0.12353515625, "loss_aux_layer_2": 0.048095703125, "loss_aux_layer_20": 0.1307373046875, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.057861328125, "loss_aux_layer_4": 0.06036376953125, "loss_aux_layer_5": 0.061767578125, "loss_aux_layer_6": 0.0653076171875, "loss_aux_layer_7": 0.06317138671875, "loss_aux_layer_8": 0.06268310546875, "loss_aux_layer_9": 0.06121826171875, "step": 3525, "total_loss": 0.6919469684362411 }, { "epoch": 0.6980795882003563, "grad_norm": 1.1384057998657227, "learning_rate": 5e-05, "llm_loss": 0.5586079955101013, "loss": 2.5674, "loss_aux_layer_0": 0.0140533447265625, "loss_aux_layer_1": 0.033355712890625, "loss_aux_layer_10": 0.0599365234375, "loss_aux_layer_11": 0.06390380859375, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.0738525390625, "loss_aux_layer_14": 0.0823974609375, "loss_aux_layer_15": 0.090576171875, "loss_aux_layer_16": 0.0997314453125, "loss_aux_layer_17": 0.1075439453125, "loss_aux_layer_18": 0.1153564453125, "loss_aux_layer_19": 0.118896484375, "loss_aux_layer_2": 0.0460205078125, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.193603515625, "loss_aux_layer_3": 0.05523681640625, "loss_aux_layer_4": 0.0577392578125, "loss_aux_layer_5": 0.05914306640625, "loss_aux_layer_6": 0.0621337890625, "loss_aux_layer_7": 0.06005859375, "loss_aux_layer_8": 0.05975341796875, "loss_aux_layer_9": 0.05859375, "step": 3526, "total_loss": 0.6418432146310806 }, { "epoch": 0.6982775687982578, "grad_norm": 1.1696572303771973, "learning_rate": 5e-05, "llm_loss": 0.5587334930896759, "loss": 2.5633, "loss_aux_layer_0": 0.0144195556640625, "loss_aux_layer_1": 0.03271484375, "loss_aux_layer_10": 0.05950927734375, "loss_aux_layer_11": 0.0631103515625, "loss_aux_layer_12": 0.0673828125, "loss_aux_layer_13": 0.072509765625, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.088623046875, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.1138916015625, "loss_aux_layer_19": 0.1173095703125, "loss_aux_layer_2": 0.04534912109375, "loss_aux_layer_20": 0.12548828125, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.190673828125, "loss_aux_layer_3": 0.05419921875, "loss_aux_layer_4": 0.0567626953125, "loss_aux_layer_5": 0.05828857421875, "loss_aux_layer_6": 0.06109619140625, "loss_aux_layer_7": 0.059326171875, "loss_aux_layer_8": 0.05908203125, "loss_aux_layer_9": 0.0579833984375, "step": 3527, "total_loss": 0.6408141255378723 }, { "epoch": 0.6984755493961592, "grad_norm": 0.9524524807929993, "learning_rate": 5e-05, "llm_loss": 0.5346105843782425, "loss": 2.4752, "loss_aux_layer_0": 0.014190673828125, "loss_aux_layer_1": 0.03369140625, "loss_aux_layer_10": 0.06146240234375, "loss_aux_layer_11": 0.06549072265625, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.083984375, "loss_aux_layer_15": 0.0921630859375, "loss_aux_layer_16": 0.1007080078125, "loss_aux_layer_17": 0.1077880859375, "loss_aux_layer_18": 0.1160888671875, "loss_aux_layer_19": 0.119140625, "loss_aux_layer_2": 0.0467529296875, "loss_aux_layer_20": 0.126708984375, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.193115234375, "loss_aux_layer_3": 0.05615234375, "loss_aux_layer_4": 0.05877685546875, "loss_aux_layer_5": 0.06048583984375, "loss_aux_layer_6": 0.06378173828125, "loss_aux_layer_7": 0.06201171875, "loss_aux_layer_8": 0.06158447265625, "loss_aux_layer_9": 0.06036376953125, "step": 3528, "total_loss": 0.6187999993562698 }, { "epoch": 0.6986735299940606, "grad_norm": 0.9266321659088135, "learning_rate": 5e-05, "llm_loss": 0.5917627066373825, "loss": 2.7024, "loss_aux_layer_0": 0.014068603515625, "loss_aux_layer_1": 0.03387451171875, "loss_aux_layer_10": 0.06085205078125, "loss_aux_layer_11": 0.064697265625, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.074462890625, "loss_aux_layer_14": 0.082763671875, "loss_aux_layer_15": 0.091064453125, "loss_aux_layer_16": 0.1002197265625, "loss_aux_layer_17": 0.1083984375, "loss_aux_layer_18": 0.1163330078125, "loss_aux_layer_19": 0.119140625, "loss_aux_layer_2": 0.046630859375, "loss_aux_layer_20": 0.126708984375, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05645751953125, "loss_aux_layer_4": 0.05902099609375, "loss_aux_layer_5": 0.060791015625, "loss_aux_layer_6": 0.063720703125, "loss_aux_layer_7": 0.06182861328125, "loss_aux_layer_8": 0.061279296875, "loss_aux_layer_9": 0.05999755859375, "step": 3529, "total_loss": 0.6755932569503784 }, { "epoch": 0.698871510591962, "grad_norm": 0.8801249265670776, "learning_rate": 5e-05, "llm_loss": 0.5275873839855194, "loss": 2.4456, "loss_aux_layer_0": 0.0148773193359375, "loss_aux_layer_1": 0.0333251953125, "loss_aux_layer_10": 0.0595703125, "loss_aux_layer_11": 0.0633544921875, "loss_aux_layer_12": 0.067626953125, "loss_aux_layer_13": 0.0728759765625, "loss_aux_layer_14": 0.081298828125, "loss_aux_layer_15": 0.0894775390625, "loss_aux_layer_16": 0.0989990234375, "loss_aux_layer_17": 0.10693359375, "loss_aux_layer_18": 0.1151123046875, "loss_aux_layer_19": 0.11962890625, "loss_aux_layer_2": 0.04620361328125, "loss_aux_layer_20": 0.128173828125, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.16064453125, "loss_aux_layer_23": 0.199462890625, "loss_aux_layer_3": 0.05584716796875, "loss_aux_layer_4": 0.05828857421875, "loss_aux_layer_5": 0.06005859375, "loss_aux_layer_6": 0.0631103515625, "loss_aux_layer_7": 0.06072998046875, "loss_aux_layer_8": 0.0599365234375, "loss_aux_layer_9": 0.05841064453125, "step": 3530, "total_loss": 0.611393928527832 }, { "epoch": 0.6990694911898634, "grad_norm": 0.9360907673835754, "learning_rate": 5e-05, "llm_loss": 0.44425761699676514, "loss": 2.1191, "loss_aux_layer_0": 0.01434326171875, "loss_aux_layer_1": 0.03485107421875, "loss_aux_layer_10": 0.06304931640625, "loss_aux_layer_11": 0.06689453125, "loss_aux_layer_12": 0.071533203125, "loss_aux_layer_13": 0.0770263671875, "loss_aux_layer_14": 0.08544921875, "loss_aux_layer_15": 0.09326171875, "loss_aux_layer_16": 0.102294921875, "loss_aux_layer_17": 0.109130859375, "loss_aux_layer_18": 0.1170654296875, "loss_aux_layer_19": 0.1201171875, "loss_aux_layer_2": 0.04852294921875, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.05853271484375, "loss_aux_layer_4": 0.06103515625, "loss_aux_layer_5": 0.062744140625, "loss_aux_layer_6": 0.06585693359375, "loss_aux_layer_7": 0.06378173828125, "loss_aux_layer_8": 0.06317138671875, "loss_aux_layer_9": 0.06195068359375, "step": 3531, "total_loss": 0.529780812561512 }, { "epoch": 0.6992674717877648, "grad_norm": 0.7990756034851074, "learning_rate": 5e-05, "llm_loss": 0.5027423426508904, "loss": 2.3683, "loss_aux_layer_0": 0.014678955078125, "loss_aux_layer_1": 0.0347900390625, "loss_aux_layer_10": 0.06427001953125, "loss_aux_layer_11": 0.0687255859375, "loss_aux_layer_12": 0.07373046875, "loss_aux_layer_13": 0.07958984375, "loss_aux_layer_14": 0.0888671875, "loss_aux_layer_15": 0.0982666015625, "loss_aux_layer_16": 0.1082763671875, "loss_aux_layer_17": 0.1162109375, "loss_aux_layer_18": 0.1246337890625, "loss_aux_layer_19": 0.12841796875, "loss_aux_layer_2": 0.04864501953125, "loss_aux_layer_20": 0.136474609375, "loss_aux_layer_21": 0.144775390625, "loss_aux_layer_22": 0.166259765625, "loss_aux_layer_23": 0.204345703125, "loss_aux_layer_3": 0.05877685546875, "loss_aux_layer_4": 0.0616455078125, "loss_aux_layer_5": 0.06341552734375, "loss_aux_layer_6": 0.066650390625, "loss_aux_layer_7": 0.064697265625, "loss_aux_layer_8": 0.064453125, "loss_aux_layer_9": 0.06317138671875, "step": 3532, "total_loss": 0.5920625329017639 }, { "epoch": 0.6994654523856662, "grad_norm": 0.9944128394126892, "learning_rate": 5e-05, "llm_loss": 0.5651375651359558, "loss": 2.5935, "loss_aux_layer_0": 0.0147705078125, "loss_aux_layer_1": 0.033782958984375, "loss_aux_layer_10": 0.05987548828125, "loss_aux_layer_11": 0.06396484375, "loss_aux_layer_12": 0.0684814453125, "loss_aux_layer_13": 0.07373046875, "loss_aux_layer_14": 0.08203125, "loss_aux_layer_15": 0.09033203125, "loss_aux_layer_16": 0.099365234375, "loss_aux_layer_17": 0.1070556640625, "loss_aux_layer_18": 0.1148681640625, "loss_aux_layer_19": 0.118408203125, "loss_aux_layer_2": 0.04644775390625, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.05615234375, "loss_aux_layer_4": 0.05841064453125, "loss_aux_layer_5": 0.059814453125, "loss_aux_layer_6": 0.06243896484375, "loss_aux_layer_7": 0.06048583984375, "loss_aux_layer_8": 0.06011962890625, "loss_aux_layer_9": 0.0584716796875, "step": 3533, "total_loss": 0.6483649462461472 }, { "epoch": 0.6996634329835676, "grad_norm": 0.9025300741195679, "learning_rate": 5e-05, "llm_loss": 0.47390878200531006, "loss": 2.2221, "loss_aux_layer_0": 0.014251708984375, "loss_aux_layer_1": 0.032379150390625, "loss_aux_layer_10": 0.0576171875, "loss_aux_layer_11": 0.06134033203125, "loss_aux_layer_12": 0.06549072265625, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.0794677734375, "loss_aux_layer_15": 0.088134765625, "loss_aux_layer_16": 0.0975341796875, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.11376953125, "loss_aux_layer_19": 0.1182861328125, "loss_aux_layer_2": 0.04443359375, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.05352783203125, "loss_aux_layer_4": 0.0556640625, "loss_aux_layer_5": 0.05731201171875, "loss_aux_layer_6": 0.06024169921875, "loss_aux_layer_7": 0.05816650390625, "loss_aux_layer_8": 0.057861328125, "loss_aux_layer_9": 0.056640625, "step": 3534, "total_loss": 0.5555270910263062 }, { "epoch": 0.699861413581469, "grad_norm": 0.9724733233451843, "learning_rate": 5e-05, "llm_loss": 0.5502485781908035, "loss": 2.5275, "loss_aux_layer_0": 0.015106201171875, "loss_aux_layer_1": 0.032135009765625, "loss_aux_layer_10": 0.05792236328125, "loss_aux_layer_11": 0.0618896484375, "loss_aux_layer_12": 0.06610107421875, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.080078125, "loss_aux_layer_15": 0.08837890625, "loss_aux_layer_16": 0.09765625, "loss_aux_layer_17": 0.1053466796875, "loss_aux_layer_18": 0.1143798828125, "loss_aux_layer_19": 0.1181640625, "loss_aux_layer_2": 0.0440673828125, "loss_aux_layer_20": 0.12646484375, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.191162109375, "loss_aux_layer_3": 0.0533447265625, "loss_aux_layer_4": 0.0556640625, "loss_aux_layer_5": 0.0572509765625, "loss_aux_layer_6": 0.05999755859375, "loss_aux_layer_7": 0.05816650390625, "loss_aux_layer_8": 0.05780029296875, "loss_aux_layer_9": 0.05670166015625, "step": 3535, "total_loss": 0.6318696886301041 }, { "epoch": 0.7000593941793705, "grad_norm": 0.8299984931945801, "learning_rate": 5e-05, "llm_loss": 0.569746695458889, "loss": 2.6101, "loss_aux_layer_0": 0.0153350830078125, "loss_aux_layer_1": 0.03277587890625, "loss_aux_layer_10": 0.05902099609375, "loss_aux_layer_11": 0.062744140625, "loss_aux_layer_12": 0.06689453125, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.088623046875, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.1053466796875, "loss_aux_layer_18": 0.114501953125, "loss_aux_layer_19": 0.119140625, "loss_aux_layer_2": 0.04541015625, "loss_aux_layer_20": 0.1279296875, "loss_aux_layer_21": 0.136962890625, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.196044921875, "loss_aux_layer_3": 0.05487060546875, "loss_aux_layer_4": 0.05706787109375, "loss_aux_layer_5": 0.05841064453125, "loss_aux_layer_6": 0.06109619140625, "loss_aux_layer_7": 0.05926513671875, "loss_aux_layer_8": 0.05889892578125, "loss_aux_layer_9": 0.05780029296875, "step": 3536, "total_loss": 0.6525191068649292 }, { "epoch": 0.7002573747772718, "grad_norm": 0.868517279624939, "learning_rate": 5e-05, "llm_loss": 0.6464325338602066, "loss": 2.9115, "loss_aux_layer_0": 0.014373779296875, "loss_aux_layer_1": 0.031982421875, "loss_aux_layer_10": 0.057861328125, "loss_aux_layer_11": 0.06158447265625, "loss_aux_layer_12": 0.066162109375, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.0888671875, "loss_aux_layer_16": 0.098388671875, "loss_aux_layer_17": 0.106689453125, "loss_aux_layer_18": 0.1148681640625, "loss_aux_layer_19": 0.1182861328125, "loss_aux_layer_2": 0.04345703125, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.052734375, "loss_aux_layer_4": 0.05517578125, "loss_aux_layer_5": 0.056640625, "loss_aux_layer_6": 0.05950927734375, "loss_aux_layer_7": 0.05780029296875, "loss_aux_layer_8": 0.0576171875, "loss_aux_layer_9": 0.056396484375, "step": 3537, "total_loss": 0.7278643995523453 }, { "epoch": 0.7004553553751732, "grad_norm": 0.907938539981842, "learning_rate": 5e-05, "llm_loss": 0.6558481603860855, "loss": 2.959, "loss_aux_layer_0": 0.014892578125, "loss_aux_layer_1": 0.03436279296875, "loss_aux_layer_10": 0.06134033203125, "loss_aux_layer_11": 0.06524658203125, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.0751953125, "loss_aux_layer_14": 0.0833740234375, "loss_aux_layer_15": 0.091552734375, "loss_aux_layer_16": 0.1004638671875, "loss_aux_layer_17": 0.10791015625, "loss_aux_layer_18": 0.115966796875, "loss_aux_layer_19": 0.11865234375, "loss_aux_layer_2": 0.0467529296875, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.0565185546875, "loss_aux_layer_4": 0.05902099609375, "loss_aux_layer_5": 0.06072998046875, "loss_aux_layer_6": 0.06353759765625, "loss_aux_layer_7": 0.0618896484375, "loss_aux_layer_8": 0.06134033203125, "loss_aux_layer_9": 0.06005859375, "step": 3538, "total_loss": 0.7397507429122925 }, { "epoch": 0.7006533359730747, "grad_norm": 0.8730873465538025, "learning_rate": 5e-05, "llm_loss": 0.6018648445606232, "loss": 2.747, "loss_aux_layer_0": 0.0145416259765625, "loss_aux_layer_1": 0.033447265625, "loss_aux_layer_10": 0.060791015625, "loss_aux_layer_11": 0.06463623046875, "loss_aux_layer_12": 0.069091796875, "loss_aux_layer_13": 0.0745849609375, "loss_aux_layer_14": 0.08349609375, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.1021728515625, "loss_aux_layer_17": 0.1099853515625, "loss_aux_layer_18": 0.118896484375, "loss_aux_layer_19": 0.1229248046875, "loss_aux_layer_2": 0.04571533203125, "loss_aux_layer_20": 0.131103515625, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.05560302734375, "loss_aux_layer_4": 0.05804443359375, "loss_aux_layer_5": 0.05975341796875, "loss_aux_layer_6": 0.0626220703125, "loss_aux_layer_7": 0.06060791015625, "loss_aux_layer_8": 0.06024169921875, "loss_aux_layer_9": 0.0594482421875, "step": 3539, "total_loss": 0.6867518723011017 }, { "epoch": 0.700851316570976, "grad_norm": 0.869711697101593, "learning_rate": 5e-05, "llm_loss": 0.5825770199298859, "loss": 2.6709, "loss_aux_layer_0": 0.014678955078125, "loss_aux_layer_1": 0.03509521484375, "loss_aux_layer_10": 0.0625, "loss_aux_layer_11": 0.066650390625, "loss_aux_layer_12": 0.0712890625, "loss_aux_layer_13": 0.0767822265625, "loss_aux_layer_14": 0.0848388671875, "loss_aux_layer_15": 0.093017578125, "loss_aux_layer_16": 0.102294921875, "loss_aux_layer_17": 0.110107421875, "loss_aux_layer_18": 0.1182861328125, "loss_aux_layer_19": 0.1209716796875, "loss_aux_layer_2": 0.0472412109375, "loss_aux_layer_20": 0.128173828125, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.05731201171875, "loss_aux_layer_4": 0.059814453125, "loss_aux_layer_5": 0.06109619140625, "loss_aux_layer_6": 0.06439208984375, "loss_aux_layer_7": 0.0626220703125, "loss_aux_layer_8": 0.06219482421875, "loss_aux_layer_9": 0.0609130859375, "step": 3540, "total_loss": 0.667726457118988 }, { "epoch": 0.7010492971688774, "grad_norm": 0.810430109500885, "learning_rate": 5e-05, "llm_loss": 0.48756466805934906, "loss": 2.285, "loss_aux_layer_0": 0.01409912109375, "loss_aux_layer_1": 0.03387451171875, "loss_aux_layer_10": 0.06085205078125, "loss_aux_layer_11": 0.06488037109375, "loss_aux_layer_12": 0.06982421875, "loss_aux_layer_13": 0.0751953125, "loss_aux_layer_14": 0.083251953125, "loss_aux_layer_15": 0.0911865234375, "loss_aux_layer_16": 0.10009765625, "loss_aux_layer_17": 0.107666015625, "loss_aux_layer_18": 0.1158447265625, "loss_aux_layer_19": 0.118408203125, "loss_aux_layer_2": 0.0460205078125, "loss_aux_layer_20": 0.1256103515625, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.193115234375, "loss_aux_layer_3": 0.055908203125, "loss_aux_layer_4": 0.05841064453125, "loss_aux_layer_5": 0.05999755859375, "loss_aux_layer_6": 0.0628662109375, "loss_aux_layer_7": 0.0609130859375, "loss_aux_layer_8": 0.060791015625, "loss_aux_layer_9": 0.05963134765625, "step": 3541, "total_loss": 0.5712624788284302 }, { "epoch": 0.7012472777667789, "grad_norm": 0.8229300379753113, "learning_rate": 5e-05, "llm_loss": 0.5749286636710167, "loss": 2.6436, "loss_aux_layer_0": 0.0139007568359375, "loss_aux_layer_1": 0.03521728515625, "loss_aux_layer_10": 0.0628662109375, "loss_aux_layer_11": 0.0672607421875, "loss_aux_layer_12": 0.072021484375, "loss_aux_layer_13": 0.0775146484375, "loss_aux_layer_14": 0.0859375, "loss_aux_layer_15": 0.093994140625, "loss_aux_layer_16": 0.1033935546875, "loss_aux_layer_17": 0.11083984375, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.121826171875, "loss_aux_layer_2": 0.04840087890625, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.0584716796875, "loss_aux_layer_4": 0.0609130859375, "loss_aux_layer_5": 0.06268310546875, "loss_aux_layer_6": 0.0654296875, "loss_aux_layer_7": 0.0634765625, "loss_aux_layer_8": 0.06292724609375, "loss_aux_layer_9": 0.0615234375, "step": 3542, "total_loss": 0.6609080284833908 }, { "epoch": 0.7014452583646803, "grad_norm": 0.733700156211853, "learning_rate": 5e-05, "llm_loss": 0.5097643658518791, "loss": 2.3716, "loss_aux_layer_0": 0.0137481689453125, "loss_aux_layer_1": 0.03271484375, "loss_aux_layer_10": 0.0596923828125, "loss_aux_layer_11": 0.063720703125, "loss_aux_layer_12": 0.0682373046875, "loss_aux_layer_13": 0.0736083984375, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.0902099609375, "loss_aux_layer_16": 0.0994873046875, "loss_aux_layer_17": 0.107177734375, "loss_aux_layer_18": 0.115478515625, "loss_aux_layer_19": 0.118896484375, "loss_aux_layer_2": 0.04498291015625, "loss_aux_layer_20": 0.126953125, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.1943359375, "loss_aux_layer_3": 0.0548095703125, "loss_aux_layer_4": 0.05731201171875, "loss_aux_layer_5": 0.0589599609375, "loss_aux_layer_6": 0.06182861328125, "loss_aux_layer_7": 0.06011962890625, "loss_aux_layer_8": 0.059814453125, "loss_aux_layer_9": 0.05877685546875, "step": 3543, "total_loss": 0.5928999036550522 }, { "epoch": 0.7016432389625816, "grad_norm": 0.8073698282241821, "learning_rate": 5e-05, "llm_loss": 0.5098383948206902, "loss": 2.3733, "loss_aux_layer_0": 0.013885498046875, "loss_aux_layer_1": 0.032928466796875, "loss_aux_layer_10": 0.06146240234375, "loss_aux_layer_11": 0.06512451171875, "loss_aux_layer_12": 0.0697021484375, "loss_aux_layer_13": 0.0751953125, "loss_aux_layer_14": 0.0831298828125, "loss_aux_layer_15": 0.091064453125, "loss_aux_layer_16": 0.1002197265625, "loss_aux_layer_17": 0.108154296875, "loss_aux_layer_18": 0.1165771484375, "loss_aux_layer_19": 0.1192626953125, "loss_aux_layer_2": 0.04571533203125, "loss_aux_layer_20": 0.126708984375, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.05584716796875, "loss_aux_layer_4": 0.0587158203125, "loss_aux_layer_5": 0.06048583984375, "loss_aux_layer_6": 0.0634765625, "loss_aux_layer_7": 0.0616455078125, "loss_aux_layer_8": 0.0611572265625, "loss_aux_layer_9": 0.06011962890625, "step": 3544, "total_loss": 0.5933216065168381 }, { "epoch": 0.7018412195604831, "grad_norm": 1.0394890308380127, "learning_rate": 5e-05, "llm_loss": 0.6097565144300461, "loss": 2.7636, "loss_aux_layer_0": 0.013916015625, "loss_aux_layer_1": 0.031829833984375, "loss_aux_layer_10": 0.05657958984375, "loss_aux_layer_11": 0.06060791015625, "loss_aux_layer_12": 0.06512451171875, "loss_aux_layer_13": 0.0703125, "loss_aux_layer_14": 0.07861328125, "loss_aux_layer_15": 0.08740234375, "loss_aux_layer_16": 0.097412109375, "loss_aux_layer_17": 0.105712890625, "loss_aux_layer_18": 0.114013671875, "loss_aux_layer_19": 0.118408203125, "loss_aux_layer_2": 0.04345703125, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.05255126953125, "loss_aux_layer_4": 0.0546875, "loss_aux_layer_5": 0.05609130859375, "loss_aux_layer_6": 0.05889892578125, "loss_aux_layer_7": 0.0567626953125, "loss_aux_layer_8": 0.05633544921875, "loss_aux_layer_9": 0.05523681640625, "step": 3545, "total_loss": 0.6909106969833374 }, { "epoch": 0.7020392001583845, "grad_norm": 0.8897773027420044, "learning_rate": 5e-05, "llm_loss": 0.5709372907876968, "loss": 2.6167, "loss_aux_layer_0": 0.0135345458984375, "loss_aux_layer_1": 0.033447265625, "loss_aux_layer_10": 0.06048583984375, "loss_aux_layer_11": 0.0643310546875, "loss_aux_layer_12": 0.068603515625, "loss_aux_layer_13": 0.0736083984375, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.090087890625, "loss_aux_layer_16": 0.09912109375, "loss_aux_layer_17": 0.107177734375, "loss_aux_layer_18": 0.115478515625, "loss_aux_layer_19": 0.1187744140625, "loss_aux_layer_2": 0.0457763671875, "loss_aux_layer_20": 0.1268310546875, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.156494140625, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.0552978515625, "loss_aux_layer_4": 0.05743408203125, "loss_aux_layer_5": 0.05908203125, "loss_aux_layer_6": 0.061767578125, "loss_aux_layer_7": 0.06005859375, "loss_aux_layer_8": 0.0601806640625, "loss_aux_layer_9": 0.058837890625, "step": 3546, "total_loss": 0.6541635692119598 }, { "epoch": 0.7022371807562859, "grad_norm": 0.8574745059013367, "learning_rate": 5e-05, "llm_loss": 0.6330280900001526, "loss": 2.8661, "loss_aux_layer_0": 0.0143280029296875, "loss_aux_layer_1": 0.03411865234375, "loss_aux_layer_10": 0.05999755859375, "loss_aux_layer_11": 0.06402587890625, "loss_aux_layer_12": 0.0682373046875, "loss_aux_layer_13": 0.073486328125, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.08984375, "loss_aux_layer_16": 0.098876953125, "loss_aux_layer_17": 0.1070556640625, "loss_aux_layer_18": 0.1153564453125, "loss_aux_layer_19": 0.1195068359375, "loss_aux_layer_2": 0.0465087890625, "loss_aux_layer_20": 0.1280517578125, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.193603515625, "loss_aux_layer_3": 0.05584716796875, "loss_aux_layer_4": 0.05804443359375, "loss_aux_layer_5": 0.05963134765625, "loss_aux_layer_6": 0.06256103515625, "loss_aux_layer_7": 0.06048583984375, "loss_aux_layer_8": 0.06005859375, "loss_aux_layer_9": 0.0587158203125, "step": 3547, "total_loss": 0.7165170162916183 }, { "epoch": 0.7024351613541873, "grad_norm": 0.907139241695404, "learning_rate": 5e-05, "llm_loss": 0.5540260225534439, "loss": 2.5419, "loss_aux_layer_0": 0.0135955810546875, "loss_aux_layer_1": 0.03179931640625, "loss_aux_layer_10": 0.0572509765625, "loss_aux_layer_11": 0.061279296875, "loss_aux_layer_12": 0.065673828125, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.0889892578125, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.1063232421875, "loss_aux_layer_18": 0.1142578125, "loss_aux_layer_19": 0.117919921875, "loss_aux_layer_2": 0.0438232421875, "loss_aux_layer_20": 0.1256103515625, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.05291748046875, "loss_aux_layer_4": 0.05535888671875, "loss_aux_layer_5": 0.05706787109375, "loss_aux_layer_6": 0.0596923828125, "loss_aux_layer_7": 0.0577392578125, "loss_aux_layer_8": 0.05731201171875, "loss_aux_layer_9": 0.05609130859375, "step": 3548, "total_loss": 0.6354827731847763 }, { "epoch": 0.7026331419520887, "grad_norm": 0.9589692950248718, "learning_rate": 5e-05, "llm_loss": 0.5858050882816315, "loss": 2.6682, "loss_aux_layer_0": 0.0143585205078125, "loss_aux_layer_1": 0.0321044921875, "loss_aux_layer_10": 0.0577392578125, "loss_aux_layer_11": 0.0616455078125, "loss_aux_layer_12": 0.0660400390625, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.079833984375, "loss_aux_layer_15": 0.08837890625, "loss_aux_layer_16": 0.097412109375, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.11767578125, "loss_aux_layer_2": 0.04364013671875, "loss_aux_layer_20": 0.125244140625, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.0531005859375, "loss_aux_layer_4": 0.05560302734375, "loss_aux_layer_5": 0.0572509765625, "loss_aux_layer_6": 0.0599365234375, "loss_aux_layer_7": 0.05804443359375, "loss_aux_layer_8": 0.05780029296875, "loss_aux_layer_9": 0.0565185546875, "step": 3549, "total_loss": 0.6670521795749664 }, { "epoch": 0.7028311225499901, "grad_norm": 0.9427729249000549, "learning_rate": 5e-05, "llm_loss": 0.5381591320037842, "loss": 2.4849, "loss_aux_layer_0": 0.0137176513671875, "loss_aux_layer_1": 0.032958984375, "loss_aux_layer_10": 0.05938720703125, "loss_aux_layer_11": 0.06353759765625, "loss_aux_layer_12": 0.0682373046875, "loss_aux_layer_13": 0.0738525390625, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.090576171875, "loss_aux_layer_16": 0.099853515625, "loss_aux_layer_17": 0.107666015625, "loss_aux_layer_18": 0.1162109375, "loss_aux_layer_19": 0.1195068359375, "loss_aux_layer_2": 0.04486083984375, "loss_aux_layer_20": 0.1275634765625, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.05462646484375, "loss_aux_layer_4": 0.0570068359375, "loss_aux_layer_5": 0.05865478515625, "loss_aux_layer_6": 0.06158447265625, "loss_aux_layer_7": 0.05975341796875, "loss_aux_layer_8": 0.059326171875, "loss_aux_layer_9": 0.058349609375, "step": 3550, "total_loss": 0.6212370991706848 }, { "epoch": 0.7030291031478915, "grad_norm": 1.33826744556427, "learning_rate": 5e-05, "llm_loss": 0.5914375185966492, "loss": 2.7034, "loss_aux_layer_0": 0.013580322265625, "loss_aux_layer_1": 0.03155517578125, "loss_aux_layer_10": 0.059814453125, "loss_aux_layer_11": 0.064453125, "loss_aux_layer_12": 0.0693359375, "loss_aux_layer_13": 0.0755615234375, "loss_aux_layer_14": 0.0845947265625, "loss_aux_layer_15": 0.093505859375, "loss_aux_layer_16": 0.1029052734375, "loss_aux_layer_17": 0.110595703125, "loss_aux_layer_18": 0.118896484375, "loss_aux_layer_19": 0.121826171875, "loss_aux_layer_2": 0.0439453125, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.138916015625, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.199462890625, "loss_aux_layer_3": 0.0535888671875, "loss_aux_layer_4": 0.05621337890625, "loss_aux_layer_5": 0.05804443359375, "loss_aux_layer_6": 0.0611572265625, "loss_aux_layer_7": 0.0594482421875, "loss_aux_layer_8": 0.05902099609375, "loss_aux_layer_9": 0.05816650390625, "step": 3551, "total_loss": 0.6758617609739304 }, { "epoch": 0.7032270837457929, "grad_norm": 0.8979339003562927, "learning_rate": 5e-05, "llm_loss": 0.5948997139930725, "loss": 2.7207, "loss_aux_layer_0": 0.0150909423828125, "loss_aux_layer_1": 0.034271240234375, "loss_aux_layer_10": 0.06207275390625, "loss_aux_layer_11": 0.066162109375, "loss_aux_layer_12": 0.070556640625, "loss_aux_layer_13": 0.0760498046875, "loss_aux_layer_14": 0.0843505859375, "loss_aux_layer_15": 0.0928955078125, "loss_aux_layer_16": 0.1016845703125, "loss_aux_layer_17": 0.109375, "loss_aux_layer_18": 0.1173095703125, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.04754638671875, "loss_aux_layer_20": 0.1279296875, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.0574951171875, "loss_aux_layer_4": 0.05987548828125, "loss_aux_layer_5": 0.06146240234375, "loss_aux_layer_6": 0.0645751953125, "loss_aux_layer_7": 0.062744140625, "loss_aux_layer_8": 0.06219482421875, "loss_aux_layer_9": 0.060791015625, "step": 3552, "total_loss": 0.6801822185516357 }, { "epoch": 0.7034250643436943, "grad_norm": 0.9218335747718811, "learning_rate": 5e-05, "llm_loss": 0.6206416040658951, "loss": 2.8157, "loss_aux_layer_0": 0.014312744140625, "loss_aux_layer_1": 0.03302001953125, "loss_aux_layer_10": 0.05975341796875, "loss_aux_layer_11": 0.06390380859375, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.0736083984375, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.090576171875, "loss_aux_layer_16": 0.10009765625, "loss_aux_layer_17": 0.1080322265625, "loss_aux_layer_18": 0.1162109375, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.04510498046875, "loss_aux_layer_20": 0.1280517578125, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.05462646484375, "loss_aux_layer_4": 0.05712890625, "loss_aux_layer_5": 0.0587158203125, "loss_aux_layer_6": 0.0615234375, "loss_aux_layer_7": 0.0599365234375, "loss_aux_layer_8": 0.059814453125, "loss_aux_layer_9": 0.05853271484375, "step": 3553, "total_loss": 0.7039335519075394 }, { "epoch": 0.7036230449415958, "grad_norm": 0.858295202255249, "learning_rate": 5e-05, "llm_loss": 0.6670495942234993, "loss": 3.0025, "loss_aux_layer_0": 0.014007568359375, "loss_aux_layer_1": 0.03424072265625, "loss_aux_layer_10": 0.06121826171875, "loss_aux_layer_11": 0.065185546875, "loss_aux_layer_12": 0.0694580078125, "loss_aux_layer_13": 0.07470703125, "loss_aux_layer_14": 0.0828857421875, "loss_aux_layer_15": 0.0914306640625, "loss_aux_layer_16": 0.1004638671875, "loss_aux_layer_17": 0.1085205078125, "loss_aux_layer_18": 0.1158447265625, "loss_aux_layer_19": 0.1187744140625, "loss_aux_layer_2": 0.04681396484375, "loss_aux_layer_20": 0.12646484375, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.0567626953125, "loss_aux_layer_4": 0.05938720703125, "loss_aux_layer_5": 0.06097412109375, "loss_aux_layer_6": 0.063720703125, "loss_aux_layer_7": 0.06182861328125, "loss_aux_layer_8": 0.06134033203125, "loss_aux_layer_9": 0.06005859375, "step": 3554, "total_loss": 0.750630185008049 }, { "epoch": 0.7038210255394971, "grad_norm": 1.1334679126739502, "learning_rate": 5e-05, "llm_loss": 0.5370694026350975, "loss": 2.4947, "loss_aux_layer_0": 0.0149993896484375, "loss_aux_layer_1": 0.03460693359375, "loss_aux_layer_10": 0.062744140625, "loss_aux_layer_11": 0.0667724609375, "loss_aux_layer_12": 0.0711669921875, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.0853271484375, "loss_aux_layer_15": 0.09375, "loss_aux_layer_16": 0.103271484375, "loss_aux_layer_17": 0.1112060546875, "loss_aux_layer_18": 0.11962890625, "loss_aux_layer_19": 0.1231689453125, "loss_aux_layer_2": 0.0478515625, "loss_aux_layer_20": 0.131591796875, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.162353515625, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.05810546875, "loss_aux_layer_4": 0.06048583984375, "loss_aux_layer_5": 0.06201171875, "loss_aux_layer_6": 0.06488037109375, "loss_aux_layer_7": 0.063232421875, "loss_aux_layer_8": 0.062744140625, "loss_aux_layer_9": 0.06146240234375, "step": 3555, "total_loss": 0.6236847341060638 }, { "epoch": 0.7040190061373985, "grad_norm": 1.175310730934143, "learning_rate": 5e-05, "llm_loss": 0.5213630273938179, "loss": 2.4305, "loss_aux_layer_0": 0.0140533447265625, "loss_aux_layer_1": 0.0333251953125, "loss_aux_layer_10": 0.0621337890625, "loss_aux_layer_11": 0.066650390625, "loss_aux_layer_12": 0.0714111328125, "loss_aux_layer_13": 0.0770263671875, "loss_aux_layer_14": 0.086181640625, "loss_aux_layer_15": 0.0948486328125, "loss_aux_layer_16": 0.1046142578125, "loss_aux_layer_17": 0.1121826171875, "loss_aux_layer_18": 0.12109375, "loss_aux_layer_19": 0.1243896484375, "loss_aux_layer_2": 0.04656982421875, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.05633544921875, "loss_aux_layer_4": 0.05853271484375, "loss_aux_layer_5": 0.0601806640625, "loss_aux_layer_6": 0.0633544921875, "loss_aux_layer_7": 0.06170654296875, "loss_aux_layer_8": 0.0614013671875, "loss_aux_layer_9": 0.06036376953125, "step": 3556, "total_loss": 0.6076290011405945 }, { "epoch": 0.7042169867353, "grad_norm": 1.2060555219650269, "learning_rate": 5e-05, "llm_loss": 0.6917811334133148, "loss": 3.103, "loss_aux_layer_0": 0.0139617919921875, "loss_aux_layer_1": 0.0347900390625, "loss_aux_layer_10": 0.06268310546875, "loss_aux_layer_11": 0.067138671875, "loss_aux_layer_12": 0.0714111328125, "loss_aux_layer_13": 0.0765380859375, "loss_aux_layer_14": 0.0843505859375, "loss_aux_layer_15": 0.092041015625, "loss_aux_layer_16": 0.1007080078125, "loss_aux_layer_17": 0.1080322265625, "loss_aux_layer_18": 0.1153564453125, "loss_aux_layer_19": 0.11767578125, "loss_aux_layer_2": 0.0474853515625, "loss_aux_layer_20": 0.1248779296875, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.057861328125, "loss_aux_layer_4": 0.060546875, "loss_aux_layer_5": 0.0621337890625, "loss_aux_layer_6": 0.0654296875, "loss_aux_layer_7": 0.063232421875, "loss_aux_layer_8": 0.06256103515625, "loss_aux_layer_9": 0.06134033203125, "step": 3557, "total_loss": 0.7757411301136017 }, { "epoch": 0.7044149673332013, "grad_norm": 0.9592280387878418, "learning_rate": 5e-05, "llm_loss": 0.5237445533275604, "loss": 2.4188, "loss_aux_layer_0": 0.0140380859375, "loss_aux_layer_1": 0.032135009765625, "loss_aux_layer_10": 0.0570068359375, "loss_aux_layer_11": 0.0609130859375, "loss_aux_layer_12": 0.0654296875, "loss_aux_layer_13": 0.071044921875, "loss_aux_layer_14": 0.0797119140625, "loss_aux_layer_15": 0.088134765625, "loss_aux_layer_16": 0.097412109375, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.1138916015625, "loss_aux_layer_19": 0.1175537109375, "loss_aux_layer_2": 0.04376220703125, "loss_aux_layer_20": 0.1253662109375, "loss_aux_layer_21": 0.1341552734375, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05267333984375, "loss_aux_layer_4": 0.0548095703125, "loss_aux_layer_5": 0.05596923828125, "loss_aux_layer_6": 0.05877685546875, "loss_aux_layer_7": 0.05712890625, "loss_aux_layer_8": 0.0567626953125, "loss_aux_layer_9": 0.05560302734375, "step": 3558, "total_loss": 0.6047081053256989 }, { "epoch": 0.7046129479311027, "grad_norm": 0.9257864952087402, "learning_rate": 5e-05, "llm_loss": 0.5762578248977661, "loss": 2.6424, "loss_aux_layer_0": 0.014892578125, "loss_aux_layer_1": 0.032012939453125, "loss_aux_layer_10": 0.05926513671875, "loss_aux_layer_11": 0.06304931640625, "loss_aux_layer_12": 0.06756591796875, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.0828857421875, "loss_aux_layer_15": 0.091796875, "loss_aux_layer_16": 0.1016845703125, "loss_aux_layer_17": 0.1099853515625, "loss_aux_layer_18": 0.1185302734375, "loss_aux_layer_19": 0.1234130859375, "loss_aux_layer_2": 0.044921875, "loss_aux_layer_20": 0.1318359375, "loss_aux_layer_21": 0.14111328125, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.199951171875, "loss_aux_layer_3": 0.05450439453125, "loss_aux_layer_4": 0.05657958984375, "loss_aux_layer_5": 0.05816650390625, "loss_aux_layer_6": 0.06134033203125, "loss_aux_layer_7": 0.0592041015625, "loss_aux_layer_8": 0.05908203125, "loss_aux_layer_9": 0.05792236328125, "step": 3559, "total_loss": 0.6606115996837616 }, { "epoch": 0.7048109285290042, "grad_norm": 1.056955099105835, "learning_rate": 5e-05, "llm_loss": 0.5434196144342422, "loss": 2.5049, "loss_aux_layer_0": 0.01361083984375, "loss_aux_layer_1": 0.03350830078125, "loss_aux_layer_10": 0.0595703125, "loss_aux_layer_11": 0.0635986328125, "loss_aux_layer_12": 0.0682373046875, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.0814208984375, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.0989990234375, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.1148681640625, "loss_aux_layer_19": 0.1185302734375, "loss_aux_layer_2": 0.046142578125, "loss_aux_layer_20": 0.1259765625, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.0560302734375, "loss_aux_layer_4": 0.05810546875, "loss_aux_layer_5": 0.05975341796875, "loss_aux_layer_6": 0.0621337890625, "loss_aux_layer_7": 0.0599365234375, "loss_aux_layer_8": 0.0594482421875, "loss_aux_layer_9": 0.05816650390625, "step": 3560, "total_loss": 0.6262329518795013 }, { "epoch": 0.7050089091269056, "grad_norm": 1.1111619472503662, "learning_rate": 5e-05, "llm_loss": 0.5602157935500145, "loss": 2.578, "loss_aux_layer_0": 0.0146484375, "loss_aux_layer_1": 0.0345458984375, "loss_aux_layer_10": 0.06011962890625, "loss_aux_layer_11": 0.064208984375, "loss_aux_layer_12": 0.0689697265625, "loss_aux_layer_13": 0.0750732421875, "loss_aux_layer_14": 0.08349609375, "loss_aux_layer_15": 0.0924072265625, "loss_aux_layer_16": 0.101806640625, "loss_aux_layer_17": 0.109130859375, "loss_aux_layer_18": 0.1171875, "loss_aux_layer_19": 0.1204833984375, "loss_aux_layer_2": 0.0474853515625, "loss_aux_layer_20": 0.127685546875, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.156494140625, "loss_aux_layer_23": 0.19482421875, "loss_aux_layer_3": 0.05706787109375, "loss_aux_layer_4": 0.05902099609375, "loss_aux_layer_5": 0.06024169921875, "loss_aux_layer_6": 0.062744140625, "loss_aux_layer_7": 0.0606689453125, "loss_aux_layer_8": 0.060302734375, "loss_aux_layer_9": 0.05902099609375, "step": 3561, "total_loss": 0.6444966495037079 }, { "epoch": 0.7052068897248069, "grad_norm": 0.9117030501365662, "learning_rate": 5e-05, "llm_loss": 0.5421356409788132, "loss": 2.4915, "loss_aux_layer_0": 0.014556884765625, "loss_aux_layer_1": 0.031890869140625, "loss_aux_layer_10": 0.05792236328125, "loss_aux_layer_11": 0.0615234375, "loss_aux_layer_12": 0.066162109375, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.0791015625, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.0960693359375, "loss_aux_layer_17": 0.1041259765625, "loss_aux_layer_18": 0.113037109375, "loss_aux_layer_19": 0.1168212890625, "loss_aux_layer_2": 0.04400634765625, "loss_aux_layer_20": 0.125, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.0533447265625, "loss_aux_layer_4": 0.0557861328125, "loss_aux_layer_5": 0.05706787109375, "loss_aux_layer_6": 0.059814453125, "loss_aux_layer_7": 0.05810546875, "loss_aux_layer_8": 0.0577392578125, "loss_aux_layer_9": 0.05670166015625, "step": 3562, "total_loss": 0.6228757053613663 }, { "epoch": 0.7054048703227084, "grad_norm": 1.2468615770339966, "learning_rate": 5e-05, "llm_loss": 0.5697331503033638, "loss": 2.6193, "loss_aux_layer_0": 0.01434326171875, "loss_aux_layer_1": 0.03448486328125, "loss_aux_layer_10": 0.0623779296875, "loss_aux_layer_11": 0.066650390625, "loss_aux_layer_12": 0.0711669921875, "loss_aux_layer_13": 0.076416015625, "loss_aux_layer_14": 0.08447265625, "loss_aux_layer_15": 0.0924072265625, "loss_aux_layer_16": 0.1015625, "loss_aux_layer_17": 0.1090087890625, "loss_aux_layer_18": 0.116943359375, "loss_aux_layer_19": 0.11962890625, "loss_aux_layer_2": 0.04840087890625, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.05828857421875, "loss_aux_layer_4": 0.060546875, "loss_aux_layer_5": 0.062255859375, "loss_aux_layer_6": 0.06494140625, "loss_aux_layer_7": 0.0631103515625, "loss_aux_layer_8": 0.06256103515625, "loss_aux_layer_9": 0.061279296875, "step": 3563, "total_loss": 0.6548325717449188 }, { "epoch": 0.7056028509206098, "grad_norm": 1.0818455219268799, "learning_rate": 5e-05, "llm_loss": 0.5399280861020088, "loss": 2.5015, "loss_aux_layer_0": 0.014556884765625, "loss_aux_layer_1": 0.0347900390625, "loss_aux_layer_10": 0.0628662109375, "loss_aux_layer_11": 0.067138671875, "loss_aux_layer_12": 0.071533203125, "loss_aux_layer_13": 0.0765380859375, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.0926513671875, "loss_aux_layer_16": 0.1014404296875, "loss_aux_layer_17": 0.1085205078125, "loss_aux_layer_18": 0.1163330078125, "loss_aux_layer_19": 0.1187744140625, "loss_aux_layer_2": 0.0491943359375, "loss_aux_layer_20": 0.1260986328125, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.0594482421875, "loss_aux_layer_4": 0.0618896484375, "loss_aux_layer_5": 0.063232421875, "loss_aux_layer_6": 0.06610107421875, "loss_aux_layer_7": 0.06402587890625, "loss_aux_layer_8": 0.063232421875, "loss_aux_layer_9": 0.06158447265625, "step": 3564, "total_loss": 0.6253811866044998 }, { "epoch": 0.7058008315185111, "grad_norm": 1.0045245885849, "learning_rate": 5e-05, "llm_loss": 0.5550999343395233, "loss": 2.5524, "loss_aux_layer_0": 0.014312744140625, "loss_aux_layer_1": 0.032958984375, "loss_aux_layer_10": 0.05975341796875, "loss_aux_layer_11": 0.0640869140625, "loss_aux_layer_12": 0.0684814453125, "loss_aux_layer_13": 0.07373046875, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.09033203125, "loss_aux_layer_16": 0.0994873046875, "loss_aux_layer_17": 0.1070556640625, "loss_aux_layer_18": 0.115234375, "loss_aux_layer_19": 0.118408203125, "loss_aux_layer_2": 0.04547119140625, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.193115234375, "loss_aux_layer_3": 0.054931640625, "loss_aux_layer_4": 0.0576171875, "loss_aux_layer_5": 0.05914306640625, "loss_aux_layer_6": 0.0621337890625, "loss_aux_layer_7": 0.06024169921875, "loss_aux_layer_8": 0.05950927734375, "loss_aux_layer_9": 0.058349609375, "step": 3565, "total_loss": 0.6381027400493622 }, { "epoch": 0.7059988121164126, "grad_norm": 0.9067992568016052, "learning_rate": 5e-05, "llm_loss": 0.6280461102724075, "loss": 2.8293, "loss_aux_layer_0": 0.014678955078125, "loss_aux_layer_1": 0.03076171875, "loss_aux_layer_10": 0.0557861328125, "loss_aux_layer_11": 0.05950927734375, "loss_aux_layer_12": 0.0638427734375, "loss_aux_layer_13": 0.0689697265625, "loss_aux_layer_14": 0.07763671875, "loss_aux_layer_15": 0.086181640625, "loss_aux_layer_16": 0.0953369140625, "loss_aux_layer_17": 0.1036376953125, "loss_aux_layer_18": 0.112060546875, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.0423583984375, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.051513671875, "loss_aux_layer_4": 0.0537109375, "loss_aux_layer_5": 0.05517578125, "loss_aux_layer_6": 0.0577392578125, "loss_aux_layer_7": 0.0557861328125, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.05462646484375, "step": 3566, "total_loss": 0.707322508096695 }, { "epoch": 0.706196792714314, "grad_norm": 1.1677298545837402, "learning_rate": 5e-05, "llm_loss": 0.4860314577817917, "loss": 2.2819, "loss_aux_layer_0": 0.0146942138671875, "loss_aux_layer_1": 0.032989501953125, "loss_aux_layer_10": 0.060302734375, "loss_aux_layer_11": 0.06463623046875, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.07470703125, "loss_aux_layer_14": 0.08349609375, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.1024169921875, "loss_aux_layer_17": 0.1102294921875, "loss_aux_layer_18": 0.11865234375, "loss_aux_layer_19": 0.12255859375, "loss_aux_layer_2": 0.04522705078125, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.05499267578125, "loss_aux_layer_4": 0.05743408203125, "loss_aux_layer_5": 0.05926513671875, "loss_aux_layer_6": 0.0625, "loss_aux_layer_7": 0.0604248046875, "loss_aux_layer_8": 0.059814453125, "loss_aux_layer_9": 0.05889892578125, "step": 3567, "total_loss": 0.5704775899648666 }, { "epoch": 0.7063947733122155, "grad_norm": 1.1579874753952026, "learning_rate": 5e-05, "llm_loss": 0.6341532468795776, "loss": 2.8626, "loss_aux_layer_0": 0.015380859375, "loss_aux_layer_1": 0.0323486328125, "loss_aux_layer_10": 0.05755615234375, "loss_aux_layer_11": 0.06121826171875, "loss_aux_layer_12": 0.065673828125, "loss_aux_layer_13": 0.071044921875, "loss_aux_layer_14": 0.07958984375, "loss_aux_layer_15": 0.0880126953125, "loss_aux_layer_16": 0.09765625, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.1141357421875, "loss_aux_layer_19": 0.1175537109375, "loss_aux_layer_2": 0.04449462890625, "loss_aux_layer_20": 0.1259765625, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.0533447265625, "loss_aux_layer_4": 0.0557861328125, "loss_aux_layer_5": 0.05706787109375, "loss_aux_layer_6": 0.059814453125, "loss_aux_layer_7": 0.057861328125, "loss_aux_layer_8": 0.057373046875, "loss_aux_layer_9": 0.05633544921875, "step": 3568, "total_loss": 0.7156585454940796 }, { "epoch": 0.7065927539101168, "grad_norm": 1.1294173002243042, "learning_rate": 5e-05, "llm_loss": 0.5860058590769768, "loss": 2.6754, "loss_aux_layer_0": 0.0142059326171875, "loss_aux_layer_1": 0.031829833984375, "loss_aux_layer_10": 0.0599365234375, "loss_aux_layer_11": 0.0638427734375, "loss_aux_layer_12": 0.0682373046875, "loss_aux_layer_13": 0.073974609375, "loss_aux_layer_14": 0.0826416015625, "loss_aux_layer_15": 0.0909423828125, "loss_aux_layer_16": 0.0999755859375, "loss_aux_layer_17": 0.1077880859375, "loss_aux_layer_18": 0.1163330078125, "loss_aux_layer_19": 0.118896484375, "loss_aux_layer_2": 0.04412841796875, "loss_aux_layer_20": 0.12646484375, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.191162109375, "loss_aux_layer_3": 0.053955078125, "loss_aux_layer_4": 0.056884765625, "loss_aux_layer_5": 0.0587158203125, "loss_aux_layer_6": 0.06182861328125, "loss_aux_layer_7": 0.05999755859375, "loss_aux_layer_8": 0.05950927734375, "loss_aux_layer_9": 0.05853271484375, "step": 3569, "total_loss": 0.6688579767942429 }, { "epoch": 0.7067907345080182, "grad_norm": 1.171260952949524, "learning_rate": 5e-05, "llm_loss": 0.553969033062458, "loss": 2.5549, "loss_aux_layer_0": 0.0146331787109375, "loss_aux_layer_1": 0.033203125, "loss_aux_layer_10": 0.06036376953125, "loss_aux_layer_11": 0.0643310546875, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.0745849609375, "loss_aux_layer_14": 0.0836181640625, "loss_aux_layer_15": 0.09228515625, "loss_aux_layer_16": 0.1016845703125, "loss_aux_layer_17": 0.109619140625, "loss_aux_layer_18": 0.117919921875, "loss_aux_layer_19": 0.121826171875, "loss_aux_layer_2": 0.0465087890625, "loss_aux_layer_20": 0.130126953125, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.19921875, "loss_aux_layer_3": 0.05572509765625, "loss_aux_layer_4": 0.05810546875, "loss_aux_layer_5": 0.05987548828125, "loss_aux_layer_6": 0.06280517578125, "loss_aux_layer_7": 0.06048583984375, "loss_aux_layer_8": 0.06011962890625, "loss_aux_layer_9": 0.05902099609375, "step": 3570, "total_loss": 0.6387344896793365 }, { "epoch": 0.7069887151059197, "grad_norm": 0.9315631985664368, "learning_rate": 5e-05, "llm_loss": 0.5914050936698914, "loss": 2.6991, "loss_aux_layer_0": 0.0152130126953125, "loss_aux_layer_1": 0.032012939453125, "loss_aux_layer_10": 0.0595703125, "loss_aux_layer_11": 0.0634765625, "loss_aux_layer_12": 0.0677490234375, "loss_aux_layer_13": 0.0732421875, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.0908203125, "loss_aux_layer_16": 0.1007080078125, "loss_aux_layer_17": 0.108642578125, "loss_aux_layer_18": 0.1175537109375, "loss_aux_layer_19": 0.1217041015625, "loss_aux_layer_2": 0.04510498046875, "loss_aux_layer_20": 0.129150390625, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.156494140625, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.0543212890625, "loss_aux_layer_4": 0.056640625, "loss_aux_layer_5": 0.05853271484375, "loss_aux_layer_6": 0.06134033203125, "loss_aux_layer_7": 0.05938720703125, "loss_aux_layer_8": 0.05914306640625, "loss_aux_layer_9": 0.05841064453125, "step": 3571, "total_loss": 0.6747836470603943 }, { "epoch": 0.707186695703821, "grad_norm": 1.3935480117797852, "learning_rate": 5e-05, "llm_loss": 0.6346437484025955, "loss": 2.8742, "loss_aux_layer_0": 0.014801025390625, "loss_aux_layer_1": 0.03399658203125, "loss_aux_layer_10": 0.0606689453125, "loss_aux_layer_11": 0.064453125, "loss_aux_layer_12": 0.0687255859375, "loss_aux_layer_13": 0.07421875, "loss_aux_layer_14": 0.0830078125, "loss_aux_layer_15": 0.0914306640625, "loss_aux_layer_16": 0.1005859375, "loss_aux_layer_17": 0.1077880859375, "loss_aux_layer_18": 0.1162109375, "loss_aux_layer_19": 0.1202392578125, "loss_aux_layer_2": 0.046875, "loss_aux_layer_20": 0.1279296875, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.05633544921875, "loss_aux_layer_4": 0.0587158203125, "loss_aux_layer_5": 0.06036376953125, "loss_aux_layer_6": 0.06304931640625, "loss_aux_layer_7": 0.06072998046875, "loss_aux_layer_8": 0.06024169921875, "loss_aux_layer_9": 0.05914306640625, "step": 3572, "total_loss": 0.7185408025979996 }, { "epoch": 0.7073846763017224, "grad_norm": 0.9109113216400146, "learning_rate": 5e-05, "llm_loss": 0.5611466392874718, "loss": 2.593, "loss_aux_layer_0": 0.0138702392578125, "loss_aux_layer_1": 0.03509521484375, "loss_aux_layer_10": 0.06475830078125, "loss_aux_layer_11": 0.069091796875, "loss_aux_layer_12": 0.073486328125, "loss_aux_layer_13": 0.078857421875, "loss_aux_layer_14": 0.0867919921875, "loss_aux_layer_15": 0.094482421875, "loss_aux_layer_16": 0.1033935546875, "loss_aux_layer_17": 0.110595703125, "loss_aux_layer_18": 0.1187744140625, "loss_aux_layer_19": 0.1212158203125, "loss_aux_layer_2": 0.04998779296875, "loss_aux_layer_20": 0.128662109375, "loss_aux_layer_21": 0.136962890625, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.196044921875, "loss_aux_layer_3": 0.0604248046875, "loss_aux_layer_4": 0.06298828125, "loss_aux_layer_5": 0.06463623046875, "loss_aux_layer_6": 0.067626953125, "loss_aux_layer_7": 0.065673828125, "loss_aux_layer_8": 0.06488037109375, "loss_aux_layer_9": 0.0634765625, "step": 3573, "total_loss": 0.6482485681772232 }, { "epoch": 0.7075826568996239, "grad_norm": 0.9208697080612183, "learning_rate": 5e-05, "llm_loss": 0.7243236601352692, "loss": 3.2344, "loss_aux_layer_0": 0.015167236328125, "loss_aux_layer_1": 0.033203125, "loss_aux_layer_10": 0.0609130859375, "loss_aux_layer_11": 0.065185546875, "loss_aux_layer_12": 0.0694580078125, "loss_aux_layer_13": 0.0745849609375, "loss_aux_layer_14": 0.0828857421875, "loss_aux_layer_15": 0.091552734375, "loss_aux_layer_16": 0.10107421875, "loss_aux_layer_17": 0.1090087890625, "loss_aux_layer_18": 0.116455078125, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.04705810546875, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.05657958984375, "loss_aux_layer_4": 0.0589599609375, "loss_aux_layer_5": 0.0604248046875, "loss_aux_layer_6": 0.0634765625, "loss_aux_layer_7": 0.06134033203125, "loss_aux_layer_8": 0.06085205078125, "loss_aux_layer_9": 0.05950927734375, "step": 3574, "total_loss": 0.8085955083370209 }, { "epoch": 0.7077806374975253, "grad_norm": 0.970815122127533, "learning_rate": 5e-05, "llm_loss": 0.5447167381644249, "loss": 2.5209, "loss_aux_layer_0": 0.0145416259765625, "loss_aux_layer_1": 0.032958984375, "loss_aux_layer_10": 0.0604248046875, "loss_aux_layer_11": 0.06463623046875, "loss_aux_layer_12": 0.069580078125, "loss_aux_layer_13": 0.075439453125, "loss_aux_layer_14": 0.0849609375, "loss_aux_layer_15": 0.094482421875, "loss_aux_layer_16": 0.1046142578125, "loss_aux_layer_17": 0.1123046875, "loss_aux_layer_18": 0.120849609375, "loss_aux_layer_19": 0.1241455078125, "loss_aux_layer_2": 0.04638671875, "loss_aux_layer_20": 0.1312255859375, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.197998046875, "loss_aux_layer_3": 0.055908203125, "loss_aux_layer_4": 0.0584716796875, "loss_aux_layer_5": 0.0601806640625, "loss_aux_layer_6": 0.06317138671875, "loss_aux_layer_7": 0.06097412109375, "loss_aux_layer_8": 0.060302734375, "loss_aux_layer_9": 0.0589599609375, "step": 3575, "total_loss": 0.6302282810211182 }, { "epoch": 0.7079786180954266, "grad_norm": 0.8625935316085815, "learning_rate": 5e-05, "llm_loss": 0.6330718547105789, "loss": 2.874, "loss_aux_layer_0": 0.013214111328125, "loss_aux_layer_1": 0.03460693359375, "loss_aux_layer_10": 0.0625, "loss_aux_layer_11": 0.0667724609375, "loss_aux_layer_12": 0.0711669921875, "loss_aux_layer_13": 0.076904296875, "loss_aux_layer_14": 0.0855712890625, "loss_aux_layer_15": 0.0938720703125, "loss_aux_layer_16": 0.1029052734375, "loss_aux_layer_17": 0.11083984375, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.120849609375, "loss_aux_layer_2": 0.04754638671875, "loss_aux_layer_20": 0.12841796875, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.05767822265625, "loss_aux_layer_4": 0.060546875, "loss_aux_layer_5": 0.06219482421875, "loss_aux_layer_6": 0.065185546875, "loss_aux_layer_7": 0.0631103515625, "loss_aux_layer_8": 0.06256103515625, "loss_aux_layer_9": 0.06109619140625, "step": 3576, "total_loss": 0.7184933423995972 }, { "epoch": 0.708176598693328, "grad_norm": 1.0183340311050415, "learning_rate": 5e-05, "llm_loss": 0.604759007692337, "loss": 2.7576, "loss_aux_layer_0": 0.0142822265625, "loss_aux_layer_1": 0.0333251953125, "loss_aux_layer_10": 0.06182861328125, "loss_aux_layer_11": 0.06610107421875, "loss_aux_layer_12": 0.070556640625, "loss_aux_layer_13": 0.0760498046875, "loss_aux_layer_14": 0.0849609375, "loss_aux_layer_15": 0.0936279296875, "loss_aux_layer_16": 0.102783203125, "loss_aux_layer_17": 0.110595703125, "loss_aux_layer_18": 0.1180419921875, "loss_aux_layer_19": 0.1202392578125, "loss_aux_layer_2": 0.04693603515625, "loss_aux_layer_20": 0.1279296875, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.0567626953125, "loss_aux_layer_4": 0.05938720703125, "loss_aux_layer_5": 0.06097412109375, "loss_aux_layer_6": 0.0640869140625, "loss_aux_layer_7": 0.06195068359375, "loss_aux_layer_8": 0.0614013671875, "loss_aux_layer_9": 0.06024169921875, "step": 3577, "total_loss": 0.689411997795105 }, { "epoch": 0.7083745792912295, "grad_norm": 0.8222018480300903, "learning_rate": 5e-05, "llm_loss": 0.5418745875358582, "loss": 2.5092, "loss_aux_layer_0": 0.0154266357421875, "loss_aux_layer_1": 0.033416748046875, "loss_aux_layer_10": 0.06109619140625, "loss_aux_layer_11": 0.0653076171875, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.08447265625, "loss_aux_layer_15": 0.0931396484375, "loss_aux_layer_16": 0.1025390625, "loss_aux_layer_17": 0.1099853515625, "loss_aux_layer_18": 0.11865234375, "loss_aux_layer_19": 0.1229248046875, "loss_aux_layer_2": 0.0460205078125, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.1396484375, "loss_aux_layer_22": 0.1611328125, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.0556640625, "loss_aux_layer_4": 0.0582275390625, "loss_aux_layer_5": 0.06011962890625, "loss_aux_layer_6": 0.06353759765625, "loss_aux_layer_7": 0.0616455078125, "loss_aux_layer_8": 0.061279296875, "loss_aux_layer_9": 0.06011962890625, "step": 3578, "total_loss": 0.6272989213466644 }, { "epoch": 0.7085725598891308, "grad_norm": 1.041149377822876, "learning_rate": 5e-05, "llm_loss": 0.6453433632850647, "loss": 2.9268, "loss_aux_layer_0": 0.0135955810546875, "loss_aux_layer_1": 0.035736083984375, "loss_aux_layer_10": 0.06402587890625, "loss_aux_layer_11": 0.0684814453125, "loss_aux_layer_12": 0.0731201171875, "loss_aux_layer_13": 0.078369140625, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.0948486328125, "loss_aux_layer_16": 0.103515625, "loss_aux_layer_17": 0.111083984375, "loss_aux_layer_18": 0.11865234375, "loss_aux_layer_19": 0.120849609375, "loss_aux_layer_2": 0.04931640625, "loss_aux_layer_20": 0.1280517578125, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05950927734375, "loss_aux_layer_4": 0.06231689453125, "loss_aux_layer_5": 0.0635986328125, "loss_aux_layer_6": 0.066650390625, "loss_aux_layer_7": 0.06475830078125, "loss_aux_layer_8": 0.06396484375, "loss_aux_layer_9": 0.06280517578125, "step": 3579, "total_loss": 0.7317039966583252 }, { "epoch": 0.7087705404870323, "grad_norm": 0.8647557497024536, "learning_rate": 5e-05, "llm_loss": 0.5359031111001968, "loss": 2.4814, "loss_aux_layer_0": 0.014068603515625, "loss_aux_layer_1": 0.032012939453125, "loss_aux_layer_10": 0.0594482421875, "loss_aux_layer_11": 0.06353759765625, "loss_aux_layer_12": 0.068603515625, "loss_aux_layer_13": 0.07470703125, "loss_aux_layer_14": 0.08349609375, "loss_aux_layer_15": 0.0927734375, "loss_aux_layer_16": 0.10302734375, "loss_aux_layer_17": 0.1107177734375, "loss_aux_layer_18": 0.119873046875, "loss_aux_layer_19": 0.12353515625, "loss_aux_layer_2": 0.0445556640625, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.05426025390625, "loss_aux_layer_4": 0.05670166015625, "loss_aux_layer_5": 0.05828857421875, "loss_aux_layer_6": 0.0611572265625, "loss_aux_layer_7": 0.059326171875, "loss_aux_layer_8": 0.05865478515625, "loss_aux_layer_9": 0.05780029296875, "step": 3580, "total_loss": 0.6203375607728958 }, { "epoch": 0.7089685210849337, "grad_norm": 0.8905626535415649, "learning_rate": 5e-05, "llm_loss": 0.6031978651881218, "loss": 2.7689, "loss_aux_layer_0": 0.0146331787109375, "loss_aux_layer_1": 0.03704833984375, "loss_aux_layer_10": 0.06732177734375, "loss_aux_layer_11": 0.07177734375, "loss_aux_layer_12": 0.076416015625, "loss_aux_layer_13": 0.08203125, "loss_aux_layer_14": 0.090087890625, "loss_aux_layer_15": 0.09814453125, "loss_aux_layer_16": 0.1068115234375, "loss_aux_layer_17": 0.1141357421875, "loss_aux_layer_18": 0.1214599609375, "loss_aux_layer_19": 0.12255859375, "loss_aux_layer_2": 0.05145263671875, "loss_aux_layer_20": 0.129638671875, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.193115234375, "loss_aux_layer_3": 0.06201171875, "loss_aux_layer_4": 0.065185546875, "loss_aux_layer_5": 0.06689453125, "loss_aux_layer_6": 0.0701904296875, "loss_aux_layer_7": 0.068115234375, "loss_aux_layer_8": 0.06744384765625, "loss_aux_layer_9": 0.0657958984375, "step": 3581, "total_loss": 0.6922168731689453 }, { "epoch": 0.7091665016828351, "grad_norm": 0.8508995771408081, "learning_rate": 5e-05, "llm_loss": 0.6085605025291443, "loss": 2.7692, "loss_aux_layer_0": 0.0143890380859375, "loss_aux_layer_1": 0.03350830078125, "loss_aux_layer_10": 0.05963134765625, "loss_aux_layer_11": 0.06365966796875, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.0740966796875, "loss_aux_layer_14": 0.082763671875, "loss_aux_layer_15": 0.0914306640625, "loss_aux_layer_16": 0.1009521484375, "loss_aux_layer_17": 0.1090087890625, "loss_aux_layer_18": 0.1171875, "loss_aux_layer_19": 0.120361328125, "loss_aux_layer_2": 0.04547119140625, "loss_aux_layer_20": 0.12841796875, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.0550537109375, "loss_aux_layer_4": 0.0577392578125, "loss_aux_layer_5": 0.059326171875, "loss_aux_layer_6": 0.0625, "loss_aux_layer_7": 0.060546875, "loss_aux_layer_8": 0.06005859375, "loss_aux_layer_9": 0.05853271484375, "step": 3582, "total_loss": 0.6923074424266815 }, { "epoch": 0.7093644822807365, "grad_norm": 0.8712931275367737, "learning_rate": 5e-05, "llm_loss": 0.6206677407026291, "loss": 2.8083, "loss_aux_layer_0": 0.0140838623046875, "loss_aux_layer_1": 0.030609130859375, "loss_aux_layer_10": 0.05682373046875, "loss_aux_layer_11": 0.06085205078125, "loss_aux_layer_12": 0.06549072265625, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.0794677734375, "loss_aux_layer_15": 0.088623046875, "loss_aux_layer_16": 0.0986328125, "loss_aux_layer_17": 0.10693359375, "loss_aux_layer_18": 0.1156005859375, "loss_aux_layer_19": 0.1195068359375, "loss_aux_layer_2": 0.0426025390625, "loss_aux_layer_20": 0.127685546875, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.0517578125, "loss_aux_layer_4": 0.05401611328125, "loss_aux_layer_5": 0.05548095703125, "loss_aux_layer_6": 0.058349609375, "loss_aux_layer_7": 0.05633544921875, "loss_aux_layer_8": 0.05621337890625, "loss_aux_layer_9": 0.0552978515625, "step": 3583, "total_loss": 0.7020700573921204 }, { "epoch": 0.7095624628786379, "grad_norm": 0.8440040349960327, "learning_rate": 5e-05, "llm_loss": 0.5350666493177414, "loss": 2.4754, "loss_aux_layer_0": 0.014251708984375, "loss_aux_layer_1": 0.03277587890625, "loss_aux_layer_10": 0.05938720703125, "loss_aux_layer_11": 0.06353759765625, "loss_aux_layer_12": 0.068115234375, "loss_aux_layer_13": 0.0738525390625, "loss_aux_layer_14": 0.0826416015625, "loss_aux_layer_15": 0.091064453125, "loss_aux_layer_16": 0.1005859375, "loss_aux_layer_17": 0.1083984375, "loss_aux_layer_18": 0.1165771484375, "loss_aux_layer_19": 0.12060546875, "loss_aux_layer_2": 0.04510498046875, "loss_aux_layer_20": 0.1287841796875, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.05462646484375, "loss_aux_layer_4": 0.0572509765625, "loss_aux_layer_5": 0.05877685546875, "loss_aux_layer_6": 0.0616455078125, "loss_aux_layer_7": 0.05987548828125, "loss_aux_layer_8": 0.05938720703125, "loss_aux_layer_9": 0.05804443359375, "step": 3584, "total_loss": 0.618837870657444 }, { "epoch": 0.7097604434765393, "grad_norm": 0.8505944013595581, "learning_rate": 5e-05, "llm_loss": 0.5411648005247116, "loss": 2.4953, "loss_aux_layer_0": 0.014129638671875, "loss_aux_layer_1": 0.03265380859375, "loss_aux_layer_10": 0.0587158203125, "loss_aux_layer_11": 0.06256103515625, "loss_aux_layer_12": 0.06744384765625, "loss_aux_layer_13": 0.072998046875, "loss_aux_layer_14": 0.08154296875, "loss_aux_layer_15": 0.0904541015625, "loss_aux_layer_16": 0.0997314453125, "loss_aux_layer_17": 0.107421875, "loss_aux_layer_18": 0.1153564453125, "loss_aux_layer_19": 0.1185302734375, "loss_aux_layer_2": 0.0450439453125, "loss_aux_layer_20": 0.12646484375, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.193115234375, "loss_aux_layer_3": 0.0546875, "loss_aux_layer_4": 0.05682373046875, "loss_aux_layer_5": 0.0584716796875, "loss_aux_layer_6": 0.061279296875, "loss_aux_layer_7": 0.05938720703125, "loss_aux_layer_8": 0.05877685546875, "loss_aux_layer_9": 0.05767822265625, "step": 3585, "total_loss": 0.6238247603178024 }, { "epoch": 0.7099584240744407, "grad_norm": 1.2560735940933228, "learning_rate": 5e-05, "llm_loss": 0.5601355135440826, "loss": 2.5751, "loss_aux_layer_0": 0.0146026611328125, "loss_aux_layer_1": 0.0328369140625, "loss_aux_layer_10": 0.06072998046875, "loss_aux_layer_11": 0.06475830078125, "loss_aux_layer_12": 0.069580078125, "loss_aux_layer_13": 0.074951171875, "loss_aux_layer_14": 0.0831298828125, "loss_aux_layer_15": 0.0919189453125, "loss_aux_layer_16": 0.100830078125, "loss_aux_layer_17": 0.1087646484375, "loss_aux_layer_18": 0.1165771484375, "loss_aux_layer_19": 0.1192626953125, "loss_aux_layer_2": 0.04571533203125, "loss_aux_layer_20": 0.126708984375, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.05548095703125, "loss_aux_layer_4": 0.05804443359375, "loss_aux_layer_5": 0.0595703125, "loss_aux_layer_6": 0.06268310546875, "loss_aux_layer_7": 0.06109619140625, "loss_aux_layer_8": 0.06060791015625, "loss_aux_layer_9": 0.05938720703125, "step": 3586, "total_loss": 0.6437814086675644 }, { "epoch": 0.7101564046723421, "grad_norm": 0.8981943130493164, "learning_rate": 5e-05, "llm_loss": 0.5795338302850723, "loss": 2.6475, "loss_aux_layer_0": 0.013336181640625, "loss_aux_layer_1": 0.033050537109375, "loss_aux_layer_10": 0.06024169921875, "loss_aux_layer_11": 0.06414794921875, "loss_aux_layer_12": 0.0687255859375, "loss_aux_layer_13": 0.07421875, "loss_aux_layer_14": 0.08203125, "loss_aux_layer_15": 0.0902099609375, "loss_aux_layer_16": 0.0986328125, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.1142578125, "loss_aux_layer_19": 0.1170654296875, "loss_aux_layer_2": 0.04547119140625, "loss_aux_layer_20": 0.1246337890625, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.05511474609375, "loss_aux_layer_4": 0.05767822265625, "loss_aux_layer_5": 0.059326171875, "loss_aux_layer_6": 0.06219482421875, "loss_aux_layer_7": 0.060302734375, "loss_aux_layer_8": 0.05987548828125, "loss_aux_layer_9": 0.05877685546875, "step": 3587, "total_loss": 0.6618632972240448 }, { "epoch": 0.7103543852702435, "grad_norm": 0.9889597296714783, "learning_rate": 5e-05, "llm_loss": 0.6375586837530136, "loss": 2.8929, "loss_aux_layer_0": 0.0134429931640625, "loss_aux_layer_1": 0.03424072265625, "loss_aux_layer_10": 0.062744140625, "loss_aux_layer_11": 0.06689453125, "loss_aux_layer_12": 0.071044921875, "loss_aux_layer_13": 0.0762939453125, "loss_aux_layer_14": 0.0850830078125, "loss_aux_layer_15": 0.09375, "loss_aux_layer_16": 0.1031494140625, "loss_aux_layer_17": 0.1103515625, "loss_aux_layer_18": 0.11865234375, "loss_aux_layer_19": 0.12109375, "loss_aux_layer_2": 0.04840087890625, "loss_aux_layer_20": 0.129150390625, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.193115234375, "loss_aux_layer_3": 0.05828857421875, "loss_aux_layer_4": 0.0604248046875, "loss_aux_layer_5": 0.062255859375, "loss_aux_layer_6": 0.065185546875, "loss_aux_layer_7": 0.06341552734375, "loss_aux_layer_8": 0.06292724609375, "loss_aux_layer_9": 0.0616455078125, "step": 3588, "total_loss": 0.7232232093811035 }, { "epoch": 0.710552365868145, "grad_norm": 0.8451210260391235, "learning_rate": 5e-05, "llm_loss": 0.5029635280370712, "loss": 2.3403, "loss_aux_layer_0": 0.014678955078125, "loss_aux_layer_1": 0.031890869140625, "loss_aux_layer_10": 0.0584716796875, "loss_aux_layer_11": 0.0625, "loss_aux_layer_12": 0.0670166015625, "loss_aux_layer_13": 0.072509765625, "loss_aux_layer_14": 0.0809326171875, "loss_aux_layer_15": 0.0894775390625, "loss_aux_layer_16": 0.0989990234375, "loss_aux_layer_17": 0.106689453125, "loss_aux_layer_18": 0.11474609375, "loss_aux_layer_19": 0.11865234375, "loss_aux_layer_2": 0.04473876953125, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.05419921875, "loss_aux_layer_4": 0.0565185546875, "loss_aux_layer_5": 0.05804443359375, "loss_aux_layer_6": 0.06097412109375, "loss_aux_layer_7": 0.059326171875, "loss_aux_layer_8": 0.05865478515625, "loss_aux_layer_9": 0.0572509765625, "step": 3589, "total_loss": 0.5850767865777016 }, { "epoch": 0.7107503464660463, "grad_norm": 0.9971134066581726, "learning_rate": 5e-05, "llm_loss": 0.510046660900116, "loss": 2.3704, "loss_aux_layer_0": 0.0145721435546875, "loss_aux_layer_1": 0.032470703125, "loss_aux_layer_10": 0.05902099609375, "loss_aux_layer_11": 0.06298828125, "loss_aux_layer_12": 0.0675048828125, "loss_aux_layer_13": 0.0731201171875, "loss_aux_layer_14": 0.0814208984375, "loss_aux_layer_15": 0.089599609375, "loss_aux_layer_16": 0.0992431640625, "loss_aux_layer_17": 0.10693359375, "loss_aux_layer_18": 0.1156005859375, "loss_aux_layer_19": 0.1192626953125, "loss_aux_layer_2": 0.045166015625, "loss_aux_layer_20": 0.1270751953125, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.0545654296875, "loss_aux_layer_4": 0.056640625, "loss_aux_layer_5": 0.05780029296875, "loss_aux_layer_6": 0.06072998046875, "loss_aux_layer_7": 0.05902099609375, "loss_aux_layer_8": 0.05865478515625, "loss_aux_layer_9": 0.05755615234375, "step": 3590, "total_loss": 0.5925971940159798 }, { "epoch": 0.7109483270639477, "grad_norm": 0.9856619238853455, "learning_rate": 5e-05, "llm_loss": 0.542495459318161, "loss": 2.4892, "loss_aux_layer_0": 0.014984130859375, "loss_aux_layer_1": 0.031524658203125, "loss_aux_layer_10": 0.05657958984375, "loss_aux_layer_11": 0.060302734375, "loss_aux_layer_12": 0.0643310546875, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.077392578125, "loss_aux_layer_15": 0.085693359375, "loss_aux_layer_16": 0.0947265625, "loss_aux_layer_17": 0.102783203125, "loss_aux_layer_18": 0.111083984375, "loss_aux_layer_19": 0.1153564453125, "loss_aux_layer_2": 0.0430908203125, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.05194091796875, "loss_aux_layer_4": 0.05401611328125, "loss_aux_layer_5": 0.0555419921875, "loss_aux_layer_6": 0.05853271484375, "loss_aux_layer_7": 0.056884765625, "loss_aux_layer_8": 0.05645751953125, "loss_aux_layer_9": 0.0552978515625, "step": 3591, "total_loss": 0.6223031431436539 }, { "epoch": 0.7111463076618492, "grad_norm": 0.943148136138916, "learning_rate": 5e-05, "llm_loss": 0.616041287779808, "loss": 2.7989, "loss_aux_layer_0": 0.0140838623046875, "loss_aux_layer_1": 0.0330810546875, "loss_aux_layer_10": 0.060791015625, "loss_aux_layer_11": 0.06475830078125, "loss_aux_layer_12": 0.0693359375, "loss_aux_layer_13": 0.07470703125, "loss_aux_layer_14": 0.08251953125, "loss_aux_layer_15": 0.0906982421875, "loss_aux_layer_16": 0.099609375, "loss_aux_layer_17": 0.107421875, "loss_aux_layer_18": 0.1156005859375, "loss_aux_layer_19": 0.119140625, "loss_aux_layer_2": 0.04571533203125, "loss_aux_layer_20": 0.126953125, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.193603515625, "loss_aux_layer_3": 0.05572509765625, "loss_aux_layer_4": 0.05828857421875, "loss_aux_layer_5": 0.06005859375, "loss_aux_layer_6": 0.06298828125, "loss_aux_layer_7": 0.06103515625, "loss_aux_layer_8": 0.060546875, "loss_aux_layer_9": 0.0594482421875, "step": 3592, "total_loss": 0.6997282058000565 }, { "epoch": 0.7113442882597505, "grad_norm": 1.126584768295288, "learning_rate": 5e-05, "llm_loss": 0.5281229242682457, "loss": 2.4511, "loss_aux_layer_0": 0.0151824951171875, "loss_aux_layer_1": 0.033203125, "loss_aux_layer_10": 0.05975341796875, "loss_aux_layer_11": 0.0640869140625, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.0748291015625, "loss_aux_layer_14": 0.083740234375, "loss_aux_layer_15": 0.09228515625, "loss_aux_layer_16": 0.1019287109375, "loss_aux_layer_17": 0.1102294921875, "loss_aux_layer_18": 0.118896484375, "loss_aux_layer_19": 0.1220703125, "loss_aux_layer_2": 0.04595947265625, "loss_aux_layer_20": 0.129638671875, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.197998046875, "loss_aux_layer_3": 0.05609130859375, "loss_aux_layer_4": 0.0582275390625, "loss_aux_layer_5": 0.0595703125, "loss_aux_layer_6": 0.06268310546875, "loss_aux_layer_7": 0.0604248046875, "loss_aux_layer_8": 0.059814453125, "loss_aux_layer_9": 0.0584716796875, "step": 3593, "total_loss": 0.61276875436306 }, { "epoch": 0.7115422688576519, "grad_norm": 0.8795912861824036, "learning_rate": 5e-05, "llm_loss": 0.6069624274969101, "loss": 2.756, "loss_aux_layer_0": 0.0143585205078125, "loss_aux_layer_1": 0.031219482421875, "loss_aux_layer_10": 0.05712890625, "loss_aux_layer_11": 0.06097412109375, "loss_aux_layer_12": 0.0654296875, "loss_aux_layer_13": 0.0709228515625, "loss_aux_layer_14": 0.080078125, "loss_aux_layer_15": 0.08935546875, "loss_aux_layer_16": 0.0989990234375, "loss_aux_layer_17": 0.107666015625, "loss_aux_layer_18": 0.116455078125, "loss_aux_layer_19": 0.120361328125, "loss_aux_layer_2": 0.04290771484375, "loss_aux_layer_20": 0.128662109375, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.05230712890625, "loss_aux_layer_4": 0.05487060546875, "loss_aux_layer_5": 0.05657958984375, "loss_aux_layer_6": 0.05950927734375, "loss_aux_layer_7": 0.0576171875, "loss_aux_layer_8": 0.05712890625, "loss_aux_layer_9": 0.0560302734375, "step": 3594, "total_loss": 0.6890027225017548 }, { "epoch": 0.7117402494555534, "grad_norm": 1.4676158428192139, "learning_rate": 5e-05, "llm_loss": 0.6084757000207901, "loss": 2.7609, "loss_aux_layer_0": 0.01531982421875, "loss_aux_layer_1": 0.031982421875, "loss_aux_layer_10": 0.05780029296875, "loss_aux_layer_11": 0.0618896484375, "loss_aux_layer_12": 0.06658935546875, "loss_aux_layer_13": 0.072021484375, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.0889892578125, "loss_aux_layer_16": 0.0985107421875, "loss_aux_layer_17": 0.106689453125, "loss_aux_layer_18": 0.115478515625, "loss_aux_layer_19": 0.1197509765625, "loss_aux_layer_2": 0.043212890625, "loss_aux_layer_20": 0.1275634765625, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.05328369140625, "loss_aux_layer_4": 0.055908203125, "loss_aux_layer_5": 0.05755615234375, "loss_aux_layer_6": 0.05999755859375, "loss_aux_layer_7": 0.05810546875, "loss_aux_layer_8": 0.0574951171875, "loss_aux_layer_9": 0.0565185546875, "step": 3595, "total_loss": 0.6902370154857635 }, { "epoch": 0.7119382300534548, "grad_norm": 1.1412538290023804, "learning_rate": 5e-05, "llm_loss": 0.6908669024705887, "loss": 3.0965, "loss_aux_layer_0": 0.014556884765625, "loss_aux_layer_1": 0.032867431640625, "loss_aux_layer_10": 0.05938720703125, "loss_aux_layer_11": 0.06365966796875, "loss_aux_layer_12": 0.0682373046875, "loss_aux_layer_13": 0.07373046875, "loss_aux_layer_14": 0.08251953125, "loss_aux_layer_15": 0.091064453125, "loss_aux_layer_16": 0.1004638671875, "loss_aux_layer_17": 0.108154296875, "loss_aux_layer_18": 0.1163330078125, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.04541015625, "loss_aux_layer_20": 0.128173828125, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.156494140625, "loss_aux_layer_23": 0.193115234375, "loss_aux_layer_3": 0.0546875, "loss_aux_layer_4": 0.05694580078125, "loss_aux_layer_5": 0.05859375, "loss_aux_layer_6": 0.0615234375, "loss_aux_layer_7": 0.0596923828125, "loss_aux_layer_8": 0.0592041015625, "loss_aux_layer_9": 0.05792236328125, "step": 3596, "total_loss": 0.7741343379020691 }, { "epoch": 0.7121362106513561, "grad_norm": 1.436166524887085, "learning_rate": 5e-05, "llm_loss": 0.6541563272476196, "loss": 2.9482, "loss_aux_layer_0": 0.01470947265625, "loss_aux_layer_1": 0.033599853515625, "loss_aux_layer_10": 0.06048583984375, "loss_aux_layer_11": 0.064697265625, "loss_aux_layer_12": 0.0689697265625, "loss_aux_layer_13": 0.07470703125, "loss_aux_layer_14": 0.0826416015625, "loss_aux_layer_15": 0.0908203125, "loss_aux_layer_16": 0.0992431640625, "loss_aux_layer_17": 0.1072998046875, "loss_aux_layer_18": 0.1156005859375, "loss_aux_layer_19": 0.1182861328125, "loss_aux_layer_2": 0.0455322265625, "loss_aux_layer_20": 0.12548828125, "loss_aux_layer_21": 0.1322021484375, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.05548095703125, "loss_aux_layer_4": 0.05841064453125, "loss_aux_layer_5": 0.05999755859375, "loss_aux_layer_6": 0.06298828125, "loss_aux_layer_7": 0.0609130859375, "loss_aux_layer_8": 0.06036376953125, "loss_aux_layer_9": 0.05938720703125, "step": 3597, "total_loss": 0.7370469868183136 }, { "epoch": 0.7123341912492576, "grad_norm": 1.2589339017868042, "learning_rate": 5e-05, "llm_loss": 0.5771375298500061, "loss": 2.6554, "loss_aux_layer_0": 0.015289306640625, "loss_aux_layer_1": 0.035064697265625, "loss_aux_layer_10": 0.0623779296875, "loss_aux_layer_11": 0.06683349609375, "loss_aux_layer_12": 0.0709228515625, "loss_aux_layer_13": 0.0765380859375, "loss_aux_layer_14": 0.085205078125, "loss_aux_layer_15": 0.0936279296875, "loss_aux_layer_16": 0.1036376953125, "loss_aux_layer_17": 0.1124267578125, "loss_aux_layer_18": 0.1209716796875, "loss_aux_layer_19": 0.12451171875, "loss_aux_layer_2": 0.04693603515625, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.140625, "loss_aux_layer_22": 0.162841796875, "loss_aux_layer_23": 0.201904296875, "loss_aux_layer_3": 0.05718994140625, "loss_aux_layer_4": 0.06005859375, "loss_aux_layer_5": 0.0615234375, "loss_aux_layer_6": 0.06494140625, "loss_aux_layer_7": 0.06256103515625, "loss_aux_layer_8": 0.06219482421875, "loss_aux_layer_9": 0.060791015625, "step": 3598, "total_loss": 0.6638537347316742 }, { "epoch": 0.712532171847159, "grad_norm": 1.0616689920425415, "learning_rate": 5e-05, "llm_loss": 0.5877373516559601, "loss": 2.6948, "loss_aux_layer_0": 0.0139923095703125, "loss_aux_layer_1": 0.03515625, "loss_aux_layer_10": 0.0635986328125, "loss_aux_layer_11": 0.068115234375, "loss_aux_layer_12": 0.0723876953125, "loss_aux_layer_13": 0.077880859375, "loss_aux_layer_14": 0.0858154296875, "loss_aux_layer_15": 0.093994140625, "loss_aux_layer_16": 0.1031494140625, "loss_aux_layer_17": 0.110595703125, "loss_aux_layer_18": 0.117919921875, "loss_aux_layer_19": 0.1204833984375, "loss_aux_layer_2": 0.04840087890625, "loss_aux_layer_20": 0.1285400390625, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.05902099609375, "loss_aux_layer_4": 0.0615234375, "loss_aux_layer_5": 0.0631103515625, "loss_aux_layer_6": 0.06622314453125, "loss_aux_layer_7": 0.064208984375, "loss_aux_layer_8": 0.0634765625, "loss_aux_layer_9": 0.062255859375, "step": 3599, "total_loss": 0.6737018674612045 }, { "epoch": 0.7127301524450604, "grad_norm": 1.058619499206543, "learning_rate": 5e-05, "llm_loss": 0.6414273083209991, "loss": 2.9031, "loss_aux_layer_0": 0.0141143798828125, "loss_aux_layer_1": 0.032928466796875, "loss_aux_layer_10": 0.06103515625, "loss_aux_layer_11": 0.0653076171875, "loss_aux_layer_12": 0.0701904296875, "loss_aux_layer_13": 0.0760498046875, "loss_aux_layer_14": 0.0845947265625, "loss_aux_layer_15": 0.09326171875, "loss_aux_layer_16": 0.1026611328125, "loss_aux_layer_17": 0.1107177734375, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.12158203125, "loss_aux_layer_2": 0.04547119140625, "loss_aux_layer_20": 0.129150390625, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.0557861328125, "loss_aux_layer_4": 0.0584716796875, "loss_aux_layer_5": 0.06011962890625, "loss_aux_layer_6": 0.0633544921875, "loss_aux_layer_7": 0.06121826171875, "loss_aux_layer_8": 0.06085205078125, "loss_aux_layer_9": 0.05963134765625, "step": 3600, "total_loss": 0.725777342915535 }, { "epoch": 0.7129281330429618, "grad_norm": 0.8962019681930542, "learning_rate": 5e-05, "llm_loss": 0.5417463555932045, "loss": 2.4958, "loss_aux_layer_0": 0.013824462890625, "loss_aux_layer_1": 0.03271484375, "loss_aux_layer_10": 0.05865478515625, "loss_aux_layer_11": 0.062744140625, "loss_aux_layer_12": 0.0673828125, "loss_aux_layer_13": 0.0728759765625, "loss_aux_layer_14": 0.081298828125, "loss_aux_layer_15": 0.089599609375, "loss_aux_layer_16": 0.0989990234375, "loss_aux_layer_17": 0.1068115234375, "loss_aux_layer_18": 0.1153564453125, "loss_aux_layer_19": 0.1185302734375, "loss_aux_layer_2": 0.04425048828125, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.053466796875, "loss_aux_layer_4": 0.05584716796875, "loss_aux_layer_5": 0.057373046875, "loss_aux_layer_6": 0.06011962890625, "loss_aux_layer_7": 0.05841064453125, "loss_aux_layer_8": 0.05828857421875, "loss_aux_layer_9": 0.0574951171875, "step": 3601, "total_loss": 0.6239492669701576 }, { "epoch": 0.7131261136408632, "grad_norm": 1.0638922452926636, "learning_rate": 5e-05, "llm_loss": 0.6047063916921616, "loss": 2.7541, "loss_aux_layer_0": 0.013458251953125, "loss_aux_layer_1": 0.034027099609375, "loss_aux_layer_10": 0.0595703125, "loss_aux_layer_11": 0.0638427734375, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.0748291015625, "loss_aux_layer_14": 0.0836181640625, "loss_aux_layer_15": 0.09228515625, "loss_aux_layer_16": 0.1015625, "loss_aux_layer_17": 0.1090087890625, "loss_aux_layer_18": 0.117431640625, "loss_aux_layer_19": 0.1202392578125, "loss_aux_layer_2": 0.0457763671875, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.0556640625, "loss_aux_layer_4": 0.05780029296875, "loss_aux_layer_5": 0.05914306640625, "loss_aux_layer_6": 0.061767578125, "loss_aux_layer_7": 0.05987548828125, "loss_aux_layer_8": 0.05950927734375, "loss_aux_layer_9": 0.0582275390625, "step": 3602, "total_loss": 0.6885174363851547 }, { "epoch": 0.7133240942387646, "grad_norm": 0.9575835466384888, "learning_rate": 5e-05, "llm_loss": 0.6542863845825195, "loss": 2.9518, "loss_aux_layer_0": 0.0137176513671875, "loss_aux_layer_1": 0.03302001953125, "loss_aux_layer_10": 0.06024169921875, "loss_aux_layer_11": 0.06402587890625, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.0740966796875, "loss_aux_layer_14": 0.0831298828125, "loss_aux_layer_15": 0.092041015625, "loss_aux_layer_16": 0.1009521484375, "loss_aux_layer_17": 0.1087646484375, "loss_aux_layer_18": 0.1168212890625, "loss_aux_layer_19": 0.1201171875, "loss_aux_layer_2": 0.04547119140625, "loss_aux_layer_20": 0.1279296875, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.156494140625, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.05548095703125, "loss_aux_layer_4": 0.05816650390625, "loss_aux_layer_5": 0.05975341796875, "loss_aux_layer_6": 0.0626220703125, "loss_aux_layer_7": 0.060302734375, "loss_aux_layer_8": 0.06005859375, "loss_aux_layer_9": 0.058837890625, "step": 3603, "total_loss": 0.7379538416862488 }, { "epoch": 0.713522074836666, "grad_norm": 1.1176148653030396, "learning_rate": 5e-05, "llm_loss": 0.608578622341156, "loss": 2.7723, "loss_aux_layer_0": 0.013397216796875, "loss_aux_layer_1": 0.033538818359375, "loss_aux_layer_10": 0.0616455078125, "loss_aux_layer_11": 0.0660400390625, "loss_aux_layer_12": 0.0703125, "loss_aux_layer_13": 0.076171875, "loss_aux_layer_14": 0.0845947265625, "loss_aux_layer_15": 0.0928955078125, "loss_aux_layer_16": 0.1019287109375, "loss_aux_layer_17": 0.109130859375, "loss_aux_layer_18": 0.1168212890625, "loss_aux_layer_19": 0.11962890625, "loss_aux_layer_2": 0.046875, "loss_aux_layer_20": 0.127197265625, "loss_aux_layer_21": 0.1346435546875, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.190673828125, "loss_aux_layer_3": 0.05731201171875, "loss_aux_layer_4": 0.06011962890625, "loss_aux_layer_5": 0.061767578125, "loss_aux_layer_6": 0.06463623046875, "loss_aux_layer_7": 0.06268310546875, "loss_aux_layer_8": 0.061767578125, "loss_aux_layer_9": 0.0604248046875, "step": 3604, "total_loss": 0.6930817067623138 }, { "epoch": 0.7137200554345674, "grad_norm": 0.9058377742767334, "learning_rate": 5e-05, "llm_loss": 0.5927819013595581, "loss": 2.7044, "loss_aux_layer_0": 0.013824462890625, "loss_aux_layer_1": 0.03302001953125, "loss_aux_layer_10": 0.0595703125, "loss_aux_layer_11": 0.06353759765625, "loss_aux_layer_12": 0.0679931640625, "loss_aux_layer_13": 0.0732421875, "loss_aux_layer_14": 0.0814208984375, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.099365234375, "loss_aux_layer_17": 0.1075439453125, "loss_aux_layer_18": 0.116455078125, "loss_aux_layer_19": 0.1202392578125, "loss_aux_layer_2": 0.045166015625, "loss_aux_layer_20": 0.1282958984375, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.0548095703125, "loss_aux_layer_4": 0.05718994140625, "loss_aux_layer_5": 0.058837890625, "loss_aux_layer_6": 0.0616455078125, "loss_aux_layer_7": 0.0595703125, "loss_aux_layer_8": 0.05914306640625, "loss_aux_layer_9": 0.05816650390625, "step": 3605, "total_loss": 0.6760895699262619 }, { "epoch": 0.7139180360324688, "grad_norm": 1.1008498668670654, "learning_rate": 5e-05, "llm_loss": 0.5344496965408325, "loss": 2.4723, "loss_aux_layer_0": 0.0135650634765625, "loss_aux_layer_1": 0.03399658203125, "loss_aux_layer_10": 0.0609130859375, "loss_aux_layer_11": 0.06500244140625, "loss_aux_layer_12": 0.0694580078125, "loss_aux_layer_13": 0.074951171875, "loss_aux_layer_14": 0.0833740234375, "loss_aux_layer_15": 0.091796875, "loss_aux_layer_16": 0.1004638671875, "loss_aux_layer_17": 0.108154296875, "loss_aux_layer_18": 0.115966796875, "loss_aux_layer_19": 0.1190185546875, "loss_aux_layer_2": 0.0457763671875, "loss_aux_layer_20": 0.126708984375, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.05584716796875, "loss_aux_layer_4": 0.05865478515625, "loss_aux_layer_5": 0.060546875, "loss_aux_layer_6": 0.06304931640625, "loss_aux_layer_7": 0.061279296875, "loss_aux_layer_8": 0.060546875, "loss_aux_layer_9": 0.05938720703125, "step": 3606, "total_loss": 0.618080273270607 }, { "epoch": 0.7141160166303703, "grad_norm": 1.3770653009414673, "learning_rate": 5e-05, "llm_loss": 0.630917102098465, "loss": 2.8718, "loss_aux_layer_0": 0.013031005859375, "loss_aux_layer_1": 0.0347900390625, "loss_aux_layer_10": 0.06414794921875, "loss_aux_layer_11": 0.0682373046875, "loss_aux_layer_12": 0.0728759765625, "loss_aux_layer_13": 0.07861328125, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.0950927734375, "loss_aux_layer_16": 0.10400390625, "loss_aux_layer_17": 0.1116943359375, "loss_aux_layer_18": 0.1199951171875, "loss_aux_layer_19": 0.122802734375, "loss_aux_layer_2": 0.04931640625, "loss_aux_layer_20": 0.130126953125, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.1943359375, "loss_aux_layer_3": 0.06011962890625, "loss_aux_layer_4": 0.0628662109375, "loss_aux_layer_5": 0.06439208984375, "loss_aux_layer_6": 0.06732177734375, "loss_aux_layer_7": 0.0653076171875, "loss_aux_layer_8": 0.064453125, "loss_aux_layer_9": 0.062744140625, "step": 3607, "total_loss": 0.7179597616195679 }, { "epoch": 0.7143139972282716, "grad_norm": 1.5579291582107544, "learning_rate": 5e-05, "llm_loss": 0.6346659362316132, "loss": 2.8891, "loss_aux_layer_0": 0.013702392578125, "loss_aux_layer_1": 0.03558349609375, "loss_aux_layer_10": 0.0653076171875, "loss_aux_layer_11": 0.0699462890625, "loss_aux_layer_12": 0.0745849609375, "loss_aux_layer_13": 0.0802001953125, "loss_aux_layer_14": 0.08837890625, "loss_aux_layer_15": 0.09619140625, "loss_aux_layer_16": 0.1053466796875, "loss_aux_layer_17": 0.112548828125, "loss_aux_layer_18": 0.1201171875, "loss_aux_layer_19": 0.1221923828125, "loss_aux_layer_2": 0.050048828125, "loss_aux_layer_20": 0.1290283203125, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.193603515625, "loss_aux_layer_3": 0.0604248046875, "loss_aux_layer_4": 0.063232421875, "loss_aux_layer_5": 0.0648193359375, "loss_aux_layer_6": 0.06793212890625, "loss_aux_layer_7": 0.0660400390625, "loss_aux_layer_8": 0.06561279296875, "loss_aux_layer_9": 0.06414794921875, "step": 3608, "total_loss": 0.7222723364830017 }, { "epoch": 0.714511977826173, "grad_norm": 1.1680490970611572, "learning_rate": 5e-05, "llm_loss": 0.5480012446641922, "loss": 2.5406, "loss_aux_layer_0": 0.0141448974609375, "loss_aux_layer_1": 0.0357666015625, "loss_aux_layer_10": 0.064208984375, "loss_aux_layer_11": 0.0682373046875, "loss_aux_layer_12": 0.072998046875, "loss_aux_layer_13": 0.07861328125, "loss_aux_layer_14": 0.0870361328125, "loss_aux_layer_15": 0.09521484375, "loss_aux_layer_16": 0.10400390625, "loss_aux_layer_17": 0.1116943359375, "loss_aux_layer_18": 0.11962890625, "loss_aux_layer_19": 0.122314453125, "loss_aux_layer_2": 0.04962158203125, "loss_aux_layer_20": 0.129638671875, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.059814453125, "loss_aux_layer_4": 0.06256103515625, "loss_aux_layer_5": 0.0640869140625, "loss_aux_layer_6": 0.0670166015625, "loss_aux_layer_7": 0.065185546875, "loss_aux_layer_8": 0.0645751953125, "loss_aux_layer_9": 0.06298828125, "step": 3609, "total_loss": 0.6351398825645447 }, { "epoch": 0.7147099584240745, "grad_norm": 0.9318494200706482, "learning_rate": 5e-05, "llm_loss": 0.5785657614469528, "loss": 2.6456, "loss_aux_layer_0": 0.013946533203125, "loss_aux_layer_1": 0.03314208984375, "loss_aux_layer_10": 0.0594482421875, "loss_aux_layer_11": 0.06341552734375, "loss_aux_layer_12": 0.06787109375, "loss_aux_layer_13": 0.073486328125, "loss_aux_layer_14": 0.081787109375, "loss_aux_layer_15": 0.090087890625, "loss_aux_layer_16": 0.099365234375, "loss_aux_layer_17": 0.107177734375, "loss_aux_layer_18": 0.115478515625, "loss_aux_layer_19": 0.1190185546875, "loss_aux_layer_2": 0.04559326171875, "loss_aux_layer_20": 0.126953125, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05511474609375, "loss_aux_layer_4": 0.05743408203125, "loss_aux_layer_5": 0.05865478515625, "loss_aux_layer_6": 0.0614013671875, "loss_aux_layer_7": 0.05987548828125, "loss_aux_layer_8": 0.05963134765625, "loss_aux_layer_9": 0.0584716796875, "step": 3610, "total_loss": 0.6614100188016891 }, { "epoch": 0.7149079390219758, "grad_norm": 1.1143065690994263, "learning_rate": 5e-05, "llm_loss": 0.617935910820961, "loss": 2.8107, "loss_aux_layer_0": 0.0136260986328125, "loss_aux_layer_1": 0.03314208984375, "loss_aux_layer_10": 0.06134033203125, "loss_aux_layer_11": 0.0655517578125, "loss_aux_layer_12": 0.0701904296875, "loss_aux_layer_13": 0.0758056640625, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.0931396484375, "loss_aux_layer_16": 0.1026611328125, "loss_aux_layer_17": 0.1104736328125, "loss_aux_layer_18": 0.118896484375, "loss_aux_layer_19": 0.1219482421875, "loss_aux_layer_2": 0.0467529296875, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.136962890625, "loss_aux_layer_22": 0.156494140625, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.05645751953125, "loss_aux_layer_4": 0.0587158203125, "loss_aux_layer_5": 0.0601806640625, "loss_aux_layer_6": 0.06304931640625, "loss_aux_layer_7": 0.0611572265625, "loss_aux_layer_8": 0.06060791015625, "loss_aux_layer_9": 0.05963134765625, "step": 3611, "total_loss": 0.702681303024292 }, { "epoch": 0.7151059196198772, "grad_norm": 0.8964203596115112, "learning_rate": 5e-05, "llm_loss": 0.65145343542099, "loss": 2.9491, "loss_aux_layer_0": 0.0136566162109375, "loss_aux_layer_1": 0.033447265625, "loss_aux_layer_10": 0.06195068359375, "loss_aux_layer_11": 0.06640625, "loss_aux_layer_12": 0.071044921875, "loss_aux_layer_13": 0.0770263671875, "loss_aux_layer_14": 0.0859375, "loss_aux_layer_15": 0.0947265625, "loss_aux_layer_16": 0.104248046875, "loss_aux_layer_17": 0.1123046875, "loss_aux_layer_18": 0.12109375, "loss_aux_layer_19": 0.1240234375, "loss_aux_layer_2": 0.04656982421875, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.05670166015625, "loss_aux_layer_4": 0.059326171875, "loss_aux_layer_5": 0.06072998046875, "loss_aux_layer_6": 0.06396484375, "loss_aux_layer_7": 0.062255859375, "loss_aux_layer_8": 0.061767578125, "loss_aux_layer_9": 0.06048583984375, "step": 3612, "total_loss": 0.7372824400663376 }, { "epoch": 0.7153039002177787, "grad_norm": 1.1735106706619263, "learning_rate": 5e-05, "llm_loss": 0.5413382351398468, "loss": 2.5095, "loss_aux_layer_0": 0.015289306640625, "loss_aux_layer_1": 0.03399658203125, "loss_aux_layer_10": 0.0623779296875, "loss_aux_layer_11": 0.0667724609375, "loss_aux_layer_12": 0.0712890625, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.0850830078125, "loss_aux_layer_15": 0.0936279296875, "loss_aux_layer_16": 0.102783203125, "loss_aux_layer_17": 0.1103515625, "loss_aux_layer_18": 0.118896484375, "loss_aux_layer_19": 0.121826171875, "loss_aux_layer_2": 0.0472412109375, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.05743408203125, "loss_aux_layer_4": 0.06036376953125, "loss_aux_layer_5": 0.06207275390625, "loss_aux_layer_6": 0.0650634765625, "loss_aux_layer_7": 0.063232421875, "loss_aux_layer_8": 0.0626220703125, "loss_aux_layer_9": 0.0614013671875, "step": 3613, "total_loss": 0.6273751482367516 }, { "epoch": 0.7155018808156801, "grad_norm": 0.9723641276359558, "learning_rate": 5e-05, "llm_loss": 0.648627907037735, "loss": 2.9102, "loss_aux_layer_0": 0.0138702392578125, "loss_aux_layer_1": 0.02984619140625, "loss_aux_layer_10": 0.05596923828125, "loss_aux_layer_11": 0.0594482421875, "loss_aux_layer_12": 0.063720703125, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.077880859375, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.096435546875, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.1124267578125, "loss_aux_layer_19": 0.115234375, "loss_aux_layer_2": 0.04119873046875, "loss_aux_layer_20": 0.1229248046875, "loss_aux_layer_21": 0.1302490234375, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.05010986328125, "loss_aux_layer_4": 0.05255126953125, "loss_aux_layer_5": 0.0543212890625, "loss_aux_layer_6": 0.05731201171875, "loss_aux_layer_7": 0.0555419921875, "loss_aux_layer_8": 0.05548095703125, "loss_aux_layer_9": 0.0546875, "step": 3614, "total_loss": 0.7275386452674866 }, { "epoch": 0.7156998614135814, "grad_norm": 1.2283961772918701, "learning_rate": 5e-05, "llm_loss": 0.61470066010952, "loss": 2.7991, "loss_aux_layer_0": 0.0157623291015625, "loss_aux_layer_1": 0.03424072265625, "loss_aux_layer_10": 0.06134033203125, "loss_aux_layer_11": 0.0653076171875, "loss_aux_layer_12": 0.06982421875, "loss_aux_layer_13": 0.0758056640625, "loss_aux_layer_14": 0.08447265625, "loss_aux_layer_15": 0.09326171875, "loss_aux_layer_16": 0.1029052734375, "loss_aux_layer_17": 0.110107421875, "loss_aux_layer_18": 0.1182861328125, "loss_aux_layer_19": 0.12158203125, "loss_aux_layer_2": 0.04681396484375, "loss_aux_layer_20": 0.1290283203125, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.05682373046875, "loss_aux_layer_4": 0.0589599609375, "loss_aux_layer_5": 0.0604248046875, "loss_aux_layer_6": 0.06329345703125, "loss_aux_layer_7": 0.06121826171875, "loss_aux_layer_8": 0.0609130859375, "loss_aux_layer_9": 0.06005859375, "step": 3615, "total_loss": 0.6997696161270142 }, { "epoch": 0.7158978420114829, "grad_norm": 0.9035474061965942, "learning_rate": 5e-05, "llm_loss": 0.6028526127338409, "loss": 2.7386, "loss_aux_layer_0": 0.01385498046875, "loss_aux_layer_1": 0.031707763671875, "loss_aux_layer_10": 0.05859375, "loss_aux_layer_11": 0.06268310546875, "loss_aux_layer_12": 0.0670166015625, "loss_aux_layer_13": 0.072509765625, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.08935546875, "loss_aux_layer_16": 0.098388671875, "loss_aux_layer_17": 0.1063232421875, "loss_aux_layer_18": 0.1141357421875, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.0439453125, "loss_aux_layer_20": 0.1256103515625, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.05352783203125, "loss_aux_layer_4": 0.05621337890625, "loss_aux_layer_5": 0.05767822265625, "loss_aux_layer_6": 0.06048583984375, "loss_aux_layer_7": 0.05877685546875, "loss_aux_layer_8": 0.0584716796875, "loss_aux_layer_9": 0.05731201171875, "step": 3616, "total_loss": 0.6846549212932587 }, { "epoch": 0.7160958226093843, "grad_norm": 0.9743249416351318, "learning_rate": 5e-05, "llm_loss": 0.5338202714920044, "loss": 2.4732, "loss_aux_layer_0": 0.015472412109375, "loss_aux_layer_1": 0.0333251953125, "loss_aux_layer_10": 0.06011962890625, "loss_aux_layer_11": 0.06396484375, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.07421875, "loss_aux_layer_14": 0.083251953125, "loss_aux_layer_15": 0.0921630859375, "loss_aux_layer_16": 0.1021728515625, "loss_aux_layer_17": 0.1094970703125, "loss_aux_layer_18": 0.1182861328125, "loss_aux_layer_19": 0.1226806640625, "loss_aux_layer_2": 0.0462646484375, "loss_aux_layer_20": 0.130859375, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.05615234375, "loss_aux_layer_4": 0.0582275390625, "loss_aux_layer_5": 0.05938720703125, "loss_aux_layer_6": 0.06207275390625, "loss_aux_layer_7": 0.0601806640625, "loss_aux_layer_8": 0.0599365234375, "loss_aux_layer_9": 0.0589599609375, "step": 3617, "total_loss": 0.6183076947927475 }, { "epoch": 0.7162938032072856, "grad_norm": 0.7337396740913391, "learning_rate": 5e-05, "llm_loss": 0.6255896240472794, "loss": 2.845, "loss_aux_layer_0": 0.01373291015625, "loss_aux_layer_1": 0.032989501953125, "loss_aux_layer_10": 0.0623779296875, "loss_aux_layer_11": 0.066650390625, "loss_aux_layer_12": 0.0711669921875, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.08544921875, "loss_aux_layer_15": 0.0946044921875, "loss_aux_layer_16": 0.1041259765625, "loss_aux_layer_17": 0.111572265625, "loss_aux_layer_18": 0.12060546875, "loss_aux_layer_19": 0.1236572265625, "loss_aux_layer_2": 0.04595947265625, "loss_aux_layer_20": 0.13134765625, "loss_aux_layer_21": 0.138671875, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.0560302734375, "loss_aux_layer_4": 0.05889892578125, "loss_aux_layer_5": 0.06085205078125, "loss_aux_layer_6": 0.06396484375, "loss_aux_layer_7": 0.06219482421875, "loss_aux_layer_8": 0.06170654296875, "loss_aux_layer_9": 0.06109619140625, "step": 3618, "total_loss": 0.7112551778554916 }, { "epoch": 0.7164917838051871, "grad_norm": 0.970956563949585, "learning_rate": 5e-05, "llm_loss": 0.7051301002502441, "loss": 3.1439, "loss_aux_layer_0": 0.014434814453125, "loss_aux_layer_1": 0.0313720703125, "loss_aux_layer_10": 0.05743408203125, "loss_aux_layer_11": 0.061279296875, "loss_aux_layer_12": 0.065673828125, "loss_aux_layer_13": 0.0712890625, "loss_aux_layer_14": 0.07958984375, "loss_aux_layer_15": 0.0882568359375, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.10546875, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.117431640625, "loss_aux_layer_2": 0.04296875, "loss_aux_layer_20": 0.12548828125, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.05206298828125, "loss_aux_layer_4": 0.05474853515625, "loss_aux_layer_5": 0.0562744140625, "loss_aux_layer_6": 0.059326171875, "loss_aux_layer_7": 0.05731201171875, "loss_aux_layer_8": 0.0570068359375, "loss_aux_layer_9": 0.0557861328125, "step": 3619, "total_loss": 0.7859743535518646 }, { "epoch": 0.7166897644030885, "grad_norm": 0.8485208749771118, "learning_rate": 5e-05, "llm_loss": 0.634432390332222, "loss": 2.8945, "loss_aux_layer_0": 0.014190673828125, "loss_aux_layer_1": 0.03607177734375, "loss_aux_layer_10": 0.0650634765625, "loss_aux_layer_11": 0.0694580078125, "loss_aux_layer_12": 0.07421875, "loss_aux_layer_13": 0.079833984375, "loss_aux_layer_14": 0.0889892578125, "loss_aux_layer_15": 0.0970458984375, "loss_aux_layer_16": 0.1068115234375, "loss_aux_layer_17": 0.1143798828125, "loss_aux_layer_18": 0.1226806640625, "loss_aux_layer_19": 0.1258544921875, "loss_aux_layer_2": 0.05047607421875, "loss_aux_layer_20": 0.1334228515625, "loss_aux_layer_21": 0.141845703125, "loss_aux_layer_22": 0.164794921875, "loss_aux_layer_23": 0.202392578125, "loss_aux_layer_3": 0.06109619140625, "loss_aux_layer_4": 0.06365966796875, "loss_aux_layer_5": 0.065185546875, "loss_aux_layer_6": 0.06829833984375, "loss_aux_layer_7": 0.06591796875, "loss_aux_layer_8": 0.0653076171875, "loss_aux_layer_9": 0.06378173828125, "step": 3620, "total_loss": 0.7236190587282181 }, { "epoch": 0.71688774500099, "grad_norm": 0.9313652515411377, "learning_rate": 5e-05, "llm_loss": 0.5805419087409973, "loss": 2.6413, "loss_aux_layer_0": 0.01373291015625, "loss_aux_layer_1": 0.03082275390625, "loss_aux_layer_10": 0.05596923828125, "loss_aux_layer_11": 0.05987548828125, "loss_aux_layer_12": 0.0640869140625, "loss_aux_layer_13": 0.069580078125, "loss_aux_layer_14": 0.0780029296875, "loss_aux_layer_15": 0.086669921875, "loss_aux_layer_16": 0.09619140625, "loss_aux_layer_17": 0.1041259765625, "loss_aux_layer_18": 0.112548828125, "loss_aux_layer_19": 0.1163330078125, "loss_aux_layer_2": 0.04248046875, "loss_aux_layer_20": 0.1246337890625, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.0516357421875, "loss_aux_layer_4": 0.0538330078125, "loss_aux_layer_5": 0.0552978515625, "loss_aux_layer_6": 0.05816650390625, "loss_aux_layer_7": 0.05609130859375, "loss_aux_layer_8": 0.0556640625, "loss_aux_layer_9": 0.0546875, "step": 3621, "total_loss": 0.6603221595287323 }, { "epoch": 0.7170857255988913, "grad_norm": 0.9184389114379883, "learning_rate": 5e-05, "llm_loss": 0.5946279615163803, "loss": 2.6952, "loss_aux_layer_0": 0.0139923095703125, "loss_aux_layer_1": 0.03167724609375, "loss_aux_layer_10": 0.05609130859375, "loss_aux_layer_11": 0.05999755859375, "loss_aux_layer_12": 0.0643310546875, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.0775146484375, "loss_aux_layer_15": 0.085693359375, "loss_aux_layer_16": 0.0948486328125, "loss_aux_layer_17": 0.1026611328125, "loss_aux_layer_18": 0.11083984375, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.04278564453125, "loss_aux_layer_20": 0.122802734375, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.05157470703125, "loss_aux_layer_4": 0.05389404296875, "loss_aux_layer_5": 0.0552978515625, "loss_aux_layer_6": 0.0577392578125, "loss_aux_layer_7": 0.0562744140625, "loss_aux_layer_8": 0.0560302734375, "loss_aux_layer_9": 0.05474853515625, "step": 3622, "total_loss": 0.6737933605909348 }, { "epoch": 0.7172837061967927, "grad_norm": 0.9360194206237793, "learning_rate": 5e-05, "llm_loss": 0.5632027387619019, "loss": 2.5915, "loss_aux_layer_0": 0.0137176513671875, "loss_aux_layer_1": 0.03399658203125, "loss_aux_layer_10": 0.0616455078125, "loss_aux_layer_11": 0.06561279296875, "loss_aux_layer_12": 0.0703125, "loss_aux_layer_13": 0.075927734375, "loss_aux_layer_14": 0.084228515625, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.1016845703125, "loss_aux_layer_17": 0.1090087890625, "loss_aux_layer_18": 0.1171875, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.0474853515625, "loss_aux_layer_20": 0.1280517578125, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.05731201171875, "loss_aux_layer_4": 0.05963134765625, "loss_aux_layer_5": 0.06121826171875, "loss_aux_layer_6": 0.0643310546875, "loss_aux_layer_7": 0.0625, "loss_aux_layer_8": 0.06182861328125, "loss_aux_layer_9": 0.0601806640625, "step": 3623, "total_loss": 0.647874653339386 }, { "epoch": 0.7174816867946942, "grad_norm": 1.0392299890518188, "learning_rate": 5e-05, "llm_loss": 0.6415293365716934, "loss": 2.903, "loss_aux_layer_0": 0.0141143798828125, "loss_aux_layer_1": 0.033050537109375, "loss_aux_layer_10": 0.0609130859375, "loss_aux_layer_11": 0.06494140625, "loss_aux_layer_12": 0.06951904296875, "loss_aux_layer_13": 0.0748291015625, "loss_aux_layer_14": 0.0836181640625, "loss_aux_layer_15": 0.0921630859375, "loss_aux_layer_16": 0.1011962890625, "loss_aux_layer_17": 0.1085205078125, "loss_aux_layer_18": 0.1163330078125, "loss_aux_layer_19": 0.11962890625, "loss_aux_layer_2": 0.0467529296875, "loss_aux_layer_20": 0.1275634765625, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.193603515625, "loss_aux_layer_3": 0.05670166015625, "loss_aux_layer_4": 0.0592041015625, "loss_aux_layer_5": 0.060791015625, "loss_aux_layer_6": 0.064208984375, "loss_aux_layer_7": 0.06182861328125, "loss_aux_layer_8": 0.0611572265625, "loss_aux_layer_9": 0.0596923828125, "step": 3624, "total_loss": 0.7257393300533295 }, { "epoch": 0.7176796673925955, "grad_norm": 0.8570641279220581, "learning_rate": 5e-05, "llm_loss": 0.6045070737600327, "loss": 2.7527, "loss_aux_layer_0": 0.0139617919921875, "loss_aux_layer_1": 0.03302001953125, "loss_aux_layer_10": 0.05999755859375, "loss_aux_layer_11": 0.0643310546875, "loss_aux_layer_12": 0.0689697265625, "loss_aux_layer_13": 0.0745849609375, "loss_aux_layer_14": 0.0828857421875, "loss_aux_layer_15": 0.0914306640625, "loss_aux_layer_16": 0.1007080078125, "loss_aux_layer_17": 0.1085205078125, "loss_aux_layer_18": 0.1165771484375, "loss_aux_layer_19": 0.11962890625, "loss_aux_layer_2": 0.0455322265625, "loss_aux_layer_20": 0.127197265625, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.05517578125, "loss_aux_layer_4": 0.05780029296875, "loss_aux_layer_5": 0.0595703125, "loss_aux_layer_6": 0.0625, "loss_aux_layer_7": 0.06085205078125, "loss_aux_layer_8": 0.0601806640625, "loss_aux_layer_9": 0.058837890625, "step": 3625, "total_loss": 0.6881678402423859 }, { "epoch": 0.7178776479904969, "grad_norm": 0.9053658246994019, "learning_rate": 5e-05, "llm_loss": 0.5623315721750259, "loss": 2.5747, "loss_aux_layer_0": 0.0134429931640625, "loss_aux_layer_1": 0.03204345703125, "loss_aux_layer_10": 0.05780029296875, "loss_aux_layer_11": 0.06158447265625, "loss_aux_layer_12": 0.06591796875, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.0799560546875, "loss_aux_layer_15": 0.088623046875, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.105712890625, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.117919921875, "loss_aux_layer_2": 0.04388427734375, "loss_aux_layer_20": 0.1260986328125, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.05352783203125, "loss_aux_layer_4": 0.05609130859375, "loss_aux_layer_5": 0.0576171875, "loss_aux_layer_6": 0.0606689453125, "loss_aux_layer_7": 0.05859375, "loss_aux_layer_8": 0.057861328125, "loss_aux_layer_9": 0.05682373046875, "step": 3626, "total_loss": 0.6436769515275955 }, { "epoch": 0.7180756285883984, "grad_norm": 0.82407546043396, "learning_rate": 5e-05, "llm_loss": 0.5309719368815422, "loss": 2.4619, "loss_aux_layer_0": 0.013275146484375, "loss_aux_layer_1": 0.032684326171875, "loss_aux_layer_10": 0.06134033203125, "loss_aux_layer_11": 0.0653076171875, "loss_aux_layer_12": 0.06951904296875, "loss_aux_layer_13": 0.0751953125, "loss_aux_layer_14": 0.083984375, "loss_aux_layer_15": 0.092041015625, "loss_aux_layer_16": 0.101806640625, "loss_aux_layer_17": 0.1097412109375, "loss_aux_layer_18": 0.1180419921875, "loss_aux_layer_19": 0.1217041015625, "loss_aux_layer_2": 0.0458984375, "loss_aux_layer_20": 0.12939453125, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.0557861328125, "loss_aux_layer_4": 0.05828857421875, "loss_aux_layer_5": 0.06011962890625, "loss_aux_layer_6": 0.0635986328125, "loss_aux_layer_7": 0.06158447265625, "loss_aux_layer_8": 0.06121826171875, "loss_aux_layer_9": 0.059814453125, "step": 3627, "total_loss": 0.6154786944389343 }, { "epoch": 0.7182736091862998, "grad_norm": 0.9801435470581055, "learning_rate": 5e-05, "llm_loss": 0.5937438160181046, "loss": 2.7038, "loss_aux_layer_0": 0.0138092041015625, "loss_aux_layer_1": 0.03143310546875, "loss_aux_layer_10": 0.05859375, "loss_aux_layer_11": 0.06243896484375, "loss_aux_layer_12": 0.067138671875, "loss_aux_layer_13": 0.072509765625, "loss_aux_layer_14": 0.0809326171875, "loss_aux_layer_15": 0.08935546875, "loss_aux_layer_16": 0.098876953125, "loss_aux_layer_17": 0.107177734375, "loss_aux_layer_18": 0.115478515625, "loss_aux_layer_19": 0.119384765625, "loss_aux_layer_2": 0.04351806640625, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.0531005859375, "loss_aux_layer_4": 0.0557861328125, "loss_aux_layer_5": 0.057373046875, "loss_aux_layer_6": 0.05987548828125, "loss_aux_layer_7": 0.05841064453125, "loss_aux_layer_8": 0.0579833984375, "loss_aux_layer_9": 0.05694580078125, "step": 3628, "total_loss": 0.6759610623121262 }, { "epoch": 0.7184715897842011, "grad_norm": 0.9284526109695435, "learning_rate": 5e-05, "llm_loss": 0.651284471154213, "loss": 2.9489, "loss_aux_layer_0": 0.0140380859375, "loss_aux_layer_1": 0.033203125, "loss_aux_layer_10": 0.063232421875, "loss_aux_layer_11": 0.0672607421875, "loss_aux_layer_12": 0.0716552734375, "loss_aux_layer_13": 0.0775146484375, "loss_aux_layer_14": 0.0859375, "loss_aux_layer_15": 0.0943603515625, "loss_aux_layer_16": 0.1031494140625, "loss_aux_layer_17": 0.11083984375, "loss_aux_layer_18": 0.11865234375, "loss_aux_layer_19": 0.1212158203125, "loss_aux_layer_2": 0.0472412109375, "loss_aux_layer_20": 0.129150390625, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.197021484375, "loss_aux_layer_3": 0.057373046875, "loss_aux_layer_4": 0.0601806640625, "loss_aux_layer_5": 0.06182861328125, "loss_aux_layer_6": 0.06512451171875, "loss_aux_layer_7": 0.0633544921875, "loss_aux_layer_8": 0.062744140625, "loss_aux_layer_9": 0.06158447265625, "step": 3629, "total_loss": 0.7372184991836548 }, { "epoch": 0.7186695703821026, "grad_norm": 0.7959168553352356, "learning_rate": 5e-05, "llm_loss": 0.5863456428050995, "loss": 2.6792, "loss_aux_layer_0": 0.013397216796875, "loss_aux_layer_1": 0.03240966796875, "loss_aux_layer_10": 0.05950927734375, "loss_aux_layer_11": 0.06365966796875, "loss_aux_layer_12": 0.0684814453125, "loss_aux_layer_13": 0.0740966796875, "loss_aux_layer_14": 0.0830078125, "loss_aux_layer_15": 0.091796875, "loss_aux_layer_16": 0.1016845703125, "loss_aux_layer_17": 0.10986328125, "loss_aux_layer_18": 0.1182861328125, "loss_aux_layer_19": 0.12158203125, "loss_aux_layer_2": 0.0443115234375, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.156494140625, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.05413818359375, "loss_aux_layer_4": 0.0565185546875, "loss_aux_layer_5": 0.05792236328125, "loss_aux_layer_6": 0.06121826171875, "loss_aux_layer_7": 0.05938720703125, "loss_aux_layer_8": 0.0592041015625, "loss_aux_layer_9": 0.05816650390625, "step": 3630, "total_loss": 0.6698092669248581 }, { "epoch": 0.718867550980004, "grad_norm": 1.1373393535614014, "learning_rate": 5e-05, "llm_loss": 0.6056731194257736, "loss": 2.7618, "loss_aux_layer_0": 0.0137481689453125, "loss_aux_layer_1": 0.0341796875, "loss_aux_layer_10": 0.06146240234375, "loss_aux_layer_11": 0.06561279296875, "loss_aux_layer_12": 0.0701904296875, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.08447265625, "loss_aux_layer_15": 0.0931396484375, "loss_aux_layer_16": 0.1025390625, "loss_aux_layer_17": 0.10986328125, "loss_aux_layer_18": 0.117431640625, "loss_aux_layer_19": 0.120361328125, "loss_aux_layer_2": 0.0472412109375, "loss_aux_layer_20": 0.1278076171875, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.193115234375, "loss_aux_layer_3": 0.057373046875, "loss_aux_layer_4": 0.05975341796875, "loss_aux_layer_5": 0.06103515625, "loss_aux_layer_6": 0.063720703125, "loss_aux_layer_7": 0.0618896484375, "loss_aux_layer_8": 0.06134033203125, "loss_aux_layer_9": 0.06005859375, "step": 3631, "total_loss": 0.6904487311840057 }, { "epoch": 0.7190655315779053, "grad_norm": 0.9083883762359619, "learning_rate": 5e-05, "llm_loss": 0.600177139043808, "loss": 2.7456, "loss_aux_layer_0": 0.0135498046875, "loss_aux_layer_1": 0.034515380859375, "loss_aux_layer_10": 0.063232421875, "loss_aux_layer_11": 0.0677490234375, "loss_aux_layer_12": 0.072265625, "loss_aux_layer_13": 0.0780029296875, "loss_aux_layer_14": 0.0865478515625, "loss_aux_layer_15": 0.094970703125, "loss_aux_layer_16": 0.1041259765625, "loss_aux_layer_17": 0.1109619140625, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.121826171875, "loss_aux_layer_2": 0.04730224609375, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.05804443359375, "loss_aux_layer_4": 0.06048583984375, "loss_aux_layer_5": 0.0621337890625, "loss_aux_layer_6": 0.06524658203125, "loss_aux_layer_7": 0.06353759765625, "loss_aux_layer_8": 0.06280517578125, "loss_aux_layer_9": 0.0616455078125, "step": 3632, "total_loss": 0.6864113062620163 }, { "epoch": 0.7192635121758068, "grad_norm": 0.996239423751831, "learning_rate": 5e-05, "llm_loss": 0.6558616459369659, "loss": 2.9438, "loss_aux_layer_0": 0.0135345458984375, "loss_aux_layer_1": 0.0321044921875, "loss_aux_layer_10": 0.057373046875, "loss_aux_layer_11": 0.0611572265625, "loss_aux_layer_12": 0.0650634765625, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.0859375, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.102783203125, "loss_aux_layer_18": 0.111083984375, "loss_aux_layer_19": 0.115234375, "loss_aux_layer_2": 0.0445556640625, "loss_aux_layer_20": 0.12353515625, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.05377197265625, "loss_aux_layer_4": 0.05572509765625, "loss_aux_layer_5": 0.05712890625, "loss_aux_layer_6": 0.05963134765625, "loss_aux_layer_7": 0.05780029296875, "loss_aux_layer_8": 0.05718994140625, "loss_aux_layer_9": 0.05596923828125, "step": 3633, "total_loss": 0.7359384298324585 }, { "epoch": 0.7194614927737082, "grad_norm": 1.1563998460769653, "learning_rate": 5e-05, "llm_loss": 0.5859426110982895, "loss": 2.6789, "loss_aux_layer_0": 0.0135040283203125, "loss_aux_layer_1": 0.03338623046875, "loss_aux_layer_10": 0.05950927734375, "loss_aux_layer_11": 0.06353759765625, "loss_aux_layer_12": 0.06787109375, "loss_aux_layer_13": 0.0736083984375, "loss_aux_layer_14": 0.0823974609375, "loss_aux_layer_15": 0.0908203125, "loss_aux_layer_16": 0.1005859375, "loss_aux_layer_17": 0.108642578125, "loss_aux_layer_18": 0.11669921875, "loss_aux_layer_19": 0.12060546875, "loss_aux_layer_2": 0.04681396484375, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.05548095703125, "loss_aux_layer_4": 0.05755615234375, "loss_aux_layer_5": 0.05889892578125, "loss_aux_layer_6": 0.0615234375, "loss_aux_layer_7": 0.05963134765625, "loss_aux_layer_8": 0.05926513671875, "loss_aux_layer_9": 0.05828857421875, "step": 3634, "total_loss": 0.6697267889976501 }, { "epoch": 0.7196594733716096, "grad_norm": 0.7713908553123474, "learning_rate": 5e-05, "llm_loss": 0.641210064291954, "loss": 2.8939, "loss_aux_layer_0": 0.013214111328125, "loss_aux_layer_1": 0.031829833984375, "loss_aux_layer_10": 0.059326171875, "loss_aux_layer_11": 0.063232421875, "loss_aux_layer_12": 0.06793212890625, "loss_aux_layer_13": 0.0738525390625, "loss_aux_layer_14": 0.0826416015625, "loss_aux_layer_15": 0.0906982421875, "loss_aux_layer_16": 0.10009765625, "loss_aux_layer_17": 0.1080322265625, "loss_aux_layer_18": 0.11572265625, "loss_aux_layer_19": 0.1187744140625, "loss_aux_layer_2": 0.04388427734375, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.0535888671875, "loss_aux_layer_4": 0.05609130859375, "loss_aux_layer_5": 0.05767822265625, "loss_aux_layer_6": 0.0609130859375, "loss_aux_layer_7": 0.059326171875, "loss_aux_layer_8": 0.05914306640625, "loss_aux_layer_9": 0.05828857421875, "step": 3635, "total_loss": 0.7234745472669601 }, { "epoch": 0.719857453969511, "grad_norm": 1.1694210767745972, "learning_rate": 5e-05, "llm_loss": 0.5883927196264267, "loss": 2.6991, "loss_aux_layer_0": 0.0130767822265625, "loss_aux_layer_1": 0.03466796875, "loss_aux_layer_10": 0.06414794921875, "loss_aux_layer_11": 0.0684814453125, "loss_aux_layer_12": 0.072998046875, "loss_aux_layer_13": 0.0782470703125, "loss_aux_layer_14": 0.0865478515625, "loss_aux_layer_15": 0.094482421875, "loss_aux_layer_16": 0.103515625, "loss_aux_layer_17": 0.1103515625, "loss_aux_layer_18": 0.1180419921875, "loss_aux_layer_19": 0.12109375, "loss_aux_layer_2": 0.049560546875, "loss_aux_layer_20": 0.128662109375, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.0596923828125, "loss_aux_layer_4": 0.06201171875, "loss_aux_layer_5": 0.0635986328125, "loss_aux_layer_6": 0.0662841796875, "loss_aux_layer_7": 0.0645751953125, "loss_aux_layer_8": 0.06396484375, "loss_aux_layer_9": 0.06292724609375, "step": 3636, "total_loss": 0.6747808307409286 }, { "epoch": 0.7200554345674124, "grad_norm": 0.8353562951087952, "learning_rate": 5e-05, "llm_loss": 0.5675772428512573, "loss": 2.5978, "loss_aux_layer_0": 0.014251708984375, "loss_aux_layer_1": 0.0311279296875, "loss_aux_layer_10": 0.05804443359375, "loss_aux_layer_11": 0.06182861328125, "loss_aux_layer_12": 0.06622314453125, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.0802001953125, "loss_aux_layer_15": 0.0888671875, "loss_aux_layer_16": 0.098876953125, "loss_aux_layer_17": 0.107177734375, "loss_aux_layer_18": 0.1156005859375, "loss_aux_layer_19": 0.1192626953125, "loss_aux_layer_2": 0.043701171875, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.05303955078125, "loss_aux_layer_4": 0.0555419921875, "loss_aux_layer_5": 0.05718994140625, "loss_aux_layer_6": 0.05975341796875, "loss_aux_layer_7": 0.05780029296875, "loss_aux_layer_8": 0.05743408203125, "loss_aux_layer_9": 0.05657958984375, "step": 3637, "total_loss": 0.6494424492120743 }, { "epoch": 0.7202534151653138, "grad_norm": 0.9447192549705505, "learning_rate": 5e-05, "llm_loss": 0.5034733936190605, "loss": 2.3409, "loss_aux_layer_0": 0.01324462890625, "loss_aux_layer_1": 0.0325927734375, "loss_aux_layer_10": 0.05889892578125, "loss_aux_layer_11": 0.0626220703125, "loss_aux_layer_12": 0.0672607421875, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.08837890625, "loss_aux_layer_16": 0.09765625, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.11328125, "loss_aux_layer_19": 0.1165771484375, "loss_aux_layer_2": 0.04595947265625, "loss_aux_layer_20": 0.124267578125, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.05517578125, "loss_aux_layer_4": 0.057373046875, "loss_aux_layer_5": 0.05902099609375, "loss_aux_layer_6": 0.06146240234375, "loss_aux_layer_7": 0.0595703125, "loss_aux_layer_8": 0.0589599609375, "loss_aux_layer_9": 0.0576171875, "step": 3638, "total_loss": 0.5852337777614594 }, { "epoch": 0.7204513957632152, "grad_norm": 0.9100322127342224, "learning_rate": 5e-05, "llm_loss": 0.6211023926734924, "loss": 2.8037, "loss_aux_layer_0": 0.014007568359375, "loss_aux_layer_1": 0.03021240234375, "loss_aux_layer_10": 0.0562744140625, "loss_aux_layer_11": 0.06011962890625, "loss_aux_layer_12": 0.064453125, "loss_aux_layer_13": 0.06982421875, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.09619140625, "loss_aux_layer_17": 0.1044921875, "loss_aux_layer_18": 0.1129150390625, "loss_aux_layer_19": 0.1168212890625, "loss_aux_layer_2": 0.04144287109375, "loss_aux_layer_20": 0.1251220703125, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.05072021484375, "loss_aux_layer_4": 0.05316162109375, "loss_aux_layer_5": 0.0546875, "loss_aux_layer_6": 0.0574951171875, "loss_aux_layer_7": 0.05596923828125, "loss_aux_layer_8": 0.0557861328125, "loss_aux_layer_9": 0.054931640625, "step": 3639, "total_loss": 0.7009339183568954 }, { "epoch": 0.7206493763611166, "grad_norm": 1.0441633462905884, "learning_rate": 5e-05, "llm_loss": 0.590288057923317, "loss": 2.6864, "loss_aux_layer_0": 0.0134429931640625, "loss_aux_layer_1": 0.032623291015625, "loss_aux_layer_10": 0.05828857421875, "loss_aux_layer_11": 0.06219482421875, "loss_aux_layer_12": 0.066650390625, "loss_aux_layer_13": 0.0718994140625, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.0887451171875, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.105712890625, "loss_aux_layer_18": 0.11376953125, "loss_aux_layer_19": 0.11669921875, "loss_aux_layer_2": 0.044921875, "loss_aux_layer_20": 0.12451171875, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.0545654296875, "loss_aux_layer_4": 0.056640625, "loss_aux_layer_5": 0.05767822265625, "loss_aux_layer_6": 0.06036376953125, "loss_aux_layer_7": 0.05877685546875, "loss_aux_layer_8": 0.05816650390625, "loss_aux_layer_9": 0.0570068359375, "step": 3640, "total_loss": 0.6715914160013199 }, { "epoch": 0.720847356959018, "grad_norm": 0.9578725695610046, "learning_rate": 5e-05, "llm_loss": 0.6368050426244736, "loss": 2.8946, "loss_aux_layer_0": 0.0135498046875, "loss_aux_layer_1": 0.035186767578125, "loss_aux_layer_10": 0.06341552734375, "loss_aux_layer_11": 0.06781005859375, "loss_aux_layer_12": 0.07275390625, "loss_aux_layer_13": 0.0784912109375, "loss_aux_layer_14": 0.0869140625, "loss_aux_layer_15": 0.09521484375, "loss_aux_layer_16": 0.1044921875, "loss_aux_layer_17": 0.111328125, "loss_aux_layer_18": 0.119873046875, "loss_aux_layer_19": 0.1220703125, "loss_aux_layer_2": 0.04876708984375, "loss_aux_layer_20": 0.129638671875, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.05914306640625, "loss_aux_layer_4": 0.06158447265625, "loss_aux_layer_5": 0.0631103515625, "loss_aux_layer_6": 0.06622314453125, "loss_aux_layer_7": 0.06427001953125, "loss_aux_layer_8": 0.0634765625, "loss_aux_layer_9": 0.06207275390625, "step": 3641, "total_loss": 0.7236470878124237 }, { "epoch": 0.7210453375569195, "grad_norm": 0.8154632449150085, "learning_rate": 5e-05, "llm_loss": 0.5780042335391045, "loss": 2.6493, "loss_aux_layer_0": 0.015228271484375, "loss_aux_layer_1": 0.03363037109375, "loss_aux_layer_10": 0.0601806640625, "loss_aux_layer_11": 0.06427001953125, "loss_aux_layer_12": 0.068603515625, "loss_aux_layer_13": 0.073974609375, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.0904541015625, "loss_aux_layer_16": 0.0999755859375, "loss_aux_layer_17": 0.107421875, "loss_aux_layer_18": 0.11572265625, "loss_aux_layer_19": 0.119873046875, "loss_aux_layer_2": 0.046875, "loss_aux_layer_20": 0.12841796875, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.161376953125, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.056396484375, "loss_aux_layer_4": 0.05853271484375, "loss_aux_layer_5": 0.06011962890625, "loss_aux_layer_6": 0.0628662109375, "loss_aux_layer_7": 0.060791015625, "loss_aux_layer_8": 0.06011962890625, "loss_aux_layer_9": 0.05889892578125, "step": 3642, "total_loss": 0.6623230129480362 }, { "epoch": 0.7212433181548208, "grad_norm": 0.8963184356689453, "learning_rate": 5e-05, "llm_loss": 0.6297834515571594, "loss": 2.842, "loss_aux_layer_0": 0.0134124755859375, "loss_aux_layer_1": 0.031097412109375, "loss_aux_layer_10": 0.0577392578125, "loss_aux_layer_11": 0.061279296875, "loss_aux_layer_12": 0.06536865234375, "loss_aux_layer_13": 0.0704345703125, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.0869140625, "loss_aux_layer_16": 0.0960693359375, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.11279296875, "loss_aux_layer_19": 0.11669921875, "loss_aux_layer_2": 0.043212890625, "loss_aux_layer_20": 0.1248779296875, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.0528564453125, "loss_aux_layer_4": 0.0550537109375, "loss_aux_layer_5": 0.05657958984375, "loss_aux_layer_6": 0.0595703125, "loss_aux_layer_7": 0.0574951171875, "loss_aux_layer_8": 0.05706787109375, "loss_aux_layer_9": 0.05621337890625, "step": 3643, "total_loss": 0.710489422082901 }, { "epoch": 0.7214412987527222, "grad_norm": 0.8123783469200134, "learning_rate": 5e-05, "llm_loss": 0.5469244420528412, "loss": 2.5146, "loss_aux_layer_0": 0.0135040283203125, "loss_aux_layer_1": 0.0321044921875, "loss_aux_layer_10": 0.05877685546875, "loss_aux_layer_11": 0.062744140625, "loss_aux_layer_12": 0.06695556640625, "loss_aux_layer_13": 0.0723876953125, "loss_aux_layer_14": 0.08056640625, "loss_aux_layer_15": 0.0889892578125, "loss_aux_layer_16": 0.0985107421875, "loss_aux_layer_17": 0.1063232421875, "loss_aux_layer_18": 0.1142578125, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.043701171875, "loss_aux_layer_20": 0.1256103515625, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.0533447265625, "loss_aux_layer_4": 0.05572509765625, "loss_aux_layer_5": 0.057373046875, "loss_aux_layer_6": 0.0606689453125, "loss_aux_layer_7": 0.05902099609375, "loss_aux_layer_8": 0.05865478515625, "loss_aux_layer_9": 0.05755615234375, "step": 3644, "total_loss": 0.6286542564630508 }, { "epoch": 0.7216392793506237, "grad_norm": 0.9190279841423035, "learning_rate": 5e-05, "llm_loss": 0.6563126593828201, "loss": 2.9603, "loss_aux_layer_0": 0.01416015625, "loss_aux_layer_1": 0.033599853515625, "loss_aux_layer_10": 0.0599365234375, "loss_aux_layer_11": 0.06396484375, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.0745849609375, "loss_aux_layer_14": 0.083251953125, "loss_aux_layer_15": 0.091796875, "loss_aux_layer_16": 0.1014404296875, "loss_aux_layer_17": 0.108642578125, "loss_aux_layer_18": 0.1168212890625, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.0465087890625, "loss_aux_layer_20": 0.1279296875, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.056396484375, "loss_aux_layer_4": 0.05859375, "loss_aux_layer_5": 0.06005859375, "loss_aux_layer_6": 0.0631103515625, "loss_aux_layer_7": 0.06085205078125, "loss_aux_layer_8": 0.06024169921875, "loss_aux_layer_9": 0.0587158203125, "step": 3645, "total_loss": 0.7400757968425751 }, { "epoch": 0.721837259948525, "grad_norm": 0.8963109254837036, "learning_rate": 5e-05, "llm_loss": 0.5859223157167435, "loss": 2.6726, "loss_aux_layer_0": 0.013671875, "loss_aux_layer_1": 0.033111572265625, "loss_aux_layer_10": 0.059814453125, "loss_aux_layer_11": 0.06365966796875, "loss_aux_layer_12": 0.068115234375, "loss_aux_layer_13": 0.0738525390625, "loss_aux_layer_14": 0.081787109375, "loss_aux_layer_15": 0.090087890625, "loss_aux_layer_16": 0.0989990234375, "loss_aux_layer_17": 0.1068115234375, "loss_aux_layer_18": 0.1148681640625, "loss_aux_layer_19": 0.117431640625, "loss_aux_layer_2": 0.04541015625, "loss_aux_layer_20": 0.125, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.05523681640625, "loss_aux_layer_4": 0.0577392578125, "loss_aux_layer_5": 0.0592041015625, "loss_aux_layer_6": 0.06195068359375, "loss_aux_layer_7": 0.06024169921875, "loss_aux_layer_8": 0.05963134765625, "loss_aux_layer_9": 0.05853271484375, "step": 3646, "total_loss": 0.6681577265262604 }, { "epoch": 0.7220352405464264, "grad_norm": 1.0084141492843628, "learning_rate": 5e-05, "llm_loss": 0.5473416969180107, "loss": 2.5291, "loss_aux_layer_0": 0.014190673828125, "loss_aux_layer_1": 0.03277587890625, "loss_aux_layer_10": 0.060302734375, "loss_aux_layer_11": 0.06414794921875, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.0738525390625, "loss_aux_layer_14": 0.0828857421875, "loss_aux_layer_15": 0.0921630859375, "loss_aux_layer_16": 0.102294921875, "loss_aux_layer_17": 0.110595703125, "loss_aux_layer_18": 0.11962890625, "loss_aux_layer_19": 0.1239013671875, "loss_aux_layer_2": 0.0458984375, "loss_aux_layer_20": 0.13232421875, "loss_aux_layer_21": 0.140380859375, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.19970703125, "loss_aux_layer_3": 0.0550537109375, "loss_aux_layer_4": 0.0574951171875, "loss_aux_layer_5": 0.0592041015625, "loss_aux_layer_6": 0.06219482421875, "loss_aux_layer_7": 0.06005859375, "loss_aux_layer_8": 0.05975341796875, "loss_aux_layer_9": 0.058837890625, "step": 3647, "total_loss": 0.632269412279129 }, { "epoch": 0.7222332211443279, "grad_norm": 0.964253842830658, "learning_rate": 5e-05, "llm_loss": 0.6079027056694031, "loss": 2.7495, "loss_aux_layer_0": 0.0138092041015625, "loss_aux_layer_1": 0.030609130859375, "loss_aux_layer_10": 0.05609130859375, "loss_aux_layer_11": 0.05987548828125, "loss_aux_layer_12": 0.06451416015625, "loss_aux_layer_13": 0.0697021484375, "loss_aux_layer_14": 0.078125, "loss_aux_layer_15": 0.086181640625, "loss_aux_layer_16": 0.0953369140625, "loss_aux_layer_17": 0.103515625, "loss_aux_layer_18": 0.1119384765625, "loss_aux_layer_19": 0.115234375, "loss_aux_layer_2": 0.0423583984375, "loss_aux_layer_20": 0.123046875, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.05181884765625, "loss_aux_layer_4": 0.05413818359375, "loss_aux_layer_5": 0.05548095703125, "loss_aux_layer_6": 0.058349609375, "loss_aux_layer_7": 0.056396484375, "loss_aux_layer_8": 0.05584716796875, "loss_aux_layer_9": 0.05499267578125, "step": 3648, "total_loss": 0.6873679310083389 }, { "epoch": 0.7224312017422293, "grad_norm": 0.8862912058830261, "learning_rate": 5e-05, "llm_loss": 0.5651203393936157, "loss": 2.5988, "loss_aux_layer_0": 0.0139923095703125, "loss_aux_layer_1": 0.0335693359375, "loss_aux_layer_10": 0.060791015625, "loss_aux_layer_11": 0.0648193359375, "loss_aux_layer_12": 0.0693359375, "loss_aux_layer_13": 0.07470703125, "loss_aux_layer_14": 0.0830078125, "loss_aux_layer_15": 0.09130859375, "loss_aux_layer_16": 0.100830078125, "loss_aux_layer_17": 0.10888671875, "loss_aux_layer_18": 0.1173095703125, "loss_aux_layer_19": 0.12158203125, "loss_aux_layer_2": 0.0469970703125, "loss_aux_layer_20": 0.1295166015625, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.0567626953125, "loss_aux_layer_4": 0.05902099609375, "loss_aux_layer_5": 0.0606689453125, "loss_aux_layer_6": 0.06365966796875, "loss_aux_layer_7": 0.06134033203125, "loss_aux_layer_8": 0.06085205078125, "loss_aux_layer_9": 0.05975341796875, "step": 3649, "total_loss": 0.6497082412242889 }, { "epoch": 0.7226291823401306, "grad_norm": 0.7812208533287048, "learning_rate": 5e-05, "llm_loss": 0.6187891662120819, "loss": 2.8071, "loss_aux_layer_0": 0.0134124755859375, "loss_aux_layer_1": 0.032958984375, "loss_aux_layer_10": 0.060302734375, "loss_aux_layer_11": 0.06439208984375, "loss_aux_layer_12": 0.0687255859375, "loss_aux_layer_13": 0.073974609375, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.09033203125, "loss_aux_layer_16": 0.09912109375, "loss_aux_layer_17": 0.1068115234375, "loss_aux_layer_18": 0.1148681640625, "loss_aux_layer_19": 0.1180419921875, "loss_aux_layer_2": 0.0457763671875, "loss_aux_layer_20": 0.1258544921875, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05548095703125, "loss_aux_layer_4": 0.05792236328125, "loss_aux_layer_5": 0.05950927734375, "loss_aux_layer_6": 0.06231689453125, "loss_aux_layer_7": 0.060302734375, "loss_aux_layer_8": 0.06005859375, "loss_aux_layer_9": 0.05889892578125, "step": 3650, "total_loss": 0.7017748802900314 }, { "epoch": 0.7228271629380321, "grad_norm": 0.8161277770996094, "learning_rate": 5e-05, "llm_loss": 0.5838768035173416, "loss": 2.6598, "loss_aux_layer_0": 0.0141754150390625, "loss_aux_layer_1": 0.031768798828125, "loss_aux_layer_10": 0.05743408203125, "loss_aux_layer_11": 0.0611572265625, "loss_aux_layer_12": 0.0655517578125, "loss_aux_layer_13": 0.0709228515625, "loss_aux_layer_14": 0.0792236328125, "loss_aux_layer_15": 0.0882568359375, "loss_aux_layer_16": 0.09765625, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.1142578125, "loss_aux_layer_19": 0.117919921875, "loss_aux_layer_2": 0.04449462890625, "loss_aux_layer_20": 0.125732421875, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.053466796875, "loss_aux_layer_4": 0.05572509765625, "loss_aux_layer_5": 0.05706787109375, "loss_aux_layer_6": 0.059814453125, "loss_aux_layer_7": 0.057861328125, "loss_aux_layer_8": 0.0572509765625, "loss_aux_layer_9": 0.05609130859375, "step": 3651, "total_loss": 0.6649442911148071 }, { "epoch": 0.7230251435359335, "grad_norm": 0.9760545492172241, "learning_rate": 5e-05, "llm_loss": 0.6013965308666229, "loss": 2.7424, "loss_aux_layer_0": 0.013519287109375, "loss_aux_layer_1": 0.033935546875, "loss_aux_layer_10": 0.060302734375, "loss_aux_layer_11": 0.06451416015625, "loss_aux_layer_12": 0.0692138671875, "loss_aux_layer_13": 0.0745849609375, "loss_aux_layer_14": 0.0828857421875, "loss_aux_layer_15": 0.0908203125, "loss_aux_layer_16": 0.10009765625, "loss_aux_layer_17": 0.108154296875, "loss_aux_layer_18": 0.1163330078125, "loss_aux_layer_19": 0.119873046875, "loss_aux_layer_2": 0.046630859375, "loss_aux_layer_20": 0.127685546875, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.056396484375, "loss_aux_layer_4": 0.05902099609375, "loss_aux_layer_5": 0.06048583984375, "loss_aux_layer_6": 0.06317138671875, "loss_aux_layer_7": 0.061279296875, "loss_aux_layer_8": 0.06072998046875, "loss_aux_layer_9": 0.05914306640625, "step": 3652, "total_loss": 0.685599610209465 }, { "epoch": 0.7232231241338349, "grad_norm": 1.0172432661056519, "learning_rate": 5e-05, "llm_loss": 0.5339780151844025, "loss": 2.4745, "loss_aux_layer_0": 0.0131988525390625, "loss_aux_layer_1": 0.033172607421875, "loss_aux_layer_10": 0.06103515625, "loss_aux_layer_11": 0.06512451171875, "loss_aux_layer_12": 0.069580078125, "loss_aux_layer_13": 0.0751953125, "loss_aux_layer_14": 0.084228515625, "loss_aux_layer_15": 0.0927734375, "loss_aux_layer_16": 0.101806640625, "loss_aux_layer_17": 0.1094970703125, "loss_aux_layer_18": 0.11767578125, "loss_aux_layer_19": 0.120849609375, "loss_aux_layer_2": 0.04608154296875, "loss_aux_layer_20": 0.128662109375, "loss_aux_layer_21": 0.136962890625, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.197021484375, "loss_aux_layer_3": 0.0560302734375, "loss_aux_layer_4": 0.05859375, "loss_aux_layer_5": 0.06036376953125, "loss_aux_layer_6": 0.06329345703125, "loss_aux_layer_7": 0.06121826171875, "loss_aux_layer_8": 0.0606689453125, "loss_aux_layer_9": 0.0595703125, "step": 3653, "total_loss": 0.6186128780245781 }, { "epoch": 0.7234211047317363, "grad_norm": 1.2705963850021362, "learning_rate": 5e-05, "llm_loss": 0.6449950188398361, "loss": 2.9078, "loss_aux_layer_0": 0.013519287109375, "loss_aux_layer_1": 0.032623291015625, "loss_aux_layer_10": 0.0584716796875, "loss_aux_layer_11": 0.06231689453125, "loss_aux_layer_12": 0.066650390625, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.080322265625, "loss_aux_layer_15": 0.088623046875, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.105712890625, "loss_aux_layer_18": 0.114013671875, "loss_aux_layer_19": 0.1181640625, "loss_aux_layer_2": 0.0450439453125, "loss_aux_layer_20": 0.1256103515625, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.05462646484375, "loss_aux_layer_4": 0.05670166015625, "loss_aux_layer_5": 0.05816650390625, "loss_aux_layer_6": 0.0609130859375, "loss_aux_layer_7": 0.0589599609375, "loss_aux_layer_8": 0.0582275390625, "loss_aux_layer_9": 0.05706787109375, "step": 3654, "total_loss": 0.7269624769687653 }, { "epoch": 0.7236190853296377, "grad_norm": 1.0554839372634888, "learning_rate": 5e-05, "llm_loss": 0.5823663473129272, "loss": 2.6784, "loss_aux_layer_0": 0.0137481689453125, "loss_aux_layer_1": 0.03460693359375, "loss_aux_layer_10": 0.063232421875, "loss_aux_layer_11": 0.06787109375, "loss_aux_layer_12": 0.072265625, "loss_aux_layer_13": 0.078125, "loss_aux_layer_14": 0.0867919921875, "loss_aux_layer_15": 0.0955810546875, "loss_aux_layer_16": 0.105224609375, "loss_aux_layer_17": 0.1134033203125, "loss_aux_layer_18": 0.12158203125, "loss_aux_layer_19": 0.1240234375, "loss_aux_layer_2": 0.0477294921875, "loss_aux_layer_20": 0.132080078125, "loss_aux_layer_21": 0.139892578125, "loss_aux_layer_22": 0.161865234375, "loss_aux_layer_23": 0.19921875, "loss_aux_layer_3": 0.05816650390625, "loss_aux_layer_4": 0.0606689453125, "loss_aux_layer_5": 0.0626220703125, "loss_aux_layer_6": 0.0657958984375, "loss_aux_layer_7": 0.0635986328125, "loss_aux_layer_8": 0.06317138671875, "loss_aux_layer_9": 0.06170654296875, "step": 3655, "total_loss": 0.6695952713489532 }, { "epoch": 0.7238170659275391, "grad_norm": 1.135366678237915, "learning_rate": 5e-05, "llm_loss": 0.6195454746484756, "loss": 2.7956, "loss_aux_layer_0": 0.0128326416015625, "loss_aux_layer_1": 0.030792236328125, "loss_aux_layer_10": 0.0560302734375, "loss_aux_layer_11": 0.05975341796875, "loss_aux_layer_12": 0.06427001953125, "loss_aux_layer_13": 0.06982421875, "loss_aux_layer_14": 0.078125, "loss_aux_layer_15": 0.086669921875, "loss_aux_layer_16": 0.0955810546875, "loss_aux_layer_17": 0.1031494140625, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.114990234375, "loss_aux_layer_2": 0.0426025390625, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.0521240234375, "loss_aux_layer_4": 0.0540771484375, "loss_aux_layer_5": 0.05560302734375, "loss_aux_layer_6": 0.05810546875, "loss_aux_layer_7": 0.05633544921875, "loss_aux_layer_8": 0.0556640625, "loss_aux_layer_9": 0.0548095703125, "step": 3656, "total_loss": 0.698896199464798 }, { "epoch": 0.7240150465254405, "grad_norm": 0.7689371705055237, "learning_rate": 5e-05, "llm_loss": 0.5931766927242279, "loss": 2.6987, "loss_aux_layer_0": 0.01348876953125, "loss_aux_layer_1": 0.031280517578125, "loss_aux_layer_10": 0.0587158203125, "loss_aux_layer_11": 0.0626220703125, "loss_aux_layer_12": 0.06695556640625, "loss_aux_layer_13": 0.0723876953125, "loss_aux_layer_14": 0.0809326171875, "loss_aux_layer_15": 0.0894775390625, "loss_aux_layer_16": 0.0989990234375, "loss_aux_layer_17": 0.107177734375, "loss_aux_layer_18": 0.114990234375, "loss_aux_layer_19": 0.11865234375, "loss_aux_layer_2": 0.04345703125, "loss_aux_layer_20": 0.1260986328125, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.05291748046875, "loss_aux_layer_4": 0.0552978515625, "loss_aux_layer_5": 0.0570068359375, "loss_aux_layer_6": 0.0601806640625, "loss_aux_layer_7": 0.058349609375, "loss_aux_layer_8": 0.05810546875, "loss_aux_layer_9": 0.05712890625, "step": 3657, "total_loss": 0.6746680289506912 }, { "epoch": 0.7242130271233419, "grad_norm": 1.1362072229385376, "learning_rate": 5e-05, "llm_loss": 0.5907420068979263, "loss": 2.6929, "loss_aux_layer_0": 0.0134124755859375, "loss_aux_layer_1": 0.0322265625, "loss_aux_layer_10": 0.0596923828125, "loss_aux_layer_11": 0.06353759765625, "loss_aux_layer_12": 0.06787109375, "loss_aux_layer_13": 0.0736083984375, "loss_aux_layer_14": 0.08203125, "loss_aux_layer_15": 0.0904541015625, "loss_aux_layer_16": 0.099853515625, "loss_aux_layer_17": 0.1075439453125, "loss_aux_layer_18": 0.115478515625, "loss_aux_layer_19": 0.1181640625, "loss_aux_layer_2": 0.04486083984375, "loss_aux_layer_20": 0.1258544921875, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.054443359375, "loss_aux_layer_4": 0.05670166015625, "loss_aux_layer_5": 0.0584716796875, "loss_aux_layer_6": 0.06134033203125, "loss_aux_layer_7": 0.05938720703125, "loss_aux_layer_8": 0.05908203125, "loss_aux_layer_9": 0.05816650390625, "step": 3658, "total_loss": 0.6732333898544312 }, { "epoch": 0.7244110077212433, "grad_norm": 0.7729761600494385, "learning_rate": 5e-05, "llm_loss": 0.5520848855376244, "loss": 2.5508, "loss_aux_layer_0": 0.0133209228515625, "loss_aux_layer_1": 0.03436279296875, "loss_aux_layer_10": 0.06231689453125, "loss_aux_layer_11": 0.06622314453125, "loss_aux_layer_12": 0.070556640625, "loss_aux_layer_13": 0.0760498046875, "loss_aux_layer_14": 0.0848388671875, "loss_aux_layer_15": 0.0933837890625, "loss_aux_layer_16": 0.1029052734375, "loss_aux_layer_17": 0.1109619140625, "loss_aux_layer_18": 0.119140625, "loss_aux_layer_19": 0.12158203125, "loss_aux_layer_2": 0.047607421875, "loss_aux_layer_20": 0.12939453125, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.05767822265625, "loss_aux_layer_4": 0.06036376953125, "loss_aux_layer_5": 0.0621337890625, "loss_aux_layer_6": 0.0650634765625, "loss_aux_layer_7": 0.06298828125, "loss_aux_layer_8": 0.06243896484375, "loss_aux_layer_9": 0.0609130859375, "step": 3659, "total_loss": 0.6376958191394806 }, { "epoch": 0.7246089883191448, "grad_norm": 0.8810272812843323, "learning_rate": 5e-05, "llm_loss": 0.59762904047966, "loss": 2.7447, "loss_aux_layer_0": 0.013519287109375, "loss_aux_layer_1": 0.0362548828125, "loss_aux_layer_10": 0.065673828125, "loss_aux_layer_11": 0.06982421875, "loss_aux_layer_12": 0.074462890625, "loss_aux_layer_13": 0.0802001953125, "loss_aux_layer_14": 0.088623046875, "loss_aux_layer_15": 0.0972900390625, "loss_aux_layer_16": 0.106689453125, "loss_aux_layer_17": 0.1146240234375, "loss_aux_layer_18": 0.1220703125, "loss_aux_layer_19": 0.1234130859375, "loss_aux_layer_2": 0.05047607421875, "loss_aux_layer_20": 0.131103515625, "loss_aux_layer_21": 0.138916015625, "loss_aux_layer_22": 0.160400390625, "loss_aux_layer_23": 0.197265625, "loss_aux_layer_3": 0.0609130859375, "loss_aux_layer_4": 0.06341552734375, "loss_aux_layer_5": 0.06512451171875, "loss_aux_layer_6": 0.0682373046875, "loss_aux_layer_7": 0.06640625, "loss_aux_layer_8": 0.0657958984375, "loss_aux_layer_9": 0.0643310546875, "step": 3660, "total_loss": 0.6861866116523743 }, { "epoch": 0.7248069689170461, "grad_norm": 0.8036233186721802, "learning_rate": 5e-05, "llm_loss": 0.5838113278150558, "loss": 2.6814, "loss_aux_layer_0": 0.0130157470703125, "loss_aux_layer_1": 0.0347900390625, "loss_aux_layer_10": 0.0631103515625, "loss_aux_layer_11": 0.0672607421875, "loss_aux_layer_12": 0.071533203125, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.08544921875, "loss_aux_layer_15": 0.09423828125, "loss_aux_layer_16": 0.103515625, "loss_aux_layer_17": 0.111083984375, "loss_aux_layer_18": 0.1199951171875, "loss_aux_layer_19": 0.123779296875, "loss_aux_layer_2": 0.04827880859375, "loss_aux_layer_20": 0.131103515625, "loss_aux_layer_21": 0.138916015625, "loss_aux_layer_22": 0.16015625, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.05859375, "loss_aux_layer_4": 0.06097412109375, "loss_aux_layer_5": 0.06243896484375, "loss_aux_layer_6": 0.06549072265625, "loss_aux_layer_7": 0.0634765625, "loss_aux_layer_8": 0.06280517578125, "loss_aux_layer_9": 0.0616455078125, "step": 3661, "total_loss": 0.6703420281410217 }, { "epoch": 0.7250049495149475, "grad_norm": 0.7761619687080383, "learning_rate": 5e-05, "llm_loss": 0.52497598528862, "loss": 2.4342, "loss_aux_layer_0": 0.0133209228515625, "loss_aux_layer_1": 0.0322265625, "loss_aux_layer_10": 0.0601806640625, "loss_aux_layer_11": 0.064208984375, "loss_aux_layer_12": 0.0682373046875, "loss_aux_layer_13": 0.0736083984375, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.0902099609375, "loss_aux_layer_16": 0.099609375, "loss_aux_layer_17": 0.1070556640625, "loss_aux_layer_18": 0.1158447265625, "loss_aux_layer_19": 0.1195068359375, "loss_aux_layer_2": 0.04541015625, "loss_aux_layer_20": 0.127685546875, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.05560302734375, "loss_aux_layer_4": 0.057861328125, "loss_aux_layer_5": 0.0596923828125, "loss_aux_layer_6": 0.06256103515625, "loss_aux_layer_7": 0.0606689453125, "loss_aux_layer_8": 0.060302734375, "loss_aux_layer_9": 0.05926513671875, "step": 3662, "total_loss": 0.608557291328907 }, { "epoch": 0.725202930112849, "grad_norm": 0.8452122211456299, "learning_rate": 5e-05, "llm_loss": 0.5477754697203636, "loss": 2.5137, "loss_aux_layer_0": 0.013092041015625, "loss_aux_layer_1": 0.031982421875, "loss_aux_layer_10": 0.0577392578125, "loss_aux_layer_11": 0.0614013671875, "loss_aux_layer_12": 0.065673828125, "loss_aux_layer_13": 0.0709228515625, "loss_aux_layer_14": 0.0791015625, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.09619140625, "loss_aux_layer_17": 0.1043701171875, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.1168212890625, "loss_aux_layer_2": 0.0435791015625, "loss_aux_layer_20": 0.124755859375, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.05316162109375, "loss_aux_layer_4": 0.0556640625, "loss_aux_layer_5": 0.05743408203125, "loss_aux_layer_6": 0.0601806640625, "loss_aux_layer_7": 0.05828857421875, "loss_aux_layer_8": 0.0576171875, "loss_aux_layer_9": 0.05645751953125, "step": 3663, "total_loss": 0.628414660692215 }, { "epoch": 0.7254009107107503, "grad_norm": 0.76164710521698, "learning_rate": 5e-05, "llm_loss": 0.6487973034381866, "loss": 2.9281, "loss_aux_layer_0": 0.0127716064453125, "loss_aux_layer_1": 0.032958984375, "loss_aux_layer_10": 0.0595703125, "loss_aux_layer_11": 0.06365966796875, "loss_aux_layer_12": 0.0679931640625, "loss_aux_layer_13": 0.07373046875, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.090087890625, "loss_aux_layer_16": 0.099365234375, "loss_aux_layer_17": 0.107421875, "loss_aux_layer_18": 0.115966796875, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.04473876953125, "loss_aux_layer_20": 0.128173828125, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.05499267578125, "loss_aux_layer_4": 0.0574951171875, "loss_aux_layer_5": 0.0589599609375, "loss_aux_layer_6": 0.06207275390625, "loss_aux_layer_7": 0.06011962890625, "loss_aux_layer_8": 0.0595703125, "loss_aux_layer_9": 0.058349609375, "step": 3664, "total_loss": 0.7320335358381271 }, { "epoch": 0.7255988913086517, "grad_norm": 0.9249830842018127, "learning_rate": 5e-05, "llm_loss": 0.5355082601308823, "loss": 2.4881, "loss_aux_layer_0": 0.013031005859375, "loss_aux_layer_1": 0.03436279296875, "loss_aux_layer_10": 0.06390380859375, "loss_aux_layer_11": 0.06854248046875, "loss_aux_layer_12": 0.0732421875, "loss_aux_layer_13": 0.07861328125, "loss_aux_layer_14": 0.0870361328125, "loss_aux_layer_15": 0.0953369140625, "loss_aux_layer_16": 0.1043701171875, "loss_aux_layer_17": 0.1121826171875, "loss_aux_layer_18": 0.119873046875, "loss_aux_layer_19": 0.1214599609375, "loss_aux_layer_2": 0.04742431640625, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.05816650390625, "loss_aux_layer_4": 0.0609130859375, "loss_aux_layer_5": 0.06268310546875, "loss_aux_layer_6": 0.066162109375, "loss_aux_layer_7": 0.06414794921875, "loss_aux_layer_8": 0.0635986328125, "loss_aux_layer_9": 0.0625, "step": 3665, "total_loss": 0.622013658285141 }, { "epoch": 0.7257968719065532, "grad_norm": 0.8011220693588257, "learning_rate": 5e-05, "llm_loss": 0.5482541397213936, "loss": 2.5227, "loss_aux_layer_0": 0.013214111328125, "loss_aux_layer_1": 0.033111572265625, "loss_aux_layer_10": 0.05963134765625, "loss_aux_layer_11": 0.0638427734375, "loss_aux_layer_12": 0.068115234375, "loss_aux_layer_13": 0.073486328125, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.09033203125, "loss_aux_layer_16": 0.0992431640625, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.114501953125, "loss_aux_layer_19": 0.1175537109375, "loss_aux_layer_2": 0.0452880859375, "loss_aux_layer_20": 0.124755859375, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.0548095703125, "loss_aux_layer_4": 0.05718994140625, "loss_aux_layer_5": 0.0589599609375, "loss_aux_layer_6": 0.06207275390625, "loss_aux_layer_7": 0.06005859375, "loss_aux_layer_8": 0.0595703125, "loss_aux_layer_9": 0.05841064453125, "step": 3666, "total_loss": 0.6306818127632141 }, { "epoch": 0.7259948525044546, "grad_norm": 0.8661584854125977, "learning_rate": 5e-05, "llm_loss": 0.5439054816961288, "loss": 2.5033, "loss_aux_layer_0": 0.0133209228515625, "loss_aux_layer_1": 0.031890869140625, "loss_aux_layer_10": 0.05841064453125, "loss_aux_layer_11": 0.0623779296875, "loss_aux_layer_12": 0.066650390625, "loss_aux_layer_13": 0.072021484375, "loss_aux_layer_14": 0.08056640625, "loss_aux_layer_15": 0.08935546875, "loss_aux_layer_16": 0.098876953125, "loss_aux_layer_17": 0.106201171875, "loss_aux_layer_18": 0.1143798828125, "loss_aux_layer_19": 0.1180419921875, "loss_aux_layer_2": 0.04437255859375, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.0540771484375, "loss_aux_layer_4": 0.05645751953125, "loss_aux_layer_5": 0.0579833984375, "loss_aux_layer_6": 0.06103515625, "loss_aux_layer_7": 0.05877685546875, "loss_aux_layer_8": 0.058349609375, "loss_aux_layer_9": 0.05712890625, "step": 3667, "total_loss": 0.6258373707532883 }, { "epoch": 0.7261928331023559, "grad_norm": 0.8613032102584839, "learning_rate": 5e-05, "llm_loss": 0.6290878355503082, "loss": 2.8486, "loss_aux_layer_0": 0.0133056640625, "loss_aux_layer_1": 0.03375244140625, "loss_aux_layer_10": 0.0601806640625, "loss_aux_layer_11": 0.0643310546875, "loss_aux_layer_12": 0.0682373046875, "loss_aux_layer_13": 0.073486328125, "loss_aux_layer_14": 0.0814208984375, "loss_aux_layer_15": 0.0892333984375, "loss_aux_layer_16": 0.0985107421875, "loss_aux_layer_17": 0.106201171875, "loss_aux_layer_18": 0.1146240234375, "loss_aux_layer_19": 0.118408203125, "loss_aux_layer_2": 0.0457763671875, "loss_aux_layer_20": 0.1268310546875, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05615234375, "loss_aux_layer_4": 0.05859375, "loss_aux_layer_5": 0.06005859375, "loss_aux_layer_6": 0.06304931640625, "loss_aux_layer_7": 0.0611572265625, "loss_aux_layer_8": 0.06048583984375, "loss_aux_layer_9": 0.05908203125, "step": 3668, "total_loss": 0.712144747376442 }, { "epoch": 0.7263908137002574, "grad_norm": 0.7557782530784607, "learning_rate": 5e-05, "llm_loss": 0.5464852750301361, "loss": 2.5224, "loss_aux_layer_0": 0.0128936767578125, "loss_aux_layer_1": 0.032379150390625, "loss_aux_layer_10": 0.06097412109375, "loss_aux_layer_11": 0.06494140625, "loss_aux_layer_12": 0.069580078125, "loss_aux_layer_13": 0.074951171875, "loss_aux_layer_14": 0.0831298828125, "loss_aux_layer_15": 0.091552734375, "loss_aux_layer_16": 0.1007080078125, "loss_aux_layer_17": 0.108154296875, "loss_aux_layer_18": 0.116943359375, "loss_aux_layer_19": 0.120361328125, "loss_aux_layer_2": 0.0460205078125, "loss_aux_layer_20": 0.127685546875, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.05596923828125, "loss_aux_layer_4": 0.05841064453125, "loss_aux_layer_5": 0.05987548828125, "loss_aux_layer_6": 0.06298828125, "loss_aux_layer_7": 0.06121826171875, "loss_aux_layer_8": 0.060791015625, "loss_aux_layer_9": 0.05975341796875, "step": 3669, "total_loss": 0.630591943860054 }, { "epoch": 0.7265887942981588, "grad_norm": 0.9077004790306091, "learning_rate": 5e-05, "llm_loss": 0.5793289244174957, "loss": 2.6433, "loss_aux_layer_0": 0.0134735107421875, "loss_aux_layer_1": 0.0316162109375, "loss_aux_layer_10": 0.05816650390625, "loss_aux_layer_11": 0.0623779296875, "loss_aux_layer_12": 0.0665283203125, "loss_aux_layer_13": 0.0718994140625, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.08837890625, "loss_aux_layer_16": 0.09765625, "loss_aux_layer_17": 0.105712890625, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.1171875, "loss_aux_layer_2": 0.04388427734375, "loss_aux_layer_20": 0.1256103515625, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.0533447265625, "loss_aux_layer_4": 0.0556640625, "loss_aux_layer_5": 0.0570068359375, "loss_aux_layer_6": 0.05999755859375, "loss_aux_layer_7": 0.057861328125, "loss_aux_layer_8": 0.05767822265625, "loss_aux_layer_9": 0.05657958984375, "step": 3670, "total_loss": 0.660818487405777 }, { "epoch": 0.7267867748960601, "grad_norm": 0.9221897721290588, "learning_rate": 5e-05, "llm_loss": 0.5694190189242363, "loss": 2.6084, "loss_aux_layer_0": 0.0133514404296875, "loss_aux_layer_1": 0.033416748046875, "loss_aux_layer_10": 0.0595703125, "loss_aux_layer_11": 0.0634765625, "loss_aux_layer_12": 0.068115234375, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.091064453125, "loss_aux_layer_16": 0.100341796875, "loss_aux_layer_17": 0.107666015625, "loss_aux_layer_18": 0.115966796875, "loss_aux_layer_19": 0.11865234375, "loss_aux_layer_2": 0.04638671875, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.05548095703125, "loss_aux_layer_4": 0.05755615234375, "loss_aux_layer_5": 0.05877685546875, "loss_aux_layer_6": 0.061767578125, "loss_aux_layer_7": 0.0596923828125, "loss_aux_layer_8": 0.0592041015625, "loss_aux_layer_9": 0.05792236328125, "step": 3671, "total_loss": 0.6520981192588806 }, { "epoch": 0.7269847554939616, "grad_norm": 0.8467544913291931, "learning_rate": 5e-05, "llm_loss": 0.500053696334362, "loss": 2.3175, "loss_aux_layer_0": 0.0128021240234375, "loss_aux_layer_1": 0.030059814453125, "loss_aux_layer_10": 0.0557861328125, "loss_aux_layer_11": 0.05950927734375, "loss_aux_layer_12": 0.06353759765625, "loss_aux_layer_13": 0.06884765625, "loss_aux_layer_14": 0.0771484375, "loss_aux_layer_15": 0.085693359375, "loss_aux_layer_16": 0.0948486328125, "loss_aux_layer_17": 0.1029052734375, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.1148681640625, "loss_aux_layer_2": 0.04180908203125, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.1331787109375, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05096435546875, "loss_aux_layer_4": 0.05322265625, "loss_aux_layer_5": 0.05474853515625, "loss_aux_layer_6": 0.05767822265625, "loss_aux_layer_7": 0.0560302734375, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.054443359375, "step": 3672, "total_loss": 0.579370804131031 }, { "epoch": 0.727182736091863, "grad_norm": 1.3323115110397339, "learning_rate": 5e-05, "llm_loss": 0.6075566038489342, "loss": 2.7661, "loss_aux_layer_0": 0.0139007568359375, "loss_aux_layer_1": 0.032440185546875, "loss_aux_layer_10": 0.0595703125, "loss_aux_layer_11": 0.0635986328125, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.073974609375, "loss_aux_layer_14": 0.08251953125, "loss_aux_layer_15": 0.0914306640625, "loss_aux_layer_16": 0.1009521484375, "loss_aux_layer_17": 0.10888671875, "loss_aux_layer_18": 0.1171875, "loss_aux_layer_19": 0.1207275390625, "loss_aux_layer_2": 0.046142578125, "loss_aux_layer_20": 0.1280517578125, "loss_aux_layer_21": 0.136962890625, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.0557861328125, "loss_aux_layer_4": 0.05804443359375, "loss_aux_layer_5": 0.05950927734375, "loss_aux_layer_6": 0.062255859375, "loss_aux_layer_7": 0.0601806640625, "loss_aux_layer_8": 0.05963134765625, "loss_aux_layer_9": 0.05841064453125, "step": 3673, "total_loss": 0.6915255934000015 }, { "epoch": 0.7273807166897645, "grad_norm": 0.9042942523956299, "learning_rate": 5e-05, "llm_loss": 0.5947059988975525, "loss": 2.7006, "loss_aux_layer_0": 0.013519287109375, "loss_aux_layer_1": 0.0313720703125, "loss_aux_layer_10": 0.05712890625, "loss_aux_layer_11": 0.060791015625, "loss_aux_layer_12": 0.06512451171875, "loss_aux_layer_13": 0.0704345703125, "loss_aux_layer_14": 0.078857421875, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.0965576171875, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.112548828125, "loss_aux_layer_19": 0.1162109375, "loss_aux_layer_2": 0.04351806640625, "loss_aux_layer_20": 0.1241455078125, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.05322265625, "loss_aux_layer_4": 0.05523681640625, "loss_aux_layer_5": 0.0562744140625, "loss_aux_layer_6": 0.0589599609375, "loss_aux_layer_7": 0.0570068359375, "loss_aux_layer_8": 0.056640625, "loss_aux_layer_9": 0.05572509765625, "step": 3674, "total_loss": 0.6751376092433929 }, { "epoch": 0.7275786972876658, "grad_norm": 0.9600253105163574, "learning_rate": 5e-05, "llm_loss": 0.6763913035392761, "loss": 3.0275, "loss_aux_layer_0": 0.0131988525390625, "loss_aux_layer_1": 0.031951904296875, "loss_aux_layer_10": 0.05938720703125, "loss_aux_layer_11": 0.06292724609375, "loss_aux_layer_12": 0.0672607421875, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.0802001953125, "loss_aux_layer_15": 0.0877685546875, "loss_aux_layer_16": 0.0963134765625, "loss_aux_layer_17": 0.1038818359375, "loss_aux_layer_18": 0.1112060546875, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.04498291015625, "loss_aux_layer_20": 0.121337890625, "loss_aux_layer_21": 0.1279296875, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.180419921875, "loss_aux_layer_3": 0.0550537109375, "loss_aux_layer_4": 0.05743408203125, "loss_aux_layer_5": 0.058837890625, "loss_aux_layer_6": 0.0616455078125, "loss_aux_layer_7": 0.05975341796875, "loss_aux_layer_8": 0.05926513671875, "loss_aux_layer_9": 0.05816650390625, "step": 3675, "total_loss": 0.75688037276268 }, { "epoch": 0.7277766778855672, "grad_norm": 1.0488780736923218, "learning_rate": 5e-05, "llm_loss": 0.5464417040348053, "loss": 2.5114, "loss_aux_layer_0": 0.0128021240234375, "loss_aux_layer_1": 0.0303955078125, "loss_aux_layer_10": 0.05682373046875, "loss_aux_layer_11": 0.06060791015625, "loss_aux_layer_12": 0.06512451171875, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.0797119140625, "loss_aux_layer_15": 0.0888671875, "loss_aux_layer_16": 0.0989990234375, "loss_aux_layer_17": 0.107421875, "loss_aux_layer_18": 0.1162109375, "loss_aux_layer_19": 0.120361328125, "loss_aux_layer_2": 0.0426025390625, "loss_aux_layer_20": 0.1282958984375, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.193115234375, "loss_aux_layer_3": 0.05181884765625, "loss_aux_layer_4": 0.05401611328125, "loss_aux_layer_5": 0.0560302734375, "loss_aux_layer_6": 0.05877685546875, "loss_aux_layer_7": 0.0567626953125, "loss_aux_layer_8": 0.0565185546875, "loss_aux_layer_9": 0.0555419921875, "step": 3676, "total_loss": 0.6278414875268936 }, { "epoch": 0.7279746584834687, "grad_norm": 1.2996506690979004, "learning_rate": 5e-05, "llm_loss": 0.6248321533203125, "loss": 2.8325, "loss_aux_layer_0": 0.013214111328125, "loss_aux_layer_1": 0.033294677734375, "loss_aux_layer_10": 0.06048583984375, "loss_aux_layer_11": 0.0648193359375, "loss_aux_layer_12": 0.0693359375, "loss_aux_layer_13": 0.074951171875, "loss_aux_layer_14": 0.08349609375, "loss_aux_layer_15": 0.0916748046875, "loss_aux_layer_16": 0.1005859375, "loss_aux_layer_17": 0.1080322265625, "loss_aux_layer_18": 0.11572265625, "loss_aux_layer_19": 0.1181640625, "loss_aux_layer_2": 0.0458984375, "loss_aux_layer_20": 0.1260986328125, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.05548095703125, "loss_aux_layer_4": 0.05792236328125, "loss_aux_layer_5": 0.05938720703125, "loss_aux_layer_6": 0.06243896484375, "loss_aux_layer_7": 0.060546875, "loss_aux_layer_8": 0.0601806640625, "loss_aux_layer_9": 0.0592041015625, "step": 3677, "total_loss": 0.7081146985292435 }, { "epoch": 0.72817263908137, "grad_norm": 1.2498693466186523, "learning_rate": 5e-05, "llm_loss": 0.5742917954921722, "loss": 2.6192, "loss_aux_layer_0": 0.0138397216796875, "loss_aux_layer_1": 0.0301513671875, "loss_aux_layer_10": 0.05645751953125, "loss_aux_layer_11": 0.0601806640625, "loss_aux_layer_12": 0.0645751953125, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.0789794921875, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.0965576171875, "loss_aux_layer_17": 0.1043701171875, "loss_aux_layer_18": 0.1131591796875, "loss_aux_layer_19": 0.117431640625, "loss_aux_layer_2": 0.04254150390625, "loss_aux_layer_20": 0.1258544921875, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.05145263671875, "loss_aux_layer_4": 0.0538330078125, "loss_aux_layer_5": 0.05560302734375, "loss_aux_layer_6": 0.0582275390625, "loss_aux_layer_7": 0.05645751953125, "loss_aux_layer_8": 0.05633544921875, "loss_aux_layer_9": 0.05535888671875, "step": 3678, "total_loss": 0.6548026502132416 }, { "epoch": 0.7283706196792714, "grad_norm": 0.9330843687057495, "learning_rate": 5e-05, "llm_loss": 0.5432324409484863, "loss": 2.5086, "loss_aux_layer_0": 0.01300048828125, "loss_aux_layer_1": 0.03375244140625, "loss_aux_layer_10": 0.06097412109375, "loss_aux_layer_11": 0.06451416015625, "loss_aux_layer_12": 0.06890869140625, "loss_aux_layer_13": 0.07421875, "loss_aux_layer_14": 0.082763671875, "loss_aux_layer_15": 0.0906982421875, "loss_aux_layer_16": 0.099853515625, "loss_aux_layer_17": 0.1072998046875, "loss_aux_layer_18": 0.115478515625, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.04705810546875, "loss_aux_layer_20": 0.1280517578125, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.056884765625, "loss_aux_layer_4": 0.05926513671875, "loss_aux_layer_5": 0.06097412109375, "loss_aux_layer_6": 0.06390380859375, "loss_aux_layer_7": 0.061767578125, "loss_aux_layer_8": 0.06103515625, "loss_aux_layer_9": 0.05963134765625, "step": 3679, "total_loss": 0.6271447688341141 }, { "epoch": 0.7285686002771729, "grad_norm": 1.357842206954956, "learning_rate": 5e-05, "llm_loss": 0.5641478151082993, "loss": 2.5928, "loss_aux_layer_0": 0.0136260986328125, "loss_aux_layer_1": 0.033172607421875, "loss_aux_layer_10": 0.06109619140625, "loss_aux_layer_11": 0.065185546875, "loss_aux_layer_12": 0.0694580078125, "loss_aux_layer_13": 0.074951171875, "loss_aux_layer_14": 0.0836181640625, "loss_aux_layer_15": 0.0919189453125, "loss_aux_layer_16": 0.101318359375, "loss_aux_layer_17": 0.1087646484375, "loss_aux_layer_18": 0.116455078125, "loss_aux_layer_19": 0.119873046875, "loss_aux_layer_2": 0.0467529296875, "loss_aux_layer_20": 0.1279296875, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.0560302734375, "loss_aux_layer_4": 0.05853271484375, "loss_aux_layer_5": 0.06005859375, "loss_aux_layer_6": 0.06341552734375, "loss_aux_layer_7": 0.0615234375, "loss_aux_layer_8": 0.06103515625, "loss_aux_layer_9": 0.0596923828125, "step": 3680, "total_loss": 0.6482022106647491 }, { "epoch": 0.7287665808750743, "grad_norm": 0.934264600276947, "learning_rate": 5e-05, "llm_loss": 0.6145954132080078, "loss": 2.7841, "loss_aux_layer_0": 0.0140228271484375, "loss_aux_layer_1": 0.03173828125, "loss_aux_layer_10": 0.05816650390625, "loss_aux_layer_11": 0.0616455078125, "loss_aux_layer_12": 0.06597900390625, "loss_aux_layer_13": 0.0712890625, "loss_aux_layer_14": 0.07958984375, "loss_aux_layer_15": 0.088134765625, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.1134033203125, "loss_aux_layer_19": 0.1170654296875, "loss_aux_layer_2": 0.0440673828125, "loss_aux_layer_20": 0.125244140625, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.05377197265625, "loss_aux_layer_4": 0.05621337890625, "loss_aux_layer_5": 0.05767822265625, "loss_aux_layer_6": 0.060791015625, "loss_aux_layer_7": 0.05877685546875, "loss_aux_layer_8": 0.05828857421875, "loss_aux_layer_9": 0.0570068359375, "step": 3681, "total_loss": 0.6960174441337585 }, { "epoch": 0.7289645614729756, "grad_norm": 1.0131484270095825, "learning_rate": 5e-05, "llm_loss": 0.6089975088834763, "loss": 2.7724, "loss_aux_layer_0": 0.0131378173828125, "loss_aux_layer_1": 0.03271484375, "loss_aux_layer_10": 0.06036376953125, "loss_aux_layer_11": 0.0643310546875, "loss_aux_layer_12": 0.0687255859375, "loss_aux_layer_13": 0.07421875, "loss_aux_layer_14": 0.0831298828125, "loss_aux_layer_15": 0.0916748046875, "loss_aux_layer_16": 0.10107421875, "loss_aux_layer_17": 0.1087646484375, "loss_aux_layer_18": 0.1168212890625, "loss_aux_layer_19": 0.1201171875, "loss_aux_layer_2": 0.04620361328125, "loss_aux_layer_20": 0.1282958984375, "loss_aux_layer_21": 0.136962890625, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.0560302734375, "loss_aux_layer_4": 0.05865478515625, "loss_aux_layer_5": 0.06005859375, "loss_aux_layer_6": 0.06298828125, "loss_aux_layer_7": 0.06097412109375, "loss_aux_layer_8": 0.0604248046875, "loss_aux_layer_9": 0.05914306640625, "step": 3682, "total_loss": 0.6931022554636002 }, { "epoch": 0.729162542070877, "grad_norm": 0.9622688293457031, "learning_rate": 5e-05, "llm_loss": 0.6188497394323349, "loss": 2.8012, "loss_aux_layer_0": 0.013763427734375, "loss_aux_layer_1": 0.031951904296875, "loss_aux_layer_10": 0.05865478515625, "loss_aux_layer_11": 0.06243896484375, "loss_aux_layer_12": 0.0667724609375, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.0799560546875, "loss_aux_layer_15": 0.0882568359375, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.1171875, "loss_aux_layer_2": 0.0440673828125, "loss_aux_layer_20": 0.1248779296875, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.05352783203125, "loss_aux_layer_4": 0.05609130859375, "loss_aux_layer_5": 0.0576171875, "loss_aux_layer_6": 0.06060791015625, "loss_aux_layer_7": 0.058837890625, "loss_aux_layer_8": 0.05853271484375, "loss_aux_layer_9": 0.0572509765625, "step": 3683, "total_loss": 0.7003019899129868 }, { "epoch": 0.7293605226687785, "grad_norm": 1.0886603593826294, "learning_rate": 5e-05, "llm_loss": 0.5567996054887772, "loss": 2.5769, "loss_aux_layer_0": 0.0125732421875, "loss_aux_layer_1": 0.03521728515625, "loss_aux_layer_10": 0.0650634765625, "loss_aux_layer_11": 0.069580078125, "loss_aux_layer_12": 0.0738525390625, "loss_aux_layer_13": 0.079345703125, "loss_aux_layer_14": 0.088134765625, "loss_aux_layer_15": 0.096435546875, "loss_aux_layer_16": 0.105712890625, "loss_aux_layer_17": 0.113037109375, "loss_aux_layer_18": 0.120849609375, "loss_aux_layer_19": 0.122314453125, "loss_aux_layer_2": 0.04998779296875, "loss_aux_layer_20": 0.1292724609375, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.060546875, "loss_aux_layer_4": 0.063232421875, "loss_aux_layer_5": 0.06494140625, "loss_aux_layer_6": 0.0679931640625, "loss_aux_layer_7": 0.06622314453125, "loss_aux_layer_8": 0.0653076171875, "loss_aux_layer_9": 0.063720703125, "step": 3684, "total_loss": 0.6442360430955887 }, { "epoch": 0.7295585032666798, "grad_norm": 0.9082003831863403, "learning_rate": 5e-05, "llm_loss": 0.541421577334404, "loss": 2.4967, "loss_aux_layer_0": 0.0138397216796875, "loss_aux_layer_1": 0.030853271484375, "loss_aux_layer_10": 0.05828857421875, "loss_aux_layer_11": 0.062255859375, "loss_aux_layer_12": 0.066650390625, "loss_aux_layer_13": 0.072509765625, "loss_aux_layer_14": 0.0809326171875, "loss_aux_layer_15": 0.090087890625, "loss_aux_layer_16": 0.100341796875, "loss_aux_layer_17": 0.1083984375, "loss_aux_layer_18": 0.11669921875, "loss_aux_layer_19": 0.120849609375, "loss_aux_layer_2": 0.0430908203125, "loss_aux_layer_20": 0.129150390625, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.200439453125, "loss_aux_layer_3": 0.05224609375, "loss_aux_layer_4": 0.05462646484375, "loss_aux_layer_5": 0.05633544921875, "loss_aux_layer_6": 0.05926513671875, "loss_aux_layer_7": 0.05767822265625, "loss_aux_layer_8": 0.05767822265625, "loss_aux_layer_9": 0.056884765625, "step": 3685, "total_loss": 0.6241857260465622 }, { "epoch": 0.7297564838645813, "grad_norm": 1.1659526824951172, "learning_rate": 5e-05, "llm_loss": 0.5687541663646698, "loss": 2.5969, "loss_aux_layer_0": 0.0136871337890625, "loss_aux_layer_1": 0.031768798828125, "loss_aux_layer_10": 0.05731201171875, "loss_aux_layer_11": 0.0611572265625, "loss_aux_layer_12": 0.06549072265625, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.078857421875, "loss_aux_layer_15": 0.08740234375, "loss_aux_layer_16": 0.0963134765625, "loss_aux_layer_17": 0.1036376953125, "loss_aux_layer_18": 0.1116943359375, "loss_aux_layer_19": 0.114501953125, "loss_aux_layer_2": 0.0447998046875, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.053955078125, "loss_aux_layer_4": 0.05596923828125, "loss_aux_layer_5": 0.05712890625, "loss_aux_layer_6": 0.05999755859375, "loss_aux_layer_7": 0.0577392578125, "loss_aux_layer_8": 0.0570068359375, "loss_aux_layer_9": 0.055908203125, "step": 3686, "total_loss": 0.6492249816656113 }, { "epoch": 0.7299544644624827, "grad_norm": 0.9390293955802917, "learning_rate": 5e-05, "llm_loss": 0.5668743997812271, "loss": 2.5962, "loss_aux_layer_0": 0.0134429931640625, "loss_aux_layer_1": 0.030487060546875, "loss_aux_layer_10": 0.05780029296875, "loss_aux_layer_11": 0.06158447265625, "loss_aux_layer_12": 0.066162109375, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.089599609375, "loss_aux_layer_16": 0.0992431640625, "loss_aux_layer_17": 0.1075439453125, "loss_aux_layer_18": 0.1160888671875, "loss_aux_layer_19": 0.119873046875, "loss_aux_layer_2": 0.043701171875, "loss_aux_layer_20": 0.1280517578125, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.05303955078125, "loss_aux_layer_4": 0.05548095703125, "loss_aux_layer_5": 0.05706787109375, "loss_aux_layer_6": 0.0599365234375, "loss_aux_layer_7": 0.05792236328125, "loss_aux_layer_8": 0.05743408203125, "loss_aux_layer_9": 0.0565185546875, "step": 3687, "total_loss": 0.6490454524755478 }, { "epoch": 0.7301524450603841, "grad_norm": 0.9983271360397339, "learning_rate": 5e-05, "llm_loss": 0.5975986197590828, "loss": 2.7273, "loss_aux_layer_0": 0.01361083984375, "loss_aux_layer_1": 0.033355712890625, "loss_aux_layer_10": 0.06195068359375, "loss_aux_layer_11": 0.06597900390625, "loss_aux_layer_12": 0.0704345703125, "loss_aux_layer_13": 0.0758056640625, "loss_aux_layer_14": 0.083984375, "loss_aux_layer_15": 0.0921630859375, "loss_aux_layer_16": 0.1011962890625, "loss_aux_layer_17": 0.1090087890625, "loss_aux_layer_18": 0.1165771484375, "loss_aux_layer_19": 0.1192626953125, "loss_aux_layer_2": 0.04656982421875, "loss_aux_layer_20": 0.1268310546875, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.056396484375, "loss_aux_layer_4": 0.05914306640625, "loss_aux_layer_5": 0.06085205078125, "loss_aux_layer_6": 0.0638427734375, "loss_aux_layer_7": 0.06182861328125, "loss_aux_layer_8": 0.06146240234375, "loss_aux_layer_9": 0.06036376953125, "step": 3688, "total_loss": 0.6818349063396454 }, { "epoch": 0.7303504256582855, "grad_norm": 0.9727997779846191, "learning_rate": 5e-05, "llm_loss": 0.5993870794773102, "loss": 2.7237, "loss_aux_layer_0": 0.013092041015625, "loss_aux_layer_1": 0.03204345703125, "loss_aux_layer_10": 0.05908203125, "loss_aux_layer_11": 0.06317138671875, "loss_aux_layer_12": 0.0673828125, "loss_aux_layer_13": 0.0726318359375, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.0887451171875, "loss_aux_layer_16": 0.0975341796875, "loss_aux_layer_17": 0.10546875, "loss_aux_layer_18": 0.1131591796875, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.0440673828125, "loss_aux_layer_20": 0.1243896484375, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.053466796875, "loss_aux_layer_4": 0.05609130859375, "loss_aux_layer_5": 0.05792236328125, "loss_aux_layer_6": 0.06109619140625, "loss_aux_layer_7": 0.059326171875, "loss_aux_layer_8": 0.058837890625, "loss_aux_layer_9": 0.0574951171875, "step": 3689, "total_loss": 0.6809231340885162 }, { "epoch": 0.7305484062561869, "grad_norm": 0.9933415055274963, "learning_rate": 5e-05, "llm_loss": 0.6521745473146439, "loss": 2.9493, "loss_aux_layer_0": 0.0131988525390625, "loss_aux_layer_1": 0.0333251953125, "loss_aux_layer_10": 0.0626220703125, "loss_aux_layer_11": 0.066650390625, "loss_aux_layer_12": 0.0711669921875, "loss_aux_layer_13": 0.076904296875, "loss_aux_layer_14": 0.0850830078125, "loss_aux_layer_15": 0.0936279296875, "loss_aux_layer_16": 0.1025390625, "loss_aux_layer_17": 0.1103515625, "loss_aux_layer_18": 0.118408203125, "loss_aux_layer_19": 0.12109375, "loss_aux_layer_2": 0.046630859375, "loss_aux_layer_20": 0.12841796875, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.056884765625, "loss_aux_layer_4": 0.05963134765625, "loss_aux_layer_5": 0.0616455078125, "loss_aux_layer_6": 0.0648193359375, "loss_aux_layer_7": 0.06298828125, "loss_aux_layer_8": 0.06256103515625, "loss_aux_layer_9": 0.0615234375, "step": 3690, "total_loss": 0.7373189181089401 }, { "epoch": 0.7307463868540883, "grad_norm": 0.9600886702537537, "learning_rate": 5e-05, "llm_loss": 0.5010054334998131, "loss": 2.3448, "loss_aux_layer_0": 0.0133209228515625, "loss_aux_layer_1": 0.03338623046875, "loss_aux_layer_10": 0.0616455078125, "loss_aux_layer_11": 0.06591796875, "loss_aux_layer_12": 0.0703125, "loss_aux_layer_13": 0.0758056640625, "loss_aux_layer_14": 0.0841064453125, "loss_aux_layer_15": 0.0928955078125, "loss_aux_layer_16": 0.102294921875, "loss_aux_layer_17": 0.109619140625, "loss_aux_layer_18": 0.1185302734375, "loss_aux_layer_19": 0.1217041015625, "loss_aux_layer_2": 0.0462646484375, "loss_aux_layer_20": 0.129150390625, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.197509765625, "loss_aux_layer_3": 0.056396484375, "loss_aux_layer_4": 0.05914306640625, "loss_aux_layer_5": 0.06072998046875, "loss_aux_layer_6": 0.06402587890625, "loss_aux_layer_7": 0.06231689453125, "loss_aux_layer_8": 0.06182861328125, "loss_aux_layer_9": 0.06048583984375, "step": 3691, "total_loss": 0.5861897617578506 }, { "epoch": 0.7309443674519897, "grad_norm": 1.2347692251205444, "learning_rate": 5e-05, "llm_loss": 0.5745822712779045, "loss": 2.6168, "loss_aux_layer_0": 0.0128173828125, "loss_aux_layer_1": 0.031463623046875, "loss_aux_layer_10": 0.0560302734375, "loss_aux_layer_11": 0.0601806640625, "loss_aux_layer_12": 0.064208984375, "loss_aux_layer_13": 0.06982421875, "loss_aux_layer_14": 0.0780029296875, "loss_aux_layer_15": 0.086181640625, "loss_aux_layer_16": 0.095458984375, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.11181640625, "loss_aux_layer_19": 0.115478515625, "loss_aux_layer_2": 0.0435791015625, "loss_aux_layer_20": 0.1231689453125, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.05230712890625, "loss_aux_layer_4": 0.0543212890625, "loss_aux_layer_5": 0.05560302734375, "loss_aux_layer_6": 0.0584716796875, "loss_aux_layer_7": 0.0565185546875, "loss_aux_layer_8": 0.05584716796875, "loss_aux_layer_9": 0.05462646484375, "step": 3692, "total_loss": 0.654195249080658 }, { "epoch": 0.7311423480498911, "grad_norm": 0.8783536553382874, "learning_rate": 5e-05, "llm_loss": 0.627415269613266, "loss": 2.8431, "loss_aux_layer_0": 0.0140228271484375, "loss_aux_layer_1": 0.0323486328125, "loss_aux_layer_10": 0.05987548828125, "loss_aux_layer_11": 0.06365966796875, "loss_aux_layer_12": 0.0679931640625, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.0823974609375, "loss_aux_layer_15": 0.0911865234375, "loss_aux_layer_16": 0.100341796875, "loss_aux_layer_17": 0.108154296875, "loss_aux_layer_18": 0.115966796875, "loss_aux_layer_19": 0.119384765625, "loss_aux_layer_2": 0.04541015625, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.193115234375, "loss_aux_layer_3": 0.05517578125, "loss_aux_layer_4": 0.0577392578125, "loss_aux_layer_5": 0.05914306640625, "loss_aux_layer_6": 0.06219482421875, "loss_aux_layer_7": 0.060302734375, "loss_aux_layer_8": 0.059814453125, "loss_aux_layer_9": 0.05865478515625, "step": 3693, "total_loss": 0.7107734084129333 }, { "epoch": 0.7313403286477925, "grad_norm": 1.1562424898147583, "learning_rate": 5e-05, "llm_loss": 0.6103257387876511, "loss": 2.7665, "loss_aux_layer_0": 0.0132904052734375, "loss_aux_layer_1": 0.031036376953125, "loss_aux_layer_10": 0.05828857421875, "loss_aux_layer_11": 0.06207275390625, "loss_aux_layer_12": 0.066162109375, "loss_aux_layer_13": 0.071533203125, "loss_aux_layer_14": 0.07958984375, "loss_aux_layer_15": 0.088134765625, "loss_aux_layer_16": 0.09765625, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.114013671875, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.0440673828125, "loss_aux_layer_20": 0.1256103515625, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.05328369140625, "loss_aux_layer_4": 0.05584716796875, "loss_aux_layer_5": 0.057373046875, "loss_aux_layer_6": 0.06048583984375, "loss_aux_layer_7": 0.0587158203125, "loss_aux_layer_8": 0.0584716796875, "loss_aux_layer_9": 0.0572509765625, "step": 3694, "total_loss": 0.691636249423027 }, { "epoch": 0.731538309245694, "grad_norm": 0.9914985299110413, "learning_rate": 5e-05, "llm_loss": 0.5723645240068436, "loss": 2.6124, "loss_aux_layer_0": 0.0141448974609375, "loss_aux_layer_1": 0.031341552734375, "loss_aux_layer_10": 0.057373046875, "loss_aux_layer_11": 0.06121826171875, "loss_aux_layer_12": 0.0655517578125, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.079345703125, "loss_aux_layer_15": 0.0877685546875, "loss_aux_layer_16": 0.096923828125, "loss_aux_layer_17": 0.1043701171875, "loss_aux_layer_18": 0.11279296875, "loss_aux_layer_19": 0.116455078125, "loss_aux_layer_2": 0.04425048828125, "loss_aux_layer_20": 0.12451171875, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.05316162109375, "loss_aux_layer_4": 0.0555419921875, "loss_aux_layer_5": 0.056884765625, "loss_aux_layer_6": 0.05987548828125, "loss_aux_layer_7": 0.0577392578125, "loss_aux_layer_8": 0.05712890625, "loss_aux_layer_9": 0.0562744140625, "step": 3695, "total_loss": 0.6530885398387909 }, { "epoch": 0.7317362898435953, "grad_norm": 0.8455948233604431, "learning_rate": 5e-05, "llm_loss": 0.6317980289459229, "loss": 2.8469, "loss_aux_layer_0": 0.014129638671875, "loss_aux_layer_1": 0.0322265625, "loss_aux_layer_10": 0.057373046875, "loss_aux_layer_11": 0.061279296875, "loss_aux_layer_12": 0.0654296875, "loss_aux_layer_13": 0.0704345703125, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.08740234375, "loss_aux_layer_16": 0.0963134765625, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.114501953125, "loss_aux_layer_2": 0.04388427734375, "loss_aux_layer_20": 0.1221923828125, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.0531005859375, "loss_aux_layer_4": 0.0556640625, "loss_aux_layer_5": 0.05712890625, "loss_aux_layer_6": 0.06005859375, "loss_aux_layer_7": 0.05792236328125, "loss_aux_layer_8": 0.057373046875, "loss_aux_layer_9": 0.05621337890625, "step": 3696, "total_loss": 0.711733490228653 }, { "epoch": 0.7319342704414967, "grad_norm": 0.8999587893486023, "learning_rate": 5e-05, "llm_loss": 0.6148714274168015, "loss": 2.7831, "loss_aux_layer_0": 0.013519287109375, "loss_aux_layer_1": 0.031494140625, "loss_aux_layer_10": 0.0572509765625, "loss_aux_layer_11": 0.0609130859375, "loss_aux_layer_12": 0.0657958984375, "loss_aux_layer_13": 0.0709228515625, "loss_aux_layer_14": 0.0794677734375, "loss_aux_layer_15": 0.0880126953125, "loss_aux_layer_16": 0.096923828125, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.11279296875, "loss_aux_layer_19": 0.116943359375, "loss_aux_layer_2": 0.0435791015625, "loss_aux_layer_20": 0.1251220703125, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.191162109375, "loss_aux_layer_3": 0.05303955078125, "loss_aux_layer_4": 0.0552978515625, "loss_aux_layer_5": 0.05682373046875, "loss_aux_layer_6": 0.059814453125, "loss_aux_layer_7": 0.0576171875, "loss_aux_layer_8": 0.0570068359375, "loss_aux_layer_9": 0.05596923828125, "step": 3697, "total_loss": 0.6957801282405853 }, { "epoch": 0.7321322510393982, "grad_norm": 0.8614102005958557, "learning_rate": 5e-05, "llm_loss": 0.5052796304225922, "loss": 2.3465, "loss_aux_layer_0": 0.013946533203125, "loss_aux_layer_1": 0.032501220703125, "loss_aux_layer_10": 0.05914306640625, "loss_aux_layer_11": 0.06298828125, "loss_aux_layer_12": 0.067138671875, "loss_aux_layer_13": 0.072265625, "loss_aux_layer_14": 0.080078125, "loss_aux_layer_15": 0.0880126953125, "loss_aux_layer_16": 0.0970458984375, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.1119384765625, "loss_aux_layer_19": 0.1148681640625, "loss_aux_layer_2": 0.04510498046875, "loss_aux_layer_20": 0.12255859375, "loss_aux_layer_21": 0.1312255859375, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.0546875, "loss_aux_layer_4": 0.05694580078125, "loss_aux_layer_5": 0.0584716796875, "loss_aux_layer_6": 0.0611572265625, "loss_aux_layer_7": 0.05950927734375, "loss_aux_layer_8": 0.05914306640625, "loss_aux_layer_9": 0.05792236328125, "step": 3698, "total_loss": 0.5866246372461319 }, { "epoch": 0.7323302316372996, "grad_norm": 0.7212034463882446, "learning_rate": 5e-05, "llm_loss": 0.567166306078434, "loss": 2.5901, "loss_aux_layer_0": 0.0135955810546875, "loss_aux_layer_1": 0.031829833984375, "loss_aux_layer_10": 0.05743408203125, "loss_aux_layer_11": 0.06146240234375, "loss_aux_layer_12": 0.06610107421875, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.080322265625, "loss_aux_layer_15": 0.0882568359375, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.1048583984375, "loss_aux_layer_18": 0.11279296875, "loss_aux_layer_19": 0.115234375, "loss_aux_layer_2": 0.04351806640625, "loss_aux_layer_20": 0.122802734375, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.0531005859375, "loss_aux_layer_4": 0.05548095703125, "loss_aux_layer_5": 0.056884765625, "loss_aux_layer_6": 0.0595703125, "loss_aux_layer_7": 0.05780029296875, "loss_aux_layer_8": 0.05743408203125, "loss_aux_layer_9": 0.05621337890625, "step": 3699, "total_loss": 0.6475224792957306 }, { "epoch": 0.7325282122352009, "grad_norm": 0.8269683122634888, "learning_rate": 5e-05, "llm_loss": 0.563372790813446, "loss": 2.5872, "loss_aux_layer_0": 0.0140380859375, "loss_aux_layer_1": 0.032318115234375, "loss_aux_layer_10": 0.06011962890625, "loss_aux_layer_11": 0.0640869140625, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.0745849609375, "loss_aux_layer_14": 0.083740234375, "loss_aux_layer_15": 0.092041015625, "loss_aux_layer_16": 0.1014404296875, "loss_aux_layer_17": 0.109619140625, "loss_aux_layer_18": 0.1180419921875, "loss_aux_layer_19": 0.1209716796875, "loss_aux_layer_2": 0.044677734375, "loss_aux_layer_20": 0.12841796875, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.05426025390625, "loss_aux_layer_4": 0.0567626953125, "loss_aux_layer_5": 0.058349609375, "loss_aux_layer_6": 0.0614013671875, "loss_aux_layer_7": 0.05975341796875, "loss_aux_layer_8": 0.0596923828125, "loss_aux_layer_9": 0.058837890625, "step": 3700, "total_loss": 0.6467935591936111 }, { "epoch": 0.7327261928331024, "grad_norm": 0.7545577883720398, "learning_rate": 5e-05, "llm_loss": 0.5935862511396408, "loss": 2.7024, "loss_aux_layer_0": 0.0129852294921875, "loss_aux_layer_1": 0.03192138671875, "loss_aux_layer_10": 0.05914306640625, "loss_aux_layer_11": 0.06280517578125, "loss_aux_layer_12": 0.06744384765625, "loss_aux_layer_13": 0.0728759765625, "loss_aux_layer_14": 0.0810546875, "loss_aux_layer_15": 0.089111328125, "loss_aux_layer_16": 0.0987548828125, "loss_aux_layer_17": 0.10693359375, "loss_aux_layer_18": 0.1148681640625, "loss_aux_layer_19": 0.1180419921875, "loss_aux_layer_2": 0.04449462890625, "loss_aux_layer_20": 0.12548828125, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.05450439453125, "loss_aux_layer_4": 0.05694580078125, "loss_aux_layer_5": 0.0582275390625, "loss_aux_layer_6": 0.0611572265625, "loss_aux_layer_7": 0.05914306640625, "loss_aux_layer_8": 0.05877685546875, "loss_aux_layer_9": 0.05780029296875, "step": 3701, "total_loss": 0.6755905300378799 }, { "epoch": 0.7329241734310038, "grad_norm": 0.9256556034088135, "learning_rate": 5e-05, "llm_loss": 0.5554565638303757, "loss": 2.5518, "loss_aux_layer_0": 0.013275146484375, "loss_aux_layer_1": 0.033203125, "loss_aux_layer_10": 0.06036376953125, "loss_aux_layer_11": 0.064453125, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.0740966796875, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.0902099609375, "loss_aux_layer_16": 0.098876953125, "loss_aux_layer_17": 0.1058349609375, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.1165771484375, "loss_aux_layer_2": 0.04705810546875, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.05645751953125, "loss_aux_layer_4": 0.05853271484375, "loss_aux_layer_5": 0.0596923828125, "loss_aux_layer_6": 0.0623779296875, "loss_aux_layer_7": 0.06085205078125, "loss_aux_layer_8": 0.06036376953125, "loss_aux_layer_9": 0.0592041015625, "step": 3702, "total_loss": 0.6379586756229401 }, { "epoch": 0.7331221540289051, "grad_norm": 0.919684648513794, "learning_rate": 5e-05, "llm_loss": 0.5615458860993385, "loss": 2.5739, "loss_aux_layer_0": 0.0130157470703125, "loss_aux_layer_1": 0.03143310546875, "loss_aux_layer_10": 0.058349609375, "loss_aux_layer_11": 0.0623779296875, "loss_aux_layer_12": 0.06689453125, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.0894775390625, "loss_aux_layer_16": 0.0986328125, "loss_aux_layer_17": 0.106689453125, "loss_aux_layer_18": 0.1148681640625, "loss_aux_layer_19": 0.1182861328125, "loss_aux_layer_2": 0.0438232421875, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.05340576171875, "loss_aux_layer_4": 0.0557861328125, "loss_aux_layer_5": 0.05712890625, "loss_aux_layer_6": 0.05999755859375, "loss_aux_layer_7": 0.05853271484375, "loss_aux_layer_8": 0.0579833984375, "loss_aux_layer_9": 0.05694580078125, "step": 3703, "total_loss": 0.6434787958860397 }, { "epoch": 0.7333201346268066, "grad_norm": 0.7605764865875244, "learning_rate": 5e-05, "llm_loss": 0.6109715104103088, "loss": 2.7923, "loss_aux_layer_0": 0.0130767822265625, "loss_aux_layer_1": 0.03515625, "loss_aux_layer_10": 0.0643310546875, "loss_aux_layer_11": 0.0689697265625, "loss_aux_layer_12": 0.073486328125, "loss_aux_layer_13": 0.0791015625, "loss_aux_layer_14": 0.0872802734375, "loss_aux_layer_15": 0.0953369140625, "loss_aux_layer_16": 0.104248046875, "loss_aux_layer_17": 0.1116943359375, "loss_aux_layer_18": 0.1192626953125, "loss_aux_layer_19": 0.1219482421875, "loss_aux_layer_2": 0.04925537109375, "loss_aux_layer_20": 0.1292724609375, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.0595703125, "loss_aux_layer_4": 0.06195068359375, "loss_aux_layer_5": 0.06353759765625, "loss_aux_layer_6": 0.0667724609375, "loss_aux_layer_7": 0.0650634765625, "loss_aux_layer_8": 0.0643310546875, "loss_aux_layer_9": 0.0631103515625, "step": 3704, "total_loss": 0.6980681270360947 }, { "epoch": 0.733518115224708, "grad_norm": 0.7159084677696228, "learning_rate": 5e-05, "llm_loss": 0.4873031750321388, "loss": 2.2885, "loss_aux_layer_0": 0.0128021240234375, "loss_aux_layer_1": 0.0325927734375, "loss_aux_layer_10": 0.0606689453125, "loss_aux_layer_11": 0.06475830078125, "loss_aux_layer_12": 0.0693359375, "loss_aux_layer_13": 0.0753173828125, "loss_aux_layer_14": 0.084228515625, "loss_aux_layer_15": 0.0931396484375, "loss_aux_layer_16": 0.1024169921875, "loss_aux_layer_17": 0.1102294921875, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.121826171875, "loss_aux_layer_2": 0.0460205078125, "loss_aux_layer_20": 0.129638671875, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.159423828125, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.05621337890625, "loss_aux_layer_4": 0.05865478515625, "loss_aux_layer_5": 0.06024169921875, "loss_aux_layer_6": 0.06304931640625, "loss_aux_layer_7": 0.06121826171875, "loss_aux_layer_8": 0.06097412109375, "loss_aux_layer_9": 0.059326171875, "step": 3705, "total_loss": 0.57211534678936 }, { "epoch": 0.7337160958226094, "grad_norm": 0.7235291004180908, "learning_rate": 5e-05, "llm_loss": 0.5867384374141693, "loss": 2.6964, "loss_aux_layer_0": 0.0126495361328125, "loss_aux_layer_1": 0.03472900390625, "loss_aux_layer_10": 0.065185546875, "loss_aux_layer_11": 0.0694580078125, "loss_aux_layer_12": 0.0738525390625, "loss_aux_layer_13": 0.0792236328125, "loss_aux_layer_14": 0.08740234375, "loss_aux_layer_15": 0.09521484375, "loss_aux_layer_16": 0.1044921875, "loss_aux_layer_17": 0.112060546875, "loss_aux_layer_18": 0.119873046875, "loss_aux_layer_19": 0.12255859375, "loss_aux_layer_2": 0.04888916015625, "loss_aux_layer_20": 0.130126953125, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.059814453125, "loss_aux_layer_4": 0.0625, "loss_aux_layer_5": 0.06439208984375, "loss_aux_layer_6": 0.06768798828125, "loss_aux_layer_7": 0.06585693359375, "loss_aux_layer_8": 0.065185546875, "loss_aux_layer_9": 0.0638427734375, "step": 3706, "total_loss": 0.674092173576355 }, { "epoch": 0.7339140764205108, "grad_norm": 0.9925827383995056, "learning_rate": 5e-05, "llm_loss": 0.5070574432611465, "loss": 2.3538, "loss_aux_layer_0": 0.012847900390625, "loss_aux_layer_1": 0.0325927734375, "loss_aux_layer_10": 0.0587158203125, "loss_aux_layer_11": 0.062744140625, "loss_aux_layer_12": 0.0670166015625, "loss_aux_layer_13": 0.072509765625, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.089111328125, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.105712890625, "loss_aux_layer_18": 0.1134033203125, "loss_aux_layer_19": 0.11669921875, "loss_aux_layer_2": 0.0447998046875, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.05438232421875, "loss_aux_layer_4": 0.05682373046875, "loss_aux_layer_5": 0.05816650390625, "loss_aux_layer_6": 0.060791015625, "loss_aux_layer_7": 0.05914306640625, "loss_aux_layer_8": 0.05865478515625, "loss_aux_layer_9": 0.0574951171875, "step": 3707, "total_loss": 0.5884462296962738 }, { "epoch": 0.7341120570184122, "grad_norm": 0.8296144604682922, "learning_rate": 5e-05, "llm_loss": 0.5906442552804947, "loss": 2.6991, "loss_aux_layer_0": 0.012451171875, "loss_aux_layer_1": 0.03338623046875, "loss_aux_layer_10": 0.06134033203125, "loss_aux_layer_11": 0.0657958984375, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.0836181640625, "loss_aux_layer_15": 0.091796875, "loss_aux_layer_16": 0.1009521484375, "loss_aux_layer_17": 0.1087646484375, "loss_aux_layer_18": 0.1170654296875, "loss_aux_layer_19": 0.119140625, "loss_aux_layer_2": 0.04632568359375, "loss_aux_layer_20": 0.1268310546875, "loss_aux_layer_21": 0.1351318359375, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.0565185546875, "loss_aux_layer_4": 0.05926513671875, "loss_aux_layer_5": 0.060791015625, "loss_aux_layer_6": 0.06390380859375, "loss_aux_layer_7": 0.0621337890625, "loss_aux_layer_8": 0.06146240234375, "loss_aux_layer_9": 0.06011962890625, "step": 3708, "total_loss": 0.6747667640447617 }, { "epoch": 0.7343100376163136, "grad_norm": 0.9348584413528442, "learning_rate": 5e-05, "llm_loss": 0.6149391382932663, "loss": 2.7687, "loss_aux_layer_0": 0.01324462890625, "loss_aux_layer_1": 0.02960205078125, "loss_aux_layer_10": 0.05413818359375, "loss_aux_layer_11": 0.05767822265625, "loss_aux_layer_12": 0.0616455078125, "loss_aux_layer_13": 0.06689453125, "loss_aux_layer_14": 0.0748291015625, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.092041015625, "loss_aux_layer_17": 0.0997314453125, "loss_aux_layer_18": 0.1085205078125, "loss_aux_layer_19": 0.1131591796875, "loss_aux_layer_2": 0.04071044921875, "loss_aux_layer_20": 0.12158203125, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.18603515625, "loss_aux_layer_3": 0.04931640625, "loss_aux_layer_4": 0.0517578125, "loss_aux_layer_5": 0.05328369140625, "loss_aux_layer_6": 0.05615234375, "loss_aux_layer_7": 0.05438232421875, "loss_aux_layer_8": 0.0538330078125, "loss_aux_layer_9": 0.05279541015625, "step": 3709, "total_loss": 0.6921678781509399 }, { "epoch": 0.734508018214215, "grad_norm": 1.4030722379684448, "learning_rate": 5e-05, "llm_loss": 0.6475756913423538, "loss": 2.9246, "loss_aux_layer_0": 0.013092041015625, "loss_aux_layer_1": 0.03240966796875, "loss_aux_layer_10": 0.0599365234375, "loss_aux_layer_11": 0.06396484375, "loss_aux_layer_12": 0.0687255859375, "loss_aux_layer_13": 0.074462890625, "loss_aux_layer_14": 0.08349609375, "loss_aux_layer_15": 0.092041015625, "loss_aux_layer_16": 0.1015625, "loss_aux_layer_17": 0.109130859375, "loss_aux_layer_18": 0.1170654296875, "loss_aux_layer_19": 0.1202392578125, "loss_aux_layer_2": 0.045166015625, "loss_aux_layer_20": 0.1280517578125, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.05487060546875, "loss_aux_layer_4": 0.05706787109375, "loss_aux_layer_5": 0.05865478515625, "loss_aux_layer_6": 0.061767578125, "loss_aux_layer_7": 0.059814453125, "loss_aux_layer_8": 0.05950927734375, "loss_aux_layer_9": 0.05828857421875, "step": 3710, "total_loss": 0.7311413586139679 }, { "epoch": 0.7347059988121164, "grad_norm": 0.9395480751991272, "learning_rate": 5e-05, "llm_loss": 0.5965902358293533, "loss": 2.7302, "loss_aux_layer_0": 0.012908935546875, "loss_aux_layer_1": 0.03338623046875, "loss_aux_layer_10": 0.063232421875, "loss_aux_layer_11": 0.0673828125, "loss_aux_layer_12": 0.0718994140625, "loss_aux_layer_13": 0.0775146484375, "loss_aux_layer_14": 0.0859375, "loss_aux_layer_15": 0.0943603515625, "loss_aux_layer_16": 0.1038818359375, "loss_aux_layer_17": 0.11181640625, "loss_aux_layer_18": 0.1199951171875, "loss_aux_layer_19": 0.1229248046875, "loss_aux_layer_2": 0.0467529296875, "loss_aux_layer_20": 0.1302490234375, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.05718994140625, "loss_aux_layer_4": 0.05999755859375, "loss_aux_layer_5": 0.06170654296875, "loss_aux_layer_6": 0.06512451171875, "loss_aux_layer_7": 0.0633544921875, "loss_aux_layer_8": 0.06292724609375, "loss_aux_layer_9": 0.06182861328125, "step": 3711, "total_loss": 0.682541623711586 }, { "epoch": 0.7349039794100178, "grad_norm": 1.171903133392334, "learning_rate": 5e-05, "llm_loss": 0.663079023361206, "loss": 2.9894, "loss_aux_layer_0": 0.013580322265625, "loss_aux_layer_1": 0.0335693359375, "loss_aux_layer_10": 0.0621337890625, "loss_aux_layer_11": 0.06640625, "loss_aux_layer_12": 0.07080078125, "loss_aux_layer_13": 0.0762939453125, "loss_aux_layer_14": 0.0845947265625, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.1014404296875, "loss_aux_layer_17": 0.109130859375, "loss_aux_layer_18": 0.116943359375, "loss_aux_layer_19": 0.119140625, "loss_aux_layer_2": 0.04656982421875, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.056640625, "loss_aux_layer_4": 0.0594482421875, "loss_aux_layer_5": 0.06085205078125, "loss_aux_layer_6": 0.0640869140625, "loss_aux_layer_7": 0.0623779296875, "loss_aux_layer_8": 0.06201171875, "loss_aux_layer_9": 0.0609130859375, "step": 3712, "total_loss": 0.7473548501729965 }, { "epoch": 0.7351019600079193, "grad_norm": 0.8430077433586121, "learning_rate": 5e-05, "llm_loss": 0.5995892137289047, "loss": 2.7218, "loss_aux_layer_0": 0.014373779296875, "loss_aux_layer_1": 0.032501220703125, "loss_aux_layer_10": 0.0576171875, "loss_aux_layer_11": 0.06146240234375, "loss_aux_layer_12": 0.06591796875, "loss_aux_layer_13": 0.0709228515625, "loss_aux_layer_14": 0.0792236328125, "loss_aux_layer_15": 0.08740234375, "loss_aux_layer_16": 0.09619140625, "loss_aux_layer_17": 0.103759765625, "loss_aux_layer_18": 0.111572265625, "loss_aux_layer_19": 0.115478515625, "loss_aux_layer_2": 0.04449462890625, "loss_aux_layer_20": 0.1229248046875, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.05340576171875, "loss_aux_layer_4": 0.05572509765625, "loss_aux_layer_5": 0.0572509765625, "loss_aux_layer_6": 0.06036376953125, "loss_aux_layer_7": 0.05816650390625, "loss_aux_layer_8": 0.0577392578125, "loss_aux_layer_9": 0.056396484375, "step": 3713, "total_loss": 0.680447980761528 }, { "epoch": 0.7352999406058206, "grad_norm": 1.3803400993347168, "learning_rate": 5e-05, "llm_loss": 0.5396520495414734, "loss": 2.4941, "loss_aux_layer_0": 0.014801025390625, "loss_aux_layer_1": 0.032470703125, "loss_aux_layer_10": 0.05938720703125, "loss_aux_layer_11": 0.063232421875, "loss_aux_layer_12": 0.06787109375, "loss_aux_layer_13": 0.0738525390625, "loss_aux_layer_14": 0.08251953125, "loss_aux_layer_15": 0.091064453125, "loss_aux_layer_16": 0.100830078125, "loss_aux_layer_17": 0.1092529296875, "loss_aux_layer_18": 0.1185302734375, "loss_aux_layer_19": 0.122802734375, "loss_aux_layer_2": 0.04461669921875, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.13916015625, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.197021484375, "loss_aux_layer_3": 0.053955078125, "loss_aux_layer_4": 0.05633544921875, "loss_aux_layer_5": 0.05810546875, "loss_aux_layer_6": 0.06109619140625, "loss_aux_layer_7": 0.0592041015625, "loss_aux_layer_8": 0.05908203125, "loss_aux_layer_9": 0.05816650390625, "step": 3714, "total_loss": 0.623524621129036 }, { "epoch": 0.735497921203722, "grad_norm": 0.916889488697052, "learning_rate": 5e-05, "llm_loss": 0.473355732858181, "loss": 2.2162, "loss_aux_layer_0": 0.0130462646484375, "loss_aux_layer_1": 0.030792236328125, "loss_aux_layer_10": 0.0576171875, "loss_aux_layer_11": 0.0611572265625, "loss_aux_layer_12": 0.0653076171875, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.07861328125, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.0960693359375, "loss_aux_layer_17": 0.1041259765625, "loss_aux_layer_18": 0.1129150390625, "loss_aux_layer_19": 0.1173095703125, "loss_aux_layer_2": 0.04302978515625, "loss_aux_layer_20": 0.125244140625, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.0523681640625, "loss_aux_layer_4": 0.0548095703125, "loss_aux_layer_5": 0.05645751953125, "loss_aux_layer_6": 0.05938720703125, "loss_aux_layer_7": 0.0576171875, "loss_aux_layer_8": 0.057373046875, "loss_aux_layer_9": 0.05645751953125, "step": 3715, "total_loss": 0.5540501475334167 }, { "epoch": 0.7356959018016235, "grad_norm": 1.1076163053512573, "learning_rate": 5e-05, "llm_loss": 0.6365343332290649, "loss": 2.8793, "loss_aux_layer_0": 0.01519775390625, "loss_aux_layer_1": 0.032501220703125, "loss_aux_layer_10": 0.0601806640625, "loss_aux_layer_11": 0.0640869140625, "loss_aux_layer_12": 0.068603515625, "loss_aux_layer_13": 0.0740966796875, "loss_aux_layer_14": 0.0828857421875, "loss_aux_layer_15": 0.0914306640625, "loss_aux_layer_16": 0.1009521484375, "loss_aux_layer_17": 0.1087646484375, "loss_aux_layer_18": 0.1171875, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.04473876953125, "loss_aux_layer_20": 0.1278076171875, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.190673828125, "loss_aux_layer_3": 0.05438232421875, "loss_aux_layer_4": 0.0567626953125, "loss_aux_layer_5": 0.05841064453125, "loss_aux_layer_6": 0.06146240234375, "loss_aux_layer_7": 0.06011962890625, "loss_aux_layer_8": 0.059814453125, "loss_aux_layer_9": 0.0589599609375, "step": 3716, "total_loss": 0.7198225855827332 }, { "epoch": 0.7358938823995248, "grad_norm": 0.915366530418396, "learning_rate": 5e-05, "llm_loss": 0.5386421382427216, "loss": 2.488, "loss_aux_layer_0": 0.0135345458984375, "loss_aux_layer_1": 0.03326416015625, "loss_aux_layer_10": 0.05963134765625, "loss_aux_layer_11": 0.06378173828125, "loss_aux_layer_12": 0.0679931640625, "loss_aux_layer_13": 0.0731201171875, "loss_aux_layer_14": 0.0816650390625, "loss_aux_layer_15": 0.090087890625, "loss_aux_layer_16": 0.0992431640625, "loss_aux_layer_17": 0.106689453125, "loss_aux_layer_18": 0.1153564453125, "loss_aux_layer_19": 0.119140625, "loss_aux_layer_2": 0.04583740234375, "loss_aux_layer_20": 0.1270751953125, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.197998046875, "loss_aux_layer_3": 0.05523681640625, "loss_aux_layer_4": 0.0574951171875, "loss_aux_layer_5": 0.0589599609375, "loss_aux_layer_6": 0.061767578125, "loss_aux_layer_7": 0.05999755859375, "loss_aux_layer_8": 0.05963134765625, "loss_aux_layer_9": 0.0584716796875, "step": 3717, "total_loss": 0.6220120191574097 }, { "epoch": 0.7360918629974262, "grad_norm": 0.995573878288269, "learning_rate": 5e-05, "llm_loss": 0.5964154303073883, "loss": 2.7128, "loss_aux_layer_0": 0.01507568359375, "loss_aux_layer_1": 0.031585693359375, "loss_aux_layer_10": 0.05712890625, "loss_aux_layer_11": 0.06097412109375, "loss_aux_layer_12": 0.0657958984375, "loss_aux_layer_13": 0.0712890625, "loss_aux_layer_14": 0.0799560546875, "loss_aux_layer_15": 0.0885009765625, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.1060791015625, "loss_aux_layer_18": 0.1148681640625, "loss_aux_layer_19": 0.118896484375, "loss_aux_layer_2": 0.04339599609375, "loss_aux_layer_20": 0.1275634765625, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.052490234375, "loss_aux_layer_4": 0.0546875, "loss_aux_layer_5": 0.05615234375, "loss_aux_layer_6": 0.05914306640625, "loss_aux_layer_7": 0.05731201171875, "loss_aux_layer_8": 0.05694580078125, "loss_aux_layer_9": 0.0556640625, "step": 3718, "total_loss": 0.6781893670558929 }, { "epoch": 0.7362898435953277, "grad_norm": 1.0623573064804077, "learning_rate": 5e-05, "llm_loss": 0.5640464201569557, "loss": 2.5894, "loss_aux_layer_0": 0.0126953125, "loss_aux_layer_1": 0.0323486328125, "loss_aux_layer_10": 0.0595703125, "loss_aux_layer_11": 0.0634765625, "loss_aux_layer_12": 0.06781005859375, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.090576171875, "loss_aux_layer_16": 0.0999755859375, "loss_aux_layer_17": 0.1077880859375, "loss_aux_layer_18": 0.1163330078125, "loss_aux_layer_19": 0.1204833984375, "loss_aux_layer_2": 0.045166015625, "loss_aux_layer_20": 0.1287841796875, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.0546875, "loss_aux_layer_4": 0.0570068359375, "loss_aux_layer_5": 0.058837890625, "loss_aux_layer_6": 0.0616455078125, "loss_aux_layer_7": 0.05975341796875, "loss_aux_layer_8": 0.059326171875, "loss_aux_layer_9": 0.0582275390625, "step": 3719, "total_loss": 0.6473501771688461 }, { "epoch": 0.7364878241932291, "grad_norm": 0.7986767888069153, "learning_rate": 5e-05, "llm_loss": 0.5916586816310883, "loss": 2.7057, "loss_aux_layer_0": 0.0139923095703125, "loss_aux_layer_1": 0.03375244140625, "loss_aux_layer_10": 0.06158447265625, "loss_aux_layer_11": 0.06573486328125, "loss_aux_layer_12": 0.070556640625, "loss_aux_layer_13": 0.0758056640625, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.0931396484375, "loss_aux_layer_16": 0.1026611328125, "loss_aux_layer_17": 0.1102294921875, "loss_aux_layer_18": 0.1177978515625, "loss_aux_layer_19": 0.1204833984375, "loss_aux_layer_2": 0.046142578125, "loss_aux_layer_20": 0.1280517578125, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.193603515625, "loss_aux_layer_3": 0.05609130859375, "loss_aux_layer_4": 0.0587158203125, "loss_aux_layer_5": 0.06060791015625, "loss_aux_layer_6": 0.063720703125, "loss_aux_layer_7": 0.06207275390625, "loss_aux_layer_8": 0.0614013671875, "loss_aux_layer_9": 0.060302734375, "step": 3720, "total_loss": 0.6764133870601654 }, { "epoch": 0.7366858047911304, "grad_norm": 1.1391313076019287, "learning_rate": 5e-05, "llm_loss": 0.504617303609848, "loss": 2.3588, "loss_aux_layer_0": 0.01373291015625, "loss_aux_layer_1": 0.034210205078125, "loss_aux_layer_10": 0.062255859375, "loss_aux_layer_11": 0.066162109375, "loss_aux_layer_12": 0.0706787109375, "loss_aux_layer_13": 0.076171875, "loss_aux_layer_14": 0.0848388671875, "loss_aux_layer_15": 0.0927734375, "loss_aux_layer_16": 0.1019287109375, "loss_aux_layer_17": 0.109619140625, "loss_aux_layer_18": 0.1175537109375, "loss_aux_layer_19": 0.1204833984375, "loss_aux_layer_2": 0.0477294921875, "loss_aux_layer_20": 0.127685546875, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.0576171875, "loss_aux_layer_4": 0.0601806640625, "loss_aux_layer_5": 0.06182861328125, "loss_aux_layer_6": 0.065185546875, "loss_aux_layer_7": 0.06317138671875, "loss_aux_layer_8": 0.06243896484375, "loss_aux_layer_9": 0.06097412109375, "step": 3721, "total_loss": 0.5897016674280167 }, { "epoch": 0.7368837853890319, "grad_norm": 0.8230911493301392, "learning_rate": 5e-05, "llm_loss": 0.6364323049783707, "loss": 2.8752, "loss_aux_layer_0": 0.013580322265625, "loss_aux_layer_1": 0.03277587890625, "loss_aux_layer_10": 0.05914306640625, "loss_aux_layer_11": 0.06304931640625, "loss_aux_layer_12": 0.0672607421875, "loss_aux_layer_13": 0.072509765625, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.0889892578125, "loss_aux_layer_16": 0.09814453125, "loss_aux_layer_17": 0.1064453125, "loss_aux_layer_18": 0.11474609375, "loss_aux_layer_19": 0.1182861328125, "loss_aux_layer_2": 0.04522705078125, "loss_aux_layer_20": 0.1260986328125, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.0546875, "loss_aux_layer_4": 0.05718994140625, "loss_aux_layer_5": 0.0587158203125, "loss_aux_layer_6": 0.06170654296875, "loss_aux_layer_7": 0.05963134765625, "loss_aux_layer_8": 0.05908203125, "loss_aux_layer_9": 0.05780029296875, "step": 3722, "total_loss": 0.718792200088501 }, { "epoch": 0.7370817659869333, "grad_norm": 0.9442510604858398, "learning_rate": 5e-05, "llm_loss": 0.5255751013755798, "loss": 2.4263, "loss_aux_layer_0": 0.01336669921875, "loss_aux_layer_1": 0.031768798828125, "loss_aux_layer_10": 0.0570068359375, "loss_aux_layer_11": 0.06097412109375, "loss_aux_layer_12": 0.0654296875, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.0792236328125, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.0965576171875, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.1131591796875, "loss_aux_layer_19": 0.1168212890625, "loss_aux_layer_2": 0.044189453125, "loss_aux_layer_20": 0.1256103515625, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05352783203125, "loss_aux_layer_4": 0.05584716796875, "loss_aux_layer_5": 0.0570068359375, "loss_aux_layer_6": 0.0594482421875, "loss_aux_layer_7": 0.0574951171875, "loss_aux_layer_8": 0.05718994140625, "loss_aux_layer_9": 0.05584716796875, "step": 3723, "total_loss": 0.6065638437867165 }, { "epoch": 0.7372797465848346, "grad_norm": 0.9239985346794128, "learning_rate": 5e-05, "llm_loss": 0.5622831657528877, "loss": 2.5768, "loss_aux_layer_0": 0.0132904052734375, "loss_aux_layer_1": 0.033447265625, "loss_aux_layer_10": 0.05902099609375, "loss_aux_layer_11": 0.063232421875, "loss_aux_layer_12": 0.0673828125, "loss_aux_layer_13": 0.07275390625, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.0882568359375, "loss_aux_layer_16": 0.09716796875, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.11279296875, "loss_aux_layer_19": 0.115478515625, "loss_aux_layer_2": 0.04638671875, "loss_aux_layer_20": 0.123046875, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.05572509765625, "loss_aux_layer_4": 0.05804443359375, "loss_aux_layer_5": 0.05926513671875, "loss_aux_layer_6": 0.06201171875, "loss_aux_layer_7": 0.06005859375, "loss_aux_layer_8": 0.059326171875, "loss_aux_layer_9": 0.05792236328125, "step": 3724, "total_loss": 0.6441975980997086 }, { "epoch": 0.7374777271827361, "grad_norm": 0.9051831364631653, "learning_rate": 5e-05, "llm_loss": 0.6195688545703888, "loss": 2.8068, "loss_aux_layer_0": 0.0126953125, "loss_aux_layer_1": 0.032684326171875, "loss_aux_layer_10": 0.059326171875, "loss_aux_layer_11": 0.06329345703125, "loss_aux_layer_12": 0.06793212890625, "loss_aux_layer_13": 0.0732421875, "loss_aux_layer_14": 0.0811767578125, "loss_aux_layer_15": 0.089599609375, "loss_aux_layer_16": 0.0986328125, "loss_aux_layer_17": 0.106201171875, "loss_aux_layer_18": 0.11376953125, "loss_aux_layer_19": 0.11669921875, "loss_aux_layer_2": 0.04541015625, "loss_aux_layer_20": 0.124755859375, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.0548095703125, "loss_aux_layer_4": 0.05731201171875, "loss_aux_layer_5": 0.05889892578125, "loss_aux_layer_6": 0.0616455078125, "loss_aux_layer_7": 0.059814453125, "loss_aux_layer_8": 0.059326171875, "loss_aux_layer_9": 0.0579833984375, "step": 3725, "total_loss": 0.701703742146492 }, { "epoch": 0.7376757077806375, "grad_norm": 0.8677030801773071, "learning_rate": 5e-05, "llm_loss": 0.5524729266762733, "loss": 2.545, "loss_aux_layer_0": 0.0130157470703125, "loss_aux_layer_1": 0.032379150390625, "loss_aux_layer_10": 0.06072998046875, "loss_aux_layer_11": 0.06494140625, "loss_aux_layer_12": 0.0693359375, "loss_aux_layer_13": 0.07470703125, "loss_aux_layer_14": 0.0826416015625, "loss_aux_layer_15": 0.0908203125, "loss_aux_layer_16": 0.0997314453125, "loss_aux_layer_17": 0.107666015625, "loss_aux_layer_18": 0.1165771484375, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.0452880859375, "loss_aux_layer_20": 0.1275634765625, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.055419921875, "loss_aux_layer_4": 0.05792236328125, "loss_aux_layer_5": 0.05950927734375, "loss_aux_layer_6": 0.0623779296875, "loss_aux_layer_7": 0.0609130859375, "loss_aux_layer_8": 0.0604248046875, "loss_aux_layer_9": 0.05963134765625, "step": 3726, "total_loss": 0.6362587809562683 }, { "epoch": 0.737873688378539, "grad_norm": 0.7962390184402466, "learning_rate": 5e-05, "llm_loss": 0.4818030744791031, "loss": 2.247, "loss_aux_layer_0": 0.0135498046875, "loss_aux_layer_1": 0.0301513671875, "loss_aux_layer_10": 0.055908203125, "loss_aux_layer_11": 0.0595703125, "loss_aux_layer_12": 0.06390380859375, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.0775146484375, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.095703125, "loss_aux_layer_17": 0.103759765625, "loss_aux_layer_18": 0.112060546875, "loss_aux_layer_19": 0.11669921875, "loss_aux_layer_2": 0.0418701171875, "loss_aux_layer_20": 0.1258544921875, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.156494140625, "loss_aux_layer_23": 0.194580078125, "loss_aux_layer_3": 0.05059814453125, "loss_aux_layer_4": 0.052734375, "loss_aux_layer_5": 0.05438232421875, "loss_aux_layer_6": 0.05731201171875, "loss_aux_layer_7": 0.0555419921875, "loss_aux_layer_8": 0.055419921875, "loss_aux_layer_9": 0.0545654296875, "step": 3727, "total_loss": 0.5617515593767166 }, { "epoch": 0.7380716689764403, "grad_norm": 0.8847212791442871, "learning_rate": 5e-05, "llm_loss": 0.586174875497818, "loss": 2.6652, "loss_aux_layer_0": 0.012420654296875, "loss_aux_layer_1": 0.0318603515625, "loss_aux_layer_10": 0.05731201171875, "loss_aux_layer_11": 0.06103515625, "loss_aux_layer_12": 0.06512451171875, "loss_aux_layer_13": 0.0703125, "loss_aux_layer_14": 0.078369140625, "loss_aux_layer_15": 0.086181640625, "loss_aux_layer_16": 0.0947265625, "loss_aux_layer_17": 0.1025390625, "loss_aux_layer_18": 0.1109619140625, "loss_aux_layer_19": 0.1151123046875, "loss_aux_layer_2": 0.0443115234375, "loss_aux_layer_20": 0.123046875, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.0537109375, "loss_aux_layer_4": 0.05596923828125, "loss_aux_layer_5": 0.05743408203125, "loss_aux_layer_6": 0.05999755859375, "loss_aux_layer_7": 0.05810546875, "loss_aux_layer_8": 0.0577392578125, "loss_aux_layer_9": 0.056396484375, "step": 3728, "total_loss": 0.6663120537996292 }, { "epoch": 0.7382696495743417, "grad_norm": 0.9748876690864563, "learning_rate": 5e-05, "llm_loss": 0.6196929216384888, "loss": 2.8143, "loss_aux_layer_0": 0.0140533447265625, "loss_aux_layer_1": 0.03399658203125, "loss_aux_layer_10": 0.06085205078125, "loss_aux_layer_11": 0.06512451171875, "loss_aux_layer_12": 0.0694580078125, "loss_aux_layer_13": 0.07470703125, "loss_aux_layer_14": 0.0833740234375, "loss_aux_layer_15": 0.0921630859375, "loss_aux_layer_16": 0.101318359375, "loss_aux_layer_17": 0.109130859375, "loss_aux_layer_18": 0.116943359375, "loss_aux_layer_19": 0.1192626953125, "loss_aux_layer_2": 0.046630859375, "loss_aux_layer_20": 0.1273193359375, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.056396484375, "loss_aux_layer_4": 0.05889892578125, "loss_aux_layer_5": 0.060302734375, "loss_aux_layer_6": 0.06341552734375, "loss_aux_layer_7": 0.06158447265625, "loss_aux_layer_8": 0.0609130859375, "loss_aux_layer_9": 0.05963134765625, "step": 3729, "total_loss": 0.7035757452249527 }, { "epoch": 0.7384676301722432, "grad_norm": 0.7893604636192322, "learning_rate": 5e-05, "llm_loss": 0.5808491855859756, "loss": 2.6412, "loss_aux_layer_0": 0.0128631591796875, "loss_aux_layer_1": 0.0302734375, "loss_aux_layer_10": 0.0556640625, "loss_aux_layer_11": 0.05926513671875, "loss_aux_layer_12": 0.06329345703125, "loss_aux_layer_13": 0.06817626953125, "loss_aux_layer_14": 0.076904296875, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.1029052734375, "loss_aux_layer_18": 0.111572265625, "loss_aux_layer_19": 0.11572265625, "loss_aux_layer_2": 0.04217529296875, "loss_aux_layer_20": 0.1243896484375, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.05120849609375, "loss_aux_layer_4": 0.05364990234375, "loss_aux_layer_5": 0.05535888671875, "loss_aux_layer_6": 0.0582275390625, "loss_aux_layer_7": 0.056396484375, "loss_aux_layer_8": 0.05584716796875, "loss_aux_layer_9": 0.0545654296875, "step": 3730, "total_loss": 0.6603038460016251 }, { "epoch": 0.7386656107701445, "grad_norm": 0.8869984149932861, "learning_rate": 5e-05, "llm_loss": 0.4826349541544914, "loss": 2.2644, "loss_aux_layer_0": 0.0140380859375, "loss_aux_layer_1": 0.033599853515625, "loss_aux_layer_10": 0.06036376953125, "loss_aux_layer_11": 0.0643310546875, "loss_aux_layer_12": 0.0687255859375, "loss_aux_layer_13": 0.0738525390625, "loss_aux_layer_14": 0.08203125, "loss_aux_layer_15": 0.090576171875, "loss_aux_layer_16": 0.099609375, "loss_aux_layer_17": 0.107177734375, "loss_aux_layer_18": 0.114990234375, "loss_aux_layer_19": 0.118408203125, "loss_aux_layer_2": 0.0465087890625, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.05609130859375, "loss_aux_layer_4": 0.05853271484375, "loss_aux_layer_5": 0.05999755859375, "loss_aux_layer_6": 0.062744140625, "loss_aux_layer_7": 0.0609130859375, "loss_aux_layer_8": 0.06048583984375, "loss_aux_layer_9": 0.05908203125, "step": 3731, "total_loss": 0.5660998970270157 }, { "epoch": 0.7388635913680459, "grad_norm": 0.7608213424682617, "learning_rate": 5e-05, "llm_loss": 0.5813404619693756, "loss": 2.6529, "loss_aux_layer_0": 0.0131683349609375, "loss_aux_layer_1": 0.031829833984375, "loss_aux_layer_10": 0.0587158203125, "loss_aux_layer_11": 0.06268310546875, "loss_aux_layer_12": 0.0667724609375, "loss_aux_layer_13": 0.072021484375, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.0885009765625, "loss_aux_layer_16": 0.09716796875, "loss_aux_layer_17": 0.104736328125, "loss_aux_layer_18": 0.1131591796875, "loss_aux_layer_19": 0.1170654296875, "loss_aux_layer_2": 0.04461669921875, "loss_aux_layer_20": 0.125244140625, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.0543212890625, "loss_aux_layer_4": 0.0567626953125, "loss_aux_layer_5": 0.0584716796875, "loss_aux_layer_6": 0.0614013671875, "loss_aux_layer_7": 0.05908203125, "loss_aux_layer_8": 0.05853271484375, "loss_aux_layer_9": 0.05743408203125, "step": 3732, "total_loss": 0.6632157862186432 }, { "epoch": 0.7390615719659474, "grad_norm": 0.8885248899459839, "learning_rate": 5e-05, "llm_loss": 0.4847101867198944, "loss": 2.2608, "loss_aux_layer_0": 0.0143280029296875, "loss_aux_layer_1": 0.032684326171875, "loss_aux_layer_10": 0.0577392578125, "loss_aux_layer_11": 0.0615234375, "loss_aux_layer_12": 0.06585693359375, "loss_aux_layer_13": 0.071044921875, "loss_aux_layer_14": 0.0792236328125, "loss_aux_layer_15": 0.08740234375, "loss_aux_layer_16": 0.096435546875, "loss_aux_layer_17": 0.1038818359375, "loss_aux_layer_18": 0.1116943359375, "loss_aux_layer_19": 0.11474609375, "loss_aux_layer_2": 0.0450439453125, "loss_aux_layer_20": 0.122314453125, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.18603515625, "loss_aux_layer_3": 0.05419921875, "loss_aux_layer_4": 0.05609130859375, "loss_aux_layer_5": 0.05780029296875, "loss_aux_layer_6": 0.06048583984375, "loss_aux_layer_7": 0.0584716796875, "loss_aux_layer_8": 0.057861328125, "loss_aux_layer_9": 0.0565185546875, "step": 3733, "total_loss": 0.5651974529027939 }, { "epoch": 0.7392595525638488, "grad_norm": 0.9280556440353394, "learning_rate": 5e-05, "llm_loss": 0.5968195647001266, "loss": 2.713, "loss_aux_layer_0": 0.0127410888671875, "loss_aux_layer_1": 0.031585693359375, "loss_aux_layer_10": 0.0584716796875, "loss_aux_layer_11": 0.06256103515625, "loss_aux_layer_12": 0.06695556640625, "loss_aux_layer_13": 0.072265625, "loss_aux_layer_14": 0.08056640625, "loss_aux_layer_15": 0.0889892578125, "loss_aux_layer_16": 0.0985107421875, "loss_aux_layer_17": 0.1064453125, "loss_aux_layer_18": 0.1143798828125, "loss_aux_layer_19": 0.1173095703125, "loss_aux_layer_2": 0.04376220703125, "loss_aux_layer_20": 0.1253662109375, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.0533447265625, "loss_aux_layer_4": 0.05596923828125, "loss_aux_layer_5": 0.05731201171875, "loss_aux_layer_6": 0.060546875, "loss_aux_layer_7": 0.05889892578125, "loss_aux_layer_8": 0.0584716796875, "loss_aux_layer_9": 0.05712890625, "step": 3734, "total_loss": 0.6782404035329819 }, { "epoch": 0.7394575331617501, "grad_norm": 0.8644648194313049, "learning_rate": 5e-05, "llm_loss": 0.6205252110958099, "loss": 2.8098, "loss_aux_layer_0": 0.01348876953125, "loss_aux_layer_1": 0.03277587890625, "loss_aux_layer_10": 0.0596923828125, "loss_aux_layer_11": 0.063720703125, "loss_aux_layer_12": 0.06787109375, "loss_aux_layer_13": 0.072998046875, "loss_aux_layer_14": 0.0809326171875, "loss_aux_layer_15": 0.0889892578125, "loss_aux_layer_16": 0.097900390625, "loss_aux_layer_17": 0.1053466796875, "loss_aux_layer_18": 0.1131591796875, "loss_aux_layer_19": 0.116455078125, "loss_aux_layer_2": 0.0450439453125, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.05462646484375, "loss_aux_layer_4": 0.0572509765625, "loss_aux_layer_5": 0.05877685546875, "loss_aux_layer_6": 0.06182861328125, "loss_aux_layer_7": 0.06011962890625, "loss_aux_layer_8": 0.05963134765625, "loss_aux_layer_9": 0.0584716796875, "step": 3735, "total_loss": 0.702448159456253 }, { "epoch": 0.7396555137596516, "grad_norm": 0.977913498878479, "learning_rate": 5e-05, "llm_loss": 0.5590047389268875, "loss": 2.5635, "loss_aux_layer_0": 0.013397216796875, "loss_aux_layer_1": 0.03271484375, "loss_aux_layer_10": 0.05908203125, "loss_aux_layer_11": 0.0631103515625, "loss_aux_layer_12": 0.0675048828125, "loss_aux_layer_13": 0.0731201171875, "loss_aux_layer_14": 0.08154296875, "loss_aux_layer_15": 0.089111328125, "loss_aux_layer_16": 0.0986328125, "loss_aux_layer_17": 0.1063232421875, "loss_aux_layer_18": 0.114501953125, "loss_aux_layer_19": 0.117431640625, "loss_aux_layer_2": 0.044921875, "loss_aux_layer_20": 0.1253662109375, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.053955078125, "loss_aux_layer_4": 0.05657958984375, "loss_aux_layer_5": 0.05828857421875, "loss_aux_layer_6": 0.06121826171875, "loss_aux_layer_7": 0.05938720703125, "loss_aux_layer_8": 0.0589599609375, "loss_aux_layer_9": 0.05792236328125, "step": 3736, "total_loss": 0.6408807635307312 }, { "epoch": 0.739853494357553, "grad_norm": 0.8262501358985901, "learning_rate": 5e-05, "llm_loss": 0.5939592719078064, "loss": 2.7145, "loss_aux_layer_0": 0.012451171875, "loss_aux_layer_1": 0.03314208984375, "loss_aux_layer_10": 0.06121826171875, "loss_aux_layer_11": 0.0653076171875, "loss_aux_layer_12": 0.0697021484375, "loss_aux_layer_13": 0.0753173828125, "loss_aux_layer_14": 0.0838623046875, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.1019287109375, "loss_aux_layer_17": 0.109619140625, "loss_aux_layer_18": 0.11767578125, "loss_aux_layer_19": 0.1212158203125, "loss_aux_layer_2": 0.0462646484375, "loss_aux_layer_20": 0.129150390625, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.05596923828125, "loss_aux_layer_4": 0.05853271484375, "loss_aux_layer_5": 0.05999755859375, "loss_aux_layer_6": 0.06292724609375, "loss_aux_layer_7": 0.061279296875, "loss_aux_layer_8": 0.06103515625, "loss_aux_layer_9": 0.05975341796875, "step": 3737, "total_loss": 0.6786341965198517 }, { "epoch": 0.7400514749554543, "grad_norm": 0.8226810097694397, "learning_rate": 5e-05, "llm_loss": 0.5297787562012672, "loss": 2.4468, "loss_aux_layer_0": 0.012847900390625, "loss_aux_layer_1": 0.03192138671875, "loss_aux_layer_10": 0.05859375, "loss_aux_layer_11": 0.06243896484375, "loss_aux_layer_12": 0.06707763671875, "loss_aux_layer_13": 0.072509765625, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.088623046875, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.1058349609375, "loss_aux_layer_18": 0.1138916015625, "loss_aux_layer_19": 0.1175537109375, "loss_aux_layer_2": 0.04461669921875, "loss_aux_layer_20": 0.12548828125, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.05389404296875, "loss_aux_layer_4": 0.056396484375, "loss_aux_layer_5": 0.05780029296875, "loss_aux_layer_6": 0.06060791015625, "loss_aux_layer_7": 0.05908203125, "loss_aux_layer_8": 0.058837890625, "loss_aux_layer_9": 0.05731201171875, "step": 3738, "total_loss": 0.611712247133255 }, { "epoch": 0.7402494555533558, "grad_norm": 0.9552282691001892, "learning_rate": 5e-05, "llm_loss": 0.5774840712547302, "loss": 2.6388, "loss_aux_layer_0": 0.0127105712890625, "loss_aux_layer_1": 0.031829833984375, "loss_aux_layer_10": 0.0587158203125, "loss_aux_layer_11": 0.06268310546875, "loss_aux_layer_12": 0.0675048828125, "loss_aux_layer_13": 0.072998046875, "loss_aux_layer_14": 0.0816650390625, "loss_aux_layer_15": 0.0897216796875, "loss_aux_layer_16": 0.098876953125, "loss_aux_layer_17": 0.106689453125, "loss_aux_layer_18": 0.114990234375, "loss_aux_layer_19": 0.1181640625, "loss_aux_layer_2": 0.04498291015625, "loss_aux_layer_20": 0.1258544921875, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.054443359375, "loss_aux_layer_4": 0.05682373046875, "loss_aux_layer_5": 0.05816650390625, "loss_aux_layer_6": 0.06109619140625, "loss_aux_layer_7": 0.059326171875, "loss_aux_layer_8": 0.0587158203125, "loss_aux_layer_9": 0.057373046875, "step": 3739, "total_loss": 0.659711942076683 }, { "epoch": 0.7404474361512572, "grad_norm": 0.9389923214912415, "learning_rate": 5e-05, "llm_loss": 0.5371912717819214, "loss": 2.4808, "loss_aux_layer_0": 0.0135040283203125, "loss_aux_layer_1": 0.03302001953125, "loss_aux_layer_10": 0.059814453125, "loss_aux_layer_11": 0.06365966796875, "loss_aux_layer_12": 0.06817626953125, "loss_aux_layer_13": 0.0736083984375, "loss_aux_layer_14": 0.081787109375, "loss_aux_layer_15": 0.0902099609375, "loss_aux_layer_16": 0.09912109375, "loss_aux_layer_17": 0.1070556640625, "loss_aux_layer_18": 0.1153564453125, "loss_aux_layer_19": 0.118896484375, "loss_aux_layer_2": 0.045654296875, "loss_aux_layer_20": 0.126953125, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.191162109375, "loss_aux_layer_3": 0.055419921875, "loss_aux_layer_4": 0.057861328125, "loss_aux_layer_5": 0.059326171875, "loss_aux_layer_6": 0.06231689453125, "loss_aux_layer_7": 0.060546875, "loss_aux_layer_8": 0.0599365234375, "loss_aux_layer_9": 0.05877685546875, "step": 3740, "total_loss": 0.6201934367418289 }, { "epoch": 0.7406454167491586, "grad_norm": 0.9878548383712769, "learning_rate": 5e-05, "llm_loss": 0.5807359218597412, "loss": 2.6535, "loss_aux_layer_0": 0.01300048828125, "loss_aux_layer_1": 0.0318603515625, "loss_aux_layer_10": 0.0595703125, "loss_aux_layer_11": 0.063720703125, "loss_aux_layer_12": 0.068115234375, "loss_aux_layer_13": 0.0736083984375, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.090576171875, "loss_aux_layer_16": 0.0999755859375, "loss_aux_layer_17": 0.1080322265625, "loss_aux_layer_18": 0.1160888671875, "loss_aux_layer_19": 0.1190185546875, "loss_aux_layer_2": 0.0447998046875, "loss_aux_layer_20": 0.12646484375, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.05438232421875, "loss_aux_layer_4": 0.0567626953125, "loss_aux_layer_5": 0.05853271484375, "loss_aux_layer_6": 0.06158447265625, "loss_aux_layer_7": 0.05963134765625, "loss_aux_layer_8": 0.05926513671875, "loss_aux_layer_9": 0.05816650390625, "step": 3741, "total_loss": 0.663372203707695 }, { "epoch": 0.74084339734706, "grad_norm": 1.1208925247192383, "learning_rate": 5e-05, "llm_loss": 0.5527607724070549, "loss": 2.5405, "loss_aux_layer_0": 0.01287841796875, "loss_aux_layer_1": 0.032073974609375, "loss_aux_layer_10": 0.058349609375, "loss_aux_layer_11": 0.0623779296875, "loss_aux_layer_12": 0.0670166015625, "loss_aux_layer_13": 0.07275390625, "loss_aux_layer_14": 0.0816650390625, "loss_aux_layer_15": 0.0902099609375, "loss_aux_layer_16": 0.0999755859375, "loss_aux_layer_17": 0.1077880859375, "loss_aux_layer_18": 0.1158447265625, "loss_aux_layer_19": 0.119140625, "loss_aux_layer_2": 0.0450439453125, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.05426025390625, "loss_aux_layer_4": 0.056884765625, "loss_aux_layer_5": 0.05804443359375, "loss_aux_layer_6": 0.060791015625, "loss_aux_layer_7": 0.05908203125, "loss_aux_layer_8": 0.05828857421875, "loss_aux_layer_9": 0.05694580078125, "step": 3742, "total_loss": 0.6351358741521835 }, { "epoch": 0.7410413779449614, "grad_norm": 0.9498941898345947, "learning_rate": 5e-05, "llm_loss": 0.5693916007876396, "loss": 2.5932, "loss_aux_layer_0": 0.0137176513671875, "loss_aux_layer_1": 0.031646728515625, "loss_aux_layer_10": 0.056884765625, "loss_aux_layer_11": 0.06048583984375, "loss_aux_layer_12": 0.06475830078125, "loss_aux_layer_13": 0.0699462890625, "loss_aux_layer_14": 0.0780029296875, "loss_aux_layer_15": 0.0855712890625, "loss_aux_layer_16": 0.093994140625, "loss_aux_layer_17": 0.1015625, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.04351806640625, "loss_aux_layer_20": 0.1201171875, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.181396484375, "loss_aux_layer_3": 0.052734375, "loss_aux_layer_4": 0.05535888671875, "loss_aux_layer_5": 0.0570068359375, "loss_aux_layer_6": 0.05950927734375, "loss_aux_layer_7": 0.057861328125, "loss_aux_layer_8": 0.05731201171875, "loss_aux_layer_9": 0.05596923828125, "step": 3743, "total_loss": 0.6483079642057419 }, { "epoch": 0.7412393585428628, "grad_norm": 1.0613223314285278, "learning_rate": 5e-05, "llm_loss": 0.555767148733139, "loss": 2.549, "loss_aux_layer_0": 0.0136871337890625, "loss_aux_layer_1": 0.0328369140625, "loss_aux_layer_10": 0.0587158203125, "loss_aux_layer_11": 0.06256103515625, "loss_aux_layer_12": 0.0670166015625, "loss_aux_layer_13": 0.07275390625, "loss_aux_layer_14": 0.0811767578125, "loss_aux_layer_15": 0.0892333984375, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.10546875, "loss_aux_layer_18": 0.113037109375, "loss_aux_layer_19": 0.115966796875, "loss_aux_layer_2": 0.0452880859375, "loss_aux_layer_20": 0.12353515625, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.0546875, "loss_aux_layer_4": 0.0572509765625, "loss_aux_layer_5": 0.05859375, "loss_aux_layer_6": 0.0615234375, "loss_aux_layer_7": 0.0595703125, "loss_aux_layer_8": 0.058837890625, "loss_aux_layer_9": 0.05755615234375, "step": 3744, "total_loss": 0.6372579038143158 }, { "epoch": 0.7414373391407642, "grad_norm": 0.8761062026023865, "learning_rate": 5e-05, "llm_loss": 0.5802011936903, "loss": 2.6518, "loss_aux_layer_0": 0.013458251953125, "loss_aux_layer_1": 0.03253173828125, "loss_aux_layer_10": 0.05926513671875, "loss_aux_layer_11": 0.0633544921875, "loss_aux_layer_12": 0.0677490234375, "loss_aux_layer_13": 0.0732421875, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.090087890625, "loss_aux_layer_16": 0.0994873046875, "loss_aux_layer_17": 0.107177734375, "loss_aux_layer_18": 0.115234375, "loss_aux_layer_19": 0.11865234375, "loss_aux_layer_2": 0.04534912109375, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.0546875, "loss_aux_layer_4": 0.05718994140625, "loss_aux_layer_5": 0.05865478515625, "loss_aux_layer_6": 0.06146240234375, "loss_aux_layer_7": 0.05963134765625, "loss_aux_layer_8": 0.05914306640625, "loss_aux_layer_9": 0.05792236328125, "step": 3745, "total_loss": 0.6629586815834045 }, { "epoch": 0.7416353197386656, "grad_norm": 1.1713144779205322, "learning_rate": 5e-05, "llm_loss": 0.5565399080514908, "loss": 2.5591, "loss_aux_layer_0": 0.0130615234375, "loss_aux_layer_1": 0.0322265625, "loss_aux_layer_10": 0.0587158203125, "loss_aux_layer_11": 0.06317138671875, "loss_aux_layer_12": 0.06768798828125, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.0823974609375, "loss_aux_layer_15": 0.0911865234375, "loss_aux_layer_16": 0.1009521484375, "loss_aux_layer_17": 0.1083984375, "loss_aux_layer_18": 0.116943359375, "loss_aux_layer_19": 0.120361328125, "loss_aux_layer_2": 0.04559326171875, "loss_aux_layer_20": 0.1285400390625, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.05450439453125, "loss_aux_layer_4": 0.056396484375, "loss_aux_layer_5": 0.05780029296875, "loss_aux_layer_6": 0.06060791015625, "loss_aux_layer_7": 0.0589599609375, "loss_aux_layer_8": 0.05853271484375, "loss_aux_layer_9": 0.057373046875, "step": 3746, "total_loss": 0.6397652179002762 }, { "epoch": 0.741833300336567, "grad_norm": 1.0409348011016846, "learning_rate": 5e-05, "llm_loss": 0.6461214274168015, "loss": 2.892, "loss_aux_layer_0": 0.0142974853515625, "loss_aux_layer_1": 0.03009033203125, "loss_aux_layer_10": 0.05364990234375, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.0614013671875, "loss_aux_layer_13": 0.066650390625, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.0830078125, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.0997314453125, "loss_aux_layer_18": 0.1083984375, "loss_aux_layer_19": 0.1121826171875, "loss_aux_layer_2": 0.04052734375, "loss_aux_layer_20": 0.1207275390625, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.04925537109375, "loss_aux_layer_4": 0.05145263671875, "loss_aux_layer_5": 0.0528564453125, "loss_aux_layer_6": 0.055419921875, "loss_aux_layer_7": 0.05377197265625, "loss_aux_layer_8": 0.05328369140625, "loss_aux_layer_9": 0.05242919921875, "step": 3747, "total_loss": 0.7230043560266495 }, { "epoch": 0.7420312809344685, "grad_norm": 1.2173463106155396, "learning_rate": 5e-05, "llm_loss": 0.5464421436190605, "loss": 2.5238, "loss_aux_layer_0": 0.0127716064453125, "loss_aux_layer_1": 0.0323486328125, "loss_aux_layer_10": 0.060791015625, "loss_aux_layer_11": 0.064697265625, "loss_aux_layer_12": 0.0693359375, "loss_aux_layer_13": 0.0750732421875, "loss_aux_layer_14": 0.083984375, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.1019287109375, "loss_aux_layer_17": 0.1099853515625, "loss_aux_layer_18": 0.1181640625, "loss_aux_layer_19": 0.1214599609375, "loss_aux_layer_2": 0.04608154296875, "loss_aux_layer_20": 0.1292724609375, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.055908203125, "loss_aux_layer_4": 0.05853271484375, "loss_aux_layer_5": 0.06011962890625, "loss_aux_layer_6": 0.0631103515625, "loss_aux_layer_7": 0.06097412109375, "loss_aux_layer_8": 0.06060791015625, "loss_aux_layer_9": 0.05950927734375, "step": 3748, "total_loss": 0.6309607625007629 }, { "epoch": 0.7422292615323698, "grad_norm": 0.9966358542442322, "learning_rate": 5e-05, "llm_loss": 0.6343951672315598, "loss": 2.865, "loss_aux_layer_0": 0.0138397216796875, "loss_aux_layer_1": 0.0323486328125, "loss_aux_layer_10": 0.05889892578125, "loss_aux_layer_11": 0.06280517578125, "loss_aux_layer_12": 0.0670166015625, "loss_aux_layer_13": 0.0726318359375, "loss_aux_layer_14": 0.0810546875, "loss_aux_layer_15": 0.0892333984375, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.10546875, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.116943359375, "loss_aux_layer_2": 0.04486083984375, "loss_aux_layer_20": 0.125, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.0546875, "loss_aux_layer_4": 0.0570068359375, "loss_aux_layer_5": 0.0584716796875, "loss_aux_layer_6": 0.0615234375, "loss_aux_layer_7": 0.05963134765625, "loss_aux_layer_8": 0.058837890625, "loss_aux_layer_9": 0.05755615234375, "step": 3749, "total_loss": 0.7162421941757202 }, { "epoch": 0.7424272421302712, "grad_norm": 1.1079596281051636, "learning_rate": 5e-05, "llm_loss": 0.5726174265146255, "loss": 2.608, "loss_aux_layer_0": 0.01312255859375, "loss_aux_layer_1": 0.029022216796875, "loss_aux_layer_10": 0.054931640625, "loss_aux_layer_11": 0.05859375, "loss_aux_layer_12": 0.0631103515625, "loss_aux_layer_13": 0.06866455078125, "loss_aux_layer_14": 0.0770263671875, "loss_aux_layer_15": 0.086181640625, "loss_aux_layer_16": 0.095947265625, "loss_aux_layer_17": 0.104248046875, "loss_aux_layer_18": 0.1129150390625, "loss_aux_layer_19": 0.1171875, "loss_aux_layer_2": 0.04168701171875, "loss_aux_layer_20": 0.12548828125, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.0501708984375, "loss_aux_layer_4": 0.05242919921875, "loss_aux_layer_5": 0.053955078125, "loss_aux_layer_6": 0.0565185546875, "loss_aux_layer_7": 0.05474853515625, "loss_aux_layer_8": 0.0543212890625, "loss_aux_layer_9": 0.05364990234375, "step": 3750, "total_loss": 0.6519899964332581 }, { "epoch": 0.7426252227281727, "grad_norm": 0.986789345741272, "learning_rate": 5e-05, "llm_loss": 0.5946111679077148, "loss": 2.7173, "loss_aux_layer_0": 0.013916015625, "loss_aux_layer_1": 0.032562255859375, "loss_aux_layer_10": 0.06109619140625, "loss_aux_layer_11": 0.0650634765625, "loss_aux_layer_12": 0.06982421875, "loss_aux_layer_13": 0.0753173828125, "loss_aux_layer_14": 0.083740234375, "loss_aux_layer_15": 0.0921630859375, "loss_aux_layer_16": 0.101806640625, "loss_aux_layer_17": 0.1097412109375, "loss_aux_layer_18": 0.117919921875, "loss_aux_layer_19": 0.1217041015625, "loss_aux_layer_2": 0.046142578125, "loss_aux_layer_20": 0.1295166015625, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.05633544921875, "loss_aux_layer_4": 0.058837890625, "loss_aux_layer_5": 0.06060791015625, "loss_aux_layer_6": 0.0635986328125, "loss_aux_layer_7": 0.06170654296875, "loss_aux_layer_8": 0.0611572265625, "loss_aux_layer_9": 0.0599365234375, "step": 3751, "total_loss": 0.6793127059936523 }, { "epoch": 0.7428232033260741, "grad_norm": 1.0074630975723267, "learning_rate": 5e-05, "llm_loss": 0.546827644109726, "loss": 2.5142, "loss_aux_layer_0": 0.01446533203125, "loss_aux_layer_1": 0.032379150390625, "loss_aux_layer_10": 0.0582275390625, "loss_aux_layer_11": 0.062255859375, "loss_aux_layer_12": 0.06689453125, "loss_aux_layer_13": 0.072021484375, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.089599609375, "loss_aux_layer_16": 0.0989990234375, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.1143798828125, "loss_aux_layer_19": 0.1170654296875, "loss_aux_layer_2": 0.045166015625, "loss_aux_layer_20": 0.125, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.054443359375, "loss_aux_layer_4": 0.0567626953125, "loss_aux_layer_5": 0.05804443359375, "loss_aux_layer_6": 0.060791015625, "loss_aux_layer_7": 0.05889892578125, "loss_aux_layer_8": 0.05816650390625, "loss_aux_layer_9": 0.05694580078125, "step": 3752, "total_loss": 0.6285613626241684 }, { "epoch": 0.7430211839239754, "grad_norm": 1.0860949754714966, "learning_rate": 5e-05, "llm_loss": 0.5567034780979156, "loss": 2.5441, "loss_aux_layer_0": 0.0124053955078125, "loss_aux_layer_1": 0.030487060546875, "loss_aux_layer_10": 0.05621337890625, "loss_aux_layer_11": 0.06011962890625, "loss_aux_layer_12": 0.0643310546875, "loss_aux_layer_13": 0.06982421875, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.095703125, "loss_aux_layer_17": 0.103759765625, "loss_aux_layer_18": 0.112060546875, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.04168701171875, "loss_aux_layer_20": 0.124267578125, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.05078125, "loss_aux_layer_4": 0.05340576171875, "loss_aux_layer_5": 0.0548095703125, "loss_aux_layer_6": 0.05780029296875, "loss_aux_layer_7": 0.05609130859375, "loss_aux_layer_8": 0.05560302734375, "loss_aux_layer_9": 0.05462646484375, "step": 3753, "total_loss": 0.6360252946615219 }, { "epoch": 0.7432191645218769, "grad_norm": 1.1946109533309937, "learning_rate": 5e-05, "llm_loss": 0.5102424025535583, "loss": 2.3514, "loss_aux_layer_0": 0.015625, "loss_aux_layer_1": 0.02923583984375, "loss_aux_layer_10": 0.05364990234375, "loss_aux_layer_11": 0.057373046875, "loss_aux_layer_12": 0.0614013671875, "loss_aux_layer_13": 0.06683349609375, "loss_aux_layer_14": 0.0753173828125, "loss_aux_layer_15": 0.083984375, "loss_aux_layer_16": 0.09375, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.1102294921875, "loss_aux_layer_19": 0.114501953125, "loss_aux_layer_2": 0.04107666015625, "loss_aux_layer_20": 0.122802734375, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.04949951171875, "loss_aux_layer_4": 0.051513671875, "loss_aux_layer_5": 0.052978515625, "loss_aux_layer_6": 0.05548095703125, "loss_aux_layer_7": 0.0537109375, "loss_aux_layer_8": 0.0533447265625, "loss_aux_layer_9": 0.0523681640625, "step": 3754, "total_loss": 0.5878517925739288 }, { "epoch": 0.7434171451197783, "grad_norm": 0.9997740387916565, "learning_rate": 5e-05, "llm_loss": 0.5336471050977707, "loss": 2.4575, "loss_aux_layer_0": 0.0128326416015625, "loss_aux_layer_1": 0.03076171875, "loss_aux_layer_10": 0.05706787109375, "loss_aux_layer_11": 0.06109619140625, "loss_aux_layer_12": 0.065673828125, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.079833984375, "loss_aux_layer_15": 0.0885009765625, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.105712890625, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.117431640625, "loss_aux_layer_2": 0.04302978515625, "loss_aux_layer_20": 0.1251220703125, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.052734375, "loss_aux_layer_4": 0.0548095703125, "loss_aux_layer_5": 0.0560302734375, "loss_aux_layer_6": 0.0587158203125, "loss_aux_layer_7": 0.05706787109375, "loss_aux_layer_8": 0.05670166015625, "loss_aux_layer_9": 0.0557861328125, "step": 3755, "total_loss": 0.6143770813941956 }, { "epoch": 0.7436151257176796, "grad_norm": 0.9404318332672119, "learning_rate": 5e-05, "llm_loss": 0.5908539891242981, "loss": 2.7016, "loss_aux_layer_0": 0.0142364501953125, "loss_aux_layer_1": 0.03369140625, "loss_aux_layer_10": 0.06134033203125, "loss_aux_layer_11": 0.06536865234375, "loss_aux_layer_12": 0.06982421875, "loss_aux_layer_13": 0.0753173828125, "loss_aux_layer_14": 0.0841064453125, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.10205078125, "loss_aux_layer_17": 0.1094970703125, "loss_aux_layer_18": 0.1175537109375, "loss_aux_layer_19": 0.1202392578125, "loss_aux_layer_2": 0.04693603515625, "loss_aux_layer_20": 0.1278076171875, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.0570068359375, "loss_aux_layer_4": 0.05938720703125, "loss_aux_layer_5": 0.0609130859375, "loss_aux_layer_6": 0.06365966796875, "loss_aux_layer_7": 0.06201171875, "loss_aux_layer_8": 0.0614013671875, "loss_aux_layer_9": 0.06024169921875, "step": 3756, "total_loss": 0.6754064187407494 }, { "epoch": 0.7438131063155811, "grad_norm": 0.9295498132705688, "learning_rate": 5e-05, "llm_loss": 0.5971686094999313, "loss": 2.71, "loss_aux_layer_0": 0.0133056640625, "loss_aux_layer_1": 0.03143310546875, "loss_aux_layer_10": 0.0570068359375, "loss_aux_layer_11": 0.06097412109375, "loss_aux_layer_12": 0.06561279296875, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.079833984375, "loss_aux_layer_15": 0.0882568359375, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.1163330078125, "loss_aux_layer_2": 0.0433349609375, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.05255126953125, "loss_aux_layer_4": 0.054931640625, "loss_aux_layer_5": 0.05615234375, "loss_aux_layer_6": 0.05908203125, "loss_aux_layer_7": 0.05718994140625, "loss_aux_layer_8": 0.0565185546875, "loss_aux_layer_9": 0.05560302734375, "step": 3757, "total_loss": 0.677497461438179 }, { "epoch": 0.7440110869134825, "grad_norm": 0.9364544153213501, "learning_rate": 5e-05, "llm_loss": 0.5789584815502167, "loss": 2.6435, "loss_aux_layer_0": 0.013763427734375, "loss_aux_layer_1": 0.03240966796875, "loss_aux_layer_10": 0.05853271484375, "loss_aux_layer_11": 0.06280517578125, "loss_aux_layer_12": 0.06719970703125, "loss_aux_layer_13": 0.072509765625, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.0887451171875, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.1058349609375, "loss_aux_layer_18": 0.1143798828125, "loss_aux_layer_19": 0.11767578125, "loss_aux_layer_2": 0.04412841796875, "loss_aux_layer_20": 0.1258544921875, "loss_aux_layer_21": 0.1339111328125, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.05377197265625, "loss_aux_layer_4": 0.05609130859375, "loss_aux_layer_5": 0.0574951171875, "loss_aux_layer_6": 0.0604248046875, "loss_aux_layer_7": 0.0587158203125, "loss_aux_layer_8": 0.05828857421875, "loss_aux_layer_9": 0.05706787109375, "step": 3758, "total_loss": 0.6608853042125702 }, { "epoch": 0.7442090675113839, "grad_norm": 0.7751473784446716, "learning_rate": 5e-05, "llm_loss": 0.6226330325007439, "loss": 2.815, "loss_aux_layer_0": 0.01220703125, "loss_aux_layer_1": 0.031280517578125, "loss_aux_layer_10": 0.057861328125, "loss_aux_layer_11": 0.06182861328125, "loss_aux_layer_12": 0.06622314453125, "loss_aux_layer_13": 0.071533203125, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.089111328125, "loss_aux_layer_16": 0.0985107421875, "loss_aux_layer_17": 0.106201171875, "loss_aux_layer_18": 0.114013671875, "loss_aux_layer_19": 0.1170654296875, "loss_aux_layer_2": 0.0435791015625, "loss_aux_layer_20": 0.12451171875, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.05322265625, "loss_aux_layer_4": 0.0557861328125, "loss_aux_layer_5": 0.05718994140625, "loss_aux_layer_6": 0.06036376953125, "loss_aux_layer_7": 0.0584716796875, "loss_aux_layer_8": 0.05792236328125, "loss_aux_layer_9": 0.05682373046875, "step": 3759, "total_loss": 0.7037489861249924 }, { "epoch": 0.7444070481092853, "grad_norm": 0.9178501963615417, "learning_rate": 5e-05, "llm_loss": 0.5254659652709961, "loss": 2.4436, "loss_aux_layer_0": 0.0138397216796875, "loss_aux_layer_1": 0.0325927734375, "loss_aux_layer_10": 0.0611572265625, "loss_aux_layer_11": 0.0653076171875, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.08447265625, "loss_aux_layer_15": 0.0933837890625, "loss_aux_layer_16": 0.102783203125, "loss_aux_layer_17": 0.1103515625, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.1234130859375, "loss_aux_layer_2": 0.04632568359375, "loss_aux_layer_20": 0.131591796875, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.16064453125, "loss_aux_layer_23": 0.19873046875, "loss_aux_layer_3": 0.05621337890625, "loss_aux_layer_4": 0.058837890625, "loss_aux_layer_5": 0.06048583984375, "loss_aux_layer_6": 0.063720703125, "loss_aux_layer_7": 0.06170654296875, "loss_aux_layer_8": 0.06103515625, "loss_aux_layer_9": 0.05999755859375, "step": 3760, "total_loss": 0.6109063029289246 }, { "epoch": 0.7446050287071867, "grad_norm": 0.8321179151535034, "learning_rate": 5e-05, "llm_loss": 0.5300190299749374, "loss": 2.452, "loss_aux_layer_0": 0.0127716064453125, "loss_aux_layer_1": 0.03179931640625, "loss_aux_layer_10": 0.059326171875, "loss_aux_layer_11": 0.0633544921875, "loss_aux_layer_12": 0.06781005859375, "loss_aux_layer_13": 0.073486328125, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.0902099609375, "loss_aux_layer_16": 0.099609375, "loss_aux_layer_17": 0.107177734375, "loss_aux_layer_18": 0.1160888671875, "loss_aux_layer_19": 0.119384765625, "loss_aux_layer_2": 0.0445556640625, "loss_aux_layer_20": 0.127197265625, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.1943359375, "loss_aux_layer_3": 0.05438232421875, "loss_aux_layer_4": 0.0567626953125, "loss_aux_layer_5": 0.05877685546875, "loss_aux_layer_6": 0.0616455078125, "loss_aux_layer_7": 0.0599365234375, "loss_aux_layer_8": 0.0594482421875, "loss_aux_layer_9": 0.0582275390625, "step": 3761, "total_loss": 0.6130039542913437 }, { "epoch": 0.7448030093050881, "grad_norm": 0.761841356754303, "learning_rate": 5e-05, "llm_loss": 0.47306492179632187, "loss": 2.226, "loss_aux_layer_0": 0.0126190185546875, "loss_aux_layer_1": 0.0321044921875, "loss_aux_layer_10": 0.05975341796875, "loss_aux_layer_11": 0.0638427734375, "loss_aux_layer_12": 0.0682373046875, "loss_aux_layer_13": 0.07373046875, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.09033203125, "loss_aux_layer_16": 0.0999755859375, "loss_aux_layer_17": 0.107421875, "loss_aux_layer_18": 0.116943359375, "loss_aux_layer_19": 0.120849609375, "loss_aux_layer_2": 0.0443115234375, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.0540771484375, "loss_aux_layer_4": 0.05670166015625, "loss_aux_layer_5": 0.05853271484375, "loss_aux_layer_6": 0.0615234375, "loss_aux_layer_7": 0.05987548828125, "loss_aux_layer_8": 0.059326171875, "loss_aux_layer_9": 0.05841064453125, "step": 3762, "total_loss": 0.5564927458763123 }, { "epoch": 0.7450009899029895, "grad_norm": 0.9437786340713501, "learning_rate": 5e-05, "llm_loss": 0.6567791253328323, "loss": 2.9529, "loss_aux_layer_0": 0.012451171875, "loss_aux_layer_1": 0.032196044921875, "loss_aux_layer_10": 0.059326171875, "loss_aux_layer_11": 0.06329345703125, "loss_aux_layer_12": 0.0673828125, "loss_aux_layer_13": 0.072509765625, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.088623046875, "loss_aux_layer_16": 0.09765625, "loss_aux_layer_17": 0.10546875, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.1162109375, "loss_aux_layer_2": 0.04473876953125, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.054443359375, "loss_aux_layer_4": 0.0570068359375, "loss_aux_layer_5": 0.0587158203125, "loss_aux_layer_6": 0.06170654296875, "loss_aux_layer_7": 0.05999755859375, "loss_aux_layer_8": 0.0592041015625, "loss_aux_layer_9": 0.0579833984375, "step": 3763, "total_loss": 0.7382203787565231 }, { "epoch": 0.7451989705008909, "grad_norm": 0.7636390328407288, "learning_rate": 5e-05, "llm_loss": 0.5474314242601395, "loss": 2.5216, "loss_aux_layer_0": 0.0125732421875, "loss_aux_layer_1": 0.03271484375, "loss_aux_layer_10": 0.05926513671875, "loss_aux_layer_11": 0.0634765625, "loss_aux_layer_12": 0.06787109375, "loss_aux_layer_13": 0.0736083984375, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.0904541015625, "loss_aux_layer_16": 0.099853515625, "loss_aux_layer_17": 0.107421875, "loss_aux_layer_18": 0.115478515625, "loss_aux_layer_19": 0.1187744140625, "loss_aux_layer_2": 0.04534912109375, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.193603515625, "loss_aux_layer_3": 0.05517578125, "loss_aux_layer_4": 0.0576171875, "loss_aux_layer_5": 0.05908203125, "loss_aux_layer_6": 0.06201171875, "loss_aux_layer_7": 0.05999755859375, "loss_aux_layer_8": 0.0592041015625, "loss_aux_layer_9": 0.05792236328125, "step": 3764, "total_loss": 0.6303874999284744 }, { "epoch": 0.7453969510987923, "grad_norm": 0.8513519763946533, "learning_rate": 5e-05, "llm_loss": 0.58059923350811, "loss": 2.6569, "loss_aux_layer_0": 0.0133209228515625, "loss_aux_layer_1": 0.0323486328125, "loss_aux_layer_10": 0.0601806640625, "loss_aux_layer_11": 0.06439208984375, "loss_aux_layer_12": 0.0687255859375, "loss_aux_layer_13": 0.07421875, "loss_aux_layer_14": 0.0826416015625, "loss_aux_layer_15": 0.091552734375, "loss_aux_layer_16": 0.101318359375, "loss_aux_layer_17": 0.109375, "loss_aux_layer_18": 0.1168212890625, "loss_aux_layer_19": 0.1201171875, "loss_aux_layer_2": 0.04486083984375, "loss_aux_layer_20": 0.127685546875, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.156494140625, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.054931640625, "loss_aux_layer_4": 0.05755615234375, "loss_aux_layer_5": 0.05938720703125, "loss_aux_layer_6": 0.06219482421875, "loss_aux_layer_7": 0.0606689453125, "loss_aux_layer_8": 0.0599365234375, "loss_aux_layer_9": 0.05902099609375, "step": 3765, "total_loss": 0.664220318198204 }, { "epoch": 0.7455949316966938, "grad_norm": 0.7732008099555969, "learning_rate": 5e-05, "llm_loss": 0.5384586453437805, "loss": 2.4794, "loss_aux_layer_0": 0.01251220703125, "loss_aux_layer_1": 0.03240966796875, "loss_aux_layer_10": 0.0587158203125, "loss_aux_layer_11": 0.0628662109375, "loss_aux_layer_12": 0.0670166015625, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.08056640625, "loss_aux_layer_15": 0.0888671875, "loss_aux_layer_16": 0.09765625, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.115966796875, "loss_aux_layer_2": 0.04449462890625, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.0543212890625, "loss_aux_layer_4": 0.05682373046875, "loss_aux_layer_5": 0.0582275390625, "loss_aux_layer_6": 0.06097412109375, "loss_aux_layer_7": 0.0592041015625, "loss_aux_layer_8": 0.05865478515625, "loss_aux_layer_9": 0.0574951171875, "step": 3766, "total_loss": 0.6198444962501526 }, { "epoch": 0.7457929122945951, "grad_norm": 0.9690976142883301, "learning_rate": 5e-05, "llm_loss": 0.5343987345695496, "loss": 2.4708, "loss_aux_layer_0": 0.0127410888671875, "loss_aux_layer_1": 0.032470703125, "loss_aux_layer_10": 0.05987548828125, "loss_aux_layer_11": 0.06396484375, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.073486328125, "loss_aux_layer_14": 0.08203125, "loss_aux_layer_15": 0.090576171875, "loss_aux_layer_16": 0.0997314453125, "loss_aux_layer_17": 0.107421875, "loss_aux_layer_18": 0.115234375, "loss_aux_layer_19": 0.1190185546875, "loss_aux_layer_2": 0.04559326171875, "loss_aux_layer_20": 0.1265869140625, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.156494140625, "loss_aux_layer_23": 0.1943359375, "loss_aux_layer_3": 0.05584716796875, "loss_aux_layer_4": 0.0584716796875, "loss_aux_layer_5": 0.05987548828125, "loss_aux_layer_6": 0.06256103515625, "loss_aux_layer_7": 0.06060791015625, "loss_aux_layer_8": 0.0599365234375, "loss_aux_layer_9": 0.05859375, "step": 3767, "total_loss": 0.6177002191543579 }, { "epoch": 0.7459908928924965, "grad_norm": 0.7286674976348877, "learning_rate": 5e-05, "llm_loss": 0.535008043050766, "loss": 2.4656, "loss_aux_layer_0": 0.0130157470703125, "loss_aux_layer_1": 0.0308837890625, "loss_aux_layer_10": 0.05841064453125, "loss_aux_layer_11": 0.0625, "loss_aux_layer_12": 0.0670166015625, "loss_aux_layer_13": 0.072509765625, "loss_aux_layer_14": 0.0809326171875, "loss_aux_layer_15": 0.08935546875, "loss_aux_layer_16": 0.0987548828125, "loss_aux_layer_17": 0.1068115234375, "loss_aux_layer_18": 0.11474609375, "loss_aux_layer_19": 0.117919921875, "loss_aux_layer_2": 0.043212890625, "loss_aux_layer_20": 0.125244140625, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.0528564453125, "loss_aux_layer_4": 0.0556640625, "loss_aux_layer_5": 0.0572509765625, "loss_aux_layer_6": 0.0601806640625, "loss_aux_layer_7": 0.058349609375, "loss_aux_layer_8": 0.05810546875, "loss_aux_layer_9": 0.05706787109375, "step": 3768, "total_loss": 0.6164107173681259 }, { "epoch": 0.746188873490398, "grad_norm": 1.0635809898376465, "learning_rate": 5e-05, "llm_loss": 0.5896721184253693, "loss": 2.6767, "loss_aux_layer_0": 0.012908935546875, "loss_aux_layer_1": 0.031494140625, "loss_aux_layer_10": 0.05712890625, "loss_aux_layer_11": 0.0609130859375, "loss_aux_layer_12": 0.06524658203125, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.078125, "loss_aux_layer_15": 0.0860595703125, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.1026611328125, "loss_aux_layer_18": 0.110595703125, "loss_aux_layer_19": 0.114013671875, "loss_aux_layer_2": 0.0428466796875, "loss_aux_layer_20": 0.121826171875, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.052490234375, "loss_aux_layer_4": 0.0546875, "loss_aux_layer_5": 0.05615234375, "loss_aux_layer_6": 0.05914306640625, "loss_aux_layer_7": 0.05731201171875, "loss_aux_layer_8": 0.05682373046875, "loss_aux_layer_9": 0.05596923828125, "step": 3769, "total_loss": 0.6691767424345016 }, { "epoch": 0.7463868540882993, "grad_norm": 1.068584680557251, "learning_rate": 5e-05, "llm_loss": 0.5232907384634018, "loss": 2.4312, "loss_aux_layer_0": 0.012359619140625, "loss_aux_layer_1": 0.03411865234375, "loss_aux_layer_10": 0.06146240234375, "loss_aux_layer_11": 0.0654296875, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.0755615234375, "loss_aux_layer_14": 0.083984375, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.101806640625, "loss_aux_layer_17": 0.1092529296875, "loss_aux_layer_18": 0.1173095703125, "loss_aux_layer_19": 0.120849609375, "loss_aux_layer_2": 0.04644775390625, "loss_aux_layer_20": 0.128662109375, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.05645751953125, "loss_aux_layer_4": 0.05908203125, "loss_aux_layer_5": 0.06060791015625, "loss_aux_layer_6": 0.0635986328125, "loss_aux_layer_7": 0.06170654296875, "loss_aux_layer_8": 0.06109619140625, "loss_aux_layer_9": 0.059814453125, "step": 3770, "total_loss": 0.6078092306852341 }, { "epoch": 0.7465848346862007, "grad_norm": 0.9742433428764343, "learning_rate": 5e-05, "llm_loss": 0.5052024871110916, "loss": 2.3499, "loss_aux_layer_0": 0.012908935546875, "loss_aux_layer_1": 0.032073974609375, "loss_aux_layer_10": 0.05914306640625, "loss_aux_layer_11": 0.06329345703125, "loss_aux_layer_12": 0.0675048828125, "loss_aux_layer_13": 0.07275390625, "loss_aux_layer_14": 0.0810546875, "loss_aux_layer_15": 0.0894775390625, "loss_aux_layer_16": 0.098388671875, "loss_aux_layer_17": 0.10595703125, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.1168212890625, "loss_aux_layer_2": 0.04498291015625, "loss_aux_layer_20": 0.125, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.0548095703125, "loss_aux_layer_4": 0.05731201171875, "loss_aux_layer_5": 0.05877685546875, "loss_aux_layer_6": 0.061767578125, "loss_aux_layer_7": 0.05987548828125, "loss_aux_layer_8": 0.0592041015625, "loss_aux_layer_9": 0.05816650390625, "step": 3771, "total_loss": 0.5874679386615753 }, { "epoch": 0.7467828152841022, "grad_norm": 0.8404839038848877, "learning_rate": 5e-05, "llm_loss": 0.5427616387605667, "loss": 2.4898, "loss_aux_layer_0": 0.0129241943359375, "loss_aux_layer_1": 0.03082275390625, "loss_aux_layer_10": 0.05572509765625, "loss_aux_layer_11": 0.0594482421875, "loss_aux_layer_12": 0.06378173828125, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.077880859375, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.095947265625, "loss_aux_layer_17": 0.103515625, "loss_aux_layer_18": 0.112548828125, "loss_aux_layer_19": 0.1168212890625, "loss_aux_layer_2": 0.04229736328125, "loss_aux_layer_20": 0.1253662109375, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.05126953125, "loss_aux_layer_4": 0.05352783203125, "loss_aux_layer_5": 0.05487060546875, "loss_aux_layer_6": 0.0576171875, "loss_aux_layer_7": 0.05584716796875, "loss_aux_layer_8": 0.055419921875, "loss_aux_layer_9": 0.05438232421875, "step": 3772, "total_loss": 0.6224590688943863 }, { "epoch": 0.7469807958820036, "grad_norm": 0.9642499685287476, "learning_rate": 5e-05, "llm_loss": 0.6299517750740051, "loss": 2.8427, "loss_aux_layer_0": 0.0126495361328125, "loss_aux_layer_1": 0.032318115234375, "loss_aux_layer_10": 0.0576171875, "loss_aux_layer_11": 0.0615234375, "loss_aux_layer_12": 0.065673828125, "loss_aux_layer_13": 0.0709228515625, "loss_aux_layer_14": 0.078857421875, "loss_aux_layer_15": 0.087158203125, "loss_aux_layer_16": 0.096435546875, "loss_aux_layer_17": 0.104248046875, "loss_aux_layer_18": 0.11279296875, "loss_aux_layer_19": 0.1160888671875, "loss_aux_layer_2": 0.04425048828125, "loss_aux_layer_20": 0.1239013671875, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.05364990234375, "loss_aux_layer_4": 0.0557861328125, "loss_aux_layer_5": 0.05718994140625, "loss_aux_layer_6": 0.0601806640625, "loss_aux_layer_7": 0.058349609375, "loss_aux_layer_8": 0.0577392578125, "loss_aux_layer_9": 0.056640625, "step": 3773, "total_loss": 0.710668295621872 }, { "epoch": 0.7471787764799049, "grad_norm": 0.717891275882721, "learning_rate": 5e-05, "llm_loss": 0.5872327983379364, "loss": 2.6762, "loss_aux_layer_0": 0.011871337890625, "loss_aux_layer_1": 0.032073974609375, "loss_aux_layer_10": 0.05914306640625, "loss_aux_layer_11": 0.0633544921875, "loss_aux_layer_12": 0.06768798828125, "loss_aux_layer_13": 0.07275390625, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.089111328125, "loss_aux_layer_16": 0.0985107421875, "loss_aux_layer_17": 0.106201171875, "loss_aux_layer_18": 0.1142578125, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.0438232421875, "loss_aux_layer_20": 0.1259765625, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.0537109375, "loss_aux_layer_4": 0.0562744140625, "loss_aux_layer_5": 0.0577392578125, "loss_aux_layer_6": 0.0606689453125, "loss_aux_layer_7": 0.05908203125, "loss_aux_layer_8": 0.05859375, "loss_aux_layer_9": 0.05767822265625, "step": 3774, "total_loss": 0.6690546572208405 }, { "epoch": 0.7473767570778064, "grad_norm": 0.9824535250663757, "learning_rate": 5e-05, "llm_loss": 0.5254039466381073, "loss": 2.4215, "loss_aux_layer_0": 0.0131378173828125, "loss_aux_layer_1": 0.030609130859375, "loss_aux_layer_10": 0.05712890625, "loss_aux_layer_11": 0.06072998046875, "loss_aux_layer_12": 0.06463623046875, "loss_aux_layer_13": 0.0699462890625, "loss_aux_layer_14": 0.078369140625, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.095458984375, "loss_aux_layer_17": 0.1036376953125, "loss_aux_layer_18": 0.1116943359375, "loss_aux_layer_19": 0.1156005859375, "loss_aux_layer_2": 0.04302978515625, "loss_aux_layer_20": 0.1234130859375, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.05224609375, "loss_aux_layer_4": 0.0548095703125, "loss_aux_layer_5": 0.05633544921875, "loss_aux_layer_6": 0.05950927734375, "loss_aux_layer_7": 0.0577392578125, "loss_aux_layer_8": 0.0570068359375, "loss_aux_layer_9": 0.05584716796875, "step": 3775, "total_loss": 0.6053836792707443 }, { "epoch": 0.7475747376757078, "grad_norm": 0.9126870632171631, "learning_rate": 5e-05, "llm_loss": 0.6511697024106979, "loss": 2.9473, "loss_aux_layer_0": 0.0125732421875, "loss_aux_layer_1": 0.0347900390625, "loss_aux_layer_10": 0.0635986328125, "loss_aux_layer_11": 0.0677490234375, "loss_aux_layer_12": 0.072021484375, "loss_aux_layer_13": 0.0775146484375, "loss_aux_layer_14": 0.0860595703125, "loss_aux_layer_15": 0.093994140625, "loss_aux_layer_16": 0.103271484375, "loss_aux_layer_17": 0.1104736328125, "loss_aux_layer_18": 0.1177978515625, "loss_aux_layer_19": 0.120849609375, "loss_aux_layer_2": 0.04827880859375, "loss_aux_layer_20": 0.128173828125, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.05859375, "loss_aux_layer_4": 0.0615234375, "loss_aux_layer_5": 0.06317138671875, "loss_aux_layer_6": 0.06646728515625, "loss_aux_layer_7": 0.06414794921875, "loss_aux_layer_8": 0.0635986328125, "loss_aux_layer_9": 0.062255859375, "step": 3776, "total_loss": 0.7368258833885193 }, { "epoch": 0.7477727182736091, "grad_norm": 1.282619833946228, "learning_rate": 5e-05, "llm_loss": 0.55801822245121, "loss": 2.5725, "loss_aux_layer_0": 0.0131988525390625, "loss_aux_layer_1": 0.0341796875, "loss_aux_layer_10": 0.0618896484375, "loss_aux_layer_11": 0.06591796875, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.0755615234375, "loss_aux_layer_14": 0.083984375, "loss_aux_layer_15": 0.0926513671875, "loss_aux_layer_16": 0.1015625, "loss_aux_layer_17": 0.109130859375, "loss_aux_layer_18": 0.1173095703125, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.0477294921875, "loss_aux_layer_20": 0.128173828125, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.05743408203125, "loss_aux_layer_4": 0.0604248046875, "loss_aux_layer_5": 0.06195068359375, "loss_aux_layer_6": 0.0650634765625, "loss_aux_layer_7": 0.06304931640625, "loss_aux_layer_8": 0.0621337890625, "loss_aux_layer_9": 0.06072998046875, "step": 3777, "total_loss": 0.6431224048137665 }, { "epoch": 0.7479706988715106, "grad_norm": 1.0067152976989746, "learning_rate": 5e-05, "llm_loss": 0.6821447461843491, "loss": 3.0481, "loss_aux_layer_0": 0.0129852294921875, "loss_aux_layer_1": 0.030426025390625, "loss_aux_layer_10": 0.0567626953125, "loss_aux_layer_11": 0.060546875, "loss_aux_layer_12": 0.06475830078125, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.078125, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.0955810546875, "loss_aux_layer_17": 0.103759765625, "loss_aux_layer_18": 0.1116943359375, "loss_aux_layer_19": 0.1156005859375, "loss_aux_layer_2": 0.042236328125, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.1322021484375, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.05169677734375, "loss_aux_layer_4": 0.05401611328125, "loss_aux_layer_5": 0.05572509765625, "loss_aux_layer_6": 0.05859375, "loss_aux_layer_7": 0.056884765625, "loss_aux_layer_8": 0.0565185546875, "loss_aux_layer_9": 0.055419921875, "step": 3778, "total_loss": 0.762026846408844 }, { "epoch": 0.748168679469412, "grad_norm": 1.03948974609375, "learning_rate": 5e-05, "llm_loss": 0.6184230893850327, "loss": 2.8052, "loss_aux_layer_0": 0.01416015625, "loss_aux_layer_1": 0.03375244140625, "loss_aux_layer_10": 0.0609130859375, "loss_aux_layer_11": 0.06536865234375, "loss_aux_layer_12": 0.069580078125, "loss_aux_layer_13": 0.0745849609375, "loss_aux_layer_14": 0.082275390625, "loss_aux_layer_15": 0.090087890625, "loss_aux_layer_16": 0.0986328125, "loss_aux_layer_17": 0.1058349609375, "loss_aux_layer_18": 0.11376953125, "loss_aux_layer_19": 0.11669921875, "loss_aux_layer_2": 0.0469970703125, "loss_aux_layer_20": 0.1241455078125, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.05657958984375, "loss_aux_layer_4": 0.05902099609375, "loss_aux_layer_5": 0.06036376953125, "loss_aux_layer_6": 0.063232421875, "loss_aux_layer_7": 0.061767578125, "loss_aux_layer_8": 0.0611572265625, "loss_aux_layer_9": 0.0596923828125, "step": 3779, "total_loss": 0.7012938112020493 }, { "epoch": 0.7483666600673134, "grad_norm": 0.9967886805534363, "learning_rate": 5e-05, "llm_loss": 0.555407240986824, "loss": 2.5522, "loss_aux_layer_0": 0.012847900390625, "loss_aux_layer_1": 0.032135009765625, "loss_aux_layer_10": 0.05999755859375, "loss_aux_layer_11": 0.06402587890625, "loss_aux_layer_12": 0.06854248046875, "loss_aux_layer_13": 0.07373046875, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.0904541015625, "loss_aux_layer_16": 0.099609375, "loss_aux_layer_17": 0.10693359375, "loss_aux_layer_18": 0.1151123046875, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.0457763671875, "loss_aux_layer_20": 0.1253662109375, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.0555419921875, "loss_aux_layer_4": 0.0576171875, "loss_aux_layer_5": 0.05889892578125, "loss_aux_layer_6": 0.0616455078125, "loss_aux_layer_7": 0.0601806640625, "loss_aux_layer_8": 0.05975341796875, "loss_aux_layer_9": 0.05865478515625, "step": 3780, "total_loss": 0.6380613148212433 }, { "epoch": 0.7485646406652148, "grad_norm": 1.3839938640594482, "learning_rate": 5e-05, "llm_loss": 0.5925640761852264, "loss": 2.7102, "loss_aux_layer_0": 0.0132598876953125, "loss_aux_layer_1": 0.03363037109375, "loss_aux_layer_10": 0.0615234375, "loss_aux_layer_11": 0.0657958984375, "loss_aux_layer_12": 0.0701904296875, "loss_aux_layer_13": 0.0760498046875, "loss_aux_layer_14": 0.0848388671875, "loss_aux_layer_15": 0.09326171875, "loss_aux_layer_16": 0.102294921875, "loss_aux_layer_17": 0.1104736328125, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.12158203125, "loss_aux_layer_2": 0.046875, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.193115234375, "loss_aux_layer_3": 0.05718994140625, "loss_aux_layer_4": 0.059814453125, "loss_aux_layer_5": 0.06121826171875, "loss_aux_layer_6": 0.064208984375, "loss_aux_layer_7": 0.0621337890625, "loss_aux_layer_8": 0.06146240234375, "loss_aux_layer_9": 0.06005859375, "step": 3781, "total_loss": 0.6775482147932053 }, { "epoch": 0.7487626212631162, "grad_norm": 0.9069117903709412, "learning_rate": 5e-05, "llm_loss": 0.5714772790670395, "loss": 2.6076, "loss_aux_layer_0": 0.0133056640625, "loss_aux_layer_1": 0.03167724609375, "loss_aux_layer_10": 0.05694580078125, "loss_aux_layer_11": 0.0606689453125, "loss_aux_layer_12": 0.0650634765625, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.078857421875, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.0963134765625, "loss_aux_layer_17": 0.1041259765625, "loss_aux_layer_18": 0.1119384765625, "loss_aux_layer_19": 0.11572265625, "loss_aux_layer_2": 0.043701171875, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.05267333984375, "loss_aux_layer_4": 0.0550537109375, "loss_aux_layer_5": 0.05657958984375, "loss_aux_layer_6": 0.05938720703125, "loss_aux_layer_7": 0.05743408203125, "loss_aux_layer_8": 0.05694580078125, "loss_aux_layer_9": 0.0555419921875, "step": 3782, "total_loss": 0.6519100069999695 }, { "epoch": 0.7489606018610176, "grad_norm": 0.9406259059906006, "learning_rate": 5e-05, "llm_loss": 0.63651292771101, "loss": 2.8666, "loss_aux_layer_0": 0.0132904052734375, "loss_aux_layer_1": 0.031219482421875, "loss_aux_layer_10": 0.05755615234375, "loss_aux_layer_11": 0.061279296875, "loss_aux_layer_12": 0.0657958984375, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.0792236328125, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.096435546875, "loss_aux_layer_17": 0.1044921875, "loss_aux_layer_18": 0.1121826171875, "loss_aux_layer_19": 0.1153564453125, "loss_aux_layer_2": 0.04302978515625, "loss_aux_layer_20": 0.1229248046875, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.0526123046875, "loss_aux_layer_4": 0.0550537109375, "loss_aux_layer_5": 0.056640625, "loss_aux_layer_6": 0.0595703125, "loss_aux_layer_7": 0.05780029296875, "loss_aux_layer_8": 0.05743408203125, "loss_aux_layer_9": 0.0562744140625, "step": 3783, "total_loss": 0.7166467756032944 }, { "epoch": 0.749158582458919, "grad_norm": 1.0609517097473145, "learning_rate": 5e-05, "llm_loss": 0.6455662101507187, "loss": 2.9055, "loss_aux_layer_0": 0.013641357421875, "loss_aux_layer_1": 0.03271484375, "loss_aux_layer_10": 0.0582275390625, "loss_aux_layer_11": 0.0618896484375, "loss_aux_layer_12": 0.06634521484375, "loss_aux_layer_13": 0.071533203125, "loss_aux_layer_14": 0.079345703125, "loss_aux_layer_15": 0.0875244140625, "loss_aux_layer_16": 0.0963134765625, "loss_aux_layer_17": 0.1038818359375, "loss_aux_layer_18": 0.111572265625, "loss_aux_layer_19": 0.11474609375, "loss_aux_layer_2": 0.04541015625, "loss_aux_layer_20": 0.122314453125, "loss_aux_layer_21": 0.1297607421875, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.055419921875, "loss_aux_layer_4": 0.05755615234375, "loss_aux_layer_5": 0.05865478515625, "loss_aux_layer_6": 0.0614013671875, "loss_aux_layer_7": 0.059326171875, "loss_aux_layer_8": 0.0584716796875, "loss_aux_layer_9": 0.05718994140625, "step": 3784, "total_loss": 0.7263680845499039 }, { "epoch": 0.7493565630568204, "grad_norm": 0.818713366985321, "learning_rate": 5e-05, "llm_loss": 0.5561523735523224, "loss": 2.5474, "loss_aux_layer_0": 0.0128173828125, "loss_aux_layer_1": 0.031097412109375, "loss_aux_layer_10": 0.05816650390625, "loss_aux_layer_11": 0.06201171875, "loss_aux_layer_12": 0.06597900390625, "loss_aux_layer_13": 0.071044921875, "loss_aux_layer_14": 0.0789794921875, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.0960693359375, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.1119384765625, "loss_aux_layer_19": 0.1153564453125, "loss_aux_layer_2": 0.0440673828125, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.0537109375, "loss_aux_layer_4": 0.0560302734375, "loss_aux_layer_5": 0.05755615234375, "loss_aux_layer_6": 0.06048583984375, "loss_aux_layer_7": 0.0587158203125, "loss_aux_layer_8": 0.058349609375, "loss_aux_layer_9": 0.05706787109375, "step": 3785, "total_loss": 0.6368387788534164 }, { "epoch": 0.7495545436547218, "grad_norm": 0.9926738142967224, "learning_rate": 5e-05, "llm_loss": 0.5326651632785797, "loss": 2.4511, "loss_aux_layer_0": 0.01324462890625, "loss_aux_layer_1": 0.03167724609375, "loss_aux_layer_10": 0.05645751953125, "loss_aux_layer_11": 0.0601806640625, "loss_aux_layer_12": 0.064208984375, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.0775146484375, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.09521484375, "loss_aux_layer_17": 0.1029052734375, "loss_aux_layer_18": 0.1114501953125, "loss_aux_layer_19": 0.1162109375, "loss_aux_layer_2": 0.04351806640625, "loss_aux_layer_20": 0.12451171875, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.052734375, "loss_aux_layer_4": 0.0550537109375, "loss_aux_layer_5": 0.05670166015625, "loss_aux_layer_6": 0.05908203125, "loss_aux_layer_7": 0.05731201171875, "loss_aux_layer_8": 0.05670166015625, "loss_aux_layer_9": 0.05535888671875, "step": 3786, "total_loss": 0.6127647235989571 }, { "epoch": 0.7497525242526233, "grad_norm": 0.9166219830513, "learning_rate": 5e-05, "llm_loss": 0.47932884097099304, "loss": 2.2559, "loss_aux_layer_0": 0.0135650634765625, "loss_aux_layer_1": 0.033233642578125, "loss_aux_layer_10": 0.060791015625, "loss_aux_layer_11": 0.065185546875, "loss_aux_layer_12": 0.06982421875, "loss_aux_layer_13": 0.0753173828125, "loss_aux_layer_14": 0.0836181640625, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.10205078125, "loss_aux_layer_17": 0.1097412109375, "loss_aux_layer_18": 0.1180419921875, "loss_aux_layer_19": 0.1217041015625, "loss_aux_layer_2": 0.0460205078125, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.13720703125, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.05596923828125, "loss_aux_layer_4": 0.0584716796875, "loss_aux_layer_5": 0.06011962890625, "loss_aux_layer_6": 0.063232421875, "loss_aux_layer_7": 0.06109619140625, "loss_aux_layer_8": 0.06060791015625, "loss_aux_layer_9": 0.05938720703125, "step": 3787, "total_loss": 0.5639733374118805 }, { "epoch": 0.7499505048505246, "grad_norm": 0.9849292039871216, "learning_rate": 5e-05, "llm_loss": 0.5314926430583, "loss": 2.4656, "loss_aux_layer_0": 0.0124359130859375, "loss_aux_layer_1": 0.0335693359375, "loss_aux_layer_10": 0.06170654296875, "loss_aux_layer_11": 0.06585693359375, "loss_aux_layer_12": 0.07025146484375, "loss_aux_layer_13": 0.075927734375, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.093017578125, "loss_aux_layer_16": 0.1026611328125, "loss_aux_layer_17": 0.10986328125, "loss_aux_layer_18": 0.1185302734375, "loss_aux_layer_19": 0.1209716796875, "loss_aux_layer_2": 0.04669189453125, "loss_aux_layer_20": 0.128662109375, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.05694580078125, "loss_aux_layer_4": 0.059326171875, "loss_aux_layer_5": 0.0609130859375, "loss_aux_layer_6": 0.0640869140625, "loss_aux_layer_7": 0.06201171875, "loss_aux_layer_8": 0.0614013671875, "loss_aux_layer_9": 0.06036376953125, "step": 3788, "total_loss": 0.6164079904556274 }, { "epoch": 0.750148485448426, "grad_norm": 0.9284718632698059, "learning_rate": 5e-05, "llm_loss": 0.6062562465667725, "loss": 2.7277, "loss_aux_layer_0": 0.01348876953125, "loss_aux_layer_1": 0.0284423828125, "loss_aux_layer_10": 0.05224609375, "loss_aux_layer_11": 0.05572509765625, "loss_aux_layer_12": 0.05999755859375, "loss_aux_layer_13": 0.0653076171875, "loss_aux_layer_14": 0.0732421875, "loss_aux_layer_15": 0.0816650390625, "loss_aux_layer_16": 0.0911865234375, "loss_aux_layer_17": 0.0989990234375, "loss_aux_layer_18": 0.107177734375, "loss_aux_layer_19": 0.1112060546875, "loss_aux_layer_2": 0.03924560546875, "loss_aux_layer_20": 0.119384765625, "loss_aux_layer_21": 0.1275634765625, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.04803466796875, "loss_aux_layer_4": 0.0501708984375, "loss_aux_layer_5": 0.0516357421875, "loss_aux_layer_6": 0.05438232421875, "loss_aux_layer_7": 0.052490234375, "loss_aux_layer_8": 0.05224609375, "loss_aux_layer_9": 0.0509033203125, "step": 3789, "total_loss": 0.6819197535514832 }, { "epoch": 0.7503464660463275, "grad_norm": 0.9792134165763855, "learning_rate": 5e-05, "llm_loss": 0.5812538117170334, "loss": 2.6553, "loss_aux_layer_0": 0.0132598876953125, "loss_aux_layer_1": 0.03277587890625, "loss_aux_layer_10": 0.05963134765625, "loss_aux_layer_11": 0.06378173828125, "loss_aux_layer_12": 0.0679931640625, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.098876953125, "loss_aux_layer_17": 0.1063232421875, "loss_aux_layer_18": 0.114990234375, "loss_aux_layer_19": 0.1181640625, "loss_aux_layer_2": 0.04498291015625, "loss_aux_layer_20": 0.1259765625, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.05517578125, "loss_aux_layer_4": 0.0577392578125, "loss_aux_layer_5": 0.058837890625, "loss_aux_layer_6": 0.06201171875, "loss_aux_layer_7": 0.06048583984375, "loss_aux_layer_8": 0.0596923828125, "loss_aux_layer_9": 0.0582275390625, "step": 3790, "total_loss": 0.6638303250074387 }, { "epoch": 0.7505444466442288, "grad_norm": 0.8894696831703186, "learning_rate": 5e-05, "llm_loss": 0.5736799165606499, "loss": 2.6133, "loss_aux_layer_0": 0.0145721435546875, "loss_aux_layer_1": 0.029876708984375, "loss_aux_layer_10": 0.05621337890625, "loss_aux_layer_11": 0.0599365234375, "loss_aux_layer_12": 0.06427001953125, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.0780029296875, "loss_aux_layer_15": 0.0865478515625, "loss_aux_layer_16": 0.0958251953125, "loss_aux_layer_17": 0.1036376953125, "loss_aux_layer_18": 0.111572265625, "loss_aux_layer_19": 0.1156005859375, "loss_aux_layer_2": 0.0416259765625, "loss_aux_layer_20": 0.1241455078125, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.05108642578125, "loss_aux_layer_4": 0.05352783203125, "loss_aux_layer_5": 0.0550537109375, "loss_aux_layer_6": 0.0579833984375, "loss_aux_layer_7": 0.05621337890625, "loss_aux_layer_8": 0.055908203125, "loss_aux_layer_9": 0.0546875, "step": 3791, "total_loss": 0.6533171087503433 }, { "epoch": 0.7507424272421303, "grad_norm": 0.8347205519676208, "learning_rate": 5e-05, "llm_loss": 0.5364388972520828, "loss": 2.4784, "loss_aux_layer_0": 0.013092041015625, "loss_aux_layer_1": 0.032012939453125, "loss_aux_layer_10": 0.0604248046875, "loss_aux_layer_11": 0.064697265625, "loss_aux_layer_12": 0.069091796875, "loss_aux_layer_13": 0.0743408203125, "loss_aux_layer_14": 0.08251953125, "loss_aux_layer_15": 0.0908203125, "loss_aux_layer_16": 0.100341796875, "loss_aux_layer_17": 0.1080322265625, "loss_aux_layer_18": 0.11572265625, "loss_aux_layer_19": 0.11865234375, "loss_aux_layer_2": 0.04498291015625, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.05523681640625, "loss_aux_layer_4": 0.0579833984375, "loss_aux_layer_5": 0.05975341796875, "loss_aux_layer_6": 0.06268310546875, "loss_aux_layer_7": 0.06085205078125, "loss_aux_layer_8": 0.06036376953125, "loss_aux_layer_9": 0.05908203125, "step": 3792, "total_loss": 0.6195888668298721 }, { "epoch": 0.7509404078400317, "grad_norm": 0.873957633972168, "learning_rate": 5e-05, "llm_loss": 0.4936641976237297, "loss": 2.3146, "loss_aux_layer_0": 0.0148162841796875, "loss_aux_layer_1": 0.0341796875, "loss_aux_layer_10": 0.061279296875, "loss_aux_layer_11": 0.06536865234375, "loss_aux_layer_12": 0.0697021484375, "loss_aux_layer_13": 0.075439453125, "loss_aux_layer_14": 0.084228515625, "loss_aux_layer_15": 0.093017578125, "loss_aux_layer_16": 0.102294921875, "loss_aux_layer_17": 0.109619140625, "loss_aux_layer_18": 0.1180419921875, "loss_aux_layer_19": 0.12109375, "loss_aux_layer_2": 0.04669189453125, "loss_aux_layer_20": 0.12890625, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.05694580078125, "loss_aux_layer_4": 0.0594482421875, "loss_aux_layer_5": 0.06103515625, "loss_aux_layer_6": 0.064208984375, "loss_aux_layer_7": 0.06219482421875, "loss_aux_layer_8": 0.0614013671875, "loss_aux_layer_9": 0.0601806640625, "step": 3793, "total_loss": 0.5786485821008682 }, { "epoch": 0.7511383884379331, "grad_norm": 0.9866130352020264, "learning_rate": 5e-05, "llm_loss": 0.6101184636354446, "loss": 2.7704, "loss_aux_layer_0": 0.013427734375, "loss_aux_layer_1": 0.0333251953125, "loss_aux_layer_10": 0.06011962890625, "loss_aux_layer_11": 0.06414794921875, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.074462890625, "loss_aux_layer_14": 0.0826416015625, "loss_aux_layer_15": 0.0902099609375, "loss_aux_layer_16": 0.098876953125, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.1143798828125, "loss_aux_layer_19": 0.1165771484375, "loss_aux_layer_2": 0.04559326171875, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.0555419921875, "loss_aux_layer_4": 0.05810546875, "loss_aux_layer_5": 0.0595703125, "loss_aux_layer_6": 0.06256103515625, "loss_aux_layer_7": 0.06072998046875, "loss_aux_layer_8": 0.0601806640625, "loss_aux_layer_9": 0.058837890625, "step": 3794, "total_loss": 0.6926093846559525 }, { "epoch": 0.7513363690358345, "grad_norm": 1.2744052410125732, "learning_rate": 5e-05, "llm_loss": 0.5446269810199738, "loss": 2.5086, "loss_aux_layer_0": 0.0138702392578125, "loss_aux_layer_1": 0.03216552734375, "loss_aux_layer_10": 0.059814453125, "loss_aux_layer_11": 0.063720703125, "loss_aux_layer_12": 0.06829833984375, "loss_aux_layer_13": 0.073974609375, "loss_aux_layer_14": 0.081787109375, "loss_aux_layer_15": 0.0894775390625, "loss_aux_layer_16": 0.0985107421875, "loss_aux_layer_17": 0.1063232421875, "loss_aux_layer_18": 0.1142578125, "loss_aux_layer_19": 0.1175537109375, "loss_aux_layer_2": 0.0450439453125, "loss_aux_layer_20": 0.1253662109375, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.054931640625, "loss_aux_layer_4": 0.05731201171875, "loss_aux_layer_5": 0.0587158203125, "loss_aux_layer_6": 0.06182861328125, "loss_aux_layer_7": 0.05987548828125, "loss_aux_layer_8": 0.0594482421875, "loss_aux_layer_9": 0.05841064453125, "step": 3795, "total_loss": 0.6271401345729828 }, { "epoch": 0.7515343496337359, "grad_norm": 0.9326571226119995, "learning_rate": 5e-05, "llm_loss": 0.5998545736074448, "loss": 2.7276, "loss_aux_layer_0": 0.0135040283203125, "loss_aux_layer_1": 0.0321044921875, "loss_aux_layer_10": 0.059326171875, "loss_aux_layer_11": 0.0635986328125, "loss_aux_layer_12": 0.06805419921875, "loss_aux_layer_13": 0.0732421875, "loss_aux_layer_14": 0.0809326171875, "loss_aux_layer_15": 0.0889892578125, "loss_aux_layer_16": 0.097900390625, "loss_aux_layer_17": 0.1053466796875, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.1171875, "loss_aux_layer_2": 0.044677734375, "loss_aux_layer_20": 0.124755859375, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.0545654296875, "loss_aux_layer_4": 0.05706787109375, "loss_aux_layer_5": 0.0587158203125, "loss_aux_layer_6": 0.06158447265625, "loss_aux_layer_7": 0.05999755859375, "loss_aux_layer_8": 0.059326171875, "loss_aux_layer_9": 0.05810546875, "step": 3796, "total_loss": 0.6818944364786148 }, { "epoch": 0.7517323302316373, "grad_norm": 1.0018783807754517, "learning_rate": 5e-05, "llm_loss": 0.6692490726709366, "loss": 3.0177, "loss_aux_layer_0": 0.0126190185546875, "loss_aux_layer_1": 0.03350830078125, "loss_aux_layer_10": 0.0628662109375, "loss_aux_layer_11": 0.0670166015625, "loss_aux_layer_12": 0.071533203125, "loss_aux_layer_13": 0.076904296875, "loss_aux_layer_14": 0.0855712890625, "loss_aux_layer_15": 0.09423828125, "loss_aux_layer_16": 0.103515625, "loss_aux_layer_17": 0.110595703125, "loss_aux_layer_18": 0.1181640625, "loss_aux_layer_19": 0.1207275390625, "loss_aux_layer_2": 0.04656982421875, "loss_aux_layer_20": 0.1279296875, "loss_aux_layer_21": 0.1346435546875, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.191162109375, "loss_aux_layer_3": 0.05694580078125, "loss_aux_layer_4": 0.059814453125, "loss_aux_layer_5": 0.0616455078125, "loss_aux_layer_6": 0.06494140625, "loss_aux_layer_7": 0.06329345703125, "loss_aux_layer_8": 0.06292724609375, "loss_aux_layer_9": 0.061767578125, "step": 3797, "total_loss": 0.7544241547584534 }, { "epoch": 0.7519303108295387, "grad_norm": 0.9993280172348022, "learning_rate": 5e-05, "llm_loss": 0.5941126644611359, "loss": 2.71, "loss_aux_layer_0": 0.013916015625, "loss_aux_layer_1": 0.0330810546875, "loss_aux_layer_10": 0.05999755859375, "loss_aux_layer_11": 0.0640869140625, "loss_aux_layer_12": 0.068603515625, "loss_aux_layer_13": 0.073974609375, "loss_aux_layer_14": 0.08251953125, "loss_aux_layer_15": 0.0906982421875, "loss_aux_layer_16": 0.10009765625, "loss_aux_layer_17": 0.107666015625, "loss_aux_layer_18": 0.115478515625, "loss_aux_layer_19": 0.11865234375, "loss_aux_layer_2": 0.0457763671875, "loss_aux_layer_20": 0.1259765625, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.193603515625, "loss_aux_layer_3": 0.05584716796875, "loss_aux_layer_4": 0.05828857421875, "loss_aux_layer_5": 0.05975341796875, "loss_aux_layer_6": 0.0626220703125, "loss_aux_layer_7": 0.060791015625, "loss_aux_layer_8": 0.0601806640625, "loss_aux_layer_9": 0.0587158203125, "step": 3798, "total_loss": 0.6775049120187759 }, { "epoch": 0.7521282914274401, "grad_norm": 1.0727850198745728, "learning_rate": 5e-05, "llm_loss": 0.5404036045074463, "loss": 2.4945, "loss_aux_layer_0": 0.0123443603515625, "loss_aux_layer_1": 0.032379150390625, "loss_aux_layer_10": 0.05987548828125, "loss_aux_layer_11": 0.0638427734375, "loss_aux_layer_12": 0.0684814453125, "loss_aux_layer_13": 0.0738525390625, "loss_aux_layer_14": 0.082763671875, "loss_aux_layer_15": 0.0906982421875, "loss_aux_layer_16": 0.099853515625, "loss_aux_layer_17": 0.107421875, "loss_aux_layer_18": 0.115966796875, "loss_aux_layer_19": 0.1190185546875, "loss_aux_layer_2": 0.0452880859375, "loss_aux_layer_20": 0.126708984375, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.156494140625, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.05523681640625, "loss_aux_layer_4": 0.05767822265625, "loss_aux_layer_5": 0.05926513671875, "loss_aux_layer_6": 0.062255859375, "loss_aux_layer_7": 0.06024169921875, "loss_aux_layer_8": 0.05950927734375, "loss_aux_layer_9": 0.05859375, "step": 3799, "total_loss": 0.623614490032196 }, { "epoch": 0.7523262720253415, "grad_norm": 0.8151196837425232, "learning_rate": 5e-05, "llm_loss": 0.5355372875928879, "loss": 2.4501, "loss_aux_layer_0": 0.013580322265625, "loss_aux_layer_1": 0.02911376953125, "loss_aux_layer_10": 0.0535888671875, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.0615234375, "loss_aux_layer_13": 0.06646728515625, "loss_aux_layer_14": 0.0743408203125, "loss_aux_layer_15": 0.082763671875, "loss_aux_layer_16": 0.0919189453125, "loss_aux_layer_17": 0.0997314453125, "loss_aux_layer_18": 0.108154296875, "loss_aux_layer_19": 0.1124267578125, "loss_aux_layer_2": 0.04052734375, "loss_aux_layer_20": 0.1209716796875, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.18603515625, "loss_aux_layer_3": 0.049560546875, "loss_aux_layer_4": 0.05169677734375, "loss_aux_layer_5": 0.05322265625, "loss_aux_layer_6": 0.05584716796875, "loss_aux_layer_7": 0.05419921875, "loss_aux_layer_8": 0.05377197265625, "loss_aux_layer_9": 0.05267333984375, "step": 3800, "total_loss": 0.6125184297561646 }, { "epoch": 0.752524252623243, "grad_norm": 1.1199402809143066, "learning_rate": 5e-05, "llm_loss": 0.5782139152288437, "loss": 2.6345, "loss_aux_layer_0": 0.0124664306640625, "loss_aux_layer_1": 0.03125, "loss_aux_layer_10": 0.0574951171875, "loss_aux_layer_11": 0.0614013671875, "loss_aux_layer_12": 0.065673828125, "loss_aux_layer_13": 0.0709228515625, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.0869140625, "loss_aux_layer_16": 0.0958251953125, "loss_aux_layer_17": 0.103759765625, "loss_aux_layer_18": 0.1119384765625, "loss_aux_layer_19": 0.1153564453125, "loss_aux_layer_2": 0.04376220703125, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.1319580078125, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.05340576171875, "loss_aux_layer_4": 0.0557861328125, "loss_aux_layer_5": 0.0570068359375, "loss_aux_layer_6": 0.0595703125, "loss_aux_layer_7": 0.05780029296875, "loss_aux_layer_8": 0.057373046875, "loss_aux_layer_9": 0.05615234375, "step": 3801, "total_loss": 0.6586300134658813 }, { "epoch": 0.7527222332211443, "grad_norm": 0.8770346641540527, "learning_rate": 5e-05, "llm_loss": 0.6023898646235466, "loss": 2.7384, "loss_aux_layer_0": 0.012176513671875, "loss_aux_layer_1": 0.032806396484375, "loss_aux_layer_10": 0.059326171875, "loss_aux_layer_11": 0.06341552734375, "loss_aux_layer_12": 0.067626953125, "loss_aux_layer_13": 0.0731201171875, "loss_aux_layer_14": 0.0816650390625, "loss_aux_layer_15": 0.089599609375, "loss_aux_layer_16": 0.0986328125, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.1148681640625, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.04522705078125, "loss_aux_layer_20": 0.1256103515625, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.054931640625, "loss_aux_layer_4": 0.0576171875, "loss_aux_layer_5": 0.05926513671875, "loss_aux_layer_6": 0.06195068359375, "loss_aux_layer_7": 0.06005859375, "loss_aux_layer_8": 0.05938720703125, "loss_aux_layer_9": 0.05792236328125, "step": 3802, "total_loss": 0.6845982372760773 }, { "epoch": 0.7529202138190457, "grad_norm": 0.9230524897575378, "learning_rate": 5e-05, "llm_loss": 0.5553675442934036, "loss": 2.5521, "loss_aux_layer_0": 0.012542724609375, "loss_aux_layer_1": 0.032806396484375, "loss_aux_layer_10": 0.05938720703125, "loss_aux_layer_11": 0.06353759765625, "loss_aux_layer_12": 0.0679931640625, "loss_aux_layer_13": 0.073974609375, "loss_aux_layer_14": 0.0823974609375, "loss_aux_layer_15": 0.0906982421875, "loss_aux_layer_16": 0.0997314453125, "loss_aux_layer_17": 0.107177734375, "loss_aux_layer_18": 0.115234375, "loss_aux_layer_19": 0.1180419921875, "loss_aux_layer_2": 0.04620361328125, "loss_aux_layer_20": 0.125, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.0557861328125, "loss_aux_layer_4": 0.05804443359375, "loss_aux_layer_5": 0.0594482421875, "loss_aux_layer_6": 0.0623779296875, "loss_aux_layer_7": 0.060302734375, "loss_aux_layer_8": 0.0595703125, "loss_aux_layer_9": 0.0582275390625, "step": 3803, "total_loss": 0.638027086853981 }, { "epoch": 0.7531181944169472, "grad_norm": 0.7939836978912354, "learning_rate": 5e-05, "llm_loss": 0.6052471771836281, "loss": 2.7469, "loss_aux_layer_0": 0.0131072998046875, "loss_aux_layer_1": 0.03143310546875, "loss_aux_layer_10": 0.05828857421875, "loss_aux_layer_11": 0.062255859375, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.080078125, "loss_aux_layer_15": 0.0882568359375, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.1053466796875, "loss_aux_layer_18": 0.1134033203125, "loss_aux_layer_19": 0.1173095703125, "loss_aux_layer_2": 0.0438232421875, "loss_aux_layer_20": 0.12548828125, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.05316162109375, "loss_aux_layer_4": 0.0555419921875, "loss_aux_layer_5": 0.0572509765625, "loss_aux_layer_6": 0.06024169921875, "loss_aux_layer_7": 0.05853271484375, "loss_aux_layer_8": 0.05804443359375, "loss_aux_layer_9": 0.05694580078125, "step": 3804, "total_loss": 0.6867315918207169 }, { "epoch": 0.7533161750148486, "grad_norm": 0.9990840554237366, "learning_rate": 5e-05, "llm_loss": 0.5508506745100021, "loss": 2.5308, "loss_aux_layer_0": 0.012664794921875, "loss_aux_layer_1": 0.032073974609375, "loss_aux_layer_10": 0.05841064453125, "loss_aux_layer_11": 0.06231689453125, "loss_aux_layer_12": 0.06683349609375, "loss_aux_layer_13": 0.072265625, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.08935546875, "loss_aux_layer_16": 0.0987548828125, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.11474609375, "loss_aux_layer_19": 0.11767578125, "loss_aux_layer_2": 0.0447998046875, "loss_aux_layer_20": 0.1258544921875, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.190673828125, "loss_aux_layer_3": 0.0540771484375, "loss_aux_layer_4": 0.056396484375, "loss_aux_layer_5": 0.057861328125, "loss_aux_layer_6": 0.06036376953125, "loss_aux_layer_7": 0.05877685546875, "loss_aux_layer_8": 0.058349609375, "loss_aux_layer_9": 0.05718994140625, "step": 3805, "total_loss": 0.6326911151409149 }, { "epoch": 0.7535141556127499, "grad_norm": 0.9856887459754944, "learning_rate": 5e-05, "llm_loss": 0.5323649793863297, "loss": 2.4638, "loss_aux_layer_0": 0.0123748779296875, "loss_aux_layer_1": 0.03314208984375, "loss_aux_layer_10": 0.05999755859375, "loss_aux_layer_11": 0.064208984375, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.0743408203125, "loss_aux_layer_14": 0.0826416015625, "loss_aux_layer_15": 0.091064453125, "loss_aux_layer_16": 0.1005859375, "loss_aux_layer_17": 0.108154296875, "loss_aux_layer_18": 0.1163330078125, "loss_aux_layer_19": 0.1195068359375, "loss_aux_layer_2": 0.04656982421875, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.056396484375, "loss_aux_layer_4": 0.0587158203125, "loss_aux_layer_5": 0.05999755859375, "loss_aux_layer_6": 0.06268310546875, "loss_aux_layer_7": 0.0609130859375, "loss_aux_layer_8": 0.05999755859375, "loss_aux_layer_9": 0.05865478515625, "step": 3806, "total_loss": 0.6159449964761734 }, { "epoch": 0.7537121362106514, "grad_norm": 1.1876814365386963, "learning_rate": 5e-05, "llm_loss": 0.49270446598529816, "loss": 2.2974, "loss_aux_layer_0": 0.014190673828125, "loss_aux_layer_1": 0.031036376953125, "loss_aux_layer_10": 0.05718994140625, "loss_aux_layer_11": 0.06103515625, "loss_aux_layer_12": 0.06591796875, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.08056640625, "loss_aux_layer_15": 0.089111328125, "loss_aux_layer_16": 0.0987548828125, "loss_aux_layer_17": 0.106689453125, "loss_aux_layer_18": 0.11572265625, "loss_aux_layer_19": 0.119384765625, "loss_aux_layer_2": 0.043701171875, "loss_aux_layer_20": 0.127197265625, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.052978515625, "loss_aux_layer_4": 0.05511474609375, "loss_aux_layer_5": 0.05645751953125, "loss_aux_layer_6": 0.0594482421875, "loss_aux_layer_7": 0.05755615234375, "loss_aux_layer_8": 0.05706787109375, "loss_aux_layer_9": 0.055908203125, "step": 3807, "total_loss": 0.5743423402309418 }, { "epoch": 0.7539101168085528, "grad_norm": 0.882902979850769, "learning_rate": 5e-05, "llm_loss": 0.5095737278461456, "loss": 2.3572, "loss_aux_layer_0": 0.0135040283203125, "loss_aux_layer_1": 0.030029296875, "loss_aux_layer_10": 0.05560302734375, "loss_aux_layer_11": 0.05938720703125, "loss_aux_layer_12": 0.06396484375, "loss_aux_layer_13": 0.069091796875, "loss_aux_layer_14": 0.0780029296875, "loss_aux_layer_15": 0.0865478515625, "loss_aux_layer_16": 0.0958251953125, "loss_aux_layer_17": 0.1033935546875, "loss_aux_layer_18": 0.1119384765625, "loss_aux_layer_19": 0.1160888671875, "loss_aux_layer_2": 0.04217529296875, "loss_aux_layer_20": 0.1248779296875, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.05120849609375, "loss_aux_layer_4": 0.0533447265625, "loss_aux_layer_5": 0.054931640625, "loss_aux_layer_6": 0.0577392578125, "loss_aux_layer_7": 0.05584716796875, "loss_aux_layer_8": 0.05535888671875, "loss_aux_layer_9": 0.05419921875, "step": 3808, "total_loss": 0.5893019288778305 }, { "epoch": 0.7541080974064541, "grad_norm": 1.2809100151062012, "learning_rate": 5e-05, "llm_loss": 0.541342556476593, "loss": 2.4772, "loss_aux_layer_0": 0.0138092041015625, "loss_aux_layer_1": 0.030609130859375, "loss_aux_layer_10": 0.0546875, "loss_aux_layer_11": 0.05804443359375, "loss_aux_layer_12": 0.06207275390625, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.07568359375, "loss_aux_layer_15": 0.083740234375, "loss_aux_layer_16": 0.0931396484375, "loss_aux_layer_17": 0.100830078125, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.1123046875, "loss_aux_layer_2": 0.0428466796875, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.05157470703125, "loss_aux_layer_4": 0.05328369140625, "loss_aux_layer_5": 0.0546875, "loss_aux_layer_6": 0.05718994140625, "loss_aux_layer_7": 0.0552978515625, "loss_aux_layer_8": 0.0548095703125, "loss_aux_layer_9": 0.0535888671875, "step": 3809, "total_loss": 0.6193018779158592 }, { "epoch": 0.7543060780043556, "grad_norm": 0.9504067301750183, "learning_rate": 5e-05, "llm_loss": 0.6099410951137543, "loss": 2.7737, "loss_aux_layer_0": 0.01318359375, "loss_aux_layer_1": 0.03277587890625, "loss_aux_layer_10": 0.0601806640625, "loss_aux_layer_11": 0.06396484375, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.073974609375, "loss_aux_layer_14": 0.08251953125, "loss_aux_layer_15": 0.09130859375, "loss_aux_layer_16": 0.1007080078125, "loss_aux_layer_17": 0.108154296875, "loss_aux_layer_18": 0.115966796875, "loss_aux_layer_19": 0.118896484375, "loss_aux_layer_2": 0.04571533203125, "loss_aux_layer_20": 0.1265869140625, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.05560302734375, "loss_aux_layer_4": 0.0577392578125, "loss_aux_layer_5": 0.059326171875, "loss_aux_layer_6": 0.06201171875, "loss_aux_layer_7": 0.06048583984375, "loss_aux_layer_8": 0.06011962890625, "loss_aux_layer_9": 0.05902099609375, "step": 3810, "total_loss": 0.6934236139059067 }, { "epoch": 0.754504058602257, "grad_norm": 0.9690174460411072, "learning_rate": 5e-05, "llm_loss": 0.6215823367238045, "loss": 2.8274, "loss_aux_layer_0": 0.0130615234375, "loss_aux_layer_1": 0.03326416015625, "loss_aux_layer_10": 0.06317138671875, "loss_aux_layer_11": 0.0673828125, "loss_aux_layer_12": 0.072021484375, "loss_aux_layer_13": 0.077880859375, "loss_aux_layer_14": 0.0863037109375, "loss_aux_layer_15": 0.094482421875, "loss_aux_layer_16": 0.103515625, "loss_aux_layer_17": 0.111328125, "loss_aux_layer_18": 0.1187744140625, "loss_aux_layer_19": 0.121337890625, "loss_aux_layer_2": 0.046875, "loss_aux_layer_20": 0.12841796875, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.05718994140625, "loss_aux_layer_4": 0.05963134765625, "loss_aux_layer_5": 0.06146240234375, "loss_aux_layer_6": 0.06451416015625, "loss_aux_layer_7": 0.0628662109375, "loss_aux_layer_8": 0.0625, "loss_aux_layer_9": 0.06170654296875, "step": 3811, "total_loss": 0.7068624198436737 }, { "epoch": 0.7547020392001584, "grad_norm": 0.8353314995765686, "learning_rate": 5e-05, "llm_loss": 0.5665530115365982, "loss": 2.5801, "loss_aux_layer_0": 0.0128631591796875, "loss_aux_layer_1": 0.02972412109375, "loss_aux_layer_10": 0.0548095703125, "loss_aux_layer_11": 0.05859375, "loss_aux_layer_12": 0.06298828125, "loss_aux_layer_13": 0.0687255859375, "loss_aux_layer_14": 0.076904296875, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.0947265625, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.11083984375, "loss_aux_layer_19": 0.11474609375, "loss_aux_layer_2": 0.04144287109375, "loss_aux_layer_20": 0.12255859375, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.05047607421875, "loss_aux_layer_4": 0.05255126953125, "loss_aux_layer_5": 0.0540771484375, "loss_aux_layer_6": 0.05670166015625, "loss_aux_layer_7": 0.05517578125, "loss_aux_layer_8": 0.0546875, "loss_aux_layer_9": 0.05377197265625, "step": 3812, "total_loss": 0.6450299769639969 }, { "epoch": 0.7549000197980598, "grad_norm": 1.113623023033142, "learning_rate": 5e-05, "llm_loss": 0.5854456871747971, "loss": 2.6815, "loss_aux_layer_0": 0.013275146484375, "loss_aux_layer_1": 0.03521728515625, "loss_aux_layer_10": 0.0626220703125, "loss_aux_layer_11": 0.067138671875, "loss_aux_layer_12": 0.071533203125, "loss_aux_layer_13": 0.0765380859375, "loss_aux_layer_14": 0.0843505859375, "loss_aux_layer_15": 0.0921630859375, "loss_aux_layer_16": 0.1004638671875, "loss_aux_layer_17": 0.1082763671875, "loss_aux_layer_18": 0.115966796875, "loss_aux_layer_19": 0.1187744140625, "loss_aux_layer_2": 0.04815673828125, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.1341552734375, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.0584716796875, "loss_aux_layer_4": 0.06103515625, "loss_aux_layer_5": 0.0625, "loss_aux_layer_6": 0.0655517578125, "loss_aux_layer_7": 0.0634765625, "loss_aux_layer_8": 0.06268310546875, "loss_aux_layer_9": 0.061279296875, "step": 3813, "total_loss": 0.6703819930553436 }, { "epoch": 0.7550980003959612, "grad_norm": 0.9079729318618774, "learning_rate": 5e-05, "llm_loss": 0.5834371894598007, "loss": 2.6547, "loss_aux_layer_0": 0.01263427734375, "loss_aux_layer_1": 0.0313720703125, "loss_aux_layer_10": 0.05767822265625, "loss_aux_layer_11": 0.0616455078125, "loss_aux_layer_12": 0.06585693359375, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.079345703125, "loss_aux_layer_15": 0.087646484375, "loss_aux_layer_16": 0.096923828125, "loss_aux_layer_17": 0.1043701171875, "loss_aux_layer_18": 0.1121826171875, "loss_aux_layer_19": 0.1148681640625, "loss_aux_layer_2": 0.04339599609375, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.18603515625, "loss_aux_layer_3": 0.05316162109375, "loss_aux_layer_4": 0.05548095703125, "loss_aux_layer_5": 0.05694580078125, "loss_aux_layer_6": 0.05999755859375, "loss_aux_layer_7": 0.0582275390625, "loss_aux_layer_8": 0.05767822265625, "loss_aux_layer_9": 0.05645751953125, "step": 3814, "total_loss": 0.6636631339788437 }, { "epoch": 0.7552959809938626, "grad_norm": 1.2713154554367065, "learning_rate": 5e-05, "llm_loss": 0.473255917429924, "loss": 2.2199, "loss_aux_layer_0": 0.01385498046875, "loss_aux_layer_1": 0.03125, "loss_aux_layer_10": 0.0579833984375, "loss_aux_layer_11": 0.06201171875, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.08935546875, "loss_aux_layer_16": 0.0985107421875, "loss_aux_layer_17": 0.1060791015625, "loss_aux_layer_18": 0.11474609375, "loss_aux_layer_19": 0.11767578125, "loss_aux_layer_2": 0.04443359375, "loss_aux_layer_20": 0.125244140625, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.0538330078125, "loss_aux_layer_4": 0.055908203125, "loss_aux_layer_5": 0.0574951171875, "loss_aux_layer_6": 0.06048583984375, "loss_aux_layer_7": 0.058349609375, "loss_aux_layer_8": 0.0579833984375, "loss_aux_layer_9": 0.05670166015625, "step": 3815, "total_loss": 0.5549718886613846 }, { "epoch": 0.755493961591764, "grad_norm": 0.8382353782653809, "learning_rate": 5e-05, "llm_loss": 0.6548231095075607, "loss": 2.9569, "loss_aux_layer_0": 0.012451171875, "loss_aux_layer_1": 0.033477783203125, "loss_aux_layer_10": 0.061767578125, "loss_aux_layer_11": 0.06597900390625, "loss_aux_layer_12": 0.07049560546875, "loss_aux_layer_13": 0.076171875, "loss_aux_layer_14": 0.08447265625, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.1015625, "loss_aux_layer_17": 0.1097412109375, "loss_aux_layer_18": 0.117919921875, "loss_aux_layer_19": 0.12060546875, "loss_aux_layer_2": 0.046630859375, "loss_aux_layer_20": 0.1280517578125, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.0570068359375, "loss_aux_layer_4": 0.05975341796875, "loss_aux_layer_5": 0.06134033203125, "loss_aux_layer_6": 0.0645751953125, "loss_aux_layer_7": 0.06268310546875, "loss_aux_layer_8": 0.0618896484375, "loss_aux_layer_9": 0.0604248046875, "step": 3816, "total_loss": 0.7392178028821945 }, { "epoch": 0.7556919421896654, "grad_norm": 1.1514856815338135, "learning_rate": 5e-05, "llm_loss": 0.5633585900068283, "loss": 2.5662, "loss_aux_layer_0": 0.013427734375, "loss_aux_layer_1": 0.030120849609375, "loss_aux_layer_10": 0.05499267578125, "loss_aux_layer_11": 0.05889892578125, "loss_aux_layer_12": 0.0631103515625, "loss_aux_layer_13": 0.068603515625, "loss_aux_layer_14": 0.07666015625, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.0936279296875, "loss_aux_layer_17": 0.1014404296875, "loss_aux_layer_18": 0.109375, "loss_aux_layer_19": 0.113525390625, "loss_aux_layer_2": 0.0419921875, "loss_aux_layer_20": 0.12158203125, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.05078125, "loss_aux_layer_4": 0.05291748046875, "loss_aux_layer_5": 0.05438232421875, "loss_aux_layer_6": 0.0570068359375, "loss_aux_layer_7": 0.055419921875, "loss_aux_layer_8": 0.054931640625, "loss_aux_layer_9": 0.0538330078125, "step": 3817, "total_loss": 0.6415546983480453 }, { "epoch": 0.7558899227875668, "grad_norm": 0.888146698474884, "learning_rate": 5e-05, "llm_loss": 0.4972095340490341, "loss": 2.3028, "loss_aux_layer_0": 0.012847900390625, "loss_aux_layer_1": 0.03009033203125, "loss_aux_layer_10": 0.0556640625, "loss_aux_layer_11": 0.05914306640625, "loss_aux_layer_12": 0.0631103515625, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.076171875, "loss_aux_layer_15": 0.08447265625, "loss_aux_layer_16": 0.09375, "loss_aux_layer_17": 0.1019287109375, "loss_aux_layer_18": 0.10986328125, "loss_aux_layer_19": 0.11376953125, "loss_aux_layer_2": 0.042724609375, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.1302490234375, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.18603515625, "loss_aux_layer_3": 0.05169677734375, "loss_aux_layer_4": 0.05364990234375, "loss_aux_layer_5": 0.0550537109375, "loss_aux_layer_6": 0.05767822265625, "loss_aux_layer_7": 0.05609130859375, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.0546875, "step": 3818, "total_loss": 0.5757111608982086 }, { "epoch": 0.7560879033854683, "grad_norm": 0.9667795896530151, "learning_rate": 5e-05, "llm_loss": 0.6286542266607285, "loss": 2.8353, "loss_aux_layer_0": 0.01348876953125, "loss_aux_layer_1": 0.031494140625, "loss_aux_layer_10": 0.05767822265625, "loss_aux_layer_11": 0.06158447265625, "loss_aux_layer_12": 0.06597900390625, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.07958984375, "loss_aux_layer_15": 0.087890625, "loss_aux_layer_16": 0.0968017578125, "loss_aux_layer_17": 0.104736328125, "loss_aux_layer_18": 0.112060546875, "loss_aux_layer_19": 0.115234375, "loss_aux_layer_2": 0.043701171875, "loss_aux_layer_20": 0.123046875, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.0528564453125, "loss_aux_layer_4": 0.055419921875, "loss_aux_layer_5": 0.056640625, "loss_aux_layer_6": 0.0595703125, "loss_aux_layer_7": 0.05792236328125, "loss_aux_layer_8": 0.0574951171875, "loss_aux_layer_9": 0.0565185546875, "step": 3819, "total_loss": 0.7088132500648499 }, { "epoch": 0.7562858839833696, "grad_norm": 1.1111688613891602, "learning_rate": 5e-05, "llm_loss": 0.5967680066823959, "loss": 2.7119, "loss_aux_layer_0": 0.012451171875, "loss_aux_layer_1": 0.030609130859375, "loss_aux_layer_10": 0.05755615234375, "loss_aux_layer_11": 0.061767578125, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.0723876953125, "loss_aux_layer_14": 0.0810546875, "loss_aux_layer_15": 0.08935546875, "loss_aux_layer_16": 0.09912109375, "loss_aux_layer_17": 0.1068115234375, "loss_aux_layer_18": 0.115478515625, "loss_aux_layer_19": 0.1182861328125, "loss_aux_layer_2": 0.043212890625, "loss_aux_layer_20": 0.125732421875, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.0523681640625, "loss_aux_layer_4": 0.05462646484375, "loss_aux_layer_5": 0.05633544921875, "loss_aux_layer_6": 0.05889892578125, "loss_aux_layer_7": 0.0572509765625, "loss_aux_layer_8": 0.05670166015625, "loss_aux_layer_9": 0.055908203125, "step": 3820, "total_loss": 0.6779804676771164 }, { "epoch": 0.756483864581271, "grad_norm": 1.1473197937011719, "learning_rate": 5e-05, "llm_loss": 0.5560541301965714, "loss": 2.5579, "loss_aux_layer_0": 0.0131683349609375, "loss_aux_layer_1": 0.032958984375, "loss_aux_layer_10": 0.06085205078125, "loss_aux_layer_11": 0.06524658203125, "loss_aux_layer_12": 0.0697021484375, "loss_aux_layer_13": 0.075439453125, "loss_aux_layer_14": 0.0836181640625, "loss_aux_layer_15": 0.0921630859375, "loss_aux_layer_16": 0.10107421875, "loss_aux_layer_17": 0.108154296875, "loss_aux_layer_18": 0.11572265625, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.04595947265625, "loss_aux_layer_20": 0.12548828125, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.05615234375, "loss_aux_layer_4": 0.05853271484375, "loss_aux_layer_5": 0.059814453125, "loss_aux_layer_6": 0.0631103515625, "loss_aux_layer_7": 0.061279296875, "loss_aux_layer_8": 0.06072998046875, "loss_aux_layer_9": 0.05950927734375, "step": 3821, "total_loss": 0.6394772827625275 }, { "epoch": 0.7566818451791725, "grad_norm": 1.1110972166061401, "learning_rate": 5e-05, "llm_loss": 0.519918367266655, "loss": 2.406, "loss_aux_layer_0": 0.0135955810546875, "loss_aux_layer_1": 0.03082275390625, "loss_aux_layer_10": 0.05780029296875, "loss_aux_layer_11": 0.06195068359375, "loss_aux_layer_12": 0.0662841796875, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.0802001953125, "loss_aux_layer_15": 0.088134765625, "loss_aux_layer_16": 0.097412109375, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.1175537109375, "loss_aux_layer_2": 0.04351806640625, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.1953125, "loss_aux_layer_3": 0.05303955078125, "loss_aux_layer_4": 0.0552978515625, "loss_aux_layer_5": 0.05682373046875, "loss_aux_layer_6": 0.05938720703125, "loss_aux_layer_7": 0.05804443359375, "loss_aux_layer_8": 0.05743408203125, "loss_aux_layer_9": 0.056396484375, "step": 3822, "total_loss": 0.601510152220726 }, { "epoch": 0.7568798257770738, "grad_norm": 0.9384027123451233, "learning_rate": 5e-05, "llm_loss": 0.5055287927389145, "loss": 2.3536, "loss_aux_layer_0": 0.013153076171875, "loss_aux_layer_1": 0.03167724609375, "loss_aux_layer_10": 0.0592041015625, "loss_aux_layer_11": 0.06317138671875, "loss_aux_layer_12": 0.0677490234375, "loss_aux_layer_13": 0.0736083984375, "loss_aux_layer_14": 0.081787109375, "loss_aux_layer_15": 0.090087890625, "loss_aux_layer_16": 0.0992431640625, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.1151123046875, "loss_aux_layer_19": 0.1181640625, "loss_aux_layer_2": 0.044921875, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.054443359375, "loss_aux_layer_4": 0.05682373046875, "loss_aux_layer_5": 0.05853271484375, "loss_aux_layer_6": 0.06146240234375, "loss_aux_layer_7": 0.0596923828125, "loss_aux_layer_8": 0.0592041015625, "loss_aux_layer_9": 0.05810546875, "step": 3823, "total_loss": 0.5884089693427086 }, { "epoch": 0.7570778063749752, "grad_norm": 0.994826078414917, "learning_rate": 5e-05, "llm_loss": 0.5596804767847061, "loss": 2.5785, "loss_aux_layer_0": 0.0130615234375, "loss_aux_layer_1": 0.03271484375, "loss_aux_layer_10": 0.06109619140625, "loss_aux_layer_11": 0.065185546875, "loss_aux_layer_12": 0.069580078125, "loss_aux_layer_13": 0.0753173828125, "loss_aux_layer_14": 0.0845947265625, "loss_aux_layer_15": 0.0938720703125, "loss_aux_layer_16": 0.103515625, "loss_aux_layer_17": 0.1112060546875, "loss_aux_layer_18": 0.1190185546875, "loss_aux_layer_19": 0.1220703125, "loss_aux_layer_2": 0.04595947265625, "loss_aux_layer_20": 0.1297607421875, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.0557861328125, "loss_aux_layer_4": 0.05828857421875, "loss_aux_layer_5": 0.06005859375, "loss_aux_layer_6": 0.063232421875, "loss_aux_layer_7": 0.0615234375, "loss_aux_layer_8": 0.06103515625, "loss_aux_layer_9": 0.059814453125, "step": 3824, "total_loss": 0.6446148604154587 }, { "epoch": 0.7572757869728767, "grad_norm": 0.881986141204834, "learning_rate": 5e-05, "llm_loss": 0.622541755437851, "loss": 2.797, "loss_aux_layer_0": 0.0125885009765625, "loss_aux_layer_1": 0.029449462890625, "loss_aux_layer_10": 0.0543212890625, "loss_aux_layer_11": 0.057861328125, "loss_aux_layer_12": 0.06207275390625, "loss_aux_layer_13": 0.06689453125, "loss_aux_layer_14": 0.0748291015625, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.091552734375, "loss_aux_layer_17": 0.099609375, "loss_aux_layer_18": 0.1077880859375, "loss_aux_layer_19": 0.111572265625, "loss_aux_layer_2": 0.04119873046875, "loss_aux_layer_20": 0.11962890625, "loss_aux_layer_21": 0.126953125, "loss_aux_layer_22": 0.14599609375, "loss_aux_layer_23": 0.181640625, "loss_aux_layer_3": 0.05023193359375, "loss_aux_layer_4": 0.0523681640625, "loss_aux_layer_5": 0.05364990234375, "loss_aux_layer_6": 0.05621337890625, "loss_aux_layer_7": 0.0546875, "loss_aux_layer_8": 0.05401611328125, "loss_aux_layer_9": 0.05291748046875, "step": 3825, "total_loss": 0.6992589235305786 }, { "epoch": 0.7574737675707781, "grad_norm": 0.9151471257209778, "learning_rate": 5e-05, "llm_loss": 0.5468971282243729, "loss": 2.5159, "loss_aux_layer_0": 0.0138397216796875, "loss_aux_layer_1": 0.03167724609375, "loss_aux_layer_10": 0.05902099609375, "loss_aux_layer_11": 0.062744140625, "loss_aux_layer_12": 0.06695556640625, "loss_aux_layer_13": 0.0723876953125, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.089599609375, "loss_aux_layer_16": 0.0994873046875, "loss_aux_layer_17": 0.10693359375, "loss_aux_layer_18": 0.1151123046875, "loss_aux_layer_19": 0.1182861328125, "loss_aux_layer_2": 0.04425048828125, "loss_aux_layer_20": 0.125732421875, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.05389404296875, "loss_aux_layer_4": 0.05657958984375, "loss_aux_layer_5": 0.0584716796875, "loss_aux_layer_6": 0.061279296875, "loss_aux_layer_7": 0.05975341796875, "loss_aux_layer_8": 0.05908203125, "loss_aux_layer_9": 0.057861328125, "step": 3826, "total_loss": 0.6289679557085037 }, { "epoch": 0.7576717481686794, "grad_norm": 0.9432250261306763, "learning_rate": 5e-05, "llm_loss": 0.5719822719693184, "loss": 2.613, "loss_aux_layer_0": 0.01318359375, "loss_aux_layer_1": 0.03076171875, "loss_aux_layer_10": 0.056884765625, "loss_aux_layer_11": 0.0609130859375, "loss_aux_layer_12": 0.0653076171875, "loss_aux_layer_13": 0.0704345703125, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.0875244140625, "loss_aux_layer_16": 0.097412109375, "loss_aux_layer_17": 0.10546875, "loss_aux_layer_18": 0.1141357421875, "loss_aux_layer_19": 0.118896484375, "loss_aux_layer_2": 0.04278564453125, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.19482421875, "loss_aux_layer_3": 0.05218505859375, "loss_aux_layer_4": 0.05462646484375, "loss_aux_layer_5": 0.05621337890625, "loss_aux_layer_6": 0.0592041015625, "loss_aux_layer_7": 0.05755615234375, "loss_aux_layer_8": 0.05706787109375, "loss_aux_layer_9": 0.05584716796875, "step": 3827, "total_loss": 0.6532543897628784 }, { "epoch": 0.7578697287665809, "grad_norm": 0.8447707891464233, "learning_rate": 5e-05, "llm_loss": 0.5439417883753777, "loss": 2.5093, "loss_aux_layer_0": 0.0140838623046875, "loss_aux_layer_1": 0.0333251953125, "loss_aux_layer_10": 0.05999755859375, "loss_aux_layer_11": 0.06414794921875, "loss_aux_layer_12": 0.06854248046875, "loss_aux_layer_13": 0.0740966796875, "loss_aux_layer_14": 0.082763671875, "loss_aux_layer_15": 0.091064453125, "loss_aux_layer_16": 0.1004638671875, "loss_aux_layer_17": 0.10791015625, "loss_aux_layer_18": 0.11572265625, "loss_aux_layer_19": 0.1192626953125, "loss_aux_layer_2": 0.04559326171875, "loss_aux_layer_20": 0.1273193359375, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.0550537109375, "loss_aux_layer_4": 0.05731201171875, "loss_aux_layer_5": 0.05889892578125, "loss_aux_layer_6": 0.06201171875, "loss_aux_layer_7": 0.06024169921875, "loss_aux_layer_8": 0.05975341796875, "loss_aux_layer_9": 0.0587158203125, "step": 3828, "total_loss": 0.627312958240509 }, { "epoch": 0.7580677093644823, "grad_norm": 0.8030197024345398, "learning_rate": 5e-05, "llm_loss": 0.5912618786096573, "loss": 2.6883, "loss_aux_layer_0": 0.0128936767578125, "loss_aux_layer_1": 0.03094482421875, "loss_aux_layer_10": 0.0572509765625, "loss_aux_layer_11": 0.061279296875, "loss_aux_layer_12": 0.065673828125, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.0792236328125, "loss_aux_layer_15": 0.088134765625, "loss_aux_layer_16": 0.09765625, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.11328125, "loss_aux_layer_19": 0.117431640625, "loss_aux_layer_2": 0.04351806640625, "loss_aux_layer_20": 0.125, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.052978515625, "loss_aux_layer_4": 0.05535888671875, "loss_aux_layer_5": 0.05670166015625, "loss_aux_layer_6": 0.0596923828125, "loss_aux_layer_7": 0.0579833984375, "loss_aux_layer_8": 0.0576171875, "loss_aux_layer_9": 0.05609130859375, "step": 3829, "total_loss": 0.6720829755067825 }, { "epoch": 0.7582656899623836, "grad_norm": 0.8549922108650208, "learning_rate": 5e-05, "llm_loss": 0.6083531379699707, "loss": 2.7622, "loss_aux_layer_0": 0.013946533203125, "loss_aux_layer_1": 0.03253173828125, "loss_aux_layer_10": 0.05853271484375, "loss_aux_layer_11": 0.06268310546875, "loss_aux_layer_12": 0.0673828125, "loss_aux_layer_13": 0.0728759765625, "loss_aux_layer_14": 0.081298828125, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.0994873046875, "loss_aux_layer_17": 0.1068115234375, "loss_aux_layer_18": 0.1146240234375, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.045166015625, "loss_aux_layer_20": 0.12548828125, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.05474853515625, "loss_aux_layer_4": 0.056884765625, "loss_aux_layer_5": 0.05841064453125, "loss_aux_layer_6": 0.06103515625, "loss_aux_layer_7": 0.0592041015625, "loss_aux_layer_8": 0.05853271484375, "loss_aux_layer_9": 0.0572509765625, "step": 3830, "total_loss": 0.6905502825975418 }, { "epoch": 0.7584636705602851, "grad_norm": 0.8655327558517456, "learning_rate": 5e-05, "llm_loss": 0.5738126635551453, "loss": 2.618, "loss_aux_layer_0": 0.012664794921875, "loss_aux_layer_1": 0.02978515625, "loss_aux_layer_10": 0.057373046875, "loss_aux_layer_11": 0.06109619140625, "loss_aux_layer_12": 0.0655517578125, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.0797119140625, "loss_aux_layer_15": 0.088623046875, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.1060791015625, "loss_aux_layer_18": 0.1146240234375, "loss_aux_layer_19": 0.117919921875, "loss_aux_layer_2": 0.04229736328125, "loss_aux_layer_20": 0.1258544921875, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.05133056640625, "loss_aux_layer_4": 0.053955078125, "loss_aux_layer_5": 0.05584716796875, "loss_aux_layer_6": 0.05908203125, "loss_aux_layer_7": 0.0576171875, "loss_aux_layer_8": 0.05712890625, "loss_aux_layer_9": 0.05596923828125, "step": 3831, "total_loss": 0.6545015275478363 }, { "epoch": 0.7586616511581865, "grad_norm": 0.8348923921585083, "learning_rate": 5e-05, "llm_loss": 0.6115492135286331, "loss": 2.7595, "loss_aux_layer_0": 0.0138092041015625, "loss_aux_layer_1": 0.029937744140625, "loss_aux_layer_10": 0.05523681640625, "loss_aux_layer_11": 0.0592041015625, "loss_aux_layer_12": 0.06353759765625, "loss_aux_layer_13": 0.06878662109375, "loss_aux_layer_14": 0.076904296875, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.1104736328125, "loss_aux_layer_19": 0.114501953125, "loss_aux_layer_2": 0.0408935546875, "loss_aux_layer_20": 0.1226806640625, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.05010986328125, "loss_aux_layer_4": 0.05255126953125, "loss_aux_layer_5": 0.0540771484375, "loss_aux_layer_6": 0.05682373046875, "loss_aux_layer_7": 0.05523681640625, "loss_aux_layer_8": 0.05474853515625, "loss_aux_layer_9": 0.05389404296875, "step": 3832, "total_loss": 0.6898789554834366 }, { "epoch": 0.758859631756088, "grad_norm": 0.9249614477157593, "learning_rate": 5e-05, "llm_loss": 0.604234904050827, "loss": 2.7296, "loss_aux_layer_0": 0.0132598876953125, "loss_aux_layer_1": 0.02886962890625, "loss_aux_layer_10": 0.05462646484375, "loss_aux_layer_11": 0.05816650390625, "loss_aux_layer_12": 0.06280517578125, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.076416015625, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.094482421875, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.1109619140625, "loss_aux_layer_19": 0.11474609375, "loss_aux_layer_2": 0.04052734375, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.04962158203125, "loss_aux_layer_4": 0.05169677734375, "loss_aux_layer_5": 0.05322265625, "loss_aux_layer_6": 0.0557861328125, "loss_aux_layer_7": 0.05419921875, "loss_aux_layer_8": 0.053955078125, "loss_aux_layer_9": 0.05328369140625, "step": 3833, "total_loss": 0.6824090927839279 }, { "epoch": 0.7590576123539893, "grad_norm": 0.9454010128974915, "learning_rate": 5e-05, "llm_loss": 0.6077651157975197, "loss": 2.7637, "loss_aux_layer_0": 0.0126190185546875, "loss_aux_layer_1": 0.03240966796875, "loss_aux_layer_10": 0.05999755859375, "loss_aux_layer_11": 0.06390380859375, "loss_aux_layer_12": 0.0679931640625, "loss_aux_layer_13": 0.07373046875, "loss_aux_layer_14": 0.0826416015625, "loss_aux_layer_15": 0.0908203125, "loss_aux_layer_16": 0.099609375, "loss_aux_layer_17": 0.1075439453125, "loss_aux_layer_18": 0.1158447265625, "loss_aux_layer_19": 0.1185302734375, "loss_aux_layer_2": 0.0458984375, "loss_aux_layer_20": 0.1259765625, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.05609130859375, "loss_aux_layer_4": 0.0584716796875, "loss_aux_layer_5": 0.06005859375, "loss_aux_layer_6": 0.0631103515625, "loss_aux_layer_7": 0.06097412109375, "loss_aux_layer_8": 0.060302734375, "loss_aux_layer_9": 0.05902099609375, "step": 3834, "total_loss": 0.6909224987030029 }, { "epoch": 0.7592555929518907, "grad_norm": 0.7969250082969666, "learning_rate": 5e-05, "llm_loss": 0.545118123292923, "loss": 2.51, "loss_aux_layer_0": 0.0128021240234375, "loss_aux_layer_1": 0.03204345703125, "loss_aux_layer_10": 0.059326171875, "loss_aux_layer_11": 0.06341552734375, "loss_aux_layer_12": 0.06787109375, "loss_aux_layer_13": 0.0731201171875, "loss_aux_layer_14": 0.08154296875, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.0992431640625, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.115234375, "loss_aux_layer_19": 0.118408203125, "loss_aux_layer_2": 0.04425048828125, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.190673828125, "loss_aux_layer_3": 0.05426025390625, "loss_aux_layer_4": 0.056884765625, "loss_aux_layer_5": 0.05853271484375, "loss_aux_layer_6": 0.0614013671875, "loss_aux_layer_7": 0.05950927734375, "loss_aux_layer_8": 0.05902099609375, "loss_aux_layer_9": 0.0579833984375, "step": 3835, "total_loss": 0.6274876147508621 }, { "epoch": 0.7594535735497921, "grad_norm": 0.9409666657447815, "learning_rate": 5e-05, "llm_loss": 0.5670329630374908, "loss": 2.59, "loss_aux_layer_0": 0.0124053955078125, "loss_aux_layer_1": 0.030487060546875, "loss_aux_layer_10": 0.056884765625, "loss_aux_layer_11": 0.06097412109375, "loss_aux_layer_12": 0.0654296875, "loss_aux_layer_13": 0.071044921875, "loss_aux_layer_14": 0.0794677734375, "loss_aux_layer_15": 0.08837890625, "loss_aux_layer_16": 0.09765625, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.1171875, "loss_aux_layer_2": 0.04278564453125, "loss_aux_layer_20": 0.1251220703125, "loss_aux_layer_21": 0.1322021484375, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.052001953125, "loss_aux_layer_4": 0.054443359375, "loss_aux_layer_5": 0.05596923828125, "loss_aux_layer_6": 0.0589599609375, "loss_aux_layer_7": 0.057373046875, "loss_aux_layer_8": 0.05694580078125, "loss_aux_layer_9": 0.0556640625, "step": 3836, "total_loss": 0.6475056558847427 }, { "epoch": 0.7596515541476935, "grad_norm": 1.0680806636810303, "learning_rate": 5e-05, "llm_loss": 0.5205685049295425, "loss": 2.3975, "loss_aux_layer_0": 0.01300048828125, "loss_aux_layer_1": 0.02899169921875, "loss_aux_layer_10": 0.0552978515625, "loss_aux_layer_11": 0.0587158203125, "loss_aux_layer_12": 0.0628662109375, "loss_aux_layer_13": 0.068359375, "loss_aux_layer_14": 0.0772705078125, "loss_aux_layer_15": 0.0859375, "loss_aux_layer_16": 0.0955810546875, "loss_aux_layer_17": 0.103759765625, "loss_aux_layer_18": 0.1124267578125, "loss_aux_layer_19": 0.1163330078125, "loss_aux_layer_2": 0.041259765625, "loss_aux_layer_20": 0.1243896484375, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.0501708984375, "loss_aux_layer_4": 0.05255126953125, "loss_aux_layer_5": 0.05419921875, "loss_aux_layer_6": 0.0570068359375, "loss_aux_layer_7": 0.05517578125, "loss_aux_layer_8": 0.054931640625, "loss_aux_layer_9": 0.05401611328125, "step": 3837, "total_loss": 0.5993775725364685 }, { "epoch": 0.7598495347455949, "grad_norm": 1.261975884437561, "learning_rate": 5e-05, "llm_loss": 0.5357611253857613, "loss": 2.4669, "loss_aux_layer_0": 0.0123443603515625, "loss_aux_layer_1": 0.030181884765625, "loss_aux_layer_10": 0.0570068359375, "loss_aux_layer_11": 0.0609130859375, "loss_aux_layer_12": 0.06536865234375, "loss_aux_layer_13": 0.071044921875, "loss_aux_layer_14": 0.0792236328125, "loss_aux_layer_15": 0.08837890625, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.1171875, "loss_aux_layer_2": 0.04302978515625, "loss_aux_layer_20": 0.125244140625, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.156494140625, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.05206298828125, "loss_aux_layer_4": 0.0543212890625, "loss_aux_layer_5": 0.05596923828125, "loss_aux_layer_6": 0.05889892578125, "loss_aux_layer_7": 0.05712890625, "loss_aux_layer_8": 0.05682373046875, "loss_aux_layer_9": 0.0556640625, "step": 3838, "total_loss": 0.616720512509346 }, { "epoch": 0.7600475153434963, "grad_norm": 1.398049235343933, "learning_rate": 5e-05, "llm_loss": 0.5986660867929459, "loss": 2.7168, "loss_aux_layer_0": 0.013458251953125, "loss_aux_layer_1": 0.030181884765625, "loss_aux_layer_10": 0.056884765625, "loss_aux_layer_11": 0.0604248046875, "loss_aux_layer_12": 0.0653076171875, "loss_aux_layer_13": 0.0712890625, "loss_aux_layer_14": 0.080078125, "loss_aux_layer_15": 0.0887451171875, "loss_aux_layer_16": 0.0985107421875, "loss_aux_layer_17": 0.1064453125, "loss_aux_layer_18": 0.114501953125, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.04315185546875, "loss_aux_layer_20": 0.125244140625, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.0523681640625, "loss_aux_layer_4": 0.05474853515625, "loss_aux_layer_5": 0.05633544921875, "loss_aux_layer_6": 0.05902099609375, "loss_aux_layer_7": 0.05682373046875, "loss_aux_layer_8": 0.05645751953125, "loss_aux_layer_9": 0.05548095703125, "step": 3839, "total_loss": 0.6791925877332687 }, { "epoch": 0.7602454959413978, "grad_norm": 1.163476824760437, "learning_rate": 5e-05, "llm_loss": 0.6639275103807449, "loss": 2.9884, "loss_aux_layer_0": 0.013153076171875, "loss_aux_layer_1": 0.0318603515625, "loss_aux_layer_10": 0.0592041015625, "loss_aux_layer_11": 0.0633544921875, "loss_aux_layer_12": 0.06781005859375, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.0816650390625, "loss_aux_layer_15": 0.089599609375, "loss_aux_layer_16": 0.0994873046875, "loss_aux_layer_17": 0.1075439453125, "loss_aux_layer_18": 0.11572265625, "loss_aux_layer_19": 0.1195068359375, "loss_aux_layer_2": 0.04473876953125, "loss_aux_layer_20": 0.127685546875, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.05474853515625, "loss_aux_layer_4": 0.05712890625, "loss_aux_layer_5": 0.0587158203125, "loss_aux_layer_6": 0.06158447265625, "loss_aux_layer_7": 0.0595703125, "loss_aux_layer_8": 0.05889892578125, "loss_aux_layer_9": 0.05780029296875, "step": 3840, "total_loss": 0.7470989525318146 }, { "epoch": 0.7604434765392991, "grad_norm": 1.6367661952972412, "learning_rate": 5e-05, "llm_loss": 0.6054153144359589, "loss": 2.7492, "loss_aux_layer_0": 0.01336669921875, "loss_aux_layer_1": 0.030609130859375, "loss_aux_layer_10": 0.05792236328125, "loss_aux_layer_11": 0.06195068359375, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.0811767578125, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.0994873046875, "loss_aux_layer_17": 0.1075439453125, "loss_aux_layer_18": 0.1160888671875, "loss_aux_layer_19": 0.119873046875, "loss_aux_layer_2": 0.0439453125, "loss_aux_layer_20": 0.126953125, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05303955078125, "loss_aux_layer_4": 0.0552978515625, "loss_aux_layer_5": 0.0567626953125, "loss_aux_layer_6": 0.05987548828125, "loss_aux_layer_7": 0.05816650390625, "loss_aux_layer_8": 0.0577392578125, "loss_aux_layer_9": 0.0565185546875, "step": 3841, "total_loss": 0.6872905790805817 }, { "epoch": 0.7606414571372005, "grad_norm": 1.0704761743545532, "learning_rate": 5e-05, "llm_loss": 0.5682032406330109, "loss": 2.596, "loss_aux_layer_0": 0.0128021240234375, "loss_aux_layer_1": 0.029998779296875, "loss_aux_layer_10": 0.056396484375, "loss_aux_layer_11": 0.06024169921875, "loss_aux_layer_12": 0.064697265625, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.078857421875, "loss_aux_layer_15": 0.08740234375, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.1142578125, "loss_aux_layer_19": 0.1180419921875, "loss_aux_layer_2": 0.042236328125, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.194580078125, "loss_aux_layer_3": 0.05181884765625, "loss_aux_layer_4": 0.05413818359375, "loss_aux_layer_5": 0.055419921875, "loss_aux_layer_6": 0.05865478515625, "loss_aux_layer_7": 0.0570068359375, "loss_aux_layer_8": 0.056396484375, "loss_aux_layer_9": 0.05517578125, "step": 3842, "total_loss": 0.649004265666008 }, { "epoch": 0.760839437735102, "grad_norm": 1.1025868654251099, "learning_rate": 5e-05, "llm_loss": 0.5976987779140472, "loss": 2.7019, "loss_aux_layer_0": 0.013458251953125, "loss_aux_layer_1": 0.03021240234375, "loss_aux_layer_10": 0.05389404296875, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.06146240234375, "loss_aux_layer_13": 0.06671142578125, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.10986328125, "loss_aux_layer_19": 0.114013671875, "loss_aux_layer_2": 0.042236328125, "loss_aux_layer_20": 0.1226806640625, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.05096435546875, "loss_aux_layer_4": 0.052734375, "loss_aux_layer_5": 0.05389404296875, "loss_aux_layer_6": 0.05621337890625, "loss_aux_layer_7": 0.0546875, "loss_aux_layer_8": 0.0540771484375, "loss_aux_layer_9": 0.0528564453125, "step": 3843, "total_loss": 0.6754785627126694 }, { "epoch": 0.7610374183330033, "grad_norm": 0.9844663739204407, "learning_rate": 5e-05, "llm_loss": 0.6268584281206131, "loss": 2.8299, "loss_aux_layer_0": 0.0133514404296875, "loss_aux_layer_1": 0.03045654296875, "loss_aux_layer_10": 0.0572509765625, "loss_aux_layer_11": 0.06146240234375, "loss_aux_layer_12": 0.06585693359375, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.0799560546875, "loss_aux_layer_15": 0.0880126953125, "loss_aux_layer_16": 0.0970458984375, "loss_aux_layer_17": 0.1048583984375, "loss_aux_layer_18": 0.11328125, "loss_aux_layer_19": 0.11669921875, "loss_aux_layer_2": 0.04278564453125, "loss_aux_layer_20": 0.12451171875, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.05194091796875, "loss_aux_layer_4": 0.05438232421875, "loss_aux_layer_5": 0.0560302734375, "loss_aux_layer_6": 0.05889892578125, "loss_aux_layer_7": 0.05718994140625, "loss_aux_layer_8": 0.0567626953125, "loss_aux_layer_9": 0.05584716796875, "step": 3844, "total_loss": 0.7074690014123917 }, { "epoch": 0.7612353989309047, "grad_norm": 0.9439274668693542, "learning_rate": 5e-05, "llm_loss": 0.4859575033187866, "loss": 2.2746, "loss_aux_layer_0": 0.0125732421875, "loss_aux_layer_1": 0.033233642578125, "loss_aux_layer_10": 0.06085205078125, "loss_aux_layer_11": 0.0648193359375, "loss_aux_layer_12": 0.06915283203125, "loss_aux_layer_13": 0.07421875, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.0894775390625, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.105712890625, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.1165771484375, "loss_aux_layer_2": 0.04608154296875, "loss_aux_layer_20": 0.1241455078125, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.0560302734375, "loss_aux_layer_4": 0.058349609375, "loss_aux_layer_5": 0.0599365234375, "loss_aux_layer_6": 0.06292724609375, "loss_aux_layer_7": 0.0614013671875, "loss_aux_layer_8": 0.060791015625, "loss_aux_layer_9": 0.0594482421875, "step": 3845, "total_loss": 0.5686388462781906 }, { "epoch": 0.7614333795288062, "grad_norm": 1.0693331956863403, "learning_rate": 5e-05, "llm_loss": 0.5552749037742615, "loss": 2.5359, "loss_aux_layer_0": 0.012786865234375, "loss_aux_layer_1": 0.0296630859375, "loss_aux_layer_10": 0.05487060546875, "loss_aux_layer_11": 0.0584716796875, "loss_aux_layer_12": 0.0623779296875, "loss_aux_layer_13": 0.06781005859375, "loss_aux_layer_14": 0.0760498046875, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.0943603515625, "loss_aux_layer_17": 0.1026611328125, "loss_aux_layer_18": 0.111083984375, "loss_aux_layer_19": 0.1160888671875, "loss_aux_layer_2": 0.04156494140625, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.05023193359375, "loss_aux_layer_4": 0.05218505859375, "loss_aux_layer_5": 0.0537109375, "loss_aux_layer_6": 0.05670166015625, "loss_aux_layer_7": 0.0550537109375, "loss_aux_layer_8": 0.0546875, "loss_aux_layer_9": 0.0538330078125, "step": 3846, "total_loss": 0.6339635252952576 }, { "epoch": 0.7616313601267076, "grad_norm": 1.1826145648956299, "learning_rate": 5e-05, "llm_loss": 0.5949123948812485, "loss": 2.7013, "loss_aux_layer_0": 0.0127105712890625, "loss_aux_layer_1": 0.030975341796875, "loss_aux_layer_10": 0.057373046875, "loss_aux_layer_11": 0.06134033203125, "loss_aux_layer_12": 0.06597900390625, "loss_aux_layer_13": 0.071044921875, "loss_aux_layer_14": 0.07958984375, "loss_aux_layer_15": 0.0880126953125, "loss_aux_layer_16": 0.097412109375, "loss_aux_layer_17": 0.10595703125, "loss_aux_layer_18": 0.11376953125, "loss_aux_layer_19": 0.1165771484375, "loss_aux_layer_2": 0.04302978515625, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.0521240234375, "loss_aux_layer_4": 0.0546875, "loss_aux_layer_5": 0.05596923828125, "loss_aux_layer_6": 0.05889892578125, "loss_aux_layer_7": 0.057373046875, "loss_aux_layer_8": 0.05706787109375, "loss_aux_layer_9": 0.05584716796875, "step": 3847, "total_loss": 0.6753289997577667 }, { "epoch": 0.761829340724609, "grad_norm": 0.9640156626701355, "learning_rate": 5e-05, "llm_loss": 0.6156330108642578, "loss": 2.7987, "loss_aux_layer_0": 0.012939453125, "loss_aux_layer_1": 0.03375244140625, "loss_aux_layer_10": 0.06103515625, "loss_aux_layer_11": 0.06524658203125, "loss_aux_layer_12": 0.06982421875, "loss_aux_layer_13": 0.0751953125, "loss_aux_layer_14": 0.0833740234375, "loss_aux_layer_15": 0.091552734375, "loss_aux_layer_16": 0.1011962890625, "loss_aux_layer_17": 0.1087646484375, "loss_aux_layer_18": 0.1163330078125, "loss_aux_layer_19": 0.119384765625, "loss_aux_layer_2": 0.0472412109375, "loss_aux_layer_20": 0.126708984375, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.057373046875, "loss_aux_layer_4": 0.0594482421875, "loss_aux_layer_5": 0.06072998046875, "loss_aux_layer_6": 0.06378173828125, "loss_aux_layer_7": 0.061767578125, "loss_aux_layer_8": 0.06103515625, "loss_aux_layer_9": 0.05963134765625, "step": 3848, "total_loss": 0.6996757984161377 }, { "epoch": 0.7620273213225104, "grad_norm": 1.270787239074707, "learning_rate": 5e-05, "llm_loss": 0.5106255263090134, "loss": 2.3756, "loss_aux_layer_0": 0.0123748779296875, "loss_aux_layer_1": 0.031982421875, "loss_aux_layer_10": 0.06024169921875, "loss_aux_layer_11": 0.064453125, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.07421875, "loss_aux_layer_14": 0.082763671875, "loss_aux_layer_15": 0.091552734375, "loss_aux_layer_16": 0.1007080078125, "loss_aux_layer_17": 0.108642578125, "loss_aux_layer_18": 0.1160888671875, "loss_aux_layer_19": 0.118896484375, "loss_aux_layer_2": 0.04510498046875, "loss_aux_layer_20": 0.1265869140625, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.05517578125, "loss_aux_layer_4": 0.05767822265625, "loss_aux_layer_5": 0.05938720703125, "loss_aux_layer_6": 0.0626220703125, "loss_aux_layer_7": 0.06072998046875, "loss_aux_layer_8": 0.060302734375, "loss_aux_layer_9": 0.0589599609375, "step": 3849, "total_loss": 0.593911349773407 }, { "epoch": 0.7622253019204118, "grad_norm": 0.9929392337799072, "learning_rate": 5e-05, "llm_loss": 0.6401583105325699, "loss": 2.8819, "loss_aux_layer_0": 0.0135955810546875, "loss_aux_layer_1": 0.029937744140625, "loss_aux_layer_10": 0.055419921875, "loss_aux_layer_11": 0.05938720703125, "loss_aux_layer_12": 0.06378173828125, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.0875244140625, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.1146240234375, "loss_aux_layer_19": 0.118896484375, "loss_aux_layer_2": 0.0419921875, "loss_aux_layer_20": 0.1273193359375, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05120849609375, "loss_aux_layer_4": 0.05352783203125, "loss_aux_layer_5": 0.0550537109375, "loss_aux_layer_6": 0.0579833984375, "loss_aux_layer_7": 0.0560302734375, "loss_aux_layer_8": 0.055419921875, "loss_aux_layer_9": 0.05413818359375, "step": 3850, "total_loss": 0.7204701155424118 }, { "epoch": 0.7624232825183132, "grad_norm": 1.2908316850662231, "learning_rate": 5e-05, "llm_loss": 0.6562266051769257, "loss": 2.9558, "loss_aux_layer_0": 0.01336669921875, "loss_aux_layer_1": 0.03179931640625, "loss_aux_layer_10": 0.05889892578125, "loss_aux_layer_11": 0.06292724609375, "loss_aux_layer_12": 0.0672607421875, "loss_aux_layer_13": 0.07275390625, "loss_aux_layer_14": 0.0814208984375, "loss_aux_layer_15": 0.09033203125, "loss_aux_layer_16": 0.100341796875, "loss_aux_layer_17": 0.1085205078125, "loss_aux_layer_18": 0.1165771484375, "loss_aux_layer_19": 0.1197509765625, "loss_aux_layer_2": 0.0447998046875, "loss_aux_layer_20": 0.1273193359375, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.05426025390625, "loss_aux_layer_4": 0.056640625, "loss_aux_layer_5": 0.058349609375, "loss_aux_layer_6": 0.0614013671875, "loss_aux_layer_7": 0.05926513671875, "loss_aux_layer_8": 0.05877685546875, "loss_aux_layer_9": 0.05767822265625, "step": 3851, "total_loss": 0.7389401495456696 }, { "epoch": 0.7626212631162146, "grad_norm": 1.00794517993927, "learning_rate": 5e-05, "llm_loss": 0.6400739103555679, "loss": 2.8858, "loss_aux_layer_0": 0.014404296875, "loss_aux_layer_1": 0.033203125, "loss_aux_layer_10": 0.0592041015625, "loss_aux_layer_11": 0.0633544921875, "loss_aux_layer_12": 0.068115234375, "loss_aux_layer_13": 0.073486328125, "loss_aux_layer_14": 0.0814208984375, "loss_aux_layer_15": 0.089599609375, "loss_aux_layer_16": 0.0985107421875, "loss_aux_layer_17": 0.105712890625, "loss_aux_layer_18": 0.113037109375, "loss_aux_layer_19": 0.115478515625, "loss_aux_layer_2": 0.045166015625, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.1290283203125, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.0550537109375, "loss_aux_layer_4": 0.05743408203125, "loss_aux_layer_5": 0.05877685546875, "loss_aux_layer_6": 0.0616455078125, "loss_aux_layer_7": 0.059814453125, "loss_aux_layer_8": 0.05926513671875, "loss_aux_layer_9": 0.05792236328125, "step": 3852, "total_loss": 0.7214401513338089 }, { "epoch": 0.762819243714116, "grad_norm": 0.9928903579711914, "learning_rate": 5e-05, "llm_loss": 0.5214228108525276, "loss": 2.4231, "loss_aux_layer_0": 0.0128173828125, "loss_aux_layer_1": 0.0328369140625, "loss_aux_layer_10": 0.06036376953125, "loss_aux_layer_11": 0.0643310546875, "loss_aux_layer_12": 0.0689697265625, "loss_aux_layer_13": 0.07470703125, "loss_aux_layer_14": 0.0836181640625, "loss_aux_layer_15": 0.0928955078125, "loss_aux_layer_16": 0.102294921875, "loss_aux_layer_17": 0.1099853515625, "loss_aux_layer_18": 0.1181640625, "loss_aux_layer_19": 0.12109375, "loss_aux_layer_2": 0.04571533203125, "loss_aux_layer_20": 0.128662109375, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.0557861328125, "loss_aux_layer_4": 0.0579833984375, "loss_aux_layer_5": 0.0594482421875, "loss_aux_layer_6": 0.06256103515625, "loss_aux_layer_7": 0.06085205078125, "loss_aux_layer_8": 0.06048583984375, "loss_aux_layer_9": 0.05914306640625, "step": 3853, "total_loss": 0.6057841926813126 }, { "epoch": 0.7630172243120175, "grad_norm": 1.2021912336349487, "learning_rate": 5e-05, "llm_loss": 0.5260425508022308, "loss": 2.445, "loss_aux_layer_0": 0.0147857666015625, "loss_aux_layer_1": 0.0340576171875, "loss_aux_layer_10": 0.062744140625, "loss_aux_layer_11": 0.0667724609375, "loss_aux_layer_12": 0.0714111328125, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.0848388671875, "loss_aux_layer_15": 0.0926513671875, "loss_aux_layer_16": 0.1015625, "loss_aux_layer_17": 0.1090087890625, "loss_aux_layer_18": 0.1165771484375, "loss_aux_layer_19": 0.1192626953125, "loss_aux_layer_2": 0.0478515625, "loss_aux_layer_20": 0.12646484375, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.156494140625, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.05828857421875, "loss_aux_layer_4": 0.0609130859375, "loss_aux_layer_5": 0.06256103515625, "loss_aux_layer_6": 0.0653076171875, "loss_aux_layer_7": 0.0635986328125, "loss_aux_layer_8": 0.06280517578125, "loss_aux_layer_9": 0.0614013671875, "step": 3854, "total_loss": 0.6112436503171921 }, { "epoch": 0.7632152049099188, "grad_norm": 1.0424805879592896, "learning_rate": 5e-05, "llm_loss": 0.6253153383731842, "loss": 2.8264, "loss_aux_layer_0": 0.012481689453125, "loss_aux_layer_1": 0.031097412109375, "loss_aux_layer_10": 0.0577392578125, "loss_aux_layer_11": 0.0615234375, "loss_aux_layer_12": 0.066162109375, "loss_aux_layer_13": 0.07183837890625, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.0892333984375, "loss_aux_layer_16": 0.0986328125, "loss_aux_layer_17": 0.1064453125, "loss_aux_layer_18": 0.114990234375, "loss_aux_layer_19": 0.118896484375, "loss_aux_layer_2": 0.04351806640625, "loss_aux_layer_20": 0.12646484375, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.0531005859375, "loss_aux_layer_4": 0.05523681640625, "loss_aux_layer_5": 0.05645751953125, "loss_aux_layer_6": 0.0594482421875, "loss_aux_layer_7": 0.057861328125, "loss_aux_layer_8": 0.0574951171875, "loss_aux_layer_9": 0.05633544921875, "step": 3855, "total_loss": 0.7066106200218201 }, { "epoch": 0.7634131855078202, "grad_norm": 1.4588364362716675, "learning_rate": 5e-05, "llm_loss": 0.5441140532493591, "loss": 2.5035, "loss_aux_layer_0": 0.0146484375, "loss_aux_layer_1": 0.032958984375, "loss_aux_layer_10": 0.05926513671875, "loss_aux_layer_11": 0.06298828125, "loss_aux_layer_12": 0.06732177734375, "loss_aux_layer_13": 0.0726318359375, "loss_aux_layer_14": 0.08056640625, "loss_aux_layer_15": 0.088623046875, "loss_aux_layer_16": 0.09765625, "loss_aux_layer_17": 0.1048583984375, "loss_aux_layer_18": 0.1121826171875, "loss_aux_layer_19": 0.1151123046875, "loss_aux_layer_2": 0.04571533203125, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.05523681640625, "loss_aux_layer_4": 0.057373046875, "loss_aux_layer_5": 0.05902099609375, "loss_aux_layer_6": 0.0616455078125, "loss_aux_layer_7": 0.05987548828125, "loss_aux_layer_8": 0.05914306640625, "loss_aux_layer_9": 0.05792236328125, "step": 3856, "total_loss": 0.6258759051561356 }, { "epoch": 0.7636111661057217, "grad_norm": 1.1260732412338257, "learning_rate": 5e-05, "llm_loss": 0.5959746390581131, "loss": 2.7212, "loss_aux_layer_0": 0.0125885009765625, "loss_aux_layer_1": 0.0318603515625, "loss_aux_layer_10": 0.06011962890625, "loss_aux_layer_11": 0.064453125, "loss_aux_layer_12": 0.069091796875, "loss_aux_layer_13": 0.0753173828125, "loss_aux_layer_14": 0.0845947265625, "loss_aux_layer_15": 0.093505859375, "loss_aux_layer_16": 0.10302734375, "loss_aux_layer_17": 0.1109619140625, "loss_aux_layer_18": 0.1192626953125, "loss_aux_layer_19": 0.122314453125, "loss_aux_layer_2": 0.0450439453125, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.1943359375, "loss_aux_layer_3": 0.05462646484375, "loss_aux_layer_4": 0.05718994140625, "loss_aux_layer_5": 0.05902099609375, "loss_aux_layer_6": 0.06195068359375, "loss_aux_layer_7": 0.06005859375, "loss_aux_layer_8": 0.0594482421875, "loss_aux_layer_9": 0.05853271484375, "step": 3857, "total_loss": 0.680306926369667 }, { "epoch": 0.7638091467036231, "grad_norm": 1.5886644124984741, "learning_rate": 5e-05, "llm_loss": 0.5605470389127731, "loss": 2.572, "loss_aux_layer_0": 0.0145111083984375, "loss_aux_layer_1": 0.032623291015625, "loss_aux_layer_10": 0.0595703125, "loss_aux_layer_11": 0.06378173828125, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.073974609375, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.090087890625, "loss_aux_layer_16": 0.098876953125, "loss_aux_layer_17": 0.106689453125, "loss_aux_layer_18": 0.114990234375, "loss_aux_layer_19": 0.117919921875, "loss_aux_layer_2": 0.045654296875, "loss_aux_layer_20": 0.1259765625, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.05499267578125, "loss_aux_layer_4": 0.057373046875, "loss_aux_layer_5": 0.0587158203125, "loss_aux_layer_6": 0.0614013671875, "loss_aux_layer_7": 0.05975341796875, "loss_aux_layer_8": 0.05926513671875, "loss_aux_layer_9": 0.0582275390625, "step": 3858, "total_loss": 0.6430086344480515 }, { "epoch": 0.7640071273015244, "grad_norm": 1.14302396774292, "learning_rate": 5e-05, "llm_loss": 0.5026151165366173, "loss": 2.3339, "loss_aux_layer_0": 0.0140533447265625, "loss_aux_layer_1": 0.0308837890625, "loss_aux_layer_10": 0.057373046875, "loss_aux_layer_11": 0.06121826171875, "loss_aux_layer_12": 0.0657958984375, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.0802001953125, "loss_aux_layer_15": 0.0885009765625, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.1048583984375, "loss_aux_layer_18": 0.11328125, "loss_aux_layer_19": 0.116943359375, "loss_aux_layer_2": 0.04302978515625, "loss_aux_layer_20": 0.1246337890625, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.0523681640625, "loss_aux_layer_4": 0.0545654296875, "loss_aux_layer_5": 0.0560302734375, "loss_aux_layer_6": 0.058837890625, "loss_aux_layer_7": 0.05718994140625, "loss_aux_layer_8": 0.05694580078125, "loss_aux_layer_9": 0.05609130859375, "step": 3859, "total_loss": 0.5834843814373016 }, { "epoch": 0.7642051078994259, "grad_norm": 1.2188210487365723, "learning_rate": 5e-05, "llm_loss": 0.5014778822660446, "loss": 2.327, "loss_aux_layer_0": 0.013824462890625, "loss_aux_layer_1": 0.032318115234375, "loss_aux_layer_10": 0.0577392578125, "loss_aux_layer_11": 0.0616455078125, "loss_aux_layer_12": 0.065673828125, "loss_aux_layer_13": 0.0709228515625, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.1025390625, "loss_aux_layer_18": 0.1103515625, "loss_aux_layer_19": 0.11376953125, "loss_aux_layer_2": 0.044677734375, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.05389404296875, "loss_aux_layer_4": 0.055908203125, "loss_aux_layer_5": 0.057373046875, "loss_aux_layer_6": 0.0599365234375, "loss_aux_layer_7": 0.058349609375, "loss_aux_layer_8": 0.05780029296875, "loss_aux_layer_9": 0.05645751953125, "step": 3860, "total_loss": 0.581747405230999 }, { "epoch": 0.7644030884973273, "grad_norm": 1.0583829879760742, "learning_rate": 5e-05, "llm_loss": 0.5092205703258514, "loss": 2.3591, "loss_aux_layer_0": 0.0153045654296875, "loss_aux_layer_1": 0.03143310546875, "loss_aux_layer_10": 0.05780029296875, "loss_aux_layer_11": 0.06146240234375, "loss_aux_layer_12": 0.065673828125, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.0791015625, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.096435546875, "loss_aux_layer_17": 0.103515625, "loss_aux_layer_18": 0.1116943359375, "loss_aux_layer_19": 0.1153564453125, "loss_aux_layer_2": 0.04388427734375, "loss_aux_layer_20": 0.123046875, "loss_aux_layer_21": 0.1309814453125, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.0533447265625, "loss_aux_layer_4": 0.0557861328125, "loss_aux_layer_5": 0.0574951171875, "loss_aux_layer_6": 0.06048583984375, "loss_aux_layer_7": 0.0584716796875, "loss_aux_layer_8": 0.0577392578125, "loss_aux_layer_9": 0.0565185546875, "step": 3861, "total_loss": 0.5897673070430756 }, { "epoch": 0.7646010690952286, "grad_norm": 0.9959999322891235, "learning_rate": 5e-05, "llm_loss": 0.6123572140932083, "loss": 2.7715, "loss_aux_layer_0": 0.0136566162109375, "loss_aux_layer_1": 0.030609130859375, "loss_aux_layer_10": 0.057373046875, "loss_aux_layer_11": 0.061279296875, "loss_aux_layer_12": 0.06591796875, "loss_aux_layer_13": 0.0709228515625, "loss_aux_layer_14": 0.079345703125, "loss_aux_layer_15": 0.0875244140625, "loss_aux_layer_16": 0.096435546875, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.113037109375, "loss_aux_layer_19": 0.1171875, "loss_aux_layer_2": 0.0430908203125, "loss_aux_layer_20": 0.1251220703125, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.05230712890625, "loss_aux_layer_4": 0.0546875, "loss_aux_layer_5": 0.0562744140625, "loss_aux_layer_6": 0.05926513671875, "loss_aux_layer_7": 0.057373046875, "loss_aux_layer_8": 0.056884765625, "loss_aux_layer_9": 0.05615234375, "step": 3862, "total_loss": 0.6928811520338058 }, { "epoch": 0.7647990496931301, "grad_norm": 1.268437147140503, "learning_rate": 5e-05, "llm_loss": 0.568359762430191, "loss": 2.5976, "loss_aux_layer_0": 0.01617431640625, "loss_aux_layer_1": 0.031494140625, "loss_aux_layer_10": 0.0582275390625, "loss_aux_layer_11": 0.06207275390625, "loss_aux_layer_12": 0.066162109375, "loss_aux_layer_13": 0.0712890625, "loss_aux_layer_14": 0.079345703125, "loss_aux_layer_15": 0.0875244140625, "loss_aux_layer_16": 0.096435546875, "loss_aux_layer_17": 0.1044921875, "loss_aux_layer_18": 0.11279296875, "loss_aux_layer_19": 0.116455078125, "loss_aux_layer_2": 0.04296875, "loss_aux_layer_20": 0.1241455078125, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.052490234375, "loss_aux_layer_4": 0.05511474609375, "loss_aux_layer_5": 0.05694580078125, "loss_aux_layer_6": 0.0599365234375, "loss_aux_layer_7": 0.058349609375, "loss_aux_layer_8": 0.05792236328125, "loss_aux_layer_9": 0.056884765625, "step": 3863, "total_loss": 0.6494122743606567 }, { "epoch": 0.7649970302910315, "grad_norm": 0.9160570502281189, "learning_rate": 5e-05, "llm_loss": 0.5830051153898239, "loss": 2.6567, "loss_aux_layer_0": 0.013092041015625, "loss_aux_layer_1": 0.032012939453125, "loss_aux_layer_10": 0.05828857421875, "loss_aux_layer_11": 0.06207275390625, "loss_aux_layer_12": 0.066650390625, "loss_aux_layer_13": 0.0718994140625, "loss_aux_layer_14": 0.079833984375, "loss_aux_layer_15": 0.0877685546875, "loss_aux_layer_16": 0.096923828125, "loss_aux_layer_17": 0.1048583984375, "loss_aux_layer_18": 0.11328125, "loss_aux_layer_19": 0.1170654296875, "loss_aux_layer_2": 0.04400634765625, "loss_aux_layer_20": 0.12451171875, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.05340576171875, "loss_aux_layer_4": 0.05560302734375, "loss_aux_layer_5": 0.057373046875, "loss_aux_layer_6": 0.06011962890625, "loss_aux_layer_7": 0.05841064453125, "loss_aux_layer_8": 0.0579833984375, "loss_aux_layer_9": 0.05682373046875, "step": 3864, "total_loss": 0.664163202047348 }, { "epoch": 0.7651950108889329, "grad_norm": 0.9758936762809753, "learning_rate": 5e-05, "llm_loss": 0.6278570592403412, "loss": 2.8413, "loss_aux_layer_0": 0.0151824951171875, "loss_aux_layer_1": 0.032501220703125, "loss_aux_layer_10": 0.058837890625, "loss_aux_layer_11": 0.0628662109375, "loss_aux_layer_12": 0.06732177734375, "loss_aux_layer_13": 0.072998046875, "loss_aux_layer_14": 0.0816650390625, "loss_aux_layer_15": 0.0902099609375, "loss_aux_layer_16": 0.0997314453125, "loss_aux_layer_17": 0.107666015625, "loss_aux_layer_18": 0.1153564453125, "loss_aux_layer_19": 0.1182861328125, "loss_aux_layer_2": 0.044921875, "loss_aux_layer_20": 0.1259765625, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05419921875, "loss_aux_layer_4": 0.0565185546875, "loss_aux_layer_5": 0.0582275390625, "loss_aux_layer_6": 0.06134033203125, "loss_aux_layer_7": 0.05938720703125, "loss_aux_layer_8": 0.05853271484375, "loss_aux_layer_9": 0.0577392578125, "step": 3865, "total_loss": 0.7103290557861328 }, { "epoch": 0.7653929914868343, "grad_norm": 1.0090162754058838, "learning_rate": 5e-05, "llm_loss": 0.5952150076627731, "loss": 2.7111, "loss_aux_layer_0": 0.0138397216796875, "loss_aux_layer_1": 0.0325927734375, "loss_aux_layer_10": 0.059814453125, "loss_aux_layer_11": 0.06378173828125, "loss_aux_layer_12": 0.0682373046875, "loss_aux_layer_13": 0.07373046875, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.09912109375, "loss_aux_layer_17": 0.106689453125, "loss_aux_layer_18": 0.114013671875, "loss_aux_layer_19": 0.1173095703125, "loss_aux_layer_2": 0.04541015625, "loss_aux_layer_20": 0.125244140625, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.05517578125, "loss_aux_layer_4": 0.05743408203125, "loss_aux_layer_5": 0.05908203125, "loss_aux_layer_6": 0.0618896484375, "loss_aux_layer_7": 0.06005859375, "loss_aux_layer_8": 0.0594482421875, "loss_aux_layer_9": 0.05865478515625, "step": 3866, "total_loss": 0.6777750849723816 }, { "epoch": 0.7655909720847357, "grad_norm": 0.9048487544059753, "learning_rate": 5e-05, "llm_loss": 0.5764361396431923, "loss": 2.6343, "loss_aux_layer_0": 0.0138092041015625, "loss_aux_layer_1": 0.0321044921875, "loss_aux_layer_10": 0.05859375, "loss_aux_layer_11": 0.0625, "loss_aux_layer_12": 0.06695556640625, "loss_aux_layer_13": 0.072265625, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.089111328125, "loss_aux_layer_16": 0.09814453125, "loss_aux_layer_17": 0.1063232421875, "loss_aux_layer_18": 0.114990234375, "loss_aux_layer_19": 0.118408203125, "loss_aux_layer_2": 0.044921875, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.054443359375, "loss_aux_layer_4": 0.056640625, "loss_aux_layer_5": 0.05810546875, "loss_aux_layer_6": 0.06085205078125, "loss_aux_layer_7": 0.05926513671875, "loss_aux_layer_8": 0.05853271484375, "loss_aux_layer_9": 0.0574951171875, "step": 3867, "total_loss": 0.6585764437913895 }, { "epoch": 0.7657889526826371, "grad_norm": 0.9931654334068298, "learning_rate": 5e-05, "llm_loss": 0.5426213070750237, "loss": 2.5055, "loss_aux_layer_0": 0.0137939453125, "loss_aux_layer_1": 0.03387451171875, "loss_aux_layer_10": 0.0615234375, "loss_aux_layer_11": 0.0657958984375, "loss_aux_layer_12": 0.0704345703125, "loss_aux_layer_13": 0.075927734375, "loss_aux_layer_14": 0.084228515625, "loss_aux_layer_15": 0.0924072265625, "loss_aux_layer_16": 0.1014404296875, "loss_aux_layer_17": 0.1085205078125, "loss_aux_layer_18": 0.1158447265625, "loss_aux_layer_19": 0.11865234375, "loss_aux_layer_2": 0.0465087890625, "loss_aux_layer_20": 0.1260986328125, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.05621337890625, "loss_aux_layer_4": 0.05859375, "loss_aux_layer_5": 0.06024169921875, "loss_aux_layer_6": 0.06304931640625, "loss_aux_layer_7": 0.0615234375, "loss_aux_layer_8": 0.06103515625, "loss_aux_layer_9": 0.05999755859375, "step": 3868, "total_loss": 0.6263633966445923 }, { "epoch": 0.7659869332805385, "grad_norm": 0.8562211394309998, "learning_rate": 5e-05, "llm_loss": 0.570876881480217, "loss": 2.6055, "loss_aux_layer_0": 0.0125274658203125, "loss_aux_layer_1": 0.030731201171875, "loss_aux_layer_10": 0.05792236328125, "loss_aux_layer_11": 0.061767578125, "loss_aux_layer_12": 0.06634521484375, "loss_aux_layer_13": 0.0718994140625, "loss_aux_layer_14": 0.079833984375, "loss_aux_layer_15": 0.0880126953125, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.1131591796875, "loss_aux_layer_19": 0.1160888671875, "loss_aux_layer_2": 0.0428466796875, "loss_aux_layer_20": 0.12353515625, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.052734375, "loss_aux_layer_4": 0.05535888671875, "loss_aux_layer_5": 0.0570068359375, "loss_aux_layer_6": 0.0595703125, "loss_aux_layer_7": 0.05810546875, "loss_aux_layer_8": 0.0574951171875, "loss_aux_layer_9": 0.05657958984375, "step": 3869, "total_loss": 0.651366114616394 }, { "epoch": 0.7661849138784399, "grad_norm": 0.8775098919868469, "learning_rate": 5e-05, "llm_loss": 0.6217600703239441, "loss": 2.8053, "loss_aux_layer_0": 0.0127716064453125, "loss_aux_layer_1": 0.03033447265625, "loss_aux_layer_10": 0.05712890625, "loss_aux_layer_11": 0.06085205078125, "loss_aux_layer_12": 0.065185546875, "loss_aux_layer_13": 0.0703125, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.0963134765625, "loss_aux_layer_17": 0.1036376953125, "loss_aux_layer_18": 0.111572265625, "loss_aux_layer_19": 0.114990234375, "loss_aux_layer_2": 0.04205322265625, "loss_aux_layer_20": 0.1226806640625, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.0513916015625, "loss_aux_layer_4": 0.0540771484375, "loss_aux_layer_5": 0.055908203125, "loss_aux_layer_6": 0.05889892578125, "loss_aux_layer_7": 0.05731201171875, "loss_aux_layer_8": 0.05682373046875, "loss_aux_layer_9": 0.05584716796875, "step": 3870, "total_loss": 0.7013358473777771 }, { "epoch": 0.7663828944763413, "grad_norm": 0.9456186294555664, "learning_rate": 5e-05, "llm_loss": 0.6032518520951271, "loss": 2.7447, "loss_aux_layer_0": 0.0132598876953125, "loss_aux_layer_1": 0.0330810546875, "loss_aux_layer_10": 0.06072998046875, "loss_aux_layer_11": 0.0645751953125, "loss_aux_layer_12": 0.0692138671875, "loss_aux_layer_13": 0.07464599609375, "loss_aux_layer_14": 0.0828857421875, "loss_aux_layer_15": 0.0908203125, "loss_aux_layer_16": 0.0997314453125, "loss_aux_layer_17": 0.1070556640625, "loss_aux_layer_18": 0.1148681640625, "loss_aux_layer_19": 0.1175537109375, "loss_aux_layer_2": 0.0458984375, "loss_aux_layer_20": 0.125, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.05548095703125, "loss_aux_layer_4": 0.05792236328125, "loss_aux_layer_5": 0.0594482421875, "loss_aux_layer_6": 0.06243896484375, "loss_aux_layer_7": 0.06097412109375, "loss_aux_layer_8": 0.06060791015625, "loss_aux_layer_9": 0.0595703125, "step": 3871, "total_loss": 0.6861841827630997 }, { "epoch": 0.7665808750742428, "grad_norm": 0.7811152935028076, "learning_rate": 5e-05, "llm_loss": 0.5771816819906235, "loss": 2.6319, "loss_aux_layer_0": 0.0128326416015625, "loss_aux_layer_1": 0.03173828125, "loss_aux_layer_10": 0.05767822265625, "loss_aux_layer_11": 0.06158447265625, "loss_aux_layer_12": 0.0657958984375, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.0789794921875, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.0963134765625, "loss_aux_layer_17": 0.104736328125, "loss_aux_layer_18": 0.1129150390625, "loss_aux_layer_19": 0.1165771484375, "loss_aux_layer_2": 0.04388427734375, "loss_aux_layer_20": 0.1241455078125, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.0531005859375, "loss_aux_layer_4": 0.05560302734375, "loss_aux_layer_5": 0.05706787109375, "loss_aux_layer_6": 0.06005859375, "loss_aux_layer_7": 0.0582275390625, "loss_aux_layer_8": 0.05743408203125, "loss_aux_layer_9": 0.05633544921875, "step": 3872, "total_loss": 0.6579692363739014 }, { "epoch": 0.7667788556721441, "grad_norm": 0.9213300347328186, "learning_rate": 5e-05, "llm_loss": 0.5987687259912491, "loss": 2.7301, "loss_aux_layer_0": 0.012451171875, "loss_aux_layer_1": 0.03240966796875, "loss_aux_layer_10": 0.06024169921875, "loss_aux_layer_11": 0.06427001953125, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.0745849609375, "loss_aux_layer_14": 0.083251953125, "loss_aux_layer_15": 0.092041015625, "loss_aux_layer_16": 0.1015625, "loss_aux_layer_17": 0.109619140625, "loss_aux_layer_18": 0.11767578125, "loss_aux_layer_19": 0.1212158203125, "loss_aux_layer_2": 0.04461669921875, "loss_aux_layer_20": 0.1292724609375, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.05438232421875, "loss_aux_layer_4": 0.056884765625, "loss_aux_layer_5": 0.05859375, "loss_aux_layer_6": 0.06182861328125, "loss_aux_layer_7": 0.060302734375, "loss_aux_layer_8": 0.0599365234375, "loss_aux_layer_9": 0.05889892578125, "step": 3873, "total_loss": 0.6825272291898727 }, { "epoch": 0.7669768362700455, "grad_norm": 0.9590219855308533, "learning_rate": 5e-05, "llm_loss": 0.681348443031311, "loss": 3.0398, "loss_aux_layer_0": 0.0128173828125, "loss_aux_layer_1": 0.03131103515625, "loss_aux_layer_10": 0.05584716796875, "loss_aux_layer_11": 0.05963134765625, "loss_aux_layer_12": 0.06378173828125, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.0771484375, "loss_aux_layer_15": 0.0850830078125, "loss_aux_layer_16": 0.0938720703125, "loss_aux_layer_17": 0.1011962890625, "loss_aux_layer_18": 0.1092529296875, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.04296875, "loss_aux_layer_20": 0.120849609375, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.0521240234375, "loss_aux_layer_4": 0.054443359375, "loss_aux_layer_5": 0.05596923828125, "loss_aux_layer_6": 0.05859375, "loss_aux_layer_7": 0.0565185546875, "loss_aux_layer_8": 0.05572509765625, "loss_aux_layer_9": 0.05450439453125, "step": 3874, "total_loss": 0.7599482089281082 }, { "epoch": 0.767174816867947, "grad_norm": 0.8836876153945923, "learning_rate": 5e-05, "llm_loss": 0.5757721289992332, "loss": 2.6246, "loss_aux_layer_0": 0.01251220703125, "loss_aux_layer_1": 0.0302734375, "loss_aux_layer_10": 0.05694580078125, "loss_aux_layer_11": 0.06072998046875, "loss_aux_layer_12": 0.06549072265625, "loss_aux_layer_13": 0.071044921875, "loss_aux_layer_14": 0.0797119140625, "loss_aux_layer_15": 0.087646484375, "loss_aux_layer_16": 0.0970458984375, "loss_aux_layer_17": 0.1048583984375, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.1171875, "loss_aux_layer_2": 0.04229736328125, "loss_aux_layer_20": 0.1251220703125, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.0516357421875, "loss_aux_layer_4": 0.05413818359375, "loss_aux_layer_5": 0.05572509765625, "loss_aux_layer_6": 0.05908203125, "loss_aux_layer_7": 0.05712890625, "loss_aux_layer_8": 0.0565185546875, "loss_aux_layer_9": 0.0555419921875, "step": 3875, "total_loss": 0.6561554223299026 }, { "epoch": 0.7673727974658483, "grad_norm": 0.9583507776260376, "learning_rate": 5e-05, "llm_loss": 0.6057780534029007, "loss": 2.7493, "loss_aux_layer_0": 0.012451171875, "loss_aux_layer_1": 0.031646728515625, "loss_aux_layer_10": 0.058837890625, "loss_aux_layer_11": 0.062744140625, "loss_aux_layer_12": 0.06707763671875, "loss_aux_layer_13": 0.0723876953125, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.0892333984375, "loss_aux_layer_16": 0.0987548828125, "loss_aux_layer_17": 0.1060791015625, "loss_aux_layer_18": 0.114013671875, "loss_aux_layer_19": 0.1170654296875, "loss_aux_layer_2": 0.044189453125, "loss_aux_layer_20": 0.125, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.05419921875, "loss_aux_layer_4": 0.056640625, "loss_aux_layer_5": 0.0579833984375, "loss_aux_layer_6": 0.0609130859375, "loss_aux_layer_7": 0.05902099609375, "loss_aux_layer_8": 0.0584716796875, "loss_aux_layer_9": 0.05755615234375, "step": 3876, "total_loss": 0.687331423163414 }, { "epoch": 0.7675707780637497, "grad_norm": 0.8210572004318237, "learning_rate": 5e-05, "llm_loss": 0.5980106592178345, "loss": 2.724, "loss_aux_layer_0": 0.0135345458984375, "loss_aux_layer_1": 0.0323486328125, "loss_aux_layer_10": 0.0606689453125, "loss_aux_layer_11": 0.0648193359375, "loss_aux_layer_12": 0.0689697265625, "loss_aux_layer_13": 0.0743408203125, "loss_aux_layer_14": 0.082763671875, "loss_aux_layer_15": 0.090576171875, "loss_aux_layer_16": 0.09912109375, "loss_aux_layer_17": 0.1068115234375, "loss_aux_layer_18": 0.11474609375, "loss_aux_layer_19": 0.1175537109375, "loss_aux_layer_2": 0.04534912109375, "loss_aux_layer_20": 0.124755859375, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.190673828125, "loss_aux_layer_3": 0.05523681640625, "loss_aux_layer_4": 0.0579833984375, "loss_aux_layer_5": 0.05950927734375, "loss_aux_layer_6": 0.0628662109375, "loss_aux_layer_7": 0.06109619140625, "loss_aux_layer_8": 0.0604248046875, "loss_aux_layer_9": 0.05950927734375, "step": 3877, "total_loss": 0.6809879541397095 }, { "epoch": 0.7677687586616512, "grad_norm": 1.0089243650436401, "learning_rate": 5e-05, "llm_loss": 0.680110901594162, "loss": 3.0326, "loss_aux_layer_0": 0.0122833251953125, "loss_aux_layer_1": 0.030792236328125, "loss_aux_layer_10": 0.05560302734375, "loss_aux_layer_11": 0.059326171875, "loss_aux_layer_12": 0.06390380859375, "loss_aux_layer_13": 0.06884765625, "loss_aux_layer_14": 0.0770263671875, "loss_aux_layer_15": 0.084716796875, "loss_aux_layer_16": 0.09326171875, "loss_aux_layer_17": 0.1011962890625, "loss_aux_layer_18": 0.1092529296875, "loss_aux_layer_19": 0.1121826171875, "loss_aux_layer_2": 0.04241943359375, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.127685546875, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.18310546875, "loss_aux_layer_3": 0.05145263671875, "loss_aux_layer_4": 0.05364990234375, "loss_aux_layer_5": 0.0548095703125, "loss_aux_layer_6": 0.0574951171875, "loss_aux_layer_7": 0.05584716796875, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.0543212890625, "step": 3878, "total_loss": 0.7581380009651184 }, { "epoch": 0.7679667392595526, "grad_norm": 1.1706397533416748, "learning_rate": 5e-05, "llm_loss": 0.5099723786115646, "loss": 2.3621, "loss_aux_layer_0": 0.01318359375, "loss_aux_layer_1": 0.031097412109375, "loss_aux_layer_10": 0.055908203125, "loss_aux_layer_11": 0.0599365234375, "loss_aux_layer_12": 0.0643310546875, "loss_aux_layer_13": 0.06982421875, "loss_aux_layer_14": 0.07861328125, "loss_aux_layer_15": 0.0875244140625, "loss_aux_layer_16": 0.0975341796875, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.1138916015625, "loss_aux_layer_19": 0.1181640625, "loss_aux_layer_2": 0.04266357421875, "loss_aux_layer_20": 0.12646484375, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.05157470703125, "loss_aux_layer_4": 0.05364990234375, "loss_aux_layer_5": 0.05517578125, "loss_aux_layer_6": 0.05780029296875, "loss_aux_layer_7": 0.0560302734375, "loss_aux_layer_8": 0.05560302734375, "loss_aux_layer_9": 0.054443359375, "step": 3879, "total_loss": 0.590513750910759 }, { "epoch": 0.7681647198574539, "grad_norm": 0.7554683685302734, "learning_rate": 5e-05, "llm_loss": 0.5808544754981995, "loss": 2.6613, "loss_aux_layer_0": 0.012542724609375, "loss_aux_layer_1": 0.032684326171875, "loss_aux_layer_10": 0.061767578125, "loss_aux_layer_11": 0.0657958984375, "loss_aux_layer_12": 0.0704345703125, "loss_aux_layer_13": 0.0758056640625, "loss_aux_layer_14": 0.084228515625, "loss_aux_layer_15": 0.0921630859375, "loss_aux_layer_16": 0.1014404296875, "loss_aux_layer_17": 0.1090087890625, "loss_aux_layer_18": 0.1173095703125, "loss_aux_layer_19": 0.1202392578125, "loss_aux_layer_2": 0.04620361328125, "loss_aux_layer_20": 0.1280517578125, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.05615234375, "loss_aux_layer_4": 0.058837890625, "loss_aux_layer_5": 0.0604248046875, "loss_aux_layer_6": 0.0638427734375, "loss_aux_layer_7": 0.06207275390625, "loss_aux_layer_8": 0.06158447265625, "loss_aux_layer_9": 0.06048583984375, "step": 3880, "total_loss": 0.6653313487768173 }, { "epoch": 0.7683627004553554, "grad_norm": 1.0519182682037354, "learning_rate": 5e-05, "llm_loss": 0.572802446782589, "loss": 2.6136, "loss_aux_layer_0": 0.0131072998046875, "loss_aux_layer_1": 0.030792236328125, "loss_aux_layer_10": 0.05670166015625, "loss_aux_layer_11": 0.06024169921875, "loss_aux_layer_12": 0.0645751953125, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.08740234375, "loss_aux_layer_16": 0.097412109375, "loss_aux_layer_17": 0.1053466796875, "loss_aux_layer_18": 0.11376953125, "loss_aux_layer_19": 0.11767578125, "loss_aux_layer_2": 0.04217529296875, "loss_aux_layer_20": 0.1265869140625, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.193115234375, "loss_aux_layer_3": 0.05145263671875, "loss_aux_layer_4": 0.05377197265625, "loss_aux_layer_5": 0.05535888671875, "loss_aux_layer_6": 0.05804443359375, "loss_aux_layer_7": 0.056396484375, "loss_aux_layer_8": 0.05609130859375, "loss_aux_layer_9": 0.05523681640625, "step": 3881, "total_loss": 0.6533997654914856 }, { "epoch": 0.7685606810532568, "grad_norm": 0.964390218257904, "learning_rate": 5e-05, "llm_loss": 0.5758357346057892, "loss": 2.6268, "loss_aux_layer_0": 0.0128936767578125, "loss_aux_layer_1": 0.031158447265625, "loss_aux_layer_10": 0.05767822265625, "loss_aux_layer_11": 0.06170654296875, "loss_aux_layer_12": 0.0657958984375, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.07958984375, "loss_aux_layer_15": 0.0877685546875, "loss_aux_layer_16": 0.0975341796875, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.11669921875, "loss_aux_layer_2": 0.043212890625, "loss_aux_layer_20": 0.1246337890625, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.05255126953125, "loss_aux_layer_4": 0.05523681640625, "loss_aux_layer_5": 0.05694580078125, "loss_aux_layer_6": 0.0596923828125, "loss_aux_layer_7": 0.05804443359375, "loss_aux_layer_8": 0.05767822265625, "loss_aux_layer_9": 0.05657958984375, "step": 3882, "total_loss": 0.6567081063985825 }, { "epoch": 0.7687586616511581, "grad_norm": 0.9110543727874756, "learning_rate": 5e-05, "llm_loss": 0.6182890236377716, "loss": 2.8052, "loss_aux_layer_0": 0.01226806640625, "loss_aux_layer_1": 0.0328369140625, "loss_aux_layer_10": 0.05987548828125, "loss_aux_layer_11": 0.0634765625, "loss_aux_layer_12": 0.068115234375, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.08203125, "loss_aux_layer_15": 0.0902099609375, "loss_aux_layer_16": 0.0994873046875, "loss_aux_layer_17": 0.1080322265625, "loss_aux_layer_18": 0.11572265625, "loss_aux_layer_19": 0.118896484375, "loss_aux_layer_2": 0.04632568359375, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05596923828125, "loss_aux_layer_4": 0.05810546875, "loss_aux_layer_5": 0.05975341796875, "loss_aux_layer_6": 0.06243896484375, "loss_aux_layer_7": 0.06036376953125, "loss_aux_layer_8": 0.0596923828125, "loss_aux_layer_9": 0.0584716796875, "step": 3883, "total_loss": 0.7013032734394073 }, { "epoch": 0.7689566422490596, "grad_norm": 1.126345157623291, "learning_rate": 5e-05, "llm_loss": 0.5384558662772179, "loss": 2.479, "loss_aux_layer_0": 0.013671875, "loss_aux_layer_1": 0.0311279296875, "loss_aux_layer_10": 0.05780029296875, "loss_aux_layer_11": 0.061767578125, "loss_aux_layer_12": 0.0662841796875, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.0799560546875, "loss_aux_layer_15": 0.0882568359375, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.10498046875, "loss_aux_layer_18": 0.1131591796875, "loss_aux_layer_19": 0.1170654296875, "loss_aux_layer_2": 0.0433349609375, "loss_aux_layer_20": 0.125, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.0523681640625, "loss_aux_layer_4": 0.0548095703125, "loss_aux_layer_5": 0.056396484375, "loss_aux_layer_6": 0.05926513671875, "loss_aux_layer_7": 0.057861328125, "loss_aux_layer_8": 0.05743408203125, "loss_aux_layer_9": 0.05657958984375, "step": 3884, "total_loss": 0.6197410076856613 }, { "epoch": 0.769154622846961, "grad_norm": 0.7986956834793091, "learning_rate": 5e-05, "llm_loss": 0.5517609566450119, "loss": 2.551, "loss_aux_layer_0": 0.0125274658203125, "loss_aux_layer_1": 0.03363037109375, "loss_aux_layer_10": 0.06353759765625, "loss_aux_layer_11": 0.06787109375, "loss_aux_layer_12": 0.072509765625, "loss_aux_layer_13": 0.077880859375, "loss_aux_layer_14": 0.086181640625, "loss_aux_layer_15": 0.0947265625, "loss_aux_layer_16": 0.103759765625, "loss_aux_layer_17": 0.1107177734375, "loss_aux_layer_18": 0.118408203125, "loss_aux_layer_19": 0.1207275390625, "loss_aux_layer_2": 0.04730224609375, "loss_aux_layer_20": 0.1279296875, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.196044921875, "loss_aux_layer_3": 0.05792236328125, "loss_aux_layer_4": 0.060546875, "loss_aux_layer_5": 0.06231689453125, "loss_aux_layer_6": 0.06524658203125, "loss_aux_layer_7": 0.0634765625, "loss_aux_layer_8": 0.06298828125, "loss_aux_layer_9": 0.06207275390625, "step": 3885, "total_loss": 0.6377405077219009 }, { "epoch": 0.7693526034448624, "grad_norm": 1.1466175317764282, "learning_rate": 5e-05, "llm_loss": 0.5840647518634796, "loss": 2.6523, "loss_aux_layer_0": 0.012969970703125, "loss_aux_layer_1": 0.031280517578125, "loss_aux_layer_10": 0.05609130859375, "loss_aux_layer_11": 0.0596923828125, "loss_aux_layer_12": 0.06396484375, "loss_aux_layer_13": 0.06982421875, "loss_aux_layer_14": 0.07763671875, "loss_aux_layer_15": 0.0853271484375, "loss_aux_layer_16": 0.0943603515625, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.042724609375, "loss_aux_layer_20": 0.121826171875, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.18603515625, "loss_aux_layer_3": 0.052001953125, "loss_aux_layer_4": 0.05438232421875, "loss_aux_layer_5": 0.0560302734375, "loss_aux_layer_6": 0.05865478515625, "loss_aux_layer_7": 0.05694580078125, "loss_aux_layer_8": 0.05615234375, "loss_aux_layer_9": 0.054931640625, "step": 3886, "total_loss": 0.6630829572677612 }, { "epoch": 0.7695505840427638, "grad_norm": 0.8201111555099487, "learning_rate": 5e-05, "llm_loss": 0.541932687163353, "loss": 2.492, "loss_aux_layer_0": 0.01239013671875, "loss_aux_layer_1": 0.031829833984375, "loss_aux_layer_10": 0.0584716796875, "loss_aux_layer_11": 0.0626220703125, "loss_aux_layer_12": 0.06689453125, "loss_aux_layer_13": 0.0726318359375, "loss_aux_layer_14": 0.080078125, "loss_aux_layer_15": 0.088134765625, "loss_aux_layer_16": 0.09716796875, "loss_aux_layer_17": 0.1048583984375, "loss_aux_layer_18": 0.1124267578125, "loss_aux_layer_19": 0.1156005859375, "loss_aux_layer_2": 0.04437255859375, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.0537109375, "loss_aux_layer_4": 0.05621337890625, "loss_aux_layer_5": 0.0577392578125, "loss_aux_layer_6": 0.06072998046875, "loss_aux_layer_7": 0.05889892578125, "loss_aux_layer_8": 0.05841064453125, "loss_aux_layer_9": 0.0572509765625, "step": 3887, "total_loss": 0.6230088323354721 }, { "epoch": 0.7697485646406652, "grad_norm": 0.9406412243843079, "learning_rate": 5e-05, "llm_loss": 0.5552659332752228, "loss": 2.5409, "loss_aux_layer_0": 0.01177978515625, "loss_aux_layer_1": 0.0308837890625, "loss_aux_layer_10": 0.0579833984375, "loss_aux_layer_11": 0.06182861328125, "loss_aux_layer_12": 0.066162109375, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.078369140625, "loss_aux_layer_15": 0.0863037109375, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.1026611328125, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.04351806640625, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.1300048828125, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.05328369140625, "loss_aux_layer_4": 0.055908203125, "loss_aux_layer_5": 0.0574951171875, "loss_aux_layer_6": 0.06005859375, "loss_aux_layer_7": 0.05816650390625, "loss_aux_layer_8": 0.0577392578125, "loss_aux_layer_9": 0.0567626953125, "step": 3888, "total_loss": 0.6352181732654572 }, { "epoch": 0.7699465452385666, "grad_norm": 0.9355872869491577, "learning_rate": 5e-05, "llm_loss": 0.5304032415151596, "loss": 2.4664, "loss_aux_layer_0": 0.0128173828125, "loss_aux_layer_1": 0.03375244140625, "loss_aux_layer_10": 0.06280517578125, "loss_aux_layer_11": 0.0672607421875, "loss_aux_layer_12": 0.0714111328125, "loss_aux_layer_13": 0.076904296875, "loss_aux_layer_14": 0.08544921875, "loss_aux_layer_15": 0.0941162109375, "loss_aux_layer_16": 0.1036376953125, "loss_aux_layer_17": 0.111328125, "loss_aux_layer_18": 0.119384765625, "loss_aux_layer_19": 0.12255859375, "loss_aux_layer_2": 0.0478515625, "loss_aux_layer_20": 0.1304931640625, "loss_aux_layer_21": 0.1385498046875, "loss_aux_layer_22": 0.16064453125, "loss_aux_layer_23": 0.19775390625, "loss_aux_layer_3": 0.05780029296875, "loss_aux_layer_4": 0.0601806640625, "loss_aux_layer_5": 0.06201171875, "loss_aux_layer_6": 0.0648193359375, "loss_aux_layer_7": 0.06317138671875, "loss_aux_layer_8": 0.0625, "loss_aux_layer_9": 0.06146240234375, "step": 3889, "total_loss": 0.61660435795784 }, { "epoch": 0.770144525836468, "grad_norm": 0.8913965225219727, "learning_rate": 5e-05, "llm_loss": 0.6034278273582458, "loss": 2.7427, "loss_aux_layer_0": 0.01171875, "loss_aux_layer_1": 0.031982421875, "loss_aux_layer_10": 0.060302734375, "loss_aux_layer_11": 0.0640869140625, "loss_aux_layer_12": 0.0684814453125, "loss_aux_layer_13": 0.07373046875, "loss_aux_layer_14": 0.081787109375, "loss_aux_layer_15": 0.0897216796875, "loss_aux_layer_16": 0.098388671875, "loss_aux_layer_17": 0.1058349609375, "loss_aux_layer_18": 0.1134033203125, "loss_aux_layer_19": 0.11669921875, "loss_aux_layer_2": 0.045166015625, "loss_aux_layer_20": 0.124755859375, "loss_aux_layer_21": 0.1329345703125, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.0550537109375, "loss_aux_layer_4": 0.0576171875, "loss_aux_layer_5": 0.05902099609375, "loss_aux_layer_6": 0.0621337890625, "loss_aux_layer_7": 0.06060791015625, "loss_aux_layer_8": 0.06005859375, "loss_aux_layer_9": 0.05889892578125, "step": 3890, "total_loss": 0.6856869161128998 }, { "epoch": 0.7703425064343694, "grad_norm": 0.8801788091659546, "learning_rate": 5e-05, "llm_loss": 0.6362516134977341, "loss": 2.8531, "loss_aux_layer_0": 0.013671875, "loss_aux_layer_1": 0.029937744140625, "loss_aux_layer_10": 0.054443359375, "loss_aux_layer_11": 0.05804443359375, "loss_aux_layer_12": 0.0623779296875, "loss_aux_layer_13": 0.0673828125, "loss_aux_layer_14": 0.075439453125, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.1005859375, "loss_aux_layer_18": 0.108642578125, "loss_aux_layer_19": 0.1121826171875, "loss_aux_layer_2": 0.041259765625, "loss_aux_layer_20": 0.11962890625, "loss_aux_layer_21": 0.1265869140625, "loss_aux_layer_22": 0.14599609375, "loss_aux_layer_23": 0.181396484375, "loss_aux_layer_3": 0.05023193359375, "loss_aux_layer_4": 0.0523681640625, "loss_aux_layer_5": 0.053955078125, "loss_aux_layer_6": 0.05645751953125, "loss_aux_layer_7": 0.0546875, "loss_aux_layer_8": 0.05419921875, "loss_aux_layer_9": 0.0531005859375, "step": 3891, "total_loss": 0.7132667750120163 }, { "epoch": 0.7705404870322708, "grad_norm": 0.8318515419960022, "learning_rate": 5e-05, "llm_loss": 0.6315974742174149, "loss": 2.8496, "loss_aux_layer_0": 0.011932373046875, "loss_aux_layer_1": 0.031707763671875, "loss_aux_layer_10": 0.0577392578125, "loss_aux_layer_11": 0.0614013671875, "loss_aux_layer_12": 0.0657958984375, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.0792236328125, "loss_aux_layer_15": 0.087158203125, "loss_aux_layer_16": 0.0960693359375, "loss_aux_layer_17": 0.103759765625, "loss_aux_layer_18": 0.1116943359375, "loss_aux_layer_19": 0.1153564453125, "loss_aux_layer_2": 0.04443359375, "loss_aux_layer_20": 0.124267578125, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05401611328125, "loss_aux_layer_4": 0.05584716796875, "loss_aux_layer_5": 0.0572509765625, "loss_aux_layer_6": 0.05999755859375, "loss_aux_layer_7": 0.05841064453125, "loss_aux_layer_8": 0.05755615234375, "loss_aux_layer_9": 0.056396484375, "step": 3892, "total_loss": 0.7123987823724747 }, { "epoch": 0.7707384676301723, "grad_norm": 0.7734124064445496, "learning_rate": 5e-05, "llm_loss": 0.49762915074825287, "loss": 2.3101, "loss_aux_layer_0": 0.012847900390625, "loss_aux_layer_1": 0.03106689453125, "loss_aux_layer_10": 0.057373046875, "loss_aux_layer_11": 0.061279296875, "loss_aux_layer_12": 0.0655517578125, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.086181640625, "loss_aux_layer_16": 0.0955810546875, "loss_aux_layer_17": 0.1029052734375, "loss_aux_layer_18": 0.1109619140625, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.04290771484375, "loss_aux_layer_20": 0.1217041015625, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.05230712890625, "loss_aux_layer_4": 0.05462646484375, "loss_aux_layer_5": 0.05633544921875, "loss_aux_layer_6": 0.0592041015625, "loss_aux_layer_7": 0.05767822265625, "loss_aux_layer_8": 0.05712890625, "loss_aux_layer_9": 0.05596923828125, "step": 3893, "total_loss": 0.5775184482336044 }, { "epoch": 0.7709364482280736, "grad_norm": 0.868334174156189, "learning_rate": 5e-05, "llm_loss": 0.6893751323223114, "loss": 3.0903, "loss_aux_layer_0": 0.0129852294921875, "loss_aux_layer_1": 0.032073974609375, "loss_aux_layer_10": 0.0599365234375, "loss_aux_layer_11": 0.06414794921875, "loss_aux_layer_12": 0.0687255859375, "loss_aux_layer_13": 0.073974609375, "loss_aux_layer_14": 0.082275390625, "loss_aux_layer_15": 0.0906982421875, "loss_aux_layer_16": 0.1002197265625, "loss_aux_layer_17": 0.1083984375, "loss_aux_layer_18": 0.1165771484375, "loss_aux_layer_19": 0.119873046875, "loss_aux_layer_2": 0.04425048828125, "loss_aux_layer_20": 0.127685546875, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.05413818359375, "loss_aux_layer_4": 0.05682373046875, "loss_aux_layer_5": 0.05865478515625, "loss_aux_layer_6": 0.0618896484375, "loss_aux_layer_7": 0.06011962890625, "loss_aux_layer_8": 0.0595703125, "loss_aux_layer_9": 0.05841064453125, "step": 3894, "total_loss": 0.7725819051265717 }, { "epoch": 0.771134428825975, "grad_norm": 0.6956474184989929, "learning_rate": 5e-05, "llm_loss": 0.5896204113960266, "loss": 2.68, "loss_aux_layer_0": 0.0118865966796875, "loss_aux_layer_1": 0.03143310546875, "loss_aux_layer_10": 0.0582275390625, "loss_aux_layer_11": 0.0621337890625, "loss_aux_layer_12": 0.066650390625, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.079833984375, "loss_aux_layer_15": 0.0877685546875, "loss_aux_layer_16": 0.0968017578125, "loss_aux_layer_17": 0.104248046875, "loss_aux_layer_18": 0.1121826171875, "loss_aux_layer_19": 0.1151123046875, "loss_aux_layer_2": 0.04345703125, "loss_aux_layer_20": 0.1229248046875, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.05322265625, "loss_aux_layer_4": 0.0557861328125, "loss_aux_layer_5": 0.05743408203125, "loss_aux_layer_6": 0.0604248046875, "loss_aux_layer_7": 0.05865478515625, "loss_aux_layer_8": 0.0582275390625, "loss_aux_layer_9": 0.05718994140625, "step": 3895, "total_loss": 0.669997438788414 }, { "epoch": 0.7713324094238765, "grad_norm": 0.8528833389282227, "learning_rate": 5e-05, "llm_loss": 0.6061221957206726, "loss": 2.7449, "loss_aux_layer_0": 0.0126953125, "loss_aux_layer_1": 0.03021240234375, "loss_aux_layer_10": 0.056884765625, "loss_aux_layer_11": 0.06072998046875, "loss_aux_layer_12": 0.06463623046875, "loss_aux_layer_13": 0.0697021484375, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.0968017578125, "loss_aux_layer_17": 0.104736328125, "loss_aux_layer_18": 0.11376953125, "loss_aux_layer_19": 0.117431640625, "loss_aux_layer_2": 0.041748046875, "loss_aux_layer_20": 0.125244140625, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.05145263671875, "loss_aux_layer_4": 0.0537109375, "loss_aux_layer_5": 0.055419921875, "loss_aux_layer_6": 0.058837890625, "loss_aux_layer_7": 0.05706787109375, "loss_aux_layer_8": 0.056396484375, "loss_aux_layer_9": 0.0555419921875, "step": 3896, "total_loss": 0.6862332075834274 }, { "epoch": 0.7715303900217778, "grad_norm": 0.8048486709594727, "learning_rate": 5e-05, "llm_loss": 0.5201791375875473, "loss": 2.4032, "loss_aux_layer_0": 0.01153564453125, "loss_aux_layer_1": 0.031005859375, "loss_aux_layer_10": 0.05712890625, "loss_aux_layer_11": 0.0611572265625, "loss_aux_layer_12": 0.0655517578125, "loss_aux_layer_13": 0.0704345703125, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.087158203125, "loss_aux_layer_16": 0.096435546875, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.1123046875, "loss_aux_layer_19": 0.11669921875, "loss_aux_layer_2": 0.04266357421875, "loss_aux_layer_20": 0.124755859375, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.052001953125, "loss_aux_layer_4": 0.05438232421875, "loss_aux_layer_5": 0.0562744140625, "loss_aux_layer_6": 0.05914306640625, "loss_aux_layer_7": 0.0576171875, "loss_aux_layer_8": 0.05706787109375, "loss_aux_layer_9": 0.05615234375, "step": 3897, "total_loss": 0.6008079200983047 }, { "epoch": 0.7717283706196792, "grad_norm": 0.8564968705177307, "learning_rate": 5e-05, "llm_loss": 0.6375151127576828, "loss": 2.854, "loss_aux_layer_0": 0.0118560791015625, "loss_aux_layer_1": 0.02899169921875, "loss_aux_layer_10": 0.05303955078125, "loss_aux_layer_11": 0.05682373046875, "loss_aux_layer_12": 0.06085205078125, "loss_aux_layer_13": 0.06585693359375, "loss_aux_layer_14": 0.0738525390625, "loss_aux_layer_15": 0.08203125, "loss_aux_layer_16": 0.0908203125, "loss_aux_layer_17": 0.098876953125, "loss_aux_layer_18": 0.107177734375, "loss_aux_layer_19": 0.1107177734375, "loss_aux_layer_2": 0.0400390625, "loss_aux_layer_20": 0.1185302734375, "loss_aux_layer_21": 0.126953125, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.0487060546875, "loss_aux_layer_4": 0.0509033203125, "loss_aux_layer_5": 0.05242919921875, "loss_aux_layer_6": 0.054931640625, "loss_aux_layer_7": 0.053466796875, "loss_aux_layer_8": 0.0528564453125, "loss_aux_layer_9": 0.0518798828125, "step": 3898, "total_loss": 0.7134950757026672 }, { "epoch": 0.7719263512175807, "grad_norm": 0.8377937078475952, "learning_rate": 5e-05, "llm_loss": 0.5810741037130356, "loss": 2.6443, "loss_aux_layer_0": 0.011810302734375, "loss_aux_layer_1": 0.03076171875, "loss_aux_layer_10": 0.05712890625, "loss_aux_layer_11": 0.06072998046875, "loss_aux_layer_12": 0.06512451171875, "loss_aux_layer_13": 0.0703125, "loss_aux_layer_14": 0.078369140625, "loss_aux_layer_15": 0.086181640625, "loss_aux_layer_16": 0.0953369140625, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.112060546875, "loss_aux_layer_19": 0.11572265625, "loss_aux_layer_2": 0.043212890625, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.0528564453125, "loss_aux_layer_4": 0.055419921875, "loss_aux_layer_5": 0.056884765625, "loss_aux_layer_6": 0.05938720703125, "loss_aux_layer_7": 0.05767822265625, "loss_aux_layer_8": 0.05718994140625, "loss_aux_layer_9": 0.05596923828125, "step": 3899, "total_loss": 0.6610738635063171 }, { "epoch": 0.7721243318154821, "grad_norm": 0.9627699851989746, "learning_rate": 5e-05, "llm_loss": 0.5552821457386017, "loss": 2.5434, "loss_aux_layer_0": 0.0118255615234375, "loss_aux_layer_1": 0.031097412109375, "loss_aux_layer_10": 0.05743408203125, "loss_aux_layer_11": 0.06121826171875, "loss_aux_layer_12": 0.06561279296875, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.0789794921875, "loss_aux_layer_15": 0.0875244140625, "loss_aux_layer_16": 0.0965576171875, "loss_aux_layer_17": 0.1041259765625, "loss_aux_layer_18": 0.112548828125, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.04400634765625, "loss_aux_layer_20": 0.1239013671875, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.0533447265625, "loss_aux_layer_4": 0.05535888671875, "loss_aux_layer_5": 0.0570068359375, "loss_aux_layer_6": 0.05938720703125, "loss_aux_layer_7": 0.057861328125, "loss_aux_layer_8": 0.05712890625, "loss_aux_layer_9": 0.05621337890625, "step": 3900, "total_loss": 0.6358597278594971 }, { "epoch": 0.7723223124133834, "grad_norm": 1.0511528253555298, "learning_rate": 5e-05, "llm_loss": 0.5174725279211998, "loss": 2.3909, "loss_aux_layer_0": 0.01171875, "loss_aux_layer_1": 0.03082275390625, "loss_aux_layer_10": 0.05755615234375, "loss_aux_layer_11": 0.0615234375, "loss_aux_layer_12": 0.06591796875, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.079345703125, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.0960693359375, "loss_aux_layer_17": 0.1038818359375, "loss_aux_layer_18": 0.1119384765625, "loss_aux_layer_19": 0.114990234375, "loss_aux_layer_2": 0.04278564453125, "loss_aux_layer_20": 0.1226806640625, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.052490234375, "loss_aux_layer_4": 0.05499267578125, "loss_aux_layer_5": 0.056640625, "loss_aux_layer_6": 0.05963134765625, "loss_aux_layer_7": 0.0579833984375, "loss_aux_layer_8": 0.05743408203125, "loss_aux_layer_9": 0.05621337890625, "step": 3901, "total_loss": 0.5977154672145844 }, { "epoch": 0.7725202930112849, "grad_norm": 1.011602759361267, "learning_rate": 5e-05, "llm_loss": 0.6023091375827789, "loss": 2.7263, "loss_aux_layer_0": 0.011932373046875, "loss_aux_layer_1": 0.0301513671875, "loss_aux_layer_10": 0.05718994140625, "loss_aux_layer_11": 0.0611572265625, "loss_aux_layer_12": 0.0653076171875, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.085693359375, "loss_aux_layer_16": 0.094482421875, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.110595703125, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.042236328125, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.05157470703125, "loss_aux_layer_4": 0.05426025390625, "loss_aux_layer_5": 0.0560302734375, "loss_aux_layer_6": 0.059326171875, "loss_aux_layer_7": 0.05767822265625, "loss_aux_layer_8": 0.05694580078125, "loss_aux_layer_9": 0.05584716796875, "step": 3902, "total_loss": 0.6815695762634277 }, { "epoch": 0.7727182736091863, "grad_norm": 1.2432411909103394, "learning_rate": 5e-05, "llm_loss": 0.5752330347895622, "loss": 2.6136, "loss_aux_layer_0": 0.013336181640625, "loss_aux_layer_1": 0.029632568359375, "loss_aux_layer_10": 0.0548095703125, "loss_aux_layer_11": 0.0584716796875, "loss_aux_layer_12": 0.06292724609375, "loss_aux_layer_13": 0.06842041015625, "loss_aux_layer_14": 0.0770263671875, "loss_aux_layer_15": 0.0855712890625, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.111083984375, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.04058837890625, "loss_aux_layer_20": 0.122314453125, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.049560546875, "loss_aux_layer_4": 0.05206298828125, "loss_aux_layer_5": 0.05377197265625, "loss_aux_layer_6": 0.05694580078125, "loss_aux_layer_7": 0.05499267578125, "loss_aux_layer_8": 0.05462646484375, "loss_aux_layer_9": 0.0537109375, "step": 3903, "total_loss": 0.6533969789743423 }, { "epoch": 0.7729162542070878, "grad_norm": 1.0054765939712524, "learning_rate": 5e-05, "llm_loss": 0.6061987429857254, "loss": 2.7447, "loss_aux_layer_0": 0.012176513671875, "loss_aux_layer_1": 0.0318603515625, "loss_aux_layer_10": 0.05792236328125, "loss_aux_layer_11": 0.061767578125, "loss_aux_layer_12": 0.0655517578125, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.07861328125, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.095458984375, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.1114501953125, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.0443115234375, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.0540771484375, "loss_aux_layer_4": 0.056396484375, "loss_aux_layer_5": 0.05780029296875, "loss_aux_layer_6": 0.060302734375, "loss_aux_layer_7": 0.058349609375, "loss_aux_layer_8": 0.057861328125, "loss_aux_layer_9": 0.05682373046875, "step": 3904, "total_loss": 0.6861664652824402 }, { "epoch": 0.7731142348049891, "grad_norm": 1.0371708869934082, "learning_rate": 5e-05, "llm_loss": 0.5707332715392113, "loss": 2.6031, "loss_aux_layer_0": 0.01275634765625, "loss_aux_layer_1": 0.0311279296875, "loss_aux_layer_10": 0.056884765625, "loss_aux_layer_11": 0.060791015625, "loss_aux_layer_12": 0.0653076171875, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.0869140625, "loss_aux_layer_16": 0.09619140625, "loss_aux_layer_17": 0.103759765625, "loss_aux_layer_18": 0.112060546875, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.04327392578125, "loss_aux_layer_20": 0.1239013671875, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.052734375, "loss_aux_layer_4": 0.05487060546875, "loss_aux_layer_5": 0.05645751953125, "loss_aux_layer_6": 0.058837890625, "loss_aux_layer_7": 0.05706787109375, "loss_aux_layer_8": 0.0565185546875, "loss_aux_layer_9": 0.0552978515625, "step": 3905, "total_loss": 0.6507794409990311 }, { "epoch": 0.7733122154028905, "grad_norm": 0.9491729140281677, "learning_rate": 5e-05, "llm_loss": 0.5254645049571991, "loss": 2.4396, "loss_aux_layer_0": 0.0124664306640625, "loss_aux_layer_1": 0.0325927734375, "loss_aux_layer_10": 0.06158447265625, "loss_aux_layer_11": 0.065673828125, "loss_aux_layer_12": 0.0703125, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.083984375, "loss_aux_layer_15": 0.092041015625, "loss_aux_layer_16": 0.1011962890625, "loss_aux_layer_17": 0.1087646484375, "loss_aux_layer_18": 0.11669921875, "loss_aux_layer_19": 0.1201171875, "loss_aux_layer_2": 0.04595947265625, "loss_aux_layer_20": 0.1279296875, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.05615234375, "loss_aux_layer_4": 0.05889892578125, "loss_aux_layer_5": 0.06060791015625, "loss_aux_layer_6": 0.06390380859375, "loss_aux_layer_7": 0.0621337890625, "loss_aux_layer_8": 0.06158447265625, "loss_aux_layer_9": 0.060302734375, "step": 3906, "total_loss": 0.6098878681659698 }, { "epoch": 0.773510196000792, "grad_norm": 0.9123111963272095, "learning_rate": 5e-05, "llm_loss": 0.50794817507267, "loss": 2.3622, "loss_aux_layer_0": 0.01220703125, "loss_aux_layer_1": 0.03179931640625, "loss_aux_layer_10": 0.05902099609375, "loss_aux_layer_11": 0.0628662109375, "loss_aux_layer_12": 0.06756591796875, "loss_aux_layer_13": 0.072998046875, "loss_aux_layer_14": 0.0814208984375, "loss_aux_layer_15": 0.0897216796875, "loss_aux_layer_16": 0.0989990234375, "loss_aux_layer_17": 0.10693359375, "loss_aux_layer_18": 0.1153564453125, "loss_aux_layer_19": 0.1195068359375, "loss_aux_layer_2": 0.04449462890625, "loss_aux_layer_20": 0.1273193359375, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.0540771484375, "loss_aux_layer_4": 0.0567626953125, "loss_aux_layer_5": 0.05816650390625, "loss_aux_layer_6": 0.06109619140625, "loss_aux_layer_7": 0.05950927734375, "loss_aux_layer_8": 0.058837890625, "loss_aux_layer_9": 0.05780029296875, "step": 3907, "total_loss": 0.5905470997095108 }, { "epoch": 0.7737081765986933, "grad_norm": 0.971411943435669, "learning_rate": 5e-05, "llm_loss": 0.605997622013092, "loss": 2.7405, "loss_aux_layer_0": 0.0133819580078125, "loss_aux_layer_1": 0.0301513671875, "loss_aux_layer_10": 0.0555419921875, "loss_aux_layer_11": 0.05926513671875, "loss_aux_layer_12": 0.06378173828125, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.077880859375, "loss_aux_layer_15": 0.086181640625, "loss_aux_layer_16": 0.0960693359375, "loss_aux_layer_17": 0.1043701171875, "loss_aux_layer_18": 0.113037109375, "loss_aux_layer_19": 0.1171875, "loss_aux_layer_2": 0.04119873046875, "loss_aux_layer_20": 0.125244140625, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.0504150390625, "loss_aux_layer_4": 0.05291748046875, "loss_aux_layer_5": 0.0543212890625, "loss_aux_layer_6": 0.0572509765625, "loss_aux_layer_7": 0.05535888671875, "loss_aux_layer_8": 0.05499267578125, "loss_aux_layer_9": 0.0543212890625, "step": 3908, "total_loss": 0.6851243078708649 }, { "epoch": 0.7739061571965947, "grad_norm": 0.9475653767585754, "learning_rate": 5e-05, "llm_loss": 0.45736126601696014, "loss": 2.1547, "loss_aux_layer_0": 0.01153564453125, "loss_aux_layer_1": 0.030792236328125, "loss_aux_layer_10": 0.0577392578125, "loss_aux_layer_11": 0.061767578125, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.080078125, "loss_aux_layer_15": 0.0887451171875, "loss_aux_layer_16": 0.0982666015625, "loss_aux_layer_17": 0.1058349609375, "loss_aux_layer_18": 0.1146240234375, "loss_aux_layer_19": 0.1180419921875, "loss_aux_layer_2": 0.0430908203125, "loss_aux_layer_20": 0.1259765625, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.193115234375, "loss_aux_layer_3": 0.0523681640625, "loss_aux_layer_4": 0.0548095703125, "loss_aux_layer_5": 0.05633544921875, "loss_aux_layer_6": 0.059326171875, "loss_aux_layer_7": 0.05780029296875, "loss_aux_layer_8": 0.05731201171875, "loss_aux_layer_9": 0.05615234375, "step": 3909, "total_loss": 0.5386730432510376 }, { "epoch": 0.7741041377944962, "grad_norm": 0.7740710377693176, "learning_rate": 5e-05, "llm_loss": 0.4536973834037781, "loss": 2.1402, "loss_aux_layer_0": 0.0125274658203125, "loss_aux_layer_1": 0.03070068359375, "loss_aux_layer_10": 0.05792236328125, "loss_aux_layer_11": 0.06170654296875, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.0889892578125, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.1058349609375, "loss_aux_layer_18": 0.114013671875, "loss_aux_layer_19": 0.117431640625, "loss_aux_layer_2": 0.04339599609375, "loss_aux_layer_20": 0.1256103515625, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.052734375, "loss_aux_layer_4": 0.05517578125, "loss_aux_layer_5": 0.0567626953125, "loss_aux_layer_6": 0.05963134765625, "loss_aux_layer_7": 0.05804443359375, "loss_aux_layer_8": 0.05767822265625, "loss_aux_layer_9": 0.0565185546875, "step": 3910, "total_loss": 0.5350495204329491 }, { "epoch": 0.7743021183923976, "grad_norm": 1.0883971452713013, "learning_rate": 5e-05, "llm_loss": 0.552847295999527, "loss": 2.5424, "loss_aux_layer_0": 0.0130157470703125, "loss_aux_layer_1": 0.03192138671875, "loss_aux_layer_10": 0.0596923828125, "loss_aux_layer_11": 0.06396484375, "loss_aux_layer_12": 0.068115234375, "loss_aux_layer_13": 0.0732421875, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.08984375, "loss_aux_layer_16": 0.0986328125, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.1146240234375, "loss_aux_layer_19": 0.1181640625, "loss_aux_layer_2": 0.04541015625, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.05548095703125, "loss_aux_layer_4": 0.0579833984375, "loss_aux_layer_5": 0.0594482421875, "loss_aux_layer_6": 0.06231689453125, "loss_aux_layer_7": 0.0604248046875, "loss_aux_layer_8": 0.05975341796875, "loss_aux_layer_9": 0.05841064453125, "step": 3911, "total_loss": 0.6355886161327362 }, { "epoch": 0.7745000989902989, "grad_norm": 0.9566180109977722, "learning_rate": 5e-05, "llm_loss": 0.5893744975328445, "loss": 2.6865, "loss_aux_layer_0": 0.01171875, "loss_aux_layer_1": 0.03228759765625, "loss_aux_layer_10": 0.059326171875, "loss_aux_layer_11": 0.06365966796875, "loss_aux_layer_12": 0.06787109375, "loss_aux_layer_13": 0.0736083984375, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.08984375, "loss_aux_layer_16": 0.0994873046875, "loss_aux_layer_17": 0.1064453125, "loss_aux_layer_18": 0.114013671875, "loss_aux_layer_19": 0.1173095703125, "loss_aux_layer_2": 0.044921875, "loss_aux_layer_20": 0.1253662109375, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.0546875, "loss_aux_layer_4": 0.0570068359375, "loss_aux_layer_5": 0.05853271484375, "loss_aux_layer_6": 0.06109619140625, "loss_aux_layer_7": 0.059814453125, "loss_aux_layer_8": 0.059326171875, "loss_aux_layer_9": 0.0579833984375, "step": 3912, "total_loss": 0.6716181635856628 }, { "epoch": 0.7746980795882004, "grad_norm": 1.1892764568328857, "learning_rate": 5e-05, "llm_loss": 0.5427858307957649, "loss": 2.4983, "loss_aux_layer_0": 0.0122528076171875, "loss_aux_layer_1": 0.031036376953125, "loss_aux_layer_10": 0.05804443359375, "loss_aux_layer_11": 0.0616455078125, "loss_aux_layer_12": 0.06597900390625, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.08056640625, "loss_aux_layer_15": 0.089599609375, "loss_aux_layer_16": 0.09912109375, "loss_aux_layer_17": 0.1068115234375, "loss_aux_layer_18": 0.115478515625, "loss_aux_layer_19": 0.1190185546875, "loss_aux_layer_2": 0.0435791015625, "loss_aux_layer_20": 0.126953125, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.0531005859375, "loss_aux_layer_4": 0.0556640625, "loss_aux_layer_5": 0.057373046875, "loss_aux_layer_6": 0.06011962890625, "loss_aux_layer_7": 0.05853271484375, "loss_aux_layer_8": 0.0577392578125, "loss_aux_layer_9": 0.05682373046875, "step": 3913, "total_loss": 0.6245659440755844 }, { "epoch": 0.7748960601861018, "grad_norm": 0.9288524389266968, "learning_rate": 5e-05, "llm_loss": 0.5373601913452148, "loss": 2.4727, "loss_aux_layer_0": 0.0131683349609375, "loss_aux_layer_1": 0.031982421875, "loss_aux_layer_10": 0.057861328125, "loss_aux_layer_11": 0.06158447265625, "loss_aux_layer_12": 0.0660400390625, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.0794677734375, "loss_aux_layer_15": 0.0875244140625, "loss_aux_layer_16": 0.0966796875, "loss_aux_layer_17": 0.104736328125, "loss_aux_layer_18": 0.11328125, "loss_aux_layer_19": 0.115966796875, "loss_aux_layer_2": 0.0450439453125, "loss_aux_layer_20": 0.12353515625, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.0543212890625, "loss_aux_layer_4": 0.05645751953125, "loss_aux_layer_5": 0.0576171875, "loss_aux_layer_6": 0.0601806640625, "loss_aux_layer_7": 0.058349609375, "loss_aux_layer_8": 0.0577392578125, "loss_aux_layer_9": 0.05670166015625, "step": 3914, "total_loss": 0.6181671023368835 }, { "epoch": 0.7750940407840031, "grad_norm": 1.1194216012954712, "learning_rate": 5e-05, "llm_loss": 0.5920230001211166, "loss": 2.6814, "loss_aux_layer_0": 0.01202392578125, "loss_aux_layer_1": 0.02923583984375, "loss_aux_layer_10": 0.05517578125, "loss_aux_layer_11": 0.05902099609375, "loss_aux_layer_12": 0.06365966796875, "loss_aux_layer_13": 0.06884765625, "loss_aux_layer_14": 0.0770263671875, "loss_aux_layer_15": 0.08544921875, "loss_aux_layer_16": 0.0947265625, "loss_aux_layer_17": 0.1026611328125, "loss_aux_layer_18": 0.1109619140625, "loss_aux_layer_19": 0.1143798828125, "loss_aux_layer_2": 0.0411376953125, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.05010986328125, "loss_aux_layer_4": 0.05255126953125, "loss_aux_layer_5": 0.05389404296875, "loss_aux_layer_6": 0.0567626953125, "loss_aux_layer_7": 0.05517578125, "loss_aux_layer_8": 0.05474853515625, "loss_aux_layer_9": 0.05377197265625, "step": 3915, "total_loss": 0.6703443080186844 }, { "epoch": 0.7752920213819046, "grad_norm": 0.9246846437454224, "learning_rate": 5e-05, "llm_loss": 0.5327749028801918, "loss": 2.4602, "loss_aux_layer_0": 0.0127105712890625, "loss_aux_layer_1": 0.03204345703125, "loss_aux_layer_10": 0.05950927734375, "loss_aux_layer_11": 0.06365966796875, "loss_aux_layer_12": 0.06805419921875, "loss_aux_layer_13": 0.073486328125, "loss_aux_layer_14": 0.08154296875, "loss_aux_layer_15": 0.0897216796875, "loss_aux_layer_16": 0.0987548828125, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.1143798828125, "loss_aux_layer_19": 0.11767578125, "loss_aux_layer_2": 0.04425048828125, "loss_aux_layer_20": 0.125244140625, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.190673828125, "loss_aux_layer_3": 0.0543212890625, "loss_aux_layer_4": 0.05682373046875, "loss_aux_layer_5": 0.05865478515625, "loss_aux_layer_6": 0.06170654296875, "loss_aux_layer_7": 0.05987548828125, "loss_aux_layer_8": 0.05926513671875, "loss_aux_layer_9": 0.05810546875, "step": 3916, "total_loss": 0.615050695836544 }, { "epoch": 0.775490001979806, "grad_norm": 1.0943779945373535, "learning_rate": 5e-05, "llm_loss": 0.5820609480142593, "loss": 2.6517, "loss_aux_layer_0": 0.012420654296875, "loss_aux_layer_1": 0.03167724609375, "loss_aux_layer_10": 0.0577392578125, "loss_aux_layer_11": 0.0615234375, "loss_aux_layer_12": 0.06622314453125, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.0799560546875, "loss_aux_layer_15": 0.087890625, "loss_aux_layer_16": 0.0970458984375, "loss_aux_layer_17": 0.10498046875, "loss_aux_layer_18": 0.11279296875, "loss_aux_layer_19": 0.115966796875, "loss_aux_layer_2": 0.0443115234375, "loss_aux_layer_20": 0.1239013671875, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.05352783203125, "loss_aux_layer_4": 0.055908203125, "loss_aux_layer_5": 0.05731201171875, "loss_aux_layer_6": 0.05987548828125, "loss_aux_layer_7": 0.05810546875, "loss_aux_layer_8": 0.0577392578125, "loss_aux_layer_9": 0.0565185546875, "step": 3917, "total_loss": 0.6629178076982498 }, { "epoch": 0.7756879825777074, "grad_norm": 1.0555109977722168, "learning_rate": 5e-05, "llm_loss": 0.6541298627853394, "loss": 2.9495, "loss_aux_layer_0": 0.0135650634765625, "loss_aux_layer_1": 0.032257080078125, "loss_aux_layer_10": 0.0596923828125, "loss_aux_layer_11": 0.06402587890625, "loss_aux_layer_12": 0.06854248046875, "loss_aux_layer_13": 0.0738525390625, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.0904541015625, "loss_aux_layer_16": 0.0999755859375, "loss_aux_layer_17": 0.107421875, "loss_aux_layer_18": 0.1156005859375, "loss_aux_layer_19": 0.1187744140625, "loss_aux_layer_2": 0.0450439453125, "loss_aux_layer_20": 0.1273193359375, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.194580078125, "loss_aux_layer_3": 0.05499267578125, "loss_aux_layer_4": 0.05755615234375, "loss_aux_layer_5": 0.05908203125, "loss_aux_layer_6": 0.0616455078125, "loss_aux_layer_7": 0.05999755859375, "loss_aux_layer_8": 0.0596923828125, "loss_aux_layer_9": 0.0582275390625, "step": 3918, "total_loss": 0.7373795360326767 }, { "epoch": 0.7758859631756088, "grad_norm": 0.8801319003105164, "learning_rate": 5e-05, "llm_loss": 0.5662890076637268, "loss": 2.6032, "loss_aux_layer_0": 0.0123748779296875, "loss_aux_layer_1": 0.03277587890625, "loss_aux_layer_10": 0.0615234375, "loss_aux_layer_11": 0.06591796875, "loss_aux_layer_12": 0.0703125, "loss_aux_layer_13": 0.0758056640625, "loss_aux_layer_14": 0.083984375, "loss_aux_layer_15": 0.092041015625, "loss_aux_layer_16": 0.101318359375, "loss_aux_layer_17": 0.108642578125, "loss_aux_layer_18": 0.1170654296875, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.04638671875, "loss_aux_layer_20": 0.1273193359375, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.056396484375, "loss_aux_layer_4": 0.058837890625, "loss_aux_layer_5": 0.06048583984375, "loss_aux_layer_6": 0.0633544921875, "loss_aux_layer_7": 0.0616455078125, "loss_aux_layer_8": 0.06109619140625, "loss_aux_layer_9": 0.05999755859375, "step": 3919, "total_loss": 0.6508040875196457 }, { "epoch": 0.7760839437735102, "grad_norm": 1.1562092304229736, "learning_rate": 5e-05, "llm_loss": 0.6620963364839554, "loss": 2.9831, "loss_aux_layer_0": 0.0133819580078125, "loss_aux_layer_1": 0.03271484375, "loss_aux_layer_10": 0.0595703125, "loss_aux_layer_11": 0.0635986328125, "loss_aux_layer_12": 0.068115234375, "loss_aux_layer_13": 0.0740966796875, "loss_aux_layer_14": 0.0830078125, "loss_aux_layer_15": 0.0916748046875, "loss_aux_layer_16": 0.1011962890625, "loss_aux_layer_17": 0.1090087890625, "loss_aux_layer_18": 0.1180419921875, "loss_aux_layer_19": 0.1212158203125, "loss_aux_layer_2": 0.04559326171875, "loss_aux_layer_20": 0.12939453125, "loss_aux_layer_21": 0.136962890625, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.05474853515625, "loss_aux_layer_4": 0.05718994140625, "loss_aux_layer_5": 0.05877685546875, "loss_aux_layer_6": 0.06170654296875, "loss_aux_layer_7": 0.06005859375, "loss_aux_layer_8": 0.0595703125, "loss_aux_layer_9": 0.0579833984375, "step": 3920, "total_loss": 0.7457736879587173 }, { "epoch": 0.7762819243714116, "grad_norm": 0.8905764222145081, "learning_rate": 5e-05, "llm_loss": 0.6056051105260849, "loss": 2.7425, "loss_aux_layer_0": 0.01226806640625, "loss_aux_layer_1": 0.031402587890625, "loss_aux_layer_10": 0.056884765625, "loss_aux_layer_11": 0.06085205078125, "loss_aux_layer_12": 0.065185546875, "loss_aux_layer_13": 0.0704345703125, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.095703125, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.1112060546875, "loss_aux_layer_19": 0.1148681640625, "loss_aux_layer_2": 0.04339599609375, "loss_aux_layer_20": 0.122802734375, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.05255126953125, "loss_aux_layer_4": 0.0548095703125, "loss_aux_layer_5": 0.05633544921875, "loss_aux_layer_6": 0.05902099609375, "loss_aux_layer_7": 0.05743408203125, "loss_aux_layer_8": 0.05670166015625, "loss_aux_layer_9": 0.05548095703125, "step": 3921, "total_loss": 0.6856371611356735 }, { "epoch": 0.776479904969313, "grad_norm": 1.056577444076538, "learning_rate": 5e-05, "llm_loss": 0.6230732500553131, "loss": 2.8185, "loss_aux_layer_0": 0.0129547119140625, "loss_aux_layer_1": 0.030914306640625, "loss_aux_layer_10": 0.0576171875, "loss_aux_layer_11": 0.0615234375, "loss_aux_layer_12": 0.066162109375, "loss_aux_layer_13": 0.072021484375, "loss_aux_layer_14": 0.0809326171875, "loss_aux_layer_15": 0.090087890625, "loss_aux_layer_16": 0.0997314453125, "loss_aux_layer_17": 0.1082763671875, "loss_aux_layer_18": 0.11669921875, "loss_aux_layer_19": 0.1195068359375, "loss_aux_layer_2": 0.0423583984375, "loss_aux_layer_20": 0.127685546875, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.0518798828125, "loss_aux_layer_4": 0.0543212890625, "loss_aux_layer_5": 0.05615234375, "loss_aux_layer_6": 0.0595703125, "loss_aux_layer_7": 0.05780029296875, "loss_aux_layer_8": 0.05731201171875, "loss_aux_layer_9": 0.05621337890625, "step": 3922, "total_loss": 0.7046236842870712 }, { "epoch": 0.7766778855672144, "grad_norm": 1.3593188524246216, "learning_rate": 5e-05, "llm_loss": 0.6304613053798676, "loss": 2.8507, "loss_aux_layer_0": 0.0121612548828125, "loss_aux_layer_1": 0.03192138671875, "loss_aux_layer_10": 0.05889892578125, "loss_aux_layer_11": 0.06292724609375, "loss_aux_layer_12": 0.0675048828125, "loss_aux_layer_13": 0.072998046875, "loss_aux_layer_14": 0.08154296875, "loss_aux_layer_15": 0.090087890625, "loss_aux_layer_16": 0.0992431640625, "loss_aux_layer_17": 0.1070556640625, "loss_aux_layer_18": 0.1153564453125, "loss_aux_layer_19": 0.117919921875, "loss_aux_layer_2": 0.044921875, "loss_aux_layer_20": 0.125732421875, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.05462646484375, "loss_aux_layer_4": 0.056884765625, "loss_aux_layer_5": 0.0582275390625, "loss_aux_layer_6": 0.06146240234375, "loss_aux_layer_7": 0.059814453125, "loss_aux_layer_8": 0.0589599609375, "loss_aux_layer_9": 0.0577392578125, "step": 3923, "total_loss": 0.7126642316579819 }, { "epoch": 0.7768758661651158, "grad_norm": 0.9543038606643677, "learning_rate": 5e-05, "llm_loss": 0.4737386777997017, "loss": 2.2298, "loss_aux_layer_0": 0.01397705078125, "loss_aux_layer_1": 0.033538818359375, "loss_aux_layer_10": 0.06011962890625, "loss_aux_layer_11": 0.06414794921875, "loss_aux_layer_12": 0.0687255859375, "loss_aux_layer_13": 0.07421875, "loss_aux_layer_14": 0.082275390625, "loss_aux_layer_15": 0.090576171875, "loss_aux_layer_16": 0.0997314453125, "loss_aux_layer_17": 0.1075439453125, "loss_aux_layer_18": 0.115966796875, "loss_aux_layer_19": 0.1195068359375, "loss_aux_layer_2": 0.04681396484375, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.19482421875, "loss_aux_layer_3": 0.05657958984375, "loss_aux_layer_4": 0.05877685546875, "loss_aux_layer_5": 0.05999755859375, "loss_aux_layer_6": 0.062744140625, "loss_aux_layer_7": 0.06097412109375, "loss_aux_layer_8": 0.06011962890625, "loss_aux_layer_9": 0.0587158203125, "step": 3924, "total_loss": 0.5574541836977005 }, { "epoch": 0.7770738467630173, "grad_norm": 0.9770588874816895, "learning_rate": 5e-05, "llm_loss": 0.5942331999540329, "loss": 2.692, "loss_aux_layer_0": 0.0125579833984375, "loss_aux_layer_1": 0.0299072265625, "loss_aux_layer_10": 0.05572509765625, "loss_aux_layer_11": 0.05938720703125, "loss_aux_layer_12": 0.06396484375, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.078125, "loss_aux_layer_15": 0.0859375, "loss_aux_layer_16": 0.095703125, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.1121826171875, "loss_aux_layer_19": 0.115478515625, "loss_aux_layer_2": 0.04132080078125, "loss_aux_layer_20": 0.1234130859375, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.05047607421875, "loss_aux_layer_4": 0.05267333984375, "loss_aux_layer_5": 0.05438232421875, "loss_aux_layer_6": 0.05712890625, "loss_aux_layer_7": 0.05535888671875, "loss_aux_layer_8": 0.05499267578125, "loss_aux_layer_9": 0.05419921875, "step": 3925, "total_loss": 0.6729971617460251 }, { "epoch": 0.7772718273609186, "grad_norm": 0.9894850850105286, "learning_rate": 5e-05, "llm_loss": 0.538733497262001, "loss": 2.4689, "loss_aux_layer_0": 0.014434814453125, "loss_aux_layer_1": 0.030609130859375, "loss_aux_layer_10": 0.0546875, "loss_aux_layer_11": 0.05841064453125, "loss_aux_layer_12": 0.0623779296875, "loss_aux_layer_13": 0.06787109375, "loss_aux_layer_14": 0.0762939453125, "loss_aux_layer_15": 0.084716796875, "loss_aux_layer_16": 0.09375, "loss_aux_layer_17": 0.1015625, "loss_aux_layer_18": 0.110107421875, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.04180908203125, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.0509033203125, "loss_aux_layer_4": 0.05291748046875, "loss_aux_layer_5": 0.0543212890625, "loss_aux_layer_6": 0.056884765625, "loss_aux_layer_7": 0.05523681640625, "loss_aux_layer_8": 0.0545654296875, "loss_aux_layer_9": 0.05340576171875, "step": 3926, "total_loss": 0.6172137409448624 }, { "epoch": 0.77746980795882, "grad_norm": 0.9386767745018005, "learning_rate": 5e-05, "llm_loss": 0.5948175489902496, "loss": 2.693, "loss_aux_layer_0": 0.0125885009765625, "loss_aux_layer_1": 0.030670166015625, "loss_aux_layer_10": 0.0555419921875, "loss_aux_layer_11": 0.05902099609375, "loss_aux_layer_12": 0.0633544921875, "loss_aux_layer_13": 0.0687255859375, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.0947265625, "loss_aux_layer_17": 0.102783203125, "loss_aux_layer_18": 0.111083984375, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.0421142578125, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.05126953125, "loss_aux_layer_4": 0.05352783203125, "loss_aux_layer_5": 0.05511474609375, "loss_aux_layer_6": 0.0577392578125, "loss_aux_layer_7": 0.05615234375, "loss_aux_layer_8": 0.05548095703125, "loss_aux_layer_9": 0.054443359375, "step": 3927, "total_loss": 0.6732473969459534 }, { "epoch": 0.7776677885567215, "grad_norm": 1.1572860479354858, "learning_rate": 5e-05, "llm_loss": 0.5233663320541382, "loss": 2.4101, "loss_aux_layer_0": 0.0152130126953125, "loss_aux_layer_1": 0.030670166015625, "loss_aux_layer_10": 0.05633544921875, "loss_aux_layer_11": 0.05987548828125, "loss_aux_layer_12": 0.0643310546875, "loss_aux_layer_13": 0.069580078125, "loss_aux_layer_14": 0.077880859375, "loss_aux_layer_15": 0.0860595703125, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.1025390625, "loss_aux_layer_18": 0.1103515625, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.0418701171875, "loss_aux_layer_20": 0.1221923828125, "loss_aux_layer_21": 0.1300048828125, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.05108642578125, "loss_aux_layer_4": 0.05352783203125, "loss_aux_layer_5": 0.0552978515625, "loss_aux_layer_6": 0.0582275390625, "loss_aux_layer_7": 0.05670166015625, "loss_aux_layer_8": 0.05621337890625, "loss_aux_layer_9": 0.05517578125, "step": 3928, "total_loss": 0.6025286838412285 }, { "epoch": 0.7778657691546228, "grad_norm": 0.8751861453056335, "learning_rate": 5e-05, "llm_loss": 0.633716031908989, "loss": 2.8698, "loss_aux_layer_0": 0.0128326416015625, "loss_aux_layer_1": 0.03302001953125, "loss_aux_layer_10": 0.06146240234375, "loss_aux_layer_11": 0.065673828125, "loss_aux_layer_12": 0.0701904296875, "loss_aux_layer_13": 0.0751953125, "loss_aux_layer_14": 0.0831298828125, "loss_aux_layer_15": 0.09130859375, "loss_aux_layer_16": 0.1002197265625, "loss_aux_layer_17": 0.107666015625, "loss_aux_layer_18": 0.1153564453125, "loss_aux_layer_19": 0.117919921875, "loss_aux_layer_2": 0.04638671875, "loss_aux_layer_20": 0.125732421875, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.056640625, "loss_aux_layer_4": 0.0594482421875, "loss_aux_layer_5": 0.06121826171875, "loss_aux_layer_6": 0.06390380859375, "loss_aux_layer_7": 0.06219482421875, "loss_aux_layer_8": 0.06146240234375, "loss_aux_layer_9": 0.0601806640625, "step": 3929, "total_loss": 0.7174511700868607 }, { "epoch": 0.7780637497525242, "grad_norm": 1.063677430152893, "learning_rate": 5e-05, "llm_loss": 0.6201871484518051, "loss": 2.8154, "loss_aux_layer_0": 0.014190673828125, "loss_aux_layer_1": 0.03369140625, "loss_aux_layer_10": 0.06085205078125, "loss_aux_layer_11": 0.0648193359375, "loss_aux_layer_12": 0.0692138671875, "loss_aux_layer_13": 0.074462890625, "loss_aux_layer_14": 0.082763671875, "loss_aux_layer_15": 0.0904541015625, "loss_aux_layer_16": 0.099365234375, "loss_aux_layer_17": 0.1075439453125, "loss_aux_layer_18": 0.1156005859375, "loss_aux_layer_19": 0.118896484375, "loss_aux_layer_2": 0.04620361328125, "loss_aux_layer_20": 0.12646484375, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.056396484375, "loss_aux_layer_4": 0.05889892578125, "loss_aux_layer_5": 0.0604248046875, "loss_aux_layer_6": 0.06329345703125, "loss_aux_layer_7": 0.0615234375, "loss_aux_layer_8": 0.06085205078125, "loss_aux_layer_9": 0.0594482421875, "step": 3930, "total_loss": 0.703846737742424 }, { "epoch": 0.7782617303504257, "grad_norm": 0.9558725357055664, "learning_rate": 5e-05, "llm_loss": 0.5833447128534317, "loss": 2.6591, "loss_aux_layer_0": 0.0130462646484375, "loss_aux_layer_1": 0.03155517578125, "loss_aux_layer_10": 0.0584716796875, "loss_aux_layer_11": 0.062255859375, "loss_aux_layer_12": 0.066650390625, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.08056640625, "loss_aux_layer_15": 0.088623046875, "loss_aux_layer_16": 0.09765625, "loss_aux_layer_17": 0.1053466796875, "loss_aux_layer_18": 0.114013671875, "loss_aux_layer_19": 0.1173095703125, "loss_aux_layer_2": 0.0438232421875, "loss_aux_layer_20": 0.1253662109375, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.0535888671875, "loss_aux_layer_4": 0.05615234375, "loss_aux_layer_5": 0.05780029296875, "loss_aux_layer_6": 0.0609130859375, "loss_aux_layer_7": 0.05914306640625, "loss_aux_layer_8": 0.05853271484375, "loss_aux_layer_9": 0.0572509765625, "step": 3931, "total_loss": 0.6647858768701553 }, { "epoch": 0.7784597109483271, "grad_norm": 0.9330068230628967, "learning_rate": 5e-05, "llm_loss": 0.6302255392074585, "loss": 2.8401, "loss_aux_layer_0": 0.013946533203125, "loss_aux_layer_1": 0.03265380859375, "loss_aux_layer_10": 0.05804443359375, "loss_aux_layer_11": 0.06170654296875, "loss_aux_layer_12": 0.06591796875, "loss_aux_layer_13": 0.0709228515625, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.0863037109375, "loss_aux_layer_16": 0.0946044921875, "loss_aux_layer_17": 0.102294921875, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.113037109375, "loss_aux_layer_2": 0.044921875, "loss_aux_layer_20": 0.1209716796875, "loss_aux_layer_21": 0.1280517578125, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.181640625, "loss_aux_layer_3": 0.05450439453125, "loss_aux_layer_4": 0.0567626953125, "loss_aux_layer_5": 0.05792236328125, "loss_aux_layer_6": 0.06060791015625, "loss_aux_layer_7": 0.05889892578125, "loss_aux_layer_8": 0.058349609375, "loss_aux_layer_9": 0.05682373046875, "step": 3932, "total_loss": 0.710025429725647 }, { "epoch": 0.7786576915462284, "grad_norm": 0.909163773059845, "learning_rate": 5e-05, "llm_loss": 0.5787715315818787, "loss": 2.6408, "loss_aux_layer_0": 0.0127410888671875, "loss_aux_layer_1": 0.032440185546875, "loss_aux_layer_10": 0.0596923828125, "loss_aux_layer_11": 0.06378173828125, "loss_aux_layer_12": 0.0679931640625, "loss_aux_layer_13": 0.0731201171875, "loss_aux_layer_14": 0.0810546875, "loss_aux_layer_15": 0.089111328125, "loss_aux_layer_16": 0.09814453125, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.115966796875, "loss_aux_layer_2": 0.04486083984375, "loss_aux_layer_20": 0.1226806640625, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.054443359375, "loss_aux_layer_4": 0.05682373046875, "loss_aux_layer_5": 0.058349609375, "loss_aux_layer_6": 0.0614013671875, "loss_aux_layer_7": 0.0599365234375, "loss_aux_layer_8": 0.059326171875, "loss_aux_layer_9": 0.058349609375, "step": 3933, "total_loss": 0.6601879745721817 }, { "epoch": 0.7788556721441299, "grad_norm": 0.8168757557868958, "learning_rate": 5e-05, "llm_loss": 0.5410170555114746, "loss": 2.4875, "loss_aux_layer_0": 0.0131072998046875, "loss_aux_layer_1": 0.031402587890625, "loss_aux_layer_10": 0.05743408203125, "loss_aux_layer_11": 0.0615234375, "loss_aux_layer_12": 0.06591796875, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.07958984375, "loss_aux_layer_15": 0.0875244140625, "loss_aux_layer_16": 0.0965576171875, "loss_aux_layer_17": 0.1044921875, "loss_aux_layer_18": 0.113037109375, "loss_aux_layer_19": 0.1162109375, "loss_aux_layer_2": 0.0428466796875, "loss_aux_layer_20": 0.124267578125, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.05255126953125, "loss_aux_layer_4": 0.05487060546875, "loss_aux_layer_5": 0.05657958984375, "loss_aux_layer_6": 0.0592041015625, "loss_aux_layer_7": 0.0577392578125, "loss_aux_layer_8": 0.05718994140625, "loss_aux_layer_9": 0.055908203125, "step": 3934, "total_loss": 0.6218787282705307 }, { "epoch": 0.7790536527420313, "grad_norm": 0.8517315983772278, "learning_rate": 5e-05, "llm_loss": 0.5233109593391418, "loss": 2.4149, "loss_aux_layer_0": 0.0133209228515625, "loss_aux_layer_1": 0.03125, "loss_aux_layer_10": 0.0574951171875, "loss_aux_layer_11": 0.06158447265625, "loss_aux_layer_12": 0.06591796875, "loss_aux_layer_13": 0.071533203125, "loss_aux_layer_14": 0.0791015625, "loss_aux_layer_15": 0.086669921875, "loss_aux_layer_16": 0.095947265625, "loss_aux_layer_17": 0.1033935546875, "loss_aux_layer_18": 0.111572265625, "loss_aux_layer_19": 0.11474609375, "loss_aux_layer_2": 0.04351806640625, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.190673828125, "loss_aux_layer_3": 0.052978515625, "loss_aux_layer_4": 0.05517578125, "loss_aux_layer_5": 0.056640625, "loss_aux_layer_6": 0.0594482421875, "loss_aux_layer_7": 0.0579833984375, "loss_aux_layer_8": 0.05743408203125, "loss_aux_layer_9": 0.0562744140625, "step": 3935, "total_loss": 0.6037357449531555 }, { "epoch": 0.7792516333399326, "grad_norm": 0.7658079266548157, "learning_rate": 5e-05, "llm_loss": 0.5939326882362366, "loss": 2.6952, "loss_aux_layer_0": 0.0127716064453125, "loss_aux_layer_1": 0.030181884765625, "loss_aux_layer_10": 0.05712890625, "loss_aux_layer_11": 0.06097412109375, "loss_aux_layer_12": 0.0650634765625, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.0869140625, "loss_aux_layer_16": 0.096435546875, "loss_aux_layer_17": 0.104248046875, "loss_aux_layer_18": 0.1123046875, "loss_aux_layer_19": 0.1156005859375, "loss_aux_layer_2": 0.0418701171875, "loss_aux_layer_20": 0.1234130859375, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.051513671875, "loss_aux_layer_4": 0.05413818359375, "loss_aux_layer_5": 0.05584716796875, "loss_aux_layer_6": 0.058837890625, "loss_aux_layer_7": 0.0572509765625, "loss_aux_layer_8": 0.0567626953125, "loss_aux_layer_9": 0.05584716796875, "step": 3936, "total_loss": 0.6738110333681107 }, { "epoch": 0.7794496139378341, "grad_norm": 0.8725758194923401, "learning_rate": 5e-05, "llm_loss": 0.6518042832612991, "loss": 2.9358, "loss_aux_layer_0": 0.0128021240234375, "loss_aux_layer_1": 0.03387451171875, "loss_aux_layer_10": 0.0604248046875, "loss_aux_layer_11": 0.064453125, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.07421875, "loss_aux_layer_14": 0.08203125, "loss_aux_layer_15": 0.08984375, "loss_aux_layer_16": 0.0982666015625, "loss_aux_layer_17": 0.10546875, "loss_aux_layer_18": 0.112548828125, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.04681396484375, "loss_aux_layer_20": 0.121826171875, "loss_aux_layer_21": 0.1295166015625, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.05657958984375, "loss_aux_layer_4": 0.05914306640625, "loss_aux_layer_5": 0.06048583984375, "loss_aux_layer_6": 0.06365966796875, "loss_aux_layer_7": 0.06170654296875, "loss_aux_layer_8": 0.06085205078125, "loss_aux_layer_9": 0.05938720703125, "step": 3937, "total_loss": 0.7339465171098709 }, { "epoch": 0.7796475945357355, "grad_norm": 0.8106706142425537, "learning_rate": 5e-05, "llm_loss": 0.5495206415653229, "loss": 2.5216, "loss_aux_layer_0": 0.0124359130859375, "loss_aux_layer_1": 0.031585693359375, "loss_aux_layer_10": 0.05743408203125, "loss_aux_layer_11": 0.0611572265625, "loss_aux_layer_12": 0.0655517578125, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.079345703125, "loss_aux_layer_15": 0.088134765625, "loss_aux_layer_16": 0.0975341796875, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.11376953125, "loss_aux_layer_19": 0.116943359375, "loss_aux_layer_2": 0.043701171875, "loss_aux_layer_20": 0.1251220703125, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.052978515625, "loss_aux_layer_4": 0.05517578125, "loss_aux_layer_5": 0.05670166015625, "loss_aux_layer_6": 0.05975341796875, "loss_aux_layer_7": 0.05780029296875, "loss_aux_layer_8": 0.0572509765625, "loss_aux_layer_9": 0.05621337890625, "step": 3938, "total_loss": 0.630410298705101 }, { "epoch": 0.779845575133637, "grad_norm": 0.812048077583313, "learning_rate": 5e-05, "llm_loss": 0.5714885368943214, "loss": 2.6187, "loss_aux_layer_0": 0.0122833251953125, "loss_aux_layer_1": 0.031341552734375, "loss_aux_layer_10": 0.0596923828125, "loss_aux_layer_11": 0.0635986328125, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.0743408203125, "loss_aux_layer_14": 0.0823974609375, "loss_aux_layer_15": 0.09033203125, "loss_aux_layer_16": 0.099853515625, "loss_aux_layer_17": 0.1072998046875, "loss_aux_layer_18": 0.1156005859375, "loss_aux_layer_19": 0.1195068359375, "loss_aux_layer_2": 0.04437255859375, "loss_aux_layer_20": 0.1273193359375, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.0543212890625, "loss_aux_layer_4": 0.05682373046875, "loss_aux_layer_5": 0.05853271484375, "loss_aux_layer_6": 0.0615234375, "loss_aux_layer_7": 0.06011962890625, "loss_aux_layer_8": 0.0595703125, "loss_aux_layer_9": 0.05841064453125, "step": 3939, "total_loss": 0.6546752452850342 }, { "epoch": 0.7800435557315383, "grad_norm": 0.8705998659133911, "learning_rate": 5e-05, "llm_loss": 0.588936984539032, "loss": 2.675, "loss_aux_layer_0": 0.011932373046875, "loss_aux_layer_1": 0.030426025390625, "loss_aux_layer_10": 0.05718994140625, "loss_aux_layer_11": 0.06103515625, "loss_aux_layer_12": 0.06561279296875, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.07958984375, "loss_aux_layer_15": 0.0875244140625, "loss_aux_layer_16": 0.0968017578125, "loss_aux_layer_17": 0.10498046875, "loss_aux_layer_18": 0.11279296875, "loss_aux_layer_19": 0.115966796875, "loss_aux_layer_2": 0.04241943359375, "loss_aux_layer_20": 0.1234130859375, "loss_aux_layer_21": 0.1307373046875, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.05157470703125, "loss_aux_layer_4": 0.05401611328125, "loss_aux_layer_5": 0.05560302734375, "loss_aux_layer_6": 0.0587158203125, "loss_aux_layer_7": 0.05718994140625, "loss_aux_layer_8": 0.056640625, "loss_aux_layer_9": 0.05572509765625, "step": 3940, "total_loss": 0.6687535047531128 }, { "epoch": 0.7802415363294397, "grad_norm": 1.1615686416625977, "learning_rate": 5e-05, "llm_loss": 0.5432730689644814, "loss": 2.4966, "loss_aux_layer_0": 0.0121002197265625, "loss_aux_layer_1": 0.0308837890625, "loss_aux_layer_10": 0.05718994140625, "loss_aux_layer_11": 0.06109619140625, "loss_aux_layer_12": 0.065673828125, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.080078125, "loss_aux_layer_15": 0.0888671875, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.10546875, "loss_aux_layer_18": 0.1134033203125, "loss_aux_layer_19": 0.116943359375, "loss_aux_layer_2": 0.04296875, "loss_aux_layer_20": 0.124755859375, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.191162109375, "loss_aux_layer_3": 0.052490234375, "loss_aux_layer_4": 0.05487060546875, "loss_aux_layer_5": 0.0565185546875, "loss_aux_layer_6": 0.0594482421875, "loss_aux_layer_7": 0.0574951171875, "loss_aux_layer_8": 0.056884765625, "loss_aux_layer_9": 0.05584716796875, "step": 3941, "total_loss": 0.6241471171379089 }, { "epoch": 0.7804395169273411, "grad_norm": 1.0286202430725098, "learning_rate": 5e-05, "llm_loss": 0.5845391973853111, "loss": 2.6714, "loss_aux_layer_0": 0.013519287109375, "loss_aux_layer_1": 0.0322265625, "loss_aux_layer_10": 0.06036376953125, "loss_aux_layer_11": 0.0648193359375, "loss_aux_layer_12": 0.0689697265625, "loss_aux_layer_13": 0.074462890625, "loss_aux_layer_14": 0.0828857421875, "loss_aux_layer_15": 0.0911865234375, "loss_aux_layer_16": 0.10009765625, "loss_aux_layer_17": 0.107666015625, "loss_aux_layer_18": 0.1156005859375, "loss_aux_layer_19": 0.1190185546875, "loss_aux_layer_2": 0.0447998046875, "loss_aux_layer_20": 0.126708984375, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.0550537109375, "loss_aux_layer_4": 0.05792236328125, "loss_aux_layer_5": 0.05963134765625, "loss_aux_layer_6": 0.06256103515625, "loss_aux_layer_7": 0.06072998046875, "loss_aux_layer_8": 0.0601806640625, "loss_aux_layer_9": 0.0589599609375, "step": 3942, "total_loss": 0.6678476780653 }, { "epoch": 0.7806374975252425, "grad_norm": 0.8429845571517944, "learning_rate": 5e-05, "llm_loss": 0.4979032278060913, "loss": 2.3164, "loss_aux_layer_0": 0.0122528076171875, "loss_aux_layer_1": 0.031097412109375, "loss_aux_layer_10": 0.05718994140625, "loss_aux_layer_11": 0.06121826171875, "loss_aux_layer_12": 0.06573486328125, "loss_aux_layer_13": 0.07110595703125, "loss_aux_layer_14": 0.07958984375, "loss_aux_layer_15": 0.088134765625, "loss_aux_layer_16": 0.09765625, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.1138916015625, "loss_aux_layer_19": 0.117919921875, "loss_aux_layer_2": 0.0438232421875, "loss_aux_layer_20": 0.1259765625, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.0531005859375, "loss_aux_layer_4": 0.05523681640625, "loss_aux_layer_5": 0.056640625, "loss_aux_layer_6": 0.05926513671875, "loss_aux_layer_7": 0.05755615234375, "loss_aux_layer_8": 0.05706787109375, "loss_aux_layer_9": 0.0560302734375, "step": 3943, "total_loss": 0.5790896192193031 }, { "epoch": 0.7808354781231439, "grad_norm": 0.9549359679222107, "learning_rate": 5e-05, "llm_loss": 0.580191545188427, "loss": 2.6469, "loss_aux_layer_0": 0.01318359375, "loss_aux_layer_1": 0.031158447265625, "loss_aux_layer_10": 0.05792236328125, "loss_aux_layer_11": 0.0616455078125, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.072021484375, "loss_aux_layer_14": 0.08056640625, "loss_aux_layer_15": 0.0889892578125, "loss_aux_layer_16": 0.0982666015625, "loss_aux_layer_17": 0.1060791015625, "loss_aux_layer_18": 0.1143798828125, "loss_aux_layer_19": 0.1175537109375, "loss_aux_layer_2": 0.0433349609375, "loss_aux_layer_20": 0.125732421875, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.05267333984375, "loss_aux_layer_4": 0.05511474609375, "loss_aux_layer_5": 0.056884765625, "loss_aux_layer_6": 0.05987548828125, "loss_aux_layer_7": 0.05841064453125, "loss_aux_layer_8": 0.057861328125, "loss_aux_layer_9": 0.056640625, "step": 3944, "total_loss": 0.661728173494339 }, { "epoch": 0.7810334587210453, "grad_norm": 1.120489239692688, "learning_rate": 5e-05, "llm_loss": 0.5834019035100937, "loss": 2.6584, "loss_aux_layer_0": 0.0128936767578125, "loss_aux_layer_1": 0.031585693359375, "loss_aux_layer_10": 0.05841064453125, "loss_aux_layer_11": 0.0623779296875, "loss_aux_layer_12": 0.0667724609375, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.0889892578125, "loss_aux_layer_16": 0.09814453125, "loss_aux_layer_17": 0.1060791015625, "loss_aux_layer_18": 0.114013671875, "loss_aux_layer_19": 0.11669921875, "loss_aux_layer_2": 0.04443359375, "loss_aux_layer_20": 0.1246337890625, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.05401611328125, "loss_aux_layer_4": 0.056396484375, "loss_aux_layer_5": 0.05804443359375, "loss_aux_layer_6": 0.06085205078125, "loss_aux_layer_7": 0.0589599609375, "loss_aux_layer_8": 0.05828857421875, "loss_aux_layer_9": 0.0572509765625, "step": 3945, "total_loss": 0.664602518081665 }, { "epoch": 0.7812314393189468, "grad_norm": 0.7936223149299622, "learning_rate": 5e-05, "llm_loss": 0.5103929340839386, "loss": 2.364, "loss_aux_layer_0": 0.01226806640625, "loss_aux_layer_1": 0.030914306640625, "loss_aux_layer_10": 0.05682373046875, "loss_aux_layer_11": 0.06085205078125, "loss_aux_layer_12": 0.0650634765625, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.0791015625, "loss_aux_layer_15": 0.087646484375, "loss_aux_layer_16": 0.096923828125, "loss_aux_layer_17": 0.10498046875, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.117431640625, "loss_aux_layer_2": 0.04248046875, "loss_aux_layer_20": 0.12548828125, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.191162109375, "loss_aux_layer_3": 0.052001953125, "loss_aux_layer_4": 0.05438232421875, "loss_aux_layer_5": 0.05621337890625, "loss_aux_layer_6": 0.0589599609375, "loss_aux_layer_7": 0.0572509765625, "loss_aux_layer_8": 0.056640625, "loss_aux_layer_9": 0.05560302734375, "step": 3946, "total_loss": 0.5909913331270218 }, { "epoch": 0.7814294199168481, "grad_norm": 0.8535479307174683, "learning_rate": 5e-05, "llm_loss": 0.5832353234291077, "loss": 2.6658, "loss_aux_layer_0": 0.0126800537109375, "loss_aux_layer_1": 0.032684326171875, "loss_aux_layer_10": 0.06024169921875, "loss_aux_layer_11": 0.06439208984375, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.0743408203125, "loss_aux_layer_14": 0.0823974609375, "loss_aux_layer_15": 0.090576171875, "loss_aux_layer_16": 0.10009765625, "loss_aux_layer_17": 0.10791015625, "loss_aux_layer_18": 0.1163330078125, "loss_aux_layer_19": 0.1197509765625, "loss_aux_layer_2": 0.045166015625, "loss_aux_layer_20": 0.127685546875, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.05511474609375, "loss_aux_layer_4": 0.0576171875, "loss_aux_layer_5": 0.05914306640625, "loss_aux_layer_6": 0.06219482421875, "loss_aux_layer_7": 0.0606689453125, "loss_aux_layer_8": 0.05999755859375, "loss_aux_layer_9": 0.058837890625, "step": 3947, "total_loss": 0.6664394587278366 }, { "epoch": 0.7816274005147495, "grad_norm": 1.0872920751571655, "learning_rate": 5e-05, "llm_loss": 0.50612573325634, "loss": 2.3474, "loss_aux_layer_0": 0.0113677978515625, "loss_aux_layer_1": 0.031005859375, "loss_aux_layer_10": 0.05877685546875, "loss_aux_layer_11": 0.06304931640625, "loss_aux_layer_12": 0.0673828125, "loss_aux_layer_13": 0.07275390625, "loss_aux_layer_14": 0.08056640625, "loss_aux_layer_15": 0.088134765625, "loss_aux_layer_16": 0.09716796875, "loss_aux_layer_17": 0.1048583984375, "loss_aux_layer_18": 0.112548828125, "loss_aux_layer_19": 0.11572265625, "loss_aux_layer_2": 0.04315185546875, "loss_aux_layer_20": 0.1234130859375, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.05340576171875, "loss_aux_layer_4": 0.0562744140625, "loss_aux_layer_5": 0.05767822265625, "loss_aux_layer_6": 0.06085205078125, "loss_aux_layer_7": 0.0592041015625, "loss_aux_layer_8": 0.05859375, "loss_aux_layer_9": 0.057373046875, "step": 3948, "total_loss": 0.5868596136569977 }, { "epoch": 0.781825381112651, "grad_norm": 0.8681173920631409, "learning_rate": 5e-05, "llm_loss": 0.5874369293451309, "loss": 2.6783, "loss_aux_layer_0": 0.011871337890625, "loss_aux_layer_1": 0.03204345703125, "loss_aux_layer_10": 0.059326171875, "loss_aux_layer_11": 0.063232421875, "loss_aux_layer_12": 0.06787109375, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.08154296875, "loss_aux_layer_15": 0.08984375, "loss_aux_layer_16": 0.0987548828125, "loss_aux_layer_17": 0.10693359375, "loss_aux_layer_18": 0.1148681640625, "loss_aux_layer_19": 0.1180419921875, "loss_aux_layer_2": 0.04425048828125, "loss_aux_layer_20": 0.1253662109375, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.05419921875, "loss_aux_layer_4": 0.05670166015625, "loss_aux_layer_5": 0.058349609375, "loss_aux_layer_6": 0.06121826171875, "loss_aux_layer_7": 0.05963134765625, "loss_aux_layer_8": 0.05914306640625, "loss_aux_layer_9": 0.05792236328125, "step": 3949, "total_loss": 0.6695797741413116 }, { "epoch": 0.7820233617105523, "grad_norm": 0.9704567790031433, "learning_rate": 5e-05, "llm_loss": 0.527050755918026, "loss": 2.4322, "loss_aux_layer_0": 0.012176513671875, "loss_aux_layer_1": 0.03228759765625, "loss_aux_layer_10": 0.05810546875, "loss_aux_layer_11": 0.06207275390625, "loss_aux_layer_12": 0.06610107421875, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.0792236328125, "loss_aux_layer_15": 0.087158203125, "loss_aux_layer_16": 0.095458984375, "loss_aux_layer_17": 0.10302734375, "loss_aux_layer_18": 0.111083984375, "loss_aux_layer_19": 0.114990234375, "loss_aux_layer_2": 0.04425048828125, "loss_aux_layer_20": 0.12353515625, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.05426025390625, "loss_aux_layer_4": 0.0565185546875, "loss_aux_layer_5": 0.05804443359375, "loss_aux_layer_6": 0.0606689453125, "loss_aux_layer_7": 0.05908203125, "loss_aux_layer_8": 0.05853271484375, "loss_aux_layer_9": 0.05712890625, "step": 3950, "total_loss": 0.608052209019661 }, { "epoch": 0.7822213423084537, "grad_norm": 0.7445245981216431, "learning_rate": 5e-05, "llm_loss": 0.5271569639444351, "loss": 2.4393, "loss_aux_layer_0": 0.0119171142578125, "loss_aux_layer_1": 0.03228759765625, "loss_aux_layer_10": 0.0611572265625, "loss_aux_layer_11": 0.0654296875, "loss_aux_layer_12": 0.06982421875, "loss_aux_layer_13": 0.0755615234375, "loss_aux_layer_14": 0.083251953125, "loss_aux_layer_15": 0.0908203125, "loss_aux_layer_16": 0.09912109375, "loss_aux_layer_17": 0.10595703125, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.1156005859375, "loss_aux_layer_2": 0.04559326171875, "loss_aux_layer_20": 0.122802734375, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.05560302734375, "loss_aux_layer_4": 0.05841064453125, "loss_aux_layer_5": 0.05999755859375, "loss_aux_layer_6": 0.063232421875, "loss_aux_layer_7": 0.06146240234375, "loss_aux_layer_8": 0.06072998046875, "loss_aux_layer_9": 0.0599365234375, "step": 3951, "total_loss": 0.6098165214061737 }, { "epoch": 0.7824193229063552, "grad_norm": 1.1860686540603638, "learning_rate": 5e-05, "llm_loss": 0.6158047765493393, "loss": 2.7902, "loss_aux_layer_0": 0.0119476318359375, "loss_aux_layer_1": 0.032196044921875, "loss_aux_layer_10": 0.059326171875, "loss_aux_layer_11": 0.0634765625, "loss_aux_layer_12": 0.068115234375, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.0811767578125, "loss_aux_layer_15": 0.089111328125, "loss_aux_layer_16": 0.0975341796875, "loss_aux_layer_17": 0.1048583984375, "loss_aux_layer_18": 0.112548828125, "loss_aux_layer_19": 0.1153564453125, "loss_aux_layer_2": 0.04534912109375, "loss_aux_layer_20": 0.122802734375, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.05535888671875, "loss_aux_layer_4": 0.05792236328125, "loss_aux_layer_5": 0.0594482421875, "loss_aux_layer_6": 0.06256103515625, "loss_aux_layer_7": 0.06036376953125, "loss_aux_layer_8": 0.059326171875, "loss_aux_layer_9": 0.05816650390625, "step": 3952, "total_loss": 0.6975422352552414 }, { "epoch": 0.7826173035042566, "grad_norm": 0.9575477242469788, "learning_rate": 5e-05, "llm_loss": 0.5958177000284195, "loss": 2.6905, "loss_aux_layer_0": 0.0132293701171875, "loss_aux_layer_1": 0.02978515625, "loss_aux_layer_10": 0.0540771484375, "loss_aux_layer_11": 0.057861328125, "loss_aux_layer_12": 0.06195068359375, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.092041015625, "loss_aux_layer_17": 0.0997314453125, "loss_aux_layer_18": 0.107666015625, "loss_aux_layer_19": 0.1114501953125, "loss_aux_layer_2": 0.04058837890625, "loss_aux_layer_20": 0.119873046875, "loss_aux_layer_21": 0.127685546875, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.049560546875, "loss_aux_layer_4": 0.05181884765625, "loss_aux_layer_5": 0.05322265625, "loss_aux_layer_6": 0.055908203125, "loss_aux_layer_7": 0.05426025390625, "loss_aux_layer_8": 0.0540771484375, "loss_aux_layer_9": 0.052734375, "step": 3953, "total_loss": 0.6726211458444595 }, { "epoch": 0.782815284102158, "grad_norm": 1.0598355531692505, "learning_rate": 5e-05, "llm_loss": 0.5598907247185707, "loss": 2.5635, "loss_aux_layer_0": 0.0121307373046875, "loss_aux_layer_1": 0.03167724609375, "loss_aux_layer_10": 0.058349609375, "loss_aux_layer_11": 0.06195068359375, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.072021484375, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.088623046875, "loss_aux_layer_16": 0.0975341796875, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.1131591796875, "loss_aux_layer_19": 0.1162109375, "loss_aux_layer_2": 0.04425048828125, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.0538330078125, "loss_aux_layer_4": 0.05633544921875, "loss_aux_layer_5": 0.05780029296875, "loss_aux_layer_6": 0.06085205078125, "loss_aux_layer_7": 0.0589599609375, "loss_aux_layer_8": 0.05841064453125, "loss_aux_layer_9": 0.05706787109375, "step": 3954, "total_loss": 0.6408728510141373 }, { "epoch": 0.7830132647000594, "grad_norm": 0.8771134614944458, "learning_rate": 5e-05, "llm_loss": 0.59205661714077, "loss": 2.6834, "loss_aux_layer_0": 0.01287841796875, "loss_aux_layer_1": 0.02996826171875, "loss_aux_layer_10": 0.05572509765625, "loss_aux_layer_11": 0.0594482421875, "loss_aux_layer_12": 0.06402587890625, "loss_aux_layer_13": 0.0697021484375, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.0958251953125, "loss_aux_layer_17": 0.1036376953125, "loss_aux_layer_18": 0.1119384765625, "loss_aux_layer_19": 0.1143798828125, "loss_aux_layer_2": 0.04168701171875, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.05072021484375, "loss_aux_layer_4": 0.0531005859375, "loss_aux_layer_5": 0.054931640625, "loss_aux_layer_6": 0.05810546875, "loss_aux_layer_7": 0.056396484375, "loss_aux_layer_8": 0.05560302734375, "loss_aux_layer_9": 0.05438232421875, "step": 3955, "total_loss": 0.6708459109067917 }, { "epoch": 0.7832112452979608, "grad_norm": 1.0825260877609253, "learning_rate": 5e-05, "llm_loss": 0.6180071234703064, "loss": 2.7971, "loss_aux_layer_0": 0.01409912109375, "loss_aux_layer_1": 0.032562255859375, "loss_aux_layer_10": 0.05828857421875, "loss_aux_layer_11": 0.06231689453125, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.071533203125, "loss_aux_layer_14": 0.0791015625, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.095947265625, "loss_aux_layer_17": 0.1041259765625, "loss_aux_layer_18": 0.1123046875, "loss_aux_layer_19": 0.115966796875, "loss_aux_layer_2": 0.044921875, "loss_aux_layer_20": 0.1241455078125, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.0548095703125, "loss_aux_layer_4": 0.05712890625, "loss_aux_layer_5": 0.0587158203125, "loss_aux_layer_6": 0.0611572265625, "loss_aux_layer_7": 0.0595703125, "loss_aux_layer_8": 0.058837890625, "loss_aux_layer_9": 0.05712890625, "step": 3956, "total_loss": 0.6992842257022858 }, { "epoch": 0.7834092258958623, "grad_norm": 0.85319983959198, "learning_rate": 5e-05, "llm_loss": 0.5425549000501633, "loss": 2.4993, "loss_aux_layer_0": 0.012664794921875, "loss_aux_layer_1": 0.03271484375, "loss_aux_layer_10": 0.05963134765625, "loss_aux_layer_11": 0.06378173828125, "loss_aux_layer_12": 0.06787109375, "loss_aux_layer_13": 0.0732421875, "loss_aux_layer_14": 0.081298828125, "loss_aux_layer_15": 0.08935546875, "loss_aux_layer_16": 0.098388671875, "loss_aux_layer_17": 0.10595703125, "loss_aux_layer_18": 0.11376953125, "loss_aux_layer_19": 0.1171875, "loss_aux_layer_2": 0.04541015625, "loss_aux_layer_20": 0.1243896484375, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.05523681640625, "loss_aux_layer_4": 0.05755615234375, "loss_aux_layer_5": 0.0592041015625, "loss_aux_layer_6": 0.06195068359375, "loss_aux_layer_7": 0.060302734375, "loss_aux_layer_8": 0.05950927734375, "loss_aux_layer_9": 0.05810546875, "step": 3957, "total_loss": 0.6248141527175903 }, { "epoch": 0.7836072064937636, "grad_norm": 0.9120697379112244, "learning_rate": 5e-05, "llm_loss": 0.569778174161911, "loss": 2.6025, "loss_aux_layer_0": 0.0136566162109375, "loss_aux_layer_1": 0.031341552734375, "loss_aux_layer_10": 0.05804443359375, "loss_aux_layer_11": 0.06201171875, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.079833984375, "loss_aux_layer_15": 0.087890625, "loss_aux_layer_16": 0.0975341796875, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.11279296875, "loss_aux_layer_19": 0.1163330078125, "loss_aux_layer_2": 0.0433349609375, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.05303955078125, "loss_aux_layer_4": 0.0555419921875, "loss_aux_layer_5": 0.05731201171875, "loss_aux_layer_6": 0.05999755859375, "loss_aux_layer_7": 0.05828857421875, "loss_aux_layer_8": 0.05780029296875, "loss_aux_layer_9": 0.05670166015625, "step": 3958, "total_loss": 0.6506174057722092 }, { "epoch": 0.783805187091665, "grad_norm": 1.0676671266555786, "learning_rate": 5e-05, "llm_loss": 0.5686637312173843, "loss": 2.6087, "loss_aux_layer_0": 0.0124664306640625, "loss_aux_layer_1": 0.03179931640625, "loss_aux_layer_10": 0.06011962890625, "loss_aux_layer_11": 0.0640869140625, "loss_aux_layer_12": 0.068603515625, "loss_aux_layer_13": 0.073974609375, "loss_aux_layer_14": 0.0826416015625, "loss_aux_layer_15": 0.0909423828125, "loss_aux_layer_16": 0.0999755859375, "loss_aux_layer_17": 0.1077880859375, "loss_aux_layer_18": 0.116455078125, "loss_aux_layer_19": 0.119873046875, "loss_aux_layer_2": 0.044921875, "loss_aux_layer_20": 0.1275634765625, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.05487060546875, "loss_aux_layer_4": 0.057373046875, "loss_aux_layer_5": 0.0589599609375, "loss_aux_layer_6": 0.06170654296875, "loss_aux_layer_7": 0.06036376953125, "loss_aux_layer_8": 0.0596923828125, "loss_aux_layer_9": 0.0587158203125, "step": 3959, "total_loss": 0.6521849408745766 }, { "epoch": 0.7840031676895665, "grad_norm": 1.0020242929458618, "learning_rate": 5e-05, "llm_loss": 0.6047097742557526, "loss": 2.7421, "loss_aux_layer_0": 0.01287841796875, "loss_aux_layer_1": 0.031768798828125, "loss_aux_layer_10": 0.05810546875, "loss_aux_layer_11": 0.062255859375, "loss_aux_layer_12": 0.0665283203125, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.0797119140625, "loss_aux_layer_15": 0.0880126953125, "loss_aux_layer_16": 0.096923828125, "loss_aux_layer_17": 0.104736328125, "loss_aux_layer_18": 0.1124267578125, "loss_aux_layer_19": 0.1153564453125, "loss_aux_layer_2": 0.04437255859375, "loss_aux_layer_20": 0.1229248046875, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.053955078125, "loss_aux_layer_4": 0.05615234375, "loss_aux_layer_5": 0.0574951171875, "loss_aux_layer_6": 0.060546875, "loss_aux_layer_7": 0.05853271484375, "loss_aux_layer_8": 0.0579833984375, "loss_aux_layer_9": 0.0567626953125, "step": 3960, "total_loss": 0.6855242252349854 }, { "epoch": 0.7842011482874678, "grad_norm": 0.9068772792816162, "learning_rate": 5e-05, "llm_loss": 0.49570170044898987, "loss": 2.3015, "loss_aux_layer_0": 0.0133209228515625, "loss_aux_layer_1": 0.03057861328125, "loss_aux_layer_10": 0.056640625, "loss_aux_layer_11": 0.060791015625, "loss_aux_layer_12": 0.0654296875, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.096435546875, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.11181640625, "loss_aux_layer_19": 0.115234375, "loss_aux_layer_2": 0.0426025390625, "loss_aux_layer_20": 0.122802734375, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.052001953125, "loss_aux_layer_4": 0.05413818359375, "loss_aux_layer_5": 0.05560302734375, "loss_aux_layer_6": 0.05841064453125, "loss_aux_layer_7": 0.05670166015625, "loss_aux_layer_8": 0.0562744140625, "loss_aux_layer_9": 0.05535888671875, "step": 3961, "total_loss": 0.5753794461488724 }, { "epoch": 0.7843991288853692, "grad_norm": 0.9478493332862854, "learning_rate": 5e-05, "llm_loss": 0.5795424282550812, "loss": 2.6355, "loss_aux_layer_0": 0.0121612548828125, "loss_aux_layer_1": 0.0303955078125, "loss_aux_layer_10": 0.05621337890625, "loss_aux_layer_11": 0.06011962890625, "loss_aux_layer_12": 0.06427001953125, "loss_aux_layer_13": 0.06982421875, "loss_aux_layer_14": 0.078369140625, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.095703125, "loss_aux_layer_17": 0.1036376953125, "loss_aux_layer_18": 0.1112060546875, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.04266357421875, "loss_aux_layer_20": 0.122314453125, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.052001953125, "loss_aux_layer_4": 0.0543212890625, "loss_aux_layer_5": 0.05584716796875, "loss_aux_layer_6": 0.0587158203125, "loss_aux_layer_7": 0.05682373046875, "loss_aux_layer_8": 0.05615234375, "loss_aux_layer_9": 0.05517578125, "step": 3962, "total_loss": 0.6588866859674454 }, { "epoch": 0.7845971094832707, "grad_norm": 0.9652919173240662, "learning_rate": 5e-05, "llm_loss": 0.5580991059541702, "loss": 2.5526, "loss_aux_layer_0": 0.011993408203125, "loss_aux_layer_1": 0.030059814453125, "loss_aux_layer_10": 0.05712890625, "loss_aux_layer_11": 0.06134033203125, "loss_aux_layer_12": 0.0654296875, "loss_aux_layer_13": 0.0704345703125, "loss_aux_layer_14": 0.07861328125, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.0968017578125, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.1129150390625, "loss_aux_layer_19": 0.1165771484375, "loss_aux_layer_2": 0.0416259765625, "loss_aux_layer_20": 0.1248779296875, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.05108642578125, "loss_aux_layer_4": 0.05364990234375, "loss_aux_layer_5": 0.055419921875, "loss_aux_layer_6": 0.05853271484375, "loss_aux_layer_7": 0.05694580078125, "loss_aux_layer_8": 0.056396484375, "loss_aux_layer_9": 0.0555419921875, "step": 3963, "total_loss": 0.6381561905145645 }, { "epoch": 0.7847950900811721, "grad_norm": 1.0452830791473389, "learning_rate": 5e-05, "llm_loss": 0.5342250019311905, "loss": 2.457, "loss_aux_layer_0": 0.0118560791015625, "loss_aux_layer_1": 0.029876708984375, "loss_aux_layer_10": 0.056396484375, "loss_aux_layer_11": 0.060302734375, "loss_aux_layer_12": 0.06500244140625, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.0794677734375, "loss_aux_layer_15": 0.087890625, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.1134033203125, "loss_aux_layer_19": 0.11669921875, "loss_aux_layer_2": 0.0416259765625, "loss_aux_layer_20": 0.12451171875, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.0511474609375, "loss_aux_layer_4": 0.0535888671875, "loss_aux_layer_5": 0.05511474609375, "loss_aux_layer_6": 0.05804443359375, "loss_aux_layer_7": 0.056396484375, "loss_aux_layer_8": 0.0557861328125, "loss_aux_layer_9": 0.05487060546875, "step": 3964, "total_loss": 0.6142386645078659 }, { "epoch": 0.7849930706790734, "grad_norm": 1.026164174079895, "learning_rate": 5e-05, "llm_loss": 0.5633172914385796, "loss": 2.5737, "loss_aux_layer_0": 0.0115203857421875, "loss_aux_layer_1": 0.030609130859375, "loss_aux_layer_10": 0.0570068359375, "loss_aux_layer_11": 0.060791015625, "loss_aux_layer_12": 0.06494140625, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.078125, "loss_aux_layer_15": 0.0869140625, "loss_aux_layer_16": 0.0960693359375, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.1121826171875, "loss_aux_layer_19": 0.1162109375, "loss_aux_layer_2": 0.04296875, "loss_aux_layer_20": 0.124267578125, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.052734375, "loss_aux_layer_4": 0.05511474609375, "loss_aux_layer_5": 0.0565185546875, "loss_aux_layer_6": 0.0594482421875, "loss_aux_layer_7": 0.05767822265625, "loss_aux_layer_8": 0.0570068359375, "loss_aux_layer_9": 0.055908203125, "step": 3965, "total_loss": 0.6434344202280045 }, { "epoch": 0.7851910512769749, "grad_norm": 1.0850937366485596, "learning_rate": 5e-05, "llm_loss": 0.4768463000655174, "loss": 2.236, "loss_aux_layer_0": 0.011749267578125, "loss_aux_layer_1": 0.031829833984375, "loss_aux_layer_10": 0.05963134765625, "loss_aux_layer_11": 0.063720703125, "loss_aux_layer_12": 0.068115234375, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.0814208984375, "loss_aux_layer_15": 0.0894775390625, "loss_aux_layer_16": 0.098388671875, "loss_aux_layer_17": 0.1058349609375, "loss_aux_layer_18": 0.1148681640625, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.044677734375, "loss_aux_layer_20": 0.1256103515625, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.05419921875, "loss_aux_layer_4": 0.056640625, "loss_aux_layer_5": 0.0584716796875, "loss_aux_layer_6": 0.06134033203125, "loss_aux_layer_7": 0.05987548828125, "loss_aux_layer_8": 0.0595703125, "loss_aux_layer_9": 0.0582275390625, "step": 3966, "total_loss": 0.5590035915374756 }, { "epoch": 0.7853890318748763, "grad_norm": 0.9341045022010803, "learning_rate": 5e-05, "llm_loss": 0.5437299087643623, "loss": 2.5139, "loss_aux_layer_0": 0.0119781494140625, "loss_aux_layer_1": 0.033966064453125, "loss_aux_layer_10": 0.0621337890625, "loss_aux_layer_11": 0.06634521484375, "loss_aux_layer_12": 0.0709228515625, "loss_aux_layer_13": 0.0762939453125, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.093017578125, "loss_aux_layer_16": 0.102294921875, "loss_aux_layer_17": 0.109619140625, "loss_aux_layer_18": 0.1168212890625, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.04754638671875, "loss_aux_layer_20": 0.127197265625, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.05780029296875, "loss_aux_layer_4": 0.06036376953125, "loss_aux_layer_5": 0.06146240234375, "loss_aux_layer_6": 0.06451416015625, "loss_aux_layer_7": 0.06268310546875, "loss_aux_layer_8": 0.0618896484375, "loss_aux_layer_9": 0.060791015625, "step": 3967, "total_loss": 0.6284780204296112 }, { "epoch": 0.7855870124727776, "grad_norm": 1.103190541267395, "learning_rate": 5e-05, "llm_loss": 0.5785206258296967, "loss": 2.6329, "loss_aux_layer_0": 0.01177978515625, "loss_aux_layer_1": 0.0306396484375, "loss_aux_layer_10": 0.0572509765625, "loss_aux_layer_11": 0.0611572265625, "loss_aux_layer_12": 0.0654296875, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.0791015625, "loss_aux_layer_15": 0.086669921875, "loss_aux_layer_16": 0.0955810546875, "loss_aux_layer_17": 0.1033935546875, "loss_aux_layer_18": 0.1114501953125, "loss_aux_layer_19": 0.1148681640625, "loss_aux_layer_2": 0.042724609375, "loss_aux_layer_20": 0.1226806640625, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.05230712890625, "loss_aux_layer_4": 0.0548095703125, "loss_aux_layer_5": 0.05621337890625, "loss_aux_layer_6": 0.05914306640625, "loss_aux_layer_7": 0.057373046875, "loss_aux_layer_8": 0.05694580078125, "loss_aux_layer_9": 0.055908203125, "step": 3968, "total_loss": 0.6582134664058685 }, { "epoch": 0.7857849930706791, "grad_norm": 0.9206110239028931, "learning_rate": 5e-05, "llm_loss": 0.5803579539060593, "loss": 2.6463, "loss_aux_layer_0": 0.011993408203125, "loss_aux_layer_1": 0.03106689453125, "loss_aux_layer_10": 0.057861328125, "loss_aux_layer_11": 0.061767578125, "loss_aux_layer_12": 0.06622314453125, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.0802001953125, "loss_aux_layer_15": 0.088623046875, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.116943359375, "loss_aux_layer_2": 0.0440673828125, "loss_aux_layer_20": 0.1248779296875, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.191162109375, "loss_aux_layer_3": 0.053466796875, "loss_aux_layer_4": 0.0556640625, "loss_aux_layer_5": 0.0567626953125, "loss_aux_layer_6": 0.05975341796875, "loss_aux_layer_7": 0.057861328125, "loss_aux_layer_8": 0.05767822265625, "loss_aux_layer_9": 0.05657958984375, "step": 3969, "total_loss": 0.6615642011165619 }, { "epoch": 0.7859829736685805, "grad_norm": 0.917998194694519, "learning_rate": 5e-05, "llm_loss": 0.6345077902078629, "loss": 2.8675, "loss_aux_layer_0": 0.011688232421875, "loss_aux_layer_1": 0.03204345703125, "loss_aux_layer_10": 0.05889892578125, "loss_aux_layer_11": 0.06304931640625, "loss_aux_layer_12": 0.0673828125, "loss_aux_layer_13": 0.0728759765625, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.09033203125, "loss_aux_layer_16": 0.099853515625, "loss_aux_layer_17": 0.10791015625, "loss_aux_layer_18": 0.115966796875, "loss_aux_layer_19": 0.1192626953125, "loss_aux_layer_2": 0.04473876953125, "loss_aux_layer_20": 0.126708984375, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.053955078125, "loss_aux_layer_4": 0.0562744140625, "loss_aux_layer_5": 0.0576171875, "loss_aux_layer_6": 0.060791015625, "loss_aux_layer_7": 0.05914306640625, "loss_aux_layer_8": 0.05877685546875, "loss_aux_layer_9": 0.057373046875, "step": 3970, "total_loss": 0.7168835699558258 }, { "epoch": 0.7861809542664819, "grad_norm": 0.9903396964073181, "learning_rate": 5e-05, "llm_loss": 0.5566323101520538, "loss": 2.5363, "loss_aux_layer_0": 0.0123291015625, "loss_aux_layer_1": 0.03082275390625, "loss_aux_layer_10": 0.05517578125, "loss_aux_layer_11": 0.05877685546875, "loss_aux_layer_12": 0.06280517578125, "loss_aux_layer_13": 0.06787109375, "loss_aux_layer_14": 0.075439453125, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.0999755859375, "loss_aux_layer_18": 0.108154296875, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.0426025390625, "loss_aux_layer_20": 0.1192626953125, "loss_aux_layer_21": 0.126953125, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.181884765625, "loss_aux_layer_3": 0.05145263671875, "loss_aux_layer_4": 0.05364990234375, "loss_aux_layer_5": 0.05499267578125, "loss_aux_layer_6": 0.0576171875, "loss_aux_layer_7": 0.055908203125, "loss_aux_layer_8": 0.0552978515625, "loss_aux_layer_9": 0.05401611328125, "step": 3971, "total_loss": 0.6340795010328293 }, { "epoch": 0.7863789348643833, "grad_norm": 0.8285987973213196, "learning_rate": 5e-05, "llm_loss": 0.69802226126194, "loss": 3.1266, "loss_aux_layer_0": 0.0114593505859375, "loss_aux_layer_1": 0.03204345703125, "loss_aux_layer_10": 0.06121826171875, "loss_aux_layer_11": 0.06549072265625, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.076171875, "loss_aux_layer_14": 0.083984375, "loss_aux_layer_15": 0.092041015625, "loss_aux_layer_16": 0.1011962890625, "loss_aux_layer_17": 0.1087646484375, "loss_aux_layer_18": 0.1168212890625, "loss_aux_layer_19": 0.1201171875, "loss_aux_layer_2": 0.0447998046875, "loss_aux_layer_20": 0.1273193359375, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.054931640625, "loss_aux_layer_4": 0.05755615234375, "loss_aux_layer_5": 0.05926513671875, "loss_aux_layer_6": 0.0625, "loss_aux_layer_7": 0.0609130859375, "loss_aux_layer_8": 0.0604248046875, "loss_aux_layer_9": 0.0595703125, "step": 3972, "total_loss": 0.7816519439220428 }, { "epoch": 0.7865769154622847, "grad_norm": 0.847057580947876, "learning_rate": 5e-05, "llm_loss": 0.6623578518629074, "loss": 2.9739, "loss_aux_layer_0": 0.0120086669921875, "loss_aux_layer_1": 0.032073974609375, "loss_aux_layer_10": 0.05908203125, "loss_aux_layer_11": 0.06317138671875, "loss_aux_layer_12": 0.0672607421875, "loss_aux_layer_13": 0.0723876953125, "loss_aux_layer_14": 0.080078125, "loss_aux_layer_15": 0.0877685546875, "loss_aux_layer_16": 0.096923828125, "loss_aux_layer_17": 0.104736328125, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.115234375, "loss_aux_layer_2": 0.04412841796875, "loss_aux_layer_20": 0.1229248046875, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.0543212890625, "loss_aux_layer_4": 0.05694580078125, "loss_aux_layer_5": 0.05841064453125, "loss_aux_layer_6": 0.06182861328125, "loss_aux_layer_7": 0.059814453125, "loss_aux_layer_8": 0.05902099609375, "loss_aux_layer_9": 0.05767822265625, "step": 3973, "total_loss": 0.7434720546007156 }, { "epoch": 0.7867748960601861, "grad_norm": 1.195949673652649, "learning_rate": 5e-05, "llm_loss": 0.7402422428131104, "loss": 3.2881, "loss_aux_layer_0": 0.0115814208984375, "loss_aux_layer_1": 0.030364990234375, "loss_aux_layer_10": 0.057373046875, "loss_aux_layer_11": 0.06146240234375, "loss_aux_layer_12": 0.0660400390625, "loss_aux_layer_13": 0.0718994140625, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.090087890625, "loss_aux_layer_16": 0.1007080078125, "loss_aux_layer_17": 0.109130859375, "loss_aux_layer_18": 0.1175537109375, "loss_aux_layer_19": 0.120849609375, "loss_aux_layer_2": 0.04254150390625, "loss_aux_layer_20": 0.128173828125, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.05218505859375, "loss_aux_layer_4": 0.0546875, "loss_aux_layer_5": 0.0562744140625, "loss_aux_layer_6": 0.05926513671875, "loss_aux_layer_7": 0.05743408203125, "loss_aux_layer_8": 0.0570068359375, "loss_aux_layer_9": 0.05596923828125, "step": 3974, "total_loss": 0.8220265656709671 }, { "epoch": 0.7869728766580875, "grad_norm": 1.59326171875, "learning_rate": 5e-05, "llm_loss": 0.5405524969100952, "loss": 2.4773, "loss_aux_layer_0": 0.0117340087890625, "loss_aux_layer_1": 0.029541015625, "loss_aux_layer_10": 0.0560302734375, "loss_aux_layer_11": 0.0595703125, "loss_aux_layer_12": 0.063720703125, "loss_aux_layer_13": 0.069091796875, "loss_aux_layer_14": 0.0770263671875, "loss_aux_layer_15": 0.08544921875, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.1029052734375, "loss_aux_layer_18": 0.1114501953125, "loss_aux_layer_19": 0.115478515625, "loss_aux_layer_2": 0.041259765625, "loss_aux_layer_20": 0.1231689453125, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.18603515625, "loss_aux_layer_3": 0.05072021484375, "loss_aux_layer_4": 0.05303955078125, "loss_aux_layer_5": 0.0546875, "loss_aux_layer_6": 0.05743408203125, "loss_aux_layer_7": 0.05621337890625, "loss_aux_layer_8": 0.0557861328125, "loss_aux_layer_9": 0.05474853515625, "step": 3975, "total_loss": 0.6193243116140366 }, { "epoch": 0.7871708572559889, "grad_norm": 0.8725237846374512, "learning_rate": 5e-05, "llm_loss": 0.5212876349687576, "loss": 2.4166, "loss_aux_layer_0": 0.01312255859375, "loss_aux_layer_1": 0.03240966796875, "loss_aux_layer_10": 0.0596923828125, "loss_aux_layer_11": 0.06378173828125, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.073486328125, "loss_aux_layer_14": 0.081787109375, "loss_aux_layer_15": 0.090087890625, "loss_aux_layer_16": 0.0992431640625, "loss_aux_layer_17": 0.1070556640625, "loss_aux_layer_18": 0.11572265625, "loss_aux_layer_19": 0.119873046875, "loss_aux_layer_2": 0.0447998046875, "loss_aux_layer_20": 0.128173828125, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.0543212890625, "loss_aux_layer_4": 0.05657958984375, "loss_aux_layer_5": 0.05816650390625, "loss_aux_layer_6": 0.061279296875, "loss_aux_layer_7": 0.059814453125, "loss_aux_layer_8": 0.05950927734375, "loss_aux_layer_9": 0.05828857421875, "step": 3976, "total_loss": 0.6041479408740997 }, { "epoch": 0.7873688378538903, "grad_norm": 1.038362979888916, "learning_rate": 5e-05, "llm_loss": 0.5038034468889236, "loss": 2.3357, "loss_aux_layer_0": 0.013092041015625, "loss_aux_layer_1": 0.030517578125, "loss_aux_layer_10": 0.0576171875, "loss_aux_layer_11": 0.0611572265625, "loss_aux_layer_12": 0.0653076171875, "loss_aux_layer_13": 0.0704345703125, "loss_aux_layer_14": 0.07861328125, "loss_aux_layer_15": 0.0869140625, "loss_aux_layer_16": 0.0958251953125, "loss_aux_layer_17": 0.1036376953125, "loss_aux_layer_18": 0.112060546875, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.04351806640625, "loss_aux_layer_20": 0.1239013671875, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.05242919921875, "loss_aux_layer_4": 0.0545654296875, "loss_aux_layer_5": 0.0560302734375, "loss_aux_layer_6": 0.05889892578125, "loss_aux_layer_7": 0.057373046875, "loss_aux_layer_8": 0.05718994140625, "loss_aux_layer_9": 0.05621337890625, "step": 3977, "total_loss": 0.5839140862226486 }, { "epoch": 0.7875668184517918, "grad_norm": 0.9896733164787292, "learning_rate": 5e-05, "llm_loss": 0.5091597661376, "loss": 2.3671, "loss_aux_layer_0": 0.0121612548828125, "loss_aux_layer_1": 0.032012939453125, "loss_aux_layer_10": 0.0595703125, "loss_aux_layer_11": 0.06353759765625, "loss_aux_layer_12": 0.068115234375, "loss_aux_layer_13": 0.0740966796875, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.0904541015625, "loss_aux_layer_16": 0.099365234375, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.1148681640625, "loss_aux_layer_19": 0.1175537109375, "loss_aux_layer_2": 0.045166015625, "loss_aux_layer_20": 0.125244140625, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.05517578125, "loss_aux_layer_4": 0.0574951171875, "loss_aux_layer_5": 0.05914306640625, "loss_aux_layer_6": 0.061767578125, "loss_aux_layer_7": 0.060302734375, "loss_aux_layer_8": 0.05975341796875, "loss_aux_layer_9": 0.058349609375, "step": 3978, "total_loss": 0.5917850732803345 }, { "epoch": 0.7877647990496931, "grad_norm": 1.0133992433547974, "learning_rate": 5e-05, "llm_loss": 0.5650844126939774, "loss": 2.5933, "loss_aux_layer_0": 0.0154876708984375, "loss_aux_layer_1": 0.03375244140625, "loss_aux_layer_10": 0.06036376953125, "loss_aux_layer_11": 0.06439208984375, "loss_aux_layer_12": 0.0687255859375, "loss_aux_layer_13": 0.0740966796875, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.090576171875, "loss_aux_layer_16": 0.0994873046875, "loss_aux_layer_17": 0.107177734375, "loss_aux_layer_18": 0.114990234375, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.04669189453125, "loss_aux_layer_20": 0.1251220703125, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.191162109375, "loss_aux_layer_3": 0.0562744140625, "loss_aux_layer_4": 0.0584716796875, "loss_aux_layer_5": 0.06011962890625, "loss_aux_layer_6": 0.06292724609375, "loss_aux_layer_7": 0.0611572265625, "loss_aux_layer_8": 0.06048583984375, "loss_aux_layer_9": 0.0589599609375, "step": 3979, "total_loss": 0.6483346372842789 }, { "epoch": 0.7879627796475945, "grad_norm": 0.8960669040679932, "learning_rate": 5e-05, "llm_loss": 0.5466637462377548, "loss": 2.5082, "loss_aux_layer_0": 0.011871337890625, "loss_aux_layer_1": 0.03076171875, "loss_aux_layer_10": 0.05694580078125, "loss_aux_layer_11": 0.06085205078125, "loss_aux_layer_12": 0.0650634765625, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.0963134765625, "loss_aux_layer_17": 0.1038818359375, "loss_aux_layer_18": 0.1123046875, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.04351806640625, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.190673828125, "loss_aux_layer_3": 0.052734375, "loss_aux_layer_4": 0.05511474609375, "loss_aux_layer_5": 0.05694580078125, "loss_aux_layer_6": 0.05999755859375, "loss_aux_layer_7": 0.05792236328125, "loss_aux_layer_8": 0.05712890625, "loss_aux_layer_9": 0.05584716796875, "step": 3980, "total_loss": 0.6270510256290436 }, { "epoch": 0.788160760245496, "grad_norm": 1.1814228296279907, "learning_rate": 5e-05, "llm_loss": 0.5444770306348801, "loss": 2.5118, "loss_aux_layer_0": 0.016082763671875, "loss_aux_layer_1": 0.033203125, "loss_aux_layer_10": 0.05950927734375, "loss_aux_layer_11": 0.06353759765625, "loss_aux_layer_12": 0.0679931640625, "loss_aux_layer_13": 0.0732421875, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.0906982421875, "loss_aux_layer_16": 0.099853515625, "loss_aux_layer_17": 0.1075439453125, "loss_aux_layer_18": 0.1156005859375, "loss_aux_layer_19": 0.1185302734375, "loss_aux_layer_2": 0.04608154296875, "loss_aux_layer_20": 0.126708984375, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.05560302734375, "loss_aux_layer_4": 0.0577392578125, "loss_aux_layer_5": 0.05926513671875, "loss_aux_layer_6": 0.06243896484375, "loss_aux_layer_7": 0.060546875, "loss_aux_layer_8": 0.05975341796875, "loss_aux_layer_9": 0.05828857421875, "step": 3981, "total_loss": 0.6279445886611938 }, { "epoch": 0.7883587408433973, "grad_norm": 0.9098619222640991, "learning_rate": 5e-05, "llm_loss": 0.5420055389404297, "loss": 2.4999, "loss_aux_layer_0": 0.0146484375, "loss_aux_layer_1": 0.032806396484375, "loss_aux_layer_10": 0.0596923828125, "loss_aux_layer_11": 0.06390380859375, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.073974609375, "loss_aux_layer_14": 0.08251953125, "loss_aux_layer_15": 0.091064453125, "loss_aux_layer_16": 0.100341796875, "loss_aux_layer_17": 0.108154296875, "loss_aux_layer_18": 0.115966796875, "loss_aux_layer_19": 0.118896484375, "loss_aux_layer_2": 0.04498291015625, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.0545654296875, "loss_aux_layer_4": 0.05706787109375, "loss_aux_layer_5": 0.05877685546875, "loss_aux_layer_6": 0.061767578125, "loss_aux_layer_7": 0.06011962890625, "loss_aux_layer_8": 0.05938720703125, "loss_aux_layer_9": 0.05841064453125, "step": 3982, "total_loss": 0.6249691694974899 }, { "epoch": 0.7885567214412987, "grad_norm": 0.920737087726593, "learning_rate": 5e-05, "llm_loss": 0.6076406985521317, "loss": 2.7515, "loss_aux_layer_0": 0.0122222900390625, "loss_aux_layer_1": 0.032257080078125, "loss_aux_layer_10": 0.05926513671875, "loss_aux_layer_11": 0.0625, "loss_aux_layer_12": 0.0667724609375, "loss_aux_layer_13": 0.072021484375, "loss_aux_layer_14": 0.080078125, "loss_aux_layer_15": 0.087890625, "loss_aux_layer_16": 0.0966796875, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.1112060546875, "loss_aux_layer_19": 0.1136474609375, "loss_aux_layer_2": 0.0443115234375, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.127685546875, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.182373046875, "loss_aux_layer_3": 0.0538330078125, "loss_aux_layer_4": 0.05621337890625, "loss_aux_layer_5": 0.05804443359375, "loss_aux_layer_6": 0.06097412109375, "loss_aux_layer_7": 0.05938720703125, "loss_aux_layer_8": 0.05914306640625, "loss_aux_layer_9": 0.05816650390625, "step": 3983, "total_loss": 0.6878857910633087 }, { "epoch": 0.7887547020392002, "grad_norm": 0.891149640083313, "learning_rate": 5e-05, "llm_loss": 0.6255309134721756, "loss": 2.8224, "loss_aux_layer_0": 0.01336669921875, "loss_aux_layer_1": 0.0318603515625, "loss_aux_layer_10": 0.05792236328125, "loss_aux_layer_11": 0.06170654296875, "loss_aux_layer_12": 0.0657958984375, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.0791015625, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.0960693359375, "loss_aux_layer_17": 0.103759765625, "loss_aux_layer_18": 0.1116943359375, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.04339599609375, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.05328369140625, "loss_aux_layer_4": 0.0557861328125, "loss_aux_layer_5": 0.05718994140625, "loss_aux_layer_6": 0.0601806640625, "loss_aux_layer_7": 0.058349609375, "loss_aux_layer_8": 0.05792236328125, "loss_aux_layer_9": 0.056640625, "step": 3984, "total_loss": 0.7056048363447189 }, { "epoch": 0.7889526826371016, "grad_norm": 0.8963476419448853, "learning_rate": 5e-05, "llm_loss": 0.4963139519095421, "loss": 2.3073, "loss_aux_layer_0": 0.012786865234375, "loss_aux_layer_1": 0.031646728515625, "loss_aux_layer_10": 0.0577392578125, "loss_aux_layer_11": 0.061767578125, "loss_aux_layer_12": 0.06573486328125, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.0792236328125, "loss_aux_layer_15": 0.087158203125, "loss_aux_layer_16": 0.0965576171875, "loss_aux_layer_17": 0.1038818359375, "loss_aux_layer_18": 0.112548828125, "loss_aux_layer_19": 0.1160888671875, "loss_aux_layer_2": 0.0430908203125, "loss_aux_layer_20": 0.12353515625, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.052978515625, "loss_aux_layer_4": 0.05560302734375, "loss_aux_layer_5": 0.05694580078125, "loss_aux_layer_6": 0.0596923828125, "loss_aux_layer_7": 0.0584716796875, "loss_aux_layer_8": 0.057861328125, "loss_aux_layer_9": 0.05657958984375, "step": 3985, "total_loss": 0.576813206076622 }, { "epoch": 0.7891506632350029, "grad_norm": 0.8399654030799866, "learning_rate": 5e-05, "llm_loss": 0.5735466331243515, "loss": 2.6245, "loss_aux_layer_0": 0.0124969482421875, "loss_aux_layer_1": 0.03265380859375, "loss_aux_layer_10": 0.06011962890625, "loss_aux_layer_11": 0.06427001953125, "loss_aux_layer_12": 0.0687255859375, "loss_aux_layer_13": 0.073974609375, "loss_aux_layer_14": 0.08251953125, "loss_aux_layer_15": 0.090576171875, "loss_aux_layer_16": 0.0999755859375, "loss_aux_layer_17": 0.107177734375, "loss_aux_layer_18": 0.1151123046875, "loss_aux_layer_19": 0.1175537109375, "loss_aux_layer_2": 0.04534912109375, "loss_aux_layer_20": 0.124267578125, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.05535888671875, "loss_aux_layer_4": 0.057861328125, "loss_aux_layer_5": 0.05926513671875, "loss_aux_layer_6": 0.0625, "loss_aux_layer_7": 0.060546875, "loss_aux_layer_8": 0.05987548828125, "loss_aux_layer_9": 0.0587158203125, "step": 3986, "total_loss": 0.6561289578676224 }, { "epoch": 0.7893486438329044, "grad_norm": 0.7744699120521545, "learning_rate": 5e-05, "llm_loss": 0.6245196908712387, "loss": 2.8071, "loss_aux_layer_0": 0.012298583984375, "loss_aux_layer_1": 0.030517578125, "loss_aux_layer_10": 0.0545654296875, "loss_aux_layer_11": 0.05828857421875, "loss_aux_layer_12": 0.062744140625, "loss_aux_layer_13": 0.06787109375, "loss_aux_layer_14": 0.0758056640625, "loss_aux_layer_15": 0.0836181640625, "loss_aux_layer_16": 0.09228515625, "loss_aux_layer_17": 0.100341796875, "loss_aux_layer_18": 0.108154296875, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.04180908203125, "loss_aux_layer_20": 0.119384765625, "loss_aux_layer_21": 0.127197265625, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.05078125, "loss_aux_layer_4": 0.05303955078125, "loss_aux_layer_5": 0.05419921875, "loss_aux_layer_6": 0.05694580078125, "loss_aux_layer_7": 0.05511474609375, "loss_aux_layer_8": 0.0545654296875, "loss_aux_layer_9": 0.05340576171875, "step": 3987, "total_loss": 0.7017793953418732 }, { "epoch": 0.7895466244308058, "grad_norm": 0.8755883574485779, "learning_rate": 5e-05, "llm_loss": 0.5854286700487137, "loss": 2.6624, "loss_aux_layer_0": 0.0124664306640625, "loss_aux_layer_1": 0.03125, "loss_aux_layer_10": 0.0567626953125, "loss_aux_layer_11": 0.060546875, "loss_aux_layer_12": 0.065185546875, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.0794677734375, "loss_aux_layer_15": 0.0877685546875, "loss_aux_layer_16": 0.096923828125, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.112548828125, "loss_aux_layer_19": 0.116455078125, "loss_aux_layer_2": 0.0430908203125, "loss_aux_layer_20": 0.124267578125, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.0521240234375, "loss_aux_layer_4": 0.05462646484375, "loss_aux_layer_5": 0.05609130859375, "loss_aux_layer_6": 0.0589599609375, "loss_aux_layer_7": 0.05712890625, "loss_aux_layer_8": 0.056396484375, "loss_aux_layer_9": 0.055419921875, "step": 3988, "total_loss": 0.6656017899513245 }, { "epoch": 0.7897446050287071, "grad_norm": 1.064092993736267, "learning_rate": 5e-05, "llm_loss": 0.5283884927630424, "loss": 2.4395, "loss_aux_layer_0": 0.012451171875, "loss_aux_layer_1": 0.03204345703125, "loss_aux_layer_10": 0.05902099609375, "loss_aux_layer_11": 0.06298828125, "loss_aux_layer_12": 0.06744384765625, "loss_aux_layer_13": 0.0726318359375, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.0889892578125, "loss_aux_layer_16": 0.097900390625, "loss_aux_layer_17": 0.1053466796875, "loss_aux_layer_18": 0.11328125, "loss_aux_layer_19": 0.1162109375, "loss_aux_layer_2": 0.04498291015625, "loss_aux_layer_20": 0.1234130859375, "loss_aux_layer_21": 0.1312255859375, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.05474853515625, "loss_aux_layer_4": 0.0570068359375, "loss_aux_layer_5": 0.05828857421875, "loss_aux_layer_6": 0.06097412109375, "loss_aux_layer_7": 0.05926513671875, "loss_aux_layer_8": 0.05853271484375, "loss_aux_layer_9": 0.0574951171875, "step": 3989, "total_loss": 0.6098832190036774 }, { "epoch": 0.7899425856266086, "grad_norm": 1.0512958765029907, "learning_rate": 5e-05, "llm_loss": 0.5330987572669983, "loss": 2.4426, "loss_aux_layer_0": 0.012542724609375, "loss_aux_layer_1": 0.029541015625, "loss_aux_layer_10": 0.05401611328125, "loss_aux_layer_11": 0.05780029296875, "loss_aux_layer_12": 0.0621337890625, "loss_aux_layer_13": 0.06744384765625, "loss_aux_layer_14": 0.07568359375, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.09375, "loss_aux_layer_17": 0.1019287109375, "loss_aux_layer_18": 0.1103515625, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.04034423828125, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.04901123046875, "loss_aux_layer_4": 0.0511474609375, "loss_aux_layer_5": 0.052734375, "loss_aux_layer_6": 0.0555419921875, "loss_aux_layer_7": 0.05389404296875, "loss_aux_layer_8": 0.053466796875, "loss_aux_layer_9": 0.052490234375, "step": 3990, "total_loss": 0.6106467843055725 }, { "epoch": 0.79014056622451, "grad_norm": 1.097423791885376, "learning_rate": 5e-05, "llm_loss": 0.5866741091012955, "loss": 2.6734, "loss_aux_layer_0": 0.012420654296875, "loss_aux_layer_1": 0.0303955078125, "loss_aux_layer_10": 0.05596923828125, "loss_aux_layer_11": 0.05999755859375, "loss_aux_layer_12": 0.06463623046875, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.080322265625, "loss_aux_layer_15": 0.089599609375, "loss_aux_layer_16": 0.0999755859375, "loss_aux_layer_17": 0.108642578125, "loss_aux_layer_18": 0.1171875, "loss_aux_layer_19": 0.12109375, "loss_aux_layer_2": 0.042236328125, "loss_aux_layer_20": 0.12939453125, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.15966796875, "loss_aux_layer_23": 0.1982421875, "loss_aux_layer_3": 0.05096435546875, "loss_aux_layer_4": 0.05340576171875, "loss_aux_layer_5": 0.05487060546875, "loss_aux_layer_6": 0.05755615234375, "loss_aux_layer_7": 0.0557861328125, "loss_aux_layer_8": 0.05523681640625, "loss_aux_layer_9": 0.05438232421875, "step": 3991, "total_loss": 0.6683470159769058 }, { "epoch": 0.7903385468224114, "grad_norm": 0.9328166246414185, "learning_rate": 5e-05, "llm_loss": 0.542859748005867, "loss": 2.497, "loss_aux_layer_0": 0.0126495361328125, "loss_aux_layer_1": 0.031646728515625, "loss_aux_layer_10": 0.05859375, "loss_aux_layer_11": 0.0623779296875, "loss_aux_layer_12": 0.06689453125, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.080078125, "loss_aux_layer_15": 0.08837890625, "loss_aux_layer_16": 0.0975341796875, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.1138916015625, "loss_aux_layer_19": 0.1173095703125, "loss_aux_layer_2": 0.04400634765625, "loss_aux_layer_20": 0.1253662109375, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.053466796875, "loss_aux_layer_4": 0.05621337890625, "loss_aux_layer_5": 0.0577392578125, "loss_aux_layer_6": 0.06060791015625, "loss_aux_layer_7": 0.05889892578125, "loss_aux_layer_8": 0.05816650390625, "loss_aux_layer_9": 0.0570068359375, "step": 3992, "total_loss": 0.6242514923214912 }, { "epoch": 0.7905365274203128, "grad_norm": 1.0795069932937622, "learning_rate": 5e-05, "llm_loss": 0.6743427217006683, "loss": 3.015, "loss_aux_layer_0": 0.0134429931640625, "loss_aux_layer_1": 0.03155517578125, "loss_aux_layer_10": 0.05657958984375, "loss_aux_layer_11": 0.0601806640625, "loss_aux_layer_12": 0.0645751953125, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.07861328125, "loss_aux_layer_15": 0.0865478515625, "loss_aux_layer_16": 0.095703125, "loss_aux_layer_17": 0.1036376953125, "loss_aux_layer_18": 0.1112060546875, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.042724609375, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.1292724609375, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.05218505859375, "loss_aux_layer_4": 0.05462646484375, "loss_aux_layer_5": 0.05615234375, "loss_aux_layer_6": 0.0589599609375, "loss_aux_layer_7": 0.0572509765625, "loss_aux_layer_8": 0.0565185546875, "loss_aux_layer_9": 0.0552978515625, "step": 3993, "total_loss": 0.7537490725517273 }, { "epoch": 0.7907345080182142, "grad_norm": 0.904660701751709, "learning_rate": 5e-05, "llm_loss": 0.5999794900417328, "loss": 2.73, "loss_aux_layer_0": 0.0124053955078125, "loss_aux_layer_1": 0.033111572265625, "loss_aux_layer_10": 0.06024169921875, "loss_aux_layer_11": 0.06414794921875, "loss_aux_layer_12": 0.068603515625, "loss_aux_layer_13": 0.073974609375, "loss_aux_layer_14": 0.0819091796875, "loss_aux_layer_15": 0.09033203125, "loss_aux_layer_16": 0.099365234375, "loss_aux_layer_17": 0.10693359375, "loss_aux_layer_18": 0.114501953125, "loss_aux_layer_19": 0.1171875, "loss_aux_layer_2": 0.04522705078125, "loss_aux_layer_20": 0.1246337890625, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.05535888671875, "loss_aux_layer_4": 0.05810546875, "loss_aux_layer_5": 0.0596923828125, "loss_aux_layer_6": 0.06256103515625, "loss_aux_layer_7": 0.0609130859375, "loss_aux_layer_8": 0.06036376953125, "loss_aux_layer_9": 0.0587158203125, "step": 3994, "total_loss": 0.6825041621923447 }, { "epoch": 0.7909324886161156, "grad_norm": 0.9322746992111206, "learning_rate": 5e-05, "llm_loss": 0.5930565744638443, "loss": 2.705, "loss_aux_layer_0": 0.0130767822265625, "loss_aux_layer_1": 0.032073974609375, "loss_aux_layer_10": 0.0606689453125, "loss_aux_layer_11": 0.0645751953125, "loss_aux_layer_12": 0.0692138671875, "loss_aux_layer_13": 0.07470703125, "loss_aux_layer_14": 0.0833740234375, "loss_aux_layer_15": 0.0916748046875, "loss_aux_layer_16": 0.1007080078125, "loss_aux_layer_17": 0.1077880859375, "loss_aux_layer_18": 0.115478515625, "loss_aux_layer_19": 0.117919921875, "loss_aux_layer_2": 0.04541015625, "loss_aux_layer_20": 0.1253662109375, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.05517578125, "loss_aux_layer_4": 0.05792236328125, "loss_aux_layer_5": 0.05963134765625, "loss_aux_layer_6": 0.06268310546875, "loss_aux_layer_7": 0.06103515625, "loss_aux_layer_8": 0.06048583984375, "loss_aux_layer_9": 0.0592041015625, "step": 3995, "total_loss": 0.6762445569038391 }, { "epoch": 0.791130469214017, "grad_norm": 0.9581601023674011, "learning_rate": 5e-05, "llm_loss": 0.606914296746254, "loss": 2.7417, "loss_aux_layer_0": 0.0128173828125, "loss_aux_layer_1": 0.030120849609375, "loss_aux_layer_10": 0.0552978515625, "loss_aux_layer_11": 0.05902099609375, "loss_aux_layer_12": 0.06317138671875, "loss_aux_layer_13": 0.0684814453125, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.08544921875, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.1026611328125, "loss_aux_layer_18": 0.110595703125, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.04193115234375, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.18603515625, "loss_aux_layer_3": 0.051025390625, "loss_aux_layer_4": 0.05316162109375, "loss_aux_layer_5": 0.054443359375, "loss_aux_layer_6": 0.057373046875, "loss_aux_layer_7": 0.05584716796875, "loss_aux_layer_8": 0.05517578125, "loss_aux_layer_9": 0.0540771484375, "step": 3996, "total_loss": 0.6854365766048431 }, { "epoch": 0.7913284498119184, "grad_norm": 0.9020972847938538, "learning_rate": 5e-05, "llm_loss": 0.5130331292748451, "loss": 2.3678, "loss_aux_layer_0": 0.012664794921875, "loss_aux_layer_1": 0.030303955078125, "loss_aux_layer_10": 0.05523681640625, "loss_aux_layer_11": 0.05902099609375, "loss_aux_layer_12": 0.0631103515625, "loss_aux_layer_13": 0.0682373046875, "loss_aux_layer_14": 0.07666015625, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.0946044921875, "loss_aux_layer_17": 0.1025390625, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.1153564453125, "loss_aux_layer_2": 0.04266357421875, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.05194091796875, "loss_aux_layer_4": 0.0540771484375, "loss_aux_layer_5": 0.0555419921875, "loss_aux_layer_6": 0.05828857421875, "loss_aux_layer_7": 0.05633544921875, "loss_aux_layer_8": 0.055419921875, "loss_aux_layer_9": 0.0540771484375, "step": 3997, "total_loss": 0.591957576572895 }, { "epoch": 0.7915264304098198, "grad_norm": 0.9674075245857239, "learning_rate": 5e-05, "llm_loss": 0.6040395051240921, "loss": 2.7474, "loss_aux_layer_0": 0.0134124755859375, "loss_aux_layer_1": 0.032958984375, "loss_aux_layer_10": 0.0594482421875, "loss_aux_layer_11": 0.063720703125, "loss_aux_layer_12": 0.06817626953125, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.081787109375, "loss_aux_layer_15": 0.08984375, "loss_aux_layer_16": 0.09912109375, "loss_aux_layer_17": 0.106689453125, "loss_aux_layer_18": 0.1151123046875, "loss_aux_layer_19": 0.11865234375, "loss_aux_layer_2": 0.04644775390625, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.190673828125, "loss_aux_layer_3": 0.05596923828125, "loss_aux_layer_4": 0.05828857421875, "loss_aux_layer_5": 0.05938720703125, "loss_aux_layer_6": 0.06219482421875, "loss_aux_layer_7": 0.06036376953125, "loss_aux_layer_8": 0.05975341796875, "loss_aux_layer_9": 0.0582275390625, "step": 3998, "total_loss": 0.6868495941162109 }, { "epoch": 0.7917244110077213, "grad_norm": 0.9449564814567566, "learning_rate": 5e-05, "llm_loss": 0.6046867370605469, "loss": 2.749, "loss_aux_layer_0": 0.0128631591796875, "loss_aux_layer_1": 0.032318115234375, "loss_aux_layer_10": 0.060546875, "loss_aux_layer_11": 0.0645751953125, "loss_aux_layer_12": 0.0687255859375, "loss_aux_layer_13": 0.07421875, "loss_aux_layer_14": 0.082275390625, "loss_aux_layer_15": 0.09033203125, "loss_aux_layer_16": 0.09912109375, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.114501953125, "loss_aux_layer_19": 0.1163330078125, "loss_aux_layer_2": 0.0452880859375, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.05572509765625, "loss_aux_layer_4": 0.05841064453125, "loss_aux_layer_5": 0.06011962890625, "loss_aux_layer_6": 0.0633544921875, "loss_aux_layer_7": 0.06170654296875, "loss_aux_layer_8": 0.06085205078125, "loss_aux_layer_9": 0.05926513671875, "step": 3999, "total_loss": 0.6872607171535492 }, { "epoch": 0.7919223916056226, "grad_norm": 0.9222192764282227, "learning_rate": 5e-05, "llm_loss": 0.5759116485714912, "loss": 2.6244, "loss_aux_layer_0": 0.0137786865234375, "loss_aux_layer_1": 0.031829833984375, "loss_aux_layer_10": 0.05718994140625, "loss_aux_layer_11": 0.06121826171875, "loss_aux_layer_12": 0.065673828125, "loss_aux_layer_13": 0.071044921875, "loss_aux_layer_14": 0.0792236328125, "loss_aux_layer_15": 0.08740234375, "loss_aux_layer_16": 0.096435546875, "loss_aux_layer_17": 0.104248046875, "loss_aux_layer_18": 0.1123046875, "loss_aux_layer_19": 0.115478515625, "loss_aux_layer_2": 0.04364013671875, "loss_aux_layer_20": 0.123046875, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.0528564453125, "loss_aux_layer_4": 0.05517578125, "loss_aux_layer_5": 0.05657958984375, "loss_aux_layer_6": 0.059326171875, "loss_aux_layer_7": 0.05767822265625, "loss_aux_layer_8": 0.0570068359375, "loss_aux_layer_9": 0.05572509765625, "step": 4000, "total_loss": 0.6561116129159927 }, { "epoch": 0.792120372203524, "grad_norm": 0.8706445097923279, "learning_rate": 5e-05, "llm_loss": 0.5904482156038284, "loss": 2.6871, "loss_aux_layer_0": 0.011688232421875, "loss_aux_layer_1": 0.031829833984375, "loss_aux_layer_10": 0.05926513671875, "loss_aux_layer_11": 0.06329345703125, "loss_aux_layer_12": 0.067626953125, "loss_aux_layer_13": 0.0731201171875, "loss_aux_layer_14": 0.0811767578125, "loss_aux_layer_15": 0.088623046875, "loss_aux_layer_16": 0.097412109375, "loss_aux_layer_17": 0.10498046875, "loss_aux_layer_18": 0.1129150390625, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.04400634765625, "loss_aux_layer_20": 0.1234130859375, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.0537109375, "loss_aux_layer_4": 0.0562744140625, "loss_aux_layer_5": 0.057861328125, "loss_aux_layer_6": 0.06072998046875, "loss_aux_layer_7": 0.0592041015625, "loss_aux_layer_8": 0.05889892578125, "loss_aux_layer_9": 0.0577392578125, "step": 4001, "total_loss": 0.6717748194932938 }, { "epoch": 0.7923183528014255, "grad_norm": 0.8377547860145569, "learning_rate": 5e-05, "llm_loss": 0.5258365273475647, "loss": 2.4146, "loss_aux_layer_0": 0.013214111328125, "loss_aux_layer_1": 0.029510498046875, "loss_aux_layer_10": 0.05487060546875, "loss_aux_layer_11": 0.05859375, "loss_aux_layer_12": 0.0631103515625, "loss_aux_layer_13": 0.06884765625, "loss_aux_layer_14": 0.0770263671875, "loss_aux_layer_15": 0.0853271484375, "loss_aux_layer_16": 0.094482421875, "loss_aux_layer_17": 0.10205078125, "loss_aux_layer_18": 0.1104736328125, "loss_aux_layer_19": 0.11376953125, "loss_aux_layer_2": 0.04058837890625, "loss_aux_layer_20": 0.1212158203125, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.049560546875, "loss_aux_layer_4": 0.0518798828125, "loss_aux_layer_5": 0.05340576171875, "loss_aux_layer_6": 0.05615234375, "loss_aux_layer_7": 0.05462646484375, "loss_aux_layer_8": 0.05438232421875, "loss_aux_layer_9": 0.05340576171875, "step": 4002, "total_loss": 0.6036426723003387 }, { "epoch": 0.7925163333993268, "grad_norm": 0.7390096187591553, "learning_rate": 5e-05, "llm_loss": 0.5957362353801727, "loss": 2.7101, "loss_aux_layer_0": 0.0123443603515625, "loss_aux_layer_1": 0.03192138671875, "loss_aux_layer_10": 0.05902099609375, "loss_aux_layer_11": 0.06304931640625, "loss_aux_layer_12": 0.0675048828125, "loss_aux_layer_13": 0.072998046875, "loss_aux_layer_14": 0.08154296875, "loss_aux_layer_15": 0.089599609375, "loss_aux_layer_16": 0.0987548828125, "loss_aux_layer_17": 0.106689453125, "loss_aux_layer_18": 0.1146240234375, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.0440673828125, "loss_aux_layer_20": 0.1256103515625, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.0537109375, "loss_aux_layer_4": 0.05621337890625, "loss_aux_layer_5": 0.057861328125, "loss_aux_layer_6": 0.06072998046875, "loss_aux_layer_7": 0.05889892578125, "loss_aux_layer_8": 0.05841064453125, "loss_aux_layer_9": 0.05718994140625, "step": 4003, "total_loss": 0.6775254309177399 }, { "epoch": 0.7927143139972282, "grad_norm": 0.9483216404914856, "learning_rate": 5e-05, "llm_loss": 0.5116042271256447, "loss": 2.372, "loss_aux_layer_0": 0.0126190185546875, "loss_aux_layer_1": 0.031463623046875, "loss_aux_layer_10": 0.05810546875, "loss_aux_layer_11": 0.06195068359375, "loss_aux_layer_12": 0.0662841796875, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.080322265625, "loss_aux_layer_15": 0.0889892578125, "loss_aux_layer_16": 0.0982666015625, "loss_aux_layer_17": 0.1060791015625, "loss_aux_layer_18": 0.1142578125, "loss_aux_layer_19": 0.1173095703125, "loss_aux_layer_2": 0.04443359375, "loss_aux_layer_20": 0.1251220703125, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.05401611328125, "loss_aux_layer_4": 0.055908203125, "loss_aux_layer_5": 0.05755615234375, "loss_aux_layer_6": 0.06005859375, "loss_aux_layer_7": 0.0582275390625, "loss_aux_layer_8": 0.0577392578125, "loss_aux_layer_9": 0.05670166015625, "step": 4004, "total_loss": 0.5929959639906883 }, { "epoch": 0.7929122945951297, "grad_norm": 0.7548418641090393, "learning_rate": 5e-05, "llm_loss": 0.5615579783916473, "loss": 2.588, "loss_aux_layer_0": 0.0114898681640625, "loss_aux_layer_1": 0.03271484375, "loss_aux_layer_10": 0.06195068359375, "loss_aux_layer_11": 0.066162109375, "loss_aux_layer_12": 0.0709228515625, "loss_aux_layer_13": 0.0765380859375, "loss_aux_layer_14": 0.0855712890625, "loss_aux_layer_15": 0.093994140625, "loss_aux_layer_16": 0.1036376953125, "loss_aux_layer_17": 0.1114501953125, "loss_aux_layer_18": 0.1199951171875, "loss_aux_layer_19": 0.1229248046875, "loss_aux_layer_2": 0.04595947265625, "loss_aux_layer_20": 0.13037109375, "loss_aux_layer_21": 0.137939453125, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.05609130859375, "loss_aux_layer_4": 0.05865478515625, "loss_aux_layer_5": 0.060546875, "loss_aux_layer_6": 0.0638427734375, "loss_aux_layer_7": 0.06231689453125, "loss_aux_layer_8": 0.06182861328125, "loss_aux_layer_9": 0.06060791015625, "step": 4005, "total_loss": 0.6469903290271759 }, { "epoch": 0.7931102751930311, "grad_norm": 0.9313575029373169, "learning_rate": 5e-05, "llm_loss": 0.5653955489397049, "loss": 2.593, "loss_aux_layer_0": 0.0117034912109375, "loss_aux_layer_1": 0.032867431640625, "loss_aux_layer_10": 0.06085205078125, "loss_aux_layer_11": 0.0648193359375, "loss_aux_layer_12": 0.06915283203125, "loss_aux_layer_13": 0.0743408203125, "loss_aux_layer_14": 0.0823974609375, "loss_aux_layer_15": 0.0904541015625, "loss_aux_layer_16": 0.0992431640625, "loss_aux_layer_17": 0.1064453125, "loss_aux_layer_18": 0.1146240234375, "loss_aux_layer_19": 0.1170654296875, "loss_aux_layer_2": 0.0460205078125, "loss_aux_layer_20": 0.1248779296875, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.0562744140625, "loss_aux_layer_4": 0.05889892578125, "loss_aux_layer_5": 0.06036376953125, "loss_aux_layer_6": 0.06329345703125, "loss_aux_layer_7": 0.0615234375, "loss_aux_layer_8": 0.060791015625, "loss_aux_layer_9": 0.05950927734375, "step": 4006, "total_loss": 0.6482558846473694 }, { "epoch": 0.7933082557909324, "grad_norm": 0.7740150690078735, "learning_rate": 5e-05, "llm_loss": 0.6229032278060913, "loss": 2.826, "loss_aux_layer_0": 0.011627197265625, "loss_aux_layer_1": 0.0333251953125, "loss_aux_layer_10": 0.06182861328125, "loss_aux_layer_11": 0.0660400390625, "loss_aux_layer_12": 0.0704345703125, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.083740234375, "loss_aux_layer_15": 0.091552734375, "loss_aux_layer_16": 0.1007080078125, "loss_aux_layer_17": 0.1082763671875, "loss_aux_layer_18": 0.115966796875, "loss_aux_layer_19": 0.117919921875, "loss_aux_layer_2": 0.04571533203125, "loss_aux_layer_20": 0.1253662109375, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.05621337890625, "loss_aux_layer_4": 0.05902099609375, "loss_aux_layer_5": 0.060791015625, "loss_aux_layer_6": 0.06378173828125, "loss_aux_layer_7": 0.06231689453125, "loss_aux_layer_8": 0.06170654296875, "loss_aux_layer_9": 0.06048583984375, "step": 4007, "total_loss": 0.7064995169639587 }, { "epoch": 0.7935062363888339, "grad_norm": 0.7346057891845703, "learning_rate": 5e-05, "llm_loss": 0.5961175486445427, "loss": 2.7026, "loss_aux_layer_0": 0.01190185546875, "loss_aux_layer_1": 0.030548095703125, "loss_aux_layer_10": 0.05621337890625, "loss_aux_layer_11": 0.06005859375, "loss_aux_layer_12": 0.06494140625, "loss_aux_layer_13": 0.0703125, "loss_aux_layer_14": 0.07861328125, "loss_aux_layer_15": 0.0865478515625, "loss_aux_layer_16": 0.095703125, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.111572265625, "loss_aux_layer_19": 0.114501953125, "loss_aux_layer_2": 0.04345703125, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.05279541015625, "loss_aux_layer_4": 0.054931640625, "loss_aux_layer_5": 0.05615234375, "loss_aux_layer_6": 0.058837890625, "loss_aux_layer_7": 0.05706787109375, "loss_aux_layer_8": 0.05621337890625, "loss_aux_layer_9": 0.05499267578125, "step": 4008, "total_loss": 0.6756564229726791 }, { "epoch": 0.7937042169867353, "grad_norm": 0.8008920550346375, "learning_rate": 5e-05, "llm_loss": 0.5975791215896606, "loss": 2.715, "loss_aux_layer_0": 0.01171875, "loss_aux_layer_1": 0.031402587890625, "loss_aux_layer_10": 0.05828857421875, "loss_aux_layer_11": 0.06256103515625, "loss_aux_layer_12": 0.06689453125, "loss_aux_layer_13": 0.072265625, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.0888671875, "loss_aux_layer_16": 0.0985107421875, "loss_aux_layer_17": 0.10595703125, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.1171875, "loss_aux_layer_2": 0.04327392578125, "loss_aux_layer_20": 0.12451171875, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.05303955078125, "loss_aux_layer_4": 0.055419921875, "loss_aux_layer_5": 0.0570068359375, "loss_aux_layer_6": 0.06011962890625, "loss_aux_layer_7": 0.0584716796875, "loss_aux_layer_8": 0.0579833984375, "loss_aux_layer_9": 0.05682373046875, "step": 4009, "total_loss": 0.6787498295307159 }, { "epoch": 0.7939021975846368, "grad_norm": 0.8615332245826721, "learning_rate": 5e-05, "llm_loss": 0.5095904096961021, "loss": 2.3763, "loss_aux_layer_0": 0.01171875, "loss_aux_layer_1": 0.03314208984375, "loss_aux_layer_10": 0.06146240234375, "loss_aux_layer_11": 0.065673828125, "loss_aux_layer_12": 0.0703125, "loss_aux_layer_13": 0.0760498046875, "loss_aux_layer_14": 0.0843505859375, "loss_aux_layer_15": 0.0928955078125, "loss_aux_layer_16": 0.101806640625, "loss_aux_layer_17": 0.10888671875, "loss_aux_layer_18": 0.1168212890625, "loss_aux_layer_19": 0.1195068359375, "loss_aux_layer_2": 0.0467529296875, "loss_aux_layer_20": 0.1265869140625, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.05706787109375, "loss_aux_layer_4": 0.05950927734375, "loss_aux_layer_5": 0.06103515625, "loss_aux_layer_6": 0.0640869140625, "loss_aux_layer_7": 0.062255859375, "loss_aux_layer_8": 0.0615234375, "loss_aux_layer_9": 0.06011962890625, "step": 4010, "total_loss": 0.5940771549940109 }, { "epoch": 0.7941001781825381, "grad_norm": 0.9305962920188904, "learning_rate": 5e-05, "llm_loss": 0.5361154824495316, "loss": 2.4603, "loss_aux_layer_0": 0.0117950439453125, "loss_aux_layer_1": 0.029937744140625, "loss_aux_layer_10": 0.05633544921875, "loss_aux_layer_11": 0.06024169921875, "loss_aux_layer_12": 0.064453125, "loss_aux_layer_13": 0.06982421875, "loss_aux_layer_14": 0.077880859375, "loss_aux_layer_15": 0.085693359375, "loss_aux_layer_16": 0.0948486328125, "loss_aux_layer_17": 0.1025390625, "loss_aux_layer_18": 0.111083984375, "loss_aux_layer_19": 0.114501953125, "loss_aux_layer_2": 0.042236328125, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.0517578125, "loss_aux_layer_4": 0.0538330078125, "loss_aux_layer_5": 0.05511474609375, "loss_aux_layer_6": 0.0577392578125, "loss_aux_layer_7": 0.056396484375, "loss_aux_layer_8": 0.05584716796875, "loss_aux_layer_9": 0.0548095703125, "step": 4011, "total_loss": 0.615084633231163 }, { "epoch": 0.7942981587804395, "grad_norm": 0.8545617461204529, "learning_rate": 5e-05, "llm_loss": 0.5554818361997604, "loss": 2.5664, "loss_aux_layer_0": 0.011749267578125, "loss_aux_layer_1": 0.033447265625, "loss_aux_layer_10": 0.0638427734375, "loss_aux_layer_11": 0.068359375, "loss_aux_layer_12": 0.07275390625, "loss_aux_layer_13": 0.078369140625, "loss_aux_layer_14": 0.0863037109375, "loss_aux_layer_15": 0.094482421875, "loss_aux_layer_16": 0.103759765625, "loss_aux_layer_17": 0.1112060546875, "loss_aux_layer_18": 0.119140625, "loss_aux_layer_19": 0.121826171875, "loss_aux_layer_2": 0.047119140625, "loss_aux_layer_20": 0.129150390625, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.19482421875, "loss_aux_layer_3": 0.057861328125, "loss_aux_layer_4": 0.06072998046875, "loss_aux_layer_5": 0.0623779296875, "loss_aux_layer_6": 0.065673828125, "loss_aux_layer_7": 0.06414794921875, "loss_aux_layer_8": 0.0634765625, "loss_aux_layer_9": 0.0623779296875, "step": 4012, "total_loss": 0.6416092813014984 }, { "epoch": 0.794496139378341, "grad_norm": 0.76533043384552, "learning_rate": 5e-05, "llm_loss": 0.5768637806177139, "loss": 2.6473, "loss_aux_layer_0": 0.012176513671875, "loss_aux_layer_1": 0.033172607421875, "loss_aux_layer_10": 0.061767578125, "loss_aux_layer_11": 0.0660400390625, "loss_aux_layer_12": 0.070556640625, "loss_aux_layer_13": 0.076171875, "loss_aux_layer_14": 0.0850830078125, "loss_aux_layer_15": 0.0936279296875, "loss_aux_layer_16": 0.1029052734375, "loss_aux_layer_17": 0.1104736328125, "loss_aux_layer_18": 0.11865234375, "loss_aux_layer_19": 0.12158203125, "loss_aux_layer_2": 0.04656982421875, "loss_aux_layer_20": 0.12841796875, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.193603515625, "loss_aux_layer_3": 0.056640625, "loss_aux_layer_4": 0.059326171875, "loss_aux_layer_5": 0.060791015625, "loss_aux_layer_6": 0.06402587890625, "loss_aux_layer_7": 0.0621337890625, "loss_aux_layer_8": 0.0615234375, "loss_aux_layer_9": 0.060302734375, "step": 4013, "total_loss": 0.6618249416351318 }, { "epoch": 0.7946941199762423, "grad_norm": 0.9882105588912964, "learning_rate": 5e-05, "llm_loss": 0.5794740915298462, "loss": 2.6402, "loss_aux_layer_0": 0.012298583984375, "loss_aux_layer_1": 0.0313720703125, "loss_aux_layer_10": 0.05792236328125, "loss_aux_layer_11": 0.0616455078125, "loss_aux_layer_12": 0.0657958984375, "loss_aux_layer_13": 0.071044921875, "loss_aux_layer_14": 0.0794677734375, "loss_aux_layer_15": 0.0877685546875, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.10498046875, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.1162109375, "loss_aux_layer_2": 0.04364013671875, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.0533447265625, "loss_aux_layer_4": 0.05572509765625, "loss_aux_layer_5": 0.057373046875, "loss_aux_layer_6": 0.06024169921875, "loss_aux_layer_7": 0.058349609375, "loss_aux_layer_8": 0.057861328125, "loss_aux_layer_9": 0.05657958984375, "step": 4014, "total_loss": 0.6600493937730789 }, { "epoch": 0.7948921005741437, "grad_norm": 1.1138885021209717, "learning_rate": 5e-05, "llm_loss": 0.584499791264534, "loss": 2.6727, "loss_aux_layer_0": 0.01214599609375, "loss_aux_layer_1": 0.03125, "loss_aux_layer_10": 0.05987548828125, "loss_aux_layer_11": 0.06378173828125, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.073974609375, "loss_aux_layer_14": 0.0826416015625, "loss_aux_layer_15": 0.0916748046875, "loss_aux_layer_16": 0.10205078125, "loss_aux_layer_17": 0.1094970703125, "loss_aux_layer_18": 0.1182861328125, "loss_aux_layer_19": 0.121826171875, "loss_aux_layer_2": 0.04412841796875, "loss_aux_layer_20": 0.1298828125, "loss_aux_layer_21": 0.1376953125, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.196044921875, "loss_aux_layer_3": 0.05389404296875, "loss_aux_layer_4": 0.05621337890625, "loss_aux_layer_5": 0.05792236328125, "loss_aux_layer_6": 0.06097412109375, "loss_aux_layer_7": 0.05938720703125, "loss_aux_layer_8": 0.0592041015625, "loss_aux_layer_9": 0.05828857421875, "step": 4015, "total_loss": 0.6681682914495468 }, { "epoch": 0.7950900811720452, "grad_norm": 0.9025281071662903, "learning_rate": 5e-05, "llm_loss": 0.6155120134353638, "loss": 2.7775, "loss_aux_layer_0": 0.0127410888671875, "loss_aux_layer_1": 0.030059814453125, "loss_aux_layer_10": 0.0556640625, "loss_aux_layer_11": 0.059326171875, "loss_aux_layer_12": 0.0634765625, "loss_aux_layer_13": 0.0687255859375, "loss_aux_layer_14": 0.076904296875, "loss_aux_layer_15": 0.08544921875, "loss_aux_layer_16": 0.0948486328125, "loss_aux_layer_17": 0.1025390625, "loss_aux_layer_18": 0.11083984375, "loss_aux_layer_19": 0.114990234375, "loss_aux_layer_2": 0.04205322265625, "loss_aux_layer_20": 0.1229248046875, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.0513916015625, "loss_aux_layer_4": 0.05364990234375, "loss_aux_layer_5": 0.0552978515625, "loss_aux_layer_6": 0.05816650390625, "loss_aux_layer_7": 0.05657958984375, "loss_aux_layer_8": 0.0557861328125, "loss_aux_layer_9": 0.054443359375, "step": 4016, "total_loss": 0.6943706125020981 }, { "epoch": 0.7952880617699466, "grad_norm": 1.0024741888046265, "learning_rate": 5e-05, "llm_loss": 0.5207682400941849, "loss": 2.4008, "loss_aux_layer_0": 0.013397216796875, "loss_aux_layer_1": 0.030487060546875, "loss_aux_layer_10": 0.05657958984375, "loss_aux_layer_11": 0.060302734375, "loss_aux_layer_12": 0.06475830078125, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.07861328125, "loss_aux_layer_15": 0.086669921875, "loss_aux_layer_16": 0.0955810546875, "loss_aux_layer_17": 0.103515625, "loss_aux_layer_18": 0.1114501953125, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.04248046875, "loss_aux_layer_20": 0.121826171875, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.05230712890625, "loss_aux_layer_4": 0.05450439453125, "loss_aux_layer_5": 0.0560302734375, "loss_aux_layer_6": 0.0587158203125, "loss_aux_layer_7": 0.05694580078125, "loss_aux_layer_8": 0.05609130859375, "loss_aux_layer_9": 0.05511474609375, "step": 4017, "total_loss": 0.6001905798912048 }, { "epoch": 0.7954860423678479, "grad_norm": 0.9522546529769897, "learning_rate": 5e-05, "llm_loss": 0.5071522668004036, "loss": 2.3638, "loss_aux_layer_0": 0.0145721435546875, "loss_aux_layer_1": 0.03216552734375, "loss_aux_layer_10": 0.06085205078125, "loss_aux_layer_11": 0.06488037109375, "loss_aux_layer_12": 0.0697021484375, "loss_aux_layer_13": 0.0750732421875, "loss_aux_layer_14": 0.0833740234375, "loss_aux_layer_15": 0.0914306640625, "loss_aux_layer_16": 0.1004638671875, "loss_aux_layer_17": 0.1083984375, "loss_aux_layer_18": 0.1162109375, "loss_aux_layer_19": 0.1192626953125, "loss_aux_layer_2": 0.04473876953125, "loss_aux_layer_20": 0.1270751953125, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.193603515625, "loss_aux_layer_3": 0.05523681640625, "loss_aux_layer_4": 0.057861328125, "loss_aux_layer_5": 0.059814453125, "loss_aux_layer_6": 0.062744140625, "loss_aux_layer_7": 0.06097412109375, "loss_aux_layer_8": 0.0604248046875, "loss_aux_layer_9": 0.05938720703125, "step": 4018, "total_loss": 0.5909445434808731 }, { "epoch": 0.7956840229657494, "grad_norm": 0.8565313816070557, "learning_rate": 5e-05, "llm_loss": 0.5184424221515656, "loss": 2.3921, "loss_aux_layer_0": 0.0115966796875, "loss_aux_layer_1": 0.030731201171875, "loss_aux_layer_10": 0.056884765625, "loss_aux_layer_11": 0.06085205078125, "loss_aux_layer_12": 0.06536865234375, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.078857421875, "loss_aux_layer_15": 0.0869140625, "loss_aux_layer_16": 0.0963134765625, "loss_aux_layer_17": 0.1038818359375, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.11474609375, "loss_aux_layer_2": 0.043212890625, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.052490234375, "loss_aux_layer_4": 0.0545654296875, "loss_aux_layer_5": 0.05609130859375, "loss_aux_layer_6": 0.0589599609375, "loss_aux_layer_7": 0.05743408203125, "loss_aux_layer_8": 0.056884765625, "loss_aux_layer_9": 0.0556640625, "step": 4019, "total_loss": 0.5980263948440552 }, { "epoch": 0.7958820035636508, "grad_norm": 0.9487103819847107, "learning_rate": 5e-05, "llm_loss": 0.5241624265909195, "loss": 2.418, "loss_aux_layer_0": 0.0145263671875, "loss_aux_layer_1": 0.031219482421875, "loss_aux_layer_10": 0.05645751953125, "loss_aux_layer_11": 0.06048583984375, "loss_aux_layer_12": 0.065185546875, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.0792236328125, "loss_aux_layer_15": 0.087646484375, "loss_aux_layer_16": 0.0970458984375, "loss_aux_layer_17": 0.104736328125, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.1162109375, "loss_aux_layer_2": 0.04254150390625, "loss_aux_layer_20": 0.124267578125, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.0521240234375, "loss_aux_layer_4": 0.054443359375, "loss_aux_layer_5": 0.05596923828125, "loss_aux_layer_6": 0.05865478515625, "loss_aux_layer_7": 0.05694580078125, "loss_aux_layer_8": 0.056396484375, "loss_aux_layer_9": 0.05517578125, "step": 4020, "total_loss": 0.6044964641332626 }, { "epoch": 0.7960799841615521, "grad_norm": 0.887759268283844, "learning_rate": 5e-05, "llm_loss": 0.5579108372330666, "loss": 2.5532, "loss_aux_layer_0": 0.0123748779296875, "loss_aux_layer_1": 0.03125, "loss_aux_layer_10": 0.0572509765625, "loss_aux_layer_11": 0.06121826171875, "loss_aux_layer_12": 0.0655517578125, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.09619140625, "loss_aux_layer_17": 0.1036376953125, "loss_aux_layer_18": 0.1119384765625, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.04376220703125, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.05328369140625, "loss_aux_layer_4": 0.05548095703125, "loss_aux_layer_5": 0.056884765625, "loss_aux_layer_6": 0.05950927734375, "loss_aux_layer_7": 0.05780029296875, "loss_aux_layer_8": 0.057373046875, "loss_aux_layer_9": 0.05621337890625, "step": 4021, "total_loss": 0.6382946223020554 }, { "epoch": 0.7962779647594536, "grad_norm": 0.8713105320930481, "learning_rate": 5e-05, "llm_loss": 0.535972572863102, "loss": 2.461, "loss_aux_layer_0": 0.01336669921875, "loss_aux_layer_1": 0.030914306640625, "loss_aux_layer_10": 0.0567626953125, "loss_aux_layer_11": 0.060546875, "loss_aux_layer_12": 0.06494140625, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.085693359375, "loss_aux_layer_16": 0.0943603515625, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.110107421875, "loss_aux_layer_19": 0.11328125, "loss_aux_layer_2": 0.04339599609375, "loss_aux_layer_20": 0.1207275390625, "loss_aux_layer_21": 0.1295166015625, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.05267333984375, "loss_aux_layer_4": 0.054931640625, "loss_aux_layer_5": 0.05645751953125, "loss_aux_layer_6": 0.05908203125, "loss_aux_layer_7": 0.05755615234375, "loss_aux_layer_8": 0.05694580078125, "loss_aux_layer_9": 0.05572509765625, "step": 4022, "total_loss": 0.6152500808238983 }, { "epoch": 0.796475945357355, "grad_norm": 1.0565621852874756, "learning_rate": 5e-05, "llm_loss": 0.5857738330960274, "loss": 2.6631, "loss_aux_layer_0": 0.012451171875, "loss_aux_layer_1": 0.03131103515625, "loss_aux_layer_10": 0.05780029296875, "loss_aux_layer_11": 0.06158447265625, "loss_aux_layer_12": 0.066162109375, "loss_aux_layer_13": 0.07135009765625, "loss_aux_layer_14": 0.07958984375, "loss_aux_layer_15": 0.0877685546875, "loss_aux_layer_16": 0.0966796875, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.11181640625, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.04345703125, "loss_aux_layer_20": 0.121337890625, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.05322265625, "loss_aux_layer_4": 0.05548095703125, "loss_aux_layer_5": 0.05706787109375, "loss_aux_layer_6": 0.05963134765625, "loss_aux_layer_7": 0.058349609375, "loss_aux_layer_8": 0.0577392578125, "loss_aux_layer_9": 0.0565185546875, "step": 4023, "total_loss": 0.6657786667346954 }, { "epoch": 0.7966739259552564, "grad_norm": 0.8510356545448303, "learning_rate": 5e-05, "llm_loss": 0.5654136836528778, "loss": 2.5819, "loss_aux_layer_0": 0.01239013671875, "loss_aux_layer_1": 0.030364990234375, "loss_aux_layer_10": 0.05609130859375, "loss_aux_layer_11": 0.05999755859375, "loss_aux_layer_12": 0.06439208984375, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.0780029296875, "loss_aux_layer_15": 0.0865478515625, "loss_aux_layer_16": 0.09619140625, "loss_aux_layer_17": 0.104736328125, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.116455078125, "loss_aux_layer_2": 0.0419921875, "loss_aux_layer_20": 0.1243896484375, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.193115234375, "loss_aux_layer_3": 0.05133056640625, "loss_aux_layer_4": 0.0537109375, "loss_aux_layer_5": 0.05535888671875, "loss_aux_layer_6": 0.05828857421875, "loss_aux_layer_7": 0.05645751953125, "loss_aux_layer_8": 0.05584716796875, "loss_aux_layer_9": 0.0546875, "step": 4024, "total_loss": 0.6454724222421646 }, { "epoch": 0.7968719065531578, "grad_norm": 0.8719430565834045, "learning_rate": 5e-05, "llm_loss": 0.5794651508331299, "loss": 2.6381, "loss_aux_layer_0": 0.0125274658203125, "loss_aux_layer_1": 0.030731201171875, "loss_aux_layer_10": 0.05633544921875, "loss_aux_layer_11": 0.060302734375, "loss_aux_layer_12": 0.064697265625, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.078369140625, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.0963134765625, "loss_aux_layer_17": 0.104248046875, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.116455078125, "loss_aux_layer_2": 0.04248046875, "loss_aux_layer_20": 0.1243896484375, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.05224609375, "loss_aux_layer_4": 0.05474853515625, "loss_aux_layer_5": 0.05621337890625, "loss_aux_layer_6": 0.058837890625, "loss_aux_layer_7": 0.0570068359375, "loss_aux_layer_8": 0.05657958984375, "loss_aux_layer_9": 0.0552978515625, "step": 4025, "total_loss": 0.6595178246498108 }, { "epoch": 0.7970698871510592, "grad_norm": 0.9103284478187561, "learning_rate": 5e-05, "llm_loss": 0.6025214344263077, "loss": 2.7301, "loss_aux_layer_0": 0.0121612548828125, "loss_aux_layer_1": 0.0308837890625, "loss_aux_layer_10": 0.05712890625, "loss_aux_layer_11": 0.060791015625, "loss_aux_layer_12": 0.06512451171875, "loss_aux_layer_13": 0.0703125, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.0859375, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.1025390625, "loss_aux_layer_18": 0.1114501953125, "loss_aux_layer_19": 0.115234375, "loss_aux_layer_2": 0.0430908203125, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.052734375, "loss_aux_layer_4": 0.05511474609375, "loss_aux_layer_5": 0.05670166015625, "loss_aux_layer_6": 0.0595703125, "loss_aux_layer_7": 0.05780029296875, "loss_aux_layer_8": 0.05694580078125, "loss_aux_layer_9": 0.0557861328125, "step": 4026, "total_loss": 0.6825198382139206 }, { "epoch": 0.7972678677489606, "grad_norm": 0.9322820901870728, "learning_rate": 5e-05, "llm_loss": 0.5634905397891998, "loss": 2.5744, "loss_aux_layer_0": 0.012298583984375, "loss_aux_layer_1": 0.029632568359375, "loss_aux_layer_10": 0.05615234375, "loss_aux_layer_11": 0.05987548828125, "loss_aux_layer_12": 0.06396484375, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.077880859375, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.0966796875, "loss_aux_layer_17": 0.1048583984375, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.1181640625, "loss_aux_layer_2": 0.04168701171875, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.051025390625, "loss_aux_layer_4": 0.0531005859375, "loss_aux_layer_5": 0.054931640625, "loss_aux_layer_6": 0.05767822265625, "loss_aux_layer_7": 0.05596923828125, "loss_aux_layer_8": 0.0556640625, "loss_aux_layer_9": 0.054931640625, "step": 4027, "total_loss": 0.6435887813568115 }, { "epoch": 0.797465848346862, "grad_norm": 0.8300187587738037, "learning_rate": 5e-05, "llm_loss": 0.6306810826063156, "loss": 2.8265, "loss_aux_layer_0": 0.0117645263671875, "loss_aux_layer_1": 0.0284423828125, "loss_aux_layer_10": 0.05291748046875, "loss_aux_layer_11": 0.056640625, "loss_aux_layer_12": 0.06036376953125, "loss_aux_layer_13": 0.0653076171875, "loss_aux_layer_14": 0.0736083984375, "loss_aux_layer_15": 0.0811767578125, "loss_aux_layer_16": 0.090576171875, "loss_aux_layer_17": 0.0985107421875, "loss_aux_layer_18": 0.1068115234375, "loss_aux_layer_19": 0.111572265625, "loss_aux_layer_2": 0.03948974609375, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.1280517578125, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.04864501953125, "loss_aux_layer_4": 0.05120849609375, "loss_aux_layer_5": 0.052734375, "loss_aux_layer_6": 0.05517578125, "loss_aux_layer_7": 0.053466796875, "loss_aux_layer_8": 0.05303955078125, "loss_aux_layer_9": 0.05181884765625, "step": 4028, "total_loss": 0.7066305726766586 }, { "epoch": 0.7976638289447634, "grad_norm": 0.7845385670661926, "learning_rate": 5e-05, "llm_loss": 0.5934345871210098, "loss": 2.7023, "loss_aux_layer_0": 0.0118560791015625, "loss_aux_layer_1": 0.032135009765625, "loss_aux_layer_10": 0.060302734375, "loss_aux_layer_11": 0.0643310546875, "loss_aux_layer_12": 0.06842041015625, "loss_aux_layer_13": 0.073486328125, "loss_aux_layer_14": 0.08154296875, "loss_aux_layer_15": 0.089599609375, "loss_aux_layer_16": 0.0986328125, "loss_aux_layer_17": 0.1060791015625, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.116943359375, "loss_aux_layer_2": 0.04473876953125, "loss_aux_layer_20": 0.1241455078125, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.0548095703125, "loss_aux_layer_4": 0.05755615234375, "loss_aux_layer_5": 0.0595703125, "loss_aux_layer_6": 0.062744140625, "loss_aux_layer_7": 0.06085205078125, "loss_aux_layer_8": 0.06024169921875, "loss_aux_layer_9": 0.0589599609375, "step": 4029, "total_loss": 0.6755710691213608 }, { "epoch": 0.7978618095426648, "grad_norm": 0.9039686918258667, "learning_rate": 5e-05, "llm_loss": 0.513788029551506, "loss": 2.3793, "loss_aux_layer_0": 0.01141357421875, "loss_aux_layer_1": 0.03118896484375, "loss_aux_layer_10": 0.05859375, "loss_aux_layer_11": 0.0623779296875, "loss_aux_layer_12": 0.06683349609375, "loss_aux_layer_13": 0.072021484375, "loss_aux_layer_14": 0.08056640625, "loss_aux_layer_15": 0.0889892578125, "loss_aux_layer_16": 0.09814453125, "loss_aux_layer_17": 0.10595703125, "loss_aux_layer_18": 0.1138916015625, "loss_aux_layer_19": 0.1160888671875, "loss_aux_layer_2": 0.04388427734375, "loss_aux_layer_20": 0.12353515625, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.05377197265625, "loss_aux_layer_4": 0.05621337890625, "loss_aux_layer_5": 0.05804443359375, "loss_aux_layer_6": 0.0611572265625, "loss_aux_layer_7": 0.059326171875, "loss_aux_layer_8": 0.05859375, "loss_aux_layer_9": 0.05743408203125, "step": 4030, "total_loss": 0.594826266169548 }, { "epoch": 0.7980597901405663, "grad_norm": 0.909341037273407, "learning_rate": 5e-05, "llm_loss": 0.5298990458250046, "loss": 2.4372, "loss_aux_layer_0": 0.0123443603515625, "loss_aux_layer_1": 0.02984619140625, "loss_aux_layer_10": 0.05609130859375, "loss_aux_layer_11": 0.06005859375, "loss_aux_layer_12": 0.06414794921875, "loss_aux_layer_13": 0.06951904296875, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.1031494140625, "loss_aux_layer_18": 0.111572265625, "loss_aux_layer_19": 0.1151123046875, "loss_aux_layer_2": 0.0423583984375, "loss_aux_layer_20": 0.123046875, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.05157470703125, "loss_aux_layer_4": 0.05401611328125, "loss_aux_layer_5": 0.05548095703125, "loss_aux_layer_6": 0.0584716796875, "loss_aux_layer_7": 0.0565185546875, "loss_aux_layer_8": 0.05596923828125, "loss_aux_layer_9": 0.0548095703125, "step": 4031, "total_loss": 0.6093120574951172 }, { "epoch": 0.7982577707384676, "grad_norm": 0.8588889837265015, "learning_rate": 5e-05, "llm_loss": 0.5852955281734467, "loss": 2.6505, "loss_aux_layer_0": 0.011749267578125, "loss_aux_layer_1": 0.030120849609375, "loss_aux_layer_10": 0.05474853515625, "loss_aux_layer_11": 0.05841064453125, "loss_aux_layer_12": 0.06256103515625, "loss_aux_layer_13": 0.0675048828125, "loss_aux_layer_14": 0.075439453125, "loss_aux_layer_15": 0.08349609375, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.0999755859375, "loss_aux_layer_18": 0.108154296875, "loss_aux_layer_19": 0.111328125, "loss_aux_layer_2": 0.04132080078125, "loss_aux_layer_20": 0.1195068359375, "loss_aux_layer_21": 0.1280517578125, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.05047607421875, "loss_aux_layer_4": 0.0528564453125, "loss_aux_layer_5": 0.054443359375, "loss_aux_layer_6": 0.05694580078125, "loss_aux_layer_7": 0.05511474609375, "loss_aux_layer_8": 0.05462646484375, "loss_aux_layer_9": 0.0535888671875, "step": 4032, "total_loss": 0.6626373380422592 }, { "epoch": 0.798455751336369, "grad_norm": 0.9663193225860596, "learning_rate": 5e-05, "llm_loss": 0.6395970731973648, "loss": 2.8911, "loss_aux_layer_0": 0.011688232421875, "loss_aux_layer_1": 0.03253173828125, "loss_aux_layer_10": 0.0599365234375, "loss_aux_layer_11": 0.063720703125, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.073486328125, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.0904541015625, "loss_aux_layer_16": 0.099853515625, "loss_aux_layer_17": 0.107421875, "loss_aux_layer_18": 0.1160888671875, "loss_aux_layer_19": 0.11962890625, "loss_aux_layer_2": 0.0452880859375, "loss_aux_layer_20": 0.1273193359375, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.05523681640625, "loss_aux_layer_4": 0.05743408203125, "loss_aux_layer_5": 0.05877685546875, "loss_aux_layer_6": 0.06182861328125, "loss_aux_layer_7": 0.06005859375, "loss_aux_layer_8": 0.0596923828125, "loss_aux_layer_9": 0.05865478515625, "step": 4033, "total_loss": 0.722786471247673 }, { "epoch": 0.7986537319342705, "grad_norm": 0.9767754077911377, "learning_rate": 5e-05, "llm_loss": 0.5380342528223991, "loss": 2.4778, "loss_aux_layer_0": 0.0126800537109375, "loss_aux_layer_1": 0.03057861328125, "loss_aux_layer_10": 0.0572509765625, "loss_aux_layer_11": 0.06121826171875, "loss_aux_layer_12": 0.06610107421875, "loss_aux_layer_13": 0.0718994140625, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.0894775390625, "loss_aux_layer_16": 0.098876953125, "loss_aux_layer_17": 0.107177734375, "loss_aux_layer_18": 0.115478515625, "loss_aux_layer_19": 0.1181640625, "loss_aux_layer_2": 0.042724609375, "loss_aux_layer_20": 0.1258544921875, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.19482421875, "loss_aux_layer_3": 0.0521240234375, "loss_aux_layer_4": 0.05419921875, "loss_aux_layer_5": 0.05584716796875, "loss_aux_layer_6": 0.05841064453125, "loss_aux_layer_7": 0.05682373046875, "loss_aux_layer_8": 0.0565185546875, "loss_aux_layer_9": 0.05560302734375, "step": 4034, "total_loss": 0.6194571256637573 }, { "epoch": 0.7988517125321718, "grad_norm": 0.8843578696250916, "learning_rate": 5e-05, "llm_loss": 0.5569679215550423, "loss": 2.5447, "loss_aux_layer_0": 0.0118865966796875, "loss_aux_layer_1": 0.030303955078125, "loss_aux_layer_10": 0.0562744140625, "loss_aux_layer_11": 0.06005859375, "loss_aux_layer_12": 0.06439208984375, "loss_aux_layer_13": 0.0697021484375, "loss_aux_layer_14": 0.07763671875, "loss_aux_layer_15": 0.0860595703125, "loss_aux_layer_16": 0.0955810546875, "loss_aux_layer_17": 0.1036376953125, "loss_aux_layer_18": 0.1123046875, "loss_aux_layer_19": 0.115966796875, "loss_aux_layer_2": 0.04217529296875, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.05145263671875, "loss_aux_layer_4": 0.0537109375, "loss_aux_layer_5": 0.05511474609375, "loss_aux_layer_6": 0.05804443359375, "loss_aux_layer_7": 0.05633544921875, "loss_aux_layer_8": 0.055908203125, "loss_aux_layer_9": 0.05474853515625, "step": 4035, "total_loss": 0.6361757665872574 }, { "epoch": 0.7990496931300732, "grad_norm": 1.5292088985443115, "learning_rate": 5e-05, "llm_loss": 0.5871841907501221, "loss": 2.6737, "loss_aux_layer_0": 0.0129241943359375, "loss_aux_layer_1": 0.031982421875, "loss_aux_layer_10": 0.05792236328125, "loss_aux_layer_11": 0.06182861328125, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.089111328125, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.1058349609375, "loss_aux_layer_18": 0.113037109375, "loss_aux_layer_19": 0.1160888671875, "loss_aux_layer_2": 0.04449462890625, "loss_aux_layer_20": 0.1243896484375, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.0535888671875, "loss_aux_layer_4": 0.05596923828125, "loss_aux_layer_5": 0.05731201171875, "loss_aux_layer_6": 0.06036376953125, "loss_aux_layer_7": 0.05841064453125, "loss_aux_layer_8": 0.0579833984375, "loss_aux_layer_9": 0.056640625, "step": 4036, "total_loss": 0.6684153228998184 }, { "epoch": 0.7992476737279747, "grad_norm": 1.2318143844604492, "learning_rate": 5e-05, "llm_loss": 0.5344346314668655, "loss": 2.4529, "loss_aux_layer_0": 0.0116729736328125, "loss_aux_layer_1": 0.0299072265625, "loss_aux_layer_10": 0.05584716796875, "loss_aux_layer_11": 0.0594482421875, "loss_aux_layer_12": 0.06396484375, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.077880859375, "loss_aux_layer_15": 0.0859375, "loss_aux_layer_16": 0.0947265625, "loss_aux_layer_17": 0.1025390625, "loss_aux_layer_18": 0.1102294921875, "loss_aux_layer_19": 0.11376953125, "loss_aux_layer_2": 0.04229736328125, "loss_aux_layer_20": 0.12158203125, "loss_aux_layer_21": 0.1297607421875, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.05157470703125, "loss_aux_layer_4": 0.053466796875, "loss_aux_layer_5": 0.05487060546875, "loss_aux_layer_6": 0.057861328125, "loss_aux_layer_7": 0.05596923828125, "loss_aux_layer_8": 0.055419921875, "loss_aux_layer_9": 0.054443359375, "step": 4037, "total_loss": 0.613219678401947 }, { "epoch": 0.7994456543258761, "grad_norm": 1.0183881521224976, "learning_rate": 5e-05, "llm_loss": 0.5939837247133255, "loss": 2.687, "loss_aux_layer_0": 0.012969970703125, "loss_aux_layer_1": 0.0296630859375, "loss_aux_layer_10": 0.0543212890625, "loss_aux_layer_11": 0.05792236328125, "loss_aux_layer_12": 0.0621337890625, "loss_aux_layer_13": 0.06719970703125, "loss_aux_layer_14": 0.075439453125, "loss_aux_layer_15": 0.083740234375, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.1011962890625, "loss_aux_layer_18": 0.109375, "loss_aux_layer_19": 0.113037109375, "loss_aux_layer_2": 0.04144287109375, "loss_aux_layer_20": 0.1217041015625, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.05035400390625, "loss_aux_layer_4": 0.05224609375, "loss_aux_layer_5": 0.05340576171875, "loss_aux_layer_6": 0.05609130859375, "loss_aux_layer_7": 0.05438232421875, "loss_aux_layer_8": 0.05413818359375, "loss_aux_layer_9": 0.05303955078125, "step": 4038, "total_loss": 0.6717443466186523 }, { "epoch": 0.7996436349237774, "grad_norm": 0.9368143677711487, "learning_rate": 5e-05, "llm_loss": 0.5986697226762772, "loss": 2.7162, "loss_aux_layer_0": 0.0128936767578125, "loss_aux_layer_1": 0.0323486328125, "loss_aux_layer_10": 0.05828857421875, "loss_aux_layer_11": 0.06256103515625, "loss_aux_layer_12": 0.0665283203125, "loss_aux_layer_13": 0.071533203125, "loss_aux_layer_14": 0.0794677734375, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.09619140625, "loss_aux_layer_17": 0.1036376953125, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.0450439453125, "loss_aux_layer_20": 0.121337890625, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.0543212890625, "loss_aux_layer_4": 0.05645751953125, "loss_aux_layer_5": 0.0576171875, "loss_aux_layer_6": 0.060302734375, "loss_aux_layer_7": 0.05865478515625, "loss_aux_layer_8": 0.0579833984375, "loss_aux_layer_9": 0.056884765625, "step": 4039, "total_loss": 0.67904993891716 }, { "epoch": 0.7998416155216789, "grad_norm": 1.0303869247436523, "learning_rate": 5e-05, "llm_loss": 0.5388674959540367, "loss": 2.4869, "loss_aux_layer_0": 0.01275634765625, "loss_aux_layer_1": 0.031829833984375, "loss_aux_layer_10": 0.060302734375, "loss_aux_layer_11": 0.06439208984375, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.073974609375, "loss_aux_layer_14": 0.08203125, "loss_aux_layer_15": 0.0902099609375, "loss_aux_layer_16": 0.0992431640625, "loss_aux_layer_17": 0.1068115234375, "loss_aux_layer_18": 0.115234375, "loss_aux_layer_19": 0.11767578125, "loss_aux_layer_2": 0.045166015625, "loss_aux_layer_20": 0.1258544921875, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05511474609375, "loss_aux_layer_4": 0.05780029296875, "loss_aux_layer_5": 0.0596923828125, "loss_aux_layer_6": 0.06243896484375, "loss_aux_layer_7": 0.0606689453125, "loss_aux_layer_8": 0.06005859375, "loss_aux_layer_9": 0.058837890625, "step": 4040, "total_loss": 0.6217229515314102 }, { "epoch": 0.8000395961195803, "grad_norm": 0.9855749607086182, "learning_rate": 5e-05, "llm_loss": 0.5611648708581924, "loss": 2.5737, "loss_aux_layer_0": 0.0130462646484375, "loss_aux_layer_1": 0.0316162109375, "loss_aux_layer_10": 0.05804443359375, "loss_aux_layer_11": 0.06182861328125, "loss_aux_layer_12": 0.0662841796875, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.08935546875, "loss_aux_layer_16": 0.09912109375, "loss_aux_layer_17": 0.1068115234375, "loss_aux_layer_18": 0.114990234375, "loss_aux_layer_19": 0.119384765625, "loss_aux_layer_2": 0.0440673828125, "loss_aux_layer_20": 0.127685546875, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.1943359375, "loss_aux_layer_3": 0.0538330078125, "loss_aux_layer_4": 0.05596923828125, "loss_aux_layer_5": 0.05767822265625, "loss_aux_layer_6": 0.060546875, "loss_aux_layer_7": 0.0587158203125, "loss_aux_layer_8": 0.0579833984375, "loss_aux_layer_9": 0.056884765625, "step": 4041, "total_loss": 0.6434201598167419 }, { "epoch": 0.8002375767174816, "grad_norm": 0.9858571290969849, "learning_rate": 5e-05, "llm_loss": 0.5516578853130341, "loss": 2.5313, "loss_aux_layer_0": 0.012939453125, "loss_aux_layer_1": 0.031951904296875, "loss_aux_layer_10": 0.0589599609375, "loss_aux_layer_11": 0.06280517578125, "loss_aux_layer_12": 0.0673828125, "loss_aux_layer_13": 0.07275390625, "loss_aux_layer_14": 0.08056640625, "loss_aux_layer_15": 0.0882568359375, "loss_aux_layer_16": 0.09716796875, "loss_aux_layer_17": 0.104736328125, "loss_aux_layer_18": 0.11279296875, "loss_aux_layer_19": 0.11572265625, "loss_aux_layer_2": 0.04388427734375, "loss_aux_layer_20": 0.1234130859375, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.05377197265625, "loss_aux_layer_4": 0.05657958984375, "loss_aux_layer_5": 0.05810546875, "loss_aux_layer_6": 0.06121826171875, "loss_aux_layer_7": 0.05938720703125, "loss_aux_layer_8": 0.05889892578125, "loss_aux_layer_9": 0.05755615234375, "step": 4042, "total_loss": 0.6328253448009491 }, { "epoch": 0.8004355573153831, "grad_norm": 0.8477264642715454, "learning_rate": 5e-05, "llm_loss": 0.5044945925474167, "loss": 2.3477, "loss_aux_layer_0": 0.01190185546875, "loss_aux_layer_1": 0.031585693359375, "loss_aux_layer_10": 0.05999755859375, "loss_aux_layer_11": 0.0640869140625, "loss_aux_layer_12": 0.06829833984375, "loss_aux_layer_13": 0.0736083984375, "loss_aux_layer_14": 0.0814208984375, "loss_aux_layer_15": 0.0894775390625, "loss_aux_layer_16": 0.098388671875, "loss_aux_layer_17": 0.1058349609375, "loss_aux_layer_18": 0.1146240234375, "loss_aux_layer_19": 0.1175537109375, "loss_aux_layer_2": 0.04461669921875, "loss_aux_layer_20": 0.1253662109375, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.05474853515625, "loss_aux_layer_4": 0.05731201171875, "loss_aux_layer_5": 0.05877685546875, "loss_aux_layer_6": 0.0616455078125, "loss_aux_layer_7": 0.0599365234375, "loss_aux_layer_8": 0.05963134765625, "loss_aux_layer_9": 0.0587158203125, "step": 4043, "total_loss": 0.586937353014946 }, { "epoch": 0.8006335379132845, "grad_norm": 0.8659713268280029, "learning_rate": 5e-05, "llm_loss": 0.6643183082342148, "loss": 2.9687, "loss_aux_layer_0": 0.01171875, "loss_aux_layer_1": 0.03118896484375, "loss_aux_layer_10": 0.0562744140625, "loss_aux_layer_11": 0.06011962890625, "loss_aux_layer_12": 0.06439208984375, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.076904296875, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.093017578125, "loss_aux_layer_17": 0.100341796875, "loss_aux_layer_18": 0.107666015625, "loss_aux_layer_19": 0.11083984375, "loss_aux_layer_2": 0.04302978515625, "loss_aux_layer_20": 0.1182861328125, "loss_aux_layer_21": 0.1258544921875, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.1796875, "loss_aux_layer_3": 0.05242919921875, "loss_aux_layer_4": 0.05462646484375, "loss_aux_layer_5": 0.055908203125, "loss_aux_layer_6": 0.0584716796875, "loss_aux_layer_7": 0.0567626953125, "loss_aux_layer_8": 0.0562744140625, "loss_aux_layer_9": 0.05523681640625, "step": 4044, "total_loss": 0.7421736717224121 }, { "epoch": 0.800831518511186, "grad_norm": 1.0588529109954834, "learning_rate": 5e-05, "llm_loss": 0.6547747105360031, "loss": 2.9474, "loss_aux_layer_0": 0.0124053955078125, "loss_aux_layer_1": 0.031280517578125, "loss_aux_layer_10": 0.0589599609375, "loss_aux_layer_11": 0.06304931640625, "loss_aux_layer_12": 0.067626953125, "loss_aux_layer_13": 0.072998046875, "loss_aux_layer_14": 0.0816650390625, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.099365234375, "loss_aux_layer_17": 0.10693359375, "loss_aux_layer_18": 0.1148681640625, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.04437255859375, "loss_aux_layer_20": 0.125732421875, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.0540771484375, "loss_aux_layer_4": 0.05645751953125, "loss_aux_layer_5": 0.05810546875, "loss_aux_layer_6": 0.0609130859375, "loss_aux_layer_7": 0.05902099609375, "loss_aux_layer_8": 0.058349609375, "loss_aux_layer_9": 0.0574951171875, "step": 4045, "total_loss": 0.7368441075086594 }, { "epoch": 0.8010294991090873, "grad_norm": 0.7009773254394531, "learning_rate": 5e-05, "llm_loss": 0.5447888970375061, "loss": 2.5056, "loss_aux_layer_0": 0.0119781494140625, "loss_aux_layer_1": 0.03228759765625, "loss_aux_layer_10": 0.05926513671875, "loss_aux_layer_11": 0.0633544921875, "loss_aux_layer_12": 0.06781005859375, "loss_aux_layer_13": 0.07275390625, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.0888671875, "loss_aux_layer_16": 0.09765625, "loss_aux_layer_17": 0.10546875, "loss_aux_layer_18": 0.11328125, "loss_aux_layer_19": 0.1168212890625, "loss_aux_layer_2": 0.044677734375, "loss_aux_layer_20": 0.1243896484375, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.05450439453125, "loss_aux_layer_4": 0.05694580078125, "loss_aux_layer_5": 0.05841064453125, "loss_aux_layer_6": 0.06146240234375, "loss_aux_layer_7": 0.05975341796875, "loss_aux_layer_8": 0.0592041015625, "loss_aux_layer_9": 0.05804443359375, "step": 4046, "total_loss": 0.6264097541570663 }, { "epoch": 0.8012274797069887, "grad_norm": 1.1177737712860107, "learning_rate": 5e-05, "llm_loss": 0.6442621499300003, "loss": 2.9085, "loss_aux_layer_0": 0.012664794921875, "loss_aux_layer_1": 0.031707763671875, "loss_aux_layer_10": 0.05950927734375, "loss_aux_layer_11": 0.06365966796875, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.073974609375, "loss_aux_layer_14": 0.0828857421875, "loss_aux_layer_15": 0.09130859375, "loss_aux_layer_16": 0.1005859375, "loss_aux_layer_17": 0.108154296875, "loss_aux_layer_18": 0.1158447265625, "loss_aux_layer_19": 0.118896484375, "loss_aux_layer_2": 0.04425048828125, "loss_aux_layer_20": 0.1268310546875, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.193603515625, "loss_aux_layer_3": 0.05377197265625, "loss_aux_layer_4": 0.05621337890625, "loss_aux_layer_5": 0.05792236328125, "loss_aux_layer_6": 0.06134033203125, "loss_aux_layer_7": 0.0592041015625, "loss_aux_layer_8": 0.05877685546875, "loss_aux_layer_9": 0.0579833984375, "step": 4047, "total_loss": 0.7271367609500885 }, { "epoch": 0.8014254603048901, "grad_norm": 0.7668957710266113, "learning_rate": 5e-05, "llm_loss": 0.569193497300148, "loss": 2.6009, "loss_aux_layer_0": 0.0120086669921875, "loss_aux_layer_1": 0.031280517578125, "loss_aux_layer_10": 0.057373046875, "loss_aux_layer_11": 0.0614013671875, "loss_aux_layer_12": 0.06585693359375, "loss_aux_layer_13": 0.071044921875, "loss_aux_layer_14": 0.07958984375, "loss_aux_layer_15": 0.0880126953125, "loss_aux_layer_16": 0.09716796875, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.1129150390625, "loss_aux_layer_19": 0.1165771484375, "loss_aux_layer_2": 0.0435791015625, "loss_aux_layer_20": 0.12451171875, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.0528564453125, "loss_aux_layer_4": 0.0552978515625, "loss_aux_layer_5": 0.05657958984375, "loss_aux_layer_6": 0.0594482421875, "loss_aux_layer_7": 0.057861328125, "loss_aux_layer_8": 0.0572509765625, "loss_aux_layer_9": 0.05615234375, "step": 4048, "total_loss": 0.6502246409654617 }, { "epoch": 0.8016234409027915, "grad_norm": 0.8983088731765747, "learning_rate": 5e-05, "llm_loss": 0.4945712983608246, "loss": 2.3001, "loss_aux_layer_0": 0.0125274658203125, "loss_aux_layer_1": 0.031219482421875, "loss_aux_layer_10": 0.05816650390625, "loss_aux_layer_11": 0.06195068359375, "loss_aux_layer_12": 0.06610107421875, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.0865478515625, "loss_aux_layer_16": 0.0955810546875, "loss_aux_layer_17": 0.103515625, "loss_aux_layer_18": 0.1121826171875, "loss_aux_layer_19": 0.115478515625, "loss_aux_layer_2": 0.0433349609375, "loss_aux_layer_20": 0.123046875, "loss_aux_layer_21": 0.1312255859375, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.05303955078125, "loss_aux_layer_4": 0.05560302734375, "loss_aux_layer_5": 0.05743408203125, "loss_aux_layer_6": 0.06024169921875, "loss_aux_layer_7": 0.05853271484375, "loss_aux_layer_8": 0.05810546875, "loss_aux_layer_9": 0.0570068359375, "step": 4049, "total_loss": 0.5750311240553856 }, { "epoch": 0.8018214215006929, "grad_norm": 0.797187864780426, "learning_rate": 5e-05, "llm_loss": 0.5532618910074234, "loss": 2.5242, "loss_aux_layer_0": 0.0118865966796875, "loss_aux_layer_1": 0.029693603515625, "loss_aux_layer_10": 0.054931640625, "loss_aux_layer_11": 0.05877685546875, "loss_aux_layer_12": 0.06292724609375, "loss_aux_layer_13": 0.06787109375, "loss_aux_layer_14": 0.0762939453125, "loss_aux_layer_15": 0.084716796875, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.1011962890625, "loss_aux_layer_18": 0.1090087890625, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.04132080078125, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.1280517578125, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.05047607421875, "loss_aux_layer_4": 0.05267333984375, "loss_aux_layer_5": 0.0540771484375, "loss_aux_layer_6": 0.05682373046875, "loss_aux_layer_7": 0.05511474609375, "loss_aux_layer_8": 0.0548095703125, "loss_aux_layer_9": 0.05377197265625, "step": 4050, "total_loss": 0.6310445368289948 }, { "epoch": 0.8020194020985943, "grad_norm": 0.8118034601211548, "learning_rate": 5e-05, "llm_loss": 0.5653260350227356, "loss": 2.5773, "loss_aux_layer_0": 0.012237548828125, "loss_aux_layer_1": 0.029541015625, "loss_aux_layer_10": 0.05572509765625, "loss_aux_layer_11": 0.0595703125, "loss_aux_layer_12": 0.0638427734375, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.0863037109375, "loss_aux_layer_16": 0.095703125, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.1116943359375, "loss_aux_layer_19": 0.115234375, "loss_aux_layer_2": 0.04132080078125, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.0501708984375, "loss_aux_layer_4": 0.05218505859375, "loss_aux_layer_5": 0.0540771484375, "loss_aux_layer_6": 0.05682373046875, "loss_aux_layer_7": 0.055419921875, "loss_aux_layer_8": 0.05517578125, "loss_aux_layer_9": 0.0543212890625, "step": 4051, "total_loss": 0.6443319916725159 }, { "epoch": 0.8022173826964958, "grad_norm": 0.8871809840202332, "learning_rate": 5e-05, "llm_loss": 0.4778847098350525, "loss": 2.2436, "loss_aux_layer_0": 0.0117645263671875, "loss_aux_layer_1": 0.03173828125, "loss_aux_layer_10": 0.06005859375, "loss_aux_layer_11": 0.0640869140625, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.073974609375, "loss_aux_layer_14": 0.0826416015625, "loss_aux_layer_15": 0.0908203125, "loss_aux_layer_16": 0.1002197265625, "loss_aux_layer_17": 0.1077880859375, "loss_aux_layer_18": 0.115966796875, "loss_aux_layer_19": 0.119140625, "loss_aux_layer_2": 0.04437255859375, "loss_aux_layer_20": 0.1268310546875, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.05450439453125, "loss_aux_layer_4": 0.05694580078125, "loss_aux_layer_5": 0.058837890625, "loss_aux_layer_6": 0.06201171875, "loss_aux_layer_7": 0.06024169921875, "loss_aux_layer_8": 0.05975341796875, "loss_aux_layer_9": 0.058837890625, "step": 4052, "total_loss": 0.5608986467123032 }, { "epoch": 0.8024153632943971, "grad_norm": 0.80223548412323, "learning_rate": 5e-05, "llm_loss": 0.5269387811422348, "loss": 2.4364, "loss_aux_layer_0": 0.0118408203125, "loss_aux_layer_1": 0.03106689453125, "loss_aux_layer_10": 0.05914306640625, "loss_aux_layer_11": 0.06292724609375, "loss_aux_layer_12": 0.0672607421875, "loss_aux_layer_13": 0.07275390625, "loss_aux_layer_14": 0.0814208984375, "loss_aux_layer_15": 0.089599609375, "loss_aux_layer_16": 0.09912109375, "loss_aux_layer_17": 0.1064453125, "loss_aux_layer_18": 0.1148681640625, "loss_aux_layer_19": 0.1182861328125, "loss_aux_layer_2": 0.04388427734375, "loss_aux_layer_20": 0.1259765625, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.193603515625, "loss_aux_layer_3": 0.05352783203125, "loss_aux_layer_4": 0.05609130859375, "loss_aux_layer_5": 0.0577392578125, "loss_aux_layer_6": 0.06085205078125, "loss_aux_layer_7": 0.05908203125, "loss_aux_layer_8": 0.05865478515625, "loss_aux_layer_9": 0.0576171875, "step": 4053, "total_loss": 0.6090934425592422 }, { "epoch": 0.8026133438922985, "grad_norm": 0.833354115486145, "learning_rate": 5e-05, "llm_loss": 0.5306664258241653, "loss": 2.436, "loss_aux_layer_0": 0.0114288330078125, "loss_aux_layer_1": 0.0299072265625, "loss_aux_layer_10": 0.05523681640625, "loss_aux_layer_11": 0.05902099609375, "loss_aux_layer_12": 0.06353759765625, "loss_aux_layer_13": 0.0687255859375, "loss_aux_layer_14": 0.0770263671875, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.0943603515625, "loss_aux_layer_17": 0.102294921875, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.1134033203125, "loss_aux_layer_2": 0.0416259765625, "loss_aux_layer_20": 0.1214599609375, "loss_aux_layer_21": 0.1297607421875, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.0506591796875, "loss_aux_layer_4": 0.05291748046875, "loss_aux_layer_5": 0.0545654296875, "loss_aux_layer_6": 0.05731201171875, "loss_aux_layer_7": 0.0556640625, "loss_aux_layer_8": 0.05517578125, "loss_aux_layer_9": 0.053955078125, "step": 4054, "total_loss": 0.6090122610330582 }, { "epoch": 0.8028113244902, "grad_norm": 0.9367315769195557, "learning_rate": 5e-05, "llm_loss": 0.495507188141346, "loss": 2.3053, "loss_aux_layer_0": 0.01177978515625, "loss_aux_layer_1": 0.031219482421875, "loss_aux_layer_10": 0.05889892578125, "loss_aux_layer_11": 0.06317138671875, "loss_aux_layer_12": 0.06756591796875, "loss_aux_layer_13": 0.072998046875, "loss_aux_layer_14": 0.0809326171875, "loss_aux_layer_15": 0.0888671875, "loss_aux_layer_16": 0.0975341796875, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.113037109375, "loss_aux_layer_19": 0.11572265625, "loss_aux_layer_2": 0.04345703125, "loss_aux_layer_20": 0.1229248046875, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.05316162109375, "loss_aux_layer_4": 0.055908203125, "loss_aux_layer_5": 0.05743408203125, "loss_aux_layer_6": 0.0601806640625, "loss_aux_layer_7": 0.05865478515625, "loss_aux_layer_8": 0.05828857421875, "loss_aux_layer_9": 0.0572509765625, "step": 4055, "total_loss": 0.5763348042964935 }, { "epoch": 0.8030093050881013, "grad_norm": 0.7286058068275452, "learning_rate": 5e-05, "llm_loss": 0.6324572116136551, "loss": 2.8522, "loss_aux_layer_0": 0.0110931396484375, "loss_aux_layer_1": 0.031341552734375, "loss_aux_layer_10": 0.05828857421875, "loss_aux_layer_11": 0.06231689453125, "loss_aux_layer_12": 0.0662841796875, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.079833984375, "loss_aux_layer_15": 0.0877685546875, "loss_aux_layer_16": 0.0968017578125, "loss_aux_layer_17": 0.1043701171875, "loss_aux_layer_18": 0.1124267578125, "loss_aux_layer_19": 0.1153564453125, "loss_aux_layer_2": 0.043701171875, "loss_aux_layer_20": 0.1231689453125, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.05352783203125, "loss_aux_layer_4": 0.05615234375, "loss_aux_layer_5": 0.05767822265625, "loss_aux_layer_6": 0.060302734375, "loss_aux_layer_7": 0.05859375, "loss_aux_layer_8": 0.0579833984375, "loss_aux_layer_9": 0.05694580078125, "step": 4056, "total_loss": 0.7130485028028488 }, { "epoch": 0.8032072856860027, "grad_norm": 0.8820903301239014, "learning_rate": 5e-05, "llm_loss": 0.6220080107450485, "loss": 2.8234, "loss_aux_layer_0": 0.0121917724609375, "loss_aux_layer_1": 0.03143310546875, "loss_aux_layer_10": 0.059326171875, "loss_aux_layer_11": 0.06353759765625, "loss_aux_layer_12": 0.068603515625, "loss_aux_layer_13": 0.0743408203125, "loss_aux_layer_14": 0.083740234375, "loss_aux_layer_15": 0.0927734375, "loss_aux_layer_16": 0.1026611328125, "loss_aux_layer_17": 0.1107177734375, "loss_aux_layer_18": 0.119384765625, "loss_aux_layer_19": 0.1224365234375, "loss_aux_layer_2": 0.0435791015625, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.13818359375, "loss_aux_layer_22": 0.1591796875, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.05340576171875, "loss_aux_layer_4": 0.055908203125, "loss_aux_layer_5": 0.0576171875, "loss_aux_layer_6": 0.06085205078125, "loss_aux_layer_7": 0.05908203125, "loss_aux_layer_8": 0.0587158203125, "loss_aux_layer_9": 0.0577392578125, "step": 4057, "total_loss": 0.705852821469307 }, { "epoch": 0.8034052662839042, "grad_norm": 1.0252798795700073, "learning_rate": 5e-05, "llm_loss": 0.4833115413784981, "loss": 2.2664, "loss_aux_layer_0": 0.0114898681640625, "loss_aux_layer_1": 0.032379150390625, "loss_aux_layer_10": 0.060791015625, "loss_aux_layer_11": 0.064697265625, "loss_aux_layer_12": 0.0692138671875, "loss_aux_layer_13": 0.0745849609375, "loss_aux_layer_14": 0.082763671875, "loss_aux_layer_15": 0.0909423828125, "loss_aux_layer_16": 0.0992431640625, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.11474609375, "loss_aux_layer_19": 0.11767578125, "loss_aux_layer_2": 0.0452880859375, "loss_aux_layer_20": 0.1256103515625, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.0555419921875, "loss_aux_layer_4": 0.05853271484375, "loss_aux_layer_5": 0.0601806640625, "loss_aux_layer_6": 0.0631103515625, "loss_aux_layer_7": 0.0615234375, "loss_aux_layer_8": 0.0609130859375, "loss_aux_layer_9": 0.05975341796875, "step": 4058, "total_loss": 0.5666030496358871 }, { "epoch": 0.8036032468818056, "grad_norm": 1.094115138053894, "learning_rate": 5e-05, "llm_loss": 0.4708256125450134, "loss": 2.2007, "loss_aux_layer_0": 0.012359619140625, "loss_aux_layer_1": 0.030731201171875, "loss_aux_layer_10": 0.05670166015625, "loss_aux_layer_11": 0.0604248046875, "loss_aux_layer_12": 0.06451416015625, "loss_aux_layer_13": 0.06982421875, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.1026611328125, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.114501953125, "loss_aux_layer_2": 0.04278564453125, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.0521240234375, "loss_aux_layer_4": 0.05450439453125, "loss_aux_layer_5": 0.05621337890625, "loss_aux_layer_6": 0.0589599609375, "loss_aux_layer_7": 0.05712890625, "loss_aux_layer_8": 0.056640625, "loss_aux_layer_9": 0.055419921875, "step": 4059, "total_loss": 0.5501755848526955 }, { "epoch": 0.803801227479707, "grad_norm": 0.8515615463256836, "learning_rate": 5e-05, "llm_loss": 0.44407612830400467, "loss": 2.0981, "loss_aux_layer_0": 0.0122528076171875, "loss_aux_layer_1": 0.0303955078125, "loss_aux_layer_10": 0.05755615234375, "loss_aux_layer_11": 0.06121826171875, "loss_aux_layer_12": 0.06573486328125, "loss_aux_layer_13": 0.07122802734375, "loss_aux_layer_14": 0.0797119140625, "loss_aux_layer_15": 0.087890625, "loss_aux_layer_16": 0.09716796875, "loss_aux_layer_17": 0.1043701171875, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.1160888671875, "loss_aux_layer_2": 0.0426025390625, "loss_aux_layer_20": 0.1239013671875, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.05242919921875, "loss_aux_layer_4": 0.05474853515625, "loss_aux_layer_5": 0.05621337890625, "loss_aux_layer_6": 0.05926513671875, "loss_aux_layer_7": 0.05767822265625, "loss_aux_layer_8": 0.05731201171875, "loss_aux_layer_9": 0.05645751953125, "step": 4060, "total_loss": 0.5245268791913986 }, { "epoch": 0.8039992080776084, "grad_norm": 1.2154706716537476, "learning_rate": 5e-05, "llm_loss": 0.592983216047287, "loss": 2.6774, "loss_aux_layer_0": 0.0120086669921875, "loss_aux_layer_1": 0.029052734375, "loss_aux_layer_10": 0.05303955078125, "loss_aux_layer_11": 0.05682373046875, "loss_aux_layer_12": 0.0609130859375, "loss_aux_layer_13": 0.0662841796875, "loss_aux_layer_14": 0.07470703125, "loss_aux_layer_15": 0.0831298828125, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.099853515625, "loss_aux_layer_18": 0.107666015625, "loss_aux_layer_19": 0.111083984375, "loss_aux_layer_2": 0.03997802734375, "loss_aux_layer_20": 0.1195068359375, "loss_aux_layer_21": 0.1275634765625, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.04901123046875, "loss_aux_layer_4": 0.0511474609375, "loss_aux_layer_5": 0.052490234375, "loss_aux_layer_6": 0.05517578125, "loss_aux_layer_7": 0.05340576171875, "loss_aux_layer_8": 0.052978515625, "loss_aux_layer_9": 0.05194091796875, "step": 4061, "total_loss": 0.6693503558635712 }, { "epoch": 0.8041971886755098, "grad_norm": 1.1551703214645386, "learning_rate": 5e-05, "llm_loss": 0.6519096791744232, "loss": 2.9339, "loss_aux_layer_0": 0.0126953125, "loss_aux_layer_1": 0.031707763671875, "loss_aux_layer_10": 0.059326171875, "loss_aux_layer_11": 0.06292724609375, "loss_aux_layer_12": 0.0670166015625, "loss_aux_layer_13": 0.072265625, "loss_aux_layer_14": 0.0799560546875, "loss_aux_layer_15": 0.087646484375, "loss_aux_layer_16": 0.096435546875, "loss_aux_layer_17": 0.1038818359375, "loss_aux_layer_18": 0.112060546875, "loss_aux_layer_19": 0.116455078125, "loss_aux_layer_2": 0.043701171875, "loss_aux_layer_20": 0.125, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.05364990234375, "loss_aux_layer_4": 0.0565185546875, "loss_aux_layer_5": 0.05816650390625, "loss_aux_layer_6": 0.06097412109375, "loss_aux_layer_7": 0.0594482421875, "loss_aux_layer_8": 0.05902099609375, "loss_aux_layer_9": 0.05792236328125, "step": 4062, "total_loss": 0.7334810793399811 }, { "epoch": 0.8043951692734113, "grad_norm": 1.0770204067230225, "learning_rate": 5e-05, "llm_loss": 0.5205854699015617, "loss": 2.4074, "loss_aux_layer_0": 0.012359619140625, "loss_aux_layer_1": 0.03057861328125, "loss_aux_layer_10": 0.058837890625, "loss_aux_layer_11": 0.0626220703125, "loss_aux_layer_12": 0.0670166015625, "loss_aux_layer_13": 0.072021484375, "loss_aux_layer_14": 0.0802001953125, "loss_aux_layer_15": 0.0887451171875, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.1131591796875, "loss_aux_layer_19": 0.1163330078125, "loss_aux_layer_2": 0.04376220703125, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.0535888671875, "loss_aux_layer_4": 0.0562744140625, "loss_aux_layer_5": 0.05810546875, "loss_aux_layer_6": 0.06109619140625, "loss_aux_layer_7": 0.05914306640625, "loss_aux_layer_8": 0.05853271484375, "loss_aux_layer_9": 0.0574951171875, "step": 4063, "total_loss": 0.6018530130386353 }, { "epoch": 0.8045931498713126, "grad_norm": 1.0662976503372192, "learning_rate": 5e-05, "llm_loss": 0.6035035103559494, "loss": 2.7282, "loss_aux_layer_0": 0.0120086669921875, "loss_aux_layer_1": 0.02935791015625, "loss_aux_layer_10": 0.05511474609375, "loss_aux_layer_11": 0.05889892578125, "loss_aux_layer_12": 0.0631103515625, "loss_aux_layer_13": 0.068603515625, "loss_aux_layer_14": 0.0770263671875, "loss_aux_layer_15": 0.0853271484375, "loss_aux_layer_16": 0.09423828125, "loss_aux_layer_17": 0.1019287109375, "loss_aux_layer_18": 0.1103515625, "loss_aux_layer_19": 0.114501953125, "loss_aux_layer_2": 0.0416259765625, "loss_aux_layer_20": 0.122802734375, "loss_aux_layer_21": 0.1307373046875, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.05072021484375, "loss_aux_layer_4": 0.05303955078125, "loss_aux_layer_5": 0.05462646484375, "loss_aux_layer_6": 0.057373046875, "loss_aux_layer_7": 0.0556640625, "loss_aux_layer_8": 0.05511474609375, "loss_aux_layer_9": 0.0538330078125, "step": 4064, "total_loss": 0.682061180472374 }, { "epoch": 0.804791130469214, "grad_norm": 1.109915018081665, "learning_rate": 5e-05, "llm_loss": 0.49775853008031845, "loss": 2.3353, "loss_aux_layer_0": 0.012542724609375, "loss_aux_layer_1": 0.03350830078125, "loss_aux_layer_10": 0.06378173828125, "loss_aux_layer_11": 0.068115234375, "loss_aux_layer_12": 0.072509765625, "loss_aux_layer_13": 0.078369140625, "loss_aux_layer_14": 0.086669921875, "loss_aux_layer_15": 0.0947265625, "loss_aux_layer_16": 0.1033935546875, "loss_aux_layer_17": 0.110595703125, "loss_aux_layer_18": 0.1181640625, "loss_aux_layer_19": 0.1201171875, "loss_aux_layer_2": 0.0474853515625, "loss_aux_layer_20": 0.1278076171875, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.158935546875, "loss_aux_layer_23": 0.196044921875, "loss_aux_layer_3": 0.05792236328125, "loss_aux_layer_4": 0.06072998046875, "loss_aux_layer_5": 0.0625, "loss_aux_layer_6": 0.065673828125, "loss_aux_layer_7": 0.06390380859375, "loss_aux_layer_8": 0.0634765625, "loss_aux_layer_9": 0.0625, "step": 4065, "total_loss": 0.5838228538632393 }, { "epoch": 0.8049891110671155, "grad_norm": 0.9932076930999756, "learning_rate": 5e-05, "llm_loss": 0.6415703743696213, "loss": 2.8877, "loss_aux_layer_0": 0.011199951171875, "loss_aux_layer_1": 0.0302734375, "loss_aux_layer_10": 0.05731201171875, "loss_aux_layer_11": 0.06121826171875, "loss_aux_layer_12": 0.06585693359375, "loss_aux_layer_13": 0.0712890625, "loss_aux_layer_14": 0.0797119140625, "loss_aux_layer_15": 0.0877685546875, "loss_aux_layer_16": 0.0970458984375, "loss_aux_layer_17": 0.1043701171875, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.1163330078125, "loss_aux_layer_2": 0.04351806640625, "loss_aux_layer_20": 0.1239013671875, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.05291748046875, "loss_aux_layer_4": 0.05499267578125, "loss_aux_layer_5": 0.056396484375, "loss_aux_layer_6": 0.05914306640625, "loss_aux_layer_7": 0.057373046875, "loss_aux_layer_8": 0.05682373046875, "loss_aux_layer_9": 0.055908203125, "step": 4066, "total_loss": 0.7219284921884537 }, { "epoch": 0.8051870916650168, "grad_norm": 1.3477184772491455, "learning_rate": 5e-05, "llm_loss": 0.6212321221828461, "loss": 2.8098, "loss_aux_layer_0": 0.01251220703125, "loss_aux_layer_1": 0.031280517578125, "loss_aux_layer_10": 0.057861328125, "loss_aux_layer_11": 0.06201171875, "loss_aux_layer_12": 0.06591796875, "loss_aux_layer_13": 0.0712890625, "loss_aux_layer_14": 0.079833984375, "loss_aux_layer_15": 0.08837890625, "loss_aux_layer_16": 0.097900390625, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.1134033203125, "loss_aux_layer_19": 0.1171875, "loss_aux_layer_2": 0.04376220703125, "loss_aux_layer_20": 0.1256103515625, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.05364990234375, "loss_aux_layer_4": 0.05596923828125, "loss_aux_layer_5": 0.0576171875, "loss_aux_layer_6": 0.060302734375, "loss_aux_layer_7": 0.05828857421875, "loss_aux_layer_8": 0.05755615234375, "loss_aux_layer_9": 0.05645751953125, "step": 4067, "total_loss": 0.7024524956941605 }, { "epoch": 0.8053850722629182, "grad_norm": 1.0033724308013916, "learning_rate": 5e-05, "llm_loss": 0.568052388727665, "loss": 2.5973, "loss_aux_layer_0": 0.0120086669921875, "loss_aux_layer_1": 0.031097412109375, "loss_aux_layer_10": 0.05816650390625, "loss_aux_layer_11": 0.061767578125, "loss_aux_layer_12": 0.06610107421875, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.0797119140625, "loss_aux_layer_15": 0.088134765625, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.10498046875, "loss_aux_layer_18": 0.1131591796875, "loss_aux_layer_19": 0.1168212890625, "loss_aux_layer_2": 0.04327392578125, "loss_aux_layer_20": 0.125244140625, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.0531005859375, "loss_aux_layer_4": 0.0556640625, "loss_aux_layer_5": 0.05755615234375, "loss_aux_layer_6": 0.06036376953125, "loss_aux_layer_7": 0.05859375, "loss_aux_layer_8": 0.05792236328125, "loss_aux_layer_9": 0.056884765625, "step": 4068, "total_loss": 0.6493296027183533 }, { "epoch": 0.8055830528608197, "grad_norm": 1.0513354539871216, "learning_rate": 5e-05, "llm_loss": 0.606745183467865, "loss": 2.7438, "loss_aux_layer_0": 0.0123443603515625, "loss_aux_layer_1": 0.02984619140625, "loss_aux_layer_10": 0.05523681640625, "loss_aux_layer_11": 0.05902099609375, "loss_aux_layer_12": 0.063232421875, "loss_aux_layer_13": 0.06842041015625, "loss_aux_layer_14": 0.07763671875, "loss_aux_layer_15": 0.0863037109375, "loss_aux_layer_16": 0.0958251953125, "loss_aux_layer_17": 0.103759765625, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.1148681640625, "loss_aux_layer_2": 0.04241943359375, "loss_aux_layer_20": 0.1229248046875, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.051513671875, "loss_aux_layer_4": 0.05364990234375, "loss_aux_layer_5": 0.0552978515625, "loss_aux_layer_6": 0.05810546875, "loss_aux_layer_7": 0.05596923828125, "loss_aux_layer_8": 0.05535888671875, "loss_aux_layer_9": 0.0540771484375, "step": 4069, "total_loss": 0.6859458684921265 }, { "epoch": 0.8057810334587211, "grad_norm": 1.2867512702941895, "learning_rate": 5e-05, "llm_loss": 0.5739968940615654, "loss": 2.6234, "loss_aux_layer_0": 0.0133056640625, "loss_aux_layer_1": 0.03173828125, "loss_aux_layer_10": 0.05889892578125, "loss_aux_layer_11": 0.06298828125, "loss_aux_layer_12": 0.06695556640625, "loss_aux_layer_13": 0.0718994140625, "loss_aux_layer_14": 0.080322265625, "loss_aux_layer_15": 0.08837890625, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.1129150390625, "loss_aux_layer_19": 0.1162109375, "loss_aux_layer_2": 0.0455322265625, "loss_aux_layer_20": 0.1241455078125, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.05548095703125, "loss_aux_layer_4": 0.057861328125, "loss_aux_layer_5": 0.05914306640625, "loss_aux_layer_6": 0.061767578125, "loss_aux_layer_7": 0.0596923828125, "loss_aux_layer_8": 0.05902099609375, "loss_aux_layer_9": 0.05780029296875, "step": 4070, "total_loss": 0.6558523774147034 }, { "epoch": 0.8059790140566224, "grad_norm": 0.9865897297859192, "learning_rate": 5e-05, "llm_loss": 0.4850437790155411, "loss": 2.265, "loss_aux_layer_0": 0.012939453125, "loss_aux_layer_1": 0.0308837890625, "loss_aux_layer_10": 0.058349609375, "loss_aux_layer_11": 0.0625, "loss_aux_layer_12": 0.0672607421875, "loss_aux_layer_13": 0.07275390625, "loss_aux_layer_14": 0.081787109375, "loss_aux_layer_15": 0.0902099609375, "loss_aux_layer_16": 0.099365234375, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.1138916015625, "loss_aux_layer_19": 0.1168212890625, "loss_aux_layer_2": 0.0426025390625, "loss_aux_layer_20": 0.124755859375, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.05218505859375, "loss_aux_layer_4": 0.05474853515625, "loss_aux_layer_5": 0.05657958984375, "loss_aux_layer_6": 0.05963134765625, "loss_aux_layer_7": 0.05810546875, "loss_aux_layer_8": 0.0577392578125, "loss_aux_layer_9": 0.05682373046875, "step": 4071, "total_loss": 0.5662414282560349 }, { "epoch": 0.8061769946545239, "grad_norm": 1.1135663986206055, "learning_rate": 5e-05, "llm_loss": 0.6591526716947556, "loss": 2.9625, "loss_aux_layer_0": 0.013763427734375, "loss_aux_layer_1": 0.03192138671875, "loss_aux_layer_10": 0.057861328125, "loss_aux_layer_11": 0.0618896484375, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.072021484375, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.0887451171875, "loss_aux_layer_16": 0.098388671875, "loss_aux_layer_17": 0.106201171875, "loss_aux_layer_18": 0.1143798828125, "loss_aux_layer_19": 0.1173095703125, "loss_aux_layer_2": 0.04461669921875, "loss_aux_layer_20": 0.1248779296875, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.053955078125, "loss_aux_layer_4": 0.05633544921875, "loss_aux_layer_5": 0.057861328125, "loss_aux_layer_6": 0.0604248046875, "loss_aux_layer_7": 0.058837890625, "loss_aux_layer_8": 0.05810546875, "loss_aux_layer_9": 0.056640625, "step": 4072, "total_loss": 0.740613579750061 }, { "epoch": 0.8063749752524253, "grad_norm": 1.0251480340957642, "learning_rate": 5e-05, "llm_loss": 0.6817261725664139, "loss": 3.0507, "loss_aux_layer_0": 0.012359619140625, "loss_aux_layer_1": 0.031097412109375, "loss_aux_layer_10": 0.05804443359375, "loss_aux_layer_11": 0.06182861328125, "loss_aux_layer_12": 0.0665283203125, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.0802001953125, "loss_aux_layer_15": 0.088623046875, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.1134033203125, "loss_aux_layer_19": 0.1165771484375, "loss_aux_layer_2": 0.04351806640625, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.05303955078125, "loss_aux_layer_4": 0.0557861328125, "loss_aux_layer_5": 0.0572509765625, "loss_aux_layer_6": 0.05999755859375, "loss_aux_layer_7": 0.0582275390625, "loss_aux_layer_8": 0.05767822265625, "loss_aux_layer_9": 0.05657958984375, "step": 4073, "total_loss": 0.7626626342535019 }, { "epoch": 0.8065729558503266, "grad_norm": 1.0406256914138794, "learning_rate": 5e-05, "llm_loss": 0.5830509141087532, "loss": 2.6607, "loss_aux_layer_0": 0.0136566162109375, "loss_aux_layer_1": 0.03167724609375, "loss_aux_layer_10": 0.058837890625, "loss_aux_layer_11": 0.06268310546875, "loss_aux_layer_12": 0.06719970703125, "loss_aux_layer_13": 0.072998046875, "loss_aux_layer_14": 0.0816650390625, "loss_aux_layer_15": 0.08984375, "loss_aux_layer_16": 0.099365234375, "loss_aux_layer_17": 0.1070556640625, "loss_aux_layer_18": 0.114990234375, "loss_aux_layer_19": 0.1182861328125, "loss_aux_layer_2": 0.0435791015625, "loss_aux_layer_20": 0.1259765625, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.05377197265625, "loss_aux_layer_4": 0.05633544921875, "loss_aux_layer_5": 0.05792236328125, "loss_aux_layer_6": 0.06103515625, "loss_aux_layer_7": 0.05950927734375, "loss_aux_layer_8": 0.058837890625, "loss_aux_layer_9": 0.05755615234375, "step": 4074, "total_loss": 0.6651689410209656 }, { "epoch": 0.8067709364482281, "grad_norm": 1.2104042768478394, "learning_rate": 5e-05, "llm_loss": 0.5492313876748085, "loss": 2.5206, "loss_aux_layer_0": 0.0137481689453125, "loss_aux_layer_1": 0.0299072265625, "loss_aux_layer_10": 0.05609130859375, "loss_aux_layer_11": 0.05999755859375, "loss_aux_layer_12": 0.064697265625, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.079833984375, "loss_aux_layer_15": 0.0887451171875, "loss_aux_layer_16": 0.0987548828125, "loss_aux_layer_17": 0.106689453125, "loss_aux_layer_18": 0.1153564453125, "loss_aux_layer_19": 0.11962890625, "loss_aux_layer_2": 0.04205322265625, "loss_aux_layer_20": 0.12841796875, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.05072021484375, "loss_aux_layer_4": 0.0531005859375, "loss_aux_layer_5": 0.0546875, "loss_aux_layer_6": 0.0574951171875, "loss_aux_layer_7": 0.0560302734375, "loss_aux_layer_8": 0.0557861328125, "loss_aux_layer_9": 0.05474853515625, "step": 4075, "total_loss": 0.6301485002040863 }, { "epoch": 0.8069689170461295, "grad_norm": 1.04860258102417, "learning_rate": 5e-05, "llm_loss": 0.5616401582956314, "loss": 2.5585, "loss_aux_layer_0": 0.01214599609375, "loss_aux_layer_1": 0.02935791015625, "loss_aux_layer_10": 0.05517578125, "loss_aux_layer_11": 0.05889892578125, "loss_aux_layer_12": 0.06341552734375, "loss_aux_layer_13": 0.06884765625, "loss_aux_layer_14": 0.077392578125, "loss_aux_layer_15": 0.08544921875, "loss_aux_layer_16": 0.0947265625, "loss_aux_layer_17": 0.1025390625, "loss_aux_layer_18": 0.1103515625, "loss_aux_layer_19": 0.11328125, "loss_aux_layer_2": 0.0408935546875, "loss_aux_layer_20": 0.1212158203125, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.04998779296875, "loss_aux_layer_4": 0.05242919921875, "loss_aux_layer_5": 0.0538330078125, "loss_aux_layer_6": 0.05670166015625, "loss_aux_layer_7": 0.05517578125, "loss_aux_layer_8": 0.05487060546875, "loss_aux_layer_9": 0.0540771484375, "step": 4076, "total_loss": 0.6396311670541763 }, { "epoch": 0.8071668976440309, "grad_norm": 1.1327704191207886, "learning_rate": 5e-05, "llm_loss": 0.5652430057525635, "loss": 2.5838, "loss_aux_layer_0": 0.01348876953125, "loss_aux_layer_1": 0.031494140625, "loss_aux_layer_10": 0.05718994140625, "loss_aux_layer_11": 0.0611572265625, "loss_aux_layer_12": 0.06549072265625, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.0960693359375, "loss_aux_layer_17": 0.1036376953125, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.1168212890625, "loss_aux_layer_2": 0.0439453125, "loss_aux_layer_20": 0.1251220703125, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.0528564453125, "loss_aux_layer_4": 0.0548095703125, "loss_aux_layer_5": 0.05621337890625, "loss_aux_layer_6": 0.05902099609375, "loss_aux_layer_7": 0.0577392578125, "loss_aux_layer_8": 0.05712890625, "loss_aux_layer_9": 0.055908203125, "step": 4077, "total_loss": 0.6459434181451797 }, { "epoch": 0.8073648782419323, "grad_norm": 0.9325761795043945, "learning_rate": 5e-05, "llm_loss": 0.5926671922206879, "loss": 2.6967, "loss_aux_layer_0": 0.01239013671875, "loss_aux_layer_1": 0.03179931640625, "loss_aux_layer_10": 0.0582275390625, "loss_aux_layer_11": 0.06182861328125, "loss_aux_layer_12": 0.0665283203125, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.0799560546875, "loss_aux_layer_15": 0.0882568359375, "loss_aux_layer_16": 0.0975341796875, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.11328125, "loss_aux_layer_19": 0.11669921875, "loss_aux_layer_2": 0.0440673828125, "loss_aux_layer_20": 0.125, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.05364990234375, "loss_aux_layer_4": 0.05621337890625, "loss_aux_layer_5": 0.05755615234375, "loss_aux_layer_6": 0.060546875, "loss_aux_layer_7": 0.0589599609375, "loss_aux_layer_8": 0.05828857421875, "loss_aux_layer_9": 0.05694580078125, "step": 4078, "total_loss": 0.6741817146539688 }, { "epoch": 0.8075628588398337, "grad_norm": 1.0492666959762573, "learning_rate": 5e-05, "llm_loss": 0.5462318435311317, "loss": 2.5108, "loss_aux_layer_0": 0.01251220703125, "loss_aux_layer_1": 0.031524658203125, "loss_aux_layer_10": 0.0587158203125, "loss_aux_layer_11": 0.0625, "loss_aux_layer_12": 0.0670166015625, "loss_aux_layer_13": 0.07244873046875, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.08837890625, "loss_aux_layer_16": 0.097412109375, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.1131591796875, "loss_aux_layer_19": 0.1163330078125, "loss_aux_layer_2": 0.0443115234375, "loss_aux_layer_20": 0.12451171875, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.0538330078125, "loss_aux_layer_4": 0.0562744140625, "loss_aux_layer_5": 0.05792236328125, "loss_aux_layer_6": 0.06060791015625, "loss_aux_layer_7": 0.05902099609375, "loss_aux_layer_8": 0.058349609375, "loss_aux_layer_9": 0.05743408203125, "step": 4079, "total_loss": 0.627689391374588 }, { "epoch": 0.8077608394377351, "grad_norm": 0.9948787093162537, "learning_rate": 5e-05, "llm_loss": 0.5859555974602699, "loss": 2.6747, "loss_aux_layer_0": 0.01226806640625, "loss_aux_layer_1": 0.031707763671875, "loss_aux_layer_10": 0.0589599609375, "loss_aux_layer_11": 0.06280517578125, "loss_aux_layer_12": 0.0673828125, "loss_aux_layer_13": 0.072998046875, "loss_aux_layer_14": 0.0811767578125, "loss_aux_layer_15": 0.090087890625, "loss_aux_layer_16": 0.0999755859375, "loss_aux_layer_17": 0.1075439453125, "loss_aux_layer_18": 0.11572265625, "loss_aux_layer_19": 0.119384765625, "loss_aux_layer_2": 0.04449462890625, "loss_aux_layer_20": 0.1275634765625, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.19580078125, "loss_aux_layer_3": 0.0540771484375, "loss_aux_layer_4": 0.05633544921875, "loss_aux_layer_5": 0.05804443359375, "loss_aux_layer_6": 0.060546875, "loss_aux_layer_7": 0.05902099609375, "loss_aux_layer_8": 0.05859375, "loss_aux_layer_9": 0.05767822265625, "step": 4080, "total_loss": 0.6686859503388405 }, { "epoch": 0.8079588200356365, "grad_norm": 1.0086325407028198, "learning_rate": 5e-05, "llm_loss": 0.4976372495293617, "loss": 2.3135, "loss_aux_layer_0": 0.011962890625, "loss_aux_layer_1": 0.030975341796875, "loss_aux_layer_10": 0.05731201171875, "loss_aux_layer_11": 0.0615234375, "loss_aux_layer_12": 0.0660400390625, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.0799560546875, "loss_aux_layer_15": 0.088134765625, "loss_aux_layer_16": 0.09765625, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.1134033203125, "loss_aux_layer_19": 0.1170654296875, "loss_aux_layer_2": 0.04327392578125, "loss_aux_layer_20": 0.124267578125, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.0521240234375, "loss_aux_layer_4": 0.05438232421875, "loss_aux_layer_5": 0.0560302734375, "loss_aux_layer_6": 0.0589599609375, "loss_aux_layer_7": 0.057373046875, "loss_aux_layer_8": 0.0570068359375, "loss_aux_layer_9": 0.0560302734375, "step": 4081, "total_loss": 0.5783690437674522 }, { "epoch": 0.8081568006335379, "grad_norm": 0.9349277019500732, "learning_rate": 5e-05, "llm_loss": 0.4951416701078415, "loss": 2.3152, "loss_aux_layer_0": 0.011993408203125, "loss_aux_layer_1": 0.033538818359375, "loss_aux_layer_10": 0.060791015625, "loss_aux_layer_11": 0.0648193359375, "loss_aux_layer_12": 0.0692138671875, "loss_aux_layer_13": 0.074462890625, "loss_aux_layer_14": 0.0823974609375, "loss_aux_layer_15": 0.0908203125, "loss_aux_layer_16": 0.0999755859375, "loss_aux_layer_17": 0.107421875, "loss_aux_layer_18": 0.1156005859375, "loss_aux_layer_19": 0.118896484375, "loss_aux_layer_2": 0.04669189453125, "loss_aux_layer_20": 0.12646484375, "loss_aux_layer_21": 0.134765625, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.0565185546875, "loss_aux_layer_4": 0.05877685546875, "loss_aux_layer_5": 0.06036376953125, "loss_aux_layer_6": 0.0635986328125, "loss_aux_layer_7": 0.0615234375, "loss_aux_layer_8": 0.06097412109375, "loss_aux_layer_9": 0.0595703125, "step": 4082, "total_loss": 0.5788023322820663 }, { "epoch": 0.8083547812314393, "grad_norm": 1.063263177871704, "learning_rate": 5e-05, "llm_loss": 0.4768804907798767, "loss": 2.232, "loss_aux_layer_0": 0.01318359375, "loss_aux_layer_1": 0.02996826171875, "loss_aux_layer_10": 0.05694580078125, "loss_aux_layer_11": 0.0609130859375, "loss_aux_layer_12": 0.0653076171875, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.07958984375, "loss_aux_layer_15": 0.0882568359375, "loss_aux_layer_16": 0.09814453125, "loss_aux_layer_17": 0.106689453125, "loss_aux_layer_18": 0.1151123046875, "loss_aux_layer_19": 0.1190185546875, "loss_aux_layer_2": 0.04193115234375, "loss_aux_layer_20": 0.1270751953125, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.19482421875, "loss_aux_layer_3": 0.05126953125, "loss_aux_layer_4": 0.0537109375, "loss_aux_layer_5": 0.0552978515625, "loss_aux_layer_6": 0.0584716796875, "loss_aux_layer_7": 0.0567626953125, "loss_aux_layer_8": 0.0565185546875, "loss_aux_layer_9": 0.05572509765625, "step": 4083, "total_loss": 0.5580102205276489 }, { "epoch": 0.8085527618293408, "grad_norm": 1.0859527587890625, "learning_rate": 5e-05, "llm_loss": 0.6197671294212341, "loss": 2.7994, "loss_aux_layer_0": 0.0117034912109375, "loss_aux_layer_1": 0.030609130859375, "loss_aux_layer_10": 0.0579833984375, "loss_aux_layer_11": 0.0618896484375, "loss_aux_layer_12": 0.06591796875, "loss_aux_layer_13": 0.071044921875, "loss_aux_layer_14": 0.0789794921875, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.09619140625, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.1121826171875, "loss_aux_layer_19": 0.1151123046875, "loss_aux_layer_2": 0.04290771484375, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.0528564453125, "loss_aux_layer_4": 0.05548095703125, "loss_aux_layer_5": 0.0572509765625, "loss_aux_layer_6": 0.06024169921875, "loss_aux_layer_7": 0.0584716796875, "loss_aux_layer_8": 0.05792236328125, "loss_aux_layer_9": 0.05670166015625, "step": 4084, "total_loss": 0.6998492330312729 }, { "epoch": 0.8087507424272421, "grad_norm": 1.0582033395767212, "learning_rate": 5e-05, "llm_loss": 0.5876882672309875, "loss": 2.6787, "loss_aux_layer_0": 0.0128631591796875, "loss_aux_layer_1": 0.0321044921875, "loss_aux_layer_10": 0.05841064453125, "loss_aux_layer_11": 0.06207275390625, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.089599609375, "loss_aux_layer_16": 0.0992431640625, "loss_aux_layer_17": 0.10693359375, "loss_aux_layer_18": 0.115234375, "loss_aux_layer_19": 0.11865234375, "loss_aux_layer_2": 0.04437255859375, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05401611328125, "loss_aux_layer_4": 0.05621337890625, "loss_aux_layer_5": 0.05792236328125, "loss_aux_layer_6": 0.06097412109375, "loss_aux_layer_7": 0.05877685546875, "loss_aux_layer_8": 0.05816650390625, "loss_aux_layer_9": 0.05731201171875, "step": 4085, "total_loss": 0.669683188199997 }, { "epoch": 0.8089487230251435, "grad_norm": 1.1241936683654785, "learning_rate": 5e-05, "llm_loss": 0.6096866577863693, "loss": 2.7578, "loss_aux_layer_0": 0.0116119384765625, "loss_aux_layer_1": 0.030303955078125, "loss_aux_layer_10": 0.055908203125, "loss_aux_layer_11": 0.0596923828125, "loss_aux_layer_12": 0.06402587890625, "loss_aux_layer_13": 0.0699462890625, "loss_aux_layer_14": 0.07861328125, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.0968017578125, "loss_aux_layer_17": 0.1044921875, "loss_aux_layer_18": 0.113037109375, "loss_aux_layer_19": 0.11669921875, "loss_aux_layer_2": 0.04205322265625, "loss_aux_layer_20": 0.124755859375, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.051513671875, "loss_aux_layer_4": 0.05364990234375, "loss_aux_layer_5": 0.05517578125, "loss_aux_layer_6": 0.05804443359375, "loss_aux_layer_7": 0.05615234375, "loss_aux_layer_8": 0.0557861328125, "loss_aux_layer_9": 0.05462646484375, "step": 4086, "total_loss": 0.689460888504982 }, { "epoch": 0.809146703623045, "grad_norm": 0.9668426513671875, "learning_rate": 5e-05, "llm_loss": 0.5345312654972076, "loss": 2.4726, "loss_aux_layer_0": 0.0121612548828125, "loss_aux_layer_1": 0.033172607421875, "loss_aux_layer_10": 0.06158447265625, "loss_aux_layer_11": 0.0655517578125, "loss_aux_layer_12": 0.0699462890625, "loss_aux_layer_13": 0.0753173828125, "loss_aux_layer_14": 0.0833740234375, "loss_aux_layer_15": 0.0914306640625, "loss_aux_layer_16": 0.1004638671875, "loss_aux_layer_17": 0.1072998046875, "loss_aux_layer_18": 0.114990234375, "loss_aux_layer_19": 0.1173095703125, "loss_aux_layer_2": 0.046630859375, "loss_aux_layer_20": 0.12451171875, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.190673828125, "loss_aux_layer_3": 0.0570068359375, "loss_aux_layer_4": 0.05938720703125, "loss_aux_layer_5": 0.06072998046875, "loss_aux_layer_6": 0.06390380859375, "loss_aux_layer_7": 0.0618896484375, "loss_aux_layer_8": 0.061279296875, "loss_aux_layer_9": 0.0601806640625, "step": 4087, "total_loss": 0.6181449741125107 }, { "epoch": 0.8093446842209463, "grad_norm": 1.0577266216278076, "learning_rate": 5e-05, "llm_loss": 0.5984687954187393, "loss": 2.7339, "loss_aux_layer_0": 0.01214599609375, "loss_aux_layer_1": 0.0338134765625, "loss_aux_layer_10": 0.0621337890625, "loss_aux_layer_11": 0.0665283203125, "loss_aux_layer_12": 0.0709228515625, "loss_aux_layer_13": 0.076416015625, "loss_aux_layer_14": 0.0848388671875, "loss_aux_layer_15": 0.093017578125, "loss_aux_layer_16": 0.1019287109375, "loss_aux_layer_17": 0.109619140625, "loss_aux_layer_18": 0.1175537109375, "loss_aux_layer_19": 0.1201171875, "loss_aux_layer_2": 0.04718017578125, "loss_aux_layer_20": 0.1273193359375, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.1943359375, "loss_aux_layer_3": 0.05743408203125, "loss_aux_layer_4": 0.0599365234375, "loss_aux_layer_5": 0.0616455078125, "loss_aux_layer_6": 0.064697265625, "loss_aux_layer_7": 0.0626220703125, "loss_aux_layer_8": 0.06219482421875, "loss_aux_layer_9": 0.0606689453125, "step": 4088, "total_loss": 0.6834758073091507 }, { "epoch": 0.8095426648188477, "grad_norm": 0.9524495005607605, "learning_rate": 5e-05, "llm_loss": 0.5468457415699959, "loss": 2.5053, "loss_aux_layer_0": 0.0119476318359375, "loss_aux_layer_1": 0.030029296875, "loss_aux_layer_10": 0.05633544921875, "loss_aux_layer_11": 0.06011962890625, "loss_aux_layer_12": 0.06414794921875, "loss_aux_layer_13": 0.069580078125, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.0960693359375, "loss_aux_layer_17": 0.103515625, "loss_aux_layer_18": 0.111572265625, "loss_aux_layer_19": 0.115234375, "loss_aux_layer_2": 0.04156494140625, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.05084228515625, "loss_aux_layer_4": 0.0531005859375, "loss_aux_layer_5": 0.05474853515625, "loss_aux_layer_6": 0.05804443359375, "loss_aux_layer_7": 0.0562744140625, "loss_aux_layer_8": 0.0560302734375, "loss_aux_layer_9": 0.05523681640625, "step": 4089, "total_loss": 0.6263241022825241 }, { "epoch": 0.8097406454167492, "grad_norm": 1.2694498300552368, "learning_rate": 5e-05, "llm_loss": 0.5437101274728775, "loss": 2.4936, "loss_aux_layer_0": 0.011322021484375, "loss_aux_layer_1": 0.030029296875, "loss_aux_layer_10": 0.056640625, "loss_aux_layer_11": 0.06048583984375, "loss_aux_layer_12": 0.064697265625, "loss_aux_layer_13": 0.0704345703125, "loss_aux_layer_14": 0.0789794921875, "loss_aux_layer_15": 0.087890625, "loss_aux_layer_16": 0.0970458984375, "loss_aux_layer_17": 0.1048583984375, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.1156005859375, "loss_aux_layer_2": 0.041748046875, "loss_aux_layer_20": 0.12353515625, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.05126953125, "loss_aux_layer_4": 0.05389404296875, "loss_aux_layer_5": 0.0555419921875, "loss_aux_layer_6": 0.05816650390625, "loss_aux_layer_7": 0.05670166015625, "loss_aux_layer_8": 0.056396484375, "loss_aux_layer_9": 0.055419921875, "step": 4090, "total_loss": 0.623390257358551 }, { "epoch": 0.8099386260146506, "grad_norm": 1.3101314306259155, "learning_rate": 5e-05, "llm_loss": 0.5420866459608078, "loss": 2.4789, "loss_aux_layer_0": 0.01220703125, "loss_aux_layer_1": 0.029541015625, "loss_aux_layer_10": 0.05474853515625, "loss_aux_layer_11": 0.05865478515625, "loss_aux_layer_12": 0.06304931640625, "loss_aux_layer_13": 0.068603515625, "loss_aux_layer_14": 0.076416015625, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.0936279296875, "loss_aux_layer_17": 0.101318359375, "loss_aux_layer_18": 0.1092529296875, "loss_aux_layer_19": 0.11279296875, "loss_aux_layer_2": 0.0408935546875, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.04998779296875, "loss_aux_layer_4": 0.05230712890625, "loss_aux_layer_5": 0.0538330078125, "loss_aux_layer_6": 0.0565185546875, "loss_aux_layer_7": 0.0550537109375, "loss_aux_layer_8": 0.0545654296875, "loss_aux_layer_9": 0.05328369140625, "step": 4091, "total_loss": 0.6197195500135422 }, { "epoch": 0.8101366066125519, "grad_norm": 1.6050022840499878, "learning_rate": 5e-05, "llm_loss": 0.5450275465846062, "loss": 2.4963, "loss_aux_layer_0": 0.0124969482421875, "loss_aux_layer_1": 0.03021240234375, "loss_aux_layer_10": 0.0557861328125, "loss_aux_layer_11": 0.059326171875, "loss_aux_layer_12": 0.0635986328125, "loss_aux_layer_13": 0.069091796875, "loss_aux_layer_14": 0.077392578125, "loss_aux_layer_15": 0.0859375, "loss_aux_layer_16": 0.0953369140625, "loss_aux_layer_17": 0.1031494140625, "loss_aux_layer_18": 0.1114501953125, "loss_aux_layer_19": 0.1153564453125, "loss_aux_layer_2": 0.04248046875, "loss_aux_layer_20": 0.1234130859375, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.0517578125, "loss_aux_layer_4": 0.05377197265625, "loss_aux_layer_5": 0.05487060546875, "loss_aux_layer_6": 0.05792236328125, "loss_aux_layer_7": 0.05584716796875, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.0545654296875, "step": 4092, "total_loss": 0.6240811049938202 }, { "epoch": 0.8103345872104534, "grad_norm": 0.7900764346122742, "learning_rate": 5e-05, "llm_loss": 0.6239063441753387, "loss": 2.8179, "loss_aux_layer_0": 0.0121917724609375, "loss_aux_layer_1": 0.03118896484375, "loss_aux_layer_10": 0.056640625, "loss_aux_layer_11": 0.060546875, "loss_aux_layer_12": 0.06475830078125, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.07861328125, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.096435546875, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.112548828125, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.043701171875, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.05322265625, "loss_aux_layer_4": 0.0555419921875, "loss_aux_layer_5": 0.056884765625, "loss_aux_layer_6": 0.0596923828125, "loss_aux_layer_7": 0.0577392578125, "loss_aux_layer_8": 0.056884765625, "loss_aux_layer_9": 0.0555419921875, "step": 4093, "total_loss": 0.7044733911752701 }, { "epoch": 0.8105325678083548, "grad_norm": 2.925980806350708, "learning_rate": 5e-05, "llm_loss": 0.5145677924156189, "loss": 2.3839, "loss_aux_layer_0": 0.0129241943359375, "loss_aux_layer_1": 0.03192138671875, "loss_aux_layer_10": 0.05755615234375, "loss_aux_layer_11": 0.061279296875, "loss_aux_layer_12": 0.0655517578125, "loss_aux_layer_13": 0.071044921875, "loss_aux_layer_14": 0.0792236328125, "loss_aux_layer_15": 0.0877685546875, "loss_aux_layer_16": 0.0970458984375, "loss_aux_layer_17": 0.10498046875, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.1175537109375, "loss_aux_layer_2": 0.04412841796875, "loss_aux_layer_20": 0.12548828125, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.193603515625, "loss_aux_layer_3": 0.0537109375, "loss_aux_layer_4": 0.055908203125, "loss_aux_layer_5": 0.0574951171875, "loss_aux_layer_6": 0.06011962890625, "loss_aux_layer_7": 0.05841064453125, "loss_aux_layer_8": 0.05767822265625, "loss_aux_layer_9": 0.05645751953125, "step": 4094, "total_loss": 0.5959793776273727 }, { "epoch": 0.8107305484062561, "grad_norm": 1.343665361404419, "learning_rate": 5e-05, "llm_loss": 0.6083946228027344, "loss": 2.7613, "loss_aux_layer_0": 0.013092041015625, "loss_aux_layer_1": 0.033203125, "loss_aux_layer_10": 0.0592041015625, "loss_aux_layer_11": 0.0631103515625, "loss_aux_layer_12": 0.0675048828125, "loss_aux_layer_13": 0.07275390625, "loss_aux_layer_14": 0.0810546875, "loss_aux_layer_15": 0.0892333984375, "loss_aux_layer_16": 0.09814453125, "loss_aux_layer_17": 0.1058349609375, "loss_aux_layer_18": 0.1134033203125, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.04583740234375, "loss_aux_layer_20": 0.1234130859375, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.05535888671875, "loss_aux_layer_4": 0.05780029296875, "loss_aux_layer_5": 0.059326171875, "loss_aux_layer_6": 0.0623779296875, "loss_aux_layer_7": 0.06011962890625, "loss_aux_layer_8": 0.0592041015625, "loss_aux_layer_9": 0.057861328125, "step": 4095, "total_loss": 0.6903227120637894 }, { "epoch": 0.8109285290041576, "grad_norm": 1.0287944078445435, "learning_rate": 5e-05, "llm_loss": 0.5606899559497833, "loss": 2.5634, "loss_aux_layer_0": 0.0128173828125, "loss_aux_layer_1": 0.03216552734375, "loss_aux_layer_10": 0.05902099609375, "loss_aux_layer_11": 0.0628662109375, "loss_aux_layer_12": 0.06689453125, "loss_aux_layer_13": 0.071533203125, "loss_aux_layer_14": 0.0791015625, "loss_aux_layer_15": 0.086669921875, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.1102294921875, "loss_aux_layer_19": 0.11328125, "loss_aux_layer_2": 0.044921875, "loss_aux_layer_20": 0.1207275390625, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.054443359375, "loss_aux_layer_4": 0.05682373046875, "loss_aux_layer_5": 0.0582275390625, "loss_aux_layer_6": 0.06134033203125, "loss_aux_layer_7": 0.05926513671875, "loss_aux_layer_8": 0.058837890625, "loss_aux_layer_9": 0.05792236328125, "step": 4096, "total_loss": 0.640846848487854 }, { "epoch": 0.811126509602059, "grad_norm": 1.0890313386917114, "learning_rate": 5e-05, "llm_loss": 0.4744688346982002, "loss": 2.2283, "loss_aux_layer_0": 0.0140380859375, "loss_aux_layer_1": 0.031829833984375, "loss_aux_layer_10": 0.05889892578125, "loss_aux_layer_11": 0.0625, "loss_aux_layer_12": 0.0670166015625, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.08935546875, "loss_aux_layer_16": 0.09912109375, "loss_aux_layer_17": 0.1070556640625, "loss_aux_layer_18": 0.1151123046875, "loss_aux_layer_19": 0.118896484375, "loss_aux_layer_2": 0.0445556640625, "loss_aux_layer_20": 0.127685546875, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.0540771484375, "loss_aux_layer_4": 0.056640625, "loss_aux_layer_5": 0.05828857421875, "loss_aux_layer_6": 0.0616455078125, "loss_aux_layer_7": 0.0596923828125, "loss_aux_layer_8": 0.0592041015625, "loss_aux_layer_9": 0.05767822265625, "step": 4097, "total_loss": 0.5570735335350037 }, { "epoch": 0.8113244901999604, "grad_norm": 1.1209989786148071, "learning_rate": 5e-05, "llm_loss": 0.45211130380630493, "loss": 2.1229, "loss_aux_layer_0": 0.013946533203125, "loss_aux_layer_1": 0.03057861328125, "loss_aux_layer_10": 0.05548095703125, "loss_aux_layer_11": 0.05889892578125, "loss_aux_layer_12": 0.062744140625, "loss_aux_layer_13": 0.067626953125, "loss_aux_layer_14": 0.0758056640625, "loss_aux_layer_15": 0.083740234375, "loss_aux_layer_16": 0.09326171875, "loss_aux_layer_17": 0.101318359375, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.113525390625, "loss_aux_layer_2": 0.042724609375, "loss_aux_layer_20": 0.122314453125, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.05133056640625, "loss_aux_layer_4": 0.0538330078125, "loss_aux_layer_5": 0.05511474609375, "loss_aux_layer_6": 0.057861328125, "loss_aux_layer_7": 0.05615234375, "loss_aux_layer_8": 0.05560302734375, "loss_aux_layer_9": 0.05426025390625, "step": 4098, "total_loss": 0.5307231545448303 }, { "epoch": 0.8115224707978618, "grad_norm": 0.9267681837081909, "learning_rate": 5e-05, "llm_loss": 0.5426734238862991, "loss": 2.4912, "loss_aux_layer_0": 0.0139007568359375, "loss_aux_layer_1": 0.03155517578125, "loss_aux_layer_10": 0.0576171875, "loss_aux_layer_11": 0.0614013671875, "loss_aux_layer_12": 0.0654296875, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.0953369140625, "loss_aux_layer_17": 0.103515625, "loss_aux_layer_18": 0.111572265625, "loss_aux_layer_19": 0.114990234375, "loss_aux_layer_2": 0.04364013671875, "loss_aux_layer_20": 0.122802734375, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.052734375, "loss_aux_layer_4": 0.05511474609375, "loss_aux_layer_5": 0.05657958984375, "loss_aux_layer_6": 0.0596923828125, "loss_aux_layer_7": 0.057861328125, "loss_aux_layer_8": 0.05731201171875, "loss_aux_layer_9": 0.05615234375, "step": 4099, "total_loss": 0.6228020191192627 }, { "epoch": 0.8117204513957632, "grad_norm": 0.9543522000312805, "learning_rate": 5e-05, "llm_loss": 0.5416925624012947, "loss": 2.4785, "loss_aux_layer_0": 0.0152435302734375, "loss_aux_layer_1": 0.029876708984375, "loss_aux_layer_10": 0.0550537109375, "loss_aux_layer_11": 0.05865478515625, "loss_aux_layer_12": 0.06292724609375, "loss_aux_layer_13": 0.0677490234375, "loss_aux_layer_14": 0.0760498046875, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.100830078125, "loss_aux_layer_18": 0.1090087890625, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.0413818359375, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.05023193359375, "loss_aux_layer_4": 0.05224609375, "loss_aux_layer_5": 0.05419921875, "loss_aux_layer_6": 0.0567626953125, "loss_aux_layer_7": 0.0550537109375, "loss_aux_layer_8": 0.05474853515625, "loss_aux_layer_9": 0.05377197265625, "step": 4100, "total_loss": 0.6196141988039017 }, { "epoch": 0.8119184319936646, "grad_norm": 0.9919993281364441, "learning_rate": 5e-05, "llm_loss": 0.49708273261785507, "loss": 2.317, "loss_aux_layer_0": 0.016143798828125, "loss_aux_layer_1": 0.033294677734375, "loss_aux_layer_10": 0.05908203125, "loss_aux_layer_11": 0.06329345703125, "loss_aux_layer_12": 0.06768798828125, "loss_aux_layer_13": 0.072998046875, "loss_aux_layer_14": 0.0811767578125, "loss_aux_layer_15": 0.0888671875, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.1134033203125, "loss_aux_layer_19": 0.116455078125, "loss_aux_layer_2": 0.04559326171875, "loss_aux_layer_20": 0.1239013671875, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.191162109375, "loss_aux_layer_3": 0.054931640625, "loss_aux_layer_4": 0.05718994140625, "loss_aux_layer_5": 0.0587158203125, "loss_aux_layer_6": 0.061767578125, "loss_aux_layer_7": 0.05975341796875, "loss_aux_layer_8": 0.05914306640625, "loss_aux_layer_9": 0.05792236328125, "step": 4101, "total_loss": 0.5792474299669266 }, { "epoch": 0.812116412591566, "grad_norm": 0.9795342087745667, "learning_rate": 5e-05, "llm_loss": 0.6343930959701538, "loss": 2.8657, "loss_aux_layer_0": 0.0125732421875, "loss_aux_layer_1": 0.03216552734375, "loss_aux_layer_10": 0.05950927734375, "loss_aux_layer_11": 0.06378173828125, "loss_aux_layer_12": 0.0682373046875, "loss_aux_layer_13": 0.0736083984375, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.090576171875, "loss_aux_layer_16": 0.0994873046875, "loss_aux_layer_17": 0.1072998046875, "loss_aux_layer_18": 0.114501953125, "loss_aux_layer_19": 0.117431640625, "loss_aux_layer_2": 0.04461669921875, "loss_aux_layer_20": 0.12451171875, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.05438232421875, "loss_aux_layer_4": 0.05657958984375, "loss_aux_layer_5": 0.05810546875, "loss_aux_layer_6": 0.0611572265625, "loss_aux_layer_7": 0.05926513671875, "loss_aux_layer_8": 0.0589599609375, "loss_aux_layer_9": 0.05804443359375, "step": 4102, "total_loss": 0.7164174318313599 }, { "epoch": 0.8123143931894674, "grad_norm": 1.0291739702224731, "learning_rate": 5e-05, "llm_loss": 0.545231819152832, "loss": 2.5147, "loss_aux_layer_0": 0.01611328125, "loss_aux_layer_1": 0.030914306640625, "loss_aux_layer_10": 0.0584716796875, "loss_aux_layer_11": 0.0623779296875, "loss_aux_layer_12": 0.06719970703125, "loss_aux_layer_13": 0.072998046875, "loss_aux_layer_14": 0.0823974609375, "loss_aux_layer_15": 0.091552734375, "loss_aux_layer_16": 0.1014404296875, "loss_aux_layer_17": 0.109619140625, "loss_aux_layer_18": 0.1185302734375, "loss_aux_layer_19": 0.122314453125, "loss_aux_layer_2": 0.0428466796875, "loss_aux_layer_20": 0.131103515625, "loss_aux_layer_21": 0.139404296875, "loss_aux_layer_22": 0.16162109375, "loss_aux_layer_23": 0.2001953125, "loss_aux_layer_3": 0.05194091796875, "loss_aux_layer_4": 0.05438232421875, "loss_aux_layer_5": 0.0562744140625, "loss_aux_layer_6": 0.0594482421875, "loss_aux_layer_7": 0.0576171875, "loss_aux_layer_8": 0.0574951171875, "loss_aux_layer_9": 0.056640625, "step": 4103, "total_loss": 0.6286760419607162 }, { "epoch": 0.8125123737873688, "grad_norm": 1.0762003660202026, "learning_rate": 5e-05, "llm_loss": 0.6136371195316315, "loss": 2.7871, "loss_aux_layer_0": 0.0130767822265625, "loss_aux_layer_1": 0.03265380859375, "loss_aux_layer_10": 0.0596923828125, "loss_aux_layer_11": 0.06390380859375, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.0740966796875, "loss_aux_layer_14": 0.0826416015625, "loss_aux_layer_15": 0.0908203125, "loss_aux_layer_16": 0.10009765625, "loss_aux_layer_17": 0.107666015625, "loss_aux_layer_18": 0.1156005859375, "loss_aux_layer_19": 0.1185302734375, "loss_aux_layer_2": 0.0462646484375, "loss_aux_layer_20": 0.126708984375, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.0552978515625, "loss_aux_layer_4": 0.0576171875, "loss_aux_layer_5": 0.05914306640625, "loss_aux_layer_6": 0.06201171875, "loss_aux_layer_7": 0.05987548828125, "loss_aux_layer_8": 0.0595703125, "loss_aux_layer_9": 0.05816650390625, "step": 4104, "total_loss": 0.6967815458774567 }, { "epoch": 0.8127103543852703, "grad_norm": 1.1271294355392456, "learning_rate": 5e-05, "llm_loss": 0.620308443903923, "loss": 2.803, "loss_aux_layer_0": 0.0154571533203125, "loss_aux_layer_1": 0.03118896484375, "loss_aux_layer_10": 0.05816650390625, "loss_aux_layer_11": 0.06207275390625, "loss_aux_layer_12": 0.0662841796875, "loss_aux_layer_13": 0.0712890625, "loss_aux_layer_14": 0.0792236328125, "loss_aux_layer_15": 0.0877685546875, "loss_aux_layer_16": 0.0970458984375, "loss_aux_layer_17": 0.104736328125, "loss_aux_layer_18": 0.1124267578125, "loss_aux_layer_19": 0.115234375, "loss_aux_layer_2": 0.04345703125, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.05267333984375, "loss_aux_layer_4": 0.0548095703125, "loss_aux_layer_5": 0.05657958984375, "loss_aux_layer_6": 0.0594482421875, "loss_aux_layer_7": 0.05755615234375, "loss_aux_layer_8": 0.0572509765625, "loss_aux_layer_9": 0.05645751953125, "step": 4105, "total_loss": 0.7007383108139038 }, { "epoch": 0.8129083349831716, "grad_norm": 0.8540797829627991, "learning_rate": 5e-05, "llm_loss": 0.5686409175395966, "loss": 2.6047, "loss_aux_layer_0": 0.0133514404296875, "loss_aux_layer_1": 0.031982421875, "loss_aux_layer_10": 0.05908203125, "loss_aux_layer_11": 0.06280517578125, "loss_aux_layer_12": 0.067138671875, "loss_aux_layer_13": 0.0723876953125, "loss_aux_layer_14": 0.0811767578125, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.0994873046875, "loss_aux_layer_17": 0.107177734375, "loss_aux_layer_18": 0.1156005859375, "loss_aux_layer_19": 0.1192626953125, "loss_aux_layer_2": 0.04437255859375, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.15673828125, "loss_aux_layer_23": 0.193603515625, "loss_aux_layer_3": 0.0538330078125, "loss_aux_layer_4": 0.0560302734375, "loss_aux_layer_5": 0.05767822265625, "loss_aux_layer_6": 0.06103515625, "loss_aux_layer_7": 0.0592041015625, "loss_aux_layer_8": 0.0587158203125, "loss_aux_layer_9": 0.057861328125, "step": 4106, "total_loss": 0.6511867046356201 }, { "epoch": 0.813106315581073, "grad_norm": 1.0743306875228882, "learning_rate": 5e-05, "llm_loss": 0.5854193270206451, "loss": 2.6625, "loss_aux_layer_0": 0.0129852294921875, "loss_aux_layer_1": 0.031097412109375, "loss_aux_layer_10": 0.05780029296875, "loss_aux_layer_11": 0.06158447265625, "loss_aux_layer_12": 0.06561279296875, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.0789794921875, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.0960693359375, "loss_aux_layer_17": 0.1033935546875, "loss_aux_layer_18": 0.1114501953125, "loss_aux_layer_19": 0.1153564453125, "loss_aux_layer_2": 0.0430908203125, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.05267333984375, "loss_aux_layer_4": 0.0550537109375, "loss_aux_layer_5": 0.0567626953125, "loss_aux_layer_6": 0.0596923828125, "loss_aux_layer_7": 0.05816650390625, "loss_aux_layer_8": 0.05743408203125, "loss_aux_layer_9": 0.056396484375, "step": 4107, "total_loss": 0.6656173765659332 }, { "epoch": 0.8133042961789745, "grad_norm": 0.8307338356971741, "learning_rate": 5e-05, "llm_loss": 0.5616243109107018, "loss": 2.5724, "loss_aux_layer_0": 0.012908935546875, "loss_aux_layer_1": 0.032196044921875, "loss_aux_layer_10": 0.0596923828125, "loss_aux_layer_11": 0.0638427734375, "loss_aux_layer_12": 0.06787109375, "loss_aux_layer_13": 0.0732421875, "loss_aux_layer_14": 0.0809326171875, "loss_aux_layer_15": 0.0888671875, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.1129150390625, "loss_aux_layer_19": 0.115478515625, "loss_aux_layer_2": 0.04461669921875, "loss_aux_layer_20": 0.1229248046875, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.054443359375, "loss_aux_layer_4": 0.056884765625, "loss_aux_layer_5": 0.05853271484375, "loss_aux_layer_6": 0.0614013671875, "loss_aux_layer_7": 0.05975341796875, "loss_aux_layer_8": 0.059326171875, "loss_aux_layer_9": 0.0582275390625, "step": 4108, "total_loss": 0.6431065797805786 }, { "epoch": 0.8135022767768759, "grad_norm": 1.0062402486801147, "learning_rate": 5e-05, "llm_loss": 0.6594251245260239, "loss": 2.9533, "loss_aux_layer_0": 0.01312255859375, "loss_aux_layer_1": 0.030426025390625, "loss_aux_layer_10": 0.05682373046875, "loss_aux_layer_11": 0.0604248046875, "loss_aux_layer_12": 0.064697265625, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.08544921875, "loss_aux_layer_16": 0.093994140625, "loss_aux_layer_17": 0.1014404296875, "loss_aux_layer_18": 0.1092529296875, "loss_aux_layer_19": 0.11279296875, "loss_aux_layer_2": 0.0433349609375, "loss_aux_layer_20": 0.120849609375, "loss_aux_layer_21": 0.1290283203125, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.052490234375, "loss_aux_layer_4": 0.05450439453125, "loss_aux_layer_5": 0.0557861328125, "loss_aux_layer_6": 0.0587158203125, "loss_aux_layer_7": 0.05718994140625, "loss_aux_layer_8": 0.056640625, "loss_aux_layer_9": 0.05548095703125, "step": 4109, "total_loss": 0.7383357286453247 }, { "epoch": 0.8137002573747772, "grad_norm": 0.8430692553520203, "learning_rate": 5e-05, "llm_loss": 0.5324662551283836, "loss": 2.4523, "loss_aux_layer_0": 0.0123748779296875, "loss_aux_layer_1": 0.031005859375, "loss_aux_layer_10": 0.05731201171875, "loss_aux_layer_11": 0.0611572265625, "loss_aux_layer_12": 0.0653076171875, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.079345703125, "loss_aux_layer_15": 0.0877685546875, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.11328125, "loss_aux_layer_19": 0.1168212890625, "loss_aux_layer_2": 0.043212890625, "loss_aux_layer_20": 0.125, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.05267333984375, "loss_aux_layer_4": 0.05499267578125, "loss_aux_layer_5": 0.05645751953125, "loss_aux_layer_6": 0.0596923828125, "loss_aux_layer_7": 0.0577392578125, "loss_aux_layer_8": 0.05694580078125, "loss_aux_layer_9": 0.05596923828125, "step": 4110, "total_loss": 0.6130646169185638 }, { "epoch": 0.8138982379726787, "grad_norm": 0.8381929993629456, "learning_rate": 5e-05, "llm_loss": 0.4690101146697998, "loss": 2.2064, "loss_aux_layer_0": 0.0129547119140625, "loss_aux_layer_1": 0.0316162109375, "loss_aux_layer_10": 0.05865478515625, "loss_aux_layer_11": 0.06298828125, "loss_aux_layer_12": 0.06744384765625, "loss_aux_layer_13": 0.0728759765625, "loss_aux_layer_14": 0.0816650390625, "loss_aux_layer_15": 0.0902099609375, "loss_aux_layer_16": 0.0994873046875, "loss_aux_layer_17": 0.1077880859375, "loss_aux_layer_18": 0.11572265625, "loss_aux_layer_19": 0.1190185546875, "loss_aux_layer_2": 0.04400634765625, "loss_aux_layer_20": 0.126953125, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.05352783203125, "loss_aux_layer_4": 0.05615234375, "loss_aux_layer_5": 0.05780029296875, "loss_aux_layer_6": 0.060791015625, "loss_aux_layer_7": 0.058837890625, "loss_aux_layer_8": 0.05828857421875, "loss_aux_layer_9": 0.05706787109375, "step": 4111, "total_loss": 0.5515967905521393 }, { "epoch": 0.8140962185705801, "grad_norm": 1.0448009967803955, "learning_rate": 5e-05, "llm_loss": 0.4920624792575836, "loss": 2.3036, "loss_aux_layer_0": 0.011749267578125, "loss_aux_layer_1": 0.03271484375, "loss_aux_layer_10": 0.060791015625, "loss_aux_layer_11": 0.06494140625, "loss_aux_layer_12": 0.069580078125, "loss_aux_layer_13": 0.0751953125, "loss_aux_layer_14": 0.0841064453125, "loss_aux_layer_15": 0.092529296875, "loss_aux_layer_16": 0.101806640625, "loss_aux_layer_17": 0.1097412109375, "loss_aux_layer_18": 0.117919921875, "loss_aux_layer_19": 0.1202392578125, "loss_aux_layer_2": 0.04632568359375, "loss_aux_layer_20": 0.12744140625, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.05621337890625, "loss_aux_layer_4": 0.05828857421875, "loss_aux_layer_5": 0.0599365234375, "loss_aux_layer_6": 0.0628662109375, "loss_aux_layer_7": 0.0611572265625, "loss_aux_layer_8": 0.0606689453125, "loss_aux_layer_9": 0.05938720703125, "step": 4112, "total_loss": 0.5758903622627258 }, { "epoch": 0.8142941991684814, "grad_norm": 1.3845176696777344, "learning_rate": 5e-05, "llm_loss": 0.6099258065223694, "loss": 2.7567, "loss_aux_layer_0": 0.0123138427734375, "loss_aux_layer_1": 0.03009033203125, "loss_aux_layer_10": 0.056396484375, "loss_aux_layer_11": 0.06011962890625, "loss_aux_layer_12": 0.06439208984375, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.07763671875, "loss_aux_layer_15": 0.0855712890625, "loss_aux_layer_16": 0.094482421875, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.110595703125, "loss_aux_layer_19": 0.114501953125, "loss_aux_layer_2": 0.0433349609375, "loss_aux_layer_20": 0.123046875, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.0523681640625, "loss_aux_layer_4": 0.0545654296875, "loss_aux_layer_5": 0.05572509765625, "loss_aux_layer_6": 0.05853271484375, "loss_aux_layer_7": 0.0567626953125, "loss_aux_layer_8": 0.05633544921875, "loss_aux_layer_9": 0.05511474609375, "step": 4113, "total_loss": 0.6891664862632751 }, { "epoch": 0.8144921797663829, "grad_norm": 0.8887357711791992, "learning_rate": 5e-05, "llm_loss": 0.56136654317379, "loss": 2.58, "loss_aux_layer_0": 0.0123291015625, "loss_aux_layer_1": 0.032257080078125, "loss_aux_layer_10": 0.06024169921875, "loss_aux_layer_11": 0.064453125, "loss_aux_layer_12": 0.0689697265625, "loss_aux_layer_13": 0.074462890625, "loss_aux_layer_14": 0.083251953125, "loss_aux_layer_15": 0.092041015625, "loss_aux_layer_16": 0.101318359375, "loss_aux_layer_17": 0.108642578125, "loss_aux_layer_18": 0.1171875, "loss_aux_layer_19": 0.1202392578125, "loss_aux_layer_2": 0.04534912109375, "loss_aux_layer_20": 0.127685546875, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.05511474609375, "loss_aux_layer_4": 0.05731201171875, "loss_aux_layer_5": 0.05908203125, "loss_aux_layer_6": 0.06195068359375, "loss_aux_layer_7": 0.060302734375, "loss_aux_layer_8": 0.05975341796875, "loss_aux_layer_9": 0.0584716796875, "step": 4114, "total_loss": 0.6449954211711884 }, { "epoch": 0.8146901603642843, "grad_norm": 1.1522879600524902, "learning_rate": 5e-05, "llm_loss": 0.605782687664032, "loss": 2.7596, "loss_aux_layer_0": 0.012054443359375, "loss_aux_layer_1": 0.033203125, "loss_aux_layer_10": 0.06219482421875, "loss_aux_layer_11": 0.06622314453125, "loss_aux_layer_12": 0.0701904296875, "loss_aux_layer_13": 0.075439453125, "loss_aux_layer_14": 0.0833740234375, "loss_aux_layer_15": 0.091552734375, "loss_aux_layer_16": 0.100341796875, "loss_aux_layer_17": 0.10791015625, "loss_aux_layer_18": 0.1153564453125, "loss_aux_layer_19": 0.1185302734375, "loss_aux_layer_2": 0.04669189453125, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.05670166015625, "loss_aux_layer_4": 0.0595703125, "loss_aux_layer_5": 0.06109619140625, "loss_aux_layer_6": 0.064453125, "loss_aux_layer_7": 0.062744140625, "loss_aux_layer_8": 0.06195068359375, "loss_aux_layer_9": 0.06072998046875, "step": 4115, "total_loss": 0.6899062246084213 }, { "epoch": 0.8148881409621858, "grad_norm": 1.045984148979187, "learning_rate": 5e-05, "llm_loss": 0.6115435212850571, "loss": 2.7624, "loss_aux_layer_0": 0.0119171142578125, "loss_aux_layer_1": 0.03009033203125, "loss_aux_layer_10": 0.05633544921875, "loss_aux_layer_11": 0.05999755859375, "loss_aux_layer_12": 0.06439208984375, "loss_aux_layer_13": 0.06988525390625, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.09619140625, "loss_aux_layer_17": 0.1036376953125, "loss_aux_layer_18": 0.1119384765625, "loss_aux_layer_19": 0.11474609375, "loss_aux_layer_2": 0.04193115234375, "loss_aux_layer_20": 0.122802734375, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.05096435546875, "loss_aux_layer_4": 0.0531005859375, "loss_aux_layer_5": 0.05474853515625, "loss_aux_layer_6": 0.05718994140625, "loss_aux_layer_7": 0.05584716796875, "loss_aux_layer_8": 0.05560302734375, "loss_aux_layer_9": 0.05487060546875, "step": 4116, "total_loss": 0.6906019151210785 }, { "epoch": 0.8150861215600871, "grad_norm": 1.0756607055664062, "learning_rate": 5e-05, "llm_loss": 0.6485642641782761, "loss": 2.9278, "loss_aux_layer_0": 0.0120849609375, "loss_aux_layer_1": 0.03302001953125, "loss_aux_layer_10": 0.0606689453125, "loss_aux_layer_11": 0.06494140625, "loss_aux_layer_12": 0.0693359375, "loss_aux_layer_13": 0.0750732421875, "loss_aux_layer_14": 0.0831298828125, "loss_aux_layer_15": 0.09130859375, "loss_aux_layer_16": 0.0997314453125, "loss_aux_layer_17": 0.107421875, "loss_aux_layer_18": 0.1148681640625, "loss_aux_layer_19": 0.11767578125, "loss_aux_layer_2": 0.046875, "loss_aux_layer_20": 0.125, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.05694580078125, "loss_aux_layer_4": 0.059326171875, "loss_aux_layer_5": 0.06072998046875, "loss_aux_layer_6": 0.06329345703125, "loss_aux_layer_7": 0.0614013671875, "loss_aux_layer_8": 0.060791015625, "loss_aux_layer_9": 0.0594482421875, "step": 4117, "total_loss": 0.731947660446167 }, { "epoch": 0.8152841021579885, "grad_norm": 1.0201013088226318, "learning_rate": 5e-05, "llm_loss": 0.598951905965805, "loss": 2.7227, "loss_aux_layer_0": 0.0125579833984375, "loss_aux_layer_1": 0.031280517578125, "loss_aux_layer_10": 0.05859375, "loss_aux_layer_11": 0.0623779296875, "loss_aux_layer_12": 0.066650390625, "loss_aux_layer_13": 0.072265625, "loss_aux_layer_14": 0.08056640625, "loss_aux_layer_15": 0.0888671875, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.10595703125, "loss_aux_layer_18": 0.114013671875, "loss_aux_layer_19": 0.1173095703125, "loss_aux_layer_2": 0.0443115234375, "loss_aux_layer_20": 0.1253662109375, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.0538330078125, "loss_aux_layer_4": 0.05615234375, "loss_aux_layer_5": 0.0577392578125, "loss_aux_layer_6": 0.06097412109375, "loss_aux_layer_7": 0.0592041015625, "loss_aux_layer_8": 0.05865478515625, "loss_aux_layer_9": 0.0572509765625, "step": 4118, "total_loss": 0.6806639730930328 }, { "epoch": 0.81548208275589, "grad_norm": 1.0678622722625732, "learning_rate": 5e-05, "llm_loss": 0.5964556336402893, "loss": 2.7096, "loss_aux_layer_0": 0.012298583984375, "loss_aux_layer_1": 0.031463623046875, "loss_aux_layer_10": 0.05816650390625, "loss_aux_layer_11": 0.06219482421875, "loss_aux_layer_12": 0.0665283203125, "loss_aux_layer_13": 0.071533203125, "loss_aux_layer_14": 0.0797119140625, "loss_aux_layer_15": 0.088134765625, "loss_aux_layer_16": 0.09716796875, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.1124267578125, "loss_aux_layer_19": 0.11572265625, "loss_aux_layer_2": 0.0443115234375, "loss_aux_layer_20": 0.1231689453125, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.05389404296875, "loss_aux_layer_4": 0.05609130859375, "loss_aux_layer_5": 0.057861328125, "loss_aux_layer_6": 0.06085205078125, "loss_aux_layer_7": 0.0587158203125, "loss_aux_layer_8": 0.0582275390625, "loss_aux_layer_9": 0.0567626953125, "step": 4119, "total_loss": 0.677398294210434 }, { "epoch": 0.8156800633537913, "grad_norm": 0.8840410709381104, "learning_rate": 5e-05, "llm_loss": 0.4733075350522995, "loss": 2.2149, "loss_aux_layer_0": 0.0135955810546875, "loss_aux_layer_1": 0.0303955078125, "loss_aux_layer_10": 0.056396484375, "loss_aux_layer_11": 0.060546875, "loss_aux_layer_12": 0.06488037109375, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.07861328125, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.095947265625, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.1173095703125, "loss_aux_layer_2": 0.041748046875, "loss_aux_layer_20": 0.12548828125, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.05120849609375, "loss_aux_layer_4": 0.05377197265625, "loss_aux_layer_5": 0.0552978515625, "loss_aux_layer_6": 0.05816650390625, "loss_aux_layer_7": 0.056640625, "loss_aux_layer_8": 0.0562744140625, "loss_aux_layer_9": 0.05511474609375, "step": 4120, "total_loss": 0.553727313876152 }, { "epoch": 0.8158780439516927, "grad_norm": 1.0386329889297485, "learning_rate": 5e-05, "llm_loss": 0.5749873071908951, "loss": 2.636, "loss_aux_layer_0": 0.0126190185546875, "loss_aux_layer_1": 0.03167724609375, "loss_aux_layer_10": 0.0596923828125, "loss_aux_layer_11": 0.06402587890625, "loss_aux_layer_12": 0.068603515625, "loss_aux_layer_13": 0.07470703125, "loss_aux_layer_14": 0.0838623046875, "loss_aux_layer_15": 0.0926513671875, "loss_aux_layer_16": 0.1025390625, "loss_aux_layer_17": 0.1103515625, "loss_aux_layer_18": 0.118896484375, "loss_aux_layer_19": 0.12255859375, "loss_aux_layer_2": 0.04461669921875, "loss_aux_layer_20": 0.130615234375, "loss_aux_layer_21": 0.138427734375, "loss_aux_layer_22": 0.158447265625, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.05413818359375, "loss_aux_layer_4": 0.0565185546875, "loss_aux_layer_5": 0.05816650390625, "loss_aux_layer_6": 0.06103515625, "loss_aux_layer_7": 0.059326171875, "loss_aux_layer_8": 0.0589599609375, "loss_aux_layer_9": 0.0579833984375, "step": 4121, "total_loss": 0.6589896529912949 }, { "epoch": 0.8160760245495942, "grad_norm": 0.8559611439704895, "learning_rate": 5e-05, "llm_loss": 0.621625155210495, "loss": 2.8068, "loss_aux_layer_0": 0.012969970703125, "loss_aux_layer_1": 0.031341552734375, "loss_aux_layer_10": 0.058349609375, "loss_aux_layer_11": 0.06219482421875, "loss_aux_layer_12": 0.06634521484375, "loss_aux_layer_13": 0.071533203125, "loss_aux_layer_14": 0.0794677734375, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.09619140625, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.113525390625, "loss_aux_layer_2": 0.04400634765625, "loss_aux_layer_20": 0.120849609375, "loss_aux_layer_21": 0.1285400390625, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.0535888671875, "loss_aux_layer_4": 0.055908203125, "loss_aux_layer_5": 0.05743408203125, "loss_aux_layer_6": 0.060302734375, "loss_aux_layer_7": 0.05859375, "loss_aux_layer_8": 0.0579833984375, "loss_aux_layer_9": 0.0570068359375, "step": 4122, "total_loss": 0.7017065435647964 }, { "epoch": 0.8162740051474956, "grad_norm": 1.068579077720642, "learning_rate": 5e-05, "llm_loss": 0.5718220323324203, "loss": 2.6205, "loss_aux_layer_0": 0.0140533447265625, "loss_aux_layer_1": 0.033050537109375, "loss_aux_layer_10": 0.05999755859375, "loss_aux_layer_11": 0.0643310546875, "loss_aux_layer_12": 0.0687255859375, "loss_aux_layer_13": 0.0743408203125, "loss_aux_layer_14": 0.082763671875, "loss_aux_layer_15": 0.091064453125, "loss_aux_layer_16": 0.100341796875, "loss_aux_layer_17": 0.107421875, "loss_aux_layer_18": 0.1151123046875, "loss_aux_layer_19": 0.11767578125, "loss_aux_layer_2": 0.046142578125, "loss_aux_layer_20": 0.125732421875, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.193359375, "loss_aux_layer_3": 0.05560302734375, "loss_aux_layer_4": 0.05780029296875, "loss_aux_layer_5": 0.059326171875, "loss_aux_layer_6": 0.062255859375, "loss_aux_layer_7": 0.0604248046875, "loss_aux_layer_8": 0.05999755859375, "loss_aux_layer_9": 0.0587158203125, "step": 4123, "total_loss": 0.6551205366849899 }, { "epoch": 0.8164719857453969, "grad_norm": 0.9246877431869507, "learning_rate": 5e-05, "llm_loss": 0.6248131990432739, "loss": 2.8322, "loss_aux_layer_0": 0.012298583984375, "loss_aux_layer_1": 0.032867431640625, "loss_aux_layer_10": 0.0611572265625, "loss_aux_layer_11": 0.06549072265625, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.075439453125, "loss_aux_layer_14": 0.0830078125, "loss_aux_layer_15": 0.091064453125, "loss_aux_layer_16": 0.099609375, "loss_aux_layer_17": 0.1068115234375, "loss_aux_layer_18": 0.11474609375, "loss_aux_layer_19": 0.117431640625, "loss_aux_layer_2": 0.0455322265625, "loss_aux_layer_20": 0.124755859375, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.055908203125, "loss_aux_layer_4": 0.0587158203125, "loss_aux_layer_5": 0.06024169921875, "loss_aux_layer_6": 0.0635986328125, "loss_aux_layer_7": 0.0618896484375, "loss_aux_layer_8": 0.06103515625, "loss_aux_layer_9": 0.0599365234375, "step": 4124, "total_loss": 0.7080397754907608 }, { "epoch": 0.8166699663432984, "grad_norm": 1.162246823310852, "learning_rate": 5e-05, "llm_loss": 0.6414504796266556, "loss": 2.8897, "loss_aux_layer_0": 0.0131683349609375, "loss_aux_layer_1": 0.03131103515625, "loss_aux_layer_10": 0.0574951171875, "loss_aux_layer_11": 0.0618896484375, "loss_aux_layer_12": 0.0662841796875, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.080078125, "loss_aux_layer_15": 0.08837890625, "loss_aux_layer_16": 0.09765625, "loss_aux_layer_17": 0.1053466796875, "loss_aux_layer_18": 0.1134033203125, "loss_aux_layer_19": 0.1170654296875, "loss_aux_layer_2": 0.04296875, "loss_aux_layer_20": 0.124755859375, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.05224609375, "loss_aux_layer_4": 0.0550537109375, "loss_aux_layer_5": 0.05706787109375, "loss_aux_layer_6": 0.05963134765625, "loss_aux_layer_7": 0.05780029296875, "loss_aux_layer_8": 0.057373046875, "loss_aux_layer_9": 0.05633544921875, "step": 4125, "total_loss": 0.7224159836769104 }, { "epoch": 0.8168679469411998, "grad_norm": 0.9192013144493103, "learning_rate": 5e-05, "llm_loss": 0.5724926739931107, "loss": 2.6127, "loss_aux_layer_0": 0.013519287109375, "loss_aux_layer_1": 0.0302734375, "loss_aux_layer_10": 0.05609130859375, "loss_aux_layer_11": 0.06011962890625, "loss_aux_layer_12": 0.06451416015625, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.0792236328125, "loss_aux_layer_15": 0.0880126953125, "loss_aux_layer_16": 0.097900390625, "loss_aux_layer_17": 0.10595703125, "loss_aux_layer_18": 0.1143798828125, "loss_aux_layer_19": 0.1185302734375, "loss_aux_layer_2": 0.04193115234375, "loss_aux_layer_20": 0.126708984375, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.05096435546875, "loss_aux_layer_4": 0.05316162109375, "loss_aux_layer_5": 0.05462646484375, "loss_aux_layer_6": 0.0574951171875, "loss_aux_layer_7": 0.05584716796875, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.05487060546875, "step": 4126, "total_loss": 0.6531659364700317 }, { "epoch": 0.8170659275391011, "grad_norm": 0.9410157799720764, "learning_rate": 5e-05, "llm_loss": 0.5743447244167328, "loss": 2.6343, "loss_aux_layer_0": 0.01214599609375, "loss_aux_layer_1": 0.03314208984375, "loss_aux_layer_10": 0.06097412109375, "loss_aux_layer_11": 0.065185546875, "loss_aux_layer_12": 0.06982421875, "loss_aux_layer_13": 0.0755615234375, "loss_aux_layer_14": 0.0843505859375, "loss_aux_layer_15": 0.0924072265625, "loss_aux_layer_16": 0.101806640625, "loss_aux_layer_17": 0.109375, "loss_aux_layer_18": 0.1177978515625, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.047119140625, "loss_aux_layer_20": 0.127197265625, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.05682373046875, "loss_aux_layer_4": 0.05908203125, "loss_aux_layer_5": 0.06048583984375, "loss_aux_layer_6": 0.06353759765625, "loss_aux_layer_7": 0.06170654296875, "loss_aux_layer_8": 0.06109619140625, "loss_aux_layer_9": 0.05987548828125, "step": 4127, "total_loss": 0.6585690379142761 }, { "epoch": 0.8172639081370026, "grad_norm": 0.8264709711074829, "learning_rate": 5e-05, "llm_loss": 0.5237606465816498, "loss": 2.4159, "loss_aux_layer_0": 0.013336181640625, "loss_aux_layer_1": 0.03045654296875, "loss_aux_layer_10": 0.05828857421875, "loss_aux_layer_11": 0.06231689453125, "loss_aux_layer_12": 0.0665283203125, "loss_aux_layer_13": 0.0718994140625, "loss_aux_layer_14": 0.0797119140625, "loss_aux_layer_15": 0.08740234375, "loss_aux_layer_16": 0.0965576171875, "loss_aux_layer_17": 0.1038818359375, "loss_aux_layer_18": 0.1121826171875, "loss_aux_layer_19": 0.1151123046875, "loss_aux_layer_2": 0.04266357421875, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.0523681640625, "loss_aux_layer_4": 0.054931640625, "loss_aux_layer_5": 0.056640625, "loss_aux_layer_6": 0.0596923828125, "loss_aux_layer_7": 0.05810546875, "loss_aux_layer_8": 0.05767822265625, "loss_aux_layer_9": 0.05682373046875, "step": 4128, "total_loss": 0.6039708629250526 }, { "epoch": 0.817461888734904, "grad_norm": 0.885503351688385, "learning_rate": 5e-05, "llm_loss": 0.5776571929454803, "loss": 2.6228, "loss_aux_layer_0": 0.0116729736328125, "loss_aux_layer_1": 0.029937744140625, "loss_aux_layer_10": 0.05499267578125, "loss_aux_layer_11": 0.05877685546875, "loss_aux_layer_12": 0.0631103515625, "loss_aux_layer_13": 0.068359375, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.08447265625, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.1011962890625, "loss_aux_layer_18": 0.109375, "loss_aux_layer_19": 0.11328125, "loss_aux_layer_2": 0.04095458984375, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.050048828125, "loss_aux_layer_4": 0.05242919921875, "loss_aux_layer_5": 0.05389404296875, "loss_aux_layer_6": 0.05670166015625, "loss_aux_layer_7": 0.05517578125, "loss_aux_layer_8": 0.05474853515625, "loss_aux_layer_9": 0.05364990234375, "step": 4129, "total_loss": 0.655689612030983 }, { "epoch": 0.8176598693328054, "grad_norm": 0.8563703298568726, "learning_rate": 5e-05, "llm_loss": 0.5644162893295288, "loss": 2.5795, "loss_aux_layer_0": 0.012939453125, "loss_aux_layer_1": 0.0311279296875, "loss_aux_layer_10": 0.058349609375, "loss_aux_layer_11": 0.0621337890625, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.0794677734375, "loss_aux_layer_15": 0.087890625, "loss_aux_layer_16": 0.0968017578125, "loss_aux_layer_17": 0.1043701171875, "loss_aux_layer_18": 0.1124267578125, "loss_aux_layer_19": 0.115478515625, "loss_aux_layer_2": 0.043212890625, "loss_aux_layer_20": 0.1231689453125, "loss_aux_layer_21": 0.1304931640625, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.0528564453125, "loss_aux_layer_4": 0.05523681640625, "loss_aux_layer_5": 0.0567626953125, "loss_aux_layer_6": 0.0596923828125, "loss_aux_layer_7": 0.05810546875, "loss_aux_layer_8": 0.05792236328125, "loss_aux_layer_9": 0.056884765625, "step": 4130, "total_loss": 0.6448869407176971 }, { "epoch": 0.8178578499307068, "grad_norm": 0.8371461629867554, "learning_rate": 5e-05, "llm_loss": 0.5286023318767548, "loss": 2.4304, "loss_aux_layer_0": 0.0120697021484375, "loss_aux_layer_1": 0.029510498046875, "loss_aux_layer_10": 0.05584716796875, "loss_aux_layer_11": 0.059814453125, "loss_aux_layer_12": 0.063720703125, "loss_aux_layer_13": 0.0689697265625, "loss_aux_layer_14": 0.077392578125, "loss_aux_layer_15": 0.0859375, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.1031494140625, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.114990234375, "loss_aux_layer_2": 0.0413818359375, "loss_aux_layer_20": 0.1231689453125, "loss_aux_layer_21": 0.1317138671875, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.0509033203125, "loss_aux_layer_4": 0.0533447265625, "loss_aux_layer_5": 0.054931640625, "loss_aux_layer_6": 0.05792236328125, "loss_aux_layer_7": 0.05621337890625, "loss_aux_layer_8": 0.05572509765625, "loss_aux_layer_9": 0.0546875, "step": 4131, "total_loss": 0.6075879484415054 }, { "epoch": 0.8180558305286082, "grad_norm": 0.9022294282913208, "learning_rate": 5e-05, "llm_loss": 0.5673736482858658, "loss": 2.5831, "loss_aux_layer_0": 0.0129241943359375, "loss_aux_layer_1": 0.031158447265625, "loss_aux_layer_10": 0.0567626953125, "loss_aux_layer_11": 0.0606689453125, "loss_aux_layer_12": 0.0648193359375, "loss_aux_layer_13": 0.06982421875, "loss_aux_layer_14": 0.0772705078125, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.1090087890625, "loss_aux_layer_19": 0.1114501953125, "loss_aux_layer_2": 0.04254150390625, "loss_aux_layer_20": 0.1190185546875, "loss_aux_layer_21": 0.1268310546875, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.18212890625, "loss_aux_layer_3": 0.05194091796875, "loss_aux_layer_4": 0.05438232421875, "loss_aux_layer_5": 0.05609130859375, "loss_aux_layer_6": 0.05902099609375, "loss_aux_layer_7": 0.0572509765625, "loss_aux_layer_8": 0.05670166015625, "loss_aux_layer_9": 0.055419921875, "step": 4132, "total_loss": 0.6457860916852951 }, { "epoch": 0.8182538111265096, "grad_norm": 0.7406571507453918, "learning_rate": 5e-05, "llm_loss": 0.5228613093495369, "loss": 2.433, "loss_aux_layer_0": 0.0113983154296875, "loss_aux_layer_1": 0.03375244140625, "loss_aux_layer_10": 0.0634765625, "loss_aux_layer_11": 0.06787109375, "loss_aux_layer_12": 0.0721435546875, "loss_aux_layer_13": 0.0777587890625, "loss_aux_layer_14": 0.08544921875, "loss_aux_layer_15": 0.09326171875, "loss_aux_layer_16": 0.101806640625, "loss_aux_layer_17": 0.1094970703125, "loss_aux_layer_18": 0.117431640625, "loss_aux_layer_19": 0.1199951171875, "loss_aux_layer_2": 0.047119140625, "loss_aux_layer_20": 0.127197265625, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.0577392578125, "loss_aux_layer_4": 0.06048583984375, "loss_aux_layer_5": 0.062255859375, "loss_aux_layer_6": 0.0654296875, "loss_aux_layer_7": 0.063720703125, "loss_aux_layer_8": 0.063232421875, "loss_aux_layer_9": 0.06201171875, "step": 4133, "total_loss": 0.6082569807767868 }, { "epoch": 0.818451791724411, "grad_norm": 0.9406149983406067, "learning_rate": 5e-05, "llm_loss": 0.6029146909713745, "loss": 2.7169, "loss_aux_layer_0": 0.0133514404296875, "loss_aux_layer_1": 0.0302734375, "loss_aux_layer_10": 0.05364990234375, "loss_aux_layer_11": 0.05743408203125, "loss_aux_layer_12": 0.06134033203125, "loss_aux_layer_13": 0.06646728515625, "loss_aux_layer_14": 0.074462890625, "loss_aux_layer_15": 0.0821533203125, "loss_aux_layer_16": 0.0909423828125, "loss_aux_layer_17": 0.0986328125, "loss_aux_layer_18": 0.106689453125, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.04132080078125, "loss_aux_layer_20": 0.1175537109375, "loss_aux_layer_21": 0.1253662109375, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.181396484375, "loss_aux_layer_3": 0.0504150390625, "loss_aux_layer_4": 0.05267333984375, "loss_aux_layer_5": 0.05413818359375, "loss_aux_layer_6": 0.05670166015625, "loss_aux_layer_7": 0.0548095703125, "loss_aux_layer_8": 0.05389404296875, "loss_aux_layer_9": 0.052490234375, "step": 4134, "total_loss": 0.6792238354682922 }, { "epoch": 0.8186497723223124, "grad_norm": 1.0791659355163574, "learning_rate": 5e-05, "llm_loss": 0.6575970500707626, "loss": 2.9556, "loss_aux_layer_0": 0.01190185546875, "loss_aux_layer_1": 0.0302734375, "loss_aux_layer_10": 0.05804443359375, "loss_aux_layer_11": 0.0621337890625, "loss_aux_layer_12": 0.0667724609375, "loss_aux_layer_13": 0.072265625, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.0892333984375, "loss_aux_layer_16": 0.098388671875, "loss_aux_layer_17": 0.1064453125, "loss_aux_layer_18": 0.114990234375, "loss_aux_layer_19": 0.11865234375, "loss_aux_layer_2": 0.04248046875, "loss_aux_layer_20": 0.12646484375, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.052001953125, "loss_aux_layer_4": 0.054443359375, "loss_aux_layer_5": 0.05621337890625, "loss_aux_layer_6": 0.05963134765625, "loss_aux_layer_7": 0.0577392578125, "loss_aux_layer_8": 0.0574951171875, "loss_aux_layer_9": 0.056640625, "step": 4135, "total_loss": 0.7388998717069626 }, { "epoch": 0.8188477529202138, "grad_norm": 0.9927948713302612, "learning_rate": 5e-05, "llm_loss": 0.5893582403659821, "loss": 2.6827, "loss_aux_layer_0": 0.0134124755859375, "loss_aux_layer_1": 0.031646728515625, "loss_aux_layer_10": 0.059326171875, "loss_aux_layer_11": 0.06298828125, "loss_aux_layer_12": 0.0675048828125, "loss_aux_layer_13": 0.0723876953125, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.0888671875, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.1129150390625, "loss_aux_layer_19": 0.1160888671875, "loss_aux_layer_2": 0.04437255859375, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.1312255859375, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.05419921875, "loss_aux_layer_4": 0.0565185546875, "loss_aux_layer_5": 0.0582275390625, "loss_aux_layer_6": 0.0615234375, "loss_aux_layer_7": 0.0594482421875, "loss_aux_layer_8": 0.058837890625, "loss_aux_layer_9": 0.05780029296875, "step": 4136, "total_loss": 0.6706862300634384 }, { "epoch": 0.8190457335181153, "grad_norm": 1.0868690013885498, "learning_rate": 5e-05, "llm_loss": 0.5500334352254868, "loss": 2.5125, "loss_aux_layer_0": 0.0123748779296875, "loss_aux_layer_1": 0.029541015625, "loss_aux_layer_10": 0.05523681640625, "loss_aux_layer_11": 0.05877685546875, "loss_aux_layer_12": 0.0631103515625, "loss_aux_layer_13": 0.068359375, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.084716796875, "loss_aux_layer_16": 0.09375, "loss_aux_layer_17": 0.101318359375, "loss_aux_layer_18": 0.109375, "loss_aux_layer_19": 0.113037109375, "loss_aux_layer_2": 0.04168701171875, "loss_aux_layer_20": 0.12109375, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.051025390625, "loss_aux_layer_4": 0.05322265625, "loss_aux_layer_5": 0.05462646484375, "loss_aux_layer_6": 0.0576171875, "loss_aux_layer_7": 0.0557861328125, "loss_aux_layer_8": 0.05517578125, "loss_aux_layer_9": 0.0540771484375, "step": 4137, "total_loss": 0.6281137615442276 }, { "epoch": 0.8192437141160166, "grad_norm": 1.2100716829299927, "learning_rate": 5e-05, "llm_loss": 0.6412305235862732, "loss": 2.8739, "loss_aux_layer_0": 0.0140228271484375, "loss_aux_layer_1": 0.029937744140625, "loss_aux_layer_10": 0.05487060546875, "loss_aux_layer_11": 0.05853271484375, "loss_aux_layer_12": 0.0626220703125, "loss_aux_layer_13": 0.0677490234375, "loss_aux_layer_14": 0.0758056640625, "loss_aux_layer_15": 0.083984375, "loss_aux_layer_16": 0.0927734375, "loss_aux_layer_17": 0.100341796875, "loss_aux_layer_18": 0.1083984375, "loss_aux_layer_19": 0.111572265625, "loss_aux_layer_2": 0.040771484375, "loss_aux_layer_20": 0.119873046875, "loss_aux_layer_21": 0.127197265625, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.18212890625, "loss_aux_layer_3": 0.04974365234375, "loss_aux_layer_4": 0.0521240234375, "loss_aux_layer_5": 0.05364990234375, "loss_aux_layer_6": 0.05670166015625, "loss_aux_layer_7": 0.05517578125, "loss_aux_layer_8": 0.05462646484375, "loss_aux_layer_9": 0.053466796875, "step": 4138, "total_loss": 0.7184684574604034 }, { "epoch": 0.819441694713918, "grad_norm": 1.018957495689392, "learning_rate": 5e-05, "llm_loss": 0.5921753346920013, "loss": 2.6999, "loss_aux_layer_0": 0.01202392578125, "loss_aux_layer_1": 0.031494140625, "loss_aux_layer_10": 0.05975341796875, "loss_aux_layer_11": 0.06402587890625, "loss_aux_layer_12": 0.068603515625, "loss_aux_layer_13": 0.0740966796875, "loss_aux_layer_14": 0.0823974609375, "loss_aux_layer_15": 0.090576171875, "loss_aux_layer_16": 0.0999755859375, "loss_aux_layer_17": 0.107666015625, "loss_aux_layer_18": 0.1156005859375, "loss_aux_layer_19": 0.11865234375, "loss_aux_layer_2": 0.04449462890625, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.05426025390625, "loss_aux_layer_4": 0.05694580078125, "loss_aux_layer_5": 0.05865478515625, "loss_aux_layer_6": 0.0618896484375, "loss_aux_layer_7": 0.06024169921875, "loss_aux_layer_8": 0.05950927734375, "loss_aux_layer_9": 0.058349609375, "step": 4139, "total_loss": 0.6749786883592606 }, { "epoch": 0.8196396753118195, "grad_norm": 1.0877997875213623, "learning_rate": 5e-05, "llm_loss": 0.5579981952905655, "loss": 2.5454, "loss_aux_layer_0": 0.0137481689453125, "loss_aux_layer_1": 0.02960205078125, "loss_aux_layer_10": 0.055419921875, "loss_aux_layer_11": 0.05908203125, "loss_aux_layer_12": 0.0633544921875, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.0938720703125, "loss_aux_layer_17": 0.1014404296875, "loss_aux_layer_18": 0.1097412109375, "loss_aux_layer_19": 0.11376953125, "loss_aux_layer_2": 0.04119873046875, "loss_aux_layer_20": 0.121826171875, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.0504150390625, "loss_aux_layer_4": 0.05279541015625, "loss_aux_layer_5": 0.0540771484375, "loss_aux_layer_6": 0.05694580078125, "loss_aux_layer_7": 0.05511474609375, "loss_aux_layer_8": 0.05487060546875, "loss_aux_layer_9": 0.0537109375, "step": 4140, "total_loss": 0.6363543123006821 }, { "epoch": 0.8198376559097208, "grad_norm": 1.014224648475647, "learning_rate": 5e-05, "llm_loss": 0.5610344707965851, "loss": 2.5656, "loss_aux_layer_0": 0.0120697021484375, "loss_aux_layer_1": 0.030517578125, "loss_aux_layer_10": 0.05718994140625, "loss_aux_layer_11": 0.06097412109375, "loss_aux_layer_12": 0.0654296875, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.0792236328125, "loss_aux_layer_15": 0.087646484375, "loss_aux_layer_16": 0.096923828125, "loss_aux_layer_17": 0.1048583984375, "loss_aux_layer_18": 0.11328125, "loss_aux_layer_19": 0.11669921875, "loss_aux_layer_2": 0.042236328125, "loss_aux_layer_20": 0.12451171875, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.0517578125, "loss_aux_layer_4": 0.054443359375, "loss_aux_layer_5": 0.0560302734375, "loss_aux_layer_6": 0.05877685546875, "loss_aux_layer_7": 0.0572509765625, "loss_aux_layer_8": 0.05718994140625, "loss_aux_layer_9": 0.05609130859375, "step": 4141, "total_loss": 0.6414123922586441 }, { "epoch": 0.8200356365076222, "grad_norm": 1.1526799201965332, "learning_rate": 5e-05, "llm_loss": 0.590417742729187, "loss": 2.675, "loss_aux_layer_0": 0.0132904052734375, "loss_aux_layer_1": 0.029815673828125, "loss_aux_layer_10": 0.054443359375, "loss_aux_layer_11": 0.05816650390625, "loss_aux_layer_12": 0.0623779296875, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.07666015625, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.0938720703125, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.110595703125, "loss_aux_layer_19": 0.114990234375, "loss_aux_layer_2": 0.04119873046875, "loss_aux_layer_20": 0.1234130859375, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.05035400390625, "loss_aux_layer_4": 0.05267333984375, "loss_aux_layer_5": 0.05401611328125, "loss_aux_layer_6": 0.05670166015625, "loss_aux_layer_7": 0.054931640625, "loss_aux_layer_8": 0.054443359375, "loss_aux_layer_9": 0.05322265625, "step": 4142, "total_loss": 0.6687444448471069 }, { "epoch": 0.8202336171055237, "grad_norm": 0.913476288318634, "learning_rate": 5e-05, "llm_loss": 0.49042901396751404, "loss": 2.2781, "loss_aux_layer_0": 0.011871337890625, "loss_aux_layer_1": 0.029571533203125, "loss_aux_layer_10": 0.056396484375, "loss_aux_layer_11": 0.0601806640625, "loss_aux_layer_12": 0.06439208984375, "loss_aux_layer_13": 0.0697021484375, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.0860595703125, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.111572265625, "loss_aux_layer_19": 0.114501953125, "loss_aux_layer_2": 0.04168701171875, "loss_aux_layer_20": 0.1221923828125, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.05096435546875, "loss_aux_layer_4": 0.05328369140625, "loss_aux_layer_5": 0.05487060546875, "loss_aux_layer_6": 0.05767822265625, "loss_aux_layer_7": 0.05633544921875, "loss_aux_layer_8": 0.05596923828125, "loss_aux_layer_9": 0.054931640625, "step": 4143, "total_loss": 0.5695260614156723 }, { "epoch": 0.8204315977034251, "grad_norm": 1.1679846048355103, "learning_rate": 5e-05, "llm_loss": 0.5389108955860138, "loss": 2.4663, "loss_aux_layer_0": 0.012054443359375, "loss_aux_layer_1": 0.02947998046875, "loss_aux_layer_10": 0.0545654296875, "loss_aux_layer_11": 0.05841064453125, "loss_aux_layer_12": 0.06298828125, "loss_aux_layer_13": 0.0682373046875, "loss_aux_layer_14": 0.076171875, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.1014404296875, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.041015625, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.05010986328125, "loss_aux_layer_4": 0.05230712890625, "loss_aux_layer_5": 0.0537109375, "loss_aux_layer_6": 0.05633544921875, "loss_aux_layer_7": 0.05487060546875, "loss_aux_layer_8": 0.0545654296875, "loss_aux_layer_9": 0.0533447265625, "step": 4144, "total_loss": 0.616567388176918 }, { "epoch": 0.8206295783013264, "grad_norm": 0.8994584679603577, "learning_rate": 5e-05, "llm_loss": 0.47612595558166504, "loss": 2.2372, "loss_aux_layer_0": 0.01141357421875, "loss_aux_layer_1": 0.030517578125, "loss_aux_layer_10": 0.05828857421875, "loss_aux_layer_11": 0.0631103515625, "loss_aux_layer_12": 0.068115234375, "loss_aux_layer_13": 0.0736083984375, "loss_aux_layer_14": 0.0830078125, "loss_aux_layer_15": 0.092041015625, "loss_aux_layer_16": 0.1016845703125, "loss_aux_layer_17": 0.109375, "loss_aux_layer_18": 0.117919921875, "loss_aux_layer_19": 0.1212158203125, "loss_aux_layer_2": 0.04388427734375, "loss_aux_layer_20": 0.129150390625, "loss_aux_layer_21": 0.137451171875, "loss_aux_layer_22": 0.15869140625, "loss_aux_layer_23": 0.196533203125, "loss_aux_layer_3": 0.05322265625, "loss_aux_layer_4": 0.05560302734375, "loss_aux_layer_5": 0.0572509765625, "loss_aux_layer_6": 0.060302734375, "loss_aux_layer_7": 0.05853271484375, "loss_aux_layer_8": 0.0579833984375, "loss_aux_layer_9": 0.05706787109375, "step": 4145, "total_loss": 0.5592942908406258 }, { "epoch": 0.8208275588992279, "grad_norm": 0.7882481813430786, "learning_rate": 5e-05, "llm_loss": 0.4931487888097763, "loss": 2.3013, "loss_aux_layer_0": 0.0119476318359375, "loss_aux_layer_1": 0.031585693359375, "loss_aux_layer_10": 0.0599365234375, "loss_aux_layer_11": 0.064208984375, "loss_aux_layer_12": 0.0687255859375, "loss_aux_layer_13": 0.07421875, "loss_aux_layer_14": 0.08203125, "loss_aux_layer_15": 0.090087890625, "loss_aux_layer_16": 0.0989990234375, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.11474609375, "loss_aux_layer_19": 0.117431640625, "loss_aux_layer_2": 0.04437255859375, "loss_aux_layer_20": 0.124267578125, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.0540771484375, "loss_aux_layer_4": 0.05670166015625, "loss_aux_layer_5": 0.05804443359375, "loss_aux_layer_6": 0.06109619140625, "loss_aux_layer_7": 0.0596923828125, "loss_aux_layer_8": 0.05938720703125, "loss_aux_layer_9": 0.0584716796875, "step": 4146, "total_loss": 0.5753356441855431 }, { "epoch": 0.8210255394971293, "grad_norm": 1.0137742757797241, "learning_rate": 5e-05, "llm_loss": 0.5885051041841507, "loss": 2.6721, "loss_aux_layer_0": 0.0118865966796875, "loss_aux_layer_1": 0.029815673828125, "loss_aux_layer_10": 0.0557861328125, "loss_aux_layer_11": 0.05963134765625, "loss_aux_layer_12": 0.0640869140625, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.0780029296875, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.0960693359375, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.1124267578125, "loss_aux_layer_19": 0.116455078125, "loss_aux_layer_2": 0.041748046875, "loss_aux_layer_20": 0.12451171875, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.05096435546875, "loss_aux_layer_4": 0.05322265625, "loss_aux_layer_5": 0.0548095703125, "loss_aux_layer_6": 0.0579833984375, "loss_aux_layer_7": 0.0562744140625, "loss_aux_layer_8": 0.0556640625, "loss_aux_layer_9": 0.05462646484375, "step": 4147, "total_loss": 0.6680219173431396 }, { "epoch": 0.8212235200950306, "grad_norm": 0.8620580434799194, "learning_rate": 5e-05, "llm_loss": 0.6408336609601974, "loss": 2.8941, "loss_aux_layer_0": 0.01141357421875, "loss_aux_layer_1": 0.031036376953125, "loss_aux_layer_10": 0.0596923828125, "loss_aux_layer_11": 0.0638427734375, "loss_aux_layer_12": 0.0687255859375, "loss_aux_layer_13": 0.07421875, "loss_aux_layer_14": 0.0821533203125, "loss_aux_layer_15": 0.0902099609375, "loss_aux_layer_16": 0.099853515625, "loss_aux_layer_17": 0.1077880859375, "loss_aux_layer_18": 0.115478515625, "loss_aux_layer_19": 0.1185302734375, "loss_aux_layer_2": 0.044189453125, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.05450439453125, "loss_aux_layer_4": 0.05706787109375, "loss_aux_layer_5": 0.0584716796875, "loss_aux_layer_6": 0.06134033203125, "loss_aux_layer_7": 0.05975341796875, "loss_aux_layer_8": 0.059326171875, "loss_aux_layer_9": 0.05810546875, "step": 4148, "total_loss": 0.7235361635684967 }, { "epoch": 0.8214215006929321, "grad_norm": 0.9898326992988586, "learning_rate": 5e-05, "llm_loss": 0.5699283629655838, "loss": 2.599, "loss_aux_layer_0": 0.012542724609375, "loss_aux_layer_1": 0.03009033203125, "loss_aux_layer_10": 0.05657958984375, "loss_aux_layer_11": 0.0606689453125, "loss_aux_layer_12": 0.06494140625, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.0791015625, "loss_aux_layer_15": 0.0875244140625, "loss_aux_layer_16": 0.0965576171875, "loss_aux_layer_17": 0.1044921875, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.04205322265625, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.05108642578125, "loss_aux_layer_4": 0.0535888671875, "loss_aux_layer_5": 0.05511474609375, "loss_aux_layer_6": 0.05804443359375, "loss_aux_layer_7": 0.05645751953125, "loss_aux_layer_8": 0.05615234375, "loss_aux_layer_9": 0.05523681640625, "step": 4149, "total_loss": 0.6497488170862198 }, { "epoch": 0.8216194812908335, "grad_norm": 0.8415452837944031, "learning_rate": 5e-05, "llm_loss": 0.5756430178880692, "loss": 2.6297, "loss_aux_layer_0": 0.01177978515625, "loss_aux_layer_1": 0.03094482421875, "loss_aux_layer_10": 0.05889892578125, "loss_aux_layer_11": 0.062744140625, "loss_aux_layer_12": 0.0670166015625, "loss_aux_layer_13": 0.0723876953125, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.0889892578125, "loss_aux_layer_16": 0.0982666015625, "loss_aux_layer_17": 0.1063232421875, "loss_aux_layer_18": 0.1146240234375, "loss_aux_layer_19": 0.117919921875, "loss_aux_layer_2": 0.04376220703125, "loss_aux_layer_20": 0.1253662109375, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.05340576171875, "loss_aux_layer_4": 0.05596923828125, "loss_aux_layer_5": 0.0577392578125, "loss_aux_layer_6": 0.0606689453125, "loss_aux_layer_7": 0.0589599609375, "loss_aux_layer_8": 0.05841064453125, "loss_aux_layer_9": 0.05755615234375, "step": 4150, "total_loss": 0.6574196815490723 }, { "epoch": 0.821817461888735, "grad_norm": 0.8236567974090576, "learning_rate": 5e-05, "llm_loss": 0.6137252897024155, "loss": 2.7734, "loss_aux_layer_0": 0.0119781494140625, "loss_aux_layer_1": 0.03131103515625, "loss_aux_layer_10": 0.0579833984375, "loss_aux_layer_11": 0.06182861328125, "loss_aux_layer_12": 0.06585693359375, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.086181640625, "loss_aux_layer_16": 0.0947265625, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.1136474609375, "loss_aux_layer_2": 0.0433349609375, "loss_aux_layer_20": 0.12109375, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.052978515625, "loss_aux_layer_4": 0.05584716796875, "loss_aux_layer_5": 0.0572509765625, "loss_aux_layer_6": 0.06024169921875, "loss_aux_layer_7": 0.0584716796875, "loss_aux_layer_8": 0.05792236328125, "loss_aux_layer_9": 0.05657958984375, "step": 4151, "total_loss": 0.6933498680591583 }, { "epoch": 0.8220154424866363, "grad_norm": 0.9602630138397217, "learning_rate": 5e-05, "llm_loss": 0.6675988733768463, "loss": 2.9887, "loss_aux_layer_0": 0.0122222900390625, "loss_aux_layer_1": 0.029998779296875, "loss_aux_layer_10": 0.0576171875, "loss_aux_layer_11": 0.0614013671875, "loss_aux_layer_12": 0.06573486328125, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.09521484375, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.1112060546875, "loss_aux_layer_19": 0.1148681640625, "loss_aux_layer_2": 0.041748046875, "loss_aux_layer_20": 0.12255859375, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.051513671875, "loss_aux_layer_4": 0.053955078125, "loss_aux_layer_5": 0.0555419921875, "loss_aux_layer_6": 0.05853271484375, "loss_aux_layer_7": 0.05694580078125, "loss_aux_layer_8": 0.05694580078125, "loss_aux_layer_9": 0.05609130859375, "step": 4152, "total_loss": 0.7471644580364227 }, { "epoch": 0.8222134230845377, "grad_norm": 0.931236207485199, "learning_rate": 5e-05, "llm_loss": 0.5629036128520966, "loss": 2.554, "loss_aux_layer_0": 0.0117340087890625, "loss_aux_layer_1": 0.02886962890625, "loss_aux_layer_10": 0.0533447265625, "loss_aux_layer_11": 0.0567626953125, "loss_aux_layer_12": 0.060791015625, "loss_aux_layer_13": 0.065673828125, "loss_aux_layer_14": 0.0736083984375, "loss_aux_layer_15": 0.081787109375, "loss_aux_layer_16": 0.0902099609375, "loss_aux_layer_17": 0.0982666015625, "loss_aux_layer_18": 0.1065673828125, "loss_aux_layer_19": 0.1102294921875, "loss_aux_layer_2": 0.0401611328125, "loss_aux_layer_20": 0.1182861328125, "loss_aux_layer_21": 0.126220703125, "loss_aux_layer_22": 0.144775390625, "loss_aux_layer_23": 0.1796875, "loss_aux_layer_3": 0.049072265625, "loss_aux_layer_4": 0.05133056640625, "loss_aux_layer_5": 0.052734375, "loss_aux_layer_6": 0.055419921875, "loss_aux_layer_7": 0.0535888671875, "loss_aux_layer_8": 0.05322265625, "loss_aux_layer_9": 0.05206298828125, "step": 4153, "total_loss": 0.6385006159543991 }, { "epoch": 0.8224114036824391, "grad_norm": 0.8005508184432983, "learning_rate": 5e-05, "llm_loss": 0.5590045154094696, "loss": 2.5533, "loss_aux_layer_0": 0.0115966796875, "loss_aux_layer_1": 0.02996826171875, "loss_aux_layer_10": 0.05657958984375, "loss_aux_layer_11": 0.0601806640625, "loss_aux_layer_12": 0.0643310546875, "loss_aux_layer_13": 0.069580078125, "loss_aux_layer_14": 0.07763671875, "loss_aux_layer_15": 0.0860595703125, "loss_aux_layer_16": 0.0955810546875, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.1114501953125, "loss_aux_layer_19": 0.114990234375, "loss_aux_layer_2": 0.04156494140625, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.1307373046875, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.05126953125, "loss_aux_layer_4": 0.05389404296875, "loss_aux_layer_5": 0.055419921875, "loss_aux_layer_6": 0.05841064453125, "loss_aux_layer_7": 0.05682373046875, "loss_aux_layer_8": 0.0562744140625, "loss_aux_layer_9": 0.05535888671875, "step": 4154, "total_loss": 0.6383309364318848 }, { "epoch": 0.8226093842803405, "grad_norm": 0.8804327249526978, "learning_rate": 5e-05, "llm_loss": 0.5984337627887726, "loss": 2.7263, "loss_aux_layer_0": 0.01129150390625, "loss_aux_layer_1": 0.0318603515625, "loss_aux_layer_10": 0.0611572265625, "loss_aux_layer_11": 0.0650634765625, "loss_aux_layer_12": 0.069580078125, "loss_aux_layer_13": 0.0748291015625, "loss_aux_layer_14": 0.0833740234375, "loss_aux_layer_15": 0.0914306640625, "loss_aux_layer_16": 0.1005859375, "loss_aux_layer_17": 0.1077880859375, "loss_aux_layer_18": 0.1151123046875, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.044677734375, "loss_aux_layer_20": 0.125, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.05511474609375, "loss_aux_layer_4": 0.0579833984375, "loss_aux_layer_5": 0.05975341796875, "loss_aux_layer_6": 0.06298828125, "loss_aux_layer_7": 0.06109619140625, "loss_aux_layer_8": 0.060791015625, "loss_aux_layer_9": 0.05975341796875, "step": 4155, "total_loss": 0.6815699189901352 }, { "epoch": 0.8228073648782419, "grad_norm": 0.8613168001174927, "learning_rate": 5e-05, "llm_loss": 0.5578354001045227, "loss": 2.5565, "loss_aux_layer_0": 0.0118865966796875, "loss_aux_layer_1": 0.031402587890625, "loss_aux_layer_10": 0.0589599609375, "loss_aux_layer_11": 0.06317138671875, "loss_aux_layer_12": 0.067626953125, "loss_aux_layer_13": 0.07275390625, "loss_aux_layer_14": 0.0810546875, "loss_aux_layer_15": 0.0889892578125, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.10546875, "loss_aux_layer_18": 0.113037109375, "loss_aux_layer_19": 0.1160888671875, "loss_aux_layer_2": 0.04388427734375, "loss_aux_layer_20": 0.12353515625, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.0540771484375, "loss_aux_layer_4": 0.056640625, "loss_aux_layer_5": 0.05828857421875, "loss_aux_layer_6": 0.061279296875, "loss_aux_layer_7": 0.05938720703125, "loss_aux_layer_8": 0.05902099609375, "loss_aux_layer_9": 0.0576171875, "step": 4156, "total_loss": 0.6391319036483765 }, { "epoch": 0.8230053454761433, "grad_norm": 0.8655795454978943, "learning_rate": 5e-05, "llm_loss": 0.546283945441246, "loss": 2.5084, "loss_aux_layer_0": 0.0111846923828125, "loss_aux_layer_1": 0.03155517578125, "loss_aux_layer_10": 0.05828857421875, "loss_aux_layer_11": 0.0626220703125, "loss_aux_layer_12": 0.0670166015625, "loss_aux_layer_13": 0.072265625, "loss_aux_layer_14": 0.0802001953125, "loss_aux_layer_15": 0.08837890625, "loss_aux_layer_16": 0.09716796875, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.1119384765625, "loss_aux_layer_19": 0.1148681640625, "loss_aux_layer_2": 0.04412841796875, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.05426025390625, "loss_aux_layer_4": 0.05657958984375, "loss_aux_layer_5": 0.0579833984375, "loss_aux_layer_6": 0.06072998046875, "loss_aux_layer_7": 0.05877685546875, "loss_aux_layer_8": 0.0582275390625, "loss_aux_layer_9": 0.05706787109375, "step": 4157, "total_loss": 0.6271006688475609 }, { "epoch": 0.8232033260740448, "grad_norm": 1.1109565496444702, "learning_rate": 5e-05, "llm_loss": 0.6112275272607803, "loss": 2.7735, "loss_aux_layer_0": 0.0121612548828125, "loss_aux_layer_1": 0.032379150390625, "loss_aux_layer_10": 0.05877685546875, "loss_aux_layer_11": 0.06298828125, "loss_aux_layer_12": 0.0677490234375, "loss_aux_layer_13": 0.0728759765625, "loss_aux_layer_14": 0.0809326171875, "loss_aux_layer_15": 0.089111328125, "loss_aux_layer_16": 0.09814453125, "loss_aux_layer_17": 0.10595703125, "loss_aux_layer_18": 0.1143798828125, "loss_aux_layer_19": 0.1181640625, "loss_aux_layer_2": 0.044677734375, "loss_aux_layer_20": 0.1260986328125, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.05450439453125, "loss_aux_layer_4": 0.056884765625, "loss_aux_layer_5": 0.05816650390625, "loss_aux_layer_6": 0.061279296875, "loss_aux_layer_7": 0.05914306640625, "loss_aux_layer_8": 0.0584716796875, "loss_aux_layer_9": 0.057373046875, "step": 4158, "total_loss": 0.693382054567337 }, { "epoch": 0.8234013066719461, "grad_norm": 0.8768792152404785, "learning_rate": 5e-05, "llm_loss": 0.5187248513102531, "loss": 2.4114, "loss_aux_layer_0": 0.0112152099609375, "loss_aux_layer_1": 0.0328369140625, "loss_aux_layer_10": 0.062255859375, "loss_aux_layer_11": 0.0665283203125, "loss_aux_layer_12": 0.071044921875, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.08447265625, "loss_aux_layer_15": 0.0924072265625, "loss_aux_layer_16": 0.10107421875, "loss_aux_layer_17": 0.108154296875, "loss_aux_layer_18": 0.115966796875, "loss_aux_layer_19": 0.118408203125, "loss_aux_layer_2": 0.04656982421875, "loss_aux_layer_20": 0.1256103515625, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.191162109375, "loss_aux_layer_3": 0.05712890625, "loss_aux_layer_4": 0.05963134765625, "loss_aux_layer_5": 0.06121826171875, "loss_aux_layer_6": 0.064208984375, "loss_aux_layer_7": 0.0625, "loss_aux_layer_8": 0.06182861328125, "loss_aux_layer_9": 0.060791015625, "step": 4159, "total_loss": 0.6028540208935738 }, { "epoch": 0.8235992872698475, "grad_norm": 1.0081508159637451, "learning_rate": 5e-05, "llm_loss": 0.5897421538829803, "loss": 2.6904, "loss_aux_layer_0": 0.0109710693359375, "loss_aux_layer_1": 0.0311279296875, "loss_aux_layer_10": 0.059326171875, "loss_aux_layer_11": 0.06329345703125, "loss_aux_layer_12": 0.0677490234375, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.081787109375, "loss_aux_layer_15": 0.0902099609375, "loss_aux_layer_16": 0.099853515625, "loss_aux_layer_17": 0.1080322265625, "loss_aux_layer_18": 0.1163330078125, "loss_aux_layer_19": 0.1197509765625, "loss_aux_layer_2": 0.04376220703125, "loss_aux_layer_20": 0.127685546875, "loss_aux_layer_21": 0.135986328125, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.1943359375, "loss_aux_layer_3": 0.05413818359375, "loss_aux_layer_4": 0.056640625, "loss_aux_layer_5": 0.05828857421875, "loss_aux_layer_6": 0.061279296875, "loss_aux_layer_7": 0.0596923828125, "loss_aux_layer_8": 0.05908203125, "loss_aux_layer_9": 0.05792236328125, "step": 4160, "total_loss": 0.6725912541151047 }, { "epoch": 0.823797267867749, "grad_norm": 1.2213678359985352, "learning_rate": 5e-05, "llm_loss": 0.5449960231781006, "loss": 2.4888, "loss_aux_layer_0": 0.0112762451171875, "loss_aux_layer_1": 0.02874755859375, "loss_aux_layer_10": 0.05474853515625, "loss_aux_layer_11": 0.0582275390625, "loss_aux_layer_12": 0.062744140625, "loss_aux_layer_13": 0.0679931640625, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.0843505859375, "loss_aux_layer_16": 0.0931396484375, "loss_aux_layer_17": 0.1011962890625, "loss_aux_layer_18": 0.1092529296875, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.0401611328125, "loss_aux_layer_20": 0.119384765625, "loss_aux_layer_21": 0.1270751953125, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.04962158203125, "loss_aux_layer_4": 0.05230712890625, "loss_aux_layer_5": 0.05401611328125, "loss_aux_layer_6": 0.05694580078125, "loss_aux_layer_7": 0.05523681640625, "loss_aux_layer_8": 0.0545654296875, "loss_aux_layer_9": 0.0535888671875, "step": 4161, "total_loss": 0.6222122609615326 }, { "epoch": 0.8239952484656504, "grad_norm": 1.1193782091140747, "learning_rate": 5e-05, "llm_loss": 0.5816308334469795, "loss": 2.6441, "loss_aux_layer_0": 0.01141357421875, "loss_aux_layer_1": 0.0301513671875, "loss_aux_layer_10": 0.056884765625, "loss_aux_layer_11": 0.060546875, "loss_aux_layer_12": 0.06494140625, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.0958251953125, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.1148681640625, "loss_aux_layer_2": 0.04168701171875, "loss_aux_layer_20": 0.1226806640625, "loss_aux_layer_21": 0.1302490234375, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.05133056640625, "loss_aux_layer_4": 0.05401611328125, "loss_aux_layer_5": 0.05584716796875, "loss_aux_layer_6": 0.0587158203125, "loss_aux_layer_7": 0.0570068359375, "loss_aux_layer_8": 0.056640625, "loss_aux_layer_9": 0.0555419921875, "step": 4162, "total_loss": 0.6610269844532013 }, { "epoch": 0.8241932290635517, "grad_norm": 1.2438536882400513, "learning_rate": 5e-05, "llm_loss": 0.5390439108014107, "loss": 2.4735, "loss_aux_layer_0": 0.0113067626953125, "loss_aux_layer_1": 0.0299072265625, "loss_aux_layer_10": 0.05712890625, "loss_aux_layer_11": 0.06085205078125, "loss_aux_layer_12": 0.06524658203125, "loss_aux_layer_13": 0.0703125, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.114013671875, "loss_aux_layer_2": 0.04241943359375, "loss_aux_layer_20": 0.1217041015625, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.0523681640625, "loss_aux_layer_4": 0.054931640625, "loss_aux_layer_5": 0.056396484375, "loss_aux_layer_6": 0.059326171875, "loss_aux_layer_7": 0.05755615234375, "loss_aux_layer_8": 0.05706787109375, "loss_aux_layer_9": 0.05584716796875, "step": 4163, "total_loss": 0.6183642596006393 }, { "epoch": 0.8243912096614532, "grad_norm": 1.1178710460662842, "learning_rate": 5e-05, "llm_loss": 0.5487970560789108, "loss": 2.5096, "loss_aux_layer_0": 0.0114288330078125, "loss_aux_layer_1": 0.029144287109375, "loss_aux_layer_10": 0.05511474609375, "loss_aux_layer_11": 0.05908203125, "loss_aux_layer_12": 0.06341552734375, "loss_aux_layer_13": 0.0684814453125, "loss_aux_layer_14": 0.076904296875, "loss_aux_layer_15": 0.0853271484375, "loss_aux_layer_16": 0.0946044921875, "loss_aux_layer_17": 0.1029052734375, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.114990234375, "loss_aux_layer_2": 0.0413818359375, "loss_aux_layer_20": 0.123046875, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.0501708984375, "loss_aux_layer_4": 0.05242919921875, "loss_aux_layer_5": 0.05401611328125, "loss_aux_layer_6": 0.0570068359375, "loss_aux_layer_7": 0.05523681640625, "loss_aux_layer_8": 0.05474853515625, "loss_aux_layer_9": 0.05364990234375, "step": 4164, "total_loss": 0.6274022310972214 }, { "epoch": 0.8245891902593546, "grad_norm": 0.833659291267395, "learning_rate": 5e-05, "llm_loss": 0.5497418791055679, "loss": 2.5176, "loss_aux_layer_0": 0.011688232421875, "loss_aux_layer_1": 0.031036376953125, "loss_aux_layer_10": 0.05804443359375, "loss_aux_layer_11": 0.0618896484375, "loss_aux_layer_12": 0.0662841796875, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.078857421875, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.1026611328125, "loss_aux_layer_18": 0.110595703125, "loss_aux_layer_19": 0.113525390625, "loss_aux_layer_2": 0.0428466796875, "loss_aux_layer_20": 0.120849609375, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.05267333984375, "loss_aux_layer_4": 0.0552978515625, "loss_aux_layer_5": 0.05718994140625, "loss_aux_layer_6": 0.06005859375, "loss_aux_layer_7": 0.058349609375, "loss_aux_layer_8": 0.057861328125, "loss_aux_layer_9": 0.05682373046875, "step": 4165, "total_loss": 0.6293980181217194 }, { "epoch": 0.824787170857256, "grad_norm": 1.3518030643463135, "learning_rate": 5e-05, "llm_loss": 0.6444862931966782, "loss": 2.8977, "loss_aux_layer_0": 0.0123138427734375, "loss_aux_layer_1": 0.030670166015625, "loss_aux_layer_10": 0.0572509765625, "loss_aux_layer_11": 0.0611572265625, "loss_aux_layer_12": 0.06524658203125, "loss_aux_layer_13": 0.07049560546875, "loss_aux_layer_14": 0.078369140625, "loss_aux_layer_15": 0.086669921875, "loss_aux_layer_16": 0.0953369140625, "loss_aux_layer_17": 0.1031494140625, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.04388427734375, "loss_aux_layer_20": 0.1221923828125, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.05340576171875, "loss_aux_layer_4": 0.0557861328125, "loss_aux_layer_5": 0.05712890625, "loss_aux_layer_6": 0.0599365234375, "loss_aux_layer_7": 0.0579833984375, "loss_aux_layer_8": 0.0572509765625, "loss_aux_layer_9": 0.0560302734375, "step": 4166, "total_loss": 0.7244251668453217 }, { "epoch": 0.8249851514551574, "grad_norm": 0.8623322248458862, "learning_rate": 5e-05, "llm_loss": 0.5811886340379715, "loss": 2.6482, "loss_aux_layer_0": 0.0115509033203125, "loss_aux_layer_1": 0.031402587890625, "loss_aux_layer_10": 0.0587158203125, "loss_aux_layer_11": 0.0628662109375, "loss_aux_layer_12": 0.0672607421875, "loss_aux_layer_13": 0.0728759765625, "loss_aux_layer_14": 0.0810546875, "loss_aux_layer_15": 0.0887451171875, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.112060546875, "loss_aux_layer_19": 0.1148681640625, "loss_aux_layer_2": 0.04388427734375, "loss_aux_layer_20": 0.1221923828125, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.05401611328125, "loss_aux_layer_4": 0.0565185546875, "loss_aux_layer_5": 0.05828857421875, "loss_aux_layer_6": 0.06103515625, "loss_aux_layer_7": 0.05926513671875, "loss_aux_layer_8": 0.05859375, "loss_aux_layer_9": 0.0574951171875, "step": 4167, "total_loss": 0.6620596721768379 }, { "epoch": 0.8251831320530588, "grad_norm": 0.9633111357688904, "learning_rate": 5e-05, "llm_loss": 0.5413801670074463, "loss": 2.4732, "loss_aux_layer_0": 0.0131683349609375, "loss_aux_layer_1": 0.029754638671875, "loss_aux_layer_10": 0.05322265625, "loss_aux_layer_11": 0.05670166015625, "loss_aux_layer_12": 0.06060791015625, "loss_aux_layer_13": 0.0657958984375, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.082275390625, "loss_aux_layer_16": 0.0919189453125, "loss_aux_layer_17": 0.1002197265625, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.0411376953125, "loss_aux_layer_20": 0.1207275390625, "loss_aux_layer_21": 0.1290283203125, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.0504150390625, "loss_aux_layer_4": 0.05224609375, "loss_aux_layer_5": 0.0537109375, "loss_aux_layer_6": 0.05596923828125, "loss_aux_layer_7": 0.053955078125, "loss_aux_layer_8": 0.05340576171875, "loss_aux_layer_9": 0.05218505859375, "step": 4168, "total_loss": 0.6182940155267715 }, { "epoch": 0.8253811126509603, "grad_norm": 1.3687851428985596, "learning_rate": 5e-05, "llm_loss": 0.512925460934639, "loss": 2.3781, "loss_aux_layer_0": 0.0119781494140625, "loss_aux_layer_1": 0.03082275390625, "loss_aux_layer_10": 0.0576171875, "loss_aux_layer_11": 0.06182861328125, "loss_aux_layer_12": 0.066650390625, "loss_aux_layer_13": 0.072265625, "loss_aux_layer_14": 0.0811767578125, "loss_aux_layer_15": 0.0897216796875, "loss_aux_layer_16": 0.099365234375, "loss_aux_layer_17": 0.107421875, "loss_aux_layer_18": 0.115478515625, "loss_aux_layer_19": 0.1190185546875, "loss_aux_layer_2": 0.04302978515625, "loss_aux_layer_20": 0.126708984375, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.0526123046875, "loss_aux_layer_4": 0.05517578125, "loss_aux_layer_5": 0.0565185546875, "loss_aux_layer_6": 0.05938720703125, "loss_aux_layer_7": 0.05755615234375, "loss_aux_layer_8": 0.05718994140625, "loss_aux_layer_9": 0.05615234375, "step": 4169, "total_loss": 0.5945164263248444 }, { "epoch": 0.8255790932488616, "grad_norm": 1.2117552757263184, "learning_rate": 5e-05, "llm_loss": 0.6280168741941452, "loss": 2.8327, "loss_aux_layer_0": 0.0141754150390625, "loss_aux_layer_1": 0.0301513671875, "loss_aux_layer_10": 0.056396484375, "loss_aux_layer_11": 0.060302734375, "loss_aux_layer_12": 0.0648193359375, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.0789794921875, "loss_aux_layer_15": 0.0875244140625, "loss_aux_layer_16": 0.0970458984375, "loss_aux_layer_17": 0.1044921875, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.1163330078125, "loss_aux_layer_2": 0.0428466796875, "loss_aux_layer_20": 0.1243896484375, "loss_aux_layer_21": 0.1324462890625, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.052001953125, "loss_aux_layer_4": 0.05438232421875, "loss_aux_layer_5": 0.0557861328125, "loss_aux_layer_6": 0.058837890625, "loss_aux_layer_7": 0.0570068359375, "loss_aux_layer_8": 0.0565185546875, "loss_aux_layer_9": 0.0552978515625, "step": 4170, "total_loss": 0.7081661522388458 }, { "epoch": 0.825777073846763, "grad_norm": 1.0571403503417969, "learning_rate": 5e-05, "llm_loss": 0.5709395110607147, "loss": 2.6104, "loss_aux_layer_0": 0.0117950439453125, "loss_aux_layer_1": 0.030670166015625, "loss_aux_layer_10": 0.05780029296875, "loss_aux_layer_11": 0.061767578125, "loss_aux_layer_12": 0.066162109375, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.080078125, "loss_aux_layer_15": 0.088623046875, "loss_aux_layer_16": 0.0982666015625, "loss_aux_layer_17": 0.1064453125, "loss_aux_layer_18": 0.1151123046875, "loss_aux_layer_19": 0.1185302734375, "loss_aux_layer_2": 0.0435791015625, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.1923828125, "loss_aux_layer_3": 0.053466796875, "loss_aux_layer_4": 0.0557861328125, "loss_aux_layer_5": 0.05743408203125, "loss_aux_layer_6": 0.0604248046875, "loss_aux_layer_7": 0.05853271484375, "loss_aux_layer_8": 0.05792236328125, "loss_aux_layer_9": 0.05682373046875, "step": 4171, "total_loss": 0.6525910198688507 }, { "epoch": 0.8259750544446645, "grad_norm": 1.0241962671279907, "learning_rate": 5e-05, "llm_loss": 0.6026272475719452, "loss": 2.7217, "loss_aux_layer_0": 0.0140380859375, "loss_aux_layer_1": 0.029449462890625, "loss_aux_layer_10": 0.0538330078125, "loss_aux_layer_11": 0.057373046875, "loss_aux_layer_12": 0.06146240234375, "loss_aux_layer_13": 0.0667724609375, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.110595703125, "loss_aux_layer_19": 0.1148681640625, "loss_aux_layer_2": 0.04058837890625, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.04986572265625, "loss_aux_layer_4": 0.05194091796875, "loss_aux_layer_5": 0.05352783203125, "loss_aux_layer_6": 0.05615234375, "loss_aux_layer_7": 0.05426025390625, "loss_aux_layer_8": 0.05377197265625, "loss_aux_layer_9": 0.05279541015625, "step": 4172, "total_loss": 0.6804337203502655 }, { "epoch": 0.8261730350425658, "grad_norm": 0.9315163493156433, "learning_rate": 5e-05, "llm_loss": 0.5493559837341309, "loss": 2.5276, "loss_aux_layer_0": 0.0127410888671875, "loss_aux_layer_1": 0.03277587890625, "loss_aux_layer_10": 0.05999755859375, "loss_aux_layer_11": 0.06402587890625, "loss_aux_layer_12": 0.068115234375, "loss_aux_layer_13": 0.0732421875, "loss_aux_layer_14": 0.0816650390625, "loss_aux_layer_15": 0.0892333984375, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.1053466796875, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.1170654296875, "loss_aux_layer_2": 0.04571533203125, "loss_aux_layer_20": 0.124755859375, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.191162109375, "loss_aux_layer_3": 0.05584716796875, "loss_aux_layer_4": 0.0584716796875, "loss_aux_layer_5": 0.06005859375, "loss_aux_layer_6": 0.06304931640625, "loss_aux_layer_7": 0.06103515625, "loss_aux_layer_8": 0.06036376953125, "loss_aux_layer_9": 0.058837890625, "step": 4173, "total_loss": 0.6319083273410797 }, { "epoch": 0.8263710156404672, "grad_norm": 0.9795733690261841, "learning_rate": 5e-05, "llm_loss": 0.6071034669876099, "loss": 2.7505, "loss_aux_layer_0": 0.012420654296875, "loss_aux_layer_1": 0.031402587890625, "loss_aux_layer_10": 0.05816650390625, "loss_aux_layer_11": 0.06195068359375, "loss_aux_layer_12": 0.0662841796875, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.080322265625, "loss_aux_layer_15": 0.08837890625, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.1148681640625, "loss_aux_layer_2": 0.04412841796875, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.0535888671875, "loss_aux_layer_4": 0.05633544921875, "loss_aux_layer_5": 0.05767822265625, "loss_aux_layer_6": 0.06072998046875, "loss_aux_layer_7": 0.05889892578125, "loss_aux_layer_8": 0.05804443359375, "loss_aux_layer_9": 0.056884765625, "step": 4174, "total_loss": 0.6876186579465866 }, { "epoch": 0.8265689962383687, "grad_norm": 0.7688695192337036, "learning_rate": 5e-05, "llm_loss": 0.5545324385166168, "loss": 2.5396, "loss_aux_layer_0": 0.0124053955078125, "loss_aux_layer_1": 0.02978515625, "loss_aux_layer_10": 0.05645751953125, "loss_aux_layer_11": 0.06060791015625, "loss_aux_layer_12": 0.0650634765625, "loss_aux_layer_13": 0.0709228515625, "loss_aux_layer_14": 0.07958984375, "loss_aux_layer_15": 0.0882568359375, "loss_aux_layer_16": 0.097412109375, "loss_aux_layer_17": 0.10595703125, "loss_aux_layer_18": 0.1142578125, "loss_aux_layer_19": 0.117919921875, "loss_aux_layer_2": 0.0416259765625, "loss_aux_layer_20": 0.12548828125, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.05096435546875, "loss_aux_layer_4": 0.05328369140625, "loss_aux_layer_5": 0.05499267578125, "loss_aux_layer_6": 0.0579833984375, "loss_aux_layer_7": 0.05633544921875, "loss_aux_layer_8": 0.05584716796875, "loss_aux_layer_9": 0.05523681640625, "step": 4175, "total_loss": 0.6348883658647537 }, { "epoch": 0.8267669768362701, "grad_norm": 0.9442148208618164, "learning_rate": 5e-05, "llm_loss": 0.5893797501921654, "loss": 2.6873, "loss_aux_layer_0": 0.0123291015625, "loss_aux_layer_1": 0.03253173828125, "loss_aux_layer_10": 0.059814453125, "loss_aux_layer_11": 0.06390380859375, "loss_aux_layer_12": 0.068359375, "loss_aux_layer_13": 0.07373046875, "loss_aux_layer_14": 0.0814208984375, "loss_aux_layer_15": 0.0897216796875, "loss_aux_layer_16": 0.0986328125, "loss_aux_layer_17": 0.10595703125, "loss_aux_layer_18": 0.1138916015625, "loss_aux_layer_19": 0.116943359375, "loss_aux_layer_2": 0.0452880859375, "loss_aux_layer_20": 0.1246337890625, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.05560302734375, "loss_aux_layer_4": 0.0576171875, "loss_aux_layer_5": 0.05914306640625, "loss_aux_layer_6": 0.061767578125, "loss_aux_layer_7": 0.06005859375, "loss_aux_layer_8": 0.05963134765625, "loss_aux_layer_9": 0.05853271484375, "step": 4176, "total_loss": 0.6718273907899857 }, { "epoch": 0.8269649574341714, "grad_norm": 1.0848253965377808, "learning_rate": 5e-05, "llm_loss": 0.6040484756231308, "loss": 2.7401, "loss_aux_layer_0": 0.0118865966796875, "loss_aux_layer_1": 0.03076171875, "loss_aux_layer_10": 0.0574951171875, "loss_aux_layer_11": 0.06146240234375, "loss_aux_layer_12": 0.0655517578125, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.079345703125, "loss_aux_layer_15": 0.0880126953125, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.1053466796875, "loss_aux_layer_18": 0.11376953125, "loss_aux_layer_19": 0.11669921875, "loss_aux_layer_2": 0.04388427734375, "loss_aux_layer_20": 0.1248779296875, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.05352783203125, "loss_aux_layer_4": 0.055908203125, "loss_aux_layer_5": 0.05755615234375, "loss_aux_layer_6": 0.06060791015625, "loss_aux_layer_7": 0.05859375, "loss_aux_layer_8": 0.05767822265625, "loss_aux_layer_9": 0.05633544921875, "step": 4177, "total_loss": 0.6850128918886185 }, { "epoch": 0.8271629380320729, "grad_norm": 0.980894148349762, "learning_rate": 5e-05, "llm_loss": 0.5885603427886963, "loss": 2.6723, "loss_aux_layer_0": 0.0126953125, "loss_aux_layer_1": 0.03082275390625, "loss_aux_layer_10": 0.0570068359375, "loss_aux_layer_11": 0.06060791015625, "loss_aux_layer_12": 0.06475830078125, "loss_aux_layer_13": 0.06982421875, "loss_aux_layer_14": 0.077880859375, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.0946044921875, "loss_aux_layer_17": 0.10205078125, "loss_aux_layer_18": 0.110595703125, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.04278564453125, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.05242919921875, "loss_aux_layer_4": 0.05487060546875, "loss_aux_layer_5": 0.05645751953125, "loss_aux_layer_6": 0.0595703125, "loss_aux_layer_7": 0.057861328125, "loss_aux_layer_8": 0.05712890625, "loss_aux_layer_9": 0.05596923828125, "step": 4178, "total_loss": 0.6680745035409927 }, { "epoch": 0.8273609186299743, "grad_norm": 0.7454933524131775, "learning_rate": 5e-05, "llm_loss": 0.5630523711442947, "loss": 2.571, "loss_aux_layer_0": 0.011871337890625, "loss_aux_layer_1": 0.030517578125, "loss_aux_layer_10": 0.0572509765625, "loss_aux_layer_11": 0.06109619140625, "loss_aux_layer_12": 0.0653076171875, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.0789794921875, "loss_aux_layer_15": 0.08740234375, "loss_aux_layer_16": 0.096435546875, "loss_aux_layer_17": 0.1041259765625, "loss_aux_layer_18": 0.1119384765625, "loss_aux_layer_19": 0.114501953125, "loss_aux_layer_2": 0.04241943359375, "loss_aux_layer_20": 0.121826171875, "loss_aux_layer_21": 0.1295166015625, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.05230712890625, "loss_aux_layer_4": 0.05499267578125, "loss_aux_layer_5": 0.05682373046875, "loss_aux_layer_6": 0.05975341796875, "loss_aux_layer_7": 0.0576171875, "loss_aux_layer_8": 0.05694580078125, "loss_aux_layer_9": 0.05609130859375, "step": 4179, "total_loss": 0.6427495777606964 }, { "epoch": 0.8275588992278756, "grad_norm": 1.192772626876831, "learning_rate": 5e-05, "llm_loss": 0.6151126772165298, "loss": 2.7821, "loss_aux_layer_0": 0.0127105712890625, "loss_aux_layer_1": 0.031402587890625, "loss_aux_layer_10": 0.0577392578125, "loss_aux_layer_11": 0.0616455078125, "loss_aux_layer_12": 0.066162109375, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.07958984375, "loss_aux_layer_15": 0.087158203125, "loss_aux_layer_16": 0.0960693359375, "loss_aux_layer_17": 0.1036376953125, "loss_aux_layer_18": 0.11181640625, "loss_aux_layer_19": 0.1151123046875, "loss_aux_layer_2": 0.0440673828125, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.0538330078125, "loss_aux_layer_4": 0.05609130859375, "loss_aux_layer_5": 0.05743408203125, "loss_aux_layer_6": 0.0604248046875, "loss_aux_layer_7": 0.05841064453125, "loss_aux_layer_8": 0.0576171875, "loss_aux_layer_9": 0.05645751953125, "step": 4180, "total_loss": 0.695519894361496 }, { "epoch": 0.8277568798257771, "grad_norm": 1.0756176710128784, "learning_rate": 5e-05, "llm_loss": 0.47803372144699097, "loss": 2.2274, "loss_aux_layer_0": 0.012786865234375, "loss_aux_layer_1": 0.030120849609375, "loss_aux_layer_10": 0.0550537109375, "loss_aux_layer_11": 0.05877685546875, "loss_aux_layer_12": 0.0633544921875, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.086181640625, "loss_aux_layer_16": 0.095947265625, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.1156005859375, "loss_aux_layer_2": 0.041748046875, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.1309814453125, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.0506591796875, "loss_aux_layer_4": 0.052734375, "loss_aux_layer_5": 0.053955078125, "loss_aux_layer_6": 0.05694580078125, "loss_aux_layer_7": 0.0550537109375, "loss_aux_layer_8": 0.0545654296875, "loss_aux_layer_9": 0.0537109375, "step": 4181, "total_loss": 0.556859590113163 }, { "epoch": 0.8279548604236785, "grad_norm": 1.0068777799606323, "learning_rate": 5e-05, "llm_loss": 0.5987948030233383, "loss": 2.7147, "loss_aux_layer_0": 0.01275634765625, "loss_aux_layer_1": 0.03131103515625, "loss_aux_layer_10": 0.057373046875, "loss_aux_layer_11": 0.0609130859375, "loss_aux_layer_12": 0.0650634765625, "loss_aux_layer_13": 0.0703125, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.0865478515625, "loss_aux_layer_16": 0.0953369140625, "loss_aux_layer_17": 0.10302734375, "loss_aux_layer_18": 0.11083984375, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.04266357421875, "loss_aux_layer_20": 0.1234130859375, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.05242919921875, "loss_aux_layer_4": 0.05487060546875, "loss_aux_layer_5": 0.0562744140625, "loss_aux_layer_6": 0.05926513671875, "loss_aux_layer_7": 0.0574951171875, "loss_aux_layer_8": 0.0570068359375, "loss_aux_layer_9": 0.0560302734375, "step": 4182, "total_loss": 0.6786779910326004 }, { "epoch": 0.8281528410215799, "grad_norm": 0.9463549852371216, "learning_rate": 5e-05, "llm_loss": 0.5068760365247726, "loss": 2.3423, "loss_aux_layer_0": 0.011383056640625, "loss_aux_layer_1": 0.02960205078125, "loss_aux_layer_10": 0.055908203125, "loss_aux_layer_11": 0.059814453125, "loss_aux_layer_12": 0.06451416015625, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.0780029296875, "loss_aux_layer_15": 0.0860595703125, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.10302734375, "loss_aux_layer_18": 0.11181640625, "loss_aux_layer_19": 0.115234375, "loss_aux_layer_2": 0.0411376953125, "loss_aux_layer_20": 0.1229248046875, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.05029296875, "loss_aux_layer_4": 0.0526123046875, "loss_aux_layer_5": 0.05401611328125, "loss_aux_layer_6": 0.0567626953125, "loss_aux_layer_7": 0.05535888671875, "loss_aux_layer_8": 0.05517578125, "loss_aux_layer_9": 0.05438232421875, "step": 4183, "total_loss": 0.5855748951435089 }, { "epoch": 0.8283508216194813, "grad_norm": 1.0962417125701904, "learning_rate": 5e-05, "llm_loss": 0.6240168064832687, "loss": 2.8184, "loss_aux_layer_0": 0.0128326416015625, "loss_aux_layer_1": 0.031097412109375, "loss_aux_layer_10": 0.05767822265625, "loss_aux_layer_11": 0.061279296875, "loss_aux_layer_12": 0.06561279296875, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.0794677734375, "loss_aux_layer_15": 0.087646484375, "loss_aux_layer_16": 0.09619140625, "loss_aux_layer_17": 0.104248046875, "loss_aux_layer_18": 0.1119384765625, "loss_aux_layer_19": 0.115478515625, "loss_aux_layer_2": 0.0428466796875, "loss_aux_layer_20": 0.1231689453125, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.052734375, "loss_aux_layer_4": 0.0552978515625, "loss_aux_layer_5": 0.0570068359375, "loss_aux_layer_6": 0.05987548828125, "loss_aux_layer_7": 0.05810546875, "loss_aux_layer_8": 0.05755615234375, "loss_aux_layer_9": 0.056640625, "step": 4184, "total_loss": 0.7046018838882446 }, { "epoch": 0.8285488022173827, "grad_norm": 0.9051617980003357, "learning_rate": 5e-05, "llm_loss": 0.5859013497829437, "loss": 2.6636, "loss_aux_layer_0": 0.0115509033203125, "loss_aux_layer_1": 0.030853271484375, "loss_aux_layer_10": 0.05682373046875, "loss_aux_layer_11": 0.0606689453125, "loss_aux_layer_12": 0.06488037109375, "loss_aux_layer_13": 0.0704345703125, "loss_aux_layer_14": 0.078857421875, "loss_aux_layer_15": 0.0875244140625, "loss_aux_layer_16": 0.096923828125, "loss_aux_layer_17": 0.1043701171875, "loss_aux_layer_18": 0.112548828125, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.04229736328125, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.052001953125, "loss_aux_layer_4": 0.0545654296875, "loss_aux_layer_5": 0.056396484375, "loss_aux_layer_6": 0.05938720703125, "loss_aux_layer_7": 0.0574951171875, "loss_aux_layer_8": 0.05694580078125, "loss_aux_layer_9": 0.05548095703125, "step": 4185, "total_loss": 0.6658883094787598 }, { "epoch": 0.8287467828152841, "grad_norm": 0.9782636165618896, "learning_rate": 5e-05, "llm_loss": 0.5314618051052094, "loss": 2.443, "loss_aux_layer_0": 0.0113525390625, "loss_aux_layer_1": 0.029693603515625, "loss_aux_layer_10": 0.056884765625, "loss_aux_layer_11": 0.060791015625, "loss_aux_layer_12": 0.0648193359375, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.0863037109375, "loss_aux_layer_16": 0.095703125, "loss_aux_layer_17": 0.103515625, "loss_aux_layer_18": 0.1116943359375, "loss_aux_layer_19": 0.11474609375, "loss_aux_layer_2": 0.04156494140625, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.05126953125, "loss_aux_layer_4": 0.0535888671875, "loss_aux_layer_5": 0.05548095703125, "loss_aux_layer_6": 0.05889892578125, "loss_aux_layer_7": 0.05712890625, "loss_aux_layer_8": 0.05670166015625, "loss_aux_layer_9": 0.0556640625, "step": 4186, "total_loss": 0.6107429563999176 }, { "epoch": 0.8289447634131855, "grad_norm": 0.8988375067710876, "learning_rate": 5e-05, "llm_loss": 0.5133060589432716, "loss": 2.3736, "loss_aux_layer_0": 0.0113372802734375, "loss_aux_layer_1": 0.0306396484375, "loss_aux_layer_10": 0.0572509765625, "loss_aux_layer_11": 0.06109619140625, "loss_aux_layer_12": 0.06561279296875, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.0789794921875, "loss_aux_layer_15": 0.087158203125, "loss_aux_layer_16": 0.09619140625, "loss_aux_layer_17": 0.1031494140625, "loss_aux_layer_18": 0.1109619140625, "loss_aux_layer_19": 0.114501953125, "loss_aux_layer_2": 0.04345703125, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.0531005859375, "loss_aux_layer_4": 0.0555419921875, "loss_aux_layer_5": 0.05718994140625, "loss_aux_layer_6": 0.06011962890625, "loss_aux_layer_7": 0.05780029296875, "loss_aux_layer_8": 0.0572509765625, "loss_aux_layer_9": 0.05596923828125, "step": 4187, "total_loss": 0.5934028774499893 }, { "epoch": 0.8291427440110869, "grad_norm": 1.1223869323730469, "learning_rate": 5e-05, "llm_loss": 0.6066252663731575, "loss": 2.738, "loss_aux_layer_0": 0.0122222900390625, "loss_aux_layer_1": 0.028961181640625, "loss_aux_layer_10": 0.054931640625, "loss_aux_layer_11": 0.05828857421875, "loss_aux_layer_12": 0.062744140625, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.094482421875, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.1109619140625, "loss_aux_layer_19": 0.1143798828125, "loss_aux_layer_2": 0.0401611328125, "loss_aux_layer_20": 0.1221923828125, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.0489501953125, "loss_aux_layer_4": 0.05145263671875, "loss_aux_layer_5": 0.0533447265625, "loss_aux_layer_6": 0.05633544921875, "loss_aux_layer_7": 0.05462646484375, "loss_aux_layer_8": 0.05419921875, "loss_aux_layer_9": 0.05340576171875, "step": 4188, "total_loss": 0.6845073103904724 }, { "epoch": 0.8293407246089883, "grad_norm": 0.8742420673370361, "learning_rate": 5e-05, "llm_loss": 0.5424052774906158, "loss": 2.483, "loss_aux_layer_0": 0.011383056640625, "loss_aux_layer_1": 0.0299072265625, "loss_aux_layer_10": 0.0556640625, "loss_aux_layer_11": 0.05963134765625, "loss_aux_layer_12": 0.063720703125, "loss_aux_layer_13": 0.06884765625, "loss_aux_layer_14": 0.076904296875, "loss_aux_layer_15": 0.08447265625, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.100341796875, "loss_aux_layer_18": 0.1083984375, "loss_aux_layer_19": 0.113037109375, "loss_aux_layer_2": 0.041748046875, "loss_aux_layer_20": 0.1214599609375, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.05120849609375, "loss_aux_layer_4": 0.05352783203125, "loss_aux_layer_5": 0.054931640625, "loss_aux_layer_6": 0.0577392578125, "loss_aux_layer_7": 0.0560302734375, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.0545654296875, "step": 4189, "total_loss": 0.6207571625709534 }, { "epoch": 0.8295387052068898, "grad_norm": 0.8914000988006592, "learning_rate": 5e-05, "llm_loss": 0.5971149057149887, "loss": 2.7053, "loss_aux_layer_0": 0.012359619140625, "loss_aux_layer_1": 0.03076171875, "loss_aux_layer_10": 0.05615234375, "loss_aux_layer_11": 0.0601806640625, "loss_aux_layer_12": 0.0643310546875, "loss_aux_layer_13": 0.0699462890625, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.0865478515625, "loss_aux_layer_16": 0.0958251953125, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.114501953125, "loss_aux_layer_2": 0.04302978515625, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.1295166015625, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.0523681640625, "loss_aux_layer_4": 0.054443359375, "loss_aux_layer_5": 0.0557861328125, "loss_aux_layer_6": 0.05853271484375, "loss_aux_layer_7": 0.056640625, "loss_aux_layer_8": 0.05621337890625, "loss_aux_layer_9": 0.0550537109375, "step": 4190, "total_loss": 0.6763321757316589 }, { "epoch": 0.8297366858047911, "grad_norm": 1.1915420293807983, "learning_rate": 5e-05, "llm_loss": 0.5818125829100609, "loss": 2.6387, "loss_aux_layer_0": 0.0112457275390625, "loss_aux_layer_1": 0.0294189453125, "loss_aux_layer_10": 0.0555419921875, "loss_aux_layer_11": 0.05950927734375, "loss_aux_layer_12": 0.0635986328125, "loss_aux_layer_13": 0.0682373046875, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.0849609375, "loss_aux_layer_16": 0.09423828125, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.109130859375, "loss_aux_layer_19": 0.1121826171875, "loss_aux_layer_2": 0.04144287109375, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.1275634765625, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.18310546875, "loss_aux_layer_3": 0.05035400390625, "loss_aux_layer_4": 0.05291748046875, "loss_aux_layer_5": 0.0546875, "loss_aux_layer_6": 0.05792236328125, "loss_aux_layer_7": 0.05621337890625, "loss_aux_layer_8": 0.0556640625, "loss_aux_layer_9": 0.0543212890625, "step": 4191, "total_loss": 0.6596740782260895 }, { "epoch": 0.8299346664026925, "grad_norm": 0.9443314075469971, "learning_rate": 5e-05, "llm_loss": 0.4949248731136322, "loss": 2.3027, "loss_aux_layer_0": 0.0120849609375, "loss_aux_layer_1": 0.030517578125, "loss_aux_layer_10": 0.057861328125, "loss_aux_layer_11": 0.0615234375, "loss_aux_layer_12": 0.0654296875, "loss_aux_layer_13": 0.0704345703125, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.087158203125, "loss_aux_layer_16": 0.0968017578125, "loss_aux_layer_17": 0.1043701171875, "loss_aux_layer_18": 0.113037109375, "loss_aux_layer_19": 0.116455078125, "loss_aux_layer_2": 0.04376220703125, "loss_aux_layer_20": 0.12451171875, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.05316162109375, "loss_aux_layer_4": 0.0552978515625, "loss_aux_layer_5": 0.056884765625, "loss_aux_layer_6": 0.06005859375, "loss_aux_layer_7": 0.05828857421875, "loss_aux_layer_8": 0.057861328125, "loss_aux_layer_9": 0.0567626953125, "step": 4192, "total_loss": 0.5756644159555435 }, { "epoch": 0.830132647000594, "grad_norm": 0.9147481322288513, "learning_rate": 5e-05, "llm_loss": 0.5265991687774658, "loss": 2.4285, "loss_aux_layer_0": 0.011566162109375, "loss_aux_layer_1": 0.03155517578125, "loss_aux_layer_10": 0.0579833984375, "loss_aux_layer_11": 0.06219482421875, "loss_aux_layer_12": 0.066650390625, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.089111328125, "loss_aux_layer_16": 0.09814453125, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.113037109375, "loss_aux_layer_19": 0.1153564453125, "loss_aux_layer_2": 0.04351806640625, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.05322265625, "loss_aux_layer_4": 0.05572509765625, "loss_aux_layer_5": 0.057373046875, "loss_aux_layer_6": 0.0604248046875, "loss_aux_layer_7": 0.058349609375, "loss_aux_layer_8": 0.0577392578125, "loss_aux_layer_9": 0.05657958984375, "step": 4193, "total_loss": 0.6071361750364304 }, { "epoch": 0.8303306275984953, "grad_norm": 0.8186060786247253, "learning_rate": 5e-05, "llm_loss": 0.6113146841526031, "loss": 2.7593, "loss_aux_layer_0": 0.01202392578125, "loss_aux_layer_1": 0.0294189453125, "loss_aux_layer_10": 0.0555419921875, "loss_aux_layer_11": 0.05926513671875, "loss_aux_layer_12": 0.06353759765625, "loss_aux_layer_13": 0.06884765625, "loss_aux_layer_14": 0.0772705078125, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.0943603515625, "loss_aux_layer_17": 0.102294921875, "loss_aux_layer_18": 0.1104736328125, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.04058837890625, "loss_aux_layer_20": 0.122314453125, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.050048828125, "loss_aux_layer_4": 0.0528564453125, "loss_aux_layer_5": 0.05438232421875, "loss_aux_layer_6": 0.05743408203125, "loss_aux_layer_7": 0.05572509765625, "loss_aux_layer_8": 0.05535888671875, "loss_aux_layer_9": 0.05419921875, "step": 4194, "total_loss": 0.6898245811462402 }, { "epoch": 0.8305286081963967, "grad_norm": 1.1692196130752563, "learning_rate": 5e-05, "llm_loss": 0.6308818310499191, "loss": 2.8648, "loss_aux_layer_0": 0.011566162109375, "loss_aux_layer_1": 0.03228759765625, "loss_aux_layer_10": 0.0618896484375, "loss_aux_layer_11": 0.06573486328125, "loss_aux_layer_12": 0.0706787109375, "loss_aux_layer_13": 0.07666015625, "loss_aux_layer_14": 0.0858154296875, "loss_aux_layer_15": 0.0948486328125, "loss_aux_layer_16": 0.1043701171875, "loss_aux_layer_17": 0.1126708984375, "loss_aux_layer_18": 0.12060546875, "loss_aux_layer_19": 0.122314453125, "loss_aux_layer_2": 0.046142578125, "loss_aux_layer_20": 0.1295166015625, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.05621337890625, "loss_aux_layer_4": 0.0589599609375, "loss_aux_layer_5": 0.0606689453125, "loss_aux_layer_6": 0.06396484375, "loss_aux_layer_7": 0.06195068359375, "loss_aux_layer_8": 0.061767578125, "loss_aux_layer_9": 0.06060791015625, "step": 4195, "total_loss": 0.7161968946456909 }, { "epoch": 0.8307265887942982, "grad_norm": 0.8537341952323914, "learning_rate": 5e-05, "llm_loss": 0.5849536955356598, "loss": 2.6654, "loss_aux_layer_0": 0.011322021484375, "loss_aux_layer_1": 0.030975341796875, "loss_aux_layer_10": 0.05889892578125, "loss_aux_layer_11": 0.06304931640625, "loss_aux_layer_12": 0.0673828125, "loss_aux_layer_13": 0.0728759765625, "loss_aux_layer_14": 0.0811767578125, "loss_aux_layer_15": 0.0892333984375, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.105712890625, "loss_aux_layer_18": 0.1142578125, "loss_aux_layer_19": 0.1163330078125, "loss_aux_layer_2": 0.043701171875, "loss_aux_layer_20": 0.1239013671875, "loss_aux_layer_21": 0.1312255859375, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.053466796875, "loss_aux_layer_4": 0.05609130859375, "loss_aux_layer_5": 0.05780029296875, "loss_aux_layer_6": 0.060791015625, "loss_aux_layer_7": 0.05914306640625, "loss_aux_layer_8": 0.0587158203125, "loss_aux_layer_9": 0.0576171875, "step": 4196, "total_loss": 0.6663405448198318 }, { "epoch": 0.8309245693921996, "grad_norm": 0.98292076587677, "learning_rate": 5e-05, "llm_loss": 0.5719539672136307, "loss": 2.602, "loss_aux_layer_0": 0.0112762451171875, "loss_aux_layer_1": 0.029144287109375, "loss_aux_layer_10": 0.05535888671875, "loss_aux_layer_11": 0.059326171875, "loss_aux_layer_12": 0.0633544921875, "loss_aux_layer_13": 0.0687255859375, "loss_aux_layer_14": 0.076904296875, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.095458984375, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.11181640625, "loss_aux_layer_19": 0.115478515625, "loss_aux_layer_2": 0.0408935546875, "loss_aux_layer_20": 0.1239013671875, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.05047607421875, "loss_aux_layer_4": 0.052734375, "loss_aux_layer_5": 0.05419921875, "loss_aux_layer_6": 0.05682373046875, "loss_aux_layer_7": 0.0552978515625, "loss_aux_layer_8": 0.05499267578125, "loss_aux_layer_9": 0.0540771484375, "step": 4197, "total_loss": 0.6504908502101898 }, { "epoch": 0.8311225499901009, "grad_norm": 1.081289291381836, "learning_rate": 5e-05, "llm_loss": 0.5062451213598251, "loss": 2.3343, "loss_aux_layer_0": 0.011322021484375, "loss_aux_layer_1": 0.028900146484375, "loss_aux_layer_10": 0.05462646484375, "loss_aux_layer_11": 0.05828857421875, "loss_aux_layer_12": 0.0625, "loss_aux_layer_13": 0.0677490234375, "loss_aux_layer_14": 0.07568359375, "loss_aux_layer_15": 0.0836181640625, "loss_aux_layer_16": 0.092529296875, "loss_aux_layer_17": 0.100341796875, "loss_aux_layer_18": 0.1085205078125, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.041259765625, "loss_aux_layer_20": 0.1195068359375, "loss_aux_layer_21": 0.128173828125, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.050537109375, "loss_aux_layer_4": 0.052734375, "loss_aux_layer_5": 0.05413818359375, "loss_aux_layer_6": 0.056884765625, "loss_aux_layer_7": 0.055419921875, "loss_aux_layer_8": 0.0546875, "loss_aux_layer_9": 0.0535888671875, "step": 4198, "total_loss": 0.5835869163274765 }, { "epoch": 0.8313205305880024, "grad_norm": 0.9995447397232056, "learning_rate": 5e-05, "llm_loss": 0.6088822185993195, "loss": 2.7565, "loss_aux_layer_0": 0.0115509033203125, "loss_aux_layer_1": 0.031036376953125, "loss_aux_layer_10": 0.05743408203125, "loss_aux_layer_11": 0.06158447265625, "loss_aux_layer_12": 0.066162109375, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.080322265625, "loss_aux_layer_15": 0.0885009765625, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.104736328125, "loss_aux_layer_18": 0.112060546875, "loss_aux_layer_19": 0.115234375, "loss_aux_layer_2": 0.0433349609375, "loss_aux_layer_20": 0.1229248046875, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.0531005859375, "loss_aux_layer_4": 0.05560302734375, "loss_aux_layer_5": 0.05682373046875, "loss_aux_layer_6": 0.0599365234375, "loss_aux_layer_7": 0.05780029296875, "loss_aux_layer_8": 0.05718994140625, "loss_aux_layer_9": 0.0560302734375, "step": 4199, "total_loss": 0.6891291588544846 }, { "epoch": 0.8315185111859038, "grad_norm": 0.9345650672912598, "learning_rate": 5e-05, "llm_loss": 0.5338649749755859, "loss": 2.4656, "loss_aux_layer_0": 0.01171875, "loss_aux_layer_1": 0.03289794921875, "loss_aux_layer_10": 0.060546875, "loss_aux_layer_11": 0.064453125, "loss_aux_layer_12": 0.0684814453125, "loss_aux_layer_13": 0.07373046875, "loss_aux_layer_14": 0.081787109375, "loss_aux_layer_15": 0.0894775390625, "loss_aux_layer_16": 0.098388671875, "loss_aux_layer_17": 0.1058349609375, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.116455078125, "loss_aux_layer_2": 0.04541015625, "loss_aux_layer_20": 0.1241455078125, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.055419921875, "loss_aux_layer_4": 0.05810546875, "loss_aux_layer_5": 0.05987548828125, "loss_aux_layer_6": 0.0631103515625, "loss_aux_layer_7": 0.06134033203125, "loss_aux_layer_8": 0.060546875, "loss_aux_layer_9": 0.0592041015625, "step": 4200, "total_loss": 0.6164056360721588 }, { "epoch": 0.8317164917838051, "grad_norm": 0.8826743960380554, "learning_rate": 5e-05, "llm_loss": 0.5433656945824623, "loss": 2.4885, "loss_aux_layer_0": 0.0115203857421875, "loss_aux_layer_1": 0.029876708984375, "loss_aux_layer_10": 0.0560302734375, "loss_aux_layer_11": 0.05987548828125, "loss_aux_layer_12": 0.064453125, "loss_aux_layer_13": 0.0699462890625, "loss_aux_layer_14": 0.078369140625, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.095947265625, "loss_aux_layer_17": 0.1038818359375, "loss_aux_layer_18": 0.1114501953125, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.04193115234375, "loss_aux_layer_20": 0.1209716796875, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.05096435546875, "loss_aux_layer_4": 0.05328369140625, "loss_aux_layer_5": 0.0548095703125, "loss_aux_layer_6": 0.05792236328125, "loss_aux_layer_7": 0.05621337890625, "loss_aux_layer_8": 0.05596923828125, "loss_aux_layer_9": 0.0548095703125, "step": 4201, "total_loss": 0.622124508023262 }, { "epoch": 0.8319144723817066, "grad_norm": 1.11918306350708, "learning_rate": 5e-05, "llm_loss": 0.5973285585641861, "loss": 2.7061, "loss_aux_layer_0": 0.0123443603515625, "loss_aux_layer_1": 0.031158447265625, "loss_aux_layer_10": 0.056640625, "loss_aux_layer_11": 0.0604248046875, "loss_aux_layer_12": 0.0645751953125, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.0775146484375, "loss_aux_layer_15": 0.0855712890625, "loss_aux_layer_16": 0.0947265625, "loss_aux_layer_17": 0.1025390625, "loss_aux_layer_18": 0.11083984375, "loss_aux_layer_19": 0.11376953125, "loss_aux_layer_2": 0.0430908203125, "loss_aux_layer_20": 0.121337890625, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.0526123046875, "loss_aux_layer_4": 0.05517578125, "loss_aux_layer_5": 0.0567626953125, "loss_aux_layer_6": 0.0596923828125, "loss_aux_layer_7": 0.0576171875, "loss_aux_layer_8": 0.05694580078125, "loss_aux_layer_9": 0.05548095703125, "step": 4202, "total_loss": 0.6765253692865372 }, { "epoch": 0.832112452979608, "grad_norm": 1.2989270687103271, "learning_rate": 5e-05, "llm_loss": 0.5686378926038742, "loss": 2.6038, "loss_aux_layer_0": 0.0125274658203125, "loss_aux_layer_1": 0.031005859375, "loss_aux_layer_10": 0.05828857421875, "loss_aux_layer_11": 0.0621337890625, "loss_aux_layer_12": 0.0667724609375, "loss_aux_layer_13": 0.0723876953125, "loss_aux_layer_14": 0.0814208984375, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.0994873046875, "loss_aux_layer_17": 0.1075439453125, "loss_aux_layer_18": 0.115966796875, "loss_aux_layer_19": 0.1195068359375, "loss_aux_layer_2": 0.043701171875, "loss_aux_layer_20": 0.127197265625, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.156982421875, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.05322265625, "loss_aux_layer_4": 0.05572509765625, "loss_aux_layer_5": 0.05755615234375, "loss_aux_layer_6": 0.06060791015625, "loss_aux_layer_7": 0.05877685546875, "loss_aux_layer_8": 0.05816650390625, "loss_aux_layer_9": 0.05712890625, "step": 4203, "total_loss": 0.6509596854448318 }, { "epoch": 0.8323104335775094, "grad_norm": 1.0802183151245117, "learning_rate": 5e-05, "llm_loss": 0.5927067548036575, "loss": 2.6922, "loss_aux_layer_0": 0.01153564453125, "loss_aux_layer_1": 0.029388427734375, "loss_aux_layer_10": 0.05657958984375, "loss_aux_layer_11": 0.060302734375, "loss_aux_layer_12": 0.0645751953125, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.087646484375, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.11376953125, "loss_aux_layer_19": 0.1181640625, "loss_aux_layer_2": 0.0418701171875, "loss_aux_layer_20": 0.1263427734375, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.155517578125, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.05126953125, "loss_aux_layer_4": 0.05328369140625, "loss_aux_layer_5": 0.05499267578125, "loss_aux_layer_6": 0.0579833984375, "loss_aux_layer_7": 0.05621337890625, "loss_aux_layer_8": 0.0556640625, "loss_aux_layer_9": 0.0550537109375, "step": 4204, "total_loss": 0.6730624735355377 }, { "epoch": 0.8325084141754108, "grad_norm": 1.019952654838562, "learning_rate": 5e-05, "llm_loss": 0.58415687084198, "loss": 2.6544, "loss_aux_layer_0": 0.012786865234375, "loss_aux_layer_1": 0.03094482421875, "loss_aux_layer_10": 0.0574951171875, "loss_aux_layer_11": 0.06109619140625, "loss_aux_layer_12": 0.06500244140625, "loss_aux_layer_13": 0.0699462890625, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.10302734375, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.04296875, "loss_aux_layer_20": 0.121826171875, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.052490234375, "loss_aux_layer_4": 0.054931640625, "loss_aux_layer_5": 0.05615234375, "loss_aux_layer_6": 0.05908203125, "loss_aux_layer_7": 0.057373046875, "loss_aux_layer_8": 0.056884765625, "loss_aux_layer_9": 0.05609130859375, "step": 4205, "total_loss": 0.6635906100273132 }, { "epoch": 0.8327063947733122, "grad_norm": 0.795291543006897, "learning_rate": 5e-05, "llm_loss": 0.4818509519100189, "loss": 2.2557, "loss_aux_layer_0": 0.0119781494140625, "loss_aux_layer_1": 0.030975341796875, "loss_aux_layer_10": 0.05889892578125, "loss_aux_layer_11": 0.062744140625, "loss_aux_layer_12": 0.06707763671875, "loss_aux_layer_13": 0.0728759765625, "loss_aux_layer_14": 0.0810546875, "loss_aux_layer_15": 0.0894775390625, "loss_aux_layer_16": 0.098388671875, "loss_aux_layer_17": 0.105712890625, "loss_aux_layer_18": 0.1142578125, "loss_aux_layer_19": 0.1181640625, "loss_aux_layer_2": 0.043701171875, "loss_aux_layer_20": 0.126220703125, "loss_aux_layer_21": 0.135009765625, "loss_aux_layer_22": 0.156494140625, "loss_aux_layer_23": 0.193603515625, "loss_aux_layer_3": 0.0535888671875, "loss_aux_layer_4": 0.05584716796875, "loss_aux_layer_5": 0.05767822265625, "loss_aux_layer_6": 0.060791015625, "loss_aux_layer_7": 0.0589599609375, "loss_aux_layer_8": 0.05859375, "loss_aux_layer_9": 0.0574951171875, "step": 4206, "total_loss": 0.5639205724000931 }, { "epoch": 0.8329043753712136, "grad_norm": 1.2942310571670532, "learning_rate": 5e-05, "llm_loss": 0.57489924877882, "loss": 2.6059, "loss_aux_layer_0": 0.0120086669921875, "loss_aux_layer_1": 0.029449462890625, "loss_aux_layer_10": 0.05303955078125, "loss_aux_layer_11": 0.0567626953125, "loss_aux_layer_12": 0.0606689453125, "loss_aux_layer_13": 0.0657958984375, "loss_aux_layer_14": 0.074462890625, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.0926513671875, "loss_aux_layer_17": 0.1005859375, "loss_aux_layer_18": 0.1082763671875, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.0404052734375, "loss_aux_layer_20": 0.1195068359375, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.04949951171875, "loss_aux_layer_4": 0.05169677734375, "loss_aux_layer_5": 0.0531005859375, "loss_aux_layer_6": 0.05596923828125, "loss_aux_layer_7": 0.0538330078125, "loss_aux_layer_8": 0.0533447265625, "loss_aux_layer_9": 0.05206298828125, "step": 4207, "total_loss": 0.6514848619699478 }, { "epoch": 0.833102355969115, "grad_norm": 1.0771245956420898, "learning_rate": 5e-05, "llm_loss": 0.5504623204469681, "loss": 2.5268, "loss_aux_layer_0": 0.012115478515625, "loss_aux_layer_1": 0.0316162109375, "loss_aux_layer_10": 0.058349609375, "loss_aux_layer_11": 0.06231689453125, "loss_aux_layer_12": 0.06671142578125, "loss_aux_layer_13": 0.0723876953125, "loss_aux_layer_14": 0.0809326171875, "loss_aux_layer_15": 0.0889892578125, "loss_aux_layer_16": 0.0982666015625, "loss_aux_layer_17": 0.10546875, "loss_aux_layer_18": 0.1134033203125, "loss_aux_layer_19": 0.116943359375, "loss_aux_layer_2": 0.04425048828125, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.1304931640625, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.05438232421875, "loss_aux_layer_4": 0.056884765625, "loss_aux_layer_5": 0.05859375, "loss_aux_layer_6": 0.06146240234375, "loss_aux_layer_7": 0.05938720703125, "loss_aux_layer_8": 0.0587158203125, "loss_aux_layer_9": 0.0574951171875, "step": 4208, "total_loss": 0.6316919475793839 }, { "epoch": 0.8333003365670164, "grad_norm": 0.9281184077262878, "learning_rate": 5e-05, "llm_loss": 0.5790857523679733, "loss": 2.6389, "loss_aux_layer_0": 0.01226806640625, "loss_aux_layer_1": 0.030487060546875, "loss_aux_layer_10": 0.0582275390625, "loss_aux_layer_11": 0.06207275390625, "loss_aux_layer_12": 0.0665283203125, "loss_aux_layer_13": 0.072021484375, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.088623046875, "loss_aux_layer_16": 0.097900390625, "loss_aux_layer_17": 0.10546875, "loss_aux_layer_18": 0.1134033203125, "loss_aux_layer_19": 0.1163330078125, "loss_aux_layer_2": 0.04296875, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.05242919921875, "loss_aux_layer_4": 0.05474853515625, "loss_aux_layer_5": 0.056396484375, "loss_aux_layer_6": 0.0594482421875, "loss_aux_layer_7": 0.057861328125, "loss_aux_layer_8": 0.0577392578125, "loss_aux_layer_9": 0.056884765625, "step": 4209, "total_loss": 0.6597286313772202 }, { "epoch": 0.8334983171649178, "grad_norm": 0.8176715970039368, "learning_rate": 5e-05, "llm_loss": 0.539456881582737, "loss": 2.4748, "loss_aux_layer_0": 0.0120697021484375, "loss_aux_layer_1": 0.029937744140625, "loss_aux_layer_10": 0.05560302734375, "loss_aux_layer_11": 0.05950927734375, "loss_aux_layer_12": 0.0640869140625, "loss_aux_layer_13": 0.069580078125, "loss_aux_layer_14": 0.0780029296875, "loss_aux_layer_15": 0.086669921875, "loss_aux_layer_16": 0.095947265625, "loss_aux_layer_17": 0.1041259765625, "loss_aux_layer_18": 0.112548828125, "loss_aux_layer_19": 0.116455078125, "loss_aux_layer_2": 0.0418701171875, "loss_aux_layer_20": 0.124755859375, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.0509033203125, "loss_aux_layer_4": 0.05291748046875, "loss_aux_layer_5": 0.05438232421875, "loss_aux_layer_6": 0.05712890625, "loss_aux_layer_7": 0.0552978515625, "loss_aux_layer_8": 0.05499267578125, "loss_aux_layer_9": 0.0540771484375, "step": 4210, "total_loss": 0.6187019944190979 }, { "epoch": 0.8336962977628193, "grad_norm": 1.0059539079666138, "learning_rate": 5e-05, "llm_loss": 0.5513348877429962, "loss": 2.5283, "loss_aux_layer_0": 0.0122528076171875, "loss_aux_layer_1": 0.030181884765625, "loss_aux_layer_10": 0.05731201171875, "loss_aux_layer_11": 0.061279296875, "loss_aux_layer_12": 0.0660400390625, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.0802001953125, "loss_aux_layer_15": 0.0882568359375, "loss_aux_layer_16": 0.09765625, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.11376953125, "loss_aux_layer_19": 0.1168212890625, "loss_aux_layer_2": 0.0423583984375, "loss_aux_layer_20": 0.125, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.052001953125, "loss_aux_layer_4": 0.05474853515625, "loss_aux_layer_5": 0.05621337890625, "loss_aux_layer_6": 0.0594482421875, "loss_aux_layer_7": 0.05767822265625, "loss_aux_layer_8": 0.057373046875, "loss_aux_layer_9": 0.05609130859375, "step": 4211, "total_loss": 0.6320724934339523 }, { "epoch": 0.8338942783607206, "grad_norm": 0.9712575674057007, "learning_rate": 5e-05, "llm_loss": 0.5561472326517105, "loss": 2.5396, "loss_aux_layer_0": 0.0121307373046875, "loss_aux_layer_1": 0.029693603515625, "loss_aux_layer_10": 0.05615234375, "loss_aux_layer_11": 0.0601806640625, "loss_aux_layer_12": 0.06439208984375, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.077880859375, "loss_aux_layer_15": 0.0859375, "loss_aux_layer_16": 0.094482421875, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.1102294921875, "loss_aux_layer_19": 0.11328125, "loss_aux_layer_2": 0.0418701171875, "loss_aux_layer_20": 0.120849609375, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.18603515625, "loss_aux_layer_3": 0.0511474609375, "loss_aux_layer_4": 0.05352783203125, "loss_aux_layer_5": 0.0552978515625, "loss_aux_layer_6": 0.05816650390625, "loss_aux_layer_7": 0.05645751953125, "loss_aux_layer_8": 0.05621337890625, "loss_aux_layer_9": 0.054931640625, "step": 4212, "total_loss": 0.6349089443683624 }, { "epoch": 0.834092258958622, "grad_norm": 1.0191588401794434, "learning_rate": 5e-05, "llm_loss": 0.6162388548254967, "loss": 2.7698, "loss_aux_layer_0": 0.0125885009765625, "loss_aux_layer_1": 0.029205322265625, "loss_aux_layer_10": 0.05340576171875, "loss_aux_layer_11": 0.0570068359375, "loss_aux_layer_12": 0.06109619140625, "loss_aux_layer_13": 0.06640625, "loss_aux_layer_14": 0.074462890625, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.092041015625, "loss_aux_layer_17": 0.099365234375, "loss_aux_layer_18": 0.1072998046875, "loss_aux_layer_19": 0.1114501953125, "loss_aux_layer_2": 0.03955078125, "loss_aux_layer_20": 0.11962890625, "loss_aux_layer_21": 0.1280517578125, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.04840087890625, "loss_aux_layer_4": 0.05084228515625, "loss_aux_layer_5": 0.052001953125, "loss_aux_layer_6": 0.05499267578125, "loss_aux_layer_7": 0.05316162109375, "loss_aux_layer_8": 0.05291748046875, "loss_aux_layer_9": 0.05206298828125, "step": 4213, "total_loss": 0.6924498230218887 }, { "epoch": 0.8342902395565235, "grad_norm": 0.9108235239982605, "learning_rate": 5e-05, "llm_loss": 0.6010131686925888, "loss": 2.728, "loss_aux_layer_0": 0.0123748779296875, "loss_aux_layer_1": 0.0316162109375, "loss_aux_layer_10": 0.05926513671875, "loss_aux_layer_11": 0.06317138671875, "loss_aux_layer_12": 0.067626953125, "loss_aux_layer_13": 0.07275390625, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.088623046875, "loss_aux_layer_16": 0.097412109375, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.1153564453125, "loss_aux_layer_2": 0.04412841796875, "loss_aux_layer_20": 0.1226806640625, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.0543212890625, "loss_aux_layer_4": 0.056884765625, "loss_aux_layer_5": 0.0584716796875, "loss_aux_layer_6": 0.0614013671875, "loss_aux_layer_7": 0.0596923828125, "loss_aux_layer_8": 0.0592041015625, "loss_aux_layer_9": 0.05792236328125, "step": 4214, "total_loss": 0.6819908618927002 }, { "epoch": 0.8344882201544249, "grad_norm": 1.0618573427200317, "learning_rate": 5e-05, "llm_loss": 0.5703760236501694, "loss": 2.6051, "loss_aux_layer_0": 0.0113983154296875, "loss_aux_layer_1": 0.03167724609375, "loss_aux_layer_10": 0.05810546875, "loss_aux_layer_11": 0.0621337890625, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.080322265625, "loss_aux_layer_15": 0.0887451171875, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.1053466796875, "loss_aux_layer_18": 0.1129150390625, "loss_aux_layer_19": 0.115966796875, "loss_aux_layer_2": 0.043701171875, "loss_aux_layer_20": 0.12353515625, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.05322265625, "loss_aux_layer_4": 0.0557861328125, "loss_aux_layer_5": 0.05731201171875, "loss_aux_layer_6": 0.06060791015625, "loss_aux_layer_7": 0.05859375, "loss_aux_layer_8": 0.0579833984375, "loss_aux_layer_9": 0.05682373046875, "step": 4215, "total_loss": 0.6512769907712936 }, { "epoch": 0.8346862007523262, "grad_norm": 0.9328106045722961, "learning_rate": 5e-05, "llm_loss": 0.51059590280056, "loss": 2.3577, "loss_aux_layer_0": 0.0121917724609375, "loss_aux_layer_1": 0.02935791015625, "loss_aux_layer_10": 0.05517578125, "loss_aux_layer_11": 0.0589599609375, "loss_aux_layer_12": 0.06378173828125, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.0958251953125, "loss_aux_layer_17": 0.1038818359375, "loss_aux_layer_18": 0.11181640625, "loss_aux_layer_19": 0.1151123046875, "loss_aux_layer_2": 0.0411376953125, "loss_aux_layer_20": 0.1229248046875, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.0501708984375, "loss_aux_layer_4": 0.052490234375, "loss_aux_layer_5": 0.053955078125, "loss_aux_layer_6": 0.05694580078125, "loss_aux_layer_7": 0.0550537109375, "loss_aux_layer_8": 0.05462646484375, "loss_aux_layer_9": 0.05377197265625, "step": 4216, "total_loss": 0.5894263088703156 }, { "epoch": 0.8348841813502277, "grad_norm": 0.9225631952285767, "learning_rate": 5e-05, "llm_loss": 0.5428864508867264, "loss": 2.4906, "loss_aux_layer_0": 0.0121612548828125, "loss_aux_layer_1": 0.02972412109375, "loss_aux_layer_10": 0.05718994140625, "loss_aux_layer_11": 0.06121826171875, "loss_aux_layer_12": 0.06561279296875, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.0791015625, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.095703125, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.1114501953125, "loss_aux_layer_19": 0.1151123046875, "loss_aux_layer_2": 0.04217529296875, "loss_aux_layer_20": 0.1234130859375, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.051513671875, "loss_aux_layer_4": 0.053955078125, "loss_aux_layer_5": 0.0557861328125, "loss_aux_layer_6": 0.05889892578125, "loss_aux_layer_7": 0.05706787109375, "loss_aux_layer_8": 0.0565185546875, "loss_aux_layer_9": 0.05560302734375, "step": 4217, "total_loss": 0.6226554214954376 }, { "epoch": 0.8350821619481291, "grad_norm": 0.7839387059211731, "learning_rate": 5e-05, "llm_loss": 0.6260091811418533, "loss": 2.8061, "loss_aux_layer_0": 0.0119171142578125, "loss_aux_layer_1": 0.028839111328125, "loss_aux_layer_10": 0.0537109375, "loss_aux_layer_11": 0.05743408203125, "loss_aux_layer_12": 0.06158447265625, "loss_aux_layer_13": 0.0665283203125, "loss_aux_layer_14": 0.07421875, "loss_aux_layer_15": 0.0816650390625, "loss_aux_layer_16": 0.09033203125, "loss_aux_layer_17": 0.0982666015625, "loss_aux_layer_18": 0.1060791015625, "loss_aux_layer_19": 0.109619140625, "loss_aux_layer_2": 0.03955078125, "loss_aux_layer_20": 0.11767578125, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.143798828125, "loss_aux_layer_23": 0.177734375, "loss_aux_layer_3": 0.04864501953125, "loss_aux_layer_4": 0.0511474609375, "loss_aux_layer_5": 0.0526123046875, "loss_aux_layer_6": 0.0555419921875, "loss_aux_layer_7": 0.05389404296875, "loss_aux_layer_8": 0.05352783203125, "loss_aux_layer_9": 0.05255126953125, "step": 4218, "total_loss": 0.7015131115913391 }, { "epoch": 0.8352801425460304, "grad_norm": 0.8774596452713013, "learning_rate": 5e-05, "llm_loss": 0.5790904760360718, "loss": 2.6473, "loss_aux_layer_0": 0.012481689453125, "loss_aux_layer_1": 0.03240966796875, "loss_aux_layer_10": 0.05859375, "loss_aux_layer_11": 0.062744140625, "loss_aux_layer_12": 0.067138671875, "loss_aux_layer_13": 0.0723876953125, "loss_aux_layer_14": 0.0811767578125, "loss_aux_layer_15": 0.08984375, "loss_aux_layer_16": 0.09912109375, "loss_aux_layer_17": 0.1064453125, "loss_aux_layer_18": 0.1158447265625, "loss_aux_layer_19": 0.11962890625, "loss_aux_layer_2": 0.04443359375, "loss_aux_layer_20": 0.1275634765625, "loss_aux_layer_21": 0.13623046875, "loss_aux_layer_22": 0.157958984375, "loss_aux_layer_23": 0.196044921875, "loss_aux_layer_3": 0.05401611328125, "loss_aux_layer_4": 0.05645751953125, "loss_aux_layer_5": 0.05816650390625, "loss_aux_layer_6": 0.061279296875, "loss_aux_layer_7": 0.059326171875, "loss_aux_layer_8": 0.0587158203125, "loss_aux_layer_9": 0.0574951171875, "step": 4219, "total_loss": 0.6618361175060272 }, { "epoch": 0.8354781231439319, "grad_norm": 0.7725217938423157, "learning_rate": 5e-05, "llm_loss": 0.6034943908452988, "loss": 2.7272, "loss_aux_layer_0": 0.0116729736328125, "loss_aux_layer_1": 0.029571533203125, "loss_aux_layer_10": 0.05523681640625, "loss_aux_layer_11": 0.05889892578125, "loss_aux_layer_12": 0.0631103515625, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.076171875, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.1015625, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.041259765625, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.05035400390625, "loss_aux_layer_4": 0.05316162109375, "loss_aux_layer_5": 0.0546875, "loss_aux_layer_6": 0.05767822265625, "loss_aux_layer_7": 0.055908203125, "loss_aux_layer_8": 0.0552978515625, "loss_aux_layer_9": 0.0540771484375, "step": 4220, "total_loss": 0.6818043142557144 }, { "epoch": 0.8356761037418333, "grad_norm": 0.7191551923751831, "learning_rate": 5e-05, "llm_loss": 0.5531932711601257, "loss": 2.5345, "loss_aux_layer_0": 0.0118255615234375, "loss_aux_layer_1": 0.03033447265625, "loss_aux_layer_10": 0.056884765625, "loss_aux_layer_11": 0.06072998046875, "loss_aux_layer_12": 0.06475830078125, "loss_aux_layer_13": 0.06988525390625, "loss_aux_layer_14": 0.078369140625, "loss_aux_layer_15": 0.087158203125, "loss_aux_layer_16": 0.0968017578125, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.113037109375, "loss_aux_layer_19": 0.116455078125, "loss_aux_layer_2": 0.04248046875, "loss_aux_layer_20": 0.1246337890625, "loss_aux_layer_21": 0.1331787109375, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.05194091796875, "loss_aux_layer_4": 0.0546875, "loss_aux_layer_5": 0.0562744140625, "loss_aux_layer_6": 0.0594482421875, "loss_aux_layer_7": 0.05755615234375, "loss_aux_layer_8": 0.0570068359375, "loss_aux_layer_9": 0.05572509765625, "step": 4221, "total_loss": 0.6336260885000229 }, { "epoch": 0.8358740843397348, "grad_norm": 0.9110064506530762, "learning_rate": 5e-05, "llm_loss": 0.5702432170510292, "loss": 2.6019, "loss_aux_layer_0": 0.011627197265625, "loss_aux_layer_1": 0.030609130859375, "loss_aux_layer_10": 0.05706787109375, "loss_aux_layer_11": 0.06072998046875, "loss_aux_layer_12": 0.065185546875, "loss_aux_layer_13": 0.0703125, "loss_aux_layer_14": 0.07861328125, "loss_aux_layer_15": 0.0869140625, "loss_aux_layer_16": 0.095947265625, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.1160888671875, "loss_aux_layer_2": 0.042724609375, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.0523681640625, "loss_aux_layer_4": 0.054931640625, "loss_aux_layer_5": 0.05645751953125, "loss_aux_layer_6": 0.05938720703125, "loss_aux_layer_7": 0.0577392578125, "loss_aux_layer_8": 0.05731201171875, "loss_aux_layer_9": 0.05609130859375, "step": 4222, "total_loss": 0.650470033288002 }, { "epoch": 0.8360720649376361, "grad_norm": 0.8248071670532227, "learning_rate": 5e-05, "llm_loss": 0.6237456053495407, "loss": 2.8119, "loss_aux_layer_0": 0.0118560791015625, "loss_aux_layer_1": 0.03021240234375, "loss_aux_layer_10": 0.05712890625, "loss_aux_layer_11": 0.06103515625, "loss_aux_layer_12": 0.0654296875, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.0953369140625, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.111572265625, "loss_aux_layer_19": 0.114501953125, "loss_aux_layer_2": 0.041748046875, "loss_aux_layer_20": 0.122314453125, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.0513916015625, "loss_aux_layer_4": 0.05426025390625, "loss_aux_layer_5": 0.0557861328125, "loss_aux_layer_6": 0.05853271484375, "loss_aux_layer_7": 0.05712890625, "loss_aux_layer_8": 0.0567626953125, "loss_aux_layer_9": 0.05596923828125, "step": 4223, "total_loss": 0.7029839307069778 }, { "epoch": 0.8362700455355375, "grad_norm": 0.8015996813774109, "learning_rate": 5e-05, "llm_loss": 0.6492415219545364, "loss": 2.9122, "loss_aux_layer_0": 0.0115966796875, "loss_aux_layer_1": 0.030975341796875, "loss_aux_layer_10": 0.05712890625, "loss_aux_layer_11": 0.06085205078125, "loss_aux_layer_12": 0.0648193359375, "loss_aux_layer_13": 0.0697021484375, "loss_aux_layer_14": 0.077392578125, "loss_aux_layer_15": 0.0850830078125, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.1090087890625, "loss_aux_layer_19": 0.1121826171875, "loss_aux_layer_2": 0.04290771484375, "loss_aux_layer_20": 0.1201171875, "loss_aux_layer_21": 0.1278076171875, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.05230712890625, "loss_aux_layer_4": 0.0550537109375, "loss_aux_layer_5": 0.056640625, "loss_aux_layer_6": 0.05938720703125, "loss_aux_layer_7": 0.057861328125, "loss_aux_layer_8": 0.05712890625, "loss_aux_layer_9": 0.05584716796875, "step": 4224, "total_loss": 0.7280395776033401 }, { "epoch": 0.836468026133439, "grad_norm": 0.8475752472877502, "learning_rate": 5e-05, "llm_loss": 0.613422229886055, "loss": 2.7692, "loss_aux_layer_0": 0.01220703125, "loss_aux_layer_1": 0.03021240234375, "loss_aux_layer_10": 0.0540771484375, "loss_aux_layer_11": 0.05804443359375, "loss_aux_layer_12": 0.0623779296875, "loss_aux_layer_13": 0.0677490234375, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.103515625, "loss_aux_layer_18": 0.11181640625, "loss_aux_layer_19": 0.1160888671875, "loss_aux_layer_2": 0.04132080078125, "loss_aux_layer_20": 0.1246337890625, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.050048828125, "loss_aux_layer_4": 0.05230712890625, "loss_aux_layer_5": 0.0537109375, "loss_aux_layer_6": 0.056396484375, "loss_aux_layer_7": 0.05474853515625, "loss_aux_layer_8": 0.05438232421875, "loss_aux_layer_9": 0.0531005859375, "step": 4225, "total_loss": 0.6922966837882996 }, { "epoch": 0.8366660067313403, "grad_norm": 0.7745795845985413, "learning_rate": 5e-05, "llm_loss": 0.5821874439716339, "loss": 2.6668, "loss_aux_layer_0": 0.0116729736328125, "loss_aux_layer_1": 0.033935546875, "loss_aux_layer_10": 0.0631103515625, "loss_aux_layer_11": 0.0672607421875, "loss_aux_layer_12": 0.07159423828125, "loss_aux_layer_13": 0.076904296875, "loss_aux_layer_14": 0.084716796875, "loss_aux_layer_15": 0.0921630859375, "loss_aux_layer_16": 0.1005859375, "loss_aux_layer_17": 0.107666015625, "loss_aux_layer_18": 0.1151123046875, "loss_aux_layer_19": 0.1180419921875, "loss_aux_layer_2": 0.046630859375, "loss_aux_layer_20": 0.1251220703125, "loss_aux_layer_21": 0.1331787109375, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.05731201171875, "loss_aux_layer_4": 0.06024169921875, "loss_aux_layer_5": 0.062255859375, "loss_aux_layer_6": 0.0655517578125, "loss_aux_layer_7": 0.06378173828125, "loss_aux_layer_8": 0.06304931640625, "loss_aux_layer_9": 0.06170654296875, "step": 4226, "total_loss": 0.6666960120201111 }, { "epoch": 0.8368639873292417, "grad_norm": 0.6969987750053406, "learning_rate": 5e-05, "llm_loss": 0.4812272787094116, "loss": 2.2382, "loss_aux_layer_0": 0.0113067626953125, "loss_aux_layer_1": 0.029510498046875, "loss_aux_layer_10": 0.05609130859375, "loss_aux_layer_11": 0.05987548828125, "loss_aux_layer_12": 0.06427001953125, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.094482421875, "loss_aux_layer_17": 0.1019287109375, "loss_aux_layer_18": 0.109619140625, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.041259765625, "loss_aux_layer_20": 0.1209716796875, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.050537109375, "loss_aux_layer_4": 0.0531005859375, "loss_aux_layer_5": 0.05474853515625, "loss_aux_layer_6": 0.0574951171875, "loss_aux_layer_7": 0.055908203125, "loss_aux_layer_8": 0.05572509765625, "loss_aux_layer_9": 0.05499267578125, "step": 4227, "total_loss": 0.5595453232526779 }, { "epoch": 0.8370619679271432, "grad_norm": 0.9257709980010986, "learning_rate": 5e-05, "llm_loss": 0.6539999097585678, "loss": 2.9272, "loss_aux_layer_0": 0.0115203857421875, "loss_aux_layer_1": 0.029205322265625, "loss_aux_layer_10": 0.05499267578125, "loss_aux_layer_11": 0.058837890625, "loss_aux_layer_12": 0.0631103515625, "loss_aux_layer_13": 0.06866455078125, "loss_aux_layer_14": 0.0770263671875, "loss_aux_layer_15": 0.0849609375, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.1019287109375, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.041015625, "loss_aux_layer_20": 0.1207275390625, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.05010986328125, "loss_aux_layer_4": 0.05242919921875, "loss_aux_layer_5": 0.0538330078125, "loss_aux_layer_6": 0.05670166015625, "loss_aux_layer_7": 0.05487060546875, "loss_aux_layer_8": 0.05438232421875, "loss_aux_layer_9": 0.05377197265625, "step": 4228, "total_loss": 0.7318066954612732 }, { "epoch": 0.8372599485250446, "grad_norm": 0.8701528906822205, "learning_rate": 5e-05, "llm_loss": 0.5403005853295326, "loss": 2.4724, "loss_aux_layer_0": 0.0122528076171875, "loss_aux_layer_1": 0.029144287109375, "loss_aux_layer_10": 0.05401611328125, "loss_aux_layer_11": 0.05755615234375, "loss_aux_layer_12": 0.0618896484375, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.075927734375, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.11474609375, "loss_aux_layer_2": 0.04095458984375, "loss_aux_layer_20": 0.1226806640625, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.04974365234375, "loss_aux_layer_4": 0.052001953125, "loss_aux_layer_5": 0.053466796875, "loss_aux_layer_6": 0.0560302734375, "loss_aux_layer_7": 0.05419921875, "loss_aux_layer_8": 0.0537109375, "loss_aux_layer_9": 0.05267333984375, "step": 4229, "total_loss": 0.6180876642465591 }, { "epoch": 0.8374579291229459, "grad_norm": 0.9721938967704773, "learning_rate": 5e-05, "llm_loss": 0.5535419955849648, "loss": 2.5499, "loss_aux_layer_0": 0.0117950439453125, "loss_aux_layer_1": 0.0321044921875, "loss_aux_layer_10": 0.0609130859375, "loss_aux_layer_11": 0.0650634765625, "loss_aux_layer_12": 0.070068359375, "loss_aux_layer_13": 0.07568359375, "loss_aux_layer_14": 0.08447265625, "loss_aux_layer_15": 0.0926513671875, "loss_aux_layer_16": 0.102294921875, "loss_aux_layer_17": 0.109619140625, "loss_aux_layer_18": 0.1175537109375, "loss_aux_layer_19": 0.1197509765625, "loss_aux_layer_2": 0.044189453125, "loss_aux_layer_20": 0.1273193359375, "loss_aux_layer_21": 0.1357421875, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.05450439453125, "loss_aux_layer_4": 0.05731201171875, "loss_aux_layer_5": 0.059326171875, "loss_aux_layer_6": 0.0625, "loss_aux_layer_7": 0.06085205078125, "loss_aux_layer_8": 0.060302734375, "loss_aux_layer_9": 0.0592041015625, "step": 4230, "total_loss": 0.6374664604663849 }, { "epoch": 0.8376559097208474, "grad_norm": 0.8617641925811768, "learning_rate": 5e-05, "llm_loss": 0.6205564141273499, "loss": 2.7808, "loss_aux_layer_0": 0.012115478515625, "loss_aux_layer_1": 0.028411865234375, "loss_aux_layer_10": 0.0523681640625, "loss_aux_layer_11": 0.0557861328125, "loss_aux_layer_12": 0.05975341796875, "loss_aux_layer_13": 0.064697265625, "loss_aux_layer_14": 0.0723876953125, "loss_aux_layer_15": 0.0802001953125, "loss_aux_layer_16": 0.0889892578125, "loss_aux_layer_17": 0.0968017578125, "loss_aux_layer_18": 0.1053466796875, "loss_aux_layer_19": 0.109130859375, "loss_aux_layer_2": 0.0391845703125, "loss_aux_layer_20": 0.1175537109375, "loss_aux_layer_21": 0.1253662109375, "loss_aux_layer_22": 0.14453125, "loss_aux_layer_23": 0.1796875, "loss_aux_layer_3": 0.048095703125, "loss_aux_layer_4": 0.050048828125, "loss_aux_layer_5": 0.051513671875, "loss_aux_layer_6": 0.05401611328125, "loss_aux_layer_7": 0.052490234375, "loss_aux_layer_8": 0.05218505859375, "loss_aux_layer_9": 0.0511474609375, "step": 4231, "total_loss": 0.6951998770236969 }, { "epoch": 0.8378538903187488, "grad_norm": 1.03868567943573, "learning_rate": 5e-05, "llm_loss": 0.5718457847833633, "loss": 2.6078, "loss_aux_layer_0": 0.0122833251953125, "loss_aux_layer_1": 0.0302734375, "loss_aux_layer_10": 0.0570068359375, "loss_aux_layer_11": 0.0609130859375, "loss_aux_layer_12": 0.06524658203125, "loss_aux_layer_13": 0.0709228515625, "loss_aux_layer_14": 0.0789794921875, "loss_aux_layer_15": 0.087158203125, "loss_aux_layer_16": 0.096435546875, "loss_aux_layer_17": 0.1043701171875, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.1160888671875, "loss_aux_layer_2": 0.04296875, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.052734375, "loss_aux_layer_4": 0.05511474609375, "loss_aux_layer_5": 0.056396484375, "loss_aux_layer_6": 0.0594482421875, "loss_aux_layer_7": 0.0574951171875, "loss_aux_layer_8": 0.05706787109375, "loss_aux_layer_9": 0.0557861328125, "step": 4232, "total_loss": 0.6519462317228317 }, { "epoch": 0.8380518709166501, "grad_norm": 0.8649546504020691, "learning_rate": 5e-05, "llm_loss": 0.5361797958612442, "loss": 2.451, "loss_aux_layer_0": 0.0124053955078125, "loss_aux_layer_1": 0.0289306640625, "loss_aux_layer_10": 0.05426025390625, "loss_aux_layer_11": 0.0574951171875, "loss_aux_layer_12": 0.0616455078125, "loss_aux_layer_13": 0.0667724609375, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.09228515625, "loss_aux_layer_17": 0.099609375, "loss_aux_layer_18": 0.107421875, "loss_aux_layer_19": 0.1109619140625, "loss_aux_layer_2": 0.04034423828125, "loss_aux_layer_20": 0.1192626953125, "loss_aux_layer_21": 0.1270751953125, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.04931640625, "loss_aux_layer_4": 0.051513671875, "loss_aux_layer_5": 0.05303955078125, "loss_aux_layer_6": 0.0560302734375, "loss_aux_layer_7": 0.05419921875, "loss_aux_layer_8": 0.05389404296875, "loss_aux_layer_9": 0.05303955078125, "step": 4233, "total_loss": 0.6127620488405228 }, { "epoch": 0.8382498515145516, "grad_norm": 1.3239281177520752, "learning_rate": 5e-05, "llm_loss": 0.5707423612475395, "loss": 2.5859, "loss_aux_layer_0": 0.0115509033203125, "loss_aux_layer_1": 0.02838134765625, "loss_aux_layer_10": 0.052734375, "loss_aux_layer_11": 0.05615234375, "loss_aux_layer_12": 0.0601806640625, "loss_aux_layer_13": 0.0655517578125, "loss_aux_layer_14": 0.073486328125, "loss_aux_layer_15": 0.0816650390625, "loss_aux_layer_16": 0.090576171875, "loss_aux_layer_17": 0.09814453125, "loss_aux_layer_18": 0.1063232421875, "loss_aux_layer_19": 0.1107177734375, "loss_aux_layer_2": 0.03961181640625, "loss_aux_layer_20": 0.118896484375, "loss_aux_layer_21": 0.1280517578125, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.048583984375, "loss_aux_layer_4": 0.0506591796875, "loss_aux_layer_5": 0.0521240234375, "loss_aux_layer_6": 0.054931640625, "loss_aux_layer_7": 0.05303955078125, "loss_aux_layer_8": 0.052734375, "loss_aux_layer_9": 0.051513671875, "step": 4234, "total_loss": 0.6464833170175552 }, { "epoch": 0.838447832112453, "grad_norm": 0.9348334670066833, "learning_rate": 5e-05, "llm_loss": 0.5360182672739029, "loss": 2.4577, "loss_aux_layer_0": 0.0116119384765625, "loss_aux_layer_1": 0.029296875, "loss_aux_layer_10": 0.05517578125, "loss_aux_layer_11": 0.058837890625, "loss_aux_layer_12": 0.06317138671875, "loss_aux_layer_13": 0.0682373046875, "loss_aux_layer_14": 0.07666015625, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.0943603515625, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.110595703125, "loss_aux_layer_19": 0.114013671875, "loss_aux_layer_2": 0.041015625, "loss_aux_layer_20": 0.1221923828125, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.05010986328125, "loss_aux_layer_4": 0.052734375, "loss_aux_layer_5": 0.054443359375, "loss_aux_layer_6": 0.0576171875, "loss_aux_layer_7": 0.0555419921875, "loss_aux_layer_8": 0.054931640625, "loss_aux_layer_9": 0.053955078125, "step": 4235, "total_loss": 0.6144307255744934 }, { "epoch": 0.8386458127103544, "grad_norm": 0.8963489532470703, "learning_rate": 5e-05, "llm_loss": 0.5410773605108261, "loss": 2.4768, "loss_aux_layer_0": 0.01226806640625, "loss_aux_layer_1": 0.030120849609375, "loss_aux_layer_10": 0.05584716796875, "loss_aux_layer_11": 0.0594482421875, "loss_aux_layer_12": 0.06378173828125, "loss_aux_layer_13": 0.06878662109375, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.1019287109375, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.0418701171875, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.1280517578125, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.0511474609375, "loss_aux_layer_4": 0.0533447265625, "loss_aux_layer_5": 0.05487060546875, "loss_aux_layer_6": 0.05792236328125, "loss_aux_layer_7": 0.05621337890625, "loss_aux_layer_8": 0.05572509765625, "loss_aux_layer_9": 0.0545654296875, "step": 4236, "total_loss": 0.6192068457603455 }, { "epoch": 0.8388437933082558, "grad_norm": 0.9527824521064758, "learning_rate": 5e-05, "llm_loss": 0.4904213696718216, "loss": 2.2762, "loss_aux_layer_0": 0.011749267578125, "loss_aux_layer_1": 0.0303955078125, "loss_aux_layer_10": 0.055908203125, "loss_aux_layer_11": 0.059326171875, "loss_aux_layer_12": 0.0633544921875, "loss_aux_layer_13": 0.068359375, "loss_aux_layer_14": 0.076416015625, "loss_aux_layer_15": 0.0843505859375, "loss_aux_layer_16": 0.0936279296875, "loss_aux_layer_17": 0.1014404296875, "loss_aux_layer_18": 0.1102294921875, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.04241943359375, "loss_aux_layer_20": 0.121826171875, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.05194091796875, "loss_aux_layer_4": 0.0543212890625, "loss_aux_layer_5": 0.05560302734375, "loss_aux_layer_6": 0.05841064453125, "loss_aux_layer_7": 0.05657958984375, "loss_aux_layer_8": 0.055908203125, "loss_aux_layer_9": 0.0546875, "step": 4237, "total_loss": 0.5690616071224213 }, { "epoch": 0.8390417739061572, "grad_norm": 0.8482043743133545, "learning_rate": 5e-05, "llm_loss": 0.6049906462430954, "loss": 2.7295, "loss_aux_layer_0": 0.01123046875, "loss_aux_layer_1": 0.02978515625, "loss_aux_layer_10": 0.05511474609375, "loss_aux_layer_11": 0.058837890625, "loss_aux_layer_12": 0.0631103515625, "loss_aux_layer_13": 0.06842041015625, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.0843505859375, "loss_aux_layer_16": 0.09326171875, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.1090087890625, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.0413818359375, "loss_aux_layer_20": 0.119873046875, "loss_aux_layer_21": 0.126953125, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.18115234375, "loss_aux_layer_3": 0.050537109375, "loss_aux_layer_4": 0.05303955078125, "loss_aux_layer_5": 0.054443359375, "loss_aux_layer_6": 0.05718994140625, "loss_aux_layer_7": 0.0555419921875, "loss_aux_layer_8": 0.05487060546875, "loss_aux_layer_9": 0.05377197265625, "step": 4238, "total_loss": 0.6823763251304626 }, { "epoch": 0.8392397545040586, "grad_norm": 0.8837475180625916, "learning_rate": 5e-05, "llm_loss": 0.5314662158489227, "loss": 2.4407, "loss_aux_layer_0": 0.0128631591796875, "loss_aux_layer_1": 0.030609130859375, "loss_aux_layer_10": 0.05584716796875, "loss_aux_layer_11": 0.0595703125, "loss_aux_layer_12": 0.06378173828125, "loss_aux_layer_13": 0.06884765625, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.08447265625, "loss_aux_layer_16": 0.09375, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.1090087890625, "loss_aux_layer_19": 0.11279296875, "loss_aux_layer_2": 0.042724609375, "loss_aux_layer_20": 0.12109375, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.05194091796875, "loss_aux_layer_4": 0.05438232421875, "loss_aux_layer_5": 0.05572509765625, "loss_aux_layer_6": 0.0584716796875, "loss_aux_layer_7": 0.056640625, "loss_aux_layer_8": 0.05596923828125, "loss_aux_layer_9": 0.05474853515625, "step": 4239, "total_loss": 0.6101771891117096 }, { "epoch": 0.83943773510196, "grad_norm": 0.893889307975769, "learning_rate": 5e-05, "llm_loss": 0.5751859545707703, "loss": 2.6018, "loss_aux_layer_0": 0.0114593505859375, "loss_aux_layer_1": 0.02923583984375, "loss_aux_layer_10": 0.0538330078125, "loss_aux_layer_11": 0.0576171875, "loss_aux_layer_12": 0.0616455078125, "loss_aux_layer_13": 0.06640625, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.0814208984375, "loss_aux_layer_16": 0.0894775390625, "loss_aux_layer_17": 0.0972900390625, "loss_aux_layer_18": 0.1044921875, "loss_aux_layer_19": 0.10791015625, "loss_aux_layer_2": 0.040771484375, "loss_aux_layer_20": 0.1156005859375, "loss_aux_layer_21": 0.1234130859375, "loss_aux_layer_22": 0.142578125, "loss_aux_layer_23": 0.177734375, "loss_aux_layer_3": 0.0496826171875, "loss_aux_layer_4": 0.052001953125, "loss_aux_layer_5": 0.05322265625, "loss_aux_layer_6": 0.055908203125, "loss_aux_layer_7": 0.05419921875, "loss_aux_layer_8": 0.05377197265625, "loss_aux_layer_9": 0.052734375, "step": 4240, "total_loss": 0.6504586637020111 }, { "epoch": 0.8396357156998614, "grad_norm": 0.7518424391746521, "learning_rate": 5e-05, "llm_loss": 0.5405515283346176, "loss": 2.4624, "loss_aux_layer_0": 0.0118865966796875, "loss_aux_layer_1": 0.0289306640625, "loss_aux_layer_10": 0.05267333984375, "loss_aux_layer_11": 0.05609130859375, "loss_aux_layer_12": 0.0601806640625, "loss_aux_layer_13": 0.0650634765625, "loss_aux_layer_14": 0.0728759765625, "loss_aux_layer_15": 0.0806884765625, "loss_aux_layer_16": 0.08984375, "loss_aux_layer_17": 0.097900390625, "loss_aux_layer_18": 0.1060791015625, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.03997802734375, "loss_aux_layer_20": 0.1173095703125, "loss_aux_layer_21": 0.1246337890625, "loss_aux_layer_22": 0.1435546875, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.04852294921875, "loss_aux_layer_4": 0.05072021484375, "loss_aux_layer_5": 0.05218505859375, "loss_aux_layer_6": 0.05438232421875, "loss_aux_layer_7": 0.0528564453125, "loss_aux_layer_8": 0.05255126953125, "loss_aux_layer_9": 0.05157470703125, "step": 4241, "total_loss": 0.6155898720026016 }, { "epoch": 0.8398336962977628, "grad_norm": 0.765108048915863, "learning_rate": 5e-05, "llm_loss": 0.5040912106633186, "loss": 2.3279, "loss_aux_layer_0": 0.01220703125, "loss_aux_layer_1": 0.029571533203125, "loss_aux_layer_10": 0.0552978515625, "loss_aux_layer_11": 0.0592041015625, "loss_aux_layer_12": 0.06329345703125, "loss_aux_layer_13": 0.0682373046875, "loss_aux_layer_14": 0.075927734375, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.0928955078125, "loss_aux_layer_17": 0.1002197265625, "loss_aux_layer_18": 0.108154296875, "loss_aux_layer_19": 0.1114501953125, "loss_aux_layer_2": 0.04119873046875, "loss_aux_layer_20": 0.119384765625, "loss_aux_layer_21": 0.1282958984375, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.05035400390625, "loss_aux_layer_4": 0.052978515625, "loss_aux_layer_5": 0.05462646484375, "loss_aux_layer_6": 0.05718994140625, "loss_aux_layer_7": 0.0556640625, "loss_aux_layer_8": 0.0550537109375, "loss_aux_layer_9": 0.053955078125, "step": 4242, "total_loss": 0.5819831937551498 }, { "epoch": 0.8400316768956643, "grad_norm": 0.7871066331863403, "learning_rate": 5e-05, "llm_loss": 0.510889858007431, "loss": 2.3556, "loss_aux_layer_0": 0.0121307373046875, "loss_aux_layer_1": 0.029022216796875, "loss_aux_layer_10": 0.054931640625, "loss_aux_layer_11": 0.05859375, "loss_aux_layer_12": 0.0626220703125, "loss_aux_layer_13": 0.067626953125, "loss_aux_layer_14": 0.07568359375, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.093017578125, "loss_aux_layer_17": 0.1009521484375, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.0406494140625, "loss_aux_layer_20": 0.122314453125, "loss_aux_layer_21": 0.1307373046875, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.04986572265625, "loss_aux_layer_4": 0.05224609375, "loss_aux_layer_5": 0.05401611328125, "loss_aux_layer_6": 0.05682373046875, "loss_aux_layer_7": 0.05511474609375, "loss_aux_layer_8": 0.0548095703125, "loss_aux_layer_9": 0.0537109375, "step": 4243, "total_loss": 0.588908314704895 }, { "epoch": 0.8402296574935656, "grad_norm": 0.9045051336288452, "learning_rate": 5e-05, "llm_loss": 0.5669827610254288, "loss": 2.5712, "loss_aux_layer_0": 0.0124359130859375, "loss_aux_layer_1": 0.02801513671875, "loss_aux_layer_10": 0.05255126953125, "loss_aux_layer_11": 0.05609130859375, "loss_aux_layer_12": 0.06024169921875, "loss_aux_layer_13": 0.06524658203125, "loss_aux_layer_14": 0.0733642578125, "loss_aux_layer_15": 0.0814208984375, "loss_aux_layer_16": 0.0909423828125, "loss_aux_layer_17": 0.0987548828125, "loss_aux_layer_18": 0.1070556640625, "loss_aux_layer_19": 0.111328125, "loss_aux_layer_2": 0.039306640625, "loss_aux_layer_20": 0.1197509765625, "loss_aux_layer_21": 0.128173828125, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.04791259765625, "loss_aux_layer_4": 0.05047607421875, "loss_aux_layer_5": 0.05181884765625, "loss_aux_layer_6": 0.0546875, "loss_aux_layer_7": 0.05267333984375, "loss_aux_layer_8": 0.05218505859375, "loss_aux_layer_9": 0.05126953125, "step": 4244, "total_loss": 0.6428046673536301 }, { "epoch": 0.840427638091467, "grad_norm": 1.3323101997375488, "learning_rate": 5e-05, "llm_loss": 0.5136417746543884, "loss": 2.3712, "loss_aux_layer_0": 0.012176513671875, "loss_aux_layer_1": 0.030059814453125, "loss_aux_layer_10": 0.05694580078125, "loss_aux_layer_11": 0.0606689453125, "loss_aux_layer_12": 0.06463623046875, "loss_aux_layer_13": 0.0699462890625, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.0855712890625, "loss_aux_layer_16": 0.0948486328125, "loss_aux_layer_17": 0.1026611328125, "loss_aux_layer_18": 0.1109619140625, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.042724609375, "loss_aux_layer_20": 0.1217041015625, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.05224609375, "loss_aux_layer_4": 0.054931640625, "loss_aux_layer_5": 0.0562744140625, "loss_aux_layer_6": 0.05914306640625, "loss_aux_layer_7": 0.05743408203125, "loss_aux_layer_8": 0.05682373046875, "loss_aux_layer_9": 0.0556640625, "step": 4245, "total_loss": 0.5928022265434265 }, { "epoch": 0.8406256186893685, "grad_norm": 0.7844385504722595, "learning_rate": 5e-05, "llm_loss": 0.5699102580547333, "loss": 2.5994, "loss_aux_layer_0": 0.0110321044921875, "loss_aux_layer_1": 0.03009033203125, "loss_aux_layer_10": 0.05853271484375, "loss_aux_layer_11": 0.062255859375, "loss_aux_layer_12": 0.0662841796875, "loss_aux_layer_13": 0.07122802734375, "loss_aux_layer_14": 0.078857421875, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.04254150390625, "loss_aux_layer_20": 0.1217041015625, "loss_aux_layer_21": 0.1302490234375, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.05242919921875, "loss_aux_layer_4": 0.05517578125, "loss_aux_layer_5": 0.0570068359375, "loss_aux_layer_6": 0.05987548828125, "loss_aux_layer_7": 0.05853271484375, "loss_aux_layer_8": 0.05816650390625, "loss_aux_layer_9": 0.0570068359375, "step": 4246, "total_loss": 0.6498408168554306 }, { "epoch": 0.8408235992872698, "grad_norm": 1.1766424179077148, "learning_rate": 5e-05, "llm_loss": 0.5458717197179794, "loss": 2.499, "loss_aux_layer_0": 0.0122222900390625, "loss_aux_layer_1": 0.029693603515625, "loss_aux_layer_10": 0.05535888671875, "loss_aux_layer_11": 0.05914306640625, "loss_aux_layer_12": 0.0635986328125, "loss_aux_layer_13": 0.0689697265625, "loss_aux_layer_14": 0.077392578125, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.10302734375, "loss_aux_layer_18": 0.1112060546875, "loss_aux_layer_19": 0.114990234375, "loss_aux_layer_2": 0.04156494140625, "loss_aux_layer_20": 0.1231689453125, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.05084228515625, "loss_aux_layer_4": 0.0533447265625, "loss_aux_layer_5": 0.0548095703125, "loss_aux_layer_6": 0.05780029296875, "loss_aux_layer_7": 0.0557861328125, "loss_aux_layer_8": 0.05535888671875, "loss_aux_layer_9": 0.05413818359375, "step": 4247, "total_loss": 0.6247498989105225 }, { "epoch": 0.8410215798851712, "grad_norm": 0.8809701800346375, "learning_rate": 5e-05, "llm_loss": 0.5584205016493797, "loss": 2.5486, "loss_aux_layer_0": 0.011627197265625, "loss_aux_layer_1": 0.0291748046875, "loss_aux_layer_10": 0.05621337890625, "loss_aux_layer_11": 0.05975341796875, "loss_aux_layer_12": 0.06378173828125, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.07763671875, "loss_aux_layer_15": 0.086181640625, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.102294921875, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.04144287109375, "loss_aux_layer_20": 0.1214599609375, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.0509033203125, "loss_aux_layer_4": 0.05340576171875, "loss_aux_layer_5": 0.05511474609375, "loss_aux_layer_6": 0.05828857421875, "loss_aux_layer_7": 0.05633544921875, "loss_aux_layer_8": 0.055908203125, "loss_aux_layer_9": 0.05499267578125, "step": 4248, "total_loss": 0.6371544897556305 }, { "epoch": 0.8412195604830727, "grad_norm": 1.0178682804107666, "learning_rate": 5e-05, "llm_loss": 0.5152855068445206, "loss": 2.377, "loss_aux_layer_0": 0.012451171875, "loss_aux_layer_1": 0.029205322265625, "loss_aux_layer_10": 0.05523681640625, "loss_aux_layer_11": 0.05908203125, "loss_aux_layer_12": 0.0634765625, "loss_aux_layer_13": 0.06884765625, "loss_aux_layer_14": 0.0772705078125, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.10302734375, "loss_aux_layer_18": 0.11083984375, "loss_aux_layer_19": 0.115234375, "loss_aux_layer_2": 0.04156494140625, "loss_aux_layer_20": 0.1229248046875, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.191162109375, "loss_aux_layer_3": 0.05035400390625, "loss_aux_layer_4": 0.05255126953125, "loss_aux_layer_5": 0.05419921875, "loss_aux_layer_6": 0.05706787109375, "loss_aux_layer_7": 0.05548095703125, "loss_aux_layer_8": 0.05511474609375, "loss_aux_layer_9": 0.05401611328125, "step": 4249, "total_loss": 0.5942514389753342 }, { "epoch": 0.8414175410809741, "grad_norm": 0.8839823007583618, "learning_rate": 5e-05, "llm_loss": 0.556926965713501, "loss": 2.5222, "loss_aux_layer_0": 0.0113067626953125, "loss_aux_layer_1": 0.028076171875, "loss_aux_layer_10": 0.05108642578125, "loss_aux_layer_11": 0.0548095703125, "loss_aux_layer_12": 0.0589599609375, "loss_aux_layer_13": 0.0638427734375, "loss_aux_layer_14": 0.071533203125, "loss_aux_layer_15": 0.0789794921875, "loss_aux_layer_16": 0.0875244140625, "loss_aux_layer_17": 0.0950927734375, "loss_aux_layer_18": 0.10302734375, "loss_aux_layer_19": 0.10693359375, "loss_aux_layer_2": 0.03887939453125, "loss_aux_layer_20": 0.1151123046875, "loss_aux_layer_21": 0.1236572265625, "loss_aux_layer_22": 0.144287109375, "loss_aux_layer_23": 0.1806640625, "loss_aux_layer_3": 0.04742431640625, "loss_aux_layer_4": 0.0494384765625, "loss_aux_layer_5": 0.05059814453125, "loss_aux_layer_6": 0.0531005859375, "loss_aux_layer_7": 0.0513916015625, "loss_aux_layer_8": 0.0509033203125, "loss_aux_layer_9": 0.0498046875, "step": 4250, "total_loss": 0.6305393576622009 }, { "epoch": 0.8416155216788754, "grad_norm": 0.9988276958465576, "learning_rate": 5e-05, "llm_loss": 0.5273111090064049, "loss": 2.4136, "loss_aux_layer_0": 0.0122833251953125, "loss_aux_layer_1": 0.028656005859375, "loss_aux_layer_10": 0.05419921875, "loss_aux_layer_11": 0.057861328125, "loss_aux_layer_12": 0.061767578125, "loss_aux_layer_13": 0.06634521484375, "loss_aux_layer_14": 0.07373046875, "loss_aux_layer_15": 0.0814208984375, "loss_aux_layer_16": 0.0897216796875, "loss_aux_layer_17": 0.0970458984375, "loss_aux_layer_18": 0.1055908203125, "loss_aux_layer_19": 0.109375, "loss_aux_layer_2": 0.04095458984375, "loss_aux_layer_20": 0.1177978515625, "loss_aux_layer_21": 0.126220703125, "loss_aux_layer_22": 0.14599609375, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.05010986328125, "loss_aux_layer_4": 0.05242919921875, "loss_aux_layer_5": 0.05389404296875, "loss_aux_layer_6": 0.056640625, "loss_aux_layer_7": 0.0548095703125, "loss_aux_layer_8": 0.0540771484375, "loss_aux_layer_9": 0.052978515625, "step": 4251, "total_loss": 0.6034021824598312 }, { "epoch": 0.8418135022767769, "grad_norm": 0.8955637812614441, "learning_rate": 5e-05, "llm_loss": 0.6288555562496185, "loss": 2.8345, "loss_aux_layer_0": 0.0116424560546875, "loss_aux_layer_1": 0.031005859375, "loss_aux_layer_10": 0.05706787109375, "loss_aux_layer_11": 0.06109619140625, "loss_aux_layer_12": 0.06549072265625, "loss_aux_layer_13": 0.07061767578125, "loss_aux_layer_14": 0.07861328125, "loss_aux_layer_15": 0.086669921875, "loss_aux_layer_16": 0.095703125, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.112060546875, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.04290771484375, "loss_aux_layer_20": 0.123046875, "loss_aux_layer_21": 0.1307373046875, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.05255126953125, "loss_aux_layer_4": 0.05487060546875, "loss_aux_layer_5": 0.05621337890625, "loss_aux_layer_6": 0.0589599609375, "loss_aux_layer_7": 0.0572509765625, "loss_aux_layer_8": 0.05694580078125, "loss_aux_layer_9": 0.0556640625, "step": 4252, "total_loss": 0.708620548248291 }, { "epoch": 0.8420114828746783, "grad_norm": 0.9617878794670105, "learning_rate": 5e-05, "llm_loss": 0.6262615770101547, "loss": 2.8199, "loss_aux_layer_0": 0.0110321044921875, "loss_aux_layer_1": 0.03070068359375, "loss_aux_layer_10": 0.05645751953125, "loss_aux_layer_11": 0.06011962890625, "loss_aux_layer_12": 0.064208984375, "loss_aux_layer_13": 0.0689697265625, "loss_aux_layer_14": 0.07666015625, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.0927734375, "loss_aux_layer_17": 0.10009765625, "loss_aux_layer_18": 0.1085205078125, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.04278564453125, "loss_aux_layer_20": 0.120849609375, "loss_aux_layer_21": 0.1290283203125, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.052490234375, "loss_aux_layer_4": 0.054931640625, "loss_aux_layer_5": 0.0565185546875, "loss_aux_layer_6": 0.05938720703125, "loss_aux_layer_7": 0.0572509765625, "loss_aux_layer_8": 0.05657958984375, "loss_aux_layer_9": 0.05535888671875, "step": 4253, "total_loss": 0.7049762308597565 }, { "epoch": 0.8422094634725796, "grad_norm": 0.7524687051773071, "learning_rate": 5e-05, "llm_loss": 0.6405338048934937, "loss": 2.8788, "loss_aux_layer_0": 0.0110321044921875, "loss_aux_layer_1": 0.0294189453125, "loss_aux_layer_10": 0.0565185546875, "loss_aux_layer_11": 0.06024169921875, "loss_aux_layer_12": 0.06439208984375, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.078125, "loss_aux_layer_15": 0.086181640625, "loss_aux_layer_16": 0.0948486328125, "loss_aux_layer_17": 0.1025390625, "loss_aux_layer_18": 0.11083984375, "loss_aux_layer_19": 0.11474609375, "loss_aux_layer_2": 0.04205322265625, "loss_aux_layer_20": 0.1221923828125, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.05169677734375, "loss_aux_layer_4": 0.053955078125, "loss_aux_layer_5": 0.0555419921875, "loss_aux_layer_6": 0.0584716796875, "loss_aux_layer_7": 0.05670166015625, "loss_aux_layer_8": 0.0562744140625, "loss_aux_layer_9": 0.05535888671875, "step": 4254, "total_loss": 0.719690203666687 }, { "epoch": 0.8424074440704811, "grad_norm": 0.8159418106079102, "learning_rate": 5e-05, "llm_loss": 0.4833555668592453, "loss": 2.2393, "loss_aux_layer_0": 0.0110015869140625, "loss_aux_layer_1": 0.028411865234375, "loss_aux_layer_10": 0.05303955078125, "loss_aux_layer_11": 0.05670166015625, "loss_aux_layer_12": 0.0606689453125, "loss_aux_layer_13": 0.065673828125, "loss_aux_layer_14": 0.0740966796875, "loss_aux_layer_15": 0.0826416015625, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.10009765625, "loss_aux_layer_18": 0.1082763671875, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.039794921875, "loss_aux_layer_20": 0.1199951171875, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.0489501953125, "loss_aux_layer_4": 0.05126953125, "loss_aux_layer_5": 0.052734375, "loss_aux_layer_6": 0.05548095703125, "loss_aux_layer_7": 0.05364990234375, "loss_aux_layer_8": 0.052978515625, "loss_aux_layer_9": 0.0517578125, "step": 4255, "total_loss": 0.5598195269703865 }, { "epoch": 0.8426054246683825, "grad_norm": 0.8217617273330688, "learning_rate": 5e-05, "llm_loss": 0.5963388830423355, "loss": 2.6841, "loss_aux_layer_0": 0.0109710693359375, "loss_aux_layer_1": 0.027557373046875, "loss_aux_layer_10": 0.0521240234375, "loss_aux_layer_11": 0.05560302734375, "loss_aux_layer_12": 0.0596923828125, "loss_aux_layer_13": 0.06475830078125, "loss_aux_layer_14": 0.0732421875, "loss_aux_layer_15": 0.0814208984375, "loss_aux_layer_16": 0.090576171875, "loss_aux_layer_17": 0.0987548828125, "loss_aux_layer_18": 0.1068115234375, "loss_aux_layer_19": 0.1107177734375, "loss_aux_layer_2": 0.0382080078125, "loss_aux_layer_20": 0.1185302734375, "loss_aux_layer_21": 0.1253662109375, "loss_aux_layer_22": 0.144287109375, "loss_aux_layer_23": 0.17822265625, "loss_aux_layer_3": 0.04705810546875, "loss_aux_layer_4": 0.04937744140625, "loss_aux_layer_5": 0.05078125, "loss_aux_layer_6": 0.05364990234375, "loss_aux_layer_7": 0.0518798828125, "loss_aux_layer_8": 0.05169677734375, "loss_aux_layer_9": 0.05084228515625, "step": 4256, "total_loss": 0.6710285693407059 }, { "epoch": 0.842803405266284, "grad_norm": 0.8934958577156067, "learning_rate": 5e-05, "llm_loss": 0.5807408690452576, "loss": 2.6323, "loss_aux_layer_0": 0.010711669921875, "loss_aux_layer_1": 0.0286865234375, "loss_aux_layer_10": 0.0543212890625, "loss_aux_layer_11": 0.058349609375, "loss_aux_layer_12": 0.06256103515625, "loss_aux_layer_13": 0.067626953125, "loss_aux_layer_14": 0.075927734375, "loss_aux_layer_15": 0.083740234375, "loss_aux_layer_16": 0.0926513671875, "loss_aux_layer_17": 0.1004638671875, "loss_aux_layer_18": 0.109130859375, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.040283203125, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.049560546875, "loss_aux_layer_4": 0.05181884765625, "loss_aux_layer_5": 0.05316162109375, "loss_aux_layer_6": 0.05596923828125, "loss_aux_layer_7": 0.0543212890625, "loss_aux_layer_8": 0.0538330078125, "loss_aux_layer_9": 0.052978515625, "step": 4257, "total_loss": 0.6580862551927567 }, { "epoch": 0.8430013858641853, "grad_norm": 0.854228138923645, "learning_rate": 5e-05, "llm_loss": 0.5468542277812958, "loss": 2.4892, "loss_aux_layer_0": 0.0115203857421875, "loss_aux_layer_1": 0.029144287109375, "loss_aux_layer_10": 0.0528564453125, "loss_aux_layer_11": 0.056640625, "loss_aux_layer_12": 0.06072998046875, "loss_aux_layer_13": 0.0655517578125, "loss_aux_layer_14": 0.0731201171875, "loss_aux_layer_15": 0.081298828125, "loss_aux_layer_16": 0.090087890625, "loss_aux_layer_17": 0.098388671875, "loss_aux_layer_18": 0.1060791015625, "loss_aux_layer_19": 0.1094970703125, "loss_aux_layer_2": 0.04058837890625, "loss_aux_layer_20": 0.1173095703125, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.14404296875, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.04949951171875, "loss_aux_layer_4": 0.05169677734375, "loss_aux_layer_5": 0.05303955078125, "loss_aux_layer_6": 0.0555419921875, "loss_aux_layer_7": 0.05364990234375, "loss_aux_layer_8": 0.05316162109375, "loss_aux_layer_9": 0.05194091796875, "step": 4258, "total_loss": 0.6222987473011017 }, { "epoch": 0.8431993664620867, "grad_norm": 0.9797157049179077, "learning_rate": 5e-05, "llm_loss": 0.5915590226650238, "loss": 2.6938, "loss_aux_layer_0": 0.0108795166015625, "loss_aux_layer_1": 0.031768798828125, "loss_aux_layer_10": 0.0594482421875, "loss_aux_layer_11": 0.06353759765625, "loss_aux_layer_12": 0.0677490234375, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.08154296875, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.098876953125, "loss_aux_layer_17": 0.1060791015625, "loss_aux_layer_18": 0.1142578125, "loss_aux_layer_19": 0.11669921875, "loss_aux_layer_2": 0.044921875, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.05499267578125, "loss_aux_layer_4": 0.05731201171875, "loss_aux_layer_5": 0.05902099609375, "loss_aux_layer_6": 0.06195068359375, "loss_aux_layer_7": 0.0601806640625, "loss_aux_layer_8": 0.05938720703125, "loss_aux_layer_9": 0.05828857421875, "step": 4259, "total_loss": 0.6734504550695419 }, { "epoch": 0.8433973470599881, "grad_norm": 0.8417383432388306, "learning_rate": 5e-05, "llm_loss": 0.5592414066195488, "loss": 2.5483, "loss_aux_layer_0": 0.011474609375, "loss_aux_layer_1": 0.02911376953125, "loss_aux_layer_10": 0.0552978515625, "loss_aux_layer_11": 0.05914306640625, "loss_aux_layer_12": 0.06317138671875, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.076171875, "loss_aux_layer_15": 0.0843505859375, "loss_aux_layer_16": 0.093017578125, "loss_aux_layer_17": 0.1009521484375, "loss_aux_layer_18": 0.109375, "loss_aux_layer_19": 0.11279296875, "loss_aux_layer_2": 0.04071044921875, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.05010986328125, "loss_aux_layer_4": 0.05267333984375, "loss_aux_layer_5": 0.05426025390625, "loss_aux_layer_6": 0.0570068359375, "loss_aux_layer_7": 0.05523681640625, "loss_aux_layer_8": 0.0546875, "loss_aux_layer_9": 0.05364990234375, "step": 4260, "total_loss": 0.6370684653520584 }, { "epoch": 0.8435953276578896, "grad_norm": 1.0209128856658936, "learning_rate": 5e-05, "llm_loss": 0.6168234497308731, "loss": 2.7833, "loss_aux_layer_0": 0.0110321044921875, "loss_aux_layer_1": 0.02960205078125, "loss_aux_layer_10": 0.0562744140625, "loss_aux_layer_11": 0.060302734375, "loss_aux_layer_12": 0.065185546875, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.0869140625, "loss_aux_layer_16": 0.095703125, "loss_aux_layer_17": 0.1033935546875, "loss_aux_layer_18": 0.1109619140625, "loss_aux_layer_19": 0.1143798828125, "loss_aux_layer_2": 0.041748046875, "loss_aux_layer_20": 0.121826171875, "loss_aux_layer_21": 0.1292724609375, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.05133056640625, "loss_aux_layer_4": 0.05401611328125, "loss_aux_layer_5": 0.05572509765625, "loss_aux_layer_6": 0.0584716796875, "loss_aux_layer_7": 0.0567626953125, "loss_aux_layer_8": 0.0560302734375, "loss_aux_layer_9": 0.05499267578125, "step": 4261, "total_loss": 0.695832148194313 }, { "epoch": 0.8437933082557909, "grad_norm": 0.751071035861969, "learning_rate": 5e-05, "llm_loss": 0.5170708000659943, "loss": 2.3959, "loss_aux_layer_0": 0.0113372802734375, "loss_aux_layer_1": 0.029937744140625, "loss_aux_layer_10": 0.0574951171875, "loss_aux_layer_11": 0.061279296875, "loss_aux_layer_12": 0.065673828125, "loss_aux_layer_13": 0.071533203125, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.089111328125, "loss_aux_layer_16": 0.0987548828125, "loss_aux_layer_17": 0.106689453125, "loss_aux_layer_18": 0.1156005859375, "loss_aux_layer_19": 0.1195068359375, "loss_aux_layer_2": 0.04302978515625, "loss_aux_layer_20": 0.1280517578125, "loss_aux_layer_21": 0.13671875, "loss_aux_layer_22": 0.158203125, "loss_aux_layer_23": 0.1962890625, "loss_aux_layer_3": 0.05267333984375, "loss_aux_layer_4": 0.05511474609375, "loss_aux_layer_5": 0.05682373046875, "loss_aux_layer_6": 0.059814453125, "loss_aux_layer_7": 0.05810546875, "loss_aux_layer_8": 0.0574951171875, "loss_aux_layer_9": 0.05621337890625, "step": 4262, "total_loss": 0.5989769548177719 }, { "epoch": 0.8439912888536923, "grad_norm": 1.0040743350982666, "learning_rate": 5e-05, "llm_loss": 0.5800875723361969, "loss": 2.647, "loss_aux_layer_0": 0.0107421875, "loss_aux_layer_1": 0.03094482421875, "loss_aux_layer_10": 0.05889892578125, "loss_aux_layer_11": 0.0628662109375, "loss_aux_layer_12": 0.0672607421875, "loss_aux_layer_13": 0.07275390625, "loss_aux_layer_14": 0.0811767578125, "loss_aux_layer_15": 0.08935546875, "loss_aux_layer_16": 0.0985107421875, "loss_aux_layer_17": 0.10595703125, "loss_aux_layer_18": 0.1138916015625, "loss_aux_layer_19": 0.1168212890625, "loss_aux_layer_2": 0.044677734375, "loss_aux_layer_20": 0.1246337890625, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.0543212890625, "loss_aux_layer_4": 0.05682373046875, "loss_aux_layer_5": 0.05841064453125, "loss_aux_layer_6": 0.0611572265625, "loss_aux_layer_7": 0.05926513671875, "loss_aux_layer_8": 0.0584716796875, "loss_aux_layer_9": 0.0574951171875, "step": 4263, "total_loss": 0.6617550700902939 }, { "epoch": 0.8441892694515938, "grad_norm": 0.90153968334198, "learning_rate": 5e-05, "llm_loss": 0.5893066972494125, "loss": 2.6728, "loss_aux_layer_0": 0.0113525390625, "loss_aux_layer_1": 0.029052734375, "loss_aux_layer_10": 0.05633544921875, "loss_aux_layer_11": 0.0601806640625, "loss_aux_layer_12": 0.06463623046875, "loss_aux_layer_13": 0.0699462890625, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.0865478515625, "loss_aux_layer_16": 0.0953369140625, "loss_aux_layer_17": 0.103759765625, "loss_aux_layer_18": 0.111572265625, "loss_aux_layer_19": 0.1153564453125, "loss_aux_layer_2": 0.0406494140625, "loss_aux_layer_20": 0.123046875, "loss_aux_layer_21": 0.1304931640625, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.05010986328125, "loss_aux_layer_4": 0.052978515625, "loss_aux_layer_5": 0.05474853515625, "loss_aux_layer_6": 0.05755615234375, "loss_aux_layer_7": 0.055908203125, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.05511474609375, "step": 4264, "total_loss": 0.6682040691375732 }, { "epoch": 0.8443872500494951, "grad_norm": 1.1256530284881592, "learning_rate": 5e-05, "llm_loss": 0.4989155903458595, "loss": 2.3186, "loss_aux_layer_0": 0.0107421875, "loss_aux_layer_1": 0.0299072265625, "loss_aux_layer_10": 0.0576171875, "loss_aux_layer_11": 0.06146240234375, "loss_aux_layer_12": 0.0657958984375, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.07958984375, "loss_aux_layer_15": 0.0882568359375, "loss_aux_layer_16": 0.0975341796875, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.11376953125, "loss_aux_layer_19": 0.1170654296875, "loss_aux_layer_2": 0.04241943359375, "loss_aux_layer_20": 0.1243896484375, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.05206298828125, "loss_aux_layer_4": 0.0546875, "loss_aux_layer_5": 0.0562744140625, "loss_aux_layer_6": 0.05926513671875, "loss_aux_layer_7": 0.0574951171875, "loss_aux_layer_8": 0.05712890625, "loss_aux_layer_9": 0.05621337890625, "step": 4265, "total_loss": 0.5796507820487022 }, { "epoch": 0.8445852306473965, "grad_norm": 0.8056372404098511, "learning_rate": 5e-05, "llm_loss": 0.537857785820961, "loss": 2.4657, "loss_aux_layer_0": 0.011810302734375, "loss_aux_layer_1": 0.029815673828125, "loss_aux_layer_10": 0.05615234375, "loss_aux_layer_11": 0.06005859375, "loss_aux_layer_12": 0.0638427734375, "loss_aux_layer_13": 0.069091796875, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.0936279296875, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.10888671875, "loss_aux_layer_19": 0.1124267578125, "loss_aux_layer_2": 0.0416259765625, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.0511474609375, "loss_aux_layer_4": 0.0538330078125, "loss_aux_layer_5": 0.05523681640625, "loss_aux_layer_6": 0.05828857421875, "loss_aux_layer_7": 0.05670166015625, "loss_aux_layer_8": 0.0560302734375, "loss_aux_layer_9": 0.05499267578125, "step": 4266, "total_loss": 0.6164219379425049 }, { "epoch": 0.844783211245298, "grad_norm": 0.9101935029029846, "learning_rate": 5e-05, "llm_loss": 0.5632009878754616, "loss": 2.5763, "loss_aux_layer_0": 0.0108489990234375, "loss_aux_layer_1": 0.031036376953125, "loss_aux_layer_10": 0.05755615234375, "loss_aux_layer_11": 0.06158447265625, "loss_aux_layer_12": 0.06610107421875, "loss_aux_layer_13": 0.0718994140625, "loss_aux_layer_14": 0.080322265625, "loss_aux_layer_15": 0.0887451171875, "loss_aux_layer_16": 0.09814453125, "loss_aux_layer_17": 0.105712890625, "loss_aux_layer_18": 0.114013671875, "loss_aux_layer_19": 0.1171875, "loss_aux_layer_2": 0.0430908203125, "loss_aux_layer_20": 0.1246337890625, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.05267333984375, "loss_aux_layer_4": 0.0552978515625, "loss_aux_layer_5": 0.05670166015625, "loss_aux_layer_6": 0.05963134765625, "loss_aux_layer_7": 0.0577392578125, "loss_aux_layer_8": 0.05731201171875, "loss_aux_layer_9": 0.0562744140625, "step": 4267, "total_loss": 0.6440640240907669 }, { "epoch": 0.8449811918431994, "grad_norm": 0.881014883518219, "learning_rate": 5e-05, "llm_loss": 0.5147667899727821, "loss": 2.3682, "loss_aux_layer_0": 0.012054443359375, "loss_aux_layer_1": 0.029937744140625, "loss_aux_layer_10": 0.0538330078125, "loss_aux_layer_11": 0.05767822265625, "loss_aux_layer_12": 0.0621337890625, "loss_aux_layer_13": 0.0675048828125, "loss_aux_layer_14": 0.0753173828125, "loss_aux_layer_15": 0.083740234375, "loss_aux_layer_16": 0.0927734375, "loss_aux_layer_17": 0.1005859375, "loss_aux_layer_18": 0.1085205078125, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.0408935546875, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.1287841796875, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.0496826171875, "loss_aux_layer_4": 0.052001953125, "loss_aux_layer_5": 0.053466796875, "loss_aux_layer_6": 0.0560302734375, "loss_aux_layer_7": 0.05426025390625, "loss_aux_layer_8": 0.0537109375, "loss_aux_layer_9": 0.0526123046875, "step": 4268, "total_loss": 0.5920419842004776 }, { "epoch": 0.8451791724411007, "grad_norm": 0.8133202195167542, "learning_rate": 5e-05, "llm_loss": 0.5656059980392456, "loss": 2.5844, "loss_aux_layer_0": 0.010650634765625, "loss_aux_layer_1": 0.03076171875, "loss_aux_layer_10": 0.05816650390625, "loss_aux_layer_11": 0.06201171875, "loss_aux_layer_12": 0.06622314453125, "loss_aux_layer_13": 0.071533203125, "loss_aux_layer_14": 0.0797119140625, "loss_aux_layer_15": 0.0877685546875, "loss_aux_layer_16": 0.0966796875, "loss_aux_layer_17": 0.1041259765625, "loss_aux_layer_18": 0.1121826171875, "loss_aux_layer_19": 0.11572265625, "loss_aux_layer_2": 0.0435791015625, "loss_aux_layer_20": 0.123046875, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.0535888671875, "loss_aux_layer_4": 0.0562744140625, "loss_aux_layer_5": 0.05792236328125, "loss_aux_layer_6": 0.06060791015625, "loss_aux_layer_7": 0.058837890625, "loss_aux_layer_8": 0.05810546875, "loss_aux_layer_9": 0.05706787109375, "step": 4269, "total_loss": 0.6460973918437958 }, { "epoch": 0.8453771530390022, "grad_norm": 0.9443960189819336, "learning_rate": 5e-05, "llm_loss": 0.5832876265048981, "loss": 2.6511, "loss_aux_layer_0": 0.01202392578125, "loss_aux_layer_1": 0.030548095703125, "loss_aux_layer_10": 0.0576171875, "loss_aux_layer_11": 0.06134033203125, "loss_aux_layer_12": 0.0655517578125, "loss_aux_layer_13": 0.0704345703125, "loss_aux_layer_14": 0.078369140625, "loss_aux_layer_15": 0.0859375, "loss_aux_layer_16": 0.0943603515625, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.109619140625, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.0426025390625, "loss_aux_layer_20": 0.120849609375, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.05267333984375, "loss_aux_layer_4": 0.0552978515625, "loss_aux_layer_5": 0.056884765625, "loss_aux_layer_6": 0.05987548828125, "loss_aux_layer_7": 0.05810546875, "loss_aux_layer_8": 0.05755615234375, "loss_aux_layer_9": 0.05645751953125, "step": 4270, "total_loss": 0.6627762615680695 }, { "epoch": 0.8455751336369036, "grad_norm": 0.8878402709960938, "learning_rate": 5e-05, "llm_loss": 0.5546010881662369, "loss": 2.5358, "loss_aux_layer_0": 0.0106201171875, "loss_aux_layer_1": 0.0296630859375, "loss_aux_layer_10": 0.05615234375, "loss_aux_layer_11": 0.0596923828125, "loss_aux_layer_12": 0.06396484375, "loss_aux_layer_13": 0.069091796875, "loss_aux_layer_14": 0.0775146484375, "loss_aux_layer_15": 0.0860595703125, "loss_aux_layer_16": 0.0953369140625, "loss_aux_layer_17": 0.1033935546875, "loss_aux_layer_18": 0.11181640625, "loss_aux_layer_19": 0.1156005859375, "loss_aux_layer_2": 0.041259765625, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.0509033203125, "loss_aux_layer_4": 0.0535888671875, "loss_aux_layer_5": 0.05548095703125, "loss_aux_layer_6": 0.05828857421875, "loss_aux_layer_7": 0.0562744140625, "loss_aux_layer_8": 0.05596923828125, "loss_aux_layer_9": 0.05487060546875, "step": 4271, "total_loss": 0.6339432150125504 }, { "epoch": 0.845773114234805, "grad_norm": 0.9135959148406982, "learning_rate": 5e-05, "llm_loss": 0.5181100070476532, "loss": 2.3746, "loss_aux_layer_0": 0.01153564453125, "loss_aux_layer_1": 0.027679443359375, "loss_aux_layer_10": 0.05218505859375, "loss_aux_layer_11": 0.055908203125, "loss_aux_layer_12": 0.05987548828125, "loss_aux_layer_13": 0.06500244140625, "loss_aux_layer_14": 0.072998046875, "loss_aux_layer_15": 0.0814208984375, "loss_aux_layer_16": 0.090576171875, "loss_aux_layer_17": 0.0987548828125, "loss_aux_layer_18": 0.10693359375, "loss_aux_layer_19": 0.111572265625, "loss_aux_layer_2": 0.03912353515625, "loss_aux_layer_20": 0.11962890625, "loss_aux_layer_21": 0.1280517578125, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.04766845703125, "loss_aux_layer_4": 0.0496826171875, "loss_aux_layer_5": 0.051025390625, "loss_aux_layer_6": 0.05364990234375, "loss_aux_layer_7": 0.05218505859375, "loss_aux_layer_8": 0.05181884765625, "loss_aux_layer_9": 0.05096435546875, "step": 4272, "total_loss": 0.5936457514762878 }, { "epoch": 0.8459710948327064, "grad_norm": 0.9985905289649963, "learning_rate": 5e-05, "llm_loss": 0.5744135081768036, "loss": 2.6185, "loss_aux_layer_0": 0.010528564453125, "loss_aux_layer_1": 0.02984619140625, "loss_aux_layer_10": 0.05712890625, "loss_aux_layer_11": 0.0611572265625, "loss_aux_layer_12": 0.0657958984375, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.079833984375, "loss_aux_layer_15": 0.088134765625, "loss_aux_layer_16": 0.09716796875, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.11279296875, "loss_aux_layer_19": 0.115966796875, "loss_aux_layer_2": 0.04205322265625, "loss_aux_layer_20": 0.1231689453125, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.05181884765625, "loss_aux_layer_4": 0.0545654296875, "loss_aux_layer_5": 0.05633544921875, "loss_aux_layer_6": 0.05926513671875, "loss_aux_layer_7": 0.057861328125, "loss_aux_layer_8": 0.0572509765625, "loss_aux_layer_9": 0.0560302734375, "step": 4273, "total_loss": 0.6546344459056854 }, { "epoch": 0.8461690754306078, "grad_norm": 0.9093592166900635, "learning_rate": 5e-05, "llm_loss": 0.557990625500679, "loss": 2.5405, "loss_aux_layer_0": 0.011505126953125, "loss_aux_layer_1": 0.0289306640625, "loss_aux_layer_10": 0.0538330078125, "loss_aux_layer_11": 0.0576171875, "loss_aux_layer_12": 0.06158447265625, "loss_aux_layer_13": 0.06689453125, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.0836181640625, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.109375, "loss_aux_layer_19": 0.11376953125, "loss_aux_layer_2": 0.03997802734375, "loss_aux_layer_20": 0.1214599609375, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.04901123046875, "loss_aux_layer_4": 0.05120849609375, "loss_aux_layer_5": 0.0526123046875, "loss_aux_layer_6": 0.05517578125, "loss_aux_layer_7": 0.053466796875, "loss_aux_layer_8": 0.05340576171875, "loss_aux_layer_9": 0.05255126953125, "step": 4274, "total_loss": 0.6351262629032135 }, { "epoch": 0.8463670560285093, "grad_norm": 0.8513738512992859, "learning_rate": 5e-05, "llm_loss": 0.5240290686488152, "loss": 2.4131, "loss_aux_layer_0": 0.01055908203125, "loss_aux_layer_1": 0.02911376953125, "loss_aux_layer_10": 0.0560302734375, "loss_aux_layer_11": 0.05999755859375, "loss_aux_layer_12": 0.06463623046875, "loss_aux_layer_13": 0.07000732421875, "loss_aux_layer_14": 0.078857421875, "loss_aux_layer_15": 0.0875244140625, "loss_aux_layer_16": 0.0968017578125, "loss_aux_layer_17": 0.1043701171875, "loss_aux_layer_18": 0.112548828125, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.040771484375, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.1314697265625, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.05023193359375, "loss_aux_layer_4": 0.05267333984375, "loss_aux_layer_5": 0.05419921875, "loss_aux_layer_6": 0.05718994140625, "loss_aux_layer_7": 0.05584716796875, "loss_aux_layer_8": 0.0557861328125, "loss_aux_layer_9": 0.05474853515625, "step": 4275, "total_loss": 0.6032792329788208 }, { "epoch": 0.8465650366264106, "grad_norm": 1.072567105293274, "learning_rate": 5e-05, "llm_loss": 0.6057659238576889, "loss": 2.7276, "loss_aux_layer_0": 0.0123748779296875, "loss_aux_layer_1": 0.027984619140625, "loss_aux_layer_10": 0.05279541015625, "loss_aux_layer_11": 0.0565185546875, "loss_aux_layer_12": 0.06048583984375, "loss_aux_layer_13": 0.06561279296875, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.0823974609375, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.10009765625, "loss_aux_layer_18": 0.1080322265625, "loss_aux_layer_19": 0.1121826171875, "loss_aux_layer_2": 0.0390625, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.04791259765625, "loss_aux_layer_4": 0.05029296875, "loss_aux_layer_5": 0.0518798828125, "loss_aux_layer_6": 0.05462646484375, "loss_aux_layer_7": 0.05291748046875, "loss_aux_layer_8": 0.052490234375, "loss_aux_layer_9": 0.05157470703125, "step": 4276, "total_loss": 0.6819120496511459 }, { "epoch": 0.846763017224312, "grad_norm": 0.8066286444664001, "learning_rate": 5e-05, "llm_loss": 0.5994067043066025, "loss": 2.7009, "loss_aux_layer_0": 0.0108795166015625, "loss_aux_layer_1": 0.028289794921875, "loss_aux_layer_10": 0.05255126953125, "loss_aux_layer_11": 0.05615234375, "loss_aux_layer_12": 0.06036376953125, "loss_aux_layer_13": 0.06585693359375, "loss_aux_layer_14": 0.0738525390625, "loss_aux_layer_15": 0.081787109375, "loss_aux_layer_16": 0.0906982421875, "loss_aux_layer_17": 0.0985107421875, "loss_aux_layer_18": 0.107421875, "loss_aux_layer_19": 0.111328125, "loss_aux_layer_2": 0.03887939453125, "loss_aux_layer_20": 0.1197509765625, "loss_aux_layer_21": 0.1280517578125, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.04803466796875, "loss_aux_layer_4": 0.0504150390625, "loss_aux_layer_5": 0.05206298828125, "loss_aux_layer_6": 0.05499267578125, "loss_aux_layer_7": 0.05322265625, "loss_aux_layer_8": 0.052490234375, "loss_aux_layer_9": 0.0513916015625, "step": 4277, "total_loss": 0.6752258986234665 }, { "epoch": 0.8469609978222135, "grad_norm": 0.9694457650184631, "learning_rate": 5e-05, "llm_loss": 0.5821601003408432, "loss": 2.6318, "loss_aux_layer_0": 0.011749267578125, "loss_aux_layer_1": 0.02899169921875, "loss_aux_layer_10": 0.05267333984375, "loss_aux_layer_11": 0.05645751953125, "loss_aux_layer_12": 0.0609130859375, "loss_aux_layer_13": 0.066162109375, "loss_aux_layer_14": 0.07421875, "loss_aux_layer_15": 0.0821533203125, "loss_aux_layer_16": 0.09130859375, "loss_aux_layer_17": 0.099609375, "loss_aux_layer_18": 0.107666015625, "loss_aux_layer_19": 0.1109619140625, "loss_aux_layer_2": 0.03948974609375, "loss_aux_layer_20": 0.11865234375, "loss_aux_layer_21": 0.126708984375, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.18212890625, "loss_aux_layer_3": 0.04852294921875, "loss_aux_layer_4": 0.050537109375, "loss_aux_layer_5": 0.05194091796875, "loss_aux_layer_6": 0.05462646484375, "loss_aux_layer_7": 0.0528564453125, "loss_aux_layer_8": 0.05230712890625, "loss_aux_layer_9": 0.0513916015625, "step": 4278, "total_loss": 0.657940149307251 }, { "epoch": 0.8471589784201148, "grad_norm": 1.0050485134124756, "learning_rate": 5e-05, "llm_loss": 0.6295728385448456, "loss": 2.8351, "loss_aux_layer_0": 0.0119781494140625, "loss_aux_layer_1": 0.029266357421875, "loss_aux_layer_10": 0.05615234375, "loss_aux_layer_11": 0.05999755859375, "loss_aux_layer_12": 0.06427001953125, "loss_aux_layer_13": 0.06939697265625, "loss_aux_layer_14": 0.077880859375, "loss_aux_layer_15": 0.0860595703125, "loss_aux_layer_16": 0.095458984375, "loss_aux_layer_17": 0.1033935546875, "loss_aux_layer_18": 0.1116943359375, "loss_aux_layer_19": 0.11572265625, "loss_aux_layer_2": 0.04119873046875, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.0504150390625, "loss_aux_layer_4": 0.052978515625, "loss_aux_layer_5": 0.05487060546875, "loss_aux_layer_6": 0.0579833984375, "loss_aux_layer_7": 0.0562744140625, "loss_aux_layer_8": 0.05596923828125, "loss_aux_layer_9": 0.05487060546875, "step": 4279, "total_loss": 0.7087811976671219 }, { "epoch": 0.8473569590180162, "grad_norm": 0.9792240262031555, "learning_rate": 5e-05, "llm_loss": 0.5803027004003525, "loss": 2.6356, "loss_aux_layer_0": 0.0123748779296875, "loss_aux_layer_1": 0.0303955078125, "loss_aux_layer_10": 0.05517578125, "loss_aux_layer_11": 0.0589599609375, "loss_aux_layer_12": 0.06317138671875, "loss_aux_layer_13": 0.06866455078125, "loss_aux_layer_14": 0.0771484375, "loss_aux_layer_15": 0.085693359375, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.1031494140625, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.04193115234375, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.18603515625, "loss_aux_layer_3": 0.0511474609375, "loss_aux_layer_4": 0.05340576171875, "loss_aux_layer_5": 0.05462646484375, "loss_aux_layer_6": 0.05718994140625, "loss_aux_layer_7": 0.0552978515625, "loss_aux_layer_8": 0.05474853515625, "loss_aux_layer_9": 0.0537109375, "step": 4280, "total_loss": 0.6589072197675705 }, { "epoch": 0.8475549396159177, "grad_norm": 0.8879914283752441, "learning_rate": 5e-05, "llm_loss": 0.6313713639974594, "loss": 2.8364, "loss_aux_layer_0": 0.012420654296875, "loss_aux_layer_1": 0.0296630859375, "loss_aux_layer_10": 0.05517578125, "loss_aux_layer_11": 0.05889892578125, "loss_aux_layer_12": 0.06292724609375, "loss_aux_layer_13": 0.067626953125, "loss_aux_layer_14": 0.0755615234375, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.1002197265625, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.1124267578125, "loss_aux_layer_2": 0.0408935546875, "loss_aux_layer_20": 0.1207275390625, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.05010986328125, "loss_aux_layer_4": 0.052734375, "loss_aux_layer_5": 0.054443359375, "loss_aux_layer_6": 0.05706787109375, "loss_aux_layer_7": 0.055419921875, "loss_aux_layer_8": 0.05511474609375, "loss_aux_layer_9": 0.0540771484375, "step": 4281, "total_loss": 0.7090974003076553 }, { "epoch": 0.8477529202138191, "grad_norm": 0.9844169020652771, "learning_rate": 5e-05, "llm_loss": 0.5582506358623505, "loss": 2.544, "loss_aux_layer_0": 0.0123748779296875, "loss_aux_layer_1": 0.02947998046875, "loss_aux_layer_10": 0.05596923828125, "loss_aux_layer_11": 0.05975341796875, "loss_aux_layer_12": 0.06390380859375, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.0771484375, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.10205078125, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.1124267578125, "loss_aux_layer_2": 0.04071044921875, "loss_aux_layer_20": 0.1197509765625, "loss_aux_layer_21": 0.12646484375, "loss_aux_layer_22": 0.14599609375, "loss_aux_layer_23": 0.1806640625, "loss_aux_layer_3": 0.05023193359375, "loss_aux_layer_4": 0.05291748046875, "loss_aux_layer_5": 0.054443359375, "loss_aux_layer_6": 0.0574951171875, "loss_aux_layer_7": 0.055908203125, "loss_aux_layer_8": 0.05548095703125, "loss_aux_layer_9": 0.05450439453125, "step": 4282, "total_loss": 0.635993704199791 }, { "epoch": 0.8479509008117204, "grad_norm": 0.9092823266983032, "learning_rate": 5e-05, "llm_loss": 0.5481690913438797, "loss": 2.4975, "loss_aux_layer_0": 0.0115509033203125, "loss_aux_layer_1": 0.029052734375, "loss_aux_layer_10": 0.054443359375, "loss_aux_layer_11": 0.058349609375, "loss_aux_layer_12": 0.06231689453125, "loss_aux_layer_13": 0.067626953125, "loss_aux_layer_14": 0.075439453125, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.0914306640625, "loss_aux_layer_17": 0.0989990234375, "loss_aux_layer_18": 0.10693359375, "loss_aux_layer_19": 0.1104736328125, "loss_aux_layer_2": 0.03973388671875, "loss_aux_layer_20": 0.1182861328125, "loss_aux_layer_21": 0.1256103515625, "loss_aux_layer_22": 0.14453125, "loss_aux_layer_23": 0.18017578125, "loss_aux_layer_3": 0.048828125, "loss_aux_layer_4": 0.051513671875, "loss_aux_layer_5": 0.05303955078125, "loss_aux_layer_6": 0.05572509765625, "loss_aux_layer_7": 0.0543212890625, "loss_aux_layer_8": 0.05401611328125, "loss_aux_layer_9": 0.05316162109375, "step": 4283, "total_loss": 0.624365359544754 }, { "epoch": 0.8481488814096219, "grad_norm": 1.0554144382476807, "learning_rate": 5e-05, "llm_loss": 0.6382254660129547, "loss": 2.8775, "loss_aux_layer_0": 0.0120697021484375, "loss_aux_layer_1": 0.0306396484375, "loss_aux_layer_10": 0.05865478515625, "loss_aux_layer_11": 0.06231689453125, "loss_aux_layer_12": 0.0667724609375, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.0802001953125, "loss_aux_layer_15": 0.088134765625, "loss_aux_layer_16": 0.0970458984375, "loss_aux_layer_17": 0.1048583984375, "loss_aux_layer_18": 0.1124267578125, "loss_aux_layer_19": 0.1160888671875, "loss_aux_layer_2": 0.04315185546875, "loss_aux_layer_20": 0.1239013671875, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.191162109375, "loss_aux_layer_3": 0.0526123046875, "loss_aux_layer_4": 0.055419921875, "loss_aux_layer_5": 0.05743408203125, "loss_aux_layer_6": 0.06072998046875, "loss_aux_layer_7": 0.05914306640625, "loss_aux_layer_8": 0.05853271484375, "loss_aux_layer_9": 0.05743408203125, "step": 4284, "total_loss": 0.7193653285503387 }, { "epoch": 0.8483468620075233, "grad_norm": 0.9348878264427185, "learning_rate": 5e-05, "llm_loss": 0.48756764829158783, "loss": 2.2685, "loss_aux_layer_0": 0.011383056640625, "loss_aux_layer_1": 0.029541015625, "loss_aux_layer_10": 0.0567626953125, "loss_aux_layer_11": 0.0606689453125, "loss_aux_layer_12": 0.06494140625, "loss_aux_layer_13": 0.0703125, "loss_aux_layer_14": 0.0780029296875, "loss_aux_layer_15": 0.0860595703125, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.11083984375, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.04205322265625, "loss_aux_layer_20": 0.1231689453125, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.05157470703125, "loss_aux_layer_4": 0.05413818359375, "loss_aux_layer_5": 0.055908203125, "loss_aux_layer_6": 0.0587158203125, "loss_aux_layer_7": 0.05706787109375, "loss_aux_layer_8": 0.0565185546875, "loss_aux_layer_9": 0.0555419921875, "step": 4285, "total_loss": 0.5671190991997719 }, { "epoch": 0.8485448426054246, "grad_norm": 1.0245137214660645, "learning_rate": 5e-05, "llm_loss": 0.6081757098436356, "loss": 2.7584, "loss_aux_layer_0": 0.0119171142578125, "loss_aux_layer_1": 0.031524658203125, "loss_aux_layer_10": 0.05914306640625, "loss_aux_layer_11": 0.063232421875, "loss_aux_layer_12": 0.0675048828125, "loss_aux_layer_13": 0.0731201171875, "loss_aux_layer_14": 0.0809326171875, "loss_aux_layer_15": 0.0885009765625, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.113037109375, "loss_aux_layer_19": 0.11572265625, "loss_aux_layer_2": 0.04400634765625, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.0538330078125, "loss_aux_layer_4": 0.0562744140625, "loss_aux_layer_5": 0.05792236328125, "loss_aux_layer_6": 0.061279296875, "loss_aux_layer_7": 0.05950927734375, "loss_aux_layer_8": 0.0589599609375, "loss_aux_layer_9": 0.05792236328125, "step": 4286, "total_loss": 0.6895902454853058 }, { "epoch": 0.8487428232033261, "grad_norm": 0.8049898743629456, "learning_rate": 5e-05, "llm_loss": 0.5543474704027176, "loss": 2.5313, "loss_aux_layer_0": 0.0109405517578125, "loss_aux_layer_1": 0.02935791015625, "loss_aux_layer_10": 0.05609130859375, "loss_aux_layer_11": 0.05987548828125, "loss_aux_layer_12": 0.06402587890625, "loss_aux_layer_13": 0.06927490234375, "loss_aux_layer_14": 0.0772705078125, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.0943603515625, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.1102294921875, "loss_aux_layer_19": 0.1136474609375, "loss_aux_layer_2": 0.0408935546875, "loss_aux_layer_20": 0.1214599609375, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.0509033203125, "loss_aux_layer_4": 0.05377197265625, "loss_aux_layer_5": 0.05560302734375, "loss_aux_layer_6": 0.05889892578125, "loss_aux_layer_7": 0.0567626953125, "loss_aux_layer_8": 0.05596923828125, "loss_aux_layer_9": 0.0548095703125, "step": 4287, "total_loss": 0.6328291147947311 }, { "epoch": 0.8489408038012275, "grad_norm": 0.9435349702835083, "learning_rate": 5e-05, "llm_loss": 0.6071041077375412, "loss": 2.751, "loss_aux_layer_0": 0.012176513671875, "loss_aux_layer_1": 0.030914306640625, "loss_aux_layer_10": 0.05810546875, "loss_aux_layer_11": 0.06207275390625, "loss_aux_layer_12": 0.0665283203125, "loss_aux_layer_13": 0.072021484375, "loss_aux_layer_14": 0.08056640625, "loss_aux_layer_15": 0.0887451171875, "loss_aux_layer_16": 0.0982666015625, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.114013671875, "loss_aux_layer_19": 0.1163330078125, "loss_aux_layer_2": 0.04266357421875, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.05303955078125, "loss_aux_layer_4": 0.055419921875, "loss_aux_layer_5": 0.05712890625, "loss_aux_layer_6": 0.06005859375, "loss_aux_layer_7": 0.058349609375, "loss_aux_layer_8": 0.057861328125, "loss_aux_layer_9": 0.0567626953125, "step": 4288, "total_loss": 0.6877583712339401 }, { "epoch": 0.8491387843991289, "grad_norm": 0.8265694379806519, "learning_rate": 5e-05, "llm_loss": 0.5418392270803452, "loss": 2.488, "loss_aux_layer_0": 0.0106658935546875, "loss_aux_layer_1": 0.03076171875, "loss_aux_layer_10": 0.0576171875, "loss_aux_layer_11": 0.06158447265625, "loss_aux_layer_12": 0.0660400390625, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.079345703125, "loss_aux_layer_15": 0.08740234375, "loss_aux_layer_16": 0.0963134765625, "loss_aux_layer_17": 0.1033935546875, "loss_aux_layer_18": 0.112060546875, "loss_aux_layer_19": 0.1151123046875, "loss_aux_layer_2": 0.0430908203125, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.05267333984375, "loss_aux_layer_4": 0.0552978515625, "loss_aux_layer_5": 0.0570068359375, "loss_aux_layer_6": 0.06005859375, "loss_aux_layer_7": 0.05816650390625, "loss_aux_layer_8": 0.0576171875, "loss_aux_layer_9": 0.05633544921875, "step": 4289, "total_loss": 0.6220062077045441 }, { "epoch": 0.8493367649970303, "grad_norm": 0.8752338290214539, "learning_rate": 5e-05, "llm_loss": 0.5564644038677216, "loss": 2.5483, "loss_aux_layer_0": 0.0113677978515625, "loss_aux_layer_1": 0.030914306640625, "loss_aux_layer_10": 0.0576171875, "loss_aux_layer_11": 0.06134033203125, "loss_aux_layer_12": 0.0655517578125, "loss_aux_layer_13": 0.071044921875, "loss_aux_layer_14": 0.079833984375, "loss_aux_layer_15": 0.0885009765625, "loss_aux_layer_16": 0.09765625, "loss_aux_layer_17": 0.105712890625, "loss_aux_layer_18": 0.11376953125, "loss_aux_layer_19": 0.116943359375, "loss_aux_layer_2": 0.0426025390625, "loss_aux_layer_20": 0.1248779296875, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.052001953125, "loss_aux_layer_4": 0.05450439453125, "loss_aux_layer_5": 0.05633544921875, "loss_aux_layer_6": 0.0594482421875, "loss_aux_layer_7": 0.05767822265625, "loss_aux_layer_8": 0.05712890625, "loss_aux_layer_9": 0.05609130859375, "step": 4290, "total_loss": 0.6370652467012405 }, { "epoch": 0.8495347455949317, "grad_norm": 0.8724404573440552, "learning_rate": 5e-05, "llm_loss": 0.5736724063754082, "loss": 2.6037, "loss_aux_layer_0": 0.0106964111328125, "loss_aux_layer_1": 0.028106689453125, "loss_aux_layer_10": 0.05389404296875, "loss_aux_layer_11": 0.05755615234375, "loss_aux_layer_12": 0.06195068359375, "loss_aux_layer_13": 0.06744384765625, "loss_aux_layer_14": 0.075927734375, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.0931396484375, "loss_aux_layer_17": 0.1014404296875, "loss_aux_layer_18": 0.1097412109375, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.0391845703125, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.04864501953125, "loss_aux_layer_4": 0.051025390625, "loss_aux_layer_5": 0.052490234375, "loss_aux_layer_6": 0.055419921875, "loss_aux_layer_7": 0.0537109375, "loss_aux_layer_8": 0.053466796875, "loss_aux_layer_9": 0.0526123046875, "step": 4291, "total_loss": 0.6509324163198471 }, { "epoch": 0.8497327261928331, "grad_norm": 0.8784801363945007, "learning_rate": 5e-05, "llm_loss": 0.5397523492574692, "loss": 2.4726, "loss_aux_layer_0": 0.0110931396484375, "loss_aux_layer_1": 0.029510498046875, "loss_aux_layer_10": 0.0552978515625, "loss_aux_layer_11": 0.0592041015625, "loss_aux_layer_12": 0.06402587890625, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.07763671875, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.0948486328125, "loss_aux_layer_17": 0.102294921875, "loss_aux_layer_18": 0.1103515625, "loss_aux_layer_19": 0.1136474609375, "loss_aux_layer_2": 0.041015625, "loss_aux_layer_20": 0.12158203125, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.050048828125, "loss_aux_layer_4": 0.0526123046875, "loss_aux_layer_5": 0.0540771484375, "loss_aux_layer_6": 0.056884765625, "loss_aux_layer_7": 0.0552978515625, "loss_aux_layer_8": 0.0548095703125, "loss_aux_layer_9": 0.05389404296875, "step": 4292, "total_loss": 0.6181385070085526 }, { "epoch": 0.8499307067907345, "grad_norm": 1.0369223356246948, "learning_rate": 5e-05, "llm_loss": 0.4561877176165581, "loss": 2.1417, "loss_aux_layer_0": 0.011260986328125, "loss_aux_layer_1": 0.03009033203125, "loss_aux_layer_10": 0.05670166015625, "loss_aux_layer_11": 0.0606689453125, "loss_aux_layer_12": 0.06463623046875, "loss_aux_layer_13": 0.07012939453125, "loss_aux_layer_14": 0.0780029296875, "loss_aux_layer_15": 0.0859375, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.1025390625, "loss_aux_layer_18": 0.11083984375, "loss_aux_layer_19": 0.114013671875, "loss_aux_layer_2": 0.04180908203125, "loss_aux_layer_20": 0.121826171875, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.0513916015625, "loss_aux_layer_4": 0.05352783203125, "loss_aux_layer_5": 0.05523681640625, "loss_aux_layer_6": 0.05810546875, "loss_aux_layer_7": 0.056640625, "loss_aux_layer_8": 0.05633544921875, "loss_aux_layer_9": 0.05523681640625, "step": 4293, "total_loss": 0.5354256629943848 }, { "epoch": 0.8501286873886359, "grad_norm": 0.9486654996871948, "learning_rate": 5e-05, "llm_loss": 0.5228237807750702, "loss": 2.3967, "loss_aux_layer_0": 0.013519287109375, "loss_aux_layer_1": 0.029571533203125, "loss_aux_layer_10": 0.053466796875, "loss_aux_layer_11": 0.05706787109375, "loss_aux_layer_12": 0.06134033203125, "loss_aux_layer_13": 0.06622314453125, "loss_aux_layer_14": 0.0743408203125, "loss_aux_layer_15": 0.082275390625, "loss_aux_layer_16": 0.091064453125, "loss_aux_layer_17": 0.0985107421875, "loss_aux_layer_18": 0.1070556640625, "loss_aux_layer_19": 0.1109619140625, "loss_aux_layer_2": 0.03997802734375, "loss_aux_layer_20": 0.118896484375, "loss_aux_layer_21": 0.1273193359375, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.049072265625, "loss_aux_layer_4": 0.05133056640625, "loss_aux_layer_5": 0.0528564453125, "loss_aux_layer_6": 0.05560302734375, "loss_aux_layer_7": 0.053955078125, "loss_aux_layer_8": 0.053466796875, "loss_aux_layer_9": 0.0523681640625, "step": 4294, "total_loss": 0.5991823971271515 }, { "epoch": 0.8503266679865373, "grad_norm": 1.0853171348571777, "learning_rate": 5e-05, "llm_loss": 0.49410539865493774, "loss": 2.2776, "loss_aux_layer_0": 0.0109405517578125, "loss_aux_layer_1": 0.028289794921875, "loss_aux_layer_10": 0.05279541015625, "loss_aux_layer_11": 0.05621337890625, "loss_aux_layer_12": 0.0601806640625, "loss_aux_layer_13": 0.0653076171875, "loss_aux_layer_14": 0.0733642578125, "loss_aux_layer_15": 0.0811767578125, "loss_aux_layer_16": 0.090087890625, "loss_aux_layer_17": 0.097900390625, "loss_aux_layer_18": 0.1063232421875, "loss_aux_layer_19": 0.1103515625, "loss_aux_layer_2": 0.0391845703125, "loss_aux_layer_20": 0.118408203125, "loss_aux_layer_21": 0.1259765625, "loss_aux_layer_22": 0.14599609375, "loss_aux_layer_23": 0.18212890625, "loss_aux_layer_3": 0.04815673828125, "loss_aux_layer_4": 0.05035400390625, "loss_aux_layer_5": 0.0521240234375, "loss_aux_layer_6": 0.0548095703125, "loss_aux_layer_7": 0.05322265625, "loss_aux_layer_8": 0.05267333984375, "loss_aux_layer_9": 0.0516357421875, "step": 4295, "total_loss": 0.5694074556231499 }, { "epoch": 0.8505246485844388, "grad_norm": 0.9971420764923096, "learning_rate": 5e-05, "llm_loss": 0.6000383198261261, "loss": 2.7204, "loss_aux_layer_0": 0.0133209228515625, "loss_aux_layer_1": 0.03057861328125, "loss_aux_layer_10": 0.05645751953125, "loss_aux_layer_11": 0.060546875, "loss_aux_layer_12": 0.06488037109375, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.07861328125, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.0966796875, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.1124267578125, "loss_aux_layer_19": 0.1163330078125, "loss_aux_layer_2": 0.04248046875, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.051513671875, "loss_aux_layer_4": 0.0540771484375, "loss_aux_layer_5": 0.0556640625, "loss_aux_layer_6": 0.05853271484375, "loss_aux_layer_7": 0.056640625, "loss_aux_layer_8": 0.05609130859375, "loss_aux_layer_9": 0.05499267578125, "step": 4296, "total_loss": 0.6800989508628845 }, { "epoch": 0.8507226291823401, "grad_norm": 1.1277225017547607, "learning_rate": 5e-05, "llm_loss": 0.5930581837892532, "loss": 2.691, "loss_aux_layer_0": 0.0123291015625, "loss_aux_layer_1": 0.029632568359375, "loss_aux_layer_10": 0.05572509765625, "loss_aux_layer_11": 0.059814453125, "loss_aux_layer_12": 0.0645751953125, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.0963134765625, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.113037109375, "loss_aux_layer_19": 0.1168212890625, "loss_aux_layer_2": 0.04156494140625, "loss_aux_layer_20": 0.124755859375, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.05078125, "loss_aux_layer_4": 0.05316162109375, "loss_aux_layer_5": 0.0545654296875, "loss_aux_layer_6": 0.0576171875, "loss_aux_layer_7": 0.05572509765625, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.054443359375, "step": 4297, "total_loss": 0.6727589815855026 }, { "epoch": 0.8509206097802415, "grad_norm": 0.97368985414505, "learning_rate": 5e-05, "llm_loss": 0.6127331107854843, "loss": 2.7625, "loss_aux_layer_0": 0.0116119384765625, "loss_aux_layer_1": 0.0289306640625, "loss_aux_layer_10": 0.05487060546875, "loss_aux_layer_11": 0.0584716796875, "loss_aux_layer_12": 0.06304931640625, "loss_aux_layer_13": 0.06805419921875, "loss_aux_layer_14": 0.07666015625, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.0946044921875, "loss_aux_layer_17": 0.102783203125, "loss_aux_layer_18": 0.1109619140625, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.04058837890625, "loss_aux_layer_20": 0.121826171875, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.0499267578125, "loss_aux_layer_4": 0.05206298828125, "loss_aux_layer_5": 0.0535888671875, "loss_aux_layer_6": 0.05633544921875, "loss_aux_layer_7": 0.0546875, "loss_aux_layer_8": 0.0543212890625, "loss_aux_layer_9": 0.05352783203125, "step": 4298, "total_loss": 0.6906158775091171 }, { "epoch": 0.851118590378143, "grad_norm": 0.9319688081741333, "learning_rate": 5e-05, "llm_loss": 0.5365030989050865, "loss": 2.4552, "loss_aux_layer_0": 0.0117034912109375, "loss_aux_layer_1": 0.028839111328125, "loss_aux_layer_10": 0.05401611328125, "loss_aux_layer_11": 0.05767822265625, "loss_aux_layer_12": 0.0618896484375, "loss_aux_layer_13": 0.06689453125, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.092529296875, "loss_aux_layer_17": 0.1004638671875, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.11328125, "loss_aux_layer_2": 0.04022216796875, "loss_aux_layer_20": 0.1214599609375, "loss_aux_layer_21": 0.1297607421875, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.04937744140625, "loss_aux_layer_4": 0.051513671875, "loss_aux_layer_5": 0.05291748046875, "loss_aux_layer_6": 0.0560302734375, "loss_aux_layer_7": 0.05419921875, "loss_aux_layer_8": 0.05377197265625, "loss_aux_layer_9": 0.05291748046875, "step": 4299, "total_loss": 0.6138070523738861 }, { "epoch": 0.8513165709760443, "grad_norm": 0.917037844657898, "learning_rate": 5e-05, "llm_loss": 0.4439569488167763, "loss": 2.101, "loss_aux_layer_0": 0.0109405517578125, "loss_aux_layer_1": 0.03021240234375, "loss_aux_layer_10": 0.05810546875, "loss_aux_layer_11": 0.0621337890625, "loss_aux_layer_12": 0.066650390625, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.089111328125, "loss_aux_layer_16": 0.0982666015625, "loss_aux_layer_17": 0.10595703125, "loss_aux_layer_18": 0.1143798828125, "loss_aux_layer_19": 0.1173095703125, "loss_aux_layer_2": 0.04327392578125, "loss_aux_layer_20": 0.124267578125, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.0531005859375, "loss_aux_layer_4": 0.0556640625, "loss_aux_layer_5": 0.05718994140625, "loss_aux_layer_6": 0.0604248046875, "loss_aux_layer_7": 0.05859375, "loss_aux_layer_8": 0.05810546875, "loss_aux_layer_9": 0.0570068359375, "step": 4300, "total_loss": 0.5252383127808571 }, { "epoch": 0.8515145515739457, "grad_norm": 1.2166764736175537, "learning_rate": 5e-05, "llm_loss": 0.5364228412508965, "loss": 2.4592, "loss_aux_layer_0": 0.013580322265625, "loss_aux_layer_1": 0.029388427734375, "loss_aux_layer_10": 0.0545654296875, "loss_aux_layer_11": 0.05865478515625, "loss_aux_layer_12": 0.06280517578125, "loss_aux_layer_13": 0.0684814453125, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.110595703125, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.04119873046875, "loss_aux_layer_20": 0.121826171875, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.05035400390625, "loss_aux_layer_4": 0.05267333984375, "loss_aux_layer_5": 0.0540771484375, "loss_aux_layer_6": 0.056884765625, "loss_aux_layer_7": 0.05523681640625, "loss_aux_layer_8": 0.0545654296875, "loss_aux_layer_9": 0.0533447265625, "step": 4301, "total_loss": 0.6147961169481277 }, { "epoch": 0.8517125321718472, "grad_norm": 0.8101231455802917, "learning_rate": 5e-05, "llm_loss": 0.5481713861227036, "loss": 2.5032, "loss_aux_layer_0": 0.010528564453125, "loss_aux_layer_1": 0.02899169921875, "loss_aux_layer_10": 0.05364990234375, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.06170654296875, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.0760498046875, "loss_aux_layer_15": 0.08447265625, "loss_aux_layer_16": 0.093994140625, "loss_aux_layer_17": 0.102294921875, "loss_aux_layer_18": 0.110595703125, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.04083251953125, "loss_aux_layer_20": 0.122314453125, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.049560546875, "loss_aux_layer_4": 0.0517578125, "loss_aux_layer_5": 0.0533447265625, "loss_aux_layer_6": 0.0562744140625, "loss_aux_layer_7": 0.05419921875, "loss_aux_layer_8": 0.053466796875, "loss_aux_layer_9": 0.052490234375, "step": 4302, "total_loss": 0.6257991194725037 }, { "epoch": 0.8519105127697486, "grad_norm": 1.0048103332519531, "learning_rate": 5e-05, "llm_loss": 0.583335243165493, "loss": 2.6404, "loss_aux_layer_0": 0.01287841796875, "loss_aux_layer_1": 0.03106689453125, "loss_aux_layer_10": 0.05596923828125, "loss_aux_layer_11": 0.05950927734375, "loss_aux_layer_12": 0.06353759765625, "loss_aux_layer_13": 0.0682373046875, "loss_aux_layer_14": 0.07568359375, "loss_aux_layer_15": 0.0831298828125, "loss_aux_layer_16": 0.0914306640625, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.1065673828125, "loss_aux_layer_19": 0.1090087890625, "loss_aux_layer_2": 0.04241943359375, "loss_aux_layer_20": 0.1162109375, "loss_aux_layer_21": 0.123046875, "loss_aux_layer_22": 0.142333984375, "loss_aux_layer_23": 0.176025390625, "loss_aux_layer_3": 0.05169677734375, "loss_aux_layer_4": 0.0540771484375, "loss_aux_layer_5": 0.055419921875, "loss_aux_layer_6": 0.05828857421875, "loss_aux_layer_7": 0.05633544921875, "loss_aux_layer_8": 0.055908203125, "loss_aux_layer_9": 0.05462646484375, "step": 4303, "total_loss": 0.6601066738367081 }, { "epoch": 0.8521084933676499, "grad_norm": 0.8476354479789734, "learning_rate": 5e-05, "llm_loss": 0.5149941891431808, "loss": 2.3696, "loss_aux_layer_0": 0.0112457275390625, "loss_aux_layer_1": 0.02880859375, "loss_aux_layer_10": 0.05413818359375, "loss_aux_layer_11": 0.05816650390625, "loss_aux_layer_12": 0.06219482421875, "loss_aux_layer_13": 0.0673828125, "loss_aux_layer_14": 0.0755615234375, "loss_aux_layer_15": 0.083984375, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.1009521484375, "loss_aux_layer_18": 0.1092529296875, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.04071044921875, "loss_aux_layer_20": 0.120849609375, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.04998779296875, "loss_aux_layer_4": 0.05230712890625, "loss_aux_layer_5": 0.05377197265625, "loss_aux_layer_6": 0.05645751953125, "loss_aux_layer_7": 0.054443359375, "loss_aux_layer_8": 0.05377197265625, "loss_aux_layer_9": 0.052734375, "step": 4304, "total_loss": 0.5924082100391388 }, { "epoch": 0.8523064739655514, "grad_norm": 0.9635205268859863, "learning_rate": 5e-05, "llm_loss": 0.5697350427508354, "loss": 2.5928, "loss_aux_layer_0": 0.01226806640625, "loss_aux_layer_1": 0.03021240234375, "loss_aux_layer_10": 0.05609130859375, "loss_aux_layer_11": 0.0599365234375, "loss_aux_layer_12": 0.0643310546875, "loss_aux_layer_13": 0.0697021484375, "loss_aux_layer_14": 0.0775146484375, "loss_aux_layer_15": 0.0849609375, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.100830078125, "loss_aux_layer_18": 0.10888671875, "loss_aux_layer_19": 0.1121826171875, "loss_aux_layer_2": 0.04241943359375, "loss_aux_layer_20": 0.1201171875, "loss_aux_layer_21": 0.128173828125, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.0513916015625, "loss_aux_layer_4": 0.05401611328125, "loss_aux_layer_5": 0.05548095703125, "loss_aux_layer_6": 0.058349609375, "loss_aux_layer_7": 0.05645751953125, "loss_aux_layer_8": 0.05596923828125, "loss_aux_layer_9": 0.05487060546875, "step": 4305, "total_loss": 0.6481983661651611 }, { "epoch": 0.8525044545634528, "grad_norm": 0.9100278615951538, "learning_rate": 5e-05, "llm_loss": 0.566161185503006, "loss": 2.5794, "loss_aux_layer_0": 0.0111541748046875, "loss_aux_layer_1": 0.0301513671875, "loss_aux_layer_10": 0.05584716796875, "loss_aux_layer_11": 0.05963134765625, "loss_aux_layer_12": 0.063720703125, "loss_aux_layer_13": 0.06866455078125, "loss_aux_layer_14": 0.076416015625, "loss_aux_layer_15": 0.084228515625, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.1009521484375, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.1136474609375, "loss_aux_layer_2": 0.04150390625, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.05078125, "loss_aux_layer_4": 0.0531005859375, "loss_aux_layer_5": 0.0546875, "loss_aux_layer_6": 0.0574951171875, "loss_aux_layer_7": 0.05596923828125, "loss_aux_layer_8": 0.05560302734375, "loss_aux_layer_9": 0.054443359375, "step": 4306, "total_loss": 0.6448430716991425 }, { "epoch": 0.8527024351613541, "grad_norm": 0.9855244755744934, "learning_rate": 5e-05, "llm_loss": 0.5538827925920486, "loss": 2.5398, "loss_aux_layer_0": 0.0117034912109375, "loss_aux_layer_1": 0.03155517578125, "loss_aux_layer_10": 0.0584716796875, "loss_aux_layer_11": 0.06231689453125, "loss_aux_layer_12": 0.066650390625, "loss_aux_layer_13": 0.072021484375, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.08837890625, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.104736328125, "loss_aux_layer_18": 0.112548828125, "loss_aux_layer_19": 0.115478515625, "loss_aux_layer_2": 0.0438232421875, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.05364990234375, "loss_aux_layer_4": 0.05584716796875, "loss_aux_layer_5": 0.05755615234375, "loss_aux_layer_6": 0.0604248046875, "loss_aux_layer_7": 0.05865478515625, "loss_aux_layer_8": 0.05828857421875, "loss_aux_layer_9": 0.0572509765625, "step": 4307, "total_loss": 0.6349406540393829 }, { "epoch": 0.8529004157592556, "grad_norm": 0.9887599349021912, "learning_rate": 5e-05, "llm_loss": 0.5247960388660431, "loss": 2.4127, "loss_aux_layer_0": 0.011932373046875, "loss_aux_layer_1": 0.030303955078125, "loss_aux_layer_10": 0.056640625, "loss_aux_layer_11": 0.0604248046875, "loss_aux_layer_12": 0.064697265625, "loss_aux_layer_13": 0.0697021484375, "loss_aux_layer_14": 0.07763671875, "loss_aux_layer_15": 0.0853271484375, "loss_aux_layer_16": 0.09423828125, "loss_aux_layer_17": 0.1016845703125, "loss_aux_layer_18": 0.1092529296875, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.04248046875, "loss_aux_layer_20": 0.119140625, "loss_aux_layer_21": 0.12646484375, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.181396484375, "loss_aux_layer_3": 0.0518798828125, "loss_aux_layer_4": 0.05438232421875, "loss_aux_layer_5": 0.05609130859375, "loss_aux_layer_6": 0.0589599609375, "loss_aux_layer_7": 0.0572509765625, "loss_aux_layer_8": 0.0567626953125, "loss_aux_layer_9": 0.05548095703125, "step": 4308, "total_loss": 0.6031681895256042 }, { "epoch": 0.853098396357157, "grad_norm": 1.007874608039856, "learning_rate": 5e-05, "llm_loss": 0.5796738713979721, "loss": 2.6414, "loss_aux_layer_0": 0.011260986328125, "loss_aux_layer_1": 0.0306396484375, "loss_aux_layer_10": 0.05810546875, "loss_aux_layer_11": 0.06201171875, "loss_aux_layer_12": 0.06634521484375, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.0799560546875, "loss_aux_layer_15": 0.087890625, "loss_aux_layer_16": 0.0965576171875, "loss_aux_layer_17": 0.1044921875, "loss_aux_layer_18": 0.112548828125, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.04254150390625, "loss_aux_layer_20": 0.1239013671875, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.05242919921875, "loss_aux_layer_4": 0.05517578125, "loss_aux_layer_5": 0.056884765625, "loss_aux_layer_6": 0.05975341796875, "loss_aux_layer_7": 0.05804443359375, "loss_aux_layer_8": 0.057861328125, "loss_aux_layer_9": 0.05682373046875, "step": 4309, "total_loss": 0.6603557765483856 }, { "epoch": 0.8532963769550584, "grad_norm": 0.9146025776863098, "learning_rate": 5e-05, "llm_loss": 0.543682724237442, "loss": 2.4945, "loss_aux_layer_0": 0.0119476318359375, "loss_aux_layer_1": 0.029876708984375, "loss_aux_layer_10": 0.0560302734375, "loss_aux_layer_11": 0.06005859375, "loss_aux_layer_12": 0.064453125, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.0865478515625, "loss_aux_layer_16": 0.0960693359375, "loss_aux_layer_17": 0.1036376953125, "loss_aux_layer_18": 0.1119384765625, "loss_aux_layer_19": 0.1160888671875, "loss_aux_layer_2": 0.04217529296875, "loss_aux_layer_20": 0.124267578125, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.05157470703125, "loss_aux_layer_4": 0.05389404296875, "loss_aux_layer_5": 0.05535888671875, "loss_aux_layer_6": 0.0582275390625, "loss_aux_layer_7": 0.05633544921875, "loss_aux_layer_8": 0.05596923828125, "loss_aux_layer_9": 0.0546875, "step": 4310, "total_loss": 0.623622715473175 }, { "epoch": 0.8534943575529598, "grad_norm": 0.7778980731964111, "learning_rate": 5e-05, "llm_loss": 0.5284841507673264, "loss": 2.4308, "loss_aux_layer_0": 0.0113983154296875, "loss_aux_layer_1": 0.02984619140625, "loss_aux_layer_10": 0.05584716796875, "loss_aux_layer_11": 0.059814453125, "loss_aux_layer_12": 0.0640869140625, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.07763671875, "loss_aux_layer_15": 0.086181640625, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.102783203125, "loss_aux_layer_18": 0.111083984375, "loss_aux_layer_19": 0.114990234375, "loss_aux_layer_2": 0.0418701171875, "loss_aux_layer_20": 0.122802734375, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.05145263671875, "loss_aux_layer_4": 0.0537109375, "loss_aux_layer_5": 0.05499267578125, "loss_aux_layer_6": 0.05792236328125, "loss_aux_layer_7": 0.05621337890625, "loss_aux_layer_8": 0.05572509765625, "loss_aux_layer_9": 0.05462646484375, "step": 4311, "total_loss": 0.6077098399400711 }, { "epoch": 0.8536923381508612, "grad_norm": 0.87925124168396, "learning_rate": 5e-05, "llm_loss": 0.44314367324113846, "loss": 2.0944, "loss_aux_layer_0": 0.011962890625, "loss_aux_layer_1": 0.03076171875, "loss_aux_layer_10": 0.05682373046875, "loss_aux_layer_11": 0.0609130859375, "loss_aux_layer_12": 0.0654296875, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.0791015625, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.096435546875, "loss_aux_layer_17": 0.1043701171875, "loss_aux_layer_18": 0.112548828125, "loss_aux_layer_19": 0.115966796875, "loss_aux_layer_2": 0.04278564453125, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.05267333984375, "loss_aux_layer_4": 0.05523681640625, "loss_aux_layer_5": 0.056640625, "loss_aux_layer_6": 0.05950927734375, "loss_aux_layer_7": 0.05743408203125, "loss_aux_layer_8": 0.0567626953125, "loss_aux_layer_9": 0.05560302734375, "step": 4312, "total_loss": 0.5235942900180817 }, { "epoch": 0.8538903187487626, "grad_norm": 0.7608381509780884, "learning_rate": 5e-05, "llm_loss": 0.5387150645256042, "loss": 2.475, "loss_aux_layer_0": 0.0115814208984375, "loss_aux_layer_1": 0.03094482421875, "loss_aux_layer_10": 0.05743408203125, "loss_aux_layer_11": 0.0615234375, "loss_aux_layer_12": 0.0657958984375, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.0794677734375, "loss_aux_layer_15": 0.087646484375, "loss_aux_layer_16": 0.096435546875, "loss_aux_layer_17": 0.103759765625, "loss_aux_layer_18": 0.11181640625, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.0426025390625, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.05255126953125, "loss_aux_layer_4": 0.05523681640625, "loss_aux_layer_5": 0.056884765625, "loss_aux_layer_6": 0.0599365234375, "loss_aux_layer_7": 0.0582275390625, "loss_aux_layer_8": 0.0574951171875, "loss_aux_layer_9": 0.05621337890625, "step": 4313, "total_loss": 0.618754968047142 }, { "epoch": 0.8540882993466641, "grad_norm": 1.051876187324524, "learning_rate": 5e-05, "llm_loss": 0.5934470444917679, "loss": 2.6844, "loss_aux_layer_0": 0.011383056640625, "loss_aux_layer_1": 0.028961181640625, "loss_aux_layer_10": 0.05426025390625, "loss_aux_layer_11": 0.0582275390625, "loss_aux_layer_12": 0.0625, "loss_aux_layer_13": 0.06787109375, "loss_aux_layer_14": 0.075927734375, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.0927734375, "loss_aux_layer_17": 0.100830078125, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.0406494140625, "loss_aux_layer_20": 0.12109375, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.04986572265625, "loss_aux_layer_4": 0.0521240234375, "loss_aux_layer_5": 0.05352783203125, "loss_aux_layer_6": 0.0562744140625, "loss_aux_layer_7": 0.0545654296875, "loss_aux_layer_8": 0.05413818359375, "loss_aux_layer_9": 0.05316162109375, "step": 4314, "total_loss": 0.6711038947105408 }, { "epoch": 0.8542862799445654, "grad_norm": 0.7958506941795349, "learning_rate": 5e-05, "llm_loss": 0.6050187349319458, "loss": 2.7311, "loss_aux_layer_0": 0.011199951171875, "loss_aux_layer_1": 0.028472900390625, "loss_aux_layer_10": 0.0543212890625, "loss_aux_layer_11": 0.0582275390625, "loss_aux_layer_12": 0.0623779296875, "loss_aux_layer_13": 0.067626953125, "loss_aux_layer_14": 0.075927734375, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.09375, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.110595703125, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.0396728515625, "loss_aux_layer_20": 0.1229248046875, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.0491943359375, "loss_aux_layer_4": 0.0516357421875, "loss_aux_layer_5": 0.05322265625, "loss_aux_layer_6": 0.05633544921875, "loss_aux_layer_7": 0.0545654296875, "loss_aux_layer_8": 0.05401611328125, "loss_aux_layer_9": 0.052978515625, "step": 4315, "total_loss": 0.6827830672264099 }, { "epoch": 0.8544842605424668, "grad_norm": 0.8529011011123657, "learning_rate": 5e-05, "llm_loss": 0.5704935491085052, "loss": 2.5916, "loss_aux_layer_0": 0.0110321044921875, "loss_aux_layer_1": 0.028106689453125, "loss_aux_layer_10": 0.0537109375, "loss_aux_layer_11": 0.05731201171875, "loss_aux_layer_12": 0.0618896484375, "loss_aux_layer_13": 0.06732177734375, "loss_aux_layer_14": 0.07568359375, "loss_aux_layer_15": 0.0843505859375, "loss_aux_layer_16": 0.09375, "loss_aux_layer_17": 0.1014404296875, "loss_aux_layer_18": 0.1103515625, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.03997802734375, "loss_aux_layer_20": 0.1217041015625, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.04888916015625, "loss_aux_layer_4": 0.051025390625, "loss_aux_layer_5": 0.052490234375, "loss_aux_layer_6": 0.05548095703125, "loss_aux_layer_7": 0.0540771484375, "loss_aux_layer_8": 0.05352783203125, "loss_aux_layer_9": 0.052490234375, "step": 4316, "total_loss": 0.6478980332612991 }, { "epoch": 0.8546822411403683, "grad_norm": 1.0150887966156006, "learning_rate": 5e-05, "llm_loss": 0.6173234730958939, "loss": 2.7911, "loss_aux_layer_0": 0.0107421875, "loss_aux_layer_1": 0.03033447265625, "loss_aux_layer_10": 0.05743408203125, "loss_aux_layer_11": 0.0611572265625, "loss_aux_layer_12": 0.06585693359375, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.0797119140625, "loss_aux_layer_15": 0.0882568359375, "loss_aux_layer_16": 0.097412109375, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.1134033203125, "loss_aux_layer_19": 0.1168212890625, "loss_aux_layer_2": 0.04290771484375, "loss_aux_layer_20": 0.124755859375, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.052734375, "loss_aux_layer_4": 0.05517578125, "loss_aux_layer_5": 0.05657958984375, "loss_aux_layer_6": 0.059814453125, "loss_aux_layer_7": 0.0579833984375, "loss_aux_layer_8": 0.05743408203125, "loss_aux_layer_9": 0.05615234375, "step": 4317, "total_loss": 0.6977872848510742 }, { "epoch": 0.8548802217382696, "grad_norm": 0.7642679810523987, "learning_rate": 5e-05, "llm_loss": 0.5799598321318626, "loss": 2.6468, "loss_aux_layer_0": 0.0105133056640625, "loss_aux_layer_1": 0.031158447265625, "loss_aux_layer_10": 0.0587158203125, "loss_aux_layer_11": 0.0631103515625, "loss_aux_layer_12": 0.0675048828125, "loss_aux_layer_13": 0.07275390625, "loss_aux_layer_14": 0.0814208984375, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.0992431640625, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.1146240234375, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.0438232421875, "loss_aux_layer_20": 0.1251220703125, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.0535888671875, "loss_aux_layer_4": 0.05621337890625, "loss_aux_layer_5": 0.05792236328125, "loss_aux_layer_6": 0.06085205078125, "loss_aux_layer_7": 0.05902099609375, "loss_aux_layer_8": 0.0584716796875, "loss_aux_layer_9": 0.0574951171875, "step": 4318, "total_loss": 0.6617075949907303 }, { "epoch": 0.855078202336171, "grad_norm": 0.9695137739181519, "learning_rate": 5e-05, "llm_loss": 0.5258143544197083, "loss": 2.4323, "loss_aux_layer_0": 0.0103912353515625, "loss_aux_layer_1": 0.0323486328125, "loss_aux_layer_10": 0.06048583984375, "loss_aux_layer_11": 0.06463623046875, "loss_aux_layer_12": 0.0692138671875, "loss_aux_layer_13": 0.074462890625, "loss_aux_layer_14": 0.0823974609375, "loss_aux_layer_15": 0.090087890625, "loss_aux_layer_16": 0.098876953125, "loss_aux_layer_17": 0.1060791015625, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.1162109375, "loss_aux_layer_2": 0.04510498046875, "loss_aux_layer_20": 0.1234130859375, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.0557861328125, "loss_aux_layer_4": 0.058349609375, "loss_aux_layer_5": 0.0599365234375, "loss_aux_layer_6": 0.06298828125, "loss_aux_layer_7": 0.06121826171875, "loss_aux_layer_8": 0.06036376953125, "loss_aux_layer_9": 0.0589599609375, "step": 4319, "total_loss": 0.6080847233533859 }, { "epoch": 0.8552761829340725, "grad_norm": 0.7758336663246155, "learning_rate": 5e-05, "llm_loss": 0.5561231821775436, "loss": 2.5378, "loss_aux_layer_0": 0.0113372802734375, "loss_aux_layer_1": 0.02996826171875, "loss_aux_layer_10": 0.05572509765625, "loss_aux_layer_11": 0.0594482421875, "loss_aux_layer_12": 0.0634765625, "loss_aux_layer_13": 0.06884765625, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.0850830078125, "loss_aux_layer_16": 0.093994140625, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.10888671875, "loss_aux_layer_19": 0.1124267578125, "loss_aux_layer_2": 0.04241943359375, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.05194091796875, "loss_aux_layer_4": 0.05419921875, "loss_aux_layer_5": 0.05548095703125, "loss_aux_layer_6": 0.05816650390625, "loss_aux_layer_7": 0.05621337890625, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.054443359375, "step": 4320, "total_loss": 0.6344428360462189 }, { "epoch": 0.8554741635319739, "grad_norm": 0.903580367565155, "learning_rate": 5e-05, "llm_loss": 0.6224056035280228, "loss": 2.793, "loss_aux_layer_0": 0.0109100341796875, "loss_aux_layer_1": 0.028472900390625, "loss_aux_layer_10": 0.05303955078125, "loss_aux_layer_11": 0.0567626953125, "loss_aux_layer_12": 0.06085205078125, "loss_aux_layer_13": 0.06622314453125, "loss_aux_layer_14": 0.074462890625, "loss_aux_layer_15": 0.08251953125, "loss_aux_layer_16": 0.09130859375, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.107421875, "loss_aux_layer_19": 0.111083984375, "loss_aux_layer_2": 0.03961181640625, "loss_aux_layer_20": 0.1190185546875, "loss_aux_layer_21": 0.1265869140625, "loss_aux_layer_22": 0.1455078125, "loss_aux_layer_23": 0.18115234375, "loss_aux_layer_3": 0.04876708984375, "loss_aux_layer_4": 0.051025390625, "loss_aux_layer_5": 0.05242919921875, "loss_aux_layer_6": 0.0552978515625, "loss_aux_layer_7": 0.0535888671875, "loss_aux_layer_8": 0.0531005859375, "loss_aux_layer_9": 0.052001953125, "step": 4321, "total_loss": 0.69825778901577 }, { "epoch": 0.8556721441298752, "grad_norm": 0.947437047958374, "learning_rate": 5e-05, "llm_loss": 0.5550461187958717, "loss": 2.5333, "loss_aux_layer_0": 0.0109100341796875, "loss_aux_layer_1": 0.029998779296875, "loss_aux_layer_10": 0.05682373046875, "loss_aux_layer_11": 0.06036376953125, "loss_aux_layer_12": 0.064453125, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.077392578125, "loss_aux_layer_15": 0.0853271484375, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.1005859375, "loss_aux_layer_18": 0.1085205078125, "loss_aux_layer_19": 0.1112060546875, "loss_aux_layer_2": 0.0421142578125, "loss_aux_layer_20": 0.119140625, "loss_aux_layer_21": 0.127197265625, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.182373046875, "loss_aux_layer_3": 0.05181884765625, "loss_aux_layer_4": 0.05438232421875, "loss_aux_layer_5": 0.0560302734375, "loss_aux_layer_6": 0.0592041015625, "loss_aux_layer_7": 0.0574951171875, "loss_aux_layer_8": 0.05712890625, "loss_aux_layer_9": 0.05584716796875, "step": 4322, "total_loss": 0.6333152204751968 }, { "epoch": 0.8558701247277767, "grad_norm": 0.8362036347389221, "learning_rate": 5e-05, "llm_loss": 0.5180569365620613, "loss": 2.3831, "loss_aux_layer_0": 0.011993408203125, "loss_aux_layer_1": 0.029876708984375, "loss_aux_layer_10": 0.05413818359375, "loss_aux_layer_11": 0.05792236328125, "loss_aux_layer_12": 0.062255859375, "loss_aux_layer_13": 0.0672607421875, "loss_aux_layer_14": 0.0755615234375, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.0927734375, "loss_aux_layer_17": 0.100341796875, "loss_aux_layer_18": 0.1090087890625, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.04156494140625, "loss_aux_layer_20": 0.1209716796875, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.05047607421875, "loss_aux_layer_4": 0.0526123046875, "loss_aux_layer_5": 0.0540771484375, "loss_aux_layer_6": 0.05682373046875, "loss_aux_layer_7": 0.05487060546875, "loss_aux_layer_8": 0.05426025390625, "loss_aux_layer_9": 0.05303955078125, "step": 4323, "total_loss": 0.5957784652709961 }, { "epoch": 0.8560681053256781, "grad_norm": 0.95210200548172, "learning_rate": 5e-05, "llm_loss": 0.6151247918605804, "loss": 2.7792, "loss_aux_layer_0": 0.0120086669921875, "loss_aux_layer_1": 0.029815673828125, "loss_aux_layer_10": 0.05621337890625, "loss_aux_layer_11": 0.0601806640625, "loss_aux_layer_12": 0.06463623046875, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.078857421875, "loss_aux_layer_15": 0.087890625, "loss_aux_layer_16": 0.097412109375, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.11328125, "loss_aux_layer_19": 0.1163330078125, "loss_aux_layer_2": 0.04150390625, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.0511474609375, "loss_aux_layer_4": 0.05377197265625, "loss_aux_layer_5": 0.05523681640625, "loss_aux_layer_6": 0.05841064453125, "loss_aux_layer_7": 0.05657958984375, "loss_aux_layer_8": 0.05584716796875, "loss_aux_layer_9": 0.05474853515625, "step": 4324, "total_loss": 0.6948086470365524 }, { "epoch": 0.8562660859235794, "grad_norm": 0.9273642301559448, "learning_rate": 5e-05, "llm_loss": 0.5683945417404175, "loss": 2.6001, "loss_aux_layer_0": 0.0119476318359375, "loss_aux_layer_1": 0.03167724609375, "loss_aux_layer_10": 0.05926513671875, "loss_aux_layer_11": 0.0634765625, "loss_aux_layer_12": 0.0679931640625, "loss_aux_layer_13": 0.0736083984375, "loss_aux_layer_14": 0.0816650390625, "loss_aux_layer_15": 0.0894775390625, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.1053466796875, "loss_aux_layer_18": 0.11279296875, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.0438232421875, "loss_aux_layer_20": 0.1234130859375, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.0540771484375, "loss_aux_layer_4": 0.056884765625, "loss_aux_layer_5": 0.0582275390625, "loss_aux_layer_6": 0.06121826171875, "loss_aux_layer_7": 0.05987548828125, "loss_aux_layer_8": 0.0592041015625, "loss_aux_layer_9": 0.0579833984375, "step": 4325, "total_loss": 0.6500307023525238 }, { "epoch": 0.8564640665214809, "grad_norm": 0.9592467546463013, "learning_rate": 5e-05, "llm_loss": 0.5492230802774429, "loss": 2.5162, "loss_aux_layer_0": 0.0109100341796875, "loss_aux_layer_1": 0.030914306640625, "loss_aux_layer_10": 0.0577392578125, "loss_aux_layer_11": 0.06170654296875, "loss_aux_layer_12": 0.06591796875, "loss_aux_layer_13": 0.0712890625, "loss_aux_layer_14": 0.079345703125, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.0958251953125, "loss_aux_layer_17": 0.10302734375, "loss_aux_layer_18": 0.1109619140625, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.04296875, "loss_aux_layer_20": 0.121337890625, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.052978515625, "loss_aux_layer_4": 0.0556640625, "loss_aux_layer_5": 0.05731201171875, "loss_aux_layer_6": 0.06024169921875, "loss_aux_layer_7": 0.05841064453125, "loss_aux_layer_8": 0.05767822265625, "loss_aux_layer_9": 0.05657958984375, "step": 4326, "total_loss": 0.6290442794561386 }, { "epoch": 0.8566620471193823, "grad_norm": 0.8461751341819763, "learning_rate": 5e-05, "llm_loss": 0.5953565835952759, "loss": 2.682, "loss_aux_layer_0": 0.01092529296875, "loss_aux_layer_1": 0.027679443359375, "loss_aux_layer_10": 0.0526123046875, "loss_aux_layer_11": 0.0562744140625, "loss_aux_layer_12": 0.06048583984375, "loss_aux_layer_13": 0.0654296875, "loss_aux_layer_14": 0.0731201171875, "loss_aux_layer_15": 0.0811767578125, "loss_aux_layer_16": 0.0904541015625, "loss_aux_layer_17": 0.097900390625, "loss_aux_layer_18": 0.105712890625, "loss_aux_layer_19": 0.1094970703125, "loss_aux_layer_2": 0.03851318359375, "loss_aux_layer_20": 0.1177978515625, "loss_aux_layer_21": 0.12646484375, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.18115234375, "loss_aux_layer_3": 0.0478515625, "loss_aux_layer_4": 0.05059814453125, "loss_aux_layer_5": 0.05206298828125, "loss_aux_layer_6": 0.0548095703125, "loss_aux_layer_7": 0.0531005859375, "loss_aux_layer_8": 0.0526123046875, "loss_aux_layer_9": 0.051513671875, "step": 4327, "total_loss": 0.6705064326524734 }, { "epoch": 0.8568600277172838, "grad_norm": 0.8489012718200684, "learning_rate": 5e-05, "llm_loss": 0.6116004586219788, "loss": 2.7451, "loss_aux_layer_0": 0.011383056640625, "loss_aux_layer_1": 0.02740478515625, "loss_aux_layer_10": 0.05206298828125, "loss_aux_layer_11": 0.05560302734375, "loss_aux_layer_12": 0.059814453125, "loss_aux_layer_13": 0.064697265625, "loss_aux_layer_14": 0.072509765625, "loss_aux_layer_15": 0.0802001953125, "loss_aux_layer_16": 0.0889892578125, "loss_aux_layer_17": 0.0966796875, "loss_aux_layer_18": 0.1053466796875, "loss_aux_layer_19": 0.109130859375, "loss_aux_layer_2": 0.03857421875, "loss_aux_layer_20": 0.1168212890625, "loss_aux_layer_21": 0.125244140625, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.04779052734375, "loss_aux_layer_4": 0.05023193359375, "loss_aux_layer_5": 0.051513671875, "loss_aux_layer_6": 0.0538330078125, "loss_aux_layer_7": 0.0523681640625, "loss_aux_layer_8": 0.05181884765625, "loss_aux_layer_9": 0.0506591796875, "step": 4328, "total_loss": 0.6862710863351822 }, { "epoch": 0.8570580083151851, "grad_norm": 0.7863774299621582, "learning_rate": 5e-05, "llm_loss": 0.48736900836229324, "loss": 2.2705, "loss_aux_layer_0": 0.0110626220703125, "loss_aux_layer_1": 0.02911376953125, "loss_aux_layer_10": 0.05657958984375, "loss_aux_layer_11": 0.06048583984375, "loss_aux_layer_12": 0.06488037109375, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.078857421875, "loss_aux_layer_15": 0.0877685546875, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.1131591796875, "loss_aux_layer_19": 0.1173095703125, "loss_aux_layer_2": 0.04132080078125, "loss_aux_layer_20": 0.125732421875, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.05096435546875, "loss_aux_layer_4": 0.053466796875, "loss_aux_layer_5": 0.05535888671875, "loss_aux_layer_6": 0.0584716796875, "loss_aux_layer_7": 0.05682373046875, "loss_aux_layer_8": 0.05633544921875, "loss_aux_layer_9": 0.0552978515625, "step": 4329, "total_loss": 0.5676142573356628 }, { "epoch": 0.8572559889130865, "grad_norm": 0.8319767117500305, "learning_rate": 5e-05, "llm_loss": 0.5815135836601257, "loss": 2.6314, "loss_aux_layer_0": 0.0114593505859375, "loss_aux_layer_1": 0.028411865234375, "loss_aux_layer_10": 0.053466796875, "loss_aux_layer_11": 0.05712890625, "loss_aux_layer_12": 0.06121826171875, "loss_aux_layer_13": 0.06634521484375, "loss_aux_layer_14": 0.0740966796875, "loss_aux_layer_15": 0.08203125, "loss_aux_layer_16": 0.0909423828125, "loss_aux_layer_17": 0.098388671875, "loss_aux_layer_18": 0.1064453125, "loss_aux_layer_19": 0.110595703125, "loss_aux_layer_2": 0.03985595703125, "loss_aux_layer_20": 0.11865234375, "loss_aux_layer_21": 0.1273193359375, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.048828125, "loss_aux_layer_4": 0.05120849609375, "loss_aux_layer_5": 0.05255126953125, "loss_aux_layer_6": 0.05584716796875, "loss_aux_layer_7": 0.05401611328125, "loss_aux_layer_8": 0.053466796875, "loss_aux_layer_9": 0.05242919921875, "step": 4330, "total_loss": 0.6578468978404999 }, { "epoch": 0.857453969510988, "grad_norm": 0.8249842524528503, "learning_rate": 5e-05, "llm_loss": 0.48488447070121765, "loss": 2.2534, "loss_aux_layer_0": 0.010589599609375, "loss_aux_layer_1": 0.030303955078125, "loss_aux_layer_10": 0.056396484375, "loss_aux_layer_11": 0.0604248046875, "loss_aux_layer_12": 0.064453125, "loss_aux_layer_13": 0.069580078125, "loss_aux_layer_14": 0.0775146484375, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.093994140625, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.10888671875, "loss_aux_layer_19": 0.1123046875, "loss_aux_layer_2": 0.04241943359375, "loss_aux_layer_20": 0.119873046875, "loss_aux_layer_21": 0.127685546875, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.052001953125, "loss_aux_layer_4": 0.0543212890625, "loss_aux_layer_5": 0.05572509765625, "loss_aux_layer_6": 0.0587158203125, "loss_aux_layer_7": 0.05694580078125, "loss_aux_layer_8": 0.056396484375, "loss_aux_layer_9": 0.0552978515625, "step": 4331, "total_loss": 0.5633541494607925 }, { "epoch": 0.8576519501088893, "grad_norm": 0.8275566101074219, "learning_rate": 5e-05, "llm_loss": 0.4707081541419029, "loss": 2.2015, "loss_aux_layer_0": 0.01153564453125, "loss_aux_layer_1": 0.029510498046875, "loss_aux_layer_10": 0.0570068359375, "loss_aux_layer_11": 0.06085205078125, "loss_aux_layer_12": 0.0653076171875, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.1104736328125, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.04193115234375, "loss_aux_layer_20": 0.122314453125, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.05169677734375, "loss_aux_layer_4": 0.05426025390625, "loss_aux_layer_5": 0.0557861328125, "loss_aux_layer_6": 0.0584716796875, "loss_aux_layer_7": 0.05694580078125, "loss_aux_layer_8": 0.056640625, "loss_aux_layer_9": 0.05572509765625, "step": 4332, "total_loss": 0.5503647103905678 }, { "epoch": 0.8578499307067907, "grad_norm": 1.0688679218292236, "learning_rate": 5e-05, "llm_loss": 0.648120567202568, "loss": 2.9074, "loss_aux_layer_0": 0.01190185546875, "loss_aux_layer_1": 0.030426025390625, "loss_aux_layer_10": 0.05596923828125, "loss_aux_layer_11": 0.05987548828125, "loss_aux_layer_12": 0.0640869140625, "loss_aux_layer_13": 0.06964111328125, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.0948486328125, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.1104736328125, "loss_aux_layer_19": 0.113525390625, "loss_aux_layer_2": 0.0421142578125, "loss_aux_layer_20": 0.1214599609375, "loss_aux_layer_21": 0.1292724609375, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.05145263671875, "loss_aux_layer_4": 0.05377197265625, "loss_aux_layer_5": 0.05523681640625, "loss_aux_layer_6": 0.05804443359375, "loss_aux_layer_7": 0.0562744140625, "loss_aux_layer_8": 0.05572509765625, "loss_aux_layer_9": 0.0548095703125, "step": 4333, "total_loss": 0.7268440127372742 }, { "epoch": 0.8580479113046922, "grad_norm": 1.4706469774246216, "learning_rate": 5e-05, "llm_loss": 0.600092701613903, "loss": 2.7284, "loss_aux_layer_0": 0.010833740234375, "loss_aux_layer_1": 0.03179931640625, "loss_aux_layer_10": 0.05902099609375, "loss_aux_layer_11": 0.0628662109375, "loss_aux_layer_12": 0.0672607421875, "loss_aux_layer_13": 0.07275390625, "loss_aux_layer_14": 0.0810546875, "loss_aux_layer_15": 0.089599609375, "loss_aux_layer_16": 0.0986328125, "loss_aux_layer_17": 0.106689453125, "loss_aux_layer_18": 0.1142578125, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.044189453125, "loss_aux_layer_20": 0.1256103515625, "loss_aux_layer_21": 0.1337890625, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.0543212890625, "loss_aux_layer_4": 0.05682373046875, "loss_aux_layer_5": 0.058349609375, "loss_aux_layer_6": 0.061279296875, "loss_aux_layer_7": 0.0594482421875, "loss_aux_layer_8": 0.058837890625, "loss_aux_layer_9": 0.0574951171875, "step": 4334, "total_loss": 0.6821024417877197 }, { "epoch": 0.8582458919025936, "grad_norm": 2.7379391193389893, "learning_rate": 5e-05, "llm_loss": 0.5861743688583374, "loss": 2.6673, "loss_aux_layer_0": 0.014068603515625, "loss_aux_layer_1": 0.0306396484375, "loss_aux_layer_10": 0.05731201171875, "loss_aux_layer_11": 0.06146240234375, "loss_aux_layer_12": 0.0660400390625, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.0885009765625, "loss_aux_layer_16": 0.097900390625, "loss_aux_layer_17": 0.1060791015625, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.1162109375, "loss_aux_layer_2": 0.043701171875, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.18603515625, "loss_aux_layer_3": 0.05322265625, "loss_aux_layer_4": 0.05572509765625, "loss_aux_layer_5": 0.05706787109375, "loss_aux_layer_6": 0.0595703125, "loss_aux_layer_7": 0.05780029296875, "loss_aux_layer_8": 0.05718994140625, "loss_aux_layer_9": 0.0560302734375, "step": 4335, "total_loss": 0.6668131798505783 }, { "epoch": 0.8584438725004949, "grad_norm": 2.715902805328369, "learning_rate": 5e-05, "llm_loss": 0.5510399490594864, "loss": 2.5346, "loss_aux_layer_0": 0.019134521484375, "loss_aux_layer_1": 0.03289794921875, "loss_aux_layer_10": 0.05963134765625, "loss_aux_layer_11": 0.06329345703125, "loss_aux_layer_12": 0.0677490234375, "loss_aux_layer_13": 0.0732421875, "loss_aux_layer_14": 0.08154296875, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.09912109375, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.11376953125, "loss_aux_layer_19": 0.1160888671875, "loss_aux_layer_2": 0.0484619140625, "loss_aux_layer_20": 0.12353515625, "loss_aux_layer_21": 0.1312255859375, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.0574951171875, "loss_aux_layer_4": 0.05859375, "loss_aux_layer_5": 0.0595703125, "loss_aux_layer_6": 0.062255859375, "loss_aux_layer_7": 0.0604248046875, "loss_aux_layer_8": 0.05963134765625, "loss_aux_layer_9": 0.0584716796875, "step": 4336, "total_loss": 0.6336411684751511 }, { "epoch": 0.8586418530983964, "grad_norm": 2.1831939220428467, "learning_rate": 5e-05, "llm_loss": 0.5709412544965744, "loss": 2.6212, "loss_aux_layer_0": 0.0184326171875, "loss_aux_layer_1": 0.032073974609375, "loss_aux_layer_10": 0.0577392578125, "loss_aux_layer_11": 0.06170654296875, "loss_aux_layer_12": 0.066650390625, "loss_aux_layer_13": 0.0733642578125, "loss_aux_layer_14": 0.083251953125, "loss_aux_layer_15": 0.0933837890625, "loss_aux_layer_16": 0.10400390625, "loss_aux_layer_17": 0.112548828125, "loss_aux_layer_18": 0.12158203125, "loss_aux_layer_19": 0.125, "loss_aux_layer_2": 0.04510498046875, "loss_aux_layer_20": 0.132568359375, "loss_aux_layer_21": 0.14013671875, "loss_aux_layer_22": 0.159912109375, "loss_aux_layer_23": 0.19677734375, "loss_aux_layer_3": 0.05450439453125, "loss_aux_layer_4": 0.0562744140625, "loss_aux_layer_5": 0.05755615234375, "loss_aux_layer_6": 0.05999755859375, "loss_aux_layer_7": 0.0577392578125, "loss_aux_layer_8": 0.05718994140625, "loss_aux_layer_9": 0.05657958984375, "step": 4337, "total_loss": 0.6552916914224625 }, { "epoch": 0.8588398336962978, "grad_norm": 1.5381430387496948, "learning_rate": 5e-05, "llm_loss": 0.625957116484642, "loss": 2.8314, "loss_aux_layer_0": 0.021881103515625, "loss_aux_layer_1": 0.032440185546875, "loss_aux_layer_10": 0.05853271484375, "loss_aux_layer_11": 0.06268310546875, "loss_aux_layer_12": 0.0673828125, "loss_aux_layer_13": 0.072998046875, "loss_aux_layer_14": 0.081298828125, "loss_aux_layer_15": 0.08984375, "loss_aux_layer_16": 0.0987548828125, "loss_aux_layer_17": 0.10595703125, "loss_aux_layer_18": 0.11376953125, "loss_aux_layer_19": 0.116455078125, "loss_aux_layer_2": 0.0439453125, "loss_aux_layer_20": 0.12353515625, "loss_aux_layer_21": 0.1309814453125, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.0540771484375, "loss_aux_layer_4": 0.05682373046875, "loss_aux_layer_5": 0.05889892578125, "loss_aux_layer_6": 0.06182861328125, "loss_aux_layer_7": 0.0594482421875, "loss_aux_layer_8": 0.05865478515625, "loss_aux_layer_9": 0.0572509765625, "step": 4338, "total_loss": 0.7078584879636765 }, { "epoch": 0.8590378142941991, "grad_norm": 1.6631544828414917, "learning_rate": 5e-05, "llm_loss": 0.5708721876144409, "loss": 2.6068, "loss_aux_layer_0": 0.023193359375, "loss_aux_layer_1": 0.032135009765625, "loss_aux_layer_10": 0.0565185546875, "loss_aux_layer_11": 0.06005859375, "loss_aux_layer_12": 0.064453125, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.078369140625, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.0970458984375, "loss_aux_layer_17": 0.1048583984375, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.1171875, "loss_aux_layer_2": 0.0433349609375, "loss_aux_layer_20": 0.124755859375, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.191162109375, "loss_aux_layer_3": 0.05224609375, "loss_aux_layer_4": 0.054443359375, "loss_aux_layer_5": 0.0560302734375, "loss_aux_layer_6": 0.05859375, "loss_aux_layer_7": 0.0565185546875, "loss_aux_layer_8": 0.05633544921875, "loss_aux_layer_9": 0.05511474609375, "step": 4339, "total_loss": 0.6517030596733093 }, { "epoch": 0.8592357948921006, "grad_norm": 1.2615604400634766, "learning_rate": 5e-05, "llm_loss": 0.5050776153802872, "loss": 2.3453, "loss_aux_layer_0": 0.0147552490234375, "loss_aux_layer_1": 0.03179931640625, "loss_aux_layer_10": 0.05718994140625, "loss_aux_layer_11": 0.06085205078125, "loss_aux_layer_12": 0.0653076171875, "loss_aux_layer_13": 0.0709228515625, "loss_aux_layer_14": 0.079833984375, "loss_aux_layer_15": 0.088623046875, "loss_aux_layer_16": 0.097900390625, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.1131591796875, "loss_aux_layer_19": 0.1168212890625, "loss_aux_layer_2": 0.0452880859375, "loss_aux_layer_20": 0.124755859375, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.05426025390625, "loss_aux_layer_4": 0.0560302734375, "loss_aux_layer_5": 0.0574951171875, "loss_aux_layer_6": 0.05999755859375, "loss_aux_layer_7": 0.0579833984375, "loss_aux_layer_8": 0.05718994140625, "loss_aux_layer_9": 0.05609130859375, "step": 4340, "total_loss": 0.5863142311573029 }, { "epoch": 0.859433775490002, "grad_norm": 1.2612290382385254, "learning_rate": 5e-05, "llm_loss": 0.5075358226895332, "loss": 2.3516, "loss_aux_layer_0": 0.020477294921875, "loss_aux_layer_1": 0.032684326171875, "loss_aux_layer_10": 0.05718994140625, "loss_aux_layer_11": 0.06097412109375, "loss_aux_layer_12": 0.0650634765625, "loss_aux_layer_13": 0.0703125, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.0865478515625, "loss_aux_layer_16": 0.0953369140625, "loss_aux_layer_17": 0.102783203125, "loss_aux_layer_18": 0.111083984375, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.045166015625, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.05438232421875, "loss_aux_layer_4": 0.05596923828125, "loss_aux_layer_5": 0.05743408203125, "loss_aux_layer_6": 0.06005859375, "loss_aux_layer_7": 0.05816650390625, "loss_aux_layer_8": 0.0572509765625, "loss_aux_layer_9": 0.055908203125, "step": 4341, "total_loss": 0.5879009962081909 }, { "epoch": 0.8596317560879034, "grad_norm": 1.2356951236724854, "learning_rate": 5e-05, "llm_loss": 0.5850704610347748, "loss": 2.6605, "loss_aux_layer_0": 0.017669677734375, "loss_aux_layer_1": 0.030975341796875, "loss_aux_layer_10": 0.05731201171875, "loss_aux_layer_11": 0.061279296875, "loss_aux_layer_12": 0.06561279296875, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.0863037109375, "loss_aux_layer_16": 0.0953369140625, "loss_aux_layer_17": 0.1025390625, "loss_aux_layer_18": 0.111572265625, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.0426025390625, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.052734375, "loss_aux_layer_4": 0.0550537109375, "loss_aux_layer_5": 0.05694580078125, "loss_aux_layer_6": 0.05975341796875, "loss_aux_layer_7": 0.057861328125, "loss_aux_layer_8": 0.057373046875, "loss_aux_layer_9": 0.05621337890625, "step": 4342, "total_loss": 0.6651341319084167 }, { "epoch": 0.8598297366858048, "grad_norm": 0.9839916825294495, "learning_rate": 5e-05, "llm_loss": 0.5220007598400116, "loss": 2.4072, "loss_aux_layer_0": 0.01483154296875, "loss_aux_layer_1": 0.031707763671875, "loss_aux_layer_10": 0.0574951171875, "loss_aux_layer_11": 0.06109619140625, "loss_aux_layer_12": 0.06512451171875, "loss_aux_layer_13": 0.07049560546875, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.086669921875, "loss_aux_layer_16": 0.095458984375, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.1104736328125, "loss_aux_layer_19": 0.113525390625, "loss_aux_layer_2": 0.04473876953125, "loss_aux_layer_20": 0.1209716796875, "loss_aux_layer_21": 0.1290283203125, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.053955078125, "loss_aux_layer_4": 0.0560302734375, "loss_aux_layer_5": 0.05767822265625, "loss_aux_layer_6": 0.0601806640625, "loss_aux_layer_7": 0.0579833984375, "loss_aux_layer_8": 0.05718994140625, "loss_aux_layer_9": 0.0560302734375, "step": 4343, "total_loss": 0.601799726486206 }, { "epoch": 0.8600277172837062, "grad_norm": 1.289957880973816, "learning_rate": 5e-05, "llm_loss": 0.5127839669585228, "loss": 2.3606, "loss_aux_layer_0": 0.0142822265625, "loss_aux_layer_1": 0.030426025390625, "loss_aux_layer_10": 0.05377197265625, "loss_aux_layer_11": 0.05755615234375, "loss_aux_layer_12": 0.0615234375, "loss_aux_layer_13": 0.0665283203125, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.0928955078125, "loss_aux_layer_17": 0.1009521484375, "loss_aux_layer_18": 0.1092529296875, "loss_aux_layer_19": 0.113037109375, "loss_aux_layer_2": 0.0419921875, "loss_aux_layer_20": 0.1212158203125, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.05010986328125, "loss_aux_layer_4": 0.0518798828125, "loss_aux_layer_5": 0.05322265625, "loss_aux_layer_6": 0.0560302734375, "loss_aux_layer_7": 0.053955078125, "loss_aux_layer_8": 0.05328369140625, "loss_aux_layer_9": 0.0526123046875, "step": 4344, "total_loss": 0.5901479423046112 }, { "epoch": 0.8602256978816076, "grad_norm": 1.085800290107727, "learning_rate": 5e-05, "llm_loss": 0.6195140033960342, "loss": 2.799, "loss_aux_layer_0": 0.0153961181640625, "loss_aux_layer_1": 0.031646728515625, "loss_aux_layer_10": 0.05615234375, "loss_aux_layer_11": 0.05987548828125, "loss_aux_layer_12": 0.064208984375, "loss_aux_layer_13": 0.0697021484375, "loss_aux_layer_14": 0.078369140625, "loss_aux_layer_15": 0.087158203125, "loss_aux_layer_16": 0.0968017578125, "loss_aux_layer_17": 0.1044921875, "loss_aux_layer_18": 0.1126708984375, "loss_aux_layer_19": 0.11669921875, "loss_aux_layer_2": 0.04345703125, "loss_aux_layer_20": 0.1243896484375, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.0528564453125, "loss_aux_layer_4": 0.054931640625, "loss_aux_layer_5": 0.05615234375, "loss_aux_layer_6": 0.0587158203125, "loss_aux_layer_7": 0.05682373046875, "loss_aux_layer_8": 0.05645751953125, "loss_aux_layer_9": 0.05511474609375, "step": 4345, "total_loss": 0.6997607797384262 }, { "epoch": 0.860423678479509, "grad_norm": 1.2227476835250854, "learning_rate": 5e-05, "llm_loss": 0.5990902930498123, "loss": 2.7077, "loss_aux_layer_0": 0.0133514404296875, "loss_aux_layer_1": 0.0291748046875, "loss_aux_layer_10": 0.05389404296875, "loss_aux_layer_11": 0.05755615234375, "loss_aux_layer_12": 0.0616455078125, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.0755615234375, "loss_aux_layer_15": 0.0843505859375, "loss_aux_layer_16": 0.09423828125, "loss_aux_layer_17": 0.1025390625, "loss_aux_layer_18": 0.11083984375, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.04083251953125, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.04925537109375, "loss_aux_layer_4": 0.05169677734375, "loss_aux_layer_5": 0.0531005859375, "loss_aux_layer_6": 0.05609130859375, "loss_aux_layer_7": 0.05401611328125, "loss_aux_layer_8": 0.0537109375, "loss_aux_layer_9": 0.0526123046875, "step": 4346, "total_loss": 0.6769204437732697 }, { "epoch": 0.8606216590774104, "grad_norm": 0.8780529499053955, "learning_rate": 5e-05, "llm_loss": 0.572003036737442, "loss": 2.6118, "loss_aux_layer_0": 0.0135498046875, "loss_aux_layer_1": 0.0313720703125, "loss_aux_layer_10": 0.05743408203125, "loss_aux_layer_11": 0.06146240234375, "loss_aux_layer_12": 0.065673828125, "loss_aux_layer_13": 0.071044921875, "loss_aux_layer_14": 0.079345703125, "loss_aux_layer_15": 0.0875244140625, "loss_aux_layer_16": 0.0966796875, "loss_aux_layer_17": 0.104248046875, "loss_aux_layer_18": 0.1124267578125, "loss_aux_layer_19": 0.115966796875, "loss_aux_layer_2": 0.04473876953125, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.05389404296875, "loss_aux_layer_4": 0.056396484375, "loss_aux_layer_5": 0.05767822265625, "loss_aux_layer_6": 0.06072998046875, "loss_aux_layer_7": 0.0587158203125, "loss_aux_layer_8": 0.0576171875, "loss_aux_layer_9": 0.0562744140625, "step": 4347, "total_loss": 0.6529513448476791 }, { "epoch": 0.8608196396753118, "grad_norm": 1.0564037561416626, "learning_rate": 5e-05, "llm_loss": 0.5776296854019165, "loss": 2.6319, "loss_aux_layer_0": 0.01397705078125, "loss_aux_layer_1": 0.029998779296875, "loss_aux_layer_10": 0.05548095703125, "loss_aux_layer_11": 0.05908203125, "loss_aux_layer_12": 0.06353759765625, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.0780029296875, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.096435546875, "loss_aux_layer_17": 0.1048583984375, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.0421142578125, "loss_aux_layer_20": 0.12646484375, "loss_aux_layer_21": 0.135498046875, "loss_aux_layer_22": 0.157470703125, "loss_aux_layer_23": 0.195556640625, "loss_aux_layer_3": 0.05120849609375, "loss_aux_layer_4": 0.0535888671875, "loss_aux_layer_5": 0.05487060546875, "loss_aux_layer_6": 0.0577392578125, "loss_aux_layer_7": 0.05572509765625, "loss_aux_layer_8": 0.05523681640625, "loss_aux_layer_9": 0.0543212890625, "step": 4348, "total_loss": 0.6579791158437729 }, { "epoch": 0.8610176202732133, "grad_norm": 1.1781752109527588, "learning_rate": 5e-05, "llm_loss": 0.6198561638593674, "loss": 2.7986, "loss_aux_layer_0": 0.0151214599609375, "loss_aux_layer_1": 0.032196044921875, "loss_aux_layer_10": 0.056884765625, "loss_aux_layer_11": 0.06085205078125, "loss_aux_layer_12": 0.06500244140625, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.078369140625, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.09521484375, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.1114501953125, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.0438232421875, "loss_aux_layer_20": 0.12255859375, "loss_aux_layer_21": 0.1297607421875, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.0533447265625, "loss_aux_layer_4": 0.05596923828125, "loss_aux_layer_5": 0.05718994140625, "loss_aux_layer_6": 0.059814453125, "loss_aux_layer_7": 0.05755615234375, "loss_aux_layer_8": 0.05682373046875, "loss_aux_layer_9": 0.05572509765625, "step": 4349, "total_loss": 0.699649453163147 }, { "epoch": 0.8612156008711146, "grad_norm": 0.8354665637016296, "learning_rate": 5e-05, "llm_loss": 0.49795037508010864, "loss": 2.3247, "loss_aux_layer_0": 0.0123748779296875, "loss_aux_layer_1": 0.03253173828125, "loss_aux_layer_10": 0.06011962890625, "loss_aux_layer_11": 0.064208984375, "loss_aux_layer_12": 0.06884765625, "loss_aux_layer_13": 0.0743408203125, "loss_aux_layer_14": 0.0828857421875, "loss_aux_layer_15": 0.0911865234375, "loss_aux_layer_16": 0.1005859375, "loss_aux_layer_17": 0.1075439453125, "loss_aux_layer_18": 0.115478515625, "loss_aux_layer_19": 0.1181640625, "loss_aux_layer_2": 0.0450439453125, "loss_aux_layer_20": 0.12548828125, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.15625, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.05535888671875, "loss_aux_layer_4": 0.05792236328125, "loss_aux_layer_5": 0.05926513671875, "loss_aux_layer_6": 0.062255859375, "loss_aux_layer_7": 0.06048583984375, "loss_aux_layer_8": 0.05975341796875, "loss_aux_layer_9": 0.05877685546875, "step": 4350, "total_loss": 0.5811810195446014 }, { "epoch": 0.861413581469016, "grad_norm": 1.188635230064392, "learning_rate": 5e-05, "llm_loss": 0.678323283791542, "loss": 3.0342, "loss_aux_layer_0": 0.012969970703125, "loss_aux_layer_1": 0.031341552734375, "loss_aux_layer_10": 0.05810546875, "loss_aux_layer_11": 0.06182861328125, "loss_aux_layer_12": 0.0657958984375, "loss_aux_layer_13": 0.0712890625, "loss_aux_layer_14": 0.0794677734375, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.09619140625, "loss_aux_layer_17": 0.1038818359375, "loss_aux_layer_18": 0.1124267578125, "loss_aux_layer_19": 0.1148681640625, "loss_aux_layer_2": 0.04339599609375, "loss_aux_layer_20": 0.123046875, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.05316162109375, "loss_aux_layer_4": 0.05584716796875, "loss_aux_layer_5": 0.05755615234375, "loss_aux_layer_6": 0.06085205078125, "loss_aux_layer_7": 0.05889892578125, "loss_aux_layer_8": 0.05810546875, "loss_aux_layer_9": 0.05670166015625, "step": 4351, "total_loss": 0.7585466057062149 }, { "epoch": 0.8616115620669175, "grad_norm": 0.8188969492912292, "learning_rate": 5e-05, "llm_loss": 0.559229388833046, "loss": 2.5578, "loss_aux_layer_0": 0.0139923095703125, "loss_aux_layer_1": 0.031005859375, "loss_aux_layer_10": 0.05731201171875, "loss_aux_layer_11": 0.0611572265625, "loss_aux_layer_12": 0.06549072265625, "loss_aux_layer_13": 0.0709228515625, "loss_aux_layer_14": 0.0792236328125, "loss_aux_layer_15": 0.08740234375, "loss_aux_layer_16": 0.0963134765625, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.112060546875, "loss_aux_layer_19": 0.1151123046875, "loss_aux_layer_2": 0.0433349609375, "loss_aux_layer_20": 0.12255859375, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.05291748046875, "loss_aux_layer_4": 0.05548095703125, "loss_aux_layer_5": 0.05694580078125, "loss_aux_layer_6": 0.05987548828125, "loss_aux_layer_7": 0.05792236328125, "loss_aux_layer_8": 0.0574951171875, "loss_aux_layer_9": 0.056396484375, "step": 4352, "total_loss": 0.6394559890031815 }, { "epoch": 0.8618095426648188, "grad_norm": 1.1458624601364136, "learning_rate": 5e-05, "llm_loss": 0.5879334956407547, "loss": 2.6611, "loss_aux_layer_0": 0.01324462890625, "loss_aux_layer_1": 0.02960205078125, "loss_aux_layer_10": 0.05548095703125, "loss_aux_layer_11": 0.05902099609375, "loss_aux_layer_12": 0.06304931640625, "loss_aux_layer_13": 0.068359375, "loss_aux_layer_14": 0.0762939453125, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.1009521484375, "loss_aux_layer_18": 0.10888671875, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.041259765625, "loss_aux_layer_20": 0.119384765625, "loss_aux_layer_21": 0.1256103515625, "loss_aux_layer_22": 0.14501953125, "loss_aux_layer_23": 0.17919921875, "loss_aux_layer_3": 0.05084228515625, "loss_aux_layer_4": 0.05316162109375, "loss_aux_layer_5": 0.0545654296875, "loss_aux_layer_6": 0.057373046875, "loss_aux_layer_7": 0.0556640625, "loss_aux_layer_8": 0.05523681640625, "loss_aux_layer_9": 0.05426025390625, "step": 4353, "total_loss": 0.6652824580669403 }, { "epoch": 0.8620075232627202, "grad_norm": 0.8495781421661377, "learning_rate": 5e-05, "llm_loss": 0.554302528500557, "loss": 2.5275, "loss_aux_layer_0": 0.01318359375, "loss_aux_layer_1": 0.030242919921875, "loss_aux_layer_10": 0.05560302734375, "loss_aux_layer_11": 0.059326171875, "loss_aux_layer_12": 0.06329345703125, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.0760498046875, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.09326171875, "loss_aux_layer_17": 0.1004638671875, "loss_aux_layer_18": 0.1082763671875, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.04193115234375, "loss_aux_layer_20": 0.1195068359375, "loss_aux_layer_21": 0.126708984375, "loss_aux_layer_22": 0.145263671875, "loss_aux_layer_23": 0.180419921875, "loss_aux_layer_3": 0.0513916015625, "loss_aux_layer_4": 0.0538330078125, "loss_aux_layer_5": 0.05517578125, "loss_aux_layer_6": 0.05810546875, "loss_aux_layer_7": 0.05609130859375, "loss_aux_layer_8": 0.05548095703125, "loss_aux_layer_9": 0.05426025390625, "step": 4354, "total_loss": 0.6318754851818085 }, { "epoch": 0.8622055038606217, "grad_norm": 0.892988920211792, "learning_rate": 5e-05, "llm_loss": 0.6572708785533905, "loss": 2.9397, "loss_aux_layer_0": 0.0139007568359375, "loss_aux_layer_1": 0.02947998046875, "loss_aux_layer_10": 0.0545654296875, "loss_aux_layer_11": 0.05853271484375, "loss_aux_layer_12": 0.06268310546875, "loss_aux_layer_13": 0.06817626953125, "loss_aux_layer_14": 0.076416015625, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.0938720703125, "loss_aux_layer_17": 0.1016845703125, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.11328125, "loss_aux_layer_2": 0.040283203125, "loss_aux_layer_20": 0.1217041015625, "loss_aux_layer_21": 0.1287841796875, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.04974365234375, "loss_aux_layer_4": 0.05218505859375, "loss_aux_layer_5": 0.053466796875, "loss_aux_layer_6": 0.05670166015625, "loss_aux_layer_7": 0.05487060546875, "loss_aux_layer_8": 0.054443359375, "loss_aux_layer_9": 0.05322265625, "step": 4355, "total_loss": 0.7349348664283752 }, { "epoch": 0.8624034844585231, "grad_norm": 0.984249472618103, "learning_rate": 5e-05, "llm_loss": 0.5390341356396675, "loss": 2.4866, "loss_aux_layer_0": 0.013336181640625, "loss_aux_layer_1": 0.0308837890625, "loss_aux_layer_10": 0.0582275390625, "loss_aux_layer_11": 0.06231689453125, "loss_aux_layer_12": 0.067138671875, "loss_aux_layer_13": 0.0728759765625, "loss_aux_layer_14": 0.081787109375, "loss_aux_layer_15": 0.0904541015625, "loss_aux_layer_16": 0.1004638671875, "loss_aux_layer_17": 0.109130859375, "loss_aux_layer_18": 0.1177978515625, "loss_aux_layer_19": 0.1212158203125, "loss_aux_layer_2": 0.04278564453125, "loss_aux_layer_20": 0.128662109375, "loss_aux_layer_21": 0.136474609375, "loss_aux_layer_22": 0.15771484375, "loss_aux_layer_23": 0.195068359375, "loss_aux_layer_3": 0.05242919921875, "loss_aux_layer_4": 0.0550537109375, "loss_aux_layer_5": 0.056396484375, "loss_aux_layer_6": 0.059814453125, "loss_aux_layer_7": 0.05804443359375, "loss_aux_layer_8": 0.0576171875, "loss_aux_layer_9": 0.056640625, "step": 4356, "total_loss": 0.6216384917497635 }, { "epoch": 0.8626014650564244, "grad_norm": 0.7534884214401245, "learning_rate": 5e-05, "llm_loss": 0.5028895139694214, "loss": 2.3267, "loss_aux_layer_0": 0.0125274658203125, "loss_aux_layer_1": 0.029388427734375, "loss_aux_layer_10": 0.05511474609375, "loss_aux_layer_11": 0.05908203125, "loss_aux_layer_12": 0.0633544921875, "loss_aux_layer_13": 0.0687255859375, "loss_aux_layer_14": 0.0771484375, "loss_aux_layer_15": 0.0855712890625, "loss_aux_layer_16": 0.09521484375, "loss_aux_layer_17": 0.102783203125, "loss_aux_layer_18": 0.111572265625, "loss_aux_layer_19": 0.115966796875, "loss_aux_layer_2": 0.04022216796875, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.04937744140625, "loss_aux_layer_4": 0.0518798828125, "loss_aux_layer_5": 0.0533447265625, "loss_aux_layer_6": 0.056396484375, "loss_aux_layer_7": 0.0548095703125, "loss_aux_layer_8": 0.0546875, "loss_aux_layer_9": 0.05377197265625, "step": 4357, "total_loss": 0.5816720724105835 }, { "epoch": 0.8627994456543259, "grad_norm": 0.8553579449653625, "learning_rate": 5e-05, "llm_loss": 0.5127817019820213, "loss": 2.3588, "loss_aux_layer_0": 0.014312744140625, "loss_aux_layer_1": 0.028533935546875, "loss_aux_layer_10": 0.0526123046875, "loss_aux_layer_11": 0.0562744140625, "loss_aux_layer_12": 0.0604248046875, "loss_aux_layer_13": 0.06561279296875, "loss_aux_layer_14": 0.0740966796875, "loss_aux_layer_15": 0.0830078125, "loss_aux_layer_16": 0.09228515625, "loss_aux_layer_17": 0.1004638671875, "loss_aux_layer_18": 0.109619140625, "loss_aux_layer_19": 0.114501953125, "loss_aux_layer_2": 0.03863525390625, "loss_aux_layer_20": 0.1229248046875, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.0474853515625, "loss_aux_layer_4": 0.0499267578125, "loss_aux_layer_5": 0.05157470703125, "loss_aux_layer_6": 0.0545654296875, "loss_aux_layer_7": 0.0526123046875, "loss_aux_layer_8": 0.05224609375, "loss_aux_layer_9": 0.05145263671875, "step": 4358, "total_loss": 0.5897038131952286 }, { "epoch": 0.8629974262522273, "grad_norm": 0.8297107219696045, "learning_rate": 5e-05, "llm_loss": 0.542856752872467, "loss": 2.4982, "loss_aux_layer_0": 0.012359619140625, "loss_aux_layer_1": 0.03173828125, "loss_aux_layer_10": 0.058349609375, "loss_aux_layer_11": 0.06231689453125, "loss_aux_layer_12": 0.06689453125, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.08056640625, "loss_aux_layer_15": 0.0888671875, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.10595703125, "loss_aux_layer_18": 0.1138916015625, "loss_aux_layer_19": 0.116943359375, "loss_aux_layer_2": 0.04443359375, "loss_aux_layer_20": 0.124755859375, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05426025390625, "loss_aux_layer_4": 0.056884765625, "loss_aux_layer_5": 0.0584716796875, "loss_aux_layer_6": 0.0614013671875, "loss_aux_layer_7": 0.05908203125, "loss_aux_layer_8": 0.058349609375, "loss_aux_layer_9": 0.05718994140625, "step": 4359, "total_loss": 0.624558724462986 }, { "epoch": 0.8631954068501286, "grad_norm": 0.7246631383895874, "learning_rate": 5e-05, "llm_loss": 0.4892195463180542, "loss": 2.2745, "loss_aux_layer_0": 0.0123748779296875, "loss_aux_layer_1": 0.030517578125, "loss_aux_layer_10": 0.05670166015625, "loss_aux_layer_11": 0.060546875, "loss_aux_layer_12": 0.0645751953125, "loss_aux_layer_13": 0.069580078125, "loss_aux_layer_14": 0.07763671875, "loss_aux_layer_15": 0.0859375, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.1026611328125, "loss_aux_layer_18": 0.1102294921875, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.0426025390625, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.05224609375, "loss_aux_layer_4": 0.0546875, "loss_aux_layer_5": 0.05633544921875, "loss_aux_layer_6": 0.05914306640625, "loss_aux_layer_7": 0.05718994140625, "loss_aux_layer_8": 0.05657958984375, "loss_aux_layer_9": 0.05548095703125, "step": 4360, "total_loss": 0.5686221718788147 }, { "epoch": 0.8633933874480301, "grad_norm": 0.7873618602752686, "learning_rate": 5e-05, "llm_loss": 0.6129175573587418, "loss": 2.7682, "loss_aux_layer_0": 0.011871337890625, "loss_aux_layer_1": 0.0308837890625, "loss_aux_layer_10": 0.056396484375, "loss_aux_layer_11": 0.06036376953125, "loss_aux_layer_12": 0.0645751953125, "loss_aux_layer_13": 0.069580078125, "loss_aux_layer_14": 0.0772705078125, "loss_aux_layer_15": 0.0850830078125, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.1103515625, "loss_aux_layer_19": 0.11376953125, "loss_aux_layer_2": 0.0423583984375, "loss_aux_layer_20": 0.1217041015625, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.05181884765625, "loss_aux_layer_4": 0.0540771484375, "loss_aux_layer_5": 0.055419921875, "loss_aux_layer_6": 0.05816650390625, "loss_aux_layer_7": 0.0562744140625, "loss_aux_layer_8": 0.05596923828125, "loss_aux_layer_9": 0.05487060546875, "step": 4361, "total_loss": 0.6920419335365295 }, { "epoch": 0.8635913680459315, "grad_norm": 0.7829390168190002, "learning_rate": 5e-05, "llm_loss": 0.5502806305885315, "loss": 2.5138, "loss_aux_layer_0": 0.0125274658203125, "loss_aux_layer_1": 0.03021240234375, "loss_aux_layer_10": 0.0557861328125, "loss_aux_layer_11": 0.0594482421875, "loss_aux_layer_12": 0.0633544921875, "loss_aux_layer_13": 0.068603515625, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.0938720703125, "loss_aux_layer_17": 0.1015625, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.04144287109375, "loss_aux_layer_20": 0.1201171875, "loss_aux_layer_21": 0.1282958984375, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.05078125, "loss_aux_layer_4": 0.05328369140625, "loss_aux_layer_5": 0.05487060546875, "loss_aux_layer_6": 0.05792236328125, "loss_aux_layer_7": 0.0560302734375, "loss_aux_layer_8": 0.05535888671875, "loss_aux_layer_9": 0.054443359375, "step": 4362, "total_loss": 0.6284604370594025 }, { "epoch": 0.8637893486438329, "grad_norm": 0.8905971646308899, "learning_rate": 5e-05, "llm_loss": 0.667939692735672, "loss": 2.9693, "loss_aux_layer_0": 0.0118865966796875, "loss_aux_layer_1": 0.02825927734375, "loss_aux_layer_10": 0.0518798828125, "loss_aux_layer_11": 0.05535888671875, "loss_aux_layer_12": 0.0592041015625, "loss_aux_layer_13": 0.064208984375, "loss_aux_layer_14": 0.072265625, "loss_aux_layer_15": 0.0804443359375, "loss_aux_layer_16": 0.08935546875, "loss_aux_layer_17": 0.0975341796875, "loss_aux_layer_18": 0.105224609375, "loss_aux_layer_19": 0.109375, "loss_aux_layer_2": 0.038818359375, "loss_aux_layer_20": 0.1171875, "loss_aux_layer_21": 0.125, "loss_aux_layer_22": 0.1435546875, "loss_aux_layer_23": 0.177001953125, "loss_aux_layer_3": 0.04766845703125, "loss_aux_layer_4": 0.05010986328125, "loss_aux_layer_5": 0.051513671875, "loss_aux_layer_6": 0.05413818359375, "loss_aux_layer_7": 0.0523681640625, "loss_aux_layer_8": 0.05194091796875, "loss_aux_layer_9": 0.05078125, "step": 4363, "total_loss": 0.7423264533281326 }, { "epoch": 0.8639873292417343, "grad_norm": 0.8247187733650208, "learning_rate": 5e-05, "llm_loss": 0.5624281540513039, "loss": 2.5731, "loss_aux_layer_0": 0.0115966796875, "loss_aux_layer_1": 0.030731201171875, "loss_aux_layer_10": 0.0582275390625, "loss_aux_layer_11": 0.06195068359375, "loss_aux_layer_12": 0.066162109375, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.0799560546875, "loss_aux_layer_15": 0.08837890625, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.1053466796875, "loss_aux_layer_18": 0.114013671875, "loss_aux_layer_19": 0.1168212890625, "loss_aux_layer_2": 0.042724609375, "loss_aux_layer_20": 0.1248779296875, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.05242919921875, "loss_aux_layer_4": 0.05535888671875, "loss_aux_layer_5": 0.05718994140625, "loss_aux_layer_6": 0.0604248046875, "loss_aux_layer_7": 0.05865478515625, "loss_aux_layer_8": 0.0584716796875, "loss_aux_layer_9": 0.05712890625, "step": 4364, "total_loss": 0.6432849615812302 }, { "epoch": 0.8641853098396357, "grad_norm": 0.8476805090904236, "learning_rate": 5e-05, "llm_loss": 0.511728823184967, "loss": 2.3634, "loss_aux_layer_0": 0.011505126953125, "loss_aux_layer_1": 0.030731201171875, "loss_aux_layer_10": 0.05706787109375, "loss_aux_layer_11": 0.0609130859375, "loss_aux_layer_12": 0.0653076171875, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.078369140625, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.0953369140625, "loss_aux_layer_17": 0.1026611328125, "loss_aux_layer_18": 0.1109619140625, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.04168701171875, "loss_aux_layer_20": 0.1209716796875, "loss_aux_layer_21": 0.1285400390625, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.0517578125, "loss_aux_layer_4": 0.05450439453125, "loss_aux_layer_5": 0.05609130859375, "loss_aux_layer_6": 0.0592041015625, "loss_aux_layer_7": 0.057373046875, "loss_aux_layer_8": 0.0567626953125, "loss_aux_layer_9": 0.05572509765625, "step": 4365, "total_loss": 0.59084652364254 }, { "epoch": 0.8643832904375371, "grad_norm": 0.770021915435791, "learning_rate": 5e-05, "llm_loss": 0.5503784567117691, "loss": 2.505, "loss_aux_layer_0": 0.0116119384765625, "loss_aux_layer_1": 0.029266357421875, "loss_aux_layer_10": 0.052978515625, "loss_aux_layer_11": 0.056640625, "loss_aux_layer_12": 0.06103515625, "loss_aux_layer_13": 0.06591796875, "loss_aux_layer_14": 0.07373046875, "loss_aux_layer_15": 0.08203125, "loss_aux_layer_16": 0.0911865234375, "loss_aux_layer_17": 0.0989990234375, "loss_aux_layer_18": 0.107177734375, "loss_aux_layer_19": 0.1104736328125, "loss_aux_layer_2": 0.03948974609375, "loss_aux_layer_20": 0.11865234375, "loss_aux_layer_21": 0.126708984375, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.04840087890625, "loss_aux_layer_4": 0.05072021484375, "loss_aux_layer_5": 0.05218505859375, "loss_aux_layer_6": 0.05487060546875, "loss_aux_layer_7": 0.0533447265625, "loss_aux_layer_8": 0.05279541015625, "loss_aux_layer_9": 0.05181884765625, "step": 4366, "total_loss": 0.6262525320053101 }, { "epoch": 0.8645812710354386, "grad_norm": 0.7623249292373657, "learning_rate": 5e-05, "llm_loss": 0.5416633486747742, "loss": 2.4834, "loss_aux_layer_0": 0.0113372802734375, "loss_aux_layer_1": 0.02996826171875, "loss_aux_layer_10": 0.055419921875, "loss_aux_layer_11": 0.05926513671875, "loss_aux_layer_12": 0.0633544921875, "loss_aux_layer_13": 0.068603515625, "loss_aux_layer_14": 0.0772705078125, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.10302734375, "loss_aux_layer_18": 0.111572265625, "loss_aux_layer_19": 0.11572265625, "loss_aux_layer_2": 0.0413818359375, "loss_aux_layer_20": 0.1239013671875, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.0509033203125, "loss_aux_layer_4": 0.05328369140625, "loss_aux_layer_5": 0.0548095703125, "loss_aux_layer_6": 0.0577392578125, "loss_aux_layer_7": 0.0557861328125, "loss_aux_layer_8": 0.0552978515625, "loss_aux_layer_9": 0.0543212890625, "step": 4367, "total_loss": 0.620849497616291 }, { "epoch": 0.8647792516333399, "grad_norm": 0.9680432081222534, "learning_rate": 5e-05, "llm_loss": 0.6047955006361008, "loss": 2.7333, "loss_aux_layer_0": 0.0112457275390625, "loss_aux_layer_1": 0.030548095703125, "loss_aux_layer_10": 0.05670166015625, "loss_aux_layer_11": 0.060302734375, "loss_aux_layer_12": 0.06475830078125, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.077392578125, "loss_aux_layer_15": 0.0850830078125, "loss_aux_layer_16": 0.09375, "loss_aux_layer_17": 0.1015625, "loss_aux_layer_18": 0.109375, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.04254150390625, "loss_aux_layer_20": 0.11962890625, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.05206298828125, "loss_aux_layer_4": 0.05474853515625, "loss_aux_layer_5": 0.056396484375, "loss_aux_layer_6": 0.059326171875, "loss_aux_layer_7": 0.057373046875, "loss_aux_layer_8": 0.05657958984375, "loss_aux_layer_9": 0.0552978515625, "step": 4368, "total_loss": 0.6833353340625763 }, { "epoch": 0.8649772322312413, "grad_norm": 0.7563826441764832, "learning_rate": 5e-05, "llm_loss": 0.5728887915611267, "loss": 2.5904, "loss_aux_layer_0": 0.010955810546875, "loss_aux_layer_1": 0.028717041015625, "loss_aux_layer_10": 0.052001953125, "loss_aux_layer_11": 0.0556640625, "loss_aux_layer_12": 0.0596923828125, "loss_aux_layer_13": 0.06451416015625, "loss_aux_layer_14": 0.0723876953125, "loss_aux_layer_15": 0.080322265625, "loss_aux_layer_16": 0.0897216796875, "loss_aux_layer_17": 0.097412109375, "loss_aux_layer_18": 0.1053466796875, "loss_aux_layer_19": 0.1092529296875, "loss_aux_layer_2": 0.038818359375, "loss_aux_layer_20": 0.1171875, "loss_aux_layer_21": 0.1253662109375, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.04754638671875, "loss_aux_layer_4": 0.0498046875, "loss_aux_layer_5": 0.05120849609375, "loss_aux_layer_6": 0.0538330078125, "loss_aux_layer_7": 0.0521240234375, "loss_aux_layer_8": 0.0518798828125, "loss_aux_layer_9": 0.0506591796875, "step": 4369, "total_loss": 0.6476057469844818 }, { "epoch": 0.8651752128291428, "grad_norm": 0.9148375988006592, "learning_rate": 5e-05, "llm_loss": 0.5602076798677444, "loss": 2.5525, "loss_aux_layer_0": 0.0110321044921875, "loss_aux_layer_1": 0.028900146484375, "loss_aux_layer_10": 0.05548095703125, "loss_aux_layer_11": 0.05914306640625, "loss_aux_layer_12": 0.06341552734375, "loss_aux_layer_13": 0.0687255859375, "loss_aux_layer_14": 0.0770263671875, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.0946044921875, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.10986328125, "loss_aux_layer_19": 0.1134033203125, "loss_aux_layer_2": 0.04022216796875, "loss_aux_layer_20": 0.12109375, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.0496826171875, "loss_aux_layer_4": 0.0523681640625, "loss_aux_layer_5": 0.053955078125, "loss_aux_layer_6": 0.056884765625, "loss_aux_layer_7": 0.055419921875, "loss_aux_layer_8": 0.05523681640625, "loss_aux_layer_9": 0.0543212890625, "step": 4370, "total_loss": 0.6381146162748337 }, { "epoch": 0.8653731934270441, "grad_norm": 0.9537983536720276, "learning_rate": 5e-05, "llm_loss": 0.5423783585429192, "loss": 2.4779, "loss_aux_layer_0": 0.011474609375, "loss_aux_layer_1": 0.030059814453125, "loss_aux_layer_10": 0.05462646484375, "loss_aux_layer_11": 0.05877685546875, "loss_aux_layer_12": 0.06280517578125, "loss_aux_layer_13": 0.0679931640625, "loss_aux_layer_14": 0.075927734375, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.0928955078125, "loss_aux_layer_17": 0.100341796875, "loss_aux_layer_18": 0.10791015625, "loss_aux_layer_19": 0.11083984375, "loss_aux_layer_2": 0.04168701171875, "loss_aux_layer_20": 0.1182861328125, "loss_aux_layer_21": 0.126220703125, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.181884765625, "loss_aux_layer_3": 0.05059814453125, "loss_aux_layer_4": 0.05291748046875, "loss_aux_layer_5": 0.05426025390625, "loss_aux_layer_6": 0.0570068359375, "loss_aux_layer_7": 0.05523681640625, "loss_aux_layer_8": 0.0545654296875, "loss_aux_layer_9": 0.05340576171875, "step": 4371, "total_loss": 0.619483470916748 }, { "epoch": 0.8655711740249455, "grad_norm": 0.9196308255195618, "learning_rate": 5e-05, "llm_loss": 0.5315055847167969, "loss": 2.4482, "loss_aux_layer_0": 0.011199951171875, "loss_aux_layer_1": 0.0311279296875, "loss_aux_layer_10": 0.0577392578125, "loss_aux_layer_11": 0.0615234375, "loss_aux_layer_12": 0.0655517578125, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.0789794921875, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.095947265625, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.1114501953125, "loss_aux_layer_19": 0.115478515625, "loss_aux_layer_2": 0.04364013671875, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.05328369140625, "loss_aux_layer_4": 0.0555419921875, "loss_aux_layer_5": 0.05694580078125, "loss_aux_layer_6": 0.05975341796875, "loss_aux_layer_7": 0.05810546875, "loss_aux_layer_8": 0.0574951171875, "loss_aux_layer_9": 0.05645751953125, "step": 4372, "total_loss": 0.6120385378599167 }, { "epoch": 0.865769154622847, "grad_norm": 0.8277393579483032, "learning_rate": 5e-05, "llm_loss": 0.5711634755134583, "loss": 2.6025, "loss_aux_layer_0": 0.0111846923828125, "loss_aux_layer_1": 0.029327392578125, "loss_aux_layer_10": 0.05572509765625, "loss_aux_layer_11": 0.05963134765625, "loss_aux_layer_12": 0.06451416015625, "loss_aux_layer_13": 0.07025146484375, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.10498046875, "loss_aux_layer_18": 0.11328125, "loss_aux_layer_19": 0.116455078125, "loss_aux_layer_2": 0.04107666015625, "loss_aux_layer_20": 0.12451171875, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.05047607421875, "loss_aux_layer_4": 0.052978515625, "loss_aux_layer_5": 0.054443359375, "loss_aux_layer_6": 0.0574951171875, "loss_aux_layer_7": 0.05572509765625, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.0545654296875, "step": 4373, "total_loss": 0.6506287902593613 }, { "epoch": 0.8659671352207484, "grad_norm": 0.9869719743728638, "learning_rate": 5e-05, "llm_loss": 0.6042542457580566, "loss": 2.7184, "loss_aux_layer_0": 0.010711669921875, "loss_aux_layer_1": 0.027740478515625, "loss_aux_layer_10": 0.0537109375, "loss_aux_layer_11": 0.057373046875, "loss_aux_layer_12": 0.06134033203125, "loss_aux_layer_13": 0.0662841796875, "loss_aux_layer_14": 0.0745849609375, "loss_aux_layer_15": 0.0823974609375, "loss_aux_layer_16": 0.091064453125, "loss_aux_layer_17": 0.098876953125, "loss_aux_layer_18": 0.1068115234375, "loss_aux_layer_19": 0.1094970703125, "loss_aux_layer_2": 0.0390625, "loss_aux_layer_20": 0.1170654296875, "loss_aux_layer_21": 0.12451171875, "loss_aux_layer_22": 0.14306640625, "loss_aux_layer_23": 0.177978515625, "loss_aux_layer_3": 0.04815673828125, "loss_aux_layer_4": 0.0506591796875, "loss_aux_layer_5": 0.05230712890625, "loss_aux_layer_6": 0.0552978515625, "loss_aux_layer_7": 0.0538330078125, "loss_aux_layer_8": 0.0533447265625, "loss_aux_layer_9": 0.05255126953125, "step": 4374, "total_loss": 0.6795946061611176 }, { "epoch": 0.8661651158186497, "grad_norm": 0.8425791263580322, "learning_rate": 5e-05, "llm_loss": 0.5490225553512573, "loss": 2.5119, "loss_aux_layer_0": 0.011627197265625, "loss_aux_layer_1": 0.029144287109375, "loss_aux_layer_10": 0.05499267578125, "loss_aux_layer_11": 0.05889892578125, "loss_aux_layer_12": 0.0628662109375, "loss_aux_layer_13": 0.06817626953125, "loss_aux_layer_14": 0.0770263671875, "loss_aux_layer_15": 0.0855712890625, "loss_aux_layer_16": 0.09521484375, "loss_aux_layer_17": 0.1026611328125, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.1156005859375, "loss_aux_layer_2": 0.04052734375, "loss_aux_layer_20": 0.12353515625, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.0501708984375, "loss_aux_layer_4": 0.05255126953125, "loss_aux_layer_5": 0.05438232421875, "loss_aux_layer_6": 0.057373046875, "loss_aux_layer_7": 0.05560302734375, "loss_aux_layer_8": 0.05511474609375, "loss_aux_layer_9": 0.053955078125, "step": 4375, "total_loss": 0.627980425953865 }, { "epoch": 0.8663630964165512, "grad_norm": 1.2170555591583252, "learning_rate": 5e-05, "llm_loss": 0.5167871788144112, "loss": 2.387, "loss_aux_layer_0": 0.0108795166015625, "loss_aux_layer_1": 0.02874755859375, "loss_aux_layer_10": 0.0562744140625, "loss_aux_layer_11": 0.06011962890625, "loss_aux_layer_12": 0.06451416015625, "loss_aux_layer_13": 0.0703125, "loss_aux_layer_14": 0.07958984375, "loss_aux_layer_15": 0.0885009765625, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.106201171875, "loss_aux_layer_18": 0.114501953125, "loss_aux_layer_19": 0.117919921875, "loss_aux_layer_2": 0.04083251953125, "loss_aux_layer_20": 0.1248779296875, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.0504150390625, "loss_aux_layer_4": 0.05303955078125, "loss_aux_layer_5": 0.0546875, "loss_aux_layer_6": 0.05767822265625, "loss_aux_layer_7": 0.05584716796875, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.05462646484375, "step": 4376, "total_loss": 0.5967418104410172 }, { "epoch": 0.8665610770144526, "grad_norm": 0.7667085528373718, "learning_rate": 5e-05, "llm_loss": 0.5681025609374046, "loss": 2.5867, "loss_aux_layer_0": 0.0113372802734375, "loss_aux_layer_1": 0.028594970703125, "loss_aux_layer_10": 0.05499267578125, "loss_aux_layer_11": 0.05877685546875, "loss_aux_layer_12": 0.06304931640625, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.076416015625, "loss_aux_layer_15": 0.0849609375, "loss_aux_layer_16": 0.0946044921875, "loss_aux_layer_17": 0.1026611328125, "loss_aux_layer_18": 0.110595703125, "loss_aux_layer_19": 0.114501953125, "loss_aux_layer_2": 0.041015625, "loss_aux_layer_20": 0.1221923828125, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.05035400390625, "loss_aux_layer_4": 0.05267333984375, "loss_aux_layer_5": 0.05419921875, "loss_aux_layer_6": 0.05712890625, "loss_aux_layer_7": 0.05535888671875, "loss_aux_layer_8": 0.054931640625, "loss_aux_layer_9": 0.05364990234375, "step": 4377, "total_loss": 0.6466770321130753 }, { "epoch": 0.866759057612354, "grad_norm": 1.0349377393722534, "learning_rate": 5e-05, "llm_loss": 0.592369943857193, "loss": 2.6738, "loss_aux_layer_0": 0.011505126953125, "loss_aux_layer_1": 0.028045654296875, "loss_aux_layer_10": 0.05322265625, "loss_aux_layer_11": 0.05682373046875, "loss_aux_layer_12": 0.0611572265625, "loss_aux_layer_13": 0.0662841796875, "loss_aux_layer_14": 0.07421875, "loss_aux_layer_15": 0.0823974609375, "loss_aux_layer_16": 0.0916748046875, "loss_aux_layer_17": 0.0992431640625, "loss_aux_layer_18": 0.1077880859375, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.03924560546875, "loss_aux_layer_20": 0.1195068359375, "loss_aux_layer_21": 0.1282958984375, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.04827880859375, "loss_aux_layer_4": 0.05072021484375, "loss_aux_layer_5": 0.05224609375, "loss_aux_layer_6": 0.05474853515625, "loss_aux_layer_7": 0.05328369140625, "loss_aux_layer_8": 0.05279541015625, "loss_aux_layer_9": 0.052001953125, "step": 4378, "total_loss": 0.6684400141239166 }, { "epoch": 0.8669570382102554, "grad_norm": 0.9292194843292236, "learning_rate": 5e-05, "llm_loss": 0.5307934582233429, "loss": 2.4361, "loss_aux_layer_0": 0.0109405517578125, "loss_aux_layer_1": 0.0286865234375, "loss_aux_layer_10": 0.05499267578125, "loss_aux_layer_11": 0.0587158203125, "loss_aux_layer_12": 0.06304931640625, "loss_aux_layer_13": 0.0684814453125, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.0936279296875, "loss_aux_layer_17": 0.101318359375, "loss_aux_layer_18": 0.10986328125, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.03985595703125, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.04931640625, "loss_aux_layer_4": 0.0517578125, "loss_aux_layer_5": 0.0535888671875, "loss_aux_layer_6": 0.056640625, "loss_aux_layer_7": 0.0550537109375, "loss_aux_layer_8": 0.05487060546875, "loss_aux_layer_9": 0.05377197265625, "step": 4379, "total_loss": 0.6090271770954132 }, { "epoch": 0.8671550188081568, "grad_norm": 0.8584700226783752, "learning_rate": 5e-05, "llm_loss": 0.6196079701185226, "loss": 2.7849, "loss_aux_layer_0": 0.011688232421875, "loss_aux_layer_1": 0.029083251953125, "loss_aux_layer_10": 0.05352783203125, "loss_aux_layer_11": 0.05731201171875, "loss_aux_layer_12": 0.061767578125, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.0755615234375, "loss_aux_layer_15": 0.083740234375, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.100341796875, "loss_aux_layer_18": 0.10791015625, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.040283203125, "loss_aux_layer_20": 0.119384765625, "loss_aux_layer_21": 0.1273193359375, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.049072265625, "loss_aux_layer_4": 0.05133056640625, "loss_aux_layer_5": 0.05267333984375, "loss_aux_layer_6": 0.05572509765625, "loss_aux_layer_7": 0.05389404296875, "loss_aux_layer_8": 0.05352783203125, "loss_aux_layer_9": 0.0523681640625, "step": 4380, "total_loss": 0.6962297409772873 }, { "epoch": 0.8673529994060583, "grad_norm": 0.8821648955345154, "learning_rate": 5e-05, "llm_loss": 0.5612900108098984, "loss": 2.5616, "loss_aux_layer_0": 0.0112762451171875, "loss_aux_layer_1": 0.029541015625, "loss_aux_layer_10": 0.0560302734375, "loss_aux_layer_11": 0.05987548828125, "loss_aux_layer_12": 0.06414794921875, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.0775146484375, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.10302734375, "loss_aux_layer_18": 0.11083984375, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.042236328125, "loss_aux_layer_20": 0.122802734375, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.05169677734375, "loss_aux_layer_4": 0.0540771484375, "loss_aux_layer_5": 0.05560302734375, "loss_aux_layer_6": 0.05828857421875, "loss_aux_layer_7": 0.05657958984375, "loss_aux_layer_8": 0.05609130859375, "loss_aux_layer_9": 0.05487060546875, "step": 4381, "total_loss": 0.6404039859771729 }, { "epoch": 0.8675509800039596, "grad_norm": 0.7623862624168396, "learning_rate": 5e-05, "llm_loss": 0.5313980802893639, "loss": 2.4377, "loss_aux_layer_0": 0.01141357421875, "loss_aux_layer_1": 0.029571533203125, "loss_aux_layer_10": 0.055419921875, "loss_aux_layer_11": 0.0589599609375, "loss_aux_layer_12": 0.06317138671875, "loss_aux_layer_13": 0.068359375, "loss_aux_layer_14": 0.07666015625, "loss_aux_layer_15": 0.084716796875, "loss_aux_layer_16": 0.0938720703125, "loss_aux_layer_17": 0.1015625, "loss_aux_layer_18": 0.109619140625, "loss_aux_layer_19": 0.1136474609375, "loss_aux_layer_2": 0.04095458984375, "loss_aux_layer_20": 0.121337890625, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.0504150390625, "loss_aux_layer_4": 0.052978515625, "loss_aux_layer_5": 0.05462646484375, "loss_aux_layer_6": 0.0576171875, "loss_aux_layer_7": 0.05596923828125, "loss_aux_layer_8": 0.0552978515625, "loss_aux_layer_9": 0.05438232421875, "step": 4382, "total_loss": 0.6094200015068054 }, { "epoch": 0.867748960601861, "grad_norm": 1.180480718612671, "learning_rate": 5e-05, "llm_loss": 0.5742044895887375, "loss": 2.6053, "loss_aux_layer_0": 0.011566162109375, "loss_aux_layer_1": 0.029541015625, "loss_aux_layer_10": 0.0545654296875, "loss_aux_layer_11": 0.05792236328125, "loss_aux_layer_12": 0.06201171875, "loss_aux_layer_13": 0.0667724609375, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.10009765625, "loss_aux_layer_18": 0.1082763671875, "loss_aux_layer_19": 0.1119384765625, "loss_aux_layer_2": 0.04095458984375, "loss_aux_layer_20": 0.11962890625, "loss_aux_layer_21": 0.1273193359375, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.05029296875, "loss_aux_layer_4": 0.052734375, "loss_aux_layer_5": 0.05419921875, "loss_aux_layer_6": 0.05718994140625, "loss_aux_layer_7": 0.05535888671875, "loss_aux_layer_8": 0.05474853515625, "loss_aux_layer_9": 0.05364990234375, "step": 4383, "total_loss": 0.6513229608535767 }, { "epoch": 0.8679469411997625, "grad_norm": 1.0711214542388916, "learning_rate": 5e-05, "llm_loss": 0.5437268614768982, "loss": 2.4848, "loss_aux_layer_0": 0.0107879638671875, "loss_aux_layer_1": 0.027801513671875, "loss_aux_layer_10": 0.05474853515625, "loss_aux_layer_11": 0.05841064453125, "loss_aux_layer_12": 0.062255859375, "loss_aux_layer_13": 0.06732177734375, "loss_aux_layer_14": 0.0755615234375, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.109619140625, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.03924560546875, "loss_aux_layer_20": 0.122314453125, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.048583984375, "loss_aux_layer_4": 0.0513916015625, "loss_aux_layer_5": 0.05303955078125, "loss_aux_layer_6": 0.0562744140625, "loss_aux_layer_7": 0.054443359375, "loss_aux_layer_8": 0.0540771484375, "loss_aux_layer_9": 0.05352783203125, "step": 4384, "total_loss": 0.6212119162082672 }, { "epoch": 0.8681449217976638, "grad_norm": 0.9545615315437317, "learning_rate": 5e-05, "llm_loss": 0.604786790907383, "loss": 2.7377, "loss_aux_layer_0": 0.011260986328125, "loss_aux_layer_1": 0.0291748046875, "loss_aux_layer_10": 0.056640625, "loss_aux_layer_11": 0.060546875, "loss_aux_layer_12": 0.0648193359375, "loss_aux_layer_13": 0.0703125, "loss_aux_layer_14": 0.0792236328125, "loss_aux_layer_15": 0.08740234375, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.1129150390625, "loss_aux_layer_19": 0.115966796875, "loss_aux_layer_2": 0.040771484375, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.050048828125, "loss_aux_layer_4": 0.0528564453125, "loss_aux_layer_5": 0.0545654296875, "loss_aux_layer_6": 0.0576171875, "loss_aux_layer_7": 0.05584716796875, "loss_aux_layer_8": 0.05535888671875, "loss_aux_layer_9": 0.05487060546875, "step": 4385, "total_loss": 0.684417724609375 }, { "epoch": 0.8683429023955652, "grad_norm": 0.9478079080581665, "learning_rate": 5e-05, "llm_loss": 0.5709523558616638, "loss": 2.5846, "loss_aux_layer_0": 0.011383056640625, "loss_aux_layer_1": 0.028167724609375, "loss_aux_layer_10": 0.052490234375, "loss_aux_layer_11": 0.05609130859375, "loss_aux_layer_12": 0.06024169921875, "loss_aux_layer_13": 0.065673828125, "loss_aux_layer_14": 0.0736083984375, "loss_aux_layer_15": 0.0819091796875, "loss_aux_layer_16": 0.0908203125, "loss_aux_layer_17": 0.0986328125, "loss_aux_layer_18": 0.1063232421875, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.0389404296875, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.1260986328125, "loss_aux_layer_22": 0.14599609375, "loss_aux_layer_23": 0.181640625, "loss_aux_layer_3": 0.0474853515625, "loss_aux_layer_4": 0.04998779296875, "loss_aux_layer_5": 0.051513671875, "loss_aux_layer_6": 0.0543212890625, "loss_aux_layer_7": 0.05255126953125, "loss_aux_layer_8": 0.05218505859375, "loss_aux_layer_9": 0.05120849609375, "step": 4386, "total_loss": 0.6461559236049652 }, { "epoch": 0.8685408829934667, "grad_norm": 0.9778178334236145, "learning_rate": 5e-05, "llm_loss": 0.5723304450511932, "loss": 2.5987, "loss_aux_layer_0": 0.011383056640625, "loss_aux_layer_1": 0.028228759765625, "loss_aux_layer_10": 0.053466796875, "loss_aux_layer_11": 0.05694580078125, "loss_aux_layer_12": 0.0611572265625, "loss_aux_layer_13": 0.066650390625, "loss_aux_layer_14": 0.0753173828125, "loss_aux_layer_15": 0.083984375, "loss_aux_layer_16": 0.09326171875, "loss_aux_layer_17": 0.1011962890625, "loss_aux_layer_18": 0.109375, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.04010009765625, "loss_aux_layer_20": 0.122314453125, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.0489501953125, "loss_aux_layer_4": 0.05108642578125, "loss_aux_layer_5": 0.052734375, "loss_aux_layer_6": 0.05548095703125, "loss_aux_layer_7": 0.05377197265625, "loss_aux_layer_8": 0.05322265625, "loss_aux_layer_9": 0.05230712890625, "step": 4387, "total_loss": 0.6496667712926865 }, { "epoch": 0.8687388635913681, "grad_norm": 1.2847185134887695, "learning_rate": 5e-05, "llm_loss": 0.5130319967865944, "loss": 2.3545, "loss_aux_layer_0": 0.01129150390625, "loss_aux_layer_1": 0.027557373046875, "loss_aux_layer_10": 0.051513671875, "loss_aux_layer_11": 0.055419921875, "loss_aux_layer_12": 0.05987548828125, "loss_aux_layer_13": 0.0654296875, "loss_aux_layer_14": 0.0738525390625, "loss_aux_layer_15": 0.08251953125, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.1004638671875, "loss_aux_layer_18": 0.109130859375, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.03839111328125, "loss_aux_layer_20": 0.12109375, "loss_aux_layer_21": 0.128173828125, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.18310546875, "loss_aux_layer_3": 0.0472412109375, "loss_aux_layer_4": 0.0494384765625, "loss_aux_layer_5": 0.05120849609375, "loss_aux_layer_6": 0.053466796875, "loss_aux_layer_7": 0.05169677734375, "loss_aux_layer_8": 0.0511474609375, "loss_aux_layer_9": 0.05023193359375, "step": 4388, "total_loss": 0.5886373072862625 }, { "epoch": 0.8689368441892694, "grad_norm": 1.0885671377182007, "learning_rate": 5e-05, "llm_loss": 0.5566631332039833, "loss": 2.5403, "loss_aux_layer_0": 0.0130767822265625, "loss_aux_layer_1": 0.030059814453125, "loss_aux_layer_10": 0.0555419921875, "loss_aux_layer_11": 0.05950927734375, "loss_aux_layer_12": 0.06365966796875, "loss_aux_layer_13": 0.06884765625, "loss_aux_layer_14": 0.076904296875, "loss_aux_layer_15": 0.0850830078125, "loss_aux_layer_16": 0.0943603515625, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.04229736328125, "loss_aux_layer_20": 0.1202392578125, "loss_aux_layer_21": 0.1279296875, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.05133056640625, "loss_aux_layer_4": 0.05364990234375, "loss_aux_layer_5": 0.05511474609375, "loss_aux_layer_6": 0.05792236328125, "loss_aux_layer_7": 0.05609130859375, "loss_aux_layer_8": 0.05548095703125, "loss_aux_layer_9": 0.05438232421875, "step": 4389, "total_loss": 0.6350766271352768 }, { "epoch": 0.8691348247871709, "grad_norm": 0.9497517943382263, "learning_rate": 5e-05, "llm_loss": 0.5031740963459015, "loss": 2.3376, "loss_aux_layer_0": 0.0113677978515625, "loss_aux_layer_1": 0.03009033203125, "loss_aux_layer_10": 0.057861328125, "loss_aux_layer_11": 0.0616455078125, "loss_aux_layer_12": 0.066162109375, "loss_aux_layer_13": 0.0718994140625, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.089111328125, "loss_aux_layer_16": 0.098388671875, "loss_aux_layer_17": 0.10546875, "loss_aux_layer_18": 0.1142578125, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.04248046875, "loss_aux_layer_20": 0.1256103515625, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.0523681640625, "loss_aux_layer_4": 0.054931640625, "loss_aux_layer_5": 0.056640625, "loss_aux_layer_6": 0.05975341796875, "loss_aux_layer_7": 0.0579833984375, "loss_aux_layer_8": 0.0576171875, "loss_aux_layer_9": 0.0565185546875, "step": 4390, "total_loss": 0.5844013094902039 }, { "epoch": 0.8693328053850723, "grad_norm": 1.0558069944381714, "learning_rate": 5e-05, "llm_loss": 0.49221737682819366, "loss": 2.2916, "loss_aux_layer_0": 0.0128173828125, "loss_aux_layer_1": 0.031524658203125, "loss_aux_layer_10": 0.0582275390625, "loss_aux_layer_11": 0.06231689453125, "loss_aux_layer_12": 0.066650390625, "loss_aux_layer_13": 0.0718994140625, "loss_aux_layer_14": 0.079833984375, "loss_aux_layer_15": 0.0880126953125, "loss_aux_layer_16": 0.0968017578125, "loss_aux_layer_17": 0.1038818359375, "loss_aux_layer_18": 0.111572265625, "loss_aux_layer_19": 0.1143798828125, "loss_aux_layer_2": 0.0445556640625, "loss_aux_layer_20": 0.1214599609375, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.054443359375, "loss_aux_layer_4": 0.056884765625, "loss_aux_layer_5": 0.05828857421875, "loss_aux_layer_6": 0.0609130859375, "loss_aux_layer_7": 0.05902099609375, "loss_aux_layer_8": 0.05828857421875, "loss_aux_layer_9": 0.0570068359375, "step": 4391, "total_loss": 0.5728968530893326 }, { "epoch": 0.8695307859829736, "grad_norm": 1.1521159410476685, "learning_rate": 5e-05, "llm_loss": 0.5763331651687622, "loss": 2.6223, "loss_aux_layer_0": 0.011932373046875, "loss_aux_layer_1": 0.029388427734375, "loss_aux_layer_10": 0.05584716796875, "loss_aux_layer_11": 0.05975341796875, "loss_aux_layer_12": 0.064208984375, "loss_aux_layer_13": 0.0697021484375, "loss_aux_layer_14": 0.078369140625, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.0958251953125, "loss_aux_layer_17": 0.103515625, "loss_aux_layer_18": 0.1116943359375, "loss_aux_layer_19": 0.115234375, "loss_aux_layer_2": 0.0408935546875, "loss_aux_layer_20": 0.1229248046875, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.050537109375, "loss_aux_layer_4": 0.0533447265625, "loss_aux_layer_5": 0.0550537109375, "loss_aux_layer_6": 0.05816650390625, "loss_aux_layer_7": 0.05633544921875, "loss_aux_layer_8": 0.05572509765625, "loss_aux_layer_9": 0.054443359375, "step": 4392, "total_loss": 0.655577763915062 }, { "epoch": 0.8697287665808751, "grad_norm": 1.2041041851043701, "learning_rate": 5e-05, "llm_loss": 0.5719012841582298, "loss": 2.6, "loss_aux_layer_0": 0.0138397216796875, "loss_aux_layer_1": 0.0283203125, "loss_aux_layer_10": 0.05413818359375, "loss_aux_layer_11": 0.05767822265625, "loss_aux_layer_12": 0.06201171875, "loss_aux_layer_13": 0.0673828125, "loss_aux_layer_14": 0.0760498046875, "loss_aux_layer_15": 0.084716796875, "loss_aux_layer_16": 0.093994140625, "loss_aux_layer_17": 0.1019287109375, "loss_aux_layer_18": 0.1104736328125, "loss_aux_layer_19": 0.1151123046875, "loss_aux_layer_2": 0.04010009765625, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.05120849609375, "loss_aux_layer_5": 0.05267333984375, "loss_aux_layer_6": 0.05572509765625, "loss_aux_layer_7": 0.05401611328125, "loss_aux_layer_8": 0.05377197265625, "loss_aux_layer_9": 0.05291748046875, "step": 4393, "total_loss": 0.6499877423048019 }, { "epoch": 0.8699267471787765, "grad_norm": 0.8668252825737, "learning_rate": 5e-05, "llm_loss": 0.5908565521240234, "loss": 2.6741, "loss_aux_layer_0": 0.011383056640625, "loss_aux_layer_1": 0.029388427734375, "loss_aux_layer_10": 0.05426025390625, "loss_aux_layer_11": 0.05810546875, "loss_aux_layer_12": 0.06201171875, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.0928955078125, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.109130859375, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.040283203125, "loss_aux_layer_20": 0.121826171875, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.0494384765625, "loss_aux_layer_4": 0.0521240234375, "loss_aux_layer_5": 0.05364990234375, "loss_aux_layer_6": 0.05670166015625, "loss_aux_layer_7": 0.05487060546875, "loss_aux_layer_8": 0.05426025390625, "loss_aux_layer_9": 0.052978515625, "step": 4394, "total_loss": 0.6685202270746231 }, { "epoch": 0.8701247277766779, "grad_norm": 1.0177772045135498, "learning_rate": 5e-05, "llm_loss": 0.5790912210941315, "loss": 2.6304, "loss_aux_layer_0": 0.0137939453125, "loss_aux_layer_1": 0.0308837890625, "loss_aux_layer_10": 0.05645751953125, "loss_aux_layer_11": 0.0601806640625, "loss_aux_layer_12": 0.06396484375, "loss_aux_layer_13": 0.06903076171875, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.0843505859375, "loss_aux_layer_16": 0.0931396484375, "loss_aux_layer_17": 0.1007080078125, "loss_aux_layer_18": 0.10888671875, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.04302978515625, "loss_aux_layer_20": 0.119140625, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.05206298828125, "loss_aux_layer_4": 0.05462646484375, "loss_aux_layer_5": 0.0560302734375, "loss_aux_layer_6": 0.05859375, "loss_aux_layer_7": 0.0570068359375, "loss_aux_layer_8": 0.05657958984375, "loss_aux_layer_9": 0.05523681640625, "step": 4395, "total_loss": 0.6575991809368134 }, { "epoch": 0.8703227083745793, "grad_norm": 1.2481085062026978, "learning_rate": 5e-05, "llm_loss": 0.5588388592004776, "loss": 2.558, "loss_aux_layer_0": 0.0124664306640625, "loss_aux_layer_1": 0.0306396484375, "loss_aux_layer_10": 0.05682373046875, "loss_aux_layer_11": 0.0609130859375, "loss_aux_layer_12": 0.065185546875, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.079345703125, "loss_aux_layer_15": 0.0880126953125, "loss_aux_layer_16": 0.097412109375, "loss_aux_layer_17": 0.10498046875, "loss_aux_layer_18": 0.11279296875, "loss_aux_layer_19": 0.1163330078125, "loss_aux_layer_2": 0.04315185546875, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.1322021484375, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05291748046875, "loss_aux_layer_4": 0.055419921875, "loss_aux_layer_5": 0.05712890625, "loss_aux_layer_6": 0.0596923828125, "loss_aux_layer_7": 0.057861328125, "loss_aux_layer_8": 0.05718994140625, "loss_aux_layer_9": 0.0556640625, "step": 4396, "total_loss": 0.6394901722669601 }, { "epoch": 0.8705206889724807, "grad_norm": 1.063705325126648, "learning_rate": 5e-05, "llm_loss": 0.6332836598157883, "loss": 2.847, "loss_aux_layer_0": 0.0144805908203125, "loss_aux_layer_1": 0.029693603515625, "loss_aux_layer_10": 0.05523681640625, "loss_aux_layer_11": 0.0592041015625, "loss_aux_layer_12": 0.063720703125, "loss_aux_layer_13": 0.06903076171875, "loss_aux_layer_14": 0.076904296875, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.09423828125, "loss_aux_layer_17": 0.102294921875, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.0406494140625, "loss_aux_layer_20": 0.122802734375, "loss_aux_layer_21": 0.1304931640625, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.04974365234375, "loss_aux_layer_4": 0.05224609375, "loss_aux_layer_5": 0.05377197265625, "loss_aux_layer_6": 0.05657958984375, "loss_aux_layer_7": 0.05474853515625, "loss_aux_layer_8": 0.05462646484375, "loss_aux_layer_9": 0.05377197265625, "step": 4397, "total_loss": 0.7117487341165543 }, { "epoch": 0.8707186695703821, "grad_norm": 1.140956163406372, "learning_rate": 5e-05, "llm_loss": 0.566708393394947, "loss": 2.5784, "loss_aux_layer_0": 0.0124664306640625, "loss_aux_layer_1": 0.029388427734375, "loss_aux_layer_10": 0.05426025390625, "loss_aux_layer_11": 0.05804443359375, "loss_aux_layer_12": 0.0625, "loss_aux_layer_13": 0.067626953125, "loss_aux_layer_14": 0.075927734375, "loss_aux_layer_15": 0.08447265625, "loss_aux_layer_16": 0.09375, "loss_aux_layer_17": 0.1014404296875, "loss_aux_layer_18": 0.109375, "loss_aux_layer_19": 0.113525390625, "loss_aux_layer_2": 0.04119873046875, "loss_aux_layer_20": 0.1217041015625, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.05023193359375, "loss_aux_layer_4": 0.0523681640625, "loss_aux_layer_5": 0.0537109375, "loss_aux_layer_6": 0.05645751953125, "loss_aux_layer_7": 0.05474853515625, "loss_aux_layer_8": 0.05419921875, "loss_aux_layer_9": 0.0531005859375, "step": 4398, "total_loss": 0.6446115076541901 }, { "epoch": 0.8709166501682835, "grad_norm": 1.0101823806762695, "learning_rate": 5e-05, "llm_loss": 0.5357519686222076, "loss": 2.4521, "loss_aux_layer_0": 0.01177978515625, "loss_aux_layer_1": 0.028839111328125, "loss_aux_layer_10": 0.0546875, "loss_aux_layer_11": 0.05841064453125, "loss_aux_layer_12": 0.0625, "loss_aux_layer_13": 0.0677490234375, "loss_aux_layer_14": 0.076171875, "loss_aux_layer_15": 0.08447265625, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.1004638671875, "loss_aux_layer_18": 0.1085205078125, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.0400390625, "loss_aux_layer_20": 0.1197509765625, "loss_aux_layer_21": 0.1279296875, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.0496826171875, "loss_aux_layer_4": 0.05206298828125, "loss_aux_layer_5": 0.0535888671875, "loss_aux_layer_6": 0.056640625, "loss_aux_layer_7": 0.0548095703125, "loss_aux_layer_8": 0.05438232421875, "loss_aux_layer_9": 0.053466796875, "step": 4399, "total_loss": 0.6130198389291763 }, { "epoch": 0.8711146307661849, "grad_norm": 1.102925419807434, "learning_rate": 5e-05, "llm_loss": 0.5469721853733063, "loss": 2.4841, "loss_aux_layer_0": 0.0131988525390625, "loss_aux_layer_1": 0.02740478515625, "loss_aux_layer_10": 0.0504150390625, "loss_aux_layer_11": 0.05364990234375, "loss_aux_layer_12": 0.05816650390625, "loss_aux_layer_13": 0.06329345703125, "loss_aux_layer_14": 0.0711669921875, "loss_aux_layer_15": 0.079345703125, "loss_aux_layer_16": 0.0888671875, "loss_aux_layer_17": 0.0970458984375, "loss_aux_layer_18": 0.10546875, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.03778076171875, "loss_aux_layer_20": 0.1180419921875, "loss_aux_layer_21": 0.1263427734375, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.04644775390625, "loss_aux_layer_4": 0.0484619140625, "loss_aux_layer_5": 0.05010986328125, "loss_aux_layer_6": 0.05255126953125, "loss_aux_layer_7": 0.050537109375, "loss_aux_layer_8": 0.05023193359375, "loss_aux_layer_9": 0.0491943359375, "step": 4400, "total_loss": 0.6210329383611679 }, { "epoch": 0.8713126113640863, "grad_norm": 0.784927487373352, "learning_rate": 5e-05, "llm_loss": 0.5643448531627655, "loss": 2.5749, "loss_aux_layer_0": 0.010528564453125, "loss_aux_layer_1": 0.030548095703125, "loss_aux_layer_10": 0.05694580078125, "loss_aux_layer_11": 0.0609130859375, "loss_aux_layer_12": 0.06536865234375, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.0958251953125, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.1112060546875, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.04302978515625, "loss_aux_layer_20": 0.121826171875, "loss_aux_layer_21": 0.1297607421875, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.05230712890625, "loss_aux_layer_4": 0.0545654296875, "loss_aux_layer_5": 0.05615234375, "loss_aux_layer_6": 0.0592041015625, "loss_aux_layer_7": 0.05718994140625, "loss_aux_layer_8": 0.05670166015625, "loss_aux_layer_9": 0.05572509765625, "step": 4401, "total_loss": 0.6437125205993652 }, { "epoch": 0.8715105919619878, "grad_norm": 1.156114101409912, "learning_rate": 5e-05, "llm_loss": 0.5929225236177444, "loss": 2.6896, "loss_aux_layer_0": 0.011505126953125, "loss_aux_layer_1": 0.031158447265625, "loss_aux_layer_10": 0.057373046875, "loss_aux_layer_11": 0.061279296875, "loss_aux_layer_12": 0.0654296875, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.085693359375, "loss_aux_layer_16": 0.09423828125, "loss_aux_layer_17": 0.1015625, "loss_aux_layer_18": 0.109375, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.0439453125, "loss_aux_layer_20": 0.1202392578125, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.05413818359375, "loss_aux_layer_4": 0.05645751953125, "loss_aux_layer_5": 0.05767822265625, "loss_aux_layer_6": 0.0604248046875, "loss_aux_layer_7": 0.05853271484375, "loss_aux_layer_8": 0.0576171875, "loss_aux_layer_9": 0.05615234375, "step": 4402, "total_loss": 0.6723925173282623 }, { "epoch": 0.8717085725598891, "grad_norm": 1.0066641569137573, "learning_rate": 5e-05, "llm_loss": 0.5278094559907913, "loss": 2.4276, "loss_aux_layer_0": 0.01116943359375, "loss_aux_layer_1": 0.029693603515625, "loss_aux_layer_10": 0.0555419921875, "loss_aux_layer_11": 0.0594482421875, "loss_aux_layer_12": 0.063720703125, "loss_aux_layer_13": 0.069091796875, "loss_aux_layer_14": 0.077392578125, "loss_aux_layer_15": 0.0860595703125, "loss_aux_layer_16": 0.095458984375, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.11181640625, "loss_aux_layer_19": 0.1151123046875, "loss_aux_layer_2": 0.042236328125, "loss_aux_layer_20": 0.122802734375, "loss_aux_layer_21": 0.1304931640625, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.0518798828125, "loss_aux_layer_4": 0.0540771484375, "loss_aux_layer_5": 0.05548095703125, "loss_aux_layer_6": 0.05828857421875, "loss_aux_layer_7": 0.05615234375, "loss_aux_layer_8": 0.05572509765625, "loss_aux_layer_9": 0.05450439453125, "step": 4403, "total_loss": 0.6068956777453423 }, { "epoch": 0.8719065531577905, "grad_norm": 1.1524525880813599, "learning_rate": 5e-05, "llm_loss": 0.5535928457975388, "loss": 2.5244, "loss_aux_layer_0": 0.01153564453125, "loss_aux_layer_1": 0.029296875, "loss_aux_layer_10": 0.0548095703125, "loss_aux_layer_11": 0.05859375, "loss_aux_layer_12": 0.06256103515625, "loss_aux_layer_13": 0.0675048828125, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.08349609375, "loss_aux_layer_16": 0.0926513671875, "loss_aux_layer_17": 0.1002197265625, "loss_aux_layer_18": 0.1085205078125, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.041259765625, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.05035400390625, "loss_aux_layer_4": 0.0528564453125, "loss_aux_layer_5": 0.05438232421875, "loss_aux_layer_6": 0.05706787109375, "loss_aux_layer_7": 0.05523681640625, "loss_aux_layer_8": 0.0546875, "loss_aux_layer_9": 0.053466796875, "step": 4404, "total_loss": 0.6310969144105911 }, { "epoch": 0.872104533755692, "grad_norm": 0.9244378805160522, "learning_rate": 5e-05, "llm_loss": 0.53956738114357, "loss": 2.4818, "loss_aux_layer_0": 0.0115814208984375, "loss_aux_layer_1": 0.0303955078125, "loss_aux_layer_10": 0.057861328125, "loss_aux_layer_11": 0.06207275390625, "loss_aux_layer_12": 0.06646728515625, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.080078125, "loss_aux_layer_15": 0.0880126953125, "loss_aux_layer_16": 0.09716796875, "loss_aux_layer_17": 0.10498046875, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.1168212890625, "loss_aux_layer_2": 0.04254150390625, "loss_aux_layer_20": 0.1243896484375, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.0526123046875, "loss_aux_layer_4": 0.05523681640625, "loss_aux_layer_5": 0.05706787109375, "loss_aux_layer_6": 0.06024169921875, "loss_aux_layer_7": 0.05816650390625, "loss_aux_layer_8": 0.05780029296875, "loss_aux_layer_9": 0.05670166015625, "step": 4405, "total_loss": 0.6204472333192825 }, { "epoch": 0.8723025143535933, "grad_norm": 1.0723739862442017, "learning_rate": 5e-05, "llm_loss": 0.6013590767979622, "loss": 2.7192, "loss_aux_layer_0": 0.01226806640625, "loss_aux_layer_1": 0.02996826171875, "loss_aux_layer_10": 0.05535888671875, "loss_aux_layer_11": 0.05908203125, "loss_aux_layer_12": 0.06304931640625, "loss_aux_layer_13": 0.0682373046875, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.09375, "loss_aux_layer_17": 0.1019287109375, "loss_aux_layer_18": 0.1103515625, "loss_aux_layer_19": 0.114013671875, "loss_aux_layer_2": 0.0413818359375, "loss_aux_layer_20": 0.12255859375, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.05133056640625, "loss_aux_layer_4": 0.05389404296875, "loss_aux_layer_5": 0.0555419921875, "loss_aux_layer_6": 0.058837890625, "loss_aux_layer_7": 0.0565185546875, "loss_aux_layer_8": 0.05572509765625, "loss_aux_layer_9": 0.05438232421875, "step": 4406, "total_loss": 0.6797989755868912 }, { "epoch": 0.8725004949514947, "grad_norm": 1.0327491760253906, "learning_rate": 5e-05, "llm_loss": 0.48301659524440765, "loss": 2.2479, "loss_aux_layer_0": 0.0115814208984375, "loss_aux_layer_1": 0.030517578125, "loss_aux_layer_10": 0.05609130859375, "loss_aux_layer_11": 0.0601806640625, "loss_aux_layer_12": 0.064697265625, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.0863037109375, "loss_aux_layer_16": 0.095703125, "loss_aux_layer_17": 0.10302734375, "loss_aux_layer_18": 0.110595703125, "loss_aux_layer_19": 0.1136474609375, "loss_aux_layer_2": 0.04229736328125, "loss_aux_layer_20": 0.12158203125, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.05157470703125, "loss_aux_layer_4": 0.05401611328125, "loss_aux_layer_5": 0.05548095703125, "loss_aux_layer_6": 0.0582275390625, "loss_aux_layer_7": 0.0562744140625, "loss_aux_layer_8": 0.055908203125, "loss_aux_layer_9": 0.05487060546875, "step": 4407, "total_loss": 0.5619860887527466 }, { "epoch": 0.8726984755493962, "grad_norm": 1.226775884628296, "learning_rate": 5e-05, "llm_loss": 0.5585266053676605, "loss": 2.5532, "loss_aux_layer_0": 0.0121917724609375, "loss_aux_layer_1": 0.03021240234375, "loss_aux_layer_10": 0.0565185546875, "loss_aux_layer_11": 0.06024169921875, "loss_aux_layer_12": 0.06463623046875, "loss_aux_layer_13": 0.0704345703125, "loss_aux_layer_14": 0.078369140625, "loss_aux_layer_15": 0.0863037109375, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.102783203125, "loss_aux_layer_18": 0.1114501953125, "loss_aux_layer_19": 0.1153564453125, "loss_aux_layer_2": 0.04254150390625, "loss_aux_layer_20": 0.1234130859375, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.05230712890625, "loss_aux_layer_4": 0.0546875, "loss_aux_layer_5": 0.05615234375, "loss_aux_layer_6": 0.0587158203125, "loss_aux_layer_7": 0.0570068359375, "loss_aux_layer_8": 0.0562744140625, "loss_aux_layer_9": 0.05523681640625, "step": 4408, "total_loss": 0.6383040621876717 }, { "epoch": 0.8728964561472976, "grad_norm": 0.9054981470108032, "learning_rate": 5e-05, "llm_loss": 0.5490187853574753, "loss": 2.5038, "loss_aux_layer_0": 0.0115509033203125, "loss_aux_layer_1": 0.0286865234375, "loss_aux_layer_10": 0.05389404296875, "loss_aux_layer_11": 0.05767822265625, "loss_aux_layer_12": 0.06201171875, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.07568359375, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.092041015625, "loss_aux_layer_17": 0.1002197265625, "loss_aux_layer_18": 0.1085205078125, "loss_aux_layer_19": 0.1114501953125, "loss_aux_layer_2": 0.0408935546875, "loss_aux_layer_20": 0.1192626953125, "loss_aux_layer_21": 0.1275634765625, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.04974365234375, "loss_aux_layer_4": 0.05206298828125, "loss_aux_layer_5": 0.05364990234375, "loss_aux_layer_6": 0.056396484375, "loss_aux_layer_7": 0.05419921875, "loss_aux_layer_8": 0.0537109375, "loss_aux_layer_9": 0.05267333984375, "step": 4409, "total_loss": 0.6259468495845795 }, { "epoch": 0.8730944367451989, "grad_norm": 1.1131726503372192, "learning_rate": 5e-05, "llm_loss": 0.5927145779132843, "loss": 2.6974, "loss_aux_layer_0": 0.0124359130859375, "loss_aux_layer_1": 0.031585693359375, "loss_aux_layer_10": 0.0594482421875, "loss_aux_layer_11": 0.06353759765625, "loss_aux_layer_12": 0.06787109375, "loss_aux_layer_13": 0.07275390625, "loss_aux_layer_14": 0.0810546875, "loss_aux_layer_15": 0.08935546875, "loss_aux_layer_16": 0.098388671875, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.11328125, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.04437255859375, "loss_aux_layer_20": 0.1234130859375, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.05450439453125, "loss_aux_layer_4": 0.05712890625, "loss_aux_layer_5": 0.05859375, "loss_aux_layer_6": 0.06158447265625, "loss_aux_layer_7": 0.05950927734375, "loss_aux_layer_8": 0.0589599609375, "loss_aux_layer_9": 0.0579833984375, "step": 4410, "total_loss": 0.6743420362472534 }, { "epoch": 0.8732924173431004, "grad_norm": 0.906010091304779, "learning_rate": 5e-05, "llm_loss": 0.6295564919710159, "loss": 2.8302, "loss_aux_layer_0": 0.011505126953125, "loss_aux_layer_1": 0.029449462890625, "loss_aux_layer_10": 0.0545654296875, "loss_aux_layer_11": 0.05804443359375, "loss_aux_layer_12": 0.062255859375, "loss_aux_layer_13": 0.0672607421875, "loss_aux_layer_14": 0.0755615234375, "loss_aux_layer_15": 0.0836181640625, "loss_aux_layer_16": 0.0928955078125, "loss_aux_layer_17": 0.101318359375, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.114013671875, "loss_aux_layer_2": 0.04058837890625, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.04998779296875, "loss_aux_layer_4": 0.05267333984375, "loss_aux_layer_5": 0.05426025390625, "loss_aux_layer_6": 0.05706787109375, "loss_aux_layer_7": 0.05517578125, "loss_aux_layer_8": 0.0545654296875, "loss_aux_layer_9": 0.05328369140625, "step": 4411, "total_loss": 0.7075439244508743 }, { "epoch": 0.8734903979410018, "grad_norm": 0.9257340431213379, "learning_rate": 5e-05, "llm_loss": 0.5344847291707993, "loss": 2.4371, "loss_aux_layer_0": 0.012237548828125, "loss_aux_layer_1": 0.02783203125, "loss_aux_layer_10": 0.051513671875, "loss_aux_layer_11": 0.05499267578125, "loss_aux_layer_12": 0.0592041015625, "loss_aux_layer_13": 0.06402587890625, "loss_aux_layer_14": 0.072265625, "loss_aux_layer_15": 0.08056640625, "loss_aux_layer_16": 0.08984375, "loss_aux_layer_17": 0.097900390625, "loss_aux_layer_18": 0.1065673828125, "loss_aux_layer_19": 0.1103515625, "loss_aux_layer_2": 0.038818359375, "loss_aux_layer_20": 0.118408203125, "loss_aux_layer_21": 0.12646484375, "loss_aux_layer_22": 0.14599609375, "loss_aux_layer_23": 0.181884765625, "loss_aux_layer_3": 0.0477294921875, "loss_aux_layer_4": 0.0498046875, "loss_aux_layer_5": 0.05126953125, "loss_aux_layer_6": 0.0537109375, "loss_aux_layer_7": 0.052001953125, "loss_aux_layer_8": 0.05157470703125, "loss_aux_layer_9": 0.05047607421875, "step": 4412, "total_loss": 0.6092767119407654 }, { "epoch": 0.8736883785389031, "grad_norm": 1.035394549369812, "learning_rate": 5e-05, "llm_loss": 0.608802855014801, "loss": 2.7516, "loss_aux_layer_0": 0.0110931396484375, "loss_aux_layer_1": 0.028564453125, "loss_aux_layer_10": 0.05596923828125, "loss_aux_layer_11": 0.05975341796875, "loss_aux_layer_12": 0.0638427734375, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.07763671875, "loss_aux_layer_15": 0.0859375, "loss_aux_layer_16": 0.09521484375, "loss_aux_layer_17": 0.103515625, "loss_aux_layer_18": 0.111572265625, "loss_aux_layer_19": 0.1153564453125, "loss_aux_layer_2": 0.0401611328125, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.190673828125, "loss_aux_layer_3": 0.0496826171875, "loss_aux_layer_4": 0.05242919921875, "loss_aux_layer_5": 0.05419921875, "loss_aux_layer_6": 0.057373046875, "loss_aux_layer_7": 0.05584716796875, "loss_aux_layer_8": 0.05548095703125, "loss_aux_layer_9": 0.05462646484375, "step": 4413, "total_loss": 0.6879065185785294 }, { "epoch": 0.8738863591368046, "grad_norm": 0.9082397818565369, "learning_rate": 5e-05, "llm_loss": 0.4959585815668106, "loss": 2.2983, "loss_aux_layer_0": 0.011688232421875, "loss_aux_layer_1": 0.029632568359375, "loss_aux_layer_10": 0.05596923828125, "loss_aux_layer_11": 0.05975341796875, "loss_aux_layer_12": 0.064208984375, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.0772705078125, "loss_aux_layer_15": 0.0853271484375, "loss_aux_layer_16": 0.0943603515625, "loss_aux_layer_17": 0.1016845703125, "loss_aux_layer_18": 0.10986328125, "loss_aux_layer_19": 0.1131591796875, "loss_aux_layer_2": 0.04144287109375, "loss_aux_layer_20": 0.1207275390625, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.05096435546875, "loss_aux_layer_4": 0.053466796875, "loss_aux_layer_5": 0.05511474609375, "loss_aux_layer_6": 0.05792236328125, "loss_aux_layer_7": 0.05609130859375, "loss_aux_layer_8": 0.0557861328125, "loss_aux_layer_9": 0.05487060546875, "step": 4414, "total_loss": 0.574586495757103 }, { "epoch": 0.874084339734706, "grad_norm": 1.1541725397109985, "learning_rate": 5e-05, "llm_loss": 0.60167396068573, "loss": 2.7095, "loss_aux_layer_0": 0.01171875, "loss_aux_layer_1": 0.027679443359375, "loss_aux_layer_10": 0.05364990234375, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.06109619140625, "loss_aux_layer_13": 0.06622314453125, "loss_aux_layer_14": 0.074462890625, "loss_aux_layer_15": 0.08203125, "loss_aux_layer_16": 0.0911865234375, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.107177734375, "loss_aux_layer_19": 0.11083984375, "loss_aux_layer_2": 0.03839111328125, "loss_aux_layer_20": 0.118896484375, "loss_aux_layer_21": 0.126708984375, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.1806640625, "loss_aux_layer_3": 0.04754638671875, "loss_aux_layer_4": 0.050048828125, "loss_aux_layer_5": 0.0518798828125, "loss_aux_layer_6": 0.05517578125, "loss_aux_layer_7": 0.05364990234375, "loss_aux_layer_8": 0.0533447265625, "loss_aux_layer_9": 0.05230712890625, "step": 4415, "total_loss": 0.6773755848407745 }, { "epoch": 0.8742823203326074, "grad_norm": 1.0571188926696777, "learning_rate": 5e-05, "llm_loss": 0.5782201588153839, "loss": 2.6222, "loss_aux_layer_0": 0.0121307373046875, "loss_aux_layer_1": 0.02783203125, "loss_aux_layer_10": 0.0540771484375, "loss_aux_layer_11": 0.05780029296875, "loss_aux_layer_12": 0.0621337890625, "loss_aux_layer_13": 0.067626953125, "loss_aux_layer_14": 0.0758056640625, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.101318359375, "loss_aux_layer_18": 0.10986328125, "loss_aux_layer_19": 0.1134033203125, "loss_aux_layer_2": 0.039306640625, "loss_aux_layer_20": 0.1214599609375, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.04852294921875, "loss_aux_layer_4": 0.05096435546875, "loss_aux_layer_5": 0.0526123046875, "loss_aux_layer_6": 0.0556640625, "loss_aux_layer_7": 0.053955078125, "loss_aux_layer_8": 0.0538330078125, "loss_aux_layer_9": 0.05279541015625, "step": 4416, "total_loss": 0.6555543839931488 }, { "epoch": 0.8744803009305088, "grad_norm": 0.8563846349716187, "learning_rate": 5e-05, "llm_loss": 0.5297669470310211, "loss": 2.4255, "loss_aux_layer_0": 0.011566162109375, "loss_aux_layer_1": 0.029327392578125, "loss_aux_layer_10": 0.05377197265625, "loss_aux_layer_11": 0.05743408203125, "loss_aux_layer_12": 0.0616455078125, "loss_aux_layer_13": 0.06671142578125, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.0830078125, "loss_aux_layer_16": 0.092041015625, "loss_aux_layer_17": 0.0994873046875, "loss_aux_layer_18": 0.1075439453125, "loss_aux_layer_19": 0.1104736328125, "loss_aux_layer_2": 0.04083251953125, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.1265869140625, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.0496826171875, "loss_aux_layer_4": 0.05218505859375, "loss_aux_layer_5": 0.053466796875, "loss_aux_layer_6": 0.05615234375, "loss_aux_layer_7": 0.054443359375, "loss_aux_layer_8": 0.05401611328125, "loss_aux_layer_9": 0.0526123046875, "step": 4417, "total_loss": 0.6063636839389801 }, { "epoch": 0.8746782815284102, "grad_norm": 0.989173412322998, "learning_rate": 5e-05, "llm_loss": 0.5921008363366127, "loss": 2.6762, "loss_aux_layer_0": 0.0119171142578125, "loss_aux_layer_1": 0.02838134765625, "loss_aux_layer_10": 0.05462646484375, "loss_aux_layer_11": 0.05828857421875, "loss_aux_layer_12": 0.0623779296875, "loss_aux_layer_13": 0.0675048828125, "loss_aux_layer_14": 0.075439453125, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.0926513671875, "loss_aux_layer_17": 0.1005859375, "loss_aux_layer_18": 0.1077880859375, "loss_aux_layer_19": 0.111328125, "loss_aux_layer_2": 0.04010009765625, "loss_aux_layer_20": 0.1187744140625, "loss_aux_layer_21": 0.1270751953125, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.049072265625, "loss_aux_layer_4": 0.05157470703125, "loss_aux_layer_5": 0.05328369140625, "loss_aux_layer_6": 0.05609130859375, "loss_aux_layer_7": 0.0545654296875, "loss_aux_layer_8": 0.05426025390625, "loss_aux_layer_9": 0.0531005859375, "step": 4418, "total_loss": 0.6690496355295181 }, { "epoch": 0.8748762621263116, "grad_norm": 0.8168864846229553, "learning_rate": 5e-05, "llm_loss": 0.6220852807164192, "loss": 2.793, "loss_aux_layer_0": 0.0119476318359375, "loss_aux_layer_1": 0.029266357421875, "loss_aux_layer_10": 0.05426025390625, "loss_aux_layer_11": 0.0579833984375, "loss_aux_layer_12": 0.06201171875, "loss_aux_layer_13": 0.06689453125, "loss_aux_layer_14": 0.0745849609375, "loss_aux_layer_15": 0.0821533203125, "loss_aux_layer_16": 0.0911865234375, "loss_aux_layer_17": 0.0987548828125, "loss_aux_layer_18": 0.106689453125, "loss_aux_layer_19": 0.1094970703125, "loss_aux_layer_2": 0.0404052734375, "loss_aux_layer_20": 0.117431640625, "loss_aux_layer_21": 0.1253662109375, "loss_aux_layer_22": 0.1455078125, "loss_aux_layer_23": 0.1806640625, "loss_aux_layer_3": 0.049560546875, "loss_aux_layer_4": 0.052001953125, "loss_aux_layer_5": 0.05364990234375, "loss_aux_layer_6": 0.05645751953125, "loss_aux_layer_7": 0.0546875, "loss_aux_layer_8": 0.05413818359375, "loss_aux_layer_9": 0.05303955078125, "step": 4419, "total_loss": 0.6982515752315521 }, { "epoch": 0.8750742427242131, "grad_norm": 0.8571701049804688, "learning_rate": 5e-05, "llm_loss": 0.53483597189188, "loss": 2.442, "loss_aux_layer_0": 0.0105743408203125, "loss_aux_layer_1": 0.0284423828125, "loss_aux_layer_10": 0.053466796875, "loss_aux_layer_11": 0.05706787109375, "loss_aux_layer_12": 0.061279296875, "loss_aux_layer_13": 0.066162109375, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.0819091796875, "loss_aux_layer_16": 0.0906982421875, "loss_aux_layer_17": 0.0989990234375, "loss_aux_layer_18": 0.106689453125, "loss_aux_layer_19": 0.1099853515625, "loss_aux_layer_2": 0.03924560546875, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.1263427734375, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.181884765625, "loss_aux_layer_3": 0.0484619140625, "loss_aux_layer_4": 0.05096435546875, "loss_aux_layer_5": 0.05255126953125, "loss_aux_layer_6": 0.05535888671875, "loss_aux_layer_7": 0.053466796875, "loss_aux_layer_8": 0.0528564453125, "loss_aux_layer_9": 0.05194091796875, "step": 4420, "total_loss": 0.6104920506477356 }, { "epoch": 0.8752722233221144, "grad_norm": 0.9258110523223877, "learning_rate": 5e-05, "llm_loss": 0.6038585305213928, "loss": 2.7343, "loss_aux_layer_0": 0.0123443603515625, "loss_aux_layer_1": 0.0296630859375, "loss_aux_layer_10": 0.05767822265625, "loss_aux_layer_11": 0.061279296875, "loss_aux_layer_12": 0.0657958984375, "loss_aux_layer_13": 0.0712890625, "loss_aux_layer_14": 0.07958984375, "loss_aux_layer_15": 0.087890625, "loss_aux_layer_16": 0.0966796875, "loss_aux_layer_17": 0.1044921875, "loss_aux_layer_18": 0.1119384765625, "loss_aux_layer_19": 0.11474609375, "loss_aux_layer_2": 0.04205322265625, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.0516357421875, "loss_aux_layer_4": 0.05413818359375, "loss_aux_layer_5": 0.0557861328125, "loss_aux_layer_6": 0.05889892578125, "loss_aux_layer_7": 0.057373046875, "loss_aux_layer_8": 0.05718994140625, "loss_aux_layer_9": 0.05615234375, "step": 4421, "total_loss": 0.6835673600435257 }, { "epoch": 0.8754702039200158, "grad_norm": 0.8375924229621887, "learning_rate": 5e-05, "llm_loss": 0.5819154158234596, "loss": 2.646, "loss_aux_layer_0": 0.0115814208984375, "loss_aux_layer_1": 0.02984619140625, "loss_aux_layer_10": 0.05572509765625, "loss_aux_layer_11": 0.05963134765625, "loss_aux_layer_12": 0.0640869140625, "loss_aux_layer_13": 0.069580078125, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.0963134765625, "loss_aux_layer_17": 0.1041259765625, "loss_aux_layer_18": 0.113037109375, "loss_aux_layer_19": 0.116455078125, "loss_aux_layer_2": 0.0408935546875, "loss_aux_layer_20": 0.124755859375, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.05035400390625, "loss_aux_layer_4": 0.05267333984375, "loss_aux_layer_5": 0.0543212890625, "loss_aux_layer_6": 0.05712890625, "loss_aux_layer_7": 0.055419921875, "loss_aux_layer_8": 0.05523681640625, "loss_aux_layer_9": 0.0543212890625, "step": 4422, "total_loss": 0.661504864692688 }, { "epoch": 0.8756681845179173, "grad_norm": 0.8398430943489075, "learning_rate": 5e-05, "llm_loss": 0.59091517329216, "loss": 2.6723, "loss_aux_layer_0": 0.0106964111328125, "loss_aux_layer_1": 0.028228759765625, "loss_aux_layer_10": 0.05438232421875, "loss_aux_layer_11": 0.05792236328125, "loss_aux_layer_12": 0.06231689453125, "loss_aux_layer_13": 0.0675048828125, "loss_aux_layer_14": 0.076171875, "loss_aux_layer_15": 0.084228515625, "loss_aux_layer_16": 0.0936279296875, "loss_aux_layer_17": 0.1011962890625, "loss_aux_layer_18": 0.109375, "loss_aux_layer_19": 0.113037109375, "loss_aux_layer_2": 0.03961181640625, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.04901123046875, "loss_aux_layer_4": 0.0516357421875, "loss_aux_layer_5": 0.05316162109375, "loss_aux_layer_6": 0.0560302734375, "loss_aux_layer_7": 0.0543212890625, "loss_aux_layer_8": 0.05419921875, "loss_aux_layer_9": 0.05328369140625, "step": 4423, "total_loss": 0.6680641323328018 }, { "epoch": 0.8758661651158186, "grad_norm": 0.8866263628005981, "learning_rate": 5e-05, "llm_loss": 0.5196429863572121, "loss": 2.4035, "loss_aux_layer_0": 0.011932373046875, "loss_aux_layer_1": 0.03045654296875, "loss_aux_layer_10": 0.05877685546875, "loss_aux_layer_11": 0.0628662109375, "loss_aux_layer_12": 0.06719970703125, "loss_aux_layer_13": 0.0728759765625, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.0888671875, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.10498046875, "loss_aux_layer_18": 0.1131591796875, "loss_aux_layer_19": 0.1160888671875, "loss_aux_layer_2": 0.04296875, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.1317138671875, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.05303955078125, "loss_aux_layer_4": 0.05596923828125, "loss_aux_layer_5": 0.05767822265625, "loss_aux_layer_6": 0.0609130859375, "loss_aux_layer_7": 0.05926513671875, "loss_aux_layer_8": 0.05853271484375, "loss_aux_layer_9": 0.057373046875, "step": 4424, "total_loss": 0.6008825153112411 }, { "epoch": 0.87606414571372, "grad_norm": 0.7561327219009399, "learning_rate": 5e-05, "llm_loss": 0.5279486924409866, "loss": 2.4184, "loss_aux_layer_0": 0.0101776123046875, "loss_aux_layer_1": 0.028350830078125, "loss_aux_layer_10": 0.05462646484375, "loss_aux_layer_11": 0.0582275390625, "loss_aux_layer_12": 0.0626220703125, "loss_aux_layer_13": 0.067626953125, "loss_aux_layer_14": 0.0755615234375, "loss_aux_layer_15": 0.0836181640625, "loss_aux_layer_16": 0.09228515625, "loss_aux_layer_17": 0.099609375, "loss_aux_layer_18": 0.1077880859375, "loss_aux_layer_19": 0.1114501953125, "loss_aux_layer_2": 0.03948974609375, "loss_aux_layer_20": 0.1190185546875, "loss_aux_layer_21": 0.1268310546875, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.04901123046875, "loss_aux_layer_4": 0.05157470703125, "loss_aux_layer_5": 0.05322265625, "loss_aux_layer_6": 0.05596923828125, "loss_aux_layer_7": 0.05474853515625, "loss_aux_layer_8": 0.054443359375, "loss_aux_layer_9": 0.05352783203125, "step": 4425, "total_loss": 0.6046007424592972 }, { "epoch": 0.8762621263116215, "grad_norm": 0.6733981966972351, "learning_rate": 5e-05, "llm_loss": 0.5054643899202347, "loss": 2.328, "loss_aux_layer_0": 0.0110015869140625, "loss_aux_layer_1": 0.029510498046875, "loss_aux_layer_10": 0.05438232421875, "loss_aux_layer_11": 0.0579833984375, "loss_aux_layer_12": 0.0621337890625, "loss_aux_layer_13": 0.0672607421875, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.082763671875, "loss_aux_layer_16": 0.0914306640625, "loss_aux_layer_17": 0.0989990234375, "loss_aux_layer_18": 0.10693359375, "loss_aux_layer_19": 0.1103515625, "loss_aux_layer_2": 0.04052734375, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.126220703125, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.18310546875, "loss_aux_layer_3": 0.04986572265625, "loss_aux_layer_4": 0.05224609375, "loss_aux_layer_5": 0.05364990234375, "loss_aux_layer_6": 0.056396484375, "loss_aux_layer_7": 0.0546875, "loss_aux_layer_8": 0.05426025390625, "loss_aux_layer_9": 0.0531005859375, "step": 4426, "total_loss": 0.5820108205080032 }, { "epoch": 0.8764601069095229, "grad_norm": 0.8293753266334534, "learning_rate": 5e-05, "llm_loss": 0.5915766209363937, "loss": 2.6841, "loss_aux_layer_0": 0.0109100341796875, "loss_aux_layer_1": 0.029937744140625, "loss_aux_layer_10": 0.0565185546875, "loss_aux_layer_11": 0.06048583984375, "loss_aux_layer_12": 0.0650634765625, "loss_aux_layer_13": 0.0709228515625, "loss_aux_layer_14": 0.0791015625, "loss_aux_layer_15": 0.0877685546875, "loss_aux_layer_16": 0.0966796875, "loss_aux_layer_17": 0.1044921875, "loss_aux_layer_18": 0.1116943359375, "loss_aux_layer_19": 0.1151123046875, "loss_aux_layer_2": 0.04180908203125, "loss_aux_layer_20": 0.123046875, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.0516357421875, "loss_aux_layer_4": 0.05419921875, "loss_aux_layer_5": 0.0557861328125, "loss_aux_layer_6": 0.05853271484375, "loss_aux_layer_7": 0.05670166015625, "loss_aux_layer_8": 0.05615234375, "loss_aux_layer_9": 0.05517578125, "step": 4427, "total_loss": 0.6710216850042343 }, { "epoch": 0.8766580875074242, "grad_norm": 0.7319024205207825, "learning_rate": 5e-05, "llm_loss": 0.5407539531588554, "loss": 2.4793, "loss_aux_layer_0": 0.0104217529296875, "loss_aux_layer_1": 0.029754638671875, "loss_aux_layer_10": 0.05584716796875, "loss_aux_layer_11": 0.059814453125, "loss_aux_layer_12": 0.06402587890625, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.077880859375, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.1029052734375, "loss_aux_layer_18": 0.1116943359375, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.0413818359375, "loss_aux_layer_20": 0.122802734375, "loss_aux_layer_21": 0.1307373046875, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.05108642578125, "loss_aux_layer_4": 0.0535888671875, "loss_aux_layer_5": 0.05517578125, "loss_aux_layer_6": 0.05828857421875, "loss_aux_layer_7": 0.05657958984375, "loss_aux_layer_8": 0.055908203125, "loss_aux_layer_9": 0.05462646484375, "step": 4428, "total_loss": 0.6198158487677574 }, { "epoch": 0.8768560681053257, "grad_norm": 0.8369433283805847, "learning_rate": 5e-05, "llm_loss": 0.6261088401079178, "loss": 2.8174, "loss_aux_layer_0": 0.01141357421875, "loss_aux_layer_1": 0.0306396484375, "loss_aux_layer_10": 0.0567626953125, "loss_aux_layer_11": 0.06048583984375, "loss_aux_layer_12": 0.0645751953125, "loss_aux_layer_13": 0.069580078125, "loss_aux_layer_14": 0.0775146484375, "loss_aux_layer_15": 0.0849609375, "loss_aux_layer_16": 0.0936279296875, "loss_aux_layer_17": 0.100830078125, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.04193115234375, "loss_aux_layer_20": 0.1195068359375, "loss_aux_layer_21": 0.1275634765625, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.182373046875, "loss_aux_layer_3": 0.05145263671875, "loss_aux_layer_4": 0.05401611328125, "loss_aux_layer_5": 0.0555419921875, "loss_aux_layer_6": 0.05865478515625, "loss_aux_layer_7": 0.05712890625, "loss_aux_layer_8": 0.05657958984375, "loss_aux_layer_9": 0.0552978515625, "step": 4429, "total_loss": 0.7043609023094177 }, { "epoch": 0.8770540487032271, "grad_norm": 0.6747851371765137, "learning_rate": 5e-05, "llm_loss": 0.5688231959939003, "loss": 2.5985, "loss_aux_layer_0": 0.0101165771484375, "loss_aux_layer_1": 0.031494140625, "loss_aux_layer_10": 0.05859375, "loss_aux_layer_11": 0.0626220703125, "loss_aux_layer_12": 0.0665283203125, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.0799560546875, "loss_aux_layer_15": 0.087890625, "loss_aux_layer_16": 0.096923828125, "loss_aux_layer_17": 0.1044921875, "loss_aux_layer_18": 0.112548828125, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.0439453125, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.05364990234375, "loss_aux_layer_4": 0.05621337890625, "loss_aux_layer_5": 0.057861328125, "loss_aux_layer_6": 0.06109619140625, "loss_aux_layer_7": 0.05938720703125, "loss_aux_layer_8": 0.0587158203125, "loss_aux_layer_9": 0.05731201171875, "step": 4430, "total_loss": 0.649622455239296 }, { "epoch": 0.8772520293011284, "grad_norm": 0.7866369485855103, "learning_rate": 5e-05, "llm_loss": 0.5264667868614197, "loss": 2.4285, "loss_aux_layer_0": 0.0105438232421875, "loss_aux_layer_1": 0.031646728515625, "loss_aux_layer_10": 0.05853271484375, "loss_aux_layer_11": 0.0625, "loss_aux_layer_12": 0.06683349609375, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.080322265625, "loss_aux_layer_15": 0.088134765625, "loss_aux_layer_16": 0.0966796875, "loss_aux_layer_17": 0.1036376953125, "loss_aux_layer_18": 0.1116943359375, "loss_aux_layer_19": 0.114501953125, "loss_aux_layer_2": 0.0440673828125, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.053955078125, "loss_aux_layer_4": 0.05657958984375, "loss_aux_layer_5": 0.05816650390625, "loss_aux_layer_6": 0.0611572265625, "loss_aux_layer_7": 0.059326171875, "loss_aux_layer_8": 0.0584716796875, "loss_aux_layer_9": 0.05712890625, "step": 4431, "total_loss": 0.6071362197399139 }, { "epoch": 0.8774500098990299, "grad_norm": 0.6894909143447876, "learning_rate": 5e-05, "llm_loss": 0.5514271408319473, "loss": 2.5256, "loss_aux_layer_0": 0.010406494140625, "loss_aux_layer_1": 0.0313720703125, "loss_aux_layer_10": 0.05841064453125, "loss_aux_layer_11": 0.0626220703125, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.0791015625, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.0947265625, "loss_aux_layer_17": 0.1016845703125, "loss_aux_layer_18": 0.1097412109375, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.04388427734375, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.053955078125, "loss_aux_layer_4": 0.05645751953125, "loss_aux_layer_5": 0.05804443359375, "loss_aux_layer_6": 0.06097412109375, "loss_aux_layer_7": 0.05938720703125, "loss_aux_layer_8": 0.05877685546875, "loss_aux_layer_9": 0.0572509765625, "step": 4432, "total_loss": 0.6313883066177368 }, { "epoch": 0.8776479904969313, "grad_norm": 0.6816888451576233, "learning_rate": 5e-05, "llm_loss": 0.6225656569004059, "loss": 2.7975, "loss_aux_layer_0": 0.0099639892578125, "loss_aux_layer_1": 0.029510498046875, "loss_aux_layer_10": 0.05474853515625, "loss_aux_layer_11": 0.0584716796875, "loss_aux_layer_12": 0.06280517578125, "loss_aux_layer_13": 0.0677490234375, "loss_aux_layer_14": 0.07568359375, "loss_aux_layer_15": 0.08349609375, "loss_aux_layer_16": 0.09228515625, "loss_aux_layer_17": 0.0997314453125, "loss_aux_layer_18": 0.107177734375, "loss_aux_layer_19": 0.1104736328125, "loss_aux_layer_2": 0.0408935546875, "loss_aux_layer_20": 0.1182861328125, "loss_aux_layer_21": 0.1258544921875, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.18212890625, "loss_aux_layer_3": 0.050048828125, "loss_aux_layer_4": 0.05255126953125, "loss_aux_layer_5": 0.05426025390625, "loss_aux_layer_6": 0.0570068359375, "loss_aux_layer_7": 0.0552978515625, "loss_aux_layer_8": 0.0546875, "loss_aux_layer_9": 0.053466796875, "step": 4433, "total_loss": 0.6993765830993652 }, { "epoch": 0.8778459710948328, "grad_norm": 0.8330391645431519, "learning_rate": 5e-05, "llm_loss": 0.5950766131281853, "loss": 2.6857, "loss_aux_layer_0": 0.010284423828125, "loss_aux_layer_1": 0.028839111328125, "loss_aux_layer_10": 0.054443359375, "loss_aux_layer_11": 0.05804443359375, "loss_aux_layer_12": 0.06219482421875, "loss_aux_layer_13": 0.0670166015625, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.0830078125, "loss_aux_layer_16": 0.0919189453125, "loss_aux_layer_17": 0.0994873046875, "loss_aux_layer_18": 0.1068115234375, "loss_aux_layer_19": 0.109619140625, "loss_aux_layer_2": 0.04107666015625, "loss_aux_layer_20": 0.117431640625, "loss_aux_layer_21": 0.1253662109375, "loss_aux_layer_22": 0.145263671875, "loss_aux_layer_23": 0.1796875, "loss_aux_layer_3": 0.05035400390625, "loss_aux_layer_4": 0.05279541015625, "loss_aux_layer_5": 0.0543212890625, "loss_aux_layer_6": 0.0570068359375, "loss_aux_layer_7": 0.054931640625, "loss_aux_layer_8": 0.05438232421875, "loss_aux_layer_9": 0.05303955078125, "step": 4434, "total_loss": 0.671414390206337 }, { "epoch": 0.8780439516927341, "grad_norm": 0.7431662082672119, "learning_rate": 5e-05, "llm_loss": 0.6045119762420654, "loss": 2.733, "loss_aux_layer_0": 0.0101776123046875, "loss_aux_layer_1": 0.0302734375, "loss_aux_layer_10": 0.056640625, "loss_aux_layer_11": 0.06048583984375, "loss_aux_layer_12": 0.0648193359375, "loss_aux_layer_13": 0.06982421875, "loss_aux_layer_14": 0.078125, "loss_aux_layer_15": 0.0863037109375, "loss_aux_layer_16": 0.09521484375, "loss_aux_layer_17": 0.102294921875, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.04248046875, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.127685546875, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.182373046875, "loss_aux_layer_3": 0.05255126953125, "loss_aux_layer_4": 0.054931640625, "loss_aux_layer_5": 0.05633544921875, "loss_aux_layer_6": 0.0589599609375, "loss_aux_layer_7": 0.05712890625, "loss_aux_layer_8": 0.056396484375, "loss_aux_layer_9": 0.05523681640625, "step": 4435, "total_loss": 0.6832381188869476 }, { "epoch": 0.8782419322906355, "grad_norm": 0.7796404957771301, "learning_rate": 5e-05, "llm_loss": 0.552019938826561, "loss": 2.5211, "loss_aux_layer_0": 0.0104522705078125, "loss_aux_layer_1": 0.029693603515625, "loss_aux_layer_10": 0.0560302734375, "loss_aux_layer_11": 0.059814453125, "loss_aux_layer_12": 0.06402587890625, "loss_aux_layer_13": 0.06903076171875, "loss_aux_layer_14": 0.0771484375, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.1011962890625, "loss_aux_layer_18": 0.1092529296875, "loss_aux_layer_19": 0.1124267578125, "loss_aux_layer_2": 0.041748046875, "loss_aux_layer_20": 0.1201171875, "loss_aux_layer_21": 0.128173828125, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.05108642578125, "loss_aux_layer_4": 0.0537109375, "loss_aux_layer_5": 0.05523681640625, "loss_aux_layer_6": 0.05816650390625, "loss_aux_layer_7": 0.05645751953125, "loss_aux_layer_8": 0.055908203125, "loss_aux_layer_9": 0.05474853515625, "step": 4436, "total_loss": 0.6302825063467026 }, { "epoch": 0.878439912888537, "grad_norm": 0.8428199291229248, "learning_rate": 5e-05, "llm_loss": 0.5552383586764336, "loss": 2.5383, "loss_aux_layer_0": 0.0107879638671875, "loss_aux_layer_1": 0.030792236328125, "loss_aux_layer_10": 0.05657958984375, "loss_aux_layer_11": 0.060302734375, "loss_aux_layer_12": 0.064697265625, "loss_aux_layer_13": 0.06988525390625, "loss_aux_layer_14": 0.07861328125, "loss_aux_layer_15": 0.0869140625, "loss_aux_layer_16": 0.095947265625, "loss_aux_layer_17": 0.1031494140625, "loss_aux_layer_18": 0.1109619140625, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.04351806640625, "loss_aux_layer_20": 0.1212158203125, "loss_aux_layer_21": 0.1290283203125, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.05279541015625, "loss_aux_layer_4": 0.05511474609375, "loss_aux_layer_5": 0.05670166015625, "loss_aux_layer_6": 0.05950927734375, "loss_aux_layer_7": 0.05755615234375, "loss_aux_layer_8": 0.05682373046875, "loss_aux_layer_9": 0.0555419921875, "step": 4437, "total_loss": 0.6345673054456711 }, { "epoch": 0.8786378934864383, "grad_norm": 0.7300480604171753, "learning_rate": 5e-05, "llm_loss": 0.5975417345762253, "loss": 2.7005, "loss_aux_layer_0": 0.0100860595703125, "loss_aux_layer_1": 0.028717041015625, "loss_aux_layer_10": 0.05487060546875, "loss_aux_layer_11": 0.05889892578125, "loss_aux_layer_12": 0.06304931640625, "loss_aux_layer_13": 0.06817626953125, "loss_aux_layer_14": 0.07666015625, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.1103515625, "loss_aux_layer_19": 0.11328125, "loss_aux_layer_2": 0.0394287109375, "loss_aux_layer_20": 0.12109375, "loss_aux_layer_21": 0.1290283203125, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.05194091796875, "loss_aux_layer_5": 0.053466796875, "loss_aux_layer_6": 0.05657958984375, "loss_aux_layer_7": 0.05499267578125, "loss_aux_layer_8": 0.05462646484375, "loss_aux_layer_9": 0.05352783203125, "step": 4438, "total_loss": 0.6751161515712738 }, { "epoch": 0.8788358740843397, "grad_norm": 1.0024646520614624, "learning_rate": 5e-05, "llm_loss": 0.5756390541791916, "loss": 2.6224, "loss_aux_layer_0": 0.0103302001953125, "loss_aux_layer_1": 0.03070068359375, "loss_aux_layer_10": 0.05828857421875, "loss_aux_layer_11": 0.0621337890625, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.0791015625, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.0960693359375, "loss_aux_layer_17": 0.103515625, "loss_aux_layer_18": 0.1114501953125, "loss_aux_layer_19": 0.11376953125, "loss_aux_layer_2": 0.04339599609375, "loss_aux_layer_20": 0.121337890625, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.05352783203125, "loss_aux_layer_4": 0.05615234375, "loss_aux_layer_5": 0.0577392578125, "loss_aux_layer_6": 0.0606689453125, "loss_aux_layer_7": 0.0587158203125, "loss_aux_layer_8": 0.05792236328125, "loss_aux_layer_9": 0.05694580078125, "step": 4439, "total_loss": 0.6555977463722229 }, { "epoch": 0.8790338546822412, "grad_norm": 0.8397429585456848, "learning_rate": 5e-05, "llm_loss": 0.5209823697805405, "loss": 2.4003, "loss_aux_layer_0": 0.0110626220703125, "loss_aux_layer_1": 0.029876708984375, "loss_aux_layer_10": 0.05670166015625, "loss_aux_layer_11": 0.060546875, "loss_aux_layer_12": 0.06488037109375, "loss_aux_layer_13": 0.0699462890625, "loss_aux_layer_14": 0.0780029296875, "loss_aux_layer_15": 0.0863037109375, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.1025390625, "loss_aux_layer_18": 0.1104736328125, "loss_aux_layer_19": 0.113525390625, "loss_aux_layer_2": 0.0419921875, "loss_aux_layer_20": 0.121337890625, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.0517578125, "loss_aux_layer_4": 0.05426025390625, "loss_aux_layer_5": 0.055908203125, "loss_aux_layer_6": 0.058837890625, "loss_aux_layer_7": 0.0570068359375, "loss_aux_layer_8": 0.05645751953125, "loss_aux_layer_9": 0.055419921875, "step": 4440, "total_loss": 0.6000864952802658 }, { "epoch": 0.8792318352801426, "grad_norm": 0.8914185762405396, "learning_rate": 5e-05, "llm_loss": 0.5240109115839005, "loss": 2.4116, "loss_aux_layer_0": 0.010955810546875, "loss_aux_layer_1": 0.029449462890625, "loss_aux_layer_10": 0.05535888671875, "loss_aux_layer_11": 0.0595703125, "loss_aux_layer_12": 0.06414794921875, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.0859375, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.041015625, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.05047607421875, "loss_aux_layer_4": 0.052978515625, "loss_aux_layer_5": 0.0543212890625, "loss_aux_layer_6": 0.057373046875, "loss_aux_layer_7": 0.05584716796875, "loss_aux_layer_8": 0.05517578125, "loss_aux_layer_9": 0.0540771484375, "step": 4441, "total_loss": 0.602899894118309 }, { "epoch": 0.8794298158780439, "grad_norm": 0.6898903846740723, "learning_rate": 5e-05, "llm_loss": 0.6016961634159088, "loss": 2.7109, "loss_aux_layer_0": 0.0100860595703125, "loss_aux_layer_1": 0.028594970703125, "loss_aux_layer_10": 0.05419921875, "loss_aux_layer_11": 0.05792236328125, "loss_aux_layer_12": 0.06219482421875, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.07470703125, "loss_aux_layer_15": 0.08251953125, "loss_aux_layer_16": 0.0911865234375, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.10693359375, "loss_aux_layer_19": 0.1103515625, "loss_aux_layer_2": 0.0394287109375, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.1256103515625, "loss_aux_layer_22": 0.14599609375, "loss_aux_layer_23": 0.18115234375, "loss_aux_layer_3": 0.0489501953125, "loss_aux_layer_4": 0.05145263671875, "loss_aux_layer_5": 0.052978515625, "loss_aux_layer_6": 0.0557861328125, "loss_aux_layer_7": 0.05426025390625, "loss_aux_layer_8": 0.05401611328125, "loss_aux_layer_9": 0.0528564453125, "step": 4442, "total_loss": 0.6777332574129105 }, { "epoch": 0.8796277964759454, "grad_norm": 0.9489282965660095, "learning_rate": 5e-05, "llm_loss": 0.5030107498168945, "loss": 2.3257, "loss_aux_layer_0": 0.011260986328125, "loss_aux_layer_1": 0.030303955078125, "loss_aux_layer_10": 0.0562744140625, "loss_aux_layer_11": 0.05987548828125, "loss_aux_layer_12": 0.06402587890625, "loss_aux_layer_13": 0.069091796875, "loss_aux_layer_14": 0.0770263671875, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.0928955078125, "loss_aux_layer_17": 0.1004638671875, "loss_aux_layer_18": 0.1080322265625, "loss_aux_layer_19": 0.1114501953125, "loss_aux_layer_2": 0.04193115234375, "loss_aux_layer_20": 0.119384765625, "loss_aux_layer_21": 0.1287841796875, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.0513916015625, "loss_aux_layer_4": 0.05389404296875, "loss_aux_layer_5": 0.05548095703125, "loss_aux_layer_6": 0.05859375, "loss_aux_layer_7": 0.05682373046875, "loss_aux_layer_8": 0.05615234375, "loss_aux_layer_9": 0.05517578125, "step": 4443, "total_loss": 0.5814258232712746 }, { "epoch": 0.8798257770738468, "grad_norm": 0.9536065459251404, "learning_rate": 5e-05, "llm_loss": 0.5951469838619232, "loss": 2.6891, "loss_aux_layer_0": 0.0102691650390625, "loss_aux_layer_1": 0.028839111328125, "loss_aux_layer_10": 0.054931640625, "loss_aux_layer_11": 0.05889892578125, "loss_aux_layer_12": 0.06304931640625, "loss_aux_layer_13": 0.068603515625, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.101318359375, "loss_aux_layer_18": 0.109130859375, "loss_aux_layer_19": 0.1123046875, "loss_aux_layer_2": 0.0401611328125, "loss_aux_layer_20": 0.11962890625, "loss_aux_layer_21": 0.12646484375, "loss_aux_layer_22": 0.14501953125, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.04998779296875, "loss_aux_layer_4": 0.05255126953125, "loss_aux_layer_5": 0.05413818359375, "loss_aux_layer_6": 0.05718994140625, "loss_aux_layer_7": 0.05535888671875, "loss_aux_layer_8": 0.05487060546875, "loss_aux_layer_9": 0.05377197265625, "step": 4444, "total_loss": 0.6722817569971085 }, { "epoch": 0.8800237576717481, "grad_norm": 0.9090418219566345, "learning_rate": 5e-05, "llm_loss": 0.6534401476383209, "loss": 2.9312, "loss_aux_layer_0": 0.010009765625, "loss_aux_layer_1": 0.02960205078125, "loss_aux_layer_10": 0.05596923828125, "loss_aux_layer_11": 0.05987548828125, "loss_aux_layer_12": 0.064208984375, "loss_aux_layer_13": 0.06982421875, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.09619140625, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.1121826171875, "loss_aux_layer_19": 0.1156005859375, "loss_aux_layer_2": 0.041259765625, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.05120849609375, "loss_aux_layer_4": 0.0537109375, "loss_aux_layer_5": 0.055419921875, "loss_aux_layer_6": 0.05828857421875, "loss_aux_layer_7": 0.05621337890625, "loss_aux_layer_8": 0.05584716796875, "loss_aux_layer_9": 0.0548095703125, "step": 4445, "total_loss": 0.7328101843595505 }, { "epoch": 0.8802217382696496, "grad_norm": 0.8276332020759583, "learning_rate": 5e-05, "llm_loss": 0.6644874960184097, "loss": 2.9655, "loss_aux_layer_0": 0.0107269287109375, "loss_aux_layer_1": 0.02783203125, "loss_aux_layer_10": 0.053955078125, "loss_aux_layer_11": 0.0577392578125, "loss_aux_layer_12": 0.06207275390625, "loss_aux_layer_13": 0.0673828125, "loss_aux_layer_14": 0.0755615234375, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.093017578125, "loss_aux_layer_17": 0.1004638671875, "loss_aux_layer_18": 0.1082763671875, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.03863525390625, "loss_aux_layer_20": 0.12109375, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.04791259765625, "loss_aux_layer_4": 0.0506591796875, "loss_aux_layer_5": 0.05255126953125, "loss_aux_layer_6": 0.05560302734375, "loss_aux_layer_7": 0.0538330078125, "loss_aux_layer_8": 0.05340576171875, "loss_aux_layer_9": 0.05255126953125, "step": 4446, "total_loss": 0.7413834631443024 }, { "epoch": 0.880419718867551, "grad_norm": 1.0323196649551392, "learning_rate": 5e-05, "llm_loss": 0.6226269155740738, "loss": 2.8039, "loss_aux_layer_0": 0.0099334716796875, "loss_aux_layer_1": 0.03009033203125, "loss_aux_layer_10": 0.05596923828125, "loss_aux_layer_11": 0.05999755859375, "loss_aux_layer_12": 0.06402587890625, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.0772705078125, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.1014404296875, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.113037109375, "loss_aux_layer_2": 0.0418701171875, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.128173828125, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.0516357421875, "loss_aux_layer_4": 0.05419921875, "loss_aux_layer_5": 0.05560302734375, "loss_aux_layer_6": 0.05841064453125, "loss_aux_layer_7": 0.0567626953125, "loss_aux_layer_8": 0.0560302734375, "loss_aux_layer_9": 0.0546875, "step": 4447, "total_loss": 0.7009711712598801 }, { "epoch": 0.8806176994654524, "grad_norm": 0.8303091526031494, "learning_rate": 5e-05, "llm_loss": 0.5405637994408607, "loss": 2.4653, "loss_aux_layer_0": 0.01031494140625, "loss_aux_layer_1": 0.027557373046875, "loss_aux_layer_10": 0.0540771484375, "loss_aux_layer_11": 0.057373046875, "loss_aux_layer_12": 0.06121826171875, "loss_aux_layer_13": 0.0662841796875, "loss_aux_layer_14": 0.0743408203125, "loss_aux_layer_15": 0.08203125, "loss_aux_layer_16": 0.091064453125, "loss_aux_layer_17": 0.0986328125, "loss_aux_layer_18": 0.106201171875, "loss_aux_layer_19": 0.109619140625, "loss_aux_layer_2": 0.03955078125, "loss_aux_layer_20": 0.1175537109375, "loss_aux_layer_21": 0.1258544921875, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.18212890625, "loss_aux_layer_3": 0.0487060546875, "loss_aux_layer_4": 0.05120849609375, "loss_aux_layer_5": 0.05279541015625, "loss_aux_layer_6": 0.0556640625, "loss_aux_layer_7": 0.05401611328125, "loss_aux_layer_8": 0.05364990234375, "loss_aux_layer_9": 0.052734375, "step": 4448, "total_loss": 0.6163357794284821 }, { "epoch": 0.8808156800633538, "grad_norm": 1.1843311786651611, "learning_rate": 5e-05, "llm_loss": 0.6406717151403427, "loss": 2.8757, "loss_aux_layer_0": 0.0100860595703125, "loss_aux_layer_1": 0.029266357421875, "loss_aux_layer_10": 0.05584716796875, "loss_aux_layer_11": 0.059814453125, "loss_aux_layer_12": 0.0638427734375, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.0771484375, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.0938720703125, "loss_aux_layer_17": 0.1016845703125, "loss_aux_layer_18": 0.110107421875, "loss_aux_layer_19": 0.1134033203125, "loss_aux_layer_2": 0.04107666015625, "loss_aux_layer_20": 0.12109375, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.05078125, "loss_aux_layer_4": 0.053466796875, "loss_aux_layer_5": 0.05499267578125, "loss_aux_layer_6": 0.057861328125, "loss_aux_layer_7": 0.0560302734375, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.05450439453125, "step": 4449, "total_loss": 0.7189313173294067 }, { "epoch": 0.8810136606612552, "grad_norm": 1.000383973121643, "learning_rate": 5e-05, "llm_loss": 0.636009931564331, "loss": 2.8595, "loss_aux_layer_0": 0.011810302734375, "loss_aux_layer_1": 0.029937744140625, "loss_aux_layer_10": 0.0574951171875, "loss_aux_layer_11": 0.0611572265625, "loss_aux_layer_12": 0.06512451171875, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.078125, "loss_aux_layer_15": 0.0859375, "loss_aux_layer_16": 0.0947265625, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.04248046875, "loss_aux_layer_20": 0.119384765625, "loss_aux_layer_21": 0.126953125, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.18310546875, "loss_aux_layer_3": 0.052001953125, "loss_aux_layer_4": 0.05487060546875, "loss_aux_layer_5": 0.056640625, "loss_aux_layer_6": 0.05987548828125, "loss_aux_layer_7": 0.05804443359375, "loss_aux_layer_8": 0.05755615234375, "loss_aux_layer_9": 0.05615234375, "step": 4450, "total_loss": 0.7148785442113876 }, { "epoch": 0.8812116412591566, "grad_norm": 1.0531736612319946, "learning_rate": 5e-05, "llm_loss": 0.5213136821985245, "loss": 2.4055, "loss_aux_layer_0": 0.01068115234375, "loss_aux_layer_1": 0.03070068359375, "loss_aux_layer_10": 0.05841064453125, "loss_aux_layer_11": 0.06231689453125, "loss_aux_layer_12": 0.06689453125, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.0794677734375, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.095947265625, "loss_aux_layer_17": 0.1031494140625, "loss_aux_layer_18": 0.11083984375, "loss_aux_layer_19": 0.1134033203125, "loss_aux_layer_2": 0.04296875, "loss_aux_layer_20": 0.12109375, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.052734375, "loss_aux_layer_4": 0.05560302734375, "loss_aux_layer_5": 0.05743408203125, "loss_aux_layer_6": 0.0604248046875, "loss_aux_layer_7": 0.0589599609375, "loss_aux_layer_8": 0.05853271484375, "loss_aux_layer_9": 0.05731201171875, "step": 4451, "total_loss": 0.6013634502887726 }, { "epoch": 0.881409621857058, "grad_norm": 1.1253329515457153, "learning_rate": 5e-05, "llm_loss": 0.615177795290947, "loss": 2.7879, "loss_aux_layer_0": 0.011444091796875, "loss_aux_layer_1": 0.030670166015625, "loss_aux_layer_10": 0.05841064453125, "loss_aux_layer_11": 0.06207275390625, "loss_aux_layer_12": 0.06671142578125, "loss_aux_layer_13": 0.072021484375, "loss_aux_layer_14": 0.08056640625, "loss_aux_layer_15": 0.0892333984375, "loss_aux_layer_16": 0.0987548828125, "loss_aux_layer_17": 0.1060791015625, "loss_aux_layer_18": 0.1138916015625, "loss_aux_layer_19": 0.1171875, "loss_aux_layer_2": 0.04473876953125, "loss_aux_layer_20": 0.1253662109375, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.0543212890625, "loss_aux_layer_4": 0.05645751953125, "loss_aux_layer_5": 0.05780029296875, "loss_aux_layer_6": 0.060791015625, "loss_aux_layer_7": 0.05889892578125, "loss_aux_layer_8": 0.058349609375, "loss_aux_layer_9": 0.05718994140625, "step": 4452, "total_loss": 0.6969812512397766 }, { "epoch": 0.8816076024549594, "grad_norm": 1.0530140399932861, "learning_rate": 5e-05, "llm_loss": 0.6223040521144867, "loss": 2.7971, "loss_aux_layer_0": 0.0108489990234375, "loss_aux_layer_1": 0.029083251953125, "loss_aux_layer_10": 0.054931640625, "loss_aux_layer_11": 0.05841064453125, "loss_aux_layer_12": 0.06256103515625, "loss_aux_layer_13": 0.067626953125, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.09130859375, "loss_aux_layer_17": 0.0992431640625, "loss_aux_layer_18": 0.1068115234375, "loss_aux_layer_19": 0.1103515625, "loss_aux_layer_2": 0.0413818359375, "loss_aux_layer_20": 0.118408203125, "loss_aux_layer_21": 0.12646484375, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.05078125, "loss_aux_layer_4": 0.05316162109375, "loss_aux_layer_5": 0.05462646484375, "loss_aux_layer_6": 0.057373046875, "loss_aux_layer_7": 0.05548095703125, "loss_aux_layer_8": 0.05499267578125, "loss_aux_layer_9": 0.0537109375, "step": 4453, "total_loss": 0.6992724388837814 }, { "epoch": 0.8818055830528608, "grad_norm": 0.937645435333252, "learning_rate": 5e-05, "llm_loss": 0.48510509729385376, "loss": 2.2543, "loss_aux_layer_0": 0.0107269287109375, "loss_aux_layer_1": 0.029815673828125, "loss_aux_layer_10": 0.056640625, "loss_aux_layer_11": 0.06060791015625, "loss_aux_layer_12": 0.0650634765625, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.0859375, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.10205078125, "loss_aux_layer_18": 0.1097412109375, "loss_aux_layer_19": 0.1123046875, "loss_aux_layer_2": 0.0419921875, "loss_aux_layer_20": 0.1195068359375, "loss_aux_layer_21": 0.127197265625, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.051513671875, "loss_aux_layer_4": 0.0540771484375, "loss_aux_layer_5": 0.0555419921875, "loss_aux_layer_6": 0.05841064453125, "loss_aux_layer_7": 0.05670166015625, "loss_aux_layer_8": 0.056396484375, "loss_aux_layer_9": 0.055419921875, "step": 4454, "total_loss": 0.563579797744751 }, { "epoch": 0.8820035636507623, "grad_norm": 0.9984484314918518, "learning_rate": 5e-05, "llm_loss": 0.5868638455867767, "loss": 2.6607, "loss_aux_layer_0": 0.01104736328125, "loss_aux_layer_1": 0.029449462890625, "loss_aux_layer_10": 0.0548095703125, "loss_aux_layer_11": 0.05859375, "loss_aux_layer_12": 0.0628662109375, "loss_aux_layer_13": 0.068359375, "loss_aux_layer_14": 0.0771484375, "loss_aux_layer_15": 0.08544921875, "loss_aux_layer_16": 0.09521484375, "loss_aux_layer_17": 0.1031494140625, "loss_aux_layer_18": 0.111572265625, "loss_aux_layer_19": 0.1148681640625, "loss_aux_layer_2": 0.0413818359375, "loss_aux_layer_20": 0.1226806640625, "loss_aux_layer_21": 0.1297607421875, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.05035400390625, "loss_aux_layer_4": 0.05255126953125, "loss_aux_layer_5": 0.0540771484375, "loss_aux_layer_6": 0.056884765625, "loss_aux_layer_7": 0.05487060546875, "loss_aux_layer_8": 0.05438232421875, "loss_aux_layer_9": 0.0535888671875, "step": 4455, "total_loss": 0.6651797443628311 }, { "epoch": 0.8822015442486636, "grad_norm": 1.1126080751419067, "learning_rate": 5e-05, "llm_loss": 0.48949387669563293, "loss": 2.2833, "loss_aux_layer_0": 0.01116943359375, "loss_aux_layer_1": 0.029754638671875, "loss_aux_layer_10": 0.0574951171875, "loss_aux_layer_11": 0.06158447265625, "loss_aux_layer_12": 0.06622314453125, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.0799560546875, "loss_aux_layer_15": 0.0889892578125, "loss_aux_layer_16": 0.0986328125, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.114990234375, "loss_aux_layer_19": 0.1185302734375, "loss_aux_layer_2": 0.042236328125, "loss_aux_layer_20": 0.1260986328125, "loss_aux_layer_21": 0.134521484375, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.05206298828125, "loss_aux_layer_4": 0.0546875, "loss_aux_layer_5": 0.0565185546875, "loss_aux_layer_6": 0.05950927734375, "loss_aux_layer_7": 0.0576171875, "loss_aux_layer_8": 0.05731201171875, "loss_aux_layer_9": 0.05621337890625, "step": 4456, "total_loss": 0.5708197504281998 }, { "epoch": 0.882399524846565, "grad_norm": 1.0714166164398193, "learning_rate": 5e-05, "llm_loss": 0.5696180760860443, "loss": 2.5834, "loss_aux_layer_0": 0.011077880859375, "loss_aux_layer_1": 0.028076171875, "loss_aux_layer_10": 0.05389404296875, "loss_aux_layer_11": 0.057373046875, "loss_aux_layer_12": 0.06158447265625, "loss_aux_layer_13": 0.0670166015625, "loss_aux_layer_14": 0.07470703125, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.091796875, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.1077880859375, "loss_aux_layer_19": 0.111328125, "loss_aux_layer_2": 0.03955078125, "loss_aux_layer_20": 0.1187744140625, "loss_aux_layer_21": 0.1273193359375, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.04876708984375, "loss_aux_layer_4": 0.05120849609375, "loss_aux_layer_5": 0.05267333984375, "loss_aux_layer_6": 0.0552978515625, "loss_aux_layer_7": 0.0537109375, "loss_aux_layer_8": 0.05352783203125, "loss_aux_layer_9": 0.052734375, "step": 4457, "total_loss": 0.6458466351032257 }, { "epoch": 0.8825975054444665, "grad_norm": 0.9664086103439331, "learning_rate": 5e-05, "llm_loss": 0.6514669582247734, "loss": 2.9163, "loss_aux_layer_0": 0.01043701171875, "loss_aux_layer_1": 0.029327392578125, "loss_aux_layer_10": 0.05499267578125, "loss_aux_layer_11": 0.05889892578125, "loss_aux_layer_12": 0.0631103515625, "loss_aux_layer_13": 0.068359375, "loss_aux_layer_14": 0.07666015625, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.0936279296875, "loss_aux_layer_17": 0.1015625, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.04132080078125, "loss_aux_layer_20": 0.1202392578125, "loss_aux_layer_21": 0.127685546875, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.181396484375, "loss_aux_layer_3": 0.05108642578125, "loss_aux_layer_4": 0.053466796875, "loss_aux_layer_5": 0.0545654296875, "loss_aux_layer_6": 0.0572509765625, "loss_aux_layer_7": 0.05548095703125, "loss_aux_layer_8": 0.05499267578125, "loss_aux_layer_9": 0.0538330078125, "step": 4458, "total_loss": 0.7290859594941139 }, { "epoch": 0.8827954860423678, "grad_norm": 0.9143973588943481, "learning_rate": 5e-05, "llm_loss": 0.49230096489191055, "loss": 2.28, "loss_aux_layer_0": 0.012359619140625, "loss_aux_layer_1": 0.029052734375, "loss_aux_layer_10": 0.05517578125, "loss_aux_layer_11": 0.0587158203125, "loss_aux_layer_12": 0.06280517578125, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.076416015625, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.0938720703125, "loss_aux_layer_17": 0.1015625, "loss_aux_layer_18": 0.1097412109375, "loss_aux_layer_19": 0.1131591796875, "loss_aux_layer_2": 0.04071044921875, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.04998779296875, "loss_aux_layer_4": 0.05255126953125, "loss_aux_layer_5": 0.053955078125, "loss_aux_layer_6": 0.056884765625, "loss_aux_layer_7": 0.05535888671875, "loss_aux_layer_8": 0.05499267578125, "loss_aux_layer_9": 0.053955078125, "step": 4459, "total_loss": 0.5699961856007576 }, { "epoch": 0.8829934666402692, "grad_norm": 1.4175174236297607, "learning_rate": 5e-05, "llm_loss": 0.5391310155391693, "loss": 2.4805, "loss_aux_layer_0": 0.0103912353515625, "loss_aux_layer_1": 0.02996826171875, "loss_aux_layer_10": 0.05615234375, "loss_aux_layer_11": 0.0601806640625, "loss_aux_layer_12": 0.06475830078125, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.0794677734375, "loss_aux_layer_15": 0.0887451171875, "loss_aux_layer_16": 0.0986328125, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.1151123046875, "loss_aux_layer_19": 0.1187744140625, "loss_aux_layer_2": 0.0418701171875, "loss_aux_layer_20": 0.1270751953125, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.1572265625, "loss_aux_layer_23": 0.196044921875, "loss_aux_layer_3": 0.05157470703125, "loss_aux_layer_4": 0.05389404296875, "loss_aux_layer_5": 0.0555419921875, "loss_aux_layer_6": 0.05841064453125, "loss_aux_layer_7": 0.0565185546875, "loss_aux_layer_8": 0.05609130859375, "loss_aux_layer_9": 0.05499267578125, "step": 4460, "total_loss": 0.6201128214597702 }, { "epoch": 0.8831914472381707, "grad_norm": 1.228919506072998, "learning_rate": 5e-05, "llm_loss": 0.5656962096691132, "loss": 2.5659, "loss_aux_layer_0": 0.0137481689453125, "loss_aux_layer_1": 0.028717041015625, "loss_aux_layer_10": 0.05291748046875, "loss_aux_layer_11": 0.05657958984375, "loss_aux_layer_12": 0.06097412109375, "loss_aux_layer_13": 0.0660400390625, "loss_aux_layer_14": 0.0740966796875, "loss_aux_layer_15": 0.0821533203125, "loss_aux_layer_16": 0.091552734375, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.107421875, "loss_aux_layer_19": 0.110595703125, "loss_aux_layer_2": 0.0391845703125, "loss_aux_layer_20": 0.11865234375, "loss_aux_layer_21": 0.1265869140625, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.181884765625, "loss_aux_layer_3": 0.04815673828125, "loss_aux_layer_4": 0.05059814453125, "loss_aux_layer_5": 0.05194091796875, "loss_aux_layer_6": 0.05462646484375, "loss_aux_layer_7": 0.0528564453125, "loss_aux_layer_8": 0.05242919921875, "loss_aux_layer_9": 0.05157470703125, "step": 4461, "total_loss": 0.6414726376533508 }, { "epoch": 0.8833894278360721, "grad_norm": 1.0661979913711548, "learning_rate": 5e-05, "llm_loss": 0.5038569569587708, "loss": 2.3176, "loss_aux_layer_0": 0.010772705078125, "loss_aux_layer_1": 0.0286865234375, "loss_aux_layer_10": 0.05352783203125, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.06134033203125, "loss_aux_layer_13": 0.06640625, "loss_aux_layer_14": 0.07421875, "loss_aux_layer_15": 0.0821533203125, "loss_aux_layer_16": 0.0904541015625, "loss_aux_layer_17": 0.0977783203125, "loss_aux_layer_18": 0.10546875, "loss_aux_layer_19": 0.109130859375, "loss_aux_layer_2": 0.04107666015625, "loss_aux_layer_20": 0.1165771484375, "loss_aux_layer_21": 0.12451171875, "loss_aux_layer_22": 0.143798828125, "loss_aux_layer_23": 0.17919921875, "loss_aux_layer_3": 0.04998779296875, "loss_aux_layer_4": 0.05206298828125, "loss_aux_layer_5": 0.05352783203125, "loss_aux_layer_6": 0.05596923828125, "loss_aux_layer_7": 0.05401611328125, "loss_aux_layer_8": 0.053466796875, "loss_aux_layer_9": 0.052490234375, "step": 4462, "total_loss": 0.5794033706188202 }, { "epoch": 0.8835874084339734, "grad_norm": 1.221563696861267, "learning_rate": 5e-05, "llm_loss": 0.5003254264593124, "loss": 2.3266, "loss_aux_layer_0": 0.015167236328125, "loss_aux_layer_1": 0.03131103515625, "loss_aux_layer_10": 0.05859375, "loss_aux_layer_11": 0.0623779296875, "loss_aux_layer_12": 0.06658935546875, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.0794677734375, "loss_aux_layer_15": 0.087890625, "loss_aux_layer_16": 0.096923828125, "loss_aux_layer_17": 0.104248046875, "loss_aux_layer_18": 0.1119384765625, "loss_aux_layer_19": 0.1153564453125, "loss_aux_layer_2": 0.04339599609375, "loss_aux_layer_20": 0.12353515625, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.193115234375, "loss_aux_layer_3": 0.05328369140625, "loss_aux_layer_4": 0.05572509765625, "loss_aux_layer_5": 0.05743408203125, "loss_aux_layer_6": 0.06048583984375, "loss_aux_layer_7": 0.0587158203125, "loss_aux_layer_8": 0.05841064453125, "loss_aux_layer_9": 0.05718994140625, "step": 4463, "total_loss": 0.5816441178321838 }, { "epoch": 0.8837853890318749, "grad_norm": 0.8338344693183899, "learning_rate": 5e-05, "llm_loss": 0.5587185695767403, "loss": 2.5514, "loss_aux_layer_0": 0.0105438232421875, "loss_aux_layer_1": 0.029296875, "loss_aux_layer_10": 0.055908203125, "loss_aux_layer_11": 0.0595703125, "loss_aux_layer_12": 0.0640869140625, "loss_aux_layer_13": 0.069580078125, "loss_aux_layer_14": 0.078369140625, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.09716796875, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.11328125, "loss_aux_layer_19": 0.1162109375, "loss_aux_layer_2": 0.041015625, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.1304931640625, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.05078125, "loss_aux_layer_4": 0.05316162109375, "loss_aux_layer_5": 0.05499267578125, "loss_aux_layer_6": 0.057861328125, "loss_aux_layer_7": 0.05621337890625, "loss_aux_layer_8": 0.0556640625, "loss_aux_layer_9": 0.05474853515625, "step": 4464, "total_loss": 0.6378433257341385 }, { "epoch": 0.8839833696297763, "grad_norm": 1.1098471879959106, "learning_rate": 5e-05, "llm_loss": 0.5092578381299973, "loss": 2.358, "loss_aux_layer_0": 0.0144805908203125, "loss_aux_layer_1": 0.03076171875, "loss_aux_layer_10": 0.05694580078125, "loss_aux_layer_11": 0.06097412109375, "loss_aux_layer_12": 0.06524658203125, "loss_aux_layer_13": 0.0704345703125, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.095947265625, "loss_aux_layer_17": 0.10302734375, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.115234375, "loss_aux_layer_2": 0.04241943359375, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.05206298828125, "loss_aux_layer_4": 0.054443359375, "loss_aux_layer_5": 0.0562744140625, "loss_aux_layer_6": 0.05926513671875, "loss_aux_layer_7": 0.057373046875, "loss_aux_layer_8": 0.0567626953125, "loss_aux_layer_9": 0.0556640625, "step": 4465, "total_loss": 0.5894942134618759 }, { "epoch": 0.8841813502276777, "grad_norm": 0.8418265581130981, "learning_rate": 5e-05, "llm_loss": 0.5876683741807938, "loss": 2.6636, "loss_aux_layer_0": 0.0106658935546875, "loss_aux_layer_1": 0.03021240234375, "loss_aux_layer_10": 0.05621337890625, "loss_aux_layer_11": 0.05999755859375, "loss_aux_layer_12": 0.06402587890625, "loss_aux_layer_13": 0.06884765625, "loss_aux_layer_14": 0.07666015625, "loss_aux_layer_15": 0.0843505859375, "loss_aux_layer_16": 0.09326171875, "loss_aux_layer_17": 0.1016845703125, "loss_aux_layer_18": 0.1092529296875, "loss_aux_layer_19": 0.11279296875, "loss_aux_layer_2": 0.04156494140625, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.128173828125, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.05108642578125, "loss_aux_layer_4": 0.05389404296875, "loss_aux_layer_5": 0.055419921875, "loss_aux_layer_6": 0.05859375, "loss_aux_layer_7": 0.056884765625, "loss_aux_layer_8": 0.05615234375, "loss_aux_layer_9": 0.0548095703125, "step": 4466, "total_loss": 0.6659080535173416 }, { "epoch": 0.8843793308255791, "grad_norm": 0.8812658786773682, "learning_rate": 5e-05, "llm_loss": 0.6118321716785431, "loss": 2.7549, "loss_aux_layer_0": 0.0132904052734375, "loss_aux_layer_1": 0.028717041015625, "loss_aux_layer_10": 0.05401611328125, "loss_aux_layer_11": 0.05743408203125, "loss_aux_layer_12": 0.06158447265625, "loss_aux_layer_13": 0.06695556640625, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.0931396484375, "loss_aux_layer_17": 0.1007080078125, "loss_aux_layer_18": 0.109130859375, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.03961181640625, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.1279296875, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.0491943359375, "loss_aux_layer_4": 0.051513671875, "loss_aux_layer_5": 0.05328369140625, "loss_aux_layer_6": 0.05615234375, "loss_aux_layer_7": 0.054443359375, "loss_aux_layer_8": 0.053955078125, "loss_aux_layer_9": 0.05279541015625, "step": 4467, "total_loss": 0.6887253522872925 }, { "epoch": 0.8845773114234805, "grad_norm": 0.7329416871070862, "learning_rate": 5e-05, "llm_loss": 0.5220386683940887, "loss": 2.405, "loss_aux_layer_0": 0.0107879638671875, "loss_aux_layer_1": 0.029266357421875, "loss_aux_layer_10": 0.05584716796875, "loss_aux_layer_11": 0.0599365234375, "loss_aux_layer_12": 0.06463623046875, "loss_aux_layer_13": 0.0703125, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.09619140625, "loss_aux_layer_17": 0.1033935546875, "loss_aux_layer_18": 0.1112060546875, "loss_aux_layer_19": 0.11474609375, "loss_aux_layer_2": 0.04095458984375, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.05084228515625, "loss_aux_layer_4": 0.0531005859375, "loss_aux_layer_5": 0.0545654296875, "loss_aux_layer_6": 0.05780029296875, "loss_aux_layer_7": 0.055908203125, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.054443359375, "step": 4468, "total_loss": 0.6012490540742874 }, { "epoch": 0.8847752920213819, "grad_norm": 0.9794452786445618, "learning_rate": 5e-05, "llm_loss": 0.5519834607839584, "loss": 2.5342, "loss_aux_layer_0": 0.0109710693359375, "loss_aux_layer_1": 0.031585693359375, "loss_aux_layer_10": 0.0592041015625, "loss_aux_layer_11": 0.06341552734375, "loss_aux_layer_12": 0.0675048828125, "loss_aux_layer_13": 0.0728759765625, "loss_aux_layer_14": 0.0811767578125, "loss_aux_layer_15": 0.0894775390625, "loss_aux_layer_16": 0.0982666015625, "loss_aux_layer_17": 0.105712890625, "loss_aux_layer_18": 0.1134033203125, "loss_aux_layer_19": 0.11572265625, "loss_aux_layer_2": 0.0445556640625, "loss_aux_layer_20": 0.122802734375, "loss_aux_layer_21": 0.1307373046875, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.0548095703125, "loss_aux_layer_4": 0.057861328125, "loss_aux_layer_5": 0.0594482421875, "loss_aux_layer_6": 0.06268310546875, "loss_aux_layer_7": 0.060302734375, "loss_aux_layer_8": 0.0595703125, "loss_aux_layer_9": 0.05804443359375, "step": 4469, "total_loss": 0.6335414350032806 }, { "epoch": 0.8849732726192833, "grad_norm": 0.8234682679176331, "learning_rate": 5e-05, "llm_loss": 0.5103593021631241, "loss": 2.3599, "loss_aux_layer_0": 0.01141357421875, "loss_aux_layer_1": 0.030364990234375, "loss_aux_layer_10": 0.0574951171875, "loss_aux_layer_11": 0.06121826171875, "loss_aux_layer_12": 0.06524658203125, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.095947265625, "loss_aux_layer_17": 0.1031494140625, "loss_aux_layer_18": 0.1114501953125, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.04254150390625, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.1290283203125, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.05279541015625, "loss_aux_layer_4": 0.0555419921875, "loss_aux_layer_5": 0.05743408203125, "loss_aux_layer_6": 0.06048583984375, "loss_aux_layer_7": 0.05853271484375, "loss_aux_layer_8": 0.05780029296875, "loss_aux_layer_9": 0.05633544921875, "step": 4470, "total_loss": 0.5899867564439774 }, { "epoch": 0.8851712532171847, "grad_norm": 0.8886873126029968, "learning_rate": 5e-05, "llm_loss": 0.5562130957841873, "loss": 2.5506, "loss_aux_layer_0": 0.0107574462890625, "loss_aux_layer_1": 0.031005859375, "loss_aux_layer_10": 0.058837890625, "loss_aux_layer_11": 0.06280517578125, "loss_aux_layer_12": 0.067138671875, "loss_aux_layer_13": 0.0723876953125, "loss_aux_layer_14": 0.080810546875, "loss_aux_layer_15": 0.089111328125, "loss_aux_layer_16": 0.09814453125, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.1163330078125, "loss_aux_layer_2": 0.0440673828125, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.05419921875, "loss_aux_layer_4": 0.0567626953125, "loss_aux_layer_5": 0.0584716796875, "loss_aux_layer_6": 0.06134033203125, "loss_aux_layer_7": 0.0595703125, "loss_aux_layer_8": 0.05889892578125, "loss_aux_layer_9": 0.05743408203125, "step": 4471, "total_loss": 0.6376592665910721 }, { "epoch": 0.8853692338150861, "grad_norm": 0.9590379595756531, "learning_rate": 5e-05, "llm_loss": 0.6428031921386719, "loss": 2.881, "loss_aux_layer_0": 0.011322021484375, "loss_aux_layer_1": 0.029052734375, "loss_aux_layer_10": 0.05487060546875, "loss_aux_layer_11": 0.05865478515625, "loss_aux_layer_12": 0.0628662109375, "loss_aux_layer_13": 0.0682373046875, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.0850830078125, "loss_aux_layer_16": 0.09375, "loss_aux_layer_17": 0.1014404296875, "loss_aux_layer_18": 0.1092529296875, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.04046630859375, "loss_aux_layer_20": 0.1209716796875, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.04949951171875, "loss_aux_layer_4": 0.0517578125, "loss_aux_layer_5": 0.053466796875, "loss_aux_layer_6": 0.056640625, "loss_aux_layer_7": 0.0546875, "loss_aux_layer_8": 0.0540771484375, "loss_aux_layer_9": 0.0533447265625, "step": 4472, "total_loss": 0.7202519625425339 }, { "epoch": 0.8855672144129876, "grad_norm": 0.9562867283821106, "learning_rate": 5e-05, "llm_loss": 0.6063211262226105, "loss": 2.7395, "loss_aux_layer_0": 0.010711669921875, "loss_aux_layer_1": 0.029449462890625, "loss_aux_layer_10": 0.0552978515625, "loss_aux_layer_11": 0.0592041015625, "loss_aux_layer_12": 0.06341552734375, "loss_aux_layer_13": 0.068603515625, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.0849609375, "loss_aux_layer_16": 0.093994140625, "loss_aux_layer_17": 0.1016845703125, "loss_aux_layer_18": 0.1097412109375, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.04168701171875, "loss_aux_layer_20": 0.1221923828125, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.05078125, "loss_aux_layer_4": 0.05291748046875, "loss_aux_layer_5": 0.05450439453125, "loss_aux_layer_6": 0.05731201171875, "loss_aux_layer_7": 0.05548095703125, "loss_aux_layer_8": 0.05511474609375, "loss_aux_layer_9": 0.05389404296875, "step": 4473, "total_loss": 0.684870183467865 }, { "epoch": 0.8857651950108889, "grad_norm": 0.787199854850769, "learning_rate": 5e-05, "llm_loss": 0.5845668092370033, "loss": 2.6531, "loss_aux_layer_0": 0.0110931396484375, "loss_aux_layer_1": 0.029754638671875, "loss_aux_layer_10": 0.054931640625, "loss_aux_layer_11": 0.05889892578125, "loss_aux_layer_12": 0.06390380859375, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.078369140625, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.0963134765625, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.1116943359375, "loss_aux_layer_19": 0.1151123046875, "loss_aux_layer_2": 0.041015625, "loss_aux_layer_20": 0.122314453125, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.05023193359375, "loss_aux_layer_4": 0.05267333984375, "loss_aux_layer_5": 0.05426025390625, "loss_aux_layer_6": 0.0572509765625, "loss_aux_layer_7": 0.05523681640625, "loss_aux_layer_8": 0.05462646484375, "loss_aux_layer_9": 0.05352783203125, "step": 4474, "total_loss": 0.6632704436779022 }, { "epoch": 0.8859631756087903, "grad_norm": 0.8365973830223083, "learning_rate": 5e-05, "llm_loss": 0.5450292080640793, "loss": 2.4891, "loss_aux_layer_0": 0.010955810546875, "loss_aux_layer_1": 0.0299072265625, "loss_aux_layer_10": 0.0546875, "loss_aux_layer_11": 0.05853271484375, "loss_aux_layer_12": 0.06280517578125, "loss_aux_layer_13": 0.06787109375, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.0936279296875, "loss_aux_layer_17": 0.1015625, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.1121826171875, "loss_aux_layer_2": 0.04132080078125, "loss_aux_layer_20": 0.119384765625, "loss_aux_layer_21": 0.1263427734375, "loss_aux_layer_22": 0.144287109375, "loss_aux_layer_23": 0.179443359375, "loss_aux_layer_3": 0.0509033203125, "loss_aux_layer_4": 0.05322265625, "loss_aux_layer_5": 0.0545654296875, "loss_aux_layer_6": 0.0576171875, "loss_aux_layer_7": 0.05572509765625, "loss_aux_layer_8": 0.0550537109375, "loss_aux_layer_9": 0.05377197265625, "step": 4475, "total_loss": 0.6222858130931854 }, { "epoch": 0.8861611562066918, "grad_norm": 0.7846906781196594, "learning_rate": 5e-05, "llm_loss": 0.5397762507200241, "loss": 2.4785, "loss_aux_layer_0": 0.011383056640625, "loss_aux_layer_1": 0.030731201171875, "loss_aux_layer_10": 0.05755615234375, "loss_aux_layer_11": 0.06170654296875, "loss_aux_layer_12": 0.06591796875, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.0791015625, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.0948486328125, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.1131591796875, "loss_aux_layer_2": 0.0426025390625, "loss_aux_layer_20": 0.120849609375, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.052734375, "loss_aux_layer_4": 0.055419921875, "loss_aux_layer_5": 0.05694580078125, "loss_aux_layer_6": 0.0599365234375, "loss_aux_layer_7": 0.05828857421875, "loss_aux_layer_8": 0.05743408203125, "loss_aux_layer_9": 0.05633544921875, "step": 4476, "total_loss": 0.6196193099021912 }, { "epoch": 0.8863591368045931, "grad_norm": 0.917966902256012, "learning_rate": 5e-05, "llm_loss": 0.5482495576143265, "loss": 2.5041, "loss_aux_layer_0": 0.0110626220703125, "loss_aux_layer_1": 0.028289794921875, "loss_aux_layer_10": 0.05413818359375, "loss_aux_layer_11": 0.0579833984375, "loss_aux_layer_12": 0.0626220703125, "loss_aux_layer_13": 0.068603515625, "loss_aux_layer_14": 0.0771484375, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.095458984375, "loss_aux_layer_17": 0.1031494140625, "loss_aux_layer_18": 0.1112060546875, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.03961181640625, "loss_aux_layer_20": 0.12255859375, "loss_aux_layer_21": 0.1302490234375, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.048828125, "loss_aux_layer_4": 0.0513916015625, "loss_aux_layer_5": 0.052978515625, "loss_aux_layer_6": 0.05560302734375, "loss_aux_layer_7": 0.05401611328125, "loss_aux_layer_8": 0.0537109375, "loss_aux_layer_9": 0.05291748046875, "step": 4477, "total_loss": 0.6260189563035965 }, { "epoch": 0.8865571174024945, "grad_norm": 1.0762251615524292, "learning_rate": 5e-05, "llm_loss": 0.5427085310220718, "loss": 2.4877, "loss_aux_layer_0": 0.0109100341796875, "loss_aux_layer_1": 0.03076171875, "loss_aux_layer_10": 0.05706787109375, "loss_aux_layer_11": 0.06097412109375, "loss_aux_layer_12": 0.06512451171875, "loss_aux_layer_13": 0.0703125, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.1104736328125, "loss_aux_layer_19": 0.113525390625, "loss_aux_layer_2": 0.04241943359375, "loss_aux_layer_20": 0.121337890625, "loss_aux_layer_21": 0.1287841796875, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.052001953125, "loss_aux_layer_4": 0.0546875, "loss_aux_layer_5": 0.05621337890625, "loss_aux_layer_6": 0.05926513671875, "loss_aux_layer_7": 0.05767822265625, "loss_aux_layer_8": 0.0572509765625, "loss_aux_layer_9": 0.0557861328125, "step": 4478, "total_loss": 0.6219243407249451 }, { "epoch": 0.886755098000396, "grad_norm": 0.8309561610221863, "learning_rate": 5e-05, "llm_loss": 0.5052233710885048, "loss": 2.3259, "loss_aux_layer_0": 0.0114593505859375, "loss_aux_layer_1": 0.028717041015625, "loss_aux_layer_10": 0.05413818359375, "loss_aux_layer_11": 0.0577392578125, "loss_aux_layer_12": 0.0618896484375, "loss_aux_layer_13": 0.0670166015625, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.0831298828125, "loss_aux_layer_16": 0.091796875, "loss_aux_layer_17": 0.0989990234375, "loss_aux_layer_18": 0.1070556640625, "loss_aux_layer_19": 0.1103515625, "loss_aux_layer_2": 0.0399169921875, "loss_aux_layer_20": 0.118408203125, "loss_aux_layer_21": 0.1265869140625, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.18115234375, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.05157470703125, "loss_aux_layer_5": 0.0528564453125, "loss_aux_layer_6": 0.055908203125, "loss_aux_layer_7": 0.05401611328125, "loss_aux_layer_8": 0.0538330078125, "loss_aux_layer_9": 0.05291748046875, "step": 4479, "total_loss": 0.5814751088619232 }, { "epoch": 0.8869530785982974, "grad_norm": 1.0333114862442017, "learning_rate": 5e-05, "llm_loss": 0.5428995788097382, "loss": 2.4865, "loss_aux_layer_0": 0.0116424560546875, "loss_aux_layer_1": 0.029205322265625, "loss_aux_layer_10": 0.05517578125, "loss_aux_layer_11": 0.0592041015625, "loss_aux_layer_12": 0.0635986328125, "loss_aux_layer_13": 0.069091796875, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.0860595703125, "loss_aux_layer_16": 0.0953369140625, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.04083251953125, "loss_aux_layer_20": 0.122314453125, "loss_aux_layer_21": 0.1307373046875, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.05035400390625, "loss_aux_layer_4": 0.05279541015625, "loss_aux_layer_5": 0.05450439453125, "loss_aux_layer_6": 0.05731201171875, "loss_aux_layer_7": 0.0555419921875, "loss_aux_layer_8": 0.0550537109375, "loss_aux_layer_9": 0.05401611328125, "step": 4480, "total_loss": 0.6216199398040771 }, { "epoch": 0.8871510591961987, "grad_norm": 0.9806079864501953, "learning_rate": 5e-05, "llm_loss": 0.5843386724591255, "loss": 2.6445, "loss_aux_layer_0": 0.0110931396484375, "loss_aux_layer_1": 0.028411865234375, "loss_aux_layer_10": 0.05426025390625, "loss_aux_layer_11": 0.05792236328125, "loss_aux_layer_12": 0.06207275390625, "loss_aux_layer_13": 0.0670166015625, "loss_aux_layer_14": 0.0748291015625, "loss_aux_layer_15": 0.0826416015625, "loss_aux_layer_16": 0.091796875, "loss_aux_layer_17": 0.1002197265625, "loss_aux_layer_18": 0.108642578125, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.0401611328125, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.1279296875, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.04937744140625, "loss_aux_layer_4": 0.0518798828125, "loss_aux_layer_5": 0.05322265625, "loss_aux_layer_6": 0.05609130859375, "loss_aux_layer_7": 0.05450439453125, "loss_aux_layer_8": 0.053955078125, "loss_aux_layer_9": 0.052978515625, "step": 4481, "total_loss": 0.6611363887786865 }, { "epoch": 0.8873490397941002, "grad_norm": 1.1188181638717651, "learning_rate": 5e-05, "llm_loss": 0.6122518479824066, "loss": 2.7676, "loss_aux_layer_0": 0.011474609375, "loss_aux_layer_1": 0.028839111328125, "loss_aux_layer_10": 0.05621337890625, "loss_aux_layer_11": 0.06011962890625, "loss_aux_layer_12": 0.0645751953125, "loss_aux_layer_13": 0.0699462890625, "loss_aux_layer_14": 0.0789794921875, "loss_aux_layer_15": 0.087158203125, "loss_aux_layer_16": 0.0963134765625, "loss_aux_layer_17": 0.1041259765625, "loss_aux_layer_18": 0.1119384765625, "loss_aux_layer_19": 0.114990234375, "loss_aux_layer_2": 0.041259765625, "loss_aux_layer_20": 0.123046875, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.19091796875, "loss_aux_layer_3": 0.0509033203125, "loss_aux_layer_4": 0.05364990234375, "loss_aux_layer_5": 0.05548095703125, "loss_aux_layer_6": 0.05877685546875, "loss_aux_layer_7": 0.05657958984375, "loss_aux_layer_8": 0.05609130859375, "loss_aux_layer_9": 0.05499267578125, "step": 4482, "total_loss": 0.6918894797563553 }, { "epoch": 0.8875470203920016, "grad_norm": 1.040175437927246, "learning_rate": 5e-05, "llm_loss": 0.5602366775274277, "loss": 2.5572, "loss_aux_layer_0": 0.0111846923828125, "loss_aux_layer_1": 0.030029296875, "loss_aux_layer_10": 0.05615234375, "loss_aux_layer_11": 0.06024169921875, "loss_aux_layer_12": 0.0648193359375, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.07861328125, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.0955810546875, "loss_aux_layer_17": 0.10302734375, "loss_aux_layer_18": 0.11083984375, "loss_aux_layer_19": 0.1143798828125, "loss_aux_layer_2": 0.0413818359375, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.0509033203125, "loss_aux_layer_4": 0.05328369140625, "loss_aux_layer_5": 0.054931640625, "loss_aux_layer_6": 0.05780029296875, "loss_aux_layer_7": 0.05633544921875, "loss_aux_layer_8": 0.055908203125, "loss_aux_layer_9": 0.054931640625, "step": 4483, "total_loss": 0.6393007189035416 }, { "epoch": 0.8877450009899029, "grad_norm": 0.9958299994468689, "learning_rate": 5e-05, "llm_loss": 0.539388082921505, "loss": 2.4731, "loss_aux_layer_0": 0.0105438232421875, "loss_aux_layer_1": 0.028533935546875, "loss_aux_layer_10": 0.055419921875, "loss_aux_layer_11": 0.0592041015625, "loss_aux_layer_12": 0.0635986328125, "loss_aux_layer_13": 0.0689697265625, "loss_aux_layer_14": 0.0775146484375, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.0946044921875, "loss_aux_layer_17": 0.1029052734375, "loss_aux_layer_18": 0.1114501953125, "loss_aux_layer_19": 0.11474609375, "loss_aux_layer_2": 0.04052734375, "loss_aux_layer_20": 0.1229248046875, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.0501708984375, "loss_aux_layer_4": 0.05267333984375, "loss_aux_layer_5": 0.05419921875, "loss_aux_layer_6": 0.0572509765625, "loss_aux_layer_7": 0.0557861328125, "loss_aux_layer_8": 0.0552978515625, "loss_aux_layer_9": 0.0543212890625, "step": 4484, "total_loss": 0.618263378739357 }, { "epoch": 0.8879429815878044, "grad_norm": 1.254579782485962, "learning_rate": 5e-05, "llm_loss": 0.6617480665445328, "loss": 2.9606, "loss_aux_layer_0": 0.01190185546875, "loss_aux_layer_1": 0.0291748046875, "loss_aux_layer_10": 0.0552978515625, "loss_aux_layer_11": 0.0592041015625, "loss_aux_layer_12": 0.063720703125, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.0775146484375, "loss_aux_layer_15": 0.086181640625, "loss_aux_layer_16": 0.0955810546875, "loss_aux_layer_17": 0.103515625, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.04052734375, "loss_aux_layer_20": 0.1214599609375, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.0506591796875, "loss_aux_layer_4": 0.0531005859375, "loss_aux_layer_5": 0.05462646484375, "loss_aux_layer_6": 0.0577392578125, "loss_aux_layer_7": 0.05609130859375, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.05413818359375, "step": 4485, "total_loss": 0.7401625365018845 }, { "epoch": 0.8881409621857058, "grad_norm": 0.9571825265884399, "learning_rate": 5e-05, "llm_loss": 0.5678012669086456, "loss": 2.5747, "loss_aux_layer_0": 0.0117950439453125, "loss_aux_layer_1": 0.027801513671875, "loss_aux_layer_10": 0.052490234375, "loss_aux_layer_11": 0.05609130859375, "loss_aux_layer_12": 0.060546875, "loss_aux_layer_13": 0.0660400390625, "loss_aux_layer_14": 0.07470703125, "loss_aux_layer_15": 0.08349609375, "loss_aux_layer_16": 0.0927734375, "loss_aux_layer_17": 0.100830078125, "loss_aux_layer_18": 0.1085205078125, "loss_aux_layer_19": 0.1119384765625, "loss_aux_layer_2": 0.038330078125, "loss_aux_layer_20": 0.1197509765625, "loss_aux_layer_21": 0.1280517578125, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.04730224609375, "loss_aux_layer_4": 0.049560546875, "loss_aux_layer_5": 0.05096435546875, "loss_aux_layer_6": 0.05401611328125, "loss_aux_layer_7": 0.05230712890625, "loss_aux_layer_8": 0.0517578125, "loss_aux_layer_9": 0.05120849609375, "step": 4486, "total_loss": 0.6436763256788254 }, { "epoch": 0.8883389427836073, "grad_norm": 1.1282061338424683, "learning_rate": 5e-05, "llm_loss": 0.5593791604042053, "loss": 2.5546, "loss_aux_layer_0": 0.011627197265625, "loss_aux_layer_1": 0.029083251953125, "loss_aux_layer_10": 0.05572509765625, "loss_aux_layer_11": 0.05963134765625, "loss_aux_layer_12": 0.0638427734375, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.086669921875, "loss_aux_layer_16": 0.095703125, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.115478515625, "loss_aux_layer_2": 0.04150390625, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.1314697265625, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.051025390625, "loss_aux_layer_4": 0.05340576171875, "loss_aux_layer_5": 0.0552978515625, "loss_aux_layer_6": 0.05804443359375, "loss_aux_layer_7": 0.05621337890625, "loss_aux_layer_8": 0.05572509765625, "loss_aux_layer_9": 0.0545654296875, "step": 4487, "total_loss": 0.6386576816439629 }, { "epoch": 0.8885369233815086, "grad_norm": 0.9118900895118713, "learning_rate": 5e-05, "llm_loss": 0.5514703840017319, "loss": 2.507, "loss_aux_layer_0": 0.0127716064453125, "loss_aux_layer_1": 0.02886962890625, "loss_aux_layer_10": 0.0531005859375, "loss_aux_layer_11": 0.05682373046875, "loss_aux_layer_12": 0.06085205078125, "loss_aux_layer_13": 0.06573486328125, "loss_aux_layer_14": 0.0731201171875, "loss_aux_layer_15": 0.080810546875, "loss_aux_layer_16": 0.089599609375, "loss_aux_layer_17": 0.0970458984375, "loss_aux_layer_18": 0.105224609375, "loss_aux_layer_19": 0.10888671875, "loss_aux_layer_2": 0.03948974609375, "loss_aux_layer_20": 0.116943359375, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.145263671875, "loss_aux_layer_23": 0.181640625, "loss_aux_layer_3": 0.0487060546875, "loss_aux_layer_4": 0.051025390625, "loss_aux_layer_5": 0.05242919921875, "loss_aux_layer_6": 0.05511474609375, "loss_aux_layer_7": 0.0533447265625, "loss_aux_layer_8": 0.052978515625, "loss_aux_layer_9": 0.05206298828125, "step": 4488, "total_loss": 0.6267519220709801 }, { "epoch": 0.88873490397941, "grad_norm": 0.9459208846092224, "learning_rate": 5e-05, "llm_loss": 0.5165117532014847, "loss": 2.3918, "loss_aux_layer_0": 0.012786865234375, "loss_aux_layer_1": 0.03045654296875, "loss_aux_layer_10": 0.05810546875, "loss_aux_layer_11": 0.06231689453125, "loss_aux_layer_12": 0.066650390625, "loss_aux_layer_13": 0.072021484375, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.0885009765625, "loss_aux_layer_16": 0.0975341796875, "loss_aux_layer_17": 0.10546875, "loss_aux_layer_18": 0.11376953125, "loss_aux_layer_19": 0.1171875, "loss_aux_layer_2": 0.0430908203125, "loss_aux_layer_20": 0.125244140625, "loss_aux_layer_21": 0.133544921875, "loss_aux_layer_22": 0.15576171875, "loss_aux_layer_23": 0.19287109375, "loss_aux_layer_3": 0.0528564453125, "loss_aux_layer_4": 0.05535888671875, "loss_aux_layer_5": 0.0570068359375, "loss_aux_layer_6": 0.06036376953125, "loss_aux_layer_7": 0.0584716796875, "loss_aux_layer_8": 0.057861328125, "loss_aux_layer_9": 0.05694580078125, "step": 4489, "total_loss": 0.5979381799697876 }, { "epoch": 0.8889328845773115, "grad_norm": 0.9677485227584839, "learning_rate": 5e-05, "llm_loss": 0.502480149269104, "loss": 2.3126, "loss_aux_layer_0": 0.0117340087890625, "loss_aux_layer_1": 0.028167724609375, "loss_aux_layer_10": 0.05242919921875, "loss_aux_layer_11": 0.05615234375, "loss_aux_layer_12": 0.06036376953125, "loss_aux_layer_13": 0.0654296875, "loss_aux_layer_14": 0.07373046875, "loss_aux_layer_15": 0.08203125, "loss_aux_layer_16": 0.0911865234375, "loss_aux_layer_17": 0.0992431640625, "loss_aux_layer_18": 0.107177734375, "loss_aux_layer_19": 0.1104736328125, "loss_aux_layer_2": 0.03924560546875, "loss_aux_layer_20": 0.1185302734375, "loss_aux_layer_21": 0.1270751953125, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.04833984375, "loss_aux_layer_4": 0.05047607421875, "loss_aux_layer_5": 0.0518798828125, "loss_aux_layer_6": 0.0545654296875, "loss_aux_layer_7": 0.05291748046875, "loss_aux_layer_8": 0.05230712890625, "loss_aux_layer_9": 0.05120849609375, "step": 4490, "total_loss": 0.5781483203172684 }, { "epoch": 0.8891308651752128, "grad_norm": 0.901960015296936, "learning_rate": 5e-05, "llm_loss": 0.6058710813522339, "loss": 2.7375, "loss_aux_layer_0": 0.013702392578125, "loss_aux_layer_1": 0.03070068359375, "loss_aux_layer_10": 0.05657958984375, "loss_aux_layer_11": 0.0601806640625, "loss_aux_layer_12": 0.06427001953125, "loss_aux_layer_13": 0.069580078125, "loss_aux_layer_14": 0.0772705078125, "loss_aux_layer_15": 0.0849609375, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.101318359375, "loss_aux_layer_18": 0.109130859375, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.04229736328125, "loss_aux_layer_20": 0.1192626953125, "loss_aux_layer_21": 0.1273193359375, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.05230712890625, "loss_aux_layer_4": 0.05487060546875, "loss_aux_layer_5": 0.056396484375, "loss_aux_layer_6": 0.0592041015625, "loss_aux_layer_7": 0.05743408203125, "loss_aux_layer_8": 0.056884765625, "loss_aux_layer_9": 0.05548095703125, "step": 4491, "total_loss": 0.6843631565570831 }, { "epoch": 0.8893288457731142, "grad_norm": 0.9298076033592224, "learning_rate": 5e-05, "llm_loss": 0.582073301076889, "loss": 2.6495, "loss_aux_layer_0": 0.0106353759765625, "loss_aux_layer_1": 0.0296630859375, "loss_aux_layer_10": 0.0576171875, "loss_aux_layer_11": 0.06195068359375, "loss_aux_layer_12": 0.06634521484375, "loss_aux_layer_13": 0.071533203125, "loss_aux_layer_14": 0.0799560546875, "loss_aux_layer_15": 0.0885009765625, "loss_aux_layer_16": 0.097412109375, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.116455078125, "loss_aux_layer_2": 0.04193115234375, "loss_aux_layer_20": 0.1239013671875, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.18603515625, "loss_aux_layer_3": 0.05206298828125, "loss_aux_layer_4": 0.0548095703125, "loss_aux_layer_5": 0.0562744140625, "loss_aux_layer_6": 0.05963134765625, "loss_aux_layer_7": 0.0577392578125, "loss_aux_layer_8": 0.05731201171875, "loss_aux_layer_9": 0.05615234375, "step": 4492, "total_loss": 0.6623682230710983 }, { "epoch": 0.8895268263710157, "grad_norm": 0.993705153465271, "learning_rate": 5e-05, "llm_loss": 0.6133993566036224, "loss": 2.7681, "loss_aux_layer_0": 0.0122833251953125, "loss_aux_layer_1": 0.02984619140625, "loss_aux_layer_10": 0.0567626953125, "loss_aux_layer_11": 0.0606689453125, "loss_aux_layer_12": 0.0648193359375, "loss_aux_layer_13": 0.0699462890625, "loss_aux_layer_14": 0.078125, "loss_aux_layer_15": 0.0859375, "loss_aux_layer_16": 0.0948486328125, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.110107421875, "loss_aux_layer_19": 0.11279296875, "loss_aux_layer_2": 0.04119873046875, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.127685546875, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.05096435546875, "loss_aux_layer_4": 0.05352783203125, "loss_aux_layer_5": 0.05535888671875, "loss_aux_layer_6": 0.05865478515625, "loss_aux_layer_7": 0.0567626953125, "loss_aux_layer_8": 0.056396484375, "loss_aux_layer_9": 0.0552978515625, "step": 4493, "total_loss": 0.6920273900032043 }, { "epoch": 0.8897248069689171, "grad_norm": 0.9929487109184265, "learning_rate": 5e-05, "llm_loss": 0.6292399317026138, "loss": 2.8274, "loss_aux_layer_0": 0.0116119384765625, "loss_aux_layer_1": 0.0286865234375, "loss_aux_layer_10": 0.05419921875, "loss_aux_layer_11": 0.05804443359375, "loss_aux_layer_12": 0.0625, "loss_aux_layer_13": 0.067626953125, "loss_aux_layer_14": 0.076171875, "loss_aux_layer_15": 0.084228515625, "loss_aux_layer_16": 0.093994140625, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.1103515625, "loss_aux_layer_19": 0.11376953125, "loss_aux_layer_2": 0.03961181640625, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.048828125, "loss_aux_layer_4": 0.05133056640625, "loss_aux_layer_5": 0.0531005859375, "loss_aux_layer_6": 0.05615234375, "loss_aux_layer_7": 0.054443359375, "loss_aux_layer_8": 0.05389404296875, "loss_aux_layer_9": 0.05291748046875, "step": 4494, "total_loss": 0.7068386673927307 }, { "epoch": 0.8899227875668184, "grad_norm": 0.9013938307762146, "learning_rate": 5e-05, "llm_loss": 0.5986486226320267, "loss": 2.7027, "loss_aux_layer_0": 0.0108642578125, "loss_aux_layer_1": 0.028106689453125, "loss_aux_layer_10": 0.05426025390625, "loss_aux_layer_11": 0.05780029296875, "loss_aux_layer_12": 0.0621337890625, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.0755615234375, "loss_aux_layer_15": 0.08349609375, "loss_aux_layer_16": 0.0926513671875, "loss_aux_layer_17": 0.1005859375, "loss_aux_layer_18": 0.109130859375, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.039794921875, "loss_aux_layer_20": 0.120849609375, "loss_aux_layer_21": 0.1290283203125, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.04925537109375, "loss_aux_layer_4": 0.05145263671875, "loss_aux_layer_5": 0.052978515625, "loss_aux_layer_6": 0.055908203125, "loss_aux_layer_7": 0.0543212890625, "loss_aux_layer_8": 0.0540771484375, "loss_aux_layer_9": 0.05303955078125, "step": 4495, "total_loss": 0.6756756156682968 }, { "epoch": 0.8901207681647199, "grad_norm": 0.8700937628746033, "learning_rate": 5e-05, "llm_loss": 0.584015354514122, "loss": 2.6431, "loss_aux_layer_0": 0.0102386474609375, "loss_aux_layer_1": 0.029052734375, "loss_aux_layer_10": 0.05462646484375, "loss_aux_layer_11": 0.0584716796875, "loss_aux_layer_12": 0.062744140625, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.0762939453125, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.0931396484375, "loss_aux_layer_17": 0.100341796875, "loss_aux_layer_18": 0.108642578125, "loss_aux_layer_19": 0.1114501953125, "loss_aux_layer_2": 0.0404052734375, "loss_aux_layer_20": 0.118408203125, "loss_aux_layer_21": 0.125732421875, "loss_aux_layer_22": 0.144775390625, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.0496826171875, "loss_aux_layer_4": 0.05224609375, "loss_aux_layer_5": 0.0538330078125, "loss_aux_layer_6": 0.05682373046875, "loss_aux_layer_7": 0.05517578125, "loss_aux_layer_8": 0.0545654296875, "loss_aux_layer_9": 0.05352783203125, "step": 4496, "total_loss": 0.6607827991247177 }, { "epoch": 0.8903187487626213, "grad_norm": 0.8671864867210388, "learning_rate": 5e-05, "llm_loss": 0.4965970814228058, "loss": 2.2903, "loss_aux_layer_0": 0.010711669921875, "loss_aux_layer_1": 0.028717041015625, "loss_aux_layer_10": 0.05377197265625, "loss_aux_layer_11": 0.0570068359375, "loss_aux_layer_12": 0.06097412109375, "loss_aux_layer_13": 0.06610107421875, "loss_aux_layer_14": 0.0740966796875, "loss_aux_layer_15": 0.0821533203125, "loss_aux_layer_16": 0.091064453125, "loss_aux_layer_17": 0.0986328125, "loss_aux_layer_18": 0.10693359375, "loss_aux_layer_19": 0.1107177734375, "loss_aux_layer_2": 0.04071044921875, "loss_aux_layer_20": 0.118408203125, "loss_aux_layer_21": 0.1256103515625, "loss_aux_layer_22": 0.144775390625, "loss_aux_layer_23": 0.180419921875, "loss_aux_layer_3": 0.04974365234375, "loss_aux_layer_4": 0.052001953125, "loss_aux_layer_5": 0.05328369140625, "loss_aux_layer_6": 0.05609130859375, "loss_aux_layer_7": 0.0543212890625, "loss_aux_layer_8": 0.05401611328125, "loss_aux_layer_9": 0.05279541015625, "step": 4497, "total_loss": 0.572579026222229 }, { "epoch": 0.8905167293605226, "grad_norm": 0.8985645771026611, "learning_rate": 5e-05, "llm_loss": 0.5701476335525513, "loss": 2.5883, "loss_aux_layer_0": 0.0104827880859375, "loss_aux_layer_1": 0.029083251953125, "loss_aux_layer_10": 0.054443359375, "loss_aux_layer_11": 0.0582275390625, "loss_aux_layer_12": 0.0625, "loss_aux_layer_13": 0.0673828125, "loss_aux_layer_14": 0.0753173828125, "loss_aux_layer_15": 0.08349609375, "loss_aux_layer_16": 0.092529296875, "loss_aux_layer_17": 0.100341796875, "loss_aux_layer_18": 0.1085205078125, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.040283203125, "loss_aux_layer_20": 0.1202392578125, "loss_aux_layer_21": 0.1280517578125, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.181640625, "loss_aux_layer_3": 0.04962158203125, "loss_aux_layer_4": 0.05224609375, "loss_aux_layer_5": 0.0535888671875, "loss_aux_layer_6": 0.056640625, "loss_aux_layer_7": 0.05499267578125, "loss_aux_layer_8": 0.05450439453125, "loss_aux_layer_9": 0.05340576171875, "step": 4498, "total_loss": 0.6470756977796555 }, { "epoch": 0.8907147099584241, "grad_norm": 0.8972131013870239, "learning_rate": 5e-05, "llm_loss": 0.5448485687375069, "loss": 2.492, "loss_aux_layer_0": 0.010711669921875, "loss_aux_layer_1": 0.029510498046875, "loss_aux_layer_10": 0.05596923828125, "loss_aux_layer_11": 0.05975341796875, "loss_aux_layer_12": 0.06396484375, "loss_aux_layer_13": 0.06890869140625, "loss_aux_layer_14": 0.076904296875, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.1015625, "loss_aux_layer_18": 0.109619140625, "loss_aux_layer_19": 0.11279296875, "loss_aux_layer_2": 0.0418701171875, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.128173828125, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.05126953125, "loss_aux_layer_4": 0.0535888671875, "loss_aux_layer_5": 0.05499267578125, "loss_aux_layer_6": 0.05804443359375, "loss_aux_layer_7": 0.056396484375, "loss_aux_layer_8": 0.0557861328125, "loss_aux_layer_9": 0.0548095703125, "step": 4499, "total_loss": 0.6229944676160812 }, { "epoch": 0.8909126905563255, "grad_norm": 0.8456436991691589, "learning_rate": 5e-05, "llm_loss": 0.5196279287338257, "loss": 2.3899, "loss_aux_layer_0": 0.010528564453125, "loss_aux_layer_1": 0.028472900390625, "loss_aux_layer_10": 0.0552978515625, "loss_aux_layer_11": 0.05926513671875, "loss_aux_layer_12": 0.063720703125, "loss_aux_layer_13": 0.0687255859375, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.0938720703125, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.109130859375, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.04071044921875, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.0498046875, "loss_aux_layer_4": 0.052490234375, "loss_aux_layer_5": 0.05401611328125, "loss_aux_layer_6": 0.05718994140625, "loss_aux_layer_7": 0.05572509765625, "loss_aux_layer_8": 0.05523681640625, "loss_aux_layer_9": 0.05419921875, "step": 4500, "total_loss": 0.5974669605493546 }, { "epoch": 0.8911106711542269, "grad_norm": 1.0223259925842285, "learning_rate": 5e-05, "llm_loss": 0.5946308821439743, "loss": 2.6785, "loss_aux_layer_0": 0.011444091796875, "loss_aux_layer_1": 0.027435302734375, "loss_aux_layer_10": 0.05224609375, "loss_aux_layer_11": 0.05584716796875, "loss_aux_layer_12": 0.06011962890625, "loss_aux_layer_13": 0.0653076171875, "loss_aux_layer_14": 0.0728759765625, "loss_aux_layer_15": 0.080810546875, "loss_aux_layer_16": 0.0894775390625, "loss_aux_layer_17": 0.0975341796875, "loss_aux_layer_18": 0.105712890625, "loss_aux_layer_19": 0.110107421875, "loss_aux_layer_2": 0.0384521484375, "loss_aux_layer_20": 0.1182861328125, "loss_aux_layer_21": 0.12646484375, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.18310546875, "loss_aux_layer_3": 0.047119140625, "loss_aux_layer_4": 0.04974365234375, "loss_aux_layer_5": 0.05133056640625, "loss_aux_layer_6": 0.05401611328125, "loss_aux_layer_7": 0.052490234375, "loss_aux_layer_8": 0.052001953125, "loss_aux_layer_9": 0.05096435546875, "step": 4501, "total_loss": 0.6696201264858246 }, { "epoch": 0.8913086517521283, "grad_norm": 0.9957889318466187, "learning_rate": 5e-05, "llm_loss": 0.6164516657590866, "loss": 2.7819, "loss_aux_layer_0": 0.010467529296875, "loss_aux_layer_1": 0.029541015625, "loss_aux_layer_10": 0.05572509765625, "loss_aux_layer_11": 0.0595703125, "loss_aux_layer_12": 0.06396484375, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.077392578125, "loss_aux_layer_15": 0.0855712890625, "loss_aux_layer_16": 0.0946044921875, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.110595703125, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.0416259765625, "loss_aux_layer_20": 0.12255859375, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.05108642578125, "loss_aux_layer_4": 0.05364990234375, "loss_aux_layer_5": 0.05517578125, "loss_aux_layer_6": 0.05816650390625, "loss_aux_layer_7": 0.05615234375, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.05450439453125, "step": 4502, "total_loss": 0.6954731345176697 }, { "epoch": 0.8915066323500297, "grad_norm": 0.8645437359809875, "learning_rate": 5e-05, "llm_loss": 0.5151235908269882, "loss": 2.3597, "loss_aux_layer_0": 0.011962890625, "loss_aux_layer_1": 0.027984619140625, "loss_aux_layer_10": 0.052001953125, "loss_aux_layer_11": 0.05535888671875, "loss_aux_layer_12": 0.05950927734375, "loss_aux_layer_13": 0.064453125, "loss_aux_layer_14": 0.072265625, "loss_aux_layer_15": 0.080322265625, "loss_aux_layer_16": 0.0888671875, "loss_aux_layer_17": 0.096923828125, "loss_aux_layer_18": 0.10498046875, "loss_aux_layer_19": 0.109375, "loss_aux_layer_2": 0.0391845703125, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.1260986328125, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.181396484375, "loss_aux_layer_3": 0.04840087890625, "loss_aux_layer_4": 0.05078125, "loss_aux_layer_5": 0.0518798828125, "loss_aux_layer_6": 0.05438232421875, "loss_aux_layer_7": 0.05242919921875, "loss_aux_layer_8": 0.05194091796875, "loss_aux_layer_9": 0.05084228515625, "step": 4503, "total_loss": 0.5899325758218765 }, { "epoch": 0.8917046129479311, "grad_norm": 0.8173264265060425, "learning_rate": 5e-05, "llm_loss": 0.5821829810738564, "loss": 2.633, "loss_aux_layer_0": 0.0103607177734375, "loss_aux_layer_1": 0.028076171875, "loss_aux_layer_10": 0.05255126953125, "loss_aux_layer_11": 0.05633544921875, "loss_aux_layer_12": 0.0606689453125, "loss_aux_layer_13": 0.0657958984375, "loss_aux_layer_14": 0.0738525390625, "loss_aux_layer_15": 0.0821533203125, "loss_aux_layer_16": 0.09130859375, "loss_aux_layer_17": 0.0997314453125, "loss_aux_layer_18": 0.108154296875, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.0389404296875, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.04791259765625, "loss_aux_layer_4": 0.05035400390625, "loss_aux_layer_5": 0.0517578125, "loss_aux_layer_6": 0.0545654296875, "loss_aux_layer_7": 0.0528564453125, "loss_aux_layer_8": 0.0526123046875, "loss_aux_layer_9": 0.05133056640625, "step": 4504, "total_loss": 0.6582399308681488 }, { "epoch": 0.8919025935458325, "grad_norm": 0.8441969752311707, "learning_rate": 5e-05, "llm_loss": 0.5097963213920593, "loss": 2.357, "loss_aux_layer_0": 0.0115203857421875, "loss_aux_layer_1": 0.029876708984375, "loss_aux_layer_10": 0.05718994140625, "loss_aux_layer_11": 0.0611572265625, "loss_aux_layer_12": 0.0654296875, "loss_aux_layer_13": 0.0704345703125, "loss_aux_layer_14": 0.07861328125, "loss_aux_layer_15": 0.086669921875, "loss_aux_layer_16": 0.0955810546875, "loss_aux_layer_17": 0.1029052734375, "loss_aux_layer_18": 0.111083984375, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.04229736328125, "loss_aux_layer_20": 0.1217041015625, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.05206298828125, "loss_aux_layer_4": 0.0548095703125, "loss_aux_layer_5": 0.05633544921875, "loss_aux_layer_6": 0.059326171875, "loss_aux_layer_7": 0.05780029296875, "loss_aux_layer_8": 0.05731201171875, "loss_aux_layer_9": 0.05596923828125, "step": 4505, "total_loss": 0.5892585963010788 }, { "epoch": 0.8921005741437339, "grad_norm": 0.8676524758338928, "learning_rate": 5e-05, "llm_loss": 0.5251391381025314, "loss": 2.4121, "loss_aux_layer_0": 0.0106353759765625, "loss_aux_layer_1": 0.029022216796875, "loss_aux_layer_10": 0.05419921875, "loss_aux_layer_11": 0.05810546875, "loss_aux_layer_12": 0.06243896484375, "loss_aux_layer_13": 0.06768798828125, "loss_aux_layer_14": 0.076171875, "loss_aux_layer_15": 0.0843505859375, "loss_aux_layer_16": 0.093994140625, "loss_aux_layer_17": 0.1019287109375, "loss_aux_layer_18": 0.1102294921875, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.04107666015625, "loss_aux_layer_20": 0.1221923828125, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.050048828125, "loss_aux_layer_4": 0.05242919921875, "loss_aux_layer_5": 0.05377197265625, "loss_aux_layer_6": 0.05621337890625, "loss_aux_layer_7": 0.0543212890625, "loss_aux_layer_8": 0.05389404296875, "loss_aux_layer_9": 0.0528564453125, "step": 4506, "total_loss": 0.603030800819397 }, { "epoch": 0.8922985547416353, "grad_norm": 0.8047788739204407, "learning_rate": 5e-05, "llm_loss": 0.567937895655632, "loss": 2.5802, "loss_aux_layer_0": 0.01080322265625, "loss_aux_layer_1": 0.0289306640625, "loss_aux_layer_10": 0.05426025390625, "loss_aux_layer_11": 0.05804443359375, "loss_aux_layer_12": 0.06219482421875, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.0926513671875, "loss_aux_layer_17": 0.10009765625, "loss_aux_layer_18": 0.1083984375, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.04058837890625, "loss_aux_layer_20": 0.11962890625, "loss_aux_layer_21": 0.127685546875, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.05010986328125, "loss_aux_layer_4": 0.05242919921875, "loss_aux_layer_5": 0.05401611328125, "loss_aux_layer_6": 0.05670166015625, "loss_aux_layer_7": 0.0548095703125, "loss_aux_layer_8": 0.05419921875, "loss_aux_layer_9": 0.05316162109375, "step": 4507, "total_loss": 0.6450487226247787 }, { "epoch": 0.8924965353395368, "grad_norm": 0.7778618931770325, "learning_rate": 5e-05, "llm_loss": 0.6017442792654037, "loss": 2.7231, "loss_aux_layer_0": 0.010528564453125, "loss_aux_layer_1": 0.030242919921875, "loss_aux_layer_10": 0.056884765625, "loss_aux_layer_11": 0.060791015625, "loss_aux_layer_12": 0.06500244140625, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.0849609375, "loss_aux_layer_16": 0.0936279296875, "loss_aux_layer_17": 0.100830078125, "loss_aux_layer_18": 0.109375, "loss_aux_layer_19": 0.11328125, "loss_aux_layer_2": 0.0418701171875, "loss_aux_layer_20": 0.1212158203125, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.0517578125, "loss_aux_layer_4": 0.05426025390625, "loss_aux_layer_5": 0.055908203125, "loss_aux_layer_6": 0.058837890625, "loss_aux_layer_7": 0.057373046875, "loss_aux_layer_8": 0.056884765625, "loss_aux_layer_9": 0.05560302734375, "step": 4508, "total_loss": 0.6807841509580612 }, { "epoch": 0.8926945159374381, "grad_norm": 0.8146283626556396, "learning_rate": 5e-05, "llm_loss": 0.5696222931146622, "loss": 2.5793, "loss_aux_layer_0": 0.0107421875, "loss_aux_layer_1": 0.02825927734375, "loss_aux_layer_10": 0.05322265625, "loss_aux_layer_11": 0.0567626953125, "loss_aux_layer_12": 0.06085205078125, "loss_aux_layer_13": 0.0657958984375, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.0816650390625, "loss_aux_layer_16": 0.0904541015625, "loss_aux_layer_17": 0.0985107421875, "loss_aux_layer_18": 0.1065673828125, "loss_aux_layer_19": 0.109619140625, "loss_aux_layer_2": 0.0390625, "loss_aux_layer_20": 0.1175537109375, "loss_aux_layer_21": 0.124755859375, "loss_aux_layer_22": 0.14404296875, "loss_aux_layer_23": 0.1787109375, "loss_aux_layer_3": 0.04833984375, "loss_aux_layer_4": 0.05078125, "loss_aux_layer_5": 0.0523681640625, "loss_aux_layer_6": 0.05511474609375, "loss_aux_layer_7": 0.05316162109375, "loss_aux_layer_8": 0.0526123046875, "loss_aux_layer_9": 0.0517578125, "step": 4509, "total_loss": 0.644836038351059 }, { "epoch": 0.8928924965353395, "grad_norm": 0.7461328506469727, "learning_rate": 5e-05, "llm_loss": 0.6343983858823776, "loss": 2.8488, "loss_aux_layer_0": 0.010040283203125, "loss_aux_layer_1": 0.028717041015625, "loss_aux_layer_10": 0.05535888671875, "loss_aux_layer_11": 0.05908203125, "loss_aux_layer_12": 0.06341552734375, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.076171875, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.093017578125, "loss_aux_layer_17": 0.100830078125, "loss_aux_layer_18": 0.108642578125, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.0406494140625, "loss_aux_layer_20": 0.120849609375, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.05029296875, "loss_aux_layer_4": 0.05267333984375, "loss_aux_layer_5": 0.0543212890625, "loss_aux_layer_6": 0.05718994140625, "loss_aux_layer_7": 0.05560302734375, "loss_aux_layer_8": 0.05523681640625, "loss_aux_layer_9": 0.0540771484375, "step": 4510, "total_loss": 0.7121903747320175 }, { "epoch": 0.893090477133241, "grad_norm": 0.7292087078094482, "learning_rate": 5e-05, "llm_loss": 0.5642184913158417, "loss": 2.5518, "loss_aux_layer_0": 0.010406494140625, "loss_aux_layer_1": 0.02679443359375, "loss_aux_layer_10": 0.0511474609375, "loss_aux_layer_11": 0.05462646484375, "loss_aux_layer_12": 0.0584716796875, "loss_aux_layer_13": 0.063232421875, "loss_aux_layer_14": 0.0712890625, "loss_aux_layer_15": 0.0791015625, "loss_aux_layer_16": 0.08837890625, "loss_aux_layer_17": 0.09619140625, "loss_aux_layer_18": 0.1046142578125, "loss_aux_layer_19": 0.1082763671875, "loss_aux_layer_2": 0.03759765625, "loss_aux_layer_20": 0.1168212890625, "loss_aux_layer_21": 0.125, "loss_aux_layer_22": 0.14453125, "loss_aux_layer_23": 0.18017578125, "loss_aux_layer_3": 0.0465087890625, "loss_aux_layer_4": 0.048828125, "loss_aux_layer_5": 0.0504150390625, "loss_aux_layer_6": 0.0533447265625, "loss_aux_layer_7": 0.05169677734375, "loss_aux_layer_8": 0.05126953125, "loss_aux_layer_9": 0.05029296875, "step": 4511, "total_loss": 0.6379609704017639 }, { "epoch": 0.8932884577311423, "grad_norm": 0.8670387864112854, "learning_rate": 5e-05, "llm_loss": 0.5755843222141266, "loss": 2.6204, "loss_aux_layer_0": 0.009979248046875, "loss_aux_layer_1": 0.03021240234375, "loss_aux_layer_10": 0.05694580078125, "loss_aux_layer_11": 0.0609130859375, "loss_aux_layer_12": 0.06549072265625, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.0791015625, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.0960693359375, "loss_aux_layer_17": 0.1036376953125, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.11376953125, "loss_aux_layer_2": 0.04248046875, "loss_aux_layer_20": 0.121337890625, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.05224609375, "loss_aux_layer_4": 0.05499267578125, "loss_aux_layer_5": 0.056396484375, "loss_aux_layer_6": 0.05950927734375, "loss_aux_layer_7": 0.05767822265625, "loss_aux_layer_8": 0.05682373046875, "loss_aux_layer_9": 0.0556640625, "step": 4512, "total_loss": 0.6550935730338097 }, { "epoch": 0.8934864383290437, "grad_norm": 0.8466463088989258, "learning_rate": 5e-05, "llm_loss": 0.5208227187395096, "loss": 2.4076, "loss_aux_layer_0": 0.01092529296875, "loss_aux_layer_1": 0.0306396484375, "loss_aux_layer_10": 0.05859375, "loss_aux_layer_11": 0.06207275390625, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.0797119140625, "loss_aux_layer_15": 0.087890625, "loss_aux_layer_16": 0.09716796875, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.112060546875, "loss_aux_layer_19": 0.11572265625, "loss_aux_layer_2": 0.04345703125, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.05340576171875, "loss_aux_layer_4": 0.05609130859375, "loss_aux_layer_5": 0.0577392578125, "loss_aux_layer_6": 0.06085205078125, "loss_aux_layer_7": 0.0589599609375, "loss_aux_layer_8": 0.058349609375, "loss_aux_layer_9": 0.05718994140625, "step": 4513, "total_loss": 0.6018931269645691 }, { "epoch": 0.8936844189269452, "grad_norm": 0.9786208271980286, "learning_rate": 5e-05, "llm_loss": 0.5186992287635803, "loss": 2.3778, "loss_aux_layer_0": 0.01019287109375, "loss_aux_layer_1": 0.027008056640625, "loss_aux_layer_10": 0.0528564453125, "loss_aux_layer_11": 0.05670166015625, "loss_aux_layer_12": 0.06060791015625, "loss_aux_layer_13": 0.0660400390625, "loss_aux_layer_14": 0.0745849609375, "loss_aux_layer_15": 0.082763671875, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.0997314453125, "loss_aux_layer_18": 0.108154296875, "loss_aux_layer_19": 0.111572265625, "loss_aux_layer_2": 0.037841796875, "loss_aux_layer_20": 0.1197509765625, "loss_aux_layer_21": 0.128173828125, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.04681396484375, "loss_aux_layer_4": 0.04931640625, "loss_aux_layer_5": 0.051025390625, "loss_aux_layer_6": 0.053955078125, "loss_aux_layer_7": 0.0523681640625, "loss_aux_layer_8": 0.0521240234375, "loss_aux_layer_9": 0.05145263671875, "step": 4514, "total_loss": 0.5944492667913437 }, { "epoch": 0.8938823995248466, "grad_norm": 0.857431948184967, "learning_rate": 5e-05, "llm_loss": 0.6229645013809204, "loss": 2.8027, "loss_aux_layer_0": 0.010101318359375, "loss_aux_layer_1": 0.029266357421875, "loss_aux_layer_10": 0.05517578125, "loss_aux_layer_11": 0.05914306640625, "loss_aux_layer_12": 0.0631103515625, "loss_aux_layer_13": 0.0679931640625, "loss_aux_layer_14": 0.0760498046875, "loss_aux_layer_15": 0.0843505859375, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.100830078125, "loss_aux_layer_18": 0.1085205078125, "loss_aux_layer_19": 0.1121826171875, "loss_aux_layer_2": 0.04107666015625, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.05047607421875, "loss_aux_layer_4": 0.05316162109375, "loss_aux_layer_5": 0.05462646484375, "loss_aux_layer_6": 0.05731201171875, "loss_aux_layer_7": 0.0555419921875, "loss_aux_layer_8": 0.05517578125, "loss_aux_layer_9": 0.05413818359375, "step": 4515, "total_loss": 0.7006766498088837 }, { "epoch": 0.8940803801227479, "grad_norm": 0.833665132522583, "learning_rate": 5e-05, "llm_loss": 0.5161555036902428, "loss": 2.3774, "loss_aux_layer_0": 0.0107421875, "loss_aux_layer_1": 0.02880859375, "loss_aux_layer_10": 0.054443359375, "loss_aux_layer_11": 0.058349609375, "loss_aux_layer_12": 0.06243896484375, "loss_aux_layer_13": 0.067626953125, "loss_aux_layer_14": 0.075927734375, "loss_aux_layer_15": 0.084716796875, "loss_aux_layer_16": 0.093994140625, "loss_aux_layer_17": 0.1016845703125, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.04022216796875, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.049560546875, "loss_aux_layer_4": 0.05218505859375, "loss_aux_layer_5": 0.0538330078125, "loss_aux_layer_6": 0.05682373046875, "loss_aux_layer_7": 0.054931640625, "loss_aux_layer_8": 0.05450439453125, "loss_aux_layer_9": 0.0533447265625, "step": 4516, "total_loss": 0.5943398177623749 }, { "epoch": 0.8942783607206494, "grad_norm": 0.8029161095619202, "learning_rate": 5e-05, "llm_loss": 0.5322107598185539, "loss": 2.4452, "loss_aux_layer_0": 0.0099639892578125, "loss_aux_layer_1": 0.029266357421875, "loss_aux_layer_10": 0.0562744140625, "loss_aux_layer_11": 0.06011962890625, "loss_aux_layer_12": 0.0645751953125, "loss_aux_layer_13": 0.0699462890625, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.095703125, "loss_aux_layer_17": 0.1033935546875, "loss_aux_layer_18": 0.11181640625, "loss_aux_layer_19": 0.115234375, "loss_aux_layer_2": 0.04150390625, "loss_aux_layer_20": 0.123046875, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.0511474609375, "loss_aux_layer_4": 0.0535888671875, "loss_aux_layer_5": 0.05523681640625, "loss_aux_layer_6": 0.05828857421875, "loss_aux_layer_7": 0.05657958984375, "loss_aux_layer_8": 0.05609130859375, "loss_aux_layer_9": 0.05511474609375, "step": 4517, "total_loss": 0.6113029345870018 }, { "epoch": 0.8944763413185508, "grad_norm": 0.8249316811561584, "learning_rate": 5e-05, "llm_loss": 0.4953194931149483, "loss": 2.295, "loss_aux_layer_0": 0.009857177734375, "loss_aux_layer_1": 0.029754638671875, "loss_aux_layer_10": 0.0557861328125, "loss_aux_layer_11": 0.05987548828125, "loss_aux_layer_12": 0.064453125, "loss_aux_layer_13": 0.06982421875, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.0947265625, "loss_aux_layer_17": 0.102294921875, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.04150390625, "loss_aux_layer_20": 0.120849609375, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.0511474609375, "loss_aux_layer_4": 0.05364990234375, "loss_aux_layer_5": 0.054931640625, "loss_aux_layer_6": 0.0574951171875, "loss_aux_layer_7": 0.055908203125, "loss_aux_layer_8": 0.05548095703125, "loss_aux_layer_9": 0.05438232421875, "step": 4518, "total_loss": 0.573749378323555 }, { "epoch": 0.8946743219164522, "grad_norm": 0.7450066208839417, "learning_rate": 5e-05, "llm_loss": 0.5344069078564644, "loss": 2.4475, "loss_aux_layer_0": 0.0106658935546875, "loss_aux_layer_1": 0.028289794921875, "loss_aux_layer_10": 0.05487060546875, "loss_aux_layer_11": 0.05865478515625, "loss_aux_layer_12": 0.0628662109375, "loss_aux_layer_13": 0.06805419921875, "loss_aux_layer_14": 0.076416015625, "loss_aux_layer_15": 0.08447265625, "loss_aux_layer_16": 0.09375, "loss_aux_layer_17": 0.1016845703125, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.0394287109375, "loss_aux_layer_20": 0.1207275390625, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.0487060546875, "loss_aux_layer_4": 0.05145263671875, "loss_aux_layer_5": 0.0533447265625, "loss_aux_layer_6": 0.056396484375, "loss_aux_layer_7": 0.0548095703125, "loss_aux_layer_8": 0.05462646484375, "loss_aux_layer_9": 0.0535888671875, "step": 4519, "total_loss": 0.6118832603096962 }, { "epoch": 0.8948723025143536, "grad_norm": 0.9894436001777649, "learning_rate": 5e-05, "llm_loss": 0.6225695461034775, "loss": 2.8034, "loss_aux_layer_0": 0.0099639892578125, "loss_aux_layer_1": 0.02947998046875, "loss_aux_layer_10": 0.05517578125, "loss_aux_layer_11": 0.05914306640625, "loss_aux_layer_12": 0.0635986328125, "loss_aux_layer_13": 0.0689697265625, "loss_aux_layer_14": 0.0770263671875, "loss_aux_layer_15": 0.0850830078125, "loss_aux_layer_16": 0.09423828125, "loss_aux_layer_17": 0.10205078125, "loss_aux_layer_18": 0.1103515625, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.04144287109375, "loss_aux_layer_20": 0.121826171875, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.0511474609375, "loss_aux_layer_4": 0.0535888671875, "loss_aux_layer_5": 0.0546875, "loss_aux_layer_6": 0.05743408203125, "loss_aux_layer_7": 0.055908203125, "loss_aux_layer_8": 0.055419921875, "loss_aux_layer_9": 0.05389404296875, "step": 4520, "total_loss": 0.7008572965860367 }, { "epoch": 0.895070283112255, "grad_norm": 0.7976666688919067, "learning_rate": 5e-05, "llm_loss": 0.556055411696434, "loss": 2.5399, "loss_aux_layer_0": 0.009979248046875, "loss_aux_layer_1": 0.029632568359375, "loss_aux_layer_10": 0.0567626953125, "loss_aux_layer_11": 0.06072998046875, "loss_aux_layer_12": 0.0650634765625, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.0865478515625, "loss_aux_layer_16": 0.0953369140625, "loss_aux_layer_17": 0.1029052734375, "loss_aux_layer_18": 0.1109619140625, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.0416259765625, "loss_aux_layer_20": 0.1212158203125, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.0516357421875, "loss_aux_layer_4": 0.0543212890625, "loss_aux_layer_5": 0.055908203125, "loss_aux_layer_6": 0.0589599609375, "loss_aux_layer_7": 0.05706787109375, "loss_aux_layer_8": 0.0565185546875, "loss_aux_layer_9": 0.0556640625, "step": 4521, "total_loss": 0.6349699944257736 }, { "epoch": 0.8952682637101564, "grad_norm": 1.0918915271759033, "learning_rate": 5e-05, "llm_loss": 0.5934926569461823, "loss": 2.6755, "loss_aux_layer_0": 0.0106201171875, "loss_aux_layer_1": 0.028106689453125, "loss_aux_layer_10": 0.052978515625, "loss_aux_layer_11": 0.0567626953125, "loss_aux_layer_12": 0.06103515625, "loss_aux_layer_13": 0.0657958984375, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.0819091796875, "loss_aux_layer_16": 0.0908203125, "loss_aux_layer_17": 0.09814453125, "loss_aux_layer_18": 0.106201171875, "loss_aux_layer_19": 0.109619140625, "loss_aux_layer_2": 0.0391845703125, "loss_aux_layer_20": 0.11767578125, "loss_aux_layer_21": 0.1258544921875, "loss_aux_layer_22": 0.145263671875, "loss_aux_layer_23": 0.181640625, "loss_aux_layer_3": 0.04815673828125, "loss_aux_layer_4": 0.050537109375, "loss_aux_layer_5": 0.052001953125, "loss_aux_layer_6": 0.05499267578125, "loss_aux_layer_7": 0.05328369140625, "loss_aux_layer_8": 0.0528564453125, "loss_aux_layer_9": 0.05169677734375, "step": 4522, "total_loss": 0.6688677370548248 }, { "epoch": 0.8954662443080578, "grad_norm": 0.9246070384979248, "learning_rate": 5e-05, "llm_loss": 0.604527160525322, "loss": 2.7303, "loss_aux_layer_0": 0.00982666015625, "loss_aux_layer_1": 0.0283203125, "loss_aux_layer_10": 0.05401611328125, "loss_aux_layer_11": 0.05780029296875, "loss_aux_layer_12": 0.06201171875, "loss_aux_layer_13": 0.067626953125, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.0853271484375, "loss_aux_layer_16": 0.0953369140625, "loss_aux_layer_17": 0.1031494140625, "loss_aux_layer_18": 0.112060546875, "loss_aux_layer_19": 0.11572265625, "loss_aux_layer_2": 0.03985595703125, "loss_aux_layer_20": 0.1234130859375, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.04931640625, "loss_aux_layer_4": 0.05157470703125, "loss_aux_layer_5": 0.0531005859375, "loss_aux_layer_6": 0.05596923828125, "loss_aux_layer_7": 0.0540771484375, "loss_aux_layer_8": 0.05377197265625, "loss_aux_layer_9": 0.052734375, "step": 4523, "total_loss": 0.682575911283493 }, { "epoch": 0.8956642249059592, "grad_norm": 1.0489518642425537, "learning_rate": 5e-05, "llm_loss": 0.553737610578537, "loss": 2.5282, "loss_aux_layer_0": 0.0111846923828125, "loss_aux_layer_1": 0.02862548828125, "loss_aux_layer_10": 0.05450439453125, "loss_aux_layer_11": 0.058349609375, "loss_aux_layer_12": 0.062744140625, "loss_aux_layer_13": 0.06829833984375, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.0946044921875, "loss_aux_layer_17": 0.1026611328125, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.1153564453125, "loss_aux_layer_2": 0.03936767578125, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.04888916015625, "loss_aux_layer_4": 0.051513671875, "loss_aux_layer_5": 0.0531005859375, "loss_aux_layer_6": 0.0560302734375, "loss_aux_layer_7": 0.054443359375, "loss_aux_layer_8": 0.0540771484375, "loss_aux_layer_9": 0.05328369140625, "step": 4524, "total_loss": 0.6320575773715973 }, { "epoch": 0.8958622055038606, "grad_norm": 1.1264150142669678, "learning_rate": 5e-05, "llm_loss": 0.5926370620727539, "loss": 2.678, "loss_aux_layer_0": 0.0107574462890625, "loss_aux_layer_1": 0.029296875, "loss_aux_layer_10": 0.053955078125, "loss_aux_layer_11": 0.05755615234375, "loss_aux_layer_12": 0.061767578125, "loss_aux_layer_13": 0.0665283203125, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.10009765625, "loss_aux_layer_18": 0.1082763671875, "loss_aux_layer_19": 0.1119384765625, "loss_aux_layer_2": 0.04052734375, "loss_aux_layer_20": 0.119873046875, "loss_aux_layer_21": 0.127685546875, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.04998779296875, "loss_aux_layer_4": 0.05224609375, "loss_aux_layer_5": 0.0535888671875, "loss_aux_layer_6": 0.05621337890625, "loss_aux_layer_7": 0.0543212890625, "loss_aux_layer_8": 0.05389404296875, "loss_aux_layer_9": 0.0528564453125, "step": 4525, "total_loss": 0.6694979071617126 }, { "epoch": 0.8960601861017621, "grad_norm": 0.9992328882217407, "learning_rate": 5e-05, "llm_loss": 0.4799514710903168, "loss": 2.2244, "loss_aux_layer_0": 0.010284423828125, "loss_aux_layer_1": 0.027801513671875, "loss_aux_layer_10": 0.05267333984375, "loss_aux_layer_11": 0.05657958984375, "loss_aux_layer_12": 0.0609130859375, "loss_aux_layer_13": 0.06640625, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.0831298828125, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.1002197265625, "loss_aux_layer_18": 0.108642578125, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.03900146484375, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.04779052734375, "loss_aux_layer_4": 0.05023193359375, "loss_aux_layer_5": 0.051513671875, "loss_aux_layer_6": 0.053955078125, "loss_aux_layer_7": 0.05242919921875, "loss_aux_layer_8": 0.0521240234375, "loss_aux_layer_9": 0.0513916015625, "step": 4526, "total_loss": 0.5560999810695648 }, { "epoch": 0.8962581666996634, "grad_norm": 1.0279890298843384, "learning_rate": 5e-05, "llm_loss": 0.5211716294288635, "loss": 2.3891, "loss_aux_layer_0": 0.0108642578125, "loss_aux_layer_1": 0.028900146484375, "loss_aux_layer_10": 0.053466796875, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.0615234375, "loss_aux_layer_13": 0.06646728515625, "loss_aux_layer_14": 0.074462890625, "loss_aux_layer_15": 0.0821533203125, "loss_aux_layer_16": 0.091064453125, "loss_aux_layer_17": 0.0986328125, "loss_aux_layer_18": 0.1065673828125, "loss_aux_layer_19": 0.1099853515625, "loss_aux_layer_2": 0.03997802734375, "loss_aux_layer_20": 0.1182861328125, "loss_aux_layer_21": 0.126708984375, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.049072265625, "loss_aux_layer_4": 0.05133056640625, "loss_aux_layer_5": 0.05303955078125, "loss_aux_layer_6": 0.0555419921875, "loss_aux_layer_7": 0.05389404296875, "loss_aux_layer_8": 0.05322265625, "loss_aux_layer_9": 0.05218505859375, "step": 4527, "total_loss": 0.5972744673490524 }, { "epoch": 0.8964561472975648, "grad_norm": 1.1244630813598633, "learning_rate": 5e-05, "llm_loss": 0.5868709236383438, "loss": 2.6551, "loss_aux_layer_0": 0.01055908203125, "loss_aux_layer_1": 0.029022216796875, "loss_aux_layer_10": 0.05450439453125, "loss_aux_layer_11": 0.058349609375, "loss_aux_layer_12": 0.06268310546875, "loss_aux_layer_13": 0.0677490234375, "loss_aux_layer_14": 0.0758056640625, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.0928955078125, "loss_aux_layer_17": 0.1005859375, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.04046630859375, "loss_aux_layer_20": 0.1190185546875, "loss_aux_layer_21": 0.12646484375, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.18212890625, "loss_aux_layer_3": 0.050048828125, "loss_aux_layer_4": 0.052490234375, "loss_aux_layer_5": 0.0535888671875, "loss_aux_layer_6": 0.05633544921875, "loss_aux_layer_7": 0.0545654296875, "loss_aux_layer_8": 0.05419921875, "loss_aux_layer_9": 0.05322265625, "step": 4528, "total_loss": 0.6637699007987976 }, { "epoch": 0.8966541278954663, "grad_norm": 1.1558667421340942, "learning_rate": 5e-05, "llm_loss": 0.5966009274125099, "loss": 2.7022, "loss_aux_layer_0": 0.0110626220703125, "loss_aux_layer_1": 0.028839111328125, "loss_aux_layer_10": 0.05712890625, "loss_aux_layer_11": 0.0609130859375, "loss_aux_layer_12": 0.06536865234375, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.0789794921875, "loss_aux_layer_15": 0.087158203125, "loss_aux_layer_16": 0.095947265625, "loss_aux_layer_17": 0.1033935546875, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.114013671875, "loss_aux_layer_2": 0.0413818359375, "loss_aux_layer_20": 0.121337890625, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.0506591796875, "loss_aux_layer_4": 0.053466796875, "loss_aux_layer_5": 0.05523681640625, "loss_aux_layer_6": 0.0582275390625, "loss_aux_layer_7": 0.0567626953125, "loss_aux_layer_8": 0.056396484375, "loss_aux_layer_9": 0.0555419921875, "step": 4529, "total_loss": 0.6755439341068268 }, { "epoch": 0.8968521084933676, "grad_norm": 1.0390994548797607, "learning_rate": 5e-05, "llm_loss": 0.6302792876958847, "loss": 2.8319, "loss_aux_layer_0": 0.0103607177734375, "loss_aux_layer_1": 0.029693603515625, "loss_aux_layer_10": 0.0546875, "loss_aux_layer_11": 0.05841064453125, "loss_aux_layer_12": 0.0623779296875, "loss_aux_layer_13": 0.0675048828125, "loss_aux_layer_14": 0.07568359375, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.100830078125, "loss_aux_layer_18": 0.1090087890625, "loss_aux_layer_19": 0.11279296875, "loss_aux_layer_2": 0.04180908203125, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.051025390625, "loss_aux_layer_4": 0.0533447265625, "loss_aux_layer_5": 0.05450439453125, "loss_aux_layer_6": 0.05712890625, "loss_aux_layer_7": 0.0555419921875, "loss_aux_layer_8": 0.05474853515625, "loss_aux_layer_9": 0.05352783203125, "step": 4530, "total_loss": 0.707984209060669 }, { "epoch": 0.897050089091269, "grad_norm": 1.0530246496200562, "learning_rate": 5e-05, "llm_loss": 0.6038860976696014, "loss": 2.7201, "loss_aux_layer_0": 0.0106048583984375, "loss_aux_layer_1": 0.027862548828125, "loss_aux_layer_10": 0.0535888671875, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.06121826171875, "loss_aux_layer_13": 0.0665283203125, "loss_aux_layer_14": 0.07470703125, "loss_aux_layer_15": 0.0830078125, "loss_aux_layer_16": 0.092041015625, "loss_aux_layer_17": 0.0999755859375, "loss_aux_layer_18": 0.1080322265625, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.03936767578125, "loss_aux_layer_20": 0.11962890625, "loss_aux_layer_21": 0.127197265625, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.0482177734375, "loss_aux_layer_4": 0.05047607421875, "loss_aux_layer_5": 0.05194091796875, "loss_aux_layer_6": 0.05474853515625, "loss_aux_layer_7": 0.05303955078125, "loss_aux_layer_8": 0.052978515625, "loss_aux_layer_9": 0.05218505859375, "step": 4531, "total_loss": 0.6800224781036377 }, { "epoch": 0.8972480696891705, "grad_norm": 1.0052443742752075, "learning_rate": 5e-05, "llm_loss": 0.6524946838617325, "loss": 2.9336, "loss_aux_layer_0": 0.0098419189453125, "loss_aux_layer_1": 0.030609130859375, "loss_aux_layer_10": 0.05841064453125, "loss_aux_layer_11": 0.0621337890625, "loss_aux_layer_12": 0.06646728515625, "loss_aux_layer_13": 0.0718994140625, "loss_aux_layer_14": 0.0804443359375, "loss_aux_layer_15": 0.0887451171875, "loss_aux_layer_16": 0.09765625, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.113037109375, "loss_aux_layer_19": 0.1162109375, "loss_aux_layer_2": 0.043701171875, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.053466796875, "loss_aux_layer_4": 0.0560302734375, "loss_aux_layer_5": 0.05767822265625, "loss_aux_layer_6": 0.0606689453125, "loss_aux_layer_7": 0.0587158203125, "loss_aux_layer_8": 0.0582275390625, "loss_aux_layer_9": 0.05706787109375, "step": 4532, "total_loss": 0.7333959192037582 }, { "epoch": 0.8974460502870719, "grad_norm": 1.1417344808578491, "learning_rate": 5e-05, "llm_loss": 0.5635988861322403, "loss": 2.5462, "loss_aux_layer_0": 0.01068115234375, "loss_aux_layer_1": 0.026885986328125, "loss_aux_layer_10": 0.0511474609375, "loss_aux_layer_11": 0.05426025390625, "loss_aux_layer_12": 0.05828857421875, "loss_aux_layer_13": 0.06280517578125, "loss_aux_layer_14": 0.0701904296875, "loss_aux_layer_15": 0.0782470703125, "loss_aux_layer_16": 0.0869140625, "loss_aux_layer_17": 0.0946044921875, "loss_aux_layer_18": 0.102294921875, "loss_aux_layer_19": 0.106689453125, "loss_aux_layer_2": 0.03778076171875, "loss_aux_layer_20": 0.1148681640625, "loss_aux_layer_21": 0.123046875, "loss_aux_layer_22": 0.142822265625, "loss_aux_layer_23": 0.177978515625, "loss_aux_layer_3": 0.04656982421875, "loss_aux_layer_4": 0.0489501953125, "loss_aux_layer_5": 0.050537109375, "loss_aux_layer_6": 0.0531005859375, "loss_aux_layer_7": 0.051513671875, "loss_aux_layer_8": 0.051025390625, "loss_aux_layer_9": 0.04986572265625, "step": 4533, "total_loss": 0.6365524381399155 }, { "epoch": 0.8976440308849732, "grad_norm": 0.893355131149292, "learning_rate": 5e-05, "llm_loss": 0.6001328080892563, "loss": 2.7104, "loss_aux_layer_0": 0.010284423828125, "loss_aux_layer_1": 0.02923583984375, "loss_aux_layer_10": 0.05572509765625, "loss_aux_layer_11": 0.05975341796875, "loss_aux_layer_12": 0.06390380859375, "loss_aux_layer_13": 0.06884765625, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.084716796875, "loss_aux_layer_16": 0.09326171875, "loss_aux_layer_17": 0.1009521484375, "loss_aux_layer_18": 0.10888671875, "loss_aux_layer_19": 0.111328125, "loss_aux_layer_2": 0.04052734375, "loss_aux_layer_20": 0.11865234375, "loss_aux_layer_21": 0.1263427734375, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.181640625, "loss_aux_layer_3": 0.05023193359375, "loss_aux_layer_4": 0.0531005859375, "loss_aux_layer_5": 0.0546875, "loss_aux_layer_6": 0.0579833984375, "loss_aux_layer_7": 0.0562744140625, "loss_aux_layer_8": 0.05572509765625, "loss_aux_layer_9": 0.054443359375, "step": 4534, "total_loss": 0.6775970906019211 }, { "epoch": 0.8978420114828747, "grad_norm": 0.898514986038208, "learning_rate": 5e-05, "llm_loss": 0.5777371972799301, "loss": 2.6315, "loss_aux_layer_0": 0.01165771484375, "loss_aux_layer_1": 0.029144287109375, "loss_aux_layer_10": 0.05657958984375, "loss_aux_layer_11": 0.0606689453125, "loss_aux_layer_12": 0.065185546875, "loss_aux_layer_13": 0.0709228515625, "loss_aux_layer_14": 0.07958984375, "loss_aux_layer_15": 0.0882568359375, "loss_aux_layer_16": 0.09814453125, "loss_aux_layer_17": 0.1058349609375, "loss_aux_layer_18": 0.11376953125, "loss_aux_layer_19": 0.11669921875, "loss_aux_layer_2": 0.04083251953125, "loss_aux_layer_20": 0.1241455078125, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.190673828125, "loss_aux_layer_3": 0.0504150390625, "loss_aux_layer_4": 0.05328369140625, "loss_aux_layer_5": 0.054931640625, "loss_aux_layer_6": 0.05841064453125, "loss_aux_layer_7": 0.0567626953125, "loss_aux_layer_8": 0.05633544921875, "loss_aux_layer_9": 0.05517578125, "step": 4535, "total_loss": 0.6578766107559204 }, { "epoch": 0.8980399920807761, "grad_norm": 0.8718390464782715, "learning_rate": 5e-05, "llm_loss": 0.4844626933336258, "loss": 2.2489, "loss_aux_layer_0": 0.0101470947265625, "loss_aux_layer_1": 0.02862548828125, "loss_aux_layer_10": 0.05474853515625, "loss_aux_layer_11": 0.05828857421875, "loss_aux_layer_12": 0.062255859375, "loss_aux_layer_13": 0.067626953125, "loss_aux_layer_14": 0.0755615234375, "loss_aux_layer_15": 0.0836181640625, "loss_aux_layer_16": 0.093017578125, "loss_aux_layer_17": 0.1007080078125, "loss_aux_layer_18": 0.1092529296875, "loss_aux_layer_19": 0.113525390625, "loss_aux_layer_2": 0.0411376953125, "loss_aux_layer_20": 0.1217041015625, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.05035400390625, "loss_aux_layer_4": 0.0526123046875, "loss_aux_layer_5": 0.05413818359375, "loss_aux_layer_6": 0.05712890625, "loss_aux_layer_7": 0.055419921875, "loss_aux_layer_8": 0.05499267578125, "loss_aux_layer_9": 0.05377197265625, "step": 4536, "total_loss": 0.5622270554304123 }, { "epoch": 0.8982379726786774, "grad_norm": 0.8737384080886841, "learning_rate": 5e-05, "llm_loss": 0.4967219680547714, "loss": 2.2913, "loss_aux_layer_0": 0.0114288330078125, "loss_aux_layer_1": 0.029266357421875, "loss_aux_layer_10": 0.05364990234375, "loss_aux_layer_11": 0.05731201171875, "loss_aux_layer_12": 0.0616455078125, "loss_aux_layer_13": 0.066650390625, "loss_aux_layer_14": 0.0748291015625, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.091552734375, "loss_aux_layer_17": 0.0989990234375, "loss_aux_layer_18": 0.107177734375, "loss_aux_layer_19": 0.1104736328125, "loss_aux_layer_2": 0.0399169921875, "loss_aux_layer_20": 0.118408203125, "loss_aux_layer_21": 0.1260986328125, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.18017578125, "loss_aux_layer_3": 0.04925537109375, "loss_aux_layer_4": 0.0517578125, "loss_aux_layer_5": 0.05316162109375, "loss_aux_layer_6": 0.05596923828125, "loss_aux_layer_7": 0.05419921875, "loss_aux_layer_8": 0.0537109375, "loss_aux_layer_9": 0.052490234375, "step": 4537, "total_loss": 0.5728247091174126 }, { "epoch": 0.8984359532765789, "grad_norm": 0.8498656749725342, "learning_rate": 5e-05, "llm_loss": 0.5302647948265076, "loss": 2.432, "loss_aux_layer_0": 0.0107421875, "loss_aux_layer_1": 0.028411865234375, "loss_aux_layer_10": 0.05401611328125, "loss_aux_layer_11": 0.05792236328125, "loss_aux_layer_12": 0.0621337890625, "loss_aux_layer_13": 0.0675048828125, "loss_aux_layer_14": 0.0760498046875, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.094482421875, "loss_aux_layer_17": 0.102294921875, "loss_aux_layer_18": 0.1112060546875, "loss_aux_layer_19": 0.115234375, "loss_aux_layer_2": 0.03924560546875, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.04888916015625, "loss_aux_layer_4": 0.05145263671875, "loss_aux_layer_5": 0.0528564453125, "loss_aux_layer_6": 0.05584716796875, "loss_aux_layer_7": 0.05419921875, "loss_aux_layer_8": 0.0537109375, "loss_aux_layer_9": 0.052734375, "step": 4538, "total_loss": 0.6080018281936646 }, { "epoch": 0.8986339338744803, "grad_norm": 0.7530249953269958, "learning_rate": 5e-05, "llm_loss": 0.5390416234731674, "loss": 2.4607, "loss_aux_layer_0": 0.009979248046875, "loss_aux_layer_1": 0.028076171875, "loss_aux_layer_10": 0.05340576171875, "loss_aux_layer_11": 0.05718994140625, "loss_aux_layer_12": 0.0615234375, "loss_aux_layer_13": 0.06695556640625, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.091796875, "loss_aux_layer_17": 0.0994873046875, "loss_aux_layer_18": 0.1070556640625, "loss_aux_layer_19": 0.1109619140625, "loss_aux_layer_2": 0.039306640625, "loss_aux_layer_20": 0.1185302734375, "loss_aux_layer_21": 0.1268310546875, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.04852294921875, "loss_aux_layer_4": 0.0506591796875, "loss_aux_layer_5": 0.0521240234375, "loss_aux_layer_6": 0.05511474609375, "loss_aux_layer_7": 0.05316162109375, "loss_aux_layer_8": 0.0528564453125, "loss_aux_layer_9": 0.05206298828125, "step": 4539, "total_loss": 0.6151679158210754 }, { "epoch": 0.8988319144723818, "grad_norm": 0.789624810218811, "learning_rate": 5e-05, "llm_loss": 0.5169422701001167, "loss": 2.3879, "loss_aux_layer_0": 0.0110321044921875, "loss_aux_layer_1": 0.030609130859375, "loss_aux_layer_10": 0.05731201171875, "loss_aux_layer_11": 0.06121826171875, "loss_aux_layer_12": 0.0657958984375, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.0789794921875, "loss_aux_layer_15": 0.08740234375, "loss_aux_layer_16": 0.0963134765625, "loss_aux_layer_17": 0.103759765625, "loss_aux_layer_18": 0.111572265625, "loss_aux_layer_19": 0.1148681640625, "loss_aux_layer_2": 0.0430908203125, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.1304931640625, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.05267333984375, "loss_aux_layer_4": 0.05487060546875, "loss_aux_layer_5": 0.0565185546875, "loss_aux_layer_6": 0.0594482421875, "loss_aux_layer_7": 0.05767822265625, "loss_aux_layer_8": 0.05712890625, "loss_aux_layer_9": 0.05609130859375, "step": 4540, "total_loss": 0.5969844087958336 }, { "epoch": 0.8990298950702831, "grad_norm": 0.8738695383071899, "learning_rate": 5e-05, "llm_loss": 0.6180360019207001, "loss": 2.7757, "loss_aux_layer_0": 0.0099334716796875, "loss_aux_layer_1": 0.027252197265625, "loss_aux_layer_10": 0.05291748046875, "loss_aux_layer_11": 0.05645751953125, "loss_aux_layer_12": 0.0606689453125, "loss_aux_layer_13": 0.06585693359375, "loss_aux_layer_14": 0.074462890625, "loss_aux_layer_15": 0.082763671875, "loss_aux_layer_16": 0.09228515625, "loss_aux_layer_17": 0.10009765625, "loss_aux_layer_18": 0.1077880859375, "loss_aux_layer_19": 0.1114501953125, "loss_aux_layer_2": 0.03857421875, "loss_aux_layer_20": 0.1192626953125, "loss_aux_layer_21": 0.1278076171875, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.047607421875, "loss_aux_layer_4": 0.0498046875, "loss_aux_layer_5": 0.05145263671875, "loss_aux_layer_6": 0.0545654296875, "loss_aux_layer_7": 0.052734375, "loss_aux_layer_8": 0.05255126953125, "loss_aux_layer_9": 0.05181884765625, "step": 4541, "total_loss": 0.6939250826835632 }, { "epoch": 0.8992278756681845, "grad_norm": 0.9011600613594055, "learning_rate": 5e-05, "llm_loss": 0.6809821426868439, "loss": 3.0426, "loss_aux_layer_0": 0.010650634765625, "loss_aux_layer_1": 0.029327392578125, "loss_aux_layer_10": 0.0557861328125, "loss_aux_layer_11": 0.059814453125, "loss_aux_layer_12": 0.0645751953125, "loss_aux_layer_13": 0.0699462890625, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.08740234375, "loss_aux_layer_16": 0.096923828125, "loss_aux_layer_17": 0.1053466796875, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.1171875, "loss_aux_layer_2": 0.041015625, "loss_aux_layer_20": 0.1248779296875, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.0506591796875, "loss_aux_layer_4": 0.0533447265625, "loss_aux_layer_5": 0.05474853515625, "loss_aux_layer_6": 0.0577392578125, "loss_aux_layer_7": 0.0557861328125, "loss_aux_layer_8": 0.055419921875, "loss_aux_layer_9": 0.05426025390625, "step": 4542, "total_loss": 0.7606381475925446 }, { "epoch": 0.899425856266086, "grad_norm": 0.784696638584137, "learning_rate": 5e-05, "llm_loss": 0.4827743098139763, "loss": 2.2218, "loss_aux_layer_0": 0.0104827880859375, "loss_aux_layer_1": 0.026824951171875, "loss_aux_layer_10": 0.050048828125, "loss_aux_layer_11": 0.05364990234375, "loss_aux_layer_12": 0.0577392578125, "loss_aux_layer_13": 0.06268310546875, "loss_aux_layer_14": 0.0703125, "loss_aux_layer_15": 0.0784912109375, "loss_aux_layer_16": 0.0872802734375, "loss_aux_layer_17": 0.094970703125, "loss_aux_layer_18": 0.1025390625, "loss_aux_layer_19": 0.106689453125, "loss_aux_layer_2": 0.037109375, "loss_aux_layer_20": 0.1146240234375, "loss_aux_layer_21": 0.123291015625, "loss_aux_layer_22": 0.142822265625, "loss_aux_layer_23": 0.1796875, "loss_aux_layer_3": 0.04608154296875, "loss_aux_layer_4": 0.04815673828125, "loss_aux_layer_5": 0.04949951171875, "loss_aux_layer_6": 0.05194091796875, "loss_aux_layer_7": 0.05035400390625, "loss_aux_layer_8": 0.04998779296875, "loss_aux_layer_9": 0.0489501953125, "step": 4543, "total_loss": 0.5554601848125458 }, { "epoch": 0.8996238368639873, "grad_norm": 0.8173992037773132, "learning_rate": 5e-05, "llm_loss": 0.5745110511779785, "loss": 2.614, "loss_aux_layer_0": 0.0108184814453125, "loss_aux_layer_1": 0.0302734375, "loss_aux_layer_10": 0.05682373046875, "loss_aux_layer_11": 0.0606689453125, "loss_aux_layer_12": 0.0648193359375, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.0855712890625, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.10205078125, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.113037109375, "loss_aux_layer_2": 0.0421142578125, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.05218505859375, "loss_aux_layer_4": 0.0546875, "loss_aux_layer_5": 0.05615234375, "loss_aux_layer_6": 0.05938720703125, "loss_aux_layer_7": 0.057373046875, "loss_aux_layer_8": 0.0567626953125, "loss_aux_layer_9": 0.05548095703125, "step": 4544, "total_loss": 0.6534998416900635 }, { "epoch": 0.8998218174618887, "grad_norm": 0.720747709274292, "learning_rate": 5e-05, "llm_loss": 0.5323716998100281, "loss": 2.4273, "loss_aux_layer_0": 0.0099334716796875, "loss_aux_layer_1": 0.02740478515625, "loss_aux_layer_10": 0.05230712890625, "loss_aux_layer_11": 0.055908203125, "loss_aux_layer_12": 0.060302734375, "loss_aux_layer_13": 0.065185546875, "loss_aux_layer_14": 0.072998046875, "loss_aux_layer_15": 0.080810546875, "loss_aux_layer_16": 0.089599609375, "loss_aux_layer_17": 0.0970458984375, "loss_aux_layer_18": 0.105224609375, "loss_aux_layer_19": 0.1094970703125, "loss_aux_layer_2": 0.03802490234375, "loss_aux_layer_20": 0.117431640625, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.143798828125, "loss_aux_layer_23": 0.17919921875, "loss_aux_layer_3": 0.0472412109375, "loss_aux_layer_4": 0.04962158203125, "loss_aux_layer_5": 0.05126953125, "loss_aux_layer_6": 0.0535888671875, "loss_aux_layer_7": 0.0521240234375, "loss_aux_layer_8": 0.0517578125, "loss_aux_layer_9": 0.05096435546875, "step": 4545, "total_loss": 0.6068230420351028 }, { "epoch": 0.9000197980597902, "grad_norm": 0.9493733644485474, "learning_rate": 5e-05, "llm_loss": 0.6671540439128876, "loss": 2.9811, "loss_aux_layer_0": 0.01025390625, "loss_aux_layer_1": 0.030029296875, "loss_aux_layer_10": 0.0570068359375, "loss_aux_layer_11": 0.060546875, "loss_aux_layer_12": 0.06451416015625, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.0771484375, "loss_aux_layer_15": 0.084716796875, "loss_aux_layer_16": 0.093017578125, "loss_aux_layer_17": 0.1007080078125, "loss_aux_layer_18": 0.1085205078125, "loss_aux_layer_19": 0.111328125, "loss_aux_layer_2": 0.04168701171875, "loss_aux_layer_20": 0.1192626953125, "loss_aux_layer_21": 0.1263427734375, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.180908203125, "loss_aux_layer_3": 0.0517578125, "loss_aux_layer_4": 0.0545654296875, "loss_aux_layer_5": 0.05609130859375, "loss_aux_layer_6": 0.05938720703125, "loss_aux_layer_7": 0.05780029296875, "loss_aux_layer_8": 0.05712890625, "loss_aux_layer_9": 0.055908203125, "step": 4546, "total_loss": 0.7452625781297684 }, { "epoch": 0.9002177786576916, "grad_norm": 0.734224796295166, "learning_rate": 5e-05, "llm_loss": 0.564901627600193, "loss": 2.5686, "loss_aux_layer_0": 0.0101318359375, "loss_aux_layer_1": 0.0291748046875, "loss_aux_layer_10": 0.05438232421875, "loss_aux_layer_11": 0.05828857421875, "loss_aux_layer_12": 0.06231689453125, "loss_aux_layer_13": 0.0675048828125, "loss_aux_layer_14": 0.07568359375, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.0931396484375, "loss_aux_layer_17": 0.101318359375, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.11279296875, "loss_aux_layer_2": 0.04058837890625, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.1278076171875, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.0501708984375, "loss_aux_layer_4": 0.05255126953125, "loss_aux_layer_5": 0.05419921875, "loss_aux_layer_6": 0.05712890625, "loss_aux_layer_7": 0.05517578125, "loss_aux_layer_8": 0.0545654296875, "loss_aux_layer_9": 0.0533447265625, "step": 4547, "total_loss": 0.6421512216329575 }, { "epoch": 0.9004157592555929, "grad_norm": 0.8649343848228455, "learning_rate": 5e-05, "llm_loss": 0.6103603094816208, "loss": 2.7439, "loss_aux_layer_0": 0.0101776123046875, "loss_aux_layer_1": 0.028472900390625, "loss_aux_layer_10": 0.0535888671875, "loss_aux_layer_11": 0.05743408203125, "loss_aux_layer_12": 0.0614013671875, "loss_aux_layer_13": 0.0662841796875, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.0816650390625, "loss_aux_layer_16": 0.0908203125, "loss_aux_layer_17": 0.0986328125, "loss_aux_layer_18": 0.106689453125, "loss_aux_layer_19": 0.109619140625, "loss_aux_layer_2": 0.03985595703125, "loss_aux_layer_20": 0.1168212890625, "loss_aux_layer_21": 0.1243896484375, "loss_aux_layer_22": 0.14453125, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.04925537109375, "loss_aux_layer_4": 0.05181884765625, "loss_aux_layer_5": 0.05316162109375, "loss_aux_layer_6": 0.0560302734375, "loss_aux_layer_7": 0.0543212890625, "loss_aux_layer_8": 0.05364990234375, "loss_aux_layer_9": 0.0523681640625, "step": 4548, "total_loss": 0.6859842240810394 }, { "epoch": 0.9006137398534944, "grad_norm": 0.7731000185012817, "learning_rate": 5e-05, "llm_loss": 0.5328594893217087, "loss": 2.4574, "loss_aux_layer_0": 0.0102996826171875, "loss_aux_layer_1": 0.0311279296875, "loss_aux_layer_10": 0.0584716796875, "loss_aux_layer_11": 0.06256103515625, "loss_aux_layer_12": 0.06689453125, "loss_aux_layer_13": 0.0723876953125, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.0892333984375, "loss_aux_layer_16": 0.098388671875, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.1171875, "loss_aux_layer_2": 0.04412841796875, "loss_aux_layer_20": 0.1251220703125, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.0540771484375, "loss_aux_layer_4": 0.05633544921875, "loss_aux_layer_5": 0.057861328125, "loss_aux_layer_6": 0.06072998046875, "loss_aux_layer_7": 0.0589599609375, "loss_aux_layer_8": 0.05841064453125, "loss_aux_layer_9": 0.0572509765625, "step": 4549, "total_loss": 0.6143485754728317 }, { "epoch": 0.9008117204513958, "grad_norm": 0.8303873538970947, "learning_rate": 5e-05, "llm_loss": 0.562575951218605, "loss": 2.5618, "loss_aux_layer_0": 0.0104217529296875, "loss_aux_layer_1": 0.0299072265625, "loss_aux_layer_10": 0.05487060546875, "loss_aux_layer_11": 0.05859375, "loss_aux_layer_12": 0.06298828125, "loss_aux_layer_13": 0.068603515625, "loss_aux_layer_14": 0.0772705078125, "loss_aux_layer_15": 0.0853271484375, "loss_aux_layer_16": 0.094482421875, "loss_aux_layer_17": 0.1016845703125, "loss_aux_layer_18": 0.10986328125, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.04150390625, "loss_aux_layer_20": 0.1199951171875, "loss_aux_layer_21": 0.128173828125, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.05059814453125, "loss_aux_layer_4": 0.052734375, "loss_aux_layer_5": 0.05426025390625, "loss_aux_layer_6": 0.05706787109375, "loss_aux_layer_7": 0.05535888671875, "loss_aux_layer_8": 0.054931640625, "loss_aux_layer_9": 0.05364990234375, "step": 4550, "total_loss": 0.640451580286026 }, { "epoch": 0.9010097010492971, "grad_norm": 0.6988595128059387, "learning_rate": 5e-05, "llm_loss": 0.506605364382267, "loss": 2.3243, "loss_aux_layer_0": 0.010650634765625, "loss_aux_layer_1": 0.02716064453125, "loss_aux_layer_10": 0.051025390625, "loss_aux_layer_11": 0.0546875, "loss_aux_layer_12": 0.058837890625, "loss_aux_layer_13": 0.06414794921875, "loss_aux_layer_14": 0.0718994140625, "loss_aux_layer_15": 0.0799560546875, "loss_aux_layer_16": 0.0892333984375, "loss_aux_layer_17": 0.09716796875, "loss_aux_layer_18": 0.105224609375, "loss_aux_layer_19": 0.110107421875, "loss_aux_layer_2": 0.0380859375, "loss_aux_layer_20": 0.118408203125, "loss_aux_layer_21": 0.127685546875, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.04693603515625, "loss_aux_layer_4": 0.04913330078125, "loss_aux_layer_5": 0.05047607421875, "loss_aux_layer_6": 0.05279541015625, "loss_aux_layer_7": 0.05108642578125, "loss_aux_layer_8": 0.05084228515625, "loss_aux_layer_9": 0.04974365234375, "step": 4551, "total_loss": 0.5810688138008118 }, { "epoch": 0.9012076816471986, "grad_norm": 0.9376832246780396, "learning_rate": 5e-05, "llm_loss": 0.5130575224757195, "loss": 2.3679, "loss_aux_layer_0": 0.010406494140625, "loss_aux_layer_1": 0.029510498046875, "loss_aux_layer_10": 0.05633544921875, "loss_aux_layer_11": 0.0599365234375, "loss_aux_layer_12": 0.06414794921875, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.0775146484375, "loss_aux_layer_15": 0.0855712890625, "loss_aux_layer_16": 0.0947265625, "loss_aux_layer_17": 0.102294921875, "loss_aux_layer_18": 0.111083984375, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.041259765625, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.1304931640625, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.051025390625, "loss_aux_layer_4": 0.05364990234375, "loss_aux_layer_5": 0.0552978515625, "loss_aux_layer_6": 0.05865478515625, "loss_aux_layer_7": 0.056884765625, "loss_aux_layer_8": 0.05615234375, "loss_aux_layer_9": 0.05517578125, "step": 4552, "total_loss": 0.591964602470398 }, { "epoch": 0.9014056622451, "grad_norm": 0.7253037095069885, "learning_rate": 5e-05, "llm_loss": 0.5754800587892532, "loss": 2.6272, "loss_aux_layer_0": 0.010101318359375, "loss_aux_layer_1": 0.031463623046875, "loss_aux_layer_10": 0.058349609375, "loss_aux_layer_11": 0.06243896484375, "loss_aux_layer_12": 0.06689453125, "loss_aux_layer_13": 0.0723876953125, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.0885009765625, "loss_aux_layer_16": 0.097412109375, "loss_aux_layer_17": 0.1053466796875, "loss_aux_layer_18": 0.1131591796875, "loss_aux_layer_19": 0.116455078125, "loss_aux_layer_2": 0.0430908203125, "loss_aux_layer_20": 0.1241455078125, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.05322265625, "loss_aux_layer_4": 0.0557861328125, "loss_aux_layer_5": 0.05755615234375, "loss_aux_layer_6": 0.0606689453125, "loss_aux_layer_7": 0.058837890625, "loss_aux_layer_8": 0.0582275390625, "loss_aux_layer_9": 0.0567626953125, "step": 4553, "total_loss": 0.6567951142787933 }, { "epoch": 0.9016036428430014, "grad_norm": 0.8003387451171875, "learning_rate": 5e-05, "llm_loss": 0.5430130586028099, "loss": 2.4751, "loss_aux_layer_0": 0.0108795166015625, "loss_aux_layer_1": 0.028961181640625, "loss_aux_layer_10": 0.053466796875, "loss_aux_layer_11": 0.0574951171875, "loss_aux_layer_12": 0.0615234375, "loss_aux_layer_13": 0.06622314453125, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.0819091796875, "loss_aux_layer_16": 0.090576171875, "loss_aux_layer_17": 0.097900390625, "loss_aux_layer_18": 0.105712890625, "loss_aux_layer_19": 0.1087646484375, "loss_aux_layer_2": 0.0401611328125, "loss_aux_layer_20": 0.1170654296875, "loss_aux_layer_21": 0.125, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.04937744140625, "loss_aux_layer_4": 0.05157470703125, "loss_aux_layer_5": 0.05303955078125, "loss_aux_layer_6": 0.05560302734375, "loss_aux_layer_7": 0.0540771484375, "loss_aux_layer_8": 0.05352783203125, "loss_aux_layer_9": 0.0523681640625, "step": 4554, "total_loss": 0.618769034743309 }, { "epoch": 0.9018016234409028, "grad_norm": 0.8061603307723999, "learning_rate": 5e-05, "llm_loss": 0.6074022054672241, "loss": 2.7414, "loss_aux_layer_0": 0.0097503662109375, "loss_aux_layer_1": 0.02923583984375, "loss_aux_layer_10": 0.05621337890625, "loss_aux_layer_11": 0.06011962890625, "loss_aux_layer_12": 0.0645751953125, "loss_aux_layer_13": 0.0699462890625, "loss_aux_layer_14": 0.077880859375, "loss_aux_layer_15": 0.0860595703125, "loss_aux_layer_16": 0.0948486328125, "loss_aux_layer_17": 0.1025390625, "loss_aux_layer_18": 0.110107421875, "loss_aux_layer_19": 0.1124267578125, "loss_aux_layer_2": 0.040771484375, "loss_aux_layer_20": 0.119873046875, "loss_aux_layer_21": 0.12646484375, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.180908203125, "loss_aux_layer_3": 0.05059814453125, "loss_aux_layer_4": 0.052978515625, "loss_aux_layer_5": 0.05462646484375, "loss_aux_layer_6": 0.057861328125, "loss_aux_layer_7": 0.05609130859375, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.0546875, "step": 4555, "total_loss": 0.6853374987840652 }, { "epoch": 0.9019996040388042, "grad_norm": 0.8543406128883362, "learning_rate": 5e-05, "llm_loss": 0.451937235891819, "loss": 2.1211, "loss_aux_layer_0": 0.010467529296875, "loss_aux_layer_1": 0.029571533203125, "loss_aux_layer_10": 0.05560302734375, "loss_aux_layer_11": 0.05950927734375, "loss_aux_layer_12": 0.06390380859375, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.077392578125, "loss_aux_layer_15": 0.0853271484375, "loss_aux_layer_16": 0.0943603515625, "loss_aux_layer_17": 0.1015625, "loss_aux_layer_18": 0.10888671875, "loss_aux_layer_19": 0.1124267578125, "loss_aux_layer_2": 0.0408935546875, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.050537109375, "loss_aux_layer_4": 0.05328369140625, "loss_aux_layer_5": 0.0548095703125, "loss_aux_layer_6": 0.057861328125, "loss_aux_layer_7": 0.05596923828125, "loss_aux_layer_8": 0.055419921875, "loss_aux_layer_9": 0.05413818359375, "step": 4556, "total_loss": 0.5302666574716568 }, { "epoch": 0.9021975846367056, "grad_norm": 0.7607079744338989, "learning_rate": 5e-05, "llm_loss": 0.5945551693439484, "loss": 2.6839, "loss_aux_layer_0": 0.010284423828125, "loss_aux_layer_1": 0.028961181640625, "loss_aux_layer_10": 0.0540771484375, "loss_aux_layer_11": 0.0576171875, "loss_aux_layer_12": 0.061767578125, "loss_aux_layer_13": 0.0667724609375, "loss_aux_layer_14": 0.07470703125, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.092041015625, "loss_aux_layer_17": 0.099609375, "loss_aux_layer_18": 0.1068115234375, "loss_aux_layer_19": 0.1102294921875, "loss_aux_layer_2": 0.04058837890625, "loss_aux_layer_20": 0.1181640625, "loss_aux_layer_21": 0.12646484375, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.04998779296875, "loss_aux_layer_4": 0.05224609375, "loss_aux_layer_5": 0.053466796875, "loss_aux_layer_6": 0.05621337890625, "loss_aux_layer_7": 0.05450439453125, "loss_aux_layer_8": 0.05401611328125, "loss_aux_layer_9": 0.0528564453125, "step": 4557, "total_loss": 0.6709851920604706 }, { "epoch": 0.902395565234607, "grad_norm": 0.9561595916748047, "learning_rate": 5e-05, "llm_loss": 0.5017390698194504, "loss": 2.3053, "loss_aux_layer_0": 0.0098419189453125, "loss_aux_layer_1": 0.0274658203125, "loss_aux_layer_10": 0.05194091796875, "loss_aux_layer_11": 0.05560302734375, "loss_aux_layer_12": 0.059814453125, "loss_aux_layer_13": 0.0650634765625, "loss_aux_layer_14": 0.0731201171875, "loss_aux_layer_15": 0.0811767578125, "loss_aux_layer_16": 0.09033203125, "loss_aux_layer_17": 0.0982666015625, "loss_aux_layer_18": 0.1060791015625, "loss_aux_layer_19": 0.1097412109375, "loss_aux_layer_2": 0.037841796875, "loss_aux_layer_20": 0.1177978515625, "loss_aux_layer_21": 0.125732421875, "loss_aux_layer_22": 0.144775390625, "loss_aux_layer_23": 0.18017578125, "loss_aux_layer_3": 0.04669189453125, "loss_aux_layer_4": 0.04931640625, "loss_aux_layer_5": 0.05078125, "loss_aux_layer_6": 0.05364990234375, "loss_aux_layer_7": 0.052001953125, "loss_aux_layer_8": 0.051513671875, "loss_aux_layer_9": 0.05059814453125, "step": 4558, "total_loss": 0.57631815969944 }, { "epoch": 0.9025935458325084, "grad_norm": 0.7491773366928101, "learning_rate": 5e-05, "llm_loss": 0.5889192819595337, "loss": 2.6661, "loss_aux_layer_0": 0.0101776123046875, "loss_aux_layer_1": 0.029296875, "loss_aux_layer_10": 0.0552978515625, "loss_aux_layer_11": 0.059326171875, "loss_aux_layer_12": 0.06353759765625, "loss_aux_layer_13": 0.0689697265625, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.0849609375, "loss_aux_layer_16": 0.0938720703125, "loss_aux_layer_17": 0.1015625, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.1121826171875, "loss_aux_layer_2": 0.04071044921875, "loss_aux_layer_20": 0.1197509765625, "loss_aux_layer_21": 0.127197265625, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.0504150390625, "loss_aux_layer_4": 0.05291748046875, "loss_aux_layer_5": 0.05438232421875, "loss_aux_layer_6": 0.05755615234375, "loss_aux_layer_7": 0.0556640625, "loss_aux_layer_8": 0.05511474609375, "loss_aux_layer_9": 0.053955078125, "step": 4559, "total_loss": 0.6665147989988327 }, { "epoch": 0.9027915264304098, "grad_norm": 0.7955593466758728, "learning_rate": 5e-05, "llm_loss": 0.6577511131763458, "loss": 2.9325, "loss_aux_layer_0": 0.010223388671875, "loss_aux_layer_1": 0.028717041015625, "loss_aux_layer_10": 0.05322265625, "loss_aux_layer_11": 0.05682373046875, "loss_aux_layer_12": 0.06085205078125, "loss_aux_layer_13": 0.06610107421875, "loss_aux_layer_14": 0.0740966796875, "loss_aux_layer_15": 0.0823974609375, "loss_aux_layer_16": 0.091796875, "loss_aux_layer_17": 0.0992431640625, "loss_aux_layer_18": 0.10693359375, "loss_aux_layer_19": 0.110107421875, "loss_aux_layer_2": 0.03924560546875, "loss_aux_layer_20": 0.11767578125, "loss_aux_layer_21": 0.1246337890625, "loss_aux_layer_22": 0.142822265625, "loss_aux_layer_23": 0.177978515625, "loss_aux_layer_3": 0.048828125, "loss_aux_layer_4": 0.05108642578125, "loss_aux_layer_5": 0.0526123046875, "loss_aux_layer_6": 0.055419921875, "loss_aux_layer_7": 0.053466796875, "loss_aux_layer_8": 0.052978515625, "loss_aux_layer_9": 0.052001953125, "step": 4560, "total_loss": 0.7331322282552719 }, { "epoch": 0.9029895070283113, "grad_norm": 0.8124097585678101, "learning_rate": 5e-05, "llm_loss": 0.5712867677211761, "loss": 2.5969, "loss_aux_layer_0": 0.0107421875, "loss_aux_layer_1": 0.030364990234375, "loss_aux_layer_10": 0.05548095703125, "loss_aux_layer_11": 0.05914306640625, "loss_aux_layer_12": 0.06298828125, "loss_aux_layer_13": 0.0679931640625, "loss_aux_layer_14": 0.0758056640625, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.093017578125, "loss_aux_layer_17": 0.1007080078125, "loss_aux_layer_18": 0.1090087890625, "loss_aux_layer_19": 0.1123046875, "loss_aux_layer_2": 0.04156494140625, "loss_aux_layer_20": 0.1201171875, "loss_aux_layer_21": 0.1278076171875, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.05133056640625, "loss_aux_layer_4": 0.0537109375, "loss_aux_layer_5": 0.0552978515625, "loss_aux_layer_6": 0.05780029296875, "loss_aux_layer_7": 0.05609130859375, "loss_aux_layer_8": 0.05560302734375, "loss_aux_layer_9": 0.0543212890625, "step": 4561, "total_loss": 0.6492269337177277 }, { "epoch": 0.9031874876262126, "grad_norm": 0.7882872819900513, "learning_rate": 5e-05, "llm_loss": 0.5620227158069611, "loss": 2.557, "loss_aux_layer_0": 0.0098114013671875, "loss_aux_layer_1": 0.0296630859375, "loss_aux_layer_10": 0.0546875, "loss_aux_layer_11": 0.058349609375, "loss_aux_layer_12": 0.0625, "loss_aux_layer_13": 0.067626953125, "loss_aux_layer_14": 0.075927734375, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.09326171875, "loss_aux_layer_17": 0.100830078125, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.1121826171875, "loss_aux_layer_2": 0.04095458984375, "loss_aux_layer_20": 0.1201171875, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.181884765625, "loss_aux_layer_3": 0.0504150390625, "loss_aux_layer_4": 0.05279541015625, "loss_aux_layer_5": 0.0543212890625, "loss_aux_layer_6": 0.05718994140625, "loss_aux_layer_7": 0.0555419921875, "loss_aux_layer_8": 0.054931640625, "loss_aux_layer_9": 0.0535888671875, "step": 4562, "total_loss": 0.6392575651407242 }, { "epoch": 0.903385468224114, "grad_norm": 0.7777034640312195, "learning_rate": 5e-05, "llm_loss": 0.5067879930138588, "loss": 2.3419, "loss_aux_layer_0": 0.01055908203125, "loss_aux_layer_1": 0.03009033203125, "loss_aux_layer_10": 0.0562744140625, "loss_aux_layer_11": 0.060302734375, "loss_aux_layer_12": 0.06439208984375, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.0772705078125, "loss_aux_layer_15": 0.0850830078125, "loss_aux_layer_16": 0.093994140625, "loss_aux_layer_17": 0.1011962890625, "loss_aux_layer_18": 0.1092529296875, "loss_aux_layer_19": 0.1123046875, "loss_aux_layer_2": 0.04156494140625, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.1287841796875, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.05133056640625, "loss_aux_layer_4": 0.05401611328125, "loss_aux_layer_5": 0.05560302734375, "loss_aux_layer_6": 0.05859375, "loss_aux_layer_7": 0.056884765625, "loss_aux_layer_8": 0.05633544921875, "loss_aux_layer_9": 0.05517578125, "step": 4563, "total_loss": 0.585470974445343 }, { "epoch": 0.9035834488220155, "grad_norm": 0.7868382334709167, "learning_rate": 5e-05, "llm_loss": 0.5308856442570686, "loss": 2.4341, "loss_aux_layer_0": 0.0100250244140625, "loss_aux_layer_1": 0.028564453125, "loss_aux_layer_10": 0.054931640625, "loss_aux_layer_11": 0.0587158203125, "loss_aux_layer_12": 0.06292724609375, "loss_aux_layer_13": 0.0679931640625, "loss_aux_layer_14": 0.0760498046875, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.1007080078125, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.0400390625, "loss_aux_layer_20": 0.1209716796875, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.0494384765625, "loss_aux_layer_4": 0.052001953125, "loss_aux_layer_5": 0.0538330078125, "loss_aux_layer_6": 0.05694580078125, "loss_aux_layer_7": 0.0552978515625, "loss_aux_layer_8": 0.0546875, "loss_aux_layer_9": 0.0537109375, "step": 4564, "total_loss": 0.6085174381732941 }, { "epoch": 0.9037814294199168, "grad_norm": 0.7507492303848267, "learning_rate": 5e-05, "llm_loss": 0.5994449108839035, "loss": 2.7061, "loss_aux_layer_0": 0.01055908203125, "loss_aux_layer_1": 0.028564453125, "loss_aux_layer_10": 0.05474853515625, "loss_aux_layer_11": 0.05865478515625, "loss_aux_layer_12": 0.06292724609375, "loss_aux_layer_13": 0.0679931640625, "loss_aux_layer_14": 0.075927734375, "loss_aux_layer_15": 0.084228515625, "loss_aux_layer_16": 0.09326171875, "loss_aux_layer_17": 0.1005859375, "loss_aux_layer_18": 0.108154296875, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.03985595703125, "loss_aux_layer_20": 0.1197509765625, "loss_aux_layer_21": 0.1275634765625, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.04925537109375, "loss_aux_layer_4": 0.05194091796875, "loss_aux_layer_5": 0.05352783203125, "loss_aux_layer_6": 0.05657958984375, "loss_aux_layer_7": 0.05511474609375, "loss_aux_layer_8": 0.05450439453125, "loss_aux_layer_9": 0.05340576171875, "step": 4565, "total_loss": 0.6765315234661102 }, { "epoch": 0.9039794100178182, "grad_norm": 0.8407086730003357, "learning_rate": 5e-05, "llm_loss": 0.5920037180185318, "loss": 2.6766, "loss_aux_layer_0": 0.0096435546875, "loss_aux_layer_1": 0.029205322265625, "loss_aux_layer_10": 0.05487060546875, "loss_aux_layer_11": 0.05853271484375, "loss_aux_layer_12": 0.06256103515625, "loss_aux_layer_13": 0.0672607421875, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.08349609375, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.0999755859375, "loss_aux_layer_18": 0.10791015625, "loss_aux_layer_19": 0.1112060546875, "loss_aux_layer_2": 0.040771484375, "loss_aux_layer_20": 0.118896484375, "loss_aux_layer_21": 0.126953125, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.05047607421875, "loss_aux_layer_4": 0.05303955078125, "loss_aux_layer_5": 0.054443359375, "loss_aux_layer_6": 0.0572509765625, "loss_aux_layer_7": 0.05548095703125, "loss_aux_layer_8": 0.054931640625, "loss_aux_layer_9": 0.05377197265625, "step": 4566, "total_loss": 0.6691588312387466 }, { "epoch": 0.9041773906157197, "grad_norm": 0.866900622844696, "learning_rate": 5e-05, "llm_loss": 0.5927063971757889, "loss": 2.6778, "loss_aux_layer_0": 0.009857177734375, "loss_aux_layer_1": 0.028839111328125, "loss_aux_layer_10": 0.0546875, "loss_aux_layer_11": 0.0584716796875, "loss_aux_layer_12": 0.06268310546875, "loss_aux_layer_13": 0.0675048828125, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.091552734375, "loss_aux_layer_17": 0.098876953125, "loss_aux_layer_18": 0.1068115234375, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.04046630859375, "loss_aux_layer_20": 0.11767578125, "loss_aux_layer_21": 0.1263427734375, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.0499267578125, "loss_aux_layer_4": 0.0523681640625, "loss_aux_layer_5": 0.0537109375, "loss_aux_layer_6": 0.05657958984375, "loss_aux_layer_7": 0.05511474609375, "loss_aux_layer_8": 0.05462646484375, "loss_aux_layer_9": 0.05340576171875, "step": 4567, "total_loss": 0.6694566756486893 }, { "epoch": 0.9043753712136211, "grad_norm": 0.7556458115577698, "learning_rate": 5e-05, "llm_loss": 0.510673999786377, "loss": 2.3673, "loss_aux_layer_0": 0.0101470947265625, "loss_aux_layer_1": 0.03131103515625, "loss_aux_layer_10": 0.05902099609375, "loss_aux_layer_11": 0.0631103515625, "loss_aux_layer_12": 0.0673828125, "loss_aux_layer_13": 0.0723876953125, "loss_aux_layer_14": 0.08056640625, "loss_aux_layer_15": 0.088134765625, "loss_aux_layer_16": 0.0966796875, "loss_aux_layer_17": 0.1041259765625, "loss_aux_layer_18": 0.112060546875, "loss_aux_layer_19": 0.115234375, "loss_aux_layer_2": 0.0439453125, "loss_aux_layer_20": 0.1226806640625, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.15380859375, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.0535888671875, "loss_aux_layer_4": 0.05621337890625, "loss_aux_layer_5": 0.0579833984375, "loss_aux_layer_6": 0.06103515625, "loss_aux_layer_7": 0.059326171875, "loss_aux_layer_8": 0.058837890625, "loss_aux_layer_9": 0.0576171875, "step": 4568, "total_loss": 0.5918330326676369 }, { "epoch": 0.9045733518115224, "grad_norm": 0.8250210881233215, "learning_rate": 5e-05, "llm_loss": 0.610820934176445, "loss": 2.7517, "loss_aux_layer_0": 0.0102691650390625, "loss_aux_layer_1": 0.028961181640625, "loss_aux_layer_10": 0.05389404296875, "loss_aux_layer_11": 0.0579833984375, "loss_aux_layer_12": 0.0623779296875, "loss_aux_layer_13": 0.06756591796875, "loss_aux_layer_14": 0.075439453125, "loss_aux_layer_15": 0.083740234375, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.100341796875, "loss_aux_layer_18": 0.1085205078125, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.03961181640625, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.1292724609375, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.04901123046875, "loss_aux_layer_4": 0.05133056640625, "loss_aux_layer_5": 0.05291748046875, "loss_aux_layer_6": 0.05560302734375, "loss_aux_layer_7": 0.053955078125, "loss_aux_layer_8": 0.05364990234375, "loss_aux_layer_9": 0.05255126953125, "step": 4569, "total_loss": 0.6879254430532455 }, { "epoch": 0.9047713324094239, "grad_norm": 0.8656564354896545, "learning_rate": 5e-05, "llm_loss": 0.5914729908108711, "loss": 2.6885, "loss_aux_layer_0": 0.0098114013671875, "loss_aux_layer_1": 0.02984619140625, "loss_aux_layer_10": 0.05865478515625, "loss_aux_layer_11": 0.0626220703125, "loss_aux_layer_12": 0.06689453125, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.0799560546875, "loss_aux_layer_15": 0.087890625, "loss_aux_layer_16": 0.0970458984375, "loss_aux_layer_17": 0.104736328125, "loss_aux_layer_18": 0.112548828125, "loss_aux_layer_19": 0.1156005859375, "loss_aux_layer_2": 0.04254150390625, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.05267333984375, "loss_aux_layer_4": 0.0555419921875, "loss_aux_layer_5": 0.0574951171875, "loss_aux_layer_6": 0.0609130859375, "loss_aux_layer_7": 0.0592041015625, "loss_aux_layer_8": 0.05865478515625, "loss_aux_layer_9": 0.0574951171875, "step": 4570, "total_loss": 0.6721213161945343 }, { "epoch": 0.9049693130073253, "grad_norm": 0.8148636221885681, "learning_rate": 5e-05, "llm_loss": 0.5695241689682007, "loss": 2.5954, "loss_aux_layer_0": 0.010223388671875, "loss_aux_layer_1": 0.0306396484375, "loss_aux_layer_10": 0.057373046875, "loss_aux_layer_11": 0.0615234375, "loss_aux_layer_12": 0.0660400390625, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.079833984375, "loss_aux_layer_15": 0.0877685546875, "loss_aux_layer_16": 0.0966796875, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.111572265625, "loss_aux_layer_19": 0.1136474609375, "loss_aux_layer_2": 0.04180908203125, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.1278076171875, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.18212890625, "loss_aux_layer_3": 0.05169677734375, "loss_aux_layer_4": 0.0546875, "loss_aux_layer_5": 0.056396484375, "loss_aux_layer_6": 0.059326171875, "loss_aux_layer_7": 0.0577392578125, "loss_aux_layer_8": 0.05706787109375, "loss_aux_layer_9": 0.05596923828125, "step": 4571, "total_loss": 0.6488439589738846 }, { "epoch": 0.9051672936052267, "grad_norm": 0.8884846568107605, "learning_rate": 5e-05, "llm_loss": 0.5523402690887451, "loss": 2.5165, "loss_aux_layer_0": 0.010986328125, "loss_aux_layer_1": 0.02899169921875, "loss_aux_layer_10": 0.0535888671875, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.061279296875, "loss_aux_layer_13": 0.0665283203125, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.08349609375, "loss_aux_layer_16": 0.0928955078125, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.109130859375, "loss_aux_layer_19": 0.11279296875, "loss_aux_layer_2": 0.03997802734375, "loss_aux_layer_20": 0.1202392578125, "loss_aux_layer_21": 0.1280517578125, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.04901123046875, "loss_aux_layer_4": 0.05157470703125, "loss_aux_layer_5": 0.05303955078125, "loss_aux_layer_6": 0.0557861328125, "loss_aux_layer_7": 0.053955078125, "loss_aux_layer_8": 0.05352783203125, "loss_aux_layer_9": 0.05255126953125, "step": 4572, "total_loss": 0.6291277557611465 }, { "epoch": 0.9053652742031281, "grad_norm": 0.8417229652404785, "learning_rate": 5e-05, "llm_loss": 0.6226174831390381, "loss": 2.8087, "loss_aux_layer_0": 0.00982666015625, "loss_aux_layer_1": 0.030120849609375, "loss_aux_layer_10": 0.05718994140625, "loss_aux_layer_11": 0.061279296875, "loss_aux_layer_12": 0.06549072265625, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.0789794921875, "loss_aux_layer_15": 0.0869140625, "loss_aux_layer_16": 0.0958251953125, "loss_aux_layer_17": 0.1038818359375, "loss_aux_layer_18": 0.1112060546875, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.04248046875, "loss_aux_layer_20": 0.121337890625, "loss_aux_layer_21": 0.1287841796875, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.05242919921875, "loss_aux_layer_4": 0.0550537109375, "loss_aux_layer_5": 0.056640625, "loss_aux_layer_6": 0.05938720703125, "loss_aux_layer_7": 0.05767822265625, "loss_aux_layer_8": 0.05706787109375, "loss_aux_layer_9": 0.0560302734375, "step": 4573, "total_loss": 0.7021703571081161 }, { "epoch": 0.9055632548010295, "grad_norm": 0.954958438873291, "learning_rate": 5e-05, "llm_loss": 0.5613537728786469, "loss": 2.5513, "loss_aux_layer_0": 0.0104827880859375, "loss_aux_layer_1": 0.028411865234375, "loss_aux_layer_10": 0.05377197265625, "loss_aux_layer_11": 0.0576171875, "loss_aux_layer_12": 0.0618896484375, "loss_aux_layer_13": 0.0672607421875, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.0916748046875, "loss_aux_layer_17": 0.0992431640625, "loss_aux_layer_18": 0.1072998046875, "loss_aux_layer_19": 0.1107177734375, "loss_aux_layer_2": 0.03948974609375, "loss_aux_layer_20": 0.1185302734375, "loss_aux_layer_21": 0.127685546875, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.048828125, "loss_aux_layer_4": 0.0513916015625, "loss_aux_layer_5": 0.052978515625, "loss_aux_layer_6": 0.05584716796875, "loss_aux_layer_7": 0.05419921875, "loss_aux_layer_8": 0.0535888671875, "loss_aux_layer_9": 0.05255126953125, "step": 4574, "total_loss": 0.6378306001424789 }, { "epoch": 0.9057612353989309, "grad_norm": 0.8554589748382568, "learning_rate": 5e-05, "llm_loss": 0.5078350305557251, "loss": 2.3285, "loss_aux_layer_0": 0.010162353515625, "loss_aux_layer_1": 0.026763916015625, "loss_aux_layer_10": 0.05120849609375, "loss_aux_layer_11": 0.05487060546875, "loss_aux_layer_12": 0.059326171875, "loss_aux_layer_13": 0.06463623046875, "loss_aux_layer_14": 0.0728759765625, "loss_aux_layer_15": 0.081298828125, "loss_aux_layer_16": 0.0904541015625, "loss_aux_layer_17": 0.0982666015625, "loss_aux_layer_18": 0.106201171875, "loss_aux_layer_19": 0.1103515625, "loss_aux_layer_2": 0.03717041015625, "loss_aux_layer_20": 0.1180419921875, "loss_aux_layer_21": 0.12646484375, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.181884765625, "loss_aux_layer_3": 0.04595947265625, "loss_aux_layer_4": 0.04815673828125, "loss_aux_layer_5": 0.04949951171875, "loss_aux_layer_6": 0.05224609375, "loss_aux_layer_7": 0.05072021484375, "loss_aux_layer_8": 0.05047607421875, "loss_aux_layer_9": 0.04986572265625, "step": 4575, "total_loss": 0.5821311846375465 }, { "epoch": 0.9059592159968323, "grad_norm": 1.2696056365966797, "learning_rate": 5e-05, "llm_loss": 0.5393441841006279, "loss": 2.4657, "loss_aux_layer_0": 0.010040283203125, "loss_aux_layer_1": 0.028228759765625, "loss_aux_layer_10": 0.054443359375, "loss_aux_layer_11": 0.057861328125, "loss_aux_layer_12": 0.062255859375, "loss_aux_layer_13": 0.0679931640625, "loss_aux_layer_14": 0.0760498046875, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.1007080078125, "loss_aux_layer_18": 0.1090087890625, "loss_aux_layer_19": 0.1124267578125, "loss_aux_layer_2": 0.0399169921875, "loss_aux_layer_20": 0.1202392578125, "loss_aux_layer_21": 0.1282958984375, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.0494384765625, "loss_aux_layer_4": 0.0518798828125, "loss_aux_layer_5": 0.05340576171875, "loss_aux_layer_6": 0.05633544921875, "loss_aux_layer_7": 0.05462646484375, "loss_aux_layer_8": 0.05419921875, "loss_aux_layer_9": 0.05328369140625, "step": 4576, "total_loss": 0.6164270788431168 }, { "epoch": 0.9061571965947337, "grad_norm": 0.9254215955734253, "learning_rate": 5e-05, "llm_loss": 0.5604438781738281, "loss": 2.5414, "loss_aux_layer_0": 0.0101470947265625, "loss_aux_layer_1": 0.028167724609375, "loss_aux_layer_10": 0.052734375, "loss_aux_layer_11": 0.05621337890625, "loss_aux_layer_12": 0.06024169921875, "loss_aux_layer_13": 0.0650634765625, "loss_aux_layer_14": 0.0728759765625, "loss_aux_layer_15": 0.0810546875, "loss_aux_layer_16": 0.08984375, "loss_aux_layer_17": 0.0975341796875, "loss_aux_layer_18": 0.10595703125, "loss_aux_layer_19": 0.109619140625, "loss_aux_layer_2": 0.0391845703125, "loss_aux_layer_20": 0.11767578125, "loss_aux_layer_21": 0.124755859375, "loss_aux_layer_22": 0.14404296875, "loss_aux_layer_23": 0.180419921875, "loss_aux_layer_3": 0.04840087890625, "loss_aux_layer_4": 0.05059814453125, "loss_aux_layer_5": 0.052001953125, "loss_aux_layer_6": 0.0548095703125, "loss_aux_layer_7": 0.052978515625, "loss_aux_layer_8": 0.05242919921875, "loss_aux_layer_9": 0.05126953125, "step": 4577, "total_loss": 0.635350838303566 }, { "epoch": 0.9063551771926351, "grad_norm": 1.0712978839874268, "learning_rate": 5e-05, "llm_loss": 0.5357402488589287, "loss": 2.4369, "loss_aux_layer_0": 0.0106964111328125, "loss_aux_layer_1": 0.02789306640625, "loss_aux_layer_10": 0.05078125, "loss_aux_layer_11": 0.054443359375, "loss_aux_layer_12": 0.0584716796875, "loss_aux_layer_13": 0.0635986328125, "loss_aux_layer_14": 0.0714111328125, "loss_aux_layer_15": 0.079345703125, "loss_aux_layer_16": 0.0882568359375, "loss_aux_layer_17": 0.095947265625, "loss_aux_layer_18": 0.1036376953125, "loss_aux_layer_19": 0.1075439453125, "loss_aux_layer_2": 0.0389404296875, "loss_aux_layer_20": 0.1160888671875, "loss_aux_layer_21": 0.12353515625, "loss_aux_layer_22": 0.14306640625, "loss_aux_layer_23": 0.177490234375, "loss_aux_layer_3": 0.0474853515625, "loss_aux_layer_4": 0.04931640625, "loss_aux_layer_5": 0.05035400390625, "loss_aux_layer_6": 0.0531005859375, "loss_aux_layer_7": 0.0513916015625, "loss_aux_layer_8": 0.05078125, "loss_aux_layer_9": 0.0496826171875, "step": 4578, "total_loss": 0.6092171221971512 }, { "epoch": 0.9065531577905366, "grad_norm": 0.8860811591148376, "learning_rate": 5e-05, "llm_loss": 0.4916345626115799, "loss": 2.2796, "loss_aux_layer_0": 0.010528564453125, "loss_aux_layer_1": 0.029296875, "loss_aux_layer_10": 0.055908203125, "loss_aux_layer_11": 0.059814453125, "loss_aux_layer_12": 0.06396484375, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.077392578125, "loss_aux_layer_15": 0.085693359375, "loss_aux_layer_16": 0.0943603515625, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.113037109375, "loss_aux_layer_2": 0.04107666015625, "loss_aux_layer_20": 0.1201171875, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.05096435546875, "loss_aux_layer_4": 0.053466796875, "loss_aux_layer_5": 0.0548095703125, "loss_aux_layer_6": 0.0576171875, "loss_aux_layer_7": 0.0560302734375, "loss_aux_layer_8": 0.05560302734375, "loss_aux_layer_9": 0.05462646484375, "step": 4579, "total_loss": 0.569889634847641 }, { "epoch": 0.9067511383884379, "grad_norm": 0.8750795125961304, "learning_rate": 5e-05, "llm_loss": 0.5645778328180313, "loss": 2.5844, "loss_aux_layer_0": 0.0112152099609375, "loss_aux_layer_1": 0.030975341796875, "loss_aux_layer_10": 0.05902099609375, "loss_aux_layer_11": 0.06298828125, "loss_aux_layer_12": 0.0673828125, "loss_aux_layer_13": 0.0726318359375, "loss_aux_layer_14": 0.080322265625, "loss_aux_layer_15": 0.0882568359375, "loss_aux_layer_16": 0.09716796875, "loss_aux_layer_17": 0.1044921875, "loss_aux_layer_18": 0.112548828125, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.04351806640625, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.0535888671875, "loss_aux_layer_4": 0.056396484375, "loss_aux_layer_5": 0.05816650390625, "loss_aux_layer_6": 0.061279296875, "loss_aux_layer_7": 0.05950927734375, "loss_aux_layer_8": 0.0592041015625, "loss_aux_layer_9": 0.05792236328125, "step": 4580, "total_loss": 0.6460966467857361 }, { "epoch": 0.9069491189863393, "grad_norm": 0.9074639678001404, "learning_rate": 5e-05, "llm_loss": 0.6438333690166473, "loss": 2.8855, "loss_aux_layer_0": 0.0100555419921875, "loss_aux_layer_1": 0.02862548828125, "loss_aux_layer_10": 0.0543212890625, "loss_aux_layer_11": 0.058349609375, "loss_aux_layer_12": 0.06280517578125, "loss_aux_layer_13": 0.0679931640625, "loss_aux_layer_14": 0.075927734375, "loss_aux_layer_15": 0.084228515625, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.1016845703125, "loss_aux_layer_18": 0.1097412109375, "loss_aux_layer_19": 0.11328125, "loss_aux_layer_2": 0.03961181640625, "loss_aux_layer_20": 0.1214599609375, "loss_aux_layer_21": 0.1295166015625, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.18603515625, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.05194091796875, "loss_aux_layer_5": 0.05352783203125, "loss_aux_layer_6": 0.05670166015625, "loss_aux_layer_7": 0.054931640625, "loss_aux_layer_8": 0.054443359375, "loss_aux_layer_9": 0.05322265625, "step": 4581, "total_loss": 0.721385270357132 }, { "epoch": 0.9071470995842408, "grad_norm": 0.892095685005188, "learning_rate": 5e-05, "llm_loss": 0.5779468864202499, "loss": 2.624, "loss_aux_layer_0": 0.0115966796875, "loss_aux_layer_1": 0.03082275390625, "loss_aux_layer_10": 0.0560302734375, "loss_aux_layer_11": 0.0601806640625, "loss_aux_layer_12": 0.064208984375, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.076904296875, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.1007080078125, "loss_aux_layer_18": 0.1080322265625, "loss_aux_layer_19": 0.11083984375, "loss_aux_layer_2": 0.0418701171875, "loss_aux_layer_20": 0.1182861328125, "loss_aux_layer_21": 0.12646484375, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.18310546875, "loss_aux_layer_3": 0.0517578125, "loss_aux_layer_4": 0.05438232421875, "loss_aux_layer_5": 0.0560302734375, "loss_aux_layer_6": 0.058837890625, "loss_aux_layer_7": 0.0570068359375, "loss_aux_layer_8": 0.05615234375, "loss_aux_layer_9": 0.05487060546875, "step": 4582, "total_loss": 0.6560058668255806 }, { "epoch": 0.9073450801821421, "grad_norm": 0.907688319683075, "learning_rate": 5e-05, "llm_loss": 0.568469449877739, "loss": 2.5849, "loss_aux_layer_0": 0.0113372802734375, "loss_aux_layer_1": 0.0294189453125, "loss_aux_layer_10": 0.0545654296875, "loss_aux_layer_11": 0.05841064453125, "loss_aux_layer_12": 0.06280517578125, "loss_aux_layer_13": 0.06781005859375, "loss_aux_layer_14": 0.075927734375, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.0931396484375, "loss_aux_layer_17": 0.1011962890625, "loss_aux_layer_18": 0.1097412109375, "loss_aux_layer_19": 0.113525390625, "loss_aux_layer_2": 0.04095458984375, "loss_aux_layer_20": 0.121337890625, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.05023193359375, "loss_aux_layer_4": 0.05255126953125, "loss_aux_layer_5": 0.0540771484375, "loss_aux_layer_6": 0.056640625, "loss_aux_layer_7": 0.054931640625, "loss_aux_layer_8": 0.0540771484375, "loss_aux_layer_9": 0.05328369140625, "step": 4583, "total_loss": 0.6462182998657227 }, { "epoch": 0.9075430607800435, "grad_norm": 0.9389263987541199, "learning_rate": 5e-05, "llm_loss": 0.5850563198328018, "loss": 2.6395, "loss_aux_layer_0": 0.0107269287109375, "loss_aux_layer_1": 0.0281982421875, "loss_aux_layer_10": 0.0526123046875, "loss_aux_layer_11": 0.0562744140625, "loss_aux_layer_12": 0.06036376953125, "loss_aux_layer_13": 0.06549072265625, "loss_aux_layer_14": 0.0732421875, "loss_aux_layer_15": 0.0811767578125, "loss_aux_layer_16": 0.0902099609375, "loss_aux_layer_17": 0.09765625, "loss_aux_layer_18": 0.10546875, "loss_aux_layer_19": 0.1087646484375, "loss_aux_layer_2": 0.039794921875, "loss_aux_layer_20": 0.1168212890625, "loss_aux_layer_21": 0.1240234375, "loss_aux_layer_22": 0.142822265625, "loss_aux_layer_23": 0.17724609375, "loss_aux_layer_3": 0.048583984375, "loss_aux_layer_4": 0.05078125, "loss_aux_layer_5": 0.05224609375, "loss_aux_layer_6": 0.05474853515625, "loss_aux_layer_7": 0.05322265625, "loss_aux_layer_8": 0.05291748046875, "loss_aux_layer_9": 0.05169677734375, "step": 4584, "total_loss": 0.6598653793334961 }, { "epoch": 0.907741041377945, "grad_norm": 0.8608658313751221, "learning_rate": 5e-05, "llm_loss": 0.5783572494983673, "loss": 2.6229, "loss_aux_layer_0": 0.010223388671875, "loss_aux_layer_1": 0.02880859375, "loss_aux_layer_10": 0.0548095703125, "loss_aux_layer_11": 0.058349609375, "loss_aux_layer_12": 0.0623779296875, "loss_aux_layer_13": 0.0672607421875, "loss_aux_layer_14": 0.075439453125, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.0999755859375, "loss_aux_layer_18": 0.1082763671875, "loss_aux_layer_19": 0.1123046875, "loss_aux_layer_2": 0.04052734375, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.1292724609375, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.050048828125, "loss_aux_layer_4": 0.05242919921875, "loss_aux_layer_5": 0.053955078125, "loss_aux_layer_6": 0.0570068359375, "loss_aux_layer_7": 0.05499267578125, "loss_aux_layer_8": 0.05462646484375, "loss_aux_layer_9": 0.0535888671875, "step": 4585, "total_loss": 0.6557316333055496 }, { "epoch": 0.9079390219758464, "grad_norm": 1.1902658939361572, "learning_rate": 5e-05, "llm_loss": 0.5884197354316711, "loss": 2.665, "loss_aux_layer_0": 0.010772705078125, "loss_aux_layer_1": 0.028533935546875, "loss_aux_layer_10": 0.05426025390625, "loss_aux_layer_11": 0.05792236328125, "loss_aux_layer_12": 0.06219482421875, "loss_aux_layer_13": 0.0677490234375, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.0853271484375, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.102783203125, "loss_aux_layer_18": 0.111083984375, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.04010009765625, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.04986572265625, "loss_aux_layer_4": 0.05206298828125, "loss_aux_layer_5": 0.05340576171875, "loss_aux_layer_6": 0.0565185546875, "loss_aux_layer_7": 0.05462646484375, "loss_aux_layer_8": 0.0540771484375, "loss_aux_layer_9": 0.0531005859375, "step": 4586, "total_loss": 0.6662504374980927 }, { "epoch": 0.9081370025737477, "grad_norm": 0.8356343507766724, "learning_rate": 5e-05, "llm_loss": 0.5487486869096756, "loss": 2.5177, "loss_aux_layer_0": 0.0102386474609375, "loss_aux_layer_1": 0.0302734375, "loss_aux_layer_10": 0.05755615234375, "loss_aux_layer_11": 0.06170654296875, "loss_aux_layer_12": 0.0662841796875, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.080078125, "loss_aux_layer_15": 0.0887451171875, "loss_aux_layer_16": 0.0982666015625, "loss_aux_layer_17": 0.10546875, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.1168212890625, "loss_aux_layer_2": 0.04217529296875, "loss_aux_layer_20": 0.124267578125, "loss_aux_layer_21": 0.1318359375, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.05224609375, "loss_aux_layer_4": 0.05487060546875, "loss_aux_layer_5": 0.05645751953125, "loss_aux_layer_6": 0.0596923828125, "loss_aux_layer_7": 0.05804443359375, "loss_aux_layer_8": 0.0574951171875, "loss_aux_layer_9": 0.05621337890625, "step": 4587, "total_loss": 0.629416286945343 }, { "epoch": 0.9083349831716492, "grad_norm": 0.9124560952186584, "learning_rate": 5e-05, "llm_loss": 0.5211945027112961, "loss": 2.3845, "loss_aux_layer_0": 0.010955810546875, "loss_aux_layer_1": 0.0264892578125, "loss_aux_layer_10": 0.0518798828125, "loss_aux_layer_11": 0.0555419921875, "loss_aux_layer_12": 0.0595703125, "loss_aux_layer_13": 0.06494140625, "loss_aux_layer_14": 0.0731201171875, "loss_aux_layer_15": 0.08154296875, "loss_aux_layer_16": 0.0906982421875, "loss_aux_layer_17": 0.0986328125, "loss_aux_layer_18": 0.106689453125, "loss_aux_layer_19": 0.1109619140625, "loss_aux_layer_2": 0.03741455078125, "loss_aux_layer_20": 0.1190185546875, "loss_aux_layer_21": 0.1275634765625, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.0465087890625, "loss_aux_layer_4": 0.04876708984375, "loss_aux_layer_5": 0.050537109375, "loss_aux_layer_6": 0.0533447265625, "loss_aux_layer_7": 0.0517578125, "loss_aux_layer_8": 0.0513916015625, "loss_aux_layer_9": 0.0506591796875, "step": 4588, "total_loss": 0.5961306393146515 }, { "epoch": 0.9085329637695506, "grad_norm": 0.9713057279586792, "learning_rate": 5e-05, "llm_loss": 0.5740471929311752, "loss": 2.614, "loss_aux_layer_0": 0.010528564453125, "loss_aux_layer_1": 0.030059814453125, "loss_aux_layer_10": 0.05633544921875, "loss_aux_layer_11": 0.0604248046875, "loss_aux_layer_12": 0.06488037109375, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.0789794921875, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.095947265625, "loss_aux_layer_17": 0.1031494140625, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.1148681640625, "loss_aux_layer_2": 0.0419921875, "loss_aux_layer_20": 0.1229248046875, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.05145263671875, "loss_aux_layer_4": 0.05401611328125, "loss_aux_layer_5": 0.0557861328125, "loss_aux_layer_6": 0.0587158203125, "loss_aux_layer_7": 0.056884765625, "loss_aux_layer_8": 0.05609130859375, "loss_aux_layer_9": 0.0550537109375, "step": 4589, "total_loss": 0.6535098105669022 }, { "epoch": 0.9087309443674519, "grad_norm": 0.888253927230835, "learning_rate": 5e-05, "llm_loss": 0.5213027745485306, "loss": 2.4075, "loss_aux_layer_0": 0.0114288330078125, "loss_aux_layer_1": 0.030120849609375, "loss_aux_layer_10": 0.0574951171875, "loss_aux_layer_11": 0.06158447265625, "loss_aux_layer_12": 0.06591796875, "loss_aux_layer_13": 0.0709228515625, "loss_aux_layer_14": 0.0794677734375, "loss_aux_layer_15": 0.088134765625, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.1131591796875, "loss_aux_layer_19": 0.1165771484375, "loss_aux_layer_2": 0.04278564453125, "loss_aux_layer_20": 0.124267578125, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.05255126953125, "loss_aux_layer_4": 0.05523681640625, "loss_aux_layer_5": 0.056884765625, "loss_aux_layer_6": 0.0599365234375, "loss_aux_layer_7": 0.05792236328125, "loss_aux_layer_8": 0.057373046875, "loss_aux_layer_9": 0.05633544921875, "step": 4590, "total_loss": 0.6018634736537933 }, { "epoch": 0.9089289249653534, "grad_norm": 0.8803752660751343, "learning_rate": 5e-05, "llm_loss": 0.5524168461561203, "loss": 2.522, "loss_aux_layer_0": 0.009796142578125, "loss_aux_layer_1": 0.028961181640625, "loss_aux_layer_10": 0.056640625, "loss_aux_layer_11": 0.06036376953125, "loss_aux_layer_12": 0.064453125, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.077392578125, "loss_aux_layer_15": 0.0853271484375, "loss_aux_layer_16": 0.0943603515625, "loss_aux_layer_17": 0.10205078125, "loss_aux_layer_18": 0.110107421875, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.0408935546875, "loss_aux_layer_20": 0.11962890625, "loss_aux_layer_21": 0.126708984375, "loss_aux_layer_22": 0.14599609375, "loss_aux_layer_23": 0.180908203125, "loss_aux_layer_3": 0.05084228515625, "loss_aux_layer_4": 0.05389404296875, "loss_aux_layer_5": 0.05560302734375, "loss_aux_layer_6": 0.05908203125, "loss_aux_layer_7": 0.0572509765625, "loss_aux_layer_8": 0.05670166015625, "loss_aux_layer_9": 0.05535888671875, "step": 4591, "total_loss": 0.630503699183464 }, { "epoch": 0.9091269055632548, "grad_norm": 0.7927771210670471, "learning_rate": 5e-05, "llm_loss": 0.574817068874836, "loss": 2.6102, "loss_aux_layer_0": 0.0110015869140625, "loss_aux_layer_1": 0.02935791015625, "loss_aux_layer_10": 0.05572509765625, "loss_aux_layer_11": 0.0594482421875, "loss_aux_layer_12": 0.0634765625, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.076904296875, "loss_aux_layer_15": 0.08544921875, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.10986328125, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.04052734375, "loss_aux_layer_20": 0.1195068359375, "loss_aux_layer_21": 0.126953125, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.0501708984375, "loss_aux_layer_4": 0.05291748046875, "loss_aux_layer_5": 0.05438232421875, "loss_aux_layer_6": 0.0572509765625, "loss_aux_layer_7": 0.05548095703125, "loss_aux_layer_8": 0.05511474609375, "loss_aux_layer_9": 0.0543212890625, "step": 4592, "total_loss": 0.652558296918869 }, { "epoch": 0.9093248861611563, "grad_norm": 0.913391649723053, "learning_rate": 5e-05, "llm_loss": 0.5184959769248962, "loss": 2.3867, "loss_aux_layer_0": 0.010345458984375, "loss_aux_layer_1": 0.028656005859375, "loss_aux_layer_10": 0.05462646484375, "loss_aux_layer_11": 0.05865478515625, "loss_aux_layer_12": 0.06353759765625, "loss_aux_layer_13": 0.0689697265625, "loss_aux_layer_14": 0.077880859375, "loss_aux_layer_15": 0.0865478515625, "loss_aux_layer_16": 0.095458984375, "loss_aux_layer_17": 0.1033935546875, "loss_aux_layer_18": 0.1114501953125, "loss_aux_layer_19": 0.11474609375, "loss_aux_layer_2": 0.04034423828125, "loss_aux_layer_20": 0.122314453125, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.0496826171875, "loss_aux_layer_4": 0.05206298828125, "loss_aux_layer_5": 0.05340576171875, "loss_aux_layer_6": 0.0562744140625, "loss_aux_layer_7": 0.05450439453125, "loss_aux_layer_8": 0.0540771484375, "loss_aux_layer_9": 0.0531005859375, "step": 4593, "total_loss": 0.5966680496931076 }, { "epoch": 0.9095228667590576, "grad_norm": 0.7701594829559326, "learning_rate": 5e-05, "llm_loss": 0.5207240507006645, "loss": 2.3878, "loss_aux_layer_0": 0.011016845703125, "loss_aux_layer_1": 0.0274658203125, "loss_aux_layer_10": 0.052490234375, "loss_aux_layer_11": 0.0560302734375, "loss_aux_layer_12": 0.05999755859375, "loss_aux_layer_13": 0.06524658203125, "loss_aux_layer_14": 0.0738525390625, "loss_aux_layer_15": 0.0826416015625, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.1002197265625, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.1134033203125, "loss_aux_layer_2": 0.038818359375, "loss_aux_layer_20": 0.1217041015625, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.0478515625, "loss_aux_layer_4": 0.0499267578125, "loss_aux_layer_5": 0.05133056640625, "loss_aux_layer_6": 0.05426025390625, "loss_aux_layer_7": 0.05224609375, "loss_aux_layer_8": 0.0521240234375, "loss_aux_layer_9": 0.05133056640625, "step": 4594, "total_loss": 0.5969549864530563 }, { "epoch": 0.909720847356959, "grad_norm": 0.8250667452812195, "learning_rate": 5e-05, "llm_loss": 0.5809490606188774, "loss": 2.6252, "loss_aux_layer_0": 0.01025390625, "loss_aux_layer_1": 0.0274658203125, "loss_aux_layer_10": 0.05157470703125, "loss_aux_layer_11": 0.05548095703125, "loss_aux_layer_12": 0.05963134765625, "loss_aux_layer_13": 0.0650634765625, "loss_aux_layer_14": 0.073486328125, "loss_aux_layer_15": 0.0821533203125, "loss_aux_layer_16": 0.091796875, "loss_aux_layer_17": 0.0999755859375, "loss_aux_layer_18": 0.1082763671875, "loss_aux_layer_19": 0.1114501953125, "loss_aux_layer_2": 0.0384521484375, "loss_aux_layer_20": 0.1192626953125, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.04742431640625, "loss_aux_layer_4": 0.04986572265625, "loss_aux_layer_5": 0.05145263671875, "loss_aux_layer_6": 0.0540771484375, "loss_aux_layer_7": 0.0523681640625, "loss_aux_layer_8": 0.0517578125, "loss_aux_layer_9": 0.05047607421875, "step": 4595, "total_loss": 0.6563098281621933 }, { "epoch": 0.9099188279548605, "grad_norm": 0.8691863417625427, "learning_rate": 5e-05, "llm_loss": 0.5049208104610443, "loss": 2.3307, "loss_aux_layer_0": 0.010101318359375, "loss_aux_layer_1": 0.029144287109375, "loss_aux_layer_10": 0.05462646484375, "loss_aux_layer_11": 0.0584716796875, "loss_aux_layer_12": 0.0631103515625, "loss_aux_layer_13": 0.0682373046875, "loss_aux_layer_14": 0.076416015625, "loss_aux_layer_15": 0.08447265625, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.1011962890625, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.040283203125, "loss_aux_layer_20": 0.1214599609375, "loss_aux_layer_21": 0.1297607421875, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.04949951171875, "loss_aux_layer_4": 0.052001953125, "loss_aux_layer_5": 0.0535888671875, "loss_aux_layer_6": 0.056396484375, "loss_aux_layer_7": 0.05487060546875, "loss_aux_layer_8": 0.05438232421875, "loss_aux_layer_9": 0.05340576171875, "step": 4596, "total_loss": 0.5826849043369293 }, { "epoch": 0.9101168085527618, "grad_norm": 1.0517046451568604, "learning_rate": 5e-05, "llm_loss": 0.6067996472120285, "loss": 2.7236, "loss_aux_layer_0": 0.0104522705078125, "loss_aux_layer_1": 0.02606201171875, "loss_aux_layer_10": 0.05145263671875, "loss_aux_layer_11": 0.0548095703125, "loss_aux_layer_12": 0.058837890625, "loss_aux_layer_13": 0.0638427734375, "loss_aux_layer_14": 0.0718994140625, "loss_aux_layer_15": 0.080078125, "loss_aux_layer_16": 0.0889892578125, "loss_aux_layer_17": 0.0970458984375, "loss_aux_layer_18": 0.105224609375, "loss_aux_layer_19": 0.1094970703125, "loss_aux_layer_2": 0.0372314453125, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.1258544921875, "loss_aux_layer_22": 0.14599609375, "loss_aux_layer_23": 0.182373046875, "loss_aux_layer_3": 0.04595947265625, "loss_aux_layer_4": 0.04840087890625, "loss_aux_layer_5": 0.05010986328125, "loss_aux_layer_6": 0.05316162109375, "loss_aux_layer_7": 0.051513671875, "loss_aux_layer_8": 0.05108642578125, "loss_aux_layer_9": 0.05023193359375, "step": 4597, "total_loss": 0.680909126996994 }, { "epoch": 0.9103147891506632, "grad_norm": 0.832832932472229, "learning_rate": 5e-05, "llm_loss": 0.5272043198347092, "loss": 2.4144, "loss_aux_layer_0": 0.0106353759765625, "loss_aux_layer_1": 0.028228759765625, "loss_aux_layer_10": 0.05450439453125, "loss_aux_layer_11": 0.05810546875, "loss_aux_layer_12": 0.06195068359375, "loss_aux_layer_13": 0.06683349609375, "loss_aux_layer_14": 0.0745849609375, "loss_aux_layer_15": 0.082275390625, "loss_aux_layer_16": 0.0909423828125, "loss_aux_layer_17": 0.0985107421875, "loss_aux_layer_18": 0.1063232421875, "loss_aux_layer_19": 0.1103515625, "loss_aux_layer_2": 0.04058837890625, "loss_aux_layer_20": 0.1182861328125, "loss_aux_layer_21": 0.1265869140625, "loss_aux_layer_22": 0.14599609375, "loss_aux_layer_23": 0.181640625, "loss_aux_layer_3": 0.0499267578125, "loss_aux_layer_4": 0.052490234375, "loss_aux_layer_5": 0.05401611328125, "loss_aux_layer_6": 0.05694580078125, "loss_aux_layer_7": 0.05535888671875, "loss_aux_layer_8": 0.0548095703125, "loss_aux_layer_9": 0.05352783203125, "step": 4598, "total_loss": 0.6036028414964676 }, { "epoch": 0.9105127697485647, "grad_norm": 1.047471046447754, "learning_rate": 5e-05, "llm_loss": 0.5923865884542465, "loss": 2.6704, "loss_aux_layer_0": 0.011383056640625, "loss_aux_layer_1": 0.02880859375, "loss_aux_layer_10": 0.05322265625, "loss_aux_layer_11": 0.05670166015625, "loss_aux_layer_12": 0.0609130859375, "loss_aux_layer_13": 0.06573486328125, "loss_aux_layer_14": 0.0736083984375, "loss_aux_layer_15": 0.0810546875, "loss_aux_layer_16": 0.08984375, "loss_aux_layer_17": 0.097412109375, "loss_aux_layer_18": 0.1053466796875, "loss_aux_layer_19": 0.1083984375, "loss_aux_layer_2": 0.039794921875, "loss_aux_layer_20": 0.1165771484375, "loss_aux_layer_21": 0.12451171875, "loss_aux_layer_22": 0.14404296875, "loss_aux_layer_23": 0.178466796875, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.05157470703125, "loss_aux_layer_5": 0.0531005859375, "loss_aux_layer_6": 0.05584716796875, "loss_aux_layer_7": 0.0540771484375, "loss_aux_layer_8": 0.05340576171875, "loss_aux_layer_9": 0.0521240234375, "step": 4599, "total_loss": 0.6676069796085358 }, { "epoch": 0.9107107503464661, "grad_norm": 1.0824769735336304, "learning_rate": 5e-05, "llm_loss": 0.5386486276984215, "loss": 2.4687, "loss_aux_layer_0": 0.0100555419921875, "loss_aux_layer_1": 0.029388427734375, "loss_aux_layer_10": 0.05462646484375, "loss_aux_layer_11": 0.058349609375, "loss_aux_layer_12": 0.06298828125, "loss_aux_layer_13": 0.0682373046875, "loss_aux_layer_14": 0.076904296875, "loss_aux_layer_15": 0.08544921875, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.1033935546875, "loss_aux_layer_18": 0.1116943359375, "loss_aux_layer_19": 0.115478515625, "loss_aux_layer_2": 0.04132080078125, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.05078125, "loss_aux_layer_4": 0.05322265625, "loss_aux_layer_5": 0.0545654296875, "loss_aux_layer_6": 0.0574951171875, "loss_aux_layer_7": 0.05535888671875, "loss_aux_layer_8": 0.0546875, "loss_aux_layer_9": 0.053466796875, "step": 4600, "total_loss": 0.6171694844961166 }, { "epoch": 0.9109087309443674, "grad_norm": 0.9411707520484924, "learning_rate": 5e-05, "llm_loss": 0.5335241258144379, "loss": 2.4482, "loss_aux_layer_0": 0.01251220703125, "loss_aux_layer_1": 0.029266357421875, "loss_aux_layer_10": 0.05535888671875, "loss_aux_layer_11": 0.05914306640625, "loss_aux_layer_12": 0.0635986328125, "loss_aux_layer_13": 0.0689697265625, "loss_aux_layer_14": 0.0775146484375, "loss_aux_layer_15": 0.085693359375, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.1026611328125, "loss_aux_layer_18": 0.110595703125, "loss_aux_layer_19": 0.1143798828125, "loss_aux_layer_2": 0.0408935546875, "loss_aux_layer_20": 0.1214599609375, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.05029296875, "loss_aux_layer_4": 0.052734375, "loss_aux_layer_5": 0.054443359375, "loss_aux_layer_6": 0.057373046875, "loss_aux_layer_7": 0.055419921875, "loss_aux_layer_8": 0.05523681640625, "loss_aux_layer_9": 0.05413818359375, "step": 4601, "total_loss": 0.6120472699403763 }, { "epoch": 0.9111067115422689, "grad_norm": 1.14139986038208, "learning_rate": 5e-05, "llm_loss": 0.5934762358665466, "loss": 2.6848, "loss_aux_layer_0": 0.0107879638671875, "loss_aux_layer_1": 0.029449462890625, "loss_aux_layer_10": 0.0556640625, "loss_aux_layer_11": 0.05926513671875, "loss_aux_layer_12": 0.0634765625, "loss_aux_layer_13": 0.0687255859375, "loss_aux_layer_14": 0.07666015625, "loss_aux_layer_15": 0.084716796875, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.1005859375, "loss_aux_layer_18": 0.1082763671875, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.04095458984375, "loss_aux_layer_20": 0.119873046875, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.05047607421875, "loss_aux_layer_4": 0.05328369140625, "loss_aux_layer_5": 0.0550537109375, "loss_aux_layer_6": 0.057861328125, "loss_aux_layer_7": 0.05615234375, "loss_aux_layer_8": 0.055419921875, "loss_aux_layer_9": 0.0543212890625, "step": 4602, "total_loss": 0.6712122857570648 }, { "epoch": 0.9113046921401703, "grad_norm": 0.9708243608474731, "learning_rate": 5e-05, "llm_loss": 0.5564160645008087, "loss": 2.5323, "loss_aux_layer_0": 0.0124969482421875, "loss_aux_layer_1": 0.0294189453125, "loss_aux_layer_10": 0.0543212890625, "loss_aux_layer_11": 0.05792236328125, "loss_aux_layer_12": 0.06201171875, "loss_aux_layer_13": 0.06689453125, "loss_aux_layer_14": 0.0748291015625, "loss_aux_layer_15": 0.082763671875, "loss_aux_layer_16": 0.0914306640625, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.1068115234375, "loss_aux_layer_19": 0.1102294921875, "loss_aux_layer_2": 0.041259765625, "loss_aux_layer_20": 0.1177978515625, "loss_aux_layer_21": 0.125732421875, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.05035400390625, "loss_aux_layer_4": 0.05279541015625, "loss_aux_layer_5": 0.0540771484375, "loss_aux_layer_6": 0.05682373046875, "loss_aux_layer_7": 0.05499267578125, "loss_aux_layer_8": 0.05450439453125, "loss_aux_layer_9": 0.05322265625, "step": 4603, "total_loss": 0.6330758184194565 }, { "epoch": 0.9115026727380716, "grad_norm": 1.0377655029296875, "learning_rate": 5e-05, "llm_loss": 0.5420099347829819, "loss": 2.4831, "loss_aux_layer_0": 0.0129547119140625, "loss_aux_layer_1": 0.030120849609375, "loss_aux_layer_10": 0.05474853515625, "loss_aux_layer_11": 0.05859375, "loss_aux_layer_12": 0.06298828125, "loss_aux_layer_13": 0.068359375, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.0853271484375, "loss_aux_layer_16": 0.0947265625, "loss_aux_layer_17": 0.1031494140625, "loss_aux_layer_18": 0.1114501953125, "loss_aux_layer_19": 0.11474609375, "loss_aux_layer_2": 0.04150390625, "loss_aux_layer_20": 0.123046875, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.05072021484375, "loss_aux_layer_4": 0.052978515625, "loss_aux_layer_5": 0.05450439453125, "loss_aux_layer_6": 0.0574951171875, "loss_aux_layer_7": 0.0555419921875, "loss_aux_layer_8": 0.054931640625, "loss_aux_layer_9": 0.0535888671875, "step": 4604, "total_loss": 0.6207637190818787 }, { "epoch": 0.911700653335973, "grad_norm": 0.8537608981132507, "learning_rate": 5e-05, "llm_loss": 0.5799012035131454, "loss": 2.629, "loss_aux_layer_0": 0.0115203857421875, "loss_aux_layer_1": 0.029022216796875, "loss_aux_layer_10": 0.0548095703125, "loss_aux_layer_11": 0.05853271484375, "loss_aux_layer_12": 0.06304931640625, "loss_aux_layer_13": 0.0682373046875, "loss_aux_layer_14": 0.0760498046875, "loss_aux_layer_15": 0.084716796875, "loss_aux_layer_16": 0.0938720703125, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.1097412109375, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.04052734375, "loss_aux_layer_20": 0.119873046875, "loss_aux_layer_21": 0.126953125, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.181396484375, "loss_aux_layer_3": 0.04998779296875, "loss_aux_layer_4": 0.05255126953125, "loss_aux_layer_5": 0.053955078125, "loss_aux_layer_6": 0.05682373046875, "loss_aux_layer_7": 0.0552978515625, "loss_aux_layer_8": 0.0548095703125, "loss_aux_layer_9": 0.05352783203125, "step": 4605, "total_loss": 0.6572429537773132 }, { "epoch": 0.9118986339338745, "grad_norm": 1.049275279045105, "learning_rate": 5e-05, "llm_loss": 0.5679220110177994, "loss": 2.5808, "loss_aux_layer_0": 0.0129547119140625, "loss_aux_layer_1": 0.02984619140625, "loss_aux_layer_10": 0.0538330078125, "loss_aux_layer_11": 0.0574951171875, "loss_aux_layer_12": 0.061767578125, "loss_aux_layer_13": 0.0670166015625, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.083984375, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.1011962890625, "loss_aux_layer_18": 0.1092529296875, "loss_aux_layer_19": 0.11279296875, "loss_aux_layer_2": 0.04119873046875, "loss_aux_layer_20": 0.1202392578125, "loss_aux_layer_21": 0.1282958984375, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.05010986328125, "loss_aux_layer_4": 0.05230712890625, "loss_aux_layer_5": 0.05364990234375, "loss_aux_layer_6": 0.05645751953125, "loss_aux_layer_7": 0.05462646484375, "loss_aux_layer_8": 0.053955078125, "loss_aux_layer_9": 0.0526123046875, "step": 4606, "total_loss": 0.6451925337314606 }, { "epoch": 0.9120966145317759, "grad_norm": 1.1310397386550903, "learning_rate": 5e-05, "llm_loss": 0.5816868245601654, "loss": 2.6243, "loss_aux_layer_0": 0.01165771484375, "loss_aux_layer_1": 0.02703857421875, "loss_aux_layer_10": 0.050537109375, "loss_aux_layer_11": 0.054443359375, "loss_aux_layer_12": 0.05877685546875, "loss_aux_layer_13": 0.06414794921875, "loss_aux_layer_14": 0.0723876953125, "loss_aux_layer_15": 0.0806884765625, "loss_aux_layer_16": 0.090087890625, "loss_aux_layer_17": 0.098388671875, "loss_aux_layer_18": 0.1063232421875, "loss_aux_layer_19": 0.110595703125, "loss_aux_layer_2": 0.037353515625, "loss_aux_layer_20": 0.1182861328125, "loss_aux_layer_21": 0.12646484375, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.04638671875, "loss_aux_layer_4": 0.04864501953125, "loss_aux_layer_5": 0.0499267578125, "loss_aux_layer_6": 0.05267333984375, "loss_aux_layer_7": 0.0509033203125, "loss_aux_layer_8": 0.05023193359375, "loss_aux_layer_9": 0.04931640625, "step": 4607, "total_loss": 0.6560708582401276 }, { "epoch": 0.9122945951296773, "grad_norm": 1.2225579023361206, "learning_rate": 5e-05, "llm_loss": 0.5907254070043564, "loss": 2.6628, "loss_aux_layer_0": 0.0133514404296875, "loss_aux_layer_1": 0.02880859375, "loss_aux_layer_10": 0.0523681640625, "loss_aux_layer_11": 0.0560302734375, "loss_aux_layer_12": 0.06011962890625, "loss_aux_layer_13": 0.065185546875, "loss_aux_layer_14": 0.072998046875, "loss_aux_layer_15": 0.0810546875, "loss_aux_layer_16": 0.0897216796875, "loss_aux_layer_17": 0.09765625, "loss_aux_layer_18": 0.105712890625, "loss_aux_layer_19": 0.109130859375, "loss_aux_layer_2": 0.0399169921875, "loss_aux_layer_20": 0.11669921875, "loss_aux_layer_21": 0.1240234375, "loss_aux_layer_22": 0.143798828125, "loss_aux_layer_23": 0.179443359375, "loss_aux_layer_3": 0.04852294921875, "loss_aux_layer_4": 0.05096435546875, "loss_aux_layer_5": 0.05224609375, "loss_aux_layer_6": 0.0546875, "loss_aux_layer_7": 0.05303955078125, "loss_aux_layer_8": 0.05242919921875, "loss_aux_layer_9": 0.05126953125, "step": 4608, "total_loss": 0.6657117903232574 }, { "epoch": 0.9124925757275787, "grad_norm": 0.8701860308647156, "learning_rate": 5e-05, "llm_loss": 0.547264575958252, "loss": 2.5012, "loss_aux_layer_0": 0.01202392578125, "loss_aux_layer_1": 0.029510498046875, "loss_aux_layer_10": 0.0560302734375, "loss_aux_layer_11": 0.05975341796875, "loss_aux_layer_12": 0.06390380859375, "loss_aux_layer_13": 0.0687255859375, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.08447265625, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.1009521484375, "loss_aux_layer_18": 0.109130859375, "loss_aux_layer_19": 0.1124267578125, "loss_aux_layer_2": 0.041015625, "loss_aux_layer_20": 0.1201171875, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.0509033203125, "loss_aux_layer_4": 0.05352783203125, "loss_aux_layer_5": 0.0552978515625, "loss_aux_layer_6": 0.0582275390625, "loss_aux_layer_7": 0.05670166015625, "loss_aux_layer_8": 0.05621337890625, "loss_aux_layer_9": 0.05499267578125, "step": 4609, "total_loss": 0.6253074109554291 }, { "epoch": 0.9126905563254801, "grad_norm": 1.0017238855361938, "learning_rate": 5e-05, "llm_loss": 0.5733135864138603, "loss": 2.6141, "loss_aux_layer_0": 0.012481689453125, "loss_aux_layer_1": 0.0306396484375, "loss_aux_layer_10": 0.057373046875, "loss_aux_layer_11": 0.06121826171875, "loss_aux_layer_12": 0.0655517578125, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.078857421875, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.0963134765625, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.11181640625, "loss_aux_layer_19": 0.11474609375, "loss_aux_layer_2": 0.04266357421875, "loss_aux_layer_20": 0.1231689453125, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.05218505859375, "loss_aux_layer_4": 0.054931640625, "loss_aux_layer_5": 0.05645751953125, "loss_aux_layer_6": 0.0596923828125, "loss_aux_layer_7": 0.05792236328125, "loss_aux_layer_8": 0.05731201171875, "loss_aux_layer_9": 0.05615234375, "step": 4610, "total_loss": 0.6535132080316544 }, { "epoch": 0.9128885369233815, "grad_norm": 1.0239909887313843, "learning_rate": 5e-05, "llm_loss": 0.5483136773109436, "loss": 2.5073, "loss_aux_layer_0": 0.0121917724609375, "loss_aux_layer_1": 0.02923583984375, "loss_aux_layer_10": 0.05523681640625, "loss_aux_layer_11": 0.059326171875, "loss_aux_layer_12": 0.06365966796875, "loss_aux_layer_13": 0.0689697265625, "loss_aux_layer_14": 0.0775146484375, "loss_aux_layer_15": 0.085693359375, "loss_aux_layer_16": 0.0948486328125, "loss_aux_layer_17": 0.1029052734375, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.04095458984375, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.05035400390625, "loss_aux_layer_4": 0.05291748046875, "loss_aux_layer_5": 0.0543212890625, "loss_aux_layer_6": 0.0572509765625, "loss_aux_layer_7": 0.05560302734375, "loss_aux_layer_8": 0.054931640625, "loss_aux_layer_9": 0.0538330078125, "step": 4611, "total_loss": 0.6268320977687836 }, { "epoch": 0.9130865175212829, "grad_norm": 0.968670129776001, "learning_rate": 5e-05, "llm_loss": 0.5722256079316139, "loss": 2.6014, "loss_aux_layer_0": 0.0110321044921875, "loss_aux_layer_1": 0.028900146484375, "loss_aux_layer_10": 0.0543212890625, "loss_aux_layer_11": 0.05804443359375, "loss_aux_layer_12": 0.06256103515625, "loss_aux_layer_13": 0.06787109375, "loss_aux_layer_14": 0.0760498046875, "loss_aux_layer_15": 0.08447265625, "loss_aux_layer_16": 0.093994140625, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.11083984375, "loss_aux_layer_19": 0.1151123046875, "loss_aux_layer_2": 0.04071044921875, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.1312255859375, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.04998779296875, "loss_aux_layer_4": 0.05224609375, "loss_aux_layer_5": 0.053466796875, "loss_aux_layer_6": 0.05633544921875, "loss_aux_layer_7": 0.0546875, "loss_aux_layer_8": 0.0543212890625, "loss_aux_layer_9": 0.05322265625, "step": 4612, "total_loss": 0.6503422856330872 }, { "epoch": 0.9132844981191843, "grad_norm": 1.0224676132202148, "learning_rate": 5e-05, "llm_loss": 0.5151592269539833, "loss": 2.3815, "loss_aux_layer_0": 0.012939453125, "loss_aux_layer_1": 0.029998779296875, "loss_aux_layer_10": 0.056640625, "loss_aux_layer_11": 0.060546875, "loss_aux_layer_12": 0.06494140625, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.087158203125, "loss_aux_layer_16": 0.0963134765625, "loss_aux_layer_17": 0.1038818359375, "loss_aux_layer_18": 0.1124267578125, "loss_aux_layer_19": 0.116455078125, "loss_aux_layer_2": 0.0418701171875, "loss_aux_layer_20": 0.12451171875, "loss_aux_layer_21": 0.133056640625, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.193603515625, "loss_aux_layer_3": 0.05120849609375, "loss_aux_layer_4": 0.05377197265625, "loss_aux_layer_5": 0.055419921875, "loss_aux_layer_6": 0.0584716796875, "loss_aux_layer_7": 0.05682373046875, "loss_aux_layer_8": 0.0562744140625, "loss_aux_layer_9": 0.0552978515625, "step": 4613, "total_loss": 0.5953833311796188 }, { "epoch": 0.9134824787170858, "grad_norm": 0.8450872302055359, "learning_rate": 5e-05, "llm_loss": 0.5475278347730637, "loss": 2.499, "loss_aux_layer_0": 0.0111236572265625, "loss_aux_layer_1": 0.0289306640625, "loss_aux_layer_10": 0.05419921875, "loss_aux_layer_11": 0.05792236328125, "loss_aux_layer_12": 0.0618896484375, "loss_aux_layer_13": 0.0672607421875, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.092529296875, "loss_aux_layer_17": 0.1007080078125, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.040771484375, "loss_aux_layer_20": 0.1201171875, "loss_aux_layer_21": 0.128173828125, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.05023193359375, "loss_aux_layer_4": 0.05242919921875, "loss_aux_layer_5": 0.0538330078125, "loss_aux_layer_6": 0.056640625, "loss_aux_layer_7": 0.05487060546875, "loss_aux_layer_8": 0.0543212890625, "loss_aux_layer_9": 0.05303955078125, "step": 4614, "total_loss": 0.6247588247060776 }, { "epoch": 0.9136804593149871, "grad_norm": 0.8773815035820007, "learning_rate": 5e-05, "llm_loss": 0.5399696305394173, "loss": 2.4858, "loss_aux_layer_0": 0.0115509033203125, "loss_aux_layer_1": 0.030242919921875, "loss_aux_layer_10": 0.05755615234375, "loss_aux_layer_11": 0.0615234375, "loss_aux_layer_12": 0.0662841796875, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.08935546875, "loss_aux_layer_16": 0.098876953125, "loss_aux_layer_17": 0.1070556640625, "loss_aux_layer_18": 0.1148681640625, "loss_aux_layer_19": 0.1181640625, "loss_aux_layer_2": 0.04254150390625, "loss_aux_layer_20": 0.1260986328125, "loss_aux_layer_21": 0.13427734375, "loss_aux_layer_22": 0.156005859375, "loss_aux_layer_23": 0.19384765625, "loss_aux_layer_3": 0.05230712890625, "loss_aux_layer_4": 0.0548095703125, "loss_aux_layer_5": 0.0565185546875, "loss_aux_layer_6": 0.0594482421875, "loss_aux_layer_7": 0.057861328125, "loss_aux_layer_8": 0.0574951171875, "loss_aux_layer_9": 0.0562744140625, "step": 4615, "total_loss": 0.6214491128921509 }, { "epoch": 0.9138784399128885, "grad_norm": 0.8156583905220032, "learning_rate": 5e-05, "llm_loss": 0.4870205223560333, "loss": 2.2552, "loss_aux_layer_0": 0.011383056640625, "loss_aux_layer_1": 0.02978515625, "loss_aux_layer_10": 0.05523681640625, "loss_aux_layer_11": 0.05908203125, "loss_aux_layer_12": 0.06304931640625, "loss_aux_layer_13": 0.0679931640625, "loss_aux_layer_14": 0.075439453125, "loss_aux_layer_15": 0.0830078125, "loss_aux_layer_16": 0.091552734375, "loss_aux_layer_17": 0.0985107421875, "loss_aux_layer_18": 0.1058349609375, "loss_aux_layer_19": 0.1087646484375, "loss_aux_layer_2": 0.04168701171875, "loss_aux_layer_20": 0.11669921875, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.1455078125, "loss_aux_layer_23": 0.180908203125, "loss_aux_layer_3": 0.0511474609375, "loss_aux_layer_4": 0.0537109375, "loss_aux_layer_5": 0.05511474609375, "loss_aux_layer_6": 0.05780029296875, "loss_aux_layer_7": 0.05615234375, "loss_aux_layer_8": 0.0552978515625, "loss_aux_layer_9": 0.05426025390625, "step": 4616, "total_loss": 0.5638051480054855 }, { "epoch": 0.91407642051079, "grad_norm": 0.8538126349449158, "learning_rate": 5e-05, "llm_loss": 0.5994289666414261, "loss": 2.7077, "loss_aux_layer_0": 0.0098724365234375, "loss_aux_layer_1": 0.029510498046875, "loss_aux_layer_10": 0.05596923828125, "loss_aux_layer_11": 0.05975341796875, "loss_aux_layer_12": 0.06390380859375, "loss_aux_layer_13": 0.06903076171875, "loss_aux_layer_14": 0.076904296875, "loss_aux_layer_15": 0.08447265625, "loss_aux_layer_16": 0.09326171875, "loss_aux_layer_17": 0.100341796875, "loss_aux_layer_18": 0.1083984375, "loss_aux_layer_19": 0.1109619140625, "loss_aux_layer_2": 0.04193115234375, "loss_aux_layer_20": 0.118408203125, "loss_aux_layer_21": 0.125732421875, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.18212890625, "loss_aux_layer_3": 0.0511474609375, "loss_aux_layer_4": 0.05352783203125, "loss_aux_layer_5": 0.05511474609375, "loss_aux_layer_6": 0.057861328125, "loss_aux_layer_7": 0.05609130859375, "loss_aux_layer_8": 0.055419921875, "loss_aux_layer_9": 0.054443359375, "step": 4617, "total_loss": 0.6769204437732697 }, { "epoch": 0.9142744011086914, "grad_norm": 0.8300346732139587, "learning_rate": 5e-05, "llm_loss": 0.5692598521709442, "loss": 2.595, "loss_aux_layer_0": 0.01129150390625, "loss_aux_layer_1": 0.03106689453125, "loss_aux_layer_10": 0.05743408203125, "loss_aux_layer_11": 0.061279296875, "loss_aux_layer_12": 0.0654296875, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.086669921875, "loss_aux_layer_16": 0.09521484375, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.0426025390625, "loss_aux_layer_20": 0.1195068359375, "loss_aux_layer_21": 0.1279296875, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.05279541015625, "loss_aux_layer_4": 0.05548095703125, "loss_aux_layer_5": 0.05706787109375, "loss_aux_layer_6": 0.0604248046875, "loss_aux_layer_7": 0.0584716796875, "loss_aux_layer_8": 0.0574951171875, "loss_aux_layer_9": 0.05621337890625, "step": 4618, "total_loss": 0.6487520337104797 }, { "epoch": 0.9144723817065927, "grad_norm": 0.768066942691803, "learning_rate": 5e-05, "llm_loss": 0.5343006551265717, "loss": 2.4508, "loss_aux_layer_0": 0.0107879638671875, "loss_aux_layer_1": 0.02886962890625, "loss_aux_layer_10": 0.05572509765625, "loss_aux_layer_11": 0.05938720703125, "loss_aux_layer_12": 0.0634765625, "loss_aux_layer_13": 0.0689697265625, "loss_aux_layer_14": 0.076904296875, "loss_aux_layer_15": 0.08544921875, "loss_aux_layer_16": 0.0946044921875, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.110107421875, "loss_aux_layer_19": 0.11376953125, "loss_aux_layer_2": 0.04034423828125, "loss_aux_layer_20": 0.121337890625, "loss_aux_layer_21": 0.1297607421875, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.05023193359375, "loss_aux_layer_4": 0.052978515625, "loss_aux_layer_5": 0.05474853515625, "loss_aux_layer_6": 0.05780029296875, "loss_aux_layer_7": 0.0560302734375, "loss_aux_layer_8": 0.05560302734375, "loss_aux_layer_9": 0.0545654296875, "step": 4619, "total_loss": 0.6126986593008041 }, { "epoch": 0.9146703623044942, "grad_norm": 0.7457331418991089, "learning_rate": 5e-05, "llm_loss": 0.5437168776988983, "loss": 2.4912, "loss_aux_layer_0": 0.0107269287109375, "loss_aux_layer_1": 0.02947998046875, "loss_aux_layer_10": 0.0556640625, "loss_aux_layer_11": 0.059814453125, "loss_aux_layer_12": 0.06414794921875, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.0775146484375, "loss_aux_layer_15": 0.085693359375, "loss_aux_layer_16": 0.09521484375, "loss_aux_layer_17": 0.10302734375, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.04180908203125, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.05120849609375, "loss_aux_layer_4": 0.05364990234375, "loss_aux_layer_5": 0.05511474609375, "loss_aux_layer_6": 0.05804443359375, "loss_aux_layer_7": 0.05615234375, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.054443359375, "step": 4620, "total_loss": 0.622802697122097 }, { "epoch": 0.9148683429023956, "grad_norm": 0.857970118522644, "learning_rate": 5e-05, "llm_loss": 0.5401665642857552, "loss": 2.4539, "loss_aux_layer_0": 0.0106353759765625, "loss_aux_layer_1": 0.027618408203125, "loss_aux_layer_10": 0.05029296875, "loss_aux_layer_11": 0.0537109375, "loss_aux_layer_12": 0.0576171875, "loss_aux_layer_13": 0.06231689453125, "loss_aux_layer_14": 0.0701904296875, "loss_aux_layer_15": 0.0782470703125, "loss_aux_layer_16": 0.0875244140625, "loss_aux_layer_17": 0.0950927734375, "loss_aux_layer_18": 0.103759765625, "loss_aux_layer_19": 0.1082763671875, "loss_aux_layer_2": 0.0377197265625, "loss_aux_layer_20": 0.1165771484375, "loss_aux_layer_21": 0.1248779296875, "loss_aux_layer_22": 0.14501953125, "loss_aux_layer_23": 0.18115234375, "loss_aux_layer_3": 0.04638671875, "loss_aux_layer_4": 0.04876708984375, "loss_aux_layer_5": 0.05023193359375, "loss_aux_layer_6": 0.05267333984375, "loss_aux_layer_7": 0.05084228515625, "loss_aux_layer_8": 0.05023193359375, "loss_aux_layer_9": 0.04925537109375, "step": 4621, "total_loss": 0.6134629845619202 }, { "epoch": 0.9150663235002969, "grad_norm": 0.8295873999595642, "learning_rate": 5e-05, "llm_loss": 0.5757203102111816, "loss": 2.6129, "loss_aux_layer_0": 0.010040283203125, "loss_aux_layer_1": 0.028594970703125, "loss_aux_layer_10": 0.05462646484375, "loss_aux_layer_11": 0.05841064453125, "loss_aux_layer_12": 0.06280517578125, "loss_aux_layer_13": 0.0679931640625, "loss_aux_layer_14": 0.07666015625, "loss_aux_layer_15": 0.0850830078125, "loss_aux_layer_16": 0.093994140625, "loss_aux_layer_17": 0.1014404296875, "loss_aux_layer_18": 0.1097412109375, "loss_aux_layer_19": 0.1134033203125, "loss_aux_layer_2": 0.039794921875, "loss_aux_layer_20": 0.1209716796875, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.04901123046875, "loss_aux_layer_4": 0.051513671875, "loss_aux_layer_5": 0.05303955078125, "loss_aux_layer_6": 0.055908203125, "loss_aux_layer_7": 0.05462646484375, "loss_aux_layer_8": 0.05419921875, "loss_aux_layer_9": 0.05328369140625, "step": 4622, "total_loss": 0.6532171666622162 }, { "epoch": 0.9152643040981984, "grad_norm": 0.9264320135116577, "learning_rate": 5e-05, "llm_loss": 0.6444835215806961, "loss": 2.8873, "loss_aux_layer_0": 0.009918212890625, "loss_aux_layer_1": 0.029144287109375, "loss_aux_layer_10": 0.0555419921875, "loss_aux_layer_11": 0.05950927734375, "loss_aux_layer_12": 0.06396484375, "loss_aux_layer_13": 0.0689697265625, "loss_aux_layer_14": 0.07666015625, "loss_aux_layer_15": 0.08447265625, "loss_aux_layer_16": 0.093017578125, "loss_aux_layer_17": 0.1002197265625, "loss_aux_layer_18": 0.1083984375, "loss_aux_layer_19": 0.111328125, "loss_aux_layer_2": 0.04083251953125, "loss_aux_layer_20": 0.119384765625, "loss_aux_layer_21": 0.1268310546875, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.181396484375, "loss_aux_layer_3": 0.05059814453125, "loss_aux_layer_4": 0.05316162109375, "loss_aux_layer_5": 0.0543212890625, "loss_aux_layer_6": 0.0572509765625, "loss_aux_layer_7": 0.0557861328125, "loss_aux_layer_8": 0.05517578125, "loss_aux_layer_9": 0.05413818359375, "step": 4623, "total_loss": 0.7218184471130371 }, { "epoch": 0.9154622846960998, "grad_norm": 0.7816097736358643, "learning_rate": 5e-05, "llm_loss": 0.5896181017160416, "loss": 2.6535, "loss_aux_layer_0": 0.0103759765625, "loss_aux_layer_1": 0.02825927734375, "loss_aux_layer_10": 0.05169677734375, "loss_aux_layer_11": 0.05535888671875, "loss_aux_layer_12": 0.05938720703125, "loss_aux_layer_13": 0.06414794921875, "loss_aux_layer_14": 0.0718994140625, "loss_aux_layer_15": 0.0797119140625, "loss_aux_layer_16": 0.08837890625, "loss_aux_layer_17": 0.0955810546875, "loss_aux_layer_18": 0.103759765625, "loss_aux_layer_19": 0.10693359375, "loss_aux_layer_2": 0.03900146484375, "loss_aux_layer_20": 0.1143798828125, "loss_aux_layer_21": 0.122802734375, "loss_aux_layer_22": 0.14208984375, "loss_aux_layer_23": 0.177490234375, "loss_aux_layer_3": 0.0479736328125, "loss_aux_layer_4": 0.050537109375, "loss_aux_layer_5": 0.0516357421875, "loss_aux_layer_6": 0.05426025390625, "loss_aux_layer_7": 0.0526123046875, "loss_aux_layer_8": 0.05194091796875, "loss_aux_layer_9": 0.050537109375, "step": 4624, "total_loss": 0.6633830666542053 }, { "epoch": 0.9156602652940012, "grad_norm": 0.8630095720291138, "learning_rate": 5e-05, "llm_loss": 0.5429998338222504, "loss": 2.4764, "loss_aux_layer_0": 0.0101165771484375, "loss_aux_layer_1": 0.02838134765625, "loss_aux_layer_10": 0.05303955078125, "loss_aux_layer_11": 0.0567626953125, "loss_aux_layer_12": 0.06097412109375, "loss_aux_layer_13": 0.0662841796875, "loss_aux_layer_14": 0.0738525390625, "loss_aux_layer_15": 0.0821533203125, "loss_aux_layer_16": 0.091552734375, "loss_aux_layer_17": 0.098876953125, "loss_aux_layer_18": 0.10693359375, "loss_aux_layer_19": 0.1107177734375, "loss_aux_layer_2": 0.03997802734375, "loss_aux_layer_20": 0.11865234375, "loss_aux_layer_21": 0.1275634765625, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.049072265625, "loss_aux_layer_4": 0.05145263671875, "loss_aux_layer_5": 0.052734375, "loss_aux_layer_6": 0.05548095703125, "loss_aux_layer_7": 0.0535888671875, "loss_aux_layer_8": 0.0531005859375, "loss_aux_layer_9": 0.05194091796875, "step": 4625, "total_loss": 0.6190946996212006 }, { "epoch": 0.9158582458919026, "grad_norm": 0.771316647529602, "learning_rate": 5e-05, "llm_loss": 0.5211556032299995, "loss": 2.3777, "loss_aux_layer_0": 0.00970458984375, "loss_aux_layer_1": 0.027679443359375, "loss_aux_layer_10": 0.05145263671875, "loss_aux_layer_11": 0.05499267578125, "loss_aux_layer_12": 0.05902099609375, "loss_aux_layer_13": 0.06390380859375, "loss_aux_layer_14": 0.071533203125, "loss_aux_layer_15": 0.0791015625, "loss_aux_layer_16": 0.087890625, "loss_aux_layer_17": 0.0955810546875, "loss_aux_layer_18": 0.1033935546875, "loss_aux_layer_19": 0.106689453125, "loss_aux_layer_2": 0.03802490234375, "loss_aux_layer_20": 0.1142578125, "loss_aux_layer_21": 0.1224365234375, "loss_aux_layer_22": 0.14208984375, "loss_aux_layer_23": 0.176513671875, "loss_aux_layer_3": 0.04705810546875, "loss_aux_layer_4": 0.04925537109375, "loss_aux_layer_5": 0.0506591796875, "loss_aux_layer_6": 0.05328369140625, "loss_aux_layer_7": 0.0518798828125, "loss_aux_layer_8": 0.05133056640625, "loss_aux_layer_9": 0.05035400390625, "step": 4626, "total_loss": 0.5944167226552963 }, { "epoch": 0.916056226489804, "grad_norm": 0.6756441593170166, "learning_rate": 5e-05, "llm_loss": 0.5280239284038544, "loss": 2.4246, "loss_aux_layer_0": 0.009796142578125, "loss_aux_layer_1": 0.030029296875, "loss_aux_layer_10": 0.056884765625, "loss_aux_layer_11": 0.06072998046875, "loss_aux_layer_12": 0.064697265625, "loss_aux_layer_13": 0.0697021484375, "loss_aux_layer_14": 0.0770263671875, "loss_aux_layer_15": 0.08447265625, "loss_aux_layer_16": 0.093017578125, "loss_aux_layer_17": 0.10009765625, "loss_aux_layer_18": 0.1075439453125, "loss_aux_layer_19": 0.1104736328125, "loss_aux_layer_2": 0.04241943359375, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.126708984375, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.05218505859375, "loss_aux_layer_4": 0.05474853515625, "loss_aux_layer_5": 0.05633544921875, "loss_aux_layer_6": 0.059326171875, "loss_aux_layer_7": 0.05755615234375, "loss_aux_layer_8": 0.056884765625, "loss_aux_layer_9": 0.0556640625, "step": 4627, "total_loss": 0.6061555296182632 }, { "epoch": 0.9162542070877054, "grad_norm": 0.8167654275894165, "learning_rate": 5e-05, "llm_loss": 0.5393849238753319, "loss": 2.4688, "loss_aux_layer_0": 0.0098724365234375, "loss_aux_layer_1": 0.028533935546875, "loss_aux_layer_10": 0.05474853515625, "loss_aux_layer_11": 0.05865478515625, "loss_aux_layer_12": 0.0628662109375, "loss_aux_layer_13": 0.06793212890625, "loss_aux_layer_14": 0.075927734375, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.1009521484375, "loss_aux_layer_18": 0.1092529296875, "loss_aux_layer_19": 0.1131591796875, "loss_aux_layer_2": 0.04034423828125, "loss_aux_layer_20": 0.12158203125, "loss_aux_layer_21": 0.1307373046875, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.0499267578125, "loss_aux_layer_4": 0.0521240234375, "loss_aux_layer_5": 0.05352783203125, "loss_aux_layer_6": 0.05645751953125, "loss_aux_layer_7": 0.05499267578125, "loss_aux_layer_8": 0.0545654296875, "loss_aux_layer_9": 0.05364990234375, "step": 4628, "total_loss": 0.6171886324882507 }, { "epoch": 0.9164521876856068, "grad_norm": 0.7488877177238464, "learning_rate": 5e-05, "llm_loss": 0.50189608335495, "loss": 2.3138, "loss_aux_layer_0": 0.0099334716796875, "loss_aux_layer_1": 0.027679443359375, "loss_aux_layer_10": 0.05340576171875, "loss_aux_layer_11": 0.05706787109375, "loss_aux_layer_12": 0.06097412109375, "loss_aux_layer_13": 0.06591796875, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.0819091796875, "loss_aux_layer_16": 0.0908203125, "loss_aux_layer_17": 0.098876953125, "loss_aux_layer_18": 0.1075439453125, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.03924560546875, "loss_aux_layer_20": 0.12109375, "loss_aux_layer_21": 0.1300048828125, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.0484619140625, "loss_aux_layer_4": 0.05078125, "loss_aux_layer_5": 0.05230712890625, "loss_aux_layer_6": 0.05523681640625, "loss_aux_layer_7": 0.0537109375, "loss_aux_layer_8": 0.05322265625, "loss_aux_layer_9": 0.05224609375, "step": 4629, "total_loss": 0.5784613490104675 }, { "epoch": 0.9166501682835082, "grad_norm": 0.7460681200027466, "learning_rate": 5e-05, "llm_loss": 0.5450777858495712, "loss": 2.481, "loss_aux_layer_0": 0.009796142578125, "loss_aux_layer_1": 0.02911376953125, "loss_aux_layer_10": 0.0535888671875, "loss_aux_layer_11": 0.0574951171875, "loss_aux_layer_12": 0.06170654296875, "loss_aux_layer_13": 0.06640625, "loss_aux_layer_14": 0.0740966796875, "loss_aux_layer_15": 0.0814208984375, "loss_aux_layer_16": 0.08984375, "loss_aux_layer_17": 0.0972900390625, "loss_aux_layer_18": 0.104736328125, "loss_aux_layer_19": 0.10791015625, "loss_aux_layer_2": 0.0400390625, "loss_aux_layer_20": 0.1158447265625, "loss_aux_layer_21": 0.123779296875, "loss_aux_layer_22": 0.144287109375, "loss_aux_layer_23": 0.178466796875, "loss_aux_layer_3": 0.04925537109375, "loss_aux_layer_4": 0.051513671875, "loss_aux_layer_5": 0.052734375, "loss_aux_layer_6": 0.05548095703125, "loss_aux_layer_7": 0.0538330078125, "loss_aux_layer_8": 0.05340576171875, "loss_aux_layer_9": 0.05218505859375, "step": 4630, "total_loss": 0.6202475130558014 }, { "epoch": 0.9168481488814096, "grad_norm": 0.8281307220458984, "learning_rate": 5e-05, "llm_loss": 0.4742191582918167, "loss": 2.2198, "loss_aux_layer_0": 0.0095672607421875, "loss_aux_layer_1": 0.02935791015625, "loss_aux_layer_10": 0.0584716796875, "loss_aux_layer_11": 0.06231689453125, "loss_aux_layer_12": 0.06689453125, "loss_aux_layer_13": 0.0723876953125, "loss_aux_layer_14": 0.080322265625, "loss_aux_layer_15": 0.088134765625, "loss_aux_layer_16": 0.096923828125, "loss_aux_layer_17": 0.1044921875, "loss_aux_layer_18": 0.1131591796875, "loss_aux_layer_19": 0.1163330078125, "loss_aux_layer_2": 0.04193115234375, "loss_aux_layer_20": 0.1241455078125, "loss_aux_layer_21": 0.132568359375, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.190673828125, "loss_aux_layer_3": 0.05218505859375, "loss_aux_layer_4": 0.0550537109375, "loss_aux_layer_5": 0.0567626953125, "loss_aux_layer_6": 0.05975341796875, "loss_aux_layer_7": 0.05841064453125, "loss_aux_layer_8": 0.05792236328125, "loss_aux_layer_9": 0.056884765625, "step": 4631, "total_loss": 0.5549396798014641 }, { "epoch": 0.9170461294793111, "grad_norm": 0.8344240188598633, "learning_rate": 5e-05, "llm_loss": 0.5828557908535004, "loss": 2.6374, "loss_aux_layer_0": 0.009552001953125, "loss_aux_layer_1": 0.02880859375, "loss_aux_layer_10": 0.05462646484375, "loss_aux_layer_11": 0.0584716796875, "loss_aux_layer_12": 0.0626220703125, "loss_aux_layer_13": 0.06744384765625, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.091552734375, "loss_aux_layer_17": 0.0986328125, "loss_aux_layer_18": 0.1064453125, "loss_aux_layer_19": 0.109375, "loss_aux_layer_2": 0.04046630859375, "loss_aux_layer_20": 0.1173095703125, "loss_aux_layer_21": 0.126220703125, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.050048828125, "loss_aux_layer_4": 0.05230712890625, "loss_aux_layer_5": 0.0535888671875, "loss_aux_layer_6": 0.05633544921875, "loss_aux_layer_7": 0.0548095703125, "loss_aux_layer_8": 0.0545654296875, "loss_aux_layer_9": 0.053466796875, "step": 4632, "total_loss": 0.6593589037656784 }, { "epoch": 0.9172441100772124, "grad_norm": 0.8186808228492737, "learning_rate": 5e-05, "llm_loss": 0.5715613961219788, "loss": 2.5988, "loss_aux_layer_0": 0.009735107421875, "loss_aux_layer_1": 0.0294189453125, "loss_aux_layer_10": 0.0556640625, "loss_aux_layer_11": 0.05950927734375, "loss_aux_layer_12": 0.06402587890625, "loss_aux_layer_13": 0.069580078125, "loss_aux_layer_14": 0.0775146484375, "loss_aux_layer_15": 0.08544921875, "loss_aux_layer_16": 0.09423828125, "loss_aux_layer_17": 0.102294921875, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.113037109375, "loss_aux_layer_2": 0.04095458984375, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.1285400390625, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.05047607421875, "loss_aux_layer_4": 0.052978515625, "loss_aux_layer_5": 0.05450439453125, "loss_aux_layer_6": 0.0572509765625, "loss_aux_layer_7": 0.05560302734375, "loss_aux_layer_8": 0.0552978515625, "loss_aux_layer_9": 0.0543212890625, "step": 4633, "total_loss": 0.6497098803520203 }, { "epoch": 0.9174420906751138, "grad_norm": 0.9272564649581909, "learning_rate": 5e-05, "llm_loss": 0.5238505750894547, "loss": 2.3863, "loss_aux_layer_0": 0.0101470947265625, "loss_aux_layer_1": 0.026824951171875, "loss_aux_layer_10": 0.0504150390625, "loss_aux_layer_11": 0.0538330078125, "loss_aux_layer_12": 0.0577392578125, "loss_aux_layer_13": 0.0626220703125, "loss_aux_layer_14": 0.0703125, "loss_aux_layer_15": 0.0780029296875, "loss_aux_layer_16": 0.0869140625, "loss_aux_layer_17": 0.0946044921875, "loss_aux_layer_18": 0.1026611328125, "loss_aux_layer_19": 0.107421875, "loss_aux_layer_2": 0.03668212890625, "loss_aux_layer_20": 0.115478515625, "loss_aux_layer_21": 0.1241455078125, "loss_aux_layer_22": 0.144287109375, "loss_aux_layer_23": 0.18017578125, "loss_aux_layer_3": 0.04559326171875, "loss_aux_layer_4": 0.0477294921875, "loss_aux_layer_5": 0.0491943359375, "loss_aux_layer_6": 0.052001953125, "loss_aux_layer_7": 0.05029296875, "loss_aux_layer_8": 0.0499267578125, "loss_aux_layer_9": 0.04901123046875, "step": 4634, "total_loss": 0.5965837389230728 }, { "epoch": 0.9176400712730153, "grad_norm": 0.7754743099212646, "learning_rate": 5e-05, "llm_loss": 0.49903856962919235, "loss": 2.3064, "loss_aux_layer_0": 0.0095977783203125, "loss_aux_layer_1": 0.028533935546875, "loss_aux_layer_10": 0.05548095703125, "loss_aux_layer_11": 0.05938720703125, "loss_aux_layer_12": 0.06378173828125, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.0771484375, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.0938720703125, "loss_aux_layer_17": 0.1005859375, "loss_aux_layer_18": 0.108642578125, "loss_aux_layer_19": 0.1119384765625, "loss_aux_layer_2": 0.0394287109375, "loss_aux_layer_20": 0.1197509765625, "loss_aux_layer_21": 0.1282958984375, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.0491943359375, "loss_aux_layer_4": 0.052001953125, "loss_aux_layer_5": 0.0537109375, "loss_aux_layer_6": 0.05694580078125, "loss_aux_layer_7": 0.055419921875, "loss_aux_layer_8": 0.05517578125, "loss_aux_layer_9": 0.0543212890625, "step": 4635, "total_loss": 0.576593279838562 }, { "epoch": 0.9178380518709166, "grad_norm": 0.8692073822021484, "learning_rate": 5e-05, "llm_loss": 0.5345519408583641, "loss": 2.4441, "loss_aux_layer_0": 0.0101776123046875, "loss_aux_layer_1": 0.027984619140625, "loss_aux_layer_10": 0.05291748046875, "loss_aux_layer_11": 0.05657958984375, "loss_aux_layer_12": 0.06072998046875, "loss_aux_layer_13": 0.0655517578125, "loss_aux_layer_14": 0.0743408203125, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.092529296875, "loss_aux_layer_17": 0.10009765625, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.038818359375, "loss_aux_layer_20": 0.12109375, "loss_aux_layer_21": 0.1290283203125, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.048095703125, "loss_aux_layer_4": 0.05059814453125, "loss_aux_layer_5": 0.05224609375, "loss_aux_layer_6": 0.05499267578125, "loss_aux_layer_7": 0.0533447265625, "loss_aux_layer_8": 0.05279541015625, "loss_aux_layer_9": 0.0516357421875, "step": 4636, "total_loss": 0.611018568277359 }, { "epoch": 0.918036032468818, "grad_norm": 0.8010016679763794, "learning_rate": 5e-05, "llm_loss": 0.5942928194999695, "loss": 2.6813, "loss_aux_layer_0": 0.0099029541015625, "loss_aux_layer_1": 0.0289306640625, "loss_aux_layer_10": 0.05389404296875, "loss_aux_layer_11": 0.0576171875, "loss_aux_layer_12": 0.0618896484375, "loss_aux_layer_13": 0.06689453125, "loss_aux_layer_14": 0.07470703125, "loss_aux_layer_15": 0.0830078125, "loss_aux_layer_16": 0.0919189453125, "loss_aux_layer_17": 0.0992431640625, "loss_aux_layer_18": 0.10693359375, "loss_aux_layer_19": 0.1102294921875, "loss_aux_layer_2": 0.04034423828125, "loss_aux_layer_20": 0.1181640625, "loss_aux_layer_21": 0.12548828125, "loss_aux_layer_22": 0.14404296875, "loss_aux_layer_23": 0.17822265625, "loss_aux_layer_3": 0.04974365234375, "loss_aux_layer_4": 0.05230712890625, "loss_aux_layer_5": 0.053466796875, "loss_aux_layer_6": 0.05621337890625, "loss_aux_layer_7": 0.05450439453125, "loss_aux_layer_8": 0.053955078125, "loss_aux_layer_9": 0.0528564453125, "step": 4637, "total_loss": 0.6703232526779175 }, { "epoch": 0.9182340130667195, "grad_norm": 0.8617807626724243, "learning_rate": 5e-05, "llm_loss": 0.6313518285751343, "loss": 2.8286, "loss_aux_layer_0": 0.010009765625, "loss_aux_layer_1": 0.02789306640625, "loss_aux_layer_10": 0.0533447265625, "loss_aux_layer_11": 0.0570068359375, "loss_aux_layer_12": 0.06134033203125, "loss_aux_layer_13": 0.06640625, "loss_aux_layer_14": 0.074462890625, "loss_aux_layer_15": 0.08251953125, "loss_aux_layer_16": 0.0916748046875, "loss_aux_layer_17": 0.0994873046875, "loss_aux_layer_18": 0.1080322265625, "loss_aux_layer_19": 0.111328125, "loss_aux_layer_2": 0.038818359375, "loss_aux_layer_20": 0.11865234375, "loss_aux_layer_21": 0.1263427734375, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.18115234375, "loss_aux_layer_3": 0.048095703125, "loss_aux_layer_4": 0.05078125, "loss_aux_layer_5": 0.052001953125, "loss_aux_layer_6": 0.05511474609375, "loss_aux_layer_7": 0.053466796875, "loss_aux_layer_8": 0.0531005859375, "loss_aux_layer_9": 0.05194091796875, "step": 4638, "total_loss": 0.7071588635444641 }, { "epoch": 0.9184319936646209, "grad_norm": 0.9138669371604919, "learning_rate": 5e-05, "llm_loss": 0.5754255428910255, "loss": 2.6033, "loss_aux_layer_0": 0.010284423828125, "loss_aux_layer_1": 0.027679443359375, "loss_aux_layer_10": 0.0535888671875, "loss_aux_layer_11": 0.0570068359375, "loss_aux_layer_12": 0.0611572265625, "loss_aux_layer_13": 0.0662841796875, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.081787109375, "loss_aux_layer_16": 0.09033203125, "loss_aux_layer_17": 0.0977783203125, "loss_aux_layer_18": 0.1053466796875, "loss_aux_layer_19": 0.1092529296875, "loss_aux_layer_2": 0.03912353515625, "loss_aux_layer_20": 0.1168212890625, "loss_aux_layer_21": 0.125244140625, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.0482177734375, "loss_aux_layer_4": 0.05072021484375, "loss_aux_layer_5": 0.0521240234375, "loss_aux_layer_6": 0.0548095703125, "loss_aux_layer_7": 0.05340576171875, "loss_aux_layer_8": 0.05328369140625, "loss_aux_layer_9": 0.05230712890625, "step": 4639, "total_loss": 0.6508138924837112 }, { "epoch": 0.9186299742625222, "grad_norm": 0.7644307613372803, "learning_rate": 5e-05, "llm_loss": 0.5290686935186386, "loss": 2.4137, "loss_aux_layer_0": 0.0099639892578125, "loss_aux_layer_1": 0.02734375, "loss_aux_layer_10": 0.05145263671875, "loss_aux_layer_11": 0.054931640625, "loss_aux_layer_12": 0.05902099609375, "loss_aux_layer_13": 0.06396484375, "loss_aux_layer_14": 0.0718994140625, "loss_aux_layer_15": 0.079833984375, "loss_aux_layer_16": 0.0888671875, "loss_aux_layer_17": 0.096923828125, "loss_aux_layer_18": 0.10498046875, "loss_aux_layer_19": 0.109375, "loss_aux_layer_2": 0.0377197265625, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.1263427734375, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.04669189453125, "loss_aux_layer_4": 0.04901123046875, "loss_aux_layer_5": 0.05029296875, "loss_aux_layer_6": 0.05303955078125, "loss_aux_layer_7": 0.05133056640625, "loss_aux_layer_8": 0.0509033203125, "loss_aux_layer_9": 0.04998779296875, "step": 4640, "total_loss": 0.6034214794635773 }, { "epoch": 0.9188279548604237, "grad_norm": 0.9498969316482544, "learning_rate": 5e-05, "llm_loss": 0.597474955022335, "loss": 2.6963, "loss_aux_layer_0": 0.0102386474609375, "loss_aux_layer_1": 0.028472900390625, "loss_aux_layer_10": 0.0537109375, "loss_aux_layer_11": 0.057373046875, "loss_aux_layer_12": 0.06146240234375, "loss_aux_layer_13": 0.066650390625, "loss_aux_layer_14": 0.0748291015625, "loss_aux_layer_15": 0.0831298828125, "loss_aux_layer_16": 0.0919189453125, "loss_aux_layer_17": 0.099853515625, "loss_aux_layer_18": 0.1075439453125, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.0396728515625, "loss_aux_layer_20": 0.11962890625, "loss_aux_layer_21": 0.1279296875, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.048828125, "loss_aux_layer_4": 0.05145263671875, "loss_aux_layer_5": 0.0528564453125, "loss_aux_layer_6": 0.0556640625, "loss_aux_layer_7": 0.0538330078125, "loss_aux_layer_8": 0.053466796875, "loss_aux_layer_9": 0.0523681640625, "step": 4641, "total_loss": 0.6740849018096924 }, { "epoch": 0.9190259354583251, "grad_norm": 1.2677395343780518, "learning_rate": 5e-05, "llm_loss": 0.5124734565615654, "loss": 2.3574, "loss_aux_layer_0": 0.010162353515625, "loss_aux_layer_1": 0.0286865234375, "loss_aux_layer_10": 0.0546875, "loss_aux_layer_11": 0.05859375, "loss_aux_layer_12": 0.06268310546875, "loss_aux_layer_13": 0.06793212890625, "loss_aux_layer_14": 0.075927734375, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.0926513671875, "loss_aux_layer_17": 0.10009765625, "loss_aux_layer_18": 0.1083984375, "loss_aux_layer_19": 0.1109619140625, "loss_aux_layer_2": 0.04034423828125, "loss_aux_layer_20": 0.1182861328125, "loss_aux_layer_21": 0.126220703125, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.18310546875, "loss_aux_layer_3": 0.050048828125, "loss_aux_layer_4": 0.0523681640625, "loss_aux_layer_5": 0.05389404296875, "loss_aux_layer_6": 0.056640625, "loss_aux_layer_7": 0.05499267578125, "loss_aux_layer_8": 0.05413818359375, "loss_aux_layer_9": 0.05328369140625, "step": 4642, "total_loss": 0.5893527418375015 }, { "epoch": 0.9192239160562264, "grad_norm": 0.8963335752487183, "learning_rate": 5e-05, "llm_loss": 0.5096983164548874, "loss": 2.3467, "loss_aux_layer_0": 0.010711669921875, "loss_aux_layer_1": 0.0283203125, "loss_aux_layer_10": 0.05438232421875, "loss_aux_layer_11": 0.05810546875, "loss_aux_layer_12": 0.0625, "loss_aux_layer_13": 0.0673828125, "loss_aux_layer_14": 0.0753173828125, "loss_aux_layer_15": 0.08349609375, "loss_aux_layer_16": 0.0927734375, "loss_aux_layer_17": 0.100341796875, "loss_aux_layer_18": 0.1082763671875, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.0401611328125, "loss_aux_layer_20": 0.1202392578125, "loss_aux_layer_21": 0.128173828125, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.049072265625, "loss_aux_layer_4": 0.05145263671875, "loss_aux_layer_5": 0.052978515625, "loss_aux_layer_6": 0.055908203125, "loss_aux_layer_7": 0.054443359375, "loss_aux_layer_8": 0.05413818359375, "loss_aux_layer_9": 0.052978515625, "step": 4643, "total_loss": 0.5866872370243073 }, { "epoch": 0.9194218966541279, "grad_norm": 0.8660326600074768, "learning_rate": 5e-05, "llm_loss": 0.5808805972337723, "loss": 2.629, "loss_aux_layer_0": 0.009979248046875, "loss_aux_layer_1": 0.029205322265625, "loss_aux_layer_10": 0.0546875, "loss_aux_layer_11": 0.058349609375, "loss_aux_layer_12": 0.06256103515625, "loss_aux_layer_13": 0.0672607421875, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.08251953125, "loss_aux_layer_16": 0.0909423828125, "loss_aux_layer_17": 0.098388671875, "loss_aux_layer_18": 0.106201171875, "loss_aux_layer_19": 0.109130859375, "loss_aux_layer_2": 0.0418701171875, "loss_aux_layer_20": 0.1168212890625, "loss_aux_layer_21": 0.124755859375, "loss_aux_layer_22": 0.14501953125, "loss_aux_layer_23": 0.1806640625, "loss_aux_layer_3": 0.05096435546875, "loss_aux_layer_4": 0.05322265625, "loss_aux_layer_5": 0.05450439453125, "loss_aux_layer_6": 0.05731201171875, "loss_aux_layer_7": 0.055419921875, "loss_aux_layer_8": 0.05487060546875, "loss_aux_layer_9": 0.05364990234375, "step": 4644, "total_loss": 0.6572500169277191 }, { "epoch": 0.9196198772520293, "grad_norm": 0.8743804097175598, "learning_rate": 5e-05, "llm_loss": 0.5997886210680008, "loss": 2.7199, "loss_aux_layer_0": 0.0111083984375, "loss_aux_layer_1": 0.029541015625, "loss_aux_layer_10": 0.0574951171875, "loss_aux_layer_11": 0.0616455078125, "loss_aux_layer_12": 0.065673828125, "loss_aux_layer_13": 0.0712890625, "loss_aux_layer_14": 0.079345703125, "loss_aux_layer_15": 0.087890625, "loss_aux_layer_16": 0.0975341796875, "loss_aux_layer_17": 0.10546875, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.1160888671875, "loss_aux_layer_2": 0.0416259765625, "loss_aux_layer_20": 0.123779296875, "loss_aux_layer_21": 0.13134765625, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.0517578125, "loss_aux_layer_4": 0.054443359375, "loss_aux_layer_5": 0.0560302734375, "loss_aux_layer_6": 0.059326171875, "loss_aux_layer_7": 0.0574951171875, "loss_aux_layer_8": 0.0570068359375, "loss_aux_layer_9": 0.05596923828125, "step": 4645, "total_loss": 0.6799689382314682 }, { "epoch": 0.9198178578499308, "grad_norm": 1.145790457725525, "learning_rate": 5e-05, "llm_loss": 0.68918876349926, "loss": 3.0585, "loss_aux_layer_0": 0.0099334716796875, "loss_aux_layer_1": 0.02899169921875, "loss_aux_layer_10": 0.05340576171875, "loss_aux_layer_11": 0.05731201171875, "loss_aux_layer_12": 0.0616455078125, "loss_aux_layer_13": 0.066650390625, "loss_aux_layer_14": 0.0743408203125, "loss_aux_layer_15": 0.0823974609375, "loss_aux_layer_16": 0.091064453125, "loss_aux_layer_17": 0.0986328125, "loss_aux_layer_18": 0.1064453125, "loss_aux_layer_19": 0.1092529296875, "loss_aux_layer_2": 0.0406494140625, "loss_aux_layer_20": 0.1168212890625, "loss_aux_layer_21": 0.123779296875, "loss_aux_layer_22": 0.143310546875, "loss_aux_layer_23": 0.177490234375, "loss_aux_layer_3": 0.0496826171875, "loss_aux_layer_4": 0.05181884765625, "loss_aux_layer_5": 0.05267333984375, "loss_aux_layer_6": 0.05517578125, "loss_aux_layer_7": 0.05364990234375, "loss_aux_layer_8": 0.0531005859375, "loss_aux_layer_9": 0.05206298828125, "step": 4646, "total_loss": 0.7646153271198273 }, { "epoch": 0.9200158384478321, "grad_norm": 1.027072548866272, "learning_rate": 5e-05, "llm_loss": 0.5776843205094337, "loss": 2.6213, "loss_aux_layer_0": 0.0108795166015625, "loss_aux_layer_1": 0.0291748046875, "loss_aux_layer_10": 0.05523681640625, "loss_aux_layer_11": 0.0592041015625, "loss_aux_layer_12": 0.06329345703125, "loss_aux_layer_13": 0.0684814453125, "loss_aux_layer_14": 0.07666015625, "loss_aux_layer_15": 0.084716796875, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.100830078125, "loss_aux_layer_18": 0.10888671875, "loss_aux_layer_19": 0.1119384765625, "loss_aux_layer_2": 0.04144287109375, "loss_aux_layer_20": 0.11962890625, "loss_aux_layer_21": 0.127197265625, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.05072021484375, "loss_aux_layer_4": 0.05316162109375, "loss_aux_layer_5": 0.0546875, "loss_aux_layer_6": 0.0572509765625, "loss_aux_layer_7": 0.055419921875, "loss_aux_layer_8": 0.0548095703125, "loss_aux_layer_9": 0.053955078125, "step": 4647, "total_loss": 0.6553137451410294 }, { "epoch": 0.9202138190457335, "grad_norm": 0.8812260031700134, "learning_rate": 5e-05, "llm_loss": 0.5589991733431816, "loss": 2.5472, "loss_aux_layer_0": 0.011016845703125, "loss_aux_layer_1": 0.02813720703125, "loss_aux_layer_10": 0.05511474609375, "loss_aux_layer_11": 0.05902099609375, "loss_aux_layer_12": 0.06341552734375, "loss_aux_layer_13": 0.06884765625, "loss_aux_layer_14": 0.076904296875, "loss_aux_layer_15": 0.084716796875, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.10205078125, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.0401611328125, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.04949951171875, "loss_aux_layer_4": 0.05218505859375, "loss_aux_layer_5": 0.0535888671875, "loss_aux_layer_6": 0.05657958984375, "loss_aux_layer_7": 0.0550537109375, "loss_aux_layer_8": 0.0548095703125, "loss_aux_layer_9": 0.0538330078125, "step": 4648, "total_loss": 0.6368019580841064 }, { "epoch": 0.920411799643635, "grad_norm": 0.9254568815231323, "learning_rate": 5e-05, "llm_loss": 0.5961382687091827, "loss": 2.6944, "loss_aux_layer_0": 0.01123046875, "loss_aux_layer_1": 0.028717041015625, "loss_aux_layer_10": 0.054443359375, "loss_aux_layer_11": 0.05816650390625, "loss_aux_layer_12": 0.06280517578125, "loss_aux_layer_13": 0.0679931640625, "loss_aux_layer_14": 0.076416015625, "loss_aux_layer_15": 0.084716796875, "loss_aux_layer_16": 0.093994140625, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.11328125, "loss_aux_layer_2": 0.040283203125, "loss_aux_layer_20": 0.1207275390625, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.04931640625, "loss_aux_layer_4": 0.05169677734375, "loss_aux_layer_5": 0.05328369140625, "loss_aux_layer_6": 0.05615234375, "loss_aux_layer_7": 0.0545654296875, "loss_aux_layer_8": 0.0540771484375, "loss_aux_layer_9": 0.05322265625, "step": 4649, "total_loss": 0.6736045330762863 }, { "epoch": 0.9206097802415363, "grad_norm": 0.8922121524810791, "learning_rate": 5e-05, "llm_loss": 0.589185506105423, "loss": 2.6791, "loss_aux_layer_0": 0.0110015869140625, "loss_aux_layer_1": 0.03131103515625, "loss_aux_layer_10": 0.05902099609375, "loss_aux_layer_11": 0.06304931640625, "loss_aux_layer_12": 0.067138671875, "loss_aux_layer_13": 0.0726318359375, "loss_aux_layer_14": 0.079833984375, "loss_aux_layer_15": 0.0875244140625, "loss_aux_layer_16": 0.09619140625, "loss_aux_layer_17": 0.103515625, "loss_aux_layer_18": 0.1112060546875, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.0439453125, "loss_aux_layer_20": 0.1212158203125, "loss_aux_layer_21": 0.1292724609375, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.0543212890625, "loss_aux_layer_4": 0.0570068359375, "loss_aux_layer_5": 0.05828857421875, "loss_aux_layer_6": 0.06134033203125, "loss_aux_layer_7": 0.0596923828125, "loss_aux_layer_8": 0.05902099609375, "loss_aux_layer_9": 0.05780029296875, "step": 4650, "total_loss": 0.6697666794061661 }, { "epoch": 0.9208077608394377, "grad_norm": 0.8166549205780029, "learning_rate": 5e-05, "llm_loss": 0.5256317853927612, "loss": 2.4086, "loss_aux_layer_0": 0.0110931396484375, "loss_aux_layer_1": 0.029388427734375, "loss_aux_layer_10": 0.05517578125, "loss_aux_layer_11": 0.05889892578125, "loss_aux_layer_12": 0.0631103515625, "loss_aux_layer_13": 0.06787109375, "loss_aux_layer_14": 0.0753173828125, "loss_aux_layer_15": 0.082763671875, "loss_aux_layer_16": 0.0914306640625, "loss_aux_layer_17": 0.0986328125, "loss_aux_layer_18": 0.106201171875, "loss_aux_layer_19": 0.1094970703125, "loss_aux_layer_2": 0.040771484375, "loss_aux_layer_20": 0.116943359375, "loss_aux_layer_21": 0.1248779296875, "loss_aux_layer_22": 0.14453125, "loss_aux_layer_23": 0.17919921875, "loss_aux_layer_3": 0.05059814453125, "loss_aux_layer_4": 0.05322265625, "loss_aux_layer_5": 0.0546875, "loss_aux_layer_6": 0.05731201171875, "loss_aux_layer_7": 0.05572509765625, "loss_aux_layer_8": 0.05511474609375, "loss_aux_layer_9": 0.0538330078125, "step": 4651, "total_loss": 0.6021469756960869 }, { "epoch": 0.9210057414373392, "grad_norm": 0.8779417276382446, "learning_rate": 5e-05, "llm_loss": 0.4877134934067726, "loss": 2.2455, "loss_aux_layer_0": 0.01092529296875, "loss_aux_layer_1": 0.0274658203125, "loss_aux_layer_10": 0.0506591796875, "loss_aux_layer_11": 0.0540771484375, "loss_aux_layer_12": 0.05828857421875, "loss_aux_layer_13": 0.06341552734375, "loss_aux_layer_14": 0.0714111328125, "loss_aux_layer_15": 0.0797119140625, "loss_aux_layer_16": 0.0887451171875, "loss_aux_layer_17": 0.096435546875, "loss_aux_layer_18": 0.1048583984375, "loss_aux_layer_19": 0.1087646484375, "loss_aux_layer_2": 0.03802490234375, "loss_aux_layer_20": 0.1168212890625, "loss_aux_layer_21": 0.1248779296875, "loss_aux_layer_22": 0.14404296875, "loss_aux_layer_23": 0.178955078125, "loss_aux_layer_3": 0.0469970703125, "loss_aux_layer_4": 0.04913330078125, "loss_aux_layer_5": 0.05047607421875, "loss_aux_layer_6": 0.0528564453125, "loss_aux_layer_7": 0.0511474609375, "loss_aux_layer_8": 0.0506591796875, "loss_aux_layer_9": 0.04949951171875, "step": 4652, "total_loss": 0.5613774284720421 }, { "epoch": 0.9212037220352406, "grad_norm": 0.8136433959007263, "learning_rate": 5e-05, "llm_loss": 0.5450766533613205, "loss": 2.4812, "loss_aux_layer_0": 0.0102691650390625, "loss_aux_layer_1": 0.029296875, "loss_aux_layer_10": 0.054443359375, "loss_aux_layer_11": 0.0579833984375, "loss_aux_layer_12": 0.0621337890625, "loss_aux_layer_13": 0.06695556640625, "loss_aux_layer_14": 0.074462890625, "loss_aux_layer_15": 0.081787109375, "loss_aux_layer_16": 0.0899658203125, "loss_aux_layer_17": 0.09716796875, "loss_aux_layer_18": 0.104736328125, "loss_aux_layer_19": 0.107421875, "loss_aux_layer_2": 0.04022216796875, "loss_aux_layer_20": 0.114501953125, "loss_aux_layer_21": 0.122314453125, "loss_aux_layer_22": 0.1416015625, "loss_aux_layer_23": 0.17626953125, "loss_aux_layer_3": 0.05010986328125, "loss_aux_layer_4": 0.05224609375, "loss_aux_layer_5": 0.05364990234375, "loss_aux_layer_6": 0.05609130859375, "loss_aux_layer_7": 0.05462646484375, "loss_aux_layer_8": 0.05419921875, "loss_aux_layer_9": 0.05322265625, "step": 4653, "total_loss": 0.6203026324510574 }, { "epoch": 0.9214017026331419, "grad_norm": 0.888829231262207, "learning_rate": 5e-05, "llm_loss": 0.6243555396795273, "loss": 2.8202, "loss_aux_layer_0": 0.011199951171875, "loss_aux_layer_1": 0.030059814453125, "loss_aux_layer_10": 0.0577392578125, "loss_aux_layer_11": 0.06182861328125, "loss_aux_layer_12": 0.066162109375, "loss_aux_layer_13": 0.07159423828125, "loss_aux_layer_14": 0.080322265625, "loss_aux_layer_15": 0.088134765625, "loss_aux_layer_16": 0.0972900390625, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.1116943359375, "loss_aux_layer_19": 0.115478515625, "loss_aux_layer_2": 0.04205322265625, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.15478515625, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.05230712890625, "loss_aux_layer_4": 0.05499267578125, "loss_aux_layer_5": 0.05645751953125, "loss_aux_layer_6": 0.059814453125, "loss_aux_layer_7": 0.05828857421875, "loss_aux_layer_8": 0.0577392578125, "loss_aux_layer_9": 0.0565185546875, "step": 4654, "total_loss": 0.7050476670265198 }, { "epoch": 0.9215996832310434, "grad_norm": 0.7714042663574219, "learning_rate": 5e-05, "llm_loss": 0.5991887375712395, "loss": 2.6988, "loss_aux_layer_0": 0.0106201171875, "loss_aux_layer_1": 0.026885986328125, "loss_aux_layer_10": 0.052001953125, "loss_aux_layer_11": 0.05572509765625, "loss_aux_layer_12": 0.06005859375, "loss_aux_layer_13": 0.06536865234375, "loss_aux_layer_14": 0.0738525390625, "loss_aux_layer_15": 0.08251953125, "loss_aux_layer_16": 0.0916748046875, "loss_aux_layer_17": 0.0994873046875, "loss_aux_layer_18": 0.107666015625, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.03790283203125, "loss_aux_layer_20": 0.11962890625, "loss_aux_layer_21": 0.128173828125, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.18603515625, "loss_aux_layer_3": 0.04705810546875, "loss_aux_layer_4": 0.04925537109375, "loss_aux_layer_5": 0.05084228515625, "loss_aux_layer_6": 0.05352783203125, "loss_aux_layer_7": 0.05181884765625, "loss_aux_layer_8": 0.05157470703125, "loss_aux_layer_9": 0.05059814453125, "step": 4655, "total_loss": 0.6747028231620789 }, { "epoch": 0.9217976638289448, "grad_norm": 0.9297341704368591, "learning_rate": 5e-05, "llm_loss": 0.5350930988788605, "loss": 2.4599, "loss_aux_layer_0": 0.009979248046875, "loss_aux_layer_1": 0.03045654296875, "loss_aux_layer_10": 0.057861328125, "loss_aux_layer_11": 0.06207275390625, "loss_aux_layer_12": 0.0662841796875, "loss_aux_layer_13": 0.0718994140625, "loss_aux_layer_14": 0.07958984375, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.095703125, "loss_aux_layer_17": 0.102783203125, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.0428466796875, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.1295166015625, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.0531005859375, "loss_aux_layer_4": 0.0555419921875, "loss_aux_layer_5": 0.0570068359375, "loss_aux_layer_6": 0.059814453125, "loss_aux_layer_7": 0.05780029296875, "loss_aux_layer_8": 0.0572509765625, "loss_aux_layer_9": 0.05621337890625, "step": 4656, "total_loss": 0.6149694621562958 }, { "epoch": 0.9219956444268461, "grad_norm": 0.7835530042648315, "learning_rate": 5e-05, "llm_loss": 0.5460505783557892, "loss": 2.4928, "loss_aux_layer_0": 0.0101776123046875, "loss_aux_layer_1": 0.0284423828125, "loss_aux_layer_10": 0.05426025390625, "loss_aux_layer_11": 0.0577392578125, "loss_aux_layer_12": 0.0621337890625, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.07568359375, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.093017578125, "loss_aux_layer_17": 0.1004638671875, "loss_aux_layer_18": 0.1092529296875, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.03973388671875, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.04962158203125, "loss_aux_layer_4": 0.052001953125, "loss_aux_layer_5": 0.0535888671875, "loss_aux_layer_6": 0.05657958984375, "loss_aux_layer_7": 0.0546875, "loss_aux_layer_8": 0.05419921875, "loss_aux_layer_9": 0.0531005859375, "step": 4657, "total_loss": 0.6231928318738937 }, { "epoch": 0.9221936250247476, "grad_norm": 0.9973194599151611, "learning_rate": 5e-05, "llm_loss": 0.6791581213474274, "loss": 3.017, "loss_aux_layer_0": 0.0104522705078125, "loss_aux_layer_1": 0.0283203125, "loss_aux_layer_10": 0.05230712890625, "loss_aux_layer_11": 0.0562744140625, "loss_aux_layer_12": 0.06036376953125, "loss_aux_layer_13": 0.065673828125, "loss_aux_layer_14": 0.0733642578125, "loss_aux_layer_15": 0.0811767578125, "loss_aux_layer_16": 0.0902099609375, "loss_aux_layer_17": 0.0980224609375, "loss_aux_layer_18": 0.10595703125, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.03887939453125, "loss_aux_layer_20": 0.11767578125, "loss_aux_layer_21": 0.1256103515625, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.181640625, "loss_aux_layer_3": 0.04815673828125, "loss_aux_layer_4": 0.0504150390625, "loss_aux_layer_5": 0.05169677734375, "loss_aux_layer_6": 0.05438232421875, "loss_aux_layer_7": 0.05242919921875, "loss_aux_layer_8": 0.05206298828125, "loss_aux_layer_9": 0.051025390625, "step": 4658, "total_loss": 0.754244476556778 }, { "epoch": 0.922391605622649, "grad_norm": 0.7636909484863281, "learning_rate": 5e-05, "llm_loss": 0.4442942515015602, "loss": 2.0911, "loss_aux_layer_0": 0.0101470947265625, "loss_aux_layer_1": 0.030181884765625, "loss_aux_layer_10": 0.056640625, "loss_aux_layer_11": 0.06036376953125, "loss_aux_layer_12": 0.06451416015625, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.0770263671875, "loss_aux_layer_15": 0.084716796875, "loss_aux_layer_16": 0.09326171875, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.1090087890625, "loss_aux_layer_19": 0.1123046875, "loss_aux_layer_2": 0.04229736328125, "loss_aux_layer_20": 0.119873046875, "loss_aux_layer_21": 0.127685546875, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.0521240234375, "loss_aux_layer_4": 0.05450439453125, "loss_aux_layer_5": 0.055908203125, "loss_aux_layer_6": 0.05859375, "loss_aux_layer_7": 0.05694580078125, "loss_aux_layer_8": 0.05645751953125, "loss_aux_layer_9": 0.05523681640625, "step": 4659, "total_loss": 0.5227846726775169 }, { "epoch": 0.9225895862205504, "grad_norm": 0.7838987708091736, "learning_rate": 5e-05, "llm_loss": 0.5539564788341522, "loss": 2.5254, "loss_aux_layer_0": 0.00994873046875, "loss_aux_layer_1": 0.029296875, "loss_aux_layer_10": 0.0546875, "loss_aux_layer_11": 0.05816650390625, "loss_aux_layer_12": 0.06256103515625, "loss_aux_layer_13": 0.06744384765625, "loss_aux_layer_14": 0.0755615234375, "loss_aux_layer_15": 0.0836181640625, "loss_aux_layer_16": 0.0928955078125, "loss_aux_layer_17": 0.100830078125, "loss_aux_layer_18": 0.10888671875, "loss_aux_layer_19": 0.1124267578125, "loss_aux_layer_2": 0.04156494140625, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.128173828125, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.0509033203125, "loss_aux_layer_4": 0.05291748046875, "loss_aux_layer_5": 0.05419921875, "loss_aux_layer_6": 0.05694580078125, "loss_aux_layer_7": 0.0550537109375, "loss_aux_layer_8": 0.05462646484375, "loss_aux_layer_9": 0.05340576171875, "step": 4660, "total_loss": 0.6313537359237671 }, { "epoch": 0.9227875668184518, "grad_norm": 0.8990318775177002, "learning_rate": 5e-05, "llm_loss": 0.6349480301141739, "loss": 2.849, "loss_aux_layer_0": 0.0099945068359375, "loss_aux_layer_1": 0.027862548828125, "loss_aux_layer_10": 0.05389404296875, "loss_aux_layer_11": 0.05780029296875, "loss_aux_layer_12": 0.062255859375, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.0762939453125, "loss_aux_layer_15": 0.0849609375, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.10986328125, "loss_aux_layer_19": 0.1131591796875, "loss_aux_layer_2": 0.0394287109375, "loss_aux_layer_20": 0.1209716796875, "loss_aux_layer_21": 0.1297607421875, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.04852294921875, "loss_aux_layer_4": 0.05078125, "loss_aux_layer_5": 0.052490234375, "loss_aux_layer_6": 0.05535888671875, "loss_aux_layer_7": 0.0537109375, "loss_aux_layer_8": 0.05328369140625, "loss_aux_layer_9": 0.052490234375, "step": 4661, "total_loss": 0.7122388780117035 }, { "epoch": 0.9229855474163532, "grad_norm": 0.7067061066627502, "learning_rate": 5e-05, "llm_loss": 0.5628980249166489, "loss": 2.5678, "loss_aux_layer_0": 0.009735107421875, "loss_aux_layer_1": 0.0291748046875, "loss_aux_layer_10": 0.05645751953125, "loss_aux_layer_11": 0.06036376953125, "loss_aux_layer_12": 0.0645751953125, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.078125, "loss_aux_layer_15": 0.0863037109375, "loss_aux_layer_16": 0.0953369140625, "loss_aux_layer_17": 0.102783203125, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.04071044921875, "loss_aux_layer_20": 0.1217041015625, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.05084228515625, "loss_aux_layer_4": 0.05377197265625, "loss_aux_layer_5": 0.055419921875, "loss_aux_layer_6": 0.058837890625, "loss_aux_layer_7": 0.05712890625, "loss_aux_layer_8": 0.056396484375, "loss_aux_layer_9": 0.05535888671875, "step": 4662, "total_loss": 0.6419391185045242 }, { "epoch": 0.9231835280142546, "grad_norm": 0.8142167329788208, "learning_rate": 5e-05, "llm_loss": 0.5134036019444466, "loss": 2.3518, "loss_aux_layer_0": 0.0098114013671875, "loss_aux_layer_1": 0.027740478515625, "loss_aux_layer_10": 0.05157470703125, "loss_aux_layer_11": 0.05523681640625, "loss_aux_layer_12": 0.0592041015625, "loss_aux_layer_13": 0.064208984375, "loss_aux_layer_14": 0.072509765625, "loss_aux_layer_15": 0.0804443359375, "loss_aux_layer_16": 0.089111328125, "loss_aux_layer_17": 0.0970458984375, "loss_aux_layer_18": 0.1048583984375, "loss_aux_layer_19": 0.1087646484375, "loss_aux_layer_2": 0.0382080078125, "loss_aux_layer_20": 0.117431640625, "loss_aux_layer_21": 0.12646484375, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.047119140625, "loss_aux_layer_4": 0.04931640625, "loss_aux_layer_5": 0.05078125, "loss_aux_layer_6": 0.05364990234375, "loss_aux_layer_7": 0.0518798828125, "loss_aux_layer_8": 0.051513671875, "loss_aux_layer_9": 0.05035400390625, "step": 4663, "total_loss": 0.5879411995410919 }, { "epoch": 0.923381508612156, "grad_norm": 0.9370799660682678, "learning_rate": 5e-05, "llm_loss": 0.6018159985542297, "loss": 2.7135, "loss_aux_layer_0": 0.0100555419921875, "loss_aux_layer_1": 0.028106689453125, "loss_aux_layer_10": 0.05389404296875, "loss_aux_layer_11": 0.0576171875, "loss_aux_layer_12": 0.061767578125, "loss_aux_layer_13": 0.0667724609375, "loss_aux_layer_14": 0.0748291015625, "loss_aux_layer_15": 0.0831298828125, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.0999755859375, "loss_aux_layer_18": 0.1075439453125, "loss_aux_layer_19": 0.111083984375, "loss_aux_layer_2": 0.03985595703125, "loss_aux_layer_20": 0.1187744140625, "loss_aux_layer_21": 0.1275634765625, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.05157470703125, "loss_aux_layer_5": 0.05316162109375, "loss_aux_layer_6": 0.05633544921875, "loss_aux_layer_7": 0.054443359375, "loss_aux_layer_8": 0.05377197265625, "loss_aux_layer_9": 0.05267333984375, "step": 4664, "total_loss": 0.6783869862556458 }, { "epoch": 0.9235794892100574, "grad_norm": 0.8370361328125, "learning_rate": 5e-05, "llm_loss": 0.5435717403888702, "loss": 2.4723, "loss_aux_layer_0": 0.0101776123046875, "loss_aux_layer_1": 0.0272216796875, "loss_aux_layer_10": 0.051513671875, "loss_aux_layer_11": 0.0552978515625, "loss_aux_layer_12": 0.059326171875, "loss_aux_layer_13": 0.06451416015625, "loss_aux_layer_14": 0.072509765625, "loss_aux_layer_15": 0.08056640625, "loss_aux_layer_16": 0.089599609375, "loss_aux_layer_17": 0.09716796875, "loss_aux_layer_18": 0.10546875, "loss_aux_layer_19": 0.1094970703125, "loss_aux_layer_2": 0.03887939453125, "loss_aux_layer_20": 0.1175537109375, "loss_aux_layer_21": 0.1256103515625, "loss_aux_layer_22": 0.1455078125, "loss_aux_layer_23": 0.181640625, "loss_aux_layer_3": 0.04766845703125, "loss_aux_layer_4": 0.04974365234375, "loss_aux_layer_5": 0.05108642578125, "loss_aux_layer_6": 0.05364990234375, "loss_aux_layer_7": 0.05206298828125, "loss_aux_layer_8": 0.051513671875, "loss_aux_layer_9": 0.05047607421875, "step": 4665, "total_loss": 0.6180788278579712 }, { "epoch": 0.9237774698079588, "grad_norm": 0.8146084547042847, "learning_rate": 5e-05, "llm_loss": 0.6137562245130539, "loss": 2.7526, "loss_aux_layer_0": 0.0102386474609375, "loss_aux_layer_1": 0.02728271484375, "loss_aux_layer_10": 0.051025390625, "loss_aux_layer_11": 0.05474853515625, "loss_aux_layer_12": 0.058837890625, "loss_aux_layer_13": 0.063720703125, "loss_aux_layer_14": 0.0714111328125, "loss_aux_layer_15": 0.0789794921875, "loss_aux_layer_16": 0.088134765625, "loss_aux_layer_17": 0.0965576171875, "loss_aux_layer_18": 0.10546875, "loss_aux_layer_19": 0.1102294921875, "loss_aux_layer_2": 0.0380859375, "loss_aux_layer_20": 0.118896484375, "loss_aux_layer_21": 0.127197265625, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.047119140625, "loss_aux_layer_4": 0.0494384765625, "loss_aux_layer_5": 0.05084228515625, "loss_aux_layer_6": 0.05340576171875, "loss_aux_layer_7": 0.05169677734375, "loss_aux_layer_8": 0.0511474609375, "loss_aux_layer_9": 0.050048828125, "step": 4666, "total_loss": 0.6881382316350937 }, { "epoch": 0.9239754504058603, "grad_norm": 0.9155721068382263, "learning_rate": 5e-05, "llm_loss": 0.5801348686218262, "loss": 2.6224, "loss_aux_layer_0": 0.0095977783203125, "loss_aux_layer_1": 0.027984619140625, "loss_aux_layer_10": 0.0528564453125, "loss_aux_layer_11": 0.05657958984375, "loss_aux_layer_12": 0.06072998046875, "loss_aux_layer_13": 0.0660400390625, "loss_aux_layer_14": 0.0743408203125, "loss_aux_layer_15": 0.0823974609375, "loss_aux_layer_16": 0.091552734375, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.1070556640625, "loss_aux_layer_19": 0.1102294921875, "loss_aux_layer_2": 0.03900146484375, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.12548828125, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.04803466796875, "loss_aux_layer_4": 0.0504150390625, "loss_aux_layer_5": 0.05169677734375, "loss_aux_layer_6": 0.05419921875, "loss_aux_layer_7": 0.052734375, "loss_aux_layer_8": 0.05242919921875, "loss_aux_layer_9": 0.0513916015625, "step": 4667, "total_loss": 0.6556064486503601 }, { "epoch": 0.9241734310037616, "grad_norm": 1.0725724697113037, "learning_rate": 5e-05, "llm_loss": 0.5425393283367157, "loss": 2.4691, "loss_aux_layer_0": 0.0101776123046875, "loss_aux_layer_1": 0.027313232421875, "loss_aux_layer_10": 0.05145263671875, "loss_aux_layer_11": 0.05523681640625, "loss_aux_layer_12": 0.0594482421875, "loss_aux_layer_13": 0.06463623046875, "loss_aux_layer_14": 0.07275390625, "loss_aux_layer_15": 0.081298828125, "loss_aux_layer_16": 0.0908203125, "loss_aux_layer_17": 0.0987548828125, "loss_aux_layer_18": 0.1064453125, "loss_aux_layer_19": 0.1099853515625, "loss_aux_layer_2": 0.0386962890625, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.1256103515625, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.181884765625, "loss_aux_layer_3": 0.047607421875, "loss_aux_layer_4": 0.04962158203125, "loss_aux_layer_5": 0.05078125, "loss_aux_layer_6": 0.05328369140625, "loss_aux_layer_7": 0.0517578125, "loss_aux_layer_8": 0.05126953125, "loss_aux_layer_9": 0.05029296875, "step": 4668, "total_loss": 0.6172788739204407 }, { "epoch": 0.924371411601663, "grad_norm": 0.8223277926445007, "learning_rate": 5e-05, "llm_loss": 0.5524432063102722, "loss": 2.5286, "loss_aux_layer_0": 0.0101470947265625, "loss_aux_layer_1": 0.03009033203125, "loss_aux_layer_10": 0.05670166015625, "loss_aux_layer_11": 0.06085205078125, "loss_aux_layer_12": 0.06494140625, "loss_aux_layer_13": 0.0703125, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.086669921875, "loss_aux_layer_16": 0.0960693359375, "loss_aux_layer_17": 0.103515625, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.04266357421875, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.05242919921875, "loss_aux_layer_4": 0.05474853515625, "loss_aux_layer_5": 0.0562744140625, "loss_aux_layer_6": 0.05908203125, "loss_aux_layer_7": 0.0572509765625, "loss_aux_layer_8": 0.056640625, "loss_aux_layer_9": 0.055419921875, "step": 4669, "total_loss": 0.6321512907743454 }, { "epoch": 0.9245693921995645, "grad_norm": 0.8498099446296692, "learning_rate": 5e-05, "llm_loss": 0.5924351885914803, "loss": 2.6888, "loss_aux_layer_0": 0.00994873046875, "loss_aux_layer_1": 0.030731201171875, "loss_aux_layer_10": 0.05792236328125, "loss_aux_layer_11": 0.06201171875, "loss_aux_layer_12": 0.0660400390625, "loss_aux_layer_13": 0.07122802734375, "loss_aux_layer_14": 0.079345703125, "loss_aux_layer_15": 0.0875244140625, "loss_aux_layer_16": 0.0965576171875, "loss_aux_layer_17": 0.1036376953125, "loss_aux_layer_18": 0.11083984375, "loss_aux_layer_19": 0.11328125, "loss_aux_layer_2": 0.04315185546875, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.128173828125, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.05352783203125, "loss_aux_layer_4": 0.0560302734375, "loss_aux_layer_5": 0.05755615234375, "loss_aux_layer_6": 0.0604248046875, "loss_aux_layer_7": 0.05877685546875, "loss_aux_layer_8": 0.05804443359375, "loss_aux_layer_9": 0.05670166015625, "step": 4670, "total_loss": 0.6721911877393723 }, { "epoch": 0.9247673727974659, "grad_norm": 0.7823892831802368, "learning_rate": 5e-05, "llm_loss": 0.6179337874054909, "loss": 2.7839, "loss_aux_layer_0": 0.01043701171875, "loss_aux_layer_1": 0.02984619140625, "loss_aux_layer_10": 0.05633544921875, "loss_aux_layer_11": 0.06005859375, "loss_aux_layer_12": 0.06414794921875, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.0772705078125, "loss_aux_layer_15": 0.0850830078125, "loss_aux_layer_16": 0.0936279296875, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.1083984375, "loss_aux_layer_19": 0.111328125, "loss_aux_layer_2": 0.0418701171875, "loss_aux_layer_20": 0.1192626953125, "loss_aux_layer_21": 0.1273193359375, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.181396484375, "loss_aux_layer_3": 0.052001953125, "loss_aux_layer_4": 0.054443359375, "loss_aux_layer_5": 0.0560302734375, "loss_aux_layer_6": 0.0589599609375, "loss_aux_layer_7": 0.0572509765625, "loss_aux_layer_8": 0.05633544921875, "loss_aux_layer_9": 0.05517578125, "step": 4671, "total_loss": 0.6959697753190994 }, { "epoch": 0.9249653533953672, "grad_norm": 0.749293863773346, "learning_rate": 5e-05, "llm_loss": 0.5797648727893829, "loss": 2.6335, "loss_aux_layer_0": 0.01007080078125, "loss_aux_layer_1": 0.028594970703125, "loss_aux_layer_10": 0.0557861328125, "loss_aux_layer_11": 0.05963134765625, "loss_aux_layer_12": 0.06402587890625, "loss_aux_layer_13": 0.0697021484375, "loss_aux_layer_14": 0.077880859375, "loss_aux_layer_15": 0.0865478515625, "loss_aux_layer_16": 0.095703125, "loss_aux_layer_17": 0.103271484375, "loss_aux_layer_18": 0.1112060546875, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.0400390625, "loss_aux_layer_20": 0.12158203125, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.04986572265625, "loss_aux_layer_4": 0.05267333984375, "loss_aux_layer_5": 0.05438232421875, "loss_aux_layer_6": 0.057373046875, "loss_aux_layer_7": 0.05572509765625, "loss_aux_layer_8": 0.05517578125, "loss_aux_layer_9": 0.0543212890625, "step": 4672, "total_loss": 0.6583743393421173 }, { "epoch": 0.9251633339932687, "grad_norm": 0.793854296207428, "learning_rate": 5e-05, "llm_loss": 0.5579159110784531, "loss": 2.5399, "loss_aux_layer_0": 0.009857177734375, "loss_aux_layer_1": 0.02947998046875, "loss_aux_layer_10": 0.05523681640625, "loss_aux_layer_11": 0.05889892578125, "loss_aux_layer_12": 0.06304931640625, "loss_aux_layer_13": 0.06842041015625, "loss_aux_layer_14": 0.076416015625, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.09228515625, "loss_aux_layer_17": 0.0997314453125, "loss_aux_layer_18": 0.1077880859375, "loss_aux_layer_19": 0.110595703125, "loss_aux_layer_2": 0.04095458984375, "loss_aux_layer_20": 0.11767578125, "loss_aux_layer_21": 0.1259765625, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.18310546875, "loss_aux_layer_3": 0.0504150390625, "loss_aux_layer_4": 0.052978515625, "loss_aux_layer_5": 0.0543212890625, "loss_aux_layer_6": 0.05718994140625, "loss_aux_layer_7": 0.05548095703125, "loss_aux_layer_8": 0.05499267578125, "loss_aux_layer_9": 0.0538330078125, "step": 4673, "total_loss": 0.6349784433841705 }, { "epoch": 0.9253613145911701, "grad_norm": 0.7877159714698792, "learning_rate": 5e-05, "llm_loss": 0.6866574883460999, "loss": 3.0609, "loss_aux_layer_0": 0.0099945068359375, "loss_aux_layer_1": 0.030670166015625, "loss_aux_layer_10": 0.056640625, "loss_aux_layer_11": 0.06024169921875, "loss_aux_layer_12": 0.06439208984375, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.077392578125, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.093994140625, "loss_aux_layer_17": 0.101318359375, "loss_aux_layer_18": 0.108642578125, "loss_aux_layer_19": 0.1114501953125, "loss_aux_layer_2": 0.04278564453125, "loss_aux_layer_20": 0.1195068359375, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.05267333984375, "loss_aux_layer_4": 0.05511474609375, "loss_aux_layer_5": 0.0565185546875, "loss_aux_layer_6": 0.05950927734375, "loss_aux_layer_7": 0.0574951171875, "loss_aux_layer_8": 0.0567626953125, "loss_aux_layer_9": 0.055419921875, "step": 4674, "total_loss": 0.7652294486761093 }, { "epoch": 0.9255592951890714, "grad_norm": 0.7879067063331604, "learning_rate": 5e-05, "llm_loss": 0.5213272869586945, "loss": 2.4045, "loss_aux_layer_0": 0.0098114013671875, "loss_aux_layer_1": 0.029693603515625, "loss_aux_layer_10": 0.05712890625, "loss_aux_layer_11": 0.06109619140625, "loss_aux_layer_12": 0.06536865234375, "loss_aux_layer_13": 0.0711669921875, "loss_aux_layer_14": 0.079345703125, "loss_aux_layer_15": 0.0877685546875, "loss_aux_layer_16": 0.0965576171875, "loss_aux_layer_17": 0.1038818359375, "loss_aux_layer_18": 0.1124267578125, "loss_aux_layer_19": 0.115234375, "loss_aux_layer_2": 0.041748046875, "loss_aux_layer_20": 0.1229248046875, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.052001953125, "loss_aux_layer_4": 0.05438232421875, "loss_aux_layer_5": 0.05584716796875, "loss_aux_layer_6": 0.05908203125, "loss_aux_layer_7": 0.05743408203125, "loss_aux_layer_8": 0.05694580078125, "loss_aux_layer_9": 0.05572509765625, "step": 4675, "total_loss": 0.6011203676462173 }, { "epoch": 0.9257572757869729, "grad_norm": 0.9024222493171692, "learning_rate": 5e-05, "llm_loss": 0.5124230459332466, "loss": 2.3445, "loss_aux_layer_0": 0.0097808837890625, "loss_aux_layer_1": 0.02630615234375, "loss_aux_layer_10": 0.050537109375, "loss_aux_layer_11": 0.05419921875, "loss_aux_layer_12": 0.05828857421875, "loss_aux_layer_13": 0.06329345703125, "loss_aux_layer_14": 0.071533203125, "loss_aux_layer_15": 0.079833984375, "loss_aux_layer_16": 0.0892333984375, "loss_aux_layer_17": 0.0966796875, "loss_aux_layer_18": 0.1051025390625, "loss_aux_layer_19": 0.1087646484375, "loss_aux_layer_2": 0.03668212890625, "loss_aux_layer_20": 0.116943359375, "loss_aux_layer_21": 0.12548828125, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.04571533203125, "loss_aux_layer_4": 0.0479736328125, "loss_aux_layer_5": 0.049560546875, "loss_aux_layer_6": 0.05224609375, "loss_aux_layer_7": 0.05072021484375, "loss_aux_layer_8": 0.05023193359375, "loss_aux_layer_9": 0.04925537109375, "step": 4676, "total_loss": 0.5861357301473618 }, { "epoch": 0.9259552563848743, "grad_norm": 0.8724135756492615, "learning_rate": 5e-05, "llm_loss": 0.5277305096387863, "loss": 2.4223, "loss_aux_layer_0": 0.0098724365234375, "loss_aux_layer_1": 0.029327392578125, "loss_aux_layer_10": 0.05487060546875, "loss_aux_layer_11": 0.0589599609375, "loss_aux_layer_12": 0.0633544921875, "loss_aux_layer_13": 0.0684814453125, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.0849609375, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.1103515625, "loss_aux_layer_19": 0.1136474609375, "loss_aux_layer_2": 0.04083251953125, "loss_aux_layer_20": 0.1212158203125, "loss_aux_layer_21": 0.1282958984375, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.05023193359375, "loss_aux_layer_4": 0.052734375, "loss_aux_layer_5": 0.05419921875, "loss_aux_layer_6": 0.05712890625, "loss_aux_layer_7": 0.05548095703125, "loss_aux_layer_8": 0.0550537109375, "loss_aux_layer_9": 0.0537109375, "step": 4677, "total_loss": 0.6055668592453003 }, { "epoch": 0.9261532369827757, "grad_norm": 0.8092084527015686, "learning_rate": 5e-05, "llm_loss": 0.5978313535451889, "loss": 2.6952, "loss_aux_layer_0": 0.0099029541015625, "loss_aux_layer_1": 0.027099609375, "loss_aux_layer_10": 0.05279541015625, "loss_aux_layer_11": 0.056640625, "loss_aux_layer_12": 0.060791015625, "loss_aux_layer_13": 0.066162109375, "loss_aux_layer_14": 0.0743408203125, "loss_aux_layer_15": 0.08251953125, "loss_aux_layer_16": 0.0914306640625, "loss_aux_layer_17": 0.0992431640625, "loss_aux_layer_18": 0.1075439453125, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.0382080078125, "loss_aux_layer_20": 0.1202392578125, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.04742431640625, "loss_aux_layer_4": 0.04998779296875, "loss_aux_layer_5": 0.05157470703125, "loss_aux_layer_6": 0.054443359375, "loss_aux_layer_7": 0.05279541015625, "loss_aux_layer_8": 0.05242919921875, "loss_aux_layer_9": 0.0513916015625, "step": 4678, "total_loss": 0.6738085746765137 }, { "epoch": 0.9263512175806771, "grad_norm": 0.8351644277572632, "learning_rate": 5e-05, "llm_loss": 0.5626013427972794, "loss": 2.5632, "loss_aux_layer_0": 0.009765625, "loss_aux_layer_1": 0.0291748046875, "loss_aux_layer_10": 0.05511474609375, "loss_aux_layer_11": 0.0589599609375, "loss_aux_layer_12": 0.063232421875, "loss_aux_layer_13": 0.06866455078125, "loss_aux_layer_14": 0.0771484375, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.0948486328125, "loss_aux_layer_17": 0.1026611328125, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.11376953125, "loss_aux_layer_2": 0.04034423828125, "loss_aux_layer_20": 0.1214599609375, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.05029296875, "loss_aux_layer_4": 0.05291748046875, "loss_aux_layer_5": 0.05438232421875, "loss_aux_layer_6": 0.05712890625, "loss_aux_layer_7": 0.05535888671875, "loss_aux_layer_8": 0.0548095703125, "loss_aux_layer_9": 0.0538330078125, "step": 4679, "total_loss": 0.6408021599054337 }, { "epoch": 0.9265491981785785, "grad_norm": 0.7606856822967529, "learning_rate": 5e-05, "llm_loss": 0.6725680083036423, "loss": 2.9834, "loss_aux_layer_0": 0.010101318359375, "loss_aux_layer_1": 0.026580810546875, "loss_aux_layer_10": 0.05072021484375, "loss_aux_layer_11": 0.05413818359375, "loss_aux_layer_12": 0.0582275390625, "loss_aux_layer_13": 0.06317138671875, "loss_aux_layer_14": 0.071044921875, "loss_aux_layer_15": 0.0792236328125, "loss_aux_layer_16": 0.0880126953125, "loss_aux_layer_17": 0.095947265625, "loss_aux_layer_18": 0.10400390625, "loss_aux_layer_19": 0.10791015625, "loss_aux_layer_2": 0.0367431640625, "loss_aux_layer_20": 0.1160888671875, "loss_aux_layer_21": 0.1241455078125, "loss_aux_layer_22": 0.144287109375, "loss_aux_layer_23": 0.18017578125, "loss_aux_layer_3": 0.04583740234375, "loss_aux_layer_4": 0.0484619140625, "loss_aux_layer_5": 0.04998779296875, "loss_aux_layer_6": 0.0528564453125, "loss_aux_layer_7": 0.0511474609375, "loss_aux_layer_8": 0.0506591796875, "loss_aux_layer_9": 0.04949951171875, "step": 4680, "total_loss": 0.7458465844392776 }, { "epoch": 0.9267471787764799, "grad_norm": 0.8535929918289185, "learning_rate": 5e-05, "llm_loss": 0.5596539080142975, "loss": 2.5358, "loss_aux_layer_0": 0.0105133056640625, "loss_aux_layer_1": 0.02752685546875, "loss_aux_layer_10": 0.05133056640625, "loss_aux_layer_11": 0.054931640625, "loss_aux_layer_12": 0.0587158203125, "loss_aux_layer_13": 0.063720703125, "loss_aux_layer_14": 0.0716552734375, "loss_aux_layer_15": 0.07958984375, "loss_aux_layer_16": 0.08837890625, "loss_aux_layer_17": 0.0960693359375, "loss_aux_layer_18": 0.1043701171875, "loss_aux_layer_19": 0.109130859375, "loss_aux_layer_2": 0.0382080078125, "loss_aux_layer_20": 0.1177978515625, "loss_aux_layer_21": 0.126220703125, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.182373046875, "loss_aux_layer_3": 0.04736328125, "loss_aux_layer_4": 0.04962158203125, "loss_aux_layer_5": 0.05120849609375, "loss_aux_layer_6": 0.0538330078125, "loss_aux_layer_7": 0.052001953125, "loss_aux_layer_8": 0.051513671875, "loss_aux_layer_9": 0.05023193359375, "step": 4681, "total_loss": 0.6339502036571503 }, { "epoch": 0.9269451593743813, "grad_norm": 0.9066555500030518, "learning_rate": 5e-05, "llm_loss": 0.6143378615379333, "loss": 2.7735, "loss_aux_layer_0": 0.0092926025390625, "loss_aux_layer_1": 0.029632568359375, "loss_aux_layer_10": 0.05670166015625, "loss_aux_layer_11": 0.060791015625, "loss_aux_layer_12": 0.065185546875, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.078857421875, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.0965576171875, "loss_aux_layer_17": 0.1033935546875, "loss_aux_layer_18": 0.1112060546875, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.041015625, "loss_aux_layer_20": 0.1214599609375, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.05126953125, "loss_aux_layer_4": 0.053955078125, "loss_aux_layer_5": 0.05548095703125, "loss_aux_layer_6": 0.05828857421875, "loss_aux_layer_7": 0.05682373046875, "loss_aux_layer_8": 0.056396484375, "loss_aux_layer_9": 0.05548095703125, "step": 4682, "total_loss": 0.6933661550283432 }, { "epoch": 0.9271431399722827, "grad_norm": 0.891272246837616, "learning_rate": 5e-05, "llm_loss": 0.5705422684550285, "loss": 2.5893, "loss_aux_layer_0": 0.01043701171875, "loss_aux_layer_1": 0.028717041015625, "loss_aux_layer_10": 0.05474853515625, "loss_aux_layer_11": 0.05841064453125, "loss_aux_layer_12": 0.06268310546875, "loss_aux_layer_13": 0.0677490234375, "loss_aux_layer_14": 0.0758056640625, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.0926513671875, "loss_aux_layer_17": 0.099853515625, "loss_aux_layer_18": 0.1072998046875, "loss_aux_layer_19": 0.1104736328125, "loss_aux_layer_2": 0.04052734375, "loss_aux_layer_20": 0.1181640625, "loss_aux_layer_21": 0.126220703125, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.0499267578125, "loss_aux_layer_4": 0.05224609375, "loss_aux_layer_5": 0.0538330078125, "loss_aux_layer_6": 0.0567626953125, "loss_aux_layer_7": 0.0550537109375, "loss_aux_layer_8": 0.0545654296875, "loss_aux_layer_9": 0.053466796875, "step": 4683, "total_loss": 0.6473289504647255 }, { "epoch": 0.9273411205701841, "grad_norm": 1.185879111289978, "learning_rate": 5e-05, "llm_loss": 0.6176029443740845, "loss": 2.7837, "loss_aux_layer_0": 0.01043701171875, "loss_aux_layer_1": 0.030731201171875, "loss_aux_layer_10": 0.056396484375, "loss_aux_layer_11": 0.060302734375, "loss_aux_layer_12": 0.0645751953125, "loss_aux_layer_13": 0.06964111328125, "loss_aux_layer_14": 0.077880859375, "loss_aux_layer_15": 0.0859375, "loss_aux_layer_16": 0.0947265625, "loss_aux_layer_17": 0.10205078125, "loss_aux_layer_18": 0.10986328125, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.04217529296875, "loss_aux_layer_20": 0.119873046875, "loss_aux_layer_21": 0.12646484375, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.1806640625, "loss_aux_layer_3": 0.05194091796875, "loss_aux_layer_4": 0.05438232421875, "loss_aux_layer_5": 0.05596923828125, "loss_aux_layer_6": 0.05889892578125, "loss_aux_layer_7": 0.05712890625, "loss_aux_layer_8": 0.05633544921875, "loss_aux_layer_9": 0.054931640625, "step": 4684, "total_loss": 0.6959159523248672 }, { "epoch": 0.9275391011680856, "grad_norm": 1.1680763959884644, "learning_rate": 5e-05, "llm_loss": 0.6469641327857971, "loss": 2.8931, "loss_aux_layer_0": 0.0101165771484375, "loss_aux_layer_1": 0.028594970703125, "loss_aux_layer_10": 0.0540771484375, "loss_aux_layer_11": 0.0579833984375, "loss_aux_layer_12": 0.06195068359375, "loss_aux_layer_13": 0.06719970703125, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.082763671875, "loss_aux_layer_16": 0.091552734375, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.1070556640625, "loss_aux_layer_19": 0.110595703125, "loss_aux_layer_2": 0.03924560546875, "loss_aux_layer_20": 0.1180419921875, "loss_aux_layer_21": 0.126220703125, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.04888916015625, "loss_aux_layer_4": 0.05145263671875, "loss_aux_layer_5": 0.05316162109375, "loss_aux_layer_6": 0.05609130859375, "loss_aux_layer_7": 0.05438232421875, "loss_aux_layer_8": 0.053955078125, "loss_aux_layer_9": 0.0528564453125, "step": 4685, "total_loss": 0.7232702225446701 }, { "epoch": 0.9277370817659869, "grad_norm": 1.0479599237442017, "learning_rate": 5e-05, "llm_loss": 0.5099108815193176, "loss": 2.3494, "loss_aux_layer_0": 0.010986328125, "loss_aux_layer_1": 0.0296630859375, "loss_aux_layer_10": 0.05499267578125, "loss_aux_layer_11": 0.0587158203125, "loss_aux_layer_12": 0.06280517578125, "loss_aux_layer_13": 0.0682373046875, "loss_aux_layer_14": 0.0760498046875, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.09326171875, "loss_aux_layer_17": 0.1005859375, "loss_aux_layer_18": 0.1080322265625, "loss_aux_layer_19": 0.1112060546875, "loss_aux_layer_2": 0.041015625, "loss_aux_layer_20": 0.118896484375, "loss_aux_layer_21": 0.127197265625, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.05029296875, "loss_aux_layer_4": 0.05279541015625, "loss_aux_layer_5": 0.05426025390625, "loss_aux_layer_6": 0.05706787109375, "loss_aux_layer_7": 0.055419921875, "loss_aux_layer_8": 0.05487060546875, "loss_aux_layer_9": 0.0537109375, "step": 4686, "total_loss": 0.5873505845665932 }, { "epoch": 0.9279350623638883, "grad_norm": 0.9215664863586426, "learning_rate": 5e-05, "llm_loss": 0.5714939832687378, "loss": 2.581, "loss_aux_layer_0": 0.0097198486328125, "loss_aux_layer_1": 0.027313232421875, "loss_aux_layer_10": 0.05084228515625, "loss_aux_layer_11": 0.0543212890625, "loss_aux_layer_12": 0.05853271484375, "loss_aux_layer_13": 0.06365966796875, "loss_aux_layer_14": 0.071533203125, "loss_aux_layer_15": 0.0794677734375, "loss_aux_layer_16": 0.0885009765625, "loss_aux_layer_17": 0.095947265625, "loss_aux_layer_18": 0.1038818359375, "loss_aux_layer_19": 0.1080322265625, "loss_aux_layer_2": 0.03802490234375, "loss_aux_layer_20": 0.1165771484375, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.180908203125, "loss_aux_layer_3": 0.0469970703125, "loss_aux_layer_4": 0.04937744140625, "loss_aux_layer_5": 0.050537109375, "loss_aux_layer_6": 0.05303955078125, "loss_aux_layer_7": 0.05126953125, "loss_aux_layer_8": 0.05078125, "loss_aux_layer_9": 0.04986572265625, "step": 4687, "total_loss": 0.645253986120224 }, { "epoch": 0.9281330429617898, "grad_norm": 1.0928380489349365, "learning_rate": 5e-05, "llm_loss": 0.5637626573443413, "loss": 2.5636, "loss_aux_layer_0": 0.0105438232421875, "loss_aux_layer_1": 0.029296875, "loss_aux_layer_10": 0.05389404296875, "loss_aux_layer_11": 0.05767822265625, "loss_aux_layer_12": 0.0618896484375, "loss_aux_layer_13": 0.06707763671875, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.0831298828125, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.099609375, "loss_aux_layer_18": 0.107666015625, "loss_aux_layer_19": 0.1121826171875, "loss_aux_layer_2": 0.0406494140625, "loss_aux_layer_20": 0.1202392578125, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.04986572265625, "loss_aux_layer_4": 0.052490234375, "loss_aux_layer_5": 0.0535888671875, "loss_aux_layer_6": 0.05633544921875, "loss_aux_layer_7": 0.05462646484375, "loss_aux_layer_8": 0.0540771484375, "loss_aux_layer_9": 0.05279541015625, "step": 4688, "total_loss": 0.6408935338258743 }, { "epoch": 0.9283310235596911, "grad_norm": 0.9529284834861755, "learning_rate": 5e-05, "llm_loss": 0.5148098319768906, "loss": 2.3829, "loss_aux_layer_0": 0.0098419189453125, "loss_aux_layer_1": 0.030120849609375, "loss_aux_layer_10": 0.05780029296875, "loss_aux_layer_11": 0.06170654296875, "loss_aux_layer_12": 0.0662841796875, "loss_aux_layer_13": 0.0716552734375, "loss_aux_layer_14": 0.0802001953125, "loss_aux_layer_15": 0.0880126953125, "loss_aux_layer_16": 0.09716796875, "loss_aux_layer_17": 0.1051025390625, "loss_aux_layer_18": 0.113525390625, "loss_aux_layer_19": 0.116943359375, "loss_aux_layer_2": 0.04241943359375, "loss_aux_layer_20": 0.12451171875, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.1552734375, "loss_aux_layer_23": 0.193603515625, "loss_aux_layer_3": 0.0523681640625, "loss_aux_layer_4": 0.05523681640625, "loss_aux_layer_5": 0.05670166015625, "loss_aux_layer_6": 0.0595703125, "loss_aux_layer_7": 0.057861328125, "loss_aux_layer_8": 0.057373046875, "loss_aux_layer_9": 0.0562744140625, "step": 4689, "total_loss": 0.5957334712147713 }, { "epoch": 0.9285290041575925, "grad_norm": 0.9177556037902832, "learning_rate": 5e-05, "llm_loss": 0.5738704800605774, "loss": 2.6066, "loss_aux_layer_0": 0.010894775390625, "loss_aux_layer_1": 0.028717041015625, "loss_aux_layer_10": 0.05426025390625, "loss_aux_layer_11": 0.05816650390625, "loss_aux_layer_12": 0.06207275390625, "loss_aux_layer_13": 0.0675048828125, "loss_aux_layer_14": 0.075927734375, "loss_aux_layer_15": 0.08447265625, "loss_aux_layer_16": 0.093994140625, "loss_aux_layer_17": 0.10205078125, "loss_aux_layer_18": 0.110595703125, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.03985595703125, "loss_aux_layer_20": 0.1226806640625, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.04937744140625, "loss_aux_layer_4": 0.05218505859375, "loss_aux_layer_5": 0.05364990234375, "loss_aux_layer_6": 0.0562744140625, "loss_aux_layer_7": 0.05474853515625, "loss_aux_layer_8": 0.05426025390625, "loss_aux_layer_9": 0.05303955078125, "step": 4690, "total_loss": 0.651662215590477 }, { "epoch": 0.928726984755494, "grad_norm": 0.9922481775283813, "learning_rate": 5e-05, "llm_loss": 0.5924567729234695, "loss": 2.6707, "loss_aux_layer_0": 0.0100250244140625, "loss_aux_layer_1": 0.028350830078125, "loss_aux_layer_10": 0.0535888671875, "loss_aux_layer_11": 0.05718994140625, "loss_aux_layer_12": 0.0611572265625, "loss_aux_layer_13": 0.0662841796875, "loss_aux_layer_14": 0.07421875, "loss_aux_layer_15": 0.0821533203125, "loss_aux_layer_16": 0.0906982421875, "loss_aux_layer_17": 0.097900390625, "loss_aux_layer_18": 0.105712890625, "loss_aux_layer_19": 0.1090087890625, "loss_aux_layer_2": 0.04010009765625, "loss_aux_layer_20": 0.115966796875, "loss_aux_layer_21": 0.1234130859375, "loss_aux_layer_22": 0.142822265625, "loss_aux_layer_23": 0.177490234375, "loss_aux_layer_3": 0.04925537109375, "loss_aux_layer_4": 0.051513671875, "loss_aux_layer_5": 0.0528564453125, "loss_aux_layer_6": 0.05560302734375, "loss_aux_layer_7": 0.053955078125, "loss_aux_layer_8": 0.0533447265625, "loss_aux_layer_9": 0.05218505859375, "step": 4691, "total_loss": 0.6676787286996841 }, { "epoch": 0.9289249653533954, "grad_norm": 0.9765635132789612, "learning_rate": 5e-05, "llm_loss": 0.5831006765365601, "loss": 2.6312, "loss_aux_layer_0": 0.01019287109375, "loss_aux_layer_1": 0.02734375, "loss_aux_layer_10": 0.05224609375, "loss_aux_layer_11": 0.055908203125, "loss_aux_layer_12": 0.05999755859375, "loss_aux_layer_13": 0.065185546875, "loss_aux_layer_14": 0.0731201171875, "loss_aux_layer_15": 0.0811767578125, "loss_aux_layer_16": 0.08984375, "loss_aux_layer_17": 0.0977783203125, "loss_aux_layer_18": 0.105712890625, "loss_aux_layer_19": 0.1090087890625, "loss_aux_layer_2": 0.0382080078125, "loss_aux_layer_20": 0.1168212890625, "loss_aux_layer_21": 0.125, "loss_aux_layer_22": 0.1455078125, "loss_aux_layer_23": 0.181396484375, "loss_aux_layer_3": 0.0472412109375, "loss_aux_layer_4": 0.0496826171875, "loss_aux_layer_5": 0.05126953125, "loss_aux_layer_6": 0.05401611328125, "loss_aux_layer_7": 0.05255126953125, "loss_aux_layer_8": 0.0523681640625, "loss_aux_layer_9": 0.05133056640625, "step": 4692, "total_loss": 0.6578044593334198 }, { "epoch": 0.9291229459512967, "grad_norm": 0.9515141248703003, "learning_rate": 5e-05, "llm_loss": 0.598766878247261, "loss": 2.701, "loss_aux_layer_0": 0.0101165771484375, "loss_aux_layer_1": 0.028228759765625, "loss_aux_layer_10": 0.05389404296875, "loss_aux_layer_11": 0.0577392578125, "loss_aux_layer_12": 0.06201171875, "loss_aux_layer_13": 0.0672607421875, "loss_aux_layer_14": 0.0753173828125, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.0997314453125, "loss_aux_layer_18": 0.108154296875, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.03985595703125, "loss_aux_layer_20": 0.1190185546875, "loss_aux_layer_21": 0.127197265625, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.05181884765625, "loss_aux_layer_5": 0.0533447265625, "loss_aux_layer_6": 0.05584716796875, "loss_aux_layer_7": 0.05413818359375, "loss_aux_layer_8": 0.05377197265625, "loss_aux_layer_9": 0.05279541015625, "step": 4693, "total_loss": 0.6752583086490631 }, { "epoch": 0.9293209265491982, "grad_norm": 1.0090233087539673, "learning_rate": 5e-05, "llm_loss": 0.5946730375289917, "loss": 2.6772, "loss_aux_layer_0": 0.0103912353515625, "loss_aux_layer_1": 0.02764892578125, "loss_aux_layer_10": 0.05145263671875, "loss_aux_layer_11": 0.05499267578125, "loss_aux_layer_12": 0.0592041015625, "loss_aux_layer_13": 0.06427001953125, "loss_aux_layer_14": 0.0723876953125, "loss_aux_layer_15": 0.0802001953125, "loss_aux_layer_16": 0.0894775390625, "loss_aux_layer_17": 0.096923828125, "loss_aux_layer_18": 0.105224609375, "loss_aux_layer_19": 0.10888671875, "loss_aux_layer_2": 0.03857421875, "loss_aux_layer_20": 0.11767578125, "loss_aux_layer_21": 0.12646484375, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.04742431640625, "loss_aux_layer_4": 0.049560546875, "loss_aux_layer_5": 0.0509033203125, "loss_aux_layer_6": 0.05352783203125, "loss_aux_layer_7": 0.052001953125, "loss_aux_layer_8": 0.0513916015625, "loss_aux_layer_9": 0.05035400390625, "step": 4694, "total_loss": 0.6692925691604614 }, { "epoch": 0.9295189071470996, "grad_norm": 0.8774135708808899, "learning_rate": 5e-05, "llm_loss": 0.5430857837200165, "loss": 2.4809, "loss_aux_layer_0": 0.0109405517578125, "loss_aux_layer_1": 0.028839111328125, "loss_aux_layer_10": 0.05450439453125, "loss_aux_layer_11": 0.05810546875, "loss_aux_layer_12": 0.06243896484375, "loss_aux_layer_13": 0.0672607421875, "loss_aux_layer_14": 0.075439453125, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.099609375, "loss_aux_layer_18": 0.1077880859375, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.0406494140625, "loss_aux_layer_20": 0.1201171875, "loss_aux_layer_21": 0.1287841796875, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.0498046875, "loss_aux_layer_4": 0.0521240234375, "loss_aux_layer_5": 0.05364990234375, "loss_aux_layer_6": 0.056396484375, "loss_aux_layer_7": 0.05499267578125, "loss_aux_layer_8": 0.054443359375, "loss_aux_layer_9": 0.05322265625, "step": 4695, "total_loss": 0.6202293038368225 }, { "epoch": 0.9297168877450009, "grad_norm": 0.781769871711731, "learning_rate": 5e-05, "llm_loss": 0.5063243135809898, "loss": 2.3372, "loss_aux_layer_0": 0.00958251953125, "loss_aux_layer_1": 0.02880859375, "loss_aux_layer_10": 0.055419921875, "loss_aux_layer_11": 0.05938720703125, "loss_aux_layer_12": 0.0634765625, "loss_aux_layer_13": 0.068359375, "loss_aux_layer_14": 0.0762939453125, "loss_aux_layer_15": 0.0843505859375, "loss_aux_layer_16": 0.09326171875, "loss_aux_layer_17": 0.1007080078125, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.0408935546875, "loss_aux_layer_20": 0.120849609375, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.05023193359375, "loss_aux_layer_4": 0.05279541015625, "loss_aux_layer_5": 0.05413818359375, "loss_aux_layer_6": 0.057373046875, "loss_aux_layer_7": 0.05572509765625, "loss_aux_layer_8": 0.0552978515625, "loss_aux_layer_9": 0.0540771484375, "step": 4696, "total_loss": 0.5842998921871185 }, { "epoch": 0.9299148683429024, "grad_norm": 0.8921736478805542, "learning_rate": 5e-05, "llm_loss": 0.5894852429628372, "loss": 2.6727, "loss_aux_layer_0": 0.010406494140625, "loss_aux_layer_1": 0.0294189453125, "loss_aux_layer_10": 0.05572509765625, "loss_aux_layer_11": 0.05963134765625, "loss_aux_layer_12": 0.0640869140625, "loss_aux_layer_13": 0.0697021484375, "loss_aux_layer_14": 0.077880859375, "loss_aux_layer_15": 0.0860595703125, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.10302734375, "loss_aux_layer_18": 0.1104736328125, "loss_aux_layer_19": 0.113525390625, "loss_aux_layer_2": 0.04144287109375, "loss_aux_layer_20": 0.121337890625, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.051025390625, "loss_aux_layer_4": 0.05328369140625, "loss_aux_layer_5": 0.0545654296875, "loss_aux_layer_6": 0.05743408203125, "loss_aux_layer_7": 0.055908203125, "loss_aux_layer_8": 0.055419921875, "loss_aux_layer_9": 0.0543212890625, "step": 4697, "total_loss": 0.6681714951992035 }, { "epoch": 0.9301128489408038, "grad_norm": 0.8788633942604065, "learning_rate": 5e-05, "llm_loss": 0.6161914169788361, "loss": 2.7918, "loss_aux_layer_0": 0.010345458984375, "loss_aux_layer_1": 0.03118896484375, "loss_aux_layer_10": 0.05902099609375, "loss_aux_layer_11": 0.06304931640625, "loss_aux_layer_12": 0.0673828125, "loss_aux_layer_13": 0.07275390625, "loss_aux_layer_14": 0.0811767578125, "loss_aux_layer_15": 0.0894775390625, "loss_aux_layer_16": 0.0985107421875, "loss_aux_layer_17": 0.1060791015625, "loss_aux_layer_18": 0.1136474609375, "loss_aux_layer_19": 0.1175537109375, "loss_aux_layer_2": 0.043701171875, "loss_aux_layer_20": 0.12548828125, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.0540771484375, "loss_aux_layer_4": 0.0567626953125, "loss_aux_layer_5": 0.0582275390625, "loss_aux_layer_6": 0.0615234375, "loss_aux_layer_7": 0.05975341796875, "loss_aux_layer_8": 0.05889892578125, "loss_aux_layer_9": 0.05755615234375, "step": 4698, "total_loss": 0.6979612857103348 }, { "epoch": 0.9303108295387053, "grad_norm": 0.8233447670936584, "learning_rate": 5e-05, "llm_loss": 0.5589845553040504, "loss": 2.5445, "loss_aux_layer_0": 0.0097503662109375, "loss_aux_layer_1": 0.028717041015625, "loss_aux_layer_10": 0.0543212890625, "loss_aux_layer_11": 0.0582275390625, "loss_aux_layer_12": 0.06292724609375, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.076171875, "loss_aux_layer_15": 0.084228515625, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.1014404296875, "loss_aux_layer_18": 0.109130859375, "loss_aux_layer_19": 0.1123046875, "loss_aux_layer_2": 0.040283203125, "loss_aux_layer_20": 0.1201171875, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.04974365234375, "loss_aux_layer_4": 0.05206298828125, "loss_aux_layer_5": 0.05340576171875, "loss_aux_layer_6": 0.05633544921875, "loss_aux_layer_7": 0.0545654296875, "loss_aux_layer_8": 0.0540771484375, "loss_aux_layer_9": 0.05303955078125, "step": 4699, "total_loss": 0.6361275985836983 }, { "epoch": 0.9305088101366066, "grad_norm": 0.8600349426269531, "learning_rate": 5e-05, "llm_loss": 0.5794713050127029, "loss": 2.6314, "loss_aux_layer_0": 0.0098114013671875, "loss_aux_layer_1": 0.029693603515625, "loss_aux_layer_10": 0.05633544921875, "loss_aux_layer_11": 0.060302734375, "loss_aux_layer_12": 0.0643310546875, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.0770263671875, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.0936279296875, "loss_aux_layer_17": 0.1009521484375, "loss_aux_layer_18": 0.1090087890625, "loss_aux_layer_19": 0.1124267578125, "loss_aux_layer_2": 0.0419921875, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.05194091796875, "loss_aux_layer_4": 0.0545654296875, "loss_aux_layer_5": 0.05584716796875, "loss_aux_layer_6": 0.05877685546875, "loss_aux_layer_7": 0.05694580078125, "loss_aux_layer_8": 0.0562744140625, "loss_aux_layer_9": 0.0550537109375, "step": 4700, "total_loss": 0.6578491181135178 }, { "epoch": 0.930706790734508, "grad_norm": 0.7668083310127258, "learning_rate": 5e-05, "llm_loss": 0.46111904084682465, "loss": 2.1606, "loss_aux_layer_0": 0.0099639892578125, "loss_aux_layer_1": 0.029815673828125, "loss_aux_layer_10": 0.05596923828125, "loss_aux_layer_11": 0.059814453125, "loss_aux_layer_12": 0.0640869140625, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.0770263671875, "loss_aux_layer_15": 0.0855712890625, "loss_aux_layer_16": 0.0948486328125, "loss_aux_layer_17": 0.1019287109375, "loss_aux_layer_18": 0.110107421875, "loss_aux_layer_19": 0.1141357421875, "loss_aux_layer_2": 0.0426025390625, "loss_aux_layer_20": 0.12158203125, "loss_aux_layer_21": 0.1304931640625, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.0521240234375, "loss_aux_layer_4": 0.0545654296875, "loss_aux_layer_5": 0.05584716796875, "loss_aux_layer_6": 0.0582275390625, "loss_aux_layer_7": 0.056396484375, "loss_aux_layer_8": 0.055908203125, "loss_aux_layer_9": 0.0546875, "step": 4701, "total_loss": 0.540154255926609 }, { "epoch": 0.9309047713324095, "grad_norm": 1.0247639417648315, "learning_rate": 5e-05, "llm_loss": 0.5454850643873215, "loss": 2.488, "loss_aux_layer_0": 0.010009765625, "loss_aux_layer_1": 0.02813720703125, "loss_aux_layer_10": 0.05389404296875, "loss_aux_layer_11": 0.057861328125, "loss_aux_layer_12": 0.06201171875, "loss_aux_layer_13": 0.066650390625, "loss_aux_layer_14": 0.074462890625, "loss_aux_layer_15": 0.0826416015625, "loss_aux_layer_16": 0.091796875, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.1070556640625, "loss_aux_layer_19": 0.111083984375, "loss_aux_layer_2": 0.0396728515625, "loss_aux_layer_20": 0.118896484375, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.04937744140625, "loss_aux_layer_4": 0.05194091796875, "loss_aux_layer_5": 0.05322265625, "loss_aux_layer_6": 0.0560302734375, "loss_aux_layer_7": 0.0543212890625, "loss_aux_layer_8": 0.05377197265625, "loss_aux_layer_9": 0.052734375, "step": 4702, "total_loss": 0.6220101863145828 }, { "epoch": 0.9311027519303108, "grad_norm": 0.77508544921875, "learning_rate": 5e-05, "llm_loss": 0.6092859655618668, "loss": 2.7442, "loss_aux_layer_0": 0.0104522705078125, "loss_aux_layer_1": 0.028839111328125, "loss_aux_layer_10": 0.05450439453125, "loss_aux_layer_11": 0.05816650390625, "loss_aux_layer_12": 0.0623779296875, "loss_aux_layer_13": 0.06707763671875, "loss_aux_layer_14": 0.0753173828125, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.0916748046875, "loss_aux_layer_17": 0.0994873046875, "loss_aux_layer_18": 0.1075439453125, "loss_aux_layer_19": 0.1114501953125, "loss_aux_layer_2": 0.0404052734375, "loss_aux_layer_20": 0.1190185546875, "loss_aux_layer_21": 0.126708984375, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.050048828125, "loss_aux_layer_4": 0.05303955078125, "loss_aux_layer_5": 0.0543212890625, "loss_aux_layer_6": 0.05718994140625, "loss_aux_layer_7": 0.0552978515625, "loss_aux_layer_8": 0.0546875, "loss_aux_layer_9": 0.0533447265625, "step": 4703, "total_loss": 0.6860577166080475 }, { "epoch": 0.9313007325282122, "grad_norm": 0.8515816926956177, "learning_rate": 5e-05, "llm_loss": 0.5035668611526489, "loss": 2.3271, "loss_aux_layer_0": 0.0100250244140625, "loss_aux_layer_1": 0.028167724609375, "loss_aux_layer_10": 0.05511474609375, "loss_aux_layer_11": 0.05902099609375, "loss_aux_layer_12": 0.0635986328125, "loss_aux_layer_13": 0.0687255859375, "loss_aux_layer_14": 0.0771484375, "loss_aux_layer_15": 0.0850830078125, "loss_aux_layer_16": 0.094482421875, "loss_aux_layer_17": 0.102294921875, "loss_aux_layer_18": 0.1103515625, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.0401611328125, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.1302490234375, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.0496826171875, "loss_aux_layer_4": 0.05224609375, "loss_aux_layer_5": 0.05364990234375, "loss_aux_layer_6": 0.05670166015625, "loss_aux_layer_7": 0.05517578125, "loss_aux_layer_8": 0.05462646484375, "loss_aux_layer_9": 0.05377197265625, "step": 4704, "total_loss": 0.5817800909280777 }, { "epoch": 0.9314987131261137, "grad_norm": 1.0777443647384644, "learning_rate": 5e-05, "llm_loss": 0.5451602041721344, "loss": 2.4976, "loss_aux_layer_0": 0.011077880859375, "loss_aux_layer_1": 0.0291748046875, "loss_aux_layer_10": 0.0557861328125, "loss_aux_layer_11": 0.0595703125, "loss_aux_layer_12": 0.06402587890625, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.07763671875, "loss_aux_layer_15": 0.085693359375, "loss_aux_layer_16": 0.0948486328125, "loss_aux_layer_17": 0.1026611328125, "loss_aux_layer_18": 0.11083984375, "loss_aux_layer_19": 0.11474609375, "loss_aux_layer_2": 0.04150390625, "loss_aux_layer_20": 0.122802734375, "loss_aux_layer_21": 0.1314697265625, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.190673828125, "loss_aux_layer_3": 0.051513671875, "loss_aux_layer_4": 0.053955078125, "loss_aux_layer_5": 0.05535888671875, "loss_aux_layer_6": 0.0582275390625, "loss_aux_layer_7": 0.056396484375, "loss_aux_layer_8": 0.055908203125, "loss_aux_layer_9": 0.05474853515625, "step": 4705, "total_loss": 0.6243876814842224 }, { "epoch": 0.9316966937240151, "grad_norm": 0.9726946949958801, "learning_rate": 5e-05, "llm_loss": 0.5607773661613464, "loss": 2.5445, "loss_aux_layer_0": 0.0114593505859375, "loss_aux_layer_1": 0.02642822265625, "loss_aux_layer_10": 0.0516357421875, "loss_aux_layer_11": 0.0552978515625, "loss_aux_layer_12": 0.059814453125, "loss_aux_layer_13": 0.06512451171875, "loss_aux_layer_14": 0.0736083984375, "loss_aux_layer_15": 0.08203125, "loss_aux_layer_16": 0.091552734375, "loss_aux_layer_17": 0.0999755859375, "loss_aux_layer_18": 0.108154296875, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.037353515625, "loss_aux_layer_20": 0.1201171875, "loss_aux_layer_21": 0.128173828125, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.0462646484375, "loss_aux_layer_4": 0.0484619140625, "loss_aux_layer_5": 0.050048828125, "loss_aux_layer_6": 0.052978515625, "loss_aux_layer_7": 0.0513916015625, "loss_aux_layer_8": 0.0511474609375, "loss_aux_layer_9": 0.05035400390625, "step": 4706, "total_loss": 0.6361374258995056 }, { "epoch": 0.9318946743219164, "grad_norm": 0.9158992171287537, "learning_rate": 5e-05, "llm_loss": 0.5666685551404953, "loss": 2.5802, "loss_aux_layer_0": 0.01007080078125, "loss_aux_layer_1": 0.030120849609375, "loss_aux_layer_10": 0.055908203125, "loss_aux_layer_11": 0.0599365234375, "loss_aux_layer_12": 0.06414794921875, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.0772705078125, "loss_aux_layer_15": 0.0853271484375, "loss_aux_layer_16": 0.0943603515625, "loss_aux_layer_17": 0.10205078125, "loss_aux_layer_18": 0.10986328125, "loss_aux_layer_19": 0.11279296875, "loss_aux_layer_2": 0.0419921875, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.0513916015625, "loss_aux_layer_4": 0.0535888671875, "loss_aux_layer_5": 0.05523681640625, "loss_aux_layer_6": 0.05810546875, "loss_aux_layer_7": 0.05621337890625, "loss_aux_layer_8": 0.0556640625, "loss_aux_layer_9": 0.0545654296875, "step": 4707, "total_loss": 0.6450427919626236 }, { "epoch": 0.9320926549198179, "grad_norm": 0.8751142024993896, "learning_rate": 5e-05, "llm_loss": 0.5866885408759117, "loss": 2.6611, "loss_aux_layer_0": 0.011260986328125, "loss_aux_layer_1": 0.029754638671875, "loss_aux_layer_10": 0.0552978515625, "loss_aux_layer_11": 0.0594482421875, "loss_aux_layer_12": 0.06402587890625, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.07763671875, "loss_aux_layer_15": 0.0855712890625, "loss_aux_layer_16": 0.0947265625, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.11083984375, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.04107666015625, "loss_aux_layer_20": 0.1226806640625, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.0506591796875, "loss_aux_layer_4": 0.052978515625, "loss_aux_layer_5": 0.05438232421875, "loss_aux_layer_6": 0.0570068359375, "loss_aux_layer_7": 0.055419921875, "loss_aux_layer_8": 0.05499267578125, "loss_aux_layer_9": 0.053955078125, "step": 4708, "total_loss": 0.6652794033288956 }, { "epoch": 0.9322906355177193, "grad_norm": 0.9227803349494934, "learning_rate": 5e-05, "llm_loss": 0.5931816920638084, "loss": 2.6778, "loss_aux_layer_0": 0.009796142578125, "loss_aux_layer_1": 0.028167724609375, "loss_aux_layer_10": 0.0548095703125, "loss_aux_layer_11": 0.0582275390625, "loss_aux_layer_12": 0.0623779296875, "loss_aux_layer_13": 0.0677490234375, "loss_aux_layer_14": 0.0755615234375, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.0994873046875, "loss_aux_layer_18": 0.1072998046875, "loss_aux_layer_19": 0.1104736328125, "loss_aux_layer_2": 0.0400390625, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.125244140625, "loss_aux_layer_22": 0.14501953125, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.04931640625, "loss_aux_layer_4": 0.05194091796875, "loss_aux_layer_5": 0.053466796875, "loss_aux_layer_6": 0.0562744140625, "loss_aux_layer_7": 0.05474853515625, "loss_aux_layer_8": 0.05419921875, "loss_aux_layer_9": 0.05340576171875, "step": 4709, "total_loss": 0.6694516241550446 }, { "epoch": 0.9324886161156206, "grad_norm": 0.841607928276062, "learning_rate": 5e-05, "llm_loss": 0.5184274315834045, "loss": 2.3791, "loss_aux_layer_0": 0.0104827880859375, "loss_aux_layer_1": 0.028564453125, "loss_aux_layer_10": 0.05419921875, "loss_aux_layer_11": 0.05804443359375, "loss_aux_layer_12": 0.06231689453125, "loss_aux_layer_13": 0.06719970703125, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.091552734375, "loss_aux_layer_17": 0.099365234375, "loss_aux_layer_18": 0.1072998046875, "loss_aux_layer_19": 0.1103515625, "loss_aux_layer_2": 0.0404052734375, "loss_aux_layer_20": 0.118408203125, "loss_aux_layer_21": 0.125732421875, "loss_aux_layer_22": 0.145263671875, "loss_aux_layer_23": 0.180419921875, "loss_aux_layer_3": 0.04998779296875, "loss_aux_layer_4": 0.0523681640625, "loss_aux_layer_5": 0.05377197265625, "loss_aux_layer_6": 0.05645751953125, "loss_aux_layer_7": 0.05474853515625, "loss_aux_layer_8": 0.05426025390625, "loss_aux_layer_9": 0.0531005859375, "step": 4710, "total_loss": 0.59477199614048 }, { "epoch": 0.932686596713522, "grad_norm": 0.9715459942817688, "learning_rate": 5e-05, "llm_loss": 0.5255332067608833, "loss": 2.4181, "loss_aux_layer_0": 0.0106048583984375, "loss_aux_layer_1": 0.02935791015625, "loss_aux_layer_10": 0.0557861328125, "loss_aux_layer_11": 0.0596923828125, "loss_aux_layer_12": 0.06402587890625, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.077880859375, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.0953369140625, "loss_aux_layer_17": 0.10302734375, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.04132080078125, "loss_aux_layer_20": 0.1217041015625, "loss_aux_layer_21": 0.1304931640625, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.05096435546875, "loss_aux_layer_4": 0.053466796875, "loss_aux_layer_5": 0.0550537109375, "loss_aux_layer_6": 0.05804443359375, "loss_aux_layer_7": 0.056396484375, "loss_aux_layer_8": 0.0557861328125, "loss_aux_layer_9": 0.05474853515625, "step": 4711, "total_loss": 0.6045367866754532 }, { "epoch": 0.9328845773114235, "grad_norm": 0.8098977208137512, "learning_rate": 5e-05, "llm_loss": 0.5317976772785187, "loss": 2.4301, "loss_aux_layer_0": 0.0097808837890625, "loss_aux_layer_1": 0.027801513671875, "loss_aux_layer_10": 0.05340576171875, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.06146240234375, "loss_aux_layer_13": 0.06640625, "loss_aux_layer_14": 0.074462890625, "loss_aux_layer_15": 0.082275390625, "loss_aux_layer_16": 0.0911865234375, "loss_aux_layer_17": 0.0987548828125, "loss_aux_layer_18": 0.1070556640625, "loss_aux_layer_19": 0.110595703125, "loss_aux_layer_2": 0.03887939453125, "loss_aux_layer_20": 0.1181640625, "loss_aux_layer_21": 0.1265869140625, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.04815673828125, "loss_aux_layer_4": 0.0506591796875, "loss_aux_layer_5": 0.05230712890625, "loss_aux_layer_6": 0.05499267578125, "loss_aux_layer_7": 0.05340576171875, "loss_aux_layer_8": 0.052978515625, "loss_aux_layer_9": 0.05194091796875, "step": 4712, "total_loss": 0.6075161695480347 }, { "epoch": 0.9330825579093249, "grad_norm": 1.038478136062622, "learning_rate": 5e-05, "llm_loss": 0.6053244471549988, "loss": 2.7421, "loss_aux_layer_0": 0.0102081298828125, "loss_aux_layer_1": 0.029632568359375, "loss_aux_layer_10": 0.0565185546875, "loss_aux_layer_11": 0.06024169921875, "loss_aux_layer_12": 0.0648193359375, "loss_aux_layer_13": 0.0709228515625, "loss_aux_layer_14": 0.0799560546875, "loss_aux_layer_15": 0.088623046875, "loss_aux_layer_16": 0.098388671875, "loss_aux_layer_17": 0.1064453125, "loss_aux_layer_18": 0.114990234375, "loss_aux_layer_19": 0.1181640625, "loss_aux_layer_2": 0.04150390625, "loss_aux_layer_20": 0.1253662109375, "loss_aux_layer_21": 0.13232421875, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.05120849609375, "loss_aux_layer_4": 0.05352783203125, "loss_aux_layer_5": 0.05499267578125, "loss_aux_layer_6": 0.0579833984375, "loss_aux_layer_7": 0.05645751953125, "loss_aux_layer_8": 0.05615234375, "loss_aux_layer_9": 0.0552978515625, "step": 4713, "total_loss": 0.6855239719152451 }, { "epoch": 0.9332805385072263, "grad_norm": 0.8684679269790649, "learning_rate": 5e-05, "llm_loss": 0.5829833075404167, "loss": 2.6352, "loss_aux_layer_0": 0.0094146728515625, "loss_aux_layer_1": 0.02813720703125, "loss_aux_layer_10": 0.0540771484375, "loss_aux_layer_11": 0.0579833984375, "loss_aux_layer_12": 0.06207275390625, "loss_aux_layer_13": 0.0673828125, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.08251953125, "loss_aux_layer_16": 0.0909423828125, "loss_aux_layer_17": 0.09814453125, "loss_aux_layer_18": 0.10546875, "loss_aux_layer_19": 0.108642578125, "loss_aux_layer_2": 0.03973388671875, "loss_aux_layer_20": 0.1165771484375, "loss_aux_layer_21": 0.1246337890625, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.0491943359375, "loss_aux_layer_4": 0.0513916015625, "loss_aux_layer_5": 0.052734375, "loss_aux_layer_6": 0.05535888671875, "loss_aux_layer_7": 0.0540771484375, "loss_aux_layer_8": 0.0535888671875, "loss_aux_layer_9": 0.05279541015625, "step": 4714, "total_loss": 0.6588057428598404 }, { "epoch": 0.9334785191051277, "grad_norm": 0.9405161142349243, "learning_rate": 5e-05, "llm_loss": 0.5233820602297783, "loss": 2.4097, "loss_aux_layer_0": 0.00982666015625, "loss_aux_layer_1": 0.02850341796875, "loss_aux_layer_10": 0.054931640625, "loss_aux_layer_11": 0.058837890625, "loss_aux_layer_12": 0.06317138671875, "loss_aux_layer_13": 0.0687255859375, "loss_aux_layer_14": 0.07763671875, "loss_aux_layer_15": 0.086669921875, "loss_aux_layer_16": 0.09619140625, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.1124267578125, "loss_aux_layer_19": 0.1163330078125, "loss_aux_layer_2": 0.0404052734375, "loss_aux_layer_20": 0.1246337890625, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.049560546875, "loss_aux_layer_4": 0.05206298828125, "loss_aux_layer_5": 0.05377197265625, "loss_aux_layer_6": 0.056640625, "loss_aux_layer_7": 0.05511474609375, "loss_aux_layer_8": 0.0545654296875, "loss_aux_layer_9": 0.05364990234375, "step": 4715, "total_loss": 0.6024263203144073 }, { "epoch": 0.9336764997030291, "grad_norm": 0.9054723381996155, "learning_rate": 5e-05, "llm_loss": 0.5587319433689117, "loss": 2.5381, "loss_aux_layer_0": 0.009765625, "loss_aux_layer_1": 0.02862548828125, "loss_aux_layer_10": 0.053955078125, "loss_aux_layer_11": 0.0577392578125, "loss_aux_layer_12": 0.06182861328125, "loss_aux_layer_13": 0.06695556640625, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.0916748046875, "loss_aux_layer_17": 0.0994873046875, "loss_aux_layer_18": 0.107666015625, "loss_aux_layer_19": 0.110107421875, "loss_aux_layer_2": 0.03948974609375, "loss_aux_layer_20": 0.1175537109375, "loss_aux_layer_21": 0.125244140625, "loss_aux_layer_22": 0.144287109375, "loss_aux_layer_23": 0.177978515625, "loss_aux_layer_3": 0.04888916015625, "loss_aux_layer_4": 0.05120849609375, "loss_aux_layer_5": 0.0526123046875, "loss_aux_layer_6": 0.05548095703125, "loss_aux_layer_7": 0.0540771484375, "loss_aux_layer_8": 0.05364990234375, "loss_aux_layer_9": 0.052734375, "step": 4716, "total_loss": 0.6345147788524628 }, { "epoch": 0.9338744803009305, "grad_norm": 0.9686304330825806, "learning_rate": 5e-05, "llm_loss": 0.6434390246868134, "loss": 2.8856, "loss_aux_layer_0": 0.0099945068359375, "loss_aux_layer_1": 0.028778076171875, "loss_aux_layer_10": 0.05523681640625, "loss_aux_layer_11": 0.059326171875, "loss_aux_layer_12": 0.0635986328125, "loss_aux_layer_13": 0.06854248046875, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.08447265625, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.1025390625, "loss_aux_layer_18": 0.1102294921875, "loss_aux_layer_19": 0.11376953125, "loss_aux_layer_2": 0.04046630859375, "loss_aux_layer_20": 0.121337890625, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.04998779296875, "loss_aux_layer_4": 0.05242919921875, "loss_aux_layer_5": 0.05401611328125, "loss_aux_layer_6": 0.05706787109375, "loss_aux_layer_7": 0.05548095703125, "loss_aux_layer_8": 0.0550537109375, "loss_aux_layer_9": 0.053955078125, "step": 4717, "total_loss": 0.7213944047689438 }, { "epoch": 0.9340724608988319, "grad_norm": 0.9636523723602295, "learning_rate": 5e-05, "llm_loss": 0.5561728775501251, "loss": 2.5257, "loss_aux_layer_0": 0.010162353515625, "loss_aux_layer_1": 0.028167724609375, "loss_aux_layer_10": 0.05419921875, "loss_aux_layer_11": 0.05780029296875, "loss_aux_layer_12": 0.06201171875, "loss_aux_layer_13": 0.06689453125, "loss_aux_layer_14": 0.07421875, "loss_aux_layer_15": 0.08203125, "loss_aux_layer_16": 0.0906982421875, "loss_aux_layer_17": 0.098388671875, "loss_aux_layer_18": 0.1058349609375, "loss_aux_layer_19": 0.1083984375, "loss_aux_layer_2": 0.03912353515625, "loss_aux_layer_20": 0.115966796875, "loss_aux_layer_21": 0.1236572265625, "loss_aux_layer_22": 0.142578125, "loss_aux_layer_23": 0.177001953125, "loss_aux_layer_3": 0.0484619140625, "loss_aux_layer_4": 0.0511474609375, "loss_aux_layer_5": 0.052734375, "loss_aux_layer_6": 0.055419921875, "loss_aux_layer_7": 0.05426025390625, "loss_aux_layer_8": 0.0538330078125, "loss_aux_layer_9": 0.0528564453125, "step": 4718, "total_loss": 0.6314189732074738 }, { "epoch": 0.9342704414967333, "grad_norm": 0.9663089513778687, "learning_rate": 5e-05, "llm_loss": 0.5285536199808121, "loss": 2.4198, "loss_aux_layer_0": 0.0103302001953125, "loss_aux_layer_1": 0.0281982421875, "loss_aux_layer_10": 0.05389404296875, "loss_aux_layer_11": 0.05755615234375, "loss_aux_layer_12": 0.06134033203125, "loss_aux_layer_13": 0.06640625, "loss_aux_layer_14": 0.074462890625, "loss_aux_layer_15": 0.0823974609375, "loss_aux_layer_16": 0.0911865234375, "loss_aux_layer_17": 0.0987548828125, "loss_aux_layer_18": 0.106201171875, "loss_aux_layer_19": 0.110107421875, "loss_aux_layer_2": 0.0399169921875, "loss_aux_layer_20": 0.1180419921875, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.04931640625, "loss_aux_layer_4": 0.05181884765625, "loss_aux_layer_5": 0.053466796875, "loss_aux_layer_6": 0.0560302734375, "loss_aux_layer_7": 0.05438232421875, "loss_aux_layer_8": 0.05377197265625, "loss_aux_layer_9": 0.05255126953125, "step": 4719, "total_loss": 0.6049388349056244 }, { "epoch": 0.9344684220946348, "grad_norm": 0.981073796749115, "learning_rate": 5e-05, "llm_loss": 0.6159921288490295, "loss": 2.7694, "loss_aux_layer_0": 0.0101165771484375, "loss_aux_layer_1": 0.02801513671875, "loss_aux_layer_10": 0.05267333984375, "loss_aux_layer_11": 0.056396484375, "loss_aux_layer_12": 0.060791015625, "loss_aux_layer_13": 0.0660400390625, "loss_aux_layer_14": 0.0745849609375, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.0928955078125, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.109375, "loss_aux_layer_19": 0.11376953125, "loss_aux_layer_2": 0.038818359375, "loss_aux_layer_20": 0.121826171875, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.0474853515625, "loss_aux_layer_4": 0.0499267578125, "loss_aux_layer_5": 0.0513916015625, "loss_aux_layer_6": 0.05401611328125, "loss_aux_layer_7": 0.05255126953125, "loss_aux_layer_8": 0.05230712890625, "loss_aux_layer_9": 0.05145263671875, "step": 4720, "total_loss": 0.6923559308052063 }, { "epoch": 0.9346664026925361, "grad_norm": 1.208113431930542, "learning_rate": 5e-05, "llm_loss": 0.5729885101318359, "loss": 2.5999, "loss_aux_layer_0": 0.01068115234375, "loss_aux_layer_1": 0.028076171875, "loss_aux_layer_10": 0.05340576171875, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.0616455078125, "loss_aux_layer_13": 0.06689453125, "loss_aux_layer_14": 0.0755615234375, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.1009521484375, "loss_aux_layer_18": 0.109375, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.0404052734375, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.04937744140625, "loss_aux_layer_4": 0.0516357421875, "loss_aux_layer_5": 0.05316162109375, "loss_aux_layer_6": 0.0556640625, "loss_aux_layer_7": 0.053955078125, "loss_aux_layer_8": 0.0533447265625, "loss_aux_layer_9": 0.05230712890625, "step": 4721, "total_loss": 0.6499674171209335 }, { "epoch": 0.9348643832904375, "grad_norm": 1.2006491422653198, "learning_rate": 5e-05, "llm_loss": 0.5159902423620224, "loss": 2.375, "loss_aux_layer_0": 0.011871337890625, "loss_aux_layer_1": 0.0283203125, "loss_aux_layer_10": 0.05438232421875, "loss_aux_layer_11": 0.05828857421875, "loss_aux_layer_12": 0.06280517578125, "loss_aux_layer_13": 0.0684814453125, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.084716796875, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.1103515625, "loss_aux_layer_19": 0.114013671875, "loss_aux_layer_2": 0.039794921875, "loss_aux_layer_20": 0.1214599609375, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.049072265625, "loss_aux_layer_4": 0.051513671875, "loss_aux_layer_5": 0.052978515625, "loss_aux_layer_6": 0.05572509765625, "loss_aux_layer_7": 0.05419921875, "loss_aux_layer_8": 0.0537109375, "loss_aux_layer_9": 0.0528564453125, "step": 4722, "total_loss": 0.5937583446502686 }, { "epoch": 0.935062363888339, "grad_norm": 1.0626424551010132, "learning_rate": 5e-05, "llm_loss": 0.6492517739534378, "loss": 2.9008, "loss_aux_layer_0": 0.0120697021484375, "loss_aux_layer_1": 0.028778076171875, "loss_aux_layer_10": 0.05377197265625, "loss_aux_layer_11": 0.05767822265625, "loss_aux_layer_12": 0.06195068359375, "loss_aux_layer_13": 0.0667724609375, "loss_aux_layer_14": 0.0745849609375, "loss_aux_layer_15": 0.08251953125, "loss_aux_layer_16": 0.0909423828125, "loss_aux_layer_17": 0.098388671875, "loss_aux_layer_18": 0.106201171875, "loss_aux_layer_19": 0.109375, "loss_aux_layer_2": 0.04022216796875, "loss_aux_layer_20": 0.1168212890625, "loss_aux_layer_21": 0.12451171875, "loss_aux_layer_22": 0.14501953125, "loss_aux_layer_23": 0.1806640625, "loss_aux_layer_3": 0.0496826171875, "loss_aux_layer_4": 0.05230712890625, "loss_aux_layer_5": 0.0535888671875, "loss_aux_layer_6": 0.0562744140625, "loss_aux_layer_7": 0.054443359375, "loss_aux_layer_8": 0.05364990234375, "loss_aux_layer_9": 0.05230712890625, "step": 4723, "total_loss": 0.725197046995163 }, { "epoch": 0.9352603444862404, "grad_norm": 0.9949816465377808, "learning_rate": 5e-05, "llm_loss": 0.6401398777961731, "loss": 2.8555, "loss_aux_layer_0": 0.0116119384765625, "loss_aux_layer_1": 0.0272216796875, "loss_aux_layer_10": 0.0518798828125, "loss_aux_layer_11": 0.0552978515625, "loss_aux_layer_12": 0.0594482421875, "loss_aux_layer_13": 0.06451416015625, "loss_aux_layer_14": 0.072509765625, "loss_aux_layer_15": 0.0804443359375, "loss_aux_layer_16": 0.08935546875, "loss_aux_layer_17": 0.0972900390625, "loss_aux_layer_18": 0.1048583984375, "loss_aux_layer_19": 0.1077880859375, "loss_aux_layer_2": 0.0382080078125, "loss_aux_layer_20": 0.115234375, "loss_aux_layer_21": 0.1224365234375, "loss_aux_layer_22": 0.1416015625, "loss_aux_layer_23": 0.17578125, "loss_aux_layer_3": 0.04681396484375, "loss_aux_layer_4": 0.04925537109375, "loss_aux_layer_5": 0.050537109375, "loss_aux_layer_6": 0.05316162109375, "loss_aux_layer_7": 0.05169677734375, "loss_aux_layer_8": 0.05157470703125, "loss_aux_layer_9": 0.05059814453125, "step": 4724, "total_loss": 0.7138659656047821 }, { "epoch": 0.9354583250841417, "grad_norm": 1.5844721794128418, "learning_rate": 5e-05, "llm_loss": 0.5342163369059563, "loss": 2.4486, "loss_aux_layer_0": 0.0132598876953125, "loss_aux_layer_1": 0.02886962890625, "loss_aux_layer_10": 0.05474853515625, "loss_aux_layer_11": 0.05865478515625, "loss_aux_layer_12": 0.06298828125, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.093994140625, "loss_aux_layer_17": 0.1016845703125, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.11328125, "loss_aux_layer_2": 0.04095458984375, "loss_aux_layer_20": 0.121337890625, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.0499267578125, "loss_aux_layer_4": 0.05218505859375, "loss_aux_layer_5": 0.0537109375, "loss_aux_layer_6": 0.056396484375, "loss_aux_layer_7": 0.0546875, "loss_aux_layer_8": 0.0543212890625, "loss_aux_layer_9": 0.0533447265625, "step": 4725, "total_loss": 0.6121425777673721 }, { "epoch": 0.9356563056820432, "grad_norm": 1.4556128978729248, "learning_rate": 5e-05, "llm_loss": 0.5872570425271988, "loss": 2.6722, "loss_aux_layer_0": 0.0146942138671875, "loss_aux_layer_1": 0.031768798828125, "loss_aux_layer_10": 0.05877685546875, "loss_aux_layer_11": 0.06268310546875, "loss_aux_layer_12": 0.06695556640625, "loss_aux_layer_13": 0.0718994140625, "loss_aux_layer_14": 0.079833984375, "loss_aux_layer_15": 0.087646484375, "loss_aux_layer_16": 0.096435546875, "loss_aux_layer_17": 0.1033935546875, "loss_aux_layer_18": 0.1114501953125, "loss_aux_layer_19": 0.1143798828125, "loss_aux_layer_2": 0.04461669921875, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.1292724609375, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.05438232421875, "loss_aux_layer_4": 0.05712890625, "loss_aux_layer_5": 0.05877685546875, "loss_aux_layer_6": 0.06182861328125, "loss_aux_layer_7": 0.0596923828125, "loss_aux_layer_8": 0.05859375, "loss_aux_layer_9": 0.0576171875, "step": 4726, "total_loss": 0.6680440008640289 }, { "epoch": 0.9358542862799446, "grad_norm": 1.2289990186691284, "learning_rate": 5e-05, "llm_loss": 0.603879377245903, "loss": 2.7178, "loss_aux_layer_0": 0.015777587890625, "loss_aux_layer_1": 0.02789306640625, "loss_aux_layer_10": 0.05303955078125, "loss_aux_layer_11": 0.0562744140625, "loss_aux_layer_12": 0.06011962890625, "loss_aux_layer_13": 0.06494140625, "loss_aux_layer_14": 0.0728759765625, "loss_aux_layer_15": 0.0809326171875, "loss_aux_layer_16": 0.089599609375, "loss_aux_layer_17": 0.0970458984375, "loss_aux_layer_18": 0.104736328125, "loss_aux_layer_19": 0.10888671875, "loss_aux_layer_2": 0.039794921875, "loss_aux_layer_20": 0.1171875, "loss_aux_layer_21": 0.125732421875, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.049072265625, "loss_aux_layer_4": 0.05126953125, "loss_aux_layer_5": 0.05303955078125, "loss_aux_layer_6": 0.05548095703125, "loss_aux_layer_7": 0.0537109375, "loss_aux_layer_8": 0.05316162109375, "loss_aux_layer_9": 0.0521240234375, "step": 4727, "total_loss": 0.6794509738683701 }, { "epoch": 0.9360522668778459, "grad_norm": 1.1599687337875366, "learning_rate": 5e-05, "llm_loss": 0.5559996291995049, "loss": 2.5357, "loss_aux_layer_0": 0.012847900390625, "loss_aux_layer_1": 0.028564453125, "loss_aux_layer_10": 0.05364990234375, "loss_aux_layer_11": 0.05755615234375, "loss_aux_layer_12": 0.06207275390625, "loss_aux_layer_13": 0.0677490234375, "loss_aux_layer_14": 0.0762939453125, "loss_aux_layer_15": 0.0849609375, "loss_aux_layer_16": 0.094482421875, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.1112060546875, "loss_aux_layer_19": 0.114990234375, "loss_aux_layer_2": 0.03948974609375, "loss_aux_layer_20": 0.1231689453125, "loss_aux_layer_21": 0.1314697265625, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.0491943359375, "loss_aux_layer_4": 0.05133056640625, "loss_aux_layer_5": 0.0531005859375, "loss_aux_layer_6": 0.0556640625, "loss_aux_layer_7": 0.0540771484375, "loss_aux_layer_8": 0.05340576171875, "loss_aux_layer_9": 0.05230712890625, "step": 4728, "total_loss": 0.6339212507009506 }, { "epoch": 0.9362502474757474, "grad_norm": 1.1375133991241455, "learning_rate": 5e-05, "llm_loss": 0.6171996966004372, "loss": 2.7869, "loss_aux_layer_0": 0.016387939453125, "loss_aux_layer_1": 0.03118896484375, "loss_aux_layer_10": 0.05712890625, "loss_aux_layer_11": 0.06103515625, "loss_aux_layer_12": 0.06524658203125, "loss_aux_layer_13": 0.0704345703125, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.0865478515625, "loss_aux_layer_16": 0.0955810546875, "loss_aux_layer_17": 0.1031494140625, "loss_aux_layer_18": 0.111083984375, "loss_aux_layer_19": 0.114013671875, "loss_aux_layer_2": 0.04254150390625, "loss_aux_layer_20": 0.121337890625, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.05218505859375, "loss_aux_layer_4": 0.05462646484375, "loss_aux_layer_5": 0.0562744140625, "loss_aux_layer_6": 0.059326171875, "loss_aux_layer_7": 0.0574951171875, "loss_aux_layer_8": 0.05682373046875, "loss_aux_layer_9": 0.0555419921875, "step": 4729, "total_loss": 0.6967126280069351 }, { "epoch": 0.9364482280736488, "grad_norm": 0.9884443283081055, "learning_rate": 5e-05, "llm_loss": 0.5547153800725937, "loss": 2.5254, "loss_aux_layer_0": 0.0126190185546875, "loss_aux_layer_1": 0.027557373046875, "loss_aux_layer_10": 0.0537109375, "loss_aux_layer_11": 0.05718994140625, "loss_aux_layer_12": 0.0614013671875, "loss_aux_layer_13": 0.0665283203125, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.0926513671875, "loss_aux_layer_17": 0.1004638671875, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.113037109375, "loss_aux_layer_2": 0.03924560546875, "loss_aux_layer_20": 0.1207275390625, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.18310546875, "loss_aux_layer_3": 0.04833984375, "loss_aux_layer_4": 0.05078125, "loss_aux_layer_5": 0.052734375, "loss_aux_layer_6": 0.05584716796875, "loss_aux_layer_7": 0.05401611328125, "loss_aux_layer_8": 0.053466796875, "loss_aux_layer_9": 0.052490234375, "step": 4730, "total_loss": 0.6313612312078476 }, { "epoch": 0.9366462086715502, "grad_norm": 1.1034855842590332, "learning_rate": 5e-05, "llm_loss": 0.5064932852983475, "loss": 2.3322, "loss_aux_layer_0": 0.0154571533203125, "loss_aux_layer_1": 0.028778076171875, "loss_aux_layer_10": 0.053466796875, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.06146240234375, "loss_aux_layer_13": 0.06640625, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.0927734375, "loss_aux_layer_17": 0.1004638671875, "loss_aux_layer_18": 0.108642578125, "loss_aux_layer_19": 0.1121826171875, "loss_aux_layer_2": 0.03900146484375, "loss_aux_layer_20": 0.1201171875, "loss_aux_layer_21": 0.127685546875, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.04827880859375, "loss_aux_layer_4": 0.050537109375, "loss_aux_layer_5": 0.052001953125, "loss_aux_layer_6": 0.0545654296875, "loss_aux_layer_7": 0.05291748046875, "loss_aux_layer_8": 0.052734375, "loss_aux_layer_9": 0.05206298828125, "step": 4731, "total_loss": 0.5830553770065308 }, { "epoch": 0.9368441892694516, "grad_norm": 1.0230646133422852, "learning_rate": 5e-05, "llm_loss": 0.5378693491220474, "loss": 2.4602, "loss_aux_layer_0": 0.01519775390625, "loss_aux_layer_1": 0.029449462890625, "loss_aux_layer_10": 0.05413818359375, "loss_aux_layer_11": 0.05792236328125, "loss_aux_layer_12": 0.0621337890625, "loss_aux_layer_13": 0.06732177734375, "loss_aux_layer_14": 0.0755615234375, "loss_aux_layer_15": 0.0836181640625, "loss_aux_layer_16": 0.092529296875, "loss_aux_layer_17": 0.099853515625, "loss_aux_layer_18": 0.10791015625, "loss_aux_layer_19": 0.1109619140625, "loss_aux_layer_2": 0.0406494140625, "loss_aux_layer_20": 0.119140625, "loss_aux_layer_21": 0.1275634765625, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.04998779296875, "loss_aux_layer_4": 0.052490234375, "loss_aux_layer_5": 0.05389404296875, "loss_aux_layer_6": 0.05633544921875, "loss_aux_layer_7": 0.0546875, "loss_aux_layer_8": 0.05389404296875, "loss_aux_layer_9": 0.05291748046875, "step": 4732, "total_loss": 0.615043580532074 }, { "epoch": 0.937042169867353, "grad_norm": 0.9303416609764099, "learning_rate": 5e-05, "llm_loss": 0.5671271905303001, "loss": 2.5763, "loss_aux_layer_0": 0.0121307373046875, "loss_aux_layer_1": 0.0289306640625, "loss_aux_layer_10": 0.05450439453125, "loss_aux_layer_11": 0.05810546875, "loss_aux_layer_12": 0.06195068359375, "loss_aux_layer_13": 0.06695556640625, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.0831298828125, "loss_aux_layer_16": 0.09228515625, "loss_aux_layer_17": 0.0997314453125, "loss_aux_layer_18": 0.10791015625, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.04058837890625, "loss_aux_layer_20": 0.119384765625, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.0501708984375, "loss_aux_layer_4": 0.05230712890625, "loss_aux_layer_5": 0.0537109375, "loss_aux_layer_6": 0.056396484375, "loss_aux_layer_7": 0.05450439453125, "loss_aux_layer_8": 0.05401611328125, "loss_aux_layer_9": 0.0531005859375, "step": 4733, "total_loss": 0.6440654993057251 }, { "epoch": 0.9372401504652544, "grad_norm": 1.0967950820922852, "learning_rate": 5e-05, "llm_loss": 0.5339358299970627, "loss": 2.4549, "loss_aux_layer_0": 0.0137481689453125, "loss_aux_layer_1": 0.031005859375, "loss_aux_layer_10": 0.05731201171875, "loss_aux_layer_11": 0.06097412109375, "loss_aux_layer_12": 0.06524658203125, "loss_aux_layer_13": 0.0704345703125, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.0869140625, "loss_aux_layer_16": 0.0958251953125, "loss_aux_layer_17": 0.10302734375, "loss_aux_layer_18": 0.111083984375, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.043212890625, "loss_aux_layer_20": 0.12109375, "loss_aux_layer_21": 0.1292724609375, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.0533447265625, "loss_aux_layer_4": 0.0556640625, "loss_aux_layer_5": 0.05712890625, "loss_aux_layer_6": 0.06005859375, "loss_aux_layer_7": 0.05810546875, "loss_aux_layer_8": 0.0574951171875, "loss_aux_layer_9": 0.05609130859375, "step": 4734, "total_loss": 0.6137184351682663 }, { "epoch": 0.9374381310631558, "grad_norm": 0.8753867149353027, "learning_rate": 5e-05, "llm_loss": 0.5780863761901855, "loss": 2.6211, "loss_aux_layer_0": 0.010772705078125, "loss_aux_layer_1": 0.028961181640625, "loss_aux_layer_10": 0.0546875, "loss_aux_layer_11": 0.05841064453125, "loss_aux_layer_12": 0.06243896484375, "loss_aux_layer_13": 0.0677490234375, "loss_aux_layer_14": 0.0758056640625, "loss_aux_layer_15": 0.083984375, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.1005859375, "loss_aux_layer_18": 0.108642578125, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.04052734375, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.1280517578125, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.181884765625, "loss_aux_layer_3": 0.05010986328125, "loss_aux_layer_4": 0.052490234375, "loss_aux_layer_5": 0.053955078125, "loss_aux_layer_6": 0.05706787109375, "loss_aux_layer_7": 0.05523681640625, "loss_aux_layer_8": 0.05462646484375, "loss_aux_layer_9": 0.05352783203125, "step": 4735, "total_loss": 0.6552834510803223 }, { "epoch": 0.9376361116610572, "grad_norm": 0.972087562084198, "learning_rate": 5e-05, "llm_loss": 0.5514996275305748, "loss": 2.5287, "loss_aux_layer_0": 0.0113677978515625, "loss_aux_layer_1": 0.030120849609375, "loss_aux_layer_10": 0.05810546875, "loss_aux_layer_11": 0.06201171875, "loss_aux_layer_12": 0.06658935546875, "loss_aux_layer_13": 0.0718994140625, "loss_aux_layer_14": 0.08056640625, "loss_aux_layer_15": 0.0889892578125, "loss_aux_layer_16": 0.0982666015625, "loss_aux_layer_17": 0.105712890625, "loss_aux_layer_18": 0.1134033203125, "loss_aux_layer_19": 0.116455078125, "loss_aux_layer_2": 0.04205322265625, "loss_aux_layer_20": 0.124267578125, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.0521240234375, "loss_aux_layer_4": 0.05474853515625, "loss_aux_layer_5": 0.05645751953125, "loss_aux_layer_6": 0.0594482421875, "loss_aux_layer_7": 0.0579833984375, "loss_aux_layer_8": 0.057373046875, "loss_aux_layer_9": 0.05645751953125, "step": 4736, "total_loss": 0.6321734040975571 }, { "epoch": 0.9378340922589586, "grad_norm": 0.8368616700172424, "learning_rate": 5e-05, "llm_loss": 0.59840889275074, "loss": 2.6977, "loss_aux_layer_0": 0.010467529296875, "loss_aux_layer_1": 0.02862548828125, "loss_aux_layer_10": 0.0537109375, "loss_aux_layer_11": 0.05743408203125, "loss_aux_layer_12": 0.0615234375, "loss_aux_layer_13": 0.06658935546875, "loss_aux_layer_14": 0.0743408203125, "loss_aux_layer_15": 0.082275390625, "loss_aux_layer_16": 0.0909423828125, "loss_aux_layer_17": 0.098876953125, "loss_aux_layer_18": 0.1072998046875, "loss_aux_layer_19": 0.110595703125, "loss_aux_layer_2": 0.04034423828125, "loss_aux_layer_20": 0.118408203125, "loss_aux_layer_21": 0.1258544921875, "loss_aux_layer_22": 0.1455078125, "loss_aux_layer_23": 0.1806640625, "loss_aux_layer_3": 0.0498046875, "loss_aux_layer_4": 0.052001953125, "loss_aux_layer_5": 0.05328369140625, "loss_aux_layer_6": 0.055908203125, "loss_aux_layer_7": 0.05426025390625, "loss_aux_layer_8": 0.0537109375, "loss_aux_layer_9": 0.05242919921875, "step": 4737, "total_loss": 0.6744285225868225 }, { "epoch": 0.9380320728568601, "grad_norm": 0.9316603541374207, "learning_rate": 5e-05, "llm_loss": 0.6301436871290207, "loss": 2.8188, "loss_aux_layer_0": 0.0121612548828125, "loss_aux_layer_1": 0.027191162109375, "loss_aux_layer_10": 0.0518798828125, "loss_aux_layer_11": 0.05517578125, "loss_aux_layer_12": 0.05908203125, "loss_aux_layer_13": 0.06402587890625, "loss_aux_layer_14": 0.07177734375, "loss_aux_layer_15": 0.079833984375, "loss_aux_layer_16": 0.089111328125, "loss_aux_layer_17": 0.0970458984375, "loss_aux_layer_18": 0.104736328125, "loss_aux_layer_19": 0.1083984375, "loss_aux_layer_2": 0.03826904296875, "loss_aux_layer_20": 0.1173095703125, "loss_aux_layer_21": 0.1258544921875, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.047119140625, "loss_aux_layer_4": 0.0494384765625, "loss_aux_layer_5": 0.05108642578125, "loss_aux_layer_6": 0.05389404296875, "loss_aux_layer_7": 0.05224609375, "loss_aux_layer_8": 0.05194091796875, "loss_aux_layer_9": 0.05084228515625, "step": 4738, "total_loss": 0.7046932727098465 }, { "epoch": 0.9382300534547614, "grad_norm": 0.8691094517707825, "learning_rate": 5e-05, "llm_loss": 0.602153979241848, "loss": 2.7193, "loss_aux_layer_0": 0.01019287109375, "loss_aux_layer_1": 0.029510498046875, "loss_aux_layer_10": 0.05548095703125, "loss_aux_layer_11": 0.05963134765625, "loss_aux_layer_12": 0.0640869140625, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.0770263671875, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.0938720703125, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.1097412109375, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.0404052734375, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.1270751953125, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.181396484375, "loss_aux_layer_3": 0.05035400390625, "loss_aux_layer_4": 0.052978515625, "loss_aux_layer_5": 0.05438232421875, "loss_aux_layer_6": 0.05731201171875, "loss_aux_layer_7": 0.05572509765625, "loss_aux_layer_8": 0.05523681640625, "loss_aux_layer_9": 0.05413818359375, "step": 4739, "total_loss": 0.6798370480537415 }, { "epoch": 0.9384280340526628, "grad_norm": 0.902366578578949, "learning_rate": 5e-05, "llm_loss": 0.5386820733547211, "loss": 2.457, "loss_aux_layer_0": 0.0108795166015625, "loss_aux_layer_1": 0.02801513671875, "loss_aux_layer_10": 0.05364990234375, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.06109619140625, "loss_aux_layer_13": 0.066162109375, "loss_aux_layer_14": 0.0738525390625, "loss_aux_layer_15": 0.08154296875, "loss_aux_layer_16": 0.090576171875, "loss_aux_layer_17": 0.0980224609375, "loss_aux_layer_18": 0.105712890625, "loss_aux_layer_19": 0.10888671875, "loss_aux_layer_2": 0.0396728515625, "loss_aux_layer_20": 0.116455078125, "loss_aux_layer_21": 0.1243896484375, "loss_aux_layer_22": 0.144775390625, "loss_aux_layer_23": 0.180419921875, "loss_aux_layer_3": 0.04931640625, "loss_aux_layer_4": 0.05194091796875, "loss_aux_layer_5": 0.05352783203125, "loss_aux_layer_6": 0.05609130859375, "loss_aux_layer_7": 0.0543212890625, "loss_aux_layer_8": 0.05377197265625, "loss_aux_layer_9": 0.05255126953125, "step": 4740, "total_loss": 0.6142463088035583 }, { "epoch": 0.9386260146505643, "grad_norm": 0.7595744729042053, "learning_rate": 5e-05, "llm_loss": 0.483342707157135, "loss": 2.2406, "loss_aux_layer_0": 0.0110626220703125, "loss_aux_layer_1": 0.0283203125, "loss_aux_layer_10": 0.05450439453125, "loss_aux_layer_11": 0.0584716796875, "loss_aux_layer_12": 0.06231689453125, "loss_aux_layer_13": 0.06756591796875, "loss_aux_layer_14": 0.0753173828125, "loss_aux_layer_15": 0.083740234375, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.099609375, "loss_aux_layer_18": 0.1072998046875, "loss_aux_layer_19": 0.11083984375, "loss_aux_layer_2": 0.03973388671875, "loss_aux_layer_20": 0.11865234375, "loss_aux_layer_21": 0.126953125, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.0491943359375, "loss_aux_layer_4": 0.052001953125, "loss_aux_layer_5": 0.053466796875, "loss_aux_layer_6": 0.0562744140625, "loss_aux_layer_7": 0.05474853515625, "loss_aux_layer_8": 0.05413818359375, "loss_aux_layer_9": 0.05316162109375, "step": 4741, "total_loss": 0.5601464733481407 }, { "epoch": 0.9388239952484656, "grad_norm": 0.9283676743507385, "learning_rate": 5e-05, "llm_loss": 0.5385134518146515, "loss": 2.4451, "loss_aux_layer_0": 0.0106048583984375, "loss_aux_layer_1": 0.026458740234375, "loss_aux_layer_10": 0.050537109375, "loss_aux_layer_11": 0.05413818359375, "loss_aux_layer_12": 0.05804443359375, "loss_aux_layer_13": 0.06292724609375, "loss_aux_layer_14": 0.071044921875, "loss_aux_layer_15": 0.0789794921875, "loss_aux_layer_16": 0.0880126953125, "loss_aux_layer_17": 0.095703125, "loss_aux_layer_18": 0.1036376953125, "loss_aux_layer_19": 0.1072998046875, "loss_aux_layer_2": 0.0364990234375, "loss_aux_layer_20": 0.1151123046875, "loss_aux_layer_21": 0.1231689453125, "loss_aux_layer_22": 0.14208984375, "loss_aux_layer_23": 0.17626953125, "loss_aux_layer_3": 0.04541015625, "loss_aux_layer_4": 0.04791259765625, "loss_aux_layer_5": 0.0494384765625, "loss_aux_layer_6": 0.05206298828125, "loss_aux_layer_7": 0.0506591796875, "loss_aux_layer_8": 0.05035400390625, "loss_aux_layer_9": 0.04925537109375, "step": 4742, "total_loss": 0.611267015337944 }, { "epoch": 0.939021975846367, "grad_norm": 0.9016048312187195, "learning_rate": 5e-05, "llm_loss": 0.5968266129493713, "loss": 2.6974, "loss_aux_layer_0": 0.0106658935546875, "loss_aux_layer_1": 0.029541015625, "loss_aux_layer_10": 0.05609130859375, "loss_aux_layer_11": 0.0599365234375, "loss_aux_layer_12": 0.064208984375, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.0770263671875, "loss_aux_layer_15": 0.084716796875, "loss_aux_layer_16": 0.0936279296875, "loss_aux_layer_17": 0.100830078125, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.111083984375, "loss_aux_layer_2": 0.04150390625, "loss_aux_layer_20": 0.1182861328125, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.144775390625, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.05133056640625, "loss_aux_layer_4": 0.0537109375, "loss_aux_layer_5": 0.05487060546875, "loss_aux_layer_6": 0.05810546875, "loss_aux_layer_7": 0.0565185546875, "loss_aux_layer_8": 0.055908203125, "loss_aux_layer_9": 0.0546875, "step": 4743, "total_loss": 0.6743624061346054 }, { "epoch": 0.9392199564442685, "grad_norm": 0.9261197447776794, "learning_rate": 5e-05, "llm_loss": 0.6026148498058319, "loss": 2.7226, "loss_aux_layer_0": 0.011199951171875, "loss_aux_layer_1": 0.028717041015625, "loss_aux_layer_10": 0.05511474609375, "loss_aux_layer_11": 0.05902099609375, "loss_aux_layer_12": 0.0633544921875, "loss_aux_layer_13": 0.0687255859375, "loss_aux_layer_14": 0.0771484375, "loss_aux_layer_15": 0.08544921875, "loss_aux_layer_16": 0.0948486328125, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.1097412109375, "loss_aux_layer_19": 0.1131591796875, "loss_aux_layer_2": 0.040771484375, "loss_aux_layer_20": 0.120849609375, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.05035400390625, "loss_aux_layer_4": 0.0528564453125, "loss_aux_layer_5": 0.05450439453125, "loss_aux_layer_6": 0.05755615234375, "loss_aux_layer_7": 0.05560302734375, "loss_aux_layer_8": 0.05499267578125, "loss_aux_layer_9": 0.05389404296875, "step": 4744, "total_loss": 0.6806431263685226 }, { "epoch": 0.9394179370421699, "grad_norm": 0.9911494851112366, "learning_rate": 5e-05, "llm_loss": 0.5500963032245636, "loss": 2.5064, "loss_aux_layer_0": 0.010467529296875, "loss_aux_layer_1": 0.028411865234375, "loss_aux_layer_10": 0.0540771484375, "loss_aux_layer_11": 0.0577392578125, "loss_aux_layer_12": 0.06170654296875, "loss_aux_layer_13": 0.06671142578125, "loss_aux_layer_14": 0.0748291015625, "loss_aux_layer_15": 0.0831298828125, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.099609375, "loss_aux_layer_18": 0.107421875, "loss_aux_layer_19": 0.111572265625, "loss_aux_layer_2": 0.0399169921875, "loss_aux_layer_20": 0.119384765625, "loss_aux_layer_21": 0.1275634765625, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.04925537109375, "loss_aux_layer_4": 0.0516357421875, "loss_aux_layer_5": 0.05328369140625, "loss_aux_layer_6": 0.05596923828125, "loss_aux_layer_7": 0.05401611328125, "loss_aux_layer_8": 0.0535888671875, "loss_aux_layer_9": 0.05291748046875, "step": 4745, "total_loss": 0.6265908628702164 }, { "epoch": 0.9396159176400712, "grad_norm": 0.927348256111145, "learning_rate": 5e-05, "llm_loss": 0.5518400371074677, "loss": 2.5146, "loss_aux_layer_0": 0.010772705078125, "loss_aux_layer_1": 0.027862548828125, "loss_aux_layer_10": 0.05377197265625, "loss_aux_layer_11": 0.05780029296875, "loss_aux_layer_12": 0.06182861328125, "loss_aux_layer_13": 0.0673828125, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.100341796875, "loss_aux_layer_18": 0.1083984375, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.039794921875, "loss_aux_layer_20": 0.1199951171875, "loss_aux_layer_21": 0.128173828125, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.04937744140625, "loss_aux_layer_4": 0.05157470703125, "loss_aux_layer_5": 0.05322265625, "loss_aux_layer_6": 0.05596923828125, "loss_aux_layer_7": 0.05426025390625, "loss_aux_layer_8": 0.05364990234375, "loss_aux_layer_9": 0.05267333984375, "step": 4746, "total_loss": 0.6286384761333466 }, { "epoch": 0.9398138982379727, "grad_norm": 0.9097908735275269, "learning_rate": 5e-05, "llm_loss": 0.5245011150836945, "loss": 2.3981, "loss_aux_layer_0": 0.010650634765625, "loss_aux_layer_1": 0.02728271484375, "loss_aux_layer_10": 0.052001953125, "loss_aux_layer_11": 0.05572509765625, "loss_aux_layer_12": 0.06005859375, "loss_aux_layer_13": 0.0653076171875, "loss_aux_layer_14": 0.0736083984375, "loss_aux_layer_15": 0.0819091796875, "loss_aux_layer_16": 0.0909423828125, "loss_aux_layer_17": 0.098876953125, "loss_aux_layer_18": 0.1072998046875, "loss_aux_layer_19": 0.1112060546875, "loss_aux_layer_2": 0.03826904296875, "loss_aux_layer_20": 0.119140625, "loss_aux_layer_21": 0.126220703125, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.181396484375, "loss_aux_layer_3": 0.04693603515625, "loss_aux_layer_4": 0.0491943359375, "loss_aux_layer_5": 0.05084228515625, "loss_aux_layer_6": 0.053466796875, "loss_aux_layer_7": 0.0518798828125, "loss_aux_layer_8": 0.051513671875, "loss_aux_layer_9": 0.05072021484375, "step": 4747, "total_loss": 0.5995242148637772 }, { "epoch": 0.9400118788358741, "grad_norm": 0.9442844390869141, "learning_rate": 5e-05, "llm_loss": 0.6298652067780495, "loss": 2.8273, "loss_aux_layer_0": 0.0104522705078125, "loss_aux_layer_1": 0.027984619140625, "loss_aux_layer_10": 0.054443359375, "loss_aux_layer_11": 0.05810546875, "loss_aux_layer_12": 0.062255859375, "loss_aux_layer_13": 0.067626953125, "loss_aux_layer_14": 0.0758056640625, "loss_aux_layer_15": 0.084228515625, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.101318359375, "loss_aux_layer_18": 0.109619140625, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.0396728515625, "loss_aux_layer_20": 0.119873046875, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.182373046875, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.05169677734375, "loss_aux_layer_5": 0.05322265625, "loss_aux_layer_6": 0.05621337890625, "loss_aux_layer_7": 0.054443359375, "loss_aux_layer_8": 0.05419921875, "loss_aux_layer_9": 0.05322265625, "step": 4748, "total_loss": 0.7068194150924683 }, { "epoch": 0.9402098594337754, "grad_norm": 0.8107135891914368, "learning_rate": 5e-05, "llm_loss": 0.5107341632246971, "loss": 2.3452, "loss_aux_layer_0": 0.01092529296875, "loss_aux_layer_1": 0.028350830078125, "loss_aux_layer_10": 0.05316162109375, "loss_aux_layer_11": 0.056640625, "loss_aux_layer_12": 0.06097412109375, "loss_aux_layer_13": 0.0657958984375, "loss_aux_layer_14": 0.0736083984375, "loss_aux_layer_15": 0.0814208984375, "loss_aux_layer_16": 0.0899658203125, "loss_aux_layer_17": 0.0972900390625, "loss_aux_layer_18": 0.10546875, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.03961181640625, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.126220703125, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.049072265625, "loss_aux_layer_4": 0.05126953125, "loss_aux_layer_5": 0.0528564453125, "loss_aux_layer_6": 0.05535888671875, "loss_aux_layer_7": 0.05364990234375, "loss_aux_layer_8": 0.05316162109375, "loss_aux_layer_9": 0.0518798828125, "step": 4749, "total_loss": 0.586302250623703 }, { "epoch": 0.9404078400316769, "grad_norm": 0.8559091687202454, "learning_rate": 5e-05, "llm_loss": 0.5361701473593712, "loss": 2.4539, "loss_aux_layer_0": 0.01043701171875, "loss_aux_layer_1": 0.028289794921875, "loss_aux_layer_10": 0.05499267578125, "loss_aux_layer_11": 0.058837890625, "loss_aux_layer_12": 0.06298828125, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.0762939453125, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.09375, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.1092529296875, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.039794921875, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.1279296875, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.182373046875, "loss_aux_layer_3": 0.0494384765625, "loss_aux_layer_4": 0.052001953125, "loss_aux_layer_5": 0.0538330078125, "loss_aux_layer_6": 0.05712890625, "loss_aux_layer_7": 0.05535888671875, "loss_aux_layer_8": 0.05474853515625, "loss_aux_layer_9": 0.05364990234375, "step": 4750, "total_loss": 0.6134843677282333 }, { "epoch": 0.9406058206295783, "grad_norm": 0.7960101962089539, "learning_rate": 5e-05, "llm_loss": 0.5166180804371834, "loss": 2.3622, "loss_aux_layer_0": 0.0100555419921875, "loss_aux_layer_1": 0.027069091796875, "loss_aux_layer_10": 0.051513671875, "loss_aux_layer_11": 0.05511474609375, "loss_aux_layer_12": 0.05926513671875, "loss_aux_layer_13": 0.06439208984375, "loss_aux_layer_14": 0.072265625, "loss_aux_layer_15": 0.080078125, "loss_aux_layer_16": 0.0889892578125, "loss_aux_layer_17": 0.0966796875, "loss_aux_layer_18": 0.1048583984375, "loss_aux_layer_19": 0.108154296875, "loss_aux_layer_2": 0.038330078125, "loss_aux_layer_20": 0.1158447265625, "loss_aux_layer_21": 0.1241455078125, "loss_aux_layer_22": 0.143798828125, "loss_aux_layer_23": 0.1787109375, "loss_aux_layer_3": 0.0472412109375, "loss_aux_layer_4": 0.04962158203125, "loss_aux_layer_5": 0.05108642578125, "loss_aux_layer_6": 0.05352783203125, "loss_aux_layer_7": 0.05206298828125, "loss_aux_layer_8": 0.051513671875, "loss_aux_layer_9": 0.0504150390625, "step": 4751, "total_loss": 0.5905593186616898 }, { "epoch": 0.9408038012274798, "grad_norm": 0.8897213935852051, "learning_rate": 5e-05, "llm_loss": 0.4823489710688591, "loss": 2.2423, "loss_aux_layer_0": 0.010528564453125, "loss_aux_layer_1": 0.0281982421875, "loss_aux_layer_10": 0.0548095703125, "loss_aux_layer_11": 0.05877685546875, "loss_aux_layer_12": 0.0631103515625, "loss_aux_layer_13": 0.0684814453125, "loss_aux_layer_14": 0.0770263671875, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.09423828125, "loss_aux_layer_17": 0.1014404296875, "loss_aux_layer_18": 0.1097412109375, "loss_aux_layer_19": 0.1136474609375, "loss_aux_layer_2": 0.0401611328125, "loss_aux_layer_20": 0.1212158203125, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.153076171875, "loss_aux_layer_23": 0.191162109375, "loss_aux_layer_3": 0.04974365234375, "loss_aux_layer_4": 0.05230712890625, "loss_aux_layer_5": 0.0538330078125, "loss_aux_layer_6": 0.0565185546875, "loss_aux_layer_7": 0.054931640625, "loss_aux_layer_8": 0.05450439453125, "loss_aux_layer_9": 0.053466796875, "step": 4752, "total_loss": 0.5605656206607819 }, { "epoch": 0.9410017818253811, "grad_norm": 0.8068259954452515, "learning_rate": 5e-05, "llm_loss": 0.5613718032836914, "loss": 2.5493, "loss_aux_layer_0": 0.009979248046875, "loss_aux_layer_1": 0.0279541015625, "loss_aux_layer_10": 0.0533447265625, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.06158447265625, "loss_aux_layer_13": 0.06689453125, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.0830078125, "loss_aux_layer_16": 0.0919189453125, "loss_aux_layer_17": 0.099365234375, "loss_aux_layer_18": 0.1075439453125, "loss_aux_layer_19": 0.111083984375, "loss_aux_layer_2": 0.03900146484375, "loss_aux_layer_20": 0.1190185546875, "loss_aux_layer_21": 0.1265869140625, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.182373046875, "loss_aux_layer_3": 0.04833984375, "loss_aux_layer_4": 0.0509033203125, "loss_aux_layer_5": 0.0523681640625, "loss_aux_layer_6": 0.054931640625, "loss_aux_layer_7": 0.05340576171875, "loss_aux_layer_8": 0.05303955078125, "loss_aux_layer_9": 0.052001953125, "step": 4753, "total_loss": 0.6373330652713776 }, { "epoch": 0.9411997624232825, "grad_norm": 1.0015144348144531, "learning_rate": 5e-05, "llm_loss": 0.5537970960140228, "loss": 2.5278, "loss_aux_layer_0": 0.01031494140625, "loss_aux_layer_1": 0.028900146484375, "loss_aux_layer_10": 0.05499267578125, "loss_aux_layer_11": 0.05877685546875, "loss_aux_layer_12": 0.06329345703125, "loss_aux_layer_13": 0.068603515625, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.1102294921875, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.04083251953125, "loss_aux_layer_20": 0.1221923828125, "loss_aux_layer_21": 0.1295166015625, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.05047607421875, "loss_aux_layer_4": 0.05316162109375, "loss_aux_layer_5": 0.0546875, "loss_aux_layer_6": 0.0576171875, "loss_aux_layer_7": 0.055908203125, "loss_aux_layer_8": 0.05523681640625, "loss_aux_layer_9": 0.053955078125, "step": 4754, "total_loss": 0.6319556385278702 }, { "epoch": 0.941397743021184, "grad_norm": 0.938962459564209, "learning_rate": 5e-05, "llm_loss": 0.5011159256100655, "loss": 2.3025, "loss_aux_layer_0": 0.0103759765625, "loss_aux_layer_1": 0.02740478515625, "loss_aux_layer_10": 0.05242919921875, "loss_aux_layer_11": 0.0557861328125, "loss_aux_layer_12": 0.05963134765625, "loss_aux_layer_13": 0.064697265625, "loss_aux_layer_14": 0.0726318359375, "loss_aux_layer_15": 0.080810546875, "loss_aux_layer_16": 0.0892333984375, "loss_aux_layer_17": 0.0968017578125, "loss_aux_layer_18": 0.104736328125, "loss_aux_layer_19": 0.1082763671875, "loss_aux_layer_2": 0.0386962890625, "loss_aux_layer_20": 0.1162109375, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.145263671875, "loss_aux_layer_23": 0.181884765625, "loss_aux_layer_3": 0.0478515625, "loss_aux_layer_4": 0.05010986328125, "loss_aux_layer_5": 0.05157470703125, "loss_aux_layer_6": 0.053955078125, "loss_aux_layer_7": 0.05218505859375, "loss_aux_layer_8": 0.052001953125, "loss_aux_layer_9": 0.05126953125, "step": 4755, "total_loss": 0.5756366103887558 }, { "epoch": 0.9415957236190853, "grad_norm": 0.7906086444854736, "learning_rate": 5e-05, "llm_loss": 0.5135341957211494, "loss": 2.3677, "loss_aux_layer_0": 0.01007080078125, "loss_aux_layer_1": 0.028106689453125, "loss_aux_layer_10": 0.055419921875, "loss_aux_layer_11": 0.05938720703125, "loss_aux_layer_12": 0.0638427734375, "loss_aux_layer_13": 0.069091796875, "loss_aux_layer_14": 0.077392578125, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.0946044921875, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.039794921875, "loss_aux_layer_20": 0.1226806640625, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.04931640625, "loss_aux_layer_4": 0.0518798828125, "loss_aux_layer_5": 0.0535888671875, "loss_aux_layer_6": 0.05657958984375, "loss_aux_layer_7": 0.055419921875, "loss_aux_layer_8": 0.05511474609375, "loss_aux_layer_9": 0.0540771484375, "step": 4756, "total_loss": 0.5919255465269089 }, { "epoch": 0.9417937042169867, "grad_norm": 0.8275790214538574, "learning_rate": 5e-05, "llm_loss": 0.5538175702095032, "loss": 2.5071, "loss_aux_layer_0": 0.01055908203125, "loss_aux_layer_1": 0.02691650390625, "loss_aux_layer_10": 0.0506591796875, "loss_aux_layer_11": 0.05401611328125, "loss_aux_layer_12": 0.057861328125, "loss_aux_layer_13": 0.06268310546875, "loss_aux_layer_14": 0.0704345703125, "loss_aux_layer_15": 0.0780029296875, "loss_aux_layer_16": 0.0867919921875, "loss_aux_layer_17": 0.0943603515625, "loss_aux_layer_18": 0.1025390625, "loss_aux_layer_19": 0.107177734375, "loss_aux_layer_2": 0.0379638671875, "loss_aux_layer_20": 0.114990234375, "loss_aux_layer_21": 0.1231689453125, "loss_aux_layer_22": 0.143310546875, "loss_aux_layer_23": 0.18017578125, "loss_aux_layer_3": 0.046630859375, "loss_aux_layer_4": 0.0487060546875, "loss_aux_layer_5": 0.0501708984375, "loss_aux_layer_6": 0.0526123046875, "loss_aux_layer_7": 0.0509033203125, "loss_aux_layer_8": 0.05047607421875, "loss_aux_layer_9": 0.04962158203125, "step": 4757, "total_loss": 0.6267642974853516 }, { "epoch": 0.9419916848148882, "grad_norm": 0.8458173274993896, "learning_rate": 5e-05, "llm_loss": 0.6632578372955322, "loss": 2.9563, "loss_aux_layer_0": 0.00970458984375, "loss_aux_layer_1": 0.028076171875, "loss_aux_layer_10": 0.0538330078125, "loss_aux_layer_11": 0.0577392578125, "loss_aux_layer_12": 0.06201171875, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.1002197265625, "loss_aux_layer_18": 0.1075439453125, "loss_aux_layer_19": 0.1107177734375, "loss_aux_layer_2": 0.03961181640625, "loss_aux_layer_20": 0.11767578125, "loss_aux_layer_21": 0.124267578125, "loss_aux_layer_22": 0.144775390625, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.048583984375, "loss_aux_layer_4": 0.0509033203125, "loss_aux_layer_5": 0.05224609375, "loss_aux_layer_6": 0.05511474609375, "loss_aux_layer_7": 0.05364990234375, "loss_aux_layer_8": 0.05328369140625, "loss_aux_layer_9": 0.0523681640625, "step": 4758, "total_loss": 0.7390628606081009 }, { "epoch": 0.9421896654127896, "grad_norm": 0.8632738590240479, "learning_rate": 5e-05, "llm_loss": 0.5543682277202606, "loss": 2.5337, "loss_aux_layer_0": 0.0102691650390625, "loss_aux_layer_1": 0.029144287109375, "loss_aux_layer_10": 0.056640625, "loss_aux_layer_11": 0.06060791015625, "loss_aux_layer_12": 0.06494140625, "loss_aux_layer_13": 0.0699462890625, "loss_aux_layer_14": 0.077880859375, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.102783203125, "loss_aux_layer_18": 0.1104736328125, "loss_aux_layer_19": 0.1136474609375, "loss_aux_layer_2": 0.04132080078125, "loss_aux_layer_20": 0.121826171875, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.0513916015625, "loss_aux_layer_4": 0.05413818359375, "loss_aux_layer_5": 0.0556640625, "loss_aux_layer_6": 0.05877685546875, "loss_aux_layer_7": 0.05694580078125, "loss_aux_layer_8": 0.05645751953125, "loss_aux_layer_9": 0.05523681640625, "step": 4759, "total_loss": 0.6334143280982971 }, { "epoch": 0.9423876460106909, "grad_norm": 0.9184375405311584, "learning_rate": 5e-05, "llm_loss": 0.5669502764940262, "loss": 2.5796, "loss_aux_layer_0": 0.0103607177734375, "loss_aux_layer_1": 0.027679443359375, "loss_aux_layer_10": 0.05487060546875, "loss_aux_layer_11": 0.05865478515625, "loss_aux_layer_12": 0.0631103515625, "loss_aux_layer_13": 0.06854248046875, "loss_aux_layer_14": 0.0771484375, "loss_aux_layer_15": 0.0853271484375, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.114501953125, "loss_aux_layer_2": 0.03948974609375, "loss_aux_layer_20": 0.122802734375, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.0491943359375, "loss_aux_layer_4": 0.05157470703125, "loss_aux_layer_5": 0.05322265625, "loss_aux_layer_6": 0.05633544921875, "loss_aux_layer_7": 0.05450439453125, "loss_aux_layer_8": 0.05450439453125, "loss_aux_layer_9": 0.05364990234375, "step": 4760, "total_loss": 0.6449117958545685 }, { "epoch": 0.9425856266085924, "grad_norm": 0.9887933731079102, "learning_rate": 5e-05, "llm_loss": 0.6044652909040451, "loss": 2.7248, "loss_aux_layer_0": 0.009857177734375, "loss_aux_layer_1": 0.0291748046875, "loss_aux_layer_10": 0.055419921875, "loss_aux_layer_11": 0.05926513671875, "loss_aux_layer_12": 0.063720703125, "loss_aux_layer_13": 0.0684814453125, "loss_aux_layer_14": 0.076416015625, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.0927734375, "loss_aux_layer_17": 0.0997314453125, "loss_aux_layer_18": 0.1070556640625, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.04010009765625, "loss_aux_layer_20": 0.117431640625, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.1455078125, "loss_aux_layer_23": 0.181396484375, "loss_aux_layer_3": 0.04986572265625, "loss_aux_layer_4": 0.05224609375, "loss_aux_layer_5": 0.0537109375, "loss_aux_layer_6": 0.05657958984375, "loss_aux_layer_7": 0.05499267578125, "loss_aux_layer_8": 0.0548095703125, "loss_aux_layer_9": 0.053955078125, "step": 4761, "total_loss": 0.681200310587883 }, { "epoch": 0.9427836072064938, "grad_norm": 0.8943772912025452, "learning_rate": 5e-05, "llm_loss": 0.5857147425413132, "loss": 2.642, "loss_aux_layer_0": 0.0107421875, "loss_aux_layer_1": 0.027435302734375, "loss_aux_layer_10": 0.05255126953125, "loss_aux_layer_11": 0.05633544921875, "loss_aux_layer_12": 0.06060791015625, "loss_aux_layer_13": 0.065673828125, "loss_aux_layer_14": 0.0736083984375, "loss_aux_layer_15": 0.0814208984375, "loss_aux_layer_16": 0.0902099609375, "loss_aux_layer_17": 0.09814453125, "loss_aux_layer_18": 0.10595703125, "loss_aux_layer_19": 0.1097412109375, "loss_aux_layer_2": 0.03839111328125, "loss_aux_layer_20": 0.11767578125, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.14404296875, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.04730224609375, "loss_aux_layer_4": 0.04962158203125, "loss_aux_layer_5": 0.05108642578125, "loss_aux_layer_6": 0.053955078125, "loss_aux_layer_7": 0.05224609375, "loss_aux_layer_8": 0.0518798828125, "loss_aux_layer_9": 0.0511474609375, "step": 4762, "total_loss": 0.6605017930269241 }, { "epoch": 0.9429815878043951, "grad_norm": 1.0308003425598145, "learning_rate": 5e-05, "llm_loss": 0.5743860006332397, "loss": 2.594, "loss_aux_layer_0": 0.010162353515625, "loss_aux_layer_1": 0.027923583984375, "loss_aux_layer_10": 0.0523681640625, "loss_aux_layer_11": 0.055908203125, "loss_aux_layer_12": 0.05999755859375, "loss_aux_layer_13": 0.0648193359375, "loss_aux_layer_14": 0.07275390625, "loss_aux_layer_15": 0.0804443359375, "loss_aux_layer_16": 0.0889892578125, "loss_aux_layer_17": 0.09619140625, "loss_aux_layer_18": 0.1038818359375, "loss_aux_layer_19": 0.107421875, "loss_aux_layer_2": 0.03851318359375, "loss_aux_layer_20": 0.115478515625, "loss_aux_layer_21": 0.123779296875, "loss_aux_layer_22": 0.143310546875, "loss_aux_layer_23": 0.178955078125, "loss_aux_layer_3": 0.047607421875, "loss_aux_layer_4": 0.0498046875, "loss_aux_layer_5": 0.05133056640625, "loss_aux_layer_6": 0.05377197265625, "loss_aux_layer_7": 0.05255126953125, "loss_aux_layer_8": 0.05218505859375, "loss_aux_layer_9": 0.05120849609375, "step": 4763, "total_loss": 0.6485082060098648 }, { "epoch": 0.9431795684022966, "grad_norm": 0.8056155443191528, "learning_rate": 5e-05, "llm_loss": 0.5662772580981255, "loss": 2.5691, "loss_aux_layer_0": 0.0101776123046875, "loss_aux_layer_1": 0.02691650390625, "loss_aux_layer_10": 0.0526123046875, "loss_aux_layer_11": 0.056396484375, "loss_aux_layer_12": 0.06048583984375, "loss_aux_layer_13": 0.0657958984375, "loss_aux_layer_14": 0.074462890625, "loss_aux_layer_15": 0.0826416015625, "loss_aux_layer_16": 0.0926513671875, "loss_aux_layer_17": 0.1002197265625, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.03814697265625, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.1287841796875, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.047119140625, "loss_aux_layer_4": 0.04962158203125, "loss_aux_layer_5": 0.05126953125, "loss_aux_layer_6": 0.05401611328125, "loss_aux_layer_7": 0.0523681640625, "loss_aux_layer_8": 0.05218505859375, "loss_aux_layer_9": 0.0513916015625, "step": 4764, "total_loss": 0.6422684043645859 }, { "epoch": 0.943377549000198, "grad_norm": 0.8885478377342224, "learning_rate": 5e-05, "llm_loss": 0.6081558763980865, "loss": 2.7419, "loss_aux_layer_0": 0.0101318359375, "loss_aux_layer_1": 0.0301513671875, "loss_aux_layer_10": 0.0562744140625, "loss_aux_layer_11": 0.0601806640625, "loss_aux_layer_12": 0.0643310546875, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.0994873046875, "loss_aux_layer_18": 0.1064453125, "loss_aux_layer_19": 0.109130859375, "loss_aux_layer_2": 0.0416259765625, "loss_aux_layer_20": 0.116943359375, "loss_aux_layer_21": 0.12451171875, "loss_aux_layer_22": 0.145263671875, "loss_aux_layer_23": 0.1796875, "loss_aux_layer_3": 0.05157470703125, "loss_aux_layer_4": 0.0543212890625, "loss_aux_layer_5": 0.0557861328125, "loss_aux_layer_6": 0.05877685546875, "loss_aux_layer_7": 0.056884765625, "loss_aux_layer_8": 0.056396484375, "loss_aux_layer_9": 0.05517578125, "step": 4765, "total_loss": 0.6854628473520279 }, { "epoch": 0.9435755295980994, "grad_norm": 0.7652087211608887, "learning_rate": 5e-05, "llm_loss": 0.5619880855083466, "loss": 2.5606, "loss_aux_layer_0": 0.009429931640625, "loss_aux_layer_1": 0.02825927734375, "loss_aux_layer_10": 0.05596923828125, "loss_aux_layer_11": 0.05987548828125, "loss_aux_layer_12": 0.06414794921875, "loss_aux_layer_13": 0.069580078125, "loss_aux_layer_14": 0.07763671875, "loss_aux_layer_15": 0.0855712890625, "loss_aux_layer_16": 0.0946044921875, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.10986328125, "loss_aux_layer_19": 0.1131591796875, "loss_aux_layer_2": 0.0401611328125, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.18603515625, "loss_aux_layer_3": 0.050048828125, "loss_aux_layer_4": 0.05279541015625, "loss_aux_layer_5": 0.054443359375, "loss_aux_layer_6": 0.05743408203125, "loss_aux_layer_7": 0.05596923828125, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.0546875, "step": 4766, "total_loss": 0.640151634812355 }, { "epoch": 0.9437735101960008, "grad_norm": 0.8352867960929871, "learning_rate": 5e-05, "llm_loss": 0.5249709784984589, "loss": 2.4161, "loss_aux_layer_0": 0.0102996826171875, "loss_aux_layer_1": 0.028656005859375, "loss_aux_layer_10": 0.05645751953125, "loss_aux_layer_11": 0.0601806640625, "loss_aux_layer_12": 0.06475830078125, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.086669921875, "loss_aux_layer_16": 0.095703125, "loss_aux_layer_17": 0.10302734375, "loss_aux_layer_18": 0.1112060546875, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.04083251953125, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.05084228515625, "loss_aux_layer_4": 0.0533447265625, "loss_aux_layer_5": 0.05511474609375, "loss_aux_layer_6": 0.05816650390625, "loss_aux_layer_7": 0.05645751953125, "loss_aux_layer_8": 0.0560302734375, "loss_aux_layer_9": 0.0550537109375, "step": 4767, "total_loss": 0.6040264517068863 }, { "epoch": 0.9439714907939022, "grad_norm": 0.8414844870567322, "learning_rate": 5e-05, "llm_loss": 0.6043295562267303, "loss": 2.734, "loss_aux_layer_0": 0.009490966796875, "loss_aux_layer_1": 0.029266357421875, "loss_aux_layer_10": 0.05682373046875, "loss_aux_layer_11": 0.06072998046875, "loss_aux_layer_12": 0.06494140625, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.08642578125, "loss_aux_layer_16": 0.095947265625, "loss_aux_layer_17": 0.1038818359375, "loss_aux_layer_18": 0.1119384765625, "loss_aux_layer_19": 0.1151123046875, "loss_aux_layer_2": 0.04095458984375, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.051025390625, "loss_aux_layer_4": 0.05389404296875, "loss_aux_layer_5": 0.0555419921875, "loss_aux_layer_6": 0.05877685546875, "loss_aux_layer_7": 0.05712890625, "loss_aux_layer_8": 0.05670166015625, "loss_aux_layer_9": 0.055419921875, "step": 4768, "total_loss": 0.683489203453064 }, { "epoch": 0.9441694713918036, "grad_norm": 0.8032206892967224, "learning_rate": 5e-05, "llm_loss": 0.4852435365319252, "loss": 2.2522, "loss_aux_layer_0": 0.0102081298828125, "loss_aux_layer_1": 0.02783203125, "loss_aux_layer_10": 0.0540771484375, "loss_aux_layer_11": 0.057861328125, "loss_aux_layer_12": 0.0618896484375, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.0755615234375, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.0936279296875, "loss_aux_layer_17": 0.1009521484375, "loss_aux_layer_18": 0.109375, "loss_aux_layer_19": 0.11376953125, "loss_aux_layer_2": 0.0400390625, "loss_aux_layer_20": 0.1221923828125, "loss_aux_layer_21": 0.1314697265625, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.19189453125, "loss_aux_layer_3": 0.0494384765625, "loss_aux_layer_4": 0.05157470703125, "loss_aux_layer_5": 0.05322265625, "loss_aux_layer_6": 0.0562744140625, "loss_aux_layer_7": 0.05450439453125, "loss_aux_layer_8": 0.05401611328125, "loss_aux_layer_9": 0.0531005859375, "step": 4769, "total_loss": 0.5630423650145531 }, { "epoch": 0.944367451989705, "grad_norm": 0.8411446809768677, "learning_rate": 5e-05, "llm_loss": 0.5751770734786987, "loss": 2.6023, "loss_aux_layer_0": 0.009246826171875, "loss_aux_layer_1": 0.027679443359375, "loss_aux_layer_10": 0.0533447265625, "loss_aux_layer_11": 0.05706787109375, "loss_aux_layer_12": 0.06121826171875, "loss_aux_layer_13": 0.0667724609375, "loss_aux_layer_14": 0.0748291015625, "loss_aux_layer_15": 0.0826416015625, "loss_aux_layer_16": 0.091796875, "loss_aux_layer_17": 0.0987548828125, "loss_aux_layer_18": 0.106689453125, "loss_aux_layer_19": 0.109619140625, "loss_aux_layer_2": 0.03887939453125, "loss_aux_layer_20": 0.1173095703125, "loss_aux_layer_21": 0.1253662109375, "loss_aux_layer_22": 0.1435546875, "loss_aux_layer_23": 0.17919921875, "loss_aux_layer_3": 0.04827880859375, "loss_aux_layer_4": 0.05072021484375, "loss_aux_layer_5": 0.05206298828125, "loss_aux_layer_6": 0.0550537109375, "loss_aux_layer_7": 0.0537109375, "loss_aux_layer_8": 0.0531005859375, "loss_aux_layer_9": 0.05218505859375, "step": 4770, "total_loss": 0.6505692899227142 }, { "epoch": 0.9445654325876064, "grad_norm": 0.7901291847229004, "learning_rate": 5e-05, "llm_loss": 0.5045238956809044, "loss": 2.3223, "loss_aux_layer_0": 0.0097503662109375, "loss_aux_layer_1": 0.02789306640625, "loss_aux_layer_10": 0.052978515625, "loss_aux_layer_11": 0.0570068359375, "loss_aux_layer_12": 0.06121826171875, "loss_aux_layer_13": 0.06622314453125, "loss_aux_layer_14": 0.0743408203125, "loss_aux_layer_15": 0.0826416015625, "loss_aux_layer_16": 0.0919189453125, "loss_aux_layer_17": 0.0994873046875, "loss_aux_layer_18": 0.107666015625, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.03857421875, "loss_aux_layer_20": 0.119873046875, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.04815673828125, "loss_aux_layer_4": 0.05035400390625, "loss_aux_layer_5": 0.0518798828125, "loss_aux_layer_6": 0.05450439453125, "loss_aux_layer_7": 0.05279541015625, "loss_aux_layer_8": 0.05242919921875, "loss_aux_layer_9": 0.0517578125, "step": 4771, "total_loss": 0.5805786848068237 }, { "epoch": 0.9447634131855078, "grad_norm": 1.228198528289795, "learning_rate": 5e-05, "llm_loss": 0.5332558527588844, "loss": 2.4479, "loss_aux_layer_0": 0.0101470947265625, "loss_aux_layer_1": 0.029632568359375, "loss_aux_layer_10": 0.05633544921875, "loss_aux_layer_11": 0.06024169921875, "loss_aux_layer_12": 0.064697265625, "loss_aux_layer_13": 0.0701904296875, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.0863037109375, "loss_aux_layer_16": 0.09521484375, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.1102294921875, "loss_aux_layer_19": 0.1131591796875, "loss_aux_layer_2": 0.0418701171875, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.1280517578125, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.0516357421875, "loss_aux_layer_4": 0.05419921875, "loss_aux_layer_5": 0.0556640625, "loss_aux_layer_6": 0.0587158203125, "loss_aux_layer_7": 0.0567626953125, "loss_aux_layer_8": 0.05621337890625, "loss_aux_layer_9": 0.05517578125, "step": 4772, "total_loss": 0.6119695752859116 }, { "epoch": 0.9449613937834093, "grad_norm": 0.8132213354110718, "learning_rate": 5e-05, "llm_loss": 0.5624385625123978, "loss": 2.5537, "loss_aux_layer_0": 0.0097503662109375, "loss_aux_layer_1": 0.027313232421875, "loss_aux_layer_10": 0.052978515625, "loss_aux_layer_11": 0.05657958984375, "loss_aux_layer_12": 0.060546875, "loss_aux_layer_13": 0.0657958984375, "loss_aux_layer_14": 0.0738525390625, "loss_aux_layer_15": 0.082275390625, "loss_aux_layer_16": 0.0914306640625, "loss_aux_layer_17": 0.0994873046875, "loss_aux_layer_18": 0.1075439453125, "loss_aux_layer_19": 0.1114501953125, "loss_aux_layer_2": 0.03900146484375, "loss_aux_layer_20": 0.11962890625, "loss_aux_layer_21": 0.127685546875, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.04827880859375, "loss_aux_layer_4": 0.05084228515625, "loss_aux_layer_5": 0.05230712890625, "loss_aux_layer_6": 0.05517578125, "loss_aux_layer_7": 0.053466796875, "loss_aux_layer_8": 0.05267333984375, "loss_aux_layer_9": 0.0516357421875, "step": 4773, "total_loss": 0.6384271681308746 }, { "epoch": 0.9451593743813106, "grad_norm": 1.0246236324310303, "learning_rate": 5e-05, "llm_loss": 0.572045348584652, "loss": 2.6005, "loss_aux_layer_0": 0.010894775390625, "loss_aux_layer_1": 0.028594970703125, "loss_aux_layer_10": 0.0548095703125, "loss_aux_layer_11": 0.05859375, "loss_aux_layer_12": 0.062744140625, "loss_aux_layer_13": 0.06787109375, "loss_aux_layer_14": 0.07666015625, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.093994140625, "loss_aux_layer_17": 0.10205078125, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.0406494140625, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.1302490234375, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.0498046875, "loss_aux_layer_4": 0.05230712890625, "loss_aux_layer_5": 0.053955078125, "loss_aux_layer_6": 0.05731201171875, "loss_aux_layer_7": 0.0552978515625, "loss_aux_layer_8": 0.0548095703125, "loss_aux_layer_9": 0.05352783203125, "step": 4774, "total_loss": 0.6501256078481674 }, { "epoch": 0.945357354979212, "grad_norm": 0.8065378665924072, "learning_rate": 5e-05, "llm_loss": 0.6128093153238297, "loss": 2.752, "loss_aux_layer_0": 0.0098419189453125, "loss_aux_layer_1": 0.02813720703125, "loss_aux_layer_10": 0.052734375, "loss_aux_layer_11": 0.0562744140625, "loss_aux_layer_12": 0.05999755859375, "loss_aux_layer_13": 0.06500244140625, "loss_aux_layer_14": 0.07275390625, "loss_aux_layer_15": 0.0809326171875, "loss_aux_layer_16": 0.090087890625, "loss_aux_layer_17": 0.097412109375, "loss_aux_layer_18": 0.105224609375, "loss_aux_layer_19": 0.1090087890625, "loss_aux_layer_2": 0.03948974609375, "loss_aux_layer_20": 0.1168212890625, "loss_aux_layer_21": 0.1253662109375, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.04876708984375, "loss_aux_layer_4": 0.05120849609375, "loss_aux_layer_5": 0.05279541015625, "loss_aux_layer_6": 0.05535888671875, "loss_aux_layer_7": 0.05328369140625, "loss_aux_layer_8": 0.05279541015625, "loss_aux_layer_9": 0.0516357421875, "step": 4775, "total_loss": 0.6879896074533463 }, { "epoch": 0.9455553355771135, "grad_norm": 0.8532286882400513, "learning_rate": 5e-05, "llm_loss": 0.5374246165156364, "loss": 2.453, "loss_aux_layer_0": 0.0106658935546875, "loss_aux_layer_1": 0.02825927734375, "loss_aux_layer_10": 0.053466796875, "loss_aux_layer_11": 0.057373046875, "loss_aux_layer_12": 0.06134033203125, "loss_aux_layer_13": 0.06640625, "loss_aux_layer_14": 0.07421875, "loss_aux_layer_15": 0.0821533203125, "loss_aux_layer_16": 0.0909423828125, "loss_aux_layer_17": 0.098388671875, "loss_aux_layer_18": 0.1065673828125, "loss_aux_layer_19": 0.1097412109375, "loss_aux_layer_2": 0.03961181640625, "loss_aux_layer_20": 0.117431640625, "loss_aux_layer_21": 0.1256103515625, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.04901123046875, "loss_aux_layer_4": 0.05120849609375, "loss_aux_layer_5": 0.05255126953125, "loss_aux_layer_6": 0.055419921875, "loss_aux_layer_7": 0.0538330078125, "loss_aux_layer_8": 0.05340576171875, "loss_aux_layer_9": 0.0523681640625, "step": 4776, "total_loss": 0.6132542192935944 }, { "epoch": 0.9457533161750149, "grad_norm": 0.8578711748123169, "learning_rate": 5e-05, "llm_loss": 0.5681088790297508, "loss": 2.576, "loss_aux_layer_0": 0.0108795166015625, "loss_aux_layer_1": 0.029052734375, "loss_aux_layer_10": 0.05340576171875, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.061279296875, "loss_aux_layer_13": 0.0662841796875, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.081787109375, "loss_aux_layer_16": 0.090576171875, "loss_aux_layer_17": 0.09814453125, "loss_aux_layer_18": 0.106201171875, "loss_aux_layer_19": 0.1094970703125, "loss_aux_layer_2": 0.04010009765625, "loss_aux_layer_20": 0.11767578125, "loss_aux_layer_21": 0.1258544921875, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.181884765625, "loss_aux_layer_3": 0.0494384765625, "loss_aux_layer_4": 0.0517578125, "loss_aux_layer_5": 0.05328369140625, "loss_aux_layer_6": 0.05609130859375, "loss_aux_layer_7": 0.05426025390625, "loss_aux_layer_8": 0.0535888671875, "loss_aux_layer_9": 0.05230712890625, "step": 4777, "total_loss": 0.6440086588263512 }, { "epoch": 0.9459512967729162, "grad_norm": 0.8742008209228516, "learning_rate": 5e-05, "llm_loss": 0.5297938957810402, "loss": 2.4212, "loss_aux_layer_0": 0.0097808837890625, "loss_aux_layer_1": 0.0281982421875, "loss_aux_layer_10": 0.0523681640625, "loss_aux_layer_11": 0.05609130859375, "loss_aux_layer_12": 0.06036376953125, "loss_aux_layer_13": 0.06597900390625, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.08203125, "loss_aux_layer_16": 0.0914306640625, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.107177734375, "loss_aux_layer_19": 0.1104736328125, "loss_aux_layer_2": 0.03887939453125, "loss_aux_layer_20": 0.1182861328125, "loss_aux_layer_21": 0.126708984375, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.04815673828125, "loss_aux_layer_4": 0.05023193359375, "loss_aux_layer_5": 0.0516357421875, "loss_aux_layer_6": 0.05419921875, "loss_aux_layer_7": 0.05267333984375, "loss_aux_layer_8": 0.05230712890625, "loss_aux_layer_9": 0.05120849609375, "step": 4778, "total_loss": 0.6052907109260559 }, { "epoch": 0.9461492773708177, "grad_norm": 1.0744946002960205, "learning_rate": 5e-05, "llm_loss": 0.603402242064476, "loss": 2.7241, "loss_aux_layer_0": 0.012542724609375, "loss_aux_layer_1": 0.02996826171875, "loss_aux_layer_10": 0.05645751953125, "loss_aux_layer_11": 0.0601806640625, "loss_aux_layer_12": 0.0643310546875, "loss_aux_layer_13": 0.0697021484375, "loss_aux_layer_14": 0.0772705078125, "loss_aux_layer_15": 0.0849609375, "loss_aux_layer_16": 0.09375, "loss_aux_layer_17": 0.1007080078125, "loss_aux_layer_18": 0.1080322265625, "loss_aux_layer_19": 0.1103515625, "loss_aux_layer_2": 0.0416259765625, "loss_aux_layer_20": 0.117431640625, "loss_aux_layer_21": 0.125244140625, "loss_aux_layer_22": 0.14501953125, "loss_aux_layer_23": 0.17919921875, "loss_aux_layer_3": 0.051513671875, "loss_aux_layer_4": 0.0540771484375, "loss_aux_layer_5": 0.05535888671875, "loss_aux_layer_6": 0.0582275390625, "loss_aux_layer_7": 0.0565185546875, "loss_aux_layer_8": 0.055908203125, "loss_aux_layer_9": 0.05499267578125, "step": 4779, "total_loss": 0.6810280531644821 }, { "epoch": 0.9463472579687191, "grad_norm": 0.8510785102844238, "learning_rate": 5e-05, "llm_loss": 0.5997388809919357, "loss": 2.7011, "loss_aux_layer_0": 0.010040283203125, "loss_aux_layer_1": 0.026885986328125, "loss_aux_layer_10": 0.05267333984375, "loss_aux_layer_11": 0.05615234375, "loss_aux_layer_12": 0.06024169921875, "loss_aux_layer_13": 0.06500244140625, "loss_aux_layer_14": 0.0731201171875, "loss_aux_layer_15": 0.081298828125, "loss_aux_layer_16": 0.090576171875, "loss_aux_layer_17": 0.0986328125, "loss_aux_layer_18": 0.1070556640625, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.0379638671875, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.1282958984375, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.047119140625, "loss_aux_layer_4": 0.04974365234375, "loss_aux_layer_5": 0.05145263671875, "loss_aux_layer_6": 0.0543212890625, "loss_aux_layer_7": 0.05279541015625, "loss_aux_layer_8": 0.05242919921875, "loss_aux_layer_9": 0.05133056640625, "step": 4780, "total_loss": 0.6752859354019165 }, { "epoch": 0.9465452385666204, "grad_norm": 0.9296659827232361, "learning_rate": 5e-05, "llm_loss": 0.5454243421554565, "loss": 2.4627, "loss_aux_layer_0": 0.0109100341796875, "loss_aux_layer_1": 0.025543212890625, "loss_aux_layer_10": 0.04803466796875, "loss_aux_layer_11": 0.05145263671875, "loss_aux_layer_12": 0.05548095703125, "loss_aux_layer_13": 0.06036376953125, "loss_aux_layer_14": 0.0677490234375, "loss_aux_layer_15": 0.0755615234375, "loss_aux_layer_16": 0.084228515625, "loss_aux_layer_17": 0.0921630859375, "loss_aux_layer_18": 0.099853515625, "loss_aux_layer_19": 0.1036376953125, "loss_aux_layer_2": 0.03515625, "loss_aux_layer_20": 0.1116943359375, "loss_aux_layer_21": 0.1192626953125, "loss_aux_layer_22": 0.138427734375, "loss_aux_layer_23": 0.17333984375, "loss_aux_layer_3": 0.04400634765625, "loss_aux_layer_4": 0.04644775390625, "loss_aux_layer_5": 0.04791259765625, "loss_aux_layer_6": 0.0504150390625, "loss_aux_layer_7": 0.0487060546875, "loss_aux_layer_8": 0.0482177734375, "loss_aux_layer_9": 0.04718017578125, "step": 4781, "total_loss": 0.615686446428299 }, { "epoch": 0.9467432191645219, "grad_norm": 0.8548259139060974, "learning_rate": 5e-05, "llm_loss": 0.5844497382640839, "loss": 2.6357, "loss_aux_layer_0": 0.01080322265625, "loss_aux_layer_1": 0.027801513671875, "loss_aux_layer_10": 0.052490234375, "loss_aux_layer_11": 0.05615234375, "loss_aux_layer_12": 0.0601806640625, "loss_aux_layer_13": 0.0653076171875, "loss_aux_layer_14": 0.0731201171875, "loss_aux_layer_15": 0.0810546875, "loss_aux_layer_16": 0.0899658203125, "loss_aux_layer_17": 0.0975341796875, "loss_aux_layer_18": 0.1048583984375, "loss_aux_layer_19": 0.108154296875, "loss_aux_layer_2": 0.03875732421875, "loss_aux_layer_20": 0.1160888671875, "loss_aux_layer_21": 0.1241455078125, "loss_aux_layer_22": 0.143310546875, "loss_aux_layer_23": 0.177490234375, "loss_aux_layer_3": 0.0477294921875, "loss_aux_layer_4": 0.04998779296875, "loss_aux_layer_5": 0.05157470703125, "loss_aux_layer_6": 0.0543212890625, "loss_aux_layer_7": 0.05267333984375, "loss_aux_layer_8": 0.05218505859375, "loss_aux_layer_9": 0.05145263671875, "step": 4782, "total_loss": 0.658915638923645 }, { "epoch": 0.9469411997624233, "grad_norm": 0.9586085677146912, "learning_rate": 5e-05, "llm_loss": 0.6191411316394806, "loss": 2.7861, "loss_aux_layer_0": 0.009521484375, "loss_aux_layer_1": 0.029144287109375, "loss_aux_layer_10": 0.05511474609375, "loss_aux_layer_11": 0.0589599609375, "loss_aux_layer_12": 0.06304931640625, "loss_aux_layer_13": 0.0679931640625, "loss_aux_layer_14": 0.0758056640625, "loss_aux_layer_15": 0.083984375, "loss_aux_layer_16": 0.093017578125, "loss_aux_layer_17": 0.1007080078125, "loss_aux_layer_18": 0.1083984375, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.041259765625, "loss_aux_layer_20": 0.11962890625, "loss_aux_layer_21": 0.127197265625, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.0509033203125, "loss_aux_layer_4": 0.0533447265625, "loss_aux_layer_5": 0.05450439453125, "loss_aux_layer_6": 0.05743408203125, "loss_aux_layer_7": 0.055908203125, "loss_aux_layer_8": 0.05535888671875, "loss_aux_layer_9": 0.05401611328125, "step": 4783, "total_loss": 0.6965292245149612 }, { "epoch": 0.9471391803603247, "grad_norm": 1.2073044776916504, "learning_rate": 5e-05, "llm_loss": 0.6088921874761581, "loss": 2.7515, "loss_aux_layer_0": 0.0109100341796875, "loss_aux_layer_1": 0.03009033203125, "loss_aux_layer_10": 0.0560302734375, "loss_aux_layer_11": 0.0596923828125, "loss_aux_layer_12": 0.06396484375, "loss_aux_layer_13": 0.06939697265625, "loss_aux_layer_14": 0.077392578125, "loss_aux_layer_15": 0.085693359375, "loss_aux_layer_16": 0.0947265625, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.1104736328125, "loss_aux_layer_19": 0.11328125, "loss_aux_layer_2": 0.04254150390625, "loss_aux_layer_20": 0.120849609375, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.052001953125, "loss_aux_layer_4": 0.054443359375, "loss_aux_layer_5": 0.0557861328125, "loss_aux_layer_6": 0.05865478515625, "loss_aux_layer_7": 0.05670166015625, "loss_aux_layer_8": 0.05609130859375, "loss_aux_layer_9": 0.05487060546875, "step": 4784, "total_loss": 0.6878740638494492 }, { "epoch": 0.9473371609582261, "grad_norm": 0.972447395324707, "learning_rate": 5e-05, "llm_loss": 0.56478650867939, "loss": 2.5703, "loss_aux_layer_0": 0.0099029541015625, "loss_aux_layer_1": 0.027557373046875, "loss_aux_layer_10": 0.0545654296875, "loss_aux_layer_11": 0.05853271484375, "loss_aux_layer_12": 0.0628662109375, "loss_aux_layer_13": 0.068603515625, "loss_aux_layer_14": 0.076904296875, "loss_aux_layer_15": 0.0849609375, "loss_aux_layer_16": 0.0943603515625, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.1151123046875, "loss_aux_layer_2": 0.0386962890625, "loss_aux_layer_20": 0.1229248046875, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.0482177734375, "loss_aux_layer_4": 0.05078125, "loss_aux_layer_5": 0.052734375, "loss_aux_layer_6": 0.0555419921875, "loss_aux_layer_7": 0.0540771484375, "loss_aux_layer_8": 0.05389404296875, "loss_aux_layer_9": 0.0531005859375, "step": 4785, "total_loss": 0.6425841674208641 }, { "epoch": 0.9475351415561275, "grad_norm": 1.0113459825515747, "learning_rate": 5e-05, "llm_loss": 0.5039435923099518, "loss": 2.3126, "loss_aux_layer_0": 0.01141357421875, "loss_aux_layer_1": 0.027740478515625, "loss_aux_layer_10": 0.0521240234375, "loss_aux_layer_11": 0.0556640625, "loss_aux_layer_12": 0.0599365234375, "loss_aux_layer_13": 0.0655517578125, "loss_aux_layer_14": 0.0732421875, "loss_aux_layer_15": 0.0806884765625, "loss_aux_layer_16": 0.089111328125, "loss_aux_layer_17": 0.0968017578125, "loss_aux_layer_18": 0.1046142578125, "loss_aux_layer_19": 0.10791015625, "loss_aux_layer_2": 0.03814697265625, "loss_aux_layer_20": 0.11572265625, "loss_aux_layer_21": 0.1236572265625, "loss_aux_layer_22": 0.14306640625, "loss_aux_layer_23": 0.17822265625, "loss_aux_layer_3": 0.04736328125, "loss_aux_layer_4": 0.0498046875, "loss_aux_layer_5": 0.0511474609375, "loss_aux_layer_6": 0.0537109375, "loss_aux_layer_7": 0.05218505859375, "loss_aux_layer_8": 0.0518798828125, "loss_aux_layer_9": 0.05084228515625, "step": 4786, "total_loss": 0.5781494677066803 }, { "epoch": 0.9477331221540289, "grad_norm": 0.8615487217903137, "learning_rate": 5e-05, "llm_loss": 0.5565463528037071, "loss": 2.5326, "loss_aux_layer_0": 0.009918212890625, "loss_aux_layer_1": 0.029144287109375, "loss_aux_layer_10": 0.05413818359375, "loss_aux_layer_11": 0.05792236328125, "loss_aux_layer_12": 0.06207275390625, "loss_aux_layer_13": 0.0667724609375, "loss_aux_layer_14": 0.0748291015625, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.0999755859375, "loss_aux_layer_18": 0.1077880859375, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.04071044921875, "loss_aux_layer_20": 0.1192626953125, "loss_aux_layer_21": 0.1263427734375, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.18212890625, "loss_aux_layer_3": 0.05023193359375, "loss_aux_layer_4": 0.052490234375, "loss_aux_layer_5": 0.0537109375, "loss_aux_layer_6": 0.0565185546875, "loss_aux_layer_7": 0.05462646484375, "loss_aux_layer_8": 0.05413818359375, "loss_aux_layer_9": 0.0528564453125, "step": 4787, "total_loss": 0.633159413933754 }, { "epoch": 0.9479311027519303, "grad_norm": 0.9562321901321411, "learning_rate": 5e-05, "llm_loss": 0.5089825391769409, "loss": 2.3427, "loss_aux_layer_0": 0.010498046875, "loss_aux_layer_1": 0.02734375, "loss_aux_layer_10": 0.053466796875, "loss_aux_layer_11": 0.05718994140625, "loss_aux_layer_12": 0.0611572265625, "loss_aux_layer_13": 0.06634521484375, "loss_aux_layer_14": 0.07470703125, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.0926513671875, "loss_aux_layer_17": 0.100830078125, "loss_aux_layer_18": 0.1090087890625, "loss_aux_layer_19": 0.11279296875, "loss_aux_layer_2": 0.0386962890625, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.1290283203125, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.04803466796875, "loss_aux_layer_4": 0.05059814453125, "loss_aux_layer_5": 0.052001953125, "loss_aux_layer_6": 0.0550537109375, "loss_aux_layer_7": 0.0533447265625, "loss_aux_layer_8": 0.0531005859375, "loss_aux_layer_9": 0.05218505859375, "step": 4788, "total_loss": 0.5856689959764481 }, { "epoch": 0.9481290833498317, "grad_norm": 0.8381816744804382, "learning_rate": 5e-05, "llm_loss": 0.5513892099261284, "loss": 2.5114, "loss_aux_layer_0": 0.009857177734375, "loss_aux_layer_1": 0.027862548828125, "loss_aux_layer_10": 0.05364990234375, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.0615234375, "loss_aux_layer_13": 0.06683349609375, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.0926513671875, "loss_aux_layer_17": 0.1004638671875, "loss_aux_layer_18": 0.1085205078125, "loss_aux_layer_19": 0.1124267578125, "loss_aux_layer_2": 0.0391845703125, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.18310546875, "loss_aux_layer_3": 0.04852294921875, "loss_aux_layer_4": 0.05084228515625, "loss_aux_layer_5": 0.05242919921875, "loss_aux_layer_6": 0.055419921875, "loss_aux_layer_7": 0.0535888671875, "loss_aux_layer_8": 0.05328369140625, "loss_aux_layer_9": 0.0523681640625, "step": 4789, "total_loss": 0.6278461217880249 }, { "epoch": 0.9483270639477331, "grad_norm": 1.1626251935958862, "learning_rate": 5e-05, "llm_loss": 0.5282769948244095, "loss": 2.4292, "loss_aux_layer_0": 0.0102691650390625, "loss_aux_layer_1": 0.02838134765625, "loss_aux_layer_10": 0.05535888671875, "loss_aux_layer_11": 0.05926513671875, "loss_aux_layer_12": 0.0634765625, "loss_aux_layer_13": 0.069091796875, "loss_aux_layer_14": 0.0775146484375, "loss_aux_layer_15": 0.086181640625, "loss_aux_layer_16": 0.0955810546875, "loss_aux_layer_17": 0.103759765625, "loss_aux_layer_18": 0.11279296875, "loss_aux_layer_19": 0.11669921875, "loss_aux_layer_2": 0.03997802734375, "loss_aux_layer_20": 0.124755859375, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.153564453125, "loss_aux_layer_23": 0.19140625, "loss_aux_layer_3": 0.04937744140625, "loss_aux_layer_4": 0.052001953125, "loss_aux_layer_5": 0.0535888671875, "loss_aux_layer_6": 0.05670166015625, "loss_aux_layer_7": 0.0552978515625, "loss_aux_layer_8": 0.05487060546875, "loss_aux_layer_9": 0.053955078125, "step": 4790, "total_loss": 0.6072938442230225 }, { "epoch": 0.9485250445456346, "grad_norm": 1.3422516584396362, "learning_rate": 5e-05, "llm_loss": 0.622612327337265, "loss": 2.8142, "loss_aux_layer_0": 0.0107574462890625, "loss_aux_layer_1": 0.029571533203125, "loss_aux_layer_10": 0.0577392578125, "loss_aux_layer_11": 0.061767578125, "loss_aux_layer_12": 0.06591796875, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.080078125, "loss_aux_layer_15": 0.0885009765625, "loss_aux_layer_16": 0.0977783203125, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.11328125, "loss_aux_layer_19": 0.1168212890625, "loss_aux_layer_2": 0.0428466796875, "loss_aux_layer_20": 0.1243896484375, "loss_aux_layer_21": 0.1328125, "loss_aux_layer_22": 0.154296875, "loss_aux_layer_23": 0.191162109375, "loss_aux_layer_3": 0.0528564453125, "loss_aux_layer_4": 0.05548095703125, "loss_aux_layer_5": 0.056884765625, "loss_aux_layer_6": 0.06005859375, "loss_aux_layer_7": 0.05810546875, "loss_aux_layer_8": 0.0574951171875, "loss_aux_layer_9": 0.0565185546875, "step": 4791, "total_loss": 0.7035552859306335 }, { "epoch": 0.9487230251435359, "grad_norm": 0.8218873143196106, "learning_rate": 5e-05, "llm_loss": 0.528239518404007, "loss": 2.4158, "loss_aux_layer_0": 0.0098114013671875, "loss_aux_layer_1": 0.027679443359375, "loss_aux_layer_10": 0.05352783203125, "loss_aux_layer_11": 0.05731201171875, "loss_aux_layer_12": 0.0614013671875, "loss_aux_layer_13": 0.0665283203125, "loss_aux_layer_14": 0.07470703125, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.091796875, "loss_aux_layer_17": 0.099365234375, "loss_aux_layer_18": 0.1070556640625, "loss_aux_layer_19": 0.1103515625, "loss_aux_layer_2": 0.03924560546875, "loss_aux_layer_20": 0.1175537109375, "loss_aux_layer_21": 0.125, "loss_aux_layer_22": 0.14453125, "loss_aux_layer_23": 0.18017578125, "loss_aux_layer_3": 0.0487060546875, "loss_aux_layer_4": 0.0511474609375, "loss_aux_layer_5": 0.0526123046875, "loss_aux_layer_6": 0.0555419921875, "loss_aux_layer_7": 0.05377197265625, "loss_aux_layer_8": 0.05328369140625, "loss_aux_layer_9": 0.0523681640625, "step": 4792, "total_loss": 0.6039401292800903 }, { "epoch": 0.9489210057414373, "grad_norm": 1.1903759241104126, "learning_rate": 5e-05, "llm_loss": 0.5328483432531357, "loss": 2.4482, "loss_aux_layer_0": 0.0110015869140625, "loss_aux_layer_1": 0.0301513671875, "loss_aux_layer_10": 0.05706787109375, "loss_aux_layer_11": 0.06072998046875, "loss_aux_layer_12": 0.06494140625, "loss_aux_layer_13": 0.06982421875, "loss_aux_layer_14": 0.077880859375, "loss_aux_layer_15": 0.0855712890625, "loss_aux_layer_16": 0.0943603515625, "loss_aux_layer_17": 0.1015625, "loss_aux_layer_18": 0.109619140625, "loss_aux_layer_19": 0.11279296875, "loss_aux_layer_2": 0.04327392578125, "loss_aux_layer_20": 0.1202392578125, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.05340576171875, "loss_aux_layer_4": 0.05560302734375, "loss_aux_layer_5": 0.05706787109375, "loss_aux_layer_6": 0.0599365234375, "loss_aux_layer_7": 0.05804443359375, "loss_aux_layer_8": 0.0570068359375, "loss_aux_layer_9": 0.0560302734375, "step": 4793, "total_loss": 0.6120617538690567 }, { "epoch": 0.9491189863393388, "grad_norm": 0.9708351492881775, "learning_rate": 5e-05, "llm_loss": 0.5904881805181503, "loss": 2.6574, "loss_aux_layer_0": 0.0105133056640625, "loss_aux_layer_1": 0.027252197265625, "loss_aux_layer_10": 0.0516357421875, "loss_aux_layer_11": 0.05523681640625, "loss_aux_layer_12": 0.05926513671875, "loss_aux_layer_13": 0.06402587890625, "loss_aux_layer_14": 0.0716552734375, "loss_aux_layer_15": 0.0794677734375, "loss_aux_layer_16": 0.08837890625, "loss_aux_layer_17": 0.095703125, "loss_aux_layer_18": 0.1038818359375, "loss_aux_layer_19": 0.1077880859375, "loss_aux_layer_2": 0.0384521484375, "loss_aux_layer_20": 0.1158447265625, "loss_aux_layer_21": 0.124267578125, "loss_aux_layer_22": 0.14404296875, "loss_aux_layer_23": 0.1796875, "loss_aux_layer_3": 0.04742431640625, "loss_aux_layer_4": 0.049560546875, "loss_aux_layer_5": 0.05108642578125, "loss_aux_layer_6": 0.053955078125, "loss_aux_layer_7": 0.05218505859375, "loss_aux_layer_8": 0.0513916015625, "loss_aux_layer_9": 0.05029296875, "step": 4794, "total_loss": 0.6643550843000412 }, { "epoch": 0.9493169669372401, "grad_norm": 1.0479779243469238, "learning_rate": 5e-05, "llm_loss": 0.5282673090696335, "loss": 2.4231, "loss_aux_layer_0": 0.0121612548828125, "loss_aux_layer_1": 0.028839111328125, "loss_aux_layer_10": 0.05291748046875, "loss_aux_layer_11": 0.0565185546875, "loss_aux_layer_12": 0.06085205078125, "loss_aux_layer_13": 0.0662841796875, "loss_aux_layer_14": 0.07470703125, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.09326171875, "loss_aux_layer_17": 0.1014404296875, "loss_aux_layer_18": 0.110107421875, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.04071044921875, "loss_aux_layer_20": 0.12255859375, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.04962158203125, "loss_aux_layer_4": 0.05181884765625, "loss_aux_layer_5": 0.05316162109375, "loss_aux_layer_6": 0.05548095703125, "loss_aux_layer_7": 0.05340576171875, "loss_aux_layer_8": 0.05303955078125, "loss_aux_layer_9": 0.05194091796875, "step": 4795, "total_loss": 0.6057669371366501 }, { "epoch": 0.9495149475351415, "grad_norm": 0.9960693120956421, "learning_rate": 5e-05, "llm_loss": 0.5047655254602432, "loss": 2.3309, "loss_aux_layer_0": 0.0103607177734375, "loss_aux_layer_1": 0.0279541015625, "loss_aux_layer_10": 0.0540771484375, "loss_aux_layer_11": 0.0576171875, "loss_aux_layer_12": 0.06170654296875, "loss_aux_layer_13": 0.06719970703125, "loss_aux_layer_14": 0.0758056640625, "loss_aux_layer_15": 0.08447265625, "loss_aux_layer_16": 0.0943603515625, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.1103515625, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.0401611328125, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.1533203125, "loss_aux_layer_23": 0.191650390625, "loss_aux_layer_3": 0.04949951171875, "loss_aux_layer_4": 0.0518798828125, "loss_aux_layer_5": 0.0533447265625, "loss_aux_layer_6": 0.05615234375, "loss_aux_layer_7": 0.05413818359375, "loss_aux_layer_8": 0.05377197265625, "loss_aux_layer_9": 0.05291748046875, "step": 4796, "total_loss": 0.5827268213033676 }, { "epoch": 0.949712928133043, "grad_norm": 0.874413013458252, "learning_rate": 5e-05, "llm_loss": 0.5501853302121162, "loss": 2.5059, "loss_aux_layer_0": 0.011505126953125, "loss_aux_layer_1": 0.028472900390625, "loss_aux_layer_10": 0.05340576171875, "loss_aux_layer_11": 0.05706787109375, "loss_aux_layer_12": 0.06109619140625, "loss_aux_layer_13": 0.06634521484375, "loss_aux_layer_14": 0.07421875, "loss_aux_layer_15": 0.08251953125, "loss_aux_layer_16": 0.091552734375, "loss_aux_layer_17": 0.099609375, "loss_aux_layer_18": 0.107666015625, "loss_aux_layer_19": 0.11083984375, "loss_aux_layer_2": 0.03985595703125, "loss_aux_layer_20": 0.1187744140625, "loss_aux_layer_21": 0.126953125, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.05157470703125, "loss_aux_layer_5": 0.05291748046875, "loss_aux_layer_6": 0.0556640625, "loss_aux_layer_7": 0.05377197265625, "loss_aux_layer_8": 0.0531005859375, "loss_aux_layer_9": 0.052001953125, "step": 4797, "total_loss": 0.6264831870794296 }, { "epoch": 0.9499109087309444, "grad_norm": 1.37433660030365, "learning_rate": 5e-05, "llm_loss": 0.6137229949235916, "loss": 2.7615, "loss_aux_layer_0": 0.01177978515625, "loss_aux_layer_1": 0.028656005859375, "loss_aux_layer_10": 0.05352783203125, "loss_aux_layer_11": 0.05712890625, "loss_aux_layer_12": 0.061279296875, "loss_aux_layer_13": 0.066650390625, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.08349609375, "loss_aux_layer_16": 0.0927734375, "loss_aux_layer_17": 0.10009765625, "loss_aux_layer_18": 0.1082763671875, "loss_aux_layer_19": 0.1123046875, "loss_aux_layer_2": 0.04083251953125, "loss_aux_layer_20": 0.119873046875, "loss_aux_layer_21": 0.127197265625, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.0499267578125, "loss_aux_layer_4": 0.05181884765625, "loss_aux_layer_5": 0.052978515625, "loss_aux_layer_6": 0.05548095703125, "loss_aux_layer_7": 0.05377197265625, "loss_aux_layer_8": 0.05316162109375, "loss_aux_layer_9": 0.05206298828125, "step": 4798, "total_loss": 0.6903766095638275 }, { "epoch": 0.9501088893288457, "grad_norm": 0.8445994853973389, "learning_rate": 5e-05, "llm_loss": 0.5564054548740387, "loss": 2.5305, "loss_aux_layer_0": 0.0104217529296875, "loss_aux_layer_1": 0.0286865234375, "loss_aux_layer_10": 0.0533447265625, "loss_aux_layer_11": 0.0570068359375, "loss_aux_layer_12": 0.06121826171875, "loss_aux_layer_13": 0.066650390625, "loss_aux_layer_14": 0.0745849609375, "loss_aux_layer_15": 0.08251953125, "loss_aux_layer_16": 0.0916748046875, "loss_aux_layer_17": 0.099365234375, "loss_aux_layer_18": 0.107421875, "loss_aux_layer_19": 0.111083984375, "loss_aux_layer_2": 0.03973388671875, "loss_aux_layer_20": 0.1187744140625, "loss_aux_layer_21": 0.1270751953125, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.048583984375, "loss_aux_layer_4": 0.05126953125, "loss_aux_layer_5": 0.052734375, "loss_aux_layer_6": 0.0552978515625, "loss_aux_layer_7": 0.05340576171875, "loss_aux_layer_8": 0.052978515625, "loss_aux_layer_9": 0.052001953125, "step": 4799, "total_loss": 0.6326216757297516 }, { "epoch": 0.9503068699267472, "grad_norm": 0.9255750775337219, "learning_rate": 5e-05, "llm_loss": 0.4937548190355301, "loss": 2.2907, "loss_aux_layer_0": 0.0122528076171875, "loss_aux_layer_1": 0.028778076171875, "loss_aux_layer_10": 0.0565185546875, "loss_aux_layer_11": 0.06024169921875, "loss_aux_layer_12": 0.06439208984375, "loss_aux_layer_13": 0.0697021484375, "loss_aux_layer_14": 0.077392578125, "loss_aux_layer_15": 0.0860595703125, "loss_aux_layer_16": 0.09521484375, "loss_aux_layer_17": 0.1026611328125, "loss_aux_layer_18": 0.1103515625, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.04107666015625, "loss_aux_layer_20": 0.12109375, "loss_aux_layer_21": 0.1302490234375, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.05126953125, "loss_aux_layer_4": 0.05377197265625, "loss_aux_layer_5": 0.05535888671875, "loss_aux_layer_6": 0.05792236328125, "loss_aux_layer_7": 0.05633544921875, "loss_aux_layer_8": 0.0562744140625, "loss_aux_layer_9": 0.0552978515625, "step": 4800, "total_loss": 0.5726663768291473 }, { "epoch": 0.9505048505246486, "grad_norm": 0.9872087240219116, "learning_rate": 5e-05, "llm_loss": 0.5484118983149529, "loss": 2.5027, "loss_aux_layer_0": 0.010162353515625, "loss_aux_layer_1": 0.027801513671875, "loss_aux_layer_10": 0.05352783203125, "loss_aux_layer_11": 0.057373046875, "loss_aux_layer_12": 0.0616455078125, "loss_aux_layer_13": 0.06683349609375, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.093017578125, "loss_aux_layer_17": 0.100830078125, "loss_aux_layer_18": 0.109375, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.0396728515625, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.1300048828125, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.04901123046875, "loss_aux_layer_4": 0.05133056640625, "loss_aux_layer_5": 0.0528564453125, "loss_aux_layer_6": 0.05596923828125, "loss_aux_layer_7": 0.0540771484375, "loss_aux_layer_8": 0.0535888671875, "loss_aux_layer_9": 0.05224609375, "step": 4801, "total_loss": 0.6256793886423111 }, { "epoch": 0.9507028311225499, "grad_norm": 0.9892311096191406, "learning_rate": 5e-05, "llm_loss": 0.5455369129776955, "loss": 2.4844, "loss_aux_layer_0": 0.0112457275390625, "loss_aux_layer_1": 0.02801513671875, "loss_aux_layer_10": 0.05206298828125, "loss_aux_layer_11": 0.05584716796875, "loss_aux_layer_12": 0.0599365234375, "loss_aux_layer_13": 0.06500244140625, "loss_aux_layer_14": 0.0733642578125, "loss_aux_layer_15": 0.0814208984375, "loss_aux_layer_16": 0.0909423828125, "loss_aux_layer_17": 0.0987548828125, "loss_aux_layer_18": 0.1075439453125, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.03924560546875, "loss_aux_layer_20": 0.1201171875, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.04791259765625, "loss_aux_layer_4": 0.050048828125, "loss_aux_layer_5": 0.0513916015625, "loss_aux_layer_6": 0.05426025390625, "loss_aux_layer_7": 0.05242919921875, "loss_aux_layer_8": 0.0518798828125, "loss_aux_layer_9": 0.05078125, "step": 4802, "total_loss": 0.6211112290620804 }, { "epoch": 0.9509008117204514, "grad_norm": 1.2053256034851074, "learning_rate": 5e-05, "llm_loss": 0.5024464800953865, "loss": 2.3329, "loss_aux_layer_0": 0.0099945068359375, "loss_aux_layer_1": 0.029296875, "loss_aux_layer_10": 0.05615234375, "loss_aux_layer_11": 0.06011962890625, "loss_aux_layer_12": 0.06475830078125, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.0799560546875, "loss_aux_layer_15": 0.0889892578125, "loss_aux_layer_16": 0.0985107421875, "loss_aux_layer_17": 0.1064453125, "loss_aux_layer_18": 0.114990234375, "loss_aux_layer_19": 0.118896484375, "loss_aux_layer_2": 0.04168701171875, "loss_aux_layer_20": 0.126708984375, "loss_aux_layer_21": 0.13525390625, "loss_aux_layer_22": 0.156494140625, "loss_aux_layer_23": 0.194091796875, "loss_aux_layer_3": 0.05133056640625, "loss_aux_layer_4": 0.05364990234375, "loss_aux_layer_5": 0.0550537109375, "loss_aux_layer_6": 0.0579833984375, "loss_aux_layer_7": 0.056396484375, "loss_aux_layer_8": 0.0560302734375, "loss_aux_layer_9": 0.054931640625, "step": 4803, "total_loss": 0.5832180231809616 }, { "epoch": 0.9510987923183528, "grad_norm": 1.0844391584396362, "learning_rate": 5e-05, "llm_loss": 0.582945853471756, "loss": 2.6317, "loss_aux_layer_0": 0.0115509033203125, "loss_aux_layer_1": 0.02685546875, "loss_aux_layer_10": 0.0516357421875, "loss_aux_layer_11": 0.0550537109375, "loss_aux_layer_12": 0.05908203125, "loss_aux_layer_13": 0.06402587890625, "loss_aux_layer_14": 0.072509765625, "loss_aux_layer_15": 0.0810546875, "loss_aux_layer_16": 0.0909423828125, "loss_aux_layer_17": 0.0989990234375, "loss_aux_layer_18": 0.1068115234375, "loss_aux_layer_19": 0.1112060546875, "loss_aux_layer_2": 0.037841796875, "loss_aux_layer_20": 0.1195068359375, "loss_aux_layer_21": 0.127197265625, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.04669189453125, "loss_aux_layer_4": 0.04913330078125, "loss_aux_layer_5": 0.05078125, "loss_aux_layer_6": 0.053466796875, "loss_aux_layer_7": 0.0516357421875, "loss_aux_layer_8": 0.05126953125, "loss_aux_layer_9": 0.05023193359375, "step": 4804, "total_loss": 0.6579234823584557 }, { "epoch": 0.9512967729162543, "grad_norm": 1.0855334997177124, "learning_rate": 5e-05, "llm_loss": 0.5331786051392555, "loss": 2.4365, "loss_aux_layer_0": 0.01165771484375, "loss_aux_layer_1": 0.027862548828125, "loss_aux_layer_10": 0.05255126953125, "loss_aux_layer_11": 0.056396484375, "loss_aux_layer_12": 0.060546875, "loss_aux_layer_13": 0.06585693359375, "loss_aux_layer_14": 0.0740966796875, "loss_aux_layer_15": 0.0826416015625, "loss_aux_layer_16": 0.091552734375, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.107421875, "loss_aux_layer_19": 0.111572265625, "loss_aux_layer_2": 0.0399169921875, "loss_aux_layer_20": 0.11962890625, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.048828125, "loss_aux_layer_4": 0.05059814453125, "loss_aux_layer_5": 0.052001953125, "loss_aux_layer_6": 0.05462646484375, "loss_aux_layer_7": 0.052734375, "loss_aux_layer_8": 0.05206298828125, "loss_aux_layer_9": 0.05120849609375, "step": 4805, "total_loss": 0.6091300845146179 }, { "epoch": 0.9514947535141556, "grad_norm": 1.100063681602478, "learning_rate": 5e-05, "llm_loss": 0.5987065732479095, "loss": 2.6959, "loss_aux_layer_0": 0.011260986328125, "loss_aux_layer_1": 0.02789306640625, "loss_aux_layer_10": 0.05224609375, "loss_aux_layer_11": 0.05584716796875, "loss_aux_layer_12": 0.05999755859375, "loss_aux_layer_13": 0.0650634765625, "loss_aux_layer_14": 0.0733642578125, "loss_aux_layer_15": 0.0814208984375, "loss_aux_layer_16": 0.0906982421875, "loss_aux_layer_17": 0.09814453125, "loss_aux_layer_18": 0.106689453125, "loss_aux_layer_19": 0.1103515625, "loss_aux_layer_2": 0.03948974609375, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.126220703125, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.181884765625, "loss_aux_layer_3": 0.04864501953125, "loss_aux_layer_4": 0.05072021484375, "loss_aux_layer_5": 0.05206298828125, "loss_aux_layer_6": 0.05474853515625, "loss_aux_layer_7": 0.052734375, "loss_aux_layer_8": 0.05206298828125, "loss_aux_layer_9": 0.05096435546875, "step": 4806, "total_loss": 0.6739857643842697 }, { "epoch": 0.951692734112057, "grad_norm": 0.9198100566864014, "learning_rate": 5e-05, "llm_loss": 0.5236010104417801, "loss": 2.3994, "loss_aux_layer_0": 0.0121917724609375, "loss_aux_layer_1": 0.028228759765625, "loss_aux_layer_10": 0.053955078125, "loss_aux_layer_11": 0.05767822265625, "loss_aux_layer_12": 0.06182861328125, "loss_aux_layer_13": 0.0667724609375, "loss_aux_layer_14": 0.0745849609375, "loss_aux_layer_15": 0.0826416015625, "loss_aux_layer_16": 0.0914306640625, "loss_aux_layer_17": 0.098876953125, "loss_aux_layer_18": 0.1063232421875, "loss_aux_layer_19": 0.1097412109375, "loss_aux_layer_2": 0.03936767578125, "loss_aux_layer_20": 0.1177978515625, "loss_aux_layer_21": 0.1263427734375, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.0489501953125, "loss_aux_layer_4": 0.05157470703125, "loss_aux_layer_5": 0.05316162109375, "loss_aux_layer_6": 0.05609130859375, "loss_aux_layer_7": 0.05426025390625, "loss_aux_layer_8": 0.05377197265625, "loss_aux_layer_9": 0.052734375, "step": 4807, "total_loss": 0.5998432114720345 }, { "epoch": 0.9518907147099585, "grad_norm": 1.0293623208999634, "learning_rate": 5e-05, "llm_loss": 0.5712812170386314, "loss": 2.5849, "loss_aux_layer_0": 0.0103759765625, "loss_aux_layer_1": 0.027374267578125, "loss_aux_layer_10": 0.05255126953125, "loss_aux_layer_11": 0.05615234375, "loss_aux_layer_12": 0.06024169921875, "loss_aux_layer_13": 0.06536865234375, "loss_aux_layer_14": 0.072998046875, "loss_aux_layer_15": 0.080810546875, "loss_aux_layer_16": 0.08984375, "loss_aux_layer_17": 0.09765625, "loss_aux_layer_18": 0.1060791015625, "loss_aux_layer_19": 0.1097412109375, "loss_aux_layer_2": 0.0389404296875, "loss_aux_layer_20": 0.117431640625, "loss_aux_layer_21": 0.125732421875, "loss_aux_layer_22": 0.1455078125, "loss_aux_layer_23": 0.181640625, "loss_aux_layer_3": 0.04791259765625, "loss_aux_layer_4": 0.05023193359375, "loss_aux_layer_5": 0.05145263671875, "loss_aux_layer_6": 0.05401611328125, "loss_aux_layer_7": 0.0526123046875, "loss_aux_layer_8": 0.05218505859375, "loss_aux_layer_9": 0.05126953125, "step": 4808, "total_loss": 0.6462211012840271 }, { "epoch": 0.9520886953078598, "grad_norm": 0.8067131638526917, "learning_rate": 5e-05, "llm_loss": 0.5041695758700371, "loss": 2.323, "loss_aux_layer_0": 0.011505126953125, "loss_aux_layer_1": 0.02801513671875, "loss_aux_layer_10": 0.05316162109375, "loss_aux_layer_11": 0.05682373046875, "loss_aux_layer_12": 0.060791015625, "loss_aux_layer_13": 0.06622314453125, "loss_aux_layer_14": 0.0745849609375, "loss_aux_layer_15": 0.0830078125, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.0997314453125, "loss_aux_layer_18": 0.107666015625, "loss_aux_layer_19": 0.1121826171875, "loss_aux_layer_2": 0.039794921875, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.1287841796875, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.0494384765625, "loss_aux_layer_4": 0.05157470703125, "loss_aux_layer_5": 0.05291748046875, "loss_aux_layer_6": 0.0557861328125, "loss_aux_layer_7": 0.05389404296875, "loss_aux_layer_8": 0.05322265625, "loss_aux_layer_9": 0.0518798828125, "step": 4809, "total_loss": 0.5807551741600037 }, { "epoch": 0.9522866759057612, "grad_norm": 0.915631890296936, "learning_rate": 5e-05, "llm_loss": 0.5333669558167458, "loss": 2.4385, "loss_aux_layer_0": 0.01165771484375, "loss_aux_layer_1": 0.02777099609375, "loss_aux_layer_10": 0.05279541015625, "loss_aux_layer_11": 0.05633544921875, "loss_aux_layer_12": 0.06048583984375, "loss_aux_layer_13": 0.06597900390625, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.0821533203125, "loss_aux_layer_16": 0.0914306640625, "loss_aux_layer_17": 0.099609375, "loss_aux_layer_18": 0.10791015625, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.03851318359375, "loss_aux_layer_20": 0.1207275390625, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.04803466796875, "loss_aux_layer_4": 0.05023193359375, "loss_aux_layer_5": 0.0517578125, "loss_aux_layer_6": 0.054443359375, "loss_aux_layer_7": 0.052978515625, "loss_aux_layer_8": 0.05267333984375, "loss_aux_layer_9": 0.05145263671875, "step": 4810, "total_loss": 0.6096304655075073 }, { "epoch": 0.9524846565036627, "grad_norm": 0.9792761206626892, "learning_rate": 5e-05, "llm_loss": 0.6370519995689392, "loss": 2.8552, "loss_aux_layer_0": 0.0103759765625, "loss_aux_layer_1": 0.029296875, "loss_aux_layer_10": 0.0550537109375, "loss_aux_layer_11": 0.05859375, "loss_aux_layer_12": 0.06268310546875, "loss_aux_layer_13": 0.067626953125, "loss_aux_layer_14": 0.0758056640625, "loss_aux_layer_15": 0.083984375, "loss_aux_layer_16": 0.0926513671875, "loss_aux_layer_17": 0.1002197265625, "loss_aux_layer_18": 0.1082763671875, "loss_aux_layer_19": 0.1112060546875, "loss_aux_layer_2": 0.04107666015625, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.14404296875, "loss_aux_layer_23": 0.178955078125, "loss_aux_layer_3": 0.050537109375, "loss_aux_layer_4": 0.05291748046875, "loss_aux_layer_5": 0.054443359375, "loss_aux_layer_6": 0.05718994140625, "loss_aux_layer_7": 0.0555419921875, "loss_aux_layer_8": 0.05474853515625, "loss_aux_layer_9": 0.0537109375, "step": 4811, "total_loss": 0.7138011604547501 }, { "epoch": 0.9526826371015641, "grad_norm": 0.997279167175293, "learning_rate": 5e-05, "llm_loss": 0.4786701947450638, "loss": 2.2241, "loss_aux_layer_0": 0.0117034912109375, "loss_aux_layer_1": 0.02972412109375, "loss_aux_layer_10": 0.0545654296875, "loss_aux_layer_11": 0.0582275390625, "loss_aux_layer_12": 0.0621337890625, "loss_aux_layer_13": 0.067626953125, "loss_aux_layer_14": 0.0758056640625, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.1007080078125, "loss_aux_layer_18": 0.109130859375, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.04107666015625, "loss_aux_layer_20": 0.1199951171875, "loss_aux_layer_21": 0.127197265625, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.050537109375, "loss_aux_layer_4": 0.05303955078125, "loss_aux_layer_5": 0.05426025390625, "loss_aux_layer_6": 0.05682373046875, "loss_aux_layer_7": 0.054931640625, "loss_aux_layer_8": 0.05438232421875, "loss_aux_layer_9": 0.0533447265625, "step": 4812, "total_loss": 0.5560348033905029 }, { "epoch": 0.9528806176994654, "grad_norm": 0.9981123208999634, "learning_rate": 5e-05, "llm_loss": 0.5695605278015137, "loss": 2.5796, "loss_aux_layer_0": 0.0112457275390625, "loss_aux_layer_1": 0.02777099609375, "loss_aux_layer_10": 0.052734375, "loss_aux_layer_11": 0.0560302734375, "loss_aux_layer_12": 0.0599365234375, "loss_aux_layer_13": 0.06536865234375, "loss_aux_layer_14": 0.0736083984375, "loss_aux_layer_15": 0.082275390625, "loss_aux_layer_16": 0.0916748046875, "loss_aux_layer_17": 0.0992431640625, "loss_aux_layer_18": 0.1072998046875, "loss_aux_layer_19": 0.11083984375, "loss_aux_layer_2": 0.03887939453125, "loss_aux_layer_20": 0.11865234375, "loss_aux_layer_21": 0.125732421875, "loss_aux_layer_22": 0.144775390625, "loss_aux_layer_23": 0.1796875, "loss_aux_layer_3": 0.04803466796875, "loss_aux_layer_4": 0.0504150390625, "loss_aux_layer_5": 0.05206298828125, "loss_aux_layer_6": 0.0548095703125, "loss_aux_layer_7": 0.05316162109375, "loss_aux_layer_8": 0.052490234375, "loss_aux_layer_9": 0.05157470703125, "step": 4813, "total_loss": 0.6448967605829239 }, { "epoch": 0.9530785982973669, "grad_norm": 1.0179632902145386, "learning_rate": 5e-05, "llm_loss": 0.5860529541969299, "loss": 2.6468, "loss_aux_layer_0": 0.0103912353515625, "loss_aux_layer_1": 0.027496337890625, "loss_aux_layer_10": 0.05316162109375, "loss_aux_layer_11": 0.05682373046875, "loss_aux_layer_12": 0.061279296875, "loss_aux_layer_13": 0.0662841796875, "loss_aux_layer_14": 0.0745849609375, "loss_aux_layer_15": 0.0826416015625, "loss_aux_layer_16": 0.0916748046875, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.1068115234375, "loss_aux_layer_19": 0.1104736328125, "loss_aux_layer_2": 0.0391845703125, "loss_aux_layer_20": 0.1181640625, "loss_aux_layer_21": 0.125732421875, "loss_aux_layer_22": 0.145263671875, "loss_aux_layer_23": 0.181396484375, "loss_aux_layer_3": 0.04852294921875, "loss_aux_layer_4": 0.0509033203125, "loss_aux_layer_5": 0.052490234375, "loss_aux_layer_6": 0.0552978515625, "loss_aux_layer_7": 0.053466796875, "loss_aux_layer_8": 0.0528564453125, "loss_aux_layer_9": 0.05194091796875, "step": 4814, "total_loss": 0.6617027670145035 }, { "epoch": 0.9532765788952683, "grad_norm": 0.8356319665908813, "learning_rate": 5e-05, "llm_loss": 0.5962570160627365, "loss": 2.7032, "loss_aux_layer_0": 0.011444091796875, "loss_aux_layer_1": 0.0299072265625, "loss_aux_layer_10": 0.0557861328125, "loss_aux_layer_11": 0.05987548828125, "loss_aux_layer_12": 0.0643310546875, "loss_aux_layer_13": 0.0699462890625, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.0872802734375, "loss_aux_layer_16": 0.0966796875, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.1123046875, "loss_aux_layer_19": 0.11572265625, "loss_aux_layer_2": 0.0416259765625, "loss_aux_layer_20": 0.1231689453125, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.05126953125, "loss_aux_layer_4": 0.05364990234375, "loss_aux_layer_5": 0.05511474609375, "loss_aux_layer_6": 0.057861328125, "loss_aux_layer_7": 0.05615234375, "loss_aux_layer_8": 0.05572509765625, "loss_aux_layer_9": 0.05462646484375, "step": 4815, "total_loss": 0.675791785120964 }, { "epoch": 0.9534745594931696, "grad_norm": 0.8856832385063171, "learning_rate": 5e-05, "llm_loss": 0.5878423005342484, "loss": 2.6569, "loss_aux_layer_0": 0.0103759765625, "loss_aux_layer_1": 0.02789306640625, "loss_aux_layer_10": 0.05322265625, "loss_aux_layer_11": 0.05694580078125, "loss_aux_layer_12": 0.0611572265625, "loss_aux_layer_13": 0.0665283203125, "loss_aux_layer_14": 0.0748291015625, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.092529296875, "loss_aux_layer_17": 0.10009765625, "loss_aux_layer_18": 0.1085205078125, "loss_aux_layer_19": 0.1123046875, "loss_aux_layer_2": 0.038818359375, "loss_aux_layer_20": 0.1199951171875, "loss_aux_layer_21": 0.1279296875, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.0482177734375, "loss_aux_layer_4": 0.0506591796875, "loss_aux_layer_5": 0.05218505859375, "loss_aux_layer_6": 0.0552978515625, "loss_aux_layer_7": 0.05340576171875, "loss_aux_layer_8": 0.05291748046875, "loss_aux_layer_9": 0.05206298828125, "step": 4816, "total_loss": 0.6642159521579742 }, { "epoch": 0.953672540091071, "grad_norm": 0.9357513785362244, "learning_rate": 5e-05, "llm_loss": 0.5915839225053787, "loss": 2.6749, "loss_aux_layer_0": 0.01031494140625, "loss_aux_layer_1": 0.02874755859375, "loss_aux_layer_10": 0.05560302734375, "loss_aux_layer_11": 0.05938720703125, "loss_aux_layer_12": 0.0633544921875, "loss_aux_layer_13": 0.068359375, "loss_aux_layer_14": 0.0760498046875, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.092529296875, "loss_aux_layer_17": 0.099853515625, "loss_aux_layer_18": 0.107421875, "loss_aux_layer_19": 0.110595703125, "loss_aux_layer_2": 0.04034423828125, "loss_aux_layer_20": 0.1180419921875, "loss_aux_layer_21": 0.1259765625, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.05010986328125, "loss_aux_layer_4": 0.052734375, "loss_aux_layer_5": 0.0545654296875, "loss_aux_layer_6": 0.0577392578125, "loss_aux_layer_7": 0.05584716796875, "loss_aux_layer_8": 0.055419921875, "loss_aux_layer_9": 0.05426025390625, "step": 4817, "total_loss": 0.6687129139900208 }, { "epoch": 0.9538705206889725, "grad_norm": 0.8215110898017883, "learning_rate": 5e-05, "llm_loss": 0.5284715890884399, "loss": 2.4062, "loss_aux_layer_0": 0.010467529296875, "loss_aux_layer_1": 0.026641845703125, "loss_aux_layer_10": 0.05023193359375, "loss_aux_layer_11": 0.0538330078125, "loss_aux_layer_12": 0.05816650390625, "loss_aux_layer_13": 0.06317138671875, "loss_aux_layer_14": 0.0711669921875, "loss_aux_layer_15": 0.0792236328125, "loss_aux_layer_16": 0.0882568359375, "loss_aux_layer_17": 0.095703125, "loss_aux_layer_18": 0.1041259765625, "loss_aux_layer_19": 0.10791015625, "loss_aux_layer_2": 0.037109375, "loss_aux_layer_20": 0.115478515625, "loss_aux_layer_21": 0.1240234375, "loss_aux_layer_22": 0.144287109375, "loss_aux_layer_23": 0.179443359375, "loss_aux_layer_3": 0.0458984375, "loss_aux_layer_4": 0.04815673828125, "loss_aux_layer_5": 0.049560546875, "loss_aux_layer_6": 0.052001953125, "loss_aux_layer_7": 0.05047607421875, "loss_aux_layer_8": 0.0499267578125, "loss_aux_layer_9": 0.049072265625, "step": 4818, "total_loss": 0.6015530228614807 }, { "epoch": 0.9540685012868739, "grad_norm": 1.2727948427200317, "learning_rate": 5e-05, "llm_loss": 0.5396037846803665, "loss": 2.4594, "loss_aux_layer_0": 0.0118865966796875, "loss_aux_layer_1": 0.026702880859375, "loss_aux_layer_10": 0.0516357421875, "loss_aux_layer_11": 0.05535888671875, "loss_aux_layer_12": 0.0596923828125, "loss_aux_layer_13": 0.0650634765625, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.0826416015625, "loss_aux_layer_16": 0.092041015625, "loss_aux_layer_17": 0.0999755859375, "loss_aux_layer_18": 0.10791015625, "loss_aux_layer_19": 0.111328125, "loss_aux_layer_2": 0.03753662109375, "loss_aux_layer_20": 0.119384765625, "loss_aux_layer_21": 0.1275634765625, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.0467529296875, "loss_aux_layer_4": 0.049072265625, "loss_aux_layer_5": 0.050537109375, "loss_aux_layer_6": 0.05322265625, "loss_aux_layer_7": 0.051513671875, "loss_aux_layer_8": 0.0511474609375, "loss_aux_layer_9": 0.050537109375, "step": 4819, "total_loss": 0.6148594319820404 }, { "epoch": 0.9542664818847753, "grad_norm": 0.7261201739311218, "learning_rate": 5e-05, "llm_loss": 0.5221087336540222, "loss": 2.3985, "loss_aux_layer_0": 0.0101776123046875, "loss_aux_layer_1": 0.0284423828125, "loss_aux_layer_10": 0.0552978515625, "loss_aux_layer_11": 0.05914306640625, "loss_aux_layer_12": 0.0633544921875, "loss_aux_layer_13": 0.068603515625, "loss_aux_layer_14": 0.076416015625, "loss_aux_layer_15": 0.0843505859375, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.1007080078125, "loss_aux_layer_18": 0.108642578125, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.04058837890625, "loss_aux_layer_20": 0.119140625, "loss_aux_layer_21": 0.1275634765625, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.05035400390625, "loss_aux_layer_4": 0.05291748046875, "loss_aux_layer_5": 0.0543212890625, "loss_aux_layer_6": 0.05718994140625, "loss_aux_layer_7": 0.05560302734375, "loss_aux_layer_8": 0.0550537109375, "loss_aux_layer_9": 0.0540771484375, "step": 4820, "total_loss": 0.5996347591280937 }, { "epoch": 0.9544644624826767, "grad_norm": 1.1655848026275635, "learning_rate": 5e-05, "llm_loss": 0.6596400439739227, "loss": 2.9481, "loss_aux_layer_0": 0.0107269287109375, "loss_aux_layer_1": 0.02880859375, "loss_aux_layer_10": 0.0546875, "loss_aux_layer_11": 0.05889892578125, "loss_aux_layer_12": 0.06365966796875, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.077392578125, "loss_aux_layer_15": 0.0855712890625, "loss_aux_layer_16": 0.0943603515625, "loss_aux_layer_17": 0.1026611328125, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.04022216796875, "loss_aux_layer_20": 0.11962890625, "loss_aux_layer_21": 0.126953125, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.180419921875, "loss_aux_layer_3": 0.0498046875, "loss_aux_layer_4": 0.0523681640625, "loss_aux_layer_5": 0.0538330078125, "loss_aux_layer_6": 0.05657958984375, "loss_aux_layer_7": 0.054931640625, "loss_aux_layer_8": 0.0545654296875, "loss_aux_layer_9": 0.0533447265625, "step": 4821, "total_loss": 0.7370307743549347 }, { "epoch": 0.9546624430805781, "grad_norm": 1.7661480903625488, "learning_rate": 5e-05, "llm_loss": 0.5910069048404694, "loss": 2.6745, "loss_aux_layer_0": 0.0107574462890625, "loss_aux_layer_1": 0.028961181640625, "loss_aux_layer_10": 0.05419921875, "loss_aux_layer_11": 0.05792236328125, "loss_aux_layer_12": 0.062255859375, "loss_aux_layer_13": 0.0677490234375, "loss_aux_layer_14": 0.0762939453125, "loss_aux_layer_15": 0.084716796875, "loss_aux_layer_16": 0.09375, "loss_aux_layer_17": 0.101318359375, "loss_aux_layer_18": 0.109619140625, "loss_aux_layer_19": 0.1134033203125, "loss_aux_layer_2": 0.040771484375, "loss_aux_layer_20": 0.1214599609375, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.05029296875, "loss_aux_layer_4": 0.052490234375, "loss_aux_layer_5": 0.05389404296875, "loss_aux_layer_6": 0.05657958984375, "loss_aux_layer_7": 0.0548095703125, "loss_aux_layer_8": 0.05413818359375, "loss_aux_layer_9": 0.05303955078125, "step": 4822, "total_loss": 0.6686157435178757 }, { "epoch": 0.9548604236784796, "grad_norm": 1.0056326389312744, "learning_rate": 5e-05, "llm_loss": 0.5372124016284943, "loss": 2.4484, "loss_aux_layer_0": 0.0108184814453125, "loss_aux_layer_1": 0.0279541015625, "loss_aux_layer_10": 0.05194091796875, "loss_aux_layer_11": 0.05584716796875, "loss_aux_layer_12": 0.06011962890625, "loss_aux_layer_13": 0.06524658203125, "loss_aux_layer_14": 0.072998046875, "loss_aux_layer_15": 0.0806884765625, "loss_aux_layer_16": 0.08984375, "loss_aux_layer_17": 0.0980224609375, "loss_aux_layer_18": 0.1060791015625, "loss_aux_layer_19": 0.1102294921875, "loss_aux_layer_2": 0.03863525390625, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.14453125, "loss_aux_layer_23": 0.18017578125, "loss_aux_layer_3": 0.0482177734375, "loss_aux_layer_4": 0.05072021484375, "loss_aux_layer_5": 0.05194091796875, "loss_aux_layer_6": 0.0543212890625, "loss_aux_layer_7": 0.0526123046875, "loss_aux_layer_8": 0.0521240234375, "loss_aux_layer_9": 0.05072021484375, "step": 4823, "total_loss": 0.612099826335907 }, { "epoch": 0.9550584042763809, "grad_norm": 1.2691295146942139, "learning_rate": 5e-05, "llm_loss": 0.600252091884613, "loss": 2.7107, "loss_aux_layer_0": 0.0111236572265625, "loss_aux_layer_1": 0.02978515625, "loss_aux_layer_10": 0.0556640625, "loss_aux_layer_11": 0.05914306640625, "loss_aux_layer_12": 0.06298828125, "loss_aux_layer_13": 0.0682373046875, "loss_aux_layer_14": 0.075927734375, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.0926513671875, "loss_aux_layer_17": 0.100341796875, "loss_aux_layer_18": 0.1082763671875, "loss_aux_layer_19": 0.1109619140625, "loss_aux_layer_2": 0.04217529296875, "loss_aux_layer_20": 0.1185302734375, "loss_aux_layer_21": 0.1260986328125, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.18115234375, "loss_aux_layer_3": 0.052001953125, "loss_aux_layer_4": 0.0543212890625, "loss_aux_layer_5": 0.05535888671875, "loss_aux_layer_6": 0.05780029296875, "loss_aux_layer_7": 0.05596923828125, "loss_aux_layer_8": 0.05548095703125, "loss_aux_layer_9": 0.05438232421875, "step": 4824, "total_loss": 0.6776809692382812 }, { "epoch": 0.9552563848742823, "grad_norm": 1.0865767002105713, "learning_rate": 5e-05, "llm_loss": 0.5005122870206833, "loss": 2.315, "loss_aux_layer_0": 0.0116119384765625, "loss_aux_layer_1": 0.0294189453125, "loss_aux_layer_10": 0.0552978515625, "loss_aux_layer_11": 0.05902099609375, "loss_aux_layer_12": 0.06298828125, "loss_aux_layer_13": 0.0682373046875, "loss_aux_layer_14": 0.0762939453125, "loss_aux_layer_15": 0.0843505859375, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.1011962890625, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.1136474609375, "loss_aux_layer_2": 0.04144287109375, "loss_aux_layer_20": 0.1212158203125, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.05126953125, "loss_aux_layer_4": 0.05340576171875, "loss_aux_layer_5": 0.05487060546875, "loss_aux_layer_6": 0.05780029296875, "loss_aux_layer_7": 0.05596923828125, "loss_aux_layer_8": 0.0552978515625, "loss_aux_layer_9": 0.05426025390625, "step": 4825, "total_loss": 0.5787610411643982 }, { "epoch": 0.9554543654721838, "grad_norm": 0.8657164573669434, "learning_rate": 5e-05, "llm_loss": 0.5074648410081863, "loss": 2.3433, "loss_aux_layer_0": 0.011138916015625, "loss_aux_layer_1": 0.0289306640625, "loss_aux_layer_10": 0.05560302734375, "loss_aux_layer_11": 0.0594482421875, "loss_aux_layer_12": 0.0638427734375, "loss_aux_layer_13": 0.069091796875, "loss_aux_layer_14": 0.0770263671875, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.1015625, "loss_aux_layer_18": 0.1097412109375, "loss_aux_layer_19": 0.113037109375, "loss_aux_layer_2": 0.04058837890625, "loss_aux_layer_20": 0.1212158203125, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.05047607421875, "loss_aux_layer_4": 0.052978515625, "loss_aux_layer_5": 0.0546875, "loss_aux_layer_6": 0.05780029296875, "loss_aux_layer_7": 0.0560302734375, "loss_aux_layer_8": 0.05548095703125, "loss_aux_layer_9": 0.0543212890625, "step": 4826, "total_loss": 0.5858373194932938 }, { "epoch": 0.9556523460700851, "grad_norm": 1.2491295337677002, "learning_rate": 5e-05, "llm_loss": 0.5662667155265808, "loss": 2.5634, "loss_aux_layer_0": 0.0114288330078125, "loss_aux_layer_1": 0.028167724609375, "loss_aux_layer_10": 0.05194091796875, "loss_aux_layer_11": 0.05523681640625, "loss_aux_layer_12": 0.05950927734375, "loss_aux_layer_13": 0.06439208984375, "loss_aux_layer_14": 0.0723876953125, "loss_aux_layer_15": 0.08056640625, "loss_aux_layer_16": 0.0894775390625, "loss_aux_layer_17": 0.0970458984375, "loss_aux_layer_18": 0.1048583984375, "loss_aux_layer_19": 0.108642578125, "loss_aux_layer_2": 0.03924560546875, "loss_aux_layer_20": 0.1163330078125, "loss_aux_layer_21": 0.1240234375, "loss_aux_layer_22": 0.1435546875, "loss_aux_layer_23": 0.178955078125, "loss_aux_layer_3": 0.0489501953125, "loss_aux_layer_4": 0.05120849609375, "loss_aux_layer_5": 0.05242919921875, "loss_aux_layer_6": 0.0550537109375, "loss_aux_layer_7": 0.0531005859375, "loss_aux_layer_8": 0.05230712890625, "loss_aux_layer_9": 0.0509033203125, "step": 4827, "total_loss": 0.6408596336841583 }, { "epoch": 0.9558503266679865, "grad_norm": 0.8935853838920593, "learning_rate": 5e-05, "llm_loss": 0.4973004013299942, "loss": 2.3004, "loss_aux_layer_0": 0.0111541748046875, "loss_aux_layer_1": 0.0283203125, "loss_aux_layer_10": 0.05426025390625, "loss_aux_layer_11": 0.058349609375, "loss_aux_layer_12": 0.06280517578125, "loss_aux_layer_13": 0.0679931640625, "loss_aux_layer_14": 0.0760498046875, "loss_aux_layer_15": 0.0843505859375, "loss_aux_layer_16": 0.0936279296875, "loss_aux_layer_17": 0.1015625, "loss_aux_layer_18": 0.10986328125, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.0396728515625, "loss_aux_layer_20": 0.12158203125, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.189453125, "loss_aux_layer_3": 0.049560546875, "loss_aux_layer_4": 0.0517578125, "loss_aux_layer_5": 0.05303955078125, "loss_aux_layer_6": 0.0560302734375, "loss_aux_layer_7": 0.054443359375, "loss_aux_layer_8": 0.0540771484375, "loss_aux_layer_9": 0.05291748046875, "step": 4828, "total_loss": 0.5751064792275429 }, { "epoch": 0.956048307265888, "grad_norm": 1.0750832557678223, "learning_rate": 5e-05, "llm_loss": 0.5557060092687607, "loss": 2.5258, "loss_aux_layer_0": 0.0120086669921875, "loss_aux_layer_1": 0.028717041015625, "loss_aux_layer_10": 0.0533447265625, "loss_aux_layer_11": 0.05718994140625, "loss_aux_layer_12": 0.061279296875, "loss_aux_layer_13": 0.066162109375, "loss_aux_layer_14": 0.0740966796875, "loss_aux_layer_15": 0.08203125, "loss_aux_layer_16": 0.0908203125, "loss_aux_layer_17": 0.0982666015625, "loss_aux_layer_18": 0.106201171875, "loss_aux_layer_19": 0.1097412109375, "loss_aux_layer_2": 0.03997802734375, "loss_aux_layer_20": 0.117431640625, "loss_aux_layer_21": 0.12548828125, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.181884765625, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.0513916015625, "loss_aux_layer_5": 0.052734375, "loss_aux_layer_6": 0.05548095703125, "loss_aux_layer_7": 0.05401611328125, "loss_aux_layer_8": 0.05328369140625, "loss_aux_layer_9": 0.05218505859375, "step": 4829, "total_loss": 0.6314476132392883 }, { "epoch": 0.9562462878637894, "grad_norm": 1.0066150426864624, "learning_rate": 5e-05, "llm_loss": 0.5373015701770782, "loss": 2.4467, "loss_aux_layer_0": 0.0104217529296875, "loss_aux_layer_1": 0.02691650390625, "loss_aux_layer_10": 0.0523681640625, "loss_aux_layer_11": 0.0556640625, "loss_aux_layer_12": 0.059326171875, "loss_aux_layer_13": 0.0643310546875, "loss_aux_layer_14": 0.0718994140625, "loss_aux_layer_15": 0.0797119140625, "loss_aux_layer_16": 0.0885009765625, "loss_aux_layer_17": 0.0960693359375, "loss_aux_layer_18": 0.1046142578125, "loss_aux_layer_19": 0.108642578125, "loss_aux_layer_2": 0.03839111328125, "loss_aux_layer_20": 0.116943359375, "loss_aux_layer_21": 0.12548828125, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.18212890625, "loss_aux_layer_3": 0.04736328125, "loss_aux_layer_4": 0.04986572265625, "loss_aux_layer_5": 0.05133056640625, "loss_aux_layer_6": 0.05401611328125, "loss_aux_layer_7": 0.052490234375, "loss_aux_layer_8": 0.05194091796875, "loss_aux_layer_9": 0.05120849609375, "step": 4830, "total_loss": 0.611680656671524 }, { "epoch": 0.9564442684616907, "grad_norm": 0.9359758496284485, "learning_rate": 5e-05, "llm_loss": 0.531309187412262, "loss": 2.4265, "loss_aux_layer_0": 0.011322021484375, "loss_aux_layer_1": 0.02825927734375, "loss_aux_layer_10": 0.052490234375, "loss_aux_layer_11": 0.05609130859375, "loss_aux_layer_12": 0.06011962890625, "loss_aux_layer_13": 0.065185546875, "loss_aux_layer_14": 0.0731201171875, "loss_aux_layer_15": 0.0814208984375, "loss_aux_layer_16": 0.090576171875, "loss_aux_layer_17": 0.09814453125, "loss_aux_layer_18": 0.1063232421875, "loss_aux_layer_19": 0.1099853515625, "loss_aux_layer_2": 0.0390625, "loss_aux_layer_20": 0.1180419921875, "loss_aux_layer_21": 0.1259765625, "loss_aux_layer_22": 0.144775390625, "loss_aux_layer_23": 0.18017578125, "loss_aux_layer_3": 0.04852294921875, "loss_aux_layer_4": 0.05108642578125, "loss_aux_layer_5": 0.0526123046875, "loss_aux_layer_6": 0.0555419921875, "loss_aux_layer_7": 0.05389404296875, "loss_aux_layer_8": 0.05316162109375, "loss_aux_layer_9": 0.0516357421875, "step": 4831, "total_loss": 0.6066301614046097 }, { "epoch": 0.9566422490595922, "grad_norm": 0.8963329195976257, "learning_rate": 5e-05, "llm_loss": 0.504662774503231, "loss": 2.3251, "loss_aux_layer_0": 0.010101318359375, "loss_aux_layer_1": 0.02789306640625, "loss_aux_layer_10": 0.05426025390625, "loss_aux_layer_11": 0.05804443359375, "loss_aux_layer_12": 0.0621337890625, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.07470703125, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.09228515625, "loss_aux_layer_17": 0.0999755859375, "loss_aux_layer_18": 0.10791015625, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.03955078125, "loss_aux_layer_20": 0.1192626953125, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.04925537109375, "loss_aux_layer_4": 0.0516357421875, "loss_aux_layer_5": 0.052978515625, "loss_aux_layer_6": 0.05560302734375, "loss_aux_layer_7": 0.053955078125, "loss_aux_layer_8": 0.0535888671875, "loss_aux_layer_9": 0.05267333984375, "step": 4832, "total_loss": 0.5812761932611465 }, { "epoch": 0.9568402296574936, "grad_norm": 1.0533791780471802, "learning_rate": 5e-05, "llm_loss": 0.5618748962879181, "loss": 2.5508, "loss_aux_layer_0": 0.0110321044921875, "loss_aux_layer_1": 0.02728271484375, "loss_aux_layer_10": 0.05224609375, "loss_aux_layer_11": 0.05584716796875, "loss_aux_layer_12": 0.06011962890625, "loss_aux_layer_13": 0.065185546875, "loss_aux_layer_14": 0.073486328125, "loss_aux_layer_15": 0.0819091796875, "loss_aux_layer_16": 0.091552734375, "loss_aux_layer_17": 0.099609375, "loss_aux_layer_18": 0.1072998046875, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.038330078125, "loss_aux_layer_20": 0.1199951171875, "loss_aux_layer_21": 0.1285400390625, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.0478515625, "loss_aux_layer_4": 0.05010986328125, "loss_aux_layer_5": 0.051513671875, "loss_aux_layer_6": 0.05419921875, "loss_aux_layer_7": 0.052490234375, "loss_aux_layer_8": 0.0518798828125, "loss_aux_layer_9": 0.05084228515625, "step": 4833, "total_loss": 0.6377110779285431 }, { "epoch": 0.9570382102553949, "grad_norm": 0.9362943768501282, "learning_rate": 5e-05, "llm_loss": 0.5694852620363235, "loss": 2.5797, "loss_aux_layer_0": 0.0101318359375, "loss_aux_layer_1": 0.027435302734375, "loss_aux_layer_10": 0.052978515625, "loss_aux_layer_11": 0.0567626953125, "loss_aux_layer_12": 0.06097412109375, "loss_aux_layer_13": 0.066162109375, "loss_aux_layer_14": 0.0743408203125, "loss_aux_layer_15": 0.0823974609375, "loss_aux_layer_16": 0.091552734375, "loss_aux_layer_17": 0.0989990234375, "loss_aux_layer_18": 0.10693359375, "loss_aux_layer_19": 0.1102294921875, "loss_aux_layer_2": 0.03875732421875, "loss_aux_layer_20": 0.117431640625, "loss_aux_layer_21": 0.1258544921875, "loss_aux_layer_22": 0.14501953125, "loss_aux_layer_23": 0.180908203125, "loss_aux_layer_3": 0.04827880859375, "loss_aux_layer_4": 0.050537109375, "loss_aux_layer_5": 0.05218505859375, "loss_aux_layer_6": 0.054931640625, "loss_aux_layer_7": 0.05322265625, "loss_aux_layer_8": 0.05267333984375, "loss_aux_layer_9": 0.0518798828125, "step": 4834, "total_loss": 0.6449248790740967 }, { "epoch": 0.9572361908532964, "grad_norm": 0.9376899003982544, "learning_rate": 5e-05, "llm_loss": 0.5627849996089935, "loss": 2.5632, "loss_aux_layer_0": 0.010345458984375, "loss_aux_layer_1": 0.030059814453125, "loss_aux_layer_10": 0.05548095703125, "loss_aux_layer_11": 0.05926513671875, "loss_aux_layer_12": 0.06365966796875, "loss_aux_layer_13": 0.06884765625, "loss_aux_layer_14": 0.0771484375, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.09423828125, "loss_aux_layer_17": 0.1011962890625, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.04156494140625, "loss_aux_layer_20": 0.119140625, "loss_aux_layer_21": 0.1270751953125, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.051513671875, "loss_aux_layer_4": 0.0538330078125, "loss_aux_layer_5": 0.05523681640625, "loss_aux_layer_6": 0.0579833984375, "loss_aux_layer_7": 0.0560302734375, "loss_aux_layer_8": 0.05535888671875, "loss_aux_layer_9": 0.05419921875, "step": 4835, "total_loss": 0.6408106237649918 }, { "epoch": 0.9574341714511978, "grad_norm": 0.8242855072021484, "learning_rate": 5e-05, "llm_loss": 0.46606748551130295, "loss": 2.1726, "loss_aux_layer_0": 0.010650634765625, "loss_aux_layer_1": 0.0286865234375, "loss_aux_layer_10": 0.05450439453125, "loss_aux_layer_11": 0.05828857421875, "loss_aux_layer_12": 0.06268310546875, "loss_aux_layer_13": 0.0677490234375, "loss_aux_layer_14": 0.0760498046875, "loss_aux_layer_15": 0.084228515625, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.100830078125, "loss_aux_layer_18": 0.1082763671875, "loss_aux_layer_19": 0.111083984375, "loss_aux_layer_2": 0.0404052734375, "loss_aux_layer_20": 0.1187744140625, "loss_aux_layer_21": 0.1268310546875, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.050048828125, "loss_aux_layer_4": 0.05255126953125, "loss_aux_layer_5": 0.05413818359375, "loss_aux_layer_6": 0.05694580078125, "loss_aux_layer_7": 0.05523681640625, "loss_aux_layer_8": 0.054443359375, "loss_aux_layer_9": 0.05328369140625, "step": 4836, "total_loss": 0.54316146671772 }, { "epoch": 0.9576321520490992, "grad_norm": 0.919660747051239, "learning_rate": 5e-05, "llm_loss": 0.4661383628845215, "loss": 2.1809, "loss_aux_layer_0": 0.0109100341796875, "loss_aux_layer_1": 0.03021240234375, "loss_aux_layer_10": 0.05621337890625, "loss_aux_layer_11": 0.0599365234375, "loss_aux_layer_12": 0.06402587890625, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.0775146484375, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.043212890625, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.1290283203125, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.0533447265625, "loss_aux_layer_4": 0.055419921875, "loss_aux_layer_5": 0.05670166015625, "loss_aux_layer_6": 0.05926513671875, "loss_aux_layer_7": 0.05712890625, "loss_aux_layer_8": 0.0562744140625, "loss_aux_layer_9": 0.05499267578125, "step": 4837, "total_loss": 0.5452224165201187 }, { "epoch": 0.9578301326470006, "grad_norm": 0.8653954267501831, "learning_rate": 5e-05, "llm_loss": 0.5131489112973213, "loss": 2.3569, "loss_aux_layer_0": 0.010345458984375, "loss_aux_layer_1": 0.02685546875, "loss_aux_layer_10": 0.052490234375, "loss_aux_layer_11": 0.0560302734375, "loss_aux_layer_12": 0.0606689453125, "loss_aux_layer_13": 0.06640625, "loss_aux_layer_14": 0.0748291015625, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.0926513671875, "loss_aux_layer_17": 0.1004638671875, "loss_aux_layer_18": 0.1085205078125, "loss_aux_layer_19": 0.1121826171875, "loss_aux_layer_2": 0.0377197265625, "loss_aux_layer_20": 0.1199951171875, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.04705810546875, "loss_aux_layer_4": 0.04931640625, "loss_aux_layer_5": 0.0509033203125, "loss_aux_layer_6": 0.05389404296875, "loss_aux_layer_7": 0.05224609375, "loss_aux_layer_8": 0.052001953125, "loss_aux_layer_9": 0.0511474609375, "step": 4838, "total_loss": 0.5892270058393478 }, { "epoch": 0.958028113244902, "grad_norm": 0.9109578728675842, "learning_rate": 5e-05, "llm_loss": 0.49958309531211853, "loss": 2.3095, "loss_aux_layer_0": 0.0116729736328125, "loss_aux_layer_1": 0.029510498046875, "loss_aux_layer_10": 0.05511474609375, "loss_aux_layer_11": 0.0589599609375, "loss_aux_layer_12": 0.063232421875, "loss_aux_layer_13": 0.068359375, "loss_aux_layer_14": 0.07666015625, "loss_aux_layer_15": 0.084716796875, "loss_aux_layer_16": 0.0938720703125, "loss_aux_layer_17": 0.1011962890625, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.1119384765625, "loss_aux_layer_2": 0.041259765625, "loss_aux_layer_20": 0.119384765625, "loss_aux_layer_21": 0.1279296875, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.05096435546875, "loss_aux_layer_4": 0.0533447265625, "loss_aux_layer_5": 0.0546875, "loss_aux_layer_6": 0.0576171875, "loss_aux_layer_7": 0.055908203125, "loss_aux_layer_8": 0.05523681640625, "loss_aux_layer_9": 0.05389404296875, "step": 4839, "total_loss": 0.5773649513721466 }, { "epoch": 0.9582260938428034, "grad_norm": 1.0806217193603516, "learning_rate": 5e-05, "llm_loss": 0.5378106161952019, "loss": 2.4679, "loss_aux_layer_0": 0.01080322265625, "loss_aux_layer_1": 0.02923583984375, "loss_aux_layer_10": 0.05633544921875, "loss_aux_layer_11": 0.06036376953125, "loss_aux_layer_12": 0.064697265625, "loss_aux_layer_13": 0.0699462890625, "loss_aux_layer_14": 0.0780029296875, "loss_aux_layer_15": 0.0863037109375, "loss_aux_layer_16": 0.095947265625, "loss_aux_layer_17": 0.103515625, "loss_aux_layer_18": 0.111328125, "loss_aux_layer_19": 0.1143798828125, "loss_aux_layer_2": 0.04107666015625, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.05145263671875, "loss_aux_layer_4": 0.05401611328125, "loss_aux_layer_5": 0.05572509765625, "loss_aux_layer_6": 0.05877685546875, "loss_aux_layer_7": 0.05682373046875, "loss_aux_layer_8": 0.05609130859375, "loss_aux_layer_9": 0.054931640625, "step": 4840, "total_loss": 0.6169684380292892 }, { "epoch": 0.9584240744407048, "grad_norm": 0.9234529733657837, "learning_rate": 5e-05, "llm_loss": 0.5765920132398605, "loss": 2.6088, "loss_aux_layer_0": 0.0125274658203125, "loss_aux_layer_1": 0.027313232421875, "loss_aux_layer_10": 0.0528564453125, "loss_aux_layer_11": 0.05633544921875, "loss_aux_layer_12": 0.0604248046875, "loss_aux_layer_13": 0.0653076171875, "loss_aux_layer_14": 0.0736083984375, "loss_aux_layer_15": 0.081787109375, "loss_aux_layer_16": 0.091064453125, "loss_aux_layer_17": 0.0987548828125, "loss_aux_layer_18": 0.10693359375, "loss_aux_layer_19": 0.1114501953125, "loss_aux_layer_2": 0.03814697265625, "loss_aux_layer_20": 0.1195068359375, "loss_aux_layer_21": 0.1270751953125, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.04736328125, "loss_aux_layer_4": 0.0499267578125, "loss_aux_layer_5": 0.051513671875, "loss_aux_layer_6": 0.0546875, "loss_aux_layer_7": 0.05291748046875, "loss_aux_layer_8": 0.05255126953125, "loss_aux_layer_9": 0.0517578125, "step": 4841, "total_loss": 0.6522086709737778 }, { "epoch": 0.9586220550386062, "grad_norm": 0.8689784407615662, "learning_rate": 5e-05, "llm_loss": 0.542187824845314, "loss": 2.4753, "loss_aux_layer_0": 0.010528564453125, "loss_aux_layer_1": 0.028533935546875, "loss_aux_layer_10": 0.05377197265625, "loss_aux_layer_11": 0.0576171875, "loss_aux_layer_12": 0.06207275390625, "loss_aux_layer_13": 0.0673828125, "loss_aux_layer_14": 0.0758056640625, "loss_aux_layer_15": 0.084228515625, "loss_aux_layer_16": 0.09375, "loss_aux_layer_17": 0.1009521484375, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.0399169921875, "loss_aux_layer_20": 0.119140625, "loss_aux_layer_21": 0.126953125, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.18115234375, "loss_aux_layer_3": 0.04925537109375, "loss_aux_layer_4": 0.05169677734375, "loss_aux_layer_5": 0.05322265625, "loss_aux_layer_6": 0.0560302734375, "loss_aux_layer_7": 0.0540771484375, "loss_aux_layer_8": 0.053466796875, "loss_aux_layer_9": 0.0526123046875, "step": 4842, "total_loss": 0.6188159510493279 }, { "epoch": 0.9588200356365076, "grad_norm": 1.0776984691619873, "learning_rate": 5e-05, "llm_loss": 0.6031674295663834, "loss": 2.7154, "loss_aux_layer_0": 0.01171875, "loss_aux_layer_1": 0.02801513671875, "loss_aux_layer_10": 0.0528564453125, "loss_aux_layer_11": 0.0562744140625, "loss_aux_layer_12": 0.06036376953125, "loss_aux_layer_13": 0.0653076171875, "loss_aux_layer_14": 0.0736083984375, "loss_aux_layer_15": 0.0819091796875, "loss_aux_layer_16": 0.0906982421875, "loss_aux_layer_17": 0.0980224609375, "loss_aux_layer_18": 0.106201171875, "loss_aux_layer_19": 0.110595703125, "loss_aux_layer_2": 0.038818359375, "loss_aux_layer_20": 0.11865234375, "loss_aux_layer_21": 0.1275634765625, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.04815673828125, "loss_aux_layer_4": 0.05047607421875, "loss_aux_layer_5": 0.05206298828125, "loss_aux_layer_6": 0.054931640625, "loss_aux_layer_7": 0.05322265625, "loss_aux_layer_8": 0.05279541015625, "loss_aux_layer_9": 0.0516357421875, "step": 4843, "total_loss": 0.6788537055253983 }, { "epoch": 0.9590180162344091, "grad_norm": 0.9388342499732971, "learning_rate": 5e-05, "llm_loss": 0.5876358151435852, "loss": 2.6708, "loss_aux_layer_0": 0.010284423828125, "loss_aux_layer_1": 0.031341552734375, "loss_aux_layer_10": 0.05804443359375, "loss_aux_layer_11": 0.06219482421875, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.07177734375, "loss_aux_layer_14": 0.0794677734375, "loss_aux_layer_15": 0.08740234375, "loss_aux_layer_16": 0.095947265625, "loss_aux_layer_17": 0.1031494140625, "loss_aux_layer_18": 0.1109619140625, "loss_aux_layer_19": 0.113525390625, "loss_aux_layer_2": 0.04412841796875, "loss_aux_layer_20": 0.12109375, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.05401611328125, "loss_aux_layer_4": 0.05609130859375, "loss_aux_layer_5": 0.05767822265625, "loss_aux_layer_6": 0.06024169921875, "loss_aux_layer_7": 0.05859375, "loss_aux_layer_8": 0.05810546875, "loss_aux_layer_9": 0.05682373046875, "step": 4844, "total_loss": 0.6676940768957138 }, { "epoch": 0.9592159968323104, "grad_norm": 0.7724067568778992, "learning_rate": 5e-05, "llm_loss": 0.5316637456417084, "loss": 2.4252, "loss_aux_layer_0": 0.0104522705078125, "loss_aux_layer_1": 0.0277099609375, "loss_aux_layer_10": 0.05133056640625, "loss_aux_layer_11": 0.05523681640625, "loss_aux_layer_12": 0.0592041015625, "loss_aux_layer_13": 0.06439208984375, "loss_aux_layer_14": 0.0726318359375, "loss_aux_layer_15": 0.0809326171875, "loss_aux_layer_16": 0.090087890625, "loss_aux_layer_17": 0.09814453125, "loss_aux_layer_18": 0.1063232421875, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.03778076171875, "loss_aux_layer_20": 0.118408203125, "loss_aux_layer_21": 0.12646484375, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.04681396484375, "loss_aux_layer_4": 0.04901123046875, "loss_aux_layer_5": 0.050537109375, "loss_aux_layer_6": 0.05340576171875, "loss_aux_layer_7": 0.0516357421875, "loss_aux_layer_8": 0.05126953125, "loss_aux_layer_9": 0.050048828125, "step": 4845, "total_loss": 0.6062968671321869 }, { "epoch": 0.9594139774302118, "grad_norm": 0.8565411567687988, "learning_rate": 5e-05, "llm_loss": 0.5018168836832047, "loss": 2.3138, "loss_aux_layer_0": 0.00994873046875, "loss_aux_layer_1": 0.028076171875, "loss_aux_layer_10": 0.052978515625, "loss_aux_layer_11": 0.0567626953125, "loss_aux_layer_12": 0.0606689453125, "loss_aux_layer_13": 0.0660400390625, "loss_aux_layer_14": 0.0738525390625, "loss_aux_layer_15": 0.082275390625, "loss_aux_layer_16": 0.091552734375, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.1077880859375, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.039306640625, "loss_aux_layer_20": 0.120849609375, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.1904296875, "loss_aux_layer_3": 0.0482177734375, "loss_aux_layer_4": 0.050537109375, "loss_aux_layer_5": 0.05194091796875, "loss_aux_layer_6": 0.05511474609375, "loss_aux_layer_7": 0.053466796875, "loss_aux_layer_8": 0.052978515625, "loss_aux_layer_9": 0.05169677734375, "step": 4846, "total_loss": 0.578460231423378 }, { "epoch": 0.9596119580281133, "grad_norm": 0.7734318971633911, "learning_rate": 5e-05, "llm_loss": 0.49302251636981964, "loss": 2.2736, "loss_aux_layer_0": 0.010009765625, "loss_aux_layer_1": 0.02764892578125, "loss_aux_layer_10": 0.052490234375, "loss_aux_layer_11": 0.05609130859375, "loss_aux_layer_12": 0.060302734375, "loss_aux_layer_13": 0.0654296875, "loss_aux_layer_14": 0.0733642578125, "loss_aux_layer_15": 0.0819091796875, "loss_aux_layer_16": 0.0916748046875, "loss_aux_layer_17": 0.0994873046875, "loss_aux_layer_18": 0.1077880859375, "loss_aux_layer_19": 0.1107177734375, "loss_aux_layer_2": 0.038818359375, "loss_aux_layer_20": 0.118408203125, "loss_aux_layer_21": 0.126220703125, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.0477294921875, "loss_aux_layer_4": 0.04998779296875, "loss_aux_layer_5": 0.0513916015625, "loss_aux_layer_6": 0.0543212890625, "loss_aux_layer_7": 0.052734375, "loss_aux_layer_8": 0.05218505859375, "loss_aux_layer_9": 0.05108642578125, "step": 4847, "total_loss": 0.5684080943465233 }, { "epoch": 0.9598099386260146, "grad_norm": 0.865195631980896, "learning_rate": 5e-05, "llm_loss": 0.5861707329750061, "loss": 2.6598, "loss_aux_layer_0": 0.010009765625, "loss_aux_layer_1": 0.02960205078125, "loss_aux_layer_10": 0.05706787109375, "loss_aux_layer_11": 0.06097412109375, "loss_aux_layer_12": 0.06512451171875, "loss_aux_layer_13": 0.0703125, "loss_aux_layer_14": 0.07861328125, "loss_aux_layer_15": 0.086669921875, "loss_aux_layer_16": 0.095458984375, "loss_aux_layer_17": 0.1026611328125, "loss_aux_layer_18": 0.1102294921875, "loss_aux_layer_19": 0.1124267578125, "loss_aux_layer_2": 0.041748046875, "loss_aux_layer_20": 0.11962890625, "loss_aux_layer_21": 0.1268310546875, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.0518798828125, "loss_aux_layer_4": 0.0546875, "loss_aux_layer_5": 0.056396484375, "loss_aux_layer_6": 0.0595703125, "loss_aux_layer_7": 0.05755615234375, "loss_aux_layer_8": 0.0570068359375, "loss_aux_layer_9": 0.05596923828125, "step": 4848, "total_loss": 0.6649563163518906 }, { "epoch": 0.960007919223916, "grad_norm": 0.8291950225830078, "learning_rate": 5e-05, "llm_loss": 0.5565002635121346, "loss": 2.5328, "loss_aux_layer_0": 0.0099639892578125, "loss_aux_layer_1": 0.027801513671875, "loss_aux_layer_10": 0.05255126953125, "loss_aux_layer_11": 0.056396484375, "loss_aux_layer_12": 0.06097412109375, "loss_aux_layer_13": 0.06640625, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.083984375, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.1015625, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.11376953125, "loss_aux_layer_2": 0.038818359375, "loss_aux_layer_20": 0.1219482421875, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.04815673828125, "loss_aux_layer_4": 0.05023193359375, "loss_aux_layer_5": 0.0517578125, "loss_aux_layer_6": 0.05462646484375, "loss_aux_layer_7": 0.052978515625, "loss_aux_layer_8": 0.05255126953125, "loss_aux_layer_9": 0.0513916015625, "step": 4849, "total_loss": 0.6332053542137146 }, { "epoch": 0.9602058998218175, "grad_norm": 0.8932059407234192, "learning_rate": 5e-05, "llm_loss": 0.5554793030023575, "loss": 2.5246, "loss_aux_layer_0": 0.0101776123046875, "loss_aux_layer_1": 0.027313232421875, "loss_aux_layer_10": 0.0528564453125, "loss_aux_layer_11": 0.05670166015625, "loss_aux_layer_12": 0.0609130859375, "loss_aux_layer_13": 0.06597900390625, "loss_aux_layer_14": 0.0740966796875, "loss_aux_layer_15": 0.0821533203125, "loss_aux_layer_16": 0.0914306640625, "loss_aux_layer_17": 0.0985107421875, "loss_aux_layer_18": 0.1068115234375, "loss_aux_layer_19": 0.110595703125, "loss_aux_layer_2": 0.0386962890625, "loss_aux_layer_20": 0.1185302734375, "loss_aux_layer_21": 0.127197265625, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.04779052734375, "loss_aux_layer_4": 0.0499267578125, "loss_aux_layer_5": 0.0516357421875, "loss_aux_layer_6": 0.054443359375, "loss_aux_layer_7": 0.0528564453125, "loss_aux_layer_8": 0.05255126953125, "loss_aux_layer_9": 0.0517578125, "step": 4850, "total_loss": 0.6311608999967575 }, { "epoch": 0.9604038804197189, "grad_norm": 1.1768144369125366, "learning_rate": 5e-05, "llm_loss": 0.5129540637135506, "loss": 2.3476, "loss_aux_layer_0": 0.009918212890625, "loss_aux_layer_1": 0.02685546875, "loss_aux_layer_10": 0.0513916015625, "loss_aux_layer_11": 0.054931640625, "loss_aux_layer_12": 0.05908203125, "loss_aux_layer_13": 0.064208984375, "loss_aux_layer_14": 0.0721435546875, "loss_aux_layer_15": 0.0799560546875, "loss_aux_layer_16": 0.089111328125, "loss_aux_layer_17": 0.097412109375, "loss_aux_layer_18": 0.10546875, "loss_aux_layer_19": 0.109375, "loss_aux_layer_2": 0.03759765625, "loss_aux_layer_20": 0.1170654296875, "loss_aux_layer_21": 0.124755859375, "loss_aux_layer_22": 0.145263671875, "loss_aux_layer_23": 0.18115234375, "loss_aux_layer_3": 0.04638671875, "loss_aux_layer_4": 0.0484619140625, "loss_aux_layer_5": 0.0498046875, "loss_aux_layer_6": 0.05267333984375, "loss_aux_layer_7": 0.05120849609375, "loss_aux_layer_8": 0.05096435546875, "loss_aux_layer_9": 0.050048828125, "step": 4851, "total_loss": 0.5869097262620926 }, { "epoch": 0.9606018610176202, "grad_norm": 0.8158963322639465, "learning_rate": 5e-05, "llm_loss": 0.5308362543582916, "loss": 2.4307, "loss_aux_layer_0": 0.0107269287109375, "loss_aux_layer_1": 0.028472900390625, "loss_aux_layer_10": 0.05401611328125, "loss_aux_layer_11": 0.05792236328125, "loss_aux_layer_12": 0.06219482421875, "loss_aux_layer_13": 0.0675048828125, "loss_aux_layer_14": 0.0755615234375, "loss_aux_layer_15": 0.083740234375, "loss_aux_layer_16": 0.0928955078125, "loss_aux_layer_17": 0.1007080078125, "loss_aux_layer_18": 0.1083984375, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.03997802734375, "loss_aux_layer_20": 0.11962890625, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.04931640625, "loss_aux_layer_4": 0.0517578125, "loss_aux_layer_5": 0.05316162109375, "loss_aux_layer_6": 0.055908203125, "loss_aux_layer_7": 0.05426025390625, "loss_aux_layer_8": 0.0537109375, "loss_aux_layer_9": 0.05255126953125, "step": 4852, "total_loss": 0.6076758056879044 }, { "epoch": 0.9607998416155217, "grad_norm": 0.8365635275840759, "learning_rate": 5e-05, "llm_loss": 0.5243996381759644, "loss": 2.3975, "loss_aux_layer_0": 0.01043701171875, "loss_aux_layer_1": 0.027435302734375, "loss_aux_layer_10": 0.05224609375, "loss_aux_layer_11": 0.0560302734375, "loss_aux_layer_12": 0.06024169921875, "loss_aux_layer_13": 0.0653076171875, "loss_aux_layer_14": 0.0733642578125, "loss_aux_layer_15": 0.0819091796875, "loss_aux_layer_16": 0.091064453125, "loss_aux_layer_17": 0.0985107421875, "loss_aux_layer_18": 0.106689453125, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.0384521484375, "loss_aux_layer_20": 0.11767578125, "loss_aux_layer_21": 0.1253662109375, "loss_aux_layer_22": 0.1455078125, "loss_aux_layer_23": 0.18115234375, "loss_aux_layer_3": 0.047607421875, "loss_aux_layer_4": 0.0498046875, "loss_aux_layer_5": 0.05126953125, "loss_aux_layer_6": 0.05401611328125, "loss_aux_layer_7": 0.05255126953125, "loss_aux_layer_8": 0.05206298828125, "loss_aux_layer_9": 0.05096435546875, "step": 4853, "total_loss": 0.5993851572275162 }, { "epoch": 0.9609978222134231, "grad_norm": 1.236464262008667, "learning_rate": 5e-05, "llm_loss": 0.45943092554807663, "loss": 2.1487, "loss_aux_layer_0": 0.011260986328125, "loss_aux_layer_1": 0.028961181640625, "loss_aux_layer_10": 0.0552978515625, "loss_aux_layer_11": 0.058837890625, "loss_aux_layer_12": 0.06317138671875, "loss_aux_layer_13": 0.06829833984375, "loss_aux_layer_14": 0.076416015625, "loss_aux_layer_15": 0.084716796875, "loss_aux_layer_16": 0.0936279296875, "loss_aux_layer_17": 0.1007080078125, "loss_aux_layer_18": 0.109130859375, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.041015625, "loss_aux_layer_20": 0.1195068359375, "loss_aux_layer_21": 0.1280517578125, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.05029296875, "loss_aux_layer_4": 0.05267333984375, "loss_aux_layer_5": 0.05438232421875, "loss_aux_layer_6": 0.057373046875, "loss_aux_layer_7": 0.05584716796875, "loss_aux_layer_8": 0.0552978515625, "loss_aux_layer_9": 0.05426025390625, "step": 4854, "total_loss": 0.5371652245521545 }, { "epoch": 0.9611958028113244, "grad_norm": 0.9076001644134521, "learning_rate": 5e-05, "llm_loss": 0.6267031282186508, "loss": 2.7947, "loss_aux_layer_0": 0.010498046875, "loss_aux_layer_1": 0.02606201171875, "loss_aux_layer_10": 0.04949951171875, "loss_aux_layer_11": 0.05267333984375, "loss_aux_layer_12": 0.0562744140625, "loss_aux_layer_13": 0.06121826171875, "loss_aux_layer_14": 0.0697021484375, "loss_aux_layer_15": 0.0780029296875, "loss_aux_layer_16": 0.0872802734375, "loss_aux_layer_17": 0.0953369140625, "loss_aux_layer_18": 0.1036376953125, "loss_aux_layer_19": 0.1075439453125, "loss_aux_layer_2": 0.03546142578125, "loss_aux_layer_20": 0.1153564453125, "loss_aux_layer_21": 0.123046875, "loss_aux_layer_22": 0.141357421875, "loss_aux_layer_23": 0.176025390625, "loss_aux_layer_3": 0.04425048828125, "loss_aux_layer_4": 0.04669189453125, "loss_aux_layer_5": 0.04840087890625, "loss_aux_layer_6": 0.05126953125, "loss_aux_layer_7": 0.049560546875, "loss_aux_layer_8": 0.04931640625, "loss_aux_layer_9": 0.0482177734375, "step": 4855, "total_loss": 0.6986777782440186 }, { "epoch": 0.9613937834092259, "grad_norm": 0.9796584248542786, "learning_rate": 5e-05, "llm_loss": 0.5590220838785172, "loss": 2.5401, "loss_aux_layer_0": 0.0113372802734375, "loss_aux_layer_1": 0.02764892578125, "loss_aux_layer_10": 0.05322265625, "loss_aux_layer_11": 0.05682373046875, "loss_aux_layer_12": 0.06103515625, "loss_aux_layer_13": 0.06610107421875, "loss_aux_layer_14": 0.0745849609375, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.099609375, "loss_aux_layer_18": 0.1075439453125, "loss_aux_layer_19": 0.111083984375, "loss_aux_layer_2": 0.03912353515625, "loss_aux_layer_20": 0.118896484375, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.0482177734375, "loss_aux_layer_4": 0.050537109375, "loss_aux_layer_5": 0.0521240234375, "loss_aux_layer_6": 0.0550537109375, "loss_aux_layer_7": 0.0533447265625, "loss_aux_layer_8": 0.052978515625, "loss_aux_layer_9": 0.05194091796875, "step": 4856, "total_loss": 0.6350259780883789 }, { "epoch": 0.9615917640071273, "grad_norm": 0.9024073481559753, "learning_rate": 5e-05, "llm_loss": 0.5221173912286758, "loss": 2.389, "loss_aux_layer_0": 0.010528564453125, "loss_aux_layer_1": 0.02777099609375, "loss_aux_layer_10": 0.0535888671875, "loss_aux_layer_11": 0.0570068359375, "loss_aux_layer_12": 0.0611572265625, "loss_aux_layer_13": 0.06610107421875, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.08203125, "loss_aux_layer_16": 0.0906982421875, "loss_aux_layer_17": 0.098388671875, "loss_aux_layer_18": 0.10595703125, "loss_aux_layer_19": 0.1083984375, "loss_aux_layer_2": 0.0396728515625, "loss_aux_layer_20": 0.1158447265625, "loss_aux_layer_21": 0.123291015625, "loss_aux_layer_22": 0.14306640625, "loss_aux_layer_23": 0.177978515625, "loss_aux_layer_3": 0.04852294921875, "loss_aux_layer_4": 0.0511474609375, "loss_aux_layer_5": 0.0526123046875, "loss_aux_layer_6": 0.055419921875, "loss_aux_layer_7": 0.053955078125, "loss_aux_layer_8": 0.0533447265625, "loss_aux_layer_9": 0.0523681640625, "step": 4857, "total_loss": 0.5972405076026917 }, { "epoch": 0.9617897446050288, "grad_norm": 0.7699090838432312, "learning_rate": 5e-05, "llm_loss": 0.520293578505516, "loss": 2.3852, "loss_aux_layer_0": 0.0098724365234375, "loss_aux_layer_1": 0.028350830078125, "loss_aux_layer_10": 0.0537109375, "loss_aux_layer_11": 0.05718994140625, "loss_aux_layer_12": 0.06134033203125, "loss_aux_layer_13": 0.06640625, "loss_aux_layer_14": 0.07421875, "loss_aux_layer_15": 0.081787109375, "loss_aux_layer_16": 0.0909423828125, "loss_aux_layer_17": 0.0985107421875, "loss_aux_layer_18": 0.1064453125, "loss_aux_layer_19": 0.110107421875, "loss_aux_layer_2": 0.0399169921875, "loss_aux_layer_20": 0.1185302734375, "loss_aux_layer_21": 0.1263427734375, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.051513671875, "loss_aux_layer_5": 0.0528564453125, "loss_aux_layer_6": 0.055419921875, "loss_aux_layer_7": 0.0537109375, "loss_aux_layer_8": 0.0533447265625, "loss_aux_layer_9": 0.0523681640625, "step": 4858, "total_loss": 0.596305638551712 }, { "epoch": 0.9619877252029301, "grad_norm": 0.8900915384292603, "learning_rate": 5e-05, "llm_loss": 0.48372090607881546, "loss": 2.2473, "loss_aux_layer_0": 0.010986328125, "loss_aux_layer_1": 0.028472900390625, "loss_aux_layer_10": 0.05462646484375, "loss_aux_layer_11": 0.0584716796875, "loss_aux_layer_12": 0.06280517578125, "loss_aux_layer_13": 0.0682373046875, "loss_aux_layer_14": 0.0767822265625, "loss_aux_layer_15": 0.0853271484375, "loss_aux_layer_16": 0.094482421875, "loss_aux_layer_17": 0.1031494140625, "loss_aux_layer_18": 0.1119384765625, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.03973388671875, "loss_aux_layer_20": 0.1234130859375, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.049072265625, "loss_aux_layer_4": 0.051513671875, "loss_aux_layer_5": 0.05316162109375, "loss_aux_layer_6": 0.05609130859375, "loss_aux_layer_7": 0.0545654296875, "loss_aux_layer_8": 0.0540771484375, "loss_aux_layer_9": 0.05322265625, "step": 4859, "total_loss": 0.5618356466293335 }, { "epoch": 0.9621857058008315, "grad_norm": 0.975001335144043, "learning_rate": 5e-05, "llm_loss": 0.5681236386299133, "loss": 2.5716, "loss_aux_layer_0": 0.009979248046875, "loss_aux_layer_1": 0.027099609375, "loss_aux_layer_10": 0.05224609375, "loss_aux_layer_11": 0.05584716796875, "loss_aux_layer_12": 0.06005859375, "loss_aux_layer_13": 0.06512451171875, "loss_aux_layer_14": 0.0732421875, "loss_aux_layer_15": 0.081787109375, "loss_aux_layer_16": 0.0914306640625, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.1070556640625, "loss_aux_layer_19": 0.1102294921875, "loss_aux_layer_2": 0.0380859375, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.125, "loss_aux_layer_22": 0.144287109375, "loss_aux_layer_23": 0.178466796875, "loss_aux_layer_3": 0.04742431640625, "loss_aux_layer_4": 0.04974365234375, "loss_aux_layer_5": 0.0511474609375, "loss_aux_layer_6": 0.0540771484375, "loss_aux_layer_7": 0.05242919921875, "loss_aux_layer_8": 0.05194091796875, "loss_aux_layer_9": 0.05096435546875, "step": 4860, "total_loss": 0.6428899317979813 }, { "epoch": 0.962383686398733, "grad_norm": 0.8108114004135132, "learning_rate": 5e-05, "llm_loss": 0.47301552444696426, "loss": 2.1935, "loss_aux_layer_0": 0.0110626220703125, "loss_aux_layer_1": 0.028350830078125, "loss_aux_layer_10": 0.0528564453125, "loss_aux_layer_11": 0.05657958984375, "loss_aux_layer_12": 0.06097412109375, "loss_aux_layer_13": 0.06610107421875, "loss_aux_layer_14": 0.0738525390625, "loss_aux_layer_15": 0.0819091796875, "loss_aux_layer_16": 0.0908203125, "loss_aux_layer_17": 0.0977783203125, "loss_aux_layer_18": 0.1058349609375, "loss_aux_layer_19": 0.1092529296875, "loss_aux_layer_2": 0.0394287109375, "loss_aux_layer_20": 0.1165771484375, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.182373046875, "loss_aux_layer_3": 0.04864501953125, "loss_aux_layer_4": 0.05084228515625, "loss_aux_layer_5": 0.0521240234375, "loss_aux_layer_6": 0.05487060546875, "loss_aux_layer_7": 0.05328369140625, "loss_aux_layer_8": 0.05291748046875, "loss_aux_layer_9": 0.0516357421875, "step": 4861, "total_loss": 0.5483690798282623 }, { "epoch": 0.9625816669966343, "grad_norm": 0.8623015284538269, "learning_rate": 5e-05, "llm_loss": 0.6504790484905243, "loss": 2.899, "loss_aux_layer_0": 0.0095062255859375, "loss_aux_layer_1": 0.02825927734375, "loss_aux_layer_10": 0.05255126953125, "loss_aux_layer_11": 0.05609130859375, "loss_aux_layer_12": 0.0601806640625, "loss_aux_layer_13": 0.065185546875, "loss_aux_layer_14": 0.0732421875, "loss_aux_layer_15": 0.0809326171875, "loss_aux_layer_16": 0.0897216796875, "loss_aux_layer_17": 0.09716796875, "loss_aux_layer_18": 0.104736328125, "loss_aux_layer_19": 0.1072998046875, "loss_aux_layer_2": 0.0390625, "loss_aux_layer_20": 0.11474609375, "loss_aux_layer_21": 0.122314453125, "loss_aux_layer_22": 0.142333984375, "loss_aux_layer_23": 0.177490234375, "loss_aux_layer_3": 0.0482177734375, "loss_aux_layer_4": 0.05047607421875, "loss_aux_layer_5": 0.05181884765625, "loss_aux_layer_6": 0.05450439453125, "loss_aux_layer_7": 0.05279541015625, "loss_aux_layer_8": 0.0523681640625, "loss_aux_layer_9": 0.05120849609375, "step": 4862, "total_loss": 0.7247612178325653 }, { "epoch": 0.9627796475945357, "grad_norm": 1.0412765741348267, "learning_rate": 5e-05, "llm_loss": 0.5516039580106735, "loss": 2.5196, "loss_aux_layer_0": 0.0106201171875, "loss_aux_layer_1": 0.02862548828125, "loss_aux_layer_10": 0.05474853515625, "loss_aux_layer_11": 0.0584716796875, "loss_aux_layer_12": 0.06298828125, "loss_aux_layer_13": 0.06842041015625, "loss_aux_layer_14": 0.0771484375, "loss_aux_layer_15": 0.0853271484375, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.1102294921875, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.04046630859375, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.05047607421875, "loss_aux_layer_4": 0.052734375, "loss_aux_layer_5": 0.05389404296875, "loss_aux_layer_6": 0.056884765625, "loss_aux_layer_7": 0.05523681640625, "loss_aux_layer_8": 0.0545654296875, "loss_aux_layer_9": 0.053466796875, "step": 4863, "total_loss": 0.6299115121364594 }, { "epoch": 0.9629776281924372, "grad_norm": 0.8739772439002991, "learning_rate": 5e-05, "llm_loss": 0.5677116513252258, "loss": 2.5764, "loss_aux_layer_0": 0.0105743408203125, "loss_aux_layer_1": 0.02801513671875, "loss_aux_layer_10": 0.05401611328125, "loss_aux_layer_11": 0.057861328125, "loss_aux_layer_12": 0.061767578125, "loss_aux_layer_13": 0.06683349609375, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.091552734375, "loss_aux_layer_17": 0.0992431640625, "loss_aux_layer_18": 0.107666015625, "loss_aux_layer_19": 0.1109619140625, "loss_aux_layer_2": 0.03985595703125, "loss_aux_layer_20": 0.118896484375, "loss_aux_layer_21": 0.1268310546875, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.05133056640625, "loss_aux_layer_5": 0.052734375, "loss_aux_layer_6": 0.05560302734375, "loss_aux_layer_7": 0.05419921875, "loss_aux_layer_8": 0.05377197265625, "loss_aux_layer_9": 0.05267333984375, "step": 4864, "total_loss": 0.6440933495759964 }, { "epoch": 0.9631756087903386, "grad_norm": 1.1113027334213257, "learning_rate": 5e-05, "llm_loss": 0.5760484039783478, "loss": 2.6096, "loss_aux_layer_0": 0.0105133056640625, "loss_aux_layer_1": 0.027862548828125, "loss_aux_layer_10": 0.05303955078125, "loss_aux_layer_11": 0.0570068359375, "loss_aux_layer_12": 0.06146240234375, "loss_aux_layer_13": 0.0665283203125, "loss_aux_layer_14": 0.0748291015625, "loss_aux_layer_15": 0.0831298828125, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.0997314453125, "loss_aux_layer_18": 0.1082763671875, "loss_aux_layer_19": 0.111572265625, "loss_aux_layer_2": 0.04022216796875, "loss_aux_layer_20": 0.1195068359375, "loss_aux_layer_21": 0.1275634765625, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.0489501953125, "loss_aux_layer_4": 0.05120849609375, "loss_aux_layer_5": 0.05230712890625, "loss_aux_layer_6": 0.05511474609375, "loss_aux_layer_7": 0.0533447265625, "loss_aux_layer_8": 0.052734375, "loss_aux_layer_9": 0.05169677734375, "step": 4865, "total_loss": 0.6524000614881516 }, { "epoch": 0.9633735893882399, "grad_norm": 0.7586309313774109, "learning_rate": 5e-05, "llm_loss": 0.5562988966703415, "loss": 2.5304, "loss_aux_layer_0": 0.0106658935546875, "loss_aux_layer_1": 0.027984619140625, "loss_aux_layer_10": 0.05352783203125, "loss_aux_layer_11": 0.057373046875, "loss_aux_layer_12": 0.0615234375, "loss_aux_layer_13": 0.0665283203125, "loss_aux_layer_14": 0.0748291015625, "loss_aux_layer_15": 0.0830078125, "loss_aux_layer_16": 0.09228515625, "loss_aux_layer_17": 0.1007080078125, "loss_aux_layer_18": 0.1083984375, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.03863525390625, "loss_aux_layer_20": 0.1197509765625, "loss_aux_layer_21": 0.128173828125, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.0478515625, "loss_aux_layer_4": 0.05029296875, "loss_aux_layer_5": 0.0516357421875, "loss_aux_layer_6": 0.0546875, "loss_aux_layer_7": 0.05322265625, "loss_aux_layer_8": 0.05291748046875, "loss_aux_layer_9": 0.05206298828125, "step": 4866, "total_loss": 0.632605716586113 }, { "epoch": 0.9635715699861414, "grad_norm": 0.9802893400192261, "learning_rate": 5e-05, "llm_loss": 0.5962195545434952, "loss": 2.6925, "loss_aux_layer_0": 0.00933837890625, "loss_aux_layer_1": 0.028411865234375, "loss_aux_layer_10": 0.05401611328125, "loss_aux_layer_11": 0.057861328125, "loss_aux_layer_12": 0.06207275390625, "loss_aux_layer_13": 0.06756591796875, "loss_aux_layer_14": 0.0755615234375, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.0931396484375, "loss_aux_layer_17": 0.1011962890625, "loss_aux_layer_18": 0.1092529296875, "loss_aux_layer_19": 0.11279296875, "loss_aux_layer_2": 0.03985595703125, "loss_aux_layer_20": 0.12060546875, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.0494384765625, "loss_aux_layer_4": 0.05169677734375, "loss_aux_layer_5": 0.052978515625, "loss_aux_layer_6": 0.0557861328125, "loss_aux_layer_7": 0.05419921875, "loss_aux_layer_8": 0.0537109375, "loss_aux_layer_9": 0.0526123046875, "step": 4867, "total_loss": 0.673134908080101 }, { "epoch": 0.9637695505840428, "grad_norm": 0.8915992379188538, "learning_rate": 5e-05, "llm_loss": 0.5319328606128693, "loss": 2.4147, "loss_aux_layer_0": 0.0100555419921875, "loss_aux_layer_1": 0.026580810546875, "loss_aux_layer_10": 0.04937744140625, "loss_aux_layer_11": 0.05279541015625, "loss_aux_layer_12": 0.05670166015625, "loss_aux_layer_13": 0.06146240234375, "loss_aux_layer_14": 0.069091796875, "loss_aux_layer_15": 0.0771484375, "loss_aux_layer_16": 0.0859375, "loss_aux_layer_17": 0.093505859375, "loss_aux_layer_18": 0.101318359375, "loss_aux_layer_19": 0.1051025390625, "loss_aux_layer_2": 0.0372314453125, "loss_aux_layer_20": 0.113525390625, "loss_aux_layer_21": 0.1214599609375, "loss_aux_layer_22": 0.14111328125, "loss_aux_layer_23": 0.177001953125, "loss_aux_layer_3": 0.0458984375, "loss_aux_layer_4": 0.04791259765625, "loss_aux_layer_5": 0.04901123046875, "loss_aux_layer_6": 0.0516357421875, "loss_aux_layer_7": 0.04998779296875, "loss_aux_layer_8": 0.04949951171875, "loss_aux_layer_9": 0.04827880859375, "step": 4868, "total_loss": 0.6036845445632935 }, { "epoch": 0.9639675311819441, "grad_norm": 0.9390655159950256, "learning_rate": 5e-05, "llm_loss": 0.6043700352311134, "loss": 2.7129, "loss_aux_layer_0": 0.010162353515625, "loss_aux_layer_1": 0.027252197265625, "loss_aux_layer_10": 0.05078125, "loss_aux_layer_11": 0.054443359375, "loss_aux_layer_12": 0.05865478515625, "loss_aux_layer_13": 0.06365966796875, "loss_aux_layer_14": 0.07177734375, "loss_aux_layer_15": 0.080078125, "loss_aux_layer_16": 0.0889892578125, "loss_aux_layer_17": 0.0963134765625, "loss_aux_layer_18": 0.1044921875, "loss_aux_layer_19": 0.1082763671875, "loss_aux_layer_2": 0.03790283203125, "loss_aux_layer_20": 0.1163330078125, "loss_aux_layer_21": 0.1248779296875, "loss_aux_layer_22": 0.145263671875, "loss_aux_layer_23": 0.181640625, "loss_aux_layer_3": 0.0467529296875, "loss_aux_layer_4": 0.0491943359375, "loss_aux_layer_5": 0.0504150390625, "loss_aux_layer_6": 0.05291748046875, "loss_aux_layer_7": 0.05133056640625, "loss_aux_layer_8": 0.05084228515625, "loss_aux_layer_9": 0.0496826171875, "step": 4869, "total_loss": 0.6782177165150642 }, { "epoch": 0.9641655117798456, "grad_norm": 0.7452171444892883, "learning_rate": 5e-05, "llm_loss": 0.5866471379995346, "loss": 2.6508, "loss_aux_layer_0": 0.01007080078125, "loss_aux_layer_1": 0.0279541015625, "loss_aux_layer_10": 0.05340576171875, "loss_aux_layer_11": 0.05712890625, "loss_aux_layer_12": 0.061279296875, "loss_aux_layer_13": 0.0667724609375, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.08349609375, "loss_aux_layer_16": 0.0928955078125, "loss_aux_layer_17": 0.1005859375, "loss_aux_layer_18": 0.10888671875, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.038818359375, "loss_aux_layer_20": 0.1192626953125, "loss_aux_layer_21": 0.1265869140625, "loss_aux_layer_22": 0.1455078125, "loss_aux_layer_23": 0.180419921875, "loss_aux_layer_3": 0.0484619140625, "loss_aux_layer_4": 0.0509033203125, "loss_aux_layer_5": 0.05224609375, "loss_aux_layer_6": 0.054931640625, "loss_aux_layer_7": 0.0533447265625, "loss_aux_layer_8": 0.0531005859375, "loss_aux_layer_9": 0.05206298828125, "step": 4870, "total_loss": 0.6626886874437332 }, { "epoch": 0.964363492377747, "grad_norm": 0.9039862751960754, "learning_rate": 5e-05, "llm_loss": 0.6020503640174866, "loss": 2.7053, "loss_aux_layer_0": 0.010040283203125, "loss_aux_layer_1": 0.0272216796875, "loss_aux_layer_10": 0.05133056640625, "loss_aux_layer_11": 0.05499267578125, "loss_aux_layer_12": 0.059326171875, "loss_aux_layer_13": 0.064453125, "loss_aux_layer_14": 0.0728759765625, "loss_aux_layer_15": 0.081298828125, "loss_aux_layer_16": 0.090576171875, "loss_aux_layer_17": 0.0982666015625, "loss_aux_layer_18": 0.1064453125, "loss_aux_layer_19": 0.1094970703125, "loss_aux_layer_2": 0.037841796875, "loss_aux_layer_20": 0.116943359375, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.144287109375, "loss_aux_layer_23": 0.180419921875, "loss_aux_layer_3": 0.046875, "loss_aux_layer_4": 0.04901123046875, "loss_aux_layer_5": 0.05023193359375, "loss_aux_layer_6": 0.05291748046875, "loss_aux_layer_7": 0.05133056640625, "loss_aux_layer_8": 0.05108642578125, "loss_aux_layer_9": 0.050048828125, "step": 4871, "total_loss": 0.676324337720871 }, { "epoch": 0.9645614729756484, "grad_norm": 0.9719907641410828, "learning_rate": 5e-05, "llm_loss": 0.5462634116411209, "loss": 2.4923, "loss_aux_layer_0": 0.009429931640625, "loss_aux_layer_1": 0.0289306640625, "loss_aux_layer_10": 0.05487060546875, "loss_aux_layer_11": 0.058837890625, "loss_aux_layer_12": 0.06280517578125, "loss_aux_layer_13": 0.0677490234375, "loss_aux_layer_14": 0.0758056640625, "loss_aux_layer_15": 0.0836181640625, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.0999755859375, "loss_aux_layer_18": 0.107666015625, "loss_aux_layer_19": 0.110595703125, "loss_aux_layer_2": 0.04071044921875, "loss_aux_layer_20": 0.1185302734375, "loss_aux_layer_21": 0.12646484375, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.181396484375, "loss_aux_layer_3": 0.05010986328125, "loss_aux_layer_4": 0.0528564453125, "loss_aux_layer_5": 0.05413818359375, "loss_aux_layer_6": 0.05706787109375, "loss_aux_layer_7": 0.05523681640625, "loss_aux_layer_8": 0.0545654296875, "loss_aux_layer_9": 0.05340576171875, "step": 4872, "total_loss": 0.6230863183736801 }, { "epoch": 0.9647594535735498, "grad_norm": 1.0268590450286865, "learning_rate": 5e-05, "llm_loss": 0.5395245030522346, "loss": 2.4734, "loss_aux_layer_0": 0.010162353515625, "loss_aux_layer_1": 0.029327392578125, "loss_aux_layer_10": 0.0560302734375, "loss_aux_layer_11": 0.06024169921875, "loss_aux_layer_12": 0.06451416015625, "loss_aux_layer_13": 0.069580078125, "loss_aux_layer_14": 0.07763671875, "loss_aux_layer_15": 0.0855712890625, "loss_aux_layer_16": 0.0948486328125, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.1104736328125, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.041259765625, "loss_aux_layer_20": 0.1214599609375, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.051025390625, "loss_aux_layer_4": 0.05377197265625, "loss_aux_layer_5": 0.055419921875, "loss_aux_layer_6": 0.05828857421875, "loss_aux_layer_7": 0.05657958984375, "loss_aux_layer_8": 0.05609130859375, "loss_aux_layer_9": 0.0548095703125, "step": 4873, "total_loss": 0.6183440238237381 }, { "epoch": 0.9649574341714512, "grad_norm": 0.8898181319236755, "learning_rate": 5e-05, "llm_loss": 0.6333124339580536, "loss": 2.8476, "loss_aux_layer_0": 0.010528564453125, "loss_aux_layer_1": 0.02886962890625, "loss_aux_layer_10": 0.055419921875, "loss_aux_layer_11": 0.0594482421875, "loss_aux_layer_12": 0.06390380859375, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.078125, "loss_aux_layer_15": 0.0865478515625, "loss_aux_layer_16": 0.0958251953125, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.112060546875, "loss_aux_layer_19": 0.115478515625, "loss_aux_layer_2": 0.0404052734375, "loss_aux_layer_20": 0.123046875, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.185302734375, "loss_aux_layer_3": 0.04986572265625, "loss_aux_layer_4": 0.05242919921875, "loss_aux_layer_5": 0.053955078125, "loss_aux_layer_6": 0.05682373046875, "loss_aux_layer_7": 0.0555419921875, "loss_aux_layer_8": 0.054931640625, "loss_aux_layer_9": 0.05377197265625, "step": 4874, "total_loss": 0.7118877172470093 }, { "epoch": 0.9651554147693526, "grad_norm": 0.9675056338310242, "learning_rate": 5e-05, "llm_loss": 0.6180214136838913, "loss": 2.7786, "loss_aux_layer_0": 0.010650634765625, "loss_aux_layer_1": 0.02911376953125, "loss_aux_layer_10": 0.05499267578125, "loss_aux_layer_11": 0.0587158203125, "loss_aux_layer_12": 0.0626220703125, "loss_aux_layer_13": 0.0675048828125, "loss_aux_layer_14": 0.0753173828125, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.091552734375, "loss_aux_layer_17": 0.0987548828125, "loss_aux_layer_18": 0.1064453125, "loss_aux_layer_19": 0.1092529296875, "loss_aux_layer_2": 0.041015625, "loss_aux_layer_20": 0.116943359375, "loss_aux_layer_21": 0.12548828125, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.182373046875, "loss_aux_layer_3": 0.0504150390625, "loss_aux_layer_4": 0.0528564453125, "loss_aux_layer_5": 0.05426025390625, "loss_aux_layer_6": 0.05694580078125, "loss_aux_layer_7": 0.05560302734375, "loss_aux_layer_8": 0.05487060546875, "loss_aux_layer_9": 0.05364990234375, "step": 4875, "total_loss": 0.6946375668048859 }, { "epoch": 0.9653533953672541, "grad_norm": 0.7710760831832886, "learning_rate": 5e-05, "llm_loss": 0.5871438831090927, "loss": 2.6541, "loss_aux_layer_0": 0.0104522705078125, "loss_aux_layer_1": 0.027984619140625, "loss_aux_layer_10": 0.053466796875, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.0614013671875, "loss_aux_layer_13": 0.0662841796875, "loss_aux_layer_14": 0.0745849609375, "loss_aux_layer_15": 0.082763671875, "loss_aux_layer_16": 0.091796875, "loss_aux_layer_17": 0.099365234375, "loss_aux_layer_18": 0.107666015625, "loss_aux_layer_19": 0.1119384765625, "loss_aux_layer_2": 0.03924560546875, "loss_aux_layer_20": 0.1201171875, "loss_aux_layer_21": 0.1290283203125, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.04833984375, "loss_aux_layer_4": 0.05078125, "loss_aux_layer_5": 0.05218505859375, "loss_aux_layer_6": 0.0550537109375, "loss_aux_layer_7": 0.05340576171875, "loss_aux_layer_8": 0.052978515625, "loss_aux_layer_9": 0.05224609375, "step": 4876, "total_loss": 0.6635328978300095 }, { "epoch": 0.9655513759651554, "grad_norm": 0.9781942963600159, "learning_rate": 5e-05, "llm_loss": 0.484505794942379, "loss": 2.2494, "loss_aux_layer_0": 0.010101318359375, "loss_aux_layer_1": 0.029144287109375, "loss_aux_layer_10": 0.0552978515625, "loss_aux_layer_11": 0.05902099609375, "loss_aux_layer_12": 0.06341552734375, "loss_aux_layer_13": 0.06854248046875, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.0413818359375, "loss_aux_layer_20": 0.1197509765625, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.0506591796875, "loss_aux_layer_4": 0.052978515625, "loss_aux_layer_5": 0.05450439453125, "loss_aux_layer_6": 0.0572509765625, "loss_aux_layer_7": 0.05548095703125, "loss_aux_layer_8": 0.05511474609375, "loss_aux_layer_9": 0.05401611328125, "step": 4877, "total_loss": 0.5623486638069153 }, { "epoch": 0.9657493565630568, "grad_norm": 0.8335387110710144, "learning_rate": 5e-05, "llm_loss": 0.6274902150034904, "loss": 2.8164, "loss_aux_layer_0": 0.00958251953125, "loss_aux_layer_1": 0.027862548828125, "loss_aux_layer_10": 0.0543212890625, "loss_aux_layer_11": 0.0579833984375, "loss_aux_layer_12": 0.0623779296875, "loss_aux_layer_13": 0.0677490234375, "loss_aux_layer_14": 0.0758056640625, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.0927734375, "loss_aux_layer_17": 0.1005859375, "loss_aux_layer_18": 0.1085205078125, "loss_aux_layer_19": 0.1112060546875, "loss_aux_layer_2": 0.0391845703125, "loss_aux_layer_20": 0.118896484375, "loss_aux_layer_21": 0.126953125, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.04876708984375, "loss_aux_layer_4": 0.05145263671875, "loss_aux_layer_5": 0.0531005859375, "loss_aux_layer_6": 0.05596923828125, "loss_aux_layer_7": 0.0543212890625, "loss_aux_layer_8": 0.053955078125, "loss_aux_layer_9": 0.05303955078125, "step": 4878, "total_loss": 0.7040895074605942 }, { "epoch": 0.9659473371609583, "grad_norm": 0.8960857391357422, "learning_rate": 5e-05, "llm_loss": 0.5923761874437332, "loss": 2.6744, "loss_aux_layer_0": 0.0107574462890625, "loss_aux_layer_1": 0.028350830078125, "loss_aux_layer_10": 0.05401611328125, "loss_aux_layer_11": 0.05755615234375, "loss_aux_layer_12": 0.061767578125, "loss_aux_layer_13": 0.06689453125, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.0830078125, "loss_aux_layer_16": 0.091796875, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.1064453125, "loss_aux_layer_19": 0.1097412109375, "loss_aux_layer_2": 0.039794921875, "loss_aux_layer_20": 0.116943359375, "loss_aux_layer_21": 0.12548828125, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.04962158203125, "loss_aux_layer_4": 0.05206298828125, "loss_aux_layer_5": 0.053466796875, "loss_aux_layer_6": 0.056396484375, "loss_aux_layer_7": 0.0546875, "loss_aux_layer_8": 0.05401611328125, "loss_aux_layer_9": 0.05267333984375, "step": 4879, "total_loss": 0.6686058640480042 }, { "epoch": 0.9661453177588596, "grad_norm": 0.7061061263084412, "learning_rate": 5e-05, "llm_loss": 0.5442765951156616, "loss": 2.4827, "loss_aux_layer_0": 0.00982666015625, "loss_aux_layer_1": 0.028472900390625, "loss_aux_layer_10": 0.05377197265625, "loss_aux_layer_11": 0.0577392578125, "loss_aux_layer_12": 0.06201171875, "loss_aux_layer_13": 0.06707763671875, "loss_aux_layer_14": 0.0748291015625, "loss_aux_layer_15": 0.082763671875, "loss_aux_layer_16": 0.091796875, "loss_aux_layer_17": 0.0992431640625, "loss_aux_layer_18": 0.107421875, "loss_aux_layer_19": 0.1107177734375, "loss_aux_layer_2": 0.039794921875, "loss_aux_layer_20": 0.1182861328125, "loss_aux_layer_21": 0.1268310546875, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.0513916015625, "loss_aux_layer_5": 0.0528564453125, "loss_aux_layer_6": 0.05560302734375, "loss_aux_layer_7": 0.0540771484375, "loss_aux_layer_8": 0.0537109375, "loss_aux_layer_9": 0.052490234375, "step": 4880, "total_loss": 0.6206710338592529 }, { "epoch": 0.966343298356761, "grad_norm": 0.7872814536094666, "learning_rate": 5e-05, "llm_loss": 0.5637592375278473, "loss": 2.556, "loss_aux_layer_0": 0.0104522705078125, "loss_aux_layer_1": 0.0281982421875, "loss_aux_layer_10": 0.0537109375, "loss_aux_layer_11": 0.05743408203125, "loss_aux_layer_12": 0.0614013671875, "loss_aux_layer_13": 0.06640625, "loss_aux_layer_14": 0.0743408203125, "loss_aux_layer_15": 0.08203125, "loss_aux_layer_16": 0.0908203125, "loss_aux_layer_17": 0.0980224609375, "loss_aux_layer_18": 0.1055908203125, "loss_aux_layer_19": 0.1087646484375, "loss_aux_layer_2": 0.03924560546875, "loss_aux_layer_20": 0.1165771484375, "loss_aux_layer_21": 0.1240234375, "loss_aux_layer_22": 0.1435546875, "loss_aux_layer_23": 0.17822265625, "loss_aux_layer_3": 0.0487060546875, "loss_aux_layer_4": 0.05126953125, "loss_aux_layer_5": 0.05242919921875, "loss_aux_layer_6": 0.05523681640625, "loss_aux_layer_7": 0.05389404296875, "loss_aux_layer_8": 0.053466796875, "loss_aux_layer_9": 0.05242919921875, "step": 4881, "total_loss": 0.639007955789566 }, { "epoch": 0.9665412789546625, "grad_norm": 0.726937472820282, "learning_rate": 5e-05, "llm_loss": 0.47735314071178436, "loss": 2.2187, "loss_aux_layer_0": 0.0101776123046875, "loss_aux_layer_1": 0.028289794921875, "loss_aux_layer_10": 0.0548095703125, "loss_aux_layer_11": 0.058837890625, "loss_aux_layer_12": 0.06280517578125, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.076171875, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.093017578125, "loss_aux_layer_17": 0.1005859375, "loss_aux_layer_18": 0.108154296875, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.03997802734375, "loss_aux_layer_20": 0.119873046875, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.0498046875, "loss_aux_layer_4": 0.05224609375, "loss_aux_layer_5": 0.05377197265625, "loss_aux_layer_6": 0.05657958984375, "loss_aux_layer_7": 0.054931640625, "loss_aux_layer_8": 0.05450439453125, "loss_aux_layer_9": 0.05352783203125, "step": 4882, "total_loss": 0.5546679124236107 }, { "epoch": 0.9667392595525639, "grad_norm": 1.0023202896118164, "learning_rate": 5e-05, "llm_loss": 0.543674111366272, "loss": 2.4949, "loss_aux_layer_0": 0.010101318359375, "loss_aux_layer_1": 0.028564453125, "loss_aux_layer_10": 0.0560302734375, "loss_aux_layer_11": 0.06011962890625, "loss_aux_layer_12": 0.0648193359375, "loss_aux_layer_13": 0.070556640625, "loss_aux_layer_14": 0.0794677734375, "loss_aux_layer_15": 0.0880126953125, "loss_aux_layer_16": 0.097900390625, "loss_aux_layer_17": 0.1055908203125, "loss_aux_layer_18": 0.1141357421875, "loss_aux_layer_19": 0.1177978515625, "loss_aux_layer_2": 0.0404052734375, "loss_aux_layer_20": 0.1253662109375, "loss_aux_layer_21": 0.134033203125, "loss_aux_layer_22": 0.154541015625, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.05029296875, "loss_aux_layer_4": 0.0528564453125, "loss_aux_layer_5": 0.05438232421875, "loss_aux_layer_6": 0.0574951171875, "loss_aux_layer_7": 0.055908203125, "loss_aux_layer_8": 0.05548095703125, "loss_aux_layer_9": 0.05462646484375, "step": 4883, "total_loss": 0.6237262636423111 }, { "epoch": 0.9669372401504652, "grad_norm": 0.8273791670799255, "learning_rate": 5e-05, "llm_loss": 0.5836861804127693, "loss": 2.6447, "loss_aux_layer_0": 0.010009765625, "loss_aux_layer_1": 0.027862548828125, "loss_aux_layer_10": 0.0540771484375, "loss_aux_layer_11": 0.05780029296875, "loss_aux_layer_12": 0.0623779296875, "loss_aux_layer_13": 0.06787109375, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.0850830078125, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.1104736328125, "loss_aux_layer_19": 0.113525390625, "loss_aux_layer_2": 0.0390625, "loss_aux_layer_20": 0.1214599609375, "loss_aux_layer_21": 0.1297607421875, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.04864501953125, "loss_aux_layer_4": 0.05120849609375, "loss_aux_layer_5": 0.052734375, "loss_aux_layer_6": 0.05584716796875, "loss_aux_layer_7": 0.054443359375, "loss_aux_layer_8": 0.05401611328125, "loss_aux_layer_9": 0.0528564453125, "step": 4884, "total_loss": 0.6611678451299667 }, { "epoch": 0.9671352207483667, "grad_norm": 0.8831263780593872, "learning_rate": 5e-05, "llm_loss": 0.5189106911420822, "loss": 2.3772, "loss_aux_layer_0": 0.0102081298828125, "loss_aux_layer_1": 0.02813720703125, "loss_aux_layer_10": 0.05364990234375, "loss_aux_layer_11": 0.057373046875, "loss_aux_layer_12": 0.06109619140625, "loss_aux_layer_13": 0.06610107421875, "loss_aux_layer_14": 0.073486328125, "loss_aux_layer_15": 0.0810546875, "loss_aux_layer_16": 0.0897216796875, "loss_aux_layer_17": 0.0972900390625, "loss_aux_layer_18": 0.105224609375, "loss_aux_layer_19": 0.1087646484375, "loss_aux_layer_2": 0.0391845703125, "loss_aux_layer_20": 0.11669921875, "loss_aux_layer_21": 0.124755859375, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.18212890625, "loss_aux_layer_3": 0.04852294921875, "loss_aux_layer_4": 0.05120849609375, "loss_aux_layer_5": 0.05267333984375, "loss_aux_layer_6": 0.05548095703125, "loss_aux_layer_7": 0.053955078125, "loss_aux_layer_8": 0.0533447265625, "loss_aux_layer_9": 0.0521240234375, "step": 4885, "total_loss": 0.5942960381507874 }, { "epoch": 0.9673332013462681, "grad_norm": 0.7574895024299622, "learning_rate": 5e-05, "llm_loss": 0.5204946100711823, "loss": 2.3939, "loss_aux_layer_0": 0.00982666015625, "loss_aux_layer_1": 0.029327392578125, "loss_aux_layer_10": 0.05487060546875, "loss_aux_layer_11": 0.05877685546875, "loss_aux_layer_12": 0.06317138671875, "loss_aux_layer_13": 0.0684814453125, "loss_aux_layer_14": 0.07666015625, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.0936279296875, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.1090087890625, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.04150390625, "loss_aux_layer_20": 0.11962890625, "loss_aux_layer_21": 0.1282958984375, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.0513916015625, "loss_aux_layer_4": 0.05389404296875, "loss_aux_layer_5": 0.05517578125, "loss_aux_layer_6": 0.0577392578125, "loss_aux_layer_7": 0.055908203125, "loss_aux_layer_8": 0.05511474609375, "loss_aux_layer_9": 0.0537109375, "step": 4886, "total_loss": 0.5984829217195511 }, { "epoch": 0.9675311819441694, "grad_norm": 0.900626003742218, "learning_rate": 5e-05, "llm_loss": 0.5891132205724716, "loss": 2.6608, "loss_aux_layer_0": 0.0100555419921875, "loss_aux_layer_1": 0.027984619140625, "loss_aux_layer_10": 0.05438232421875, "loss_aux_layer_11": 0.0577392578125, "loss_aux_layer_12": 0.06158447265625, "loss_aux_layer_13": 0.0665283203125, "loss_aux_layer_14": 0.07421875, "loss_aux_layer_15": 0.08251953125, "loss_aux_layer_16": 0.09130859375, "loss_aux_layer_17": 0.0989990234375, "loss_aux_layer_18": 0.106689453125, "loss_aux_layer_19": 0.1099853515625, "loss_aux_layer_2": 0.039794921875, "loss_aux_layer_20": 0.1180419921875, "loss_aux_layer_21": 0.125732421875, "loss_aux_layer_22": 0.14501953125, "loss_aux_layer_23": 0.180419921875, "loss_aux_layer_3": 0.0494384765625, "loss_aux_layer_4": 0.05206298828125, "loss_aux_layer_5": 0.0537109375, "loss_aux_layer_6": 0.05682373046875, "loss_aux_layer_7": 0.05499267578125, "loss_aux_layer_8": 0.054443359375, "loss_aux_layer_9": 0.0533447265625, "step": 4887, "total_loss": 0.6651916801929474 }, { "epoch": 0.9677291625420709, "grad_norm": 0.9723854064941406, "learning_rate": 5e-05, "llm_loss": 0.42546720802783966, "loss": 2.0154, "loss_aux_layer_0": 0.009674072265625, "loss_aux_layer_1": 0.028564453125, "loss_aux_layer_10": 0.05523681640625, "loss_aux_layer_11": 0.059326171875, "loss_aux_layer_12": 0.063720703125, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.0860595703125, "loss_aux_layer_16": 0.0953369140625, "loss_aux_layer_17": 0.102783203125, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.04010009765625, "loss_aux_layer_20": 0.121826171875, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.0499267578125, "loss_aux_layer_4": 0.0523681640625, "loss_aux_layer_5": 0.05413818359375, "loss_aux_layer_6": 0.05706787109375, "loss_aux_layer_7": 0.05535888671875, "loss_aux_layer_8": 0.05487060546875, "loss_aux_layer_9": 0.0538330078125, "step": 4888, "total_loss": 0.5038607642054558 }, { "epoch": 0.9679271431399723, "grad_norm": 0.8257967233657837, "learning_rate": 5e-05, "llm_loss": 0.5820027887821198, "loss": 2.6191, "loss_aux_layer_0": 0.0100555419921875, "loss_aux_layer_1": 0.0272216796875, "loss_aux_layer_10": 0.05059814453125, "loss_aux_layer_11": 0.053955078125, "loss_aux_layer_12": 0.05792236328125, "loss_aux_layer_13": 0.06292724609375, "loss_aux_layer_14": 0.07073974609375, "loss_aux_layer_15": 0.0784912109375, "loss_aux_layer_16": 0.08740234375, "loss_aux_layer_17": 0.0950927734375, "loss_aux_layer_18": 0.1033935546875, "loss_aux_layer_19": 0.1075439453125, "loss_aux_layer_2": 0.037017822265625, "loss_aux_layer_20": 0.115234375, "loss_aux_layer_21": 0.122802734375, "loss_aux_layer_22": 0.14208984375, "loss_aux_layer_23": 0.177734375, "loss_aux_layer_3": 0.046142578125, "loss_aux_layer_4": 0.0482177734375, "loss_aux_layer_5": 0.0496826171875, "loss_aux_layer_6": 0.05206298828125, "loss_aux_layer_7": 0.050537109375, "loss_aux_layer_8": 0.05023193359375, "loss_aux_layer_9": 0.0494384765625, "step": 4889, "total_loss": 0.6547703146934509 }, { "epoch": 0.9681251237378737, "grad_norm": 0.8662002086639404, "learning_rate": 5e-05, "llm_loss": 0.5728522464632988, "loss": 2.5884, "loss_aux_layer_0": 0.010894775390625, "loss_aux_layer_1": 0.026611328125, "loss_aux_layer_10": 0.05108642578125, "loss_aux_layer_11": 0.0546875, "loss_aux_layer_12": 0.0587158203125, "loss_aux_layer_13": 0.06378173828125, "loss_aux_layer_14": 0.07177734375, "loss_aux_layer_15": 0.080078125, "loss_aux_layer_16": 0.089111328125, "loss_aux_layer_17": 0.096923828125, "loss_aux_layer_18": 0.105712890625, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.037109375, "loss_aux_layer_20": 0.1177978515625, "loss_aux_layer_21": 0.126708984375, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.04638671875, "loss_aux_layer_4": 0.04864501953125, "loss_aux_layer_5": 0.05035400390625, "loss_aux_layer_6": 0.05316162109375, "loss_aux_layer_7": 0.051513671875, "loss_aux_layer_8": 0.05096435546875, "loss_aux_layer_9": 0.050048828125, "step": 4890, "total_loss": 0.6470920592546463 }, { "epoch": 0.9683231043357751, "grad_norm": 1.110393762588501, "learning_rate": 5e-05, "llm_loss": 0.5061319693922997, "loss": 2.3406, "loss_aux_layer_0": 0.0098876953125, "loss_aux_layer_1": 0.028533935546875, "loss_aux_layer_10": 0.05523681640625, "loss_aux_layer_11": 0.05908203125, "loss_aux_layer_12": 0.06365966796875, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.0970458984375, "loss_aux_layer_17": 0.1046142578125, "loss_aux_layer_18": 0.11279296875, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.0401611328125, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.132080078125, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.189697265625, "loss_aux_layer_3": 0.0496826171875, "loss_aux_layer_4": 0.05224609375, "loss_aux_layer_5": 0.0538330078125, "loss_aux_layer_6": 0.05706787109375, "loss_aux_layer_7": 0.05535888671875, "loss_aux_layer_8": 0.05511474609375, "loss_aux_layer_9": 0.053955078125, "step": 4891, "total_loss": 0.5851411074399948 }, { "epoch": 0.9685210849336765, "grad_norm": 0.9677834510803223, "learning_rate": 5e-05, "llm_loss": 0.5692508071660995, "loss": 2.5752, "loss_aux_layer_0": 0.0107879638671875, "loss_aux_layer_1": 0.027923583984375, "loss_aux_layer_10": 0.05181884765625, "loss_aux_layer_11": 0.05535888671875, "loss_aux_layer_12": 0.0592041015625, "loss_aux_layer_13": 0.0643310546875, "loss_aux_layer_14": 0.072265625, "loss_aux_layer_15": 0.080322265625, "loss_aux_layer_16": 0.08935546875, "loss_aux_layer_17": 0.0965576171875, "loss_aux_layer_18": 0.10498046875, "loss_aux_layer_19": 0.1085205078125, "loss_aux_layer_2": 0.038818359375, "loss_aux_layer_20": 0.1170654296875, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.1455078125, "loss_aux_layer_23": 0.18212890625, "loss_aux_layer_3": 0.0482177734375, "loss_aux_layer_4": 0.05029296875, "loss_aux_layer_5": 0.0516357421875, "loss_aux_layer_6": 0.05419921875, "loss_aux_layer_7": 0.05242919921875, "loss_aux_layer_8": 0.052001953125, "loss_aux_layer_9": 0.05078125, "step": 4892, "total_loss": 0.6438076496124268 }, { "epoch": 0.9687190655315779, "grad_norm": 1.0835201740264893, "learning_rate": 5e-05, "llm_loss": 0.5008640885353088, "loss": 2.3073, "loss_aux_layer_0": 0.010650634765625, "loss_aux_layer_1": 0.026947021484375, "loss_aux_layer_10": 0.05194091796875, "loss_aux_layer_11": 0.055419921875, "loss_aux_layer_12": 0.05963134765625, "loss_aux_layer_13": 0.06488037109375, "loss_aux_layer_14": 0.07373046875, "loss_aux_layer_15": 0.0826416015625, "loss_aux_layer_16": 0.09228515625, "loss_aux_layer_17": 0.10009765625, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.03790283203125, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.04742431640625, "loss_aux_layer_4": 0.04949951171875, "loss_aux_layer_5": 0.05096435546875, "loss_aux_layer_6": 0.05419921875, "loss_aux_layer_7": 0.0523681640625, "loss_aux_layer_8": 0.05181884765625, "loss_aux_layer_9": 0.05096435546875, "step": 4893, "total_loss": 0.5768251270055771 }, { "epoch": 0.9689170461294793, "grad_norm": 0.9183312058448792, "learning_rate": 5e-05, "llm_loss": 0.6428080946207047, "loss": 2.8702, "loss_aux_layer_0": 0.0111846923828125, "loss_aux_layer_1": 0.0274658203125, "loss_aux_layer_10": 0.051513671875, "loss_aux_layer_11": 0.0550537109375, "loss_aux_layer_12": 0.05914306640625, "loss_aux_layer_13": 0.0643310546875, "loss_aux_layer_14": 0.072509765625, "loss_aux_layer_15": 0.08056640625, "loss_aux_layer_16": 0.0902099609375, "loss_aux_layer_17": 0.0980224609375, "loss_aux_layer_18": 0.1060791015625, "loss_aux_layer_19": 0.1097412109375, "loss_aux_layer_2": 0.03887939453125, "loss_aux_layer_20": 0.1177978515625, "loss_aux_layer_21": 0.1260986328125, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.181884765625, "loss_aux_layer_3": 0.048095703125, "loss_aux_layer_4": 0.05023193359375, "loss_aux_layer_5": 0.05133056640625, "loss_aux_layer_6": 0.0538330078125, "loss_aux_layer_7": 0.05206298828125, "loss_aux_layer_8": 0.05145263671875, "loss_aux_layer_9": 0.05035400390625, "step": 4894, "total_loss": 0.7175616770982742 }, { "epoch": 0.9691150267273807, "grad_norm": 0.846724808216095, "learning_rate": 5e-05, "llm_loss": 0.5449923723936081, "loss": 2.4775, "loss_aux_layer_0": 0.0107574462890625, "loss_aux_layer_1": 0.027435302734375, "loss_aux_layer_10": 0.0518798828125, "loss_aux_layer_11": 0.05572509765625, "loss_aux_layer_12": 0.05963134765625, "loss_aux_layer_13": 0.0645751953125, "loss_aux_layer_14": 0.0726318359375, "loss_aux_layer_15": 0.080810546875, "loss_aux_layer_16": 0.089599609375, "loss_aux_layer_17": 0.096923828125, "loss_aux_layer_18": 0.105224609375, "loss_aux_layer_19": 0.1085205078125, "loss_aux_layer_2": 0.03826904296875, "loss_aux_layer_20": 0.1162109375, "loss_aux_layer_21": 0.125, "loss_aux_layer_22": 0.14453125, "loss_aux_layer_23": 0.1806640625, "loss_aux_layer_3": 0.047607421875, "loss_aux_layer_4": 0.04962158203125, "loss_aux_layer_5": 0.0511474609375, "loss_aux_layer_6": 0.05377197265625, "loss_aux_layer_7": 0.0521240234375, "loss_aux_layer_8": 0.05181884765625, "loss_aux_layer_9": 0.05078125, "step": 4895, "total_loss": 0.619382232427597 }, { "epoch": 0.9693130073252821, "grad_norm": 0.8787368535995483, "learning_rate": 5e-05, "llm_loss": 0.5784080028533936, "loss": 2.6197, "loss_aux_layer_0": 0.0103302001953125, "loss_aux_layer_1": 0.0289306640625, "loss_aux_layer_10": 0.0535888671875, "loss_aux_layer_11": 0.05743408203125, "loss_aux_layer_12": 0.061767578125, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.07568359375, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.09326171875, "loss_aux_layer_17": 0.1009521484375, "loss_aux_layer_18": 0.1090087890625, "loss_aux_layer_19": 0.1123046875, "loss_aux_layer_2": 0.0399169921875, "loss_aux_layer_20": 0.119873046875, "loss_aux_layer_21": 0.126708984375, "loss_aux_layer_22": 0.14599609375, "loss_aux_layer_23": 0.18017578125, "loss_aux_layer_3": 0.049560546875, "loss_aux_layer_4": 0.05157470703125, "loss_aux_layer_5": 0.05291748046875, "loss_aux_layer_6": 0.05560302734375, "loss_aux_layer_7": 0.053955078125, "loss_aux_layer_8": 0.053466796875, "loss_aux_layer_9": 0.05224609375, "step": 4896, "total_loss": 0.6549331992864609 }, { "epoch": 0.9695109879231836, "grad_norm": 0.8488624095916748, "learning_rate": 5e-05, "llm_loss": 0.6301024854183197, "loss": 2.8301, "loss_aux_layer_0": 0.009857177734375, "loss_aux_layer_1": 0.02984619140625, "loss_aux_layer_10": 0.05523681640625, "loss_aux_layer_11": 0.05926513671875, "loss_aux_layer_12": 0.0635986328125, "loss_aux_layer_13": 0.06866455078125, "loss_aux_layer_14": 0.0762939453125, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.0927734375, "loss_aux_layer_17": 0.1007080078125, "loss_aux_layer_18": 0.1080322265625, "loss_aux_layer_19": 0.111083984375, "loss_aux_layer_2": 0.04168701171875, "loss_aux_layer_20": 0.1187744140625, "loss_aux_layer_21": 0.125732421875, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.181640625, "loss_aux_layer_3": 0.051513671875, "loss_aux_layer_4": 0.053955078125, "loss_aux_layer_5": 0.05523681640625, "loss_aux_layer_6": 0.057861328125, "loss_aux_layer_7": 0.05609130859375, "loss_aux_layer_8": 0.05535888671875, "loss_aux_layer_9": 0.0540771484375, "step": 4897, "total_loss": 0.7075313180685043 }, { "epoch": 0.9697089685210849, "grad_norm": 0.861473560333252, "learning_rate": 5e-05, "llm_loss": 0.5536406934261322, "loss": 2.5173, "loss_aux_layer_0": 0.0097503662109375, "loss_aux_layer_1": 0.028045654296875, "loss_aux_layer_10": 0.05303955078125, "loss_aux_layer_11": 0.0570068359375, "loss_aux_layer_12": 0.06146240234375, "loss_aux_layer_13": 0.066650390625, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.099853515625, "loss_aux_layer_18": 0.1072998046875, "loss_aux_layer_19": 0.110107421875, "loss_aux_layer_2": 0.03900146484375, "loss_aux_layer_20": 0.1175537109375, "loss_aux_layer_21": 0.124755859375, "loss_aux_layer_22": 0.145263671875, "loss_aux_layer_23": 0.1806640625, "loss_aux_layer_3": 0.04852294921875, "loss_aux_layer_4": 0.051025390625, "loss_aux_layer_5": 0.05218505859375, "loss_aux_layer_6": 0.05487060546875, "loss_aux_layer_7": 0.05328369140625, "loss_aux_layer_8": 0.052734375, "loss_aux_layer_9": 0.0516357421875, "step": 4898, "total_loss": 0.6293181777000427 }, { "epoch": 0.9699069491189863, "grad_norm": 0.7644956111907959, "learning_rate": 5e-05, "llm_loss": 0.5762242376804352, "loss": 2.6071, "loss_aux_layer_0": 0.010284423828125, "loss_aux_layer_1": 0.02874755859375, "loss_aux_layer_10": 0.05377197265625, "loss_aux_layer_11": 0.05731201171875, "loss_aux_layer_12": 0.061279296875, "loss_aux_layer_13": 0.06591796875, "loss_aux_layer_14": 0.0736083984375, "loss_aux_layer_15": 0.0816650390625, "loss_aux_layer_16": 0.0908203125, "loss_aux_layer_17": 0.098388671875, "loss_aux_layer_18": 0.1064453125, "loss_aux_layer_19": 0.1094970703125, "loss_aux_layer_2": 0.03973388671875, "loss_aux_layer_20": 0.1168212890625, "loss_aux_layer_21": 0.1241455078125, "loss_aux_layer_22": 0.14404296875, "loss_aux_layer_23": 0.178955078125, "loss_aux_layer_3": 0.04931640625, "loss_aux_layer_4": 0.05181884765625, "loss_aux_layer_5": 0.05352783203125, "loss_aux_layer_6": 0.05645751953125, "loss_aux_layer_7": 0.0545654296875, "loss_aux_layer_8": 0.0538330078125, "loss_aux_layer_9": 0.052490234375, "step": 4899, "total_loss": 0.6517870873212814 }, { "epoch": 0.9701049297168878, "grad_norm": 1.0158822536468506, "learning_rate": 5e-05, "llm_loss": 0.6171470433473587, "loss": 2.7759, "loss_aux_layer_0": 0.0110321044921875, "loss_aux_layer_1": 0.02764892578125, "loss_aux_layer_10": 0.05364990234375, "loss_aux_layer_11": 0.0570068359375, "loss_aux_layer_12": 0.0609130859375, "loss_aux_layer_13": 0.066162109375, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.0999755859375, "loss_aux_layer_18": 0.1082763671875, "loss_aux_layer_19": 0.1121826171875, "loss_aux_layer_2": 0.0399169921875, "loss_aux_layer_20": 0.1201171875, "loss_aux_layer_21": 0.1287841796875, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.04925537109375, "loss_aux_layer_4": 0.05145263671875, "loss_aux_layer_5": 0.05279541015625, "loss_aux_layer_6": 0.05572509765625, "loss_aux_layer_7": 0.053955078125, "loss_aux_layer_8": 0.05352783203125, "loss_aux_layer_9": 0.052490234375, "step": 4900, "total_loss": 0.6939747184514999 }, { "epoch": 0.9703029103147891, "grad_norm": 0.9139134287834167, "learning_rate": 5e-05, "llm_loss": 0.5081362724304199, "loss": 2.331, "loss_aux_layer_0": 0.009552001953125, "loss_aux_layer_1": 0.027496337890625, "loss_aux_layer_10": 0.0521240234375, "loss_aux_layer_11": 0.0557861328125, "loss_aux_layer_12": 0.06024169921875, "loss_aux_layer_13": 0.06585693359375, "loss_aux_layer_14": 0.0736083984375, "loss_aux_layer_15": 0.08154296875, "loss_aux_layer_16": 0.090576171875, "loss_aux_layer_17": 0.0975341796875, "loss_aux_layer_18": 0.1053466796875, "loss_aux_layer_19": 0.1087646484375, "loss_aux_layer_2": 0.03875732421875, "loss_aux_layer_20": 0.1162109375, "loss_aux_layer_21": 0.1246337890625, "loss_aux_layer_22": 0.144287109375, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.04803466796875, "loss_aux_layer_4": 0.050048828125, "loss_aux_layer_5": 0.05120849609375, "loss_aux_layer_6": 0.053955078125, "loss_aux_layer_7": 0.05230712890625, "loss_aux_layer_8": 0.05194091796875, "loss_aux_layer_9": 0.05084228515625, "step": 4901, "total_loss": 0.5827381610870361 }, { "epoch": 0.9705008909126905, "grad_norm": 0.9169842600822449, "learning_rate": 5e-05, "llm_loss": 0.5710373520851135, "loss": 2.5937, "loss_aux_layer_0": 0.010223388671875, "loss_aux_layer_1": 0.02880859375, "loss_aux_layer_10": 0.05462646484375, "loss_aux_layer_11": 0.05841064453125, "loss_aux_layer_12": 0.0628662109375, "loss_aux_layer_13": 0.0679931640625, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.08447265625, "loss_aux_layer_16": 0.09326171875, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.10888671875, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.04107666015625, "loss_aux_layer_20": 0.1192626953125, "loss_aux_layer_21": 0.1275634765625, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.0504150390625, "loss_aux_layer_4": 0.05267333984375, "loss_aux_layer_5": 0.0538330078125, "loss_aux_layer_6": 0.0567626953125, "loss_aux_layer_7": 0.0550537109375, "loss_aux_layer_8": 0.05450439453125, "loss_aux_layer_9": 0.05328369140625, "step": 4902, "total_loss": 0.6484226882457733 }, { "epoch": 0.970698871510592, "grad_norm": 0.9011296033859253, "learning_rate": 5e-05, "llm_loss": 0.5289248377084732, "loss": 2.4106, "loss_aux_layer_0": 0.009613037109375, "loss_aux_layer_1": 0.0274658203125, "loss_aux_layer_10": 0.05218505859375, "loss_aux_layer_11": 0.05596923828125, "loss_aux_layer_12": 0.059814453125, "loss_aux_layer_13": 0.0645751953125, "loss_aux_layer_14": 0.0718994140625, "loss_aux_layer_15": 0.0794677734375, "loss_aux_layer_16": 0.0880126953125, "loss_aux_layer_17": 0.0953369140625, "loss_aux_layer_18": 0.1031494140625, "loss_aux_layer_19": 0.106689453125, "loss_aux_layer_2": 0.03814697265625, "loss_aux_layer_20": 0.11474609375, "loss_aux_layer_21": 0.12255859375, "loss_aux_layer_22": 0.142822265625, "loss_aux_layer_23": 0.177490234375, "loss_aux_layer_3": 0.04766845703125, "loss_aux_layer_4": 0.0499267578125, "loss_aux_layer_5": 0.05120849609375, "loss_aux_layer_6": 0.05413818359375, "loss_aux_layer_7": 0.05267333984375, "loss_aux_layer_8": 0.05218505859375, "loss_aux_layer_9": 0.0511474609375, "step": 4903, "total_loss": 0.6026404201984406 }, { "epoch": 0.9708968521084934, "grad_norm": 1.102471947669983, "learning_rate": 5e-05, "llm_loss": 0.6520604193210602, "loss": 2.912, "loss_aux_layer_0": 0.0106048583984375, "loss_aux_layer_1": 0.028045654296875, "loss_aux_layer_10": 0.05340576171875, "loss_aux_layer_11": 0.05706787109375, "loss_aux_layer_12": 0.0609130859375, "loss_aux_layer_13": 0.0660400390625, "loss_aux_layer_14": 0.0738525390625, "loss_aux_layer_15": 0.0821533203125, "loss_aux_layer_16": 0.0909423828125, "loss_aux_layer_17": 0.0989990234375, "loss_aux_layer_18": 0.107666015625, "loss_aux_layer_19": 0.111572265625, "loss_aux_layer_2": 0.03955078125, "loss_aux_layer_20": 0.1190185546875, "loss_aux_layer_21": 0.1265869140625, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.049072265625, "loss_aux_layer_4": 0.05126953125, "loss_aux_layer_5": 0.05255126953125, "loss_aux_layer_6": 0.05535888671875, "loss_aux_layer_7": 0.05377197265625, "loss_aux_layer_8": 0.05322265625, "loss_aux_layer_9": 0.05218505859375, "step": 4904, "total_loss": 0.728011429309845 }, { "epoch": 0.9710948327063947, "grad_norm": 1.0220845937728882, "learning_rate": 5e-05, "llm_loss": 0.5884801298379898, "loss": 2.6708, "loss_aux_layer_0": 0.009613037109375, "loss_aux_layer_1": 0.029815673828125, "loss_aux_layer_10": 0.0565185546875, "loss_aux_layer_11": 0.06072998046875, "loss_aux_layer_12": 0.065185546875, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.0960693359375, "loss_aux_layer_17": 0.103515625, "loss_aux_layer_18": 0.1114501953125, "loss_aux_layer_19": 0.114013671875, "loss_aux_layer_2": 0.04217529296875, "loss_aux_layer_20": 0.1214599609375, "loss_aux_layer_21": 0.1287841796875, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.05194091796875, "loss_aux_layer_4": 0.05474853515625, "loss_aux_layer_5": 0.05596923828125, "loss_aux_layer_6": 0.05908203125, "loss_aux_layer_7": 0.05755615234375, "loss_aux_layer_8": 0.05670166015625, "loss_aux_layer_9": 0.05535888671875, "step": 4905, "total_loss": 0.6676905453205109 }, { "epoch": 0.9712928133042962, "grad_norm": 0.9320259094238281, "learning_rate": 5e-05, "llm_loss": 0.5623399168252945, "loss": 2.5576, "loss_aux_layer_0": 0.01068115234375, "loss_aux_layer_1": 0.029083251953125, "loss_aux_layer_10": 0.0555419921875, "loss_aux_layer_11": 0.0594482421875, "loss_aux_layer_12": 0.06353759765625, "loss_aux_layer_13": 0.068359375, "loss_aux_layer_14": 0.0758056640625, "loss_aux_layer_15": 0.0836181640625, "loss_aux_layer_16": 0.09228515625, "loss_aux_layer_17": 0.099365234375, "loss_aux_layer_18": 0.1072998046875, "loss_aux_layer_19": 0.1103515625, "loss_aux_layer_2": 0.0408935546875, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.1260986328125, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.05059814453125, "loss_aux_layer_4": 0.05303955078125, "loss_aux_layer_5": 0.0543212890625, "loss_aux_layer_6": 0.05718994140625, "loss_aux_layer_7": 0.05560302734375, "loss_aux_layer_8": 0.05517578125, "loss_aux_layer_9": 0.05413818359375, "step": 4906, "total_loss": 0.6394010186195374 }, { "epoch": 0.9714907939021976, "grad_norm": 1.0350230932235718, "learning_rate": 5e-05, "llm_loss": 0.600477322936058, "loss": 2.7109, "loss_aux_layer_0": 0.0095062255859375, "loss_aux_layer_1": 0.029266357421875, "loss_aux_layer_10": 0.05548095703125, "loss_aux_layer_11": 0.05908203125, "loss_aux_layer_12": 0.06329345703125, "loss_aux_layer_13": 0.068359375, "loss_aux_layer_14": 0.076171875, "loss_aux_layer_15": 0.083984375, "loss_aux_layer_16": 0.092529296875, "loss_aux_layer_17": 0.0994873046875, "loss_aux_layer_18": 0.1072998046875, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.0413818359375, "loss_aux_layer_20": 0.1175537109375, "loss_aux_layer_21": 0.125732421875, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.05096435546875, "loss_aux_layer_4": 0.05352783203125, "loss_aux_layer_5": 0.0548095703125, "loss_aux_layer_6": 0.05767822265625, "loss_aux_layer_7": 0.05609130859375, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.05438232421875, "step": 4907, "total_loss": 0.6777227222919464 }, { "epoch": 0.9716887745000989, "grad_norm": 1.0448964834213257, "learning_rate": 5e-05, "llm_loss": 0.6699711084365845, "loss": 2.9819, "loss_aux_layer_0": 0.009735107421875, "loss_aux_layer_1": 0.029266357421875, "loss_aux_layer_10": 0.0533447265625, "loss_aux_layer_11": 0.05731201171875, "loss_aux_layer_12": 0.06134033203125, "loss_aux_layer_13": 0.06640625, "loss_aux_layer_14": 0.07421875, "loss_aux_layer_15": 0.081787109375, "loss_aux_layer_16": 0.0904541015625, "loss_aux_layer_17": 0.097900390625, "loss_aux_layer_18": 0.105712890625, "loss_aux_layer_19": 0.1090087890625, "loss_aux_layer_2": 0.040283203125, "loss_aux_layer_20": 0.1165771484375, "loss_aux_layer_21": 0.124267578125, "loss_aux_layer_22": 0.14404296875, "loss_aux_layer_23": 0.178466796875, "loss_aux_layer_3": 0.049560546875, "loss_aux_layer_4": 0.0521240234375, "loss_aux_layer_5": 0.05340576171875, "loss_aux_layer_6": 0.05621337890625, "loss_aux_layer_7": 0.0543212890625, "loss_aux_layer_8": 0.0537109375, "loss_aux_layer_9": 0.05242919921875, "step": 4908, "total_loss": 0.7454808354377747 }, { "epoch": 0.9718867550980004, "grad_norm": 1.0198993682861328, "learning_rate": 5e-05, "llm_loss": 0.6686353236436844, "loss": 2.9806, "loss_aux_layer_0": 0.010162353515625, "loss_aux_layer_1": 0.028717041015625, "loss_aux_layer_10": 0.0540771484375, "loss_aux_layer_11": 0.05767822265625, "loss_aux_layer_12": 0.06207275390625, "loss_aux_layer_13": 0.0673828125, "loss_aux_layer_14": 0.0758056640625, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.1014404296875, "loss_aux_layer_18": 0.109130859375, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.0396728515625, "loss_aux_layer_20": 0.119140625, "loss_aux_layer_21": 0.1259765625, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.18017578125, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.05157470703125, "loss_aux_layer_5": 0.052978515625, "loss_aux_layer_6": 0.0557861328125, "loss_aux_layer_7": 0.0540771484375, "loss_aux_layer_8": 0.05364990234375, "loss_aux_layer_9": 0.05255126953125, "step": 4909, "total_loss": 0.7451421767473221 }, { "epoch": 0.9720847356959018, "grad_norm": 0.9870590567588806, "learning_rate": 5e-05, "llm_loss": 0.608919307589531, "loss": 2.7283, "loss_aux_layer_0": 0.0102996826171875, "loss_aux_layer_1": 0.026397705078125, "loss_aux_layer_10": 0.04931640625, "loss_aux_layer_11": 0.05291748046875, "loss_aux_layer_12": 0.0567626953125, "loss_aux_layer_13": 0.0616455078125, "loss_aux_layer_14": 0.06982421875, "loss_aux_layer_15": 0.0782470703125, "loss_aux_layer_16": 0.08740234375, "loss_aux_layer_17": 0.095703125, "loss_aux_layer_18": 0.103759765625, "loss_aux_layer_19": 0.108642578125, "loss_aux_layer_2": 0.036865234375, "loss_aux_layer_20": 0.1171875, "loss_aux_layer_21": 0.1270751953125, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.04559326171875, "loss_aux_layer_4": 0.047607421875, "loss_aux_layer_5": 0.04901123046875, "loss_aux_layer_6": 0.051513671875, "loss_aux_layer_7": 0.04986572265625, "loss_aux_layer_8": 0.0494384765625, "loss_aux_layer_9": 0.0482177734375, "step": 4910, "total_loss": 0.6820674538612366 }, { "epoch": 0.9722827162938033, "grad_norm": 0.9572519659996033, "learning_rate": 5e-05, "llm_loss": 0.5133978426456451, "loss": 2.3604, "loss_aux_layer_0": 0.011993408203125, "loss_aux_layer_1": 0.028167724609375, "loss_aux_layer_10": 0.053466796875, "loss_aux_layer_11": 0.0574951171875, "loss_aux_layer_12": 0.06170654296875, "loss_aux_layer_13": 0.0670166015625, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.0927734375, "loss_aux_layer_17": 0.1002197265625, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.03912353515625, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.04864501953125, "loss_aux_layer_4": 0.0511474609375, "loss_aux_layer_5": 0.052490234375, "loss_aux_layer_6": 0.0555419921875, "loss_aux_layer_7": 0.05377197265625, "loss_aux_layer_8": 0.05328369140625, "loss_aux_layer_9": 0.05224609375, "step": 4911, "total_loss": 0.5901041030883789 }, { "epoch": 0.9724806968917046, "grad_norm": 0.9926567077636719, "learning_rate": 5e-05, "llm_loss": 0.557675838470459, "loss": 2.5359, "loss_aux_layer_0": 0.0112457275390625, "loss_aux_layer_1": 0.0289306640625, "loss_aux_layer_10": 0.0538330078125, "loss_aux_layer_11": 0.05767822265625, "loss_aux_layer_12": 0.061767578125, "loss_aux_layer_13": 0.0670166015625, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.099853515625, "loss_aux_layer_18": 0.10791015625, "loss_aux_layer_19": 0.111083984375, "loss_aux_layer_2": 0.0399169921875, "loss_aux_layer_20": 0.1187744140625, "loss_aux_layer_21": 0.1259765625, "loss_aux_layer_22": 0.145263671875, "loss_aux_layer_23": 0.180419921875, "loss_aux_layer_3": 0.04949951171875, "loss_aux_layer_4": 0.05194091796875, "loss_aux_layer_5": 0.0533447265625, "loss_aux_layer_6": 0.05584716796875, "loss_aux_layer_7": 0.05413818359375, "loss_aux_layer_8": 0.0535888671875, "loss_aux_layer_9": 0.05255126953125, "step": 4912, "total_loss": 0.6339662820100784 }, { "epoch": 0.972678677489606, "grad_norm": 0.8605089783668518, "learning_rate": 5e-05, "llm_loss": 0.5590465441346169, "loss": 2.5352, "loss_aux_layer_0": 0.0113677978515625, "loss_aux_layer_1": 0.028350830078125, "loss_aux_layer_10": 0.05340576171875, "loss_aux_layer_11": 0.05670166015625, "loss_aux_layer_12": 0.0606689453125, "loss_aux_layer_13": 0.06536865234375, "loss_aux_layer_14": 0.0728759765625, "loss_aux_layer_15": 0.08056640625, "loss_aux_layer_16": 0.0889892578125, "loss_aux_layer_17": 0.0965576171875, "loss_aux_layer_18": 0.1038818359375, "loss_aux_layer_19": 0.1077880859375, "loss_aux_layer_2": 0.03948974609375, "loss_aux_layer_20": 0.1156005859375, "loss_aux_layer_21": 0.1234130859375, "loss_aux_layer_22": 0.142822265625, "loss_aux_layer_23": 0.177490234375, "loss_aux_layer_3": 0.049072265625, "loss_aux_layer_4": 0.0516357421875, "loss_aux_layer_5": 0.052978515625, "loss_aux_layer_6": 0.05548095703125, "loss_aux_layer_7": 0.05413818359375, "loss_aux_layer_8": 0.053466796875, "loss_aux_layer_9": 0.05218505859375, "step": 4913, "total_loss": 0.633811891078949 }, { "epoch": 0.9728766580875075, "grad_norm": 0.9398223161697388, "learning_rate": 5e-05, "llm_loss": 0.5592318847775459, "loss": 2.5292, "loss_aux_layer_0": 0.0134124755859375, "loss_aux_layer_1": 0.027801513671875, "loss_aux_layer_10": 0.0518798828125, "loss_aux_layer_11": 0.0552978515625, "loss_aux_layer_12": 0.0592041015625, "loss_aux_layer_13": 0.063720703125, "loss_aux_layer_14": 0.0712890625, "loss_aux_layer_15": 0.0789794921875, "loss_aux_layer_16": 0.0872802734375, "loss_aux_layer_17": 0.0947265625, "loss_aux_layer_18": 0.1026611328125, "loss_aux_layer_19": 0.1055908203125, "loss_aux_layer_2": 0.03851318359375, "loss_aux_layer_20": 0.1131591796875, "loss_aux_layer_21": 0.1207275390625, "loss_aux_layer_22": 0.13916015625, "loss_aux_layer_23": 0.173095703125, "loss_aux_layer_3": 0.04766845703125, "loss_aux_layer_4": 0.04986572265625, "loss_aux_layer_5": 0.05120849609375, "loss_aux_layer_6": 0.0535888671875, "loss_aux_layer_7": 0.05218505859375, "loss_aux_layer_8": 0.0517578125, "loss_aux_layer_9": 0.05078125, "step": 4914, "total_loss": 0.6322989165782928 }, { "epoch": 0.9730746386854088, "grad_norm": 0.8976902961730957, "learning_rate": 5e-05, "llm_loss": 0.5904100835323334, "loss": 2.6626, "loss_aux_layer_0": 0.01080322265625, "loss_aux_layer_1": 0.027191162109375, "loss_aux_layer_10": 0.052734375, "loss_aux_layer_11": 0.05633544921875, "loss_aux_layer_12": 0.06036376953125, "loss_aux_layer_13": 0.06536865234375, "loss_aux_layer_14": 0.0731201171875, "loss_aux_layer_15": 0.0811767578125, "loss_aux_layer_16": 0.09033203125, "loss_aux_layer_17": 0.0982666015625, "loss_aux_layer_18": 0.106201171875, "loss_aux_layer_19": 0.110595703125, "loss_aux_layer_2": 0.037841796875, "loss_aux_layer_20": 0.1187744140625, "loss_aux_layer_21": 0.127197265625, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.04730224609375, "loss_aux_layer_4": 0.04986572265625, "loss_aux_layer_5": 0.0513916015625, "loss_aux_layer_6": 0.05413818359375, "loss_aux_layer_7": 0.05267333984375, "loss_aux_layer_8": 0.05206298828125, "loss_aux_layer_9": 0.0511474609375, "step": 4915, "total_loss": 0.6656589806079865 }, { "epoch": 0.9732726192833102, "grad_norm": 0.8611388802528381, "learning_rate": 5e-05, "llm_loss": 0.5897209718823433, "loss": 2.6718, "loss_aux_layer_0": 0.0122833251953125, "loss_aux_layer_1": 0.02886962890625, "loss_aux_layer_10": 0.0552978515625, "loss_aux_layer_11": 0.0592041015625, "loss_aux_layer_12": 0.063232421875, "loss_aux_layer_13": 0.068359375, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.0850830078125, "loss_aux_layer_16": 0.0946044921875, "loss_aux_layer_17": 0.102294921875, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.11376953125, "loss_aux_layer_2": 0.04058837890625, "loss_aux_layer_20": 0.1217041015625, "loss_aux_layer_21": 0.1295166015625, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.05029296875, "loss_aux_layer_4": 0.05255126953125, "loss_aux_layer_5": 0.05419921875, "loss_aux_layer_6": 0.056884765625, "loss_aux_layer_7": 0.0552978515625, "loss_aux_layer_8": 0.054931640625, "loss_aux_layer_9": 0.05389404296875, "step": 4916, "total_loss": 0.6679607331752777 }, { "epoch": 0.9734705998812117, "grad_norm": 0.8842044472694397, "learning_rate": 5e-05, "llm_loss": 0.5897710099816322, "loss": 2.667, "loss_aux_layer_0": 0.0112152099609375, "loss_aux_layer_1": 0.029296875, "loss_aux_layer_10": 0.05511474609375, "loss_aux_layer_11": 0.0589599609375, "loss_aux_layer_12": 0.063232421875, "loss_aux_layer_13": 0.068359375, "loss_aux_layer_14": 0.076416015625, "loss_aux_layer_15": 0.0843505859375, "loss_aux_layer_16": 0.093017578125, "loss_aux_layer_17": 0.1002197265625, "loss_aux_layer_18": 0.1077880859375, "loss_aux_layer_19": 0.1107177734375, "loss_aux_layer_2": 0.0408935546875, "loss_aux_layer_20": 0.1180419921875, "loss_aux_layer_21": 0.12548828125, "loss_aux_layer_22": 0.14501953125, "loss_aux_layer_23": 0.17919921875, "loss_aux_layer_3": 0.05078125, "loss_aux_layer_4": 0.05303955078125, "loss_aux_layer_5": 0.05462646484375, "loss_aux_layer_6": 0.05731201171875, "loss_aux_layer_7": 0.055419921875, "loss_aux_layer_8": 0.054931640625, "loss_aux_layer_9": 0.05389404296875, "step": 4917, "total_loss": 0.6667593717575073 }, { "epoch": 0.9736685804791131, "grad_norm": 1.0792471170425415, "learning_rate": 5e-05, "llm_loss": 0.5614822804927826, "loss": 2.5475, "loss_aux_layer_0": 0.0108489990234375, "loss_aux_layer_1": 0.027923583984375, "loss_aux_layer_10": 0.05316162109375, "loss_aux_layer_11": 0.05706787109375, "loss_aux_layer_12": 0.06121826171875, "loss_aux_layer_13": 0.06640625, "loss_aux_layer_14": 0.0743408203125, "loss_aux_layer_15": 0.0821533203125, "loss_aux_layer_16": 0.0909423828125, "loss_aux_layer_17": 0.0987548828125, "loss_aux_layer_18": 0.1070556640625, "loss_aux_layer_19": 0.1099853515625, "loss_aux_layer_2": 0.03839111328125, "loss_aux_layer_20": 0.1171875, "loss_aux_layer_21": 0.125244140625, "loss_aux_layer_22": 0.14453125, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.048095703125, "loss_aux_layer_4": 0.05078125, "loss_aux_layer_5": 0.05224609375, "loss_aux_layer_6": 0.05487060546875, "loss_aux_layer_7": 0.053466796875, "loss_aux_layer_8": 0.0531005859375, "loss_aux_layer_9": 0.0518798828125, "step": 4918, "total_loss": 0.6368812322616577 }, { "epoch": 0.9738665610770144, "grad_norm": 1.0539830923080444, "learning_rate": 5e-05, "llm_loss": 0.6427130103111267, "loss": 2.893, "loss_aux_layer_0": 0.0102081298828125, "loss_aux_layer_1": 0.02923583984375, "loss_aux_layer_10": 0.05816650390625, "loss_aux_layer_11": 0.0621337890625, "loss_aux_layer_12": 0.0665283203125, "loss_aux_layer_13": 0.0721435546875, "loss_aux_layer_14": 0.0811767578125, "loss_aux_layer_15": 0.0899658203125, "loss_aux_layer_16": 0.099365234375, "loss_aux_layer_17": 0.1070556640625, "loss_aux_layer_18": 0.114501953125, "loss_aux_layer_19": 0.1170654296875, "loss_aux_layer_2": 0.04034423828125, "loss_aux_layer_20": 0.1240234375, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.050537109375, "loss_aux_layer_4": 0.0537109375, "loss_aux_layer_5": 0.0556640625, "loss_aux_layer_6": 0.0592041015625, "loss_aux_layer_7": 0.05780029296875, "loss_aux_layer_8": 0.0574951171875, "loss_aux_layer_9": 0.05645751953125, "step": 4919, "total_loss": 0.7232501804828644 }, { "epoch": 0.9740645416749159, "grad_norm": 1.2030279636383057, "learning_rate": 5e-05, "llm_loss": 0.6329531520605087, "loss": 2.8334, "loss_aux_layer_0": 0.010711669921875, "loss_aux_layer_1": 0.026611328125, "loss_aux_layer_10": 0.0521240234375, "loss_aux_layer_11": 0.05572509765625, "loss_aux_layer_12": 0.06005859375, "loss_aux_layer_13": 0.0653076171875, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.08251953125, "loss_aux_layer_16": 0.0919189453125, "loss_aux_layer_17": 0.1002197265625, "loss_aux_layer_18": 0.1082763671875, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.03790283203125, "loss_aux_layer_20": 0.118896484375, "loss_aux_layer_21": 0.126953125, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.047119140625, "loss_aux_layer_4": 0.0496826171875, "loss_aux_layer_5": 0.05120849609375, "loss_aux_layer_6": 0.05413818359375, "loss_aux_layer_7": 0.05224609375, "loss_aux_layer_8": 0.0516357421875, "loss_aux_layer_9": 0.05078125, "step": 4920, "total_loss": 0.7083389908075333 }, { "epoch": 0.9742625222728173, "grad_norm": 0.8400160670280457, "learning_rate": 5e-05, "llm_loss": 0.5522060245275497, "loss": 2.5081, "loss_aux_layer_0": 0.0099334716796875, "loss_aux_layer_1": 0.02783203125, "loss_aux_layer_10": 0.0528564453125, "loss_aux_layer_11": 0.05645751953125, "loss_aux_layer_12": 0.060546875, "loss_aux_layer_13": 0.0657958984375, "loss_aux_layer_14": 0.07373046875, "loss_aux_layer_15": 0.081787109375, "loss_aux_layer_16": 0.09033203125, "loss_aux_layer_17": 0.097900390625, "loss_aux_layer_18": 0.1058349609375, "loss_aux_layer_19": 0.1085205078125, "loss_aux_layer_2": 0.0389404296875, "loss_aux_layer_20": 0.11572265625, "loss_aux_layer_21": 0.1234130859375, "loss_aux_layer_22": 0.143310546875, "loss_aux_layer_23": 0.178466796875, "loss_aux_layer_3": 0.04833984375, "loss_aux_layer_4": 0.05078125, "loss_aux_layer_5": 0.0521240234375, "loss_aux_layer_6": 0.054931640625, "loss_aux_layer_7": 0.0533447265625, "loss_aux_layer_8": 0.0528564453125, "loss_aux_layer_9": 0.0517578125, "step": 4921, "total_loss": 0.6270260587334633 }, { "epoch": 0.9744605028707186, "grad_norm": 1.0075794458389282, "learning_rate": 5e-05, "llm_loss": 0.591157466173172, "loss": 2.6701, "loss_aux_layer_0": 0.0102386474609375, "loss_aux_layer_1": 0.028564453125, "loss_aux_layer_10": 0.05426025390625, "loss_aux_layer_11": 0.05810546875, "loss_aux_layer_12": 0.06231689453125, "loss_aux_layer_13": 0.0672607421875, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.099853515625, "loss_aux_layer_18": 0.1075439453125, "loss_aux_layer_19": 0.1107177734375, "loss_aux_layer_2": 0.039794921875, "loss_aux_layer_20": 0.118408203125, "loss_aux_layer_21": 0.125732421875, "loss_aux_layer_22": 0.144775390625, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.0494384765625, "loss_aux_layer_4": 0.05218505859375, "loss_aux_layer_5": 0.05364990234375, "loss_aux_layer_6": 0.0567626953125, "loss_aux_layer_7": 0.0550537109375, "loss_aux_layer_8": 0.05438232421875, "loss_aux_layer_9": 0.05316162109375, "step": 4922, "total_loss": 0.667519599199295 }, { "epoch": 0.97465848346862, "grad_norm": 0.7946797609329224, "learning_rate": 5e-05, "llm_loss": 0.5533731877803802, "loss": 2.5113, "loss_aux_layer_0": 0.00994873046875, "loss_aux_layer_1": 0.027435302734375, "loss_aux_layer_10": 0.0518798828125, "loss_aux_layer_11": 0.0556640625, "loss_aux_layer_12": 0.05963134765625, "loss_aux_layer_13": 0.0648193359375, "loss_aux_layer_14": 0.0726318359375, "loss_aux_layer_15": 0.0804443359375, "loss_aux_layer_16": 0.089111328125, "loss_aux_layer_17": 0.0966796875, "loss_aux_layer_18": 0.1046142578125, "loss_aux_layer_19": 0.1087646484375, "loss_aux_layer_2": 0.038330078125, "loss_aux_layer_20": 0.1170654296875, "loss_aux_layer_21": 0.1253662109375, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.18212890625, "loss_aux_layer_3": 0.04742431640625, "loss_aux_layer_4": 0.04974365234375, "loss_aux_layer_5": 0.05096435546875, "loss_aux_layer_6": 0.053466796875, "loss_aux_layer_7": 0.05206298828125, "loss_aux_layer_8": 0.0517578125, "loss_aux_layer_9": 0.05059814453125, "step": 4923, "total_loss": 0.6278184503316879 }, { "epoch": 0.9748564640665215, "grad_norm": 1.0039833784103394, "learning_rate": 5e-05, "llm_loss": 0.6317713558673859, "loss": 2.8326, "loss_aux_layer_0": 0.0097808837890625, "loss_aux_layer_1": 0.02850341796875, "loss_aux_layer_10": 0.05462646484375, "loss_aux_layer_11": 0.058349609375, "loss_aux_layer_12": 0.06280517578125, "loss_aux_layer_13": 0.068359375, "loss_aux_layer_14": 0.076171875, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.1004638671875, "loss_aux_layer_18": 0.1082763671875, "loss_aux_layer_19": 0.111328125, "loss_aux_layer_2": 0.03948974609375, "loss_aux_layer_20": 0.11865234375, "loss_aux_layer_21": 0.125244140625, "loss_aux_layer_22": 0.143798828125, "loss_aux_layer_23": 0.177001953125, "loss_aux_layer_3": 0.04931640625, "loss_aux_layer_4": 0.0518798828125, "loss_aux_layer_5": 0.0533447265625, "loss_aux_layer_6": 0.05615234375, "loss_aux_layer_7": 0.05474853515625, "loss_aux_layer_8": 0.05438232421875, "loss_aux_layer_9": 0.05340576171875, "step": 4924, "total_loss": 0.7081569880247116 }, { "epoch": 0.9750544446644229, "grad_norm": 0.9190124869346619, "learning_rate": 5e-05, "llm_loss": 0.5325785428285599, "loss": 2.4323, "loss_aux_layer_0": 0.01031494140625, "loss_aux_layer_1": 0.0274658203125, "loss_aux_layer_10": 0.05322265625, "loss_aux_layer_11": 0.05694580078125, "loss_aux_layer_12": 0.0611572265625, "loss_aux_layer_13": 0.06634521484375, "loss_aux_layer_14": 0.07421875, "loss_aux_layer_15": 0.082275390625, "loss_aux_layer_16": 0.0911865234375, "loss_aux_layer_17": 0.09814453125, "loss_aux_layer_18": 0.1060791015625, "loss_aux_layer_19": 0.110107421875, "loss_aux_layer_2": 0.0384521484375, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.1265869140625, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.047607421875, "loss_aux_layer_4": 0.04998779296875, "loss_aux_layer_5": 0.05157470703125, "loss_aux_layer_6": 0.05438232421875, "loss_aux_layer_7": 0.052734375, "loss_aux_layer_8": 0.05255126953125, "loss_aux_layer_9": 0.05181884765625, "step": 4925, "total_loss": 0.6080728471279144 }, { "epoch": 0.9752524252623243, "grad_norm": 0.8497016429901123, "learning_rate": 5e-05, "llm_loss": 0.5550925061106682, "loss": 2.5341, "loss_aux_layer_0": 0.0093231201171875, "loss_aux_layer_1": 0.029388427734375, "loss_aux_layer_10": 0.05657958984375, "loss_aux_layer_11": 0.060546875, "loss_aux_layer_12": 0.0648193359375, "loss_aux_layer_13": 0.0703125, "loss_aux_layer_14": 0.0784912109375, "loss_aux_layer_15": 0.086669921875, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.109619140625, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.0413818359375, "loss_aux_layer_20": 0.119140625, "loss_aux_layer_21": 0.1273193359375, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.05145263671875, "loss_aux_layer_4": 0.053955078125, "loss_aux_layer_5": 0.05517578125, "loss_aux_layer_6": 0.05841064453125, "loss_aux_layer_7": 0.05694580078125, "loss_aux_layer_8": 0.05633544921875, "loss_aux_layer_9": 0.05523681640625, "step": 4926, "total_loss": 0.6335130706429482 }, { "epoch": 0.9754504058602257, "grad_norm": 0.9457700848579407, "learning_rate": 5e-05, "llm_loss": 0.5202524811029434, "loss": 2.3833, "loss_aux_layer_0": 0.0101776123046875, "loss_aux_layer_1": 0.027618408203125, "loss_aux_layer_10": 0.052490234375, "loss_aux_layer_11": 0.05609130859375, "loss_aux_layer_12": 0.0601806640625, "loss_aux_layer_13": 0.0655517578125, "loss_aux_layer_14": 0.0733642578125, "loss_aux_layer_15": 0.081787109375, "loss_aux_layer_16": 0.0909423828125, "loss_aux_layer_17": 0.0986328125, "loss_aux_layer_18": 0.107177734375, "loss_aux_layer_19": 0.1109619140625, "loss_aux_layer_2": 0.03924560546875, "loss_aux_layer_20": 0.119140625, "loss_aux_layer_21": 0.1275634765625, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.04852294921875, "loss_aux_layer_4": 0.050537109375, "loss_aux_layer_5": 0.0517578125, "loss_aux_layer_6": 0.0543212890625, "loss_aux_layer_7": 0.05291748046875, "loss_aux_layer_8": 0.05267333984375, "loss_aux_layer_9": 0.051513671875, "step": 4927, "total_loss": 0.5958295166492462 }, { "epoch": 0.9756483864581271, "grad_norm": 0.9542652368545532, "learning_rate": 5e-05, "llm_loss": 0.5679674744606018, "loss": 2.5745, "loss_aux_layer_0": 0.0096588134765625, "loss_aux_layer_1": 0.02825927734375, "loss_aux_layer_10": 0.05340576171875, "loss_aux_layer_11": 0.05718994140625, "loss_aux_layer_12": 0.0615234375, "loss_aux_layer_13": 0.0662841796875, "loss_aux_layer_14": 0.07421875, "loss_aux_layer_15": 0.0821533203125, "loss_aux_layer_16": 0.0909423828125, "loss_aux_layer_17": 0.098876953125, "loss_aux_layer_18": 0.107177734375, "loss_aux_layer_19": 0.1103515625, "loss_aux_layer_2": 0.0394287109375, "loss_aux_layer_20": 0.1180419921875, "loss_aux_layer_21": 0.125244140625, "loss_aux_layer_22": 0.144775390625, "loss_aux_layer_23": 0.17919921875, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.05157470703125, "loss_aux_layer_5": 0.05279541015625, "loss_aux_layer_6": 0.0557861328125, "loss_aux_layer_7": 0.05419921875, "loss_aux_layer_8": 0.05364990234375, "loss_aux_layer_9": 0.05218505859375, "step": 4928, "total_loss": 0.643629640340805 }, { "epoch": 0.9758463670560286, "grad_norm": 0.8609963059425354, "learning_rate": 5e-05, "llm_loss": 0.5881754904985428, "loss": 2.6604, "loss_aux_layer_0": 0.0096282958984375, "loss_aux_layer_1": 0.029541015625, "loss_aux_layer_10": 0.05474853515625, "loss_aux_layer_11": 0.05859375, "loss_aux_layer_12": 0.06292724609375, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.0762939453125, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.0931396484375, "loss_aux_layer_17": 0.1004638671875, "loss_aux_layer_18": 0.1080322265625, "loss_aux_layer_19": 0.1104736328125, "loss_aux_layer_2": 0.04107666015625, "loss_aux_layer_20": 0.1180419921875, "loss_aux_layer_21": 0.125244140625, "loss_aux_layer_22": 0.14453125, "loss_aux_layer_23": 0.17919921875, "loss_aux_layer_3": 0.0511474609375, "loss_aux_layer_4": 0.053466796875, "loss_aux_layer_5": 0.05474853515625, "loss_aux_layer_6": 0.05755615234375, "loss_aux_layer_7": 0.05584716796875, "loss_aux_layer_8": 0.05499267578125, "loss_aux_layer_9": 0.0535888671875, "step": 4929, "total_loss": 0.6651001423597336 }, { "epoch": 0.9760443476539299, "grad_norm": 1.023919939994812, "learning_rate": 5e-05, "llm_loss": 0.6059817522764206, "loss": 2.7268, "loss_aux_layer_0": 0.01019287109375, "loss_aux_layer_1": 0.027374267578125, "loss_aux_layer_10": 0.0528564453125, "loss_aux_layer_11": 0.05657958984375, "loss_aux_layer_12": 0.0609130859375, "loss_aux_layer_13": 0.06591796875, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.0819091796875, "loss_aux_layer_16": 0.09130859375, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.10693359375, "loss_aux_layer_19": 0.110595703125, "loss_aux_layer_2": 0.03857421875, "loss_aux_layer_20": 0.118896484375, "loss_aux_layer_21": 0.1275634765625, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.0479736328125, "loss_aux_layer_4": 0.0504150390625, "loss_aux_layer_5": 0.0517578125, "loss_aux_layer_6": 0.054443359375, "loss_aux_layer_7": 0.052978515625, "loss_aux_layer_8": 0.05255126953125, "loss_aux_layer_9": 0.05133056640625, "step": 4930, "total_loss": 0.6816987246274948 }, { "epoch": 0.9762423282518313, "grad_norm": 0.9535402059555054, "learning_rate": 5e-05, "llm_loss": 0.6164566427469254, "loss": 2.7679, "loss_aux_layer_0": 0.009490966796875, "loss_aux_layer_1": 0.02752685546875, "loss_aux_layer_10": 0.05352783203125, "loss_aux_layer_11": 0.05731201171875, "loss_aux_layer_12": 0.061279296875, "loss_aux_layer_13": 0.066162109375, "loss_aux_layer_14": 0.0738525390625, "loss_aux_layer_15": 0.0816650390625, "loss_aux_layer_16": 0.090576171875, "loss_aux_layer_17": 0.09765625, "loss_aux_layer_18": 0.10546875, "loss_aux_layer_19": 0.1094970703125, "loss_aux_layer_2": 0.03936767578125, "loss_aux_layer_20": 0.1175537109375, "loss_aux_layer_21": 0.12548828125, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.18310546875, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.05126953125, "loss_aux_layer_5": 0.05242919921875, "loss_aux_layer_6": 0.05517578125, "loss_aux_layer_7": 0.05340576171875, "loss_aux_layer_8": 0.052978515625, "loss_aux_layer_9": 0.05206298828125, "step": 4931, "total_loss": 0.6919759213924408 }, { "epoch": 0.9764403088497328, "grad_norm": 0.8692634105682373, "learning_rate": 5e-05, "llm_loss": 0.5474035739898682, "loss": 2.4864, "loss_aux_layer_0": 0.0109710693359375, "loss_aux_layer_1": 0.027099609375, "loss_aux_layer_10": 0.05242919921875, "loss_aux_layer_11": 0.05584716796875, "loss_aux_layer_12": 0.05975341796875, "loss_aux_layer_13": 0.06463623046875, "loss_aux_layer_14": 0.0726318359375, "loss_aux_layer_15": 0.080322265625, "loss_aux_layer_16": 0.08935546875, "loss_aux_layer_17": 0.0972900390625, "loss_aux_layer_18": 0.10498046875, "loss_aux_layer_19": 0.1083984375, "loss_aux_layer_2": 0.037841796875, "loss_aux_layer_20": 0.1160888671875, "loss_aux_layer_21": 0.1234130859375, "loss_aux_layer_22": 0.142822265625, "loss_aux_layer_23": 0.178955078125, "loss_aux_layer_3": 0.0472412109375, "loss_aux_layer_4": 0.0496826171875, "loss_aux_layer_5": 0.05120849609375, "loss_aux_layer_6": 0.05401611328125, "loss_aux_layer_7": 0.05255126953125, "loss_aux_layer_8": 0.05206298828125, "loss_aux_layer_9": 0.0511474609375, "step": 4932, "total_loss": 0.6216011345386505 }, { "epoch": 0.9766382894476341, "grad_norm": 0.7636030912399292, "learning_rate": 5e-05, "llm_loss": 0.5412917584180832, "loss": 2.4749, "loss_aux_layer_0": 0.009246826171875, "loss_aux_layer_1": 0.0294189453125, "loss_aux_layer_10": 0.05523681640625, "loss_aux_layer_11": 0.05914306640625, "loss_aux_layer_12": 0.06341552734375, "loss_aux_layer_13": 0.0687255859375, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.083984375, "loss_aux_layer_16": 0.0926513671875, "loss_aux_layer_17": 0.100341796875, "loss_aux_layer_18": 0.10791015625, "loss_aux_layer_19": 0.11083984375, "loss_aux_layer_2": 0.04095458984375, "loss_aux_layer_20": 0.11865234375, "loss_aux_layer_21": 0.127197265625, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.05096435546875, "loss_aux_layer_4": 0.0535888671875, "loss_aux_layer_5": 0.0546875, "loss_aux_layer_6": 0.0572509765625, "loss_aux_layer_7": 0.0555419921875, "loss_aux_layer_8": 0.054931640625, "loss_aux_layer_9": 0.0538330078125, "step": 4933, "total_loss": 0.6187232732772827 }, { "epoch": 0.9768362700455355, "grad_norm": 0.8460361957550049, "learning_rate": 5e-05, "llm_loss": 0.5788966119289398, "loss": 2.6215, "loss_aux_layer_0": 0.01043701171875, "loss_aux_layer_1": 0.02752685546875, "loss_aux_layer_10": 0.0531005859375, "loss_aux_layer_11": 0.05670166015625, "loss_aux_layer_12": 0.06103515625, "loss_aux_layer_13": 0.0662841796875, "loss_aux_layer_14": 0.0743408203125, "loss_aux_layer_15": 0.08251953125, "loss_aux_layer_16": 0.09228515625, "loss_aux_layer_17": 0.099365234375, "loss_aux_layer_18": 0.1077880859375, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.0389404296875, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.04833984375, "loss_aux_layer_4": 0.05078125, "loss_aux_layer_5": 0.05218505859375, "loss_aux_layer_6": 0.05511474609375, "loss_aux_layer_7": 0.05340576171875, "loss_aux_layer_8": 0.05291748046875, "loss_aux_layer_9": 0.05181884765625, "step": 4934, "total_loss": 0.6553789228200912 }, { "epoch": 0.977034250643437, "grad_norm": 0.762883722782135, "learning_rate": 5e-05, "llm_loss": 0.5905818939208984, "loss": 2.6653, "loss_aux_layer_0": 0.009033203125, "loss_aux_layer_1": 0.029571533203125, "loss_aux_layer_10": 0.054443359375, "loss_aux_layer_11": 0.05828857421875, "loss_aux_layer_12": 0.06243896484375, "loss_aux_layer_13": 0.0675048828125, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.0823974609375, "loss_aux_layer_16": 0.0906982421875, "loss_aux_layer_17": 0.0977783203125, "loss_aux_layer_18": 0.105224609375, "loss_aux_layer_19": 0.1075439453125, "loss_aux_layer_2": 0.041015625, "loss_aux_layer_20": 0.114990234375, "loss_aux_layer_21": 0.1229248046875, "loss_aux_layer_22": 0.14306640625, "loss_aux_layer_23": 0.177978515625, "loss_aux_layer_3": 0.05072021484375, "loss_aux_layer_4": 0.05316162109375, "loss_aux_layer_5": 0.05426025390625, "loss_aux_layer_6": 0.05712890625, "loss_aux_layer_7": 0.05517578125, "loss_aux_layer_8": 0.0545654296875, "loss_aux_layer_9": 0.0531005859375, "step": 4935, "total_loss": 0.6663241386413574 }, { "epoch": 0.9772322312413384, "grad_norm": 0.8200961351394653, "learning_rate": 5e-05, "llm_loss": 0.5876540690660477, "loss": 2.6484, "loss_aux_layer_0": 0.0099945068359375, "loss_aux_layer_1": 0.027496337890625, "loss_aux_layer_10": 0.05206298828125, "loss_aux_layer_11": 0.05596923828125, "loss_aux_layer_12": 0.05987548828125, "loss_aux_layer_13": 0.0650634765625, "loss_aux_layer_14": 0.0732421875, "loss_aux_layer_15": 0.08154296875, "loss_aux_layer_16": 0.0904541015625, "loss_aux_layer_17": 0.097900390625, "loss_aux_layer_18": 0.1053466796875, "loss_aux_layer_19": 0.1087646484375, "loss_aux_layer_2": 0.03857421875, "loss_aux_layer_20": 0.1160888671875, "loss_aux_layer_21": 0.1234130859375, "loss_aux_layer_22": 0.142822265625, "loss_aux_layer_23": 0.1787109375, "loss_aux_layer_3": 0.04779052734375, "loss_aux_layer_4": 0.0501708984375, "loss_aux_layer_5": 0.0516357421875, "loss_aux_layer_6": 0.0545654296875, "loss_aux_layer_7": 0.052734375, "loss_aux_layer_8": 0.05218505859375, "loss_aux_layer_9": 0.05096435546875, "step": 4936, "total_loss": 0.6621075719594955 }, { "epoch": 0.9774302118392397, "grad_norm": 0.7789313197135925, "learning_rate": 5e-05, "llm_loss": 0.5631267428398132, "loss": 2.5516, "loss_aux_layer_0": 0.0099029541015625, "loss_aux_layer_1": 0.027313232421875, "loss_aux_layer_10": 0.05303955078125, "loss_aux_layer_11": 0.05657958984375, "loss_aux_layer_12": 0.0604248046875, "loss_aux_layer_13": 0.06549072265625, "loss_aux_layer_14": 0.0731201171875, "loss_aux_layer_15": 0.0809326171875, "loss_aux_layer_16": 0.0894775390625, "loss_aux_layer_17": 0.0970458984375, "loss_aux_layer_18": 0.1051025390625, "loss_aux_layer_19": 0.1080322265625, "loss_aux_layer_2": 0.0386962890625, "loss_aux_layer_20": 0.1160888671875, "loss_aux_layer_21": 0.1240234375, "loss_aux_layer_22": 0.144287109375, "loss_aux_layer_23": 0.1796875, "loss_aux_layer_3": 0.04840087890625, "loss_aux_layer_4": 0.05078125, "loss_aux_layer_5": 0.05255126953125, "loss_aux_layer_6": 0.05535888671875, "loss_aux_layer_7": 0.05364990234375, "loss_aux_layer_8": 0.05291748046875, "loss_aux_layer_9": 0.05181884765625, "step": 4937, "total_loss": 0.6379002034664154 }, { "epoch": 0.9776281924371412, "grad_norm": 0.7655547857284546, "learning_rate": 5e-05, "llm_loss": 0.5436861515045166, "loss": 2.4788, "loss_aux_layer_0": 0.0092926025390625, "loss_aux_layer_1": 0.027801513671875, "loss_aux_layer_10": 0.0533447265625, "loss_aux_layer_11": 0.05706787109375, "loss_aux_layer_12": 0.0611572265625, "loss_aux_layer_13": 0.06622314453125, "loss_aux_layer_14": 0.07421875, "loss_aux_layer_15": 0.0819091796875, "loss_aux_layer_16": 0.090576171875, "loss_aux_layer_17": 0.0982666015625, "loss_aux_layer_18": 0.106689453125, "loss_aux_layer_19": 0.1107177734375, "loss_aux_layer_2": 0.03900146484375, "loss_aux_layer_20": 0.118408203125, "loss_aux_layer_21": 0.1270751953125, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.18603515625, "loss_aux_layer_3": 0.04876708984375, "loss_aux_layer_4": 0.0513916015625, "loss_aux_layer_5": 0.05279541015625, "loss_aux_layer_6": 0.05560302734375, "loss_aux_layer_7": 0.05389404296875, "loss_aux_layer_8": 0.05322265625, "loss_aux_layer_9": 0.05206298828125, "step": 4938, "total_loss": 0.619690865278244 }, { "epoch": 0.9778261730350426, "grad_norm": 0.8163459300994873, "learning_rate": 5e-05, "llm_loss": 0.5058327466249466, "loss": 2.3261, "loss_aux_layer_0": 0.0105438232421875, "loss_aux_layer_1": 0.027496337890625, "loss_aux_layer_10": 0.052490234375, "loss_aux_layer_11": 0.0560302734375, "loss_aux_layer_12": 0.0601806640625, "loss_aux_layer_13": 0.0655517578125, "loss_aux_layer_14": 0.07421875, "loss_aux_layer_15": 0.082763671875, "loss_aux_layer_16": 0.0919189453125, "loss_aux_layer_17": 0.0994873046875, "loss_aux_layer_18": 0.107421875, "loss_aux_layer_19": 0.1109619140625, "loss_aux_layer_2": 0.03826904296875, "loss_aux_layer_20": 0.119140625, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.04766845703125, "loss_aux_layer_4": 0.0499267578125, "loss_aux_layer_5": 0.05133056640625, "loss_aux_layer_6": 0.05413818359375, "loss_aux_layer_7": 0.05267333984375, "loss_aux_layer_8": 0.0523681640625, "loss_aux_layer_9": 0.05145263671875, "step": 4939, "total_loss": 0.5815144926309586 }, { "epoch": 0.9780241536329439, "grad_norm": 0.7585136294364929, "learning_rate": 5e-05, "llm_loss": 0.5964483469724655, "loss": 2.6935, "loss_aux_layer_0": 0.0092926025390625, "loss_aux_layer_1": 0.027801513671875, "loss_aux_layer_10": 0.05322265625, "loss_aux_layer_11": 0.0570068359375, "loss_aux_layer_12": 0.06109619140625, "loss_aux_layer_13": 0.0662841796875, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.083740234375, "loss_aux_layer_16": 0.0931396484375, "loss_aux_layer_17": 0.101318359375, "loss_aux_layer_18": 0.1097412109375, "loss_aux_layer_19": 0.114013671875, "loss_aux_layer_2": 0.0389404296875, "loss_aux_layer_20": 0.1221923828125, "loss_aux_layer_21": 0.1307373046875, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.04833984375, "loss_aux_layer_4": 0.05078125, "loss_aux_layer_5": 0.05206298828125, "loss_aux_layer_6": 0.05499267578125, "loss_aux_layer_7": 0.05352783203125, "loss_aux_layer_8": 0.052978515625, "loss_aux_layer_9": 0.052001953125, "step": 4940, "total_loss": 0.6733827590942383 }, { "epoch": 0.9782221342308454, "grad_norm": 0.6983643770217896, "learning_rate": 5e-05, "llm_loss": 0.5803660303354263, "loss": 2.6178, "loss_aux_layer_0": 0.0096282958984375, "loss_aux_layer_1": 0.027252197265625, "loss_aux_layer_10": 0.0523681640625, "loss_aux_layer_11": 0.055908203125, "loss_aux_layer_12": 0.06005859375, "loss_aux_layer_13": 0.0648193359375, "loss_aux_layer_14": 0.0726318359375, "loss_aux_layer_15": 0.0804443359375, "loss_aux_layer_16": 0.089111328125, "loss_aux_layer_17": 0.0965576171875, "loss_aux_layer_18": 0.1043701171875, "loss_aux_layer_19": 0.1075439453125, "loss_aux_layer_2": 0.03814697265625, "loss_aux_layer_20": 0.11572265625, "loss_aux_layer_21": 0.1234130859375, "loss_aux_layer_22": 0.142333984375, "loss_aux_layer_23": 0.177734375, "loss_aux_layer_3": 0.0477294921875, "loss_aux_layer_4": 0.0501708984375, "loss_aux_layer_5": 0.05157470703125, "loss_aux_layer_6": 0.05426025390625, "loss_aux_layer_7": 0.05267333984375, "loss_aux_layer_8": 0.05230712890625, "loss_aux_layer_9": 0.05120849609375, "step": 4941, "total_loss": 0.6544505804777145 }, { "epoch": 0.9784201148287468, "grad_norm": 1.0282580852508545, "learning_rate": 5e-05, "llm_loss": 0.6009541302919388, "loss": 2.7162, "loss_aux_layer_0": 0.009674072265625, "loss_aux_layer_1": 0.02777099609375, "loss_aux_layer_10": 0.05499267578125, "loss_aux_layer_11": 0.05865478515625, "loss_aux_layer_12": 0.06329345703125, "loss_aux_layer_13": 0.06884765625, "loss_aux_layer_14": 0.0772705078125, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.0955810546875, "loss_aux_layer_17": 0.10302734375, "loss_aux_layer_18": 0.1109619140625, "loss_aux_layer_19": 0.1143798828125, "loss_aux_layer_2": 0.03912353515625, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.130859375, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.04901123046875, "loss_aux_layer_4": 0.051513671875, "loss_aux_layer_5": 0.05303955078125, "loss_aux_layer_6": 0.05633544921875, "loss_aux_layer_7": 0.0548095703125, "loss_aux_layer_8": 0.054443359375, "loss_aux_layer_9": 0.05364990234375, "step": 4942, "total_loss": 0.6790390610694885 }, { "epoch": 0.9786180954266482, "grad_norm": 0.858512818813324, "learning_rate": 5e-05, "llm_loss": 0.5812401175498962, "loss": 2.6414, "loss_aux_layer_0": 0.0097198486328125, "loss_aux_layer_1": 0.03009033203125, "loss_aux_layer_10": 0.0576171875, "loss_aux_layer_11": 0.06170654296875, "loss_aux_layer_12": 0.06591796875, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.07861328125, "loss_aux_layer_15": 0.0863037109375, "loss_aux_layer_16": 0.0943603515625, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.109130859375, "loss_aux_layer_19": 0.1123046875, "loss_aux_layer_2": 0.04205322265625, "loss_aux_layer_20": 0.1199951171875, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.05224609375, "loss_aux_layer_4": 0.0550537109375, "loss_aux_layer_5": 0.05657958984375, "loss_aux_layer_6": 0.05963134765625, "loss_aux_layer_7": 0.05792236328125, "loss_aux_layer_8": 0.05743408203125, "loss_aux_layer_9": 0.05615234375, "step": 4943, "total_loss": 0.6603565514087677 }, { "epoch": 0.9788160760245496, "grad_norm": 1.0234860181808472, "learning_rate": 5e-05, "llm_loss": 0.5106962472200394, "loss": 2.3523, "loss_aux_layer_0": 0.0092926025390625, "loss_aux_layer_1": 0.0277099609375, "loss_aux_layer_10": 0.05450439453125, "loss_aux_layer_11": 0.05828857421875, "loss_aux_layer_12": 0.06256103515625, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.076171875, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.0936279296875, "loss_aux_layer_17": 0.1011962890625, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.039306640625, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.049072265625, "loss_aux_layer_4": 0.0513916015625, "loss_aux_layer_5": 0.0528564453125, "loss_aux_layer_6": 0.05584716796875, "loss_aux_layer_7": 0.05450439453125, "loss_aux_layer_8": 0.05413818359375, "loss_aux_layer_9": 0.05328369140625, "step": 4944, "total_loss": 0.5880649238824844 }, { "epoch": 0.979014056622451, "grad_norm": 0.9160444736480713, "learning_rate": 5e-05, "llm_loss": 0.5030952095985413, "loss": 2.3219, "loss_aux_layer_0": 0.009674072265625, "loss_aux_layer_1": 0.02801513671875, "loss_aux_layer_10": 0.05462646484375, "loss_aux_layer_11": 0.05816650390625, "loss_aux_layer_12": 0.06243896484375, "loss_aux_layer_13": 0.067626953125, "loss_aux_layer_14": 0.07568359375, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.0931396484375, "loss_aux_layer_17": 0.1007080078125, "loss_aux_layer_18": 0.1090087890625, "loss_aux_layer_19": 0.1124267578125, "loss_aux_layer_2": 0.03936767578125, "loss_aux_layer_20": 0.12109375, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.04901123046875, "loss_aux_layer_4": 0.05169677734375, "loss_aux_layer_5": 0.05322265625, "loss_aux_layer_6": 0.05615234375, "loss_aux_layer_7": 0.0546875, "loss_aux_layer_8": 0.0540771484375, "loss_aux_layer_9": 0.05328369140625, "step": 4945, "total_loss": 0.5804823935031891 }, { "epoch": 0.9792120372203524, "grad_norm": 1.1056455373764038, "learning_rate": 5e-05, "llm_loss": 0.6907074600458145, "loss": 3.0605, "loss_aux_layer_0": 0.0093994140625, "loss_aux_layer_1": 0.02728271484375, "loss_aux_layer_10": 0.0518798828125, "loss_aux_layer_11": 0.05584716796875, "loss_aux_layer_12": 0.0601806640625, "loss_aux_layer_13": 0.06512451171875, "loss_aux_layer_14": 0.072998046875, "loss_aux_layer_15": 0.0811767578125, "loss_aux_layer_16": 0.0899658203125, "loss_aux_layer_17": 0.0975341796875, "loss_aux_layer_18": 0.1051025390625, "loss_aux_layer_19": 0.1080322265625, "loss_aux_layer_2": 0.0382080078125, "loss_aux_layer_20": 0.11572265625, "loss_aux_layer_21": 0.1236572265625, "loss_aux_layer_22": 0.144775390625, "loss_aux_layer_23": 0.180419921875, "loss_aux_layer_3": 0.04779052734375, "loss_aux_layer_4": 0.0501708984375, "loss_aux_layer_5": 0.051513671875, "loss_aux_layer_6": 0.05419921875, "loss_aux_layer_7": 0.052490234375, "loss_aux_layer_8": 0.0518798828125, "loss_aux_layer_9": 0.05059814453125, "step": 4946, "total_loss": 0.7651215642690659 }, { "epoch": 0.9794100178182538, "grad_norm": 0.9736956357955933, "learning_rate": 5e-05, "llm_loss": 0.5670248493552208, "loss": 2.5708, "loss_aux_layer_0": 0.0108489990234375, "loss_aux_layer_1": 0.027862548828125, "loss_aux_layer_10": 0.052978515625, "loss_aux_layer_11": 0.05657958984375, "loss_aux_layer_12": 0.06072998046875, "loss_aux_layer_13": 0.065673828125, "loss_aux_layer_14": 0.0736083984375, "loss_aux_layer_15": 0.0814208984375, "loss_aux_layer_16": 0.090576171875, "loss_aux_layer_17": 0.097900390625, "loss_aux_layer_18": 0.10595703125, "loss_aux_layer_19": 0.110107421875, "loss_aux_layer_2": 0.038330078125, "loss_aux_layer_20": 0.1187744140625, "loss_aux_layer_21": 0.1279296875, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.04779052734375, "loss_aux_layer_4": 0.05023193359375, "loss_aux_layer_5": 0.0517578125, "loss_aux_layer_6": 0.05462646484375, "loss_aux_layer_7": 0.05316162109375, "loss_aux_layer_8": 0.0528564453125, "loss_aux_layer_9": 0.05169677734375, "step": 4947, "total_loss": 0.6427077800035477 }, { "epoch": 0.9796079984161552, "grad_norm": 0.9038985967636108, "learning_rate": 5e-05, "llm_loss": 0.5325420275330544, "loss": 2.4279, "loss_aux_layer_0": 0.010162353515625, "loss_aux_layer_1": 0.02679443359375, "loss_aux_layer_10": 0.05145263671875, "loss_aux_layer_11": 0.05499267578125, "loss_aux_layer_12": 0.0592041015625, "loss_aux_layer_13": 0.06427001953125, "loss_aux_layer_14": 0.0723876953125, "loss_aux_layer_15": 0.0806884765625, "loss_aux_layer_16": 0.089599609375, "loss_aux_layer_17": 0.09716796875, "loss_aux_layer_18": 0.1053466796875, "loss_aux_layer_19": 0.1094970703125, "loss_aux_layer_2": 0.0374755859375, "loss_aux_layer_20": 0.11767578125, "loss_aux_layer_21": 0.1263427734375, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.04656982421875, "loss_aux_layer_4": 0.04888916015625, "loss_aux_layer_5": 0.05035400390625, "loss_aux_layer_6": 0.05328369140625, "loss_aux_layer_7": 0.0516357421875, "loss_aux_layer_8": 0.05120849609375, "loss_aux_layer_9": 0.05023193359375, "step": 4948, "total_loss": 0.6069852039217949 }, { "epoch": 0.9798059790140566, "grad_norm": 1.3158320188522339, "learning_rate": 5e-05, "llm_loss": 0.6000900566577911, "loss": 2.6973, "loss_aux_layer_0": 0.0107879638671875, "loss_aux_layer_1": 0.026458740234375, "loss_aux_layer_10": 0.05169677734375, "loss_aux_layer_11": 0.05511474609375, "loss_aux_layer_12": 0.05914306640625, "loss_aux_layer_13": 0.06414794921875, "loss_aux_layer_14": 0.072509765625, "loss_aux_layer_15": 0.0809326171875, "loss_aux_layer_16": 0.089599609375, "loss_aux_layer_17": 0.0975341796875, "loss_aux_layer_18": 0.10546875, "loss_aux_layer_19": 0.109619140625, "loss_aux_layer_2": 0.03692626953125, "loss_aux_layer_20": 0.1177978515625, "loss_aux_layer_21": 0.125732421875, "loss_aux_layer_22": 0.14501953125, "loss_aux_layer_23": 0.180908203125, "loss_aux_layer_3": 0.0462646484375, "loss_aux_layer_4": 0.0487060546875, "loss_aux_layer_5": 0.05029296875, "loss_aux_layer_6": 0.0533447265625, "loss_aux_layer_7": 0.05194091796875, "loss_aux_layer_8": 0.05145263671875, "loss_aux_layer_9": 0.05035400390625, "step": 4949, "total_loss": 0.6743230521678925 }, { "epoch": 0.9800039596119581, "grad_norm": 1.0361524820327759, "learning_rate": 5e-05, "llm_loss": 0.5750073790550232, "loss": 2.605, "loss_aux_layer_0": 0.01177978515625, "loss_aux_layer_1": 0.0283203125, "loss_aux_layer_10": 0.05462646484375, "loss_aux_layer_11": 0.05828857421875, "loss_aux_layer_12": 0.0625, "loss_aux_layer_13": 0.0675048828125, "loss_aux_layer_14": 0.0755615234375, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.09228515625, "loss_aux_layer_17": 0.099365234375, "loss_aux_layer_18": 0.107666015625, "loss_aux_layer_19": 0.1097412109375, "loss_aux_layer_2": 0.04022216796875, "loss_aux_layer_20": 0.1173095703125, "loss_aux_layer_21": 0.1243896484375, "loss_aux_layer_22": 0.14306640625, "loss_aux_layer_23": 0.177734375, "loss_aux_layer_3": 0.0499267578125, "loss_aux_layer_4": 0.05230712890625, "loss_aux_layer_5": 0.05401611328125, "loss_aux_layer_6": 0.056884765625, "loss_aux_layer_7": 0.05517578125, "loss_aux_layer_8": 0.05450439453125, "loss_aux_layer_9": 0.05340576171875, "step": 4950, "total_loss": 0.6512624621391296 }, { "epoch": 0.9802019402098594, "grad_norm": 0.9873462915420532, "learning_rate": 5e-05, "llm_loss": 0.5730826258659363, "loss": 2.5844, "loss_aux_layer_0": 0.0110931396484375, "loss_aux_layer_1": 0.026885986328125, "loss_aux_layer_10": 0.05096435546875, "loss_aux_layer_11": 0.0545654296875, "loss_aux_layer_12": 0.0587158203125, "loss_aux_layer_13": 0.063720703125, "loss_aux_layer_14": 0.0711669921875, "loss_aux_layer_15": 0.0791015625, "loss_aux_layer_16": 0.0875244140625, "loss_aux_layer_17": 0.0950927734375, "loss_aux_layer_18": 0.103271484375, "loss_aux_layer_19": 0.107177734375, "loss_aux_layer_2": 0.0374755859375, "loss_aux_layer_20": 0.1148681640625, "loss_aux_layer_21": 0.1226806640625, "loss_aux_layer_22": 0.142333984375, "loss_aux_layer_23": 0.176513671875, "loss_aux_layer_3": 0.04638671875, "loss_aux_layer_4": 0.048583984375, "loss_aux_layer_5": 0.050048828125, "loss_aux_layer_6": 0.05267333984375, "loss_aux_layer_7": 0.0511474609375, "loss_aux_layer_8": 0.05072021484375, "loss_aux_layer_9": 0.0496826171875, "step": 4951, "total_loss": 0.646108090877533 }, { "epoch": 0.9803999208077608, "grad_norm": 1.0912507772445679, "learning_rate": 5e-05, "llm_loss": 0.6042362451553345, "loss": 2.7288, "loss_aux_layer_0": 0.0106048583984375, "loss_aux_layer_1": 0.028961181640625, "loss_aux_layer_10": 0.055419921875, "loss_aux_layer_11": 0.0589599609375, "loss_aux_layer_12": 0.06341552734375, "loss_aux_layer_13": 0.06854248046875, "loss_aux_layer_14": 0.076904296875, "loss_aux_layer_15": 0.085693359375, "loss_aux_layer_16": 0.0946044921875, "loss_aux_layer_17": 0.1019287109375, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.11279296875, "loss_aux_layer_2": 0.04071044921875, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.1279296875, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.0504150390625, "loss_aux_layer_4": 0.05316162109375, "loss_aux_layer_5": 0.0548095703125, "loss_aux_layer_6": 0.05792236328125, "loss_aux_layer_7": 0.05609130859375, "loss_aux_layer_8": 0.05548095703125, "loss_aux_layer_9": 0.05426025390625, "step": 4952, "total_loss": 0.6821882575750351 }, { "epoch": 0.9805979014056623, "grad_norm": 1.0302566289901733, "learning_rate": 5e-05, "llm_loss": 0.5015910565853119, "loss": 2.3131, "loss_aux_layer_0": 0.0113372802734375, "loss_aux_layer_1": 0.028167724609375, "loss_aux_layer_10": 0.0526123046875, "loss_aux_layer_11": 0.05657958984375, "loss_aux_layer_12": 0.0609130859375, "loss_aux_layer_13": 0.0662841796875, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.083740234375, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.1005859375, "loss_aux_layer_18": 0.10888671875, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.039306640625, "loss_aux_layer_20": 0.1207275390625, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.04840087890625, "loss_aux_layer_4": 0.05035400390625, "loss_aux_layer_5": 0.05181884765625, "loss_aux_layer_6": 0.05462646484375, "loss_aux_layer_7": 0.05279541015625, "loss_aux_layer_8": 0.05224609375, "loss_aux_layer_9": 0.0513916015625, "step": 4953, "total_loss": 0.5782736986875534 }, { "epoch": 0.9807958820035636, "grad_norm": 0.9476298689842224, "learning_rate": 5e-05, "llm_loss": 0.5882776379585266, "loss": 2.6483, "loss_aux_layer_0": 0.0103607177734375, "loss_aux_layer_1": 0.027618408203125, "loss_aux_layer_10": 0.05206298828125, "loss_aux_layer_11": 0.05572509765625, "loss_aux_layer_12": 0.05975341796875, "loss_aux_layer_13": 0.06451416015625, "loss_aux_layer_14": 0.072021484375, "loss_aux_layer_15": 0.0797119140625, "loss_aux_layer_16": 0.0885009765625, "loss_aux_layer_17": 0.0958251953125, "loss_aux_layer_18": 0.103515625, "loss_aux_layer_19": 0.1070556640625, "loss_aux_layer_2": 0.0382080078125, "loss_aux_layer_20": 0.114990234375, "loss_aux_layer_21": 0.1224365234375, "loss_aux_layer_22": 0.14208984375, "loss_aux_layer_23": 0.177001953125, "loss_aux_layer_3": 0.047607421875, "loss_aux_layer_4": 0.05023193359375, "loss_aux_layer_5": 0.05157470703125, "loss_aux_layer_6": 0.0543212890625, "loss_aux_layer_7": 0.05279541015625, "loss_aux_layer_8": 0.05218505859375, "loss_aux_layer_9": 0.051025390625, "step": 4954, "total_loss": 0.6620742529630661 }, { "epoch": 0.980993862601465, "grad_norm": 0.9644437432289124, "learning_rate": 5e-05, "llm_loss": 0.5570536106824875, "loss": 2.5169, "loss_aux_layer_0": 0.0106048583984375, "loss_aux_layer_1": 0.026275634765625, "loss_aux_layer_10": 0.04888916015625, "loss_aux_layer_11": 0.05255126953125, "loss_aux_layer_12": 0.05657958984375, "loss_aux_layer_13": 0.061767578125, "loss_aux_layer_14": 0.0693359375, "loss_aux_layer_15": 0.0771484375, "loss_aux_layer_16": 0.08642578125, "loss_aux_layer_17": 0.09423828125, "loss_aux_layer_18": 0.1031494140625, "loss_aux_layer_19": 0.107177734375, "loss_aux_layer_2": 0.03607177734375, "loss_aux_layer_20": 0.1156005859375, "loss_aux_layer_21": 0.1241455078125, "loss_aux_layer_22": 0.144775390625, "loss_aux_layer_23": 0.18212890625, "loss_aux_layer_3": 0.044921875, "loss_aux_layer_4": 0.046875, "loss_aux_layer_5": 0.0479736328125, "loss_aux_layer_6": 0.0504150390625, "loss_aux_layer_7": 0.0489501953125, "loss_aux_layer_8": 0.048583984375, "loss_aux_layer_9": 0.047607421875, "step": 4955, "total_loss": 0.6292291730642319 }, { "epoch": 0.9811918431993665, "grad_norm": 0.8411765694618225, "learning_rate": 5e-05, "llm_loss": 0.6632883101701736, "loss": 2.9581, "loss_aux_layer_0": 0.009368896484375, "loss_aux_layer_1": 0.02923583984375, "loss_aux_layer_10": 0.0545654296875, "loss_aux_layer_11": 0.05853271484375, "loss_aux_layer_12": 0.0626220703125, "loss_aux_layer_13": 0.0677490234375, "loss_aux_layer_14": 0.075927734375, "loss_aux_layer_15": 0.0836181640625, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.10009765625, "loss_aux_layer_18": 0.1072998046875, "loss_aux_layer_19": 0.1103515625, "loss_aux_layer_2": 0.0399169921875, "loss_aux_layer_20": 0.117431640625, "loss_aux_layer_21": 0.1240234375, "loss_aux_layer_22": 0.1435546875, "loss_aux_layer_23": 0.177490234375, "loss_aux_layer_3": 0.0498046875, "loss_aux_layer_4": 0.0523681640625, "loss_aux_layer_5": 0.0538330078125, "loss_aux_layer_6": 0.05670166015625, "loss_aux_layer_7": 0.0550537109375, "loss_aux_layer_8": 0.054443359375, "loss_aux_layer_9": 0.05316162109375, "step": 4956, "total_loss": 0.7395244836807251 }, { "epoch": 0.9813898237972679, "grad_norm": 0.8648618459701538, "learning_rate": 5e-05, "llm_loss": 0.5749703869223595, "loss": 2.5991, "loss_aux_layer_0": 0.0111083984375, "loss_aux_layer_1": 0.0289306640625, "loss_aux_layer_10": 0.05322265625, "loss_aux_layer_11": 0.05682373046875, "loss_aux_layer_12": 0.06121826171875, "loss_aux_layer_13": 0.0662841796875, "loss_aux_layer_14": 0.0738525390625, "loss_aux_layer_15": 0.0819091796875, "loss_aux_layer_16": 0.0906982421875, "loss_aux_layer_17": 0.09814453125, "loss_aux_layer_18": 0.1055908203125, "loss_aux_layer_19": 0.108154296875, "loss_aux_layer_2": 0.03948974609375, "loss_aux_layer_20": 0.115234375, "loss_aux_layer_21": 0.122314453125, "loss_aux_layer_22": 0.141357421875, "loss_aux_layer_23": 0.174560546875, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.0513916015625, "loss_aux_layer_5": 0.05255126953125, "loss_aux_layer_6": 0.05523681640625, "loss_aux_layer_7": 0.05364990234375, "loss_aux_layer_8": 0.05316162109375, "loss_aux_layer_9": 0.0518798828125, "step": 4957, "total_loss": 0.6497761160135269 }, { "epoch": 0.9815878043951692, "grad_norm": 0.7472148537635803, "learning_rate": 5e-05, "llm_loss": 0.48314136266708374, "loss": 2.2278, "loss_aux_layer_0": 0.0101165771484375, "loss_aux_layer_1": 0.026580810546875, "loss_aux_layer_10": 0.0511474609375, "loss_aux_layer_11": 0.054443359375, "loss_aux_layer_12": 0.05859375, "loss_aux_layer_13": 0.06365966796875, "loss_aux_layer_14": 0.07177734375, "loss_aux_layer_15": 0.07958984375, "loss_aux_layer_16": 0.0887451171875, "loss_aux_layer_17": 0.0963134765625, "loss_aux_layer_18": 0.1048583984375, "loss_aux_layer_19": 0.109130859375, "loss_aux_layer_2": 0.03753662109375, "loss_aux_layer_20": 0.116943359375, "loss_aux_layer_21": 0.125, "loss_aux_layer_22": 0.144287109375, "loss_aux_layer_23": 0.18017578125, "loss_aux_layer_3": 0.04681396484375, "loss_aux_layer_4": 0.04901123046875, "loss_aux_layer_5": 0.05035400390625, "loss_aux_layer_6": 0.0531005859375, "loss_aux_layer_7": 0.05145263671875, "loss_aux_layer_8": 0.0511474609375, "loss_aux_layer_9": 0.0499267578125, "step": 4958, "total_loss": 0.5569386854767799 }, { "epoch": 0.9817857849930707, "grad_norm": 0.9127999544143677, "learning_rate": 5e-05, "llm_loss": 0.5945388823747635, "loss": 2.6949, "loss_aux_layer_0": 0.01007080078125, "loss_aux_layer_1": 0.029998779296875, "loss_aux_layer_10": 0.05633544921875, "loss_aux_layer_11": 0.06024169921875, "loss_aux_layer_12": 0.06463623046875, "loss_aux_layer_13": 0.0699462890625, "loss_aux_layer_14": 0.078369140625, "loss_aux_layer_15": 0.086669921875, "loss_aux_layer_16": 0.0958251953125, "loss_aux_layer_17": 0.1029052734375, "loss_aux_layer_18": 0.1109619140625, "loss_aux_layer_19": 0.114013671875, "loss_aux_layer_2": 0.04156494140625, "loss_aux_layer_20": 0.1217041015625, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.05157470703125, "loss_aux_layer_4": 0.0540771484375, "loss_aux_layer_5": 0.05560302734375, "loss_aux_layer_6": 0.05865478515625, "loss_aux_layer_7": 0.056884765625, "loss_aux_layer_8": 0.05621337890625, "loss_aux_layer_9": 0.0550537109375, "step": 4959, "total_loss": 0.673722043633461 }, { "epoch": 0.9819837655909721, "grad_norm": 0.8265067934989929, "learning_rate": 5e-05, "llm_loss": 0.5874546468257904, "loss": 2.6493, "loss_aux_layer_0": 0.0108489990234375, "loss_aux_layer_1": 0.027191162109375, "loss_aux_layer_10": 0.0511474609375, "loss_aux_layer_11": 0.0548095703125, "loss_aux_layer_12": 0.0589599609375, "loss_aux_layer_13": 0.0643310546875, "loss_aux_layer_14": 0.07275390625, "loss_aux_layer_15": 0.0811767578125, "loss_aux_layer_16": 0.0908203125, "loss_aux_layer_17": 0.0985107421875, "loss_aux_layer_18": 0.1072998046875, "loss_aux_layer_19": 0.1114501953125, "loss_aux_layer_2": 0.03759765625, "loss_aux_layer_20": 0.1190185546875, "loss_aux_layer_21": 0.127685546875, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.04656982421875, "loss_aux_layer_4": 0.048828125, "loss_aux_layer_5": 0.05023193359375, "loss_aux_layer_6": 0.05291748046875, "loss_aux_layer_7": 0.0513916015625, "loss_aux_layer_8": 0.05096435546875, "loss_aux_layer_9": 0.050048828125, "step": 4960, "total_loss": 0.6623354852199554 }, { "epoch": 0.9821817461888734, "grad_norm": 0.9602214694023132, "learning_rate": 5e-05, "llm_loss": 0.5848933234810829, "loss": 2.6506, "loss_aux_layer_0": 0.010650634765625, "loss_aux_layer_1": 0.0294189453125, "loss_aux_layer_10": 0.05584716796875, "loss_aux_layer_11": 0.0595703125, "loss_aux_layer_12": 0.0635986328125, "loss_aux_layer_13": 0.068359375, "loss_aux_layer_14": 0.076171875, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.1011962890625, "loss_aux_layer_18": 0.1083984375, "loss_aux_layer_19": 0.1114501953125, "loss_aux_layer_2": 0.0411376953125, "loss_aux_layer_20": 0.1187744140625, "loss_aux_layer_21": 0.126708984375, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.051025390625, "loss_aux_layer_4": 0.05364990234375, "loss_aux_layer_5": 0.05517578125, "loss_aux_layer_6": 0.05828857421875, "loss_aux_layer_7": 0.05645751953125, "loss_aux_layer_8": 0.05572509765625, "loss_aux_layer_9": 0.054443359375, "step": 4961, "total_loss": 0.6626472920179367 }, { "epoch": 0.9823797267867749, "grad_norm": 0.7114129662513733, "learning_rate": 5e-05, "llm_loss": 0.5340711250901222, "loss": 2.4365, "loss_aux_layer_0": 0.0099029541015625, "loss_aux_layer_1": 0.027191162109375, "loss_aux_layer_10": 0.052490234375, "loss_aux_layer_11": 0.05615234375, "loss_aux_layer_12": 0.0604248046875, "loss_aux_layer_13": 0.06561279296875, "loss_aux_layer_14": 0.0736083984375, "loss_aux_layer_15": 0.0816650390625, "loss_aux_layer_16": 0.090576171875, "loss_aux_layer_17": 0.09814453125, "loss_aux_layer_18": 0.10595703125, "loss_aux_layer_19": 0.1097412109375, "loss_aux_layer_2": 0.03857421875, "loss_aux_layer_20": 0.1171875, "loss_aux_layer_21": 0.1253662109375, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.0477294921875, "loss_aux_layer_4": 0.05010986328125, "loss_aux_layer_5": 0.0513916015625, "loss_aux_layer_6": 0.05419921875, "loss_aux_layer_7": 0.05255126953125, "loss_aux_layer_8": 0.05218505859375, "loss_aux_layer_9": 0.05126953125, "step": 4962, "total_loss": 0.6091237962245941 }, { "epoch": 0.9825777073846763, "grad_norm": 1.0279085636138916, "learning_rate": 5e-05, "llm_loss": 0.5826751291751862, "loss": 2.6316, "loss_aux_layer_0": 0.0121002197265625, "loss_aux_layer_1": 0.028106689453125, "loss_aux_layer_10": 0.052734375, "loss_aux_layer_11": 0.05633544921875, "loss_aux_layer_12": 0.06048583984375, "loss_aux_layer_13": 0.0654296875, "loss_aux_layer_14": 0.073486328125, "loss_aux_layer_15": 0.0814208984375, "loss_aux_layer_16": 0.0902099609375, "loss_aux_layer_17": 0.09765625, "loss_aux_layer_18": 0.1058349609375, "loss_aux_layer_19": 0.1099853515625, "loss_aux_layer_2": 0.0389404296875, "loss_aux_layer_20": 0.1177978515625, "loss_aux_layer_21": 0.1256103515625, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.182373046875, "loss_aux_layer_3": 0.0479736328125, "loss_aux_layer_4": 0.05035400390625, "loss_aux_layer_5": 0.05181884765625, "loss_aux_layer_6": 0.05419921875, "loss_aux_layer_7": 0.05255126953125, "loss_aux_layer_8": 0.05206298828125, "loss_aux_layer_9": 0.05120849609375, "step": 4963, "total_loss": 0.6579084247350693 }, { "epoch": 0.9827756879825778, "grad_norm": 0.8563385605812073, "learning_rate": 5e-05, "llm_loss": 0.5512958317995071, "loss": 2.5058, "loss_aux_layer_0": 0.0100250244140625, "loss_aux_layer_1": 0.02728271484375, "loss_aux_layer_10": 0.05267333984375, "loss_aux_layer_11": 0.05645751953125, "loss_aux_layer_12": 0.0604248046875, "loss_aux_layer_13": 0.065673828125, "loss_aux_layer_14": 0.073486328125, "loss_aux_layer_15": 0.0814208984375, "loss_aux_layer_16": 0.090087890625, "loss_aux_layer_17": 0.0977783203125, "loss_aux_layer_18": 0.1055908203125, "loss_aux_layer_19": 0.1092529296875, "loss_aux_layer_2": 0.03857421875, "loss_aux_layer_20": 0.1171875, "loss_aux_layer_21": 0.125732421875, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.04779052734375, "loss_aux_layer_4": 0.05010986328125, "loss_aux_layer_5": 0.0516357421875, "loss_aux_layer_6": 0.05450439453125, "loss_aux_layer_7": 0.05316162109375, "loss_aux_layer_8": 0.0528564453125, "loss_aux_layer_9": 0.0516357421875, "step": 4964, "total_loss": 0.62644362449646 }, { "epoch": 0.9829736685804791, "grad_norm": 0.880114734172821, "learning_rate": 5e-05, "llm_loss": 0.46416211128234863, "loss": 2.154, "loss_aux_layer_0": 0.010833740234375, "loss_aux_layer_1": 0.026885986328125, "loss_aux_layer_10": 0.05126953125, "loss_aux_layer_11": 0.054931640625, "loss_aux_layer_12": 0.05914306640625, "loss_aux_layer_13": 0.06439208984375, "loss_aux_layer_14": 0.07275390625, "loss_aux_layer_15": 0.0806884765625, "loss_aux_layer_16": 0.0902099609375, "loss_aux_layer_17": 0.0982666015625, "loss_aux_layer_18": 0.106201171875, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.0372314453125, "loss_aux_layer_20": 0.1175537109375, "loss_aux_layer_21": 0.1256103515625, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.182373046875, "loss_aux_layer_3": 0.04632568359375, "loss_aux_layer_4": 0.04864501953125, "loss_aux_layer_5": 0.05010986328125, "loss_aux_layer_6": 0.0528564453125, "loss_aux_layer_7": 0.05126953125, "loss_aux_layer_8": 0.0509033203125, "loss_aux_layer_9": 0.0499267578125, "step": 4965, "total_loss": 0.5385085791349411 }, { "epoch": 0.9831716491783805, "grad_norm": 0.8808914422988892, "learning_rate": 5e-05, "llm_loss": 0.5843242779374123, "loss": 2.6371, "loss_aux_layer_0": 0.010650634765625, "loss_aux_layer_1": 0.028594970703125, "loss_aux_layer_10": 0.05340576171875, "loss_aux_layer_11": 0.05706787109375, "loss_aux_layer_12": 0.061279296875, "loss_aux_layer_13": 0.066162109375, "loss_aux_layer_14": 0.0738525390625, "loss_aux_layer_15": 0.0811767578125, "loss_aux_layer_16": 0.090087890625, "loss_aux_layer_17": 0.097412109375, "loss_aux_layer_18": 0.1055908203125, "loss_aux_layer_19": 0.1087646484375, "loss_aux_layer_2": 0.0396728515625, "loss_aux_layer_20": 0.1160888671875, "loss_aux_layer_21": 0.1224365234375, "loss_aux_layer_22": 0.141357421875, "loss_aux_layer_23": 0.176513671875, "loss_aux_layer_3": 0.0491943359375, "loss_aux_layer_4": 0.05157470703125, "loss_aux_layer_5": 0.052978515625, "loss_aux_layer_6": 0.0556640625, "loss_aux_layer_7": 0.05401611328125, "loss_aux_layer_8": 0.0531005859375, "loss_aux_layer_9": 0.052001953125, "step": 4966, "total_loss": 0.6592625677585602 }, { "epoch": 0.983369629776282, "grad_norm": 0.9449571371078491, "learning_rate": 5e-05, "llm_loss": 0.44383741170167923, "loss": 2.0895, "loss_aux_layer_0": 0.0100250244140625, "loss_aux_layer_1": 0.028900146484375, "loss_aux_layer_10": 0.05499267578125, "loss_aux_layer_11": 0.05889892578125, "loss_aux_layer_12": 0.0633544921875, "loss_aux_layer_13": 0.06884765625, "loss_aux_layer_14": 0.076904296875, "loss_aux_layer_15": 0.0853271484375, "loss_aux_layer_16": 0.0947265625, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.04083251953125, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.15283203125, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.050537109375, "loss_aux_layer_4": 0.0528564453125, "loss_aux_layer_5": 0.0543212890625, "loss_aux_layer_6": 0.05706787109375, "loss_aux_layer_7": 0.05535888671875, "loss_aux_layer_8": 0.054931640625, "loss_aux_layer_9": 0.05377197265625, "step": 4967, "total_loss": 0.5223856121301651 }, { "epoch": 0.9835676103741833, "grad_norm": 0.8773103952407837, "learning_rate": 5e-05, "llm_loss": 0.6215057671070099, "loss": 2.7879, "loss_aux_layer_0": 0.011444091796875, "loss_aux_layer_1": 0.027740478515625, "loss_aux_layer_10": 0.05352783203125, "loss_aux_layer_11": 0.056884765625, "loss_aux_layer_12": 0.0611572265625, "loss_aux_layer_13": 0.066162109375, "loss_aux_layer_14": 0.07421875, "loss_aux_layer_15": 0.082275390625, "loss_aux_layer_16": 0.091064453125, "loss_aux_layer_17": 0.098388671875, "loss_aux_layer_18": 0.106201171875, "loss_aux_layer_19": 0.1092529296875, "loss_aux_layer_2": 0.03912353515625, "loss_aux_layer_20": 0.1168212890625, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.1455078125, "loss_aux_layer_23": 0.180908203125, "loss_aux_layer_3": 0.048583984375, "loss_aux_layer_4": 0.0506591796875, "loss_aux_layer_5": 0.05224609375, "loss_aux_layer_6": 0.05499267578125, "loss_aux_layer_7": 0.0535888671875, "loss_aux_layer_8": 0.05328369140625, "loss_aux_layer_9": 0.05224609375, "step": 4968, "total_loss": 0.6969637274742126 }, { "epoch": 0.9837655909720847, "grad_norm": 0.7998838424682617, "learning_rate": 5e-05, "llm_loss": 0.44568414986133575, "loss": 2.0822, "loss_aux_layer_0": 0.0095367431640625, "loss_aux_layer_1": 0.026519775390625, "loss_aux_layer_10": 0.0511474609375, "loss_aux_layer_11": 0.0550537109375, "loss_aux_layer_12": 0.05926513671875, "loss_aux_layer_13": 0.06439208984375, "loss_aux_layer_14": 0.072998046875, "loss_aux_layer_15": 0.0819091796875, "loss_aux_layer_16": 0.0911865234375, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.1065673828125, "loss_aux_layer_19": 0.111083984375, "loss_aux_layer_2": 0.0372314453125, "loss_aux_layer_20": 0.1195068359375, "loss_aux_layer_21": 0.1279296875, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.18603515625, "loss_aux_layer_3": 0.04644775390625, "loss_aux_layer_4": 0.0487060546875, "loss_aux_layer_5": 0.04998779296875, "loss_aux_layer_6": 0.05255126953125, "loss_aux_layer_7": 0.0509033203125, "loss_aux_layer_8": 0.05047607421875, "loss_aux_layer_9": 0.04974365234375, "step": 4969, "total_loss": 0.5205574333667755 }, { "epoch": 0.9839635715699862, "grad_norm": 0.9670066237449646, "learning_rate": 5e-05, "llm_loss": 0.494492270052433, "loss": 2.2784, "loss_aux_layer_0": 0.0113677978515625, "loss_aux_layer_1": 0.02740478515625, "loss_aux_layer_10": 0.05279541015625, "loss_aux_layer_11": 0.056396484375, "loss_aux_layer_12": 0.06048583984375, "loss_aux_layer_13": 0.0657958984375, "loss_aux_layer_14": 0.073486328125, "loss_aux_layer_15": 0.081298828125, "loss_aux_layer_16": 0.090087890625, "loss_aux_layer_17": 0.0977783203125, "loss_aux_layer_18": 0.1055908203125, "loss_aux_layer_19": 0.1092529296875, "loss_aux_layer_2": 0.03887939453125, "loss_aux_layer_20": 0.1175537109375, "loss_aux_layer_21": 0.125732421875, "loss_aux_layer_22": 0.14599609375, "loss_aux_layer_23": 0.181640625, "loss_aux_layer_3": 0.04803466796875, "loss_aux_layer_4": 0.05023193359375, "loss_aux_layer_5": 0.0517578125, "loss_aux_layer_6": 0.0545654296875, "loss_aux_layer_7": 0.05279541015625, "loss_aux_layer_8": 0.0526123046875, "loss_aux_layer_9": 0.051513671875, "step": 4970, "total_loss": 0.569605827331543 }, { "epoch": 0.9841615521678876, "grad_norm": 1.054695725440979, "learning_rate": 5e-05, "llm_loss": 0.5181488916277885, "loss": 2.3786, "loss_aux_layer_0": 0.010772705078125, "loss_aux_layer_1": 0.02703857421875, "loss_aux_layer_10": 0.052734375, "loss_aux_layer_11": 0.05633544921875, "loss_aux_layer_12": 0.06048583984375, "loss_aux_layer_13": 0.0657958984375, "loss_aux_layer_14": 0.0743408203125, "loss_aux_layer_15": 0.0831298828125, "loss_aux_layer_16": 0.0927734375, "loss_aux_layer_17": 0.100830078125, "loss_aux_layer_18": 0.1090087890625, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.0390625, "loss_aux_layer_20": 0.1209716796875, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.0482177734375, "loss_aux_layer_4": 0.05072021484375, "loss_aux_layer_5": 0.05206298828125, "loss_aux_layer_6": 0.0546875, "loss_aux_layer_7": 0.0528564453125, "loss_aux_layer_8": 0.05242919921875, "loss_aux_layer_9": 0.0513916015625, "step": 4971, "total_loss": 0.5946558117866516 }, { "epoch": 0.9843595327657889, "grad_norm": 1.005001187324524, "learning_rate": 5e-05, "llm_loss": 0.5578504875302315, "loss": 2.5277, "loss_aux_layer_0": 0.0097198486328125, "loss_aux_layer_1": 0.02685546875, "loss_aux_layer_10": 0.0517578125, "loss_aux_layer_11": 0.05523681640625, "loss_aux_layer_12": 0.0592041015625, "loss_aux_layer_13": 0.06451416015625, "loss_aux_layer_14": 0.072265625, "loss_aux_layer_15": 0.0804443359375, "loss_aux_layer_16": 0.0897216796875, "loss_aux_layer_17": 0.096923828125, "loss_aux_layer_18": 0.10546875, "loss_aux_layer_19": 0.109130859375, "loss_aux_layer_2": 0.03759765625, "loss_aux_layer_20": 0.116943359375, "loss_aux_layer_21": 0.125, "loss_aux_layer_22": 0.145263671875, "loss_aux_layer_23": 0.181884765625, "loss_aux_layer_3": 0.046630859375, "loss_aux_layer_4": 0.04864501953125, "loss_aux_layer_5": 0.04986572265625, "loss_aux_layer_6": 0.052490234375, "loss_aux_layer_7": 0.05096435546875, "loss_aux_layer_8": 0.0506591796875, "loss_aux_layer_9": 0.0504150390625, "step": 4972, "total_loss": 0.6319341361522675 }, { "epoch": 0.9845575133636904, "grad_norm": 1.0174980163574219, "learning_rate": 5e-05, "llm_loss": 0.5505675971508026, "loss": 2.497, "loss_aux_layer_0": 0.01171875, "loss_aux_layer_1": 0.02728271484375, "loss_aux_layer_10": 0.0509033203125, "loss_aux_layer_11": 0.05462646484375, "loss_aux_layer_12": 0.05865478515625, "loss_aux_layer_13": 0.0638427734375, "loss_aux_layer_14": 0.0716552734375, "loss_aux_layer_15": 0.0794677734375, "loss_aux_layer_16": 0.08837890625, "loss_aux_layer_17": 0.09619140625, "loss_aux_layer_18": 0.1043701171875, "loss_aux_layer_19": 0.1085205078125, "loss_aux_layer_2": 0.0374755859375, "loss_aux_layer_20": 0.116455078125, "loss_aux_layer_21": 0.124267578125, "loss_aux_layer_22": 0.143798828125, "loss_aux_layer_23": 0.1796875, "loss_aux_layer_3": 0.04669189453125, "loss_aux_layer_4": 0.048828125, "loss_aux_layer_5": 0.05029296875, "loss_aux_layer_6": 0.05291748046875, "loss_aux_layer_7": 0.051513671875, "loss_aux_layer_8": 0.05084228515625, "loss_aux_layer_9": 0.04974365234375, "step": 4973, "total_loss": 0.6242527514696121 }, { "epoch": 0.9847554939615918, "grad_norm": 0.8364516496658325, "learning_rate": 5e-05, "llm_loss": 0.46804429590702057, "loss": 2.1738, "loss_aux_layer_0": 0.010284423828125, "loss_aux_layer_1": 0.0274658203125, "loss_aux_layer_10": 0.05322265625, "loss_aux_layer_11": 0.056884765625, "loss_aux_layer_12": 0.0609130859375, "loss_aux_layer_13": 0.0657958984375, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.082275390625, "loss_aux_layer_16": 0.0914306640625, "loss_aux_layer_17": 0.098876953125, "loss_aux_layer_18": 0.1068115234375, "loss_aux_layer_19": 0.110107421875, "loss_aux_layer_2": 0.03863525390625, "loss_aux_layer_20": 0.1173095703125, "loss_aux_layer_21": 0.12548828125, "loss_aux_layer_22": 0.145263671875, "loss_aux_layer_23": 0.180419921875, "loss_aux_layer_3": 0.0479736328125, "loss_aux_layer_4": 0.0504150390625, "loss_aux_layer_5": 0.052001953125, "loss_aux_layer_6": 0.054931640625, "loss_aux_layer_7": 0.0533447265625, "loss_aux_layer_8": 0.05316162109375, "loss_aux_layer_9": 0.05218505859375, "step": 4974, "total_loss": 0.5434426367282867 }, { "epoch": 0.9849534745594932, "grad_norm": 1.0401839017868042, "learning_rate": 5e-05, "llm_loss": 0.507871612906456, "loss": 2.3331, "loss_aux_layer_0": 0.0106201171875, "loss_aux_layer_1": 0.0274658203125, "loss_aux_layer_10": 0.05242919921875, "loss_aux_layer_11": 0.055908203125, "loss_aux_layer_12": 0.06005859375, "loss_aux_layer_13": 0.06494140625, "loss_aux_layer_14": 0.0732421875, "loss_aux_layer_15": 0.081298828125, "loss_aux_layer_16": 0.0904541015625, "loss_aux_layer_17": 0.0982666015625, "loss_aux_layer_18": 0.1065673828125, "loss_aux_layer_19": 0.1103515625, "loss_aux_layer_2": 0.03887939453125, "loss_aux_layer_20": 0.1182861328125, "loss_aux_layer_21": 0.1265869140625, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.0482177734375, "loss_aux_layer_4": 0.05047607421875, "loss_aux_layer_5": 0.05218505859375, "loss_aux_layer_6": 0.054931640625, "loss_aux_layer_7": 0.0531005859375, "loss_aux_layer_8": 0.05255126953125, "loss_aux_layer_9": 0.0513916015625, "step": 4975, "total_loss": 0.5832752287387848 }, { "epoch": 0.9851514551573946, "grad_norm": 0.8153311014175415, "learning_rate": 5e-05, "llm_loss": 0.5048631280660629, "loss": 2.3218, "loss_aux_layer_0": 0.00994873046875, "loss_aux_layer_1": 0.02783203125, "loss_aux_layer_10": 0.0535888671875, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.061279296875, "loss_aux_layer_13": 0.06622314453125, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.0816650390625, "loss_aux_layer_16": 0.0904541015625, "loss_aux_layer_17": 0.09765625, "loss_aux_layer_18": 0.1058349609375, "loss_aux_layer_19": 0.1097412109375, "loss_aux_layer_2": 0.03863525390625, "loss_aux_layer_20": 0.1180419921875, "loss_aux_layer_21": 0.1265869140625, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.18212890625, "loss_aux_layer_3": 0.04815673828125, "loss_aux_layer_4": 0.05084228515625, "loss_aux_layer_5": 0.0523681640625, "loss_aux_layer_6": 0.05548095703125, "loss_aux_layer_7": 0.05377197265625, "loss_aux_layer_8": 0.053466796875, "loss_aux_layer_9": 0.0523681640625, "step": 4976, "total_loss": 0.5804531127214432 }, { "epoch": 0.985349435755296, "grad_norm": 1.0259836912155151, "learning_rate": 5e-05, "llm_loss": 0.4928833693265915, "loss": 2.277, "loss_aux_layer_0": 0.01019287109375, "loss_aux_layer_1": 0.02783203125, "loss_aux_layer_10": 0.05303955078125, "loss_aux_layer_11": 0.05657958984375, "loss_aux_layer_12": 0.06060791015625, "loss_aux_layer_13": 0.065673828125, "loss_aux_layer_14": 0.0740966796875, "loss_aux_layer_15": 0.08251953125, "loss_aux_layer_16": 0.091552734375, "loss_aux_layer_17": 0.0994873046875, "loss_aux_layer_18": 0.10791015625, "loss_aux_layer_19": 0.1123046875, "loss_aux_layer_2": 0.03997802734375, "loss_aux_layer_20": 0.1202392578125, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.0491943359375, "loss_aux_layer_4": 0.05133056640625, "loss_aux_layer_5": 0.052490234375, "loss_aux_layer_6": 0.05517578125, "loss_aux_layer_7": 0.0533447265625, "loss_aux_layer_8": 0.05279541015625, "loss_aux_layer_9": 0.0517578125, "step": 4977, "total_loss": 0.5692413747310638 }, { "epoch": 0.9855474163531974, "grad_norm": 0.9746801257133484, "learning_rate": 5e-05, "llm_loss": 0.5671685636043549, "loss": 2.5652, "loss_aux_layer_0": 0.010406494140625, "loss_aux_layer_1": 0.026611328125, "loss_aux_layer_10": 0.05145263671875, "loss_aux_layer_11": 0.05487060546875, "loss_aux_layer_12": 0.05889892578125, "loss_aux_layer_13": 0.06402587890625, "loss_aux_layer_14": 0.072021484375, "loss_aux_layer_15": 0.080078125, "loss_aux_layer_16": 0.089111328125, "loss_aux_layer_17": 0.0966796875, "loss_aux_layer_18": 0.104736328125, "loss_aux_layer_19": 0.108642578125, "loss_aux_layer_2": 0.03802490234375, "loss_aux_layer_20": 0.1165771484375, "loss_aux_layer_21": 0.1256103515625, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.181640625, "loss_aux_layer_3": 0.047119140625, "loss_aux_layer_4": 0.04931640625, "loss_aux_layer_5": 0.0509033203125, "loss_aux_layer_6": 0.0537109375, "loss_aux_layer_7": 0.052001953125, "loss_aux_layer_8": 0.05126953125, "loss_aux_layer_9": 0.0501708984375, "step": 4978, "total_loss": 0.6413021087646484 }, { "epoch": 0.9857453969510988, "grad_norm": 0.8715305924415588, "learning_rate": 5e-05, "llm_loss": 0.5192089751362801, "loss": 2.3829, "loss_aux_layer_0": 0.0095977783203125, "loss_aux_layer_1": 0.027557373046875, "loss_aux_layer_10": 0.05450439453125, "loss_aux_layer_11": 0.05804443359375, "loss_aux_layer_12": 0.0618896484375, "loss_aux_layer_13": 0.06695556640625, "loss_aux_layer_14": 0.07470703125, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.09228515625, "loss_aux_layer_17": 0.0999755859375, "loss_aux_layer_18": 0.1070556640625, "loss_aux_layer_19": 0.1103515625, "loss_aux_layer_2": 0.0396728515625, "loss_aux_layer_20": 0.11865234375, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.0517578125, "loss_aux_layer_5": 0.05340576171875, "loss_aux_layer_6": 0.05633544921875, "loss_aux_layer_7": 0.05450439453125, "loss_aux_layer_8": 0.0538330078125, "loss_aux_layer_9": 0.05316162109375, "step": 4979, "total_loss": 0.5957341343164444 }, { "epoch": 0.9859433775490002, "grad_norm": 0.9483984112739563, "learning_rate": 5e-05, "llm_loss": 0.5892317742109299, "loss": 2.6577, "loss_aux_layer_0": 0.01043701171875, "loss_aux_layer_1": 0.0281982421875, "loss_aux_layer_10": 0.0538330078125, "loss_aux_layer_11": 0.0574951171875, "loss_aux_layer_12": 0.06146240234375, "loss_aux_layer_13": 0.0662841796875, "loss_aux_layer_14": 0.0740966796875, "loss_aux_layer_15": 0.081787109375, "loss_aux_layer_16": 0.0899658203125, "loss_aux_layer_17": 0.0970458984375, "loss_aux_layer_18": 0.104736328125, "loss_aux_layer_19": 0.108154296875, "loss_aux_layer_2": 0.03936767578125, "loss_aux_layer_20": 0.1158447265625, "loss_aux_layer_21": 0.1240234375, "loss_aux_layer_22": 0.1435546875, "loss_aux_layer_23": 0.179443359375, "loss_aux_layer_3": 0.04888916015625, "loss_aux_layer_4": 0.05145263671875, "loss_aux_layer_5": 0.0528564453125, "loss_aux_layer_6": 0.0557861328125, "loss_aux_layer_7": 0.05413818359375, "loss_aux_layer_8": 0.05364990234375, "loss_aux_layer_9": 0.05255126953125, "step": 4980, "total_loss": 0.6644319593906403 }, { "epoch": 0.9861413581469016, "grad_norm": 0.8633547425270081, "learning_rate": 5e-05, "llm_loss": 0.515714481472969, "loss": 2.3645, "loss_aux_layer_0": 0.0092926025390625, "loss_aux_layer_1": 0.02655029296875, "loss_aux_layer_10": 0.05108642578125, "loss_aux_layer_11": 0.05474853515625, "loss_aux_layer_12": 0.05908203125, "loss_aux_layer_13": 0.064697265625, "loss_aux_layer_14": 0.0733642578125, "loss_aux_layer_15": 0.08203125, "loss_aux_layer_16": 0.0911865234375, "loss_aux_layer_17": 0.099365234375, "loss_aux_layer_18": 0.1072998046875, "loss_aux_layer_19": 0.111572265625, "loss_aux_layer_2": 0.037841796875, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.1292724609375, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.188232421875, "loss_aux_layer_3": 0.046875, "loss_aux_layer_4": 0.04931640625, "loss_aux_layer_5": 0.05084228515625, "loss_aux_layer_6": 0.05364990234375, "loss_aux_layer_7": 0.0516357421875, "loss_aux_layer_8": 0.051025390625, "loss_aux_layer_9": 0.04986572265625, "step": 4981, "total_loss": 0.5911184102296829 }, { "epoch": 0.9863393387448031, "grad_norm": 0.9400967955589294, "learning_rate": 5e-05, "llm_loss": 0.5199508592486382, "loss": 2.3855, "loss_aux_layer_0": 0.0111083984375, "loss_aux_layer_1": 0.028472900390625, "loss_aux_layer_10": 0.053955078125, "loss_aux_layer_11": 0.057373046875, "loss_aux_layer_12": 0.06121826171875, "loss_aux_layer_13": 0.06634521484375, "loss_aux_layer_14": 0.0743408203125, "loss_aux_layer_15": 0.0823974609375, "loss_aux_layer_16": 0.0911865234375, "loss_aux_layer_17": 0.0989990234375, "loss_aux_layer_18": 0.1065673828125, "loss_aux_layer_19": 0.1099853515625, "loss_aux_layer_2": 0.0404052734375, "loss_aux_layer_20": 0.11767578125, "loss_aux_layer_21": 0.126953125, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.04974365234375, "loss_aux_layer_4": 0.0521240234375, "loss_aux_layer_5": 0.05340576171875, "loss_aux_layer_6": 0.05621337890625, "loss_aux_layer_7": 0.05462646484375, "loss_aux_layer_8": 0.05401611328125, "loss_aux_layer_9": 0.05279541015625, "step": 4982, "total_loss": 0.5963747352361679 }, { "epoch": 0.9865373193427044, "grad_norm": 0.8736298680305481, "learning_rate": 5e-05, "llm_loss": 0.5385065972805023, "loss": 2.4587, "loss_aux_layer_0": 0.0089263916015625, "loss_aux_layer_1": 0.02734375, "loss_aux_layer_10": 0.05303955078125, "loss_aux_layer_11": 0.05682373046875, "loss_aux_layer_12": 0.0611572265625, "loss_aux_layer_13": 0.066650390625, "loss_aux_layer_14": 0.0748291015625, "loss_aux_layer_15": 0.0831298828125, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.100341796875, "loss_aux_layer_18": 0.10791015625, "loss_aux_layer_19": 0.111572265625, "loss_aux_layer_2": 0.03814697265625, "loss_aux_layer_20": 0.1197509765625, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.047607421875, "loss_aux_layer_4": 0.04998779296875, "loss_aux_layer_5": 0.05133056640625, "loss_aux_layer_6": 0.0540771484375, "loss_aux_layer_7": 0.05267333984375, "loss_aux_layer_8": 0.052490234375, "loss_aux_layer_9": 0.051513671875, "step": 4983, "total_loss": 0.6146869659423828 }, { "epoch": 0.9867352999406058, "grad_norm": 0.931214451789856, "learning_rate": 5e-05, "llm_loss": 0.5818377584218979, "loss": 2.6272, "loss_aux_layer_0": 0.0104522705078125, "loss_aux_layer_1": 0.028289794921875, "loss_aux_layer_10": 0.0531005859375, "loss_aux_layer_11": 0.05657958984375, "loss_aux_layer_12": 0.0606689453125, "loss_aux_layer_13": 0.065673828125, "loss_aux_layer_14": 0.073486328125, "loss_aux_layer_15": 0.0814208984375, "loss_aux_layer_16": 0.0899658203125, "loss_aux_layer_17": 0.0977783203125, "loss_aux_layer_18": 0.105224609375, "loss_aux_layer_19": 0.108642578125, "loss_aux_layer_2": 0.0390625, "loss_aux_layer_20": 0.115966796875, "loss_aux_layer_21": 0.12353515625, "loss_aux_layer_22": 0.143798828125, "loss_aux_layer_23": 0.177978515625, "loss_aux_layer_3": 0.04876708984375, "loss_aux_layer_4": 0.05133056640625, "loss_aux_layer_5": 0.052734375, "loss_aux_layer_6": 0.05560302734375, "loss_aux_layer_7": 0.05389404296875, "loss_aux_layer_8": 0.0533447265625, "loss_aux_layer_9": 0.052001953125, "step": 4984, "total_loss": 0.6568045169115067 }, { "epoch": 0.9869332805385073, "grad_norm": 0.8020797371864319, "learning_rate": 5e-05, "llm_loss": 0.5746279954910278, "loss": 2.6033, "loss_aux_layer_0": 0.0096588134765625, "loss_aux_layer_1": 0.026947021484375, "loss_aux_layer_10": 0.05377197265625, "loss_aux_layer_11": 0.0574951171875, "loss_aux_layer_12": 0.06146240234375, "loss_aux_layer_13": 0.0665283203125, "loss_aux_layer_14": 0.074462890625, "loss_aux_layer_15": 0.082275390625, "loss_aux_layer_16": 0.0911865234375, "loss_aux_layer_17": 0.098388671875, "loss_aux_layer_18": 0.106689453125, "loss_aux_layer_19": 0.1099853515625, "loss_aux_layer_2": 0.0389404296875, "loss_aux_layer_20": 0.1181640625, "loss_aux_layer_21": 0.1280517578125, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.048583984375, "loss_aux_layer_4": 0.0511474609375, "loss_aux_layer_5": 0.05267333984375, "loss_aux_layer_6": 0.0557861328125, "loss_aux_layer_7": 0.053955078125, "loss_aux_layer_8": 0.053466796875, "loss_aux_layer_9": 0.05267333984375, "step": 4985, "total_loss": 0.6508137434720993 }, { "epoch": 0.9871312611364086, "grad_norm": 0.828112006187439, "learning_rate": 5e-05, "llm_loss": 0.564889594912529, "loss": 2.5656, "loss_aux_layer_0": 0.0096435546875, "loss_aux_layer_1": 0.0283203125, "loss_aux_layer_10": 0.0548095703125, "loss_aux_layer_11": 0.05859375, "loss_aux_layer_12": 0.06292724609375, "loss_aux_layer_13": 0.0677490234375, "loss_aux_layer_14": 0.075439453125, "loss_aux_layer_15": 0.0831298828125, "loss_aux_layer_16": 0.091796875, "loss_aux_layer_17": 0.0987548828125, "loss_aux_layer_18": 0.106689453125, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.03973388671875, "loss_aux_layer_20": 0.11767578125, "loss_aux_layer_21": 0.12548828125, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.182373046875, "loss_aux_layer_3": 0.0496826171875, "loss_aux_layer_4": 0.05242919921875, "loss_aux_layer_5": 0.0538330078125, "loss_aux_layer_6": 0.05682373046875, "loss_aux_layer_7": 0.05511474609375, "loss_aux_layer_8": 0.05438232421875, "loss_aux_layer_9": 0.05322265625, "step": 4986, "total_loss": 0.6413892358541489 }, { "epoch": 0.98732924173431, "grad_norm": 0.8507399559020996, "learning_rate": 5e-05, "llm_loss": 0.6330264508724213, "loss": 2.8275, "loss_aux_layer_0": 0.009368896484375, "loss_aux_layer_1": 0.025909423828125, "loss_aux_layer_10": 0.0509033203125, "loss_aux_layer_11": 0.054443359375, "loss_aux_layer_12": 0.05865478515625, "loss_aux_layer_13": 0.06427001953125, "loss_aux_layer_14": 0.0723876953125, "loss_aux_layer_15": 0.0806884765625, "loss_aux_layer_16": 0.09033203125, "loss_aux_layer_17": 0.098388671875, "loss_aux_layer_18": 0.106689453125, "loss_aux_layer_19": 0.1107177734375, "loss_aux_layer_2": 0.03662109375, "loss_aux_layer_20": 0.1182861328125, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.1435546875, "loss_aux_layer_23": 0.1787109375, "loss_aux_layer_3": 0.0457763671875, "loss_aux_layer_4": 0.04803466796875, "loss_aux_layer_5": 0.0496826171875, "loss_aux_layer_6": 0.05242919921875, "loss_aux_layer_7": 0.05084228515625, "loss_aux_layer_8": 0.0504150390625, "loss_aux_layer_9": 0.0494384765625, "step": 4987, "total_loss": 0.7068627029657364 }, { "epoch": 0.9875272223322115, "grad_norm": 0.8090927600860596, "learning_rate": 5e-05, "llm_loss": 0.5957184284925461, "loss": 2.6808, "loss_aux_layer_0": 0.0097808837890625, "loss_aux_layer_1": 0.027557373046875, "loss_aux_layer_10": 0.05267333984375, "loss_aux_layer_11": 0.05615234375, "loss_aux_layer_12": 0.06024169921875, "loss_aux_layer_13": 0.06524658203125, "loss_aux_layer_14": 0.0732421875, "loss_aux_layer_15": 0.0809326171875, "loss_aux_layer_16": 0.0897216796875, "loss_aux_layer_17": 0.09716796875, "loss_aux_layer_18": 0.105712890625, "loss_aux_layer_19": 0.108642578125, "loss_aux_layer_2": 0.038330078125, "loss_aux_layer_20": 0.115966796875, "loss_aux_layer_21": 0.123779296875, "loss_aux_layer_22": 0.143798828125, "loss_aux_layer_23": 0.179443359375, "loss_aux_layer_3": 0.04742431640625, "loss_aux_layer_4": 0.050048828125, "loss_aux_layer_5": 0.05133056640625, "loss_aux_layer_6": 0.0540771484375, "loss_aux_layer_7": 0.052734375, "loss_aux_layer_8": 0.0523681640625, "loss_aux_layer_9": 0.0513916015625, "step": 4988, "total_loss": 0.6702106446027756 }, { "epoch": 0.9877252029301129, "grad_norm": 0.7682209610939026, "learning_rate": 5e-05, "llm_loss": 0.5682346448302269, "loss": 2.5787, "loss_aux_layer_0": 0.009368896484375, "loss_aux_layer_1": 0.0286865234375, "loss_aux_layer_10": 0.053955078125, "loss_aux_layer_11": 0.0576171875, "loss_aux_layer_12": 0.061767578125, "loss_aux_layer_13": 0.06689453125, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.09228515625, "loss_aux_layer_17": 0.099365234375, "loss_aux_layer_18": 0.1072998046875, "loss_aux_layer_19": 0.110595703125, "loss_aux_layer_2": 0.04071044921875, "loss_aux_layer_20": 0.118408203125, "loss_aux_layer_21": 0.126708984375, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.050048828125, "loss_aux_layer_4": 0.05218505859375, "loss_aux_layer_5": 0.053466796875, "loss_aux_layer_6": 0.0562744140625, "loss_aux_layer_7": 0.0545654296875, "loss_aux_layer_8": 0.05401611328125, "loss_aux_layer_9": 0.052734375, "step": 4989, "total_loss": 0.6446837335824966 }, { "epoch": 0.9879231835280142, "grad_norm": 0.9416714906692505, "learning_rate": 5e-05, "llm_loss": 0.5961472541093826, "loss": 2.6836, "loss_aux_layer_0": 0.010162353515625, "loss_aux_layer_1": 0.0279541015625, "loss_aux_layer_10": 0.0528564453125, "loss_aux_layer_11": 0.056396484375, "loss_aux_layer_12": 0.0604248046875, "loss_aux_layer_13": 0.06524658203125, "loss_aux_layer_14": 0.0728759765625, "loss_aux_layer_15": 0.080810546875, "loss_aux_layer_16": 0.0897216796875, "loss_aux_layer_17": 0.0975341796875, "loss_aux_layer_18": 0.1055908203125, "loss_aux_layer_19": 0.1087646484375, "loss_aux_layer_2": 0.039794921875, "loss_aux_layer_20": 0.1162109375, "loss_aux_layer_21": 0.12353515625, "loss_aux_layer_22": 0.14306640625, "loss_aux_layer_23": 0.17822265625, "loss_aux_layer_3": 0.0487060546875, "loss_aux_layer_4": 0.051025390625, "loss_aux_layer_5": 0.052490234375, "loss_aux_layer_6": 0.05517578125, "loss_aux_layer_7": 0.05340576171875, "loss_aux_layer_8": 0.0528564453125, "loss_aux_layer_9": 0.05157470703125, "step": 4990, "total_loss": 0.6708997488021851 }, { "epoch": 0.9881211641259157, "grad_norm": 0.7988898754119873, "learning_rate": 5e-05, "llm_loss": 0.5597087889909744, "loss": 2.5428, "loss_aux_layer_0": 0.009368896484375, "loss_aux_layer_1": 0.028106689453125, "loss_aux_layer_10": 0.05364990234375, "loss_aux_layer_11": 0.0574951171875, "loss_aux_layer_12": 0.0618896484375, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.0830078125, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.099853515625, "loss_aux_layer_18": 0.107421875, "loss_aux_layer_19": 0.1107177734375, "loss_aux_layer_2": 0.0390625, "loss_aux_layer_20": 0.1182861328125, "loss_aux_layer_21": 0.126220703125, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.180419921875, "loss_aux_layer_3": 0.048583984375, "loss_aux_layer_4": 0.05120849609375, "loss_aux_layer_5": 0.052734375, "loss_aux_layer_6": 0.05548095703125, "loss_aux_layer_7": 0.05389404296875, "loss_aux_layer_8": 0.05340576171875, "loss_aux_layer_9": 0.0523681640625, "step": 4991, "total_loss": 0.6357016861438751 }, { "epoch": 0.9883191447238171, "grad_norm": 0.8553196787834167, "learning_rate": 5e-05, "llm_loss": 0.602433055639267, "loss": 2.7049, "loss_aux_layer_0": 0.0098876953125, "loss_aux_layer_1": 0.026763916015625, "loss_aux_layer_10": 0.052001953125, "loss_aux_layer_11": 0.05548095703125, "loss_aux_layer_12": 0.0594482421875, "loss_aux_layer_13": 0.0643310546875, "loss_aux_layer_14": 0.072021484375, "loss_aux_layer_15": 0.0802001953125, "loss_aux_layer_16": 0.0888671875, "loss_aux_layer_17": 0.0966796875, "loss_aux_layer_18": 0.1044921875, "loss_aux_layer_19": 0.108154296875, "loss_aux_layer_2": 0.03778076171875, "loss_aux_layer_20": 0.11572265625, "loss_aux_layer_21": 0.1234130859375, "loss_aux_layer_22": 0.142822265625, "loss_aux_layer_23": 0.177978515625, "loss_aux_layer_3": 0.047119140625, "loss_aux_layer_4": 0.04937744140625, "loss_aux_layer_5": 0.0506591796875, "loss_aux_layer_6": 0.05340576171875, "loss_aux_layer_7": 0.0518798828125, "loss_aux_layer_8": 0.05157470703125, "loss_aux_layer_9": 0.0506591796875, "step": 4992, "total_loss": 0.6762216836214066 }, { "epoch": 0.9885171253217184, "grad_norm": 0.8612894415855408, "learning_rate": 5e-05, "llm_loss": 0.5349451079964638, "loss": 2.4404, "loss_aux_layer_0": 0.009521484375, "loss_aux_layer_1": 0.02838134765625, "loss_aux_layer_10": 0.05328369140625, "loss_aux_layer_11": 0.05731201171875, "loss_aux_layer_12": 0.061279296875, "loss_aux_layer_13": 0.06597900390625, "loss_aux_layer_14": 0.073486328125, "loss_aux_layer_15": 0.08154296875, "loss_aux_layer_16": 0.0904541015625, "loss_aux_layer_17": 0.09765625, "loss_aux_layer_18": 0.1053466796875, "loss_aux_layer_19": 0.1085205078125, "loss_aux_layer_2": 0.03948974609375, "loss_aux_layer_20": 0.1160888671875, "loss_aux_layer_21": 0.1240234375, "loss_aux_layer_22": 0.14453125, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.048828125, "loss_aux_layer_4": 0.05120849609375, "loss_aux_layer_5": 0.05255126953125, "loss_aux_layer_6": 0.05517578125, "loss_aux_layer_7": 0.05364990234375, "loss_aux_layer_8": 0.0531005859375, "loss_aux_layer_9": 0.052001953125, "step": 4993, "total_loss": 0.6101113259792328 }, { "epoch": 0.9887151059196199, "grad_norm": 0.9128947257995605, "learning_rate": 5e-05, "llm_loss": 0.5543320626020432, "loss": 2.5357, "loss_aux_layer_0": 0.0101318359375, "loss_aux_layer_1": 0.0303955078125, "loss_aux_layer_10": 0.0572509765625, "loss_aux_layer_11": 0.0614013671875, "loss_aux_layer_12": 0.06573486328125, "loss_aux_layer_13": 0.07080078125, "loss_aux_layer_14": 0.0787353515625, "loss_aux_layer_15": 0.0867919921875, "loss_aux_layer_16": 0.095703125, "loss_aux_layer_17": 0.1029052734375, "loss_aux_layer_18": 0.1102294921875, "loss_aux_layer_19": 0.1134033203125, "loss_aux_layer_2": 0.04254150390625, "loss_aux_layer_20": 0.12109375, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.05255126953125, "loss_aux_layer_4": 0.0555419921875, "loss_aux_layer_5": 0.05670166015625, "loss_aux_layer_6": 0.05963134765625, "loss_aux_layer_7": 0.05792236328125, "loss_aux_layer_8": 0.0572509765625, "loss_aux_layer_9": 0.0560302734375, "step": 4994, "total_loss": 0.633935734629631 }, { "epoch": 0.9889130865175213, "grad_norm": 0.8695858716964722, "learning_rate": 5e-05, "llm_loss": 0.47139232605695724, "loss": 2.1837, "loss_aux_layer_0": 0.0102386474609375, "loss_aux_layer_1": 0.02728271484375, "loss_aux_layer_10": 0.05194091796875, "loss_aux_layer_11": 0.05535888671875, "loss_aux_layer_12": 0.05926513671875, "loss_aux_layer_13": 0.064208984375, "loss_aux_layer_14": 0.0721435546875, "loss_aux_layer_15": 0.0799560546875, "loss_aux_layer_16": 0.0888671875, "loss_aux_layer_17": 0.0965576171875, "loss_aux_layer_18": 0.1048583984375, "loss_aux_layer_19": 0.10888671875, "loss_aux_layer_2": 0.038818359375, "loss_aux_layer_20": 0.116943359375, "loss_aux_layer_21": 0.12548828125, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.04803466796875, "loss_aux_layer_4": 0.05035400390625, "loss_aux_layer_5": 0.05157470703125, "loss_aux_layer_6": 0.0540771484375, "loss_aux_layer_7": 0.05218505859375, "loss_aux_layer_8": 0.0517578125, "loss_aux_layer_9": 0.05078125, "step": 4995, "total_loss": 0.5459318310022354 }, { "epoch": 0.9891110671154227, "grad_norm": 0.8944112062454224, "learning_rate": 5e-05, "llm_loss": 0.5018304064869881, "loss": 2.3098, "loss_aux_layer_0": 0.0102081298828125, "loss_aux_layer_1": 0.027374267578125, "loss_aux_layer_10": 0.05242919921875, "loss_aux_layer_11": 0.05596923828125, "loss_aux_layer_12": 0.06024169921875, "loss_aux_layer_13": 0.0655517578125, "loss_aux_layer_14": 0.0736083984375, "loss_aux_layer_15": 0.0819091796875, "loss_aux_layer_16": 0.090576171875, "loss_aux_layer_17": 0.0982666015625, "loss_aux_layer_18": 0.1064453125, "loss_aux_layer_19": 0.1103515625, "loss_aux_layer_2": 0.03887939453125, "loss_aux_layer_20": 0.1185302734375, "loss_aux_layer_21": 0.1278076171875, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.04815673828125, "loss_aux_layer_4": 0.05023193359375, "loss_aux_layer_5": 0.05145263671875, "loss_aux_layer_6": 0.0543212890625, "loss_aux_layer_7": 0.05255126953125, "loss_aux_layer_8": 0.05206298828125, "loss_aux_layer_9": 0.05120849609375, "step": 4996, "total_loss": 0.5774449035525322 }, { "epoch": 0.9893090477133241, "grad_norm": 0.7883355617523193, "learning_rate": 5e-05, "llm_loss": 0.6199118793010712, "loss": 2.7883, "loss_aux_layer_0": 0.0101470947265625, "loss_aux_layer_1": 0.02838134765625, "loss_aux_layer_10": 0.0548095703125, "loss_aux_layer_11": 0.05859375, "loss_aux_layer_12": 0.062744140625, "loss_aux_layer_13": 0.0682373046875, "loss_aux_layer_14": 0.076416015625, "loss_aux_layer_15": 0.084228515625, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.1005859375, "loss_aux_layer_18": 0.1082763671875, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.040283203125, "loss_aux_layer_20": 0.1195068359375, "loss_aux_layer_21": 0.126953125, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.05010986328125, "loss_aux_layer_4": 0.05255126953125, "loss_aux_layer_5": 0.05389404296875, "loss_aux_layer_6": 0.05645751953125, "loss_aux_layer_7": 0.05499267578125, "loss_aux_layer_8": 0.054443359375, "loss_aux_layer_9": 0.053466796875, "step": 4997, "total_loss": 0.6970851123332977 }, { "epoch": 0.9895070283112255, "grad_norm": 0.6660741567611694, "learning_rate": 5e-05, "llm_loss": 0.5241422653198242, "loss": 2.4067, "loss_aux_layer_0": 0.0095062255859375, "loss_aux_layer_1": 0.028533935546875, "loss_aux_layer_10": 0.05462646484375, "loss_aux_layer_11": 0.05841064453125, "loss_aux_layer_12": 0.06268310546875, "loss_aux_layer_13": 0.0675048828125, "loss_aux_layer_14": 0.07568359375, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.093017578125, "loss_aux_layer_17": 0.100341796875, "loss_aux_layer_18": 0.108154296875, "loss_aux_layer_19": 0.1121826171875, "loss_aux_layer_2": 0.04046630859375, "loss_aux_layer_20": 0.119873046875, "loss_aux_layer_21": 0.1292724609375, "loss_aux_layer_22": 0.1513671875, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.05010986328125, "loss_aux_layer_4": 0.05267333984375, "loss_aux_layer_5": 0.0540771484375, "loss_aux_layer_6": 0.0567626953125, "loss_aux_layer_7": 0.05499267578125, "loss_aux_layer_8": 0.0543212890625, "loss_aux_layer_9": 0.05328369140625, "step": 4998, "total_loss": 0.6016818135976791 }, { "epoch": 0.9897050089091269, "grad_norm": 0.8099021911621094, "learning_rate": 5e-05, "llm_loss": 0.6168124079704285, "loss": 2.7835, "loss_aux_layer_0": 0.0101165771484375, "loss_aux_layer_1": 0.029754638671875, "loss_aux_layer_10": 0.05694580078125, "loss_aux_layer_11": 0.060791015625, "loss_aux_layer_12": 0.0648193359375, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.0855712890625, "loss_aux_layer_16": 0.0946044921875, "loss_aux_layer_17": 0.102294921875, "loss_aux_layer_18": 0.110107421875, "loss_aux_layer_19": 0.1136474609375, "loss_aux_layer_2": 0.0413818359375, "loss_aux_layer_20": 0.121337890625, "loss_aux_layer_21": 0.129638671875, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.187744140625, "loss_aux_layer_3": 0.05133056640625, "loss_aux_layer_4": 0.05413818359375, "loss_aux_layer_5": 0.0556640625, "loss_aux_layer_6": 0.0589599609375, "loss_aux_layer_7": 0.057373046875, "loss_aux_layer_8": 0.0567626953125, "loss_aux_layer_9": 0.0556640625, "step": 4999, "total_loss": 0.6958785951137543 }, { "epoch": 0.9899029895070283, "grad_norm": 0.6772750616073608, "learning_rate": 5e-05, "llm_loss": 0.4920489564538002, "loss": 2.2664, "loss_aux_layer_0": 0.009613037109375, "loss_aux_layer_1": 0.02740478515625, "loss_aux_layer_10": 0.05267333984375, "loss_aux_layer_11": 0.0562744140625, "loss_aux_layer_12": 0.060302734375, "loss_aux_layer_13": 0.0653076171875, "loss_aux_layer_14": 0.0732421875, "loss_aux_layer_15": 0.0809326171875, "loss_aux_layer_16": 0.08984375, "loss_aux_layer_17": 0.0972900390625, "loss_aux_layer_18": 0.10546875, "loss_aux_layer_19": 0.1087646484375, "loss_aux_layer_2": 0.03863525390625, "loss_aux_layer_20": 0.116455078125, "loss_aux_layer_21": 0.12353515625, "loss_aux_layer_22": 0.142822265625, "loss_aux_layer_23": 0.177978515625, "loss_aux_layer_3": 0.048095703125, "loss_aux_layer_4": 0.050537109375, "loss_aux_layer_5": 0.05194091796875, "loss_aux_layer_6": 0.0548095703125, "loss_aux_layer_7": 0.0531005859375, "loss_aux_layer_8": 0.05255126953125, "loss_aux_layer_9": 0.0513916015625, "step": 5000, "total_loss": 0.5665933042764664 }, { "epoch": 0.9901009701049297, "grad_norm": 0.7597166895866394, "learning_rate": 5e-05, "llm_loss": 0.5300319269299507, "loss": 2.4381, "loss_aux_layer_0": 0.009765625, "loss_aux_layer_1": 0.029815673828125, "loss_aux_layer_10": 0.0582275390625, "loss_aux_layer_11": 0.06201171875, "loss_aux_layer_12": 0.066162109375, "loss_aux_layer_13": 0.0712890625, "loss_aux_layer_14": 0.078857421875, "loss_aux_layer_15": 0.0865478515625, "loss_aux_layer_16": 0.0953369140625, "loss_aux_layer_17": 0.102294921875, "loss_aux_layer_18": 0.110107421875, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.0419921875, "loss_aux_layer_20": 0.1202392578125, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.0523681640625, "loss_aux_layer_4": 0.05517578125, "loss_aux_layer_5": 0.05694580078125, "loss_aux_layer_6": 0.06024169921875, "loss_aux_layer_7": 0.05865478515625, "loss_aux_layer_8": 0.0579833984375, "loss_aux_layer_9": 0.0570068359375, "step": 5001, "total_loss": 0.6095294952392578 }, { "epoch": 0.9902989507028311, "grad_norm": 0.7368273735046387, "learning_rate": 5e-05, "llm_loss": 0.5086078122258186, "loss": 2.3386, "loss_aux_layer_0": 0.0097198486328125, "loss_aux_layer_1": 0.027191162109375, "loss_aux_layer_10": 0.05303955078125, "loss_aux_layer_11": 0.0567626953125, "loss_aux_layer_12": 0.06085205078125, "loss_aux_layer_13": 0.06591796875, "loss_aux_layer_14": 0.0743408203125, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.0994873046875, "loss_aux_layer_18": 0.107666015625, "loss_aux_layer_19": 0.1114501953125, "loss_aux_layer_2": 0.03875732421875, "loss_aux_layer_20": 0.1197509765625, "loss_aux_layer_21": 0.128173828125, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.048095703125, "loss_aux_layer_4": 0.05047607421875, "loss_aux_layer_5": 0.052001953125, "loss_aux_layer_6": 0.0550537109375, "loss_aux_layer_7": 0.05340576171875, "loss_aux_layer_8": 0.05279541015625, "loss_aux_layer_9": 0.052001953125, "step": 5002, "total_loss": 0.5846451222896576 }, { "epoch": 0.9904969313007326, "grad_norm": 0.7289026975631714, "learning_rate": 5e-05, "llm_loss": 0.634006530046463, "loss": 2.8393, "loss_aux_layer_0": 0.009490966796875, "loss_aux_layer_1": 0.0274658203125, "loss_aux_layer_10": 0.05340576171875, "loss_aux_layer_11": 0.05731201171875, "loss_aux_layer_12": 0.0616455078125, "loss_aux_layer_13": 0.0670166015625, "loss_aux_layer_14": 0.0753173828125, "loss_aux_layer_15": 0.0831298828125, "loss_aux_layer_16": 0.0927734375, "loss_aux_layer_17": 0.1005859375, "loss_aux_layer_18": 0.1083984375, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.0382080078125, "loss_aux_layer_20": 0.1195068359375, "loss_aux_layer_21": 0.1275634765625, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.180419921875, "loss_aux_layer_3": 0.04736328125, "loss_aux_layer_4": 0.0499267578125, "loss_aux_layer_5": 0.0513916015625, "loss_aux_layer_6": 0.05450439453125, "loss_aux_layer_7": 0.0528564453125, "loss_aux_layer_8": 0.0526123046875, "loss_aux_layer_9": 0.05194091796875, "step": 5003, "total_loss": 0.7098328769207001 }, { "epoch": 0.9906949118986339, "grad_norm": 0.8455026149749756, "learning_rate": 5e-05, "llm_loss": 0.49549708515405655, "loss": 2.2886, "loss_aux_layer_0": 0.0099639892578125, "loss_aux_layer_1": 0.02850341796875, "loss_aux_layer_10": 0.05389404296875, "loss_aux_layer_11": 0.0577392578125, "loss_aux_layer_12": 0.0616455078125, "loss_aux_layer_13": 0.06671142578125, "loss_aux_layer_14": 0.0748291015625, "loss_aux_layer_15": 0.0830078125, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.10009765625, "loss_aux_layer_18": 0.1077880859375, "loss_aux_layer_19": 0.1114501953125, "loss_aux_layer_2": 0.0396728515625, "loss_aux_layer_20": 0.119384765625, "loss_aux_layer_21": 0.1273193359375, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.0494384765625, "loss_aux_layer_4": 0.052001953125, "loss_aux_layer_5": 0.053466796875, "loss_aux_layer_6": 0.05633544921875, "loss_aux_layer_7": 0.05438232421875, "loss_aux_layer_8": 0.05377197265625, "loss_aux_layer_9": 0.052734375, "step": 5004, "total_loss": 0.5721463561058044 }, { "epoch": 0.9908928924965353, "grad_norm": 0.9509952664375305, "learning_rate": 5e-05, "llm_loss": 0.5991428047418594, "loss": 2.709, "loss_aux_layer_0": 0.00933837890625, "loss_aux_layer_1": 0.027435302734375, "loss_aux_layer_10": 0.05401611328125, "loss_aux_layer_11": 0.05792236328125, "loss_aux_layer_12": 0.0626220703125, "loss_aux_layer_13": 0.0677490234375, "loss_aux_layer_14": 0.07666015625, "loss_aux_layer_15": 0.0859375, "loss_aux_layer_16": 0.0955810546875, "loss_aux_layer_17": 0.104248046875, "loss_aux_layer_18": 0.1123046875, "loss_aux_layer_19": 0.1158447265625, "loss_aux_layer_2": 0.0394287109375, "loss_aux_layer_20": 0.1236572265625, "loss_aux_layer_21": 0.1312255859375, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.049072265625, "loss_aux_layer_4": 0.05157470703125, "loss_aux_layer_5": 0.05322265625, "loss_aux_layer_6": 0.05633544921875, "loss_aux_layer_7": 0.05450439453125, "loss_aux_layer_8": 0.0540771484375, "loss_aux_layer_9": 0.05267333984375, "step": 5005, "total_loss": 0.6772445142269135 }, { "epoch": 0.9910908730944368, "grad_norm": 0.9458484649658203, "learning_rate": 5e-05, "llm_loss": 0.6105050444602966, "loss": 2.7446, "loss_aux_layer_0": 0.0111846923828125, "loss_aux_layer_1": 0.027923583984375, "loss_aux_layer_10": 0.05291748046875, "loss_aux_layer_11": 0.05645751953125, "loss_aux_layer_12": 0.06085205078125, "loss_aux_layer_13": 0.0660400390625, "loss_aux_layer_14": 0.0743408203125, "loss_aux_layer_15": 0.0826416015625, "loss_aux_layer_16": 0.09228515625, "loss_aux_layer_17": 0.0992431640625, "loss_aux_layer_18": 0.10693359375, "loss_aux_layer_19": 0.1099853515625, "loss_aux_layer_2": 0.0396728515625, "loss_aux_layer_20": 0.117431640625, "loss_aux_layer_21": 0.1246337890625, "loss_aux_layer_22": 0.14453125, "loss_aux_layer_23": 0.18017578125, "loss_aux_layer_3": 0.04925537109375, "loss_aux_layer_4": 0.051513671875, "loss_aux_layer_5": 0.0531005859375, "loss_aux_layer_6": 0.055419921875, "loss_aux_layer_7": 0.05364990234375, "loss_aux_layer_8": 0.0531005859375, "loss_aux_layer_9": 0.05194091796875, "step": 5006, "total_loss": 0.6861590594053268 }, { "epoch": 0.9912888536923381, "grad_norm": 0.932015597820282, "learning_rate": 5e-05, "llm_loss": 0.5726791620254517, "loss": 2.5961, "loss_aux_layer_0": 0.009796142578125, "loss_aux_layer_1": 0.027862548828125, "loss_aux_layer_10": 0.05303955078125, "loss_aux_layer_11": 0.0567626953125, "loss_aux_layer_12": 0.0611572265625, "loss_aux_layer_13": 0.0665283203125, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.093017578125, "loss_aux_layer_17": 0.100341796875, "loss_aux_layer_18": 0.108642578125, "loss_aux_layer_19": 0.1121826171875, "loss_aux_layer_2": 0.0389404296875, "loss_aux_layer_20": 0.119873046875, "loss_aux_layer_21": 0.1279296875, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.048583984375, "loss_aux_layer_4": 0.05096435546875, "loss_aux_layer_5": 0.05224609375, "loss_aux_layer_6": 0.054931640625, "loss_aux_layer_7": 0.05322265625, "loss_aux_layer_8": 0.05279541015625, "loss_aux_layer_9": 0.05194091796875, "step": 5007, "total_loss": 0.649027869105339 }, { "epoch": 0.9914868342902395, "grad_norm": 1.269985318183899, "learning_rate": 5e-05, "llm_loss": 0.5610308349132538, "loss": 2.5456, "loss_aux_layer_0": 0.0112457275390625, "loss_aux_layer_1": 0.027557373046875, "loss_aux_layer_10": 0.052734375, "loss_aux_layer_11": 0.05657958984375, "loss_aux_layer_12": 0.06085205078125, "loss_aux_layer_13": 0.0662841796875, "loss_aux_layer_14": 0.0745849609375, "loss_aux_layer_15": 0.0826416015625, "loss_aux_layer_16": 0.091796875, "loss_aux_layer_17": 0.099609375, "loss_aux_layer_18": 0.1077880859375, "loss_aux_layer_19": 0.1104736328125, "loss_aux_layer_2": 0.03857421875, "loss_aux_layer_20": 0.1180419921875, "loss_aux_layer_21": 0.1253662109375, "loss_aux_layer_22": 0.144287109375, "loss_aux_layer_23": 0.1787109375, "loss_aux_layer_3": 0.04803466796875, "loss_aux_layer_4": 0.0504150390625, "loss_aux_layer_5": 0.05169677734375, "loss_aux_layer_6": 0.0546875, "loss_aux_layer_7": 0.052978515625, "loss_aux_layer_8": 0.05242919921875, "loss_aux_layer_9": 0.05157470703125, "step": 5008, "total_loss": 0.6363900303840637 }, { "epoch": 0.991684814888141, "grad_norm": 1.0899450778961182, "learning_rate": 5e-05, "llm_loss": 0.5853305160999298, "loss": 2.6459, "loss_aux_layer_0": 0.0108795166015625, "loss_aux_layer_1": 0.028594970703125, "loss_aux_layer_10": 0.05377197265625, "loss_aux_layer_11": 0.0574951171875, "loss_aux_layer_12": 0.06170654296875, "loss_aux_layer_13": 0.0665283203125, "loss_aux_layer_14": 0.0745849609375, "loss_aux_layer_15": 0.0823974609375, "loss_aux_layer_16": 0.0911865234375, "loss_aux_layer_17": 0.098876953125, "loss_aux_layer_18": 0.1070556640625, "loss_aux_layer_19": 0.1102294921875, "loss_aux_layer_2": 0.03936767578125, "loss_aux_layer_20": 0.1185302734375, "loss_aux_layer_21": 0.1263427734375, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.048828125, "loss_aux_layer_4": 0.05145263671875, "loss_aux_layer_5": 0.052978515625, "loss_aux_layer_6": 0.0560302734375, "loss_aux_layer_7": 0.05419921875, "loss_aux_layer_8": 0.05364990234375, "loss_aux_layer_9": 0.052490234375, "step": 5009, "total_loss": 0.6614863350987434 }, { "epoch": 0.9918827954860424, "grad_norm": 1.2125065326690674, "learning_rate": 5e-05, "llm_loss": 0.5126902014017105, "loss": 2.3598, "loss_aux_layer_0": 0.011993408203125, "loss_aux_layer_1": 0.02923583984375, "loss_aux_layer_10": 0.053955078125, "loss_aux_layer_11": 0.0577392578125, "loss_aux_layer_12": 0.06182861328125, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.07568359375, "loss_aux_layer_15": 0.0843505859375, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.1011962890625, "loss_aux_layer_18": 0.1092529296875, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.0408935546875, "loss_aux_layer_20": 0.1197509765625, "loss_aux_layer_21": 0.127685546875, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.0504150390625, "loss_aux_layer_4": 0.05242919921875, "loss_aux_layer_5": 0.0538330078125, "loss_aux_layer_6": 0.05645751953125, "loss_aux_layer_7": 0.05474853515625, "loss_aux_layer_8": 0.05401611328125, "loss_aux_layer_9": 0.05279541015625, "step": 5010, "total_loss": 0.5899442732334137 }, { "epoch": 0.9920807760839437, "grad_norm": 0.9175985455513, "learning_rate": 5e-05, "llm_loss": 0.4821041077375412, "loss": 2.2381, "loss_aux_layer_0": 0.010345458984375, "loss_aux_layer_1": 0.029144287109375, "loss_aux_layer_10": 0.054931640625, "loss_aux_layer_11": 0.0589599609375, "loss_aux_layer_12": 0.06298828125, "loss_aux_layer_13": 0.0679931640625, "loss_aux_layer_14": 0.0760498046875, "loss_aux_layer_15": 0.0841064453125, "loss_aux_layer_16": 0.0931396484375, "loss_aux_layer_17": 0.100341796875, "loss_aux_layer_18": 0.108154296875, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.041015625, "loss_aux_layer_20": 0.1199951171875, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.0509033203125, "loss_aux_layer_4": 0.05316162109375, "loss_aux_layer_5": 0.05438232421875, "loss_aux_layer_6": 0.0572509765625, "loss_aux_layer_7": 0.05548095703125, "loss_aux_layer_8": 0.054931640625, "loss_aux_layer_9": 0.05377197265625, "step": 5011, "total_loss": 0.5595329999923706 }, { "epoch": 0.9922787566818452, "grad_norm": 1.0572304725646973, "learning_rate": 5e-05, "llm_loss": 0.4998058006167412, "loss": 2.3248, "loss_aux_layer_0": 0.012908935546875, "loss_aux_layer_1": 0.03070068359375, "loss_aux_layer_10": 0.057373046875, "loss_aux_layer_11": 0.061279296875, "loss_aux_layer_12": 0.06610107421875, "loss_aux_layer_13": 0.072021484375, "loss_aux_layer_14": 0.0806884765625, "loss_aux_layer_15": 0.08984375, "loss_aux_layer_16": 0.0989990234375, "loss_aux_layer_17": 0.1065673828125, "loss_aux_layer_18": 0.114501953125, "loss_aux_layer_19": 0.1175537109375, "loss_aux_layer_2": 0.04315185546875, "loss_aux_layer_20": 0.1246337890625, "loss_aux_layer_21": 0.13330078125, "loss_aux_layer_22": 0.155029296875, "loss_aux_layer_23": 0.192138671875, "loss_aux_layer_3": 0.05303955078125, "loss_aux_layer_4": 0.0555419921875, "loss_aux_layer_5": 0.05706787109375, "loss_aux_layer_6": 0.05975341796875, "loss_aux_layer_7": 0.0577392578125, "loss_aux_layer_8": 0.05731201171875, "loss_aux_layer_9": 0.05609130859375, "step": 5012, "total_loss": 0.5812063664197922 }, { "epoch": 0.9924767372797466, "grad_norm": 0.9464245438575745, "learning_rate": 5e-05, "llm_loss": 0.52339106798172, "loss": 2.3953, "loss_aux_layer_0": 0.00994873046875, "loss_aux_layer_1": 0.026641845703125, "loss_aux_layer_10": 0.0518798828125, "loss_aux_layer_11": 0.0555419921875, "loss_aux_layer_12": 0.05987548828125, "loss_aux_layer_13": 0.06549072265625, "loss_aux_layer_14": 0.0740966796875, "loss_aux_layer_15": 0.0831298828125, "loss_aux_layer_16": 0.0927734375, "loss_aux_layer_17": 0.1004638671875, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.0374755859375, "loss_aux_layer_20": 0.1201171875, "loss_aux_layer_21": 0.1279296875, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.046875, "loss_aux_layer_4": 0.04925537109375, "loss_aux_layer_5": 0.05084228515625, "loss_aux_layer_6": 0.05352783203125, "loss_aux_layer_7": 0.05181884765625, "loss_aux_layer_8": 0.051513671875, "loss_aux_layer_9": 0.050537109375, "step": 5013, "total_loss": 0.5988302677869797 }, { "epoch": 0.9926747178776479, "grad_norm": 1.0020453929901123, "learning_rate": 5e-05, "llm_loss": 0.4905119687318802, "loss": 2.265, "loss_aux_layer_0": 0.0117034912109375, "loss_aux_layer_1": 0.02734375, "loss_aux_layer_10": 0.05145263671875, "loss_aux_layer_11": 0.0550537109375, "loss_aux_layer_12": 0.0595703125, "loss_aux_layer_13": 0.065185546875, "loss_aux_layer_14": 0.07373046875, "loss_aux_layer_15": 0.0826416015625, "loss_aux_layer_16": 0.0919189453125, "loss_aux_layer_17": 0.099609375, "loss_aux_layer_18": 0.1077880859375, "loss_aux_layer_19": 0.1123046875, "loss_aux_layer_2": 0.037841796875, "loss_aux_layer_20": 0.120849609375, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.152099609375, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.046875, "loss_aux_layer_4": 0.04888916015625, "loss_aux_layer_5": 0.050048828125, "loss_aux_layer_6": 0.0526123046875, "loss_aux_layer_7": 0.0511474609375, "loss_aux_layer_8": 0.05072021484375, "loss_aux_layer_9": 0.04998779296875, "step": 5014, "total_loss": 0.5662535130977631 }, { "epoch": 0.9928726984755494, "grad_norm": 0.9061494469642639, "learning_rate": 5e-05, "llm_loss": 0.5266278833150864, "loss": 2.413, "loss_aux_layer_0": 0.011749267578125, "loss_aux_layer_1": 0.029083251953125, "loss_aux_layer_10": 0.05401611328125, "loss_aux_layer_11": 0.05767822265625, "loss_aux_layer_12": 0.06219482421875, "loss_aux_layer_13": 0.0673828125, "loss_aux_layer_14": 0.0753173828125, "loss_aux_layer_15": 0.0831298828125, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.099365234375, "loss_aux_layer_18": 0.1075439453125, "loss_aux_layer_19": 0.11083984375, "loss_aux_layer_2": 0.0396728515625, "loss_aux_layer_20": 0.118896484375, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.0489501953125, "loss_aux_layer_4": 0.0511474609375, "loss_aux_layer_5": 0.05242919921875, "loss_aux_layer_6": 0.05523681640625, "loss_aux_layer_7": 0.05401611328125, "loss_aux_layer_8": 0.0538330078125, "loss_aux_layer_9": 0.0528564453125, "step": 5015, "total_loss": 0.6032575219869614 }, { "epoch": 0.9930706790734508, "grad_norm": 0.9335958361625671, "learning_rate": 5e-05, "llm_loss": 0.5998537912964821, "loss": 2.7018, "loss_aux_layer_0": 0.0108489990234375, "loss_aux_layer_1": 0.0274658203125, "loss_aux_layer_10": 0.05352783203125, "loss_aux_layer_11": 0.05682373046875, "loss_aux_layer_12": 0.060791015625, "loss_aux_layer_13": 0.06591796875, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.0826416015625, "loss_aux_layer_16": 0.0914306640625, "loss_aux_layer_17": 0.0987548828125, "loss_aux_layer_18": 0.1068115234375, "loss_aux_layer_19": 0.110595703125, "loss_aux_layer_2": 0.03912353515625, "loss_aux_layer_20": 0.1181640625, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.143798828125, "loss_aux_layer_23": 0.1787109375, "loss_aux_layer_3": 0.04888916015625, "loss_aux_layer_4": 0.051513671875, "loss_aux_layer_5": 0.0528564453125, "loss_aux_layer_6": 0.055908203125, "loss_aux_layer_7": 0.053955078125, "loss_aux_layer_8": 0.0535888671875, "loss_aux_layer_9": 0.05242919921875, "step": 5016, "total_loss": 0.6754396259784698 }, { "epoch": 0.9932686596713522, "grad_norm": 0.8600162267684937, "learning_rate": 5e-05, "llm_loss": 0.576942965388298, "loss": 2.6137, "loss_aux_layer_0": 0.010528564453125, "loss_aux_layer_1": 0.027008056640625, "loss_aux_layer_10": 0.0531005859375, "loss_aux_layer_11": 0.05694580078125, "loss_aux_layer_12": 0.06103515625, "loss_aux_layer_13": 0.06658935546875, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.08349609375, "loss_aux_layer_16": 0.092529296875, "loss_aux_layer_17": 0.10009765625, "loss_aux_layer_18": 0.1082763671875, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.038330078125, "loss_aux_layer_20": 0.1197509765625, "loss_aux_layer_21": 0.128173828125, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.04803466796875, "loss_aux_layer_4": 0.05059814453125, "loss_aux_layer_5": 0.05224609375, "loss_aux_layer_6": 0.05523681640625, "loss_aux_layer_7": 0.053466796875, "loss_aux_layer_8": 0.052978515625, "loss_aux_layer_9": 0.052001953125, "step": 5017, "total_loss": 0.6534285396337509 }, { "epoch": 0.9934666402692536, "grad_norm": 0.9206135272979736, "learning_rate": 5e-05, "llm_loss": 0.5668851658701897, "loss": 2.5692, "loss_aux_layer_0": 0.0113372802734375, "loss_aux_layer_1": 0.026885986328125, "loss_aux_layer_10": 0.05230712890625, "loss_aux_layer_11": 0.0560302734375, "loss_aux_layer_12": 0.060302734375, "loss_aux_layer_13": 0.0660400390625, "loss_aux_layer_14": 0.07421875, "loss_aux_layer_15": 0.082763671875, "loss_aux_layer_16": 0.09228515625, "loss_aux_layer_17": 0.10009765625, "loss_aux_layer_18": 0.1083984375, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.03729248046875, "loss_aux_layer_20": 0.119140625, "loss_aux_layer_21": 0.1268310546875, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.04656982421875, "loss_aux_layer_4": 0.04913330078125, "loss_aux_layer_5": 0.05072021484375, "loss_aux_layer_6": 0.05364990234375, "loss_aux_layer_7": 0.05224609375, "loss_aux_layer_8": 0.05194091796875, "loss_aux_layer_9": 0.05096435546875, "step": 5018, "total_loss": 0.6423116624355316 }, { "epoch": 0.993664620867155, "grad_norm": 0.9245895743370056, "learning_rate": 5e-05, "llm_loss": 0.5394540056586266, "loss": 2.4596, "loss_aux_layer_0": 0.0110321044921875, "loss_aux_layer_1": 0.02679443359375, "loss_aux_layer_10": 0.0526123046875, "loss_aux_layer_11": 0.0562744140625, "loss_aux_layer_12": 0.06024169921875, "loss_aux_layer_13": 0.06549072265625, "loss_aux_layer_14": 0.073486328125, "loss_aux_layer_15": 0.0816650390625, "loss_aux_layer_16": 0.091064453125, "loss_aux_layer_17": 0.0986328125, "loss_aux_layer_18": 0.107177734375, "loss_aux_layer_19": 0.1114501953125, "loss_aux_layer_2": 0.0379638671875, "loss_aux_layer_20": 0.119140625, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.04754638671875, "loss_aux_layer_4": 0.05029296875, "loss_aux_layer_5": 0.05169677734375, "loss_aux_layer_6": 0.05450439453125, "loss_aux_layer_7": 0.05279541015625, "loss_aux_layer_8": 0.05242919921875, "loss_aux_layer_9": 0.05145263671875, "step": 5019, "total_loss": 0.6148914694786072 }, { "epoch": 0.9938626014650564, "grad_norm": 0.7312023043632507, "learning_rate": 5e-05, "llm_loss": 0.5391539633274078, "loss": 2.4561, "loss_aux_layer_0": 0.01092529296875, "loss_aux_layer_1": 0.02850341796875, "loss_aux_layer_10": 0.052734375, "loss_aux_layer_11": 0.056396484375, "loss_aux_layer_12": 0.06060791015625, "loss_aux_layer_13": 0.065673828125, "loss_aux_layer_14": 0.0733642578125, "loss_aux_layer_15": 0.0810546875, "loss_aux_layer_16": 0.0899658203125, "loss_aux_layer_17": 0.097412109375, "loss_aux_layer_18": 0.105712890625, "loss_aux_layer_19": 0.1083984375, "loss_aux_layer_2": 0.03936767578125, "loss_aux_layer_20": 0.1156005859375, "loss_aux_layer_21": 0.123779296875, "loss_aux_layer_22": 0.143310546875, "loss_aux_layer_23": 0.178955078125, "loss_aux_layer_3": 0.04864501953125, "loss_aux_layer_4": 0.05108642578125, "loss_aux_layer_5": 0.05255126953125, "loss_aux_layer_6": 0.0550537109375, "loss_aux_layer_7": 0.05328369140625, "loss_aux_layer_8": 0.052734375, "loss_aux_layer_9": 0.0516357421875, "step": 5020, "total_loss": 0.6140188276767731 }, { "epoch": 0.9940605820629578, "grad_norm": 0.8236938714981079, "learning_rate": 5e-05, "llm_loss": 0.5800885334610939, "loss": 2.6216, "loss_aux_layer_0": 0.0107574462890625, "loss_aux_layer_1": 0.027679443359375, "loss_aux_layer_10": 0.05322265625, "loss_aux_layer_11": 0.05670166015625, "loss_aux_layer_12": 0.0609130859375, "loss_aux_layer_13": 0.0657958984375, "loss_aux_layer_14": 0.07373046875, "loss_aux_layer_15": 0.0819091796875, "loss_aux_layer_16": 0.0908203125, "loss_aux_layer_17": 0.09814453125, "loss_aux_layer_18": 0.1060791015625, "loss_aux_layer_19": 0.1094970703125, "loss_aux_layer_2": 0.03900146484375, "loss_aux_layer_20": 0.1171875, "loss_aux_layer_21": 0.12548828125, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.180908203125, "loss_aux_layer_3": 0.04840087890625, "loss_aux_layer_4": 0.0506591796875, "loss_aux_layer_5": 0.0521240234375, "loss_aux_layer_6": 0.0548095703125, "loss_aux_layer_7": 0.0533447265625, "loss_aux_layer_8": 0.052978515625, "loss_aux_layer_9": 0.05206298828125, "step": 5021, "total_loss": 0.6554037481546402 }, { "epoch": 0.9942585626608592, "grad_norm": 0.9239447712898254, "learning_rate": 5e-05, "llm_loss": 0.6432070285081863, "loss": 2.8835, "loss_aux_layer_0": 0.0111083984375, "loss_aux_layer_1": 0.02874755859375, "loss_aux_layer_10": 0.05474853515625, "loss_aux_layer_11": 0.0587158203125, "loss_aux_layer_12": 0.0631103515625, "loss_aux_layer_13": 0.0687255859375, "loss_aux_layer_14": 0.076904296875, "loss_aux_layer_15": 0.0853271484375, "loss_aux_layer_16": 0.0943603515625, "loss_aux_layer_17": 0.1015625, "loss_aux_layer_18": 0.1097412109375, "loss_aux_layer_19": 0.1131591796875, "loss_aux_layer_2": 0.04010009765625, "loss_aux_layer_20": 0.1207275390625, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.18603515625, "loss_aux_layer_3": 0.0498046875, "loss_aux_layer_4": 0.05194091796875, "loss_aux_layer_5": 0.05340576171875, "loss_aux_layer_6": 0.05615234375, "loss_aux_layer_7": 0.05462646484375, "loss_aux_layer_8": 0.05413818359375, "loss_aux_layer_9": 0.05316162109375, "step": 5022, "total_loss": 0.7208776921033859 }, { "epoch": 0.9944565432587607, "grad_norm": 0.8075198531150818, "learning_rate": 5e-05, "llm_loss": 0.580001175403595, "loss": 2.6422, "loss_aux_layer_0": 0.01068115234375, "loss_aux_layer_1": 0.030609130859375, "loss_aux_layer_10": 0.05755615234375, "loss_aux_layer_11": 0.06158447265625, "loss_aux_layer_12": 0.0657958984375, "loss_aux_layer_13": 0.0712890625, "loss_aux_layer_14": 0.0797119140625, "loss_aux_layer_15": 0.0882568359375, "loss_aux_layer_16": 0.0980224609375, "loss_aux_layer_17": 0.105224609375, "loss_aux_layer_18": 0.11279296875, "loss_aux_layer_19": 0.1156005859375, "loss_aux_layer_2": 0.04266357421875, "loss_aux_layer_20": 0.123291015625, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.151611328125, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.0531005859375, "loss_aux_layer_4": 0.055908203125, "loss_aux_layer_5": 0.05743408203125, "loss_aux_layer_6": 0.060546875, "loss_aux_layer_7": 0.0584716796875, "loss_aux_layer_8": 0.0576171875, "loss_aux_layer_9": 0.05633544921875, "step": 5023, "total_loss": 0.6605400741100311 }, { "epoch": 0.9946545238566621, "grad_norm": 0.986977756023407, "learning_rate": 5e-05, "llm_loss": 0.5986549332737923, "loss": 2.7039, "loss_aux_layer_0": 0.0100860595703125, "loss_aux_layer_1": 0.0281982421875, "loss_aux_layer_10": 0.05413818359375, "loss_aux_layer_11": 0.05810546875, "loss_aux_layer_12": 0.0625, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.07666015625, "loss_aux_layer_15": 0.0853271484375, "loss_aux_layer_16": 0.0948486328125, "loss_aux_layer_17": 0.1026611328125, "loss_aux_layer_18": 0.110595703125, "loss_aux_layer_19": 0.1136474609375, "loss_aux_layer_2": 0.0396728515625, "loss_aux_layer_20": 0.1209716796875, "loss_aux_layer_21": 0.1287841796875, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.04901123046875, "loss_aux_layer_4": 0.0513916015625, "loss_aux_layer_5": 0.0528564453125, "loss_aux_layer_6": 0.0556640625, "loss_aux_layer_7": 0.0540771484375, "loss_aux_layer_8": 0.0537109375, "loss_aux_layer_9": 0.05279541015625, "step": 5024, "total_loss": 0.6759787797927856 }, { "epoch": 0.9948525044545634, "grad_norm": 0.9474225044250488, "learning_rate": 5e-05, "llm_loss": 0.6323220133781433, "loss": 2.8217, "loss_aux_layer_0": 0.011505126953125, "loss_aux_layer_1": 0.02789306640625, "loss_aux_layer_10": 0.05145263671875, "loss_aux_layer_11": 0.05487060546875, "loss_aux_layer_12": 0.058837890625, "loss_aux_layer_13": 0.06329345703125, "loss_aux_layer_14": 0.070556640625, "loss_aux_layer_15": 0.0780029296875, "loss_aux_layer_16": 0.086669921875, "loss_aux_layer_17": 0.094482421875, "loss_aux_layer_18": 0.1019287109375, "loss_aux_layer_19": 0.105712890625, "loss_aux_layer_2": 0.03887939453125, "loss_aux_layer_20": 0.11328125, "loss_aux_layer_21": 0.1212158203125, "loss_aux_layer_22": 0.140625, "loss_aux_layer_23": 0.175537109375, "loss_aux_layer_3": 0.04840087890625, "loss_aux_layer_4": 0.0504150390625, "loss_aux_layer_5": 0.0516357421875, "loss_aux_layer_6": 0.05419921875, "loss_aux_layer_7": 0.05255126953125, "loss_aux_layer_8": 0.05181884765625, "loss_aux_layer_9": 0.050537109375, "step": 5025, "total_loss": 0.7054300457239151 }, { "epoch": 0.9950504850524649, "grad_norm": 1.6777592897415161, "learning_rate": 5e-05, "llm_loss": 0.5482481941580772, "loss": 2.4999, "loss_aux_layer_0": 0.0102996826171875, "loss_aux_layer_1": 0.02783203125, "loss_aux_layer_10": 0.0538330078125, "loss_aux_layer_11": 0.0574951171875, "loss_aux_layer_12": 0.0616455078125, "loss_aux_layer_13": 0.0670166015625, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.0836181640625, "loss_aux_layer_16": 0.0928955078125, "loss_aux_layer_17": 0.1004638671875, "loss_aux_layer_18": 0.1083984375, "loss_aux_layer_19": 0.1119384765625, "loss_aux_layer_2": 0.0399169921875, "loss_aux_layer_20": 0.1199951171875, "loss_aux_layer_21": 0.1280517578125, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.04925537109375, "loss_aux_layer_4": 0.0513916015625, "loss_aux_layer_5": 0.0528564453125, "loss_aux_layer_6": 0.05548095703125, "loss_aux_layer_7": 0.05389404296875, "loss_aux_layer_8": 0.0533447265625, "loss_aux_layer_9": 0.0526123046875, "step": 5026, "total_loss": 0.6249869465827942 }, { "epoch": 0.9952484656503663, "grad_norm": 1.0138436555862427, "learning_rate": 5e-05, "llm_loss": 0.5841974318027496, "loss": 2.6408, "loss_aux_layer_0": 0.0116424560546875, "loss_aux_layer_1": 0.0277099609375, "loss_aux_layer_10": 0.0528564453125, "loss_aux_layer_11": 0.0565185546875, "loss_aux_layer_12": 0.0606689453125, "loss_aux_layer_13": 0.06591796875, "loss_aux_layer_14": 0.07421875, "loss_aux_layer_15": 0.0826416015625, "loss_aux_layer_16": 0.091796875, "loss_aux_layer_17": 0.099853515625, "loss_aux_layer_18": 0.10791015625, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.03857421875, "loss_aux_layer_20": 0.1201171875, "loss_aux_layer_21": 0.1280517578125, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.18310546875, "loss_aux_layer_3": 0.0478515625, "loss_aux_layer_4": 0.05029296875, "loss_aux_layer_5": 0.0517578125, "loss_aux_layer_6": 0.0546875, "loss_aux_layer_7": 0.0531005859375, "loss_aux_layer_8": 0.052734375, "loss_aux_layer_9": 0.0518798828125, "step": 5027, "total_loss": 0.6602039262652397 }, { "epoch": 0.9954464462482677, "grad_norm": 1.1237423419952393, "learning_rate": 5e-05, "llm_loss": 0.5734442919492722, "loss": 2.6078, "loss_aux_layer_0": 0.0107269287109375, "loss_aux_layer_1": 0.029083251953125, "loss_aux_layer_10": 0.0565185546875, "loss_aux_layer_11": 0.060546875, "loss_aux_layer_12": 0.0648193359375, "loss_aux_layer_13": 0.070068359375, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.0855712890625, "loss_aux_layer_16": 0.094482421875, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.10888671875, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.04168701171875, "loss_aux_layer_20": 0.1192626953125, "loss_aux_layer_21": 0.127685546875, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.0516357421875, "loss_aux_layer_4": 0.05419921875, "loss_aux_layer_5": 0.05560302734375, "loss_aux_layer_6": 0.05859375, "loss_aux_layer_7": 0.0570068359375, "loss_aux_layer_8": 0.0565185546875, "loss_aux_layer_9": 0.0555419921875, "step": 5028, "total_loss": 0.6519475281238556 }, { "epoch": 0.995644426846169, "grad_norm": 1.0580930709838867, "learning_rate": 5e-05, "llm_loss": 0.5582548975944519, "loss": 2.5489, "loss_aux_layer_0": 0.0114288330078125, "loss_aux_layer_1": 0.028900146484375, "loss_aux_layer_10": 0.05596923828125, "loss_aux_layer_11": 0.05987548828125, "loss_aux_layer_12": 0.06427001953125, "loss_aux_layer_13": 0.06982421875, "loss_aux_layer_14": 0.078125, "loss_aux_layer_15": 0.0860595703125, "loss_aux_layer_16": 0.0955810546875, "loss_aux_layer_17": 0.1025390625, "loss_aux_layer_18": 0.111083984375, "loss_aux_layer_19": 0.1142578125, "loss_aux_layer_2": 0.0416259765625, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.13037109375, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.051025390625, "loss_aux_layer_4": 0.05377197265625, "loss_aux_layer_5": 0.05548095703125, "loss_aux_layer_6": 0.058349609375, "loss_aux_layer_7": 0.056396484375, "loss_aux_layer_8": 0.05609130859375, "loss_aux_layer_9": 0.054931640625, "step": 5029, "total_loss": 0.6372347772121429 }, { "epoch": 0.9958424074440705, "grad_norm": 0.9721993803977966, "learning_rate": 5e-05, "llm_loss": 0.6339640021324158, "loss": 2.8406, "loss_aux_layer_0": 0.0117340087890625, "loss_aux_layer_1": 0.029052734375, "loss_aux_layer_10": 0.05419921875, "loss_aux_layer_11": 0.05792236328125, "loss_aux_layer_12": 0.0623779296875, "loss_aux_layer_13": 0.0675048828125, "loss_aux_layer_14": 0.0753173828125, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.09228515625, "loss_aux_layer_17": 0.0999755859375, "loss_aux_layer_18": 0.107177734375, "loss_aux_layer_19": 0.1097412109375, "loss_aux_layer_2": 0.040283203125, "loss_aux_layer_20": 0.116943359375, "loss_aux_layer_21": 0.1240234375, "loss_aux_layer_22": 0.14453125, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.0494384765625, "loss_aux_layer_4": 0.0518798828125, "loss_aux_layer_5": 0.0533447265625, "loss_aux_layer_6": 0.0562744140625, "loss_aux_layer_7": 0.0545654296875, "loss_aux_layer_8": 0.05389404296875, "loss_aux_layer_9": 0.0528564453125, "step": 5030, "total_loss": 0.710146427154541 }, { "epoch": 0.9960403880419719, "grad_norm": 0.890805184841156, "learning_rate": 5e-05, "llm_loss": 0.5235768407583237, "loss": 2.4058, "loss_aux_layer_0": 0.0104827880859375, "loss_aux_layer_1": 0.02862548828125, "loss_aux_layer_10": 0.0543212890625, "loss_aux_layer_11": 0.05810546875, "loss_aux_layer_12": 0.06231689453125, "loss_aux_layer_13": 0.067626953125, "loss_aux_layer_14": 0.0760498046875, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.093994140625, "loss_aux_layer_17": 0.1011962890625, "loss_aux_layer_18": 0.109130859375, "loss_aux_layer_19": 0.1131591796875, "loss_aux_layer_2": 0.039794921875, "loss_aux_layer_20": 0.12158203125, "loss_aux_layer_21": 0.131591796875, "loss_aux_layer_22": 0.154052734375, "loss_aux_layer_23": 0.192626953125, "loss_aux_layer_3": 0.04931640625, "loss_aux_layer_4": 0.05133056640625, "loss_aux_layer_5": 0.05322265625, "loss_aux_layer_6": 0.05584716796875, "loss_aux_layer_7": 0.0540771484375, "loss_aux_layer_8": 0.05352783203125, "loss_aux_layer_9": 0.05267333984375, "step": 5031, "total_loss": 0.6014495342969894 }, { "epoch": 0.9962383686398733, "grad_norm": 1.0095078945159912, "learning_rate": 5e-05, "llm_loss": 0.5401569902896881, "loss": 2.4717, "loss_aux_layer_0": 0.010101318359375, "loss_aux_layer_1": 0.02960205078125, "loss_aux_layer_10": 0.055419921875, "loss_aux_layer_11": 0.05938720703125, "loss_aux_layer_12": 0.0635986328125, "loss_aux_layer_13": 0.06884765625, "loss_aux_layer_14": 0.0772705078125, "loss_aux_layer_15": 0.0858154296875, "loss_aux_layer_16": 0.0946044921875, "loss_aux_layer_17": 0.1021728515625, "loss_aux_layer_18": 0.109619140625, "loss_aux_layer_19": 0.1123046875, "loss_aux_layer_2": 0.0413818359375, "loss_aux_layer_20": 0.1192626953125, "loss_aux_layer_21": 0.126953125, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.182373046875, "loss_aux_layer_3": 0.05108642578125, "loss_aux_layer_4": 0.05340576171875, "loss_aux_layer_5": 0.0548095703125, "loss_aux_layer_6": 0.0574951171875, "loss_aux_layer_7": 0.0555419921875, "loss_aux_layer_8": 0.0546875, "loss_aux_layer_9": 0.05377197265625, "step": 5032, "total_loss": 0.6179309040307999 }, { "epoch": 0.9964363492377747, "grad_norm": 0.8315026760101318, "learning_rate": 5e-05, "llm_loss": 0.6010035946965218, "loss": 2.7052, "loss_aux_layer_0": 0.011444091796875, "loss_aux_layer_1": 0.027587890625, "loss_aux_layer_10": 0.05322265625, "loss_aux_layer_11": 0.05682373046875, "loss_aux_layer_12": 0.06103515625, "loss_aux_layer_13": 0.06610107421875, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.08203125, "loss_aux_layer_16": 0.0906982421875, "loss_aux_layer_17": 0.0985107421875, "loss_aux_layer_18": 0.1063232421875, "loss_aux_layer_19": 0.109619140625, "loss_aux_layer_2": 0.0389404296875, "loss_aux_layer_20": 0.11767578125, "loss_aux_layer_21": 0.124755859375, "loss_aux_layer_22": 0.14453125, "loss_aux_layer_23": 0.17919921875, "loss_aux_layer_3": 0.04791259765625, "loss_aux_layer_4": 0.05059814453125, "loss_aux_layer_5": 0.05206298828125, "loss_aux_layer_6": 0.054931640625, "loss_aux_layer_7": 0.05328369140625, "loss_aux_layer_8": 0.0531005859375, "loss_aux_layer_9": 0.052001953125, "step": 5033, "total_loss": 0.6762905418872833 }, { "epoch": 0.9966343298356761, "grad_norm": 0.8980143666267395, "learning_rate": 5e-05, "llm_loss": 0.48742296546697617, "loss": 2.2509, "loss_aux_layer_0": 0.010650634765625, "loss_aux_layer_1": 0.026275634765625, "loss_aux_layer_10": 0.05108642578125, "loss_aux_layer_11": 0.05474853515625, "loss_aux_layer_12": 0.0589599609375, "loss_aux_layer_13": 0.064208984375, "loss_aux_layer_14": 0.07275390625, "loss_aux_layer_15": 0.0810546875, "loss_aux_layer_16": 0.091064453125, "loss_aux_layer_17": 0.0986328125, "loss_aux_layer_18": 0.1077880859375, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.03692626953125, "loss_aux_layer_20": 0.121337890625, "loss_aux_layer_21": 0.130615234375, "loss_aux_layer_22": 0.15185546875, "loss_aux_layer_23": 0.18994140625, "loss_aux_layer_3": 0.0458984375, "loss_aux_layer_4": 0.048095703125, "loss_aux_layer_5": 0.0496826171875, "loss_aux_layer_6": 0.05230712890625, "loss_aux_layer_7": 0.0506591796875, "loss_aux_layer_8": 0.05023193359375, "loss_aux_layer_9": 0.04986572265625, "step": 5034, "total_loss": 0.5627324879169464 }, { "epoch": 0.9968323104335776, "grad_norm": 0.9974374175071716, "learning_rate": 5e-05, "llm_loss": 0.6129370927810669, "loss": 2.7664, "loss_aux_layer_0": 0.0106964111328125, "loss_aux_layer_1": 0.029815673828125, "loss_aux_layer_10": 0.05584716796875, "loss_aux_layer_11": 0.059814453125, "loss_aux_layer_12": 0.06414794921875, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.0782470703125, "loss_aux_layer_15": 0.086181640625, "loss_aux_layer_16": 0.0950927734375, "loss_aux_layer_17": 0.102783203125, "loss_aux_layer_18": 0.1103515625, "loss_aux_layer_19": 0.1129150390625, "loss_aux_layer_2": 0.04180908203125, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.129150390625, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.18701171875, "loss_aux_layer_3": 0.05126953125, "loss_aux_layer_4": 0.0537109375, "loss_aux_layer_5": 0.05499267578125, "loss_aux_layer_6": 0.0577392578125, "loss_aux_layer_7": 0.05615234375, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.0543212890625, "step": 5035, "total_loss": 0.6916036158800125 }, { "epoch": 0.9970302910314789, "grad_norm": 0.8216988444328308, "learning_rate": 5e-05, "llm_loss": 0.5632255226373672, "loss": 2.5553, "loss_aux_layer_0": 0.0100250244140625, "loss_aux_layer_1": 0.027679443359375, "loss_aux_layer_10": 0.0526123046875, "loss_aux_layer_11": 0.05645751953125, "loss_aux_layer_12": 0.0606689453125, "loss_aux_layer_13": 0.066162109375, "loss_aux_layer_14": 0.0743408203125, "loss_aux_layer_15": 0.082275390625, "loss_aux_layer_16": 0.09130859375, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.1068115234375, "loss_aux_layer_19": 0.1102294921875, "loss_aux_layer_2": 0.03857421875, "loss_aux_layer_20": 0.1177978515625, "loss_aux_layer_21": 0.126220703125, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.048095703125, "loss_aux_layer_4": 0.050537109375, "loss_aux_layer_5": 0.05181884765625, "loss_aux_layer_6": 0.0546875, "loss_aux_layer_7": 0.05279541015625, "loss_aux_layer_8": 0.0523681640625, "loss_aux_layer_9": 0.05133056640625, "step": 5036, "total_loss": 0.6388228982686996 }, { "epoch": 0.9972282716293803, "grad_norm": 0.966666579246521, "learning_rate": 5e-05, "llm_loss": 0.5906596854329109, "loss": 2.6756, "loss_aux_layer_0": 0.01055908203125, "loss_aux_layer_1": 0.0296630859375, "loss_aux_layer_10": 0.05517578125, "loss_aux_layer_11": 0.059326171875, "loss_aux_layer_12": 0.0634765625, "loss_aux_layer_13": 0.0684814453125, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.084716796875, "loss_aux_layer_16": 0.09375, "loss_aux_layer_17": 0.10107421875, "loss_aux_layer_18": 0.109375, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.04180908203125, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.187255859375, "loss_aux_layer_3": 0.05145263671875, "loss_aux_layer_4": 0.05389404296875, "loss_aux_layer_5": 0.05523681640625, "loss_aux_layer_6": 0.057861328125, "loss_aux_layer_7": 0.0560302734375, "loss_aux_layer_8": 0.055419921875, "loss_aux_layer_9": 0.05401611328125, "step": 5037, "total_loss": 0.6688898205757141 }, { "epoch": 0.9974262522272818, "grad_norm": 0.8747206926345825, "learning_rate": 5e-05, "llm_loss": 0.557178184390068, "loss": 2.5401, "loss_aux_layer_0": 0.0112457275390625, "loss_aux_layer_1": 0.02947998046875, "loss_aux_layer_10": 0.0543212890625, "loss_aux_layer_11": 0.05816650390625, "loss_aux_layer_12": 0.06256103515625, "loss_aux_layer_13": 0.0679931640625, "loss_aux_layer_14": 0.076416015625, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.1009521484375, "loss_aux_layer_18": 0.1087646484375, "loss_aux_layer_19": 0.112548828125, "loss_aux_layer_2": 0.04150390625, "loss_aux_layer_20": 0.1199951171875, "loss_aux_layer_21": 0.12939453125, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.188720703125, "loss_aux_layer_3": 0.05072021484375, "loss_aux_layer_4": 0.0528564453125, "loss_aux_layer_5": 0.05419921875, "loss_aux_layer_6": 0.0567626953125, "loss_aux_layer_7": 0.0550537109375, "loss_aux_layer_8": 0.0545654296875, "loss_aux_layer_9": 0.05328369140625, "step": 5038, "total_loss": 0.6350280493497849 }, { "epoch": 0.9976242328251831, "grad_norm": 0.814079225063324, "learning_rate": 5e-05, "llm_loss": 0.5143667310476303, "loss": 2.3712, "loss_aux_layer_0": 0.0113677978515625, "loss_aux_layer_1": 0.030059814453125, "loss_aux_layer_10": 0.0555419921875, "loss_aux_layer_11": 0.05950927734375, "loss_aux_layer_12": 0.06365966796875, "loss_aux_layer_13": 0.0687255859375, "loss_aux_layer_14": 0.076416015625, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.0936279296875, "loss_aux_layer_17": 0.1009521484375, "loss_aux_layer_18": 0.108642578125, "loss_aux_layer_19": 0.1121826171875, "loss_aux_layer_2": 0.042236328125, "loss_aux_layer_20": 0.1199951171875, "loss_aux_layer_21": 0.1290283203125, "loss_aux_layer_22": 0.151123046875, "loss_aux_layer_23": 0.18798828125, "loss_aux_layer_3": 0.05194091796875, "loss_aux_layer_4": 0.05438232421875, "loss_aux_layer_5": 0.05560302734375, "loss_aux_layer_6": 0.05841064453125, "loss_aux_layer_7": 0.05645751953125, "loss_aux_layer_8": 0.05584716796875, "loss_aux_layer_9": 0.05450439453125, "step": 5039, "total_loss": 0.5928092300891876 }, { "epoch": 0.9978222134230845, "grad_norm": 0.7864919900894165, "learning_rate": 5e-05, "llm_loss": 0.5230788439512253, "loss": 2.4055, "loss_aux_layer_0": 0.0105438232421875, "loss_aux_layer_1": 0.029205322265625, "loss_aux_layer_10": 0.0557861328125, "loss_aux_layer_11": 0.059814453125, "loss_aux_layer_12": 0.0640869140625, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.0772705078125, "loss_aux_layer_15": 0.085693359375, "loss_aux_layer_16": 0.0946044921875, "loss_aux_layer_17": 0.10205078125, "loss_aux_layer_18": 0.1097412109375, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.041259765625, "loss_aux_layer_20": 0.1202392578125, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.05120849609375, "loss_aux_layer_4": 0.05389404296875, "loss_aux_layer_5": 0.0550537109375, "loss_aux_layer_6": 0.0579833984375, "loss_aux_layer_7": 0.05609130859375, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.05438232421875, "step": 5040, "total_loss": 0.6013824641704559 }, { "epoch": 0.998020194020986, "grad_norm": 0.7881852984428406, "learning_rate": 5e-05, "llm_loss": 0.5740074887871742, "loss": 2.6072, "loss_aux_layer_0": 0.01177978515625, "loss_aux_layer_1": 0.029052734375, "loss_aux_layer_10": 0.05511474609375, "loss_aux_layer_11": 0.05877685546875, "loss_aux_layer_12": 0.06304931640625, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.076416015625, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.1009521484375, "loss_aux_layer_18": 0.1085205078125, "loss_aux_layer_19": 0.1121826171875, "loss_aux_layer_2": 0.0406494140625, "loss_aux_layer_20": 0.1202392578125, "loss_aux_layer_21": 0.1287841796875, "loss_aux_layer_22": 0.150390625, "loss_aux_layer_23": 0.18603515625, "loss_aux_layer_3": 0.05023193359375, "loss_aux_layer_4": 0.0528564453125, "loss_aux_layer_5": 0.05438232421875, "loss_aux_layer_6": 0.05712890625, "loss_aux_layer_7": 0.0555419921875, "loss_aux_layer_8": 0.05487060546875, "loss_aux_layer_9": 0.05377197265625, "step": 5041, "total_loss": 0.6517966240644455 }, { "epoch": 0.9982181746188874, "grad_norm": 0.8138405084609985, "learning_rate": 5e-05, "llm_loss": 0.5218599885702133, "loss": 2.3882, "loss_aux_layer_0": 0.0103912353515625, "loss_aux_layer_1": 0.028076171875, "loss_aux_layer_10": 0.0531005859375, "loss_aux_layer_11": 0.05694580078125, "loss_aux_layer_12": 0.06103515625, "loss_aux_layer_13": 0.06591796875, "loss_aux_layer_14": 0.07373046875, "loss_aux_layer_15": 0.08154296875, "loss_aux_layer_16": 0.090576171875, "loss_aux_layer_17": 0.0980224609375, "loss_aux_layer_18": 0.10546875, "loss_aux_layer_19": 0.1087646484375, "loss_aux_layer_2": 0.03924560546875, "loss_aux_layer_20": 0.1163330078125, "loss_aux_layer_21": 0.1241455078125, "loss_aux_layer_22": 0.14404296875, "loss_aux_layer_23": 0.1787109375, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.05181884765625, "loss_aux_layer_5": 0.05328369140625, "loss_aux_layer_6": 0.05596923828125, "loss_aux_layer_7": 0.053955078125, "loss_aux_layer_8": 0.05303955078125, "loss_aux_layer_9": 0.05181884765625, "step": 5042, "total_loss": 0.5970532670617104 }, { "epoch": 0.9984161552167887, "grad_norm": 0.7378658056259155, "learning_rate": 5e-05, "llm_loss": 0.4960858151316643, "loss": 2.2782, "loss_aux_layer_0": 0.010223388671875, "loss_aux_layer_1": 0.0272216796875, "loss_aux_layer_10": 0.05120849609375, "loss_aux_layer_11": 0.054931640625, "loss_aux_layer_12": 0.05877685546875, "loss_aux_layer_13": 0.0635986328125, "loss_aux_layer_14": 0.0712890625, "loss_aux_layer_15": 0.079345703125, "loss_aux_layer_16": 0.088134765625, "loss_aux_layer_17": 0.095947265625, "loss_aux_layer_18": 0.1038818359375, "loss_aux_layer_19": 0.107177734375, "loss_aux_layer_2": 0.0379638671875, "loss_aux_layer_20": 0.1148681640625, "loss_aux_layer_21": 0.123046875, "loss_aux_layer_22": 0.142822265625, "loss_aux_layer_23": 0.177734375, "loss_aux_layer_3": 0.04718017578125, "loss_aux_layer_4": 0.0496826171875, "loss_aux_layer_5": 0.05108642578125, "loss_aux_layer_6": 0.05377197265625, "loss_aux_layer_7": 0.05194091796875, "loss_aux_layer_8": 0.0513916015625, "loss_aux_layer_9": 0.0501708984375, "step": 5043, "total_loss": 0.5695485472679138 }, { "epoch": 0.9986141358146902, "grad_norm": 0.9170035719871521, "learning_rate": 5e-05, "llm_loss": 0.5101140812039375, "loss": 2.3369, "loss_aux_layer_0": 0.0109100341796875, "loss_aux_layer_1": 0.027801513671875, "loss_aux_layer_10": 0.05126953125, "loss_aux_layer_11": 0.05499267578125, "loss_aux_layer_12": 0.058837890625, "loss_aux_layer_13": 0.06378173828125, "loss_aux_layer_14": 0.071533203125, "loss_aux_layer_15": 0.07958984375, "loss_aux_layer_16": 0.0885009765625, "loss_aux_layer_17": 0.0958251953125, "loss_aux_layer_18": 0.103759765625, "loss_aux_layer_19": 0.107666015625, "loss_aux_layer_2": 0.03900146484375, "loss_aux_layer_20": 0.11572265625, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.181396484375, "loss_aux_layer_3": 0.0479736328125, "loss_aux_layer_4": 0.05035400390625, "loss_aux_layer_5": 0.05157470703125, "loss_aux_layer_6": 0.05413818359375, "loss_aux_layer_7": 0.05218505859375, "loss_aux_layer_8": 0.05126953125, "loss_aux_layer_9": 0.0501708984375, "step": 5044, "total_loss": 0.584233969449997 }, { "epoch": 0.9988121164125916, "grad_norm": 0.7516260743141174, "learning_rate": 5e-05, "llm_loss": 0.4994259849190712, "loss": 2.2976, "loss_aux_layer_0": 0.01055908203125, "loss_aux_layer_1": 0.028106689453125, "loss_aux_layer_10": 0.0533447265625, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.06146240234375, "loss_aux_layer_13": 0.06634521484375, "loss_aux_layer_14": 0.0736083984375, "loss_aux_layer_15": 0.0811767578125, "loss_aux_layer_16": 0.0894775390625, "loss_aux_layer_17": 0.0965576171875, "loss_aux_layer_18": 0.10400390625, "loss_aux_layer_19": 0.1075439453125, "loss_aux_layer_2": 0.039794921875, "loss_aux_layer_20": 0.115234375, "loss_aux_layer_21": 0.1231689453125, "loss_aux_layer_22": 0.1435546875, "loss_aux_layer_23": 0.178955078125, "loss_aux_layer_3": 0.04937744140625, "loss_aux_layer_4": 0.05181884765625, "loss_aux_layer_5": 0.052978515625, "loss_aux_layer_6": 0.05572509765625, "loss_aux_layer_7": 0.05413818359375, "loss_aux_layer_8": 0.0535888671875, "loss_aux_layer_9": 0.0523681640625, "step": 5045, "total_loss": 0.5744003057479858 }, { "epoch": 0.9990100970104929, "grad_norm": 0.8946696519851685, "learning_rate": 5e-05, "llm_loss": 0.4947690963745117, "loss": 2.2994, "loss_aux_layer_0": 0.0108642578125, "loss_aux_layer_1": 0.029052734375, "loss_aux_layer_10": 0.0565185546875, "loss_aux_layer_11": 0.06048583984375, "loss_aux_layer_12": 0.0650634765625, "loss_aux_layer_13": 0.0706787109375, "loss_aux_layer_14": 0.07958984375, "loss_aux_layer_15": 0.0885009765625, "loss_aux_layer_16": 0.098388671875, "loss_aux_layer_17": 0.10595703125, "loss_aux_layer_18": 0.1138916015625, "loss_aux_layer_19": 0.1168212890625, "loss_aux_layer_2": 0.0413818359375, "loss_aux_layer_20": 0.12451171875, "loss_aux_layer_21": 0.1324462890625, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.18896484375, "loss_aux_layer_3": 0.05133056640625, "loss_aux_layer_4": 0.0538330078125, "loss_aux_layer_5": 0.05517578125, "loss_aux_layer_6": 0.05804443359375, "loss_aux_layer_7": 0.05645751953125, "loss_aux_layer_8": 0.05596923828125, "loss_aux_layer_9": 0.054931640625, "step": 5046, "total_loss": 0.5748519599437714 }, { "epoch": 0.9992080776083944, "grad_norm": 0.7827939987182617, "learning_rate": 5e-05, "llm_loss": 0.5792372599244118, "loss": 2.6298, "loss_aux_layer_0": 0.0106964111328125, "loss_aux_layer_1": 0.028961181640625, "loss_aux_layer_10": 0.055908203125, "loss_aux_layer_11": 0.059814453125, "loss_aux_layer_12": 0.064208984375, "loss_aux_layer_13": 0.069580078125, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.0863037109375, "loss_aux_layer_16": 0.0953369140625, "loss_aux_layer_17": 0.1031494140625, "loss_aux_layer_18": 0.1109619140625, "loss_aux_layer_19": 0.1134033203125, "loss_aux_layer_2": 0.04095458984375, "loss_aux_layer_20": 0.1202392578125, "loss_aux_layer_21": 0.127197265625, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.18212890625, "loss_aux_layer_3": 0.05084228515625, "loss_aux_layer_4": 0.0535888671875, "loss_aux_layer_5": 0.054931640625, "loss_aux_layer_6": 0.0579833984375, "loss_aux_layer_7": 0.05615234375, "loss_aux_layer_8": 0.0556640625, "loss_aux_layer_9": 0.0545654296875, "step": 5047, "total_loss": 0.6574526131153107 }, { "epoch": 0.9994060582062958, "grad_norm": 0.917214572429657, "learning_rate": 5e-05, "llm_loss": 0.5317486897110939, "loss": 2.4304, "loss_aux_layer_0": 0.0106964111328125, "loss_aux_layer_1": 0.027923583984375, "loss_aux_layer_10": 0.05364990234375, "loss_aux_layer_11": 0.05694580078125, "loss_aux_layer_12": 0.06103515625, "loss_aux_layer_13": 0.066162109375, "loss_aux_layer_14": 0.0738525390625, "loss_aux_layer_15": 0.08203125, "loss_aux_layer_16": 0.091064453125, "loss_aux_layer_17": 0.098876953125, "loss_aux_layer_18": 0.10693359375, "loss_aux_layer_19": 0.1104736328125, "loss_aux_layer_2": 0.03948974609375, "loss_aux_layer_20": 0.118408203125, "loss_aux_layer_21": 0.1259765625, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.182373046875, "loss_aux_layer_3": 0.04876708984375, "loss_aux_layer_4": 0.0513916015625, "loss_aux_layer_5": 0.05255126953125, "loss_aux_layer_6": 0.05560302734375, "loss_aux_layer_7": 0.053955078125, "loss_aux_layer_8": 0.053466796875, "loss_aux_layer_9": 0.0523681640625, "step": 5048, "total_loss": 0.6076054722070694 }, { "epoch": 0.9996040388041972, "grad_norm": 0.7807745337486267, "learning_rate": 5e-05, "llm_loss": 0.5974633544683456, "loss": 2.6887, "loss_aux_layer_0": 0.01055908203125, "loss_aux_layer_1": 0.027496337890625, "loss_aux_layer_10": 0.0526123046875, "loss_aux_layer_11": 0.056640625, "loss_aux_layer_12": 0.060791015625, "loss_aux_layer_13": 0.065673828125, "loss_aux_layer_14": 0.0732421875, "loss_aux_layer_15": 0.081298828125, "loss_aux_layer_16": 0.0904541015625, "loss_aux_layer_17": 0.097900390625, "loss_aux_layer_18": 0.1055908203125, "loss_aux_layer_19": 0.10888671875, "loss_aux_layer_2": 0.037841796875, "loss_aux_layer_20": 0.1165771484375, "loss_aux_layer_21": 0.1246337890625, "loss_aux_layer_22": 0.144287109375, "loss_aux_layer_23": 0.1796875, "loss_aux_layer_3": 0.047119140625, "loss_aux_layer_4": 0.04986572265625, "loss_aux_layer_5": 0.05120849609375, "loss_aux_layer_6": 0.05426025390625, "loss_aux_layer_7": 0.0528564453125, "loss_aux_layer_8": 0.05255126953125, "loss_aux_layer_9": 0.05157470703125, "step": 5049, "total_loss": 0.6721688657999039 }, { "epoch": 0.9998020194020986, "grad_norm": 0.7733916640281677, "learning_rate": 5e-05, "llm_loss": 0.529767818748951, "loss": 2.4263, "loss_aux_layer_0": 0.0104217529296875, "loss_aux_layer_1": 0.029571533203125, "loss_aux_layer_10": 0.05487060546875, "loss_aux_layer_11": 0.05859375, "loss_aux_layer_12": 0.06256103515625, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.0748291015625, "loss_aux_layer_15": 0.0823974609375, "loss_aux_layer_16": 0.0909423828125, "loss_aux_layer_17": 0.098388671875, "loss_aux_layer_18": 0.106201171875, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.0408935546875, "loss_aux_layer_20": 0.1180419921875, "loss_aux_layer_21": 0.126220703125, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.05078125, "loss_aux_layer_4": 0.0535888671875, "loss_aux_layer_5": 0.05487060546875, "loss_aux_layer_6": 0.05780029296875, "loss_aux_layer_7": 0.05596923828125, "loss_aux_layer_8": 0.05523681640625, "loss_aux_layer_9": 0.0538330078125, "step": 5050, "total_loss": 0.6065840795636177 }, { "epoch": 1.0, "grad_norm": 0.7586356997489929, "learning_rate": 5e-05, "llm_loss": 0.5601960569620132, "loss": 2.5429, "loss_aux_layer_0": 0.009429931640625, "loss_aux_layer_1": 0.0279541015625, "loss_aux_layer_10": 0.05364990234375, "loss_aux_layer_11": 0.057373046875, "loss_aux_layer_12": 0.0615234375, "loss_aux_layer_13": 0.0667724609375, "loss_aux_layer_14": 0.07421875, "loss_aux_layer_15": 0.0821533203125, "loss_aux_layer_16": 0.0909423828125, "loss_aux_layer_17": 0.09814453125, "loss_aux_layer_18": 0.106201171875, "loss_aux_layer_19": 0.109130859375, "loss_aux_layer_2": 0.03924560546875, "loss_aux_layer_20": 0.11669921875, "loss_aux_layer_21": 0.124755859375, "loss_aux_layer_22": 0.1455078125, "loss_aux_layer_23": 0.180908203125, "loss_aux_layer_3": 0.0489501953125, "loss_aux_layer_4": 0.0513916015625, "loss_aux_layer_5": 0.0528564453125, "loss_aux_layer_6": 0.0557861328125, "loss_aux_layer_7": 0.05426025390625, "loss_aux_layer_8": 0.05364990234375, "loss_aux_layer_9": 0.0523681640625, "step": 5051, "total_loss": 0.6357362121343613 }, { "epoch": 1.0001979805979013, "grad_norm": 0.8318032026290894, "learning_rate": 5e-05, "llm_loss": 0.5606222748756409, "loss": 2.5424, "loss_aux_layer_0": 0.01025390625, "loss_aux_layer_1": 0.02789306640625, "loss_aux_layer_10": 0.052490234375, "loss_aux_layer_11": 0.05621337890625, "loss_aux_layer_12": 0.0601806640625, "loss_aux_layer_13": 0.065185546875, "loss_aux_layer_14": 0.0731201171875, "loss_aux_layer_15": 0.0811767578125, "loss_aux_layer_16": 0.090087890625, "loss_aux_layer_17": 0.09716796875, "loss_aux_layer_18": 0.1053466796875, "loss_aux_layer_19": 0.1090087890625, "loss_aux_layer_2": 0.0390625, "loss_aux_layer_20": 0.116455078125, "loss_aux_layer_21": 0.1248779296875, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.181640625, "loss_aux_layer_3": 0.048828125, "loss_aux_layer_4": 0.05084228515625, "loss_aux_layer_5": 0.05218505859375, "loss_aux_layer_6": 0.0548095703125, "loss_aux_layer_7": 0.0528564453125, "loss_aux_layer_8": 0.0523681640625, "loss_aux_layer_9": 0.05108642578125, "step": 5052, "total_loss": 0.6355888694524765 }, { "epoch": 1.0003959611958029, "grad_norm": 0.7623459696769714, "learning_rate": 5e-05, "llm_loss": 0.5282363891601562, "loss": 2.4331, "loss_aux_layer_0": 0.0099029541015625, "loss_aux_layer_1": 0.03057861328125, "loss_aux_layer_10": 0.05743408203125, "loss_aux_layer_11": 0.06158447265625, "loss_aux_layer_12": 0.06585693359375, "loss_aux_layer_13": 0.07122802734375, "loss_aux_layer_14": 0.0791015625, "loss_aux_layer_15": 0.087646484375, "loss_aux_layer_16": 0.0965576171875, "loss_aux_layer_17": 0.10400390625, "loss_aux_layer_18": 0.1116943359375, "loss_aux_layer_19": 0.1143798828125, "loss_aux_layer_2": 0.0428466796875, "loss_aux_layer_20": 0.12158203125, "loss_aux_layer_21": 0.1298828125, "loss_aux_layer_22": 0.15087890625, "loss_aux_layer_23": 0.1875, "loss_aux_layer_3": 0.05322265625, "loss_aux_layer_4": 0.0556640625, "loss_aux_layer_5": 0.05706787109375, "loss_aux_layer_6": 0.06011962890625, "loss_aux_layer_7": 0.05810546875, "loss_aux_layer_8": 0.0576171875, "loss_aux_layer_9": 0.05615234375, "step": 5053, "total_loss": 0.6082636117935181 }, { "epoch": 1.0005939417937042, "grad_norm": 0.8379309773445129, "learning_rate": 5e-05, "llm_loss": 0.6122632026672363, "loss": 2.7396, "loss_aux_layer_0": 0.0095672607421875, "loss_aux_layer_1": 0.0267333984375, "loss_aux_layer_10": 0.050048828125, "loss_aux_layer_11": 0.0538330078125, "loss_aux_layer_12": 0.0574951171875, "loss_aux_layer_13": 0.06231689453125, "loss_aux_layer_14": 0.0703125, "loss_aux_layer_15": 0.0784912109375, "loss_aux_layer_16": 0.087646484375, "loss_aux_layer_17": 0.095458984375, "loss_aux_layer_18": 0.103271484375, "loss_aux_layer_19": 0.10791015625, "loss_aux_layer_2": 0.0367431640625, "loss_aux_layer_20": 0.1156005859375, "loss_aux_layer_21": 0.123291015625, "loss_aux_layer_22": 0.14306640625, "loss_aux_layer_23": 0.177978515625, "loss_aux_layer_3": 0.04571533203125, "loss_aux_layer_4": 0.0478515625, "loss_aux_layer_5": 0.04925537109375, "loss_aux_layer_6": 0.05181884765625, "loss_aux_layer_7": 0.05010986328125, "loss_aux_layer_8": 0.0498046875, "loss_aux_layer_9": 0.0487060546875, "step": 5054, "total_loss": 0.6848896592855453 }, { "epoch": 1.0007919223916055, "grad_norm": 0.8892002701759338, "learning_rate": 5e-05, "llm_loss": 0.5361632853746414, "loss": 2.4531, "loss_aux_layer_0": 0.01019287109375, "loss_aux_layer_1": 0.028839111328125, "loss_aux_layer_10": 0.0543212890625, "loss_aux_layer_11": 0.05804443359375, "loss_aux_layer_12": 0.06201171875, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.08349609375, "loss_aux_layer_16": 0.0926513671875, "loss_aux_layer_17": 0.1004638671875, "loss_aux_layer_18": 0.1085205078125, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.04046630859375, "loss_aux_layer_20": 0.1201171875, "loss_aux_layer_21": 0.127685546875, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.05023193359375, "loss_aux_layer_4": 0.05267333984375, "loss_aux_layer_5": 0.05419921875, "loss_aux_layer_6": 0.05694580078125, "loss_aux_layer_7": 0.05517578125, "loss_aux_layer_8": 0.0543212890625, "loss_aux_layer_9": 0.052978515625, "step": 5055, "total_loss": 0.6132688820362091 }, { "epoch": 1.000989902989507, "grad_norm": 0.7805729508399963, "learning_rate": 5e-05, "llm_loss": 0.4767587408423424, "loss": 2.209, "loss_aux_layer_0": 0.009429931640625, "loss_aux_layer_1": 0.02777099609375, "loss_aux_layer_10": 0.0526123046875, "loss_aux_layer_11": 0.0565185546875, "loss_aux_layer_12": 0.06060791015625, "loss_aux_layer_13": 0.065673828125, "loss_aux_layer_14": 0.0736083984375, "loss_aux_layer_15": 0.0819091796875, "loss_aux_layer_16": 0.0909423828125, "loss_aux_layer_17": 0.0982666015625, "loss_aux_layer_18": 0.1063232421875, "loss_aux_layer_19": 0.1102294921875, "loss_aux_layer_2": 0.038330078125, "loss_aux_layer_20": 0.11865234375, "loss_aux_layer_21": 0.1273193359375, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.0478515625, "loss_aux_layer_4": 0.05035400390625, "loss_aux_layer_5": 0.0516357421875, "loss_aux_layer_6": 0.0545654296875, "loss_aux_layer_7": 0.0531005859375, "loss_aux_layer_8": 0.052490234375, "loss_aux_layer_9": 0.051513671875, "step": 5056, "total_loss": 0.5522404611110687 }, { "epoch": 1.0011878835874084, "grad_norm": 0.8113840818405151, "learning_rate": 5e-05, "llm_loss": 0.47367503494024277, "loss": 2.2027, "loss_aux_layer_0": 0.009368896484375, "loss_aux_layer_1": 0.02850341796875, "loss_aux_layer_10": 0.054443359375, "loss_aux_layer_11": 0.058349609375, "loss_aux_layer_12": 0.06256103515625, "loss_aux_layer_13": 0.06787109375, "loss_aux_layer_14": 0.0758056640625, "loss_aux_layer_15": 0.0836181640625, "loss_aux_layer_16": 0.092529296875, "loss_aux_layer_17": 0.099853515625, "loss_aux_layer_18": 0.1075439453125, "loss_aux_layer_19": 0.111572265625, "loss_aux_layer_2": 0.03997802734375, "loss_aux_layer_20": 0.119140625, "loss_aux_layer_21": 0.1280517578125, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.0498046875, "loss_aux_layer_4": 0.0521240234375, "loss_aux_layer_5": 0.05364990234375, "loss_aux_layer_6": 0.05657958984375, "loss_aux_layer_7": 0.05517578125, "loss_aux_layer_8": 0.054443359375, "loss_aux_layer_9": 0.05316162109375, "step": 5057, "total_loss": 0.5506782904267311 }, { "epoch": 1.00138586418531, "grad_norm": 0.7782441973686218, "learning_rate": 5e-05, "llm_loss": 0.4723656699061394, "loss": 2.1865, "loss_aux_layer_0": 0.0096588134765625, "loss_aux_layer_1": 0.027191162109375, "loss_aux_layer_10": 0.05145263671875, "loss_aux_layer_11": 0.05523681640625, "loss_aux_layer_12": 0.059326171875, "loss_aux_layer_13": 0.06414794921875, "loss_aux_layer_14": 0.0721435546875, "loss_aux_layer_15": 0.0804443359375, "loss_aux_layer_16": 0.089599609375, "loss_aux_layer_17": 0.096923828125, "loss_aux_layer_18": 0.10498046875, "loss_aux_layer_19": 0.108642578125, "loss_aux_layer_2": 0.03790283203125, "loss_aux_layer_20": 0.1162109375, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.18310546875, "loss_aux_layer_3": 0.046875, "loss_aux_layer_4": 0.04937744140625, "loss_aux_layer_5": 0.0506591796875, "loss_aux_layer_6": 0.05340576171875, "loss_aux_layer_7": 0.0518798828125, "loss_aux_layer_8": 0.05120849609375, "loss_aux_layer_9": 0.05023193359375, "step": 5058, "total_loss": 0.5466337203979492 }, { "epoch": 1.0015838447832113, "grad_norm": 0.7872233390808105, "learning_rate": 5e-05, "llm_loss": 0.535411074757576, "loss": 2.438, "loss_aux_layer_0": 0.0089569091796875, "loss_aux_layer_1": 0.0263671875, "loss_aux_layer_10": 0.05157470703125, "loss_aux_layer_11": 0.05517578125, "loss_aux_layer_12": 0.05902099609375, "loss_aux_layer_13": 0.0640869140625, "loss_aux_layer_14": 0.072265625, "loss_aux_layer_15": 0.08056640625, "loss_aux_layer_16": 0.0897216796875, "loss_aux_layer_17": 0.0977783203125, "loss_aux_layer_18": 0.1063232421875, "loss_aux_layer_19": 0.110107421875, "loss_aux_layer_2": 0.037109375, "loss_aux_layer_20": 0.1180419921875, "loss_aux_layer_21": 0.125244140625, "loss_aux_layer_22": 0.144775390625, "loss_aux_layer_23": 0.18017578125, "loss_aux_layer_3": 0.04620361328125, "loss_aux_layer_4": 0.04864501953125, "loss_aux_layer_5": 0.04998779296875, "loss_aux_layer_6": 0.052978515625, "loss_aux_layer_7": 0.051513671875, "loss_aux_layer_8": 0.05126953125, "loss_aux_layer_9": 0.05035400390625, "step": 5059, "total_loss": 0.6094951629638672 }, { "epoch": 1.0017818253811126, "grad_norm": 0.8771255016326904, "learning_rate": 5e-05, "llm_loss": 0.5188149437308311, "loss": 2.3789, "loss_aux_layer_0": 0.0097198486328125, "loss_aux_layer_1": 0.027435302734375, "loss_aux_layer_10": 0.05316162109375, "loss_aux_layer_11": 0.05694580078125, "loss_aux_layer_12": 0.061279296875, "loss_aux_layer_13": 0.06689453125, "loss_aux_layer_14": 0.07470703125, "loss_aux_layer_15": 0.0831298828125, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.099853515625, "loss_aux_layer_18": 0.1080322265625, "loss_aux_layer_19": 0.111328125, "loss_aux_layer_2": 0.03851318359375, "loss_aux_layer_20": 0.119140625, "loss_aux_layer_21": 0.1265869140625, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.04803466796875, "loss_aux_layer_4": 0.05047607421875, "loss_aux_layer_5": 0.052001953125, "loss_aux_layer_6": 0.0548095703125, "loss_aux_layer_7": 0.05328369140625, "loss_aux_layer_8": 0.05291748046875, "loss_aux_layer_9": 0.05181884765625, "step": 5060, "total_loss": 0.5947283059358597 }, { "epoch": 1.0019798059790141, "grad_norm": 0.865129828453064, "learning_rate": 5e-05, "llm_loss": 0.5461672320961952, "loss": 2.4778, "loss_aux_layer_0": 0.0099029541015625, "loss_aux_layer_1": 0.026458740234375, "loss_aux_layer_10": 0.0516357421875, "loss_aux_layer_11": 0.0552978515625, "loss_aux_layer_12": 0.05938720703125, "loss_aux_layer_13": 0.06439208984375, "loss_aux_layer_14": 0.0721435546875, "loss_aux_layer_15": 0.080078125, "loss_aux_layer_16": 0.0889892578125, "loss_aux_layer_17": 0.09619140625, "loss_aux_layer_18": 0.1038818359375, "loss_aux_layer_19": 0.1072998046875, "loss_aux_layer_2": 0.03717041015625, "loss_aux_layer_20": 0.1146240234375, "loss_aux_layer_21": 0.1220703125, "loss_aux_layer_22": 0.1416015625, "loss_aux_layer_23": 0.176513671875, "loss_aux_layer_3": 0.04632568359375, "loss_aux_layer_4": 0.04876708984375, "loss_aux_layer_5": 0.05029296875, "loss_aux_layer_6": 0.05316162109375, "loss_aux_layer_7": 0.0517578125, "loss_aux_layer_8": 0.05120849609375, "loss_aux_layer_9": 0.05029296875, "step": 5061, "total_loss": 0.6194608509540558 }, { "epoch": 1.0021777865769155, "grad_norm": 0.9588850140571594, "learning_rate": 5e-05, "llm_loss": 0.5264318883419037, "loss": 2.4121, "loss_aux_layer_0": 0.00921630859375, "loss_aux_layer_1": 0.027740478515625, "loss_aux_layer_10": 0.0546875, "loss_aux_layer_11": 0.05841064453125, "loss_aux_layer_12": 0.06219482421875, "loss_aux_layer_13": 0.06707763671875, "loss_aux_layer_14": 0.0750732421875, "loss_aux_layer_15": 0.0830078125, "loss_aux_layer_16": 0.0916748046875, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.106689453125, "loss_aux_layer_19": 0.110595703125, "loss_aux_layer_2": 0.0399169921875, "loss_aux_layer_20": 0.1187744140625, "loss_aux_layer_21": 0.1275634765625, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.04962158203125, "loss_aux_layer_4": 0.05218505859375, "loss_aux_layer_5": 0.0535888671875, "loss_aux_layer_6": 0.056640625, "loss_aux_layer_7": 0.05499267578125, "loss_aux_layer_8": 0.05462646484375, "loss_aux_layer_9": 0.05352783203125, "step": 5062, "total_loss": 0.6030267179012299 }, { "epoch": 1.0023757671748168, "grad_norm": 1.0789505243301392, "learning_rate": 5e-05, "llm_loss": 0.585979625582695, "loss": 2.6495, "loss_aux_layer_0": 0.009613037109375, "loss_aux_layer_1": 0.028228759765625, "loss_aux_layer_10": 0.054931640625, "loss_aux_layer_11": 0.0587158203125, "loss_aux_layer_12": 0.0628662109375, "loss_aux_layer_13": 0.06787109375, "loss_aux_layer_14": 0.0755615234375, "loss_aux_layer_15": 0.0830078125, "loss_aux_layer_16": 0.091552734375, "loss_aux_layer_17": 0.0992431640625, "loss_aux_layer_18": 0.10693359375, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.04010009765625, "loss_aux_layer_20": 0.1173095703125, "loss_aux_layer_21": 0.1248779296875, "loss_aux_layer_22": 0.145263671875, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.04998779296875, "loss_aux_layer_4": 0.052490234375, "loss_aux_layer_5": 0.0540771484375, "loss_aux_layer_6": 0.05712890625, "loss_aux_layer_7": 0.055419921875, "loss_aux_layer_8": 0.0548095703125, "loss_aux_layer_9": 0.05364990234375, "step": 5063, "total_loss": 0.6623864322900772 }, { "epoch": 1.0025737477727183, "grad_norm": 0.8484581708908081, "learning_rate": 5e-05, "llm_loss": 0.581521138548851, "loss": 2.645, "loss_aux_layer_0": 0.0089263916015625, "loss_aux_layer_1": 0.030059814453125, "loss_aux_layer_10": 0.0579833984375, "loss_aux_layer_11": 0.0621337890625, "loss_aux_layer_12": 0.06640625, "loss_aux_layer_13": 0.0714111328125, "loss_aux_layer_14": 0.0794677734375, "loss_aux_layer_15": 0.087158203125, "loss_aux_layer_16": 0.0960693359375, "loss_aux_layer_17": 0.1031494140625, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.11328125, "loss_aux_layer_2": 0.04241943359375, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.05279541015625, "loss_aux_layer_4": 0.0555419921875, "loss_aux_layer_5": 0.05694580078125, "loss_aux_layer_6": 0.06011962890625, "loss_aux_layer_7": 0.05859375, "loss_aux_layer_8": 0.05792236328125, "loss_aux_layer_9": 0.0567626953125, "step": 5064, "total_loss": 0.6612520068883896 }, { "epoch": 1.0027717283706197, "grad_norm": 0.950445830821991, "learning_rate": 5e-05, "llm_loss": 0.5476315319538116, "loss": 2.5024, "loss_aux_layer_0": 0.0093231201171875, "loss_aux_layer_1": 0.028717041015625, "loss_aux_layer_10": 0.05584716796875, "loss_aux_layer_11": 0.05975341796875, "loss_aux_layer_12": 0.0640869140625, "loss_aux_layer_13": 0.0693359375, "loss_aux_layer_14": 0.07763671875, "loss_aux_layer_15": 0.085693359375, "loss_aux_layer_16": 0.0946044921875, "loss_aux_layer_17": 0.1019287109375, "loss_aux_layer_18": 0.1099853515625, "loss_aux_layer_19": 0.1123046875, "loss_aux_layer_2": 0.041259765625, "loss_aux_layer_20": 0.119384765625, "loss_aux_layer_21": 0.1265869140625, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.182373046875, "loss_aux_layer_3": 0.0513916015625, "loss_aux_layer_4": 0.053955078125, "loss_aux_layer_5": 0.0555419921875, "loss_aux_layer_6": 0.0582275390625, "loss_aux_layer_7": 0.05633544921875, "loss_aux_layer_8": 0.0557861328125, "loss_aux_layer_9": 0.05462646484375, "step": 5065, "total_loss": 0.6256067901849747 }, { "epoch": 1.002969708968521, "grad_norm": 0.7714684009552002, "learning_rate": 5e-05, "llm_loss": 0.5322218015789986, "loss": 2.4298, "loss_aux_layer_0": 0.00970458984375, "loss_aux_layer_1": 0.02691650390625, "loss_aux_layer_10": 0.05242919921875, "loss_aux_layer_11": 0.0557861328125, "loss_aux_layer_12": 0.05975341796875, "loss_aux_layer_13": 0.06451416015625, "loss_aux_layer_14": 0.0726318359375, "loss_aux_layer_15": 0.081298828125, "loss_aux_layer_16": 0.0904541015625, "loss_aux_layer_17": 0.0985107421875, "loss_aux_layer_18": 0.1065673828125, "loss_aux_layer_19": 0.1109619140625, "loss_aux_layer_2": 0.03863525390625, "loss_aux_layer_20": 0.1185302734375, "loss_aux_layer_21": 0.126220703125, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.04815673828125, "loss_aux_layer_4": 0.050537109375, "loss_aux_layer_5": 0.0518798828125, "loss_aux_layer_6": 0.05438232421875, "loss_aux_layer_7": 0.05291748046875, "loss_aux_layer_8": 0.05242919921875, "loss_aux_layer_9": 0.05126953125, "step": 5066, "total_loss": 0.6074472367763519 }, { "epoch": 1.0031676895664225, "grad_norm": 0.762367844581604, "learning_rate": 5e-05, "llm_loss": 0.5253468006849289, "loss": 2.3953, "loss_aux_layer_0": 0.0091094970703125, "loss_aux_layer_1": 0.026458740234375, "loss_aux_layer_10": 0.05157470703125, "loss_aux_layer_11": 0.0552978515625, "loss_aux_layer_12": 0.05938720703125, "loss_aux_layer_13": 0.0638427734375, "loss_aux_layer_14": 0.0712890625, "loss_aux_layer_15": 0.0789794921875, "loss_aux_layer_16": 0.0877685546875, "loss_aux_layer_17": 0.0953369140625, "loss_aux_layer_18": 0.103515625, "loss_aux_layer_19": 0.1072998046875, "loss_aux_layer_2": 0.03729248046875, "loss_aux_layer_20": 0.1156005859375, "loss_aux_layer_21": 0.123779296875, "loss_aux_layer_22": 0.144287109375, "loss_aux_layer_23": 0.18017578125, "loss_aux_layer_3": 0.04644775390625, "loss_aux_layer_4": 0.04876708984375, "loss_aux_layer_5": 0.05047607421875, "loss_aux_layer_6": 0.05316162109375, "loss_aux_layer_7": 0.05169677734375, "loss_aux_layer_8": 0.05126953125, "loss_aux_layer_9": 0.0504150390625, "step": 5067, "total_loss": 0.598819762468338 }, { "epoch": 1.0033656701643239, "grad_norm": 0.857394814491272, "learning_rate": 5e-05, "llm_loss": 0.5312095806002617, "loss": 2.4184, "loss_aux_layer_0": 0.009918212890625, "loss_aux_layer_1": 0.025115966796875, "loss_aux_layer_10": 0.04962158203125, "loss_aux_layer_11": 0.05322265625, "loss_aux_layer_12": 0.0574951171875, "loss_aux_layer_13": 0.0623779296875, "loss_aux_layer_14": 0.071044921875, "loss_aux_layer_15": 0.0797119140625, "loss_aux_layer_16": 0.0897216796875, "loss_aux_layer_17": 0.09765625, "loss_aux_layer_18": 0.1058349609375, "loss_aux_layer_19": 0.1104736328125, "loss_aux_layer_2": 0.03546142578125, "loss_aux_layer_20": 0.11865234375, "loss_aux_layer_21": 0.127197265625, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.04443359375, "loss_aux_layer_4": 0.04638671875, "loss_aux_layer_5": 0.0478515625, "loss_aux_layer_6": 0.05059814453125, "loss_aux_layer_7": 0.04931640625, "loss_aux_layer_8": 0.04888916015625, "loss_aux_layer_9": 0.04833984375, "step": 5068, "total_loss": 0.6046041324734688 }, { "epoch": 1.0035636507622252, "grad_norm": 0.7994987964630127, "learning_rate": 5e-05, "llm_loss": 0.5447623655200005, "loss": 2.4716, "loss_aux_layer_0": 0.00872802734375, "loss_aux_layer_1": 0.027191162109375, "loss_aux_layer_10": 0.05120849609375, "loss_aux_layer_11": 0.05474853515625, "loss_aux_layer_12": 0.0587158203125, "loss_aux_layer_13": 0.06329345703125, "loss_aux_layer_14": 0.071044921875, "loss_aux_layer_15": 0.0791015625, "loss_aux_layer_16": 0.08740234375, "loss_aux_layer_17": 0.0950927734375, "loss_aux_layer_18": 0.1026611328125, "loss_aux_layer_19": 0.1063232421875, "loss_aux_layer_2": 0.037841796875, "loss_aux_layer_20": 0.1141357421875, "loss_aux_layer_21": 0.1224365234375, "loss_aux_layer_22": 0.142333984375, "loss_aux_layer_23": 0.177734375, "loss_aux_layer_3": 0.04754638671875, "loss_aux_layer_4": 0.049560546875, "loss_aux_layer_5": 0.05096435546875, "loss_aux_layer_6": 0.05377197265625, "loss_aux_layer_7": 0.052001953125, "loss_aux_layer_8": 0.05133056640625, "loss_aux_layer_9": 0.0501708984375, "step": 5069, "total_loss": 0.617892786860466 }, { "epoch": 1.0037616313601267, "grad_norm": 0.7575060725212097, "learning_rate": 5e-05, "llm_loss": 0.5175068825483322, "loss": 2.3819, "loss_aux_layer_0": 0.0098724365234375, "loss_aux_layer_1": 0.0286865234375, "loss_aux_layer_10": 0.05596923828125, "loss_aux_layer_11": 0.05975341796875, "loss_aux_layer_12": 0.06390380859375, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.07763671875, "loss_aux_layer_15": 0.0855712890625, "loss_aux_layer_16": 0.0946044921875, "loss_aux_layer_17": 0.1019287109375, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.1123046875, "loss_aux_layer_2": 0.04071044921875, "loss_aux_layer_20": 0.1197509765625, "loss_aux_layer_21": 0.1275634765625, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.05078125, "loss_aux_layer_4": 0.0533447265625, "loss_aux_layer_5": 0.05474853515625, "loss_aux_layer_6": 0.0577392578125, "loss_aux_layer_7": 0.0560302734375, "loss_aux_layer_8": 0.05548095703125, "loss_aux_layer_9": 0.05450439453125, "step": 5070, "total_loss": 0.5954712629318237 }, { "epoch": 1.003959611958028, "grad_norm": 0.8499872088432312, "learning_rate": 5e-05, "llm_loss": 0.4803417846560478, "loss": 2.2076, "loss_aux_layer_0": 0.009124755859375, "loss_aux_layer_1": 0.0260009765625, "loss_aux_layer_10": 0.0499267578125, "loss_aux_layer_11": 0.0535888671875, "loss_aux_layer_12": 0.05755615234375, "loss_aux_layer_13": 0.0625, "loss_aux_layer_14": 0.0701904296875, "loss_aux_layer_15": 0.0782470703125, "loss_aux_layer_16": 0.087158203125, "loss_aux_layer_17": 0.094970703125, "loss_aux_layer_18": 0.1025390625, "loss_aux_layer_19": 0.10498046875, "loss_aux_layer_2": 0.03631591796875, "loss_aux_layer_20": 0.1123046875, "loss_aux_layer_21": 0.11962890625, "loss_aux_layer_22": 0.138427734375, "loss_aux_layer_23": 0.173095703125, "loss_aux_layer_3": 0.04498291015625, "loss_aux_layer_4": 0.04736328125, "loss_aux_layer_5": 0.04876708984375, "loss_aux_layer_6": 0.05145263671875, "loss_aux_layer_7": 0.04998779296875, "loss_aux_layer_8": 0.04962158203125, "loss_aux_layer_9": 0.04876708984375, "step": 5071, "total_loss": 0.5519068092107773 }, { "epoch": 1.0041575925559296, "grad_norm": 0.8392246961593628, "learning_rate": 5e-05, "llm_loss": 0.4387032613158226, "loss": 2.0573, "loss_aux_layer_0": 0.00982666015625, "loss_aux_layer_1": 0.0269775390625, "loss_aux_layer_10": 0.0526123046875, "loss_aux_layer_11": 0.05657958984375, "loss_aux_layer_12": 0.06048583984375, "loss_aux_layer_13": 0.06573486328125, "loss_aux_layer_14": 0.07373046875, "loss_aux_layer_15": 0.081787109375, "loss_aux_layer_16": 0.091064453125, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.1072998046875, "loss_aux_layer_19": 0.1114501953125, "loss_aux_layer_2": 0.03802490234375, "loss_aux_layer_20": 0.119140625, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.04736328125, "loss_aux_layer_4": 0.0498046875, "loss_aux_layer_5": 0.0513916015625, "loss_aux_layer_6": 0.0543212890625, "loss_aux_layer_7": 0.052734375, "loss_aux_layer_8": 0.05230712890625, "loss_aux_layer_9": 0.051513671875, "step": 5072, "total_loss": 0.5143308490514755 }, { "epoch": 1.004355573153831, "grad_norm": 0.9405243396759033, "learning_rate": 5e-05, "llm_loss": 0.4930117428302765, "loss": 2.27, "loss_aux_layer_0": 0.00921630859375, "loss_aux_layer_1": 0.026580810546875, "loss_aux_layer_10": 0.05169677734375, "loss_aux_layer_11": 0.055419921875, "loss_aux_layer_12": 0.05938720703125, "loss_aux_layer_13": 0.064453125, "loss_aux_layer_14": 0.0726318359375, "loss_aux_layer_15": 0.0810546875, "loss_aux_layer_16": 0.0902099609375, "loss_aux_layer_17": 0.09765625, "loss_aux_layer_18": 0.1058349609375, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.0374755859375, "loss_aux_layer_20": 0.1175537109375, "loss_aux_layer_21": 0.125732421875, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.046875, "loss_aux_layer_4": 0.04931640625, "loss_aux_layer_5": 0.05072021484375, "loss_aux_layer_6": 0.053466796875, "loss_aux_layer_7": 0.0517578125, "loss_aux_layer_8": 0.05133056640625, "loss_aux_layer_9": 0.050537109375, "step": 5073, "total_loss": 0.5674934834241867 }, { "epoch": 1.0045535537517323, "grad_norm": 1.029306411743164, "learning_rate": 5e-05, "llm_loss": 0.5682989209890366, "loss": 2.5665, "loss_aux_layer_0": 0.0096893310546875, "loss_aux_layer_1": 0.027069091796875, "loss_aux_layer_10": 0.051025390625, "loss_aux_layer_11": 0.05462646484375, "loss_aux_layer_12": 0.05853271484375, "loss_aux_layer_13": 0.06329345703125, "loss_aux_layer_14": 0.0714111328125, "loss_aux_layer_15": 0.0794677734375, "loss_aux_layer_16": 0.0882568359375, "loss_aux_layer_17": 0.095947265625, "loss_aux_layer_18": 0.10400390625, "loss_aux_layer_19": 0.107177734375, "loss_aux_layer_2": 0.03759765625, "loss_aux_layer_20": 0.115234375, "loss_aux_layer_21": 0.122802734375, "loss_aux_layer_22": 0.14306640625, "loss_aux_layer_23": 0.178466796875, "loss_aux_layer_3": 0.046875, "loss_aux_layer_4": 0.0491943359375, "loss_aux_layer_5": 0.0506591796875, "loss_aux_layer_6": 0.053466796875, "loss_aux_layer_7": 0.0516357421875, "loss_aux_layer_8": 0.051025390625, "loss_aux_layer_9": 0.04974365234375, "step": 5074, "total_loss": 0.6416255980730057 }, { "epoch": 1.0047515343496338, "grad_norm": 0.8641497492790222, "learning_rate": 5e-05, "llm_loss": 0.5106848329305649, "loss": 2.3441, "loss_aux_layer_0": 0.009674072265625, "loss_aux_layer_1": 0.0264892578125, "loss_aux_layer_10": 0.05230712890625, "loss_aux_layer_11": 0.0560302734375, "loss_aux_layer_12": 0.0604248046875, "loss_aux_layer_13": 0.065673828125, "loss_aux_layer_14": 0.0740966796875, "loss_aux_layer_15": 0.082763671875, "loss_aux_layer_16": 0.09228515625, "loss_aux_layer_17": 0.1004638671875, "loss_aux_layer_18": 0.108154296875, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.037841796875, "loss_aux_layer_20": 0.119384765625, "loss_aux_layer_21": 0.1263427734375, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.1806640625, "loss_aux_layer_3": 0.04718017578125, "loss_aux_layer_4": 0.0496826171875, "loss_aux_layer_5": 0.05120849609375, "loss_aux_layer_6": 0.05426025390625, "loss_aux_layer_7": 0.052490234375, "loss_aux_layer_8": 0.052001953125, "loss_aux_layer_9": 0.05096435546875, "step": 5075, "total_loss": 0.5860202610492706 }, { "epoch": 1.0049495149475351, "grad_norm": 0.8653287887573242, "learning_rate": 5e-05, "llm_loss": 0.4781375080347061, "loss": 2.211, "loss_aux_layer_0": 0.0090484619140625, "loss_aux_layer_1": 0.027130126953125, "loss_aux_layer_10": 0.05169677734375, "loss_aux_layer_11": 0.05535888671875, "loss_aux_layer_12": 0.05926513671875, "loss_aux_layer_13": 0.064208984375, "loss_aux_layer_14": 0.0718994140625, "loss_aux_layer_15": 0.0804443359375, "loss_aux_layer_16": 0.089599609375, "loss_aux_layer_17": 0.096923828125, "loss_aux_layer_18": 0.105224609375, "loss_aux_layer_19": 0.109130859375, "loss_aux_layer_2": 0.03851318359375, "loss_aux_layer_20": 0.1173095703125, "loss_aux_layer_21": 0.1265869140625, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.04766845703125, "loss_aux_layer_4": 0.0499267578125, "loss_aux_layer_5": 0.05133056640625, "loss_aux_layer_6": 0.0538330078125, "loss_aux_layer_7": 0.05224609375, "loss_aux_layer_8": 0.05169677734375, "loss_aux_layer_9": 0.0506591796875, "step": 5076, "total_loss": 0.5527448952198029 }, { "epoch": 1.0051474955454365, "grad_norm": 0.9953750967979431, "learning_rate": 5e-05, "llm_loss": 0.47400810569524765, "loss": 2.208, "loss_aux_layer_0": 0.0099334716796875, "loss_aux_layer_1": 0.028778076171875, "loss_aux_layer_10": 0.055419921875, "loss_aux_layer_11": 0.05926513671875, "loss_aux_layer_12": 0.063720703125, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.07763671875, "loss_aux_layer_15": 0.085693359375, "loss_aux_layer_16": 0.094970703125, "loss_aux_layer_17": 0.1024169921875, "loss_aux_layer_18": 0.1104736328125, "loss_aux_layer_19": 0.113037109375, "loss_aux_layer_2": 0.040283203125, "loss_aux_layer_20": 0.1204833984375, "loss_aux_layer_21": 0.1280517578125, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.05023193359375, "loss_aux_layer_4": 0.0528564453125, "loss_aux_layer_5": 0.054443359375, "loss_aux_layer_6": 0.0572509765625, "loss_aux_layer_7": 0.0556640625, "loss_aux_layer_8": 0.05523681640625, "loss_aux_layer_9": 0.0540771484375, "step": 5077, "total_loss": 0.552000530064106 }, { "epoch": 1.005345476143338, "grad_norm": 0.8843179941177368, "learning_rate": 5e-05, "llm_loss": 0.6016043499112129, "loss": 2.7168, "loss_aux_layer_0": 0.009002685546875, "loss_aux_layer_1": 0.02911376953125, "loss_aux_layer_10": 0.055908203125, "loss_aux_layer_11": 0.0595703125, "loss_aux_layer_12": 0.0638427734375, "loss_aux_layer_13": 0.06884765625, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.08447265625, "loss_aux_layer_16": 0.0931396484375, "loss_aux_layer_17": 0.1005859375, "loss_aux_layer_18": 0.1075439453125, "loss_aux_layer_19": 0.110595703125, "loss_aux_layer_2": 0.0411376953125, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.1260986328125, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.18310546875, "loss_aux_layer_3": 0.0513916015625, "loss_aux_layer_4": 0.0540771484375, "loss_aux_layer_5": 0.05548095703125, "loss_aux_layer_6": 0.05816650390625, "loss_aux_layer_7": 0.056640625, "loss_aux_layer_8": 0.05621337890625, "loss_aux_layer_9": 0.05487060546875, "step": 5078, "total_loss": 0.6791914701461792 }, { "epoch": 1.0055434567412393, "grad_norm": 0.9600127339363098, "learning_rate": 5e-05, "llm_loss": 0.5773622691631317, "loss": 2.6093, "loss_aux_layer_0": 0.0105133056640625, "loss_aux_layer_1": 0.028076171875, "loss_aux_layer_10": 0.05291748046875, "loss_aux_layer_11": 0.056396484375, "loss_aux_layer_12": 0.06048583984375, "loss_aux_layer_13": 0.06591796875, "loss_aux_layer_14": 0.0738525390625, "loss_aux_layer_15": 0.081787109375, "loss_aux_layer_16": 0.0904541015625, "loss_aux_layer_17": 0.0980224609375, "loss_aux_layer_18": 0.10595703125, "loss_aux_layer_19": 0.10888671875, "loss_aux_layer_2": 0.03875732421875, "loss_aux_layer_20": 0.1165771484375, "loss_aux_layer_21": 0.1243896484375, "loss_aux_layer_22": 0.14404296875, "loss_aux_layer_23": 0.177490234375, "loss_aux_layer_3": 0.0482177734375, "loss_aux_layer_4": 0.05084228515625, "loss_aux_layer_5": 0.0521240234375, "loss_aux_layer_6": 0.05499267578125, "loss_aux_layer_7": 0.05352783203125, "loss_aux_layer_8": 0.0528564453125, "loss_aux_layer_9": 0.05181884765625, "step": 5079, "total_loss": 0.6523222625255585 }, { "epoch": 1.0057414373391407, "grad_norm": 0.8039948344230652, "learning_rate": 5e-05, "llm_loss": 0.5224622786045074, "loss": 2.3902, "loss_aux_layer_0": 0.0096588134765625, "loss_aux_layer_1": 0.02716064453125, "loss_aux_layer_10": 0.05255126953125, "loss_aux_layer_11": 0.056396484375, "loss_aux_layer_12": 0.0609130859375, "loss_aux_layer_13": 0.0660400390625, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.0819091796875, "loss_aux_layer_16": 0.0908203125, "loss_aux_layer_17": 0.098388671875, "loss_aux_layer_18": 0.1064453125, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.037841796875, "loss_aux_layer_20": 0.117431640625, "loss_aux_layer_21": 0.12548828125, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.182373046875, "loss_aux_layer_3": 0.0472412109375, "loss_aux_layer_4": 0.0498046875, "loss_aux_layer_5": 0.0513916015625, "loss_aux_layer_6": 0.05419921875, "loss_aux_layer_7": 0.052734375, "loss_aux_layer_8": 0.05242919921875, "loss_aux_layer_9": 0.0513916015625, "step": 5080, "total_loss": 0.5975514352321625 }, { "epoch": 1.0059394179370422, "grad_norm": 1.0254775285720825, "learning_rate": 5e-05, "llm_loss": 0.5235463976860046, "loss": 2.4017, "loss_aux_layer_0": 0.0101318359375, "loss_aux_layer_1": 0.02850341796875, "loss_aux_layer_10": 0.054443359375, "loss_aux_layer_11": 0.05841064453125, "loss_aux_layer_12": 0.06280517578125, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.07568359375, "loss_aux_layer_15": 0.0836181640625, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.1002197265625, "loss_aux_layer_18": 0.1077880859375, "loss_aux_layer_19": 0.111328125, "loss_aux_layer_2": 0.03985595703125, "loss_aux_layer_20": 0.118896484375, "loss_aux_layer_21": 0.126953125, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.1845703125, "loss_aux_layer_3": 0.04937744140625, "loss_aux_layer_4": 0.05194091796875, "loss_aux_layer_5": 0.0533447265625, "loss_aux_layer_6": 0.0560302734375, "loss_aux_layer_7": 0.0545654296875, "loss_aux_layer_8": 0.05401611328125, "loss_aux_layer_9": 0.05303955078125, "step": 5081, "total_loss": 0.6004125475883484 }, { "epoch": 1.0061373985349436, "grad_norm": 0.833155632019043, "learning_rate": 5e-05, "llm_loss": 0.461896687746048, "loss": 2.1626, "loss_aux_layer_0": 0.0095367431640625, "loss_aux_layer_1": 0.030303955078125, "loss_aux_layer_10": 0.05633544921875, "loss_aux_layer_11": 0.06036376953125, "loss_aux_layer_12": 0.0643310546875, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.0775146484375, "loss_aux_layer_15": 0.0855712890625, "loss_aux_layer_16": 0.0943603515625, "loss_aux_layer_17": 0.1014404296875, "loss_aux_layer_18": 0.109619140625, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.0426025390625, "loss_aux_layer_20": 0.1202392578125, "loss_aux_layer_21": 0.1280517578125, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.052490234375, "loss_aux_layer_4": 0.05487060546875, "loss_aux_layer_5": 0.05621337890625, "loss_aux_layer_6": 0.058837890625, "loss_aux_layer_7": 0.056884765625, "loss_aux_layer_8": 0.05615234375, "loss_aux_layer_9": 0.05487060546875, "step": 5082, "total_loss": 0.5406514406204224 }, { "epoch": 1.0063353791328449, "grad_norm": 0.929332435131073, "learning_rate": 5e-05, "llm_loss": 0.5220476165413857, "loss": 2.3833, "loss_aux_layer_0": 0.011199951171875, "loss_aux_layer_1": 0.025726318359375, "loss_aux_layer_10": 0.0501708984375, "loss_aux_layer_11": 0.0538330078125, "loss_aux_layer_12": 0.0582275390625, "loss_aux_layer_13": 0.06378173828125, "loss_aux_layer_14": 0.072021484375, "loss_aux_layer_15": 0.0806884765625, "loss_aux_layer_16": 0.0902099609375, "loss_aux_layer_17": 0.0977783203125, "loss_aux_layer_18": 0.1058349609375, "loss_aux_layer_19": 0.1097412109375, "loss_aux_layer_2": 0.036865234375, "loss_aux_layer_20": 0.11767578125, "loss_aux_layer_21": 0.1258544921875, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.182373046875, "loss_aux_layer_3": 0.04559326171875, "loss_aux_layer_4": 0.0478515625, "loss_aux_layer_5": 0.049072265625, "loss_aux_layer_6": 0.0517578125, "loss_aux_layer_7": 0.04998779296875, "loss_aux_layer_8": 0.04962158203125, "loss_aux_layer_9": 0.048828125, "step": 5083, "total_loss": 0.5958207547664642 }, { "epoch": 1.0065333597307464, "grad_norm": 0.8854802250862122, "learning_rate": 5e-05, "llm_loss": 0.577039897441864, "loss": 2.6113, "loss_aux_layer_0": 0.0095977783203125, "loss_aux_layer_1": 0.02703857421875, "loss_aux_layer_10": 0.05316162109375, "loss_aux_layer_11": 0.05694580078125, "loss_aux_layer_12": 0.06109619140625, "loss_aux_layer_13": 0.066162109375, "loss_aux_layer_14": 0.07421875, "loss_aux_layer_15": 0.082763671875, "loss_aux_layer_16": 0.091552734375, "loss_aux_layer_17": 0.0994873046875, "loss_aux_layer_18": 0.10791015625, "loss_aux_layer_19": 0.11083984375, "loss_aux_layer_2": 0.0384521484375, "loss_aux_layer_20": 0.1182861328125, "loss_aux_layer_21": 0.1260986328125, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.048095703125, "loss_aux_layer_4": 0.05084228515625, "loss_aux_layer_5": 0.0523681640625, "loss_aux_layer_6": 0.05523681640625, "loss_aux_layer_7": 0.05352783203125, "loss_aux_layer_8": 0.0531005859375, "loss_aux_layer_9": 0.0518798828125, "step": 5084, "total_loss": 0.6528297364711761 }, { "epoch": 1.0067313403286478, "grad_norm": 0.9539029002189636, "learning_rate": 5e-05, "llm_loss": 0.5253625735640526, "loss": 2.4091, "loss_aux_layer_0": 0.01129150390625, "loss_aux_layer_1": 0.028594970703125, "loss_aux_layer_10": 0.0546875, "loss_aux_layer_11": 0.05859375, "loss_aux_layer_12": 0.06298828125, "loss_aux_layer_13": 0.0679931640625, "loss_aux_layer_14": 0.0760498046875, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.0926513671875, "loss_aux_layer_17": 0.10009765625, "loss_aux_layer_18": 0.107421875, "loss_aux_layer_19": 0.1107177734375, "loss_aux_layer_2": 0.03955078125, "loss_aux_layer_20": 0.1181640625, "loss_aux_layer_21": 0.1268310546875, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.0491943359375, "loss_aux_layer_4": 0.05206298828125, "loss_aux_layer_5": 0.0537109375, "loss_aux_layer_6": 0.05682373046875, "loss_aux_layer_7": 0.05517578125, "loss_aux_layer_8": 0.0545654296875, "loss_aux_layer_9": 0.05340576171875, "step": 5085, "total_loss": 0.6022654473781586 }, { "epoch": 1.0069293209265493, "grad_norm": 0.7962554693222046, "learning_rate": 5e-05, "llm_loss": 0.46199625730514526, "loss": 2.1478, "loss_aux_layer_0": 0.010162353515625, "loss_aux_layer_1": 0.0277099609375, "loss_aux_layer_10": 0.05169677734375, "loss_aux_layer_11": 0.0552978515625, "loss_aux_layer_12": 0.059326171875, "loss_aux_layer_13": 0.064453125, "loss_aux_layer_14": 0.072265625, "loss_aux_layer_15": 0.08056640625, "loss_aux_layer_16": 0.08984375, "loss_aux_layer_17": 0.0977783203125, "loss_aux_layer_18": 0.1060791015625, "loss_aux_layer_19": 0.1102294921875, "loss_aux_layer_2": 0.03875732421875, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.1263427734375, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.04803466796875, "loss_aux_layer_4": 0.05035400390625, "loss_aux_layer_5": 0.0518798828125, "loss_aux_layer_6": 0.0543212890625, "loss_aux_layer_7": 0.052490234375, "loss_aux_layer_8": 0.05181884765625, "loss_aux_layer_9": 0.05059814453125, "step": 5086, "total_loss": 0.5369487851858139 }, { "epoch": 1.0071273015244506, "grad_norm": 0.9409555196762085, "learning_rate": 5e-05, "llm_loss": 0.5818741247057915, "loss": 2.6174, "loss_aux_layer_0": 0.0096435546875, "loss_aux_layer_1": 0.027130126953125, "loss_aux_layer_10": 0.05072021484375, "loss_aux_layer_11": 0.05438232421875, "loss_aux_layer_12": 0.05810546875, "loss_aux_layer_13": 0.06292724609375, "loss_aux_layer_14": 0.0704345703125, "loss_aux_layer_15": 0.077880859375, "loss_aux_layer_16": 0.08642578125, "loss_aux_layer_17": 0.093994140625, "loss_aux_layer_18": 0.1016845703125, "loss_aux_layer_19": 0.1051025390625, "loss_aux_layer_2": 0.037353515625, "loss_aux_layer_20": 0.113037109375, "loss_aux_layer_21": 0.1209716796875, "loss_aux_layer_22": 0.140869140625, "loss_aux_layer_23": 0.17578125, "loss_aux_layer_3": 0.046630859375, "loss_aux_layer_4": 0.04937744140625, "loss_aux_layer_5": 0.05084228515625, "loss_aux_layer_6": 0.05364990234375, "loss_aux_layer_7": 0.05181884765625, "loss_aux_layer_8": 0.051025390625, "loss_aux_layer_9": 0.04962158203125, "step": 5087, "total_loss": 0.6543471664190292 }, { "epoch": 1.007325282122352, "grad_norm": 0.7828376889228821, "learning_rate": 5e-05, "llm_loss": 0.5451902225613594, "loss": 2.475, "loss_aux_layer_0": 0.0106201171875, "loss_aux_layer_1": 0.027984619140625, "loss_aux_layer_10": 0.05133056640625, "loss_aux_layer_11": 0.05517578125, "loss_aux_layer_12": 0.0592041015625, "loss_aux_layer_13": 0.0640869140625, "loss_aux_layer_14": 0.07177734375, "loss_aux_layer_15": 0.0797119140625, "loss_aux_layer_16": 0.0880126953125, "loss_aux_layer_17": 0.0955810546875, "loss_aux_layer_18": 0.1031494140625, "loss_aux_layer_19": 0.1064453125, "loss_aux_layer_2": 0.03851318359375, "loss_aux_layer_20": 0.114013671875, "loss_aux_layer_21": 0.1221923828125, "loss_aux_layer_22": 0.141845703125, "loss_aux_layer_23": 0.1767578125, "loss_aux_layer_3": 0.04815673828125, "loss_aux_layer_4": 0.0504150390625, "loss_aux_layer_5": 0.051513671875, "loss_aux_layer_6": 0.053955078125, "loss_aux_layer_7": 0.05218505859375, "loss_aux_layer_8": 0.0516357421875, "loss_aux_layer_9": 0.05029296875, "step": 5088, "total_loss": 0.6187385320663452 }, { "epoch": 1.0075232627202535, "grad_norm": 0.9153685569763184, "learning_rate": 5e-05, "llm_loss": 0.5514184832572937, "loss": 2.4935, "loss_aux_layer_0": 0.0099029541015625, "loss_aux_layer_1": 0.025115966796875, "loss_aux_layer_10": 0.0489501953125, "loss_aux_layer_11": 0.052490234375, "loss_aux_layer_12": 0.05645751953125, "loss_aux_layer_13": 0.0618896484375, "loss_aux_layer_14": 0.0699462890625, "loss_aux_layer_15": 0.078125, "loss_aux_layer_16": 0.0872802734375, "loss_aux_layer_17": 0.0950927734375, "loss_aux_layer_18": 0.1038818359375, "loss_aux_layer_19": 0.107666015625, "loss_aux_layer_2": 0.03533935546875, "loss_aux_layer_20": 0.115966796875, "loss_aux_layer_21": 0.1240234375, "loss_aux_layer_22": 0.14306640625, "loss_aux_layer_23": 0.17822265625, "loss_aux_layer_3": 0.044189453125, "loss_aux_layer_4": 0.04644775390625, "loss_aux_layer_5": 0.04779052734375, "loss_aux_layer_6": 0.05029296875, "loss_aux_layer_7": 0.04864501953125, "loss_aux_layer_8": 0.04840087890625, "loss_aux_layer_9": 0.047607421875, "step": 5089, "total_loss": 0.6233674734830856 }, { "epoch": 1.0077212433181548, "grad_norm": 0.8126181960105896, "learning_rate": 5e-05, "llm_loss": 0.5436641573905945, "loss": 2.4823, "loss_aux_layer_0": 0.0096282958984375, "loss_aux_layer_1": 0.02728271484375, "loss_aux_layer_10": 0.0531005859375, "loss_aux_layer_11": 0.0567626953125, "loss_aux_layer_12": 0.06109619140625, "loss_aux_layer_13": 0.06671142578125, "loss_aux_layer_14": 0.0753173828125, "loss_aux_layer_15": 0.083740234375, "loss_aux_layer_16": 0.09326171875, "loss_aux_layer_17": 0.101318359375, "loss_aux_layer_18": 0.110107421875, "loss_aux_layer_19": 0.1138916015625, "loss_aux_layer_2": 0.03900146484375, "loss_aux_layer_20": 0.1224365234375, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.1884765625, "loss_aux_layer_3": 0.04827880859375, "loss_aux_layer_4": 0.050537109375, "loss_aux_layer_5": 0.05206298828125, "loss_aux_layer_6": 0.05487060546875, "loss_aux_layer_7": 0.05316162109375, "loss_aux_layer_8": 0.05267333984375, "loss_aux_layer_9": 0.0517578125, "step": 5090, "total_loss": 0.6205809414386749 }, { "epoch": 1.0079192239160562, "grad_norm": 0.8531185388565063, "learning_rate": 5e-05, "llm_loss": 0.46994275599718094, "loss": 2.1783, "loss_aux_layer_0": 0.010040283203125, "loss_aux_layer_1": 0.02740478515625, "loss_aux_layer_10": 0.0518798828125, "loss_aux_layer_11": 0.05560302734375, "loss_aux_layer_12": 0.0596923828125, "loss_aux_layer_13": 0.06463623046875, "loss_aux_layer_14": 0.0726318359375, "loss_aux_layer_15": 0.0809326171875, "loss_aux_layer_16": 0.0899658203125, "loss_aux_layer_17": 0.0977783203125, "loss_aux_layer_18": 0.1055908203125, "loss_aux_layer_19": 0.10888671875, "loss_aux_layer_2": 0.03900146484375, "loss_aux_layer_20": 0.11669921875, "loss_aux_layer_21": 0.124755859375, "loss_aux_layer_22": 0.14501953125, "loss_aux_layer_23": 0.180908203125, "loss_aux_layer_3": 0.04827880859375, "loss_aux_layer_4": 0.05047607421875, "loss_aux_layer_5": 0.0516357421875, "loss_aux_layer_6": 0.05426025390625, "loss_aux_layer_7": 0.05242919921875, "loss_aux_layer_8": 0.0517578125, "loss_aux_layer_9": 0.05072021484375, "step": 5091, "total_loss": 0.5445663332939148 }, { "epoch": 1.0081172045139577, "grad_norm": 0.9555481672286987, "learning_rate": 5e-05, "llm_loss": 0.5919336825609207, "loss": 2.6679, "loss_aux_layer_0": 0.009613037109375, "loss_aux_layer_1": 0.02825927734375, "loss_aux_layer_10": 0.05303955078125, "loss_aux_layer_11": 0.05670166015625, "loss_aux_layer_12": 0.060791015625, "loss_aux_layer_13": 0.065673828125, "loss_aux_layer_14": 0.073486328125, "loss_aux_layer_15": 0.08154296875, "loss_aux_layer_16": 0.0902099609375, "loss_aux_layer_17": 0.0970458984375, "loss_aux_layer_18": 0.10498046875, "loss_aux_layer_19": 0.108642578125, "loss_aux_layer_2": 0.039306640625, "loss_aux_layer_20": 0.116455078125, "loss_aux_layer_21": 0.1241455078125, "loss_aux_layer_22": 0.14453125, "loss_aux_layer_23": 0.1796875, "loss_aux_layer_3": 0.049072265625, "loss_aux_layer_4": 0.05133056640625, "loss_aux_layer_5": 0.05279541015625, "loss_aux_layer_6": 0.05548095703125, "loss_aux_layer_7": 0.05377197265625, "loss_aux_layer_8": 0.0528564453125, "loss_aux_layer_9": 0.0517578125, "step": 5092, "total_loss": 0.6669788807630539 }, { "epoch": 1.008315185111859, "grad_norm": 0.8940584659576416, "learning_rate": 5e-05, "llm_loss": 0.5610148310661316, "loss": 2.5359, "loss_aux_layer_0": 0.0099029541015625, "loss_aux_layer_1": 0.025604248046875, "loss_aux_layer_10": 0.050048828125, "loss_aux_layer_11": 0.0533447265625, "loss_aux_layer_12": 0.0574951171875, "loss_aux_layer_13": 0.06231689453125, "loss_aux_layer_14": 0.0701904296875, "loss_aux_layer_15": 0.078857421875, "loss_aux_layer_16": 0.0882568359375, "loss_aux_layer_17": 0.0963134765625, "loss_aux_layer_18": 0.10400390625, "loss_aux_layer_19": 0.1080322265625, "loss_aux_layer_2": 0.0362548828125, "loss_aux_layer_20": 0.1163330078125, "loss_aux_layer_21": 0.12451171875, "loss_aux_layer_22": 0.144775390625, "loss_aux_layer_23": 0.1806640625, "loss_aux_layer_3": 0.04541015625, "loss_aux_layer_4": 0.04779052734375, "loss_aux_layer_5": 0.04925537109375, "loss_aux_layer_6": 0.05206298828125, "loss_aux_layer_7": 0.05047607421875, "loss_aux_layer_8": 0.04998779296875, "loss_aux_layer_9": 0.04901123046875, "step": 5093, "total_loss": 0.6339809894561768 }, { "epoch": 1.0085131657097604, "grad_norm": 0.8968465328216553, "learning_rate": 5e-05, "llm_loss": 0.4872811585664749, "loss": 2.2561, "loss_aux_layer_0": 0.010101318359375, "loss_aux_layer_1": 0.027984619140625, "loss_aux_layer_10": 0.05413818359375, "loss_aux_layer_11": 0.057861328125, "loss_aux_layer_12": 0.06207275390625, "loss_aux_layer_13": 0.06719970703125, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.091796875, "loss_aux_layer_17": 0.099609375, "loss_aux_layer_18": 0.1072998046875, "loss_aux_layer_19": 0.11083984375, "loss_aux_layer_2": 0.03955078125, "loss_aux_layer_20": 0.118896484375, "loss_aux_layer_21": 0.127685546875, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.186279296875, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.05181884765625, "loss_aux_layer_5": 0.0533447265625, "loss_aux_layer_6": 0.0560302734375, "loss_aux_layer_7": 0.054443359375, "loss_aux_layer_8": 0.05413818359375, "loss_aux_layer_9": 0.0531005859375, "step": 5094, "total_loss": 0.5640164986252785 }, { "epoch": 1.008711146307662, "grad_norm": 0.9348371028900146, "learning_rate": 5e-05, "llm_loss": 0.5722158700227737, "loss": 2.594, "loss_aux_layer_0": 0.010467529296875, "loss_aux_layer_1": 0.028106689453125, "loss_aux_layer_10": 0.0540771484375, "loss_aux_layer_11": 0.05792236328125, "loss_aux_layer_12": 0.061767578125, "loss_aux_layer_13": 0.06671142578125, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.0916748046875, "loss_aux_layer_17": 0.099365234375, "loss_aux_layer_18": 0.106689453125, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.039794921875, "loss_aux_layer_20": 0.117431640625, "loss_aux_layer_21": 0.125732421875, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.04949951171875, "loss_aux_layer_4": 0.05218505859375, "loss_aux_layer_5": 0.05364990234375, "loss_aux_layer_6": 0.056396484375, "loss_aux_layer_7": 0.05462646484375, "loss_aux_layer_8": 0.0538330078125, "loss_aux_layer_9": 0.05279541015625, "step": 5095, "total_loss": 0.6485050320625305 }, { "epoch": 1.0089091269055632, "grad_norm": 0.9371212124824524, "learning_rate": 5e-05, "llm_loss": 0.5225399434566498, "loss": 2.3956, "loss_aux_layer_0": 0.01068115234375, "loss_aux_layer_1": 0.02850341796875, "loss_aux_layer_10": 0.05438232421875, "loss_aux_layer_11": 0.05816650390625, "loss_aux_layer_12": 0.0621337890625, "loss_aux_layer_13": 0.06719970703125, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.0823974609375, "loss_aux_layer_16": 0.09130859375, "loss_aux_layer_17": 0.0985107421875, "loss_aux_layer_18": 0.1058349609375, "loss_aux_layer_19": 0.109619140625, "loss_aux_layer_2": 0.04046630859375, "loss_aux_layer_20": 0.1175537109375, "loss_aux_layer_21": 0.1258544921875, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.04986572265625, "loss_aux_layer_4": 0.05230712890625, "loss_aux_layer_5": 0.053466796875, "loss_aux_layer_6": 0.05621337890625, "loss_aux_layer_7": 0.05462646484375, "loss_aux_layer_8": 0.05413818359375, "loss_aux_layer_9": 0.0531005859375, "step": 5096, "total_loss": 0.5989057272672653 }, { "epoch": 1.0091071075034646, "grad_norm": 0.9188143014907837, "learning_rate": 5e-05, "llm_loss": 0.5528904348611832, "loss": 2.5194, "loss_aux_layer_0": 0.0101165771484375, "loss_aux_layer_1": 0.02813720703125, "loss_aux_layer_10": 0.0545654296875, "loss_aux_layer_11": 0.05828857421875, "loss_aux_layer_12": 0.06268310546875, "loss_aux_layer_13": 0.06793212890625, "loss_aux_layer_14": 0.076171875, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.09423828125, "loss_aux_layer_17": 0.101806640625, "loss_aux_layer_18": 0.109619140625, "loss_aux_layer_19": 0.11279296875, "loss_aux_layer_2": 0.0394287109375, "loss_aux_layer_20": 0.1202392578125, "loss_aux_layer_21": 0.127197265625, "loss_aux_layer_22": 0.14599609375, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.05181884765625, "loss_aux_layer_5": 0.05328369140625, "loss_aux_layer_6": 0.05657958984375, "loss_aux_layer_7": 0.05487060546875, "loss_aux_layer_8": 0.05426025390625, "loss_aux_layer_9": 0.05322265625, "step": 5097, "total_loss": 0.6298572421073914 }, { "epoch": 1.009305088101366, "grad_norm": 0.8834199905395508, "learning_rate": 5e-05, "llm_loss": 0.4935246706008911, "loss": 2.2703, "loss_aux_layer_0": 0.0111541748046875, "loss_aux_layer_1": 0.02691650390625, "loss_aux_layer_10": 0.051025390625, "loss_aux_layer_11": 0.0548095703125, "loss_aux_layer_12": 0.05908203125, "loss_aux_layer_13": 0.064208984375, "loss_aux_layer_14": 0.0721435546875, "loss_aux_layer_15": 0.080322265625, "loss_aux_layer_16": 0.08935546875, "loss_aux_layer_17": 0.0968017578125, "loss_aux_layer_18": 0.104736328125, "loss_aux_layer_19": 0.1090087890625, "loss_aux_layer_2": 0.036865234375, "loss_aux_layer_20": 0.1165771484375, "loss_aux_layer_21": 0.1253662109375, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.18359375, "loss_aux_layer_3": 0.04608154296875, "loss_aux_layer_4": 0.0484619140625, "loss_aux_layer_5": 0.0499267578125, "loss_aux_layer_6": 0.052734375, "loss_aux_layer_7": 0.05108642578125, "loss_aux_layer_8": 0.05072021484375, "loss_aux_layer_9": 0.0496826171875, "step": 5098, "total_loss": 0.5675668939948082 }, { "epoch": 1.0095030686992674, "grad_norm": 1.0392228364944458, "learning_rate": 5e-05, "llm_loss": 0.5390600264072418, "loss": 2.4477, "loss_aux_layer_0": 0.00958251953125, "loss_aux_layer_1": 0.0263671875, "loss_aux_layer_10": 0.051025390625, "loss_aux_layer_11": 0.0545654296875, "loss_aux_layer_12": 0.058349609375, "loss_aux_layer_13": 0.0634765625, "loss_aux_layer_14": 0.071533203125, "loss_aux_layer_15": 0.0794677734375, "loss_aux_layer_16": 0.08837890625, "loss_aux_layer_17": 0.0958251953125, "loss_aux_layer_18": 0.1029052734375, "loss_aux_layer_19": 0.106201171875, "loss_aux_layer_2": 0.037353515625, "loss_aux_layer_20": 0.11376953125, "loss_aux_layer_21": 0.1219482421875, "loss_aux_layer_22": 0.1416015625, "loss_aux_layer_23": 0.177001953125, "loss_aux_layer_3": 0.04632568359375, "loss_aux_layer_4": 0.048583984375, "loss_aux_layer_5": 0.05010986328125, "loss_aux_layer_6": 0.05279541015625, "loss_aux_layer_7": 0.051025390625, "loss_aux_layer_8": 0.0506591796875, "loss_aux_layer_9": 0.0498046875, "step": 5099, "total_loss": 0.6119196265935898 }, { "epoch": 1.009701049297169, "grad_norm": 0.917820394039154, "learning_rate": 5e-05, "llm_loss": 0.5220386832952499, "loss": 2.3859, "loss_aux_layer_0": 0.011962890625, "loss_aux_layer_1": 0.027130126953125, "loss_aux_layer_10": 0.05230712890625, "loss_aux_layer_11": 0.055908203125, "loss_aux_layer_12": 0.0599365234375, "loss_aux_layer_13": 0.0648193359375, "loss_aux_layer_14": 0.0726318359375, "loss_aux_layer_15": 0.080322265625, "loss_aux_layer_16": 0.089111328125, "loss_aux_layer_17": 0.0965576171875, "loss_aux_layer_18": 0.104248046875, "loss_aux_layer_19": 0.10791015625, "loss_aux_layer_2": 0.0377197265625, "loss_aux_layer_20": 0.11572265625, "loss_aux_layer_21": 0.1243896484375, "loss_aux_layer_22": 0.145263671875, "loss_aux_layer_23": 0.181640625, "loss_aux_layer_3": 0.047119140625, "loss_aux_layer_4": 0.0498046875, "loss_aux_layer_5": 0.05133056640625, "loss_aux_layer_6": 0.0543212890625, "loss_aux_layer_7": 0.0528564453125, "loss_aux_layer_8": 0.05230712890625, "loss_aux_layer_9": 0.0511474609375, "step": 5100, "total_loss": 0.5964775085449219 }, { "epoch": 1.0098990298950703, "grad_norm": 0.8380401134490967, "learning_rate": 5e-05, "llm_loss": 0.5662216618657112, "loss": 2.5729, "loss_aux_layer_0": 0.008941650390625, "loss_aux_layer_1": 0.028350830078125, "loss_aux_layer_10": 0.05523681640625, "loss_aux_layer_11": 0.05902099609375, "loss_aux_layer_12": 0.0634765625, "loss_aux_layer_13": 0.06866455078125, "loss_aux_layer_14": 0.0762939453125, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.0999755859375, "loss_aux_layer_18": 0.1072998046875, "loss_aux_layer_19": 0.1103515625, "loss_aux_layer_2": 0.03985595703125, "loss_aux_layer_20": 0.1180419921875, "loss_aux_layer_21": 0.1270751953125, "loss_aux_layer_22": 0.14794921875, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.0496826171875, "loss_aux_layer_4": 0.0523681640625, "loss_aux_layer_5": 0.0540771484375, "loss_aux_layer_6": 0.0567626953125, "loss_aux_layer_7": 0.055419921875, "loss_aux_layer_8": 0.054931640625, "loss_aux_layer_9": 0.05377197265625, "step": 5101, "total_loss": 0.6432188749313354 }, { "epoch": 1.0100970104929716, "grad_norm": 1.037114143371582, "learning_rate": 5e-05, "llm_loss": 0.60939821600914, "loss": 2.7307, "loss_aux_layer_0": 0.0108184814453125, "loss_aux_layer_1": 0.026763916015625, "loss_aux_layer_10": 0.0511474609375, "loss_aux_layer_11": 0.0545654296875, "loss_aux_layer_12": 0.05853271484375, "loss_aux_layer_13": 0.06353759765625, "loss_aux_layer_14": 0.071044921875, "loss_aux_layer_15": 0.0789794921875, "loss_aux_layer_16": 0.0877685546875, "loss_aux_layer_17": 0.094970703125, "loss_aux_layer_18": 0.103515625, "loss_aux_layer_19": 0.1072998046875, "loss_aux_layer_2": 0.03753662109375, "loss_aux_layer_20": 0.114990234375, "loss_aux_layer_21": 0.1236572265625, "loss_aux_layer_22": 0.142578125, "loss_aux_layer_23": 0.17822265625, "loss_aux_layer_3": 0.04681396484375, "loss_aux_layer_4": 0.04913330078125, "loss_aux_layer_5": 0.05047607421875, "loss_aux_layer_6": 0.05316162109375, "loss_aux_layer_7": 0.051513671875, "loss_aux_layer_8": 0.05120849609375, "loss_aux_layer_9": 0.050048828125, "step": 5102, "total_loss": 0.6826648265123367 }, { "epoch": 1.0102949910908732, "grad_norm": 0.7240302562713623, "learning_rate": 5e-05, "llm_loss": 0.4912284389138222, "loss": 2.2623, "loss_aux_layer_0": 0.0086822509765625, "loss_aux_layer_1": 0.027252197265625, "loss_aux_layer_10": 0.05291748046875, "loss_aux_layer_11": 0.05645751953125, "loss_aux_layer_12": 0.060546875, "loss_aux_layer_13": 0.0653076171875, "loss_aux_layer_14": 0.0731201171875, "loss_aux_layer_15": 0.0810546875, "loss_aux_layer_16": 0.090087890625, "loss_aux_layer_17": 0.097412109375, "loss_aux_layer_18": 0.1053466796875, "loss_aux_layer_19": 0.1080322265625, "loss_aux_layer_2": 0.03875732421875, "loss_aux_layer_20": 0.115234375, "loss_aux_layer_21": 0.1221923828125, "loss_aux_layer_22": 0.141845703125, "loss_aux_layer_23": 0.17626953125, "loss_aux_layer_3": 0.04833984375, "loss_aux_layer_4": 0.0506591796875, "loss_aux_layer_5": 0.0521240234375, "loss_aux_layer_6": 0.0546875, "loss_aux_layer_7": 0.0531005859375, "loss_aux_layer_8": 0.05267333984375, "loss_aux_layer_9": 0.05181884765625, "step": 5103, "total_loss": 0.5655825734138489 }, { "epoch": 1.0104929716887745, "grad_norm": 0.8554826378822327, "learning_rate": 5e-05, "llm_loss": 0.4806954860687256, "loss": 2.2213, "loss_aux_layer_0": 0.0099029541015625, "loss_aux_layer_1": 0.027557373046875, "loss_aux_layer_10": 0.05224609375, "loss_aux_layer_11": 0.0557861328125, "loss_aux_layer_12": 0.05975341796875, "loss_aux_layer_13": 0.06451416015625, "loss_aux_layer_14": 0.072509765625, "loss_aux_layer_15": 0.0802001953125, "loss_aux_layer_16": 0.0889892578125, "loss_aux_layer_17": 0.0968017578125, "loss_aux_layer_18": 0.1051025390625, "loss_aux_layer_19": 0.1085205078125, "loss_aux_layer_2": 0.0389404296875, "loss_aux_layer_20": 0.1163330078125, "loss_aux_layer_21": 0.1241455078125, "loss_aux_layer_22": 0.144775390625, "loss_aux_layer_23": 0.181884765625, "loss_aux_layer_3": 0.0484619140625, "loss_aux_layer_4": 0.05078125, "loss_aux_layer_5": 0.05206298828125, "loss_aux_layer_6": 0.05487060546875, "loss_aux_layer_7": 0.05316162109375, "loss_aux_layer_8": 0.0526123046875, "loss_aux_layer_9": 0.0513916015625, "step": 5104, "total_loss": 0.555335208773613 }, { "epoch": 1.0106909522866758, "grad_norm": 0.8114809989929199, "learning_rate": 5e-05, "llm_loss": 0.5375895798206329, "loss": 2.4425, "loss_aux_layer_0": 0.0096588134765625, "loss_aux_layer_1": 0.02655029296875, "loss_aux_layer_10": 0.05096435546875, "loss_aux_layer_11": 0.05462646484375, "loss_aux_layer_12": 0.0587158203125, "loss_aux_layer_13": 0.0634765625, "loss_aux_layer_14": 0.0712890625, "loss_aux_layer_15": 0.0789794921875, "loss_aux_layer_16": 0.0877685546875, "loss_aux_layer_17": 0.094970703125, "loss_aux_layer_18": 0.102783203125, "loss_aux_layer_19": 0.106201171875, "loss_aux_layer_2": 0.0372314453125, "loss_aux_layer_20": 0.1141357421875, "loss_aux_layer_21": 0.12255859375, "loss_aux_layer_22": 0.142822265625, "loss_aux_layer_23": 0.178466796875, "loss_aux_layer_3": 0.04681396484375, "loss_aux_layer_4": 0.049072265625, "loss_aux_layer_5": 0.05059814453125, "loss_aux_layer_6": 0.05316162109375, "loss_aux_layer_7": 0.05157470703125, "loss_aux_layer_8": 0.05078125, "loss_aux_layer_9": 0.04974365234375, "step": 5105, "total_loss": 0.6106287688016891 }, { "epoch": 1.0108889328845774, "grad_norm": 0.9755324721336365, "learning_rate": 5e-05, "llm_loss": 0.5128337070345879, "loss": 2.3327, "loss_aux_layer_0": 0.010772705078125, "loss_aux_layer_1": 0.0260009765625, "loss_aux_layer_10": 0.048583984375, "loss_aux_layer_11": 0.05181884765625, "loss_aux_layer_12": 0.05584716796875, "loss_aux_layer_13": 0.06060791015625, "loss_aux_layer_14": 0.0682373046875, "loss_aux_layer_15": 0.075927734375, "loss_aux_layer_16": 0.0845947265625, "loss_aux_layer_17": 0.0921630859375, "loss_aux_layer_18": 0.0997314453125, "loss_aux_layer_19": 0.1038818359375, "loss_aux_layer_2": 0.03582763671875, "loss_aux_layer_20": 0.11181640625, "loss_aux_layer_21": 0.1190185546875, "loss_aux_layer_22": 0.137451171875, "loss_aux_layer_23": 0.172119140625, "loss_aux_layer_3": 0.04443359375, "loss_aux_layer_4": 0.04656982421875, "loss_aux_layer_5": 0.04803466796875, "loss_aux_layer_6": 0.05023193359375, "loss_aux_layer_7": 0.048828125, "loss_aux_layer_8": 0.04833984375, "loss_aux_layer_9": 0.047607421875, "step": 5106, "total_loss": 0.5831698179244995 }, { "epoch": 1.0110869134824787, "grad_norm": 0.8909436464309692, "learning_rate": 5e-05, "llm_loss": 0.5676493048667908, "loss": 2.5697, "loss_aux_layer_0": 0.0091552734375, "loss_aux_layer_1": 0.027252197265625, "loss_aux_layer_10": 0.0528564453125, "loss_aux_layer_11": 0.05657958984375, "loss_aux_layer_12": 0.06072998046875, "loss_aux_layer_13": 0.066162109375, "loss_aux_layer_14": 0.0743408203125, "loss_aux_layer_15": 0.08251953125, "loss_aux_layer_16": 0.0911865234375, "loss_aux_layer_17": 0.098876953125, "loss_aux_layer_18": 0.1065673828125, "loss_aux_layer_19": 0.1097412109375, "loss_aux_layer_2": 0.0382080078125, "loss_aux_layer_20": 0.1168212890625, "loss_aux_layer_21": 0.123779296875, "loss_aux_layer_22": 0.142822265625, "loss_aux_layer_23": 0.177001953125, "loss_aux_layer_3": 0.047607421875, "loss_aux_layer_4": 0.0501708984375, "loss_aux_layer_5": 0.05157470703125, "loss_aux_layer_6": 0.05438232421875, "loss_aux_layer_7": 0.052734375, "loss_aux_layer_8": 0.05230712890625, "loss_aux_layer_9": 0.05145263671875, "step": 5107, "total_loss": 0.6424205601215363 }, { "epoch": 1.01128489408038, "grad_norm": 1.0984828472137451, "learning_rate": 5e-05, "llm_loss": 0.6062511652708054, "loss": 2.7243, "loss_aux_layer_0": 0.013275146484375, "loss_aux_layer_1": 0.0272216796875, "loss_aux_layer_10": 0.0517578125, "loss_aux_layer_11": 0.05517578125, "loss_aux_layer_12": 0.05938720703125, "loss_aux_layer_13": 0.06475830078125, "loss_aux_layer_14": 0.0733642578125, "loss_aux_layer_15": 0.08154296875, "loss_aux_layer_16": 0.091064453125, "loss_aux_layer_17": 0.0987548828125, "loss_aux_layer_18": 0.106689453125, "loss_aux_layer_19": 0.1104736328125, "loss_aux_layer_2": 0.03765869140625, "loss_aux_layer_20": 0.1182861328125, "loss_aux_layer_21": 0.1263427734375, "loss_aux_layer_22": 0.14599609375, "loss_aux_layer_23": 0.181396484375, "loss_aux_layer_3": 0.04656982421875, "loss_aux_layer_4": 0.049072265625, "loss_aux_layer_5": 0.05059814453125, "loss_aux_layer_6": 0.0533447265625, "loss_aux_layer_7": 0.0516357421875, "loss_aux_layer_8": 0.05120849609375, "loss_aux_layer_9": 0.0504150390625, "step": 5108, "total_loss": 0.681085616350174 }, { "epoch": 1.0114828746782816, "grad_norm": 0.8313411474227905, "learning_rate": 5e-05, "llm_loss": 0.5133720189332962, "loss": 2.3514, "loss_aux_layer_0": 0.0088958740234375, "loss_aux_layer_1": 0.02716064453125, "loss_aux_layer_10": 0.05194091796875, "loss_aux_layer_11": 0.05548095703125, "loss_aux_layer_12": 0.059326171875, "loss_aux_layer_13": 0.064208984375, "loss_aux_layer_14": 0.0721435546875, "loss_aux_layer_15": 0.0804443359375, "loss_aux_layer_16": 0.0894775390625, "loss_aux_layer_17": 0.09716796875, "loss_aux_layer_18": 0.105224609375, "loss_aux_layer_19": 0.109375, "loss_aux_layer_2": 0.0382080078125, "loss_aux_layer_20": 0.1171875, "loss_aux_layer_21": 0.12548828125, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.181640625, "loss_aux_layer_3": 0.0474853515625, "loss_aux_layer_4": 0.050048828125, "loss_aux_layer_5": 0.051513671875, "loss_aux_layer_6": 0.0540771484375, "loss_aux_layer_7": 0.0523681640625, "loss_aux_layer_8": 0.0518798828125, "loss_aux_layer_9": 0.05078125, "step": 5109, "total_loss": 0.5878611505031586 }, { "epoch": 1.011680855276183, "grad_norm": 1.2313001155853271, "learning_rate": 5e-05, "llm_loss": 0.5149425119161606, "loss": 2.3573, "loss_aux_layer_0": 0.0128631591796875, "loss_aux_layer_1": 0.02777099609375, "loss_aux_layer_10": 0.05157470703125, "loss_aux_layer_11": 0.05499267578125, "loss_aux_layer_12": 0.0589599609375, "loss_aux_layer_13": 0.0640869140625, "loss_aux_layer_14": 0.072021484375, "loss_aux_layer_15": 0.080078125, "loss_aux_layer_16": 0.0889892578125, "loss_aux_layer_17": 0.0965576171875, "loss_aux_layer_18": 0.1048583984375, "loss_aux_layer_19": 0.1092529296875, "loss_aux_layer_2": 0.0384521484375, "loss_aux_layer_20": 0.116943359375, "loss_aux_layer_21": 0.125732421875, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.18115234375, "loss_aux_layer_3": 0.0474853515625, "loss_aux_layer_4": 0.0498046875, "loss_aux_layer_5": 0.051025390625, "loss_aux_layer_6": 0.05352783203125, "loss_aux_layer_7": 0.052001953125, "loss_aux_layer_8": 0.05145263671875, "loss_aux_layer_9": 0.0504150390625, "step": 5110, "total_loss": 0.5893349349498749 }, { "epoch": 1.0118788358740844, "grad_norm": 0.983103334903717, "learning_rate": 5e-05, "llm_loss": 0.49954158812761307, "loss": 2.3064, "loss_aux_layer_0": 0.009033203125, "loss_aux_layer_1": 0.0272216796875, "loss_aux_layer_10": 0.05474853515625, "loss_aux_layer_11": 0.058837890625, "loss_aux_layer_12": 0.063232421875, "loss_aux_layer_13": 0.068603515625, "loss_aux_layer_14": 0.07666015625, "loss_aux_layer_15": 0.0850830078125, "loss_aux_layer_16": 0.0936279296875, "loss_aux_layer_17": 0.1009521484375, "loss_aux_layer_18": 0.10888671875, "loss_aux_layer_19": 0.111572265625, "loss_aux_layer_2": 0.0394287109375, "loss_aux_layer_20": 0.118896484375, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.05181884765625, "loss_aux_layer_5": 0.0533447265625, "loss_aux_layer_6": 0.05615234375, "loss_aux_layer_7": 0.05487060546875, "loss_aux_layer_8": 0.0543212890625, "loss_aux_layer_9": 0.05352783203125, "step": 5111, "total_loss": 0.5766006857156754 }, { "epoch": 1.0120768164719858, "grad_norm": 1.4163016080856323, "learning_rate": 5e-05, "llm_loss": 0.5512684285640717, "loss": 2.5088, "loss_aux_layer_0": 0.0137481689453125, "loss_aux_layer_1": 0.0279541015625, "loss_aux_layer_10": 0.052490234375, "loss_aux_layer_11": 0.05633544921875, "loss_aux_layer_12": 0.06072998046875, "loss_aux_layer_13": 0.06640625, "loss_aux_layer_14": 0.0745849609375, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.092041015625, "loss_aux_layer_17": 0.1002197265625, "loss_aux_layer_18": 0.1077880859375, "loss_aux_layer_19": 0.1112060546875, "loss_aux_layer_2": 0.03936767578125, "loss_aux_layer_20": 0.1187744140625, "loss_aux_layer_21": 0.1265869140625, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.048583984375, "loss_aux_layer_4": 0.05072021484375, "loss_aux_layer_5": 0.0517578125, "loss_aux_layer_6": 0.0545654296875, "loss_aux_layer_7": 0.05279541015625, "loss_aux_layer_8": 0.05230712890625, "loss_aux_layer_9": 0.05108642578125, "step": 5112, "total_loss": 0.6272035539150238 }, { "epoch": 1.012274797069887, "grad_norm": 0.945888102054596, "learning_rate": 5e-05, "llm_loss": 0.543192058801651, "loss": 2.459, "loss_aux_layer_0": 0.0098114013671875, "loss_aux_layer_1": 0.02667236328125, "loss_aux_layer_10": 0.0498046875, "loss_aux_layer_11": 0.05316162109375, "loss_aux_layer_12": 0.05694580078125, "loss_aux_layer_13": 0.06146240234375, "loss_aux_layer_14": 0.0689697265625, "loss_aux_layer_15": 0.0767822265625, "loss_aux_layer_16": 0.0855712890625, "loss_aux_layer_17": 0.0928955078125, "loss_aux_layer_18": 0.1005859375, "loss_aux_layer_19": 0.1043701171875, "loss_aux_layer_2": 0.037841796875, "loss_aux_layer_20": 0.1123046875, "loss_aux_layer_21": 0.1197509765625, "loss_aux_layer_22": 0.139404296875, "loss_aux_layer_23": 0.1748046875, "loss_aux_layer_3": 0.04656982421875, "loss_aux_layer_4": 0.0487060546875, "loss_aux_layer_5": 0.0498046875, "loss_aux_layer_6": 0.05230712890625, "loss_aux_layer_7": 0.050537109375, "loss_aux_layer_8": 0.0498046875, "loss_aux_layer_9": 0.04864501953125, "step": 5113, "total_loss": 0.6147592067718506 }, { "epoch": 1.0124727776677886, "grad_norm": 1.1481966972351074, "learning_rate": 5e-05, "llm_loss": 0.5255169197916985, "loss": 2.4122, "loss_aux_layer_0": 0.0134429931640625, "loss_aux_layer_1": 0.028472900390625, "loss_aux_layer_10": 0.0528564453125, "loss_aux_layer_11": 0.05657958984375, "loss_aux_layer_12": 0.06060791015625, "loss_aux_layer_13": 0.06591796875, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.0843505859375, "loss_aux_layer_16": 0.09423828125, "loss_aux_layer_17": 0.10205078125, "loss_aux_layer_18": 0.1102294921875, "loss_aux_layer_19": 0.1146240234375, "loss_aux_layer_2": 0.03985595703125, "loss_aux_layer_20": 0.122802734375, "loss_aux_layer_21": 0.131103515625, "loss_aux_layer_22": 0.152587890625, "loss_aux_layer_23": 0.190185546875, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.05084228515625, "loss_aux_layer_5": 0.05242919921875, "loss_aux_layer_6": 0.05517578125, "loss_aux_layer_7": 0.05328369140625, "loss_aux_layer_8": 0.0526123046875, "loss_aux_layer_9": 0.0518798828125, "step": 5114, "total_loss": 0.6030538231134415 }, { "epoch": 1.01267075826569, "grad_norm": 0.9579693078994751, "learning_rate": 5e-05, "llm_loss": 0.6287039965391159, "loss": 2.819, "loss_aux_layer_0": 0.0102691650390625, "loss_aux_layer_1": 0.027923583984375, "loss_aux_layer_10": 0.05364990234375, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.06109619140625, "loss_aux_layer_13": 0.06634521484375, "loss_aux_layer_14": 0.07470703125, "loss_aux_layer_15": 0.082763671875, "loss_aux_layer_16": 0.091796875, "loss_aux_layer_17": 0.099365234375, "loss_aux_layer_18": 0.107177734375, "loss_aux_layer_19": 0.111083984375, "loss_aux_layer_2": 0.039306640625, "loss_aux_layer_20": 0.1187744140625, "loss_aux_layer_21": 0.1263427734375, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.18212890625, "loss_aux_layer_3": 0.04888916015625, "loss_aux_layer_4": 0.05120849609375, "loss_aux_layer_5": 0.05291748046875, "loss_aux_layer_6": 0.05560302734375, "loss_aux_layer_7": 0.053955078125, "loss_aux_layer_8": 0.05340576171875, "loss_aux_layer_9": 0.0523681640625, "step": 5115, "total_loss": 0.7047390937805176 }, { "epoch": 1.0128687388635913, "grad_norm": 0.9292165637016296, "learning_rate": 5e-05, "llm_loss": 0.5429215505719185, "loss": 2.4686, "loss_aux_layer_0": 0.011932373046875, "loss_aux_layer_1": 0.02716064453125, "loss_aux_layer_10": 0.05181884765625, "loss_aux_layer_11": 0.05535888671875, "loss_aux_layer_12": 0.05950927734375, "loss_aux_layer_13": 0.06451416015625, "loss_aux_layer_14": 0.072509765625, "loss_aux_layer_15": 0.0804443359375, "loss_aux_layer_16": 0.0897216796875, "loss_aux_layer_17": 0.09765625, "loss_aux_layer_18": 0.10546875, "loss_aux_layer_19": 0.109130859375, "loss_aux_layer_2": 0.0377197265625, "loss_aux_layer_20": 0.116943359375, "loss_aux_layer_21": 0.1246337890625, "loss_aux_layer_22": 0.144775390625, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.046875, "loss_aux_layer_4": 0.04931640625, "loss_aux_layer_5": 0.05047607421875, "loss_aux_layer_6": 0.05303955078125, "loss_aux_layer_7": 0.051513671875, "loss_aux_layer_8": 0.05108642578125, "loss_aux_layer_9": 0.05035400390625, "step": 5116, "total_loss": 0.6171472370624542 }, { "epoch": 1.0130667194614928, "grad_norm": 1.0258287191390991, "learning_rate": 5e-05, "llm_loss": 0.5949332118034363, "loss": 2.6706, "loss_aux_layer_0": 0.0101165771484375, "loss_aux_layer_1": 0.026641845703125, "loss_aux_layer_10": 0.05029296875, "loss_aux_layer_11": 0.05389404296875, "loss_aux_layer_12": 0.05780029296875, "loss_aux_layer_13": 0.0625, "loss_aux_layer_14": 0.07000732421875, "loss_aux_layer_15": 0.078125, "loss_aux_layer_16": 0.08740234375, "loss_aux_layer_17": 0.0947265625, "loss_aux_layer_18": 0.102294921875, "loss_aux_layer_19": 0.10595703125, "loss_aux_layer_2": 0.03753662109375, "loss_aux_layer_20": 0.11376953125, "loss_aux_layer_21": 0.1220703125, "loss_aux_layer_22": 0.143310546875, "loss_aux_layer_23": 0.179443359375, "loss_aux_layer_3": 0.046630859375, "loss_aux_layer_4": 0.04901123046875, "loss_aux_layer_5": 0.05029296875, "loss_aux_layer_6": 0.052734375, "loss_aux_layer_7": 0.0509033203125, "loss_aux_layer_8": 0.05047607421875, "loss_aux_layer_9": 0.04931640625, "step": 5117, "total_loss": 0.6676474660634995 }, { "epoch": 1.0132647000593942, "grad_norm": 0.8922082781791687, "learning_rate": 5e-05, "llm_loss": 0.499014213681221, "loss": 2.2996, "loss_aux_layer_0": 0.0114593505859375, "loss_aux_layer_1": 0.02886962890625, "loss_aux_layer_10": 0.05389404296875, "loss_aux_layer_11": 0.0577392578125, "loss_aux_layer_12": 0.0618896484375, "loss_aux_layer_13": 0.0667724609375, "loss_aux_layer_14": 0.0743408203125, "loss_aux_layer_15": 0.0819091796875, "loss_aux_layer_16": 0.0902099609375, "loss_aux_layer_17": 0.09716796875, "loss_aux_layer_18": 0.104248046875, "loss_aux_layer_19": 0.107666015625, "loss_aux_layer_2": 0.0411376953125, "loss_aux_layer_20": 0.1153564453125, "loss_aux_layer_21": 0.125244140625, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.05047607421875, "loss_aux_layer_4": 0.0526123046875, "loss_aux_layer_5": 0.0537109375, "loss_aux_layer_6": 0.05609130859375, "loss_aux_layer_7": 0.05438232421875, "loss_aux_layer_8": 0.05377197265625, "loss_aux_layer_9": 0.05279541015625, "step": 5118, "total_loss": 0.5749044120311737 }, { "epoch": 1.0134626806572955, "grad_norm": 0.8155919909477234, "learning_rate": 5e-05, "llm_loss": 0.4855150058865547, "loss": 2.2391, "loss_aux_layer_0": 0.0101165771484375, "loss_aux_layer_1": 0.026580810546875, "loss_aux_layer_10": 0.05157470703125, "loss_aux_layer_11": 0.05517578125, "loss_aux_layer_12": 0.05938720703125, "loss_aux_layer_13": 0.0643310546875, "loss_aux_layer_14": 0.0721435546875, "loss_aux_layer_15": 0.080322265625, "loss_aux_layer_16": 0.0899658203125, "loss_aux_layer_17": 0.0975341796875, "loss_aux_layer_18": 0.10595703125, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.03753662109375, "loss_aux_layer_20": 0.11767578125, "loss_aux_layer_21": 0.125244140625, "loss_aux_layer_22": 0.14501953125, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.047119140625, "loss_aux_layer_4": 0.04925537109375, "loss_aux_layer_5": 0.0506591796875, "loss_aux_layer_6": 0.05322265625, "loss_aux_layer_7": 0.05181884765625, "loss_aux_layer_8": 0.05133056640625, "loss_aux_layer_9": 0.05035400390625, "step": 5119, "total_loss": 0.559778019785881 }, { "epoch": 1.013660661255197, "grad_norm": 0.887693464756012, "learning_rate": 5e-05, "llm_loss": 0.48533374816179276, "loss": 2.2468, "loss_aux_layer_0": 0.009796142578125, "loss_aux_layer_1": 0.028228759765625, "loss_aux_layer_10": 0.054443359375, "loss_aux_layer_11": 0.05810546875, "loss_aux_layer_12": 0.06231689453125, "loss_aux_layer_13": 0.0672607421875, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.082763671875, "loss_aux_layer_16": 0.0914306640625, "loss_aux_layer_17": 0.0987548828125, "loss_aux_layer_18": 0.1064453125, "loss_aux_layer_19": 0.109375, "loss_aux_layer_2": 0.0401611328125, "loss_aux_layer_20": 0.1171875, "loss_aux_layer_21": 0.125244140625, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.0501708984375, "loss_aux_layer_4": 0.05255126953125, "loss_aux_layer_5": 0.05401611328125, "loss_aux_layer_6": 0.05657958984375, "loss_aux_layer_7": 0.054931640625, "loss_aux_layer_8": 0.0543212890625, "loss_aux_layer_9": 0.0531005859375, "step": 5120, "total_loss": 0.5617059022188187 }, { "epoch": 1.0138586418530984, "grad_norm": 0.8192316889762878, "learning_rate": 5e-05, "llm_loss": 0.517056368291378, "loss": 2.3676, "loss_aux_layer_0": 0.01043701171875, "loss_aux_layer_1": 0.028076171875, "loss_aux_layer_10": 0.0528564453125, "loss_aux_layer_11": 0.05645751953125, "loss_aux_layer_12": 0.060546875, "loss_aux_layer_13": 0.06536865234375, "loss_aux_layer_14": 0.0731201171875, "loss_aux_layer_15": 0.0810546875, "loss_aux_layer_16": 0.08984375, "loss_aux_layer_17": 0.09716796875, "loss_aux_layer_18": 0.1048583984375, "loss_aux_layer_19": 0.10791015625, "loss_aux_layer_2": 0.03924560546875, "loss_aux_layer_20": 0.1156005859375, "loss_aux_layer_21": 0.1239013671875, "loss_aux_layer_22": 0.14453125, "loss_aux_layer_23": 0.18017578125, "loss_aux_layer_3": 0.04864501953125, "loss_aux_layer_4": 0.05108642578125, "loss_aux_layer_5": 0.052490234375, "loss_aux_layer_6": 0.05487060546875, "loss_aux_layer_7": 0.05322265625, "loss_aux_layer_8": 0.052734375, "loss_aux_layer_9": 0.05169677734375, "step": 5121, "total_loss": 0.5919105708599091 }, { "epoch": 1.0140566224509997, "grad_norm": 0.9516534209251404, "learning_rate": 5e-05, "llm_loss": 0.46615447849035263, "loss": 2.1491, "loss_aux_layer_0": 0.0098876953125, "loss_aux_layer_1": 0.025054931640625, "loss_aux_layer_10": 0.0479736328125, "loss_aux_layer_11": 0.0511474609375, "loss_aux_layer_12": 0.054931640625, "loss_aux_layer_13": 0.059814453125, "loss_aux_layer_14": 0.06787109375, "loss_aux_layer_15": 0.0760498046875, "loss_aux_layer_16": 0.0855712890625, "loss_aux_layer_17": 0.09326171875, "loss_aux_layer_18": 0.1019287109375, "loss_aux_layer_19": 0.10693359375, "loss_aux_layer_2": 0.0350341796875, "loss_aux_layer_20": 0.1151123046875, "loss_aux_layer_21": 0.1234130859375, "loss_aux_layer_22": 0.142578125, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.043701171875, "loss_aux_layer_4": 0.046142578125, "loss_aux_layer_5": 0.04730224609375, "loss_aux_layer_6": 0.0498046875, "loss_aux_layer_7": 0.04827880859375, "loss_aux_layer_8": 0.0477294921875, "loss_aux_layer_9": 0.0469970703125, "step": 5122, "total_loss": 0.5372636020183563 }, { "epoch": 1.0142546030489012, "grad_norm": 0.9425774216651917, "learning_rate": 5e-05, "llm_loss": 0.532031737267971, "loss": 2.4234, "loss_aux_layer_0": 0.0102386474609375, "loss_aux_layer_1": 0.027099609375, "loss_aux_layer_10": 0.05206298828125, "loss_aux_layer_11": 0.05584716796875, "loss_aux_layer_12": 0.05987548828125, "loss_aux_layer_13": 0.064697265625, "loss_aux_layer_14": 0.072509765625, "loss_aux_layer_15": 0.0802001953125, "loss_aux_layer_16": 0.089111328125, "loss_aux_layer_17": 0.0965576171875, "loss_aux_layer_18": 0.10400390625, "loss_aux_layer_19": 0.107421875, "loss_aux_layer_2": 0.03778076171875, "loss_aux_layer_20": 0.114990234375, "loss_aux_layer_21": 0.1224365234375, "loss_aux_layer_22": 0.14208984375, "loss_aux_layer_23": 0.177001953125, "loss_aux_layer_3": 0.04730224609375, "loss_aux_layer_4": 0.04998779296875, "loss_aux_layer_5": 0.05145263671875, "loss_aux_layer_6": 0.05413818359375, "loss_aux_layer_7": 0.0523681640625, "loss_aux_layer_8": 0.052001953125, "loss_aux_layer_9": 0.05096435546875, "step": 5123, "total_loss": 0.6058607548475266 }, { "epoch": 1.0144525836468026, "grad_norm": 0.8047144412994385, "learning_rate": 5e-05, "llm_loss": 0.61906498670578, "loss": 2.7726, "loss_aux_layer_0": 0.0091705322265625, "loss_aux_layer_1": 0.027099609375, "loss_aux_layer_10": 0.052001953125, "loss_aux_layer_11": 0.0555419921875, "loss_aux_layer_12": 0.0592041015625, "loss_aux_layer_13": 0.0645751953125, "loss_aux_layer_14": 0.07275390625, "loss_aux_layer_15": 0.0806884765625, "loss_aux_layer_16": 0.0897216796875, "loss_aux_layer_17": 0.0970458984375, "loss_aux_layer_18": 0.105224609375, "loss_aux_layer_19": 0.1085205078125, "loss_aux_layer_2": 0.03778076171875, "loss_aux_layer_20": 0.1160888671875, "loss_aux_layer_21": 0.123779296875, "loss_aux_layer_22": 0.143798828125, "loss_aux_layer_23": 0.178466796875, "loss_aux_layer_3": 0.04736328125, "loss_aux_layer_4": 0.0496826171875, "loss_aux_layer_5": 0.05133056640625, "loss_aux_layer_6": 0.05389404296875, "loss_aux_layer_7": 0.052001953125, "loss_aux_layer_8": 0.05157470703125, "loss_aux_layer_9": 0.05059814453125, "step": 5124, "total_loss": 0.6931397318840027 }, { "epoch": 1.0146505642447041, "grad_norm": 0.851722240447998, "learning_rate": 5e-05, "llm_loss": 0.523415245115757, "loss": 2.4003, "loss_aux_layer_0": 0.010986328125, "loss_aux_layer_1": 0.02838134765625, "loss_aux_layer_10": 0.0548095703125, "loss_aux_layer_11": 0.05853271484375, "loss_aux_layer_12": 0.0626220703125, "loss_aux_layer_13": 0.06787109375, "loss_aux_layer_14": 0.07568359375, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.0924072265625, "loss_aux_layer_17": 0.0997314453125, "loss_aux_layer_18": 0.107421875, "loss_aux_layer_19": 0.1107177734375, "loss_aux_layer_2": 0.0396728515625, "loss_aux_layer_20": 0.1182861328125, "loss_aux_layer_21": 0.1263427734375, "loss_aux_layer_22": 0.146240234375, "loss_aux_layer_23": 0.181884765625, "loss_aux_layer_3": 0.0496826171875, "loss_aux_layer_4": 0.0521240234375, "loss_aux_layer_5": 0.05377197265625, "loss_aux_layer_6": 0.0565185546875, "loss_aux_layer_7": 0.05499267578125, "loss_aux_layer_8": 0.05462646484375, "loss_aux_layer_9": 0.05352783203125, "step": 5125, "total_loss": 0.6000873297452927 }, { "epoch": 1.0148485448426054, "grad_norm": 0.8262786865234375, "learning_rate": 5e-05, "llm_loss": 0.48369182646274567, "loss": 2.2472, "loss_aux_layer_0": 0.00872802734375, "loss_aux_layer_1": 0.030059814453125, "loss_aux_layer_10": 0.05633544921875, "loss_aux_layer_11": 0.06036376953125, "loss_aux_layer_12": 0.064453125, "loss_aux_layer_13": 0.06964111328125, "loss_aux_layer_14": 0.0772705078125, "loss_aux_layer_15": 0.0848388671875, "loss_aux_layer_16": 0.0936279296875, "loss_aux_layer_17": 0.100830078125, "loss_aux_layer_18": 0.1082763671875, "loss_aux_layer_19": 0.1112060546875, "loss_aux_layer_2": 0.0416259765625, "loss_aux_layer_20": 0.1187744140625, "loss_aux_layer_21": 0.1270751953125, "loss_aux_layer_22": 0.149169921875, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.05145263671875, "loss_aux_layer_4": 0.05413818359375, "loss_aux_layer_5": 0.0555419921875, "loss_aux_layer_6": 0.058349609375, "loss_aux_layer_7": 0.0567626953125, "loss_aux_layer_8": 0.05621337890625, "loss_aux_layer_9": 0.05499267578125, "step": 5126, "total_loss": 0.5617984682321548 }, { "epoch": 1.0150465254405068, "grad_norm": 0.7515177726745605, "learning_rate": 5e-05, "llm_loss": 0.5017394125461578, "loss": 2.3054, "loss_aux_layer_0": 0.009765625, "loss_aux_layer_1": 0.026580810546875, "loss_aux_layer_10": 0.05206298828125, "loss_aux_layer_11": 0.055419921875, "loss_aux_layer_12": 0.05938720703125, "loss_aux_layer_13": 0.06451416015625, "loss_aux_layer_14": 0.07275390625, "loss_aux_layer_15": 0.0810546875, "loss_aux_layer_16": 0.0902099609375, "loss_aux_layer_17": 0.09814453125, "loss_aux_layer_18": 0.106201171875, "loss_aux_layer_19": 0.109619140625, "loss_aux_layer_2": 0.03765869140625, "loss_aux_layer_20": 0.117431640625, "loss_aux_layer_21": 0.125244140625, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.181396484375, "loss_aux_layer_3": 0.046875, "loss_aux_layer_4": 0.04931640625, "loss_aux_layer_5": 0.0509033203125, "loss_aux_layer_6": 0.05413818359375, "loss_aux_layer_7": 0.0523681640625, "loss_aux_layer_8": 0.05194091796875, "loss_aux_layer_9": 0.05108642578125, "step": 5127, "total_loss": 0.576339490711689 }, { "epoch": 1.0152445060384083, "grad_norm": 0.9834172129631042, "learning_rate": 5e-05, "llm_loss": 0.5594123601913452, "loss": 2.5377, "loss_aux_layer_0": 0.01025390625, "loss_aux_layer_1": 0.027099609375, "loss_aux_layer_10": 0.0531005859375, "loss_aux_layer_11": 0.05694580078125, "loss_aux_layer_12": 0.06121826171875, "loss_aux_layer_13": 0.06640625, "loss_aux_layer_14": 0.0745849609375, "loss_aux_layer_15": 0.08251953125, "loss_aux_layer_16": 0.09130859375, "loss_aux_layer_17": 0.0989990234375, "loss_aux_layer_18": 0.106201171875, "loss_aux_layer_19": 0.1090087890625, "loss_aux_layer_2": 0.03851318359375, "loss_aux_layer_20": 0.1165771484375, "loss_aux_layer_21": 0.1239013671875, "loss_aux_layer_22": 0.14404296875, "loss_aux_layer_23": 0.17919921875, "loss_aux_layer_3": 0.04779052734375, "loss_aux_layer_4": 0.05010986328125, "loss_aux_layer_5": 0.05145263671875, "loss_aux_layer_6": 0.054443359375, "loss_aux_layer_7": 0.05267333984375, "loss_aux_layer_8": 0.05224609375, "loss_aux_layer_9": 0.0516357421875, "step": 5128, "total_loss": 0.6344286799430847 }, { "epoch": 1.0154424866363096, "grad_norm": 0.7244586944580078, "learning_rate": 5e-05, "llm_loss": 0.49458588659763336, "loss": 2.2823, "loss_aux_layer_0": 0.009368896484375, "loss_aux_layer_1": 0.02728271484375, "loss_aux_layer_10": 0.052490234375, "loss_aux_layer_11": 0.056640625, "loss_aux_layer_12": 0.0609130859375, "loss_aux_layer_13": 0.06622314453125, "loss_aux_layer_14": 0.0745849609375, "loss_aux_layer_15": 0.083251953125, "loss_aux_layer_16": 0.0928955078125, "loss_aux_layer_17": 0.100341796875, "loss_aux_layer_18": 0.10888671875, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.0380859375, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.128662109375, "loss_aux_layer_22": 0.148681640625, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.04736328125, "loss_aux_layer_4": 0.0499267578125, "loss_aux_layer_5": 0.05133056640625, "loss_aux_layer_6": 0.05389404296875, "loss_aux_layer_7": 0.05242919921875, "loss_aux_layer_8": 0.05224609375, "loss_aux_layer_9": 0.0511474609375, "step": 5129, "total_loss": 0.5705680698156357 }, { "epoch": 1.015640467234211, "grad_norm": 1.0532476902008057, "learning_rate": 5e-05, "llm_loss": 0.5764771178364754, "loss": 2.6247, "loss_aux_layer_0": 0.009124755859375, "loss_aux_layer_1": 0.030548095703125, "loss_aux_layer_10": 0.0582275390625, "loss_aux_layer_11": 0.06231689453125, "loss_aux_layer_12": 0.0662841796875, "loss_aux_layer_13": 0.0712890625, "loss_aux_layer_14": 0.079345703125, "loss_aux_layer_15": 0.087158203125, "loss_aux_layer_16": 0.095947265625, "loss_aux_layer_17": 0.102783203125, "loss_aux_layer_18": 0.109619140625, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.04290771484375, "loss_aux_layer_20": 0.1190185546875, "loss_aux_layer_21": 0.1279296875, "loss_aux_layer_22": 0.150634765625, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.05328369140625, "loss_aux_layer_4": 0.055908203125, "loss_aux_layer_5": 0.05755615234375, "loss_aux_layer_6": 0.06036376953125, "loss_aux_layer_7": 0.0584716796875, "loss_aux_layer_8": 0.05810546875, "loss_aux_layer_9": 0.0570068359375, "step": 5130, "total_loss": 0.6561862677335739 }, { "epoch": 1.0158384478321125, "grad_norm": 0.909542441368103, "learning_rate": 5e-05, "llm_loss": 0.5753403827548027, "loss": 2.6007, "loss_aux_layer_0": 0.0103912353515625, "loss_aux_layer_1": 0.02874755859375, "loss_aux_layer_10": 0.05267333984375, "loss_aux_layer_11": 0.05633544921875, "loss_aux_layer_12": 0.06024169921875, "loss_aux_layer_13": 0.0653076171875, "loss_aux_layer_14": 0.0731201171875, "loss_aux_layer_15": 0.0814208984375, "loss_aux_layer_16": 0.090576171875, "loss_aux_layer_17": 0.0972900390625, "loss_aux_layer_18": 0.10498046875, "loss_aux_layer_19": 0.1077880859375, "loss_aux_layer_2": 0.0399169921875, "loss_aux_layer_20": 0.1153564453125, "loss_aux_layer_21": 0.1231689453125, "loss_aux_layer_22": 0.1435546875, "loss_aux_layer_23": 0.178466796875, "loss_aux_layer_3": 0.0494384765625, "loss_aux_layer_4": 0.0516357421875, "loss_aux_layer_5": 0.052978515625, "loss_aux_layer_6": 0.05548095703125, "loss_aux_layer_7": 0.0533447265625, "loss_aux_layer_8": 0.05267333984375, "loss_aux_layer_9": 0.05157470703125, "step": 5131, "total_loss": 0.650170624256134 }, { "epoch": 1.0160364284300138, "grad_norm": 0.7833054065704346, "learning_rate": 5e-05, "llm_loss": 0.5262464880943298, "loss": 2.3961, "loss_aux_layer_0": 0.0095977783203125, "loss_aux_layer_1": 0.02630615234375, "loss_aux_layer_10": 0.0509033203125, "loss_aux_layer_11": 0.05438232421875, "loss_aux_layer_12": 0.058349609375, "loss_aux_layer_13": 0.0631103515625, "loss_aux_layer_14": 0.070556640625, "loss_aux_layer_15": 0.078369140625, "loss_aux_layer_16": 0.0872802734375, "loss_aux_layer_17": 0.094482421875, "loss_aux_layer_18": 0.1019287109375, "loss_aux_layer_19": 0.1063232421875, "loss_aux_layer_2": 0.03656005859375, "loss_aux_layer_20": 0.1146240234375, "loss_aux_layer_21": 0.123291015625, "loss_aux_layer_22": 0.1435546875, "loss_aux_layer_23": 0.17822265625, "loss_aux_layer_3": 0.04595947265625, "loss_aux_layer_4": 0.0484619140625, "loss_aux_layer_5": 0.04998779296875, "loss_aux_layer_6": 0.05291748046875, "loss_aux_layer_7": 0.05120849609375, "loss_aux_layer_8": 0.0506591796875, "loss_aux_layer_9": 0.0496826171875, "step": 5132, "total_loss": 0.599014163017273 }, { "epoch": 1.0162344090279152, "grad_norm": 1.026899814605713, "learning_rate": 5e-05, "llm_loss": 0.4971734881401062, "loss": 2.2877, "loss_aux_layer_0": 0.0099334716796875, "loss_aux_layer_1": 0.02789306640625, "loss_aux_layer_10": 0.0526123046875, "loss_aux_layer_11": 0.056396484375, "loss_aux_layer_12": 0.06048583984375, "loss_aux_layer_13": 0.0657958984375, "loss_aux_layer_14": 0.0732421875, "loss_aux_layer_15": 0.0809326171875, "loss_aux_layer_16": 0.0897216796875, "loss_aux_layer_17": 0.096923828125, "loss_aux_layer_18": 0.10498046875, "loss_aux_layer_19": 0.108642578125, "loss_aux_layer_2": 0.0391845703125, "loss_aux_layer_20": 0.116455078125, "loss_aux_layer_21": 0.125, "loss_aux_layer_22": 0.144287109375, "loss_aux_layer_23": 0.1796875, "loss_aux_layer_3": 0.04840087890625, "loss_aux_layer_4": 0.050537109375, "loss_aux_layer_5": 0.05169677734375, "loss_aux_layer_6": 0.05426025390625, "loss_aux_layer_7": 0.05291748046875, "loss_aux_layer_8": 0.05255126953125, "loss_aux_layer_9": 0.05145263671875, "step": 5133, "total_loss": 0.5719304978847504 }, { "epoch": 1.0164323896258167, "grad_norm": 0.999656617641449, "learning_rate": 5e-05, "llm_loss": 0.4631483256816864, "loss": 2.1514, "loss_aux_layer_0": 0.0102996826171875, "loss_aux_layer_1": 0.027923583984375, "loss_aux_layer_10": 0.05224609375, "loss_aux_layer_11": 0.05615234375, "loss_aux_layer_12": 0.05999755859375, "loss_aux_layer_13": 0.06500244140625, "loss_aux_layer_14": 0.072998046875, "loss_aux_layer_15": 0.081298828125, "loss_aux_layer_16": 0.0904541015625, "loss_aux_layer_17": 0.097900390625, "loss_aux_layer_18": 0.10595703125, "loss_aux_layer_19": 0.109375, "loss_aux_layer_2": 0.03839111328125, "loss_aux_layer_20": 0.1165771484375, "loss_aux_layer_21": 0.1243896484375, "loss_aux_layer_22": 0.14404296875, "loss_aux_layer_23": 0.178955078125, "loss_aux_layer_3": 0.04791259765625, "loss_aux_layer_4": 0.05029296875, "loss_aux_layer_5": 0.0517578125, "loss_aux_layer_6": 0.05438232421875, "loss_aux_layer_7": 0.052734375, "loss_aux_layer_8": 0.05224609375, "loss_aux_layer_9": 0.05108642578125, "step": 5134, "total_loss": 0.537848450243473 }, { "epoch": 1.016630370223718, "grad_norm": 0.8904941082000732, "learning_rate": 5e-05, "llm_loss": 0.563925176858902, "loss": 2.5609, "loss_aux_layer_0": 0.009735107421875, "loss_aux_layer_1": 0.028289794921875, "loss_aux_layer_10": 0.0538330078125, "loss_aux_layer_11": 0.0576171875, "loss_aux_layer_12": 0.0618896484375, "loss_aux_layer_13": 0.0672607421875, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.0833740234375, "loss_aux_layer_16": 0.092529296875, "loss_aux_layer_17": 0.0999755859375, "loss_aux_layer_18": 0.107666015625, "loss_aux_layer_19": 0.111083984375, "loss_aux_layer_2": 0.03948974609375, "loss_aux_layer_20": 0.11865234375, "loss_aux_layer_21": 0.12646484375, "loss_aux_layer_22": 0.14599609375, "loss_aux_layer_23": 0.181640625, "loss_aux_layer_3": 0.049072265625, "loss_aux_layer_4": 0.05169677734375, "loss_aux_layer_5": 0.052978515625, "loss_aux_layer_6": 0.0557861328125, "loss_aux_layer_7": 0.05419921875, "loss_aux_layer_8": 0.0537109375, "loss_aux_layer_9": 0.05255126953125, "step": 5135, "total_loss": 0.6402255594730377 }, { "epoch": 1.0168283508216194, "grad_norm": 1.0921491384506226, "learning_rate": 5e-05, "llm_loss": 0.49684225022792816, "loss": 2.2927, "loss_aux_layer_0": 0.0094451904296875, "loss_aux_layer_1": 0.027587890625, "loss_aux_layer_10": 0.0538330078125, "loss_aux_layer_11": 0.05767822265625, "loss_aux_layer_12": 0.061767578125, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.0830078125, "loss_aux_layer_16": 0.0919189453125, "loss_aux_layer_17": 0.098876953125, "loss_aux_layer_18": 0.1068115234375, "loss_aux_layer_19": 0.1107177734375, "loss_aux_layer_2": 0.03955078125, "loss_aux_layer_20": 0.1181640625, "loss_aux_layer_21": 0.1273193359375, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.049072265625, "loss_aux_layer_4": 0.05126953125, "loss_aux_layer_5": 0.0526123046875, "loss_aux_layer_6": 0.0555419921875, "loss_aux_layer_7": 0.0540771484375, "loss_aux_layer_8": 0.05364990234375, "loss_aux_layer_9": 0.05255126953125, "step": 5136, "total_loss": 0.5731873065233231 }, { "epoch": 1.017026331419521, "grad_norm": 1.044837236404419, "learning_rate": 5e-05, "llm_loss": 0.45913804322481155, "loss": 2.1392, "loss_aux_layer_0": 0.0095062255859375, "loss_aux_layer_1": 0.02838134765625, "loss_aux_layer_10": 0.05401611328125, "loss_aux_layer_11": 0.05780029296875, "loss_aux_layer_12": 0.061767578125, "loss_aux_layer_13": 0.0670166015625, "loss_aux_layer_14": 0.0745849609375, "loss_aux_layer_15": 0.0821533203125, "loss_aux_layer_16": 0.0906982421875, "loss_aux_layer_17": 0.0980224609375, "loss_aux_layer_18": 0.1055908203125, "loss_aux_layer_19": 0.1085205078125, "loss_aux_layer_2": 0.03961181640625, "loss_aux_layer_20": 0.115966796875, "loss_aux_layer_21": 0.12451171875, "loss_aux_layer_22": 0.14404296875, "loss_aux_layer_23": 0.1796875, "loss_aux_layer_3": 0.04962158203125, "loss_aux_layer_4": 0.05224609375, "loss_aux_layer_5": 0.05377197265625, "loss_aux_layer_6": 0.056640625, "loss_aux_layer_7": 0.0548095703125, "loss_aux_layer_8": 0.05401611328125, "loss_aux_layer_9": 0.0528564453125, "step": 5137, "total_loss": 0.5347949266433716 }, { "epoch": 1.0172243120174222, "grad_norm": 1.0239439010620117, "learning_rate": 5e-05, "llm_loss": 0.5863876640796661, "loss": 2.6416, "loss_aux_layer_0": 0.0096893310546875, "loss_aux_layer_1": 0.02703857421875, "loss_aux_layer_10": 0.0513916015625, "loss_aux_layer_11": 0.05499267578125, "loss_aux_layer_12": 0.05908203125, "loss_aux_layer_13": 0.06427001953125, "loss_aux_layer_14": 0.072265625, "loss_aux_layer_15": 0.08056640625, "loss_aux_layer_16": 0.0897216796875, "loss_aux_layer_17": 0.0972900390625, "loss_aux_layer_18": 0.10498046875, "loss_aux_layer_19": 0.1083984375, "loss_aux_layer_2": 0.03826904296875, "loss_aux_layer_20": 0.116455078125, "loss_aux_layer_21": 0.1236572265625, "loss_aux_layer_22": 0.142822265625, "loss_aux_layer_23": 0.17822265625, "loss_aux_layer_3": 0.0474853515625, "loss_aux_layer_4": 0.0498046875, "loss_aux_layer_5": 0.05126953125, "loss_aux_layer_6": 0.0538330078125, "loss_aux_layer_7": 0.052001953125, "loss_aux_layer_8": 0.05145263671875, "loss_aux_layer_9": 0.05023193359375, "step": 5138, "total_loss": 0.6603924483060837 }, { "epoch": 1.0174222926153238, "grad_norm": 1.1108384132385254, "learning_rate": 5e-05, "llm_loss": 0.5599774420261383, "loss": 2.5431, "loss_aux_layer_0": 0.00982666015625, "loss_aux_layer_1": 0.027008056640625, "loss_aux_layer_10": 0.05230712890625, "loss_aux_layer_11": 0.05596923828125, "loss_aux_layer_12": 0.06005859375, "loss_aux_layer_13": 0.06524658203125, "loss_aux_layer_14": 0.073486328125, "loss_aux_layer_15": 0.082275390625, "loss_aux_layer_16": 0.091796875, "loss_aux_layer_17": 0.0997314453125, "loss_aux_layer_18": 0.10791015625, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.038330078125, "loss_aux_layer_20": 0.1197509765625, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.1865234375, "loss_aux_layer_3": 0.047607421875, "loss_aux_layer_4": 0.0501708984375, "loss_aux_layer_5": 0.05157470703125, "loss_aux_layer_6": 0.0540771484375, "loss_aux_layer_7": 0.05218505859375, "loss_aux_layer_8": 0.05181884765625, "loss_aux_layer_9": 0.05096435546875, "step": 5139, "total_loss": 0.6357739567756653 }, { "epoch": 1.0176202732132251, "grad_norm": 1.1320009231567383, "learning_rate": 5e-05, "llm_loss": 0.5144345611333847, "loss": 2.3426, "loss_aux_layer_0": 0.0109100341796875, "loss_aux_layer_1": 0.025054931640625, "loss_aux_layer_10": 0.0479736328125, "loss_aux_layer_11": 0.05145263671875, "loss_aux_layer_12": 0.0552978515625, "loss_aux_layer_13": 0.06024169921875, "loss_aux_layer_14": 0.068359375, "loss_aux_layer_15": 0.076416015625, "loss_aux_layer_16": 0.0860595703125, "loss_aux_layer_17": 0.0936279296875, "loss_aux_layer_18": 0.1015625, "loss_aux_layer_19": 0.1064453125, "loss_aux_layer_2": 0.035125732421875, "loss_aux_layer_20": 0.1143798828125, "loss_aux_layer_21": 0.1234130859375, "loss_aux_layer_22": 0.143310546875, "loss_aux_layer_23": 0.179443359375, "loss_aux_layer_3": 0.04388427734375, "loss_aux_layer_4": 0.04608154296875, "loss_aux_layer_5": 0.04736328125, "loss_aux_layer_6": 0.04974365234375, "loss_aux_layer_7": 0.04815673828125, "loss_aux_layer_8": 0.0477294921875, "loss_aux_layer_9": 0.046875, "step": 5140, "total_loss": 0.5856416150927544 }, { "epoch": 1.0178182538111264, "grad_norm": 1.214561939239502, "learning_rate": 5e-05, "llm_loss": 0.5527166351675987, "loss": 2.5204, "loss_aux_layer_0": 0.0099029541015625, "loss_aux_layer_1": 0.027679443359375, "loss_aux_layer_10": 0.05450439453125, "loss_aux_layer_11": 0.058349609375, "loss_aux_layer_12": 0.06268310546875, "loss_aux_layer_13": 0.06787109375, "loss_aux_layer_14": 0.0762939453125, "loss_aux_layer_15": 0.0849609375, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.1016845703125, "loss_aux_layer_18": 0.1094970703125, "loss_aux_layer_19": 0.1123046875, "loss_aux_layer_2": 0.03955078125, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.1285400390625, "loss_aux_layer_22": 0.14990234375, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.04931640625, "loss_aux_layer_4": 0.0517578125, "loss_aux_layer_5": 0.053466796875, "loss_aux_layer_6": 0.056396484375, "loss_aux_layer_7": 0.0546875, "loss_aux_layer_8": 0.05413818359375, "loss_aux_layer_9": 0.0531005859375, "step": 5141, "total_loss": 0.6300925016403198 }, { "epoch": 1.018016234409028, "grad_norm": 1.227221131324768, "learning_rate": 5e-05, "llm_loss": 0.5765671283006668, "loss": 2.6087, "loss_aux_layer_0": 0.0137939453125, "loss_aux_layer_1": 0.029327392578125, "loss_aux_layer_10": 0.0537109375, "loss_aux_layer_11": 0.0576171875, "loss_aux_layer_12": 0.06158447265625, "loss_aux_layer_13": 0.0665283203125, "loss_aux_layer_14": 0.07373046875, "loss_aux_layer_15": 0.0814208984375, "loss_aux_layer_16": 0.0899658203125, "loss_aux_layer_17": 0.0972900390625, "loss_aux_layer_18": 0.1051025390625, "loss_aux_layer_19": 0.1082763671875, "loss_aux_layer_2": 0.04022216796875, "loss_aux_layer_20": 0.1158447265625, "loss_aux_layer_21": 0.1236572265625, "loss_aux_layer_22": 0.14453125, "loss_aux_layer_23": 0.1806640625, "loss_aux_layer_3": 0.04962158203125, "loss_aux_layer_4": 0.0521240234375, "loss_aux_layer_5": 0.053466796875, "loss_aux_layer_6": 0.0560302734375, "loss_aux_layer_7": 0.05438232421875, "loss_aux_layer_8": 0.0537109375, "loss_aux_layer_9": 0.05242919921875, "step": 5142, "total_loss": 0.652187392115593 }, { "epoch": 1.0182142150069293, "grad_norm": 0.9786635637283325, "learning_rate": 5e-05, "llm_loss": 0.5378539264202118, "loss": 2.4366, "loss_aux_layer_0": 0.0098724365234375, "loss_aux_layer_1": 0.024383544921875, "loss_aux_layer_10": 0.04840087890625, "loss_aux_layer_11": 0.05157470703125, "loss_aux_layer_12": 0.05572509765625, "loss_aux_layer_13": 0.06085205078125, "loss_aux_layer_14": 0.0689697265625, "loss_aux_layer_15": 0.07763671875, "loss_aux_layer_16": 0.0869140625, "loss_aux_layer_17": 0.0948486328125, "loss_aux_layer_18": 0.1026611328125, "loss_aux_layer_19": 0.1068115234375, "loss_aux_layer_2": 0.034912109375, "loss_aux_layer_20": 0.1148681640625, "loss_aux_layer_21": 0.1229248046875, "loss_aux_layer_22": 0.14208984375, "loss_aux_layer_23": 0.177978515625, "loss_aux_layer_3": 0.04302978515625, "loss_aux_layer_4": 0.0455322265625, "loss_aux_layer_5": 0.0472412109375, "loss_aux_layer_6": 0.05023193359375, "loss_aux_layer_7": 0.04852294921875, "loss_aux_layer_8": 0.0479736328125, "loss_aux_layer_9": 0.04705810546875, "step": 5143, "total_loss": 0.6091591417789459 }, { "epoch": 1.0184121956048307, "grad_norm": 1.2075895071029663, "learning_rate": 5e-05, "llm_loss": 0.5683360993862152, "loss": 2.5685, "loss_aux_layer_0": 0.013092041015625, "loss_aux_layer_1": 0.027130126953125, "loss_aux_layer_10": 0.05084228515625, "loss_aux_layer_11": 0.05462646484375, "loss_aux_layer_12": 0.0589599609375, "loss_aux_layer_13": 0.06402587890625, "loss_aux_layer_14": 0.0718994140625, "loss_aux_layer_15": 0.079833984375, "loss_aux_layer_16": 0.0887451171875, "loss_aux_layer_17": 0.0966796875, "loss_aux_layer_18": 0.1046142578125, "loss_aux_layer_19": 0.108642578125, "loss_aux_layer_2": 0.03759765625, "loss_aux_layer_20": 0.116943359375, "loss_aux_layer_21": 0.1246337890625, "loss_aux_layer_22": 0.143798828125, "loss_aux_layer_23": 0.178955078125, "loss_aux_layer_3": 0.046875, "loss_aux_layer_4": 0.04901123046875, "loss_aux_layer_5": 0.0501708984375, "loss_aux_layer_6": 0.0528564453125, "loss_aux_layer_7": 0.05126953125, "loss_aux_layer_8": 0.05059814453125, "loss_aux_layer_9": 0.04962158203125, "step": 5144, "total_loss": 0.6421309113502502 }, { "epoch": 1.0186101762027322, "grad_norm": 0.9371588230133057, "learning_rate": 5e-05, "llm_loss": 0.5454112440347672, "loss": 2.4848, "loss_aux_layer_0": 0.0093536376953125, "loss_aux_layer_1": 0.02783203125, "loss_aux_layer_10": 0.053466796875, "loss_aux_layer_11": 0.05712890625, "loss_aux_layer_12": 0.06146240234375, "loss_aux_layer_13": 0.0667724609375, "loss_aux_layer_14": 0.074462890625, "loss_aux_layer_15": 0.0823974609375, "loss_aux_layer_16": 0.091796875, "loss_aux_layer_17": 0.099609375, "loss_aux_layer_18": 0.107666015625, "loss_aux_layer_19": 0.1104736328125, "loss_aux_layer_2": 0.03900146484375, "loss_aux_layer_20": 0.1181640625, "loss_aux_layer_21": 0.1258544921875, "loss_aux_layer_22": 0.14599609375, "loss_aux_layer_23": 0.1806640625, "loss_aux_layer_3": 0.0487060546875, "loss_aux_layer_4": 0.05126953125, "loss_aux_layer_5": 0.05267333984375, "loss_aux_layer_6": 0.05548095703125, "loss_aux_layer_7": 0.0537109375, "loss_aux_layer_8": 0.05316162109375, "loss_aux_layer_9": 0.05224609375, "step": 5145, "total_loss": 0.6212079152464867 }, { "epoch": 1.0188081568006335, "grad_norm": 1.1394110918045044, "learning_rate": 5e-05, "llm_loss": 0.5190855413675308, "loss": 2.38, "loss_aux_layer_0": 0.0125274658203125, "loss_aux_layer_1": 0.02899169921875, "loss_aux_layer_10": 0.05389404296875, "loss_aux_layer_11": 0.05743408203125, "loss_aux_layer_12": 0.061279296875, "loss_aux_layer_13": 0.066162109375, "loss_aux_layer_14": 0.0733642578125, "loss_aux_layer_15": 0.0811767578125, "loss_aux_layer_16": 0.089599609375, "loss_aux_layer_17": 0.097412109375, "loss_aux_layer_18": 0.1053466796875, "loss_aux_layer_19": 0.10888671875, "loss_aux_layer_2": 0.04071044921875, "loss_aux_layer_20": 0.11669921875, "loss_aux_layer_21": 0.125732421875, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.04998779296875, "loss_aux_layer_4": 0.05230712890625, "loss_aux_layer_5": 0.0537109375, "loss_aux_layer_6": 0.05633544921875, "loss_aux_layer_7": 0.05462646484375, "loss_aux_layer_8": 0.0538330078125, "loss_aux_layer_9": 0.052734375, "step": 5146, "total_loss": 0.5949917286634445 }, { "epoch": 1.0190061373985349, "grad_norm": 0.9267736673355103, "learning_rate": 5e-05, "llm_loss": 0.45658065378665924, "loss": 2.1232, "loss_aux_layer_0": 0.0100860595703125, "loss_aux_layer_1": 0.02764892578125, "loss_aux_layer_10": 0.0513916015625, "loss_aux_layer_11": 0.05523681640625, "loss_aux_layer_12": 0.059326171875, "loss_aux_layer_13": 0.0640869140625, "loss_aux_layer_14": 0.072265625, "loss_aux_layer_15": 0.080322265625, "loss_aux_layer_16": 0.089111328125, "loss_aux_layer_17": 0.0968017578125, "loss_aux_layer_18": 0.105224609375, "loss_aux_layer_19": 0.1092529296875, "loss_aux_layer_2": 0.03863525390625, "loss_aux_layer_20": 0.1171875, "loss_aux_layer_21": 0.12451171875, "loss_aux_layer_22": 0.144287109375, "loss_aux_layer_23": 0.178466796875, "loss_aux_layer_3": 0.0479736328125, "loss_aux_layer_4": 0.05010986328125, "loss_aux_layer_5": 0.05133056640625, "loss_aux_layer_6": 0.053955078125, "loss_aux_layer_7": 0.05230712890625, "loss_aux_layer_8": 0.05145263671875, "loss_aux_layer_9": 0.0501708984375, "step": 5147, "total_loss": 0.530812457203865 }, { "epoch": 1.0192041179964364, "grad_norm": 0.9900500178337097, "learning_rate": 5e-05, "llm_loss": 0.4982912316918373, "loss": 2.2908, "loss_aux_layer_0": 0.01177978515625, "loss_aux_layer_1": 0.028076171875, "loss_aux_layer_10": 0.0526123046875, "loss_aux_layer_11": 0.05621337890625, "loss_aux_layer_12": 0.0601806640625, "loss_aux_layer_13": 0.06500244140625, "loss_aux_layer_14": 0.0726318359375, "loss_aux_layer_15": 0.080322265625, "loss_aux_layer_16": 0.0888671875, "loss_aux_layer_17": 0.09619140625, "loss_aux_layer_18": 0.1041259765625, "loss_aux_layer_19": 0.1075439453125, "loss_aux_layer_2": 0.03863525390625, "loss_aux_layer_20": 0.115478515625, "loss_aux_layer_21": 0.12353515625, "loss_aux_layer_22": 0.142822265625, "loss_aux_layer_23": 0.177978515625, "loss_aux_layer_3": 0.04827880859375, "loss_aux_layer_4": 0.05078125, "loss_aux_layer_5": 0.05224609375, "loss_aux_layer_6": 0.05499267578125, "loss_aux_layer_7": 0.0533447265625, "loss_aux_layer_8": 0.0526123046875, "loss_aux_layer_9": 0.05145263671875, "step": 5148, "total_loss": 0.5726968199014664 }, { "epoch": 1.0194020985943377, "grad_norm": 0.8549109697341919, "learning_rate": 5e-05, "llm_loss": 0.512338787317276, "loss": 2.3551, "loss_aux_layer_0": 0.0106658935546875, "loss_aux_layer_1": 0.0286865234375, "loss_aux_layer_10": 0.05328369140625, "loss_aux_layer_11": 0.05712890625, "loss_aux_layer_12": 0.06134033203125, "loss_aux_layer_13": 0.0665283203125, "loss_aux_layer_14": 0.07470703125, "loss_aux_layer_15": 0.0826416015625, "loss_aux_layer_16": 0.091796875, "loss_aux_layer_17": 0.099365234375, "loss_aux_layer_18": 0.1075439453125, "loss_aux_layer_19": 0.111328125, "loss_aux_layer_2": 0.03985595703125, "loss_aux_layer_20": 0.1190185546875, "loss_aux_layer_21": 0.1279296875, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.0516357421875, "loss_aux_layer_5": 0.05291748046875, "loss_aux_layer_6": 0.05560302734375, "loss_aux_layer_7": 0.053955078125, "loss_aux_layer_8": 0.05328369140625, "loss_aux_layer_9": 0.052001953125, "step": 5149, "total_loss": 0.5887781530618668 }, { "epoch": 1.019600079192239, "grad_norm": 0.9571189284324646, "learning_rate": 5e-05, "llm_loss": 0.4972817748785019, "loss": 2.2689, "loss_aux_layer_0": 0.0106964111328125, "loss_aux_layer_1": 0.024749755859375, "loss_aux_layer_10": 0.04705810546875, "loss_aux_layer_11": 0.05029296875, "loss_aux_layer_12": 0.05450439453125, "loss_aux_layer_13": 0.0594482421875, "loss_aux_layer_14": 0.067626953125, "loss_aux_layer_15": 0.0758056640625, "loss_aux_layer_16": 0.0850830078125, "loss_aux_layer_17": 0.093017578125, "loss_aux_layer_18": 0.10107421875, "loss_aux_layer_19": 0.10498046875, "loss_aux_layer_2": 0.0345458984375, "loss_aux_layer_20": 0.112548828125, "loss_aux_layer_21": 0.1204833984375, "loss_aux_layer_22": 0.139404296875, "loss_aux_layer_23": 0.17431640625, "loss_aux_layer_3": 0.04302978515625, "loss_aux_layer_4": 0.045166015625, "loss_aux_layer_5": 0.0465087890625, "loss_aux_layer_6": 0.04876708984375, "loss_aux_layer_7": 0.047119140625, "loss_aux_layer_8": 0.0467529296875, "loss_aux_layer_9": 0.04583740234375, "step": 5150, "total_loss": 0.5672275125980377 }, { "epoch": 1.0197980597901406, "grad_norm": 0.9014211893081665, "learning_rate": 5e-05, "llm_loss": 0.5917297154664993, "loss": 2.6748, "loss_aux_layer_0": 0.0098419189453125, "loss_aux_layer_1": 0.0284423828125, "loss_aux_layer_10": 0.05401611328125, "loss_aux_layer_11": 0.057861328125, "loss_aux_layer_12": 0.06201171875, "loss_aux_layer_13": 0.067138671875, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.0838623046875, "loss_aux_layer_16": 0.09326171875, "loss_aux_layer_17": 0.100830078125, "loss_aux_layer_18": 0.1092529296875, "loss_aux_layer_19": 0.1123046875, "loss_aux_layer_2": 0.0399169921875, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.1285400390625, "loss_aux_layer_22": 0.1484375, "loss_aux_layer_23": 0.184326171875, "loss_aux_layer_3": 0.0494384765625, "loss_aux_layer_4": 0.0517578125, "loss_aux_layer_5": 0.05316162109375, "loss_aux_layer_6": 0.055908203125, "loss_aux_layer_7": 0.054443359375, "loss_aux_layer_8": 0.0540771484375, "loss_aux_layer_9": 0.052734375, "step": 5151, "total_loss": 0.6686962097883224 }, { "epoch": 1.019996040388042, "grad_norm": 0.8888223171234131, "learning_rate": 5e-05, "llm_loss": 0.5444458797574043, "loss": 2.4727, "loss_aux_layer_0": 0.0109100341796875, "loss_aux_layer_1": 0.028045654296875, "loss_aux_layer_10": 0.05224609375, "loss_aux_layer_11": 0.05609130859375, "loss_aux_layer_12": 0.06024169921875, "loss_aux_layer_13": 0.065185546875, "loss_aux_layer_14": 0.0723876953125, "loss_aux_layer_15": 0.0802001953125, "loss_aux_layer_16": 0.088623046875, "loss_aux_layer_17": 0.0960693359375, "loss_aux_layer_18": 0.103515625, "loss_aux_layer_19": 0.106201171875, "loss_aux_layer_2": 0.0389404296875, "loss_aux_layer_20": 0.1138916015625, "loss_aux_layer_21": 0.1212158203125, "loss_aux_layer_22": 0.140869140625, "loss_aux_layer_23": 0.1748046875, "loss_aux_layer_3": 0.04803466796875, "loss_aux_layer_4": 0.050537109375, "loss_aux_layer_5": 0.05169677734375, "loss_aux_layer_6": 0.0540771484375, "loss_aux_layer_7": 0.05255126953125, "loss_aux_layer_8": 0.05206298828125, "loss_aux_layer_9": 0.05108642578125, "step": 5152, "total_loss": 0.6181708574295044 }, { "epoch": 1.0201940209859435, "grad_norm": 0.9230656623840332, "learning_rate": 5e-05, "llm_loss": 0.526131197810173, "loss": 2.4031, "loss_aux_layer_0": 0.0104522705078125, "loss_aux_layer_1": 0.026763916015625, "loss_aux_layer_10": 0.0523681640625, "loss_aux_layer_11": 0.055908203125, "loss_aux_layer_12": 0.06011962890625, "loss_aux_layer_13": 0.0654296875, "loss_aux_layer_14": 0.0731201171875, "loss_aux_layer_15": 0.0809326171875, "loss_aux_layer_16": 0.090087890625, "loss_aux_layer_17": 0.097412109375, "loss_aux_layer_18": 0.1051025390625, "loss_aux_layer_19": 0.1085205078125, "loss_aux_layer_2": 0.0379638671875, "loss_aux_layer_20": 0.116455078125, "loss_aux_layer_21": 0.124755859375, "loss_aux_layer_22": 0.14599609375, "loss_aux_layer_23": 0.18212890625, "loss_aux_layer_3": 0.0472412109375, "loss_aux_layer_4": 0.0499267578125, "loss_aux_layer_5": 0.05145263671875, "loss_aux_layer_6": 0.05401611328125, "loss_aux_layer_7": 0.05230712890625, "loss_aux_layer_8": 0.05194091796875, "loss_aux_layer_9": 0.051025390625, "step": 5153, "total_loss": 0.6007831543684006 }, { "epoch": 1.0203920015838448, "grad_norm": 0.8068868517875671, "learning_rate": 5e-05, "llm_loss": 0.4714403823018074, "loss": 2.1702, "loss_aux_layer_0": 0.01080322265625, "loss_aux_layer_1": 0.025726318359375, "loss_aux_layer_10": 0.04876708984375, "loss_aux_layer_11": 0.05194091796875, "loss_aux_layer_12": 0.0556640625, "loss_aux_layer_13": 0.0604248046875, "loss_aux_layer_14": 0.068359375, "loss_aux_layer_15": 0.0762939453125, "loss_aux_layer_16": 0.0848388671875, "loss_aux_layer_17": 0.092529296875, "loss_aux_layer_18": 0.1005859375, "loss_aux_layer_19": 0.1046142578125, "loss_aux_layer_2": 0.0357666015625, "loss_aux_layer_20": 0.1131591796875, "loss_aux_layer_21": 0.12158203125, "loss_aux_layer_22": 0.1416015625, "loss_aux_layer_23": 0.177001953125, "loss_aux_layer_3": 0.04461669921875, "loss_aux_layer_4": 0.0469970703125, "loss_aux_layer_5": 0.04840087890625, "loss_aux_layer_6": 0.05078125, "loss_aux_layer_7": 0.0491943359375, "loss_aux_layer_8": 0.0489501953125, "loss_aux_layer_9": 0.04766845703125, "step": 5154, "total_loss": 0.5425408631563187 }, { "epoch": 1.0205899821817461, "grad_norm": 0.9110113978385925, "learning_rate": 5e-05, "llm_loss": 0.4905388727784157, "loss": 2.2694, "loss_aux_layer_0": 0.010406494140625, "loss_aux_layer_1": 0.02923583984375, "loss_aux_layer_10": 0.05450439453125, "loss_aux_layer_11": 0.05853271484375, "loss_aux_layer_12": 0.06280517578125, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.07666015625, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.093505859375, "loss_aux_layer_17": 0.1007080078125, "loss_aux_layer_18": 0.1077880859375, "loss_aux_layer_19": 0.1102294921875, "loss_aux_layer_2": 0.04046630859375, "loss_aux_layer_20": 0.1173095703125, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.144775390625, "loss_aux_layer_23": 0.180419921875, "loss_aux_layer_3": 0.0501708984375, "loss_aux_layer_4": 0.05291748046875, "loss_aux_layer_5": 0.05419921875, "loss_aux_layer_6": 0.05712890625, "loss_aux_layer_7": 0.05523681640625, "loss_aux_layer_8": 0.05462646484375, "loss_aux_layer_9": 0.05328369140625, "step": 5155, "total_loss": 0.5673386752605438 }, { "epoch": 1.0207879627796477, "grad_norm": 0.9144807457923889, "learning_rate": 5e-05, "llm_loss": 0.531777910888195, "loss": 2.4352, "loss_aux_layer_0": 0.0114593505859375, "loss_aux_layer_1": 0.028961181640625, "loss_aux_layer_10": 0.05462646484375, "loss_aux_layer_11": 0.058349609375, "loss_aux_layer_12": 0.0628662109375, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.0762939453125, "loss_aux_layer_15": 0.0843505859375, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.1007080078125, "loss_aux_layer_18": 0.10888671875, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.04022216796875, "loss_aux_layer_20": 0.1192626953125, "loss_aux_layer_21": 0.126953125, "loss_aux_layer_22": 0.14599609375, "loss_aux_layer_23": 0.181640625, "loss_aux_layer_3": 0.0499267578125, "loss_aux_layer_4": 0.05230712890625, "loss_aux_layer_5": 0.05364990234375, "loss_aux_layer_6": 0.056396484375, "loss_aux_layer_7": 0.0548095703125, "loss_aux_layer_8": 0.05438232421875, "loss_aux_layer_9": 0.053466796875, "step": 5156, "total_loss": 0.6087971031665802 }, { "epoch": 1.020985943377549, "grad_norm": 0.8486172556877136, "learning_rate": 5e-05, "llm_loss": 0.4420342966914177, "loss": 2.0693, "loss_aux_layer_0": 0.0096893310546875, "loss_aux_layer_1": 0.02764892578125, "loss_aux_layer_10": 0.05206298828125, "loss_aux_layer_11": 0.0556640625, "loss_aux_layer_12": 0.05987548828125, "loss_aux_layer_13": 0.06488037109375, "loss_aux_layer_14": 0.0728759765625, "loss_aux_layer_15": 0.0814208984375, "loss_aux_layer_16": 0.0904541015625, "loss_aux_layer_17": 0.0980224609375, "loss_aux_layer_18": 0.1063232421875, "loss_aux_layer_19": 0.1103515625, "loss_aux_layer_2": 0.03912353515625, "loss_aux_layer_20": 0.118408203125, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.148193359375, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.0482177734375, "loss_aux_layer_4": 0.05035400390625, "loss_aux_layer_5": 0.05157470703125, "loss_aux_layer_6": 0.0540771484375, "loss_aux_layer_7": 0.05255126953125, "loss_aux_layer_8": 0.05206298828125, "loss_aux_layer_9": 0.05096435546875, "step": 5157, "total_loss": 0.5173298120498657 }, { "epoch": 1.0211839239754503, "grad_norm": 0.8300037384033203, "learning_rate": 5e-05, "llm_loss": 0.414247527718544, "loss": 1.9672, "loss_aux_layer_0": 0.011260986328125, "loss_aux_layer_1": 0.02880859375, "loss_aux_layer_10": 0.05535888671875, "loss_aux_layer_11": 0.05926513671875, "loss_aux_layer_12": 0.0635986328125, "loss_aux_layer_13": 0.06884765625, "loss_aux_layer_14": 0.0765380859375, "loss_aux_layer_15": 0.084228515625, "loss_aux_layer_16": 0.093017578125, "loss_aux_layer_17": 0.1002197265625, "loss_aux_layer_18": 0.1080322265625, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.040771484375, "loss_aux_layer_20": 0.1190185546875, "loss_aux_layer_21": 0.127197265625, "loss_aux_layer_22": 0.147705078125, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.05084228515625, "loss_aux_layer_4": 0.0535888671875, "loss_aux_layer_5": 0.054931640625, "loss_aux_layer_6": 0.057861328125, "loss_aux_layer_7": 0.05621337890625, "loss_aux_layer_8": 0.0552978515625, "loss_aux_layer_9": 0.05413818359375, "step": 5158, "total_loss": 0.49181216210126877 }, { "epoch": 1.0213819045733519, "grad_norm": 0.801191508769989, "learning_rate": 5e-05, "llm_loss": 0.570336252450943, "loss": 2.5847, "loss_aux_layer_0": 0.010009765625, "loss_aux_layer_1": 0.027679443359375, "loss_aux_layer_10": 0.0540771484375, "loss_aux_layer_11": 0.0579833984375, "loss_aux_layer_12": 0.06219482421875, "loss_aux_layer_13": 0.06689453125, "loss_aux_layer_14": 0.0748291015625, "loss_aux_layer_15": 0.082763671875, "loss_aux_layer_16": 0.091064453125, "loss_aux_layer_17": 0.098388671875, "loss_aux_layer_18": 0.106201171875, "loss_aux_layer_19": 0.109130859375, "loss_aux_layer_2": 0.03936767578125, "loss_aux_layer_20": 0.1168212890625, "loss_aux_layer_21": 0.124755859375, "loss_aux_layer_22": 0.145263671875, "loss_aux_layer_23": 0.181396484375, "loss_aux_layer_3": 0.04913330078125, "loss_aux_layer_4": 0.05169677734375, "loss_aux_layer_5": 0.05328369140625, "loss_aux_layer_6": 0.0560302734375, "loss_aux_layer_7": 0.05450439453125, "loss_aux_layer_8": 0.05389404296875, "loss_aux_layer_9": 0.052734375, "step": 5159, "total_loss": 0.6461838185787201 }, { "epoch": 1.0215798851712532, "grad_norm": 0.9468953013420105, "learning_rate": 5e-05, "llm_loss": 0.5059665963053703, "loss": 2.3317, "loss_aux_layer_0": 0.0097808837890625, "loss_aux_layer_1": 0.028350830078125, "loss_aux_layer_10": 0.05426025390625, "loss_aux_layer_11": 0.0579833984375, "loss_aux_layer_12": 0.06243896484375, "loss_aux_layer_13": 0.0677490234375, "loss_aux_layer_14": 0.0760498046875, "loss_aux_layer_15": 0.084228515625, "loss_aux_layer_16": 0.0931396484375, "loss_aux_layer_17": 0.1005859375, "loss_aux_layer_18": 0.108642578125, "loss_aux_layer_19": 0.1116943359375, "loss_aux_layer_2": 0.04010009765625, "loss_aux_layer_20": 0.1190185546875, "loss_aux_layer_21": 0.1270751953125, "loss_aux_layer_22": 0.14697265625, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.050048828125, "loss_aux_layer_4": 0.05255126953125, "loss_aux_layer_5": 0.053955078125, "loss_aux_layer_6": 0.0567626953125, "loss_aux_layer_7": 0.05487060546875, "loss_aux_layer_8": 0.05419921875, "loss_aux_layer_9": 0.052978515625, "step": 5160, "total_loss": 0.5829170867800713 }, { "epoch": 1.0217778657691545, "grad_norm": 0.8465684056282043, "learning_rate": 5e-05, "llm_loss": 0.5119382590055466, "loss": 2.3371, "loss_aux_layer_0": 0.010284423828125, "loss_aux_layer_1": 0.0257568359375, "loss_aux_layer_10": 0.04974365234375, "loss_aux_layer_11": 0.0533447265625, "loss_aux_layer_12": 0.0574951171875, "loss_aux_layer_13": 0.06268310546875, "loss_aux_layer_14": 0.070556640625, "loss_aux_layer_15": 0.07861328125, "loss_aux_layer_16": 0.0875244140625, "loss_aux_layer_17": 0.0950927734375, "loss_aux_layer_18": 0.1026611328125, "loss_aux_layer_19": 0.1063232421875, "loss_aux_layer_2": 0.035888671875, "loss_aux_layer_20": 0.1143798828125, "loss_aux_layer_21": 0.12255859375, "loss_aux_layer_22": 0.142578125, "loss_aux_layer_23": 0.177978515625, "loss_aux_layer_3": 0.04498291015625, "loss_aux_layer_4": 0.0474853515625, "loss_aux_layer_5": 0.04901123046875, "loss_aux_layer_6": 0.05194091796875, "loss_aux_layer_7": 0.0504150390625, "loss_aux_layer_8": 0.0498046875, "loss_aux_layer_9": 0.04864501953125, "step": 5161, "total_loss": 0.5842795372009277 }, { "epoch": 1.021975846367056, "grad_norm": 0.7381684184074402, "learning_rate": 5e-05, "llm_loss": 0.486738383769989, "loss": 2.2492, "loss_aux_layer_0": 0.009368896484375, "loss_aux_layer_1": 0.027984619140625, "loss_aux_layer_10": 0.05303955078125, "loss_aux_layer_11": 0.05670166015625, "loss_aux_layer_12": 0.06085205078125, "loss_aux_layer_13": 0.06585693359375, "loss_aux_layer_14": 0.07373046875, "loss_aux_layer_15": 0.0816650390625, "loss_aux_layer_16": 0.090576171875, "loss_aux_layer_17": 0.0980224609375, "loss_aux_layer_18": 0.1060791015625, "loss_aux_layer_19": 0.1099853515625, "loss_aux_layer_2": 0.03887939453125, "loss_aux_layer_20": 0.1177978515625, "loss_aux_layer_21": 0.1265869140625, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.183349609375, "loss_aux_layer_3": 0.0484619140625, "loss_aux_layer_4": 0.05096435546875, "loss_aux_layer_5": 0.0523681640625, "loss_aux_layer_6": 0.05523681640625, "loss_aux_layer_7": 0.05377197265625, "loss_aux_layer_8": 0.052978515625, "loss_aux_layer_9": 0.0518798828125, "step": 5162, "total_loss": 0.5623072236776352 }, { "epoch": 1.0221738269649574, "grad_norm": 0.8504678606987, "learning_rate": 5e-05, "llm_loss": 0.4899653270840645, "loss": 2.2563, "loss_aux_layer_0": 0.0097503662109375, "loss_aux_layer_1": 0.026519775390625, "loss_aux_layer_10": 0.05157470703125, "loss_aux_layer_11": 0.0552978515625, "loss_aux_layer_12": 0.0592041015625, "loss_aux_layer_13": 0.064208984375, "loss_aux_layer_14": 0.0718994140625, "loss_aux_layer_15": 0.079833984375, "loss_aux_layer_16": 0.088623046875, "loss_aux_layer_17": 0.09619140625, "loss_aux_layer_18": 0.104248046875, "loss_aux_layer_19": 0.108154296875, "loss_aux_layer_2": 0.0374755859375, "loss_aux_layer_20": 0.116455078125, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.18310546875, "loss_aux_layer_3": 0.046875, "loss_aux_layer_4": 0.0491943359375, "loss_aux_layer_5": 0.05072021484375, "loss_aux_layer_6": 0.053466796875, "loss_aux_layer_7": 0.05206298828125, "loss_aux_layer_8": 0.05157470703125, "loss_aux_layer_9": 0.050537109375, "step": 5163, "total_loss": 0.5640841126441956 }, { "epoch": 1.022371807562859, "grad_norm": 0.876541256904602, "learning_rate": 5e-05, "llm_loss": 0.48999519646167755, "loss": 2.255, "loss_aux_layer_0": 0.0093231201171875, "loss_aux_layer_1": 0.026947021484375, "loss_aux_layer_10": 0.0511474609375, "loss_aux_layer_11": 0.05487060546875, "loss_aux_layer_12": 0.058837890625, "loss_aux_layer_13": 0.0640869140625, "loss_aux_layer_14": 0.072021484375, "loss_aux_layer_15": 0.080322265625, "loss_aux_layer_16": 0.0897216796875, "loss_aux_layer_17": 0.0975341796875, "loss_aux_layer_18": 0.1058349609375, "loss_aux_layer_19": 0.10888671875, "loss_aux_layer_2": 0.0369873046875, "loss_aux_layer_20": 0.1163330078125, "loss_aux_layer_21": 0.1236572265625, "loss_aux_layer_22": 0.143798828125, "loss_aux_layer_23": 0.179443359375, "loss_aux_layer_3": 0.0462646484375, "loss_aux_layer_4": 0.048583984375, "loss_aux_layer_5": 0.05010986328125, "loss_aux_layer_6": 0.05303955078125, "loss_aux_layer_7": 0.051513671875, "loss_aux_layer_8": 0.05096435546875, "loss_aux_layer_9": 0.04986572265625, "step": 5164, "total_loss": 0.5637445747852325 }, { "epoch": 1.0225697881607603, "grad_norm": 1.1215187311172485, "learning_rate": 5e-05, "llm_loss": 0.5459379032254219, "loss": 2.489, "loss_aux_layer_0": 0.010162353515625, "loss_aux_layer_1": 0.028228759765625, "loss_aux_layer_10": 0.05487060546875, "loss_aux_layer_11": 0.05841064453125, "loss_aux_layer_12": 0.0626220703125, "loss_aux_layer_13": 0.06787109375, "loss_aux_layer_14": 0.0758056640625, "loss_aux_layer_15": 0.083740234375, "loss_aux_layer_16": 0.092529296875, "loss_aux_layer_17": 0.0997314453125, "loss_aux_layer_18": 0.1075439453125, "loss_aux_layer_19": 0.1104736328125, "loss_aux_layer_2": 0.039306640625, "loss_aux_layer_20": 0.11767578125, "loss_aux_layer_21": 0.125244140625, "loss_aux_layer_22": 0.144775390625, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.04925537109375, "loss_aux_layer_4": 0.05157470703125, "loss_aux_layer_5": 0.05316162109375, "loss_aux_layer_6": 0.05596923828125, "loss_aux_layer_7": 0.0548095703125, "loss_aux_layer_8": 0.05450439453125, "loss_aux_layer_9": 0.05352783203125, "step": 5165, "total_loss": 0.6222579404711723 }, { "epoch": 1.0227677687586616, "grad_norm": 0.8890701532363892, "learning_rate": 5e-05, "llm_loss": 0.5738562941551208, "loss": 2.5993, "loss_aux_layer_0": 0.009552001953125, "loss_aux_layer_1": 0.0272216796875, "loss_aux_layer_10": 0.05377197265625, "loss_aux_layer_11": 0.05743408203125, "loss_aux_layer_12": 0.0615234375, "loss_aux_layer_13": 0.066650390625, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.0830078125, "loss_aux_layer_16": 0.0921630859375, "loss_aux_layer_17": 0.099609375, "loss_aux_layer_18": 0.1075439453125, "loss_aux_layer_19": 0.11083984375, "loss_aux_layer_2": 0.0382080078125, "loss_aux_layer_20": 0.1181640625, "loss_aux_layer_21": 0.1260986328125, "loss_aux_layer_22": 0.146728515625, "loss_aux_layer_23": 0.18310546875, "loss_aux_layer_3": 0.04815673828125, "loss_aux_layer_4": 0.0509033203125, "loss_aux_layer_5": 0.0526123046875, "loss_aux_layer_6": 0.0555419921875, "loss_aux_layer_7": 0.05377197265625, "loss_aux_layer_8": 0.0533447265625, "loss_aux_layer_9": 0.0523681640625, "step": 5166, "total_loss": 0.6498375236988068 }, { "epoch": 1.0229657493565631, "grad_norm": 0.9411789178848267, "learning_rate": 5e-05, "llm_loss": 0.5549004748463631, "loss": 2.5224, "loss_aux_layer_0": 0.0118408203125, "loss_aux_layer_1": 0.02862548828125, "loss_aux_layer_10": 0.05413818359375, "loss_aux_layer_11": 0.05804443359375, "loss_aux_layer_12": 0.0621337890625, "loss_aux_layer_13": 0.06732177734375, "loss_aux_layer_14": 0.074951171875, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.0914306640625, "loss_aux_layer_17": 0.0986328125, "loss_aux_layer_18": 0.105712890625, "loss_aux_layer_19": 0.1083984375, "loss_aux_layer_2": 0.0394287109375, "loss_aux_layer_20": 0.1153564453125, "loss_aux_layer_21": 0.1236572265625, "loss_aux_layer_22": 0.143798828125, "loss_aux_layer_23": 0.1796875, "loss_aux_layer_3": 0.0491943359375, "loss_aux_layer_4": 0.05169677734375, "loss_aux_layer_5": 0.05316162109375, "loss_aux_layer_6": 0.055908203125, "loss_aux_layer_7": 0.05413818359375, "loss_aux_layer_8": 0.05377197265625, "loss_aux_layer_9": 0.05291748046875, "step": 5167, "total_loss": 0.6305932775139809 }, { "epoch": 1.0231637299544645, "grad_norm": 0.8362548351287842, "learning_rate": 5e-05, "llm_loss": 0.5318049043416977, "loss": 2.4276, "loss_aux_layer_0": 0.0094757080078125, "loss_aux_layer_1": 0.027587890625, "loss_aux_layer_10": 0.05303955078125, "loss_aux_layer_11": 0.0567626953125, "loss_aux_layer_12": 0.0609130859375, "loss_aux_layer_13": 0.06591796875, "loss_aux_layer_14": 0.07373046875, "loss_aux_layer_15": 0.0816650390625, "loss_aux_layer_16": 0.0906982421875, "loss_aux_layer_17": 0.0982666015625, "loss_aux_layer_18": 0.1063232421875, "loss_aux_layer_19": 0.1094970703125, "loss_aux_layer_2": 0.03887939453125, "loss_aux_layer_20": 0.1173095703125, "loss_aux_layer_21": 0.1243896484375, "loss_aux_layer_22": 0.1435546875, "loss_aux_layer_23": 0.1787109375, "loss_aux_layer_3": 0.0487060546875, "loss_aux_layer_4": 0.05108642578125, "loss_aux_layer_5": 0.05242919921875, "loss_aux_layer_6": 0.05499267578125, "loss_aux_layer_7": 0.05328369140625, "loss_aux_layer_8": 0.05303955078125, "loss_aux_layer_9": 0.05169677734375, "step": 5168, "total_loss": 0.6068927645683289 }, { "epoch": 1.0233617105523658, "grad_norm": 1.1607394218444824, "learning_rate": 5e-05, "llm_loss": 0.5636434108018875, "loss": 2.5536, "loss_aux_layer_0": 0.0135345458984375, "loss_aux_layer_1": 0.027587890625, "loss_aux_layer_10": 0.0521240234375, "loss_aux_layer_11": 0.05560302734375, "loss_aux_layer_12": 0.0596923828125, "loss_aux_layer_13": 0.0645751953125, "loss_aux_layer_14": 0.072509765625, "loss_aux_layer_15": 0.0806884765625, "loss_aux_layer_16": 0.0899658203125, "loss_aux_layer_17": 0.097900390625, "loss_aux_layer_18": 0.1063232421875, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.03802490234375, "loss_aux_layer_20": 0.117919921875, "loss_aux_layer_21": 0.1256103515625, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.180419921875, "loss_aux_layer_3": 0.04693603515625, "loss_aux_layer_4": 0.0494384765625, "loss_aux_layer_5": 0.0509033203125, "loss_aux_layer_6": 0.05352783203125, "loss_aux_layer_7": 0.0521240234375, "loss_aux_layer_8": 0.052001953125, "loss_aux_layer_9": 0.05096435546875, "step": 5169, "total_loss": 0.6383934170007706 }, { "epoch": 1.0235596911502673, "grad_norm": 0.9332833290100098, "learning_rate": 5e-05, "llm_loss": 0.5515172183513641, "loss": 2.5027, "loss_aux_layer_0": 0.010223388671875, "loss_aux_layer_1": 0.027313232421875, "loss_aux_layer_10": 0.0516357421875, "loss_aux_layer_11": 0.05548095703125, "loss_aux_layer_12": 0.0594482421875, "loss_aux_layer_13": 0.06414794921875, "loss_aux_layer_14": 0.0723876953125, "loss_aux_layer_15": 0.080322265625, "loss_aux_layer_16": 0.089599609375, "loss_aux_layer_17": 0.0975341796875, "loss_aux_layer_18": 0.1048583984375, "loss_aux_layer_19": 0.1082763671875, "loss_aux_layer_2": 0.0379638671875, "loss_aux_layer_20": 0.116455078125, "loss_aux_layer_21": 0.1240234375, "loss_aux_layer_22": 0.14453125, "loss_aux_layer_23": 0.1796875, "loss_aux_layer_3": 0.0472412109375, "loss_aux_layer_4": 0.0496826171875, "loss_aux_layer_5": 0.05120849609375, "loss_aux_layer_6": 0.0537109375, "loss_aux_layer_7": 0.052001953125, "loss_aux_layer_8": 0.051513671875, "loss_aux_layer_9": 0.0504150390625, "step": 5170, "total_loss": 0.6256683170795441 }, { "epoch": 1.0237576717481687, "grad_norm": 0.8856039643287659, "learning_rate": 5e-05, "llm_loss": 0.6088433414697647, "loss": 2.7496, "loss_aux_layer_0": 0.010589599609375, "loss_aux_layer_1": 0.02996826171875, "loss_aux_layer_10": 0.05584716796875, "loss_aux_layer_11": 0.05975341796875, "loss_aux_layer_12": 0.06390380859375, "loss_aux_layer_13": 0.0692138671875, "loss_aux_layer_14": 0.077392578125, "loss_aux_layer_15": 0.085205078125, "loss_aux_layer_16": 0.0947265625, "loss_aux_layer_17": 0.102294921875, "loss_aux_layer_18": 0.10986328125, "loss_aux_layer_19": 0.1134033203125, "loss_aux_layer_2": 0.04193115234375, "loss_aux_layer_20": 0.120849609375, "loss_aux_layer_21": 0.1282958984375, "loss_aux_layer_22": 0.1494140625, "loss_aux_layer_23": 0.185791015625, "loss_aux_layer_3": 0.0518798828125, "loss_aux_layer_4": 0.0540771484375, "loss_aux_layer_5": 0.05535888671875, "loss_aux_layer_6": 0.05816650390625, "loss_aux_layer_7": 0.056396484375, "loss_aux_layer_8": 0.05572509765625, "loss_aux_layer_9": 0.05450439453125, "step": 5171, "total_loss": 0.687397763133049 }, { "epoch": 1.02395565234607, "grad_norm": 0.8813930749893188, "learning_rate": 5e-05, "llm_loss": 0.5137163996696472, "loss": 2.3544, "loss_aux_layer_0": 0.0107421875, "loss_aux_layer_1": 0.027374267578125, "loss_aux_layer_10": 0.05340576171875, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.061279296875, "loss_aux_layer_13": 0.0665283203125, "loss_aux_layer_14": 0.074462890625, "loss_aux_layer_15": 0.082275390625, "loss_aux_layer_16": 0.091064453125, "loss_aux_layer_17": 0.0982666015625, "loss_aux_layer_18": 0.1055908203125, "loss_aux_layer_19": 0.108154296875, "loss_aux_layer_2": 0.03839111328125, "loss_aux_layer_20": 0.1151123046875, "loss_aux_layer_21": 0.122802734375, "loss_aux_layer_22": 0.142578125, "loss_aux_layer_23": 0.177490234375, "loss_aux_layer_3": 0.04803466796875, "loss_aux_layer_4": 0.05072021484375, "loss_aux_layer_5": 0.05230712890625, "loss_aux_layer_6": 0.05499267578125, "loss_aux_layer_7": 0.05352783203125, "loss_aux_layer_8": 0.05340576171875, "loss_aux_layer_9": 0.0521240234375, "step": 5172, "total_loss": 0.5886122286319733 }, { "epoch": 1.0241536329439715, "grad_norm": 0.8331280946731567, "learning_rate": 5e-05, "llm_loss": 0.4757416620850563, "loss": 2.2, "loss_aux_layer_0": 0.00946044921875, "loss_aux_layer_1": 0.025482177734375, "loss_aux_layer_10": 0.05120849609375, "loss_aux_layer_11": 0.0546875, "loss_aux_layer_12": 0.05889892578125, "loss_aux_layer_13": 0.06414794921875, "loss_aux_layer_14": 0.0723876953125, "loss_aux_layer_15": 0.0806884765625, "loss_aux_layer_16": 0.0899658203125, "loss_aux_layer_17": 0.097412109375, "loss_aux_layer_18": 0.1055908203125, "loss_aux_layer_19": 0.109375, "loss_aux_layer_2": 0.03717041015625, "loss_aux_layer_20": 0.1173095703125, "loss_aux_layer_21": 0.12646484375, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.184814453125, "loss_aux_layer_3": 0.04638671875, "loss_aux_layer_4": 0.048583984375, "loss_aux_layer_5": 0.050048828125, "loss_aux_layer_6": 0.05279541015625, "loss_aux_layer_7": 0.051025390625, "loss_aux_layer_8": 0.05084228515625, "loss_aux_layer_9": 0.04986572265625, "step": 5173, "total_loss": 0.5499898046255112 }, { "epoch": 1.0243516135418729, "grad_norm": 0.9094839692115784, "learning_rate": 5e-05, "llm_loss": 0.49724031239748, "loss": 2.2874, "loss_aux_layer_0": 0.0098876953125, "loss_aux_layer_1": 0.027069091796875, "loss_aux_layer_10": 0.05206298828125, "loss_aux_layer_11": 0.05560302734375, "loss_aux_layer_12": 0.0596923828125, "loss_aux_layer_13": 0.06463623046875, "loss_aux_layer_14": 0.0723876953125, "loss_aux_layer_15": 0.0806884765625, "loss_aux_layer_16": 0.0894775390625, "loss_aux_layer_17": 0.0968017578125, "loss_aux_layer_18": 0.105224609375, "loss_aux_layer_19": 0.1090087890625, "loss_aux_layer_2": 0.03778076171875, "loss_aux_layer_20": 0.1173095703125, "loss_aux_layer_21": 0.12646484375, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.04730224609375, "loss_aux_layer_4": 0.0494384765625, "loss_aux_layer_5": 0.0509033203125, "loss_aux_layer_6": 0.05364990234375, "loss_aux_layer_7": 0.052001953125, "loss_aux_layer_8": 0.0516357421875, "loss_aux_layer_9": 0.05084228515625, "step": 5174, "total_loss": 0.571855790913105 }, { "epoch": 1.0245495941397742, "grad_norm": 0.8400135040283203, "learning_rate": 5e-05, "llm_loss": 0.5874350816011429, "loss": 2.6549, "loss_aux_layer_0": 0.008819580078125, "loss_aux_layer_1": 0.026947021484375, "loss_aux_layer_10": 0.05328369140625, "loss_aux_layer_11": 0.0570068359375, "loss_aux_layer_12": 0.06109619140625, "loss_aux_layer_13": 0.06658935546875, "loss_aux_layer_14": 0.0745849609375, "loss_aux_layer_15": 0.0828857421875, "loss_aux_layer_16": 0.092041015625, "loss_aux_layer_17": 0.0997314453125, "loss_aux_layer_18": 0.1077880859375, "loss_aux_layer_19": 0.11181640625, "loss_aux_layer_2": 0.03790283203125, "loss_aux_layer_20": 0.1199951171875, "loss_aux_layer_21": 0.12890625, "loss_aux_layer_22": 0.150146484375, "loss_aux_layer_23": 0.186767578125, "loss_aux_layer_3": 0.0477294921875, "loss_aux_layer_4": 0.0506591796875, "loss_aux_layer_5": 0.05224609375, "loss_aux_layer_6": 0.0555419921875, "loss_aux_layer_7": 0.0538330078125, "loss_aux_layer_8": 0.0533447265625, "loss_aux_layer_9": 0.0521240234375, "step": 5175, "total_loss": 0.6637281030416489 }, { "epoch": 1.0247475747376757, "grad_norm": 1.0027326345443726, "learning_rate": 5e-05, "llm_loss": 0.4897974953055382, "loss": 2.2539, "loss_aux_layer_0": 0.0100250244140625, "loss_aux_layer_1": 0.02655029296875, "loss_aux_layer_10": 0.051025390625, "loss_aux_layer_11": 0.05450439453125, "loss_aux_layer_12": 0.0584716796875, "loss_aux_layer_13": 0.0634765625, "loss_aux_layer_14": 0.0712890625, "loss_aux_layer_15": 0.0794677734375, "loss_aux_layer_16": 0.0885009765625, "loss_aux_layer_17": 0.095947265625, "loss_aux_layer_18": 0.1041259765625, "loss_aux_layer_19": 0.1083984375, "loss_aux_layer_2": 0.03759765625, "loss_aux_layer_20": 0.116455078125, "loss_aux_layer_21": 0.1246337890625, "loss_aux_layer_22": 0.14453125, "loss_aux_layer_23": 0.1796875, "loss_aux_layer_3": 0.047119140625, "loss_aux_layer_4": 0.04949951171875, "loss_aux_layer_5": 0.05059814453125, "loss_aux_layer_6": 0.0533447265625, "loss_aux_layer_7": 0.05181884765625, "loss_aux_layer_8": 0.0511474609375, "loss_aux_layer_9": 0.04998779296875, "step": 5176, "total_loss": 0.5634703785181046 }, { "epoch": 1.024945555335577, "grad_norm": 0.8099938631057739, "learning_rate": 5e-05, "llm_loss": 0.5446976572275162, "loss": 2.4897, "loss_aux_layer_0": 0.0095062255859375, "loss_aux_layer_1": 0.02825927734375, "loss_aux_layer_10": 0.05572509765625, "loss_aux_layer_11": 0.0596923828125, "loss_aux_layer_12": 0.06414794921875, "loss_aux_layer_13": 0.0694580078125, "loss_aux_layer_14": 0.07708740234375, "loss_aux_layer_15": 0.0849609375, "loss_aux_layer_16": 0.0941162109375, "loss_aux_layer_17": 0.1019287109375, "loss_aux_layer_18": 0.109619140625, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.0399169921875, "loss_aux_layer_20": 0.1199951171875, "loss_aux_layer_21": 0.12744140625, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.1826171875, "loss_aux_layer_3": 0.0501708984375, "loss_aux_layer_4": 0.05291748046875, "loss_aux_layer_5": 0.05426025390625, "loss_aux_layer_6": 0.0574951171875, "loss_aux_layer_7": 0.05584716796875, "loss_aux_layer_8": 0.05560302734375, "loss_aux_layer_9": 0.05438232421875, "step": 5177, "total_loss": 0.622436061501503 }, { "epoch": 1.0251435359334786, "grad_norm": 0.8864875435829163, "learning_rate": 5e-05, "llm_loss": 0.484775111079216, "loss": 2.2366, "loss_aux_layer_0": 0.0108489990234375, "loss_aux_layer_1": 0.02764892578125, "loss_aux_layer_10": 0.0517578125, "loss_aux_layer_11": 0.05548095703125, "loss_aux_layer_12": 0.0595703125, "loss_aux_layer_13": 0.06451416015625, "loss_aux_layer_14": 0.0726318359375, "loss_aux_layer_15": 0.08056640625, "loss_aux_layer_16": 0.0894775390625, "loss_aux_layer_17": 0.0972900390625, "loss_aux_layer_18": 0.105224609375, "loss_aux_layer_19": 0.1087646484375, "loss_aux_layer_2": 0.0386962890625, "loss_aux_layer_20": 0.1162109375, "loss_aux_layer_21": 0.1239013671875, "loss_aux_layer_22": 0.143798828125, "loss_aux_layer_23": 0.178955078125, "loss_aux_layer_3": 0.04779052734375, "loss_aux_layer_4": 0.0501708984375, "loss_aux_layer_5": 0.05157470703125, "loss_aux_layer_6": 0.05438232421875, "loss_aux_layer_7": 0.05267333984375, "loss_aux_layer_8": 0.052001953125, "loss_aux_layer_9": 0.05078125, "step": 5178, "total_loss": 0.5591433644294739 }, { "epoch": 1.02534151653138, "grad_norm": 0.7991258502006531, "learning_rate": 5e-05, "llm_loss": 0.579495020210743, "loss": 2.6149, "loss_aux_layer_0": 0.009124755859375, "loss_aux_layer_1": 0.026947021484375, "loss_aux_layer_10": 0.05224609375, "loss_aux_layer_11": 0.0557861328125, "loss_aux_layer_12": 0.0596923828125, "loss_aux_layer_13": 0.06488037109375, "loss_aux_layer_14": 0.07275390625, "loss_aux_layer_15": 0.0809326171875, "loss_aux_layer_16": 0.08984375, "loss_aux_layer_17": 0.0977783203125, "loss_aux_layer_18": 0.1055908203125, "loss_aux_layer_19": 0.1085205078125, "loss_aux_layer_2": 0.03741455078125, "loss_aux_layer_20": 0.1165771484375, "loss_aux_layer_21": 0.1243896484375, "loss_aux_layer_22": 0.144775390625, "loss_aux_layer_23": 0.179443359375, "loss_aux_layer_3": 0.0467529296875, "loss_aux_layer_4": 0.0494384765625, "loss_aux_layer_5": 0.05078125, "loss_aux_layer_6": 0.05352783203125, "loss_aux_layer_7": 0.05206298828125, "loss_aux_layer_8": 0.05169677734375, "loss_aux_layer_9": 0.05078125, "step": 5179, "total_loss": 0.6537237465381622 }, { "epoch": 1.0255394971292813, "grad_norm": 0.9533302783966064, "learning_rate": 5e-05, "llm_loss": 0.5281398594379425, "loss": 2.401, "loss_aux_layer_0": 0.0104522705078125, "loss_aux_layer_1": 0.026123046875, "loss_aux_layer_10": 0.04949951171875, "loss_aux_layer_11": 0.0531005859375, "loss_aux_layer_12": 0.05718994140625, "loss_aux_layer_13": 0.06195068359375, "loss_aux_layer_14": 0.06976318359375, "loss_aux_layer_15": 0.0772705078125, "loss_aux_layer_16": 0.085693359375, "loss_aux_layer_17": 0.093505859375, "loss_aux_layer_18": 0.1014404296875, "loss_aux_layer_19": 0.10546875, "loss_aux_layer_2": 0.03607177734375, "loss_aux_layer_20": 0.113525390625, "loss_aux_layer_21": 0.122802734375, "loss_aux_layer_22": 0.145263671875, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.0450439453125, "loss_aux_layer_4": 0.04718017578125, "loss_aux_layer_5": 0.04852294921875, "loss_aux_layer_6": 0.0509033203125, "loss_aux_layer_7": 0.049560546875, "loss_aux_layer_8": 0.04931640625, "loss_aux_layer_9": 0.04827880859375, "step": 5180, "total_loss": 0.6002534627914429 }, { "epoch": 1.0257374777271828, "grad_norm": 0.8623244166374207, "learning_rate": 5e-05, "llm_loss": 0.6049233973026276, "loss": 2.7171, "loss_aux_layer_0": 0.0087432861328125, "loss_aux_layer_1": 0.02813720703125, "loss_aux_layer_10": 0.05340576171875, "loss_aux_layer_11": 0.0572509765625, "loss_aux_layer_12": 0.06109619140625, "loss_aux_layer_13": 0.0660400390625, "loss_aux_layer_14": 0.0733642578125, "loss_aux_layer_15": 0.080810546875, "loss_aux_layer_16": 0.089111328125, "loss_aux_layer_17": 0.09619140625, "loss_aux_layer_18": 0.103271484375, "loss_aux_layer_19": 0.105712890625, "loss_aux_layer_2": 0.03912353515625, "loss_aux_layer_20": 0.113037109375, "loss_aux_layer_21": 0.120849609375, "loss_aux_layer_22": 0.141845703125, "loss_aux_layer_23": 0.177490234375, "loss_aux_layer_3": 0.0489501953125, "loss_aux_layer_4": 0.0516357421875, "loss_aux_layer_5": 0.0531005859375, "loss_aux_layer_6": 0.0556640625, "loss_aux_layer_7": 0.0540771484375, "loss_aux_layer_8": 0.05328369140625, "loss_aux_layer_9": 0.05224609375, "step": 5181, "total_loss": 0.679267406463623 }, { "epoch": 1.0259354583250841, "grad_norm": 1.0178202390670776, "learning_rate": 5e-05, "llm_loss": 0.5260404571890831, "loss": 2.4096, "loss_aux_layer_0": 0.009552001953125, "loss_aux_layer_1": 0.028594970703125, "loss_aux_layer_10": 0.0546875, "loss_aux_layer_11": 0.05828857421875, "loss_aux_layer_12": 0.06243896484375, "loss_aux_layer_13": 0.06719970703125, "loss_aux_layer_14": 0.0751953125, "loss_aux_layer_15": 0.0830078125, "loss_aux_layer_16": 0.0919189453125, "loss_aux_layer_17": 0.0994873046875, "loss_aux_layer_18": 0.1070556640625, "loss_aux_layer_19": 0.1104736328125, "loss_aux_layer_2": 0.04022216796875, "loss_aux_layer_20": 0.1180419921875, "loss_aux_layer_21": 0.125244140625, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.18115234375, "loss_aux_layer_3": 0.0499267578125, "loss_aux_layer_4": 0.05230712890625, "loss_aux_layer_5": 0.05364990234375, "loss_aux_layer_6": 0.05633544921875, "loss_aux_layer_7": 0.05474853515625, "loss_aux_layer_8": 0.05413818359375, "loss_aux_layer_9": 0.05322265625, "step": 5182, "total_loss": 0.60239677131176 }, { "epoch": 1.0261334389229855, "grad_norm": 0.977056086063385, "learning_rate": 5e-05, "llm_loss": 0.5767650008201599, "loss": 2.6045, "loss_aux_layer_0": 0.0099029541015625, "loss_aux_layer_1": 0.02813720703125, "loss_aux_layer_10": 0.052734375, "loss_aux_layer_11": 0.056396484375, "loss_aux_layer_12": 0.06072998046875, "loss_aux_layer_13": 0.0655517578125, "loss_aux_layer_14": 0.072998046875, "loss_aux_layer_15": 0.080810546875, "loss_aux_layer_16": 0.089599609375, "loss_aux_layer_17": 0.096923828125, "loss_aux_layer_18": 0.1044921875, "loss_aux_layer_19": 0.1075439453125, "loss_aux_layer_2": 0.03851318359375, "loss_aux_layer_20": 0.115234375, "loss_aux_layer_21": 0.12255859375, "loss_aux_layer_22": 0.141845703125, "loss_aux_layer_23": 0.17578125, "loss_aux_layer_3": 0.04827880859375, "loss_aux_layer_4": 0.05084228515625, "loss_aux_layer_5": 0.05224609375, "loss_aux_layer_6": 0.055419921875, "loss_aux_layer_7": 0.0537109375, "loss_aux_layer_8": 0.0531005859375, "loss_aux_layer_9": 0.0516357421875, "step": 5183, "total_loss": 0.6511246263980865 }, { "epoch": 1.026331419520887, "grad_norm": 0.9383372664451599, "learning_rate": 5e-05, "llm_loss": 0.5233290940523148, "loss": 2.3939, "loss_aux_layer_0": 0.010162353515625, "loss_aux_layer_1": 0.027191162109375, "loss_aux_layer_10": 0.0531005859375, "loss_aux_layer_11": 0.05657958984375, "loss_aux_layer_12": 0.060791015625, "loss_aux_layer_13": 0.0657958984375, "loss_aux_layer_14": 0.073974609375, "loss_aux_layer_15": 0.0814208984375, "loss_aux_layer_16": 0.0902099609375, "loss_aux_layer_17": 0.097412109375, "loss_aux_layer_18": 0.10546875, "loss_aux_layer_19": 0.1087646484375, "loss_aux_layer_2": 0.0382080078125, "loss_aux_layer_20": 0.116943359375, "loss_aux_layer_21": 0.125732421875, "loss_aux_layer_22": 0.146484375, "loss_aux_layer_23": 0.182373046875, "loss_aux_layer_3": 0.0477294921875, "loss_aux_layer_4": 0.05023193359375, "loss_aux_layer_5": 0.0516357421875, "loss_aux_layer_6": 0.05474853515625, "loss_aux_layer_7": 0.0533447265625, "loss_aux_layer_8": 0.05291748046875, "loss_aux_layer_9": 0.0518798828125, "step": 5184, "total_loss": 0.5984750390052795 }, { "epoch": 1.0265294001187883, "grad_norm": 0.8592725396156311, "learning_rate": 5e-05, "llm_loss": 0.5301842242479324, "loss": 2.4107, "loss_aux_layer_0": 0.0097808837890625, "loss_aux_layer_1": 0.0255126953125, "loss_aux_layer_10": 0.0496826171875, "loss_aux_layer_11": 0.05340576171875, "loss_aux_layer_12": 0.0577392578125, "loss_aux_layer_13": 0.0628662109375, "loss_aux_layer_14": 0.071044921875, "loss_aux_layer_15": 0.078857421875, "loss_aux_layer_16": 0.0880126953125, "loss_aux_layer_17": 0.095458984375, "loss_aux_layer_18": 0.1033935546875, "loss_aux_layer_19": 0.106689453125, "loss_aux_layer_2": 0.0357666015625, "loss_aux_layer_20": 0.1146240234375, "loss_aux_layer_21": 0.12353515625, "loss_aux_layer_22": 0.14453125, "loss_aux_layer_23": 0.180419921875, "loss_aux_layer_3": 0.04473876953125, "loss_aux_layer_4": 0.04705810546875, "loss_aux_layer_5": 0.04852294921875, "loss_aux_layer_6": 0.051025390625, "loss_aux_layer_7": 0.0496826171875, "loss_aux_layer_8": 0.04931640625, "loss_aux_layer_9": 0.04840087890625, "step": 5185, "total_loss": 0.6026777774095535 }, { "epoch": 1.0267273807166897, "grad_norm": 1.004751443862915, "learning_rate": 5e-05, "llm_loss": 0.5767787918448448, "loss": 2.6137, "loss_aux_layer_0": 0.0097808837890625, "loss_aux_layer_1": 0.02813720703125, "loss_aux_layer_10": 0.05517578125, "loss_aux_layer_11": 0.05914306640625, "loss_aux_layer_12": 0.0631103515625, "loss_aux_layer_13": 0.068115234375, "loss_aux_layer_14": 0.07568359375, "loss_aux_layer_15": 0.0831298828125, "loss_aux_layer_16": 0.092041015625, "loss_aux_layer_17": 0.09912109375, "loss_aux_layer_18": 0.10693359375, "loss_aux_layer_19": 0.10986328125, "loss_aux_layer_2": 0.0394287109375, "loss_aux_layer_20": 0.1177978515625, "loss_aux_layer_21": 0.1260986328125, "loss_aux_layer_22": 0.1474609375, "loss_aux_layer_23": 0.183837890625, "loss_aux_layer_3": 0.0491943359375, "loss_aux_layer_4": 0.05194091796875, "loss_aux_layer_5": 0.05352783203125, "loss_aux_layer_6": 0.05633544921875, "loss_aux_layer_7": 0.05511474609375, "loss_aux_layer_8": 0.054931640625, "loss_aux_layer_9": 0.05389404296875, "step": 5186, "total_loss": 0.6534361988306046 }, { "epoch": 1.0269253613145912, "grad_norm": 0.8994444608688354, "learning_rate": 5e-05, "llm_loss": 0.5556327104568481, "loss": 2.5235, "loss_aux_layer_0": 0.0095672607421875, "loss_aux_layer_1": 0.02789306640625, "loss_aux_layer_10": 0.05328369140625, "loss_aux_layer_11": 0.05718994140625, "loss_aux_layer_12": 0.0614013671875, "loss_aux_layer_13": 0.0665283203125, "loss_aux_layer_14": 0.074462890625, "loss_aux_layer_15": 0.0823974609375, "loss_aux_layer_16": 0.091064453125, "loss_aux_layer_17": 0.0982666015625, "loss_aux_layer_18": 0.1060791015625, "loss_aux_layer_19": 0.1090087890625, "loss_aux_layer_2": 0.03863525390625, "loss_aux_layer_20": 0.116455078125, "loss_aux_layer_21": 0.1241455078125, "loss_aux_layer_22": 0.14306640625, "loss_aux_layer_23": 0.177978515625, "loss_aux_layer_3": 0.04876708984375, "loss_aux_layer_4": 0.05145263671875, "loss_aux_layer_5": 0.052978515625, "loss_aux_layer_6": 0.0555419921875, "loss_aux_layer_7": 0.0538330078125, "loss_aux_layer_8": 0.05316162109375, "loss_aux_layer_9": 0.05218505859375, "step": 5187, "total_loss": 0.6308822333812714 }, { "epoch": 1.0271233419124925, "grad_norm": 0.880725085735321, "learning_rate": 5e-05, "llm_loss": 0.5594889968633652, "loss": 2.5337, "loss_aux_layer_0": 0.0099639892578125, "loss_aux_layer_1": 0.02630615234375, "loss_aux_layer_10": 0.051513671875, "loss_aux_layer_11": 0.054931640625, "loss_aux_layer_12": 0.05889892578125, "loss_aux_layer_13": 0.064208984375, "loss_aux_layer_14": 0.072509765625, "loss_aux_layer_15": 0.0806884765625, "loss_aux_layer_16": 0.09033203125, "loss_aux_layer_17": 0.0975341796875, "loss_aux_layer_18": 0.10546875, "loss_aux_layer_19": 0.1090087890625, "loss_aux_layer_2": 0.03692626953125, "loss_aux_layer_20": 0.11669921875, "loss_aux_layer_21": 0.124755859375, "loss_aux_layer_22": 0.144287109375, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.04620361328125, "loss_aux_layer_4": 0.0487060546875, "loss_aux_layer_5": 0.0499267578125, "loss_aux_layer_6": 0.052734375, "loss_aux_layer_7": 0.05133056640625, "loss_aux_layer_8": 0.05108642578125, "loss_aux_layer_9": 0.05035400390625, "step": 5188, "total_loss": 0.633430689573288 }, { "epoch": 1.0273213225103939, "grad_norm": 0.9549612998962402, "learning_rate": 5e-05, "llm_loss": 0.6065393388271332, "loss": 2.7166, "loss_aux_layer_0": 0.0100860595703125, "loss_aux_layer_1": 0.026702880859375, "loss_aux_layer_10": 0.05072021484375, "loss_aux_layer_11": 0.0543212890625, "loss_aux_layer_12": 0.05841064453125, "loss_aux_layer_13": 0.0633544921875, "loss_aux_layer_14": 0.0709228515625, "loss_aux_layer_15": 0.07861328125, "loss_aux_layer_16": 0.08740234375, "loss_aux_layer_17": 0.0950927734375, "loss_aux_layer_18": 0.102783203125, "loss_aux_layer_19": 0.10595703125, "loss_aux_layer_2": 0.03692626953125, "loss_aux_layer_20": 0.114013671875, "loss_aux_layer_21": 0.1224365234375, "loss_aux_layer_22": 0.1416015625, "loss_aux_layer_23": 0.17626953125, "loss_aux_layer_3": 0.0458984375, "loss_aux_layer_4": 0.04833984375, "loss_aux_layer_5": 0.049560546875, "loss_aux_layer_6": 0.0523681640625, "loss_aux_layer_7": 0.0509033203125, "loss_aux_layer_8": 0.050537109375, "loss_aux_layer_9": 0.04949951171875, "step": 5189, "total_loss": 0.6791508793830872 }, { "epoch": 1.0275193031082954, "grad_norm": 0.7991030812263489, "learning_rate": 5e-05, "llm_loss": 0.4937172085046768, "loss": 2.295, "loss_aux_layer_0": 0.01007080078125, "loss_aux_layer_1": 0.030029296875, "loss_aux_layer_10": 0.0572509765625, "loss_aux_layer_11": 0.06170654296875, "loss_aux_layer_12": 0.0657958984375, "loss_aux_layer_13": 0.071044921875, "loss_aux_layer_14": 0.0789794921875, "loss_aux_layer_15": 0.0870361328125, "loss_aux_layer_16": 0.09619140625, "loss_aux_layer_17": 0.1033935546875, "loss_aux_layer_18": 0.111083984375, "loss_aux_layer_19": 0.114501953125, "loss_aux_layer_2": 0.042724609375, "loss_aux_layer_20": 0.1220703125, "loss_aux_layer_21": 0.130126953125, "loss_aux_layer_22": 0.15234375, "loss_aux_layer_23": 0.189208984375, "loss_aux_layer_3": 0.052978515625, "loss_aux_layer_4": 0.055419921875, "loss_aux_layer_5": 0.056884765625, "loss_aux_layer_6": 0.05975341796875, "loss_aux_layer_7": 0.05816650390625, "loss_aux_layer_8": 0.05731201171875, "loss_aux_layer_9": 0.05615234375, "step": 5190, "total_loss": 0.5737471133470535 }, { "epoch": 1.0277172837061967, "grad_norm": 0.8867177963256836, "learning_rate": 5e-05, "llm_loss": 0.5734689682722092, "loss": 2.5926, "loss_aux_layer_0": 0.00994873046875, "loss_aux_layer_1": 0.026519775390625, "loss_aux_layer_10": 0.05194091796875, "loss_aux_layer_11": 0.05572509765625, "loss_aux_layer_12": 0.0596923828125, "loss_aux_layer_13": 0.06494140625, "loss_aux_layer_14": 0.0731201171875, "loss_aux_layer_15": 0.08154296875, "loss_aux_layer_16": 0.090576171875, "loss_aux_layer_17": 0.098388671875, "loss_aux_layer_18": 0.106689453125, "loss_aux_layer_19": 0.1102294921875, "loss_aux_layer_2": 0.03741455078125, "loss_aux_layer_20": 0.1177978515625, "loss_aux_layer_21": 0.125244140625, "loss_aux_layer_22": 0.145263671875, "loss_aux_layer_23": 0.180908203125, "loss_aux_layer_3": 0.047119140625, "loss_aux_layer_4": 0.04962158203125, "loss_aux_layer_5": 0.051025390625, "loss_aux_layer_6": 0.05364990234375, "loss_aux_layer_7": 0.05218505859375, "loss_aux_layer_8": 0.0517578125, "loss_aux_layer_9": 0.05084228515625, "step": 5191, "total_loss": 0.6481501758098602 }, { "epoch": 1.0279152643040983, "grad_norm": 0.9493382573127747, "learning_rate": 5e-05, "llm_loss": 0.5457283779978752, "loss": 2.4937, "loss_aux_layer_0": 0.0093536376953125, "loss_aux_layer_1": 0.028533935546875, "loss_aux_layer_10": 0.05572509765625, "loss_aux_layer_11": 0.05950927734375, "loss_aux_layer_12": 0.0635986328125, "loss_aux_layer_13": 0.0689697265625, "loss_aux_layer_14": 0.0772705078125, "loss_aux_layer_15": 0.08544921875, "loss_aux_layer_16": 0.094482421875, "loss_aux_layer_17": 0.1016845703125, "loss_aux_layer_18": 0.1097412109375, "loss_aux_layer_19": 0.112060546875, "loss_aux_layer_2": 0.03997802734375, "loss_aux_layer_20": 0.1192626953125, "loss_aux_layer_21": 0.126953125, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.182861328125, "loss_aux_layer_3": 0.0501708984375, "loss_aux_layer_4": 0.05291748046875, "loss_aux_layer_5": 0.05462646484375, "loss_aux_layer_6": 0.05780029296875, "loss_aux_layer_7": 0.0562744140625, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.05462646484375, "step": 5192, "total_loss": 0.623431384563446 }, { "epoch": 1.0281132449019996, "grad_norm": 0.8304389715194702, "learning_rate": 5e-05, "llm_loss": 0.5287135615944862, "loss": 2.4196, "loss_aux_layer_0": 0.0100250244140625, "loss_aux_layer_1": 0.0289306640625, "loss_aux_layer_10": 0.05535888671875, "loss_aux_layer_11": 0.0589599609375, "loss_aux_layer_12": 0.06298828125, "loss_aux_layer_13": 0.0677490234375, "loss_aux_layer_14": 0.0753173828125, "loss_aux_layer_15": 0.0830078125, "loss_aux_layer_16": 0.0916748046875, "loss_aux_layer_17": 0.0982666015625, "loss_aux_layer_18": 0.10595703125, "loss_aux_layer_19": 0.1085205078125, "loss_aux_layer_2": 0.04022216796875, "loss_aux_layer_20": 0.115966796875, "loss_aux_layer_21": 0.123046875, "loss_aux_layer_22": 0.142333984375, "loss_aux_layer_23": 0.177001953125, "loss_aux_layer_3": 0.05072021484375, "loss_aux_layer_4": 0.0535888671875, "loss_aux_layer_5": 0.05517578125, "loss_aux_layer_6": 0.05780029296875, "loss_aux_layer_7": 0.05615234375, "loss_aux_layer_8": 0.0555419921875, "loss_aux_layer_9": 0.05426025390625, "step": 5193, "total_loss": 0.604892335832119 }, { "epoch": 1.028311225499901, "grad_norm": 1.1166040897369385, "learning_rate": 5e-05, "llm_loss": 0.5780251771211624, "loss": 2.6043, "loss_aux_layer_0": 0.0092010498046875, "loss_aux_layer_1": 0.0252685546875, "loss_aux_layer_10": 0.0494384765625, "loss_aux_layer_11": 0.052978515625, "loss_aux_layer_12": 0.05731201171875, "loss_aux_layer_13": 0.06268310546875, "loss_aux_layer_14": 0.0712890625, "loss_aux_layer_15": 0.079833984375, "loss_aux_layer_16": 0.0892333984375, "loss_aux_layer_17": 0.097412109375, "loss_aux_layer_18": 0.10595703125, "loss_aux_layer_19": 0.1104736328125, "loss_aux_layer_2": 0.0352783203125, "loss_aux_layer_20": 0.1185302734375, "loss_aux_layer_21": 0.1259765625, "loss_aux_layer_22": 0.1455078125, "loss_aux_layer_23": 0.18115234375, "loss_aux_layer_3": 0.0443115234375, "loss_aux_layer_4": 0.046630859375, "loss_aux_layer_5": 0.04803466796875, "loss_aux_layer_6": 0.0506591796875, "loss_aux_layer_7": 0.04913330078125, "loss_aux_layer_8": 0.04901123046875, "loss_aux_layer_9": 0.0482177734375, "step": 5194, "total_loss": 0.6510841399431229 }, { "epoch": 1.0285092060978025, "grad_norm": 1.4019142389297485, "learning_rate": 5e-05, "llm_loss": 0.47426142543554306, "loss": 2.192, "loss_aux_layer_0": 0.011627197265625, "loss_aux_layer_1": 0.027099609375, "loss_aux_layer_10": 0.05078125, "loss_aux_layer_11": 0.05419921875, "loss_aux_layer_12": 0.05804443359375, "loss_aux_layer_13": 0.06304931640625, "loss_aux_layer_14": 0.0716552734375, "loss_aux_layer_15": 0.0802001953125, "loss_aux_layer_16": 0.0892333984375, "loss_aux_layer_17": 0.09716796875, "loss_aux_layer_18": 0.105224609375, "loss_aux_layer_19": 0.1087646484375, "loss_aux_layer_2": 0.03765869140625, "loss_aux_layer_20": 0.11669921875, "loss_aux_layer_21": 0.1251220703125, "loss_aux_layer_22": 0.14501953125, "loss_aux_layer_23": 0.179443359375, "loss_aux_layer_3": 0.0467529296875, "loss_aux_layer_4": 0.0489501953125, "loss_aux_layer_5": 0.050048828125, "loss_aux_layer_6": 0.05224609375, "loss_aux_layer_7": 0.05078125, "loss_aux_layer_8": 0.050537109375, "loss_aux_layer_9": 0.0494384765625, "step": 5195, "total_loss": 0.5479901283979416 }, { "epoch": 1.0287071866957038, "grad_norm": 0.7669717669487, "learning_rate": 5e-05, "llm_loss": 0.5239728540182114, "loss": 2.4103, "loss_aux_layer_0": 0.009124755859375, "loss_aux_layer_1": 0.02911376953125, "loss_aux_layer_10": 0.05596923828125, "loss_aux_layer_11": 0.06005859375, "loss_aux_layer_12": 0.06427001953125, "loss_aux_layer_13": 0.0697021484375, "loss_aux_layer_14": 0.0777587890625, "loss_aux_layer_15": 0.0860595703125, "loss_aux_layer_16": 0.09521484375, "loss_aux_layer_17": 0.102783203125, "loss_aux_layer_18": 0.1107177734375, "loss_aux_layer_19": 0.1136474609375, "loss_aux_layer_2": 0.0408935546875, "loss_aux_layer_20": 0.121337890625, "loss_aux_layer_21": 0.1287841796875, "loss_aux_layer_22": 0.149658203125, "loss_aux_layer_23": 0.185546875, "loss_aux_layer_3": 0.0511474609375, "loss_aux_layer_4": 0.0537109375, "loss_aux_layer_5": 0.05499267578125, "loss_aux_layer_6": 0.0579833984375, "loss_aux_layer_7": 0.0565185546875, "loss_aux_layer_8": 0.05596923828125, "loss_aux_layer_9": 0.05487060546875, "step": 5196, "total_loss": 0.6025643646717072 }, { "epoch": 1.0289051672936051, "grad_norm": 1.2229351997375488, "learning_rate": 5e-05, "llm_loss": 0.5654393434524536, "loss": 2.5649, "loss_aux_layer_0": 0.012115478515625, "loss_aux_layer_1": 0.0279541015625, "loss_aux_layer_10": 0.05303955078125, "loss_aux_layer_11": 0.05670166015625, "loss_aux_layer_12": 0.060791015625, "loss_aux_layer_13": 0.066162109375, "loss_aux_layer_14": 0.0745849609375, "loss_aux_layer_15": 0.0831298828125, "loss_aux_layer_16": 0.0926513671875, "loss_aux_layer_17": 0.1004638671875, "loss_aux_layer_18": 0.1080322265625, "loss_aux_layer_19": 0.111328125, "loss_aux_layer_2": 0.0389404296875, "loss_aux_layer_20": 0.1190185546875, "loss_aux_layer_21": 0.126220703125, "loss_aux_layer_22": 0.145751953125, "loss_aux_layer_23": 0.179931640625, "loss_aux_layer_3": 0.04803466796875, "loss_aux_layer_4": 0.05047607421875, "loss_aux_layer_5": 0.05194091796875, "loss_aux_layer_6": 0.0545654296875, "loss_aux_layer_7": 0.052978515625, "loss_aux_layer_8": 0.0523681640625, "loss_aux_layer_9": 0.0516357421875, "step": 5197, "total_loss": 0.6412320658564568 }, { "epoch": 1.0291031478915067, "grad_norm": 0.9180154800415039, "learning_rate": 5e-05, "llm_loss": 0.5095743760466576, "loss": 2.3477, "loss_aux_layer_0": 0.00958251953125, "loss_aux_layer_1": 0.02838134765625, "loss_aux_layer_10": 0.05523681640625, "loss_aux_layer_11": 0.05908203125, "loss_aux_layer_12": 0.06317138671875, "loss_aux_layer_13": 0.068603515625, "loss_aux_layer_14": 0.0762939453125, "loss_aux_layer_15": 0.08447265625, "loss_aux_layer_16": 0.0933837890625, "loss_aux_layer_17": 0.1009521484375, "loss_aux_layer_18": 0.10888671875, "loss_aux_layer_19": 0.111572265625, "loss_aux_layer_2": 0.0404052734375, "loss_aux_layer_20": 0.1192626953125, "loss_aux_layer_21": 0.1265869140625, "loss_aux_layer_22": 0.147216796875, "loss_aux_layer_23": 0.18408203125, "loss_aux_layer_3": 0.050048828125, "loss_aux_layer_4": 0.052978515625, "loss_aux_layer_5": 0.05450439453125, "loss_aux_layer_6": 0.05718994140625, "loss_aux_layer_7": 0.05560302734375, "loss_aux_layer_8": 0.05487060546875, "loss_aux_layer_9": 0.05389404296875, "step": 5198, "total_loss": 0.5869365930557251 }, { "epoch": 1.029301128489408, "grad_norm": 0.8669989705085754, "learning_rate": 5e-05, "llm_loss": 0.5044724196195602, "loss": 2.3173, "loss_aux_layer_0": 0.01190185546875, "loss_aux_layer_1": 0.02850341796875, "loss_aux_layer_10": 0.0528564453125, "loss_aux_layer_11": 0.056396484375, "loss_aux_layer_12": 0.0604248046875, "loss_aux_layer_13": 0.0654296875, "loss_aux_layer_14": 0.0732421875, "loss_aux_layer_15": 0.0811767578125, "loss_aux_layer_16": 0.0894775390625, "loss_aux_layer_17": 0.0966796875, "loss_aux_layer_18": 0.104248046875, "loss_aux_layer_19": 0.1080322265625, "loss_aux_layer_2": 0.03875732421875, "loss_aux_layer_20": 0.1156005859375, "loss_aux_layer_21": 0.1246337890625, "loss_aux_layer_22": 0.1455078125, "loss_aux_layer_23": 0.181884765625, "loss_aux_layer_3": 0.048095703125, "loss_aux_layer_4": 0.05035400390625, "loss_aux_layer_5": 0.0517578125, "loss_aux_layer_6": 0.05450439453125, "loss_aux_layer_7": 0.05303955078125, "loss_aux_layer_8": 0.05267333984375, "loss_aux_layer_9": 0.05157470703125, "step": 5199, "total_loss": 0.5793221294879913 }, { "epoch": 1.0294991090873093, "grad_norm": 0.851561963558197, "learning_rate": 5e-05, "llm_loss": 0.5158687382936478, "loss": 2.3715, "loss_aux_layer_0": 0.0107269287109375, "loss_aux_layer_1": 0.027618408203125, "loss_aux_layer_10": 0.0540771484375, "loss_aux_layer_11": 0.05767822265625, "loss_aux_layer_12": 0.06231689453125, "loss_aux_layer_13": 0.067626953125, "loss_aux_layer_14": 0.076171875, "loss_aux_layer_15": 0.0845947265625, "loss_aux_layer_16": 0.09375, "loss_aux_layer_17": 0.1014404296875, "loss_aux_layer_18": 0.109130859375, "loss_aux_layer_19": 0.1126708984375, "loss_aux_layer_2": 0.03900146484375, "loss_aux_layer_20": 0.120361328125, "loss_aux_layer_21": 0.12841796875, "loss_aux_layer_22": 0.14892578125, "loss_aux_layer_23": 0.18505859375, "loss_aux_layer_3": 0.04864501953125, "loss_aux_layer_4": 0.05126953125, "loss_aux_layer_5": 0.05291748046875, "loss_aux_layer_6": 0.05572509765625, "loss_aux_layer_7": 0.0540771484375, "loss_aux_layer_8": 0.05364990234375, "loss_aux_layer_9": 0.05255126953125, "step": 5200, "total_loss": 0.5928841233253479 } ], "logging_steps": 1, "max_steps": 15153, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.308650636855109e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }