[ { "loss": 1.4307, "grad_norm": 0.6940367817878723, "learning_rate": 3e-05, "entropy": 1.1788243889808654, "num_tokens": 155850.0, "mean_token_accuracy": 0.6935877948999405, "epoch": 0.11764705882352941, "step": 10 }, { "loss": 1.1806, "grad_norm": 0.7020200490951538, "learning_rate": 6.333333333333333e-05, "entropy": 1.2202600359916687, "num_tokens": 312325.0, "mean_token_accuracy": 0.7312566488981247, "epoch": 0.23529411764705882, "step": 20 }, { "loss": 0.7607, "grad_norm": 0.47733989357948303, "learning_rate": 9.666666666666667e-05, "entropy": 0.8012344032526016, "num_tokens": 469039.0, "mean_token_accuracy": 0.8055870264768601, "epoch": 0.35294117647058826, "step": 30 }, { "loss": 0.5203, "grad_norm": 0.45993906259536743, "learning_rate": 9.96057350657239e-05, "entropy": 0.5153448149561882, "num_tokens": 623975.0, "mean_token_accuracy": 0.8535495191812515, "epoch": 0.47058823529411764, "step": 40 }, { "loss": 0.4475, "grad_norm": 0.3403550088405609, "learning_rate": 9.825082472361557e-05, "entropy": 0.44883628338575365, "num_tokens": 779685.0, "mean_token_accuracy": 0.8701241254806519, "epoch": 0.5882352941176471, "step": 50 }, { "loss": 0.4282, "grad_norm": 0.2295123040676117, "learning_rate": 9.595676696276172e-05, "entropy": 0.43300508707761765, "num_tokens": 936272.0, "mean_token_accuracy": 0.874285313487053, "epoch": 0.7058823529411765, "step": 60 }, { "loss": 0.4267, "grad_norm": 0.17979678511619568, "learning_rate": 9.276821300802534e-05, "entropy": 0.4283462271094322, "num_tokens": 1091178.0, "mean_token_accuracy": 0.8739971458911896, "epoch": 0.8235294117647058, "step": 70 }, { "loss": 0.4321, "grad_norm": 0.14834725856781006, "learning_rate": 8.874722443520899e-05, "entropy": 0.4309688463807106, "num_tokens": 1247181.0, "mean_token_accuracy": 0.87317014336586, "epoch": 0.9411764705882353, "step": 80 }, { "loss": 0.4183, "grad_norm": 0.1518690288066864, "learning_rate": 8.397206521307584e-05, "entropy": 0.4187887296080589, "num_tokens": 1399415.0, "mean_token_accuracy": 0.8748669505119324, "epoch": 1.0588235294117647, "step": 90 }, { "loss": 0.4285, "grad_norm": 0.16970738768577576, "learning_rate": 7.85356783842216e-05, "entropy": 0.42983078211545944, "num_tokens": 1552473.0, "mean_token_accuracy": 0.8730897456407547, "epoch": 1.1764705882352942, "step": 100 }, { "loss": 0.4141, "grad_norm": 0.1731289178133011, "learning_rate": 7.254387703447154e-05, "entropy": 0.4201514065265656, "num_tokens": 1709461.0, "mean_token_accuracy": 0.875280225276947, "epoch": 1.2941176470588236, "step": 110 }, { "loss": 0.4096, "grad_norm": 0.1622275412082672, "learning_rate": 6.611328476152557e-05, "entropy": 0.41087806075811384, "num_tokens": 1867531.0, "mean_token_accuracy": 0.8777904808521271, "epoch": 1.4117647058823528, "step": 120 }, { "loss": 0.4074, "grad_norm": 0.15023751556873322, "learning_rate": 5.9369065729286245e-05, "entropy": 0.40929861813783647, "num_tokens": 2023122.0, "mean_token_accuracy": 0.8778889566659928, "epoch": 1.5294117647058822, "step": 130 }, { "loss": 0.4091, "grad_norm": 0.13255436718463898, "learning_rate": 5.244248848978067e-05, "entropy": 0.41203168481588365, "num_tokens": 2178280.0, "mean_token_accuracy": 0.8771576941013336, "epoch": 1.6470588235294117, "step": 140 }, { "loss": 0.4027, "grad_norm": 0.156217560172081, "learning_rate": 4.5468370990111006e-05, "entropy": 0.4050563558936119, "num_tokens": 2335590.0, "mean_token_accuracy": 0.8797280818223954, "epoch": 1.7647058823529411, "step": 150 }, { "loss": 0.4089, "grad_norm": 0.1703469306230545, "learning_rate": 3.858245649446721e-05, "entropy": 0.4095383077859879, "num_tokens": 2490848.0, "mean_token_accuracy": 0.8773213714361191, "epoch": 1.8823529411764706, "step": 160 }, { "loss": 0.4052, "grad_norm": 0.19920362532138824, "learning_rate": 3.1918771495895396e-05, "entropy": 0.4070162191987038, "num_tokens": 2644022.0, "mean_token_accuracy": 0.8788870364427567, "epoch": 2.0, "step": 170 }, { "loss": 0.4031, "grad_norm": 0.13015073537826538, "learning_rate": 2.560701704306336e-05, "entropy": 0.40551580488681793, "num_tokens": 2800120.0, "mean_token_accuracy": 0.8789848744869232, "epoch": 2.1176470588235294, "step": 180 }, { "loss": 0.4067, "grad_norm": 0.14793093502521515, "learning_rate": 1.977004425688126e-05, "entropy": 0.41181707233190534, "num_tokens": 2955965.0, "mean_token_accuracy": 0.877298504114151, "epoch": 2.235294117647059, "step": 190 }, { "loss": 0.4079, "grad_norm": 0.1416511833667755, "learning_rate": 1.4521463173173965e-05, "entropy": 0.41118668466806413, "num_tokens": 3111112.0, "mean_token_accuracy": 0.8776616156101227, "epoch": 2.3529411764705883, "step": 200 }, { "loss": 0.3992, "grad_norm": 0.14714457094669342, "learning_rate": 9.963431452563332e-06, "entropy": 0.4004165202379227, "num_tokens": 3268703.0, "mean_token_accuracy": 0.8799054056406022, "epoch": 2.4705882352941178, "step": 210 }, { "loss": 0.4058, "grad_norm": 0.14480003714561462, "learning_rate": 6.184665997806832e-06, "entropy": 0.4070618197321892, "num_tokens": 3423830.0, "mean_token_accuracy": 0.8782488256692886, "epoch": 2.588235294117647, "step": 220 }, { "loss": 0.406, "grad_norm": 0.14828461408615112, "learning_rate": 3.258716180199278e-06, "entropy": 0.4072370767593384, "num_tokens": 3579594.0, "mean_token_accuracy": 0.8782227575778961, "epoch": 2.7058823529411766, "step": 230 }, { "loss": 0.4079, "grad_norm": 0.16037017107009888, "learning_rate": 1.2425322847218368e-06, "entropy": 0.40918404459953306, "num_tokens": 3734972.0, "mean_token_accuracy": 0.8776972621679306, "epoch": 2.8235294117647056, "step": 240 }, { "loss": 0.4035, "grad_norm": 0.1524023711681366, "learning_rate": 1.753570375247815e-07, "entropy": 0.4051833271980286, "num_tokens": 3891447.0, "mean_token_accuracy": 0.8786362946033478, "epoch": 2.9411764705882355, "step": 250 }, { "train_runtime": 1295.9489, "train_samples_per_second": 6.28, "train_steps_per_second": 0.197, "total_flos": 1.3072916160774144e+18, "train_loss": 0.5008774252498851, "entropy": 0.406230491399765, "num_tokens": 3966033.0, "mean_token_accuracy": 0.8788680493831634, "epoch": 3.0, "step": 255 } ]